SSE2/SSSE3 optimizations and unit test for sub_pixel_avg_variance().
Encoding of bus @ 1500kbps (first 50 frames) goes from 3min57 to 3min35, i.e. approximately a 10.5% speedup. Note that the SIMD versions which use a bilinear filter (x_offset & 7 || y_offset & 7) aren't perfectly interleaved, and can probably be improved further in the future. I've marked this with a few TODOs/FIXMEs in the code.

Change-Id: I5c9e900c0f0d32e431a50fecae213b510b2549f9
This commit is contained in:
parent 8fb6c58191
commit 1e6a32f1af
@@ -76,6 +76,34 @@ static unsigned int subpel_variance_ref(const uint8_t *ref, const uint8_t *src,
   return sse - (((int64_t) se * se) >> (l2w + l2h));
 }
 
+static unsigned int subpel_avg_variance_ref(const uint8_t *ref,
+                                            const uint8_t *src,
+                                            const uint8_t *second_pred,
+                                            int l2w, int l2h,
+                                            int xoff, int yoff,
+                                            unsigned int *sse_ptr) {
+  int se = 0;
+  unsigned int sse = 0;
+  const int w = 1 << l2w, h = 1 << l2h;
+  for (int y = 0; y < h; y++) {
+    for (int x = 0; x < w; x++) {
+      // bilinear interpolation at a 16th pel step
+      const int a1 = ref[(w + 1) * (y + 0) + x + 0];
+      const int a2 = ref[(w + 1) * (y + 0) + x + 1];
+      const int b1 = ref[(w + 1) * (y + 1) + x + 0];
+      const int b2 = ref[(w + 1) * (y + 1) + x + 1];
+      const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
+      const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
+      const int r = a + (((b - a) * yoff + 8) >> 4);
+      int diff = ((r + second_pred[w * y + x] + 1) >> 1) - src[w * y + x];
+      se += diff;
+      sse += diff * diff;
+    }
+  }
+  *sse_ptr = sse;
+  return sse - (((int64_t) se * se) >> (l2w + l2h));
+}
+
 template<typename VarianceFunctionType>
 class VarianceTest :
     public ::testing::TestWithParam<tuple<int, int, VarianceFunctionType> > {
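Both reference functions end with the same integer form of the variance identity: with N = 2^(l2w+l2h) pixels and per-pixel differences d_i,

    variance = SSE - (sum_i d_i)^2 / N

which the code realizes as `sse - (((int64_t) se * se) >> (l2w + l2h))`; the cast to int64_t keeps the squared sum from overflowing for the larger block sizes.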
@@ -174,6 +202,7 @@ class SubpelVarianceTest :
     rnd(ACMRandom::DeterministicSeed());
     block_size_ = width_ * height_;
     src_ = new uint8_t[block_size_];
+    sec_ = new uint8_t[block_size_];
     ref_ = new uint8_t[block_size_ + width_ + height_ + 1];
     ASSERT_TRUE(src_ != NULL);
     ASSERT_TRUE(ref_ != NULL);
@@ -182,14 +211,16 @@ class SubpelVarianceTest :
   virtual void TearDown() {
     delete[] src_;
     delete[] ref_;
+    delete[] sec_;
   }
 
  protected:
   void RefTest();
 
   ACMRandom rnd;
-  uint8_t* src_;
-  uint8_t* ref_;
+  uint8_t *src_;
+  uint8_t *ref_;
+  uint8_t *sec_;
   int width_, log2width_;
   int height_, log2height_;
   int block_size_;
@@ -217,6 +248,29 @@ void SubpelVarianceTest<SubpelVarianceFunctionType>::RefTest() {
   }
 }
 
+template<>
+void SubpelVarianceTest<vp9_subp_avg_variance_fn_t>::RefTest() {
+  for (int x = 0; x < 16; ++x) {
+    for (int y = 0; y < 16; ++y) {
+      for (int j = 0; j < block_size_; j++) {
+        src_[j] = rnd.Rand8();
+        sec_[j] = rnd.Rand8();
+      }
+      for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
+        ref_[j] = rnd.Rand8();
+      }
+      unsigned int sse1, sse2;
+      const unsigned int var1 = subpel_variance_(ref_, width_ + 1, x, y,
+                                                 src_, width_, &sse1, sec_);
+      const unsigned int var2 = subpel_avg_variance_ref(ref_, src_, sec_,
+                                                        log2width_, log2height_,
+                                                        x, y, &sse2);
+      EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y;
+      EXPECT_EQ(var1, var2) << "at position " << x << ", " << y;
+    }
+  }
+}
+
 // -----------------------------------------------------------------------------
 // VP8 test cases.
 
@@ -283,10 +337,12 @@ namespace vp9 {
 #if CONFIG_VP9_ENCODER
 typedef VarianceTest<vp9_variance_fn_t> VP9VarianceTest;
 typedef SubpelVarianceTest<vp9_subpixvariance_fn_t> VP9SubpelVarianceTest;
+typedef SubpelVarianceTest<vp9_subp_avg_variance_fn_t> VP9SubpelAvgVarianceTest;
 
 TEST_P(VP9VarianceTest, Zero) { ZeroTest(); }
 TEST_P(VP9VarianceTest, Ref) { RefTest(); }
 TEST_P(VP9SubpelVarianceTest, Ref) { RefTest(); }
+TEST_P(VP9SubpelAvgVarianceTest, Ref) { RefTest(); }
 TEST_P(VP9VarianceTest, OneQuarter) { OneQuarterTest(); }
 
 const vp9_variance_fn_t variance4x4_c = vp9_variance4x4_c;
@@ -360,6 +416,48 @@ INSTANTIATE_TEST_CASE_P(
                       make_tuple(6, 5, subpel_variance64x32_c),
                       make_tuple(6, 6, subpel_variance64x64_c)));
 
+const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_c =
+    vp9_sub_pixel_avg_variance4x4_c;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_c =
+    vp9_sub_pixel_avg_variance4x8_c;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance8x4_c =
+    vp9_sub_pixel_avg_variance8x4_c;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance8x8_c =
+    vp9_sub_pixel_avg_variance8x8_c;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance8x16_c =
+    vp9_sub_pixel_avg_variance8x16_c;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance16x8_c =
+    vp9_sub_pixel_avg_variance16x8_c;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance16x16_c =
+    vp9_sub_pixel_avg_variance16x16_c;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance16x32_c =
+    vp9_sub_pixel_avg_variance16x32_c;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance32x16_c =
+    vp9_sub_pixel_avg_variance32x16_c;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_c =
+    vp9_sub_pixel_avg_variance32x32_c;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance32x64_c =
+    vp9_sub_pixel_avg_variance32x64_c;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance64x32_c =
+    vp9_sub_pixel_avg_variance64x32_c;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_c =
+    vp9_sub_pixel_avg_variance64x64_c;
+INSTANTIATE_TEST_CASE_P(
+    C, VP9SubpelAvgVarianceTest,
+    ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_c),
+                      make_tuple(2, 3, subpel_avg_variance4x8_c),
+                      make_tuple(3, 2, subpel_avg_variance8x4_c),
+                      make_tuple(3, 3, subpel_avg_variance8x8_c),
+                      make_tuple(3, 4, subpel_avg_variance8x16_c),
+                      make_tuple(4, 3, subpel_avg_variance16x8_c),
+                      make_tuple(4, 4, subpel_avg_variance16x16_c),
+                      make_tuple(4, 5, subpel_avg_variance16x32_c),
+                      make_tuple(5, 4, subpel_avg_variance32x16_c),
+                      make_tuple(5, 5, subpel_avg_variance32x32_c),
+                      make_tuple(5, 6, subpel_avg_variance32x64_c),
+                      make_tuple(6, 5, subpel_avg_variance64x32_c),
+                      make_tuple(6, 6, subpel_avg_variance64x64_c)));
+
 #if HAVE_MMX
 const vp9_variance_fn_t variance4x4_mmx = vp9_variance4x4_mmx;
 const vp9_variance_fn_t variance8x8_mmx = vp9_variance8x8_mmx;
@@ -446,6 +544,48 @@ INSTANTIATE_TEST_CASE_P(
                       make_tuple(5, 6, subpel_variance32x64_sse2),
                       make_tuple(6, 5, subpel_variance64x32_sse2),
                       make_tuple(6, 6, subpel_variance64x64_sse2)));
+
+const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_sse =
+    vp9_sub_pixel_avg_variance4x4_sse;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_sse =
+    vp9_sub_pixel_avg_variance4x8_sse;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance8x4_sse2 =
+    vp9_sub_pixel_avg_variance8x4_sse2;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance8x8_sse2 =
+    vp9_sub_pixel_avg_variance8x8_sse2;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance8x16_sse2 =
+    vp9_sub_pixel_avg_variance8x16_sse2;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance16x8_sse2 =
+    vp9_sub_pixel_avg_variance16x8_sse2;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance16x16_sse2 =
+    vp9_sub_pixel_avg_variance16x16_sse2;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance16x32_sse2 =
+    vp9_sub_pixel_avg_variance16x32_sse2;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance32x16_sse2 =
+    vp9_sub_pixel_avg_variance32x16_sse2;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_sse2 =
+    vp9_sub_pixel_avg_variance32x32_sse2;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance32x64_sse2 =
+    vp9_sub_pixel_avg_variance32x64_sse2;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance64x32_sse2 =
+    vp9_sub_pixel_avg_variance64x32_sse2;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_sse2 =
+    vp9_sub_pixel_avg_variance64x64_sse2;
+INSTANTIATE_TEST_CASE_P(
+    SSE2, VP9SubpelAvgVarianceTest,
+    ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_sse),
+                      make_tuple(2, 3, subpel_avg_variance4x8_sse),
+                      make_tuple(3, 2, subpel_avg_variance8x4_sse2),
+                      make_tuple(3, 3, subpel_avg_variance8x8_sse2),
+                      make_tuple(3, 4, subpel_avg_variance8x16_sse2),
+                      make_tuple(4, 3, subpel_avg_variance16x8_sse2),
+                      make_tuple(4, 4, subpel_avg_variance16x16_sse2),
+                      make_tuple(4, 5, subpel_avg_variance16x32_sse2),
+                      make_tuple(5, 4, subpel_avg_variance32x16_sse2),
+                      make_tuple(5, 5, subpel_avg_variance32x32_sse2),
+                      make_tuple(5, 6, subpel_avg_variance32x64_sse2),
+                      make_tuple(6, 5, subpel_avg_variance64x32_sse2),
+                      make_tuple(6, 6, subpel_avg_variance64x64_sse2)));
 #endif
 
 #if HAVE_SSSE3
@@ -490,6 +630,48 @@ INSTANTIATE_TEST_CASE_P(
                       make_tuple(5, 6, subpel_variance32x64_ssse3),
                       make_tuple(6, 5, subpel_variance64x32_ssse3),
                       make_tuple(6, 6, subpel_variance64x64_ssse3)));
+
+const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_ssse3 =
+    vp9_sub_pixel_avg_variance4x4_ssse3;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_ssse3 =
+    vp9_sub_pixel_avg_variance4x8_ssse3;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance8x4_ssse3 =
+    vp9_sub_pixel_avg_variance8x4_ssse3;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance8x8_ssse3 =
+    vp9_sub_pixel_avg_variance8x8_ssse3;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance8x16_ssse3 =
+    vp9_sub_pixel_avg_variance8x16_ssse3;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance16x8_ssse3 =
+    vp9_sub_pixel_avg_variance16x8_ssse3;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance16x16_ssse3 =
+    vp9_sub_pixel_avg_variance16x16_ssse3;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance16x32_ssse3 =
+    vp9_sub_pixel_avg_variance16x32_ssse3;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance32x16_ssse3 =
+    vp9_sub_pixel_avg_variance32x16_ssse3;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_ssse3 =
+    vp9_sub_pixel_avg_variance32x32_ssse3;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance32x64_ssse3 =
+    vp9_sub_pixel_avg_variance32x64_ssse3;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance64x32_ssse3 =
+    vp9_sub_pixel_avg_variance64x32_ssse3;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_ssse3 =
+    vp9_sub_pixel_avg_variance64x64_ssse3;
+INSTANTIATE_TEST_CASE_P(
+    SSSE3, VP9SubpelAvgVarianceTest,
+    ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_ssse3),
+                      make_tuple(2, 3, subpel_avg_variance4x8_ssse3),
+                      make_tuple(3, 2, subpel_avg_variance8x4_ssse3),
+                      make_tuple(3, 3, subpel_avg_variance8x8_ssse3),
+                      make_tuple(3, 4, subpel_avg_variance8x16_ssse3),
+                      make_tuple(4, 3, subpel_avg_variance16x8_ssse3),
+                      make_tuple(4, 4, subpel_avg_variance16x16_ssse3),
+                      make_tuple(4, 5, subpel_avg_variance16x32_ssse3),
+                      make_tuple(5, 4, subpel_avg_variance32x16_ssse3),
+                      make_tuple(5, 5, subpel_avg_variance32x32_ssse3),
+                      make_tuple(5, 6, subpel_avg_variance32x64_ssse3),
+                      make_tuple(6, 5, subpel_avg_variance64x32_ssse3),
+                      make_tuple(6, 6, subpel_avg_variance64x64_ssse3)));
 #endif
 #endif  // CONFIG_VP9_ENCODER
 
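With the C, SSE2 and SSSE3 instantiations in place, the new reference check can be run in isolation through gtest's filter flag (binary name assumed to be the usual libvpx test driver):

    ./test_libvpx --gtest_filter='*SubpelAvgVarianceTest*'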
@@ -269,81 +269,81 @@ prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int
 specialize vp9_sub_pixel_variance64x64 sse2 ssse3
 
 prototype unsigned int vp9_sub_pixel_avg_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance64x64
+specialize vp9_sub_pixel_avg_variance64x64 sse2 ssse3
 
 prototype unsigned int vp9_sub_pixel_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance32x64 sse2 ssse3
 
 prototype unsigned int vp9_sub_pixel_avg_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance32x64
+specialize vp9_sub_pixel_avg_variance32x64 sse2 ssse3
 
 prototype unsigned int vp9_sub_pixel_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance64x32 sse2 ssse3
 
 prototype unsigned int vp9_sub_pixel_avg_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance64x32
+specialize vp9_sub_pixel_avg_variance64x32 sse2 ssse3
 
 prototype unsigned int vp9_sub_pixel_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance32x16 sse2 ssse3
 
 prototype unsigned int vp9_sub_pixel_avg_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance32x16
+specialize vp9_sub_pixel_avg_variance32x16 sse2 ssse3
 
 prototype unsigned int vp9_sub_pixel_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance16x32 sse2 ssse3
 
 prototype unsigned int vp9_sub_pixel_avg_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance16x32
+specialize vp9_sub_pixel_avg_variance16x32 sse2 ssse3
 
 prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance32x32 sse2 ssse3
 
 prototype unsigned int vp9_sub_pixel_avg_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance32x32
+specialize vp9_sub_pixel_avg_variance32x32 sse2 ssse3
 
 prototype unsigned int vp9_sub_pixel_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance16x16 sse2 ssse3
 
 prototype unsigned int vp9_sub_pixel_avg_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance16x16
+specialize vp9_sub_pixel_avg_variance16x16 sse2 ssse3
 
 prototype unsigned int vp9_sub_pixel_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance8x16 sse2 ssse3
 
 prototype unsigned int vp9_sub_pixel_avg_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance8x16
+specialize vp9_sub_pixel_avg_variance8x16 sse2 ssse3
 
 prototype unsigned int vp9_sub_pixel_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance16x8 sse2 ssse3
 
 prototype unsigned int vp9_sub_pixel_avg_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance16x8
+specialize vp9_sub_pixel_avg_variance16x8 sse2 ssse3
 
 prototype unsigned int vp9_sub_pixel_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance8x8 sse2 ssse3
 
 prototype unsigned int vp9_sub_pixel_avg_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance8x8
+specialize vp9_sub_pixel_avg_variance8x8 sse2 ssse3
 
 # TODO(jingning): need to convert 8x4/4x8 functions into mmx/sse form
 prototype unsigned int vp9_sub_pixel_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance8x4 sse2 ssse3
 
 prototype unsigned int vp9_sub_pixel_avg_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance8x4
+specialize vp9_sub_pixel_avg_variance8x4 sse2 ssse3
 
 prototype unsigned int vp9_sub_pixel_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance4x8 sse ssse3
 
 prototype unsigned int vp9_sub_pixel_avg_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance4x8
+specialize vp9_sub_pixel_avg_variance4x8 sse ssse3
 
 prototype unsigned int vp9_sub_pixel_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance4x4 sse ssse3
 #vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt
 
 prototype unsigned int vp9_sub_pixel_avg_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance4x4
+specialize vp9_sub_pixel_avg_variance4x4 sse ssse3
 
 prototype unsigned int vp9_sad64x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
 specialize vp9_sad64x64 sse2
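Each `specialize` line above feeds libvpx's RTCD generator, which emits one function pointer per prototype and points it at the best available flavor at run time. A rough C sketch of the generated dispatch (simplified; the real generated vp9_rtcd.h differs in detail):

    /* Simplified sketch of RTCD dispatch -- not the literal generated code. */
    vp9_sub_pixel_avg_variance16x16 = vp9_sub_pixel_avg_variance16x16_c;
    if (flags & HAS_SSE2)
      vp9_sub_pixel_avg_variance16x16 = vp9_sub_pixel_avg_variance16x16_sse2;
    if (flags & HAS_SSSE3)
      vp9_sub_pixel_avg_variance16x16 = vp9_sub_pixel_avg_variance16x16_ssse3;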
@@ -116,7 +116,7 @@ bilin_filter_m_ssse3: times 8 db 16, 0
   RET
 %endmacro
 
-%macro SUBPEL_VARIANCE 1 ; W
+%macro SUBPEL_VARIANCE 1-2 0 ; W
 %if cpuflag(ssse3)
 %define bilin_filter_m bilin_filter_m_ssse3
 %define filter_idx_shift 4
@@ -128,12 +128,38 @@ bilin_filter_m_ssse3: times 8 db 16, 0
 ; 11, not 13, if the registers are ordered correctly. May make a minor speed
 ; difference on Win64
 %ifdef PIC
+%if %2 == 1 ; avg
+cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
+                                    x_offset, y_offset, \
+                                    dst, dst_stride, \
+                                    sec, sec_stride, height, sse
+%define sec_str sec_strideq
+%else
 cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, y_offset, \
                                 dst, dst_stride, height, sse
+%endif
+%define h heightd
 %define bilin_filter sseq
 %else
+%if %2 == 1 ; avg
+cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
+                                    7 + 2 * ARCH_X86_64, 13, src, src_stride, \
+                                    x_offset, y_offset, \
+                                    dst, dst_stride, \
+                                    sec, sec_stride, \
+                                    height, sse
+%if ARCH_X86_64
+%define h heightd
+%define sec_str sec_strideq
+%else
+%define h dword heightm
+%define sec_str sec_stridemp
+%endif
+%else
 cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
                                 dst, dst_stride, height, sse
+%define h heightd
+%endif
 %define bilin_filter bilin_filter_m
 %endif
 ASSERT %1 <= 16 ; m6 overflows if w > 16
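For readers unfamiliar with x86inc, cglobal's numeric parameters are the argument count, the number of general-purpose registers, and the number of XMM registers the function uses, followed by the named arguments; the avg entry points therefore declare two extra arguments (sec, sec_stride) and, in the PIC case, one extra GPR.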
@@ -143,7 +169,10 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
 ; could perhaps use it for something more productive then
   pxor                 m5, m5 ; dedicated zero register
 %if %1 < 16
-  sar             heightd, 1
+  sar                   h, 1
+%if %2 == 1 ; avg
+  shl             sec_str, 1
+%endif
 %endif
 
 ; FIXME(rbultje) replace by jumptable?
@@ -158,30 +187,55 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
 %if %1 == 16
   movu                 m0, [srcq]
   mova                 m1, [dstq]
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
+%if %2 == 1 ; avg
+  pavgb                m0, [secq]
   punpckhbw            m3, m1, m5
   punpcklbw            m1, m5
+%endif
+  punpckhbw            m2, m0, m5
+  punpcklbw            m0, m5
+%if %2 == 0 ; !avg
+  punpckhbw            m3, m1, m5
+  punpcklbw            m1, m5
+%endif
   SUM_SSE              m0, m1, m2, m3, m6, m7
 
   add                srcq, src_strideq
   add                dstq, dst_strideq
-  dec             heightd
 %else ; %1 < 16
   movh                 m0, [srcq]
+%if %2 == 1 ; avg
+%if mmsize == 16
+  movhps               m0, [srcq+src_strideq]
+%else ; mmsize == 8
+  punpckldq            m0, [srcq+src_strideq]
+%endif
+%else ; !avg
   movh                 m2, [srcq+src_strideq]
+%endif
   movh                 m1, [dstq]
   movh                 m3, [dstq+dst_strideq]
+%if %2 == 1 ; avg
+  pavgb                m0, [secq]
+  punpcklbw            m3, m5
+  punpcklbw            m1, m5
+  punpckhbw            m2, m0, m5
+  punpcklbw            m0, m5
+%else ; !avg
   punpcklbw            m0, m5
   punpcklbw            m2, m5
   punpcklbw            m3, m5
   punpcklbw            m1, m5
+%endif
   SUM_SSE              m0, m1, m2, m3, m6, m7
 
   lea                srcq, [srcq+src_strideq*2]
   lea                dstq, [dstq+dst_strideq*2]
-  dec             heightd
 %endif
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+  dec                   h
   jg .x_zero_y_zero_loop
   STORE_AND_RET
 
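The only new arithmetic in the avg paths is pavgb against [secq]: an unsigned byte average with rounding up, matching the `(r + second_pred[...] + 1) >> 1` step of the test's reference function. As a scalar C model:

    /* Scalar model of the pavgb step: per-byte (a + b + 1) >> 1. */
    static inline uint8_t avg_round(uint8_t a, uint8_t b) {
      return (uint8_t)(((unsigned)a + b + 1) >> 1);
    }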
@@ -196,18 +250,40 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
   movu                 m4, [srcq+src_strideq]
   mova                 m1, [dstq]
   pavgb                m0, m4
+  punpckhbw            m3, m1, m5
+%if %2 == 1 ; avg
+  pavgb                m0, [secq]
+%endif
+  punpcklbw            m1, m5
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
-  punpckhbw            m3, m1, m5
-  punpcklbw            m1, m5
   SUM_SSE              m0, m1, m2, m3, m6, m7
 
   add                srcq, src_strideq
   add                dstq, dst_strideq
-  dec             heightd
 %else ; %1 < 16
   movh                 m0, [srcq]
   movh                 m2, [srcq+src_strideq]
+%if %2 == 1 ; avg
+%if mmsize == 16
+  movhps               m2, [srcq+src_strideq*2]
+%else ; mmsize == 8
+  punpckldq            m2, [srcq+src_strideq*2]
+%endif
+  movh                 m1, [dstq]
+%if mmsize == 16
+  movlhps              m0, m2
+%else ; mmsize == 8
+  punpckldq            m0, m2
+%endif
+  movh                 m3, [dstq+dst_strideq]
+  pavgb                m0, m2
+  punpcklbw            m1, m5
+  pavgb                m0, [secq]
+  punpcklbw            m3, m5
+  punpckhbw            m2, m0, m5
+  punpcklbw            m0, m5
+%else ; !avg
   movh                 m4, [srcq+src_strideq*2]
   movh                 m1, [dstq]
   pavgb                m0, m2
@@ -217,12 +293,16 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
   punpcklbw            m2, m5
   punpcklbw            m3, m5
   punpcklbw            m1, m5
+%endif
   SUM_SSE              m0, m1, m2, m3, m6, m7
 
   lea                srcq, [srcq+src_strideq*2]
   lea                dstq, [dstq+dst_strideq*2]
-  dec             heightd
 %endif
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+  dec                   h
   jg .x_zero_y_half_loop
   STORE_AND_RET
 
@@ -280,13 +360,19 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
 %endif
   psraw                m2, 4
   psraw                m0, 4
+%if %2 == 1 ; avg
+  ; FIXME(rbultje) pipeline
+  packuswb             m0, m2
+  pavgb                m0, [secq]
+  punpckhbw            m2, m0, m5
+  punpcklbw            m0, m5
+%endif
   punpckhbw            m3, m1, m5
   punpcklbw            m1, m5
   SUM_SSE              m0, m1, m2, m3, m6, m7
 
   add                srcq, src_strideq
   add                dstq, dst_strideq
-  dec             heightd
 %else ; %1 < 16
   movh                 m0, [srcq]
   movh                 m2, [srcq+src_strideq]
@@ -318,13 +404,23 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
 %endif
   psraw                m0, 4
   psraw                m2, 4
+%if %2 == 1 ; avg
+  ; FIXME(rbultje) pipeline
+  packuswb             m0, m2
+  pavgb                m0, [secq]
+  punpckhbw            m2, m0, m5
+  punpcklbw            m0, m5
+%endif
   punpcklbw            m1, m5
   SUM_SSE              m0, m1, m2, m3, m6, m7
 
   lea                srcq, [srcq+src_strideq*2]
   lea                dstq, [dstq+dst_strideq*2]
-  dec             heightd
 %endif
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+  dec                   h
   jg .x_zero_y_other_loop
 %undef filter_y_a
 %undef filter_y_b
@@ -345,18 +441,37 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
   movu                 m4, [srcq+1]
   mova                 m1, [dstq]
   pavgb                m0, m4
+  punpckhbw            m3, m1, m5
+%if %2 == 1 ; avg
+  pavgb                m0, [secq]
+%endif
+  punpcklbw            m1, m5
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
-  punpckhbw            m3, m1, m5
-  punpcklbw            m1, m5
   SUM_SSE              m0, m1, m2, m3, m6, m7
 
   add                srcq, src_strideq
   add                dstq, dst_strideq
-  dec             heightd
 %else ; %1 < 16
   movh                 m0, [srcq]
   movh                 m4, [srcq+1]
+%if %2 == 1 ; avg
+%if mmsize == 16
+  movhps               m0, [srcq+src_strideq]
+  movhps               m4, [srcq+src_strideq+1]
+%else ; mmsize == 8
+  punpckldq            m0, [srcq+src_strideq]
+  punpckldq            m4, [srcq+src_strideq+1]
+%endif
+  movh                 m1, [dstq]
+  movh                 m3, [dstq+dst_strideq]
+  pavgb                m0, m4
+  punpcklbw            m3, m5
+  pavgb                m0, [secq]
+  punpcklbw            m1, m5
+  punpckhbw            m2, m0, m5
+  punpcklbw            m0, m5
+%else ; !avg
   movh                 m2, [srcq+src_strideq]
   movh                 m1, [dstq]
   pavgb                m0, m4
@@ -367,12 +482,16 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
   punpcklbw            m2, m5
   punpcklbw            m3, m5
   punpcklbw            m1, m5
+%endif
   SUM_SSE              m0, m1, m2, m3, m6, m7
 
   lea                srcq, [srcq+src_strideq*2]
   lea                dstq, [dstq+dst_strideq*2]
-  dec             heightd
 %endif
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+  dec                   h
   jg .x_half_y_zero_loop
   STORE_AND_RET
 
@@ -391,17 +510,23 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
   movu                 m3, [srcq+1]
   mova                 m1, [dstq]
   pavgb                m4, m3
+  punpckhbw            m3, m1, m5
   pavgb                m0, m4
+%if %2 == 1 ; avg
+  punpcklbw            m1, m5
+  pavgb                m0, [secq]
+  punpckhbw            m2, m0, m5
+  punpcklbw            m0, m5
+%else
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
-  punpckhbw            m3, m1, m5
   punpcklbw            m1, m5
+%endif
   SUM_SSE              m0, m1, m2, m3, m6, m7
   mova                 m0, m4
 
   add                srcq, src_strideq
   add                dstq, dst_strideq
-  dec             heightd
 %else ; %1 < 16
   movh                 m0, [srcq]
   movh                 m3, [srcq+1]
@@ -410,6 +535,31 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
 .x_half_y_half_loop:
   movh                 m2, [srcq]
   movh                 m3, [srcq+1]
+%if %2 == 1 ; avg
+%if mmsize == 16
+  movhps               m2, [srcq+src_strideq]
+  movhps               m3, [srcq+src_strideq+1]
+%else
+  punpckldq            m2, [srcq+src_strideq]
+  punpckldq            m3, [srcq+src_strideq+1]
+%endif
+  pavgb                m2, m3
+%if mmsize == 16
+  movlhps              m0, m2
+  movhlps              m4, m2
+%else ; mmsize == 8
+  punpckldq            m0, m2
+  pshufw               m4, m2, 0xe
+%endif
+  movh                 m1, [dstq]
+  pavgb                m0, m2
+  movh                 m3, [dstq+dst_strideq]
+  pavgb                m0, [secq]
+  punpcklbw            m3, m5
+  punpcklbw            m1, m5
+  punpckhbw            m2, m0, m5
+  punpcklbw            m0, m5
+%else ; !avg
   movh                 m4, [srcq+src_strideq]
   movh                 m1, [srcq+src_strideq+1]
   pavgb                m2, m3
@@ -422,13 +572,17 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
   punpcklbw            m2, m5
   punpcklbw            m3, m5
   punpcklbw            m1, m5
+%endif
   SUM_SSE              m0, m1, m2, m3, m6, m7
   mova                 m0, m4
 
   lea                srcq, [srcq+src_strideq*2]
   lea                dstq, [dstq+dst_strideq*2]
-  dec             heightd
 %endif
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+  dec                   h
   jg .x_half_y_half_loop
   STORE_AND_RET
 
@@ -488,13 +642,19 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
 %endif
   punpckhbw            m3, m1, m5
   psraw                m0, 4
+%if %2 == 1 ; avg
+  ; FIXME(rbultje) pipeline
+  packuswb             m0, m2
+  pavgb                m0, [secq]
+  punpckhbw            m2, m0, m5
+  punpcklbw            m0, m5
+%endif
   punpcklbw            m1, m5
   SUM_SSE              m0, m1, m2, m3, m6, m7
   mova                 m0, m4
 
   add                srcq, src_strideq
   add                dstq, dst_strideq
-  dec             heightd
 %else ; %1 < 16
   movh                 m0, [srcq]
   movh                 m3, [srcq+1]
@@ -536,14 +696,24 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
 %endif
   psraw                m0, 4
   psraw                m2, 4
+%if %2 == 1 ; avg
+  ; FIXME(rbultje) pipeline
+  packuswb             m0, m2
+  pavgb                m0, [secq]
+  punpckhbw            m2, m0, m5
+  punpcklbw            m0, m5
+%endif
   punpcklbw            m1, m5
   SUM_SSE              m0, m1, m2, m3, m6, m7
   mova                 m0, m4
 
   lea                srcq, [srcq+src_strideq*2]
   lea                dstq, [dstq+dst_strideq*2]
-  dec             heightd
 %endif
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+  dec                   h
   jg .x_half_y_other_loop
 %undef filter_y_a
 %undef filter_y_b
@@ -602,13 +772,19 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
 %endif
   psraw                m2, 4
   psraw                m0, 4
+%if %2 == 1 ; avg
+  ; FIXME(rbultje) pipeline
+  packuswb             m0, m2
+  pavgb                m0, [secq]
+  punpckhbw            m2, m0, m5
+  punpcklbw            m0, m5
+%endif
   punpckhbw            m3, m1, m5
   punpcklbw            m1, m5
   SUM_SSE              m0, m1, m2, m3, m6, m7
 
   add                srcq, src_strideq
   add                dstq, dst_strideq
-  dec             heightd
 %else ; %1 < 16
   movh                 m0, [srcq]
   movh                 m1, [srcq+1]
@@ -642,13 +818,23 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
 %endif
   psraw                m0, 4
   psraw                m2, 4
+%if %2 == 1 ; avg
+  ; FIXME(rbultje) pipeline
+  packuswb             m0, m2
+  pavgb                m0, [secq]
+  punpckhbw            m2, m0, m5
+  punpcklbw            m0, m5
+%endif
   punpcklbw            m1, m5
   SUM_SSE              m0, m1, m2, m3, m6, m7
 
   lea                srcq, [srcq+src_strideq*2]
   lea                dstq, [dstq+dst_strideq*2]
-  dec             heightd
 %endif
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+  dec                   h
   jg .x_other_y_zero_loop
 %undef filter_x_a
 %undef filter_x_b
@@ -724,8 +910,6 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
   pavgb                m0, m4
   punpckhbw            m3, m1, m5
   punpcklbw            m1, m5
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
 %else
   punpckhbw            m2, m4, m5
   punpckhbw            m1, m3, m5
@@ -750,15 +934,18 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
   packuswb             m4, m2
   punpcklbw            m1, m5
   pavgb                m0, m4
+%endif
+%if %2 == 1 ; avg
+  ; FIXME(rbultje) pipeline
+  pavgb                m0, [secq]
+%endif
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
-%endif
   SUM_SSE              m0, m1, m2, m3, m6, m7
   mova                 m0, m4
 
   add                srcq, src_strideq
   add                dstq, dst_strideq
-  dec             heightd
 %else ; %1 < 16
   movh                 m0, [srcq]
   movh                 m1, [srcq+1]
@@ -810,6 +997,13 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
   psraw                m4, 4
   pavgw                m0, m2
   pavgw                m2, m4
+%if %2 == 1 ; avg
+  ; FIXME(rbultje) pipeline - also consider going to bytes here
+  packuswb             m0, m2
+  pavgb                m0, [secq]
+  punpckhbw            m2, m0, m5
+  punpcklbw            m0, m5
+%endif
   punpcklbw            m3, m5
   punpcklbw            m1, m5
   SUM_SSE              m0, m1, m2, m3, m6, m7
@@ -817,8 +1011,11 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
 
   lea                srcq, [srcq+src_strideq*2]
   lea                dstq, [dstq+dst_strideq*2]
-  dec             heightd
 %endif
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+  dec                   h
   jg .x_other_y_half_loop
 %undef filter_x_a
 %undef filter_x_b
@@ -941,13 +1138,19 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
   punpckhbw            m3, m1, m5
   psraw                m0, 4
   punpcklbw            m1, m5
+%endif
+%if %2 == 1 ; avg
+  ; FIXME(rbultje) pipeline
+  packuswb             m0, m2
+  pavgb                m0, [secq]
+  punpckhbw            m2, m0, m5
+  punpcklbw            m0, m5
 %endif
   SUM_SSE              m0, m1, m2, m3, m6, m7
   mova                 m0, m4
 
   add                srcq, src_strideq
   add                dstq, dst_strideq
-  dec             heightd
 %else ; %1 < 16
   movh                 m0, [srcq]
   movh                 m1, [srcq+1]
@@ -1025,14 +1228,24 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
   psraw                m2, 4
   punpcklbw            m3, m5
   punpcklbw            m1, m5
+%endif
+%if %2 == 1 ; avg
+  ; FIXME(rbultje) pipeline
+  packuswb             m0, m2
+  pavgb                m0, [secq]
+  punpckhbw            m2, m0, m5
+  punpcklbw            m0, m5
 %endif
   SUM_SSE              m0, m1, m2, m3, m6, m7
   mova                 m0, m4
 
   lea                srcq, [srcq+src_strideq*2]
   lea                dstq, [dstq+dst_strideq*2]
-  dec             heightd
 %endif
+%if %2 == 1 ; avg
+  add                secq, sec_str
+%endif
+  dec                   h
   jg .x_other_y_other_loop
 %undef filter_x_a
 %undef filter_x_b
@@ -1059,3 +1272,15 @@ SUBPEL_VARIANCE 4
 INIT_XMM ssse3
 SUBPEL_VARIANCE 8
 SUBPEL_VARIANCE 16
+
+INIT_MMX sse
+SUBPEL_VARIANCE 4, 1
+INIT_XMM sse2
+SUBPEL_VARIANCE 8, 1
+SUBPEL_VARIANCE 16, 1
+
+INIT_MMX ssse3
+SUBPEL_VARIANCE 4, 1
+INIT_XMM ssse3
+SUBPEL_VARIANCE 8, 1
+SUBPEL_VARIANCE 16, 1
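Passing `1` as the macro's new second parameter re-expands the same loop bodies with the averaging code enabled, so each INIT_*/SUBPEL_VARIANCE pair above emits a sub_pixel_avg_variance%1xh entry point alongside the existing plain variance one.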
@@ -343,29 +343,22 @@ unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr,
   return (var - (((int64_t)avg * avg) >> 11));
 }
 
-#define DECLS(opt1, opt2) \
-int vp9_sub_pixel_variance4xh_##opt2(const uint8_t *src, \
-                                     ptrdiff_t src_stride, \
-                                     int x_offset, int y_offset, \
-                                     const uint8_t *dst, \
-                                     ptrdiff_t dst_stride, \
-                                     int height, unsigned int *sse); \
-int vp9_sub_pixel_variance8xh_##opt1(const uint8_t *src, \
-                                     ptrdiff_t src_stride, \
-                                     int x_offset, int y_offset, \
-                                     const uint8_t *dst, \
-                                     ptrdiff_t dst_stride, \
-                                     int height, unsigned int *sse); \
-int vp9_sub_pixel_variance16xh_##opt1(const uint8_t *src, \
+#define DECL(w, opt) \
+int vp9_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \
                                         ptrdiff_t src_stride, \
                                         int x_offset, int y_offset, \
                                         const uint8_t *dst, \
                                         ptrdiff_t dst_stride, \
                                         int height, unsigned int *sse)
+#define DECLS(opt1, opt2) \
+DECL(4, opt2); \
+DECL(8, opt1); \
+DECL(16, opt1)
 
 DECLS(sse2, sse);
 DECLS(ssse3, ssse3);
 #undef DECLS
+#undef DECL
 
 #define FN(w, h, wf, wlog2, hlog2, opt, cast) \
 unsigned int vp9_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \
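For illustration, `DECLS(sse2, sse)` expands to the same three prototypes the old macro spelled out by hand (argument lists elided after the first):

    int vp9_sub_pixel_variance4xh_sse(const uint8_t *src, ptrdiff_t src_stride,
                                      int x_offset, int y_offset,
                                      const uint8_t *dst, ptrdiff_t dst_stride,
                                      int height, unsigned int *sse);
    int vp9_sub_pixel_variance8xh_sse2(/* same argument list */);
    int vp9_sub_pixel_variance16xh_sse2(/* same argument list */);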
@@ -427,6 +420,86 @@ FNS(ssse3, ssse3);
 #undef FNS
 #undef FN
 
+#define DECL(w, opt) \
+int vp9_sub_pixel_avg_variance##w##xh_##opt(const uint8_t *src, \
+                                            ptrdiff_t src_stride, \
+                                            int x_offset, int y_offset, \
+                                            const uint8_t *dst, \
+                                            ptrdiff_t dst_stride, \
+                                            const uint8_t *sec, \
+                                            ptrdiff_t sec_stride, \
+                                            int height, unsigned int *sse)
+#define DECLS(opt1, opt2) \
+DECL(4, opt2); \
+DECL(8, opt1); \
+DECL(16, opt1)
+
+DECLS(sse2, sse);
+DECLS(ssse3, ssse3);
+#undef DECL
+#undef DECLS
+
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+unsigned int vp9_sub_pixel_avg_variance##w##x##h##_##opt(const uint8_t *src, \
+                                                         int src_stride, \
+                                                         int x_offset, \
+                                                         int y_offset, \
+                                                         const uint8_t *dst, \
+                                                         int dst_stride, \
+                                                         unsigned int *sseptr, \
+                                                         const uint8_t *sec) { \
+  unsigned int sse; \
+  int se = vp9_sub_pixel_avg_variance##wf##xh_##opt(src, src_stride, x_offset, \
+                                                    y_offset, dst, dst_stride, \
+                                                    sec, w, h, &sse); \
+  if (w > wf) { \
+    unsigned int sse2; \
+    int se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 16, src_stride, \
+                                                       x_offset, y_offset, \
+                                                       dst + 16, dst_stride, \
+                                                       sec + 16, w, h, &sse2); \
+    se += se2; \
+    sse += sse2; \
+    if (w > wf * 2) { \
+      se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 32, src_stride, \
+                                                     x_offset, y_offset, \
+                                                     dst + 32, dst_stride, \
+                                                     sec + 32, w, h, &sse2); \
+      se += se2; \
+      sse += sse2; \
+      se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 48, src_stride, \
+                                                     x_offset, y_offset, \
+                                                     dst + 48, dst_stride, \
+                                                     sec + 48, w, h, &sse2); \
+      se += se2; \
+      sse += sse2; \
+    } \
+  } \
+  *sseptr = sse; \
+  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+}
+
+#define FNS(opt1, opt2) \
+FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
+FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
+FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
+FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
+FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
+FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
+FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
+FN(16, 8, 16, 4, 3, opt1,); \
+FN(8, 16, 8, 3, 4, opt1,); \
+FN(8, 8, 8, 3, 3, opt1,); \
+FN(8, 4, 8, 3, 2, opt1,); \
+FN(4, 8, 4, 2, 3, opt2,); \
+FN(4, 4, 4, 2, 2, opt2,)
+
+FNS(sse2, sse);
+FNS(ssse3, ssse3);
+
+#undef FNS
+#undef FN
+
 unsigned int vp9_variance_halfpixvar16x16_h_wmt(
   const unsigned char *src_ptr,
   int src_pixels_per_line,
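For context, a hypothetical call site; the buffer names and strides here are assumptions for illustration, not code from this commit, and the argument order follows the prototype strings earlier in this diff:

    /* Hypothetical usage sketch -- src_ptr is the block that gets
       bilinearly interpolated, second_pred is averaged in via pavgb. */
    unsigned int sse;
    const unsigned int var = vp9_sub_pixel_avg_variance16x16_sse2(
        src_ptr, source_stride, xoffset, yoffset,
        ref_ptr, ref_stride, &sse, second_pred);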