Compare commits
144 Commits
sandbox/wa...highbitdep
SHA1:
fd05fb0c21 39da55a49f 6d741e4d76 db7192e0b0 08d2f54800 a1b726117f
005d80cd05 d7422b2b1e 454342d4e7 cda2ad0121 6b378b8868 123f29d1d7
4230c2306c 5b76018057 b1a6f6b9cb a92f987a6b 1bf87dc353 b84bf3323b
563aeba901 93657ee6ec 3df0e78eae e27a93fe25 159247f30c 06cfc9f3e1
85dba225a8 53ef87e0c7 09faceb706 75c8fc2412 635bb7aed4 dd0a5ecd2c
3d170a834a 1db0b0ff1b 902529c595 7dc3cdba0c f9b4008020 64fca22b4d
b94a475d7b 325fe4a430 484a7f31a6 059a721d92 23b5c58174 02118dcb3b
3489c19d2b 9595633751 cea11ebb1e 60ee54352e 4b11ea5e32 27dc02de95
b7327fd3ea 6791a9b1d5 3edc408011 41c8641b6b 0a904797f6 cc3e6d68df
aba709a5ec b0bcd57d12 78df9ead21 cba05a9ecb 87d40bbafc da82f64342
02c3221593 139ac07ae7 209885c785 000ea5e446 c62b82d7c9 7e802a5bba
8a50b60c9c d5f6cd1b38 91c222491e 2cf1232bd6 3cd669c548 56f2cb9478
6dd6bb6602 225c60848b 67dd2eaff7 d61fbfd760 6c9426809f 7f0eaadadc
7051f38e5a d21a89895f 149c891dac 939f871ccc 6eb5e8ded6 a13f2137fa
8ba0eeba1b 8ca39ede47 db55558e0b 4c017c00e6 20e745e8e4 e0305995f3
7ea0334f9a 65d2615208 9b33e1088f 8b72b71c1c 093a32ffd7 321bd42060
f08489e609 4320ac26ee e91d29dea3 091829d376 666fd1300c 47354ee2f4
0c95fcc25c 3224fd3c66 822c27cd42 42a1a3e3ba 51790ab228 1ff621ec99
b7649e15c2 a79f06696d 2baec56312 eb863b46f3 5a2a78117f cab30216a5
efd115c415 edd1fa0487 747f0e3b8e 094f0024c3 018173cf91 87a571a34e
e037f0fa0c 4cba43ac3d aa2d8ca7e2 c4a5ef1ced 99e1518a16 83e566029c
2535202902 a9ed7c4434 6956f43a90 9d427884e9 ed352156c9 df64b3d04a
11f75a26c1 81758337d5 4e016f6e21 22fbe77710 d26ae35646 0b8ae49a05
d868780eb9 bc0bc688c2 a242def2aa a0c772e381 d3e62b846b bdd7f74c3f
test/dct16x16_test.cc
@@ -264,6 +264,8 @@ typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
 typedef std::tr1::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t> Dct16x16Param;
 typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t> Ht16x16Param;
+typedef std::tr1::tuple<IdctFunc, IdctFunc, int, vpx_bit_depth_t>
+    Idct16x16Param;
 
 void fdct16x16_ref(const int16_t *in, tran_low_t *out, int stride,
                    int /*tx_type*/) {
@@ -311,6 +313,32 @@ void iht16x16_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
 void iht16x16_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
   vp9_highbd_iht16x16_256_add_c(in, out, stride, tx_type, 12);
 }
+
+void idct16x16_10_add_10_c(const tran_low_t *in, uint8_t *out, int stride) {
+  vp9_highbd_idct16x16_10_add_c(in, out, stride, 10);
+}
+
+void idct16x16_10_add_12_c(const tran_low_t *in, uint8_t *out, int stride) {
+  vp9_highbd_idct16x16_10_add_c(in, out, stride, 12);
+}
+
+#if HAVE_SSE2
+void idct16x16_256_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
+  vp9_highbd_idct16x16_256_add_sse2(in, out, stride, 10);
+}
+
+void idct16x16_256_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
+  vp9_highbd_idct16x16_256_add_sse2(in, out, stride, 12);
+}
+
+void idct16x16_10_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
+  vp9_highbd_idct16x16_10_add_sse2(in, out, stride, 10);
+}
+
+void idct16x16_10_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
+  vp9_highbd_idct16x16_10_add_sse2(in, out, stride, 12);
+}
+#endif
 #endif
 
 class Trans16x16TestBase {
@@ -540,7 +568,7 @@ class Trans16x16TestBase {
 
       reference_16x16_dct_2d(in, out_r);
       for (int j = 0; j < kNumCoeffs; ++j)
-        coeff[j] = round(out_r[j]);
+        coeff[j] = static_cast<tran_low_t>(round(out_r[j]));
 
       if (bit_depth_ == VPX_BITS_8) {
         ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, 16));
@@ -565,6 +593,62 @@ class Trans16x16TestBase {
       }
     }
   }
+  void CompareInvReference(IdctFunc ref_txfm, int thresh) {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 10000;
+    const int eob = 10;
+    const int16_t *scan = vp9_default_scan_orders[TX_16X16].scan;
+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, uint8_t, ref, kNumCoeffs);
+#if CONFIG_VP9_HIGHBITDEPTH
+    DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, uint16_t, ref16, kNumCoeffs);
+#endif
+
+    for (int i = 0; i < count_test_block; ++i) {
+      for (int j = 0; j < kNumCoeffs; ++j) {
+        if (j < eob) {
+          // Random values less than the threshold, either positive or negative
+          coeff[scan[j]] = rnd(thresh) * (1-2*(i%2));
+        } else {
+          coeff[scan[j]] = 0;
+        }
+        if (bit_depth_ == VPX_BITS_8) {
+          dst[j] = 0;
+          ref[j] = 0;
+#if CONFIG_VP9_HIGHBITDEPTH
+        } else {
+          dst16[j] = 0;
+          ref16[j] = 0;
+#endif
+        }
+      }
+      if (bit_depth_ == VPX_BITS_8) {
+        ref_txfm(coeff, ref, pitch_);
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
+      } else {
+#if CONFIG_VP9_HIGHBITDEPTH
+        ref_txfm(coeff, CONVERT_TO_BYTEPTR(ref16), pitch_);
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16),
+                                            pitch_));
+#endif
+      }
+
+      for (int j = 0; j < kNumCoeffs; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        const uint32_t diff =
+            bit_depth_ == VPX_BITS_8 ? dst[j] - ref[j] : dst16[j] - ref16[j];
+#else
+        const uint32_t diff = dst[j] - ref[j];
+#endif
+        const uint32_t error = diff * diff;
+        EXPECT_EQ(0u, error)
+            << "Error: 16x16 IDCT Comparison has error " << error
+            << " at index " << j;
+      }
+    }
+  }
   int pitch_;
   int tx_type_;
   vpx_bit_depth_t bit_depth_;
@@ -590,10 +674,10 @@ class Trans16x16DCT
     mask_ = (1 << bit_depth_) - 1;
 #if CONFIG_VP9_HIGHBITDEPTH
     switch (bit_depth_) {
-      case 10:
+      case VPX_BITS_10:
         inv_txfm_ref = idct16x16_10_ref;
         break;
-      case 12:
+      case VPX_BITS_12:
         inv_txfm_ref = idct16x16_12_ref;
         break;
       default:
@@ -703,6 +787,37 @@ TEST_P(Trans16x16HT, QuantCheck) {
   RunQuantCheck(429, 729);
 }
+
+class InvTrans16x16DCT
+    : public Trans16x16TestBase,
+      public ::testing::TestWithParam<Idct16x16Param> {
+ public:
+  virtual ~InvTrans16x16DCT() {}
+
+  virtual void SetUp() {
+    ref_txfm_ = GET_PARAM(0);
+    inv_txfm_ = GET_PARAM(1);
+    thresh_ = GET_PARAM(2);
+    bit_depth_ = GET_PARAM(3);
+    pitch_ = 16;
+    mask_ = (1 << bit_depth_) - 1;
+  }
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {}
+  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
+    inv_txfm_(out, dst, stride);
+  }
+
+  IdctFunc ref_txfm_;
+  IdctFunc inv_txfm_;
+  int thresh_;
+};
+
+TEST_P(InvTrans16x16DCT, CompareReference) {
+  CompareInvReference(ref_txfm_, thresh_);
+}
 
 using std::tr1::make_tuple;
 
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -772,6 +887,51 @@ INSTANTIATE_TEST_CASE_P(
                    VPX_BITS_8)));
 #endif
 
+#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(
+    SSE2, Trans16x16DCT,
+    ::testing::Values(
+        make_tuple(&vp9_highbd_fdct16x16_sse2,
+                   &idct16x16_10, 0, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fdct16x16_c,
+                   &idct16x16_256_add_10_sse2, 0, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fdct16x16_sse2,
+                   &idct16x16_12, 0, VPX_BITS_12),
+        make_tuple(&vp9_highbd_fdct16x16_c,
+                   &idct16x16_256_add_12_sse2, 0, VPX_BITS_12),
+        make_tuple(&vp9_fdct16x16_sse2,
+                   &vp9_idct16x16_256_add_c, 0, VPX_BITS_8)));
+INSTANTIATE_TEST_CASE_P(
+    SSE2, Trans16x16HT,
+    ::testing::Values(
+        make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_10, 0, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_10, 1, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_10, 2, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_10, 3, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_12, 0, VPX_BITS_12),
+        make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_12, 1, VPX_BITS_12),
+        make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_12, 2, VPX_BITS_12),
+        make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_12, 3, VPX_BITS_12),
+        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 3,
+                   VPX_BITS_8)));
+// Optimizations take effect at a threshold of 3155, so we use a value close to
+// that to test both branches.
+INSTANTIATE_TEST_CASE_P(
+    SSE2, InvTrans16x16DCT,
+    ::testing::Values(
+        make_tuple(&idct16x16_10_add_10_c,
+                   &idct16x16_10_add_10_sse2, 3167, VPX_BITS_10),
+        make_tuple(&idct16x16_10,
+                   &idct16x16_256_add_10_sse2, 3167, VPX_BITS_10),
+        make_tuple(&idct16x16_10_add_12_c,
+                   &idct16x16_10_add_12_sse2, 3167, VPX_BITS_12),
+        make_tuple(&idct16x16_12,
+                   &idct16x16_256_add_12_sse2, 3167, VPX_BITS_12)));
+#endif
 
 #if HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     SSSE3, Trans16x16DCT,
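Every hunk above repeats one pattern: the high-bit-depth kernels take an explicit bit-depth argument, and the tests pin that argument with a thin wrapper per depth so a single three-argument function-pointer type serves the 8-, 10- and 12-bit runs. A self-contained sketch of that pattern follows; highbd_idct_add is a hypothetical stand-in, not a libvpx function, and tran_low_t is simplified to a plain 32-bit type:

#include <cstdint>

typedef int32_t tran_low_t;  // high-bit-depth builds widen coefficients

// Hypothetical stand-in for a kernel such as vp9_highbd_idct16x16_256_add_c;
// the real transform body is elided.
void highbd_idct_add(const tran_low_t *in, uint8_t *out, int stride, int bd) {
  (void)in; (void)out; (void)stride; (void)bd;
}

// The common function-pointer type the parameterized tests run through.
typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride);

// One wrapper per bit depth pins the extra argument, so the 10- and 12-bit
// paths plug into the same tuple-driven tests as the 8-bit path.
void idct_add_10(const tran_low_t *in, uint8_t *out, int stride) {
  highbd_idct_add(in, out, stride, 10);
}

void idct_add_12(const tran_low_t *in, uint8_t *out, int stride) {
  highbd_idct_add(in, out, stride, 12);
}

int main() {
  IdctFunc f = idct_add_12;  // drop-in wherever an IdctFunc is expected
  f(0, 0, 16);
  return 0;
}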
test/dct32x32_test.cc
@@ -79,6 +79,10 @@ typedef std::tr1::tuple<FwdTxfmFunc, InvTxfmFunc, int, vpx_bit_depth_t>
     Trans32x32Param;
 
 #if CONFIG_VP9_HIGHBITDEPTH
+void idct32x32_8(const tran_low_t *in, uint8_t *out, int stride) {
+  vp9_highbd_idct32x32_1024_add_c(in, out, stride, 8);
+}
+
 void idct32x32_10(const tran_low_t *in, uint8_t *out, int stride) {
   vp9_highbd_idct32x32_1024_add_c(in, out, stride, 10);
 }
@@ -114,7 +118,7 @@ TEST_P(Trans32x32Test, AccuracyCheck) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   uint32_t max_error = 0;
   int64_t total_error = 0;
-  const int count_test_block = 1000;
+  const int count_test_block = 10000;
   DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, kNumCoeffs);
   DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_temp_block, kNumCoeffs);
   DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
@@ -127,7 +131,7 @@ TEST_P(Trans32x32Test, AccuracyCheck) {
   for (int i = 0; i < count_test_block; ++i) {
     // Initialize a test block with input range [-mask_, mask_].
     for (int j = 0; j < kNumCoeffs; ++j) {
-      if (bit_depth_ == 8) {
+      if (bit_depth_ == VPX_BITS_8) {
         src[j] = rnd.Rand8();
         dst[j] = rnd.Rand8();
         test_input_block[j] = src[j] - dst[j];
@@ -282,7 +286,7 @@ TEST_P(Trans32x32Test, InverseAccuracy) {
 
     reference_32x32_dct_2d(in, out_r);
     for (int j = 0; j < kNumCoeffs; ++j)
-      coeff[j] = round(out_r[j]);
+      coeff[j] = static_cast<tran_low_t>(round(out_r[j]));
     if (bit_depth_ == VPX_BITS_8) {
       ASM_REGISTER_STATE_CHECK(inv_txfm_(coeff, dst, 32));
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -353,6 +357,22 @@ INSTANTIATE_TEST_CASE_P(
                    &vp9_idct32x32_1024_add_sse2, 1, VPX_BITS_8)));
 #endif
 
+#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(
+    SSE2, Trans32x32Test,
+    ::testing::Values(
+        make_tuple(&vp9_highbd_fdct32x32_sse2, &idct32x32_10, 0, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fdct32x32_rd_sse2, &idct32x32_10, 1,
+                   VPX_BITS_10),
+        make_tuple(&vp9_highbd_fdct32x32_sse2, &idct32x32_12, 0, VPX_BITS_12),
+        make_tuple(&vp9_highbd_fdct32x32_rd_sse2, &idct32x32_12, 1,
+                   VPX_BITS_12),
+        make_tuple(&vp9_fdct32x32_sse2, &vp9_idct32x32_1024_add_c, 0,
+                   VPX_BITS_8),
+        make_tuple(&vp9_fdct32x32_rd_sse2, &vp9_idct32x32_1024_add_c, 1,
+                   VPX_BITS_8)));
+#endif
 
 #if HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     AVX2, Trans32x32Test,
test/error_block_test.cc (new file, 146 lines)
@@ -0,0 +1,146 @@
/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <cmath>
#include <cstdlib>
#include <string>

#include "third_party/googletest/src/include/gtest/gtest.h"
#include "test/acm_random.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
#include "test/util.h"

#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_entropy.h"
#include "vpx/vpx_integer.h"

using libvpx_test::ACMRandom;

namespace {
#if CONFIG_VP9_HIGHBITDEPTH
const int number_of_iterations = 1000;

typedef int64_t (*ErrorBlockFunc)(const tran_low_t *coeff,
                                  const tran_low_t *dqcoeff,
                                  intptr_t block_size,
                                  int64_t *ssz, int bps);
typedef std::tr1::tuple<ErrorBlockFunc, ErrorBlockFunc, vpx_bit_depth_t>
    ErrorBlockParam;
class ErrorBlockTest
    : public ::testing::TestWithParam<ErrorBlockParam> {
 public:
  virtual ~ErrorBlockTest() {}
  virtual void SetUp() {
    error_block_op_ = GET_PARAM(0);
    ref_error_block_op_ = GET_PARAM(1);
    bit_depth_ = GET_PARAM(2);
  }

  virtual void TearDown() { libvpx_test::ClearSystemState(); }

 protected:
  vpx_bit_depth_t bit_depth_;
  ErrorBlockFunc error_block_op_;
  ErrorBlockFunc ref_error_block_op_;
};

TEST_P(ErrorBlockTest, OperationCheck) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff, 4096);
  DECLARE_ALIGNED_ARRAY(16, tran_low_t, dqcoeff, 4096);
  int err_count_total = 0;
  int first_failure = -1;
  intptr_t block_size;
  int64_t ssz;
  int64_t ret;
  int64_t ref_ssz;
  int64_t ref_ret;
  for (int i = 0; i < number_of_iterations; ++i) {
    int err_count = 0;
    block_size = 16 << (i % 9);  // All block sizes from 4x4, 8x4 ..64x64
    for (int j = 0; j < block_size; j++) {
      coeff[j] = rnd(2<<20)-(1<<20);
      dqcoeff[j] = rnd(2<<20)-(1<<20);
    }
    ref_ret = ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz,
                                  bit_depth_);
    ASM_REGISTER_STATE_CHECK(ret = error_block_op_(coeff, dqcoeff, block_size,
                                                   &ssz, bit_depth_));
    err_count += (ref_ret != ret) | (ref_ssz != ssz);
    if (err_count && !err_count_total) {
      first_failure = i;
    }
    err_count_total += err_count;
  }
  EXPECT_EQ(0, err_count_total)
      << "Error: Error Block Test, C output doesn't match SSE2 output. "
      << "First failed at test case " << first_failure;
}

TEST_P(ErrorBlockTest, ExtremeValues) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff, 4096);
  DECLARE_ALIGNED_ARRAY(16, tran_low_t, dqcoeff, 4096);
  int err_count_total = 0;
  int first_failure = -1;
  intptr_t block_size;
  int64_t ssz;
  int64_t ret;
  int64_t ref_ssz;
  int64_t ref_ret;
  int max_val = ((1<<20)-1);
  for (int i = 0; i < number_of_iterations; ++i) {
    int err_count = 0;
    int k = (i / 9) % 5;

    // Change the maximum coeff value, to test different bit boundaries
    if ( k == 4 && (i % 9) == 0 ) {
      max_val >>= 1;
    }
    block_size = 16 << (i % 9);  // All block sizes from 4x4, 8x4 ..64x64
    for (int j = 0; j < block_size; j++) {
      if (k < 4) {  // Test at maximum values
        coeff[j] = k % 2 ? max_val : -max_val;
        dqcoeff[j] = (k >> 1) % 2 ? max_val : -max_val;
      } else {
        coeff[j] = rnd(2 << 14) - (1 << 14);
        dqcoeff[j] = rnd(2 << 14) - (1 << 14);
      }
    }
    ref_ret = ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz,
                                  bit_depth_);
    ASM_REGISTER_STATE_CHECK(ret = error_block_op_(coeff, dqcoeff, block_size,
                                                   &ssz, bit_depth_));
    err_count += (ref_ret != ret) | (ref_ssz != ssz);
    if (err_count && !err_count_total) {
      first_failure = i;
    }
    err_count_total += err_count;
  }
  EXPECT_EQ(0, err_count_total)
      << "Error: Error Block Test, C output doesn't match SSE2 output. "
      << "First failed at test case " << first_failure;
}

using std::tr1::make_tuple;
#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(
    SSE2_C_COMPARE, ErrorBlockTest,
    ::testing::Values(
        make_tuple(&vp9_highbd_block_error_sse2,
                   &vp9_highbd_block_error_c, VPX_BITS_10),
        make_tuple(&vp9_highbd_block_error_sse2,
                   &vp9_highbd_block_error_c, VPX_BITS_12),
        make_tuple(&vp9_highbd_block_error_sse2,
                   &vp9_highbd_block_error_c, VPX_BITS_8)));
#endif  // HAVE_SSE2
#endif  // CONFIG_VP9_HIGHBITDEPTH
}  // namespace
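For context, the block-error functions this new test compares return the sum of squared differences between the original and dequantized coefficients, and report the sum of squared originals through ssz; the test's block_size = 16 << (i % 9) sweeps 16 through 4096 coefficients, i.e. 4x4 up to 64x64. A rough sketch of that contract follows; the bit-depth normalization step is an assumption about how high-bit-depth metrics are scaled back to 8-bit units, not a transcription of libvpx's exact rounding:

#include <cstdint>

typedef int32_t tran_low_t;

// Sketch of the contract checked across implementations:
//   return value = sum((coeff - dqcoeff)^2), *ssz = sum(coeff^2),
// both scaled down for bit depths above 8 (scaling detail assumed).
int64_t block_error_sketch(const tran_low_t *coeff, const tran_low_t *dqcoeff,
                           intptr_t block_size, int64_t *ssz, int bps) {
  int64_t error = 0, sqcoeff = 0;
  for (intptr_t i = 0; i < block_size; ++i) {
    const int64_t diff = coeff[i] - dqcoeff[i];
    error += diff * diff;
    sqcoeff += static_cast<int64_t>(coeff[i]) * coeff[i];
  }
  const int shift = 2 * (bps - 8);  // assumed normalization to 8-bit units
  const int64_t rounding = shift > 0 ? (int64_t)1 << (shift - 1) : 0;
  *ssz = (sqcoeff + rounding) >> shift;
  return (error + rounding) >> shift;
}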
test/fdct4x4_test.cc
@@ -75,6 +75,16 @@ void iwht4x4_10(const tran_low_t *in, uint8_t *out, int stride) {
 void iwht4x4_12(const tran_low_t *in, uint8_t *out, int stride) {
   vp9_highbd_iwht4x4_16_add_c(in, out, stride, 12);
 }
+
+#if HAVE_SSE2
+void idct4x4_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
+  vp9_highbd_idct4x4_16_add_sse2(in, out, stride, 10);
+}
+
+void idct4x4_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
+  vp9_highbd_idct4x4_16_add_sse2(in, out, stride, 12);
+}
+#endif
 #endif
 
 class Trans4x4TestBase {
@@ -496,4 +506,31 @@ INSTANTIATE_TEST_CASE_P(
     make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 3, VPX_BITS_8)));
 #endif
 
+#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(
+    SSE2, Trans4x4DCT,
+    ::testing::Values(
+        make_tuple(&vp9_highbd_fdct4x4_c, &idct4x4_10_sse2, 0, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fdct4x4_sse2, &idct4x4_10_sse2, 0, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fdct4x4_c, &idct4x4_12_sse2, 0, VPX_BITS_12),
+        make_tuple(&vp9_highbd_fdct4x4_sse2, &idct4x4_12_sse2, 0, VPX_BITS_12),
+        make_tuple(&vp9_fdct4x4_sse2, &vp9_idct4x4_16_add_c, 0,
+                   VPX_BITS_8)));
+
+INSTANTIATE_TEST_CASE_P(
+    SSE2, Trans4x4HT,
+    ::testing::Values(
+        make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_10, 0, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_10, 1, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_10, 2, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_10, 3, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_12, 0, VPX_BITS_12),
+        make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_12, 1, VPX_BITS_12),
+        make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_12, 2, VPX_BITS_12),
+        make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_12, 3, VPX_BITS_12),
+        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8)));
+#endif
 }  // namespace
test/fdct8x8_test.cc
@@ -71,6 +71,7 @@ typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
 typedef std::tr1::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t> Dct8x8Param;
 typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t> Ht8x8Param;
+typedef std::tr1::tuple<IdctFunc, IdctFunc, int, vpx_bit_depth_t> Idct8x8Param;
 
 void fdct8x8_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) {
   vp9_fdct8x8_c(in, out, stride);
@@ -96,6 +97,32 @@ void iht8x8_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
 void iht8x8_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
   vp9_highbd_iht8x8_64_add_c(in, out, stride, tx_type, 12);
 }
+
+void idct8x8_10_add_10_c(const tran_low_t *in, uint8_t *out, int stride) {
+  vp9_highbd_idct8x8_10_add_c(in, out, stride, 10);
+}
+
+void idct8x8_10_add_12_c(const tran_low_t *in, uint8_t *out, int stride) {
+  vp9_highbd_idct8x8_10_add_c(in, out, stride, 12);
+}
+
+#if HAVE_SSE2
+void idct8x8_10_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
+  vp9_highbd_idct8x8_10_add_sse2(in, out, stride, 10);
+}
+
+void idct8x8_10_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
+  vp9_highbd_idct8x8_10_add_sse2(in, out, stride, 12);
+}
+
+void idct8x8_64_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
+  vp9_highbd_idct8x8_64_add_sse2(in, out, stride, 10);
+}
+
+void idct8x8_64_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
+  vp9_highbd_idct8x8_64_add_sse2(in, out, stride, 12);
+}
+#endif
 #endif
 
 class FwdTrans8x8TestBase {
@@ -146,9 +173,10 @@ class FwdTrans8x8TestBase {
     memset(count_sign_block, 0, sizeof(count_sign_block));
 
     for (int i = 0; i < count_test_block; ++i) {
-      // Initialize a test block with input range [-15, 15].
+      // Initialize a test block with input range [-mask_/16, mask_/16].
       for (int j = 0; j < 64; ++j)
-        test_input_block[j] = (rnd.Rand8() >> 4) - (rnd.Rand8() >> 4);
+        test_input_block[j] = ((rnd.Rand16() & mask_) >> 4) -
+                              ((rnd.Rand16() & mask_) >> 4);
       ASM_REGISTER_STATE_CHECK(
           RunFwdTxfm(test_input_block, test_output_block, pitch_));
 
@@ -188,7 +216,7 @@ class FwdTrans8x8TestBase {
 #endif
 
     for (int i = 0; i < count_test_block; ++i) {
-      // Initialize a test block with input range [-255, 255].
+      // Initialize a test block with input range [-mask_, mask_].
      for (int j = 0; j < 64; ++j) {
        if (bit_depth_ == VPX_BITS_8) {
          src[j] = rnd.Rand8();
@@ -427,6 +455,63 @@ class FwdTrans8x8TestBase {
       }
     }
   }
+
+  void CompareInvReference(IdctFunc ref_txfm, int thresh) {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 10000;
+    const int eob = 12;
+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, uint8_t, ref, kNumCoeffs);
+#if CONFIG_VP9_HIGHBITDEPTH
+    DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, uint16_t, ref16, kNumCoeffs);
+#endif
+    const int16_t *scan = vp9_default_scan_orders[TX_8X8].scan;
+
+    for (int i = 0; i < count_test_block; ++i) {
+      for (int j = 0; j < kNumCoeffs; ++j) {
+        if (j < eob) {
+          // Random values less than the threshold, either positive or negative
+          coeff[scan[j]] = rnd(thresh) * (1-2*(i%2));
+        } else {
+          coeff[scan[j]] = 0;
+        }
+        if (bit_depth_ == VPX_BITS_8) {
+          dst[j] = 0;
+          ref[j] = 0;
+#if CONFIG_VP9_HIGHBITDEPTH
+        } else {
+          dst16[j] = 0;
+          ref16[j] = 0;
+#endif
+        }
+      }
+      if (bit_depth_ == VPX_BITS_8) {
+        ref_txfm(coeff, ref, pitch_);
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        ref_txfm(coeff, CONVERT_TO_BYTEPTR(ref16), pitch_);
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16),
+                                            pitch_));
+#endif
+      }
+
+      for (int j = 0; j < kNumCoeffs; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        const uint32_t diff =
+            bit_depth_ == VPX_BITS_8 ? dst[j] - ref[j] : dst16[j] - ref16[j];
+#else
+        const uint32_t diff = dst[j] - ref[j];
+#endif
+        const uint32_t error = diff * diff;
+        EXPECT_EQ(0u, error)
+            << "Error: 8x8 IDCT has error " << error
+            << " at index " << j;
+      }
+    }
+  }
   int pitch_;
   int tx_type_;
   FhtFunc fwd_txfm_ref;
@@ -526,6 +611,38 @@ TEST_P(FwdTrans8x8HT, ExtremalCheck) {
   RunExtremalCheck();
 }
+
+class InvTrans8x8DCT
+    : public FwdTrans8x8TestBase,
+      public ::testing::TestWithParam<Idct8x8Param> {
+ public:
+  virtual ~InvTrans8x8DCT() {}
+
+  virtual void SetUp() {
+    ref_txfm_ = GET_PARAM(0);
+    inv_txfm_ = GET_PARAM(1);
+    thresh_ = GET_PARAM(2);
+    pitch_ = 8;
+    bit_depth_ = GET_PARAM(3);
+    mask_ = (1 << bit_depth_) - 1;
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
+    inv_txfm_(out, dst, stride);
+  }
+  void RunFwdTxfm(int16_t *out, tran_low_t *dst, int stride) {}
+
+  IdctFunc ref_txfm_;
+  IdctFunc inv_txfm_;
+  int thresh_;
+};
+
+TEST_P(InvTrans8x8DCT, CompareReference) {
+  CompareInvReference(ref_txfm_, thresh_);
+}
 
 using std::tr1::make_tuple;
 
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -598,6 +715,45 @@ INSTANTIATE_TEST_CASE_P(
     make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 3, VPX_BITS_8)));
 #endif
 
+#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(
+    SSE2, FwdTrans8x8DCT,
+    ::testing::Values(
+        make_tuple(&vp9_highbd_fdct8x8_c,
+                   &idct8x8_64_add_10_sse2, 12, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fdct8x8_sse2,
+                   &idct8x8_64_add_10_sse2, 12, VPX_BITS_10),
+        make_tuple(&vp9_highbd_fdct8x8_c,
+                   &idct8x8_64_add_12_sse2, 12, VPX_BITS_12),
+        make_tuple(&vp9_highbd_fdct8x8_sse2,
+                   &idct8x8_64_add_12_sse2, 12, VPX_BITS_12),
+        make_tuple(&vp9_fdct8x8_sse2, &vp9_idct8x8_64_add_c, 0, VPX_BITS_8)));
+
+INSTANTIATE_TEST_CASE_P(
+    SSE2, FwdTrans8x8HT,
+    ::testing::Values(
+        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 3, VPX_BITS_8)));
+
+// Optimizations take effect at a threshold of 6201, so we use a value close to
+// that to test both branches.
+INSTANTIATE_TEST_CASE_P(
+    SSE2, InvTrans8x8DCT,
+    ::testing::Values(
+        make_tuple(&idct8x8_10_add_10_c,
+                   &idct8x8_10_add_10_sse2, 6225, VPX_BITS_10),
+        make_tuple(&idct8x8_10,
+                   &idct8x8_64_add_10_sse2, 6225, VPX_BITS_10),
+        make_tuple(&idct8x8_10_add_12_c,
+                   &idct8x8_10_add_12_sse2, 6225, VPX_BITS_12),
+        make_tuple(&idct8x8_12,
+                   &idct8x8_64_add_12_sse2, 6225, VPX_BITS_12)));
+#endif
 
 #if HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_VP9_HIGHBITDEPTH && \
     !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
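The InvTrans8x8DCT cases hinge on the comment about the 6201 threshold: the optimized inverse transform takes a shortcut branch when the nonzero coefficients are few and small, so CompareInvReference seeds only the first eob scan positions with values below thresh and demands bit-exact agreement with the reference. A simplified stand-alone sketch of that comparison loop; the function names are illustrative, and it fills coefficients in raster order rather than the real test's scan order:

#include <cstdint>
#include <cstdlib>

typedef int32_t tran_low_t;
typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride);

// Stand-ins for a reference C transform and an optimized one under test.
void ref_idct(const tran_low_t *in, uint8_t *out, int stride) {
  (void)in; (void)out; (void)stride;  // real transform elided
}
void opt_idct(const tran_low_t *in, uint8_t *out, int stride) {
  (void)in; (void)out; (void)stride;  // real transform elided
}

// Returns the number of mismatching output samples for one sparse block
// (n <= 64); the real test EXPECTs zero, i.e. the optimized path must be
// bit-exact, not merely close.
int compare_inv(IdctFunc ref, IdctFunc opt, int thresh, int eob, int n) {
  tran_low_t coeff[64] = {0};
  uint8_t ref_out[64] = {0}, opt_out[64] = {0};
  for (int j = 0; j < eob && j < n; ++j)
    coeff[j] = std::rand() % thresh;  // only the first eob entries nonzero,
                                      // all below the branch threshold
  ref(coeff, ref_out, 8);
  opt(coeff, opt_out, 8);
  int mismatches = 0;
  for (int j = 0; j < n; ++j) mismatches += (ref_out[j] != opt_out[j]);
  return mismatches;
}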
test/lpf_8_test.cc
@@ -23,6 +23,8 @@
 #include "vp9/common/vp9_entropy.h"
 #include "vpx/vpx_integer.h"
 
+#define MAX_LOOP_FILTER 63
+
 using libvpx_test::ACMRandom;
 
 namespace {
@@ -51,8 +53,9 @@ typedef void (*dual_loop_op_t)(uint8_t *s, int p, const uint8_t *blimit0,
                                const uint8_t *thresh1);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-typedef std::tr1::tuple<loop_op_t, loop_op_t, int> loop8_param_t;
-typedef std::tr1::tuple<dual_loop_op_t, dual_loop_op_t, int> dualloop8_param_t;
+typedef std::tr1::tuple<loop_op_t, loop_op_t, vpx_bit_depth_t> loop8_param_t;
+typedef std::tr1::tuple<dual_loop_op_t, dual_loop_op_t,
+                        vpx_bit_depth_t> dualloop8_param_t;
 
 #if HAVE_SSE2
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -119,7 +122,7 @@ class Loop8Test6Param : public ::testing::TestWithParam<loop8_param_t> {
   virtual void TearDown() { libvpx_test::ClearSystemState(); }
 
  protected:
-  int bit_depth_;
+  vpx_bit_depth_t bit_depth_;
   int mask_;
   loop_op_t loopfilter_op_;
   loop_op_t ref_loopfilter_op_;
@@ -138,7 +141,7 @@ class Loop8Test9Param : public ::testing::TestWithParam<dualloop8_param_t> {
   virtual void TearDown() { libvpx_test::ClearSystemState(); }
 
 protected:
-  int bit_depth_;
+  vpx_bit_depth_t bit_depth_;
   int mask_;
   dual_loop_op_t loopfilter_op_;
   dual_loop_op_t ref_loopfilter_op_;
@@ -148,7 +151,7 @@ TEST_P(Loop8Test6Param, OperationCheck) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   const int count_test_block = number_of_iterations;
 #if CONFIG_VP9_HIGHBITDEPTH
-  int32_t bd = bit_depth_;
+  vpx_bit_depth_t bd = bit_depth_;
   DECLARE_ALIGNED_ARRAY(16, uint16_t, s, kNumCoeffs);
   DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_s, kNumCoeffs);
 #else
@@ -160,11 +163,18 @@ TEST_P(Loop8Test6Param, OperationCheck) {
   for (int i = 0; i < count_test_block; ++i) {
     int err_count = 0;
     uint8_t tmp = rnd.Rand8();
+    // mblim <= 3 * MAX_LOOP_FILTER + 4
+    while (tmp > 3 * MAX_LOOP_FILTER + 4) {
+      tmp = rnd.Rand8();
+    }
     DECLARE_ALIGNED(16, const uint8_t, blimit[16]) = {
       tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
      tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
     };
     tmp = rnd.Rand8();
+    while (tmp > MAX_LOOP_FILTER) {  // lim <= MAX_LOOP_FILTER
+      tmp = rnd.Rand8();
+    }
     DECLARE_ALIGNED(16, const uint8_t, limit[16]) = {
       tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
       tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
@@ -211,7 +221,7 @@ TEST_P(Loop8Test6Param, OperationCheck) {
     ASM_REGISTER_STATE_CHECK(
         loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count, bd));
 #else
-    ref_loopfilter_op_(ref_s+8+p*8, p, blimit, limit, thresh, count);
+    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, count);
     ASM_REGISTER_STATE_CHECK(
         loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
@@ -234,7 +244,7 @@ TEST_P(Loop8Test6Param, ValueCheck) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   const int count_test_block = number_of_iterations;
 #if CONFIG_VP9_HIGHBITDEPTH
-  const int32_t bd = bit_depth_;
+  vpx_bit_depth_t bd = bit_depth_;
   DECLARE_ALIGNED_ARRAY(16, uint16_t, s, kNumCoeffs);
   DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_s, kNumCoeffs);
 #else
@@ -246,11 +256,17 @@ TEST_P(Loop8Test6Param, ValueCheck) {
   for (int i = 0; i < count_test_block; ++i) {
     int err_count = 0;
     uint8_t tmp = rnd.Rand8();
+    while (tmp > 3*MAX_LOOP_FILTER + 4) {  // mblim <= 3*MAX_LOOP_FILTER + 4
+      tmp = rnd.Rand8();
+    }
     DECLARE_ALIGNED(16, const uint8_t, blimit[16]) = {
       tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
       tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
     };
     tmp = rnd.Rand8();
+    while (tmp > MAX_LOOP_FILTER) {  // lim <= MAX_LOOP_FILTER
+      tmp = rnd.Rand8();
+    }
     DECLARE_ALIGNED(16, const uint8_t, limit[16]) = {
       tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
       tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
@@ -271,7 +287,7 @@ TEST_P(Loop8Test6Param, ValueCheck) {
     ASM_REGISTER_STATE_CHECK(
         loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count, bd));
 #else
-    ref_loopfilter_op_(ref_s+8+p*8, p, blimit, limit, thresh, count);
+    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, count);
     ASM_REGISTER_STATE_CHECK(
         loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
@@ -293,7 +309,7 @@ TEST_P(Loop8Test9Param, OperationCheck) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   const int count_test_block = number_of_iterations;
 #if CONFIG_VP9_HIGHBITDEPTH
-  const int32_t bd = bit_depth_;
+  vpx_bit_depth_t bd = bit_depth_;
   DECLARE_ALIGNED_ARRAY(16, uint16_t, s, kNumCoeffs);
   DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_s, kNumCoeffs);
 #else
@@ -305,11 +321,19 @@ TEST_P(Loop8Test9Param, OperationCheck) {
   for (int i = 0; i < count_test_block; ++i) {
     int err_count = 0;
     uint8_t tmp = rnd.Rand8();
+    // mblim <= 3 * MAX_LOOP_FILTER + 4
+    while (tmp > 3 * MAX_LOOP_FILTER + 4) {
+      tmp = rnd.Rand8();
+    }
     DECLARE_ALIGNED(16, const uint8_t, blimit0[16]) = {
       tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
       tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
     };
     tmp = rnd.Rand8();
+    // lim <= MAX_LOOP_FILTER
+    while (tmp > MAX_LOOP_FILTER) {
+      tmp = rnd.Rand8();
+    }
     DECLARE_ALIGNED(16, const uint8_t, limit0[16]) = {
       tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
       tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
@@ -320,11 +344,18 @@ TEST_P(Loop8Test9Param, OperationCheck) {
       tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
     };
     tmp = rnd.Rand8();
+    // mblim <= 3 * MAX_LOOP_FILTER + 4
+    while (tmp > 3 * MAX_LOOP_FILTER + 4) {
+      tmp = rnd.Rand8();
+    }
     DECLARE_ALIGNED(16, const uint8_t, blimit1[16]) = {
       tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
       tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
     };
     tmp = rnd.Rand8();
+    while (tmp > MAX_LOOP_FILTER) {  // lim <= MAX_LOOP_FILTER
+      tmp = rnd.Rand8();
+    }
     DECLARE_ALIGNED(16, const uint8_t, limit1[16]) = {
       tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
       tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
@@ -407,11 +438,18 @@ TEST_P(Loop8Test9Param, ValueCheck) {
   for (int i = 0; i < count_test_block; ++i) {
     int err_count = 0;
     uint8_t tmp = rnd.Rand8();
+    // mblim <= 3 * MAX_LOOP_FILTER + 4
+    while (tmp > 3 * MAX_LOOP_FILTER + 4) {
+      tmp = rnd.Rand8();
+    }
     DECLARE_ALIGNED(16, const uint8_t, blimit0[16]) = {
      tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
      tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
     };
     tmp = rnd.Rand8();
+    while (tmp > MAX_LOOP_FILTER) {  // lim <= MAX_LOOP_FILTER
+      tmp = rnd.Rand8();
+    }
     DECLARE_ALIGNED(16, const uint8_t, limit0[16]) = {
       tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
       tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
@@ -422,11 +460,17 @@ TEST_P(Loop8Test9Param, ValueCheck) {
       tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
     };
     tmp = rnd.Rand8();
+    while (tmp > 3 * MAX_LOOP_FILTER + 4) {  // mblim <= 3*MAX_LOOP_FILTER + 4
+      tmp = rnd.Rand8();
+    }
     DECLARE_ALIGNED(16, const uint8_t, blimit1[16]) = {
       tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
       tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
     };
     tmp = rnd.Rand8();
+    while (tmp > MAX_LOOP_FILTER) {  // lim <= MAX_LOOP_FILTER
+      tmp = rnd.Rand8();
+    }
     DECLARE_ALIGNED(16, const uint8_t, limit1[16]) = {
       tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
       tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
@@ -442,7 +486,7 @@ TEST_P(Loop8Test9Param, ValueCheck) {
       ref_s[j] = s[j];
     }
 #if CONFIG_VP9_HIGHBITDEPTH
-    const int32_t bd = bit_depth_;
+    vpx_bit_depth_t bd = bit_depth_;
     ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit0, limit0, thresh0,
                        blimit1, limit1, thresh1, bd);
     ASM_REGISTER_STATE_CHECK(
@@ -477,48 +521,51 @@ INSTANTIATE_TEST_CASE_P(
     SSE2_C_COMPARE_SINGLE, Loop8Test6Param,
     ::testing::Values(
         make_tuple(&vp9_highbd_lpf_horizontal_4_sse2,
-                   &vp9_highbd_lpf_horizontal_4_c, 8),
+                   &vp9_highbd_lpf_horizontal_4_c, VPX_BITS_8),
         make_tuple(&vp9_highbd_lpf_vertical_4_sse2,
-                   &vp9_highbd_lpf_vertical_4_c, 8),
+                   &vp9_highbd_lpf_vertical_4_c, VPX_BITS_8),
         make_tuple(&vp9_highbd_lpf_horizontal_8_sse2,
-                   &vp9_highbd_lpf_horizontal_8_c, 8),
+                   &vp9_highbd_lpf_horizontal_8_c, VPX_BITS_8),
         make_tuple(&vp9_highbd_lpf_horizontal_16_sse2,
-                   &vp9_highbd_lpf_horizontal_16_c, 8),
+                   &vp9_highbd_lpf_horizontal_16_c, VPX_BITS_8),
         make_tuple(&vp9_highbd_lpf_vertical_8_sse2,
-                   &vp9_highbd_lpf_vertical_8_c, 8),
+                   &vp9_highbd_lpf_vertical_8_c, VPX_BITS_8),
         make_tuple(&wrapper_vertical_16_sse2,
-                   &wrapper_vertical_16_c, 8),
+                   &wrapper_vertical_16_c, VPX_BITS_8),
         make_tuple(&vp9_highbd_lpf_horizontal_4_sse2,
-                   &vp9_highbd_lpf_horizontal_4_c, 10),
+                   &vp9_highbd_lpf_horizontal_4_c, VPX_BITS_10),
         make_tuple(&vp9_highbd_lpf_vertical_4_sse2,
-                   &vp9_highbd_lpf_vertical_4_c, 10),
+                   &vp9_highbd_lpf_vertical_4_c, VPX_BITS_10),
        make_tuple(&vp9_highbd_lpf_horizontal_8_sse2,
-                   &vp9_highbd_lpf_horizontal_8_c, 10),
+                   &vp9_highbd_lpf_horizontal_8_c, VPX_BITS_10),
        make_tuple(&vp9_highbd_lpf_horizontal_16_sse2,
-                   &vp9_highbd_lpf_horizontal_16_c, 10),
+                   &vp9_highbd_lpf_horizontal_16_c, VPX_BITS_10),
        make_tuple(&vp9_highbd_lpf_vertical_8_sse2,
-                   &vp9_highbd_lpf_vertical_8_c, 10),
+                   &vp9_highbd_lpf_vertical_8_c, VPX_BITS_10),
        make_tuple(&wrapper_vertical_16_sse2,
-                   &wrapper_vertical_16_c, 10),
+                   &wrapper_vertical_16_c, VPX_BITS_10),
        make_tuple(&vp9_highbd_lpf_horizontal_4_sse2,
-                   &vp9_highbd_lpf_horizontal_4_c, 12),
+                   &vp9_highbd_lpf_horizontal_4_c, VPX_BITS_12),
        make_tuple(&vp9_highbd_lpf_vertical_4_sse2,
-                   &vp9_highbd_lpf_vertical_4_c, 12),
+                   &vp9_highbd_lpf_vertical_4_c, VPX_BITS_12),
        make_tuple(&vp9_highbd_lpf_horizontal_8_sse2,
-                   &vp9_highbd_lpf_horizontal_8_c, 12),
+                   &vp9_highbd_lpf_horizontal_8_c, VPX_BITS_12),
        make_tuple(&vp9_highbd_lpf_horizontal_16_sse2,
-                   &vp9_highbd_lpf_horizontal_16_c, 12),
+                   &vp9_highbd_lpf_horizontal_16_c, VPX_BITS_12),
        make_tuple(&vp9_highbd_lpf_vertical_8_sse2,
-                   &vp9_highbd_lpf_vertical_8_c, 12),
+                   &vp9_highbd_lpf_vertical_8_c, VPX_BITS_12),
        make_tuple(&wrapper_vertical_16_sse2,
-                   &wrapper_vertical_16_c, 12)));
+                   &wrapper_vertical_16_c, VPX_BITS_12)));
 #else
 INSTANTIATE_TEST_CASE_P(
     SSE2_C_COMPARE_SINGLE, Loop8Test6Param,
     ::testing::Values(
-        make_tuple(&vp9_lpf_horizontal_8_sse2, &vp9_lpf_horizontal_8_c, 8),
-        make_tuple(&vp9_lpf_horizontal_16_sse2, &vp9_lpf_horizontal_16_c, 8),
-        make_tuple(&vp9_lpf_vertical_8_sse2, &vp9_lpf_vertical_8_c, 8)));
+        make_tuple(&vp9_lpf_horizontal_8_sse2, &vp9_lpf_horizontal_8_c,
+                   VPX_BITS_8),
+        make_tuple(&vp9_lpf_horizontal_16_sse2, &vp9_lpf_horizontal_16_c,
+                   VPX_BITS_8),
+        make_tuple(&vp9_lpf_vertical_8_sse2, &vp9_lpf_vertical_8_c,
+                   VPX_BITS_8)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif
 
@@ -528,60 +575,61 @@ INSTANTIATE_TEST_CASE_P(
     SSE2_C_COMPARE_DUAL, Loop8Test6Param,
     ::testing::Values(
         make_tuple(&wrapper_vertical_16_dual_sse2,
-                   &wrapper_vertical_16_dual_c, 8),
+                   &wrapper_vertical_16_dual_c, VPX_BITS_8),
         make_tuple(&wrapper_vertical_16_dual_sse2,
-                   &wrapper_vertical_16_dual_c, 10),
+                   &wrapper_vertical_16_dual_c, VPX_BITS_10),
         make_tuple(&wrapper_vertical_16_dual_sse2,
-                   &wrapper_vertical_16_dual_c, 12)));
+                   &wrapper_vertical_16_dual_c, VPX_BITS_12)));
 #else
 INSTANTIATE_TEST_CASE_P(
     SSE2_C_COMPARE_DUAL, Loop8Test6Param,
     ::testing::Values(
-        make_tuple(&wrapper_vertical_16_sse2, &wrapper_vertical_16_c, 8)));
+        make_tuple(&wrapper_vertical_16_sse2, &wrapper_vertical_16_c,
+                   VPX_BITS_8)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // HAVE_SSE2
 
 #if HAVE_SSE2
 #if CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
-    SSE_C_COMPARE_DUAL, Loop8Test9Param,
+    SSE2_C_COMPARE_DUAL, Loop8Test9Param,
     ::testing::Values(
         make_tuple(&vp9_highbd_lpf_horizontal_4_dual_sse2,
-                   &vp9_highbd_lpf_horizontal_4_dual_c, 8),
+                   &vp9_highbd_lpf_horizontal_4_dual_c, VPX_BITS_8),
         make_tuple(&vp9_highbd_lpf_horizontal_8_dual_sse2,
-                   &vp9_highbd_lpf_horizontal_8_dual_c, 8),
+                   &vp9_highbd_lpf_horizontal_8_dual_c, VPX_BITS_8),
         make_tuple(&vp9_highbd_lpf_vertical_4_dual_sse2,
-                   &vp9_highbd_lpf_vertical_4_dual_c, 8),
+                   &vp9_highbd_lpf_vertical_4_dual_c, VPX_BITS_8),
        make_tuple(&vp9_highbd_lpf_vertical_8_dual_sse2,
-                   &vp9_highbd_lpf_vertical_8_dual_c, 8),
+                   &vp9_highbd_lpf_vertical_8_dual_c, VPX_BITS_8),
        make_tuple(&vp9_highbd_lpf_horizontal_4_dual_sse2,
-                   &vp9_highbd_lpf_horizontal_4_dual_c, 10),
+                   &vp9_highbd_lpf_horizontal_4_dual_c, VPX_BITS_10),
        make_tuple(&vp9_highbd_lpf_horizontal_8_dual_sse2,
-                   &vp9_highbd_lpf_horizontal_8_dual_c, 10),
+                   &vp9_highbd_lpf_horizontal_8_dual_c, VPX_BITS_10),
        make_tuple(&vp9_highbd_lpf_vertical_4_dual_sse2,
-                   &vp9_highbd_lpf_vertical_4_dual_c, 10),
+                   &vp9_highbd_lpf_vertical_4_dual_c, VPX_BITS_10),
        make_tuple(&vp9_highbd_lpf_vertical_8_dual_sse2,
-                   &vp9_highbd_lpf_vertical_8_dual_c, 10),
+                   &vp9_highbd_lpf_vertical_8_dual_c, VPX_BITS_10),
        make_tuple(&vp9_highbd_lpf_horizontal_4_dual_sse2,
-                   &vp9_highbd_lpf_horizontal_4_dual_c, 12),
+                   &vp9_highbd_lpf_horizontal_4_dual_c, VPX_BITS_12),
        make_tuple(&vp9_highbd_lpf_horizontal_8_dual_sse2,
-                   &vp9_highbd_lpf_horizontal_8_dual_c, 12),
+                   &vp9_highbd_lpf_horizontal_8_dual_c, VPX_BITS_12),
        make_tuple(&vp9_highbd_lpf_vertical_4_dual_sse2,
-                   &vp9_highbd_lpf_vertical_4_dual_c, 12),
+                   &vp9_highbd_lpf_vertical_4_dual_c, VPX_BITS_12),
        make_tuple(&vp9_highbd_lpf_vertical_8_dual_sse2,
-                   &vp9_highbd_lpf_vertical_8_dual_c, 12)));
+                   &vp9_highbd_lpf_vertical_8_dual_c, VPX_BITS_12)));
 #else
 INSTANTIATE_TEST_CASE_P(
-    SSE_C_COMPARE_DUAL, Loop8Test9Param,
+    SSE2_C_COMPARE_DUAL, Loop8Test9Param,
     ::testing::Values(
         make_tuple(&vp9_lpf_horizontal_4_dual_sse2,
-                   &vp9_lpf_horizontal_4_dual_c, 8),
+                   &vp9_lpf_horizontal_4_dual_c, VPX_BITS_8),
         make_tuple(&vp9_lpf_horizontal_8_dual_sse2,
-                   &vp9_lpf_horizontal_8_dual_c, 8),
+                   &vp9_lpf_horizontal_8_dual_c, VPX_BITS_8),
        make_tuple(&vp9_lpf_vertical_4_dual_sse2,
-                   &vp9_lpf_vertical_4_dual_c, 8),
+                   &vp9_lpf_vertical_4_dual_c, VPX_BITS_8),
        make_tuple(&vp9_lpf_vertical_8_dual_sse2,
-                   &vp9_lpf_vertical_8_dual_c, 8)));
+                   &vp9_lpf_vertical_8_dual_c, VPX_BITS_8)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif
 }  // namespace
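A recurring detail in the loop-filter hunks above is how the random filter parameters are bounded: blimit may not exceed 3 * MAX_LOOP_FILTER + 4 (= 193) and limit may not exceed MAX_LOOP_FILTER (= 63), enforced by redrawing out-of-range bytes. A minimal stand-alone sketch of that rejection-sampling step:

#include <cstdint>
#include <cstdlib>

const int kMaxLoopFilter = 63;  // MAX_LOOP_FILTER in the test

// Draw a byte and redraw until it satisfies the bound, exactly as the
// test's while-loops do; with bound 193 roughly a quarter of draws are
// rejected, so the loop terminates quickly in practice.
uint8_t bounded_rand8(int bound) {
  uint8_t tmp = static_cast<uint8_t>(std::rand() & 0xff);
  while (tmp > bound) tmp = static_cast<uint8_t>(std::rand() & 0xff);
  return tmp;
}

int main() {
  const uint8_t mblim = bounded_rand8(3 * kMaxLoopFilter + 4);  // <= 193
  const uint8_t lim = bounded_rand8(kMaxLoopFilter);            // <= 63
  return (mblim <= 193 && lim <= 63) ? 0 : 1;
}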
test/quantize_test.cc (new file, 353 lines)
@@ -0,0 +1,353 @@
|
||||
/*
|
||||
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include <math.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "third_party/googletest/src/include/gtest/gtest.h"
|
||||
#include "test/acm_random.h"
|
||||
#include "test/clear_system_state.h"
|
||||
#include "test/register_state_check.h"
|
||||
#include "test/util.h"
|
||||
|
||||
#include "./vpx_config.h"
|
||||
#include "./vp9_rtcd.h"
|
||||
#include "vp9/common/vp9_entropy.h"
|
||||
#include "vpx/vpx_integer.h"
|
||||
|
||||
using libvpx_test::ACMRandom;
|
||||
|
||||
namespace {
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
const int number_of_iterations = 100;
|
||||
|
||||
typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count,
|
||||
int skip_block, const int16_t *zbin,
|
||||
const int16_t *round, const int16_t *quant,
|
||||
const int16_t *quant_shift,
|
||||
tran_low_t *qcoeff, tran_low_t *dqcoeff,
|
||||
const int16_t *dequant, int zbin_oq_value,
|
||||
uint16_t *eob, const int16_t *scan,
|
||||
const int16_t *iscan);
|
||||
typedef std::tr1::tuple<QuantizeFunc, QuantizeFunc, vpx_bit_depth_t>
|
||||
QuantizeParam;
|
||||
class QuantizeTest : public ::testing::TestWithParam<QuantizeParam> {
|
||||
public:
|
||||
virtual ~QuantizeTest() {}
|
||||
virtual void SetUp() {
|
||||
quantize_op_ = GET_PARAM(0);
|
||||
ref_quantize_op_ = GET_PARAM(1);
|
||||
bit_depth_ = GET_PARAM(2);
|
||||
mask_ = (1 << bit_depth_) - 1;
|
||||
}
|
||||
|
||||
virtual void TearDown() { libvpx_test::ClearSystemState(); }
|
||||
|
||||
protected:
|
||||
vpx_bit_depth_t bit_depth_;
|
||||
int mask_;
|
||||
QuantizeFunc quantize_op_;
|
||||
QuantizeFunc ref_quantize_op_;
|
||||
};
|
||||
class Quantize32Test : public ::testing::TestWithParam<QuantizeParam> {
|
||||
public:
|
||||
virtual ~Quantize32Test() {}
|
||||
virtual void SetUp() {
|
||||
quantize_op_ = GET_PARAM(0);
|
||||
ref_quantize_op_ = GET_PARAM(1);
|
||||
bit_depth_ = GET_PARAM(2);
|
||||
mask_ = (1 << bit_depth_) - 1;
|
||||
}
|
||||
|
||||
virtual void TearDown() { libvpx_test::ClearSystemState(); }
|
||||
|
||||
protected:
|
||||
vpx_bit_depth_t bit_depth_;
|
||||
int mask_;
|
||||
QuantizeFunc quantize_op_;
|
||||
QuantizeFunc ref_quantize_op_;
|
||||
};
|
||||
|
||||
TEST_P(QuantizeTest, OperationCheck) {
|
||||
ACMRandom rnd(ACMRandom::DeterministicSeed());
|
||||
int zbin_oq_value = 0;
|
||||
DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff_ptr, 256);
|
||||
DECLARE_ALIGNED_ARRAY(16, int16_t, zbin_ptr, 2);
|
||||
DECLARE_ALIGNED_ARRAY(16, int16_t, round_ptr, 2);
|
||||
DECLARE_ALIGNED_ARRAY(16, int16_t, quant_ptr, 2);
|
||||
DECLARE_ALIGNED_ARRAY(16, int16_t, quant_shift_ptr, 2);
|
||||
DECLARE_ALIGNED_ARRAY(16, tran_low_t, qcoeff_ptr, 256);
|
||||
DECLARE_ALIGNED_ARRAY(16, tran_low_t, dqcoeff_ptr, 256);
|
||||
DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_qcoeff_ptr, 256);
|
||||
DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_dqcoeff_ptr, 256);
|
||||
DECLARE_ALIGNED_ARRAY(16, int16_t, dequant_ptr, 2);
|
||||
DECLARE_ALIGNED_ARRAY(16, uint16_t, eob_ptr, 1);
|
||||
DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_eob_ptr, 1);
|
||||
int err_count_total = 0;
|
||||
int first_failure = -1;
|
||||
for (int i = 0; i < number_of_iterations; ++i) {
|
||||
int skip_block = i == 0;
|
||||
TX_SIZE sz = (TX_SIZE)(i % 3); // TX_4X4, TX_8X8 TX_16X16
|
||||
TX_TYPE tx_type = (TX_TYPE)((i >> 2) % 3);
|
||||
const scan_order *scan_order = &vp9_scan_orders[sz][tx_type];
|
||||
int count = (4 << sz) * (4 << sz); // 16, 64, 256
|
||||
int err_count = 0;
|
||||
*eob_ptr = rnd.Rand16();
|
||||
*ref_eob_ptr = *eob_ptr;
|
||||
for (int j = 0; j < count; j++) {
|
||||
coeff_ptr[j] = rnd.Rand16()&mask_;
|
||||
}
|
||||
for (int j = 0; j < 2; j++) {
|
||||
zbin_ptr[j] = rnd.Rand16()&mask_;
|
||||
round_ptr[j] = rnd.Rand16();
|
||||
quant_ptr[j] = rnd.Rand16();
|
||||
quant_shift_ptr[j] = rnd.Rand16();
|
||||
dequant_ptr[j] = rnd.Rand16();
|
||||
}
|
||||
ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr,
|
||||
quant_ptr, quant_shift_ptr, ref_qcoeff_ptr,
|
||||
ref_dqcoeff_ptr, dequant_ptr, zbin_oq_value,
|
||||
ref_eob_ptr, scan_order->scan, scan_order->iscan);
|
||||
ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_ptr, count, skip_block,
|
||||
zbin_ptr, round_ptr, quant_ptr,
|
||||
quant_shift_ptr, qcoeff_ptr,
|
||||
dqcoeff_ptr, dequant_ptr,
|
||||
zbin_oq_value, eob_ptr,
|
||||
scan_order->scan, scan_order->iscan));
|
||||
for (int j = 0; j < sz; ++j) {
|
||||
err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) |
|
||||
(ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
|
||||
}
|
||||
err_count += (*ref_eob_ptr != *eob_ptr);
|
||||
if (err_count && !err_count_total) {
|
||||
first_failure = i;
|
||||
}
|
||||
err_count_total += err_count;
|
||||
}
|
||||
EXPECT_EQ(0, err_count_total)
|
||||
<< "Error: Quantization Test, C output doesn't match SSE2 output. "
|
||||
<< "First failed at test case " << first_failure;
|
||||
}
|
||||
TEST_P(Quantize32Test, OperationCheck) {
|
||||
ACMRandom rnd(ACMRandom::DeterministicSeed());
|
||||
int zbin_oq_value = 0;
|
||||
DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff_ptr, 1024);
|
||||
DECLARE_ALIGNED_ARRAY(16, int16_t, zbin_ptr, 2);
|
||||
DECLARE_ALIGNED_ARRAY(16, int16_t, round_ptr, 2);
|
||||
DECLARE_ALIGNED_ARRAY(16, int16_t, quant_ptr, 2);
|
||||
DECLARE_ALIGNED_ARRAY(16, int16_t, quant_shift_ptr, 2);
|
||||
DECLARE_ALIGNED_ARRAY(16, tran_low_t, qcoeff_ptr, 1024);
|
||||
DECLARE_ALIGNED_ARRAY(16, tran_low_t, dqcoeff_ptr, 1024);
|
||||
DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_qcoeff_ptr, 1024);
|
||||
DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_dqcoeff_ptr, 1024);
|
||||
DECLARE_ALIGNED_ARRAY(16, int16_t, dequant_ptr, 2);
|
||||
DECLARE_ALIGNED_ARRAY(16, uint16_t, eob_ptr, 1);
|
||||
DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_eob_ptr, 1);
|
||||
int err_count_total = 0;
|
||||
int first_failure = -1;
|
||||
for (int i = 0; i < number_of_iterations; ++i) {
|
||||
int skip_block = i == 0;
|
||||
TX_SIZE sz = TX_32X32;
|
||||
TX_TYPE tx_type = (TX_TYPE)(i % 4);
|
||||
|
||||
const scan_order *scan_order = &vp9_scan_orders[sz][tx_type];
|
||||
int count = (4 << sz) * (4 << sz); // 1024
|
||||
int err_count = 0;
|
||||
*eob_ptr = rnd.Rand16();
|
||||
*ref_eob_ptr = *eob_ptr;
|
||||
for (int j = 0; j < count; j++) {
|
||||
coeff_ptr[j] = rnd.Rand16()&mask_;
|
||||
}
|
||||
for (int j = 0; j < 2; j++) {
|
||||
zbin_ptr[j] = rnd.Rand16()&mask_;
|
||||
round_ptr[j] = rnd.Rand16();
|
||||
quant_ptr[j] = rnd.Rand16();
|
||||
quant_shift_ptr[j] = rnd.Rand16();
|
||||
dequant_ptr[j] = rnd.Rand16();
|
||||
}
|
||||
ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr,
|
||||
quant_ptr, quant_shift_ptr, ref_qcoeff_ptr,
|
||||
ref_dqcoeff_ptr, dequant_ptr, zbin_oq_value,
|
||||
ref_eob_ptr, scan_order->scan, scan_order->iscan);
|
||||
ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_ptr, count, skip_block,
|
||||
zbin_ptr, round_ptr, quant_ptr,
|
||||
quant_shift_ptr, qcoeff_ptr,
|
||||
dqcoeff_ptr, dequant_ptr,
|
||||
zbin_oq_value, eob_ptr,
|
||||
scan_order->scan, scan_order->iscan));
|
||||
for (int j = 0; j < sz; ++j) {
|
||||
err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) |
|
||||
(ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
|
||||
}
|
||||
err_count += (*ref_eob_ptr != *eob_ptr);
|
||||
if (err_count && !err_count_total) {
|
||||
first_failure = i;
|
||||
}
|
||||
err_count_total += err_count;
|
||||
}
|
||||
EXPECT_EQ(0, err_count_total)
|
||||
<< "Error: Quantization Test, C output doesn't match SSE2 output. "
|
||||
<< "First failed at test case " << first_failure;
|
||||
}
|
||||
TEST_P(QuantizeTest, EOBCheck) {
|
||||
ACMRandom rnd(ACMRandom::DeterministicSeed());
|
||||
int zbin_oq_value = 0;
|
||||
DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff_ptr, 256);
|
||||
DECLARE_ALIGNED_ARRAY(16, int16_t, zbin_ptr, 2);
|
||||
DECLARE_ALIGNED_ARRAY(16, int16_t, round_ptr, 2);
|
||||
DECLARE_ALIGNED_ARRAY(16, int16_t, quant_ptr, 2);
|
||||
DECLARE_ALIGNED_ARRAY(16, int16_t, quant_shift_ptr, 2);
|
||||
DECLARE_ALIGNED_ARRAY(16, tran_low_t, qcoeff_ptr, 256);
|
||||
DECLARE_ALIGNED_ARRAY(16, tran_low_t, dqcoeff_ptr, 256);
|
||||
DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_qcoeff_ptr, 256);
|
||||
DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_dqcoeff_ptr, 256);
|
||||
DECLARE_ALIGNED_ARRAY(16, int16_t, dequant_ptr, 2);
|
||||
DECLARE_ALIGNED_ARRAY(16, uint16_t, eob_ptr, 1);
|
||||
DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_eob_ptr, 1);
|
||||
int err_count_total = 0;
|
||||
int first_failure = -1;
|
||||
for (int i = 0; i < number_of_iterations; ++i) {
|
||||
int skip_block = i == 0;
|
||||
TX_SIZE sz = (TX_SIZE)(i % 3); // TX_4X4, TX_8X8 TX_16X16
|
||||
TX_TYPE tx_type = (TX_TYPE)((i >> 2) % 3);
|
||||
const scan_order *scan_order = &vp9_scan_orders[sz][tx_type];
|
||||
int count = (4 << sz) * (4 << sz); // 16, 64, 256
|
||||
int err_count = 0;
|
||||
*eob_ptr = rnd.Rand16();
|
||||
*ref_eob_ptr = *eob_ptr;
|
||||
// Two random entries
|
||||
for (int j = 0; j < count; j++) {
|
||||
coeff_ptr[j] = 0;
|
||||
}
|
||||
coeff_ptr[rnd(count)] = rnd.Rand16()&mask_;
|
||||
coeff_ptr[rnd(count)] = rnd.Rand16()&mask_;
|
||||
for (int j = 0; j < 2; j++) {
|
||||
zbin_ptr[j] = rnd.Rand16()&mask_;
|
||||
round_ptr[j] = rnd.Rand16();
|
||||
quant_ptr[j] = rnd.Rand16();
|
||||
quant_shift_ptr[j] = rnd.Rand16();
|
||||
dequant_ptr[j] = rnd.Rand16();
|
||||
}
|
||||
|
||||
ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr,
|
||||
quant_ptr, quant_shift_ptr, ref_qcoeff_ptr,
|
||||
ref_dqcoeff_ptr, dequant_ptr, zbin_oq_value,
|
||||
ref_eob_ptr, scan_order->scan, scan_order->iscan);
|
||||
ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_ptr, count, skip_block,
|
||||
zbin_ptr, round_ptr, quant_ptr,
|
||||
quant_shift_ptr, qcoeff_ptr,
|
||||
dqcoeff_ptr, dequant_ptr,
|
||||
zbin_oq_value, eob_ptr,
|
||||
scan_order->scan, scan_order->iscan));
|
||||
|
||||
for (int j = 0; j < sz; ++j) {
|
||||
err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) |
|
||||
(ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
|
||||
}
|
||||
err_count += (*ref_eob_ptr != *eob_ptr);
|
||||
if (err_count && !err_count_total) {
|
||||
first_failure = i;
|
||||
}
|
||||
err_count_total += err_count;
|
||||
}
|
||||
EXPECT_EQ(0, err_count_total)
|
||||
<< "Error: Quantization Test, C output doesn't match SSE2 output. "
|
||||
<< "First failed at test case " << first_failure;
|
||||
}
|
||||
TEST_P(Quantize32Test, EOBCheck) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  int zbin_oq_value = 0;
  DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff_ptr, 1024);
  DECLARE_ALIGNED_ARRAY(16, int16_t, zbin_ptr, 2);
  DECLARE_ALIGNED_ARRAY(16, int16_t, round_ptr, 2);
  DECLARE_ALIGNED_ARRAY(16, int16_t, quant_ptr, 2);
  DECLARE_ALIGNED_ARRAY(16, int16_t, quant_shift_ptr, 2);
  DECLARE_ALIGNED_ARRAY(16, tran_low_t, qcoeff_ptr, 1024);
  DECLARE_ALIGNED_ARRAY(16, tran_low_t, dqcoeff_ptr, 1024);
  DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_qcoeff_ptr, 1024);
  DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_dqcoeff_ptr, 1024);
  DECLARE_ALIGNED_ARRAY(16, int16_t, dequant_ptr, 2);
  DECLARE_ALIGNED_ARRAY(16, uint16_t, eob_ptr, 1);
  DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_eob_ptr, 1);
  int err_count_total = 0;
  int first_failure = -1;
  for (int i = 0; i < number_of_iterations; ++i) {
    int skip_block = i == 0;
    TX_SIZE sz = TX_32X32;
    TX_TYPE tx_type = (TX_TYPE)(i % 4);
    const scan_order *scan_order = &vp9_scan_orders[sz][tx_type];
    int count = (4 << sz) * (4 << sz);  // 1024
    int err_count = 0;
    *eob_ptr = rnd.Rand16();
    *ref_eob_ptr = *eob_ptr;
    for (int j = 0; j < count; j++) {
      coeff_ptr[j] = 0;
    }
    // Two random entries
    coeff_ptr[rnd(count)] = rnd.Rand16() & mask_;
    coeff_ptr[rnd(count)] = rnd.Rand16() & mask_;
    for (int j = 0; j < 2; j++) {
      zbin_ptr[j] = rnd.Rand16() & mask_;
      round_ptr[j] = rnd.Rand16();
      quant_ptr[j] = rnd.Rand16();
      quant_shift_ptr[j] = rnd.Rand16();
      dequant_ptr[j] = rnd.Rand16();
    }

    ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr,
                     quant_ptr, quant_shift_ptr, ref_qcoeff_ptr,
                     ref_dqcoeff_ptr, dequant_ptr, zbin_oq_value,
                     ref_eob_ptr, scan_order->scan, scan_order->iscan);
    ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_ptr, count, skip_block,
                                          zbin_ptr, round_ptr, quant_ptr,
                                          quant_shift_ptr, qcoeff_ptr,
                                          dqcoeff_ptr, dequant_ptr,
                                          zbin_oq_value, eob_ptr,
                                          scan_order->scan, scan_order->iscan));

    for (int j = 0; j < sz; ++j) {
      err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) |
          (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
    }
    err_count += (*ref_eob_ptr != *eob_ptr);
    if (err_count && !err_count_total) {
      first_failure = i;
    }
    err_count_total += err_count;
  }
  EXPECT_EQ(0, err_count_total)
      << "Error: Quantization Test, C output doesn't match SSE2 output. "
      << "First failed at test case " << first_failure;
}
using std::tr1::make_tuple;

#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(
    SSE2_C_COMPARE, QuantizeTest,
    ::testing::Values(
        make_tuple(&vp9_highbd_quantize_b_sse2,
                   &vp9_highbd_quantize_b_c, VPX_BITS_8),
        make_tuple(&vp9_highbd_quantize_b_sse2,
                   &vp9_highbd_quantize_b_c, VPX_BITS_10),
        make_tuple(&vp9_highbd_quantize_b_sse2,
                   &vp9_highbd_quantize_b_c, VPX_BITS_12)));
INSTANTIATE_TEST_CASE_P(
    SSE2_C_COMPARE, Quantize32Test,
    ::testing::Values(
        make_tuple(&vp9_highbd_quantize_b_32x32_sse2,
                   &vp9_highbd_quantize_b_32x32_c, VPX_BITS_8),
        make_tuple(&vp9_highbd_quantize_b_32x32_sse2,
                   &vp9_highbd_quantize_b_32x32_c, VPX_BITS_10),
        make_tuple(&vp9_highbd_quantize_b_32x32_sse2,
                   &vp9_highbd_quantize_b_32x32_c, VPX_BITS_12)));
#endif  // HAVE_SSE2
#endif  // CONFIG_VP9_HIGHBITDEPTH
}  // namespace
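Each make_tuple above feeds the fixture a (SIMD function, C reference, bit depth) triple. Below is a minimal sketch of how such a fixture conventionally unpacks the tuple; the class name, SetUp details, and mask_ formula are illustrative assumptions (the real fixture is defined earlier in quantize_test.cc), and the function signature is inferred from the calls in the tests above.

// Hypothetical fixture sketch; not the actual fixture in quantize_test.cc.
typedef void (*QuantizeFunc)(const tran_low_t *coeff_ptr, intptr_t count,
                             int skip_block, const int16_t *zbin_ptr,
                             const int16_t *round_ptr, const int16_t *quant_ptr,
                             const int16_t *quant_shift_ptr,
                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                             const int16_t *dequant_ptr, int zbin_oq_value,
                             uint16_t *eob_ptr, const int16_t *scan,
                             const int16_t *iscan);

class QuantizeTestSketch : public ::testing::TestWithParam<
    std::tr1::tuple<QuantizeFunc, QuantizeFunc, vpx_bit_depth_t> > {
 protected:
  virtual void SetUp() {
    quantize_op_ = std::tr1::get<0>(GetParam());      // SIMD implementation
    ref_quantize_op_ = std::tr1::get<1>(GetParam());  // C reference
    bit_depth_ = std::tr1::get<2>(GetParam());
    mask_ = (1 << bit_depth_) - 1;  // assumption: clamps random coefficients
  }
  QuantizeFunc quantize_op_;
  QuantizeFunc ref_quantize_op_;
  vpx_bit_depth_t bit_depth_;
  int mask_;
};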
test/sad_test.cc: 828 changes (file diff suppressed because it is too large).
@@ -134,6 +134,8 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += lpf_8_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_avg_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += quantize_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += error_block_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9) += vp9_intrapred_test.cc

ifeq ($(CONFIG_VP9_ENCODER),yes)
File diff suppressed because it is too large.
@@ -276,7 +276,7 @@ static void highbd_img_upshift(vpx_image_t *dst, vpx_image_t *src,
  // Note the offset is 1 less than half.
  const int offset = input_shift > 0 ? (1 << (input_shift - 1)) - 1 : 0;
  int plane;
  if (dst->w != src->w || dst->h != src->h ||
  if (dst->d_w != src->d_w || dst->d_h != src->d_h ||
      dst->x_chroma_shift != src->x_chroma_shift ||
      dst->y_chroma_shift != src->y_chroma_shift ||
      dst->fmt != src->fmt || input_shift < 0) {
@@ -293,12 +293,12 @@ static void highbd_img_upshift(vpx_image_t *dst, vpx_image_t *src,
      break;
  }
  for (plane = 0; plane < 3; plane++) {
    int w = src->w;
    int h = src->h;
    int w = src->d_w;
    int h = src->d_h;
    int x, y;
    if (plane) {
      w >>= src->x_chroma_shift;
      h >>= src->y_chroma_shift;
      w = (w + src->x_chroma_shift) >> src->x_chroma_shift;
      h = (h + src->y_chroma_shift) >> src->y_chroma_shift;
    }
    for (y = 0; y < h; y++) {
      uint16_t *p_src =
@@ -316,7 +316,7 @@ static void lowbd_img_upshift(vpx_image_t *dst, vpx_image_t *src,
  // Note the offset is 1 less than half.
  const int offset = input_shift > 0 ? (1 << (input_shift - 1)) - 1 : 0;
  int plane;
  if (dst->w != src->w || dst->h != src->h ||
  if (dst->d_w != src->d_w || dst->d_h != src->d_h ||
      dst->x_chroma_shift != src->x_chroma_shift ||
      dst->y_chroma_shift != src->y_chroma_shift ||
      dst->fmt != src->fmt + VPX_IMG_FMT_HIGHBITDEPTH ||
@@ -334,8 +334,8 @@ static void lowbd_img_upshift(vpx_image_t *dst, vpx_image_t *src,
      break;
  }
  for (plane = 0; plane < 3; plane++) {
    int w = src->w;
    int h = src->h;
    int w = src->d_w;
    int h = src->d_h;
    int x, y;
    if (plane) {
      w = (w + src->x_chroma_shift) >> src->x_chroma_shift;
@@ -384,8 +384,8 @@ void vpx_img_truncate_16_to_8(vpx_image_t *dst, vpx_image_t *src) {
    int h = src->d_h;
    int x, y;
    if (plane) {
      w >>= src->x_chroma_shift;
      h >>= src->y_chroma_shift;
      w = (w + src->x_chroma_shift) >> src->x_chroma_shift;
      h = (h + src->y_chroma_shift) >> src->y_chroma_shift;
    }
    for (y = 0; y < h; y++) {
      uint16_t *p_src =
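The plane-size change in these hunks replaces a floor (w >>= x_chroma_shift) with a round-up, so images with odd display dimensions keep their last chroma column and row when the planes are subsampled. Note that (w + shift) >> shift equals a true ceiling only for shift values 0 and 1, which covers the chroma shifts VPX uses. A quick standalone check (illustrative only):

#include <stdio.h>

int main(void) {
  const int shift = 1;  // 4:2:0 subsampling halves each chroma dimension
  int w;
  for (w = 5; w <= 6; ++w) {
    const int floored = w >> shift;            // old behaviour: 5 -> 2
    const int rounded = (w + shift) >> shift;  // new behaviour: 5 -> 3
    printf("luma %d: chroma floor=%d round-up=%d\n", w, floored, rounded);
  }
  return 0;
}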
@@ -15,36 +15,6 @@
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_idct.h"

#if CONFIG_EMULATE_HARDWARE
// When CONFIG_EMULATE_HARDWARE is 1 the transform performs a
// non-normative method to handle overflows. A stream that causes
// overflows in the inverse transform is considered invalid in VP9,
// and a hardware implementer is free to choose any reasonable
// method to handle overflows. However to aid in hardware
// verification they can use a specific implementation of the
// WRAPLOW() macro below that is identical to their intended
// hardware implementation (and also use configure options to trigger
// the C-implementation of the transform).
//
// The particular WRAPLOW implementation below performs strict
// overflow wrapping to match common hardware implementations.
// bd of 8 uses trans_low with 16bits, need to remove 16bits
// bd of 10 uses trans_low with 18bits, need to remove 14bits
// bd of 12 uses trans_low with 20bits, need to remove 12bits
// bd of x uses trans_low with 8+x bits, need to remove 24-x bits
#define WRAPLOW(x, bd) ((((int32_t)(x)) << (24 - bd)) >> (24 - bd))
#else
#define WRAPLOW(x, bd) (x)
#endif  // CONFIG_EMULATE_HARDWARE
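The shift pair in WRAPLOW truncates an intermediate to its low 8+bd bits and sign-extends, emulating the register width a fixed-point hardware pipeline would use at that bit depth. A standalone check of the wrapping (illustrative; like the macro itself, it leans on the usual two's-complement wrap of the left shift):

#include <stdint.h>
#include <stdio.h>

#define WRAPLOW(x, bd) ((((int32_t)(x)) << (24 - (bd))) >> (24 - (bd)))

int main(void) {
  // bd = 8 emulates a 16-bit register: 40000 wraps to 40000 - 65536.
  printf("bd= 8: %d\n", (int)WRAPLOW(40000, 8));   // -25536
  // bd = 10 emulates an 18-bit register, so 40000 passes through.
  printf("bd=10: %d\n", (int)WRAPLOW(40000, 10));  // 40000
  return 0;
}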

#if CONFIG_VP9_HIGHBITDEPTH
static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,
                                             int bd) {
  trans = WRAPLOW(trans, bd);
  return clip_pixel_highbd(WRAPLOW(dest + trans, bd), bd);
}
#endif  // CONFIG_VP9_HIGHBITDEPTH

static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) {
  trans = WRAPLOW(trans, 8);
  return clip_pixel(WRAPLOW(dest + trans, 8));
@@ -276,10 +246,10 @@ void vp9_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
static void iadst4(const tran_low_t *input, tran_low_t *output) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;

  tran_high_t x0 = input[0];
  tran_high_t x1 = input[1];
  tran_high_t x2 = input[2];
  tran_high_t x3 = input[3];
  tran_low_t x0 = input[0];
  tran_low_t x1 = input[1];
  tran_low_t x2 = input[2];
  tran_low_t x3 = input[3];

  if (!(x0 | x1 | x2 | x3)) {
    output[0] = output[1] = output[2] = output[3] = 0;
@@ -295,24 +265,19 @@ static void iadst4(const tran_low_t *input, tran_low_t *output) {
  s6 = sinpi_4_9 * x3;
  s7 = x0 - x2 + x3;

  x0 = s0 + s3 + s5;
  x1 = s1 - s4 - s6;
  x2 = sinpi_3_9 * s7;
  x3 = s2;

  s0 = x0 + x3;
  s1 = x1 + x3;
  s2 = x2;
  s3 = x0 + x1 - x3;
  s0 = s0 + s3 + s5;
  s1 = s1 - s4 - s6;
  s3 = s2;
  s2 = sinpi_3_9 * s7;

  // 1-D transform scaling factor is sqrt(2).
  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
  // + 1b (addition) = 29b.
  // Hence the output bit depth is 15b.
  output[0] = WRAPLOW(dct_const_round_shift(s0), 8);
  output[1] = WRAPLOW(dct_const_round_shift(s1), 8);
  output[0] = WRAPLOW(dct_const_round_shift(s0 + s3), 8);
  output[1] = WRAPLOW(dct_const_round_shift(s1 + s3), 8);
  output[2] = WRAPLOW(dct_const_round_shift(s2), 8);
  output[3] = WRAPLOW(dct_const_round_shift(s3), 8);
  output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3), 8);
}
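This refactor folds the intermediate x-variable shuffle directly into the output sums; the rounding step is untouched. In libvpx, dct_const_round_shift() rounds to nearest and shifts out DCT_CONST_BITS (14) fractional bits, which is the "14b (multiplication scaling)" the comment counts. A self-contained restatement of that arithmetic (sinpi_1_9 is libvpx's Q14 iadst constant; the rest is illustrative):

#include <stdint.h>
#include <stdio.h>

typedef int64_t tran_high_t;

#define DCT_CONST_BITS 14
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

// Mirrors libvpx's dct_const_round_shift(): round to nearest, then drop
// the 14 fractional bits carried by the Q14 transform constants.
static tran_high_t dct_const_round_shift(tran_high_t input) {
  return ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
}

int main(void) {
  const tran_high_t sinpi_1_9 = 5283;  // libvpx Q14 iadst constant
  const tran_high_t x0 = 8191;         // near-worst-case 14-bit input
  // 14b input * 14b constant (+ a bit for additions) stays inside 29b,
  // and shifting out 14b leaves a 15b result, matching the comment above.
  printf("%lld\n", (long long)dct_const_round_shift(sinpi_1_9 * x0));
  return 0;
}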

void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride,
@@ -1545,7 +1510,7 @@ void vp9_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
  }
}

static void highbd_idct4(const tran_low_t *input, tran_low_t *output, int bd) {
void vp9_highbd_idct4(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_low_t step[4];
  tran_high_t temp1, temp2;
  (void) bd;
@@ -1576,7 +1541,7 @@ void vp9_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,

  // Rows
  for (i = 0; i < 4; ++i) {
    highbd_idct4(input, outptr, bd);
    vp9_highbd_idct4(input, outptr, bd);
    input += 4;
    outptr += 4;
  }
@@ -1585,7 +1550,7 @@ void vp9_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
  for (i = 0; i < 4; ++i) {
    for (j = 0; j < 4; ++j)
      temp_in[j] = out[j * 4 + i];
    highbd_idct4(temp_in, temp_out, bd);
    vp9_highbd_idct4(temp_in, temp_out, bd);
    for (j = 0; j < 4; ++j) {
      dest[j * stride + i] = highbd_clip_pixel_add(
          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
@@ -1612,7 +1577,7 @@ void vp9_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
  }
}

static void highbd_idct8(const tran_low_t *input, tran_low_t *output, int bd) {
void vp9_highbd_idct8(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_low_t step1[8], step2[8];
  tran_high_t temp1, temp2;
  // stage 1
@@ -1630,7 +1595,7 @@ static void highbd_idct8(const tran_low_t *input, tran_low_t *output, int bd) {
  step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);

  // stage 2 & stage 3 - even half
  highbd_idct4(step1, step1, bd);
  vp9_highbd_idct4(step1, step1, bd);

  // stage 2 - odd half
  step2[4] = WRAPLOW(step1[4] + step1[5], bd);
@@ -1667,7 +1632,7 @@ void vp9_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,

  // First transform rows.
  for (i = 0; i < 8; ++i) {
    highbd_idct8(input, outptr, bd);
    vp9_highbd_idct8(input, outptr, bd);
    input += 8;
    outptr += 8;
  }
@@ -1676,7 +1641,7 @@ void vp9_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j)
      temp_in[j] = out[j * 8 + i];
    highbd_idct8(temp_in, temp_out, bd);
    vp9_highbd_idct8(temp_in, temp_out, bd);
    for (j = 0; j < 8; ++j) {
      dest[j * stride + i] = highbd_clip_pixel_add(
          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
@@ -1702,10 +1667,10 @@ void vp9_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
static void highbd_iadst4(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;

  tran_high_t x0 = input[0];
  tran_high_t x1 = input[1];
  tran_high_t x2 = input[2];
  tran_high_t x3 = input[3];
  tran_low_t x0 = input[0];
  tran_low_t x1 = input[1];
  tran_low_t x2 = input[2];
  tran_low_t x3 = input[3];
  (void) bd;

  if (!(x0 | x1 | x2 | x3)) {
@@ -1720,34 +1685,29 @@ static void highbd_iadst4(const tran_low_t *input, tran_low_t *output, int bd) {
  s4 = sinpi_1_9 * x2;
  s5 = sinpi_2_9 * x3;
  s6 = sinpi_4_9 * x3;
  s7 = x0 - x2 + x3;
  s7 = (tran_high_t)(x0 - x2 + x3);

  x0 = s0 + s3 + s5;
  x1 = s1 - s4 - s6;
  x2 = sinpi_3_9 * s7;
  x3 = s2;

  s0 = x0 + x3;
  s1 = x1 + x3;
  s2 = x2;
  s3 = x0 + x1 - x3;
  s0 = s0 + s3 + s5;
  s1 = s1 - s4 - s6;
  s3 = s2;
  s2 = sinpi_3_9 * s7;

  // 1-D transform scaling factor is sqrt(2).
  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
  // + 1b (addition) = 29b.
  // Hence the output bit depth is 15b.
  output[0] = WRAPLOW(dct_const_round_shift(s0), bd);
  output[1] = WRAPLOW(dct_const_round_shift(s1), bd);
  output[0] = WRAPLOW(dct_const_round_shift(s0 + s3), bd);
  output[1] = WRAPLOW(dct_const_round_shift(s1 + s3), bd);
  output[2] = WRAPLOW(dct_const_round_shift(s2), bd);
  output[3] = WRAPLOW(dct_const_round_shift(s3), bd);
  output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3), bd);
}

void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
                                int stride, int tx_type, int bd) {
  const highbd_transform_2d IHT_4[] = {
    { highbd_idct4, highbd_idct4 },          // DCT_DCT = 0
    { highbd_iadst4, highbd_idct4 },         // ADST_DCT = 1
    { highbd_idct4, highbd_iadst4 },         // DCT_ADST = 2
    { vp9_highbd_idct4, vp9_highbd_idct4 },  // DCT_DCT = 0
    { highbd_iadst4, vp9_highbd_idct4 },     // ADST_DCT = 1
    { vp9_highbd_idct4, highbd_iadst4 },     // DCT_ADST = 2
    { highbd_iadst4, highbd_iadst4 }         // ADST_ADST = 3
  };
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
@@ -1779,14 +1739,14 @@ void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
static void highbd_iadst8(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;

  tran_high_t x0 = input[7];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[5];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[3];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[1];
  tran_high_t x7 = input[6];
  tran_low_t x0 = input[7];
  tran_low_t x1 = input[0];
  tran_low_t x2 = input[5];
  tran_low_t x3 = input[2];
  tran_low_t x4 = input[3];
  tran_low_t x5 = input[4];
  tran_low_t x6 = input[1];
  tran_low_t x7 = input[6];
  (void) bd;

  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
@@ -1854,9 +1814,9 @@ static void highbd_iadst8(const tran_low_t *input, tran_low_t *output, int bd) {
}

static const highbd_transform_2d HIGH_IHT_8[] = {
  { highbd_idct8, highbd_idct8 },          // DCT_DCT = 0
  { highbd_iadst8, highbd_idct8 },         // ADST_DCT = 1
  { highbd_idct8, highbd_iadst8 },         // DCT_ADST = 2
  { vp9_highbd_idct8, vp9_highbd_idct8 },  // DCT_DCT = 0
  { highbd_iadst8, vp9_highbd_idct8 },     // ADST_DCT = 1
  { vp9_highbd_idct8, highbd_iadst8 },     // DCT_ADST = 2
  { highbd_iadst8, highbd_iadst8 }         // ADST_ADST = 3
};

@@ -1899,7 +1859,7 @@ void vp9_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
  // First transform rows.
  // Only the first 4 rows have non-zero coefs.
  for (i = 0; i < 4; ++i) {
    highbd_idct8(input, outptr, bd);
    vp9_highbd_idct8(input, outptr, bd);
    input += 8;
    outptr += 8;
  }
@@ -1907,7 +1867,7 @@ void vp9_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j)
      temp_in[j] = out[j * 8 + i];
    highbd_idct8(temp_in, temp_out, bd);
    vp9_highbd_idct8(temp_in, temp_out, bd);
    for (j = 0; j < 8; ++j) {
      dest[j * stride + i] = highbd_clip_pixel_add(
          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
@@ -1915,7 +1875,7 @@ void vp9_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
  }
}

static void highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd) {
void vp9_highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_low_t step1[16], step2[16];
  tran_high_t temp1, temp2;
  (void) bd;
@@ -2091,7 +2051,7 @@ void vp9_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,

  // First transform rows.
  for (i = 0; i < 16; ++i) {
    highbd_idct16(input, outptr, bd);
    vp9_highbd_idct16(input, outptr, bd);
    input += 16;
    outptr += 16;
  }
@@ -2100,7 +2060,7 @@ void vp9_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j)
      temp_in[j] = out[j * 16 + i];
    highbd_idct16(temp_in, temp_out, bd);
    vp9_highbd_idct16(temp_in, temp_out, bd);
    for (j = 0; j < 16; ++j) {
      dest[j * stride + i] = highbd_clip_pixel_add(
          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
@@ -2113,22 +2073,22 @@ static void highbd_iadst16(const tran_low_t *input, tran_low_t *output,
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
  tran_high_t s9, s10, s11, s12, s13, s14, s15;

  tran_high_t x0 = input[15];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[13];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[11];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[9];
  tran_high_t x7 = input[6];
  tran_high_t x8 = input[7];
  tran_high_t x9 = input[8];
  tran_high_t x10 = input[5];
  tran_high_t x11 = input[10];
  tran_high_t x12 = input[3];
  tran_high_t x13 = input[12];
  tran_high_t x14 = input[1];
  tran_high_t x15 = input[14];
  tran_low_t x0 = input[15];
  tran_low_t x1 = input[0];
  tran_low_t x2 = input[13];
  tran_low_t x3 = input[2];
  tran_low_t x4 = input[11];
  tran_low_t x5 = input[4];
  tran_low_t x6 = input[9];
  tran_low_t x7 = input[6];
  tran_low_t x8 = input[7];
  tran_low_t x9 = input[8];
  tran_low_t x10 = input[5];
  tran_low_t x11 = input[10];
  tran_low_t x12 = input[3];
  tran_low_t x13 = input[12];
  tran_low_t x14 = input[1];
  tran_low_t x15 = input[14];
  (void) bd;

  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
@@ -2280,9 +2240,9 @@ static void highbd_iadst16(const tran_low_t *input, tran_low_t *output,
}

static const highbd_transform_2d HIGH_IHT_16[] = {
  { highbd_idct16, highbd_idct16 },          // DCT_DCT = 0
  { highbd_iadst16, highbd_idct16 },         // ADST_DCT = 1
  { highbd_idct16, highbd_iadst16 },         // DCT_ADST = 2
  { vp9_highbd_idct16, vp9_highbd_idct16 },  // DCT_DCT = 0
  { highbd_iadst16, vp9_highbd_idct16 },     // ADST_DCT = 1
  { vp9_highbd_idct16, highbd_iadst16 },     // DCT_ADST = 2
  { highbd_iadst16, highbd_iadst16 }         // ADST_ADST = 3
};

@@ -2325,7 +2285,7 @@ void vp9_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
  // First transform rows. Since all non-zero dct coefficients are in
  // upper-left 4x4 area, we only need to calculate first 4 rows here.
  for (i = 0; i < 4; ++i) {
    highbd_idct16(input, outptr, bd);
    vp9_highbd_idct16(input, outptr, bd);
    input += 16;
    outptr += 16;
  }
@@ -2334,7 +2294,7 @@ void vp9_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j)
      temp_in[j] = out[j*16 + i];
    highbd_idct16(temp_in, temp_out, bd);
    vp9_highbd_idct16(temp_in, temp_out, bd);
    for (j = 0; j < 16; ++j) {
      dest[j * stride + i] = highbd_clip_pixel_add(
          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
@@ -116,6 +116,28 @@ typedef struct {
} highbd_transform_2d;
#endif  // CONFIG_VP9_HIGHBITDEPTH

#if CONFIG_EMULATE_HARDWARE
// When CONFIG_EMULATE_HARDWARE is 1 the transform performs a
// non-normative method to handle overflows. A stream that causes
// overflows in the inverse transform is considered invalid in VP9,
// and a hardware implementer is free to choose any reasonable
// method to handle overflows. However to aid in hardware
// verification they can use a specific implementation of the
// WRAPLOW() macro below that is identical to their intended
// hardware implementation (and also use configure options to trigger
// the C-implementation of the transform).
//
// The particular WRAPLOW implementation below performs strict
// overflow wrapping to match common hardware implementations.
// bd of 8 uses trans_low with 16bits, need to remove 16bits
// bd of 10 uses trans_low with 18bits, need to remove 14bits
// bd of 12 uses trans_low with 20bits, need to remove 12bits
// bd of x uses trans_low with 8+x bits, need to remove 24-x bits
#define WRAPLOW(x, bd) ((((int32_t)(x)) << (24 - bd)) >> (24 - bd))
#else
#define WRAPLOW(x, bd) (x)
#endif  // CONFIG_EMULATE_HARDWARE

void vp9_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
                     int eob);
void vp9_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
@@ -135,6 +157,9 @@ void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
                      int stride, int eob);

#if CONFIG_VP9_HIGHBITDEPTH
void vp9_highbd_idct4(const tran_low_t *input, tran_low_t *output, int bd);
void vp9_highbd_idct8(const tran_low_t *input, tran_low_t *output, int bd);
void vp9_highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd);
void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
                            int eob, int bd);
void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
@@ -151,6 +176,11 @@ void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,
                           uint8_t *dest, int stride, int eob, int bd);
void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input,
                             uint8_t *dest, int stride, int eob, int bd);
static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,
                                             int bd) {
  trans = WRAPLOW(trans, bd);
  return clip_pixel_highbd(WRAPLOW(dest + trans, bd), bd);
}
#endif  // CONFIG_VP9_HIGHBITDEPTH
#ifdef __cplusplus
}  // extern "C"
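Beyond moving WRAPLOW into the header, this change exports the previously static vp9_highbd_idct4/8/16 helpers and inlines highbd_clip_pixel_add, so code outside vp9_idct.c can reuse the C row and column passes. A hedged sketch of that reuse pattern, with a hypothetical caller name (it mirrors the column loop of vp9_highbd_idct4x4_16_add_c shown earlier):

// Hypothetical hybrid transform: a SIMD row pass could feed this plain C
// column pass built from the newly exported helpers.
static void hybrid_highbd_idct4x4_cols(const tran_low_t *rows,
                                       uint8_t *dest8, int stride, int bd) {
  tran_low_t temp_in[4], temp_out[4];
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  int i, j;
  for (i = 0; i < 4; ++i) {
    for (j = 0; j < 4; ++j)
      temp_in[j] = rows[j * 4 + i];
    vp9_highbd_idct4(temp_in, temp_out, bd);  // exported C column transform
    for (j = 0; j < 4; ++j) {
      dest[j * stride + i] = highbd_clip_pixel_add(
          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
    }
  }
}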
@@ -750,27 +750,12 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
  add_proto qw/void vp9_highbd_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
  specialize qw/vp9_highbd_idct4x4_1_add/;

  add_proto qw/void vp9_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
  specialize qw/vp9_highbd_idct4x4_16_add/;

  add_proto qw/void vp9_highbd_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
  specialize qw/vp9_highbd_idct8x8_1_add/;

  add_proto qw/void vp9_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
  specialize qw/vp9_highbd_idct8x8_64_add/;

  add_proto qw/void vp9_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
  specialize qw/vp9_highbd_idct8x8_10_add/;

  add_proto qw/void vp9_highbd_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
  specialize qw/vp9_highbd_idct16x16_1_add/;

  add_proto qw/void vp9_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
  specialize qw/vp9_highbd_idct16x16_256_add/;

  add_proto qw/void vp9_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
  specialize qw/vp9_highbd_idct16x16_10_add/;

  add_proto qw/void vp9_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
  specialize qw/vp9_highbd_idct32x32_1024_add/;

@@ -796,6 +781,42 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {

  add_proto qw/void vp9_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
  specialize qw/vp9_highbd_iwht4x4_16_add/;

  # Force C versions if CONFIG_EMULATE_HARDWARE is 1
  if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {

    add_proto qw/void vp9_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
    specialize qw/vp9_highbd_idct4x4_16_add/;

    add_proto qw/void vp9_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
    specialize qw/vp9_highbd_idct8x8_64_add/;

    add_proto qw/void vp9_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
    specialize qw/vp9_highbd_idct8x8_10_add/;

    add_proto qw/void vp9_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
    specialize qw/vp9_highbd_idct16x16_256_add/;

    add_proto qw/void vp9_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
    specialize qw/vp9_highbd_idct16x16_10_add/;

  } else {

    add_proto qw/void vp9_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
    specialize qw/vp9_highbd_idct4x4_16_add sse2/;

    add_proto qw/void vp9_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
    specialize qw/vp9_highbd_idct8x8_64_add sse2/;

    add_proto qw/void vp9_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
    specialize qw/vp9_highbd_idct8x8_10_add sse2/;

    add_proto qw/void vp9_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
    specialize qw/vp9_highbd_idct16x16_256_add sse2/;

    add_proto qw/void vp9_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
    specialize qw/vp9_highbd_idct16x16_10_add sse2/;
  }
}
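In vp9_rtcd_defs.pl, add_proto declares a dispatchable symbol and specialize lists the SIMD flavours it may resolve to; listing none pins the symbol to its C version. That is the point of the CONFIG_EMULATE_HARDWARE branch above: with emulation on, every highbitdepth inverse transform stays on the C path whose WRAPLOW wrapping matches hardware. Roughly, the generated rtcd header behaves like the sketch below (simplified; the real generator may use function pointers set up at runtime rather than plain defines):

/* Sketch of the dispatch generated for one add_proto/specialize pair. */
void vp9_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest,
                                 int dest_stride, int bd);
void vp9_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest,
                                    int dest_stride, int bd);

#if HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE
/* "specialize qw/... sse2/" lets the dispatcher pick the SIMD version. */
#define vp9_highbd_idct4x4_16_add vp9_highbd_idct4x4_16_add_sse2
#else
/* "specialize qw/.../" with no SIMD names pins the C implementation. */
#define vp9_highbd_idct4x4_16_add vp9_highbd_idct4x4_16_add_c
#endif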
|
||||
#
|
||||
@@ -1114,6 +1135,11 @@ specialize qw/vp9_get_mb_ss/, "$sse2_x86inc";
|
||||
add_proto qw/unsigned int vp9_avg_8x8/, "const uint8_t *, int p";
|
||||
specialize qw/vp9_avg_8x8 sse2/;
|
||||
|
||||
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
|
||||
add_proto qw/unsigned int vp9_highbd_avg_8x8/, "const uint8_t *, int p";
|
||||
specialize qw/vp9_highbd_avg_8x8/;
|
||||
}
|
||||
|
||||
# ENCODEMB INVOKE
|
||||
|
||||
add_proto qw/void vp9_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
|
||||
@@ -1176,43 +1202,43 @@ if (vpx_config("CONFIG_INTERNAL_STATS") eq "yes") {
|
||||
|
||||
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
|
||||
add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
|
||||
specialize qw/vp9_fht4x4/;
|
||||
specialize qw/vp9_fht4x4 sse2/;
|
||||
|
||||
add_proto qw/void vp9_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
|
||||
specialize qw/vp9_fht8x8/;
|
||||
specialize qw/vp9_fht8x8 sse2/;
|
||||
|
||||
add_proto qw/void vp9_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
|
||||
specialize qw/vp9_fht16x16/;
|
||||
specialize qw/vp9_fht16x16 sse2/;
|
||||
|
||||
add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_fwht4x4/;
|
||||
specialize qw/vp9_fwht4x4/, "$mmx_x86inc";
|
||||
|
||||
add_proto qw/void vp9_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_fdct4x4_1/;
|
||||
specialize qw/vp9_fdct4x4_1 sse2/;
|
||||
|
||||
add_proto qw/void vp9_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_fdct4x4/;
|
||||
specialize qw/vp9_fdct4x4 sse2/;
|
||||
|
||||
add_proto qw/void vp9_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_fdct8x8_1/;
|
||||
specialize qw/vp9_fdct8x8_1 sse2/;
|
||||
|
||||
add_proto qw/void vp9_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_fdct8x8/;
|
||||
specialize qw/vp9_fdct8x8 sse2/;
|
||||
|
||||
add_proto qw/void vp9_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_fdct16x16_1/;
|
||||
specialize qw/vp9_fdct16x16_1 sse2/;
|
||||
|
||||
add_proto qw/void vp9_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_fdct16x16/;
|
||||
specialize qw/vp9_fdct16x16 sse2/;
|
||||
|
||||
add_proto qw/void vp9_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_fdct32x32_1/;
|
||||
specialize qw/vp9_fdct32x32_1 sse2/;
|
||||
|
||||
add_proto qw/void vp9_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_fdct32x32/;
|
||||
specialize qw/vp9_fdct32x32 sse2/;
|
||||
|
||||
add_proto qw/void vp9_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_fdct32x32_rd/;
|
||||
specialize qw/vp9_fdct32x32_rd sse2/;
|
||||
} else {
|
||||
add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
|
||||
specialize qw/vp9_fht4x4 sse2/;
|
||||
@@ -1278,34 +1304,34 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
|
||||
|
||||
# variance
|
||||
add_proto qw/unsigned int vp9_highbd_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_variance32x16/;
|
||||
specialize qw/vp9_highbd_variance32x16/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_variance16x32/;
|
||||
specialize qw/vp9_highbd_variance16x32/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_variance64x32/;
|
||||
specialize qw/vp9_highbd_variance64x32/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_variance32x64/;
|
||||
specialize qw/vp9_highbd_variance32x64/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_variance32x32/;
|
||||
specialize qw/vp9_highbd_variance32x32/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_variance64x64/;
|
||||
specialize qw/vp9_highbd_variance64x64/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_variance16x16/;
|
||||
specialize qw/vp9_highbd_variance16x16/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_variance16x8/;
|
||||
specialize qw/vp9_highbd_variance16x8/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_variance8x16/;
|
||||
specialize qw/vp9_highbd_variance8x16/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_variance8x8/;
|
||||
specialize qw/vp9_highbd_variance8x8/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_variance8x4/;
|
||||
@@ -1317,40 +1343,40 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
|
||||
specialize qw/vp9_highbd_variance4x4/;
|
||||
|
||||
add_proto qw/void vp9_highbd_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
|
||||
specialize qw/vp9_highbd_get8x8var/;
|
||||
specialize qw/vp9_highbd_get8x8var/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/void vp9_highbd_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
|
||||
specialize qw/vp9_highbd_get16x16var/;
|
||||
specialize qw/vp9_highbd_get16x16var/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_10_variance32x16/;
|
||||
specialize qw/vp9_highbd_10_variance32x16/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_10_variance16x32/;
|
||||
specialize qw/vp9_highbd_10_variance16x32/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_10_variance64x32/;
|
||||
specialize qw/vp9_highbd_10_variance64x32/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_10_variance32x64/;
|
||||
specialize qw/vp9_highbd_10_variance32x64/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_10_variance32x32/;
|
||||
specialize qw/vp9_highbd_10_variance32x32/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_10_variance64x64/;
|
||||
specialize qw/vp9_highbd_10_variance64x64/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_10_variance16x16/;
|
||||
specialize qw/vp9_highbd_10_variance16x16/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_10_variance16x8/;
|
||||
specialize qw/vp9_highbd_10_variance16x8/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_10_variance8x16/;
|
||||
specialize qw/vp9_highbd_10_variance8x16/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_10_variance8x8/;
|
||||
specialize qw/vp9_highbd_10_variance8x8/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_10_variance8x4/;
|
||||
@@ -1362,40 +1388,40 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
|
||||
specialize qw/vp9_highbd_10_variance4x4/;
|
||||
|
||||
add_proto qw/void vp9_highbd_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
|
||||
specialize qw/vp9_highbd_10_get8x8var/;
|
||||
specialize qw/vp9_highbd_10_get8x8var/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/void vp9_highbd_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
|
||||
specialize qw/vp9_highbd_10_get16x16var/;
|
||||
specialize qw/vp9_highbd_10_get16x16var/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_12_variance32x16/;
|
||||
specialize qw/vp9_highbd_12_variance32x16/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_12_variance16x32/;
|
||||
specialize qw/vp9_highbd_12_variance16x32/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_12_variance64x32/;
|
||||
specialize qw/vp9_highbd_12_variance64x32/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_12_variance32x64/;
|
||||
specialize qw/vp9_highbd_12_variance32x64/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_12_variance32x32/;
|
||||
specialize qw/vp9_highbd_12_variance32x32/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_12_variance64x64/;
|
||||
specialize qw/vp9_highbd_12_variance64x64/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_12_variance16x16/;
|
||||
specialize qw/vp9_highbd_12_variance16x16/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_12_variance16x8/;
|
||||
specialize qw/vp9_highbd_12_variance16x8/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_12_variance8x16/;
|
||||
specialize qw/vp9_highbd_12_variance8x16/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_12_variance8x8/;
|
||||
specialize qw/vp9_highbd_12_variance8x8/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_12_variance8x4/;
|
||||
@@ -1407,76 +1433,76 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
|
||||
specialize qw/vp9_highbd_12_variance4x4/;
|
||||
|
||||
add_proto qw/void vp9_highbd_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
|
||||
specialize qw/vp9_highbd_12_get8x8var/;
|
||||
specialize qw/vp9_highbd_12_get8x8var/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/void vp9_highbd_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
|
||||
specialize qw/vp9_highbd_12_get16x16var/;
|
||||
specialize qw/vp9_highbd_12_get16x16var/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_sub_pixel_variance64x64/;
|
||||
specialize qw/vp9_highbd_sub_pixel_variance64x64/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
|
||||
specialize qw/vp9_highbd_sub_pixel_avg_variance64x64/;
|
||||
specialize qw/vp9_highbd_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_sub_pixel_variance32x64/;
|
||||
specialize qw/vp9_highbd_sub_pixel_variance32x64/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
|
||||
specialize qw/vp9_highbd_sub_pixel_avg_variance32x64/;
|
||||
specialize qw/vp9_highbd_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_sub_pixel_variance64x32/;
|
||||
specialize qw/vp9_highbd_sub_pixel_variance64x32/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
|
||||
specialize qw/vp9_highbd_sub_pixel_avg_variance64x32/;
|
||||
specialize qw/vp9_highbd_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_sub_pixel_variance32x16/;
|
||||
specialize qw/vp9_highbd_sub_pixel_variance32x16/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
|
||||
specialize qw/vp9_highbd_sub_pixel_avg_variance32x16/;
|
||||
specialize qw/vp9_highbd_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_sub_pixel_variance16x32/;
|
||||
specialize qw/vp9_highbd_sub_pixel_variance16x32/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
|
||||
specialize qw/vp9_highbd_sub_pixel_avg_variance16x32/;
|
||||
specialize qw/vp9_highbd_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_sub_pixel_variance32x32/;
|
||||
specialize qw/vp9_highbd_sub_pixel_variance32x32/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
|
||||
specialize qw/vp9_highbd_sub_pixel_avg_variance32x32/;
|
||||
specialize qw/vp9_highbd_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_sub_pixel_variance16x16/;
|
||||
specialize qw/vp9_highbd_sub_pixel_variance16x16/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
|
||||
specialize qw/vp9_highbd_sub_pixel_avg_variance16x16/;
|
||||
specialize qw/vp9_highbd_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_sub_pixel_variance8x16/;
|
||||
specialize qw/vp9_highbd_sub_pixel_variance8x16/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
|
||||
specialize qw/vp9_highbd_sub_pixel_avg_variance8x16/;
|
||||
specialize qw/vp9_highbd_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_sub_pixel_variance16x8/;
|
||||
specialize qw/vp9_highbd_sub_pixel_variance16x8/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
|
||||
specialize qw/vp9_highbd_sub_pixel_avg_variance16x8/;
|
||||
specialize qw/vp9_highbd_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_sub_pixel_variance8x8/;
|
||||
specialize qw/vp9_highbd_sub_pixel_variance8x8/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
|
||||
specialize qw/vp9_highbd_sub_pixel_avg_variance8x8/;
|
||||
specialize qw/vp9_highbd_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_sub_pixel_variance8x4/;
|
||||
specialize qw/vp9_highbd_sub_pixel_variance8x4/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
|
||||
specialize qw/vp9_highbd_sub_pixel_avg_variance8x4/;
|
||||
specialize qw/vp9_highbd_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_sub_pixel_variance4x8/;
|
||||
@@ -1491,70 +1517,70 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
|
||||
specialize qw/vp9_highbd_sub_pixel_avg_variance4x4/;
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_10_sub_pixel_variance64x64/;
|
||||
specialize qw/vp9_highbd_10_sub_pixel_variance64x64/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
|
||||
specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x64/;
|
||||
specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_10_sub_pixel_variance32x64/;
|
||||
specialize qw/vp9_highbd_10_sub_pixel_variance32x64/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
|
||||
specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x64/;
|
||||
specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vp9_highbd_10_sub_pixel_variance64x32/;
|
||||
specialize qw/vp9_highbd_10_sub_pixel_variance64x32/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
|
||||
specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x32/;
|
||||
specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x32/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance32x16/;
specialize qw/vp9_highbd_10_sub_pixel_variance32x16/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x16/;
specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x16/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance16x32/;
specialize qw/vp9_highbd_10_sub_pixel_variance16x32/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x32/;
specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x32/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance32x32/;
specialize qw/vp9_highbd_10_sub_pixel_variance32x32/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x32/;
specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x32/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance16x16/;
specialize qw/vp9_highbd_10_sub_pixel_variance16x16/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x16/;
specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x16/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance8x16/;
specialize qw/vp9_highbd_10_sub_pixel_variance8x16/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x16/;
specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x16/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance16x8/;
specialize qw/vp9_highbd_10_sub_pixel_variance16x8/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x8/;
specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x8/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance8x8/;
specialize qw/vp9_highbd_10_sub_pixel_variance8x8/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x8/;
specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x8/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance8x4/;
specialize qw/vp9_highbd_10_sub_pixel_variance8x4/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x4/;
specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x4/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance4x8/;
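All the sub_pixel_variance prototypes above share one contract. As a rough scalar model — a hedged sketch, not the libvpx kernels: the two-tap bilinear step stands in for vp9's filter table, and the highbd variants actually carry 16-bit samples behind the uint8_t pointers (CONVERT_TO_SHORTPTR) — the computation looks roughly like this:

#include <stdint.h>

/* Hedged scalar sketch of the sub_pixel_variance contract declared above.
 * Offsets are eighth-pel positions in [0, 8); names are illustrative. */
static unsigned int highbd_subpel_variance_sketch(
    const uint16_t *src, int src_stride, int xoffset, int yoffset,
    const uint16_t *ref, int ref_stride, int w, int h, unsigned int *sse) {
  int64_t sum = 0;
  uint64_t sq = 0;
  for (int r = 0; r < h; ++r) {
    for (int c = 0; c < w; ++c) {
      const uint16_t *s = src + r * src_stride + c;
      /* two-tap bilinear interpolation, horizontal then vertical */
      const int top = (s[0] * (8 - xoffset) + s[1] * xoffset + 4) >> 3;
      const int bot =
          (s[src_stride] * (8 - xoffset) + s[src_stride + 1] * xoffset + 4) >> 3;
      const int pred = (top * (8 - yoffset) + bot * yoffset + 4) >> 3;
      const int diff = pred - ref[r * ref_stride + c];
      sum += diff;
      sq += (uint64_t)(diff * diff);
    }
  }
  *sse = (unsigned int)sq;
  /* variance = SSE - mean^2 * N, folded into integer arithmetic */
  return (unsigned int)(sq - (uint64_t)((sum * sum) / (w * h)));
}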
@@ -1569,70 +1595,70 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_10_sub_pixel_avg_variance4x4/;

add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance64x64/;
specialize qw/vp9_highbd_12_sub_pixel_variance64x64/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x64/;
specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x64/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance32x64/;
specialize qw/vp9_highbd_12_sub_pixel_variance32x64/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x64/;
specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x64/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance64x32/;
specialize qw/vp9_highbd_12_sub_pixel_variance64x32/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x32/;
specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x32/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance32x16/;
specialize qw/vp9_highbd_12_sub_pixel_variance32x16/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x16/;
specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x16/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance16x32/;
specialize qw/vp9_highbd_12_sub_pixel_variance16x32/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x32/;
specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x32/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance32x32/;
specialize qw/vp9_highbd_12_sub_pixel_variance32x32/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x32/;
specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x32/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance16x16/;
specialize qw/vp9_highbd_12_sub_pixel_variance16x16/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x16/;
specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x16/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance8x16/;
specialize qw/vp9_highbd_12_sub_pixel_variance8x16/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x16/;
specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x16/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance16x8/;
specialize qw/vp9_highbd_12_sub_pixel_variance16x8/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x8/;
specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x8/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance8x8/;
specialize qw/vp9_highbd_12_sub_pixel_variance8x8/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x8/;
specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x8/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance8x4/;
specialize qw/vp9_highbd_12_sub_pixel_variance8x4/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x4/;
specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x4/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance4x8/;
@@ -1647,37 +1673,37 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_12_sub_pixel_avg_variance4x4/;

add_proto qw/unsigned int vp9_highbd_sad64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad64x64/;
specialize qw/vp9_highbd_sad64x64/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_sad32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad32x64/;
specialize qw/vp9_highbd_sad32x64/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_sad64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad64x32/;
specialize qw/vp9_highbd_sad64x32/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_sad32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad32x16/;
specialize qw/vp9_highbd_sad32x16/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_sad16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad16x32/;
specialize qw/vp9_highbd_sad16x32/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_sad32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad32x32/;
specialize qw/vp9_highbd_sad32x32/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_sad16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad16x16/;
specialize qw/vp9_highbd_sad16x16/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_sad16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad16x8/;
specialize qw/vp9_highbd_sad16x8/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_sad8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad8x16/;
specialize qw/vp9_highbd_sad8x16/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_sad8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad8x8/;
specialize qw/vp9_highbd_sad8x8/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_sad8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad8x4/;
specialize qw/vp9_highbd_sad8x4/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_sad4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad4x8/;
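The SAD prototypes above are the simplest of these kernels. A minimal scalar model, assuming (as libvpx does for the highbd build) that 16-bit samples sit behind the uint8_t pointers:

#include <stdint.h>
#include <stdlib.h>

/* Hedged scalar model of the highbd SAD contract above; names illustrative. */
static unsigned int highbd_sad_sketch(const uint16_t *src, int src_stride,
                                      const uint16_t *ref, int ref_stride,
                                      int w, int h) {
  unsigned int sad = 0;
  for (int r = 0; r < h; ++r)
    for (int c = 0; c < w; ++c)
      sad += (unsigned int)abs(src[r * src_stride + c] -
                               ref[r * ref_stride + c]);
  return sad;
}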
@@ -1686,37 +1712,37 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_sad4x4/;

add_proto qw/unsigned int vp9_highbd_sad64x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad64x64_avg/;
specialize qw/vp9_highbd_sad64x64_avg/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_sad32x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad32x64_avg/;
specialize qw/vp9_highbd_sad32x64_avg/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_sad64x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad64x32_avg/;
specialize qw/vp9_highbd_sad64x32_avg/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_sad32x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad32x16_avg/;
specialize qw/vp9_highbd_sad32x16_avg/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_sad16x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad16x32_avg/;
specialize qw/vp9_highbd_sad16x32_avg/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_sad32x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad32x32_avg/;
specialize qw/vp9_highbd_sad32x32_avg/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_sad16x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad16x16_avg/;
specialize qw/vp9_highbd_sad16x16_avg/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_sad16x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad16x8_avg/;
specialize qw/vp9_highbd_sad16x8_avg/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_sad8x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad8x16_avg/;
specialize qw/vp9_highbd_sad8x16_avg/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_sad8x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad8x8_avg/;
specialize qw/vp9_highbd_sad8x8_avg/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_sad8x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad8x4_avg/;
specialize qw/vp9_highbd_sad8x4_avg/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_sad4x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad4x8_avg/;
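The _avg variants score the source against the rounded average of the reference and a second predictor (compound prediction). A hedged sketch, assuming second_pred is a contiguous w-by-h block as in libvpx's comp-avg helpers:

#include <stdint.h>
#include <stdlib.h>

static unsigned int highbd_sad_avg_sketch(const uint16_t *src, int src_stride,
                                          const uint16_t *ref, int ref_stride,
                                          const uint16_t *second_pred,
                                          int w, int h) {
  unsigned int sad = 0;
  for (int r = 0; r < h; ++r)
    for (int c = 0; c < w; ++c) {
      /* rounded average of the two predictors, then absolute difference */
      const int avg = (ref[r * ref_stride + c] + second_pred[r * w + c] + 1) >> 1;
      sad += (unsigned int)abs(src[r * src_stride + c] - avg);
    }
  return sad;
}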
@@ -1773,47 +1799,46 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_sad4x4x8/;

add_proto qw/void vp9_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad64x64x4d/;
specialize qw/vp9_highbd_sad64x64x4d sse2/;

add_proto qw/void vp9_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad32x64x4d/;
specialize qw/vp9_highbd_sad32x64x4d sse2/;

add_proto qw/void vp9_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad64x32x4d/;
specialize qw/vp9_highbd_sad64x32x4d sse2/;

add_proto qw/void vp9_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad32x16x4d/;
specialize qw/vp9_highbd_sad32x16x4d sse2/;

add_proto qw/void vp9_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad16x32x4d/;
specialize qw/vp9_highbd_sad16x32x4d sse2/;

add_proto qw/void vp9_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad32x32x4d/;
specialize qw/vp9_highbd_sad32x32x4d sse2/;

add_proto qw/void vp9_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad16x16x4d/;
specialize qw/vp9_highbd_sad16x16x4d sse2/;

add_proto qw/void vp9_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad16x8x4d/;
specialize qw/vp9_highbd_sad16x8x4d sse2/;

add_proto qw/void vp9_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad8x16x4d/;
specialize qw/vp9_highbd_sad8x16x4d sse2/;

add_proto qw/void vp9_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad8x8x4d/;
specialize qw/vp9_highbd_sad8x8x4d sse2/;

# TODO(jingning): need to convert these 4x8/8x4 functions into sse2 form
add_proto qw/void vp9_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad8x4x4d/;
specialize qw/vp9_highbd_sad8x4x4d sse2/;

add_proto qw/void vp9_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad4x8x4d/;
specialize qw/vp9_highbd_sad4x8x4d sse2/;

add_proto qw/void vp9_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad4x4x4d/;
specialize qw/vp9_highbd_sad4x4x4d sse2/;
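The x4d variants amortize load and setup cost by scoring one source block against four candidate references in a single call, which is why motion search favors them. A hedged scalar sketch of that contract:

#include <stdint.h>
#include <stdlib.h>

static void highbd_sad4d_sketch(const uint16_t *src, int src_stride,
                                const uint16_t *const ref[4], int ref_stride,
                                int w, int h, unsigned int sad_array[4]) {
  for (int i = 0; i < 4; ++i) {
    unsigned int sad = 0;
    for (int r = 0; r < h; ++r)
      for (int c = 0; c < w; ++c)
        sad += (unsigned int)abs(src[r * src_stride + c] -
                                 ref[i][r * ref_stride + c]);
    sad_array[i] = sad;  /* one SAD per candidate reference */
  }
}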
add_proto qw/unsigned int vp9_highbd_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_mse16x16/;
specialize qw/vp9_highbd_mse16x16/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_mse8x16/;
@@ -1822,10 +1847,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_mse16x8/;

add_proto qw/unsigned int vp9_highbd_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_mse8x8/;
specialize qw/vp9_highbd_mse8x8/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_10_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_mse16x16/;
specialize qw/vp9_highbd_10_mse16x16/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_10_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_mse8x16/;
@@ -1834,10 +1859,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_10_mse16x8/;

add_proto qw/unsigned int vp9_highbd_10_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_mse8x8/;
specialize qw/vp9_highbd_10_mse8x8/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_12_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_mse16x16/;
specialize qw/vp9_highbd_12_mse16x16/, "$sse2_x86inc";

add_proto qw/unsigned int vp9_highbd_12_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_mse8x16/;
@@ -1846,12 +1871,12 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_12_mse16x8/;

add_proto qw/unsigned int vp9_highbd_12_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_mse8x8/;
specialize qw/vp9_highbd_12_mse8x8/, "$sse2_x86inc";

# ENCODEMB INVOKE

add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
specialize qw/vp9_highbd_block_error/;
specialize qw/vp9_highbd_block_error sse2/;

add_proto qw/void vp9_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
specialize qw/vp9_highbd_subtract_block/;
@@ -1863,10 +1888,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_quantize_fp_32x32/;

add_proto qw/void vp9_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_highbd_quantize_b/;
specialize qw/vp9_highbd_quantize_b sse2/;

add_proto qw/void vp9_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_highbd_quantize_b_32x32/;
specialize qw/vp9_highbd_quantize_b_32x32 sse2/;
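Of the ENCODEMB kernels above, vp9_highbd_block_error has the most compact contract: distortion between original and dequantized transform coefficients, with the coefficient energy returned through *ssz. A hedged sketch — the 2*(bd-8) rescale mirrors how libvpx brings 10/12-bit error back to 8-bit units, but treat details as an assumption:

#include <stdint.h>

typedef int32_t tran_low_t;  /* matches libvpx's high-bitdepth coefficient type */

static int64_t highbd_block_error_sketch(const tran_low_t *coeff,
                                         const tran_low_t *dqcoeff,
                                         intptr_t block_size, int64_t *ssz,
                                         int bd) {
  int64_t error = 0, sqcoeff = 0;
  const int shift = 2 * (bd - 8);                  /* energy scales as amplitude^2 */
  const int rounding = shift > 0 ? 1 << (shift - 1) : 0;
  for (intptr_t i = 0; i < block_size; ++i) {
    const int64_t diff = coeff[i] - dqcoeff[i];
    error += diff * diff;
    sqcoeff += (int64_t)coeff[i] * coeff[i];
  }
  *ssz = (sqcoeff + rounding) >> shift;
  return (error + rounding) >> shift;
}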
#
# Structured Similarity (SSIM)
@@ -1878,40 +1903,40 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {

# fdct functions
add_proto qw/void vp9_highbd_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/vp9_highbd_fht4x4/;
specialize qw/vp9_highbd_fht4x4 sse2/;

add_proto qw/void vp9_highbd_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/vp9_highbd_fht8x8/;
specialize qw/vp9_highbd_fht8x8 sse2/;

add_proto qw/void vp9_highbd_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/vp9_highbd_fht16x16/;
specialize qw/vp9_highbd_fht16x16 sse2/;

add_proto qw/void vp9_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_highbd_fwht4x4/;

add_proto qw/void vp9_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_highbd_fdct4x4/;
specialize qw/vp9_highbd_fdct4x4 sse2/;

add_proto qw/void vp9_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_highbd_fdct8x8_1/;

add_proto qw/void vp9_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_highbd_fdct8x8/;
specialize qw/vp9_highbd_fdct8x8 sse2/;

add_proto qw/void vp9_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_highbd_fdct16x16_1/;

add_proto qw/void vp9_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_highbd_fdct16x16/;
specialize qw/vp9_highbd_fdct16x16 sse2/;

add_proto qw/void vp9_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_highbd_fdct32x32_1/;

add_proto qw/void vp9_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_highbd_fdct32x32/;
specialize qw/vp9_highbd_fdct32x32 sse2/;

add_proto qw/void vp9_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_highbd_fdct32x32_rd/;
specialize qw/vp9_highbd_fdct32x32_rd sse2/;

add_proto qw/void vp9_highbd_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
specialize qw/vp9_highbd_temporal_filter_apply/;
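Throughout this file, each add_proto/specialize pair feeds libvpx's rtcd generator, which emits one symbol per kernel and points it at the best specialization the running CPU supports. A hedged sketch of that generated shape — the symbol name follows the real pattern, but the stub bodies and setup function are illustrative:

#include <stdint.h>

typedef unsigned int (*highbd_sad_fn_t)(const uint8_t *src, int src_stride,
                                        const uint8_t *ref, int ref_stride);

static unsigned int vp9_highbd_sad16x16_c(const uint8_t *src, int src_stride,
                                          const uint8_t *ref, int ref_stride) {
  (void)src; (void)src_stride; (void)ref; (void)ref_stride;
  return 0;  /* stand-in for the portable C kernel */
}

static unsigned int vp9_highbd_sad16x16_sse2(const uint8_t *src, int src_stride,
                                             const uint8_t *ref, int ref_stride) {
  (void)src; (void)src_stride; (void)ref; (void)ref_stride;
  return 0;  /* stand-in for the x86inc assembly kernel */
}

/* the dispatch pointer callers invoke as vp9_highbd_sad16x16(...) */
static highbd_sad_fn_t vp9_highbd_sad16x16 = vp9_highbd_sad16x16_c;

static void rtcd_setup_sketch(int have_sse2) {
  vp9_highbd_sad16x16 = vp9_highbd_sad16x16_c;
  if (have_sse2) vp9_highbd_sad16x16 = vp9_highbd_sad16x16_sse2;
}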
@@ -14,6 +14,10 @@
#include "vp9/common/vp9_loopfilter.h"
#include "vpx_ports/emmintrin_compat.h"

static INLINE __m128i highbd_abs_diff(__m128i a, __m128i b) {
  return _mm_or_si128(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a));
}
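highbd_abs_diff leans on unsigned saturating subtraction: whichever of (a-b) and (b-a) would go negative saturates to zero, so OR-ing the two yields |a-b| in every 16-bit lane with no branches. A scalar check of the trick:

#include <stdint.h>
#include <stdio.h>

/* scalar model of _mm_subs_epu16 on one lane */
static uint16_t subs_u16(uint16_t a, uint16_t b) { return a > b ? a - b : 0; }

int main(void) {
  const uint16_t a = 700, b = 1023;  /* two 10-bit samples */
  const uint16_t abs_diff = subs_u16(a, b) | subs_u16(b, a);
  printf("%u\n", abs_diff);  /* prints 323 */
  return 0;
}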
static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) {
  __m128i ubounded;
  __m128i lbounded;
@@ -35,8 +39,126 @@ static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) {
  return retval;
}

// TODO(debargha, peter): Break up large functions into smaller ones
// in this file.
static INLINE void get_hev_and_mask(const __m128i thresh, const __m128i limit,
                                    const __m128i blimit, const __m128i zero,
                                    const __m128i one, const __m128i ffff,
                                    __m128i abs_p1p0, __m128i abs_q1q0,
                                    __m128i abs_p0q0, __m128i abs_p1q1,
                                    __m128i abs_p2p1, __m128i abs_q2q1,
                                    __m128i abs_p3p2, __m128i abs_q3q2,
                                    __m128i *hev, __m128i *mask) {
  __m128i work0, work1, work2;

  // highbd_hev_mask
  work0 = _mm_max_epi16(abs_p1p0, abs_q1q0);
  *hev = _mm_subs_epu16(work0, thresh);
  *hev = _mm_xor_si128(_mm_cmpeq_epi16(*hev, zero), ffff);

  work1 = _mm_adds_epu16(abs_p0q0, abs_p0q0);  // abs(p0 - q0) * 2
  work2 = _mm_srli_epi16(abs_p1q1, 1);         // abs(p1 - q1) / 2
  *mask = _mm_subs_epu16(_mm_adds_epu16(work1, work2), blimit);
  *mask = _mm_xor_si128(_mm_cmpeq_epi16(*mask, zero), ffff);
  *mask = _mm_and_si128(*mask, _mm_adds_epu16(limit, one));
  *mask = _mm_max_epi16(work0, *mask);
  work0 = _mm_max_epi16(abs_p2p1, abs_q2q1);
  *mask = _mm_max_epi16(work0, *mask);
  work0 = _mm_max_epi16(abs_p3p2, abs_q3q2);
  *mask = _mm_max_epi16(work0, *mask);

  *mask = _mm_subs_epu16(*mask, limit);
  *mask = _mm_cmpeq_epi16(*mask, zero);  // return ~mask
}
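For reference, the per-pixel decisions that get_hev_and_mask vectorizes look like this in scalar form — a hedged model patterned on the filter_mask/hev_mask logic in vp9_loopfilter.c, with names chosen here for clarity:

#include <stdint.h>
#include <stdlib.h>

static int hev_sketch(int thresh, uint16_t p1, uint16_t p0,
                      uint16_t q0, uint16_t q1) {
  return abs(p1 - p0) > thresh || abs(q1 - q0) > thresh;
}

static int filter_mask_sketch(int limit, int blimit,
                              uint16_t p3, uint16_t p2, uint16_t p1, uint16_t p0,
                              uint16_t q0, uint16_t q1, uint16_t q2, uint16_t q3) {
  int over = 0;
  over |= (abs(p3 - p2) > limit);
  over |= (abs(p2 - p1) > limit);
  over |= (abs(p1 - p0) > limit);
  over |= (abs(q1 - q0) > limit);
  over |= (abs(q2 - q1) > limit);
  over |= (abs(q3 - q2) > limit);
  over |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit);
  /* the SSE2 routine returns ~mask: all-ones lanes where filtering is allowed */
  return !over;
}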
static INLINE void highbd_filter4_sse2(const __m128i mask, const __m128i hev,
                                       const __m128i p1, const __m128i p0,
                                       const __m128i q1, const __m128i q0,
                                       __m128i *ps1, __m128i *ps0,
                                       __m128i *qs0, __m128i *qs1,
                                       int bd) {
  const __m128i t4 = _mm_set1_epi16(4);
  const __m128i t3 = _mm_set1_epi16(3);
  const __m128i t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), bd - 8);
  const __m128i t1 = _mm_set1_epi16(0x1);
  __m128i filt, work, filter1, filter2;

  *ps1 = _mm_subs_epi16(p1, t80);
  *qs1 = _mm_subs_epi16(q1, t80);
  *ps0 = _mm_subs_epi16(p0, t80);
  *qs0 = _mm_subs_epi16(q0, t80);

  filt = _mm_and_si128(
      signed_char_clamp_bd_sse2(_mm_subs_epi16(*ps1, *qs1), bd), hev);
  work = _mm_subs_epi16(*qs0, *ps0);
  filt = _mm_adds_epi16(filt, work);
  filt = _mm_adds_epi16(filt, work);
  filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work), bd);
  filt = _mm_and_si128(filt, mask);

  filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd);
  filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd);

  // Filter1 >> 3
  filter1 = _mm_srai_epi16(filter1, 0x3);
  filter2 = _mm_srai_epi16(filter2, 0x3);

  *qs0 = _mm_adds_epi16(
      signed_char_clamp_bd_sse2(_mm_subs_epi16(*qs0, filter1), bd), t80);
  *ps0 = _mm_adds_epi16(
      signed_char_clamp_bd_sse2(_mm_adds_epi16(*ps0, filter2), bd), t80);
  filt = _mm_adds_epi16(filter1, t1);
  filt = _mm_srai_epi16(filt, 1);
  filt = _mm_andnot_si128(hev, filt);

  *qs1 = _mm_adds_epi16(
      signed_char_clamp_bd_sse2(_mm_subs_epi16(*qs1, filt), bd), t80);
  *ps1 = _mm_adds_epi16(
      signed_char_clamp_bd_sse2(_mm_adds_epi16(*ps1, filt), bd), t80);
}
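A scalar model of highbd_filter4_sse2 for one pixel column may make the data flow easier to follow — the vector code applies the same arithmetic to 8 lanes at once. This is a hedged sketch: the clamp bounds follow the signed_char_clamp_bd idea (plus/minus 128 scaled to the bit depth), and the saturating-add ordering of the vector code is folded into one expression:

#include <stdint.h>

static int clamp_bd(int v, int bd) {
  const int lo = -(128 << (bd - 8)), hi = (128 << (bd - 8)) - 1;
  return v < lo ? lo : (v > hi ? hi : v);
}

static void filter4_sketch(int mask, int hev, int bd,
                           uint16_t *p1, uint16_t *p0,
                           uint16_t *q0, uint16_t *q1) {
  const int t80 = 0x80 << (bd - 8);
  const int ps1 = *p1 - t80, ps0 = *p0 - t80;
  const int qs0 = *q0 - t80, qs1 = *q1 - t80;
  int filt = hev ? clamp_bd(ps1 - qs1, bd) : 0;
  filt = clamp_bd(filt + 3 * (qs0 - ps0), bd);
  filt = mask ? filt : 0;
  {
    const int filter1 = clamp_bd(filt + 4, bd) >> 3;
    const int filter2 = clamp_bd(filt + 3, bd) >> 3;
    *q0 = (uint16_t)(clamp_bd(qs0 - filter1, bd) + t80);
    *p0 = (uint16_t)(clamp_bd(ps0 + filter2, bd) + t80);
    /* outer taps move by half as much, and only where the edge is not "high
       edge variance" (hev) */
    filt = hev ? 0 : (filter1 + 1) >> 1;
    *q1 = (uint16_t)(clamp_bd(qs1 - filt, bd) + t80);
    *p1 = (uint16_t)(clamp_bd(ps1 + filt, bd) + t80);
  }
}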
static INLINE void apply_7tap_filter(const __m128i p3, const __m128i p2,
                                     const __m128i p1, const __m128i p0,
                                     const __m128i q0, const __m128i q1,
                                     const __m128i q2, const __m128i q3,
                                     uint16_t *flat_op2, uint16_t *flat_op1,
                                     uint16_t *flat_op0, uint16_t *flat_oq0,
                                     uint16_t *flat_oq1, uint16_t *flat_oq2) {
  __m128i workp_a, workp_b, workp_shft;
  const __m128i four = _mm_set1_epi16(4);

  workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
  workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
  workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_store_si128((__m128i *)flat_op2, workp_shft);

  workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_store_si128((__m128i *)flat_op1, workp_shft);

  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_store_si128((__m128i *)flat_op0, workp_shft);

  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_store_si128((__m128i *)flat_oq0, workp_shft);

  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_store_si128((__m128i *)flat_oq1, workp_shft);

  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_store_si128((__m128i *)flat_oq2, workp_shft);
}
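apply_7tap_filter keeps a running 8-sample sum (including the +4 rounding term) and slides it one tap per output with a single subtract and add, rather than recomputing each sum. Expanded, the six outputs it stores are the standard flat-region averages — each is ROUND_POWER_OF_TWO(sum, 3):

#include <stdint.h>

/* Expanded form of the running sums above; names match the flat_o* outputs. */
static void filter8_sketch(uint16_t p3, uint16_t p2, uint16_t p1, uint16_t p0,
                           uint16_t q0, uint16_t q1, uint16_t q2, uint16_t q3,
                           uint16_t out[6]) {
  out[0] = (uint16_t)((p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3);  /* op2 */
  out[1] = (uint16_t)((p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4) >> 3);  /* op1 */
  out[2] = (uint16_t)((p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4) >> 3);  /* op0 */
  out[3] = (uint16_t)((p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4) >> 3);  /* oq0 */
  out[4] = (uint16_t)((p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3 + 4) >> 3);  /* oq1 */
  out[5] = (uint16_t)((p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3 + 4) >> 3);  /* oq2 */
}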
static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,
                                                   int p,
                                                   const uint8_t *_blimit,
@@ -45,6 +167,7 @@ static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,
                                                   int bd) {
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi16(1);
  const __m128i ffff = _mm_cmpeq_epi16(zero, zero);
  const __m128i blimit = _mm_slli_epi16(
      _mm_unpacklo_epi8(
          _mm_load_si128((const __m128i *)_blimit), zero), bd - 8);
@@ -56,8 +179,8 @@ static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,
  __m128i q7, p7, q6, p6, q5, p5, q4, p4, q3, p3, q2, p2, q1, p1, q0, p0;
  __m128i mask, hev, flat, flat2, abs_p1p0, abs_q1q0;
  __m128i ps1, qs1, ps0, qs0;
  __m128i abs_p0q0, abs_p1q1, ffff, work;
  __m128i filt, work_a, filter1, filter2;
  __m128i abs_p0q0, abs_p1q1, work;
  __m128i abs_p2p1, abs_q2q1, abs_p3p2, abs_q3q2;
  __m128i flat2_q6, flat2_p6, flat2_q5, flat2_p5, flat2_q4, flat2_p4;
  __m128i flat2_q3, flat2_p3, flat2_q2, flat2_p2, flat2_q1, flat2_p1;
  __m128i flat2_q0, flat2_p0;
@@ -65,7 +188,6 @@ static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,
  __m128i pixelFilter_p, pixelFilter_q;
  __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
  __m128i sum_p7, sum_q7, sum_p3, sum_q3;
  __m128i t4, t3, t80, t1;
  __m128i eight, four;

  q4 = _mm_load_si128((__m128i *)(s + 4 * p));
@@ -80,98 +202,25 @@ static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,
  p0 = _mm_load_si128((__m128i *)(s - 1 * p));

  // highbd_filter_mask
  abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
  abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1));
  abs_p1p0 = highbd_abs_diff(p1, p0);
  abs_q1q0 = highbd_abs_diff(q1, q0);
  abs_p0q0 = highbd_abs_diff(p0, q0);
  abs_p1q1 = highbd_abs_diff(p1, q1);
  abs_p2p1 = highbd_abs_diff(p2, p1);
  abs_q2q1 = highbd_abs_diff(q2, q1);
  abs_p3p2 = highbd_abs_diff(p3, p2);
  abs_q3q2 = highbd_abs_diff(q3, q2);

  ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0);

  abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
  abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));

  // highbd_hev_mask (in C code this is actually called from highbd_filter4)
  flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
  hev = _mm_subs_epu16(flat, thresh);
  hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);

  abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);  // abs(p0 - q0) * 2
  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);  // abs(p1 - q1) / 2
  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
  mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
  mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p1, p0),
                                    _mm_subs_epu16(p0, p1)),
                       _mm_or_si128(_mm_subs_epu16(q1, q0),
                                    _mm_subs_epu16(q0, q1)));
  mask = _mm_max_epi16(work, mask);
  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p1),
                                    _mm_subs_epu16(p1, p2)),
                       _mm_or_si128(_mm_subs_epu16(q2, q1),
                                    _mm_subs_epu16(q1, q2)));
  mask = _mm_max_epi16(work, mask);
  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p3, p2),
                                    _mm_subs_epu16(p2, p3)),
                       _mm_or_si128(_mm_subs_epu16(q3, q2),
                                    _mm_subs_epu16(q2, q3)));
  mask = _mm_max_epi16(work, mask);

  mask = _mm_subs_epu16(mask, limit);
  mask = _mm_cmpeq_epi16(mask, zero);  // return ~mask
  get_hev_and_mask(thresh, limit, blimit, zero, one, ffff, abs_p1p0, abs_q1q0,
                   abs_p0q0, abs_p1q1, abs_p2p1, abs_q2q1, abs_p3p2, abs_q3q2,
                   &hev, &mask);

  // lp filter
  // highbd_filter4
  t4 = _mm_set1_epi16(4);
  t3 = _mm_set1_epi16(3);
  t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), bd - 8);
  t1 = _mm_set1_epi16(0x1);

  ps1 = _mm_subs_epi16(p1, t80);
  qs1 = _mm_subs_epi16(q1, t80);
  ps0 = _mm_subs_epi16(p0, t80);
  qs0 = _mm_subs_epi16(q0, t80);

  filt = _mm_and_si128(
      signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd), hev);
  work_a = _mm_subs_epi16(qs0, ps0);
  filt = _mm_adds_epi16(filt, work_a);
  filt = _mm_adds_epi16(filt, work_a);
  filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd);
  filt = _mm_and_si128(filt, mask);

  filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd);
  filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd);

  // Filter1 >> 3
  filter1 = _mm_srai_epi16(filter1, 0x3);
  filter2 = _mm_srai_epi16(filter2, 0x3);

  qs0 = _mm_adds_epi16(
      signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd),
      t80);
  ps0 = _mm_adds_epi16(
      signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd),
      t80);
  filt = _mm_adds_epi16(filter1, t1);
  filt = _mm_srai_epi16(filt, 1);
  filt = _mm_andnot_si128(hev, filt);

  qs1 = _mm_adds_epi16(
      signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd),
      t80);
  ps1 = _mm_adds_epi16(
      signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd),
      t80);
  // end highbd_filter4
  // loopfilter done
  highbd_filter4_sse2(mask, hev, p1, p0, q1, q0, &ps1, &ps0, &qs0, &qs1, bd);

  // highbd_flat_mask4
  flat = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p0),
                                    _mm_subs_epu16(p0, p2)),
                       _mm_or_si128(_mm_subs_epu16(p3, p0),
                                    _mm_subs_epu16(p0, p3)));
  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(q2, q0),
                                    _mm_subs_epu16(q0, q2)),
                       _mm_or_si128(_mm_subs_epu16(q3, q0),
                                    _mm_subs_epu16(q0, q3)));
  flat = _mm_max_epi16(highbd_abs_diff(p2, p0), highbd_abs_diff(p3, p0));
  work = _mm_max_epi16(highbd_abs_diff(q2, q0), highbd_abs_diff(q3, q0));
  flat = _mm_max_epi16(work, flat);
  work = _mm_max_epi16(abs_p1p0, abs_q1q0);
  flat = _mm_max_epi16(work, flat);
@@ -192,27 +241,15 @@ static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,

  // highbd_flat_mask5 (arguments passed in are p0, q0, p4-p7, q4-q7
  // but referred to as p0-p4 & q0-q4 in fn)
  flat2 = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p4, p0),
                                     _mm_subs_epu16(p0, p4)),
                        _mm_or_si128(_mm_subs_epu16(q4, q0),
                                     _mm_subs_epu16(q0, q4)));
  flat2 = _mm_max_epi16(highbd_abs_diff(p4, p0), highbd_abs_diff(q4, q0));

  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p5, p0),
                                    _mm_subs_epu16(p0, p5)),
                       _mm_or_si128(_mm_subs_epu16(q5, q0),
                                    _mm_subs_epu16(q0, q5)));
  work = _mm_max_epi16(highbd_abs_diff(p5, p0), highbd_abs_diff(q5, q0));
  flat2 = _mm_max_epi16(work, flat2);

  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p6, p0),
                                    _mm_subs_epu16(p0, p6)),
                       _mm_or_si128(_mm_subs_epu16(q6, q0),
                                    _mm_subs_epu16(q0, q6)));
  work = _mm_max_epi16(highbd_abs_diff(p6, p0), highbd_abs_diff(q6, q0));
  flat2 = _mm_max_epi16(work, flat2);

  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p7, p0),
                                    _mm_subs_epu16(p0, p7)),
                       _mm_or_si128(_mm_subs_epu16(q7, q0),
                                    _mm_subs_epu16(q0, q7)));
  work = _mm_max_epi16(highbd_abs_diff(p7, p0), highbd_abs_diff(q7, q0));
  flat2 = _mm_max_epi16(work, flat2);

  flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, bd - 8));
@@ -225,10 +262,8 @@ static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,
  eight = _mm_set1_epi16(8);
  four = _mm_set1_epi16(4);

  pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6, p5),
                                _mm_add_epi16(p4, p3));
  pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6, q5),
                                _mm_add_epi16(q4, q3));
  pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6, p5), _mm_add_epi16(p4, p3));
  pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6, q5), _mm_add_epi16(q4, q3));

  pixetFilter_p2p1p0 = _mm_add_epi16(p0, _mm_add_epi16(p2, p1));
  pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
@@ -237,9 +272,8 @@ static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,
  pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
  pixelFilter_p = _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p,
                                                     pixelFilter_q));
  pixetFilter_p2p1p0 = _mm_add_epi16(four,
                                     _mm_add_epi16(pixetFilter_p2p1p0,
                                                   pixetFilter_q2q1q0));
  pixetFilter_p2p1p0 = _mm_add_epi16(four, _mm_add_epi16(pixetFilter_p2p1p0,
                                                         pixetFilter_q2q1q0));
  flat2_p0 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
                                          _mm_add_epi16(p7, p0)), 4);
  flat2_q0 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
@@ -486,6 +520,8 @@ void vp9_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
  DECLARE_ALIGNED_ARRAY(16, uint16_t, flat_oq1, 16);
  DECLARE_ALIGNED_ARRAY(16, uint16_t, flat_oq0, 16);
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi16(1);
  const __m128i ffff = _mm_cmpeq_epi16(one, one);
  const __m128i blimit = _mm_slli_epi16(
      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero),
      bd - 8);
@@ -504,74 +540,30 @@ void vp9_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
  __m128i q1 = _mm_load_si128((__m128i *)(s + 1 * p));
  __m128i p0 = _mm_load_si128((__m128i *)(s - 1 * p));
  __m128i q0 = _mm_load_si128((__m128i *)(s + 0 * p));
  const __m128i one = _mm_set1_epi16(1);
  const __m128i ffff = _mm_cmpeq_epi16(one, one);
  __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
  const __m128i four = _mm_set1_epi16(4);
  __m128i workp_a, workp_b, workp_shft;

  const __m128i t4 = _mm_set1_epi16(4);
  const __m128i t3 = _mm_set1_epi16(3);
  const __m128i t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), bd - 8);
  const __m128i t1 = _mm_set1_epi16(0x1);
  const __m128i ps1 = _mm_subs_epi16(p1, t80);
  const __m128i ps0 = _mm_subs_epi16(p0, t80);
  const __m128i qs0 = _mm_subs_epi16(q0, t80);
  const __m128i qs1 = _mm_subs_epi16(q1, t80);
  __m128i filt;
  __m128i abs_p2p1, abs_q2q1, abs_p3p2, abs_q3q2;
  __m128i ps0, ps1, qs0, qs1;
  __m128i work_a;
  __m128i filter1, filter2;

  (void)count;

  // filter_mask and hev_mask
  abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0),
                          _mm_subs_epu16(p0, p1));
  abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0),
                          _mm_subs_epu16(q0, q1));
  abs_p1p0 = highbd_abs_diff(p1, p0);
  abs_q1q0 = highbd_abs_diff(q1, q0);
  abs_p0q0 = highbd_abs_diff(p0, q0);
  abs_p1q1 = highbd_abs_diff(p1, q1);
  abs_p2p1 = highbd_abs_diff(p2, p1);
  abs_q2q1 = highbd_abs_diff(q2, q1);
  abs_p3p2 = highbd_abs_diff(p3, p2);
  abs_q3q2 = highbd_abs_diff(q3, q2);

  abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0),
                          _mm_subs_epu16(q0, p0));
  abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1),
                          _mm_subs_epu16(q1, p1));
  flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
  hev = _mm_subs_epu16(flat, thresh);
  hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);

  abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
  mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
  // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
  // So taking maximums continues to work:
  mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
  mask = _mm_max_epi16(abs_p1p0, mask);
  // mask |= (abs(p1 - p0) > limit) * -1;
  mask = _mm_max_epi16(abs_q1q0, mask);
  // mask |= (abs(q1 - q0) > limit) * -1;

  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p1),
                                    _mm_subs_epu16(p1, p2)),
                       _mm_or_si128(_mm_subs_epu16(q2, q1),
                                    _mm_subs_epu16(q1, q2)));
  mask = _mm_max_epi16(work, mask);
  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p3, p2),
                                    _mm_subs_epu16(p2, p3)),
                       _mm_or_si128(_mm_subs_epu16(q3, q2),
                                    _mm_subs_epu16(q2, q3)));
  mask = _mm_max_epi16(work, mask);
  mask = _mm_subs_epu16(mask, limit);
  mask = _mm_cmpeq_epi16(mask, zero);
  get_hev_and_mask(thresh, limit, blimit, zero, one, ffff, abs_p1p0, abs_q1q0,
                   abs_p0q0, abs_p1q1, abs_p2p1, abs_q2q1, abs_p3p2, abs_q3q2,
                   &hev, &mask);

  // flat_mask4
  flat = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p0),
                                    _mm_subs_epu16(p0, p2)),
                       _mm_or_si128(_mm_subs_epu16(q2, q0),
                                    _mm_subs_epu16(q0, q2)));
  work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p3, p0),
                                    _mm_subs_epu16(p0, p3)),
                       _mm_or_si128(_mm_subs_epu16(q3, q0),
                                    _mm_subs_epu16(q0, q3)));
  flat = _mm_max_epi16(highbd_abs_diff(p2, p0), highbd_abs_diff(q2, q0));
  work = _mm_max_epi16(highbd_abs_diff(p3, p0), highbd_abs_diff(q3, q0));
  flat = _mm_max_epi16(work, flat);
  flat = _mm_max_epi16(abs_p1p0, flat);
  flat = _mm_max_epi16(abs_q1q0, flat);
@@ -579,77 +571,20 @@ void vp9_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
  flat = _mm_cmpeq_epi16(flat, zero);
  flat = _mm_and_si128(flat, mask);  // flat & mask

  // Added before shift for rounding part of ROUND_POWER_OF_TWO

  workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
  workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
  workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_store_si128((__m128i *)&flat_op2[0], workp_shft);

  workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_store_si128((__m128i *)&flat_op1[0], workp_shft);

  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_store_si128((__m128i *)&flat_op0[0], workp_shft);

  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_store_si128((__m128i *)&flat_oq0[0], workp_shft);

  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_store_si128((__m128i *)&flat_oq1[0], workp_shft);

  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_store_si128((__m128i *)&flat_oq2[0], workp_shft);
  // Apply 7-tap filter (result used if flat && mask) c.f. highbd_filter8
  apply_7tap_filter(p3, p2, p1, p0, q0, q1, q2, q3, &flat_op2[0], &flat_op1[0],
                    &flat_op0[0], &flat_oq0[0], &flat_oq1[0], &flat_oq2[0]);

  // lp filter
  filt = signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd);
  filt = _mm_and_si128(filt, hev);
  work_a = _mm_subs_epi16(qs0, ps0);
  filt = _mm_adds_epi16(filt, work_a);
  filt = _mm_adds_epi16(filt, work_a);
  filt = _mm_adds_epi16(filt, work_a);
  // (vp9_filter + 3 * (qs0 - ps0)) & mask
  filt = signed_char_clamp_bd_sse2(filt, bd);
  filt = _mm_and_si128(filt, mask);
  highbd_filter4_sse2(mask, hev, p1, p0, q1, q0, &ps1, &ps0, &qs0, &qs1, bd);

  filter1 = _mm_adds_epi16(filt, t4);
  filter2 = _mm_adds_epi16(filt, t3);

  // Filter1 >> 3
  filter1 = signed_char_clamp_bd_sse2(filter1, bd);
  filter1 = _mm_srai_epi16(filter1, 3);

  // Filter2 >> 3
  filter2 = signed_char_clamp_bd_sse2(filter2, bd);
  filter2 = _mm_srai_epi16(filter2, 3);

  // filt >> 1
  filt = _mm_adds_epi16(filter1, t1);
  filt = _mm_srai_epi16(filt, 1);
  // filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
  filt = _mm_andnot_si128(hev, filt);

  work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd);
  work_a = _mm_adds_epi16(work_a, t80);
  q0 = _mm_load_si128((__m128i *)flat_oq0);
  work_a = _mm_andnot_si128(flat, work_a);
  work_a = _mm_andnot_si128(flat, qs0);
  q0 = _mm_and_si128(flat, q0);
  q0 = _mm_or_si128(work_a, q0);

  work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd);
  work_a = _mm_adds_epi16(work_a, t80);
  q1 = _mm_load_si128((__m128i *)flat_oq1);
  work_a = _mm_andnot_si128(flat, work_a);
  work_a = _mm_andnot_si128(flat, qs1);
  q1 = _mm_and_si128(flat, q1);
  q1 = _mm_or_si128(work_a, q1);

@@ -659,17 +594,13 @@ void vp9_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
  q2 = _mm_and_si128(flat, q2);
  q2 = _mm_or_si128(work_a, q2);

  work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd);
  work_a = _mm_adds_epi16(work_a, t80);
  p0 = _mm_load_si128((__m128i *)flat_op0);
  work_a = _mm_andnot_si128(flat, work_a);
  work_a = _mm_andnot_si128(flat, ps0);
  p0 = _mm_and_si128(flat, p0);
  p0 = _mm_or_si128(work_a, p0);

  work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd);
  work_a = _mm_adds_epi16(work_a, t80);
  p1 = _mm_load_si128((__m128i *)flat_op1);
  work_a = _mm_andnot_si128(flat, work_a);
  work_a = _mm_andnot_si128(flat, ps1);
  p1 = _mm_and_si128(flat, p1);
  p1 = _mm_or_si128(work_a, p1);
@@ -715,7 +646,7 @@ void vp9_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
  const __m128i thresh = _mm_slli_epi16(
      _mm_unpacklo_epi8(
          _mm_load_si128((const __m128i *)_thresh), zero), bd - 8);
  __m128i mask, hev, flat;
  __m128i mask, hev;
  __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
  __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
  __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
@@ -724,121 +655,36 @@ void vp9_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
|
||||
__m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
|
||||
__m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
|
||||
__m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
|
||||
const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0),
|
||||
_mm_subs_epu16(p0, p1));
|
||||
const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0),
|
||||
_mm_subs_epu16(q0, q1));
|
||||
const __m128i ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0);
|
||||
const __m128i ffff = _mm_cmpeq_epi16(zero, zero);
|
||||
const __m128i one = _mm_set1_epi16(1);
|
||||
__m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0),
|
||||
_mm_subs_epu16(q0, p0));
|
||||
__m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1),
|
||||
_mm_subs_epu16(q1, p1));
|
||||
__m128i work;
|
||||
const __m128i t4 = _mm_set1_epi16(4);
|
||||
const __m128i t3 = _mm_set1_epi16(3);
|
||||
const __m128i t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), bd - 8);
|
||||
const __m128i tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), bd - 8);
|
||||
const __m128i tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), bd - 8);
|
||||
const __m128i t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 16 - bd);
|
||||
// equivalent to shifting 0x1f left by bitdepth - 8
|
||||
// and setting new bits to 1
|
||||
const __m128i t1 = _mm_set1_epi16(0x1);
|
||||
const __m128i t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 16 - bd);
|
||||
// equivalent to shifting 0x7f left by bitdepth - 8
|
||||
// and setting new bits to 1
|
||||
const __m128i ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * p)),
|
||||
t80);
|
||||
const __m128i ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * p)),
|
||||
t80);
|
||||
const __m128i qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * p)),
|
||||
t80);
|
||||
const __m128i qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * p)),
|
||||
t80);
|
||||
__m128i filt;
|
||||
__m128i work_a;
|
||||
__m128i filter1, filter2;
|
||||
|
||||
__m128i ps1, ps0, qs0, qs1;
|
||||
__m128i abs_p1p0, abs_q1q0, abs_p0q0, abs_p1q1;
|
||||
__m128i abs_p2p1, abs_q2q1, abs_p3p2, abs_q3q2;
|
||||
(void)count;
|
||||
|
||||
// filter_mask and hev_mask
|
||||
flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
|
||||
hev = _mm_subs_epu16(flat, thresh);
|
||||
hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
|
||||
abs_p1p0 = highbd_abs_diff(p1, p0);
|
||||
abs_q1q0 = highbd_abs_diff(q1, q0);
|
||||
abs_p0q0 = highbd_abs_diff(p0, q0);
|
||||
abs_p1q1 = highbd_abs_diff(p1, q1);
|
||||
abs_p2p1 = highbd_abs_diff(p2, p1);
|
||||
abs_q2q1 = highbd_abs_diff(q2, q1);
|
||||
abs_p3p2 = highbd_abs_diff(p3, p2);
|
||||
abs_q3q2 = highbd_abs_diff(q3, q2);
|
||||
|
||||
abs_p0q0 =_mm_adds_epu16(abs_p0q0, abs_p0q0);
|
||||
abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
|
||||
mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
|
||||
mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
|
||||
// mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
// So taking maximums continues to work:
mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
mask = _mm_max_epi16(flat, mask);
// mask |= (abs(p1 - p0) > limit) * -1;
// mask |= (abs(q1 - q0) > limit) * -1;
work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p1),
_mm_subs_epu16(p1, p2)),
_mm_or_si128(_mm_subs_epu16(p3, p2),
_mm_subs_epu16(p2, p3)));
mask = _mm_max_epi16(work, mask);
work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(q2, q1),
_mm_subs_epu16(q1, q2)),
_mm_or_si128(_mm_subs_epu16(q3, q2),
_mm_subs_epu16(q2, q3)));
mask = _mm_max_epi16(work, mask);
mask = _mm_subs_epu16(mask, limit);
mask = _mm_cmpeq_epi16(mask, zero);
get_hev_and_mask(thresh, limit, blimit, zero, one, ffff, abs_p1p0, abs_q1q0,
abs_p0q0, abs_p1q1, abs_p2p1, abs_q2q1, abs_p3p2, abs_q3q2,
&hev, &mask);

// filter4
filt = signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd);
filt = _mm_and_si128(filt, hev);
work_a = _mm_subs_epi16(qs0, ps0);
filt = _mm_adds_epi16(filt, work_a);
filt = _mm_adds_epi16(filt, work_a);
filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd);
// (vp9_filter + 3 * (qs0 - ps0)) & mask
filt = _mm_and_si128(filt, mask);
highbd_filter4_sse2(mask, hev, p1, p0, q1, q0, &ps1, &ps0, &qs0, &qs1, bd);

filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd);
filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd);

// Filter1 >> 3
work_a = _mm_cmpgt_epi16(zero, filter1);  // get the values that are <0
filter1 = _mm_srli_epi16(filter1, 3);
work_a = _mm_and_si128(work_a, tffe0);  // sign bits for the values < 0
filter1 = _mm_and_si128(filter1, t1f);  // clamp the range
filter1 = _mm_or_si128(filter1, work_a);  // reinsert the sign bits
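// Together, the three lines above emulate an arithmetic shift right:
// shift logically, mask to the valid range, then OR the sign bits back in.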

// Filter2 >> 3
work_a = _mm_cmpgt_epi16(zero, filter2);
filter2 = _mm_srli_epi16(filter2, 3);
work_a = _mm_and_si128(work_a, tffe0);
filter2 = _mm_and_si128(filter2, t1f);
filter2 = _mm_or_si128(filter2, work_a);

// filt >> 1
filt = _mm_adds_epi16(filter1, t1);
work_a = _mm_cmpgt_epi16(zero, filt);
filt = _mm_srli_epi16(filt, 1);
work_a = _mm_and_si128(work_a, tff80);
filt = _mm_and_si128(filt, t7f);
filt = _mm_or_si128(filt, work_a);

filt = _mm_andnot_si128(hev, filt);

q0 = _mm_adds_epi16(
signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd), t80);
q1 = _mm_adds_epi16(
signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd), t80);
p0 = _mm_adds_epi16(
signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd), t80);
p1 = _mm_adds_epi16(
signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd), t80);

_mm_storeu_si128((__m128i *)(s - 2 * p), p1);
_mm_storeu_si128((__m128i *)(s - 1 * p), p0);
_mm_storeu_si128((__m128i *)(s + 0 * p), q0);
_mm_storeu_si128((__m128i *)(s + 1 * p), q1);
_mm_storeu_si128((__m128i *)(s - 2 * p), ps1);
_mm_storeu_si128((__m128i *)(s - 1 * p), ps0);
_mm_storeu_si128((__m128i *)(s + 0 * p), qs0);
_mm_storeu_si128((__m128i *)(s + 1 * p), qs1);
}

void vp9_highbd_lpf_horizontal_4_dual_sse2(uint16_t *s, int p,

@@ -19,42 +19,34 @@
mov rcx, 0x00000040

movdqa xmm7, [rdx] ;load filters
pshuflw xmm0, xmm7, 0b ;k0
pshuflw xmm1, xmm7, 01010101b ;k1
pshuflw xmm2, xmm7, 10101010b ;k2
pshuflw xmm3, xmm7, 11111111b ;k3
pshuflw xmm10, xmm7, 0b ;k0
pshuflw xmm11, xmm7, 01010101b ;k1
pshuflw xmm12, xmm7, 10101010b ;k2
pshuflw xmm13, xmm7, 11111111b ;k3
psrldq xmm7, 8
pshuflw xmm4, xmm7, 0b ;k4
pshuflw xmm5, xmm7, 01010101b ;k5
pshuflw xmm6, xmm7, 10101010b ;k6
pshuflw xmm7, xmm7, 11111111b ;k7

punpcklwd xmm0, xmm6
punpcklwd xmm2, xmm5
punpcklwd xmm3, xmm4
punpcklwd xmm1, xmm7
punpcklwd xmm10, xmm6
punpcklwd xmm12, xmm5
punpcklwd xmm13, xmm4
punpcklwd xmm11, xmm7

movdqa k0k6, xmm0
movdqa k2k5, xmm2
movdqa k3k4, xmm3
movdqa k1k7, xmm1

movq xmm6, rcx
pshufd xmm6, xmm6, 0
movdqa krd, xmm6
movq xmm9, rcx
pshufd xmm9, xmm9, 0

;Compute max and min values of a pixel
mov rdx, 0x00010001
movsxd rcx, DWORD PTR arg(6) ;bps
movq xmm0, rdx
movq xmm14, rdx
movq xmm1, rcx
pshufd xmm0, xmm0, 0b
movdqa xmm2, xmm0
psllw xmm0, xmm1
psubw xmm0, xmm2
pxor xmm1, xmm1
movdqa max, xmm0 ;max value (for clamping)
movdqa min, xmm1 ;min value (for clamping)
pshufd xmm14, xmm14, 0b
movdqa xmm2, xmm14
psllw xmm14, xmm1
psubw xmm14, xmm2 ;max value (for clamping)
pxor xmm8, xmm8 ;min value (for clamping)
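; xmm14 now holds (1 << bps) - 1 in every word (255/1023/4095 for
; 8/10/12-bit sources) and xmm8 holds zero.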

%endm

@@ -64,22 +56,22 @@
punpcklwd xmm2, xmm5
punpcklwd xmm3, xmm4

pmaddwd xmm0, k0k6 ;multiply the filter factors
pmaddwd xmm1, k1k7
pmaddwd xmm2, k2k5
pmaddwd xmm3, k3k4
pmaddwd xmm0, xmm10 ;multiply the filter factors
pmaddwd xmm1, xmm11
pmaddwd xmm2, xmm12
pmaddwd xmm3, xmm13

paddd xmm0, xmm1 ;sum
paddd xmm0, xmm2
paddd xmm0, xmm3

paddd xmm0, krd ;rounding
paddd xmm0, xmm9 ;rounding
psrad xmm0, 7 ;shift
packssdw xmm0, xmm0 ;pack to word

;clamp the values
pminsw xmm0, max
pmaxsw xmm0, min
pminsw xmm0, xmm14
pmaxsw xmm0, xmm8
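; The pminsw/pmaxsw pair clamps each filtered pixel to [0, (1 << bps) - 1]
; before it is stored.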

%if %1
movq xmm1, [rdi]
@@ -95,42 +87,34 @@
mov rcx, 0x00000040

movdqa xmm7, [rdx] ;load filters
pshuflw xmm0, xmm7, 0b ;k0
pshuflw xmm10, xmm7, 0b ;k0
pshuflw xmm1, xmm7, 01010101b ;k1
pshuflw xmm2, xmm7, 10101010b ;k2
pshuflw xmm3, xmm7, 11111111b ;k3
pshuflw xmm12, xmm7, 10101010b ;k2
pshuflw xmm13, xmm7, 11111111b ;k3
pshufhw xmm4, xmm7, 0b ;k4
pshufhw xmm5, xmm7, 01010101b ;k5
pshufhw xmm6, xmm7, 10101010b ;k6
pshufhw xmm11, xmm7, 10101010b ;k6
pshufhw xmm7, xmm7, 11111111b ;k7
punpcklqdq xmm2, xmm2
punpcklqdq xmm3, xmm3
punpcklwd xmm0, xmm1
punpckhwd xmm6, xmm7
punpckhwd xmm2, xmm5
punpckhwd xmm3, xmm4
punpcklqdq xmm12, xmm12
punpcklqdq xmm13, xmm13
punpcklwd xmm10, xmm1
punpckhwd xmm11, xmm7
punpckhwd xmm12, xmm5
punpckhwd xmm13, xmm4

movdqa k0k1, xmm0 ;store filter factors on stack
movdqa k6k7, xmm6
movdqa k2k5, xmm2
movdqa k3k4, xmm3

movq xmm6, rcx
pshufd xmm6, xmm6, 0
movdqa krd, xmm6 ;rounding
movq xmm9, rcx
pshufd xmm9, xmm9, 0 ;rounding

;Compute max and min values of a pixel
mov rdx, 0x00010001
movsxd rcx, DWORD PTR arg(6) ;bps
movq xmm0, rdx
movq xmm14, rdx
movq xmm1, rcx
pshufd xmm0, xmm0, 0b
movdqa xmm2, xmm0
psllw xmm0, xmm1
psubw xmm0, xmm2
pxor xmm1, xmm1
movdqa max, xmm0 ;max value (for clamping)
movdqa min, xmm1 ;min value (for clamping)
pshufd xmm14, xmm14, 0b
movdqa xmm2, xmm14
psllw xmm14, xmm1
psubw xmm14, xmm2 ;max value (for clamping)
pxor xmm15, xmm15 ;min value (for clamping)
%endm

%macro LOAD_VERT_8 1
@@ -146,7 +130,7 @@
%endm

%macro HIGH_APPLY_FILTER_8 2
movdqu temp, xmm4
movdqa xmm8, xmm4
movdqa xmm4, xmm0
punpcklwd xmm0, xmm1
punpckhwd xmm4, xmm1
@@ -157,21 +141,21 @@
punpcklwd xmm2, xmm5
punpckhwd xmm7, xmm5

movdqu xmm5, temp
movdqu temp, xmm4
movdqa xmm5, xmm8
movdqa xmm8, xmm4
movdqa xmm4, xmm3
punpcklwd xmm3, xmm5
punpckhwd xmm4, xmm5
movdqu xmm5, temp
movdqa xmm5, xmm8

pmaddwd xmm0, k0k1
pmaddwd xmm5, k0k1
pmaddwd xmm6, k6k7
pmaddwd xmm1, k6k7
pmaddwd xmm2, k2k5
pmaddwd xmm7, k2k5
pmaddwd xmm3, k3k4
pmaddwd xmm4, k3k4
pmaddwd xmm0, xmm10
pmaddwd xmm5, xmm10
pmaddwd xmm6, xmm11
pmaddwd xmm1, xmm11
pmaddwd xmm2, xmm12
pmaddwd xmm7, xmm12
pmaddwd xmm3, xmm13
pmaddwd xmm4, xmm13

paddd xmm0, xmm6
paddd xmm0, xmm2
@@ -180,15 +164,15 @@
paddd xmm5, xmm7
paddd xmm5, xmm4

paddd xmm0, krd ;rounding
paddd xmm5, krd
paddd xmm0, xmm9 ;rounding
paddd xmm5, xmm9
psrad xmm0, 7 ;shift
psrad xmm5, 7
packssdw xmm0, xmm5 ;pack back to word

;clamp the values
pminsw xmm0, max
pmaxsw xmm0, min
pminsw xmm0, xmm14
pmaxsw xmm0, xmm15

%if %1
movdqu xmm1, [rdi + %2]
@@ -211,22 +195,12 @@ sym(vp9_highbd_filter_block1d4_v8_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
SAVE_XMM 14
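; xmm9-xmm14 now carry the filter taps and the rounding constant in
; registers, replacing the stack-based k*/krd/max/min defines removed below.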
push rsi
push rdi
push rbx
; end prolog

ALIGN_STACK 16, rax
sub rsp, 16 * 7
%define k0k6 [rsp + 16 * 0]
%define k2k5 [rsp + 16 * 1]
%define k3k4 [rsp + 16 * 2]
%define k1k7 [rsp + 16 * 3]
%define krd [rsp + 16 * 4]
%define max [rsp + 16 * 5]
%define min [rsp + 16 * 6]

HIGH_GET_FILTERS_4

mov rsi, arg(0) ;src_ptr
@@ -256,8 +230,6 @@ sym(vp9_highbd_filter_block1d4_v8_sse2):
dec rcx
jnz .loop

add rsp, 16 * 7
pop rsp
pop rbx
; begin epilog
pop rdi
@@ -281,23 +253,12 @@ sym(vp9_highbd_filter_block1d8_v8_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
SAVE_XMM 15
push rsi
push rdi
push rbx
; end prolog

ALIGN_STACK 16, rax
sub rsp, 16 * 8
%define k0k1 [rsp + 16 * 0]
%define k6k7 [rsp + 16 * 1]
%define k2k5 [rsp + 16 * 2]
%define k3k4 [rsp + 16 * 3]
%define krd [rsp + 16 * 4]
%define temp [rsp + 16 * 5]
%define max [rsp + 16 * 6]
%define min [rsp + 16 * 7]

HIGH_GET_FILTERS

movsxd rax, DWORD PTR arg(1) ;pixels_per_line
@@ -315,8 +276,6 @@ sym(vp9_highbd_filter_block1d8_v8_sse2):
dec rcx
jnz .loop

add rsp, 16 * 8
pop rsp
pop rbx
; begin epilog
pop rdi
@@ -340,23 +299,12 @@ sym(vp9_highbd_filter_block1d16_v8_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
SAVE_XMM 15
push rsi
push rdi
push rbx
; end prolog

ALIGN_STACK 16, rax
sub rsp, 16 * 8
%define k0k1 [rsp + 16 * 0]
%define k6k7 [rsp + 16 * 1]
%define k2k5 [rsp + 16 * 2]
%define k3k4 [rsp + 16 * 3]
%define krd [rsp + 16 * 4]
%define temp [rsp + 16 * 5]
%define max [rsp + 16 * 6]
%define min [rsp + 16 * 7]

HIGH_GET_FILTERS

movsxd rax, DWORD PTR arg(1) ;pixels_per_line
@@ -378,8 +326,6 @@ sym(vp9_highbd_filter_block1d16_v8_sse2):
dec rcx
jnz .loop

add rsp, 16 * 8
pop rsp
pop rbx
; begin epilog
pop rdi
@@ -394,22 +340,12 @@ sym(vp9_highbd_filter_block1d4_v8_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
SAVE_XMM 14
push rsi
push rdi
push rbx
; end prolog

ALIGN_STACK 16, rax
sub rsp, 16 * 7
%define k0k6 [rsp + 16 * 0]
%define k2k5 [rsp + 16 * 1]
%define k3k4 [rsp + 16 * 2]
%define k1k7 [rsp + 16 * 3]
%define krd [rsp + 16 * 4]
%define max [rsp + 16 * 5]
%define min [rsp + 16 * 6]

HIGH_GET_FILTERS_4

mov rsi, arg(0) ;src_ptr
@@ -439,8 +375,6 @@ sym(vp9_highbd_filter_block1d4_v8_avg_sse2):
dec rcx
jnz .loop

add rsp, 16 * 7
pop rsp
pop rbx
; begin epilog
pop rdi
@@ -455,23 +389,12 @@ sym(vp9_highbd_filter_block1d8_v8_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
SAVE_XMM 15
push rsi
push rdi
push rbx
; end prolog

ALIGN_STACK 16, rax
sub rsp, 16 * 8
%define k0k1 [rsp + 16 * 0]
%define k6k7 [rsp + 16 * 1]
%define k2k5 [rsp + 16 * 2]
%define k3k4 [rsp + 16 * 3]
%define krd [rsp + 16 * 4]
%define temp [rsp + 16 * 5]
%define max [rsp + 16 * 6]
%define min [rsp + 16 * 7]

HIGH_GET_FILTERS

movsxd rax, DWORD PTR arg(1) ;pixels_per_line
@@ -488,8 +411,6 @@ sym(vp9_highbd_filter_block1d8_v8_avg_sse2):
dec rcx
jnz .loop

add rsp, 16 * 8
pop rsp
pop rbx
; begin epilog
pop rdi
@@ -504,23 +425,12 @@ sym(vp9_highbd_filter_block1d16_v8_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
SAVE_XMM 15
push rsi
push rdi
push rbx
; end prolog

ALIGN_STACK 16, rax
sub rsp, 16 * 8
%define k0k1 [rsp + 16 * 0]
%define k6k7 [rsp + 16 * 1]
%define k2k5 [rsp + 16 * 2]
%define k3k4 [rsp + 16 * 3]
%define krd [rsp + 16 * 4]
%define temp [rsp + 16 * 5]
%define max [rsp + 16 * 6]
%define min [rsp + 16 * 7]

HIGH_GET_FILTERS

movsxd rax, DWORD PTR arg(1) ;pixels_per_line
@@ -541,8 +451,6 @@ sym(vp9_highbd_filter_block1d16_v8_avg_sse2):
dec rcx
jnz .loop

add rsp, 16 * 8
pop rsp
pop rbx
; begin epilog
pop rdi
@@ -566,21 +474,11 @@ sym(vp9_highbd_filter_block1d4_h8_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
SAVE_XMM 14
push rsi
push rdi
; end prolog

ALIGN_STACK 16, rax
sub rsp, 16 * 7
%define k0k6 [rsp + 16 * 0]
%define k2k5 [rsp + 16 * 1]
%define k3k4 [rsp + 16 * 2]
%define k1k7 [rsp + 16 * 3]
%define krd [rsp + 16 * 4]
%define max [rsp + 16 * 5]
%define min [rsp + 16 * 6]

HIGH_GET_FILTERS_4

mov rsi, arg(0) ;src_ptr
@@ -592,6 +490,16 @@ sym(vp9_highbd_filter_block1d4_h8_sse2):
lea rdx, [rdx + rdx]
movsxd rcx, DWORD PTR arg(4) ;output_height

.load
prefetcht0 [rsi - 6]
prefetcht0 [rsi + 17]
lea rsi, [rsi + rax]
dec rcx
jnz .load

mov rsi, arg(0)
movsxd rcx, DWORD PTR arg(4)

.loop:
movdqu xmm0, [rsi - 6] ;load src
movdqu xmm4, [rsi + 2]
@@ -616,9 +524,6 @@ sym(vp9_highbd_filter_block1d4_h8_sse2):
dec rcx
jnz .loop

add rsp, 16 * 7
pop rsp

; begin epilog
pop rdi
pop rsi
@@ -641,22 +546,11 @@ sym(vp9_highbd_filter_block1d8_h8_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
SAVE_XMM 15
push rsi
push rdi
; end prolog

ALIGN_STACK 16, rax
sub rsp, 16 * 8
%define k0k1 [rsp + 16 * 0]
%define k6k7 [rsp + 16 * 1]
%define k2k5 [rsp + 16 * 2]
%define k3k4 [rsp + 16 * 3]
%define krd [rsp + 16 * 4]
%define temp [rsp + 16 * 5]
%define max [rsp + 16 * 6]
%define min [rsp + 16 * 7]

HIGH_GET_FILTERS

movsxd rax, DWORD PTR arg(1) ;pixels_per_line
@@ -665,6 +559,16 @@ sym(vp9_highbd_filter_block1d8_h8_sse2):
lea rdx, [rdx + rdx]
movsxd rcx, DWORD PTR arg(4) ;output_height

.load
prefetcht0 [rsi - 6]
prefetcht0 [rsi + 23]
lea rsi, [rsi + rax]
dec rcx
jnz .load

mov rsi, arg(0)
movsxd rcx, DWORD PTR arg(4)

.loop:
movdqu xmm0, [rsi - 6] ;load src
movdqu xmm1, [rsi - 4]
@@ -682,9 +586,6 @@ sym(vp9_highbd_filter_block1d8_h8_sse2):
dec rcx
jnz .loop

add rsp, 16 * 8
pop rsp

; begin epilog
pop rdi
pop rsi
@@ -707,22 +608,11 @@ sym(vp9_highbd_filter_block1d16_h8_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
SAVE_XMM 15
push rsi
push rdi
; end prolog

ALIGN_STACK 16, rax
sub rsp, 16 * 8
%define k0k1 [rsp + 16 * 0]
%define k6k7 [rsp + 16 * 1]
%define k2k5 [rsp + 16 * 2]
%define k3k4 [rsp + 16 * 3]
%define krd [rsp + 16 * 4]
%define temp [rsp + 16 * 5]
%define max [rsp + 16 * 6]
%define min [rsp + 16 * 7]

HIGH_GET_FILTERS

movsxd rax, DWORD PTR arg(1) ;pixels_per_line
@@ -731,6 +621,16 @@ sym(vp9_highbd_filter_block1d16_h8_sse2):
lea rdx, [rdx + rdx]
movsxd rcx, DWORD PTR arg(4) ;output_height

.load
prefetcht0 [rsi - 6]
prefetcht0 [rsi + 31]
lea rsi, [rsi + rax]
dec rcx
jnz .load

mov rsi, arg(0)
movsxd rcx, DWORD PTR arg(4)

.loop:
movdqu xmm0, [rsi - 6] ;load src
movdqu xmm1, [rsi - 4]
@@ -759,9 +659,6 @@ sym(vp9_highbd_filter_block1d16_h8_sse2):
dec rcx
jnz .loop

add rsp, 16 * 8
pop rsp

; begin epilog
pop rdi
pop rsi
@@ -775,21 +672,11 @@ sym(vp9_highbd_filter_block1d4_h8_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
SAVE_XMM 14
push rsi
push rdi
; end prolog

ALIGN_STACK 16, rax
sub rsp, 16 * 7
%define k0k6 [rsp + 16 * 0]
%define k2k5 [rsp + 16 * 1]
%define k3k4 [rsp + 16 * 2]
%define k1k7 [rsp + 16 * 3]
%define krd [rsp + 16 * 4]
%define max [rsp + 16 * 5]
%define min [rsp + 16 * 6]

HIGH_GET_FILTERS_4

mov rsi, arg(0) ;src_ptr
@@ -801,6 +688,16 @@ sym(vp9_highbd_filter_block1d4_h8_avg_sse2):
lea rdx, [rdx + rdx]
movsxd rcx, DWORD PTR arg(4) ;output_height

.load
prefetcht0 [rsi - 6]
prefetcht0 [rsi + 17]
lea rsi, [rsi + rax]
dec rcx
jnz .load

mov rsi, arg(0)
movsxd rcx, DWORD PTR arg(4)

.loop:
movdqu xmm0, [rsi - 6] ;load src
movdqu xmm4, [rsi + 2]
@@ -825,9 +722,6 @@ sym(vp9_highbd_filter_block1d4_h8_avg_sse2):
dec rcx
jnz .loop

add rsp, 16 * 7
pop rsp

; begin epilog
pop rdi
pop rsi
@@ -841,22 +735,11 @@ sym(vp9_highbd_filter_block1d8_h8_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
SAVE_XMM 15
push rsi
push rdi
; end prolog

ALIGN_STACK 16, rax
sub rsp, 16 * 8
%define k0k1 [rsp + 16 * 0]
%define k6k7 [rsp + 16 * 1]
%define k2k5 [rsp + 16 * 2]
%define k3k4 [rsp + 16 * 3]
%define krd [rsp + 16 * 4]
%define temp [rsp + 16 * 5]
%define max [rsp + 16 * 6]
%define min [rsp + 16 * 7]

HIGH_GET_FILTERS

movsxd rax, DWORD PTR arg(1) ;pixels_per_line
@@ -865,6 +748,16 @@ sym(vp9_highbd_filter_block1d8_h8_avg_sse2):
lea rdx, [rdx + rdx]
movsxd rcx, DWORD PTR arg(4) ;output_height

.load
prefetcht0 [rsi - 6]
prefetcht0 [rsi + 23]
lea rsi, [rsi + rax]
dec rcx
jnz .load

mov rsi, arg(0)
movsxd rcx, DWORD PTR arg(4)

.loop:
movdqu xmm0, [rsi - 6] ;load src
movdqu xmm1, [rsi - 4]
@@ -882,8 +775,6 @@ sym(vp9_highbd_filter_block1d8_h8_avg_sse2):
dec rcx
jnz .loop

add rsp, 16 * 8
pop rsp

; begin epilog
pop rdi
@@ -898,22 +789,11 @@ sym(vp9_highbd_filter_block1d16_h8_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
SAVE_XMM 15
push rsi
push rdi
; end prolog

ALIGN_STACK 16, rax
sub rsp, 16 * 8
%define k0k1 [rsp + 16 * 0]
%define k6k7 [rsp + 16 * 1]
%define k2k5 [rsp + 16 * 2]
%define k3k4 [rsp + 16 * 3]
%define krd [rsp + 16 * 4]
%define temp [rsp + 16 * 5]
%define max [rsp + 16 * 6]
%define min [rsp + 16 * 7]

HIGH_GET_FILTERS

movsxd rax, DWORD PTR arg(1) ;pixels_per_line
@@ -922,6 +802,16 @@ sym(vp9_highbd_filter_block1d16_h8_avg_sse2):
lea rdx, [rdx + rdx]
movsxd rcx, DWORD PTR arg(4) ;output_height

.load
prefetcht0 [rsi - 6]
prefetcht0 [rsi + 31]
lea rsi, [rsi + rax]
dec rcx
jnz .load

mov rsi, arg(0)
movsxd rcx, DWORD PTR arg(4)

.loop:
movdqu xmm0, [rsi - 6] ;load src
movdqu xmm1, [rsi - 4]
@@ -950,9 +840,6 @@ sym(vp9_highbd_filter_block1d16_h8_avg_sse2):
dec rcx
jnz .loop

add rsp, 16 * 8
pop rsp

; begin epilog
pop rdi
pop rsi

@@ -9,6 +9,7 @@
*/

#include "vp9/common/x86/vp9_idct_intrin_sse2.h"
#include "vp9/common/vp9_idct.h"

#define RECON_AND_STORE4X4(dest, in_x) \
{ \
@@ -3985,3 +3986,573 @@ void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
dest += 8 - (stride * 32);
}
}

#if CONFIG_VP9_HIGHBITDEPTH
static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {
__m128i ubounded, retval;
const __m128i zero = _mm_set1_epi16(0);
const __m128i one = _mm_set1_epi16(1);
const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one);
ubounded = _mm_cmpgt_epi16(value, max);
retval = _mm_andnot_si128(ubounded, value);
ubounded = _mm_and_si128(ubounded, max);
retval = _mm_or_si128(retval, ubounded);
retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero));
return retval;
}
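// clamp_high_sse2() is the vector form of a high-bitdepth pixel clip:
// lanes above (1 << bd) - 1 are replaced by that maximum, then the final
// cmpgt/and pair zeroes any negative lanes.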

void vp9_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) {
tran_low_t out[4 * 4];
tran_low_t *outptr = out;
int i, j;
__m128i inptr[4];
__m128i sign_bits[2];
__m128i temp_mm, min_input, max_input;
int test;
uint16_t * dest = CONVERT_TO_SHORTPTR(dest8);
int optimised_cols = 0;
const __m128i zero = _mm_set1_epi16(0);
const __m128i eight = _mm_set1_epi16(8);
const __m128i max = _mm_set1_epi16(12043);
const __m128i min = _mm_set1_epi16(-12043);
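// Coefficients outside [-12043, 12043] could overflow the 16-bit row
// transform below, so blocks containing them take the C fallback path.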
// Load input into __m128i
inptr[0] = _mm_loadu_si128((const __m128i *)input);
inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4));
inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8));
inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12));

// Pack to 16 bits
inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]);
inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]);

max_input = _mm_max_epi16(inptr[0], inptr[1]);
min_input = _mm_min_epi16(inptr[0], inptr[1]);
max_input = _mm_cmpgt_epi16(max_input, max);
min_input = _mm_cmplt_epi16(min_input, min);
temp_mm = _mm_or_si128(max_input, min_input);
test = _mm_movemask_epi8(temp_mm);
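// A non-zero movemask means some lane exceeded the safe range; zero means
// the whole block can stay on the fast 16-bit path.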

if (!test) {
// Do the row transform
idct4_sse2(inptr);

// Check the min & max values
max_input = _mm_max_epi16(inptr[0], inptr[1]);
min_input = _mm_min_epi16(inptr[0], inptr[1]);
max_input = _mm_cmpgt_epi16(max_input, max);
min_input = _mm_cmplt_epi16(min_input, min);
temp_mm = _mm_or_si128(max_input, min_input);
test = _mm_movemask_epi8(temp_mm);

if (test) {
transpose_4x4(inptr);
sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero);
sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero);
inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]);
inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]);
inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]);
inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]);
_mm_storeu_si128((__m128i*)outptr, inptr[0]);
_mm_storeu_si128((__m128i*)(outptr + 4), inptr[1]);
_mm_storeu_si128((__m128i*)(outptr + 8), inptr[2]);
_mm_storeu_si128((__m128i*)(outptr + 12), inptr[3]);
} else {
// Set to use the optimised transform for the column
optimised_cols = 1;
}
} else {
// Run the un-optimised row transform
for (i = 0; i < 4; ++i) {
vp9_highbd_idct4(input, outptr, bd);
input += 4;
outptr += 4;
}
}

if (optimised_cols) {
idct4_sse2(inptr);

// Final round and shift
inptr[0] = _mm_add_epi16(inptr[0], eight);
inptr[1] = _mm_add_epi16(inptr[1], eight);

inptr[0] = _mm_srai_epi16(inptr[0], 4);
inptr[1] = _mm_srai_epi16(inptr[1], 4);

// Reconstruction and Store
{
__m128i d0 = _mm_loadl_epi64((const __m128i *)dest);
__m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));
d0 = _mm_unpacklo_epi64(d0,
_mm_loadl_epi64((const __m128i *)(dest + stride)));
d2 = _mm_unpacklo_epi64(d2,
_mm_loadl_epi64((const __m128i *)(dest + stride * 3)));
d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd);
d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd);
// store input0
_mm_storel_epi64((__m128i *)dest, d0);
// store input1
d0 = _mm_srli_si128(d0, 8);
_mm_storel_epi64((__m128i *)(dest + stride), d0);
// store input2
_mm_storel_epi64((__m128i *)(dest + stride * 2), d2);
// store input3
d2 = _mm_srli_si128(d2, 8);
_mm_storel_epi64((__m128i *)(dest + stride * 3), d2);
}
} else {
// Run the un-optimised column transform
tran_low_t temp_in[4], temp_out[4];
// Columns
for (i = 0; i < 4; ++i) {
for (j = 0; j < 4; ++j)
temp_in[j] = out[j * 4 + i];
vp9_highbd_idct4(temp_in, temp_out, bd);
for (j = 0; j < 4; ++j)
dest[j * stride + i] = highbd_clip_pixel_add(dest[j * stride + i],
ROUND_POWER_OF_TWO(temp_out[j], 4),
bd);
}
}
}

void vp9_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) {
tran_low_t out[8 * 8];
tran_low_t *outptr = out;
int i, j, test;
__m128i inptr[8];
__m128i min_input, max_input, temp1, temp2, sign_bits;
uint16_t * dest = CONVERT_TO_SHORTPTR(dest8);
const __m128i zero = _mm_set1_epi16(0);
const __m128i sixteen = _mm_set1_epi16(16);
const __m128i max = _mm_set1_epi16(6201);
const __m128i min = _mm_set1_epi16(-6201);
int optimised_cols = 0;

// Load input into __m128i & pack to 16 bits
for (i = 0; i < 8; i++) {
temp1 = _mm_loadu_si128((const __m128i *)(input + 8*i));
temp2 = _mm_loadu_si128((const __m128i *)(input + 8*i + 4));
inptr[i] = _mm_packs_epi32(temp1, temp2);
}

// Find the min & max for the row transform
max_input = _mm_max_epi16(inptr[0], inptr[1]);
min_input = _mm_min_epi16(inptr[0], inptr[1]);
for (i = 2; i < 8; i++) {
max_input = _mm_max_epi16(max_input, inptr[i]);
min_input = _mm_min_epi16(min_input, inptr[i]);
}
max_input = _mm_cmpgt_epi16(max_input, max);
min_input = _mm_cmplt_epi16(min_input, min);
temp1 = _mm_or_si128(max_input, min_input);
test = _mm_movemask_epi8(temp1);

if (!test) {
// Do the row transform
idct8_sse2(inptr);

// Find the min & max for the column transform
max_input = _mm_max_epi16(inptr[0], inptr[1]);
min_input = _mm_min_epi16(inptr[0], inptr[1]);
for (i = 2; i < 8; i++) {
max_input = _mm_max_epi16(max_input, inptr[i]);
min_input = _mm_min_epi16(min_input, inptr[i]);
}
max_input = _mm_cmpgt_epi16(max_input, max);
min_input = _mm_cmplt_epi16(min_input, min);
temp1 = _mm_or_si128(max_input, min_input);
test = _mm_movemask_epi8(temp1);

if (test) {
array_transpose_8x8(inptr, inptr);
for (i = 0; i < 8; i++) {
sign_bits = _mm_cmplt_epi16(inptr[i], zero);
temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
_mm_storeu_si128((__m128i*)(outptr + 4*(2*i+1)), temp1);
_mm_storeu_si128((__m128i*)(outptr + 4*(2*i)), temp2);
}
} else {
// Set to use the optimised transform for the column
optimised_cols = 1;
}
} else {
// Run the un-optimised row transform
for (i = 0; i < 8; ++i) {
vp9_highbd_idct8(input, outptr, bd);
input += 8;
outptr += 8;
}
}

if (optimised_cols) {
idct8_sse2(inptr);

// Final round & shift and Reconstruction and Store
{
__m128i d[8];
for (i = 0; i < 8; i++) {
inptr[i] = _mm_add_epi16(inptr[i], sixteen);
d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
inptr[i] = _mm_srai_epi16(inptr[i], 5);
d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
// Store
_mm_storeu_si128((__m128i *)(dest + stride*i), d[i]);
}
}
} else {
// Run the un-optimised column transform
tran_low_t temp_in[8], temp_out[8];
for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j)
temp_in[j] = out[j * 8 + i];
vp9_highbd_idct8(temp_in, temp_out, bd);
for (j = 0; j < 8; ++j)
dest[j * stride + i] = highbd_clip_pixel_add(dest[j * stride + i],
ROUND_POWER_OF_TWO(temp_out[j], 5),
bd);
}
}
}

void vp9_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) {
tran_low_t out[8 * 8] = { 0 };
tran_low_t *outptr = out;
int i, j, test;
__m128i inptr[8];
__m128i min_input, max_input, temp1, temp2, sign_bits;
uint16_t * dest = CONVERT_TO_SHORTPTR(dest8);
const __m128i zero = _mm_set1_epi16(0);
const __m128i sixteen = _mm_set1_epi16(16);
const __m128i max = _mm_set1_epi16(6201);
const __m128i min = _mm_set1_epi16(-6201);
int optimised_cols = 0;

// Load input into __m128i & pack to 16 bits
for (i = 0; i < 8; i++) {
temp1 = _mm_loadu_si128((const __m128i *)(input + 8*i));
temp2 = _mm_loadu_si128((const __m128i *)(input + 8*i + 4));
inptr[i] = _mm_packs_epi32(temp1, temp2);
}

// Find the min & max for the row transform
// Only the first 4 rows have non-zero coefficients.
max_input = _mm_max_epi16(inptr[0], inptr[1]);
min_input = _mm_min_epi16(inptr[0], inptr[1]);
for (i = 2; i < 4; i++) {
max_input = _mm_max_epi16(max_input, inptr[i]);
min_input = _mm_min_epi16(min_input, inptr[i]);
}
max_input = _mm_cmpgt_epi16(max_input, max);
min_input = _mm_cmplt_epi16(min_input, min);
temp1 = _mm_or_si128(max_input, min_input);
test = _mm_movemask_epi8(temp1);

if (!test) {
// Do the row transform
idct8_sse2(inptr);

// Find the min & max for the column transform
// N.B. Only first 4 cols contain non-zero coeffs
max_input = _mm_max_epi16(inptr[0], inptr[1]);
min_input = _mm_min_epi16(inptr[0], inptr[1]);
for (i = 2; i < 8; i++) {
max_input = _mm_max_epi16(max_input, inptr[i]);
min_input = _mm_min_epi16(min_input, inptr[i]);
}
max_input = _mm_cmpgt_epi16(max_input, max);
min_input = _mm_cmplt_epi16(min_input, min);
temp1 = _mm_or_si128(max_input, min_input);
test = _mm_movemask_epi8(temp1);

if (test) {
// Use the fact that only the first 4 rows contain non-zero coeffs.
array_transpose_4X8(inptr, inptr);
for (i = 0; i < 4; i++) {
sign_bits = _mm_cmplt_epi16(inptr[i], zero);
temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
_mm_storeu_si128((__m128i*)(outptr + 4*(2*i+1)), temp1);
_mm_storeu_si128((__m128i*)(outptr + 4*(2*i)), temp2);
}
} else {
// Set to use the optimised transform for the column
optimised_cols = 1;
}
} else {
// Run the un-optimised row transform
for (i = 0; i < 4; ++i) {
vp9_highbd_idct8(input, outptr, bd);
input += 8;
outptr += 8;
}
}

if (optimised_cols) {
idct8_sse2(inptr);

// Final round & shift and Reconstruction and Store
{
__m128i d[8];
for (i = 0; i < 8; i++) {
inptr[i] = _mm_add_epi16(inptr[i], sixteen);
d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
inptr[i] = _mm_srai_epi16(inptr[i], 5);
d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
// Store
_mm_storeu_si128((__m128i *)(dest + stride*i), d[i]);
}
}
} else {
// Run the un-optimised column transform
tran_low_t temp_in[8], temp_out[8];
for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j)
temp_in[j] = out[j * 8 + i];
vp9_highbd_idct8(temp_in, temp_out, bd);
for (j = 0; j < 8; ++j)
dest[j * stride + i] = highbd_clip_pixel_add(dest[j * stride + i],
ROUND_POWER_OF_TWO(temp_out[j], 5),
bd);
}
}
}

void vp9_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) {
tran_low_t out[16 * 16];
tran_low_t *outptr = out;
int i, j, test;
__m128i inptr[32];
__m128i min_input, max_input, temp1, temp2, sign_bits;
uint16_t * dest = CONVERT_TO_SHORTPTR(dest8);
const __m128i zero = _mm_set1_epi16(0);
const __m128i rounding = _mm_set1_epi16(32);
const __m128i max = _mm_set1_epi16(3155);
const __m128i min = _mm_set1_epi16(-3155);
int optimised_cols = 0;

// Load input into __m128i & pack to 16 bits
for (i = 0; i < 16; i++) {
temp1 = _mm_loadu_si128((const __m128i *)(input + 16*i));
temp2 = _mm_loadu_si128((const __m128i *)(input + 16*i + 4));
inptr[i] = _mm_packs_epi32(temp1, temp2);
temp1 = _mm_loadu_si128((const __m128i *)(input + 16*i + 8));
temp2 = _mm_loadu_si128((const __m128i *)(input + 16*i + 12));
inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
}

// Find the min & max for the row transform
max_input = _mm_max_epi16(inptr[0], inptr[1]);
min_input = _mm_min_epi16(inptr[0], inptr[1]);
for (i = 2; i < 32; i++) {
max_input = _mm_max_epi16(max_input, inptr[i]);
min_input = _mm_min_epi16(min_input, inptr[i]);
}
max_input = _mm_cmpgt_epi16(max_input, max);
min_input = _mm_cmplt_epi16(min_input, min);
temp1 = _mm_or_si128(max_input, min_input);
test = _mm_movemask_epi8(temp1);

if (!test) {
// Do the row transform
idct16_sse2(inptr, inptr + 16);

// Find the min & max for the column transform
max_input = _mm_max_epi16(inptr[0], inptr[1]);
min_input = _mm_min_epi16(inptr[0], inptr[1]);
for (i = 2; i < 32; i++) {
max_input = _mm_max_epi16(max_input, inptr[i]);
min_input = _mm_min_epi16(min_input, inptr[i]);
}
max_input = _mm_cmpgt_epi16(max_input, max);
min_input = _mm_cmplt_epi16(min_input, min);
temp1 = _mm_or_si128(max_input, min_input);
test = _mm_movemask_epi8(temp1);

if (test) {
array_transpose_16x16(inptr, inptr + 16);
for (i = 0; i < 16; i++) {
sign_bits = _mm_cmplt_epi16(inptr[i], zero);
temp1 = _mm_unpacklo_epi16(inptr[i ], sign_bits);
temp2 = _mm_unpackhi_epi16(inptr[i ], sign_bits);
_mm_storeu_si128((__m128i*)(outptr + 4*(i*4)), temp1);
_mm_storeu_si128((__m128i*)(outptr + 4*(i*4+1)), temp2);
sign_bits = _mm_cmplt_epi16(inptr[i+16], zero);
temp1 = _mm_unpacklo_epi16(inptr[i+16], sign_bits);
temp2 = _mm_unpackhi_epi16(inptr[i+16], sign_bits);
_mm_storeu_si128((__m128i*)(outptr + 4*(i*4+2)), temp1);
_mm_storeu_si128((__m128i*)(outptr + 4*(i*4+3)), temp2);
}
} else {
// Set to use the optimised transform for the column
optimised_cols = 1;
}
} else {
// Run the un-optimised row transform
for (i = 0; i < 16; ++i) {
vp9_highbd_idct16(input, outptr, bd);
input += 16;
outptr += 16;
}
}

if (optimised_cols) {
idct16_sse2(inptr, inptr + 16);

// Final round & shift and Reconstruction and Store
{
__m128i d[2];
for (i = 0; i < 16; i++) {
inptr[i ] = _mm_add_epi16(inptr[i ], rounding);
inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding);
d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8));
inptr[i ] = _mm_srai_epi16(inptr[i ], 6);
inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6);
d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i ]), bd);
d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd);
// Store
_mm_storeu_si128((__m128i *)(dest + stride*i), d[0]);
_mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]);
}
}
} else {
// Run the un-optimised column transform
tran_low_t temp_in[16], temp_out[16];
for (i = 0; i < 16; ++i) {
for (j = 0; j < 16; ++j)
temp_in[j] = out[j * 16 + i];
vp9_highbd_idct16(temp_in, temp_out, bd);
for (j = 0; j < 16; ++j)
dest[j * stride + i] = highbd_clip_pixel_add(dest[j * stride + i],
ROUND_POWER_OF_TWO(temp_out[j], 6),
bd);
}
}
}

void vp9_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) {
tran_low_t out[16 * 16] = { 0 };
tran_low_t *outptr = out;
int i, j, test;
__m128i inptr[32];
__m128i min_input, max_input, temp1, temp2, sign_bits;
uint16_t * dest = CONVERT_TO_SHORTPTR(dest8);
const __m128i zero = _mm_set1_epi16(0);
const __m128i rounding = _mm_set1_epi16(32);
const __m128i max = _mm_set1_epi16(3155);
const __m128i min = _mm_set1_epi16(-3155);
int optimised_cols = 0;

// Load input into __m128i & pack to 16 bits
for (i = 0; i < 16; i++) {
temp1 = _mm_loadu_si128((const __m128i *)(input + 16*i));
temp2 = _mm_loadu_si128((const __m128i *)(input + 16*i + 4));
inptr[i] = _mm_packs_epi32(temp1, temp2);
temp1 = _mm_loadu_si128((const __m128i *)(input + 16*i + 8));
temp2 = _mm_loadu_si128((const __m128i *)(input + 16*i + 12));
inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
}

// Find the min & max for the row transform
// Since all non-zero dct coefficients are in upper-left 4x4 area,
// we only need to consider the first 4 rows here.
max_input = _mm_max_epi16(inptr[0], inptr[1]);
min_input = _mm_min_epi16(inptr[0], inptr[1]);
for (i = 2; i < 4; i++) {
max_input = _mm_max_epi16(max_input, inptr[i]);
min_input = _mm_min_epi16(min_input, inptr[i]);
}
max_input = _mm_cmpgt_epi16(max_input, max);
min_input = _mm_cmplt_epi16(min_input, min);
temp1 = _mm_or_si128(max_input, min_input);
test = _mm_movemask_epi8(temp1);

if (!test) {
// Do the row transform (N.B. This transposes inptr)
idct16_sse2(inptr, inptr + 16);

// Find the min & max for the column transform
// N.B. Only first 4 cols contain non-zero coeffs
max_input = _mm_max_epi16(inptr[0], inptr[1]);
min_input = _mm_min_epi16(inptr[0], inptr[1]);
for (i = 2; i < 16; i++) {
max_input = _mm_max_epi16(max_input, inptr[i]);
min_input = _mm_min_epi16(min_input, inptr[i]);
}
max_input = _mm_cmpgt_epi16(max_input, max);
min_input = _mm_cmplt_epi16(min_input, min);
temp1 = _mm_or_si128(max_input, min_input);
test = _mm_movemask_epi8(temp1);

if (test) {
// Use the fact that only the first 4 rows contain non-zero coeffs.
array_transpose_8x8(inptr, inptr);
array_transpose_8x8(inptr + 8, inptr + 16);
for (i = 0; i < 4; i++) {
sign_bits = _mm_cmplt_epi16(inptr[i], zero);
temp1 = _mm_unpacklo_epi16(inptr[i ], sign_bits);
temp2 = _mm_unpackhi_epi16(inptr[i ], sign_bits);
_mm_storeu_si128((__m128i*)(outptr + 4*(i*4)), temp1);
_mm_storeu_si128((__m128i*)(outptr + 4*(i*4+1)), temp2);
sign_bits = _mm_cmplt_epi16(inptr[i+16], zero);
temp1 = _mm_unpacklo_epi16(inptr[i+16], sign_bits);
temp2 = _mm_unpackhi_epi16(inptr[i+16], sign_bits);
_mm_storeu_si128((__m128i*)(outptr + 4*(i*4+2)), temp1);
_mm_storeu_si128((__m128i*)(outptr + 4*(i*4+3)), temp2);
}
} else {
// Set to use the optimised transform for the column
optimised_cols = 1;
}
} else {
// Run the un-optimised row transform
for (i = 0; i < 4; ++i) {
vp9_highbd_idct16(input, outptr, bd);
input += 16;
outptr += 16;
}
}

if (optimised_cols) {
idct16_sse2(inptr, inptr + 16);

// Final round & shift and Reconstruction and Store
{
__m128i d[2];
for (i = 0; i < 16; i++) {
inptr[i ] = _mm_add_epi16(inptr[i ], rounding);
inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding);
d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8));
inptr[i ] = _mm_srai_epi16(inptr[i ], 6);
inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6);
d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i ]), bd);
d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd);
// Store
_mm_storeu_si128((__m128i *)(dest + stride*i), d[0]);
_mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]);
}
}
} else {
// Run the un-optimised column transform
tran_low_t temp_in[16], temp_out[16];
for (i = 0; i < 16; ++i) {
for (j = 0; j < 16; ++j)
temp_in[j] = out[j * 16 + i];
vp9_highbd_idct16(temp_in, temp_out, bd);
for (j = 0; j < 16; ++j)
dest[j * stride + i] = highbd_clip_pixel_add(dest[j * stride + i],
ROUND_POWER_OF_TWO(temp_out[j], 6),
bd);
}
}
}

#endif // CONFIG_VP9_HIGHBITDEPTH

@@ -7,6 +7,7 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vp9/common/vp9_common.h"
#include "vpx_ports/mem.h"

unsigned int vp9_avg_8x8_c(const uint8_t *s, int p) {
@@ -17,3 +18,16 @@ unsigned int vp9_avg_8x8_c(const uint8_t *s, int p) {

return (sum + 32) >> 6;
}

#if CONFIG_VP9_HIGHBITDEPTH
unsigned int vp9_highbd_avg_8x8_c(const uint8_t *s8, int p) {
int i, j;
int sum = 0;
const uint16_t* s = CONVERT_TO_SHORTPTR(s8);
for (i = 0; i < 8; ++i, s+=p)
for (j = 0; j < 8; sum += s[j], ++j) {}
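// sum now totals all 64 samples; adding 32 rounds the divide-by-64 shift.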

return (sum + 32) >> 6;
}
#endif // CONFIG_VP9_HIGHBITDEPTH

@@ -17,6 +17,7 @@
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_idct.h"
#include "vp9/common/vp9_systemdependent.h"
#include "vp9/encoder/vp9_dct.h"

static INLINE tran_high_t fdct_round_shift(tran_high_t input) {
tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
@@ -26,7 +27,7 @@ static INLINE tran_high_t fdct_round_shift(tran_high_t input) {
return rv;
}

static void fdct4(const tran_low_t *input, tran_low_t *output) {
void vp9_fdct4(const tran_low_t *input, tran_low_t *output) {
tran_high_t step[4];
tran_high_t temp1, temp2;

@@ -123,7 +124,7 @@ void vp9_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
}
}

static void fadst4(const tran_low_t *input, tran_low_t *output) {
void vp9_fadst4(const tran_low_t *input, tran_low_t *output) {
tran_high_t x0, x1, x2, x3;
tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;

@@ -163,13 +164,6 @@ static void fadst4(const tran_low_t *input, tran_low_t *output) {
output[3] = fdct_round_shift(s3);
}

static const transform_2d FHT_4[] = {
{ fdct4, fdct4 }, // DCT_DCT = 0
{ fadst4, fdct4 }, // ADST_DCT = 1
{ fdct4, fadst4 }, // DCT_ADST = 2
{ fadst4, fadst4 } // ADST_ADST = 3
};

void vp9_fht4x4_c(const int16_t *input, tran_low_t *output,
int stride, int tx_type) {
if (tx_type == DCT_DCT) {
@@ -203,7 +197,7 @@ void vp9_fht4x4_c(const int16_t *input, tran_low_t *output,
}
}

static void fdct8(const tran_low_t *input, tran_low_t *output) {
void vp9_fdct8(const tran_low_t *input, tran_low_t *output) {
tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
tran_high_t t0, t1, t2, t3; // needs32
tran_high_t x0, x1, x2, x3; // canbe16
@@ -331,7 +325,7 @@ void vp9_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {

// Rows
for (i = 0; i < 8; ++i) {
fdct8(&intermediate[i * 8], &final_output[i * 8]);
vp9_fdct8(&intermediate[i * 8], &final_output[i * 8]);
for (j = 0; j < 8; ++j)
final_output[j + i * 8] /= 2;
}
@@ -528,7 +522,7 @@ void vp9_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
}
}

static void fadst8(const tran_low_t *input, tran_low_t *output) {
void vp9_fadst8(const tran_low_t *input, tran_low_t *output) {
tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;

tran_high_t x0 = input[7];
@@ -599,13 +593,6 @@ static void fadst8(const tran_low_t *input, tran_low_t *output) {
output[7] = - x1;
}

static const transform_2d FHT_8[] = {
{ fdct8, fdct8 }, // DCT_DCT = 0
{ fadst8, fdct8 }, // ADST_DCT = 1
{ fdct8, fadst8 }, // DCT_ADST = 2
{ fadst8, fadst8 } // ADST_ADST = 3
};

void vp9_fht8x8_c(const int16_t *input, tran_low_t *output,
int stride, int tx_type) {
if (tx_type == DCT_DCT) {
@@ -694,7 +681,7 @@ void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
}

// Rewritten to use the same algorithm as the others.
static void fdct16(const tran_low_t in[16], tran_low_t out[16]) {
void vp9_fdct16(const tran_low_t in[16], tran_low_t out[16]) {
tran_high_t step1[8]; // canbe16
tran_high_t step2[8]; // canbe16
tran_high_t step3[8]; // canbe16
@@ -835,7 +822,7 @@ static void fdct16(const tran_low_t in[16], tran_low_t out[16]) {
out[15] = fdct_round_shift(temp2);
}

static void fadst16(const tran_low_t *input, tran_low_t *output) {
void vp9_fadst16(const tran_low_t *input, tran_low_t *output) {
tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
tran_high_t s9, s10, s11, s12, s13, s14, s15;

@@ -998,13 +985,6 @@ static void fadst16(const tran_low_t *input, tran_low_t *output) {
output[15] = - x1;
}

static const transform_2d FHT_16[] = {
{ fdct16, fdct16 }, // DCT_DCT = 0
{ fadst16, fdct16 }, // ADST_DCT = 1
{ fdct16, fadst16 }, // DCT_ADST = 2
{ fadst16, fadst16 } // ADST_ADST = 3
};

void vp9_fht16x16_c(const int16_t *input, tran_low_t *output,
int stride, int tx_type) {
if (tx_type == DCT_DCT) {
@@ -1049,7 +1029,7 @@ static INLINE tran_high_t half_round_shift(tran_high_t input) {
return rv;
}

static void fdct32(const tran_high_t *input, tran_high_t *output, int round) {
void vp9_fdct32(const tran_high_t *input, tran_high_t *output, int round) {
tran_high_t step[32];
// Stage 1
step[0] = input[0] + input[(32 - 1)];
@@ -1392,7 +1372,7 @@ void vp9_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
tran_high_t temp_in[32], temp_out[32];
for (j = 0; j < 32; ++j)
temp_in[j] = input[j * stride + i] * 4;
fdct32(temp_in, temp_out, 0);
vp9_fdct32(temp_in, temp_out, 0);
for (j = 0; j < 32; ++j)
output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
}
@@ -1402,7 +1382,7 @@ void vp9_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
tran_high_t temp_in[32], temp_out[32];
for (j = 0; j < 32; ++j)
temp_in[j] = output[j + i * 32];
fdct32(temp_in, temp_out, 0);
vp9_fdct32(temp_in, temp_out, 0);
for (j = 0; j < 32; ++j)
out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
}
@@ -1420,7 +1400,7 @@ void vp9_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {
tran_high_t temp_in[32], temp_out[32];
for (j = 0; j < 32; ++j)
temp_in[j] = input[j * stride + i] * 4;
fdct32(temp_in, temp_out, 0);
vp9_fdct32(temp_in, temp_out, 0);
for (j = 0; j < 32; ++j)
// TODO(cd): see quality impact of only doing
// output[j * 32 + i] = (temp_out[j] + 1) >> 2;
@@ -1433,7 +1413,7 @@ void vp9_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {
tran_high_t temp_in[32], temp_out[32];
for (j = 0; j < 32; ++j)
temp_in[j] = output[j + i * 32];
fdct32(temp_in, temp_out, 1);
vp9_fdct32(temp_in, temp_out, 1);
for (j = 0; j < 32; ++j)
out[j + i * 32] = temp_out[j];
}
vp9/encoder/vp9_dct.h (new file, 61 lines)
@@ -0,0 +1,61 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/

#ifndef VP9_ENCODER_VP9_DCT_H_
#define VP9_ENCODER_VP9_DCT_H_

#include "vp9/common/vp9_idct.h"

#ifdef __cplusplus
extern "C" {
#endif

void vp9_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
void vp9_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
void vp9_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output,
int stride);
void vp9_highbd_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride);
void vp9_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *out,
int stride);

void vp9_fdct4(const tran_low_t *input, tran_low_t *output);
void vp9_fadst4(const tran_low_t *input, tran_low_t *output);
void vp9_fdct8(const tran_low_t *input, tran_low_t *output);
void vp9_fadst8(const tran_low_t *input, tran_low_t *output);
void vp9_fdct16(const tran_low_t in[16], tran_low_t out[16]);
void vp9_fadst16(const tran_low_t *input, tran_low_t *output);
void vp9_fdct32(const tran_high_t *input, tran_high_t *output, int round);

static const transform_2d FHT_4[] = {
{ vp9_fdct4, vp9_fdct4 }, // DCT_DCT = 0
{ vp9_fadst4, vp9_fdct4 }, // ADST_DCT = 1
{ vp9_fdct4, vp9_fadst4 }, // DCT_ADST = 2
{ vp9_fadst4, vp9_fadst4 } // ADST_ADST = 3
};

static const transform_2d FHT_8[] = {
{ vp9_fdct8, vp9_fdct8 }, // DCT_DCT = 0
{ vp9_fadst8, vp9_fdct8 }, // ADST_DCT = 1
{ vp9_fdct8, vp9_fadst8 }, // DCT_ADST = 2
{ vp9_fadst8, vp9_fadst8 } // ADST_ADST = 3
};

static const transform_2d FHT_16[] = {
{ vp9_fdct16, vp9_fdct16 }, // DCT_DCT = 0
{ vp9_fadst16, vp9_fdct16 }, // ADST_DCT = 1
{ vp9_fdct16, vp9_fadst16 }, // DCT_ADST = 2
{ vp9_fadst16, vp9_fadst16 } // ADST_ADST = 3
};
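/* These 1-D transforms were previously static to vp9_dct.c; the vp9_-prefixed
 * versions and dispatch tables live here so other implementations can share
 * them per tx_type. */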

#ifdef __cplusplus
} // extern "C"
#endif

#endif // VP9_ENCODER_VP9_DCT_H_
@@ -515,8 +515,19 @@ static void choose_partitioning(VP9_COMP *cpi,
int sum = 0;

if (x_idx < pixels_wide && y_idx < pixels_high) {
int s_avg = vp9_avg_8x8(s + y_idx * sp + x_idx, sp);
int d_avg = vp9_avg_8x8(d + y_idx * dp + x_idx, dp);
int s_avg, d_avg;
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
s_avg = vp9_highbd_avg_8x8(s + y_idx * sp + x_idx, sp);
d_avg = vp9_highbd_avg_8x8(d + y_idx * dp + x_idx, dp);
} else {
s_avg = vp9_avg_8x8(s + y_idx * sp + x_idx, sp);
d_avg = vp9_avg_8x8(d + y_idx * dp + x_idx, dp);
}
#else
s_avg = vp9_avg_8x8(s + y_idx * sp + x_idx, sp);
d_avg = vp9_avg_8x8(d + y_idx * dp + x_idx, dp);
#endif
sum = s_avg - d_avg;
sse = sum * sum;
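// The squared difference of the two 8x8 block averages stands in for the
// SSE of this sub-block in choose_partitioning()'s variance estimate.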
}
@@ -3414,9 +3425,9 @@ static void encode_frame_internal(VP9_COMP *cpi) {

#if CONFIG_VP9_HIGHBITDEPTH
if (cm->use_highbitdepth)
x->fwd_txm4x4 = xd->lossless ? vp9_fwht4x4 : vp9_fdct4x4;
else
x->fwd_txm4x4 = xd->lossless ? vp9_highbd_fwht4x4 : vp9_highbd_fdct4x4;
else
x->fwd_txm4x4 = xd->lossless ? vp9_fwht4x4 : vp9_fdct4x4;
x->highbd_itxm_add = xd->lossless ? vp9_highbd_iwht4x4_add :
vp9_highbd_idct4x4_add;
#else
(File diff suppressed because it is too large)

vp9/encoder/x86/vp9_dct_impl_sse2.c (new file, 1015 lines; diff suppressed because it is too large)
@@ -62,9 +62,40 @@ cglobal fwht4x4, 3, 4, 8, input, output, stride
psllw m2, 2
psllw m3, 2

%if CONFIG_VP9_HIGHBITDEPTH
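; tran_low_t is 32 bits wide in high-bitdepth builds, so each 16-bit result
; word is sign-extended to a dword (pcmpgtw + punpck) before being stored.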
pxor m4, m4
pxor m5, m5
pcmpgtw m4, m0
pcmpgtw m5, m1
movq m6, m0
movq m7, m1
punpcklwd m0, m4
punpcklwd m1, m5
punpckhwd m6, m4
punpckhwd m7, m5
movq [outputq], m0
movq [outputq + 8], m6
movq [outputq + 16], m1
movq [outputq + 24], m7
pxor m4, m4
pxor m5, m5
pcmpgtw m4, m2
pcmpgtw m5, m3
movq m6, m2
movq m7, m3
punpcklwd m2, m4
punpcklwd m3, m5
punpckhwd m6, m4
punpckhwd m7, m5
movq [outputq + 32], m2
movq [outputq + 40], m6
movq [outputq + 48], m3
movq [outputq + 56], m7
%else
movq [outputq], m0
movq [outputq + 8], m1
movq [outputq + 16], m2
movq [outputq + 24], m3
%endif

RET
(File diff suppressed because it is too large)

vp9/encoder/x86/vp9_dct_sse2.h (new file, 368 lines)
@@ -0,0 +1,368 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/

#ifndef VP9_ENCODER_X86_VP9_DCT_SSE2_H_
#define VP9_ENCODER_X86_VP9_DCT_SSE2_H_

#ifdef __cplusplus
extern "C" {
#endif

#define pair_set_epi32(a, b) \
_mm_set_epi32(b, a, b, a)

void vp9_fdct4x4_sse2(const int16_t *input, tran_low_t *output, int stride);
void vp9_fdct8x8_sse2(const int16_t *input, tran_low_t *output, int stride);
void vp9_fdct16x16_sse2(const int16_t *input, tran_low_t *output, int stride);
void vp9_highbd_fdct4x4_sse2(const int16_t *input, tran_low_t *output,
int stride);
void vp9_highbd_fdct8x8_sse2(const int16_t *input, tran_low_t *output,
int stride);
void vp9_highbd_fdct16x16_sse2(const int16_t *input, tran_low_t *output,
int stride);

static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) {
__m128i buf0, buf1;
buf0 = _mm_mul_epu32(a, b);
a = _mm_srli_epi64(a, 32);
b = _mm_srli_epi64(b, 32);
buf1 = _mm_mul_epu32(a, b);
return _mm_add_epi64(buf0, buf1);
}
|
||||
|
||||
static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) {
|
||||
__m128i buf0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0));
|
||||
__m128i buf1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0));
|
||||
return _mm_unpacklo_epi64(buf0, buf1);
|
||||
}
|
||||
|
||||
static INLINE int check_epi16_overflow_x2(__m128i reg0, __m128i reg1) {
|
||||
const __m128i max_overflow = _mm_set1_epi16(0x7fff);
|
||||
const __m128i min_overflow = _mm_set1_epi16(0x8000);
|
||||
__m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(reg0, max_overflow),
|
||||
_mm_cmpeq_epi16(reg0, min_overflow));
|
||||
__m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(reg1, max_overflow),
|
||||
_mm_cmpeq_epi16(reg1, min_overflow));
|
||||
cmp0 = _mm_or_si128(cmp0, cmp1);
|
||||
return _mm_movemask_epi8(cmp0);
|
||||
}
|
||||
|
||||
static INLINE int check_epi16_overflow_x4(__m128i reg0, __m128i reg1,
|
||||
__m128i reg2, __m128i reg3) {
|
||||
const __m128i max_overflow = _mm_set1_epi16(0x7fff);
|
||||
const __m128i min_overflow = _mm_set1_epi16(0x8000);
|
||||
__m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(reg0, max_overflow),
|
||||
_mm_cmpeq_epi16(reg0, min_overflow));
|
||||
__m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(reg1, max_overflow),
|
||||
_mm_cmpeq_epi16(reg1, min_overflow));
|
||||
__m128i cmp2 = _mm_or_si128(_mm_cmpeq_epi16(reg2, max_overflow),
|
||||
_mm_cmpeq_epi16(reg2, min_overflow));
|
||||
__m128i cmp3 = _mm_or_si128(_mm_cmpeq_epi16(reg3, max_overflow),
|
||||
_mm_cmpeq_epi16(reg3, min_overflow));
|
||||
cmp0 = _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3));
|
||||
return _mm_movemask_epi8(cmp0);
|
||||
}
|
||||
|
||||
static INLINE int check_epi16_overflow_x8(__m128i reg0, __m128i reg1,
|
||||
__m128i reg2, __m128i reg3, __m128i reg4,
|
||||
__m128i reg5, __m128i reg6, __m128i reg7) {
|
||||
int res0, res1;
|
||||
res0 = check_epi16_overflow_x4(reg0, reg1, reg2, reg3);
|
||||
res1 = check_epi16_overflow_x4(reg4, reg5, reg6, reg7);
|
||||
return res0 + res1;
|
||||
}
|
||||
|
||||
static INLINE int check_epi16_overflow_x12(__m128i reg0, __m128i reg1,
|
||||
__m128i reg2, __m128i reg3, __m128i reg4,
|
||||
__m128i reg5, __m128i reg6, __m128i reg7,
|
||||
__m128i reg8, __m128i reg9, __m128i reg10,
|
||||
__m128i reg11) {
|
||||
int res0, res1;
|
||||
res0 = check_epi16_overflow_x4(reg0, reg1, reg2, reg3);
|
||||
res1 = check_epi16_overflow_x4(reg4, reg5, reg6, reg7);
|
||||
if (!res0)
|
||||
res0 = check_epi16_overflow_x4(reg8, reg9, reg10, reg11);
|
||||
return res0 + res1;
|
||||
}
|
||||
|
||||
static INLINE int check_epi16_overflow_x16(__m128i reg0, __m128i reg1,
|
||||
__m128i reg2, __m128i reg3, __m128i reg4,
|
||||
__m128i reg5, __m128i reg6, __m128i reg7,
|
||||
__m128i reg8, __m128i reg9, __m128i reg10,
|
||||
__m128i reg11, __m128i reg12, __m128i reg13,
|
||||
__m128i reg14, __m128i reg15) {
|
||||
int res0, res1;
|
||||
res0 = check_epi16_overflow_x4(reg0, reg1, reg2, reg3);
|
||||
res1 = check_epi16_overflow_x4(reg4, reg5, reg6, reg7);
|
||||
if (!res0) {
|
||||
res0 = check_epi16_overflow_x4(reg8, reg9, reg10, reg11);
|
||||
if (!res1)
|
||||
res1 = check_epi16_overflow_x4(reg12, reg13, reg14, reg15);
|
||||
}
|
||||
return res0 + res1;
|
||||
}
|
||||
|
||||
static INLINE int check_epi16_overflow_x32(__m128i reg0, __m128i reg1,
|
||||
__m128i reg2, __m128i reg3, __m128i reg4,
|
||||
__m128i reg5, __m128i reg6, __m128i reg7,
|
||||
__m128i reg8, __m128i reg9, __m128i reg10,
|
||||
__m128i reg11, __m128i reg12, __m128i reg13,
|
||||
__m128i reg14, __m128i reg15, __m128i reg16,
|
||||
__m128i reg17, __m128i reg18, __m128i reg19,
|
||||
__m128i reg20, __m128i reg21, __m128i reg22,
|
||||
__m128i reg23, __m128i reg24, __m128i reg25,
|
||||
__m128i reg26, __m128i reg27, __m128i reg28,
|
||||
__m128i reg29, __m128i reg30, __m128i reg31) {
|
||||
int res0, res1;
|
||||
res0 = check_epi16_overflow_x4(reg0, reg1, reg2, reg3);
|
||||
res1 = check_epi16_overflow_x4(reg4, reg5, reg6, reg7);
|
||||
if (!res0) {
|
||||
res0 = check_epi16_overflow_x4(reg8, reg9, reg10, reg11);
|
||||
if (!res1) {
|
||||
res1 = check_epi16_overflow_x4(reg12, reg13, reg14, reg15);
|
||||
if (!res0) {
|
||||
res0 = check_epi16_overflow_x4(reg16, reg17, reg18, reg19);
|
||||
if (!res1) {
|
||||
res1 = check_epi16_overflow_x4(reg20, reg21, reg22, reg23);
|
||||
if (!res0) {
|
||||
res0 = check_epi16_overflow_x4(reg24, reg25, reg26, reg27);
|
||||
if (!res1)
|
||||
res1 = check_epi16_overflow_x4(reg28, reg29, reg30, reg31);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return res0 + res1;
|
||||
}
|
||||
|
||||
static INLINE int k_check_epi32_overflow_4(__m128i reg0, __m128i reg1,
    __m128i reg2, __m128i reg3, const __m128i* zero) {
  __m128i minus_one = _mm_set1_epi32(-1);
  // Check for overflows
  __m128i reg0_shifted = _mm_slli_epi64(reg0, 1);
  __m128i reg1_shifted = _mm_slli_epi64(reg1, 1);
  __m128i reg2_shifted = _mm_slli_epi64(reg2, 1);
  __m128i reg3_shifted = _mm_slli_epi64(reg3, 1);
  __m128i reg0_top_dwords = _mm_shuffle_epi32(
      reg0_shifted, _MM_SHUFFLE(0, 0, 3, 1));
  __m128i reg1_top_dwords = _mm_shuffle_epi32(
      reg1_shifted, _MM_SHUFFLE(0, 0, 3, 1));
  __m128i reg2_top_dwords = _mm_shuffle_epi32(
      reg2_shifted, _MM_SHUFFLE(0, 0, 3, 1));
  __m128i reg3_top_dwords = _mm_shuffle_epi32(
      reg3_shifted, _MM_SHUFFLE(0, 0, 3, 1));
  __m128i top_dwords_01 = _mm_unpacklo_epi64(reg0_top_dwords, reg1_top_dwords);
  __m128i top_dwords_23 = _mm_unpacklo_epi64(reg2_top_dwords, reg3_top_dwords);
  __m128i valid_positive_01 = _mm_cmpeq_epi32(top_dwords_01, *zero);
  __m128i valid_positive_23 = _mm_cmpeq_epi32(top_dwords_23, *zero);
  __m128i valid_negative_01 = _mm_cmpeq_epi32(top_dwords_01, minus_one);
  __m128i valid_negative_23 = _mm_cmpeq_epi32(top_dwords_23, minus_one);
  int overflow_01 = _mm_movemask_epi8(
      _mm_cmpeq_epi32(valid_positive_01, valid_negative_01));
  int overflow_23 = _mm_movemask_epi8(
      _mm_cmpeq_epi32(valid_positive_23, valid_negative_23));
  return (overflow_01 + overflow_23);
}

static INLINE int k_check_epi32_overflow_8(__m128i reg0, __m128i reg1,
    __m128i reg2, __m128i reg3, __m128i reg4, __m128i reg5,
    __m128i reg6, __m128i reg7, const __m128i* zero) {
  int overflow = k_check_epi32_overflow_4(reg0, reg1, reg2, reg3, zero);
  if (!overflow) {
    overflow = k_check_epi32_overflow_4(reg4, reg5, reg6, reg7, zero);
  }
  return overflow;
}

static INLINE int k_check_epi32_overflow_16(__m128i reg0, __m128i reg1,
    __m128i reg2, __m128i reg3, __m128i reg4, __m128i reg5,
    __m128i reg6, __m128i reg7, __m128i reg8, __m128i reg9,
    __m128i reg10, __m128i reg11, __m128i reg12, __m128i reg13,
    __m128i reg14, __m128i reg15, const __m128i* zero) {
  int overflow = k_check_epi32_overflow_4(reg0, reg1, reg2, reg3, zero);
  if (!overflow) {
    overflow = k_check_epi32_overflow_4(reg4, reg5, reg6, reg7, zero);
    if (!overflow) {
      overflow = k_check_epi32_overflow_4(reg8, reg9, reg10, reg11, zero);
      if (!overflow) {
        overflow = k_check_epi32_overflow_4(reg12, reg13, reg14, reg15, zero);
      }
    }
  }
  return overflow;
}

static INLINE int k_check_epi32_overflow_32(__m128i reg0, __m128i reg1,
    __m128i reg2, __m128i reg3, __m128i reg4, __m128i reg5,
    __m128i reg6, __m128i reg7, __m128i reg8, __m128i reg9,
    __m128i reg10, __m128i reg11, __m128i reg12, __m128i reg13,
    __m128i reg14, __m128i reg15, __m128i reg16, __m128i reg17,
    __m128i reg18, __m128i reg19, __m128i reg20, __m128i reg21,
    __m128i reg22, __m128i reg23, __m128i reg24, __m128i reg25,
    __m128i reg26, __m128i reg27, __m128i reg28, __m128i reg29,
    __m128i reg30, __m128i reg31, const __m128i* zero) {
  int overflow = k_check_epi32_overflow_4(reg0, reg1, reg2, reg3, zero);
  if (!overflow) {
    overflow = k_check_epi32_overflow_4(reg4, reg5, reg6, reg7, zero);
    if (!overflow) {
      overflow = k_check_epi32_overflow_4(reg8, reg9, reg10, reg11, zero);
      if (!overflow) {
        overflow = k_check_epi32_overflow_4(reg12, reg13, reg14, reg15, zero);
        if (!overflow) {
          overflow = k_check_epi32_overflow_4(reg16, reg17, reg18, reg19, zero);
          if (!overflow) {
            overflow = k_check_epi32_overflow_4(reg20, reg21,
                                                reg22, reg23, zero);
            if (!overflow) {
              overflow = k_check_epi32_overflow_4(reg24, reg25,
                                                  reg26, reg27, zero);
              if (!overflow) {
                overflow = k_check_epi32_overflow_4(reg28, reg29,
                                                    reg30, reg31, zero);
              }
            }
          }
        }
      }
    }
  }
  return overflow;
}

static INLINE void store_output(const __m128i output, tran_low_t* dst_ptr) {
#if CONFIG_VP9_HIGHBITDEPTH
  const __m128i zero = _mm_setzero_si128();
  const __m128i sign_bits = _mm_cmplt_epi16(output, zero);
  __m128i out0 = _mm_unpacklo_epi16(output, sign_bits);
  __m128i out1 = _mm_unpackhi_epi16(output, sign_bits);
  _mm_store_si128((__m128i *)(dst_ptr), out0);
  _mm_store_si128((__m128i *)(dst_ptr + 4), out1);
#else
  _mm_store_si128((__m128i *)(dst_ptr), output);
#endif
}

static INLINE void storeu_output(const __m128i output, tran_low_t* dst_ptr) {
#if CONFIG_VP9_HIGHBITDEPTH
  const __m128i zero = _mm_setzero_si128();
  const __m128i sign_bits = _mm_cmplt_epi16(output, zero);
  __m128i out0 = _mm_unpacklo_epi16(output, sign_bits);
  __m128i out1 = _mm_unpackhi_epi16(output, sign_bits);
  _mm_storeu_si128((__m128i *)(dst_ptr), out0);
  _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1);
#else
  _mm_storeu_si128((__m128i *)(dst_ptr), output);
#endif
}

static INLINE __m128i mult_round_shift(const __m128i in0, const __m128i in1,
                                       const __m128i multiplier,
                                       const __m128i rounding,
                                       const int shift) {
  const __m128i u0 = _mm_madd_epi16(in0, multiplier);
  const __m128i u1 = _mm_madd_epi16(in1, multiplier);
  const __m128i v0 = _mm_add_epi32(u0, rounding);
  const __m128i v1 = _mm_add_epi32(u1, rounding);
  const __m128i w0 = _mm_srai_epi32(v0, shift);
  const __m128i w1 = _mm_srai_epi32(v1, shift);
  return _mm_packs_epi32(w0, w1);
}

static INLINE void transpose_and_output8x8(
    const __m128i in00, const __m128i in01,
    const __m128i in02, const __m128i in03,
    const __m128i in04, const __m128i in05,
    const __m128i in06, const __m128i in07,
    const int pass, int16_t* out0_ptr,
    tran_low_t* out1_ptr) {
  // 00 01 02 03 04 05 06 07
  // 10 11 12 13 14 15 16 17
  // 20 21 22 23 24 25 26 27
  // 30 31 32 33 34 35 36 37
  // 40 41 42 43 44 45 46 47
  // 50 51 52 53 54 55 56 57
  // 60 61 62 63 64 65 66 67
  // 70 71 72 73 74 75 76 77
  const __m128i tr0_0 = _mm_unpacklo_epi16(in00, in01);
  const __m128i tr0_1 = _mm_unpacklo_epi16(in02, in03);
  const __m128i tr0_2 = _mm_unpackhi_epi16(in00, in01);
  const __m128i tr0_3 = _mm_unpackhi_epi16(in02, in03);
  const __m128i tr0_4 = _mm_unpacklo_epi16(in04, in05);
  const __m128i tr0_5 = _mm_unpacklo_epi16(in06, in07);
  const __m128i tr0_6 = _mm_unpackhi_epi16(in04, in05);
  const __m128i tr0_7 = _mm_unpackhi_epi16(in06, in07);
  // 00 10 01 11 02 12 03 13
  // 20 30 21 31 22 32 23 33
  // 04 14 05 15 06 16 07 17
  // 24 34 25 35 26 36 27 37
  // 40 50 41 51 42 52 43 53
  // 60 70 61 71 62 72 63 73
  // 44 54 45 55 46 56 47 57
  // 64 74 65 75 66 76 67 77
  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
  // 00 10 20 30 01 11 21 31
  // 40 50 60 70 41 51 61 71
  // 02 12 22 32 03 13 23 33
  // 42 52 62 72 43 53 63 73
  // 04 14 24 34 05 15 25 35
  // 44 54 64 74 45 55 65 75
  // 06 16 26 36 07 17 27 37
  // 46 56 66 76 47 57 67 77
  const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
  const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
  const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
  const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
  const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
  const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
  const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
  const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
  // 00 10 20 30 40 50 60 70
  // 01 11 21 31 41 51 61 71
  // 02 12 22 32 42 52 62 72
  // 03 13 23 33 43 53 63 73
  // 04 14 24 34 44 54 64 74
  // 05 15 25 35 45 55 65 75
  // 06 16 26 36 46 56 66 76
  // 07 17 27 37 47 57 67 77
  if (pass == 0) {
    _mm_storeu_si128((__m128i*)(out0_ptr + 0 * 16), tr2_0);
    _mm_storeu_si128((__m128i*)(out0_ptr + 1 * 16), tr2_1);
    _mm_storeu_si128((__m128i*)(out0_ptr + 2 * 16), tr2_2);
    _mm_storeu_si128((__m128i*)(out0_ptr + 3 * 16), tr2_3);
    _mm_storeu_si128((__m128i*)(out0_ptr + 4 * 16), tr2_4);
    _mm_storeu_si128((__m128i*)(out0_ptr + 5 * 16), tr2_5);
    _mm_storeu_si128((__m128i*)(out0_ptr + 6 * 16), tr2_6);
    _mm_storeu_si128((__m128i*)(out0_ptr + 7 * 16), tr2_7);
  } else {
    storeu_output(tr2_0, (out1_ptr + 0 * 16));
    storeu_output(tr2_1, (out1_ptr + 1 * 16));
    storeu_output(tr2_2, (out1_ptr + 2 * 16));
    storeu_output(tr2_3, (out1_ptr + 3 * 16));
    storeu_output(tr2_4, (out1_ptr + 4 * 16));
    storeu_output(tr2_5, (out1_ptr + 5 * 16));
    storeu_output(tr2_6, (out1_ptr + 6 * 16));
    storeu_output(tr2_7, (out1_ptr + 7 * 16));
  }
}

#ifdef __cplusplus
}  // extern "C"
#endif

#endif  // VP9_ENCODER_X86_VP9_DCT_SSE2_H_
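A scalar statement of the saturation test that the check_epi16_overflow_* helpers above perform, assuming (as the constants 0x7fff/0x8000 suggest) that a lane equal to INT16_MAX or INT16_MIN is treated as possibly clipped by an earlier saturating add or pack; the helper name is illustrative:

#include <stdint.h>
// Scalar equivalent of check_epi16_overflow_x2 over n lanes: report
// whether any 16-bit value sits at a saturation limit, which is how the
// SSE2 path detects that a saturating operation may have overflowed.
static int check_epi16_overflow_sketch(const int16_t *v, int n) {
  int i;
  for (i = 0; i < n; ++i)
    if (v[i] == INT16_MAX || v[i] == INT16_MIN)
      return 1;
  return 0;
}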
71  vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c  Normal file
@@ -0,0 +1,71 @@
/*
 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <emmintrin.h>
#include <stdio.h>

#include "vp9/common/vp9_common.h"

int64_t vp9_highbd_block_error_sse2(tran_low_t *coeff,
                                    tran_low_t *dqcoeff, intptr_t block_size,
                                    int64_t *ssz, int bps) {
  int i, j, test;
  uint32_t temp[4];
  __m128i max, min, cmp0, cmp1, cmp2, cmp3;
  int64_t error = 0, sqcoeff = 0;
  int shift = 2 * (bps - 8);
  int rounding = shift > 0 ? 1 << (shift - 1) : 0;

  for (i = 0; i < block_size; i += 8) {
    // Load the data into xmm registers
    __m128i mm_coeff = _mm_load_si128((__m128i*) (coeff + i));
    __m128i mm_coeff2 = _mm_load_si128((__m128i*) (coeff + i + 4));
    __m128i mm_dqcoeff = _mm_load_si128((__m128i*) (dqcoeff + i));
    __m128i mm_dqcoeff2 = _mm_load_si128((__m128i*) (dqcoeff + i + 4));
    // Check if any values require more than 15 bits
    max = _mm_set1_epi32(0x3fff);
    min = _mm_set1_epi32(0xffffc000);
    cmp0 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff, max),
                         _mm_cmplt_epi32(mm_coeff, min));
    cmp1 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff2, max),
                         _mm_cmplt_epi32(mm_coeff2, min));
    cmp2 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff, max),
                         _mm_cmplt_epi32(mm_dqcoeff, min));
    cmp3 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff2, max),
                         _mm_cmplt_epi32(mm_dqcoeff2, min));
    test = _mm_movemask_epi8(_mm_or_si128(_mm_or_si128(cmp0, cmp1),
                                          _mm_or_si128(cmp2, cmp3)));

    if (!test) {
      __m128i mm_diff, error_sse2, sqcoeff_sse2;
      mm_coeff = _mm_packs_epi32(mm_coeff, mm_coeff2);
      mm_dqcoeff = _mm_packs_epi32(mm_dqcoeff, mm_dqcoeff2);
      mm_diff = _mm_sub_epi16(mm_coeff, mm_dqcoeff);
      error_sse2 = _mm_madd_epi16(mm_diff, mm_diff);
      sqcoeff_sse2 = _mm_madd_epi16(mm_coeff, mm_coeff);
      _mm_storeu_si128((__m128i*)temp, error_sse2);
      error = error + temp[0] + temp[1] + temp[2] + temp[3];
      _mm_storeu_si128((__m128i*)temp, sqcoeff_sse2);
      sqcoeff += temp[0] + temp[1] + temp[2] + temp[3];
    } else {
      for (j = 0; j < 8; j++) {
        const int64_t diff = coeff[i + j] - dqcoeff[i + j];
        error += diff * diff;
        sqcoeff += (int64_t)coeff[i + j] * (int64_t)coeff[i + j];
      }
    }
  }
  assert(error >= 0 && sqcoeff >= 0);
  error = (error + rounding) >> shift;
  sqcoeff = (sqcoeff + rounding) >> shift;

  *ssz = sqcoeff;
  return error;
}
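The scalar fallback inside the loop is effectively the whole algorithm; a self-contained sketch of the full computation, matching the rounding and the shift by 2*(bps-8) used above (the function name is illustrative):

#include <stdint.h>
// Scalar sketch of the block error: SSD between coeff and dqcoeff plus
// the sum of squared coefficients, both rescaled back toward 8-bit
// precision with round-to-nearest.
static int64_t highbd_block_error_sketch(const int32_t *coeff,
                                         const int32_t *dqcoeff,
                                         intptr_t block_size,
                                         int64_t *ssz, int bps) {
  int64_t error = 0, sqcoeff = 0;
  const int shift = 2 * (bps - 8);
  const int rounding = shift > 0 ? 1 << (shift - 1) : 0;
  intptr_t i;
  for (i = 0; i < block_size; ++i) {
    const int64_t diff = coeff[i] - dqcoeff[i];
    error += diff * diff;
    sqcoeff += (int64_t)coeff[i] * coeff[i];
  }
  *ssz = (sqcoeff + rounding) >> shift;
  return (error + rounding) >> shift;
}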
173  vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c  Normal file
@@ -0,0 +1,173 @@
/*
 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <emmintrin.h>  // SSE2

#include "vp9/common/vp9_common.h"

#if CONFIG_VP9_HIGHBITDEPTH
// from vp9_idct.h: typedef int32_t tran_low_t;
void vp9_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
                                int skip_block,
                                const int16_t *zbin_ptr,
                                const int16_t *round_ptr,
                                const int16_t *quant_ptr,
                                const int16_t *quant_shift_ptr,
                                tran_low_t *qcoeff_ptr,
                                tran_low_t *dqcoeff_ptr,
                                const int16_t *dequant_ptr,
                                int zbin_oq_value, uint16_t *eob_ptr,
                                const int16_t *scan, const int16_t *iscan) {
  int i, j, non_zero_regs = (int)count / 4, eob_i = -1;
  __m128i zbins[2];
  __m128i nzbins[2];

  zbins[0] = _mm_set_epi32((int)(zbin_ptr[1] + zbin_oq_value),
                           (int)(zbin_ptr[1] + zbin_oq_value),
                           (int)(zbin_ptr[1] + zbin_oq_value),
                           (int)(zbin_ptr[0] + zbin_oq_value));
  zbins[1] = _mm_set1_epi32((int)(zbin_ptr[1] + zbin_oq_value));

  nzbins[0] = _mm_setzero_si128();
  nzbins[1] = _mm_setzero_si128();
  nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
  nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);

  (void)scan;

  vpx_memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
  vpx_memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));

  if (!skip_block) {
    // Pre-scan pass
    for (i = ((int)count / 4) - 1; i >= 0; i--) {
      __m128i coeffs, cmp1, cmp2;
      int test;
      coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
      cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
      cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
      cmp1 = _mm_and_si128(cmp1, cmp2);
      test = _mm_movemask_epi8(cmp1);
      if (test == 0xffff)
        non_zero_regs--;
      else
        break;
    }

    // Quantization pass:
    for (i = 0; i < non_zero_regs; i++) {
      __m128i coeffs, coeffs_sign, tmp1, tmp2;
      int test;
      int abs_coeff[4];
      int coeff_sign[4];

      coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
      coeffs_sign = _mm_srai_epi32(coeffs, 31);
      coeffs = _mm_sub_epi32(
          _mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);
      tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]);
      tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]);
      tmp1 = _mm_or_si128(tmp1, tmp2);
      test = _mm_movemask_epi8(tmp1);
      _mm_storeu_si128((__m128i*)abs_coeff, coeffs);
      _mm_storeu_si128((__m128i*)coeff_sign, coeffs_sign);

      for (j = 0; j < 4; j++) {
        if (test & (1 << (4 * j))) {
          int k = 4 * i + j;
          int64_t tmp = clamp(abs_coeff[j] + round_ptr[k != 0],
                              INT32_MIN, INT32_MAX);
          tmp = ((((tmp * quant_ptr[k != 0]) >> 16) + tmp) *
                 quant_shift_ptr[k != 0]) >> 16;  // quantization
          qcoeff_ptr[k] = (tmp ^ coeff_sign[j]) - coeff_sign[j];
          dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
          if (tmp)
            eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;
        }
      }
    }
  }
  *eob_ptr = eob_i + 1;
}

void vp9_highbd_quantize_b_32x32_sse2(const tran_low_t *coeff_ptr,
                                      intptr_t n_coeffs, int skip_block,
                                      const int16_t *zbin_ptr,
                                      const int16_t *round_ptr,
                                      const int16_t *quant_ptr,
                                      const int16_t *quant_shift_ptr,
                                      tran_low_t *qcoeff_ptr,
                                      tran_low_t *dqcoeff_ptr,
                                      const int16_t *dequant_ptr,
                                      int zbin_oq_value, uint16_t *eob_ptr,
                                      const int16_t *scan,
                                      const int16_t *iscan) {
  __m128i zbins[2];
  __m128i nzbins[2];
  int idx = 0;
  int idx_arr[1024];
  int i, eob = -1;
  const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0] + zbin_oq_value, 1);
  const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1] + zbin_oq_value, 1);
  (void)scan;
  // zbin_oq_value is already folded into zbin{0,1}_tmp above; adding it
  // again here would double-count it and diverge from the C reference.
  zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp);
  zbins[1] = _mm_set1_epi32(zbin1_tmp);

  nzbins[0] = _mm_setzero_si128();
  nzbins[1] = _mm_setzero_si128();
  nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
  nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);

  vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
  vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

  if (!skip_block) {
    // Pre-scan pass
    for (i = 0; i < n_coeffs / 4; i++) {
      __m128i coeffs, cmp1, cmp2;
      int test;
      coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
      cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
      cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
      cmp1 = _mm_and_si128(cmp1, cmp2);
      test = _mm_movemask_epi8(cmp1);
      if (!(test & 0xf))
        idx_arr[idx++] = i * 4;
      if (!(test & 0xf0))
        idx_arr[idx++] = i * 4 + 1;
      if (!(test & 0xf00))
        idx_arr[idx++] = i * 4 + 2;
      if (!(test & 0xf000))
        idx_arr[idx++] = i * 4 + 3;
    }

    // Quantization pass: only process the coefficients selected in
    // pre-scan pass. Note: idx can be zero.
    for (i = 0; i < idx; i++) {
      const int rc = idx_arr[i];
      const int coeff = coeff_ptr[rc];
      const int coeff_sign = (coeff >> 31);
      int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
      int64_t tmp = clamp(abs_coeff +
                          ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
                          INT32_MIN, INT32_MAX);
      tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
             quant_shift_ptr[rc != 0]) >> 15;

      qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;

      if (tmp)
        eob = iscan[rc] > eob ? iscan[rc] : eob;
    }
  }
  *eob_ptr = eob + 1;
}
#endif
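The per-coefficient arithmetic that both functions share, pulled out as a scalar sketch: the two-stage multiply approximates (abs_coeff + round) * q / 2^16 using a 16-bit quantizer plus a quant_shift correction term (the function and parameter names are illustrative):

// Scalar sketch of the shared quantization step. tmp is the rounded
// absolute coefficient; quant and quant_shift are the 16-bit quantizer
// pair used above. The 32x32 variant shifts by 15 instead of 16 and
// halves the dequantized value, compensating for the halved zbin/round.
static int32_t quantize_coeff_sketch(int64_t tmp, int quant,
                                     int quant_shift) {
  tmp = ((((tmp * quant) >> 16) + tmp) * quant_shift) >> 16;
  return (int32_t)tmp;
}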
284  vp9/encoder/x86/vp9_highbd_sad4d_sse2.asm  Normal file
@@ -0,0 +1,284 @@
;
; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

; HIGH_PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end
%macro HIGH_PROCESS_4x2x4 5-6 0
  movh                  m0, [srcq +%2*2]
%if %1 == 1
  movu                  m4, [ref1q+%3*2]
  movu                  m5, [ref2q+%3*2]
  movu                  m6, [ref3q+%3*2]
  movu                  m7, [ref4q+%3*2]
  movhps                m0, [srcq +%4*2]
  movhps                m4, [ref1q+%5*2]
  movhps                m5, [ref2q+%5*2]
  movhps                m6, [ref3q+%5*2]
  movhps                m7, [ref4q+%5*2]
  mova                  m3, m0
  mova                  m2, m0
  psubusw               m3, m4
  psubusw               m2, m5
  psubusw               m4, m0
  psubusw               m5, m0
  por                   m4, m3
  por                   m5, m2
  pmaddwd               m4, m1
  pmaddwd               m5, m1
  mova                  m3, m0
  mova                  m2, m0
  psubusw               m3, m6
  psubusw               m2, m7
  psubusw               m6, m0
  psubusw               m7, m0
  por                   m6, m3
  por                   m7, m2
  pmaddwd               m6, m1
  pmaddwd               m7, m1
%else
  movu                  m2, [ref1q+%3*2]
  movhps                m0, [srcq +%4*2]
  movhps                m2, [ref1q+%5*2]
  mova                  m3, m0
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  pmaddwd               m2, m1
  paddd                 m4, m2

  movu                  m2, [ref2q+%3*2]
  mova                  m3, m0
  movhps                m2, [ref2q+%5*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  pmaddwd               m2, m1
  paddd                 m5, m2

  movu                  m2, [ref3q+%3*2]
  mova                  m3, m0
  movhps                m2, [ref3q+%5*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  pmaddwd               m2, m1
  paddd                 m6, m2

  movu                  m2, [ref4q+%3*2]
  mova                  m3, m0
  movhps                m2, [ref4q+%5*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  pmaddwd               m2, m1
  paddd                 m7, m2
%endif
%if %6 == 1
  lea                 srcq, [srcq +src_strideq*4]
  lea                ref1q, [ref1q+ref_strideq*4]
  lea                ref2q, [ref2q+ref_strideq*4]
  lea                ref3q, [ref3q+ref_strideq*4]
  lea                ref4q, [ref4q+ref_strideq*4]
%endif
%endmacro

; HIGH_PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end
%macro HIGH_PROCESS_8x2x4 5-6 0
  ; 1st 8 px
  mova                  m0, [srcq +%2*2]
%if %1 == 1
  movu                  m4, [ref1q+%3*2]
  movu                  m5, [ref2q+%3*2]
  movu                  m6, [ref3q+%3*2]
  movu                  m7, [ref4q+%3*2]
  mova                  m3, m0
  mova                  m2, m0
  psubusw               m3, m4
  psubusw               m2, m5
  psubusw               m4, m0
  psubusw               m5, m0
  por                   m4, m3
  por                   m5, m2
  pmaddwd               m4, m1
  pmaddwd               m5, m1
  mova                  m3, m0
  mova                  m2, m0
  psubusw               m3, m6
  psubusw               m2, m7
  psubusw               m6, m0
  psubusw               m7, m0
  por                   m6, m3
  por                   m7, m2
  pmaddwd               m6, m1
  pmaddwd               m7, m1
%else
  mova                  m3, m0
  movu                  m2, [ref1q+%3*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  mova                  m3, m0
  pmaddwd               m2, m1
  paddd                 m4, m2
  movu                  m2, [ref2q+%3*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  mova                  m3, m0
  pmaddwd               m2, m1
  paddd                 m5, m2
  movu                  m2, [ref3q+%3*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  mova                  m3, m0
  pmaddwd               m2, m1
  paddd                 m6, m2
  movu                  m2, [ref4q+%3*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  pmaddwd               m2, m1
  paddd                 m7, m2
%endif

  ; 2nd 8 px
  mova                  m0, [srcq +(%4)*2]
  mova                  m3, m0
  movu                  m2, [ref1q+(%5)*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  mova                  m3, m0
  pmaddwd               m2, m1
  paddd                 m4, m2
  movu                  m2, [ref2q+(%5)*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  mova                  m3, m0
  pmaddwd               m2, m1
  paddd                 m5, m2
  movu                  m2, [ref3q+(%5)*2]
  psubusw               m3, m2
  psubusw               m2, m0
  por                   m2, m3
  mova                  m3, m0
  pmaddwd               m2, m1
  paddd                 m6, m2
  movu                  m2, [ref4q+(%5)*2]
  psubusw               m3, m2
  psubusw               m2, m0
%if %6 == 1
  lea                 srcq, [srcq +src_strideq*4]
  lea                ref1q, [ref1q+ref_strideq*4]
  lea                ref2q, [ref2q+ref_strideq*4]
  lea                ref3q, [ref3q+ref_strideq*4]
  lea                ref4q, [ref4q+ref_strideq*4]
%endif
  por                   m2, m3
  pmaddwd               m2, m1
  paddd                 m7, m2
%endmacro

; HIGH_PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end
%macro HIGH_PROCESS_16x2x4 5-6 0
  HIGH_PROCESS_8x2x4 %1, %2, %3, (%2 + 8), (%3 + 8)
  HIGH_PROCESS_8x2x4  0, %4, %5, (%4 + 8), (%5 + 8), %6
%endmacro

; HIGH_PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end
%macro HIGH_PROCESS_32x2x4 5-6 0
  HIGH_PROCESS_16x2x4 %1, %2, %3, (%2 + 16), (%3 + 16)
  HIGH_PROCESS_16x2x4  0, %4, %5, (%4 + 16), (%5 + 16), %6
%endmacro

; HIGH_PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end
%macro HIGH_PROCESS_64x2x4 5-6 0
  HIGH_PROCESS_32x2x4 %1, %2, %3, (%2 + 32), (%3 + 32)
  HIGH_PROCESS_32x2x4  0, %4, %5, (%4 + 32), (%5 + 32), %6
%endmacro

; void vp9_highbd_sadNxNx4d_sse2(uint8_t *src, int src_stride,
;                                uint8_t *ref[4], int ref_stride,
;                                unsigned int res[4]);
; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16 or 8x8
%macro HIGH_SADNXN4D 2
%if UNIX64
cglobal highbd_sad%1x%2x4d, 5, 9, 8, src, src_stride, ref1, ref_stride, \
                              res, ref2, ref3, ref4, one
%else
cglobal highbd_sad%1x%2x4d, 4, 8, 8, src, src_stride, ref1, ref_stride, \
                              ref2, ref3, ref4, one
%endif

  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
  mov                ref2q, [ref1q+gprsize*1]
  mov                ref3q, [ref1q+gprsize*2]
  mov                ref4q, [ref1q+gprsize*3]
  mov                ref1q, [ref1q+gprsize*0]

  ; convert byte pointers to short pointers
  shl                 srcq, 1
  shl                ref2q, 1
  shl                ref3q, 1
  shl                ref4q, 1
  shl                ref1q, 1

  mov                 oned, 0x00010001
  movd                  m1, oned
  pshufd                m1, m1, 0x0

  HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
%rep (%2-4)/2
  HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
%endrep
  HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0
  ; N.B. HIGH_PROCESS outputs dwords (32 bits)
  ; so in high bit depth even the smallest width (4) needs 128bits i.e. XMM
  movhlps               m0, m4
  movhlps               m1, m5
  movhlps               m2, m6
  movhlps               m3, m7
  paddd                 m4, m0
  paddd                 m5, m1
  paddd                 m6, m2
  paddd                 m7, m3
  punpckldq             m4, m5
  punpckldq             m6, m7
  movhlps               m0, m4
  movhlps               m1, m6
  paddd                 m4, m0
  paddd                 m6, m1
  punpcklqdq            m4, m6
  movifnidn             r4, r4mp
  movu                [r4], m4
  RET
%endmacro


INIT_XMM sse2
HIGH_SADNXN4D 64, 64
HIGH_SADNXN4D 64, 32
HIGH_SADNXN4D 32, 64
HIGH_SADNXN4D 32, 32
HIGH_SADNXN4D 32, 16
HIGH_SADNXN4D 16, 32
HIGH_SADNXN4D 16, 16
HIGH_SADNXN4D 16,  8
HIGH_SADNXN4D  8, 16
HIGH_SADNXN4D  8,  8
HIGH_SADNXN4D  8,  4
HIGH_SADNXN4D  4,  8
HIGH_SADNXN4D  4,  4
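What each HIGH_PROCESS step accumulates, as a scalar sketch: a sum of absolute differences over 16-bit samples, where the psubusw/por pair computes |a - b| without a signed subtract and pmaddwd against a vector of ones widens the sums to 32 bits (the function name is illustrative):

#include <stdint.h>
#include <stdlib.h>
// Scalar sketch of a high bit depth SAD over a w x h block; strides are
// in samples, matching the asm's byte-to-short pointer conversion.
static unsigned int highbd_sad_sketch(const uint16_t *src, int src_stride,
                                      const uint16_t *ref, int ref_stride,
                                      int w, int h) {
  unsigned int sad = 0;
  int r, c;
  for (r = 0; r < h; ++r)
    for (c = 0; c < w; ++c)
      sad += abs(src[r * src_stride + c] - ref[r * ref_stride + c]);
  return sad;
}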
363  vp9/encoder/x86/vp9_highbd_sad_sse2.asm  Normal file
@@ -0,0 +1,363 @@
;
; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

%macro HIGH_SAD_FN 4
%if %4 == 0
%if %3 == 5
cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \
                            src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7
%else ; avg
%if %3 == 5
cglobal highbd_sad%1x%2_avg, 5, 1 + %3, 7, src, src_stride, ref, ref_stride, \
                                    second_pred, n_rows
%else ; %3 == 7
cglobal highbd_sad%1x%2_avg, 5, ARCH_X86_64 + %3, 7, src, src_stride, \
                                              ref, ref_stride, \
                                              second_pred, \
                                              src_stride3, ref_stride3
%if ARCH_X86_64
%define n_rowsd r7d
%else ; x86-32
%define n_rowsd dword r0m
%endif ; x86-32/64
%endif ; %3 == 5/7
%endif ; avg/sad
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
%if %3 == 7
  lea         src_stride3q, [src_strideq*3]
  lea         ref_stride3q, [ref_strideq*3]
%endif ; %3 == 7
  ; convert src, ref & second_pred to short ptrs (from byte ptrs)
  shl                 srcq, 1
  shl                 refq, 1
%if %4 == 1
  shl         second_predq, 1
%endif
%endmacro

; unsigned int vp9_highbd_sad64x{32,64}_sse2(uint8_t *src, int src_stride,
;                                            uint8_t *ref, int ref_stride);
%macro HIGH_SAD64XN 1-2 0
  HIGH_SAD_FN 64, %1, 5, %2
  mov              n_rowsd, %1
  pxor                  m0, m0
  pxor                  m6, m6

.loop:
  ; first half of each row
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+32]
  movu                  m4, [refq+48]
%if %2 == 1
  pavgw                 m1, [second_predq+mmsize*0]
  pavgw                 m2, [second_predq+mmsize*1]
  pavgw                 m3, [second_predq+mmsize*2]
  pavgw                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  mova                  m5, [srcq]
  psubusw               m5, m1
  psubusw               m1, [srcq]
  por                   m1, m5
  mova                  m5, [srcq+16]
  psubusw               m5, m2
  psubusw               m2, [srcq+16]
  por                   m2, m5
  mova                  m5, [srcq+32]
  psubusw               m5, m3
  psubusw               m3, [srcq+32]
  por                   m3, m5
  mova                  m5, [srcq+48]
  psubusw               m5, m4
  psubusw               m4, [srcq+48]
  por                   m4, m5
  paddw                 m1, m2
  paddw                 m3, m4
  movhlps               m2, m1
  movhlps               m4, m3
  paddw                 m1, m2
  paddw                 m3, m4
  punpcklwd             m1, m6
  punpcklwd             m3, m6
  paddd                 m0, m1
  paddd                 m0, m3
  ; second half of each row
  movu                  m1, [refq+64]
  movu                  m2, [refq+80]
  movu                  m3, [refq+96]
  movu                  m4, [refq+112]
%if %2 == 1
  pavgw                 m1, [second_predq+mmsize*0]
  pavgw                 m2, [second_predq+mmsize*1]
  pavgw                 m3, [second_predq+mmsize*2]
  pavgw                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  mova                  m5, [srcq+64]
  psubusw               m5, m1
  psubusw               m1, [srcq+64]
  por                   m1, m5
  mova                  m5, [srcq+80]
  psubusw               m5, m2
  psubusw               m2, [srcq+80]
  por                   m2, m5
  mova                  m5, [srcq+96]
  psubusw               m5, m3
  psubusw               m3, [srcq+96]
  por                   m3, m5
  mova                  m5, [srcq+112]
  psubusw               m5, m4
  psubusw               m4, [srcq+112]
  por                   m4, m5
  paddw                 m1, m2
  paddw                 m3, m4
  movhlps               m2, m1
  movhlps               m4, m3
  paddw                 m1, m2
  paddw                 m3, m4
  punpcklwd             m1, m6
  punpcklwd             m3, m6
  lea                 refq, [refq+ref_strideq*2]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*2]
  paddd                 m0, m3

  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
  punpckldq             m0, m6
  movhlps               m1, m0
  paddd                 m0, m1
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
HIGH_SAD64XN 64 ; highbd_sad64x64_sse2
HIGH_SAD64XN 32 ; highbd_sad64x32_sse2
HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2
HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2


; unsigned int vp9_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride,
;                                               uint8_t *ref, int ref_stride);
%macro HIGH_SAD32XN 1-2 0
  HIGH_SAD_FN 32, %1, 5, %2
  mov              n_rowsd, %1
  pxor                  m0, m0
  pxor                  m6, m6

.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+32]
  movu                  m4, [refq+48]
%if %2 == 1
  pavgw                 m1, [second_predq+mmsize*0]
  pavgw                 m2, [second_predq+mmsize*1]
  pavgw                 m3, [second_predq+mmsize*2]
  pavgw                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  mova                  m5, [srcq]
  psubusw               m5, m1
  psubusw               m1, [srcq]
  por                   m1, m5
  mova                  m5, [srcq+16]
  psubusw               m5, m2
  psubusw               m2, [srcq+16]
  por                   m2, m5
  mova                  m5, [srcq+32]
  psubusw               m5, m3
  psubusw               m3, [srcq+32]
  por                   m3, m5
  mova                  m5, [srcq+48]
  psubusw               m5, m4
  psubusw               m4, [srcq+48]
  por                   m4, m5
  paddw                 m1, m2
  paddw                 m3, m4
  movhlps               m2, m1
  movhlps               m4, m3
  paddw                 m1, m2
  paddw                 m3, m4
  punpcklwd             m1, m6
  punpcklwd             m3, m6
  lea                 refq, [refq+ref_strideq*2]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*2]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
  punpckldq             m0, m6
  movhlps               m1, m0
  paddd                 m0, m1
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
HIGH_SAD32XN 64 ; highbd_sad32x64_sse2
HIGH_SAD32XN 32 ; highbd_sad32x32_sse2
HIGH_SAD32XN 16 ; highbd_sad32x16_sse2
HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2
HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2
HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2

; unsigned int vp9_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride,
;                                              uint8_t *ref, int ref_stride);
%macro HIGH_SAD16XN 1-2 0
  HIGH_SAD_FN 16, %1, 5, %2
  mov              n_rowsd, %1/2
  pxor                  m0, m0
  pxor                  m6, m6

.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+ref_strideq*2]
  movu                  m4, [refq+ref_strideq*2+16]
%if %2 == 1
  pavgw                 m1, [second_predq+mmsize*0]
  pavgw                 m2, [second_predq+16]
  pavgw                 m3, [second_predq+mmsize*2]
  pavgw                 m4, [second_predq+mmsize*2+16]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  mova                  m5, [srcq]
  psubusw               m5, m1
  psubusw               m1, [srcq]
  por                   m1, m5
  mova                  m5, [srcq+16]
  psubusw               m5, m2
  psubusw               m2, [srcq+16]
  por                   m2, m5
  mova                  m5, [srcq+src_strideq*2]
  psubusw               m5, m3
  psubusw               m3, [srcq+src_strideq*2]
  por                   m3, m5
  mova                  m5, [srcq+src_strideq*2+16]
  psubusw               m5, m4
  psubusw               m4, [srcq+src_strideq*2+16]
  por                   m4, m5
  paddw                 m1, m2
  paddw                 m3, m4
  movhlps               m2, m1
  movhlps               m4, m3
  paddw                 m1, m2
  paddw                 m3, m4
  punpcklwd             m1, m6
  punpcklwd             m3, m6
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
  punpckldq             m0, m6
  movhlps               m1, m0
  paddd                 m0, m1
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
HIGH_SAD16XN 32 ; highbd_sad16x32_sse2
HIGH_SAD16XN 16 ; highbd_sad16x16_sse2
HIGH_SAD16XN  8 ; highbd_sad16x8_sse2
HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2
HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2
HIGH_SAD16XN  8, 1 ; highbd_sad16x8_avg_sse2


; unsigned int vp9_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride,
;                                            uint8_t *ref, int ref_stride);
%macro HIGH_SAD8XN 1-2 0
  HIGH_SAD_FN 8, %1, 7, %2
  mov              n_rowsd, %1/4
  pxor                  m0, m0
  pxor                  m6, m6

.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+ref_strideq*2]
  movu                  m3, [refq+ref_strideq*4]
  movu                  m4, [refq+ref_stride3q*2]
%if %2 == 1
  pavgw                 m1, [second_predq+mmsize*0]
  pavgw                 m2, [second_predq+mmsize*1]
  pavgw                 m3, [second_predq+mmsize*2]
  pavgw                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  mova                  m5, [srcq]
  psubusw               m5, m1
  psubusw               m1, [srcq]
  por                   m1, m5
  mova                  m5, [srcq+src_strideq*2]
  psubusw               m5, m2
  psubusw               m2, [srcq+src_strideq*2]
  por                   m2, m5
  mova                  m5, [srcq+src_strideq*4]
  psubusw               m5, m3
  psubusw               m3, [srcq+src_strideq*4]
  por                   m3, m5
  mova                  m5, [srcq+src_stride3q*2]
  psubusw               m5, m4
  psubusw               m4, [srcq+src_stride3q*2]
  por                   m4, m5
  paddw                 m1, m2
  paddw                 m3, m4
  movhlps               m2, m1
  movhlps               m4, m3
  paddw                 m1, m2
  paddw                 m3, m4
  punpcklwd             m1, m6
  punpcklwd             m3, m6
  lea                 refq, [refq+ref_strideq*8]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*8]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
  punpckldq             m0, m6
  movhlps               m1, m0
  paddd                 m0, m1
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
HIGH_SAD8XN 16 ; highbd_sad8x16_sse2
HIGH_SAD8XN  8 ; highbd_sad8x8_sse2
HIGH_SAD8XN  4 ; highbd_sad8x4_sse2
HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2
HIGH_SAD8XN  8, 1 ; highbd_sad8x8_avg_sse2
HIGH_SAD8XN  4, 1 ; highbd_sad8x4_avg_sse2
1043  vp9/encoder/x86/vp9_highbd_subpel_variance.asm  Normal file (diff suppressed because it is too large)
313  vp9/encoder/x86/vp9_highbd_variance_impl_sse2.asm  Normal file
@@ -0,0 +1,313 @@
;
; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;unsigned int vp9_highbd_calc16x16var_sse2
;(
;    unsigned char   *  src_ptr,
;    int             source_stride,
;    unsigned char   *  ref_ptr,
;    int             recon_stride,
;    unsigned int    *  SSE,
;    int             *  Sum
;)
global sym(vp9_highbd_calc16x16var_sse2) PRIVATE
sym(vp9_highbd_calc16x16var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                 ;[src_ptr]
    mov         rdi, arg(2)                 ;[ref_ptr]

    movsxd      rax, DWORD PTR arg(1)       ;[source_stride]
    movsxd      rdx, DWORD PTR arg(3)       ;[recon_stride]
    add         rax, rax                    ; source stride in bytes
    add         rdx, rdx                    ; recon stride in bytes

    ; Prefetch data
    prefetcht0  [rsi]
    prefetcht0  [rsi+16]
    prefetcht0  [rsi+rax]
    prefetcht0  [rsi+rax+16]
    lea         rbx, [rsi+rax*2]
    prefetcht0  [rbx]
    prefetcht0  [rbx+16]
    prefetcht0  [rbx+rax]
    prefetcht0  [rbx+rax+16]

    prefetcht0  [rdi]
    prefetcht0  [rdi+16]
    prefetcht0  [rdi+rdx]
    prefetcht0  [rdi+rdx+16]
    lea         rbx, [rdi+rdx*2]
    prefetcht0  [rbx]
    prefetcht0  [rbx+16]
    prefetcht0  [rbx+rdx]
    prefetcht0  [rbx+rdx+16]

    pxor        xmm0, xmm0                  ; clear xmm0 for unpack
    pxor        xmm7, xmm7                  ; clear xmm7 for accumulating diffs

    pxor        xmm6, xmm6                  ; clear xmm6 for accumulating sse
    mov         rcx, 16

.var16loop:
    movdqu      xmm1, XMMWORD PTR [rsi]
    movdqu      xmm2, XMMWORD PTR [rdi]

    lea         rbx, [rsi+rax*2]
    prefetcht0  [rbx]
    prefetcht0  [rbx+16]
    prefetcht0  [rbx+rax]
    prefetcht0  [rbx+rax+16]
    lea         rbx, [rdi+rdx*2]
    prefetcht0  [rbx]
    prefetcht0  [rbx+16]
    prefetcht0  [rbx+rdx]
    prefetcht0  [rbx+rdx+16]

    pxor        xmm5, xmm5

    psubw       xmm1, xmm2
    movdqu      xmm3, XMMWORD PTR [rsi+16]
    paddw       xmm5, xmm1
    pmaddwd     xmm1, xmm1
    movdqu      xmm2, XMMWORD PTR [rdi+16]
    paddd       xmm6, xmm1

    psubw       xmm3, xmm2
    movdqu      xmm1, XMMWORD PTR [rsi+rax]
    paddw       xmm5, xmm3
    pmaddwd     xmm3, xmm3
    movdqu      xmm2, XMMWORD PTR [rdi+rdx]
    paddd       xmm6, xmm3

    psubw       xmm1, xmm2
    movdqu      xmm3, XMMWORD PTR [rsi+rax+16]
    paddw       xmm5, xmm1
    pmaddwd     xmm1, xmm1
    movdqu      xmm2, XMMWORD PTR [rdi+rdx+16]
    paddd       xmm6, xmm1

    psubw       xmm3, xmm2
    paddw       xmm5, xmm3
    pmaddwd     xmm3, xmm3
    paddd       xmm6, xmm3

    movdqa      xmm1, xmm5
    movdqa      xmm2, xmm5
    pcmpgtw     xmm1, xmm0
    pcmpeqw     xmm2, xmm0
    por         xmm1, xmm2
    pcmpeqw     xmm1, xmm0
    movdqa      xmm2, xmm5
    punpcklwd   xmm5, xmm1
    punpckhwd   xmm2, xmm1
    paddd       xmm7, xmm5
    paddd       xmm7, xmm2

    lea         rsi, [rsi + 2*rax]
    lea         rdi, [rdi + 2*rdx]
    sub         rcx, 2
    jnz         .var16loop

    movdqa      xmm4, xmm6
    punpckldq   xmm6, xmm0

    punpckhdq   xmm4, xmm0
    movdqa      xmm5, xmm7

    paddd       xmm6, xmm4
    punpckldq   xmm7, xmm0

    punpckhdq   xmm5, xmm0
    paddd       xmm7, xmm5

    movdqa      xmm4, xmm6
    movdqa      xmm5, xmm7

    psrldq      xmm4, 8
    psrldq      xmm5, 8

    paddd       xmm6, xmm4
    paddd       xmm7, xmm5

    mov         rdi, arg(4)                 ; [SSE]
    mov         rax, arg(5)                 ; [Sum]

    movd DWORD PTR [rdi], xmm6
    movd DWORD PTR [rax], xmm7


    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;unsigned int vp9_highbd_calc8x8var_sse2
;(
;    unsigned char   *  src_ptr,
;    int             source_stride,
;    unsigned char   *  ref_ptr,
;    int             recon_stride,
;    unsigned int    *  SSE,
;    int             *  Sum
;)
global sym(vp9_highbd_calc8x8var_sse2) PRIVATE
sym(vp9_highbd_calc8x8var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                 ;[src_ptr]
    mov         rdi, arg(2)                 ;[ref_ptr]

    movsxd      rax, DWORD PTR arg(1)       ;[source_stride]
    movsxd      rdx, DWORD PTR arg(3)       ;[recon_stride]
    add         rax, rax                    ; source stride in bytes
    add         rdx, rdx                    ; recon stride in bytes

    ; Prefetch data
    prefetcht0  [rsi]
    prefetcht0  [rsi+rax]
    lea         rbx, [rsi+rax*2]
    prefetcht0  [rbx]
    prefetcht0  [rbx+rax]

    prefetcht0  [rdi]
    prefetcht0  [rdi+rdx]
    lea         rbx, [rdi+rdx*2]
    prefetcht0  [rbx]
    prefetcht0  [rbx+rdx]

    pxor        xmm0, xmm0                  ; clear xmm0 for unpack
    pxor        xmm7, xmm7                  ; clear xmm7 for accumulating diffs

    pxor        xmm6, xmm6                  ; clear xmm6 for accumulating sse
    mov         rcx, 8

.var8loop:
    movdqu      xmm1, XMMWORD PTR [rsi]
    movdqu      xmm2, XMMWORD PTR [rdi]

    lea         rbx, [rsi+rax*4]
    prefetcht0  [rbx]
    prefetcht0  [rbx+rax]
    lea         rbx, [rbx+rax*2]
    prefetcht0  [rbx]
    prefetcht0  [rbx+rax]
    lea         rbx, [rdi+rdx*4]
    prefetcht0  [rbx]
    prefetcht0  [rbx+rdx]
    lea         rbx, [rbx+rdx*2]
    prefetcht0  [rbx]
    prefetcht0  [rbx+rdx]

    pxor        xmm5, xmm5

    psubw       xmm1, xmm2
    movdqu      xmm3, XMMWORD PTR [rsi+rax]
    paddw       xmm5, xmm1
    pmaddwd     xmm1, xmm1
    movdqu      xmm2, XMMWORD PTR [rdi+rdx]
    paddd       xmm6, xmm1

    lea         rsi, [rsi + 2*rax]
    lea         rdi, [rdi + 2*rdx]

    psubw       xmm3, xmm2
    movdqu      xmm1, XMMWORD PTR [rsi]
    paddw       xmm5, xmm3
    pmaddwd     xmm3, xmm3
    movdqu      xmm2, XMMWORD PTR [rdi]
    paddd       xmm6, xmm3

    psubw       xmm1, xmm2
    movdqu      xmm3, XMMWORD PTR [rsi+rax]
    paddw       xmm5, xmm1
    pmaddwd     xmm1, xmm1
    movdqu      xmm2, XMMWORD PTR [rdi+rdx]
    paddd       xmm6, xmm1

    psubw       xmm3, xmm2
    paddw       xmm5, xmm3
    pmaddwd     xmm3, xmm3
    paddd       xmm6, xmm3

    movdqa      xmm1, xmm5
    movdqa      xmm2, xmm5
    pcmpgtw     xmm1, xmm0
    pcmpeqw     xmm2, xmm0
    por         xmm1, xmm2
    pcmpeqw     xmm1, xmm0
    movdqa      xmm2, xmm5
    punpcklwd   xmm5, xmm1
    punpckhwd   xmm2, xmm1
    paddd       xmm7, xmm5
    paddd       xmm7, xmm2

    lea         rsi, [rsi + 2*rax]
    lea         rdi, [rdi + 2*rdx]
    sub         rcx, 4
    jnz         .var8loop

    movdqa      xmm4, xmm6
    punpckldq   xmm6, xmm0

    punpckhdq   xmm4, xmm0
    movdqa      xmm5, xmm7

    paddd       xmm6, xmm4
    punpckldq   xmm7, xmm0

    punpckhdq   xmm5, xmm0
    paddd       xmm7, xmm5

    movdqa      xmm4, xmm6
    movdqa      xmm5, xmm7

    psrldq      xmm4, 8
    psrldq      xmm5, 8

    paddd       xmm6, xmm4
    paddd       xmm7, xmm5

    mov         rdi, arg(4)                 ; [SSE]
    mov         rax, arg(5)                 ; [Sum]

    movd DWORD PTR [rdi], xmm6
    movd DWORD PTR [rax], xmm7

    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
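A scalar statement of what the two kernels above return through their out-params: the sum and the sum of squares of (src - ref) over an NxN block; the variance itself is assembled later as sse - sum^2/N (the function name is illustrative):

#include <stdint.h>
// Scalar sketch of the 8x8/16x16 variance kernels: accumulate the sum
// and the sum of squared differences over an n x n block of 16-bit
// samples; strides are in samples.
static void highbd_calc_var_sketch(const uint16_t *src, int src_stride,
                                   const uint16_t *ref, int ref_stride,
                                   int n, unsigned int *sse, int *sum) {
  int r, c;
  *sse = 0;
  *sum = 0;
  for (r = 0; r < n; ++r) {
    for (c = 0; c < n; ++c) {
      const int diff = src[r * src_stride + c] - ref[r * ref_stride + c];
      *sum += diff;
      *sse += diff * diff;
    }
  }
}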
613  vp9/encoder/x86/vp9_highbd_variance_sse2.c  Normal file
@@ -0,0 +1,613 @@
|
||||
/*
|
||||
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "./vpx_config.h"
|
||||
#include "vp9/common/vp9_common.h"
|
||||
|
||||
#include "vp9/encoder/vp9_variance.h"
|
||||
#include "vpx_ports/mem.h"
|
||||
|
||||
typedef unsigned int (*high_variance_fn_t) (const uint16_t *src, int src_stride,
|
||||
const uint16_t *ref, int ref_stride,
|
||||
unsigned int *sse, int *sum);
|
||||
|
||||
unsigned int vp9_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride,
|
||||
const uint16_t *ref, int ref_stride,
|
||||
unsigned int *sse, int *sum);
|
||||
|
||||
unsigned int vp9_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride,
|
||||
const uint16_t *ref, int ref_stride,
|
||||
unsigned int *sse, int *sum);
|
||||
|
||||
static void highbd_variance_sse2(const uint16_t *src, int src_stride,
|
||||
const uint16_t *ref, int ref_stride,
|
||||
int w, int h, unsigned int *sse, int *sum,
|
||||
high_variance_fn_t var_fn, int block_size) {
|
||||
int i, j;
|
||||
|
||||
*sse = 0;
|
||||
*sum = 0;
|
||||
|
||||
for (i = 0; i < h; i += block_size) {
|
||||
for (j = 0; j < w; j += block_size) {
|
||||
unsigned int sse0;
|
||||
int sum0;
|
||||
var_fn(src + src_stride * i + j, src_stride,
|
||||
ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
|
||||
*sse += sse0;
|
||||
*sum += sum0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void highbd_10_variance_sse2(const uint16_t *src, int src_stride,
|
||||
const uint16_t *ref, int ref_stride,
|
||||
int w, int h, unsigned int *sse, int *sum,
|
||||
high_variance_fn_t var_fn, int block_size) {
|
||||
int i, j;
|
||||
uint64_t sse_long = 0;
|
||||
int64_t sum_long = 0;
|
||||
|
||||
for (i = 0; i < h; i += block_size) {
|
||||
for (j = 0; j < w; j += block_size) {
|
||||
unsigned int sse0;
|
||||
int sum0;
|
||||
var_fn(src + src_stride * i + j, src_stride,
|
||||
ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
|
||||
sse_long += sse0;
|
||||
sum_long += sum0;
|
||||
}
|
||||
}
|
||||
*sum = ROUND_POWER_OF_TWO(sum_long, 2);
|
||||
*sse = ROUND_POWER_OF_TWO(sse_long, 4);
|
||||
}
|
||||
|
||||
static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
|
||||
const uint16_t *ref, int ref_stride,
|
||||
int w, int h, unsigned int *sse, int *sum,
|
||||
high_variance_fn_t var_fn, int block_size) {
|
||||
int i, j;
|
||||
uint64_t sse_long = 0;
|
||||
int64_t sum_long = 0;
|
||||
|
||||
for (i = 0; i < h; i += block_size) {
|
||||
for (j = 0; j < w; j += block_size) {
|
||||
unsigned int sse0;
|
||||
int sum0;
|
||||
var_fn(src + src_stride * i + j, src_stride,
|
||||
ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
|
||||
sse_long += sse0;
|
||||
sum_long += sum0;
|
||||
}
|
||||
}
|
||||
*sum = ROUND_POWER_OF_TWO(sum_long, 4);
|
||||
*sse = ROUND_POWER_OF_TWO(sse_long, 8);
|
||||
}
|
||||
|
||||
|
||||
#define HIGH_GET_VAR(S) \
|
||||
void vp9_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
|
||||
const uint8_t *ref8, int ref_stride, \
|
||||
unsigned int *sse, int *sum) { \
|
||||
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
|
||||
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
|
||||
vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
|
||||
sse, sum); \
|
||||
} \
|
||||
\
|
||||
void vp9_highbd_10_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
|
||||
const uint8_t *ref8, int ref_stride, \
|
||||
unsigned int *sse, int *sum) { \
|
||||
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
|
||||
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
|
||||
vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
|
||||
sse, sum); \
|
||||
*sum = ROUND_POWER_OF_TWO(*sum, 2); \
|
||||
*sse = ROUND_POWER_OF_TWO(*sse, 4); \
|
||||
} \
|
||||
\
|
||||
void vp9_highbd_12_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
|
||||
const uint8_t *ref8, int ref_stride, \
|
||||
unsigned int *sse, int *sum) { \
|
||||
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
|
||||
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
|
||||
vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
|
||||
sse, sum); \
|
||||
*sum = ROUND_POWER_OF_TWO(*sum, 4); \
|
||||
*sse = ROUND_POWER_OF_TWO(*sse, 8); \
|
||||
}
|
||||
|
||||
HIGH_GET_VAR(16);
|
||||
HIGH_GET_VAR(8);
|
||||
|
||||
#undef HIGH_GET_VAR

#define VAR_FN(w, h, block_size, shift) \
unsigned int vp9_highbd_variance##w##x##h##_sse2( \
    const uint8_t *src8, int src_stride, \
    const uint8_t *ref8, int ref_stride, \
    unsigned int *sse) { \
  int sum; \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
  highbd_variance_sse2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \
                       vp9_highbd_calc##block_size##x##block_size##var_sse2, \
                       block_size); \
  return *sse - (((int64_t)sum * sum) >> shift); \
} \
unsigned int vp9_highbd_10_variance##w##x##h##_sse2( \
    const uint8_t *src8, int src_stride, \
    const uint8_t *ref8, int ref_stride, \
    unsigned int *sse) { \
  int sum; \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
  highbd_10_variance_sse2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \
                          vp9_highbd_calc##block_size##x##block_size##var_sse2, \
                          block_size); \
  return *sse - (((int64_t)sum * sum) >> shift); \
} \
unsigned int vp9_highbd_12_variance##w##x##h##_sse2( \
    const uint8_t *src8, int src_stride, \
    const uint8_t *ref8, int ref_stride, \
    unsigned int *sse) { \
  int sum; \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
  highbd_12_variance_sse2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \
                          vp9_highbd_calc##block_size##x##block_size##var_sse2, \
                          block_size); \
  return *sse - (((int64_t)sum * sum) >> shift); \
}

VAR_FN(64, 64, 16, 12);
VAR_FN(64, 32, 16, 11);
VAR_FN(32, 64, 16, 11);
VAR_FN(32, 32, 16, 10);
VAR_FN(32, 16, 16, 9);
VAR_FN(16, 32, 16, 9);
VAR_FN(16, 16, 16, 8);
VAR_FN(16, 8, 8, 7);
VAR_FN(8, 16, 8, 7);
VAR_FN(8, 8, 8, 6);

#undef VAR_FN
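
In every instantiation the shift argument is log2(w * h), so the subtracted
term is the squared mean and the return value is the usual
variance = SSE - (sum * sum) / (w * h). A compile-time spot check of a few
rows of the table (illustrative only):

typedef char check_64x64[(64 * 64 == 1 << 12) ? 1 : -1]; /* VAR_FN(64, 64, 16, 12) */
typedef char check_32x16[(32 * 16 == 1 << 9) ? 1 : -1];  /* VAR_FN(32, 16, 16, 9) */
typedef char check_8x8[(8 * 8 == 1 << 6) ? 1 : -1];      /* VAR_FN(8, 8, 8, 6) */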

unsigned int vp9_highbd_mse16x16_sse2(const uint8_t *src8, int src_stride,
                                      const uint8_t *ref8, int ref_stride,
                                      unsigned int *sse) {
  int sum;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  highbd_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
                       sse, &sum, vp9_highbd_calc16x16var_sse2, 16);
  return *sse;
}

unsigned int vp9_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride,
                                         const uint8_t *ref8, int ref_stride,
                                         unsigned int *sse) {
  int sum;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
                          sse, &sum, vp9_highbd_calc16x16var_sse2, 16);
  return *sse;
}

unsigned int vp9_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride,
                                         const uint8_t *ref8, int ref_stride,
                                         unsigned int *sse) {
  int sum;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
                          sse, &sum, vp9_highbd_calc16x16var_sse2, 16);
  return *sse;
}

unsigned int vp9_highbd_mse8x8_sse2(const uint8_t *src8, int src_stride,
                                    const uint8_t *ref8, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  highbd_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
                       sse, &sum, vp9_highbd_calc8x8var_sse2, 8);
  return *sse;
}

unsigned int vp9_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride,
                                       const uint8_t *ref8, int ref_stride,
                                       unsigned int *sse) {
  int sum;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
                          sse, &sum, vp9_highbd_calc8x8var_sse2, 8);
  return *sse;
}

unsigned int vp9_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
                                       const uint8_t *ref8, int ref_stride,
                                       unsigned int *sse) {
  int sum;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
                          sse, &sum, vp9_highbd_calc8x8var_sse2, 8);
  return *sse;
}
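
The MSE entry points run the same accumulation but discard the sum and return
the depth-normalized SSE directly. A hypothetical use, not part of this
change, turning the 10-bit 16x16 kernel into a per-block PSNR:

#include <math.h>

static double highbd_10_block_psnr(const uint8_t *src8, int src_stride,
                                   const uint8_t *ref8, int ref_stride) {
  unsigned int sse;
  vp9_highbd_10_mse16x16_sse2(src8, src_stride, ref8, ref_stride, &sse);
  if (sse == 0) return INFINITY;
  /* sse is already rounded back to the 8-bit scale, so 255 is the peak;
   * MSE = sse / 256 for a 16x16 block. */
  return 10.0 * log10((255.0 * 255.0) * (16.0 * 16.0) / (double)sse);
}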

#define DECL(w, opt) \
int vp9_highbd_sub_pixel_variance##w##xh_##opt(const uint16_t *src, \
                                               ptrdiff_t src_stride, \
                                               int x_offset, int y_offset, \
                                               const uint16_t *dst, \
                                               ptrdiff_t dst_stride, \
                                               int height, unsigned int *sse);
#define DECLS(opt1, opt2) \
DECL(8, opt1); \
DECL(16, opt1)

DECLS(sse2, sse);
// DECLS(ssse3, ssse3);
#undef DECLS
#undef DECL
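
Only 8xh and 16xh assembly kernels exist; the FN macro below stitches wider
blocks together from 16-wide columns at offsets 0, 16, 32 and 48. As an
illustration (the helper is hypothetical; the kernel name is the real
DECL(16, sse2) prototype), covering a 64-wide block amounts to:

static int sum_64wide_columns(const uint16_t *src, ptrdiff_t src_stride,
                              int x_offset, int y_offset,
                              const uint16_t *dst, ptrdiff_t dst_stride,
                              int h, unsigned int *sse) {
  unsigned int sse_col;
  int col, se = 0;
  *sse = 0;
  /* One 16-wide kernel call per column; sums and SSEs simply add up. */
  for (col = 0; col < 64; col += 16) {
    se += vp9_highbd_sub_pixel_variance16xh_sse2(src + col, src_stride,
                                                 x_offset, y_offset,
                                                 dst + col, dst_stride,
                                                 h, &sse_col);
    *sse += sse_col;
  }
  return se;
}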

#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
unsigned int \
vp9_highbd_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src8, \
                                               int src_stride, \
                                               int x_offset, int y_offset, \
                                               const uint8_t *dst8, \
                                               int dst_stride, \
                                               unsigned int *sse_ptr) { \
  unsigned int sse; \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
  int se = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
                                                       x_offset, y_offset, \
                                                       dst, dst_stride, h, \
                                                       &sse); \
  if (w > wf) { \
    unsigned int sse2; \
    int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
                                                          src_stride, \
                                                          x_offset, y_offset, \
                                                          dst + 16, \
                                                          dst_stride, \
                                                          h, &sse2); \
    se += se2; \
    sse += sse2; \
    if (w > wf * 2) { \
      se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
                                                        x_offset, y_offset, \
                                                        dst + 32, dst_stride, \
                                                        h, &sse2); \
      se += se2; \
      sse += sse2; \
      se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
                                                        x_offset, y_offset, \
                                                        dst + 48, dst_stride, \
                                                        h, &sse2); \
      se += se2; \
      sse += sse2; \
    } \
  } \
  *sse_ptr = sse; \
  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
} \
unsigned int vp9_highbd_10_sub_pixel_variance##w##x##h##_##opt( \
    const uint8_t *src8, int src_stride, \
    int x_offset, int y_offset, \
    const uint8_t *dst8, int dst_stride, \
    unsigned int *sse_ptr) { \
  unsigned int sse; \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
  int se = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
                                                       x_offset, y_offset, \
                                                       dst, dst_stride, \
                                                       h, &sse); \
  if (w > wf) { \
    unsigned int sse2; \
    int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
                                                          src_stride, \
                                                          x_offset, y_offset, \
                                                          dst + 16, \
                                                          dst_stride, \
                                                          h, &sse2); \
    se += se2; \
    sse += sse2; \
    if (w > wf * 2) { \
      se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
                                                        x_offset, y_offset, \
                                                        dst + 32, dst_stride, \
                                                        h, &sse2); \
      se += se2; \
      sse += sse2; \
      se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
                                                        x_offset, y_offset, \
                                                        dst + 48, dst_stride, \
                                                        h, &sse2); \
      se += se2; \
      sse += sse2; \
    } \
  } \
  se = ROUND_POWER_OF_TWO(se, 2); \
  sse = ROUND_POWER_OF_TWO(sse, 4); \
  *sse_ptr = sse; \
  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
} \
unsigned int vp9_highbd_12_sub_pixel_variance##w##x##h##_##opt( \
    const uint8_t *src8, int src_stride, \
    int x_offset, int y_offset, \
    const uint8_t *dst8, int dst_stride, \
    unsigned int *sse_ptr) { \
  int start_row; \
  unsigned int sse; \
  int se = 0; \
  uint64_t long_sse = 0; \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
  for (start_row = 0; start_row < h; start_row += 16) { \
    unsigned int sse2; \
    int height = h - start_row < 16 ? h - start_row : 16; \
    int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
        src + (start_row * src_stride), src_stride, \
        x_offset, y_offset, dst + (start_row * dst_stride), \
        dst_stride, height, &sse2); \
    se += se2; \
    long_sse += sse2; \
    if (w > wf) { \
      se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
          src + 16 + (start_row * src_stride), src_stride, \
          x_offset, y_offset, dst + 16 + (start_row * dst_stride), \
          dst_stride, height, &sse2); \
      se += se2; \
      long_sse += sse2; \
      if (w > wf * 2) { \
        se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
            src + 32 + (start_row * src_stride), src_stride, \
            x_offset, y_offset, dst + 32 + (start_row * dst_stride), \
            dst_stride, height, &sse2); \
        se += se2; \
        long_sse += sse2; \
        se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
            src + 48 + (start_row * src_stride), src_stride, \
            x_offset, y_offset, dst + 48 + (start_row * dst_stride), \
            dst_stride, height, &sse2); \
        se += se2; \
        long_sse += sse2; \
      } \
    } \
  } \
  se = ROUND_POWER_OF_TWO(se, 4); \
  sse = ROUND_POWER_OF_TWO(long_sse, 8); \
  *sse_ptr = sse; \
  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}

#define FNS(opt1, opt2) \
FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
FN(8, 16, 8, 3, 4, opt1, (int64_t)); \
FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
FN(8, 4, 8, 3, 2, opt1, (int64_t));

FNS(sse2, sse);

#undef FNS
#undef FN
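
The 12-bit flavor is the only one that walks the block in strips of at most
16 rows, accumulating into the 64-bit long_sse. The reason is headroom: each
kernel call keeps its SSE in a 32-bit value, and a 16-wide, 16-row strip of
12-bit data is the largest chunk guaranteed to fit. A compile-time check of
that bound (illustrative arithmetic, not from this change):

#include <stdint.h>

/* Worst case for 12-bit input: |src - dst| <= 4095, so one 16x16 strip
 * contributes at most 256 * 4095 * 4095 = 4292870400, just under 2^32. */
typedef char strip_sse_fits_in_u32[
    ((uint64_t)4095 * 4095 * 16 * 16 <= UINT32_MAX) ? 1 : -1];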

#define DECL(w, opt) \
int vp9_highbd_sub_pixel_avg_variance##w##xh_##opt(const uint16_t *src, \
                                                   ptrdiff_t src_stride, \
                                                   int x_offset, int y_offset, \
                                                   const uint16_t *dst, \
                                                   ptrdiff_t dst_stride, \
                                                   const uint16_t *sec, \
                                                   ptrdiff_t sec_stride, \
                                                   int height, \
                                                   unsigned int *sse);
#define DECLS(opt1) \
DECL(16, opt1) \
DECL(8, opt1)

DECLS(sse2);
#undef DECL
#undef DECLS

#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
unsigned int vp9_highbd_sub_pixel_avg_variance##w##x##h##_##opt( \
    const uint8_t *src8, int src_stride, \
    int x_offset, int y_offset, \
    const uint8_t *dst8, int dst_stride, \
    unsigned int *sse_ptr, \
    const uint8_t *sec8) { \
  unsigned int sse; \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
  uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
  int se = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
      src, src_stride, x_offset, \
      y_offset, dst, dst_stride, \
      sec, w, h, &sse); \
  if (w > wf) { \
    unsigned int sse2; \
    int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
        src + 16, src_stride, \
        x_offset, y_offset, \
        dst + 16, dst_stride, \
        sec + 16, w, h, &sse2); \
    se += se2; \
    sse += sse2; \
    if (w > wf * 2) { \
      se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
          src + 32, src_stride, \
          x_offset, y_offset, \
          dst + 32, dst_stride, \
          sec + 32, w, h, &sse2); \
      se += se2; \
      sse += sse2; \
      se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
          src + 48, src_stride, \
          x_offset, y_offset, \
          dst + 48, dst_stride, \
          sec + 48, w, h, &sse2); \
      se += se2; \
      sse += sse2; \
    } \
  } \
  *sse_ptr = sse; \
  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
} \
unsigned int vp9_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \
    const uint8_t *src8, int src_stride, \
    int x_offset, int y_offset, \
    const uint8_t *dst8, int dst_stride, \
    unsigned int *sse_ptr, \
    const uint8_t *sec8) { \
  unsigned int sse; \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
  uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
  int se = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
      src, src_stride, x_offset, \
      y_offset, dst, dst_stride, \
      sec, w, h, &sse); \
  if (w > wf) { \
    unsigned int sse2; \
    int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
        src + 16, src_stride, \
        x_offset, y_offset, \
        dst + 16, dst_stride, \
        sec + 16, w, h, &sse2); \
    se += se2; \
    sse += sse2; \
    if (w > wf * 2) { \
      se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
          src + 32, src_stride, \
          x_offset, y_offset, \
          dst + 32, dst_stride, \
          sec + 32, w, h, &sse2); \
      se += se2; \
      sse += sse2; \
      se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
          src + 48, src_stride, \
          x_offset, y_offset, \
          dst + 48, dst_stride, \
          sec + 48, w, h, &sse2); \
      se += se2; \
      sse += sse2; \
    } \
  } \
  se = ROUND_POWER_OF_TWO(se, 2); \
  sse = ROUND_POWER_OF_TWO(sse, 4); \
  *sse_ptr = sse; \
  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
} \
unsigned int vp9_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \
    const uint8_t *src8, int src_stride, \
    int x_offset, int y_offset, \
    const uint8_t *dst8, int dst_stride, \
    unsigned int *sse_ptr, \
    const uint8_t *sec8) { \
  int start_row; \
  unsigned int sse; \
  int se = 0; \
  uint64_t long_sse = 0; \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
  uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
  for (start_row = 0; start_row < h; start_row += 16) { \
    unsigned int sse2; \
    int height = h - start_row < 16 ? h - start_row : 16; \
    int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
        src + (start_row * src_stride), src_stride, x_offset, \
        y_offset, dst + (start_row * dst_stride), dst_stride, \
        sec + (start_row * w), w, height, &sse2); \
    se += se2; \
    long_sse += sse2; \
    if (w > wf) { \
      se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
          src + 16 + (start_row * src_stride), src_stride, \
          x_offset, y_offset, \
          dst + 16 + (start_row * dst_stride), dst_stride, \
          sec + 16 + (start_row * w), w, height, &sse2); \
      se += se2; \
      long_sse += sse2; \
      if (w > wf * 2) { \
        se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
            src + 32 + (start_row * src_stride), src_stride, \
            x_offset, y_offset, \
            dst + 32 + (start_row * dst_stride), dst_stride, \
            sec + 32 + (start_row * w), w, height, &sse2); \
        se += se2; \
        long_sse += sse2; \
        se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
            src + 48 + (start_row * src_stride), src_stride, \
            x_offset, y_offset, \
            dst + 48 + (start_row * dst_stride), dst_stride, \
            sec + 48 + (start_row * w), w, height, &sse2); \
        se += se2; \
        long_sse += sse2; \
      } \
    } \
  } \
  se = ROUND_POWER_OF_TWO(se, 4); \
  sse = ROUND_POWER_OF_TWO(long_sse, 8); \
  *sse_ptr = sse; \
  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}

#define FNS(opt1) \
FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
FN(8, 16, 8, 3, 4, opt1, (int64_t)); \
FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
FN(8, 4, 8, 3, 2, opt1, (int64_t));

FNS(sse2);

#undef FNS
#undef FN
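
The avg variants fold in a second predictor (sec), as used by VP9 compound
prediction; sec is laid out contiguously at width w, which is why the 12-bit
loop indexes it as sec + (start_row * w). A hypothetical call site, following
the CONVERT_TO_BYTEPTR convention for passing 16-bit buffers through uint8_t
pointers (x_offset/y_offset are the sub-pel taps in 0..7):

static unsigned int example_compound_var(const uint16_t *src16, int src_stride,
                                         const uint16_t *ref16, int ref_stride,
                                         const uint16_t *pred16,
                                         unsigned int *sse) {
  /* pred16 is the second predictor, 32 samples wide and contiguous. */
  return vp9_highbd_10_sub_pixel_avg_variance32x32_sse2(
      CONVERT_TO_BYTEPTR(src16), src_stride, 3, 2,
      CONVERT_TO_BYTEPTR(ref16), ref_stride, sse,
      CONVERT_TO_BYTEPTR(pred16));
}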

vp9/vp9cx.mk
@@ -24,6 +24,7 @@ VP9_CX_SRCS-yes += encoder/vp9_context_tree.h
VP9_CX_SRCS-yes += encoder/vp9_cost.h
VP9_CX_SRCS-yes += encoder/vp9_cost.c
VP9_CX_SRCS-yes += encoder/vp9_dct.c
VP9_CX_SRCS-yes += encoder/vp9_dct.h
VP9_CX_SRCS-$(CONFIG_VP9_TEMPORAL_DENOISING) += encoder/vp9_denoiser.c
VP9_CX_SRCS-$(CONFIG_VP9_TEMPORAL_DENOISING) += encoder/vp9_denoiser.h
VP9_CX_SRCS-yes += encoder/vp9_encodeframe.c

@@ -101,6 +102,12 @@ VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_sad4d_intrin_avx2.c
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/vp9_sad_sse3.asm
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_sad4d_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_variance_impl_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_quantize_intrin_sse2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c
endif

ifeq ($(CONFIG_USE_X86INC),yes)
VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm
@@ -109,6 +116,11 @@ VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_sad_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_variance_sse2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_subpel_variance.asm
endif
endif

ifeq ($(ARCH_X86_64),yes)
@@ -120,7 +132,9 @@ VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/vp9_sad_sse4.asm
VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt_x86_64.asm

VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.h
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct32x32_sse2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_impl_sse2.c

ifeq ($(CONFIG_VP9_TEMPORAL_DENOISING),yes)
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_denoiser_sse2.c
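
All of the new high-bitdepth objects are fenced behind
CONFIG_VP9_HIGHBITDEPTH; in the usual libvpx workflow that symbol is switched
on at configure time:

# enable the high-bitdepth encoder paths before building
./configure --enable-vp9-highbitdepth
make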

@@ -18,58 +18,56 @@
#include <stdlib.h>
static void once(void (*func)(void))
{
  static CRITICAL_SECTION *lock;
  static LONG waiters;
  static int done;
  void *lock_ptr = &lock;

  /* If the initialization is complete, return early. This isn't just an
   * optimization, it prevents races on the destruction of the global
   * lock.
   */
  if (done)
    return;

  InterlockedIncrement(&waiters);

  /* Get a lock. We create one and try to make it the one-true-lock,
   * throwing it away if we lost the race.
   */

  {
    /* Scope to protect access to new_lock */
    CRITICAL_SECTION *new_lock = malloc(sizeof(CRITICAL_SECTION));
    InitializeCriticalSection(new_lock);
    if (InterlockedCompareExchangePointer(lock_ptr, new_lock, NULL) != NULL)
    {
      DeleteCriticalSection(new_lock);
      free(new_lock);
    }
  }

  /* At this point, we have a lock that can be synchronized on. We don't
   * care which thread actually performed the allocation.
   */

  EnterCriticalSection(lock);

  if (!done) {
    func();
    done = 1;
  }

  LeaveCriticalSection(lock);

  /* Last one out should free resources. The destructed objects are
   * protected by checking if(done) above.
   */
  if (!InterlockedDecrement(&waiters)) {
    DeleteCriticalSection(lock);
    free(lock);
    lock = NULL;
  }
}


@@ -78,25 +76,24 @@ static void once(void (*func)(void))
#include <os2.h>
static void once(void (*func)(void))
{
  static int done;

  /* If the initialization is complete, return early. */
  if (done)
    return;

  /* Causes all other threads in the process to block themselves
   * and give up their time slice.
   */
  DosEnterCritSec();

  if (!done) {
    func();
    done = 1;
  }

  /* Restores normal thread dispatching for the current process. */
  DosExitCritSec();
}


@@ -104,8 +101,8 @@ static void once(void (*func)(void))
#include <pthread.h>
static void once(void (*func)(void))
{
  static pthread_once_t lock = PTHREAD_ONCE_INIT;
  pthread_once(&lock, func);
}


@@ -117,13 +114,12 @@ static void once(void (*func)(void))

static void once(void (*func)(void))
{
  static int done;

  if (!done) {
    func();
    done = 1;
  }
}
#endif
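
Whichever branch is compiled in, callers use once() the same way: the
canonical consumer is the generated RTCD initializer, which must run its
setup exactly one time no matter how many threads race into it. An
illustrative call site (setup_rtcd_internal stands in for the generated
setup routine):

void vp9_rtcd(void) {
  once(setup_rtcd_internal);
}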