Merge "Added high bitdepth sse2 transform functions"
This commit is contained in:
commit
dcb29c1406
@ -264,6 +264,8 @@ typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
|
||||
|
||||
typedef std::tr1::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t> Dct16x16Param;
|
||||
typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t> Ht16x16Param;
|
||||
typedef std::tr1::tuple<IdctFunc, IdctFunc, int, vpx_bit_depth_t>
|
||||
Idct16x16Param;
|
||||
|
||||
void fdct16x16_ref(const int16_t *in, tran_low_t *out, int stride,
|
||||
int /*tx_type*/) {
|
||||
@ -311,7 +313,33 @@ void iht16x16_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
|
||||
void iht16x16_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
|
||||
vp9_highbd_iht16x16_256_add_c(in, out, stride, tx_type, 12);
|
||||
}
|
||||
#endif
|
||||
|
||||
void idct16x16_10_add_10_c(const tran_low_t *in, uint8_t *out, int stride) {
|
||||
vp9_highbd_idct16x16_10_add_c(in, out, stride, 10);
|
||||
}
|
||||
|
||||
void idct16x16_10_add_12_c(const tran_low_t *in, uint8_t *out, int stride) {
|
||||
vp9_highbd_idct16x16_10_add_c(in, out, stride, 12);
|
||||
}
|
||||
|
||||
#if HAVE_SSE2
|
||||
void idct16x16_256_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
|
||||
vp9_highbd_idct16x16_256_add_sse2(in, out, stride, 10);
|
||||
}
|
||||
|
||||
void idct16x16_256_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
|
||||
vp9_highbd_idct16x16_256_add_sse2(in, out, stride, 12);
|
||||
}
|
||||
|
||||
void idct16x16_10_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
|
||||
vp9_highbd_idct16x16_10_add_sse2(in, out, stride, 10);
|
||||
}
|
||||
|
||||
void idct16x16_10_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
|
||||
vp9_highbd_idct16x16_10_add_sse2(in, out, stride, 12);
|
||||
}
|
||||
#endif // HAVE_SSE2
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
||||
class Trans16x16TestBase {
|
||||
public:
|
||||
@ -518,7 +546,7 @@ class Trans16x16TestBase {
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);
|
||||
DECLARE_ALIGNED_ARRAY(16, uint16_t, src16, kNumCoeffs);
|
||||
#endif
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
||||
for (int i = 0; i < count_test_block; ++i) {
|
||||
double out_r[kNumCoeffs];
|
||||
@ -534,13 +562,13 @@ class Trans16x16TestBase {
|
||||
src16[j] = rnd.Rand16() & mask_;
|
||||
dst16[j] = rnd.Rand16() & mask_;
|
||||
in[j] = src16[j] - dst16[j];
|
||||
#endif
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
}
|
||||
}
|
||||
|
||||
reference_16x16_dct_2d(in, out_r);
|
||||
for (int j = 0; j < kNumCoeffs; ++j)
|
||||
coeff[j] = round(out_r[j]);
|
||||
coeff[j] = static_cast<tran_low_t>(round(out_r[j]));
|
||||
|
||||
if (bit_depth_ == VPX_BITS_8) {
|
||||
ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, 16));
|
||||
@ -548,7 +576,7 @@ class Trans16x16TestBase {
|
||||
} else {
|
||||
ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16),
|
||||
16));
|
||||
#endif
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
}
|
||||
|
||||
for (int j = 0; j < kNumCoeffs; ++j) {
|
||||
@ -557,7 +585,7 @@ class Trans16x16TestBase {
|
||||
bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
|
||||
#else
|
||||
const uint32_t diff = dst[j] - src[j];
|
||||
#endif
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
const uint32_t error = diff * diff;
|
||||
EXPECT_GE(1u, error)
|
||||
<< "Error: 16x16 IDCT has error " << error
|
||||
@ -565,6 +593,64 @@ class Trans16x16TestBase {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void CompareInvReference(IdctFunc ref_txfm, int thresh) {
|
||||
ACMRandom rnd(ACMRandom::DeterministicSeed());
|
||||
const int count_test_block = 10000;
|
||||
const int eob = 10;
|
||||
const int16_t *scan = vp9_default_scan_orders[TX_16X16].scan;
|
||||
DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff, kNumCoeffs);
|
||||
DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
|
||||
DECLARE_ALIGNED_ARRAY(16, uint8_t, ref, kNumCoeffs);
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);
|
||||
DECLARE_ALIGNED_ARRAY(16, uint16_t, ref16, kNumCoeffs);
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
||||
for (int i = 0; i < count_test_block; ++i) {
|
||||
for (int j = 0; j < kNumCoeffs; ++j) {
|
||||
if (j < eob) {
|
||||
// Random values less than the threshold, either positive or negative
|
||||
coeff[scan[j]] = rnd(thresh) * (1 - 2 * (i % 2));
|
||||
} else {
|
||||
coeff[scan[j]] = 0;
|
||||
}
|
||||
if (bit_depth_ == VPX_BITS_8) {
|
||||
dst[j] = 0;
|
||||
ref[j] = 0;
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
} else {
|
||||
dst16[j] = 0;
|
||||
ref16[j] = 0;
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
}
|
||||
}
|
||||
if (bit_depth_ == VPX_BITS_8) {
|
||||
ref_txfm(coeff, ref, pitch_);
|
||||
ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
|
||||
} else {
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
ref_txfm(coeff, CONVERT_TO_BYTEPTR(ref16), pitch_);
|
||||
ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16),
|
||||
pitch_));
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
}
|
||||
|
||||
for (int j = 0; j < kNumCoeffs; ++j) {
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
const uint32_t diff =
|
||||
bit_depth_ == VPX_BITS_8 ? dst[j] - ref[j] : dst16[j] - ref16[j];
|
||||
#else
|
||||
const uint32_t diff = dst[j] - ref[j];
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
const uint32_t error = diff * diff;
|
||||
EXPECT_EQ(0u, error)
|
||||
<< "Error: 16x16 IDCT Comparison has error " << error
|
||||
<< " at index " << j;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int pitch_;
|
||||
int tx_type_;
|
||||
vpx_bit_depth_t bit_depth_;
|
||||
@ -590,10 +676,10 @@ class Trans16x16DCT
|
||||
mask_ = (1 << bit_depth_) - 1;
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
switch (bit_depth_) {
|
||||
case 10:
|
||||
case VPX_BITS_10:
|
||||
inv_txfm_ref = idct16x16_10_ref;
|
||||
break;
|
||||
case 12:
|
||||
case VPX_BITS_12:
|
||||
inv_txfm_ref = idct16x16_12_ref;
|
||||
break;
|
||||
default:
|
||||
@ -703,6 +789,37 @@ TEST_P(Trans16x16HT, QuantCheck) {
|
||||
RunQuantCheck(429, 729);
|
||||
}
|
||||
|
||||
class InvTrans16x16DCT
|
||||
: public Trans16x16TestBase,
|
||||
public ::testing::TestWithParam<Idct16x16Param> {
|
||||
public:
|
||||
virtual ~InvTrans16x16DCT() {}
|
||||
|
||||
virtual void SetUp() {
|
||||
ref_txfm_ = GET_PARAM(0);
|
||||
inv_txfm_ = GET_PARAM(1);
|
||||
thresh_ = GET_PARAM(2);
|
||||
bit_depth_ = GET_PARAM(3);
|
||||
pitch_ = 16;
|
||||
mask_ = (1 << bit_depth_) - 1;
|
||||
}
|
||||
virtual void TearDown() { libvpx_test::ClearSystemState(); }
|
||||
|
||||
protected:
|
||||
void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {}
|
||||
void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
|
||||
inv_txfm_(out, dst, stride);
|
||||
}
|
||||
|
||||
IdctFunc ref_txfm_;
|
||||
IdctFunc inv_txfm_;
|
||||
int thresh_;
|
||||
};
|
||||
|
||||
TEST_P(InvTrans16x16DCT, CompareReference) {
|
||||
CompareInvReference(ref_txfm_, thresh_);
|
||||
}
|
||||
|
||||
using std::tr1::make_tuple;
|
||||
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
@ -717,7 +834,7 @@ INSTANTIATE_TEST_CASE_P(
|
||||
C, Trans16x16DCT,
|
||||
::testing::Values(
|
||||
make_tuple(&vp9_fdct16x16_c, &vp9_idct16x16_256_add_c, 0, VPX_BITS_8)));
|
||||
#endif
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
@ -743,7 +860,7 @@ INSTANTIATE_TEST_CASE_P(
|
||||
make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8),
|
||||
make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8),
|
||||
make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3, VPX_BITS_8)));
|
||||
#endif
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
||||
#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
@ -770,7 +887,52 @@ INSTANTIATE_TEST_CASE_P(
|
||||
VPX_BITS_8),
|
||||
make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 3,
|
||||
VPX_BITS_8)));
|
||||
#endif
|
||||
#endif // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
|
||||
|
||||
#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
SSE2, Trans16x16DCT,
|
||||
::testing::Values(
|
||||
make_tuple(&vp9_highbd_fdct16x16_sse2,
|
||||
&idct16x16_10, 0, VPX_BITS_10),
|
||||
make_tuple(&vp9_highbd_fdct16x16_c,
|
||||
&idct16x16_256_add_10_sse2, 0, VPX_BITS_10),
|
||||
make_tuple(&vp9_highbd_fdct16x16_sse2,
|
||||
&idct16x16_12, 0, VPX_BITS_12),
|
||||
make_tuple(&vp9_highbd_fdct16x16_c,
|
||||
&idct16x16_256_add_12_sse2, 0, VPX_BITS_12),
|
||||
make_tuple(&vp9_fdct16x16_sse2,
|
||||
&vp9_idct16x16_256_add_c, 0, VPX_BITS_8)));
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
SSE2, Trans16x16HT,
|
||||
::testing::Values(
|
||||
make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_10, 0, VPX_BITS_10),
|
||||
make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_10, 1, VPX_BITS_10),
|
||||
make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_10, 2, VPX_BITS_10),
|
||||
make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_10, 3, VPX_BITS_10),
|
||||
make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_12, 0, VPX_BITS_12),
|
||||
make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_12, 1, VPX_BITS_12),
|
||||
make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_12, 2, VPX_BITS_12),
|
||||
make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_12, 3, VPX_BITS_12),
|
||||
make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 0, VPX_BITS_8),
|
||||
make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8),
|
||||
make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8),
|
||||
make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 3,
|
||||
VPX_BITS_8)));
|
||||
// Optimizations take effect at a threshold of 3155, so we use a value close to
|
||||
// that to test both branches.
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
SSE2, InvTrans16x16DCT,
|
||||
::testing::Values(
|
||||
make_tuple(&idct16x16_10_add_10_c,
|
||||
&idct16x16_10_add_10_sse2, 3167, VPX_BITS_10),
|
||||
make_tuple(&idct16x16_10,
|
||||
&idct16x16_256_add_10_sse2, 3167, VPX_BITS_10),
|
||||
make_tuple(&idct16x16_10_add_12_c,
|
||||
&idct16x16_10_add_12_sse2, 3167, VPX_BITS_12),
|
||||
make_tuple(&idct16x16_12,
|
||||
&idct16x16_256_add_12_sse2, 3167, VPX_BITS_12)));
|
||||
#endif // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
|
||||
|
||||
#if HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
@ -778,5 +940,5 @@ INSTANTIATE_TEST_CASE_P(
|
||||
::testing::Values(
|
||||
make_tuple(&vp9_fdct16x16_c, &vp9_idct16x16_256_add_ssse3, 0,
|
||||
VPX_BITS_8)));
|
||||
#endif
|
||||
#endif // HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
|
||||
} // namespace
|
||||
|
@ -79,6 +79,10 @@ typedef std::tr1::tuple<FwdTxfmFunc, InvTxfmFunc, int, vpx_bit_depth_t>
|
||||
Trans32x32Param;
|
||||
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
void idct32x32_8(const tran_low_t *in, uint8_t *out, int stride) {
|
||||
vp9_highbd_idct32x32_1024_add_c(in, out, stride, 8);
|
||||
}
|
||||
|
||||
void idct32x32_10(const tran_low_t *in, uint8_t *out, int stride) {
|
||||
vp9_highbd_idct32x32_1024_add_c(in, out, stride, 10);
|
||||
}
|
||||
@ -86,7 +90,7 @@ void idct32x32_10(const tran_low_t *in, uint8_t *out, int stride) {
|
||||
void idct32x32_12(const tran_low_t *in, uint8_t *out, int stride) {
|
||||
vp9_highbd_idct32x32_1024_add_c(in, out, stride, 12);
|
||||
}
|
||||
#endif
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
||||
class Trans32x32Test : public ::testing::TestWithParam<Trans32x32Param> {
|
||||
public:
|
||||
@ -114,7 +118,7 @@ TEST_P(Trans32x32Test, AccuracyCheck) {
|
||||
ACMRandom rnd(ACMRandom::DeterministicSeed());
|
||||
uint32_t max_error = 0;
|
||||
int64_t total_error = 0;
|
||||
const int count_test_block = 1000;
|
||||
const int count_test_block = 10000;
|
||||
DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, kNumCoeffs);
|
||||
DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_temp_block, kNumCoeffs);
|
||||
DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
|
||||
@ -127,7 +131,7 @@ TEST_P(Trans32x32Test, AccuracyCheck) {
|
||||
for (int i = 0; i < count_test_block; ++i) {
|
||||
// Initialize a test block with input range [-mask_, mask_].
|
||||
for (int j = 0; j < kNumCoeffs; ++j) {
|
||||
if (bit_depth_ == 8) {
|
||||
if (bit_depth_ == VPX_BITS_8) {
|
||||
src[j] = rnd.Rand8();
|
||||
dst[j] = rnd.Rand8();
|
||||
test_input_block[j] = src[j] - dst[j];
|
||||
@ -282,7 +286,7 @@ TEST_P(Trans32x32Test, InverseAccuracy) {
|
||||
|
||||
reference_32x32_dct_2d(in, out_r);
|
||||
for (int j = 0; j < kNumCoeffs; ++j)
|
||||
coeff[j] = round(out_r[j]);
|
||||
coeff[j] = static_cast<tran_low_t>(round(out_r[j]));
|
||||
if (bit_depth_ == VPX_BITS_8) {
|
||||
ASM_REGISTER_STATE_CHECK(inv_txfm_(coeff, dst, 32));
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
@ -331,7 +335,7 @@ INSTANTIATE_TEST_CASE_P(
|
||||
&vp9_idct32x32_1024_add_c, 0, VPX_BITS_8),
|
||||
make_tuple(&vp9_fdct32x32_rd_c,
|
||||
&vp9_idct32x32_1024_add_c, 1, VPX_BITS_8)));
|
||||
#endif
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
||||
#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
@ -341,7 +345,7 @@ INSTANTIATE_TEST_CASE_P(
|
||||
&vp9_idct32x32_1024_add_neon, 0, VPX_BITS_8),
|
||||
make_tuple(&vp9_fdct32x32_rd_c,
|
||||
&vp9_idct32x32_1024_add_neon, 1, VPX_BITS_8)));
|
||||
#endif
|
||||
#endif // HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
|
||||
|
||||
#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
@ -351,7 +355,23 @@ INSTANTIATE_TEST_CASE_P(
|
||||
&vp9_idct32x32_1024_add_sse2, 0, VPX_BITS_8),
|
||||
make_tuple(&vp9_fdct32x32_rd_sse2,
|
||||
&vp9_idct32x32_1024_add_sse2, 1, VPX_BITS_8)));
|
||||
#endif
|
||||
#endif // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
|
||||
|
||||
#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
SSE2, Trans32x32Test,
|
||||
::testing::Values(
|
||||
make_tuple(&vp9_highbd_fdct32x32_sse2, &idct32x32_10, 0, VPX_BITS_10),
|
||||
make_tuple(&vp9_highbd_fdct32x32_rd_sse2, &idct32x32_10, 1,
|
||||
VPX_BITS_10),
|
||||
make_tuple(&vp9_highbd_fdct32x32_sse2, &idct32x32_12, 0, VPX_BITS_12),
|
||||
make_tuple(&vp9_highbd_fdct32x32_rd_sse2, &idct32x32_12, 1,
|
||||
VPX_BITS_12),
|
||||
make_tuple(&vp9_fdct32x32_sse2, &vp9_idct32x32_1024_add_c, 0,
|
||||
VPX_BITS_8),
|
||||
make_tuple(&vp9_fdct32x32_rd_sse2, &vp9_idct32x32_1024_add_c, 1,
|
||||
VPX_BITS_8)));
|
||||
#endif // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
|
||||
|
||||
#if HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
@ -361,5 +381,5 @@ INSTANTIATE_TEST_CASE_P(
|
||||
&vp9_idct32x32_1024_add_sse2, 0, VPX_BITS_8),
|
||||
make_tuple(&vp9_fdct32x32_rd_avx2,
|
||||
&vp9_idct32x32_1024_add_sse2, 1, VPX_BITS_8)));
|
||||
#endif
|
||||
#endif // HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
|
||||
} // namespace
|
||||
|
@ -75,7 +75,17 @@ void iwht4x4_10(const tran_low_t *in, uint8_t *out, int stride) {
|
||||
void iwht4x4_12(const tran_low_t *in, uint8_t *out, int stride) {
|
||||
vp9_highbd_iwht4x4_16_add_c(in, out, stride, 12);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if HAVE_SSE2
|
||||
void idct4x4_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
|
||||
vp9_highbd_idct4x4_16_add_sse2(in, out, stride, 10);
|
||||
}
|
||||
|
||||
void idct4x4_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
|
||||
vp9_highbd_idct4x4_16_add_sse2(in, out, stride, 12);
|
||||
}
|
||||
#endif // HAVE_SSE2
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
||||
class Trans4x4TestBase {
|
||||
public:
|
||||
@ -416,7 +426,7 @@ INSTANTIATE_TEST_CASE_P(
|
||||
C, Trans4x4DCT,
|
||||
::testing::Values(
|
||||
make_tuple(&vp9_fdct4x4_c, &vp9_idct4x4_16_add_c, 0, VPX_BITS_8)));
|
||||
#endif
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
@ -442,7 +452,7 @@ INSTANTIATE_TEST_CASE_P(
|
||||
make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8),
|
||||
make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8),
|
||||
make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8)));
|
||||
#endif
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
@ -456,7 +466,7 @@ INSTANTIATE_TEST_CASE_P(
|
||||
C, Trans4x4WHT,
|
||||
::testing::Values(
|
||||
make_tuple(&vp9_fwht4x4_c, &vp9_iwht4x4_16_add_c, 0, VPX_BITS_8)));
|
||||
#endif
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
||||
#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
@ -471,7 +481,7 @@ INSTANTIATE_TEST_CASE_P(
|
||||
make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 1, VPX_BITS_8),
|
||||
make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 2, VPX_BITS_8),
|
||||
make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 3, VPX_BITS_8)));
|
||||
#endif
|
||||
#endif // HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
|
||||
|
||||
#if CONFIG_USE_X86INC && HAVE_MMX && !CONFIG_VP9_HIGHBITDEPTH && \
|
||||
!CONFIG_EMULATE_HARDWARE
|
||||
@ -494,6 +504,33 @@ INSTANTIATE_TEST_CASE_P(
|
||||
make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 1, VPX_BITS_8),
|
||||
make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 2, VPX_BITS_8),
|
||||
make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 3, VPX_BITS_8)));
|
||||
#endif
|
||||
#endif // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
|
||||
|
||||
#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
SSE2, Trans4x4DCT,
|
||||
::testing::Values(
|
||||
make_tuple(&vp9_highbd_fdct4x4_c, &idct4x4_10_sse2, 0, VPX_BITS_10),
|
||||
make_tuple(&vp9_highbd_fdct4x4_sse2, &idct4x4_10_sse2, 0, VPX_BITS_10),
|
||||
make_tuple(&vp9_highbd_fdct4x4_c, &idct4x4_12_sse2, 0, VPX_BITS_12),
|
||||
make_tuple(&vp9_highbd_fdct4x4_sse2, &idct4x4_12_sse2, 0, VPX_BITS_12),
|
||||
make_tuple(&vp9_fdct4x4_sse2, &vp9_idct4x4_16_add_c, 0,
|
||||
VPX_BITS_8)));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
SSE2, Trans4x4HT,
|
||||
::testing::Values(
|
||||
make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_10, 0, VPX_BITS_10),
|
||||
make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_10, 1, VPX_BITS_10),
|
||||
make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_10, 2, VPX_BITS_10),
|
||||
make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_10, 3, VPX_BITS_10),
|
||||
make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_12, 0, VPX_BITS_12),
|
||||
make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_12, 1, VPX_BITS_12),
|
||||
make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_12, 2, VPX_BITS_12),
|
||||
make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_12, 3, VPX_BITS_12),
|
||||
make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8),
|
||||
make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8),
|
||||
make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8),
|
||||
make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8)));
|
||||
#endif // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
|
||||
} // namespace
|
||||
|
@ -71,6 +71,7 @@ typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
|
||||
|
||||
typedef std::tr1::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t> Dct8x8Param;
|
||||
typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t> Ht8x8Param;
|
||||
typedef std::tr1::tuple<IdctFunc, IdctFunc, int, vpx_bit_depth_t> Idct8x8Param;
|
||||
|
||||
void fdct8x8_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) {
|
||||
vp9_fdct8x8_c(in, out, stride);
|
||||
@ -96,7 +97,33 @@ void iht8x8_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
|
||||
void iht8x8_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
|
||||
vp9_highbd_iht8x8_64_add_c(in, out, stride, tx_type, 12);
|
||||
}
|
||||
#endif
|
||||
|
||||
void idct8x8_10_add_10_c(const tran_low_t *in, uint8_t *out, int stride) {
|
||||
vp9_highbd_idct8x8_10_add_c(in, out, stride, 10);
|
||||
}
|
||||
|
||||
void idct8x8_10_add_12_c(const tran_low_t *in, uint8_t *out, int stride) {
|
||||
vp9_highbd_idct8x8_10_add_c(in, out, stride, 12);
|
||||
}
|
||||
|
||||
#if HAVE_SSE2
|
||||
void idct8x8_10_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
|
||||
vp9_highbd_idct8x8_10_add_sse2(in, out, stride, 10);
|
||||
}
|
||||
|
||||
void idct8x8_10_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
|
||||
vp9_highbd_idct8x8_10_add_sse2(in, out, stride, 12);
|
||||
}
|
||||
|
||||
void idct8x8_64_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
|
||||
vp9_highbd_idct8x8_64_add_sse2(in, out, stride, 10);
|
||||
}
|
||||
|
||||
void idct8x8_64_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
|
||||
vp9_highbd_idct8x8_64_add_sse2(in, out, stride, 12);
|
||||
}
|
||||
#endif // HAVE_SSE2
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
||||
class FwdTrans8x8TestBase {
|
||||
public:
|
||||
@ -146,9 +173,10 @@ class FwdTrans8x8TestBase {
|
||||
memset(count_sign_block, 0, sizeof(count_sign_block));
|
||||
|
||||
for (int i = 0; i < count_test_block; ++i) {
|
||||
// Initialize a test block with input range [-15, 15].
|
||||
// Initialize a test block with input range [-mask_ / 16, mask_ / 16].
|
||||
for (int j = 0; j < 64; ++j)
|
||||
test_input_block[j] = (rnd.Rand8() >> 4) - (rnd.Rand8() >> 4);
|
||||
test_input_block[j] = ((rnd.Rand16() & mask_) >> 4) -
|
||||
((rnd.Rand16() & mask_) >> 4);
|
||||
ASM_REGISTER_STATE_CHECK(
|
||||
RunFwdTxfm(test_input_block, test_output_block, pitch_));
|
||||
|
||||
@ -188,7 +216,7 @@ class FwdTrans8x8TestBase {
|
||||
#endif
|
||||
|
||||
for (int i = 0; i < count_test_block; ++i) {
|
||||
// Initialize a test block with input range [-255, 255].
|
||||
// Initialize a test block with input range [-mask_, mask_].
|
||||
for (int j = 0; j < 64; ++j) {
|
||||
if (bit_depth_ == VPX_BITS_8) {
|
||||
src[j] = rnd.Rand8();
|
||||
@ -427,6 +455,63 @@ class FwdTrans8x8TestBase {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void CompareInvReference(IdctFunc ref_txfm, int thresh) {
|
||||
ACMRandom rnd(ACMRandom::DeterministicSeed());
|
||||
const int count_test_block = 10000;
|
||||
const int eob = 12;
|
||||
DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff, kNumCoeffs);
|
||||
DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
|
||||
DECLARE_ALIGNED_ARRAY(16, uint8_t, ref, kNumCoeffs);
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);
|
||||
DECLARE_ALIGNED_ARRAY(16, uint16_t, ref16, kNumCoeffs);
|
||||
#endif
|
||||
const int16_t *scan = vp9_default_scan_orders[TX_8X8].scan;
|
||||
|
||||
for (int i = 0; i < count_test_block; ++i) {
|
||||
for (int j = 0; j < kNumCoeffs; ++j) {
|
||||
if (j < eob) {
|
||||
// Random values less than the threshold, either positive or negative
|
||||
coeff[scan[j]] = rnd(thresh) * (1-2*(i%2));
|
||||
} else {
|
||||
coeff[scan[j]] = 0;
|
||||
}
|
||||
if (bit_depth_ == VPX_BITS_8) {
|
||||
dst[j] = 0;
|
||||
ref[j] = 0;
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
} else {
|
||||
dst16[j] = 0;
|
||||
ref16[j] = 0;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
if (bit_depth_ == VPX_BITS_8) {
|
||||
ref_txfm(coeff, ref, pitch_);
|
||||
ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
} else {
|
||||
ref_txfm(coeff, CONVERT_TO_BYTEPTR(ref16), pitch_);
|
||||
ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16),
|
||||
pitch_));
|
||||
#endif
|
||||
}
|
||||
|
||||
for (int j = 0; j < kNumCoeffs; ++j) {
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
const uint32_t diff =
|
||||
bit_depth_ == VPX_BITS_8 ? dst[j] - ref[j] : dst16[j] - ref16[j];
|
||||
#else
|
||||
const uint32_t diff = dst[j] - ref[j];
|
||||
#endif
|
||||
const uint32_t error = diff * diff;
|
||||
EXPECT_EQ(0u, error)
|
||||
<< "Error: 8x8 IDCT has error " << error
|
||||
<< " at index " << j;
|
||||
}
|
||||
}
|
||||
}
|
||||
int pitch_;
|
||||
int tx_type_;
|
||||
FhtFunc fwd_txfm_ref;
|
||||
@ -526,6 +611,38 @@ TEST_P(FwdTrans8x8HT, ExtremalCheck) {
|
||||
RunExtremalCheck();
|
||||
}
|
||||
|
||||
class InvTrans8x8DCT
|
||||
: public FwdTrans8x8TestBase,
|
||||
public ::testing::TestWithParam<Idct8x8Param> {
|
||||
public:
|
||||
virtual ~InvTrans8x8DCT() {}
|
||||
|
||||
virtual void SetUp() {
|
||||
ref_txfm_ = GET_PARAM(0);
|
||||
inv_txfm_ = GET_PARAM(1);
|
||||
thresh_ = GET_PARAM(2);
|
||||
pitch_ = 8;
|
||||
bit_depth_ = GET_PARAM(3);
|
||||
mask_ = (1 << bit_depth_) - 1;
|
||||
}
|
||||
|
||||
virtual void TearDown() { libvpx_test::ClearSystemState(); }
|
||||
|
||||
protected:
|
||||
void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
|
||||
inv_txfm_(out, dst, stride);
|
||||
}
|
||||
void RunFwdTxfm(int16_t *out, tran_low_t *dst, int stride) {}
|
||||
|
||||
IdctFunc ref_txfm_;
|
||||
IdctFunc inv_txfm_;
|
||||
int thresh_;
|
||||
};
|
||||
|
||||
TEST_P(InvTrans8x8DCT, CompareReference) {
|
||||
CompareInvReference(ref_txfm_, thresh_);
|
||||
}
|
||||
|
||||
using std::tr1::make_tuple;
|
||||
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
@ -540,7 +657,7 @@ INSTANTIATE_TEST_CASE_P(
|
||||
C, FwdTrans8x8DCT,
|
||||
::testing::Values(
|
||||
make_tuple(&vp9_fdct8x8_c, &vp9_idct8x8_64_add_c, 0, VPX_BITS_8)));
|
||||
#endif
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
@ -566,7 +683,7 @@ INSTANTIATE_TEST_CASE_P(
|
||||
make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 1, VPX_BITS_8),
|
||||
make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 2, VPX_BITS_8),
|
||||
make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 3, VPX_BITS_8)));
|
||||
#endif
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
||||
#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
@ -581,7 +698,7 @@ INSTANTIATE_TEST_CASE_P(
|
||||
make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 1, VPX_BITS_8),
|
||||
make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 2, VPX_BITS_8),
|
||||
make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 3, VPX_BITS_8)));
|
||||
#endif
|
||||
#endif // HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
|
||||
|
||||
#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
@ -596,7 +713,45 @@ INSTANTIATE_TEST_CASE_P(
|
||||
make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 1, VPX_BITS_8),
|
||||
make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 2, VPX_BITS_8),
|
||||
make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 3, VPX_BITS_8)));
|
||||
#endif
|
||||
#endif // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
|
||||
|
||||
#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
SSE2, FwdTrans8x8DCT,
|
||||
::testing::Values(
|
||||
make_tuple(&vp9_highbd_fdct8x8_c,
|
||||
&idct8x8_64_add_10_sse2, 12, VPX_BITS_10),
|
||||
make_tuple(&vp9_highbd_fdct8x8_sse2,
|
||||
&idct8x8_64_add_10_sse2, 12, VPX_BITS_10),
|
||||
make_tuple(&vp9_highbd_fdct8x8_c,
|
||||
&idct8x8_64_add_12_sse2, 12, VPX_BITS_12),
|
||||
make_tuple(&vp9_highbd_fdct8x8_sse2,
|
||||
&idct8x8_64_add_12_sse2, 12, VPX_BITS_12),
|
||||
make_tuple(&vp9_fdct8x8_sse2, &vp9_idct8x8_64_add_c, 0, VPX_BITS_8)));
|
||||
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
SSE2, FwdTrans8x8HT,
|
||||
::testing::Values(
|
||||
make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8),
|
||||
make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 1, VPX_BITS_8),
|
||||
make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 2, VPX_BITS_8),
|
||||
make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 3, VPX_BITS_8)));
|
||||
|
||||
// Optimizations take effect at a threshold of 6201, so we use a value close to
|
||||
// that to test both branches.
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
SSE2, InvTrans8x8DCT,
|
||||
::testing::Values(
|
||||
make_tuple(&idct8x8_10_add_10_c,
|
||||
&idct8x8_10_add_10_sse2, 6225, VPX_BITS_10),
|
||||
make_tuple(&idct8x8_10,
|
||||
&idct8x8_64_add_10_sse2, 6225, VPX_BITS_10),
|
||||
make_tuple(&idct8x8_10_add_12_c,
|
||||
&idct8x8_10_add_12_sse2, 6225, VPX_BITS_12),
|
||||
make_tuple(&idct8x8_12,
|
||||
&idct8x8_64_add_12_sse2, 6225, VPX_BITS_12)));
|
||||
#endif // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
|
||||
|
||||
#if HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_VP9_HIGHBITDEPTH && \
|
||||
!CONFIG_EMULATE_HARDWARE
|
||||
|
@ -112,9 +112,6 @@ typedef struct {
|
||||
// Common for both INTER and INTRA blocks
|
||||
BLOCK_SIZE sb_type;
|
||||
PREDICTION_MODE mode;
|
||||
#if CONFIG_FILTERINTRA
|
||||
int filterbit, uv_filterbit;
|
||||
#endif
|
||||
TX_SIZE tx_size;
|
||||
int8_t skip;
|
||||
int8_t segment_id;
|
||||
@ -130,17 +127,11 @@ typedef struct {
|
||||
uint8_t mode_context[MAX_REF_FRAMES];
|
||||
INTERP_FILTER interp_filter;
|
||||
|
||||
#if CONFIG_EXT_TX
|
||||
EXT_TX_TYPE ext_txfrm;
|
||||
#endif
|
||||
} MB_MODE_INFO;
|
||||
|
||||
typedef struct MODE_INFO {
|
||||
struct MODE_INFO *src_mi;
|
||||
MB_MODE_INFO mbmi;
|
||||
#if CONFIG_FILTERINTRA
|
||||
int b_filter_info[4];
|
||||
#endif
|
||||
b_mode_info bmi[4];
|
||||
} MODE_INFO;
|
||||
|
||||
@ -149,17 +140,6 @@ static INLINE PREDICTION_MODE get_y_mode(const MODE_INFO *mi, int block) {
|
||||
: mi->mbmi.mode;
|
||||
}
|
||||
|
||||
#if CONFIG_FILTERINTRA
|
||||
static INLINE int is_filter_allowed(PREDICTION_MODE mode) {
|
||||
(void)mode;
|
||||
return 1;
|
||||
}
|
||||
|
||||
static INLINE int is_filter_enabled(TX_SIZE txsize) {
|
||||
return (txsize < TX_SIZES);
|
||||
}
|
||||
#endif
|
||||
|
||||
static INLINE int is_inter_block(const MB_MODE_INFO *mbmi) {
|
||||
return mbmi->ref_frame[0] > INTRA_FRAME;
|
||||
}
|
||||
@ -257,33 +237,13 @@ static INLINE BLOCK_SIZE get_subsize(BLOCK_SIZE bsize,
|
||||
|
||||
extern const TX_TYPE intra_mode_to_tx_type_lookup[INTRA_MODES];
|
||||
|
||||
#if CONFIG_EXT_TX
|
||||
static TX_TYPE ext_tx_to_txtype(EXT_TX_TYPE ext_tx) {
|
||||
switch (ext_tx) {
|
||||
case NORM:
|
||||
default:
|
||||
return DCT_DCT;
|
||||
case ALT:
|
||||
return ADST_ADST;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
static INLINE TX_TYPE get_tx_type(PLANE_TYPE plane_type,
|
||||
const MACROBLOCKD *xd) {
|
||||
const MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
|
||||
|
||||
#if CONFIG_EXT_TX
|
||||
if (plane_type != PLANE_TYPE_Y || xd->lossless)
|
||||
return DCT_DCT;
|
||||
|
||||
if (is_inter_block(mbmi)) {
|
||||
return ext_tx_to_txtype(mbmi->ext_txfrm);
|
||||
}
|
||||
#else
|
||||
if (plane_type != PLANE_TYPE_Y || xd->lossless || is_inter_block(mbmi))
|
||||
return DCT_DCT;
|
||||
#endif
|
||||
|
||||
return intra_mode_to_tx_type_lookup[mbmi->mode];
|
||||
}
|
||||
|
||||
@ -291,17 +251,8 @@ static INLINE TX_TYPE get_tx_type_4x4(PLANE_TYPE plane_type,
|
||||
const MACROBLOCKD *xd, int ib) {
|
||||
const MODE_INFO *const mi = xd->mi[0].src_mi;
|
||||
|
||||
#if CONFIG_EXT_TX
|
||||
if (plane_type != PLANE_TYPE_Y || xd->lossless)
|
||||
return DCT_DCT;
|
||||
|
||||
if (is_inter_block(&mi->mbmi)) {
|
||||
return ext_tx_to_txtype(mi->mbmi.ext_txfrm);
|
||||
}
|
||||
#else
|
||||
if (plane_type != PLANE_TYPE_Y || xd->lossless || is_inter_block(&mi->mbmi))
|
||||
return DCT_DCT;
|
||||
#endif
|
||||
|
||||
return intra_mode_to_tx_type_lookup[get_y_mode(mi, ib)];
|
||||
}
|
||||
|
@ -750,27 +750,12 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
|
||||
add_proto qw/void vp9_highbd_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
|
||||
specialize qw/vp9_highbd_idct4x4_1_add/;
|
||||
|
||||
add_proto qw/void vp9_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
|
||||
specialize qw/vp9_highbd_idct4x4_16_add/;
|
||||
|
||||
add_proto qw/void vp9_highbd_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
|
||||
specialize qw/vp9_highbd_idct8x8_1_add/;
|
||||
|
||||
add_proto qw/void vp9_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
|
||||
specialize qw/vp9_highbd_idct8x8_64_add/;
|
||||
|
||||
add_proto qw/void vp9_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
|
||||
specialize qw/vp9_highbd_idct8x8_10_add/;
|
||||
|
||||
add_proto qw/void vp9_highbd_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
|
||||
specialize qw/vp9_highbd_idct16x16_1_add/;
|
||||
|
||||
add_proto qw/void vp9_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
|
||||
specialize qw/vp9_highbd_idct16x16_256_add/;
|
||||
|
||||
add_proto qw/void vp9_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
|
||||
specialize qw/vp9_highbd_idct16x16_10_add/;
|
||||
|
||||
add_proto qw/void vp9_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
|
||||
specialize qw/vp9_highbd_idct32x32_1024_add/;
|
||||
|
||||
@ -796,6 +781,42 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
|
||||
|
||||
add_proto qw/void vp9_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
|
||||
specialize qw/vp9_highbd_iwht4x4_16_add/;
|
||||
|
||||
# Force C versions if CONFIG_EMULATE_HARDWARE is 1
|
||||
if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
|
||||
|
||||
add_proto qw/void vp9_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
|
||||
specialize qw/vp9_highbd_idct4x4_16_add/;
|
||||
|
||||
add_proto qw/void vp9_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
|
||||
specialize qw/vp9_highbd_idct8x8_64_add/;
|
||||
|
||||
add_proto qw/void vp9_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
|
||||
specialize qw/vp9_highbd_idct8x8_10_add/;
|
||||
|
||||
add_proto qw/void vp9_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
|
||||
specialize qw/vp9_highbd_idct16x16_256_add/;
|
||||
|
||||
add_proto qw/void vp9_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
|
||||
specialize qw/vp9_highbd_idct16x16_10_add/;
|
||||
|
||||
} else {
|
||||
|
||||
add_proto qw/void vp9_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
|
||||
specialize qw/vp9_highbd_idct4x4_16_add sse2/;
|
||||
|
||||
add_proto qw/void vp9_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
|
||||
specialize qw/vp9_highbd_idct8x8_64_add sse2/;
|
||||
|
||||
add_proto qw/void vp9_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
|
||||
specialize qw/vp9_highbd_idct8x8_10_add sse2/;
|
||||
|
||||
add_proto qw/void vp9_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
|
||||
specialize qw/vp9_highbd_idct16x16_256_add sse2/;
|
||||
|
||||
add_proto qw/void vp9_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
|
||||
specialize qw/vp9_highbd_idct16x16_10_add sse2/;
|
||||
}
|
||||
}
|
||||
|
||||
#
|
||||
@ -1184,43 +1205,43 @@ if (vpx_config("CONFIG_INTERNAL_STATS") eq "yes") {
|
||||
|
||||
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
|
||||
add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
|
||||
specialize qw/vp9_fht4x4/;
|
||||
specialize qw/vp9_fht4x4 sse2/;
|
||||
|
||||
add_proto qw/void vp9_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
|
||||
specialize qw/vp9_fht8x8/;
|
||||
specialize qw/vp9_fht8x8 sse2/;
|
||||
|
||||
add_proto qw/void vp9_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
|
||||
specialize qw/vp9_fht16x16/;
|
||||
specialize qw/vp9_fht16x16 sse2/;
|
||||
|
||||
add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_fwht4x4/;
|
||||
specialize qw/vp9_fwht4x4/, "$mmx_x86inc";
|
||||
|
||||
add_proto qw/void vp9_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_fdct4x4_1/;
|
||||
specialize qw/vp9_fdct4x4_1 sse2/;
|
||||
|
||||
add_proto qw/void vp9_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_fdct4x4/;
|
||||
specialize qw/vp9_fdct4x4 sse2/;
|
||||
|
||||
add_proto qw/void vp9_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_fdct8x8_1/;
|
||||
specialize qw/vp9_fdct8x8_1 sse2/;
|
||||
|
||||
add_proto qw/void vp9_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_fdct8x8/;
|
||||
specialize qw/vp9_fdct8x8 sse2/;
|
||||
|
||||
add_proto qw/void vp9_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_fdct16x16_1/;
|
||||
specialize qw/vp9_fdct16x16_1 sse2/;
|
||||
|
||||
add_proto qw/void vp9_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_fdct16x16/;
|
||||
specialize qw/vp9_fdct16x16 sse2/;
|
||||
|
||||
add_proto qw/void vp9_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_fdct32x32_1/;
|
||||
specialize qw/vp9_fdct32x32_1 sse2/;
|
||||
|
||||
add_proto qw/void vp9_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_fdct32x32/;
|
||||
specialize qw/vp9_fdct32x32 sse2/;
|
||||
|
||||
add_proto qw/void vp9_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_fdct32x32_rd/;
|
||||
specialize qw/vp9_fdct32x32_rd sse2/;
|
||||
} else {
|
||||
add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
|
||||
specialize qw/vp9_fht4x4 sse2/;
|
||||
@ -1882,40 +1903,40 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
|
||||
|
||||
# fdct functions
|
||||
add_proto qw/void vp9_highbd_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
|
||||
specialize qw/vp9_highbd_fht4x4/;
|
||||
specialize qw/vp9_highbd_fht4x4 sse2/;
|
||||
|
||||
add_proto qw/void vp9_highbd_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
|
||||
specialize qw/vp9_highbd_fht8x8/;
|
||||
specialize qw/vp9_highbd_fht8x8 sse2/;
|
||||
|
||||
add_proto qw/void vp9_highbd_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
|
||||
specialize qw/vp9_highbd_fht16x16/;
|
||||
specialize qw/vp9_highbd_fht16x16 sse2/;
|
||||
|
||||
add_proto qw/void vp9_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_highbd_fwht4x4/;
|
||||
|
||||
add_proto qw/void vp9_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_highbd_fdct4x4/;
|
||||
specialize qw/vp9_highbd_fdct4x4 sse2/;
|
||||
|
||||
add_proto qw/void vp9_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_highbd_fdct8x8_1/;
|
||||
|
||||
add_proto qw/void vp9_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_highbd_fdct8x8/;
|
||||
specialize qw/vp9_highbd_fdct8x8 sse2/;
|
||||
|
||||
add_proto qw/void vp9_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_highbd_fdct16x16_1/;
|
||||
|
||||
add_proto qw/void vp9_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_highbd_fdct16x16/;
|
||||
specialize qw/vp9_highbd_fdct16x16 sse2/;
|
||||
|
||||
add_proto qw/void vp9_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_highbd_fdct32x32_1/;
|
||||
|
||||
add_proto qw/void vp9_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_highbd_fdct32x32/;
|
||||
specialize qw/vp9_highbd_fdct32x32 sse2/;
|
||||
|
||||
add_proto qw/void vp9_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_highbd_fdct32x32_rd/;
|
||||
specialize qw/vp9_highbd_fdct32x32_rd sse2/;
|
||||
|
||||
add_proto qw/void vp9_highbd_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
|
||||
specialize qw/vp9_highbd_temporal_filter_apply/;
|
||||
|
@ -9,6 +9,7 @@
|
||||
*/
|
||||
|
||||
#include "vp9/common/x86/vp9_idct_intrin_sse2.h"
|
||||
#include "vp9/common/vp9_idct.h"
|
||||
|
||||
#define RECON_AND_STORE4X4(dest, in_x) \
|
||||
{ \
|
||||
@ -3985,3 +3986,573 @@ void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
|
||||
dest += 8 - (stride * 32);
|
||||
}
|
||||
}
|
||||
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {
|
||||
__m128i ubounded, retval;
|
||||
const __m128i zero = _mm_set1_epi16(0);
|
||||
const __m128i one = _mm_set1_epi16(1);
|
||||
const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one);
|
||||
ubounded = _mm_cmpgt_epi16(value, max);
|
||||
retval = _mm_andnot_si128(ubounded, value);
|
||||
ubounded = _mm_and_si128(ubounded, max);
|
||||
retval = _mm_or_si128(retval, ubounded);
|
||||
retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero));
|
||||
return retval;
|
||||
}
|
||||
|
||||
void vp9_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8,
|
||||
int stride, int bd) {
|
||||
tran_low_t out[4 * 4];
|
||||
tran_low_t *outptr = out;
|
||||
int i, j;
|
||||
__m128i inptr[4];
|
||||
__m128i sign_bits[2];
|
||||
__m128i temp_mm, min_input, max_input;
|
||||
int test;
|
||||
uint16_t * dest = CONVERT_TO_SHORTPTR(dest8);
|
||||
int optimised_cols = 0;
|
||||
const __m128i zero = _mm_set1_epi16(0);
|
||||
const __m128i eight = _mm_set1_epi16(8);
|
||||
const __m128i max = _mm_set1_epi16(12043);
|
||||
const __m128i min = _mm_set1_epi16(-12043);
|
||||
// Load input into __m128i
|
||||
inptr[0] = _mm_loadu_si128((const __m128i *)input);
|
||||
inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4));
|
||||
inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8));
|
||||
inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12));
|
||||
|
||||
// Pack to 16 bits
|
||||
inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]);
|
||||
inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]);
|
||||
|
||||
max_input = _mm_max_epi16(inptr[0], inptr[1]);
|
||||
min_input = _mm_min_epi16(inptr[0], inptr[1]);
|
||||
max_input = _mm_cmpgt_epi16(max_input, max);
|
||||
min_input = _mm_cmplt_epi16(min_input, min);
|
||||
temp_mm = _mm_or_si128(max_input, min_input);
|
||||
test = _mm_movemask_epi8(temp_mm);
|
||||
|
||||
if (!test) {
|
||||
// Do the row transform
|
||||
idct4_sse2(inptr);
|
||||
|
||||
// Check the min & max values
|
||||
max_input = _mm_max_epi16(inptr[0], inptr[1]);
|
||||
min_input = _mm_min_epi16(inptr[0], inptr[1]);
|
||||
max_input = _mm_cmpgt_epi16(max_input, max);
|
||||
min_input = _mm_cmplt_epi16(min_input, min);
|
||||
temp_mm = _mm_or_si128(max_input, min_input);
|
||||
test = _mm_movemask_epi8(temp_mm);
|
||||
|
||||
if (test) {
|
||||
transpose_4x4(inptr);
|
||||
sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero);
|
||||
sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero);
|
||||
inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]);
|
||||
inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]);
|
||||
inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]);
|
||||
inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]);
|
||||
_mm_storeu_si128((__m128i*)outptr, inptr[0]);
|
||||
_mm_storeu_si128((__m128i*)(outptr + 4), inptr[1]);
|
||||
_mm_storeu_si128((__m128i*)(outptr + 8), inptr[2]);
|
||||
_mm_storeu_si128((__m128i*)(outptr + 12), inptr[3]);
|
||||
} else {
|
||||
// Set to use the optimised transform for the column
|
||||
optimised_cols = 1;
|
||||
}
|
||||
} else {
|
||||
// Run the un-optimised row transform
|
||||
for (i = 0; i < 4; ++i) {
|
||||
vp9_highbd_idct4(input, outptr, bd);
|
||||
input += 4;
|
||||
outptr += 4;
|
||||
}
|
||||
}
|
||||
|
||||
if (optimised_cols) {
|
||||
idct4_sse2(inptr);
|
||||
|
||||
// Final round and shift
|
||||
inptr[0] = _mm_add_epi16(inptr[0], eight);
|
||||
inptr[1] = _mm_add_epi16(inptr[1], eight);
|
||||
|
||||
inptr[0] = _mm_srai_epi16(inptr[0], 4);
|
||||
inptr[1] = _mm_srai_epi16(inptr[1], 4);
|
||||
|
||||
// Reconstruction and Store
|
||||
{
|
||||
__m128i d0 = _mm_loadl_epi64((const __m128i *)dest);
|
||||
__m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));
|
||||
d0 = _mm_unpacklo_epi64(d0,
|
||||
_mm_loadl_epi64((const __m128i *)(dest + stride)));
|
||||
d2 = _mm_unpacklo_epi64(d2,
|
||||
_mm_loadl_epi64((const __m128i *)(dest + stride * 3)));
|
||||
d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd);
|
||||
d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd);
|
||||
// store input0
|
||||
_mm_storel_epi64((__m128i *)dest, d0);
|
||||
// store input1
|
||||
d0 = _mm_srli_si128(d0, 8);
|
||||
_mm_storel_epi64((__m128i *)(dest + stride), d0);
|
||||
// store input2
|
||||
_mm_storel_epi64((__m128i *)(dest + stride * 2), d2);
|
||||
// store input3
|
||||
d2 = _mm_srli_si128(d2, 8);
|
||||
_mm_storel_epi64((__m128i *)(dest + stride * 3), d2);
|
||||
}
|
||||
} else {
|
||||
// Run the un-optimised column transform
|
||||
tran_low_t temp_in[4], temp_out[4];
|
||||
// Columns
|
||||
for (i = 0; i < 4; ++i) {
|
||||
for (j = 0; j < 4; ++j)
|
||||
temp_in[j] = out[j * 4 + i];
|
||||
vp9_highbd_idct4(temp_in, temp_out, bd);
|
||||
for (j = 0; j < 4; ++j) {
|
||||
dest[j * stride + i] = highbd_clip_pixel_add(
|
||||
dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void vp9_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8,
|
||||
int stride, int bd) {
|
||||
tran_low_t out[8 * 8];
|
||||
tran_low_t *outptr = out;
|
||||
int i, j, test;
|
||||
__m128i inptr[8];
|
||||
__m128i min_input, max_input, temp1, temp2, sign_bits;
|
||||
uint16_t * dest = CONVERT_TO_SHORTPTR(dest8);
|
||||
const __m128i zero = _mm_set1_epi16(0);
|
||||
const __m128i sixteen = _mm_set1_epi16(16);
|
||||
const __m128i max = _mm_set1_epi16(6201);
|
||||
const __m128i min = _mm_set1_epi16(-6201);
|
||||
int optimised_cols = 0;
|
||||
|
||||
// Load input into __m128i & pack to 16 bits
|
||||
for (i = 0; i < 8; i++) {
|
||||
temp1 = _mm_loadu_si128((const __m128i *)(input + 8*i));
|
||||
temp2 = _mm_loadu_si128((const __m128i *)(input + 8*i + 4));
|
||||
inptr[i] = _mm_packs_epi32(temp1, temp2);
|
||||
}
|
||||
|
||||
// Find the min & max for the row transform
|
||||
max_input = _mm_max_epi16(inptr[0], inptr[1]);
|
||||
min_input = _mm_min_epi16(inptr[0], inptr[1]);
|
||||
for (i = 2; i < 8; i++) {
|
||||
max_input = _mm_max_epi16(max_input, inptr[i]);
|
||||
min_input = _mm_min_epi16(min_input, inptr[i]);
|
||||
}
|
||||
max_input = _mm_cmpgt_epi16(max_input, max);
|
||||
min_input = _mm_cmplt_epi16(min_input, min);
|
||||
temp1 = _mm_or_si128(max_input, min_input);
|
||||
test = _mm_movemask_epi8(temp1);
|
||||
|
||||
if (!test) {
|
||||
// Do the row transform
|
||||
idct8_sse2(inptr);
|
||||
|
||||
// Find the min & max for the column transform
|
||||
max_input = _mm_max_epi16(inptr[0], inptr[1]);
|
||||
min_input = _mm_min_epi16(inptr[0], inptr[1]);
|
||||
for (i = 2; i < 8; i++) {
|
||||
max_input = _mm_max_epi16(max_input, inptr[i]);
|
||||
min_input = _mm_min_epi16(min_input, inptr[i]);
|
||||
}
|
||||
max_input = _mm_cmpgt_epi16(max_input, max);
|
||||
min_input = _mm_cmplt_epi16(min_input, min);
|
||||
temp1 = _mm_or_si128(max_input, min_input);
|
||||
test = _mm_movemask_epi8(temp1);
|
||||
|
||||
if (test) {
|
||||
array_transpose_8x8(inptr, inptr);
|
||||
for (i = 0; i < 8; i++) {
|
||||
sign_bits = _mm_cmplt_epi16(inptr[i], zero);
|
||||
temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
|
||||
temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
|
||||
_mm_storeu_si128((__m128i*)(outptr + 4*(2*i+1)), temp1);
|
||||
_mm_storeu_si128((__m128i*)(outptr + 4*(2*i)), temp2);
|
||||
}
|
||||
} else {
|
||||
// Set to use the optimised transform for the column
|
||||
optimised_cols = 1;
|
||||
}
|
||||
} else {
|
||||
// Run the un-optimised row transform
|
||||
for (i = 0; i < 8; ++i) {
|
||||
vp9_highbd_idct8(input, outptr, bd);
|
||||
input += 8;
|
||||
outptr += 8;
|
||||
}
|
||||
}
|
||||
|
||||
if (optimised_cols) {
|
||||
idct8_sse2(inptr);
|
||||
|
||||
// Final round & shift and Reconstruction and Store
|
||||
{
|
||||
__m128i d[8];
|
||||
for (i = 0; i < 8; i++) {
|
||||
inptr[i] = _mm_add_epi16(inptr[i], sixteen);
|
||||
d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
|
||||
inptr[i] = _mm_srai_epi16(inptr[i], 5);
|
||||
d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
|
||||
// Store
|
||||
_mm_storeu_si128((__m128i *)(dest + stride*i), d[i]);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Run the un-optimised column transform
|
||||
tran_low_t temp_in[8], temp_out[8];
|
||||
for (i = 0; i < 8; ++i) {
|
||||
for (j = 0; j < 8; ++j)
|
||||
temp_in[j] = out[j * 8 + i];
|
||||
vp9_highbd_idct8(temp_in, temp_out, bd);
|
||||
for (j = 0; j < 8; ++j) {
|
||||
dest[j * stride + i] = highbd_clip_pixel_add(
|
||||
dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void vp9_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
|
||||
int stride, int bd) {
|
||||
tran_low_t out[8 * 8] = { 0 };
|
||||
tran_low_t *outptr = out;
|
||||
int i, j, test;
|
||||
__m128i inptr[8];
|
||||
__m128i min_input, max_input, temp1, temp2, sign_bits;
|
||||
uint16_t * dest = CONVERT_TO_SHORTPTR(dest8);
|
||||
const __m128i zero = _mm_set1_epi16(0);
|
||||
const __m128i sixteen = _mm_set1_epi16(16);
|
||||
const __m128i max = _mm_set1_epi16(6201);
|
||||
const __m128i min = _mm_set1_epi16(-6201);
|
||||
int optimised_cols = 0;
|
||||
|
||||
// Load input into __m128i & pack to 16 bits
|
||||
for (i = 0; i < 8; i++) {
|
||||
temp1 = _mm_loadu_si128((const __m128i *)(input + 8*i));
|
||||
temp2 = _mm_loadu_si128((const __m128i *)(input + 8*i + 4));
|
||||
inptr[i] = _mm_packs_epi32(temp1, temp2);
|
||||
}
|
||||
|
||||
// Find the min & max for the row transform
|
||||
// only first 4 row has non-zero coefs
|
||||
max_input = _mm_max_epi16(inptr[0], inptr[1]);
|
||||
min_input = _mm_min_epi16(inptr[0], inptr[1]);
|
||||
for (i = 2; i < 4; i++) {
|
||||
max_input = _mm_max_epi16(max_input, inptr[i]);
|
||||
min_input = _mm_min_epi16(min_input, inptr[i]);
|
||||
}
|
||||
max_input = _mm_cmpgt_epi16(max_input, max);
|
||||
min_input = _mm_cmplt_epi16(min_input, min);
|
||||
temp1 = _mm_or_si128(max_input, min_input);
|
||||
test = _mm_movemask_epi8(temp1);
|
||||
|
||||
if (!test) {
|
||||
// Do the row transform
|
||||
idct8_sse2(inptr);
|
||||
|
||||
// Find the min & max for the column transform
|
||||
// N.B. Only first 4 cols contain non-zero coeffs
|
||||
max_input = _mm_max_epi16(inptr[0], inptr[1]);
|
||||
min_input = _mm_min_epi16(inptr[0], inptr[1]);
|
||||
for (i = 2; i < 4; i++) {
|
||||
max_input = _mm_max_epi16(max_input, inptr[i]);
|
||||
min_input = _mm_min_epi16(min_input, inptr[i]);
|
||||
}
|
||||
max_input = _mm_cmpgt_epi16(max_input, max);
|
||||
min_input = _mm_cmplt_epi16(min_input, min);
|
||||
temp1 = _mm_or_si128(max_input, min_input);
|
||||
test = _mm_movemask_epi8(temp1);
|
||||
|
||||
if (test) {
|
||||
// Use fact only first 4 rows contain non-zero coeffs
|
||||
array_transpose_4X8(inptr, inptr);
|
||||
for (i = 0; i < 4; i++) {
|
||||
sign_bits = _mm_cmplt_epi16(inptr[i], zero);
|
||||
temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
|
||||
temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
|
||||
_mm_storeu_si128((__m128i*)(outptr + 4*(2*i+1)), temp1);
|
||||
_mm_storeu_si128((__m128i*)(outptr + 4*(2*i)), temp2);
|
||||
}
|
||||
} else {
|
||||
// Set to use the optimised transform for the column
|
||||
optimised_cols = 1;
|
||||
}
|
||||
} else {
|
||||
// Run the un-optimised row transform
|
||||
for (i = 0; i < 4; ++i) {
|
||||
vp9_highbd_idct8(input, outptr, bd);
|
||||
input += 8;
|
||||
outptr += 8;
|
||||
}
|
||||
}
|
||||
|
||||
if (optimised_cols) {
|
||||
idct8_sse2(inptr);
|
||||
|
||||
// Final round & shift and Reconstruction and Store
|
||||
{
|
||||
__m128i d[8];
|
||||
for (i = 0; i < 8; i++) {
|
||||
inptr[i] = _mm_add_epi16(inptr[i], sixteen);
|
||||
d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
|
||||
inptr[i] = _mm_srai_epi16(inptr[i], 5);
|
||||
d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
|
||||
// Store
|
||||
_mm_storeu_si128((__m128i *)(dest + stride*i), d[i]);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Run the un-optimised column transform
|
||||
tran_low_t temp_in[8], temp_out[8];
|
||||
for (i = 0; i < 8; ++i) {
|
||||
for (j = 0; j < 8; ++j)
|
||||
temp_in[j] = out[j * 8 + i];
|
||||
vp9_highbd_idct8(temp_in, temp_out, bd);
|
||||
for (j = 0; j < 8; ++j) {
|
||||
dest[j * stride + i] = highbd_clip_pixel_add(
|
||||
dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void vp9_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8,
|
||||
int stride, int bd) {
|
||||
tran_low_t out[16 * 16];
|
||||
tran_low_t *outptr = out;
|
||||
int i, j, test;
|
||||
__m128i inptr[32];
|
||||
__m128i min_input, max_input, temp1, temp2, sign_bits;
|
||||
uint16_t * dest = CONVERT_TO_SHORTPTR(dest8);
|
||||
const __m128i zero = _mm_set1_epi16(0);
|
||||
const __m128i rounding = _mm_set1_epi16(32);
|
||||
const __m128i max = _mm_set1_epi16(3155);
|
||||
const __m128i min = _mm_set1_epi16(-3155);
|
||||
int optimised_cols = 0;
|
||||
|
||||
// Load input into __m128i & pack to 16 bits
|
||||
for (i = 0; i < 16; i++) {
|
||||
temp1 = _mm_loadu_si128((const __m128i *)(input + 16*i));
|
||||
temp2 = _mm_loadu_si128((const __m128i *)(input + 16*i + 4));
|
||||
inptr[i] = _mm_packs_epi32(temp1, temp2);
|
||||
temp1 = _mm_loadu_si128((const __m128i *)(input + 16*i + 8));
|
||||
temp2 = _mm_loadu_si128((const __m128i *)(input + 16*i + 12));
|
||||
inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
|
||||
}
|
||||
|
||||
// Find the min & max for the row transform
|
||||
max_input = _mm_max_epi16(inptr[0], inptr[1]);
|
||||
min_input = _mm_min_epi16(inptr[0], inptr[1]);
|
||||
for (i = 2; i < 32; i++) {
|
||||
max_input = _mm_max_epi16(max_input, inptr[i]);
|
||||
min_input = _mm_min_epi16(min_input, inptr[i]);
|
||||
}
|
||||
max_input = _mm_cmpgt_epi16(max_input, max);
|
||||
min_input = _mm_cmplt_epi16(min_input, min);
|
||||
temp1 = _mm_or_si128(max_input, min_input);
|
||||
test = _mm_movemask_epi8(temp1);
|
||||
|
||||
if (!test) {
|
||||
// Do the row transform
|
||||
idct16_sse2(inptr, inptr + 16);
|
||||
|
||||
// Find the min & max for the column transform
|
||||
max_input = _mm_max_epi16(inptr[0], inptr[1]);
|
||||
min_input = _mm_min_epi16(inptr[0], inptr[1]);
|
||||
for (i = 2; i < 32; i++) {
|
||||
max_input = _mm_max_epi16(max_input, inptr[i]);
|
||||
min_input = _mm_min_epi16(min_input, inptr[i]);
|
||||
}
|
||||
max_input = _mm_cmpgt_epi16(max_input, max);
|
||||
min_input = _mm_cmplt_epi16(min_input, min);
|
||||
temp1 = _mm_or_si128(max_input, min_input);
|
||||
test = _mm_movemask_epi8(temp1);
|
||||
|
||||
if (test) {
|
||||
array_transpose_16x16(inptr, inptr + 16);
|
||||
for (i = 0; i < 16; i++) {
|
||||
sign_bits = _mm_cmplt_epi16(inptr[i], zero);
|
||||
temp1 = _mm_unpacklo_epi16(inptr[i ], sign_bits);
|
||||
temp2 = _mm_unpackhi_epi16(inptr[i ], sign_bits);
|
||||
_mm_storeu_si128((__m128i*)(outptr + 4*(i*4)), temp1);
|
||||
_mm_storeu_si128((__m128i*)(outptr + 4*(i*4+1)), temp2);
|
||||
sign_bits = _mm_cmplt_epi16(inptr[i+16], zero);
|
||||
temp1 = _mm_unpacklo_epi16(inptr[i+16], sign_bits);
|
||||
temp2 = _mm_unpackhi_epi16(inptr[i+16], sign_bits);
|
||||
_mm_storeu_si128((__m128i*)(outptr + 4*(i*4+2)), temp1);
|
||||
_mm_storeu_si128((__m128i*)(outptr + 4*(i*4+3)), temp2);
|
||||
}
|
||||
} else {
|
||||
// Set to use the optimised transform for the column
|
||||
optimised_cols = 1;
|
||||
}
|
||||
} else {
|
||||
// Run the un-optimised row transform
|
||||
for (i = 0; i < 16; ++i) {
|
||||
vp9_highbd_idct16(input, outptr, bd);
|
||||
input += 16;
|
||||
outptr += 16;
|
||||
}
|
||||
}
|
||||
|
||||
if (optimised_cols) {
|
||||
idct16_sse2(inptr, inptr + 16);
|
||||
|
||||
// Final round & shift and Reconstruction and Store
|
||||
{
|
||||
__m128i d[2];
|
||||
for (i = 0; i < 16; i++) {
|
||||
inptr[i ] = _mm_add_epi16(inptr[i ], rounding);
|
||||
inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding);
|
||||
d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
|
||||
d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8));
|
||||
inptr[i ] = _mm_srai_epi16(inptr[i ], 6);
|
||||
inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6);
|
||||
d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i ]), bd);
|
||||
d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd);
|
||||
// Store
|
||||
_mm_storeu_si128((__m128i *)(dest + stride*i), d[0]);
|
||||
_mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Run the un-optimised column transform
|
||||
tran_low_t temp_in[16], temp_out[16];
|
||||
for (i = 0; i < 16; ++i) {
|
||||
for (j = 0; j < 16; ++j)
|
||||
temp_in[j] = out[j * 16 + i];
|
||||
vp9_highbd_idct16(temp_in, temp_out, bd);
|
||||
for (j = 0; j < 16; ++j) {
|
||||
dest[j * stride + i] = highbd_clip_pixel_add(
|
||||
dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void vp9_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
|
||||
int stride, int bd) {
|
||||
tran_low_t out[16 * 16] = { 0 };
|
||||
tran_low_t *outptr = out;
|
||||
int i, j, test;
|
||||
__m128i inptr[32];
|
||||
__m128i min_input, max_input, temp1, temp2, sign_bits;
|
||||
uint16_t * dest = CONVERT_TO_SHORTPTR(dest8);
|
||||
const __m128i zero = _mm_set1_epi16(0);
|
||||
const __m128i rounding = _mm_set1_epi16(32);
|
||||
const __m128i max = _mm_set1_epi16(3155);
|
||||
const __m128i min = _mm_set1_epi16(-3155);
|
||||
int optimised_cols = 0;
|
||||
|
||||
// Load input into __m128i & pack to 16 bits
|
||||
for (i = 0; i < 16; i++) {
|
||||
temp1 = _mm_loadu_si128((const __m128i *)(input + 16*i));
|
||||
temp2 = _mm_loadu_si128((const __m128i *)(input + 16*i + 4));
|
||||
inptr[i] = _mm_packs_epi32(temp1, temp2);
|
||||
temp1 = _mm_loadu_si128((const __m128i *)(input + 16*i + 8));
|
||||
temp2 = _mm_loadu_si128((const __m128i *)(input + 16*i + 12));
|
||||
inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
|
||||
}
|
||||
|
||||
// Find the min & max for the row transform
|
||||
// Since all non-zero dct coefficients are in upper-left 4x4 area,
|
||||
// we only need to consider first 4 rows here.
|
||||
max_input = _mm_max_epi16(inptr[0], inptr[1]);
|
||||
min_input = _mm_min_epi16(inptr[0], inptr[1]);
|
||||
for (i = 2; i < 4; i++) {
|
||||
max_input = _mm_max_epi16(max_input, inptr[i]);
|
||||
min_input = _mm_min_epi16(min_input, inptr[i]);
|
||||
}
|
||||
max_input = _mm_cmpgt_epi16(max_input, max);
|
||||
min_input = _mm_cmplt_epi16(min_input, min);
|
||||
temp1 = _mm_or_si128(max_input, min_input);
|
||||
test = _mm_movemask_epi8(temp1);
|
||||
|
||||
if (!test) {
|
||||
// Do the row transform (N.B. This transposes inptr)
|
||||
idct16_sse2(inptr, inptr + 16);
|
||||
|
||||
// Find the min & max for the column transform
|
||||
// N.B. Only first 4 cols contain non-zero coeffs
|
||||
max_input = _mm_max_epi16(inptr[0], inptr[1]);
|
||||
min_input = _mm_min_epi16(inptr[0], inptr[1]);
|
||||
for (i = 2; i < 16; i++) {
|
||||
max_input = _mm_max_epi16(max_input, inptr[i]);
|
||||
min_input = _mm_min_epi16(min_input, inptr[i]);
|
||||
}
|
||||
max_input = _mm_cmpgt_epi16(max_input, max);
|
||||
min_input = _mm_cmplt_epi16(min_input, min);
|
||||
temp1 = _mm_or_si128(max_input, min_input);
|
||||
test = _mm_movemask_epi8(temp1);
|
||||
|
||||
if (test) {
|
||||
// Use fact only first 4 rows contain non-zero coeffs
|
||||
array_transpose_8x8(inptr, inptr);
|
||||
array_transpose_8x8(inptr + 8, inptr + 16);
|
||||
for (i = 0; i < 4; i++) {
|
||||
sign_bits = _mm_cmplt_epi16(inptr[i], zero);
|
||||
temp1 = _mm_unpacklo_epi16(inptr[i ], sign_bits);
|
||||
temp2 = _mm_unpackhi_epi16(inptr[i ], sign_bits);
|
||||
_mm_storeu_si128((__m128i*)(outptr + 4*(i*4)), temp1);
|
||||
_mm_storeu_si128((__m128i*)(outptr + 4*(i*4+1)), temp2);
|
||||
sign_bits = _mm_cmplt_epi16(inptr[i+16], zero);
|
||||
temp1 = _mm_unpacklo_epi16(inptr[i+16], sign_bits);
|
||||
temp2 = _mm_unpackhi_epi16(inptr[i+16], sign_bits);
|
||||
_mm_storeu_si128((__m128i*)(outptr + 4*(i*4+2)), temp1);
|
||||
_mm_storeu_si128((__m128i*)(outptr + 4*(i*4+3)), temp2);
|
||||
}
|
||||
} else {
|
||||
// Set to use the optimised transform for the column
|
||||
optimised_cols = 1;
|
||||
}
|
||||
} else {
|
||||
// Run the un-optimised row transform
|
||||
for (i = 0; i < 4; ++i) {
|
||||
vp9_highbd_idct16(input, outptr, bd);
|
||||
input += 16;
|
||||
outptr += 16;
|
||||
}
|
||||
}
|
||||
|
||||
if (optimised_cols) {
|
||||
idct16_sse2(inptr, inptr + 16);
|
||||
|
||||
// Final round & shift and Reconstruction and Store
|
||||
{
|
||||
__m128i d[2];
|
||||
for (i = 0; i < 16; i++) {
|
||||
inptr[i ] = _mm_add_epi16(inptr[i ], rounding);
|
||||
inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding);
|
||||
d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
|
||||
d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8));
|
||||
inptr[i ] = _mm_srai_epi16(inptr[i ], 6);
|
||||
inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6);
|
||||
d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i ]), bd);
|
||||
d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd);
|
||||
// Store
|
||||
_mm_storeu_si128((__m128i *)(dest + stride*i), d[0]);
|
||||
_mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Run the un-optimised column transform
|
||||
tran_low_t temp_in[16], temp_out[16];
|
||||
for (i = 0; i < 16; ++i) {
|
||||
for (j = 0; j < 16; ++j)
|
||||
temp_in[j] = out[j * 16 + i];
|
||||
vp9_highbd_idct16(temp_in, temp_out, bd);
|
||||
for (j = 0; j < 16; ++j) {
|
||||
dest[j * stride + i] = highbd_clip_pixel_add(
|
||||
dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
@ -3535,9 +3535,9 @@ static void encode_frame_internal(VP9_COMP *cpi) {
|
||||
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
if (cm->use_highbitdepth)
|
||||
x->fwd_txm4x4 = xd->lossless ? vp9_fwht4x4 : vp9_fdct4x4;
|
||||
else
|
||||
x->fwd_txm4x4 = xd->lossless ? vp9_highbd_fwht4x4 : vp9_highbd_fdct4x4;
|
||||
else
|
||||
x->fwd_txm4x4 = xd->lossless ? vp9_fwht4x4 : vp9_fdct4x4;
|
||||
x->highbd_itxm_add = xd->lossless ? vp9_highbd_iwht4x4_add :
|
||||
vp9_highbd_idct4x4_add;
|
||||
#else
|
||||
|
File diff suppressed because it is too large
Load Diff
1011
vp9/encoder/x86/vp9_dct_impl_sse2.c
Normal file
1011
vp9/encoder/x86/vp9_dct_impl_sse2.c
Normal file
File diff suppressed because it is too large
Load Diff
@ -62,9 +62,40 @@ cglobal fwht4x4, 3, 4, 8, input, output, stride
|
||||
psllw m2, 2
|
||||
psllw m3, 2
|
||||
|
||||
%if CONFIG_VP9_HIGHBITDEPTH
|
||||
pxor m4, m4
|
||||
pxor m5, m5
|
||||
pcmpgtw m4, m0
|
||||
pcmpgtw m5, m1
|
||||
movq m6, m0
|
||||
movq m7, m1
|
||||
punpcklwd m0, m4
|
||||
punpcklwd m1, m5
|
||||
punpckhwd m6, m4
|
||||
punpckhwd m7, m5
|
||||
movq [outputq], m0
|
||||
movq [outputq + 8], m6
|
||||
movq [outputq + 16], m1
|
||||
movq [outputq + 24], m7
|
||||
pxor m4, m4
|
||||
pxor m5, m5
|
||||
pcmpgtw m4, m2
|
||||
pcmpgtw m5, m3
|
||||
movq m6, m2
|
||||
movq m7, m3
|
||||
punpcklwd m2, m4
|
||||
punpcklwd m3, m5
|
||||
punpckhwd m6, m4
|
||||
punpckhwd m7, m5
|
||||
movq [outputq + 32], m2
|
||||
movq [outputq + 40], m6
|
||||
movq [outputq + 48], m3
|
||||
movq [outputq + 56], m7
|
||||
%else
|
||||
movq [outputq], m0
|
||||
movq [outputq + 8], m1
|
||||
movq [outputq + 16], m2
|
||||
movq [outputq + 24], m3
|
||||
%endif
|
||||
|
||||
RET
|
||||
|
File diff suppressed because it is too large
Load Diff
373
vp9/encoder/x86/vp9_dct_sse2.h
Normal file
373
vp9/encoder/x86/vp9_dct_sse2.h
Normal file
@ -0,0 +1,373 @@
|
||||
/*
|
||||
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#ifndef VP9_ENCODER_X86_VP9_DCT_SSE2_H_
|
||||
#define VP9_ENCODER_X86_VP9_DCT_SSE2_H_
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define pair_set_epi32(a, b) \
|
||||
_mm_set_epi32((int)(b), (int)(a), (int)(b), (int)(a))
|
||||
|
||||
void vp9_fdct4x4_sse2(const int16_t *input, tran_low_t *output, int stride);
|
||||
void vp9_fdct8x8_sse2(const int16_t *input, tran_low_t *output, int stride);
|
||||
void vp9_fdct16x16_sse2(const int16_t *input, tran_low_t *output, int stride);
|
||||
void vp9_highbd_fdct4x4_sse2(const int16_t *input, tran_low_t *output,
|
||||
int stride);
|
||||
void vp9_highbd_fdct8x8_sse2(const int16_t *input, tran_low_t *output,
|
||||
int stride);
|
||||
void vp9_highbd_fdct16x16_sse2(const int16_t *input, tran_low_t *output,
|
||||
int stride);
|
||||
|
||||
// 32x32 -> 64-bit multiply-accumulate: multiplies the even (0, 2) unsigned
// 32-bit lanes of |a| and |b|, then the odd (1, 3) lanes, and adds the two
// 64-bit products within each 64-bit lane.
static inline __m128i k_madd_epi32(__m128i a, __m128i b) {
  const __m128i even_products = _mm_mul_epu32(a, b);
  const __m128i a_odd = _mm_srli_epi64(a, 32);
  const __m128i b_odd = _mm_srli_epi64(b, 32);
  const __m128i odd_products = _mm_mul_epu32(a_odd, b_odd);
  return _mm_add_epi64(even_products, odd_products);
}
|
||||
|
||||
// Narrows two registers of 64-bit lanes to their low 32 bits and packs the
// four results into one register: [a0.lo, a1.lo, b0.lo, b1.lo].
static inline __m128i k_packs_epi64(__m128i a, __m128i b) {
  const __m128i a_lows = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0));
  const __m128i b_lows = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0));
  return _mm_unpacklo_epi64(a_lows, b_lows);
}
|
||||
|
||||
// Returns nonzero when any 16-bit lane of |reg0| or |reg1| equals 0x7fff or
// 0x8000, i.e. may have saturated during a preceding packed-16-bit op.
static inline int check_epi16_overflow_x2(__m128i reg0, __m128i reg1) {
  const __m128i max_overflow = _mm_set1_epi16(0x7fff);
  const __m128i min_overflow = _mm_set1_epi16(0x8000);
  const __m128i hit0 = _mm_or_si128(_mm_cmpeq_epi16(reg0, max_overflow),
                                    _mm_cmpeq_epi16(reg0, min_overflow));
  const __m128i hit1 = _mm_or_si128(_mm_cmpeq_epi16(reg1, max_overflow),
                                    _mm_cmpeq_epi16(reg1, min_overflow));
  return _mm_movemask_epi8(_mm_or_si128(hit0, hit1));
}
|
||||
|
||||
// Returns nonzero when any 16-bit lane of the four registers equals 0x7fff
// or 0x8000 (possible saturation).  All four registers are always checked.
static inline int check_epi16_overflow_x4(__m128i reg0, __m128i reg1,
                                          __m128i reg2, __m128i reg3) {
  const __m128i max_overflow = _mm_set1_epi16(0x7fff);
  const __m128i min_overflow = _mm_set1_epi16(0x8000);
  const __m128i hit0 = _mm_or_si128(_mm_cmpeq_epi16(reg0, max_overflow),
                                    _mm_cmpeq_epi16(reg0, min_overflow));
  const __m128i hit1 = _mm_or_si128(_mm_cmpeq_epi16(reg1, max_overflow),
                                    _mm_cmpeq_epi16(reg1, min_overflow));
  const __m128i hit2 = _mm_or_si128(_mm_cmpeq_epi16(reg2, max_overflow),
                                    _mm_cmpeq_epi16(reg2, min_overflow));
  const __m128i hit3 = _mm_or_si128(_mm_cmpeq_epi16(reg3, max_overflow),
                                    _mm_cmpeq_epi16(reg3, min_overflow));
  const __m128i any_hit = _mm_or_si128(_mm_or_si128(hit0, hit1),
                                       _mm_or_si128(hit2, hit3));
  return _mm_movemask_epi8(any_hit);
}
|
||||
|
||||
static INLINE int check_epi16_overflow_x8(__m128i reg0, __m128i reg1,
|
||||
__m128i reg2, __m128i reg3,
|
||||
__m128i reg4, __m128i reg5,
|
||||
__m128i reg6, __m128i reg7) {
|
||||
int res0, res1;
|
||||
res0 = check_epi16_overflow_x4(reg0, reg1, reg2, reg3);
|
||||
res1 = check_epi16_overflow_x4(reg4, reg5, reg6, reg7);
|
||||
return res0 + res1;
|
||||
}
|
||||
|
||||
static INLINE int check_epi16_overflow_x12(__m128i reg0, __m128i reg1,
|
||||
__m128i reg2, __m128i reg3, __m128i reg4,
|
||||
__m128i reg5, __m128i reg6, __m128i reg7,
|
||||
__m128i reg8, __m128i reg9, __m128i reg10,
|
||||
__m128i reg11) {
|
||||
int res0, res1;
|
||||
res0 = check_epi16_overflow_x4(reg0, reg1, reg2, reg3);
|
||||
res1 = check_epi16_overflow_x4(reg4, reg5, reg6, reg7);
|
||||
if (!res0)
|
||||
res0 = check_epi16_overflow_x4(reg8, reg9, reg10, reg11);
|
||||
return res0 + res1;
|
||||
}
|
||||
|
||||
static INLINE int check_epi16_overflow_x16(__m128i reg0, __m128i reg1,
|
||||
__m128i reg2, __m128i reg3, __m128i reg4,
|
||||
__m128i reg5, __m128i reg6, __m128i reg7,
|
||||
__m128i reg8, __m128i reg9, __m128i reg10,
|
||||
__m128i reg11, __m128i reg12, __m128i reg13,
|
||||
__m128i reg14, __m128i reg15) {
|
||||
int res0, res1;
|
||||
res0 = check_epi16_overflow_x4(reg0, reg1, reg2, reg3);
|
||||
res1 = check_epi16_overflow_x4(reg4, reg5, reg6, reg7);
|
||||
if (!res0) {
|
||||
res0 = check_epi16_overflow_x4(reg8, reg9, reg10, reg11);
|
||||
if (!res1)
|
||||
res1 = check_epi16_overflow_x4(reg12, reg13, reg14, reg15);
|
||||
}
|
||||
return res0 + res1;
|
||||
}
|
||||
|
||||
static INLINE int check_epi16_overflow_x32(__m128i reg0, __m128i reg1,
|
||||
__m128i reg2, __m128i reg3, __m128i reg4,
|
||||
__m128i reg5, __m128i reg6, __m128i reg7,
|
||||
__m128i reg8, __m128i reg9, __m128i reg10,
|
||||
__m128i reg11, __m128i reg12, __m128i reg13,
|
||||
__m128i reg14, __m128i reg15, __m128i reg16,
|
||||
__m128i reg17, __m128i reg18, __m128i reg19,
|
||||
__m128i reg20, __m128i reg21, __m128i reg22,
|
||||
__m128i reg23, __m128i reg24, __m128i reg25,
|
||||
__m128i reg26, __m128i reg27, __m128i reg28,
|
||||
__m128i reg29, __m128i reg30, __m128i reg31) {
|
||||
int res0, res1;
|
||||
res0 = check_epi16_overflow_x4(reg0, reg1, reg2, reg3);
|
||||
res1 = check_epi16_overflow_x4(reg4, reg5, reg6, reg7);
|
||||
if (!res0) {
|
||||
res0 = check_epi16_overflow_x4(reg8, reg9, reg10, reg11);
|
||||
if (!res1) {
|
||||
res1 = check_epi16_overflow_x4(reg12, reg13, reg14, reg15);
|
||||
if (!res0) {
|
||||
res0 = check_epi16_overflow_x4(reg16, reg17, reg18, reg19);
|
||||
if (!res1) {
|
||||
res1 = check_epi16_overflow_x4(reg20, reg21, reg22, reg23);
|
||||
if (!res0) {
|
||||
res0 = check_epi16_overflow_x4(reg24, reg25, reg26, reg27);
|
||||
if (!res1)
|
||||
res1 = check_epi16_overflow_x4(reg28, reg29, reg30, reg31);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return res0 + res1;
|
||||
}
|
||||
|
||||
// Overflow check for values held as 64-bit lanes of four registers: each
// lane is shifted left by one and the resulting high dword must be all
// zeros or all ones (a clean sign-extension run) for the lane to be
// considered in range.  |zero| points to an all-zero register supplied by
// the caller.  Returns nonzero when any lane fails the check.
static inline int k_check_epi32_overflow_4(__m128i reg0, __m128i reg1,
    __m128i reg2, __m128i reg3, const __m128i* zero) {
  const __m128i minus_one = _mm_set1_epi32(-1);
  const __m128i sh0 = _mm_slli_epi64(reg0, 1);
  const __m128i sh1 = _mm_slli_epi64(reg1, 1);
  const __m128i sh2 = _mm_slli_epi64(reg2, 1);
  const __m128i sh3 = _mm_slli_epi64(reg3, 1);
  // Gather the high dword of every shifted 64-bit lane.
  const __m128i top0 = _mm_shuffle_epi32(sh0, _MM_SHUFFLE(0, 0, 3, 1));
  const __m128i top1 = _mm_shuffle_epi32(sh1, _MM_SHUFFLE(0, 0, 3, 1));
  const __m128i top2 = _mm_shuffle_epi32(sh2, _MM_SHUFFLE(0, 0, 3, 1));
  const __m128i top3 = _mm_shuffle_epi32(sh3, _MM_SHUFFLE(0, 0, 3, 1));
  const __m128i tops_01 = _mm_unpacklo_epi64(top0, top1);
  const __m128i tops_23 = _mm_unpacklo_epi64(top2, top3);
  // A top dword is valid when it equals 0 (non-negative) or -1 (negative).
  const __m128i valid_positive_01 = _mm_cmpeq_epi32(tops_01, *zero);
  const __m128i valid_positive_23 = _mm_cmpeq_epi32(tops_23, *zero);
  const __m128i valid_negative_01 = _mm_cmpeq_epi32(tops_01, minus_one);
  const __m128i valid_negative_23 = _mm_cmpeq_epi32(tops_23, minus_one);
  // A dword can match at most one of the two tests; when both comparisons
  // produced "false" the results are equal and the lane overflowed.
  const int overflow_01 = _mm_movemask_epi8(
      _mm_cmpeq_epi32(valid_positive_01, valid_negative_01));
  const int overflow_23 = _mm_movemask_epi8(
      _mm_cmpeq_epi32(valid_positive_23, valid_negative_23));
  return overflow_01 + overflow_23;
}
|
||||
|
||||
static INLINE int k_check_epi32_overflow_8(__m128i reg0, __m128i reg1,
|
||||
__m128i reg2, __m128i reg3,
|
||||
__m128i reg4, __m128i reg5,
|
||||
__m128i reg6, __m128i reg7,
|
||||
const __m128i* zero) {
|
||||
int overflow = k_check_epi32_overflow_4(reg0, reg1, reg2, reg3, zero);
|
||||
if (!overflow) {
|
||||
overflow = k_check_epi32_overflow_4(reg4, reg5, reg6, reg7, zero);
|
||||
}
|
||||
return overflow;
|
||||
}
|
||||
|
||||
static INLINE int k_check_epi32_overflow_16(
|
||||
__m128i reg0, __m128i reg1, __m128i reg2, __m128i reg3,
|
||||
__m128i reg4, __m128i reg5, __m128i reg6, __m128i reg7,
|
||||
__m128i reg8, __m128i reg9, __m128i reg10, __m128i reg11,
|
||||
__m128i reg12, __m128i reg13, __m128i reg14, __m128i reg15,
|
||||
const __m128i* zero) {
|
||||
int overflow = k_check_epi32_overflow_4(reg0, reg1, reg2, reg3, zero);
|
||||
if (!overflow) {
|
||||
overflow = k_check_epi32_overflow_4(reg4, reg5, reg6, reg7, zero);
|
||||
if (!overflow) {
|
||||
overflow = k_check_epi32_overflow_4(reg8, reg9, reg10, reg11, zero);
|
||||
if (!overflow) {
|
||||
overflow = k_check_epi32_overflow_4(reg12, reg13, reg14, reg15, zero);
|
||||
}
|
||||
}
|
||||
}
|
||||
return overflow;
|
||||
}
|
||||
|
||||
static INLINE int k_check_epi32_overflow_32(
|
||||
__m128i reg0, __m128i reg1, __m128i reg2, __m128i reg3,
|
||||
__m128i reg4, __m128i reg5, __m128i reg6, __m128i reg7,
|
||||
__m128i reg8, __m128i reg9, __m128i reg10, __m128i reg11,
|
||||
__m128i reg12, __m128i reg13, __m128i reg14, __m128i reg15,
|
||||
__m128i reg16, __m128i reg17, __m128i reg18, __m128i reg19,
|
||||
__m128i reg20, __m128i reg21, __m128i reg22, __m128i reg23,
|
||||
__m128i reg24, __m128i reg25, __m128i reg26, __m128i reg27,
|
||||
__m128i reg28, __m128i reg29, __m128i reg30, __m128i reg31,
|
||||
const __m128i* zero) {
|
||||
int overflow = k_check_epi32_overflow_4(reg0, reg1, reg2, reg3, zero);
|
||||
if (!overflow) {
|
||||
overflow = k_check_epi32_overflow_4(reg4, reg5, reg6, reg7, zero);
|
||||
if (!overflow) {
|
||||
overflow = k_check_epi32_overflow_4(reg8, reg9, reg10, reg11, zero);
|
||||
if (!overflow) {
|
||||
overflow = k_check_epi32_overflow_4(reg12, reg13, reg14, reg15, zero);
|
||||
if (!overflow) {
|
||||
overflow = k_check_epi32_overflow_4(reg16, reg17, reg18, reg19, zero);
|
||||
if (!overflow) {
|
||||
overflow = k_check_epi32_overflow_4(reg20, reg21,
|
||||
reg22, reg23, zero);
|
||||
if (!overflow) {
|
||||
overflow = k_check_epi32_overflow_4(reg24, reg25,
|
||||
reg26, reg27, zero);
|
||||
if (!overflow) {
|
||||
overflow = k_check_epi32_overflow_4(reg28, reg29,
|
||||
reg30, reg31, zero);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return overflow;
|
||||
}
|
||||
|
||||
static INLINE void store_output(const __m128i output, tran_low_t* dst_ptr) {
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
const __m128i sign_bits = _mm_cmplt_epi16(output, zero);
|
||||
__m128i out0 = _mm_unpacklo_epi16(output, sign_bits);
|
||||
__m128i out1 = _mm_unpackhi_epi16(output, sign_bits);
|
||||
_mm_store_si128((__m128i *)(dst_ptr), out0);
|
||||
_mm_store_si128((__m128i *)(dst_ptr + 4), out1);
|
||||
#else
|
||||
_mm_store_si128((__m128i *)(dst_ptr), output);
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
}
|
||||
|
||||
static INLINE void storeu_output(const __m128i output, tran_low_t* dst_ptr) {
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
const __m128i sign_bits = _mm_cmplt_epi16(output, zero);
|
||||
__m128i out0 = _mm_unpacklo_epi16(output, sign_bits);
|
||||
__m128i out1 = _mm_unpackhi_epi16(output, sign_bits);
|
||||
_mm_storeu_si128((__m128i *)(dst_ptr), out0);
|
||||
_mm_storeu_si128((__m128i *)(dst_ptr + 4), out1);
|
||||
#else
|
||||
_mm_storeu_si128((__m128i *)(dst_ptr), output);
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
}
|
||||
|
||||
|
||||
// Pairwise multiply-accumulates |in0| and |in1| against |multiplier|
// (madd), adds |rounding| and arithmetic-shifts right by |shift|, then
// saturates the eight 32-bit results back to packed 16 bits.
static inline __m128i mult_round_shift(const __m128i in0, const __m128i in1,
                                       const __m128i multiplier,
                                       const __m128i rounding,
                                       const int shift) {
  const __m128i prod0 = _mm_madd_epi16(in0, multiplier);
  const __m128i prod1 = _mm_madd_epi16(in1, multiplier);
  const __m128i shifted0 = _mm_srai_epi32(_mm_add_epi32(prod0, rounding), shift);
  const __m128i shifted1 = _mm_srai_epi32(_mm_add_epi32(prod1, rounding), shift);
  return _mm_packs_epi32(shifted0, shifted1);
}
|
||||
|
||||
// Transposes the 8x8 block of 16-bit values held in rows in00..in07 and
// writes the transposed rows out with a stride of 16 elements.
//   pass == 0: unaligned int16_t stores to out0_ptr (intermediate buffer).
//   pass != 0: rows go through storeu_output() to out1_ptr, which widens
//              each value to tran_low_t when CONFIG_VP9_HIGHBITDEPTH is on.
static INLINE void transpose_and_output8x8(
    const __m128i in00, const __m128i in01,
    const __m128i in02, const __m128i in03,
    const __m128i in04, const __m128i in05,
    const __m128i in06, const __m128i in07,
    const int pass, int16_t* out0_ptr,
    tran_low_t* out1_ptr) {
  // Input rows; element labels are row/column:
  // 00 01 02 03 04 05 06 07
  // 10 11 12 13 14 15 16 17
  // 20 21 22 23 24 25 26 27
  // 30 31 32 33 34 35 36 37
  // 40 41 42 43 44 45 46 47
  // 50 51 52 53 54 55 56 57
  // 60 61 62 63 64 65 66 67
  // 70 71 72 73 74 75 76 77
  const __m128i tr0_0 = _mm_unpacklo_epi16(in00, in01);
  const __m128i tr0_1 = _mm_unpacklo_epi16(in02, in03);
  const __m128i tr0_2 = _mm_unpackhi_epi16(in00, in01);
  const __m128i tr0_3 = _mm_unpackhi_epi16(in02, in03);
  const __m128i tr0_4 = _mm_unpacklo_epi16(in04, in05);
  const __m128i tr0_5 = _mm_unpacklo_epi16(in06, in07);
  const __m128i tr0_6 = _mm_unpackhi_epi16(in04, in05);
  const __m128i tr0_7 = _mm_unpackhi_epi16(in06, in07);
  // After interleaving 16-bit lanes, tr0_0 .. tr0_7 hold:
  // 00 10 01 11 02 12 03 13
  // 20 30 21 31 22 32 23 33
  // 04 14 05 15 06 16 07 17
  // 24 34 25 35 26 36 27 37
  // 40 50 41 51 42 52 43 53
  // 60 70 61 71 62 72 63 73
  // 44 54 45 55 46 56 47 57
  // 64 74 65 75 66 76 67 77
  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
  // After interleaving 32-bit lanes, listed in the order consumed below
  // (tr1_0, tr1_4, tr1_2, tr1_6, tr1_1, tr1_5, tr1_3, tr1_7):
  // 00 10 20 30 01 11 21 31
  // 40 50 60 70 41 51 61 71
  // 02 12 22 32 03 13 23 33
  // 42 52 62 72 43 53 63 73
  // 04 14 24 34 05 15 25 35
  // 44 54 64 74 45 55 65 75
  // 06 16 26 36 07 17 27 37
  // 46 56 66 76 47 57 67 77
  const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
  const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
  const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
  const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
  const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
  const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
  const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
  const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
  // Fully transposed rows (tr2_0 .. tr2_7 are the input's columns):
  // 00 10 20 30 40 50 60 70
  // 01 11 21 31 41 51 61 71
  // 02 12 22 32 42 52 62 72
  // 03 13 23 33 43 53 63 73
  // 04 14 24 34 44 54 64 74
  // 05 15 25 35 45 55 65 75
  // 06 16 26 36 46 56 66 76
  // 07 17 27 37 47 57 67 77
  if (pass == 0) {
    _mm_storeu_si128((__m128i*)(out0_ptr + 0 * 16), tr2_0);
    _mm_storeu_si128((__m128i*)(out0_ptr + 1 * 16), tr2_1);
    _mm_storeu_si128((__m128i*)(out0_ptr + 2 * 16), tr2_2);
    _mm_storeu_si128((__m128i*)(out0_ptr + 3 * 16), tr2_3);
    _mm_storeu_si128((__m128i*)(out0_ptr + 4 * 16), tr2_4);
    _mm_storeu_si128((__m128i*)(out0_ptr + 5 * 16), tr2_5);
    _mm_storeu_si128((__m128i*)(out0_ptr + 6 * 16), tr2_6);
    _mm_storeu_si128((__m128i*)(out0_ptr + 7 * 16), tr2_7);
  } else {
    storeu_output(tr2_0, (out1_ptr + 0 * 16));
    storeu_output(tr2_1, (out1_ptr + 1 * 16));
    storeu_output(tr2_2, (out1_ptr + 2 * 16));
    storeu_output(tr2_3, (out1_ptr + 3 * 16));
    storeu_output(tr2_4, (out1_ptr + 4 * 16));
    storeu_output(tr2_5, (out1_ptr + 5 * 16));
    storeu_output(tr2_6, (out1_ptr + 6 * 16));
    storeu_output(tr2_7, (out1_ptr + 7 * 16));
  }
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
#endif
|
||||
|
||||
#endif // VP9_ENCODER_X86_VP9_DCT_SSE2_H_
|
@ -135,7 +135,9 @@ VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt_x86_64.asm
|
||||
|
||||
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.c
|
||||
VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_dct_ssse3.c
|
||||
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.h
|
||||
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct32x32_sse2.c
|
||||
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_impl_sse2.c
|
||||
|
||||
ifeq ($(CONFIG_VP9_TEMPORAL_DENOISING),yes)
|
||||
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_denoiser_sse2.c
|
||||
|
Loading…
Reference in New Issue
Block a user