From a3a69b400c5efbaa5c4b241a517de1cdd70c3bc1 Mon Sep 17 00:00:00 2001
From: Yi Luo
Date: Fri, 13 May 2016 10:08:13 -0700
Subject: [PATCH] HBD inverse HT 4x4 SSE4.1 optimization

- Tx_type: DCT_DCT, DCT_ADST, ADST_DCT, ADST_ADST.
- Encoder overall instruction count drops 2.91%.
- Decoder overall instruction count drops 1.01%.
- Add unit test to test bit-exact result against C.

Change-Id: I908c9e0e5106c58f67dd72d28760e6c9ce54278e
---
 test/test.mk                           |   4 +
 test/vp10_iht4x4_test.cc               | 141 ++++++++++++++
 vp10/common/vp10_rtcd_defs.pl          |   2 +-
 vp10/common/x86/highbd_inv_txfm_sse4.c | 258 +++++++++++++++++++++++++
 vp10/vp10cx.mk                         |   1 +
 5 files changed, 405 insertions(+), 1 deletion(-)
 create mode 100644 test/vp10_iht4x4_test.cc
 create mode 100644 vp10/common/x86/highbd_inv_txfm_sse4.c

diff --git a/test/test.mk b/test/test.mk
index 59f054cfa..03b589e39 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -180,6 +180,10 @@ ifeq ($(CONFIG_EXT_INTER),yes)
 LIBVPX_TEST_SRCS-$(HAVE_SSSE3) += masked_variance_test.cc
 LIBVPX_TEST_SRCS-$(HAVE_SSSE3) += masked_sad_test.cc
 endif
+
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+LIBVPX_TEST_SRCS-$(HAVE_SSE4_1) += vp10_iht4x4_test.cc
+endif # CONFIG_VP9_HIGHBITDEPTH
 endif # VP10
 
 ## Multi-codec / unconditional whitebox tests.
diff --git a/test/vp10_iht4x4_test.cc b/test/vp10_iht4x4_test.cc
new file mode 100644
index 000000000..1cad40281
--- /dev/null
+++ b/test/vp10_iht4x4_test.cc
@@ -0,0 +1,141 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vp10_rtcd.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "vpx_ports/mem.h"
+
+namespace {
+
+using std::tr1::tuple;
+using libvpx_test::ACMRandom;
+
+// Reference implementation: forwards to the C version of the transform.
+void iht4x4_ref(const int32_t *coeff, uint16_t *output, int stride,
+                int tx_type, int bd) {
+  vp10_inv_txfm2d_add_4x4_c(coeff, output, stride, tx_type, bd);
+}
+
+typedef void (*IHbdHtFunc)(const int32_t *coeff, uint16_t *output, int stride,
+                           int tx_type, int bd);
+
+// IhbdHt4x4Param argument list:
+// <target optimization function, tx_type, bit_depth>
+typedef tuple<IHbdHtFunc, int, int> IHbdHt4x4Param;
+
+// Checks an optimized 4x4 inverse hybrid transform against the C reference.
+class VP10HighbdInvTrans4x4HT :
+      public ::testing::TestWithParam<IHbdHt4x4Param> {
+ public:
+  virtual ~VP10HighbdInvTrans4x4HT() {}
+
+  virtual void SetUp() {
+    inv_txfm_ = GET_PARAM(0);
+    inv_txfm_ref_ = iht4x4_ref;
+    tx_type_ = GET_PARAM(1);
+    bit_depth_ = GET_PARAM(2);
+    num_coeffs_ = 4 * 4;
+
+    coeffs_ = reinterpret_cast<int32_t *>(
+        vpx_memalign(16, sizeof(int32_t) * num_coeffs_));
+    output_ = reinterpret_cast<uint16_t *>(
+        vpx_memalign(16, sizeof(uint16_t) * num_coeffs_));
+    output_ref_ = reinterpret_cast<uint16_t *>(
+        vpx_memalign(16, sizeof(uint16_t) * num_coeffs_));
+  }
+
+  virtual void TearDown() {
+    vpx_free(coeffs_);
+    vpx_free(output_);
+    vpx_free(output_ref_);
+    libvpx_test::ClearSystemState();
+  }
+
+ protected:
+  void RunBitexactCheck();
+
+ private:
+  IHbdHtFunc inv_txfm_;
+  IHbdHtFunc inv_txfm_ref_;
+  int tx_type_;
+  int bit_depth_;
+  int num_coeffs_;
+  int32_t *coeffs_;
+  uint16_t *output_;
+  uint16_t *output_ref_;
+
+  // Saturates |number| to the symmetric range [-(2^bit - 1), 2^bit - 1].
+  int32_t clamp(int32_t number, int bit) {
+    const int32_t max = (int32_t)(1 << bit) - 1;
+    const int32_t min = -max;
+    if (number > max) return max;
+    if (number < min) return min;
+    return number;
+  }
+};
+
+// Feeds the same random coefficients and pixels to both implementations and
+// expects identical output for every block.
+void VP10HighbdInvTrans4x4HT::RunBitexactCheck() {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int stride = 4;
+  const int num_tests = 2000000;
+  const uint16_t mask = (1 << bit_depth_) - 1;
+
+  for (int i = 0; i < num_tests; ++i) {
+    for (int j = 0; j < num_coeffs_; ++j) {
+      // Multiply rather than shift left: the difference can be negative and
+      // left-shifting a negative value is undefined behavior.
+      coeffs_[j] = clamp((rnd.Rand16() - rnd.Rand16()) * 4, 18);
+      output_ref_[j] = rnd.Rand16() & mask;
+      output_[j] = output_ref_[j];
+    }
+
+    inv_txfm_ref_(coeffs_, output_ref_, stride, tx_type_, bit_depth_);
+    ASM_REGISTER_STATE_CHECK(inv_txfm_(coeffs_, output_, stride, tx_type_,
+                                       bit_depth_));
+
+    for (int j = 0; j < num_coeffs_; ++j) {
+      EXPECT_EQ(output_ref_[j], output_[j])
+          << "Not bit-exact result at index: " << j
+          << " At test block: " << i;
+    }
+  }
+}
+
+TEST_P(VP10HighbdInvTrans4x4HT, InvTransResultCheck) {
+  RunBitexactCheck();
+}
+
+using std::tr1::make_tuple;
+
+#if HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
+const IHbdHt4x4Param kArrayIht4x4Param[] = {
+  make_tuple(&vp10_inv_txfm2d_add_4x4_sse4_1, 0, 10),
+  make_tuple(&vp10_inv_txfm2d_add_4x4_sse4_1, 0, 12),
+  make_tuple(&vp10_inv_txfm2d_add_4x4_sse4_1, 1, 10),
+  make_tuple(&vp10_inv_txfm2d_add_4x4_sse4_1, 1, 12),
+  make_tuple(&vp10_inv_txfm2d_add_4x4_sse4_1, 2, 10),
+  make_tuple(&vp10_inv_txfm2d_add_4x4_sse4_1, 2, 12),
+  make_tuple(&vp10_inv_txfm2d_add_4x4_sse4_1, 3, 10),
+  make_tuple(&vp10_inv_txfm2d_add_4x4_sse4_1, 3, 12)
+};
+
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1, VP10HighbdInvTrans4x4HT,
+    ::testing::ValuesIn(kArrayIht4x4Param));
+#endif  // HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
+
+}  // namespace
diff --git a/vp10/common/vp10_rtcd_defs.pl b/vp10/common/vp10_rtcd_defs.pl
index 0e59bfe2f..3952fbb81 100644
--- a/vp10/common/vp10_rtcd_defs.pl
+++ b/vp10/common/vp10_rtcd_defs.pl
@@ -627,7 +627,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
 
   #inv txfm
   add_proto qw/void vp10_inv_txfm2d_add_4x4/, "const int32_t *input, uint16_t *output, int stride, int tx_type, int bd";
-  specialize qw/vp10_inv_txfm2d_add_4x4/;
+  specialize qw/vp10_inv_txfm2d_add_4x4 sse4_1/;
   add_proto qw/void vp10_inv_txfm2d_add_8x8/, "const int32_t *input, uint16_t *output, int stride, int tx_type, int bd";
specialize qw/vp10_inv_txfm2d_add_8x8/; add_proto qw/void vp10_inv_txfm2d_add_16x16/, "const int32_t *input, uint16_t *output, int stride, int tx_type, int bd"; diff --git a/vp10/common/x86/highbd_inv_txfm_sse4.c b/vp10/common/x86/highbd_inv_txfm_sse4.c new file mode 100644 index 000000000..0c623dfd3 --- /dev/null +++ b/vp10/common/x86/highbd_inv_txfm_sse4.c @@ -0,0 +1,258 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include /* SSE4.1 */ + +#include "./vp10_rtcd.h" +#include "./vpx_config.h" +#include "vp10/common/vp10_inv_txfm2d_cfg.h" + + +static INLINE void load_buffer_4x4(const int32_t *coeff, __m128i *in) { + in[0] = _mm_loadu_si128((const __m128i *)(coeff + 0)); + in[1] = _mm_loadu_si128((const __m128i *)(coeff + 4)); + in[2] = _mm_loadu_si128((const __m128i *)(coeff + 8)); + in[3] = _mm_loadu_si128((const __m128i *)(coeff + 12)); +} + +static void idct4x4_sse4_1(__m128i *in, int bit) { + const int32_t *cospi = cospi_arr[bit - cos_bit_min]; + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + __m128i u0, u1, u2, u3; + __m128i v0, v1, v2, v3, x, y; + + v0 = _mm_unpacklo_epi32(in[0], in[1]); + v1 = _mm_unpackhi_epi32(in[0], in[1]); + v2 = _mm_unpacklo_epi32(in[2], in[3]); + v3 = _mm_unpackhi_epi32(in[2], in[3]); + + u0 = _mm_unpacklo_epi64(v0, v2); + u1 = _mm_unpackhi_epi64(v0, v2); + u2 = _mm_unpacklo_epi64(v1, v3); + u3 = _mm_unpackhi_epi64(v1, v3); + + x = 
_mm_mullo_epi32(u0, cospi32); + y = _mm_mullo_epi32(u2, cospi32); + v0 = _mm_add_epi32(x, y); + v0 = _mm_add_epi32(v0, rnding); + v0 = _mm_srai_epi32(v0, bit); + + v1 = _mm_sub_epi32(x, y); + v1 = _mm_add_epi32(v1, rnding); + v1 = _mm_srai_epi32(v1, bit); + + x = _mm_mullo_epi32(u1, cospi48); + y = _mm_mullo_epi32(u3, cospim16); + v2 = _mm_add_epi32(x, y); + v2 = _mm_add_epi32(v2, rnding); + v2 = _mm_srai_epi32(v2, bit); + + x = _mm_mullo_epi32(u1, cospi16); + y = _mm_mullo_epi32(u3, cospi48); + v3 = _mm_add_epi32(x, y); + v3 = _mm_add_epi32(v3, rnding); + v3 = _mm_srai_epi32(v3, bit); + + in[0] = _mm_add_epi32(v0, v3); + in[1] = _mm_add_epi32(v1, v2); + in[2] = _mm_sub_epi32(v1, v2); + in[3] = _mm_sub_epi32(v0, v3); +} + +static void iadst4x4_sse4_1(__m128i *in, int bit) { + const int32_t *cospi = cospi_arr[bit - cos_bit_min]; + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const __m128i zero = _mm_setzero_si128(); + __m128i u0, u1, u2, u3; + __m128i v0, v1, v2, v3, x, y; + + v0 = _mm_unpacklo_epi32(in[0], in[1]); + v1 = _mm_unpackhi_epi32(in[0], in[1]); + v2 = _mm_unpacklo_epi32(in[2], in[3]); + v3 = _mm_unpackhi_epi32(in[2], in[3]); + + u0 = _mm_unpacklo_epi64(v0, v2); + u1 = _mm_unpackhi_epi64(v0, v2); + u2 = _mm_unpacklo_epi64(v1, v3); + u3 = _mm_unpackhi_epi64(v1, v3); + + // stage 0 + // stage 1 + u1 = _mm_sub_epi32(zero, u1); + u3 = _mm_sub_epi32(zero, u3); + + // stage 2 + v0 = u0; + v1 = u3; + x = _mm_mullo_epi32(u1, cospi32); + y = _mm_mullo_epi32(u2, cospi32); + v2 = _mm_add_epi32(x, y); + v2 = _mm_add_epi32(v2, rnding); + v2 = _mm_srai_epi32(v2, bit); + + v3 = _mm_sub_epi32(x, 
y); + v3 = _mm_add_epi32(v3, rnding); + v3 = _mm_srai_epi32(v3, bit); + + // stage 3 + u0 = _mm_add_epi32(v0, v2); + u1 = _mm_add_epi32(v1, v3); + u2 = _mm_sub_epi32(v0, v2); + u3 = _mm_sub_epi32(v1, v3); + + // stage 4 + x = _mm_mullo_epi32(u0, cospi8); + y = _mm_mullo_epi32(u1, cospi56); + in[3] = _mm_add_epi32(x, y); + in[3] = _mm_add_epi32(in[3], rnding); + in[3] = _mm_srai_epi32(in[3], bit); + + x = _mm_mullo_epi32(u0, cospi56); + y = _mm_mullo_epi32(u1, cospim8); + in[0] = _mm_add_epi32(x, y); + in[0] = _mm_add_epi32(in[0], rnding); + in[0] = _mm_srai_epi32(in[0], bit); + + x = _mm_mullo_epi32(u2, cospi40); + y = _mm_mullo_epi32(u3, cospi24); + in[1] = _mm_add_epi32(x, y); + in[1] = _mm_add_epi32(in[1], rnding); + in[1] = _mm_srai_epi32(in[1], bit); + + x = _mm_mullo_epi32(u2, cospi24); + y = _mm_mullo_epi32(u3, cospim40); + in[2] = _mm_add_epi32(x, y); + in[2] = _mm_add_epi32(in[2], rnding); + in[2] = _mm_srai_epi32(in[2], bit); +} + +static INLINE void round_shift_4x4(__m128i *in, int shift) { + __m128i rnding = _mm_set1_epi32(1 << (shift - 1)); + + in[0] = _mm_add_epi32(in[0], rnding); + in[1] = _mm_add_epi32(in[1], rnding); + in[2] = _mm_add_epi32(in[2], rnding); + in[3] = _mm_add_epi32(in[3], rnding); + + in[0] = _mm_srai_epi32(in[0], shift); + in[1] = _mm_srai_epi32(in[1], shift); + in[2] = _mm_srai_epi32(in[2], shift); + in[3] = _mm_srai_epi32(in[3], shift); +} + +static INLINE __m128i highbd_clamp_epi16(__m128i u, int bd) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); + __m128i clamped, mask; + + mask = _mm_cmpgt_epi16(u, max); + clamped = _mm_andnot_si128(mask, u); + mask = _mm_and_si128(mask, max); + clamped = _mm_or_si128(mask, clamped); + mask = _mm_cmpgt_epi16(clamped, zero); + clamped = _mm_and_si128(clamped, mask); + + return clamped; +} + +static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride, + int flipud, int fliplr, 
int shift, int bd) { + const __m128i zero = _mm_setzero_si128(); + __m128i u0, u1, u2, u3; + __m128i v0, v1, v2, v3; + + round_shift_4x4(in, shift); + + v0 = _mm_loadl_epi64((__m128i const *)(output + 0 * stride)); + v1 = _mm_loadl_epi64((__m128i const *)(output + 1 * stride)); + v2 = _mm_loadl_epi64((__m128i const *)(output + 2 * stride)); + v3 = _mm_loadl_epi64((__m128i const *)(output + 3 * stride)); + + v0 = _mm_unpacklo_epi16(v0, zero); + v1 = _mm_unpacklo_epi16(v1, zero); + v2 = _mm_unpacklo_epi16(v2, zero); + v3 = _mm_unpacklo_epi16(v3, zero); + + u0 = _mm_add_epi32(in[0], v0); + u1 = _mm_add_epi32(in[1], v1); + u2 = _mm_add_epi32(in[2], v2); + u3 = _mm_add_epi32(in[3], v3); + + v0 = _mm_packus_epi32(u0, u1); + v2 = _mm_packus_epi32(u2, u3); + + u0 = highbd_clamp_epi16(v0, bd); + u2 = highbd_clamp_epi16(v2, bd); + + v0 = _mm_unpacklo_epi64(u0, u0); + v1 = _mm_unpackhi_epi64(u0, u0); + v2 = _mm_unpacklo_epi64(u2, u2); + v3 = _mm_unpackhi_epi64(u2, u2); + + _mm_storel_epi64((__m128i *)(output + 0 * stride), v0); + _mm_storel_epi64((__m128i *)(output + 1 * stride), v1); + _mm_storel_epi64((__m128i *)(output + 2 * stride), v2); + _mm_storel_epi64((__m128i *)(output + 3 * stride), v3); + + (void) flipud; + (void) fliplr; +} + +void vp10_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output, + int stride, int tx_type, int bd) { + __m128i in[4]; + const TXFM_2D_CFG *cfg = NULL; + + switch (tx_type) { + case DCT_DCT: + cfg = &inv_txfm_2d_cfg_dct_dct_4; + load_buffer_4x4(coeff, in); + idct4x4_sse4_1(in, cfg->cos_bit_row[2]); + idct4x4_sse4_1(in, cfg->cos_bit_row[2]); + write_buffer_4x4(in, output, stride, 0, 0, -cfg->shift[1], bd); + break; + case ADST_DCT: + cfg = &inv_txfm_2d_cfg_adst_dct_4; + load_buffer_4x4(coeff, in); + idct4x4_sse4_1(in, cfg->cos_bit_row[2]); + iadst4x4_sse4_1(in, cfg->cos_bit_row[2]); + write_buffer_4x4(in, output, stride, 0, 0, -cfg->shift[1], bd); + break; + case DCT_ADST: + cfg = &inv_txfm_2d_cfg_dct_adst_4; + 
load_buffer_4x4(coeff, in); + iadst4x4_sse4_1(in, cfg->cos_bit_row[2]); + idct4x4_sse4_1(in, cfg->cos_bit_row[2]); + write_buffer_4x4(in, output, stride, 0, 0, -cfg->shift[1], bd); + break; + case ADST_ADST: + cfg = &inv_txfm_2d_cfg_adst_adst_4; + load_buffer_4x4(coeff, in); + iadst4x4_sse4_1(in, cfg->cos_bit_row[2]); + iadst4x4_sse4_1(in, cfg->cos_bit_row[2]); + write_buffer_4x4(in, output, stride, 0, 0, -cfg->shift[1], bd); + break; + default: + assert(0); + } +} diff --git a/vp10/vp10cx.mk b/vp10/vp10cx.mk index da90fe668..1aaac15be 100644 --- a/vp10/vp10cx.mk +++ b/vp10/vp10cx.mk @@ -118,6 +118,7 @@ VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.c VP10_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/dct_ssse3.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) VP10_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/highbd_fwd_txfm_sse4.c +VP10_CX_SRCS-$(HAVE_SSE4_1) += common/x86/highbd_inv_txfm_sse4.c endif ifeq ($(CONFIG_VP9_TEMPORAL_DENOISING),yes)