From 1a800f6539a547d131bb4304ed894e7ba0c6550e Mon Sep 17 00:00:00 2001 From: Geza Lore Date: Fri, 2 Sep 2016 16:05:53 +0100 Subject: [PATCH] Add SSE2 versions of av1_fht8x16 and av1_fht16x8 Encoder speedup ~2% with ext-tx + rect-tx Change-Id: Id56ddf102a887de31d181bde6d8ef8c4f03da945 --- aom_dsp/x86/synonyms.h | 4 +- av1/common/av1_rtcd_defs.pl | 243 +++++++------------- av1/encoder/dct.c | 14 +- av1/encoder/x86/dct_intrin_sse2.c | 362 +++++++++++++++++++++++++++++- test/av1_fht16x8_test.cc | 95 ++++++++ test/av1_fht8x16_test.cc | 95 ++++++++ test/test.mk | 4 + test/transform_test_base.h | 2 +- 8 files changed, 647 insertions(+), 172 deletions(-) create mode 100644 test/av1_fht16x8_test.cc create mode 100644 test/av1_fht8x16_test.cc diff --git a/aom_dsp/x86/synonyms.h b/aom_dsp/x86/synonyms.h index da893a6d3..15fa99175 100644 --- a/aom_dsp/x86/synonyms.h +++ b/aom_dsp/x86/synonyms.h @@ -68,13 +68,13 @@ static INLINE __m128i xx_roundn_epu16(__m128i v_val_w, int bits) { } static INLINE __m128i xx_roundn_epu32(__m128i v_val_d, int bits) { - const __m128i v_bias_d = _mm_set1_epi32(1 << (bits - 1)); + const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1); const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d); return _mm_srli_epi32(v_tmp_d, bits); } static INLINE __m128i xx_roundn_epi32(__m128i v_val_d, int bits) { - const __m128i v_bias_d = _mm_set1_epi32(1 << (bits - 1)); + const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1); const __m128i v_sign_d = _mm_srai_epi32(v_val_d, 31); const __m128i v_tmp_d = _mm_add_epi32(_mm_add_epi32(v_val_d, v_bias_d), v_sign_d); diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl index 3e003672b..4bf89f9a8 100644 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl @@ -51,7 +51,7 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") { } # -# dct +# Inverse dct # if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") { # Note as optimized versions of these functions are added we need to add a 
check to ensure @@ -368,10 +368,22 @@ if (aom_config("CONFIG_AOM_QM") eq "yes") { # fdct functions -if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") { - add_proto qw/void av1_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; - specialize qw/av1_fht4x4 sse2/; +add_proto qw/void av1_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; +specialize qw/av1_fht4x4 sse2/; +add_proto qw/void av1_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride"; +specialize qw/av1_fwht4x4/; + +add_proto qw/void av1_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; +specialize qw/av1_fht8x8 sse2/; + +add_proto qw/void av1_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; +specialize qw/av1_fht16x16 sse2/; + +add_proto qw/void av1_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; +specialize qw/av1_fht32x32/; + +if (aom_config("CONFIG_EXT_TX") eq "yes") { add_proto qw/void av1_fht4x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; specialize qw/av1_fht4x8/; @@ -379,56 +391,84 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") { specialize qw/av1_fht8x4/; add_proto qw/void av1_fht8x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; - specialize qw/av1_fht8x16/; + specialize qw/av1_fht8x16 sse2/; add_proto qw/void av1_fht16x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; - specialize qw/av1_fht16x8/; + specialize qw/av1_fht16x8 sse2/; add_proto qw/void av1_fht16x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; specialize qw/av1_fht16x32/; add_proto qw/void av1_fht32x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; specialize qw/av1_fht32x16/; +} - add_proto qw/void av1_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; - specialize qw/av1_fht8x8 sse2/; +if 
(aom_config("CONFIG_EMULATE_HARDWARE") eq "yes") { + add_proto qw/void av1_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/av1_fdct4x4/; - add_proto qw/void av1_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; - specialize qw/av1_fht16x16 sse2/; + add_proto qw/void av1_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/av1_fdct4x4_1/; - add_proto qw/void av1_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; - specialize qw/av1_fht32x32/; + add_proto qw/void av1_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/av1_fdct8x8/; - add_proto qw/void av1_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_fwht4x4/; + add_proto qw/void av1_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/av1_fdct8x8_1/; + + add_proto qw/void av1_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/av1_fdct16x16/; + + add_proto qw/void av1_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/av1_fdct16x16_1/; + + add_proto qw/void av1_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/av1_fdct32x32/; + + add_proto qw/void av1_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/av1_fdct32x32_rd/; + + add_proto qw/void av1_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/av1_fdct32x32_1/; +} else { + add_proto qw/void av1_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/av1_fdct4x4 sse2/; + + add_proto qw/void av1_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/av1_fdct4x4_1 sse2/; + + add_proto qw/void av1_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/av1_fdct8x8 sse2/; + + add_proto qw/void 
av1_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/av1_fdct8x8_1 sse2/; + + add_proto qw/void av1_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/av1_fdct16x16 sse2/; + + add_proto qw/void av1_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/av1_fdct16x16_1 sse2/; + + add_proto qw/void av1_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/av1_fdct32x32 sse2/; + + add_proto qw/void av1_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/av1_fdct32x32_rd sse2/; + + add_proto qw/void av1_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/av1_fdct32x32_1 sse2/; +} + +if (aom_config("CONFIG_AOM_HIGHBITDEPTH") ne "yes") { + if (aom_config("CONFIG_EXT_TX") ne "yes") { + specialize qw/av1_fht4x4 msa/; + specialize qw/av1_fht8x8 msa/; + specialize qw/av1_fht16x16 msa/; + } +} + +if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") { if (aom_config("CONFIG_EMULATE_HARDWARE") eq "yes") { - add_proto qw/void av1_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_fdct4x4/; - - add_proto qw/void av1_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_fdct4x4_1/; - - add_proto qw/void av1_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_fdct8x8/; - - add_proto qw/void av1_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_fdct8x8_1/; - - add_proto qw/void av1_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_fdct16x16/; - - add_proto qw/void av1_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_fdct16x16_1/; - - add_proto qw/void av1_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_fdct32x32/; - - add_proto 
qw/void av1_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_fdct32x32_rd/; - - add_proto qw/void av1_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_fdct32x32_1/; - add_proto qw/void av1_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/av1_highbd_fdct4x4/; @@ -453,33 +493,6 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") { add_proto qw/void av1_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/av1_highbd_fdct32x32_1/; } else { - add_proto qw/void av1_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_fdct4x4 sse2/; - - add_proto qw/void av1_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_fdct4x4_1 sse2/; - - add_proto qw/void av1_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_fdct8x8 sse2/; - - add_proto qw/void av1_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_fdct8x8_1 sse2/; - - add_proto qw/void av1_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_fdct16x16 sse2/; - - add_proto qw/void av1_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_fdct16x16_1 sse2/; - - add_proto qw/void av1_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_fdct32x32 sse2/; - - add_proto qw/void av1_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_fdct32x32_rd sse2/; - - add_proto qw/void av1_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_fdct32x32_1 sse2/; - add_proto qw/void av1_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/av1_highbd_fdct4x4 sse2/; @@ -504,100 +517,6 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") { 
add_proto qw/void av1_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/av1_highbd_fdct32x32_1/; } -} else { - add_proto qw/void av1_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; - specialize qw/av1_fht4x4 sse2/; - - add_proto qw/void av1_fht4x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; - specialize qw/av1_fht4x8/; - - add_proto qw/void av1_fht8x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; - specialize qw/av1_fht8x4/; - - add_proto qw/void av1_fht8x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; - specialize qw/av1_fht8x16/; - - add_proto qw/void av1_fht16x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; - specialize qw/av1_fht16x8/; - - add_proto qw/void av1_fht16x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; - specialize qw/av1_fht16x32/; - - add_proto qw/void av1_fht32x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; - specialize qw/av1_fht32x16/; - - add_proto qw/void av1_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; - specialize qw/av1_fht8x8 sse2/; - - add_proto qw/void av1_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; - specialize qw/av1_fht16x16 sse2/; - - if (aom_config("CONFIG_EXT_TX") ne "yes") { - specialize qw/av1_fht4x4 msa/; - specialize qw/av1_fht8x8 msa/; - specialize qw/av1_fht16x16 msa/; - } - - add_proto qw/void av1_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; - specialize qw/av1_fht32x32/; - - add_proto qw/void av1_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_fwht4x4/; - if (aom_config("CONFIG_EMULATE_HARDWARE") eq "yes") { - add_proto qw/void av1_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_fdct4x4/; - - add_proto qw/void 
av1_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_fdct4x4_1/; - - add_proto qw/void av1_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_fdct8x8/; - - add_proto qw/void av1_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_fdct8x8_1/; - - add_proto qw/void av1_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_fdct16x16/; - - add_proto qw/void av1_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_fdct16x16_1/; - - add_proto qw/void av1_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_fdct32x32/; - - add_proto qw/void av1_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_fdct32x32_rd/; - - add_proto qw/void av1_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_fdct32x32_1/; - } else { - add_proto qw/void av1_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_fdct4x4 sse2/; - - add_proto qw/void av1_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_fdct4x4_1 sse2/; - - add_proto qw/void av1_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_fdct8x8 sse2/; - - add_proto qw/void av1_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_fdct8x8_1 sse2/; - - add_proto qw/void av1_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_fdct16x16 sse2/; - - add_proto qw/void av1_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_fdct16x16_1 sse2/; - - add_proto qw/void av1_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_fdct32x32 sse2/; - - add_proto qw/void av1_fdct32x32_rd/, "const int16_t *input, tran_low_t 
*output, int stride"; - specialize qw/av1_fdct32x32_rd sse2/; - - add_proto qw/void av1_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/av1_fdct32x32_1 sse2/; - } } add_proto qw/void av1_fwd_idtx/, "const int16_t *src_diff, tran_low_t *coeff, int stride, int bs, int tx_type"; diff --git a/av1/encoder/dct.c b/av1/encoder/dct.c index 40f5bdf91..697f7918c 100644 --- a/av1/encoder/dct.c +++ b/av1/encoder/dct.c @@ -1311,8 +1311,8 @@ void av1_fht8x16_c(const int16_t *input, tran_low_t *output, int stride, // Columns for (i = 0; i < n; ++i) { for (j = 0; j < n2; ++j) - temp_in[j] = - (tran_low_t)fdct_round_shift(input[j * stride + i] * 4 * Sqrt2); + temp_in[j] = ROUND_POWER_OF_TWO_SIGNED(input[j * stride + i] * 4 * Sqrt2, + DCT_CONST_BITS); ht.cols(temp_in, temp_out); for (j = 0; j < n2; ++j) out[j * n + i] = temp_out[j]; } @@ -1321,7 +1321,8 @@ void av1_fht8x16_c(const int16_t *input, tran_low_t *output, int stride, for (i = 0; i < n2; ++i) { for (j = 0; j < n; ++j) temp_in[j] = out[j + i * n]; ht.rows(temp_in, temp_out); - for (j = 0; j < n; ++j) output[j + i * n] = (temp_out[j] + 1) >> 2; + for (j = 0; j < n; ++j) + output[j + i * n] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; } // Note: overall scale factor of transform is 8 times unitary } @@ -1358,8 +1359,8 @@ void av1_fht16x8_c(const int16_t *input, tran_low_t *output, int stride, // Columns for (i = 0; i < n2; ++i) { for (j = 0; j < n; ++j) - temp_in[j] = - (tran_low_t)fdct_round_shift(input[j * stride + i] * 4 * Sqrt2); + temp_in[j] = ROUND_POWER_OF_TWO_SIGNED(input[j * stride + i] * 4 * Sqrt2, + DCT_CONST_BITS); ht.cols(temp_in, temp_out); for (j = 0; j < n; ++j) out[j * n2 + i] = temp_out[j]; } @@ -1368,7 +1369,8 @@ void av1_fht16x8_c(const int16_t *input, tran_low_t *output, int stride, for (i = 0; i < n; ++i) { for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2]; ht.rows(temp_in, temp_out); - for (j = 0; j < n2; ++j) output[j + i * n2] = (temp_out[j] + 1) >> 2; + for 
(j = 0; j < n2; ++j) + output[j + i * n2] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; } // Note: overall scale factor of transform is 8 times unitary } diff --git a/av1/encoder/x86/dct_intrin_sse2.c b/av1/encoder/x86/dct_intrin_sse2.c index 673503ee7..31bb0b9bc 100644 --- a/av1/encoder/x86/dct_intrin_sse2.c +++ b/av1/encoder/x86/dct_intrin_sse2.c @@ -12,10 +12,11 @@ #include <assert.h> #include <emmintrin.h> // SSE2 -#include "./av1_rtcd.h" #include "./aom_dsp_rtcd.h" +#include "./av1_rtcd.h" #include "aom_dsp/txfm_common.h" #include "aom_dsp/x86/fwd_txfm_sse2.h" +#include "aom_dsp/x86/synonyms.h" #include "aom_dsp/x86/txfm_common_sse2.h" #include "aom_ports/mem.h" @@ -2584,3 +2585,362 @@ void av1_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, default: assert(0); break; } } + +#if CONFIG_EXT_TX +static INLINE void scale_sqrt2_8x8(__m128i *in) { + // Implements 'ROUND_POWER_OF_TWO_SIGNED(input * Sqrt2, DCT_CONST_BITS)' + // for each element + const __m128i v_scale_w = _mm_set1_epi16(Sqrt2); + + const __m128i v_p0l_w = _mm_mullo_epi16(in[0], v_scale_w); + const __m128i v_p0h_w = _mm_mulhi_epi16(in[0], v_scale_w); + const __m128i v_p1l_w = _mm_mullo_epi16(in[1], v_scale_w); + const __m128i v_p1h_w = _mm_mulhi_epi16(in[1], v_scale_w); + const __m128i v_p2l_w = _mm_mullo_epi16(in[2], v_scale_w); + const __m128i v_p2h_w = _mm_mulhi_epi16(in[2], v_scale_w); + const __m128i v_p3l_w = _mm_mullo_epi16(in[3], v_scale_w); + const __m128i v_p3h_w = _mm_mulhi_epi16(in[3], v_scale_w); + const __m128i v_p4l_w = _mm_mullo_epi16(in[4], v_scale_w); + const __m128i v_p4h_w = _mm_mulhi_epi16(in[4], v_scale_w); + const __m128i v_p5l_w = _mm_mullo_epi16(in[5], v_scale_w); + const __m128i v_p5h_w = _mm_mulhi_epi16(in[5], v_scale_w); + const __m128i v_p6l_w = _mm_mullo_epi16(in[6], v_scale_w); + const __m128i v_p6h_w = _mm_mulhi_epi16(in[6], v_scale_w); + const __m128i v_p7l_w = _mm_mullo_epi16(in[7], v_scale_w); + const __m128i v_p7h_w = _mm_mulhi_epi16(in[7], v_scale_w); + + const __m128i
v_p0a_d = _mm_unpacklo_epi16(v_p0l_w, v_p0h_w); + const __m128i v_p0b_d = _mm_unpackhi_epi16(v_p0l_w, v_p0h_w); + const __m128i v_p1a_d = _mm_unpacklo_epi16(v_p1l_w, v_p1h_w); + const __m128i v_p1b_d = _mm_unpackhi_epi16(v_p1l_w, v_p1h_w); + const __m128i v_p2a_d = _mm_unpacklo_epi16(v_p2l_w, v_p2h_w); + const __m128i v_p2b_d = _mm_unpackhi_epi16(v_p2l_w, v_p2h_w); + const __m128i v_p3a_d = _mm_unpacklo_epi16(v_p3l_w, v_p3h_w); + const __m128i v_p3b_d = _mm_unpackhi_epi16(v_p3l_w, v_p3h_w); + const __m128i v_p4a_d = _mm_unpacklo_epi16(v_p4l_w, v_p4h_w); + const __m128i v_p4b_d = _mm_unpackhi_epi16(v_p4l_w, v_p4h_w); + const __m128i v_p5a_d = _mm_unpacklo_epi16(v_p5l_w, v_p5h_w); + const __m128i v_p5b_d = _mm_unpackhi_epi16(v_p5l_w, v_p5h_w); + const __m128i v_p6a_d = _mm_unpacklo_epi16(v_p6l_w, v_p6h_w); + const __m128i v_p6b_d = _mm_unpackhi_epi16(v_p6l_w, v_p6h_w); + const __m128i v_p7a_d = _mm_unpacklo_epi16(v_p7l_w, v_p7h_w); + const __m128i v_p7b_d = _mm_unpackhi_epi16(v_p7l_w, v_p7h_w); + + in[0] = _mm_packs_epi32(xx_roundn_epi32(v_p0a_d, DCT_CONST_BITS), + xx_roundn_epi32(v_p0b_d, DCT_CONST_BITS)); + in[1] = _mm_packs_epi32(xx_roundn_epi32(v_p1a_d, DCT_CONST_BITS), + xx_roundn_epi32(v_p1b_d, DCT_CONST_BITS)); + in[2] = _mm_packs_epi32(xx_roundn_epi32(v_p2a_d, DCT_CONST_BITS), + xx_roundn_epi32(v_p2b_d, DCT_CONST_BITS)); + in[3] = _mm_packs_epi32(xx_roundn_epi32(v_p3a_d, DCT_CONST_BITS), + xx_roundn_epi32(v_p3b_d, DCT_CONST_BITS)); + in[4] = _mm_packs_epi32(xx_roundn_epi32(v_p4a_d, DCT_CONST_BITS), + xx_roundn_epi32(v_p4b_d, DCT_CONST_BITS)); + in[5] = _mm_packs_epi32(xx_roundn_epi32(v_p5a_d, DCT_CONST_BITS), + xx_roundn_epi32(v_p5b_d, DCT_CONST_BITS)); + in[6] = _mm_packs_epi32(xx_roundn_epi32(v_p6a_d, DCT_CONST_BITS), + xx_roundn_epi32(v_p6b_d, DCT_CONST_BITS)); + in[7] = _mm_packs_epi32(xx_roundn_epi32(v_p7a_d, DCT_CONST_BITS), + xx_roundn_epi32(v_p7b_d, DCT_CONST_BITS)); +} + +static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in, + int 
stride, int flipud, int fliplr) { + // Load 2 8x8 blocks + const int16_t *t = input; + const int16_t *b = input + 8 * stride; + + if (flipud) { + const int16_t *const tmp = t; + t = b; + b = tmp; + } + + load_buffer_8x8(t, in, stride, flipud, fliplr); + scale_sqrt2_8x8(in); + load_buffer_8x8(b, in + 8, stride, flipud, fliplr); + scale_sqrt2_8x8(in + 8); +} + +void av1_fht8x16_sse2(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + __m128i in[16]; + + __m128i *const t = in; // Alias to top 8x8 sub block + __m128i *const b = in + 8; // Alias to bottom 8x8 sub block + + switch (tx_type) { + case DCT_DCT: + load_buffer_8x16(input, in, stride, 0, 0); + fdct16_8col(in); + array_transpose_8x8(t, t); + array_transpose_8x8(b, b); + fdct8_sse2(t); + fdct8_sse2(b); + break; + case ADST_DCT: + load_buffer_8x16(input, in, stride, 0, 0); + fadst16_8col(in); + array_transpose_8x8(t, t); + array_transpose_8x8(b, b); + fdct8_sse2(t); + fdct8_sse2(b); + break; + case DCT_ADST: + load_buffer_8x16(input, in, stride, 0, 0); + fdct16_8col(in); + array_transpose_8x8(t, t); + array_transpose_8x8(b, b); + fadst8_sse2(t); + fadst8_sse2(b); + break; + case ADST_ADST: + load_buffer_8x16(input, in, stride, 0, 0); + fadst16_8col(in); + array_transpose_8x8(t, t); + array_transpose_8x8(b, b); + fadst8_sse2(t); + fadst8_sse2(b); + break; +#if CONFIG_EXT_TX + case FLIPADST_DCT: + load_buffer_8x16(input, in, stride, 1, 0); + fadst16_8col(in); + array_transpose_8x8(t, t); + array_transpose_8x8(b, b); + fdct8_sse2(t); + fdct8_sse2(b); + break; + case DCT_FLIPADST: + load_buffer_8x16(input, in, stride, 0, 1); + fdct16_8col(in); + array_transpose_8x8(t, t); + array_transpose_8x8(b, b); + fadst8_sse2(t); + fadst8_sse2(b); + break; + case FLIPADST_FLIPADST: + load_buffer_8x16(input, in, stride, 1, 1); + fadst16_8col(in); + array_transpose_8x8(t, t); + array_transpose_8x8(b, b); + fadst8_sse2(t); + fadst8_sse2(b); + break; + case ADST_FLIPADST: + load_buffer_8x16(input, in, stride, 0, 
1); + fadst16_8col(in); + array_transpose_8x8(t, t); + array_transpose_8x8(b, b); + fadst8_sse2(t); + fadst8_sse2(b); + break; + case FLIPADST_ADST: + load_buffer_8x16(input, in, stride, 1, 0); + fadst16_8col(in); + array_transpose_8x8(t, t); + array_transpose_8x8(b, b); + fadst8_sse2(t); + fadst8_sse2(b); + break; + case IDTX: + load_buffer_8x16(input, in, stride, 0, 0); + fidtx16_8col(in); + array_transpose_8x8(t, t); + array_transpose_8x8(b, b); + fidtx8_sse2(t); + fidtx8_sse2(b); + break; + case V_DCT: + load_buffer_8x16(input, in, stride, 0, 0); + fdct16_8col(in); + array_transpose_8x8(t, t); + array_transpose_8x8(b, b); + fidtx8_sse2(t); + fidtx8_sse2(b); + break; + case H_DCT: + load_buffer_8x16(input, in, stride, 0, 0); + fidtx16_8col(in); + array_transpose_8x8(t, t); + array_transpose_8x8(b, b); + fdct8_sse2(t); + fdct8_sse2(b); + break; + case V_ADST: + load_buffer_8x16(input, in, stride, 0, 0); + fadst16_8col(in); + array_transpose_8x8(t, t); + array_transpose_8x8(b, b); + fidtx8_sse2(t); + fidtx8_sse2(b); + break; + case H_ADST: + load_buffer_8x16(input, in, stride, 0, 0); + fidtx16_8col(in); + array_transpose_8x8(t, t); + array_transpose_8x8(b, b); + fadst8_sse2(t); + fadst8_sse2(b); + break; + case V_FLIPADST: + load_buffer_8x16(input, in, stride, 1, 0); + fadst16_8col(in); + array_transpose_8x8(t, t); + array_transpose_8x8(b, b); + fidtx8_sse2(t); + fidtx8_sse2(b); + break; + case H_FLIPADST: + load_buffer_8x16(input, in, stride, 0, 1); + fidtx16_8col(in); + array_transpose_8x8(t, t); + array_transpose_8x8(b, b); + fadst8_sse2(t); + fadst8_sse2(b); + break; +#endif // CONFIG_EXT_TX + default: assert(0); break; + } + right_shift_8x8(t, 2); + right_shift_8x8(b, 2); + write_buffer_8x8(output, t, 8); + write_buffer_8x8(output + 64, b, 8); +} + +static INLINE void load_buffer_16x8(const int16_t *input, __m128i *in, + int stride, int flipud, int fliplr) { + // Load 2 8x8 blocks + const int16_t *l = input; + const int16_t *r = input + 8; + + if (fliplr) { + 
const int16_t *const tmp = l; + l = r; + r = tmp; + } + + // load first 8 columns + load_buffer_8x8(l, in, stride, flipud, fliplr); + scale_sqrt2_8x8(in); + load_buffer_8x8(r, in + 8, stride, flipud, fliplr); + scale_sqrt2_8x8(in + 8); +} + +void av1_fht16x8_sse2(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + __m128i in[16]; + + __m128i *const l = in; // Alias to left 8x8 sub block + __m128i *const r = in + 8; // Alias to right 8x8 sub block, which we store + // in the second half of the array + + switch (tx_type) { + case DCT_DCT: + load_buffer_16x8(input, in, stride, 0, 0); + fdct8_sse2(l); + fdct8_sse2(r); + fdct16_8col(in); + break; + case ADST_DCT: + load_buffer_16x8(input, in, stride, 0, 0); + fadst8_sse2(l); + fadst8_sse2(r); + fdct16_8col(in); + break; + case DCT_ADST: + load_buffer_16x8(input, in, stride, 0, 0); + fdct8_sse2(l); + fdct8_sse2(r); + fadst16_8col(in); + break; + case ADST_ADST: + load_buffer_16x8(input, in, stride, 0, 0); + fadst8_sse2(l); + fadst8_sse2(r); + fadst16_8col(in); + break; +#if CONFIG_EXT_TX + case FLIPADST_DCT: + load_buffer_16x8(input, in, stride, 1, 0); + fadst8_sse2(l); + fadst8_sse2(r); + fdct16_8col(in); + break; + case DCT_FLIPADST: + load_buffer_16x8(input, in, stride, 0, 1); + fdct8_sse2(l); + fdct8_sse2(r); + fadst16_8col(in); + break; + case FLIPADST_FLIPADST: + load_buffer_16x8(input, in, stride, 1, 1); + fadst8_sse2(l); + fadst8_sse2(r); + fadst16_8col(in); + break; + case ADST_FLIPADST: + load_buffer_16x8(input, in, stride, 0, 1); + fadst8_sse2(l); + fadst8_sse2(r); + fadst16_8col(in); + break; + case FLIPADST_ADST: + load_buffer_16x8(input, in, stride, 1, 0); + fadst8_sse2(l); + fadst8_sse2(r); + fadst16_8col(in); + break; + case IDTX: + load_buffer_16x8(input, in, stride, 0, 0); + fidtx8_sse2(l); + fidtx8_sse2(r); + fidtx16_8col(in); + break; + case V_DCT: + load_buffer_16x8(input, in, stride, 0, 0); + fdct8_sse2(l); + fdct8_sse2(r); + fidtx16_8col(in); + break; + case H_DCT: + 
load_buffer_16x8(input, in, stride, 0, 0); + fidtx8_sse2(l); + fidtx8_sse2(r); + fdct16_8col(in); + break; + case V_ADST: + load_buffer_16x8(input, in, stride, 0, 0); + fadst8_sse2(l); + fadst8_sse2(r); + fidtx16_8col(in); + break; + case H_ADST: + load_buffer_16x8(input, in, stride, 0, 0); + fidtx8_sse2(l); + fidtx8_sse2(r); + fadst16_8col(in); + break; + case V_FLIPADST: + load_buffer_16x8(input, in, stride, 1, 0); + fadst8_sse2(l); + fadst8_sse2(r); + fidtx16_8col(in); + break; + case H_FLIPADST: + load_buffer_16x8(input, in, stride, 0, 1); + fidtx8_sse2(l); + fidtx8_sse2(r); + fadst16_8col(in); + break; +#endif // CONFIG_EXT_TX + default: assert(0); break; + } + array_transpose_8x8(l, l); + array_transpose_8x8(r, r); + right_shift_8x8(l, 2); + right_shift_8x8(r, 2); + write_buffer_8x8(output, l, 16); + write_buffer_8x8(output + 8, r, 16); +} +#endif // CONFIG_EXT_TX diff --git a/test/av1_fht16x8_test.cc b/test/av1_fht16x8_test.cc new file mode 100644 index 000000000..3917bf59a --- /dev/null +++ b/test/av1_fht16x8_test.cc @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./aom_dsp_rtcd.h" +#include "./av1_rtcd.h" + +#include "aom_ports/mem.h" +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/transform_test_base.h" +#include "test/util.h" + +using libaom_test::ACMRandom; + +namespace { +typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride, + int tx_type); +using std::tr1::tuple; +using libaom_test::FhtFunc; +typedef tuple<FhtFunc, IhtFunc, int, aom_bit_depth_t, int> Ht16x8Param; + +void fht16x8_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) { + av1_fht16x8_c(in, out, stride, tx_type); +} + +class AV1Trans16x8HT : public libaom_test::TransformTestBase, + public ::testing::TestWithParam<Ht16x8Param> { + public: + virtual ~AV1Trans16x8HT() {} + + virtual void SetUp() { + fwd_txfm_ = GET_PARAM(0); + inv_txfm_ = GET_PARAM(1); + tx_type_ = GET_PARAM(2); + pitch_ = 16; + fwd_txfm_ref = fht16x8_ref; + bit_depth_ = GET_PARAM(3); + mask_ = (1 << bit_depth_) - 1; + num_coeffs_ = GET_PARAM(4); + } + virtual void TearDown() { libaom_test::ClearSystemState(); } + + protected: + void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) { + fwd_txfm_(in, out, stride, tx_type_); + } + + void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) { + inv_txfm_(out, dst, stride, tx_type_); + } + + FhtFunc fwd_txfm_; + IhtFunc inv_txfm_; +}; + +TEST_P(AV1Trans16x8HT, CoeffCheck) { RunCoeffCheck(); } + +using std::tr1::make_tuple; + +#if HAVE_SSE2 +const Ht16x8Param kArrayHt16x8Param_sse2[] = { + make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_c, 0, AOM_BITS_8, 128), + make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_c, 1, AOM_BITS_8, 128), + make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_c, 2, AOM_BITS_8, 128), + make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_c, 3, AOM_BITS_8, 128), +#if CONFIG_EXT_TX + make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_c, 4, AOM_BITS_8, 128), +
make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_c, 5, AOM_BITS_8, 128), + make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_c, 6, AOM_BITS_8, 128), + make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_c, 7, AOM_BITS_8, 128), + make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_c, 8, AOM_BITS_8, 128), + make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_c, 9, AOM_BITS_8, 128), + make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_c, 10, AOM_BITS_8, 128), + make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_c, 11, AOM_BITS_8, 128), + make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_c, 12, AOM_BITS_8, 128), + make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_c, 13, AOM_BITS_8, 128), + make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_c, 14, AOM_BITS_8, 128), + make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_c, 15, AOM_BITS_8, 128) +#endif // CONFIG_EXT_TX +}; +INSTANTIATE_TEST_CASE_P(SSE2, AV1Trans16x8HT, + ::testing::ValuesIn(kArrayHt16x8Param_sse2)); +#endif // HAVE_SSE2 + +} // namespace diff --git a/test/av1_fht8x16_test.cc b/test/av1_fht8x16_test.cc new file mode 100644 index 000000000..be50d7cbf --- /dev/null +++ b/test/av1_fht8x16_test.cc @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./aom_dsp_rtcd.h" +#include "./av1_rtcd.h" + +#include "aom_ports/mem.h" +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/transform_test_base.h" +#include "test/util.h" + +using libaom_test::ACMRandom; + +namespace { +typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride, + int tx_type); +using std::tr1::tuple; +using libaom_test::FhtFunc; +typedef tuple<FhtFunc, IhtFunc, int, aom_bit_depth_t, int> Ht8x16Param; + +void fht8x16_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) { + av1_fht8x16_c(in, out, stride, tx_type); +} + +class AV1Trans8x16HT : public libaom_test::TransformTestBase, + public ::testing::TestWithParam<Ht8x16Param> { + public: + virtual ~AV1Trans8x16HT() {} + + virtual void SetUp() { + fwd_txfm_ = GET_PARAM(0); + inv_txfm_ = GET_PARAM(1); + tx_type_ = GET_PARAM(2); + pitch_ = 8; + fwd_txfm_ref = fht8x16_ref; + bit_depth_ = GET_PARAM(3); + mask_ = (1 << bit_depth_) - 1; + num_coeffs_ = GET_PARAM(4); + } + virtual void TearDown() { libaom_test::ClearSystemState(); } + + protected: + void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) { + fwd_txfm_(in, out, stride, tx_type_); + } + + void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) { + inv_txfm_(out, dst, stride, tx_type_); + } + + FhtFunc fwd_txfm_; + IhtFunc inv_txfm_; +}; + +TEST_P(AV1Trans8x16HT, CoeffCheck) { RunCoeffCheck(); } + +using std::tr1::make_tuple; + +#if HAVE_SSE2 +const Ht8x16Param kArrayHt8x16Param_sse2[] = { + make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_c, 0, AOM_BITS_8, 128), + make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_c, 1, AOM_BITS_8, 128), + make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_c, 2, AOM_BITS_8, 128), + make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_c, 3, AOM_BITS_8, 128), +#if CONFIG_EXT_TX + make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_c, 4, AOM_BITS_8, 128), +
make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_c, 5, AOM_BITS_8, 128), + make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_c, 6, AOM_BITS_8, 128), + make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_c, 7, AOM_BITS_8, 128), + make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_c, 8, AOM_BITS_8, 128), + make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_c, 9, AOM_BITS_8, 128), + make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_c, 10, AOM_BITS_8, 128), + make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_c, 11, AOM_BITS_8, 128), + make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_c, 12, AOM_BITS_8, 128), + make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_c, 13, AOM_BITS_8, 128), + make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_c, 14, AOM_BITS_8, 128), + make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_c, 15, AOM_BITS_8, 128) +#endif // CONFIG_EXT_TX +}; +INSTANTIATE_TEST_CASE_P(SSE2, AV1Trans8x16HT, + ::testing::ValuesIn(kArrayHt8x16Param_sse2)); +#endif // HAVE_SSE2 + +} // namespace diff --git a/test/test.mk b/test/test.mk index ba41e2633..30b1b6813 100644 --- a/test/test.mk +++ b/test/test.mk @@ -132,6 +132,10 @@ LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_dct_test.cc LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_fht4x4_test.cc LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_fht8x8_test.cc LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_fht16x16_test.cc +ifeq ($(CONFIG_EXT_TX),yes) +LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_fht8x16_test.cc +LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_fht16x8_test.cc +endif LIBAOM_TEST_SRCS-$(CONFIG_ANS) += av1_ans_test.cc LIBAOM_TEST_SRCS-$(CONFIG_EXT_TILE) += av1_ext_tile_test.cc diff --git a/test/transform_test_base.h b/test/transform_test_base.h index a128b3b5f..432f39881 100644 --- a/test/transform_test_base.h +++ b/test/transform_test_base.h @@ -137,7 +137,7 @@ class TransformTestBase { // The minimum quant value is 4. 
for (int j = 0; j < num_coeffs_; ++j) { - EXPECT_EQ(output_block[j], output_ref_block[j]) + ASSERT_EQ(output_block[j], output_ref_block[j]) << "Error: not bit-exact result at index: " << j << " at test block: " << i; }