From 731720000229bcc85c0b95516dd744c9e95a5465 Mon Sep 17 00:00:00 2001
From: Yi Luo
Date: Fri, 28 Oct 2016 10:52:04 -0700
Subject: [PATCH] Hybrid inverse transforms 16x16 AVX2 optimization

- Add unit tests to verify the bit-exact result.
- User-level time reduction (EXT_TX): encoder: 3.63%, decoder: 2.36%.
- Also add SSE2 versions of tx_type = V_DCT ... H_FLIPADST for the 16x16
  inverse transform.

Change-Id: Idc6d9e8254aa536e5f18a87fa0d37c6bd551c083
---
 aom_dsp/x86/txfm_common_avx2.h         | 177 +++++++++
 av1/av1_common.mk                      |   2 +
 av1/common/av1_rtcd_defs.pl            |   4 +-
 av1/common/idct.c                      |   7 +-
 av1/common/x86/hybrid_inv_txfm_avx2.c  | 503 +++++++++++++++++++++++++
 av1/common/x86/idct_intrin_sse2.c      | 154 ++++--
 av1/encoder/x86/hybrid_fwd_txfm_avx2.c | 175 +--------
 test/av1_fht16x16_test.cc              |  52 +--
 test/transform_test_base.h             |   2 +-
 9 files changed, 805 insertions(+), 271 deletions(-)
 create mode 100644 av1/common/x86/hybrid_inv_txfm_avx2.c

diff --git a/aom_dsp/x86/txfm_common_avx2.h b/aom_dsp/x86/txfm_common_avx2.h
index 7dc17f033..39e9b8e2a 100644
--- a/aom_dsp/x86/txfm_common_avx2.h
+++ b/aom_dsp/x86/txfm_common_avx2.h
@@ -14,6 +14,8 @@
 #include <immintrin.h>
 
+#include "aom_dsp/txfm_common.h"
+
 #define pair256_set_epi16(a, b)                                            \
   _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
                    (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
@@ -24,4 +26,179 @@
   _mm256_set_epi32((int)(b), (int)(a), (int)(b), (int)(a), (int)(b), (int)(a), \
                    (int)(b), (int)(a))
 
+static INLINE void mm256_reverse_epi16(__m256i *u) {
+  const __m256i control = _mm256_set_epi16(
+      0x0100, 0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E, 0x0100,
+      0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E);
+  __m256i v = _mm256_shuffle_epi8(*u, control);
+  *u = _mm256_permute2x128_si256(v, v, 1);
+}
+
+static INLINE void mm256_transpose_16x16(__m256i *in) {
+  __m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]);
+  __m256i tr0_1 = _mm256_unpackhi_epi16(in[0], in[1]);
+  __m256i tr0_2 = _mm256_unpacklo_epi16(in[2], in[3]);
+  __m256i tr0_3 = _mm256_unpackhi_epi16(in[2], in[3]);
+  __m256i tr0_4 = _mm256_unpacklo_epi16(in[4], in[5]);
+  __m256i tr0_5 = _mm256_unpackhi_epi16(in[4], in[5]);
+  __m256i tr0_6 = _mm256_unpacklo_epi16(in[6], in[7]);
+  __m256i tr0_7 = _mm256_unpackhi_epi16(in[6], in[7]);
+
+  __m256i tr0_8 = _mm256_unpacklo_epi16(in[8], in[9]);
+  __m256i tr0_9 = _mm256_unpackhi_epi16(in[8], in[9]);
+  __m256i tr0_a = _mm256_unpacklo_epi16(in[10], in[11]);
+  __m256i tr0_b = _mm256_unpackhi_epi16(in[10], in[11]);
+  __m256i tr0_c = _mm256_unpacklo_epi16(in[12], in[13]);
+  __m256i tr0_d = _mm256_unpackhi_epi16(in[12], in[13]);
+  __m256i tr0_e = _mm256_unpacklo_epi16(in[14], in[15]);
+  __m256i tr0_f = _mm256_unpackhi_epi16(in[14], in[15]);
+
+  // 00 10 01 11 02 12 03 13 08 18 09 19 0a 1a 0b 1b
+  // 04 14 05 15 06 16 07 17 0c 1c 0d 1d 0e 1e 0f 1f
+  // 20 30 21 31 22 32 23 33 28 38 29 39 2a 3a 2b 3b
+  // 24 34 25 35 26 36 27 37 2c 3c 2d 3d 2e 3e 2f 3f
+  // 40 50 41 51 42 52 43 53 48 58 49 59 4a 5a 4b 5b
+  // 44 54 45 55 46 56 47 57 4c 5c 4d 5d 4e 5e 4f 5f
+  // 60 70 61 71 62 72 63 73 68 78 69 79 6a 7a 6b 7b
+  // 64 74 65 75 66 76 67 77 6c 7c 6d 7d 6e 7e 6f 7f
+
+  // 80 90 81 91 82 92 83 93 88 98 89 99 8a 9a 8b 9b
+  // 84 94 85 95 86 96 87 97 8c 9c 8d 9d 8e 9e 8f 9f
+  // a0 b0 a1 b1 a2 b2 a3 b3 a8 b8 a9 b9 aa ba ab bb
+  // a4 b4 a5 b5 a6 b6 a7 b7 ac bc ad bd ae be af bf
+  // c0 d0 c1 d1 c2 d2 c3 d3 c8 d8 c9 d9 ca da cb db
+  // c4 d4 c5 d5 c6 d6 c7 d7 cc dc cd dd ce de cf df
+  // e0 f0 e1 f1 e2 f2 e3 f3 e8 f8 e9 f9 ea fa eb fb
+  // e4 f4 e5 f5 e6 f6 e7 f7 ec fc ed fd ee fe ef ff
+
+  __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_2);
+  __m256i tr1_1 = _mm256_unpackhi_epi32(tr0_0, tr0_2);
+  __m256i tr1_2 = _mm256_unpacklo_epi32(tr0_1, tr0_3);
+  __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_1, tr0_3);
+  __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_6);
+  __m256i tr1_5 = _mm256_unpackhi_epi32(tr0_4, tr0_6);
+  __m256i tr1_6 = _mm256_unpacklo_epi32(tr0_5, tr0_7);
+  __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_5, tr0_7);
+
+  __m256i tr1_8 = _mm256_unpacklo_epi32(tr0_8, tr0_a);
+  __m256i tr1_9 = _mm256_unpackhi_epi32(tr0_8, tr0_a);
+  __m256i tr1_a = _mm256_unpacklo_epi32(tr0_9, tr0_b);
+  __m256i tr1_b = _mm256_unpackhi_epi32(tr0_9, tr0_b);
+  __m256i tr1_c = _mm256_unpacklo_epi32(tr0_c, tr0_e);
+  __m256i tr1_d = _mm256_unpackhi_epi32(tr0_c, tr0_e);
+  __m256i tr1_e = _mm256_unpacklo_epi32(tr0_d, tr0_f);
+  __m256i tr1_f = _mm256_unpackhi_epi32(tr0_d, tr0_f);
+
+  // 00 10 20 30 01 11 21 31 08 18 28 38 09 19 29 39
+  // 02 12 22 32 03 13 23 33 0a 1a 2a 3a 0b 1b 2b 3b
+  // 04 14 24 34 05 15 25 35 0c 1c 2c 3c 0d 1d 2d 3d
+  // 06 16 26 36 07 17 27 37 0e 1e 2e 3e 0f 1f 2f 3f
+  // 40 50 60 70 41 51 61 71 48 58 68 78 49 59 69 79
+  // 42 52 62 72 43 53 63 73 4a 5a 6a 7a 4b 5b 6b 7b
+  // 44 54 64 74 45 55 65 75 4c 5c 6c 7c 4d 5d 6d 7d
+  // 46 56 66 76 47 57 67 77 4e 5e 6e 7e 4f 5f 6f 7f
+
+  // 80 90 a0 b0 81 91 a1 b1 88 98 a8 b8 89 99 a9 b9
+  // 82 92 a2 b2 83 93 a3 b3 8a 9a aa ba 8b 9b ab bb
+  // 84 94 a4 b4 85 95 a5 b5 8c 9c ac bc 8d 9d ad bd
+  // 86 96 a6 b6 87 97 a7 b7 8e 9e ae be 8f 9f af bf
+  // c0 d0 e0 f0 c1 d1 e1 f1 c8 d8 e8 f8 c9 d9 e9 f9
+  // c2 d2 e2 f2 c3 d3 e3 f3 ca da ea fa cb db eb fb
+  // c4 d4 e4 f4 c5 d5 e5 f5 cc dc ec fc cd dd ed fd
+  // c6 d6 e6 f6 c7 d7 e7 f7 ce de ee fe cf df ef ff
+
+  tr0_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4);
+  tr0_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4);
+  tr0_2 = _mm256_unpacklo_epi64(tr1_1, tr1_5);
+  tr0_3 = _mm256_unpackhi_epi64(tr1_1, tr1_5);
+  tr0_4 = _mm256_unpacklo_epi64(tr1_2, tr1_6);
+  tr0_5 = _mm256_unpackhi_epi64(tr1_2, tr1_6);
+  tr0_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7);
+  tr0_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7);
+
+  tr0_8 = _mm256_unpacklo_epi64(tr1_8, tr1_c);
+  tr0_9 = _mm256_unpackhi_epi64(tr1_8, tr1_c);
+  tr0_a = _mm256_unpacklo_epi64(tr1_9, tr1_d);
+  tr0_b = _mm256_unpackhi_epi64(tr1_9, tr1_d);
+  tr0_c = _mm256_unpacklo_epi64(tr1_a, tr1_e);
+  tr0_d = _mm256_unpackhi_epi64(tr1_a, tr1_e);
+  tr0_e = _mm256_unpacklo_epi64(tr1_b, tr1_f);
+  tr0_f = _mm256_unpackhi_epi64(tr1_b, tr1_f);
+
+  // 00 10 20 30 40 50 60 70 08 18 28 38 48 58 68 78
+  // 01 11 21 31 41 51 61 71 09 19 29 39 49 59 69 79
+  // 02 12 22 32 42 52 62 72 0a 1a 2a 3a 4a 5a 6a 7a
+  // 03 13 23 33 43 53 63 73 0b 1b 2b 3b 4b 5b 6b 7b
+  // 04 14 24 34 44 54 64 74 0c 1c 2c 3c 4c 5c 6c 7c
+  // 05 15 25 35 45 55 65 75 0d 1d 2d 3d 4d 5d 6d 7d
+  // 06 16 26 36 46 56 66 76 0e 1e 2e 3e 4e 5e 6e 7e
+  // 07 17 27 37 47 57 67 77 0f 1f 2f 3f 4f 5f 6f 7f
+
+  // 80 90 a0 b0 c0 d0 e0 f0 88 98 a8 b8 c8 d8 e8 f8
+  // 81 91 a1 b1 c1 d1 e1 f1 89 99 a9 b9 c9 d9 e9 f9
+  // 82 92 a2 b2 c2 d2 e2 f2 8a 9a aa ba ca da ea fa
+  // 83 93 a3 b3 c3 d3 e3 f3 8b 9b ab bb cb db eb fb
+  // 84 94 a4 b4 c4 d4 e4 f4 8c 9c ac bc cc dc ec fc
+  // 85 95 a5 b5 c5 d5 e5 f5 8d 9d ad bd cd dd ed fd
+  // 86 96 a6 b6 c6 d6 e6 f6 8e 9e ae be ce de ee fe
+  // 87 97 a7 b7 c7 d7 e7 f7 8f 9f af bf cf df ef ff
+
+  in[0] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x20);  // 0010 0000
+  in[8] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x31);  // 0011 0001
+  in[1] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x20);
+  in[9] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x31);
+  in[2] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x20);
+  in[10] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x31);
+  in[3] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x20);
+  in[11] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x31);
+
+  in[4] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x20);
+  in[12] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x31);
+  in[5] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x20);
+  in[13] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x31);
+  in[6] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x20);
+  in[14] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x31);
+  in[7] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x20);
+  in[15] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x31);
+}
+
+// One half of a butterfly output: packs dct_const_round_shift(madd(a0, cospi))
+// and dct_const_round_shift(madd(a1, cospi)) back to 16 bits; a0/a1 carry the
+// lo/hi 16-bit unpacked halves of the input pair.
+static INLINE __m256i butter_fly(__m256i a0, __m256i a1, const __m256i cospi) {
+  const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
+  __m256i y0 = _mm256_madd_epi16(a0, cospi);
+  __m256i y1 = _mm256_madd_epi16(a1, cospi);
+
+  y0 = _mm256_add_epi32(y0, dct_rounding);
+  y1 = _mm256_add_epi32(y1, dct_rounding);
+  y0 = _mm256_srai_epi32(y0, DCT_CONST_BITS);
+  y1 = _mm256_srai_epi32(y1, DCT_CONST_BITS);
+
+  return _mm256_packs_epi32(y0, y1);
+}
+
+// Widening multiply of each row by c with 32-bit intermediates, i.e.
+// in[i] = ROUND_POWER_OF_TWO(2 * in[i] * c, DCT_CONST_BITS).
+static INLINE void txfm_scaling16_avx2(const int16_t c, __m256i *in) {
+  const __m256i zero = _mm256_setzero_si256();
+  const __m256i sqrt2_epi16 = _mm256_set1_epi16(c);
+  const __m256i dct_const_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
+  __m256i u0, u1;
+  int i = 0;
+
+  while (i < 16) {
+    in[i] = _mm256_slli_epi16(in[i], 1);
+
+    u0 = _mm256_unpacklo_epi16(zero, in[i]);
+    u1 = _mm256_unpackhi_epi16(zero, in[i]);
+
+    u0 = _mm256_madd_epi16(u0, sqrt2_epi16);
+    u1 = _mm256_madd_epi16(u1, sqrt2_epi16);
+
+    u0 = _mm256_add_epi32(u0, dct_const_rounding);
+    u1 = _mm256_add_epi32(u1, dct_const_rounding);
+
+    u0 = _mm256_srai_epi32(u0, DCT_CONST_BITS);
+    u1 = _mm256_srai_epi32(u1, DCT_CONST_BITS);
+    in[i] = _mm256_packs_epi32(u0, u1);
+    i++;
+  }
+}
+
 #endif  // AOM_DSP_X86_TXFM_COMMON_AVX2_H
diff --git a/av1/av1_common.mk b/av1/av1_common.mk
index 43b76ad62..18048d48b 100644
--- a/av1/av1_common.mk
+++ b/av1/av1_common.mk
@@ -122,6 +122,8 @@ AV1_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/idct8x8_msa.c
 AV1_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/idct16x16_msa.c
 AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idct_intrin_sse2.c
 
+AV1_COMMON_SRCS-$(HAVE_AVX2) += common/x86/hybrid_inv_txfm_avx2.c
+
 ifeq ($(CONFIG_AV1_ENCODER),yes)
 AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/av1_fwd_txfm_sse2.c
 AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/av1_fwd_dct32x32_impl_sse2.h
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index e66826f0a..8fac06c9b 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -114,7 +114,7 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
     specialize qw/av1_iht8x8_64_add sse2/;
 
     add_proto qw/void av1_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
-    specialize qw/av1_iht16x16_256_add sse2/;
+    specialize qw/av1_iht16x16_256_add sse2 avx2/;
   }
 } else {
   # Force C versions if CONFIG_EMULATE_HARDWARE is 1
@@ -175,7 +175,7 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
     specialize qw/av1_iht8x8_64_add sse2 neon dspr2/;
 
     add_proto qw/void av1_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
-    specialize qw/av1_iht16x16_256_add sse2 dspr2/;
+    specialize qw/av1_iht16x16_256_add sse2 avx2 dspr2/;
 
     if (aom_config("CONFIG_EXT_TX") ne "yes") {
      specialize qw/av1_iht4x4_16_add msa/;
diff --git a/av1/common/idct.c b/av1/common/idct.c
index 81581a47a..96b34ee53 100644
--- a/av1/common/idct.c
+++ b/av1/common/idct.c
@@ -984,17 +984,12 @@ void av1_inv_txfm_add_16x16(const tran_low_t *input, uint8_t *dest, int stride,
     case FLIPADST_FLIPADST:
     case ADST_FLIPADST:
     case FLIPADST_ADST:
-      av1_iht16x16_256_add(input, dest, stride, tx_type);
-      break;
     case V_DCT:
     case H_DCT:
     case V_ADST:
     case H_ADST:
     case V_FLIPADST:
-    case H_FLIPADST:
-      // Use C version since DST only exists in C code
-      av1_iht16x16_256_add_c(input, dest, stride, tx_type);
-      break;
+    case H_FLIPADST: av1_iht16x16_256_add(input, dest, stride, tx_type); break;
     case IDTX: inv_idtx_add_c(input, dest, stride, 16, tx_type); break;
 #endif  // CONFIG_EXT_TX
     default: assert(0); break;
diff --git a/av1/common/x86/hybrid_inv_txfm_avx2.c b/av1/common/x86/hybrid_inv_txfm_avx2.c
new file mode 100644
index 000000000..754152c43
--- /dev/null
+++ b/av1/common/x86/hybrid_inv_txfm_avx2.c
@@ -0,0 +1,503 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>  // avx2
+
+#include "./aom_config.h"
+#include "./av1_rtcd.h"
+
+#include "aom_dsp/x86/txfm_common_avx2.h"
+
+static INLINE void load_coeff(const tran_low_t *coeff, __m256i *in) {
+#if CONFIG_AOM_HIGHBITDEPTH
+  *in = _mm256_setr_epi16(
+      (int16_t)coeff[0], (int16_t)coeff[1], (int16_t)coeff[2],
+      (int16_t)coeff[3], (int16_t)coeff[4], (int16_t)coeff[5],
+      (int16_t)coeff[6], (int16_t)coeff[7], (int16_t)coeff[8],
+      (int16_t)coeff[9], (int16_t)coeff[10], (int16_t)coeff[11],
+      (int16_t)coeff[12], (int16_t)coeff[13], (int16_t)coeff[14],
+      (int16_t)coeff[15]);
+#else
+  *in = _mm256_loadu_si256((const __m256i *)coeff);
+#endif
+}
+
+static void load_buffer_16x16(const tran_low_t *coeff, __m256i *in) {
+  int i = 0;
+  while (i < 16) {
+    load_coeff(coeff + (i << 4), &in[i]);
+    i += 1;
+  }
+}
+
+static void recon_and_store(const __m256i *res, uint8_t *output) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i x = _mm_loadu_si128((__m128i const *)output);
+  __m128i p0 = _mm_unpacklo_epi8(x, zero);
+  __m128i p1 = _mm_unpackhi_epi8(x, zero);
+
+  p0 = _mm_add_epi16(p0, _mm256_castsi256_si128(*res));
+  p1 = _mm_add_epi16(p1, _mm256_extractf128_si256(*res, 1));
+  x = _mm_packus_epi16(p0, p1);
+  _mm_storeu_si128((__m128i *)output, x);
+}
+
+// The 2-D inverse transform output carries a scale of 2^IDCT_ROUNDING_POS;
+// write_buffer_16x16 rounds that scale off before adding to the prediction.
+#define IDCT_ROUNDING_POS (6)
+
+static void write_buffer_16x16(__m256i *in, const int stride,
+                               uint8_t *output) {
+  const __m256i rounding = _mm256_set1_epi16(1 << (IDCT_ROUNDING_POS - 1));
+  int i = 0;
+
+  while (i < 16) {
+    in[i] = _mm256_add_epi16(in[i], rounding);
+    in[i] = _mm256_srai_epi16(in[i], IDCT_ROUNDING_POS);
+    recon_and_store(&in[i], output + i * stride);
+    i += 1;
+  }
+}
+
+static INLINE void unpack_butter_fly(const __m256i *a0, const __m256i *a1,
+                                     const __m256i *c0, const __m256i *c1,
+                                     __m256i *b0, __m256i *b1) {
+  __m256i x0, x1;
+  x0 = _mm256_unpacklo_epi16(*a0, *a1);
+  x1 = _mm256_unpackhi_epi16(*a0, *a1);
+  *b0 = butter_fly(x0, x1, *c0);
+  *b1 = butter_fly(x0, x1, *c1);
+}
+
+static void idct16_avx2(__m256i *in) {
+  const
__m256i cospi_p30_m02 = pair256_set_epi16(cospi_30_64, -cospi_2_64); + const __m256i cospi_p02_p30 = pair256_set_epi16(cospi_2_64, cospi_30_64); + const __m256i cospi_p14_m18 = pair256_set_epi16(cospi_14_64, -cospi_18_64); + const __m256i cospi_p18_p14 = pair256_set_epi16(cospi_18_64, cospi_14_64); + const __m256i cospi_p22_m10 = pair256_set_epi16(cospi_22_64, -cospi_10_64); + const __m256i cospi_p10_p22 = pair256_set_epi16(cospi_10_64, cospi_22_64); + const __m256i cospi_p06_m26 = pair256_set_epi16(cospi_6_64, -cospi_26_64); + const __m256i cospi_p26_p06 = pair256_set_epi16(cospi_26_64, cospi_6_64); + const __m256i cospi_p28_m04 = pair256_set_epi16(cospi_28_64, -cospi_4_64); + const __m256i cospi_p04_p28 = pair256_set_epi16(cospi_4_64, cospi_28_64); + const __m256i cospi_p12_m20 = pair256_set_epi16(cospi_12_64, -cospi_20_64); + const __m256i cospi_p20_p12 = pair256_set_epi16(cospi_20_64, cospi_12_64); + const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64); + const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64); + const __m256i cospi_p24_m08 = pair256_set_epi16(cospi_24_64, -cospi_8_64); + const __m256i cospi_p08_p24 = pair256_set_epi16(cospi_8_64, cospi_24_64); + const __m256i cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64); + const __m256i cospi_p24_p08 = pair256_set_epi16(cospi_24_64, cospi_8_64); + const __m256i cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64); + __m256i u0, u1, u2, u3, u4, u5, u6, u7; + __m256i v0, v1, v2, v3, v4, v5, v6, v7; + __m256i t0, t1, t2, t3, t4, t5, t6, t7; + + // stage 1, (0-7) + u0 = in[0]; + u1 = in[8]; + u2 = in[4]; + u3 = in[12]; + u4 = in[2]; + u5 = in[10]; + u6 = in[6]; + u7 = in[14]; + + // stage 2, (0-7) + // stage 3, (0-7) + t0 = u0; + t1 = u1; + t2 = u2; + t3 = u3; + unpack_butter_fly(&u4, &u7, &cospi_p28_m04, &cospi_p04_p28, &t4, &t7); + unpack_butter_fly(&u5, &u6, &cospi_p12_m20, &cospi_p20_p12, &t5, &t6); + + // stage 4, (0-7) + unpack_butter_fly(&t0, &t1, &cospi_p16_p16, &cospi_p16_m16, &u0, &u1); + unpack_butter_fly(&t2, &t3, &cospi_p24_m08, &cospi_p08_p24, &u2, &u3); + u4 = _mm256_add_epi16(t4, t5); + u5 = _mm256_sub_epi16(t4, t5); + u6 = _mm256_sub_epi16(t7, t6); + u7 = _mm256_add_epi16(t7, t6); + + // stage 5, (0-7) + t0 = _mm256_add_epi16(u0, u3); + t1 = _mm256_add_epi16(u1, u2); + t2 = _mm256_sub_epi16(u1, u2); + t3 = _mm256_sub_epi16(u0, u3); + t4 = u4; + t7 = u7; + unpack_butter_fly(&u6, &u5, &cospi_p16_m16, &cospi_p16_p16, &t5, &t6); + + // stage 6, (0-7) + u0 = _mm256_add_epi16(t0, t7); + u1 = _mm256_add_epi16(t1, t6); + u2 = _mm256_add_epi16(t2, t5); + u3 = _mm256_add_epi16(t3, t4); + u4 = _mm256_sub_epi16(t3, t4); + u5 = _mm256_sub_epi16(t2, t5); + u6 = _mm256_sub_epi16(t1, t6); + u7 = _mm256_sub_epi16(t0, t7); + + // stage 1, (8-15) + v0 = in[1]; + v1 = in[9]; + v2 = in[5]; + v3 = in[13]; + v4 = in[3]; + v5 = in[11]; + v6 = in[7]; + v7 = in[15]; + + // stage 2, (8-15) + unpack_butter_fly(&v0, &v7, &cospi_p30_m02, &cospi_p02_p30, &t0, &t7); + unpack_butter_fly(&v1, &v6, &cospi_p14_m18, &cospi_p18_p14, &t1, &t6); + unpack_butter_fly(&v2, &v5, &cospi_p22_m10, &cospi_p10_p22, &t2, &t5); + unpack_butter_fly(&v3, &v4, &cospi_p06_m26, &cospi_p26_p06, &t3, &t4); + + // stage 3, (8-15) + v0 = _mm256_add_epi16(t0, t1); + v1 = _mm256_sub_epi16(t0, t1); + v2 = _mm256_sub_epi16(t3, t2); + v3 = _mm256_add_epi16(t2, t3); + v4 = _mm256_add_epi16(t4, t5); + v5 = _mm256_sub_epi16(t4, t5); + v6 = _mm256_sub_epi16(t7, t6); + v7 = _mm256_add_epi16(t6, t7); + + // stage 4, (8-15) + t0 = v0; 
+ t7 = v7; + t3 = v3; + t4 = v4; + unpack_butter_fly(&v1, &v6, &cospi_m08_p24, &cospi_p24_p08, &t1, &t6); + unpack_butter_fly(&v2, &v5, &cospi_m24_m08, &cospi_m08_p24, &t2, &t5); + + // stage 5, (8-15) + v0 = _mm256_add_epi16(t0, t3); + v1 = _mm256_add_epi16(t1, t2); + v2 = _mm256_sub_epi16(t1, t2); + v3 = _mm256_sub_epi16(t0, t3); + v4 = _mm256_sub_epi16(t7, t4); + v5 = _mm256_sub_epi16(t6, t5); + v6 = _mm256_add_epi16(t6, t5); + v7 = _mm256_add_epi16(t7, t4); + + // stage 6, (8-15) + t0 = v0; + t1 = v1; + t6 = v6; + t7 = v7; + unpack_butter_fly(&v5, &v2, &cospi_p16_m16, &cospi_p16_p16, &t2, &t5); + unpack_butter_fly(&v4, &v3, &cospi_p16_m16, &cospi_p16_p16, &t3, &t4); + + // stage 7 + in[0] = _mm256_add_epi16(u0, t7); + in[1] = _mm256_add_epi16(u1, t6); + in[2] = _mm256_add_epi16(u2, t5); + in[3] = _mm256_add_epi16(u3, t4); + in[4] = _mm256_add_epi16(u4, t3); + in[5] = _mm256_add_epi16(u5, t2); + in[6] = _mm256_add_epi16(u6, t1); + in[7] = _mm256_add_epi16(u7, t0); + in[8] = _mm256_sub_epi16(u7, t0); + in[9] = _mm256_sub_epi16(u6, t1); + in[10] = _mm256_sub_epi16(u5, t2); + in[11] = _mm256_sub_epi16(u4, t3); + in[12] = _mm256_sub_epi16(u3, t4); + in[13] = _mm256_sub_epi16(u2, t5); + in[14] = _mm256_sub_epi16(u1, t6); + in[15] = _mm256_sub_epi16(u0, t7); +} + +static void idct16(__m256i *in) { + mm256_transpose_16x16(in); + idct16_avx2(in); +} + +static INLINE void butterfly_32b(const __m256i *a0, const __m256i *a1, + const __m256i *c0, const __m256i *c1, + __m256i *b) { + __m256i x0, x1; + x0 = _mm256_unpacklo_epi16(*a0, *a1); + x1 = _mm256_unpackhi_epi16(*a0, *a1); + b[0] = _mm256_madd_epi16(x0, *c0); + b[1] = _mm256_madd_epi16(x1, *c0); + b[2] = _mm256_madd_epi16(x0, *c1); + b[3] = _mm256_madd_epi16(x1, *c1); +} + +static INLINE void group_rounding(__m256i *a, int num) { + const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING); + int i; + for (i = 0; i < num; ++i) { + a[i] = _mm256_add_epi32(a[i], dct_rounding); + a[i] = _mm256_srai_epi32(a[i], DCT_CONST_BITS); + } +} + +static INLINE void add_rnd(const __m256i *a, const __m256i *b, __m256i *out) { + __m256i x[4]; + x[0] = _mm256_add_epi32(a[0], b[0]); + x[1] = _mm256_add_epi32(a[1], b[1]); + x[2] = _mm256_add_epi32(a[2], b[2]); + x[3] = _mm256_add_epi32(a[3], b[3]); + + group_rounding(x, 4); + + out[0] = _mm256_packs_epi32(x[0], x[1]); + out[1] = _mm256_packs_epi32(x[2], x[3]); +} + +static INLINE void sub_rnd(const __m256i *a, const __m256i *b, __m256i *out) { + __m256i x[4]; + x[0] = _mm256_sub_epi32(a[0], b[0]); + x[1] = _mm256_sub_epi32(a[1], b[1]); + x[2] = _mm256_sub_epi32(a[2], b[2]); + x[3] = _mm256_sub_epi32(a[3], b[3]); + + group_rounding(x, 4); + + out[0] = _mm256_packs_epi32(x[0], x[1]); + out[1] = _mm256_packs_epi32(x[2], x[3]); +} + +static INLINE void butterfly_rnd(__m256i *a, __m256i *out) { + group_rounding(a, 4); + out[0] = _mm256_packs_epi32(a[0], a[1]); + out[1] = _mm256_packs_epi32(a[2], a[3]); +} + +static void iadst16_avx2(__m256i *in) { + const __m256i cospi_p01_p31 = pair256_set_epi16(cospi_1_64, cospi_31_64); + const __m256i cospi_p31_m01 = pair256_set_epi16(cospi_31_64, -cospi_1_64); + const __m256i cospi_p05_p27 = pair256_set_epi16(cospi_5_64, cospi_27_64); + const __m256i cospi_p27_m05 = pair256_set_epi16(cospi_27_64, -cospi_5_64); + const __m256i cospi_p09_p23 = pair256_set_epi16(cospi_9_64, cospi_23_64); + const __m256i cospi_p23_m09 = pair256_set_epi16(cospi_23_64, -cospi_9_64); + const __m256i cospi_p13_p19 = pair256_set_epi16(cospi_13_64, cospi_19_64); + const __m256i cospi_p19_m13 = 
pair256_set_epi16(cospi_19_64, -cospi_13_64); + const __m256i cospi_p17_p15 = pair256_set_epi16(cospi_17_64, cospi_15_64); + const __m256i cospi_p15_m17 = pair256_set_epi16(cospi_15_64, -cospi_17_64); + const __m256i cospi_p21_p11 = pair256_set_epi16(cospi_21_64, cospi_11_64); + const __m256i cospi_p11_m21 = pair256_set_epi16(cospi_11_64, -cospi_21_64); + const __m256i cospi_p25_p07 = pair256_set_epi16(cospi_25_64, cospi_7_64); + const __m256i cospi_p07_m25 = pair256_set_epi16(cospi_7_64, -cospi_25_64); + const __m256i cospi_p29_p03 = pair256_set_epi16(cospi_29_64, cospi_3_64); + const __m256i cospi_p03_m29 = pair256_set_epi16(cospi_3_64, -cospi_29_64); + const __m256i cospi_p04_p28 = pair256_set_epi16(cospi_4_64, cospi_28_64); + const __m256i cospi_p28_m04 = pair256_set_epi16(cospi_28_64, -cospi_4_64); + const __m256i cospi_p20_p12 = pair256_set_epi16(cospi_20_64, cospi_12_64); + const __m256i cospi_p12_m20 = pair256_set_epi16(cospi_12_64, -cospi_20_64); + const __m256i cospi_m28_p04 = pair256_set_epi16(-cospi_28_64, cospi_4_64); + const __m256i cospi_m12_p20 = pair256_set_epi16(-cospi_12_64, cospi_20_64); + const __m256i cospi_p08_p24 = pair256_set_epi16(cospi_8_64, cospi_24_64); + const __m256i cospi_p24_m08 = pair256_set_epi16(cospi_24_64, -cospi_8_64); + const __m256i cospi_m24_p08 = pair256_set_epi16(-cospi_24_64, cospi_8_64); + const __m256i cospi_m16_m16 = _mm256_set1_epi16((int16_t)-cospi_16_64); + const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64); + const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64); + const __m256i cospi_m16_p16 = pair256_set_epi16(-cospi_16_64, cospi_16_64); + const __m256i zero = _mm256_setzero_si256(); + __m256i x[16], s[16]; + __m256i u[4], v[4]; + + // stage 1 + butterfly_32b(&in[15], &in[0], &cospi_p01_p31, &cospi_p31_m01, u); + butterfly_32b(&in[7], &in[8], &cospi_p17_p15, &cospi_p15_m17, v); + add_rnd(u, v, &x[0]); + sub_rnd(u, v, &x[8]); + + butterfly_32b(&in[13], &in[2], &cospi_p05_p27, &cospi_p27_m05, u); + butterfly_32b(&in[5], &in[10], &cospi_p21_p11, &cospi_p11_m21, v); + add_rnd(u, v, &x[2]); + sub_rnd(u, v, &x[10]); + + butterfly_32b(&in[11], &in[4], &cospi_p09_p23, &cospi_p23_m09, u); + butterfly_32b(&in[3], &in[12], &cospi_p25_p07, &cospi_p07_m25, v); + add_rnd(u, v, &x[4]); + sub_rnd(u, v, &x[12]); + + butterfly_32b(&in[9], &in[6], &cospi_p13_p19, &cospi_p19_m13, u); + butterfly_32b(&in[1], &in[14], &cospi_p29_p03, &cospi_p03_m29, v); + add_rnd(u, v, &x[6]); + sub_rnd(u, v, &x[14]); + + // stage 2 + s[0] = _mm256_add_epi16(x[0], x[4]); + s[1] = _mm256_add_epi16(x[1], x[5]); + s[2] = _mm256_add_epi16(x[2], x[6]); + s[3] = _mm256_add_epi16(x[3], x[7]); + s[4] = _mm256_sub_epi16(x[0], x[4]); + s[5] = _mm256_sub_epi16(x[1], x[5]); + s[6] = _mm256_sub_epi16(x[2], x[6]); + s[7] = _mm256_sub_epi16(x[3], x[7]); + butterfly_32b(&x[8], &x[9], &cospi_p04_p28, &cospi_p28_m04, u); + butterfly_32b(&x[12], &x[13], &cospi_m28_p04, &cospi_p04_p28, v); + add_rnd(u, v, &s[8]); + sub_rnd(u, v, &s[12]); + + butterfly_32b(&x[10], &x[11], &cospi_p20_p12, &cospi_p12_m20, u); + butterfly_32b(&x[14], &x[15], &cospi_m12_p20, &cospi_p20_p12, v); + add_rnd(u, v, &s[10]); + sub_rnd(u, v, &s[14]); + + // stage 3 + x[0] = _mm256_add_epi16(s[0], s[2]); + x[1] = _mm256_add_epi16(s[1], s[3]); + x[2] = _mm256_sub_epi16(s[0], s[2]); + x[3] = _mm256_sub_epi16(s[1], s[3]); + + x[8] = _mm256_add_epi16(s[8], s[10]); + x[9] = _mm256_add_epi16(s[9], s[11]); + x[10] = _mm256_sub_epi16(s[8], s[10]); + x[11] = _mm256_sub_epi16(s[9], s[11]); + + 
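// Remaining stage-3 outputs are rotations: (s4,s5)/(s6,s7) and
+  // (s12,s13)/(s14,s15) pair with the +/-(cospi_8, cospi_24) constants;
+  // butterfly_32b leaves the madd products in 32 bits so add_rnd/sub_rnd
+  // apply dct_const_round_shift only once before packing back to 16 bits.
+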
butterfly_32b(&s[4], &s[5], &cospi_p08_p24, &cospi_p24_m08, u); + butterfly_32b(&s[6], &s[7], &cospi_m24_p08, &cospi_p08_p24, v); + add_rnd(u, v, &x[4]); + sub_rnd(u, v, &x[6]); + + butterfly_32b(&s[12], &s[13], &cospi_p08_p24, &cospi_p24_m08, u); + butterfly_32b(&s[14], &s[15], &cospi_m24_p08, &cospi_p08_p24, v); + add_rnd(u, v, &x[12]); + sub_rnd(u, v, &x[14]); + + // stage 4 + butterfly_32b(&x[2], &x[3], &cospi_m16_m16, &cospi_p16_m16, u); + butterfly_32b(&x[6], &x[7], &cospi_p16_p16, &cospi_m16_p16, v); + butterfly_rnd(u, &x[2]); + butterfly_rnd(v, &x[6]); + + butterfly_32b(&x[10], &x[11], &cospi_p16_p16, &cospi_m16_p16, u); + butterfly_32b(&x[14], &x[15], &cospi_m16_m16, &cospi_p16_m16, v); + butterfly_rnd(u, &x[10]); + butterfly_rnd(v, &x[14]); + + in[0] = x[0]; + in[1] = _mm256_sub_epi16(zero, x[8]); + in[2] = x[12]; + in[3] = _mm256_sub_epi16(zero, x[4]); + in[4] = x[6]; + in[5] = x[14]; + in[6] = x[10]; + in[7] = x[2]; + in[8] = x[3]; + in[9] = x[11]; + in[10] = x[15]; + in[11] = x[7]; + in[12] = x[5]; + in[13] = _mm256_sub_epi16(zero, x[13]); + in[14] = x[9]; + in[15] = _mm256_sub_epi16(zero, x[1]); +} + +static void iadst16(__m256i *in) { + mm256_transpose_16x16(in); + iadst16_avx2(in); +} + +#if CONFIG_EXT_TX +static void flip_row(__m256i *in, int rows) { + int i; + for (i = 0; i < rows; ++i) { + mm256_reverse_epi16(&in[i]); + } +} + +static void flip_col(uint8_t **dest, int *stride, int rows) { + *dest = *dest + (rows - 1) * (*stride); + *stride = -*stride; +} + +static void iidtx16(__m256i *in) { + mm256_transpose_16x16(in); + txfm_scaling16_avx2(Sqrt2, in); +} +#endif + +void av1_iht16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest, + int stride, int tx_type) { + __m256i in[16]; + + load_buffer_16x16(input, in); + switch (tx_type) { + case DCT_DCT: + idct16(in); + idct16(in); + break; + case ADST_DCT: + idct16(in); + iadst16(in); + break; + case DCT_ADST: + iadst16(in); + idct16(in); + break; + case ADST_ADST: + iadst16(in); + iadst16(in); + break; +#if CONFIG_EXT_TX + case FLIPADST_DCT: + idct16(in); + iadst16(in); + flip_col(&dest, &stride, 16); + break; + case DCT_FLIPADST: + iadst16(in); + idct16(in); + flip_row(in, 16); + break; + case FLIPADST_FLIPADST: + iadst16(in); + iadst16(in); + flip_row(in, 16); + flip_col(&dest, &stride, 16); + break; + case ADST_FLIPADST: + iadst16(in); + iadst16(in); + flip_row(in, 16); + break; + case FLIPADST_ADST: + iadst16(in); + iadst16(in); + flip_col(&dest, &stride, 16); + break; + case V_DCT: + iidtx16(in); + idct16(in); + break; + case H_DCT: + idct16(in); + iidtx16(in); + break; + case V_ADST: + iidtx16(in); + iadst16(in); + break; + case H_ADST: + iadst16(in); + iidtx16(in); + break; + case V_FLIPADST: + iidtx16(in); + iadst16(in); + flip_col(&dest, &stride, 16); + break; + case H_FLIPADST: + iadst16(in); + iidtx16(in); + flip_row(in, 16); + break; +#endif // CONFIG_EXT_TX + default: assert(0); break; + } + write_buffer_16x16(in, stride, dest); +} diff --git a/av1/common/x86/idct_intrin_sse2.c b/av1/common/x86/idct_intrin_sse2.c index 10102e7ff..5d28a2831 100644 --- a/av1/common/x86/idct_intrin_sse2.c +++ b/av1/common/x86/idct_intrin_sse2.c @@ -242,69 +242,6 @@ void av1_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, RECON_AND_STORE(dest + 7 * stride, in[7]); } -void av1_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride, int tx_type) { - __m128i in[32]; - __m128i *in0 = &in[0]; - __m128i *in1 = &in[16]; - - load_buffer_8x16(input, in0); - input += 8; - load_buffer_8x16(input, 
in1); - - switch (tx_type) { - case DCT_DCT: - aom_idct16_sse2(in0, in1); - aom_idct16_sse2(in0, in1); - break; - case ADST_DCT: - aom_idct16_sse2(in0, in1); - aom_iadst16_sse2(in0, in1); - break; - case DCT_ADST: - aom_iadst16_sse2(in0, in1); - aom_idct16_sse2(in0, in1); - break; - case ADST_ADST: - aom_iadst16_sse2(in0, in1); - aom_iadst16_sse2(in0, in1); - break; -#if CONFIG_EXT_TX - case FLIPADST_DCT: - aom_idct16_sse2(in0, in1); - aom_iadst16_sse2(in0, in1); - FLIPUD_PTR(dest, stride, 16); - break; - case DCT_FLIPADST: - aom_iadst16_sse2(in0, in1); - aom_idct16_sse2(in0, in1); - FLIPLR_16x16(in0, in1); - break; - case FLIPADST_FLIPADST: - aom_iadst16_sse2(in0, in1); - aom_iadst16_sse2(in0, in1); - FLIPUD_PTR(dest, stride, 16); - FLIPLR_16x16(in0, in1); - break; - case ADST_FLIPADST: - aom_iadst16_sse2(in0, in1); - aom_iadst16_sse2(in0, in1); - FLIPLR_16x16(in0, in1); - break; - case FLIPADST_ADST: - aom_iadst16_sse2(in0, in1); - aom_iadst16_sse2(in0, in1); - FLIPUD_PTR(dest, stride, 16); - break; -#endif // CONFIG_EXT_TX - default: assert(0); break; - } - - write_buffer_8x16(dest, in0, stride); - dest += 8; - write_buffer_8x16(dest, in1, stride); -} - #if CONFIG_EXT_TX static void iidtx16_8col(__m128i *in) { const __m128i k__zero_epi16 = _mm_set1_epi16((int16_t)0); @@ -501,7 +438,98 @@ static void iidtx16_sse2(__m128i *in0, __m128i *in1) { iidtx16_8col(in0); iidtx16_8col(in1); } +#endif +void av1_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride, int tx_type) { + __m128i in[32]; + __m128i *in0 = &in[0]; + __m128i *in1 = &in[16]; + + load_buffer_8x16(input, in0); + input += 8; + load_buffer_8x16(input, in1); + + switch (tx_type) { + case DCT_DCT: + aom_idct16_sse2(in0, in1); + aom_idct16_sse2(in0, in1); + break; + case ADST_DCT: + aom_idct16_sse2(in0, in1); + aom_iadst16_sse2(in0, in1); + break; + case DCT_ADST: + aom_iadst16_sse2(in0, in1); + aom_idct16_sse2(in0, in1); + break; + case ADST_ADST: + aom_iadst16_sse2(in0, in1); + aom_iadst16_sse2(in0, in1); + break; +#if CONFIG_EXT_TX + case FLIPADST_DCT: + aom_idct16_sse2(in0, in1); + aom_iadst16_sse2(in0, in1); + FLIPUD_PTR(dest, stride, 16); + break; + case DCT_FLIPADST: + aom_iadst16_sse2(in0, in1); + aom_idct16_sse2(in0, in1); + FLIPLR_16x16(in0, in1); + break; + case FLIPADST_FLIPADST: + aom_iadst16_sse2(in0, in1); + aom_iadst16_sse2(in0, in1); + FLIPUD_PTR(dest, stride, 16); + FLIPLR_16x16(in0, in1); + break; + case ADST_FLIPADST: + aom_iadst16_sse2(in0, in1); + aom_iadst16_sse2(in0, in1); + FLIPLR_16x16(in0, in1); + break; + case FLIPADST_ADST: + aom_iadst16_sse2(in0, in1); + aom_iadst16_sse2(in0, in1); + FLIPUD_PTR(dest, stride, 16); + break; + case V_DCT: + iidtx16_sse2(in0, in1); + aom_idct16_sse2(in0, in1); + break; + case H_DCT: + aom_idct16_sse2(in0, in1); + iidtx16_sse2(in0, in1); + break; + case V_ADST: + iidtx16_sse2(in0, in1); + aom_iadst16_sse2(in0, in1); + break; + case H_ADST: + aom_iadst16_sse2(in0, in1); + iidtx16_sse2(in0, in1); + break; + case V_FLIPADST: + iidtx16_sse2(in0, in1); + aom_iadst16_sse2(in0, in1); + FLIPUD_PTR(dest, stride, 16); + break; + case H_FLIPADST: + aom_iadst16_sse2(in0, in1); + iidtx16_sse2(in0, in1); + FLIPLR_16x16(in0, in1); + break; +#endif // CONFIG_EXT_TX + default: assert(0); break; + } + + write_buffer_8x16(dest, in0, stride); + dest += 8; + write_buffer_8x16(dest, in1, stride); +} + +#if CONFIG_EXT_TX static void iidtx8_sse2(__m128i *in) { in[0] = _mm_slli_epi16(in[0], 1); in[1] = _mm_slli_epi16(in[1], 1); diff --git a/av1/encoder/x86/hybrid_fwd_txfm_avx2.c 
b/av1/encoder/x86/hybrid_fwd_txfm_avx2.c index f4bd1427d..77ae724f6 100644 --- a/av1/encoder/x86/hybrid_fwd_txfm_avx2.c +++ b/av1/encoder/x86/hybrid_fwd_txfm_avx2.c @@ -18,14 +18,6 @@ #include "aom_dsp/txfm_common.h" #include "aom_dsp/x86/txfm_common_avx2.h" -static INLINE void mm256_reverse_epi16(__m256i *u) { - const __m256i control = _mm256_set_epi16( - 0x0100, 0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E, 0x0100, - 0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E); - __m256i v = _mm256_shuffle_epi8(*u, control); - *u = _mm256_permute2x128_si256(v, v, 1); -} - static int32_t get_16x16_sum(const int16_t *input, int stride) { __m256i r0, r1, r2, r3, u0, u1; __m256i zero = _mm256_setzero_si256(); @@ -71,134 +63,6 @@ void aom_fdct16x16_1_avx2(const int16_t *input, tran_low_t *output, _mm256_zeroupper(); } -static void mm256_transpose_16x16(__m256i *in) { - __m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]); - __m256i tr0_1 = _mm256_unpackhi_epi16(in[0], in[1]); - __m256i tr0_2 = _mm256_unpacklo_epi16(in[2], in[3]); - __m256i tr0_3 = _mm256_unpackhi_epi16(in[2], in[3]); - __m256i tr0_4 = _mm256_unpacklo_epi16(in[4], in[5]); - __m256i tr0_5 = _mm256_unpackhi_epi16(in[4], in[5]); - __m256i tr0_6 = _mm256_unpacklo_epi16(in[6], in[7]); - __m256i tr0_7 = _mm256_unpackhi_epi16(in[6], in[7]); - - __m256i tr0_8 = _mm256_unpacklo_epi16(in[8], in[9]); - __m256i tr0_9 = _mm256_unpackhi_epi16(in[8], in[9]); - __m256i tr0_a = _mm256_unpacklo_epi16(in[10], in[11]); - __m256i tr0_b = _mm256_unpackhi_epi16(in[10], in[11]); - __m256i tr0_c = _mm256_unpacklo_epi16(in[12], in[13]); - __m256i tr0_d = _mm256_unpackhi_epi16(in[12], in[13]); - __m256i tr0_e = _mm256_unpacklo_epi16(in[14], in[15]); - __m256i tr0_f = _mm256_unpackhi_epi16(in[14], in[15]); - - // 00 10 01 11 02 12 03 13 08 18 09 19 0a 1a 0b 1b - // 04 14 05 15 06 16 07 17 0c 1c 0d 1d 0e 1e 0f 1f - // 20 30 21 31 22 32 23 33 28 38 29 39 2a 3a 2b 3b - // 24 34 25 35 26 36 27 37 2c 3c 2d 3d 2e 3e 2f 3f - // 40 50 41 51 42 52 43 53 48 58 49 59 4a 5a 4b 5b - // 44 54 45 55 46 56 47 57 4c 5c 4d 5d 4e 5e 4f 5f - // 60 70 61 71 62 72 63 73 68 78 69 79 6a 7a 6b 7b - // 64 74 65 75 66 76 67 77 6c 7c 6d 7d 6e 7e 6f 7f - - // 80 90 81 91 82 92 83 93 88 98 89 99 8a 9a 8b 9b - // 84 94 85 95 86 96 87 97 8c 9c 8d 9d 8e 9e 8f 9f - // a0 b0 a1 b1 a2 b2 a3 b3 a8 b8 a9 b9 aa ba ab bb - // a4 b4 a5 b5 a6 b6 a7 b7 ac bc ad bd ae be af bf - // c0 d0 c1 d1 c2 d2 c3 d3 c8 d8 c9 d9 ca da cb db - // c4 d4 c5 d5 c6 d6 c7 d7 cc dc cd dd ce de cf df - // e0 f0 e1 f1 e2 f2 e3 f3 e8 f8 e9 f9 ea fa eb fb - // e4 f4 e5 f5 e6 f6 e7 f7 ec fc ed fd ee fe ef ff - - __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_2); - __m256i tr1_1 = _mm256_unpackhi_epi32(tr0_0, tr0_2); - __m256i tr1_2 = _mm256_unpacklo_epi32(tr0_1, tr0_3); - __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_1, tr0_3); - __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_6); - __m256i tr1_5 = _mm256_unpackhi_epi32(tr0_4, tr0_6); - __m256i tr1_6 = _mm256_unpacklo_epi32(tr0_5, tr0_7); - __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_5, tr0_7); - - __m256i tr1_8 = _mm256_unpacklo_epi32(tr0_8, tr0_a); - __m256i tr1_9 = _mm256_unpackhi_epi32(tr0_8, tr0_a); - __m256i tr1_a = _mm256_unpacklo_epi32(tr0_9, tr0_b); - __m256i tr1_b = _mm256_unpackhi_epi32(tr0_9, tr0_b); - __m256i tr1_c = _mm256_unpacklo_epi32(tr0_c, tr0_e); - __m256i tr1_d = _mm256_unpackhi_epi32(tr0_c, tr0_e); - __m256i tr1_e = _mm256_unpacklo_epi32(tr0_d, tr0_f); - __m256i tr1_f = _mm256_unpackhi_epi32(tr0_d, tr0_f); - - // 00 10 20 30 01 11 21 31 08 18 28 38 
09 19 29 39 - // 02 12 22 32 03 13 23 33 0a 1a 2a 3a 0b 1b 2b 3b - // 04 14 24 34 05 15 25 35 0c 1c 2c 3c 0d 1d 2d 3d - // 06 16 26 36 07 17 27 37 0e 1e 2e 3e 0f 1f 2f 3f - // 40 50 60 70 41 51 61 71 48 58 68 78 49 59 69 79 - // 42 52 62 72 43 53 63 73 4a 5a 6a 7a 4b 5b 6b 7b - // 44 54 64 74 45 55 65 75 4c 5c 6c 7c 4d 5d 6d 7d - // 46 56 66 76 47 57 67 77 4e 5e 6e 7e 4f 5f 6f 7f - - // 80 90 a0 b0 81 91 a1 b1 88 98 a8 b8 89 99 a9 b9 - // 82 92 a2 b2 83 93 a3 b3 8a 9a aa ba 8b 9b ab bb - // 84 94 a4 b4 85 95 a5 b5 8c 9c ac bc 8d 9d ad bd - // 86 96 a6 b6 87 97 a7 b7 8e ae 9e be 8f 9f af bf - // c0 d0 e0 f0 c1 d1 e1 f1 c8 d8 e8 f8 c9 d9 e9 f9 - // c2 d2 e2 f2 c3 d3 e3 f3 ca da ea fa cb db eb fb - // c4 d4 e4 f4 c5 d5 e5 f5 cc dc ef fc cd dd ed fd - // c6 d6 e6 f6 c7 d7 e7 f7 ce de ee fe cf df ef ff - - tr0_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4); - tr0_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4); - tr0_2 = _mm256_unpacklo_epi64(tr1_1, tr1_5); - tr0_3 = _mm256_unpackhi_epi64(tr1_1, tr1_5); - tr0_4 = _mm256_unpacklo_epi64(tr1_2, tr1_6); - tr0_5 = _mm256_unpackhi_epi64(tr1_2, tr1_6); - tr0_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7); - tr0_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7); - - tr0_8 = _mm256_unpacklo_epi64(tr1_8, tr1_c); - tr0_9 = _mm256_unpackhi_epi64(tr1_8, tr1_c); - tr0_a = _mm256_unpacklo_epi64(tr1_9, tr1_d); - tr0_b = _mm256_unpackhi_epi64(tr1_9, tr1_d); - tr0_c = _mm256_unpacklo_epi64(tr1_a, tr1_e); - tr0_d = _mm256_unpackhi_epi64(tr1_a, tr1_e); - tr0_e = _mm256_unpacklo_epi64(tr1_b, tr1_f); - tr0_f = _mm256_unpackhi_epi64(tr1_b, tr1_f); - - // 00 10 20 30 40 50 60 70 08 18 28 38 48 58 68 78 - // 01 11 21 31 41 51 61 71 09 19 29 39 49 59 69 79 - // 02 12 22 32 42 52 62 72 0a 1a 2a 3a 4a 5a 6a 7a - // 03 13 23 33 43 53 63 73 0b 1b 2b 3b 4b 5b 6b 7b - // 04 14 24 34 44 54 64 74 0c 1c 2c 3c 4c 5c 6c 7c - // 05 15 25 35 45 55 65 75 0d 1d 2d 3d 4d 5d 6d 7d - // 06 16 26 36 46 56 66 76 0e 1e 2e 3e 4e 5e 6e 7e - // 07 17 27 37 47 57 67 77 0f 1f 2f 3f 4f 5f 6f 7f - - // 80 90 a0 b0 c0 d0 e0 f0 88 98 a8 b8 c8 d8 e8 f8 - // 81 91 a1 b1 c1 d1 e1 f1 89 99 a9 b9 c9 d9 e9 f9 - // 82 92 a2 b2 c2 d2 e2 f2 8a 9a aa ba ca da ea fa - // 83 93 a3 b3 c3 d3 e3 f3 8b 9b ab bb cb db eb fb - // 84 94 a4 b4 c4 d4 e4 f4 8c 9c ac bc cc dc ef fc - // 85 95 a5 b5 c5 d5 e5 f5 8d 9d ad bd cd dd ed fd - // 86 96 a6 b6 c6 d6 e6 f6 8e ae 9e be ce de ee fe - // 87 97 a7 b7 c7 d7 e7 f7 8f 9f af bf cf df ef ff - - in[0] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x20); // 0010 0000 - in[8] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x31); // 0011 0001 - in[1] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x20); - in[9] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x31); - in[2] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x20); - in[10] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x31); - in[3] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x20); - in[11] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x31); - - in[4] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x20); - in[12] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x31); - in[5] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x20); - in[13] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x31); - in[6] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x20); - in[14] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x31); - in[7] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x20); - in[15] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x31); -} - static INLINE void load_buffer_16x16(const int16_t *input, int stride, int flipud, int fliplr, __m256i *in) { if (!flipud) { @@ -352,19 +216,6 @@ static void right_shift_16x16(__m256i 
*in) { in[15] = _mm256_srai_epi16(in[15], 2); } -static INLINE __m256i butter_fly(__m256i a0, __m256i a1, const __m256i cospi) { - const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING); - __m256i y0 = _mm256_madd_epi16(a0, cospi); - __m256i y1 = _mm256_madd_epi16(a1, cospi); - - y0 = _mm256_add_epi32(y0, dct_rounding); - y1 = _mm256_add_epi32(y1, dct_rounding); - y0 = _mm256_srai_epi32(y0, DCT_CONST_BITS); - y1 = _mm256_srai_epi32(y1, DCT_CONST_BITS); - - return _mm256_packs_epi32(y0, y1); -} - static void fdct16_avx2(__m256i *in) { // sequence: cospi_L_H = pairs(L, H) and L first const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64); @@ -1099,31 +950,7 @@ void fadst16_avx2(__m256i *in) { } #if CONFIG_EXT_TX -static void fidtx16_avx2(__m256i *in) { - const __m256i zero = _mm256_setzero_si256(); - const __m256i sqrt2_epi16 = _mm256_set1_epi16((int16_t)Sqrt2); - const __m256i dct_const_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING); - __m256i u0, u1; - int i = 0; - - while (i < 16) { - in[i] = _mm256_slli_epi16(in[i], 1); - - u0 = _mm256_unpacklo_epi16(zero, in[i]); - u1 = _mm256_unpackhi_epi16(zero, in[i]); - - u0 = _mm256_madd_epi16(u0, sqrt2_epi16); - u1 = _mm256_madd_epi16(u1, sqrt2_epi16); - - u0 = _mm256_add_epi32(u0, dct_const_rounding); - u1 = _mm256_add_epi32(u1, dct_const_rounding); - - u0 = _mm256_srai_epi32(u0, DCT_CONST_BITS); - u1 = _mm256_srai_epi32(u1, DCT_CONST_BITS); - in[i] = _mm256_packs_epi32(u0, u1); - i++; - } -} +static void fidtx16_avx2(__m256i *in) { txfm_scaling16_avx2(Sqrt2, in); } #endif void av1_fht16x16_avx2(const int16_t *input, tran_low_t *output, int stride, diff --git a/test/av1_fht16x16_test.cc b/test/av1_fht16x16_test.cc index 4a44e16d7..0b890716a 100644 --- a/test/av1_fht16x16_test.cc +++ b/test/av1_fht16x16_test.cc @@ -33,6 +33,11 @@ void fht16x16_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) { av1_fht16x16_c(in, out, stride, tx_type); } +void iht16x16_ref(const tran_low_t *in, uint8_t *dest, int stride, + int tx_type) { + av1_iht16x16_256_add_c(in, dest, stride, tx_type); +} + #if CONFIG_AOM_HIGHBITDEPTH typedef void (*IHbdHtFunc)(const tran_low_t *in, uint8_t *out, int stride, int tx_type, int bd); @@ -48,16 +53,6 @@ void highbd_fht16x16_ref(const int16_t *in, int32_t *out, int stride, } #endif // CONFIG_AOM_HIGHBITDEPTH -#if HAVE_AVX2 -void dummy_inv_txfm(const tran_low_t *in, uint8_t *out, int stride, - int tx_type) { - (void)in; - (void)out; - (void)stride; - (void)tx_type; -} -#endif - class AV1Trans16x16HT : public libaom_test::TransformTestBase, public ::testing::TestWithParam { public: @@ -70,6 +65,7 @@ class AV1Trans16x16HT : public libaom_test::TransformTestBase, pitch_ = 16; height_ = 16; fwd_txfm_ref = fht16x16_ref; + inv_txfm_ref = iht16x16_ref; bit_depth_ = GET_PARAM(3); mask_ = (1 << bit_depth_) - 1; num_coeffs_ = GET_PARAM(4); @@ -90,6 +86,7 @@ class AV1Trans16x16HT : public libaom_test::TransformTestBase, }; TEST_P(AV1Trans16x16HT, CoeffCheck) { RunCoeffCheck(); } +TEST_P(AV1Trans16x16HT, InvCoeffCheck) { RunInvCoeffCheck(); } #if CONFIG_AOM_HIGHBITDEPTH class AV1HighbdTrans16x16HT @@ -203,22 +200,27 @@ INSTANTIATE_TEST_CASE_P(SSE2, AV1Trans16x16HT, #if HAVE_AVX2 const Ht16x16Param kArrayHt16x16Param_avx2[] = { - make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 0, AOM_BITS_8, 256), - make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 1, AOM_BITS_8, 256), - make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 2, AOM_BITS_8, 256), - make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 3, AOM_BITS_8, 
256), + make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 0, AOM_BITS_8, 256), + make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 1, AOM_BITS_8, 256), + make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 2, AOM_BITS_8, 256), + make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 3, AOM_BITS_8, 256), #if CONFIG_EXT_TX - make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 4, AOM_BITS_8, 256), - make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 5, AOM_BITS_8, 256), - make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 6, AOM_BITS_8, 256), - make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 7, AOM_BITS_8, 256), - make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 8, AOM_BITS_8, 256), - make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 10, AOM_BITS_8, 256), - make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 11, AOM_BITS_8, 256), - make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 12, AOM_BITS_8, 256), - make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 13, AOM_BITS_8, 256), - make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 14, AOM_BITS_8, 256), - make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 15, AOM_BITS_8, 256) + make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 4, AOM_BITS_8, 256), + make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 5, AOM_BITS_8, 256), + make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 6, AOM_BITS_8, 256), + make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 7, AOM_BITS_8, 256), + make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 8, AOM_BITS_8, 256), + make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 10, AOM_BITS_8, + 256), + make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 11, AOM_BITS_8, + 256), + make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 12, AOM_BITS_8, + 256), + make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 13, AOM_BITS_8, + 256), + make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 14, AOM_BITS_8, + 256), + make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 15, AOM_BITS_8, 256) #endif // CONFIG_EXT_TX }; INSTANTIATE_TEST_CASE_P(AVX2, AV1Trans16x16HT, diff --git a/test/transform_test_base.h b/test/transform_test_base.h index 540136cc0..64bf2d6a1 100644 --- a/test/transform_test_base.h +++ b/test/transform_test_base.h @@ -210,7 +210,7 @@ class TransformTestBase { int out_idx = j * stride + k; ASSERT_EQ(output_block[out_idx], output_ref_block[out_idx]) << "Error: not bit-exact result at index: " << out_idx - << " at test block: " << i; + << " j = " << j << " k = " << k << " at test block: " << i; } } }
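
For reference, the new InvCoeffCheck test exercises exactly this pairing of the C reference and the AVX2 implementation. A minimal standalone sketch of the same bit-exactness check (illustrative only, not part of the patch; it assumes the generated ./av1_rtcd.h, which declares av1_iht16x16_256_add_{c,avx2} and tran_low_t, is on the include path and the build targets AVX2):

#include <stdint.h>
#include <string.h>

#include "./av1_rtcd.h"

// Run the C reference and the AVX2 path on the same 16x16 coefficient
// block; returns 1 when all 256 reconstructed pixels are bit-exact.
static int iht16x16_bit_exact(const tran_low_t *coeff, int tx_type) {
  uint8_t dst_c[16 * 16];
  uint8_t dst_avx2[16 * 16];
  memset(dst_c, 128, sizeof(dst_c));  // identical starting prediction
  memset(dst_avx2, 128, sizeof(dst_avx2));

  av1_iht16x16_256_add_c(coeff, dst_c, 16, tx_type);
  av1_iht16x16_256_add_avx2(coeff, dst_avx2, 16, tx_type);

  return memcmp(dst_c, dst_avx2, sizeof(dst_c)) == 0;
}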