From 731720000229bcc85c0b95516dd744c9e95a5465 Mon Sep 17 00:00:00 2001
From: Yi Luo
Date: Fri, 28 Oct 2016 10:52:04 -0700
Subject: [PATCH] Hybrid inverse transforms 16x16 AVX2 optimization

- Add unit tests to verify the bit-exact result.
- User-level time reduction (EXT_TX): encoder: 3.63%, decoder: 2.36%.
- Also add SSE2 versions of tx_type = V_DCT ... H_FLIPADST for the 16x16
  inverse transform.

Change-Id: Idc6d9e8254aa536e5f18a87fa0d37c6bd551c083
---
 aom_dsp/x86/txfm_common_avx2.h         | 177 +++++++++
 av1/av1_common.mk                      |   2 +
 av1/common/av1_rtcd_defs.pl            |   4 +-
 av1/common/idct.c                      |   7 +-
 av1/common/x86/hybrid_inv_txfm_avx2.c  | 503 +++++++++++++++++++++++++
 av1/common/x86/idct_intrin_sse2.c      | 154 ++++--
 av1/encoder/x86/hybrid_fwd_txfm_avx2.c | 175 +--------
 test/av1_fht16x16_test.cc              |  52 +--
 test/transform_test_base.h             |   2 +-
 9 files changed, 805 insertions(+), 271 deletions(-)
 create mode 100644 av1/common/x86/hybrid_inv_txfm_avx2.c

diff --git a/aom_dsp/x86/txfm_common_avx2.h b/aom_dsp/x86/txfm_common_avx2.h
index 7dc17f033..39e9b8e2a 100644
--- a/aom_dsp/x86/txfm_common_avx2.h
+++ b/aom_dsp/x86/txfm_common_avx2.h
@@ -14,6 +14,8 @@
 #include <immintrin.h>
 
+#include "aom_dsp/txfm_common.h"
+
 #define pair256_set_epi16(a, b)                                            \
   _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
                    (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
@@ -24,4 +26,179 @@
   _mm256_set_epi32((int)(b), (int)(a), (int)(b), (int)(a), (int)(b), (int)(a), \
                    (int)(b), (int)(a))
 
+static INLINE void mm256_reverse_epi16(__m256i *u) {
+  const __m256i control = _mm256_set_epi16(
+      0x0100, 0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E, 0x0100,
+      0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E);
+  __m256i v = _mm256_shuffle_epi8(*u, control);
+  *u = _mm256_permute2x128_si256(v, v, 1);
+}
+
+static INLINE void mm256_transpose_16x16(__m256i *in) {
+  __m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]);
+  __m256i tr0_1 = _mm256_unpackhi_epi16(in[0], in[1]);
+  __m256i tr0_2 = _mm256_unpacklo_epi16(in[2], in[3]);
+  __m256i tr0_3 = _mm256_unpackhi_epi16(in[2], in[3]);
+  __m256i tr0_4 = _mm256_unpacklo_epi16(in[4], in[5]);
+  __m256i tr0_5 = _mm256_unpackhi_epi16(in[4], in[5]);
+  __m256i tr0_6 = _mm256_unpacklo_epi16(in[6], in[7]);
+  __m256i tr0_7 = _mm256_unpackhi_epi16(in[6], in[7]);
+
+  __m256i tr0_8 = _mm256_unpacklo_epi16(in[8], in[9]);
+  __m256i tr0_9 = _mm256_unpackhi_epi16(in[8], in[9]);
+  __m256i tr0_a = _mm256_unpacklo_epi16(in[10], in[11]);
+  __m256i tr0_b = _mm256_unpackhi_epi16(in[10], in[11]);
+  __m256i tr0_c = _mm256_unpacklo_epi16(in[12], in[13]);
+  __m256i tr0_d = _mm256_unpackhi_epi16(in[12], in[13]);
+  __m256i tr0_e = _mm256_unpacklo_epi16(in[14], in[15]);
+  __m256i tr0_f = _mm256_unpackhi_epi16(in[14], in[15]);
+
+  // 00 10 01 11 02 12 03 13 08 18 09 19 0a 1a 0b 1b
+  // 04 14 05 15 06 16 07 17 0c 1c 0d 1d 0e 1e 0f 1f
+  // 20 30 21 31 22 32 23 33 28 38 29 39 2a 3a 2b 3b
+  // 24 34 25 35 26 36 27 37 2c 3c 2d 3d 2e 3e 2f 3f
+  // 40 50 41 51 42 52 43 53 48 58 49 59 4a 5a 4b 5b
+  // 44 54 45 55 46 56 47 57 4c 5c 4d 5d 4e 5e 4f 5f
+  // 60 70 61 71 62 72 63 73 68 78 69 79 6a 7a 6b 7b
+  // 64 74 65 75 66 76 67 77 6c 7c 6d 7d 6e 7e 6f 7f
+
+  // 80 90 81 91 82 92 83 93 88 98 89 99 8a 9a 8b 9b
+  // 84 94 85 95 86 96 87 97 8c 9c 8d 9d 8e 9e 8f 9f
+  // a0 b0 a1 b1 a2 b2 a3 b3 a8 b8 a9 b9 aa ba ab bb
+  // a4 b4 a5 b5 a6 b6 a7 b7 ac bc ad bd ae be af bf
+  // c0 d0 c1 d1 c2 d2 c3 d3 c8 d8 c9 d9 ca da cb db
+  // c4 d4 c5 d5 c6 d6 c7 d7 cc dc cd dd ce de cf df
+  // e0 f0 e1 f1 e2 f2 e3 f3 e8 f8 e9 f9 ea fa eb fb
+  // e4 f4 e5 f5 e6 f6 e7 f7 ec fc ed fd ee fe ef ff
+
+  __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_2);
+  __m256i tr1_1 = _mm256_unpackhi_epi32(tr0_0, tr0_2);
+  __m256i tr1_2 = _mm256_unpacklo_epi32(tr0_1, tr0_3);
+  __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_1, tr0_3);
+  __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_6);
+  __m256i tr1_5 = _mm256_unpackhi_epi32(tr0_4, tr0_6);
+  __m256i tr1_6 = _mm256_unpacklo_epi32(tr0_5, tr0_7);
+  __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_5, tr0_7);
+
+  __m256i tr1_8 = _mm256_unpacklo_epi32(tr0_8, tr0_a);
+  __m256i tr1_9 = _mm256_unpackhi_epi32(tr0_8, tr0_a);
+  __m256i tr1_a = _mm256_unpacklo_epi32(tr0_9, tr0_b);
+  __m256i tr1_b = _mm256_unpackhi_epi32(tr0_9, tr0_b);
+  __m256i tr1_c = _mm256_unpacklo_epi32(tr0_c, tr0_e);
+  __m256i tr1_d = _mm256_unpackhi_epi32(tr0_c, tr0_e);
+  __m256i tr1_e = _mm256_unpacklo_epi32(tr0_d, tr0_f);
+  __m256i tr1_f = _mm256_unpackhi_epi32(tr0_d, tr0_f);
+
+  // 00 10 20 30 01 11 21 31 08 18 28 38 09 19 29 39
+  // 02 12 22 32 03 13 23 33 0a 1a 2a 3a 0b 1b 2b 3b
+  // 04 14 24 34 05 15 25 35 0c 1c 2c 3c 0d 1d 2d 3d
+  // 06 16 26 36 07 17 27 37 0e 1e 2e 3e 0f 1f 2f 3f
+  // 40 50 60 70 41 51 61 71 48 58 68 78 49 59 69 79
+  // 42 52 62 72 43 53 63 73 4a 5a 6a 7a 4b 5b 6b 7b
+  // 44 54 64 74 45 55 65 75 4c 5c 6c 7c 4d 5d 6d 7d
+  // 46 56 66 76 47 57 67 77 4e 5e 6e 7e 4f 5f 6f 7f
+
+  // 80 90 a0 b0 81 91 a1 b1 88 98 a8 b8 89 99 a9 b9
+  // 82 92 a2 b2 83 93 a3 b3 8a 9a aa ba 8b 9b ab bb
+  // 84 94 a4 b4 85 95 a5 b5 8c 9c ac bc 8d 9d ad bd
+  // 86 96 a6 b6 87 97 a7 b7 8e 9e ae be 8f 9f af bf
+  // c0 d0 e0 f0 c1 d1 e1 f1 c8 d8 e8 f8 c9 d9 e9 f9
+  // c2 d2 e2 f2 c3 d3 e3 f3 ca da ea fa cb db eb fb
+  // c4 d4 e4 f4 c5 d5 e5 f5 cc dc ec fc cd dd ed fd
+  // c6 d6 e6 f6 c7 d7 e7 f7 ce de ee fe cf df ef ff
+
+  tr0_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4);
+  tr0_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4);
+  tr0_2 = _mm256_unpacklo_epi64(tr1_1, tr1_5);
+  tr0_3 = _mm256_unpackhi_epi64(tr1_1, tr1_5);
+  tr0_4 = _mm256_unpacklo_epi64(tr1_2, tr1_6);
+  tr0_5 = _mm256_unpackhi_epi64(tr1_2, tr1_6);
+  tr0_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7);
+  tr0_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7);
+
+  tr0_8 = _mm256_unpacklo_epi64(tr1_8, tr1_c);
+  tr0_9 = _mm256_unpackhi_epi64(tr1_8, tr1_c);
+  tr0_a = _mm256_unpacklo_epi64(tr1_9, tr1_d);
+  tr0_b = _mm256_unpackhi_epi64(tr1_9, tr1_d);
+  tr0_c = _mm256_unpacklo_epi64(tr1_a, tr1_e);
+  tr0_d = _mm256_unpackhi_epi64(tr1_a, tr1_e);
+  tr0_e = _mm256_unpacklo_epi64(tr1_b, tr1_f);
+  tr0_f = _mm256_unpackhi_epi64(tr1_b, tr1_f);
+
+  // 00 10 20 30 40 50 60 70 08 18 28 38 48 58 68 78
+  // 01 11 21 31 41 51 61 71 09 19 29 39 49 59 69 79
+  // 02 12 22 32 42 52 62 72 0a 1a 2a 3a 4a 5a 6a 7a
+  // 03 13 23 33 43 53 63 73 0b 1b 2b 3b 4b 5b 6b 7b
+  // 04 14 24 34 44 54 64 74 0c 1c 2c 3c 4c 5c 6c 7c
+  // 05 15 25 35 45 55 65 75 0d 1d 2d 3d 4d 5d 6d 7d
+  // 06 16 26 36 46 56 66 76 0e 1e 2e 3e 4e 5e 6e 7e
+  // 07 17 27 37 47 57 67 77 0f 1f 2f 3f 4f 5f 6f 7f
+
+  // 80 90 a0 b0 c0 d0 e0 f0 88 98 a8 b8 c8 d8 e8 f8
+  // 81 91 a1 b1 c1 d1 e1 f1 89 99 a9 b9 c9 d9 e9 f9
+  // 82 92 a2 b2 c2 d2 e2 f2 8a 9a aa ba ca da ea fa
+  // 83 93 a3 b3 c3 d3 e3 f3 8b 9b ab bb cb db eb fb
+  // 84 94 a4 b4 c4 d4 e4 f4 8c 9c ac bc cc dc ec fc
+  // 85 95 a5 b5 c5 d5 e5 f5 8d 9d ad bd cd dd ed fd
+  // 86 96 a6 b6 c6 d6 e6 f6 8e 9e ae be ce de ee fe
+  // 87 97 a7 b7 c7 d7 e7 f7 8f 9f af bf cf df ef ff
+
+  in[0] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x20);  // 0010 0000
+  in[8] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x31);  // 0011 0001
+  in[1] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x20);
+  in[9] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x31);
+  in[2] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x20);
+  in[10] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x31);
+  in[3] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x20);
+  in[11] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x31);
+
+  in[4] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x20);
+  in[12] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x31);
+  in[5] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x20);
+  in[13] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x31);
+  in[6] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x20);
+  in[14] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x31);
+  in[7] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x20);
+  in[15] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x31);
+}
+
+// One half of a butterfly output: packs dct_const_round_shift(madd(a0, cospi))
+// and dct_const_round_shift(madd(a1, cospi)) back to 16 bits; a0/a1 carry the
+// lo/hi 16-bit unpacked halves of the input pair.
+static INLINE __m256i butter_fly(__m256i a0, __m256i a1, const __m256i cospi) {
+  const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
+  __m256i y0 = _mm256_madd_epi16(a0, cospi);
+  __m256i y1 = _mm256_madd_epi16(a1, cospi);
+
+  y0 = _mm256_add_epi32(y0, dct_rounding);
+  y1 = _mm256_add_epi32(y1, dct_rounding);
+  y0 = _mm256_srai_epi32(y0, DCT_CONST_BITS);
+  y1 = _mm256_srai_epi32(y1, DCT_CONST_BITS);
+
+  return _mm256_packs_epi32(y0, y1);
+}
+
+// Widening multiply of each row by c with 32-bit intermediates, i.e.
+// in[i] = ROUND_POWER_OF_TWO(2 * in[i] * c, DCT_CONST_BITS).
+static INLINE void txfm_scaling16_avx2(const int16_t c, __m256i *in) {
+  const __m256i zero = _mm256_setzero_si256();
+  const __m256i sqrt2_epi16 = _mm256_set1_epi16(c);
+  const __m256i dct_const_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
+  __m256i u0, u1;
+  int i = 0;
+
+  while (i < 16) {
+    in[i] = _mm256_slli_epi16(in[i], 1);
+
+    u0 = _mm256_unpacklo_epi16(zero, in[i]);
+    u1 = _mm256_unpackhi_epi16(zero, in[i]);
+
+    u0 = _mm256_madd_epi16(u0, sqrt2_epi16);
+    u1 = _mm256_madd_epi16(u1, sqrt2_epi16);
+
+    u0 = _mm256_add_epi32(u0, dct_const_rounding);
+    u1 = _mm256_add_epi32(u1, dct_const_rounding);
+
+    u0 = _mm256_srai_epi32(u0, DCT_CONST_BITS);
+    u1 = _mm256_srai_epi32(u1, DCT_CONST_BITS);
+    in[i] = _mm256_packs_epi32(u0, u1);
+    i++;
+  }
+}
+
 #endif  // AOM_DSP_X86_TXFM_COMMON_AVX2_H
diff --git a/av1/av1_common.mk b/av1/av1_common.mk
index 43b76ad62..18048d48b 100644
--- a/av1/av1_common.mk
+++ b/av1/av1_common.mk
@@ -122,6 +122,8 @@ AV1_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/idct8x8_msa.c
 AV1_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/idct16x16_msa.c
 AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idct_intrin_sse2.c
 
+AV1_COMMON_SRCS-$(HAVE_AVX2) += common/x86/hybrid_inv_txfm_avx2.c
+
 ifeq ($(CONFIG_AV1_ENCODER),yes)
 AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/av1_fwd_txfm_sse2.c
 AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/av1_fwd_dct32x32_impl_sse2.h
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index e66826f0a..8fac06c9b 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -114,7 +114,7 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
     specialize qw/av1_iht8x8_64_add sse2/;
 
     add_proto qw/void av1_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
-    specialize qw/av1_iht16x16_256_add sse2/;
+    specialize qw/av1_iht16x16_256_add sse2 avx2/;
   }
 } else {
   # Force C versions if CONFIG_EMULATE_HARDWARE is 1
@@ -175,7 +175,7 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
     specialize qw/av1_iht8x8_64_add sse2 neon dspr2/;
 
     add_proto qw/void av1_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
-    specialize qw/av1_iht16x16_256_add sse2 dspr2/;
+    specialize qw/av1_iht16x16_256_add sse2 avx2 dspr2/;
 
     if (aom_config("CONFIG_EXT_TX") ne "yes") {
      specialize qw/av1_iht4x4_16_add msa/;
diff --git a/av1/common/idct.c b/av1/common/idct.c
index 81581a47a..96b34ee53 100644
--- a/av1/common/idct.c
+++ b/av1/common/idct.c
@@ -984,17 +984,12 @@ void av1_inv_txfm_add_16x16(const tran_low_t *input, uint8_t *dest, int stride,
     case FLIPADST_FLIPADST:
     case ADST_FLIPADST:
     case FLIPADST_ADST:
-      av1_iht16x16_256_add(input, dest, stride, tx_type);
-      break;
     case V_DCT:
     case H_DCT:
     case V_ADST:
     case H_ADST:
     case V_FLIPADST:
-    case H_FLIPADST:
-      // Use C version since DST only exists in C code
-      av1_iht16x16_256_add_c(input, dest, stride, tx_type);
-      break;
+    case H_FLIPADST: av1_iht16x16_256_add(input, dest, stride, tx_type); break;
     case IDTX: inv_idtx_add_c(input, dest, stride, 16, tx_type); break;
 #endif  // CONFIG_EXT_TX
     default: assert(0); break;
diff --git a/av1/common/x86/hybrid_inv_txfm_avx2.c b/av1/common/x86/hybrid_inv_txfm_avx2.c
new file mode 100644
index 000000000..754152c43
--- /dev/null
+++ b/av1/common/x86/hybrid_inv_txfm_avx2.c
@@ -0,0 +1,503 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>  // avx2
+
+#include "./aom_config.h"
+#include "./av1_rtcd.h"
+
+#include "aom_dsp/x86/txfm_common_avx2.h"
+
+static INLINE void load_coeff(const tran_low_t *coeff, __m256i *in) {
+#if CONFIG_AOM_HIGHBITDEPTH
+  *in = _mm256_setr_epi16(
+      (int16_t)coeff[0], (int16_t)coeff[1], (int16_t)coeff[2],
+      (int16_t)coeff[3], (int16_t)coeff[4], (int16_t)coeff[5],
+      (int16_t)coeff[6], (int16_t)coeff[7], (int16_t)coeff[8],
+      (int16_t)coeff[9], (int16_t)coeff[10], (int16_t)coeff[11],
+      (int16_t)coeff[12], (int16_t)coeff[13], (int16_t)coeff[14],
+      (int16_t)coeff[15]);
+#else
+  *in = _mm256_loadu_si256((const __m256i *)coeff);
+#endif
+}
+
+static void load_buffer_16x16(const tran_low_t *coeff, __m256i *in) {
+  int i = 0;
+  while (i < 16) {
+    load_coeff(coeff + (i << 4), &in[i]);
+    i += 1;
+  }
+}
+
+static void recon_and_store(const __m256i *res, uint8_t *output) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i x = _mm_loadu_si128((__m128i const *)output);
+  __m128i p0 = _mm_unpacklo_epi8(x, zero);
+  __m128i p1 = _mm_unpackhi_epi8(x, zero);
+
+  p0 = _mm_add_epi16(p0, _mm256_castsi256_si128(*res));
+  p1 = _mm_add_epi16(p1, _mm256_extractf128_si256(*res, 1));
+  x = _mm_packus_epi16(p0, p1);
+  _mm_storeu_si128((__m128i *)output, x);
+}
+
+// The 2-D inverse transform output carries a scale of 2^IDCT_ROUNDING_POS;
+// write_buffer_16x16 rounds that scale off before adding to the prediction.
+#define IDCT_ROUNDING_POS (6)
+
+static void write_buffer_16x16(__m256i *in, const int stride,
+                               uint8_t *output) {
+  const __m256i rounding = _mm256_set1_epi16(1 << (IDCT_ROUNDING_POS - 1));
+  int i = 0;
+
+  while (i < 16) {
+    in[i] = _mm256_add_epi16(in[i], rounding);
+    in[i] = _mm256_srai_epi16(in[i], IDCT_ROUNDING_POS);
+    recon_and_store(&in[i], output + i * stride);
+    i += 1;
+  }
+}
+
+static INLINE void unpack_butter_fly(const __m256i *a0, const __m256i *a1,
+                                     const __m256i *c0, const __m256i *c1,
+                                     __m256i *b0, __m256i *b1) {
+  __m256i x0, x1;
+  x0 = _mm256_unpacklo_epi16(*a0, *a1);
+  x1 = _mm256_unpackhi_epi16(*a0, *a1);
+  *b0 = butter_fly(x0, x1, *c0);
+  *b1 = butter_fly(x0, x1, *c1);
+}
+
+static void idct16_avx2(__m256i *in) {
+  const
__m256i cospi_p30_m02 = pair256_set_epi16(cospi_30_64, -cospi_2_64); + const __m256i cospi_p02_p30 = pair256_set_epi16(cospi_2_64, cospi_30_64); + const __m256i cospi_p14_m18 = pair256_set_epi16(cospi_14_64, -cospi_18_64); + const __m256i cospi_p18_p14 = pair256_set_epi16(cospi_18_64, cospi_14_64); + const __m256i cospi_p22_m10 = pair256_set_epi16(cospi_22_64, -cospi_10_64); + const __m256i cospi_p10_p22 = pair256_set_epi16(cospi_10_64, cospi_22_64); + const __m256i cospi_p06_m26 = pair256_set_epi16(cospi_6_64, -cospi_26_64); + const __m256i cospi_p26_p06 = pair256_set_epi16(cospi_26_64, cospi_6_64); + const __m256i cospi_p28_m04 = pair256_set_epi16(cospi_28_64, -cospi_4_64); + const __m256i cospi_p04_p28 = pair256_set_epi16(cospi_4_64, cospi_28_64); + const __m256i cospi_p12_m20 = pair256_set_epi16(cospi_12_64, -cospi_20_64); + const __m256i cospi_p20_p12 = pair256_set_epi16(cospi_20_64, cospi_12_64); + const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64); + const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64); + const __m256i cospi_p24_m08 = pair256_set_epi16(cospi_24_64, -cospi_8_64); + const __m256i cospi_p08_p24 = pair256_set_epi16(cospi_8_64, cospi_24_64); + const __m256i cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64); + const __m256i cospi_p24_p08 = pair256_set_epi16(cospi_24_64, cospi_8_64); + const __m256i cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64); + __m256i u0, u1, u2, u3, u4, u5, u6, u7; + __m256i v0, v1, v2, v3, v4, v5, v6, v7; + __m256i t0, t1, t2, t3, t4, t5, t6, t7; + + // stage 1, (0-7) + u0 = in[0]; + u1 = in[8]; + u2 = in[4]; + u3 = in[12]; + u4 = in[2]; + u5 = in[10]; + u6 = in[6]; + u7 = in[14]; + + // stage 2, (0-7) + // stage 3, (0-7) + t0 = u0; + t1 = u1; + t2 = u2; + t3 = u3; + unpack_butter_fly(&u4, &u7, &cospi_p28_m04, &cospi_p04_p28, &t4, &t7); + unpack_butter_fly(&u5, &u6, &cospi_p12_m20, &cospi_p20_p12, &t5, &t6); + + // stage 4, (0-7) + unpack_butter_fly(&t0, &t1, &cospi_p16_p16, &cospi_p16_m16, &u0, &u1); + unpack_butter_fly(&t2, &t3, &cospi_p24_m08, &cospi_p08_p24, &u2, &u3); + u4 = _mm256_add_epi16(t4, t5); + u5 = _mm256_sub_epi16(t4, t5); + u6 = _mm256_sub_epi16(t7, t6); + u7 = _mm256_add_epi16(t7, t6); + + // stage 5, (0-7) + t0 = _mm256_add_epi16(u0, u3); + t1 = _mm256_add_epi16(u1, u2); + t2 = _mm256_sub_epi16(u1, u2); + t3 = _mm256_sub_epi16(u0, u3); + t4 = u4; + t7 = u7; + unpack_butter_fly(&u6, &u5, &cospi_p16_m16, &cospi_p16_p16, &t5, &t6); + + // stage 6, (0-7) + u0 = _mm256_add_epi16(t0, t7); + u1 = _mm256_add_epi16(t1, t6); + u2 = _mm256_add_epi16(t2, t5); + u3 = _mm256_add_epi16(t3, t4); + u4 = _mm256_sub_epi16(t3, t4); + u5 = _mm256_sub_epi16(t2, t5); + u6 = _mm256_sub_epi16(t1, t6); + u7 = _mm256_sub_epi16(t0, t7); + + // stage 1, (8-15) + v0 = in[1]; + v1 = in[9]; + v2 = in[5]; + v3 = in[13]; + v4 = in[3]; + v5 = in[11]; + v6 = in[7]; + v7 = in[15]; + + // stage 2, (8-15) + unpack_butter_fly(&v0, &v7, &cospi_p30_m02, &cospi_p02_p30, &t0, &t7); + unpack_butter_fly(&v1, &v6, &cospi_p14_m18, &cospi_p18_p14, &t1, &t6); + unpack_butter_fly(&v2, &v5, &cospi_p22_m10, &cospi_p10_p22, &t2, &t5); + unpack_butter_fly(&v3, &v4, &cospi_p06_m26, &cospi_p26_p06, &t3, &t4); + + // stage 3, (8-15) + v0 = _mm256_add_epi16(t0, t1); + v1 = _mm256_sub_epi16(t0, t1); + v2 = _mm256_sub_epi16(t3, t2); + v3 = _mm256_add_epi16(t2, t3); + v4 = _mm256_add_epi16(t4, t5); + v5 = _mm256_sub_epi16(t4, t5); + v6 = _mm256_sub_epi16(t7, t6); + v7 = _mm256_add_epi16(t6, t7); + + // stage 4, (8-15) + t0 = v0; 
+ t7 = v7; + t3 = v3; + t4 = v4; + unpack_butter_fly(&v1, &v6, &cospi_m08_p24, &cospi_p24_p08, &t1, &t6); + unpack_butter_fly(&v2, &v5, &cospi_m24_m08, &cospi_m08_p24, &t2, &t5); + + // stage 5, (8-15) + v0 = _mm256_add_epi16(t0, t3); + v1 = _mm256_add_epi16(t1, t2); + v2 = _mm256_sub_epi16(t1, t2); + v3 = _mm256_sub_epi16(t0, t3); + v4 = _mm256_sub_epi16(t7, t4); + v5 = _mm256_sub_epi16(t6, t5); + v6 = _mm256_add_epi16(t6, t5); + v7 = _mm256_add_epi16(t7, t4); + + // stage 6, (8-15) + t0 = v0; + t1 = v1; + t6 = v6; + t7 = v7; + unpack_butter_fly(&v5, &v2, &cospi_p16_m16, &cospi_p16_p16, &t2, &t5); + unpack_butter_fly(&v4, &v3, &cospi_p16_m16, &cospi_p16_p16, &t3, &t4); + + // stage 7 + in[0] = _mm256_add_epi16(u0, t7); + in[1] = _mm256_add_epi16(u1, t6); + in[2] = _mm256_add_epi16(u2, t5); + in[3] = _mm256_add_epi16(u3, t4); + in[4] = _mm256_add_epi16(u4, t3); + in[5] = _mm256_add_epi16(u5, t2); + in[6] = _mm256_add_epi16(u6, t1); + in[7] = _mm256_add_epi16(u7, t0); + in[8] = _mm256_sub_epi16(u7, t0); + in[9] = _mm256_sub_epi16(u6, t1); + in[10] = _mm256_sub_epi16(u5, t2); + in[11] = _mm256_sub_epi16(u4, t3); + in[12] = _mm256_sub_epi16(u3, t4); + in[13] = _mm256_sub_epi16(u2, t5); + in[14] = _mm256_sub_epi16(u1, t6); + in[15] = _mm256_sub_epi16(u0, t7); +} + +static void idct16(__m256i *in) { + mm256_transpose_16x16(in); + idct16_avx2(in); +} + +static INLINE void butterfly_32b(const __m256i *a0, const __m256i *a1, + const __m256i *c0, const __m256i *c1, + __m256i *b) { + __m256i x0, x1; + x0 = _mm256_unpacklo_epi16(*a0, *a1); + x1 = _mm256_unpackhi_epi16(*a0, *a1); + b[0] = _mm256_madd_epi16(x0, *c0); + b[1] = _mm256_madd_epi16(x1, *c0); + b[2] = _mm256_madd_epi16(x0, *c1); + b[3] = _mm256_madd_epi16(x1, *c1); +} + +static INLINE void group_rounding(__m256i *a, int num) { + const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING); + int i; + for (i = 0; i < num; ++i) { + a[i] = _mm256_add_epi32(a[i], dct_rounding); + a[i] = _mm256_srai_epi32(a[i], DCT_CONST_BITS); + } +} + +static INLINE void add_rnd(const __m256i *a, const __m256i *b, __m256i *out) { + __m256i x[4]; + x[0] = _mm256_add_epi32(a[0], b[0]); + x[1] = _mm256_add_epi32(a[1], b[1]); + x[2] = _mm256_add_epi32(a[2], b[2]); + x[3] = _mm256_add_epi32(a[3], b[3]); + + group_rounding(x, 4); + + out[0] = _mm256_packs_epi32(x[0], x[1]); + out[1] = _mm256_packs_epi32(x[2], x[3]); +} + +static INLINE void sub_rnd(const __m256i *a, const __m256i *b, __m256i *out) { + __m256i x[4]; + x[0] = _mm256_sub_epi32(a[0], b[0]); + x[1] = _mm256_sub_epi32(a[1], b[1]); + x[2] = _mm256_sub_epi32(a[2], b[2]); + x[3] = _mm256_sub_epi32(a[3], b[3]); + + group_rounding(x, 4); + + out[0] = _mm256_packs_epi32(x[0], x[1]); + out[1] = _mm256_packs_epi32(x[2], x[3]); +} + +static INLINE void butterfly_rnd(__m256i *a, __m256i *out) { + group_rounding(a, 4); + out[0] = _mm256_packs_epi32(a[0], a[1]); + out[1] = _mm256_packs_epi32(a[2], a[3]); +} + +static void iadst16_avx2(__m256i *in) { + const __m256i cospi_p01_p31 = pair256_set_epi16(cospi_1_64, cospi_31_64); + const __m256i cospi_p31_m01 = pair256_set_epi16(cospi_31_64, -cospi_1_64); + const __m256i cospi_p05_p27 = pair256_set_epi16(cospi_5_64, cospi_27_64); + const __m256i cospi_p27_m05 = pair256_set_epi16(cospi_27_64, -cospi_5_64); + const __m256i cospi_p09_p23 = pair256_set_epi16(cospi_9_64, cospi_23_64); + const __m256i cospi_p23_m09 = pair256_set_epi16(cospi_23_64, -cospi_9_64); + const __m256i cospi_p13_p19 = pair256_set_epi16(cospi_13_64, cospi_19_64); + const __m256i cospi_p19_m13 = 
pair256_set_epi16(cospi_19_64, -cospi_13_64); + const __m256i cospi_p17_p15 = pair256_set_epi16(cospi_17_64, cospi_15_64); + const __m256i cospi_p15_m17 = pair256_set_epi16(cospi_15_64, -cospi_17_64); + const __m256i cospi_p21_p11 = pair256_set_epi16(cospi_21_64, cospi_11_64); + const __m256i cospi_p11_m21 = pair256_set_epi16(cospi_11_64, -cospi_21_64); + const __m256i cospi_p25_p07 = pair256_set_epi16(cospi_25_64, cospi_7_64); + const __m256i cospi_p07_m25 = pair256_set_epi16(cospi_7_64, -cospi_25_64); + const __m256i cospi_p29_p03 = pair256_set_epi16(cospi_29_64, cospi_3_64); + const __m256i cospi_p03_m29 = pair256_set_epi16(cospi_3_64, -cospi_29_64); + const __m256i cospi_p04_p28 = pair256_set_epi16(cospi_4_64, cospi_28_64); + const __m256i cospi_p28_m04 = pair256_set_epi16(cospi_28_64, -cospi_4_64); + const __m256i cospi_p20_p12 = pair256_set_epi16(cospi_20_64, cospi_12_64); + const __m256i cospi_p12_m20 = pair256_set_epi16(cospi_12_64, -cospi_20_64); + const __m256i cospi_m28_p04 = pair256_set_epi16(-cospi_28_64, cospi_4_64); + const __m256i cospi_m12_p20 = pair256_set_epi16(-cospi_12_64, cospi_20_64); + const __m256i cospi_p08_p24 = pair256_set_epi16(cospi_8_64, cospi_24_64); + const __m256i cospi_p24_m08 = pair256_set_epi16(cospi_24_64, -cospi_8_64); + const __m256i cospi_m24_p08 = pair256_set_epi16(-cospi_24_64, cospi_8_64); + const __m256i cospi_m16_m16 = _mm256_set1_epi16((int16_t)-cospi_16_64); + const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64); + const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64); + const __m256i cospi_m16_p16 = pair256_set_epi16(-cospi_16_64, cospi_16_64); + const __m256i zero = _mm256_setzero_si256(); + __m256i x[16], s[16]; + __m256i u[4], v[4]; + + // stage 1 + butterfly_32b(&in[15], &in[0], &cospi_p01_p31, &cospi_p31_m01, u); + butterfly_32b(&in[7], &in[8], &cospi_p17_p15, &cospi_p15_m17, v); + add_rnd(u, v, &x[0]); + sub_rnd(u, v, &x[8]); + + butterfly_32b(&in[13], &in[2], &cospi_p05_p27, &cospi_p27_m05, u); + butterfly_32b(&in[5], &in[10], &cospi_p21_p11, &cospi_p11_m21, v); + add_rnd(u, v, &x[2]); + sub_rnd(u, v, &x[10]); + + butterfly_32b(&in[11], &in[4], &cospi_p09_p23, &cospi_p23_m09, u); + butterfly_32b(&in[3], &in[12], &cospi_p25_p07, &cospi_p07_m25, v); + add_rnd(u, v, &x[4]); + sub_rnd(u, v, &x[12]); + + butterfly_32b(&in[9], &in[6], &cospi_p13_p19, &cospi_p19_m13, u); + butterfly_32b(&in[1], &in[14], &cospi_p29_p03, &cospi_p03_m29, v); + add_rnd(u, v, &x[6]); + sub_rnd(u, v, &x[14]); + + // stage 2 + s[0] = _mm256_add_epi16(x[0], x[4]); + s[1] = _mm256_add_epi16(x[1], x[5]); + s[2] = _mm256_add_epi16(x[2], x[6]); + s[3] = _mm256_add_epi16(x[3], x[7]); + s[4] = _mm256_sub_epi16(x[0], x[4]); + s[5] = _mm256_sub_epi16(x[1], x[5]); + s[6] = _mm256_sub_epi16(x[2], x[6]); + s[7] = _mm256_sub_epi16(x[3], x[7]); + butterfly_32b(&x[8], &x[9], &cospi_p04_p28, &cospi_p28_m04, u); + butterfly_32b(&x[12], &x[13], &cospi_m28_p04, &cospi_p04_p28, v); + add_rnd(u, v, &s[8]); + sub_rnd(u, v, &s[12]); + + butterfly_32b(&x[10], &x[11], &cospi_p20_p12, &cospi_p12_m20, u); + butterfly_32b(&x[14], &x[15], &cospi_m12_p20, &cospi_p20_p12, v); + add_rnd(u, v, &s[10]); + sub_rnd(u, v, &s[14]); + + // stage 3 + x[0] = _mm256_add_epi16(s[0], s[2]); + x[1] = _mm256_add_epi16(s[1], s[3]); + x[2] = _mm256_sub_epi16(s[0], s[2]); + x[3] = _mm256_sub_epi16(s[1], s[3]); + + x[8] = _mm256_add_epi16(s[8], s[10]); + x[9] = _mm256_add_epi16(s[9], s[11]); + x[10] = _mm256_sub_epi16(s[8], s[10]); + x[11] = _mm256_sub_epi16(s[9], s[11]); + + 
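// Remaining stage-3 outputs are rotations: (s4,s5)/(s6,s7) and
+  // (s12,s13)/(s14,s15) pair with the +/-(cospi_8, cospi_24) constants;
+  // butterfly_32b leaves the madd products in 32 bits so add_rnd/sub_rnd
+  // apply dct_const_round_shift only once before packing back to 16 bits.
+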
butterfly_32b(&s[4], &s[5], &cospi_p08_p24, &cospi_p24_m08, u); + butterfly_32b(&s[6], &s[7], &cospi_m24_p08, &cospi_p08_p24, v); + add_rnd(u, v, &x[4]); + sub_rnd(u, v, &x[6]); + + butterfly_32b(&s[12], &s[13], &cospi_p08_p24, &cospi_p24_m08, u); + butterfly_32b(&s[14], &s[15], &cospi_m24_p08, &cospi_p08_p24, v); + add_rnd(u, v, &x[12]); + sub_rnd(u, v, &x[14]); + + // stage 4 + butterfly_32b(&x[2], &x[3], &cospi_m16_m16, &cospi_p16_m16, u); + butterfly_32b(&x[6], &x[7], &cospi_p16_p16, &cospi_m16_p16, v); + butterfly_rnd(u, &x[2]); + butterfly_rnd(v, &x[6]); + + butterfly_32b(&x[10], &x[11], &cospi_p16_p16, &cospi_m16_p16, u); + butterfly_32b(&x[14], &x[15], &cospi_m16_m16, &cospi_p16_m16, v); + butterfly_rnd(u, &x[10]); + butterfly_rnd(v, &x[14]); + + in[0] = x[0]; + in[1] = _mm256_sub_epi16(zero, x[8]); + in[2] = x[12]; + in[3] = _mm256_sub_epi16(zero, x[4]); + in[4] = x[6]; + in[5] = x[14]; + in[6] = x[10]; + in[7] = x[2]; + in[8] = x[3]; + in[9] = x[11]; + in[10] = x[15]; + in[11] = x[7]; + in[12] = x[5]; + in[13] = _mm256_sub_epi16(zero, x[13]); + in[14] = x[9]; + in[15] = _mm256_sub_epi16(zero, x[1]); +} + +static void iadst16(__m256i *in) { + mm256_transpose_16x16(in); + iadst16_avx2(in); +} + +#if CONFIG_EXT_TX +static void flip_row(__m256i *in, int rows) { + int i; + for (i = 0; i < rows; ++i) { + mm256_reverse_epi16(&in[i]); + } +} + +static void flip_col(uint8_t **dest, int *stride, int rows) { + *dest = *dest + (rows - 1) * (*stride); + *stride = -*stride; +} + +static void iidtx16(__m256i *in) { + mm256_transpose_16x16(in); + txfm_scaling16_avx2(Sqrt2, in); +} +#endif + +void av1_iht16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest, + int stride, int tx_type) { + __m256i in[16]; + + load_buffer_16x16(input, in); + switch (tx_type) { + case DCT_DCT: + idct16(in); + idct16(in); + break; + case ADST_DCT: + idct16(in); + iadst16(in); + break; + case DCT_ADST: + iadst16(in); + idct16(in); + break; + case ADST_ADST: + iadst16(in); + iadst16(in); + break; +#if CONFIG_EXT_TX + case FLIPADST_DCT: + idct16(in); + iadst16(in); + flip_col(&dest, &stride, 16); + break; + case DCT_FLIPADST: + iadst16(in); + idct16(in); + flip_row(in, 16); + break; + case FLIPADST_FLIPADST: + iadst16(in); + iadst16(in); + flip_row(in, 16); + flip_col(&dest, &stride, 16); + break; + case ADST_FLIPADST: + iadst16(in); + iadst16(in); + flip_row(in, 16); + break; + case FLIPADST_ADST: + iadst16(in); + iadst16(in); + flip_col(&dest, &stride, 16); + break; + case V_DCT: + iidtx16(in); + idct16(in); + break; + case H_DCT: + idct16(in); + iidtx16(in); + break; + case V_ADST: + iidtx16(in); + iadst16(in); + break; + case H_ADST: + iadst16(in); + iidtx16(in); + break; + case V_FLIPADST: + iidtx16(in); + iadst16(in); + flip_col(&dest, &stride, 16); + break; + case H_FLIPADST: + iadst16(in); + iidtx16(in); + flip_row(in, 16); + break; +#endif // CONFIG_EXT_TX + default: assert(0); break; + } + write_buffer_16x16(in, stride, dest); +} diff --git a/av1/common/x86/idct_intrin_sse2.c b/av1/common/x86/idct_intrin_sse2.c index 10102e7ff..5d28a2831 100644 --- a/av1/common/x86/idct_intrin_sse2.c +++ b/av1/common/x86/idct_intrin_sse2.c @@ -242,69 +242,6 @@ void av1_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, RECON_AND_STORE(dest + 7 * stride, in[7]); } -void av1_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride, int tx_type) { - __m128i in[32]; - __m128i *in0 = &in[0]; - __m128i *in1 = &in[16]; - - load_buffer_8x16(input, in0); - input += 8; - load_buffer_8x16(input, 
in1); - - switch (tx_type) { - case DCT_DCT: - aom_idct16_sse2(in0, in1); - aom_idct16_sse2(in0, in1); - break; - case ADST_DCT: - aom_idct16_sse2(in0, in1); - aom_iadst16_sse2(in0, in1); - break; - case DCT_ADST: - aom_iadst16_sse2(in0, in1); - aom_idct16_sse2(in0, in1); - break; - case ADST_ADST: - aom_iadst16_sse2(in0, in1); - aom_iadst16_sse2(in0, in1); - break; -#if CONFIG_EXT_TX - case FLIPADST_DCT: - aom_idct16_sse2(in0, in1); - aom_iadst16_sse2(in0, in1); - FLIPUD_PTR(dest, stride, 16); - break; - case DCT_FLIPADST: - aom_iadst16_sse2(in0, in1); - aom_idct16_sse2(in0, in1); - FLIPLR_16x16(in0, in1); - break; - case FLIPADST_FLIPADST: - aom_iadst16_sse2(in0, in1); - aom_iadst16_sse2(in0, in1); - FLIPUD_PTR(dest, stride, 16); - FLIPLR_16x16(in0, in1); - break; - case ADST_FLIPADST: - aom_iadst16_sse2(in0, in1); - aom_iadst16_sse2(in0, in1); - FLIPLR_16x16(in0, in1); - break; - case FLIPADST_ADST: - aom_iadst16_sse2(in0, in1); - aom_iadst16_sse2(in0, in1); - FLIPUD_PTR(dest, stride, 16); - break; -#endif // CONFIG_EXT_TX - default: assert(0); break; - } - - write_buffer_8x16(dest, in0, stride); - dest += 8; - write_buffer_8x16(dest, in1, stride); -} - #if CONFIG_EXT_TX static void iidtx16_8col(__m128i *in) { const __m128i k__zero_epi16 = _mm_set1_epi16((int16_t)0); @@ -501,7 +438,98 @@ static void iidtx16_sse2(__m128i *in0, __m128i *in1) { iidtx16_8col(in0); iidtx16_8col(in1); } +#endif +void av1_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride, int tx_type) { + __m128i in[32]; + __m128i *in0 = &in[0]; + __m128i *in1 = &in[16]; + + load_buffer_8x16(input, in0); + input += 8; + load_buffer_8x16(input, in1); + + switch (tx_type) { + case DCT_DCT: + aom_idct16_sse2(in0, in1); + aom_idct16_sse2(in0, in1); + break; + case ADST_DCT: + aom_idct16_sse2(in0, in1); + aom_iadst16_sse2(in0, in1); + break; + case DCT_ADST: + aom_iadst16_sse2(in0, in1); + aom_idct16_sse2(in0, in1); + break; + case ADST_ADST: + aom_iadst16_sse2(in0, in1); + aom_iadst16_sse2(in0, in1); + break; +#if CONFIG_EXT_TX + case FLIPADST_DCT: + aom_idct16_sse2(in0, in1); + aom_iadst16_sse2(in0, in1); + FLIPUD_PTR(dest, stride, 16); + break; + case DCT_FLIPADST: + aom_iadst16_sse2(in0, in1); + aom_idct16_sse2(in0, in1); + FLIPLR_16x16(in0, in1); + break; + case FLIPADST_FLIPADST: + aom_iadst16_sse2(in0, in1); + aom_iadst16_sse2(in0, in1); + FLIPUD_PTR(dest, stride, 16); + FLIPLR_16x16(in0, in1); + break; + case ADST_FLIPADST: + aom_iadst16_sse2(in0, in1); + aom_iadst16_sse2(in0, in1); + FLIPLR_16x16(in0, in1); + break; + case FLIPADST_ADST: + aom_iadst16_sse2(in0, in1); + aom_iadst16_sse2(in0, in1); + FLIPUD_PTR(dest, stride, 16); + break; + case V_DCT: + iidtx16_sse2(in0, in1); + aom_idct16_sse2(in0, in1); + break; + case H_DCT: + aom_idct16_sse2(in0, in1); + iidtx16_sse2(in0, in1); + break; + case V_ADST: + iidtx16_sse2(in0, in1); + aom_iadst16_sse2(in0, in1); + break; + case H_ADST: + aom_iadst16_sse2(in0, in1); + iidtx16_sse2(in0, in1); + break; + case V_FLIPADST: + iidtx16_sse2(in0, in1); + aom_iadst16_sse2(in0, in1); + FLIPUD_PTR(dest, stride, 16); + break; + case H_FLIPADST: + aom_iadst16_sse2(in0, in1); + iidtx16_sse2(in0, in1); + FLIPLR_16x16(in0, in1); + break; +#endif // CONFIG_EXT_TX + default: assert(0); break; + } + + write_buffer_8x16(dest, in0, stride); + dest += 8; + write_buffer_8x16(dest, in1, stride); +} + +#if CONFIG_EXT_TX static void iidtx8_sse2(__m128i *in) { in[0] = _mm_slli_epi16(in[0], 1); in[1] = _mm_slli_epi16(in[1], 1); diff --git a/av1/encoder/x86/hybrid_fwd_txfm_avx2.c 
b/av1/encoder/x86/hybrid_fwd_txfm_avx2.c index f4bd1427d..77ae724f6 100644 --- a/av1/encoder/x86/hybrid_fwd_txfm_avx2.c +++ b/av1/encoder/x86/hybrid_fwd_txfm_avx2.c @@ -18,14 +18,6 @@ #include "aom_dsp/txfm_common.h" #include "aom_dsp/x86/txfm_common_avx2.h" -static INLINE void mm256_reverse_epi16(__m256i *u) { - const __m256i control = _mm256_set_epi16( - 0x0100, 0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E, 0x0100, - 0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E); - __m256i v = _mm256_shuffle_epi8(*u, control); - *u = _mm256_permute2x128_si256(v, v, 1); -} - static int32_t get_16x16_sum(const int16_t *input, int stride) { __m256i r0, r1, r2, r3, u0, u1; __m256i zero = _mm256_setzero_si256(); @@ -71,134 +63,6 @@ void aom_fdct16x16_1_avx2(const int16_t *input, tran_low_t *output, _mm256_zeroupper(); } -static void mm256_transpose_16x16(__m256i *in) { - __m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]); - __m256i tr0_1 = _mm256_unpackhi_epi16(in[0], in[1]); - __m256i tr0_2 = _mm256_unpacklo_epi16(in[2], in[3]); - __m256i tr0_3 = _mm256_unpackhi_epi16(in[2], in[3]); - __m256i tr0_4 = _mm256_unpacklo_epi16(in[4], in[5]); - __m256i tr0_5 = _mm256_unpackhi_epi16(in[4], in[5]); - __m256i tr0_6 = _mm256_unpacklo_epi16(in[6], in[7]); - __m256i tr0_7 = _mm256_unpackhi_epi16(in[6], in[7]); - - __m256i tr0_8 = _mm256_unpacklo_epi16(in[8], in[9]); - __m256i tr0_9 = _mm256_unpackhi_epi16(in[8], in[9]); - __m256i tr0_a = _mm256_unpacklo_epi16(in[10], in[11]); - __m256i tr0_b = _mm256_unpackhi_epi16(in[10], in[11]); - __m256i tr0_c = _mm256_unpacklo_epi16(in[12], in[13]); - __m256i tr0_d = _mm256_unpackhi_epi16(in[12], in[13]); - __m256i tr0_e = _mm256_unpacklo_epi16(in[14], in[15]); - __m256i tr0_f = _mm256_unpackhi_epi16(in[14], in[15]); - - // 00 10 01 11 02 12 03 13 08 18 09 19 0a 1a 0b 1b - // 04 14 05 15 06 16 07 17 0c 1c 0d 1d 0e 1e 0f 1f - // 20 30 21 31 22 32 23 33 28 38 29 39 2a 3a 2b 3b - // 24 34 25 35 26 36 27 37 2c 3c 2d 3d 2e 3e 2f 3f - // 40 50 41 51 42 52 43 53 48 58 49 59 4a 5a 4b 5b - // 44 54 45 55 46 56 47 57 4c 5c 4d 5d 4e 5e 4f 5f - // 60 70 61 71 62 72 63 73 68 78 69 79 6a 7a 6b 7b - // 64 74 65 75 66 76 67 77 6c 7c 6d 7d 6e 7e 6f 7f - - // 80 90 81 91 82 92 83 93 88 98 89 99 8a 9a 8b 9b - // 84 94 85 95 86 96 87 97 8c 9c 8d 9d 8e 9e 8f 9f - // a0 b0 a1 b1 a2 b2 a3 b3 a8 b8 a9 b9 aa ba ab bb - // a4 b4 a5 b5 a6 b6 a7 b7 ac bc ad bd ae be af bf - // c0 d0 c1 d1 c2 d2 c3 d3 c8 d8 c9 d9 ca da cb db - // c4 d4 c5 d5 c6 d6 c7 d7 cc dc cd dd ce de cf df - // e0 f0 e1 f1 e2 f2 e3 f3 e8 f8 e9 f9 ea fa eb fb - // e4 f4 e5 f5 e6 f6 e7 f7 ec fc ed fd ee fe ef ff - - __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_2); - __m256i tr1_1 = _mm256_unpackhi_epi32(tr0_0, tr0_2); - __m256i tr1_2 = _mm256_unpacklo_epi32(tr0_1, tr0_3); - __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_1, tr0_3); - __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_6); - __m256i tr1_5 = _mm256_unpackhi_epi32(tr0_4, tr0_6); - __m256i tr1_6 = _mm256_unpacklo_epi32(tr0_5, tr0_7); - __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_5, tr0_7); - - __m256i tr1_8 = _mm256_unpacklo_epi32(tr0_8, tr0_a); - __m256i tr1_9 = _mm256_unpackhi_epi32(tr0_8, tr0_a); - __m256i tr1_a = _mm256_unpacklo_epi32(tr0_9, tr0_b); - __m256i tr1_b = _mm256_unpackhi_epi32(tr0_9, tr0_b); - __m256i tr1_c = _mm256_unpacklo_epi32(tr0_c, tr0_e); - __m256i tr1_d = _mm256_unpackhi_epi32(tr0_c, tr0_e); - __m256i tr1_e = _mm256_unpacklo_epi32(tr0_d, tr0_f); - __m256i tr1_f = _mm256_unpackhi_epi32(tr0_d, tr0_f); - - // 00 10 20 30 01 11 21 31 08 18 28 38 
09 19 29 39 - // 02 12 22 32 03 13 23 33 0a 1a 2a 3a 0b 1b 2b 3b - // 04 14 24 34 05 15 25 35 0c 1c 2c 3c 0d 1d 2d 3d - // 06 16 26 36 07 17 27 37 0e 1e 2e 3e 0f 1f 2f 3f - // 40 50 60 70 41 51 61 71 48 58 68 78 49 59 69 79 - // 42 52 62 72 43 53 63 73 4a 5a 6a 7a 4b 5b 6b 7b - // 44 54 64 74 45 55 65 75 4c 5c 6c 7c 4d 5d 6d 7d - // 46 56 66 76 47 57 67 77 4e 5e 6e 7e 4f 5f 6f 7f - - // 80 90 a0 b0 81 91 a1 b1 88 98 a8 b8 89 99 a9 b9 - // 82 92 a2 b2 83 93 a3 b3 8a 9a aa ba 8b 9b ab bb - // 84 94 a4 b4 85 95 a5 b5 8c 9c ac bc 8d 9d ad bd - // 86 96 a6 b6 87 97 a7 b7 8e ae 9e be 8f 9f af bf - // c0 d0 e0 f0 c1 d1 e1 f1 c8 d8 e8 f8 c9 d9 e9 f9 - // c2 d2 e2 f2 c3 d3 e3 f3 ca da ea fa cb db eb fb - // c4 d4 e4 f4 c5 d5 e5 f5 cc dc ef fc cd dd ed fd - // c6 d6 e6 f6 c7 d7 e7 f7 ce de ee fe cf df ef ff - - tr0_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4); - tr0_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4); - tr0_2 = _mm256_unpacklo_epi64(tr1_1, tr1_5); - tr0_3 = _mm256_unpackhi_epi64(tr1_1, tr1_5); - tr0_4 = _mm256_unpacklo_epi64(tr1_2, tr1_6); - tr0_5 = _mm256_unpackhi_epi64(tr1_2, tr1_6); - tr0_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7); - tr0_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7); - - tr0_8 = _mm256_unpacklo_epi64(tr1_8, tr1_c); - tr0_9 = _mm256_unpackhi_epi64(tr1_8, tr1_c); - tr0_a = _mm256_unpacklo_epi64(tr1_9, tr1_d); - tr0_b = _mm256_unpackhi_epi64(tr1_9, tr1_d); - tr0_c = _mm256_unpacklo_epi64(tr1_a, tr1_e); - tr0_d = _mm256_unpackhi_epi64(tr1_a, tr1_e); - tr0_e = _mm256_unpacklo_epi64(tr1_b, tr1_f); - tr0_f = _mm256_unpackhi_epi64(tr1_b, tr1_f); - - // 00 10 20 30 40 50 60 70 08 18 28 38 48 58 68 78 - // 01 11 21 31 41 51 61 71 09 19 29 39 49 59 69 79 - // 02 12 22 32 42 52 62 72 0a 1a 2a 3a 4a 5a 6a 7a - // 03 13 23 33 43 53 63 73 0b 1b 2b 3b 4b 5b 6b 7b - // 04 14 24 34 44 54 64 74 0c 1c 2c 3c 4c 5c 6c 7c - // 05 15 25 35 45 55 65 75 0d 1d 2d 3d 4d 5d 6d 7d - // 06 16 26 36 46 56 66 76 0e 1e 2e 3e 4e 5e 6e 7e - // 07 17 27 37 47 57 67 77 0f 1f 2f 3f 4f 5f 6f 7f - - // 80 90 a0 b0 c0 d0 e0 f0 88 98 a8 b8 c8 d8 e8 f8 - // 81 91 a1 b1 c1 d1 e1 f1 89 99 a9 b9 c9 d9 e9 f9 - // 82 92 a2 b2 c2 d2 e2 f2 8a 9a aa ba ca da ea fa - // 83 93 a3 b3 c3 d3 e3 f3 8b 9b ab bb cb db eb fb - // 84 94 a4 b4 c4 d4 e4 f4 8c 9c ac bc cc dc ef fc - // 85 95 a5 b5 c5 d5 e5 f5 8d 9d ad bd cd dd ed fd - // 86 96 a6 b6 c6 d6 e6 f6 8e ae 9e be ce de ee fe - // 87 97 a7 b7 c7 d7 e7 f7 8f 9f af bf cf df ef ff - - in[0] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x20); // 0010 0000 - in[8] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x31); // 0011 0001 - in[1] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x20); - in[9] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x31); - in[2] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x20); - in[10] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x31); - in[3] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x20); - in[11] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x31); - - in[4] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x20); - in[12] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x31); - in[5] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x20); - in[13] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x31); - in[6] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x20); - in[14] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x31); - in[7] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x20); - in[15] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x31); -} - static INLINE void load_buffer_16x16(const int16_t *input, int stride, int flipud, int fliplr, __m256i *in) { if (!flipud) { @@ -352,19 +216,6 @@ static void right_shift_16x16(__m256i 
*in) { in[15] = _mm256_srai_epi16(in[15], 2); } -static INLINE __m256i butter_fly(__m256i a0, __m256i a1, const __m256i cospi) { - const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING); - __m256i y0 = _mm256_madd_epi16(a0, cospi); - __m256i y1 = _mm256_madd_epi16(a1, cospi); - - y0 = _mm256_add_epi32(y0, dct_rounding); - y1 = _mm256_add_epi32(y1, dct_rounding); - y0 = _mm256_srai_epi32(y0, DCT_CONST_BITS); - y1 = _mm256_srai_epi32(y1, DCT_CONST_BITS); - - return _mm256_packs_epi32(y0, y1); -} - static void fdct16_avx2(__m256i *in) { // sequence: cospi_L_H = pairs(L, H) and L first const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64); @@ -1099,31 +950,7 @@ void fadst16_avx2(__m256i *in) { } #if CONFIG_EXT_TX -static void fidtx16_avx2(__m256i *in) { - const __m256i zero = _mm256_setzero_si256(); - const __m256i sqrt2_epi16 = _mm256_set1_epi16((int16_t)Sqrt2); - const __m256i dct_const_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING); - __m256i u0, u1; - int i = 0; - - while (i < 16) { - in[i] = _mm256_slli_epi16(in[i], 1); - - u0 = _mm256_unpacklo_epi16(zero, in[i]); - u1 = _mm256_unpackhi_epi16(zero, in[i]); - - u0 = _mm256_madd_epi16(u0, sqrt2_epi16); - u1 = _mm256_madd_epi16(u1, sqrt2_epi16); - - u0 = _mm256_add_epi32(u0, dct_const_rounding); - u1 = _mm256_add_epi32(u1, dct_const_rounding); - - u0 = _mm256_srai_epi32(u0, DCT_CONST_BITS); - u1 = _mm256_srai_epi32(u1, DCT_CONST_BITS); - in[i] = _mm256_packs_epi32(u0, u1); - i++; - } -} +static void fidtx16_avx2(__m256i *in) { txfm_scaling16_avx2(Sqrt2, in); } #endif void av1_fht16x16_avx2(const int16_t *input, tran_low_t *output, int stride, diff --git a/test/av1_fht16x16_test.cc b/test/av1_fht16x16_test.cc index 4a44e16d7..0b890716a 100644 --- a/test/av1_fht16x16_test.cc +++ b/test/av1_fht16x16_test.cc @@ -33,6 +33,11 @@ void fht16x16_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) { av1_fht16x16_c(in, out, stride, tx_type); } +void iht16x16_ref(const tran_low_t *in, uint8_t *dest, int stride, + int tx_type) { + av1_iht16x16_256_add_c(in, dest, stride, tx_type); +} + #if CONFIG_AOM_HIGHBITDEPTH typedef void (*IHbdHtFunc)(const tran_low_t *in, uint8_t *out, int stride, int tx_type, int bd); @@ -48,16 +53,6 @@ void highbd_fht16x16_ref(const int16_t *in, int32_t *out, int stride, } #endif // CONFIG_AOM_HIGHBITDEPTH -#if HAVE_AVX2 -void dummy_inv_txfm(const tran_low_t *in, uint8_t *out, int stride, - int tx_type) { - (void)in; - (void)out; - (void)stride; - (void)tx_type; -} -#endif - class AV1Trans16x16HT : public libaom_test::TransformTestBase, public ::testing::TestWithParam { public: @@ -70,6 +65,7 @@ class AV1Trans16x16HT : public libaom_test::TransformTestBase, pitch_ = 16; height_ = 16; fwd_txfm_ref = fht16x16_ref; + inv_txfm_ref = iht16x16_ref; bit_depth_ = GET_PARAM(3); mask_ = (1 << bit_depth_) - 1; num_coeffs_ = GET_PARAM(4); @@ -90,6 +86,7 @@ class AV1Trans16x16HT : public libaom_test::TransformTestBase, }; TEST_P(AV1Trans16x16HT, CoeffCheck) { RunCoeffCheck(); } +TEST_P(AV1Trans16x16HT, InvCoeffCheck) { RunInvCoeffCheck(); } #if CONFIG_AOM_HIGHBITDEPTH class AV1HighbdTrans16x16HT @@ -203,22 +200,27 @@ INSTANTIATE_TEST_CASE_P(SSE2, AV1Trans16x16HT, #if HAVE_AVX2 const Ht16x16Param kArrayHt16x16Param_avx2[] = { - make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 0, AOM_BITS_8, 256), - make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 1, AOM_BITS_8, 256), - make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 2, AOM_BITS_8, 256), - make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 3, AOM_BITS_8, 
256), + make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 0, AOM_BITS_8, 256), + make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 1, AOM_BITS_8, 256), + make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 2, AOM_BITS_8, 256), + make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 3, AOM_BITS_8, 256), #if CONFIG_EXT_TX - make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 4, AOM_BITS_8, 256), - make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 5, AOM_BITS_8, 256), - make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 6, AOM_BITS_8, 256), - make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 7, AOM_BITS_8, 256), - make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 8, AOM_BITS_8, 256), - make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 10, AOM_BITS_8, 256), - make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 11, AOM_BITS_8, 256), - make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 12, AOM_BITS_8, 256), - make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 13, AOM_BITS_8, 256), - make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 14, AOM_BITS_8, 256), - make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 15, AOM_BITS_8, 256) + make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 4, AOM_BITS_8, 256), + make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 5, AOM_BITS_8, 256), + make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 6, AOM_BITS_8, 256), + make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 7, AOM_BITS_8, 256), + make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 8, AOM_BITS_8, 256), + make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 10, AOM_BITS_8, + 256), + make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 11, AOM_BITS_8, + 256), + make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 12, AOM_BITS_8, + 256), + make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 13, AOM_BITS_8, + 256), + make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 14, AOM_BITS_8, + 256), + make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 15, AOM_BITS_8, 256) #endif // CONFIG_EXT_TX }; INSTANTIATE_TEST_CASE_P(AVX2, AV1Trans16x16HT, diff --git a/test/transform_test_base.h b/test/transform_test_base.h index 540136cc0..64bf2d6a1 100644 --- a/test/transform_test_base.h +++ b/test/transform_test_base.h @@ -210,7 +210,7 @@ class TransformTestBase { int out_idx = j * stride + k; ASSERT_EQ(output_block[out_idx], output_ref_block[out_idx]) << "Error: not bit-exact result at index: " << out_idx - << " at test block: " << i; + << " j = " << j << " k = " << k << " at test block: " << i; } } }
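
For reference, the new InvCoeffCheck test exercises exactly this pairing of the C reference and the AVX2 implementation. A minimal standalone sketch of the same bit-exactness check (illustrative only, not part of the patch; it assumes the generated ./av1_rtcd.h, which declares av1_iht16x16_256_add_{c,avx2} and tran_low_t, is on the include path and the build targets AVX2):

#include <stdint.h>
#include <string.h>

#include "./av1_rtcd.h"

// Run the C reference and the AVX2 path on the same 16x16 coefficient
// block; returns 1 when all 256 reconstructed pixels are bit-exact.
static int iht16x16_bit_exact(const tran_low_t *coeff, int tx_type) {
  uint8_t dst_c[16 * 16];
  uint8_t dst_avx2[16 * 16];
  memset(dst_c, 128, sizeof(dst_c));  // identical starting prediction
  memset(dst_avx2, 128, sizeof(dst_avx2));

  av1_iht16x16_256_add_c(coeff, dst_c, 16, tx_type);
  av1_iht16x16_256_add_avx2(coeff, dst_avx2, 16, tx_type);

  return memcmp(dst_c, dst_avx2, sizeof(dst_c)) == 0;
}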