Replace DST1 with DST2 for ext-tx experiment

A small gain (0.1 - 0.2%) with this experiment on derflr/hevcmr. The DST2 can be implemented very efficiently using sign flipping of odd-indexed inputs, followed by DCT, followed by reversal of the output. This is how it is implemented in this patch. SIMD optimization is pending. Change-Id: Ic2fc211ce0e6b7c6702974d76d6573f55cc4da0e
2015-12-09 19:12:09 -08:00 · 2015-12-09 19:12:09 -08:00 · e6790e30c5
commit e6790e30c5
parent d7eb423a72
4 changed files with 1284 additions and 387 deletions
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
--- a/vp9/common/vp9_idct.h
+++ b/vp9/common/vp9_idct.h
@ -82,27 +82,6 @@ static const tran_high_t sinpi_2_9 = 9929;
 static const tran_high_t sinpi_3_9 = 13377;
 static const tran_high_t sinpi_4_9 = 15212;

-#if CONFIG_EXT_TX
-static const int32_t dst_lookup4[] = {
-  // {sin(pi/5), sin(pi*2/5)} * sqrt(2/5) * sqrt(2)
-  // at precision of 2 * DCT_CONST_BITS bits
-  141124871, 228344838,
-};
-
-static const int32_t dst_lookup8[] = {
-  // {sin(pi/9), sin(pi*2/9), ..., sin(pi*4/9)} * sqrt(2/9) * 2
-  // at precision of 2 * DCT_CONST_BITS bits
-  86559612, 162678858, 219176632, 249238470
-};
-
-static const int32_t dst_lookup16[] = {
-  // {sin(pi/17), sin(pi*2/17, ..., sin(pi*8/17)} * sqrt(2/17) * 2 * sqrt(2)
-  // at precision of 2 * DCT_CONST_BITS bits
-  47852167, 94074787, 137093803, 175444254,
-  207820161, 233119001, 250479254, 259309736
-};
-#endif  // CONFIG_EXT_TX
-
 static INLINE tran_low_t check_range(tran_high_t input) {
 #if CONFIG_VP9_HIGHBITDEPTH
  // For valid highbitdepth VP9 streams, intermediate stage coefficients will
@ -169,6 +148,220 @@ static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) {
  return clip_pixel(WRAPLOW(dest + trans, 8));
 }

+#if CONFIG_EXT_TX
+#define USE_DST2 1
+
+#if USE_DST2
+static const tran_high_t Tx4[4 * 4] = {
+  // dst2: 4x4 integer coefficient matrix stored row-major, one source line per row
+  6270,  15137,  15137,   6270,
+  11585,  11585, -11585, -11585,
+  15137,  -6270,  -6270,  15137,
+  11585, -11585,  11585, -11585,
+};
+
+static const tran_high_t Tx8[8 * 8] = {
+  // dst2: 8x8 integer coefficient matrix stored row-major, one source line per row
+  3196,   9102,  13623,  16069,  16069,  13623,   9102,   3196,
+  6270,  15137,  15137,   6270,  -6270, -15137, -15137,  -6270,
+  9102,  16069,   3196, -13623, -13623,   3196,  16069,   9102,
+  11585,  11585, -11585, -11585,  11585,  11585, -11585, -11585,
+  13623,   3196, -16069,   9102,   9102, -16069,   3196,  13623,
+  15137,  -6270,  -6270,  15137, -15137,   6270,   6270, -15137,
+  16069, -13623,   9102,  -3196,  -3196,   9102, -13623,  16069,
+  11585, -11585,  11585, -11585,  11585, -11585,  11585, -11585,
+};
+
+static const tran_high_t Tx16[16 * 16] = {
+  // dst2: 16x16 integer coefficient matrix stored row-major; each 16-entry row spans two source lines
+  1606,   4756,   7723,  10394,  12665,  14449,  15679,  16305,
+  16305,  15679,  14449,  12665,  10394,   7723,   4756,   1606,
+  3196,   9102,  13623,  16069,  16069,  13623,   9102,   3196,
+  -3196,  -9102, -13623, -16069, -16069, -13623,  -9102,  -3196,
+  4756,  12665,  16305,  14449,   7723,  -1606, -10394, -15679,
+  -15679, -10394,  -1606,   7723,  14449,  16305,  12665,   4756,
+  6270,  15137,  15137,   6270,  -6270, -15137, -15137,  -6270,
+  6270,  15137,  15137,   6270,  -6270, -15137, -15137,  -6270,
+  7723,  16305,  10394,  -4756, -15679, -12665,   1606,  14449,
+  14449,   1606, -12665, -15679,  -4756,  10394,  16305,   7723,
+  9102,  16069,   3196, -13623, -13623,   3196,  16069,   9102,
+  -9102, -16069,  -3196,  13623,  13623,  -3196, -16069,  -9102,
+  10394,  14449,  -4756, -16305,  -1606,  15679,   7723, -12665,
+  -12665,   7723,  15679,  -1606, -16305,  -4756,  14449,  10394,
+  11585,  11585, -11585, -11585,  11585,  11585, -11585, -11585,
+  11585,  11585, -11585, -11585,  11585,  11585, -11585, -11585,
+  12665,   7723, -15679,  -1606,  16305,  -4756, -14449,  10394,
+  10394, -14449,  -4756,  16305,  -1606, -15679,   7723,  12665,
+  13623,   3196, -16069,   9102,   9102, -16069,   3196,  13623,
+  -13623,  -3196,  16069,  -9102,  -9102,  16069,  -3196, -13623,
+  14449,  -1606, -12665,  15679,  -4756, -10394,  16305,  -7723,
+  -7723,  16305, -10394,  -4756,  15679, -12665,  -1606,  14449,
+  15137,  -6270,  -6270,  15137, -15137,   6270,   6270, -15137,
+  15137,  -6270,  -6270,  15137, -15137,   6270,   6270, -15137,
+  15679, -10394,   1606,   7723, -14449,  16305, -12665,   4756,
+  4756, -12665,  16305, -14449,   7723,   1606, -10394,  15679,
+  16069, -13623,   9102,  -3196,  -3196,   9102, -13623,  16069,
+  -16069,  13623,  -9102,   3196,   3196,  -9102,  13623, -16069,
+  16305, -15679,  14449, -12665,  10394,  -7723,   4756,  -1606,
+  -1606,   4756,  -7723,  10394, -12665,  14449, -15679,  16305,
+  11585, -11585,  11585, -11585,  11585, -11585,  11585, -11585,
+  11585, -11585,  11585, -11585,  11585, -11585,  11585, -11585,
+};
+#endif  // USE_DST2
+
+static INLINE void vp9_fgentx4(const tran_low_t *input, tran_low_t *output,  // generic 4-point forward transform
+                               const tran_high_t *T) {  // T: 16 coefficients read as a row-major 4x4 matrix
+  tran_high_t sum;
+  int i;
+  const tran_high_t *Tx = T;  // Tx points at the row of T used for output[i]
+  for (i = 0; i < 4; ++i, Tx += 4) {  // one matrix row per output coefficient
+    sum = Tx[0] * input[0] + Tx[1] * input[1] +  // dot product of row i with the 4 inputs
+          Tx[2] * input[2] + Tx[3] * input[3];
+    output[i] = ROUND_POWER_OF_TWO(sum, DCT_CONST_BITS);  // stored while later rows still read input[], so input and output must not overlap
+  }
+}
+
+static INLINE void vp9_fgentx8(const tran_low_t *input, tran_low_t *output,  // generic 8-point forward transform
+                               const tran_high_t *T) {  // T: 64 coefficients read as a row-major 8x8 matrix
+  tran_high_t sum;
+  int i;
+  const tran_high_t *Tx = T;  // Tx points at the row of T used for output[i]
+  for (i = 0; i < 8; ++i, Tx += 8) {  // one matrix row per output coefficient
+    sum = Tx[0] * input[0] + Tx[1] * input[1] +  // dot product of row i with the 8 inputs
+          Tx[2] * input[2] + Tx[3] * input[3] +
+          Tx[4] * input[4] + Tx[5] * input[5] +
+          Tx[6] * input[6] + Tx[7] * input[7];
+    output[i] = ROUND_POWER_OF_TWO(sum, DCT_CONST_BITS);  // stored while later rows still read input[], so input and output must not overlap
+  }
+}
+
+static INLINE void vp9_fgentx16(const tran_low_t *input, tran_low_t *output,  // generic 16-point forward transform
+                                const tran_high_t *T) {  // T: 256 coefficients read as a row-major 16x16 matrix
+  tran_high_t sum;
+  int i;
+  const tran_high_t *Tx = T;  // Tx points at the row of T used for output[i]
+  for (i = 0; i < 16; ++i, Tx += 16) {  // one matrix row per output coefficient
+    sum = Tx[0] * input[0] + Tx[1] * input[1] +  // dot product of row i with the 16 inputs
+          Tx[2] * input[2] + Tx[3] * input[3] +
+          Tx[4] * input[4] + Tx[5] * input[5] +
+          Tx[6] * input[6] + Tx[7] * input[7] +
+          Tx[8] * input[8] + Tx[9] * input[9] +
+          Tx[10] * input[10] + Tx[11] * input[11] +
+          Tx[12] * input[12] + Tx[13] * input[13] +
+          Tx[14] * input[14] + Tx[15] * input[15];
+    output[i] = ROUND_POWER_OF_TWO(sum, DCT_CONST_BITS);  // stored while later rows still read input[], so input and output must not overlap
+  }
+}
+
+static INLINE void vp9_igentx4(const tran_low_t *input, tran_low_t *output,  // generic 4-point inverse transform
+                               const tran_high_t *T) {  // T: same row-major 4x4 layout as the forward pass; applied transposed here
+  tran_high_t sum[4];  // every sum is computed before any output[] element is written
+  int i;
+  const tran_high_t *Tx = T;  // Tx points at the top of column i
+  for (i = 0; i < 4; ++i, ++Tx) {  // one matrix column per output sample
+    sum[i] = Tx[0] * input[0] + Tx[4] * input[1] +  // stride 4 steps down column i of T
+             Tx[8] * input[2] + Tx[12] * input[3];
+  }
+  for (i = 0; i < 4; ++i) {
+    output[i] = WRAPLOW(ROUND_POWER_OF_TWO(sum[i], DCT_CONST_BITS), 8);  // second WRAPLOW argument is fixed at 8 (presumably the bit depth - confirm)
+  }
+}
+
+static INLINE void vp9_igentx8(const tran_low_t *input, tran_low_t *output,  // generic 8-point inverse transform
+                               const tran_high_t *T) {  // T: same row-major 8x8 layout as the forward pass; applied transposed here
+  tran_high_t sum[8];  // every sum is computed before any output[] element is written
+  int i;
+  const tran_high_t *Tx = T;  // Tx points at the top of column i
+  for (i = 0; i < 8; ++i, ++Tx) {  // one matrix column per output sample
+    sum[i] = Tx[0] * input[0] + Tx[8] * input[1] +  // stride 8 steps down column i of T
+             Tx[16] * input[2] + Tx[24] * input[3] +
+             Tx[32] * input[4] + Tx[40] * input[5] +
+             Tx[48] * input[6] + Tx[56] * input[7];
+  }
+  for (i = 0; i < 8; ++i) {
+    output[i] = WRAPLOW(ROUND_POWER_OF_TWO(sum[i], DCT_CONST_BITS), 8);  // second WRAPLOW argument is fixed at 8 (presumably the bit depth - confirm)
+  }
+}
+
+static INLINE void vp9_igentx16(const tran_low_t *input, tran_low_t *output,  // generic 16-point inverse transform
+                                const tran_high_t *T) {  // T: same row-major 16x16 layout as the forward pass; applied transposed here
+  tran_high_t sum[16];  // every sum is computed before any output[] element is written
+  int i;
+  const tran_high_t *Tx = T;  // Tx points at the top of column i
+  for (i = 0; i < 16; ++i, ++Tx) {  // one matrix column per output sample
+    sum[i] = Tx[0] * input[0] + Tx[16] * input[1] +  // stride 16 steps down column i of T
+             Tx[32] * input[2] + Tx[48] * input[3] +
+             Tx[64] * input[4] + Tx[80] * input[5] +
+             Tx[96] * input[6] + Tx[112] * input[7] +
+             Tx[128] * input[8] + Tx[144] * input[9] +
+             Tx[160] * input[10] + Tx[176] * input[11] +
+             Tx[192] * input[12] + Tx[208] * input[13] +
+             Tx[224] * input[14] + Tx[240] * input[15];
+  }
+  for (i = 0; i < 16; ++i) {
+    output[i] = WRAPLOW(ROUND_POWER_OF_TWO(sum[i], DCT_CONST_BITS), 8);  // second WRAPLOW argument is fixed at 8 (presumably the bit depth - confirm)
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE void vp9_highbd_igentx4(const tran_low_t *input,  // 4-point inverse transform, high-bitdepth build only
+                                      tran_low_t *output,
+                                      int bd, const tran_high_t *T) {  // bd is forwarded to WRAPLOW; T is a row-major 4x4 matrix applied transposed
+  tran_high_t sum[4];  // every sum is computed before any output[] element is written
+  int i;
+  const tran_high_t *Tx = T;  // Tx points at the top of column i
+  (void) bd;  // NOTE(review): bd is also passed to WRAPLOW below; presumably this cast covers builds where WRAPLOW ignores it - confirm
+  for (i = 0; i < 4; ++i, Tx += 1) {  // one matrix column per output sample
+    sum[i] = Tx[0] * input[0] + Tx[4] * input[1] +  // stride 4 steps down column i of T
+             Tx[8] * input[2] + Tx[12] * input[3];
+  }
+  for (i = 0; i < 4; ++i) {
+    output[i] = WRAPLOW(ROUND_POWER_OF_TWO(sum[i], DCT_CONST_BITS), bd);  // same as the non-highbd variant except bd replaces the constant 8
+  }
+}
+
+static INLINE void vp9_highbd_igentx8(const tran_low_t *input,  // 8-point inverse transform, high-bitdepth build only
+                                      tran_low_t *output,
+                                      int bd, const tran_high_t *T) {  // bd is forwarded to WRAPLOW; T is a row-major 8x8 matrix applied transposed
+  tran_high_t sum[8];  // every sum is computed before any output[] element is written
+  int i;
+  const tran_high_t *Tx = T;  // Tx points at the top of column i
+  (void) bd;  // NOTE(review): bd is also passed to WRAPLOW below; presumably this cast covers builds where WRAPLOW ignores it - confirm
+  for (i = 0; i < 8; ++i, Tx += 1) {  // one matrix column per output sample
+    sum[i] = Tx[0] * input[0] + Tx[8] * input[1] +  // stride 8 steps down column i of T
+             Tx[16] * input[2] + Tx[24] * input[3] +
+             Tx[32] * input[4] + Tx[40] * input[5] +
+             Tx[48] * input[6] + Tx[56] * input[7];
+  }
+  for (i = 0; i < 8; ++i) {
+    output[i] = WRAPLOW(ROUND_POWER_OF_TWO(sum[i], DCT_CONST_BITS), bd);  // same as the non-highbd variant except bd replaces the constant 8
+  }
+}
+
+static INLINE void vp9_highbd_igentx16(const tran_low_t *input,  // 16-point inverse transform, high-bitdepth build only
+                                       tran_low_t *output,
+                                       int bd, const tran_high_t *T) {  // bd is forwarded to WRAPLOW; T is a row-major 16x16 matrix applied transposed
+  tran_high_t sum[16];  // every sum is computed before any output[] element is written
+  int i;
+  const tran_high_t *Tx = T;  // Tx points at the top of column i
+  (void) bd;  // NOTE(review): bd is also passed to WRAPLOW below; presumably this cast covers builds where WRAPLOW ignores it - confirm
+  for (i = 0; i < 16; ++i, Tx += 1) {  // one matrix column per output sample
+    sum[i] = Tx[0] * input[0] + Tx[16] * input[1] +  // stride 16 steps down column i of T
+             Tx[32] * input[2] + Tx[48] * input[3] +
+             Tx[64] * input[4] + Tx[80] * input[5] +
+             Tx[96] * input[6] + Tx[112] * input[7] +
+             Tx[128] * input[8] + Tx[144] * input[9] +
+             Tx[160] * input[10] + Tx[176] * input[11] +
+             Tx[192] * input[12] + Tx[208] * input[13] +
+             Tx[224] * input[14] + Tx[240] * input[15];
+  }
+  for (i = 0; i < 16; ++i) {
+    output[i] = WRAPLOW(ROUND_POWER_OF_TWO(sum[i], DCT_CONST_BITS), bd);  // same as the non-highbd variant except bd replaces the constant 8
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // CONFIG_EXT_TX
+
 void vp9_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
                     int eob);
 void vp9_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@ -1874,9 +1874,6 @@ static void build_inter_predictors_single_buf(MACROBLOCKD *xd,
  int is_global;
  gm = &xd->global_motion[mi->mbmi.ref_frame[ref]][0];
 #endif  // CONFIG_GLOBAL_MOTION
-#if CONFIG_INTRABC
-  assert(!is_intrabc || mi->mbmi.interp_filter == BILINEAR);
-#endif  // CONFIG_INTRABC

  const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
  struct buf_2d *const dst_buf = &pd->dst;
@ -1910,6 +1907,9 @@ static void build_inter_predictors_single_buf(MACROBLOCKD *xd,
  int xs, ys, subpel_x, subpel_y;
  const int is_scaled = vp9_is_scaled(sf);
  (void) dst_buf;
+#if CONFIG_INTRABC
+  assert(!is_intrabc || mi->mbmi.interp_filter == BILINEAR);
+#endif  // CONFIG_INTRABC

 #if CONFIG_GLOBAL_MOTION
  is_global = (get_y_mode(mi, block) == ZEROMV &&
@ -2017,10 +2017,10 @@ static void build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, int plane,
  struct macroblockd_plane *const pd = &xd->plane[plane];
  const MODE_INFO *mi = xd->mi[0].src_mi;
  const int is_compound = has_second_ref(&mi->mbmi);
+  int ref;
 #if CONFIG_INTRABC
  const int is_intrabc = is_intrabc_mode(mi->mbmi.mode);
 #endif  // CONFIG_INTRABC
-  int ref;
 #if CONFIG_GLOBAL_MOTION
  Global_Motion_Params *gm[2];
  gm[0] = &xd->global_motion[mi->mbmi.ref_frame[0]][0];
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@ -28,7 +28,38 @@ static INLINE tran_high_t fdct_round_shift(tran_high_t input) {
 }

 #if CONFIG_EXT_TX
+void vp9_fklt4(const tran_low_t *input, tran_low_t *output) {  // 4-point forward transform done as a plain matrix multiply with Tx4
+  vp9_fgentx4(input, output, Tx4);  // NOTE(review): Tx4 is declared under #if USE_DST2 in vp9_idct.h but this function is not guarded by it - confirm USE_DST2 == 0 still builds
+}
+
+void vp9_fklt8(const tran_low_t *input, tran_low_t *output) {  // 8-point forward transform done as a plain matrix multiply with Tx8
+  vp9_fgentx8(input, output, Tx8);  // NOTE(review): Tx8 is declared under #if USE_DST2 in vp9_idct.h but this function is not guarded by it - confirm USE_DST2 == 0 still builds
+}
+
+void vp9_fklt16(const tran_low_t *input, tran_low_t *output) {  // 16-point forward transform done as a plain matrix multiply with Tx16
+  vp9_fgentx16(input, output, Tx16);  // NOTE(review): Tx16 is declared under #if USE_DST2 in vp9_idct.h but this function is not guarded by it - confirm USE_DST2 == 0 still builds
+}
+
 void vp9_fdst4(const tran_low_t *input, tran_low_t *output) {
+#if USE_DST2
+  // vp9_fgentx4(input, output, Tx4);
+  tran_high_t step[4];
+  tran_high_t temp1, temp2;
+
+  step[0] = input[0] - input[3];
+  step[1] = -input[1] + input[2];
+  step[2] = -input[1] - input[2];
+  step[3] = input[0] + input[3];
+
+  temp1 = (step[0] + step[1]) * cospi_16_64;
+  temp2 = (step[0] - step[1]) * cospi_16_64;
+  output[3] = fdct_round_shift(temp1);
+  output[1] = fdct_round_shift(temp2);
+  temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
+  temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
+  output[2] = fdct_round_shift(temp1);
+  output[0] = fdct_round_shift(temp2);
+#else
  // {sin(pi/5), sin(pi*2/5)} * sqrt(2/5) * sqrt(2)
  static const int32_t sinvalue_lookup[] = {
    141124871, 228344838,
@ -46,9 +77,61 @@ void vp9_fdst4(const tran_low_t *input, tran_low_t *output) {
  output[2] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
  sum = d03 * sinvalue_lookup[0] - d12 * sinvalue_lookup[1];
  output[3] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
+#endif
 }

 void vp9_fdst8(const tran_low_t *input, tran_low_t *output) {
+#if USE_DST2
+  // vp9_fgentx8(input, output, Tx8);
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
+  tran_high_t t0, t1, t2, t3;                  // needs32
+  tran_high_t x0, x1, x2, x3;                  // canbe16
+
+  // stage 1
+  s0 = input[0] - input[7];
+  s1 = -input[1] + input[6];
+  s2 = input[2] - input[5];
+  s3 = -input[3] + input[4];
+  s4 = -input[3] - input[4];
+  s5 = input[2] + input[5];
+  s6 = -input[1] - input[6];
+  s7 = input[0] + input[7];
+
+  x0 = s0 + s3;
+  x1 = s1 + s2;
+  x2 = s1 - s2;
+  x3 = s0 - s3;
+  t0 = (x0 + x1) * cospi_16_64;
+  t1 = (x0 - x1) * cospi_16_64;
+  t2 =  x2 * cospi_24_64 + x3 *  cospi_8_64;
+  t3 = -x2 * cospi_8_64  + x3 * cospi_24_64;
+  output[7] = fdct_round_shift(t0);
+  output[5] = fdct_round_shift(t2);
+  output[3] = fdct_round_shift(t1);
+  output[1] = fdct_round_shift(t3);
+
+  // Stage 2
+  t0 = (s6 - s5) * cospi_16_64;
+  t1 = (s6 + s5) * cospi_16_64;
+  t2 = fdct_round_shift(t0);
+  t3 = fdct_round_shift(t1);
+
+  // Stage 3
+  x0 = s4 + t2;
+  x1 = s4 - t2;
+  x2 = s7 - t3;
+  x3 = s7 + t3;
+
+  // Stage 4
+  t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
+  t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
+  t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
+  t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
+  output[6] = fdct_round_shift(t0);
+  output[4] = fdct_round_shift(t2);
+  output[2] = fdct_round_shift(t1);
+  output[0] = fdct_round_shift(t3);
+#else
  // {sin(pi/9), sin(pi*2/9), ..., sin(pi*4/9)} * sqrt(2/9) * 2
  static const int sinvalue_lookup[] = {
    86559612, 162678858, 219176632, 249238470
@ -84,9 +167,151 @@ void vp9_fdst8(const tran_low_t *input, tran_low_t *output) {
  sum = d07 * sinvalue_lookup[0] - d16 * sinvalue_lookup[1] +
        d25 * sinvalue_lookup[2] - d34 * sinvalue_lookup[3];
  output[7] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
+#endif
 }

 void vp9_fdst16(const tran_low_t *input, tran_low_t *output) {
+#if USE_DST2
+  // vp9_fgentx16(input, output, Tx16);
+  tran_high_t step1[8];      // canbe16
+  tran_high_t step2[8];      // canbe16
+  tran_high_t step3[8];      // canbe16
+  tran_high_t in[8];      // canbe16
+  tran_high_t temp1, temp2;  // needs32
+
+  // step 1
+  in[0] = input[0] - input[15];
+  in[1] = -input[1] + input[14];
+  in[2] = input[2] - input[13];
+  in[3] = -input[3] + input[12];
+  in[4] = input[4] - input[11];
+  in[5] = -input[5] + input[10];
+  in[6] = input[6] - input[ 9];
+  in[7] = -input[7] + input[ 8];
+
+  step1[0] = -input[7] - input[ 8];
+  step1[1] = input[6] + input[ 9];
+  step1[2] = -input[5] - input[10];
+  step1[3] = input[4] + input[11];
+  step1[4] = -input[3] - input[12];
+  step1[5] = input[2] + input[13];
+  step1[6] = -input[1] - input[14];
+  step1[7] = input[0] + input[15];
+
+  // fdct8(step, step);
+  {
+    tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
+    tran_high_t t0, t1, t2, t3;                  // needs32
+    tran_high_t x0, x1, x2, x3;                  // canbe16
+
+    // stage 1
+    s0 = in[0] + in[7];
+    s1 = in[1] + in[6];
+    s2 = in[2] + in[5];
+    s3 = in[3] + in[4];
+    s4 = in[3] - in[4];
+    s5 = in[2] - in[5];
+    s6 = in[1] - in[6];
+    s7 = in[0] - in[7];
+
+    // fdct4(step, step);
+    x0 = s0 + s3;
+    x1 = s1 + s2;
+    x2 = s1 - s2;
+    x3 = s0 - s3;
+    t0 = (x0 + x1) * cospi_16_64;
+    t1 = (x0 - x1) * cospi_16_64;
+    t2 = x3 * cospi_8_64  + x2 * cospi_24_64;
+    t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
+    output[15] = fdct_round_shift(t0);
+    output[11] = fdct_round_shift(t2);
+    output[7] = fdct_round_shift(t1);
+    output[3] = fdct_round_shift(t3);
+
+    // Stage 2
+    t0 = (s6 - s5) * cospi_16_64;
+    t1 = (s6 + s5) * cospi_16_64;
+    t2 = fdct_round_shift(t0);
+    t3 = fdct_round_shift(t1);
+
+    // Stage 3
+    x0 = s4 + t2;
+    x1 = s4 - t2;
+    x2 = s7 - t3;
+    x3 = s7 + t3;
+
+    // Stage 4
+    t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
+    t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
+    t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
+    t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
+    output[13] = fdct_round_shift(t0);
+    output[9] = fdct_round_shift(t2);
+    output[5] = fdct_round_shift(t1);
+    output[1] = fdct_round_shift(t3);
+  }
+
+  // step 2
+  temp1 = (step1[5] - step1[2]) * cospi_16_64;
+  temp2 = (step1[4] - step1[3]) * cospi_16_64;
+  step2[2] = fdct_round_shift(temp1);
+  step2[3] = fdct_round_shift(temp2);
+  temp1 = (step1[4] + step1[3]) * cospi_16_64;
+  temp2 = (step1[5] + step1[2]) * cospi_16_64;
+  step2[4] = fdct_round_shift(temp1);
+  step2[5] = fdct_round_shift(temp2);
+
+  // step 3
+  step3[0] = step1[0] + step2[3];
+  step3[1] = step1[1] + step2[2];
+  step3[2] = step1[1] - step2[2];
+  step3[3] = step1[0] - step2[3];
+  step3[4] = step1[7] - step2[4];
+  step3[5] = step1[6] - step2[5];
+  step3[6] = step1[6] + step2[5];
+  step3[7] = step1[7] + step2[4];
+
+  // step 4
+  temp1 = step3[1] *  -cospi_8_64 + step3[6] * cospi_24_64;
+  temp2 = step3[2] * cospi_24_64 + step3[5] *  cospi_8_64;
+  step2[1] = fdct_round_shift(temp1);
+  step2[2] = fdct_round_shift(temp2);
+  temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
+  temp2 = step3[1] * cospi_24_64 + step3[6] *  cospi_8_64;
+  step2[5] = fdct_round_shift(temp1);
+  step2[6] = fdct_round_shift(temp2);
+
+  // step 5
+  step1[0] = step3[0] + step2[1];
+  step1[1] = step3[0] - step2[1];
+  step1[2] = step3[3] + step2[2];
+  step1[3] = step3[3] - step2[2];
+  step1[4] = step3[4] - step2[5];
+  step1[5] = step3[4] + step2[5];
+  step1[6] = step3[7] - step2[6];
+  step1[7] = step3[7] + step2[6];
+
+  // step 6
+  temp1 = step1[0] * cospi_30_64 + step1[7] *  cospi_2_64;
+  temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
+  output[14] = fdct_round_shift(temp1);
+  output[6] = fdct_round_shift(temp2);
+
+  temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
+  temp2 = step1[3] *  cospi_6_64 + step1[4] * cospi_26_64;
+  output[10] = fdct_round_shift(temp1);
+  output[2] = fdct_round_shift(temp2);
+
+  temp1 = step1[3] * -cospi_26_64 + step1[4] *  cospi_6_64;
+  temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
+  output[12] = fdct_round_shift(temp1);
+  output[4] = fdct_round_shift(temp2);
+
+  temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
+  temp2 = step1[0] *  -cospi_2_64 + step1[7] * cospi_30_64;
+  output[8] = fdct_round_shift(temp1);
+  output[0] = fdct_round_shift(temp2);
+#else
  // {sin(pi/17), sin(pi*2/17, ..., sin(pi*8/17)} * sqrt(2/17) * 2 * sqrt(2)
  static const int sinvalue_lookup[] = {
    47852167, 94074787, 137093803, 175444254,
@ -189,6 +414,7 @@ void vp9_fdst16(const tran_low_t *input, tran_low_t *output) {
        d411 * sinvalue_lookup[4] - d510 * sinvalue_lookup[5] +
        d69  * sinvalue_lookup[6] - d78  * sinvalue_lookup[7];
  output[15] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
+#endif
 }
 #endif  // CONFIG_EXT_TX