Adds wavelet transforms + hybrid dct/dwt variants

The wavelets implemented are 2/6, 5/3 and 9/7, each with a lifting-based scheme for even block sizes. The 9/7 wavelet is currently implemented in double-precision floating point. This is a starting point for experiments with: 1. Replacing large transforms (32x32 and 64x64) with wavelets or wavelet-DCT hybrids that can hopefully localize errors better spatially. (This will also need an alternate entropy coder.) 2. Super-resolution modes where the higher sub-bands may be selectively skipped from being conveyed, while a smart reconstruction recovers the lost frequencies. The current patch includes two types of 32x32 and 64x64 transforms: one where only wavelets are used, and another where a single-level wavelet decomposition is followed by a lower-resolution DCT on the low-low band. Change-Id: I2d6755c4e6c8ec9386a04633dacbe0de3b0043ec
2015-06-02 12:25:28 -07:00
parent 5a69abc66b
commit b433dd4443
11 changed files with 1085 additions and 1 deletions
--- a/1
+++ b/1
@@ -301,6 +301,7 @@ EXPERIMENT_LIST="
    bitstream_fixes
    newmvref
    misc_entropy
    wavelets
 "
 CONFIG_LIST="
    external_build
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -709,6 +709,33 @@ void vp9_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
  }
 }
 #if CONFIG_WAVELETS
 // 2-D 16x16 inverse DCT that stores its result into a 16-bit buffer.
 //   input  - 16x16 coefficients, row-major with a row length of 16.
 //   dest   - receives the 16x16 result; rows are `stride` entries apart.
 // Every output value is the column-pass result rounded with
 // ROUND_POWER_OF_TWO(..., 3).
 void vp9_idct16x16_noscale_c(const tran_low_t *input, int16_t *dest,
                              int stride) {
  tran_low_t row_pass[16 * 16];  // Output of the horizontal pass.
  tran_low_t col_in[16], col_out[16];
  int r, c;
  // Horizontal pass: one 1-D inverse transform per coefficient row.
  for (r = 0; r < 16; ++r)
    idct16(input + r * 16, row_pass + r * 16);
  // Vertical pass: gather each column, transform it, then round and store.
  for (c = 0; c < 16; ++c) {
    for (r = 0; r < 16; ++r)
      col_in[r] = row_pass[r * 16 + c];
    idct16(col_in, col_out);
    for (r = 0; r < 16; ++r)
      dest[r * stride + c] = ROUND_POWER_OF_TWO(col_out[r], 3);
  }
 }
 #endif  // CONFIG_WAVELETS
 static void iadst16(const tran_low_t *input, tran_low_t *output) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
  tran_high_t s9, s10, s11, s12, s13, s14, s15;
@@ -1361,6 +1388,46 @@ void vp9_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
  }
 }
 #if CONFIG_WAVELETS
 // 2-D 32x32 inverse DCT that stores its result into a 16-bit buffer.
 //   input  - 32x32 coefficients, row-major with a row length of 32.
 //   dest   - receives the 32x32 result; rows are `stride` entries apart.
 // Every output value is the column-pass result rounded with
 // ROUND_POWER_OF_TWO(..., 4).
 void vp9_idct32x32_noscale_c(const tran_low_t *input, int16_t *dest,
                              int stride) {
  tran_low_t out[32 * 32];
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[32], temp_out[32];
  // Rows
  for (i = 0; i < 32; ++i) {
    // OR the whole row together to detect an all-zero row. The accumulator
    // has the coefficients' own type: a narrower accumulator would drop the
    // upper bits if tran_low_t is wider than 16 bits (presumably the case in
    // high-bitdepth builds - confirm against the typedef) and could wrongly
    // treat a nonzero row as empty.
    tran_low_t nonzero_bits = 0;
    for (j = 0; j < 32; ++j)
      nonzero_bits |= input[j];
    if (nonzero_bits)
      idct32(input, outptr);
    else
      vpx_memset(outptr, 0, sizeof(tran_low_t) * 32);
    input += 32;
    outptr += 32;
  }
  // Columns
  for (i = 0; i < 32; ++i) {
    for (j = 0; j < 32; ++j)
      temp_in[j] = out[j * 32 + i];
    idct32(temp_in, temp_out);
    for (j = 0; j < 32; ++j) {
      dest[j * stride + i] = ROUND_POWER_OF_TWO(temp_out[j], 4);
    }
  }
 }
 #endif  // CONFIG_WAVELETS
 void vp9_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  tran_low_t out[32 * 32] = {0};
--- a/vp9/common/vp9_idct.h
+++ b/vp9/common/vp9_idct.h
@@ -156,7 +156,7 @@ void vp9_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
 #if CONFIG_TX64X64
 void vp9_idct64x64_add(const tran_low_t *input, uint8_t *dest, int stride,
                       int eob);
-#endif
+#endif  // CONFIG_TX64X64
 void vp9_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
                    int stride, int eob);
 void vp9_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
--- a/vp9/common/vp9_idwt.c
+++ b/vp9/common/vp9_idwt.c
@@ -0,0 +1,352 @@
 /*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include <math.h>
 #include "./vp9_rtcd.h"
 #include "vp9/common/vp9_systemdependent.h"
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_idwt.h"
 // Note: block length must be even for this implementation
 // Inverse 5/3 lifting for one row.
 //   length   - number of output samples; must be even and at least 2.
 //   lowpass  - length/2 low-band coefficients (modified in place).
 //   highpass - length/2 high-band coefficients.
 //   x        - receives `length` interleaved samples (even, odd, even, ...).
 static void synthesis_53_row(int length,
                              tran_low_t *lowpass, tran_low_t *highpass,
                              tran_low_t *x) {
  const int half = length >> 1;
  const int last = half - 1;
  tran_low_t prev, cur;
  int k;
  // Undo the update step. The first coefficient has no left neighbour, so
  // highpass[0] stands in for it.
  prev = highpass[0];
  for (k = 0; k < half; ++k) {
    lowpass[k] -= (prev + highpass[k] + 1) >> 1;
    prev = highpass[k];
  }
  // Undo the predict step and interleave. Even samples are the halved low
  // band; odd samples add a rounded quarter of the two neighbouring lows.
  for (k = 0; k < last; ++k) {
    cur = lowpass[k];
    x[2 * k] = (cur + 1) >> 1;
    x[2 * k + 1] = highpass[k] + ((cur + lowpass[k + 1] + 2) >> 2);
  }
  // Final pair: there is no right neighbour, so only the last low is used.
  cur = lowpass[last];
  x[2 * last] = (cur + 1) >> 1;
  x[2 * last + 1] = highpass[last] + ((cur + 1) >> 1);
 }
 // Inverse 5/3 lifting for one column.
 //   length   - number of output samples; must be even and at least 2.
 //   lowpass  - length/2 low-band coefficients (modified in place).
 //   highpass - length/2 high-band coefficients.
 //   x        - receives `length` interleaved samples (even, odd, even, ...).
 // Unlike the row variant, even samples are emitted unhalved and the high
 // band is doubled.
 static void synthesis_53_col(int length,
                              tran_low_t *lowpass, tran_low_t *highpass,
                              tran_low_t *x) {
  tran_low_t r, *a, *b;
  int n;
  // Undo the update step; highpass[0] stands in for the missing left
  // neighbour of the first coefficient.
  n = length >> 1;
  b = highpass;
  a = lowpass;
  r = *highpass;
  while (n--) {
    *a++ -= (r + (*b) + 1) >> 1;
    r = *b++;
  }
  // Undo the predict step and interleave. High-band values are doubled by
  // multiplication rather than `<< 1`: they can be negative, and left-shifting
  // a negative signed value is undefined behaviour in C.
  n = length >> 1;
  b = highpass;
  a = lowpass;
  while (--n) {
    r = *a++;
    *x++ = r;
    *x++ = (*b++) * 2 + ((r + (*a) + 1) >> 1);
  }
  // Final pair: no right neighbour, so the last low is used on its own.
  *x++ = *a;
  *x++ = (*b) * 2 + *a;
 }
 // Multi-level 2-D inverse 5/3 wavelet transform.
 //   levels         - number of decomposition levels to undo; must be below
 //                    16 because the per-level size tables hold 16 entries.
 //   width, height  - block dimensions.
 //   c, pitch_c     - coefficient buffer and its row pitch; overwritten in
 //                    place with intermediate results.
 //   x, pitch_x     - output buffer and its row pitch.
 //   dwt_scale_bits - number of bits removed from the result, rounding half
 //                    away from zero.
 static void dyadic_synthesize_53(int levels, int width, int height,
                                  tran_low_t *c, int pitch_c,
                                  int16_t *x, int pitch_x,
                                  int dwt_scale_bits) {
  int th[16], tw[16], lv, i, j, nh, nw, hh = height, hw = width;
  tran_low_t buffer[2 * DWT_MAX_LENGTH];
  // Rounding offset for the final scaling. With no scaling requested the
  // offset is 0; computing 1 << (0 - 1) would be a negative shift count,
  // which is undefined behaviour.
  const int dwt_scale_rnd =
      dwt_scale_bits > 0 ? 1 << (dwt_scale_bits - 1) : 0;
  // th[lv] / tw[lv] hold the band height / width at level lv.
  th[0] = hh;
  tw[0] = hw;
  for (i = 1; i <= levels; i++) {
    th[i] = (th[i - 1] + 1) >> 1;
    tw[i] = (tw[i - 1] + 1) >> 1;
  }
  // Reconstruct from the coarsest level back to full resolution.
  for (lv = levels - 1; lv >= 0; lv--) {
    nh = th[lv];
    nw = tw[lv];
    hh = th[lv + 1];
    hw = tw[lv + 1];
    if ((nh < 2) || (nw < 2)) continue;
    // Columns: gather into buffer[0..nh), synthesize into buffer[nh..2nh),
    // then scatter the result back into the column.
    for (j = 0; j < nw; j++) {
      for (i = 0; i < nh; i++)
        buffer[i] = c[i * pitch_c + j];
      synthesis_53_col(nh, buffer, buffer + hh, buffer + nh);
      for (i = 0; i < nh; i++)
        c[i * pitch_c + j] = buffer[i + nh];
    }
    // Rows: copy the row out, then synthesize straight back into c.
    for (i = 0; i < nh; i++) {
      memcpy(buffer, &c[i * pitch_c], nw * sizeof(*buffer));
      synthesis_53_row(nw, buffer, buffer + hw, &c[i * pitch_c]);
    }
  }
  // Scale down with rounding that is symmetric around zero.
  for (i = 0; i < height; i++) {
    for (j = 0; j < width; j++) {
      x[i * pitch_x + j] = c[i * pitch_c + j] >= 0 ?
          ((c[i * pitch_c + j] + dwt_scale_rnd) >> dwt_scale_bits) :
          -((-c[i * pitch_c + j] + dwt_scale_rnd) >> dwt_scale_bits);
    }
  }
 }
 // Note: block length must be even for this implementation
 // Inverse 2/6 lifting for one row.
 //   length   - number of output samples; must be even.
 //   lowpass  - length/2 low-band coefficients.
 //   highpass - length/2 high-band coefficients (modified in place when the
 //              band holds at least four coefficients).
 //   x        - receives `length` interleaved samples.
 static void synthesis_26_row(int length,
                              tran_low_t *lowpass, tran_low_t *highpass,
                              tran_low_t *x) {
  const int half = length >> 1;
  tran_low_t prev, lo, hi;
  int k;
  // The high-band correction is only applied to bands of four or more
  // coefficients.
  if (half >= 4) {
    // The first coefficient has no left neighbour; lowpass[0] stands in.
    prev = lowpass[0];
    for (k = 0; k < half - 1; ++k) {
      highpass[k] += (prev - lowpass[k + 1] + 4) >> 3;
      prev = lowpass[k];
    }
    // The last coefficient has no right neighbour; it uses itself.
    highpass[half - 1] += (prev - lowpass[half - 1] + 4) >> 3;
  }
  // Rounded half-sum / half-difference butterfly, interleaved into x.
  for (k = 0; k < half; ++k) {
    hi = highpass[k];
    lo = lowpass[k];
    x[2 * k] = (lo + hi + 1) >> 1;
    x[2 * k + 1] = (lo - hi + 1) >> 1;
  }
 }
 // Inverse 2/6 lifting for one column.
 //   length   - number of output samples; must be even.
 //   lowpass  - length/2 low-band coefficients.
 //   highpass - length/2 high-band coefficients (modified in place when the
 //              band holds at least four coefficients).
 //   x        - receives `length` interleaved samples.
 // Unlike the row variant, the final butterfly is not halved.
 static void synthesis_26_col(int length,
                              tran_low_t *lowpass, tran_low_t *highpass,
                              tran_low_t *x) {
  const int half = length >> 1;
  tran_low_t prev, lo, hi;
  int k;
  // The high-band correction is only applied to bands of four or more
  // coefficients.
  if (half >= 4) {
    // The first coefficient has no left neighbour; lowpass[0] stands in.
    prev = lowpass[0];
    for (k = 0; k < half - 1; ++k) {
      highpass[k] += (prev - lowpass[k + 1] + 4) >> 3;
      prev = lowpass[k];
    }
    // The last coefficient has no right neighbour; it uses itself.
    highpass[half - 1] += (prev - lowpass[half - 1] + 4) >> 3;
  }
  // Plain sum / difference butterfly, interleaved into x.
  for (k = 0; k < half; ++k) {
    hi = highpass[k];
    lo = lowpass[k];
    x[2 * k] = lo + hi;
    x[2 * k + 1] = lo - hi;
  }
 }
 // Multi-level 2-D inverse 2/6 wavelet transform.
 //   levels         - number of decomposition levels to undo; must be below
 //                    16 because the per-level size tables hold 16 entries.
 //   width, height  - block dimensions.
 //   c, pitch_c     - coefficient buffer and its row pitch; overwritten in
 //                    place with intermediate results.
 //   x, pitch_x     - output buffer and its row pitch.
 //   dwt_scale_bits - number of bits removed from the result, rounding half
 //                    away from zero.
 static void dyadic_synthesize_26(int levels, int width, int height,
                                  tran_low_t *c, int pitch_c,
                                  int16_t *x, int pitch_x,
                                  int dwt_scale_bits) {
  int th[16], tw[16], lv, i, j, nh, nw, hh = height, hw = width;
  tran_low_t buffer[2 * DWT_MAX_LENGTH];
  // Rounding offset for the final scaling. With no scaling requested the
  // offset is 0; computing 1 << (0 - 1) would be a negative shift count,
  // which is undefined behaviour.
  const int dwt_scale_rnd =
      dwt_scale_bits > 0 ? 1 << (dwt_scale_bits - 1) : 0;
  // th[lv] / tw[lv] hold the band height / width at level lv.
  th[0] = hh;
  tw[0] = hw;
  for (i = 1; i <= levels; i++) {
    th[i] = (th[i - 1] + 1) >> 1;
    tw[i] = (tw[i - 1] + 1) >> 1;
  }
  // Reconstruct from the coarsest level back to full resolution.
  for (lv = levels - 1; lv >= 0; lv--) {
    nh = th[lv];
    nw = tw[lv];
    hh = th[lv + 1];
    hw = tw[lv + 1];
    if ((nh < 2) || (nw < 2)) continue;
    // Columns: gather into buffer[0..nh), synthesize into buffer[nh..2nh),
    // then scatter the result back into the column.
    for (j = 0; j < nw; j++) {
      for (i = 0; i < nh; i++)
        buffer[i] = c[i * pitch_c + j];
      synthesis_26_col(nh, buffer, buffer + hh, buffer + nh);
      for (i = 0; i < nh; i++)
        c[i * pitch_c + j] = buffer[i + nh];
    }
    // Rows: copy the row out, then synthesize straight back into c.
    for (i = 0; i < nh; i++) {
      memcpy(buffer, &c[i * pitch_c], nw * sizeof(*buffer));
      synthesis_26_row(nw, buffer, buffer + hw, &c[i * pitch_c]);
    }
  }
  // Scale down with rounding that is symmetric around zero.
  for (i = 0; i < height; i++) {
    for (j = 0; j < width; j++) {
      x[i * pitch_x + j] = c[i * pitch_c + j] >= 0 ?
          ((c[i * pitch_c + j] + dwt_scale_rnd) >> dwt_scale_bits) :
          -((-c[i * pitch_c + j] + dwt_scale_rnd) >> dwt_scale_bits);
    }
  }
 }
 // Inverse 9/7 lifting (floating point) for one line of samples.
 //   length   - number of output samples; must be even. Lengths below 2 are
 //              ignored because the boundary steps need two samples.
 //   lowpass  - length/2 low-band coefficients (read only).
 //   highpass - length/2 high-band coefficients (read only).
 //   x        - receives `length` reconstructed samples. It must not overlap
 //              lowpass or highpass, since the samples are unpacked directly
 //              into it without an intermediate copy.
 static void synthesis_97(int length, double *lowpass, double *highpass,
                          double *x) {
  const double a_predict1 = -1.586134342;
  const double a_update1 = -0.05298011854;
  const double a_predict2 = 0.8829110762;
  const double a_update2 = 0.4435068522;
  const double s_low = 1.149604398;
  const double s_high = 1/1.149604398;
  const double inv_s_low = 1 / s_low;
  const double inv_s_high = 1 / s_high;
  const int half = length / 2;
  int i;
  if (length < 2) return;
  // Undo pack and scale: interleave the two bands straight into x.
  for (i = 0; i < half; i++) {
    x[i * 2] = lowpass[i] * inv_s_low;
    x[i * 2 + 1] = highpass[i] * inv_s_high;
  }
  // Undo update 2
  for (i = 2; i < length; i += 2) {
    x[i] -= a_update2 * (x[i - 1] + x[i + 1]);
  }
  // x[0] has no left neighbour, so its right neighbour is counted twice.
  x[0] -= 2 * a_update2 * x[1];
  // Undo predict 2
  for (i = 1; i < length - 2; i += 2) {
    x[i] -= a_predict2 * (x[i - 1] + x[i + 1]);
  }
  // The last sample has no right neighbour; its left one is counted twice.
  x[length - 1] -= 2 * a_predict2 * x[length - 2];
  // Undo update 1
  for (i = 2; i < length; i += 2) {
    x[i] -= a_update1 * (x[i - 1] + x[i + 1]);
  }
  x[0] -= 2 * a_update1 * x[1];
  // Undo predict 1
  for (i = 1; i < length - 2; i += 2) {
    x[i] -= a_predict1 * (x[i - 1] + x[i + 1]);
  }
  x[length - 1] -= 2 * a_predict1 * x[length - 2];
 }
 // Multi-level 2-D inverse 9/7 wavelet transform, computed in doubles.
 //   levels         - number of decomposition levels to undo; must be below
 //                    16 because the per-level size tables hold 16 entries.
 //   width, height  - block dimensions; each at most DWT_MAX_LENGTH.
 //   c, pitch_c     - coefficient buffer and its row pitch (read only here).
 //   x, pitch_x     - output buffer and its row pitch.
 //   dwt_scale_bits - the result is divided by 2^dwt_scale_bits and rounded.
 static void dyadic_synthesize_97(int levels, int width, int height,
                                  tran_low_t *c, int pitch_c,
                                  int16_t *x, int pitch_x,
                                  int dwt_scale_bits) {
  int rows_at[16], cols_at[16];
  int level, r, k;
  double line[2 * DWT_MAX_LENGTH];
  double work[DWT_MAX_LENGTH * DWT_MAX_LENGTH];
  const double divisor = 1 << dwt_scale_bits;
  // Load the coefficients into a floating-point working plane whose rows
  // are DWT_MAX_LENGTH entries apart.
  for (r = 0; r < height; r++) {
    for (k = 0; k < width; k++)
      work[r * DWT_MAX_LENGTH + k] = c[r * pitch_c + k];
  }
  // Band dimensions at every level.
  rows_at[0] = height;
  cols_at[0] = width;
  for (level = 1; level <= levels; level++) {
    rows_at[level] = (rows_at[level - 1] + 1) >> 1;
    cols_at[level] = (cols_at[level - 1] + 1) >> 1;
  }
  // Reconstruct from the coarsest level back to full resolution.
  for (level = levels - 1; level >= 0; level--) {
    const int n_rows = rows_at[level];
    const int n_cols = cols_at[level];
    const int low_rows = rows_at[level + 1];
    const int low_cols = cols_at[level + 1];
    if (n_rows < 2 || n_cols < 2) continue;
    // Columns: gather, synthesize into the second half of `line`, scatter.
    for (k = 0; k < n_cols; k++) {
      for (r = 0; r < n_rows; r++)
        line[r] = work[r * DWT_MAX_LENGTH + k];
      synthesis_97(n_rows, line, line + low_rows, line + n_rows);
      for (r = 0; r < n_rows; r++)
        work[r * DWT_MAX_LENGTH + k] = line[r + n_rows];
    }
    // Rows: copy the row out, then synthesize straight back into the plane.
    for (r = 0; r < n_rows; r++) {
      double *row = &work[r * DWT_MAX_LENGTH];
      memcpy(line, row, n_cols * sizeof(*line));
      synthesis_97(n_cols, line, line + low_cols, row);
    }
  }
  // Scale down and round to the nearest integer.
  for (r = 0; r < height; r++) {
    for (k = 0; k < width; k++)
      x[r * pitch_x + k] = round(work[r * DWT_MAX_LENGTH + k] / divisor);
  }
 }
 // Inverse 32x32 wavelet-only transform.
 //   input  - 32x32 coefficients with a row pitch of 32; overwritten by the
 //            integer synthesis variants.
 //   output - receives the reconstructed block; rows are `stride` apart.
 // The wavelet family is selected at compile time through DWT_TYPE. If
 // DWT_TYPE is none of 26/53/97 the function does nothing.
 void vp9_idwt32x32_c(tran_low_t *input, tran_low_t *output, int stride) {
  // Four decomposition levels; the result is scaled down by two bits.
  enum { kLevels = 4, kSize = 32, kScaleBits = 2 };
 #if DWT_TYPE == 53
  dyadic_synthesize_53(kLevels, kSize, kSize, input, kSize, output, stride,
                       kScaleBits);
 #elif DWT_TYPE == 26
  dyadic_synthesize_26(kLevels, kSize, kSize, input, kSize, output, stride,
                       kScaleBits);
 #elif DWT_TYPE == 97
  dyadic_synthesize_97(kLevels, kSize, kSize, input, kSize, output, stride,
                       kScaleBits);
 #endif
 }
 // Inverse 32x32 hybrid transform: a 16x16 inverse DCT on the low-low band
 // followed by a single-level inverse wavelet over the whole block.
 //   input  - 32x32 coefficients with a row pitch of 32 (not modified).
 //   output - receives the reconstructed block; rows are `stride` apart.
 // The wavelet family is selected at compile time through DWT_TYPE. If
 // DWT_TYPE is none of 26/53/97 nothing is written to output.
 // NOTE(review): the dyadic_synthesize_* helpers take an int16_t output
 // pointer while `output` is tran_low_t*; this only matches if tran_low_t is
 // a 16-bit type in the active configuration - confirm.
 void vp9_idwtdct32x32_c(tran_low_t *input, tran_low_t *output, int stride) {
  const int dwt_levels = 1;
  tran_low_t buffer[16 * 16];   // Low-low band coefficients, packed.
  int16_t lowband[16 * 16];     // Inverse DCT of the low-low band.
  tran_low_t buffer2[32 * 32];  // Working copy handed to the wavelet stage.
  int i, j;
  // Both buffers use a row pitch of 32, so the block is one contiguous copy.
  memcpy(buffer2, input, sizeof(buffer2));
  // Pack the top-left 16x16 quadrant for the DCT stage.
  for (i = 0; i < 16; ++i) {
    memcpy(&buffer[i * 16], &input[i * 32], sizeof(buffer[0]) * 16);
  }
  // vp9_idct16x16_noscale writes int16_t samples, so give it a buffer of that
  // exact type and widen into the working copy afterwards. Writing through
  // an int16_t pointer into the tran_low_t buffer would corrupt the layout
  // whenever tran_low_t is not 16 bits wide.
  vp9_idct16x16_noscale(buffer, lowband, 16);
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j)
      buffer2[i * 32 + j] = lowband[i * 16 + j];
  }
 #if DWT_TYPE == 26
  dyadic_synthesize_26(dwt_levels, 32, 32, buffer2, 32, output, stride, 2);
 #elif DWT_TYPE == 97
  dyadic_synthesize_97(dwt_levels, 32, 32, buffer2, 32, output, stride, 2);
 #elif DWT_TYPE == 53
  dyadic_synthesize_53(dwt_levels, 32, 32, buffer2, 32, output, stride, 2);
 #endif
 }
 #if CONFIG_TX64X64
 // Inverse 64x64 wavelet-only transform.
 //   input  - 64x64 coefficients with a row pitch of 64; overwritten by the
 //            integer synthesis variants.
 //   output - receives the reconstructed block; rows are `stride` apart.
 // The wavelet family is selected at compile time through DWT_TYPE. If
 // DWT_TYPE is none of 26/53/97 the function does nothing.
 void vp9_idwt64x64_c(tran_low_t *input, tran_low_t *output, int stride) {
  // Four decomposition levels; the result is scaled down by one bit.
  enum { kLevels = 4, kSize = 64, kScaleBits = 1 };
 #if DWT_TYPE == 53
  dyadic_synthesize_53(kLevels, kSize, kSize, input, kSize, output, stride,
                       kScaleBits);
 #elif DWT_TYPE == 26
  dyadic_synthesize_26(kLevels, kSize, kSize, input, kSize, output, stride,
                       kScaleBits);
 #elif DWT_TYPE == 97
  dyadic_synthesize_97(kLevels, kSize, kSize, input, kSize, output, stride,
                       kScaleBits);
 #endif
 }
 // Inverse 64x64 hybrid transform: a 32x32 inverse DCT on the low-low band
 // followed by a single-level inverse wavelet over the whole block.
 //   input  - 64x64 coefficients with a row pitch of 64 (not modified).
 //   output - receives the reconstructed block; rows are `stride` apart.
 // The wavelet family is selected at compile time through DWT_TYPE. If
 // DWT_TYPE is none of 26/53/97 nothing is written to output.
 // NOTE(review): the dyadic_synthesize_* helpers take an int16_t output
 // pointer while `output` is tran_low_t*; this only matches if tran_low_t is
 // a 16-bit type in the active configuration - confirm.
 void vp9_idwtdct64x64_c(tran_low_t *input, tran_low_t *output, int stride) {
  const int dwt_levels = 1;
  tran_low_t buffer[32 * 32];   // Low-low band coefficients, packed.
  int16_t lowband[32 * 32];     // Inverse DCT of the low-low band.
  tran_low_t buffer2[64 * 64];  // Working copy handed to the wavelet stage.
  int i, j;
  // Both buffers use a row pitch of 64, so the block is one contiguous copy.
  memcpy(buffer2, input, sizeof(buffer2));
  // Pack the top-left 32x32 quadrant for the DCT stage.
  for (i = 0; i < 32; ++i) {
    memcpy(&buffer[i * 32], &input[i * 64], sizeof(buffer[0]) * 32);
  }
  // vp9_idct32x32_noscale writes int16_t samples, so give it a buffer of that
  // exact type and widen into the working copy afterwards. Writing through
  // an int16_t pointer into the tran_low_t buffer would corrupt the layout
  // whenever tran_low_t is not 16 bits wide.
  vp9_idct32x32_noscale(buffer, lowband, 32);
  for (i = 0; i < 32; ++i) {
    for (j = 0; j < 32; ++j)
      buffer2[i * 64 + j] = lowband[i * 32 + j];
  }
 #if DWT_TYPE == 26
  dyadic_synthesize_26(dwt_levels, 64, 64, buffer2, 64, output, stride, 1);
 #elif DWT_TYPE == 97
  dyadic_synthesize_97(dwt_levels, 64, 64, buffer2, 64, output, stride, 1);
 #elif DWT_TYPE == 53
  dyadic_synthesize_53(dwt_levels, 64, 64, buffer2, 64, output, stride, 1);
 #endif
 }
 #endif  // CONFIG_TX64X64
--- a/vp9/common/vp9_idwt.h
+++ b/vp9/common/vp9_idwt.h
@@ -0,0 +1,39 @@
 /*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #ifndef VP9_COMMON_VP9_IDWT_H_
 #define VP9_COMMON_VP9_IDWT_H_
 #include <assert.h>
 #include "./vpx_config.h"
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_enums.h"
 #include "vp9/common/vp9_idct.h"
 // Largest block side handled by the wavelet code; the inverse transform
 // sizes its line and plane scratch buffers from this value.
 #define DWT_MAX_LENGTH   64
 // Compile-time choice of wavelet family used by the inverse transforms:
 // 26 selects the 2/6 filter, 53 the 5/3 filter and 97 the 9/7 filter.
 #define DWT_TYPE         26    // 26/53/97
 #ifdef __cplusplus
 extern "C" {
 #endif
 // 64x64 inverse transforms, only declared when CONFIG_TX64X64 is enabled.
 // NOTE(review): these names presumably resolve to the vp9_idwt64x64_c /
 // vp9_idwtdct64x64_c definitions in vp9_idwt.c - confirm how they are bound.
 #if CONFIG_TX64X64
 // Wavelet-only inverse transform of a 64x64 coefficient block.
 void vp9_idwt64x64(tran_low_t *input, tran_low_t *output, int stride);
 // Hybrid inverse transform of a 64x64 block (wavelet plus DCT on the
 // low-low band).
 void vp9_idwtdct64x64(tran_low_t *input, tran_low_t *output, int stride);
 #endif  // CONFIG_TX64X64
 // Wavelet-only inverse transform of a 32x32 coefficient block.
 void vp9_idwt32x32(tran_low_t *input, tran_low_t *output, int stride);
 // Hybrid inverse transform of a 32x32 block (wavelet plus DCT on the
 // low-low band).
 void vp9_idwtdct32x32(tran_low_t *input, tran_low_t *output, int stride);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 #endif  // VP9_COMMON_VP9_IDWT_H_
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -396,6 +396,11 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
  add_proto qw/void vp9_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
  specialize qw/vp9_idct16x16_256_add/;
  if (vpx_config("CONFIG_WAVELETS") eq "yes") {
    add_proto qw/void vp9_idct16x16_noscale/, "const tran_low_t *input, int16_t *dest, int dest_stride";
    specialize qw/vp9_idct16x16_noscale/;
  }
  add_proto qw/void vp9_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
  specialize qw/vp9_idct16x16_10_add/;
@@ -411,6 +416,11 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
  if (vpx_config("CONFIG_TX64X64") eq "yes") {
    add_proto qw/void vp9_idct64x64_4096_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
    specialize qw/vp9_idct64x64_4096_add/;
    if (vpx_config("CONFIG_WAVELETS") eq "yes") {
      add_proto qw/void vp9_idct32x32_noscale/, "const tran_low_t *input, int16_t *dest, int dest_stride";
      specialize qw/vp9_idct32x32_noscale/;
    }
  }
  add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
@@ -454,6 +464,11 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
    add_proto qw/void vp9_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
    specialize qw/vp9_idct16x16_256_add/;
    if (vpx_config("CONFIG_WAVELETS") eq "yes") {
      add_proto qw/void vp9_idct16x16_noscale/, "const tran_low_t *input, int16_t *dest, int dest_stride";
      specialize qw/vp9_idct16x16_noscale/;
    }
    add_proto qw/void vp9_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
    specialize qw/vp9_idct16x16_10_add/;
@@ -469,6 +484,11 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
    if (vpx_config("CONFIG_TX64X64") eq "yes") {
      add_proto qw/void vp9_idct64x64_4096_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
      specialize qw/vp9_idct64x64_4096_add/;
      if (vpx_config("CONFIG_WAVELETS") eq "yes") {
        add_proto qw/void vp9_idct32x32_noscale/, "const tran_low_t *input, int16_t *dest, int dest_stride";
        specialize qw/vp9_idct32x32_noscale/;
      }
    }
    add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
@@ -516,6 +536,11 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
    specialize qw/vp9_idct16x16_256_add sse2 ssse3 neon_asm dspr2/;
    $vp9_idct16x16_256_add_neon_asm=vp9_idct16x16_256_add_neon;
    if (vpx_config("CONFIG_WAVELETS") eq "yes") {
      add_proto qw/void vp9_idct16x16_noscale/, "const tran_low_t *input, int16_t *dest, int dest_stride";
      specialize qw/vp9_idct16x16_noscale/;
    }
    add_proto qw/void vp9_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
    specialize qw/vp9_idct16x16_10_add sse2 ssse3 neon_asm dspr2/;
    $vp9_idct16x16_10_add_neon_asm=vp9_idct16x16_10_add_neon;
@@ -535,6 +560,11 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
    if (vpx_config("CONFIG_TX64X64") eq "yes") {
      add_proto qw/void vp9_idct64x64_4096_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
      specialize qw/vp9_idct64x64_4096_add/;
      if (vpx_config("CONFIG_WAVELETS") eq "yes") {
        add_proto qw/void vp9_idct32x32_noscale/, "const tran_low_t *input, int16_t *dest, int dest_stride";
        specialize qw/vp9_idct32x32_noscale/;
      }
    }
    add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
@@ -1498,12 +1528,22 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
  add_proto qw/void vp9_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
  specialize qw/vp9_fdct32x32_rd/;
  if (vpx_config("CONFIG_WAVELETS") eq "yes") {
    add_proto qw/void vp9_fdct16x16_noscale/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/vp9_fdct16x16_noscale/;
  }
  if (vpx_config("CONFIG_TX64X64") eq "yes") {
    add_proto qw/void vp9_fdct64x64_1/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/vp9_fdct64x64_1/;
    add_proto qw/void vp9_fdct64x64/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/vp9_fdct64x64/;
    if (vpx_config("CONFIG_WAVELETS") eq "yes") {
      add_proto qw/void vp9_fdct32x32_noscale/, "const int16_t *input, tran_low_t *output, int stride";
      specialize qw/vp9_fdct32x32_noscale/;
    }
  }
  specialize qw/vp9_fdct32x32_rd sse2/;
 } else {
@@ -1546,12 +1586,22 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
  add_proto qw/void vp9_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
  specialize qw/vp9_fdct32x32_rd sse2 avx2/;
  if (vpx_config("CONFIG_WAVELETS") eq "yes") {
    add_proto qw/void vp9_fdct16x16_noscale/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/vp9_fdct16x16_noscale/;
  }
  if (vpx_config("CONFIG_TX64X64") eq "yes") {
    add_proto qw/void vp9_fdct64x64_1/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/vp9_fdct64x64_1/;
    add_proto qw/void vp9_fdct64x64/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/vp9_fdct64x64/;
    if (vpx_config("CONFIG_WAVELETS") eq "yes") {
      add_proto qw/void vp9_fdct32x32_noscale/, "const int16_t *input, tran_low_t *output, int stride";
      specialize qw/vp9_fdct32x32_noscale/;
    }
  }
 }
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@@ -522,6 +522,193 @@ void vp9_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
  }
 }
 #if CONFIG_WAVELETS
 // Forward 16x16 DCT without the extra output scaling.
 //   input  - 16x16 block of samples; rows are `stride` entries apart.
 //   output - receives 16x16 coefficients with a row length of 16.
 // The difference between this one and the function above is scaling
 // of the input. This function does not scale so that the actual 2D
 // transform is unitary. The function above scales the transform to be
 // 8 times unitary.
 void vp9_fdct16x16_noscale_c(const int16_t *input, tran_low_t *output,
                             int stride) {
  // The 2D transform is done with two passes which are actually pretty
  // similar. In the first one, we transform the columns and transpose
  // the results. In the second one, we transform the rows. To achieve that,
  // as the first pass results are transposed, we transpose the columns (that
  // is the transposed rows) and transpose the results (so that it goes back
  // in normal/row positions).
  int pass;
  // We need an intermediate buffer between passes.
  tran_low_t intermediate[256];
  // Pass 0 reads from the caller's block; pass 1 reads from `intermediate`
  // through `in`, which is only set once pass 0 has finished.
  const int16_t *in_pass0 = input;
  const tran_low_t *in = NULL;
  tran_low_t *out = intermediate;
  // Do the two transform/transpose passes
  for (pass = 0; pass < 2; ++pass) {
    tran_high_t step1[8];      // canbe16
    tran_high_t step2[8];      // canbe16
    tran_high_t step3[8];      // canbe16
    // Note: this local array shadows the `input` parameter inside the loop;
    // the parameter is reached through in_pass0 instead.
    tran_high_t input[8];      // canbe16
    tran_high_t temp1, temp2;  // needs32
    int i;
    for (i = 0; i < 16; i++) {
      if (0 == pass) {
        // First pass: fold the column into sums and differences of mirrored
        // samples, halving each result.
        // Calculate input for the first 8 results.
        input[0] = (in_pass0[0 * stride] + in_pass0[15 * stride]) >> 1;
        input[1] = (in_pass0[1 * stride] + in_pass0[14 * stride]) >> 1;
        input[2] = (in_pass0[2 * stride] + in_pass0[13 * stride]) >> 1;
        input[3] = (in_pass0[3 * stride] + in_pass0[12 * stride]) >> 1;
        input[4] = (in_pass0[4 * stride] + in_pass0[11 * stride]) >> 1;
        input[5] = (in_pass0[5 * stride] + in_pass0[10 * stride]) >> 1;
        input[6] = (in_pass0[6 * stride] + in_pass0[ 9 * stride]) >> 1;
        input[7] = (in_pass0[7 * stride] + in_pass0[ 8 * stride]) >> 1;
        // Calculate input for the next 8 results.
        step1[0] = (in_pass0[7 * stride] - in_pass0[ 8 * stride]) >> 1;
        step1[1] = (in_pass0[6 * stride] - in_pass0[ 9 * stride]) >> 1;
        step1[2] = (in_pass0[5 * stride] - in_pass0[10 * stride]) >> 1;
        step1[3] = (in_pass0[4 * stride] - in_pass0[11 * stride]) >> 1;
        step1[4] = (in_pass0[3 * stride] - in_pass0[12 * stride]) >> 1;
        step1[5] = (in_pass0[2 * stride] - in_pass0[13 * stride]) >> 1;
        step1[6] = (in_pass0[1 * stride] - in_pass0[14 * stride]) >> 1;
        step1[7] = (in_pass0[0 * stride] - in_pass0[15 * stride]) >> 1;
      } else {
        // Second pass: same folding on the intermediate data, but every
        // operand is first reduced with (v + 1) >> 2.
        // Calculate input for the first 8 results.
        input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2);
        input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2);
        input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2);
        input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2);
        input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2);
        input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2);
        input[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2);
        input[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2);
        // Calculate input for the next 8 results.
        step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2);
        step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2);
        step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2);
        step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2);
        step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2);
        step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2);
        step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2);
        step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2);
      }
      // The folded sums produce the even-indexed outputs.
      // Work on the first eight values; fdct8(input, even_results);
      {
        tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
        tran_high_t t0, t1, t2, t3;                  // needs32
        tran_high_t x0, x1, x2, x3;                  // canbe16
        // stage 1
        s0 = input[0] + input[7];
        s1 = input[1] + input[6];
        s2 = input[2] + input[5];
        s3 = input[3] + input[4];
        s4 = input[3] - input[4];
        s5 = input[2] - input[5];
        s6 = input[1] - input[6];
        s7 = input[0] - input[7];
        // fdct4(step, step);
        x0 = s0 + s3;
        x1 = s1 + s2;
        x2 = s1 - s2;
        x3 = s0 - s3;
        t0 = (x0 + x1) * cospi_16_64;
        t1 = (x0 - x1) * cospi_16_64;
        t2 = x3 * cospi_8_64  + x2 * cospi_24_64;
        t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
        out[0] = fdct_round_shift(t0);
        out[4] = fdct_round_shift(t2);
        out[8] = fdct_round_shift(t1);
        out[12] = fdct_round_shift(t3);
        // Stage 2
        t0 = (s6 - s5) * cospi_16_64;
        t1 = (s6 + s5) * cospi_16_64;
        t2 = fdct_round_shift(t0);
        t3 = fdct_round_shift(t1);
        // Stage 3
        x0 = s4 + t2;
        x1 = s4 - t2;
        x2 = s7 - t3;
        x3 = s7 + t3;
        // Stage 4
        t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
        t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
        t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
        t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
        out[2] = fdct_round_shift(t0);
        out[6] = fdct_round_shift(t2);
        out[10] = fdct_round_shift(t1);
        out[14] = fdct_round_shift(t3);
      }
      // The folded differences produce the odd-indexed outputs.
      // Work on the next eight values; step1 -> odd_results
      {
        // step 2
        temp1 = (step1[5] - step1[2]) * cospi_16_64;
        temp2 = (step1[4] - step1[3]) * cospi_16_64;
        step2[2] = fdct_round_shift(temp1);
        step2[3] = fdct_round_shift(temp2);
        temp1 = (step1[4] + step1[3]) * cospi_16_64;
        temp2 = (step1[5] + step1[2]) * cospi_16_64;
        step2[4] = fdct_round_shift(temp1);
        step2[5] = fdct_round_shift(temp2);
        // step 3
        step3[0] = step1[0] + step2[3];
        step3[1] = step1[1] + step2[2];
        step3[2] = step1[1] - step2[2];
        step3[3] = step1[0] - step2[3];
        step3[4] = step1[7] - step2[4];
        step3[5] = step1[6] - step2[5];
        step3[6] = step1[6] + step2[5];
        step3[7] = step1[7] + step2[4];
        // step 4
        temp1 = step3[1] *  -cospi_8_64 + step3[6] * cospi_24_64;
        temp2 = step3[2] * cospi_24_64 + step3[5] *  cospi_8_64;
        step2[1] = fdct_round_shift(temp1);
        step2[2] = fdct_round_shift(temp2);
        temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
        temp2 = step3[1] * cospi_24_64 + step3[6] *  cospi_8_64;
        step2[5] = fdct_round_shift(temp1);
        step2[6] = fdct_round_shift(temp2);
        // step 5
        step1[0] = step3[0] + step2[1];
        step1[1] = step3[0] - step2[1];
        step1[2] = step3[3] + step2[2];
        step1[3] = step3[3] - step2[2];
        step1[4] = step3[4] - step2[5];
        step1[5] = step3[4] + step2[5];
        step1[6] = step3[7] - step2[6];
        step1[7] = step3[7] + step2[6];
        // step 6
        temp1 = step1[0] * cospi_30_64 + step1[7] *  cospi_2_64;
        temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
        out[1] = fdct_round_shift(temp1);
        out[9] = fdct_round_shift(temp2);
        temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
        temp2 = step1[3] *  cospi_6_64 + step1[4] * cospi_26_64;
        out[5] = fdct_round_shift(temp1);
        out[13] = fdct_round_shift(temp2);
        temp1 = step1[3] * -cospi_26_64 + step1[4] *  cospi_6_64;
        temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
        out[3] = fdct_round_shift(temp1);
        out[11] = fdct_round_shift(temp2);
        temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
        temp2 = step1[0] *  -cospi_2_64 + step1[7] * cospi_30_64;
        out[7] = fdct_round_shift(temp1);
        out[15] = fdct_round_shift(temp2);
      }
      // Both source pointers advance every iteration, but only the one that
      // belongs to the current pass is dereferenced. The 16 results of this
      // column were written contiguously, which is what transposes the data.
      // Do next column (which is a transposed row in second/horizontal pass)
      in++;
      in_pass0++;
      out += 16;
    }
    // Setup in/out for next pass.
    in = intermediate;
    out = output;
  }
 }
 #endif  // CONFIG_WAVELETS
 void vp9_fadst8(const tran_low_t *input, tran_low_t *output) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
@@ -1389,6 +1576,35 @@ void vp9_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
  }
 }
 #if CONFIG_WAVELETS
 // Forward 32x32 DCT that feeds the samples to the column transform as they
 // are, without scaling them up first.
 //   input  - 32x32 block of samples; rows are `stride` entries apart.
 //   out    - receives 32x32 coefficients with a row length of 32.
 // Both passes divide their result by four. The two passes bias the rounding
 // in opposite directions: the column pass adds one extra for positive
 // values, the row pass adds one extra for negative values.
 void vp9_fdct32x32_noscale_c(const int16_t *input, tran_low_t *out,
                              int stride) {
  tran_high_t col_pass[32 * 32];  // Output of the vertical pass.
  tran_high_t vec_in[32], vec_out[32];
  int r, c;
  // Vertical pass: transform every column of the source block.
  for (c = 0; c < 32; ++c) {
    for (r = 0; r < 32; ++r)
      vec_in[r] = input[r * stride + c];
    vp9_fdct32(vec_in, vec_out, 0);
    for (r = 0; r < 32; ++r) {
      const tran_high_t v = vec_out[r];
      col_pass[r * 32 + c] = (v + 1 + (v > 0)) >> 2;
    }
  }
  // Horizontal pass: transform every row of the intermediate result.
  for (r = 0; r < 32; ++r) {
    for (c = 0; c < 32; ++c)
      vec_in[c] = col_pass[r * 32 + c];
    vp9_fdct32(vec_in, vec_out, 0);
    for (c = 0; c < 32; ++c) {
      const tran_high_t v = vec_out[c];
      out[r * 32 + c] = (tran_low_t)((v + 1 + (v < 0)) >> 2);
    }
  }
 }
 #endif  // CONFIG_WAVELETS
 // Note that although we use dct_32_round in dct32 computation flow,
 // this 2d fdct32x32 for rate-distortion optimization loop is operating
 // within 16 bits precision.
--- a/vp9/encoder/vp9_dwt.c
+++ b/vp9/encoder/vp9_dwt.c
@@ -0,0 +1,323 @@
 /*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include <assert.h>
 #include <math.h>
 #include "./vpx_config.h"
 #include "./vp9_rtcd.h"
 #include "vp9/encoder/vp9_dct.h"
 #include "vp9/encoder/vp9_dwt.h"
 // Note: block length must be even for this implementation
 // One level of 5/3 lifting analysis along a row.
 //   length   : number of input samples; must be even.
 //   x        : input samples (read only by this routine; must not overlap
 //              lowpass/highpass).
 //   lowpass  : receives length / 2 low-band values, formed from 2 * even
 //              sample plus the rounded average of neighbouring highs.
 //   highpass : receives length / 2 high-band values (odd sample minus the
 //              rounded average of its two even neighbours).
 // The last odd sample has no right neighbour, so it is predicted from the
 // preceding even sample alone; the first low-band value uses highpass[0]
 // for both neighbours.
 static void analysis_53_row(int length, tran_low_t *x,
                            tran_low_t *lowpass, tran_low_t *highpass) {
  const int half = length >> 1;
  int k;
  tran_low_t prev;
  // Nothing to split; also keeps the boundary code below in range.
  if (half < 1) return;
  // Predict step. Multiplication is used instead of "<< 1" because
  // left-shifting a negative value is undefined behaviour in C.
  for (k = 0; k < half - 1; ++k) {
    const tran_low_t even = x[2 * k];
    lowpass[k] = even * 2;
    highpass[k] = x[2 * k + 1] - ((even + x[2 * k + 2] + 1) >> 1);
  }
  lowpass[half - 1] = x[2 * half - 2] * 2;
  highpass[half - 1] = x[2 * half - 1] - x[2 * half - 2];
  // Update step.
  prev = highpass[0];
  for (k = 0; k < half; ++k) {
    lowpass[k] += (prev + highpass[k] + 1) >> 1;
    prev = highpass[k];
  }
 }
 // One level of 5/3 lifting analysis along a column.
 //   length   : number of input samples; must be even.
 //   x        : input samples (must not overlap lowpass/highpass).
 //   lowpass  : receives length / 2 low-band values (even sample plus the
 //              rounded average of neighbouring highs).
 //   highpass : receives length / 2 high-band values. Unlike the row
 //              variant, the prediction residual is additionally halved
 //              (with rounding) and the even samples are not doubled.
 // Boundary handling: the last odd sample is predicted from the preceding
 // even sample alone; the first low-band value uses highpass[0] twice.
 static void analysis_53_col(int length, tran_low_t *x,
                            tran_low_t *lowpass, tran_low_t *highpass) {
  const int half = length >> 1;
  int k;
  tran_low_t prev;
  // Nothing to split; also keeps the boundary code below in range.
  if (half < 1) return;
  // Predict step. "* 2" replaces "<< 1": left-shifting a negative value is
  // undefined behaviour in C.
  for (k = 0; k < half - 1; ++k) {
    const tran_low_t even = x[2 * k];
    lowpass[k] = even;
    highpass[k] = (x[2 * k + 1] * 2 - (even + x[2 * k + 2]) + 2) >> 2;
  }
  lowpass[half - 1] = x[2 * half - 2];
  highpass[half - 1] = (x[2 * half - 1] - x[2 * half - 2] + 1) >> 1;
  // Update step.
  prev = highpass[0];
  for (k = 0; k < half; ++k) {
    lowpass[k] += (prev + highpass[k] + 1) >> 1;
    prev = highpass[k];
  }
 }
 // Multi-level 2-D 5/3 wavelet analysis.
 //   levels         : maximum number of decomposition levels.
 //   width, height  : block dimensions; each must not exceed DWT_MAX_LENGTH.
 //   x, pitch_x     : input samples and their row pitch.
 //   c, pitch_c     : output coefficients and their row pitch.
 //   dwt_scale_bits : input is multiplied by 2^dwt_scale_bits before the
 //                    transform.
 // The scaled input is copied into c and transformed there in place. Each
 // level splits the current top-left nw x nh region first along rows, then
 // along columns, placing the low band in the first half and the high band
 // in the second half; the next level works on the top-left quarter.
 // Decomposition stops early once a dimension drops below 2.
 static void dyadic_analyze_53(int levels, int width, int height,
                               int16_t *x, int pitch_x,
                               tran_low_t *c, int pitch_c,
                               int dwt_scale_bits) {
  int lv, i, j, nh, nw, hh = height, hw = width;
  tran_low_t buffer[2 * DWT_MAX_LENGTH];
  // Multiply instead of "<<": left-shifting a negative sample is undefined
  // behaviour in C.
  const int scale = 1 << dwt_scale_bits;
  // The scratch buffer holds one line of input plus one line of output.
  assert(width <= DWT_MAX_LENGTH && height <= DWT_MAX_LENGTH);
  for (i = 0; i < height; i++) {
    for (j = 0; j < width; j++) {
      c[i * pitch_c + j] = x[i * pitch_x + j] * scale;
    }
  }
  for (lv = 0; lv < levels; lv++) {
    nh = hh;
    hh = (hh + 1) >> 1;
    nw = hw;
    hw = (hw + 1) >> 1;
    if ((nh < 2) || (nw < 2)) return;
    // Row pass: copy the row out, then write low/high halves back in place.
    for (i = 0; i < nh; i++) {
      memcpy(buffer, &c[i * pitch_c], nw * sizeof(tran_low_t));
      analysis_53_row(nw, buffer, &c[i * pitch_c], &c[i * pitch_c] + hw);
    }
    // Column pass: gather the column into the upper half of the scratch
    // buffer, analyse into the lower half, then scatter it back.
    for (j = 0; j < nw; j++) {
      for (i = 0; i < nh; i++)
        buffer[i + nh] = c[i * pitch_c + j];
      analysis_53_col(nh, buffer + nh, buffer, buffer + hh);
      for (i = 0; i < nh; i++)
        c[i * pitch_c + j] = buffer[i];
    }
  }
 }
 // One level of 2/6 wavelet analysis along a row.
 //   length   : number of input samples (processed as length / 2 pairs).
 //   x        : input samples.
 //   lowpass  : receives the pair sums.
 //   highpass : receives the pair differences, refined (when there are at
 //              least 4 pairs) by subtracting a rounded eighth of the
 //              difference between the neighbouring low-band values.
 static void analysis_26_row(int length, tran_low_t *x,
                            tran_low_t *lowpass, tran_low_t *highpass) {
  const int half = length >> 1;
  int k;
  tran_low_t prev;
  // Haar-like split of each sample pair.
  for (k = 0; k < half; ++k) {
    const tran_low_t first = x[2 * k];
    const tran_low_t second = x[2 * k + 1];
    lowpass[k] = first + second;
    highpass[k] = first - second;
  }
  if (half < 4) return;
  // Refine the high band. The first and last entries reuse the edge
  // low-band value in place of the missing neighbour.
  prev = lowpass[0];
  for (k = 0; k < half - 1; ++k) {
    highpass[k] -= (prev - lowpass[k + 1] + 4) >> 3;
    prev = lowpass[k];
  }
  highpass[half - 1] -= (prev - lowpass[half - 1] + 4) >> 3;
 }
 // One level of 2/6 wavelet analysis along a column.
 //   length   : number of input samples (processed as length / 2 pairs).
 //   x        : input samples.
 //   lowpass  : receives the rounded half-sum of each pair.
 //   highpass : receives the rounded half-difference of each pair, refined
 //              (when there are at least 4 pairs) by subtracting a rounded
 //              eighth of the difference between neighbouring low values.
 static void analysis_26_col(int length, tran_low_t *x,
                            tran_low_t *lowpass, tran_low_t *highpass) {
  const int half = length >> 1;
  int k;
  tran_low_t prev;
  // Haar-like split of each sample pair, halved with rounding.
  for (k = 0; k < half; ++k) {
    const tran_low_t first = x[2 * k];
    const tran_low_t second = x[2 * k + 1];
    lowpass[k] = (first + second + 1) >> 1;
    highpass[k] = (first - second + 1) >> 1;
  }
  if (half < 4) return;
  // Refine the high band. The first and last entries reuse the edge
  // low-band value in place of the missing neighbour.
  prev = lowpass[0];
  for (k = 0; k < half - 1; ++k) {
    highpass[k] -= (prev - lowpass[k + 1] + 4) >> 3;
    prev = lowpass[k];
  }
  highpass[half - 1] -= (prev - lowpass[half - 1] + 4) >> 3;
 }
 // Multi-level 2-D 2/6 wavelet analysis.
 //   levels         : maximum number of decomposition levels.
 //   width, height  : block dimensions; each must not exceed DWT_MAX_LENGTH.
 //   x, pitch_x     : input samples and their row pitch.
 //   c, pitch_c     : output coefficients and their row pitch.
 //   dwt_scale_bits : input is multiplied by 2^dwt_scale_bits before the
 //                    transform.
 // The scaled input is copied into c and transformed there in place. Each
 // level splits the current top-left nw x nh region first along rows, then
 // along columns, placing the low band in the first half and the high band
 // in the second half; the next level works on the top-left quarter.
 // Decomposition stops early once a dimension drops below 2.
 static void dyadic_analyze_26(int levels, int width, int height,
                               int16_t *x, int pitch_x,
                               tran_low_t *c, int pitch_c,
                               int dwt_scale_bits) {
  int lv, i, j, nh, nw, hh = height, hw = width;
  tran_low_t buffer[2 * DWT_MAX_LENGTH];
  // Multiply instead of "<<": left-shifting a negative sample is undefined
  // behaviour in C.
  const int scale = 1 << dwt_scale_bits;
  // The scratch buffer holds one line of input plus one line of output.
  assert(width <= DWT_MAX_LENGTH && height <= DWT_MAX_LENGTH);
  for (i = 0; i < height; i++) {
    for (j = 0; j < width; j++) {
      c[i * pitch_c + j] = x[i * pitch_x + j] * scale;
    }
  }
  for (lv = 0; lv < levels; lv++) {
    nh = hh;
    hh = (hh + 1) >> 1;
    nw = hw;
    hw = (hw + 1) >> 1;
    if ((nh < 2) || (nw < 2)) return;
    // Row pass: copy the row out, then write low/high halves back in place.
    for (i = 0; i < nh; i++) {
      memcpy(buffer, &c[i * pitch_c], nw * sizeof(tran_low_t));
      analysis_26_row(nw, buffer, &c[i * pitch_c], &c[i * pitch_c] + hw);
    }
    // Column pass: gather the column into the upper half of the scratch
    // buffer, analyse into the lower half, then scatter it back.
    for (j = 0; j < nw; j++) {
      for (i = 0; i < nh; i++)
        buffer[i + nh] = c[i * pitch_c + j];
      analysis_26_col(nh, buffer + nh, buffer, buffer + hh);
      for (i = 0; i < nh; i++)
        c[i * pitch_c + j] = buffer[i];
    }
  }
 }
 // One level of 9/7 lifting analysis in double precision.
 //   length   : number of samples; must be even and at most DWT_MAX_LENGTH.
 //   x        : input samples; modified in place by the lifting passes.
 //   lowpass  : receives length / 2 scaled even-indexed results.
 //   highpass : receives length / 2 scaled odd-indexed results.
 // Four lifting passes are applied, alternating between odd-indexed
 // samples (predict) and even-indexed samples (update). At the block edge
 // the single available neighbour is counted twice.
 static void analysis_97(int length, double *x,
                        double *lowpass, double *highpass) {
  // Lifting weights in application order: predict 1, update 1, predict 2,
  // update 2.
  static const double lift_weight[4] = {
    -1.586134342, -0.05298011854, 0.8829110762, 0.4435068522
  };
  static const double gain_low = 1.149604398;
  static const double gain_high = 1/1.149604398;
  double lifted[DWT_MAX_LENGTH];
  int pass, i;
  for (pass = 0; pass < 4; ++pass) {
    const double w = lift_weight[pass];
    if (pass & 1) {
      // Update: adjust even-indexed samples from their odd neighbours.
      for (i = 2; i < length; i += 2) {
        x[i] += w * (x[i - 1] + x[i + 1]);
      }
      x[0] += 2 * w * x[1];
    } else {
      // Predict: adjust odd-indexed samples from their even neighbours.
      for (i = 1; i < length - 2; i += 2) {
        x[i] += w * (x[i - 1] + x[i + 1]);
      }
      x[length - 1] += 2 * w * x[length - 2];
    }
  }
  // Work from a private copy while writing the two output bands.
  memcpy(lifted, x, sizeof(*lifted) * length);
  // Scale and de-interleave into the low and high bands.
  for (i = 0; i < length / 2; i++) {
    lowpass[i] = lifted[2 * i] * gain_low;
    highpass[i] = lifted[2 * i + 1] * gain_high;
  }
 }
 // Multi-level 2-D 9/7 wavelet analysis, computed in double precision.
 //   levels         : maximum number of decomposition levels.
 //   width, height  : block dimensions; each must not exceed DWT_MAX_LENGTH.
 //   x, pitch_x     : input samples and their row pitch.
 //   c, pitch_c     : output coefficients and their row pitch.
 //   dwt_scale_bits : input is multiplied by 2^dwt_scale_bits before the
 //                    transform.
 // The scaled input is held in a DWT_MAX_LENGTH-pitched double work array.
 // Each level splits the current top-left nw x nh region along rows, then
 // along columns (low band first, high band second); the next level works
 // on the top-left quarter. The result is rounded to integers and written
 // to c once all levels are done.
 static void dyadic_analyze_97(int levels, int width, int height,
                               int16_t *x, int pitch_x,
                               tran_low_t *c, int pitch_c,
                               int dwt_scale_bits) {
  int lv, i, j, nh, nw, hh = height, hw = width;
  double buffer[2 * DWT_MAX_LENGTH];
  double y[DWT_MAX_LENGTH * DWT_MAX_LENGTH];
  // Multiply instead of "<<": left-shifting a negative sample is undefined
  // behaviour in C.
  const int scale = 1 << dwt_scale_bits;
  // Both work arrays are sized for at most DWT_MAX_LENGTH per dimension.
  assert(width <= DWT_MAX_LENGTH && height <= DWT_MAX_LENGTH);
  for (i = 0; i < height; i++) {
    for (j = 0; j < width; j++) {
      y[i * DWT_MAX_LENGTH + j] = x[i * pitch_x + j] * scale;
    }
  }
  for (lv = 0; lv < levels; lv++) {
    nh = hh;
    hh = (hh + 1) >> 1;
    nw = hw;
    hw = (hw + 1) >> 1;
    // Stop decomposing, but still fall through to the final copy: the
    // results live in y, so returning here would leave c unwritten.
    if ((nh < 2) || (nw < 2)) break;
    // Row pass: copy the row out, then write low/high halves back in place.
    for (i = 0; i < nh; i++) {
      memcpy(buffer, &y[i * DWT_MAX_LENGTH], nw * sizeof(*buffer));
      analysis_97(nw, buffer, &y[i * DWT_MAX_LENGTH],
                  &y[i * DWT_MAX_LENGTH] + hw);
    }
    // Column pass: gather the column into the upper half of the scratch
    // buffer, analyse into the lower half, then scatter it back.
    for (j = 0; j < nw; j++) {
      for (i = 0; i < nh; i++)
        buffer[i + nh] = y[i * DWT_MAX_LENGTH + j];
      analysis_97(nh, buffer + nh, buffer, buffer + hh);
      for (i = 0; i < nh; i++)
        y[i * DWT_MAX_LENGTH + j] = buffer[i];
    }
  }
  // Round to nearest and store into the integer output.
  for (i = 0; i < height; i++) {
    for (j = 0; j < width; j++) {
      c[i * pitch_c + j] = round(y[i * DWT_MAX_LENGTH + j]);
    }
  }
 }
 // Forward 32x32 wavelet transform: a 4-level dyadic decomposition using
 // the wavelet selected at compile time by DWT_TYPE (26, 97 or 53).
 //   input  : 32x32 block, `stride` elements between rows.
 //   output : 32x32 coefficients, 32 per row; the input is scaled up by
 //            2 bits before the transform.
 // NOTE(review): `input` is forwarded to a helper declared with an
 // int16_t * parameter - confirm this matches the width of tran_low_t.
 void vp9_fdwt32x32_c(tran_low_t *input, tran_low_t *output, int stride) {
  const int num_levels = 4;
  const int block_size = 32;
  const int scale_bits = 2;
 #if DWT_TYPE == 53
  dyadic_analyze_53(num_levels, block_size, block_size, input, stride,
                    output, block_size, scale_bits);
 #elif DWT_TYPE == 26
  dyadic_analyze_26(num_levels, block_size, block_size, input, stride,
                    output, block_size, scale_bits);
 #elif DWT_TYPE == 97
  dyadic_analyze_97(num_levels, block_size, block_size, input, stride,
                    output, block_size, scale_bits);
 #endif
 }
 // Forward 32x32 hybrid transform: one wavelet level (type selected by
 // DWT_TYPE) followed by a 16x16 DCT on the low-low band.
 //   input  : 32x32 block, `stride` elements between rows.
 //   output : 32x32 coefficients, 32 per row. The top-left 16x16 region
 //            holds the DCT of the low-low band; the remaining regions
 //            keep the wavelet coefficients.
 // NOTE(review): `input` is forwarded to a helper declared with an
 // int16_t * parameter - confirm this matches the width of tran_low_t.
 void vp9_fdwtdct32x32_c(tran_low_t *input, tran_low_t *output,
                         int stride) {
  const int dwt_levels = 1;
  tran_low_t buffer[16 * 16];
  int i;
  // Scales up by 2-bit from unitary
 #if DWT_TYPE == 26
  dyadic_analyze_26(dwt_levels, 32, 32, input, stride, output, 32, 2);
 #elif DWT_TYPE == 97
  dyadic_analyze_97(dwt_levels, 32, 32, input, stride, output, 32, 2);
 #elif DWT_TYPE == 53
  dyadic_analyze_53(dwt_levels, 32, 32, input, stride, output, 32, 2);
 #endif
  // 16x16 dct in LL band that is unitary. The band is read from the
  // top-left of output (row pitch 32) into a dense scratch buffer.
  vp9_fdct16x16_noscale(output, buffer, 32);
  // Note that the transform overall is 2-bit scaled up from unitary.
  // Copy the DCT result back over the LL band, one 16-entry row at a time.
  for (i = 0; i < 16; ++i) {
    memcpy(&output[i * 32], &buffer[i * 16], sizeof(buffer[0]) * 16);
  }
 }
 #if CONFIG_TX64X64
 // Forward 64x64 wavelet transform: a 4-level dyadic decomposition using
 // the wavelet selected at compile time by DWT_TYPE (26, 97 or 53).
 //   input  : 64x64 block, `stride` elements between rows.
 //   output : 64x64 coefficients, 64 per row; the input is scaled up by
 //            1 bit before the transform.
 // NOTE(review): `input` is forwarded to a helper declared with an
 // int16_t * parameter - confirm this matches the width of tran_low_t.
 void vp9_fdwt64x64_c(tran_low_t *input, tran_low_t *output, int stride) {
  const int num_levels = 4;
  const int block_size = 64;
  const int scale_bits = 1;
 #if DWT_TYPE == 53
  dyadic_analyze_53(num_levels, block_size, block_size, input, stride,
                    output, block_size, scale_bits);
 #elif DWT_TYPE == 26
  dyadic_analyze_26(num_levels, block_size, block_size, input, stride,
                    output, block_size, scale_bits);
 #elif DWT_TYPE == 97
  dyadic_analyze_97(num_levels, block_size, block_size, input, stride,
                    output, block_size, scale_bits);
 #endif
 }
 // Forward 64x64 hybrid transform: one wavelet level (type selected by
 // DWT_TYPE) followed by a 32x32 DCT on the low-low band.
 //   input  : 64x64 block, `stride` elements between rows.
 //   output : 64x64 coefficients, 64 per row. The top-left 32x32 region
 //            holds the DCT of the low-low band; the remaining regions
 //            keep the wavelet coefficients.
 // NOTE(review): `input` is forwarded to a helper declared with an
 // int16_t * parameter - confirm this matches the width of tran_low_t.
 void vp9_fdwtdct64x64_c(tran_low_t *input, tran_low_t *output,
                         int stride) {
  tran_low_t ll_dct[32 * 32];
  int row;
  // Single-level wavelet split; scales up by 1-bit from unitary.
 #if DWT_TYPE == 53
  dyadic_analyze_53(1, 64, 64, input, stride, output, 64, 1);
 #elif DWT_TYPE == 26
  dyadic_analyze_26(1, 64, 64, input, stride, output, 64, 1);
 #elif DWT_TYPE == 97
  dyadic_analyze_97(1, 64, 64, input, stride, output, 64, 1);
 #endif
  // Unitary 32x32 dct of the LL band, read from the top-left of output
  // (row pitch 64) into a dense scratch buffer.
  vp9_fdct32x32_noscale(output, ll_dct, 64);
  // Overall the transform stays 1-bit scaled up from unitary.
  // Put the DCT result back over the LL band, one 32-entry row at a time.
  for (row = 0; row < 32; ++row) {
    memcpy(output + row * 64, ll_dct + row * 32, 32 * sizeof(ll_dct[0]));
  }
 }
 #endif  // CONFIG_TX64X64
--- a/vp9/encoder/vp9_dwt.h
+++ b/vp9/encoder/vp9_dwt.h
@@ -0,0 +1,32 @@
 /*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #ifndef VP9_ENCODER_VP9_DWT_H_
 #define VP9_ENCODER_VP9_DWT_H_
 #include "./vpx_config.h"
 #include "vp9/common/vp9_idwt.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
 // Forward wavelet (fdwt) and hybrid wavelet + DCT (fdwtdct) transforms.
 // For each function, `input` is a square block with `stride` elements
 // between rows and `output` receives the coefficients densely packed
 // (one block width per row).
 // NOTE(review): the definitions in vp9_dwt.c carry a `_c` suffix
 // (e.g. vp9_fdwt32x32_c); presumably these unsuffixed names are mapped to
 // them elsewhere - confirm.
 // NOTE(review): the implementations forward `input` to helpers taking
 // int16_t * - confirm the intended element type of `input`.
 #if CONFIG_TX64X64
 // 64x64 variants, only available when 64x64 transforms are enabled.
 void vp9_fdwt64x64(tran_low_t *input, tran_low_t *output, int stride);
 void vp9_fdwtdct64x64(tran_low_t *input, tran_low_t *output, int stride);
 #endif  // CONFIG_TX64X64
 // 32x32 variants.
 void vp9_fdwt32x32(tran_low_t *input, tran_low_t *output, int stride);
 void vp9_fdwtdct32x32(tran_low_t *input, tran_low_t *output, int stride);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 #endif  // VP9_ENCODER_VP9_DWT_H_
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -70,6 +70,8 @@ VP9_COMMON_SRCS-yes += common/vp9_scan.c
 VP9_COMMON_SRCS-yes += common/vp9_scan.h
 VP9_COMMON_SRCS-$(CONFIG_GLOBAL_MOTION) += common/vp9_motion_model.c
 VP9_COMMON_SRCS-$(CONFIG_GLOBAL_MOTION) += common/vp9_motion_model.h
 VP9_COMMON_SRCS-$(CONFIG_WAVELETS) += common/vp9_idwt.c
 VP9_COMMON_SRCS-$(CONFIG_WAVELETS) += common/vp9_idwt.h
 VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_asm_stubs.c
 VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_intrin_sse2.c
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -85,6 +85,8 @@ VP9_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += encoder/vp9_global_motion.c
 VP9_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += encoder/vp9_global_motion.h
 VP9_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += encoder/vp9_motion_field.c
 VP9_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += encoder/vp9_motion_field.h
 VP9_CX_SRCS-$(CONFIG_WAVELETS) += encoder/vp9_dwt.c
 VP9_CX_SRCS-$(CONFIG_WAVELETS) += encoder/vp9_dwt.h
 VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_ssim.c
 VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_ssim.h
 VP9_CX_SRCS-yes += encoder/vp9_tokenize.c