Adds wavelet transforms + hybrid dct/dwt variants

The wavelets implemented are 2/6, 5/3 and 9/7, each with a lifting-based scheme for even block sizes. The 9/7 one is currently a double-precision floating-point implementation. This is to start experiments with: 1. Replacing large transforms (32x32 and 64x64) with wavelets or wavelet-dct hybrids that can hopefully localize errors better spatially. (This will also need an alternate entropy coder.) 2. Super-resolution modes where the higher sub-bands may be selectively skipped from being conveyed, while a smart reconstruction recovers the lost frequencies. The current patch includes two types of 32x32 and 64x64 transforms: one where only wavelets are used, and another where a single-level wavelet decomposition is followed by a lower-resolution dct on the low-low band. Change-Id: I2d6755c4e6c8ec9386a04633dacbe0de3b0043ec
2015-06-02 12:25:28 -07:00 · 2015-06-02 12:25:28 -07:00 · b433dd4443
commit b433dd4443
parent 5a69abc66b
11 changed files with 1085 additions and 1 deletions
--- a/1
+++ b/1
@ -301,6 +301,7 @@ EXPERIMENT_LIST="
    bitstream_fixes
    newmvref
    misc_entropy
+    wavelets
 "
 CONFIG_LIST="
    external_build
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@ -709,6 +709,33 @@ void vp9_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
  }
 }

+#if CONFIG_WAVELETS
+// Inverse 16x16 DCT that stores its result to an int16_t buffer (plain
+// assignment; nothing is added to |dest|).
+// idct16() is applied to each of the 16 rows of |input| (row pitch 16) and
+// then to each column of that intermediate result; every column output is
+// rounded with ROUND_POWER_OF_TWO(., 3) and written to |dest|, whose row
+// pitch is |stride|.
+void vp9_idct16x16_noscale_c(const tran_low_t *input, int16_t *dest,
+                             int stride) {
+  tran_low_t row_pass[16 * 16];
+  tran_low_t col_in[16], col_out[16];
+  int r, c;
+
+  // Horizontal pass: one 1-D transform per input row.
+  for (r = 0; r < 16; ++r)
+    idct16(input + r * 16, row_pass + r * 16);
+
+  // Vertical pass: gather a column, transform it, then round and scatter.
+  for (c = 0; c < 16; ++c) {
+    for (r = 0; r < 16; ++r)
+      col_in[r] = row_pass[r * 16 + c];
+    idct16(col_in, col_out);
+    for (r = 0; r < 16; ++r)
+      dest[r * stride + c] = ROUND_POWER_OF_TWO(col_out[r], 3);
+  }
+}
+#endif  // CONFIG_WAVELETS
+
 static void iadst16(const tran_low_t *input, tran_low_t *output) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
  tran_high_t s9, s10, s11, s12, s13, s14, s15;
@ -1361,6 +1388,46 @@ void vp9_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
  }
 }

+#if CONFIG_WAVELETS
+// Inverse 32x32 DCT that stores its result to an int16_t buffer (plain
+// assignment; nothing is added to |dest|).
+// Rows of |input| (row pitch 32) are transformed with idct32(); a row whose
+// coefficients are all zero skips the transform and is cleared directly.
+// Columns are then transformed, rounded with ROUND_POWER_OF_TWO(., 4) and
+// written to |dest| (row pitch |stride|).
+void vp9_idct32x32_noscale_c(const tran_low_t *input, int16_t *dest,
+                             int stride) {
+  tran_low_t out[32 * 32];
+  tran_low_t *outptr = out;
+  int i, j;
+  tran_low_t temp_in[32], temp_out[32];
+
+  // Rows
+  for (i = 0; i < 32; ++i) {
+    // OR the whole row together to detect an all-zero row. The accumulator
+    // has the coefficient type, so no set bit can be lost to a narrowing
+    // store before the test.
+    tran_low_t any_nonzero = 0;
+    for (j = 0; j < 32; ++j)
+      any_nonzero |= input[j];
+
+    if (any_nonzero)
+      idct32(input, outptr);
+    else
+      vpx_memset(outptr, 0, sizeof(tran_low_t) * 32);
+    input += 32;
+    outptr += 32;
+  }
+
+  // Columns
+  for (i = 0; i < 32; ++i) {
+    for (j = 0; j < 32; ++j)
+      temp_in[j] = out[j * 32 + i];
+    idct32(temp_in, temp_out);
+    for (j = 0; j < 32; ++j) {
+      dest[j * stride + i] = ROUND_POWER_OF_TWO(temp_out[j], 4);
+    }
+  }
+}
+#endif  // CONFIG_WAVELETS
+
 void vp9_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  tran_low_t out[32 * 32] = {0};
--- a/vp9/common/vp9_idct.h
+++ b/vp9/common/vp9_idct.h
@ -156,7 +156,7 @@ void vp9_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
 #if CONFIG_TX64X64
 void vp9_idct64x64_add(const tran_low_t *input, uint8_t *dest, int stride,
                       int eob);
-#endif
+#endif  // CONFIG_TX64X64
 void vp9_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
                    int stride, int eob);
 void vp9_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
--- a/vp9/common/vp9_idwt.c
+++ b/vp9/common/vp9_idwt.c
@ -0,0 +1,352 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_systemdependent.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_idwt.h"
+
+
+// Note: block length must be even for this implementation
+// Inverse 5/3 lifting step for one row.
+//   length   - block length; n = length >> 1 low/high pairs are consumed and
+//              2 * n samples are written.
+//   lowpass  - n low-band values; modified in place by the first loop.
+//   highpass - n high-band values; only read.
+//   x        - receives the reconstructed samples, interleaved even/odd.
+// The second loop uses `while (--n)`, so n must be at least 1 on entry.
+static void synthesis_53_row(int length,
+                             tran_low_t *lowpass, tran_low_t *highpass,
+                             tran_low_t *x) {
+  tran_low_t r, *a, *b;
+  int n;
+
+  // Undo the update step: lowpass[k] -= (highpass[k-1] + highpass[k] + 1) >> 1,
+  // with highpass[0] standing in for the missing neighbour at k == 0.
+  n = length >> 1;
+  b = highpass;
+  a = lowpass;
+  r = *highpass;
+  while (n--) {
+    *a++ -= (r + (*b) + 1) >> 1;
+    r = *b++;
+  }
+
+  // Rebuild the samples for all pairs but the last:
+  //   even = (low[k] + 1) >> 1
+  //   odd  = high[k] + ((low[k] + low[k+1] + 2) >> 2)
+  n = length >> 1;
+  b = highpass;
+  a = lowpass;
+  while (--n) {
+    *x++ = ((r = *a++) + 1) >> 1;
+    *x++ = *b++ + ((r + (*a) + 2) >> 2);
+  }
+  // Last pair: there is no low[k+1], so only low[k] feeds the odd sample.
+  *x++ = ((r = *a) + 1) >> 1;
+  *x++ = *b + ((r + 1) >> 1);
+}
+
+// Inverse 5/3 lifting step for one column. Same structure as
+// synthesis_53_row(), but the even samples are the low-band values
+// unchanged and the high-band values are doubled before the prediction is
+// added back.
+//   length   - block length; n = length >> 1 low/high pairs are consumed and
+//              2 * n samples are written.
+//   lowpass  - n low-band values; modified in place by the first loop.
+//   highpass - n high-band values; only read.
+//   x        - receives the reconstructed samples, interleaved even/odd.
+// The second loop uses `while (--n)`, so n must be at least 1 on entry.
+static void synthesis_53_col(int length,
+                             tran_low_t *lowpass, tran_low_t *highpass,
+                             tran_low_t *x) {
+  tran_low_t r, *a, *b;
+  int n;
+
+  // Undo the update step: lowpass[k] -= (highpass[k-1] + highpass[k] + 1) >> 1,
+  // with highpass[0] standing in for the missing neighbour at k == 0.
+  n = length >> 1;
+  b = highpass;
+  a = lowpass;
+  r = *highpass;
+  while (n--) {
+    *a++ -= (r + (*b) + 1) >> 1;
+    r = *b++;
+  }
+
+  // Rebuild the samples for all pairs but the last:
+  //   even = low[k]
+  //   odd  = (high[k] << 1) + ((low[k] + low[k+1] + 1) >> 1)
+  n = length >> 1;
+  b = highpass;
+  a = lowpass;
+  while (--n) {
+    r = *a++;
+    *x++ = r;
+    *x++ = ((*b++) << 1) + ((r + (*a) + 1) >> 1);
+  }
+  // Last pair: there is no low[k+1], so low[k] alone is added back.
+  *x++ = *a;
+  *x++ = ((*b) << 1) + *a;
+}
+
+// Multi-level inverse 5/3 wavelet transform.
+//   levels         - number of decomposition levels to undo.
+//   width, height  - dimensions of the full block.
+//   c, pitch_c     - coefficient plane and its row pitch; reconstructed in
+//                    place, level by level.
+//   x, pitch_x     - destination for the final samples and its row pitch.
+//   dwt_scale_bits - final right shift, applied with rounding that is
+//                    symmetric around zero.
+// Band sizes are halved (rounding up) per level. Levels are undone from the
+// coarsest to the finest: all columns first, then all rows. Levels whose
+// band is smaller than 2 in either direction are skipped.
+static void dyadic_synthesize_53(int levels, int width, int height,
+                                 tran_low_t *c, int pitch_c,
+                                 int16_t *x, int pitch_x,
+                                 int dwt_scale_bits) {
+  int band_h[16], band_w[16];
+  int level, row, col;
+  tran_low_t line[2 * DWT_MAX_LENGTH];
+  const int rounding = 1 << (dwt_scale_bits - 1);
+
+  band_h[0] = height;
+  band_w[0] = width;
+  for (level = 1; level <= levels; ++level) {
+    band_h[level] = (band_h[level - 1] + 1) >> 1;
+    band_w[level] = (band_w[level - 1] + 1) >> 1;
+  }
+
+  for (level = levels - 1; level >= 0; --level) {
+    const int cur_h = band_h[level];
+    const int cur_w = band_w[level];
+    const int low_h = band_h[level + 1];
+    const int low_w = band_w[level + 1];
+
+    if (cur_h < 2 || cur_w < 2) continue;
+
+    // Vertical pass: the first cur_h entries of |line| hold the gathered
+    // column, the next cur_h entries receive the synthesized samples.
+    for (col = 0; col < cur_w; ++col) {
+      tran_low_t *const column = c + col;
+      for (row = 0; row < cur_h; ++row)
+        line[row] = column[row * pitch_c];
+      synthesis_53_col(cur_h, line, line + low_h, line + cur_h);
+      for (row = 0; row < cur_h; ++row)
+        column[row * pitch_c] = line[row + cur_h];
+    }
+
+    // Horizontal pass: copy the row out and synthesize straight back into c.
+    for (row = 0; row < cur_h; ++row) {
+      tran_low_t *const dst_row = c + row * pitch_c;
+      memcpy(line, dst_row, cur_w * sizeof(*line));
+      synthesis_53_row(cur_w, line, line + low_w, dst_row);
+    }
+  }
+
+  // Final scaling with rounding applied to the magnitude.
+  for (row = 0; row < height; ++row) {
+    for (col = 0; col < width; ++col) {
+      const tran_low_t v = c[row * pitch_c + col];
+      if (v >= 0)
+        x[row * pitch_x + col] = (v + rounding) >> dwt_scale_bits;
+      else
+        x[row * pitch_x + col] = -((-v + rounding) >> dwt_scale_bits);
+    }
+  }
+}
+
+// Note: block length must be even for this implementation
+// Inverse 2/6 lifting step for one row.
+//   length   - block length; half = length >> 1 low/high pairs are consumed
+//              and 2 * half samples are written.
+//   lowpass  - half low-band values; only read.
+//   highpass - half high-band values; corrected in place when half >= 4.
+//   x        - receives the reconstructed samples, interleaved.
+static void synthesis_26_row(int length,
+                             tran_low_t *lowpass, tran_low_t *highpass,
+                             tran_low_t *x) {
+  const int half = length >> 1;
+  int k;
+
+  if (half >= 4) {
+    // Add back the prediction from the neighbouring low-band values; the
+    // neighbour index is clamped to the block at both ends.
+    for (k = 0; k < half; ++k) {
+      const tran_low_t prev = lowpass[k > 0 ? k - 1 : 0];
+      const tran_low_t next = lowpass[k + 1 < half ? k + 1 : half - 1];
+      highpass[k] += (prev - next + 4) >> 3;
+    }
+  }
+
+  // Rounded half-sum / half-difference gives the two samples of each pair.
+  for (k = 0; k < half; ++k) {
+    const tran_low_t hi = highpass[k];
+    const tran_low_t lo = lowpass[k];
+    x[2 * k] = (lo + hi + 1) >> 1;
+    x[2 * k + 1] = (lo - hi + 1) >> 1;
+  }
+}
+
+// Inverse 2/6 lifting step for one column. Identical to synthesis_26_row()
+// except that the output pair is the plain sum and difference (no halving).
+//   length   - block length; half = length >> 1 low/high pairs are consumed
+//              and 2 * half samples are written.
+//   lowpass  - half low-band values; only read.
+//   highpass - half high-band values; corrected in place when half >= 4.
+//   x        - receives the reconstructed samples, interleaved.
+static void synthesis_26_col(int length,
+                             tran_low_t *lowpass, tran_low_t *highpass,
+                             tran_low_t *x) {
+  const int half = length >> 1;
+  int k;
+
+  if (half >= 4) {
+    // Add back the prediction from the neighbouring low-band values; the
+    // neighbour index is clamped to the block at both ends.
+    for (k = 0; k < half; ++k) {
+      const tran_low_t prev = lowpass[k > 0 ? k - 1 : 0];
+      const tran_low_t next = lowpass[k + 1 < half ? k + 1 : half - 1];
+      highpass[k] += (prev - next + 4) >> 3;
+    }
+  }
+
+  for (k = 0; k < half; ++k) {
+    const tran_low_t hi = highpass[k];
+    const tran_low_t lo = lowpass[k];
+    x[2 * k] = lo + hi;
+    x[2 * k + 1] = lo - hi;
+  }
+}
+
+// Multi-level inverse 2/6 wavelet transform.
+//   levels         - number of decomposition levels to undo.
+//   width, height  - dimensions of the full block.
+//   c, pitch_c     - coefficient plane and its row pitch; reconstructed in
+//                    place, level by level.
+//   x, pitch_x     - destination for the final samples and its row pitch.
+//   dwt_scale_bits - final right shift, applied with rounding that is
+//                    symmetric around zero.
+// Band sizes are halved (rounding up) per level. Levels are undone from the
+// coarsest to the finest: all columns first, then all rows. Levels whose
+// band is smaller than 2 in either direction are skipped.
+static void dyadic_synthesize_26(int levels, int width, int height,
+                                 tran_low_t *c, int pitch_c,
+                                 int16_t *x, int pitch_x,
+                                 int dwt_scale_bits) {
+  int band_h[16], band_w[16];
+  int level, row, col;
+  tran_low_t line[2 * DWT_MAX_LENGTH];
+  const int rounding = 1 << (dwt_scale_bits - 1);
+
+  band_h[0] = height;
+  band_w[0] = width;
+  for (level = 1; level <= levels; ++level) {
+    band_h[level] = (band_h[level - 1] + 1) >> 1;
+    band_w[level] = (band_w[level - 1] + 1) >> 1;
+  }
+
+  for (level = levels - 1; level >= 0; --level) {
+    const int cur_h = band_h[level];
+    const int cur_w = band_w[level];
+    const int low_h = band_h[level + 1];
+    const int low_w = band_w[level + 1];
+
+    if (cur_h < 2 || cur_w < 2) continue;
+
+    // Vertical pass: the first cur_h entries of |line| hold the gathered
+    // column, the next cur_h entries receive the synthesized samples.
+    for (col = 0; col < cur_w; ++col) {
+      tran_low_t *const column = c + col;
+      for (row = 0; row < cur_h; ++row)
+        line[row] = column[row * pitch_c];
+      synthesis_26_col(cur_h, line, line + low_h, line + cur_h);
+      for (row = 0; row < cur_h; ++row)
+        column[row * pitch_c] = line[row + cur_h];
+    }
+
+    // Horizontal pass: copy the row out and synthesize straight back into c.
+    for (row = 0; row < cur_h; ++row) {
+      tran_low_t *const dst_row = c + row * pitch_c;
+      memcpy(line, dst_row, cur_w * sizeof(*line));
+      synthesis_26_row(cur_w, line, line + low_w, dst_row);
+    }
+  }
+
+  // Final scaling with rounding applied to the magnitude.
+  for (row = 0; row < height; ++row) {
+    for (col = 0; col < width; ++col) {
+      const tran_low_t v = c[row * pitch_c + col];
+      if (v >= 0)
+        x[row * pitch_x + col] = (v + rounding) >> dwt_scale_bits;
+      else
+        x[row * pitch_x + col] = -((-v + rounding) >> dwt_scale_bits);
+    }
+  }
+}
+
+// Inverse 9/7 lifting step for one line, in double precision.
+//   length   - block length; length / 2 values are read from each band.
+//   lowpass  - low-band values; only read.
+//   highpass - high-band values; only read.
+//   x        - receives |length| reconstructed samples.
+// The even-index loops read x[i + 1], so |length| must be even for every
+// access to stay inside the block. Both inputs are fully read into the
+// local |y| before anything is written to |x|.
+static void synthesis_97(int length, double *lowpass, double *highpass,
+                         double *x) {
+  const double a_predict1 = -1.586134342;
+  const double a_update1 = -0.05298011854;
+  const double a_predict2 = 0.8829110762;
+  const double a_update2 = 0.4435068522;
+  const double s_low = 1.149604398;
+  const double s_high = 1/1.149604398;
+  const double inv_s_low = 1 / s_low;
+  const double inv_s_high = 1 / s_high;
+  int i;
+  double y[DWT_MAX_LENGTH];
+  // Undo pack and scale: low-band values go to even positions, high-band
+  // values to odd positions, each multiplied by its inverse scale factor.
+  for (i = 0; i < length / 2; i++) {
+    y[i * 2] = lowpass[i] * inv_s_low;
+    y[i * 2 + 1] = highpass[i] * inv_s_high;
+  }
+  memcpy(x, y, sizeof(*y) * length);
+  // Undo update 2 (even samples; x[0] has one neighbour, counted twice)
+  for (i = 2; i < length; i += 2) {
+    x[i] -= a_update2 * (x[i - 1] + x[i + 1]);
+  }
+  x[0] -= 2 * a_update2 * x[1];
+  // Undo predict 2 (odd samples; the last one has one neighbour, counted
+  // twice)
+  for (i = 1; i < length - 2; i += 2) {
+    x[i] -= a_predict2 * (x[i - 1] + x[i + 1]);
+  }
+  x[length - 1] -= 2 * a_predict2 * x[length - 2];
+  // Undo update 1
+  for (i = 2; i < length; i += 2) {
+    x[i] -= a_update1 * (x[i - 1] + x[i + 1]);
+  }
+  x[0] -= 2 * a_update1 * x[1];
+  // Undo predict 1
+  for (i = 1; i < length - 2; i += 2) {
+    x[i] -= a_predict1 * (x[i - 1] + x[i + 1]);
+  }
+  x[length - 1] -= 2 * a_predict1 * x[length - 2];
+}
+
+// Multi-level inverse 9/7 wavelet transform, computed in double precision.
+//   levels         - number of decomposition levels to undo.
+//   width, height  - dimensions of the full block.
+//   c, pitch_c     - coefficient plane and its row pitch; only read.
+//   x, pitch_x     - destination for the final samples and its row pitch.
+//   dwt_scale_bits - the result is divided by (1 << dwt_scale_bits) and
+//                    passed through round() before being stored.
+// The coefficients are copied into a local double plane with a fixed row
+// pitch of DWT_MAX_LENGTH. Levels are undone from the coarsest to the
+// finest: all columns first, then all rows. Levels whose band is smaller
+// than 2 in either direction are skipped.
+static void dyadic_synthesize_97(int levels, int width, int height,
+                                 tran_low_t *c, int pitch_c,
+                                 int16_t *x, int pitch_x,
+                                 int dwt_scale_bits) {
+  int band_h[16], band_w[16];
+  int level, row, col;
+  double line[2 * DWT_MAX_LENGTH];
+  double plane[DWT_MAX_LENGTH * DWT_MAX_LENGTH];
+
+  for (row = 0; row < height; ++row) {
+    for (col = 0; col < width; ++col)
+      plane[row * DWT_MAX_LENGTH + col] = c[row * pitch_c + col];
+  }
+
+  band_h[0] = height;
+  band_w[0] = width;
+  for (level = 1; level <= levels; ++level) {
+    band_h[level] = (band_h[level - 1] + 1) >> 1;
+    band_w[level] = (band_w[level - 1] + 1) >> 1;
+  }
+
+  for (level = levels - 1; level >= 0; --level) {
+    const int cur_h = band_h[level];
+    const int cur_w = band_w[level];
+    const int low_h = band_h[level + 1];
+    const int low_w = band_w[level + 1];
+
+    if (cur_h < 2 || cur_w < 2) continue;
+
+    // Vertical pass: the first cur_h entries of |line| hold the gathered
+    // column, the next cur_h entries receive the synthesized samples.
+    for (col = 0; col < cur_w; ++col) {
+      double *const column = plane + col;
+      for (row = 0; row < cur_h; ++row)
+        line[row] = column[row * DWT_MAX_LENGTH];
+      synthesis_97(cur_h, line, line + low_h, line + cur_h);
+      for (row = 0; row < cur_h; ++row)
+        column[row * DWT_MAX_LENGTH] = line[row + cur_h];
+    }
+
+    // Horizontal pass: copy the row out and synthesize straight back.
+    for (row = 0; row < cur_h; ++row) {
+      double *const dst_row = plane + row * DWT_MAX_LENGTH;
+      memcpy(line, dst_row, cur_w * sizeof(*line));
+      synthesis_97(cur_w, line, line + low_w, dst_row);
+    }
+  }
+
+  for (row = 0; row < height; ++row) {
+    for (col = 0; col < width; ++col) {
+      x[row * pitch_x + col] = round(plane[row * DWT_MAX_LENGTH + col] /
+                                     (1 << dwt_scale_bits));
+    }
+  }
+}
+
+// Inverse 32x32 wavelet transform: runs the 4-level synthesis selected at
+// compile time by DWT_TYPE over |input| (row pitch 32) and writes the result
+// to |output| with row pitch |stride|, using 2 scale bits.
+// If DWT_TYPE is none of 26/53/97 the body is empty and |output| is left
+// untouched.
+// NOTE(review): |output| is a tran_low_t* but the synthesis routines take an
+// int16_t* destination - confirm the two types match in every configuration.
+void vp9_idwt32x32_c(tran_low_t *input, tran_low_t *output, int stride) {
+#if DWT_TYPE == 26
+  dyadic_synthesize_26(4, 32, 32, input, 32, output, stride, 2);
+#elif DWT_TYPE == 97
+  dyadic_synthesize_97(4, 32, 32, input, 32, output, stride, 2);
+#elif DWT_TYPE == 53
+  dyadic_synthesize_53(4, 32, 32, input, 32, output, stride, 2);
+#endif
+}
+
+// Inverse of the 32x32 hybrid transform: the top-left 16x16 quadrant of
+// |input| (row pitch 32) is run through vp9_idct16x16_noscale(), the result
+// replaces that quadrant in a working copy of the coefficients, and a
+// single-level wavelet synthesis (selected by DWT_TYPE, 2 scale bits) then
+// produces |output| with row pitch |stride|. |input| itself is not modified.
+// The DCT result is staged in a genuine int16_t buffer and copied element by
+// element, so the working copy is filled correctly whatever the width of
+// tran_low_t.
+// NOTE(review): |output| is still forwarded to a routine that takes an
+// int16_t* destination - confirm the intended output element type.
+void vp9_idwtdct32x32_c(tran_low_t *input, tran_low_t *output, int stride) {
+  const int dwt_levels = 1;
+  tran_low_t buffer[16 * 16];
+  int16_t dct_out[16 * 16];
+  tran_low_t buffer2[32 * 32];
+  int i, j;
+  for (i = 0; i < 32; ++i) {
+    memcpy(&buffer2[i * 32], &input[i * 32], sizeof(buffer2[0]) * 32);
+  }
+  for (i = 0; i < 16; ++i) {
+    memcpy(&buffer[i * 16], &input[i * 32], sizeof(buffer[0]) * 16);
+  }
+  vp9_idct16x16_noscale(buffer, dct_out, 16);
+  // Overwrite the low-low quadrant of the working copy with the DCT result.
+  for (i = 0; i < 16; ++i) {
+    for (j = 0; j < 16; ++j)
+      buffer2[i * 32 + j] = dct_out[i * 16 + j];
+  }
+
+#if DWT_TYPE == 26
+  dyadic_synthesize_26(dwt_levels, 32, 32, buffer2, 32, output, stride, 2);
+#elif DWT_TYPE == 97
+  dyadic_synthesize_97(dwt_levels, 32, 32, buffer2, 32, output, stride, 2);
+#elif DWT_TYPE == 53
+  dyadic_synthesize_53(dwt_levels, 32, 32, buffer2, 32, output, stride, 2);
+#endif
+}
+
+#if CONFIG_TX64X64
+// Inverse 64x64 wavelet transform: runs the 4-level synthesis selected at
+// compile time by DWT_TYPE over |input| (row pitch 64) and writes the result
+// to |output| with row pitch |stride|, using 1 scale bit.
+// If DWT_TYPE is none of 26/53/97 the body is empty and |output| is left
+// untouched.
+// NOTE(review): |output| is a tran_low_t* but the synthesis routines take an
+// int16_t* destination - confirm the two types match in every configuration.
+void vp9_idwt64x64_c(tran_low_t *input, tran_low_t *output, int stride) {
+#if DWT_TYPE == 26
+  dyadic_synthesize_26(4, 64, 64, input, 64, output, stride, 1);
+#elif DWT_TYPE == 97
+  dyadic_synthesize_97(4, 64, 64, input, 64, output, stride, 1);
+#elif DWT_TYPE == 53
+  dyadic_synthesize_53(4, 64, 64, input, 64, output, stride, 1);
+#endif
+}
+
+// Inverse of the 64x64 hybrid transform: the top-left 32x32 quadrant of
+// |input| (row pitch 64) is run through vp9_idct32x32_noscale(), the result
+// replaces that quadrant in a working copy of the coefficients, and a
+// single-level wavelet synthesis (selected by DWT_TYPE, 1 scale bit) then
+// produces |output| with row pitch |stride|. |input| itself is not modified.
+// The DCT result is staged in a genuine int16_t buffer and copied element by
+// element, so the working copy is filled correctly whatever the width of
+// tran_low_t.
+// NOTE(review): |output| is still forwarded to a routine that takes an
+// int16_t* destination - confirm the intended output element type.
+void vp9_idwtdct64x64_c(tran_low_t *input, tran_low_t *output, int stride) {
+  const int dwt_levels = 1;
+  tran_low_t buffer[32 * 32];
+  int16_t dct_out[32 * 32];
+  tran_low_t buffer2[64 * 64];
+  int i, j;
+  for (i = 0; i < 64; ++i) {
+    memcpy(&buffer2[i * 64], &input[i * 64], sizeof(buffer2[0]) * 64);
+  }
+  for (i = 0; i < 32; ++i) {
+    memcpy(&buffer[i * 32], &input[i * 64], sizeof(buffer[0]) * 32);
+  }
+  vp9_idct32x32_noscale(buffer, dct_out, 32);
+  // Overwrite the low-low quadrant of the working copy with the DCT result.
+  for (i = 0; i < 32; ++i) {
+    for (j = 0; j < 32; ++j)
+      buffer2[i * 64 + j] = dct_out[i * 32 + j];
+  }
+#if DWT_TYPE == 26
+  dyadic_synthesize_26(dwt_levels, 64, 64, buffer2, 64, output, stride, 1);
+#elif DWT_TYPE == 97
+  dyadic_synthesize_97(dwt_levels, 64, 64, buffer2, 64, output, stride, 1);
+#elif DWT_TYPE == 53
+  dyadic_synthesize_53(dwt_levels, 64, 64, buffer2, 64, output, stride, 1);
+#endif
+}
+#endif  // CONFIG_TX64X64
--- a/vp9/common/vp9_idwt.h
+++ b/vp9/common/vp9_idwt.h
@ -0,0 +1,39 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_VP9_IDWT_H_
+#define VP9_COMMON_VP9_IDWT_H_
+
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_idct.h"
+
+// Sizes the scratch buffers of the wavelet code: per-line buffers hold
+// 2 * DWT_MAX_LENGTH entries and the 9/7 working plane is
+// DWT_MAX_LENGTH x DWT_MAX_LENGTH.
+#define DWT_MAX_LENGTH   64
+// Compile-time choice of wavelet filter; the transform entry points test it
+// against the values 26, 53 and 97.
+#define DWT_TYPE         26    // 26/53/97
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Inverse wavelet (idwt) and inverse wavelet+DCT hybrid (idwtdct) transform
+// entry points. |input| holds the coefficients, |output| receives the
+// reconstructed block with row pitch |stride|.
+// NOTE(review): the definitions visible in vp9_idwt.c carry a `_c` suffix
+// (e.g. vp9_idwt32x32_c) - confirm how these unsuffixed names resolve to
+// them.
+#if CONFIG_TX64X64
+void vp9_idwt64x64(tran_low_t *input, tran_low_t *output, int stride);
+void vp9_idwtdct64x64(tran_low_t *input, tran_low_t *output, int stride);
+#endif  // CONFIG_TX64X64
+void vp9_idwt32x32(tran_low_t *input, tran_low_t *output, int stride);
+void vp9_idwtdct32x32(tran_low_t *input, tran_low_t *output, int stride);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_COMMON_VP9_IDWT_H_
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@ -396,6 +396,11 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
  add_proto qw/void vp9_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
  specialize qw/vp9_idct16x16_256_add/;

+  if (vpx_config("CONFIG_WAVELETS") eq "yes") {
+    add_proto qw/void vp9_idct16x16_noscale/, "const tran_low_t *input, int16_t *dest, int dest_stride";
+    specialize qw/vp9_idct16x16_noscale/;
+  }
+
  add_proto qw/void vp9_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
  specialize qw/vp9_idct16x16_10_add/;

@ -411,6 +416,11 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
  if (vpx_config("CONFIG_TX64X64") eq "yes") {
    add_proto qw/void vp9_idct64x64_4096_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
    specialize qw/vp9_idct64x64_4096_add/;
+
+    if (vpx_config("CONFIG_WAVELETS") eq "yes") {
+      add_proto qw/void vp9_idct32x32_noscale/, "const tran_low_t *input, int16_t *dest, int dest_stride";
+      specialize qw/vp9_idct32x32_noscale/;
+    }
  }

  add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
@ -454,6 +464,11 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
    add_proto qw/void vp9_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
    specialize qw/vp9_idct16x16_256_add/;

+    if (vpx_config("CONFIG_WAVELETS") eq "yes") {
+      add_proto qw/void vp9_idct16x16_noscale/, "const tran_low_t *input, int16_t *dest, int dest_stride";
+      specialize qw/vp9_idct16x16_noscale/;
+    }
+
    add_proto qw/void vp9_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
    specialize qw/vp9_idct16x16_10_add/;

@ -469,6 +484,11 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
    if (vpx_config("CONFIG_TX64X64") eq "yes") {
      add_proto qw/void vp9_idct64x64_4096_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
      specialize qw/vp9_idct64x64_4096_add/;
+
+      if (vpx_config("CONFIG_WAVELETS") eq "yes") {
+        add_proto qw/void vp9_idct32x32_noscale/, "const tran_low_t *input, int16_t *dest, int dest_stride";
+        specialize qw/vp9_idct32x32_noscale/;
+      }
    }

    add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
@ -516,6 +536,11 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
    specialize qw/vp9_idct16x16_256_add sse2 ssse3 neon_asm dspr2/;
    $vp9_idct16x16_256_add_neon_asm=vp9_idct16x16_256_add_neon;

+    if (vpx_config("CONFIG_WAVELETS") eq "yes") {
+      add_proto qw/void vp9_idct16x16_noscale/, "const tran_low_t *input, int16_t *dest, int dest_stride";
+      specialize qw/vp9_idct16x16_noscale/;
+    }
+
    add_proto qw/void vp9_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
    specialize qw/vp9_idct16x16_10_add sse2 ssse3 neon_asm dspr2/;
    $vp9_idct16x16_10_add_neon_asm=vp9_idct16x16_10_add_neon;
@ -535,6 +560,11 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
    if (vpx_config("CONFIG_TX64X64") eq "yes") {
      add_proto qw/void vp9_idct64x64_4096_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
      specialize qw/vp9_idct64x64_4096_add/;
+
+      if (vpx_config("CONFIG_WAVELETS") eq "yes") {
+        add_proto qw/void vp9_idct32x32_noscale/, "const tran_low_t *input, int16_t *dest, int dest_stride";
+        specialize qw/vp9_idct32x32_noscale/;
+      }
    }

    add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
@ -1498,12 +1528,22 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
  add_proto qw/void vp9_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
  specialize qw/vp9_fdct32x32_rd/;

+  if (vpx_config("CONFIG_WAVELETS") eq "yes") {
+    add_proto qw/void vp9_fdct16x16_noscale/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp9_fdct16x16_noscale/;
+  }
+
  if (vpx_config("CONFIG_TX64X64") eq "yes") {
    add_proto qw/void vp9_fdct64x64_1/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/vp9_fdct64x64_1/;

    add_proto qw/void vp9_fdct64x64/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/vp9_fdct64x64/;
+
+    if (vpx_config("CONFIG_WAVELETS") eq "yes") {
+      add_proto qw/void vp9_fdct32x32_noscale/, "const int16_t *input, tran_low_t *output, int stride";
+      specialize qw/vp9_fdct32x32_noscale/;
+    }
  }
  specialize qw/vp9_fdct32x32_rd sse2/;
 } else {
@ -1546,12 +1586,22 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
  add_proto qw/void vp9_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
  specialize qw/vp9_fdct32x32_rd sse2 avx2/;

+  if (vpx_config("CONFIG_WAVELETS") eq "yes") {
+    add_proto qw/void vp9_fdct16x16_noscale/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/vp9_fdct16x16_noscale/;
+  }
+
  if (vpx_config("CONFIG_TX64X64") eq "yes") {
    add_proto qw/void vp9_fdct64x64_1/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/vp9_fdct64x64_1/;

    add_proto qw/void vp9_fdct64x64/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/vp9_fdct64x64/;
+
+    if (vpx_config("CONFIG_WAVELETS") eq "yes") {
+      add_proto qw/void vp9_fdct32x32_noscale/, "const int16_t *input, tran_low_t *output, int stride";
+      specialize qw/vp9_fdct32x32_noscale/;
+    }
  }
 }

--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@ -522,6 +522,193 @@ void vp9_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
  }
 }

+#if CONFIG_WAVELETS
+// The difference between this one and the function above is scaling
+// of the input. This function does not scale so that the actual 2D
+// transform is unitary. The function above scales the transform to be
+// 8 times unitary.
+//   input  - source block, read with row pitch |stride|.
+//   output - receives the 16x16 coefficients, written with row pitch 16.
+//   stride - row pitch of |input|.
+void vp9_fdct16x16_noscale_c(const int16_t *input, tran_low_t *output,
+                             int stride) {
+  // The 2D transform is done with two passes which are actually pretty
+  // similar. In the first one, we transform the columns and transpose
+  // the results. In the second one, we transform the rows. To achieve that,
+  // as the first pass results are transposed, we transpose the columns (that
+  // is the transposed rows) and transpose the results (so that it goes back
+  // in normal/row positions).
+  int pass;
+  // We need an intermediate buffer between passes.
+  tran_low_t intermediate[256];
+  const int16_t *in_pass0 = input;
+  const tran_low_t *in = NULL;
+  tran_low_t *out = intermediate;
+  // Do the two transform/transpose passes
+  for (pass = 0; pass < 2; ++pass) {
+    tran_high_t step1[8];      // canbe16
+    tran_high_t step2[8];      // canbe16
+    tran_high_t step3[8];      // canbe16
+    // Note: this local array shadows the |input| parameter inside the loop;
+    // the source block is reached through |in_pass0| instead.
+    tran_high_t input[8];      // canbe16
+    tran_high_t temp1, temp2;  // needs32
+    int i;
+    for (i = 0; i < 16; i++) {
+      if (0 == pass) {
+        // First pass reads the source block; every butterfly sum/difference
+        // is halved (>> 1).
+        // Calculate input for the first 8 results.
+        input[0] = (in_pass0[0 * stride] + in_pass0[15 * stride]) >> 1;
+        input[1] = (in_pass0[1 * stride] + in_pass0[14 * stride]) >> 1;
+        input[2] = (in_pass0[2 * stride] + in_pass0[13 * stride]) >> 1;
+        input[3] = (in_pass0[3 * stride] + in_pass0[12 * stride]) >> 1;
+        input[4] = (in_pass0[4 * stride] + in_pass0[11 * stride]) >> 1;
+        input[5] = (in_pass0[5 * stride] + in_pass0[10 * stride]) >> 1;
+        input[6] = (in_pass0[6 * stride] + in_pass0[ 9 * stride]) >> 1;
+        input[7] = (in_pass0[7 * stride] + in_pass0[ 8 * stride]) >> 1;
+        // Calculate input for the next 8 results.
+        step1[0] = (in_pass0[7 * stride] - in_pass0[ 8 * stride]) >> 1;
+        step1[1] = (in_pass0[6 * stride] - in_pass0[ 9 * stride]) >> 1;
+        step1[2] = (in_pass0[5 * stride] - in_pass0[10 * stride]) >> 1;
+        step1[3] = (in_pass0[4 * stride] - in_pass0[11 * stride]) >> 1;
+        step1[4] = (in_pass0[3 * stride] - in_pass0[12 * stride]) >> 1;
+        step1[5] = (in_pass0[2 * stride] - in_pass0[13 * stride]) >> 1;
+        step1[6] = (in_pass0[1 * stride] - in_pass0[14 * stride]) >> 1;
+        step1[7] = (in_pass0[0 * stride] - in_pass0[15 * stride]) >> 1;
+      } else {
+        // Second pass reads the intermediate buffer (pitch 16); each operand
+        // is reduced with (v + 1) >> 2 before the butterfly.
+        // Calculate input for the first 8 results.
+        input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2);
+        input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2);
+        input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2);
+        input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2);
+        input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2);
+        input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2);
+        input[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2);
+        input[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2);
+        // Calculate input for the next 8 results.
+        step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2);
+        step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2);
+        step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2);
+        step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2);
+        step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2);
+        step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2);
+        step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2);
+        step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2);
+      }
+      // Work on the first eight values; fdct8(input, even_results);
+      {
+        tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
+        tran_high_t t0, t1, t2, t3;                  // needs32
+        tran_high_t x0, x1, x2, x3;                  // canbe16
+
+        // stage 1
+        s0 = input[0] + input[7];
+        s1 = input[1] + input[6];
+        s2 = input[2] + input[5];
+        s3 = input[3] + input[4];
+        s4 = input[3] - input[4];
+        s5 = input[2] - input[5];
+        s6 = input[1] - input[6];
+        s7 = input[0] - input[7];
+
+        // fdct4(step, step);
+        x0 = s0 + s3;
+        x1 = s1 + s2;
+        x2 = s1 - s2;
+        x3 = s0 - s3;
+        t0 = (x0 + x1) * cospi_16_64;
+        t1 = (x0 - x1) * cospi_16_64;
+        t2 = x3 * cospi_8_64  + x2 * cospi_24_64;
+        t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
+        out[0] = fdct_round_shift(t0);
+        out[4] = fdct_round_shift(t2);
+        out[8] = fdct_round_shift(t1);
+        out[12] = fdct_round_shift(t3);
+
+        // Stage 2
+        t0 = (s6 - s5) * cospi_16_64;
+        t1 = (s6 + s5) * cospi_16_64;
+        t2 = fdct_round_shift(t0);
+        t3 = fdct_round_shift(t1);
+
+        // Stage 3
+        x0 = s4 + t2;
+        x1 = s4 - t2;
+        x2 = s7 - t3;
+        x3 = s7 + t3;
+
+        // Stage 4
+        t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
+        t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
+        t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
+        t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
+        out[2] = fdct_round_shift(t0);
+        out[6] = fdct_round_shift(t2);
+        out[10] = fdct_round_shift(t1);
+        out[14] = fdct_round_shift(t3);
+      }
+      // Work on the next eight values; step1 -> odd_results
+      {
+        // step 2
+        temp1 = (step1[5] - step1[2]) * cospi_16_64;
+        temp2 = (step1[4] - step1[3]) * cospi_16_64;
+        step2[2] = fdct_round_shift(temp1);
+        step2[3] = fdct_round_shift(temp2);
+        temp1 = (step1[4] + step1[3]) * cospi_16_64;
+        temp2 = (step1[5] + step1[2]) * cospi_16_64;
+        step2[4] = fdct_round_shift(temp1);
+        step2[5] = fdct_round_shift(temp2);
+        // step 3
+        step3[0] = step1[0] + step2[3];
+        step3[1] = step1[1] + step2[2];
+        step3[2] = step1[1] - step2[2];
+        step3[3] = step1[0] - step2[3];
+        step3[4] = step1[7] - step2[4];
+        step3[5] = step1[6] - step2[5];
+        step3[6] = step1[6] + step2[5];
+        step3[7] = step1[7] + step2[4];
+        // step 4
+        temp1 = step3[1] *  -cospi_8_64 + step3[6] * cospi_24_64;
+        temp2 = step3[2] * cospi_24_64 + step3[5] *  cospi_8_64;
+        step2[1] = fdct_round_shift(temp1);
+        step2[2] = fdct_round_shift(temp2);
+        temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
+        temp2 = step3[1] * cospi_24_64 + step3[6] *  cospi_8_64;
+        step2[5] = fdct_round_shift(temp1);
+        step2[6] = fdct_round_shift(temp2);
+        // step 5
+        step1[0] = step3[0] + step2[1];
+        step1[1] = step3[0] - step2[1];
+        step1[2] = step3[3] + step2[2];
+        step1[3] = step3[3] - step2[2];
+        step1[4] = step3[4] - step2[5];
+        step1[5] = step3[4] + step2[5];
+        step1[6] = step3[7] - step2[6];
+        step1[7] = step3[7] + step2[6];
+        // step 6
+        temp1 = step1[0] * cospi_30_64 + step1[7] *  cospi_2_64;
+        temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
+        out[1] = fdct_round_shift(temp1);
+        out[9] = fdct_round_shift(temp2);
+        temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
+        temp2 = step1[3] *  cospi_6_64 + step1[4] * cospi_26_64;
+        out[5] = fdct_round_shift(temp1);
+        out[13] = fdct_round_shift(temp2);
+        temp1 = step1[3] * -cospi_26_64 + step1[4] *  cospi_6_64;
+        temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
+        out[3] = fdct_round_shift(temp1);
+        out[11] = fdct_round_shift(temp2);
+        temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
+        temp2 = step1[0] *  -cospi_2_64 + step1[7] * cospi_30_64;
+        out[7] = fdct_round_shift(temp1);
+        out[15] = fdct_round_shift(temp2);
+      }
+      // Do next column (which is a transposed row in second/horizontal pass)
+      in++;
+      in_pass0++;
+      out += 16;
+    }
+    // Setup in/out for next pass.
+    in = intermediate;
+    out = output;
+  }
+}
+#endif  // CONFIG_WAVELETS
+
 void vp9_fadst8(const tran_low_t *input, tran_low_t *output) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;

@ -1389,6 +1576,35 @@ void vp9_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
  }
 }

+#if CONFIG_WAVELETS
+// Forward 32x32 DCT that feeds the source samples to the column transform
+// as they are (no pre-scaling).
+//   input  - source block, read with row pitch |stride|.
+//   out    - receives the 32x32 coefficients, written with row pitch 32.
+// Both passes call vp9_fdct32(., ., 0) and divide the result by 4; the
+// rounding term differs between them: +1 plus 1 more for positive values
+// after the column pass, +1 plus 1 more for negative values after the row
+// pass.
+void vp9_fdct32x32_noscale_c(const int16_t *input, tran_low_t *out,
+                             int stride) {
+  tran_high_t col_pass[32 * 32];
+  tran_high_t line_in[32], line_out[32];
+  int r, c;
+
+  // Vertical pass
+  for (c = 0; c < 32; ++c) {
+    for (r = 0; r < 32; ++r)
+      line_in[r] = input[r * stride + c];
+    vp9_fdct32(line_in, line_out, 0);
+    for (r = 0; r < 32; ++r) {
+      const tran_high_t v = line_out[r];
+      col_pass[r * 32 + c] = (v + 1 + (v > 0)) >> 2;
+    }
+  }
+
+  // Horizontal pass
+  for (r = 0; r < 32; ++r) {
+    const tran_high_t *const src_row = &col_pass[r * 32];
+    tran_low_t *const dst_row = &out[r * 32];
+    for (c = 0; c < 32; ++c)
+      line_in[c] = src_row[c];
+    vp9_fdct32(line_in, line_out, 0);
+    for (c = 0; c < 32; ++c) {
+      const tran_high_t v = line_out[c];
+      dst_row[c] = (tran_low_t)((v + 1 + (v < 0)) >> 2);
+    }
+  }
+}
+#endif  // CONFIG_WAVELETS
+
 // Note that although we use dct_32_round in dct32 computation flow,
 // this 2d fdct32x32 for rate-distortion optimization loop is operating
 // within 16 bits precision.
--- a/vp9/encoder/vp9_dwt.c
+++ b/vp9/encoder/vp9_dwt.c
@ -0,0 +1,323 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <math.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+
+#include "vp9/encoder/vp9_dct.h"
+#include "vp9/encoder/vp9_dwt.h"
+
+// One level of 5/3 lifting analysis, row variant.
+// Note: block length must be even for this implementation.
+//   length   - number of input samples (even, at least 2; shorter inputs
+//              are ignored).
+//   x        - input samples; only read.
+//   lowpass  - receives length / 2 low-band coefficients.
+//   highpass - receives length / 2 high-band coefficients.
+// The callers in this file always pass output buffers that do not overlap x.
+static void analysis_53_row(int length, tran_low_t *x,
+                            tran_low_t *lowpass, tran_low_t *highpass) {
+  const int half = length >> 1;
+  const int last = half - 1;
+  tran_low_t prev;
+  int i;
+
+  if (half < 1) return;
+
+  // Predict step. Even samples are doubled into the low band; the doubling
+  // is a multiply because left-shifting a negative value is undefined
+  // behavior in C. Each odd sample has the rounded mean of its two even
+  // neighbours subtracted.
+  for (i = 0; i < last; ++i) {
+    lowpass[i] = x[2 * i] * 2;
+    highpass[i] = x[2 * i + 1] - ((x[2 * i] + x[2 * i + 2] + 1) >> 1);
+  }
+  // Final pair: there is no right-hand even neighbour, so only the left one
+  // is subtracted.
+  lowpass[last] = x[2 * last] * 2;
+  highpass[last] = x[2 * last + 1] - x[2 * last];
+
+  // Update step: add the rounded mean of the current and previous high-band
+  // values; the first coefficient uses highpass[0] for both.
+  prev = highpass[0];
+  for (i = 0; i < half; ++i) {
+    lowpass[i] += (prev + highpass[i] + 1) >> 1;
+    prev = highpass[i];
+  }
+}
+
+// One level of 5/3 lifting analysis, column variant.
+// Unlike the row variant, even samples enter the low band unscaled and the
+// high band is computed as (2 * odd - left - right + 2) >> 2.
+//   length   - number of input samples (even, at least 2; shorter inputs
+//              are ignored).
+//   x        - input samples; only read.
+//   lowpass  - receives length / 2 low-band coefficients.
+//   highpass - receives length / 2 high-band coefficients.
+// The caller in this file passes output buffers that do not overlap x.
+static void analysis_53_col(int length, tran_low_t *x,
+                            tran_low_t *lowpass, tran_low_t *highpass) {
+  const int half = length >> 1;
+  const int last = half - 1;
+  tran_low_t prev;
+  int i;
+
+  if (half < 1) return;
+
+  // Predict step. The odd sample is doubled with a multiply because
+  // left-shifting a negative value is undefined behavior in C.
+  for (i = 0; i < last; ++i) {
+    const tran_low_t left = x[2 * i];
+    const tran_low_t mid = x[2 * i + 1];
+    const tran_low_t right = x[2 * i + 2];
+    lowpass[i] = left;
+    highpass[i] = (mid * 2 - (left + right) + 2) >> 2;
+  }
+  // Final pair: no right-hand even neighbour is available.
+  lowpass[last] = x[2 * last];
+  highpass[last] = (x[2 * last + 1] - x[2 * last] + 1) >> 1;
+
+  // Update step: add the rounded mean of the current and previous high-band
+  // values; the first coefficient uses highpass[0] for both.
+  prev = highpass[0];
+  for (i = 0; i < half; ++i) {
+    lowpass[i] += (prev + highpass[i] + 1) >> 1;
+    prev = highpass[i];
+  }
+}
+
+// Multi-level 2-D wavelet analysis using the 5/3 lifting filters.
+//   levels         - maximum number of decomposition levels; decomposition
+//                    stops early once either dimension drops below 2.
+//   width, height  - dimensions of the block.
+//   x, pitch_x     - source samples and their row pitch.
+//   c, pitch_c     - destination coefficients and their row pitch; each
+//                    level is computed in place on the top-left region of c.
+//   dwt_scale_bits - the source is multiplied by 2^dwt_scale_bits before the
+//                    first level (must be non-negative).
+// At every level each row is split into a low band (first hw columns) and a
+// high band (remaining columns), then each column is split the same way.
+// NOTE(review): buffer holds 2 * DWT_MAX_LENGTH entries, so width and height
+// are assumed not to exceed DWT_MAX_LENGTH.
+static void dyadic_analyze_53(int levels, int width, int height,
+                              int16_t *x, int pitch_x,
+                              tran_low_t *c, int pitch_c,
+                              int dwt_scale_bits) {
+  int lv, i, j, nh, nw, hh = height, hw = width;
+  tran_low_t buffer[2 * DWT_MAX_LENGTH];
+  // Pre-scale with a multiply: left-shifting a negative sample is undefined
+  // behavior in C.
+  const int scale = 1 << dwt_scale_bits;
+  for (i = 0; i < height; i++) {
+    for (j = 0; j < width; j++) {
+      c[i * pitch_c + j] = x[i * pitch_x + j] * scale;
+    }
+  }
+  for (lv = 0; lv < levels; lv++) {
+    nh = hh;
+    hh = (hh + 1) >> 1;
+    nw = hw;
+    hw = (hw + 1) >> 1;
+    if ((nh < 2) || (nw < 2)) return;
+    // Rows: copy the row out so the bands can be written back in place.
+    for (i = 0; i < nh; i++) {
+      memcpy(buffer, &c[i * pitch_c], nw * sizeof(tran_low_t));
+      analysis_53_row(nw, buffer, &c[i * pitch_c], &c[i * pitch_c] + hw);
+    }
+    // Columns: gather the column into the upper half of buffer, analyse it
+    // into the lower half, then scatter the result back.
+    for (j = 0; j < nw; j++) {
+      for (i = 0; i < nh; i++)
+        buffer[i + nh] = c[i * pitch_c + j];
+      analysis_53_col(nh, buffer + nh, buffer, buffer + hh);
+      for (i = 0; i < nh; i++)
+        c[i * pitch_c + j] = buffer[i];
+    }
+  }
+}
+
+// One level of 2/6 wavelet analysis, row variant.
+// Each input pair (even, odd) yields a sum (low band) and a difference
+// (high band). When there are at least four pairs, every high-band value is
+// then corrected by (low[k - 1] - low[k + 1] + 4) >> 3, with the index
+// clamped to the valid range at both ends.
+//   length   - number of input samples (expected to be even).
+//   x        - input samples; only read.
+//   lowpass  - receives length / 2 low-band coefficients.
+//   highpass - receives length / 2 high-band coefficients.
+static void analysis_26_row(int length, tran_low_t *x,
+                            tran_low_t *lowpass, tran_low_t *highpass) {
+  const int half = length >> 1;
+  int k;
+
+  for (k = 0; k < half; ++k) {
+    const tran_low_t even = x[2 * k];
+    const tran_low_t odd = x[2 * k + 1];
+    lowpass[k] = even + odd;
+    highpass[k] = even - odd;
+  }
+
+  // Short blocks skip the high-band correction entirely.
+  if (half < 4) return;
+
+  for (k = 0; k < half; ++k) {
+    const tran_low_t before = lowpass[k > 0 ? k - 1 : 0];
+    const tran_low_t after = lowpass[k < half - 1 ? k + 1 : half - 1];
+    highpass[k] -= (before - after + 4) >> 3;
+  }
+}
+
+// One level of 2/6 wavelet analysis, column variant.
+// Each input pair (even, odd) yields a rounded half-sum (low band) and a
+// rounded half-difference (high band). When there are at least four pairs,
+// every high-band value is then corrected by
+// (low[k - 1] - low[k + 1] + 4) >> 3, with the index clamped to the valid
+// range at both ends.
+//   length   - number of input samples (expected to be even).
+//   x        - input samples; only read.
+//   lowpass  - receives length / 2 low-band coefficients.
+//   highpass - receives length / 2 high-band coefficients.
+static void analysis_26_col(int length, tran_low_t *x,
+                            tran_low_t *lowpass, tran_low_t *highpass) {
+  const int half = length >> 1;
+  int k;
+
+  for (k = 0; k < half; ++k) {
+    const tran_low_t even = x[2 * k];
+    const tran_low_t odd = x[2 * k + 1];
+    lowpass[k] = (even + odd + 1) >> 1;
+    highpass[k] = (even - odd + 1) >> 1;
+  }
+
+  // Short blocks skip the high-band correction entirely.
+  if (half < 4) return;
+
+  for (k = 0; k < half; ++k) {
+    const tran_low_t before = lowpass[k > 0 ? k - 1 : 0];
+    const tran_low_t after = lowpass[k < half - 1 ? k + 1 : half - 1];
+    highpass[k] -= (before - after + 4) >> 3;
+  }
+}
+
+// Multi-level 2-D wavelet analysis using the 2/6 filters.
+//   levels         - maximum number of decomposition levels; decomposition
+//                    stops early once either dimension drops below 2.
+//   width, height  - dimensions of the block.
+//   x, pitch_x     - source samples and their row pitch.
+//   c, pitch_c     - destination coefficients and their row pitch; each
+//                    level is computed in place on the top-left region of c.
+//   dwt_scale_bits - the source is multiplied by 2^dwt_scale_bits before the
+//                    first level (must be non-negative).
+// At every level each row is split into a low band (first hw columns) and a
+// high band (remaining columns), then each column is split the same way.
+// NOTE(review): buffer holds 2 * DWT_MAX_LENGTH entries, so width and height
+// are assumed not to exceed DWT_MAX_LENGTH.
+static void dyadic_analyze_26(int levels, int width, int height,
+                              int16_t *x, int pitch_x,
+                              tran_low_t *c, int pitch_c,
+                              int dwt_scale_bits) {
+  int lv, i, j, nh, nw, hh = height, hw = width;
+  tran_low_t buffer[2 * DWT_MAX_LENGTH];
+  // Pre-scale with a multiply: left-shifting a negative sample is undefined
+  // behavior in C.
+  const int scale = 1 << dwt_scale_bits;
+  for (i = 0; i < height; i++) {
+    for (j = 0; j < width; j++) {
+      c[i * pitch_c + j] = x[i * pitch_x + j] * scale;
+    }
+  }
+  for (lv = 0; lv < levels; lv++) {
+    nh = hh;
+    hh = (hh + 1) >> 1;
+    nw = hw;
+    hw = (hw + 1) >> 1;
+    if ((nh < 2) || (nw < 2)) return;
+    // Rows: copy the row out so the bands can be written back in place.
+    for (i = 0; i < nh; i++) {
+      memcpy(buffer, &c[i * pitch_c], nw * sizeof(tran_low_t));
+      analysis_26_row(nw, buffer, &c[i * pitch_c], &c[i * pitch_c] + hw);
+    }
+    // Columns: gather the column into the upper half of buffer, analyse it
+    // into the lower half, then scatter the result back.
+    for (j = 0; j < nw; j++) {
+      for (i = 0; i < nh; i++)
+        buffer[i + nh] = c[i * pitch_c + j];
+      analysis_26_col(nh, buffer + nh, buffer, buffer + hh);
+      for (i = 0; i < nh; i++)
+        c[i * pitch_c + j] = buffer[i];
+    }
+  }
+}
+
+// One level of 9/7 wavelet analysis in double precision, implemented as four
+// lifting steps (predict, update, predict, update) followed by scaling.
+//   length   - number of input samples; the loops below assume it is even,
+//              and it must not exceed DWT_MAX_LENGTH (size of y).
+//   x        - input samples; modified in place by the lifting steps.
+//   lowpass  - receives length / 2 low-band coefficients (even positions).
+//   highpass - receives length / 2 high-band coefficients (odd positions).
+// At the block edges the single available neighbour is counted twice.
+static void analysis_97(int length, double *x,
+                        double *lowpass, double *highpass) {
+  static const double a_predict1 = -1.586134342;
+  static const double a_update1 = -0.05298011854;
+  static const double a_predict2 = 0.8829110762;
+  static const double a_update2 = 0.4435068522;
+  static const double s_low = 1.149604398;
+  static const double s_high = 1/1.149604398;
+  int i;
+  double y[DWT_MAX_LENGTH];
+  // Predict 1: odd samples are adjusted from their even neighbours.
+  for (i = 1; i < length - 2; i += 2) {
+    x[i] += a_predict1 * (x[i - 1] + x[i + 1]);
+  }
+  x[length - 1] += 2 * a_predict1 * x[length - 2];
+  // Update 1: even samples are adjusted from their odd neighbours.
+  for (i = 2; i < length; i += 2) {
+    x[i] += a_update1 * (x[i - 1] + x[i + 1]);
+  }
+  x[0] += 2 * a_update1 * x[1];
+  // Predict 2
+  for (i = 1; i < length - 2; i += 2) {
+    x[i] += a_predict2 * (x[i - 1] + x[i + 1]);
+  }
+  x[length - 1] += 2 * a_predict2 * x[length - 2];
+  // Update 2
+  for (i = 2; i < length; i += 2) {
+    x[i] += a_update2 * (x[i - 1] + x[i + 1]);
+  }
+  x[0] += 2 * a_update2 * x[1];
+  // Snapshot the lifted samples; presumably this lets the output buffers
+  // overlap x without corrupting the packing below.
+  memcpy(y, x, sizeof(*y) * length);
+  // Scale and pack
+  for (i = 0; i < length / 2; i++) {
+    lowpass[i] = y[2 * i] * s_low;
+    highpass[i] = y[2 * i + 1] * s_high;
+  }
+}
+
+// Multi-level 2-D wavelet analysis using the double-precision 9/7 filters.
+//   levels         - maximum number of decomposition levels; decomposition
+//                    stops early once either dimension drops below 2.
+//   width, height  - dimensions of the block (each at most DWT_MAX_LENGTH,
+//                    the row pitch of the working array y).
+//   x, pitch_x     - source samples and their row pitch.
+//   c, pitch_c     - destination coefficients and their row pitch.
+//   dwt_scale_bits - the source is multiplied by 2^dwt_scale_bits before the
+//                    first level (must be non-negative).
+// All levels are computed in the double array y; the result is rounded to
+// the nearest integer and stored to c only at the very end.
+static void dyadic_analyze_97(int levels, int width, int height,
+                              int16_t *x, int pitch_x,
+                              tran_low_t *c, int pitch_c,
+                              int dwt_scale_bits) {
+  int lv, i, j, nh, nw, hh = height, hw = width;
+  double buffer[2 * DWT_MAX_LENGTH];
+  double y[DWT_MAX_LENGTH * DWT_MAX_LENGTH];
+  // Pre-scale with a multiply: left-shifting a negative sample is undefined
+  // behavior in C.
+  const int scale = 1 << dwt_scale_bits;
+  for (i = 0; i < height; i++) {
+    for (j = 0; j < width; j++) {
+      y[i * DWT_MAX_LENGTH + j] = x[i * pitch_x + j] * scale;
+    }
+  }
+  for (lv = 0; lv < levels; lv++) {
+    nh = hh;
+    hh = (hh + 1) >> 1;
+    nw = hw;
+    hw = (hw + 1) >> 1;
+    // Stop decomposing, but still fall through to the write-back below:
+    // returning here would leave c unwritten because all work so far lives
+    // only in y.
+    if ((nh < 2) || (nw < 2)) break;
+    // Rows: copy the row out so the bands can be written back in place.
+    for (i = 0; i < nh; i++) {
+      memcpy(buffer, &y[i * DWT_MAX_LENGTH], nw * sizeof(*buffer));
+      analysis_97(nw, buffer, &y[i * DWT_MAX_LENGTH],
+                  &y[i * DWT_MAX_LENGTH] + hw);
+    }
+    // Columns: gather the column into the upper half of buffer, analyse it
+    // into the lower half, then scatter the result back.
+    for (j = 0; j < nw; j++) {
+      for (i = 0; i < nh; i++)
+        buffer[i + nh] = y[i * DWT_MAX_LENGTH + j];
+      analysis_97(nh, buffer + nh, buffer, buffer + hh);
+      for (i = 0; i < nh; i++)
+        y[i * DWT_MAX_LENGTH + j] = buffer[i];
+    }
+  }
+  // Round the floating-point coefficients into the integer output.
+  for (i = 0; i < height; i++) {
+    for (j = 0; j < width; j++) {
+      c[i * pitch_c + j] = round(y[i * DWT_MAX_LENGTH + j]);
+    }
+  }
+}
+
+// Forward 32x32 wavelet transform: up to four decomposition levels of the
+// wavelet selected at compile time by DWT_TYPE (26, 53 or 97), with the
+// input pre-scaled by 2 bits. input is read with the given stride and the
+// coefficients are written to output with a pitch of 32.
+// NOTE(review): the dyadic_analyze_* helpers declare their source as
+// int16_t *, while input here is tran_low_t *; these only agree when
+// tran_low_t is a 16-bit type - confirm against the typedef.
+void vp9_fdwt32x32_c(tran_low_t *input, tran_low_t *output, int stride) {
+#if DWT_TYPE == 26
+  dyadic_analyze_26(4, 32, 32, input, stride, output, 32, 2);
+#elif DWT_TYPE == 97
+  dyadic_analyze_97(4, 32, 32, input, stride, output, 32, 2);
+#elif DWT_TYPE == 53
+  dyadic_analyze_53(4, 32, 32, input, stride, output, 32, 2);
+#else
+  // Fail the build rather than silently leaving output unwritten.
+#error "Unsupported DWT_TYPE: expected 26, 53 or 97"
+#endif
+}
+
+// Forward 32x32 hybrid transform: a single wavelet decomposition level
+// (wavelet chosen at compile time by DWT_TYPE) followed by a 16x16 DCT
+// applied to the top-left 16x16 (low-low) band. input is read with the given
+// stride and the coefficients are written to output with a pitch of 32.
+// NOTE(review): the dyadic_analyze_* helpers declare their source as
+// int16_t *, while input here is tran_low_t *; these only agree when
+// tran_low_t is a 16-bit type - confirm against the typedef.
+void vp9_fdwtdct32x32_c(tran_low_t *input, tran_low_t *output,
+                        int stride) {
+  const int dwt_levels = 1;
+  tran_low_t buffer[16 * 16];
+  int i;
+  // Scales up by 2-bit from unitary
+#if DWT_TYPE == 26
+  dyadic_analyze_26(dwt_levels, 32, 32, input, stride, output, 32, 2);
+#elif DWT_TYPE == 97
+  dyadic_analyze_97(dwt_levels, 32, 32, input, stride, output, 32, 2);
+#elif DWT_TYPE == 53
+  dyadic_analyze_53(dwt_levels, 32, 32, input, stride, output, 32, 2);
+#else
+  // Fail the build rather than running the DCT on an unwritten output.
+#error "Unsupported DWT_TYPE: expected 26, 53 or 97"
+#endif
+  // 16x16 dct in LL band that is unitary
+  vp9_fdct16x16_noscale(output, buffer, 32);
+  // Note that the transform overall is 2-bit scaled up from unitary
+  // Copy the DCT result back over the LL band, one 16-sample row at a time.
+  for (i = 0; i < 16; ++i) {
+    memcpy(&output[i * 32], &buffer[i * 16], sizeof(buffer[0]) * 16);
+  }
+}
+
+#if CONFIG_TX64X64
+// Forward 64x64 wavelet transform: up to four decomposition levels of the
+// wavelet selected at compile time by DWT_TYPE (26, 53 or 97), with the
+// input pre-scaled by 1 bit. input is read with the given stride and the
+// coefficients are written to output with a pitch of 64.
+// NOTE(review): the dyadic_analyze_* helpers declare their source as
+// int16_t *, while input here is tran_low_t *; these only agree when
+// tran_low_t is a 16-bit type - confirm against the typedef.
+void vp9_fdwt64x64_c(tran_low_t *input, tran_low_t *output, int stride) {
+#if DWT_TYPE == 26
+  dyadic_analyze_26(4, 64, 64, input, stride, output, 64, 1);
+#elif DWT_TYPE == 97
+  dyadic_analyze_97(4, 64, 64, input, stride, output, 64, 1);
+#elif DWT_TYPE == 53
+  dyadic_analyze_53(4, 64, 64, input, stride, output, 64, 1);
+#else
+  // Fail the build rather than silently leaving output unwritten.
+#error "Unsupported DWT_TYPE: expected 26, 53 or 97"
+#endif
+}
+
+// Forward 64x64 hybrid transform: a single wavelet decomposition level
+// (wavelet chosen at compile time by DWT_TYPE) followed by a 32x32 DCT
+// applied to the top-left 32x32 (low-low) band. input is read with the given
+// stride and the coefficients are written to output with a pitch of 64.
+// NOTE(review): the dyadic_analyze_* helpers declare their source as
+// int16_t *, while input here is tran_low_t *; these only agree when
+// tran_low_t is a 16-bit type - confirm against the typedef.
+void vp9_fdwtdct64x64_c(tran_low_t *input, tran_low_t *output,
+                        int stride) {
+  const int dwt_levels = 1;
+  tran_low_t buffer[32 * 32];
+  int i;
+  // Scales up by 1-bit from unitary
+#if DWT_TYPE == 26
+  dyadic_analyze_26(dwt_levels, 64, 64, input, stride, output, 64, 1);
+#elif DWT_TYPE == 97
+  dyadic_analyze_97(dwt_levels, 64, 64, input, stride, output, 64, 1);
+#elif DWT_TYPE == 53
+  dyadic_analyze_53(dwt_levels, 64, 64, input, stride, output, 64, 1);
+#else
+  // Fail the build rather than running the DCT on an unwritten output.
+#error "Unsupported DWT_TYPE: expected 26, 53 or 97"
+#endif
+  // 32x32 dct in LL band that is unitary
+  vp9_fdct32x32_noscale(output, buffer, 64);
+  // Note that the transform overall is 1-bit scaled up from unitary
+  // Copy the DCT result back over the LL band, one 32-sample row at a time.
+  for (i = 0; i < 32; ++i) {
+    memcpy(&output[i * 64], &buffer[i * 32], sizeof(buffer[0]) * 32);
+  }
+}
+#endif  // CONFIG_TX64X64
--- a/vp9/encoder/vp9_dwt.h
+++ b/vp9/encoder/vp9_dwt.h
@ -0,0 +1,32 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_VP9_DWT_H_
+#define VP9_ENCODER_VP9_DWT_H_
+
+#include "./vpx_config.h"
+#include "vp9/common/vp9_idwt.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Forward wavelet (fdwt) and hybrid wavelet + DCT (fdwtdct) block
+// transforms. input is read with the given stride; output receives the
+// coefficients as a dense square block (pitch equal to the block size).
+// NOTE(review): the definitions in vp9/encoder/vp9_dwt.c carry a _c suffix;
+// presumably these names are mapped onto them elsewhere - confirm.
+#if CONFIG_TX64X64
+void vp9_fdwt64x64(tran_low_t *input, tran_low_t *output, int stride);
+void vp9_fdwtdct64x64(tran_low_t *input, tran_low_t *output, int stride);
+#endif  // CONFIG_TX64X64
+void vp9_fdwt32x32(tran_low_t *input, tran_low_t *output, int stride);
+void vp9_fdwtdct32x32(tran_low_t *input, tran_low_t *output, int stride);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_ENCODER_VP9_DWT_H_
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@ -70,6 +70,8 @@ VP9_COMMON_SRCS-yes += common/vp9_scan.c
 VP9_COMMON_SRCS-yes += common/vp9_scan.h
 VP9_COMMON_SRCS-$(CONFIG_GLOBAL_MOTION) += common/vp9_motion_model.c
 VP9_COMMON_SRCS-$(CONFIG_GLOBAL_MOTION) += common/vp9_motion_model.h
+VP9_COMMON_SRCS-$(CONFIG_WAVELETS) += common/vp9_idwt.c
+VP9_COMMON_SRCS-$(CONFIG_WAVELETS) += common/vp9_idwt.h

 VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_asm_stubs.c
 VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_intrin_sse2.c
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@ -85,6 +85,8 @@ VP9_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += encoder/vp9_global_motion.c
 VP9_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += encoder/vp9_global_motion.h
 VP9_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += encoder/vp9_motion_field.c
 VP9_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += encoder/vp9_motion_field.h
+VP9_CX_SRCS-$(CONFIG_WAVELETS) += encoder/vp9_dwt.c
+VP9_CX_SRCS-$(CONFIG_WAVELETS) += encoder/vp9_dwt.h
 VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_ssim.c
 VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_ssim.h
 VP9_CX_SRCS-yes += encoder/vp9_tokenize.c