Adding a 64x64 transform mode
Preliminary 64x64 transform implementation. Includes all code changes. All mismatches resolved. Coding results for derf and stdhd are within noise. stdhd is slightly higher, derf is slightly lower. To be further refined. Change-Id: I091c183f62b156d23ed6f648202eb96c82e69b4b
This commit is contained in:
parent
cf608110fc
commit
0c7a94f49b
1
configure
vendored
1
configure
vendored
@ -282,6 +282,7 @@ EXPERIMENT_LIST="
|
||||
vp9_temporal_denoising
|
||||
fp_mb_stats
|
||||
emulate_hardware
|
||||
tx64x64
|
||||
"
|
||||
CONFIG_LIST="
|
||||
external_build
|
||||
|
@ -101,22 +101,35 @@ const TX_SIZE max_txsize_lookup[BLOCK_SIZES] = {
|
||||
TX_4X4, TX_4X4, TX_4X4,
|
||||
TX_8X8, TX_8X8, TX_8X8,
|
||||
TX_16X16, TX_16X16, TX_16X16,
|
||||
TX_32X32, TX_32X32, TX_32X32, TX_32X32
|
||||
TX_32X32, TX_32X32, TX_32X32,
|
||||
#if CONFIG_TX64X64
|
||||
TX_64X64,
|
||||
#else
|
||||
TX_32X32,
|
||||
#endif
|
||||
};
|
||||
|
||||
const BLOCK_SIZE txsize_to_bsize[TX_SIZES] = {
|
||||
BLOCK_4X4, // TX_4X4
|
||||
BLOCK_8X8, // TX_8X8
|
||||
BLOCK_4X4, // TX_4X4
|
||||
BLOCK_8X8, // TX_8X8
|
||||
BLOCK_16X16, // TX_16X16
|
||||
BLOCK_32X32, // TX_32X32
|
||||
#if CONFIG_TX64X64
|
||||
BLOCK_32X32, // TX_64X64
|
||||
#endif
|
||||
};
|
||||
|
||||
const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES] = {
|
||||
TX_4X4, // ONLY_4X4
|
||||
TX_8X8, // ALLOW_8X8
|
||||
TX_4X4, // ONLY_4X4
|
||||
TX_8X8, // ALLOW_8X8
|
||||
TX_16X16, // ALLOW_16X16
|
||||
TX_32X32, // ALLOW_32X32
|
||||
#if CONFIG_TX64X64
|
||||
TX_64X64, // ALLOW_64X64
|
||||
TX_64X64, // TX_MODE_SELECT
|
||||
#else
|
||||
TX_32X32, // TX_MODE_SELECT
|
||||
#endif
|
||||
};
|
||||
|
||||
const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2] = {
|
||||
|
@ -44,7 +44,7 @@ const vp9_prob vp9_cat6_prob_high12[] = {
|
||||
};
|
||||
#endif
|
||||
|
||||
const uint8_t vp9_coefband_trans_8x8plus[1024] = {
|
||||
const uint8_t vp9_coefband_trans_8x8plus[MAX_NUM_COEFS] = {
|
||||
0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 5,
|
||||
// beyond MAXBAND_INDEX+1 all values are filled as 5
|
||||
@ -111,6 +111,200 @@ const uint8_t vp9_coefband_trans_8x8plus[1024] = {
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
#if CONFIG_TX64X64
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
#endif
|
||||
};
|
||||
|
||||
const uint8_t vp9_coefband_trans_4x4[16] = {
|
||||
@ -736,6 +930,92 @@ static const vp9_coeff_probs_model default_coef_probs_32x32[PLANE_TYPES] = {
|
||||
}
|
||||
};
|
||||
|
||||
#if CONFIG_TX64X64
|
||||
static const vp9_coeff_probs_model default_coef_probs_64x64[PLANE_TYPES] = {
|
||||
{ // Y plane
|
||||
{ // Intra
|
||||
{ // Band 0
|
||||
{ 17, 38, 140 }, { 7, 34, 80 }, { 1, 17, 29 }
|
||||
}, { // Band 1
|
||||
{ 37, 75, 128 }, { 41, 76, 128 }, { 26, 66, 116 },
|
||||
{ 12, 52, 94 }, { 2, 32, 55 }, { 1, 10, 16 }
|
||||
}, { // Band 2
|
||||
{ 50, 127, 154 }, { 37, 109, 152 }, { 16, 82, 121 },
|
||||
{ 5, 59, 85 }, { 1, 35, 54 }, { 1, 13, 20 }
|
||||
}, { // Band 3
|
||||
{ 40, 142, 167 }, { 17, 110, 157 }, { 2, 71, 112 },
|
||||
{ 1, 44, 72 }, { 1, 27, 45 }, { 1, 11, 17 }
|
||||
}, { // Band 4
|
||||
{ 30, 175, 188 }, { 9, 124, 169 }, { 1, 74, 116 },
|
||||
{ 1, 48, 78 }, { 1, 30, 49 }, { 1, 11, 18 }
|
||||
}, { // Band 5
|
||||
{ 10, 222, 223 }, { 2, 150, 194 }, { 1, 83, 128 },
|
||||
{ 1, 48, 79 }, { 1, 27, 45 }, { 1, 11, 17 }
|
||||
}
|
||||
}, { // Inter
|
||||
{ // Band 0
|
||||
{ 36, 41, 235 }, { 29, 36, 193 }, { 10, 27, 111 }
|
||||
}, { // Band 1
|
||||
{ 85, 165, 222 }, { 177, 162, 215 }, { 110, 135, 195 },
|
||||
{ 57, 113, 168 }, { 23, 83, 120 }, { 10, 49, 61 }
|
||||
}, { // Band 2
|
||||
{ 85, 190, 223 }, { 36, 139, 200 }, { 5, 90, 146 },
|
||||
{ 1, 60, 103 }, { 1, 38, 65 }, { 1, 18, 30 }
|
||||
}, { // Band 3
|
||||
{ 72, 202, 223 }, { 23, 141, 199 }, { 2, 86, 140 },
|
||||
{ 1, 56, 97 }, { 1, 36, 61 }, { 1, 16, 27 }
|
||||
}, { // Band 4
|
||||
{ 55, 218, 225 }, { 13, 145, 200 }, { 1, 86, 141 },
|
||||
{ 1, 57, 99 }, { 1, 35, 61 }, { 1, 13, 22 }
|
||||
}, { // Band 5
|
||||
{ 15, 235, 212 }, { 1, 132, 184 }, { 1, 84, 139 },
|
||||
{ 1, 57, 97 }, { 1, 34, 56 }, { 1, 14, 23 }
|
||||
}
|
||||
}
|
||||
}, { // UV plane
|
||||
{ // Intra
|
||||
{ // Band 0
|
||||
{ 181, 21, 201 }, { 61, 37, 123 }, { 10, 38, 71 }
|
||||
}, { // Band 1
|
||||
{ 47, 106, 172 }, { 95, 104, 173 }, { 42, 93, 159 },
|
||||
{ 18, 77, 131 }, { 4, 50, 81 }, { 1, 17, 23 }
|
||||
}, { // Band 2
|
||||
{ 62, 147, 199 }, { 44, 130, 189 }, { 28, 102, 154 },
|
||||
{ 18, 75, 115 }, { 2, 44, 65 }, { 1, 12, 19 }
|
||||
}, { // Band 3
|
||||
{ 55, 153, 210 }, { 24, 130, 194 }, { 3, 93, 146 },
|
||||
{ 1, 61, 97 }, { 1, 31, 50 }, { 1, 10, 16 }
|
||||
}, { // Band 4
|
||||
{ 49, 186, 223 }, { 17, 148, 204 }, { 1, 96, 142 },
|
||||
{ 1, 53, 83 }, { 1, 26, 44 }, { 1, 11, 17 }
|
||||
}, { // Band 5
|
||||
{ 13, 217, 212 }, { 2, 136, 180 }, { 1, 78, 124 },
|
||||
{ 1, 50, 83 }, { 1, 29, 49 }, { 1, 14, 23 }
|
||||
}
|
||||
}, { // Inter
|
||||
{ // Band 0
|
||||
{ 197, 13, 247 }, { 82, 17, 222 }, { 25, 17, 162 }
|
||||
}, { // Band 1
|
||||
{ 126, 186, 247 }, { 234, 191, 243 }, { 176, 177, 234 },
|
||||
{ 104, 158, 220 }, { 66, 128, 186 }, { 55, 90, 137 }
|
||||
}, { // Band 2
|
||||
{ 111, 197, 242 }, { 46, 158, 219 }, { 9, 104, 171 },
|
||||
{ 2, 65, 125 }, { 1, 44, 80 }, { 1, 17, 91 }
|
||||
}, { // Band 3
|
||||
{ 104, 208, 245 }, { 39, 168, 224 }, { 3, 109, 162 },
|
||||
{ 1, 79, 124 }, { 1, 50, 102 }, { 1, 43, 102 }
|
||||
}, { // Band 4
|
||||
{ 84, 220, 246 }, { 31, 177, 231 }, { 2, 115, 180 },
|
||||
{ 1, 79, 134 }, { 1, 55, 77 }, { 1, 60, 79 }
|
||||
}, { // Band 5
|
||||
{ 43, 243, 240 }, { 8, 180, 217 }, { 1, 115, 166 },
|
||||
{ 1, 84, 121 }, { 1, 51, 67 }, { 1, 16, 6 }
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
#endif // CONFIG_TX64X64
|
||||
|
||||
static void extend_to_full_distribution(vp9_prob *probs, vp9_prob p) {
|
||||
vpx_memcpy(probs, vp9_pareto8_full[p = 0 ? 0 : p - 1],
|
||||
MODEL_NODES * sizeof(vp9_prob));
|
||||
@ -752,6 +1032,9 @@ void vp9_default_coef_probs(VP9_COMMON *cm) {
|
||||
vp9_copy(cm->fc.coef_probs[TX_8X8], default_coef_probs_8x8);
|
||||
vp9_copy(cm->fc.coef_probs[TX_16X16], default_coef_probs_16x16);
|
||||
vp9_copy(cm->fc.coef_probs[TX_32X32], default_coef_probs_32x32);
|
||||
#if CONFIG_TX64X64
|
||||
vp9_copy(cm->fc.coef_probs[TX_64X64], default_coef_probs_64x64);
|
||||
#endif
|
||||
}
|
||||
|
||||
#define COEF_COUNT_SAT 24
|
||||
@ -806,6 +1089,6 @@ void vp9_adapt_coef_probs(VP9_COMMON *cm) {
|
||||
update_factor = COEF_MAX_UPDATE_FACTOR;
|
||||
count_sat = COEF_COUNT_SAT;
|
||||
}
|
||||
for (t = TX_4X4; t <= TX_32X32; t++)
|
||||
for (t = TX_4X4; t < TX_SIZES; t++)
|
||||
adapt_coef_probs(cm, t, count_sat, update_factor);
|
||||
}
|
||||
|
@ -90,10 +90,20 @@ extern const vp9_extra_bit vp9_extra_bits_high10[ENTROPY_TOKENS];
|
||||
extern const vp9_extra_bit vp9_extra_bits_high12[ENTROPY_TOKENS];
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
||||
#if CONFIG_TX64X64
|
||||
#define DCT_MAX_VALUE 32768
|
||||
#else
|
||||
#define DCT_MAX_VALUE 16384
|
||||
#endif // CONFIG_TX64X64
|
||||
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
#if CONFIG_TX64X64
|
||||
#define DCT_MAX_VALUE_HIGH10 131072
|
||||
#define DCT_MAX_VALUE_HIGH12 524288
|
||||
#else
|
||||
#define DCT_MAX_VALUE_HIGH10 65536
|
||||
#define DCT_MAX_VALUE_HIGH12 262144
|
||||
#endif // CONFIG_TX64X64
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
||||
/* Coefficients are predicted via a 3-dimensional probability table. */
|
||||
@ -153,7 +163,14 @@ static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) {
|
||||
// This macro is currently unused but may be used by certain implementations
|
||||
#define MAXBAND_INDEX 21
|
||||
|
||||
DECLARE_ALIGNED(16, extern const uint8_t, vp9_coefband_trans_8x8plus[1024]);
|
||||
#if CONFIG_TX64X64
|
||||
#define MAX_NUM_COEFS 4096
|
||||
#else
|
||||
#define MAX_NUM_COEFS 1024
|
||||
#endif
|
||||
|
||||
DECLARE_ALIGNED(16, extern const uint8_t,
|
||||
vp9_coefband_trans_8x8plus[MAX_NUM_COEFS]);
|
||||
DECLARE_ALIGNED(16, extern const uint8_t, vp9_coefband_trans_4x4[16]);
|
||||
|
||||
static INLINE const uint8_t *get_band_translate(TX_SIZE tx_size) {
|
||||
@ -204,6 +221,12 @@ static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a,
|
||||
above_ec = !!*(const uint64_t *)a;
|
||||
left_ec = !!*(const uint64_t *)l;
|
||||
break;
|
||||
#if CONFIG_TX64X64
|
||||
case TX_64X64:
|
||||
above_ec = !!*(const uint64_t *)a;
|
||||
left_ec = !!*(const uint64_t *)l;
|
||||
break;
|
||||
#endif
|
||||
default:
|
||||
assert(0 && "Invalid transform size.");
|
||||
break;
|
||||
|
@ -229,7 +229,7 @@ const vp9_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)] = {
|
||||
-D135_PRED, -D117_PRED, /* 5 = D135_NODE */
|
||||
-D45_PRED, 14, /* 6 = D45_NODE */
|
||||
-D63_PRED, 16, /* 7 = D63_NODE */
|
||||
-D153_PRED, -D207_PRED /* 8 = D153_NODE */
|
||||
-D153_PRED, -D207_PRED /* 8 = D153_NODE */
|
||||
};
|
||||
|
||||
const vp9_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)] = {
|
||||
@ -265,6 +265,11 @@ static const vp9_prob default_single_ref_p[REF_CONTEXTS][2] = {
|
||||
};
|
||||
|
||||
static const struct tx_probs default_tx_probs = {
|
||||
#if CONFIG_TX64X64
|
||||
{ { 3, 3, 136, 37 },
|
||||
{ 3, 5, 52, 13 } },
|
||||
#endif
|
||||
|
||||
{ { 3, 136, 37 },
|
||||
{ 5, 52, 13 } },
|
||||
|
||||
@ -275,6 +280,26 @@ static const struct tx_probs default_tx_probs = {
|
||||
{ 66 } }
|
||||
};
|
||||
|
||||
#if CONFIG_TX64X64
|
||||
void tx_counts_to_branch_counts_64x64(const unsigned int *tx_count_64x64p,
|
||||
unsigned int (*ct_64x64p)[2]) {
|
||||
ct_64x64p[0][0] = tx_count_64x64p[TX_4X4];
|
||||
ct_64x64p[0][1] = tx_count_64x64p[TX_8X8] +
|
||||
tx_count_64x64p[TX_16X16] +
|
||||
tx_count_64x64p[TX_32X32] +
|
||||
tx_count_64x64p[TX_64X64];
|
||||
ct_64x64p[1][0] = tx_count_64x64p[TX_8X8];
|
||||
ct_64x64p[1][1] = tx_count_64x64p[TX_16X16] +
|
||||
tx_count_64x64p[TX_32X32] +
|
||||
tx_count_64x64p[TX_64X64];
|
||||
ct_64x64p[2][0] = tx_count_64x64p[TX_16X16];
|
||||
ct_64x64p[2][1] = tx_count_64x64p[TX_32X32] +
|
||||
tx_count_64x64p[TX_64X64];
|
||||
ct_64x64p[3][0] = tx_count_64x64p[TX_32X32];
|
||||
ct_64x64p[3][1] = tx_count_64x64p[TX_64X64];
|
||||
}
|
||||
#endif
|
||||
|
||||
void tx_counts_to_branch_counts_32x32(const unsigned int *tx_count_32x32p,
|
||||
unsigned int (*ct_32x32p)[2]) {
|
||||
ct_32x32p[0][0] = tx_count_32x32p[TX_4X4];
|
||||
@ -392,25 +417,34 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) {
|
||||
|
||||
if (cm->tx_mode == TX_MODE_SELECT) {
|
||||
int j;
|
||||
unsigned int branch_ct_8x8p[TX_SIZES - 3][2];
|
||||
unsigned int branch_ct_16x16p[TX_SIZES - 2][2];
|
||||
unsigned int branch_ct_32x32p[TX_SIZES - 1][2];
|
||||
unsigned int branch_ct_8x8p[1][2];
|
||||
unsigned int branch_ct_16x16p[2][2];
|
||||
unsigned int branch_ct_32x32p[3][2];
|
||||
#if CONFIG_TX64X64
|
||||
unsigned int branch_ct_64x64p[4][2];
|
||||
#endif
|
||||
|
||||
for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
|
||||
tx_counts_to_branch_counts_8x8(counts->tx.p8x8[i], branch_ct_8x8p);
|
||||
for (j = 0; j < TX_SIZES - 3; ++j)
|
||||
for (j = 0; j < 1; ++j)
|
||||
fc->tx_probs.p8x8[i][j] = adapt_prob(pre_fc->tx_probs.p8x8[i][j],
|
||||
branch_ct_8x8p[j]);
|
||||
|
||||
tx_counts_to_branch_counts_16x16(counts->tx.p16x16[i], branch_ct_16x16p);
|
||||
for (j = 0; j < TX_SIZES - 2; ++j)
|
||||
for (j = 0; j < 2; ++j)
|
||||
fc->tx_probs.p16x16[i][j] = adapt_prob(pre_fc->tx_probs.p16x16[i][j],
|
||||
branch_ct_16x16p[j]);
|
||||
|
||||
tx_counts_to_branch_counts_32x32(counts->tx.p32x32[i], branch_ct_32x32p);
|
||||
for (j = 0; j < TX_SIZES - 1; ++j)
|
||||
for (j = 0; j < 3; ++j)
|
||||
fc->tx_probs.p32x32[i][j] = adapt_prob(pre_fc->tx_probs.p32x32[i][j],
|
||||
branch_ct_32x32p[j]);
|
||||
#if CONFIG_TX64X64
|
||||
tx_counts_to_branch_counts_64x64(counts->tx.p64x64[i], branch_ct_64x64p);
|
||||
for (j = 0; j < 4; ++j)
|
||||
fc->tx_probs.p64x64[i][j] = adapt_prob(pre_fc->tx_probs.p64x64[i][j],
|
||||
branch_ct_64x64p[j]);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -24,15 +24,21 @@ extern "C" {
|
||||
struct VP9Common;
|
||||
|
||||
struct tx_probs {
|
||||
vp9_prob p32x32[TX_SIZE_CONTEXTS][TX_SIZES - 1];
|
||||
vp9_prob p16x16[TX_SIZE_CONTEXTS][TX_SIZES - 2];
|
||||
vp9_prob p8x8[TX_SIZE_CONTEXTS][TX_SIZES - 3];
|
||||
#if CONFIG_TX64X64
|
||||
vp9_prob p64x64[TX_SIZE_CONTEXTS][4];
|
||||
#endif
|
||||
vp9_prob p32x32[TX_SIZE_CONTEXTS][3];
|
||||
vp9_prob p16x16[TX_SIZE_CONTEXTS][2];
|
||||
vp9_prob p8x8[TX_SIZE_CONTEXTS][1];
|
||||
};
|
||||
|
||||
struct tx_counts {
|
||||
unsigned int p32x32[TX_SIZE_CONTEXTS][TX_SIZES];
|
||||
unsigned int p16x16[TX_SIZE_CONTEXTS][TX_SIZES - 1];
|
||||
unsigned int p8x8[TX_SIZE_CONTEXTS][TX_SIZES - 2];
|
||||
#if CONFIG_TX64X64
|
||||
unsigned int p64x64[TX_SIZE_CONTEXTS][5];
|
||||
#endif
|
||||
unsigned int p32x32[TX_SIZE_CONTEXTS][4];
|
||||
unsigned int p16x16[TX_SIZE_CONTEXTS][3];
|
||||
unsigned int p8x8[TX_SIZE_CONTEXTS][2];
|
||||
};
|
||||
|
||||
typedef struct frame_contexts {
|
||||
@ -88,6 +94,10 @@ void vp9_init_mode_probs(FRAME_CONTEXT *fc);
|
||||
|
||||
void vp9_adapt_mode_probs(struct VP9Common *cm);
|
||||
|
||||
#if CONFIG_TX64X64
|
||||
void tx_counts_to_branch_counts_64x64(const unsigned int *tx_count_64x64p,
|
||||
unsigned int (*ct_64x64p)[2]);
|
||||
#endif
|
||||
void tx_counts_to_branch_counts_32x32(const unsigned int *tx_count_32x32p,
|
||||
unsigned int (*ct_32x32p)[2]);
|
||||
void tx_counts_to_branch_counts_16x16(const unsigned int *tx_count_16x16p,
|
||||
|
@ -77,6 +77,9 @@ typedef enum {
|
||||
TX_8X8 = 1, // 8x8 transform
|
||||
TX_16X16 = 2, // 16x16 transform
|
||||
TX_32X32 = 3, // 32x32 transform
|
||||
#if CONFIG_TX64X64
|
||||
TX_64X64 = 4, // 64x64 transform
|
||||
#endif
|
||||
TX_SIZES
|
||||
} TX_SIZE;
|
||||
|
||||
@ -86,8 +89,11 @@ typedef enum {
|
||||
ALLOW_8X8 = 1, // allow block transform size up to 8x8
|
||||
ALLOW_16X16 = 2, // allow block transform size up to 16x16
|
||||
ALLOW_32X32 = 3, // allow block transform size up to 32x32
|
||||
TX_MODE_SELECT = 4, // transform specified for each block
|
||||
TX_MODES = 5,
|
||||
#if CONFIG_TX64X64
|
||||
ALLOW_64X64 = 4, // allow block transform size up to 32x32
|
||||
#endif
|
||||
TX_MODE_SELECT, // transform specified for each block
|
||||
TX_MODES,
|
||||
} TX_MODE;
|
||||
|
||||
typedef enum {
|
||||
|
@ -1457,6 +1457,458 @@ void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
|
||||
}
|
||||
}
|
||||
|
||||
#if CONFIG_TX64X64
|
||||
#define DownshiftMultiplyBy2(x) x * 2
|
||||
#define DownshiftMultiply(x) x
|
||||
|
||||
static void idct16f(double *input, double *output, int stride) {
|
||||
static const double C1 = 0.995184726672197;
|
||||
static const double C2 = 0.98078528040323;
|
||||
static const double C3 = 0.956940335732209;
|
||||
static const double C4 = 0.923879532511287;
|
||||
static const double C5 = 0.881921264348355;
|
||||
static const double C6 = 0.831469612302545;
|
||||
static const double C7 = 0.773010453362737;
|
||||
static const double C8 = 0.707106781186548;
|
||||
static const double C9 = 0.634393284163646;
|
||||
static const double C10 = 0.555570233019602;
|
||||
static const double C11 = 0.471396736825998;
|
||||
static const double C12 = 0.38268343236509;
|
||||
static const double C13 = 0.290284677254462;
|
||||
static const double C14 = 0.195090322016128;
|
||||
static const double C15 = 0.098017140329561;
|
||||
|
||||
double step[16];
|
||||
double intermediate[16];
|
||||
double temp1, temp2;
|
||||
|
||||
// step 1 and 2
|
||||
step[ 0] = input[stride*0] + input[stride*8];
|
||||
step[ 1] = input[stride*0] - input[stride*8];
|
||||
|
||||
temp1 = input[stride*4]*C12;
|
||||
temp2 = input[stride*12]*C4;
|
||||
|
||||
temp1 -= temp2;
|
||||
temp1 = DownshiftMultiply(temp1);
|
||||
temp1 *= C8;
|
||||
|
||||
step[ 2] = DownshiftMultiplyBy2(temp1);
|
||||
|
||||
temp1 = input[stride*4]*C4;
|
||||
temp2 = input[stride*12]*C12;
|
||||
temp1 += temp2;
|
||||
temp1 = DownshiftMultiply(temp1);
|
||||
temp1 *= C8;
|
||||
step[ 3] = DownshiftMultiplyBy2(temp1);
|
||||
|
||||
temp1 = input[stride*2]*C8;
|
||||
temp1 = DownshiftMultiplyBy2(temp1);
|
||||
temp2 = input[stride*6] + input[stride*10];
|
||||
|
||||
step[ 4] = temp1 + temp2;
|
||||
step[ 5] = temp1 - temp2;
|
||||
|
||||
temp1 = input[stride*14]*C8;
|
||||
temp1 = DownshiftMultiplyBy2(temp1);
|
||||
temp2 = input[stride*6] - input[stride*10];
|
||||
|
||||
step[ 6] = temp2 - temp1;
|
||||
step[ 7] = temp2 + temp1;
|
||||
|
||||
// for odd input
|
||||
temp1 = input[stride*3]*C12;
|
||||
temp2 = input[stride*13]*C4;
|
||||
temp1 += temp2;
|
||||
temp1 = DownshiftMultiply(temp1);
|
||||
temp1 *= C8;
|
||||
intermediate[ 8] = DownshiftMultiplyBy2(temp1);
|
||||
|
||||
temp1 = input[stride*3]*C4;
|
||||
temp2 = input[stride*13]*C12;
|
||||
temp2 -= temp1;
|
||||
temp2 = DownshiftMultiply(temp2);
|
||||
temp2 *= C8;
|
||||
intermediate[ 9] = DownshiftMultiplyBy2(temp2);
|
||||
|
||||
intermediate[10] = DownshiftMultiplyBy2(input[stride*9]*C8);
|
||||
intermediate[11] = input[stride*15] - input[stride*1];
|
||||
intermediate[12] = input[stride*15] + input[stride*1];
|
||||
intermediate[13] = DownshiftMultiplyBy2((input[stride*7]*C8));
|
||||
|
||||
temp1 = input[stride*11]*C12;
|
||||
temp2 = input[stride*5]*C4;
|
||||
temp2 -= temp1;
|
||||
temp2 = DownshiftMultiply(temp2);
|
||||
temp2 *= C8;
|
||||
intermediate[14] = DownshiftMultiplyBy2(temp2);
|
||||
|
||||
temp1 = input[stride*11]*C4;
|
||||
temp2 = input[stride*5]*C12;
|
||||
temp1 += temp2;
|
||||
temp1 = DownshiftMultiply(temp1);
|
||||
temp1 *= C8;
|
||||
intermediate[15] = DownshiftMultiplyBy2(temp1);
|
||||
|
||||
step[ 8] = intermediate[ 8] + intermediate[14];
|
||||
step[ 9] = intermediate[ 9] + intermediate[15];
|
||||
step[10] = intermediate[10] + intermediate[11];
|
||||
step[11] = intermediate[10] - intermediate[11];
|
||||
step[12] = intermediate[12] + intermediate[13];
|
||||
step[13] = intermediate[12] - intermediate[13];
|
||||
step[14] = intermediate[ 8] - intermediate[14];
|
||||
step[15] = intermediate[ 9] - intermediate[15];
|
||||
|
||||
// step 3
|
||||
output[stride*0] = step[ 0] + step[ 3];
|
||||
output[stride*1] = step[ 1] + step[ 2];
|
||||
output[stride*2] = step[ 1] - step[ 2];
|
||||
output[stride*3] = step[ 0] - step[ 3];
|
||||
|
||||
temp1 = step[ 4]*C14;
|
||||
temp2 = step[ 7]*C2;
|
||||
temp1 -= temp2;
|
||||
output[stride*4] = DownshiftMultiply(temp1);
|
||||
|
||||
temp1 = step[ 4]*C2;
|
||||
temp2 = step[ 7]*C14;
|
||||
temp1 += temp2;
|
||||
output[stride*7] = DownshiftMultiply(temp1);
|
||||
|
||||
temp1 = step[ 5]*C10;
|
||||
temp2 = step[ 6]*C6;
|
||||
temp1 -= temp2;
|
||||
output[stride*5] = DownshiftMultiply(temp1);
|
||||
|
||||
temp1 = step[ 5]*C6;
|
||||
temp2 = step[ 6]*C10;
|
||||
temp1 += temp2;
|
||||
output[stride*6] = DownshiftMultiply(temp1);
|
||||
|
||||
output[stride*8] = step[ 8] + step[11];
|
||||
output[stride*9] = step[ 9] + step[10];
|
||||
output[stride*10] = step[ 9] - step[10];
|
||||
output[stride*11] = step[ 8] - step[11];
|
||||
output[stride*12] = step[12] + step[15];
|
||||
output[stride*13] = step[13] + step[14];
|
||||
output[stride*14] = step[13] - step[14];
|
||||
output[stride*15] = step[12] - step[15];
|
||||
|
||||
// output 4
|
||||
step[ 0] = output[stride*0] + output[stride*7];
|
||||
step[ 1] = output[stride*1] + output[stride*6];
|
||||
step[ 2] = output[stride*2] + output[stride*5];
|
||||
step[ 3] = output[stride*3] + output[stride*4];
|
||||
step[ 4] = output[stride*3] - output[stride*4];
|
||||
step[ 5] = output[stride*2] - output[stride*5];
|
||||
step[ 6] = output[stride*1] - output[stride*6];
|
||||
step[ 7] = output[stride*0] - output[stride*7];
|
||||
|
||||
temp1 = output[stride*8]*C7;
|
||||
temp2 = output[stride*15]*C9;
|
||||
temp1 -= temp2;
|
||||
step[ 8] = DownshiftMultiply(temp1);
|
||||
|
||||
temp1 = output[stride*9]*C11;
|
||||
temp2 = output[stride*14]*C5;
|
||||
temp1 += temp2;
|
||||
step[ 9] = DownshiftMultiply(temp1);
|
||||
|
||||
temp1 = output[stride*10]*C3;
|
||||
temp2 = output[stride*13]*C13;
|
||||
temp1 -= temp2;
|
||||
step[10] = DownshiftMultiply(temp1);
|
||||
|
||||
temp1 = output[stride*11]*C15;
|
||||
temp2 = output[stride*12]*C1;
|
||||
temp1 += temp2;
|
||||
step[11] = DownshiftMultiply(temp1);
|
||||
|
||||
temp1 = output[stride*11]*C1;
|
||||
temp2 = output[stride*12]*C15;
|
||||
temp2 -= temp1;
|
||||
step[12] = DownshiftMultiply(temp2);
|
||||
|
||||
temp1 = output[stride*10]*C13;
|
||||
temp2 = output[stride*13]*C3;
|
||||
temp1 += temp2;
|
||||
step[13] = DownshiftMultiply(temp1);
|
||||
|
||||
temp1 = output[stride*9]*C5;
|
||||
temp2 = output[stride*14]*C11;
|
||||
temp2 -= temp1;
|
||||
step[14] = DownshiftMultiply(temp2);
|
||||
|
||||
temp1 = output[stride*8]*C9;
|
||||
temp2 = output[stride*15]*C7;
|
||||
temp1 += temp2;
|
||||
step[15] = DownshiftMultiply(temp1);
|
||||
|
||||
// step 5
|
||||
output[stride*0] = step[0] + step[15];
|
||||
output[stride*1] = step[1] + step[14];
|
||||
output[stride*2] = step[2] + step[13];
|
||||
output[stride*3] = step[3] + step[12];
|
||||
output[stride*4] = step[4] + step[11];
|
||||
output[stride*5] = step[5] + step[10];
|
||||
output[stride*6] = step[6] + step[ 9];
|
||||
output[stride*7] = step[7] + step[ 8];
|
||||
|
||||
output[stride*15] = step[0] - step[15];
|
||||
output[stride*14] = step[1] - step[14];
|
||||
output[stride*13] = step[2] - step[13];
|
||||
output[stride*12] = step[3] - step[12];
|
||||
output[stride*11] = step[4] - step[11];
|
||||
output[stride*10] = step[5] - step[10];
|
||||
output[stride*9] = step[6] - step[ 9];
|
||||
output[stride*8] = step[7] - step[ 8];
|
||||
}
|
||||
|
||||
// One 32-point inverse DCT pass (double precision reference path).
// Decomposes the 32-point problem into two 16-point IDCTs: the even input
// samples feed one directly; the odd samples are folded pairwise first and
// the fold is undone afterwards by dividing by 2*cos of the odd angles.
static void butterfly_32_idct_1d(double *input, double *output, int stride) {
  // cos(pi * k / 64) for odd k = 1, 3, ..., 31 (table index j -> k = 2j+1).
  static const double kCosOdd[16] = {
    0.998795456205,  // cos(pi *  1 / 64)
    0.989176509965,  // cos(pi *  3 / 64)
    0.970031253195,  // cos(pi *  5 / 64)
    0.941544065183,  // cos(pi *  7 / 64)
    0.903989293123,  // cos(pi *  9 / 64)
    0.857728610000,  // cos(pi * 11 / 64)
    0.803207531481,  // cos(pi * 13 / 64)
    0.740951125355,  // cos(pi * 15 / 64)
    0.671558954847,  // cos(pi * 17 / 64)
    0.595699304492,  // cos(pi * 19 / 64)
    0.514102744193,  // cos(pi * 21 / 64)
    0.427555093430,  // cos(pi * 23 / 64)
    0.336889853392,  // cos(pi * 25 / 64)
    0.242980179903,  // cos(pi * 27 / 64)
    0.146730474455,  // cos(pi * 29 / 64)
    0.049067674327,  // cos(pi * 31 / 64)
  };
  static const double kCos16 = 0.707106781187;  // cos(pi * 16 / 64)

  double half[32];   // recombined inputs for the two 16-point passes
  double merged[32]; // 16-point IDCT results, odd half still scaled
  int j;

  // Even input samples drive a plain 16-point inverse transform.
  for (j = 0; j < 16; ++j)
    half[j] = input[stride * (2 * j)];

  // Odd samples are folded pairwise; the very first odd sample is scaled by
  // cos(pi/4) instead of being paired.
  half[16] = DownshiftMultiplyBy2(input[stride * 1] * kCos16);
  for (j = 17; j < 32; ++j)
    half[j] = (input[stride * (2 * (j - 16) + 1)] +
               input[stride * (2 * (j - 16) - 1)]);

  idct16f(half, merged, 1);
  idct16f(half + 16, merged + 16, 1);

  // Undo the pairwise fold: divide by 2*cos((2k+1) * pi / 64).
  for (j = 16; j < 32; ++j)
    merged[j] = DownshiftMultiply(merged[j] / (2 * kCosOdd[j - 16]));

  // Final butterfly: sums fill the first half of the output, reversed
  // differences fill the second half.
  for (j = 0; j < 16; ++j)
    output[stride * j] = merged[j] + merged[16 + j];
  for (j = 0; j < 16; ++j)
    output[stride * (16 + j)] = merged[15 - j] - merged[31 - j];
}
|
||||
|
||||
// One 64-point inverse DCT pass (double precision reference path).
// Same even/odd split as butterfly_32_idct_1d, one level up: even samples
// feed a 32-point IDCT directly; odd samples are folded pairwise, run
// through their own 32-point IDCT, then unscaled by 2*cos of odd angles.
static void butterfly_64_idct_1d(double *input, double *output, int stride) {
  // C[k] = cos(k * pi / 128).
  static const double C[64] = {
    1.00000000000000000000,  // cos(0 * pi / 128)
    0.99969881869620424997,  // cos(1 * pi / 128)
    0.99879545620517240501,  // cos(2 * pi / 128)
    0.99729045667869020697,  // cos(3 * pi / 128)
    0.99518472667219692873,  // cos(4 * pi / 128)
    0.99247953459870996706,  // cos(5 * pi / 128)
    0.98917650996478101444,  // cos(6 * pi / 128)
    0.98527764238894122162,  // cos(7 * pi / 128)
    0.98078528040323043058,  // cos(8 * pi / 128)
    0.97570213003852857003,  // cos(9 * pi / 128)
    0.97003125319454397424,  // cos(10 * pi / 128)
    0.96377606579543984022,  // cos(11 * pi / 128)
    0.95694033573220882438,  // cos(12 * pi / 128)
    0.94952818059303667475,  // cos(13 * pi / 128)
    0.94154406518302080631,  // cos(14 * pi / 128)
    0.93299279883473895669,  // cos(15 * pi / 128)
    0.92387953251128673848,  // cos(16 * pi / 128)
    0.91420975570353069095,  // cos(17 * pi / 128)
    0.90398929312344333820,  // cos(18 * pi / 128)
    0.89322430119551532446,  // cos(19 * pi / 128)
    0.88192126434835504956,  // cos(20 * pi / 128)
    0.87008699110871146054,  // cos(21 * pi / 128)
    0.85772861000027211809,  // cos(22 * pi / 128)
    0.84485356524970711689,  // cos(23 * pi / 128)
    0.83146961230254523567,  // cos(24 * pi / 128)
    0.81758481315158371139,  // cos(25 * pi / 128)
    0.80320753148064494287,  // cos(26 * pi / 128)
    0.78834642762660633863,  // cos(27 * pi / 128)
    0.77301045336273699338,  // cos(28 * pi / 128)
    0.75720884650648456748,  // cos(29 * pi / 128)
    0.74095112535495921691,  // cos(30 * pi / 128)
    0.72424708295146700276,  // cos(31 * pi / 128)
    0.70710678118654757274,  // cos(32 * pi / 128)
    0.68954054473706694051,  // cos(33 * pi / 128)
    0.67155895484701844111,  // cos(34 * pi / 128)
    0.65317284295377686654,  // cos(35 * pi / 128)
    0.63439328416364559882,  // cos(36 * pi / 128)
    0.61523159058062693028,  // cos(37 * pi / 128)
    0.59569930449243346793,  // cos(38 * pi / 128)
    0.57580819141784544968,  // cos(39 * pi / 128)
    0.55557023301960228867,  // cos(40 * pi / 128)
    0.53499761988709737537,  // cos(41 * pi / 128)
    0.51410274419322177231,  // cos(42 * pi / 128)
    0.49289819222978414892,  // cos(43 * pi / 128)
    0.47139673682599780857,  // cos(44 * pi / 128)
    0.44961132965460659516,  // cos(45 * pi / 128)
    0.42755509343028219593,  // cos(46 * pi / 128)
    0.40524131400498980549,  // cos(47 * pi / 128)
    0.38268343236508983729,  // cos(48 * pi / 128)
    0.35989503653498827740,  // cos(49 * pi / 128)
    0.33688985339222005111,  // cos(50 * pi / 128)
    0.31368174039889151761,  // cos(51 * pi / 128)
    0.29028467725446227554,  // cos(52 * pi / 128)
    0.26671275747489842090,  // cos(53 * pi / 128)
    0.24298017990326398197,  // cos(54 * pi / 128)
    0.21910124015686976984,  // cos(55 * pi / 128)
    0.19509032201612830359,  // cos(56 * pi / 128)
    0.17096188876030135595,  // cos(57 * pi / 128)
    0.14673047445536174793,  // cos(58 * pi / 128)
    0.12241067519921627893,  // cos(59 * pi / 128)
    0.09801714032956077016,  // cos(60 * pi / 128)
    0.07356456359966745406,  // cos(61 * pi / 128)
    0.04906767432741813290,  // cos(62 * pi / 128)
    0.02454122852291226731,  // cos(63 * pi / 128)
  };

  double half[64];   // recombined inputs for the two 32-point passes
  double merged[64]; // 32-point IDCT results, odd half still scaled
  int i;

  // Even input samples drive a plain 32-point inverse transform.
  for (i = 0; i < 32; ++i)
    half[i] = input[stride * (2 * i)];

  // Odd samples are folded pairwise; the first odd sample is scaled by
  // cos(pi/4) (= C[32]) instead of being paired.
  half[32] = DownshiftMultiplyBy2(input[stride * 1] * C[32]);
  for (i = 33; i < 64; ++i)
    half[i] = (input[stride * (2 * (i - 32) + 1)] +
               input[stride * (2 * (i - 32) - 1)]);

  butterfly_32_idct_1d(half, merged, 1);
  butterfly_32_idct_1d(half + 32, merged + 32, 1);

  // Undo the pairwise fold: divide by 2*cos((2k+1) * pi / 128).
  for (i = 32; i < 64; ++i)
    merged[i] = DownshiftMultiply(merged[i] / (2 * C[(i - 32) * 2 + 1]));

  // Final butterfly: sums fill the first half of the output, reversed
  // differences fill the second half.
  for (i = 0; i < 32; ++i) {
    output[stride * i] = merged[i] + merged[32 + i];
    output[stride * (i + 32)] = merged[31 - i] - merged[63 - i];
  }
}
|
||||
|
||||
// Full 64x64 inverse transform (all 4096 coefficients), double precision
// reference implementation.  Applies the separable 1-D transform to every
// row, then to every column, then rounds the result (scaled down by 128)
// and adds it to the predictor in |dest|.
void vp9_idct64x64_4096_add_c(const tran_low_t *input, uint8_t *dest,
                              int stride) {
  // vp9_clear_system_state(); // Make it simd safe : __asm emms;
  {
    double row_pass[64 * 64], col_pass[64 * 64];
    int r, c;

    // Pass 1: transform each row of coefficients.
    for (r = 0; r < 64; ++r) {
      double in[64], out[64];
      for (c = 0; c < 64; ++c)
        in[c] = input[c + r * 64];
      butterfly_64_idct_1d(in, out, 1);
      for (c = 0; c < 64; ++c)
        row_pass[c + r * 64] = out[c];
    }

    // Pass 2: transform each column of the row-pass result.
    for (c = 0; c < 64; ++c) {
      double in[64], out[64];
      for (r = 0; r < 64; ++r)
        in[r] = row_pass[r * 64 + c];
      butterfly_64_idct_1d(in, out, 1);
      for (r = 0; r < 64; ++r)
        col_pass[r * 64 + c] = out[r];
    }

    // Round (dividing out the transform gain of 128) and accumulate into
    // the predictor, one destination row at a time.
    for (r = 0; r < 64; ++r) {
      for (c = 0; c < 64; ++c)
        dest[c] = clip_pixel_add(dest[c], round(col_pass[r * 64 + c] / 128));
      dest += stride;
    }
  }
  // vp9_clear_system_state(); // Make it simd safe : __asm emms;
}
|
||||
|
||||
// Dispatch wrapper for the 64x64 inverse transform.  The eob-based
// reduced-coefficient shortcuts used by the smaller transforms are not
// implemented for 64x64 yet, so the full 4096-coefficient path always runs.
void vp9_idct64x64_add(const tran_low_t *input, uint8_t *dest,
                       int stride, int eob) {
  (void) eob;  // unused until a partial-IDCT fast path exists
  vp9_idct64x64_4096_add_c(input, dest, stride);
}
|
||||
#endif
|
||||
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
void vp9_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
|
||||
int stride, int bd) {
|
||||
@ -2899,4 +3351,47 @@ void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input,
|
||||
vp9_highbd_iht16x16_256_add(input, dest, stride, tx_type, bd);
|
||||
}
|
||||
}
|
||||
|
||||
#if CONFIG_TX64X64
|
||||
// High-bitdepth full 64x64 inverse transform (all 4096 coefficients),
// double precision reference implementation.  Mirrors
// vp9_idct64x64_4096_add_c but clips to the [0, 2^bd - 1] pixel range.
//
// Fixes vs. the original:
//  * The reconstruction loop combined `dest += stride` per row with an
//    absolute `dest[j * stride + i]` write, applying the row offset twice,
//    and it read the predictor from `dest[j * 64 + i]` (transform width
//    confused with the destination stride).  It now matches the
//    low-bitdepth loop: index `dest[i]` relative to the advancing row
//    pointer.
//  * High-bitdepth frame buffers are uint16_t behind a uint8_t facade
//    (see vp9_highbd_iwht4x4_16_add_c above, which takes `dest8` and
//    converts); the pixels are now accessed through CONVERT_TO_SHORTPTR.
//    NOTE(review): conversion inferred from the file's other highbd
//    functions — confirm against callers.
void vp9_highbd_idct64x64_4096_add_c(const tran_low_t *input, uint8_t *dest,
                                     int stride, int bd) {
  // vp9_clear_system_state(); // Make it simd safe : __asm emms;
  {
    uint16_t *dst = CONVERT_TO_SHORTPTR(dest);
    double out[64 * 64], out2[64 * 64];
    int i, j;

    // First transform rows.
    for (i = 0; i < 64; ++i) {
      double temp_in[64], temp_out[64];
      for (j = 0; j < 64; ++j)
        temp_in[j] = input[j + i * 64];
      butterfly_64_idct_1d(temp_in, temp_out, 1);
      for (j = 0; j < 64; ++j)
        out[j + i * 64] = temp_out[j];
    }

    // Then transform columns.
    for (i = 0; i < 64; ++i) {
      double temp_in[64], temp_out[64];
      for (j = 0; j < 64; ++j)
        temp_in[j] = out[j * 64 + i];
      butterfly_64_idct_1d(temp_in, temp_out, 1);
      for (j = 0; j < 64; ++j)
        out2[j * 64 + i] = temp_out[j];
    }

    // Round (dividing out the transform gain of 128) and accumulate into
    // the predictor, one destination row at a time.
    for (j = 0; j < 64; ++j) {
      for (i = 0; i < 64; ++i)
        dst[i] = highbd_clip_pixel_add(dst[i],
                                       round(out2[j * 64 + i] / 128), bd);
      dst += stride;
    }
  }
  // vp9_clear_system_state(); // Make it simd safe : __asm emms;
}
|
||||
|
||||
// High-bitdepth dispatch wrapper for the 64x64 inverse transform.  No
// eob-based partial-IDCT shortcut exists for 64x64 yet, so the full
// 4096-coefficient path always runs.
void vp9_highbd_idct64x64_add(const tran_low_t *input, uint8_t *dest,
                              int stride, int eob, int bd) {
  (void) eob;  // unused until a partial-IDCT fast path exists
  vp9_highbd_idct64x64_4096_add_c(input, dest, stride, bd);
}
|
||||
#endif // CONFIG_TX64X64
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
@ -122,11 +122,14 @@ void vp9_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
|
||||
int eob);
|
||||
void vp9_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
|
||||
int eob);
|
||||
void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride, int
|
||||
eob);
|
||||
void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride,
|
||||
int eob);
|
||||
void vp9_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
|
||||
int eob);
|
||||
|
||||
#if CONFIG_TX64X64
|
||||
void vp9_idct64x64_add(const tran_low_t *input, uint8_t *dest, int stride,
|
||||
int eob);
|
||||
#endif
|
||||
void vp9_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
|
||||
int stride, int eob);
|
||||
void vp9_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
|
||||
@ -145,6 +148,10 @@ void vp9_highbd_idct16x16_add(const tran_low_t *input, uint8_t *dest,
|
||||
int stride, int eob, int bd);
|
||||
void vp9_highbd_idct32x32_add(const tran_low_t *input, uint8_t *dest,
|
||||
int stride, int eob, int bd);
|
||||
#if CONFIG_TX64X64
|
||||
void vp9_highbd_idct64x64_add(const tran_low_t *input, uint8_t *dest,
|
||||
int stride, int eob, int bd);
|
||||
#endif
|
||||
void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input,
|
||||
uint8_t *dest, int stride, int eob, int bd);
|
||||
void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,
|
||||
|
@ -38,6 +38,9 @@ static const uint64_t left_64x64_txform_mask[TX_SIZES]= {
|
||||
0xffffffffffffffff, // TX_8x8
|
||||
0x5555555555555555, // TX_16x16
|
||||
0x1111111111111111, // TX_32x32
|
||||
#if CONFIG_TX64X64
|
||||
0x0101010101010101, // TX_64x64
|
||||
#endif
|
||||
};
|
||||
|
||||
// 64 bit masks for above transform size. Each 1 represents a position where
|
||||
@ -62,6 +65,9 @@ static const uint64_t above_64x64_txform_mask[TX_SIZES]= {
|
||||
0xffffffffffffffff, // TX_8x8
|
||||
0x00ff00ff00ff00ff, // TX_16x16
|
||||
0x000000ff000000ff, // TX_32x32
|
||||
#if CONFIG_TX64X64
|
||||
0x00000000000000ff, // TX_64x64
|
||||
#endif
|
||||
};
|
||||
|
||||
// 64 bit masks for prediction sizes (left). Each 1 represents a position
|
||||
@ -140,6 +146,9 @@ static const uint16_t left_64x64_txform_mask_uv[TX_SIZES]= {
|
||||
0xffff, // TX_8x8
|
||||
0x5555, // TX_16x16
|
||||
0x1111, // TX_32x32
|
||||
#if CONFIG_TX64X64
|
||||
0x0101, // TX_64x64, never used
|
||||
#endif
|
||||
};
|
||||
|
||||
static const uint16_t above_64x64_txform_mask_uv[TX_SIZES]= {
|
||||
@ -147,6 +156,9 @@ static const uint16_t above_64x64_txform_mask_uv[TX_SIZES]= {
|
||||
0xffff, // TX_8x8
|
||||
0x0f0f, // TX_16x16
|
||||
0x000f, // TX_32x32
|
||||
#if CONFIG_TX64X64
|
||||
0x0003, // TX_64x64, never used
|
||||
#endif
|
||||
};
|
||||
|
||||
// 16 bit left mask to shift and set for each uv prediction size.
|
||||
|
@ -107,6 +107,10 @@ static INLINE const vp9_prob *get_tx_probs(TX_SIZE max_tx_size, int ctx,
|
||||
return tx_probs->p16x16[ctx];
|
||||
case TX_32X32:
|
||||
return tx_probs->p32x32[ctx];
|
||||
#if CONFIG_TX64X64
|
||||
case TX_64X64:
|
||||
return tx_probs->p64x64[ctx];
|
||||
#endif
|
||||
default:
|
||||
assert(0 && "Invalid max_tx_size.");
|
||||
return NULL;
|
||||
@ -128,6 +132,10 @@ static INLINE unsigned int *get_tx_counts(TX_SIZE max_tx_size, int ctx,
|
||||
return tx_counts->p16x16[ctx];
|
||||
case TX_32X32:
|
||||
return tx_counts->p32x32[ctx];
|
||||
#if CONFIG_TX64X64
|
||||
case TX_64X64:
|
||||
return tx_counts->p64x64[ctx];
|
||||
#endif
|
||||
default:
|
||||
assert(0 && "Invalid max_tx_size.");
|
||||
return NULL;
|
||||
|
@ -47,7 +47,34 @@ const TX_TYPE intra_mode_to_tx_type_lookup[INTRA_MODES] = {
|
||||
const uint16_t *left, int bd) { \
|
||||
highbd_##type##_predictor(dst, stride, size, above, left, bd); \
|
||||
}
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
||||
#if CONFIG_TX64X64
|
||||
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
#define intra_pred_allsizes(type) \
|
||||
intra_pred_sized(type, 4) \
|
||||
intra_pred_sized(type, 8) \
|
||||
intra_pred_sized(type, 16) \
|
||||
intra_pred_sized(type, 32) \
|
||||
intra_pred_sized(type, 64) \
|
||||
intra_pred_highbd_sized(type, 4) \
|
||||
intra_pred_highbd_sized(type, 8) \
|
||||
intra_pred_highbd_sized(type, 16) \
|
||||
intra_pred_highbd_sized(type, 32) \
|
||||
intra_pred_highbd_sized(type, 64)
|
||||
#else
|
||||
#define intra_pred_allsizes(type) \
|
||||
intra_pred_sized(type, 4) \
|
||||
intra_pred_sized(type, 8) \
|
||||
intra_pred_sized(type, 16) \
|
||||
intra_pred_sized(type, 32) \
|
||||
intra_pred_sized(type, 64)
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
||||
#else // CONFIG_TX64X64
|
||||
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
#define intra_pred_allsizes(type) \
|
||||
intra_pred_sized(type, 4) \
|
||||
intra_pred_sized(type, 8) \
|
||||
@ -57,9 +84,7 @@ const TX_TYPE intra_mode_to_tx_type_lookup[INTRA_MODES] = {
|
||||
intra_pred_highbd_sized(type, 8) \
|
||||
intra_pred_highbd_sized(type, 16) \
|
||||
intra_pred_highbd_sized(type, 32)
|
||||
|
||||
#else
|
||||
|
||||
#define intra_pred_allsizes(type) \
|
||||
intra_pred_sized(type, 4) \
|
||||
intra_pred_sized(type, 8) \
|
||||
@ -67,6 +92,8 @@ const TX_TYPE intra_mode_to_tx_type_lookup[INTRA_MODES] = {
|
||||
intra_pred_sized(type, 32)
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
||||
#endif // CONFIG_TX64X64
|
||||
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
static INLINE void highbd_d207_predictor(uint16_t *dst, ptrdiff_t stride,
|
||||
int bs, const uint16_t *above,
|
||||
@ -575,16 +602,25 @@ static intra_pred_fn dc_pred[2][2][TX_SIZES];
|
||||
typedef void (*intra_high_pred_fn)(uint16_t *dst, ptrdiff_t stride,
|
||||
const uint16_t *above, const uint16_t *left,
|
||||
int bd);
|
||||
static intra_high_pred_fn pred_high[INTRA_MODES][4];
|
||||
static intra_high_pred_fn dc_pred_high[2][2][4];
|
||||
static intra_high_pred_fn pred_high[INTRA_MODES][TX_SIZES];
|
||||
static intra_high_pred_fn dc_pred_high[2][2][TX_SIZES];
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
||||
void vp9_init_intra_predictors() {
|
||||
#if CONFIG_TX64X64
|
||||
#define INIT_ALL_SIZES(p, type) \
|
||||
p[TX_4X4] = vp9_##type##_predictor_4x4; \
|
||||
p[TX_8X8] = vp9_##type##_predictor_8x8; \
|
||||
p[TX_16X16] = vp9_##type##_predictor_16x16; \
|
||||
p[TX_32X32] = vp9_##type##_predictor_32x32; \
|
||||
p[TX_64X64] = vp9_##type##_predictor_64x64
|
||||
#else
|
||||
#define INIT_ALL_SIZES(p, type) \
|
||||
p[TX_4X4] = vp9_##type##_predictor_4x4; \
|
||||
p[TX_8X8] = vp9_##type##_predictor_8x8; \
|
||||
p[TX_16X16] = vp9_##type##_predictor_16x16; \
|
||||
p[TX_32X32] = vp9_##type##_predictor_32x32
|
||||
#endif
|
||||
|
||||
INIT_ALL_SIZES(pred[V_PRED], v);
|
||||
INIT_ALL_SIZES(pred[H_PRED], h);
|
||||
@ -638,7 +674,11 @@ static void build_intra_predictors_high(const MACROBLOCKD *xd,
|
||||
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
|
||||
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
|
||||
DECLARE_ALIGNED_ARRAY(16, uint16_t, left_col, 64);
|
||||
#if CONFIG_TX64X64
|
||||
DECLARE_ALIGNED_ARRAY(16, uint16_t, above_data, 256 + 16);
|
||||
#else
|
||||
DECLARE_ALIGNED_ARRAY(16, uint16_t, above_data, 128 + 16);
|
||||
#endif
|
||||
uint16_t *above_row = above_data + 16;
|
||||
const uint16_t *const_above_row = above_row;
|
||||
const int bs = 4 << tx_size;
|
||||
@ -767,7 +807,11 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref,
|
||||
int plane) {
|
||||
int i;
|
||||
DECLARE_ALIGNED_ARRAY(16, uint8_t, left_col, 64);
|
||||
#if CONFIG_TX64X64
|
||||
DECLARE_ALIGNED_ARRAY(16, uint8_t, above_data, 256 + 16);
|
||||
#else
|
||||
DECLARE_ALIGNED_ARRAY(16, uint8_t, above_data, 128 + 16);
|
||||
#endif
|
||||
uint8_t *above_row = above_data + 16;
|
||||
const uint8_t *const_above_row = above_row;
|
||||
const int bs = 4 << tx_size;
|
||||
|
@ -224,6 +224,47 @@ specialize qw/vp9_dc_left_predictor_32x32/;
|
||||
add_proto qw/void vp9_dc_128_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
|
||||
specialize qw/vp9_dc_128_predictor_32x32/;
|
||||
|
||||
if (vpx_config("CONFIG_TX64X64") eq "yes") {
|
||||
add_proto qw/void vp9_d207_predictor_64x64/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
|
||||
specialize qw/vp9_d207_predictor_64x64/;
|
||||
|
||||
add_proto qw/void vp9_d45_predictor_64x64/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
|
||||
specialize qw/vp9_d45_predictor_64x64/;
|
||||
|
||||
add_proto qw/void vp9_d63_predictor_64x64/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
|
||||
specialize qw/vp9_d63_predictor_64x64/;
|
||||
|
||||
add_proto qw/void vp9_h_predictor_64x64/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
|
||||
specialize qw/vp9_h_predictor_64x64/;
|
||||
|
||||
add_proto qw/void vp9_d117_predictor_64x64/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
|
||||
specialize qw/vp9_d117_predictor_64x64/;
|
||||
|
||||
add_proto qw/void vp9_d135_predictor_64x64/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
|
||||
specialize qw/vp9_d135_predictor_64x64/;
|
||||
|
||||
add_proto qw/void vp9_d153_predictor_64x64/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
|
||||
specialize qw/vp9_d153_predictor_64x64/;
|
||||
|
||||
add_proto qw/void vp9_v_predictor_64x64/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
|
||||
specialize qw/vp9_v_predictor_64x64/;
|
||||
|
||||
add_proto qw/void vp9_tm_predictor_64x64/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
|
||||
specialize qw/vp9_tm_predictor_64x64/;
|
||||
|
||||
add_proto qw/void vp9_dc_predictor_64x64/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
|
||||
specialize qw/vp9_dc_predictor_64x64/;
|
||||
|
||||
add_proto qw/void vp9_dc_top_predictor_64x64/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
|
||||
specialize qw/vp9_dc_top_predictor_64x64/;
|
||||
|
||||
add_proto qw/void vp9_dc_left_predictor_64x64/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
|
||||
specialize qw/vp9_dc_left_predictor_64x64/;
|
||||
|
||||
add_proto qw/void vp9_dc_128_predictor_64x64/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
|
||||
specialize qw/vp9_dc_128_predictor_64x64/;
|
||||
}
|
||||
|
||||
#
|
||||
# Loopfilter
|
||||
#
|
||||
@ -366,6 +407,11 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
|
||||
add_proto qw/void vp9_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
|
||||
specialize qw/vp9_idct32x32_1_add/;
|
||||
|
||||
if (vpx_config("CONFIG_TX64X64") eq "yes") {
|
||||
add_proto qw/void vp9_idct64x64_4096_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
|
||||
specialize qw/vp9_idct64x64_4096_add/;
|
||||
}
|
||||
|
||||
add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/vp9_iht4x4_16_add/;
|
||||
|
||||
@ -419,6 +465,11 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
|
||||
add_proto qw/void vp9_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
|
||||
specialize qw/vp9_idct32x32_1_add/;
|
||||
|
||||
if (vpx_config("CONFIG_TX64X64") eq "yes") {
|
||||
add_proto qw/void vp9_idct64x64_4096_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
|
||||
specialize qw/vp9_idct64x64_4096_add/;
|
||||
}
|
||||
|
||||
add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/vp9_iht4x4_16_add/;
|
||||
|
||||
@ -480,6 +531,11 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
|
||||
specialize qw/vp9_idct32x32_1_add sse2 neon_asm dspr2/;
|
||||
$vp9_idct32x32_1_add_neon_asm=vp9_idct32x32_1_add_neon;
|
||||
|
||||
if (vpx_config("CONFIG_TX64X64") eq "yes") {
|
||||
add_proto qw/void vp9_idct64x64_4096_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
|
||||
specialize qw/vp9_idct64x64_4096_add/;
|
||||
}
|
||||
|
||||
add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
|
||||
specialize qw/vp9_iht4x4_16_add sse2 neon_asm dspr2/;
|
||||
$vp9_iht4x4_16_add_neon_asm=vp9_iht4x4_16_add_neon;
|
||||
@ -662,6 +718,46 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
|
||||
add_proto qw/void vp9_highbd_dc_128_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
|
||||
specialize qw/vp9_highbd_dc_128_predictor_32x32/;
|
||||
|
||||
if (vpx_config("CONFIG_TX64X64") eq "yes") {
|
||||
add_proto qw/void vp9_highbd_d207_predictor_64x64/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
|
||||
specialize qw/vp9_highbd_d207_predictor_64x64/;
|
||||
|
||||
add_proto qw/void vp9_highbd_d45_predictor_64x64/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
|
||||
specialize qw/vp9_highbd_d45_predictor_64x64/;
|
||||
|
||||
add_proto qw/void vp9_highbd_d63_predictor_64x64/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
|
||||
specialize qw/vp9_highbd_d63_predictor_64x64/;
|
||||
|
||||
add_proto qw/void vp9_highbd_h_predictor_64x64/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
|
||||
specialize qw/vp9_highbd_h_predictor_64x64/;
|
||||
|
||||
add_proto qw/void vp9_highbd_d117_predictor_64x64/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
|
||||
specialize qw/vp9_highbd_d117_predictor_64x64/;
|
||||
|
||||
add_proto qw/void vp9_highbd_d135_predictor_64x64/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
|
||||
specialize qw/vp9_highbd_d135_predictor_64x64/;
|
||||
|
||||
add_proto qw/void vp9_highbd_d153_predictor_64x64/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
|
||||
specialize qw/vp9_highbd_d153_predictor_64x64/;
|
||||
|
||||
add_proto qw/void vp9_highbd_v_predictor_64x64/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
|
||||
specialize qw/vp9_highbd_v_predictor_64x64/;
|
||||
|
||||
add_proto qw/void vp9_highbd_tm_predictor_64x64/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
|
||||
specialize qw/vp9_highbd_tm_predictor_64x64/;
|
||||
|
||||
add_proto qw/void vp9_highbd_dc_predictor_64x64/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
|
||||
specialize qw/vp9_highbd_dc_predictor_64x64/;
|
||||
|
||||
add_proto qw/void vp9_highbd_dc_top_predictor_64x64/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
|
||||
specialize qw/vp9_highbd_dc_top_predictor_64x64/;
|
||||
|
||||
add_proto qw/void vp9_highbd_dc_left_predictor_64x64/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
|
||||
specialize qw/vp9_highbd_dc_left_predictor_64x64/;
|
||||
|
||||
add_proto qw/void vp9_highbd_dc_128_predictor_64x64/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
|
||||
specialize qw/vp9_highbd_dc_128_predictor_64x64/;
|
||||
}
|
||||
#
|
||||
# Sub Pixel Filters
|
||||
#
|
||||
@ -774,6 +870,11 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
|
||||
add_proto qw/void vp9_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
|
||||
specialize qw/vp9_highbd_idct32x32_1024_add/;
|
||||
|
||||
if (vpx_config("CONFIG_TX64X64") eq "yes") {
|
||||
add_proto qw/void vp9_highbd_idct64x64_4096_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
|
||||
specialize qw/vp9_highbd_idct64x64_4096_add/;
|
||||
}
|
||||
|
||||
add_proto qw/void vp9_highbd_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
|
||||
specialize qw/vp9_highbd_idct32x32_34_add/;
|
||||
|
||||
@ -1144,6 +1245,14 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
|
||||
|
||||
add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
|
||||
specialize qw/vp9_quantize_b_32x32/;
|
||||
|
||||
if (vpx_config("CONFIG_TX64X64") eq "yes") {
|
||||
add_proto qw/void vp9_quantize_fp_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
|
||||
specialize qw/vp9_quantize_fp_64x64/;
|
||||
|
||||
add_proto qw/void vp9_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
|
||||
specialize qw/vp9_quantize_b_64x64/;
|
||||
}
|
||||
} else {
|
||||
add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
|
||||
specialize qw/vp9_block_error avx2/, "$sse2_x86inc";
|
||||
@ -1159,6 +1268,14 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
|
||||
|
||||
add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
|
||||
specialize qw/vp9_quantize_b_32x32/, "$ssse3_x86_64";
|
||||
|
||||
if (vpx_config("CONFIG_TX64X64") eq "yes") {
|
||||
add_proto qw/void vp9_quantize_fp_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
|
||||
specialize qw/vp9_quantize_fp_64x64/;
|
||||
|
||||
add_proto qw/void vp9_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
|
||||
specialize qw/vp9_quantize_b_64x64/;
|
||||
}
|
||||
}
|
||||
|
||||
#
|
||||
@ -1213,6 +1330,14 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
|
||||
|
||||
add_proto qw/void vp9_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_fdct32x32_rd/;
|
||||
|
||||
if (vpx_config("CONFIG_TX64X64") eq "yes") {
|
||||
add_proto qw/void vp9_fdct64x64_1/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_fdct64x64_1/;
|
||||
|
||||
add_proto qw/void vp9_fdct64x64/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_fdct64x64/;
|
||||
}
|
||||
} else {
|
||||
add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
|
||||
specialize qw/vp9_fht4x4 sse2/;
|
||||
@ -1252,6 +1377,14 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
|
||||
|
||||
add_proto qw/void vp9_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_fdct32x32_rd sse2 avx2/;
|
||||
|
||||
if (vpx_config("CONFIG_TX64X64") eq "yes") {
|
||||
add_proto qw/void vp9_fdct64x64_1/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_fdct64x64_1/;
|
||||
|
||||
add_proto qw/void vp9_fdct64x64/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_fdct64x64/;
|
||||
}
|
||||
}
|
||||
|
||||
#
|
||||
@ -1868,6 +2001,14 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
|
||||
add_proto qw/void vp9_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
|
||||
specialize qw/vp9_highbd_quantize_b_32x32/;
|
||||
|
||||
if (vpx_config("CONFIG_TX64X64") eq "yes") {
|
||||
add_proto qw/void vp9_highbd_quantize_fp_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
|
||||
specialize qw/vp9_highbd_quantize_fp_64x64/;
|
||||
|
||||
add_proto qw/void vp9_highbd_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
|
||||
specialize qw/vp9_highbd_quantize_b_64x64/;
|
||||
}
|
||||
|
||||
#
|
||||
# Structured Similarity (SSIM)
|
||||
#
|
||||
@ -1913,6 +2054,14 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
|
||||
add_proto qw/void vp9_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_highbd_fdct32x32_rd/;
|
||||
|
||||
if (vpx_config("CONFIG_TX64X64") eq "yes") {
|
||||
add_proto qw/void vp9_highbd_fdct64x64_1/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_highbd_fdct64x64_1/;
|
||||
|
||||
add_proto qw/void vp9_highbd_fdct64x64/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp9_highbd_fdct64x64/;
|
||||
}
|
||||
|
||||
add_proto qw/void vp9_highbd_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
|
||||
specialize qw/vp9_highbd_temporal_filter_apply/;
|
||||
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -80,8 +80,15 @@ static int decode_unsigned_max(struct vp9_read_bit_buffer *rb, int max) {
|
||||
|
||||
static TX_MODE read_tx_mode(vp9_reader *r) {
|
||||
TX_MODE tx_mode = vp9_read_literal(r, 2);
|
||||
#if CONFIG_TX64X64
|
||||
if (tx_mode == 2)
|
||||
tx_mode += vp9_read_bit(r); // ALLOW_16X16 and ALLOW_32X32
|
||||
else if (tx_mode == 3)
|
||||
tx_mode += 1 + vp9_read_bit(r); // ALLOW_64X64 and TX_MODE_SELECT
|
||||
#else
|
||||
if (tx_mode == ALLOW_32X32)
|
||||
tx_mode += vp9_read_bit(r);
|
||||
#endif
|
||||
return tx_mode;
|
||||
}
|
||||
|
||||
@ -89,16 +96,22 @@ static void read_tx_mode_probs(struct tx_probs *tx_probs, vp9_reader *r) {
|
||||
int i, j;
|
||||
|
||||
for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
|
||||
for (j = 0; j < TX_SIZES - 3; ++j)
|
||||
for (j = 0; j < 1; ++j)
|
||||
vp9_diff_update_prob(r, &tx_probs->p8x8[i][j]);
|
||||
|
||||
for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
|
||||
for (j = 0; j < TX_SIZES - 2; ++j)
|
||||
for (j = 0; j < 2; ++j)
|
||||
vp9_diff_update_prob(r, &tx_probs->p16x16[i][j]);
|
||||
|
||||
for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
|
||||
for (j = 0; j < TX_SIZES - 1; ++j)
|
||||
for (j = 0; j < 3; ++j)
|
||||
vp9_diff_update_prob(r, &tx_probs->p32x32[i][j]);
|
||||
|
||||
#if CONFIG_TX64X64
|
||||
for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
|
||||
for (j = 0; j < 4; ++j)
|
||||
vp9_diff_update_prob(r, &tx_probs->p64x64[i][j]);
|
||||
#endif
|
||||
}
|
||||
|
||||
static void read_switchable_interp_probs(FRAME_CONTEXT *fc, vp9_reader *r) {
|
||||
@ -220,6 +233,12 @@ static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block,
|
||||
tx_type = DCT_DCT;
|
||||
vp9_highbd_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd);
|
||||
break;
|
||||
#if CONFIG_TX64X64
|
||||
case TX_64X64:
|
||||
tx_type = DCT_DCT;
|
||||
vp9_highbd_idct64x64_add(dqcoeff, dst, stride, eob, xd->bd);
|
||||
break;
|
||||
#endif
|
||||
default:
|
||||
assert(0 && "Invalid transform size");
|
||||
}
|
||||
@ -247,6 +266,12 @@ static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block,
|
||||
tx_type = DCT_DCT;
|
||||
vp9_idct32x32_add(dqcoeff, dst, stride, eob);
|
||||
break;
|
||||
#if CONFIG_TX64X64
|
||||
case TX_64X64:
|
||||
tx_type = DCT_DCT;
|
||||
vp9_idct64x64_add(dqcoeff, dst, stride, eob);
|
||||
break;
|
||||
#endif
|
||||
default:
|
||||
assert(0 && "Invalid transform size");
|
||||
return;
|
||||
@ -276,6 +301,12 @@ static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block,
|
||||
tx_type = DCT_DCT;
|
||||
vp9_idct32x32_add(dqcoeff, dst, stride, eob);
|
||||
break;
|
||||
#if CONFIG_TX64X64
|
||||
case TX_64X64:
|
||||
tx_type = DCT_DCT;
|
||||
vp9_idct64x64_add(dqcoeff, dst, stride, eob);
|
||||
break;
|
||||
#endif
|
||||
default:
|
||||
assert(0 && "Invalid transform size");
|
||||
return;
|
||||
@ -321,7 +352,6 @@ static void predict_and_reconstruct_intra_block(int plane, int block,
|
||||
b_width_log2_lookup[plane_bsize], tx_size, mode,
|
||||
dst, pd->dst.stride, dst, pd->dst.stride,
|
||||
x, y, plane);
|
||||
|
||||
if (!mi->mbmi.skip) {
|
||||
const int eob = vp9_decode_block_tokens(cm, xd, plane, block,
|
||||
plane_bsize, x, y, tx_size,
|
||||
@ -701,14 +731,14 @@ static void setup_frame_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
|
||||
setup_display_size(cm, rb);
|
||||
|
||||
if (vp9_realloc_frame_buffer(
|
||||
get_frame_new_buffer(cm), cm->width, cm->height,
|
||||
cm->subsampling_x, cm->subsampling_y,
|
||||
get_frame_new_buffer(cm), cm->width, cm->height,
|
||||
cm->subsampling_x, cm->subsampling_y,
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
cm->use_highbitdepth,
|
||||
cm->use_highbitdepth,
|
||||
#endif
|
||||
VP9_DEC_BORDER_IN_PIXELS,
|
||||
&cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer, cm->get_fb_cb,
|
||||
cm->cb_priv)) {
|
||||
VP9_DEC_BORDER_IN_PIXELS,
|
||||
&cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer, cm->get_fb_cb,
|
||||
cm->cb_priv)) {
|
||||
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
|
||||
"Failed to allocate frame buffer");
|
||||
}
|
||||
@ -779,14 +809,14 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm,
|
||||
setup_display_size(cm, rb);
|
||||
|
||||
if (vp9_realloc_frame_buffer(
|
||||
get_frame_new_buffer(cm), cm->width, cm->height,
|
||||
cm->subsampling_x, cm->subsampling_y,
|
||||
get_frame_new_buffer(cm), cm->width, cm->height,
|
||||
cm->subsampling_x, cm->subsampling_y,
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
cm->use_highbitdepth,
|
||||
cm->use_highbitdepth,
|
||||
#endif
|
||||
VP9_DEC_BORDER_IN_PIXELS,
|
||||
&cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer, cm->get_fb_cb,
|
||||
cm->cb_priv)) {
|
||||
VP9_DEC_BORDER_IN_PIXELS,
|
||||
&cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer, cm->get_fb_cb,
|
||||
cm->cb_priv)) {
|
||||
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
|
||||
"Failed to allocate frame buffer");
|
||||
}
|
||||
|
@ -65,8 +65,14 @@ static TX_SIZE read_selected_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd,
|
||||
int tx_size = vp9_read(r, tx_probs[0]);
|
||||
if (tx_size != TX_4X4 && max_tx_size >= TX_16X16) {
|
||||
tx_size += vp9_read(r, tx_probs[1]);
|
||||
if (tx_size != TX_8X8 && max_tx_size >= TX_32X32)
|
||||
if (tx_size != TX_8X8 && max_tx_size >= TX_32X32) {
|
||||
tx_size += vp9_read(r, tx_probs[2]);
|
||||
#if CONFIG_TX64X64
|
||||
if (tx_size != TX_16X16 && max_tx_size >= TX_64X64) {
|
||||
tx_size += vp9_read(r, tx_probs[3]);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
if (!cm->frame_parallel_decoding_mode)
|
||||
|
@ -32,7 +32,7 @@
|
||||
#define INCREMENT_COUNT(token) \
|
||||
do { \
|
||||
if (!cm->frame_parallel_decoding_mode) \
|
||||
++coef_counts[band][ctx][token]; \
|
||||
++coef_counts[band][ctx][token]; \
|
||||
} while (0)
|
||||
|
||||
static INLINE int read_coeff(const vp9_prob *probs, int n, vp9_reader *r) {
|
||||
@ -69,9 +69,9 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, PLANE_TYPE type,
|
||||
counts->coef[tx_size][type][ref];
|
||||
unsigned int (*eob_branch_count)[COEFF_CONTEXTS] =
|
||||
counts->eob_branch[tx_size][type][ref];
|
||||
uint8_t token_cache[32 * 32];
|
||||
uint8_t token_cache[MAX_NUM_COEFS];
|
||||
const uint8_t *band_translate = get_band_translate(tx_size);
|
||||
const int dq_shift = (tx_size == TX_32X32);
|
||||
const int dq_shift = (tx_size > TX_16X16) ? tx_size - TX_16X16 : 0;
|
||||
int v, token;
|
||||
int16_t dqv = dq[0];
|
||||
const uint8_t *cat1_prob;
|
||||
@ -214,6 +214,9 @@ int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd,
|
||||
const int eob = decode_coefs(cm, xd, pd->plane_type,
|
||||
BLOCK_OFFSET(pd->dqcoeff, block), tx_size,
|
||||
pd->dequant, ctx, so->scan, so->neighbors, r);
|
||||
#if CONFIG_TX64X64
|
||||
if (plane > 0) assert(tx_size != TX_64X64);
|
||||
#endif
|
||||
vp9_set_contexts(xd, pd, plane_bsize, tx_size, eob > 0, x, y);
|
||||
return eob;
|
||||
}
|
||||
|
@ -88,8 +88,13 @@ static void write_selected_tx_size(const VP9_COMMON *cm,
|
||||
vp9_write(w, tx_size != TX_4X4, tx_probs[0]);
|
||||
if (tx_size != TX_4X4 && max_tx_size >= TX_16X16) {
|
||||
vp9_write(w, tx_size != TX_8X8, tx_probs[1]);
|
||||
if (tx_size != TX_8X8 && max_tx_size >= TX_32X32)
|
||||
if (tx_size != TX_8X8 && max_tx_size >= TX_32X32) {
|
||||
vp9_write(w, tx_size != TX_16X16, tx_probs[2]);
|
||||
#if CONFIG_TX64X64
|
||||
if (tx_size != TX_16X16 && max_tx_size >= TX_64X64)
|
||||
vp9_write(w, tx_size != TX_32X32, tx_probs[3]);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -684,7 +689,7 @@ static void update_coef_probs(VP9_COMP *cpi, vp9_writer* w) {
|
||||
vp9_coeff_stats frame_branch_ct[TX_SIZES][PLANE_TYPES];
|
||||
vp9_coeff_probs_model frame_coef_probs[TX_SIZES][PLANE_TYPES];
|
||||
|
||||
for (tx_size = TX_4X4; tx_size <= TX_32X32; ++tx_size)
|
||||
for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
|
||||
build_tree_distribution(cpi, tx_size, frame_branch_ct[tx_size],
|
||||
frame_coef_probs[tx_size]);
|
||||
|
||||
@ -815,37 +820,60 @@ static void encode_segmentation(VP9_COMMON *cm, MACROBLOCKD *xd,
|
||||
|
||||
static void encode_txfm_probs(VP9_COMMON *cm, vp9_writer *w) {
|
||||
// Mode
|
||||
#if CONFIG_TX64X64
|
||||
if (cm->tx_mode == ALLOW_16X16 || cm->tx_mode == ALLOW_32X32) {
|
||||
vp9_write_literal(w, 2, 2);
|
||||
vp9_write_bit(w, cm->tx_mode == ALLOW_32X32);
|
||||
} else if (cm->tx_mode == ALLOW_64X64 || cm->tx_mode == TX_MODE_SELECT) {
|
||||
vp9_write_literal(w, 3, 2);
|
||||
vp9_write_bit(w, cm->tx_mode == TX_MODE_SELECT);
|
||||
} else {
|
||||
vp9_write_literal(w, cm->tx_mode, 2);
|
||||
}
|
||||
#else
|
||||
vp9_write_literal(w, MIN(cm->tx_mode, ALLOW_32X32), 2);
|
||||
if (cm->tx_mode >= ALLOW_32X32)
|
||||
vp9_write_bit(w, cm->tx_mode == TX_MODE_SELECT);
|
||||
#endif // CONFIG_TX64X64
|
||||
|
||||
// Probabilities
|
||||
if (cm->tx_mode == TX_MODE_SELECT) {
|
||||
int i, j;
|
||||
unsigned int ct_8x8p[TX_SIZES - 3][2];
|
||||
unsigned int ct_16x16p[TX_SIZES - 2][2];
|
||||
unsigned int ct_32x32p[TX_SIZES - 1][2];
|
||||
|
||||
unsigned int ct_8x8p[1][2];
|
||||
unsigned int ct_16x16p[2][2];
|
||||
unsigned int ct_32x32p[3][2];
|
||||
#if CONFIG_TX64X64
|
||||
unsigned int ct_64x64p[4][2];
|
||||
#endif
|
||||
|
||||
for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
|
||||
tx_counts_to_branch_counts_8x8(cm->counts.tx.p8x8[i], ct_8x8p);
|
||||
for (j = 0; j < TX_SIZES - 3; j++)
|
||||
for (j = 0; j < 1; j++)
|
||||
vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p8x8[i][j], ct_8x8p[j]);
|
||||
}
|
||||
|
||||
for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
|
||||
tx_counts_to_branch_counts_16x16(cm->counts.tx.p16x16[i], ct_16x16p);
|
||||
for (j = 0; j < TX_SIZES - 2; j++)
|
||||
for (j = 0; j < 2; j++)
|
||||
vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p16x16[i][j],
|
||||
ct_16x16p[j]);
|
||||
}
|
||||
|
||||
for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
|
||||
tx_counts_to_branch_counts_32x32(cm->counts.tx.p32x32[i], ct_32x32p);
|
||||
for (j = 0; j < TX_SIZES - 1; j++)
|
||||
for (j = 0; j < 3; j++)
|
||||
vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p32x32[i][j],
|
||||
ct_32x32p[j]);
|
||||
}
|
||||
|
||||
#if CONFIG_TX64X64
|
||||
for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
|
||||
tx_counts_to_branch_counts_64x64(cm->counts.tx.p64x64[i], ct_64x64p);
|
||||
for (j = 0; j < 4; j++)
|
||||
vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p64x64[i][j],
|
||||
ct_64x64p[j]);
|
||||
}
|
||||
#endif // CONFIG_TX64X64
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1439,6 +1439,458 @@ void vp9_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {
|
||||
}
|
||||
}
|
||||
|
||||
#if CONFIG_TX64X64
|
||||
// TODO(debargha): Using a floating point implementation for now.
|
||||
// Should re-use the 32x32 integer dct we already have.
|
||||
static void dct32_1d(double *input, double *output, int stride) {
|
||||
static const double C1 = 0.998795456205; // cos(pi * 1 / 64)
|
||||
static const double C2 = 0.995184726672; // cos(pi * 2 / 64)
|
||||
static const double C3 = 0.989176509965; // cos(pi * 3 / 64)
|
||||
static const double C4 = 0.980785280403; // cos(pi * 4 / 64)
|
||||
static const double C5 = 0.970031253195; // cos(pi * 5 / 64)
|
||||
static const double C6 = 0.956940335732; // cos(pi * 6 / 64)
|
||||
static const double C7 = 0.941544065183; // cos(pi * 7 / 64)
|
||||
static const double C8 = 0.923879532511; // cos(pi * 8 / 64)
|
||||
static const double C9 = 0.903989293123; // cos(pi * 9 / 64)
|
||||
static const double C10 = 0.881921264348; // cos(pi * 10 / 64)
|
||||
static const double C11 = 0.857728610000; // cos(pi * 11 / 64)
|
||||
static const double C12 = 0.831469612303; // cos(pi * 12 / 64)
|
||||
static const double C13 = 0.803207531481; // cos(pi * 13 / 64)
|
||||
static const double C14 = 0.773010453363; // cos(pi * 14 / 64)
|
||||
static const double C15 = 0.740951125355; // cos(pi * 15 / 64)
|
||||
static const double C16 = 0.707106781187; // cos(pi * 16 / 64)
|
||||
static const double C17 = 0.671558954847; // cos(pi * 17 / 64)
|
||||
static const double C18 = 0.634393284164; // cos(pi * 18 / 64)
|
||||
static const double C19 = 0.595699304492; // cos(pi * 19 / 64)
|
||||
static const double C20 = 0.555570233020; // cos(pi * 20 / 64)
|
||||
static const double C21 = 0.514102744193; // cos(pi * 21 / 64)
|
||||
static const double C22 = 0.471396736826; // cos(pi * 22 / 64)
|
||||
static const double C23 = 0.427555093430; // cos(pi * 23 / 64)
|
||||
static const double C24 = 0.382683432365; // cos(pi * 24 / 64)
|
||||
static const double C25 = 0.336889853392; // cos(pi * 25 / 64)
|
||||
static const double C26 = 0.290284677254; // cos(pi * 26 / 64)
|
||||
static const double C27 = 0.242980179903; // cos(pi * 27 / 64)
|
||||
static const double C28 = 0.195090322016; // cos(pi * 28 / 64)
|
||||
static const double C29 = 0.146730474455; // cos(pi * 29 / 64)
|
||||
static const double C30 = 0.098017140330; // cos(pi * 30 / 64)
|
||||
static const double C31 = 0.049067674327; // cos(pi * 31 / 64)
|
||||
|
||||
double step[32];
|
||||
|
||||
// Stage 1
|
||||
step[0] = input[stride*0] + input[stride*(32 - 1)];
|
||||
step[1] = input[stride*1] + input[stride*(32 - 2)];
|
||||
step[2] = input[stride*2] + input[stride*(32 - 3)];
|
||||
step[3] = input[stride*3] + input[stride*(32 - 4)];
|
||||
step[4] = input[stride*4] + input[stride*(32 - 5)];
|
||||
step[5] = input[stride*5] + input[stride*(32 - 6)];
|
||||
step[6] = input[stride*6] + input[stride*(32 - 7)];
|
||||
step[7] = input[stride*7] + input[stride*(32 - 8)];
|
||||
step[8] = input[stride*8] + input[stride*(32 - 9)];
|
||||
step[9] = input[stride*9] + input[stride*(32 - 10)];
|
||||
step[10] = input[stride*10] + input[stride*(32 - 11)];
|
||||
step[11] = input[stride*11] + input[stride*(32 - 12)];
|
||||
step[12] = input[stride*12] + input[stride*(32 - 13)];
|
||||
step[13] = input[stride*13] + input[stride*(32 - 14)];
|
||||
step[14] = input[stride*14] + input[stride*(32 - 15)];
|
||||
step[15] = input[stride*15] + input[stride*(32 - 16)];
|
||||
step[16] = -input[stride*16] + input[stride*(32 - 17)];
|
||||
step[17] = -input[stride*17] + input[stride*(32 - 18)];
|
||||
step[18] = -input[stride*18] + input[stride*(32 - 19)];
|
||||
step[19] = -input[stride*19] + input[stride*(32 - 20)];
|
||||
step[20] = -input[stride*20] + input[stride*(32 - 21)];
|
||||
step[21] = -input[stride*21] + input[stride*(32 - 22)];
|
||||
step[22] = -input[stride*22] + input[stride*(32 - 23)];
|
||||
step[23] = -input[stride*23] + input[stride*(32 - 24)];
|
||||
step[24] = -input[stride*24] + input[stride*(32 - 25)];
|
||||
step[25] = -input[stride*25] + input[stride*(32 - 26)];
|
||||
step[26] = -input[stride*26] + input[stride*(32 - 27)];
|
||||
step[27] = -input[stride*27] + input[stride*(32 - 28)];
|
||||
step[28] = -input[stride*28] + input[stride*(32 - 29)];
|
||||
step[29] = -input[stride*29] + input[stride*(32 - 30)];
|
||||
step[30] = -input[stride*30] + input[stride*(32 - 31)];
|
||||
step[31] = -input[stride*31] + input[stride*(32 - 32)];
|
||||
|
||||
// Stage 2
|
||||
output[stride*0] = step[0] + step[16 - 1];
|
||||
output[stride*1] = step[1] + step[16 - 2];
|
||||
output[stride*2] = step[2] + step[16 - 3];
|
||||
output[stride*3] = step[3] + step[16 - 4];
|
||||
output[stride*4] = step[4] + step[16 - 5];
|
||||
output[stride*5] = step[5] + step[16 - 6];
|
||||
output[stride*6] = step[6] + step[16 - 7];
|
||||
output[stride*7] = step[7] + step[16 - 8];
|
||||
output[stride*8] = -step[8] + step[16 - 9];
|
||||
output[stride*9] = -step[9] + step[16 - 10];
|
||||
output[stride*10] = -step[10] + step[16 - 11];
|
||||
output[stride*11] = -step[11] + step[16 - 12];
|
||||
output[stride*12] = -step[12] + step[16 - 13];
|
||||
output[stride*13] = -step[13] + step[16 - 14];
|
||||
output[stride*14] = -step[14] + step[16 - 15];
|
||||
output[stride*15] = -step[15] + step[16 - 16];
|
||||
|
||||
output[stride*16] = step[16];
|
||||
output[stride*17] = step[17];
|
||||
output[stride*18] = step[18];
|
||||
output[stride*19] = step[19];
|
||||
|
||||
output[stride*20] = (-step[20] + step[27])*C16;
|
||||
output[stride*21] = (-step[21] + step[26])*C16;
|
||||
output[stride*22] = (-step[22] + step[25])*C16;
|
||||
output[stride*23] = (-step[23] + step[24])*C16;
|
||||
|
||||
output[stride*24] = (step[24] + step[23])*C16;
|
||||
output[stride*25] = (step[25] + step[22])*C16;
|
||||
output[stride*26] = (step[26] + step[21])*C16;
|
||||
output[stride*27] = (step[27] + step[20])*C16;
|
||||
|
||||
output[stride*28] = step[28];
|
||||
output[stride*29] = step[29];
|
||||
output[stride*30] = step[30];
|
||||
output[stride*31] = step[31];
|
||||
|
||||
// Stage 3
|
||||
step[0] = output[stride*0] + output[stride*(8 - 1)];
|
||||
step[1] = output[stride*1] + output[stride*(8 - 2)];
|
||||
step[2] = output[stride*2] + output[stride*(8 - 3)];
|
||||
step[3] = output[stride*3] + output[stride*(8 - 4)];
|
||||
step[4] = -output[stride*4] + output[stride*(8 - 5)];
|
||||
step[5] = -output[stride*5] + output[stride*(8 - 6)];
|
||||
step[6] = -output[stride*6] + output[stride*(8 - 7)];
|
||||
step[7] = -output[stride*7] + output[stride*(8 - 8)];
|
||||
step[8] = output[stride*8];
|
||||
step[9] = output[stride*9];
|
||||
step[10] = (-output[stride*10] + output[stride*13])*C16;
|
||||
step[11] = (-output[stride*11] + output[stride*12])*C16;
|
||||
step[12] = (output[stride*12] + output[stride*11])*C16;
|
||||
step[13] = (output[stride*13] + output[stride*10])*C16;
|
||||
step[14] = output[stride*14];
|
||||
step[15] = output[stride*15];
|
||||
|
||||
step[16] = output[stride*16] + output[stride*23];
|
||||
step[17] = output[stride*17] + output[stride*22];
|
||||
step[18] = output[stride*18] + output[stride*21];
|
||||
step[19] = output[stride*19] + output[stride*20];
|
||||
step[20] = -output[stride*20] + output[stride*19];
|
||||
step[21] = -output[stride*21] + output[stride*18];
|
||||
step[22] = -output[stride*22] + output[stride*17];
|
||||
step[23] = -output[stride*23] + output[stride*16];
|
||||
step[24] = -output[stride*24] + output[stride*31];
|
||||
step[25] = -output[stride*25] + output[stride*30];
|
||||
step[26] = -output[stride*26] + output[stride*29];
|
||||
step[27] = -output[stride*27] + output[stride*28];
|
||||
step[28] = output[stride*28] + output[stride*27];
|
||||
step[29] = output[stride*29] + output[stride*26];
|
||||
step[30] = output[stride*30] + output[stride*25];
|
||||
step[31] = output[stride*31] + output[stride*24];
|
||||
|
||||
// Stage 4
|
||||
output[stride*0] = step[0] + step[3];
|
||||
output[stride*1] = step[1] + step[2];
|
||||
output[stride*2] = -step[2] + step[1];
|
||||
output[stride*3] = -step[3] + step[0];
|
||||
output[stride*4] = step[4];
|
||||
output[stride*5] = (-step[5] + step[6])*C16;
|
||||
output[stride*6] = (step[6] + step[5])*C16;
|
||||
output[stride*7] = step[7];
|
||||
output[stride*8] = step[8] + step[11];
|
||||
output[stride*9] = step[9] + step[10];
|
||||
output[stride*10] = -step[10] + step[9];
|
||||
output[stride*11] = -step[11] + step[8];
|
||||
output[stride*12] = -step[12] + step[15];
|
||||
output[stride*13] = -step[13] + step[14];
|
||||
output[stride*14] = step[14] + step[13];
|
||||
output[stride*15] = step[15] + step[12];
|
||||
|
||||
output[stride*16] = step[16];
|
||||
output[stride*17] = step[17];
|
||||
output[stride*18] = step[18]*-C8 + step[29]*C24;
|
||||
output[stride*19] = step[19]*-C8 + step[28]*C24;
|
||||
output[stride*20] = step[20]*-C24 + step[27]*-C8;
|
||||
output[stride*21] = step[21]*-C24 + step[26]*-C8;
|
||||
output[stride*22] = step[22];
|
||||
output[stride*23] = step[23];
|
||||
output[stride*24] = step[24];
|
||||
output[stride*25] = step[25];
|
||||
output[stride*26] = step[26]*C24 + step[21]*-C8;
|
||||
output[stride*27] = step[27]*C24 + step[20]*-C8;
|
||||
output[stride*28] = step[28]*C8 + step[19]*C24;
|
||||
output[stride*29] = step[29]*C8 + step[18]*C24;
|
||||
output[stride*30] = step[30];
|
||||
output[stride*31] = step[31];
|
||||
|
||||
// Stage 5
|
||||
step[0] = (output[stride*0] + output[stride*1]) * C16;
|
||||
step[1] = (-output[stride*1] + output[stride*0]) * C16;
|
||||
step[2] = output[stride*2]*C24 + output[stride*3] * C8;
|
||||
step[3] = output[stride*3]*C24 - output[stride*2] * C8;
|
||||
step[4] = output[stride*4] + output[stride*5];
|
||||
step[5] = -output[stride*5] + output[stride*4];
|
||||
step[6] = -output[stride*6] + output[stride*7];
|
||||
step[7] = output[stride*7] + output[stride*6];
|
||||
step[8] = output[stride*8];
|
||||
step[9] = output[stride*9]*-C8 + output[stride*14]*C24;
|
||||
step[10] = output[stride*10]*-C24 + output[stride*13]*-C8;
|
||||
step[11] = output[stride*11];
|
||||
step[12] = output[stride*12];
|
||||
step[13] = output[stride*13]*C24 + output[stride*10]*-C8;
|
||||
step[14] = output[stride*14]*C8 + output[stride*9]*C24;
|
||||
step[15] = output[stride*15];
|
||||
|
||||
step[16] = output[stride*16] + output[stride*19];
|
||||
step[17] = output[stride*17] + output[stride*18];
|
||||
step[18] = -output[stride*18] + output[stride*17];
|
||||
step[19] = -output[stride*19] + output[stride*16];
|
||||
step[20] = -output[stride*20] + output[stride*23];
|
||||
step[21] = -output[stride*21] + output[stride*22];
|
||||
step[22] = output[stride*22] + output[stride*21];
|
||||
step[23] = output[stride*23] + output[stride*20];
|
||||
step[24] = output[stride*24] + output[stride*27];
|
||||
step[25] = output[stride*25] + output[stride*26];
|
||||
step[26] = -output[stride*26] + output[stride*25];
|
||||
step[27] = -output[stride*27] + output[stride*24];
|
||||
step[28] = -output[stride*28] + output[stride*31];
|
||||
step[29] = -output[stride*29] + output[stride*30];
|
||||
step[30] = output[stride*30] + output[stride*29];
|
||||
step[31] = output[stride*31] + output[stride*28];
|
||||
|
||||
// Stage 6
|
||||
output[stride*0] = step[0];
|
||||
output[stride*1] = step[1];
|
||||
output[stride*2] = step[2];
|
||||
output[stride*3] = step[3];
|
||||
output[stride*4] = step[4]*C28 + step[7]*C4;
|
||||
output[stride*5] = step[5]*C12 + step[6]*C20;
|
||||
output[stride*6] = step[6]*C12 + step[5]*-C20;
|
||||
output[stride*7] = step[7]*C28 + step[4]*-C4;
|
||||
output[stride*8] = step[8] + step[9];
|
||||
output[stride*9] = -step[9] + step[8];
|
||||
output[stride*10] = -step[10] + step[11];
|
||||
output[stride*11] = step[11] + step[10];
|
||||
output[stride*12] = step[12] + step[13];
|
||||
output[stride*13] = -step[13] + step[12];
|
||||
output[stride*14] = -step[14] + step[15];
|
||||
output[stride*15] = step[15] + step[14];
|
||||
|
||||
output[stride*16] = step[16];
|
||||
output[stride*17] = step[17]*-C4 + step[30]*C28;
|
||||
output[stride*18] = step[18]*-C28 + step[29]*-C4;
|
||||
output[stride*19] = step[19];
|
||||
output[stride*20] = step[20];
|
||||
output[stride*21] = step[21]*-C20 + step[26]*C12;
|
||||
output[stride*22] = step[22]*-C12 + step[25]*-C20;
|
||||
output[stride*23] = step[23];
|
||||
output[stride*24] = step[24];
|
||||
output[stride*25] = step[25]*C12 + step[22]*-C20;
|
||||
output[stride*26] = step[26]*C20 + step[21]*C12;
|
||||
output[stride*27] = step[27];
|
||||
output[stride*28] = step[28];
|
||||
output[stride*29] = step[29]*C28 + step[18]*-C4;
|
||||
output[stride*30] = step[30]*C4 + step[17]*C28;
|
||||
output[stride*31] = step[31];
|
||||
|
||||
// Stage 7
|
||||
step[0] = output[stride*0];
|
||||
step[1] = output[stride*1];
|
||||
step[2] = output[stride*2];
|
||||
step[3] = output[stride*3];
|
||||
step[4] = output[stride*4];
|
||||
step[5] = output[stride*5];
|
||||
step[6] = output[stride*6];
|
||||
step[7] = output[stride*7];
|
||||
step[8] = output[stride*8]*C30 + output[stride*15]*C2;
|
||||
step[9] = output[stride*9]*C14 + output[stride*14]*C18;
|
||||
step[10] = output[stride*10]*C22 + output[stride*13]*C10;
|
||||
step[11] = output[stride*11]*C6 + output[stride*12]*C26;
|
||||
step[12] = output[stride*12]*C6 + output[stride*11]*-C26;
|
||||
step[13] = output[stride*13]*C22 + output[stride*10]*-C10;
|
||||
step[14] = output[stride*14]*C14 + output[stride*9]*-C18;
|
||||
step[15] = output[stride*15]*C30 + output[stride*8]*-C2;
|
||||
|
||||
step[16] = output[stride*16] + output[stride*17];
|
||||
step[17] = -output[stride*17] + output[stride*16];
|
||||
step[18] = -output[stride*18] + output[stride*19];
|
||||
step[19] = output[stride*19] + output[stride*18];
|
||||
step[20] = output[stride*20] + output[stride*21];
|
||||
step[21] = -output[stride*21] + output[stride*20];
|
||||
step[22] = -output[stride*22] + output[stride*23];
|
||||
step[23] = output[stride*23] + output[stride*22];
|
||||
step[24] = output[stride*24] + output[stride*25];
|
||||
step[25] = -output[stride*25] + output[stride*24];
|
||||
step[26] = -output[stride*26] + output[stride*27];
|
||||
step[27] = output[stride*27] + output[stride*26];
|
||||
step[28] = output[stride*28] + output[stride*29];
|
||||
step[29] = -output[stride*29] + output[stride*28];
|
||||
step[30] = -output[stride*30] + output[stride*31];
|
||||
step[31] = output[stride*31] + output[stride*30];
|
||||
|
||||
// Final stage --- outputs indices are bit-reversed.
|
||||
output[stride*0] = step[0];
|
||||
output[stride*16] = step[1];
|
||||
output[stride*8] = step[2];
|
||||
output[stride*24] = step[3];
|
||||
output[stride*4] = step[4];
|
||||
output[stride*20] = step[5];
|
||||
output[stride*12] = step[6];
|
||||
output[stride*28] = step[7];
|
||||
output[stride*2] = step[8];
|
||||
output[stride*18] = step[9];
|
||||
output[stride*10] = step[10];
|
||||
output[stride*26] = step[11];
|
||||
output[stride*6] = step[12];
|
||||
output[stride*22] = step[13];
|
||||
output[stride*14] = step[14];
|
||||
output[stride*30] = step[15];
|
||||
|
||||
output[stride*1] = step[16]*C31 + step[31]*C1;
|
||||
output[stride*17] = step[17]*C15 + step[30]*C17;
|
||||
output[stride*9] = step[18]*C23 + step[29]*C9;
|
||||
output[stride*25] = step[19]*C7 + step[28]*C25;
|
||||
output[stride*5] = step[20]*C27 + step[27]*C5;
|
||||
output[stride*21] = step[21]*C11 + step[26]*C21;
|
||||
output[stride*13] = step[22]*C19 + step[25]*C13;
|
||||
output[stride*29] = step[23]*C3 + step[24]*C29;
|
||||
output[stride*3] = step[24]*C3 + step[23]*-C29;
|
||||
output[stride*19] = step[25]*C19 + step[22]*-C13;
|
||||
output[stride*11] = step[26]*C11 + step[21]*-C21;
|
||||
output[stride*27] = step[27]*C27 + step[20]*-C5;
|
||||
output[stride*7] = step[28]*C7 + step[19]*-C25;
|
||||
output[stride*23] = step[29]*C23 + step[18]*-C9;
|
||||
output[stride*15] = step[30]*C15 + step[17]*-C17;
|
||||
output[stride*31] = step[31]*C31 + step[16]*-C1;
|
||||
}
|
||||
|
||||
// Reference (double-precision) 64-point 1-D forward DCT.
//
// Uses the classic even/odd recursive decomposition: the 64-point DCT is
// split into two 32-point problems (handled by dct32_1d) built from the
// symmetric sums and the cosine-weighted antisymmetric differences of the
// input.  Even-indexed outputs come directly from the first 32-point DCT;
// odd-indexed outputs are recovered from the second one through the
// recurrence X[2k+1] = 2*G[k] - X[2k-1].
//
// input/output : arrays of 64 samples, addressed as input[stride * n].
// stride       : element step between consecutive samples, letting the
//                same routine process either rows or columns in place.
static void dct64_1d(double *input, double *output, int stride) {
  double step1[64], step2[64];
  int i;
  // C[n] = cos(n * pi / 128), the twiddle factors for a 64-point DCT.
  static const double C[64] = {
    1.00000000000000000000,  // cos(0 * pi / 128)
    0.99969881869620424997,  // cos(1 * pi / 128)
    0.99879545620517240501,  // cos(2 * pi / 128)
    0.99729045667869020697,  // cos(3 * pi / 128)
    0.99518472667219692873,  // cos(4 * pi / 128)
    0.99247953459870996706,  // cos(5 * pi / 128)
    0.98917650996478101444,  // cos(6 * pi / 128)
    0.98527764238894122162,  // cos(7 * pi / 128)
    0.98078528040323043058,  // cos(8 * pi / 128)
    0.97570213003852857003,  // cos(9 * pi / 128)
    0.97003125319454397424,  // cos(10 * pi / 128)
    0.96377606579543984022,  // cos(11 * pi / 128)
    0.95694033573220882438,  // cos(12 * pi / 128)
    0.94952818059303667475,  // cos(13 * pi / 128)
    0.94154406518302080631,  // cos(14 * pi / 128)
    0.93299279883473895669,  // cos(15 * pi / 128)
    0.92387953251128673848,  // cos(16 * pi / 128)
    0.91420975570353069095,  // cos(17 * pi / 128)
    0.90398929312344333820,  // cos(18 * pi / 128)
    0.89322430119551532446,  // cos(19 * pi / 128)
    0.88192126434835504956,  // cos(20 * pi / 128)
    0.87008699110871146054,  // cos(21 * pi / 128)
    0.85772861000027211809,  // cos(22 * pi / 128)
    0.84485356524970711689,  // cos(23 * pi / 128)
    0.83146961230254523567,  // cos(24 * pi / 128)
    0.81758481315158371139,  // cos(25 * pi / 128)
    0.80320753148064494287,  // cos(26 * pi / 128)
    0.78834642762660633863,  // cos(27 * pi / 128)
    0.77301045336273699338,  // cos(28 * pi / 128)
    0.75720884650648456748,  // cos(29 * pi / 128)
    0.74095112535495921691,  // cos(30 * pi / 128)
    0.72424708295146700276,  // cos(31 * pi / 128)
    0.70710678118654757274,  // cos(32 * pi / 128)
    0.68954054473706694051,  // cos(33 * pi / 128)
    0.67155895484701844111,  // cos(34 * pi / 128)
    0.65317284295377686654,  // cos(35 * pi / 128)
    0.63439328416364559882,  // cos(36 * pi / 128)
    0.61523159058062693028,  // cos(37 * pi / 128)
    0.59569930449243346793,  // cos(38 * pi / 128)
    0.57580819141784544968,  // cos(39 * pi / 128)
    0.55557023301960228867,  // cos(40 * pi / 128)
    0.53499761988709737537,  // cos(41 * pi / 128)
    0.51410274419322177231,  // cos(42 * pi / 128)
    0.49289819222978414892,  // cos(43 * pi / 128)
    0.47139673682599780857,  // cos(44 * pi / 128)
    0.44961132965460659516,  // cos(45 * pi / 128)
    0.42755509343028219593,  // cos(46 * pi / 128)
    0.40524131400498980549,  // cos(47 * pi / 128)
    0.38268343236508983729,  // cos(48 * pi / 128)
    0.35989503653498827740,  // cos(49 * pi / 128)
    0.33688985339222005111,  // cos(50 * pi / 128)
    0.31368174039889151761,  // cos(51 * pi / 128)
    0.29028467725446227554,  // cos(52 * pi / 128)
    0.26671275747489842090,  // cos(53 * pi / 128)
    0.24298017990326398197,  // cos(54 * pi / 128)
    0.21910124015686976984,  // cos(55 * pi / 128)
    0.19509032201612830359,  // cos(56 * pi / 128)
    0.17096188876030135595,  // cos(57 * pi / 128)
    0.14673047445536174793,  // cos(58 * pi / 128)
    0.12241067519921627893,  // cos(59 * pi / 128)
    0.09801714032956077016,  // cos(60 * pi / 128)
    0.07356456359966745406,  // cos(61 * pi / 128)
    0.04906767432741813290,  // cos(62 * pi / 128)
    0.02454122852291226731,  // cos(63 * pi / 128)
  };

  // Split into symmetric sums (first half) and cosine-weighted
  // antisymmetric differences (second half).
  for (i = 0; i < 32; ++i) {
    step1[i] = input[stride * i] + input[stride * (63 - i)];
    step1[32 + i] = (input[stride * i] -
                     input[stride * (63 - i)]) * C[i * 2 + 1];
  }

  // Two 32-point sub-transforms on the contiguous halves (stride 1).
  dct32_1d(step1, step2, 1);
  dct32_1d(step1 + 32, step2 + 32, 1);

  // Even outputs come straight from the first sub-transform.
  for (i = 0; i < 64; i += 2) {
    output[stride*i] = step2[i / 2];
  }
  // Odd outputs via the recurrence X[2k+1] = 2*G[k] - X[2k-1];
  // the base case uses C[32] = cos(pi/4).
  output[stride * 1] = 2 * step2[32] * C[32];
  for (i = 3; i < 64; i += 2) {
    output[stride * i] = 2 * step2[32 + i / 2] - output[stride * (i - 2)];
  }
}
|
||||
|
||||
// Reference forward 64x64 2-D DCT (double precision, separable).
// Applies dct64_1d to every column, then to every row, then rescales
// and rounds the result into the integer coefficient buffer.
void vp9_fdct64x64_c(const int16_t *input, tran_low_t *out, int stride) {
  // vp9_clear_system_state(); // Make it simd safe : __asm emms;
  {
    double buf[4096];  // 64x64 intermediate, row-major. NOTE(review):
                       // 32 KiB of stack — confirm acceptable on all targets.
    int row, col, k;

    // Pass 1: transform each column into buf.
    for (col = 0; col < 64; ++col) {
      double col_in[64], col_out[64];
      for (k = 0; k < 64; ++k)
        col_in[k] = input[k * stride + col];
      dct64_1d(col_in, col_out, 1);
      for (k = 0; k < 64; ++k)
        buf[k * 64 + col] = col_out[k];
    }

    // Pass 2: transform each row of buf in place.
    for (row = 0; row < 64; ++row) {
      double row_in[64], row_out[64];
      for (k = 0; k < 64; ++k)
        row_in[k] = buf[row * 64 + k];
      dct64_1d(row_in, row_out, 1);
      for (k = 0; k < 64; ++k)
        buf[row * 64 + k] = row_out[k];
    }

    // Normalize (divide by 16) and round to the nearest integer.
    for (k = 0; k < 4096; ++k) {
      out[k] = (tran_low_t)round(buf[k] / 16);
    }
  }
  // vp9_clear_system_state(); // Make it simd safe : __asm emms;
}
|
||||
|
||||
// DC-only forward 64x64 transform: produces just the DC coefficient
// (scaled sum of all 4096 input samples) and zeroes the next coefficient.
// Used on paths that only need the DC term.
void vp9_fdct64x64_1_c(const int16_t *input, tran_low_t *output, int stride) {
  int r, c;
  // Accumulate in a plain int: 4096 * INT16_MAX fits comfortably in 32
  // bits, whereas tran_low_t may be only 16 bits in non-highbitdepth
  // builds and would overflow.  Matches vp9_fdct32x32_1_c, which also
  // uses an int accumulator.
  int sum = 0;
  for (r = 0; r < 64; ++r)
    for (c = 0; c < 64; ++c)
      sum += input[r * stride + c];

  output[0] = (tran_low_t)(sum >> 5);
  output[1] = 0;
}
|
||||
#endif
|
||||
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
void vp9_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output,
|
||||
int stride) {
|
||||
@ -1498,4 +1950,15 @@ void vp9_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *out,
|
||||
int stride) {
|
||||
vp9_fdct32x32_rd_c(input, out, stride);
|
||||
}
|
||||
|
||||
#if CONFIG_TX64X64
|
||||
// High-bitdepth DC-only 64x64 forward transform.  Currently just forwards
// to the 8-bit reference implementation; the DC sum is bit-depth agnostic
// at this stage.
void vp9_highbd_fdct64x64_1_c(const int16_t *input, tran_low_t *out,
                              int stride) {
  vp9_fdct64x64_1_c(input, out, stride);
}
|
||||
|
||||
// High-bitdepth forward 64x64 transform.  Placeholder that forwards to the
// double-precision reference path (same pattern as the highbd 32x32
// wrappers above); no bit-depth-specific scaling is applied here.
void vp9_highbd_fdct64x64_c(const int16_t *input, tran_low_t *out, int stride) {
  vp9_fdct64x64_c(input, out, stride);
}
|
||||
#endif // CONFIG_TX64X64
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
@ -661,11 +661,23 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
|
||||
|
||||
// FIXME(rbultje) I'm pretty sure this should go to the end of this block
|
||||
// (i.e. after the output_enabled)
|
||||
#if CONFIG_TX64X64
|
||||
if (bsize < BLOCK_64X64) {
|
||||
if (bsize < BLOCK_32X32) {
|
||||
if (bsize < BLOCK_16X16) {
|
||||
ctx->tx_rd_diff[ALLOW_16X16] = ctx->tx_rd_diff[ALLOW_8X8];
|
||||
}
|
||||
ctx->tx_rd_diff[ALLOW_32X32] = ctx->tx_rd_diff[ALLOW_16X16];
|
||||
}
|
||||
ctx->tx_rd_diff[ALLOW_64X64] = ctx->tx_rd_diff[ALLOW_32X32];
|
||||
}
|
||||
#else
|
||||
if (bsize < BLOCK_32X32) {
|
||||
if (bsize < BLOCK_16X16)
|
||||
ctx->tx_rd_diff[ALLOW_16X16] = ctx->tx_rd_diff[ALLOW_8X8];
|
||||
ctx->tx_rd_diff[ALLOW_32X32] = ctx->tx_rd_diff[ALLOW_16X16];
|
||||
}
|
||||
#endif
|
||||
|
||||
if (is_inter_block(mbmi) && mbmi->sb_type < BLOCK_8X8) {
|
||||
mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int;
|
||||
@ -2581,8 +2593,8 @@ static void encode_rd_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
|
||||
set_fixed_partitioning(cpi, tile, mi, mi_row, mi_col, bsize);
|
||||
rd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
|
||||
&dummy_rate, &dummy_dist, 1, cpi->pc_root);
|
||||
} else if (sf->partition_search_type == VAR_BASED_PARTITION &&
|
||||
cm->frame_type != KEY_FRAME ) {
|
||||
} else if (sf->partition_search_type == VAR_BASED_PARTITION &&
|
||||
cm->frame_type != KEY_FRAME ) {
|
||||
choose_partitioning(cpi, tile, mi_row, mi_col);
|
||||
rd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
|
||||
&dummy_rate, &dummy_dist, 1, cpi->pc_root);
|
||||
@ -2678,7 +2690,11 @@ static TX_MODE select_tx_mode(const VP9_COMP *cpi) {
|
||||
if (cpi->mb.e_mbd.lossless)
|
||||
return ONLY_4X4;
|
||||
if (cpi->sf.tx_size_search_method == USE_LARGESTALL)
|
||||
#if CONFIG_TX64X64
|
||||
return ALLOW_64X64;
|
||||
#else
|
||||
return ALLOW_32X32;
|
||||
#endif
|
||||
else if (cpi->sf.tx_size_search_method == USE_FULL_RD||
|
||||
cpi->sf.tx_size_search_method == USE_TX_8X8)
|
||||
return TX_MODE_SELECT;
|
||||
@ -3404,9 +3420,9 @@ static void encode_frame_internal(VP9_COMP *cpi) {
|
||||
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
if (cm->use_highbitdepth)
|
||||
x->fwd_txm4x4 = xd->lossless ? vp9_fwht4x4 : vp9_fdct4x4;
|
||||
else
|
||||
x->fwd_txm4x4 = xd->lossless ? vp9_highbd_fwht4x4 : vp9_highbd_fdct4x4;
|
||||
else
|
||||
x->fwd_txm4x4 = xd->lossless ? vp9_fwht4x4 : vp9_fdct4x4;
|
||||
x->highbd_itxm_add = xd->lossless ? vp9_highbd_iwht4x4_add :
|
||||
vp9_highbd_idct4x4_add;
|
||||
#else
|
||||
@ -3581,41 +3597,99 @@ void vp9_encode_frame(VP9_COMP *cpi) {
|
||||
}
|
||||
}
|
||||
|
||||
#if CONFIG_TX64X64
|
||||
if (cm->tx_mode == TX_MODE_SELECT) {
|
||||
int count4x4 = 0;
|
||||
int count8x8_lp = 0, count8x8_8x8p = 0;
|
||||
int count4x4_lp = 0;
|
||||
int count8x8_8x8p = 0, count8x8_lp = 0;
|
||||
int count16x16_16x16p = 0, count16x16_lp = 0;
|
||||
int count32x32 = 0;
|
||||
int count32x32_32x32p = 0, count32x32_lp = 0;
|
||||
int count64x64_64x64p = 0;
|
||||
|
||||
for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
|
||||
count4x4 += cm->counts.tx.p32x32[i][TX_4X4];
|
||||
count4x4 += cm->counts.tx.p16x16[i][TX_4X4];
|
||||
count4x4 += cm->counts.tx.p8x8[i][TX_4X4];
|
||||
count4x4_lp += cm->counts.tx.p64x64[i][TX_4X4];
|
||||
count4x4_lp += cm->counts.tx.p32x32[i][TX_4X4];
|
||||
count4x4_lp += cm->counts.tx.p16x16[i][TX_4X4];
|
||||
count4x4_lp += cm->counts.tx.p8x8[i][TX_4X4];
|
||||
|
||||
count8x8_lp += cm->counts.tx.p64x64[i][TX_8X8];
|
||||
count8x8_lp += cm->counts.tx.p32x32[i][TX_8X8];
|
||||
count8x8_lp += cm->counts.tx.p16x16[i][TX_8X8];
|
||||
count8x8_8x8p += cm->counts.tx.p8x8[i][TX_8X8];
|
||||
|
||||
count16x16_lp += cm->counts.tx.p64x64[i][TX_16X16];
|
||||
count16x16_lp += cm->counts.tx.p32x32[i][TX_16X16];
|
||||
count16x16_16x16p += cm->counts.tx.p16x16[i][TX_16X16];
|
||||
|
||||
count32x32_lp += cm->counts.tx.p64x64[i][TX_32X32];
|
||||
count32x32_32x32p += cm->counts.tx.p32x32[i][TX_32X32];
|
||||
|
||||
count64x64_64x64p += cm->counts.tx.p64x64[i][TX_64X64];
|
||||
}
|
||||
|
||||
if (count4x4_lp == 0 && count16x16_lp == 0 && count16x16_16x16p == 0 &&
|
||||
count32x32_lp == 0 && count32x32_32x32p == 0 &&
|
||||
count64x64_64x64p == 0) {
|
||||
cm->tx_mode = ALLOW_8X8;
|
||||
reset_skip_tx_size(cm, TX_8X8);
|
||||
} else if (count8x8_8x8p == 0 && count8x8_lp == 0 &&
|
||||
count16x16_16x16p == 0 && count16x16_lp == 0 &&
|
||||
count32x32_32x32p == 0 && count32x32_lp == 0 &&
|
||||
count64x64_64x64p == 0) {
|
||||
cm->tx_mode = ONLY_4X4;
|
||||
reset_skip_tx_size(cm, TX_4X4);
|
||||
} else if (count4x4_lp == 0 && count8x8_lp == 0 && count16x16_lp == 0 &&
|
||||
count32x32_lp == 0) {
|
||||
cm->tx_mode = ALLOW_64X64;
|
||||
} else if (count4x4_lp == 0 && count8x8_lp == 0 && count16x16_lp == 0 &&
|
||||
count64x64_64x64p == 0) {
|
||||
cm->tx_mode = ALLOW_32X32;
|
||||
reset_skip_tx_size(cm, TX_32X32);
|
||||
} else if (count4x4_lp == 0 && count8x8_lp == 0 &&
|
||||
count32x32_lp == 0 && count32x32_32x32p == 0 &&
|
||||
count64x64_64x64p == 0) {
|
||||
cm->tx_mode = ALLOW_16X16;
|
||||
reset_skip_tx_size(cm, TX_16X16);
|
||||
}
|
||||
}
|
||||
#else
|
||||
if (cm->tx_mode == TX_MODE_SELECT) {
|
||||
int count4x4_lp = 0;
|
||||
int count8x8_8x8p = 0, count8x8_lp = 0;
|
||||
int count16x16_16x16p = 0, count16x16_lp = 0;
|
||||
int count32x32_32x32p = 0;
|
||||
|
||||
for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
|
||||
count4x4_lp += cm->counts.tx.p32x32[i][TX_4X4];
|
||||
count4x4_lp += cm->counts.tx.p16x16[i][TX_4X4];
|
||||
count4x4_lp += cm->counts.tx.p8x8[i][TX_4X4];
|
||||
|
||||
count8x8_lp += cm->counts.tx.p32x32[i][TX_8X8];
|
||||
count8x8_lp += cm->counts.tx.p16x16[i][TX_8X8];
|
||||
count8x8_8x8p += cm->counts.tx.p8x8[i][TX_8X8];
|
||||
|
||||
count16x16_16x16p += cm->counts.tx.p16x16[i][TX_16X16];
|
||||
count16x16_lp += cm->counts.tx.p32x32[i][TX_16X16];
|
||||
count32x32 += cm->counts.tx.p32x32[i][TX_32X32];
|
||||
count16x16_16x16p += cm->counts.tx.p16x16[i][TX_16X16];
|
||||
count32x32_32x32p += cm->counts.tx.p32x32[i][TX_32X32];
|
||||
}
|
||||
|
||||
if (count4x4 == 0 && count16x16_lp == 0 && count16x16_16x16p == 0 &&
|
||||
count32x32 == 0) {
|
||||
if (count4x4_lp == 0 && count16x16_lp == 0 && count16x16_16x16p == 0 &&
|
||||
count32x32_32x32p == 0) {
|
||||
cm->tx_mode = ALLOW_8X8;
|
||||
reset_skip_tx_size(cm, TX_8X8);
|
||||
} else if (count8x8_8x8p == 0 && count16x16_16x16p == 0 &&
|
||||
count8x8_lp == 0 && count16x16_lp == 0 && count32x32 == 0) {
|
||||
count8x8_lp == 0 && count16x16_lp == 0 &&
|
||||
count32x32_32x32p == 0) {
|
||||
cm->tx_mode = ONLY_4X4;
|
||||
reset_skip_tx_size(cm, TX_4X4);
|
||||
} else if (count8x8_lp == 0 && count16x16_lp == 0 && count4x4 == 0) {
|
||||
} else if (count8x8_lp == 0 && count16x16_lp == 0 && count4x4_lp == 0) {
|
||||
cm->tx_mode = ALLOW_32X32;
|
||||
} else if (count32x32 == 0 && count8x8_lp == 0 && count4x4 == 0) {
|
||||
} else if (count32x32_32x32p == 0 && count8x8_lp == 0 &&
|
||||
count4x4_lp == 0) {
|
||||
cm->tx_mode = ALLOW_16X16;
|
||||
reset_skip_tx_size(cm, TX_16X16);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
} else {
|
||||
cm->reference_mode = SINGLE_REFERENCE;
|
||||
encode_frame_internal(cpi);
|
||||
|
@ -135,16 +135,16 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block,
|
||||
struct macroblock_plane *const p = &mb->plane[plane];
|
||||
struct macroblockd_plane *const pd = &xd->plane[plane];
|
||||
const int ref = is_inter_block(&xd->mi[0].src_mi->mbmi);
|
||||
vp9_token_state tokens[1025][2];
|
||||
unsigned best_index[1025][2];
|
||||
uint8_t token_cache[1024];
|
||||
vp9_token_state tokens[MAX_NUM_COEFS + 1][2];
|
||||
unsigned best_index[MAX_NUM_COEFS + 1][2];
|
||||
uint8_t token_cache[MAX_NUM_COEFS];
|
||||
const tran_low_t *const coeff = BLOCK_OFFSET(mb->plane[plane].coeff, block);
|
||||
tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
|
||||
tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
|
||||
const int eob = p->eobs[block];
|
||||
const PLANE_TYPE type = pd->plane_type;
|
||||
const int default_eob = 16 << (tx_size << 1);
|
||||
const int mul = 1 + (tx_size == TX_32X32);
|
||||
const int mul = 1 << (tx_size >= TX_32X32 ? tx_size - TX_16X16 : 0);
|
||||
const int16_t *dequant_ptr = pd->dequant;
|
||||
const uint8_t *const band_translate = get_band_translate(tx_size);
|
||||
const scan_order *const so = get_scan(xd, tx_size, type, block);
|
||||
@ -392,6 +392,16 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
|
||||
switch (tx_size) {
|
||||
#if CONFIG_TX64X64
|
||||
case TX_64X64:
|
||||
vp9_highbd_fdct64x64(src_diff, coeff, diff_stride);
|
||||
vp9_highbd_quantize_fp_64x64(coeff, 4096, x->skip_block, p->zbin,
|
||||
p->round_fp, p->quant_fp, p->quant_shift,
|
||||
qcoeff, dqcoeff, pd->dequant,
|
||||
p->zbin_extra, eob, scan_order->scan,
|
||||
scan_order->iscan);
|
||||
break;
|
||||
#endif // CONFIG_TX64X64
|
||||
case TX_32X32:
|
||||
highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
|
||||
vp9_highbd_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin,
|
||||
@ -429,6 +439,15 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
||||
switch (tx_size) {
|
||||
#if CONFIG_TX64X64
|
||||
case TX_64X64:
|
||||
vp9_fdct64x64(src_diff, coeff, diff_stride);
|
||||
vp9_quantize_fp_64x64(coeff, 4096, x->skip_block, p->zbin, p->round_fp,
|
||||
p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
|
||||
pd->dequant, p->zbin_extra, eob, scan_order->scan,
|
||||
scan_order->iscan);
|
||||
break;
|
||||
#endif // CONFIG_TX64X64
|
||||
case TX_32X32:
|
||||
fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
|
||||
vp9_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin, p->round_fp,
|
||||
@ -482,6 +501,14 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block,
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
|
||||
switch (tx_size) {
|
||||
#if CONFIG_TX64X64
|
||||
case TX_64X64:
|
||||
vp9_highbd_fdct64x64_1(src_diff, coeff, diff_stride);
|
||||
vp9_highbd_quantize_dc_64x64(coeff, x->skip_block, p->round,
|
||||
p->quant_fp[0], qcoeff, dqcoeff,
|
||||
pd->dequant[0], eob);
|
||||
break;
|
||||
#endif // CONFIG_TX64X64
|
||||
case TX_32X32:
|
||||
vp9_highbd_fdct32x32_1(src_diff, coeff, diff_stride);
|
||||
vp9_highbd_quantize_dc_32x32(coeff, x->skip_block, p->round,
|
||||
@ -514,6 +541,14 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block,
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
||||
switch (tx_size) {
|
||||
#if CONFIG_TX64X64
|
||||
case TX_64X64:
|
||||
vp9_fdct64x64_1(src_diff, coeff, diff_stride);
|
||||
vp9_quantize_dc_64x64(coeff, x->skip_block, p->round,
|
||||
p->quant_fp[0], qcoeff, dqcoeff,
|
||||
pd->dequant[0], eob);
|
||||
break;
|
||||
#endif // CONFIG_TX64X64
|
||||
case TX_32X32:
|
||||
vp9_fdct32x32_1(src_diff, coeff, diff_stride);
|
||||
vp9_quantize_dc_32x32(coeff, x->skip_block, p->round,
|
||||
@ -563,6 +598,15 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
|
||||
switch (tx_size) {
|
||||
#if CONFIG_TX64X64
|
||||
case TX_64X64:
|
||||
vp9_highbd_fdct64x64(src_diff, coeff, diff_stride);
|
||||
vp9_highbd_quantize_b_64x64(coeff, 4096, x->skip_block, p->zbin,
|
||||
p->round, p->quant, p->quant_shift, qcoeff,
|
||||
dqcoeff, pd->dequant, p->zbin_extra, eob,
|
||||
scan_order->scan, scan_order->iscan);
|
||||
break;
|
||||
#endif // CONFIG_TX64X64
|
||||
case TX_32X32:
|
||||
highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
|
||||
vp9_highbd_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin,
|
||||
@ -599,6 +643,15 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
||||
switch (tx_size) {
|
||||
#if CONFIG_TX64X64
|
||||
case TX_64X64:
|
||||
vp9_fdct64x64(src_diff, coeff, diff_stride);
|
||||
vp9_quantize_b_64x64(coeff, 4096, x->skip_block, p->zbin, p->round,
|
||||
p->quant, p->quant_shift, qcoeff, dqcoeff,
|
||||
pd->dequant, p->zbin_extra, eob, scan_order->scan,
|
||||
scan_order->iscan);
|
||||
break;
|
||||
#endif // CONFIG_TX64X64
|
||||
case TX_32X32:
|
||||
fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
|
||||
vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
|
||||
@ -649,6 +702,7 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
|
||||
dst = &pd->dst.buf[4 * j * pd->dst.stride + 4 * i];
|
||||
a = &ctx->ta[plane][i];
|
||||
l = &ctx->tl[plane][j];
|
||||
if (plane) assert(tx_size != TX_64X64);
|
||||
|
||||
// TODO(jingning): per transformed block zero forcing only enabled for
|
||||
// luma component. will integrate chroma components as well.
|
||||
@ -695,6 +749,12 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
|
||||
switch (tx_size) {
|
||||
#if CONFIG_TX64X64
|
||||
case TX_64X64:
|
||||
vp9_highbd_idct64x64_add(dqcoeff, dst, pd->dst.stride,
|
||||
p->eobs[block], xd->bd);
|
||||
break;
|
||||
#endif // CONFIG_TX64X64
|
||||
case TX_32X32:
|
||||
vp9_highbd_idct32x32_add(dqcoeff, dst, pd->dst.stride,
|
||||
p->eobs[block], xd->bd);
|
||||
@ -722,6 +782,11 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
||||
switch (tx_size) {
|
||||
#if CONFIG_TX64X64
|
||||
case TX_64X64:
|
||||
vp9_idct64x64_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
|
||||
break;
|
||||
#endif
|
||||
case TX_32X32:
|
||||
vp9_idct32x32_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
|
||||
break;
|
||||
@ -832,6 +897,29 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
|
||||
switch (tx_size) {
|
||||
#if CONFIG_TX64X64
|
||||
case TX_64X64:
|
||||
scan_order = &vp9_default_scan_orders[TX_64X64];
|
||||
mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
|
||||
vp9_predict_intra_block(xd, block >> 8, bwl, TX_64X64, mode,
|
||||
x->skip_encode ? src : dst,
|
||||
x->skip_encode ? src_stride : dst_stride,
|
||||
dst, dst_stride, i, j, plane);
|
||||
if (!x->skip_recode) {
|
||||
vp9_highbd_subtract_block(64, 64, src_diff, diff_stride,
|
||||
src, src_stride, dst, dst_stride, xd->bd);
|
||||
vp9_highbd_fdct64x64(src_diff, coeff, diff_stride);
|
||||
vp9_highbd_quantize_b_64x64(coeff, 4096, x->skip_block, p->zbin,
|
||||
p->round, p->quant, p->quant_shift,
|
||||
qcoeff, dqcoeff, pd->dequant,
|
||||
p->zbin_extra, eob,
|
||||
scan_order->scan, scan_order->iscan);
|
||||
if (!x->skip_encode && *eob) {
|
||||
vp9_highbd_idct64x64_add(dqcoeff, dst, dst_stride, *eob, xd->bd);
|
||||
}
|
||||
}
|
||||
break;
|
||||
#endif // CONFIG_TX64X64
|
||||
case TX_32X32:
|
||||
scan_order = &vp9_default_scan_orders[TX_32X32];
|
||||
mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
|
||||
@ -941,6 +1029,28 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
||||
switch (tx_size) {
|
||||
#if CONFIG_TX64X64
|
||||
case TX_64X64:
|
||||
assert(plane == 0);
|
||||
scan_order = &vp9_default_scan_orders[TX_64X64];
|
||||
mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
|
||||
vp9_predict_intra_block(xd, block >> 8, bwl, TX_64X64, mode,
|
||||
x->skip_encode ? src : dst,
|
||||
x->skip_encode ? src_stride : dst_stride,
|
||||
dst, dst_stride, i, j, plane);
|
||||
if (!x->skip_recode) {
|
||||
vp9_subtract_block(64, 64, src_diff, diff_stride,
|
||||
src, src_stride, dst, dst_stride);
|
||||
vp9_fdct64x64(src_diff, coeff, diff_stride);
|
||||
vp9_quantize_b_64x64(coeff, 4096, x->skip_block, p->zbin, p->round,
|
||||
p->quant, p->quant_shift, qcoeff, dqcoeff,
|
||||
pd->dequant, p->zbin_extra, eob, scan_order->scan,
|
||||
scan_order->iscan);
|
||||
}
|
||||
if (!x->skip_encode && *eob)
|
||||
vp9_idct64x64_add(dqcoeff, dst, dst_stride, *eob);
|
||||
break;
|
||||
#endif // CONFIG_TX64X64
|
||||
case TX_32X32:
|
||||
scan_order = &vp9_default_scan_orders[TX_32X32];
|
||||
mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
|
||||
|
@ -3136,7 +3136,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
|
||||
release_scaled_references(cpi);
|
||||
vp9_update_reference_frames(cpi);
|
||||
|
||||
for (t = TX_4X4; t <= TX_32X32; t++)
|
||||
for (t = TX_4X4; t < TX_SIZES; t++)
|
||||
full_to_model_counts(cm->counts.coef[t], cpi->coef_counts[t]);
|
||||
|
||||
if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode)
|
||||
|
@ -65,10 +65,15 @@ void vp9_highbd_quantize_dc(const tran_low_t *coeff_ptr, int skip_block,
|
||||
}
|
||||
#endif
|
||||
|
||||
void vp9_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
|
||||
const int16_t *round_ptr, const int16_t quant,
|
||||
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
|
||||
const int16_t dequant_ptr, uint16_t *eob_ptr) {
|
||||
static INLINE void quantize_dc_bigtx(const tran_low_t *coeff_ptr,
|
||||
int skip_block,
|
||||
const int16_t *round_ptr,
|
||||
const int16_t quant,
|
||||
tran_low_t *qcoeff_ptr,
|
||||
tran_low_t *dqcoeff_ptr,
|
||||
const int16_t dequant_ptr,
|
||||
uint16_t *eob_ptr,
|
||||
int logsizeby32) {
|
||||
const int rc = 0;
|
||||
const int coeff = coeff_ptr[rc];
|
||||
const int coeff_sign = (coeff >> 31);
|
||||
@ -78,24 +83,43 @@ void vp9_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
|
||||
if (!skip_block) {
|
||||
|
||||
tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
|
||||
tmp = (tmp * quant) >> 15;
|
||||
tmp = (tmp * quant) >> (15 - logsizeby32);
|
||||
qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
|
||||
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2;
|
||||
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / (2 << logsizeby32);
|
||||
if (tmp)
|
||||
eob = 0;
|
||||
}
|
||||
*eob_ptr = eob + 1;
|
||||
}
|
||||
|
||||
void vp9_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
|
||||
const int16_t *round_ptr, const int16_t quant,
|
||||
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
|
||||
const int16_t dequant_ptr, uint16_t *eob_ptr) {
|
||||
quantize_dc_bigtx(coeff_ptr, skip_block, round_ptr, quant,
|
||||
qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, 0);
|
||||
}
|
||||
|
||||
#if CONFIG_TX64X64
|
||||
void vp9_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block,
|
||||
const int16_t *round_ptr, const int16_t quant,
|
||||
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
|
||||
const int16_t dequant_ptr, uint16_t *eob_ptr) {
|
||||
quantize_dc_bigtx(coeff_ptr, skip_block, round_ptr, quant,
|
||||
qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, 1);
|
||||
}
|
||||
#endif // CONFIG_TX64X64
|
||||
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
void vp9_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr,
|
||||
int skip_block,
|
||||
const int16_t *round_ptr,
|
||||
const int16_t quant,
|
||||
tran_low_t *qcoeff_ptr,
|
||||
tran_low_t *dqcoeff_ptr,
|
||||
const int16_t dequant_ptr,
|
||||
uint16_t *eob_ptr) {
|
||||
static INLINE void highbd_quantize_dc_bigtx(const tran_low_t *coeff_ptr,
|
||||
int skip_block,
|
||||
const int16_t *round_ptr,
|
||||
const int16_t quant,
|
||||
tran_low_t *qcoeff_ptr,
|
||||
tran_low_t *dqcoeff_ptr,
|
||||
const int16_t dequant_ptr,
|
||||
uint16_t *eob_ptr,
|
||||
int logsizeby32) {
|
||||
int eob = -1;
|
||||
|
||||
if (!skip_block) {
|
||||
@ -106,15 +130,41 @@ void vp9_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr,
|
||||
|
||||
const int64_t tmp =
|
||||
(clamp(abs_coeff + round_ptr[rc != 0], INT32_MIN, INT32_MAX) *
|
||||
quant) >> 15;
|
||||
quant) >> (15 - logsizeby32);
|
||||
qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
|
||||
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2;
|
||||
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / (2 << logsizeby32);
|
||||
if (tmp)
|
||||
eob = 0;
|
||||
}
|
||||
*eob_ptr = eob + 1;
|
||||
}
|
||||
#endif
|
||||
|
||||
// High-bitdepth DC-only quantization for 32x32 blocks (64-bit intermediate
// math in the shared helper).  logsizeby32 = 0: baseline 32x32 scaling.
void vp9_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr,
                                  int skip_block,
                                  const int16_t *round_ptr,
                                  const int16_t quant,
                                  tran_low_t *qcoeff_ptr,
                                  tran_low_t *dqcoeff_ptr,
                                  const int16_t dequant_ptr,
                                  uint16_t *eob_ptr) {
  highbd_quantize_dc_bigtx(coeff_ptr, skip_block, round_ptr, quant,
                           qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, 0);
}
|
||||
|
||||
#if CONFIG_TX64X64
|
||||
// High-bitdepth DC-only quantization for 64x64 blocks.  logsizeby32 = 1:
// one extra scaling step relative to the 32x32 baseline.
void vp9_highbd_quantize_dc_64x64(const tran_low_t *coeff_ptr,
                                  int skip_block,
                                  const int16_t *round_ptr,
                                  const int16_t quant,
                                  tran_low_t *qcoeff_ptr,
                                  tran_low_t *dqcoeff_ptr,
                                  const int16_t dequant_ptr,
                                  uint16_t *eob_ptr) {
  highbd_quantize_dc_bigtx(coeff_ptr, skip_block, round_ptr, quant,
                           qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, 1);
}
|
||||
#endif // CONFIG_TX64X64
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
||||
void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
|
||||
int skip_block,
|
||||
@ -210,15 +260,21 @@ void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr,
|
||||
|
||||
// TODO(jingning) Refactor this file and combine functions with similar
|
||||
// operations.
|
||||
void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
|
||||
int skip_block,
|
||||
const int16_t *zbin_ptr, const int16_t *round_ptr,
|
||||
const int16_t *quant_ptr,
|
||||
const int16_t *quant_shift_ptr,
|
||||
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
|
||||
const int16_t *dequant_ptr,
|
||||
int zbin_oq_value, uint16_t *eob_ptr,
|
||||
const int16_t *scan, const int16_t *iscan) {
|
||||
static INLINE void quantize_fp_bigtx(const tran_low_t *coeff_ptr,
|
||||
intptr_t n_coeffs,
|
||||
int skip_block,
|
||||
const int16_t *zbin_ptr,
|
||||
const int16_t *round_ptr,
|
||||
const int16_t *quant_ptr,
|
||||
const int16_t *quant_shift_ptr,
|
||||
tran_low_t *qcoeff_ptr,
|
||||
tran_low_t *dqcoeff_ptr,
|
||||
const int16_t *dequant_ptr,
|
||||
int zbin_oq_value,
|
||||
uint16_t *eob_ptr,
|
||||
const int16_t *scan,
|
||||
const int16_t *iscan,
|
||||
int logsizeby32) {
|
||||
int i, eob = -1;
|
||||
(void)zbin_ptr;
|
||||
(void)quant_shift_ptr;
|
||||
@ -236,12 +292,13 @@ void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
|
||||
int tmp = 0;
|
||||
int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
|
||||
|
||||
if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) {
|
||||
if (abs_coeff >= (dequant_ptr[rc != 0] >> (2 + logsizeby32))) {
|
||||
abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
|
||||
abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX);
|
||||
tmp = (abs_coeff * quant_ptr[rc != 0]) >> 15;
|
||||
tmp = (abs_coeff * quant_ptr[rc != 0]) >> (15 - logsizeby32);
|
||||
qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
|
||||
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
|
||||
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] /
|
||||
(2 << logsizeby32);
|
||||
}
|
||||
|
||||
if (tmp)
|
||||
@ -251,18 +308,64 @@ void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
|
||||
*eob_ptr = eob + 1;
|
||||
}
|
||||
|
||||
// "fp" (fast-path, round/quant only — zbin and quant_shift are unused by
// the helper) quantization for 32x32 blocks.  Wrapper over the shared
// big-transform helper with logsizeby32 = 0.
void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr,
                             intptr_t n_coeffs,
                             int skip_block,
                             const int16_t *zbin_ptr,
                             const int16_t *round_ptr,
                             const int16_t *quant_ptr,
                             const int16_t *quant_shift_ptr,
                             tran_low_t *qcoeff_ptr,
                             tran_low_t *dqcoeff_ptr,
                             const int16_t *dequant_ptr,
                             int zbin_oq_value,
                             uint16_t *eob_ptr,
                             const int16_t *scan,
                             const int16_t *iscan) {
  quantize_fp_bigtx(coeff_ptr, n_coeffs, skip_block,
                    zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr,
                    qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
                    zbin_oq_value, eob_ptr, scan, iscan, 0);
}
|
||||
|
||||
#if CONFIG_TX64X64
|
||||
// "fp" quantization for 64x64 blocks.  Same shared helper as the 32x32
// version, with logsizeby32 = 1 to account for the larger transform's
// scaling (deeper quant shift, larger dequant divisor).
void vp9_quantize_fp_64x64_c(const tran_low_t *coeff_ptr,
                             intptr_t n_coeffs,
                             int skip_block,
                             const int16_t *zbin_ptr,
                             const int16_t *round_ptr,
                             const int16_t *quant_ptr,
                             const int16_t *quant_shift_ptr,
                             tran_low_t *qcoeff_ptr,
                             tran_low_t *dqcoeff_ptr,
                             const int16_t *dequant_ptr,
                             int zbin_oq_value,
                             uint16_t *eob_ptr,
                             const int16_t *scan,
                             const int16_t *iscan) {
  quantize_fp_bigtx(coeff_ptr, n_coeffs, skip_block,
                    zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr,
                    qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
                    zbin_oq_value, eob_ptr, scan, iscan, 1);
}
|
||||
#endif // CONFIG_TX64X64
|
||||
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr,
|
||||
intptr_t n_coeffs, int skip_block,
|
||||
const int16_t *zbin_ptr,
|
||||
const int16_t *round_ptr,
|
||||
const int16_t *quant_ptr,
|
||||
const int16_t *quant_shift_ptr,
|
||||
tran_low_t *qcoeff_ptr,
|
||||
tran_low_t *dqcoeff_ptr,
|
||||
const int16_t *dequant_ptr,
|
||||
int zbin_oq_value, uint16_t *eob_ptr,
|
||||
const int16_t *scan, const int16_t *iscan) {
|
||||
static INLINE void highbd_quantize_fp_bigtx(const tran_low_t *coeff_ptr,
|
||||
intptr_t n_coeffs,
|
||||
int skip_block,
|
||||
const int16_t *zbin_ptr,
|
||||
const int16_t *round_ptr,
|
||||
const int16_t *quant_ptr,
|
||||
const int16_t *quant_shift_ptr,
|
||||
tran_low_t *qcoeff_ptr,
|
||||
tran_low_t *dqcoeff_ptr,
|
||||
const int16_t *dequant_ptr,
|
||||
int zbin_oq_value,
|
||||
uint16_t *eob_ptr,
|
||||
const int16_t *scan,
|
||||
const int16_t *iscan,
|
||||
int logsizeby32) {
|
||||
int i, eob = -1;
|
||||
(void)zbin_ptr;
|
||||
(void)quant_shift_ptr;
|
||||
@ -280,12 +383,13 @@ void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr,
|
||||
int64_t tmp = 0;
|
||||
const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
|
||||
|
||||
if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) {
|
||||
if (abs_coeff >= (dequant_ptr[rc != 0] >> (2 + logsizeby32))) {
|
||||
tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
|
||||
INT32_MIN, INT32_MAX);
|
||||
tmp = (tmp * quant_ptr[rc != 0]) >> 15;
|
||||
tmp = (tmp * quant_ptr[rc != 0]) >> (15 - logsizeby32);
|
||||
qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
|
||||
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
|
||||
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] /
|
||||
(2 << logsizeby32);
|
||||
}
|
||||
|
||||
if (tmp)
|
||||
@ -294,7 +398,49 @@ void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr,
|
||||
}
|
||||
*eob_ptr = eob + 1;
|
||||
}
|
||||
#endif
|
||||
|
||||
void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr,
|
||||
intptr_t n_coeffs,
|
||||
int skip_block,
|
||||
const int16_t *zbin_ptr,
|
||||
const int16_t *round_ptr,
|
||||
const int16_t *quant_ptr,
|
||||
const int16_t *quant_shift_ptr,
|
||||
tran_low_t *qcoeff_ptr,
|
||||
tran_low_t *dqcoeff_ptr,
|
||||
const int16_t *dequant_ptr,
|
||||
int zbin_oq_value,
|
||||
uint16_t *eob_ptr,
|
||||
const int16_t *scan,
|
||||
const int16_t *iscan) {
|
||||
highbd_quantize_fp_bigtx(coeff_ptr, n_coeffs, skip_block,
|
||||
zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr,
|
||||
qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
|
||||
zbin_oq_value, eob_ptr, scan, iscan, 0);
|
||||
}
|
||||
|
||||
#if CONFIG_TX64X64
|
||||
void vp9_highbd_quantize_fp_64x64_c(const tran_low_t *coeff_ptr,
|
||||
intptr_t n_coeffs,
|
||||
int skip_block,
|
||||
const int16_t *zbin_ptr,
|
||||
const int16_t *round_ptr,
|
||||
const int16_t *quant_ptr,
|
||||
const int16_t *quant_shift_ptr,
|
||||
tran_low_t *qcoeff_ptr,
|
||||
tran_low_t *dqcoeff_ptr,
|
||||
const int16_t *dequant_ptr,
|
||||
int zbin_oq_value,
|
||||
uint16_t *eob_ptr,
|
||||
const int16_t *scan,
|
||||
const int16_t *iscan) {
|
||||
highbd_quantize_fp_bigtx(coeff_ptr, n_coeffs, skip_block,
|
||||
zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr,
|
||||
qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
|
||||
zbin_oq_value, eob_ptr, scan, iscan, 1);
|
||||
}
|
||||
#endif // CONFIG_TX64X64
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
||||
void vp9_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
|
||||
int skip_block,
|
||||
@ -403,23 +549,29 @@ void vp9_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
|
||||
}
|
||||
*eob_ptr = eob + 1;
|
||||
}
|
||||
#endif
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
||||
void vp9_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
|
||||
int skip_block,
|
||||
const int16_t *zbin_ptr, const int16_t *round_ptr,
|
||||
const int16_t *quant_ptr,
|
||||
const int16_t *quant_shift_ptr,
|
||||
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
|
||||
const int16_t *dequant_ptr,
|
||||
int zbin_oq_value, uint16_t *eob_ptr,
|
||||
const int16_t *scan, const int16_t *iscan) {
|
||||
static INLINE void quantize_b_bigtx(const tran_low_t *coeff_ptr,
|
||||
intptr_t n_coeffs,
|
||||
int skip_block,
|
||||
const int16_t *zbin_ptr,
|
||||
const int16_t *round_ptr,
|
||||
const int16_t *quant_ptr,
|
||||
const int16_t *quant_shift_ptr,
|
||||
tran_low_t *qcoeff_ptr,
|
||||
tran_low_t *dqcoeff_ptr,
|
||||
const int16_t *dequant_ptr,
|
||||
int zbin_oq_value,
|
||||
uint16_t *eob_ptr,
|
||||
const int16_t *scan,
|
||||
const int16_t *iscan,
|
||||
int logsizeby32) {
|
||||
const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0] + zbin_oq_value, 1),
|
||||
ROUND_POWER_OF_TWO(zbin_ptr[1] + zbin_oq_value, 1) };
|
||||
const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
|
||||
|
||||
int idx = 0;
|
||||
int idx_arr[1024];
|
||||
int idx_arr[MAX_NUM_COEFS];
|
||||
int i, eob = -1;
|
||||
(void)iscan;
|
||||
|
||||
@ -446,13 +598,14 @@ void vp9_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
|
||||
const int coeff_sign = (coeff >> 31);
|
||||
int tmp;
|
||||
int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
|
||||
abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
|
||||
abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], (1 + logsizeby32));
|
||||
abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX);
|
||||
tmp = ((((abs_coeff * quant_ptr[rc != 0]) >> 16) + abs_coeff) *
|
||||
quant_shift_ptr[rc != 0]) >> 15;
|
||||
quant_shift_ptr[rc != 0]) >> (15 - logsizeby32);
|
||||
|
||||
qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
|
||||
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
|
||||
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] /
|
||||
(2 << logsizeby32);
|
||||
|
||||
if (tmp)
|
||||
eob = idx_arr[i];
|
||||
@ -461,24 +614,70 @@ void vp9_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
|
||||
*eob_ptr = eob + 1;
|
||||
}
|
||||
|
||||
void vp9_quantize_b_32x32_c(const tran_low_t *coeff_ptr,
|
||||
intptr_t n_coeffs,
|
||||
int skip_block,
|
||||
const int16_t *zbin_ptr,
|
||||
const int16_t *round_ptr,
|
||||
const int16_t *quant_ptr,
|
||||
const int16_t *quant_shift_ptr,
|
||||
tran_low_t *qcoeff_ptr,
|
||||
tran_low_t *dqcoeff_ptr,
|
||||
const int16_t *dequant_ptr,
|
||||
int zbin_oq_value,
|
||||
uint16_t *eob_ptr,
|
||||
const int16_t *scan,
|
||||
const int16_t *iscan) {
|
||||
quantize_b_bigtx(coeff_ptr, n_coeffs, skip_block,
|
||||
zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr,
|
||||
qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
|
||||
zbin_oq_value, eob_ptr, scan, iscan, 0);
|
||||
}
|
||||
|
||||
#if CONFIG_TX64X64
|
||||
void vp9_quantize_b_64x64_c(const tran_low_t *coeff_ptr,
|
||||
intptr_t n_coeffs,
|
||||
int skip_block,
|
||||
const int16_t *zbin_ptr,
|
||||
const int16_t *round_ptr,
|
||||
const int16_t *quant_ptr,
|
||||
const int16_t *quant_shift_ptr,
|
||||
tran_low_t *qcoeff_ptr,
|
||||
tran_low_t *dqcoeff_ptr,
|
||||
const int16_t *dequant_ptr,
|
||||
int zbin_oq_value,
|
||||
uint16_t *eob_ptr,
|
||||
const int16_t *scan,
|
||||
const int16_t *iscan) {
|
||||
quantize_b_bigtx(coeff_ptr, n_coeffs, skip_block,
|
||||
zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr,
|
||||
qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
|
||||
zbin_oq_value, eob_ptr, scan, iscan, 1);
|
||||
}
|
||||
#endif // CONFIG_TX64X64
|
||||
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
void vp9_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr,
|
||||
intptr_t n_coeffs, int skip_block,
|
||||
const int16_t *zbin_ptr,
|
||||
const int16_t *round_ptr,
|
||||
const int16_t *quant_ptr,
|
||||
const int16_t *quant_shift_ptr,
|
||||
tran_low_t *qcoeff_ptr,
|
||||
tran_low_t *dqcoeff_ptr,
|
||||
const int16_t *dequant_ptr,
|
||||
int zbin_oq_value, uint16_t *eob_ptr,
|
||||
const int16_t *scan, const int16_t *iscan) {
|
||||
static INLINE void highbd_quantize_b_bigtx(const tran_low_t *coeff_ptr,
|
||||
intptr_t n_coeffs,
|
||||
int skip_block,
|
||||
const int16_t *zbin_ptr,
|
||||
const int16_t *round_ptr,
|
||||
const int16_t *quant_ptr,
|
||||
const int16_t *quant_shift_ptr,
|
||||
tran_low_t *qcoeff_ptr,
|
||||
tran_low_t *dqcoeff_ptr,
|
||||
const int16_t *dequant_ptr,
|
||||
int zbin_oq_value,
|
||||
uint16_t *eob_ptr,
|
||||
const int16_t *scan,
|
||||
const int16_t *iscan,
|
||||
int logsizeby32) {
|
||||
const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0] + zbin_oq_value, 1),
|
||||
ROUND_POWER_OF_TWO(zbin_ptr[1] + zbin_oq_value, 1) };
|
||||
const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
|
||||
|
||||
int idx = 0;
|
||||
int idx_arr[1024];
|
||||
int idx_arr[MAX_NUM_COEFS];
|
||||
int i, eob = -1;
|
||||
(void)iscan;
|
||||
|
||||
@ -504,14 +703,15 @@ void vp9_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr,
|
||||
const int coeff = coeff_ptr[rc];
|
||||
const int coeff_sign = (coeff >> 31);
|
||||
const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
|
||||
int64_t tmp = clamp(abs_coeff +
|
||||
ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
|
||||
INT32_MIN, INT32_MAX);
|
||||
int64_t tmp = clamp(
|
||||
abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], (1 + logsizeby32)),
|
||||
INT32_MIN, INT32_MAX);
|
||||
tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
|
||||
quant_shift_ptr[rc != 0]) >> 15;
|
||||
quant_shift_ptr[rc != 0]) >> (15 - logsizeby32);
|
||||
|
||||
qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
|
||||
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
|
||||
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] /
|
||||
(2 << logsizeby32);
|
||||
|
||||
if (tmp)
|
||||
eob = idx_arr[i];
|
||||
@ -519,7 +719,49 @@ void vp9_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr,
|
||||
}
|
||||
*eob_ptr = eob + 1;
|
||||
}
|
||||
#endif
|
||||
|
||||
void vp9_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr,
|
||||
intptr_t n_coeffs,
|
||||
int skip_block,
|
||||
const int16_t *zbin_ptr,
|
||||
const int16_t *round_ptr,
|
||||
const int16_t *quant_ptr,
|
||||
const int16_t *quant_shift_ptr,
|
||||
tran_low_t *qcoeff_ptr,
|
||||
tran_low_t *dqcoeff_ptr,
|
||||
const int16_t *dequant_ptr,
|
||||
int zbin_oq_value,
|
||||
uint16_t *eob_ptr,
|
||||
const int16_t *scan,
|
||||
const int16_t *iscan) {
|
||||
highbd_quantize_b_bigtx(coeff_ptr, n_coeffs, skip_block,
|
||||
zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr,
|
||||
qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
|
||||
zbin_oq_value, eob_ptr, scan, iscan, 0);
|
||||
}
|
||||
|
||||
#if CONFIG_TX64X64
|
||||
void vp9_highbd_quantize_b_64x64_c(const tran_low_t *coeff_ptr,
|
||||
intptr_t n_coeffs,
|
||||
int skip_block,
|
||||
const int16_t *zbin_ptr,
|
||||
const int16_t *round_ptr,
|
||||
const int16_t *quant_ptr,
|
||||
const int16_t *quant_shift_ptr,
|
||||
tran_low_t *qcoeff_ptr,
|
||||
tran_low_t *dqcoeff_ptr,
|
||||
const int16_t *dequant_ptr,
|
||||
int zbin_oq_value,
|
||||
uint16_t *eob_ptr,
|
||||
const int16_t *scan,
|
||||
const int16_t *iscan) {
|
||||
highbd_quantize_b_bigtx(coeff_ptr, n_coeffs, skip_block,
|
||||
zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr,
|
||||
qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
|
||||
zbin_oq_value, eob_ptr, scan, iscan, 1);
|
||||
}
|
||||
#endif // CONFIG_TX64X64
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
||||
void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
|
||||
const int16_t *scan, const int16_t *iscan) {
|
||||
@ -530,21 +772,21 @@ void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
|
||||
vp9_highbd_quantize_b(BLOCK_OFFSET(p->coeff, block),
|
||||
16, x->skip_block,
|
||||
p->zbin, p->round, p->quant, p->quant_shift,
|
||||
BLOCK_OFFSET(p->qcoeff, block),
|
||||
BLOCK_OFFSET(pd->dqcoeff, block),
|
||||
pd->dequant, p->zbin_extra, &p->eobs[block],
|
||||
scan, iscan);
|
||||
16, x->skip_block,
|
||||
p->zbin, p->round, p->quant, p->quant_shift,
|
||||
BLOCK_OFFSET(p->qcoeff, block),
|
||||
BLOCK_OFFSET(pd->dqcoeff, block),
|
||||
pd->dequant, p->zbin_extra, &p->eobs[block],
|
||||
scan, iscan);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
vp9_quantize_b(BLOCK_OFFSET(p->coeff, block),
|
||||
16, x->skip_block,
|
||||
p->zbin, p->round, p->quant, p->quant_shift,
|
||||
BLOCK_OFFSET(p->qcoeff, block),
|
||||
BLOCK_OFFSET(pd->dqcoeff, block),
|
||||
pd->dequant, p->zbin_extra, &p->eobs[block], scan, iscan);
|
||||
16, x->skip_block,
|
||||
p->zbin, p->round, p->quant, p->quant_shift,
|
||||
BLOCK_OFFSET(p->qcoeff, block),
|
||||
BLOCK_OFFSET(pd->dqcoeff, block),
|
||||
pd->dequant, p->zbin_extra, &p->eobs[block], scan, iscan);
|
||||
}
|
||||
|
||||
static void invert_quant(int16_t *quant, int16_t *shift, int d) {
|
||||
|
@ -45,6 +45,12 @@ void vp9_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
|
||||
const int16_t *round_ptr, const int16_t quant_ptr,
|
||||
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
|
||||
const int16_t dequant_ptr, uint16_t *eob_ptr);
|
||||
#if CONFIG_TX64X64
|
||||
void vp9_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block,
|
||||
const int16_t *round_ptr, const int16_t quant_ptr,
|
||||
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
|
||||
const int16_t dequant_ptr, uint16_t *eob_ptr);
|
||||
#endif // CONFIG_TX64X64
|
||||
void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
|
||||
const int16_t *scan, const int16_t *iscan);
|
||||
|
||||
@ -61,7 +67,17 @@ void vp9_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr,
|
||||
tran_low_t *dqcoeff_ptr,
|
||||
const int16_t dequant_ptr,
|
||||
uint16_t *eob_ptr);
|
||||
#endif
|
||||
#if CONFIG_TX64X64
|
||||
void vp9_highbd_quantize_dc_64x64(const tran_low_t *coeff_ptr,
|
||||
int skip_block,
|
||||
const int16_t *round_ptr,
|
||||
const int16_t quant_ptr,
|
||||
tran_low_t *qcoeff_ptr,
|
||||
tran_low_t *dqcoeff_ptr,
|
||||
const int16_t dequant_ptr,
|
||||
uint16_t *eob_ptr);
|
||||
#endif // CONFIG_TX64X64
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
||||
struct VP9_COMP;
|
||||
struct VP9Common;
|
||||
|
@ -76,7 +76,7 @@ static void fill_token_costs(vp9_coeff_cost *c,
|
||||
vp9_coeff_probs_model (*p)[PLANE_TYPES]) {
|
||||
int i, j, k, l;
|
||||
TX_SIZE t;
|
||||
for (t = TX_4X4; t <= TX_32X32; ++t)
|
||||
for (t = TX_4X4; t < TX_SIZES; ++t)
|
||||
for (i = 0; i < PLANE_TYPES; ++i)
|
||||
for (j = 0; j < REF_TYPES; ++j)
|
||||
for (k = 0; k < COEF_BANDS; ++k)
|
||||
@ -425,6 +425,14 @@ void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size,
|
||||
for (i = 0; i < num_4x4_h; i += 8)
|
||||
t_left[i] = !!*(const uint64_t *)&left[i];
|
||||
break;
|
||||
#if CONFIG_TX64X64
|
||||
case TX_64X64:
|
||||
for (i = 0; i < num_4x4_w; i += 16)
|
||||
t_above[i] = !!*(const uint64_t *)&above[i];
|
||||
for (i = 0; i < num_4x4_h; i += 16)
|
||||
t_left[i] = !!*(const uint64_t *)&left[i];
|
||||
break;
|
||||
#endif
|
||||
default:
|
||||
assert(0 && "Invalid transform size.");
|
||||
break;
|
||||
|
@ -340,6 +340,9 @@ static const int16_t band_counts[TX_SIZES][8] = {
|
||||
{ 1, 2, 3, 4, 11, 64 - 21, 0 },
|
||||
{ 1, 2, 3, 4, 11, 256 - 21, 0 },
|
||||
{ 1, 2, 3, 4, 11, 1024 - 21, 0 },
|
||||
#if CONFIG_TX64X64
|
||||
{ 1, 2, 3, 4, 11, 4096 - 21, 0 },
|
||||
#endif
|
||||
};
|
||||
static INLINE int cost_coeffs(MACROBLOCK *x,
|
||||
int plane, int block,
|
||||
@ -357,7 +360,7 @@ static INLINE int cost_coeffs(MACROBLOCK *x,
|
||||
const tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
|
||||
unsigned int (*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
|
||||
x->token_costs[tx_size][type][is_inter_block(mbmi)];
|
||||
uint8_t token_cache[32 * 32];
|
||||
uint8_t token_cache[MAX_NUM_COEFS];
|
||||
int pt = combine_entropy_contexts(*A, *L);
|
||||
int c, cost;
|
||||
// Check for consistency of tx_size with mode info
|
||||
@ -416,6 +419,8 @@ static INLINE int cost_coeffs(MACROBLOCK *x,
|
||||
return cost;
|
||||
}
|
||||
|
||||
#define right_shift_signed(x, s) ((s) < 0 ? (x) << (-(s)) : (x) >> (s))
|
||||
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
static void dist_block(int plane, int block, TX_SIZE tx_size,
|
||||
struct rdcost_block_args* args, int bd) {
|
||||
@ -429,17 +434,23 @@ static void dist_block(int plane, int block, TX_SIZE tx_size,
|
||||
const struct macroblock_plane *const p = &x->plane[plane];
|
||||
const struct macroblockd_plane *const pd = &xd->plane[plane];
|
||||
int64_t this_sse;
|
||||
#if CONFIG_TX64X64
|
||||
int shift = (tx_size == TX_64X64 ? -2 : (tx_size == TX_32X32 ? 0 : 2));
|
||||
#else
|
||||
int shift = tx_size == TX_32X32 ? 0 : 2;
|
||||
#endif
|
||||
tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
|
||||
tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
|
||||
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
args->dist = vp9_highbd_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
|
||||
&this_sse, bd) >> shift;
|
||||
args->dist = right_shift_signed(
|
||||
vp9_highbd_block_error(
|
||||
coeff, dqcoeff, 16 << ss_txfrm_size, &this_sse, bd), shift);
|
||||
#else
|
||||
args->dist = vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
|
||||
&this_sse) >> shift;
|
||||
args->dist = right_shift_signed(
|
||||
vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size, &this_sse), shift);
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
args->sse = this_sse >> shift;
|
||||
args->sse = right_shift_signed(this_sse, shift);
|
||||
|
||||
if (x->skip_encode && !is_inter_block(&xd->mi[0].src_mi->mbmi)) {
|
||||
// TODO(jingning): tune the model to better capture the distortion.
|
||||
@ -514,9 +525,12 @@ static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
dc_correct >>= ((xd->bd - 8) * 2);
|
||||
#endif
|
||||
if (tx_size != TX_32X32)
|
||||
if (tx_size < TX_32X32)
|
||||
dc_correct >>= 2;
|
||||
|
||||
#if CONFIG_TX64X64
|
||||
else if (tx_size == TX_64X64)
|
||||
dc_correct <<= 2;
|
||||
#endif
|
||||
args->dist = MAX(0, args->sse - dc_correct);
|
||||
}
|
||||
} else {
|
||||
@ -629,10 +643,15 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
|
||||
vp9_prob skip_prob = vp9_get_skip_prob(cm, xd);
|
||||
int r[TX_SIZES][2], s[TX_SIZES];
|
||||
int64_t d[TX_SIZES], sse[TX_SIZES];
|
||||
int64_t rd[TX_SIZES][2] = {{INT64_MAX, INT64_MAX},
|
||||
{INT64_MAX, INT64_MAX},
|
||||
{INT64_MAX, INT64_MAX},
|
||||
{INT64_MAX, INT64_MAX}};
|
||||
int64_t rd[TX_SIZES][2] = {
|
||||
{INT64_MAX, INT64_MAX},
|
||||
{INT64_MAX, INT64_MAX},
|
||||
{INT64_MAX, INT64_MAX},
|
||||
{INT64_MAX, INT64_MAX},
|
||||
#if CONFIG_TX64X64
|
||||
{INT64_MAX, INT64_MAX},
|
||||
#endif
|
||||
};
|
||||
int n, m;
|
||||
int s0, s1;
|
||||
const TX_SIZE max_mode_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode];
|
||||
@ -681,7 +700,6 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
|
||||
mbmi->tx_size = cm->tx_mode == TX_MODE_SELECT ?
|
||||
best_tx : MIN(max_tx_size, max_mode_tx_size);
|
||||
|
||||
|
||||
*distortion = d[mbmi->tx_size];
|
||||
*rate = r[mbmi->tx_size][cm->tx_mode == TX_MODE_SELECT];
|
||||
*skip = s[mbmi->tx_size];
|
||||
@ -691,8 +709,14 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
|
||||
tx_cache[ALLOW_8X8] = rd[TX_8X8][0];
|
||||
tx_cache[ALLOW_16X16] = rd[MIN(max_tx_size, TX_16X16)][0];
|
||||
tx_cache[ALLOW_32X32] = rd[MIN(max_tx_size, TX_32X32)][0];
|
||||
#if CONFIG_TX64X64
|
||||
tx_cache[ALLOW_64X64] = rd[MIN(max_tx_size, TX_64X64)][0];
|
||||
#endif
|
||||
|
||||
if (max_tx_size == TX_32X32 && best_tx == TX_32X32) {
|
||||
#if CONFIG_TX64X64
|
||||
if (max_tx_size >= TX_64X64 && best_tx == TX_64X64) {
|
||||
tx_cache[TX_MODE_SELECT] = rd[TX_64X64][1];
|
||||
} else if (max_tx_size >= TX_32X32 && best_tx == TX_32X32) {
|
||||
tx_cache[TX_MODE_SELECT] = rd[TX_32X32][1];
|
||||
} else if (max_tx_size >= TX_16X16 && best_tx == TX_16X16) {
|
||||
tx_cache[TX_MODE_SELECT] = rd[TX_16X16][1];
|
||||
@ -701,6 +725,17 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
|
||||
} else {
|
||||
tx_cache[TX_MODE_SELECT] = rd[TX_4X4][1];
|
||||
}
|
||||
#else
|
||||
if (max_tx_size >= TX_32X32 && best_tx == TX_32X32) {
|
||||
tx_cache[TX_MODE_SELECT] = rd[TX_32X32][1];
|
||||
} else if (max_tx_size >= TX_16X16 && best_tx == TX_16X16) {
|
||||
tx_cache[TX_MODE_SELECT] = rd[TX_16X16][1];
|
||||
} else if (rd[TX_8X8][1] < rd[TX_4X4][1]) {
|
||||
tx_cache[TX_MODE_SELECT] = rd[TX_8X8][1];
|
||||
} else {
|
||||
tx_cache[TX_MODE_SELECT] = rd[TX_4X4][1];
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
static void super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
|
||||
@ -1970,12 +2005,13 @@ static void estimate_ref_frame_costs(const VP9_COMMON *cm,
|
||||
}
|
||||
}
|
||||
|
||||
static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
|
||||
int mode_index,
|
||||
int64_t comp_pred_diff[REFERENCE_MODES],
|
||||
const int64_t tx_size_diff[TX_MODES],
|
||||
int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS],
|
||||
int skippable) {
|
||||
static void store_coding_context(
|
||||
MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
|
||||
int mode_index,
|
||||
int64_t comp_pred_diff[REFERENCE_MODES],
|
||||
const int64_t tx_size_diff[TX_MODES],
|
||||
int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS],
|
||||
int skippable) {
|
||||
MACROBLOCKD *const xd = &x->e_mbd;
|
||||
|
||||
// Take a snapshot of the coding context so it can be
|
||||
|
@ -48,6 +48,10 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
|
||||
sf->adaptive_pred_interp_filter = 1;
|
||||
|
||||
sf->recode_loop = ALLOW_RECODE_KFARFGF;
|
||||
#if CONFIG_TX64X64
|
||||
sf->intra_y_mode_mask[TX_64X64] = INTRA_DC_H_V;
|
||||
sf->intra_uv_mode_mask[TX_64X64] = INTRA_DC_H_V;
|
||||
#endif
|
||||
sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
|
||||
sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
|
||||
sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
|
||||
@ -114,6 +118,10 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
|
||||
sf->recode_loop = ALLOW_RECODE_KFMAXBW;
|
||||
sf->adaptive_rd_thresh = 3;
|
||||
sf->mode_skip_start = 6;
|
||||
#if CONFIG_TX64X64
|
||||
sf->intra_y_mode_mask[TX_64X64] = INTRA_DC;
|
||||
sf->intra_uv_mode_mask[TX_64X64] = INTRA_DC;
|
||||
#endif
|
||||
sf->intra_y_mode_mask[TX_32X32] = INTRA_DC;
|
||||
sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC;
|
||||
sf->adaptive_interp_filter_search = 1;
|
||||
@ -184,6 +192,10 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
|
||||
sf->adaptive_pred_interp_filter = 1;
|
||||
sf->mv.auto_mv_step_size = 1;
|
||||
sf->adaptive_rd_thresh = 2;
|
||||
#if CONFIG_TX64X64
|
||||
sf->intra_y_mode_mask[TX_64X64] = INTRA_DC_H_V;
|
||||
sf->intra_uv_mode_mask[TX_64X64] = INTRA_DC_H_V;
|
||||
#endif
|
||||
sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
|
||||
sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
|
||||
sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
|
||||
@ -246,6 +258,9 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
|
||||
sf->intra_uv_mode_mask[i] = INTRA_DC;
|
||||
}
|
||||
sf->intra_y_mode_mask[TX_32X32] = INTRA_DC;
|
||||
#if CONFIG_TX64X64
|
||||
sf->intra_y_mode_mask[TX_64X64] = INTRA_DC;
|
||||
#endif
|
||||
sf->frame_parameter_update = 0;
|
||||
sf->mv.search_method = FAST_HEX;
|
||||
|
||||
|
@ -296,7 +296,7 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
|
||||
VP9_COMP *cpi = args->cpi;
|
||||
MACROBLOCKD *xd = args->xd;
|
||||
TOKENEXTRA **tp = args->tp;
|
||||
uint8_t token_cache[32 * 32];
|
||||
uint8_t token_cache[MAX_NUM_COEFS];
|
||||
struct macroblock_plane *p = &cpi->mb.plane[plane];
|
||||
struct macroblockd_plane *pd = &xd->plane[plane];
|
||||
MB_MODE_INFO *mbmi = &xd->mi[0].src_mi->mbmi;
|
||||
@ -374,7 +374,6 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
|
||||
counts[band[c]][pt]);
|
||||
++eob_branch[band[c]][pt];
|
||||
}
|
||||
|
||||
*tp = t;
|
||||
|
||||
vp9_set_contexts(xd, pd, plane_bsize, tx_size, c > 0, aoff, loff);
|
||||
|
Loading…
x
Reference in New Issue
Block a user