Merge "Adding a 64x64 transform mode" into nextgen

Deb Mukherjee 2014-10-30 00:51:35 -07:00 committed by Gerrit Code Review
commit 8bdf4cebb9
28 changed files with 4380 additions and 191 deletions

configure

@ -282,6 +282,7 @@ EXPERIMENT_LIST="
vp9_temporal_denoising
fp_mb_stats
emulate_hardware
tx64x64
"
CONFIG_LIST="
external_build
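As with the other entries in EXPERIMENT_LIST, the new tx64x64 line makes the experiment selectable at build time — presumably via ./configure --enable-experimental --enable-tx64x64 — which defines the CONFIG_TX64X64 guard used throughout the hunks below.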


@ -101,22 +101,35 @@ const TX_SIZE max_txsize_lookup[BLOCK_SIZES] = {
TX_4X4, TX_4X4, TX_4X4,
TX_8X8, TX_8X8, TX_8X8,
TX_16X16, TX_16X16, TX_16X16,
TX_32X32, TX_32X32, TX_32X32, TX_32X32
TX_32X32, TX_32X32, TX_32X32,
#if CONFIG_TX64X64
TX_64X64,
#else
TX_32X32,
#endif
};
const BLOCK_SIZE txsize_to_bsize[TX_SIZES] = {
BLOCK_4X4, // TX_4X4
BLOCK_8X8, // TX_8X8
BLOCK_4X4, // TX_4X4
BLOCK_8X8, // TX_8X8
BLOCK_16X16, // TX_16X16
BLOCK_32X32, // TX_32X32
#if CONFIG_TX64X64
BLOCK_32X32, // TX_64X64
#endif
};
const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES] = {
TX_4X4, // ONLY_4X4
TX_8X8, // ALLOW_8X8
TX_4X4, // ONLY_4X4
TX_8X8, // ALLOW_8X8
TX_16X16, // ALLOW_16X16
TX_32X32, // ALLOW_32X32
#if CONFIG_TX64X64
TX_64X64, // ALLOW_64X64
TX_64X64, // TX_MODE_SELECT
#else
TX_32X32, // TX_MODE_SELECT
#endif
};
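For context, a minimal sketch (an illustration, not code from this commit) of how these two lookup tables are typically combined when bounding a block's transform size:

// Illustrative sketch only: the effective upper bound on a block's transform
// size is the smaller of what the block size permits and what the frame-level
// tx_mode permits. With CONFIG_TX64X64, both tables can now return TX_64X64.
static TX_SIZE biggest_tx_size_for_block(BLOCK_SIZE bsize, TX_MODE tx_mode) {
  const TX_SIZE by_bsize = max_txsize_lookup[bsize];
  const TX_SIZE by_mode = tx_mode_to_biggest_tx_size[tx_mode];
  return by_bsize < by_mode ? by_bsize : by_mode;
}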
const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2] = {


@ -44,7 +44,7 @@ const vp9_prob vp9_cat6_prob_high12[] = {
};
#endif
const uint8_t vp9_coefband_trans_8x8plus[1024] = {
const uint8_t vp9_coefband_trans_8x8plus[MAX_NUM_COEFS] = {
0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 5,
// beyond MAXBAND_INDEX+1 all values are filled as 5
@ -111,6 +111,200 @@ const uint8_t vp9_coefband_trans_8x8plus[1024] = {
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
#if CONFIG_TX64X64
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
/* ... 190 further identical rows elided here: entries 1024..4095 of the
   enlarged table are all 5, per the comment above ... */
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
#endif
};
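Since every index past MAXBAND_INDEX maps to band 5, the enlarged table is fully determined by its first 22 entries. A hypothetical init-time generator (shown only to make the structure explicit; the commit keeps the static table):

// Hypothetical equivalent of the static table above, for illustration only:
// indices 0..MAXBAND_INDEX use the hand-written prefix, everything else is 5.
static void fill_coefband_trans_8x8plus(uint8_t *t) {
  static const uint8_t head[MAXBAND_INDEX + 1] = {
    0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5
  };
  int i;
  for (i = 0; i < MAX_NUM_COEFS; ++i)
    t[i] = (i <= MAXBAND_INDEX) ? head[i] : 5;
}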
const uint8_t vp9_coefband_trans_4x4[16] = {
@ -736,6 +930,92 @@ static const vp9_coeff_probs_model default_coef_probs_32x32[PLANE_TYPES] = {
}
};
#if CONFIG_TX64X64
static const vp9_coeff_probs_model default_coef_probs_64x64[PLANE_TYPES] = {
{ // Y plane
{ // Intra
{ // Band 0
{ 17, 38, 140 }, { 7, 34, 80 }, { 1, 17, 29 }
}, { // Band 1
{ 37, 75, 128 }, { 41, 76, 128 }, { 26, 66, 116 },
{ 12, 52, 94 }, { 2, 32, 55 }, { 1, 10, 16 }
}, { // Band 2
{ 50, 127, 154 }, { 37, 109, 152 }, { 16, 82, 121 },
{ 5, 59, 85 }, { 1, 35, 54 }, { 1, 13, 20 }
}, { // Band 3
{ 40, 142, 167 }, { 17, 110, 157 }, { 2, 71, 112 },
{ 1, 44, 72 }, { 1, 27, 45 }, { 1, 11, 17 }
}, { // Band 4
{ 30, 175, 188 }, { 9, 124, 169 }, { 1, 74, 116 },
{ 1, 48, 78 }, { 1, 30, 49 }, { 1, 11, 18 }
}, { // Band 5
{ 10, 222, 223 }, { 2, 150, 194 }, { 1, 83, 128 },
{ 1, 48, 79 }, { 1, 27, 45 }, { 1, 11, 17 }
}
}, { // Inter
{ // Band 0
{ 36, 41, 235 }, { 29, 36, 193 }, { 10, 27, 111 }
}, { // Band 1
{ 85, 165, 222 }, { 177, 162, 215 }, { 110, 135, 195 },
{ 57, 113, 168 }, { 23, 83, 120 }, { 10, 49, 61 }
}, { // Band 2
{ 85, 190, 223 }, { 36, 139, 200 }, { 5, 90, 146 },
{ 1, 60, 103 }, { 1, 38, 65 }, { 1, 18, 30 }
}, { // Band 3
{ 72, 202, 223 }, { 23, 141, 199 }, { 2, 86, 140 },
{ 1, 56, 97 }, { 1, 36, 61 }, { 1, 16, 27 }
}, { // Band 4
{ 55, 218, 225 }, { 13, 145, 200 }, { 1, 86, 141 },
{ 1, 57, 99 }, { 1, 35, 61 }, { 1, 13, 22 }
}, { // Band 5
{ 15, 235, 212 }, { 1, 132, 184 }, { 1, 84, 139 },
{ 1, 57, 97 }, { 1, 34, 56 }, { 1, 14, 23 }
}
}
}, { // UV plane
{ // Intra
{ // Band 0
{ 181, 21, 201 }, { 61, 37, 123 }, { 10, 38, 71 }
}, { // Band 1
{ 47, 106, 172 }, { 95, 104, 173 }, { 42, 93, 159 },
{ 18, 77, 131 }, { 4, 50, 81 }, { 1, 17, 23 }
}, { // Band 2
{ 62, 147, 199 }, { 44, 130, 189 }, { 28, 102, 154 },
{ 18, 75, 115 }, { 2, 44, 65 }, { 1, 12, 19 }
}, { // Band 3
{ 55, 153, 210 }, { 24, 130, 194 }, { 3, 93, 146 },
{ 1, 61, 97 }, { 1, 31, 50 }, { 1, 10, 16 }
}, { // Band 4
{ 49, 186, 223 }, { 17, 148, 204 }, { 1, 96, 142 },
{ 1, 53, 83 }, { 1, 26, 44 }, { 1, 11, 17 }
}, { // Band 5
{ 13, 217, 212 }, { 2, 136, 180 }, { 1, 78, 124 },
{ 1, 50, 83 }, { 1, 29, 49 }, { 1, 14, 23 }
}
}, { // Inter
{ // Band 0
{ 197, 13, 247 }, { 82, 17, 222 }, { 25, 17, 162 }
}, { // Band 1
{ 126, 186, 247 }, { 234, 191, 243 }, { 176, 177, 234 },
{ 104, 158, 220 }, { 66, 128, 186 }, { 55, 90, 137 }
}, { // Band 2
{ 111, 197, 242 }, { 46, 158, 219 }, { 9, 104, 171 },
{ 2, 65, 125 }, { 1, 44, 80 }, { 1, 17, 91 }
}, { // Band 3
{ 104, 208, 245 }, { 39, 168, 224 }, { 3, 109, 162 },
{ 1, 79, 124 }, { 1, 50, 102 }, { 1, 43, 102 }
}, { // Band 4
{ 84, 220, 246 }, { 31, 177, 231 }, { 2, 115, 180 },
{ 1, 79, 134 }, { 1, 55, 77 }, { 1, 60, 79 }
}, { // Band 5
{ 43, 243, 240 }, { 8, 180, 217 }, { 1, 115, 166 },
{ 1, 84, 121 }, { 1, 51, 67 }, { 1, 16, 6 }
}
}
}
};
#endif // CONFIG_TX64X64
static void extend_to_full_distribution(vp9_prob *probs, vp9_prob p) {
vpx_memcpy(probs, vp9_pareto8_full[p == 0 ? 0 : p - 1],
MODEL_NODES * sizeof(vp9_prob));
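For orientation (an inference from the surrounding code, not commit text): each { a, b, c } triple in the tables above stores only the three unconstrained node probabilities of the coefficient model; extend_to_full_distribution() fills the remaining MODEL_NODES of the token tree from the precomputed vp9_pareto8_full rows, indexed by the pivot probability.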
@ -752,6 +1032,9 @@ void vp9_default_coef_probs(VP9_COMMON *cm) {
vp9_copy(cm->fc.coef_probs[TX_8X8], default_coef_probs_8x8);
vp9_copy(cm->fc.coef_probs[TX_16X16], default_coef_probs_16x16);
vp9_copy(cm->fc.coef_probs[TX_32X32], default_coef_probs_32x32);
#if CONFIG_TX64X64
vp9_copy(cm->fc.coef_probs[TX_64X64], default_coef_probs_64x64);
#endif
}
#define COEF_COUNT_SAT 24
@ -806,6 +1089,6 @@ void vp9_adapt_coef_probs(VP9_COMMON *cm) {
update_factor = COEF_MAX_UPDATE_FACTOR;
count_sat = COEF_COUNT_SAT;
}
for (t = TX_4X4; t <= TX_32X32; t++)
for (t = TX_4X4; t < TX_SIZES; t++)
adapt_coef_probs(cm, t, count_sat, update_factor);
}


@ -90,10 +90,20 @@ extern const vp9_extra_bit vp9_extra_bits_high10[ENTROPY_TOKENS];
extern const vp9_extra_bit vp9_extra_bits_high12[ENTROPY_TOKENS];
#endif // CONFIG_VP9_HIGHBITDEPTH
#if CONFIG_TX64X64
#define DCT_MAX_VALUE 32768
#else
#define DCT_MAX_VALUE 16384
#endif // CONFIG_TX64X64
#if CONFIG_VP9_HIGHBITDEPTH
#if CONFIG_TX64X64
#define DCT_MAX_VALUE_HIGH10 131072
#define DCT_MAX_VALUE_HIGH12 524288
#else
#define DCT_MAX_VALUE_HIGH10 65536
#define DCT_MAX_VALUE_HIGH12 262144
#endif // CONFIG_TX64X64
#endif // CONFIG_VP9_HIGHBITDEPTH
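Arithmetic check on the new bounds (editorial note, not commit text): each constant exactly doubles its 32x32-max counterpart — 16384 -> 32768, 65536 -> 131072, 262144 -> 524288 — matching the one extra bit of dynamic range a 64-point transform stage can accumulate, while the 10- and 12-bit variants keep their 4x-per-two-extra-bits spacing (32768 * 4 = 131072, 131072 * 4 = 524288).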
/* Coefficients are predicted via a 3-dimensional probability table. */
@ -153,7 +163,14 @@ static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) {
// This macro is currently unused but may be used by certain implementations
#define MAXBAND_INDEX 21
DECLARE_ALIGNED(16, extern const uint8_t, vp9_coefband_trans_8x8plus[1024]);
#if CONFIG_TX64X64
#define MAX_NUM_COEFS 4096
#else
#define MAX_NUM_COEFS 1024
#endif
DECLARE_ALIGNED(16, extern const uint8_t,
vp9_coefband_trans_8x8plus[MAX_NUM_COEFS]);
DECLARE_ALIGNED(16, extern const uint8_t, vp9_coefband_trans_4x4[16]);
static INLINE const uint8_t *get_band_translate(TX_SIZE tx_size) {
@ -204,6 +221,12 @@ static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a,
above_ec = !!*(const uint64_t *)a;
left_ec = !!*(const uint64_t *)l;
break;
#if CONFIG_TX64X64
case TX_64X64:
above_ec = !!*(const uint64_t *)a;
left_ec = !!*(const uint64_t *)l;
break;
#endif
default:
assert(0 && "Invalid transform size.");
break;


@ -229,7 +229,7 @@ const vp9_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)] = {
-D135_PRED, -D117_PRED, /* 5 = D135_NODE */
-D45_PRED, 14, /* 6 = D45_NODE */
-D63_PRED, 16, /* 7 = D63_NODE */
-D153_PRED, -D207_PRED /* 8 = D153_NODE */
-D153_PRED, -D207_PRED /* 8 = D153_NODE */
};
const vp9_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)] = {
@ -265,6 +265,11 @@ static const vp9_prob default_single_ref_p[REF_CONTEXTS][2] = {
};
static const struct tx_probs default_tx_probs = {
#if CONFIG_TX64X64
{ { 3, 3, 136, 37 },
{ 3, 5, 52, 13 } },
#endif
{ { 3, 136, 37 },
{ 5, 52, 13 } },
@ -275,6 +280,26 @@ static const struct tx_probs default_tx_probs = {
{ 66 } }
};
#if CONFIG_TX64X64
void tx_counts_to_branch_counts_64x64(const unsigned int *tx_count_64x64p,
unsigned int (*ct_64x64p)[2]) {
ct_64x64p[0][0] = tx_count_64x64p[TX_4X4];
ct_64x64p[0][1] = tx_count_64x64p[TX_8X8] +
tx_count_64x64p[TX_16X16] +
tx_count_64x64p[TX_32X32] +
tx_count_64x64p[TX_64X64];
ct_64x64p[1][0] = tx_count_64x64p[TX_8X8];
ct_64x64p[1][1] = tx_count_64x64p[TX_16X16] +
tx_count_64x64p[TX_32X32] +
tx_count_64x64p[TX_64X64];
ct_64x64p[2][0] = tx_count_64x64p[TX_16X16];
ct_64x64p[2][1] = tx_count_64x64p[TX_32X32] +
tx_count_64x64p[TX_64X64];
ct_64x64p[3][0] = tx_count_64x64p[TX_32X32];
ct_64x64p[3][1] = tx_count_64x64p[TX_64X64];
}
#endif
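The four branch counts correspond to a unary-style tree over the five transform sizes. A hedged sketch of the matching read side (modeled on the existing read_selected_tx_size() pattern; the commit's actual decode-side change is not shown in full here, so treat this as an assumption):

// Hedged sketch, not quoted from the commit: one binary decision per branch,
// with probs[] being a p64x64[ctx] row from struct tx_probs.
static TX_SIZE read_selected_tx_size_upto_64x64(vp9_reader *r,
                                                const vp9_prob probs[4]) {
  int tx_size = vp9_read(r, probs[0]);     // 0: TX_4X4, 1: something larger
  if (tx_size != TX_4X4) {
    tx_size += vp9_read(r, probs[1]);      // stop at TX_8X8?
    if (tx_size != TX_8X8) {
      tx_size += vp9_read(r, probs[2]);    // stop at TX_16X16?
      if (tx_size != TX_16X16)
        tx_size += vp9_read(r, probs[3]);  // TX_32X32 vs TX_64X64
    }
  }
  return (TX_SIZE)tx_size;
}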
void tx_counts_to_branch_counts_32x32(const unsigned int *tx_count_32x32p,
unsigned int (*ct_32x32p)[2]) {
ct_32x32p[0][0] = tx_count_32x32p[TX_4X4];
@ -392,25 +417,34 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) {
if (cm->tx_mode == TX_MODE_SELECT) {
int j;
unsigned int branch_ct_8x8p[TX_SIZES - 3][2];
unsigned int branch_ct_16x16p[TX_SIZES - 2][2];
unsigned int branch_ct_32x32p[TX_SIZES - 1][2];
unsigned int branch_ct_8x8p[1][2];
unsigned int branch_ct_16x16p[2][2];
unsigned int branch_ct_32x32p[3][2];
#if CONFIG_TX64X64
unsigned int branch_ct_64x64p[4][2];
#endif
for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
tx_counts_to_branch_counts_8x8(counts->tx.p8x8[i], branch_ct_8x8p);
for (j = 0; j < TX_SIZES - 3; ++j)
for (j = 0; j < 1; ++j)
fc->tx_probs.p8x8[i][j] = adapt_prob(pre_fc->tx_probs.p8x8[i][j],
branch_ct_8x8p[j]);
tx_counts_to_branch_counts_16x16(counts->tx.p16x16[i], branch_ct_16x16p);
for (j = 0; j < TX_SIZES - 2; ++j)
for (j = 0; j < 2; ++j)
fc->tx_probs.p16x16[i][j] = adapt_prob(pre_fc->tx_probs.p16x16[i][j],
branch_ct_16x16p[j]);
tx_counts_to_branch_counts_32x32(counts->tx.p32x32[i], branch_ct_32x32p);
for (j = 0; j < TX_SIZES - 1; ++j)
for (j = 0; j < 3; ++j)
fc->tx_probs.p32x32[i][j] = adapt_prob(pre_fc->tx_probs.p32x32[i][j],
branch_ct_32x32p[j]);
#if CONFIG_TX64X64
tx_counts_to_branch_counts_64x64(counts->tx.p64x64[i], branch_ct_64x64p);
for (j = 0; j < 4; ++j)
fc->tx_probs.p64x64[i][j] = adapt_prob(pre_fc->tx_probs.p64x64[i][j],
branch_ct_64x64p[j]);
#endif
}
}


@ -24,15 +24,21 @@ extern "C" {
struct VP9Common;
struct tx_probs {
vp9_prob p32x32[TX_SIZE_CONTEXTS][TX_SIZES - 1];
vp9_prob p16x16[TX_SIZE_CONTEXTS][TX_SIZES - 2];
vp9_prob p8x8[TX_SIZE_CONTEXTS][TX_SIZES - 3];
#if CONFIG_TX64X64
vp9_prob p64x64[TX_SIZE_CONTEXTS][4];
#endif
vp9_prob p32x32[TX_SIZE_CONTEXTS][3];
vp9_prob p16x16[TX_SIZE_CONTEXTS][2];
vp9_prob p8x8[TX_SIZE_CONTEXTS][1];
};
struct tx_counts {
unsigned int p32x32[TX_SIZE_CONTEXTS][TX_SIZES];
unsigned int p16x16[TX_SIZE_CONTEXTS][TX_SIZES - 1];
unsigned int p8x8[TX_SIZE_CONTEXTS][TX_SIZES - 2];
#if CONFIG_TX64X64
unsigned int p64x64[TX_SIZE_CONTEXTS][5];
#endif
unsigned int p32x32[TX_SIZE_CONTEXTS][4];
unsigned int p16x16[TX_SIZE_CONTEXTS][3];
unsigned int p8x8[TX_SIZE_CONTEXTS][2];
};
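Note the deliberate off-by-one between the two structs: an n-way size decision needs n - 1 binary probabilities but n count buckets, so p64x64 pairs 4 probs with 5 counts, p32x32 pairs 3 with 4, and so on down to p8x8.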
typedef struct frame_contexts {
@ -88,6 +94,10 @@ void vp9_init_mode_probs(FRAME_CONTEXT *fc);
void vp9_adapt_mode_probs(struct VP9Common *cm);
#if CONFIG_TX64X64
void tx_counts_to_branch_counts_64x64(const unsigned int *tx_count_64x64p,
unsigned int (*ct_64x64p)[2]);
#endif
void tx_counts_to_branch_counts_32x32(const unsigned int *tx_count_32x32p,
unsigned int (*ct_32x32p)[2]);
void tx_counts_to_branch_counts_16x16(const unsigned int *tx_count_16x16p,


@ -77,6 +77,9 @@ typedef enum {
TX_8X8 = 1, // 8x8 transform
TX_16X16 = 2, // 16x16 transform
TX_32X32 = 3, // 32x32 transform
#if CONFIG_TX64X64
TX_64X64 = 4, // 64x64 transform
#endif
TX_SIZES
} TX_SIZE;
@ -86,8 +89,11 @@ typedef enum {
ALLOW_8X8 = 1, // allow block transform size up to 8x8
ALLOW_16X16 = 2, // allow block transform size up to 16x16
ALLOW_32X32 = 3, // allow block transform size up to 32x32
TX_MODE_SELECT = 4, // transform specified for each block
TX_MODES = 5,
#if CONFIG_TX64X64
ALLOW_64X64 = 4, // allow block transform size up to 64x64
#endif
TX_MODE_SELECT, // transform specified for each block
TX_MODES,
} TX_MODE;
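With the experiment off, the enum values are unchanged (TX_MODE_SELECT = 4, TX_MODES = 5); dropping the explicit initializers simply lets ALLOW_64X64 slot in at 4 and TX_MODE_SELECT/TX_MODES float to 5/6 when CONFIG_TX64X64 is defined.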
typedef enum {


@ -1457,6 +1457,458 @@ void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
}
}
#if CONFIG_TX64X64
#define DownshiftMultiplyBy2(x) ((x) * 2)
#define DownshiftMultiply(x) (x)
static void idct16f(double *input, double *output, int stride) {
static const double C1 = 0.995184726672197;
static const double C2 = 0.98078528040323;
static const double C3 = 0.956940335732209;
static const double C4 = 0.923879532511287;
static const double C5 = 0.881921264348355;
static const double C6 = 0.831469612302545;
static const double C7 = 0.773010453362737;
static const double C8 = 0.707106781186548;
static const double C9 = 0.634393284163646;
static const double C10 = 0.555570233019602;
static const double C11 = 0.471396736825998;
static const double C12 = 0.38268343236509;
static const double C13 = 0.290284677254462;
static const double C14 = 0.195090322016128;
static const double C15 = 0.098017140329561;
double step[16];
double intermediate[16];
double temp1, temp2;
// step 1 and 2
step[ 0] = input[stride*0] + input[stride*8];
step[ 1] = input[stride*0] - input[stride*8];
temp1 = input[stride*4]*C12;
temp2 = input[stride*12]*C4;
temp1 -= temp2;
temp1 = DownshiftMultiply(temp1);
temp1 *= C8;
step[ 2] = DownshiftMultiplyBy2(temp1);
temp1 = input[stride*4]*C4;
temp2 = input[stride*12]*C12;
temp1 += temp2;
temp1 = DownshiftMultiply(temp1);
temp1 *= C8;
step[ 3] = DownshiftMultiplyBy2(temp1);
temp1 = input[stride*2]*C8;
temp1 = DownshiftMultiplyBy2(temp1);
temp2 = input[stride*6] + input[stride*10];
step[ 4] = temp1 + temp2;
step[ 5] = temp1 - temp2;
temp1 = input[stride*14]*C8;
temp1 = DownshiftMultiplyBy2(temp1);
temp2 = input[stride*6] - input[stride*10];
step[ 6] = temp2 - temp1;
step[ 7] = temp2 + temp1;
// for odd input
temp1 = input[stride*3]*C12;
temp2 = input[stride*13]*C4;
temp1 += temp2;
temp1 = DownshiftMultiply(temp1);
temp1 *= C8;
intermediate[ 8] = DownshiftMultiplyBy2(temp1);
temp1 = input[stride*3]*C4;
temp2 = input[stride*13]*C12;
temp2 -= temp1;
temp2 = DownshiftMultiply(temp2);
temp2 *= C8;
intermediate[ 9] = DownshiftMultiplyBy2(temp2);
intermediate[10] = DownshiftMultiplyBy2(input[stride*9]*C8);
intermediate[11] = input[stride*15] - input[stride*1];
intermediate[12] = input[stride*15] + input[stride*1];
intermediate[13] = DownshiftMultiplyBy2((input[stride*7]*C8));
temp1 = input[stride*11]*C12;
temp2 = input[stride*5]*C4;
temp2 -= temp1;
temp2 = DownshiftMultiply(temp2);
temp2 *= C8;
intermediate[14] = DownshiftMultiplyBy2(temp2);
temp1 = input[stride*11]*C4;
temp2 = input[stride*5]*C12;
temp1 += temp2;
temp1 = DownshiftMultiply(temp1);
temp1 *= C8;
intermediate[15] = DownshiftMultiplyBy2(temp1);
step[ 8] = intermediate[ 8] + intermediate[14];
step[ 9] = intermediate[ 9] + intermediate[15];
step[10] = intermediate[10] + intermediate[11];
step[11] = intermediate[10] - intermediate[11];
step[12] = intermediate[12] + intermediate[13];
step[13] = intermediate[12] - intermediate[13];
step[14] = intermediate[ 8] - intermediate[14];
step[15] = intermediate[ 9] - intermediate[15];
// step 3
output[stride*0] = step[ 0] + step[ 3];
output[stride*1] = step[ 1] + step[ 2];
output[stride*2] = step[ 1] - step[ 2];
output[stride*3] = step[ 0] - step[ 3];
temp1 = step[ 4]*C14;
temp2 = step[ 7]*C2;
temp1 -= temp2;
output[stride*4] = DownshiftMultiply(temp1);
temp1 = step[ 4]*C2;
temp2 = step[ 7]*C14;
temp1 += temp2;
output[stride*7] = DownshiftMultiply(temp1);
temp1 = step[ 5]*C10;
temp2 = step[ 6]*C6;
temp1 -= temp2;
output[stride*5] = DownshiftMultiply(temp1);
temp1 = step[ 5]*C6;
temp2 = step[ 6]*C10;
temp1 += temp2;
output[stride*6] = DownshiftMultiply(temp1);
output[stride*8] = step[ 8] + step[11];
output[stride*9] = step[ 9] + step[10];
output[stride*10] = step[ 9] - step[10];
output[stride*11] = step[ 8] - step[11];
output[stride*12] = step[12] + step[15];
output[stride*13] = step[13] + step[14];
output[stride*14] = step[13] - step[14];
output[stride*15] = step[12] - step[15];
// output 4
step[ 0] = output[stride*0] + output[stride*7];
step[ 1] = output[stride*1] + output[stride*6];
step[ 2] = output[stride*2] + output[stride*5];
step[ 3] = output[stride*3] + output[stride*4];
step[ 4] = output[stride*3] - output[stride*4];
step[ 5] = output[stride*2] - output[stride*5];
step[ 6] = output[stride*1] - output[stride*6];
step[ 7] = output[stride*0] - output[stride*7];
temp1 = output[stride*8]*C7;
temp2 = output[stride*15]*C9;
temp1 -= temp2;
step[ 8] = DownshiftMultiply(temp1);
temp1 = output[stride*9]*C11;
temp2 = output[stride*14]*C5;
temp1 += temp2;
step[ 9] = DownshiftMultiply(temp1);
temp1 = output[stride*10]*C3;
temp2 = output[stride*13]*C13;
temp1 -= temp2;
step[10] = DownshiftMultiply(temp1);
temp1 = output[stride*11]*C15;
temp2 = output[stride*12]*C1;
temp1 += temp2;
step[11] = DownshiftMultiply(temp1);
temp1 = output[stride*11]*C1;
temp2 = output[stride*12]*C15;
temp2 -= temp1;
step[12] = DownshiftMultiply(temp2);
temp1 = output[stride*10]*C13;
temp2 = output[stride*13]*C3;
temp1 += temp2;
step[13] = DownshiftMultiply(temp1);
temp1 = output[stride*9]*C5;
temp2 = output[stride*14]*C11;
temp2 -= temp1;
step[14] = DownshiftMultiply(temp2);
temp1 = output[stride*8]*C9;
temp2 = output[stride*15]*C7;
temp1 += temp2;
step[15] = DownshiftMultiply(temp1);
// step 5
output[stride*0] = step[0] + step[15];
output[stride*1] = step[1] + step[14];
output[stride*2] = step[2] + step[13];
output[stride*3] = step[3] + step[12];
output[stride*4] = step[4] + step[11];
output[stride*5] = step[5] + step[10];
output[stride*6] = step[6] + step[ 9];
output[stride*7] = step[7] + step[ 8];
output[stride*15] = step[0] - step[15];
output[stride*14] = step[1] - step[14];
output[stride*13] = step[2] - step[13];
output[stride*12] = step[3] - step[12];
output[stride*11] = step[4] - step[11];
output[stride*10] = step[5] - step[10];
output[stride*9] = step[6] - step[ 9];
output[stride*8] = step[7] - step[ 8];
}
static void butterfly_32_idct_1d(double *input, double *output, int stride) {
static const double C1 = 0.998795456205; // cos(pi * 1 / 64)
static const double C3 = 0.989176509965; // cos(pi * 3 / 64)
static const double C5 = 0.970031253195; // cos(pi * 5 / 64)
static const double C7 = 0.941544065183; // cos(pi * 7 / 64)
static const double C9 = 0.903989293123; // cos(pi * 9 / 64)
static const double C11 = 0.857728610000; // cos(pi * 11 / 64)
static const double C13 = 0.803207531481; // cos(pi * 13 / 64)
static const double C15 = 0.740951125355; // cos(pi * 15 / 64)
static const double C16 = 0.707106781187; // cos(pi * 16 / 64)
static const double C17 = 0.671558954847; // cos(pi * 17 / 64)
static const double C19 = 0.595699304492; // cos(pi * 19 / 64)
static const double C21 = 0.514102744193; // cos(pi * 21 / 64)
static const double C23 = 0.427555093430; // cos(pi * 23 / 64)
static const double C25 = 0.336889853392; // cos(pi * 25 / 64)
static const double C27 = 0.242980179903; // cos(pi * 27 / 64)
static const double C29 = 0.146730474455; // cos(pi * 29 / 64)
static const double C31 = 0.049067674327; // cos(pi * 31 / 64)
double step1[32];
double step2[32];
step1[ 0] = input[stride*0];
step1[ 1] = input[stride*2];
step1[ 2] = input[stride*4];
step1[ 3] = input[stride*6];
step1[ 4] = input[stride*8];
step1[ 5] = input[stride*10];
step1[ 6] = input[stride*12];
step1[ 7] = input[stride*14];
step1[ 8] = input[stride*16];
step1[ 9] = input[stride*18];
step1[10] = input[stride*20];
step1[11] = input[stride*22];
step1[12] = input[stride*24];
step1[13] = input[stride*26];
step1[14] = input[stride*28];
step1[15] = input[stride*30];
step1[16] = DownshiftMultiplyBy2(input[stride*1]*C16);
step1[17] = (input[stride*3] + input[stride*1]);
step1[18] = (input[stride*5] + input[stride*3]);
step1[19] = (input[stride*7] + input[stride*5]);
step1[20] = (input[stride*9] + input[stride*7]);
step1[21] = (input[stride*11] + input[stride*9]);
step1[22] = (input[stride*13] + input[stride*11]);
step1[23] = (input[stride*15] + input[stride*13]);
step1[24] = (input[stride*17] + input[stride*15]);
step1[25] = (input[stride*19] + input[stride*17]);
step1[26] = (input[stride*21] + input[stride*19]);
step1[27] = (input[stride*23] + input[stride*21]);
step1[28] = (input[stride*25] + input[stride*23]);
step1[29] = (input[stride*27] + input[stride*25]);
step1[30] = (input[stride*29] + input[stride*27]);
step1[31] = (input[stride*31] + input[stride*29]);
idct16f(step1, step2, 1);
idct16f(step1 + 16, step2 + 16, 1);
step2[16] = DownshiftMultiply(step2[16] / (2*C1));
step2[17] = DownshiftMultiply(step2[17] / (2*C3));
step2[18] = DownshiftMultiply(step2[18] / (2*C5));
step2[19] = DownshiftMultiply(step2[19] / (2*C7));
step2[20] = DownshiftMultiply(step2[20] / (2*C9));
step2[21] = DownshiftMultiply(step2[21] / (2*C11));
step2[22] = DownshiftMultiply(step2[22] / (2*C13));
step2[23] = DownshiftMultiply(step2[23] / (2*C15));
step2[24] = DownshiftMultiply(step2[24] / (2*C17));
step2[25] = DownshiftMultiply(step2[25] / (2*C19));
step2[26] = DownshiftMultiply(step2[26] / (2*C21));
step2[27] = DownshiftMultiply(step2[27] / (2*C23));
step2[28] = DownshiftMultiply(step2[28] / (2*C25));
step2[29] = DownshiftMultiply(step2[29] / (2*C27));
step2[30] = DownshiftMultiply(step2[30] / (2*C29));
step2[31] = DownshiftMultiply(step2[31] / (2*C31));
output[stride* 0] = step2[ 0] + step2[16];
output[stride* 1] = step2[ 1] + step2[17];
output[stride* 2] = step2[ 2] + step2[18];
output[stride* 3] = step2[ 3] + step2[19];
output[stride* 4] = step2[ 4] + step2[20];
output[stride* 5] = step2[ 5] + step2[21];
output[stride* 6] = step2[ 6] + step2[22];
output[stride* 7] = step2[ 7] + step2[23];
output[stride* 8] = step2[ 8] + step2[24];
output[stride* 9] = step2[ 9] + step2[25];
output[stride*10] = step2[10] + step2[26];
output[stride*11] = step2[11] + step2[27];
output[stride*12] = step2[12] + step2[28];
output[stride*13] = step2[13] + step2[29];
output[stride*14] = step2[14] + step2[30];
output[stride*15] = step2[15] + step2[31];
output[stride*16] = step2[15] - step2[(31 - 0)];
output[stride*17] = step2[14] - step2[(31 - 1)];
output[stride*18] = step2[13] - step2[(31 - 2)];
output[stride*19] = step2[12] - step2[(31 - 3)];
output[stride*20] = step2[11] - step2[(31 - 4)];
output[stride*21] = step2[10] - step2[(31 - 5)];
output[stride*22] = step2[ 9] - step2[(31 - 6)];
output[stride*23] = step2[ 8] - step2[(31 - 7)];
output[stride*24] = step2[ 7] - step2[(31 - 8)];
output[stride*25] = step2[ 6] - step2[(31 - 9)];
output[stride*26] = step2[ 5] - step2[(31 - 10)];
output[stride*27] = step2[ 4] - step2[(31 - 11)];
output[stride*28] = step2[ 3] - step2[(31 - 12)];
output[stride*29] = step2[ 2] - step2[(31 - 13)];
output[stride*30] = step2[ 1] - step2[(31 - 14)];
output[stride*31] = step2[ 0] - step2[(31 - 15)];
}
static void butterfly_64_idct_1d(double *input, double *output, int stride) {
double step1[64], step2[64];
int i;
static const double C[64] = {
1.00000000000000000000, // cos(0 * pi / 128)
0.99969881869620424997, // cos(1 * pi / 128)
0.99879545620517240501, // cos(2 * pi / 128)
0.99729045667869020697, // cos(3 * pi / 128)
0.99518472667219692873, // cos(4 * pi / 128)
0.99247953459870996706, // cos(5 * pi / 128)
0.98917650996478101444, // cos(6 * pi / 128)
0.98527764238894122162, // cos(7 * pi / 128)
0.98078528040323043058, // cos(8 * pi / 128)
0.97570213003852857003, // cos(9 * pi / 128)
0.97003125319454397424, // cos(10 * pi / 128)
0.96377606579543984022, // cos(11 * pi / 128)
0.95694033573220882438, // cos(12 * pi / 128)
0.94952818059303667475, // cos(13 * pi / 128)
0.94154406518302080631, // cos(14 * pi / 128)
0.93299279883473895669, // cos(15 * pi / 128)
0.92387953251128673848, // cos(16 * pi / 128)
0.91420975570353069095, // cos(17 * pi / 128)
0.90398929312344333820, // cos(18 * pi / 128)
0.89322430119551532446, // cos(19 * pi / 128)
0.88192126434835504956, // cos(20 * pi / 128)
0.87008699110871146054, // cos(21 * pi / 128)
0.85772861000027211809, // cos(22 * pi / 128)
0.84485356524970711689, // cos(23 * pi / 128)
0.83146961230254523567, // cos(24 * pi / 128)
0.81758481315158371139, // cos(25 * pi / 128)
0.80320753148064494287, // cos(26 * pi / 128)
0.78834642762660633863, // cos(27 * pi / 128)
0.77301045336273699338, // cos(28 * pi / 128)
0.75720884650648456748, // cos(29 * pi / 128)
0.74095112535495921691, // cos(30 * pi / 128)
0.72424708295146700276, // cos(31 * pi / 128)
0.70710678118654757274, // cos(32 * pi / 128)
0.68954054473706694051, // cos(33 * pi / 128)
0.67155895484701844111, // cos(34 * pi / 128)
0.65317284295377686654, // cos(35 * pi / 128)
0.63439328416364559882, // cos(36 * pi / 128)
0.61523159058062693028, // cos(37 * pi / 128)
0.59569930449243346793, // cos(38 * pi / 128)
0.57580819141784544968, // cos(39 * pi / 128)
0.55557023301960228867, // cos(40 * pi / 128)
0.53499761988709737537, // cos(41 * pi / 128)
0.51410274419322177231, // cos(42 * pi / 128)
0.49289819222978414892, // cos(43 * pi / 128)
0.47139673682599780857, // cos(44 * pi / 128)
0.44961132965460659516, // cos(45 * pi / 128)
0.42755509343028219593, // cos(46 * pi / 128)
0.40524131400498980549, // cos(47 * pi / 128)
0.38268343236508983729, // cos(48 * pi / 128)
0.35989503653498827740, // cos(49 * pi / 128)
0.33688985339222005111, // cos(50 * pi / 128)
0.31368174039889151761, // cos(51 * pi / 128)
0.29028467725446227554, // cos(52 * pi / 128)
0.26671275747489842090, // cos(53 * pi / 128)
0.24298017990326398197, // cos(54 * pi / 128)
0.21910124015686976984, // cos(55 * pi / 128)
0.19509032201612830359, // cos(56 * pi / 128)
0.17096188876030135595, // cos(57 * pi / 128)
0.14673047445536174793, // cos(58 * pi / 128)
0.12241067519921627893, // cos(59 * pi / 128)
0.09801714032956077016, // cos(60 * pi / 128)
0.07356456359966745406, // cos(61 * pi / 128)
0.04906767432741813290, // cos(62 * pi / 128)
0.02454122852291226731, // cos(63 * pi / 128)
};
for (i = 0; i < 64; i += 2) {
step1[i / 2] = input[stride * i];
}
step1[32] = DownshiftMultiplyBy2(input[stride*1] * C[32]);
for (i = 3; i < 64; i+=2) {
step1[32 + i/2] = (input[stride * i] + input[stride * (i - 2)]);
}
butterfly_32_idct_1d(step1, step2, 1);
butterfly_32_idct_1d(step1 + 32, step2 + 32, 1);
for (i = 32; i < 64; ++i) {
step2[i] = DownshiftMultiply(step2[i] / (2 * C[(i - 32) * 2 + 1]));
}
for (i = 0; i < 32; ++i) {
output[stride * i] = step2[i] + step2[32 + i];
}
for (i = 0; i < 32; ++i) {
output[stride * (i + 32)] = step2[31 - i] - step2[63 - i];
}
}
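Both butterfly_* helpers apply the same standard radix-2 inverse-DCT recursion, each invoking the half-size transform twice (stated here as background, not as commit text): split the N coefficients by parity, inverse-transform each half at size N/2, and recombine as

  out[n]     = g[n] + h[n]
  out[N-1-n] = g[n] - h[n],   0 <= n < N/2,

where g[] is the half-size inverse transform of the even coefficients and h[n] = H[n] / (2 * cos((2n+1) * pi / (2N))), with H[] the half-size inverse transform of the folded odd coefficients X[2k+1] + X[2k-1]. The step2[i] / (2 * C[...]) divisions above are exactly that cosine correction, and the doubled first odd term (step1[32] in the 64-point case) handles the k = 0 boundary of the X[2k-1] shift, up to the helpers' internal scaling.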
void vp9_idct64x64_4096_add_c(const tran_low_t *input, uint8_t *dest,
int stride) {
// vp9_clear_system_state(); // Make it simd safe : __asm emms;
{
double out[64 * 64], out2[64 * 64];
int i, j;
// First transform rows
for (i = 0; i < 64; ++i) {
double temp_in[64], temp_out[64];
for (j = 0; j < 64; ++j)
temp_in[j] = input[j + i * 64];
butterfly_64_idct_1d(temp_in, temp_out, 1);
for (j = 0; j < 64; ++j)
out[j + i * 64] = temp_out[j];
}
// Then transform columns
for (i = 0; i < 64; ++i) {
double temp_in[64], temp_out[64];
for (j = 0; j < 64; ++j)
temp_in[j] = out[j * 64 + i];
butterfly_64_idct_1d(temp_in, temp_out, 1);
for (j = 0; j < 64; ++j)
out2[j * 64 + i] = temp_out[j];
}
for (j = 0; j < 64; ++j) {
for (i = 0; i < 64; ++i)
dest[i] = clip_pixel_add(dest[i], round(out2[j * 64 + i] / 128));
dest += stride;
}
}
// vp9_clear_system_state(); // Make it simd safe : __asm emms;
}
void vp9_idct64x64_add(const tran_low_t *input, uint8_t *dest,
int stride, int eob) {
(void) eob;
vp9_idct64x64_4096_add_c(input, dest, stride);
}
#endif
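Both 64x64 inverse transforms are plain double-precision reference implementations — the RTCD definitions later in this commit list no SIMD specializations for them — and the final round(out2[...] / 128) presumably undoes the gain of the paired vp9_fdct64x64 forward transform.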
#if CONFIG_VP9_HIGHBITDEPTH
void vp9_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) {
@ -2899,4 +3351,47 @@ void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input,
vp9_highbd_iht16x16_256_add(input, dest, stride, tx_type, bd);
}
}
#if CONFIG_TX64X64
void vp9_highbd_idct64x64_4096_add_c(const tran_low_t *input, uint8_t *dest,
int stride, int bd) {
// vp9_clear_system_state(); // Make it simd safe : __asm emms;
{
double out[64 * 64], out2[64 * 64];
int i, j;
// First transform rows
for (i = 0; i < 64; ++i) {
double temp_in[64], temp_out[64];
for (j = 0; j < 64; ++j)
temp_in[j] = input[j + i * 64];
butterfly_64_idct_1d(temp_in, temp_out, 1);
for (j = 0; j < 64; ++j)
out[j + i * 64] = temp_out[j];
}
// Then transform columns
for (i = 0; i < 64; ++i) {
double temp_in[64], temp_out[64];
for (j = 0; j < 64; ++j)
temp_in[j] = out[j * 64 + i];
butterfly_64_idct_1d(temp_in, temp_out, 1);
for (j = 0; j < 64; ++j)
out2[j * 64 + i] = temp_out[j];
}
for (j = 0; j < 64; ++j) {
for (i = 0; i < 64; ++i)
dest[i] = highbd_clip_pixel_add(
dest[i], round(out2[j * 64 + i] / 128), bd);
dest += stride;
}
}
// vp9_clear_system_state(); // Make it simd safe : __asm emms;
}
void vp9_highbd_idct64x64_add(const tran_low_t *input, uint8_t *dest,
int stride, int eob, int bd) {
(void) eob;
vp9_highbd_idct64x64_4096_add_c(input, dest, stride, bd);
}
#endif // CONFIG_TX64X64
#endif // CONFIG_VP9_HIGHBITDEPTH


@ -122,11 +122,14 @@ void vp9_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
int eob);
void vp9_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
int eob);
void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride, int
eob);
void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride,
int eob);
void vp9_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
int eob);
#if CONFIG_TX64X64
void vp9_idct64x64_add(const tran_low_t *input, uint8_t *dest, int stride,
int eob);
#endif
void vp9_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
int stride, int eob);
void vp9_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
@ -145,6 +148,10 @@ void vp9_highbd_idct16x16_add(const tran_low_t *input, uint8_t *dest,
int stride, int eob, int bd);
void vp9_highbd_idct32x32_add(const tran_low_t *input, uint8_t *dest,
int stride, int eob, int bd);
#if CONFIG_TX64X64
void vp9_highbd_idct64x64_add(const tran_low_t *input, uint8_t *dest,
int stride, int eob, int bd);
#endif
void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input,
uint8_t *dest, int stride, int eob, int bd);
void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,


@ -38,6 +38,9 @@ static const uint64_t left_64x64_txform_mask[TX_SIZES]= {
0xffffffffffffffff, // TX_8x8
0x5555555555555555, // TX_16x16
0x1111111111111111, // TX_32x32
#if CONFIG_TX64X64
0x0101010101010101, // TX_64x64
#endif
};
// 64 bit masks for above transform size. Each 1 represents a position where
@ -62,6 +65,9 @@ static const uint64_t above_64x64_txform_mask[TX_SIZES]= {
0xffffffffffffffff, // TX_8x8
0x00ff00ff00ff00ff, // TX_16x16
0x000000ff000000ff, // TX_32x32
#if CONFIG_TX64X64
0x00000000000000ff, // TX_64x64
#endif
};
// 64 bit masks for prediction sizes (left). Each 1 represents a position
@ -140,6 +146,9 @@ static const uint16_t left_64x64_txform_mask_uv[TX_SIZES]= {
0xffff, // TX_8x8
0x5555, // TX_16x16
0x1111, // TX_32x32
#if CONFIG_TX64X64
0x0101, // TX_64x64, never used
#endif
};
static const uint16_t above_64x64_txform_mask_uv[TX_SIZES]= {
@ -147,6 +156,9 @@ static const uint16_t above_64x64_txform_mask_uv[TX_SIZES]= {
0xffff, // TX_8x8
0x0f0f, // TX_16x16
0x000f, // TX_32x32
#if CONFIG_TX64X64
0x0003, // TX_64x64, never used
#endif
};
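Reading these by the file's existing convention (an inference from the neighboring tables, not commit text): the 64-bit luma masks carry one bit per 8x8 unit of a 64x64 superblock, 8 bits per row, so a 64x64 transform leaves only the block's outer edges to filter — bit 0 of each row in the left mask (0x0101010101010101) and the first row in the above mask (0x00000000000000ff). The 16-bit chroma masks cover the 4x4 grid of the subsampled plane and are marked never used because 4:2:0 chroma tops out at a 32x32 transform.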
// 16 bit left mask to shift and set for each uv prediction size.


@ -107,6 +107,10 @@ static INLINE const vp9_prob *get_tx_probs(TX_SIZE max_tx_size, int ctx,
return tx_probs->p16x16[ctx];
case TX_32X32:
return tx_probs->p32x32[ctx];
#if CONFIG_TX64X64
case TX_64X64:
return tx_probs->p64x64[ctx];
#endif
default:
assert(0 && "Invalid max_tx_size.");
return NULL;
@ -128,6 +132,10 @@ static INLINE unsigned int *get_tx_counts(TX_SIZE max_tx_size, int ctx,
return tx_counts->p16x16[ctx];
case TX_32X32:
return tx_counts->p32x32[ctx];
#if CONFIG_TX64X64
case TX_64X64:
return tx_counts->p64x64[ctx];
#endif
default:
assert(0 && "Invalid max_tx_size.");
return NULL;


@ -47,7 +47,34 @@ const TX_TYPE intra_mode_to_tx_type_lookup[INTRA_MODES] = {
const uint16_t *left, int bd) { \
highbd_##type##_predictor(dst, stride, size, above, left, bd); \
}
#endif // CONFIG_VP9_HIGHBITDEPTH
#if CONFIG_TX64X64
#if CONFIG_VP9_HIGHBITDEPTH
#define intra_pred_allsizes(type) \
intra_pred_sized(type, 4) \
intra_pred_sized(type, 8) \
intra_pred_sized(type, 16) \
intra_pred_sized(type, 32) \
intra_pred_sized(type, 64) \
intra_pred_highbd_sized(type, 4) \
intra_pred_highbd_sized(type, 8) \
intra_pred_highbd_sized(type, 16) \
intra_pred_highbd_sized(type, 32) \
intra_pred_highbd_sized(type, 64)
#else
#define intra_pred_allsizes(type) \
intra_pred_sized(type, 4) \
intra_pred_sized(type, 8) \
intra_pred_sized(type, 16) \
intra_pred_sized(type, 32) \
intra_pred_sized(type, 64)
#endif // CONFIG_VP9_HIGHBITDEPTH
#else // CONFIG_TX64X64
#if CONFIG_VP9_HIGHBITDEPTH
#define intra_pred_allsizes(type) \
intra_pred_sized(type, 4) \
intra_pred_sized(type, 8) \
@ -57,9 +84,7 @@ const TX_TYPE intra_mode_to_tx_type_lookup[INTRA_MODES] = {
intra_pred_highbd_sized(type, 8) \
intra_pred_highbd_sized(type, 16) \
intra_pred_highbd_sized(type, 32)
#else
#define intra_pred_allsizes(type) \
intra_pred_sized(type, 4) \
intra_pred_sized(type, 8) \
@ -67,6 +92,8 @@ const TX_TYPE intra_mode_to_tx_type_lookup[INTRA_MODES] = {
intra_pred_sized(type, 32)
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // CONFIG_TX64X64
#if CONFIG_VP9_HIGHBITDEPTH
static INLINE void highbd_d207_predictor(uint16_t *dst, ptrdiff_t stride,
int bs, const uint16_t *above,
@ -575,16 +602,25 @@ static intra_pred_fn dc_pred[2][2][TX_SIZES];
typedef void (*intra_high_pred_fn)(uint16_t *dst, ptrdiff_t stride,
const uint16_t *above, const uint16_t *left,
int bd);
static intra_high_pred_fn pred_high[INTRA_MODES][4];
static intra_high_pred_fn dc_pred_high[2][2][4];
static intra_high_pred_fn pred_high[INTRA_MODES][TX_SIZES];
static intra_high_pred_fn dc_pred_high[2][2][TX_SIZES];
#endif // CONFIG_VP9_HIGHBITDEPTH
void vp9_init_intra_predictors() {
#if CONFIG_TX64X64
#define INIT_ALL_SIZES(p, type) \
p[TX_4X4] = vp9_##type##_predictor_4x4; \
p[TX_8X8] = vp9_##type##_predictor_8x8; \
p[TX_16X16] = vp9_##type##_predictor_16x16; \
p[TX_32X32] = vp9_##type##_predictor_32x32; \
p[TX_64X64] = vp9_##type##_predictor_64x64
#else
#define INIT_ALL_SIZES(p, type) \
p[TX_4X4] = vp9_##type##_predictor_4x4; \
p[TX_8X8] = vp9_##type##_predictor_8x8; \
p[TX_16X16] = vp9_##type##_predictor_16x16; \
p[TX_32X32] = vp9_##type##_predictor_32x32
#endif
INIT_ALL_SIZES(pred[V_PRED], v);
INIT_ALL_SIZES(pred[H_PRED], h);
@ -638,7 +674,11 @@ static void build_intra_predictors_high(const MACROBLOCKD *xd,
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
DECLARE_ALIGNED_ARRAY(16, uint16_t, left_col, 64);
#if CONFIG_TX64X64
DECLARE_ALIGNED_ARRAY(16, uint16_t, above_data, 256 + 16);
#else
DECLARE_ALIGNED_ARRAY(16, uint16_t, above_data, 128 + 16);
#endif
uint16_t *above_row = above_data + 16;
const uint16_t *const_above_row = above_row;
const int bs = 4 << tx_size;
@ -767,7 +807,11 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref,
int plane) {
int i;
DECLARE_ALIGNED_ARRAY(16, uint8_t, left_col, 64);
#if CONFIG_TX64X64
DECLARE_ALIGNED_ARRAY(16, uint8_t, above_data, 256 + 16);
#else
DECLARE_ALIGNED_ARRAY(16, uint8_t, above_data, 128 + 16);
#endif
uint8_t *above_row = above_data + 16;
const uint8_t *const_above_row = above_row;
const int bs = 4 << tx_size;
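Two details worth noting: the existing bs = 4 << tx_size already yields 64 for TX_64X64 (4 << 4), so only the scratch buffers change, and above_data doubles (128 + 16 to 256 + 16 entries) to keep the same headroom for the above row and its above-right extension on 64-pixel-wide blocks.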


@ -224,6 +224,47 @@ specialize qw/vp9_dc_left_predictor_32x32/;
add_proto qw/void vp9_dc_128_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_dc_128_predictor_32x32/;
if (vpx_config("CONFIG_TX64X64") eq "yes") {
add_proto qw/void vp9_d207_predictor_64x64/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_d207_predictor_64x64/;
add_proto qw/void vp9_d45_predictor_64x64/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_d45_predictor_64x64/;
add_proto qw/void vp9_d63_predictor_64x64/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_d63_predictor_64x64/;
add_proto qw/void vp9_h_predictor_64x64/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_h_predictor_64x64/;
add_proto qw/void vp9_d117_predictor_64x64/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_d117_predictor_64x64/;
add_proto qw/void vp9_d135_predictor_64x64/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_d135_predictor_64x64/;
add_proto qw/void vp9_d153_predictor_64x64/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_d153_predictor_64x64/;
add_proto qw/void vp9_v_predictor_64x64/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_v_predictor_64x64/;
add_proto qw/void vp9_tm_predictor_64x64/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_tm_predictor_64x64/;
add_proto qw/void vp9_dc_predictor_64x64/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_dc_predictor_64x64/;
add_proto qw/void vp9_dc_top_predictor_64x64/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_dc_top_predictor_64x64/;
add_proto qw/void vp9_dc_left_predictor_64x64/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_dc_left_predictor_64x64/;
add_proto qw/void vp9_dc_128_predictor_64x64/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_dc_128_predictor_64x64/;
}
#
# Loopfilter
#
@ -366,6 +407,11 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vp9_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct32x32_1_add/;
if (vpx_config("CONFIG_TX64X64") eq "yes") {
add_proto qw/void vp9_idct64x64_4096_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct64x64_4096_add/;
}
add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/vp9_iht4x4_16_add/;
@ -419,6 +465,11 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vp9_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct32x32_1_add/;
if (vpx_config("CONFIG_TX64X64") eq "yes") {
add_proto qw/void vp9_idct64x64_4096_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct64x64_4096_add/;
}
add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/vp9_iht4x4_16_add/;
@ -480,6 +531,11 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_idct32x32_1_add sse2 neon_asm dspr2/;
$vp9_idct32x32_1_add_neon_asm=vp9_idct32x32_1_add_neon;
if (vpx_config("CONFIG_TX64X64") eq "yes") {
add_proto qw/void vp9_idct64x64_4096_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct64x64_4096_add/;
}
add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/vp9_iht4x4_16_add sse2 neon_asm dspr2/;
$vp9_iht4x4_16_add_neon_asm=vp9_iht4x4_16_add_neon;
@ -662,6 +718,46 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vp9_highbd_dc_128_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_highbd_dc_128_predictor_32x32/;
if (vpx_config("CONFIG_TX64X64") eq "yes") {
add_proto qw/void vp9_highbd_d207_predictor_64x64/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_highbd_d207_predictor_64x64/;
add_proto qw/void vp9_highbd_d45_predictor_64x64/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_highbd_d45_predictor_64x64/;
add_proto qw/void vp9_highbd_d63_predictor_64x64/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_highbd_d63_predictor_64x64/;
add_proto qw/void vp9_highbd_h_predictor_64x64/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_highbd_h_predictor_64x64/;
add_proto qw/void vp9_highbd_d117_predictor_64x64/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_highbd_d117_predictor_64x64/;
add_proto qw/void vp9_highbd_d135_predictor_64x64/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_highbd_d135_predictor_64x64/;
add_proto qw/void vp9_highbd_d153_predictor_64x64/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_highbd_d153_predictor_64x64/;
add_proto qw/void vp9_highbd_v_predictor_64x64/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_highbd_v_predictor_64x64/;
add_proto qw/void vp9_highbd_tm_predictor_64x64/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_highbd_tm_predictor_64x64/;
add_proto qw/void vp9_highbd_dc_predictor_64x64/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_highbd_dc_predictor_64x64/;
add_proto qw/void vp9_highbd_dc_top_predictor_64x64/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_highbd_dc_top_predictor_64x64/;
add_proto qw/void vp9_highbd_dc_left_predictor_64x64/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_highbd_dc_left_predictor_64x64/;
add_proto qw/void vp9_highbd_dc_128_predictor_64x64/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_highbd_dc_128_predictor_64x64/;
}
#
# Sub Pixel Filters
#
@ -774,6 +870,11 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vp9_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct32x32_1024_add/;
if (vpx_config("CONFIG_TX64X64") eq "yes") {
add_proto qw/void vp9_highbd_idct64x64_4096_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct64x64_4096_add/;
}
add_proto qw/void vp9_highbd_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct32x32_34_add/;
@ -1149,6 +1250,14 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_quantize_b_32x32/;
if (vpx_config("CONFIG_TX64X64") eq "yes") {
add_proto qw/void vp9_quantize_fp_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_quantize_fp_64x64/;
add_proto qw/void vp9_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_quantize_b_64x64/;
}
} else {
add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
specialize qw/vp9_block_error avx2/, "$sse2_x86inc";
@ -1164,6 +1273,14 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_quantize_b_32x32/, "$ssse3_x86_64";
if (vpx_config("CONFIG_TX64X64") eq "yes") {
add_proto qw/void vp9_quantize_fp_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_quantize_fp_64x64/;
add_proto qw/void vp9_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_quantize_b_64x64/;
}
}
#
@ -1218,6 +1335,14 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vp9_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_fdct32x32_rd/;
if (vpx_config("CONFIG_TX64X64") eq "yes") {
add_proto qw/void vp9_fdct64x64_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_fdct64x64_1/;
add_proto qw/void vp9_fdct64x64/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_fdct64x64/;
}
} else {
add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/vp9_fht4x4 sse2/;
@ -1257,6 +1382,14 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vp9_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_fdct32x32_rd sse2 avx2/;
if (vpx_config("CONFIG_TX64X64") eq "yes") {
add_proto qw/void vp9_fdct64x64_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_fdct64x64_1/;
add_proto qw/void vp9_fdct64x64/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_fdct64x64/;
}
}
#
@ -1873,6 +2006,14 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vp9_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_highbd_quantize_b_32x32/;
if (vpx_config("CONFIG_TX64X64") eq "yes") {
add_proto qw/void vp9_highbd_quantize_fp_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_highbd_quantize_fp_64x64/;
add_proto qw/void vp9_highbd_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_highbd_quantize_b_64x64/;
}
#
# Structured Similarity (SSIM)
#
@ -1918,6 +2059,14 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vp9_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_highbd_fdct32x32_rd/;
if (vpx_config("CONFIG_TX64X64") eq "yes") {
add_proto qw/void vp9_highbd_fdct64x64_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_highbd_fdct64x64_1/;
add_proto qw/void vp9_highbd_fdct64x64/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_highbd_fdct64x64/;
}
add_proto qw/void vp9_highbd_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
specialize qw/vp9_highbd_temporal_filter_apply/;

File diff suppressed because it is too large

View File

@ -80,8 +80,15 @@ static int decode_unsigned_max(struct vp9_read_bit_buffer *rb, int max) {
static TX_MODE read_tx_mode(vp9_reader *r) {
TX_MODE tx_mode = vp9_read_literal(r, 2);
#if CONFIG_TX64X64
if (tx_mode == 2)
tx_mode += vp9_read_bit(r);  // ALLOW_16X16 or ALLOW_32X32
else if (tx_mode == 3)
tx_mode += 1 + vp9_read_bit(r);  // ALLOW_64X64 or TX_MODE_SELECT
#else
if (tx_mode == ALLOW_32X32)
tx_mode += vp9_read_bit(r);
#endif
return tx_mode;
}
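With CONFIG_TX64X64 the two base bits plus the conditional extension bit map onto the six-entry TX_MODE enum as follows (a sketch assuming the nextgen enum order ONLY_4X4, ALLOW_8X8, ALLOW_16X16, ALLOW_32X32, ALLOW_64X64, TX_MODE_SELECT):

  literal (2 bits) | extra bit | decoded tx_mode
  0                | -         | ONLY_4X4
  1                | -         | ALLOW_8X8
  2                | 0 / 1     | ALLOW_16X16 / ALLOW_32X32
  3                | 0 / 1     | ALLOW_64X64 / TX_MODE_SELECT

encode_txfm_probs later in this commit writes exactly the mirror of this mapping.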
@ -89,16 +96,22 @@ static void read_tx_mode_probs(struct tx_probs *tx_probs, vp9_reader *r) {
int i, j;
for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
for (j = 0; j < TX_SIZES - 3; ++j)
for (j = 0; j < 1; ++j)
vp9_diff_update_prob(r, &tx_probs->p8x8[i][j]);
for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
for (j = 0; j < TX_SIZES - 2; ++j)
for (j = 0; j < 2; ++j)
vp9_diff_update_prob(r, &tx_probs->p16x16[i][j]);
for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
for (j = 0; j < TX_SIZES - 1; ++j)
for (j = 0; j < 3; ++j)
vp9_diff_update_prob(r, &tx_probs->p32x32[i][j]);
#if CONFIG_TX64X64
for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
for (j = 0; j < 4; ++j)
vp9_diff_update_prob(r, &tx_probs->p64x64[i][j]);
#endif
}
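The loop bounds become literals because CONFIG_TX64X64 grows TX_SIZES to 5, which would silently change the old TX_SIZES - 3/2/1 expressions. The counts themselves follow from the unary tx-size tree: a context offering k candidate sizes needs k - 1 probabilities, hence 1, 2, 3 and 4 entries for the p8x8, p16x16, p32x32 and p64x64 sets.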
static void read_switchable_interp_probs(FRAME_CONTEXT *fc, vp9_reader *r) {
@ -220,6 +233,12 @@ static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block,
tx_type = DCT_DCT;
vp9_highbd_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd);
break;
#if CONFIG_TX64X64
case TX_64X64:
tx_type = DCT_DCT;
vp9_highbd_idct64x64_add(dqcoeff, dst, stride, eob, xd->bd);
break;
#endif
default:
assert(0 && "Invalid transform size");
}
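As with TX_32X32, only the 2-D DCT is defined for the new size, so tx_type is forced to DCT_DCT before the 64x64 inverse is applied; the same pattern repeats in the remaining branches of inverse_transform_block below.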
@ -247,6 +266,12 @@ static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block,
tx_type = DCT_DCT;
vp9_idct32x32_add(dqcoeff, dst, stride, eob);
break;
#if CONFIG_TX64X64
case TX_64X64:
tx_type = DCT_DCT;
vp9_idct64x64_add(dqcoeff, dst, stride, eob);
break;
#endif
default:
assert(0 && "Invalid transform size");
return;
@ -276,6 +301,12 @@ static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block,
tx_type = DCT_DCT;
vp9_idct32x32_add(dqcoeff, dst, stride, eob);
break;
#if CONFIG_TX64X64
case TX_64X64:
tx_type = DCT_DCT;
vp9_idct64x64_add(dqcoeff, dst, stride, eob);
break;
#endif
default:
assert(0 && "Invalid transform size");
return;
@ -321,7 +352,6 @@ static void predict_and_reconstruct_intra_block(int plane, int block,
b_width_log2_lookup[plane_bsize], tx_size, mode,
dst, pd->dst.stride, dst, pd->dst.stride,
x, y, plane);
if (!mi->mbmi.skip) {
const int eob = vp9_decode_block_tokens(cm, xd, plane, block,
plane_bsize, x, y, tx_size,
@ -701,14 +731,14 @@ static void setup_frame_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
setup_display_size(cm, rb);
if (vp9_realloc_frame_buffer(
get_frame_new_buffer(cm), cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
get_frame_new_buffer(cm), cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
#if CONFIG_VP9_HIGHBITDEPTH
cm->use_highbitdepth,
cm->use_highbitdepth,
#endif
VP9_DEC_BORDER_IN_PIXELS,
&cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer, cm->get_fb_cb,
cm->cb_priv)) {
VP9_DEC_BORDER_IN_PIXELS,
&cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer, cm->get_fb_cb,
cm->cb_priv)) {
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate frame buffer");
}
@ -779,14 +809,14 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm,
setup_display_size(cm, rb);
if (vp9_realloc_frame_buffer(
get_frame_new_buffer(cm), cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
get_frame_new_buffer(cm), cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
#if CONFIG_VP9_HIGHBITDEPTH
cm->use_highbitdepth,
cm->use_highbitdepth,
#endif
VP9_DEC_BORDER_IN_PIXELS,
&cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer, cm->get_fb_cb,
cm->cb_priv)) {
VP9_DEC_BORDER_IN_PIXELS,
&cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer, cm->get_fb_cb,
cm->cb_priv)) {
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate frame buffer");
}

View File

@ -65,8 +65,14 @@ static TX_SIZE read_selected_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd,
int tx_size = vp9_read(r, tx_probs[0]);
if (tx_size != TX_4X4 && max_tx_size >= TX_16X16) {
tx_size += vp9_read(r, tx_probs[1]);
if (tx_size != TX_8X8 && max_tx_size >= TX_32X32)
if (tx_size != TX_8X8 && max_tx_size >= TX_32X32) {
tx_size += vp9_read(r, tx_probs[2]);
#if CONFIG_TX64X64
if (tx_size != TX_16X16 && max_tx_size >= TX_64X64) {
tx_size += vp9_read(r, tx_probs[3]);
}
#endif
}
}
if (!cm->frame_parallel_decoding_mode)
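The new branch keeps the unary structure intact: each successful vp9_read promotes tx_size one step, so TX_64X64 costs four bins when max_tx_size permits it, while bitstreams produced without CONFIG_TX64X64 decode exactly as before.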

View File

@ -32,7 +32,7 @@
#define INCREMENT_COUNT(token) \
do { \
if (!cm->frame_parallel_decoding_mode) \
++coef_counts[band][ctx][token]; \
++coef_counts[band][ctx][token]; \
} while (0)
static INLINE int read_coeff(const vp9_prob *probs, int n, vp9_reader *r) {
@ -69,9 +69,9 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, PLANE_TYPE type,
counts->coef[tx_size][type][ref];
unsigned int (*eob_branch_count)[COEFF_CONTEXTS] =
counts->eob_branch[tx_size][type][ref];
uint8_t token_cache[32 * 32];
uint8_t token_cache[MAX_NUM_COEFS];
const uint8_t *band_translate = get_band_translate(tx_size);
const int dq_shift = (tx_size == TX_32X32);
const int dq_shift = (tx_size > TX_16X16) ? tx_size - TX_16X16 : 0;
int v, token;
int16_t dqv = dq[0];
const uint8_t *cat1_prob;
@ -214,6 +214,9 @@ int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd,
const int eob = decode_coefs(cm, xd, pd->plane_type,
BLOCK_OFFSET(pd->dqcoeff, block), tx_size,
pd->dequant, ctx, so->scan, so->neighbors, r);
#if CONFIG_TX64X64
if (plane > 0) assert(tx_size != TX_64X64);
#endif
vp9_set_contexts(xd, pd, plane_bsize, tx_size, eob > 0, x, y);
return eob;
}
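The generalized dq_shift keeps the decoder in step with the encoder-side coefficient scaling for both large transforms; substituting the enum values (a quick check, assuming the usual TX_SIZE ordering):

  dq_shift(TX_4X4 .. TX_16X16) == 0
  dq_shift(TX_32X32) == TX_32X32 - TX_16X16 == 1
  dq_shift(TX_64X64) == TX_64X64 - TX_16X16 == 2

That is, 32x32 coefficients are stored at half magnitude and 64x64 at a quarter, matching the / (2 << logsizeby32) divisions in the quantizers later in this commit.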

View File

@ -88,8 +88,13 @@ static void write_selected_tx_size(const VP9_COMMON *cm,
vp9_write(w, tx_size != TX_4X4, tx_probs[0]);
if (tx_size != TX_4X4 && max_tx_size >= TX_16X16) {
vp9_write(w, tx_size != TX_8X8, tx_probs[1]);
if (tx_size != TX_8X8 && max_tx_size >= TX_32X32)
if (tx_size != TX_8X8 && max_tx_size >= TX_32X32) {
vp9_write(w, tx_size != TX_16X16, tx_probs[2]);
#if CONFIG_TX64X64
if (tx_size != TX_16X16 && max_tx_size >= TX_64X64)
vp9_write(w, tx_size != TX_32X32, tx_probs[3]);
#endif
}
}
}
@ -684,7 +689,7 @@ static void update_coef_probs(VP9_COMP *cpi, vp9_writer* w) {
vp9_coeff_stats frame_branch_ct[TX_SIZES][PLANE_TYPES];
vp9_coeff_probs_model frame_coef_probs[TX_SIZES][PLANE_TYPES];
for (tx_size = TX_4X4; tx_size <= TX_32X32; ++tx_size)
for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
build_tree_distribution(cpi, tx_size, frame_branch_ct[tx_size],
frame_coef_probs[tx_size]);
@ -815,37 +820,60 @@ static void encode_segmentation(VP9_COMMON *cm, MACROBLOCKD *xd,
static void encode_txfm_probs(VP9_COMMON *cm, vp9_writer *w) {
// Mode
#if CONFIG_TX64X64
if (cm->tx_mode == ALLOW_16X16 || cm->tx_mode == ALLOW_32X32) {
vp9_write_literal(w, 2, 2);
vp9_write_bit(w, cm->tx_mode == ALLOW_32X32);
} else if (cm->tx_mode == ALLOW_64X64 || cm->tx_mode == TX_MODE_SELECT) {
vp9_write_literal(w, 3, 2);
vp9_write_bit(w, cm->tx_mode == TX_MODE_SELECT);
} else {
vp9_write_literal(w, cm->tx_mode, 2);
}
#else
vp9_write_literal(w, MIN(cm->tx_mode, ALLOW_32X32), 2);
if (cm->tx_mode >= ALLOW_32X32)
vp9_write_bit(w, cm->tx_mode == TX_MODE_SELECT);
#endif // CONFIG_TX64X64
// Probabilities
if (cm->tx_mode == TX_MODE_SELECT) {
int i, j;
unsigned int ct_8x8p[TX_SIZES - 3][2];
unsigned int ct_16x16p[TX_SIZES - 2][2];
unsigned int ct_32x32p[TX_SIZES - 1][2];
unsigned int ct_8x8p[1][2];
unsigned int ct_16x16p[2][2];
unsigned int ct_32x32p[3][2];
#if CONFIG_TX64X64
unsigned int ct_64x64p[4][2];
#endif
for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
tx_counts_to_branch_counts_8x8(cm->counts.tx.p8x8[i], ct_8x8p);
for (j = 0; j < TX_SIZES - 3; j++)
for (j = 0; j < 1; j++)
vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p8x8[i][j], ct_8x8p[j]);
}
for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
tx_counts_to_branch_counts_16x16(cm->counts.tx.p16x16[i], ct_16x16p);
for (j = 0; j < TX_SIZES - 2; j++)
for (j = 0; j < 2; j++)
vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p16x16[i][j],
ct_16x16p[j]);
}
for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
tx_counts_to_branch_counts_32x32(cm->counts.tx.p32x32[i], ct_32x32p);
for (j = 0; j < TX_SIZES - 1; j++)
for (j = 0; j < 3; j++)
vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p32x32[i][j],
ct_32x32p[j]);
}
#if CONFIG_TX64X64
for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
tx_counts_to_branch_counts_64x64(cm->counts.tx.p64x64[i], ct_64x64p);
for (j = 0; j < 4; j++)
vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p64x64[i][j],
ct_64x64p[j]);
}
#endif // CONFIG_TX64X64
}
}

View File

@ -1439,6 +1439,458 @@ void vp9_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {
}
}
#if CONFIG_TX64X64
// TODO(debargha): Using a floating-point implementation for now.
// Should reuse the 32x32 integer DCT we already have.
static void dct32_1d(double *input, double *output, int stride) {
static const double C1 = 0.998795456205; // cos(pi * 1 / 64)
static const double C2 = 0.995184726672; // cos(pi * 2 / 64)
static const double C3 = 0.989176509965; // cos(pi * 3 / 64)
static const double C4 = 0.980785280403; // cos(pi * 4 / 64)
static const double C5 = 0.970031253195; // cos(pi * 5 / 64)
static const double C6 = 0.956940335732; // cos(pi * 6 / 64)
static const double C7 = 0.941544065183; // cos(pi * 7 / 64)
static const double C8 = 0.923879532511; // cos(pi * 8 / 64)
static const double C9 = 0.903989293123; // cos(pi * 9 / 64)
static const double C10 = 0.881921264348; // cos(pi * 10 / 64)
static const double C11 = 0.857728610000; // cos(pi * 11 / 64)
static const double C12 = 0.831469612303; // cos(pi * 12 / 64)
static const double C13 = 0.803207531481; // cos(pi * 13 / 64)
static const double C14 = 0.773010453363; // cos(pi * 14 / 64)
static const double C15 = 0.740951125355; // cos(pi * 15 / 64)
static const double C16 = 0.707106781187; // cos(pi * 16 / 64)
static const double C17 = 0.671558954847; // cos(pi * 17 / 64)
static const double C18 = 0.634393284164; // cos(pi * 18 / 64)
static const double C19 = 0.595699304492; // cos(pi * 19 / 64)
static const double C20 = 0.555570233020; // cos(pi * 20 / 64)
static const double C21 = 0.514102744193; // cos(pi * 21 / 64)
static const double C22 = 0.471396736826; // cos(pi * 22 / 64)
static const double C23 = 0.427555093430; // cos(pi * 23 / 64)
static const double C24 = 0.382683432365; // cos(pi * 24 / 64)
static const double C25 = 0.336889853392; // cos(pi * 25 / 64)
static const double C26 = 0.290284677254; // cos(pi * 26 / 64)
static const double C27 = 0.242980179903; // cos(pi * 27 / 64)
static const double C28 = 0.195090322016; // cos(pi * 28 / 64)
static const double C29 = 0.146730474455; // cos(pi * 29 / 64)
static const double C30 = 0.098017140330; // cos(pi * 30 / 64)
static const double C31 = 0.049067674327; // cos(pi * 31 / 64)
double step[32];
// Stage 1
step[0] = input[stride*0] + input[stride*(32 - 1)];
step[1] = input[stride*1] + input[stride*(32 - 2)];
step[2] = input[stride*2] + input[stride*(32 - 3)];
step[3] = input[stride*3] + input[stride*(32 - 4)];
step[4] = input[stride*4] + input[stride*(32 - 5)];
step[5] = input[stride*5] + input[stride*(32 - 6)];
step[6] = input[stride*6] + input[stride*(32 - 7)];
step[7] = input[stride*7] + input[stride*(32 - 8)];
step[8] = input[stride*8] + input[stride*(32 - 9)];
step[9] = input[stride*9] + input[stride*(32 - 10)];
step[10] = input[stride*10] + input[stride*(32 - 11)];
step[11] = input[stride*11] + input[stride*(32 - 12)];
step[12] = input[stride*12] + input[stride*(32 - 13)];
step[13] = input[stride*13] + input[stride*(32 - 14)];
step[14] = input[stride*14] + input[stride*(32 - 15)];
step[15] = input[stride*15] + input[stride*(32 - 16)];
step[16] = -input[stride*16] + input[stride*(32 - 17)];
step[17] = -input[stride*17] + input[stride*(32 - 18)];
step[18] = -input[stride*18] + input[stride*(32 - 19)];
step[19] = -input[stride*19] + input[stride*(32 - 20)];
step[20] = -input[stride*20] + input[stride*(32 - 21)];
step[21] = -input[stride*21] + input[stride*(32 - 22)];
step[22] = -input[stride*22] + input[stride*(32 - 23)];
step[23] = -input[stride*23] + input[stride*(32 - 24)];
step[24] = -input[stride*24] + input[stride*(32 - 25)];
step[25] = -input[stride*25] + input[stride*(32 - 26)];
step[26] = -input[stride*26] + input[stride*(32 - 27)];
step[27] = -input[stride*27] + input[stride*(32 - 28)];
step[28] = -input[stride*28] + input[stride*(32 - 29)];
step[29] = -input[stride*29] + input[stride*(32 - 30)];
step[30] = -input[stride*30] + input[stride*(32 - 31)];
step[31] = -input[stride*31] + input[stride*(32 - 32)];
// Stage 2
output[stride*0] = step[0] + step[16 - 1];
output[stride*1] = step[1] + step[16 - 2];
output[stride*2] = step[2] + step[16 - 3];
output[stride*3] = step[3] + step[16 - 4];
output[stride*4] = step[4] + step[16 - 5];
output[stride*5] = step[5] + step[16 - 6];
output[stride*6] = step[6] + step[16 - 7];
output[stride*7] = step[7] + step[16 - 8];
output[stride*8] = -step[8] + step[16 - 9];
output[stride*9] = -step[9] + step[16 - 10];
output[stride*10] = -step[10] + step[16 - 11];
output[stride*11] = -step[11] + step[16 - 12];
output[stride*12] = -step[12] + step[16 - 13];
output[stride*13] = -step[13] + step[16 - 14];
output[stride*14] = -step[14] + step[16 - 15];
output[stride*15] = -step[15] + step[16 - 16];
output[stride*16] = step[16];
output[stride*17] = step[17];
output[stride*18] = step[18];
output[stride*19] = step[19];
output[stride*20] = (-step[20] + step[27])*C16;
output[stride*21] = (-step[21] + step[26])*C16;
output[stride*22] = (-step[22] + step[25])*C16;
output[stride*23] = (-step[23] + step[24])*C16;
output[stride*24] = (step[24] + step[23])*C16;
output[stride*25] = (step[25] + step[22])*C16;
output[stride*26] = (step[26] + step[21])*C16;
output[stride*27] = (step[27] + step[20])*C16;
output[stride*28] = step[28];
output[stride*29] = step[29];
output[stride*30] = step[30];
output[stride*31] = step[31];
// Stage 3
step[0] = output[stride*0] + output[stride*(8 - 1)];
step[1] = output[stride*1] + output[stride*(8 - 2)];
step[2] = output[stride*2] + output[stride*(8 - 3)];
step[3] = output[stride*3] + output[stride*(8 - 4)];
step[4] = -output[stride*4] + output[stride*(8 - 5)];
step[5] = -output[stride*5] + output[stride*(8 - 6)];
step[6] = -output[stride*6] + output[stride*(8 - 7)];
step[7] = -output[stride*7] + output[stride*(8 - 8)];
step[8] = output[stride*8];
step[9] = output[stride*9];
step[10] = (-output[stride*10] + output[stride*13])*C16;
step[11] = (-output[stride*11] + output[stride*12])*C16;
step[12] = (output[stride*12] + output[stride*11])*C16;
step[13] = (output[stride*13] + output[stride*10])*C16;
step[14] = output[stride*14];
step[15] = output[stride*15];
step[16] = output[stride*16] + output[stride*23];
step[17] = output[stride*17] + output[stride*22];
step[18] = output[stride*18] + output[stride*21];
step[19] = output[stride*19] + output[stride*20];
step[20] = -output[stride*20] + output[stride*19];
step[21] = -output[stride*21] + output[stride*18];
step[22] = -output[stride*22] + output[stride*17];
step[23] = -output[stride*23] + output[stride*16];
step[24] = -output[stride*24] + output[stride*31];
step[25] = -output[stride*25] + output[stride*30];
step[26] = -output[stride*26] + output[stride*29];
step[27] = -output[stride*27] + output[stride*28];
step[28] = output[stride*28] + output[stride*27];
step[29] = output[stride*29] + output[stride*26];
step[30] = output[stride*30] + output[stride*25];
step[31] = output[stride*31] + output[stride*24];
// Stage 4
output[stride*0] = step[0] + step[3];
output[stride*1] = step[1] + step[2];
output[stride*2] = -step[2] + step[1];
output[stride*3] = -step[3] + step[0];
output[stride*4] = step[4];
output[stride*5] = (-step[5] + step[6])*C16;
output[stride*6] = (step[6] + step[5])*C16;
output[stride*7] = step[7];
output[stride*8] = step[8] + step[11];
output[stride*9] = step[9] + step[10];
output[stride*10] = -step[10] + step[9];
output[stride*11] = -step[11] + step[8];
output[stride*12] = -step[12] + step[15];
output[stride*13] = -step[13] + step[14];
output[stride*14] = step[14] + step[13];
output[stride*15] = step[15] + step[12];
output[stride*16] = step[16];
output[stride*17] = step[17];
output[stride*18] = step[18]*-C8 + step[29]*C24;
output[stride*19] = step[19]*-C8 + step[28]*C24;
output[stride*20] = step[20]*-C24 + step[27]*-C8;
output[stride*21] = step[21]*-C24 + step[26]*-C8;
output[stride*22] = step[22];
output[stride*23] = step[23];
output[stride*24] = step[24];
output[stride*25] = step[25];
output[stride*26] = step[26]*C24 + step[21]*-C8;
output[stride*27] = step[27]*C24 + step[20]*-C8;
output[stride*28] = step[28]*C8 + step[19]*C24;
output[stride*29] = step[29]*C8 + step[18]*C24;
output[stride*30] = step[30];
output[stride*31] = step[31];
// Stage 5
step[0] = (output[stride*0] + output[stride*1]) * C16;
step[1] = (-output[stride*1] + output[stride*0]) * C16;
step[2] = output[stride*2]*C24 + output[stride*3] * C8;
step[3] = output[stride*3]*C24 - output[stride*2] * C8;
step[4] = output[stride*4] + output[stride*5];
step[5] = -output[stride*5] + output[stride*4];
step[6] = -output[stride*6] + output[stride*7];
step[7] = output[stride*7] + output[stride*6];
step[8] = output[stride*8];
step[9] = output[stride*9]*-C8 + output[stride*14]*C24;
step[10] = output[stride*10]*-C24 + output[stride*13]*-C8;
step[11] = output[stride*11];
step[12] = output[stride*12];
step[13] = output[stride*13]*C24 + output[stride*10]*-C8;
step[14] = output[stride*14]*C8 + output[stride*9]*C24;
step[15] = output[stride*15];
step[16] = output[stride*16] + output[stride*19];
step[17] = output[stride*17] + output[stride*18];
step[18] = -output[stride*18] + output[stride*17];
step[19] = -output[stride*19] + output[stride*16];
step[20] = -output[stride*20] + output[stride*23];
step[21] = -output[stride*21] + output[stride*22];
step[22] = output[stride*22] + output[stride*21];
step[23] = output[stride*23] + output[stride*20];
step[24] = output[stride*24] + output[stride*27];
step[25] = output[stride*25] + output[stride*26];
step[26] = -output[stride*26] + output[stride*25];
step[27] = -output[stride*27] + output[stride*24];
step[28] = -output[stride*28] + output[stride*31];
step[29] = -output[stride*29] + output[stride*30];
step[30] = output[stride*30] + output[stride*29];
step[31] = output[stride*31] + output[stride*28];
// Stage 6
output[stride*0] = step[0];
output[stride*1] = step[1];
output[stride*2] = step[2];
output[stride*3] = step[3];
output[stride*4] = step[4]*C28 + step[7]*C4;
output[stride*5] = step[5]*C12 + step[6]*C20;
output[stride*6] = step[6]*C12 + step[5]*-C20;
output[stride*7] = step[7]*C28 + step[4]*-C4;
output[stride*8] = step[8] + step[9];
output[stride*9] = -step[9] + step[8];
output[stride*10] = -step[10] + step[11];
output[stride*11] = step[11] + step[10];
output[stride*12] = step[12] + step[13];
output[stride*13] = -step[13] + step[12];
output[stride*14] = -step[14] + step[15];
output[stride*15] = step[15] + step[14];
output[stride*16] = step[16];
output[stride*17] = step[17]*-C4 + step[30]*C28;
output[stride*18] = step[18]*-C28 + step[29]*-C4;
output[stride*19] = step[19];
output[stride*20] = step[20];
output[stride*21] = step[21]*-C20 + step[26]*C12;
output[stride*22] = step[22]*-C12 + step[25]*-C20;
output[stride*23] = step[23];
output[stride*24] = step[24];
output[stride*25] = step[25]*C12 + step[22]*-C20;
output[stride*26] = step[26]*C20 + step[21]*C12;
output[stride*27] = step[27];
output[stride*28] = step[28];
output[stride*29] = step[29]*C28 + step[18]*-C4;
output[stride*30] = step[30]*C4 + step[17]*C28;
output[stride*31] = step[31];
// Stage 7
step[0] = output[stride*0];
step[1] = output[stride*1];
step[2] = output[stride*2];
step[3] = output[stride*3];
step[4] = output[stride*4];
step[5] = output[stride*5];
step[6] = output[stride*6];
step[7] = output[stride*7];
step[8] = output[stride*8]*C30 + output[stride*15]*C2;
step[9] = output[stride*9]*C14 + output[stride*14]*C18;
step[10] = output[stride*10]*C22 + output[stride*13]*C10;
step[11] = output[stride*11]*C6 + output[stride*12]*C26;
step[12] = output[stride*12]*C6 + output[stride*11]*-C26;
step[13] = output[stride*13]*C22 + output[stride*10]*-C10;
step[14] = output[stride*14]*C14 + output[stride*9]*-C18;
step[15] = output[stride*15]*C30 + output[stride*8]*-C2;
step[16] = output[stride*16] + output[stride*17];
step[17] = -output[stride*17] + output[stride*16];
step[18] = -output[stride*18] + output[stride*19];
step[19] = output[stride*19] + output[stride*18];
step[20] = output[stride*20] + output[stride*21];
step[21] = -output[stride*21] + output[stride*20];
step[22] = -output[stride*22] + output[stride*23];
step[23] = output[stride*23] + output[stride*22];
step[24] = output[stride*24] + output[stride*25];
step[25] = -output[stride*25] + output[stride*24];
step[26] = -output[stride*26] + output[stride*27];
step[27] = output[stride*27] + output[stride*26];
step[28] = output[stride*28] + output[stride*29];
step[29] = -output[stride*29] + output[stride*28];
step[30] = -output[stride*30] + output[stride*31];
step[31] = output[stride*31] + output[stride*30];
// Final stage --- outputs indices are bit-reversed.
output[stride*0] = step[0];
output[stride*16] = step[1];
output[stride*8] = step[2];
output[stride*24] = step[3];
output[stride*4] = step[4];
output[stride*20] = step[5];
output[stride*12] = step[6];
output[stride*28] = step[7];
output[stride*2] = step[8];
output[stride*18] = step[9];
output[stride*10] = step[10];
output[stride*26] = step[11];
output[stride*6] = step[12];
output[stride*22] = step[13];
output[stride*14] = step[14];
output[stride*30] = step[15];
output[stride*1] = step[16]*C31 + step[31]*C1;
output[stride*17] = step[17]*C15 + step[30]*C17;
output[stride*9] = step[18]*C23 + step[29]*C9;
output[stride*25] = step[19]*C7 + step[28]*C25;
output[stride*5] = step[20]*C27 + step[27]*C5;
output[stride*21] = step[21]*C11 + step[26]*C21;
output[stride*13] = step[22]*C19 + step[25]*C13;
output[stride*29] = step[23]*C3 + step[24]*C29;
output[stride*3] = step[24]*C3 + step[23]*-C29;
output[stride*19] = step[25]*C19 + step[22]*-C13;
output[stride*11] = step[26]*C11 + step[21]*-C21;
output[stride*27] = step[27]*C27 + step[20]*-C5;
output[stride*7] = step[28]*C7 + step[19]*-C25;
output[stride*23] = step[29]*C23 + step[18]*-C9;
output[stride*15] = step[30]*C15 + step[17]*-C17;
output[stride*31] = step[31]*C31 + step[16]*-C1;
}
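dct32_1d above is a direct double-precision port of the 32-point DCT-II butterfly, with the final stage mapping the internally bit-reversed results back to natural frequency order; it exists only as a building block for dct64_1d below (see the sketch after that function).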
static void dct64_1d(double *input, double *output, int stride) {
double step1[64], step2[64];
int i;
static const double C[64] = {
1.00000000000000000000, // cos(0 * pi / 128)
0.99969881869620424997, // cos(1 * pi / 128)
0.99879545620517240501, // cos(2 * pi / 128)
0.99729045667869020697, // cos(3 * pi / 128)
0.99518472667219692873, // cos(4 * pi / 128)
0.99247953459870996706, // cos(5 * pi / 128)
0.98917650996478101444, // cos(6 * pi / 128)
0.98527764238894122162, // cos(7 * pi / 128)
0.98078528040323043058, // cos(8 * pi / 128)
0.97570213003852857003, // cos(9 * pi / 128)
0.97003125319454397424, // cos(10 * pi / 128)
0.96377606579543984022, // cos(11 * pi / 128)
0.95694033573220882438, // cos(12 * pi / 128)
0.94952818059303667475, // cos(13 * pi / 128)
0.94154406518302080631, // cos(14 * pi / 128)
0.93299279883473895669, // cos(15 * pi / 128)
0.92387953251128673848, // cos(16 * pi / 128)
0.91420975570353069095, // cos(17 * pi / 128)
0.90398929312344333820, // cos(18 * pi / 128)
0.89322430119551532446, // cos(19 * pi / 128)
0.88192126434835504956, // cos(20 * pi / 128)
0.87008699110871146054, // cos(21 * pi / 128)
0.85772861000027211809, // cos(22 * pi / 128)
0.84485356524970711689, // cos(23 * pi / 128)
0.83146961230254523567, // cos(24 * pi / 128)
0.81758481315158371139, // cos(25 * pi / 128)
0.80320753148064494287, // cos(26 * pi / 128)
0.78834642762660633863, // cos(27 * pi / 128)
0.77301045336273699338, // cos(28 * pi / 128)
0.75720884650648456748, // cos(29 * pi / 128)
0.74095112535495921691, // cos(30 * pi / 128)
0.72424708295146700276, // cos(31 * pi / 128)
0.70710678118654757274, // cos(32 * pi / 128)
0.68954054473706694051, // cos(33 * pi / 128)
0.67155895484701844111, // cos(34 * pi / 128)
0.65317284295377686654, // cos(35 * pi / 128)
0.63439328416364559882, // cos(36 * pi / 128)
0.61523159058062693028, // cos(37 * pi / 128)
0.59569930449243346793, // cos(38 * pi / 128)
0.57580819141784544968, // cos(39 * pi / 128)
0.55557023301960228867, // cos(40 * pi / 128)
0.53499761988709737537, // cos(41 * pi / 128)
0.51410274419322177231, // cos(42 * pi / 128)
0.49289819222978414892, // cos(43 * pi / 128)
0.47139673682599780857, // cos(44 * pi / 128)
0.44961132965460659516, // cos(45 * pi / 128)
0.42755509343028219593, // cos(46 * pi / 128)
0.40524131400498980549, // cos(47 * pi / 128)
0.38268343236508983729, // cos(48 * pi / 128)
0.35989503653498827740, // cos(49 * pi / 128)
0.33688985339222005111, // cos(50 * pi / 128)
0.31368174039889151761, // cos(51 * pi / 128)
0.29028467725446227554, // cos(52 * pi / 128)
0.26671275747489842090, // cos(53 * pi / 128)
0.24298017990326398197, // cos(54 * pi / 128)
0.21910124015686976984, // cos(55 * pi / 128)
0.19509032201612830359, // cos(56 * pi / 128)
0.17096188876030135595, // cos(57 * pi / 128)
0.14673047445536174793, // cos(58 * pi / 128)
0.12241067519921627893, // cos(59 * pi / 128)
0.09801714032956077016, // cos(60 * pi / 128)
0.07356456359966745406, // cos(61 * pi / 128)
0.04906767432741813290, // cos(62 * pi / 128)
0.02454122852291226731, // cos(63 * pi / 128)
};
for (i = 0; i < 32; ++i) {
step1[i] = input[stride * i] + input[stride * (63 - i)];
step1[32 + i] = (input[stride * i] -
input[stride * (63 - i)]) * C[i * 2 + 1];
}
dct32_1d(step1, step2, 1);
dct32_1d(step1 + 32, step2 + 32, 1);
for (i = 0; i < 64; i += 2) {
output[stride*i] = step2[i / 2];
}
output[stride * 1] = 2 * step2[32] * C[32];
for (i = 3; i < 64; i += 2) {
output[stride * i] = 2 * step2[32 + i / 2] - output[stride * (i - 2)];
}
}
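dct64_1d builds the 64-point transform out of two 32-point ones via the classic even/odd DCT-II split. Ignoring the per-stage cosine normalizations baked into dct32_1d, the code computes, with $C_k = \cos(k\pi/128)$ as in the table above (a sketch of the identity, not a normative derivation):

  $u_n = x_n + x_{63-n}, \qquad v_n = (x_n - x_{63-n})\,C_{2n+1}, \qquad n = 0,\dots,31$
  $X_{2k} = \mathrm{DCT32}(u)_k$
  $X_{2k+1} = 2\,\mathrm{DCT32}(v)_k - X_{2k-1}, \qquad X_1 \propto \mathrm{DCT32}(v)_0$

so the final loop recovers every odd coefficient from a two-term recurrence instead of a full 64-point butterfly.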
void vp9_fdct64x64_c(const int16_t *input, tran_low_t *out, int stride) {
// vp9_clear_system_state();  // Make it SIMD-safe: __asm emms;
{
int i, j;
double output[4096];
// First transform columns
for (i = 0; i < 64; i++) {
double temp_in[64], temp_out[64];
for (j = 0; j < 64; j++)
temp_in[j] = input[j * stride + i];
dct64_1d(temp_in, temp_out, 1);
for (j = 0; j < 64; j++)
output[j * 64 + i] = temp_out[j];
}
// Then transform rows
for (i = 0; i < 64; ++i) {
double temp_in[64], temp_out[64];
for (j = 0; j < 64; ++j)
temp_in[j] = output[j + i * 64];
dct64_1d(temp_in, temp_out, 1);
for (j = 0; j < 64; ++j)
output[j + i * 64] = temp_out[j];
}
// Scale down by 16 so the 64x64 output lands in the range expected downstream.
for (i = 0; i < 4096; i++) {
out[i] = (tran_low_t)round(output[i] / 16);
}
}
// vp9_clear_system_state();  // Make it SIMD-safe: __asm emms;
}
void vp9_fdct64x64_1_c(const int16_t *input, tran_low_t *output, int stride) {
int r, c;
tran_low_t sum = 0;
for (r = 0; r < 64; ++r)
for (c = 0; c < 64; ++c)
sum += input[r * stride + c];
output[0] = sum >> 5;
output[1] = 0;
}
#endif
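As with vp9_fdct32x32_1_c, the _1 variant computes only a scaled DC term (the residual sum shifted down), letting vp9_xform_quant_dc in vp9_encodemb.c below estimate DC-dominant blocks without running the full 64x64 transform.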
#if CONFIG_VP9_HIGHBITDEPTH
void vp9_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output,
int stride) {
@ -1498,4 +1950,15 @@ void vp9_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *out,
int stride) {
vp9_fdct32x32_rd_c(input, out, stride);
}
#if CONFIG_TX64X64
void vp9_highbd_fdct64x64_1_c(const int16_t *input, tran_low_t *out,
int stride) {
vp9_fdct64x64_1_c(input, out, stride);
}
void vp9_highbd_fdct64x64_c(const int16_t *input, tran_low_t *out, int stride) {
vp9_fdct64x64_c(input, out, stride);
}
#endif // CONFIG_TX64X64
#endif // CONFIG_VP9_HIGHBITDEPTH
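Because the 64x64 forward transform above works in double precision rather than 16-bit fixed point, the high-bitdepth wrappers can forward directly to the same C routine; no widened arithmetic is needed yet.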

View File

@ -673,11 +673,23 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
// FIXME(rbultje) I'm pretty sure this should go to the end of this block
// (i.e. after the output_enabled)
#if CONFIG_TX64X64
if (bsize < BLOCK_64X64) {
if (bsize < BLOCK_32X32) {
if (bsize < BLOCK_16X16) {
ctx->tx_rd_diff[ALLOW_16X16] = ctx->tx_rd_diff[ALLOW_8X8];
}
ctx->tx_rd_diff[ALLOW_32X32] = ctx->tx_rd_diff[ALLOW_16X16];
}
ctx->tx_rd_diff[ALLOW_64X64] = ctx->tx_rd_diff[ALLOW_32X32];
}
#else
if (bsize < BLOCK_32X32) {
if (bsize < BLOCK_16X16)
ctx->tx_rd_diff[ALLOW_16X16] = ctx->tx_rd_diff[ALLOW_8X8];
ctx->tx_rd_diff[ALLOW_32X32] = ctx->tx_rd_diff[ALLOW_16X16];
}
#endif
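The extra level of copying mirrors the existing pattern one size up: a block smaller than 64x64 can never select TX_64X64, so under ALLOW_64X64 it behaves exactly as under ALLOW_32X32 and the RD delta is inherited rather than recomputed.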
if (is_inter_block(mbmi) && mbmi->sb_type < BLOCK_8X8) {
mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int;
@ -2567,7 +2579,11 @@ static TX_MODE select_tx_mode(const VP9_COMP *cpi) {
if (cpi->mb.e_mbd.lossless)
return ONLY_4X4;
if (cpi->sf.tx_size_search_method == USE_LARGESTALL)
#if CONFIG_TX64X64
return ALLOW_64X64;
#else
return ALLOW_32X32;
#endif
else if (cpi->sf.tx_size_search_method == USE_FULL_RD ||
cpi->sf.tx_size_search_method == USE_TX_8X8)
return TX_MODE_SELECT;
@ -3435,9 +3451,9 @@ static void encode_frame_internal(VP9_COMP *cpi) {
#if CONFIG_VP9_HIGHBITDEPTH
if (cm->use_highbitdepth)
x->fwd_txm4x4 = xd->lossless ? vp9_fwht4x4 : vp9_fdct4x4;
else
x->fwd_txm4x4 = xd->lossless ? vp9_highbd_fwht4x4 : vp9_highbd_fdct4x4;
else
x->fwd_txm4x4 = xd->lossless ? vp9_fwht4x4 : vp9_fdct4x4;
x->highbd_itxm_add = xd->lossless ? vp9_highbd_iwht4x4_add :
vp9_highbd_idct4x4_add;
#else
@ -3612,41 +3628,99 @@ void vp9_encode_frame(VP9_COMP *cpi) {
}
}
#if CONFIG_TX64X64
if (cm->tx_mode == TX_MODE_SELECT) {
int count4x4 = 0;
int count8x8_lp = 0, count8x8_8x8p = 0;
int count4x4_lp = 0;
int count8x8_8x8p = 0, count8x8_lp = 0;
int count16x16_16x16p = 0, count16x16_lp = 0;
int count32x32 = 0;
int count32x32_32x32p = 0, count32x32_lp = 0;
int count64x64_64x64p = 0;
for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
count4x4 += cm->counts.tx.p32x32[i][TX_4X4];
count4x4 += cm->counts.tx.p16x16[i][TX_4X4];
count4x4 += cm->counts.tx.p8x8[i][TX_4X4];
count4x4_lp += cm->counts.tx.p64x64[i][TX_4X4];
count4x4_lp += cm->counts.tx.p32x32[i][TX_4X4];
count4x4_lp += cm->counts.tx.p16x16[i][TX_4X4];
count4x4_lp += cm->counts.tx.p8x8[i][TX_4X4];
count8x8_lp += cm->counts.tx.p64x64[i][TX_8X8];
count8x8_lp += cm->counts.tx.p32x32[i][TX_8X8];
count8x8_lp += cm->counts.tx.p16x16[i][TX_8X8];
count8x8_8x8p += cm->counts.tx.p8x8[i][TX_8X8];
count16x16_lp += cm->counts.tx.p64x64[i][TX_16X16];
count16x16_lp += cm->counts.tx.p32x32[i][TX_16X16];
count16x16_16x16p += cm->counts.tx.p16x16[i][TX_16X16];
count32x32_lp += cm->counts.tx.p64x64[i][TX_32X32];
count32x32_32x32p += cm->counts.tx.p32x32[i][TX_32X32];
count64x64_64x64p += cm->counts.tx.p64x64[i][TX_64X64];
}
if (count4x4_lp == 0 && count16x16_lp == 0 && count16x16_16x16p == 0 &&
count32x32_lp == 0 && count32x32_32x32p == 0 &&
count64x64_64x64p == 0) {
cm->tx_mode = ALLOW_8X8;
reset_skip_tx_size(cm, TX_8X8);
} else if (count8x8_8x8p == 0 && count8x8_lp == 0 &&
count16x16_16x16p == 0 && count16x16_lp == 0 &&
count32x32_32x32p == 0 && count32x32_lp == 0 &&
count64x64_64x64p == 0) {
cm->tx_mode = ONLY_4X4;
reset_skip_tx_size(cm, TX_4X4);
} else if (count4x4_lp == 0 && count8x8_lp == 0 && count16x16_lp == 0 &&
count32x32_lp == 0) {
cm->tx_mode = ALLOW_64X64;
} else if (count4x4_lp == 0 && count8x8_lp == 0 && count16x16_lp == 0 &&
count64x64_64x64p == 0) {
cm->tx_mode = ALLOW_32X32;
reset_skip_tx_size(cm, TX_32X32);
} else if (count4x4_lp == 0 && count8x8_lp == 0 &&
count32x32_lp == 0 && count32x32_32x32p == 0 &&
count64x64_64x64p == 0) {
cm->tx_mode = ALLOW_16X16;
reset_skip_tx_size(cm, TX_16X16);
}
}
#else
if (cm->tx_mode == TX_MODE_SELECT) {
int count4x4_lp = 0;
int count8x8_8x8p = 0, count8x8_lp = 0;
int count16x16_16x16p = 0, count16x16_lp = 0;
int count32x32_32x32p = 0;
for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
count4x4_lp += cm->counts.tx.p32x32[i][TX_4X4];
count4x4_lp += cm->counts.tx.p16x16[i][TX_4X4];
count4x4_lp += cm->counts.tx.p8x8[i][TX_4X4];
count8x8_lp += cm->counts.tx.p32x32[i][TX_8X8];
count8x8_lp += cm->counts.tx.p16x16[i][TX_8X8];
count8x8_8x8p += cm->counts.tx.p8x8[i][TX_8X8];
count16x16_16x16p += cm->counts.tx.p16x16[i][TX_16X16];
count16x16_lp += cm->counts.tx.p32x32[i][TX_16X16];
count32x32 += cm->counts.tx.p32x32[i][TX_32X32];
count16x16_16x16p += cm->counts.tx.p16x16[i][TX_16X16];
count32x32_32x32p += cm->counts.tx.p32x32[i][TX_32X32];
}
if (count4x4 == 0 && count16x16_lp == 0 && count16x16_16x16p == 0 &&
count32x32 == 0) {
if (count4x4_lp == 0 && count16x16_lp == 0 && count16x16_16x16p == 0 &&
count32x32_32x32p == 0) {
cm->tx_mode = ALLOW_8X8;
reset_skip_tx_size(cm, TX_8X8);
} else if (count8x8_8x8p == 0 && count16x16_16x16p == 0 &&
count8x8_lp == 0 && count16x16_lp == 0 && count32x32 == 0) {
count8x8_lp == 0 && count16x16_lp == 0 &&
count32x32_32x32p == 0) {
cm->tx_mode = ONLY_4X4;
reset_skip_tx_size(cm, TX_4X4);
} else if (count8x8_lp == 0 && count16x16_lp == 0 && count4x4 == 0) {
} else if (count8x8_lp == 0 && count16x16_lp == 0 && count4x4_lp == 0) {
cm->tx_mode = ALLOW_32X32;
} else if (count32x32 == 0 && count8x8_lp == 0 && count4x4 == 0) {
} else if (count32x32_32x32p == 0 && count8x8_lp == 0 &&
count4x4_lp == 0) {
cm->tx_mode = ALLOW_16X16;
reset_skip_tx_size(cm, TX_16X16);
}
}
#endif
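The widened bookkeeping lets the encoder retire TX_MODE_SELECT when the per-size counts show it carries no information. For example (a hypothetical count profile): if every coded block used the largest transform its size permits, so all *_lp counters are zero, and at least one block reached 64x64, the header is rewritten to ALLOW_64X64 and the per-block tx_size bits disappear; if count64x64_64x64p is also zero, ALLOW_32X32 is chosen instead and reset_skip_tx_size caps any stale skip-block sizes at TX_32X32.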
} else {
cm->reference_mode = SINGLE_REFERENCE;
encode_frame_internal(cpi);

View File

@ -135,16 +135,16 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block,
struct macroblock_plane *const p = &mb->plane[plane];
struct macroblockd_plane *const pd = &xd->plane[plane];
const int ref = is_inter_block(&xd->mi[0].src_mi->mbmi);
vp9_token_state tokens[1025][2];
unsigned best_index[1025][2];
uint8_t token_cache[1024];
vp9_token_state tokens[MAX_NUM_COEFS + 1][2];
unsigned best_index[MAX_NUM_COEFS + 1][2];
uint8_t token_cache[MAX_NUM_COEFS];
const tran_low_t *const coeff = BLOCK_OFFSET(mb->plane[plane].coeff, block);
tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
const int eob = p->eobs[block];
const PLANE_TYPE type = pd->plane_type;
const int default_eob = 16 << (tx_size << 1);
const int mul = 1 + (tx_size == TX_32X32);
const int mul = 1 << (tx_size >= TX_32X32 ? tx_size - TX_16X16 : 0);
const int16_t *dequant_ptr = pd->dequant;
const uint8_t *const band_translate = get_band_translate(tx_size);
const scan_order *const so = get_scan(xd, tx_size, type, block);
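Generalizing mul keeps the trellis on the same coefficient scale as the quantizer: the expression evaluates to 1 for sizes up to TX_16X16, 2 for TX_32X32 and 4 for TX_64X64, mirroring the dqcoeff = qcoeff * dequant / (2 << logsizeby32) scaling used by the big-transform quantizers later in this commit.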
@ -392,6 +392,16 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
switch (tx_size) {
#if CONFIG_TX64X64
case TX_64X64:
vp9_highbd_fdct64x64(src_diff, coeff, diff_stride);
vp9_highbd_quantize_fp_64x64(coeff, 4096, x->skip_block, p->zbin,
p->round_fp, p->quant_fp, p->quant_shift,
qcoeff, dqcoeff, pd->dequant,
p->zbin_extra, eob, scan_order->scan,
scan_order->iscan);
break;
#endif // CONFIG_TX64X64
case TX_32X32:
highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
vp9_highbd_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin,
@ -429,6 +439,15 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
#endif // CONFIG_VP9_HIGHBITDEPTH
switch (tx_size) {
#if CONFIG_TX64X64
case TX_64X64:
vp9_fdct64x64(src_diff, coeff, diff_stride);
vp9_quantize_fp_64x64(coeff, 4096, x->skip_block, p->zbin, p->round_fp,
p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob, scan_order->scan,
scan_order->iscan);
break;
#endif // CONFIG_TX64X64
case TX_32X32:
fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
vp9_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin, p->round_fp,
@ -482,6 +501,14 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block,
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
switch (tx_size) {
#if CONFIG_TX64X64
case TX_64X64:
vp9_highbd_fdct64x64_1(src_diff, coeff, diff_stride);
vp9_highbd_quantize_dc_64x64(coeff, x->skip_block, p->round,
p->quant_fp[0], qcoeff, dqcoeff,
pd->dequant[0], eob);
break;
#endif // CONFIG_TX64X64
case TX_32X32:
vp9_highbd_fdct32x32_1(src_diff, coeff, diff_stride);
vp9_highbd_quantize_dc_32x32(coeff, x->skip_block, p->round,
@ -514,6 +541,14 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block,
#endif // CONFIG_VP9_HIGHBITDEPTH
switch (tx_size) {
#if CONFIG_TX64X64
case TX_64X64:
vp9_fdct64x64_1(src_diff, coeff, diff_stride);
vp9_quantize_dc_64x64(coeff, x->skip_block, p->round,
p->quant_fp[0], qcoeff, dqcoeff,
pd->dequant[0], eob);
break;
#endif // CONFIG_TX64X64
case TX_32X32:
vp9_fdct32x32_1(src_diff, coeff, diff_stride);
vp9_quantize_dc_32x32(coeff, x->skip_block, p->round,
@ -563,6 +598,15 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
switch (tx_size) {
#if CONFIG_TX64X64
case TX_64X64:
vp9_highbd_fdct64x64(src_diff, coeff, diff_stride);
vp9_highbd_quantize_b_64x64(coeff, 4096, x->skip_block, p->zbin,
p->round, p->quant, p->quant_shift, qcoeff,
dqcoeff, pd->dequant, p->zbin_extra, eob,
scan_order->scan, scan_order->iscan);
break;
#endif // CONFIG_TX64X64
case TX_32X32:
highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
vp9_highbd_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin,
@ -599,6 +643,15 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
#endif // CONFIG_VP9_HIGHBITDEPTH
switch (tx_size) {
#if CONFIG_TX64X64
case TX_64X64:
vp9_fdct64x64(src_diff, coeff, diff_stride);
vp9_quantize_b_64x64(coeff, 4096, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob, scan_order->scan,
scan_order->iscan);
break;
#endif // CONFIG_TX64X64
case TX_32X32:
fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
@ -649,6 +702,7 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
dst = &pd->dst.buf[4 * j * pd->dst.stride + 4 * i];
a = &ctx->ta[plane][i];
l = &ctx->tl[plane][j];
#if CONFIG_TX64X64
if (plane) assert(tx_size != TX_64X64);
#endif  // CONFIG_TX64X64
// TODO(jingning): per transformed block zero forcing only enabled for
// luma component. will integrate chroma components as well.
@ -695,6 +749,12 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
switch (tx_size) {
#if CONFIG_TX64X64
case TX_64X64:
vp9_highbd_idct64x64_add(dqcoeff, dst, pd->dst.stride,
p->eobs[block], xd->bd);
break;
#endif // CONFIG_TX64X64
case TX_32X32:
vp9_highbd_idct32x32_add(dqcoeff, dst, pd->dst.stride,
p->eobs[block], xd->bd);
@ -722,6 +782,11 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
#endif // CONFIG_VP9_HIGHBITDEPTH
switch (tx_size) {
#if CONFIG_TX64X64
case TX_64X64:
vp9_idct64x64_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
break;
#endif
case TX_32X32:
vp9_idct32x32_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
break;
@ -832,6 +897,29 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
switch (tx_size) {
#if CONFIG_TX64X64
case TX_64X64:
scan_order = &vp9_default_scan_orders[TX_64X64];
mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
vp9_predict_intra_block(xd, block >> 8, bwl, TX_64X64, mode,
x->skip_encode ? src : dst,
x->skip_encode ? src_stride : dst_stride,
dst, dst_stride, i, j, plane);
if (!x->skip_recode) {
vp9_highbd_subtract_block(64, 64, src_diff, diff_stride,
src, src_stride, dst, dst_stride, xd->bd);
vp9_highbd_fdct64x64(src_diff, coeff, diff_stride);
vp9_highbd_quantize_b_64x64(coeff, 4096, x->skip_block, p->zbin,
p->round, p->quant, p->quant_shift,
qcoeff, dqcoeff, pd->dequant,
p->zbin_extra, eob,
scan_order->scan, scan_order->iscan);
if (!x->skip_encode && *eob) {
vp9_highbd_idct64x64_add(dqcoeff, dst, dst_stride, *eob, xd->bd);
}
}
break;
#endif // CONFIG_TX64X64
case TX_32X32:
scan_order = &vp9_default_scan_orders[TX_32X32];
mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
@ -941,6 +1029,28 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
#endif // CONFIG_VP9_HIGHBITDEPTH
switch (tx_size) {
#if CONFIG_TX64X64
case TX_64X64:
assert(plane == 0);
scan_order = &vp9_default_scan_orders[TX_64X64];
mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
vp9_predict_intra_block(xd, block >> 8, bwl, TX_64X64, mode,
x->skip_encode ? src : dst,
x->skip_encode ? src_stride : dst_stride,
dst, dst_stride, i, j, plane);
if (!x->skip_recode) {
vp9_subtract_block(64, 64, src_diff, diff_stride,
src, src_stride, dst, dst_stride);
vp9_fdct64x64(src_diff, coeff, diff_stride);
vp9_quantize_b_64x64(coeff, 4096, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob, scan_order->scan,
scan_order->iscan);
}
if (!x->skip_encode && *eob)
vp9_idct64x64_add(dqcoeff, dst, dst_stride, *eob);
break;
#endif // CONFIG_TX64X64
case TX_32X32:
scan_order = &vp9_default_scan_orders[TX_32X32];
mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;

View File

@ -3120,7 +3120,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
release_scaled_references(cpi);
vp9_update_reference_frames(cpi);
for (t = TX_4X4; t <= TX_32X32; t++)
for (t = TX_4X4; t < TX_SIZES; t++)
full_to_model_counts(cm->counts.coef[t], cpi->coef_counts[t]);
if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode)

View File

@ -65,10 +65,15 @@ void vp9_highbd_quantize_dc(const tran_low_t *coeff_ptr, int skip_block,
}
#endif
void vp9_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
const int16_t *round_ptr, const int16_t quant,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr, uint16_t *eob_ptr) {
static INLINE void quantize_dc_bigtx(const tran_low_t *coeff_ptr,
int skip_block,
const int16_t *round_ptr,
const int16_t quant,
tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr,
uint16_t *eob_ptr,
int logsizeby32) {
const int rc = 0;
const int coeff = coeff_ptr[rc];
const int coeff_sign = (coeff >> 31);
@ -78,24 +83,43 @@ void vp9_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
if (!skip_block) {
tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
tmp = (tmp * quant) >> 15;
tmp = (tmp * quant) >> (15 - logsizeby32);
qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2;
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / (2 << logsizeby32);
if (tmp)
eob = 0;
}
*eob_ptr = eob + 1;
}
void vp9_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
const int16_t *round_ptr, const int16_t quant,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr, uint16_t *eob_ptr) {
quantize_dc_bigtx(coeff_ptr, skip_block, round_ptr, quant,
qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, 0);
}
#if CONFIG_TX64X64
void vp9_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block,
const int16_t *round_ptr, const int16_t quant,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr, uint16_t *eob_ptr) {
quantize_dc_bigtx(coeff_ptr, skip_block, round_ptr, quant,
qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, 1);
}
#endif // CONFIG_TX64X64
#if CONFIG_VP9_HIGHBITDEPTH
void vp9_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr,
int skip_block,
const int16_t *round_ptr,
const int16_t quant,
tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr,
uint16_t *eob_ptr) {
static INLINE void highbd_quantize_dc_bigtx(const tran_low_t *coeff_ptr,
int skip_block,
const int16_t *round_ptr,
const int16_t quant,
tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr,
uint16_t *eob_ptr,
int logsizeby32) {
int eob = -1;
if (!skip_block) {
@ -106,15 +130,41 @@ void vp9_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr,
const int64_t tmp =
(clamp(abs_coeff + round_ptr[rc != 0], INT32_MIN, INT32_MAX) *
quant) >> 15;
quant) >> (15 - logsizeby32);
qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2;
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / (2 << logsizeby32);
if (tmp)
eob = 0;
}
*eob_ptr = eob + 1;
}
#endif
void vp9_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr,
int skip_block,
const int16_t *round_ptr,
const int16_t quant,
tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr,
uint16_t *eob_ptr) {
highbd_quantize_dc_bigtx(coeff_ptr, skip_block, round_ptr, quant,
qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, 0);
}
#if CONFIG_TX64X64
void vp9_highbd_quantize_dc_64x64(const tran_low_t *coeff_ptr,
int skip_block,
const int16_t *round_ptr,
const int16_t quant,
tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr,
uint16_t *eob_ptr) {
highbd_quantize_dc_bigtx(coeff_ptr, skip_block, round_ptr, quant,
qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, 1);
}
#endif // CONFIG_TX64X64
#endif // CONFIG_VP9_HIGHBITDEPTH
void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
int skip_block,
@ -210,15 +260,21 @@ void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr,
// TODO(jingning) Refactor this file and combine functions with similar
// operations.
void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
int skip_block,
const int16_t *zbin_ptr, const int16_t *round_ptr,
const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
int zbin_oq_value, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
static INLINE void quantize_fp_bigtx(const tran_low_t *coeff_ptr,
intptr_t n_coeffs,
int skip_block,
const int16_t *zbin_ptr,
const int16_t *round_ptr,
const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
int zbin_oq_value,
uint16_t *eob_ptr,
const int16_t *scan,
const int16_t *iscan,
int logsizeby32) {
int i, eob = -1;
(void)zbin_ptr;
(void)quant_shift_ptr;
@ -236,12 +292,13 @@ void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
int tmp = 0;
int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) {
if (abs_coeff >= (dequant_ptr[rc != 0] >> (2 + logsizeby32))) {
abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX);
tmp = (abs_coeff * quant_ptr[rc != 0]) >> 15;
tmp = (abs_coeff * quant_ptr[rc != 0]) >> (15 - logsizeby32);
qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] /
(2 << logsizeby32);
}
if (tmp)
@ -251,18 +308,64 @@ void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
*eob_ptr = eob + 1;
}
void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr,
intptr_t n_coeffs,
int skip_block,
const int16_t *zbin_ptr,
const int16_t *round_ptr,
const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
int zbin_oq_value,
uint16_t *eob_ptr,
const int16_t *scan,
const int16_t *iscan) {
quantize_fp_bigtx(coeff_ptr, n_coeffs, skip_block,
zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr,
qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
zbin_oq_value, eob_ptr, scan, iscan, 0);
}
#if CONFIG_TX64X64
void vp9_quantize_fp_64x64_c(const tran_low_t *coeff_ptr,
intptr_t n_coeffs,
int skip_block,
const int16_t *zbin_ptr,
const int16_t *round_ptr,
const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
int zbin_oq_value,
uint16_t *eob_ptr,
const int16_t *scan,
const int16_t *iscan) {
quantize_fp_bigtx(coeff_ptr, n_coeffs, skip_block,
zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr,
qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
zbin_oq_value, eob_ptr, scan, iscan, 1);
}
#endif // CONFIG_TX64X64
#if CONFIG_VP9_HIGHBITDEPTH
void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr,
intptr_t n_coeffs, int skip_block,
const int16_t *zbin_ptr,
const int16_t *round_ptr,
const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
int zbin_oq_value, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
static INLINE void highbd_quantize_fp_bigtx(const tran_low_t *coeff_ptr,
intptr_t n_coeffs,
int skip_block,
const int16_t *zbin_ptr,
const int16_t *round_ptr,
const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
int zbin_oq_value,
uint16_t *eob_ptr,
const int16_t *scan,
const int16_t *iscan,
int logsizeby32) {
int i, eob = -1;
(void)zbin_ptr;
(void)quant_shift_ptr;
@ -280,12 +383,13 @@ void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr,
int64_t tmp = 0;
const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) {
if (abs_coeff >= (dequant_ptr[rc != 0] >> (2 + logsizeby32))) {
tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
INT32_MIN, INT32_MAX);
tmp = (tmp * quant_ptr[rc != 0]) >> 15;
tmp = (tmp * quant_ptr[rc != 0]) >> (15 - logsizeby32);
qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] /
(2 << logsizeby32);
}
if (tmp)
@ -294,7 +398,49 @@ void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr,
}
*eob_ptr = eob + 1;
}
#endif
void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr,
intptr_t n_coeffs,
int skip_block,
const int16_t *zbin_ptr,
const int16_t *round_ptr,
const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
int zbin_oq_value,
uint16_t *eob_ptr,
const int16_t *scan,
const int16_t *iscan) {
highbd_quantize_fp_bigtx(coeff_ptr, n_coeffs, skip_block,
zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr,
qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
zbin_oq_value, eob_ptr, scan, iscan, 0);
}
#if CONFIG_TX64X64
void vp9_highbd_quantize_fp_64x64_c(const tran_low_t *coeff_ptr,
intptr_t n_coeffs,
int skip_block,
const int16_t *zbin_ptr,
const int16_t *round_ptr,
const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
int zbin_oq_value,
uint16_t *eob_ptr,
const int16_t *scan,
const int16_t *iscan) {
highbd_quantize_fp_bigtx(coeff_ptr, n_coeffs, skip_block,
zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr,
qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
zbin_oq_value, eob_ptr, scan, iscan, 1);
}
#endif // CONFIG_TX64X64
#endif // CONFIG_VP9_HIGHBITDEPTH
void vp9_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
int skip_block,
@ -403,23 +549,29 @@ void vp9_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
}
*eob_ptr = eob + 1;
}
#endif
#endif // CONFIG_VP9_HIGHBITDEPTH
void vp9_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
int skip_block,
const int16_t *zbin_ptr, const int16_t *round_ptr,
const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
int zbin_oq_value, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
static INLINE void quantize_b_bigtx(const tran_low_t *coeff_ptr,
intptr_t n_coeffs,
int skip_block,
const int16_t *zbin_ptr,
const int16_t *round_ptr,
const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
int zbin_oq_value,
uint16_t *eob_ptr,
const int16_t *scan,
const int16_t *iscan,
int logsizeby32) {
const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0] + zbin_oq_value, 1),
ROUND_POWER_OF_TWO(zbin_ptr[1] + zbin_oq_value, 1) };
const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
int idx = 0;
int idx_arr[1024];
int idx_arr[MAX_NUM_COEFS];
int i, eob = -1;
(void)iscan;
@ -446,13 +598,14 @@ void vp9_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
const int coeff_sign = (coeff >> 31);
int tmp;
int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], (1 + logsizeby32));
abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX);
tmp = ((((abs_coeff * quant_ptr[rc != 0]) >> 16) + abs_coeff) *
quant_shift_ptr[rc != 0]) >> 15;
quant_shift_ptr[rc != 0]) >> (15 - logsizeby32);
qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] /
(2 << logsizeby32);
if (tmp)
eob = idx_arr[i];
@ -461,24 +614,70 @@ void vp9_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
*eob_ptr = eob + 1;
}
void vp9_quantize_b_32x32_c(const tran_low_t *coeff_ptr,
intptr_t n_coeffs,
int skip_block,
const int16_t *zbin_ptr,
const int16_t *round_ptr,
const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
int zbin_oq_value,
uint16_t *eob_ptr,
const int16_t *scan,
const int16_t *iscan) {
quantize_b_bigtx(coeff_ptr, n_coeffs, skip_block,
zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr,
qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
zbin_oq_value, eob_ptr, scan, iscan, 0);
}
#if CONFIG_TX64X64
void vp9_quantize_b_64x64_c(const tran_low_t *coeff_ptr,
intptr_t n_coeffs,
int skip_block,
const int16_t *zbin_ptr,
const int16_t *round_ptr,
const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
int zbin_oq_value,
uint16_t *eob_ptr,
const int16_t *scan,
const int16_t *iscan) {
quantize_b_bigtx(coeff_ptr, n_coeffs, skip_block,
zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr,
qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
zbin_oq_value, eob_ptr, scan, iscan, 1);
}
#endif // CONFIG_TX64X64
#if CONFIG_VP9_HIGHBITDEPTH
void vp9_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr,
intptr_t n_coeffs, int skip_block,
const int16_t *zbin_ptr,
const int16_t *round_ptr,
const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
int zbin_oq_value, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
static INLINE void highbd_quantize_b_bigtx(const tran_low_t *coeff_ptr,
intptr_t n_coeffs,
int skip_block,
const int16_t *zbin_ptr,
const int16_t *round_ptr,
const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
int zbin_oq_value,
uint16_t *eob_ptr,
const int16_t *scan,
const int16_t *iscan,
int logsizeby32) {
const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0] + zbin_oq_value, 1),
ROUND_POWER_OF_TWO(zbin_ptr[1] + zbin_oq_value, 1) };
const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
int idx = 0;
int idx_arr[1024];
int idx_arr[MAX_NUM_COEFS];
int i, eob = -1;
(void)iscan;
@ -504,14 +703,15 @@ void vp9_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr,
const int coeff = coeff_ptr[rc];
const int coeff_sign = (coeff >> 31);
const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
int64_t tmp = clamp(abs_coeff +
ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
INT32_MIN, INT32_MAX);
int64_t tmp = clamp(
abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], (1 + logsizeby32)),
INT32_MIN, INT32_MAX);
tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
quant_shift_ptr[rc != 0]) >> 15;
quant_shift_ptr[rc != 0]) >> (15 - logsizeby32);
qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] /
(2 << logsizeby32);
if (tmp)
eob = idx_arr[i];
@ -519,7 +719,49 @@ void vp9_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr,
}
*eob_ptr = eob + 1;
}
#endif
void vp9_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr,
intptr_t n_coeffs,
int skip_block,
const int16_t *zbin_ptr,
const int16_t *round_ptr,
const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
int zbin_oq_value,
uint16_t *eob_ptr,
const int16_t *scan,
const int16_t *iscan) {
highbd_quantize_b_bigtx(coeff_ptr, n_coeffs, skip_block,
zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr,
qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
zbin_oq_value, eob_ptr, scan, iscan, 0);
}
#if CONFIG_TX64X64
void vp9_highbd_quantize_b_64x64_c(const tran_low_t *coeff_ptr,
intptr_t n_coeffs,
int skip_block,
const int16_t *zbin_ptr,
const int16_t *round_ptr,
const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
int zbin_oq_value,
uint16_t *eob_ptr,
const int16_t *scan,
const int16_t *iscan) {
highbd_quantize_b_bigtx(coeff_ptr, n_coeffs, skip_block,
zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr,
qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
zbin_oq_value, eob_ptr, scan, iscan, 1);
}
#endif // CONFIG_TX64X64
#endif // CONFIG_VP9_HIGHBITDEPTH
void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
const int16_t *scan, const int16_t *iscan) {
@ -530,21 +772,21 @@ void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
vp9_highbd_quantize_b(BLOCK_OFFSET(p->coeff, block),
16, x->skip_block,
p->zbin, p->round, p->quant, p->quant_shift,
BLOCK_OFFSET(p->qcoeff, block),
BLOCK_OFFSET(pd->dqcoeff, block),
pd->dequant, p->zbin_extra, &p->eobs[block],
scan, iscan);
16, x->skip_block,
p->zbin, p->round, p->quant, p->quant_shift,
BLOCK_OFFSET(p->qcoeff, block),
BLOCK_OFFSET(pd->dqcoeff, block),
pd->dequant, p->zbin_extra, &p->eobs[block],
scan, iscan);
return;
}
#endif
#endif // CONFIG_VP9_HIGHBITDEPTH
vp9_quantize_b(BLOCK_OFFSET(p->coeff, block),
16, x->skip_block,
p->zbin, p->round, p->quant, p->quant_shift,
BLOCK_OFFSET(p->qcoeff, block),
BLOCK_OFFSET(pd->dqcoeff, block),
pd->dequant, p->zbin_extra, &p->eobs[block], scan, iscan);
16, x->skip_block,
p->zbin, p->round, p->quant, p->quant_shift,
BLOCK_OFFSET(p->qcoeff, block),
BLOCK_OFFSET(pd->dqcoeff, block),
pd->dequant, p->zbin_extra, &p->eobs[block], scan, iscan);
}
static void invert_quant(int16_t *quant, int16_t *shift, int d) {

View File

@ -45,6 +45,12 @@ void vp9_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
const int16_t *round_ptr, const int16_t quant_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr, uint16_t *eob_ptr);
#if CONFIG_TX64X64
void vp9_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block,
const int16_t *round_ptr, const int16_t quant_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr, uint16_t *eob_ptr);
#endif // CONFIG_TX64X64
void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
const int16_t *scan, const int16_t *iscan);
@ -61,7 +67,17 @@ void vp9_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr,
tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr,
uint16_t *eob_ptr);
#endif
#if CONFIG_TX64X64
void vp9_highbd_quantize_dc_64x64(const tran_low_t *coeff_ptr,
int skip_block,
const int16_t *round_ptr,
const int16_t quant_ptr,
tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr,
uint16_t *eob_ptr);
#endif // CONFIG_TX64X64
#endif // CONFIG_VP9_HIGHBITDEPTH
struct VP9_COMP;
struct VP9Common;

View File

@ -88,7 +88,7 @@ static void fill_token_costs(vp9_coeff_cost *c,
vp9_coeff_probs_model (*p)[PLANE_TYPES]) {
int i, j, k, l;
TX_SIZE t;
for (t = TX_4X4; t <= TX_32X32; ++t)
for (t = TX_4X4; t < TX_SIZES; ++t)
for (i = 0; i < PLANE_TYPES; ++i)
for (j = 0; j < REF_TYPES; ++j)
for (k = 0; k < COEF_BANDS; ++k)
@ -437,6 +437,14 @@ void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size,
for (i = 0; i < num_4x4_h; i += 8)
t_left[i] = !!*(const uint64_t *)&left[i];
break;
#if CONFIG_TX64X64
case TX_64X64:
for (i = 0; i < num_4x4_w; i += 16)
t_above[i] = !!(*(const uint64_t *)&above[i] |
*(const uint64_t *)&above[i + 8]);
for (i = 0; i < num_4x4_h; i += 16)
t_left[i] = !!(*(const uint64_t *)&left[i] |
*(const uint64_t *)&left[i + 8]);
break;
#endif
default:
assert(0 && "Invalid transform size.");
break;
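The new TX_64X64 case reuses the wide-load trick of the smaller sizes: each byte of above[]/left[] records whether a 4x4 column was coded, one 64-bit load tests 8 columns at once, and a 64x64 transform spans 16 columns, hence two loads ORed together. A standalone illustration (memcpy is used here to avoid the aliasing cast):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void) {
  uint8_t above[16] = {0};   /* one context byte per 4x4 column */
  above[11] = 1;             /* a nonzero context in columns 8..15 */
  uint64_t lo, hi;
  memcpy(&lo, &above[0], 8); /* memcpy sidesteps the aliasing cast */
  memcpy(&hi, &above[8], 8);
  printf("t_above = %d\n", !!(lo | hi));  /* prints 1 */
  return 0;
}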

View File

@ -340,6 +340,9 @@ static const int16_t band_counts[TX_SIZES][8] = {
{ 1, 2, 3, 4, 11, 64 - 21, 0 },
{ 1, 2, 3, 4, 11, 256 - 21, 0 },
{ 1, 2, 3, 4, 11, 1024 - 21, 0 },
#if CONFIG_TX64X64
{ 1, 2, 3, 4, 11, 4096 - 21, 0 },
#endif
};
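The appended row keeps the band layout invariant: the first five bands always cover 1 + 2 + 3 + 4 + 11 = 21 coefficients, and the final band takes whatever remains of the block, 4096 - 21 for 64x64. A quick arithmetic check:

#include <assert.h>
#include <stdio.h>

int main(void) {
  /* The first five bands always hold 1 + 2 + 3 + 4 + 11 = 21
   * coefficients; the final band absorbs the rest of the block. */
  const int leading = 1 + 2 + 3 + 4 + 11;
  assert(leading + (1024 - 21) == 32 * 32);  /* existing TX_32X32 row */
  assert(leading + (4096 - 21) == 64 * 64);  /* new TX_64X64 row */
  printf("band totals match the block sizes\n");
  return 0;
}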
static INLINE int cost_coeffs(MACROBLOCK *x,
int plane, int block,
@ -357,7 +360,7 @@ static INLINE int cost_coeffs(MACROBLOCK *x,
const tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
unsigned int (*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
x->token_costs[tx_size][type][is_inter_block(mbmi)];
uint8_t token_cache[32 * 32];
uint8_t token_cache[MAX_NUM_COEFS];
int pt = combine_entropy_contexts(*A, *L);
int c, cost;
// Check for consistency of tx_size with mode info
@ -416,6 +419,8 @@ static INLINE int cost_coeffs(MACROBLOCK *x,
return cost;
}
#define right_shift_signed(x, s) ((s) < 0 ? (x) << (-(s)) : (x) >> (s))
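The right_shift_signed macro generalizes the previous hard-coded >> shift: TX_64X64 uses shift = -2 (its distortion needs scaling up rather than down), and a plain right shift by a negative count would be undefined behavior. A toy demonstration with an arbitrary error value:

#include <stdint.h>
#include <stdio.h>

#define right_shift_signed(x, s) ((s) < 0 ? (x) << (-(s)) : (x) >> (s))

int main(void) {
  const int64_t err = 4096;
  printf("%lld\n", (long long)right_shift_signed(err, 2));  /* 1024: TX_16X16 and below */
  printf("%lld\n", (long long)right_shift_signed(err, 0));  /* 4096: TX_32X32 */
  printf("%lld\n", (long long)right_shift_signed(err, -2)); /* 16384: TX_64X64 */
  return 0;
}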
#if CONFIG_VP9_HIGHBITDEPTH
static void dist_block(int plane, int block, TX_SIZE tx_size,
struct rdcost_block_args* args, int bd) {
@ -429,17 +434,23 @@ static void dist_block(int plane, int block, TX_SIZE tx_size,
const struct macroblock_plane *const p = &x->plane[plane];
const struct macroblockd_plane *const pd = &xd->plane[plane];
int64_t this_sse;
#if CONFIG_TX64X64
int shift = (tx_size == TX_64X64 ? -2 : (tx_size == TX_32X32 ? 0 : 2));
#else
int shift = tx_size == TX_32X32 ? 0 : 2;
#endif
tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
#if CONFIG_VP9_HIGHBITDEPTH
args->dist = vp9_highbd_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
&this_sse, bd) >> shift;
args->dist = right_shift_signed(
vp9_highbd_block_error(
coeff, dqcoeff, 16 << ss_txfrm_size, &this_sse, bd), shift);
#else
args->dist = vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
&this_sse) >> shift;
args->dist = right_shift_signed(
vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size, &this_sse), shift);
#endif // CONFIG_VP9_HIGHBITDEPTH
args->sse = this_sse >> shift;
args->sse = right_shift_signed(this_sse, shift);
if (x->skip_encode && !is_inter_block(&xd->mi[0].src_mi->mbmi)) {
// TODO(jingning): tune the model to better capture the distortion.
@ -514,9 +525,12 @@ static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
#if CONFIG_VP9_HIGHBITDEPTH
dc_correct >>= ((xd->bd - 8) * 2);
#endif
if (tx_size != TX_32X32)
if (tx_size < TX_32X32)
dc_correct >>= 2;
#if CONFIG_TX64X64
else if (tx_size == TX_64X64)
dc_correct <<= 2;
#endif
args->dist = MAX(0, args->sse - dc_correct);
}
} else {
@ -629,10 +643,15 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
vp9_prob skip_prob = vp9_get_skip_prob(cm, xd);
int r[TX_SIZES][2], s[TX_SIZES];
int64_t d[TX_SIZES], sse[TX_SIZES];
int64_t rd[TX_SIZES][2] = {{INT64_MAX, INT64_MAX},
{INT64_MAX, INT64_MAX},
{INT64_MAX, INT64_MAX},
{INT64_MAX, INT64_MAX}};
int64_t rd[TX_SIZES][2] = {
{INT64_MAX, INT64_MAX},
{INT64_MAX, INT64_MAX},
{INT64_MAX, INT64_MAX},
{INT64_MAX, INT64_MAX},
#if CONFIG_TX64X64
{INT64_MAX, INT64_MAX},
#endif
};
int n, m;
int s0, s1;
const TX_SIZE max_mode_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode];
@ -681,7 +700,6 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
mbmi->tx_size = cm->tx_mode == TX_MODE_SELECT ?
best_tx : MIN(max_tx_size, max_mode_tx_size);
*distortion = d[mbmi->tx_size];
*rate = r[mbmi->tx_size][cm->tx_mode == TX_MODE_SELECT];
*skip = s[mbmi->tx_size];
@ -691,8 +709,14 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
tx_cache[ALLOW_8X8] = rd[TX_8X8][0];
tx_cache[ALLOW_16X16] = rd[MIN(max_tx_size, TX_16X16)][0];
tx_cache[ALLOW_32X32] = rd[MIN(max_tx_size, TX_32X32)][0];
#if CONFIG_TX64X64
tx_cache[ALLOW_64X64] = rd[MIN(max_tx_size, TX_64X64)][0];
#endif
if (max_tx_size == TX_32X32 && best_tx == TX_32X32) {
#if CONFIG_TX64X64
if (max_tx_size >= TX_64X64 && best_tx == TX_64X64) {
tx_cache[TX_MODE_SELECT] = rd[TX_64X64][1];
} else if (max_tx_size >= TX_32X32 && best_tx == TX_32X32) {
tx_cache[TX_MODE_SELECT] = rd[TX_32X32][1];
} else if (max_tx_size >= TX_16X16 && best_tx == TX_16X16) {
tx_cache[TX_MODE_SELECT] = rd[TX_16X16][1];
@ -701,6 +725,17 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
} else {
tx_cache[TX_MODE_SELECT] = rd[TX_4X4][1];
}
#else
if (max_tx_size >= TX_32X32 && best_tx == TX_32X32) {
tx_cache[TX_MODE_SELECT] = rd[TX_32X32][1];
} else if (max_tx_size >= TX_16X16 && best_tx == TX_16X16) {
tx_cache[TX_MODE_SELECT] = rd[TX_16X16][1];
} else if (rd[TX_8X8][1] < rd[TX_4X4][1]) {
tx_cache[TX_MODE_SELECT] = rd[TX_8X8][1];
} else {
tx_cache[TX_MODE_SELECT] = rd[TX_4X4][1];
}
#endif
}
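The TX_MODE_SELECT ladder gains a 64x64 rung and now checks every size against max_tx_size before trusting best_tx. A reduced standalone version of the selection with invented RD costs (it assumes the experiment is enabled, so TX_64X64 is a real enum value):

#include <stdint.h>
#include <stdio.h>

typedef enum { TX_4X4, TX_8X8, TX_16X16, TX_32X32, TX_64X64, TX_SIZES } TX_SIZE;

/* Reduced ladder: prefer the largest size that both won the per-block
 * search (best_tx) and fits under the frame limit; rd[n][1] is the cost
 * of size n when the transform size is signalled per block. */
static int64_t select_cost(const int64_t rd[TX_SIZES][2],
                           TX_SIZE max_tx_size, TX_SIZE best_tx) {
  if (max_tx_size >= TX_64X64 && best_tx == TX_64X64) return rd[TX_64X64][1];
  if (max_tx_size >= TX_32X32 && best_tx == TX_32X32) return rd[TX_32X32][1];
  if (max_tx_size >= TX_16X16 && best_tx == TX_16X16) return rd[TX_16X16][1];
  return rd[TX_8X8][1] < rd[TX_4X4][1] ? rd[TX_8X8][1] : rd[TX_4X4][1];
}

int main(void) {
  const int64_t rd[TX_SIZES][2] = {
    {90, 100}, {80, 85}, {70, 75}, {60, 65}, {50, 55}  /* invented costs */
  };
  printf("%lld\n", (long long)select_cost(rd, TX_64X64, TX_64X64)); /* 55 */
  printf("%lld\n", (long long)select_cost(rd, TX_32X32, TX_32X32)); /* 65 */
  return 0;
}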
static void super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
@ -1972,12 +2007,13 @@ static void estimate_ref_frame_costs(const VP9_COMMON *cm,
}
}
static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
int mode_index,
int64_t comp_pred_diff[REFERENCE_MODES],
const int64_t tx_size_diff[TX_MODES],
int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS],
int skippable) {
static void store_coding_context(
MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
int mode_index,
int64_t comp_pred_diff[REFERENCE_MODES],
const int64_t tx_size_diff[TX_MODES],
int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS],
int skippable) {
MACROBLOCKD *const xd = &x->e_mbd;
// Take a snapshot of the coding context so it can be

View File

@ -48,6 +48,10 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
sf->adaptive_pred_interp_filter = 1;
sf->recode_loop = ALLOW_RECODE_KFARFGF;
#if CONFIG_TX64X64
sf->intra_y_mode_mask[TX_64X64] = INTRA_DC_H_V;
sf->intra_uv_mode_mask[TX_64X64] = INTRA_DC_H_V;
#endif
sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
@ -114,6 +118,10 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
sf->recode_loop = ALLOW_RECODE_KFMAXBW;
sf->adaptive_rd_thresh = 3;
sf->mode_skip_start = 6;
#if CONFIG_TX64X64
sf->intra_y_mode_mask[TX_64X64] = INTRA_DC;
sf->intra_uv_mode_mask[TX_64X64] = INTRA_DC;
#endif
sf->intra_y_mode_mask[TX_32X32] = INTRA_DC;
sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC;
sf->adaptive_interp_filter_search = 1;
@ -181,6 +189,10 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
sf->adaptive_pred_interp_filter = 1;
sf->mv.auto_mv_step_size = 1;
sf->adaptive_rd_thresh = 2;
#if CONFIG_TX64X64
sf->intra_y_mode_mask[TX_64X64] = INTRA_DC_H_V;
sf->intra_uv_mode_mask[TX_64X64] = INTRA_DC_H_V;
#endif
sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
@ -240,6 +252,9 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
sf->intra_uv_mode_mask[i] = INTRA_DC;
}
sf->intra_y_mode_mask[TX_32X32] = INTRA_DC;
#if CONFIG_TX64X64
sf->intra_y_mode_mask[TX_64X64] = INTRA_DC;
#endif
sf->frame_parameter_update = 0;
sf->mv.search_method = FAST_HEX;
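Across all three speed paths the change is the same: whenever the TX_32X32 intra mode mask is restricted, the TX_64X64 mask is restricted identically. A sketch of how such a mask gates the mode search; the bit assignments below are invented, not the codec's actual INTRA_* definitions:

#include <stdio.h>

/* Hypothetical bit assignments for a per-tx-size intra mode mask. */
enum { DC_PRED = 1 << 0, V_PRED = 1 << 1, H_PRED = 1 << 2, TM_PRED = 1 << 3 };
#define INTRA_DC     (DC_PRED)
#define INTRA_DC_H_V (DC_PRED | V_PRED | H_PRED)

int main(void) {
  const int mask_64 = INTRA_DC_H_V;  /* speed 1: DC/H/V only at TX_64X64 */
  const int mask_hi = INTRA_DC;      /* higher speeds: DC only */
  printf("TM at 64x64?    %s\n", (mask_64 & TM_PRED) ? "yes" : "no"); /* no */
  printf("V at 64x64?     %s\n", (mask_64 & V_PRED) ? "yes" : "no");  /* yes */
  printf("V when DC-only? %s\n", (mask_hi & V_PRED) ? "yes" : "no");  /* no */
  return 0;
}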

View File

@ -296,7 +296,7 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
VP9_COMP *cpi = args->cpi;
MACROBLOCKD *xd = args->xd;
TOKENEXTRA **tp = args->tp;
uint8_t token_cache[32 * 32];
uint8_t token_cache[MAX_NUM_COEFS];
struct macroblock_plane *p = &cpi->mb.plane[plane];
struct macroblockd_plane *pd = &xd->plane[plane];
MB_MODE_INFO *mbmi = &xd->mi[0].src_mi->mbmi;
@ -374,7 +374,6 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
counts[band[c]][pt]);
++eob_branch[band[c]][pt];
}
*tp = t;
vp9_set_contexts(xd, pd, plane_bsize, tx_size, c > 0, aoff, loff);
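The 1024-element buffers (32 * 32 coefficients) that this file and the RD code shared now size themselves from MAX_NUM_COEFS. Its definition lives elsewhere in the patch; the gated form below is an assumption consistent with the replacements made here:

#include <stdio.h>

#define CONFIG_TX64X64 1  /* as if the experiment were enabled */

/* Assumed definition, matching the 1024 -> MAX_NUM_COEFS swaps above. */
#if CONFIG_TX64X64
#define MAX_NUM_COEFS (64 * 64)  /* 4096 coefficients */
#else
#define MAX_NUM_COEFS (32 * 32)  /* 1024 coefficients */
#endif

int main(void) {
  unsigned char token_cache[MAX_NUM_COEFS];
  printf("token_cache bytes: %zu\n", sizeof token_cache);
  return 0;
}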