Use balanced model for intra prediction mode coding

This commit replaces the previous table-based intra mode model coding with a more balanced entropy coding system. It reduces the decoder lookup table size by 1K bytes. Key frame compression performance is about even on average: at a few test points it improves by over 5%, while most test points are fairly close to the lookup table approach. Change-Id: I47154276c0a6a22ae87de8845bc2d494681b95f6
Make tx partition entropy coder account for block size
2015-06-23 16:42:56 -07:00 · 2015-06-18 21:56:30 +00:00 · 2015-06-18 14:54:49 -07:00 · 2015-06-16 18:56:47 -07:00 · 2015-06-16 08:49:13 -07:00 · 2015-06-15 15:53:19 -07:00
33 changed files with 1835 additions and 423 deletions
--- a/test/byte_alignment_test.cc
+++ b/test/byte_alignment_test.cc
@ -21,13 +21,13 @@

 namespace {

-const int kLegacyByteAlignment = 0;
-const int kLegacyYPlaneByteAlignment = 32;
-const int kNumPlanesToCheck = 3;
-const char kVP9TestFile[] = "vp90-2-02-size-lf-1920x1080.webm";
-const char kVP9Md5File[] = "vp90-2-02-size-lf-1920x1080.webm.md5";
+//const int kLegacyByteAlignment = 0;
+//const int kLegacyYPlaneByteAlignment = 32;
+//const int kNumPlanesToCheck = 3;
+//const char kVP9TestFile[] = "vp90-2-02-size-lf-1920x1080.webm";
+//const char kVP9Md5File[] = "vp90-2-02-size-lf-1920x1080.webm.md5";

-#if CONFIG_WEBM_IO
+#if CONFIG_WEBM_IO && 0

 struct ByteAlignmentTestParam {
  int byte_alignment;
--- a/test/external_frame_buffer_test.cc
+++ b/test/external_frame_buffer_test.cc
@ -398,7 +398,7 @@ TEST_P(ExternalFrameBufferMD5Test, ExtFBMD5Match) {
  delete video;
 }

-#if CONFIG_WEBM_IO
+#if CONFIG_WEBM_IO && 0
 TEST_F(ExternalFrameBufferTest, MinFrameBuffers) {
  // Minimum number of external frame buffers for VP9 is
  // #VP9_MAXIMUM_REF_BUFFERS + #VPX_MAXIMUM_WORK_BUFFERS.
@ -481,8 +481,8 @@ TEST_F(ExternalFrameBufferTest, SetAfterDecode) {
 }
 #endif  // CONFIG_WEBM_IO

-VP9_INSTANTIATE_TEST_CASE(ExternalFrameBufferMD5Test,
-                          ::testing::ValuesIn(libvpx_test::kVP9TestVectors,
-                                              libvpx_test::kVP9TestVectors +
-                                              libvpx_test::kNumVP9TestVectors));
+//VP9_INSTANTIATE_TEST_CASE(ExternalFrameBufferMD5Test,
+//                          ::testing::ValuesIn(libvpx_test::kVP9TestVectors,
+//                                              libvpx_test::kVP9TestVectors +
+//                                              libvpx_test::kNumVP9TestVectors));
 }  // namespace
--- a/test/invalid_file_test.cc
+++ b/test/invalid_file_test.cc
@ -110,23 +110,23 @@ TEST_P(InvalidFileTest, ReturnCode) {
  RunTest();
 }

-const DecodeParam kVP9InvalidFileTests[] = {
-  {1, "invalid-vp90-02-v2.webm"},
-  {1, "invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf"},
-  {1, "invalid-vp90-03-v3.webm"},
-  {1, "invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf"},
-  {1, "invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf"},
-  {1, "invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf"},
-  {1, "invalid-vp90-2-05-resize.ivf.s59293_r01-05_b6-.ivf"},
-  {1, "invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.v2.ivf"},
-  {1, "invalid-vp91-2-mixedrefcsp-444to420.ivf"},
-  {1, "invalid-vp90-2-12-droppable_1.ivf.s73804_r01-05_b6-.ivf"},
-  {1, "invalid-vp90-2-03-size-224x196.webm.ivf.s44156_r01-05_b6-.ivf"},
-  {1, "invalid-vp90-2-03-size-202x210.webm.ivf.s113306_r01-05_b6-.ivf"},
-};
+//const DecodeParam kVP9InvalidFileTests[] = {
+//  {1, "invalid-vp90-02-v2.webm"},
+//  {1, "invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf"},
+//  {1, "invalid-vp90-03-v3.webm"},
+//  {1, "invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf"},
+//  {1, "invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf"},
+//  {1, "invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf"},
+//  {1, "invalid-vp90-2-05-resize.ivf.s59293_r01-05_b6-.ivf"},
+//  {1, "invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.v2.ivf"},
+//  {1, "invalid-vp91-2-mixedrefcsp-444to420.ivf"},
+//  {1, "invalid-vp90-2-12-droppable_1.ivf.s73804_r01-05_b6-.ivf"},
+//  {1, "invalid-vp90-2-03-size-224x196.webm.ivf.s44156_r01-05_b6-.ivf"},
+//  {1, "invalid-vp90-2-03-size-202x210.webm.ivf.s113306_r01-05_b6-.ivf"},
+//};

-VP9_INSTANTIATE_TEST_CASE(InvalidFileTest,
-                          ::testing::ValuesIn(kVP9InvalidFileTests));
+//VP9_INSTANTIATE_TEST_CASE(InvalidFileTest,
+//                          ::testing::ValuesIn(kVP9InvalidFileTests));

 // This class will include test vectors that are expected to fail
 // peek. However they are still expected to have no fatal failures.
@ -142,26 +142,26 @@ TEST_P(InvalidFileInvalidPeekTest, ReturnCode) {
  RunTest();
 }

-const DecodeParam kVP9InvalidFileInvalidPeekTests[] = {
-  {1, "invalid-vp90-01-v2.webm"},
-};
+//const DecodeParam kVP9InvalidFileInvalidPeekTests[] = {
+//  {1, "invalid-vp90-01-v2.webm"},
+//};

-VP9_INSTANTIATE_TEST_CASE(InvalidFileInvalidPeekTest,
-                          ::testing::ValuesIn(kVP9InvalidFileInvalidPeekTests));
+//VP9_INSTANTIATE_TEST_CASE(InvalidFileInvalidPeekTest,
+//                          ::testing::ValuesIn(kVP9InvalidFileInvalidPeekTests));

-const DecodeParam kMultiThreadedVP9InvalidFileTests[] = {
-  {4, "invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm"},
-  {4, "invalid-"
-      "vp90-2-08-tile_1x2_frame_parallel.webm.ivf.s47039_r01-05_b6-.ivf"},
-  {4, "invalid-vp90-2-08-tile_1x8_frame_parallel.webm.ivf.s288_r01-05_b6-.ivf"},
-  {2, "invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.v2.ivf"},
-  {4, "invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.v2.ivf"},
-};
+//const DecodeParam kMultiThreadedVP9InvalidFileTests[] = {
+//  {4, "invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm"},
+//  {4, "invalid-"
+//      "vp90-2-08-tile_1x2_frame_parallel.webm.ivf.s47039_r01-05_b6-.ivf"},
+//  {4, "invalid-vp90-2-08-tile_1x8_frame_parallel.webm.ivf.s288_r01-05_b6-.ivf"},
+//  {2, "invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.v2.ivf"},
+//  {4, "invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.v2.ivf"},
+//};

-INSTANTIATE_TEST_CASE_P(
-    VP9MultiThreaded, InvalidFileTest,
-    ::testing::Combine(
-        ::testing::Values(
-            static_cast<const libvpx_test::CodecFactory*>(&libvpx_test::kVP9)),
-        ::testing::ValuesIn(kMultiThreadedVP9InvalidFileTests)));
+//INSTANTIATE_TEST_CASE_P(
+//    VP9MultiThreaded, InvalidFileTest,
+//    ::testing::Combine(
+//        ::testing::Values(
+//            static_cast<const libvpx_test::CodecFactory*>(&libvpx_test::kVP9)),
+//        ::testing::ValuesIn(kMultiThreadedVP9InvalidFileTests)));
 }  // namespace
--- a/test/test_vector_test.cc
+++ b/test/test_vector_test.cc
@ -145,28 +145,28 @@ VP8_INSTANTIATE_TEST_CASE(
                                libvpx_test::kNumVP8TestVectors)));

 // Test VP9 decode in serial mode with single thread.
-VP9_INSTANTIATE_TEST_CASE(
-    TestVectorTest,
-    ::testing::Combine(
-        ::testing::Values(0),  // Serial Mode.
-        ::testing::Values(1),  // Single thread.
-        ::testing::ValuesIn(libvpx_test::kVP9TestVectors,
-                            libvpx_test::kVP9TestVectors +
-                                libvpx_test::kNumVP9TestVectors)));
+//VP9_INSTANTIATE_TEST_CASE(
+//    TestVectorTest,
+//    ::testing::Combine(
+//        ::testing::Values(0),  // Serial Mode.
+//        ::testing::Values(1),  // Single thread.
+//        ::testing::ValuesIn(libvpx_test::kVP9TestVectors,
+//                            libvpx_test::kVP9TestVectors +
+//                                libvpx_test::kNumVP9TestVectors)));


-#if CONFIG_VP9_DECODER
-// Test VP9 decode in frame parallel mode with different number of threads.
-INSTANTIATE_TEST_CASE_P(
-    VP9MultiThreadedFrameParallel, TestVectorTest,
-    ::testing::Combine(
-        ::testing::Values(
-            static_cast<const libvpx_test::CodecFactory *>(&libvpx_test::kVP9)),
-        ::testing::Combine(
-            ::testing::Values(1),        // Frame Parallel mode.
-            ::testing::Range(2, 9),      // With 2 ~ 8 threads.
-            ::testing::ValuesIn(libvpx_test::kVP9TestVectors,
-                                libvpx_test::kVP9TestVectors +
-                                    libvpx_test::kNumVP9TestVectors))));
-#endif
+//#if CONFIG_VP9_DECODER
+//// Test VP9 decode in frame parallel mode with different number of threads.
+//INSTANTIATE_TEST_CASE_P(
+//    VP9MultiThreadedFrameParallel, TestVectorTest,
+//    ::testing::Combine(
+//        ::testing::Values(
+//            static_cast<const libvpx_test::CodecFactory *>(&libvpx_test::kVP9)),
+//        ::testing::Combine(
+//            ::testing::Values(1),        // Frame Parallel mode.
+//            ::testing::Range(2, 9),      // With 2 ~ 8 threads.
+//            ::testing::ValuesIn(libvpx_test::kVP9TestVectors,
+//                                libvpx_test::kVP9TestVectors +
+//                                    libvpx_test::kNumVP9TestVectors))));
+//#endif
 }  // namespace
--- a/test/user_priv_test.cc
+++ b/test/user_priv_test.cc
@ -30,7 +30,7 @@ namespace {
 using std::string;
 using libvpx_test::ACMRandom;

-#if CONFIG_WEBM_IO
+#if CONFIG_WEBM_IO && 0

 void CheckUserPrivateData(void *user_priv, int *target) {
  // actual pointer value should be the same as expected.
--- a/test/vp9_decrypt_test.cc
+++ b/test/vp9_decrypt_test.cc
@ -43,29 +43,29 @@ void test_decrypt_cb(void *decrypt_state, const uint8_t *input,

 namespace libvpx_test {

-TEST(TestDecrypt, DecryptWorksVp9) {
-  libvpx_test::IVFVideoSource video("vp90-2-05-resize.ivf");
-  video.Init();
-
-  vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t();
-  VP9Decoder decoder(dec_cfg, 0);
-
-  video.Begin();
-
-  // no decryption
-  vpx_codec_err_t res = decoder.DecodeFrame(video.cxdata(), video.frame_size());
-  ASSERT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError();
-
-  // decrypt frame
-  video.Next();
-
-  std::vector<uint8_t> encrypted(video.frame_size());
-  encrypt_buffer(video.cxdata(), &encrypted[0], video.frame_size(), 0);
-  vpx_decrypt_init di = { test_decrypt_cb, &encrypted[0] };
-  decoder.Control(VPXD_SET_DECRYPTOR, &di);
-
-  res = decoder.DecodeFrame(&encrypted[0], encrypted.size());
-  ASSERT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError();
-}
+//TEST(TestDecrypt, DecryptWorksVp9) {
+//  libvpx_test::IVFVideoSource video("vp90-2-05-resize.ivf");
+//  video.Init();
+//
+//  vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t();
+//  VP9Decoder decoder(dec_cfg, 0);
+//
+//  video.Begin();
+//
+//  // no decryption
+//  vpx_codec_err_t res = decoder.DecodeFrame(video.cxdata(), video.frame_size());
+//  ASSERT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError();
+//
+//  // decrypt frame
+//  video.Next();
+//
+//  std::vector<uint8_t> encrypted(video.frame_size());
+//  encrypt_buffer(video.cxdata(), &encrypted[0], video.frame_size(), 0);
+//  vpx_decrypt_init di = { test_decrypt_cb, &encrypted[0] };
+//  decoder.Control(VPXD_SET_DECRYPTOR, &di);
+//
+//  res = decoder.DecodeFrame(&encrypted[0], encrypted.size());
+//  ASSERT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError();
+//}

 }  // namespace libvpx_test
--- a/test/vp9_frame_parallel_test.cc
+++ b/test/vp9_frame_parallel_test.cc
@ -27,7 +27,7 @@ namespace {

 using std::string;

-#if CONFIG_WEBM_IO
+#if CONFIG_WEBM_IO && 0

 struct FileList {
  const char *name;
--- a/test/vp9_thread_test.cc
+++ b/test/vp9_thread_test.cc
@ -152,7 +152,7 @@ TEST(VP9WorkerThreadTest, TestInterfaceAPI) {
 // -----------------------------------------------------------------------------
 // Multi-threaded decode tests

-#if CONFIG_WEBM_IO
+#if CONFIG_WEBM_IO && 0
 struct FileList {
  const char *name;
  const char *expected_md5;
--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c
@ -114,6 +114,8 @@ void vp9_free_context_buffers(VP9_COMMON *cm) {
  cm->above_context = NULL;
  vpx_free(cm->above_seg_context);
  cm->above_seg_context = NULL;
+  vpx_free(cm->above_txfm_context);
+  cm->above_txfm_context = NULL;
 }

 int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) {
@ -137,6 +139,10 @@ int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) {
      mi_cols_aligned_to_sb(cm->mi_cols), sizeof(*cm->above_seg_context));
  if (!cm->above_seg_context) goto fail;

+  cm->above_txfm_context = (TXFM_CONTEXT *)vpx_calloc(
+      mi_cols_aligned_to_sb(cm->mi_cols), sizeof(*cm->above_txfm_context));
+  if (!cm->above_txfm_context) goto fail;
+
  return 0;

 fail:
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@ -29,6 +29,7 @@ extern "C" {
 #define BLOCK_SIZE_GROUPS 4
 #define SKIP_CONTEXTS 3
 #define INTER_MODE_CONTEXTS 7
+#define TXFM_PARTITION_CONTEXTS 12

 /* Segment Feature Masks */
 #define MAX_MV_REF_CANDIDATES 2
@ -46,6 +47,7 @@ typedef enum {
 #define MAX_MB_PLANE 3

 typedef char ENTROPY_CONTEXT;
+typedef TX_SIZE TXFM_CONTEXT;

 static INLINE int combine_entropy_contexts(ENTROPY_CONTEXT a,
                                           ENTROPY_CONTEXT b) {
@ -113,6 +115,8 @@ typedef struct {
  BLOCK_SIZE sb_type;
  PREDICTION_MODE mode;
  TX_SIZE tx_size;
+  TX_SIZE inter_tx_size[64];  // Assume maximum of 64x64 block size.
+  TX_SIZE max_tx_size;        // Maximum tx size allowed in current block.
  int8_t skip;
  int8_t segment_id;
  int8_t seg_id_predicted;  // valid only when temporal_update is enabled
@ -126,7 +130,6 @@ typedef struct {
  int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
  uint8_t mode_context[MAX_REF_FRAMES];
  INTERP_FILTER interp_filter;
-
 } MB_MODE_INFO;

 typedef struct MODE_INFO {
@ -218,6 +221,10 @@ typedef struct macroblockd {
  PARTITION_CONTEXT *above_seg_context;
  PARTITION_CONTEXT left_seg_context[8];

+  TXFM_CONTEXT *above_txfm_context;
+  TXFM_CONTEXT *left_txfm_context;
+  TXFM_CONTEXT left_txfm_context_buffer[8];
+
  /* mc buffer */
  DECLARE_ALIGNED(16, uint8_t, mc_buf[80 * 2 * 80 * 2]);

@ -265,13 +272,21 @@ static INLINE TX_TYPE get_tx_type_4x4(PLANE_TYPE plane_type,

 void vp9_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y);

-static INLINE TX_SIZE get_uv_tx_size_impl(TX_SIZE y_tx_size, BLOCK_SIZE bsize,
+static TX_SIZE get_uv_tx_size_impl(TX_SIZE y_tx_size, BLOCK_SIZE bsize,
                                          int xss, int yss) {
  if (bsize < BLOCK_8X8) {
    return TX_4X4;
  } else {
    const BLOCK_SIZE plane_bsize = ss_size_lookup[bsize][xss][yss];
-    return MIN(y_tx_size, max_txsize_lookup[plane_bsize]);
+    TX_SIZE uv_tx_size = TX_4X4;
+    if (y_tx_size == TX_32X32)
+      uv_tx_size = TX_16X16;
+    else if (y_tx_size == TX_16X16)
+      uv_tx_size = TX_8X8;
+    else if (y_tx_size == TX_8X8)
+      uv_tx_size = TX_4X4;
+
+    return MIN(uv_tx_size, max_txsize_lookup[plane_bsize]);
  }
 }

@ -300,7 +315,7 @@ void vp9_foreach_transformed_block(
    const MACROBLOCKD* const xd, BLOCK_SIZE bsize,
    foreach_transformed_block_visitor visit, void *arg);

-static INLINE void txfrm_block_to_raster_xy(BLOCK_SIZE plane_bsize,
+static void txfrm_block_to_raster_xy(BLOCK_SIZE plane_bsize,
                                            TX_SIZE tx_size, int block,
                                            int *x, int *y) {
  const int bwl = b_width_log2_lookup[plane_bsize];
--- a/vp9/common/vp9_entropymode.c
+++ b/vp9/common/vp9_entropymode.c
@ -13,118 +13,12 @@
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_seg_common.h"

-const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES][INTRA_MODES][INTRA_MODES - 1] = {
-  {  // above = dc
-    { 137,  30,  42, 148, 151, 207,  70,  52,  91 },  // left = dc
-    {  92,  45, 102, 136, 116, 180,  74,  90, 100 },  // left = v
-    {  73,  32,  19, 187, 222, 215,  46,  34, 100 },  // left = h
-    {  91,  30,  32, 116, 121, 186,  93,  86,  94 },  // left = d45
-    {  72,  35,  36, 149,  68, 206,  68,  63, 105 },  // left = d135
-    {  73,  31,  28, 138,  57, 124,  55, 122, 151 },  // left = d117
-    {  67,  23,  21, 140, 126, 197,  40,  37, 171 },  // left = d153
-    {  86,  27,  28, 128, 154, 212,  45,  43,  53 },  // left = d207
-    {  74,  32,  27, 107,  86, 160,  63, 134, 102 },  // left = d63
-    {  59,  67,  44, 140, 161, 202,  78,  67, 119 }   // left = tm
-  }, {  // above = v
-    {  63,  36, 126, 146, 123, 158,  60,  90,  96 },  // left = dc
-    {  43,  46, 168, 134, 107, 128,  69, 142,  92 },  // left = v
-    {  44,  29,  68, 159, 201, 177,  50,  57,  77 },  // left = h
-    {  58,  38,  76, 114,  97, 172,  78, 133,  92 },  // left = d45
-    {  46,  41,  76, 140,  63, 184,  69, 112,  57 },  // left = d135
-    {  38,  32,  85, 140,  46, 112,  54, 151, 133 },  // left = d117
-    {  39,  27,  61, 131, 110, 175,  44,  75, 136 },  // left = d153
-    {  52,  30,  74, 113, 130, 175,  51,  64,  58 },  // left = d207
-    {  47,  35,  80, 100,  74, 143,  64, 163,  74 },  // left = d63
-    {  36,  61, 116, 114, 128, 162,  80, 125,  82 }   // left = tm
-  }, {  // above = h
-    {  82,  26,  26, 171, 208, 204,  44,  32, 105 },  // left = dc
-    {  55,  44,  68, 166, 179, 192,  57,  57, 108 },  // left = v
-    {  42,  26,  11, 199, 241, 228,  23,  15,  85 },  // left = h
-    {  68,  42,  19, 131, 160, 199,  55,  52,  83 },  // left = d45
-    {  58,  50,  25, 139, 115, 232,  39,  52, 118 },  // left = d135
-    {  50,  35,  33, 153, 104, 162,  64,  59, 131 },  // left = d117
-    {  44,  24,  16, 150, 177, 202,  33,  19, 156 },  // left = d153
-    {  55,  27,  12, 153, 203, 218,  26,  27,  49 },  // left = d207
-    {  53,  49,  21, 110, 116, 168,  59,  80,  76 },  // left = d63
-    {  38,  72,  19, 168, 203, 212,  50,  50, 107 }   // left = tm
-  }, {  // above = d45
-    { 103,  26,  36, 129, 132, 201,  83,  80,  93 },  // left = dc
-    {  59,  38,  83, 112, 103, 162,  98, 136,  90 },  // left = v
-    {  62,  30,  23, 158, 200, 207,  59,  57,  50 },  // left = h
-    {  67,  30,  29,  84,  86, 191, 102,  91,  59 },  // left = d45
-    {  60,  32,  33, 112,  71, 220,  64,  89, 104 },  // left = d135
-    {  53,  26,  34, 130,  56, 149,  84, 120, 103 },  // left = d117
-    {  53,  21,  23, 133, 109, 210,  56,  77, 172 },  // left = d153
-    {  77,  19,  29, 112, 142, 228,  55,  66,  36 },  // left = d207
-    {  61,  29,  29,  93,  97, 165,  83, 175, 162 },  // left = d63
-    {  47,  47,  43, 114, 137, 181, 100,  99,  95 }   // left = tm
-  }, {  // above = d135
-    {  69,  23,  29, 128,  83, 199,  46,  44, 101 },  // left = dc
-    {  53,  40,  55, 139,  69, 183,  61,  80, 110 },  // left = v
-    {  40,  29,  19, 161, 180, 207,  43,  24,  91 },  // left = h
-    {  60,  34,  19, 105,  61, 198,  53,  64,  89 },  // left = d45
-    {  52,  31,  22, 158,  40, 209,  58,  62,  89 },  // left = d135
-    {  44,  31,  29, 147,  46, 158,  56, 102, 198 },  // left = d117
-    {  35,  19,  12, 135,  87, 209,  41,  45, 167 },  // left = d153
-    {  55,  25,  21, 118,  95, 215,  38,  39,  66 },  // left = d207
-    {  51,  38,  25, 113,  58, 164,  70,  93,  97 },  // left = d63
-    {  47,  54,  34, 146, 108, 203,  72, 103, 151 }   // left = tm
-  }, {  // above = d117
-    {  64,  19,  37, 156,  66, 138,  49,  95, 133 },  // left = dc
-    {  46,  27,  80, 150,  55, 124,  55, 121, 135 },  // left = v
-    {  36,  23,  27, 165, 149, 166,  54,  64, 118 },  // left = h
-    {  53,  21,  36, 131,  63, 163,  60, 109,  81 },  // left = d45
-    {  40,  26,  35, 154,  40, 185,  51,  97, 123 },  // left = d135
-    {  35,  19,  34, 179,  19,  97,  48, 129, 124 },  // left = d117
-    {  36,  20,  26, 136,  62, 164,  33,  77, 154 },  // left = d153
-    {  45,  18,  32, 130,  90, 157,  40,  79,  91 },  // left = d207
-    {  45,  26,  28, 129,  45, 129,  49, 147, 123 },  // left = d63
-    {  38,  44,  51, 136,  74, 162,  57,  97, 121 }   // left = tm
-  }, {  // above = d153
-    {  75,  17,  22, 136, 138, 185,  32,  34, 166 },  // left = dc
-    {  56,  39,  58, 133, 117, 173,  48,  53, 187 },  // left = v
-    {  35,  21,  12, 161, 212, 207,  20,  23, 145 },  // left = h
-    {  56,  29,  19, 117, 109, 181,  55,  68, 112 },  // left = d45
-    {  47,  29,  17, 153,  64, 220,  59,  51, 114 },  // left = d135
-    {  46,  16,  24, 136,  76, 147,  41,  64, 172 },  // left = d117
-    {  34,  17,  11, 108, 152, 187,  13,  15, 209 },  // left = d153
-    {  51,  24,  14, 115, 133, 209,  32,  26, 104 },  // left = d207
-    {  55,  30,  18, 122,  79, 179,  44,  88, 116 },  // left = d63
-    {  37,  49,  25, 129, 168, 164,  41,  54, 148 }   // left = tm
-  }, {  // above = d207
-    {  82,  22,  32, 127, 143, 213,  39,  41,  70 },  // left = dc
-    {  62,  44,  61, 123, 105, 189,  48,  57,  64 },  // left = v
-    {  47,  25,  17, 175, 222, 220,  24,  30,  86 },  // left = h
-    {  68,  36,  17, 106, 102, 206,  59,  74,  74 },  // left = d45
-    {  57,  39,  23, 151,  68, 216,  55,  63,  58 },  // left = d135
-    {  49,  30,  35, 141,  70, 168,  82,  40, 115 },  // left = d117
-    {  51,  25,  15, 136, 129, 202,  38,  35, 139 },  // left = d153
-    {  68,  26,  16, 111, 141, 215,  29,  28,  28 },  // left = d207
-    {  59,  39,  19, 114,  75, 180,  77, 104,  42 },  // left = d63
-    {  40,  61,  26, 126, 152, 206,  61,  59,  93 }   // left = tm
-  }, {  // above = d63
-    {  78,  23,  39, 111, 117, 170,  74, 124,  94 },  // left = dc
-    {  48,  34,  86, 101,  92, 146,  78, 179, 134 },  // left = v
-    {  47,  22,  24, 138, 187, 178,  68,  69,  59 },  // left = h
-    {  56,  25,  33, 105, 112, 187,  95, 177, 129 },  // left = d45
-    {  48,  31,  27, 114,  63, 183,  82, 116,  56 },  // left = d135
-    {  43,  28,  37, 121,  63, 123,  61, 192, 169 },  // left = d117
-    {  42,  17,  24, 109,  97, 177,  56,  76, 122 },  // left = d153
-    {  58,  18,  28, 105, 139, 182,  70,  92,  63 },  // left = d207
-    {  46,  23,  32,  74,  86, 150,  67, 183,  88 },  // left = d63
-    {  36,  38,  48,  92, 122, 165,  88, 137,  91 }   // left = tm
-  }, {  // above = tm
-    {  65,  70,  60, 155, 159, 199,  61,  60,  81 },  // left = dc
-    {  44,  78, 115, 132, 119, 173,  71, 112,  93 },  // left = v
-    {  39,  38,  21, 184, 227, 206,  42,  32,  64 },  // left = h
-    {  58,  47,  36, 124, 137, 193,  80,  82,  78 },  // left = d45
-    {  49,  50,  35, 144,  95, 205,  63,  78,  59 },  // left = d135
-    {  41,  53,  52, 148,  71, 142,  65, 128,  51 },  // left = d117
-    {  40,  36,  28, 143, 143, 202,  40,  55, 137 },  // left = d153
-    {  52,  34,  29, 129, 183, 227,  42,  35,  43 },  // left = d207
-    {  42,  44,  44, 104, 105, 164,  64, 130,  80 },  // left = d63
-    {  43,  81,  53, 140, 169, 204,  68,  84,  72 }   // left = tm
-  }
+const vp9_prob vp9_intra_mode_prob[INTRA_MODES] = {
+    227, 223, 219, 213, 204, 191, 170, 127
+};
+
+const vp9_prob vp9_intra_predictor_prob[3] = {
+    170, 192, 170
 };

 const vp9_prob vp9_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1] = {
@ -302,6 +196,10 @@ void tx_counts_to_branch_counts_8x8(const unsigned int *tx_count_8x8p,
  ct_8x8p[0][1] = tx_count_8x8p[TX_8X8];
 }

+static const vp9_prob default_txfm_partition_probs[TXFM_PARTITION_CONTEXTS] = {
+    141, 139, 175, 87, 196, 165, 177, 75, 220, 179, 205, 197
+};
+
 static const vp9_prob default_skip_probs[SKIP_CONTEXTS] = {
  192, 128, 64
 };
@ -324,6 +222,8 @@ void vp9_init_mode_probs(FRAME_CONTEXT *fc) {
  vp9_copy(fc->comp_ref_prob, default_comp_ref_p);
  vp9_copy(fc->single_ref_prob, default_single_ref_p);
  fc->tx_probs = default_tx_probs;
+  vp9_copy(fc->txfm_partition_prob, default_txfm_partition_probs);
+  vp9_copy(fc->intra_predictor_prob, vp9_intra_predictor_prob);
  vp9_copy(fc->skip_probs, default_skip_probs);
  vp9_copy(fc->inter_mode_probs, default_inter_mode_probs);
 }
@ -402,6 +302,11 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) {
    }
  }

+  for (i = 0; i < TXFM_PARTITION_CONTEXTS; ++i)
+    fc->txfm_partition_prob[i] =
+        mode_mv_merge_probs(pre_fc->txfm_partition_prob[i],
+                            counts->txfm_partition[i]);
+
  for (i = 0; i < SKIP_CONTEXTS; ++i)
    fc->skip_probs[i] = mode_mv_merge_probs(
        pre_fc->skip_probs[i], counts->skip[i]);
--- a/vp9/common/vp9_entropymode.h
+++ b/vp9/common/vp9_entropymode.h
@ -49,6 +49,8 @@ typedef struct frame_contexts {
  vp9_prob single_ref_prob[REF_CONTEXTS][2];
  vp9_prob comp_ref_prob[REF_CONTEXTS];
  struct tx_probs tx_probs;
+  vp9_prob txfm_partition_prob[TXFM_PARTITION_CONTEXTS];
+  vp9_prob intra_predictor_prob[3];
  vp9_prob skip_probs[SKIP_CONTEXTS];
  nmv_context nmvc;
  int initialized;
@ -70,12 +72,13 @@ typedef struct FRAME_COUNTS {
  unsigned int comp_ref[REF_CONTEXTS][2];
  struct tx_counts tx;
  unsigned int skip[SKIP_CONTEXTS][2];
+  unsigned int txfm_partition[TXFM_PARTITION_CONTEXTS][2];
+  unsigned int intra_predictor[2][2];
  nmv_context_counts mv;
 } FRAME_COUNTS;

+extern const vp9_prob vp9_intra_mode_prob[INTRA_MODES];
 extern const vp9_prob vp9_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
-extern const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES][INTRA_MODES]
-                                        [INTRA_MODES - 1];
 extern const vp9_prob vp9_kf_partition_probs[PARTITION_CONTEXTS]
                                            [PARTITION_TYPES - 1];
 extern const vp9_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)];
@ -97,15 +100,6 @@ void tx_counts_to_branch_counts_16x16(const unsigned int *tx_count_16x16p,
 void tx_counts_to_branch_counts_8x8(const unsigned int *tx_count_8x8p,
                                    unsigned int (*ct_8x8p)[2]);

-static INLINE const vp9_prob *get_y_mode_probs(const MODE_INFO *mi,
-                                               const MODE_INFO *above_mi,
-                                               const MODE_INFO *left_mi,
-                                               int block) {
-  const PREDICTION_MODE above = vp9_above_block_mode(mi, above_mi, block);
-  const PREDICTION_MODE left = vp9_left_block_mode(mi, left_mi, block);
-  return vp9_kf_y_mode_prob[above][left];
-}
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@ -1176,27 +1176,37 @@ void vp9_filter_block_plane_non420(VP9_COMMON *cm,
    // Determine the vertical edges that need filtering
    for (c = 0; c < MI_BLOCK_SIZE && mi_col + c < cm->mi_cols; c += col_step) {
      const MODE_INFO *mi = mi_8x8[c].src_mi;
-      const BLOCK_SIZE sb_type = mi[0].mbmi.sb_type;
-      const int skip_this = mi[0].mbmi.skip && is_inter_block(&mi[0].mbmi);
+      const MB_MODE_INFO *mbmi = &mi[0].mbmi;
+      const BLOCK_SIZE sb_type = mbmi->sb_type;
+      const int skip_this = mbmi->skip && is_inter_block(mbmi);
+      const int blk_row = r & (num_8x8_blocks_high_lookup[sb_type] - 1);
+      const int blk_col = c & (num_8x8_blocks_wide_lookup[sb_type] - 1);
      // left edge of current unit is block/partition edge -> no skip
      const int block_edge_left = (num_4x4_blocks_wide_lookup[sb_type] > 1) ?
-          !(c & (num_8x8_blocks_wide_lookup[sb_type] - 1)) : 1;
+          !blk_col : 1;
      const int skip_this_c = skip_this && !block_edge_left;
      // top edge of current unit is block/partition edge -> no skip
      const int block_edge_above = (num_4x4_blocks_high_lookup[sb_type] > 1) ?
-          !(r & (num_8x8_blocks_high_lookup[sb_type] - 1)) : 1;
+          !blk_row : 1;
      const int skip_this_r = skip_this && !block_edge_above;
-      const TX_SIZE tx_size = (plane->plane_type == PLANE_TYPE_UV)
-                            ? get_uv_tx_size(&mi[0].mbmi, plane)
-                            : mi[0].mbmi.tx_size;
+
+      TX_SIZE tx_size = (plane->plane_type == PLANE_TYPE_UV) ?
+          get_uv_tx_size_impl(mbmi->tx_size, mbmi->sb_type, ss_x, ss_y)
+          : mbmi->tx_size;
      const int skip_border_4x4_c = ss_x && mi_col + c == cm->mi_cols - 1;
      const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;

      // Filter level can vary per MI
      if (!(lfl[(r << 3) + (c >> ss_x)] =
-            get_filter_level(&cm->lf_info, &mi[0].mbmi)))
+            get_filter_level(&cm->lf_info, mbmi)))
        continue;

+      if (is_inter_block(mbmi) && !mbmi->skip)
+        tx_size = (plane->plane_type == PLANE_TYPE_UV) ?
+            get_uv_tx_size_impl(mbmi->inter_tx_size[blk_row * 8 + blk_col],
+                                mbmi->sb_type, ss_x, ss_y)
+            : mbmi->inter_tx_size[blk_row * 8 + blk_col];
+
      // Build masks based on the transform size of each block
      if (tx_size == TX_32X32) {
        if (!skip_this_c && ((c >> ss_x) & 3) == 0) {
@ -1531,19 +1541,8 @@ void vp9_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer,
                          struct macroblockd_plane planes[MAX_MB_PLANE],
                          int start, int stop, int y_only) {
  const int num_planes = y_only ? 1 : MAX_MB_PLANE;
-  enum lf_path path;
-  LOOP_FILTER_MASK lfm;
  int mi_row, mi_col;

-  if (y_only)
-    path = LF_PATH_444;
-  else if (planes[1].subsampling_y == 1 && planes[1].subsampling_x == 1)
-    path = LF_PATH_420;
-  else if (planes[1].subsampling_y == 0 && planes[1].subsampling_x == 0)
-    path = LF_PATH_444;
-  else
-    path = LF_PATH_SLOW;
-
  for (mi_row = start; mi_row < stop; mi_row += MI_BLOCK_SIZE) {
    MODE_INFO *mi = cm->mi + mi_row * cm->mi_stride;

@ -1552,25 +1551,9 @@ void vp9_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer,

      vp9_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);

-      // TODO(JBB): Make setup_mask work for non 420.
-      vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride,
-                     &lfm);
-
-      vp9_filter_block_plane_ss00(cm, &planes[0], mi_row, &lfm);
-      for (plane = 1; plane < num_planes; ++plane) {
-        switch (path) {
-          case LF_PATH_420:
-            vp9_filter_block_plane_ss11(cm, &planes[plane], mi_row, &lfm);
-            break;
-          case LF_PATH_444:
-            vp9_filter_block_plane_ss00(cm, &planes[plane], mi_row, &lfm);
-            break;
-          case LF_PATH_SLOW:
-            vp9_filter_block_plane_non420(cm, &planes[plane], mi + mi_col,
-                                          mi_row, mi_col);
-            break;
-        }
-      }
+      for (plane = 0; plane < num_planes; ++plane)
+        vp9_filter_block_plane_non420(cm, &planes[plane], mi + mi_col,
+                                      mi_row, mi_col);
    }
  }
 }
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@ -266,8 +266,9 @@ typedef struct VP9Common {
  // External BufferPool passed from outside.
  BufferPool *buffer_pool;

-  PARTITION_CONTEXT *above_seg_context;
  ENTROPY_CONTEXT *above_context;
+  PARTITION_CONTEXT *above_seg_context;
+  TXFM_CONTEXT *above_txfm_context;
 } VP9_COMMON;

 // TODO(hkuang): Don't need to lock the whole pool after implementing atomic
@ -328,6 +329,7 @@ static INLINE void init_macroblockd(VP9_COMMON *cm, MACROBLOCKD *xd) {
  }

  xd->above_seg_context = cm->above_seg_context;
+  xd->above_txfm_context = cm->above_txfm_context;
  xd->mi_stride = cm->mi_stride;
  xd->error_info = &cm->error;
 }
@ -427,6 +429,30 @@ static INLINE int partition_plane_context(const MACROBLOCKD *xd,
  return (left * 2 + above) + bsl * PARTITION_PLOFFSET;
 }

+static void txfm_partition_update(TXFM_CONTEXT *above_ctx,
+                                  TXFM_CONTEXT *left_ctx,
+                                  TX_SIZE tx_size) {
+  BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+  int bs = num_8x8_blocks_high_lookup[bsize];
+  int i;
+  for (i = 0; i < bs; ++i) {
+    above_ctx[i] = tx_size;
+    left_ctx[i] = tx_size;
+  }
+}
+
+static int max_tx_size_offset[TX_SIZES] = {0, 0, 2, 6};
+
+static int txfm_partition_context(const TXFM_CONTEXT *above_ctx,
+                                  const TXFM_CONTEXT *left_ctx,
+                                  TX_SIZE max_tx_size,
+                                  TX_SIZE tx_size) {
+  int above = *above_ctx < tx_size;
+  int left = *left_ctx < tx_size;
+  return max_tx_size_offset[max_tx_size] +
+      2 * (max_tx_size - tx_size) + (above || left);
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
--- a/vp9/common/vp9_thread_common.c
+++ b/vp9/common/vp9_thread_common.c
@ -138,7 +138,6 @@ void thread_loop_filter_rows(const YV12_BUFFER_CONFIG *const frame_buffer,
            break;
        }
      }
-
      sync_write(lf_sync, r, c, sb_cols);
    }
  }
@ -400,6 +399,10 @@ void vp9_accumulate_frame_counts(VP9_COMMON *cm, FRAME_COUNTS *counts,
  for (i = 0; i < TX_SIZES; i++)
    cm->counts.tx.tx_totals[i] += counts->tx.tx_totals[i];

+  for (i = 0; i < TXFM_PARTITION_CONTEXTS; ++i)
+    for (j = 0; j < 2; ++j)
+      cm->counts.txfm_partition[i][j] += counts->txfm_partition[i][j];
+
  for (i = 0; i < SKIP_CONTEXTS; i++)
    for (j = 0; j < 2; j++)
      cm->counts.skip[i][j] += counts->skip[i][j];
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@ -338,6 +338,59 @@ struct inter_args {
  const int16_t *const uv_dequant;
 };

+// Recursively reconstructs one transform block of an inter-coded block.
+// Starting from tx_size, the block is split into four quadrants until the
+// size matches the transform size recorded in mbmi->inter_tx_size for this
+// position; the coefficients are then decoded and the inverse transform is
+// applied to the plane's destination buffer.
+//   blk_row, blk_col: position inside the plane block, in 4x4 units.
+//   block: block index handed to the token decoder and inverse transform.
+//   arg: pointer to a struct inter_args; *args->eobtotal accumulates the eob
+//        of every decoded transform block.
+static void decode_reconstruct_tx(int blk_row, int blk_col,
+                                  int plane, int block,
+                                  TX_SIZE tx_size, BLOCK_SIZE plane_bsize,
+                                  void *arg) {
+  struct inter_args *args = (struct inter_args *)arg;
+  VP9_COMMON *const cm = args->cm;
+  MACROBLOCKD *const xd = args->xd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  // inter_tx_size is addressed with 8 entries per row; the row/column shift
+  // depends on the plane's subsampling.
+  int tx_idx = (blk_row >> (1 - pd->subsampling_y)) * 8 +
+               (blk_col >> (1 - pd->subsampling_x));
+  // Non-zero planes derive their transform size through
+  // get_uv_tx_size_impl(); plane 0 uses the stored size directly.
+  TX_SIZE plane_tx_size = plane ?
+      get_uv_tx_size_impl(mbmi->inter_tx_size[tx_idx], plane_bsize, 0, 0) :
+      mbmi->inter_tx_size[tx_idx];
+
+  int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
+  int max_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
+
+  // Shrink the limits when the mb_to_*_edge offsets are negative.
+  // NOTE(review): presumably a negative offset means the block extends past
+  // the frame edge -- confirm.
+  if (xd->mb_to_bottom_edge < 0)
+    max_blocks_high += xd->mb_to_bottom_edge >> (5 + pd->subsampling_y);
+  if (xd->mb_to_right_edge < 0)
+    max_blocks_wide += xd->mb_to_right_edge >> (5 + pd->subsampling_x);
+
+  // Nothing is decoded for positions outside the limits.
+  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide)
+    return;
+
+  if (tx_size == plane_tx_size) {
+    const int16_t *const dequant = (plane == 0) ? args->y_dequant
+                                                : args->uv_dequant;
+    int eob;
+    eob = vp9_decode_block_tokens(cm, xd, args->counts, plane, block,
+                                  plane_bsize, blk_col, blk_row,
+                                  tx_size, args->r, dequant);
+    inverse_transform_block(xd, plane, block, tx_size,
+                            &pd->dst.buf[4 * blk_row * pd->dst.stride +
+                                         4 * blk_col],
+                            pd->dst.stride, eob);
+    *args->eobtotal += eob;
+  } else {
+    // Sizes differ: visit the four quadrants in raster order. Each quadrant
+    // is bh / 2 4x4 units away and its block index advances by step.
+    BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+    int bh = num_4x4_blocks_high_lookup[bsize];
+    int step = 1 << (2 *(tx_size - 1));
+    int i;
+    for (i = 0; i < 4; ++i) {
+      int offsetr = (i >> 1) * bh / 2;
+      int offsetc = (i & 0x01) * bh / 2;
+      decode_reconstruct_tx(blk_row + offsetr, blk_col + offsetc,
+                            plane, block + i * step, tx_size - 1,
+                            plane_bsize, arg);
+    }
+  }
+}
+
 static void reconstruct_inter_block(int plane, int block,
                                    BLOCK_SIZE plane_bsize,
                                    TX_SIZE tx_size, void *arg) {
@ -370,6 +423,7 @@ static MB_MODE_INFO *set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd,
  xd->mi = cm->mi + offset;
  xd->mi[0].src_mi = &xd->mi[0];  // Point to self.
  xd->mi[0].mbmi.sb_type = bsize;
+  xd->mi[0].mbmi.max_tx_size = max_txsize_lookup[bsize];

  for (y = 0; y < y_mis; ++y)
    for (x = !y; x < x_mis; ++x) {
@ -425,12 +479,32 @@ static void decode_block(VP9Decoder *const pbi, MACROBLOCKD *const xd,
      int eobtotal = 0;
      struct inter_args arg = {cm, xd, r, counts, &eobtotal, y_dequant,
                               uv_dequant};
-      vp9_foreach_transformed_block(xd, bsize, reconstruct_inter_block, &arg);
+      int plane;
+      for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+        const struct macroblockd_plane *const pd = &xd->plane[plane];
+        const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+        const int mi_width = num_4x4_blocks_wide_lookup[plane_bsize];
+        const int mi_height = num_4x4_blocks_high_lookup[plane_bsize];
+        BLOCK_SIZE txb_size = txsize_to_bsize[max_txsize_lookup[plane_bsize]];
+        int bh = num_4x4_blocks_wide_lookup[txb_size];
+        int idx, idy;
+        int block = 0;
+        int step = 1 << (max_txsize_lookup[plane_bsize] * 2);
+
+        for (idy = 0; idy < mi_height; idy += bh) {
+          for (idx = 0; idx < mi_width; idx += bh) {
+            decode_reconstruct_tx(idy, idx, plane, block,
+                                  max_txsize_lookup[plane_bsize],
+                                  plane_bsize, &arg);
+            block += step;
+          }
+        }
+      }
+
      if (!less8x8 && eobtotal == 0)
        mbmi->skip = 1;  // skip loopfilter
    }
  }
-
  xd->corrupted |= vp9_reader_has_error(r);
 }

@ -943,9 +1017,10 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
  // are allocated as part of the same buffer.
  vpx_memset(cm->above_context, 0,
             sizeof(*cm->above_context) * MAX_MB_PLANE * 2 * aligned_cols);
-
  vpx_memset(cm->above_seg_context, 0,
             sizeof(*cm->above_seg_context) * aligned_cols);
+  vpx_memset(cm->above_txfm_context, 0,
+             sizeof(*cm->above_txfm_context) * aligned_cols);

  get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows, tile_buffers);

@ -988,6 +1063,7 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
        vp9_tile_set_col(&tile, tile_data->cm, col);
        vp9_zero(tile_data->xd.left_context);
        vp9_zero(tile_data->xd.left_seg_context);
+        vp9_zero(tile_data->xd.left_txfm_context_buffer);
        for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end;
             mi_col += MI_BLOCK_SIZE) {
          decode_partition(pbi, &tile_data->xd, &cm->counts, &tile, mi_row,
@ -1061,6 +1137,7 @@ static int tile_worker_hook(TileWorkerData *const tile_data,
       mi_row += MI_BLOCK_SIZE) {
    vp9_zero(tile_data->xd.left_context);
    vp9_zero(tile_data->xd.left_seg_context);
+    vp9_zero(tile_data->xd.left_txfm_context_buffer);
    for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
         mi_col += MI_BLOCK_SIZE) {
      decode_partition(tile_data->pbi, &tile_data->xd, &tile_data->counts,
@ -1146,6 +1223,8 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi,
             sizeof(*cm->above_context) * MAX_MB_PLANE * 2 * aligned_mi_cols);
  vpx_memset(cm->above_seg_context, 0,
             sizeof(*cm->above_seg_context) * aligned_mi_cols);
+  vpx_memset(cm->above_txfm_context, 0,
+             sizeof(*cm->above_txfm_context) * aligned_mi_cols);

  // Load tile data into tile_buffers
  get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows, tile_buffers);
@ -1516,6 +1595,12 @@ static int read_compressed_header(VP9Decoder *pbi, const uint8_t *data,
    read_tx_mode_probs(&fc->tx_probs, &r);
  read_coef_probs(fc, cm->tx_mode, &r);

+  for (k = 0; k < TXFM_PARTITION_CONTEXTS; ++k)
+    vp9_diff_update_prob(&r, &fc->txfm_partition_prob[k]);
+
+  for (k = 0; k < 3; ++k)
+    vp9_diff_update_prob(&r, &fc->intra_predictor_prob[k]);
+
  for (k = 0; k < SKIP_CONTEXTS; ++k)
    vp9_diff_update_prob(&r, &fc->skip_probs[k]);

--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@ -23,6 +23,51 @@
 #include "vp9/decoder/vp9_decodeframe.h"
 #include "vp9/decoder/vp9_reader.h"

+// Reads an intra prediction mode, using the modes returned by
+// vp9_above_block_mode() and vp9_left_block_mode() as predictors.
+// Flags coded with cm->fc->intra_predictor_prob[] signal whether the mode
+// equals a predictor. Otherwise the remaining modes are tried in ascending
+// order from DC_PRED, one binary decision each using vp9_intra_mode_prob[],
+// and the last remaining candidate is inferred without reading a bit.
+static PREDICTION_MODE read_intra_mode_exp(const VP9_COMMON *cm,
+                                           vp9_reader *r, const MODE_INFO *mi,
+                                           const MODE_INFO *above_mi,
+                                           const MODE_INFO *left_mi,
+                                           int block) {
+  const PREDICTION_MODE above = vp9_above_block_mode(mi, above_mi, block);
+  const PREDICTION_MODE left = vp9_left_block_mode(mi, left_mi, block);
+  PREDICTION_MODE i;
+  // Number of candidates rejected so far; also indexes vp9_intra_mode_prob.
+  int count = 0;
+
+  if (above == left) {
+    // Single predictor: one flag selects it.
+    if (vp9_read(r, cm->fc->intra_predictor_prob[0]))
+      return above;
+    for (i = DC_PRED; i < INTRA_MODES - 1; ++i) {
+      if (i == above)
+        continue;
+      if (vp9_read(r, vp9_intra_mode_prob[count]))
+        return i;
+      ++count;
+      // After INTRA_MODES - 2 rejections only one candidate is left: the
+      // next mode that is not the predictor.
+      if (count == INTRA_MODES - 2)
+        return (i + 1) == above ? (i + 2) : (i + 1);
+    }
+    // NOTE(review): appears unreachable for a valid `above`, since the loop
+    // returns first -- confirm.
+    return (INTRA_MODES - 1);
+  } else {
+    // Two distinct predictors: one flag each, above first.
+    if (vp9_read(r, cm->fc->intra_predictor_prob[1]))
+      return above;
+    if (vp9_read(r, cm->fc->intra_predictor_prob[2]))
+      return left;
+
+    for (i = DC_PRED; i < INTRA_MODES - 1; ++i) {
+      if (i == above || i == left)
+        continue;
+      // Probabilities are taken from index count + 1 in this branch.
+      if (vp9_read(r, vp9_intra_mode_prob[count + 1]))
+        return i;
+      ++count;
+      // Stop after INTRA_MODES - 3 rejections; one candidate remains.
+      if (count == INTRA_MODES - 3)
+        break;
+    }
+    // The remaining candidate is the next mode after i that is neither
+    // predictor.
+    for (++i; i <= INTRA_MODES - 1; ++i)
+      if (i != above && i != left)
+        return i;
+    return (INTRA_MODES - 1);
+  }
+}
+
 static PREDICTION_MODE read_intra_mode(vp9_reader *r, const vp9_prob *p) {
  return (PREDICTION_MODE)vp9_read_tree(r, vp9_intra_mode_tree, p);
 }
@ -60,6 +105,62 @@ static int read_segment_id(vp9_reader *r, const struct segmentation *seg) {
  return vp9_read_tree(r, vp9_segment_tree, seg->tree_probs);
 }

+// Reads the transform partition of one transform block of an inter block.
+// A flag coded with cm->fc->txfm_partition_prob[ctx] tells whether the block
+// at (blk_row, blk_col), given in 4x4 units, is split:
+//   - not split: tx_size is stored in mbmi->inter_tx_size for the covered
+//     entries and in mbmi->tx_size, and the above/left contexts are updated;
+//   - split at TX_8X8: TX_4X4 is stored and no further flag is read;
+//   - split otherwise: the four quadrants are read recursively at
+//     tx_size - 1.
+// counts->txfm_partition[ctx][flag] is incremented for every flag read.
+// Positions outside the block limits return without reading anything.
+static void read_tx_size_inter(VP9_COMMON *cm, MACROBLOCKD *xd,
+                               FRAME_COUNTS *counts, TX_SIZE tx_size,
+                               int blk_row, int blk_col, vp9_reader *r) {
+  MB_MODE_INFO *mbmi = &xd->mi[0].src_mi->mbmi;
+  int is_split = 0;
+  // inter_tx_size is addressed with 8 entries per row, one per 2x2 group of
+  // 4x4 units.
+  int tx_idx = (blk_row / 2) * 8 + (blk_col / 2);
+  int max_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type];
+  int max_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type];
+  BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+  int bh = num_4x4_blocks_high_lookup[bsize];
+  int i, j;
+  int ctx = txfm_partition_context(xd->above_txfm_context + (blk_col / 2),
+                                   xd->left_txfm_context + (blk_row / 2),
+                                   mbmi->max_tx_size,
+                                   tx_size);
+  // Shrink the limits when the mb_to_*_edge offsets are negative.
+  if (xd->mb_to_bottom_edge < 0)
+    max_blocks_high += xd->mb_to_bottom_edge >> 5;
+  if (xd->mb_to_right_edge < 0)
+    max_blocks_wide += xd->mb_to_right_edge >> 5;
+
+  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide)
+    return;
+
+  is_split = vp9_read(r, cm->fc->txfm_partition_prob[ctx]);
+
+  if (!is_split) {
+    mbmi->inter_tx_size[tx_idx] = tx_size;
+    for (j = 0; j < bh / 2; ++j)
+      for (i = 0; i < bh / 2; ++i)
+        mbmi->inter_tx_size[tx_idx + j * 8 + i] = tx_size;
+    mbmi->tx_size = tx_size;
+    ++counts->txfm_partition[ctx][0];
+    txfm_partition_update(xd->above_txfm_context + (blk_col / 2),
+                          xd->left_txfm_context + (blk_row / 2), tx_size);
+  } else {
+    // bsize, bh and i from the function scope are reused here; they hold the
+    // values this branch needs, so no shadowing redeclaration is required.
+    ++counts->txfm_partition[ctx][1];
+    if (tx_size == TX_8X8) {
+      // An 8x8 split is terminal: record TX_4X4 and stop.
+      mbmi->inter_tx_size[tx_idx] = TX_4X4;
+      mbmi->tx_size = TX_4X4;
+      txfm_partition_update(xd->above_txfm_context + (blk_col / 2),
+                            xd->left_txfm_context + (blk_row / 2), TX_4X4);
+      return;
+    }
+
+    // Visit the four quadrants in raster order.
+    for (i = 0; i < 4; ++i) {
+      int offsetr = (i >> 1) * bh / 2;
+      int offsetc = (i & 0x01) * bh / 2;
+      read_tx_size_inter(cm, xd, counts, tx_size - 1,
+                         blk_row + offsetr, blk_col + offsetc, r);
+    }
+  }
+}
+
 static TX_SIZE read_selected_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd,
                                     FRAME_COUNTS *counts,
                                     TX_SIZE max_tx_size, vp9_reader *r) {
@ -208,24 +309,24 @@ static void read_intra_frame_mode_info(VP9_COMMON *const cm,
    case BLOCK_4X4:
      for (i = 0; i < 4; ++i)
        mi->bmi[i].as_mode =
-            read_intra_mode(r, get_y_mode_probs(mi, above_mi, left_mi, i));
+            read_intra_mode_exp(cm, r, mi, above_mi, left_mi, i);
      mbmi->mode = mi->bmi[3].as_mode;
      break;
    case BLOCK_4X8:
      mi->bmi[0].as_mode = mi->bmi[2].as_mode =
-          read_intra_mode(r, get_y_mode_probs(mi, above_mi, left_mi, 0));
+          read_intra_mode_exp(cm, r, mi, above_mi, left_mi, 0);
      mi->bmi[1].as_mode = mi->bmi[3].as_mode = mbmi->mode =
-          read_intra_mode(r, get_y_mode_probs(mi, above_mi, left_mi, 1));
+          read_intra_mode_exp(cm, r, mi, above_mi, left_mi, 1);
      break;
    case BLOCK_8X4:
      mi->bmi[0].as_mode = mi->bmi[1].as_mode =
-          read_intra_mode(r, get_y_mode_probs(mi, above_mi, left_mi, 0));
+          read_intra_mode_exp(cm, r, mi, above_mi, left_mi, 0);
      mi->bmi[2].as_mode = mi->bmi[3].as_mode = mbmi->mode =
-          read_intra_mode(r, get_y_mode_probs(mi, above_mi, left_mi, 2));
+          read_intra_mode_exp(cm, r, mi, above_mi, left_mi, 2);
      break;
    default:
-      mbmi->mode = read_intra_mode(r,
-                                   get_y_mode_probs(mi, above_mi, left_mi, 0));
+      mbmi->mode =
+          read_intra_mode_exp(cm, r, mi, above_mi, left_mi, 0);
  }

  mbmi->uv_mode = read_intra_mode(r, vp9_kf_uv_mode_prob[mbmi->mode]);
@ -569,13 +670,63 @@ static void read_inter_frame_mode_info(VP9Decoder *const pbi,
  MODE_INFO *const mi = xd->mi[0].src_mi;
  MB_MODE_INFO *const mbmi = &mi->mbmi;
  int inter_block;
+  BLOCK_SIZE bsize = mbmi->sb_type;

  mbmi->mv[0].as_int = 0;
  mbmi->mv[1].as_int = 0;
  mbmi->segment_id = read_inter_segment_id(cm, xd, mi_row, mi_col, r);
  mbmi->skip = read_skip(cm, xd, counts, mbmi->segment_id, r);
  inter_block = read_is_inter_block(cm, xd, counts, mbmi->segment_id, r);
-  mbmi->tx_size = read_tx_size(cm, xd, counts, !mbmi->skip || !inter_block, r);
+  xd->above_txfm_context = cm->above_txfm_context + mi_col;
+  xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & 0x07);
+
+  if (mbmi->sb_type >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT &&
+      !mbmi->skip && inter_block) {
+    BLOCK_SIZE txb_size = txsize_to_bsize[mbmi->max_tx_size];
+    int bh = num_4x4_blocks_wide_lookup[txb_size];
+    int width  = num_4x4_blocks_wide_lookup[bsize];
+    int height = num_4x4_blocks_high_lookup[bsize];
+    int idx, idy;
+    for (idy = 0; idy < height; idy += bh)
+      for (idx = 0; idx < width; idx += bh)
+        read_tx_size_inter(cm, xd, counts, mbmi->max_tx_size,
+                           idy, idx, r);
+  } else {
+    int i;
+    mbmi->tx_size = read_tx_size(cm, xd, counts,
+                                 !mbmi->skip || !inter_block, r);
+    for (i = 0; i < 64; ++i)
+      mbmi->inter_tx_size[i] = mbmi->tx_size;
+  }
+
+  if (mbmi->sb_type < BLOCK_8X8)
+    txfm_partition_update(xd->above_txfm_context, xd->left_txfm_context,
+                          TX_4X4);
+
+  if (inter_block) {
+    if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT && mbmi->skip) {
+      BLOCK_SIZE txb_size = txsize_to_bsize[mbmi->max_tx_size];
+      int bh = num_4x4_blocks_wide_lookup[txb_size];
+      int width  = num_4x4_blocks_wide_lookup[bsize];
+      int height = num_4x4_blocks_high_lookup[bsize];
+      int idx, idy;
+      for (idy = 0; idy < height; idy += bh)
+        for (idx = 0; idx < width; idx += bh)
+          txfm_partition_update(xd->above_txfm_context + (idx / 2),
+                                xd->left_txfm_context + (idy / 2),
+                                mbmi->max_tx_size);
+    }
+  } else {
+    BLOCK_SIZE txb_size = txsize_to_bsize[mbmi->max_tx_size];
+    int bh = num_4x4_blocks_wide_lookup[txb_size];
+    int width  = num_4x4_blocks_wide_lookup[bsize];
+    int height = num_4x4_blocks_high_lookup[bsize];
+    int idx, idy;
+    for (idy = 0; idy < height; idy += bh)
+      for (idx = 0; idx < width; idx += bh)
+        txfm_partition_update(xd->above_txfm_context + (idx / 2),
+                              xd->left_txfm_context + (idy / 2), mbmi->tx_size);
+  }

  if (inter_block)
    read_inter_block_mode_info(pbi, xd, counts, tile, mi, mi_row, mi_col, r);
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@ -44,6 +44,53 @@ static const struct vp9_token partition_encodings[PARTITION_TYPES] =
 static const struct vp9_token inter_mode_encodings[INTER_MODES] =
  {{2, 2}, {6, 3}, {0, 1}, {7, 3}};

+// Writes an intra prediction mode, using the modes returned by
+// vp9_above_block_mode() and vp9_left_block_mode() as predictors.
+// Flags coded with cm->fc->intra_predictor_prob[] signal whether `mode`
+// equals a predictor. Otherwise the remaining modes are visited in ascending
+// order from DC_PRED and one binary decision (i == mode) is written per
+// candidate using vp9_intra_mode_prob[]; nothing is written for the last
+// remaining candidate.
+static void write_intra_mode_exp(const VP9_COMMON *cm,
+                                 vp9_writer *w, const MODE_INFO *mi,
+                                 const MODE_INFO *above_mi,
+                                 const MODE_INFO *left_mi, int block,
+                                 PREDICTION_MODE mode) {
+  const PREDICTION_MODE above = vp9_above_block_mode(mi, above_mi, block);
+  const PREDICTION_MODE left = vp9_left_block_mode(mi, left_mi, block);
+  PREDICTION_MODE i;
+  // Number of decisions written so far; also indexes vp9_intra_mode_prob.
+  int count = 0;
+
+  if (above == left) {
+    // Single predictor: one flag says whether it was chosen.
+    vp9_write(w, mode == above, cm->fc->intra_predictor_prob[0]);
+    if (mode == above)
+      return;
+
+    for (i = DC_PRED; i < INTRA_MODES - 1; ++i) {
+      if (i == above)
+        continue;
+      vp9_write(w, i == mode, vp9_intra_mode_prob[count]);
+      ++count;
+      if (i == mode)
+        return;
+      // After INTRA_MODES - 2 decisions the final candidate is implied.
+      if (count == INTRA_MODES - 2)
+        return;
+    }
+  } else {
+    // above and left reference modes differ
+    vp9_write(w, mode == above, cm->fc->intra_predictor_prob[1]);
+    if (mode == above)
+      return;
+    vp9_write(w, mode == left, cm->fc->intra_predictor_prob[2]);
+    if (mode == left)
+      return;
+
+    for (i = DC_PRED; i < INTRA_MODES - 1; ++i) {
+      if (i == above || i == left)
+        continue;
+      // Probabilities are taken from index count + 1 in this branch.
+      vp9_write(w, i == mode, vp9_intra_mode_prob[count + 1]);
+      ++count;
+      if (i == mode)
+        return;
+      // After INTRA_MODES - 3 decisions the final candidate is implied.
+      if (count == INTRA_MODES - 3)
+        return;
+    }
+  }
+}
+
 static void write_intra_mode(vp9_writer *w, PREDICTION_MODE mode,
                             const vp9_prob *probs) {
  vp9_write_token(w, vp9_intra_mode_tree, probs, &intra_mode_encodings[mode]);
@ -76,6 +123,52 @@ static void prob_diff_update(const vp9_tree_index *tree,
    vp9_cond_prob_diff_update(w, &probs[i], branch_ct[i]);
 }

+// Writes the transform partition flags for one transform block of an inter
+// block. A 0 is written when tx_size equals the size stored in
+// mbmi->inter_tx_size at (blk_row, blk_col), given in 4x4 units; otherwise a
+// 1 is written and, unless tx_size is TX_8X8, the four quadrants are written
+// recursively at tx_size - 1. The above/left transform contexts are updated
+// whenever a size is settled. Positions outside the block limits write
+// nothing.
+static void write_tx_size_inter(const VP9_COMMON *cm, MACROBLOCKD *xd,
+                                TX_SIZE tx_size, int blk_row, int blk_col,
+                                vp9_writer *w) {
+  MB_MODE_INFO *mbmi = &xd->mi[0].src_mi->mbmi;
+  // inter_tx_size is addressed with 8 entries per row, one per 2x2 group of
+  // 4x4 units.
+  int tx_idx = (blk_row / 2) * 8 + (blk_col / 2);
+  int max_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type];
+  int max_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type];
+  int ctx = txfm_partition_context(xd->above_txfm_context + (blk_col / 2),
+                                   xd->left_txfm_context + (blk_row / 2),
+                                   mbmi->max_tx_size,
+                                   tx_size);
+  // Shrink the limits when the mb_to_*_edge offsets are negative.
+  if (xd->mb_to_bottom_edge < 0)
+    max_blocks_high += xd->mb_to_bottom_edge >> 5;
+  if (xd->mb_to_right_edge < 0)
+    max_blocks_wide += xd->mb_to_right_edge >> 5;
+
+  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide)
+    return;
+
+  // TODO(jingning): this assumes support of the possible 64x64 transform.
+  if (tx_size == mbmi->inter_tx_size[tx_idx]) {
+    vp9_write(w, 0, cm->fc->txfm_partition_prob[ctx]);
+    txfm_partition_update(xd->above_txfm_context + (blk_col / 2),
+                          xd->left_txfm_context + (blk_row / 2), tx_size);
+  } else {  // further split
+    BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+    int bh = num_4x4_blocks_high_lookup[bsize];
+    int i;
+
+    vp9_write(w, 1, cm->fc->txfm_partition_prob[ctx]);
+
+    // An 8x8 split is terminal: the contexts are set to TX_4X4 and no
+    // further flag is written.
+    if (tx_size == TX_8X8) {
+      txfm_partition_update(xd->above_txfm_context + (blk_col / 2),
+                            xd->left_txfm_context + (blk_row / 2), TX_4X4);
+      return;
+    }
+
+    // Visit the four quadrants in raster order.
+    for (i = 0; i < 4; ++i) {
+      int offsetr = (i >> 1) * bh / 2;
+      int offsetc = (i & 0x01) * bh / 2;
+        write_tx_size_inter(cm, xd, tx_size - 1,
+                            blk_row + offsetr, blk_col + offsetc, w);
+    }
+  }
+}
+
 static void write_selected_tx_size(const VP9_COMMON *cm,
                                   const MACROBLOCKD *xd, vp9_writer *w) {
  TX_SIZE tx_size = xd->mi[0].src_mi->mbmi.tx_size;
@ -91,6 +184,22 @@ static void write_selected_tx_size(const VP9_COMMON *cm,
  }
 }

+// Calls vp9_cond_prob_diff_update() once per transform partition context,
+// passing that context's probability in cm->fc and its entry in
+// counts->txfm_partition.
+static void update_txfm_partition_probs(VP9_COMMON *cm, vp9_writer *w,
+                                        FRAME_COUNTS *counts) {
+  int ctx = 0;
+  while (ctx < TXFM_PARTITION_CONTEXTS) {
+    vp9_cond_prob_diff_update(w, &cm->fc->txfm_partition_prob[ctx],
+                              counts->txfm_partition[ctx]);
+    ++ctx;
+  }
+}
+
+// Calls vp9_cond_prob_diff_update() for each of the three intra predictor
+// probabilities in cm->fc, passing the matching entry of
+// counts->intra_predictor.
+static void update_intra_predictor_probs(VP9_COMMON *cm, vp9_writer *w,
+                                         FRAME_COUNTS *counts) {
+  int idx = 0;
+  while (idx < 3) {
+    vp9_cond_prob_diff_update(w, &cm->fc->intra_predictor_prob[idx],
+                              counts->intra_predictor[idx]);
+    ++idx;
+  }
+}
+
 static int write_skip(const VP9_COMMON *cm, const MACROBLOCKD *xd,
                      int segment_id, const MODE_INFO *mi, vp9_writer *w) {
  if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
@ -238,8 +347,8 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
                                vp9_writer *w) {
  VP9_COMMON *const cm = &cpi->common;
  const nmv_context *nmvc = &cm->fc->nmvc;
-  const MACROBLOCK *const x = &cpi->td.mb;
-  const MACROBLOCKD *const xd = &x->e_mbd;
+  MACROBLOCK *x = &cpi->td.mb;
+  MACROBLOCKD *xd = &x->e_mbd;
  const struct segmentation *const seg = &cm->seg;
  const MB_MODE_INFO *const mbmi = &mi->mbmi;
  const PREDICTION_MODE mode = mbmi->mode;
@ -269,7 +378,47 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,

  if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT &&
      !(is_inter && skip)) {
-    write_selected_tx_size(cm, xd, w);
+    if (!is_inter) {
+      write_selected_tx_size(cm, xd, w);
+    } else {
+      BLOCK_SIZE txb_size = txsize_to_bsize[mbmi->max_tx_size];
+      int bh = num_4x4_blocks_wide_lookup[txb_size];
+      int width  = num_4x4_blocks_wide_lookup[bsize];
+      int height = num_4x4_blocks_high_lookup[bsize];
+      int idx, idy;
+      for (idy = 0; idy < height; idy += bh)
+        for (idx = 0; idx < width; idx += bh)
+          write_tx_size_inter(cm, xd, mbmi->max_tx_size, idy, idx, w);
+    }
+  }
+
+  if (bsize < BLOCK_8X8)
+    txfm_partition_update(xd->above_txfm_context,
+                          xd->left_txfm_context, TX_4X4);
+
+  if (is_inter) {
+    if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT && skip) {
+      BLOCK_SIZE txb_size = txsize_to_bsize[mbmi->max_tx_size];
+      int bh = num_4x4_blocks_wide_lookup[txb_size];
+      int width  = num_4x4_blocks_wide_lookup[bsize];
+      int height = num_4x4_blocks_high_lookup[bsize];
+      int idx, idy;
+      for (idy = 0; idy < height; idy += bh)
+        for (idx = 0; idx < width; idx += bh)
+          txfm_partition_update(xd->above_txfm_context + (idx / 2),
+                                xd->left_txfm_context + (idy / 2),
+                                mbmi->max_tx_size);
+    }
+  } else {
+    BLOCK_SIZE txb_size = txsize_to_bsize[mbmi->max_tx_size];
+    int bh = num_4x4_blocks_wide_lookup[txb_size];
+    int width  = num_4x4_blocks_wide_lookup[bsize];
+    int height = num_4x4_blocks_high_lookup[bsize];
+    int idx, idy;
+    for (idy = 0; idy < height; idy += bh)
+      for (idx = 0; idx < width; idx += bh)
+        txfm_partition_update(xd->above_txfm_context + (idx / 2),
+                              xd->left_txfm_context + (idy / 2), mbmi->tx_size);
  }

  if (!is_inter) {
@ -355,7 +504,7 @@ static void write_mb_modes_kf(const VP9_COMMON *cm, const MACROBLOCKD *xd,
    write_selected_tx_size(cm, xd, w);

  if (bsize >= BLOCK_8X8) {
-    write_intra_mode(w, mbmi->mode, get_y_mode_probs(mi, above_mi, left_mi, 0));
+    write_intra_mode_exp(cm, w, mi, above_mi, left_mi, 0, mbmi->mode);
  } else {
    const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
    const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
@ -364,8 +513,8 @@ static void write_mb_modes_kf(const VP9_COMMON *cm, const MACROBLOCKD *xd,
    for (idy = 0; idy < 2; idy += num_4x4_h) {
      for (idx = 0; idx < 2; idx += num_4x4_w) {
        const int block = idy * 2 + idx;
-        write_intra_mode(w, mi->bmi[block].as_mode,
-                         get_y_mode_probs(mi, above_mi, left_mi, block));
+        write_intra_mode_exp(cm, w, mi, above_mi, left_mi, block,
+                             mi->bmi[block].as_mode);
      }
    }
  }
@ -391,6 +540,8 @@ static void write_modes_b(VP9_COMP *cpi, const TileInfo *const tile,
  if (frame_is_intra_only(cm)) {
    write_mb_modes_kf(cm, xd, xd->mi, w);
  } else {
+    xd->above_txfm_context = cm->above_txfm_context + mi_col;
+    xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & 0x07);
    pack_inter_mode_mvs(cpi, m, w);
  }

@ -487,6 +638,7 @@ static void write_modes(VP9_COMP *cpi,
  for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
       mi_row += MI_BLOCK_SIZE) {
    vp9_zero(xd->left_seg_context);
+    vp9_zero(xd->left_txfm_context_buffer);
    for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
         mi_col += MI_BLOCK_SIZE)
      write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col,
@ -930,6 +1082,8 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) {

  vpx_memset(cm->above_seg_context, 0, sizeof(*cm->above_seg_context) *
             mi_cols_aligned_to_sb(cm->mi_cols));
+  vpx_memset(cm->above_txfm_context, 0, sizeof(*cm->above_txfm_context) *
+             mi_cols_aligned_to_sb(cm->mi_cols));

  for (tile_row = 0; tile_row < tile_rows; tile_row++) {
    for (tile_col = 0; tile_col < tile_cols; tile_col++) {
@ -1158,6 +1312,8 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
    encode_txfm_probs(cm, &header_bc, counts);

  update_coef_probs(cpi, &header_bc);
+  update_txfm_partition_probs(cm, &header_bc, counts);
+  update_intra_predictor_probs(cm, &header_bc, counts);
  update_skip_probs(cm, &header_bc, counts);

  if (!frame_is_intra_only(cm)) {
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@ -95,6 +95,7 @@ struct macroblock {

  uint8_t zcoeff_blk[TX_SIZES][256];
  int skip;
+  uint8_t blk_skip[MAX_MB_PLANE][256];

  int encode_breakout;

--- a/vp9/encoder/vp9_context_tree.c
+++ b/vp9/encoder/vp9_context_tree.c
@ -28,6 +28,8 @@ static void alloc_mode_context(VP9_COMMON *cm, int num_4x4_blk,
  CHECK_MEM_ERROR(cm, ctx->zcoeff_blk,
                  vpx_calloc(num_4x4_blk, sizeof(uint8_t)));
  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    CHECK_MEM_ERROR(cm, ctx->blk_skip[i],
+                    vpx_calloc(num_4x4_blk, sizeof(uint8_t)));
    for (k = 0; k < 3; ++k) {
      CHECK_MEM_ERROR(cm, ctx->coeff[i][k],
                      vpx_memalign(16, num_pix * sizeof(*ctx->coeff[i][k])));
@ -50,6 +52,8 @@ static void free_mode_context(PICK_MODE_CONTEXT *ctx) {
  vpx_free(ctx->zcoeff_blk);
  ctx->zcoeff_blk = 0;
  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    vpx_free(ctx->blk_skip[i]);
+    ctx->blk_skip[i] = 0;
    for (k = 0; k < 3; ++k) {
      vpx_free(ctx->coeff[i][k]);
      ctx->coeff[i][k] = 0;
--- a/vp9/encoder/vp9_context_tree.h
+++ b/vp9/encoder/vp9_context_tree.h
@ -21,6 +21,7 @@ struct ThreadData;
 typedef struct {
  MODE_INFO mic;
  uint8_t *zcoeff_blk;
+  uint8_t *blk_skip[MAX_MB_PLANE];
  tran_low_t *coeff[MAX_MB_PLANE][3];
  tran_low_t *qcoeff[MAX_MB_PLANE][3];
  tran_low_t *dqcoeff[MAX_MB_PLANE][3];
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@ -893,6 +893,10 @@ static void update_state(VP9_COMP *cpi, ThreadData *td,
  vpx_memcpy(x->zcoeff_blk[mbmi->tx_size], ctx->zcoeff_blk,
             sizeof(uint8_t) * ctx->num_4x4_blk);

+  for (i = 0; i < MAX_MB_PLANE; ++i)
+    vpx_memcpy(x->blk_skip[i], ctx->blk_skip[i],
+               sizeof(uint8_t) * ctx->num_4x4_blk);
+
  if (!output_enabled)
    return;

@ -983,6 +987,7 @@ static void set_mode_info_seg_skip(MACROBLOCK *x, TX_MODE tx_mode,
  mbmi->mode = ZEROMV;
  mbmi->tx_size = MIN(max_txsize_lookup[bsize],
                      tx_mode_to_biggest_tx_size[tx_mode]);
+  mbmi->max_tx_size = max_txsize_lookup[bsize];
  mbmi->skip = 1;
  mbmi->uv_mode = DC_PRED;
  mbmi->ref_frame[0] = LAST_FRAME;
@ -1031,6 +1036,7 @@ static void rd_pick_sb_modes(VP9_COMP *cpi,
  set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
  mbmi = &xd->mi[0].src_mi->mbmi;
  mbmi->sb_type = bsize;
+  mbmi->max_tx_size = max_txsize_lookup[bsize];

  for (i = 0; i < MAX_MB_PLANE; ++i) {
    p[i].coeff = ctx->coeff_pbuf[i][0];
@ -1187,6 +1193,7 @@ static void restore_context(MACROBLOCK *const x, int mi_row, int mi_col,
                            ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
                            ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
                            PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8],
+                            TXFM_CONTEXT ta[8], TXFM_CONTEXT tl[8],
                            BLOCK_SIZE bsize) {
  MACROBLOCKD *const xd = &x->e_mbd;
  int p;
@ -1211,12 +1218,17 @@ static void restore_context(MACROBLOCK *const x, int mi_row, int mi_col,
             sizeof(*xd->above_seg_context) * mi_width);
  vpx_memcpy(xd->left_seg_context + (mi_row & MI_MASK), sl,
             sizeof(xd->left_seg_context[0]) * mi_height);
+  vpx_memcpy(xd->above_txfm_context, ta,
+             sizeof(*xd->above_txfm_context) * mi_width);
+  vpx_memcpy(xd->left_txfm_context, tl,
+             sizeof(*xd->left_txfm_context) * mi_height);
 }

 static void save_context(MACROBLOCK *const x, int mi_row, int mi_col,
                         ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
                         ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
                         PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8],
+                         TXFM_CONTEXT ta[8], TXFM_CONTEXT tl[8],
                         BLOCK_SIZE bsize) {
  const MACROBLOCKD *const xd = &x->e_mbd;
  int p;
@ -1243,6 +1255,10 @@ static void save_context(MACROBLOCK *const x, int mi_row, int mi_col,
             sizeof(*xd->above_seg_context) * mi_width);
  vpx_memcpy(sl, xd->left_seg_context + (mi_row & MI_MASK),
             sizeof(xd->left_seg_context[0]) * mi_height);
+  vpx_memcpy(ta, xd->above_txfm_context,
+             sizeof(*xd->above_txfm_context) * mi_width);
+  vpx_memcpy(tl, xd->left_txfm_context,
+             sizeof(*xd->left_txfm_context) * mi_height);
 }

 static void encode_b(VP9_COMP *cpi, const TileInfo *const tile,
@ -1400,6 +1416,7 @@ static void set_fixed_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
        int index = block_row * mis + block_col;
        mi_8x8[index].src_mi = mi_upper_left + index;
        mi_8x8[index].src_mi->mbmi.sb_type = bsize;
+        mi_8x8[index].src_mi->mbmi.max_tx_size = max_txsize_lookup[bsize];
      }
    }
  } else {
@ -1465,6 +1482,7 @@ static void set_source_var_based_partition(VP9_COMP *cpi,
        index = b_mi_row * mis + b_mi_col;
        mi_8x8[index].src_mi = mi_upper_left + index;
        mi_8x8[index].src_mi->mbmi.sb_type = BLOCK_16X16;
+        mi_8x8[index].src_mi->mbmi.max_tx_size = max_txsize_lookup[BLOCK_16X16];

        // TODO(yunqingwang): If d16[j].var is very large, use 8x8 partition
        // size to further improve quality.
@ -1487,6 +1505,7 @@ static void set_source_var_based_partition(VP9_COMP *cpi,
        index = coord_lookup[i*4].row * mis + coord_lookup[i*4].col;
        mi_8x8[index].src_mi = mi_upper_left + index;
        mi_8x8[index].src_mi->mbmi.sb_type = BLOCK_32X32;
+        mi_8x8[index].src_mi->mbmi.max_tx_size = max_txsize_lookup[BLOCK_32X32];
      }
    }

@ -1499,6 +1518,7 @@ static void set_source_var_based_partition(VP9_COMP *cpi,
      if (is_larger_better) {
        mi_8x8[0].src_mi = mi_upper_left;
        mi_8x8[0].src_mi->mbmi.sb_type = BLOCK_64X64;
+        mi_8x8[0].src_mi->mbmi.max_tx_size = max_txsize_lookup[BLOCK_64X64];
      }
    }
  } else {   // partial in-image SB64
@ -1693,6 +1713,7 @@ static void rd_use_partition(VP9_COMP *cpi,
  BLOCK_SIZE subsize;
  ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
  PARTITION_CONTEXT sl[8], sa[8];
+  TXFM_CONTEXT tl[8], ta[8];
  RD_COST last_part_rdc, none_rdc, chosen_rdc;
  BLOCK_SIZE sub_subsize = BLOCK_4X4;
  int splits_below = 0;
@ -1714,7 +1735,9 @@ static void rd_use_partition(VP9_COMP *cpi,
  subsize = get_subsize(bsize, partition);

  pc_tree->partitioning = partition;
-  save_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+  xd->above_txfm_context = cm->above_txfm_context + mi_col;
+  xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & 0x07);
+  save_context(x, mi_row, mi_col, a, l, sa, sl, ta, tl, bsize);

  if (bsize == BLOCK_16X16 && cpi->oxcf.aq_mode) {
    set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
@ -1754,7 +1777,7 @@ static void rd_use_partition(VP9_COMP *cpi,
                                 none_rdc.dist);
      }

-      restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+      restore_context(x, mi_row, mi_col, a, l, sa, sl, ta, tl, bsize);
      mi_8x8[0].src_mi->mbmi.sb_type = bs_type;
      pc_tree->partitioning = partition;
    }
@ -1865,7 +1888,7 @@ static void rd_use_partition(VP9_COMP *cpi,
    BLOCK_SIZE split_subsize = get_subsize(bsize, PARTITION_SPLIT);
    chosen_rdc.rate = 0;
    chosen_rdc.dist = 0;
-    restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+    restore_context(x, mi_row, mi_col, a, l, sa, sl, ta, tl, bsize);
    pc_tree->partitioning = PARTITION_SPLIT;

    // Split partition.
@ -1875,17 +1898,18 @@ static void rd_use_partition(VP9_COMP *cpi,
      RD_COST tmp_rdc;
      ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
      PARTITION_CONTEXT sl[8], sa[8];
+      TXFM_CONTEXT tl[8], ta[8];

      if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
        continue;

-      save_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+      save_context(x, mi_row, mi_col, a, l, sa, sl, ta, tl, bsize);
      pc_tree->split[i]->partitioning = PARTITION_NONE;
      rd_pick_sb_modes(cpi, tile_data, x,
                       mi_row + y_idx, mi_col + x_idx, &tmp_rdc,
                       split_subsize, &pc_tree->split[i]->none, INT64_MAX);

-      restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+      restore_context(x, mi_row, mi_col, a, l, sa, sl, ta, tl, bsize);

      if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
        vp9_rd_cost_reset(&chosen_rdc);
@ -1925,7 +1949,9 @@ static void rd_use_partition(VP9_COMP *cpi,
    chosen_rdc = none_rdc;
  }

-  restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+  xd->above_txfm_context = cm->above_txfm_context + mi_col;
+  xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & 0x07);
+  restore_context(x, mi_row, mi_col, a, l, sa, sl, ta, tl, bsize);

  // We must have chosen a partitioning and encoding or we'll fail later on.
  // No other opportunities for success.
@ -2279,6 +2305,7 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
  const int mi_step = num_8x8_blocks_wide_lookup[bsize] / 2;
  ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
  PARTITION_CONTEXT sl[8], sa[8];
+  TXFM_CONTEXT tl[8], ta[8];
  TOKENEXTRA *tp_orig = *tp;
  PICK_MODE_CONTEXT *ctx = &pc_tree->none;
  int i, pl;
@ -2344,7 +2371,9 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
    partition_vert_allowed &= force_vert_split;
  }

-  save_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+  xd->above_txfm_context = cm->above_txfm_context + mi_col;
+  xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & 0x07);
+  save_context(x, mi_row, mi_col, a, l, sa, sl, ta, tl, bsize);

 #if CONFIG_FP_MB_STATS
  if (cpi->use_fp_mb_stats) {
@ -2490,7 +2519,7 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
 #endif
      }
    }
-    restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+    restore_context(x, mi_row, mi_col, a, l, sa, sl, ta, tl, bsize);
  }

  // store estimated motion vector
@ -2555,7 +2584,9 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
      if (cpi->sf.less_rectangular_check)
        do_rect &= !partition_none_allowed;
    }
-    restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+    xd->above_txfm_context = cm->above_txfm_context + mi_col;
+    xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & 0x07);
+    restore_context(x, mi_row, mi_col, a, l, sa, sl, ta, tl, bsize);
  }

  // PARTITION_HORZ
@ -2582,6 +2613,10 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
          partition_none_allowed)
        pc_tree->horizontal[1].pred_interp_filter =
            ctx->mic.mbmi.interp_filter;
+
+      xd->above_txfm_context = cm->above_txfm_context + mi_col;
+      xd->left_txfm_context = xd->left_txfm_context_buffer +
+                                ((mi_row + mi_step) & 0x07);
      rd_pick_sb_modes(cpi, tile_data, x, mi_row + mi_step, mi_col,
                       &this_rdc, subsize, &pc_tree->horizontal[1],
                       best_rdc.rdcost - sum_rdc.rdcost);
@ -2603,7 +2638,9 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
        pc_tree->partitioning = PARTITION_HORZ;
      }
    }
-    restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+    xd->above_txfm_context = cm->above_txfm_context + mi_col;
+    xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & 0x07);
+    restore_context(x, mi_row, mi_col, a, l, sa, sl, ta, tl, bsize);
  }
  // PARTITION_VERT
  if (partition_vert_allowed && do_rect) {
@ -2629,6 +2666,8 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
          partition_none_allowed)
        pc_tree->vertical[1].pred_interp_filter =
            ctx->mic.mbmi.interp_filter;
+      xd->above_txfm_context = cm->above_txfm_context + mi_col + mi_step;
+      xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & 0x07);
      rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + mi_step,
                       &this_rdc, subsize,
                       &pc_tree->vertical[1], best_rdc.rdcost - sum_rdc.rdcost);
@ -2651,7 +2690,9 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
        pc_tree->partitioning = PARTITION_VERT;
      }
    }
-    restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+    xd->above_txfm_context = cm->above_txfm_context + mi_col;
+    xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & 0x07);
+    restore_context(x, mi_row, mi_col, a, l, sa, sl, ta, tl, bsize);
  }

  // TODO(jbb): This code added so that we avoid static analysis
@ -2693,6 +2734,8 @@ static void encode_rd_sb_row(VP9_COMP *cpi,
  // Initialize the left context for the new SB row
  vpx_memset(&xd->left_context, 0, sizeof(xd->left_context));
  vpx_memset(xd->left_seg_context, 0, sizeof(xd->left_seg_context));
+  vpx_memset(xd->left_txfm_context_buffer, 0,
+             sizeof(xd->left_txfm_context_buffer));

  // Code each SB in the row
  for (mi_col = tile_info->mi_col_start; mi_col < tile_info->mi_col_end;
@ -2781,6 +2824,8 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) {
             2 * aligned_mi_cols * MAX_MB_PLANE);
  vpx_memset(xd->above_seg_context, 0,
             sizeof(*xd->above_seg_context) * aligned_mi_cols);
+  vpx_memset(cm->above_txfm_context, 0,
+             sizeof(*xd->above_txfm_context) * aligned_mi_cols);
 }

 static int check_dual_ref_flags(VP9_COMP *cpi) {
@ -2821,6 +2866,9 @@ static MV_REFERENCE_FRAME get_frame_type(const VP9_COMP *cpi) {
 static TX_MODE select_tx_mode(const VP9_COMP *cpi, MACROBLOCKD *const xd) {
  if (xd->lossless)
    return ONLY_4X4;
+
+  return TX_MODE_SELECT;
+
  if (cpi->common.frame_type == KEY_FRAME &&
      cpi->sf.use_nonrd_pick_mode &&
      cpi->sf.partition_search_type == VAR_BASED_PARTITION)
@ -2854,6 +2902,7 @@ static void nonrd_pick_sb_modes(VP9_COMP *cpi,
  set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
  mbmi = &xd->mi[0].src_mi->mbmi;
  mbmi->sb_type = bsize;
+  mbmi->max_tx_size = max_txsize_lookup[bsize];

  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled)
    if (cyclic_refresh_segment_id_boosted(mbmi->segment_id))
@ -3762,6 +3811,7 @@ static void encode_frame_internal(VP9_COMP *cpi) {
  vp9_zero(rdc->filter_diff);
  vp9_zero(rdc->tx_select_diff);
  vp9_zero(rd_opt->tx_select_threshes);
+  vp9_zero(x->blk_skip);

  xd->lossless = cm->base_qindex == 0 &&
                 cm->y_dc_delta_q == 0 &&
@ -3962,40 +4012,40 @@ void vp9_encode_frame(VP9_COMP *cpi) {
      }
    }

-    if (cm->tx_mode == TX_MODE_SELECT) {
-      int count4x4 = 0;
-      int count8x8_lp = 0, count8x8_8x8p = 0;
-      int count16x16_16x16p = 0, count16x16_lp = 0;
-      int count32x32 = 0;
-
-      for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
-        count4x4 += counts->tx.p32x32[i][TX_4X4];
-        count4x4 += counts->tx.p16x16[i][TX_4X4];
-        count4x4 += counts->tx.p8x8[i][TX_4X4];
-
-        count8x8_lp += counts->tx.p32x32[i][TX_8X8];
-        count8x8_lp += counts->tx.p16x16[i][TX_8X8];
-        count8x8_8x8p += counts->tx.p8x8[i][TX_8X8];
-
-        count16x16_16x16p += counts->tx.p16x16[i][TX_16X16];
-        count16x16_lp += counts->tx.p32x32[i][TX_16X16];
-        count32x32 += counts->tx.p32x32[i][TX_32X32];
-      }
-      if (count4x4 == 0 && count16x16_lp == 0 && count16x16_16x16p == 0 &&
-          count32x32 == 0) {
-        cm->tx_mode = ALLOW_8X8;
-        reset_skip_tx_size(cm, TX_8X8);
-      } else if (count8x8_8x8p == 0 && count16x16_16x16p == 0 &&
-                 count8x8_lp == 0 && count16x16_lp == 0 && count32x32 == 0) {
-        cm->tx_mode = ONLY_4X4;
-        reset_skip_tx_size(cm, TX_4X4);
-      } else if (count8x8_lp == 0 && count16x16_lp == 0 && count4x4 == 0) {
-        cm->tx_mode = ALLOW_32X32;
-      } else if (count32x32 == 0 && count8x8_lp == 0 && count4x4 == 0) {
-        cm->tx_mode = ALLOW_16X16;
-        reset_skip_tx_size(cm, TX_16X16);
-      }
-    }
+//    if (cm->tx_mode == TX_MODE_SELECT) {
+//      int count4x4 = 0;
+//      int count8x8_lp = 0, count8x8_8x8p = 0;
+//      int count16x16_16x16p = 0, count16x16_lp = 0;
+//      int count32x32 = 0;
+//
+//      for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
+//        count4x4 += counts->tx.p32x32[i][TX_4X4];
+//        count4x4 += counts->tx.p16x16[i][TX_4X4];
+//        count4x4 += counts->tx.p8x8[i][TX_4X4];
+//
+//        count8x8_lp += counts->tx.p32x32[i][TX_8X8];
+//        count8x8_lp += counts->tx.p16x16[i][TX_8X8];
+//        count8x8_8x8p += counts->tx.p8x8[i][TX_8X8];
+//
+//        count16x16_16x16p += counts->tx.p16x16[i][TX_16X16];
+//        count16x16_lp += counts->tx.p32x32[i][TX_16X16];
+//        count32x32 += counts->tx.p32x32[i][TX_32X32];
+//      }
+//      if (count4x4 == 0 && count16x16_lp == 0 && count16x16_16x16p == 0 &&
+//          count32x32 == 0) {
+//        cm->tx_mode = ALLOW_8X8;
+//        reset_skip_tx_size(cm, TX_8X8);
+//      } else if (count8x8_8x8p == 0 && count16x16_16x16p == 0 &&
+//                 count8x8_lp == 0 && count16x16_lp == 0 && count32x32 == 0) {
+//        cm->tx_mode = ONLY_4X4;
+//        reset_skip_tx_size(cm, TX_4X4);
+//      } else if (count8x8_lp == 0 && count16x16_lp == 0 && count4x4 == 0) {
+//        cm->tx_mode = ALLOW_32X32;
+//      } else if (count32x32 == 0 && count8x8_lp == 0 && count4x4 == 0) {
+//        cm->tx_mode = ALLOW_16X16;
+//        reset_skip_tx_size(cm, TX_16X16);
+//      }
+//    }
  } else {
    cm->reference_mode = SINGLE_REFERENCE;
    encode_frame_internal(cpi);
@ -4021,6 +4071,57 @@ static void sum_intra_stats(FRAME_COUNTS *counts, const MODE_INFO *mi) {
  ++counts->uv_mode[y_mode][uv_mode];
 }

+static void update_txfm_count(MACROBLOCKD *xd, FRAME_COUNTS *counts,
+                              TX_SIZE tx_size, int blk_row, int blk_col,
+                              int dry_run) {  /*
+   * Walks the transform partition of the current block, starting from a
+   * tx_size-sized transform block at (blk_row, blk_col), and records each
+   * split / no-split decision in counts->txfm_partition.
+   *
+   * blk_row and blk_col are compared against the 4x4-block dimensions of
+   * mbmi->sb_type, so they are offsets in 4x4-block units inside the block.
+   * A non-zero dry_run only suppresses the counter increments: mbmi->tx_size,
+   * mbmi->inter_tx_size (8x8 split case) and the above/left transform
+   * contexts are still written.
+   */
+  MB_MODE_INFO *mbmi = &xd->mi[0].src_mi->mbmi;
+  int tx_idx = (blk_row / 2) * 8 + (blk_col / 2);  // inter_tx_size[] index
+  int max_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type];
+  int max_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type];
+  int ctx = txfm_partition_context(xd->above_txfm_context + (blk_col / 2),
+                                   xd->left_txfm_context + (blk_row / 2),
+                                   mbmi->max_tx_size,
+                                   tx_size);  // read before the bounds check
+  TX_SIZE plane_tx_size = mbmi->inter_tx_size[tx_idx];  // size stored here
+
+  if (xd->mb_to_bottom_edge < 0)  // presumably: block crosses frame bottom
+    max_blocks_high += xd->mb_to_bottom_edge >> 5;  // shrinks the row limit
+  if (xd->mb_to_right_edge < 0)  // presumably: block crosses frame right
+    max_blocks_wide += xd->mb_to_right_edge >> 5;  // shrinks the col limit
+
+  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide)
+    return;  // outside the clipped block area: nothing is counted
+
+  if (tx_size == plane_tx_size) {  // no further split at this position
+    if (!dry_run)
+      ++counts->txfm_partition[ctx][0];  // index 0: not split
+    mbmi->tx_size = tx_size;
+    txfm_partition_update(xd->above_txfm_context + (blk_col / 2),
+                          xd->left_txfm_context + (blk_row / 2), tx_size);
+  } else {  // stored size differs from tx_size: count a split and descend
+    BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+    int bh = num_4x4_blocks_high_lookup[bsize];  // also used for columns
+    int i;
+    if (!dry_run)
+      ++counts->txfm_partition[ctx][1];  // index 1: split
+
+    if (tx_size == TX_8X8) {  // an 8x8 split ends at 4x4; no recursion
+      mbmi->inter_tx_size[tx_idx] = TX_4X4;
+      mbmi->tx_size = TX_4X4;
+      txfm_partition_update(xd->above_txfm_context + (blk_col / 2),
+                            xd->left_txfm_context + (blk_row / 2), TX_4X4);
+      return;
+    }
+
+    for (i = 0; i < 4; ++i) {  // four quadrants, row-major order
+      int offsetr = (i >> 1) * bh / 2;  // 0 or bh / 2 rows
+      int offsetc = (i & 0x01) * bh / 2;  // 0 or bh / 2 columns
+      update_txfm_count(xd, counts, tx_size - 1,
+                        blk_row + offsetr, blk_col + offsetc, dry_run);
+    }
+  }
+}
+
 static void encode_superblock(VP9_COMP *cpi, ThreadData *td,
                              TOKENEXTRA **t, int output_enabled,
                              int mi_row, int mi_col, BLOCK_SIZE bsize,
@ -4080,15 +4181,20 @@ static void encode_superblock(VP9_COMP *cpi, ThreadData *td,
    vp9_build_inter_predictors_sbuv(xd, mi_row, mi_col, MAX(bsize, BLOCK_8X8));

    vp9_encode_sb(x, MAX(bsize, BLOCK_8X8));
-    vp9_tokenize_sb(cpi, td, t, !output_enabled, MAX(bsize, BLOCK_8X8));
+
+    vp9_tokenize_sb_inter(cpi, td, t, !output_enabled, MAX(bsize, BLOCK_8X8));
  }

+  xd->above_txfm_context = cm->above_txfm_context + mi_col;
+  xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & 0x07);
+
  if (output_enabled) {
    if (cm->tx_mode == TX_MODE_SELECT &&
        mbmi->sb_type >= BLOCK_8X8  &&
        !(is_inter_block(mbmi) && (mbmi->skip || seg_skip))) {
-      ++get_tx_counts(max_txsize_lookup[bsize], vp9_get_tx_size_context(xd),
-                      &td->counts->tx)[mbmi->tx_size];
+      if (!is_inter_block(mbmi))
+        ++get_tx_counts(max_txsize_lookup[bsize], vp9_get_tx_size_context(xd),
+                        &td->counts->tx)[mbmi->tx_size];
    } else {
      int x, y;
      TX_SIZE tx_size;
@ -4105,7 +4211,80 @@ static void encode_superblock(VP9_COMP *cpi, ThreadData *td,
          if (mi_col + x < cm->mi_cols && mi_row + y < cm->mi_rows)
            mi_8x8[mis * y + x].src_mi->mbmi.tx_size = tx_size;
    }
+
+    if (!is_inter_block(mbmi)) {
+      // TODO(jingning): refactor this code for speed improvement.
+      const MODE_INFO *above_mi = xd->mi[-cm->mi_stride].src_mi;
+      const MODE_INFO *left_mi  = xd->left_available ? xd->mi[-1].src_mi : NULL;
+      if (bsize >= BLOCK_8X8) {
+        int idx, idy;
+        const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
+        const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
+        for (idy = 0; idy < 2; idy += num_4x4_h) {
+          for (idx = 0; idx < 2; idx += num_4x4_w) {
+            const int block = idy * 2 + idx;
+            const PREDICTION_MODE above = vp9_above_block_mode(mi, above_mi,
+                                                               block);
+            const PREDICTION_MODE left = vp9_left_block_mode(mi, left_mi,
+                                                             block);
+            if (above == left) {
+              ++td->counts->intra_predictor[0][mi->bmi[block].as_mode == above];
+            } else {
+              ++td->counts->intra_predictor[1][mi->bmi[block].as_mode == above];
+              if (mbmi->mode != above)
+                ++td->counts->intra_predictor[1]
+                                             [mi->bmi[block].as_mode == left];
+            }
+          }
+        }
+      } else {
+        const PREDICTION_MODE above = vp9_above_block_mode(mi, above_mi, 0);
+        const PREDICTION_MODE left = vp9_left_block_mode(mi, left_mi, 0);
+        if (above == left) {
+          ++td->counts->intra_predictor[0][mbmi->mode == above];
+        } else {
+          ++td->counts->intra_predictor[1][mbmi->mode == above];
+          if (mbmi->mode != above)
+            ++td->counts->intra_predictor[1][mbmi->mode == left];
+        }
+      }
+    }
+
    ++td->counts->tx.tx_totals[mbmi->tx_size];
    ++td->counts->tx.tx_totals[get_uv_tx_size(mbmi, &xd->plane[1])];
  }
+
+  if (is_inter_block(mbmi)) {
+    if (cm->tx_mode == TX_MODE_SELECT && mbmi->sb_type >= BLOCK_8X8) {
+      BLOCK_SIZE txb_size = txsize_to_bsize[max_txsize_lookup[bsize]];
+      int bh = num_4x4_blocks_wide_lookup[txb_size];
+      int width  = num_4x4_blocks_wide_lookup[bsize];
+      int height = num_4x4_blocks_high_lookup[bsize];
+      int idx, idy;
+      for (idy = 0; idy < height; idy += bh)
+        for (idx = 0; idx < width; idx += bh)
+          if (mbmi->skip || seg_skip)
+            txfm_partition_update(xd->above_txfm_context + (idx / 2),
+                                  xd->left_txfm_context + (idy / 2),
+                                  max_txsize_lookup[bsize]);
+          else
+            update_txfm_count(xd, td->counts, max_txsize_lookup[mbmi->sb_type],
+                              idy, idx, !output_enabled);
+    }
+  } else {
+    TX_SIZE max_tx_size = max_txsize_lookup[bsize];
+    BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size];
+    int bh = num_4x4_blocks_wide_lookup[txb_size];
+    int width  = num_4x4_blocks_wide_lookup[bsize];
+    int height = num_4x4_blocks_high_lookup[bsize];
+    int idx, idy;
+    for (idy = 0; idy < height; idy += bh)
+      for (idx = 0; idx < width; idx += bh)
+        txfm_partition_update(xd->above_txfm_context + (idx / 2),
+                              xd->left_txfm_context + (idy / 2), mbmi->tx_size);
+  }
+
+  if (mbmi->sb_type < BLOCK_8X8)
+    txfm_partition_update(xd->above_txfm_context,
+                          xd->left_txfm_context, TX_4X4);
 }
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@ -530,6 +530,94 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block,
  }
 }

+void vp9_xform_quant_inter(MACROBLOCK *x, int plane, int block,
+                           int blk_row, int blk_col,
+                           BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {  /*
+   * Forward-transforms and quantizes one transform block of `plane`.
+   *
+   * The residual is read from p->src_diff at 4x4-unit position
+   * (blk_row, blk_col), using a row stride of
+   * 4 * num_4x4_blocks_wide_lookup[plane_bsize] samples. `block` selects
+   * the coeff / qcoeff / dqcoeff slots (via BLOCK_OFFSET) and the
+   * p->eobs[] entry that is passed to the quantizer as its eob output.
+   *
+   * The scan order is always vp9_default_scan_orders[tx_size]. Each
+   * tx_size case pairs the matching forward transform with a quantizer
+   * call sized to the block's coefficient count (1024, 256, 64 or 16).
+   */
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const struct macroblock_plane *const p = &x->plane[plane];
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const scan_order *const scan_order = &vp9_default_scan_orders[tx_size];
+  tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+  tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  uint16_t *const eob = &p->eobs[block];
+  const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+  const int16_t *src_diff;
+  src_diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {  // high bit-depth buffer
+     switch (tx_size) {
+      case TX_32X32:  // 1024 coefficients
+        highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
+        vp9_highbd_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin,
+                                    p->round, p->quant, p->quant_shift, qcoeff,
+                                    dqcoeff, pd->dequant, eob,
+                                    scan_order->scan, scan_order->iscan);
+        break;
+      case TX_16X16:  // 256 coefficients
+        vp9_highbd_fdct16x16(src_diff, coeff, diff_stride);
+        vp9_highbd_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
+                              p->quant, p->quant_shift, qcoeff, dqcoeff,
+                              pd->dequant, eob,
+                              scan_order->scan, scan_order->iscan);
+        break;
+      case TX_8X8:  // 64 coefficients
+        vp9_highbd_fdct8x8(src_diff, coeff, diff_stride);
+        vp9_highbd_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
+                              p->quant, p->quant_shift, qcoeff, dqcoeff,
+                              pd->dequant, eob,
+                              scan_order->scan, scan_order->iscan);
+        break;
+      case TX_4X4:  // 16 coefficients; transform chosen via x->fwd_txm4x4
+        x->fwd_txm4x4(src_diff, coeff, diff_stride);
+        vp9_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
+                              p->quant, p->quant_shift, qcoeff, dqcoeff,
+                              pd->dequant, eob,
+                              scan_order->scan, scan_order->iscan);
+        break;
+      default:
+        assert(0);  // unexpected tx_size
+    }
+    return;  // high bit-depth path is complete; skip the switch below
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  switch (tx_size) {
+    case TX_32X32:  // 1024 coefficients
+      fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
+      vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
+                           p->quant, p->quant_shift, qcoeff, dqcoeff,
+                           pd->dequant, eob, scan_order->scan,
+                           scan_order->iscan);
+      break;
+    case TX_16X16:  // 256 coefficients
+      vp9_fdct16x16(src_diff, coeff, diff_stride);
+      vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
+                     p->quant, p->quant_shift, qcoeff, dqcoeff,
+                     pd->dequant, eob,
+                     scan_order->scan, scan_order->iscan);
+      break;
+    case TX_8X8:  // 64 coefficients
+      vp9_fdct8x8(src_diff, coeff, diff_stride);
+      vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
+                     p->quant, p->quant_shift, qcoeff, dqcoeff,
+                     pd->dequant, eob,
+                     scan_order->scan, scan_order->iscan);
+      break;
+    case TX_4X4:  // 16 coefficients; transform chosen via x->fwd_txm4x4
+      x->fwd_txm4x4(src_diff, coeff, diff_stride);
+      vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
+                     p->quant, p->quant_shift, qcoeff, dqcoeff,
+                     pd->dequant, eob,
+                     scan_order->scan, scan_order->iscan);
+      break;
+    default:
+      assert(0);  // unexpected tx_size
+      break;
+  }
+}
+
 void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
                     BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
  MACROBLOCKD *const xd = &x->e_mbd;
@ -619,8 +707,9 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
  }
 }

-static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
-                         TX_SIZE tx_size, void *arg) {
+static void encode_block_b(int blk_row, int blk_col, int plane,
+                           int block, BLOCK_SIZE plane_bsize,
+                           TX_SIZE tx_size, void *arg) {
  struct encode_b_args *const args = arg;
  MACROBLOCK *const x = args->x;
  MACROBLOCKD *const xd = &x->e_mbd;
@ -628,61 +717,60 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
  struct macroblock_plane *const p = &x->plane[plane];
  struct macroblockd_plane *const pd = &xd->plane[plane];
  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
-  int i, j;
  uint8_t *dst;
  ENTROPY_CONTEXT *a, *l;
-  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
-  dst = &pd->dst.buf[4 * j * pd->dst.stride + 4 * i];
-  a = &ctx->ta[plane][i];
-  l = &ctx->tl[plane][j];
+  const int block_stride = num_4x4_blocks_wide_lookup[plane_bsize];
+  int i;
+  dst = &pd->dst.buf[4 * blk_row * pd->dst.stride + 4 * blk_col];
+  a = &ctx->ta[plane][blk_col];
+  l = &ctx->tl[plane][blk_row];

  // TODO(jingning): per transformed block zero forcing only enabled for
  // luma component. will integrate chroma components as well.
-  if (x->zcoeff_blk[tx_size][block] && plane == 0) {
+//  if (x->zcoeff_blk[tx_size][block] && plane == 0) {
+//    p->eobs[block] = 0;
+//    *a = *l = 0;
+//    return;
+//  }
+
+  if (x->blk_skip[plane][blk_row * block_stride + blk_col] == 0)
+    vp9_xform_quant_inter(x, plane, block, blk_row, blk_col,
+                          plane_bsize, tx_size);
+  else
    p->eobs[block] = 0;
-    *a = *l = 0;
-    return;
-  }

-  if (!x->skip_recode) {
-    if (x->quant_fp) {
-      // Encoding process for rtc mode
-      if (x->skip_txfm[0] == 1 && plane == 0) {
-        // skip forward transform
-        p->eobs[block] = 0;
-        *a = *l = 0;
-        return;
-      } else {
-        vp9_xform_quant_fp(x, plane, block, plane_bsize, tx_size);
-      }
-    } else {
-      if (max_txsize_lookup[plane_bsize] == tx_size) {
-        int txfm_blk_index = (plane << 2) + (block >> (tx_size << 1));
-        if (x->skip_txfm[txfm_blk_index] == 0) {
-          // full forward transform and quantization
-          vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
-        } else if (x->skip_txfm[txfm_blk_index]== 2) {
-          // fast path forward transform and quantization
-          vp9_xform_quant_dc(x, plane, block, plane_bsize, tx_size);
-        } else {
-          // skip forward transform
-          p->eobs[block] = 0;
-          *a = *l = 0;
-          return;
-        }
-      } else {
-        vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
-      }
+  if (x->optimize) {
+    int context;
+    switch (tx_size) {
+      case TX_4X4:
+        break;
+      case TX_8X8:
+        a[0] = !!*(const uint16_t *)&a[0];
+        l[0] = !!*(const uint16_t *)&l[0];
+        break;
+      case TX_16X16:
+        a[0] = !!*(const uint32_t *)&a[0];
+        l[0] = !!*(const uint32_t *)&l[0];
+        break;
+      case TX_32X32:
+        a[0] = !!*(const uint64_t *)&a[0];
+        l[0] = !!*(const uint64_t *)&l[0];
+        break;
+      default:
+        assert(0 && "Invalid transform size.");
+        break;
    }
-  }
-
-  if (x->optimize && (!x->skip_recode || !x->skip_optimize)) {
-    const int ctx = combine_entropy_contexts(*a, *l);
-    *a = *l = optimize_b(x, plane, block, tx_size, ctx) > 0;
+    context = combine_entropy_contexts(*a, *l);
+    *a = *l = optimize_b(x, plane, block, tx_size, context) > 0;
  } else {
    *a = *l = p->eobs[block] > 0;
  }

+  for (i = 0; i < (1 << tx_size); ++i) {
+    a[i] = a[0];
+    l[i] = l[0];
+  }
+
  if (p->eobs[block])
    *(args->skip) = 0;

@ -739,6 +827,46 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
  }
 }

+static void encode_block_inter(int blk_row, int blk_col,
+                               int plane, int block, BLOCK_SIZE plane_bsize,
+                               TX_SIZE tx_size, void *arg) {  /*
+   * Recursively encodes the transform blocks covering a tx_size-sized
+   * area at (blk_row, blk_col) of `plane`, following the per-position
+   * sizes stored in mbmi->inter_tx_size.
+   *
+   * When tx_size equals the size required at this position the block is
+   * handed to encode_block_b; otherwise the area is divided into four
+   * quadrants that are visited with tx_size - 1. Positions at or beyond
+   * the (edge-clipped) plane block dimensions are skipped.
+   *
+   * NOTE(review): there is no explicit stop at TX_4X4; the recursion
+   * appears to rely on plane_tx_size never exceeding tx_size - confirm.
+   */
+  struct encode_b_args *const args = arg;
+  MACROBLOCK *const x = args->x;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  int tx_idx = (blk_row >> (1 - pd->subsampling_y)) * 8 +  // row * 8 ...
+               (blk_col >> (1 - pd->subsampling_x));  // ... plus column
+  TX_SIZE plane_tx_size = plane ?  // non-zero planes remap the stored size
+      get_uv_tx_size_impl(mbmi->inter_tx_size[tx_idx], plane_bsize, 0, 0) :
+      mbmi->inter_tx_size[tx_idx];
+  int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
+  int max_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
+  if (xd->mb_to_bottom_edge < 0)  // presumably: block crosses frame bottom
+    max_blocks_high += xd->mb_to_bottom_edge >> (5 + pd->subsampling_y);
+  if (xd->mb_to_right_edge < 0)  // presumably: block crosses frame right
+    max_blocks_wide += xd->mb_to_right_edge >> (5 + pd->subsampling_x);
+
+  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide)
+    return;  // outside the clipped plane block: nothing to encode
+
+  if (tx_size == plane_tx_size) {  // this is the transform size to code
+    encode_block_b(blk_row, blk_col, plane, block, plane_bsize, tx_size, arg);
+  } else {  // descend one transform size and visit the four quadrants
+    BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+    int bh = num_4x4_blocks_high_lookup[bsize];  // also used for columns
+    int step = 1 << (2 *(tx_size - 1));  // block-index stride per quadrant
+    int i;
+    for (i = 0; i < 4; ++i) {  // row-major quadrant order
+      int offsetr = (i >> 1) * bh / 2;  // 0 or bh / 2 rows
+      int offsetc = (i & 0x01) * bh / 2;  // 0 or bh / 2 columns
+      encode_block_inter(blk_row + offsetr, blk_col + offsetc,
+                         plane, block + i * step, plane_bsize,
+                         tx_size - 1, arg);
+    }
+  }
+}
+
 static void encode_block_pass1(int plane, int block, BLOCK_SIZE plane_bsize,
                               TX_SIZE tx_size, void *arg) {
  MACROBLOCK *const x = (MACROBLOCK *)arg;
@ -783,18 +911,27 @@ void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
    return;

  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
-    if (!x->skip_recode)
-      vp9_subtract_plane(x, bsize, plane);
+    const struct macroblockd_plane *const pd = &xd->plane[plane];
+    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+    const int mi_width = num_4x4_blocks_wide_lookup[plane_bsize];
+    const int mi_height = num_4x4_blocks_high_lookup[plane_bsize];
+    BLOCK_SIZE txb_size = txsize_to_bsize[max_txsize_lookup[plane_bsize]];
+    int bh = num_4x4_blocks_wide_lookup[txb_size];
+    int idx, idy;
+    int block = 0;
+    int step = 1 << (max_txsize_lookup[plane_bsize] * 2);

-    if (x->optimize && (!x->skip_recode || !x->skip_optimize)) {
-      const struct macroblockd_plane* const pd = &xd->plane[plane];
-      const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi, pd) : mbmi->tx_size;
-      vp9_get_entropy_contexts(bsize, tx_size, pd,
-                               ctx.ta[plane], ctx.tl[plane]);
+    vp9_subtract_plane(x, bsize, plane);
+
+    vp9_get_entropy_contexts(bsize, TX_4X4, pd, ctx.ta[plane], ctx.tl[plane]);
+
+    for (idy = 0; idy < mi_height; idy += bh) {
+      for (idx = 0; idx < mi_width; idx += bh) {
+        encode_block_inter(idy, idx, plane, block, plane_bsize,
+                           max_txsize_lookup[plane_bsize], &arg);
+        block += step;
+      }
    }
-
-    vp9_foreach_transformed_block_in_plane(xd, bsize, plane, encode_block,
-                                           &arg);
  }
 }

@ -820,6 +957,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
  const int src_stride = p->src.stride;
  const int dst_stride = pd->dst.stride;
  int i, j;
+
  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
  dst = &pd->dst.buf[4 * (j * dst_stride + i)];
  src = &p->src.buf[4 * (j * src_stride + i)];
--- a/vp9/encoder/vp9_encodemb.h
+++ b/vp9/encoder/vp9_encodemb.h
@ -29,6 +29,9 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
                        BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
 void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block,
                        BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
+void vp9_xform_quant_inter(MACROBLOCK *x, int plane, int block,
+                           int blk_row, int blk_col,
+                           BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
 void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
                     BLOCK_SIZE plane_bsize, TX_SIZE tx_size);

--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@ -3453,6 +3453,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
             MAX_MODES * sizeof(*cpi->mode_chosen_counts));
 #endif

+  cpi->dummy_writing = 1;
  if (cpi->sf.recode_loop == DISALLOW_RECODE) {
    encode_without_recode_loop(cpi);
  } else {
@ -3498,6 +3499,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
  // Pick the loop filter level for the frame.
  loopfilter_frame(cpi, cm);

+  cpi->dummy_writing = 0;
  // build the bitstream
  vp9_pack_bitstream(cpi, dest, size);

--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@ -417,6 +417,7 @@ typedef struct VP9_COMP {

  int b_calculate_ssimg;
 #endif
+  int dummy_writing;
  int b_calculate_psnr;

  int droppable;
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@ -940,12 +940,11 @@ void vp9_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *rd_cost,
      MIN(max_txsize_lookup[bsize],
          tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
  MODE_INFO *const mic = xd->mi[0].src_mi;
-  int *bmode_costs;
+  int bmode_costs;
  const MODE_INFO *above_mi = xd->mi[-xd->mi_stride].src_mi;
  const MODE_INFO *left_mi = xd->left_available ? xd->mi[-1].src_mi : NULL;
  const PREDICTION_MODE A = vp9_above_block_mode(mic, above_mi, 0);
  const PREDICTION_MODE L = vp9_left_block_mode(mic, left_mi, 0);
-  bmode_costs = cpi->y_mode_costs[A][L];

  (void) ctx;
  vp9_rd_cost_reset(&best_rdc);
@ -963,11 +962,17 @@ void vp9_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *rd_cost,
    args.rate = 0;
    args.dist = 0;
    mbmi->tx_size = intra_tx_size;
+
+    if (A == L)
+      bmode_costs = (this_mode == A) ? 406 : 961;
+    else  // (A != L)
+      bmode_costs = (this_mode == A) || (this_mode == L) ? 512 : 1024;
+
    vp9_foreach_transformed_block_in_plane(xd, bsize, 0,
                                           estimate_block_intra, &args);
    this_rdc.rate = args.rate;
    this_rdc.dist = args.dist;
-    this_rdc.rate += bmode_costs[this_mode];
+    this_rdc.rate += bmode_costs;
    this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
                             this_rdc.rate, this_rdc.dist);

@ -1521,6 +1526,12 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
  mbmi->ref_frame[0] = best_ref_frame;
  x->skip_txfm[0] = best_mode_skip_txfm;

+  {
+    int i;
+    for (i = 0; i < 64; ++i)
+      mbmi->inter_tx_size[i] = mbmi->tx_size;
+  }
+
  if (reuse_inter_pred && best_pred != NULL) {
    if (best_pred->data != orig_dst.buf && is_inter_mode(mbmi->mode)) {
 #if CONFIG_VP9_HIGHBITDEPTH
--- a/vp9/encoder/vp9_rd.c
+++ b/vp9/encoder/vp9_rd.c
@ -66,12 +66,7 @@ static const uint8_t rd_thresh_block_size_factor[BLOCK_SIZES] = {

 static void fill_mode_costs(VP9_COMP *cpi) {
  const FRAME_CONTEXT *const fc = cpi->common.fc;
-  int i, j;
-
-  for (i = 0; i < INTRA_MODES; ++i)
-    for (j = 0; j < INTRA_MODES; ++j)
-      vp9_cost_tokens(cpi->y_mode_costs[i][j], vp9_kf_y_mode_prob[i][j],
-                      vp9_intra_mode_tree);
+  int i;

  vp9_cost_tokens(cpi->mbmode_cost, fc->y_mode_prob[1], vp9_intra_mode_tree);
  vp9_cost_tokens(cpi->intra_uv_mode_cost[KEY_FRAME],
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@ -61,6 +61,7 @@ typedef struct {
 } REF_DEFINITION;

 struct rdcost_block_args {
+  const VP9_COMP *cpi;
  MACROBLOCK *x;
  ENTROPY_CONTEXT t_above[16];
  ENTROPY_CONTEXT t_left[16];
@ -365,10 +366,6 @@ static int cost_coeffs(MACROBLOCK *x,
  const int16_t *cat6_high_cost = vp9_get_high_cost_table(8);
 #endif

-  // Check for consistency of tx_size with mode info
-  assert(type == PLANE_TYPE_Y ? mbmi->tx_size == tx_size
-                              : get_uv_tx_size(mbmi, pd) == tx_size);
-
  if (eob == 0) {
    // single eob token
    cost = token_costs[0][0][pt][EOB_TOKEN];
@ -490,15 +487,35 @@ static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,

  if (!is_inter_block(mbmi)) {
    struct encode_b_args arg = {x, NULL, &mbmi->skip};
-    vp9_encode_block_intra(plane, block, plane_bsize, tx_size, &arg);
+    int i, j;
+    uint8_t *dst, *src;
+    int src_stride, dst_stride;
+    unsigned int tmp_sse;
+
 #if CONFIG_VP9_HIGHBITDEPTH
+    (void) i, j, dst, src, src_stride, dst_stride, tmp_sse;
+    vp9_encode_block_intra(plane, block, plane_bsize, tx_size, &arg);
    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
      dist_block(plane, block, tx_size, args, xd->bd);
    } else {
      dist_block(plane, block, tx_size, args, 8);
    }
 #else
-    dist_block(plane, block, tx_size, args);
+    txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
+    src_stride = x->plane[plane].src.stride;
+    dst_stride = xd->plane[plane].dst.stride;
+    src = &x->plane[plane].src.buf[4 * (j * src_stride + i)];
+    dst = &xd->plane[plane].dst.buf[4 * (j * dst_stride + i)];
+
+    args->cpi->fn_ptr[txsize_to_bsize[tx_size]].vf(src, src_stride,
+                                                   dst, dst_stride, &tmp_sse);
+    args->sse = (int64_t)tmp_sse * 16;
+
+    vp9_encode_block_intra(plane, block, plane_bsize, tx_size, &arg);
+
+    args->cpi->fn_ptr[txsize_to_bsize[tx_size]].vf(src, src_stride,
+                                                   dst, dst_stride, &tmp_sse);
+    args->dist = (int64_t)tmp_sse * 16;
 #endif  // CONFIG_VP9_HIGHBITDEPTH
  } else if (max_txsize_lookup[plane_bsize] == tx_size) {
    if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == 0) {
@ -573,7 +590,7 @@ static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
  }
 }

-static void txfm_rd_in_plane(MACROBLOCK *x,
+static void txfm_rd_in_plane(const VP9_COMP *cpi, MACROBLOCK *x,
                             int *rate, int64_t *distortion,
                             int *skippable, int64_t *sse,
                             int64_t ref_best_rd, int plane,
@ -583,6 +600,7 @@ static void txfm_rd_in_plane(MACROBLOCK *x,
  const struct macroblockd_plane *const pd = &xd->plane[plane];
  struct rdcost_block_args args;
  vp9_zero(args);
+  args.cpi = cpi;
  args.x = x;
  args.best_rd = ref_best_rd;
  args.use_fast_coef_costing = use_fast_coef_casting;
@ -622,7 +640,7 @@ static void choose_largest_tx_size(VP9_COMP *cpi, MACROBLOCK *x,

  mbmi->tx_size = MIN(max_tx_size, largest_tx_size);

-  txfm_rd_in_plane(x, rate, distortion, skip,
+  txfm_rd_in_plane(cpi, x, rate, distortion, skip,
                   sse, ref_best_rd, 0, bs,
                   mbmi->tx_size, cpi->sf.use_fast_coef_costing);
 }
@ -658,7 +676,7 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
  s1 = vp9_cost_bit(skip_prob, 1);

  for (n = max_tx_size; n >= 0;  n--) {
-    txfm_rd_in_plane(x, &r[n][0], &d[n], &s[n],
+    txfm_rd_in_plane(cpi, x, &r[n][0], &d[n], &s[n],
                     &sse[n], ref_best_rd, 0, bs, n,
                     cpi->sf.use_fast_coef_costing);
    r[n][1] = r[n][0];
@ -694,7 +712,6 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
  mbmi->tx_size = cm->tx_mode == TX_MODE_SELECT ?
                      best_tx : MIN(max_tx_size, max_mode_tx_size);

-
  *distortion = d[mbmi->tx_size];
  *rate       = r[mbmi->tx_size][cm->tx_mode == TX_MODE_SELECT];
  *skip       = s[mbmi->tx_size];
@ -760,8 +777,8 @@ static int conditional_skipintra(PREDICTION_MODE mode,

 static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
                                     PREDICTION_MODE *best_mode,
-                                     const int *bmode_costs,
                                     ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
+                                     PREDICTION_MODE A, PREDICTION_MODE L,
                                     int *bestrate, int *bestratey,
                                     int64_t *bestdistortion,
                                     BLOCK_SIZE bsize, int64_t rd_thresh) {
@ -800,7 +817,12 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
      int64_t this_rd;
      int ratey = 0;
      int64_t distortion = 0;
-      int rate = bmode_costs[mode];
+      int rate;
+
+      if (A == L)
+        rate = (mode == A) ? 256 : 1064;
+      else  // (A != L)
+        rate = (mode == A) || (mode == L) ? 404 : 1169;

      if (!(cpi->sf.intra_y_mode_mask[TX_4X4] & (1 << mode)))
        continue;
@ -901,7 +923,12 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
    int64_t this_rd;
    int ratey = 0;
    int64_t distortion = 0;
-    int rate = bmode_costs[mode];
+    int rate;
+
+    if (A == L)
+      rate = (mode == A) ? 406 : 961;
+    else  // (A != L)
+      rate = (mode == A) || (mode == L) ? 512 : 1024;

    if (!(cpi->sf.intra_y_mode_mask[TX_4X4] & (1 << mode)))
      continue;
@ -1009,7 +1036,8 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP *cpi, MACROBLOCK *mb,
  int tot_rate_y = 0;
  int64_t total_rd = 0;
  ENTROPY_CONTEXT t_above[4], t_left[4];
-  const int *bmode_costs = cpi->mbmode_cost;
+  PREDICTION_MODE A = vp9_above_block_mode(mic, above_mi, 0);
+  PREDICTION_MODE L = vp9_left_block_mode(mic, left_mi, 0);

  vpx_memcpy(t_above, xd->plane[0].above_context, sizeof(t_above));
  vpx_memcpy(t_left, xd->plane[0].left_context, sizeof(t_left));
@ -1022,14 +1050,13 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP *cpi, MACROBLOCK *mb,
      int64_t d = INT64_MAX, this_rd = INT64_MAX;
      i = idy * 2 + idx;
      if (cpi->common.frame_type == KEY_FRAME) {
-        const PREDICTION_MODE A = vp9_above_block_mode(mic, above_mi, i);
-        const PREDICTION_MODE L = vp9_left_block_mode(mic, left_mi, i);
-
-        bmode_costs  = cpi->y_mode_costs[A][L];
+        A = vp9_above_block_mode(mic, above_mi, i);
+        L = vp9_left_block_mode(mic, left_mi, i);
      }

-      this_rd = rd_pick_intra4x4block(cpi, mb, i, &best_mode, bmode_costs,
-                                      t_above + idx, t_left + idy, &r, &ry, &d,
+      this_rd = rd_pick_intra4x4block(cpi, mb, i, &best_mode,
+                                      t_above + idx, t_left + idy,
+                                      A, L, &r, &ry, &d,
                                      bsize, best_rd - total_rd);
      if (this_rd >= best_rd - total_rd)
        return INT64_MAX;
@ -1073,12 +1100,11 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
  int64_t this_distortion, this_rd;
  TX_SIZE best_tx = TX_4X4;
  int i;
-  int *bmode_costs;
+  int bmode_costs;
  const MODE_INFO *above_mi = xd->above_mi;
  const MODE_INFO *left_mi = xd->left_mi;
  const PREDICTION_MODE A = vp9_above_block_mode(mic, above_mi, 0);
  const PREDICTION_MODE L = vp9_left_block_mode(mic, left_mi, 0);
-  bmode_costs = cpi->y_mode_costs[A][L];

  if (cpi->sf.tx_size_search_method == USE_FULL_RD)
    for (i = 0; i < TX_MODES; i++)
@ -1089,6 +1115,11 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
    int64_t local_tx_cache[TX_MODES];

+    if (A == L)
+      bmode_costs = (mode == A) ? 406 : 961;
+    else  // (A != L)
+      bmode_costs = (mode == A) || (mode == L) ? 512 : 1024;
+
    if (cpi->sf.use_nonrd_pick_mode) {
      // These speed features are turned on in hybrid non-RD and RD mode
      // for key frame coding in the context of real-time setting.
@ -1106,7 +1137,7 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
    if (this_rate_tokenonly == INT_MAX)
      continue;

-    this_rate = this_rate_tokenonly + bmode_costs[mode];
+    this_rate = this_rate_tokenonly + bmode_costs;
    this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);

    if (this_rd < best_rd) {
@ -1136,8 +1167,496 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
  return best_rd;
 }

+// Evaluates a single transform block of size tx_size whose top-left corner is
+// at (blk_row, blk_col), counted in 4x4 units inside the plane.
+//
+// The residual is transformed and quantized through vp9_xform_quant_inter().
+// Every output is overwritten (never accumulated), so callers may pass
+// uninitialized storage:
+//   *zero_blk_rate  EOB_TOKEN cost for the block in the current entropy
+//                   context, i.e. the rate of signalling no coefficients.
+//   *rate           token cost returned by cost_coeffs().
+//   *dist, *bsse    Non high-bitdepth build: 16x the SSE between the source
+//                   and the reconstruction (*dist), and 16x the SSE between
+//                   the source and the dst buffer contents before the
+//                   residual is added (*bsse). High-bitdepth build: the
+//                   coefficient-domain error and SSE reported by
+//                   vp9_highbd_block_error(), scaled down by `shift`.
+//   *skip           1 when the quantized block has no coefficients (eob == 0).
+//
+// above_ctx and left_ctx are indexed with blk_col and blk_row respectively;
+// the (1 << tx_size) entries covered by the block are rewritten.
+static void tx_block_rd_b(VP9_COMP const *cpi, MACROBLOCK *x, TX_SIZE tx_size,
+                          int blk_row, int blk_col, int plane, int block,
+                          int plane_bsize, ENTROPY_CONTEXT *above_ctx,
+                          ENTROPY_CONTEXT *left_ctx, int *zero_blk_rate,
+                          int *rate, int64_t *dist, int64_t *bsse, int *skip) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  const struct macroblock_plane *const p = &x->plane[plane];
+  unsigned int tmp_sse = 0;
+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  ENTROPY_CONTEXT *ta = above_ctx + blk_col;
+  ENTROPY_CONTEXT *tl = left_ctx + blk_row;
+  scan_order const *sc = get_scan(xd, tx_size, pd->plane_type, 0);
+  int i, pt;
+  BLOCK_SIZE txm_bsize = txsize_to_bsize[tx_size];
+  // Transform block edge length in pixels.
+  int bh = 4 * num_4x4_blocks_wide_lookup[txm_bsize];
+  int src_stride = p->src.stride;
+  uint8_t *src = &p->src.buf[4 * blk_row * src_stride + 4 * blk_col];
+  uint8_t *dst = &pd->dst.buf[4 * blk_row * pd->dst.stride + 4 * blk_col];
+  // Scratch reconstruction buffer with a fixed stride of 32 pixels.
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, rec_buffer, 32 * 32);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  int64_t this_sse;
+  const int ss_txfrm_size = tx_size << 1;
+  int shift = tx_size == TX_32X32 ? 0 : 2;
+  tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+
+  // The pixel-domain locals above are only consumed by the #else branch.
+  // Silence them one at a time: a comma chain casts only its first operand.
+  (void)cpi;
+  (void)tmp_sse;
+  (void)txm_bsize;
+  (void)bh;
+  (void)src_stride;
+  (void)src;
+  (void)dst;
+  (void)rec_buffer;
+
+  vp9_xform_quant_inter(x, plane, block, blk_row, blk_col,
+                        plane_bsize, tx_size);
+  // Assign rather than accumulate so this branch honours the same output
+  // contract as the pixel-domain branch below.
+  *dist = vp9_highbd_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
+                                 &this_sse, xd->bd) >> shift;
+  *bsse = this_sse >> shift;
+#else
+  // TODO(jingning) refactor the data structure to save repeated computation.
+  int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
+  int max_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
+
+  // A negative edge distance shrinks the number of 4x4 blocks to evaluate.
+  if (xd->mb_to_bottom_edge < 0)
+    max_blocks_high += xd->mb_to_bottom_edge >> (5 + pd->subsampling_y);
+  if (xd->mb_to_right_edge < 0)
+    max_blocks_wide += xd->mb_to_right_edge >> (5 + pd->subsampling_x);
+
+  // Snapshot the current dst pixels; the residual is added to this copy so
+  // the dst buffer itself is left untouched.
+  vp9_convolve_copy(dst, pd->dst.stride, rec_buffer, 32,
+                    NULL, 0, NULL, 0, bh, bh);
+
+  // SSE before the residual is added. When the transform block crosses the
+  // clamped block limits, only the 8x8 tiles inside them are measured.
+  if ((bh >> 2) + blk_col > max_blocks_wide ||
+      (bh >> 2) + blk_row > max_blocks_high) {
+    int idx, idy;
+    unsigned int this_sse;
+    int blocks_height = MIN(bh >> 2, max_blocks_high - blk_row);
+    int blocks_width  = MIN(bh >> 2, max_blocks_wide - blk_col);
+    for (idy = 0; idy < blocks_height; idy += 2) {
+      for (idx = 0; idx < blocks_width; idx += 2) {
+        cpi->fn_ptr[BLOCK_8X8].vf(src + 4 * idy * src_stride + 4 * idx,
+                                  src_stride,
+                                  rec_buffer + 4 * idy * 32 + 4 * idx,
+                                  32, &this_sse);
+        tmp_sse += this_sse;
+      }
+    }
+  } else {
+    cpi->fn_ptr[txm_bsize].vf(src, src_stride,
+                              rec_buffer, 32, &tmp_sse);
+  }
+  *bsse = (int64_t)tmp_sse * 16;
+
+  vp9_xform_quant_inter(x, plane, block, blk_row, blk_col,
+                        plane_bsize, tx_size);
+
+  // With no coefficients the reconstruction equals the snapshot, so tmp_sse
+  // keeps its value and *dist ends up equal to *bsse.
+  if (p->eobs[block] > 0) {
+    switch (tx_size) {
+      case TX_32X32:
+        vp9_idct32x32_add(dqcoeff, rec_buffer, 32, p->eobs[block]);
+        break;
+      case TX_16X16:
+        vp9_idct16x16_add(dqcoeff, rec_buffer, 32, p->eobs[block]);
+        break;
+      case TX_8X8:
+        vp9_idct8x8_add(dqcoeff, rec_buffer, 32, p->eobs[block]);
+        break;
+      case TX_4X4:
+        // this is like vp9_short_idct4x4 but has a special case around eob<=1
+        // which is significant (not just an optimization) for the lossless
+        // case.
+        x->itxm_add(dqcoeff, rec_buffer, 32, p->eobs[block]);
+        break;
+      default:
+        assert(0 && "Invalid transform size");
+        break;
+    }
+
+    // SSE after reconstruction, measured over the same area as above.
+    if ((bh >> 2) + blk_col > max_blocks_wide ||
+        (bh >> 2) + blk_row > max_blocks_high) {
+      int idx, idy;
+      unsigned int this_sse;
+      int blocks_height = MIN(bh >> 2, max_blocks_high - blk_row);
+      int blocks_width  = MIN(bh >> 2, max_blocks_wide - blk_col);
+      tmp_sse = 0;
+      for (idy = 0; idy < blocks_height; idy += 2) {
+        for (idx = 0; idx < blocks_width; idx += 2) {
+          cpi->fn_ptr[BLOCK_8X8].vf(src + 4 * idy * src_stride + 4 * idx,
+                                    src_stride,
+                                    rec_buffer + 4 * idy * 32 + 4 * idx,
+                                    32, &this_sse);
+          tmp_sse += this_sse;
+        }
+      }
+    } else {
+      cpi->fn_ptr[txm_bsize].vf(src, src_stride,
+                                rec_buffer, 32, &tmp_sse);
+    }
+  }
+  *dist = (int64_t)tmp_sse * 16;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  // Collapse the per-4x4 contexts spanned by the transform block into one
+  // flag: non-zero when any covered entry is non-zero.
+  switch (tx_size) {
+    case TX_4X4:
+      break;
+    case TX_8X8:
+      ta[0] = !!*(const uint16_t *)&ta[0];
+      tl[0] = !!*(const uint16_t *)&tl[0];
+      break;
+    case TX_16X16:
+      ta[0] = !!*(const uint32_t *)&ta[0];
+      tl[0] = !!*(const uint32_t *)&tl[0];
+      break;
+    case TX_32X32:
+      ta[0] = !!*(const uint64_t *)&ta[0];
+      tl[0] = !!*(const uint64_t *)&tl[0];
+      break;
+    default:
+      assert(0 && "Invalid transform size.");
+      break;
+  }
+  pt = (ta[0] != 0) + (tl[0] != 0);
+  *zero_blk_rate =
+      x->token_costs[tx_size][pd->plane_type][1][0][0][pt][EOB_TOKEN];
+  *rate = cost_coeffs(x, plane, block, ta, tl, tx_size,
+                      sc->scan, sc->neighbors, 0);
+
+  // Broadcast the first context entry over every 4x4 column/row covered by
+  // this transform block.
+  for (i = 0; i < (1 << tx_size); ++i) {
+    ta[i] = ta[0];
+    tl[i] = tl[0];
+  }
+  *skip = (p->eobs[block] == 0);
+}
+
+// Recursively chooses between coding the transform block at
+// (blk_row, blk_col) with a single transform of size tx_size or splitting it
+// into four quadrants of size tx_size - 1, keeping the cheaper RD cost.
+//
+// blk_row / blk_col are in 4x4 units; txb_bsize is the block size matching
+// tx_size. ta / tl are the entropy contexts and txa / txl the transform
+// partition contexts of the enclosing block; all four are updated to reflect
+// the winning choice. The decision is recorded in mbmi->inter_tx_size,
+// mbmi->tx_size and x->blk_skip. *rate, *dist, *bsse and *skip are always
+// reset on entry and then hold the cost of the winning choice.
+static void select_tx_block(const VP9_COMP *cpi, MACROBLOCK *x,
+                            int blk_row, int blk_col, int plane, int block,
+                            TX_SIZE tx_size, BLOCK_SIZE plane_bsize,
+                            BLOCK_SIZE txb_bsize,
+                            ENTROPY_CONTEXT *ta, ENTROPY_CONTEXT *tl,
+                            TXFM_CONTEXT *txa, TXFM_CONTEXT *txl,
+                            int *rate, int64_t *dist,
+                            int64_t *bsse, int *skip) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  // Index into mbmi->inter_tx_size, which is addressed with a row stride of 8.
+  int tx_idx = (blk_row >> (1 - pd->subsampling_y)) * 8 +
+               (blk_col >> (1 - pd->subsampling_x));
+  int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
+  int max_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
+  // Captured before the edge clamp below so x->blk_skip keeps a fixed stride.
+  int block_stride = max_blocks_wide;
+  int mi_width = num_8x8_blocks_wide_lookup[txb_bsize];
+  int mi_height = num_8x8_blocks_high_lookup[txb_bsize];
+  // Stays INT64_MAX when the no-split candidate is not evaluated.
+  int64_t this_rd = INT64_MAX;
+  // Scratch contexts handed to the split candidate so the caller's contexts
+  // are only overwritten if the split wins.
+  ENTROPY_CONTEXT ctxa[16], ctxl[16];
+  ENTROPY_CONTEXT *pta = ta + (blk_col >> pd->subsampling_x);
+  ENTROPY_CONTEXT *ptl = tl + (blk_row >> pd->subsampling_y);
+  TXFM_CONTEXT stxa[8], stxl[8];
+  int ctx = txfm_partition_context(txa + (blk_col / 2),
+                                   txl + (blk_row / 2),
+                                   mbmi->max_tx_size,
+                                   tx_size);
+  int zero_blk_rate;
+
+  vpx_memcpy(ctxa, ta, sizeof(ENTROPY_CONTEXT) * max_blocks_wide);
+  vpx_memcpy(ctxl, tl, sizeof(ENTROPY_CONTEXT) * max_blocks_high);
+
+  // Store the above and left transform block partition context.
+  vpx_memcpy(stxa + (blk_col / 2), txa + (blk_col / 2),
+             sizeof(TXFM_CONTEXT) * mi_width);
+  vpx_memcpy(stxl + (blk_row / 2), txl + (blk_row / 2),
+             sizeof(TXFM_CONTEXT) * mi_height);
+
+  // A negative edge distance reduces the number of 4x4 blocks considered.
+  if (xd->mb_to_bottom_edge < 0)
+    max_blocks_high += xd->mb_to_bottom_edge >> (5 + pd->subsampling_y);
+  if (xd->mb_to_right_edge < 0)
+    max_blocks_wide += xd->mb_to_right_edge >> (5 + pd->subsampling_x);
+
+  *rate = 0;
+  *dist = 0;
+  *bsse = 0;
+  *skip = 1;
+
+  // Blocks starting beyond the clamped limits contribute nothing.
+  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide)
+    return;
+
+  mbmi->inter_tx_size[tx_idx] = tx_size;
+  mbmi->tx_size = tx_size;
+
+  // Candidate 1: code the whole block with one transform of size tx_size.
+  if (cpi->common.tx_mode == TX_MODE_SELECT || tx_size == TX_4X4) {
+    tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block,
+                  plane_bsize, ta, tl, &zero_blk_rate, rate, dist, bsse, skip);
+    txfm_partition_update(txa + (blk_col / 2), txl + (blk_row / 2), tx_size);
+
+    if (*skip == 1) {
+      x->blk_skip[plane][blk_row * block_stride + blk_col] = 1;
+    } else {
+      // Coding the coefficients is no better than dropping them: treat the
+      // block as all-zero and clear the entropy contexts it covers.
+      if (RDCOST(x->rdmult, x->rddiv, *rate, *dist) >=
+          RDCOST(x->rdmult, x->rddiv, zero_blk_rate, *bsse)) {
+        int i;
+        *rate = zero_blk_rate;
+        *dist = *bsse;
+        *skip = 1;
+        x->blk_skip[plane][blk_row * block_stride + blk_col] = 1;
+        for (i = 0; i < (1 << tx_size); ++i) {
+          pta[i] = 0;
+          ptl[i] = 0;
+        }
+      } else {
+        x->blk_skip[plane][blk_row * block_stride + blk_col] = 0;
+      }
+    }
+
+    // Cost of the partition bit with value 0, charged from TX_8X8 upwards.
+    if (tx_size >= TX_8X8)
+      *rate += vp9_cost_bit(cpi->common.fc->txfm_partition_prob[ctx], 0);
+    this_rd = RDCOST(x->rdmult, x->rddiv, *rate, *dist);
+  }
+
+  // Candidate 2: split into four quadrants of the next smaller transform.
+  if (tx_size > TX_4X4) {
+    int bh = num_4x4_blocks_high_lookup[txb_bsize];
+    // Amount added to the block index for each successive quadrant.
+    int sub_step = 1 << (2 * (tx_size - 1));
+    int i;
+    int this_rate, sum_rate;
+    int64_t this_dist, sum_dist = 0;
+    int64_t this_bsse, sum_bsse = 0;
+    int this_skip, all_skip = 1;
+    int64_t sum_rd;
+
+    // Start from the cost of the partition bit with value 1.
+    sum_rate = vp9_cost_bit(cpi->common.fc->txfm_partition_prob[ctx], 1);
+    for (i = 0; i < 4; ++i) {
+      int offsetr = (i >> 1) * bh / 2;
+      int offsetc = (i & 0x01) * bh / 2;
+      select_tx_block(cpi, x, blk_row + offsetr, blk_col + offsetc,
+                      plane, block + i * sub_step, tx_size - 1,
+                      plane_bsize, txsize_to_bsize[tx_size - 1],
+                      ctxa, ctxl, stxa, stxl, &this_rate, &this_dist,
+                      &this_bsse, &this_skip);
+      sum_rate += this_rate;
+      sum_dist += this_dist;
+      sum_bsse += this_bsse;
+      all_skip &= this_skip;
+    }
+    sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
+
+    if (this_rd < sum_rd) {
+      // No split: the recursion above overwrote these records, so restore
+      // them for the whole area covered by this block.
+      int idx, idy;
+      for (idy = blk_row; idy < blk_row + bh; idy += 2)
+        for (idx = blk_col; idx < blk_col + bh; idx += 2)
+          mbmi->inter_tx_size[(idy / 2) * 8 + (idx / 2)] = tx_size;
+      mbmi->tx_size = tx_size;
+      x->blk_skip[plane][blk_row * block_stride + blk_col] = *skip;
+    } else {
+      // Split wins: report the summed costs and publish the scratch contexts
+      // produced by the four sub-blocks.
+      *rate = sum_rate;
+      *dist = sum_dist;
+      *bsse = sum_bsse;
+      *skip = all_skip;
+      vpx_memcpy(pta, ctxa + (blk_col >> pd->subsampling_x),
+          sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide_lookup[txb_bsize]);
+      vpx_memcpy(ptl, ctxl + (blk_row >> pd->subsampling_y),
+          sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high_lookup[txb_bsize]);
+      vpx_memcpy(txa + (blk_col / 2), stxa + (blk_col / 2),
+                 sizeof(TXFM_CONTEXT) * mi_width);
+      vpx_memcpy(txl + (blk_row / 2), stxl + (blk_row / 2),
+                 sizeof(TXFM_CONTEXT) * mi_height);
+    }
+  }
+}
+
+// Luma rate-distortion cost of an inter block when the transform size may be
+// chosen per transform block. The block is tiled with units of the largest
+// transform size recorded in the mode info, and select_tx_block() picks the
+// partitioning of each tile. On success *rate, *distortion, *sse and
+// *skippable hold the totals over all tiles. When ref_best_rd is negative, or
+// the best achievable cost exceeds it, the outputs are set to
+// INT_MAX / INT64_MAX / INT64_MAX / 0.
+static void inter_block_yrd(const VP9_COMP *cpi, MACROBLOCK *x,
+                            int *rate, int64_t *distortion, int *skippable,
+                            int64_t *sse, BLOCK_SIZE bsize,
+                            int64_t ref_best_rd) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const vp9_prob skip_prob = vp9_get_skip_prob(&cpi->common, xd);
+  const int s0 = vp9_cost_bit(skip_prob, 0);
+  const int s1 = vp9_cost_bit(skip_prob, 1);
+  int cost_ok = !(ref_best_rd < 0);
+
+  *rate = 0;
+  *distortion = 0;
+  *sse = 0;
+  *skippable = 1;
+
+  if (cost_ok) {
+    const TX_SIZE max_tx_size = xd->mi[0].mbmi.max_tx_size;
+    const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size];
+    const int blocks_wide = num_4x4_blocks_wide_lookup[bsize];
+    const int blocks_high = num_4x4_blocks_high_lookup[bsize];
+    // Tile edge length, in 4x4 units, of the largest transform.
+    const int tile_dim = num_4x4_blocks_wide_lookup[txb_size];
+    const int block_step = 1 << (max_tx_size * 2);
+    ENTROPY_CONTEXT ctxa[16], ctxl[16];
+    TXFM_CONTEXT txa[8], txl[8];
+    int64_t coded_rd, skipped_rd;
+    int block = 0;
+    int row, col;
+
+    vp9_get_entropy_contexts(bsize, TX_4X4, &xd->plane[0], ctxa, ctxl);
+
+    // Work on local copies of the transform partition contexts.
+    vpx_memcpy(txa, xd->above_txfm_context,
+               sizeof(TXFM_CONTEXT) * num_8x8_blocks_wide_lookup[bsize]);
+    vpx_memcpy(txl, xd->left_txfm_context,
+               sizeof(TXFM_CONTEXT) * num_8x8_blocks_high_lookup[bsize]);
+
+    for (row = 0; row < blocks_high; row += tile_dim) {
+      for (col = 0; col < blocks_wide; col += tile_dim) {
+        int tile_rate = 0, tile_skip = 1;
+        int64_t tile_dist = 0, tile_sse = 0;
+
+        select_tx_block(cpi, x, row, col, 0, block,
+                        max_tx_size, bsize, txb_size,
+                        ctxa, ctxl, txa, txl,
+                        &tile_rate, &tile_dist, &tile_sse, &tile_skip);
+        *rate += tile_rate;
+        *distortion += tile_dist;
+        *sse += tile_sse;
+        *skippable &= tile_skip;
+
+        block += block_step;
+      }
+    }
+
+    // Compare the cheaper of "code the residual" and "skip it" against the
+    // caller's budget.
+    coded_rd = RDCOST(x->rdmult, x->rddiv, *rate + s0, *distortion);
+    skipped_rd = RDCOST(x->rdmult, x->rddiv, s1, *sse);
+    if (MIN(coded_rd, skipped_rd) > ref_best_rd)
+      cost_ok = 0;
+  }
+
+  if (!cost_ok) {
+    // No usable cost: hand back the sentinel values.
+    *rate = INT_MAX;
+    *distortion = INT64_MAX;
+    *sse = INT64_MAX;
+    *skippable = 0;
+  }
+}
+
+// Walks the transform partitioning already stored in mbmi->inter_tx_size and
+// adds the cost of every transform block under (blk_row, blk_col) to *rate,
+// *dist and *bsse; *skip is AND-ed with each block's skip flag. Blocks that
+// start beyond the clamped block limits leave the accumulators untouched.
+static void tx_block_rd(const VP9_COMP *cpi, MACROBLOCK *x,
+                        int blk_row, int blk_col, int plane, int block,
+                        TX_SIZE tx_size, BLOCK_SIZE plane_bsize,
+                        ENTROPY_CONTEXT *above_ctx, ENTROPY_CONTEXT *left_ctx,
+                        int *rate, int64_t *dist, int64_t *bsse, int *skip) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int tx_row = blk_row >> (1 - pd->subsampling_y);
+  const int tx_col = blk_col >> (1 - pd->subsampling_x);
+  const TX_SIZE stored_tx_size = mbmi->inter_tx_size[tx_row * 8 + tx_col];
+  TX_SIZE plane_tx_size = stored_tx_size;
+  int rows_limit = num_4x4_blocks_high_lookup[plane_bsize];
+  int cols_limit = num_4x4_blocks_wide_lookup[plane_bsize];
+
+  // Non-zero planes derive their transform size from the stored one.
+  if (plane)
+    plane_tx_size = get_uv_tx_size_impl(stored_tx_size, plane_bsize, 0, 0);
+
+  if (xd->mb_to_bottom_edge < 0)
+    rows_limit += xd->mb_to_bottom_edge >> (5 + pd->subsampling_y);
+  if (xd->mb_to_right_edge < 0)
+    cols_limit += xd->mb_to_right_edge >> (5 + pd->subsampling_x);
+
+  if (blk_row >= rows_limit || blk_col >= cols_limit)
+    return;
+
+  if (tx_size != plane_tx_size) {
+    // Not at the stored size yet: descend into the four quadrants.
+    const int half =
+        num_4x4_blocks_high_lookup[txsize_to_bsize[tx_size]] / 2;
+    const int quad_step = 1 << (2 * (tx_size - 1));
+    int quad_row, quad_col;
+
+    for (quad_row = 0; quad_row < 2; ++quad_row) {
+      for (quad_col = 0; quad_col < 2; ++quad_col) {
+        const int quad = quad_row * 2 + quad_col;
+        tx_block_rd(cpi, x, blk_row + quad_row * half,
+                    blk_col + quad_col * half, plane,
+                    block + quad * quad_step, tx_size - 1, plane_bsize,
+                    above_ctx, left_ctx, rate, dist, bsse, skip);
+      }
+    }
+  } else {
+    int leaf_zero_rate;
+    int leaf_rate, leaf_skip;
+    int64_t leaf_dist, leaf_bsse;
+
+    tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block,
+                  plane_bsize, above_ctx, left_ctx,
+                  &leaf_zero_rate, &leaf_rate,
+                  &leaf_dist, &leaf_bsse, &leaf_skip);
+    *rate += leaf_rate;
+    *dist += leaf_dist;
+    *bsse += leaf_bsse;
+    *skip &= leaf_skip;
+  }
+}
+
 // Return value 0: early termination triggered, no valid rd cost available;
 //              1: rd cost values are valid.
+// Rate-distortion cost of planes 1 .. MAX_MB_PLANE - 1 of an inter block,
+// following the transform partitioning stored in mbmi->inter_tx_size.
+//
+// On success returns 1 with *rate, *distortion, *sse and *skippable holding
+// the totals over those planes. Returns 0, with the outputs set to
+// INT_MAX / INT64_MAX / INT64_MAX / 0, when ref_best_rd is negative or the
+// running cost exceeds ref_best_rd.
+static int inter_block_uvrd(const VP9_COMP *cpi, MACROBLOCK *x,
+                            int *rate, int64_t *distortion, int *skippable,
+                            int64_t *sse, BLOCK_SIZE bsize,
+                            int64_t ref_best_rd) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
+  int plane;
+  int is_cost_valid = 1;
+  int64_t this_rd;
+
+  if (ref_best_rd < 0) {
+    is_cost_valid = 0;
+    // reset cost value
+    *rate = INT_MAX;
+    *distortion = INT64_MAX;
+    *sse = INT64_MAX;
+    *skippable = 0;
+    return is_cost_valid;
+  }
+
+  // Compute the residual of each of these planes up front. The function-level
+  // plane index is reused here; no inner declaration shadows it.
+  if (is_inter_block(mbmi)) {
+    for (plane = 1; plane < MAX_MB_PLANE; ++plane)
+      vp9_subtract_plane(x, bsize, plane);
+  }
+
+  *rate = 0;
+  *distortion = 0;
+  *sse = 0;
+  *skippable = 1;
+
+  for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
+    const struct macroblockd_plane *const pd = &xd->plane[plane];
+    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+    const TX_SIZE max_tx_size = max_txsize_lookup[plane_bsize];
+    const int mi_width = num_4x4_blocks_wide_lookup[plane_bsize];
+    const int mi_height = num_4x4_blocks_high_lookup[plane_bsize];
+    BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size];
+    // Tile edge length, in 4x4 units, of the largest transform in this plane.
+    int bh = num_4x4_blocks_wide_lookup[txb_size];
+    int idx, idy;
+    int block = 0;
+    int step = 1 << (max_tx_size * 2);
+    // tx_block_rd() accumulates into these, so they start from the identity.
+    int pnrate = 0, pnskip = 1;
+    int64_t pndist = 0, pnsse = 0;
+    ENTROPY_CONTEXT ta[16], tl[16];
+
+    vp9_get_entropy_contexts(bsize, TX_4X4, pd, ta, tl);
+
+    for (idy = 0; idy < mi_height; idy += bh) {
+      for (idx = 0; idx < mi_width; idx += bh) {
+        tx_block_rd(cpi, x, idy, idx, plane, block,
+                    max_tx_size, plane_bsize, ta, tl,
+                    &pnrate, &pndist, &pnsse, &pnskip);
+        block += step;
+      }
+    }
+
+    if (pnrate == INT_MAX) {
+      is_cost_valid = 0;
+      break;
+    }
+    *rate += pnrate;
+    *distortion += pndist;
+    *sse += pnsse;
+    *skippable &= pnskip;
+
+    // Stop as soon as the cheaper of "code" and "skip" exceeds the budget.
+    this_rd = MIN(RDCOST(x->rdmult, x->rddiv, *rate, *distortion),
+                  RDCOST(x->rdmult, x->rddiv, 0, *sse));
+    if (this_rd > ref_best_rd) {
+      is_cost_valid = 0;
+      break;
+    }
+  }
+
+  if (!is_cost_valid) {
+    // reset cost value
+    *rate = INT_MAX;
+    *distortion = INT64_MAX;
+    *sse = INT64_MAX;
+    *skippable = 0;
+  }
+
+  return is_cost_valid;
+}
+
 static int super_block_uvrd(const VP9_COMP *cpi, MACROBLOCK *x,
                            int *rate, int64_t *distortion, int *skippable,
                            int64_t *sse, BLOCK_SIZE bsize,
@ -1165,7 +1684,7 @@ static int super_block_uvrd(const VP9_COMP *cpi, MACROBLOCK *x,
  *skippable = 1;

  for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
-    txfm_rd_in_plane(x, &pnrate, &pndist, &pnskip, &pnsse,
+    txfm_rd_in_plane(cpi, x, &pnrate, &pndist, &pnskip, &pnsse,
                     ref_best_rd, plane, bsize, uv_tx_size,
                     cpi->sf.use_fast_coef_costing);
    if (pnrate == INT_MAX) {
@ -2446,6 +2965,8 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
  int64_t skip_sse_sb = INT64_MAX;
  int64_t distortion_y = 0, distortion_uv = 0;

+  (void)txfm_cache;
+
 #if CONFIG_VP9_HIGHBITDEPTH
  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
    tmp_buf = CONVERT_TO_BYTEPTR(tmp_buf16);
@ -2725,8 +3246,17 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,

    // Y cost and distortion
    vp9_subtract_plane(x, bsize, 0);
-    super_block_yrd(cpi, x, rate_y, &distortion_y, &skippable_y, psse,
-                    bsize, txfm_cache, ref_best_rd);
+
+    if (cm->tx_mode == TX_MODE_SELECT) {
+      inter_block_yrd(cpi, x, rate_y, &distortion_y, &skippable_y, psse,
+                      bsize, ref_best_rd);
+    } else {
+      super_block_yrd(cpi, x, rate_y, &distortion_y, &skippable_y, psse,
+                      bsize, txfm_cache, ref_best_rd);
+      // Pseudo load: copy the block-level tx_size into every inter_tx_size entry.
+      for (i = 0; i < 64; ++i)
+        mbmi->inter_tx_size[i] = mbmi->tx_size;
+    }

    if (*rate_y == INT_MAX) {
      *rate2 = INT_MAX;
@ -2741,7 +3271,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
    rdcosty = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion);
    rdcosty = MIN(rdcosty, RDCOST(x->rdmult, x->rddiv, 0, *psse));

-    if (!super_block_uvrd(cpi, x, rate_uv, &distortion_uv, &skippable_uv,
+    if (!inter_block_uvrd(cpi, x, rate_uv, &distortion_uv, &skippable_uv,
                          &sseuv, bsize, ref_best_rd - rdcosty)) {
      *rate2 = INT_MAX;
      *distortion = INT64_MAX;
@ -3266,6 +3796,13 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi,
    if (ref_frame == INTRA_FRAME) {
      TX_SIZE uv_tx;
      struct macroblockd_plane *const pd = &xd->plane[1];
+      MODE_INFO *const mic = xd->mi[0].src_mi;
+      const MODE_INFO *above_mi = xd->above_mi;
+      const MODE_INFO *left_mi = xd->left_mi;
+
+      const PREDICTION_MODE A = vp9_above_block_mode(mic, above_mi, 0);
+      const PREDICTION_MODE L = vp9_left_block_mode(mic, left_mi, 0);
+      int intra_mode_cost;
      vpx_memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
      super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable,
                      NULL, bsize, tx_cache, best_rd);
@ -3280,18 +3817,22 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi,
                             &dist_uv[uv_tx], &skip_uv[uv_tx], &mode_uv[uv_tx]);
      }

+      if (A == L)
+        intra_mode_cost = (this_mode == A) ? 256 : 1064;
+      else  // (A != L)
+        intra_mode_cost = (this_mode == A) || (this_mode == L) ? 404 : 1169;
+
      rate_uv = rate_uv_tokenonly[uv_tx];
      distortion_uv = dist_uv[uv_tx];
      skippable = skippable && skip_uv[uv_tx];
      mbmi->uv_mode = mode_uv[uv_tx];

-      rate2 = rate_y + cpi->mbmode_cost[mbmi->mode] + rate_uv_intra[uv_tx];
+      rate2 = rate_y + intra_mode_cost + rate_uv_intra[uv_tx];
      if (this_mode != DC_PRED && this_mode != TM_PRED)
        rate2 += intra_cost_penalty;
      distortion2 = distortion_y + distortion_uv;
    } else {
-      this_rd = handle_inter_mode(cpi, x, bsize,
-                                  tx_cache,
+      this_rd = handle_inter_mode(cpi, x, bsize, tx_cache,
                                  &rate2, &distortion2, &skippable,
                                  &rate_y, &rate_uv,
                                  &disable_skip, frame_mv,
@ -3393,6 +3934,10 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi,
        vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
                   sizeof(uint8_t) * ctx->num_4x4_blk);

+        for (i = 0; i < MAX_MB_PLANE; ++i)
+          vpx_memcpy(ctx->blk_skip[i], x->blk_skip[i],
+                     sizeof(uint8_t) * ctx->num_4x4_blk);
+
        // TODO(debargha): enhance this test with a better distortion prediction
        // based on qp, activity mask and history
        if ((mode_search_skip_flags & FLAG_EARLY_TERMINATE) &&
@ -3754,6 +4299,12 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi,
  vpx_memset(x->zcoeff_blk[TX_4X4], 0, 4);
  vp9_zero(best_mbmode);

+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    int j;
+    for (j = 0; j < 4; ++j)
+      x->blk_skip[i][j] = 0;
+  }
+
  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
    filter_cache[i] = INT64_MAX;

@ -3899,6 +4450,10 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi,
    x->skip = 0;
    set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);

+    // pseudo load: fill inter_tx_size[] with the block's single tx_size
+    for (i = 0; i < 64; ++i)
+      mbmi->inter_tx_size[i] = mbmi->tx_size;
+
    // Select prediction reference frames.
    for (i = 0; i < MAX_MB_PLANE; i++) {
      xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
@ -4159,6 +4714,10 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi,
        vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[TX_4X4],
                   sizeof(uint8_t) * ctx->num_4x4_blk);

+        for (i = 0; i < MAX_MB_PLANE; ++i)
+          vpx_memcpy(ctx->blk_skip[i], x->blk_skip[i],
+                     sizeof(uint8_t) * ctx->num_4x4_blk);
+
        for (i = 0; i < 4; i++)
          best_bmodes[i] = xd->mi[0].src_mi->bmi[i];

--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@ -580,7 +580,6 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
    mb_y_offset += 16 * (f->y_stride - mb_cols);
    mb_uv_offset += mb_uv_height * f->uv_stride - mb_uv_width * mb_cols;
  }
-
  // Restore input state
  for (i = 0; i < MAX_MB_PLANE; i++)
    mbd->plane[i].pre[0].buf = input_buffer[i];
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@ -442,6 +442,20 @@ struct tokenize_b_args {
  TOKENEXTRA **tp;
 };

+static void set_entropy_context_b_inter(int plane, int block,  /* Context-only pass: calls vp9_set_contexts(), writes no tokens. */
+                                        BLOCK_SIZE plane_bsize,
+                                        int blk_row, int blk_col,  /* forwarded to vp9_set_contexts() as (blk_col, blk_row) */
+                                        TX_SIZE tx_size, void *arg) {  /* arg is read as struct tokenize_b_args * */
+  struct tokenize_b_args* const args = arg;
+  ThreadData *const td = args->td;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct macroblock_plane *p = &x->plane[plane];
+  struct macroblockd_plane *pd = &xd->plane[plane];
+  vp9_set_contexts(xd, pd, plane_bsize, tx_size, p->eobs[block] > 0,  /* flag is 1 when this block's eob is nonzero */
+                   blk_col, blk_row);  /* note the order: column first, then row */
+}
+
 static void set_entropy_context_b(int plane, int block, BLOCK_SIZE plane_bsize,
                                  TX_SIZE tx_size, void *arg) {
  struct tokenize_b_args* const args = arg;
@ -486,6 +500,85 @@ static INLINE int get_tx_eob(const struct segmentation *seg, int segment_id,
  return vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;
 }

+static void tokenize_b_inter(int plane, int block, BLOCK_SIZE plane_bsize,  /* Emits the coefficient tokens of one transform block and updates counts. */
+                             int blk_row, int blk_col,  /* offsets applied to pd->left_context / pd->above_context */
+                             TX_SIZE tx_size, void *arg) {  /* arg is read as struct tokenize_b_args * */
+  struct tokenize_b_args* const args = arg;
+  VP9_COMP *cpi = args->cpi;
+  ThreadData *const td = args->td;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  TOKENEXTRA **tp = args->tp;
+  uint8_t token_cache[32 * 32];  /* written at scan[c] as tokens are emitted; read back by get_coef_context() */
+  struct macroblock_plane *p = &x->plane[plane];
+  struct macroblockd_plane *pd = &xd->plane[plane];
+  MB_MODE_INFO *mbmi = &xd->mi[0].src_mi->mbmi;
+  int pt; /* near block/prev token context index */
+  int c;
+  TOKENEXTRA *t = *tp;        /* store tokens starting here */
+  int eob = p->eobs[block];
+  const PLANE_TYPE type = pd->plane_type;
+  const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  const int segment_id = mbmi->segment_id;
+  const int16_t *scan, *nb;
+  const scan_order *so;
+  const int ref = is_inter_block(mbmi);
+  unsigned int (*const counts)[COEFF_CONTEXTS][ENTROPY_TOKENS] =
+      td->rd_counts.coef_counts[tx_size][type][ref];  /* token counts are taken from td->rd_counts ... */
+  vp9_prob (*const coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
+      cpi->common.fc->coef_probs[tx_size][type][ref];
+  unsigned int (*const eob_branch)[COEFF_CONTEXTS] =
+      td->counts->eob_branch[tx_size][type][ref];  /* ... whereas eob_branch is taken from td->counts */
+  const uint8_t *const band = get_band_translate(tx_size);
+  const int seg_eob = get_tx_eob(&cpi->common.seg, segment_id, tx_size);
+  int16_t token;
+  EXTRABIT extra;
+
+  pt = get_entropy_context(tx_size, pd->above_context + blk_col,  /* initial context from the above/left context arrays */
+                           pd->left_context + blk_row);
+  so = get_scan(xd, tx_size, type, block);
+  scan = so->scan;
+  nb = so->neighbors;
+  c = 0;
+
+  while (c < eob) {  /* each pass emits a run of ZERO_TOKENs followed by one nonzero token */
+    int v = 0;
+    int skip_eob = 0;  /* 0 only for the first token emitted in this pass */
+    v = qcoeff[scan[c]];
+
+    while (!v) {  /* no bound check on c here: presumably a nonzero coeff always precedes eob - TODO confirm */
+      add_token_no_extra(&t, coef_probs[band[c]][pt], ZERO_TOKEN, skip_eob,
+                         counts[band[c]][pt]);
+      eob_branch[band[c]][pt] += !skip_eob;  /* incremented only while skip_eob is still 0 */
+
+      skip_eob = 1;
+      token_cache[scan[c]] = 0;
+      ++c;
+      pt = get_coef_context(nb, token_cache, c);
+      v = qcoeff[scan[c]];
+    }
+
+    vp9_get_token_extra(v, &token, &extra);
+
+    add_token(&t, coef_probs[band[c]][pt], extra, (uint8_t)token,
+              (uint8_t)skip_eob, counts[band[c]][pt]);
+    eob_branch[band[c]][pt] += !skip_eob;
+
+    token_cache[scan[c]] = vp9_pt_energy_class[token];
+    ++c;
+    pt = get_coef_context(nb, token_cache, c);
+  }
+  if (c < seg_eob) {  /* EOB_TOKEN is added only when the loop stopped short of seg_eob */
+    add_token_no_extra(&t, coef_probs[band[c]][pt], EOB_TOKEN, 0,
+                       counts[band[c]][pt]);
+    ++eob_branch[band[c]][pt];
+  }
+
+  *tp = t;  /* hand the advanced token write position back through args->tp */
+
+  vp9_set_contexts(xd, pd, plane_bsize, tx_size, c > 0, blk_col, blk_row);  /* flag is 1 when at least one position was tokenized */
+}
+
 static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
                       TX_SIZE tx_size, void *arg) {
  struct tokenize_b_args* const args = arg;
@ -607,6 +700,105 @@ int vp9_has_high_freq_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
  return result;
 }

+void tokenize_tx(VP9_COMP *cpi, ThreadData *td, TOKENEXTRA **t,  /* NOTE(review): external linkage, but the vp9_tokenize.h hunk adds no prototype - confirm whether static was intended */
+                 int dry_run, TX_SIZE tx_size, BLOCK_SIZE plane_bsize,  /* Recursive walk: handles the block at tx_size if it matches the stored size, else splits in four. */
+                 int blk_row, int blk_col, int block, int plane,
+                 void *arg) {  /* arg is passed through to the per-block callbacks; cpi and t are only forwarded to the recursive call */
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  int tx_idx = (blk_row >> (1 - pd->subsampling_y)) * 8 +  /* index into mbmi->inter_tx_size[] with a row stride of 8 */
+               (blk_col >> (1 - pd->subsampling_x));
+  TX_SIZE plane_tx_size = plane ?
+      get_uv_tx_size_impl(mbmi->inter_tx_size[tx_idx], plane_bsize, 0, 0) :  /* planes > 0: size derived from the stored entry */
+      mbmi->inter_tx_size[tx_idx];  /* plane 0: stored entry used as is */
+
+  int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
+  int max_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
+
+  if (xd->mb_to_bottom_edge < 0)  /* a negative edge value reduces the row limit */
+    max_blocks_high += xd->mb_to_bottom_edge >> (5 + pd->subsampling_y);
+  if (xd->mb_to_right_edge < 0)  /* a negative edge value reduces the column limit */
+    max_blocks_wide += xd->mb_to_right_edge >> (5 + pd->subsampling_x);
+
+  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide)
+    return;  /* position lies outside the clipped extent: nothing to do */
+
+  if (tx_size == plane_tx_size) {  /* leaf: tokenize, or only update contexts on a dry run */
+    if (!dry_run)
+      tokenize_b_inter(plane, block, plane_bsize,
+                       blk_row, blk_col, tx_size, arg);
+    else
+      set_entropy_context_b_inter(plane, block, plane_bsize,
+                                  blk_row, blk_col, tx_size, arg);
+  } else {  /* sizes differ: recurse into four quadrants at tx_size - 1 */
+    BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+    int bh = num_4x4_blocks_wide_lookup[bsize];  /* width lookup serves both directions; the assert below checks width == height */
+    int i;
+
+    assert(num_4x4_blocks_high_lookup[bsize] ==
+           num_4x4_blocks_wide_lookup[bsize]);
+
+    for (i = 0; i < 4; ++i) {  /* quadrant order: top-left, top-right, bottom-left, bottom-right */
+      int offsetr = (i >> 1) * bh / 2;
+      int offsetc = (i & 0x01) * bh / 2;
+      int step = 1 << (2 * (tx_size - 1));  /* block-index stride per quadrant; NOTE(review): negative shift if tx_size is 0 - confirm unreachable */
+      tokenize_tx(cpi, td, t, dry_run, tx_size - 1, plane_bsize,
+                  blk_row + offsetr, blk_col + offsetc,
+                  block + i * step, plane, arg);
+    }
+  }
+}
+
+void vp9_tokenize_sb_inter(VP9_COMP *cpi, ThreadData *td, TOKENEXTRA **t,  /* Walks every plane of the current block and hands each max-size transform unit to tokenize_tx(). */
+                           int dry_run, BLOCK_SIZE bsize) {  /* dry_run != 0: no skip counts are updated and *t is restored */
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
+  TOKENEXTRA *t_backup = *t;  /* value written back to *t on dry runs */
+  const int ctx = vp9_get_skip_context(xd);
+  const int skip_inc = !vp9_segfeature_active(&cm->seg, mbmi->segment_id,  /* 0 when SEG_LVL_SKIP is active, so the skip counts stay unchanged */
+                                              SEG_LVL_SKIP);
+  struct tokenize_b_args arg = {cpi, td, t};
+  int plane;
+
+  if (mbmi->skip) {  /* skipped block: count it, reset the skip context, visit no planes */
+    if (!dry_run)
+      td->counts->skip[ctx][1] += skip_inc;
+    reset_skip_context(xd, bsize);
+    if (dry_run)
+      *t = t_backup;
+    return;
+  }
+
+  if (!dry_run)
+    td->counts->skip[ctx][0] += skip_inc;
+  else
+    *t = t_backup;  /* done before the plane loop; on dry runs tokenize_tx() only updates contexts */
+
+  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    const struct macroblockd_plane *const pd = &xd->plane[plane];
+    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+    const int mi_width = num_4x4_blocks_wide_lookup[plane_bsize];  /* despite the mi_ prefix, read from the num_4x4 lookup */
+    const int mi_height = num_4x4_blocks_high_lookup[plane_bsize];
+    BLOCK_SIZE txb_size = txsize_to_bsize[max_txsize_lookup[plane_bsize]];
+    int bh = num_4x4_blocks_wide_lookup[txb_size];  /* used as the loop stride for both idx and idy */
+    int idx, idy;
+    int block = 0;
+    int step = 1 << (max_txsize_lookup[plane_bsize] * 2);  /* block-index advance per max-size transform unit */
+
+    for (idy = 0; idy < mi_height; idy += bh) {
+      for (idx = 0; idx < mi_width; idx += bh) {
+        tokenize_tx(cpi, td, t, dry_run, max_txsize_lookup[plane_bsize],  /* recursion starts at the largest size for this plane block */
+                    plane_bsize, idy, idx, block, plane, &arg);  /* idy -> blk_row, idx -> blk_col */
+        block += step;
+      }
+    }
+  }
+}
+
 void vp9_tokenize_sb(VP9_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
                     int dry_run, BLOCK_SIZE bsize) {
  VP9_COMMON *const cm = &cpi->common;
--- a/vp9/encoder/vp9_tokenize.h
+++ b/vp9/encoder/vp9_tokenize.h
@ -51,6 +51,9 @@ int vp9_has_high_freq_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
 struct VP9_COMP;
 struct ThreadData;

+void vp9_tokenize_sb_inter(struct VP9_COMP *cpi, struct ThreadData *td,
+                           TOKENEXTRA **t, int dry_run, BLOCK_SIZE bsize);
+
 void vp9_tokenize_sb(struct VP9_COMP *cpi, struct ThreadData *td,
                     TOKENEXTRA **t, int dry_run, BLOCK_SIZE bsize);

--- a/vpx/vpx_encoder.h
+++ b/vpx/vpx_encoder.h
@ -59,7 +59,7 @@ extern "C" {
   * types, removing or reassigning enums, adding/removing/rearranging
   * fields to structures
   */
-#define VPX_ENCODER_ABI_VERSION (4 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/
+#define VPX_ENCODER_ABI_VERSION (5 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/


  /*! \brief Encoder capabilities bitfield
Author	SHA1	Message	Date
Jingning Han	ac50b75e50	Use balanced model for intra prediction mode coding This commit replaces the previous table based intra mode model coding with a more balanced entropy coding system. It reduces the decoder lookup table size by 1K bytes. The key frame compression performance is about even on average. There are a few points where the compression performance is improved by over 5%. Most test points are fairly close to the lookup table approach. Change-Id: I47154276c0a6a22ae87de8845bc2d494681b95f6	2015-06-23 16:42:56 -07:00
Jingning Han	81c389e790	Make tx partition entropy coder account for block size This commit allows the entropy coder for transform block partition to account for its relative position with respect to the block size. Change-Id: I2b5019c378bfb58c11b926fa50c0db1933f35852	2015-06-18 21:56:30 +00:00
Jingning Han	0a42a1efd4	Add max_tx_size to MB_MODE_INFO Refactor the recursive transform block partition to reduce repeated computation maximum transform block size per block. Change-Id: Ib408c78dc6923fe7d337dc937e74f2701ac63859	2015-06-18 14:54:49 -07:00
Jingning Han	2aa2ef4094	Make loop filter support variable transform block size This commit refactors the loop filter implementation to make it support recursive transform block partition. Change-Id: Ica2daa9cb54730cff7770ee2c2d7ffdb240ff418	2015-06-16 18:56:47 -07:00
Jingning Han	85c220b2c4	Turn on loop filter Temporarily use univariate transform size for loop filter. As compared to VP9 master branch with loop filter turned on, the compression gains are: derf 0.671% mr 0.749% stdhd 0.886% hr 1.394% The encoding speed currently is about 1.3X that of speed 0. Change-Id: I64788f894e70fde14c5be3159501bedf836e5998	2015-06-16 08:49:13 -07:00
Jingning Han	7cbea06386	Update transform block partition information for intra blocks If a block is coded in the intra modes, update the transform block partition information as maximum block size. Change-Id: I5ea440c700fc887ff2fe84fabde77a9d896d16f4	2015-06-15 15:53:19 -07:00
Jingning Han	a4fd58a761	Refactor tx_block_rd_b() to compute per block rd cost This commit makes the tx_block_rd_b() compute the rate and distortion cost per transform block, instead of accumulating these costs. Change-Id: Iff5adc4c27cc54f8e6eb3abd95f8d88ba00f462c	2015-06-15 09:08:00 -07:00
Jingning Han	e272e5b8fb	Skip redundant flag reset If the skip flag is already on, there is no need to further check the all zero block case. This improves encoding speed at no coding statistics change. Change-Id: Icab997ca2977e650351a47ff1314def5ac4ecb1d	2015-06-12 11:44:01 -07:00
Jingning Han	5180368403	Allow encoder to force all zero coefficient block This commit allows the encoder to force all zero quantized coefficient block per transform block, if that provides better rate-distortion trade-off. Change-Id: I5b57b28cccd257ebfaf7c1749dda7be482abc834	2015-06-12 09:18:10 -07:00
Jingning Han	63c0d8df9f	Assign largest transform block size to skip block If a block has all coefficients quantized to zero, the codec will assume that it uses largest transform block size. Change-Id: I1a32527e50026e8e4759ad8de474189cd20e89c8	2015-06-11 11:01:44 -07:00
Jingning Han	9ce132ac37	Refactor transform block partition entropy coding This commit refactors the transform block partition entropy coding process to improve the encoding speed. There is no change in the compression statistics. Change-Id: I237466fd95c1b888df432babfa36e01f74240eef	2015-06-11 09:41:20 -07:00
Jingning Han	9692042493	Refactor transform block partition update process Unify transform block partition update process used in rate distortion optimization and encoding stage. Change-Id: I4e5f2b6d2482c53ceadb7c8743435158f229a82c	2015-06-10 10:01:31 -07:00
Jingning Han	87a0d5436b	Account for context information for partition rate estimate This commit allows the encoder to account for the boundary block information to estimate the transform block partition rate cost in the rate-distortion optimization scheme. Change-Id: Idb79cf936d96cdd15bcba27e47318295413a5f5d	2015-06-09 15:53:55 -07:00
Jingning Han	948c6d882e	Enable transform block partition entropy coding Select the probability model for transform block partition coding conditioned on the neighbor transform block sizes. Change-Id: Ib701296e59009bad97dbd21d8dcd58bc5e552f39	2015-06-09 12:30:52 -07:00
Jingning Han	79d6b8fc85	Properly handle boundary block rate distortion computation This commit makes the encoder to properly compute the rate distortion cost for blocks that partially cover extend pixels. Change-Id: I44529af6f76925cdc0f6b24a5d190b51b0813983	2015-06-09 11:14:24 -07:00
Jingning Han	b54dd00f53	Align the intra and inter mode cost measurement This commit aligns the measurement method used to evaluate both intra and inter modes. Change-Id: I8071584ce87fa3c5401800363daa0e670de29af5	2015-06-05 11:37:21 -07:00
Jingning Han	3239e22a42	Conditionally use recursive transform block partition search If the frame header sets to use fixed transform block size, use the univariate transform block partition search flow. Change-Id: Ic422ecb6565642cd8ddb96dc67a37109ef3ce90f	2015-06-03 11:14:26 -07:00
Jingning Han	a96f2ca319	Rework the rate and distortion computation pipeline This allows the encoder to use more precise rate and distortion costs for mode decision. Change-Id: I7cfd676a88531a194b9a509375feea8365e5ef12	2015-06-02 23:15:09 -07:00
Jingning Han	0207dcde4a	Fix rate estimate issue in transform block partition coding This commit fixes the over count issue in the recursive transform block partition rate cost estimation. It improves the compression performance by about 0.45%. Change-Id: I01ccda954ed0e120263977472c1c759c3c67170c	2015-06-02 18:51:03 -07:00
Jingning Han	33f05e90fe	Enable rate-distortion optimization for transform partition This commit enables the rate-distortion optimization for recursive transform block partition for inter mode blocks based on luma component. The chroma component infers the transform block size decision from those of luma component. Change-Id: I907cc52af888a606b718e087e717b189fa505748	2015-06-01 16:50:36 -07:00
Jingning Han	0451c6b6dd	Refactor per block rate distortion estimate Move the rate-distortion estimate function outside the recursion as an individual operating module. Change-Id: I662199223c256664bcd312084b3aebffb8a8034b	2015-06-01 12:41:45 -07:00
Jingning Han	d4b8dd76c4	Make chroma component RD estimate support transform partition This commit makes the rate-distortion estimation of the chroma components support the recursive transform block partition inferred from the luma component mode decisions. Change-Id: I2e038bebf558da406e966015952ad1058bdf4766	2015-06-01 11:15:15 -07:00
Jingning Han	cd4aca5959	Add decoder support to recursive transform block partition It allows the decoder to recursively parse and use the transform block size for inter coded blocks. Change-Id: I12ceea48ab35501ac1a3447142deb2a334eff3b8	2015-05-22 16:45:34 -07:00
Jingning Han	64f3820f80	Refactor bit-stream syntax support to transform partition Make the bit-stream syntax element coding ready to support variable transform coding block sizes. Change-Id: I07ae4ab62d1ecd46c4a5ae45702fc14bd1d4b07d	2015-05-22 12:13:29 -07:00
Jingning Han	6fc13b5cc2	Inter block transform coding partition syntax elements Allocate memory buffer to store the transform coding partition information of inter prediction mode blocks. Change-Id: I428b1dd0b26e8eaf24030a833554ceb4479c5551	2015-05-22 10:57:36 -07:00
Jingning Han	df2042dc1e	Synchronize encoding process and tokenization handle The encoding and tokenization process support the recursive transform block partition coding scheme. Change-Id: I47283cc6ee9c383059950623ece60a0fcce82e00	2015-05-21 18:51:27 -07:00
Jingning Han	a15cf9a5b7	Synchronize tokenization and detokenization process Make the encoder and decoder synchronized for recursive tokenization coding. Change-Id: I84c5f3dfc3ee9982ab57e658ffe6cb17a949eda2	2015-05-22 01:45:31 +00:00
Jingning Han	bf99a00340	Arrange tokenization order to support recursive txfm block coding Make the encoder packetize transform block in a recursive order. Note that the block index with respect to the coding block remains identical. Change-Id: I07c6d2017f4f150274aff46c05388a7fd47cd920	2015-05-21 18:43:37 -07:00
Jingning Han	5f6fe83ac5	Syntax coding support for transform block coding This commit re-designs the bitstream syntax to support recursive transform block partition. It disables the decoder vector unit tests. Change-Id: I6cac24c4f1e44f29ffcc9b87ba1167eeb32d1b69	2015-05-18 15:43:02 -07:00