Compare commits

...

19 Commits

Author SHA1 Message Date
hkuang
d05cf10fe7 Add error handling for frame parallel decode and a unit test for it.
Change-Id: I6e309e11f1641618d2424b7a2c0fe744b8974dec
2014-12-08 12:30:19 -08:00
hkuang
a9a20a1040 Fix a bug in frame parallel decode and add a unit test for that.
A flush bug was discovered while putting the frame parallel decoder
into Android. This test exposes that bug.

Change-Id: Ia047f27972f4da0471649f79f1f91e7695297473
2014-11-14 10:16:34 -08:00
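A minimal sketch of the flush path this kind of test exercises, using the public decode API rather than the test driver; the helper name and the consuming loop body are illustrative only:

#include "vpx/vpx_decoder.h"

/* Sketch only: signal end of stream with NULL data, then drain whatever
 * frames the frame parallel decoder still holds in its output cache. */
static void flush_decoder(vpx_codec_ctx_t *decoder) {
  vpx_codec_iter_t iter = NULL;
  const vpx_image_t *img;
  if (vpx_codec_decode(decoder, NULL, 0, NULL, 0) != VPX_CODEC_OK)
    return;
  while ((img = vpx_codec_get_frame(decoder, &iter)) != NULL) {
    (void)img;  /* consume or checksum the remaining cached frames here */
  }
}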
hkuang
165db2343a Merge "Add two test vectors to test frame parallel decode." into frame_parallel 2014-11-07 10:06:35 -08:00
hkuang
f2fe530e15 Add two test vectors to test frame parallel decode.
The added vectors are mainly used to test the output cache
mechanism in frame parallel decode.

Change-Id: I3d413d060daa5abf72358f6350bd1d16d71adc5a
2014-11-06 16:28:35 -08:00
hkuang
ad693e1ff5 Add key frame seeking to webmdec and webm_video_source.
This is for the frame parallel decoder's pause/seek/resume unit test.

Change-Id: Ie235f86ca5f3d525896222766f6d610e6682fd76
2014-11-06 13:51:08 -08:00
Hangyu Kuang
9ce3a7d76c Implement frame parallel decode for VP9.
With 4 threads, frame parallel decode is ~3x faster than single-threaded
decode and around 30% faster than tile parallel decode for frame-parallel
encoded video, on both Android and desktop. Decode speed also scales with
the number of threads, so decoding could be even faster with more threads.

Change-Id: Ia0a549aaa3e83b5a17b31d8299aa496ea4f21e3e
2014-10-22 10:50:58 -07:00
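A minimal sketch of how a caller would turn this feature on through the public API, assuming a 4-thread configuration; the helper name is illustrative:

#include "vpx/vp8dx.h"
#include "vpx/vpx_decoder.h"

/* Sketch only: frame-based threading is requested at init time via the
 * VPX_CODEC_USE_FRAME_THREADING flag, with cfg.threads picking the count. */
static vpx_codec_err_t init_frame_parallel_decoder(vpx_codec_ctx_t *decoder) {
  vpx_codec_dec_cfg_t cfg = {0};
  cfg.threads = 4;
  return vpx_codec_dec_init(decoder, vpx_codec_vp9_dx(), &cfg,
                            VPX_CODEC_USE_FRAME_THREADING);
}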
hkuang
4d0d78424b Increase the thread test range to cover 5, 6, 7, 8 threads.
Change-Id: Id25c294720551bb5153987d8758668befaa57929
2014-09-05 17:11:30 -07:00
hkuang
5106e4dfa8 Fix a bug in adding frame parallel unit test.
There are two CreateDecoder functions, and decode_test_driver is not
calling the right one. This bug was discovered while actually enabling
the frame parallel flag inside libvpx. It does not affect any existing
unit test, though.

Change-Id: Icd9633c4b66d50e422a09c4310ff791082878936
2014-08-18 11:16:15 -07:00
hkuang
02410659cd Merge "Add VP9 frame-parallel unit test." into frame_parallel 2014-08-14 14:55:09 -07:00
hkuang
a3ef7d5a50 Add VP9 frame-parallel unit test.
Make sure VP9 frame-parallel decode passes all the standard
test vectors. For now, only test running with 2, 3, and 4 threads.

Also refactor the video decode test driver to support passing in
decode flags, which are used to enable frame-parallel decode.

Change-Id: I6a712464232c2e13681634951c7e176312522e1e
2014-08-14 13:06:26 -07:00
hkuang
48c5d470e7 Manually pick "Make the api behavior conform to api spec." from the
master branch.

Change-Id: I7323ec4cf8b8b7841e37f2bf90548cefa9de9795
2014-08-07 15:15:46 -07:00
hkuang
44395a21da Move vp9_dec_build_inter_predictors_* to decoder folder.
Change-Id: Ibe9fa28440cc79ba9f3504d78c7dca7bb01a23e1
2014-07-28 11:09:11 -07:00
hkuang
7eca086707 Add segmentation map array for current and last frame segmentation.
The original implementation only allocates one segmentation map, which
works fine for serial decode. But for frame parallel decode, each thread
needs its own segmentation map, and the last frame's segmentation map
should be provided by the thread that decoded the last frame.

After finishing decoding a frame, the thread needs to serve the old
segmentation map associated with the previously decoded frame, while using
another segmentation map for decoding the current frame.

Change-Id: I442ddff36b5de9cb8a7eb59e225744c78f4492d8
2014-07-28 10:44:02 -07:00
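A minimal sketch of the ping-pong arrangement described above; the struct and function names are illustrative, and the real change lands in vp9_alloccommon.c further down:

#include <stdint.h>

/* Sketch only: two segmentation maps whose roles swap once a frame is done,
 * so the map written for this frame becomes the "last frame" map of the next. */
typedef struct {
  uint8_t *seg_map_array[2];
  int seg_map_idx, prev_seg_map_idx;
  uint8_t *current_frame_seg_map;  /* written while decoding this frame */
  uint8_t *last_frame_seg_map;     /* provided by the previous frame's thread */
} SegMapPingPong;

static void swap_seg_maps(SegMapPingPong *s) {
  const int tmp = s->seg_map_idx;
  s->seg_map_idx = s->prev_seg_map_idx;
  s->prev_seg_map_idx = tmp;
  s->current_frame_seg_map = s->seg_map_array[s->seg_map_idx];
  s->last_frame_seg_map = s->seg_map_array[s->prev_seg_map_idx];
}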
hkuang
4c08120ca0 Merge "Include the right header for VP9 worker thread." into frame_parallel 2014-07-14 16:09:16 -07:00
hkuang
294b849796 Include the right header for VP9 worker thread.
pthread.h is not supported on Windows. vp9_thread.h includes the
pthread emulation layer for Windows.

Change-Id: I2b1c8ec299928472faca7ebeea998170c9f4d744
2014-07-14 16:03:38 -07:00
hkuang
3cffa0c74e Move vp9_thread.* to common.
To prepare for frame parallel decoding, the reference count buffers
need to be protected by a mutex. Move vp9_thread.* to the common folder
so that those buffers can use the cross-platform mutex from vp9_thread.*.

(cherry picked from commit 337e8015c9)

Change-Id: I0587a08447925f4554d7788686a31483c2ae3f37
2014-07-11 15:24:31 -07:00
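A minimal sketch of the kind of protection this move enables; the type and function here are illustrative, and the real locking appears in lock_buffer_pool()/unlock_buffer_pool() below:

#include "./vpx_config.h"
#include "vp9/common/vp9_thread.h"  /* pthread API, emulated on Windows */

/* Sketch only: a reference count that several frame workers may touch,
 * guarded by the cross-platform mutex that vp9_thread.h makes available. */
typedef struct {
#if CONFIG_MULTITHREAD
  pthread_mutex_t mutex;  /* initialize with pthread_mutex_init() before use */
#endif
  int ref_count;
} SharedRefCount;

static void shared_ref_release(SharedRefCount *rc) {
#if CONFIG_MULTITHREAD
  pthread_mutex_lock(&rc->mutex);
#endif
  --rc->ref_count;
#if CONFIG_MULTITHREAD
  pthread_mutex_unlock(&rc->mutex);
#endif
}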
hkuang
10aa23f751 ctrl_get_reference does not need user_priv.
The relationship of the user private data at runtime is not preserved
from decode() to this call, which may occur at an unknown point in the
future.

Change-Id: Ia7eb25365c805147614574c3af87aedbe0305fc6
2014-07-02 19:28:38 -07:00
hkuang
28a794f680 Separate the frame buffers from the VP9 encoder/decoder structure.
To prepare for frame parallel decoding, the frame buffers must be
separated from the encoder and decoder structures; the encoder and
decoder instead hold a pointer to the BufferPool.

Change-Id: I172c78f876e41fb5aea11be5f632adadf2a6f466
2014-07-02 15:34:20 -07:00
hkuang
bf58d1725c Revert "Revert "Revert "Revert 3 patches from Hangyu to get Chrome to build:"""
This reverts commit 749e0c7b28.

Change-Id: I0c63a152baf94d38496dd925a40040366153bf4f
2014-07-02 14:57:39 -07:00
41 changed files with 1916 additions and 551 deletions

View File

@@ -35,6 +35,10 @@ class CodecFactory {
virtual Decoder* CreateDecoder(vpx_codec_dec_cfg_t cfg,
unsigned long deadline) const = 0;
virtual Decoder* CreateDecoder(vpx_codec_dec_cfg_t cfg,
const vpx_codec_flags_t flags,
unsigned long deadline) const = 0; // NOLINT
virtual Encoder* CreateEncoder(vpx_codec_enc_cfg_t cfg,
unsigned long deadline,
const unsigned long init_flags,
@@ -72,6 +76,10 @@ class VP8Decoder : public Decoder {
VP8Decoder(vpx_codec_dec_cfg_t cfg, unsigned long deadline)
: Decoder(cfg, deadline) {}
VP8Decoder(vpx_codec_dec_cfg_t cfg, const vpx_codec_flags_t flag,
unsigned long deadline) // NOLINT
: Decoder(cfg, flag, deadline) {}
protected:
virtual vpx_codec_iface_t* CodecInterface() const {
#if CONFIG_VP8_DECODER
@@ -104,8 +112,14 @@ class VP8CodecFactory : public CodecFactory {
virtual Decoder* CreateDecoder(vpx_codec_dec_cfg_t cfg,
unsigned long deadline) const {
return CreateDecoder(cfg, 0, deadline);
}
virtual Decoder* CreateDecoder(vpx_codec_dec_cfg_t cfg,
const vpx_codec_flags_t flags,
unsigned long deadline) const { // NOLINT
#if CONFIG_VP8_DECODER
return new VP8Decoder(cfg, deadline);
return new VP8Decoder(cfg, flags, deadline);
#else
return NULL;
#endif
@@ -154,6 +168,10 @@ class VP9Decoder : public Decoder {
VP9Decoder(vpx_codec_dec_cfg_t cfg, unsigned long deadline)
: Decoder(cfg, deadline) {}
VP9Decoder(vpx_codec_dec_cfg_t cfg, const vpx_codec_flags_t flag,
unsigned long deadline) // NOLINT
: Decoder(cfg, flag, deadline) {}
protected:
virtual vpx_codec_iface_t* CodecInterface() const {
#if CONFIG_VP9_DECODER
@@ -186,8 +204,14 @@ class VP9CodecFactory : public CodecFactory {
virtual Decoder* CreateDecoder(vpx_codec_dec_cfg_t cfg,
unsigned long deadline) const {
return CreateDecoder(cfg, 0, deadline);
}
virtual Decoder* CreateDecoder(vpx_codec_dec_cfg_t cfg,
const vpx_codec_flags_t flags,
unsigned long deadline) const { // NOLINT
#if CONFIG_VP9_DECODER
return new VP9Decoder(cfg, deadline);
return new VP9Decoder(cfg, flags, deadline);
#else
return NULL;
#endif

View File

@@ -40,26 +40,28 @@ vpx_codec_err_t Decoder::DecodeFrame(const uint8_t *cxdata, size_t size,
}
void DecoderTest::RunLoop(CompressedVideoSource *video) {
vpx_codec_dec_cfg_t dec_cfg = {0};
Decoder* const decoder = codec_->CreateDecoder(dec_cfg, 0);
Decoder* const decoder = codec_->CreateDecoder(cfg_, flags_, 0);
ASSERT_TRUE(decoder != NULL);
const char *codec_name = decoder->GetDecoderName();
const bool is_vp8 = strncmp(kVP8Name, codec_name, sizeof(kVP8Name) - 1) == 0;
bool end_of_file = false;
// Decode frames.
for (video->Begin(); !::testing::Test::HasFailure() && video->cxdata();
for (video->Begin(); !::testing::Test::HasFailure() && !end_of_file;
video->Next()) {
PreDecodeFrameHook(*video, decoder);
vpx_codec_stream_info_t stream_info;
stream_info.sz = sizeof(stream_info);
if (video->cxdata() != NULL) {
const vpx_codec_err_t res_peek = decoder->PeekStream(video->cxdata(),
video->frame_size(),
&stream_info);
if (is_vp8) {
/* Vp8's implementation of PeekStream returns an error if the frame you
* pass it is not a keyframe, so we only expect VPX_CODEC_OK on the first
* frame, which must be a keyframe. */
* pass it is not a keyframe, so we only expect VPX_CODEC_OK on the
* first frame, which must be a keyframe. */
if (video->frame_number() == 0)
ASSERT_EQ(VPX_CODEC_OK, res_peek) << "Peek return failed: "
<< vpx_codec_err_to_string(res_peek);
@@ -74,6 +76,12 @@ void DecoderTest::RunLoop(CompressedVideoSource *video) {
video->frame_size());
if (!HandleDecodeResult(res_dec, *video, decoder))
break;
} else {
// Signal end of the file to the decoder.
const vpx_codec_err_t res_dec = decoder->DecodeFrame(NULL, 0);
ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError();
end_of_file = true;
}
DxDataIterator dec_iter = decoder->GetDxData();
const vpx_image_t *img = NULL;
@@ -85,4 +93,12 @@ void DecoderTest::RunLoop(CompressedVideoSource *video) {
delete decoder;
}
void DecoderTest::set_cfg(const vpx_codec_dec_cfg_t &dec_cfg) {
memcpy(&cfg_, &dec_cfg, sizeof(cfg_));
}
void DecoderTest::set_flags(const vpx_codec_flags_t flags) {
flags_ = flags;
}
} // namespace libvpx_test

View File

@@ -41,7 +41,13 @@ class DxDataIterator {
class Decoder {
public:
Decoder(vpx_codec_dec_cfg_t cfg, unsigned long deadline)
: cfg_(cfg), deadline_(deadline), init_done_(false) {
: cfg_(cfg), flags_(0), deadline_(deadline), init_done_(false) {
memset(&decoder_, 0, sizeof(decoder_));
}
Decoder(vpx_codec_dec_cfg_t cfg, const vpx_codec_flags_t flag,
unsigned long deadline) // NOLINT
: cfg_(cfg), flags_(flag), deadline_(deadline), init_done_(false) {
memset(&decoder_, 0, sizeof(decoder_));
}
@@ -102,7 +108,7 @@ class Decoder {
if (!init_done_) {
const vpx_codec_err_t res = vpx_codec_dec_init(&decoder_,
CodecInterface(),
&cfg_, 0);
&cfg_, flags_);
ASSERT_EQ(VPX_CODEC_OK, res) << DecodeError();
init_done_ = true;
}
@@ -110,6 +116,7 @@ class Decoder {
vpx_codec_ctx_t decoder_;
vpx_codec_dec_cfg_t cfg_;
vpx_codec_flags_t flags_;
unsigned int deadline_;
bool init_done_;
};
@@ -120,6 +127,9 @@ class DecoderTest {
// Main decoding loop
virtual void RunLoop(CompressedVideoSource *video);
virtual void set_cfg(const vpx_codec_dec_cfg_t &dec_cfg);
virtual void set_flags(const vpx_codec_flags_t flags);
// Hook to be called before decompressing every frame.
virtual void PreDecodeFrameHook(const CompressedVideoSource& video,
Decoder *decoder) {}
@@ -137,11 +147,15 @@ class DecoderTest {
const unsigned int frame_number) {}
protected:
explicit DecoderTest(const CodecFactory *codec) : codec_(codec) {}
explicit DecoderTest(const CodecFactory *codec) : codec_(codec), flags_(0) {
memset(&cfg_, 0, sizeof(cfg_));
}
virtual ~DecoderTest() {}
const CodecFactory *codec_;
vpx_codec_dec_cfg_t cfg_;
vpx_codec_flags_t flags_;
};
} // namespace libvpx_test

View File

@@ -97,7 +97,7 @@ const char *const kVP9InvalidFileTests[] = {
"invalid-vp90-01.webm",
"invalid-vp90-02.webm",
"invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.ivf",
"invalid-vp90-03.webm",
"invalid-vp90-03-v3.webm",
"invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf",
"invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf",
};

View File

@@ -4,8 +4,8 @@ fe346136b9b8c1e6f6084cc106485706915795e4 invalid-vp90-01.webm
25751f5d3b05ff03f0719ad42cd625348eb8961e invalid-vp90-01.webm.res
d78e2fceba5ac942246503ec8366f879c4775ca5 invalid-vp90-02.webm
2dadee5306245fa5eeb0f99652d0e17afbcba96d invalid-vp90-02.webm.res
df1a1453feb3c00d7d89746c7003b4163523bff3 invalid-vp90-03.webm
8fe6fd82bf537340f586f97a7ae31fb37ccda302 invalid-vp90-03.webm.res
df1a1453feb3c00d7d89746c7003b4163523bff3 invalid-vp90-03-v3.webm
4935c62becc68c13642a03db1e6d3e2331c1c612 invalid-vp90-03-v3.webm.res
a432f96ff0a787268e2f94a8092ab161a18d1b06 park_joy_90p_10_420.y4m
0b194cc312c3a2e84d156a221b0a5eb615dfddc5 park_joy_90p_10_422.y4m
ff0e0a21dc2adc95b8c1b37902713700655ced17 park_joy_90p_10_444.y4m
@@ -661,3 +661,12 @@ d3964f9dad9f60363c81b688324d95b4ec7c8038 invalid-vp90-2-00-quantizer-00.webm.iv
456d1493e52d32a5c30edf44a27debc1fa6b253a invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf.res
c123d1f9f02fb4143abb5e271916e3a3080de8f6 invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf
456d1493e52d32a5c30edf44a27debc1fa6b253a invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf.res
f97088c7359fc8d3d5aa5eafe57bc7308b3ee124 vp90-2-20-big_superframe-01.webm
47d7d409785afa33b123376de0c907336e6c7bd7 vp90-2-20-big_superframe-01.webm.md5
65ade6d2786209582c50d34cfe22b3cdb033abaf vp90-2-20-big_superframe-02.webm
7c0ed8d04c4d06c5411dd2e5de2411d37f092db5 vp90-2-20-big_superframe-02.webm.md5
667ec8718c982aef6be07eb94f083c2efb9d2d16 vp90-2-07-frame_parallel-1.webm
bfc82bf848e9c05020d61e3ffc1e62f25df81d19 vp90-2-07-frame_parallel-1.webm.md5
efd5a51d175cfdacd169ed23477729dc558030dc invalid-vp90-2-07-frame_parallel-1.webm
9f912712ec418be69adb910e2ca886a63c4cec08 invalid-vp90-2-07-frame_parallel-2.webm
445f5a53ca9555341852997ccdd480a51540bd14 invalid-vp90-2-07-frame_parallel-3.webm

View File

@@ -31,6 +31,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += keyframe_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += external_frame_buffer_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += user_priv_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_frame_parallel_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += active_map_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += borders_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += cpu_speed_test.cc
@@ -683,6 +684,8 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-06-bilinear.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-06-bilinear.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-07-frame_parallel.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-07-frame_parallel.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-07-frame_parallel-1.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-07-frame_parallel-1.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x1.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x1.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x4.webm
@@ -773,20 +776,27 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey_adpq.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey_adpq.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yuv444.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yuv444.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-20-big_superframe-01.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-20-big_superframe-01.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-20-big_superframe-02.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-20-big_superframe-02.webm.md5
# Invalid files for testing libvpx error checking.
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01.webm.res
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-02.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-02.webm.res
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-03.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-03.webm.res
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-03-v3.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-03-v3.webm.res
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.ivf
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.ivf.res
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf.res
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf.res
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-07-frame_parallel-1.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-07-frame_parallel-2.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-07-frame_parallel-3.webm
ifeq ($(CONFIG_DECODE_PERF_TESTS),yes)
# BBB VP9 streams

View File

@@ -12,6 +12,7 @@
#include <cstdlib>
#include <string>
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "../tools_common.h"
#include "./vpx_config.h"
#include "test/codec_factory.h"
#include "test/decode_test_driver.h"
@@ -26,10 +27,24 @@
namespace {
enum DecodeMode {
kSerialMode,
kFrameParallMode
};
const int kDecodeMode = 0;
const int kThreads = 1;
const int kFileName = 2;
typedef std::tr1::tuple<int, int, const char *> DecodeParam;
class TestVectorTest : public ::libvpx_test::DecoderTest,
public ::libvpx_test::CodecTestWithParam<const char*> {
public ::libvpx_test::CodecTestWithParam<DecodeParam> {
protected:
TestVectorTest() : DecoderTest(GET_PARAM(0)), md5_file_(NULL) {}
TestVectorTest()
: DecoderTest(GET_PARAM(0)),
md5_file_(NULL) {
}
virtual ~TestVectorTest() {
if (md5_file_)
@@ -71,8 +86,25 @@ class TestVectorTest : public ::libvpx_test::DecoderTest,
// checksums match the correct md5 data, then the test is passed. Otherwise,
// the test failed.
TEST_P(TestVectorTest, MD5Match) {
const std::string filename = GET_PARAM(1);
const DecodeParam input = GET_PARAM(1);
const std::string filename = std::tr1::get<kFileName>(input);
const int threads = std::tr1::get<kThreads>(input);
const int mode = std::tr1::get<kDecodeMode>(input);
libvpx_test::CompressedVideoSource *video = NULL;
vpx_codec_flags_t flags = 0;
vpx_codec_dec_cfg_t cfg = {0};
char str[256];
if (mode == kFrameParallMode) {
flags |= VPX_CODEC_USE_FRAME_THREADING;
}
cfg.threads = threads;
snprintf(str, sizeof(str) / sizeof(str[0]) - 1,
"file: %s mode: %s threads: %d",
filename.c_str(), mode == 0 ? "Serial" : "Parallel", threads);
SCOPED_TRACE(str);
// Open compressed video file.
if (filename.substr(filename.length() - 3, 3) == "ivf") {
@@ -92,18 +124,53 @@ TEST_P(TestVectorTest, MD5Match) {
const std::string md5_filename = filename + ".md5";
OpenMD5File(md5_filename);
// Set decode config and flags.
set_cfg(cfg);
set_flags(flags);
// Decode frame, and check the md5 matching.
ASSERT_NO_FATAL_FAILURE(RunLoop(video));
delete video;
}
VP8_INSTANTIATE_TEST_CASE(TestVectorTest,
// Test VP8 decode in serial mode with single thread.
// NOTE: VP8 only support serial mode.
INSTANTIATE_TEST_CASE_P(
VP8, TestVectorTest,
::testing::Combine(
::testing::Values(
static_cast<const libvpx_test::CodecFactory *>(&libvpx_test::kVP8)),
::testing::Combine(
::testing::Values(0), // Serial Mode.
::testing::Values(1), // Single thread.
::testing::ValuesIn(libvpx_test::kVP8TestVectors,
libvpx_test::kVP8TestVectors +
libvpx_test::kNumVP8TestVectors));
VP9_INSTANTIATE_TEST_CASE(TestVectorTest,
libvpx_test::kNumVP8TestVectors))));
// Test VP9 decode in serial mode with single thread.
INSTANTIATE_TEST_CASE_P(
VP9, TestVectorTest,
::testing::Combine(
::testing::Values(
static_cast<const libvpx_test::CodecFactory *>(&libvpx_test::kVP9)),
::testing::Combine(
::testing::Values(0), // Serial Mode.
::testing::Values(1), // Single thread.
::testing::ValuesIn(libvpx_test::kVP9TestVectors,
libvpx_test::kVP9TestVectors +
libvpx_test::kNumVP9TestVectors));
libvpx_test::kNumVP9TestVectors))));
// Test VP9 decode in frame parallel mode with different number of threads.
INSTANTIATE_TEST_CASE_P(
VP9MultiThreadedFrameParallel, TestVectorTest,
::testing::Combine(
::testing::Values(
static_cast<const libvpx_test::CodecFactory *>(&libvpx_test::kVP9)),
::testing::Combine(
::testing::Values(1), // Frame Parallel mode.
::testing::Range(2, 9), // With 2 ~ 8 threads.
::testing::ValuesIn(libvpx_test::kVP9TestVectors,
libvpx_test::kVP9TestVectors +
libvpx_test::kNumVP9TestVectors))));
} // namespace

View File

@@ -154,7 +154,8 @@ const char *const kVP9TestVectors[] = {
"vp90-2-03-size-226x210.webm", "vp90-2-03-size-226x224.webm",
"vp90-2-03-size-226x226.webm", "vp90-2-03-deltaq.webm",
"vp90-2-05-resize.ivf", "vp90-2-06-bilinear.webm",
"vp90-2-07-frame_parallel.webm", "vp90-2-08-tile_1x2_frame_parallel.webm",
"vp90-2-07-frame_parallel.webm", "vp90-2-07-frame_parallel-1.webm",
"vp90-2-08-tile_1x2_frame_parallel.webm",
"vp90-2-08-tile_1x2.webm", "vp90-2-08-tile_1x4_frame_parallel.webm",
"vp90-2-08-tile_1x4.webm", "vp90-2-08-tile_1x8_frame_parallel.webm",
"vp90-2-08-tile_1x8.webm", "vp90-2-08-tile-4x4.webm",
@@ -181,6 +182,7 @@ const char *const kVP9TestVectors[] = {
"vp90-2-14-resize-fp-tiles-8-2.webm", "vp90-2-14-resize-fp-tiles-8-4.webm",
"vp90-2-15-segkey.webm", "vp90-2-15-segkey_adpq.webm",
"vp91-2-04-yuv444.webm",
"vp90-2-20-big_superframe-01.webm", "vp90-2-20-big_superframe-02.webm",
};
const int kNumVP9TestVectors = NELEMENTS(kVP9TestVectors);
#endif // CONFIG_VP9_DECODER

View File

@@ -0,0 +1,208 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <cstdio>
#include <cstdlib>
#include <string>
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "./vpx_config.h"
#include "test/codec_factory.h"
#include "test/decode_test_driver.h"
#include "test/ivf_video_source.h"
#include "test/md5_helper.h"
#include "test/util.h"
#if CONFIG_WEBM_IO
#include "test/webm_video_source.h"
#endif
#include "vpx_mem/vpx_mem.h"
namespace {
using std::string;
#if CONFIG_WEBM_IO
struct FileList {
const char *name;
// md5 sum for decoded frames which does not include skipped frames.
const char *expected_md5;
const int pause_frame_num;
};
// Decodes |filename| with |num_threads|. Pause at the specified frame_num,
// seek to next key frame and then continue decoding until the end. Return
// the md5 of the decoded frames which does not include skipped frames.
string DecodeFile(const string &filename, int num_threads, int pause_num) {
libvpx_test::WebMVideoSource video(filename);
video.Init();
int in_frames = 0;
int out_frames = 0;
vpx_codec_dec_cfg_t cfg = {0};
cfg.threads = num_threads;
vpx_codec_flags_t flags = 0;
flags |= VPX_CODEC_USE_FRAME_THREADING;
libvpx_test::VP9Decoder decoder(cfg, flags, 0);
libvpx_test::MD5 md5;
video.Begin();
do {
++in_frames;
const vpx_codec_err_t res =
decoder.DecodeFrame(video.cxdata(), video.frame_size());
if (res != VPX_CODEC_OK) {
EXPECT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError();
break;
}
// Pause at specified frame number.
if (in_frames == pause_num) {
// Flush the decoder and then seek to next key frame.
decoder.DecodeFrame(NULL, 0);
video.SeekToNextKeyFrame();
} else {
video.Next();
}
// Flush the decoder at the end of the video.
if (!video.cxdata())
decoder.DecodeFrame(NULL, 0);
libvpx_test::DxDataIterator dec_iter = decoder.GetDxData();
const vpx_image_t *img;
// Get decompressed data
while ((img = dec_iter.Next())) {
++out_frames;
md5.Add(img);
}
} while (video.cxdata() != NULL);
EXPECT_EQ(in_frames, out_frames) <<
"Input frame count does not match output frame count";
return string(md5.Get());
}
void DecodeFiles(const FileList files[]) {
for (const FileList *iter = files; iter->name != NULL; ++iter) {
SCOPED_TRACE(iter->name);
for (int t = 2; t <= 8; ++t) {
EXPECT_EQ(iter->expected_md5,
DecodeFile(iter->name, t, iter->pause_frame_num))
<< "threads = " << t;
}
}
}
TEST(VP9MultiThreadedFrameParallel, PauseSeekResume) {
// vp90-2-07-frame_parallel-1.webm is a 40 frame video file with
// one key frame for every ten frames.
static const FileList files[] = {
{ "vp90-2-07-frame_parallel-1.webm",
"6ea7c3875d67252e7caf2bc6e75b36b1", 6},
{ "vp90-2-07-frame_parallel-1.webm",
"4bb634160c7356a8d7d4299b6dc83a45", 12},
{ "vp90-2-07-frame_parallel-1.webm",
"89772591e6ef461f9fa754f916c78ed8", 26},
{ NULL, NULL, 0},
};
DecodeFiles(files);
}
struct InvalidFileList {
const char *name;
// md5 sum for decoded frames which does not include corrupted frames.
const char *expected_md5;
// Expected number of decoded frames which does not include corrupted frames.
const int expected_frame_count;
};
// Decodes |filename| with |num_threads|. Return the md5 of the decoded
// frames which does not include corrupted frames.
string DecodeInvalidFile(const string &filename, int num_threads,
int expected_frame_count) {
libvpx_test::WebMVideoSource video(filename);
video.Init();
vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
cfg.threads = num_threads;
const vpx_codec_flags_t flags = VPX_CODEC_USE_FRAME_THREADING;
libvpx_test::VP9Decoder decoder(cfg, flags, 0);
libvpx_test::MD5 md5;
video.Begin();
int out_frames = 0;
do {
const vpx_codec_err_t res =
decoder.DecodeFrame(video.cxdata(), video.frame_size());
// TODO(hkuang): frame parallel mode should return an error on corruption.
if (res != VPX_CODEC_OK) {
EXPECT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError();
break;
}
video.Next();
// Flush the decoder at the end of the video.
if (!video.cxdata())
decoder.DecodeFrame(NULL, 0);
libvpx_test::DxDataIterator dec_iter = decoder.GetDxData();
const vpx_image_t *img;
// Get decompressed data
while ((img = dec_iter.Next())) {
++out_frames;
md5.Add(img);
}
} while (video.cxdata() != NULL);
EXPECT_EQ(expected_frame_count, out_frames) <<
"Input frame count does not match expected output frame count";
return string(md5.Get());
}
void DecodeInvalidFiles(const InvalidFileList files[]) {
for (const InvalidFileList *iter = files; iter->name != NULL; ++iter) {
SCOPED_TRACE(iter->name);
for (int t = 2; t <= 8; ++t) {
EXPECT_EQ(iter->expected_md5,
DecodeInvalidFile(iter->name, t, iter->expected_frame_count))
<< "threads = " << t;
}
}
}
TEST(VP9MultiThreadedFrameParallel, InvalidFileTest) {
static const InvalidFileList files[] = {
// invalid-vp90-2-07-frame_parallel-1.webm is a 40 frame video file with
// one key frame for every ten frames. The 11th frame has corrupted data.
{ "invalid-vp90-2-07-frame_parallel-1.webm",
"0549d0f45f60deaef8eb708e6c0eb6cb", 30},
// invalid-vp90-2-07-frame_parallel-2.webm is a 40 frame video file with
// one key frame for every ten frames. The 1st and 31st frames have
// corrupted data.
{ "invalid-vp90-2-07-frame_parallel-2.webm",
"6a1f3cf6f9e7a364212fadb9580d525e", 20},
// invalid-vp90-2-07-frame_parallel-3.webm is a 40 frame video file with
// one key frame for every ten frames. The 13th frame has corrupted data.
{ "invalid-vp90-2-07-frame_parallel-3.webm",
"a567c8259d27ad32b1b7f58db5ac89dd", 32},
{ NULL, NULL, 0},
};
DecodeInvalidFiles(files);
}
#endif // CONFIG_WEBM_IO
} // namespace

View File

@@ -18,7 +18,7 @@
#if CONFIG_WEBM_IO
#include "test/webm_video_source.h"
#endif
#include "vp9/decoder/vp9_thread.h"
#include "vp9/common/vp9_thread.h"
namespace {
@@ -175,7 +175,7 @@ int Reset(VP9Worker *const /*worker*/) { return 1; }
int Sync(VP9Worker *const worker) { return !worker->had_error; }
void Execute(VP9Worker *const worker) {
worker->had_error |= worker->hook(worker->data1, worker->data2);
worker->had_error |= !worker->hook(worker->data1, worker->data2);
}
void Launch(VP9Worker *const worker) { Execute(worker); }

View File

@@ -69,6 +69,18 @@ class WebMVideoSource : public CompressedVideoSource {
}
}
void SeekToNextKeyFrame() {
ASSERT_TRUE(vpx_ctx_->file != NULL);
do {
const int status = webm_read_frame(webm_ctx_, &buf_, &buf_sz_, &buf_sz_);
ASSERT_GE(status, 0) << "webm_read_frame failed";
++frame_;
if (status == 1) {
end_of_file_ = true;
}
} while (!webm_ctx_->is_key_frame && !end_of_file_);
}
virtual const uint8_t *cxdata() const {
return end_of_file_ ? NULL : buf_;
}

View File

@@ -60,6 +60,7 @@ struct vpx_codec_alg_priv
vpx_decrypt_cb decrypt_cb;
void *decrypt_state;
vpx_image_t img;
int flushed;
int img_setup;
struct frame_buffers yv12_frame_buffers;
void *user_priv;
@@ -89,6 +90,7 @@ static void vp8_init_ctx(vpx_codec_ctx_t *ctx)
ctx->priv->alg_priv->decrypt_cb = NULL;
ctx->priv->alg_priv->decrypt_state = NULL;
ctx->priv->init_flags = ctx->init_flags;
ctx->priv->alg_priv->flushed = 0;
if (ctx->config.dec)
{
@@ -327,6 +329,13 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx,
unsigned int resolution_change = 0;
unsigned int w, h;
if (data == NULL && data_sz == 0) {
ctx->flushed = 1;
return VPX_CODEC_OK;
}
/* Reset flushed when receiving a valid frame */
ctx->flushed = 0;
/* Update the input fragment data */
if(update_fragments(ctx, data, data_sz, &res) <= 0)

View File

@@ -12,11 +12,37 @@
#include "vpx_mem/vpx_mem.h"
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_entropymode.h"
#include "vp9/common/vp9_entropymv.h"
#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/common/vp9_systemdependent.h"
// TODO(hkuang): Don't need to lock the whole pool after implementing atomic
// frame reference count.
void lock_buffer_pool(BufferPool *const pool) {
#if CONFIG_MULTITHREAD
pthread_mutex_lock(&pool->pool_mutex);
#else
(void)pool;
#endif
}
void unlock_buffer_pool(BufferPool *const pool) {
#if CONFIG_MULTITHREAD
pthread_mutex_unlock(&pool->pool_mutex);
#else
(void)pool;
#endif
}
static INLINE void alloc_mi_array(VP9_COMMON *cm, int mi_size, int idx) {
CHECK_MEM_ERROR(cm, cm->mip_array[idx],
vpx_calloc(mi_size, sizeof(*cm->mip_array[0])));
CHECK_MEM_ERROR(cm, cm->mi_grid_base_array[idx],
vpx_calloc(mi_size, sizeof(*cm->mi_grid_base_array[0])));
}
static void clear_mi_border(const VP9_COMMON *cm, MODE_INFO *mi) {
int i;
@@ -49,40 +75,47 @@ static void setup_mi(VP9_COMMON *cm) {
vpx_memset(cm->mi_grid_base, 0, cm->mi_stride * (cm->mi_rows + 1) *
sizeof(*cm->mi_grid_base));
// Only clear mi border in non frame-parallel decode. In frame-parallel
// decode, prev_mip is managed by previous decoding thread. While in
// non frame-parallel decode, prev_mip and mip are both managed by
// current decoding thread.
if (!cm->frame_parallel_decode)
clear_mi_border(cm, cm->prev_mip);
}
static int alloc_mi(VP9_COMMON *cm, int mi_size) {
int i;
for (i = 0; i < 2; ++i) {
cm->mip_array[i] =
(MODE_INFO *)vpx_calloc(mi_size, sizeof(*cm->mip));
if (cm->mip_array[i] == NULL)
return 1;
cm->mi_grid_base_array[i] =
(MODE_INFO **)vpx_calloc(mi_size, sizeof(*cm->mi_grid_base));
if (cm->mi_grid_base_array[i] == NULL)
return 1;
for (i = 0; i < NUM_PING_PONG_BUFFERS; ++i) {
// Delay reallocation as another thread is accessing prev_mi.
if (cm->frame_parallel_decode && i == cm->prev_mi_idx) {
cm->update_prev_mi = 1;
continue;
}
alloc_mi_array(cm, mi_size, i);
}
// Init the index.
cm->mip = cm->mip_array[cm->mi_idx];
cm->mi_grid_base = cm->mi_grid_base_array[cm->mi_idx];
if (!cm->frame_parallel_decode) {
cm->mi_idx = 0;
cm->prev_mi_idx = 1;
cm->mip = cm->mip_array[cm->mi_idx];
// In frame-parallel decode, prev_mip comes from another thread,
// so current decoding thread should not touch it.
cm->prev_mip = cm->mip_array[cm->prev_mi_idx];
cm->mi_grid_base = cm->mi_grid_base_array[cm->mi_idx];
cm->prev_mi_grid_base = cm->mi_grid_base_array[cm->prev_mi_idx];
}
return 0;
}
static void free_mi(VP9_COMMON *cm) {
static void free_mi(VP9_COMMON *cm, int decode_done) {
int i;
for (i = 0; i < 2; ++i) {
for (i = 0; i < NUM_PING_PONG_BUFFERS; ++i) {
if (cm->frame_parallel_decode && i == cm->prev_mi_idx && !decode_done)
continue;
vpx_free(cm->mip_array[i]);
cm->mip_array[i] = NULL;
vpx_free(cm->mi_grid_base_array[i]);
@@ -90,21 +123,62 @@ static void free_mi(VP9_COMMON *cm) {
}
cm->mip = NULL;
cm->prev_mip = NULL;
cm->mi_grid_base = NULL;
if (!cm->frame_parallel_decode) {
cm->prev_mip = NULL;
cm->prev_mi_grid_base = NULL;
}
}
static int alloc_seg_map(VP9_COMMON *cm, int seg_map_size) {
int i;
for (i = 0; i < NUM_PING_PONG_BUFFERS; ++i) {
cm->seg_map_array[i] = (uint8_t *)vpx_calloc(seg_map_size, 1);
if (cm->seg_map_array[i] == NULL)
return 1;
}
// Init the index.
cm->seg_map_idx = 0;
cm->prev_seg_map_idx = 1;
cm->current_frame_seg_map = cm->seg_map_array[cm->seg_map_idx];
if (!cm->frame_parallel_decode) {
cm->last_frame_seg_map = cm->seg_map_array[cm->prev_seg_map_idx];
}
return 0;
}
static void free_seg_map(VP9_COMMON *cm) {
int i;
for (i = 0; i < NUM_PING_PONG_BUFFERS; ++i) {
vpx_free(cm->seg_map_array[i]);
cm->seg_map_array[i] = NULL;
}
cm->current_frame_seg_map = NULL;
if (!cm->frame_parallel_decode) {
cm->last_frame_seg_map = NULL;
}
}
void vp9_free_frame_buffers(VP9_COMMON *cm) {
int i;
BufferPool *const pool = cm->buffer_pool;
for (i = 0; i < FRAME_BUFFERS; ++i) {
vp9_free_frame_buffer(&cm->frame_bufs[i].buf);
vp9_free_frame_buffer(&pool->frame_bufs[i].buf);
if (cm->frame_bufs[i].ref_count > 0 &&
cm->frame_bufs[i].raw_frame_buffer.data != NULL) {
cm->release_fb_cb(cm->cb_priv, &cm->frame_bufs[i].raw_frame_buffer);
cm->frame_bufs[i].ref_count = 0;
if (pool->frame_bufs[i].ref_count > 0 &&
pool->frame_bufs[i].raw_frame_buffer.data != NULL) {
pool->release_fb_cb(pool->cb_priv, &pool->frame_bufs[i].raw_frame_buffer);
pool->frame_bufs[i].ref_count = 0;
}
}
@@ -112,10 +186,8 @@ void vp9_free_frame_buffers(VP9_COMMON *cm) {
}
void vp9_free_context_buffers(VP9_COMMON *cm) {
free_mi(cm);
vpx_free(cm->last_frame_seg_map);
cm->last_frame_seg_map = NULL;
free_mi(cm, 1);
free_seg_map(cm);
vpx_free(cm->above_context);
cm->above_context = NULL;
@@ -139,16 +211,15 @@ int vp9_resize_frame_buffers(VP9_COMMON *cm, int width, int height) {
set_mb_mi(cm, aligned_width, aligned_height);
free_mi(cm);
free_mi(cm, 0);
if (alloc_mi(cm, cm->mi_stride * (cm->mi_rows + MI_BLOCK_SIZE)))
goto fail;
setup_mi(cm);
// Create the segmentation map structure and set to 0.
vpx_free(cm->last_frame_seg_map);
cm->last_frame_seg_map = (uint8_t *)vpx_calloc(cm->mi_rows * cm->mi_cols, 1);
if (!cm->last_frame_seg_map)
free_seg_map(cm);
if (alloc_seg_map(cm, cm->mi_rows * cm->mi_cols))
goto fail;
vpx_free(cm->above_context);
@@ -176,13 +247,14 @@ int vp9_resize_frame_buffers(VP9_COMMON *cm, int width, int height) {
static void init_frame_bufs(VP9_COMMON *cm) {
int i;
RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
cm->new_fb_idx = FRAME_BUFFERS - 1;
cm->frame_bufs[cm->new_fb_idx].ref_count = 1;
frame_bufs[cm->new_fb_idx].ref_count = 1;
for (i = 0; i < REF_FRAMES; ++i) {
cm->ref_frame_map[i] = i;
cm->frame_bufs[i].ref_count = 1;
frame_bufs[i].ref_count = 1;
}
}
@@ -190,12 +262,13 @@ int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) {
int i;
const int ss_x = cm->subsampling_x;
const int ss_y = cm->subsampling_y;
RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
vp9_free_frame_buffers(cm);
for (i = 0; i < FRAME_BUFFERS; ++i) {
cm->frame_bufs[i].ref_count = 0;
if (vp9_alloc_frame_buffer(&cm->frame_bufs[i].buf, width, height,
frame_bufs[i].ref_count = 0;
if (vp9_alloc_frame_buffer(&frame_bufs[i].buf, width, height,
ss_x, ss_y, VP9_ENC_BORDER_IN_PIXELS) < 0)
goto fail;
}
@@ -256,7 +329,6 @@ int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) {
void vp9_remove_common(VP9_COMMON *cm) {
vp9_free_frame_buffers(cm);
vp9_free_context_buffers(cm);
vp9_free_internal_frame_buffers(&cm->int_frame_buffers);
}
void vp9_update_frame_size(VP9_COMMON *cm) {
@@ -267,13 +339,27 @@ void vp9_update_frame_size(VP9_COMMON *cm) {
setup_mi(cm);
// Initialize the previous frame segment map to 0.
if (cm->last_frame_seg_map)
vpx_memset(cm->last_frame_seg_map, 0, cm->mi_rows * cm->mi_cols);
if (cm->current_frame_seg_map)
vpx_memset(cm->current_frame_seg_map, 0, cm->mi_rows * cm->mi_cols);
}
void vp9_swap_mi_and_prev_mi(VP9_COMMON *cm) {
// Swap indices.
const int tmp = cm->mi_idx;
// Only used in frame parallel decode: Update the prev_mi buffer if
// needed. The worker that was accessing it must already finish decoding.
// So it can be resized safely now.
if (cm->update_prev_mi) {
const int mi_size = cm->mi_stride * (cm->mi_rows + MI_BLOCK_SIZE);
vpx_free(cm->mip_array[cm->prev_mi_idx]);
vpx_free(cm->mi_grid_base_array[cm->prev_mi_idx]);
cm->mip_array[cm->prev_mi_idx] = NULL;
cm->mi_grid_base_array[cm->prev_mi_idx] = NULL;
alloc_mi_array(cm, mi_size, cm->prev_mi_idx);
cm->update_prev_mi = 0;
}
cm->mi_idx = cm->prev_mi_idx;
cm->prev_mi_idx = tmp;
@@ -289,3 +375,13 @@ void vp9_swap_mi_and_prev_mi(VP9_COMMON *cm) {
cm->mi_grid_visible = cm->mi_grid_base + cm->mi_stride + 1;
cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mi_stride + 1;
}
void vp9_swap_current_and_last_seg_map(VP9_COMMON *cm) {
// Swap indices.
const int tmp = cm->seg_map_idx;
cm->seg_map_idx = cm->prev_seg_map_idx;
cm->prev_seg_map_idx = tmp;
cm->current_frame_seg_map = cm->seg_map_array[cm->seg_map_idx];
cm->last_frame_seg_map = cm->seg_map_array[cm->prev_seg_map_idx];
}

View File

@@ -34,6 +34,8 @@ void vp9_update_frame_size(struct VP9Common *cm);
void vp9_swap_mi_and_prev_mi(struct VP9Common *cm);
void vp9_swap_current_and_last_seg_map(struct VP9Common *cm);
#ifdef __cplusplus
} // extern "C"
#endif

View File

@@ -439,9 +439,13 @@ void vp9_setup_past_independence(VP9_COMMON *cm) {
int i;
vp9_clearall_segfeatures(&cm->seg);
cm->seg.abs_delta = SEGMENT_DELTADATA;
if (cm->last_frame_seg_map)
if (cm->last_frame_seg_map && !cm->frame_parallel_decode)
vpx_memset(cm->last_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols));
if (cm->current_frame_seg_map)
vpx_memset(cm->current_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols));
// Reset the mode ref deltas for loop filter
vp9_zero(lf->last_ref_deltas);
vp9_zero(lf->last_mode_deltas);
@@ -464,7 +468,7 @@ void vp9_setup_past_independence(VP9_COMMON *cm) {
cm->frame_contexts[cm->frame_context_idx] = cm->fc;
}
if (frame_is_intra_only(cm))
if (frame_is_intra_only(cm) && !cm->frame_parallel_decode)
vpx_memset(cm->prev_mip, 0, cm->mi_stride * (cm->mi_rows + 1) *
sizeof(*cm->prev_mip));

View File

@@ -17,14 +17,12 @@ static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
const TileInfo *const tile,
MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
int_mv *mv_ref_list,
int block, int mi_row, int mi_col) {
int block, int mi_row, int mi_col,
find_mv_refs_sync sync, void *const data) {
const int *ref_sign_bias = cm->ref_frame_sign_bias;
int i, refmv_count = 0;
const MODE_INFO *prev_mi = cm->coding_use_prev_mi && cm->prev_mi
? cm->prev_mi_grid_visible[mi_row * xd->mi_stride + mi_col]
: NULL;
const MB_MODE_INFO *const prev_mbmi = prev_mi ? &prev_mi->mbmi : NULL;
MODE_INFO *prev_mi = NULL;
MB_MODE_INFO *prev_mbmi = NULL;
const POSITION *const mv_ref_search = mv_ref_blocks[mi->mbmi.sb_type];
@@ -71,6 +69,14 @@ static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
}
}
// Synchronize here for frame parallel decode if sync function is provided.
if (sync != NULL) {
sync(data, mi_row);
}
prev_mi = cm->coding_use_prev_mi && cm->prev_mi ?
cm->prev_mi_grid_visible[mi_row * xd->mi_stride + mi_col] : NULL;
prev_mbmi = prev_mi ? &prev_mi->mbmi : NULL;
// Check the last frame's mode and mv info.
if (prev_mbmi) {
if (prev_mbmi->ref_frame[0] == ref_frame)
@@ -112,9 +118,10 @@ void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd,
const TileInfo *const tile,
MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
int_mv *mv_ref_list,
int mi_row, int mi_col) {
int mi_row, int mi_col,
find_mv_refs_sync sync, void *const data) {
find_mv_refs_idx(cm, xd, tile, mi, ref_frame, mv_ref_list, -1,
mi_row, mi_col);
mi_row, mi_col, sync, data);
}
static void lower_mv_precision(MV *mv, int allow_hp) {
@@ -152,7 +159,7 @@ void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
assert(MAX_MV_REF_CANDIDATES == 2);
find_mv_refs_idx(cm, xd, tile, mi, mi->mbmi.ref_frame[ref], mv_list, block,
mi_row, mi_col);
mi_row, mi_col, NULL, NULL);
near->as_int = 0;
switch (block) {

View File

@@ -204,10 +204,12 @@ static INLINE void clamp_mv2(MV *mv, const MACROBLOCKD *xd) {
xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
}
typedef void (*find_mv_refs_sync)(void *const data, int mi_row);
void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd,
const TileInfo *const tile,
MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
int_mv *mv_ref_list, int mi_row, int mi_col);
int_mv *mv_ref_list, int mi_row, int mi_col,
find_mv_refs_sync sync, void *const data);
// check a list of motion vectors by sad score using a number rows of pixels
// above and a number cols of pixels in the left to select the one with best

View File

@@ -20,6 +20,7 @@
#include "vp9/common/vp9_entropymode.h"
#include "vp9/common/vp9_frame_buffers.h"
#include "vp9/common/vp9_quant_common.h"
#include "vp9/common/vp9_thread.h"
#include "vp9/common/vp9_tile_common.h"
#if CONFIG_VP9_POSTPROC
@@ -35,14 +36,19 @@ extern "C" {
#define REF_FRAMES_LOG2 3
#define REF_FRAMES (1 << REF_FRAMES_LOG2)
// 1 scratch frame for the new frame, 3 for scaled references on the encoder
// 4 scratch frames for the new frames to support a maximum of 4 cores decoding
// in parallel, 3 for scaled references on the encoder.
// TODO(hkuang): Add ondemand frame buffers instead of hardcoding the number
// of framebuffers.
// TODO(jkoleszar): These 3 extra references could probably come from the
// normal reference pool.
#define FRAME_BUFFERS (REF_FRAMES + 4)
#define FRAME_BUFFERS (REF_FRAMES + 7)
#define FRAME_CONTEXTS_LOG2 2
#define FRAME_CONTEXTS (1 << FRAME_CONTEXTS_LOG2)
#define NUM_PING_PONG_BUFFERS 2
extern const struct {
PARTITION_CONTEXT above;
PARTITION_CONTEXT left;
@@ -61,8 +67,40 @@ typedef struct {
int ref_count;
vpx_codec_frame_buffer_t raw_frame_buffer;
YV12_BUFFER_CONFIG buf;
// The Following variables will only be used in frame parallel decode.
// frame_worker_owner indicates which FrameWorker owns this buffer. NULL means
// that no FrameWorker owns, or is decoding, this buffer.
VP9Worker *frame_worker_owner;
// row and col indicate which position frame has been decoded to in real
// pixel unit. They are reset to -1 when decoding begins and set to INT_MAX
// when the frame is fully decoded.
int row;
int col;
} RefCntBuffer;
typedef struct {
// Protect BufferPool from being accessed by several FrameWorkers at
// the same time during frame parallel decode.
// TODO(hkuang): Try to use atomic variable instead of locking the whole pool.
#if CONFIG_MULTITHREAD
pthread_mutex_t pool_mutex;
#endif
// Private data associated with the frame buffer callbacks.
void *cb_priv;
vpx_get_frame_buffer_cb_fn_t get_fb_cb;
vpx_release_frame_buffer_cb_fn_t release_fb_cb;
RefCntBuffer frame_bufs[FRAME_BUFFERS];
// Frame buffers allocated internally by the codec.
InternalFrameBufferList int_frame_buffers;
} BufferPool;
typedef struct VP9Common {
struct vpx_internal_error_info error;
@@ -89,10 +127,12 @@ typedef struct VP9Common {
YV12_BUFFER_CONFIG *frame_to_show;
RefCntBuffer frame_bufs[FRAME_BUFFERS];
int ref_frame_map[REF_FRAMES]; /* maps fb_idx to reference slot */
// Prepare ref_frame_map for the next frame.
// Only used in frame parallel decode.
int next_ref_frame_map[REF_FRAMES];
// TODO(jkoleszar): could expand active_ref_idx to 4, with 0 as intra, and
// roll new_fb_idx into it.
@@ -144,8 +184,8 @@ typedef struct VP9Common {
int mi_idx;
int prev_mi_idx;
MODE_INFO *mip_array[2];
MODE_INFO **mi_grid_base_array[2];
MODE_INFO *mip_array[NUM_PING_PONG_BUFFERS];
MODE_INFO **mi_grid_base_array[NUM_PING_PONG_BUFFERS];
MODE_INFO *mip; /* Base of allocated array */
MODE_INFO *mi; /* Corresponds to upper left visible macroblock */
@@ -157,8 +197,16 @@ typedef struct VP9Common {
MODE_INFO **prev_mi_grid_base;
MODE_INFO **prev_mi_grid_visible;
// Used in frame parallel decode for delay resizing prev_mi.
int update_prev_mi;
// Persistent mb segment id map used in prediction.
unsigned char *last_frame_seg_map;
int seg_map_idx;
int prev_seg_map_idx;
uint8_t *seg_map_array[NUM_PING_PONG_BUFFERS];
uint8_t *last_frame_seg_map;
uint8_t *current_frame_seg_map;
INTERP_FILTER interp_filter;
@@ -171,6 +219,10 @@ typedef struct VP9Common {
struct loopfilter lf;
struct segmentation seg;
// TODO(hkuang): Remove this as it is the same as frame_parallel_decode
// in pbi.
int frame_parallel_decode; // frame-based threading.
// Context probabilities for reference frame prediction
int allow_comp_inter_inter;
MV_REFERENCE_FRAME comp_fixed_ref;
@@ -202,30 +254,34 @@ typedef struct VP9Common {
int log2_tile_cols, log2_tile_rows;
// Private data associated with the frame buffer callbacks.
void *cb_priv;
vpx_get_frame_buffer_cb_fn_t get_fb_cb;
vpx_release_frame_buffer_cb_fn_t release_fb_cb;
// Handles memory for the codec.
InternalFrameBufferList int_frame_buffers;
// External BufferPool passed from outside.
BufferPool *buffer_pool;
PARTITION_CONTEXT *above_seg_context;
ENTROPY_CONTEXT *above_context;
} VP9_COMMON;
// TODO(hkuang): Don't need to lock the whole pool after implementing atomic
// frame reference count.
void lock_buffer_pool(BufferPool *const pool);
void unlock_buffer_pool(BufferPool *const pool);
static INLINE YV12_BUFFER_CONFIG *get_frame_new_buffer(VP9_COMMON *cm) {
return &cm->frame_bufs[cm->new_fb_idx].buf;
return &cm->buffer_pool->frame_bufs[cm->new_fb_idx].buf;
}
static INLINE int get_free_fb(VP9_COMMON *cm) {
RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
int i;
for (i = 0; i < FRAME_BUFFERS; i++)
if (cm->frame_bufs[i].ref_count == 0)
lock_buffer_pool(cm->buffer_pool);
for (i = 0; i < FRAME_BUFFERS; ++i)
if (frame_bufs[i].ref_count == 0)
break;
assert(i < FRAME_BUFFERS);
cm->frame_bufs[i].ref_count = 1;
frame_bufs[i].ref_count = 1;
unlock_buffer_pool(cm->buffer_pool);
return i;
}
@@ -310,7 +366,6 @@ static INLINE void update_partition_context(MACROBLOCKD *xd,
PARTITION_CONTEXT *const above_ctx = xd->above_seg_context + mi_col;
PARTITION_CONTEXT *const left_ctx = xd->left_seg_context + (mi_row & MI_MASK);
// num_4x4_blocks_wide_lookup[bsize] / 2
const int bs = num_8x8_blocks_wide_lookup[bsize];
// update the partition context at the end notes. set partition bits

View File

@@ -20,50 +20,7 @@
#include "vp9/common/vp9_reconinter.h"
#include "vp9/common/vp9_reconintra.h"
static void build_mc_border(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
int x, int y, int b_w, int b_h, int w, int h) {
// Get a pointer to the start of the real data for this row.
const uint8_t *ref_row = src - x - y * src_stride;
if (y >= h)
ref_row += (h - 1) * src_stride;
else if (y > 0)
ref_row += y * src_stride;
do {
int right = 0, copy;
int left = x < 0 ? -x : 0;
if (left > b_w)
left = b_w;
if (x + b_w > w)
right = x + b_w - w;
if (right > b_w)
right = b_w;
copy = b_w - left - right;
if (left)
memset(dst, ref_row[0], left);
if (copy)
memcpy(dst + left, ref_row + x + left, copy);
if (right)
memset(dst + left + copy, ref_row[w - 1], right);
dst += dst_stride;
++y;
if (y > 0 && y < h)
ref_row += src_stride;
} while (--b_h);
}
static void inter_predictor(const uint8_t *src, int src_stride,
void inter_predictor(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
const int subpel_x,
const int subpel_y,
@@ -151,7 +108,7 @@ MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, const MV *src_mv,
return clamped_mv;
}
static MV average_split_mvs(const struct macroblockd_plane *pd, int plane,
MV average_split_mvs(const struct macroblockd_plane *pd, int plane,
const MODE_INFO *mi, int ref, int block) {
const int ss_idx = ((pd->subsampling_x > 0) << 1) | (pd->subsampling_y > 0);
MV res = {0, 0};
@@ -174,7 +131,7 @@ static MV average_split_mvs(const struct macroblockd_plane *pd, int plane,
return res;
}
static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
int bw, int bh,
int x, int y, int w, int h,
int mi_x, int mi_y) {
@@ -270,169 +227,6 @@ void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
MAX_MB_PLANE - 1);
}
// TODO(jingning): This function serves as a placeholder for decoder prediction
// using on demand border extension. It should be moved to /decoder/ directory.
static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
int bw, int bh,
int x, int y, int w, int h,
int mi_x, int mi_y) {
struct macroblockd_plane *const pd = &xd->plane[plane];
const MODE_INFO *mi = xd->mi[0];
const int is_compound = has_second_ref(&mi->mbmi);
const InterpKernel *kernel = vp9_get_interp_kernel(mi->mbmi.interp_filter);
int ref;
for (ref = 0; ref < 1 + is_compound; ++ref) {
const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
struct buf_2d *const pre_buf = &pd->pre[ref];
struct buf_2d *const dst_buf = &pd->dst;
uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
const MV mv = mi->mbmi.sb_type < BLOCK_8X8
? average_split_mvs(pd, plane, mi, ref, block)
: mi->mbmi.mv[ref].as_mv;
// TODO(jkoleszar): This clamping is done in the incorrect place for the
// scaling case. It needs to be done on the scaled MV, not the pre-scaling
// MV. Note however that it performs the subsampling aware scaling so
// that the result is always q4.
// mv_precision precision is MV_PRECISION_Q4.
const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh,
pd->subsampling_x,
pd->subsampling_y);
MV32 scaled_mv;
int xs, ys, x0, y0, x0_16, y0_16, frame_width, frame_height, buf_stride,
subpel_x, subpel_y;
uint8_t *ref_frame, *buf_ptr;
const YV12_BUFFER_CONFIG *ref_buf = xd->block_refs[ref]->buf;
// Get reference frame pointer, width and height.
if (plane == 0) {
frame_width = ref_buf->y_crop_width;
frame_height = ref_buf->y_crop_height;
ref_frame = ref_buf->y_buffer;
} else {
frame_width = ref_buf->uv_crop_width;
frame_height = ref_buf->uv_crop_height;
ref_frame = plane == 1 ? ref_buf->u_buffer : ref_buf->v_buffer;
}
if (vp9_is_scaled(sf)) {
// Co-ordinate of containing block to pixel precision.
int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x));
int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y));
// Co-ordinate of the block to 1/16th pixel precision.
x0_16 = (x_start + x) << SUBPEL_BITS;
y0_16 = (y_start + y) << SUBPEL_BITS;
// Co-ordinate of current block in reference frame
// to 1/16th pixel precision.
x0_16 = sf->scale_value_x(x0_16, sf);
y0_16 = sf->scale_value_y(y0_16, sf);
// Map the top left corner of the block into the reference frame.
x0 = sf->scale_value_x(x_start + x, sf);
y0 = sf->scale_value_y(y_start + y, sf);
// Scale the MV and incorporate the sub-pixel offset of the block
// in the reference frame.
scaled_mv = vp9_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf);
xs = sf->x_step_q4;
ys = sf->y_step_q4;
} else {
// Co-ordinate of containing block to pixel precision.
x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x;
y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y;
// Co-ordinate of the block to 1/16th pixel precision.
x0_16 = x0 << SUBPEL_BITS;
y0_16 = y0 << SUBPEL_BITS;
scaled_mv.row = mv_q4.row;
scaled_mv.col = mv_q4.col;
xs = ys = 16;
}
subpel_x = scaled_mv.col & SUBPEL_MASK;
subpel_y = scaled_mv.row & SUBPEL_MASK;
// Calculate the top left corner of the best matching block in the reference frame.
x0 += scaled_mv.col >> SUBPEL_BITS;
y0 += scaled_mv.row >> SUBPEL_BITS;
x0_16 += scaled_mv.col;
y0_16 += scaled_mv.row;
// Get reference block pointer.
buf_ptr = ref_frame + y0 * pre_buf->stride + x0;
buf_stride = pre_buf->stride;
// Do border extension if there is motion or the
// width/height is not a multiple of 8 pixels.
if (scaled_mv.col || scaled_mv.row ||
(frame_width & 0x7) || (frame_height & 0x7)) {
// Get reference block bottom right coordinate.
int x1 = ((x0_16 + (w - 1) * xs) >> SUBPEL_BITS) + 1;
int y1 = ((y0_16 + (h - 1) * ys) >> SUBPEL_BITS) + 1;
int x_pad = 0, y_pad = 0;
if (subpel_x || (sf->x_step_q4 & SUBPEL_MASK)) {
x0 -= VP9_INTERP_EXTEND - 1;
x1 += VP9_INTERP_EXTEND;
x_pad = 1;
}
if (subpel_y || (sf->y_step_q4 & SUBPEL_MASK)) {
y0 -= VP9_INTERP_EXTEND - 1;
y1 += VP9_INTERP_EXTEND;
y_pad = 1;
}
// Skip border extension if block is inside the frame.
if (x0 < 0 || x0 > frame_width - 1 || x1 < 0 || x1 > frame_width ||
y0 < 0 || y0 > frame_height - 1 || y1 < 0 || y1 > frame_height - 1) {
uint8_t *buf_ptr1 = ref_frame + y0 * pre_buf->stride + x0;
// Extend the border.
build_mc_border(buf_ptr1, pre_buf->stride, xd->mc_buf, x1 - x0 + 1,
x0, y0, x1 - x0 + 1, y1 - y0 + 1, frame_width,
frame_height);
buf_stride = x1 - x0 + 1;
buf_ptr = xd->mc_buf + y_pad * 3 * buf_stride + x_pad * 3;
}
}
inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
subpel_y, sf, w, h, ref, kernel, xs, ys);
}
}
void vp9_dec_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
BLOCK_SIZE bsize) {
int plane;
const int mi_x = mi_col * MI_SIZE;
const int mi_y = mi_row * MI_SIZE;
for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize,
&xd->plane[plane]);
const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
const int bw = 4 * num_4x4_w;
const int bh = 4 * num_4x4_h;
if (xd->mi[0]->mbmi.sb_type < BLOCK_8X8) {
int i = 0, x, y;
assert(bsize == BLOCK_8X8);
for (y = 0; y < num_4x4_h; ++y)
for (x = 0; x < num_4x4_w; ++x)
dec_build_inter_predictors(xd, plane, i++, bw, bh,
4 * x, 4 * y, 4, 4, mi_x, mi_y);
} else {
dec_build_inter_predictors(xd, plane, 0, bw, bh,
0, 0, bw, bh, mi_x, mi_y);
}
}
}
void vp9_setup_dst_planes(struct macroblockd_plane planes[MAX_MB_PLANE],
const YV12_BUFFER_CONFIG *src,
int mi_row, int mi_col) {

View File

@@ -18,6 +18,26 @@
extern "C" {
#endif
void inter_predictor(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
const int subpel_x,
const int subpel_y,
const struct scale_factors *sf,
int w, int h, int ref,
const InterpKernel *kernel,
int xs, int ys);
MV average_split_mvs(const struct macroblockd_plane *pd, int plane,
const MODE_INFO *mi, int ref, int block);
MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, const MV *src_mv,
int bw, int bh, int ss_x, int ss_y);
void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
int bw, int bh,
int x, int y, int w, int h,
int mi_x, int mi_y);
void vp9_build_inter_predictors_sby(MACROBLOCKD *xd, int mi_row, int mi_col,
BLOCK_SIZE bsize);
@@ -27,9 +47,6 @@ void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd, int mi_row, int mi_col,
void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
BLOCK_SIZE bsize);
void vp9_dec_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
BLOCK_SIZE bsize);
void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
const MV *mv_q3,

View File

@@ -28,6 +28,7 @@
#include "vp9/common/vp9_reconintra.h"
#include "vp9/common/vp9_reconinter.h"
#include "vp9/common/vp9_seg_common.h"
#include "vp9/common/vp9_thread.h"
#include "vp9/common/vp9_tile_common.h"
#include "vp9/decoder/vp9_decodeframe.h"
@@ -38,7 +39,6 @@
#include "vp9/decoder/vp9_dthread.h"
#include "vp9/decoder/vp9_read_bit_buffer.h"
#include "vp9/decoder/vp9_reader.h"
#include "vp9/decoder/vp9_thread.h"
#define MAX_VP9_HEADER_SIZE 80
@@ -327,21 +327,24 @@ static void set_ref(VP9_COMMON *const cm, MACROBLOCKD *const xd,
MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
RefBuffer *ref_buffer = &cm->frame_refs[mbmi->ref_frame[idx] - LAST_FRAME];
xd->block_refs[idx] = ref_buffer;
if (!vp9_is_valid_scale(&ref_buffer->sf))
vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
"Invalid scale factors");
vp9_setup_pre_planes(xd, idx, ref_buffer->buf, mi_row, mi_col,
&ref_buffer->sf);
if (!cm->frame_parallel_decode)
xd->corrupted |= ref_buffer->buf->corrupted;
}
static void decode_block(VP9_COMMON *const cm, MACROBLOCKD *const xd,
static void decode_block(VP9Decoder *const pbi, MACROBLOCKD *const xd,
const TileInfo *const tile,
int mi_row, int mi_col,
vp9_reader *r, BLOCK_SIZE bsize) {
VP9_COMMON *const cm = &pbi->common;
const int less8x8 = bsize < BLOCK_8X8;
MB_MODE_INFO *mbmi = set_offsets(cm, xd, tile, bsize, mi_row, mi_col);
vp9_read_mode_info(cm, xd, tile, mi_row, mi_col, r);
vp9_read_mode_info(pbi, xd, tile, mi_row, mi_col, r);
if (less8x8)
bsize = BLOCK_8X8;
@@ -365,7 +368,7 @@ static void decode_block(VP9_COMMON *const cm, MACROBLOCKD *const xd,
set_ref(cm, xd, 1, mi_row, mi_col);
// Prediction
vp9_dec_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
vp9_dec_build_inter_predictors_sb(pbi, xd, mi_row, mi_col, bsize);
// Reconstruction
if (!mbmi->skip) {
@@ -404,10 +407,11 @@ static PARTITION_TYPE read_partition(VP9_COMMON *cm, MACROBLOCKD *xd, int hbs,
return p;
}
static void decode_partition(VP9_COMMON *const cm, MACROBLOCKD *const xd,
static void decode_partition(VP9Decoder *const pbi, MACROBLOCKD *const xd,
const TileInfo *const tile,
int mi_row, int mi_col,
vp9_reader* r, BLOCK_SIZE bsize) {
VP9_COMMON *const cm = &pbi->common;
const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
PARTITION_TYPE partition;
BLOCK_SIZE subsize, uv_subsize;
@@ -422,27 +426,27 @@ static void decode_partition(VP9_COMMON *const cm, MACROBLOCKD *const xd,
vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
"Invalid block size.");
if (subsize < BLOCK_8X8) {
decode_block(cm, xd, tile, mi_row, mi_col, r, subsize);
decode_block(pbi, xd, tile, mi_row, mi_col, r, subsize);
} else {
switch (partition) {
case PARTITION_NONE:
decode_block(cm, xd, tile, mi_row, mi_col, r, subsize);
decode_block(pbi, xd, tile, mi_row, mi_col, r, subsize);
break;
case PARTITION_HORZ:
decode_block(cm, xd, tile, mi_row, mi_col, r, subsize);
decode_block(pbi, xd, tile, mi_row, mi_col, r, subsize);
if (mi_row + hbs < cm->mi_rows)
decode_block(cm, xd, tile, mi_row + hbs, mi_col, r, subsize);
decode_block(pbi, xd, tile, mi_row + hbs, mi_col, r, subsize);
break;
case PARTITION_VERT:
decode_block(cm, xd, tile, mi_row, mi_col, r, subsize);
decode_block(pbi, xd, tile, mi_row, mi_col, r, subsize);
if (mi_col + hbs < cm->mi_cols)
decode_block(cm, xd, tile, mi_row, mi_col + hbs, r, subsize);
decode_block(pbi, xd, tile, mi_row, mi_col + hbs, r, subsize);
break;
case PARTITION_SPLIT:
decode_partition(cm, xd, tile, mi_row, mi_col, r, subsize);
decode_partition(cm, xd, tile, mi_row, mi_col + hbs, r, subsize);
decode_partition(cm, xd, tile, mi_row + hbs, mi_col, r, subsize);
decode_partition(cm, xd, tile, mi_row + hbs, mi_col + hbs, r, subsize);
decode_partition(pbi, xd, tile, mi_row, mi_col, r, subsize);
decode_partition(pbi, xd, tile, mi_row, mi_col + hbs, r, subsize);
decode_partition(pbi, xd, tile, mi_row + hbs, mi_col, r, subsize);
decode_partition(pbi, xd, tile, mi_row + hbs, mi_col + hbs, r, subsize);
break;
default:
assert(0 && "Invalid partition type");
@@ -621,6 +625,7 @@ static void setup_display_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
}
static void apply_frame_size(VP9_COMMON *cm, int width, int height) {
BufferPool *const pool = cm->buffer_pool;
if (cm->width != width || cm->height != height) {
// Change in frame size.
// TODO(agrange) Don't test width/height, check overall size.
@@ -637,14 +642,17 @@ static void apply_frame_size(VP9_COMMON *cm, int width, int height) {
vp9_update_frame_size(cm);
}
lock_buffer_pool(pool);
if (vp9_realloc_frame_buffer(
get_frame_new_buffer(cm), cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y, VP9_DEC_BORDER_IN_PIXELS,
&cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer, cm->get_fb_cb,
cm->cb_priv)) {
&pool->frame_bufs[cm->new_fb_idx].raw_frame_buffer, pool->get_fb_cb,
pool->cb_priv)) {
unlock_buffer_pool(pool);
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate frame buffer");
}
unlock_buffer_pool(pool);
}
static void setup_frame_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
@@ -777,7 +785,7 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
const int tile_rows = 1 << cm->log2_tile_rows;
TileBuffer tile_buffers[4][1 << 6];
int tile_row, tile_col;
int mi_row, mi_col;
int mi_row = 0, mi_col = 0;
TileData *tile_data = NULL;
if (cm->lf.filter_level && pbi->lf_worker.data1 == NULL) {
@@ -797,7 +805,6 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
vp9_copy(lf_data->planes, pbi->mb.plane);
lf_data->stop = 0;
lf_data->y_only = 0;
vp9_loop_filter_frame_init(cm, cm->lf.filter_level);
}
assert(tile_rows <= 4);
@@ -855,7 +862,7 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
vp9_zero(tile_data->xd.left_seg_context);
for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end;
mi_col += MI_BLOCK_SIZE) {
decode_partition(tile_data->cm, &tile_data->xd, &tile, mi_row, mi_col,
decode_partition(pbi, &tile_data->xd, &tile, mi_row, mi_col,
&tile_data->bit_reader, BLOCK_64X64);
}
}
@@ -879,6 +886,12 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
winterface->execute(&pbi->lf_worker);
}
}
// After loop filtering, the last 7 rows of pixels in each superblock row may
// still be changed by the longest loop filter of the next superblock
// row.
if (pbi->frame_parallel_decode)
vp9_frameworker_broadcast(pbi->cur_buf,
mi_row << MI_BLOCK_SIZE_LOG2);
}
}
@@ -894,6 +907,8 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
// Get last tile data.
tile_data = pbi->tile_data + tile_cols * tile_rows - 1;
if (pbi->frame_parallel_decode)
vp9_frameworker_broadcast(pbi->cur_buf, INT_MAX);
return vp9_reader_find_end(&tile_data->bit_reader);
}
@@ -908,7 +923,7 @@ static int tile_worker_hook(void *arg1, void *arg2) {
vp9_zero(tile_data->xd.left_seg_context);
for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
mi_col += MI_BLOCK_SIZE) {
decode_partition(tile_data->cm, &tile_data->xd, tile,
decode_partition(tile_data->pbi, &tile_data->xd, tile,
mi_row, mi_col, &tile_data->bit_reader, BLOCK_64X64);
}
}
@@ -1014,10 +1029,10 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi,
TileInfo *const tile = (TileInfo*)worker->data2;
TileBuffer *const buf = &tile_buffers[0][n];
tile_data->cm = cm;
tile_data->pbi = pbi;
tile_data->xd = pbi->mb;
tile_data->xd.corrupted = 0;
vp9_tile_init(tile, tile_data->cm, 0, buf->col);
vp9_tile_init(tile, &pbi->common, 0, buf->col);
setup_token_decoder(buf->data, data_end, buf->size, &cm->error,
&tile_data->bit_reader, pbi->decrypt_cb,
pbi->decrypt_state);
@@ -1076,8 +1091,10 @@ static BITSTREAM_PROFILE read_profile(struct vp9_read_bit_buffer *rb) {
static size_t read_uncompressed_header(VP9Decoder *pbi,
struct vp9_read_bit_buffer *rb) {
VP9_COMMON *const cm = &pbi->common;
RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
BufferPool *const pool = pbi->common.buffer_pool;
int i, mask, ref_index = 0;
size_t sz;
int i;
cm->last_frame_type = cm->frame_type;
@@ -1094,16 +1111,22 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
if (cm->show_existing_frame) {
// Show an existing frame directly.
const int frame_to_show = cm->ref_frame_map[vp9_rb_read_literal(rb, 3)];
if (frame_to_show < 0 || cm->frame_bufs[frame_to_show].ref_count < 1)
lock_buffer_pool(pool);
if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count < 1)
vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
"Buffer %d does not contain a decoded frame",
frame_to_show);
ref_cnt_fb(cm->frame_bufs, &cm->new_fb_idx, frame_to_show);
ref_cnt_fb(frame_bufs, &cm->new_fb_idx, frame_to_show);
unlock_buffer_pool(pool);
pbi->refresh_frame_flags = 0;
cm->lf.filter_level = 0;
cm->show_frame = 1;
if (pbi->frame_parallel_decode) {
for (i = 0; i < REF_FRAMES; ++i)
cm->next_ref_frame_map[i] = cm->ref_frame_map[i];
}
return 0;
}
@@ -1143,6 +1166,10 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
}
setup_frame_size(cm, rb);
if (pbi->need_resync) {
vpx_memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
pbi->need_resync = 0;
}
} else {
cm->intra_only = cm->show_frame ? 0 : vp9_rb_read_bit(rb);
@@ -1154,6 +1181,10 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
pbi->refresh_frame_flags = vp9_rb_read_literal(rb, REF_FRAMES);
setup_frame_size(cm, rb);
if (pbi->need_resync) {
vpx_memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
pbi->need_resync = 0;
}
} else {
pbi->refresh_frame_flags = vp9_rb_read_literal(rb, REF_FRAMES);
for (i = 0; i < REFS_PER_FRAME; ++i) {
@@ -1161,10 +1192,9 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
const int idx = cm->ref_frame_map[ref];
RefBuffer *const ref_frame = &cm->frame_refs[i];
ref_frame->idx = idx;
ref_frame->buf = &cm->frame_bufs[idx].buf;
ref_frame->buf = &frame_bufs[idx].buf;
cm->ref_frame_sign_bias[LAST_FRAME + i] = vp9_rb_read_bit(rb);
}
setup_frame_size_with_refs(cm, rb);
cm->allow_high_precision_mv = vp9_rb_read_bit(rb);
@@ -1182,6 +1212,12 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
}
}
if (pbi->need_resync) {
vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
"Keyframe / intra-only frame required to reset decoder"
" state");
}
if (!cm->error_resilient_mode) {
cm->coding_use_prev_mi = 1;
cm->refresh_frame_context = vp9_rb_read_bit(rb);
@@ -1196,6 +1232,30 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
// below, forcing the use of context 0 for those frame types.
cm->frame_context_idx = vp9_rb_read_literal(rb, FRAME_CONTEXTS_LOG2);
// Generate next_ref_frame_map.
lock_buffer_pool(pool);
for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
if (mask & 1) {
cm->next_ref_frame_map[ref_index] = cm->new_fb_idx;
++frame_bufs[cm->new_fb_idx].ref_count;
} else {
cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index];
}
// Current thread holds the reference frame.
if (cm->ref_frame_map[ref_index] >= 0)
++frame_bufs[cm->ref_frame_map[ref_index]].ref_count;
++ref_index;
}
for (; ref_index < REF_FRAMES; ++ref_index) {
cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index];
// Current thread holds the reference frame.
if (cm->ref_frame_map[ref_index] >= 0)
++frame_bufs[cm->ref_frame_map[ref_index]].ref_count;
}
unlock_buffer_pool(pool);
pbi->hold_ref_buf = 1;
if (frame_is_intra_only(cm) || cm->error_resilient_mode)
vp9_setup_past_independence(cm);
@@ -1341,6 +1401,7 @@ void vp9_decode_frame(VP9Decoder *pbi,
VP9_COMMON *const cm = &pbi->common;
MACROBLOCKD *const xd = &pbi->mb;
struct vp9_read_bit_buffer rb = { NULL, NULL, 0, NULL, 0};
int context_updated = 0;
uint8_t clear_data[MAX_VP9_HEADER_SIZE];
const size_t first_partition_size = read_uncompressed_header(pbi,
@@ -1378,6 +1439,28 @@ void vp9_decode_frame(VP9Decoder *pbi,
xd->corrupted = 0;
new_fb->corrupted = read_compressed_header(pbi, data, first_partition_size);
if (cm->lf.filter_level) {
vp9_loop_filter_frame_init(cm, cm->lf.filter_level);
}
// If encoded in frame parallel mode, frame context is ready after decoding
// the frame header.
if (pbi->frame_parallel_decode && cm->frame_parallel_decoding_mode) {
VP9Worker *const worker = pbi->frame_worker_owner;
FrameWorkerData *const frame_worker_data = worker->data1;
if (cm->refresh_frame_context) {
context_updated = 1;
cm->frame_contexts[cm->frame_context_idx] = cm->fc;
}
vp9_frameworker_lock_stats(worker);
pbi->cur_buf->row = -1;
pbi->cur_buf->col = -1;
frame_worker_data->frame_context_ready = 1;
// Signal the main thread that context is ready.
vp9_frameworker_signal_stats(worker);
vp9_frameworker_unlock_stats(worker);
}
// TODO(jzern): remove frame_parallel_decoding_mode restriction for
// single-frame tile decoding.
if (pbi->max_threads > 1 && tile_rows == 1 && tile_cols > 1 &&
@@ -1390,9 +1473,7 @@ void vp9_decode_frame(VP9Decoder *pbi,
*p_data_end = decode_tiles(pbi, data + first_partition_size, data_end);
}
new_fb->corrupted |= xd->corrupted;
if (!new_fb->corrupted) {
if (!xd->corrupted) {
if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode) {
vp9_adapt_coef_probs(cm);
@@ -1403,8 +1484,235 @@ void vp9_decode_frame(VP9Decoder *pbi,
} else {
debug_check_frame_counts(cm);
}
} else {
vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
"Decode failed. Frame data is corrupted.");
}
if (cm->refresh_frame_context)
// In non frame parallel decode, update the frame context here.
if (cm->refresh_frame_context && !context_updated)
cm->frame_contexts[cm->frame_context_idx] = cm->fc;
}
static void build_mc_border(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
int x, int y, int b_w, int b_h, int w, int h) {
// Get a pointer to the start of the real data for this row.
const uint8_t *ref_row = src - x - y * src_stride;
if (y >= h)
ref_row += (h - 1) * src_stride;
else if (y > 0)
ref_row += y * src_stride;
do {
int right = 0, copy;
int left = x < 0 ? -x : 0;
if (left > b_w)
left = b_w;
if (x + b_w > w)
right = x + b_w - w;
if (right > b_w)
right = b_w;
copy = b_w - left - right;
if (left)
memset(dst, ref_row[0], left);
if (copy)
memcpy(dst + left, ref_row + x + left, copy);
if (right)
memset(dst + left + copy, ref_row[w - 1], right);
dst += dst_stride;
++y;
if (y > 0 && y < h)
ref_row += src_stride;
} while (--b_h);
}
void dec_build_inter_predictors(VP9Decoder *const pbi, MACROBLOCKD *xd,
int plane, int block, int bw, int bh, int x,
int y, int w, int h, int mi_x, int mi_y) {
struct macroblockd_plane *const pd = &xd->plane[plane];
const MODE_INFO *mi = xd->mi[0];
const int is_compound = has_second_ref(&mi->mbmi);
const InterpKernel *kernel = vp9_get_interp_kernel(mi->mbmi.interp_filter);
int ref;
for (ref = 0; ref < 1 + is_compound; ++ref) {
const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
struct buf_2d *const pre_buf = &pd->pre[ref];
struct buf_2d *const dst_buf = &pd->dst;
uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
const MV mv = mi->mbmi.sb_type < BLOCK_8X8
? average_split_mvs(pd, plane, mi, ref, block)
: mi->mbmi.mv[ref].as_mv;
// TODO(jkoleszar): This clamping is done in the incorrect place for the
// scaling case. It needs to be done on the scaled MV, not the pre-scaling
// MV. Note however that it performs the subsampling aware scaling so
// that the result is always q4.
// The MV precision here is MV_PRECISION_Q4.
const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh,
pd->subsampling_x,
pd->subsampling_y);
MV32 scaled_mv;
int xs, ys, x0, y0, x0_16, y0_16, y1, frame_width, frame_height,
buf_stride, subpel_x, subpel_y;
uint8_t *ref_frame, *buf_ptr;
const int idx = xd->block_refs[ref]->idx;
BufferPool *const pool = pbi->common.buffer_pool;
RefCntBuffer *const ref_frame_buf = &pool->frame_bufs[idx];
// Get reference frame pointer, width and height.
if (plane == 0) {
frame_width = ref_frame_buf->buf.y_crop_width;
frame_height = ref_frame_buf->buf.y_crop_height;
ref_frame = ref_frame_buf->buf.y_buffer;
} else {
frame_width = ref_frame_buf->buf.uv_crop_width;
frame_height = ref_frame_buf->buf.uv_crop_height;
ref_frame = plane == 1 ? ref_frame_buf->buf.u_buffer
: ref_frame_buf->buf.v_buffer;
}
if (vp9_is_scaled(sf)) {
// Co-ordinate of containing block to pixel precision.
int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x));
int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y));
// Co-ordinate of the block to 1/16th pixel precision.
x0_16 = (x_start + x) << SUBPEL_BITS;
y0_16 = (y_start + y) << SUBPEL_BITS;
// Co-ordinate of current block in reference frame
// to 1/16th pixel precision.
x0_16 = sf->scale_value_x(x0_16, sf);
y0_16 = sf->scale_value_y(y0_16, sf);
// Map the top left corner of the block into the reference frame.
x0 = sf->scale_value_x(x_start + x, sf);
y0 = sf->scale_value_y(y_start + y, sf);
// Scale the MV and incorporate the sub-pixel offset of the block
// in the reference frame.
scaled_mv = vp9_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf);
xs = sf->x_step_q4;
ys = sf->y_step_q4;
} else {
// Co-ordinate of containing block to pixel precision.
x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x;
y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y;
// Co-ordinate of the block to 1/16th pixel precision.
x0_16 = x0 << SUBPEL_BITS;
y0_16 = y0 << SUBPEL_BITS;
scaled_mv.row = mv_q4.row;
scaled_mv.col = mv_q4.col;
xs = ys = 16;
}
subpel_x = scaled_mv.col & SUBPEL_MASK;
subpel_y = scaled_mv.row & SUBPEL_MASK;
// Calculate the top left corner of the best matching block in the
// reference frame.
x0 += scaled_mv.col >> SUBPEL_BITS;
y0 += scaled_mv.row >> SUBPEL_BITS;
x0_16 += scaled_mv.col;
y0_16 += scaled_mv.row;
// Get reference block pointer.
buf_ptr = ref_frame + y0 * pre_buf->stride + x0;
buf_stride = pre_buf->stride;
// Get reference block bottom right vertical coordinate.
y1 = ((y0_16 + (h - 1) * ys) >> SUBPEL_BITS) + 1;
// Do border extension if there is motion or the
// width/height is not a multiple of 8 pixels.
if (scaled_mv.col || scaled_mv.row ||
(frame_width & 0x7) || (frame_height & 0x7)) {
int x_pad = 0, y_pad = 0;
// Get reference block bottom right horizontal coordinate.
int x1 = ((x0_16 + (w - 1) * xs) >> SUBPEL_BITS) + 1;
if (subpel_x || (sf->x_step_q4 & SUBPEL_MASK)) {
x0 -= VP9_INTERP_EXTEND - 1;
x1 += VP9_INTERP_EXTEND;
x_pad = 1;
}
if (subpel_y || (sf->y_step_q4 & SUBPEL_MASK)) {
y0 -= VP9_INTERP_EXTEND - 1;
y1 += VP9_INTERP_EXTEND;
y_pad = 1;
}
// Wait until the reference block is ready. Pad 7 more pixels as the last 7
// pixels of each superblock row can still be changed by the next superblock row.
if (pbi->frame_parallel_decode)
vp9_frameworker_wait(pbi->frame_worker_owner, ref_frame_buf,
(y1 + 7) << (plane == 0 ? 0 : 1));
// Extend the border only if part of the block lies outside the frame.
if (x0 < 0 || x0 > frame_width - 1 || x1 < 0 || x1 > frame_width ||
y0 < 0 || y0 > frame_height - 1 || y1 < 0 || y1 > frame_height - 1) {
uint8_t *buf_ptr1 = ref_frame + y0 * pre_buf->stride + x0;
// Extend the border.
build_mc_border(buf_ptr1, pre_buf->stride, xd->mc_buf, x1 - x0 + 1,
x0, y0, x1 - x0 + 1, y1 - y0 + 1, frame_width,
frame_height);
buf_stride = x1 - x0 + 1;
buf_ptr = xd->mc_buf + y_pad * 3 * buf_stride + x_pad * 3;
}
} else {
// Wait until the reference block is ready. Pad 7 more pixels as the last 7
// pixels of each superblock row can still be changed by the next superblock row.
if (pbi->frame_parallel_decode)
vp9_frameworker_wait(pbi->frame_worker_owner, ref_frame_buf,
(y1 + 7) << (plane == 0 ? 0 : 1));
}
inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
subpel_y, sf, w, h, ref, kernel, xs, ys);
}
}
void vp9_dec_build_inter_predictors_sb(VP9Decoder *const pbi, MACROBLOCKD *xd,
int mi_row, int mi_col,
BLOCK_SIZE bsize) {
int plane;
const int mi_x = mi_col * MI_SIZE;
const int mi_y = mi_row * MI_SIZE;
for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize,
&xd->plane[plane]);
const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
const int bw = 4 * num_4x4_w;
const int bh = 4 * num_4x4_h;
if (xd->mi[0]->mbmi.sb_type < BLOCK_8X8) {
int i = 0, x, y;
assert(bsize == BLOCK_8X8);
for (y = 0; y < num_4x4_h; ++y)
for (x = 0; x < num_4x4_w; ++x)
dec_build_inter_predictors(pbi, xd, plane, i++, bw, bh,
4 * x, 4 * y, 4, 4, mi_x, mi_y);
} else {
dec_build_inter_predictors(pbi, xd, plane, 0, bw, bh,
0, 0, bw, bh, mi_x, mi_y);
}
}
}

View File

@@ -25,6 +25,9 @@ void vp9_decode_frame(struct VP9Decoder *pbi,
const uint8_t *data, const uint8_t *data_end,
const uint8_t **p_data_end);
void vp9_dec_build_inter_predictors_sb(struct VP9Decoder *const pbi,
MACROBLOCKD *xd, int mi_row, int mi_col,
BLOCK_SIZE bsize);
#ifdef __cplusplus
} // extern "C"
#endif

View File

@@ -96,7 +96,7 @@ static void set_segment_id(VP9_COMMON *cm, BLOCK_SIZE bsize,
for (y = 0; y < ymis; y++)
for (x = 0; x < xmis; x++)
cm->last_frame_seg_map[mi_offset + y * cm->mi_cols + x] = segment_id;
cm->current_frame_seg_map[mi_offset + y * cm->mi_cols + x] = segment_id;
}
static int read_intra_segment_id(VP9_COMMON *const cm, MACROBLOCKD *const xd,
@@ -129,8 +129,10 @@ static int read_inter_segment_id(VP9_COMMON *const cm, MACROBLOCKD *const xd,
predicted_segment_id = vp9_get_segment_id(cm, cm->last_frame_seg_map,
bsize, mi_row, mi_col);
if (!seg->update_map)
if (!seg->update_map) {
set_segment_id(cm, bsize, mi_row, mi_col, predicted_segment_id);
return predicted_segment_id;
}
if (seg->temporal_update) {
const vp9_prob pred_prob = vp9_get_pred_prob_seg_id(seg, xd);
@@ -418,11 +420,18 @@ static int read_is_inter_block(VP9_COMMON *const cm, MACROBLOCKD *const xd,
}
}
static void read_inter_block_mode_info(VP9_COMMON *const cm,
static void fpm_sync(void *const data, int mi_row) {
VP9Decoder *const pbi = (VP9Decoder *)data;
vp9_frameworker_wait(pbi->frame_worker_owner, pbi->prev_buf,
mi_row << MI_BLOCK_SIZE_LOG2);
}
static void read_inter_block_mode_info(VP9Decoder *const pbi,
MACROBLOCKD *const xd,
const TileInfo *const tile,
MODE_INFO *const mi,
int mi_row, int mi_col, vp9_reader *r) {
VP9_COMMON *const cm = &pbi->common;
MB_MODE_INFO *const mbmi = &mi->mbmi;
const BLOCK_SIZE bsize = mbmi->sb_type;
const int allow_hp = cm->allow_high_precision_mv;
@@ -436,7 +445,7 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm,
for (ref = 0; ref < 1 + is_compound; ++ref) {
const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
vp9_find_mv_refs(cm, xd, tile, mi, frame, mbmi->ref_mvs[frame],
mi_row, mi_col);
mi_row, mi_col, fpm_sync, (void *)pbi);
}
inter_mode_ctx = mbmi->mode_context[mbmi->ref_frame[0]];
@@ -510,10 +519,13 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm,
}
}
static void read_inter_frame_mode_info(VP9_COMMON *const cm,
// TODO(hkuang): Pass cm instead of pbi. This requires a change in
// vp9_frameworker_wait.
static void read_inter_frame_mode_info(VP9Decoder *const pbi,
MACROBLOCKD *const xd,
const TileInfo *const tile,
int mi_row, int mi_col, vp9_reader *r) {
VP9_COMMON *const cm = &pbi->common;
MODE_INFO *const mi = xd->mi[0];
MB_MODE_INFO *const mbmi = &mi->mbmi;
int inter_block;
@@ -527,16 +539,17 @@ static void read_inter_frame_mode_info(VP9_COMMON *const cm,
!mbmi->skip || !inter_block, r);
if (inter_block)
read_inter_block_mode_info(cm, xd, tile, mi, mi_row, mi_col, r);
read_inter_block_mode_info(pbi, xd, tile, mi, mi_row, mi_col, r);
else
read_intra_block_mode_info(cm, mi, r);
}
void vp9_read_mode_info(VP9_COMMON *cm, MACROBLOCKD *xd,
void vp9_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd,
const TileInfo *const tile,
int mi_row, int mi_col, vp9_reader *r) {
VP9_COMMON *const cm = &pbi->common;
if (frame_is_intra_only(cm))
read_intra_frame_mode_info(cm, xd, mi_row, mi_col, r);
else
read_inter_frame_mode_info(cm, xd, tile, mi_row, mi_col, r);
read_inter_frame_mode_info(pbi, xd, tile, mi_row, mi_col, r);
}

View File

@@ -11,6 +11,7 @@
#ifndef VP9_DECODER_VP9_DECODEMV_H_
#define VP9_DECODER_VP9_DECODEMV_H_
#include "vp9/decoder/vp9_decoder.h"
#include "vp9/decoder/vp9_reader.h"
#ifdef __cplusplus
@@ -19,7 +20,7 @@ extern "C" {
struct TileInfo;
void vp9_read_mode_info(VP9_COMMON *cm, MACROBLOCKD *xd,
void vp9_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd,
const struct TileInfo *const tile,
int mi_row, int mi_col, vp9_reader *r);

View File

@@ -26,6 +26,7 @@
#endif
#include "vp9/common/vp9_quant_common.h"
#include "vp9/common/vp9_systemdependent.h"
#include "vp9/common/vp9_thread.h"
#include "vp9/decoder/vp9_decodeframe.h"
#include "vp9/decoder/vp9_decoder.h"
@@ -41,7 +42,7 @@ static void initialize_dec() {
}
}
VP9Decoder *vp9_decoder_create() {
VP9Decoder *vp9_decoder_create(BufferPool *const pool) {
VP9Decoder *const pbi = vpx_memalign(32, sizeof(*pbi));
VP9_COMMON *const cm = pbi ? &pbi->common : NULL;
@@ -57,15 +58,18 @@ VP9Decoder *vp9_decoder_create() {
}
cm->error.setjmp = 1;
pbi->need_resync = 1;
initialize_dec();
vp9_rtcd();
// Initialize the references to not point to any frame buffers.
vpx_memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
vpx_memset(&cm->next_ref_frame_map, -1, sizeof(cm->next_ref_frame_map));
cm->current_video_frame = 0;
pbi->ready_for_new_data = 1;
pbi->common.buffer_pool = pool;
// vp9_init_dequantizer() is first called here. Add check in
// frame_init_dequantizer() to avoid unnecessary calling of
@@ -124,7 +128,7 @@ vpx_codec_err_t vp9_copy_reference_dec(VP9Decoder *pbi,
*/
if (ref_frame_flag == VP9_LAST_FLAG) {
const YV12_BUFFER_CONFIG *const cfg =
&cm->frame_bufs[cm->ref_frame_map[0]].buf;
&cm->buffer_pool->frame_bufs[cm->ref_frame_map[0]].buf;
if (!equal_dimensions(cfg, sd))
vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
"Incorrect buffer dimensions");
@@ -143,6 +147,7 @@ vpx_codec_err_t vp9_set_reference_dec(VP9_COMMON *cm,
VP9_REFFRAME ref_frame_flag,
YV12_BUFFER_CONFIG *sd) {
RefBuffer *ref_buf = NULL;
RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
// TODO(jkoleszar): The decoder doesn't have any real knowledge of what the
// encoder is using the frame buffers for. This is just a stub to keep the
@@ -170,11 +175,11 @@ vpx_codec_err_t vp9_set_reference_dec(VP9_COMMON *cm,
const int free_fb = get_free_fb(cm);
// Decrease ref_count since it will be increased again in
// ref_cnt_fb() below.
cm->frame_bufs[free_fb].ref_count--;
--frame_bufs[free_fb].ref_count;
// Manage the reference counters and copy image.
ref_cnt_fb(cm->frame_bufs, ref_fb_ptr, free_fb);
ref_buf->buf = &cm->frame_bufs[*ref_fb_ptr].buf;
ref_cnt_fb(frame_bufs, ref_fb_ptr, free_fb);
ref_buf->buf = &frame_bufs[*ref_fb_ptr].buf;
vp8_yv12_copy_frame(sd, ref_buf->buf);
}
@@ -184,11 +189,12 @@ vpx_codec_err_t vp9_set_reference_dec(VP9_COMMON *cm,
int vp9_get_reference_dec(VP9Decoder *pbi, int index, YV12_BUFFER_CONFIG **fb) {
VP9_COMMON *cm = &pbi->common;
RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
if (index < 0 || index >= REF_FRAMES)
return -1;
*fb = &cm->frame_bufs[cm->ref_frame_map[index]].buf;
*fb = &frame_bufs[cm->ref_frame_map[index]].buf;
return 0;
}
@@ -196,21 +202,38 @@ int vp9_get_reference_dec(VP9Decoder *pbi, int index, YV12_BUFFER_CONFIG **fb) {
static void swap_frame_buffers(VP9Decoder *pbi) {
int ref_index = 0, mask;
VP9_COMMON *const cm = &pbi->common;
BufferPool *const pool = cm->buffer_pool;
RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
lock_buffer_pool(pool);
for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
if (mask & 1) {
const int old_idx = cm->ref_frame_map[ref_index];
ref_cnt_fb(cm->frame_bufs, &cm->ref_frame_map[ref_index],
cm->new_fb_idx);
if (old_idx >= 0 && cm->frame_bufs[old_idx].ref_count == 0)
cm->release_fb_cb(cm->cb_priv,
&cm->frame_bufs[old_idx].raw_frame_buffer);
// The current thread releases its hold on the reference frame.
decrease_ref_count(old_idx, frame_bufs, pool);
// Release the reference frame held in the reference map.
if ((mask & 1) && old_idx >= 0) {
decrease_ref_count(old_idx, frame_bufs, pool);
}
cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index];
++ref_index;
}
// The current thread releases its hold on the remaining reference frames.
for (; ref_index < REF_FRAMES && !cm->show_existing_frame; ++ref_index) {
const int old_idx = cm->ref_frame_map[ref_index];
decrease_ref_count(old_idx, frame_bufs, pool);
cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index];
}
unlock_buffer_pool(pool);
pbi->hold_ref_buf = 0;
cm->frame_to_show = get_frame_new_buffer(cm);
cm->frame_bufs[cm->new_fb_idx].ref_count--;
if (!pbi->frame_parallel_decode || !cm->show_frame) {
lock_buffer_pool(pool);
--frame_bufs[cm->new_fb_idx].ref_count;
unlock_buffer_pool(pool);
}
// Invalidate these references until the next frame starts.
for (ref_index = 0; ref_index < 3; ref_index++)
@@ -220,9 +243,10 @@ static void swap_frame_buffers(VP9Decoder *pbi) {
int vp9_receive_compressed_data(VP9Decoder *pbi,
size_t size, const uint8_t **psource) {
VP9_COMMON *const cm = &pbi->common;
BufferPool *const pool = cm->buffer_pool;
RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
const uint8_t *source = *psource;
int retcode = 0;
cm->error.error_code = VPX_CODEC_OK;
if (size == 0) {
@@ -238,27 +262,63 @@ int vp9_receive_compressed_data(VP9Decoder *pbi,
cm->frame_refs[0].buf->corrupted = 1;
}
pbi->ready_for_new_data = 0;
// Check if the previous frame was a frame without any references to it.
if (cm->new_fb_idx >= 0 && cm->frame_bufs[cm->new_fb_idx].ref_count == 0)
cm->release_fb_cb(cm->cb_priv,
&cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer);
// Release frame buffer if not decoding in frame parallel mode.
if (!pbi->frame_parallel_decode && cm->new_fb_idx >= 0
&& frame_bufs[cm->new_fb_idx].ref_count == 0)
pool->release_fb_cb(pool->cb_priv,
&frame_bufs[cm->new_fb_idx].raw_frame_buffer);
cm->new_fb_idx = get_free_fb(cm);
pbi->hold_ref_buf = 0;
if (pbi->frame_parallel_decode) {
VP9Worker *const worker = pbi->frame_worker_owner;
vp9_frameworker_lock_stats(worker);
frame_bufs[cm->new_fb_idx].frame_worker_owner = worker;
// Reset decoding progress.
pbi->cur_buf = &frame_bufs[cm->new_fb_idx];
pbi->cur_buf->row = -1;
pbi->cur_buf->col = -1;
vp9_frameworker_unlock_stats(worker);
} else {
pbi->cur_buf = &frame_bufs[cm->new_fb_idx];
}
if (setjmp(cm->error.jmp)) {
cm->error.setjmp = 0;
pbi->ready_for_new_data = 1;
// We do not know if the missing frame(s) was supposed to update
// any of the reference buffers, but we act conservatively and
// mark only the last buffer as corrupted.
//
// TODO(jkoleszar): Error concealment is undefined and non-normative
// at this point, but if it becomes so, [0] may not always be the correct
// thing to do here.
if (cm->frame_refs[0].idx != INT_MAX && cm->frame_refs[0].buf != NULL)
cm->frame_refs[0].buf->corrupted = 1;
lock_buffer_pool(pool);
// Release all the reference buffers if the worker thread is holding them.
if (pbi->hold_ref_buf == 1) {
int ref_index = 0, mask;
VP9_COMMON *const cm = &pbi->common;
BufferPool *const pool = cm->buffer_pool;
RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
const int old_idx = cm->ref_frame_map[ref_index];
// The current thread releases its hold on the reference frame.
decrease_ref_count(old_idx, frame_bufs, pool);
if (cm->new_fb_idx > 0 && cm->frame_bufs[cm->new_fb_idx].ref_count > 0)
cm->frame_bufs[cm->new_fb_idx].ref_count--;
// Release the reference frame held in the reference map.
if ((mask & 1) && old_idx >= 0) {
decrease_ref_count(old_idx, frame_bufs, pool);
}
++ref_index;
}
// The current thread releases its hold on the remaining reference frames.
for (; ref_index < REF_FRAMES && !cm->show_existing_frame; ++ref_index) {
const int old_idx = cm->ref_frame_map[ref_index];
decrease_ref_count(old_idx, frame_bufs, pool);
}
pbi->hold_ref_buf = 0;
}
// Release current frame.
decrease_ref_count(cm->new_fb_idx, frame_bufs, pool);
unlock_buffer_pool(pool);
return -1;
}
@@ -271,19 +331,38 @@ int vp9_receive_compressed_data(VP9Decoder *pbi,
vp9_clear_system_state();
cm->last_width = cm->width;
cm->last_height = cm->height;
if (!cm->show_existing_frame)
cm->last_show_frame = cm->show_frame;
// Update progress in frame parallel decode.
if (pbi->frame_parallel_decode) {
// Need to lock the mutex here as another thread may
// be accessing this buffer.
VP9Worker *const worker = pbi->frame_worker_owner;
FrameWorkerData *const frame_worker_data = worker->data1;
vp9_frameworker_lock_stats(worker);
if (cm->show_frame) {
if (!cm->show_existing_frame)
vp9_swap_mi_and_prev_mi(cm);
cm->current_video_frame++;
}
vp9_swap_current_and_last_seg_map(cm);
frame_worker_data->frame_decoded = 1;
frame_worker_data->frame_context_ready = 1;
vp9_frameworker_signal_stats(worker);
vp9_frameworker_unlock_stats(worker);
} else {
cm->last_width = cm->width;
cm->last_height = cm->height;
if (cm->show_frame) {
if (!cm->show_existing_frame)
vp9_swap_mi_and_prev_mi(cm);
cm->current_video_frame++;
}
pbi->ready_for_new_data = 0;
vp9_swap_current_and_last_seg_map(cm);
}
cm->error.setjmp = 0;
return retcode;
@@ -299,12 +378,12 @@ int vp9_get_raw_frame(VP9Decoder *pbi, YV12_BUFFER_CONFIG *sd,
if (pbi->ready_for_new_data == 1)
return ret;
pbi->ready_for_new_data = 1;
/* no raw frame to show!!! */
if (pbi->common.show_frame == 0)
return ret;
pbi->ready_for_new_data = 1;
#if CONFIG_VP9_POSTPROC
ret = vp9_post_proc_frame(&pbi->common, sd, flags);
#else

View File

@@ -18,10 +18,10 @@
#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/common/vp9_ppflags.h"
#include "vp9/common/vp9_thread.h"
#include "vp9/decoder/vp9_decoder.h"
#include "vp9/decoder/vp9_dthread.h"
#include "vp9/decoder/vp9_thread.h"
#ifdef __cplusplus
extern "C" {
@@ -45,6 +45,12 @@ typedef struct VP9Decoder {
int frame_parallel_decode; // frame-based threading.
// TODO(hkuang): Combine this with cur_buf in macroblockd as they are
// the same.
RefCntBuffer *cur_buf; // Current decoding frame buffer.
RefCntBuffer *prev_buf; // Previous decoding frame buffer.
VP9Worker *frame_worker_owner; // frame_worker that owns this pbi.
VP9Worker lf_worker;
VP9Worker *tile_workers;
int num_tile_workers;
@@ -59,6 +65,8 @@ typedef struct VP9Decoder {
int max_threads;
int inv_tile_order;
int need_resync; // wait for key/intra-only frame.
int hold_ref_buf; // hold the reference buffer.
} VP9Decoder;
int vp9_receive_compressed_data(struct VP9Decoder *pbi,
@@ -78,10 +86,25 @@ vpx_codec_err_t vp9_set_reference_dec(VP9_COMMON *cm,
int vp9_get_reference_dec(struct VP9Decoder *pbi,
int index, YV12_BUFFER_CONFIG **fb);
struct VP9Decoder *vp9_decoder_create();
struct VP9Decoder *vp9_decoder_create(BufferPool *const pool);
void vp9_decoder_remove(struct VP9Decoder *pbi);
static INLINE void decrease_ref_count(int idx, RefCntBuffer *const frame_bufs,
BufferPool *const pool) {
if (idx >= 0) {
--frame_bufs[idx].ref_count;
// A worker may only get a free framebuffer index when calling get_free_fb.
// But the private buffer is not set up until the frame header has been
// decoded. So if an error happens while decoding the header, frame_bufs
// will not have a valid priv buffer.
if (frame_bufs[idx].ref_count == 0 &&
frame_bufs[idx].raw_frame_buffer.priv) {
pool->release_fb_cb(pool->cb_priv, &frame_bufs[idx].raw_frame_buffer);
}
}
}
#ifdef __cplusplus
} // extern "C"
#endif

View File

@@ -17,6 +17,8 @@
#include "vp9/decoder/vp9_dthread.h"
#include "vp9/decoder/vp9_decoder.h"
// #define DEBUG_THREAD
#if CONFIG_MULTITHREAD
static INLINE void mutex_lock(pthread_mutex_t *const mutex) {
const int kMaxTryLocks = 4000;
@@ -279,3 +281,178 @@ void vp9_loop_filter_dealloc(VP9LfSync *lf_sync, int rows) {
vp9_zero(*lf_sync);
}
}
// TODO(hkuang): Clean up all the #ifdef in this file.
void vp9_frameworker_lock_stats(VP9Worker *const worker) {
#if CONFIG_MULTITHREAD
FrameWorkerData *const worker_data = worker->data1;
pthread_mutex_lock(&worker_data->stats_mutex);
#else
(void)worker;
#endif
}
void vp9_frameworker_unlock_stats(VP9Worker *const worker) {
#if CONFIG_MULTITHREAD
FrameWorkerData *const worker_data = worker->data1;
pthread_mutex_unlock(&worker_data->stats_mutex);
#else
(void)worker;
#endif
}
void vp9_frameworker_signal_stats(VP9Worker *const worker) {
#if CONFIG_MULTITHREAD
FrameWorkerData *const worker_data = worker->data1;
// TODO(hkuang): Investigate using broadcast or signal.
pthread_cond_signal(&worker_data->stats_cond);
#else
(void)worker;
#endif
}
// TODO(hkuang): Remove worker parameter as it is only used in debug code.
void vp9_frameworker_wait(VP9Worker *const worker, RefCntBuffer *const ref_buf,
int row) {
#if CONFIG_MULTITHREAD
if (!ref_buf)
return;
// Enabling the following line of code will produce a harmless tsan warning
// but will give the best performance.
// if (ref_buf->row >= row && ref_buf->buf.corrupted != 1) return;
{
// Find the worker thread that owns the reference frame. If the reference
// frame has been fully decoded, it may no longer have an owner.
VP9Worker *const ref_worker = ref_buf->frame_worker_owner;
FrameWorkerData *const ref_worker_data =
(FrameWorkerData *)ref_worker->data1;
const VP9Decoder *const pbi = ref_worker_data->pbi;
#ifdef DEBUG_THREAD
{
FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
printf("%d %p worker is waiting for %d %p worker (%d) ref %d \r\n",
worker_data->worker_id, worker, ref_worker_data->worker_id,
ref_buf->frame_worker_owner, row, ref_buf->row);
}
#endif
vp9_frameworker_lock_stats(ref_worker);
while (ref_buf->row < row && pbi->cur_buf == ref_buf &&
ref_buf->buf.corrupted != 1) {
pthread_cond_wait(&ref_worker_data->stats_cond,
&ref_worker_data->stats_mutex);
}
if (ref_buf->buf.corrupted == 1) {
FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
vp9_frameworker_unlock_stats(ref_worker);
vpx_internal_error(&worker_data->pbi->common.error,
VPX_CODEC_CORRUPT_FRAME,
"Worker %p failed to decode frame", worker);
}
vp9_frameworker_unlock_stats(ref_worker);
}
#else
(void)worker;
(void)ref_buf;
(void)row;
#endif // CONFIG_MULTITHREAD
}
void vp9_frameworker_broadcast(RefCntBuffer *const buf, int row) {
#if CONFIG_MULTITHREAD
VP9Worker *worker = buf->frame_worker_owner;
#ifdef DEBUG_THREAD
{
FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
printf("%d %p worker decode to (%d) \r\n", worker_data->worker_id,
buf->frame_worker_owner, row);
}
#endif
vp9_frameworker_lock_stats(worker);
buf->row = row;
vp9_frameworker_signal_stats(worker);
vp9_frameworker_unlock_stats(worker);
#else
(void)buf;
(void)row;
#endif // CONFIG_MULTITHREAD
}
void vp9_frameworker_copy_context(VP9Worker *const dst_worker,
VP9Worker *const src_worker) {
#if CONFIG_MULTITHREAD
FrameWorkerData *const src_worker_data = (FrameWorkerData *)src_worker->data1;
FrameWorkerData *const dst_worker_data = (FrameWorkerData *)dst_worker->data1;
VP9_COMMON *const src_cm = &src_worker_data->pbi->common;
VP9_COMMON *const dst_cm = &dst_worker_data->pbi->common;
int i;
// Wait until source frame's context is ready.
vp9_frameworker_lock_stats(src_worker);
while (!src_worker_data->frame_context_ready) {
pthread_cond_wait(&src_worker_data->stats_cond,
&src_worker_data->stats_mutex);
}
// src worker may have already finished decoding a frame and swapped the mi.
// TODO(hkuang): Remove the following code after implementing no-ModeInfo decoding.
if (src_worker_data->frame_decoded) {
dst_cm->prev_mip = src_cm->prev_mip;
dst_cm->prev_mi = src_cm->prev_mi;
dst_cm->prev_mi_grid_base = src_cm->prev_mi_grid_base;
dst_cm->prev_mi_grid_visible = src_cm->prev_mi_grid_visible;
dst_cm->last_frame_seg_map = src_cm->last_frame_seg_map;
} else {
dst_cm->prev_mip = src_cm->mip;
dst_cm->prev_mi = src_cm->mi;
dst_cm->prev_mi_grid_base = src_cm->mi_grid_base;
dst_cm->prev_mi_grid_visible = src_cm->mi_grid_visible;
dst_cm->last_frame_seg_map = src_cm->current_frame_seg_map;
}
dst_worker_data->pbi->need_resync = src_worker_data->pbi->need_resync;
vp9_frameworker_unlock_stats(src_worker);
dst_worker_data->pbi->prev_buf =
src_worker_data->pbi->common.show_existing_frame ?
NULL : src_worker_data->pbi->cur_buf;
dst_cm->last_width = !src_cm->show_existing_frame ?
src_cm->width : src_cm->last_width;
dst_cm->last_height = !src_cm->show_existing_frame ?
src_cm->height : src_cm->last_height;
dst_cm->display_width = src_cm->display_width;
dst_cm->display_height = src_cm->display_height;
dst_cm->subsampling_x = src_cm->subsampling_x;
dst_cm->subsampling_y = src_cm->subsampling_y;
dst_cm->last_show_frame = !src_cm->show_existing_frame ?
src_cm->show_frame : src_cm->last_show_frame;
dst_cm->last_frame_type = src_cm->last_frame_type;
dst_cm->frame_type = src_cm->frame_type;
dst_cm->y_dc_delta_q = src_cm->y_dc_delta_q;
dst_cm->uv_dc_delta_q = src_cm->uv_dc_delta_q;
dst_cm->uv_ac_delta_q = src_cm->uv_ac_delta_q;
dst_cm->base_qindex = src_cm->base_qindex;
for (i = 0; i < REF_FRAMES; ++i)
dst_cm->ref_frame_map[i] = src_cm->next_ref_frame_map[i];
memcpy(dst_cm->lf_info.lfthr, src_cm->lf_info.lfthr,
(MAX_LOOP_FILTER + 1) * sizeof(loop_filter_thresh));
dst_cm->lf.last_sharpness_level = src_cm->lf.sharpness_level;
dst_cm->lf.filter_level = src_cm->lf.filter_level;
memcpy(dst_cm->lf.ref_deltas, src_cm->lf.ref_deltas, MAX_REF_LF_DELTAS);
memcpy(dst_cm->lf.mode_deltas, src_cm->lf.mode_deltas, MAX_MODE_LF_DELTAS);
dst_cm->seg = src_cm->seg;
memcpy(dst_cm->frame_contexts, src_cm->frame_contexts,
FRAME_CONTEXTS * sizeof(dst_cm->frame_contexts[0]));
#else
(void) dst_worker;
(void) src_worker;
#endif // CONFIG_MULTITHREAD
}

View File

@@ -12,14 +12,14 @@
#define VP9_DECODER_VP9_DTHREAD_H_
#include "./vpx_config.h"
#include "vp9/common/vp9_thread.h"
#include "vp9/decoder/vp9_reader.h"
#include "vp9/decoder/vp9_thread.h"
struct VP9Common;
struct VP9Decoder;
typedef struct TileWorkerData {
struct VP9Common *cm;
struct VP9Decoder *pbi;
vp9_reader bit_reader;
DECLARE_ALIGNED(16, struct macroblockd, xd);
@@ -40,6 +40,31 @@ typedef struct VP9LfSyncData {
int sync_range;
} VP9LfSync;
// WorkerData for the FrameWorker thread. It contains all the information of
// the worker and decode structures for decoding a frame.
typedef struct FrameWorkerData {
struct VP9Decoder *pbi;
const uint8_t *data;
const uint8_t *data_end;
size_t data_size;
void *user_priv;
int result;
int worker_id;
// scratch_buffer is used in frame parallel mode only.
// It is used to make a copy of the compressed data.
uint8_t *scratch_buffer;
size_t scratch_buffer_size;
#if CONFIG_MULTITHREAD
pthread_mutex_t stats_mutex;
pthread_cond_t stats_cond;
#endif
int frame_context_ready; // Current frame's context is ready to read.
int frame_decoded; // Finished decoding current frame.
} FrameWorkerData;
// Allocate memory for loopfilter row synchronization.
void vp9_loop_filter_alloc(struct VP9Common *cm, VP9LfSync *lf_sync,
int rows, int width);
@@ -54,4 +79,23 @@ void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
int frame_filter_level,
int y_only);
void vp9_frameworker_lock_stats(VP9Worker *const worker);
void vp9_frameworker_unlock_stats(VP9Worker *const worker);
void vp9_frameworker_signal_stats(VP9Worker *const worker);
// Wait until ref_buf has been decoded down to row, in real pixel units.
// Note: the worker may already have finished decoding ref_buf and released it
// in order to start decoding the next frame, so check whether the worker is
// still decoding ref_buf.
void vp9_frameworker_wait(VP9Worker *const worker, RefCntBuffer *const ref_buf,
int row);
// FrameWorker broadcasts its decoding progress so other workers that are
// waiting on it can resume decoding.
void vp9_frameworker_broadcast(RefCntBuffer *const buf, int row);
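For orientation, here is a minimal sketch of how these two calls pair up in this change: a frame worker publishes its loop-filtered row progress with vp9_frameworker_broadcast, and a dependent worker blocks in vp9_frameworker_wait until the reference rows it needs (plus 7 rows of loop-filter padding) are final. The driver loop and decode_and_filter_sb_row below are hypothetical; only the two vp9_frameworker_* calls and their arguments are taken from this patch.
// Producer side (hypothetical driver loop): after finishing each superblock
// row, publish progress in pixel rows so dependent workers can proceed.
for (mi_row = tile.mi_row_start; mi_row < tile.mi_row_end;
     mi_row += MI_BLOCK_SIZE) {
  decode_and_filter_sb_row(pbi, mi_row);  // hypothetical helper
  vp9_frameworker_broadcast(pbi->cur_buf, mi_row << MI_BLOCK_SIZE_LOG2);
}
vp9_frameworker_broadcast(pbi->cur_buf, INT_MAX);  // whole frame is done
// Consumer side: before reading reference pixels up to row y1, wait on the
// owning worker, padding 7 rows for the loop filter of the next superblock
// row (this mirrors the call in dec_build_inter_predictors).
vp9_frameworker_wait(pbi->frame_worker_owner, ref_frame_buf,
                     (y1 + 7) << (plane == 0 ? 0 : 1));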
// Copy necessary decoding context from src worker to dst worker.
void vp9_frameworker_copy_context(VP9Worker *const dst_worker,
VP9Worker *const src_worker);
#endif // VP9_DECODER_VP9_DTHREAD_H_

View File

@@ -713,7 +713,8 @@ static void cal_nmvsadcosts_hp(int *mvsadcost[2]) {
}
VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) {
VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
BufferPool *const pool) {
unsigned int i, j;
VP9_COMP *const cpi = vpx_memalign(32, sizeof(VP9_COMP));
VP9_COMMON *const cm = cpi != NULL ? &cpi->common : NULL;
@@ -734,6 +735,7 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) {
vp9_rtcd();
cpi->use_svc = 0;
cpi->common.buffer_pool = pool;
init_config(cpi, oxcf);
vp9_rc_init(&cpi->oxcf, cpi->pass, &cpi->rc);
@@ -1273,7 +1275,7 @@ int vp9_get_reference_enc(VP9_COMP *cpi, int index, YV12_BUFFER_CONFIG **fb) {
if (index < 0 || index >= REF_FRAMES)
return -1;
*fb = &cm->frame_bufs[cm->ref_frame_map[index]].buf;
*fb = &cm->buffer_pool->frame_bufs[cm->ref_frame_map[index]].buf;
return 0;
}
@@ -1542,14 +1544,13 @@ static int recode_loop_test(const VP9_COMP *cpi,
void vp9_update_reference_frames(VP9_COMP *cpi) {
VP9_COMMON * const cm = &cpi->common;
RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
// At this point the new frame has been encoded.
// If any buffer copy / swapping is signaled it should be done here.
if (cm->frame_type == KEY_FRAME) {
ref_cnt_fb(cm->frame_bufs,
&cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx);
ref_cnt_fb(cm->frame_bufs,
&cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx);
ref_cnt_fb(frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx);
ref_cnt_fb(frame_bufs, &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx);
} else if (!cpi->multi_arf_allowed && cpi->refresh_golden_frame &&
cpi->rc.is_src_frame_alt_ref && !cpi->use_svc) {
/* Preserve the previously existing golden frame and update the frame in
@@ -1563,8 +1564,7 @@ void vp9_update_reference_frames(VP9_COMP *cpi) {
*/
int tmp;
ref_cnt_fb(cm->frame_bufs,
&cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx);
ref_cnt_fb(frame_bufs, &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx);
tmp = cpi->alt_fb_idx;
cpi->alt_fb_idx = cpi->gld_fb_idx;
@@ -1577,19 +1577,17 @@ void vp9_update_reference_frames(VP9_COMP *cpi) {
arf_idx = gf_group->arf_update_idx[gf_group->index];
}
ref_cnt_fb(cm->frame_bufs,
&cm->ref_frame_map[arf_idx], cm->new_fb_idx);
ref_cnt_fb(frame_bufs, &cm->ref_frame_map[arf_idx], cm->new_fb_idx);
}
if (cpi->refresh_golden_frame) {
ref_cnt_fb(cm->frame_bufs,
ref_cnt_fb(frame_bufs,
&cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx);
}
}
if (cpi->refresh_last_frame) {
ref_cnt_fb(cm->frame_bufs,
&cm->ref_frame_map[cpi->lst_fb_idx], cm->new_fb_idx);
ref_cnt_fb(frame_bufs, &cm->ref_frame_map[cpi->lst_fb_idx], cm->new_fb_idx);
}
#if CONFIG_DENOISING
vp9_denoiser_update_frame_info(&cpi->denoiser,
@@ -1630,34 +1628,36 @@ void vp9_scale_references(VP9_COMP *cpi) {
VP9_COMMON *cm = &cpi->common;
MV_REFERENCE_FRAME ref_frame;
const VP9_REFFRAME ref_mask[3] = {VP9_LAST_FLAG, VP9_GOLD_FLAG, VP9_ALT_FLAG};
RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
const int idx = cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)];
const YV12_BUFFER_CONFIG *const ref = &cm->frame_bufs[idx].buf;
const YV12_BUFFER_CONFIG *const ref = &frame_bufs[idx].buf;
// Need to convert from VP9_REFFRAME to index into ref_mask (subtract 1).
if ((cpi->ref_frame_flags & ref_mask[ref_frame - 1]) &&
(ref->y_crop_width != cm->width || ref->y_crop_height != cm->height)) {
const int new_fb = get_free_fb(cm);
vp9_realloc_frame_buffer(&cm->frame_bufs[new_fb].buf,
vp9_realloc_frame_buffer(&frame_bufs[new_fb].buf,
cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL);
scale_and_extend_frame(ref, &cm->frame_bufs[new_fb].buf);
scale_and_extend_frame(ref, &frame_bufs[new_fb].buf);
cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
} else {
cpi->scaled_ref_idx[ref_frame - 1] = idx;
cm->frame_bufs[idx].ref_count++;
++frame_bufs[idx].ref_count;
}
}
}
static void release_scaled_references(VP9_COMP *cpi) {
VP9_COMMON *cm = &cpi->common;
RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
int i;
for (i = 0; i < 3; i++)
cm->frame_bufs[cpi->scaled_ref_idx[i]].ref_count--;
for (i = 0; i < 3; ++i)
--frame_bufs[cpi->scaled_ref_idx[i]].ref_count;
}
static void full_to_model_count(unsigned int *model_count,
@@ -2520,6 +2520,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
const int is_spatial_svc = cpi->use_svc &&
(cpi->svc.number_temporal_layers == 1) &&
(cpi->svc.number_spatial_layers > 1);
RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
if (!cpi)
return -1;
@@ -2602,7 +2603,8 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
cm->show_frame = 1;
cm->intra_only = 0;
// Check to see if the frame should be encoded as an arf overlay.
// Check to see if the frame to be encoded is an overlay for a previous
// arf frame and if so configure it as such.
check_src_altref(cpi);
}
}
@@ -2656,7 +2658,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
/* find a free buffer for the new frame, releasing the reference previously
* held.
*/
cm->frame_bufs[cm->new_fb_idx].ref_count--;
--frame_bufs[cm->new_fb_idx].ref_count;
cm->new_fb_idx = get_free_fb(cm);
if (!cpi->use_svc && cpi->multi_arf_allowed) {
@@ -2690,7 +2692,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
const int idx = cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)];
YV12_BUFFER_CONFIG *const buf = &cm->frame_bufs[idx].buf;
YV12_BUFFER_CONFIG *const buf = &frame_bufs[idx].buf;
RefBuffer *const ref_buf = &cm->frame_refs[ref_frame - 1];
ref_buf->buf = buf;
ref_buf->idx = idx;

View File

@@ -436,7 +436,8 @@ typedef struct VP9_COMP {
void vp9_initialize_enc();
struct VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf);
struct VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
BufferPool *const pool);
void vp9_remove_compressor(VP9_COMP *cpi);
void vp9_change_config(VP9_COMP *cpi, const VP9EncoderConfig *oxcf);
@@ -494,8 +495,9 @@ static INLINE int get_ref_frame_idx(const VP9_COMP *cpi,
static INLINE YV12_BUFFER_CONFIG *get_ref_frame_buffer(
VP9_COMP *cpi, MV_REFERENCE_FRAME ref_frame) {
VP9_COMMON * const cm = &cpi->common;
return &cm->frame_bufs[cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)]]
VP9_COMMON *const cm = &cpi->common;
BufferPool *const pool = cm->buffer_pool;
return &pool->frame_bufs[cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)]]
.buf;
}

View File

@@ -484,7 +484,7 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
if (cm->coding_use_prev_mi)
vp9_find_mv_refs(cm, xd, tile, xd->mi[0], ref_frame,
candidates, mi_row, mi_col);
candidates, mi_row, mi_col, NULL, NULL);
else
const_motion[ref_frame] = mv_refs_rt(cm, xd, tile, xd->mi[0],
ref_frame, candidates,

View File

@@ -2253,7 +2253,8 @@ void vp9_setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf);
// Gets an initial list of candidate vectors from neighbours and orders them
vp9_find_mv_refs(cm, xd, tile, mi, ref_frame, candidates, mi_row, mi_col);
vp9_find_mv_refs(cm, xd, tile, mi, ref_frame, candidates, mi_row, mi_col,
NULL, NULL);
// Candidate refinement carried out at encoder and decoder
vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, candidates,
@@ -2271,9 +2272,10 @@ void vp9_setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
const YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const VP9_COMP *cpi,
int ref_frame) {
const VP9_COMMON *const cm = &cpi->common;
const RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
const int ref_idx = cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)];
const int scaled_idx = cpi->scaled_ref_idx[ref_frame - 1];
return (scaled_idx != ref_idx) ? &cm->frame_bufs[scaled_idx].buf : NULL;
return (scaled_idx != ref_idx) ? &frame_bufs[scaled_idx].buf : NULL;
}
int vp9_get_switchable_rate(const VP9_COMP *cpi) {

View File

@@ -50,6 +50,8 @@ VP9_COMMON_SRCS-yes += common/vp9_seg_common.h
VP9_COMMON_SRCS-yes += common/vp9_seg_common.c
VP9_COMMON_SRCS-yes += common/vp9_systemdependent.h
VP9_COMMON_SRCS-yes += common/vp9_textblit.h
VP9_COMMON_SRCS-yes += common/vp9_thread.h
VP9_COMMON_SRCS-yes += common/vp9_thread.c
VP9_COMMON_SRCS-yes += common/vp9_tile_common.h
VP9_COMMON_SRCS-yes += common/vp9_tile_common.c
VP9_COMMON_SRCS-yes += common/vp9_loopfilter.c

View File

@@ -90,6 +90,8 @@ struct vpx_codec_alg_priv {
vp8_postproc_cfg_t preview_ppcfg;
vpx_codec_pkt_list_decl(128) pkt_list;
unsigned int fixed_kf_cntr;
// BufferPool that holds all reference frames.
BufferPool *buffer_pool;
};
static VP9_REFFRAME ref_frame_to_vp9_reframe(vpx_ref_frame_type_t frame) {
@@ -630,6 +632,10 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx,
ctx->priv->alg_priv = priv;
ctx->priv->init_flags = ctx->init_flags;
ctx->priv->enc.total_encoders = 1;
ctx->priv->alg_priv->buffer_pool =
(BufferPool *)vpx_calloc(1, sizeof(BufferPool));
if (ctx->priv->alg_priv->buffer_pool == NULL)
return VPX_CODEC_MEM_ERROR;
if (ctx->config.enc) {
// Update the reference to the config structure to an internal copy.
@@ -667,7 +673,8 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx,
set_encoder_config(&ctx->priv->alg_priv->oxcf,
&ctx->priv->alg_priv->cfg,
&ctx->priv->alg_priv->extra_cfg);
cpi = vp9_create_compressor(&ctx->priv->alg_priv->oxcf);
cpi = vp9_create_compressor(&ctx->priv->alg_priv->oxcf,
ctx->priv->alg_priv->buffer_pool);
if (cpi == NULL)
res = VPX_CODEC_MEM_ERROR;
else
@@ -681,6 +688,7 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx,
static vpx_codec_err_t encoder_destroy(vpx_codec_alg_priv_t *ctx) {
free(ctx->cx_data);
vp9_remove_compressor(ctx->cpi);
vpx_free(ctx->buffer_pool);
free(ctx);
return VPX_CODEC_OK;
}

View File

@@ -18,6 +18,7 @@
#include "vpx/vpx_decoder.h"
#include "vp9/common/vp9_frame_buffers.h"
#include "vp9/common/vp9_thread.h"
#include "vp9/decoder/vp9_decoder.h"
#include "vp9/decoder/vp9_read_bit_buffer.h"
@@ -28,19 +29,43 @@
typedef vpx_codec_stream_info_t vp9_stream_info_t;
// This limit is due to the number of frame buffers.
// TODO(hkuang): Remove this limit after implementing ondemand framebuffers.
#define FRAME_CACHE_SIZE 6 // Cache maximum 6 decoded frames.
typedef struct cache_frame {
int fb_idx;
vpx_image_t img;
} cache_frame;
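The cache is indexed as a small ring buffer by frame_cache_write, frame_cache_read, and num_cache_frames, all declared in struct vpx_codec_alg_priv below. A minimal sketch of the intended bookkeeping follows; the push helper itself is hypothetical and not part of this change.
// Hypothetical sketch: stash a decoded frame in the fixed-size ring cache so
// it can be handed out by a later get-frame call.
static void cache_decoded_frame(struct vpx_codec_alg_priv *ctx,
                                int fb_idx, const vpx_image_t *img) {
  cache_frame *const slot = &ctx->frame_cache[ctx->frame_cache_write];
  slot->fb_idx = fb_idx;
  slot->img = *img;
  ctx->frame_cache_write = (ctx->frame_cache_write + 1) % FRAME_CACHE_SIZE;
  ++ctx->num_cache_frames;  // the caller is assumed to check the cache is not full
}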
struct vpx_codec_alg_priv {
vpx_codec_priv_t base;
vpx_codec_dec_cfg_t cfg;
vp9_stream_info_t si;
struct VP9Decoder *pbi;
int postproc_cfg_set;
vp8_postproc_cfg_t postproc_cfg;
vpx_decrypt_cb decrypt_cb;
void *decrypt_state;
vpx_image_t img;
int img_avail;
int flushed;
int invert_tile_order;
int last_show_frame; // Index of last output frame.
// Frame parallel related.
int frame_parallel_decode; // frame-based threading.
VP9Worker *frame_workers;
int num_frame_workers;
int next_submit_worker_id;
int last_submit_worker_id;
int next_output_worker_id;
int available_threads;
cache_frame frame_cache[FRAME_CACHE_SIZE];
int frame_cache_write;
int frame_cache_read;
int num_cache_frames;
// BufferPool that holds all reference frames. Shared by all the FrameWorkers.
BufferPool *buffer_pool;
// External frame buffer info to save for VP9 common.
void *ext_priv; // Private data associated with the external frame buffers.
@@ -68,11 +93,11 @@ static vpx_codec_err_t decoder_init(vpx_codec_ctx_t *ctx,
ctx->priv->alg_priv = alg_priv;
ctx->priv->alg_priv->si.sz = sizeof(ctx->priv->alg_priv->si);
ctx->priv->init_flags = ctx->init_flags;
ctx->priv->alg_priv->flushed = 0;
// Only do frame parallel decode when threads > 1.
ctx->priv->alg_priv->frame_parallel_decode =
(ctx->init_flags & VPX_CODEC_USE_FRAME_THREADING);
// Disable frame parallel decoding for now.
ctx->priv->alg_priv->frame_parallel_decode = 0;
((ctx->config.dec->threads > 1) &&
(ctx->init_flags & VPX_CODEC_USE_FRAME_THREADING)) ? 1 : 0;
if (ctx->config.dec) {
// Update the reference to the config structure to an internal copy.
@@ -85,11 +110,29 @@ static vpx_codec_err_t decoder_init(vpx_codec_ctx_t *ctx,
}
static vpx_codec_err_t decoder_destroy(vpx_codec_alg_priv_t *ctx) {
if (ctx->pbi) {
vp9_decoder_remove(ctx->pbi);
ctx->pbi = NULL;
if (ctx->frame_workers != NULL) {
int i;
for (i = 0; i < ctx->num_frame_workers; ++i) {
VP9Worker *const worker = &ctx->frame_workers[i];
FrameWorkerData *const frame_worker_data =
(FrameWorkerData *)worker->data1;
vp9_get_worker_interface()->end(worker);
vp9_decoder_remove(frame_worker_data->pbi);
vpx_free(frame_worker_data->scratch_buffer);
#if CONFIG_MULTITHREAD
pthread_mutex_destroy(&frame_worker_data->stats_mutex);
pthread_cond_destroy(&frame_worker_data->stats_cond);
#endif
vpx_free(frame_worker_data);
}
#if CONFIG_MULTITHREAD
pthread_mutex_destroy(&ctx->buffer_pool->pool_mutex);
#endif
vp9_free_internal_frame_buffers(&ctx->buffer_pool->int_frame_buffers);
}
vpx_free(ctx->frame_workers);
vpx_free(ctx->buffer_pool);
vpx_free(ctx);
return VPX_CODEC_OK;
@@ -188,32 +231,43 @@ static vpx_codec_err_t decoder_get_si(vpx_codec_alg_priv_t *ctx,
return VPX_CODEC_OK;
}
static void set_error_detail(vpx_codec_alg_priv_t *ctx,
const char *const error) {
ctx->base.err_detail = error;
}
static vpx_codec_err_t update_error_state(vpx_codec_alg_priv_t *ctx,
const struct vpx_internal_error_info *error) {
if (error->error_code)
ctx->base.err_detail = error->has_detail ? error->detail : NULL;
set_error_detail(ctx, error->has_detail ? error->detail : NULL);
return error->error_code;
}
static void init_buffer_callbacks(vpx_codec_alg_priv_t *ctx) {
VP9_COMMON *const cm = &ctx->pbi->common;
int i;
for (i = 0; i < ctx->num_frame_workers; ++i) {
VP9Worker *const worker = &ctx->frame_workers[i];
FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
VP9_COMMON *const cm = &frame_worker_data->pbi->common;
BufferPool *const pool = cm->buffer_pool;
cm->new_fb_idx = -1;
if (ctx->get_ext_fb_cb != NULL && ctx->release_ext_fb_cb != NULL) {
cm->get_fb_cb = ctx->get_ext_fb_cb;
cm->release_fb_cb = ctx->release_ext_fb_cb;
cm->cb_priv = ctx->ext_priv;
pool->get_fb_cb = ctx->get_ext_fb_cb;
pool->release_fb_cb = ctx->release_ext_fb_cb;
pool->cb_priv = ctx->ext_priv;
} else {
cm->get_fb_cb = vp9_get_frame_buffer;
cm->release_fb_cb = vp9_release_frame_buffer;
pool->get_fb_cb = vp9_get_frame_buffer;
pool->release_fb_cb = vp9_release_frame_buffer;
if (vp9_alloc_internal_frame_buffers(&cm->int_frame_buffers))
if (vp9_alloc_internal_frame_buffers(&pool->int_frame_buffers))
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to initialize internal frame buffers");
cm->cb_priv = &cm->int_frame_buffers;
pool->cb_priv = &pool->int_frame_buffers;
}
}
}
@@ -232,14 +286,126 @@ static void set_ppflags(const vpx_codec_alg_priv_t *ctx,
flags->noise_level = ctx->postproc_cfg.noise_level;
}
static void init_decoder(vpx_codec_alg_priv_t *ctx) {
ctx->pbi = vp9_decoder_create();
if (ctx->pbi == NULL)
return;
static int frame_worker_hook(void *arg1, void *arg2) {
FrameWorkerData *const frame_worker_data = (FrameWorkerData *)arg1;
const uint8_t *data = frame_worker_data->data;
(void)arg2;
ctx->pbi->max_threads = ctx->cfg.threads;
ctx->pbi->inv_tile_order = ctx->invert_tile_order;
ctx->pbi->frame_parallel_decode = ctx->frame_parallel_decode;
frame_worker_data->result =
vp9_receive_compressed_data(frame_worker_data->pbi,
frame_worker_data->data_size,
&data);
frame_worker_data->data_end = data;
if (frame_worker_data->pbi->frame_parallel_decode) {
// In frame parallel decoding, a worker thread must successfully decode all
// the compressed data.
if (frame_worker_data->result != 0 ||
frame_worker_data->data + frame_worker_data->data_size - 1 > data) {
VP9Worker *const worker = frame_worker_data->pbi->frame_worker_owner;
BufferPool *const pool = frame_worker_data->pbi->common.buffer_pool;
// Signal all the other threads that are waiting for this frame.
vp9_frameworker_lock_stats(worker);
frame_worker_data->frame_context_ready = 1;
lock_buffer_pool(pool);
frame_worker_data->pbi->cur_buf->buf.corrupted = 1;
unlock_buffer_pool(pool);
frame_worker_data->pbi->need_resync = 1;
vp9_frameworker_signal_stats(worker);
vp9_frameworker_unlock_stats(worker);
return 0;
}
} else if (frame_worker_data->result != 0) {
// Check decode result in serial decode.
frame_worker_data->pbi->cur_buf->buf.corrupted = 1;
frame_worker_data->pbi->need_resync = 1;
}
return !frame_worker_data->result;
}
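For context on the hook above: the VP9Worker interface follows a small lifecycle in which reset() spawns the thread, launch() runs the hook asynchronously, execute() runs it synchronously on the calling thread, and sync() waits for completion, returning 0 and setting had_error when the hook returned 0. A reduced sketch follows; the header path and the example_* names are assumptions, not part of this patch.

#include "vp9/common/vp9_thread.h"  /* header location assumed for this branch */

static int example_hook(void *data1, void *data2) {  /* hypothetical hook */
  (void)data1;
  (void)data2;
  return 1;  /* nonzero means success, mirroring frame_worker_hook above */
}

static void example_worker_lifecycle(void) {
  const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
  VP9Worker worker;
  winterface->init(&worker);
  worker.hook = example_hook;
  worker.data1 = NULL;
  worker.data2 = NULL;
  if (!winterface->reset(&worker)) return;  /* thread creation failed */
  winterface->launch(&worker);              /* hook runs on the worker thread */
  if (!winterface->sync(&worker)) {
    /* hook returned 0; worker.had_error is now set */
  }
  winterface->end(&worker);                 /* join the thread and clean up */
}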
static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) {
int i;
const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
ctx->last_show_frame = -1;
ctx->next_submit_worker_id = 0;
ctx->last_submit_worker_id = 0;
ctx->next_output_worker_id = 0;
ctx->frame_cache_read = 0;
ctx->frame_cache_write = 0;
ctx->num_cache_frames = 0;
ctx->num_frame_workers =
(ctx->frame_parallel_decode == 1) ? ctx->cfg.threads : 1;
ctx->available_threads = ctx->num_frame_workers;
ctx->flushed = 0;
ctx->buffer_pool = (BufferPool *)vpx_calloc(1, sizeof(BufferPool));
if (ctx->buffer_pool == NULL)
return VPX_CODEC_MEM_ERROR;
#if CONFIG_MULTITHREAD
if (pthread_mutex_init(&ctx->buffer_pool->pool_mutex, NULL)) {
set_error_detail(ctx, "Failed to allocate buffer pool mutex");
return VPX_CODEC_MEM_ERROR;
}
#endif
ctx->frame_workers = (VP9Worker *)
vpx_malloc(ctx->num_frame_workers * sizeof(*ctx->frame_workers));
if (ctx->frame_workers == NULL) {
set_error_detail(ctx, "Failed to allocate frame_workers");
return VPX_CODEC_MEM_ERROR;
}
for (i = 0; i < ctx->num_frame_workers; ++i) {
VP9Worker *const worker = &ctx->frame_workers[i];
FrameWorkerData *frame_worker_data = NULL;
winterface->init(worker);
worker->data1 = vpx_memalign(32, sizeof(FrameWorkerData));
if (worker->data1 == NULL) {
set_error_detail(ctx, "Failed to allocate frame_worker_data");
return VPX_CODEC_MEM_ERROR;
}
frame_worker_data = (FrameWorkerData *)worker->data1;
frame_worker_data->pbi = vp9_decoder_create(ctx->buffer_pool);
if (frame_worker_data->pbi == NULL) {
set_error_detail(ctx, "Failed to allocate frame_worker_data");
return VPX_CODEC_MEM_ERROR;
}
frame_worker_data->pbi->frame_worker_owner = worker;
frame_worker_data->pbi->common.mi_idx = 0;
frame_worker_data->pbi->common.prev_mi_idx = 1;
frame_worker_data->worker_id = i;
frame_worker_data->scratch_buffer = NULL;
frame_worker_data->scratch_buffer_size = 0;
frame_worker_data->frame_context_ready = 0;
#if CONFIG_MULTITHREAD
if (pthread_mutex_init(&frame_worker_data->stats_mutex, NULL)) {
set_error_detail(ctx, "Failed to allocate frame_worker_data mutex");
return VPX_CODEC_MEM_ERROR;
}
if (pthread_cond_init(&frame_worker_data->stats_cond, NULL)) {
set_error_detail(ctx, "Failed to allocate frame_worker_data cond");
return VPX_CODEC_MEM_ERROR;
}
#endif
// When decoding in serial mode, the FrameWorker thread may create tile worker
// threads or a loopfilter thread.
frame_worker_data->pbi->max_threads =
(ctx->frame_parallel_decode == 0) ? ctx->cfg.threads : 0;
frame_worker_data->pbi->inv_tile_order = ctx->invert_tile_order;
frame_worker_data->pbi->frame_parallel_decode = ctx->frame_parallel_decode;
frame_worker_data->pbi->common.frame_parallel_decode =
ctx->frame_parallel_decode;
worker->hook = (VP9WorkerHook)frame_worker_hook;
if (!winterface->reset(worker)) {
set_error_detail(ctx, "Frame Worker thread creation failed");
return VPX_CODEC_MEM_ERROR;
}
}
// If postprocessing was enabled by the application and a
// configuration has not been provided, default it.
@@ -248,20 +414,17 @@ static void init_decoder(vpx_codec_alg_priv_t *ctx) {
set_default_ppflags(&ctx->postproc_cfg);
init_buffer_callbacks(ctx);
return VPX_CODEC_OK;
}
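init_decoder() runs lazily on the first frame; whether it creates one frame worker or cfg.threads of them depends on the VPX_CODEC_USE_FRAME_THREADING flag the application passed at init time. A minimal caller might look like this sketch, where the function name is illustrative and the thread count of 4 is arbitrary:

#include "vpx/vpx_decoder.h"
#include "vpx/vp8dx.h"

/* Sketch: request frame parallel decode with four frame workers. */
static vpx_codec_err_t open_fp_decoder(vpx_codec_ctx_t *codec) {
  vpx_codec_dec_cfg_t cfg = {0};
  cfg.threads = 4;  /* becomes ctx->num_frame_workers above */
  return vpx_codec_dec_init(codec, vpx_codec_vp9_dx(), &cfg,
                            VPX_CODEC_USE_FRAME_THREADING);
}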
static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx,
const uint8_t **data, unsigned int data_sz,
void *user_priv, int64_t deadline) {
YV12_BUFFER_CONFIG sd;
vp9_ppflags_t flags = {0, 0, 0};
VP9_COMMON *cm = NULL;
vp9_ppflags_t flags = {0};
const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
(void)deadline;
vp9_zero(sd);
ctx->img_avail = 0;
// Determine the stream parameters. Note that we rely on peek_si to
// validate that we have a buffer that does not wrap around the top
// of the heap.
@@ -276,33 +439,73 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx,
return VPX_CODEC_ERROR;
}
// Initialize the decoder instance on the first frame
if (ctx->pbi == NULL) {
init_decoder(ctx);
if (ctx->pbi == NULL)
return VPX_CODEC_ERROR;
}
if (!ctx->frame_parallel_decode) {
VP9Worker *const worker = ctx->frame_workers;
FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
frame_worker_data->data = *data;
frame_worker_data->data_size = data_sz;
frame_worker_data->user_priv = user_priv;
// Set these even if already initialized. The caller may have changed the
// decrypt config between frames.
ctx->pbi->decrypt_cb = ctx->decrypt_cb;
ctx->pbi->decrypt_state = ctx->decrypt_state;
frame_worker_data->pbi->decrypt_cb = ctx->decrypt_cb;
frame_worker_data->pbi->decrypt_state = ctx->decrypt_state;
cm = &ctx->pbi->common;
worker->had_error = 0;
winterface->execute(worker);
if (vp9_receive_compressed_data(ctx->pbi, data_sz, data))
return update_error_state(ctx, &cm->error);
// Update data pointer after decode.
*data = frame_worker_data->data_end;
if (worker->had_error)
return update_error_state(ctx, &frame_worker_data->pbi->common.error);
} else {
const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
VP9Worker *const worker = &ctx->frame_workers[ctx->next_submit_worker_id];
FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
// Copy context from last worker thread to next worker thread.
if (ctx->next_submit_worker_id != ctx->last_submit_worker_id)
vp9_frameworker_copy_context(
&ctx->frame_workers[ctx->next_submit_worker_id],
&ctx->frame_workers[ctx->last_submit_worker_id]);
frame_worker_data->pbi->ready_for_new_data = 0;
// Copy the compressed data into worker's internal buffer.
// TODO(hkuang): Would it be better for every worker to allocate a buffer the
// size of the first intra frame? That would avoid repeated freeing and
// reallocation.
if (frame_worker_data->scratch_buffer_size < data_sz) {
frame_worker_data->scratch_buffer =
(uint8_t *)vpx_realloc(frame_worker_data->scratch_buffer, data_sz);
if (frame_worker_data->scratch_buffer == NULL) {
set_error_detail(ctx, "Failed to reallocate scratch buffer");
return VPX_CODEC_MEM_ERROR;
}
frame_worker_data->scratch_buffer_size = data_sz;
}
frame_worker_data->data_size = data_sz;
vpx_memcpy(frame_worker_data->scratch_buffer, *data, data_sz);
frame_worker_data->frame_decoded = 0;
frame_worker_data->frame_context_ready = 0;
frame_worker_data->data = frame_worker_data->scratch_buffer;
frame_worker_data->user_priv = user_priv;
if (ctx->next_submit_worker_id != ctx->last_submit_worker_id)
ctx->last_submit_worker_id =
(ctx->last_submit_worker_id + 1) % ctx->num_frame_workers;
ctx->next_submit_worker_id =
(ctx->next_submit_worker_id + 1) % ctx->num_frame_workers;
--ctx->available_threads;
worker->had_error = 0;
winterface->launch(worker);
}
if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)
set_ppflags(ctx, &flags);
if (vp9_get_raw_frame(ctx->pbi, &sd, &flags))
return update_error_state(ctx, &cm->error);
yuvconfig2image(&ctx->img, &sd, user_priv);
ctx->img.fb_priv = cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv;
ctx->img_avail = 1;
return VPX_CODEC_OK;
}
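To summarize the scheduling state that decode_one() manipulates in frame parallel mode, all field names below are taken from this patch:

/* Bookkeeping sketch for the submit path above:
 *   next_submit_worker_id - worker that receives the next compressed frame
 *   last_submit_worker_id - worker whose frame context the new frame copies
 *   next_output_worker_id - worker whose finished frame is returned next
 *   available_threads     - decremented on launch(), incremented after sync()
 * All ids advance modulo num_frame_workers, so frames are submitted to and
 * collected from the workers in the same round-robin order. */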
@@ -381,6 +584,30 @@ static vpx_codec_err_t parse_superframe_index(const uint8_t *data,
return VPX_CODEC_OK;
}
static void wait_worker_and_cache_frame(vpx_codec_alg_priv_t *ctx) {
YV12_BUFFER_CONFIG sd;
vp9_ppflags_t flags = {0, 0, 0};
const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
VP9Worker *const worker = &ctx->frame_workers[ctx->next_output_worker_id];
FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
ctx->next_output_worker_id =
(ctx->next_output_worker_id + 1) % ctx->num_frame_workers;
winterface->sync(worker);
++ctx->available_threads;
if (vp9_get_raw_frame(frame_worker_data->pbi, &sd, &flags) == 0) {
VP9_COMMON *const cm = &frame_worker_data->pbi->common;
RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
ctx->frame_cache[ctx->frame_cache_write].fb_idx = cm->new_fb_idx;
yuvconfig2image(&ctx->frame_cache[ctx->frame_cache_write].img, &sd,
frame_worker_data->user_priv);
ctx->frame_cache[ctx->frame_cache_write].img.fb_priv =
frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv;
ctx->frame_cache_write =
(ctx->frame_cache_write + 1) % FRAME_CACHE_SIZE;
++ctx->num_cache_frames;
}
}
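frame_cache_read, frame_cache_write and num_cache_frames form a small ring buffer of decoded-but-not-yet-returned frames. The same indexing is shown in isolation below; this is a sketch, the CachedFrame/cache_* names are illustrative, and the FRAME_CACHE_SIZE value is assumed (the real constant is defined elsewhere in this patch).

#define FRAME_CACHE_SIZE 6  /* value assumed for this sketch */

typedef struct {
  int fb_idx;               /* index into the shared BufferPool, as above */
} CachedFrame;

static CachedFrame frame_cache[FRAME_CACHE_SIZE];
static int cache_read, cache_write, num_cached;

static int cache_push(CachedFrame f) {   /* mirrors wait_worker_and_cache_frame() */
  if (num_cached == FRAME_CACHE_SIZE) return 0;  /* full: caller must drain first */
  frame_cache[cache_write] = f;
  cache_write = (cache_write + 1) % FRAME_CACHE_SIZE;
  ++num_cached;
  return 1;
}

static int cache_pop(CachedFrame *f) {   /* mirrors decoder_get_frame() */
  if (num_cached == 0) return 0;
  *f = frame_cache[cache_read];
  cache_read = (cache_read + 1) % FRAME_CACHE_SIZE;
  --num_cached;
  return 1;
}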
static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
const uint8_t *data, unsigned int data_sz,
void *user_priv, long deadline) {
@@ -390,8 +617,20 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
uint32_t frame_sizes[8];
int frame_count;
if (data == NULL || data_sz == 0)
return VPX_CODEC_INVALID_PARAM;
if (data == NULL && data_sz == 0) {
ctx->flushed = 1;
return VPX_CODEC_OK;
}
// Reset flushed when receiving a valid frame.
ctx->flushed = 0;
// Initialize the decoder workers on the first frame.
if (ctx->frame_workers == NULL) {
const vpx_codec_err_t res = init_decoder(ctx);
if (res != VPX_CODEC_OK)
return res;
}
res = parse_superframe_index(data, data_sz, frame_sizes, &frame_count,
ctx->decrypt_cb, ctx->decrypt_state);
@@ -409,30 +648,46 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
for (i = 0; i < frame_count; ++i) {
const uint8_t *data_start_copy = data_start;
const uint32_t frame_size = frame_sizes[i];
vpx_codec_err_t res;
if (data_start < data
|| frame_size > (uint32_t) (data_end - data_start)) {
ctx->base.err_detail = "Invalid frame size in index";
set_error_detail(ctx, "Invalid frame size in index");
return VPX_CODEC_CORRUPT_FRAME;
}
if (ctx->available_threads == 0) {
// No more threads for decoding. Wait until the next output worker
// finishes decoding. Then copy the decoded frame into cache.
if (ctx->num_cache_frames < FRAME_CACHE_SIZE) {
wait_worker_and_cache_frame(ctx);
} else {
// TODO(hkuang): Add unit test to test this path.
set_error_detail(ctx, "Frame output cache is full.");
return VPX_CODEC_ERROR;
}
}
res = decode_one(ctx, &data_start_copy, frame_size, user_priv,
deadline);
if (res != VPX_CODEC_OK)
return res;
data_start += frame_size;
}
} else {
res = decode_one(ctx, &data_start, data_sz, user_priv, deadline);
if (ctx->available_threads == 0) {
// No more threads for decoding. Wait until the next output worker
// finishes decoding. Then copy the decoded frame into cache.
if (ctx->num_cache_frames < FRAME_CACHE_SIZE) {
wait_worker_and_cache_frame(ctx);
} else {
// TODO(hkuang): Add unit test to test this path.
set_error_detail(ctx, "Frame output cache is full.");
return VPX_CODEC_ERROR;
}
}
res = decode_one(ctx, &data, data_sz, user_priv, deadline);
if (res != VPX_CODEC_OK)
return res;
// Extra data detected after the frame.
if (data_start < data_end - 1) {
ctx->base.err_detail = "Fail to decode frame in parallel mode";
return VPX_CODEC_INCAPABLE;
}
}
} else {
// Decode in serial mode.
@@ -445,7 +700,7 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
vpx_codec_err_t res;
if (data_start < data
|| frame_size > (uint32_t) (data_end - data_start)) {
ctx->base.err_detail = "Invalid frame size in index";
set_error_detail(ctx, "Invalid frame size in index");
return VPX_CODEC_CORRUPT_FRAME;
}
@@ -476,23 +731,73 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
}
}
return VPX_CODEC_OK;
return res;
}
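The new flush path (data == NULL, data_sz == 0) exists because a frame parallel decoder deliberately holds frames back until every worker is busy; at end of stream the application has to flush and then drain. A sketch of the calling pattern, using only the public vpx_codec_* API (the function name is illustrative and the drain loop is hedged on the iterator semantics above):

#include <stddef.h>
#include "vpx/vpx_decoder.h"

/* Sketch: signal end of stream so the decoder stops holding frames back,
 * then pick up whatever is still queued in the workers and the cache. */
static void flush_decoder(vpx_codec_ctx_t *codec) {
  if (vpx_codec_decode(codec, NULL, 0, NULL, 0) == VPX_CODEC_OK) {
    const vpx_image_t *img;
    do {
      vpx_codec_iter_t iter = NULL;   /* fresh iterator for each frame */
      img = vpx_codec_get_frame(codec, &iter);
      /* consume img if non-NULL */
    } while (img != NULL);
  }
}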
static void release_last_output_frame(vpx_codec_alg_priv_t *ctx) {
RefCntBuffer *const frame_bufs = ctx->buffer_pool->frame_bufs;
// Decrease reference count of last output frame in frame parallel mode.
if (ctx->frame_parallel_decode && ctx->last_show_frame >= 0) {
BufferPool *const pool = ctx->buffer_pool;
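// The pool mutex must be held here: frame worker threads update these
// reference counts concurrently.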
lock_buffer_pool(pool);
decrease_ref_count(ctx->last_show_frame, frame_bufs, pool);
unlock_buffer_pool(pool);
}
}
static vpx_image_t *decoder_get_frame(vpx_codec_alg_priv_t *ctx,
vpx_codec_iter_t *iter) {
vpx_image_t *img = NULL;
if (ctx->img_avail) {
// Only return a frame when all the CPUs are busy or the application has
// flushed the decoder in frame parallel decode.
if (ctx->frame_parallel_decode && ctx->available_threads > 0 &&
!ctx->flushed) {
return img;
}
// Output the frames in the cache first.
if (ctx->num_cache_frames > 0) {
release_last_output_frame(ctx);
ctx->last_show_frame = ctx->frame_cache[ctx->frame_cache_read].fb_idx;
img = &ctx->frame_cache[ctx->frame_cache_read].img;
ctx->frame_cache_read = (ctx->frame_cache_read + 1) % FRAME_CACHE_SIZE;
--ctx->num_cache_frames;
return img;
}
// iter acts as a flip flop, so an image is only returned on the first
// call to get_frame.
if (!(*iter)) {
if (*iter == NULL && ctx->frame_workers != NULL) {
do {
YV12_BUFFER_CONFIG sd;
vp9_ppflags_t flags = {0, 0, 0};
const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
VP9Worker *const worker =
&ctx->frame_workers[ctx->next_output_worker_id];
FrameWorkerData *const frame_worker_data =
(FrameWorkerData *)worker->data1;
ctx->next_output_worker_id =
(ctx->next_output_worker_id + 1) % ctx->num_frame_workers;
// Wait for the frame from worker thread.
if (!winterface->sync(worker)) {
// Decoding failed. Release the worker thread.
++ctx->available_threads;
if (ctx->flushed != 1)
return img;
} else if (vp9_get_raw_frame(frame_worker_data->pbi, &sd, &flags) == 0) {
VP9_COMMON *const cm = &frame_worker_data->pbi->common;
RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
++ctx->available_threads;
release_last_output_frame(ctx);
ctx->last_show_frame = frame_worker_data->pbi->common.new_fb_idx;
yuvconfig2image(&ctx->img, &sd, frame_worker_data->user_priv);
ctx->img.fb_priv = frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv;
img = &ctx->img;
*iter = img;
return img;
}
} while (ctx->next_output_worker_id != ctx->next_submit_worker_id);
}
ctx->img_avail = 0;
return img;
}
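From the application side, the early return above means vpx_codec_get_frame() may legitimately yield nothing for the first few frames while the worker pipeline fills, and callers should treat that as "no frame yet" rather than an error. A per-frame sketch; the function name and consume_image() callback are hypothetical:

#include <stddef.h>
#include "vpx/vpx_decoder.h"

/* Sketch: decode one compressed frame and hand any ready images to the
 * caller-provided consume_image(). Returns the codec error code. */
static vpx_codec_err_t decode_and_collect(
    vpx_codec_ctx_t *codec, const uint8_t *buf, size_t size,
    void (*consume_image)(const vpx_image_t *)) {
  vpx_codec_iter_t iter = NULL;
  const vpx_image_t *img;
  const vpx_codec_err_t res =
      vpx_codec_decode(codec, buf, (unsigned int)size, NULL, 0);
  if (res != VPX_CODEC_OK) return res;
  /* In frame parallel mode this loop may produce zero images early on. */
  while ((img = vpx_codec_get_frame(codec, &iter)) != NULL) consume_image(img);
  return VPX_CODEC_OK;
}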
@@ -502,7 +807,7 @@ static vpx_codec_err_t decoder_set_fb_fn(
vpx_release_frame_buffer_cb_fn_t cb_release, void *cb_priv) {
if (cb_get == NULL || cb_release == NULL) {
return VPX_CODEC_INVALID_PARAM;
} else if (ctx->pbi == NULL) {
} else if (ctx->frame_workers == NULL) {
// If the decoder has already been initialized, do not accept changes to
// the frame buffer functions.
ctx->get_ext_fb_cb = cb_get;
@@ -518,12 +823,19 @@ static vpx_codec_err_t ctrl_set_reference(vpx_codec_alg_priv_t *ctx,
va_list args) {
vpx_ref_frame_t *const data = va_arg(args, vpx_ref_frame_t *);
// Only support this function in serial decode.
if (ctx->frame_parallel_decode) {
set_error_detail(ctx, "Not supported in frame parallel decode");
return VPX_CODEC_INCAPABLE;
}
if (data) {
vpx_ref_frame_t *const frame = (vpx_ref_frame_t *)data;
YV12_BUFFER_CONFIG sd;
VP9Worker *const worker = ctx->frame_workers;
FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
image2yuvconfig(&frame->img, &sd);
return vp9_set_reference_dec(&ctx->pbi->common,
return vp9_set_reference_dec(&frame_worker_data->pbi->common,
(VP9_REFFRAME)frame->frame_type, &sd);
} else {
return VPX_CODEC_INVALID_PARAM;
@@ -534,13 +846,19 @@ static vpx_codec_err_t ctrl_copy_reference(vpx_codec_alg_priv_t *ctx,
va_list args) {
vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
// Only support this function in serial decode.
if (ctx->frame_parallel_decode) {
set_error_detail(ctx, "Not supported in frame parallel decode");
return VPX_CODEC_INCAPABLE;
}
if (data) {
vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data;
vpx_ref_frame_t *frame = (vpx_ref_frame_t *) data;
YV12_BUFFER_CONFIG sd;
VP9Worker *const worker = ctx->frame_workers;
FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
image2yuvconfig(&frame->img, &sd);
return vp9_copy_reference_dec(ctx->pbi,
return vp9_copy_reference_dec(frame_worker_data->pbi,
(VP9_REFFRAME)frame->frame_type, &sd);
} else {
return VPX_CODEC_INVALID_PARAM;
@@ -551,10 +869,17 @@ static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx,
va_list args) {
vp9_ref_frame_t *data = va_arg(args, vp9_ref_frame_t *);
// Only support this function in serial decode.
if (ctx->frame_parallel_decode) {
set_error_detail(ctx, "Not supported in frame parallel decode");
return VPX_CODEC_INCAPABLE;
}
if (data) {
YV12_BUFFER_CONFIG* fb;
vp9_get_reference_dec(ctx->pbi, data->idx, &fb);
VP9Worker *const worker = ctx->frame_workers;
FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
vp9_get_reference_dec(frame_worker_data->pbi, data->idx, &fb);
yuvconfig2image(&data->img, fb, NULL);
return VPX_CODEC_OK;
} else {
@@ -592,27 +917,42 @@ static vpx_codec_err_t ctrl_get_last_ref_updates(vpx_codec_alg_priv_t *ctx,
va_list args) {
int *const update_info = va_arg(args, int *);
// Only support this function in serial decode.
if (ctx->frame_parallel_decode) {
set_error_detail(ctx, "Not supported in frame parallel decode");
return VPX_CODEC_INCAPABLE;
}
if (update_info) {
if (ctx->pbi)
*update_info = ctx->pbi->refresh_frame_flags;
else
if (ctx->frame_workers) {
VP9Worker *const worker = ctx->frame_workers;
FrameWorkerData *const frame_worker_data =
(FrameWorkerData *)worker->data1;
*update_info = frame_worker_data->pbi->refresh_frame_flags;
} else {
return VPX_CODEC_ERROR;
}
return VPX_CODEC_OK;
} else {
return VPX_CODEC_INVALID_PARAM;
}
}
static vpx_codec_err_t ctrl_get_frame_corrupted(vpx_codec_alg_priv_t *ctx,
va_list args) {
int *corrupted = va_arg(args, int *);
if (corrupted) {
if (ctx->pbi)
*corrupted = ctx->pbi->common.frame_to_show->corrupted;
else
if (ctx->frame_workers) {
VP9Worker *const worker = ctx->frame_workers;
FrameWorkerData *const frame_worker_data =
(FrameWorkerData *)worker->data1;
RefCntBuffer *const frame_bufs =
frame_worker_data->pbi->common.buffer_pool->frame_bufs;
*corrupted = frame_bufs[ctx->last_show_frame].buf.corrupted;
} else {
return VPX_CODEC_ERROR;
}
return VPX_CODEC_OK;
} else {
return VPX_CODEC_INVALID_PARAM;
@@ -623,9 +963,18 @@ static vpx_codec_err_t ctrl_get_display_size(vpx_codec_alg_priv_t *ctx,
va_list args) {
int *const display_size = va_arg(args, int *);
// Only support this function in serial decode.
if (ctx->frame_parallel_decode) {
set_error_detail(ctx, "Not supported in frame parallel decode");
return VPX_CODEC_INCAPABLE;
}
if (display_size) {
if (ctx->pbi) {
const VP9_COMMON *const cm = &ctx->pbi->common;
if (ctx->frame_workers) {
VP9Worker *const worker = ctx->frame_workers;
FrameWorkerData *const frame_worker_data =
(FrameWorkerData *)worker->data1;
const VP9_COMMON *const cm = &frame_worker_data->pbi->common;
display_size[0] = cm->display_width;
display_size[1] = cm->display_height;
} else {

vp9/vp9dx.mk

@@ -31,8 +31,6 @@ VP9_DX_SRCS-yes += decoder/vp9_decodemv.h
VP9_DX_SRCS-yes += decoder/vp9_detokenize.h
VP9_DX_SRCS-yes += decoder/vp9_decoder.c
VP9_DX_SRCS-yes += decoder/vp9_decoder.h
VP9_DX_SRCS-yes += decoder/vp9_thread.c
VP9_DX_SRCS-yes += decoder/vp9_thread.h
VP9_DX_SRCS-yes += decoder/vp9_dsubexp.c
VP9_DX_SRCS-yes += decoder/vp9_dsubexp.h

vpx/vpx_frame_buffer.h

@@ -22,8 +22,11 @@ extern "C" {
#include "./vpx_integer.h"
/*!\brief The maximum number of work buffers used by libvpx.
* Support maximum 4 threads to decode video in parallel.
* Each thread will use one work buffer.
* TODO(hkuang): Add support to set number of worker threads dynamically.
*/
#define VPX_MAXIMUM_WORK_BUFFERS 1
#define VPX_MAXIMUM_WORK_BUFFERS 8
/*!\brief The maximum number of reference buffers that a VP9 encoder may use.
*/
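Raising VPX_MAXIMUM_WORK_BUFFERS matters to applications that supply external frame buffers: each frame worker can hold a work buffer on top of the reference buffers, so a pool sized for the old limit can run dry. The note on vpx_codec_set_frame_buffer_functions() gives the sizing rule, sketched here (the enum name is illustrative):

#include "vpx/vpx_frame_buffer.h"

/* Sketch: worst-case number of simultaneously held frame buffers an
 * application-side allocator should be prepared to hand out. */
enum { kMaxExternalFrameBuffers =
           VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS };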

webmdec.cc

@@ -41,6 +41,7 @@ void reset(struct WebmInputContext *const webm_ctx) {
webm_ctx->block_frame_index = 0;
webm_ctx->video_track_index = 0;
webm_ctx->timestamp_ns = 0;
webm_ctx->is_key_frame = false;
}
void get_first_cluster(struct WebmInputContext *const webm_ctx) {
@@ -182,6 +183,7 @@ int webm_read_frame(struct WebmInputContext *webm_ctx,
}
*bytes_in_buffer = frame.len;
webm_ctx->timestamp_ns = block->GetTime(cluster);
webm_ctx->is_key_frame = block->IsKey();
mkvparser::MkvReader *const reader =
reinterpret_cast<mkvparser::MkvReader*>(webm_ctx->reader);

webmdec.h

@@ -28,6 +28,7 @@ struct WebmInputContext {
int block_frame_index;
int video_track_index;
uint64_t timestamp_ns;
int is_key_frame;
};
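The new is_key_frame field is what lets the frame parallel pause/seek/resume test jump to a random access point. A sketch of a caller using it follows; the function name is illustrative, and webm_read_frame()'s exact parameter list and return convention are assumed rather than quoted from this patch.

// Sketch: advance the parser until the next key frame block, assuming
// webm_read_frame() returns 0 on success and nonzero at end of stream/error.
static int seek_to_next_key_frame(struct WebmInputContext *webm_ctx,
                                  uint8_t **buffer, size_t *bytes_read,
                                  size_t *buffer_size) {
  do {
    if (webm_read_frame(webm_ctx, buffer, bytes_read, buffer_size) != 0)
      return 0;  /* ran out of data before reaching a key frame */
  } while (!webm_ctx->is_key_frame);
  return 1;
}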
// Checks if the input is a WebM file. If so, initializes WebMInputContext so