Decode a full row of bitstream before reconstructing

Needs more memory but allows for future parallelization.
Noticeably faster on ARM, slightly faster on x86.

Also: remove the unnecessary dec->filter_row_ field.

Change-Id: I044a808839b4e000c838a477e3e8688820436d9a
2013-10-10 21:29:58 +02:00 · 2013-10-10 21:29:58 +02:00 · cb22155201
commit cb22155201
parent dca8a4d315
5 changed files with 128 additions and 125 deletions
--- a/src/dec/frame.c
+++ b/src/dec/frame.c
@ -252,10 +252,13 @@ static int FinishRow(VP8Decoder* const dec, VP8Io* const io) {
 int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io) {
  int ok = 1;
  VP8ThreadContext* const ctx = &dec->thread_ctx_;
+  const int filter_row =
+      (dec->filter_type_ > 0) &&
+      (dec->mb_y_ >= dec->tl_mb_y_) && (dec->mb_y_ <= dec->br_mb_y_);
  if (!dec->use_threads_) {
    // ctx->id_ and ctx->f_info_ are already set
    ctx->mb_y_ = dec->mb_y_;
-    ctx->filter_row_ = dec->filter_row_;
+    ctx->filter_row_ = filter_row;
    ok = FinishRow(dec, io);
  } else {
    WebPWorker* const worker = &dec->worker_;
@ -266,8 +269,8 @@ int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io) {
      ctx->io_ = *io;
      ctx->id_ = dec->cache_id_;
      ctx->mb_y_ = dec->mb_y_;
-      ctx->filter_row_ = dec->filter_row_;
-      if (ctx->filter_row_) {    // just swap filter info
+      ctx->filter_row_ = filter_row;
+      if (filter_row) {    // just swap filter info
        VP8FInfo* const tmp = ctx->f_info_;
        ctx->f_info_ = dec->f_info_;
        dec->f_info_ = tmp;
@ -416,7 +419,7 @@ static int AllocateMemory(VP8Decoder* const dec) {
          mb_w * (dec->use_threads_ ? 2 : 1) * sizeof(VP8FInfo)
        : 0;
  const size_t yuv_size = YUV_SIZE * sizeof(*dec->yuv_b_);
-  const size_t mb_data_size = sizeof(*dec->mb_data_);
+  const size_t mb_data_size = mb_w * sizeof(*dec->mb_data_);
  const size_t cache_height = (16 * num_caches
                            + kFilterExtraRows[dec->filter_type_]) * 3 / 2;
  const size_t cache_size = top_size * cache_height;
@ -491,8 +494,9 @@ static int AllocateMemory(VP8Decoder* const dec) {
  mem += alpha_size;
  assert(mem <= (uint8_t*)dec->mem_ + dec->mem_size_);

-  // note: left-info is initialized once for all.
+  // note: left/top-info is initialized once for all.
  memset(dec->mb_info_ - 1, 0, mb_info_size);
+  VP8InitScanline(dec);   // initialize left too.

  // initialize top
  memset(dec->intra_t_, B_DC_PRED, intra_pred_mode_size);
@ -572,115 +576,118 @@ static void DoUVTransform(uint32_t bits, const int16_t* const src,
  }
 }

-void VP8ReconstructBlock(const VP8Decoder* const dec) {
+void VP8ReconstructBlocks(const VP8Decoder* const dec, int mb_y) {
  int j;
+  int mb_x;
  uint8_t* const y_dst = dec->yuv_b_ + Y_OFF;
  uint8_t* const u_dst = dec->yuv_b_ + U_OFF;
  uint8_t* const v_dst = dec->yuv_b_ + V_OFF;
-  const VP8MBData* const block = dec->mb_data_;
+  for (mb_x = 0; mb_x < dec->mb_w_; ++mb_x) {
+    const VP8MBData* const block = dec->mb_data_ + mb_x;

-  // Rotate in the left samples from previously decoded block. We move four
-  // pixels at a time for alignment reason, and because of in-loop filter.
-  if (dec->mb_x_ > 0) {
-    for (j = -1; j < 16; ++j) {
-      Copy32b(&y_dst[j * BPS - 4], &y_dst[j * BPS + 12]);
-    }
-    for (j = -1; j < 8; ++j) {
-      Copy32b(&u_dst[j * BPS - 4], &u_dst[j * BPS + 4]);
-      Copy32b(&v_dst[j * BPS - 4], &v_dst[j * BPS + 4]);
-    }
-  } else {
-    for (j = 0; j < 16; ++j) {
-      y_dst[j * BPS - 1] = 129;
-    }
-    for (j = 0; j < 8; ++j) {
-      u_dst[j * BPS - 1] = 129;
-      v_dst[j * BPS - 1] = 129;
-    }
-    // Init top-left sample on left column too
-    if (dec->mb_y_ > 0) {
-      y_dst[-1 - BPS] = u_dst[-1 - BPS] = v_dst[-1 - BPS] = 129;
-    }
-  }
-  {
-    // bring top samples into the cache
-    VP8TopSamples* const top_yuv = dec->yuv_t_ + dec->mb_x_;
-    const int16_t* const coeffs = block->coeffs_;
-    uint32_t bits = block->non_zero_y_;
-    int n;
-
-    if (dec->mb_y_ > 0) {
-      memcpy(y_dst - BPS, top_yuv[0].y, 16);
-      memcpy(u_dst - BPS, top_yuv[0].u, 8);
-      memcpy(v_dst - BPS, top_yuv[0].v, 8);
-    } else if (dec->mb_x_ == 0) {
-      // we only need to do this init once at block (0,0).
-      // Afterward, it remains valid for the whole topmost row.
-      memset(y_dst - BPS - 1, 127, 16 + 4 + 1);
-      memset(u_dst - BPS - 1, 127, 8 + 1);
-      memset(v_dst - BPS - 1, 127, 8 + 1);
-    }
-
-    // predict and add residuals
-    if (block->is_i4x4_) {   // 4x4
-      uint32_t* const top_right = (uint32_t*)(y_dst - BPS + 16);
-
-      if (dec->mb_y_ > 0) {
-        if (dec->mb_x_ >= dec->mb_w_ - 1) {    // on rightmost border
-          memset(top_right, top_yuv[0].y[15], sizeof(*top_right));
-        } else {
-          memcpy(top_right, top_yuv[1].y, sizeof(*top_right));
-        }
+    // Rotate in the left samples from previously decoded block. We move four
+    // pixels at a time for alignment reason, and because of in-loop filter.
+    if (mb_x > 0) {
+      for (j = -1; j < 16; ++j) {
+        Copy32b(&y_dst[j * BPS - 4], &y_dst[j * BPS + 12]);
      }
-      // replicate the top-right pixels below
-      top_right[BPS] = top_right[2 * BPS] = top_right[3 * BPS] = top_right[0];
-
-      // predict and add residuals for all 4x4 blocks in turn.
-      for (n = 0; n < 16; ++n, bits <<= 2) {
-        uint8_t* const dst = y_dst + kScan[n];
-        VP8PredLuma4[block->imodes_[n]](dst);
-        DoTransform(bits, coeffs + n * 16, dst);
+      for (j = -1; j < 8; ++j) {
+        Copy32b(&u_dst[j * BPS - 4], &u_dst[j * BPS + 4]);
+        Copy32b(&v_dst[j * BPS - 4], &v_dst[j * BPS + 4]);
      }
-    } else {    // 16x16
-      const int pred_func = CheckMode(dec->mb_x_, dec->mb_y_,
-                                      block->imodes_[0]);
-      VP8PredLuma16[pred_func](y_dst);
-      if (bits != 0) {
-        for (n = 0; n < 16; ++n, bits <<= 2) {
-          DoTransform(bits, coeffs + n * 16, y_dst + kScan[n]);
-        }
+    } else {
+      for (j = 0; j < 16; ++j) {
+        y_dst[j * BPS - 1] = 129;
+      }
+      for (j = 0; j < 8; ++j) {
+        u_dst[j * BPS - 1] = 129;
+        v_dst[j * BPS - 1] = 129;
+      }
+      // Init top-left sample on left column too
+      if (mb_y > 0) {
+        y_dst[-1 - BPS] = u_dst[-1 - BPS] = v_dst[-1 - BPS] = 129;
      }
    }
    {
-      // Chroma
-      const uint32_t bits_uv = block->non_zero_uv_;
-      const int pred_func = CheckMode(dec->mb_x_, dec->mb_y_, block->uvmode_);
-      VP8PredChroma8[pred_func](u_dst);
-      VP8PredChroma8[pred_func](v_dst);
-      DoUVTransform(bits_uv >> 0, coeffs + 16 * 16, u_dst);
-      DoUVTransform(bits_uv >> 8, coeffs + 20 * 16, v_dst);
-    }
+      // bring top samples into the cache
+      VP8TopSamples* const top_yuv = dec->yuv_t_ + mb_x;
+      const int16_t* const coeffs = block->coeffs_;
+      uint32_t bits = block->non_zero_y_;
+      int n;

-    // stash away top samples for next block
-    if (dec->mb_y_ < dec->mb_h_ - 1) {
-      memcpy(top_yuv[0].y, y_dst + 15 * BPS, 16);
-      memcpy(top_yuv[0].u, u_dst +  7 * BPS,  8);
-      memcpy(top_yuv[0].v, v_dst +  7 * BPS,  8);
+      if (mb_y > 0) {
+        memcpy(y_dst - BPS, top_yuv[0].y, 16);
+        memcpy(u_dst - BPS, top_yuv[0].u, 8);
+        memcpy(v_dst - BPS, top_yuv[0].v, 8);
+      } else if (mb_x == 0) {
+        // we only need to do this init once at block (0,0).
+        // Afterward, it remains valid for the whole topmost row.
+        memset(y_dst - BPS - 1, 127, 16 + 4 + 1);
+        memset(u_dst - BPS - 1, 127, 8 + 1);
+        memset(v_dst - BPS - 1, 127, 8 + 1);
+      }
+
+      // predict and add residuals
+      if (block->is_i4x4_) {   // 4x4
+        uint32_t* const top_right = (uint32_t*)(y_dst - BPS + 16);
+
+        if (mb_y > 0) {
+          if (mb_x >= dec->mb_w_ - 1) {    // on rightmost border
+            memset(top_right, top_yuv[0].y[15], sizeof(*top_right));
+          } else {
+            memcpy(top_right, top_yuv[1].y, sizeof(*top_right));
+          }
+        }
+        // replicate the top-right pixels below
+        top_right[BPS] = top_right[2 * BPS] = top_right[3 * BPS] = top_right[0];
+
+        // predict and add residuals for all 4x4 blocks in turn.
+        for (n = 0; n < 16; ++n, bits <<= 2) {
+          uint8_t* const dst = y_dst + kScan[n];
+          VP8PredLuma4[block->imodes_[n]](dst);
+          DoTransform(bits, coeffs + n * 16, dst);
+        }
+      } else {    // 16x16
+        const int pred_func = CheckMode(mb_x, mb_y,
+                                        block->imodes_[0]);
+        VP8PredLuma16[pred_func](y_dst);
+        if (bits != 0) {
+          for (n = 0; n < 16; ++n, bits <<= 2) {
+            DoTransform(bits, coeffs + n * 16, y_dst + kScan[n]);
+          }
+        }
+      }
+      {
+        // Chroma
+        const uint32_t bits_uv = block->non_zero_uv_;
+        const int pred_func = CheckMode(mb_x, mb_y, block->uvmode_);
+        VP8PredChroma8[pred_func](u_dst);
+        VP8PredChroma8[pred_func](v_dst);
+        DoUVTransform(bits_uv >> 0, coeffs + 16 * 16, u_dst);
+        DoUVTransform(bits_uv >> 8, coeffs + 20 * 16, v_dst);
+      }
+
+      // stash away top samples for next block
+      if (mb_y < dec->mb_h_ - 1) {
+        memcpy(top_yuv[0].y, y_dst + 15 * BPS, 16);
+        memcpy(top_yuv[0].u, u_dst +  7 * BPS,  8);
+        memcpy(top_yuv[0].v, v_dst +  7 * BPS,  8);
+      }
    }
-  }
-  // Transfer reconstructed samples from yuv_b_ cache to final destination.
-  {
-    const int y_offset = dec->cache_id_ * 16 * dec->cache_y_stride_;
-    const int uv_offset = dec->cache_id_ * 8 * dec->cache_uv_stride_;
-    uint8_t* const y_out = dec->cache_y_ + dec->mb_x_ * 16 + y_offset;
-    uint8_t* const u_out = dec->cache_u_ + dec->mb_x_ * 8 + uv_offset;
-    uint8_t* const v_out = dec->cache_v_ + dec->mb_x_ * 8 + uv_offset;
-    for (j = 0; j < 16; ++j) {
-      memcpy(y_out + j * dec->cache_y_stride_, y_dst + j * BPS, 16);
-    }
-    for (j = 0; j < 8; ++j) {
-      memcpy(u_out + j * dec->cache_uv_stride_, u_dst + j * BPS, 8);
-      memcpy(v_out + j * dec->cache_uv_stride_, v_dst + j * BPS, 8);
+    // Transfer reconstructed samples from yuv_b_ cache to final destination.
+    {
+      const int y_offset = dec->cache_id_ * 16 * dec->cache_y_stride_;
+      const int uv_offset = dec->cache_id_ * 8 * dec->cache_uv_stride_;
+      uint8_t* const y_out = dec->cache_y_ + mb_x * 16 + y_offset;
+      uint8_t* const u_out = dec->cache_u_ + mb_x * 8 + uv_offset;
+      uint8_t* const v_out = dec->cache_v_ + mb_x * 8 + uv_offset;
+      for (j = 0; j < 16; ++j) {
+        memcpy(y_out + j * dec->cache_y_stride_, y_dst + j * BPS, 16);
+      }
+      for (j = 0; j < 8; ++j) {
+        memcpy(u_out + j * dec->cache_uv_stride_, u_dst + j * BPS, 8);
+        memcpy(v_out + j * dec->cache_uv_stride_, v_dst + j * BPS, 8);
+      }
    }
  }
 }
--- a/src/dec/idec.c
+++ b/src/dec/idec.c
@ -451,16 +451,11 @@ static VP8StatusCode DecodeRemaining(WebPIDecoder* const idec) {
  VP8Io* const io = &idec->io_;

  assert(dec->ready_);
-
  for (; dec->mb_y_ < dec->mb_h_; ++dec->mb_y_) {
    VP8BitReader* token_br = &dec->parts_[dec->mb_y_ & (dec->num_parts_ - 1)];
-    if (dec->mb_x_ == 0) {
-      VP8InitScanline(dec);
-    }
-    for (; dec->mb_x_ < dec->mb_w_;  dec->mb_x_++) {
+    for (; dec->mb_x_ < dec->mb_w_; ++dec->mb_x_) {
      MBContext context;
      SaveContext(dec, token_br, &context);
-
      if (!VP8DecodeMB(dec, token_br)) {
        RestoreContext(&context, dec, token_br);
        // We shouldn't fail when MAX_MB data was available
@ -469,19 +464,20 @@ static VP8StatusCode DecodeRemaining(WebPIDecoder* const idec) {
        }
        return VP8_STATUS_SUSPENDED;
      }
-      // Reconstruct and emit samples.
-      VP8ReconstructBlock(dec);
-
      // Release buffer only if there is only one partition
      if (dec->num_parts_ == 1) {
        idec->mem_.start_ = token_br->buf_ - idec->mem_.buf_;
        assert(idec->mem_.start_ <= idec->mem_.end_);
      }
    }
+    VP8InitScanline(dec);   // Prepare for next scanline
+
+    // Reconstruct the samples.
+    VP8ReconstructBlocks(dec, dec->mb_y_);
+    // Filter and emit the row.
    if (!VP8ProcessRow(dec, io)) {
      return IDecError(idec, VP8_STATUS_USER_ABORT);
    }
-    dec->mb_x_ = 0;
  }
  // Synchronize the thread and check for errors.
  if (!VP8ExitCritical(dec, io)) {
--- a/src/dec/tree.c
+++ b/src/dec/tree.c
@ -338,7 +338,7 @@ void VP8ResetProba(VP8Proba* const proba) {
 void VP8ParseIntraMode(VP8BitReader* const br, VP8Decoder* const dec) {
  uint8_t* const top = dec->intra_t_ + 4 * dec->mb_x_;
  uint8_t* const left = dec->intra_l_;
-  VP8MBData* const block = dec->mb_data_;
+  VP8MBData* const block = dec->mb_data_ + dec->mb_x_;

  block->is_i4x4_ = !VP8GetBit(br, 145);   // decide for B_PRED first
  if (!block->is_i4x4_) {
--- a/src/dec/vp8.c
+++ b/src/dec/vp8.c
@ -511,7 +511,7 @@ static int ParseResiduals(VP8Decoder* const dec,
  VP8BandProbas (* const bands)[NUM_BANDS] = dec->proba_.bands_;
  const VP8BandProbas* ac_proba;
  const VP8QuantMatrix* const q = &dec->dqm_[dec->segment_];
-  VP8MBData* const block = dec->mb_data_;
+  VP8MBData* const block = dec->mb_data_ + dec->mb_x_;
  int16_t* dst = block->coeffs_;
  VP8MB* const left_mb = dec->mb_info_ - 1;
  uint8_t tnz, lnz;
@ -598,7 +598,7 @@ int VP8DecodeMB(VP8Decoder* const dec, VP8BitReader* const token_br) {
  VP8BitReader* const br = &dec->br_;
  VP8MB* const left = dec->mb_info_ - 1;
  VP8MB* const mb = dec->mb_info_ + dec->mb_x_;
-  VP8MBData* const block = dec->mb_data_;
+  VP8MBData* const block = dec->mb_data_ + dec->mb_x_;
  int skip;

  // Note: we don't save segment map (yet), as we don't expect
@ -641,24 +641,25 @@ void VP8InitScanline(VP8Decoder* const dec) {
  left->nz_ = 0;
  left->nz_dc_ = 0;
  memset(dec->intra_l_, B_DC_PRED, sizeof(dec->intra_l_));
-  dec->filter_row_ =
-    (dec->filter_type_ > 0) &&
-    (dec->mb_y_ >= dec->tl_mb_y_) && (dec->mb_y_ <= dec->br_mb_y_);
+  dec->mb_x_ = 0;
 }

 static int ParseFrame(VP8Decoder* const dec, VP8Io* io) {
  for (dec->mb_y_ = 0; dec->mb_y_ < dec->br_mb_y_; ++dec->mb_y_) {
+    // Parse bitstream for this row.
    VP8BitReader* const token_br =
        &dec->parts_[dec->mb_y_ & (dec->num_parts_ - 1)];
-    VP8InitScanline(dec);
-    for (dec->mb_x_ = 0; dec->mb_x_ < dec->mb_w_;  dec->mb_x_++) {
+    for (; dec->mb_x_ < dec->mb_w_; ++dec->mb_x_) {
      if (!VP8DecodeMB(dec, token_br)) {
        return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
                           "Premature end-of-file encountered.");
      }
-      // Reconstruct and emit samples.
-      VP8ReconstructBlock(dec);
    }
+    VP8InitScanline(dec);   // Prepare for next scanline
+
+    // Reconstruct the samples.
+    VP8ReconstructBlocks(dec, dec->mb_y_);
+    // Filter and emit the row.
    if (!VP8ProcessRow(dec, io)) {
      return VP8SetError(dec, VP8_STATUS_USER_ABORT, "Output aborted.");
    }
--- a/src/dec/vp8i.h
+++ b/src/dec/vp8i.h
@ -292,7 +292,6 @@ struct VP8Decoder {

  // Filtering side-info
  int filter_type_;                          // 0=off, 1=simple, 2=complex
-  int filter_row_;                           // per-row flag
  VP8FInfo fstrengths_[NUM_MB_SEGMENTS][2];  // precalculated per-segment/type

  // Alpha
@ -325,8 +324,8 @@ void VP8ParseQuant(VP8Decoder* const dec);

 // in frame.c
 int VP8InitFrame(VP8Decoder* const dec, VP8Io* io);
-// Predict a block and add residual
-void VP8ReconstructBlock(const VP8Decoder* const dec);
+// Reconstruct a full row of blocks (prediction + residual adding)
+void VP8ReconstructBlocks(const VP8Decoder* const dec, int mb_y);
 // Call io->setup() and finish setting up scan parameters.
 // After this call returns, one must always call VP8ExitCritical() with the
 // same parameters. Both functions should be used in pair. Returns VP8_STATUS_OK