Reduce dqcoeff array size in decoder

The decoding process handles detokenization and reconstruction of each transform block sequentially, so there is no need to offset the dqcoeff buffer by the transform block index. This allows the decoder's dqcoeff array to be shrunk, which reduces memory usage and improves cache performance. Change-Id: Ibb8bfe532a7a08fcabaf6d42cbec1e986901d32d
2015-07-07 11:36:05 -07:00 · 2015-07-07 11:36:05 -07:00 · cccad1c5de
commit cccad1c5de
parent 0ede9f52b7
3 changed files with 6 additions and 3 deletions
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@ -188,8 +188,11 @@ typedef struct macroblockd {
 #endif

  /* dqcoeff are shared by all the planes. So planes must be decoded serially */
+#if CONFIG_VP9_ENCODER
  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[64 * 64]);
-
+#else
+  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]);
+#endif
  int lossless;
  int corrupted;

--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@ -188,7 +188,7 @@ static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block,
  struct macroblockd_plane *const pd = &xd->plane[plane];
  if (eob > 0) {
    TX_TYPE tx_type = DCT_DCT;
-    tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+    tran_low_t *const dqcoeff = pd->dqcoeff;
 #if CONFIG_VP9_HIGHBITDEPTH
    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
      if (xd->lossless) {
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@ -217,7 +217,7 @@ int vp9_decode_block_tokens(MACROBLOCKD *xd,
                                               pd->left_context + y);
  const scan_order *so = get_scan(xd, tx_size, pd->plane_type, block);
  const int eob = decode_coefs(xd, pd->plane_type,
-                               BLOCK_OFFSET(pd->dqcoeff, block), tx_size,
+                               pd->dqcoeff, tx_size,
                               dequant, ctx, so->scan, so->neighbors, r);
  vp9_set_contexts(xd, pd, plane_bsize, tx_size, eob > 0, x, y);
  return eob;