Optimize 8x8 dequant and idct

Similar to 16x16 dequant and idct, based on the value of eobs, the 8x8 dequant and idct calculation was simplified to improve decoder performance. Combined vp9_dequant_idct_add_8x8 and vp9_dequant_dc_idct_add_8x8 to eliminate duplicate code. Change-Id: Ia58e50ab27f7012b7379c495837c9c0b5ba9cf7f
2012-11-09 17:50:13 -08:00 · 2012-11-09 17:50:13 -08:00 · e60478d46d
commit e60478d46d
parent 5d65614fdd
7 changed files with 310 additions and 244 deletions
--- a/vp9/common/generic/systemdependent.c
+++ b/vp9/common/generic/systemdependent.c
@ -29,10 +29,11 @@ void vp9_machine_specific_config(VP9_COMMON *ctx) {
  rtcd->idct.iwalsh1      = vp9_short_inv_walsh4x4_1_c;
  rtcd->idct.iwalsh16     = vp9_short_inv_walsh4x4_c;
  rtcd->idct.idct8        = vp9_short_idct8x8_c;
+  rtcd->idct.idct10_8     = vp9_short_idct10_8x8_c;
  rtcd->idct.idct1_scalar_add_8x8 = vp9_dc_only_idct_add_8x8_c;
  rtcd->idct.ihaar2       = vp9_short_ihaar2x2_c;
  rtcd->idct.idct16x16    = vp9_short_idct16x16_c;
-  rtcd->idct.idct10_16x16    = vp9_short_idct10_16x16_c;
+  rtcd->idct.idct10_16x16 = vp9_short_idct10_16x16_c;

  rtcd->subpix.eighttap16x16       = vp9_eighttap_predict16x16_c;
  rtcd->subpix.eighttap8x8         = vp9_eighttap_predict8x8_c;
--- a/vp9/common/idct.h
+++ b/vp9/common/idct.h
@ -60,6 +60,11 @@ extern prototype_idct(vp9_idct_idct10_16x16);
 #endif
 extern prototype_idct(vp9_idct_idct8);

+#ifndef vp9_idct_idct10_8
+#define vp9_idct_idct10_8 vp9_short_idct10_8x8_c
+#endif
+extern prototype_idct(vp9_idct_idct10_8);
+
 #ifndef vp9_idct_idct8_1
 #define vp9_idct_idct8_1 vp9_short_idct8x8_1_c
 #endif
@ -132,6 +137,7 @@ typedef struct {
  vp9_second_order_fn_t iwalsh16;

  vp9_idct_fn_t            idct8;
+  vp9_idct_fn_t            idct10_8;
  vp9_idct_fn_t            idct8_1;
  vp9_idct_scalar_add_fn_t idct1_scalar_add_8x8;
  vp9_idct_fn_t ihaar2;
--- a/vp9/common/idctllm.c
+++ b/vp9/common/idctllm.c
@ -967,6 +967,127 @@ void vp9_short_idct8x8_c(short *coefs, short *block, int pitch) {
  }
 }

+/* Row IDCT when only first 4 coefficients are non-zero.
+ *
+ * Transforms the eight ints blk[0..7] in place.  blk[4..7] are read only by
+ * the all-zero test below: the general path overwrites the temporaries that
+ * received them (x1, x2, x5, x6) before using them, so it computes the
+ * result as if those four inputs were zero.  W1..W7 are defined outside this
+ * hunk. */
+static void idctrow10(int *blk) {
+  int x0, x1, x2, x3, x4, x5, x6, x7, x8;
+
+  /* shortcut: the OR of the seven inputs other than blk[0] is zero, so every
+   * output is the same value, blk[0] << 3 */
+  if (!((x1 = blk[4] << 11) | (x2 = blk[6]) | (x3 = blk[2]) |
+        (x4 = blk[1]) | (x5 = blk[7]) | (x6 = blk[5]) | (x7 = blk[3]))) {
+    blk[0] = blk[1] = blk[2] = blk[3] = blk[4]
+           = blk[5] = blk[6] = blk[7] = blk[0] << 3;
+    return;
+  }
+
+  x0 = (blk[0] << 11) + 128;    /* for proper rounding in the fourth stage */
+  /* first stage: scale blk[1] (held in x4) and blk[3] (held in x7) */
+  x5 = W7 * x4;
+  x4 = W1 * x4;
+  x6 = W3 * x7;
+  x7 = -W5 * x7;
+
+  /* second stage: scale blk[2] (held in x3), then form sums and differences
+   * of the first-stage products */
+  x2 = W6 * x3;
+  x3 = W2 * x3;
+  x1 = x4 + x6;
+  x4 -= x6;
+  x6 = x5 + x7;
+  x5 -= x7;
+
+  /* third stage: combine the blk[0] term (x0) with the blk[2] products, and
+   * scale (x4 +/- x5) by 181 / 256 with rounding */
+  x7 = x0 + x3;
+  x8 = x0 - x3;
+  x3 = x0 + x2;
+  x0 -= x2;
+  x2 = (181 * (x4 + x5) + 128) >> 8;
+  x4 = (181 * (x4 - x5) + 128) >> 8;
+
+  /* fourth stage: write the eight outputs, each shifted right by 8 */
+  blk[0] = (x7 + x1) >> 8;
+  blk[1] = (x3 + x2) >> 8;
+  blk[2] = (x0 + x4) >> 8;
+  blk[3] = (x8 + x6) >> 8;
+  blk[4] = (x8 - x6) >> 8;
+  blk[5] = (x0 - x4) >> 8;
+  blk[6] = (x3 - x2) >> 8;
+  blk[7] = (x7 - x1) >> 8;
+}
+
+/* Column (vertical) IDCT when only first 4 coefficients are non-zero.
+ *
+ * Transforms one column in place; element k of the column is blk[8 * k], so
+ * blk points at the top of the column inside a buffer with 8 ints per row.
+ * Elements 4..7 are read only by the all-zero test below: the general path
+ * overwrites the temporaries that received them (x1, x2, x5, x6) before
+ * using them, so it computes the result as if those four inputs were zero.
+ * W1..W7 are defined outside this hunk. */
+static void idctcol10(int *blk) {
+  int x0, x1, x2, x3, x4, x5, x6, x7, x8;
+
+  /* shortcut: the OR of the seven inputs other than blk[8 * 0] is zero, so
+   * every output is the same value, (blk[8 * 0] + 32) >> 6 */
+  if (!((x1 = (blk[8 * 4] << 8)) | (x2 = blk[8 * 6]) | (x3 = blk[8 * 2]) |
+        (x4 = blk[8 * 1]) | (x5 = blk[8 * 7]) | (x6 = blk[8 * 5]) |
+        (x7 = blk[8 * 3]))) {
+    blk[8 * 0] = blk[8 * 1] = blk[8 * 2] = blk[8 * 3]
+        = blk[8 * 4] = blk[8 * 5] = blk[8 * 6]
+        = blk[8 * 7] = ((blk[8 * 0] + 32) >> 6);
+    return;
+  }
+
+  x0 = (blk[8 * 0] << 8) + 16384;
+
+  /* first stage: scale elements 1 (held in x4) and 3 (held in x7); each
+   * product is rounded with + 4 and shifted right by 3 */
+  x5 = (W7 * x4 + 4) >> 3;
+  x4 = (W1 * x4 + 4) >> 3;
+  x6 = (W3 * x7 + 4) >> 3;
+  x7 = (-W5 * x7 + 4) >> 3;
+
+  /* second stage: scale element 2 (held in x3) the same way, then form sums
+   * and differences of the first-stage products */
+  x2 = (W6 * x3 + 4) >> 3;
+  x3 = (W2 * x3 + 4) >> 3;
+  x1 = x4 + x6;
+  x4 -= x6;
+  x6 = x5 + x7;
+  x5 -= x7;
+
+  /* third stage: combine the element-0 term (x0) with the element-2
+   * products, and scale (x4 +/- x5) by 181 / 256 with rounding */
+  x7 = x0 + x3;
+  x8 = x0 - x3;
+  x3 = x0 + x2;
+  x0 -= x2;
+  x2 = (181 * (x4 + x5) + 128) >> 8;
+  x4 = (181 * (x4 - x5) + 128) >> 8;
+
+  /* fourth stage: write the eight outputs, each shifted right by 14 */
+  blk[8 * 0] = (x7 + x1) >> 14;
+  blk[8 * 1] = (x3 + x2) >> 14;
+  blk[8 * 2] = (x0 + x4) >> 14;
+  blk[8 * 3] = (x8 + x6) >> 14;
+  blk[8 * 4] = (x8 - x6) >> 14;
+  blk[8 * 5] = (x0 - x4) >> 14;
+  blk[8 * 6] = (x3 - x2) >> 14;
+  blk[8 * 7] = (x7 - x1) >> 14;
+}
+/* 8x8 inverse DCT for blocks whose non-zero coefficients all lie in the
+ * upper-left 4x4 corner.
+ *
+ * coefs: input coefficients, read row-major as TX_DIM rows of TX_DIM shorts.
+ * block: output; row i is written starting at block + i * (pitch >> 1).
+ * pitch: output row pitch; it is halved before being used as an index into
+ *        the short array (presumably a pitch in bytes -- confirm against
+ *        callers).
+ *
+ * NOTE(review): the row and column passes step through X with a literal 8
+ * while the copy loops use TX_DIM, so this relies on TX_DIM == 8 (TX_DIM is
+ * defined outside this hunk -- confirm). */
+void vp9_short_idct10_8x8_c(short *coefs, short *block, int pitch) {
+  int X[TX_DIM * TX_DIM];
+  int i, j;
+  int shortpitch = pitch >> 1;
+
+  for (i = 0; i < TX_DIM; i++) {  /* widen to int and pre-scale each coef */
+    for (j = 0; j < TX_DIM; j++) {
+      X[i * TX_DIM + j] = (int)(coefs[i * TX_DIM + j] + 1
+                                + (coefs[i * TX_DIM + j] < 0)) >> 2;
+    }
+  }
+
+  /* Do first 4 row idct only since non-zero dct coefficients are all in
+   *  upper-left 4x4 area. Rows 4..7 of X are left as loaded above. */
+  for (i = 0; i < 4; i++)
+    idctrow10(X + 8 * i);
+
+  for (i = 0; i < 8; i++)  /* column pass over all 8 columns */
+    idctcol10(X + i);
+
+  for (i = 0; i < TX_DIM; i++) {  /* store each result shifted right by 1 */
+    for (j = 0; j < TX_DIM; j++) {
+      block[i * shortpitch + j]  = X[i * TX_DIM + j] >> 1;
+    }
+  }
+}

 void vp9_short_ihaar2x2_c(short *input, short *output, int pitch) {
  int i;
--- a/vp9/common/rtcd_defs.sh
+++ b/vp9/common/rtcd_defs.sh
@ -57,12 +57,9 @@ specialize vp9_dequant_idct_add_uv_block_8x8
 prototype void vp9_dequant_idct_add_16x16 "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, unsigned short eobs"
 specialize vp9_dequant_idct_add_16x16

-prototype void vp9_dequant_idct_add_8x8 "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride"
+prototype void vp9_dequant_idct_add_8x8 "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, int dc, unsigned short eobs"
 specialize vp9_dequant_idct_add_8x8

-prototype void vp9_dequant_dc_idct_add_8x8 "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, int Dc"
-specialize vp9_dequant_dc_idct_add_8x8
-
 prototype void vp9_dequant_idct_add "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride"
 specialize vp9_dequant_idct_add

--- a/vp9/decoder/decodframe.c
+++ b/vp9/decoder/decodframe.c
@ -442,7 +442,8 @@ static void decode_macroblock(VP9D_COMP *pbi, MACROBLOCKD *xd,
          vp9_ht_dequant_idct_add_8x8_c(tx_type,
                                        q, dq, pre, dst, 16, stride);
        } else {
-          vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride);
+          vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride, 0,
+                                     xd->eobs[idx]);
        }
        q += 64;
      } else {
--- a/vp9/decoder/dequantize.c
+++ b/vp9/decoder/dequantize.c
@ -19,8 +19,8 @@
 extern int dec_debug;
 #endif

-static void recon(int16_t *diff, uint8_t *pred, int pitch, uint8_t *dest,
-                  int stride, int width, int height) {
+static void add_residual(const int16_t *diff, const uint8_t *pred, int pitch,
+                         uint8_t *dest, int stride, int width, int height) {
  int r, c;

  for (r = 0; r < height; r++) {
@ -41,12 +41,34 @@ static void recon(int16_t *diff, uint8_t *pred, int pitch, uint8_t *dest,
  }
 }

+static void add_constant_residual(const int16_t diff, const uint8_t *pred,
+                                  int pitch, uint8_t *dest, int stride,
+                                  int width, int height) {
+  int r, c;
+  /* Add the single value diff to each pixel of the width x height block at
+   * pred, clamp the sum to [0, 255] and store it in dest.  After each row,
+   * pred advances by pitch and dest by stride. */
+  for (r = 0; r < height; r++) {
+    for (c = 0; c < width; c++) {
+      int a = diff + pred[c];
+
+      if (a < 0)
+        a = 0;
+      else if (a > 255)
+        a = 255;
+
+      dest[c] = (uint8_t) a;
+    }
+
+    dest += stride;
+    pred += pitch;
+  }
+}
+
 void vp9_dequantize_b_c(BLOCKD *d) {

  int i;
-  short *DQ  = d->dqcoeff;
-  short *Q   = d->qcoeff;
-  short *DQC = d->dequant;
+  int16_t *DQ  = d->dqcoeff;
+  int16_t *Q   = d->qcoeff;
+  int16_t *DQC = d->dequant;

  for (i = 0; i < 16; i++) {
    DQ[i] = Q[i] * DQC[i];
@ -54,11 +76,11 @@ void vp9_dequantize_b_c(BLOCKD *d) {
 }


-void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, short *input, short *dq,
-                               unsigned char *pred, unsigned char *dest,
+void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, int16_t *input, int16_t *dq,
+                               uint8_t *pred, uint8_t *dest,
                               int pitch, int stride) {
-  short output[16];
-  short *diff_ptr = output;
+  int16_t output[16];
+  int16_t *diff_ptr = output;
  int i;

  for (i = 0; i < 16; i++) {
@ -69,18 +91,15 @@ void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, short *input, short *dq,

  vpx_memset(input, 0, 32);

-  recon(diff_ptr, pred, pitch, dest, stride, 4, 4);
+  add_residual(diff_ptr, pred, pitch, dest, stride, 4, 4);
 }

-void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, short *input, short *dq,
-                                   unsigned char *pred, unsigned char *dest,
+void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, int16_t *input, int16_t *dq,
+                                   uint8_t *pred, uint8_t *dest,
                                   int pitch, int stride) {
-  short output[64];
-  short *diff_ptr = output;
-  int b, r, c;
+  int16_t output[64];
+  int16_t *diff_ptr = output;
  int i;
-  unsigned char *origdest = dest;
-  unsigned char *origpred = pred;

  input[0] = dq[0] * input[0];
  for (i = 1; i < 64; i++) {
@ -91,35 +110,13 @@ void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, short *input, short *dq,

  vpx_memset(input, 0, 128);

-  for (b = 0; b < 4; b++) {
-    for (r = 0; r < 4; r++) {
-      for (c = 0; c < 4; c++) {
-        int a = diff_ptr[c] + pred[c];
-
-        if (a < 0)
-          a = 0;
-
-        if (a > 255)
-          a = 255;
-
-        dest[c] = (unsigned char) a;
-      }
-
-      dest += stride;
-      diff_ptr += 8;
-      pred += pitch;
-    }
-    // shift buffer pointers to next 4x4 block in the submacroblock
-    diff_ptr = output + (b + 1) / 2 * 4 * 8 + ((b + 1) % 2) * 4;
-    dest = origdest + (b + 1) / 2 * 4 * stride + ((b + 1) % 2) * 4;
-    pred = origpred + (b + 1) / 2 * 4 * pitch + ((b + 1) % 2) * 4;
-  }
+  add_residual(diff_ptr, pred, pitch, dest, stride, 8, 8);
 }

-void vp9_dequant_idct_add_c(short *input, short *dq, unsigned char *pred,
-                            unsigned char *dest, int pitch, int stride) {
-  short output[16];
-  short *diff_ptr = output;
+void vp9_dequant_idct_add_c(int16_t *input, int16_t *dq, uint8_t *pred,
+                            uint8_t *dest, int pitch, int stride) {
+  int16_t output[16];
+  int16_t *diff_ptr = output;
  int i;

  for (i = 0; i < 16; i++) {
@ -131,17 +128,17 @@ void vp9_dequant_idct_add_c(short *input, short *dq, unsigned char *pred,

  vpx_memset(input, 0, 32);

-  recon(diff_ptr, pred, pitch, dest, stride, 4, 4);
+  add_residual(diff_ptr, pred, pitch, dest, stride, 4, 4);
 }

-void vp9_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred,
-                               unsigned char *dest, int pitch, int stride,
+void vp9_dequant_dc_idct_add_c(int16_t *input, int16_t *dq, uint8_t *pred,
+                               uint8_t *dest, int pitch, int stride,
                               int Dc) {
  int i;
-  short output[16];
-  short *diff_ptr = output;
+  int16_t output[16];
+  int16_t *diff_ptr = output;

-  input[0] = (short)Dc;
+  input[0] = (int16_t)Dc;

  for (i = 1; i < 16; i++) {
    input[i] = dq[i] * input[i];
@ -152,15 +149,15 @@ void vp9_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred,

  vpx_memset(input, 0, 32);

-  recon(diff_ptr, pred, pitch, dest, stride, 4, 4);
+  add_residual(diff_ptr, pred, pitch, dest, stride, 4, 4);
 }

 #if CONFIG_LOSSLESS
-void vp9_dequant_idct_add_lossless_c(short *input, short *dq,
-                                     unsigned char *pred, unsigned char *dest,
+void vp9_dequant_idct_add_lossless_c(int16_t *input, int16_t *dq,
+                                     uint8_t *pred, uint8_t *dest,
                                     int pitch, int stride) {
-  short output[16];
-  short *diff_ptr = output;
+  int16_t output[16];
+  int16_t *diff_ptr = output;
  int i;

  for (i = 0; i < 16; i++) {
@ -171,18 +168,18 @@ void vp9_dequant_idct_add_lossless_c(short *input, short *dq,

  vpx_memset(input, 0, 32);

-  recon(diff_ptr, pred, pitch, dest, stride, 4, 4);
+  add_residual(diff_ptr, pred, pitch, dest, stride, 4, 4);
 }

-void vp9_dequant_dc_idct_add_lossless_c(short *input, short *dq,
-                                        unsigned char *pred,
-                                        unsigned char *dest,
+void vp9_dequant_dc_idct_add_lossless_c(int16_t *input, int16_t *dq,
+                                        uint8_t *pred,
+                                        uint8_t *dest,
                                        int pitch, int stride, int dc) {
  int i;
-  short output[16];
-  short *diff_ptr = output;
+  int16_t output[16];
+  int16_t *diff_ptr = output;

-  input[0] = (short)dc;
+  input[0] = (int16_t)dc;

  for (i = 1; i < 16; i++) {
    input[i] = dq[i] * input[i];
@ -191,18 +188,18 @@ void vp9_dequant_dc_idct_add_lossless_c(short *input, short *dq,
  vp9_short_inv_walsh4x4_x8_c(input, output, 4 << 1);
  vpx_memset(input, 0, 32);

-  recon(diff_ptr, pred, pitch, dest, stride, 4, 4);
+  add_residual(diff_ptr, pred, pitch, dest, stride, 4, 4);
 }
 #endif

 void vp9_dequantize_b_2x2_c(BLOCKD *d) {
  int i;
-  short *DQ  = d->dqcoeff;
-  short *Q   = d->qcoeff;
-  short *DQC = d->dequant;
+  int16_t *DQ  = d->dqcoeff;
+  int16_t *Q   = d->qcoeff;
+  int16_t *DQC = d->dequant;

  for (i = 0; i < 16; i++) {
-    DQ[i] = (short)((Q[i] * DQC[i]));
+    DQ[i] = (int16_t)((Q[i] * DQC[i]));
  }
 #ifdef DEC_DEBUG
  if (dec_debug) {
@ -216,14 +213,12 @@ void vp9_dequantize_b_2x2_c(BLOCKD *d) {
 #endif
 }

-void vp9_dequant_idct_add_8x8_c(short *input, short *dq, unsigned char *pred,
-                                unsigned char *dest, int pitch, int stride) {
-  short output[64];
-  short *diff_ptr = output;
-  int r, c, b;
+void vp9_dequant_idct_add_8x8_c(int16_t *input, int16_t *dq, uint8_t *pred,
+                                uint8_t *dest, int pitch, int stride,
+                                int dc, uint16_t eobs) {
+  int16_t output[64];
+  int16_t *diff_ptr = output;
  int i;
-  unsigned char *origdest = dest;
-  unsigned char *origpred = pred;

 #ifdef DEC_DEBUG
  if (dec_debug) {
@ -236,12 +231,57 @@ void vp9_dequant_idct_add_8x8_c(short *input, short *dq, unsigned char *pred,
  }
 #endif

-  input[0] = input[0] * dq[0];
+  /* If dc is 1, then input[0] is the reconstructed value, do not need
+   * dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1.
+   */
+  if (!dc)
+    input[0] *= dq[0];

-  // recover quantizer for 4 4x4 blocks
-  for (i = 1; i < 64; i++) {
-    input[i] = input[i] * dq[1];
-  }
+  /* The calculation can be simplified if there are not many non-zero dct
+   * coefficients. Use eobs to decide what to do.
+   * TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c.
+   * Combine that with code here.
+   */
+  if (eobs == 0) {
+    /* All 0 DCT coefficient */
+    vp9_copy_mem8x8(pred, pitch, dest, stride);
+  } else if (eobs == 1) {
+    /* DC only DCT coefficient. */
+    int16_t out;
+
+    /* Note: the idct1 will need to be modified accordingly whenever
+     * vp9_short_idct8x8_c() is modified. */
+    out = (input[0] + 1 + (input[0] < 0)) >> 2;
+    out = out << 3;
+    out = (out + 32) >> 7;
+
+    input[0] = 0;
+
+    add_constant_residual(out, pred, pitch, dest, stride, 8, 8);
+  } else if (eobs <= 10) {
+    input[1] = input[1] * dq[1];
+    input[2] = input[2] * dq[1];
+    input[3] = input[3] * dq[1];
+    input[8] = input[8] * dq[1];
+    input[9] = input[9] * dq[1];
+    input[10] = input[10] * dq[1];
+    input[16] = input[16] * dq[1];
+    input[17] = input[17] * dq[1];
+    input[24] = input[24] * dq[1];
+
+    vp9_short_idct10_8x8_c(input, output, 16);
+
+    input[0] = input[1] = input[2] = input[3] = 0;
+    input[8] = input[9] = input[10] = 0;
+    input[16] = input[17] = 0;
+    input[24] = 0;
+
+    add_residual(diff_ptr, pred, pitch, dest, stride, 8, 8);
+  } else {
+    // recover quantizer for 4 4x4 blocks
+    for (i = 1; i < 64; i++) {
+      input[i] = input[i] * dq[1];
+    }
 #ifdef DEC_DEBUG
  if (dec_debug) {
    int j;
@ -253,8 +293,8 @@ void vp9_dequant_idct_add_8x8_c(short *input, short *dq, unsigned char *pred,
  }
 #endif

-  // the idct halves ( >> 1) the pitch
-  vp9_short_idct8x8_c(input, output, 16);
+    // the idct halves ( >> 1) the pitch
+    vp9_short_idct8x8_c(input, output, 16);
 #ifdef DEC_DEBUG
  if (dec_debug) {
    int j;
@ -266,30 +306,10 @@ void vp9_dequant_idct_add_8x8_c(short *input, short *dq, unsigned char *pred,
  }
 #endif

-  vpx_memset(input, 0, 128);// test what should i put here
+    vpx_memset(input, 0, 128);

-  for (b = 0; b < 4; b++) {
-    for (r = 0; r < 4; r++) {
-      for (c = 0; c < 4; c++) {
-        int a = diff_ptr[c] + pred[c];
+    add_residual(diff_ptr, pred, pitch, dest, stride, 8, 8);

-        if (a < 0)
-          a = 0;
-
-        if (a > 255)
-          a = 255;
-
-        dest[c] = (unsigned char) a;
-      }
-
-      dest += stride;
-      diff_ptr += 8;
-      pred += pitch;
-    }
-    diff_ptr = output + (b + 1) / 2 * 4 * 8 + (b + 1) % 2 * 4;
-    dest = origdest + (b + 1) / 2 * 4 * stride + (b + 1) % 2 * 4;
-    pred = origpred + (b + 1) / 2 * 4 * pitch + (b + 1) % 2 * 4;
-  }
 #ifdef DEC_DEBUG
  if (dec_debug) {
    int k, j;
@ -303,101 +323,14 @@ void vp9_dequant_idct_add_8x8_c(short *input, short *dq, unsigned char *pred,
    }
  }
 #endif
+  }
 }

-void vp9_dequant_dc_idct_add_8x8_c(short *input, short *dq, unsigned char *pred,
-                                   unsigned char *dest, int pitch, int stride,
-                                   int Dc) { // Dc for 1st order T in some rear case
-  short output[64];
-  short *diff_ptr = output;
-  int r, c, b;
-  int i;
-  unsigned char *origdest = dest;
-  unsigned char *origpred = pred;
-
-  input[0] = (short)Dc;// Dc is the reconstructed value, do not need dequantization
-  // dc value is recovered after dequantization, since dc need not quantization
-#ifdef DEC_DEBUG
-  if (dec_debug) {
-    int j;
-    printf("Input 8x8\n");
-    for (j = 0; j < 64; j++) {
-      printf("%d ", input[j]);
-      if (j % 8 == 7) printf("\n");
-    }
-  }
-#endif
-  for (i = 1; i < 64; i++) {
-    input[i] = input[i] * dq[1];
-  }
-
-#ifdef DEC_DEBUG
-  if (dec_debug) {
-    int j;
-    printf("Input DQ 8x8\n");
-    for (j = 0; j < 64; j++) {
-      printf("%d ", input[j]);
-      if (j % 8 == 7) printf("\n");
-    }
-  }
-#endif
-
-  // the idct halves ( >> 1) the pitch
-  vp9_short_idct8x8_c(input, output, 16);
-#ifdef DEC_DEBUG
-  if (dec_debug) {
-    int j;
-    printf("Output 8x8\n");
-    for (j = 0; j < 64; j++) {
-      printf("%d ", output[j]);
-      if (j % 8 == 7) printf("\n");
-    }
-  }
-#endif
-  vpx_memset(input, 0, 128);
-
-  for (b = 0; b < 4; b++) {
-    for (r = 0; r < 4; r++) {
-      for (c = 0; c < 4; c++) {
-        int a = diff_ptr[c] + pred[c];
-
-        if (a < 0)
-          a = 0;
-
-        if (a > 255)
-          a = 255;
-
-        dest[c] = (unsigned char) a;
-      }
-
-      dest += stride;
-      diff_ptr += 8;
-      pred += pitch;
-    }
-    diff_ptr = output + (b + 1) / 2 * 4 * 8 + (b + 1) % 2 * 4;
-    dest = origdest + (b + 1) / 2 * 4 * stride + (b + 1) % 2 * 4;
-    pred = origpred + (b + 1) / 2 * 4 * pitch + (b + 1) % 2 * 4;
-  }
-#ifdef DEC_DEBUG
-  if (dec_debug) {
-    int k, j;
-    printf("Final 8x8\n");
-    for (j = 0; j < 8; j++) {
-      for (k = 0; k < 8; k++) {
-        printf("%d ", origdest[k]);
-      }
-      printf("\n");
-      origdest += stride;
-    }
-  }
-#endif
-}
-
-void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, short *input, short *dq,
-                                     unsigned char *pred, unsigned char *dest,
+void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, int16_t *input,
+                                     int16_t *dq, uint8_t *pred, uint8_t *dest,
                                     int pitch, int stride) {
-  short output[256];
-  short *diff_ptr = output;
+  int16_t output[256];
+  int16_t *diff_ptr = output;
  int i;

  input[0]= input[0] * dq[0];
@ -414,7 +347,7 @@ void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, short *input, short *dq,

  vpx_memset(input, 0, 512);

-  recon(diff_ptr, pred, pitch, dest, stride, 16, 16);
+  add_residual(diff_ptr, pred, pitch, dest, stride, 16, 16);
 }

 void vp9_dequant_idct_add_16x16_c(int16_t *input, int16_t *dq, uint8_t *pred,
@ -422,7 +355,7 @@ void vp9_dequant_idct_add_16x16_c(int16_t *input, int16_t *dq, uint8_t *pred,
                                  uint16_t eobs) {
  int16_t output[256];
  int16_t *diff_ptr = output;
-  int r, c, i;
+  int i;

  /* The calculation can be simplified if there are not many non-zero dct
   * coefficients. Use eobs to separate different cases. */
@ -433,28 +366,15 @@ void vp9_dequant_idct_add_16x16_c(int16_t *input, int16_t *dq, uint8_t *pred,
    /* DC only DCT coefficient. */
    int16_t out;

+    /* Note: the idct1 will need to be modified accordingly whenever
+     * vp9_short_idct16x16_c() is modified. */
    out = (input[0] * dq[0] + 2) >> 2;
    out = (out + 2) >> 2;
    out = (out + 4) >> 3;

    input[0] = 0;

-    for (r = 0; r < 16; r++) {
-      for (c = 0; c < 16; c++) {
-        int a = out + pred[c];
-
-        if (a < 0)
-          a = 0;
-        else if (a > 255)
-          a = 255;
-
-        dest[c] = (uint8_t) a;
-      }
-
-      dest += stride;
-      pred += pitch;
-    }
-
+    add_constant_residual(out, pred, pitch, dest, stride, 16, 16);
  } else if (eobs <= 10) {
    input[0]= input[0] * dq[0];
    input[1] = input[1] * dq[1];
@ -475,7 +395,7 @@ void vp9_dequant_idct_add_16x16_c(int16_t *input, int16_t *dq, uint8_t *pred,
    input[32] = input[33] = 0;
    input[48] = 0;

-    recon(diff_ptr, pred, pitch, dest, stride, 16, 16);
+    add_residual(diff_ptr, pred, pitch, dest, stride, 16, 16);
  } else {
    input[0]= input[0] * dq[0];

@ -488,6 +408,6 @@ void vp9_dequant_idct_add_16x16_c(int16_t *input, int16_t *dq, uint8_t *pred,

    vpx_memset(input, 0, 512);

-    recon(diff_ptr, pred, pitch, dest, stride, 16, 16);
+    add_residual(diff_ptr, pred, pitch, dest, stride, 16, 16);
  }
 }
--- a/vp9/decoder/idct_blk.c
+++ b/vp9/decoder/idct_blk.c
@ -177,12 +177,21 @@ void vp9_dequant_dc_idct_add_y_block_8x8_c(short *q, short *dq,
                                           int stride, unsigned short *eobs,
                                           short *dc,
                                           MACROBLOCKD *xd) {
-  vp9_dequant_dc_idct_add_8x8_c(q, dq, pre, dst, 16, stride, dc[0]);
-  vp9_dequant_dc_idct_add_8x8_c(&q[64], dq, pre + 8, dst + 8, 16, stride, dc[1]);
-  vp9_dequant_dc_idct_add_8x8_c(&q[128], dq, pre + 8 * 16,
-                                dst + 8 * stride, 16, stride, dc[4]);
-  vp9_dequant_dc_idct_add_8x8_c(&q[192], dq, pre + 8 * 16 + 8,
-                                dst + 8 * stride + 8, 16, stride, dc[8]);
+  q[0] = dc[0];
+  vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride, 1, xd->eobs[0]);
+
+  q[64] = dc[1];
+  vp9_dequant_idct_add_8x8_c(&q[64], dq, pre + 8, dst + 8, 16, stride, 1,
+                             xd->eobs[4]);
+
+  q[128] = dc[4];
+  vp9_dequant_idct_add_8x8_c(&q[128], dq, pre + 8 * 16,
+                                dst + 8 * stride, 16, stride, 1, xd->eobs[8]);
+
+  q[192] = dc[8];
+  vp9_dequant_idct_add_8x8_c(&q[192], dq, pre + 8 * 16 + 8,
+                                dst + 8 * stride + 8, 16, stride, 1,
+                                xd->eobs[12]);
 }

 #if CONFIG_SUPERBLOCKS
@ -191,13 +200,22 @@ void vp9_dequant_dc_idct_add_y_block_8x8_inplace_c(short *q, short *dq,
                                                   int stride,
                                                   unsigned short *eobs,
                                                   short *dc, MACROBLOCKD *xd) {
-  vp9_dequant_dc_idct_add_8x8_c(q, dq, dst, dst, stride, stride, dc[0]);
-  vp9_dequant_dc_idct_add_8x8_c(&q[64], dq, dst + 8,
-                                dst + 8, stride, stride, dc[1]);
-  vp9_dequant_dc_idct_add_8x8_c(&q[128], dq, dst + 8 * stride,
-                                dst + 8 * stride, stride, stride, dc[4]);
-  vp9_dequant_dc_idct_add_8x8_c(&q[192], dq, dst + 8 * stride + 8,
-                                dst + 8 * stride + 8, stride, stride, dc[8]);
+  q[0] = dc[0];
+  vp9_dequant_idct_add_8x8_c(q, dq, dst, dst, stride, stride, 1, xd->eobs[0]);
+
+  q[64] = dc[1];
+  vp9_dequant_idct_add_8x8_c(&q[64], dq, dst + 8,
+                                dst + 8, stride, stride, 1, xd->eobs[4]);
+
+  q[128] = dc[4];
+  vp9_dequant_idct_add_8x8_c(&q[128], dq, dst + 8 * stride,
+                                dst + 8 * stride, stride, stride, 1,
+                                xd->eobs[8]);
+
+  q[192] = dc[8];
+  vp9_dequant_idct_add_8x8_c(&q[192], dq, dst + 8 * stride + 8,
+                                dst + 8 * stride + 8, stride, stride, 1,
+                                xd->eobs[12]);
 }
 #endif

@ -209,13 +227,14 @@ void vp9_dequant_idct_add_y_block_8x8_c(short *q, short *dq,
  unsigned char *origdest = dst;
  unsigned char *origpred = pre;

-  vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride);
+  vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride, 0, xd->eobs[0]);
  vp9_dequant_idct_add_8x8_c(&q[64], dq, origpred + 8,
-                             origdest + 8, 16, stride);
+                             origdest + 8, 16, stride, 0, xd->eobs[4]);
  vp9_dequant_idct_add_8x8_c(&q[128], dq, origpred + 8 * 16,
-                             origdest + 8 * stride, 16, stride);
+                             origdest + 8 * stride, 16, stride, 0, xd->eobs[8]);
  vp9_dequant_idct_add_8x8_c(&q[192], dq, origpred + 8 * 16 + 8,
-                             origdest + 8 * stride + 8, 16, stride);
+                             origdest + 8 * stride + 8, 16, stride, 0,
+                             xd->eobs[12]);
 }

 void vp9_dequant_idct_add_uv_block_8x8_c(short *q, short *dq,
@ -224,12 +243,12 @@ void vp9_dequant_idct_add_uv_block_8x8_c(short *q, short *dq,
                                         unsigned char *dstv,
                                         int stride, unsigned short *eobs,
                                         MACROBLOCKD *xd) {
-  vp9_dequant_idct_add_8x8_c(q, dq, pre, dstu, 8, stride);
+  vp9_dequant_idct_add_8x8_c(q, dq, pre, dstu, 8, stride, 0, xd->eobs[16]);

  q    += 64;
  pre  += 64;

-  vp9_dequant_idct_add_8x8_c(q, dq, pre, dstv, 8, stride);
+  vp9_dequant_idct_add_8x8_c(q, dq, pre, dstv, 8, stride, 0, xd->eobs[20]);
 }

 #if CONFIG_SUPERBLOCKS
@ -239,11 +258,12 @@ void vp9_dequant_idct_add_uv_block_8x8_inplace_c(short *q, short *dq,
                                                 int stride,
                                                 unsigned short *eobs,
                                                 MACROBLOCKD *xd) {
-  vp9_dequant_idct_add_8x8_c(q, dq, dstu, dstu, stride, stride);
+  vp9_dequant_idct_add_8x8_c(q, dq, dstu, dstu, stride, stride, 0,
+                             xd->eobs[16]);

-  q    += 64;
-
-  vp9_dequant_idct_add_8x8_c(q, dq, dstv, dstv, stride, stride);
+  q += 64;
+  vp9_dequant_idct_add_8x8_c(q, dq, dstv, dstv, stride, stride, 0,
+                             xd->eobs[20]);
 }
 #endif