Remove usage of predict buffer for decode

Instead of using the predict buffer, the decoder now writes the predictor into the recon buffer. For blocks with eob=0, unnecessary idcts can be eliminated. This gave a performance boost of ~1.8% for the HD clips used. Tero: Added needed changes to ARM side and scheduled some assembly code to prevent interlocks. Patch Set 6: Merged (I1bcdca7a95aacc3a181b9faa6b10e3a71ee24df3) into this commit because of similarities in the idct functions. Patch Set 7: EC bug fix. Change-Id: Ie31d90b5d3522e1108163f2ac491e455e3f955e6
2011-10-18 12:06:50 -04:00
parent 6505adf271
commit ed9c66f584
56 changed files with 1496 additions and 2455 deletions
--- a/vp8/decoder/threading.c
+++ b/vp8/decoder/threading.c
@@ -138,11 +138,11 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m
    /* do prediction */
    if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
    {
-        vp8mt_build_intra_predictors_mbuv(pbi, xd, mb_row, mb_col);
+        vp8mt_build_intra_predictors_mbuv_s(pbi, xd, mb_row, mb_col);

        if (xd->mode_info_context->mbmi.mode != B_PRED)
        {
-            vp8mt_build_intra_predictors_mby(pbi, xd, mb_row, mb_col);
+            vp8mt_build_intra_predictors_mby_s(pbi, xd, mb_row, mb_col);
        } else {
            vp8mt_intra_prediction_down_copy(pbi, xd, mb_row, mb_col);
        }
@@ -201,7 +201,7 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m

        DEQUANT_INVOKE (&pbi->dequant, dc_idct_add_y_block)
                        (xd->qcoeff, xd->block[0].dequant,
-                         xd->predictor, xd->dst.y_buffer,
+                         xd->dst.y_buffer,
                         xd->dst.y_stride, xd->eobs, xd->block[24].diff);
    }
    else if (xd->mode_info_context->mbmi.mode == B_PRED)
@@ -211,19 +211,21 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m
            BLOCKD *b = &xd->block[i];
            int b_mode = xd->mode_info_context->bmi[i].as_mode;

-            vp8mt_predict_intra4x4(pbi, xd, b_mode, b->predictor, mb_row, mb_col, i);
+            vp8mt_predict_intra4x4(pbi, xd, b_mode, *(b->base_dst) + b->dst,
+                                   b->dst_stride, mb_row, mb_col, i);

            if (xd->eobs[i] > 1)
            {
                DEQUANT_INVOKE(&pbi->dequant, idct_add)
-                    (b->qcoeff, b->dequant,  b->predictor,
-                    *(b->base_dst) + b->dst, 16, b->dst_stride);
+                    (b->qcoeff, b->dequant,
+                    *(b->base_dst) + b->dst, b->dst_stride);
            }
            else
            {
                IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)
-                    (b->qcoeff[0] * b->dequant[0], b->predictor,
-                    *(b->base_dst) + b->dst, 16, b->dst_stride);
+                    (b->qcoeff[0] * b->dequant[0],
+                    *(b->base_dst) + b->dst, b->dst_stride,
+                    *(b->base_dst) + b->dst, b->dst_stride);
                ((int *)b->qcoeff)[0] = 0;
            }
        }
@@ -232,13 +234,13 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m
    {
        DEQUANT_INVOKE (&pbi->dequant, idct_add_y_block)
                        (xd->qcoeff, xd->block[0].dequant,
-                         xd->predictor, xd->dst.y_buffer,
+                         xd->dst.y_buffer,
                         xd->dst.y_stride, xd->eobs);
    }

    DEQUANT_INVOKE (&pbi->dequant, idct_add_uv_block)
                    (xd->qcoeff+16*16, xd->block[16].dequant,
-                     xd->predictor+16*16, xd->dst.u_buffer, xd->dst.v_buffer,
+                     xd->dst.u_buffer, xd->dst.v_buffer,
                     xd->dst.uv_stride, xd->eobs+16);
 }