Moved dequant idct into common

These functions are now used by the encoder. This is WIP with the goal of creating a common idct/add for the encoder and decoder. A boost of 1.8% was seen for the HD rt test clip used. [Tero] Added needed changes to ARM side. Change-Id: Ibbb8000be09034203d7adffc457d3c3f8b06a5bf
2011-12-15 14:23:36 -05:00
parent c8df1656bd
commit a53d5a4c44
39 changed files with 192 additions and 272 deletions
--- a/vp8/common/arm/arm_systemdependent.c
+++ b/vp8/common/arm/arm_systemdependent.c
@@ -63,6 +63,12 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx)
        rtcd->recon.copy8x8     = vp8_copy_mem8x8_v6;
        rtcd->recon.copy8x4     = vp8_copy_mem8x4_v6;
        rtcd->recon.intra4x4_predict = vp8_intra4x4_predict_armv6;
+
+        rtcd->dequant.block               = vp8_dequantize_b_v6;
+        rtcd->dequant.idct_add            = vp8_dequant_idct_add_v6;
+        rtcd->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_v6;
+        rtcd->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_v6;
+
    }
 #endif

@@ -97,6 +103,12 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx)
            vp8_build_intra_predictors_mby_neon;
        rtcd->recon.build_intra_predictors_mby_s =
            vp8_build_intra_predictors_mby_s_neon;
+
+        rtcd->dequant.block               = vp8_dequantize_b_neon;
+        rtcd->dequant.idct_add            = vp8_dequant_idct_add_neon;
+        rtcd->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_neon;
+        rtcd->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_neon;
+
    }
 #endif

--- a/vp8/decoder/arm/armv6/dequant_idct_v6.asm
+++ b/vp8/decoder/arm/armv6/dequant_idct_v6.asm
--- a/vp8/decoder/arm/armv6/dequantize_v6.asm
+++ b/vp8/decoder/arm/armv6/dequantize_v6.asm
--- a/vp8/decoder/arm/armv6/idct_blk_v6.c
+++ b/vp8/decoder/arm/armv6/idct_blk_v6.c
@@ -10,7 +10,7 @@

 #include "vpx_config.h"
 #include "vp8/common/idct.h"
-#include "vp8/decoder/dequantize.h"
+#include "vp8/common/dequantize.h"


 void vp8_dequant_idct_add_y_block_v6(short *q, short *dq,
--- a/vp8/decoder/arm/dequantize_arm.c
+++ b/vp8/decoder/arm/dequantize_arm.c
@@ -10,9 +10,8 @@


 #include "vpx_config.h"
-#include "vp8/decoder/dequantize.h"
+#include "vp8/common/dequantize.h"
 #include "vp8/common/idct.h"
-#include "vpx_mem/vpx_mem.h"

 #if HAVE_ARMV7
 extern void vp8_dequantize_b_loop_neon(short *Q, short *DQC, short *DQ);
--- a/vp8/decoder/arm/dequantize_arm.h
+++ b/vp8/decoder/arm/dequantize_arm.h
@@ -22,13 +22,13 @@ extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_v6);
 #undef  vp8_dequant_block
 #define vp8_dequant_block vp8_dequantize_b_v6

-#undef vp8_dequant_idct_add
+#undef  vp8_dequant_idct_add
 #define vp8_dequant_idct_add vp8_dequant_idct_add_v6

-#undef vp8_dequant_idct_add_y_block
+#undef  vp8_dequant_idct_add_y_block
 #define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_v6

-#undef vp8_dequant_idct_add_uv_block
+#undef  vp8_dequant_idct_add_uv_block
 #define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_v6
 #endif
 #endif
@@ -44,13 +44,13 @@ extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_neon);
 #undef  vp8_dequant_block
 #define vp8_dequant_block vp8_dequantize_b_neon

-#undef vp8_dequant_idct_add
+#undef  vp8_dequant_idct_add
 #define vp8_dequant_idct_add vp8_dequant_idct_add_neon

-#undef vp8_dequant_idct_add_y_block
+#undef  vp8_dequant_idct_add_y_block
 #define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_neon

-#undef vp8_dequant_idct_add_uv_block
+#undef  vp8_dequant_idct_add_uv_block
 #define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_neon
 #endif

--- a/vp8/decoder/arm/neon/dequant_idct_neon.asm
+++ b/vp8/decoder/arm/neon/dequant_idct_neon.asm
--- a/vp8/decoder/arm/neon/dequantizeb_neon.asm
+++ b/vp8/decoder/arm/neon/dequantizeb_neon.asm
--- a/vp8/decoder/arm/neon/idct_blk_neon.c
+++ b/vp8/decoder/arm/neon/idct_blk_neon.c
@@ -10,7 +10,7 @@

 #include "vpx_config.h"
 #include "vp8/common/idct.h"
-#include "vp8/decoder/dequantize.h"
+#include "vp8/common/dequantize.h"

 /* place these declarations here because we don't want to maintain them
 * outside of this scope
--- a/vp8/decoder/arm/neon/idct_dequant_0_2x_neon.asm
+++ b/vp8/decoder/arm/neon/idct_dequant_0_2x_neon.asm
--- a/vp8/decoder/arm/neon/idct_dequant_full_2x_neon.asm
+++ b/vp8/decoder/arm/neon/idct_dequant_full_2x_neon.asm
--- a/vp8/decoder/dequantize.c
+++ b/vp8/decoder/dequantize.c
--- a/vp8/decoder/dequantize.h
+++ b/vp8/decoder/dequantize.h
--- a/vp8/common/generic/systemdependent.c
+++ b/vp8/common/generic/systemdependent.c
@@ -70,6 +70,14 @@ void vp8_machine_specific_config(VP8_COMMON *ctx)
 #if CONFIG_RUNTIME_CPU_DETECT
    VP8_COMMON_RTCD *rtcd = &ctx->rtcd;

+
+    rtcd->dequant.block             = vp8_dequantize_b_c;
+    rtcd->dequant.idct_add          = vp8_dequant_idct_add_c;
+    rtcd->dequant.idct_add_y_block  = vp8_dequant_idct_add_y_block_c;
+    rtcd->dequant.idct_add_uv_block =
+        vp8_dequant_idct_add_uv_block_c;
+
+
    rtcd->idct.idct16       = vp8_short_idct4x4llm_c;
    rtcd->idct.idct1_scalar_add = vp8_dc_only_idct_add_c;
    rtcd->idct.iwalsh1      = vp8_short_inv_walsh4x4_1_c;
--- a/vp8/decoder/idct_blk.c
+++ b/vp8/decoder/idct_blk.c
--- a/vp8/common/invtrans.c
+++ b/vp8/common/invtrans.c
@@ -1,56 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "invtrans.h"
-
-
-void vp8_inverse_transform_b(const vp8_idct_rtcd_vtable_t *rtcd, BLOCKD *b,
-                             int pitch)
-{
-    if (*b->eob > 1)
-    {
-        IDCT_INVOKE(rtcd, idct16)(b->dqcoeff, b->predictor, pitch,
-              *(b->base_dst) + b->dst, b->dst_stride);
-    }
-    else
-    {
-        IDCT_INVOKE(rtcd, idct1_scalar_add)(b->dqcoeff[0], b->predictor, pitch,
-                         *(b->base_dst) + b->dst, b->dst_stride);
-    }
-
-}
-
-void vp8_inverse_transform_mby(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
-{
-    int i;
-
-    if(x->mode_info_context->mbmi.mode != SPLITMV)
-    {
-        /* do 2nd order transform on the dc block */
-        IDCT_INVOKE(rtcd, iwalsh16)(x->block[24].dqcoeff, x->dqcoeff);
-    }
-
-    for (i = 0; i < 16; i++)
-    {
-        vp8_inverse_transform_b(rtcd, &x->block[i], 16);
-    }
-
-}
-void vp8_inverse_transform_mbuv(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
-{
-    int i;
-
-    for (i = 16; i < 24; i++)
-    {
-        vp8_inverse_transform_b(rtcd, &x->block[i], 8);
-    }
-
-}
--- a/vp8/common/invtrans.h
+++ b/vp8/common/invtrans.h
@@ -15,9 +15,51 @@
 #include "vpx_config.h"
 #include "idct.h"
 #include "blockd.h"
-extern void vp8_inverse_transform_b(const vp8_idct_rtcd_vtable_t *rtcd, BLOCKD *b, int pitch);
-extern void vp8_inverse_transform_mb(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x);
-extern void vp8_inverse_transform_mby(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x);
-extern void vp8_inverse_transform_mbuv(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x);
+#include "onyxc_int.h"

+static void eob_adjust(char *eobs, short *diff)
+{
+    /* eob adjust.... the idct can only skip if both the dc and eob are zero */
+    int js;
+    for(js = 0; js < 16; js++)
+    {
+        if((eobs[js] == 0) && (diff[0] != 0))
+            eobs[js]++;
+        diff+=16;
+    }
+}
+
+static void vp8_inverse_transform_mby(MACROBLOCKD *xd,
+                                      const VP8_COMMON_RTCD *rtcd)
+{
+    short *DQC = xd->block[0].dequant;
+    /* save the dc dequant constant in case it is overridden */
+    short dc_dequant_temp = DQC[0];
+
+    if (xd->mode_info_context->mbmi.mode != SPLITMV)
+    {
+        /* do 2nd order transform on the dc block */
+        if (xd->eobs[24] > 1)
+        {
+            IDCT_INVOKE(&rtcd->idct, iwalsh16)
+                (&xd->block[24].dqcoeff[0], xd->qcoeff);
+        }
+        else
+        {
+            IDCT_INVOKE(&rtcd->idct, iwalsh1)
+                (&xd->block[24].dqcoeff[0], xd->qcoeff);
+        }
+        eob_adjust(xd->eobs, xd->qcoeff);
+
+        /* override the dc dequant constant */
+        DQC[0] = 1;
+    }
+    DEQUANT_INVOKE (&rtcd->dequant, idct_add_y_block)
+                    (xd->qcoeff, xd->block[0].dequant,
+                     xd->dst.y_buffer,
+                     xd->dst.y_stride, xd->eobs);
+
+    /* restore the dc dequant constant */
+    DQC[0] = dc_dequant_temp;
+}
 #endif
--- a/vp8/common/onyxc_int.h
+++ b/vp8/common/onyxc_int.h
@@ -22,6 +22,7 @@
 #if CONFIG_POSTPROC
 #include "postproc.h"
 #endif
+#include "dequantize.h"

 /*#ifdef PACKET_TESTING*/
 #include "header.h"
@@ -73,6 +74,7 @@ typedef enum
 typedef struct VP8_COMMON_RTCD
 {
 #if CONFIG_RUNTIME_CPU_DETECT
+    vp8_dequant_rtcd_vtable_t        dequant;
    vp8_idct_rtcd_vtable_t        idct;
    vp8_recon_rtcd_vtable_t       recon;
    vp8_subpix_rtcd_vtable_t      subpix;
--- a/vp8/common/reconinter.c
+++ b/vp8/common/reconinter.c
@@ -334,11 +334,12 @@ void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x)


 /*encoder only*/
-void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x)
+void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x,
+                                         unsigned char *dst_y,
+                                         int dst_ystride)
 {
    unsigned char *ptr_base;
    unsigned char *ptr;
-    unsigned char *pred_ptr = x->predictor;
    int mv_row = x->mode_info_context->mbmi.mv.as_mv.row;
    int mv_col = x->mode_info_context->mbmi.mv.as_mv.col;
    int pre_stride = x->block[0].pre_stride;
@@ -348,11 +349,13 @@ void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x)

    if ((mv_row | mv_col) & 7)
    {
-        x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7, pred_ptr, 16);
+        x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7,
+                                 dst_y, dst_ystride);
    }
    else
    {
-        RECON_INVOKE(&x->rtcd->recon, copy16x16)(ptr, pre_stride, pred_ptr, 16);
+        RECON_INVOKE(&x->rtcd->recon, copy16x16)(ptr, pre_stride, dst_y,
+            dst_ystride);
    }
 }

@@ -596,69 +599,3 @@ void vp8_build_inter_predictors_mb(MACROBLOCKD *xd)
        build_inter4x4_predictors_mb(xd);
    }
 }
-/* encoder only*/
-static void build_inter4x4_predictors_mb_e(MACROBLOCKD *x)
-{
-    int i;
-
-    if (x->mode_info_context->mbmi.partitioning < 3)
-    {
-        x->block[ 0].bmi = x->mode_info_context->bmi[ 0];
-        x->block[ 2].bmi = x->mode_info_context->bmi[ 2];
-        x->block[ 8].bmi = x->mode_info_context->bmi[ 8];
-        x->block[10].bmi = x->mode_info_context->bmi[10];
-
-        build_inter_predictors4b(x, &x->block[ 0], x->block[ 0].predictor, 16);
-        build_inter_predictors4b(x, &x->block[ 2], x->block[ 2].predictor, 16);
-        build_inter_predictors4b(x, &x->block[ 8], x->block[ 8].predictor, 16);
-        build_inter_predictors4b(x, &x->block[10], x->block[10].predictor, 16);
-    }
-    else
-    {
-        for (i = 0; i < 16; i += 2)
-        {
-            BLOCKD *d0 = &x->block[i];
-            BLOCKD *d1 = &x->block[i+1];
-
-            x->block[i+0].bmi = x->mode_info_context->bmi[i+0];
-            x->block[i+1].bmi = x->mode_info_context->bmi[i+1];
-
-            if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
-                build_inter_predictors2b(x, d0, d0->predictor, 16);
-            else
-            {
-                build_inter_predictors_b(d0, d0->predictor, 16, x->subpixel_predict);
-                build_inter_predictors_b(d1, d1->predictor, 16, x->subpixel_predict);
-            }
-
-        }
-
-    }
-
-    for (i = 16; i < 24; i += 2)
-    {
-        BLOCKD *d0 = &x->block[i];
-        BLOCKD *d1 = &x->block[i+1];
-
-        if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
-            build_inter_predictors2b(x, d0, d0->predictor, 8);
-        else
-        {
-            build_inter_predictors_b(d0, d0->predictor, 8, x->subpixel_predict);
-            build_inter_predictors_b(d1, d1->predictor, 8, x->subpixel_predict);
-        }
-    }
-}
-void vp8_build_inter_predictors_mb_e(MACROBLOCKD *xd)
-{
-    if (xd->mode_info_context->mbmi.mode != SPLITMV)
-    {
-        vp8_build_inter16x16_predictors_mb(xd, xd->predictor, &xd->predictor[256],
-                                           &xd->predictor[320], 16, 8);
-    }
-    else
-    {
-        build_4x4uvmvs(xd);
-        build_inter4x4_predictors_mb_e(xd);
-    }
-}
--- a/vp8/common/reconinter.h
+++ b/vp8/common/reconinter.h
@@ -21,11 +21,13 @@ extern void vp8_build_inter16x16_predictors_mb(MACROBLOCKD *x,
                                               int dst_uvstride);


-extern void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x);
-extern void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, vp8_subpix_fn_t sppf);
+extern void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x,
+                                                unsigned char *dst_y,
+                                                int dst_ystride);
+extern void vp8_build_inter_predictors_b(BLOCKD *d, int pitch,
+                                         vp8_subpix_fn_t sppf);

 extern void vp8_build_inter16x16_predictors_mbuv(MACROBLOCKD *x);
 extern void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x);
-extern void vp8_build_inter_predictors_mb_e(MACROBLOCKD *xd);

 #endif
--- a/vp8/decoder/x86/dequantize_mmx.asm
+++ b/vp8/decoder/x86/dequantize_mmx.asm
--- a/vp8/decoder/x86/dequantize_x86.h
+++ b/vp8/decoder/x86/dequantize_x86.h
--- a/vp8/decoder/x86/idct_blk_mmx.c
+++ b/vp8/decoder/x86/idct_blk_mmx.c
@@ -10,7 +10,17 @@

 #include "vpx_config.h"
 #include "vp8/common/idct.h"
-#include "vp8/decoder/dequantize.h"
+#include "vp8/common/dequantize.h"
+
+extern void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q);
+
+void vp8_dequantize_b_mmx(BLOCKD *d)
+{
+    short *sq = (short *) d->qcoeff;
+    short *dq = (short *) d->dqcoeff;
+    short *q = (short *) d->dequant;
+    vp8_dequantize_b_impl_mmx(sq, dq, q);
+}

 void vp8_dequant_idct_add_y_block_mmx
            (short *q, short *dq,
--- a/vp8/decoder/x86/idct_blk_sse2.c
+++ b/vp8/decoder/x86/idct_blk_sse2.c
@@ -10,7 +10,7 @@

 #include "vpx_config.h"
 #include "vp8/common/idct.h"
-#include "vp8/decoder/dequantize.h"
+#include "vp8/common/dequantize.h"

 void vp8_idct_dequant_0_2x_sse2
            (short *q, short *dq ,
--- a/vp8/common/x86/x86_systemdependent.c
+++ b/vp8/common/x86/x86_systemdependent.c
@@ -37,6 +37,11 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx)

    if (flags & HAS_MMX)
    {
+        rtcd->dequant.block               = vp8_dequantize_b_mmx;
+        rtcd->dequant.idct_add            = vp8_dequant_idct_add_mmx;
+        rtcd->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_mmx;
+        rtcd->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_mmx;
+
        rtcd->idct.idct16       = vp8_short_idct4x4llm_mmx;
        rtcd->idct.idct1_scalar_add = vp8_dc_only_idct_add_mmx;
        rtcd->idct.iwalsh16     = vp8_short_inv_walsh4x4_mmx;
@@ -82,6 +87,9 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx)
        rtcd->recon.build_intra_predictors_mbuv_s =
            vp8_build_intra_predictors_mbuv_s_sse2;

+        rtcd->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_sse2;
+        rtcd->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_sse2;
+
        rtcd->idct.iwalsh16     = vp8_short_inv_walsh4x4_sse2;

        rtcd->subpix.sixtap16x16   = vp8_sixtap_predict16x16_sse2;
--- a/vp8/decoder/arm/arm_dsystemdependent.c
+++ b/vp8/decoder/arm/arm_dsystemdependent.c
@@ -11,9 +11,6 @@

 #include "vpx_config.h"
 #include "vpx_ports/arm.h"
-#include "vp8/common/blockd.h"
-#include "vp8/common/pragmas.h"
-#include "vp8/decoder/dequantize.h"
 #include "vp8/decoder/onyxd_int.h"

 void vp8_arch_arm_decode_init(VP8D_COMP *pbi)
@@ -30,20 +27,12 @@ void vp8_arch_arm_decode_init(VP8D_COMP *pbi)
 #if HAVE_ARMV6
    if (flags & HAS_MEDIA)
    {
-        pbi->dequant.block               = vp8_dequantize_b_v6;
-        pbi->dequant.idct_add            = vp8_dequant_idct_add_v6;
-        pbi->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_v6;
-        pbi->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_v6;
    }
 #endif

 #if HAVE_ARMV7
    if (flags & HAS_NEON)
    {
-        pbi->dequant.block               = vp8_dequantize_b_neon;
-        pbi->dequant.idct_add            = vp8_dequant_idct_add_neon;
-        pbi->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_neon;
-        pbi->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_neon;
    }
 #endif
 #endif
--- a/vp8/decoder/decodframe.c
+++ b/vp8/decoder/decodframe.c
@@ -15,7 +15,7 @@
 #include "vp8/common/reconintra4x4.h"
 #include "vp8/common/recon.h"
 #include "vp8/common/reconinter.h"
-#include "dequantize.h"
+#include "vp8/common/dequantize.h"
 #include "detokenize.h"
 #include "vp8/common/invtrans.h"
 #include "vp8/common/alloccommon.h"
@@ -32,7 +32,7 @@
 #endif
 #include "vpx_mem/vpx_mem.h"
 #include "vp8/common/idct.h"
-#include "dequantize.h"
+
 #include "vp8/common/threading.h"
 #include "decoderthreading.h"
 #include "dboolhuff.h"
@@ -218,7 +218,7 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
            {
                if (xd->eobs[i] > 1)
                {
-                    DEQUANT_INVOKE(&pbi->dequant, idct_add)
+                    DEQUANT_INVOKE(&pbi->common.rtcd.dequant, idct_add)
                        (b->qcoeff, b->dequant,
                        *(b->base_dst) + b->dst, b->dst_stride);
                }
@@ -247,7 +247,7 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
            /* do 2nd order transform on the dc block */
            if (xd->eobs[24] > 1)
            {
-                DEQUANT_INVOKE(&pbi->dequant, block)(b);
+                DEQUANT_INVOKE(&pbi->common.rtcd.dequant, block)(b);

                IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0],
                    xd->qcoeff);
@@ -272,7 +272,7 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
            DQC[0] = 1;
        }

-        DEQUANT_INVOKE (&pbi->dequant, idct_add_y_block)
+        DEQUANT_INVOKE (&pbi->common.rtcd.dequant, idct_add_y_block)
                        (xd->qcoeff, xd->block[0].dequant,
                         xd->dst.y_buffer,
                         xd->dst.y_stride, xd->eobs);
@@ -281,7 +281,7 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
        DQC[0] = dc_dequant_temp;
    }

-    DEQUANT_INVOKE (&pbi->dequant, idct_add_uv_block)
+    DEQUANT_INVOKE (&pbi->common.rtcd.dequant, idct_add_uv_block)
                    (xd->qcoeff+16*16, xd->block[16].dequant,
                     xd->dst.u_buffer, xd->dst.v_buffer,
                     xd->dst.uv_stride, xd->eobs+16);
--- a/vp8/decoder/generic/dsystemdependent.c
+++ b/vp8/decoder/generic/dsystemdependent.c
@@ -10,7 +10,7 @@


 #include "vpx_config.h"
-#include "vp8/decoder/dequantize.h"
+#include "vp8/common/dequantize.h"
 #include "vp8/decoder/onyxd_int.h"

 extern void vp8_arch_x86_decode_init(VP8D_COMP *pbi);
@@ -20,11 +20,7 @@ void vp8_dmachine_specific_config(VP8D_COMP *pbi)
 {
    /* Pure C: */
 #if CONFIG_RUNTIME_CPU_DETECT
-    pbi->mb.rtcd                     = &pbi->common.rtcd;
-    pbi->dequant.block               = vp8_dequantize_b_c;
-    pbi->dequant.idct_add            = vp8_dequant_idct_add_c;
-    pbi->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_c;
-    pbi->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_c;
+    pbi->mb.rtcd                               = &pbi->common.rtcd;
 #endif

 #if ARCH_X86 || ARCH_X86_64
--- a/vp8/decoder/onyxd_int.h
+++ b/vp8/decoder/onyxd_int.h
@@ -16,7 +16,8 @@
 #include "treereader.h"
 #include "vp8/common/onyxc_int.h"
 #include "vp8/common/threading.h"
-#include "dequantize.h"
+
+
 #if CONFIG_ERROR_CONCEALMENT
 #include "ec_types.h"
 #endif
@@ -93,11 +94,6 @@ typedef struct VP8Decompressor

    DATARATE dr[16];

-#if CONFIG_RUNTIME_CPU_DETECT
-    vp8_dequant_rtcd_vtable_t        dequant;
-#endif
-
-
    vp8_prob prob_intra;
    vp8_prob prob_last;
    vp8_prob prob_gf;
--- a/vp8/decoder/threading.c
+++ b/vp8/decoder/threading.c
@@ -189,7 +189,7 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m
            {
                if (xd->eobs[i] > 1)
                {
-                    DEQUANT_INVOKE(&pbi->dequant, idct_add)
+                    DEQUANT_INVOKE(&pbi->common.rtcd.dequant, idct_add)
                        (b->qcoeff, b->dequant,
                        *(b->base_dst) + b->dst, b->dst_stride);
                }
@@ -217,7 +217,7 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m
            /* do 2nd order transform on the dc block */
            if (xd->eobs[24] > 1)
            {
-                DEQUANT_INVOKE(&pbi->dequant, block)(b);
+                DEQUANT_INVOKE(&pbi->common.rtcd.dequant, block)(b);

                IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0],
                    xd->qcoeff);
@@ -248,13 +248,13 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m
            DQC = local_dequant;
        }

-        DEQUANT_INVOKE (&pbi->dequant, idct_add_y_block)
+        DEQUANT_INVOKE (&pbi->common.rtcd.dequant, idct_add_y_block)
                        (xd->qcoeff, DQC,
                         xd->dst.y_buffer,
                         xd->dst.y_stride, xd->eobs);
    }

-    DEQUANT_INVOKE (&pbi->dequant, idct_add_uv_block)
+    DEQUANT_INVOKE (&pbi->common.rtcd.dequant, idct_add_uv_block)
                    (xd->qcoeff+16*16, xd->block[16].dequant,
                     xd->dst.u_buffer, xd->dst.v_buffer,
                     xd->dst.uv_stride, xd->eobs+16);
--- a/vp8/decoder/x86/x86_dsystemdependent.c
+++ b/vp8/decoder/x86/x86_dsystemdependent.c
@@ -13,47 +13,7 @@
 #include "vpx_ports/x86.h"
 #include "vp8/decoder/onyxd_int.h"

-
-#if HAVE_MMX
-void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q);
-
-void vp8_dequantize_b_mmx(BLOCKD *d)
-{
-    short *sq = (short *) d->qcoeff;
-    short *dq = (short *) d->dqcoeff;
-    short *q = (short *) d->dequant;
-    vp8_dequantize_b_impl_mmx(sq, dq, q);
-}
-#endif
-
 void vp8_arch_x86_decode_init(VP8D_COMP *pbi)
 {
-#if CONFIG_RUNTIME_CPU_DETECT
-    int flags = x86_simd_caps();

-    /* Note:
-     *
-     * This platform can be built without runtime CPU detection as well. If
-     * you modify any of the function mappings present in this file, be sure
-     * to also update them in static mapings (<arch>/filename_<arch>.h)
-     */
-    /* Override default functions with fastest ones for this CPU. */
-#if HAVE_MMX
-    if (flags & HAS_MMX)
-    {
-        pbi->dequant.block               = vp8_dequantize_b_mmx;
-        pbi->dequant.idct_add            = vp8_dequant_idct_add_mmx;
-        pbi->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_mmx;
-        pbi->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_mmx;
-    }
-#endif
-#if HAVE_SSE2
-    if (flags & HAS_SSE2)
-    {
-        pbi->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_sse2;
-        pbi->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_sse2;
-    }
-#endif
-
-#endif
 }
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -1091,8 +1091,10 @@ static void adjust_act_zbin( VP8_COMP *cpi, MACROBLOCK *x )
 #endif
 }

-int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t, int mb_row, int mb_col)
+int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t,
+                                   int mb_row, int mb_col)
 {
+    MACROBLOCKD *xd = &x->e_mbd;
    int rate;

    if (cpi->sf.RD && cpi->compressor_speed != 2)
@@ -1112,14 +1114,17 @@ int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t,
        vp8_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x);

    vp8_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x);
+
    sum_intra_stats(cpi, x);
    vp8_tokenize_mb(cpi, &x->e_mbd, t);

-    if (x->e_mbd.mode_info_context->mbmi.mode != B_PRED)
-        vp8_inverse_transform_mby(IF_RTCD(&cpi->rtcd.common->idct), &x->e_mbd);
-
-    vp8_inverse_transform_mbuv(IF_RTCD(&cpi->rtcd.common->idct), &x->e_mbd);
+    if (xd->mode_info_context->mbmi.mode != B_PRED)
+        vp8_inverse_transform_mby(xd, IF_RTCD(&cpi->common.rtcd));

+    DEQUANT_INVOKE (&cpi->common.rtcd.dequant, idct_add_uv_block)
+                    (xd->qcoeff+16*16, xd->block[16].dequant,
+                     xd->dst.u_buffer, xd->dst.v_buffer,
+                     xd->dst.uv_stride, xd->eobs+16);
    return rate;
 }
 #ifdef SPEEDSTATS
@@ -1312,12 +1317,14 @@ int vp8cx_encode_inter_macroblock
    if (!x->skip)
    {
        vp8_tokenize_mb(cpi, xd, t);
-        if (x->e_mbd.mode_info_context->mbmi.mode != B_PRED)
-        {
-          vp8_inverse_transform_mby(IF_RTCD(&cpi->rtcd.common->idct),
-                                      &x->e_mbd);
-        }
-        vp8_inverse_transform_mbuv(IF_RTCD(&cpi->rtcd.common->idct), &x->e_mbd);
+
+        if (xd->mode_info_context->mbmi.mode != B_PRED)
+            vp8_inverse_transform_mby(xd, IF_RTCD(&cpi->common.rtcd));
+
+        DEQUANT_INVOKE (&cpi->common.rtcd.dequant, idct_add_uv_block)
+                        (xd->qcoeff+16*16, xd->block[16].dequant,
+                         xd->dst.u_buffer, xd->dst.v_buffer,
+                         xd->dst.uv_stride, xd->eobs+16);
    }
    else
    {
--- a/vp8/encoder/encodeintra.c
+++ b/vp8/encoder/encodeintra.c
@@ -45,7 +45,7 @@ int vp8_encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred)

        vp8_encode_intra16x16mby(rtcd, x);

-        vp8_inverse_transform_mby(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
+        vp8_inverse_transform_mby(&x->e_mbd, IF_RTCD(&cpi->common.rtcd));
    }
    else
    {
@@ -77,8 +77,17 @@ void vp8_encode_intra4x4block(const VP8_ENCODER_RTCD *rtcd,

    x->quantize_b(be, b);

-    vp8_inverse_transform_b(IF_RTCD(&rtcd->common->idct), b, 16);
-
+    if (*b->eob > 1)
+    {
+        IDCT_INVOKE(IF_RTCD(&rtcd->common->idct), idct16)(b->dqcoeff,
+            b->predictor, 16, *(b->base_dst) + b->dst, b->dst_stride);
+    }
+    else
+    {
+        IDCT_INVOKE(IF_RTCD(&rtcd->common->idct), idct1_scalar_add)
+            (b->dqcoeff[0], b->predictor, 16, *(b->base_dst) + b->dst,
+                b->dst_stride);
+    }
 }

 void vp8_encode_intra4x4mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *mb)
@@ -96,11 +105,12 @@ void vp8_encode_intra4x4mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *mb)
 void vp8_encode_intra16x16mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
 {
    BLOCK *b = &x->block[0];
+    MACROBLOCKD *xd = &x->e_mbd;

-    RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_mby)(&x->e_mbd);
+    RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_mby_s)(&x->e_mbd);

-    ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, *(b->base_src),
-        b->src_stride, x->e_mbd.predictor, 16);
+    ENCODEMB_INVOKE(&rtcd->encodemb, submby) (x->src_diff, *(b->base_src),
+        b->src_stride, xd->dst.y_buffer, xd->dst.y_stride);

    vp8_transform_intra_mby(x);

@@ -108,16 +118,17 @@ void vp8_encode_intra16x16mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)

    if (x->optimize)
        vp8_optimize_mby(x, rtcd);
-
 }

 void vp8_encode_intra16x16mbuv(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
 {
-    RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_mbuv)(&x->e_mbd);
+    MACROBLOCKD *xd = &x->e_mbd;
+
+    RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_mbuv_s)(&x->e_mbd);

    ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer,
-        x->src.v_buffer, x->src.uv_stride, &x->e_mbd.predictor[256],
-        &x->e_mbd.predictor[320], 8);
+        x->src.v_buffer, x->src.uv_stride, xd->dst.u_buffer,
+        xd->dst.v_buffer, xd->dst.uv_stride);

    vp8_transform_mbuv(x);

@@ -125,5 +136,4 @@ void vp8_encode_intra16x16mbuv(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)

    if (x->optimize)
        vp8_optimize_mbuv(x, rtcd);
-
 }
--- a/vp8/encoder/encodemb.c
+++ b/vp8/encoder/encodemb.c
@@ -105,10 +105,10 @@ static void vp8_subtract_mb(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
    BLOCK *b = &x->block[0];

    ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, *(b->base_src),
-        b->src_stride, x->e_mbd.predictor, 16);
+        b->src_stride, x->e_mbd.dst.y_buffer, x->e_mbd.dst.y_stride);
    ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer,
-        x->src.v_buffer, x->src.uv_stride, &x->e_mbd.predictor[256],
-        &x->e_mbd.predictor[320], 8);
+        x->src.v_buffer, x->src.uv_stride, x->e_mbd.dst.u_buffer,
+        x->e_mbd.dst.v_buffer, x->e_mbd.dst.uv_stride);
 }

 static void build_dcblock(MACROBLOCK *x)
@@ -625,7 +625,7 @@ void vp8_optimize_mbuv(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd)

 void vp8_encode_inter16x16(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
 {
-    vp8_build_inter_predictors_mb_e(&x->e_mbd);
+    vp8_build_inter_predictors_mb(&x->e_mbd);

    vp8_subtract_mb(rtcd, x);

@@ -635,7 +635,6 @@ void vp8_encode_inter16x16(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)

    if (x->optimize)
        optimize_mb(x, rtcd);
-
 }

 /* this funciton is used by first pass only */
@@ -643,15 +642,15 @@ void vp8_encode_inter16x16y(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
 {
    BLOCK *b = &x->block[0];

-    vp8_build_inter16x16_predictors_mby(&x->e_mbd);
+    vp8_build_inter16x16_predictors_mby(&x->e_mbd, x->e_mbd.dst.y_buffer,
+                                        x->e_mbd.dst.y_stride);

    ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, *(b->base_src),
-        b->src_stride, x->e_mbd.predictor, 16);
+        b->src_stride, x->e_mbd.dst.y_buffer, x->e_mbd.dst.y_stride);

    transform_mby(x);

    vp8_quantize_mby(x);

-    vp8_inverse_transform_mby(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
-
+    vp8_inverse_transform_mby(&x->e_mbd, IF_RTCD(rtcd->common));
 }
--- a/vp8/encoder/encodemb.h
+++ b/vp8/encoder/encodemb.h
@@ -12,6 +12,7 @@
 #ifndef __INC_ENCODEMB_H
 #define __INC_ENCODEMB_H

+
 #include "vpx_config.h"
 #include "block.h"

--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -2166,7 +2166,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
                continue;

            vp8_set_mbmode_and_mvs(x, this_mode, &mode_mv[this_mode]);
-            vp8_build_inter16x16_predictors_mby(&x->e_mbd);
+            vp8_build_inter16x16_predictors_mby(&x->e_mbd, x->e_mbd.predictor, 16);

            if (cpi->active_map_enabled && x->active_ptr[0] == 0) {
                x->skip = 1;
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -20,6 +20,8 @@ VP8_COMMON_SRCS-yes += common/blockd.c
 VP8_COMMON_SRCS-yes += common/coefupdateprobs.h
 VP8_COMMON_SRCS-yes += common/debugmodes.c
 VP8_COMMON_SRCS-yes += common/default_coef_probs.h
+VP8_COMMON_SRCS-yes += common/dequantize.c
+VP8_COMMON_SRCS-yes += common/dequantize.h
 VP8_COMMON_SRCS-yes += common/entropy.c
 VP8_COMMON_SRCS-yes += common/entropymode.c
 VP8_COMMON_SRCS-yes += common/entropymv.c
@@ -28,6 +30,7 @@ VP8_COMMON_SRCS-yes += common/filter.c
 VP8_COMMON_SRCS-yes += common/filter.h
 VP8_COMMON_SRCS-yes += common/findnearmv.c
 VP8_COMMON_SRCS-yes += common/generic/systemdependent.c
+VP8_COMMON_SRCS-yes += common/idct_blk.c
 VP8_COMMON_SRCS-yes += common/idctllm.c
 VP8_COMMON_SRCS-yes += common/alloccommon.h
 VP8_COMMON_SRCS-yes += common/blockd.h
@@ -57,7 +60,6 @@ VP8_COMMON_SRCS-yes += common/swapyv12buffer.h
 VP8_COMMON_SRCS-yes += common/systemdependent.h
 VP8_COMMON_SRCS-yes += common/threading.h
 VP8_COMMON_SRCS-yes += common/treecoder.h
-VP8_COMMON_SRCS-yes += common/invtrans.c
 VP8_COMMON_SRCS-yes += common/loopfilter.c
 VP8_COMMON_SRCS-yes += common/loopfilter_filters.c
 VP8_COMMON_SRCS-yes += common/mbpitch.c
@@ -69,9 +71,13 @@ VP8_COMMON_SRCS-yes += common/reconintra.c
 VP8_COMMON_SRCS-yes += common/reconintra4x4.c
 VP8_COMMON_SRCS-yes += common/setupintrarecon.c
 VP8_COMMON_SRCS-yes += common/swapyv12buffer.c
+
+
+
 VP8_COMMON_SRCS-$(CONFIG_POSTPROC_VISUALIZER) += common/textblit.c
 VP8_COMMON_SRCS-yes += common/treecoder.c

+VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/dequantize_x86.h
 VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/filter_x86.c
 VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/filter_x86.h
 VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/idct_x86.h
@@ -84,11 +90,14 @@ VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp8_asm_stubs.c
 VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/loopfilter_x86.c
 VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/postproc.h
 VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/postproc.c
+VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/dequantize_mmx.asm
+VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/idct_blk_mmx.c
 VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/idctllm_mmx.asm
 VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/iwalsh_mmx.asm
 VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/recon_mmx.asm
 VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/subpixel_mmx.asm
 VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/loopfilter_mmx.asm
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idct_blk_sse2.c
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idctllm_sse2.asm
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_sse2.asm
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_wrapper_sse2.c
@@ -115,6 +124,8 @@ VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/loopfilter_arm.h
 VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/recon_arm.h
 VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/reconintra_arm.c
 VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/subpixel_arm.h
+VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/dequantize_arm.c
+VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/dequantize_arm.h

 # common (armv6)
 VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/bilinearfilter_v6$(ASM)
@@ -129,6 +140,9 @@ VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/loopfilter_v6$(ASM)
 VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/simpleloopfilter_v6$(ASM)
 VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/sixtappredict8x4_v6$(ASM)
 VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/intra4x4_predict_v6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/dequant_idct_v6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/dequantize_v6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/idct_blk_v6.c

 # common (neon)
 VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/bilinearpredict4x4_neon$(ASM)
@@ -151,3 +165,8 @@ VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/sixtappredict8x8_neon$(ASM)
 VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/sixtappredict16x16_neon$(ASM)
 VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/buildintrapredictorsmby_neon$(ASM)
 VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/save_neon_reg$(ASM)
+VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/dequant_idct_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/idct_dequant_full_2x_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/idct_dequant_0_2x_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/dequantizeb_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/idct_blk_neon.c
--- a/vp8/vp8dx.mk
+++ b/vp8/vp8dx.mk
@@ -52,7 +52,6 @@ VP8_DX_SRCS-yes += decoder/asm_dec_offsets.c
 VP8_DX_SRCS-yes += decoder/dboolhuff.c
 VP8_DX_SRCS-yes += decoder/decodemv.c
 VP8_DX_SRCS-yes += decoder/decodframe.c
-VP8_DX_SRCS-yes += decoder/dequantize.c
 VP8_DX_SRCS-yes += decoder/detokenize.c
 VP8_DX_SRCS-$(CONFIG_ERROR_CONCEALMENT) += decoder/ec_types.h
 VP8_DX_SRCS-$(CONFIG_ERROR_CONCEALMENT) += decoder/error_concealment.h
@@ -61,20 +60,14 @@ VP8_DX_SRCS-yes += decoder/generic/dsystemdependent.c
 VP8_DX_SRCS-yes += decoder/dboolhuff.h
 VP8_DX_SRCS-yes += decoder/decodemv.h
 VP8_DX_SRCS-yes += decoder/decoderthreading.h
-VP8_DX_SRCS-yes += decoder/dequantize.h
 VP8_DX_SRCS-yes += decoder/detokenize.h
 VP8_DX_SRCS-yes += decoder/onyxd_int.h
 VP8_DX_SRCS-yes += decoder/treereader.h
 VP8_DX_SRCS-yes += decoder/onyxd_if.c
 VP8_DX_SRCS-$(CONFIG_MULTITHREAD) += decoder/threading.c
-VP8_DX_SRCS-yes += decoder/idct_blk.c
 VP8_DX_SRCS-$(CONFIG_MULTITHREAD) += decoder/reconintra_mt.h
 VP8_DX_SRCS-$(CONFIG_MULTITHREAD) += decoder/reconintra_mt.c

 VP8_DX_SRCS-yes := $(filter-out $(VP8_DX_SRCS_REMOVE-yes),$(VP8_DX_SRCS-yes))

-VP8_DX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += decoder/x86/dequantize_x86.h
 VP8_DX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += decoder/x86/x86_dsystemdependent.c
-VP8_DX_SRCS-$(HAVE_MMX) += decoder/x86/dequantize_mmx.asm
-VP8_DX_SRCS-$(HAVE_MMX) += decoder/x86/idct_blk_mmx.c
-VP8_DX_SRCS-$(HAVE_SSE2) += decoder/x86/idct_blk_sse2.c
--- a/vp8/vp8dx_arm.mk
+++ b/vp8/vp8dx_arm.mk
@@ -12,17 +12,3 @@
 #VP8_DX_SRCS list is modified according to different platforms.

 VP8_DX_SRCS-$(ARCH_ARM)  += decoder/arm/arm_dsystemdependent.c
-VP8_DX_SRCS-$(ARCH_ARM)  += decoder/arm/dequantize_arm.c
-VP8_DX_SRCS-$(ARCH_ARM)  += decoder/arm/dequantize_arm.h
-
-#File list for armv6
-VP8_DX_SRCS-$(HAVE_ARMV6)  += decoder/arm/armv6/dequant_idct_v6$(ASM)
-VP8_DX_SRCS-$(HAVE_ARMV6)  += decoder/arm/armv6/dequantize_v6$(ASM)
-VP8_DX_SRCS-$(HAVE_ARMV6)  += decoder/arm/armv6/idct_blk_v6.c
-
-#File list for neon
-VP8_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/dequant_idct_neon$(ASM)
-VP8_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/idct_dequant_full_2x_neon$(ASM)
-VP8_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/idct_dequant_0_2x_neon$(ASM)
-VP8_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/dequantizeb_neon$(ASM)
-VP8_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/idct_blk_neon.c