Redo the forward 4x4 dct

The new fdct lowers the round trip sum squared error for a 4x4 block ~0.12. or ~0.008/pixel. For reference, the old matrix multiply version has average round trip error 1.46 for a 4x4 block. Thanks to "derf" for his suggestions and references. Change-Id: I5559d1e81d333b319404ab16b336b739f87afc79
2010-06-16 12:52:18 -07:00
parent a5906668a3
commit d0dd01b8ce
14 changed files with 118 additions and 562 deletions
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -137,8 +137,6 @@ extern unsigned int inter_b_modes[15];

 extern void (*vp8_short_fdct4x4)(short *input, short *output, int pitch);
 extern void (*vp8_short_fdct8x4)(short *input, short *output, int pitch);
-extern void (*vp8_fast_fdct4x4)(short *input, short *output, int pitch);
-extern void (*vp8_fast_fdct8x4)(short *input, short *output, int pitch);

 extern const int vp8_bits_per_mb[2][QINDEX_RANGE];

@@ -1136,15 +1134,11 @@ void vp8_set_speed_features(VP8_COMP *cpi)
    {
        cpi->mb.vp8_short_fdct8x4 = FDCT_INVOKE(&cpi->rtcd.fdct, short8x4);
        cpi->mb.vp8_short_fdct4x4 = FDCT_INVOKE(&cpi->rtcd.fdct, short4x4);
-        cpi->mb.short_fdct8x4rd = FDCT_INVOKE(&cpi->rtcd.fdct, short8x4);
-        cpi->mb.short_fdct4x4rd = FDCT_INVOKE(&cpi->rtcd.fdct, short4x4);
    }
    else
    {
        cpi->mb.vp8_short_fdct8x4   = FDCT_INVOKE(&cpi->rtcd.fdct, fast8x4);
        cpi->mb.vp8_short_fdct4x4   = FDCT_INVOKE(&cpi->rtcd.fdct, fast4x4);
-        cpi->mb.short_fdct8x4rd = FDCT_INVOKE(&cpi->rtcd.fdct, fast8x4);
-        cpi->mb.short_fdct4x4rd = FDCT_INVOKE(&cpi->rtcd.fdct, fast4x4);
    }

    cpi->mb.short_walsh4x4 = FDCT_INVOKE(&cpi->rtcd.fdct, walsh_short4x4);