vpx/vp8/encoder/dct.c
Yaowu Xu d0dd01b8ce Redo the forward 4x4 dct
The new fdct lowers the round-trip sum squared error for a
4x4 block to ~0.12, or ~0.008/pixel. For reference, the old
matrix multiply version has an average round-trip error of 1.46
for a 4x4 block.

Thanks to "derf" for his suggestions and references.

Change-Id: I5559d1e81d333b319404ab16b336b739f87afc79
2010-06-24 13:17:58 -07:00

116 lines
2.5 KiB
C

/*
* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <math.h>
/*
 * Forward 4x4 DCT, integer approximation.
 *
 * input  - source block. Four rows of four shorts are read; consecutive
 *          rows are pitch / 2 shorts apart (so pitch is presumably a
 *          stride expressed in bytes - confirm against callers).
 * output - receives the 16 coefficients in row-major order. It is also
 *          used as scratch storage between the two passes.
 * pitch  - row stride of input, see above.
 *
 * The multipliers 2217 and 5352 are approximately sqrt(2)*cos(3*pi/8) and
 * sqrt(2)*sin(3*pi/8) in 12-bit fixed point (x4096). The additive constants
 * in front of each right shift are rounding offsets.
 *
 * NOTE: right shifts of negative intermediates rely on the compiler
 * implementing them as arithmetic shifts.
 */
void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
{
    int i;
    int a1, b1, c1, d1;
    short *ip = input;
    short *op = output;

    /* Pass 1: horizontal transform of each input row into output. */
    for (i = 0; i < 4; i++)
    {
        /*
         * Pre-scale by 8 to keep extra precision for the second pass.
         * A multiply is used rather than "<< 3" because these sums and
         * differences can be negative, and left-shifting a negative
         * signed value is undefined behaviour in C.
         */
        a1 = (ip[0] + ip[3]) * 8;
        b1 = (ip[1] + ip[2]) * 8;
        c1 = (ip[1] - ip[2]) * 8;
        d1 = (ip[0] - ip[3]) * 8;

        op[0] = a1 + b1;
        op[2] = a1 - b1;

        op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12;
        op[3] = (d1 * 2217 - c1 * 5352 +  7500) >> 12;

        ip += pitch / 2;
        op += 4;
    }

    /* Pass 2: vertical transform, in place, one column of output at a time. */
    ip = output;
    op = output;

    for (i = 0; i < 4; i++)
    {
        a1 = ip[0] + ip[12];
        b1 = ip[4] + ip[8];
        c1 = ip[4] - ip[8];
        d1 = ip[0] - ip[12];

        /* ">> 4" removes the x8 pre-scale plus one further bit. */
        op[0] = (a1 + b1 + 7) >> 4;
        op[8] = (a1 - b1 + 7) >> 4;

        /* 16-bit shift: 12 fractional bits of the multipliers plus the 4 above.
         * The (d1 != 0) term adds 1 to this coefficient whenever d1 is non-zero. */
        op[4]  = ((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0);
        op[12] = (d1 * 2217 - c1 * 5352 + 51000) >> 16;

        ip++;
        op++;
    }
}
/*
 * Forward DCT of two horizontally adjacent 4x4 blocks.
 *
 * The left block starts at input and its 16 coefficients go to output;
 * the right block starts 4 shorts further along and its coefficients go
 * to output + 16. pitch is passed through unchanged to the 4x4 transform.
 */
void vp8_short_fdct8x4_c(short *input, short *output, int pitch)
{
    int blk;

    for (blk = 0; blk < 2; blk++)
    {
        vp8_short_fdct4x4_c(input + 4 * blk, output + 16 * blk, pitch);
    }
}
/*
 * Forward 4x4 Walsh-Hadamard style transform.
 *
 * input  - source block. Four rows of four shorts are read; consecutive
 *          rows are pitch / 2 shorts apart.
 * output - receives 16 results in row-major order and doubles as scratch
 *          storage between the row pass and the column pass.
 * pitch  - row stride of input, see above.
 */
void vp8_short_walsh4x4_c(short *input, short *output, int pitch)
{
    const int stride = pitch / 2;
    short *src = input;
    short *dst = output;
    int row, col;

    /* Row pass: butterfly each input row into the matching output row. */
    for (row = 0; row < 4; row++, src += stride, dst += 4)
    {
        const int outer_sum  = src[0] + src[3];
        const int inner_sum  = src[1] + src[2];
        const int inner_diff = src[1] - src[2];
        const int outer_diff = src[0] - src[3];

        dst[0] = outer_sum + inner_sum;
        dst[1] = inner_diff + outer_diff;
        dst[2] = outer_sum - inner_sum;
        dst[3] = outer_diff - inner_diff;
    }

    /* Column pass: same butterfly down each column, in place, then halve. */
    for (col = 0; col < 4; col++)
    {
        short *v = output + col;

        const int outer_sum  = v[0] + v[12];
        const int inner_sum  = v[4] + v[8];
        const int inner_diff = v[4] - v[8];
        const int outer_diff = v[0] - v[12];

        int t0 = outer_sum + inner_sum;
        int t1 = inner_diff + outer_diff;
        int t2 = outer_sum - inner_sum;
        int t3 = outer_diff - inner_diff;

        /* Only strictly positive values are bumped by one before halving. */
        if (t0 > 0) t0++;
        if (t1 > 0) t1++;
        if (t2 > 0) t2++;
        if (t3 > 0) t3++;

        v[0]  = t0 >> 1;
        v[4]  = t1 >> 1;
        v[8]  = t2 >> 1;
        v[12] = t3 >> 1;
    }
}