vpx/vp9/common/vp9_idct.c

1407 lines
42 KiB
C
Raw Normal View History

2010-05-18 17:58:33 +02:00
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
2010-05-18 17:58:33 +02:00
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
2010-05-18 17:58:33 +02:00
*/
#include <assert.h>
#include <math.h>
#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_systemdependent.h"
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_idct.h"
void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) {
/* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
0.5 shifts per pixel. */
int i;
int16_t output[16];
int a1, b1, c1, d1, e1;
const int16_t *ip = input;
int16_t *op = output;
for (i = 0; i < 4; i++) {
a1 = ip[0] >> UNIT_QUANT_SHIFT;
c1 = ip[1] >> UNIT_QUANT_SHIFT;
d1 = ip[2] >> UNIT_QUANT_SHIFT;
b1 = ip[3] >> UNIT_QUANT_SHIFT;
a1 += c1;
d1 -= b1;
e1 = (a1 - d1) >> 1;
b1 = e1 - b1;
c1 = e1 - c1;
a1 -= b1;
d1 += c1;
op[0] = a1;
op[1] = b1;
op[2] = c1;
op[3] = d1;
ip += 4;
op += 4;
}
ip = output;
for (i = 0; i < 4; i++) {
a1 = ip[4 * 0];
c1 = ip[4 * 1];
d1 = ip[4 * 2];
b1 = ip[4 * 3];
a1 += c1;
d1 -= b1;
e1 = (a1 - d1) >> 1;
b1 = e1 - b1;
c1 = e1 - c1;
a1 -= b1;
d1 += c1;
dest[stride * 0] = clip_pixel(dest[stride * 0] + a1);
dest[stride * 1] = clip_pixel(dest[stride * 1] + b1);
dest[stride * 2] = clip_pixel(dest[stride * 2] + c1);
dest[stride * 3] = clip_pixel(dest[stride * 3] + d1);
ip++;
dest++;
}
}
Add lossless compression mode. This commit adds lossless compression capability to the experimental branch. The lossless experiment can be enabled using --enable-lossless in configure. When the experiment is enabled, the encoder will use lossless compression mode by command line option --lossless, and the decoder automatically recognizes a losslessly encoded clip and decodes accordingly. To achieve the lossless coding, this commit has changed the following: 1. To encode at lossless mode, encoder forces the use of unit quantizer, i.e, Q 0, where effective quantization is 1. Encoder also disables the usage of 8x8 transform and allows only 4x4 transform; 2. At Q 0, the first order 4x4 DCT/IDCT have been switched over to a pair of forward and inverse Walsh-Hadamard Transform (http://goo.gl/EIsfy), with proper scaling applied to match the range of the original 4x4 DCT/IDCT pair; 3. At Q 0, the second order remains to use the previous walsh-hadamard transform pair. However, to maintain the reversibility in second order transform at Q 0, scaling down is applied to first order DC coefficients prior to forward transform, and scaling up is applied to the second order output prior to quantization. Symmetric upscaling and downscaling are added around inverse second order transform; 4. At lossless mode, encoder also disables a number of minor features to ensure no loss is introduced, these features includes: a. Trellis quantization optimization b. Loop filtering c. Aggressive zero-binning, rounding and zero-bin boosting d. Mode based zero-bin boosting Lossless coding test was performed on all clips within the derf set, to verify that the commit has achieved lossless compression for all clips. The average compression ratio is around 2.57 to 1. (http://goo.gl/dEShs) Change-Id: Ia3aba7dd09df40dd590f93b9aba134defbc64e34
2012-06-14 04:03:31 +02:00
void vp9_iwht4x4_1_add_c(const int16_t *in, uint8_t *dest, int dest_stride) {
int i;
int a1, e1;
int16_t tmp[4];
const int16_t *ip = in;
int16_t *op = tmp;
a1 = ip[0] >> UNIT_QUANT_SHIFT;
e1 = a1 >> 1;
a1 -= e1;
op[0] = a1;
op[1] = op[2] = op[3] = e1;
ip = tmp;
for (i = 0; i < 4; i++) {
e1 = ip[0] >> 1;
a1 = ip[0] - e1;
dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] + a1);
dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] + e1);
dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] + e1);
dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] + e1);
ip++;
dest++;
}
}
Add lossless compression mode. This commit adds lossless compression capability to the experimental branch. The lossless experiment can be enabled using --enable-lossless in configure. When the experiment is enabled, the encoder will use lossless compression mode by command line option --lossless, and the decoder automatically recognizes a losslessly encoded clip and decodes accordingly. To achieve the lossless coding, this commit has changed the following: 1. To encode at lossless mode, encoder forces the use of unit quantizer, i.e, Q 0, where effective quantization is 1. Encoder also disables the usage of 8x8 transform and allows only 4x4 transform; 2. At Q 0, the first order 4x4 DCT/IDCT have been switched over to a pair of forward and inverse Walsh-Hadamard Transform (http://goo.gl/EIsfy), with proper scaling applied to match the range of the original 4x4 DCT/IDCT pair; 3. At Q 0, the second order remains to use the previous walsh-hadamard transform pair. However, to maintain the reversibility in second order transform at Q 0, scaling down is applied to first order DC coefficients prior to forward transform, and scaling up is applied to the second order output prior to quantization. Symmetric upscaling and downscaling are added around inverse second order transform; 4. At lossless mode, encoder also disables a number of minor features to ensure no loss is introduced, these features includes: a. Trellis quantization optimization b. Loop filtering c. Aggressive zero-binning, rounding and zero-bin boosting d. Mode based zero-bin boosting Lossless coding test was performed on all clips within the derf set, to verify that the commit has achieved lossless compression for all clips. The average compression ratio is around 2.57 to 1. (http://goo.gl/dEShs) Change-Id: Ia3aba7dd09df40dd590f93b9aba134defbc64e34
2012-06-14 04:03:31 +02:00
static void idct4_1d(const int16_t *input, int16_t *output) {
int16_t step[4];
int temp1, temp2;
// stage 1
temp1 = (input[0] + input[2]) * cospi_16_64;
temp2 = (input[0] - input[2]) * cospi_16_64;
step[0] = dct_const_round_shift(temp1);
step[1] = dct_const_round_shift(temp2);
temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
step[2] = dct_const_round_shift(temp1);
step[3] = dct_const_round_shift(temp2);
// stage 2
output[0] = step[0] + step[3];
output[1] = step[1] + step[2];
output[2] = step[1] - step[2];
output[3] = step[0] - step[3];
}
void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) {
int16_t out[4 * 4];
int16_t *outptr = out;
int i, j;
int16_t temp_in[4], temp_out[4];
// Rows
for (i = 0; i < 4; ++i) {
idct4_1d(input, outptr);
input += 4;
outptr += 4;
}
// Columns
for (i = 0; i < 4; ++i) {
for (j = 0; j < 4; ++j)
temp_in[j] = out[j * 4 + i];
idct4_1d(temp_in, temp_out);
for (j = 0; j < 4; ++j)
dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
+ dest[j * stride + i]);
}
}
void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride) {
int i;
int a1;
int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
out = dct_const_round_shift(out * cospi_16_64);
a1 = ROUND_POWER_OF_TWO(out, 4);
for (i = 0; i < 4; i++) {
dest[0] = clip_pixel(dest[0] + a1);
dest[1] = clip_pixel(dest[1] + a1);
dest[2] = clip_pixel(dest[2] + a1);
dest[3] = clip_pixel(dest[3] + a1);
dest += dest_stride;
}
}
static void idct8_1d(const int16_t *input, int16_t *output) {
int16_t step1[8], step2[8];
int temp1, temp2;
// stage 1
step1[0] = input[0];
step1[2] = input[4];
step1[1] = input[2];
step1[3] = input[6];
temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
step1[4] = dct_const_round_shift(temp1);
step1[7] = dct_const_round_shift(temp2);
temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
step1[5] = dct_const_round_shift(temp1);
step1[6] = dct_const_round_shift(temp2);
// stage 2 & stage 3 - even half
idct4_1d(step1, step1);
// stage 2 - odd half
step2[4] = step1[4] + step1[5];
step2[5] = step1[4] - step1[5];
step2[6] = -step1[6] + step1[7];
step2[7] = step1[6] + step1[7];
// stage 3 -odd half
step1[4] = step2[4];
temp1 = (step2[6] - step2[5]) * cospi_16_64;
temp2 = (step2[5] + step2[6]) * cospi_16_64;
step1[5] = dct_const_round_shift(temp1);
step1[6] = dct_const_round_shift(temp2);
step1[7] = step2[7];
// stage 4
output[0] = step1[0] + step1[7];
output[1] = step1[1] + step1[6];
output[2] = step1[2] + step1[5];
output[3] = step1[3] + step1[4];
output[4] = step1[3] - step1[4];
output[5] = step1[2] - step1[5];
output[6] = step1[1] - step1[6];
output[7] = step1[0] - step1[7];
}
void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride) {
int16_t out[8 * 8];
int16_t *outptr = out;
int i, j;
int16_t temp_in[8], temp_out[8];
// First transform rows
for (i = 0; i < 8; ++i) {
idct8_1d(input, outptr);
input += 8;
outptr += 8;
}
// Then transform columns
for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j)
temp_in[j] = out[j * 8 + i];
idct8_1d(temp_in, temp_out);
for (j = 0; j < 8; ++j)
dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+ dest[j * stride + i]);
}
}
void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
int i, j;
int a1;
int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
out = dct_const_round_shift(out * cospi_16_64);
a1 = ROUND_POWER_OF_TWO(out, 5);
for (j = 0; j < 8; ++j) {
for (i = 0; i < 8; ++i)
dest[i] = clip_pixel(dest[i] + a1);
dest += stride;
}
}
static void iadst4_1d(const int16_t *input, int16_t *output) {
int s0, s1, s2, s3, s4, s5, s6, s7;
int x0 = input[0];
int x1 = input[1];
int x2 = input[2];
int x3 = input[3];
if (!(x0 | x1 | x2 | x3)) {
output[0] = output[1] = output[2] = output[3] = 0;
return;
}
s0 = sinpi_1_9 * x0;
s1 = sinpi_2_9 * x0;
s2 = sinpi_3_9 * x1;
s3 = sinpi_4_9 * x2;
s4 = sinpi_1_9 * x2;
s5 = sinpi_2_9 * x3;
s6 = sinpi_4_9 * x3;
s7 = x0 - x2 + x3;
x0 = s0 + s3 + s5;
x1 = s1 - s4 - s6;
x2 = sinpi_3_9 * s7;
x3 = s2;
s0 = x0 + x3;
s1 = x1 + x3;
s2 = x2;
s3 = x0 + x1 - x3;
// 1-D transform scaling factor is sqrt(2).
// The overall dynamic range is 14b (input) + 14b (multiplication scaling)
// + 1b (addition) = 29b.
// Hence the output bit depth is 15b.
output[0] = dct_const_round_shift(s0);
output[1] = dct_const_round_shift(s1);
output[2] = dct_const_round_shift(s2);
output[3] = dct_const_round_shift(s3);
}
void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride,
int tx_type) {
const transform_2d IHT_4[] = {
{ idct4_1d, idct4_1d }, // DCT_DCT = 0
{ iadst4_1d, idct4_1d }, // ADST_DCT = 1
{ idct4_1d, iadst4_1d }, // DCT_ADST = 2
{ iadst4_1d, iadst4_1d } // ADST_ADST = 3
};
int i, j;
int16_t out[4 * 4];
int16_t *outptr = out;
int16_t temp_in[4], temp_out[4];
// inverse transform row vectors
for (i = 0; i < 4; ++i) {
IHT_4[tx_type].rows(input, outptr);
input += 4;
outptr += 4;
}
// inverse transform column vectors
for (i = 0; i < 4; ++i) {
for (j = 0; j < 4; ++j)
temp_in[j] = out[j * 4 + i];
IHT_4[tx_type].cols(temp_in, temp_out);
for (j = 0; j < 4; ++j)
dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
+ dest[j * stride + i]);
}
}
static void iadst8_1d(const int16_t *input, int16_t *output) {
int s0, s1, s2, s3, s4, s5, s6, s7;
int x0 = input[7];
int x1 = input[0];
int x2 = input[5];
int x3 = input[2];
int x4 = input[3];
int x5 = input[4];
int x6 = input[1];
int x7 = input[6];
if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
output[0] = output[1] = output[2] = output[3] = output[4]
= output[5] = output[6] = output[7] = 0;
return;
}
// stage 1
s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
x0 = dct_const_round_shift(s0 + s4);
x1 = dct_const_round_shift(s1 + s5);
x2 = dct_const_round_shift(s2 + s6);
x3 = dct_const_round_shift(s3 + s7);
x4 = dct_const_round_shift(s0 - s4);
x5 = dct_const_round_shift(s1 - s5);
x6 = dct_const_round_shift(s2 - s6);
x7 = dct_const_round_shift(s3 - s7);
// stage 2
s0 = x0;
s1 = x1;
s2 = x2;
s3 = x3;
s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
x0 = s0 + s2;
x1 = s1 + s3;
x2 = s0 - s2;
x3 = s1 - s3;
x4 = dct_const_round_shift(s4 + s6);
x5 = dct_const_round_shift(s5 + s7);
x6 = dct_const_round_shift(s4 - s6);
x7 = dct_const_round_shift(s5 - s7);
// stage 3
s2 = cospi_16_64 * (x2 + x3);
s3 = cospi_16_64 * (x2 - x3);
s6 = cospi_16_64 * (x6 + x7);
s7 = cospi_16_64 * (x6 - x7);
x2 = dct_const_round_shift(s2);
x3 = dct_const_round_shift(s3);
x6 = dct_const_round_shift(s6);
x7 = dct_const_round_shift(s7);
output[0] = x0;
output[1] = -x4;
output[2] = x6;
output[3] = -x2;
output[4] = x3;
output[5] = -x7;
output[6] = x5;
output[7] = -x1;
}
static const transform_2d IHT_8[] = {
{ idct8_1d, idct8_1d }, // DCT_DCT = 0
{ iadst8_1d, idct8_1d }, // ADST_DCT = 1
{ idct8_1d, iadst8_1d }, // DCT_ADST = 2
{ iadst8_1d, iadst8_1d } // ADST_ADST = 3
};
void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride,
int tx_type) {
int i, j;
int16_t out[8 * 8];
int16_t *outptr = out;
int16_t temp_in[8], temp_out[8];
const transform_2d ht = IHT_8[tx_type];
// inverse transform row vectors
for (i = 0; i < 8; ++i) {
ht.rows(input, outptr);
input += 8;
outptr += 8;
}
// inverse transform column vectors
for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j)
temp_in[j] = out[j * 8 + i];
ht.cols(temp_in, temp_out);
for (j = 0; j < 8; ++j)
dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+ dest[j * stride + i]);
}
}
void vp9_idct8x8_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
int16_t out[8 * 8] = { 0 };
int16_t *outptr = out;
int i, j;
int16_t temp_in[8], temp_out[8];
// First transform rows
// only first 4 row has non-zero coefs
for (i = 0; i < 4; ++i) {
idct8_1d(input, outptr);
input += 8;
outptr += 8;
}
// Then transform columns
for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j)
temp_in[j] = out[j * 8 + i];
idct8_1d(temp_in, temp_out);
for (j = 0; j < 8; ++j)
dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+ dest[j * stride + i]);
}
}
static void idct16_1d(const int16_t *input, int16_t *output) {
int16_t step1[16], step2[16];
int temp1, temp2;
// stage 1
step1[0] = input[0/2];
step1[1] = input[16/2];
step1[2] = input[8/2];
step1[3] = input[24/2];
step1[4] = input[4/2];
step1[5] = input[20/2];
step1[6] = input[12/2];
step1[7] = input[28/2];
step1[8] = input[2/2];
step1[9] = input[18/2];
step1[10] = input[10/2];
step1[11] = input[26/2];
step1[12] = input[6/2];
step1[13] = input[22/2];
step1[14] = input[14/2];
step1[15] = input[30/2];
// stage 2
step2[0] = step1[0];
step2[1] = step1[1];
step2[2] = step1[2];
step2[3] = step1[3];
step2[4] = step1[4];
step2[5] = step1[5];
step2[6] = step1[6];
step2[7] = step1[7];
temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
step2[8] = dct_const_round_shift(temp1);
step2[15] = dct_const_round_shift(temp2);
temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
step2[9] = dct_const_round_shift(temp1);
step2[14] = dct_const_round_shift(temp2);
temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
step2[10] = dct_const_round_shift(temp1);
step2[13] = dct_const_round_shift(temp2);
temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
step2[11] = dct_const_round_shift(temp1);
step2[12] = dct_const_round_shift(temp2);
// stage 3
step1[0] = step2[0];
step1[1] = step2[1];
step1[2] = step2[2];
step1[3] = step2[3];
temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
step1[4] = dct_const_round_shift(temp1);
step1[7] = dct_const_round_shift(temp2);
temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
step1[5] = dct_const_round_shift(temp1);
step1[6] = dct_const_round_shift(temp2);
step1[8] = step2[8] + step2[9];
step1[9] = step2[8] - step2[9];
step1[10] = -step2[10] + step2[11];
step1[11] = step2[10] + step2[11];
step1[12] = step2[12] + step2[13];
step1[13] = step2[12] - step2[13];
step1[14] = -step2[14] + step2[15];
step1[15] = step2[14] + step2[15];
// stage 4
temp1 = (step1[0] + step1[1]) * cospi_16_64;
temp2 = (step1[0] - step1[1]) * cospi_16_64;
step2[0] = dct_const_round_shift(temp1);
step2[1] = dct_const_round_shift(temp2);
temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
step2[2] = dct_const_round_shift(temp1);
step2[3] = dct_const_round_shift(temp2);
step2[4] = step1[4] + step1[5];
step2[5] = step1[4] - step1[5];
step2[6] = -step1[6] + step1[7];
step2[7] = step1[6] + step1[7];
step2[8] = step1[8];
step2[15] = step1[15];
temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
step2[9] = dct_const_round_shift(temp1);
step2[14] = dct_const_round_shift(temp2);
temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
step2[10] = dct_const_round_shift(temp1);
step2[13] = dct_const_round_shift(temp2);
step2[11] = step1[11];
step2[12] = step1[12];
// stage 5
step1[0] = step2[0] + step2[3];
step1[1] = step2[1] + step2[2];
step1[2] = step2[1] - step2[2];
step1[3] = step2[0] - step2[3];
step1[4] = step2[4];
temp1 = (step2[6] - step2[5]) * cospi_16_64;
temp2 = (step2[5] + step2[6]) * cospi_16_64;
step1[5] = dct_const_round_shift(temp1);
step1[6] = dct_const_round_shift(temp2);
step1[7] = step2[7];
step1[8] = step2[8] + step2[11];
step1[9] = step2[9] + step2[10];
step1[10] = step2[9] - step2[10];
step1[11] = step2[8] - step2[11];
step1[12] = -step2[12] + step2[15];
step1[13] = -step2[13] + step2[14];
step1[14] = step2[13] + step2[14];
step1[15] = step2[12] + step2[15];
// stage 6
step2[0] = step1[0] + step1[7];
step2[1] = step1[1] + step1[6];
step2[2] = step1[2] + step1[5];
step2[3] = step1[3] + step1[4];
step2[4] = step1[3] - step1[4];
step2[5] = step1[2] - step1[5];
step2[6] = step1[1] - step1[6];
step2[7] = step1[0] - step1[7];
step2[8] = step1[8];
step2[9] = step1[9];
temp1 = (-step1[10] + step1[13]) * cospi_16_64;
temp2 = (step1[10] + step1[13]) * cospi_16_64;
step2[10] = dct_const_round_shift(temp1);
step2[13] = dct_const_round_shift(temp2);
temp1 = (-step1[11] + step1[12]) * cospi_16_64;
temp2 = (step1[11] + step1[12]) * cospi_16_64;
step2[11] = dct_const_round_shift(temp1);
step2[12] = dct_const_round_shift(temp2);
step2[14] = step1[14];
step2[15] = step1[15];
// stage 7
output[0] = step2[0] + step2[15];
output[1] = step2[1] + step2[14];
output[2] = step2[2] + step2[13];
output[3] = step2[3] + step2[12];
output[4] = step2[4] + step2[11];
output[5] = step2[5] + step2[10];
output[6] = step2[6] + step2[9];
output[7] = step2[7] + step2[8];
output[8] = step2[7] - step2[8];
output[9] = step2[6] - step2[9];
output[10] = step2[5] - step2[10];
output[11] = step2[4] - step2[11];
output[12] = step2[3] - step2[12];
output[13] = step2[2] - step2[13];
output[14] = step2[1] - step2[14];
output[15] = step2[0] - step2[15];
}
void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride) {
int16_t out[16 * 16];
int16_t *outptr = out;
int i, j;
int16_t temp_in[16], temp_out[16];
// First transform rows
for (i = 0; i < 16; ++i) {
idct16_1d(input, outptr);
input += 16;
outptr += 16;
}
// Then transform columns
for (i = 0; i < 16; ++i) {
for (j = 0; j < 16; ++j)
temp_in[j] = out[j * 16 + i];
idct16_1d(temp_in, temp_out);
for (j = 0; j < 16; ++j)
dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+ dest[j * stride + i]);
}
}
static void iadst16_1d(const int16_t *input, int16_t *output) {
int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
int x0 = input[15];
int x1 = input[0];
int x2 = input[13];
int x3 = input[2];
int x4 = input[11];
int x5 = input[4];
int x6 = input[9];
int x7 = input[6];
int x8 = input[7];
int x9 = input[8];
int x10 = input[5];
int x11 = input[10];
int x12 = input[3];
int x13 = input[12];
int x14 = input[1];
int x15 = input[14];
if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
| x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
output[0] = output[1] = output[2] = output[3] = output[4]
= output[5] = output[6] = output[7] = output[8]
= output[9] = output[10] = output[11] = output[12]
= output[13] = output[14] = output[15] = 0;
return;
}
// stage 1
s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
x0 = dct_const_round_shift(s0 + s8);
x1 = dct_const_round_shift(s1 + s9);
x2 = dct_const_round_shift(s2 + s10);
x3 = dct_const_round_shift(s3 + s11);
x4 = dct_const_round_shift(s4 + s12);
x5 = dct_const_round_shift(s5 + s13);
x6 = dct_const_round_shift(s6 + s14);
x7 = dct_const_round_shift(s7 + s15);
x8 = dct_const_round_shift(s0 - s8);
x9 = dct_const_round_shift(s1 - s9);
x10 = dct_const_round_shift(s2 - s10);
x11 = dct_const_round_shift(s3 - s11);
x12 = dct_const_round_shift(s4 - s12);
x13 = dct_const_round_shift(s5 - s13);
x14 = dct_const_round_shift(s6 - s14);
x15 = dct_const_round_shift(s7 - s15);
// stage 2
s0 = x0;
s1 = x1;
s2 = x2;
s3 = x3;
s4 = x4;
s5 = x5;
s6 = x6;
s7 = x7;
s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
x0 = s0 + s4;
x1 = s1 + s5;
x2 = s2 + s6;
x3 = s3 + s7;
x4 = s0 - s4;
x5 = s1 - s5;
x6 = s2 - s6;
x7 = s3 - s7;
x8 = dct_const_round_shift(s8 + s12);
x9 = dct_const_round_shift(s9 + s13);
x10 = dct_const_round_shift(s10 + s14);
x11 = dct_const_round_shift(s11 + s15);
x12 = dct_const_round_shift(s8 - s12);
x13 = dct_const_round_shift(s9 - s13);
x14 = dct_const_round_shift(s10 - s14);
x15 = dct_const_round_shift(s11 - s15);
// stage 3
s0 = x0;
s1 = x1;
s2 = x2;
s3 = x3;
s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
s8 = x8;
s9 = x9;
s10 = x10;
s11 = x11;
s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
x0 = s0 + s2;
x1 = s1 + s3;
x2 = s0 - s2;
x3 = s1 - s3;
x4 = dct_const_round_shift(s4 + s6);
x5 = dct_const_round_shift(s5 + s7);
x6 = dct_const_round_shift(s4 - s6);
x7 = dct_const_round_shift(s5 - s7);
x8 = s8 + s10;
x9 = s9 + s11;
x10 = s8 - s10;
x11 = s9 - s11;
x12 = dct_const_round_shift(s12 + s14);
x13 = dct_const_round_shift(s13 + s15);
x14 = dct_const_round_shift(s12 - s14);
x15 = dct_const_round_shift(s13 - s15);
// stage 4
s2 = (- cospi_16_64) * (x2 + x3);
s3 = cospi_16_64 * (x2 - x3);
s6 = cospi_16_64 * (x6 + x7);
s7 = cospi_16_64 * (- x6 + x7);
s10 = cospi_16_64 * (x10 + x11);
s11 = cospi_16_64 * (- x10 + x11);
s14 = (- cospi_16_64) * (x14 + x15);
s15 = cospi_16_64 * (x14 - x15);
x2 = dct_const_round_shift(s2);
x3 = dct_const_round_shift(s3);
x6 = dct_const_round_shift(s6);
x7 = dct_const_round_shift(s7);
x10 = dct_const_round_shift(s10);
x11 = dct_const_round_shift(s11);
x14 = dct_const_round_shift(s14);
x15 = dct_const_round_shift(s15);
output[0] = x0;
output[1] = -x8;
output[2] = x12;
output[3] = -x4;
output[4] = x6;
output[5] = x14;
output[6] = x10;
output[7] = x2;
output[8] = x3;
output[9] = x11;
output[10] = x15;
output[11] = x7;
output[12] = x5;
output[13] = -x13;
output[14] = x9;
output[15] = -x1;
}
static const transform_2d IHT_16[] = {
{ idct16_1d, idct16_1d }, // DCT_DCT = 0
{ iadst16_1d, idct16_1d }, // ADST_DCT = 1
{ idct16_1d, iadst16_1d }, // DCT_ADST = 2
{ iadst16_1d, iadst16_1d } // ADST_ADST = 3
};
void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride,
int tx_type) {
int i, j;
int16_t out[16 * 16];
int16_t *outptr = out;
int16_t temp_in[16], temp_out[16];
const transform_2d ht = IHT_16[tx_type];
// Rows
for (i = 0; i < 16; ++i) {
ht.rows(input, outptr);
input += 16;
outptr += 16;
}
// Columns
for (i = 0; i < 16; ++i) {
for (j = 0; j < 16; ++j)
temp_in[j] = out[j * 16 + i];
ht.cols(temp_in, temp_out);
for (j = 0; j < 16; ++j)
dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+ dest[j * stride + i]);
}
}
void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
int16_t out[16 * 16] = { 0 };
int16_t *outptr = out;
int i, j;
int16_t temp_in[16], temp_out[16];
32x32 transform for superblocks. This adds Debargha's DCT/DWT hybrid and a regular 32x32 DCT, and adds code all over the place to wrap that in the bitstream/encoder/decoder/RD. Some implementation notes (these probably need careful review): - token range is extended by 1 bit, since the value range out of this transform is [-16384,16383]. - the coefficients coming out of the FDCT are manually scaled back by 1 bit, or else they won't fit in int16_t (they are 17 bits). Because of this, the RD error scoring does not right-shift the MSE score by two (unlike for 4x4/8x8/16x16). - to compensate for this loss in precision, the quantizer is halved also. This is currently a little hacky. - FDCT and IDCT is double-only right now. Needs a fixed-point impl. - There are no default probabilities for the 32x32 transform yet; I'm simply using the 16x16 luma ones. A future commit will add newly generated probabilities for all transforms. - No ADST version. I don't think we'll add one for this level; if an ADST is desired, transform-size selection can scale back to 16x16 or lower, and use an ADST at that level. Additional notes specific to Debargha's DWT/DCT hybrid: - coefficient scale is different for the top/left 16x16 (DCT-over-DWT) block than for the rest (DWT pixel differences) of the block. Therefore, RD error scoring isn't easily scalable between coefficient and pixel domain. Thus, unfortunately, we need to compute the RD distortion in the pixel domain until we figure out how to scale these appropriately. Change-Id: I00386f20f35d7fabb19aba94c8162f8aee64ef2b
2012-12-07 23:45:05 +01:00
// First transform rows. Since all non-zero dct coefficients are in
// upper-left 4x4 area, we only need to calculate first 4 rows here.
for (i = 0; i < 4; ++i) {
idct16_1d(input, outptr);
input += 16;
outptr += 16;
}
// Then transform columns
for (i = 0; i < 16; ++i) {
for (j = 0; j < 16; ++j)
temp_in[j] = out[j*16 + i];
idct16_1d(temp_in, temp_out);
for (j = 0; j < 16; ++j)
dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+ dest[j * stride + i]);
}
}
void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
int i, j;
int a1;
int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
out = dct_const_round_shift(out * cospi_16_64);
a1 = ROUND_POWER_OF_TWO(out, 6);
for (j = 0; j < 16; ++j) {
for (i = 0; i < 16; ++i)
dest[i] = clip_pixel(dest[i] + a1);
dest += stride;
}
}
static void idct32_1d(const int16_t *input, int16_t *output) {
int16_t step1[32], step2[32];
int temp1, temp2;
32x32 transform for superblocks. This adds Debargha's DCT/DWT hybrid and a regular 32x32 DCT, and adds code all over the place to wrap that in the bitstream/encoder/decoder/RD. Some implementation notes (these probably need careful review): - token range is extended by 1 bit, since the value range out of this transform is [-16384,16383]. - the coefficients coming out of the FDCT are manually scaled back by 1 bit, or else they won't fit in int16_t (they are 17 bits). Because of this, the RD error scoring does not right-shift the MSE score by two (unlike for 4x4/8x8/16x16). - to compensate for this loss in precision, the quantizer is halved also. This is currently a little hacky. - FDCT and IDCT is double-only right now. Needs a fixed-point impl. - There are no default probabilities for the 32x32 transform yet; I'm simply using the 16x16 luma ones. A future commit will add newly generated probabilities for all transforms. - No ADST version. I don't think we'll add one for this level; if an ADST is desired, transform-size selection can scale back to 16x16 or lower, and use an ADST at that level. Additional notes specific to Debargha's DWT/DCT hybrid: - coefficient scale is different for the top/left 16x16 (DCT-over-DWT) block than for the rest (DWT pixel differences) of the block. Therefore, RD error scoring isn't easily scalable between coefficient and pixel domain. Thus, unfortunately, we need to compute the RD distortion in the pixel domain until we figure out how to scale these appropriately. Change-Id: I00386f20f35d7fabb19aba94c8162f8aee64ef2b
2012-12-07 23:45:05 +01:00
// stage 1
step1[0] = input[0];
step1[1] = input[16];
step1[2] = input[8];
step1[3] = input[24];
step1[4] = input[4];
step1[5] = input[20];
step1[6] = input[12];
step1[7] = input[28];
step1[8] = input[2];
step1[9] = input[18];
step1[10] = input[10];
step1[11] = input[26];
step1[12] = input[6];
step1[13] = input[22];
step1[14] = input[14];
step1[15] = input[30];
temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
step1[16] = dct_const_round_shift(temp1);
step1[31] = dct_const_round_shift(temp2);
temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
step1[17] = dct_const_round_shift(temp1);
step1[30] = dct_const_round_shift(temp2);
temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
step1[18] = dct_const_round_shift(temp1);
step1[29] = dct_const_round_shift(temp2);
temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
step1[19] = dct_const_round_shift(temp1);
step1[28] = dct_const_round_shift(temp2);
temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
step1[20] = dct_const_round_shift(temp1);
step1[27] = dct_const_round_shift(temp2);
temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
step1[21] = dct_const_round_shift(temp1);
step1[26] = dct_const_round_shift(temp2);
temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
step1[22] = dct_const_round_shift(temp1);
step1[25] = dct_const_round_shift(temp2);
temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
step1[23] = dct_const_round_shift(temp1);
step1[24] = dct_const_round_shift(temp2);
// stage 2
step2[0] = step1[0];
step2[1] = step1[1];
step2[2] = step1[2];
step2[3] = step1[3];
step2[4] = step1[4];
step2[5] = step1[5];
step2[6] = step1[6];
step2[7] = step1[7];
temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
step2[8] = dct_const_round_shift(temp1);
step2[15] = dct_const_round_shift(temp2);
temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
step2[9] = dct_const_round_shift(temp1);
step2[14] = dct_const_round_shift(temp2);
temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
step2[10] = dct_const_round_shift(temp1);
step2[13] = dct_const_round_shift(temp2);
temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
step2[11] = dct_const_round_shift(temp1);
step2[12] = dct_const_round_shift(temp2);
step2[16] = step1[16] + step1[17];
step2[17] = step1[16] - step1[17];
step2[18] = -step1[18] + step1[19];
step2[19] = step1[18] + step1[19];
step2[20] = step1[20] + step1[21];
step2[21] = step1[20] - step1[21];
step2[22] = -step1[22] + step1[23];
step2[23] = step1[22] + step1[23];
step2[24] = step1[24] + step1[25];
step2[25] = step1[24] - step1[25];
step2[26] = -step1[26] + step1[27];
step2[27] = step1[26] + step1[27];
step2[28] = step1[28] + step1[29];
step2[29] = step1[28] - step1[29];
step2[30] = -step1[30] + step1[31];
step2[31] = step1[30] + step1[31];
// stage 3
step1[0] = step2[0];
step1[1] = step2[1];
step1[2] = step2[2];
step1[3] = step2[3];
temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
step1[4] = dct_const_round_shift(temp1);
step1[7] = dct_const_round_shift(temp2);
temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
step1[5] = dct_const_round_shift(temp1);
step1[6] = dct_const_round_shift(temp2);
step1[8] = step2[8] + step2[9];
step1[9] = step2[8] - step2[9];
step1[10] = -step2[10] + step2[11];
step1[11] = step2[10] + step2[11];
step1[12] = step2[12] + step2[13];
step1[13] = step2[12] - step2[13];
step1[14] = -step2[14] + step2[15];
step1[15] = step2[14] + step2[15];
step1[16] = step2[16];
step1[31] = step2[31];
temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
step1[17] = dct_const_round_shift(temp1);
step1[30] = dct_const_round_shift(temp2);
temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
step1[18] = dct_const_round_shift(temp1);
step1[29] = dct_const_round_shift(temp2);
step1[19] = step2[19];
step1[20] = step2[20];
temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
step1[21] = dct_const_round_shift(temp1);
step1[26] = dct_const_round_shift(temp2);
temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
step1[22] = dct_const_round_shift(temp1);
step1[25] = dct_const_round_shift(temp2);
step1[23] = step2[23];
step1[24] = step2[24];
step1[27] = step2[27];
step1[28] = step2[28];
// stage 4
temp1 = (step1[0] + step1[1]) * cospi_16_64;
temp2 = (step1[0] - step1[1]) * cospi_16_64;
step2[0] = dct_const_round_shift(temp1);
step2[1] = dct_const_round_shift(temp2);
temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
step2[2] = dct_const_round_shift(temp1);
step2[3] = dct_const_round_shift(temp2);
step2[4] = step1[4] + step1[5];
step2[5] = step1[4] - step1[5];
step2[6] = -step1[6] + step1[7];
step2[7] = step1[6] + step1[7];
step2[8] = step1[8];
step2[15] = step1[15];
temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
step2[9] = dct_const_round_shift(temp1);
step2[14] = dct_const_round_shift(temp2);
temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
step2[10] = dct_const_round_shift(temp1);
step2[13] = dct_const_round_shift(temp2);
step2[11] = step1[11];
step2[12] = step1[12];
step2[16] = step1[16] + step1[19];
step2[17] = step1[17] + step1[18];
step2[18] = step1[17] - step1[18];
step2[19] = step1[16] - step1[19];
step2[20] = -step1[20] + step1[23];
step2[21] = -step1[21] + step1[22];
step2[22] = step1[21] + step1[22];
step2[23] = step1[20] + step1[23];
step2[24] = step1[24] + step1[27];
step2[25] = step1[25] + step1[26];
step2[26] = step1[25] - step1[26];
step2[27] = step1[24] - step1[27];
step2[28] = -step1[28] + step1[31];
step2[29] = -step1[29] + step1[30];
step2[30] = step1[29] + step1[30];
step2[31] = step1[28] + step1[31];
// stage 5
step1[0] = step2[0] + step2[3];
step1[1] = step2[1] + step2[2];
step1[2] = step2[1] - step2[2];
step1[3] = step2[0] - step2[3];
step1[4] = step2[4];
temp1 = (step2[6] - step2[5]) * cospi_16_64;
temp2 = (step2[5] + step2[6]) * cospi_16_64;
step1[5] = dct_const_round_shift(temp1);
step1[6] = dct_const_round_shift(temp2);
step1[7] = step2[7];
step1[8] = step2[8] + step2[11];
step1[9] = step2[9] + step2[10];
step1[10] = step2[9] - step2[10];
step1[11] = step2[8] - step2[11];
step1[12] = -step2[12] + step2[15];
step1[13] = -step2[13] + step2[14];
step1[14] = step2[13] + step2[14];
step1[15] = step2[12] + step2[15];
step1[16] = step2[16];
step1[17] = step2[17];
temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
step1[18] = dct_const_round_shift(temp1);
step1[29] = dct_const_round_shift(temp2);
temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
step1[19] = dct_const_round_shift(temp1);
step1[28] = dct_const_round_shift(temp2);
temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
step1[20] = dct_const_round_shift(temp1);
step1[27] = dct_const_round_shift(temp2);
temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
step1[21] = dct_const_round_shift(temp1);
step1[26] = dct_const_round_shift(temp2);
step1[22] = step2[22];
step1[23] = step2[23];
step1[24] = step2[24];
step1[25] = step2[25];
step1[30] = step2[30];
step1[31] = step2[31];
// stage 6
step2[0] = step1[0] + step1[7];
step2[1] = step1[1] + step1[6];
step2[2] = step1[2] + step1[5];
step2[3] = step1[3] + step1[4];
step2[4] = step1[3] - step1[4];
step2[5] = step1[2] - step1[5];
step2[6] = step1[1] - step1[6];
step2[7] = step1[0] - step1[7];
step2[8] = step1[8];
step2[9] = step1[9];
temp1 = (-step1[10] + step1[13]) * cospi_16_64;
temp2 = (step1[10] + step1[13]) * cospi_16_64;
step2[10] = dct_const_round_shift(temp1);
step2[13] = dct_const_round_shift(temp2);
temp1 = (-step1[11] + step1[12]) * cospi_16_64;
temp2 = (step1[11] + step1[12]) * cospi_16_64;
step2[11] = dct_const_round_shift(temp1);
step2[12] = dct_const_round_shift(temp2);
step2[14] = step1[14];
step2[15] = step1[15];
step2[16] = step1[16] + step1[23];
step2[17] = step1[17] + step1[22];
step2[18] = step1[18] + step1[21];
step2[19] = step1[19] + step1[20];
step2[20] = step1[19] - step1[20];
step2[21] = step1[18] - step1[21];
step2[22] = step1[17] - step1[22];
step2[23] = step1[16] - step1[23];
step2[24] = -step1[24] + step1[31];
step2[25] = -step1[25] + step1[30];
step2[26] = -step1[26] + step1[29];
step2[27] = -step1[27] + step1[28];
step2[28] = step1[27] + step1[28];
step2[29] = step1[26] + step1[29];
step2[30] = step1[25] + step1[30];
step2[31] = step1[24] + step1[31];
// stage 7
step1[0] = step2[0] + step2[15];
step1[1] = step2[1] + step2[14];
step1[2] = step2[2] + step2[13];
step1[3] = step2[3] + step2[12];
step1[4] = step2[4] + step2[11];
step1[5] = step2[5] + step2[10];
step1[6] = step2[6] + step2[9];
step1[7] = step2[7] + step2[8];
step1[8] = step2[7] - step2[8];
step1[9] = step2[6] - step2[9];
step1[10] = step2[5] - step2[10];
step1[11] = step2[4] - step2[11];
step1[12] = step2[3] - step2[12];
step1[13] = step2[2] - step2[13];
step1[14] = step2[1] - step2[14];
step1[15] = step2[0] - step2[15];
step1[16] = step2[16];
step1[17] = step2[17];
step1[18] = step2[18];
step1[19] = step2[19];
temp1 = (-step2[20] + step2[27]) * cospi_16_64;
temp2 = (step2[20] + step2[27]) * cospi_16_64;
step1[20] = dct_const_round_shift(temp1);
step1[27] = dct_const_round_shift(temp2);
temp1 = (-step2[21] + step2[26]) * cospi_16_64;
temp2 = (step2[21] + step2[26]) * cospi_16_64;
step1[21] = dct_const_round_shift(temp1);
step1[26] = dct_const_round_shift(temp2);
temp1 = (-step2[22] + step2[25]) * cospi_16_64;
temp2 = (step2[22] + step2[25]) * cospi_16_64;
step1[22] = dct_const_round_shift(temp1);
step1[25] = dct_const_round_shift(temp2);
temp1 = (-step2[23] + step2[24]) * cospi_16_64;
temp2 = (step2[23] + step2[24]) * cospi_16_64;
step1[23] = dct_const_round_shift(temp1);
step1[24] = dct_const_round_shift(temp2);
step1[28] = step2[28];
step1[29] = step2[29];
step1[30] = step2[30];
step1[31] = step2[31];
// final stage
output[0] = step1[0] + step1[31];
output[1] = step1[1] + step1[30];
output[2] = step1[2] + step1[29];
output[3] = step1[3] + step1[28];
output[4] = step1[4] + step1[27];
output[5] = step1[5] + step1[26];
output[6] = step1[6] + step1[25];
output[7] = step1[7] + step1[24];
output[8] = step1[8] + step1[23];
output[9] = step1[9] + step1[22];
output[10] = step1[10] + step1[21];
output[11] = step1[11] + step1[20];
output[12] = step1[12] + step1[19];
output[13] = step1[13] + step1[18];
output[14] = step1[14] + step1[17];
output[15] = step1[15] + step1[16];
output[16] = step1[15] - step1[16];
output[17] = step1[14] - step1[17];
output[18] = step1[13] - step1[18];
output[19] = step1[12] - step1[19];
output[20] = step1[11] - step1[20];
output[21] = step1[10] - step1[21];
output[22] = step1[9] - step1[22];
output[23] = step1[8] - step1[23];
output[24] = step1[7] - step1[24];
output[25] = step1[6] - step1[25];
output[26] = step1[5] - step1[26];
output[27] = step1[4] - step1[27];
output[28] = step1[3] - step1[28];
output[29] = step1[2] - step1[29];
output[30] = step1[1] - step1[30];
output[31] = step1[0] - step1[31];
32x32 transform for superblocks. This adds Debargha's DCT/DWT hybrid and a regular 32x32 DCT, and adds code all over the place to wrap that in the bitstream/encoder/decoder/RD. Some implementation notes (these probably need careful review): - token range is extended by 1 bit, since the value range out of this transform is [-16384,16383]. - the coefficients coming out of the FDCT are manually scaled back by 1 bit, or else they won't fit in int16_t (they are 17 bits). Because of this, the RD error scoring does not right-shift the MSE score by two (unlike for 4x4/8x8/16x16). - to compensate for this loss in precision, the quantizer is halved also. This is currently a little hacky. - FDCT and IDCT is double-only right now. Needs a fixed-point impl. - There are no default probabilities for the 32x32 transform yet; I'm simply using the 16x16 luma ones. A future commit will add newly generated probabilities for all transforms. - No ADST version. I don't think we'll add one for this level; if an ADST is desired, transform-size selection can scale back to 16x16 or lower, and use an ADST at that level. Additional notes specific to Debargha's DWT/DCT hybrid: - coefficient scale is different for the top/left 16x16 (DCT-over-DWT) block than for the rest (DWT pixel differences) of the block. Therefore, RD error scoring isn't easily scalable between coefficient and pixel domain. Thus, unfortunately, we need to compute the RD distortion in the pixel domain until we figure out how to scale these appropriately. Change-Id: I00386f20f35d7fabb19aba94c8162f8aee64ef2b
2012-12-07 23:45:05 +01:00
}
void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int stride) {
int16_t out[32 * 32];
int16_t *outptr = out;
int i, j;
int16_t temp_in[32], temp_out[32];
// Rows
for (i = 0; i < 32; ++i) {
int16_t zero_coeff[16];
for (j = 0; j < 16; ++j)
zero_coeff[j] = input[2 * j] | input[2 * j + 1];
for (j = 0; j < 8; ++j)
zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
for (j = 0; j < 4; ++j)
zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
for (j = 0; j < 2; ++j)
zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
if (zero_coeff[0] | zero_coeff[1])
idct32_1d(input, outptr);
else
vpx_memset(outptr, 0, sizeof(int16_t) * 32);
input += 32;
outptr += 32;
}
// Columns
for (i = 0; i < 32; ++i) {
for (j = 0; j < 32; ++j)
temp_in[j] = out[j * 32 + i];
idct32_1d(temp_in, temp_out);
for (j = 0; j < 32; ++j)
dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+ dest[j * stride + i]);
32x32 transform for superblocks. This adds Debargha's DCT/DWT hybrid and a regular 32x32 DCT, and adds code all over the place to wrap that in the bitstream/encoder/decoder/RD. Some implementation notes (these probably need careful review): - token range is extended by 1 bit, since the value range out of this transform is [-16384,16383]. - the coefficients coming out of the FDCT are manually scaled back by 1 bit, or else they won't fit in int16_t (they are 17 bits). Because of this, the RD error scoring does not right-shift the MSE score by two (unlike for 4x4/8x8/16x16). - to compensate for this loss in precision, the quantizer is halved also. This is currently a little hacky. - FDCT and IDCT is double-only right now. Needs a fixed-point impl. - There are no default probabilities for the 32x32 transform yet; I'm simply using the 16x16 luma ones. A future commit will add newly generated probabilities for all transforms. - No ADST version. I don't think we'll add one for this level; if an ADST is desired, transform-size selection can scale back to 16x16 or lower, and use an ADST at that level. Additional notes specific to Debargha's DWT/DCT hybrid: - coefficient scale is different for the top/left 16x16 (DCT-over-DWT) block than for the rest (DWT pixel differences) of the block. Therefore, RD error scoring isn't easily scalable between coefficient and pixel domain. Thus, unfortunately, we need to compute the RD distortion in the pixel domain until we figure out how to scale these appropriately. Change-Id: I00386f20f35d7fabb19aba94c8162f8aee64ef2b
2012-12-07 23:45:05 +01:00
}
}
void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int stride) {
int16_t out[32 * 32] = {0};
int16_t *outptr = out;
int i, j;
int16_t temp_in[32], temp_out[32];
// Rows
// only upper-left 8x8 has non-zero coeff
for (i = 0; i < 8; ++i) {
idct32_1d(input, outptr);
input += 32;
outptr += 32;
}
// Columns
for (i = 0; i < 32; ++i) {
for (j = 0; j < 32; ++j)
temp_in[j] = out[j * 32 + i];
idct32_1d(temp_in, temp_out);
for (j = 0; j < 32; ++j)
dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+ dest[j * stride + i]);
}
}
void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
int i, j;
int a1;
int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
out = dct_const_round_shift(out * cospi_16_64);
a1 = ROUND_POWER_OF_TWO(out, 6);
for (j = 0; j < 32; ++j) {
for (i = 0; i < 32; ++i)
dest[i] = clip_pixel(dest[i] + a1);
dest += stride;
}
}
// idct
void vp9_idct4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
if (eob > 1)
vp9_idct4x4_16_add(input, dest, stride);
else
vp9_idct4x4_1_add(input, dest, stride);
}
void vp9_iwht4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
if (eob > 1)
vp9_iwht4x4_16_add(input, dest, stride);
else
vp9_iwht4x4_1_add(input, dest, stride);
}
void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
// If dc is 1, then input[0] is the reconstructed value, do not need
// dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1.
// The calculation can be simplified if there are not many non-zero dct
// coefficients. Use eobs to decide what to do.
// TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c.
// Combine that with code here.
if (eob == 1)
// DC only DCT coefficient
vp9_idct8x8_1_add(input, dest, stride);
else if (eob <= 10)
vp9_idct8x8_10_add(input, dest, stride);
else
vp9_idct8x8_64_add(input, dest, stride);
}
void vp9_idct16x16_add(const int16_t *input, uint8_t *dest, int stride,
int eob) {
/* The calculation can be simplified if there are not many non-zero dct
* coefficients. Use eobs to separate different cases. */
if (eob == 1)
/* DC only DCT coefficient. */
vp9_idct16x16_1_add(input, dest, stride);
else if (eob <= 10)
vp9_idct16x16_10_add(input, dest, stride);
else
vp9_idct16x16_256_add(input, dest, stride);
}
void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride,
int eob) {
if (eob == 1)
vp9_idct32x32_1_add(input, dest, stride);
else if (eob <= 34)
// non-zero coeff only in upper-left 8x8
vp9_idct32x32_34_add(input, dest, stride);
else
vp9_idct32x32_1024_add(input, dest, stride);
}
// iht
void vp9_iht4x4_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
int stride, int eob) {
if (tx_type == DCT_DCT)
vp9_idct4x4_add(input, dest, stride, eob);
else
vp9_iht4x4_16_add(input, dest, stride, tx_type);
}
void vp9_iht8x8_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
int stride, int eob) {
if (tx_type == DCT_DCT) {
vp9_idct8x8_add(input, dest, stride, eob);
} else {
vp9_iht8x8_64_add(input, dest, stride, tx_type);
}
}
void vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
int stride, int eob) {
if (tx_type == DCT_DCT) {
vp9_idct16x16_add(input, dest, stride, eob);
} else {
vp9_iht16x16_256_add(input, dest, stride, tx_type);
}
}