From d15e1da4940f813311035c3ed101a9c69f15b527 Mon Sep 17 00:00:00 2001 From: Jingning Han Date: Tue, 5 Feb 2013 12:37:13 -0800 Subject: [PATCH] Butterfly ADST based hybrid transform Refactor the 8x8 inverse hybrid transform. It is now consistent with the new inverse DCT. Overall performance loss (due to the use of this variant ADST, and the rounding errors in the butterfly implementation) for std-hd is -0.02. Fixed BUILD warning. Devise a variant of the original ADST, which allows a butterfly computation structure. This new transform has a kernel of the form: sin(pi*(2k+1)*(2n+1) / (4N)). One of its butterfly structures using floating-point multiplications was reported in Z. Wang, "Fast algorithms for the discrete W transform and for the discrete Fourier transform", IEEE Trans. on ASSP, 1984. This patch includes the butterfly implementation of the inverse ADST/DCT hybrid transform of dimension 8x8. Change-Id: I3533cb715f749343a80b9087ce34b3e776d1581d --- configure | 1 + vp9/common/vp9_blockd.h | 4 +- vp9/common/vp9_idctllm.c | 170 +++++++++++++++++++++++++++++++++- vp9/common/vp9_invtrans.c | 10 ++ vp9/common/vp9_rtcd_defs.sh | 5 + vp9/decoder/vp9_dequantize.c | 5 +- vp9/encoder/vp9_dct.c | 58 ++++++++++++ vp9/encoder/vp9_encodeintra.c | 6 ++ vpxenc.c | 1 - 9 files changed, 255 insertions(+), 5 deletions(-) diff --git a/configure b/configure index 46919bd3a..04090786f 100755 --- a/configure +++ b/configure @@ -249,6 +249,7 @@ EXPERIMENT_LIST=" newcoefcontext enable_6tap abovesprefmv + intht " CONFIG_LIST=" external_build diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h index 337dc14f5..c6702ae31 100644 --- a/vp9/common/vp9_blockd.h +++ b/vp9/common/vp9_blockd.h @@ -413,9 +413,9 @@ typedef struct macroblockd { } MACROBLOCKD; -#define ACTIVE_HT 110 // quantization stepsize threshold +#define ACTIVE_HT 110 // quantization stepsize threshold -#define ACTIVE_HT8 300 +#define ACTIVE_HT8 300 #define ACTIVE_HT16 300 diff --git a/vp9/common/vp9_idctllm.c
b/vp9/common/vp9_idctllm.c index f9318191d..92367fe5a 100644 --- a/vp9/common/vp9_idctllm.c +++ b/vp9/common/vp9_idctllm.c @@ -120,6 +120,42 @@ static const int16_t idct_i16[256] = { 4096, -3675, 3218, -2731, 2217, -1682, 1130, -568 }; +#if CONFIG_INTHT +static const int16_t iadst_i16[256] = { + 284, 850, 1407, 1951, 2476, 2977, 3450, 3889, + 4291, 4652, 4967, 5235, 5453, 5618, 5729, 5784, + 850, 2476, 3889, 4967, 5618, 5784, 5453, 4652, + 3450, 1951, 284, -1407, -2977, -4291, -5235, -5729, + 1407, 3889, 5453, 5729, 4652, 2476, -284, -2977, + -4967, -5784, -5235, -3450, -850, 1951, 4291, 5618, + 1951, 4967, 5729, 3889, 284, -3450, -5618, -5235, + -2476, 1407, 4652, 5784, 4291, 850, -2977, -5453, + 2476, 5618, 4652, 284, -4291, -5729, -2977, 1951, + 5453, 4967, 850, -3889, -5784, -3450, 1407, 5235, + 2977, 5784, 2476, -3450, -5729, -1951, 3889, 5618, + 1407, -4291, -5453, -850, 4652, 5235, 284, -4967, + 3450, 5453, -284, -5618, -2977, 3889, 5235, -850, + -5729, -2476, 4291, 4967, -1407, -5784, -1951, 4652, + 3889, 4652, -2977, -5235, 1951, 5618, -850, -5784, + -284, 5729, 1407, -5453, -2476, 4967, 3450, -4291, + 4291, 3450, -4967, -2476, 5453, 1407, -5729, -284, + 5784, -850, -5618, 1951, 5235, -2977, -4652, 3889, + 4652, 1951, -5784, 1407, 4967, -4291, -2476, 5729, + -850, -5235, 3889, 2977, -5618, 284, 5453, -3450, + 4967, 284, -5235, 4652, 850, -5453, 4291, 1407, + -5618, 3889, 1951, -5729, 3450, 2476, -5784, 2977, + 5235, -1407, -3450, 5784, -3889, -850, 4967, -5453, + 1951, 2977, -5729, 4291, 284, -4652, 5618, -2476, + 5453, -2977, -850, 4291, -5784, 4652, -1407, -2476, + 5235, -5618, 3450, 284, -3889, 5729, -4967, 1951, + 5618, -4291, 1951, 850, -3450, 5235, -5784, 4967, + -2977, 284, 2476, -4652, 5729, -5453, 3889, -1407, + 5729, -5235, 4291, -2977, 1407, 284, -1951, 3450, + -4652, 5453, -5784, 5618, -4967, 3889, -2476, 850, + 5784, -5729, 5618, -5453, 5235, -4967, 4652, -4291, + 3889, -3450, 2977, -2476, 1951, -1407, 850, -284 +}; +#else static const int16_t 
iadst_i16[256] = { 542, 1607, 2614, 3526, 4311, 4940, 5390, 5646, 5698, 5543, 5189, 4646, 3936, 3084, 2120, 1080, @@ -154,7 +190,7 @@ static const int16_t iadst_i16[256] = { 5698, -5646, 5543, -5390, 5189, -4940, 4646, -4311, 3936, -3526, 3084, -2614, 2120, -1607, 1080, -542 }; - +#endif /* Converted the transforms to integer form. */ #define HORIZONTAL_SHIFT 14 // 16 @@ -657,6 +693,138 @@ void vp9_short_idct8x8_c(int16_t *input, int16_t *output, int pitch) { } } +#if CONFIG_INTHT +static void iadst8_1d(int16_t *input, int16_t *output) { + int x0, x1, x2, x3, x4, x5, x6, x7; + int s0, s1, s2, s3, s4, s5, s6, s7; + + x0 = input[7]; + x1 = input[0]; + x2 = input[5]; + x3 = input[2]; + x4 = input[3]; + x5 = input[4]; + x6 = input[1]; + x7 = input[6]; + + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { + output[0] = output[1] = output[2] = output[3] = output[4] + = output[5] = output[6] = output[7] = 0; + return; + } + + // stage 1 + s0 = cospi_2_64 * x0 + cospi_30_64 * x1; + s1 = cospi_30_64 * x0 - cospi_2_64 * x1; + s2 = cospi_10_64 * x2 + cospi_22_64 * x3; + s3 = cospi_22_64 * x2 - cospi_10_64 * x3; + s4 = cospi_18_64 * x4 + cospi_14_64 * x5; + s5 = cospi_14_64 * x4 - cospi_18_64 * x5; + s6 = cospi_26_64 * x6 + cospi_6_64 * x7; + s7 = cospi_6_64 * x6 - cospi_26_64 * x7; + + x0 = dct_const_round_shift(s0 + s4); + x1 = dct_const_round_shift(s1 + s5); + x2 = dct_const_round_shift(s2 + s6); + x3 = dct_const_round_shift(s3 + s7); + x4 = dct_const_round_shift(s0 - s4); + x5 = dct_const_round_shift(s1 - s5); + x6 = dct_const_round_shift(s2 - s6); + x7 = dct_const_round_shift(s3 - s7); + + // stage 2 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = cospi_8_64 * x4 + cospi_24_64 * x5; + s5 = cospi_24_64 * x4 - cospi_8_64 * x5; + s6 = - cospi_24_64 * x6 + cospi_8_64 * x7; + s7 = cospi_8_64 * x6 + cospi_24_64 * x7; + + x0 = s0 + s2; + x1 = s1 + s3; + x2 = s0 - s2; + x3 = s1 - s3; + x4 = dct_const_round_shift(s4 + s6); + x5 = dct_const_round_shift(s5 + s7); + x6 = 
dct_const_round_shift(s4 - s6); + x7 = dct_const_round_shift(s5 - s7); + + // stage 3 + s2 = cospi_16_64 * (x2 + x3); + s3 = cospi_16_64 * (x2 - x3); + s6 = cospi_16_64 * (x6 + x7); + s7 = cospi_16_64 * (x6 - x7); + + x2 = dct_const_round_shift(s2); + x3 = dct_const_round_shift(s3); + x6 = dct_const_round_shift(s6); + x7 = dct_const_round_shift(s7); + + output[0] = x0; + output[1] = - x4; + output[2] = x6; + output[3] = - x2; + output[4] = x3; + output[5] = - x7; + output[6] = x5; + output[7] = - x1; + + return; +} + +void vp9_short_iht8x8_c(int16_t *input, int16_t *output, + TX_TYPE tx_type, int pitch) { + int16_t out[8 * 8]; + int16_t *outptr = &out[0]; + const int short_pitch = pitch >> 1; + int i, j; + int16_t temp_in[8], temp_out[8]; + + void (*invr)(int16_t*, int16_t*); + void (*invc)(int16_t*, int16_t*); + + switch (tx_type) { + case ADST_ADST: + invc = &iadst8_1d; + invr = &iadst8_1d; + break; + case ADST_DCT: + invc = &iadst8_1d; + invr = &idct8_1d; + break; + case DCT_ADST: + invc = &idct8_1d; + invr = &iadst8_1d; + break; + case DCT_DCT: + invc = &idct8_1d; + invr = &idct8_1d; + break; + default: + assert(0); + } + + // inverse transform row vectors + for (i = 0; i < 8; ++i) { + invr(input, outptr); + input += 8; + outptr += 8; + } + + // inverse transform column vectors + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) + temp_in[j] = out[j * 8 + i]; + invc(temp_in, temp_out); + for (j = 0; j < 8; ++j) + output[j * short_pitch + i] = (temp_out[j] + 16) >> 5; + } +} +#endif + + void vp9_short_idct10_8x8_c(int16_t *input, int16_t *output, int pitch) { int16_t out[8 * 8]; int16_t *outptr = &out[0]; diff --git a/vp9/common/vp9_invtrans.c b/vp9/common/vp9_invtrans.c index b5e6e3cc2..c81fe2d0d 100644 --- a/vp9/common/vp9_invtrans.c +++ b/vp9/common/vp9_invtrans.c @@ -91,8 +91,13 @@ void vp9_inverse_transform_mby_8x8(MACROBLOCKD *xd) { for (i = 0; i < 9; i += 8) { TX_TYPE tx_type = get_tx_type_8x8(xd, &xd->block[i]); if (tx_type != DCT_DCT) { +#if 
CONFIG_INTHT + vp9_short_iht8x8(xd->block[i].dqcoeff, xd->block[i].diff, + tx_type, 32); +#else vp9_ihtllm(xd->block[i].dqcoeff, xd->block[i].diff, 32, tx_type, 8, xd->block[i].eob); +#endif } else { vp9_inverse_transform_b_8x8(&blockd[i].dqcoeff[0], &blockd[i].diff[0], 32); @@ -101,8 +106,13 @@ void vp9_inverse_transform_mby_8x8(MACROBLOCKD *xd) { for (i = 2; i < 11; i += 8) { TX_TYPE tx_type = get_tx_type_8x8(xd, &xd->block[i]); if (tx_type != DCT_DCT) { +#if CONFIG_INTHT + vp9_short_iht8x8(xd->block[i + 2].dqcoeff, xd->block[i].diff, + tx_type, 32); +#else vp9_ihtllm(xd->block[i + 2].dqcoeff, xd->block[i].diff, 32, tx_type, 8, xd->block[i + 2].eob); +#endif } else { vp9_inverse_transform_b_8x8(&blockd[i + 2].dqcoeff[0], &blockd[i].diff[0], 32); diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index 5339aaa5f..5e4d485b5 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -411,6 +411,11 @@ specialize vp9_short_idct32x32 prototype void vp9_short_idct1_32x32 "int16_t *input, int16_t *output" specialize vp9_short_idct1_32x32 +#if CONFIG_INTHT +prototype void vp9_short_iht8x8 "int16_t *input, int16_t *output, int tx_type, int pitch" +specialize vp9_short_iht8x8 +#endif + prototype void vp9_ihtllm "const int16_t *input, int16_t *output, int pitch, int tx_type, int tx_dim, int16_t eobs" specialize vp9_ihtllm diff --git a/vp9/decoder/vp9_dequantize.c b/vp9/decoder/vp9_dequantize.c index 18d4e59c7..839a918fb 100644 --- a/vp9/decoder/vp9_dequantize.c +++ b/vp9/decoder/vp9_dequantize.c @@ -92,8 +92,11 @@ void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, int16_t *input, input[i] = dq[1] * input[i]; } +#if CONFIG_INTHT + vp9_short_iht8x8(input, output, tx_type, 16); +#else vp9_ihtllm(input, output, 16, tx_type, 8, eobs); - +#endif vpx_memset(input, 0, 128); add_residual(diff_ptr, pred, pitch, dest, stride, 8, 8); diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c index dcd19ca42..d4f5c0c07 100644 --- 
a/vp9/encoder/vp9_dct.c +++ b/vp9/encoder/vp9_dct.c @@ -104,6 +104,26 @@ static const int16_t dct_i8[64] = { 16069, -13623, 9102, -3196 }; +#if CONFIG_INTHT +static const int16_t adst_i8[64] = { + 1606, 4756, 7723, 10394, + 12665, 14449, 15678, 16305, + 4756, 12665, 16305, 14449, + 7723, -1606, -10394, -15678, + 7723, 16305, 10394, -4756, + -15678, -12665, 1606, 14449, + 10394, 14449, -4756, -16305, + -1606, 15678, 7723, -12665, + 12665, 7723, -15678, -1606, + 16305, -4756, -14449, 10394, + 14449, -1606, -12665, 15678, + -4756, -10394, 16305, -7723, + 15678, -10394, 1606, 7723, + -14449, 16305, -12665, 4756, + 16305, -15678, 14449, -12665, + 10394, -7723, 4756, -1606 +}; +#else static const int16_t adst_i8[64] = { 2921, 5742, 8368, 10708, 12684, 14228, 15288, 15827, @@ -122,6 +142,7 @@ static const int16_t adst_i8[64] = { 5742, -10708, 14228, -15827, 15288, -12684, 8368, -2921 }; +#endif static const float dct_16[256] = { 0.250000, 0.250000, 0.250000, 0.250000, 0.250000, 0.250000, 0.250000, 0.250000, @@ -229,6 +250,42 @@ static const int16_t dct_i16[256] = { 11529, -11086, 10217, -8955, 7350, -5461, 3363, -1136 }; +#if CONFIG_INTHT +static const int16_t adst_i16[256] = { + 568, 1700, 2815, 3903, 4953, 5956, 6901, 7780, + 8584, 9305, 9937, 10473, 10908, 11238, 11459, 11571, + 1700, 4953, 7780, 9937, 11238, 11571, 10908, 9305, + 6901, 3903, 568, -2815, -5956, -8584, -10473, -11459, + 2815, 7780, 10908, 11459, 9305, 4953, -568, -5956, + -9937, -11571, -10473, -6901, -1700, 3903, 8584, 11238, + 3903, 9937, 11459, 7780, 568, -6901, -11238, -10473, + -4953, 2815, 9305, 11571, 8584, 1700, -5956, -10908, + 4953, 11238, 9305, 568, -8584, -11459, -5956, 3903, + 10908, 9937, 1700, -7780, -11571, -6901, 2815, 10473, + 5956, 11571, 4953, -6901, -11459, -3903, 7780, 11238, + 2815, -8584, -10908, -1700, 9305, 10473, 568, -9937, + 6901, 10908, -568, -11238, -5956, 7780, 10473, -1700, + -11459, -4953, 8584, 9937, -2815, -11571, -3903, 9305, + 7780, 9305, -5956, -10473, 3903, 11238, 
-1700, -11571, + -568, 11459, 2815, -10908, -4953, 9937, 6901, -8584, + 8584, 6901, -9937, -4953, 10908, 2815, -11459, -568, + 11571, -1700, -11238, 3903, 10473, -5956, -9305, 7780, + 9305, 3903, -11571, 2815, 9937, -8584, -4953, 11459, + -1700, -10473, 7780, 5956, -11238, 568, 10908, -6901, + 9937, 568, -10473, 9305, 1700, -10908, 8584, 2815, + -11238, 7780, 3903, -11459, 6901, 4953, -11571, 5956, + 10473, -2815, -6901, 11571, -7780, -1700, 9937, -10908, + 3903, 5956, -11459, 8584, 568, -9305, 11238, -4953, + 10908, -5956, -1700, 8584, -11571, 9305, -2815, -4953, + 10473, -11238, 6901, 568, -7780, 11459, -9937, 3903, + 11238, -8584, 3903, 1700, -6901, 10473, -11571, 9937, + -5956, 568, 4953, -9305, 11459, -10908, 7780, -2815, + 11459, -10473, 8584, -5956, 2815, 568, -3903, 6901, + -9305, 10908, -11571, 11238, -9937, 7780, -4953, 1700, + 11571, -11459, 11238, -10908, 10473, -9937, 9305, -8584, + 7780, -6901, 5956, -4953, 3903, -2815, 1700, -568 +}; +#else static const int16_t adst_i16[256] = { 1084, 2159, 3214, 4240, 5228, 6168, 7052, 7873, 8622, 9293, 9880, 10377, 10781, 11087, 11292, 11395, @@ -263,6 +320,7 @@ static const int16_t adst_i16[256] = { 2159, -4240, 6168, -7873, 9293, -10377, 11087, -11395, 11292, -10781, 9880, -8622, 7052, -5228, 3214, -1084 }; +#endif static const int xC1S7 = 16069; static const int xC2S6 = 15137; diff --git a/vp9/encoder/vp9_encodeintra.c b/vp9/encoder/vp9_encodeintra.c index eacc2cd28..fa7229714 100644 --- a/vp9/encoder/vp9_encodeintra.c +++ b/vp9/encoder/vp9_encodeintra.c @@ -152,8 +152,14 @@ void vp9_encode_intra8x8(MACROBLOCK *x, int ib) { vp9_fht(be->src_diff, 32, (x->block + idx)->coeff, tx_type, 8); x->quantize_b_8x8(x->block + idx, xd->block + idx); + +#if CONFIG_INTHT + vp9_short_iht8x8(xd->block[idx].dqcoeff, xd->block[ib].diff, + tx_type, 32); +#else vp9_ihtllm(xd->block[idx].dqcoeff, xd->block[ib].diff, 32, tx_type, 8, xd->block[idx].eob); +#endif } else { x->vp9_short_fdct8x8(be->src_diff, (x->block + idx)->coeff, 32); 
x->quantize_b_8x8(x->block + idx, xd->block + idx); diff --git a/vpxenc.c b/vpxenc.c index cb2569acf..10a606330 100644 --- a/vpxenc.c +++ b/vpxenc.c @@ -2472,7 +2472,6 @@ int main(int argc, const char **argv_) { " and --passes=2\n", stream->index, global.pass); }); - /* Use the frame rate from the file only if none was specified * on the command-line. */