From b575394e215ea46c9885992d85c3047de5171f4c Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Wed, 12 Dec 2012 15:49:39 -0800 Subject: [PATCH] Improved vp9_ihtllm_c As suggested by Yaowu, we can use eob to reduce the complexity of the vp9_ihtllm_c function. For the 1080p test clip used, the decoder performance improved by 17%. Change-Id: I32486f2f06f9b8f60467d2a574209aa3a3daa435 --- vp9/common/vp9_idctllm.c | 22 ++++++++++--- vp9/common/vp9_invtrans.c | 10 +++--- vp9/common/vp9_rtcd_defs.sh | 2 +- vp9/decoder/vp9_decodframe.c | 19 +++++++----- vp9/decoder/vp9_dequantize.c | 58 ++++++++++++++++++++--------------- vp9/decoder/vp9_dequantize.h | 7 +++-- vp9/encoder/vp9_encodeintra.c | 6 ++-- vp9/encoder/vp9_rdopt.c | 2 +- 8 files changed, 77 insertions(+), 49 deletions(-) diff --git a/vp9/common/vp9_idctllm.c b/vp9/common/vp9_idctllm.c index 9622dfdee..897514ee1 100644 --- a/vp9/common/vp9_idctllm.c +++ b/vp9/common/vp9_idctllm.c @@ -404,8 +404,9 @@ void vp9_ihtllm_float_c(const int16_t *input, int16_t *output, int pitch, #define HORIZONTAL_SHIFT 17 // 15 #define HORIZONTAL_ROUNDING ((1 << (HORIZONTAL_SHIFT - 1)) - 1) void vp9_ihtllm_c(const int16_t *input, int16_t *output, int pitch, - TX_TYPE tx_type, int tx_dim) { + TX_TYPE tx_type, int tx_dim, uint16_t eobs) { int i, j, k; + int nz_dim; int16_t imbuf[256]; const int16_t *ip = input; @@ -444,12 +445,25 @@ void vp9_ihtllm_c(const int16_t *input, int16_t *output, int pitch, break; } + nz_dim = tx_dim; + if(tx_dim > 4) { + if(eobs < 36) { + vpx_memset(im, 0, 512); + nz_dim = 8; + if(eobs < 3) { + nz_dim = 2; + } else if(eobs < 10) { + nz_dim = 4; + } + } + } + /* vertical transformation */ for (j = 0; j < tx_dim; j++) { - for (i = 0; i < tx_dim; i++) { + for (i = 0; i < nz_dim; i++) { int temp = 0; - for (k = 0; k < tx_dim; k++) { + for (k = 0; k < nz_dim; k++) { temp += ptv[k] * ip[(k * tx_dim)]; } @@ -470,7 +484,7 @@ void vp9_ihtllm_c(const int16_t *input, int16_t *output, int pitch, for (i = 0; i < tx_dim; i++) { int temp = 0; - for (k = 0; k < tx_dim; k++) { + for (k = 0; k < nz_dim; k++) { temp += im[k] * pthc[k]; } diff --git a/vp9/common/vp9_invtrans.c b/vp9/common/vp9_invtrans.c index c78f1ad3c..eff919865 100644 --- a/vp9/common/vp9_invtrans.c +++ b/vp9/common/vp9_invtrans.c @@ -52,7 +52,7 @@ void vp9_inverse_transform_mby_4x4(MACROBLOCKD *xd) { TX_TYPE tx_type = get_tx_type_4x4(xd, &xd->block[i]); if (tx_type != DCT_DCT) { vp9_ihtllm(xd->block[i].dqcoeff, xd->block[i].diff, 32, - tx_type, 4); + tx_type, 4, xd->block[i].eob); } else { vp9_inverse_transform_b_4x4(xd, i, 32); } @@ -91,7 +91,8 @@ void vp9_inverse_transform_mby_8x8(MACROBLOCKD *xd) { for (i = 0; i < 9; i += 8) { TX_TYPE tx_type = get_tx_type_8x8(xd, &xd->block[i]); if (tx_type != DCT_DCT) { - vp9_ihtllm(xd->block[i].dqcoeff, xd->block[i].diff, 32, tx_type, 8); + vp9_ihtllm(xd->block[i].dqcoeff, xd->block[i].diff, 32, tx_type, 8, + xd->block[i].eob); } else { vp9_inverse_transform_b_8x8(&blockd[i].dqcoeff[0], &blockd[i].diff[0], 32); @@ -100,7 +101,8 @@ void vp9_inverse_transform_mby_8x8(MACROBLOCKD *xd) { for (i = 2; i < 11; i += 8) { TX_TYPE tx_type = get_tx_type_8x8(xd, &xd->block[i]); if (tx_type != DCT_DCT) { - vp9_ihtllm(xd->block[i + 2].dqcoeff, xd->block[i].diff, 32, tx_type, 8); + vp9_ihtllm(xd->block[i + 2].dqcoeff, xd->block[i].diff, 32, tx_type, 8, + xd->block[i + 2].eob); } else { vp9_inverse_transform_b_8x8(&blockd[i + 2].dqcoeff[0], &blockd[i].diff[0], 32); @@ -132,7 +134,7 @@ void vp9_inverse_transform_mby_16x16(MACROBLOCKD *xd) { BLOCKD *bd = &xd->block[0]; TX_TYPE tx_type = get_tx_type_16x16(xd, bd); if (tx_type != DCT_DCT) { - vp9_ihtllm(bd->dqcoeff, bd->diff, 32, tx_type, 16); + vp9_ihtllm(bd->dqcoeff, bd->diff, 32, tx_type, 16, bd->eob); } else { vp9_inverse_transform_b_16x16(&xd->block[0].dqcoeff[0], &xd->block[0].diff[0], 32); diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index 5b7af100b..e8981ce5e 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -361,7 +361,7 @@ specialize vp9_short_idct16x16 prototype void vp9_short_idct10_16x16 "short *input, short *output, int pitch" specialize vp9_short_idct10_16x16 -prototype void vp9_ihtllm "const short *input, short *output, int pitch, int tx_type, int tx_dim" +prototype void vp9_ihtllm "const short *input, short *output, int pitch, int tx_type, int tx_dim, short eobs" specialize vp9_ihtllm # diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c index f95a83afa..b18ef8b02 100644 --- a/vp9/decoder/vp9_decodframe.c +++ b/vp9/decoder/vp9_decodframe.c @@ -248,7 +248,8 @@ static void decode_16x16(VP9D_COMP *pbi, MACROBLOCKD *xd, if (tx_type != DCT_DCT) { vp9_ht_dequant_idct_add_16x16_c(tx_type, xd->qcoeff, xd->block[0].dequant, xd->predictor, - xd->dst.y_buffer, 16, xd->dst.y_stride); + xd->dst.y_buffer, 16, xd->dst.y_stride, + xd->eobs[0]); } else { vp9_dequant_idct_add_16x16(xd->qcoeff, xd->block[0].dequant, xd->predictor, xd->dst.y_buffer, @@ -294,7 +295,8 @@ static void decode_8x8(VP9D_COMP *pbi, MACROBLOCKD *xd, } tx_type = get_tx_type_8x8(xd, &xd->block[ib]); if (tx_type != DCT_DCT) { - vp9_ht_dequant_idct_add_8x8_c(tx_type, q, dq, pre, dst, 16, stride); + vp9_ht_dequant_idct_add_8x8_c(tx_type, q, dq, pre, dst, 16, stride, + xd->eobs[idx]); } else { vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride, 0, xd->eobs[idx]); @@ -393,7 +395,7 @@ static void decode_4x4(VP9D_COMP *pbi, MACROBLOCKD *xd, vp9_ht_dequant_idct_add_c(tx_type, b->qcoeff, b->dequant, b->predictor, *(b->base_dst) + b->dst, 16, - b->dst_stride); + b->dst_stride, b->eob); } else { vp9_dequant_idct_add(b->qcoeff, b->dequant, b->predictor, *(b->base_dst) + b->dst, 16, b->dst_stride); @@ -438,7 +440,8 @@ static void decode_4x4(VP9D_COMP *pbi, MACROBLOCKD *xd, if (tx_type != DCT_DCT) { vp9_ht_dequant_idct_add_c(tx_type, b->qcoeff, b->dequant, b->predictor, - *(b->base_dst) + b->dst, 16, b->dst_stride); + *(b->base_dst) + b->dst, 16, b->dst_stride, + b->eob); } else { vp9_dequant_idct_add(b->qcoeff, b->dequant, b->predictor, *(b->base_dst) + b->dst, 16, b->dst_stride); @@ -500,7 +503,7 @@ static void decode_4x4(VP9D_COMP *pbi, MACROBLOCKD *xd, vp9_ht_dequant_idct_add_c(tx_type, b->qcoeff, b->dequant, b->predictor, *(b->base_dst) + b->dst, 16, - b->dst_stride); + b->dst_stride, b->eob); } else { vp9_dequant_idct_add(b->qcoeff, b->dequant, b->predictor, *(b->base_dst) + b->dst, 16, b->dst_stride); @@ -553,7 +556,7 @@ static void decode_16x16_sb(VP9D_COMP *pbi, MACROBLOCKD *xd, tx_type, xd->qcoeff, xd->block[0].dequant, xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16, xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16, - xd->dst.y_stride, xd->dst.y_stride); + xd->dst.y_stride, xd->dst.y_stride, xd->block[0].eob); } else { vp9_dequant_idct_add_16x16( xd->qcoeff, xd->block[0].dequant, @@ -591,7 +594,7 @@ static void decode_8x8_sb(VP9D_COMP *pbi, MACROBLOCKD *xd, + x_idx * 16 + (i & 1) * 8, xd->dst.y_buffer + (y_idx * 16 + (i / 2) * 8) * xd->dst.y_stride + x_idx * 16 + (i & 1) * 8, - stride, stride); + stride, stride, b->eob); } else { vp9_dequant_idct_add_8x8_c( q, dq, @@ -647,7 +650,7 @@ static void decode_4x4_sb(VP9D_COMP *pbi, MACROBLOCKD *xd, + x_idx * 16 + (i & 3) * 4, xd->dst.y_buffer + (y_idx * 16 + (i / 4) * 4) * xd->dst.y_stride + x_idx * 16 + (i & 3) * 4, - xd->dst.y_stride, xd->dst.y_stride); + xd->dst.y_stride, xd->dst.y_stride, b->eob); } else { vp9_dequant_idct_add_c( b->qcoeff, b->dequant, diff --git a/vp9/decoder/vp9_dequantize.c b/vp9/decoder/vp9_dequantize.c index 79114d58c..39a2de14b 100644 --- a/vp9/decoder/vp9_dequantize.c +++ b/vp9/decoder/vp9_dequantize.c @@ -13,7 +13,6 @@ #include "vp9/decoder/vp9_dequantize.h" #include "vpx_mem/vpx_mem.h" #include "vp9/decoder/vp9_onyxd_int.h" - static void add_residual(const int16_t *diff, const uint8_t *pred, int pitch, uint8_t *dest, int stride, int width, int height) { int r, c; @@ -74,7 +73,7 @@ void vp9_dequantize_b_c(BLOCKD *d) { void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, - int pitch, int stride) { + int pitch, int stride, uint16_t eobs) { int16_t output[16]; int16_t *diff_ptr = output; int i; @@ -83,7 +82,7 @@ void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, int16_t *input, input[i] = dq[i] * input[i]; } - vp9_ihtllm(input, output, 4 << 1, tx_type, 4); + vp9_ihtllm(input, output, 4 << 1, tx_type, 4, eobs); vpx_memset(input, 0, 32); @@ -93,21 +92,25 @@ void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, int16_t *input, void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, - int pitch, int stride) { + int pitch, int stride, uint16_t eobs) { int16_t output[64]; int16_t *diff_ptr = output; int i; + if (eobs == 0) { + /* All 0 DCT coefficient */ + vp9_copy_mem8x8(pred, pitch, dest, stride); + } else if (eobs > 0) { + input[0] = dq[0] * input[0]; + for (i = 1; i < 64; i++) { + input[i] = dq[1] * input[i]; + } - input[0] = dq[0] * input[0]; - for (i = 1; i < 64; i++) { - input[i] = dq[1] * input[i]; + vp9_ihtllm(input, output, 16, tx_type, 8, eobs); + + vpx_memset(input, 0, 128); + + add_residual(diff_ptr, pred, pitch, dest, stride, 8, 8); } - - vp9_ihtllm(input, output, 16, tx_type, 8); - - vpx_memset(input, 0, 128); - - add_residual(diff_ptr, pred, pitch, dest, stride, 8, 8); } void vp9_dequant_idct_add_c(int16_t *input, const int16_t *dq, uint8_t *pred, @@ -269,26 +272,31 @@ void vp9_dequant_idct_add_8x8_c(int16_t *input, const int16_t *dq, void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, int16_t *input, const int16_t *dq, uint8_t *pred, - uint8_t *dest, int pitch, int stride) { + uint8_t *dest, int pitch, int stride, + uint16_t eobs) { int16_t output[256]; int16_t *diff_ptr = output; int i; + if (eobs == 0) { + /* All 0 DCT coefficient */ + vp9_copy_mem16x16(pred, pitch, dest, stride); + } else if (eobs > 0) { + input[0]= input[0] * dq[0]; - input[0]= input[0] * dq[0]; + // recover quantizer for 4 4x4 blocks + for (i = 1; i < 256; i++) + input[i] = input[i] * dq[1]; - // recover quantizer for 4 4x4 blocks - for (i = 1; i < 256; i++) - input[i] = input[i] * dq[1]; + // inverse hybrid transform + vp9_ihtllm(input, output, 32, tx_type, 16, eobs); - // inverse hybrid transform - vp9_ihtllm(input, output, 32, tx_type, 16); + // the idct halves ( >> 1) the pitch + // vp9_short_idct16x16_c(input, output, 32); - // the idct halves ( >> 1) the pitch - // vp9_short_idct16x16_c(input, output, 32); + vpx_memset(input, 0, 512); - vpx_memset(input, 0, 512); - - add_residual(diff_ptr, pred, pitch, dest, stride, 16, 16); + add_residual(diff_ptr, pred, pitch, dest, stride, 16, 16); + } } void vp9_dequant_idct_add_16x16_c(int16_t *input, const int16_t *dq, diff --git a/vp9/decoder/vp9_dequantize.h b/vp9/decoder/vp9_dequantize.h index 8a6bf2b26..f348b21b0 100644 --- a/vp9/decoder/vp9_dequantize.h +++ b/vp9/decoder/vp9_dequantize.h @@ -58,16 +58,17 @@ typedef void(*vp9_dequant_idct_add_uv_block_fn_t)(short *q, const short *dq, void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, short *input, const short *dq, unsigned char *pred, unsigned char *dest, - int pitch, int stride); + int pitch, int stride, uint16_t eobs); void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, short *input, const short *dq, unsigned char *pred, - unsigned char *dest, int pitch, int stride); + unsigned char *dest, int pitch, int stride, + uint16_t eobs); void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, short *input, const short *dq, unsigned char *pred, unsigned char *dest, - int pitch, int stride); + int pitch, int stride, uint16_t eobs); #if CONFIG_SUPERBLOCKS void vp9_dequant_dc_idct_add_y_block_8x8_inplace_c(short *q, const short *dq, diff --git a/vp9/encoder/vp9_encodeintra.c b/vp9/encoder/vp9_encodeintra.c index 4ee21bb46..810f1c42b 100644 --- a/vp9/encoder/vp9_encodeintra.c +++ b/vp9/encoder/vp9_encodeintra.c @@ -70,7 +70,7 @@ void vp9_encode_intra4x4block(MACROBLOCK *x, int ib) { if (tx_type != DCT_DCT) { vp9_fht(be->src_diff, 32, be->coeff, tx_type, 4); vp9_ht_quantize_b_4x4(be, b, tx_type); - vp9_ihtllm(b->dqcoeff, b->diff, 32, tx_type, 4); + vp9_ihtllm(b->dqcoeff, b->diff, 32, tx_type, 4, b->eob); } else { x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32); x->quantize_b_4x4(be, b) ; @@ -191,7 +191,7 @@ void vp9_encode_intra8x8(MACROBLOCK *x, int ib) { tx_type, 8); x->quantize_b_8x8(x->block + idx, xd->block + idx); vp9_ihtllm(xd->block[idx].dqcoeff, xd->block[ib].diff, 32, - tx_type, 8); + tx_type, 8, xd->block[idx].eob); } else { x->vp9_short_fdct8x8(be->src_diff, (x->block + idx)->coeff, 32); x->quantize_b_8x8(x->block + idx, xd->block + idx); @@ -205,7 +205,7 @@ void vp9_encode_intra8x8(MACROBLOCK *x, int ib) { if (tx_type != DCT_DCT) { vp9_fht_c(be->src_diff, 32, be->coeff, tx_type, 4); vp9_ht_quantize_b_4x4(be, b, tx_type); - vp9_ihtllm(b->dqcoeff, b->diff, 32, tx_type, 4); + vp9_ihtllm(b->dqcoeff, b->diff, 32, tx_type, 4, b->eob); } else { x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32); x->quantize_b_4x4(be, b); diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 9cea18969..4559e4479 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -1120,7 +1120,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, BLOCK *be, // inverse transform if (best_tx_type != DCT_DCT) - vp9_ihtllm(best_dqcoeff, b->diff, 32, best_tx_type, 4); + vp9_ihtllm(best_dqcoeff, b->diff, 32, best_tx_type, 4, b->eob); else xd->inv_xform4x4_x8(best_dqcoeff, b->diff, 32);