Added vp9_short_idct1_32x32_c

and called this function in vp9_dequant_idct_add_32x32_c when
eob == 1.  For the test clip used, the decoder performance improved
by 21+%.  Based on Yaowu's 16 point idct work.

Change-Id: Ib579a90fed531d45777980e04bf0c9b23c093c43
This commit is contained in:
Scott LaVarnway 2013-02-04 16:49:17 -08:00
parent ebd5808970
commit 5780c4cbd5
3 changed files with 29 additions and 7 deletions

View File

@ -1644,6 +1644,16 @@ void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) {
}
}
/*
 * Single-coefficient inverse transform for the 32x32 case.
 *
 * Reads only input[0] and writes only output[0]. The coefficient is
 * multiplied by cospi_16_64 and passed through dct_const_round_shift()
 * twice in succession; the result is then rounded and shifted right by
 * 6 bits.
 *
 * input  - coefficient buffer; only element 0 is read.
 * output - destination; only element 0 is written.
 */
void vp9_short_idct1_32x32_c(int16_t *input, int16_t *output) {
  int16_t dc;
  int scaled;

  /* First scaling stage. */
  scaled = input[0] * cospi_16_64;
  dc = dct_const_round_shift(scaled);

  /* Second scaling stage, applied to the already-rounded value. */
  scaled = dc * cospi_16_64;
  dc = dct_const_round_shift(scaled);

  /* Final rounding: add half of 64, then divide by 64 via shift. */
  output[0] = (dc + 32) >> 6;
}
#else // !CONFIG_DWTDCTHYBRID
#if DWT_TYPE == 53

View File

@ -408,6 +408,9 @@ specialize vp9_short_idct1_16x16
# 32x32 inverse transform taking an explicit pitch argument.
prototype void vp9_short_idct32x32 "int16_t *input, int16_t *output, int pitch"
specialize vp9_short_idct32x32
# Single-coefficient 32x32 inverse transform; unlike vp9_short_idct32x32 it
# takes no pitch argument.
prototype void vp9_short_idct1_32x32 "int16_t *input, int16_t *output"
specialize vp9_short_idct1_32x32
prototype void vp9_ihtllm "const int16_t *input, int16_t *output, int pitch, int tx_type, int tx_dim, int16_t eobs"
specialize vp9_ihtllm

View File

@ -349,13 +349,22 @@ void vp9_dequant_idct_add_32x32_c(int16_t *input, const int16_t *dq,
int i;
if (eob) {
input[0]= input[0] * dq[0] / 2;
for (i = 1; i < 1024; i++)
input[i] = input[i] * dq[1] / 2;
vp9_short_idct32x32_c(input, output, 64);
vpx_memset(input, 0, 2048);
add_residual(output, pred, pitch, dest, stride, 32, 32);
input[0] = input[0] * dq[0] / 2;
#if !CONFIG_DWTDCTHYBRID
if (eob == 1) {
vp9_short_idct1_32x32_c(input, output);
add_constant_residual(output[0], pred, pitch, dest, stride, 32, 32);
input[0] = 0;
} else {
#endif
for (i = 1; i < 1024; i++)
input[i] = input[i] * dq[1] / 2;
vp9_short_idct32x32_c(input, output, 64);
vpx_memset(input, 0, 2048);
add_residual(output, pred, pitch, dest, stride, 32, 32);
#if !CONFIG_DWTDCTHYBRID
}
#endif
}
}