rv34: joint coefficient decoding and dequantization
Perform dequantization while decoding coefficients instead of performing it on the entire coefficient buffer. Since quantized coefficients are very sparse, this usually causes a small speedup. The speedup is around 1% on a Panda board compared to the NEON code removed here. The global speedup is probably around 3%. Signed-off-by: Kostya Shishkov <kostya.shishkov@gmail.com>
This commit is contained in:
parent
0749720b6c
commit
98f24ecd6c
@ -25,12 +25,9 @@
|
||||
|
||||
void ff_rv34_inv_transform_neon(DCTELEM *block);
|
||||
void ff_rv34_inv_transform_noround_neon(DCTELEM *block);
|
||||
void ff_rv34_dequant4x4_neon(DCTELEM *block, int Qdc, int Q);
|
||||
|
||||
void ff_rv34dsp_init_neon(RV34DSPContext *c, DSPContext* dsp)
|
||||
{
|
||||
c->rv34_inv_transform_tab[0] = ff_rv34_inv_transform_neon;
|
||||
c->rv34_inv_transform_tab[1] = ff_rv34_inv_transform_noround_neon;
|
||||
|
||||
c->rv34_dequant4x4 = ff_rv34_dequant4x4_neon;
|
||||
}
|
||||
|
@ -107,27 +107,3 @@ function ff_rv34_inv_transform_noround_neon, export=1
|
||||
vst4.16 {d0[3], d1[3], d2[3], d3[3]}, [r2,:64], r1
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
function ff_rv34_dequant4x4_neon, export=1
|
||||
mov r3, r0
|
||||
mov r12, #16
|
||||
vdup.16 q0, r2
|
||||
vmov.16 d0[0], r1
|
||||
vld1.16 {d2}, [r0,:64], r12
|
||||
vld1.16 {d4}, [r0,:64], r12
|
||||
vld1.16 {d6}, [r0,:64], r12
|
||||
vld1.16 {d16}, [r0,:64], r12
|
||||
vmull.s16 q1, d2, d0
|
||||
vmull.s16 q2, d4, d1
|
||||
vmull.s16 q3, d6, d1
|
||||
vmull.s16 q8, d16, d1
|
||||
vqrshrn.s32 d2, q1, #4
|
||||
vqrshrn.s32 d4, q2, #4
|
||||
vqrshrn.s32 d6, q3, #4
|
||||
vqrshrn.s32 d16, q8, #4
|
||||
vst1.16 {d2}, [r3,:64], r12
|
||||
vst1.16 {d4}, [r3,:64], r12
|
||||
vst1.16 {d6}, [r3,:64], r12
|
||||
vst1.16 {d16}, [r3,:64], r12
|
||||
bx lr
|
||||
endfunc
|
||||
|
@ -212,7 +212,7 @@ static int rv34_decode_cbp(GetBitContext *gb, RV34VLC *vlc, int table)
|
||||
/**
|
||||
* Get one coefficient value from the bitstream and store it.
|
||||
*/
|
||||
static inline void decode_coeff(DCTELEM *dst, int coef, int esc, GetBitContext *gb, VLC* vlc)
|
||||
static inline void decode_coeff(DCTELEM *dst, int coef, int esc, GetBitContext *gb, VLC* vlc, int q)
|
||||
{
|
||||
if(coef){
|
||||
if(coef == esc){
|
||||
@ -225,14 +225,14 @@ static inline void decode_coeff(DCTELEM *dst, int coef, int esc, GetBitContext *
|
||||
}
|
||||
if(get_bits1(gb))
|
||||
coef = -coef;
|
||||
*dst = coef;
|
||||
*dst = (coef*q + 8) >> 4;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Decode 2x2 subblock of coefficients.
|
||||
*/
|
||||
static inline void decode_subblock(DCTELEM *dst, int code, const int is_block2, GetBitContext *gb, VLC *vlc)
|
||||
static inline void decode_subblock(DCTELEM *dst, int code, const int is_block2, GetBitContext *gb, VLC *vlc, int q)
|
||||
{
|
||||
int coeffs[4];
|
||||
|
||||
@ -240,15 +240,35 @@ static inline void decode_subblock(DCTELEM *dst, int code, const int is_block2,
|
||||
coeffs[1] = modulo_three_table[code][1];
|
||||
coeffs[2] = modulo_three_table[code][2];
|
||||
coeffs[3] = modulo_three_table[code][3];
|
||||
decode_coeff(dst , coeffs[0], 3, gb, vlc);
|
||||
decode_coeff(dst , coeffs[0], 3, gb, vlc, q);
|
||||
if(is_block2){
|
||||
decode_coeff(dst+8, coeffs[1], 2, gb, vlc);
|
||||
decode_coeff(dst+1, coeffs[2], 2, gb, vlc);
|
||||
decode_coeff(dst+8, coeffs[1], 2, gb, vlc, q);
|
||||
decode_coeff(dst+1, coeffs[2], 2, gb, vlc, q);
|
||||
}else{
|
||||
decode_coeff(dst+1, coeffs[1], 2, gb, vlc);
|
||||
decode_coeff(dst+8, coeffs[2], 2, gb, vlc);
|
||||
decode_coeff(dst+1, coeffs[1], 2, gb, vlc, q);
|
||||
decode_coeff(dst+8, coeffs[2], 2, gb, vlc, q);
|
||||
}
|
||||
decode_coeff(dst+9, coeffs[3], 2, gb, vlc);
|
||||
decode_coeff(dst+9, coeffs[3], 2, gb, vlc, q);
|
||||
}
|
||||
|
||||
static inline void decode_subblock3(DCTELEM *dst, int code, const int is_block2, GetBitContext *gb, VLC *vlc,
|
||||
int q_dc, int q_ac1, int q_ac2)
|
||||
{
|
||||
int coeffs[4];
|
||||
|
||||
coeffs[0] = modulo_three_table[code][0];
|
||||
coeffs[1] = modulo_three_table[code][1];
|
||||
coeffs[2] = modulo_three_table[code][2];
|
||||
coeffs[3] = modulo_three_table[code][3];
|
||||
decode_coeff(dst , coeffs[0], 3, gb, vlc, q_dc);
|
||||
if(is_block2){
|
||||
decode_coeff(dst+8, coeffs[1], 2, gb, vlc, q_ac1);
|
||||
decode_coeff(dst+1, coeffs[2], 2, gb, vlc, q_ac1);
|
||||
}else{
|
||||
decode_coeff(dst+1, coeffs[1], 2, gb, vlc, q_ac1);
|
||||
decode_coeff(dst+8, coeffs[2], 2, gb, vlc, q_ac1);
|
||||
}
|
||||
decode_coeff(dst+9, coeffs[3], 2, gb, vlc, q_ac2);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -262,7 +282,7 @@ static inline void decode_subblock(DCTELEM *dst, int code, const int is_block2,
|
||||
* o--o
|
||||
*/
|
||||
|
||||
static inline void rv34_decode_block(DCTELEM *dst, GetBitContext *gb, RV34VLC *rvlc, int fc, int sc)
|
||||
static inline void rv34_decode_block(DCTELEM *dst, GetBitContext *gb, RV34VLC *rvlc, int fc, int sc, int q_dc, int q_ac1, int q_ac2)
|
||||
{
|
||||
int code, pattern;
|
||||
|
||||
@ -271,39 +291,23 @@ static inline void rv34_decode_block(DCTELEM *dst, GetBitContext *gb, RV34VLC *r
|
||||
pattern = code & 0x7;
|
||||
|
||||
code >>= 3;
|
||||
decode_subblock(dst, code, 0, gb, &rvlc->coefficient);
|
||||
decode_subblock3(dst, code, 0, gb, &rvlc->coefficient, q_dc, q_ac1, q_ac2);
|
||||
|
||||
if(pattern & 4){
|
||||
code = get_vlc2(gb, rvlc->second_pattern[sc].table, 9, 2);
|
||||
decode_subblock(dst + 2, code, 0, gb, &rvlc->coefficient);
|
||||
decode_subblock(dst + 2, code, 0, gb, &rvlc->coefficient, q_ac2);
|
||||
}
|
||||
if(pattern & 2){ // Looks like coefficients 1 and 2 are swapped for this block
|
||||
code = get_vlc2(gb, rvlc->second_pattern[sc].table, 9, 2);
|
||||
decode_subblock(dst + 8*2, code, 1, gb, &rvlc->coefficient);
|
||||
decode_subblock(dst + 8*2, code, 1, gb, &rvlc->coefficient, q_ac2);
|
||||
}
|
||||
if(pattern & 1){
|
||||
code = get_vlc2(gb, rvlc->third_pattern[sc].table, 9, 2);
|
||||
decode_subblock(dst + 8*2+2, code, 0, gb, &rvlc->coefficient);
|
||||
decode_subblock(dst + 8*2+2, code, 0, gb, &rvlc->coefficient, q_ac2);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Dequantize 4x4 block of DC values for 16x16 macroblock.
|
||||
* @todo optimize
|
||||
*/
|
||||
static inline void rv34_dequant4x4_16x16(DCTELEM *block, int Qdc, int Q)
|
||||
{
|
||||
int i;
|
||||
|
||||
for(i = 0; i < 3; i++)
|
||||
block[rv34_dezigzag[i]] = (block[rv34_dezigzag[i]] * Qdc + 8) >> 4;
|
||||
for(; i < 16; i++)
|
||||
block[rv34_dezigzag[i]] = (block[rv34_dezigzag[i]] * Q + 8) >> 4;
|
||||
}
|
||||
/** @} */ //block functions
|
||||
|
||||
|
||||
/**
|
||||
* @name RV30/40 bitstream parsing
|
||||
* @{
|
||||
@ -1097,6 +1101,7 @@ static int rv34_decode_macroblock(RV34DecContext *r, int8_t *intra_types)
|
||||
MpegEncContext *s = &r->s;
|
||||
GetBitContext *gb = &s->gb;
|
||||
int cbp, cbp2;
|
||||
int q_dc, q_ac;
|
||||
int i, blknum, blkoff;
|
||||
LOCAL_ALIGNED_16(DCTELEM, block16, [64]);
|
||||
int luma_dc_quant;
|
||||
@ -1133,31 +1138,34 @@ static int rv34_decode_macroblock(RV34DecContext *r, int8_t *intra_types)
|
||||
|
||||
luma_dc_quant = r->block_type == RV34_MB_P_MIX16x16 ? r->luma_dc_quant_p[s->qscale] : r->luma_dc_quant_i[s->qscale];
|
||||
if(r->is16){
|
||||
q_dc = rv34_qscale_tab[luma_dc_quant];
|
||||
q_ac = rv34_qscale_tab[s->qscale];
|
||||
memset(block16, 0, 64 * sizeof(*block16));
|
||||
rv34_decode_block(block16, gb, r->cur_vlcs, 3, 0);
|
||||
rv34_dequant4x4_16x16(block16, rv34_qscale_tab[luma_dc_quant],rv34_qscale_tab[s->qscale]);
|
||||
rv34_decode_block(block16, gb, r->cur_vlcs, 3, 0, q_dc, q_dc, q_ac);
|
||||
r->rdsp.rv34_inv_transform_tab[1](block16);
|
||||
}
|
||||
|
||||
q_ac = rv34_qscale_tab[s->qscale];
|
||||
for(i = 0; i < 16; i++, cbp >>= 1){
|
||||
if(!r->is16 && !(cbp & 1)) continue;
|
||||
blknum = ((i & 2) >> 1) + ((i & 8) >> 2);
|
||||
blkoff = ((i & 1) << 2) + ((i & 4) << 3);
|
||||
if(cbp & 1)
|
||||
rv34_decode_block(s->block[blknum] + blkoff, gb, r->cur_vlcs, r->luma_vlc, 0);
|
||||
r->rdsp.rv34_dequant4x4(s->block[blknum] + blkoff, rv34_qscale_tab[s->qscale],rv34_qscale_tab[s->qscale]);
|
||||
rv34_decode_block(s->block[blknum] + blkoff, gb,
|
||||
r->cur_vlcs, r->luma_vlc, 0, q_ac, q_ac, q_ac);
|
||||
if(r->is16) //FIXME: optimize
|
||||
s->block[blknum][blkoff] = block16[(i & 3) | ((i & 0xC) << 1)];
|
||||
r->rdsp.rv34_inv_transform_tab[0](s->block[blknum] + blkoff);
|
||||
}
|
||||
if(r->block_type == RV34_MB_P_MIX16x16)
|
||||
r->cur_vlcs = choose_vlc_set(r->si.quant, r->si.vlc_set, 1);
|
||||
q_dc = rv34_qscale_tab[rv34_chroma_quant[1][s->qscale]];
|
||||
q_ac = rv34_qscale_tab[rv34_chroma_quant[0][s->qscale]];
|
||||
for(; i < 24; i++, cbp >>= 1){
|
||||
if(!(cbp & 1)) continue;
|
||||
blknum = ((i & 4) >> 2) + 4;
|
||||
blkoff = ((i & 1) << 2) + ((i & 2) << 4);
|
||||
rv34_decode_block(s->block[blknum] + blkoff, gb, r->cur_vlcs, r->chroma_vlc, 1);
|
||||
r->rdsp.rv34_dequant4x4(s->block[blknum] + blkoff, rv34_qscale_tab[rv34_chroma_quant[1][s->qscale]],rv34_qscale_tab[rv34_chroma_quant[0][s->qscale]]);
|
||||
rv34_decode_block(s->block[blknum] + blkoff, gb, r->cur_vlcs, r->chroma_vlc, 1, q_dc, q_ac, q_ac);
|
||||
r->rdsp.rv34_inv_transform_tab[0](s->block[blknum] + blkoff);
|
||||
}
|
||||
if (IS_INTRA(s->current_picture_ptr->f.mb_type[mb_pos]))
|
||||
|
@ -100,16 +100,6 @@ static const uint16_t rv34_qscale_tab[32] = {
|
||||
963, 1074, 1212, 1392, 1566, 1708, 1978, 2211
|
||||
};
|
||||
|
||||
/**
|
||||
* 4x4 dezigzag pattern
|
||||
*/
|
||||
static const uint8_t rv34_dezigzag[16] = {
|
||||
0, 1, 8, 16,
|
||||
9, 2, 3, 10,
|
||||
17, 24, 25, 18,
|
||||
11, 19, 26, 27
|
||||
};
|
||||
|
||||
/**
|
||||
* tables used to translate a quantizer value into a VLC set for decoding
|
||||
* The first table is used for intraframes.
|
||||
|
@ -100,26 +100,10 @@ static void rv34_inv_transform_noround_c(DCTELEM *block){
|
||||
/** @} */ // transform
|
||||
|
||||
|
||||
/**
|
||||
* Dequantize ordinary 4x4 block.
|
||||
*/
|
||||
void ff_rv34_dequant4x4_neon(DCTELEM *block, int Qdc, int Q);
|
||||
static void rv34_dequant4x4_c(DCTELEM *block, int Qdc, int Q)
|
||||
{
|
||||
int i, j;
|
||||
|
||||
block[0] = (block[0] * Qdc + 8) >> 4;
|
||||
for (i = 0; i < 4; i++)
|
||||
for (j = !i; j < 4; j++)
|
||||
block[j + i*8] = (block[j + i*8] * Q + 8) >> 4;
|
||||
}
|
||||
|
||||
av_cold void ff_rv34dsp_init(RV34DSPContext *c, DSPContext* dsp) {
|
||||
c->rv34_inv_transform_tab[0] = rv34_inv_transform_c;
|
||||
c->rv34_inv_transform_tab[1] = rv34_inv_transform_noround_c;
|
||||
|
||||
c->rv34_dequant4x4 = rv34_dequant4x4_c;
|
||||
|
||||
if (HAVE_NEON)
|
||||
ff_rv34dsp_init_neon(c, dsp);
|
||||
}
|
||||
|
@ -56,7 +56,6 @@ typedef struct RV34DSPContext {
|
||||
h264_chroma_mc_func avg_chroma_pixels_tab[3];
|
||||
rv40_weight_func rv40_weight_pixels_tab[2];
|
||||
rv34_inv_transform_func rv34_inv_transform_tab[2];
|
||||
void (*rv34_dequant4x4)(DCTELEM *block, int Qdc, int Q);
|
||||
rv40_weak_loop_filter_func rv40_weak_loop_filter[2];
|
||||
rv40_strong_loop_filter_func rv40_strong_loop_filter[2];
|
||||
rv40_loop_filter_strength_func rv40_loop_filter_strength[2];
|
||||
|
Loading…
Reference in New Issue
Block a user