diff --git a/vp8/encoder/arm/quantize_arm.c b/vp8/encoder/arm/quantize_arm.c index 65c616614..225feaca6 100644 --- a/vp8/encoder/arm/quantize_arm.c +++ b/vp8/encoder/arm/quantize_arm.c @@ -29,7 +29,7 @@ extern int vp8_fast_quantize_b_neon_func(short *coeff_ptr, short *zbin_ptr, shor void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) { - d->eob = vp8_fast_quantize_b_neon_func(b->coeff, b->zbin, d->qcoeff, d->dqcoeff, d->dequant, vp8_rvsplus1_default_zig_zag1d, b->round, b->quant); + d->eob = vp8_fast_quantize_b_neon_func(b->coeff, b->zbin, d->qcoeff, d->dqcoeff, d->dequant, vp8_rvsplus1_default_zig_zag1d, b->round, b->quant_fast); } /* diff --git a/vp8/encoder/block.h b/vp8/encoder/block.h index 90b42c35c..bf94e508b 100644 --- a/vp8/encoder/block.h +++ b/vp8/encoder/block.h @@ -33,6 +33,7 @@ typedef struct // 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries short *quant; + short *quant_fast; short *quant_shift; short *zbin; short *zrun_zbin_boost; diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c index 152838946..cb7cc65d7 100644 --- a/vp8/encoder/encodeframe.c +++ b/vp8/encoder/encodeframe.c @@ -179,6 +179,7 @@ void vp8cx_init_quantizer(VP8_COMP *cpi) { // dc values quant_val = vp8_dc_quant(Q, cpi->common.y1dc_delta_q); + cpi->Y1quant_fast[Q][0] = (1 << 16) / quant_val; vp8cx_invert_quant(cpi->sf.improved_quant, cpi->Y1quant[Q] + 0, cpi->Y1quant_shift[Q] + 0, quant_val); cpi->Y1zbin[Q][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7; @@ -187,6 +188,7 @@ void vp8cx_init_quantizer(VP8_COMP *cpi) cpi->zrun_zbin_boost_y1[Q][0] = (quant_val * zbin_boost[0]) >> 7; quant_val = vp8_dc2quant(Q, cpi->common.y2dc_delta_q); + cpi->Y2quant_fast[Q][0] = (1 << 16) / quant_val; vp8cx_invert_quant(cpi->sf.improved_quant, cpi->Y2quant[Q] + 0, cpi->Y2quant_shift[Q] + 0, quant_val); cpi->Y2zbin[Q][0] = ((qzbin_factors_y2[Q] * quant_val) + 64) >> 7; @@ -195,6 +197,7 @@ void vp8cx_init_quantizer(VP8_COMP *cpi) cpi->zrun_zbin_boost_y2[Q][0] = (quant_val * zbin_boost[0]) >> 7; quant_val = vp8_dc_uv_quant(Q, cpi->common.uvdc_delta_q); + cpi->UVquant_fast[Q][0] = (1 << 16) / quant_val; vp8cx_invert_quant(cpi->sf.improved_quant, cpi->UVquant[Q] + 0, cpi->UVquant_shift[Q] + 0, quant_val); cpi->UVzbin[Q][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;; @@ -208,6 +211,7 @@ void vp8cx_init_quantizer(VP8_COMP *cpi) int rc = vp8_default_zig_zag1d[i]; quant_val = vp8_ac_yquant(Q); + cpi->Y1quant_fast[Q][rc] = (1 << 16) / quant_val; vp8cx_invert_quant(cpi->sf.improved_quant, cpi->Y1quant[Q] + rc, cpi->Y1quant_shift[Q] + rc, quant_val); cpi->Y1zbin[Q][rc] = ((qzbin_factors[Q] * quant_val) + 64) >> 7; @@ -216,6 +220,7 @@ void vp8cx_init_quantizer(VP8_COMP *cpi) cpi->zrun_zbin_boost_y1[Q][i] = (quant_val * zbin_boost[i]) >> 7; quant_val = vp8_ac2quant(Q, cpi->common.y2ac_delta_q); + cpi->Y2quant_fast[Q][rc] = (1 << 16) / quant_val; vp8cx_invert_quant(cpi->sf.improved_quant, cpi->Y2quant[Q] + rc, cpi->Y2quant_shift[Q] + rc, quant_val); cpi->Y2zbin[Q][rc] = ((qzbin_factors_y2[Q] * quant_val) + 64) >> 7; @@ -224,6 +229,7 @@ void vp8cx_init_quantizer(VP8_COMP *cpi) cpi->zrun_zbin_boost_y2[Q][i] = (quant_val * zbin_boost[i]) >> 7; quant_val = vp8_ac_uv_quant(Q, cpi->common.uvac_delta_q); + cpi->UVquant_fast[Q][rc] = (1 << 16) / quant_val; vp8cx_invert_quant(cpi->sf.improved_quant, cpi->UVquant[Q] + rc, cpi->UVquant_shift[Q] + rc, quant_val); cpi->UVzbin[Q][rc] = ((qzbin_factors[Q] * quant_val) + 64) >> 7; @@ -325,6 +331,7 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x) for (i = 0; i < 16; i++) { x->block[i].quant = cpi->Y1quant[QIndex]; + x->block[i].quant_fast = cpi->Y1quant_fast[QIndex]; x->block[i].quant_shift = cpi->Y1quant_shift[QIndex]; x->block[i].zbin = cpi->Y1zbin[QIndex]; x->block[i].round = cpi->Y1round[QIndex]; @@ -339,6 +346,7 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x) for (i = 16; i < 24; i++) { x->block[i].quant = cpi->UVquant[QIndex]; + x->block[i].quant_fast = cpi->UVquant_fast[QIndex]; x->block[i].quant_shift = cpi->UVquant_shift[QIndex]; x->block[i].zbin = cpi->UVzbin[QIndex]; x->block[i].round = cpi->UVround[QIndex]; @@ -349,6 +357,7 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x) // Y2 zbin_extra = (cpi->common.Y2dequant[QIndex][1] * ((cpi->zbin_over_quant / 2) + cpi->zbin_mode_boost)) >> 7; + x->block[24].quant_fast = cpi->Y2quant_fast[QIndex]; x->block[24].quant = cpi->Y2quant[QIndex]; x->block[24].quant_shift = cpi->Y2quant_shift[QIndex]; x->block[24].zbin = cpi->Y2zbin[QIndex]; @@ -1270,7 +1279,18 @@ int vp8cx_encode_inter_macroblock if (cpi->sf.RD) { + /* Are we using the fast quantizer for the mode selection? */ + if(cpi->sf.use_fastquant_for_pick) + cpi->mb.quantize_b = QUANTIZE_INVOKE(&cpi->rtcd.quantize, fastquantb); + inter_error = vp8_rd_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate, &distortion, &intra_error); + + /* switch back to the regular quantizer for the encode */ + if (cpi->sf.improved_quant) + { + cpi->mb.quantize_b = QUANTIZE_INVOKE(&cpi->rtcd.quantize, quantb); + } + } else #endif diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 1f890790c..05a1338dc 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -591,6 +591,7 @@ void vp8_set_speed_features(VP8_COMP *cpi) sf->max_fs_radius = 32; sf->iterative_sub_pixel = 1; sf->optimize_coefficients = 1; + sf->use_fastquant_for_pick = 0; sf->first_step = 0; sf->max_step_search_steps = MAX_MVSEARCH_STEPS; @@ -758,7 +759,7 @@ void vp8_set_speed_features(VP8_COMP *cpi) cpi->mode_check_freq[THR_SPLITG] = 4; cpi->mode_check_freq[THR_SPLITA] = 4; - cpi->mode_check_freq[THR_SPLITMV] = 0; + cpi->mode_check_freq[THR_SPLITMV] = 2; sf->thresh_mult[THR_TM ] = 1500; sf->thresh_mult[THR_V_PRED ] = 1500; @@ -789,8 +790,7 @@ void vp8_set_speed_features(VP8_COMP *cpi) sf->thresh_mult[THR_SPLITA ] = 20000; } - sf->improved_quant = 0; - sf->improved_dct = 0; + sf->use_fastquant_for_pick = 1; sf->first_step = 1; sf->max_step_search_steps = MAX_MVSEARCH_STEPS; @@ -798,6 +798,8 @@ void vp8_set_speed_features(VP8_COMP *cpi) if (Speed > 1) { + sf->use_fastquant_for_pick = 0; + cpi->mode_check_freq[THR_SPLITG] = 15; cpi->mode_check_freq[THR_SPLITA] = 15; cpi->mode_check_freq[THR_SPLITMV] = 7; @@ -831,6 +833,11 @@ void vp8_set_speed_features(VP8_COMP *cpi) sf->thresh_mult[THR_SPLITA ] = 50000; } + sf->first_step = 1; + + sf->improved_quant = 0; + sf->improved_dct = 0; + // Only do recode loop on key frames, golden frames and // alt ref frames sf->recode_loop = 2; diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h index 990ae1d9e..ab270ca5f 100644 --- a/vp8/encoder/onyx_int.h +++ b/vp8/encoder/onyx_int.h @@ -182,6 +182,8 @@ typedef struct int first_step; int optimize_coefficients; + int use_fastquant_for_pick; + } SPEED_FEATURES; typedef struct @@ -269,6 +271,9 @@ typedef struct DECLARE_ALIGNED(16, short, zrun_zbin_boost_y1[QINDEX_RANGE][16]); DECLARE_ALIGNED(16, short, zrun_zbin_boost_y2[QINDEX_RANGE][16]); DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv[QINDEX_RANGE][16]); + DECLARE_ALIGNED(16, short, Y1quant_fast[QINDEX_RANGE][16]); + DECLARE_ALIGNED(16, short, Y2quant_fast[QINDEX_RANGE][16]); + DECLARE_ALIGNED(16, short, UVquant_fast[QINDEX_RANGE][16]); MACROBLOCK mb; diff --git a/vp8/encoder/quantize.c b/vp8/encoder/quantize.c index a1be6614b..a67299487 100644 --- a/vp8/encoder/quantize.c +++ b/vp8/encoder/quantize.c @@ -27,7 +27,7 @@ void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d) short *coeff_ptr = b->coeff; short *zbin_ptr = b->zbin; short *round_ptr = b->round; - short *quant_ptr = b->quant; + short *quant_ptr = b->quant_fast; short *quant_shift_ptr = b->quant_shift; short *qcoeff_ptr = d->qcoeff; short *dqcoeff_ptr = d->dqcoeff; @@ -74,7 +74,7 @@ void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d) int x, y, z, sz; short *coeff_ptr = b->coeff; short *round_ptr = b->round; - short *quant_ptr = b->quant; + short *quant_ptr = b->quant_fast; short *qcoeff_ptr = d->qcoeff; short *dqcoeff_ptr = d->dqcoeff; short *dequant_ptr = d->dequant; diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c index 70be262f8..e6c7c9ab3 100644 --- a/vp8/encoder/rdopt.c +++ b/vp8/encoder/rdopt.c @@ -1026,6 +1026,7 @@ static const unsigned int segmentation_to_sseshift[4] = {3, 3, 2, 0}; typedef struct { MV *ref_mv; + MV *mvp; int segment_rd; int segment_num; @@ -1039,6 +1040,9 @@ typedef struct int mvthresh; int *mdcounts; + MV sv_mvp[4]; // save 4 mvp from 8x8 + int sv_istep[2]; // save 2 initial step_param for 16x8/8x16 + } BEST_SEG_INFO; @@ -1124,7 +1128,7 @@ void vp8_rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi, int sseshift; int num00; int step_param = 0; - int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param; + int further_steps; int n; int thissme; int bestsme = INT_MAX; @@ -1136,6 +1140,27 @@ void vp8_rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi, if (best_label_rd < label_mv_thresh) break; + if(cpi->compressor_speed) + { + if (segmentation == BLOCK_8X16 || segmentation == BLOCK_16X8) + { + bsi->mvp = &bsi->sv_mvp[i]; + if (i==1 && segmentation == BLOCK_16X8) bsi->mvp = &bsi->sv_mvp[2]; + + step_param = bsi->sv_istep[i]; + } + + // use previous block's result as next block's MV predictor. + if (segmentation == BLOCK_4X4 && i>0) + { + bsi->mvp = &(x->e_mbd.block[i-1].bmi.mv.as_mv); + if (i==4 || i==8 || i==12) bsi->mvp = &(x->e_mbd.block[i-4].bmi.mv.as_mv); + step_param = 2; + } + } + + further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param; + { int sadpb = x->sadperbit4; @@ -1151,7 +1176,7 @@ void vp8_rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi, else { - bestsme = cpi->diamond_search_sad(x, c, e, bsi->ref_mv, + bestsme = cpi->diamond_search_sad(x, c, e, bsi->mvp, &mode_mv[NEW4X4], step_param, sadpb / 2, &num00, v_fn_ptr, x->mvsadcost, x->mvcost, bsi->ref_mv); @@ -1166,7 +1191,7 @@ void vp8_rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi, num00--; else { - thissme = cpi->diamond_search_sad(x, c, e, bsi->ref_mv, + thissme = cpi->diamond_search_sad(x, c, e, bsi->mvp, &temp_mv, step_param + n, sadpb / 2, &num00, v_fn_ptr, x->mvsadcost, x->mvcost, bsi->ref_mv); @@ -1185,7 +1210,7 @@ void vp8_rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi, // Should we do a full search (best quality only) if ((cpi->compressor_speed == 0) && (bestsme >> sseshift) > 4000) { - thissme = cpi->full_search_sad(x, c, e, bsi->ref_mv, + thissme = cpi->full_search_sad(x, c, e, bsi->mvp, sadpb / 4, 16, v_fn_ptr, x->mvcost, x->mvsadcost,bsi->ref_mv); if (thissme < bestsme) @@ -1254,8 +1279,9 @@ void vp8_rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi, segmentyrate += bestlabelyrate; this_segment_rd += best_label_rd; - if (this_segment_rd > bsi->segment_rd) + if (this_segment_rd >= bsi->segment_rd) break; + } /* for each label */ if (this_segment_rd < bsi->segment_rd) @@ -1277,6 +1303,21 @@ void vp8_rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi, } } } + +static __inline +void vp8_cal_step_param(int sr, int *sp) +{ + int step = 0; + + if (sr > MAX_FIRST_STEP) sr = MAX_FIRST_STEP; + else if (sr < 1) sr = 1; + + while (sr>>=1) + step++; + + *sp = MAX_MVSEARCH_STEPS - 1 - step; +} + static int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x, MV *best_ref_mv, int best_rd, int *mdcounts, int *returntotrate, @@ -1285,14 +1326,12 @@ static int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x, { int i; BEST_SEG_INFO bsi; - BEST_SEG_INFO bsi_8x8; - int check_8x16 = 0; - int check_16x8 = 0; vpx_memset(&bsi, 0, sizeof(bsi)); bsi.segment_rd = best_rd; bsi.ref_mv = best_ref_mv; + bsi.mvp = best_ref_mv; bsi.mvthresh = mvthresh; bsi.mdcounts = mdcounts; @@ -1300,6 +1339,7 @@ static int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x, { bsi.modes[i] = ZERO4X4; } + if(cpi->compressor_speed == 0) { /* for now, we will keep the original segmentation order @@ -1311,12 +1351,73 @@ static int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x, } else { + int sr; + vp8_rd_check_segment(cpi, x, &bsi, BLOCK_8X8); + if (bsi.segment_rd < best_rd) { - vp8_rd_check_segment(cpi, x, &bsi, BLOCK_8X16); - vp8_rd_check_segment(cpi, x, &bsi, BLOCK_16X8); - vp8_rd_check_segment(cpi, x, &bsi, BLOCK_4X4); + int col_min = (best_ref_mv->col - MAX_POSSIBLE_MV) >>3; + int col_max = (best_ref_mv->col + MAX_POSSIBLE_MV) >>3; + int row_min = (best_ref_mv->row - MAX_POSSIBLE_MV) >>3; + int row_max = (best_ref_mv->row + MAX_POSSIBLE_MV) >>3; + + int tmp_col_min = x->mv_col_min; + int tmp_col_max = x->mv_col_max; + int tmp_row_min = x->mv_row_min; + int tmp_row_max = x->mv_row_max; + + /* Get intersection of UMV window and valid MV window to reduce # of checks in diamond search. */ + if (x->mv_col_min < col_min ) + x->mv_col_min = col_min; + if (x->mv_col_max > col_max ) + x->mv_col_max = col_max; + if (x->mv_row_min < row_min ) + x->mv_row_min = row_min; + if (x->mv_row_max > row_max ) + x->mv_row_max = row_max; + + /* Get 8x8 result */ + bsi.sv_mvp[0] = bsi.mvs[0].as_mv; + bsi.sv_mvp[1] = bsi.mvs[2].as_mv; + bsi.sv_mvp[2] = bsi.mvs[8].as_mv; + bsi.sv_mvp[3] = bsi.mvs[10].as_mv; + + /* Use 8x8 result as 16x8/8x16's predictor MV. Adjust search range according to the closeness of 2 MV. */ + /* block 8X16 */ + { + sr = MAXF((abs(bsi.sv_mvp[0].row - bsi.sv_mvp[2].row))>>3, (abs(bsi.sv_mvp[0].col - bsi.sv_mvp[2].col))>>3); + vp8_cal_step_param(sr, &bsi.sv_istep[0]); + + sr = MAXF((abs(bsi.sv_mvp[1].row - bsi.sv_mvp[3].row))>>3, (abs(bsi.sv_mvp[1].col - bsi.sv_mvp[3].col))>>3); + vp8_cal_step_param(sr, &bsi.sv_istep[1]); + + vp8_rd_check_segment(cpi, x, &bsi, BLOCK_8X16); + } + + /* block 16X8 */ + { + sr = MAXF((abs(bsi.sv_mvp[0].row - bsi.sv_mvp[1].row))>>3, (abs(bsi.sv_mvp[0].col - bsi.sv_mvp[1].col))>>3); + vp8_cal_step_param(sr, &bsi.sv_istep[0]); + + sr = MAXF((abs(bsi.sv_mvp[2].row - bsi.sv_mvp[3].row))>>3, (abs(bsi.sv_mvp[2].col - bsi.sv_mvp[3].col))>>3); + vp8_cal_step_param(sr, &bsi.sv_istep[1]); + + vp8_rd_check_segment(cpi, x, &bsi, BLOCK_16X8); + } + + /* If 8x8 is better than 16x8/8x16, then do 4x4 search */ + if (bsi.segment_num == BLOCK_8X8) /* || (sv_segment_rd8x8-bsi.segment_rd) < sv_segment_rd8x8>>5) */ + { + bsi.mvp = &bsi.sv_mvp[0]; + vp8_rd_check_segment(cpi, x, &bsi, BLOCK_4X4); + } + + /* restore UMV window */ + x->mv_col_min = tmp_col_min; + x->mv_col_max = tmp_col_max; + x->mv_row_min = tmp_row_min; + x->mv_row_max = tmp_row_max; } } diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c index d2199a499..6e317e2a2 100644 --- a/vp8/encoder/x86/x86_csystemdependent.c +++ b/vp8/encoder/x86/x86_csystemdependent.c @@ -32,7 +32,7 @@ void vp8_fast_quantize_b_mmx(BLOCK *b, BLOCKD *d) short *coeff_ptr = b->coeff; short *zbin_ptr = b->zbin; short *round_ptr = b->round; - short *quant_ptr = b->quant; + short *quant_ptr = b->quant_fast; short *qcoeff_ptr = d->qcoeff; short *dqcoeff_ptr = d->dqcoeff; short *dequant_ptr = d->dequant; @@ -90,7 +90,7 @@ void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d) short *scan_mask = vp8_default_zig_zag_mask;//d->scan_order_mask_ptr; short *coeff_ptr = b->coeff; short *round_ptr = b->round; - short *quant_ptr = b->quant; + short *quant_ptr = b->quant_fast; short *qcoeff_ptr = d->qcoeff; short *dqcoeff_ptr = d->dqcoeff; short *dequant_ptr = d->dequant; @@ -183,7 +183,7 @@ void vp8_fast_quantize_b_ssse3(BLOCK *b, BLOCKD *d) d->qcoeff, d->dequant, b->round, - b->quant, + b->quant_fast, d->dqcoeff ); }