From 58ca6f65b7d1ad6f65ffddcb3d4c0dcb6f259e0b Mon Sep 17 00:00:00 2001 From: Pascal Massimino Date: Tue, 26 Feb 2013 05:22:44 -0800 Subject: [PATCH] rebalance method tools (-m) for methods [0..4] (methods 5 and 6 are still untouched). Methods #0 and #1 got much faster Method #2 gets vastly improved in quality Method #3 is noticeably faster for slightly lower quality Method #4 (default) is 10-20% faster for comparable quality + update the internal doc about the methods' tools. Example of speed difference: Time to encode picture: Method | Before | After -m 0 | 1.272s | 0.517s -m 1 | 1.295s | 0.623s -m 2 | 2.217s | 0.834s -m 3 | 2.816s | 2.243s -m 4 | 3.235s | 3.014s -m 5 | 3.668s | 3.654s -m 6 | 8.296s | 8.235s Change-Id: Ic41fda5de65066b3a6586cb8ae1ebb0206d47fe0 --- src/enc/analysis.c | 22 +++++++++----- src/enc/frame.c | 13 +++++++-- src/enc/quant.c | 71 +++++++++++++++++++++++++++++++++++++++++----- src/enc/vp8enci.h | 3 +- src/enc/webpenc.c | 36 ++++++++++++----------- 5 files changed, 111 insertions(+), 34 deletions(-) diff --git a/src/enc/analysis.c b/src/enc/analysis.c index 6ea87745..221e9d06 100644 --- a/src/enc/analysis.c +++ b/src/enc/analysis.c @@ -223,14 +223,18 @@ static void AssignSegments(VP8Encoder* const enc, // susceptibility and set best modes for this macroblock. // Segment assignment is done later. -// Number of modes to inspect for alpha_ evaluation. For high-quality settings, -// we don't need to test all the possible modes during the analysis phase. +// Number of modes to inspect for alpha_ evaluation. For high-quality settings +// (method >= FAST_ANALYSIS_METHOD) we don't need to test all the possible modes +// during the analysis phase. +#define FAST_ANALYSIS_METHOD 4 // lowest method that does partial analysis #define MAX_INTRA16_MODE 2 #define MAX_INTRA4_MODE 2 #define MAX_UV_MODE 2 static int MBAnalyzeBestIntra16Mode(VP8EncIterator* const it) { - const int max_mode = (it->enc_->method_ >= 3) ? 
MAX_INTRA16_MODE : 4; + const int max_mode = + (it->enc_->method_ >= FAST_ANALYSIS_METHOD) ? MAX_INTRA16_MODE + : NUM_PRED_MODES; int mode; int best_alpha = DEFAULT_ALPHA; int best_mode = 0; @@ -256,7 +260,9 @@ static int MBAnalyzeBestIntra16Mode(VP8EncIterator* const it) { static int MBAnalyzeBestIntra4Mode(VP8EncIterator* const it, int best_alpha) { uint8_t modes[16]; - const int max_mode = (it->enc_->method_ >= 3) ? MAX_INTRA4_MODE : NUM_BMODES; + const int max_mode = + (it->enc_->method_ >= FAST_ANALYSIS_METHOD) ? MAX_INTRA4_MODE + : NUM_BMODES; int i4_alpha; VP8Histogram total_histo = { { 0 } }; int cur_histo = 0; @@ -298,7 +304,9 @@ static int MBAnalyzeBestIntra4Mode(VP8EncIterator* const it, static int MBAnalyzeBestUVMode(VP8EncIterator* const it) { int best_alpha = DEFAULT_ALPHA; int best_mode = 0; - const int max_mode = (it->enc_->method_ >= 3) ? MAX_UV_MODE : 4; + const int max_mode = + (it->enc_->method_ >= FAST_ANALYSIS_METHOD) ? MAX_UV_MODE + : NUM_PRED_MODES; int mode; VP8MakeChroma8Preds(it); for (mode = 0; mode < max_mode; ++mode) { @@ -328,7 +336,7 @@ static void MBAnalyze(VP8EncIterator* const it, VP8SetSegment(it, 0); // default segment, spec-wise. best_alpha = MBAnalyzeBestIntra16Mode(it); - if (enc->method_ != 3) { + if (enc->method_ >= 5) { // We go and make a fast decision for intra4/intra16. // It's usually not a good and definitive pick, but helps seeding the stats // about level bit-cost. @@ -383,7 +391,7 @@ int VP8EncAnalyze(VP8Encoder* const enc) { const int do_segments = enc->config_->emulate_jpeg_size || // We need the complexity evaluation. (enc->segment_hdr_.num_segments_ > 1) || - (enc->method_ <= 2); // for methods 0,1,2, we need preds_[] to be filled. + (enc->method_ == 0); // for method 0, we need preds_[] to be filled. 
enc->alpha_ = 0; enc->uv_alpha_ = 0; if (do_segments) { diff --git a/src/enc/frame.c b/src/enc/frame.c index 3566d81c..d7965bb0 100644 --- a/src/enc/frame.c +++ b/src/enc/frame.c @@ -811,9 +811,10 @@ static int OneStatPass(VP8Encoder* const enc, float q, VP8RDLevel rd_opt, static const int dqs[] = { 20, 15, 10, 8, 6, 4, 2, 1, 0 }; int VP8StatLoop(VP8Encoder* const enc) { + const int method = enc->method_; const int do_search = (enc->config_->target_size > 0 || enc->config_->target_PSNR > 0); - const int fast_probe = (enc->method_ < 2 && !do_search); + const int fast_probe = ((method == 0 || method == 3) && !do_search); float q = enc->config_->quality; const int max_passes = enc->config_->pass; const int task_percent = 20; @@ -824,12 +825,18 @@ int VP8StatLoop(VP8Encoder* const enc) { // Fast mode: quick analysis pass over few mbs. Better than nothing. nb_mbs = enc->mb_w_ * enc->mb_h_; - if (fast_probe && nb_mbs > 100) nb_mbs = 100; + if (fast_probe) { + if (method == 3) { // we need more stats for method 3 to be reliable. + nb_mbs = (nb_mbs > 200) ? nb_mbs >> 1 : 100; + } else { + nb_mbs = (nb_mbs > 200) ? nb_mbs >> 2 : 50; + } + } // No target size: just do several pass without changing 'q' if (!do_search) { for (pass = 0; pass < max_passes; ++pass) { - const VP8RDLevel rd_opt = (enc->method_ > 2) ? RD_OPT_BASIC : RD_OPT_NONE; + const VP8RDLevel rd_opt = (method >= 3) ? RD_OPT_BASIC : RD_OPT_NONE; if (!OneStatPass(enc, q, rd_opt, nb_mbs, NULL, percent_per_pass)) { return 0; } diff --git a/src/enc/quant.c b/src/enc/quant.c index d74fa373..dcfd4d16 100644 --- a/src/enc/quant.c +++ b/src/enc/quant.c @@ -27,6 +27,8 @@ #define SNS_TO_DQ 0.9 // Scaling constant between the sns value and the QP // power-law modulation. Must be strictly less than 1. 
+#define I4_PENALTY 4000 // Rate-penalty for quick i4/i16 decision + #define MULT_8B(a, b) (((a) * (b) + 128) >> 8) #if defined(__cplusplus) || defined(c_plusplus) @@ -773,7 +775,7 @@ static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* const rd) { int mode; rd->mode_i16 = -1; - for (mode = 0; mode < 4; ++mode) { + for (mode = 0; mode < NUM_PRED_MODES; ++mode) { uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF; // scratch buffer int nz; @@ -902,7 +904,7 @@ static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) { rd->mode_uv = -1; InitScore(&rd_best); - for (mode = 0; mode < 4; ++mode) { + for (mode = 0; mode < NUM_PRED_MODES; ++mode) { VP8ModeScore rd_uv; // Reconstruct @@ -931,10 +933,10 @@ static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) { static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) { const VP8Encoder* const enc = it->enc_; - const int i16 = (it->mb_->type_ == 1); + const int is_i16 = (it->mb_->type_ == 1); int nz = 0; - if (i16) { + if (is_i16) { nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF, it->preds_[0]); } else { VP8IteratorStartI4(it); @@ -953,12 +955,66 @@ static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) { rd->nz = nz; } +// Refine intra16/intra4 sub-modes based on distortion only (not rate). 
+static void DistoRefine(VP8EncIterator* const it, int try_both_i4_i16) { + const int is_i16 = (it->mb_->type_ == 1); + score_t best_score = MAX_COST; + + if (try_both_i4_i16 || is_i16) { + int mode; + int best_mode = -1; + for (mode = 0; mode < NUM_PRED_MODES; ++mode) { + const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode]; + const uint8_t* const src = it->yuv_in_ + Y_OFF; + const score_t score = VP8SSE16x16(src, ref); + if (score < best_score) { + best_mode = mode; + best_score = score; + } + } + VP8SetIntra16Mode(it, best_mode); + } + if (try_both_i4_i16 || !is_i16) { + uint8_t modes_i4[16]; + // We don't evaluate the rate here, but just account for it through a + // constant penalty (i4 mode usually needs more bits compared to i16). + score_t score_i4 = (score_t)I4_PENALTY; + + VP8IteratorStartI4(it); + do { + int mode; + int best_sub_mode = -1; + score_t best_sub_score = MAX_COST; + const uint8_t* const src = it->yuv_in_ + Y_OFF + VP8Scan[it->i4_]; + + // TODO(skal): we don't really need the prediction pixels here, + // but just the distortion against 'src'. 
+ VP8MakeIntra4Preds(it); + for (mode = 0; mode < NUM_BMODES; ++mode) { + const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode]; + const score_t score = VP8SSE4x4(src, ref); + if (score < best_sub_score) { + best_sub_mode = mode; + best_sub_score = score; + } + } + modes_i4[it->i4_] = best_sub_mode; + score_i4 += best_sub_score; + if (score_i4 >= best_score) break; + } while (VP8IteratorRotateI4(it, it->yuv_in_ + Y_OFF)); + if (score_i4 < best_score) { + VP8SetIntra4Mode(it, modes_i4); + } + } +} + //------------------------------------------------------------------------------ // Entry point int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd, VP8RDLevel rd_opt) { int is_skipped; + const int method = it->enc_->method_; InitScore(rd); @@ -970,7 +1026,7 @@ int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd, if (rd_opt > RD_OPT_NONE) { it->do_trellis_ = (rd_opt >= RD_OPT_TRELLIS_ALL); PickBestIntra16(it, rd); - if (it->enc_->method_ >= 2) { + if (method >= 2) { PickBestIntra4(it, rd); } PickBestUV(it, rd); @@ -979,8 +1035,9 @@ int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd, SimpleQuantize(it, rd); } } else { - // TODO: for method_ == 2, pick the best intra4/intra16 based on SSE - it->do_trellis_ = (it->enc_->method_ == 2); + // For method == 2, pick the best intra4/intra16 based on SSE (~tad slower). + // For method <= 1, we refine intra4 or intra16 (but don't re-examine mode). 
+ DistoRefine(it, (method >= 2)); SimpleQuantize(it, rd); } is_skipped = (rd->nz == 0); diff --git a/src/enc/vp8enci.h b/src/enc/vp8enci.h index 153f9e51..21212ab4 100644 --- a/src/enc/vp8enci.h +++ b/src/enc/vp8enci.h @@ -44,7 +44,8 @@ enum { B_DC_PRED = 0, // 4x4 modes // Luma16 or UV modes DC_PRED = B_DC_PRED, V_PRED = B_VE_PRED, - H_PRED = B_HE_PRED, TM_PRED = B_TM_PRED + H_PRED = B_HE_PRED, TM_PRED = B_TM_PRED, + NUM_PRED_MODES = 4 }; enum { NUM_MB_SEGMENTS = 4, diff --git a/src/enc/webpenc.c b/src/enc/webpenc.c index 48ab4b39..b46678c2 100644 --- a/src/enc/webpenc.c +++ b/src/enc/webpenc.c @@ -93,22 +93,26 @@ static void ResetBoundaryPredictions(VP8Encoder* const enc) { enc->nz_[-1] = 0; // constant } -// Map configured quality level to coding tools used. -//-------------+---+---+---+---+---+---+ -// Quality | 0 | 1 | 2 | 3 | 4 | 5 + -//-------------+---+---+---+---+---+---+ -// dynamic prob| ~ | x | x | x | x | x | -//-------------+---+---+---+---+---+---+ -// rd-opt modes| | | x | x | x | x | -//-------------+---+---+---+---+---+---+ -// fast i4/i16 | x | x | | | | | -//-------------+---+---+---+---+---+---+ -// rd-opt i4/16| | | x | x | x | x | -//-------------+---+---+---+---+---+---+ -// Trellis | | x | | | x | x | -//-------------+---+---+---+---+---+---+ -// full-SNS | | | | | | x | -//-------------+---+---+---+---+---+---+ +// Mapping from config->method to coding tools used. 
+//-------------------+---+---+---+---+---+---+---+ +// Method | 0 | 1 | 2 | 3 |(4)| 5 | 6 | +//-------------------+---+---+---+---+---+---+---+ +// fast probe | x | | | x | | | | +//-------------------+---+---+---+---+---+---+---+ +// dynamic proba | ~ | x | x | x | x | x | x | +//-------------------+---+---+---+---+---+---+---+ +// fast mode analysis| | | | | x | x | x | +//-------------------+---+---+---+---+---+---+---+ +// basic rd-opt | | | | x | x | x | x | +//-------------------+---+---+---+---+---+---+---+ +// disto-score i4/16 | | | x | | | | | +//-------------------+---+---+---+---+---+---+---+ +// rd-opt i4/16 | | | ~ | x | x | x | x | +//-------------------+---+---+---+---+---+---+---+ +// Trellis | | | | | | x |Ful| +//-------------------+---+---+---+---+---+---+---+ +// full-SNS | | | | | x | x | x | +//-------------------+---+---+---+---+---+---+---+ static void MapConfigToTools(VP8Encoder* const enc) { const int method = enc->config_->method;