diff --git a/build/make/iosbuild.sh b/build/make/iosbuild.sh index 6f7180d08..ae5ba182d 100755 --- a/build/make/iosbuild.sh +++ b/build/make/iosbuild.sh @@ -29,11 +29,14 @@ SCRIPT_DIR=$(dirname "$0") LIBVPX_SOURCE_DIR=$(cd ${SCRIPT_DIR}/../..; pwd) LIPO=$(xcrun -sdk iphoneos${SDK} -find lipo) ORIG_PWD="$(pwd)" -TARGETS="arm64-darwin-gcc - armv7-darwin-gcc - armv7s-darwin-gcc - x86-iphonesimulator-gcc - x86_64-iphonesimulator-gcc" +ARM_TARGETS="arm64-darwin-gcc + armv7-darwin-gcc + armv7s-darwin-gcc" +SIM_TARGETS="x86-iphonesimulator-gcc + x86_64-iphonesimulator-gcc" +OSX_TARGETS="x86-darwin15-gcc + x86_64-darwin15-gcc" +TARGETS="${ARM_TARGETS} ${SIM_TARGETS}" # Configures for the target specified by $1, and invokes make with the dist # target using $DIST_DIR as the distribution output directory. @@ -197,15 +200,27 @@ cleanup() { fi } +print_list() { + local indent="$1" + shift + local list="$@" + for entry in ${list}; do + echo "${indent}${entry}" + done +} + iosbuild_usage() { cat << EOF Usage: ${0##*/} [arguments] --help: Display this message and exit. --extra-configure-args : Extra args to pass when configuring libvpx. + --macosx: Uses darwin15 targets instead of iphonesimulator targets for x86 + and x86_64. Allows linking to framework when builds target MacOSX + instead of iOS. --preserve-build-output: Do not delete the build directory. --show-build-output: Show output from each library build. --targets : Override default target list. Defaults: - ${TARGETS} +$(print_list " " ${TARGETS}) --test-link: Confirms all targets can be linked. Functionally identical to passing --enable-examples via --extra-configure-args. --verbose: Output information about the environment and each stage of the @@ -249,6 +264,9 @@ while [ -n "$1" ]; do TARGETS="$2" shift ;; + --macosx) + TARGETS="${ARM_TARGETS} ${OSX_TARGETS}" + ;; --verbose) VERBOSE=yes ;; @@ -273,10 +291,12 @@ cat << EOF MAKEFLAGS=${MAKEFLAGS} ORIG_PWD=${ORIG_PWD} PRESERVE_BUILD_OUTPUT=${PRESERVE_BUILD_OUTPUT} - TARGETS="${TARGETS}" + TARGETS="$(print_list "" ${TARGETS})" + OSX_TARGETS="${OSX_TARGETS}" + SIM_TARGETS="${SIM_TARGETS}" EOF fi build_framework "${TARGETS}" echo "Successfully built '${FRAMEWORK_DIR}' for:" -echo " ${TARGETS}" +print_list "" ${TARGETS} diff --git a/third_party/x86inc/x86inc.asm b/third_party/x86inc/x86inc.asm index be59de311..e7d3fa5eb 100644 --- a/third_party/x86inc/x86inc.asm +++ b/third_party/x86inc/x86inc.asm @@ -119,7 +119,7 @@ %if ABI_IS_32BIT %if CONFIG_PIC=1 %ifidn __OUTPUT_FORMAT__,elf32 - %define GET_GOT_SAVE_ARG 1 + %define GET_GOT_DEFINED 1 %define WRT_PLT wrt ..plt %macro GET_GOT 1 extern _GLOBAL_OFFSET_TABLE_ @@ -138,7 +138,7 @@ %define RESTORE_GOT pop %1 %endmacro %elifidn __OUTPUT_FORMAT__,macho32 - %define GET_GOT_SAVE_ARG 1 + %define GET_GOT_DEFINED 1 %macro GET_GOT 1 push %1 call %%get_got @@ -149,6 +149,8 @@ %undef RESTORE_GOT %define RESTORE_GOT pop %1 %endmacro + %else + %define GET_GOT_DEFINED 0 %endif %endif diff --git a/vp10/decoder/decodeframe.c b/vp10/decoder/decodeframe.c index ced0b868b..e93619f9a 100644 --- a/vp10/decoder/decodeframe.c +++ b/vp10/decoder/decodeframe.c @@ -1230,10 +1230,12 @@ static void setup_quantization(VP10_COMMON *const cm, MACROBLOCKD *const xd, cm->uv_dc_delta_q = read_delta_q(rb); cm->uv_ac_delta_q = read_delta_q(rb); cm->dequant_bit_depth = cm->bit_depth; - for (i = 0; i < (cm->seg.enabled ? 
MAX_SEGMENTS : 1); ++i) { - const int qindex = vp10_get_qindex(&cm->seg, i, cm->base_qindex); - xd->lossless[i] = cm->y_dc_delta_q == 0 && - qindex == 0 && + for (i = 0; i < MAX_SEGMENTS; ++i) { + const int qindex = CONFIG_MISC_FIXES && cm->seg.enabled ? + vp10_get_qindex(&cm->seg, i, cm->base_qindex) : + cm->base_qindex; + xd->lossless[i] = qindex == 0 && + cm->y_dc_delta_q == 0 && cm->uv_dc_delta_q == 0 && cm->uv_ac_delta_q == 0; } diff --git a/vp10/encoder/bitstream.c b/vp10/encoder/bitstream.c index 2bac77dcb..af2df507a 100644 --- a/vp10/encoder/bitstream.c +++ b/vp10/encoder/bitstream.c @@ -1922,7 +1922,7 @@ void vp10_pack_bitstream(VP10_COMP *const cpi, uint8_t *dest, size_t *size) { assert(n_log2_tiles > 0); vpx_wb_write_literal(&saved_wb, mag, 2); if (mag < 3) - data_sz = remux_tiles(data, data_sz, 1 << n_log2_tiles, mag); + data_sz = remux_tiles(data, (int)data_sz, 1 << n_log2_tiles, mag); } else { assert(n_log2_tiles == 0); } diff --git a/vp10/encoder/block.h b/vp10/encoder/block.h index 62de0bc37..a66002ad1 100644 --- a/vp10/encoder/block.h +++ b/vp10/encoder/block.h @@ -84,6 +84,8 @@ struct macroblock { int rddiv; int rdmult; int mb_energy; + int * m_search_count_ptr; + int * ex_search_count_ptr; // These are set to their default values at the beginning, and then adjusted // further in the encoding process. diff --git a/vp10/encoder/encodeframe.c b/vp10/encoder/encodeframe.c index 34eac8912..95a65e293 100644 --- a/vp10/encoder/encodeframe.c +++ b/vp10/encoder/encodeframe.c @@ -1181,7 +1181,7 @@ static void rd_pick_sb_modes(VP10_COMP *cpi, if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { x->source_variance = vp10_high_get_sby_perpixel_variance(cpi, &x->plane[0].src, - bsize, xd->bd); + bsize, xd->bd); } else { x->source_variance = vp10_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize); @@ -2776,7 +2776,7 @@ static MV_REFERENCE_FRAME get_frame_type(const VP10_COMP *cpi) { } static TX_MODE select_tx_mode(const VP10_COMP *cpi, MACROBLOCKD *const xd) { - if (!cpi->common.seg.enabled && xd->lossless[0]) + if (xd->lossless[0]) return ONLY_4X4; if (cpi->sf.tx_size_search_method == USE_LARGESTALL) return ALLOW_32X32; @@ -2839,6 +2839,10 @@ void vp10_encode_tile(VP10_COMP *cpi, ThreadData *td, TOKENEXTRA *tok = cpi->tile_tok[tile_row][tile_col]; int mi_row; + // Set up pointers to per thread motion search counters. + td->mb.m_search_count_ptr = &td->rd_counts.m_search_count; + td->mb.ex_search_count_ptr = &td->rd_counts.ex_search_count; + for (mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end; mi_row += MI_BLOCK_SIZE) { encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok); @@ -2892,11 +2896,15 @@ static void encode_frame_internal(VP10_COMP *cpi) { vp10_zero(rdc->coef_counts); vp10_zero(rdc->comp_pred_diff); vp10_zero(rdc->filter_diff); + rdc->m_search_count = 0; // Count of motion search hits. + rdc->ex_search_count = 0; // Exhaustive mesh search hits. - for (i = 0; i < (cm->seg.enabled ? MAX_SEGMENTS : 1); ++i) { - const int qindex = vp10_get_qindex(&cm->seg, i, cm->base_qindex); - xd->lossless[i] = cm->y_dc_delta_q == 0 && - qindex == 0 && + for (i = 0; i < MAX_SEGMENTS; ++i) { + const int qindex = CONFIG_MISC_FIXES && cm->seg.enabled ? 
+ vp10_get_qindex(&cm->seg, i, cm->base_qindex) : + cm->base_qindex; + xd->lossless[i] = qindex == 0 && + cm->y_dc_delta_q == 0 && cm->uv_dc_delta_q == 0 && cm->uv_ac_delta_q == 0; } diff --git a/vp10/encoder/encoder.c b/vp10/encoder/encoder.c index 774683959..c5dabceb4 100644 --- a/vp10/encoder/encoder.c +++ b/vp10/encoder/encoder.c @@ -2968,7 +2968,7 @@ static void output_frame_level_debug_stats(VP10_COMP *cpi) { recon_err = vp10_get_y_sse(cpi->Source, get_frame_new_buffer(cm)); if (cpi->twopass.total_left_stats.coded_error != 0.0) - fprintf(f, "%10u %dx%d %d %d %10d %10d %10d %10d" + fprintf(f, "%10u %dx%d %10d %10d %d %d %10d %10d %10d %10d" "%10"PRId64" %10"PRId64" %5d %5d %10"PRId64" " "%10"PRId64" %10"PRId64" %10d " "%7.2lf %7.2lf %7.2lf %7.2lf %7.2lf" @@ -2977,6 +2977,8 @@ static void output_frame_level_debug_stats(VP10_COMP *cpi) { "%10lf %8u %10"PRId64" %10d %10d %10d\n", cpi->common.current_video_frame, cm->width, cm->height, + cpi->td.rd_counts.m_search_count, + cpi->td.rd_counts.ex_search_count, cpi->rc.source_alt_ref_pending, cpi->rc.source_alt_ref_active, cpi->rc.this_frame_target, diff --git a/vp10/encoder/encoder.h b/vp10/encoder/encoder.h index 7a879e2a3..c45be4ce0 100644 --- a/vp10/encoder/encoder.h +++ b/vp10/encoder/encoder.h @@ -246,6 +246,8 @@ typedef struct RD_COUNTS { vp10_coeff_count coef_counts[TX_SIZES][PLANE_TYPES]; int64_t comp_pred_diff[REFERENCE_MODES]; int64_t filter_diff[SWITCHABLE_FILTER_CONTEXTS]; + int m_search_count; + int ex_search_count; } RD_COUNTS; typedef struct ThreadData { diff --git a/vp10/encoder/ethread.c b/vp10/encoder/ethread.c index e20d532c3..6cb9494a9 100644 --- a/vp10/encoder/ethread.c +++ b/vp10/encoder/ethread.c @@ -30,6 +30,11 @@ static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) { for (n = 0; n < ENTROPY_TOKENS; n++) td->rd_counts.coef_counts[i][j][k][l][m][n] += td_t->rd_counts.coef_counts[i][j][k][l][m][n]; + + + // Counts of all motion searches and exhuastive mesh searches. + td->rd_counts.m_search_count += td_t->rd_counts.m_search_count; + td->rd_counts.ex_search_count += td_t->rd_counts.ex_search_count; } static int enc_worker_hook(EncWorkerData *const thread_data, void *unused) { diff --git a/vp10/encoder/mcomp.c b/vp10/encoder/mcomp.c index d6ab00fdc..04e1dafee 100644 --- a/vp10/encoder/mcomp.c +++ b/vp10/encoder/mcomp.c @@ -1523,69 +1523,83 @@ int vp10_fast_dia_search(const MACROBLOCK *x, #undef CHECK_BETTER -int vp10_full_range_search_c(const MACROBLOCK *x, - const search_site_config *cfg, - MV *ref_mv, MV *best_mv, - int search_param, int sad_per_bit, int *num00, - const vp9_variance_fn_ptr_t *fn_ptr, - const MV *center_mv) { +// Exhuastive motion search around a given centre position with a given +// step size. +static int exhuastive_mesh_search(const MACROBLOCK *x, + MV *ref_mv, MV *best_mv, + int range, int step, int sad_per_bit, + const vp9_variance_fn_ptr_t *fn_ptr, + const MV *center_mv) { const MACROBLOCKD *const xd = &x->e_mbd; const struct buf_2d *const what = &x->plane[0].src; const struct buf_2d *const in_what = &xd->plane[0].pre[0]; - const int range = 64; - const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3}; + MV fcenter_mv = {center_mv->row, center_mv->col}; unsigned int best_sad = INT_MAX; int r, c, i; int start_col, end_col, start_row, end_row; + int col_step = (step > 1) ? 
step : 4; - // The cfg and search_param parameters are not used in this search variant - (void)cfg; - (void)search_param; + assert(step >= 1); - clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max); - *best_mv = *ref_mv; - *num00 = 11; + clamp_mv(&fcenter_mv, x->mv_col_min, x->mv_col_max, + x->mv_row_min, x->mv_row_max); + *best_mv = fcenter_mv; best_sad = fn_ptr->sdf(what->buf, what->stride, - get_buf_from_mv(in_what, ref_mv), in_what->stride) + - mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit); - start_row = VPXMAX(-range, x->mv_row_min - ref_mv->row); - start_col = VPXMAX(-range, x->mv_col_min - ref_mv->col); - end_row = VPXMIN(range, x->mv_row_max - ref_mv->row); - end_col = VPXMIN(range, x->mv_col_max - ref_mv->col); + get_buf_from_mv(in_what, &fcenter_mv), in_what->stride) + + mvsad_err_cost(x, &fcenter_mv, ref_mv, sad_per_bit); + start_row = VPXMAX(-range, x->mv_row_min - fcenter_mv.row); + start_col = VPXMAX(-range, x->mv_col_min - fcenter_mv.col); + end_row = VPXMIN(range, x->mv_row_max - fcenter_mv.row); + end_col = VPXMIN(range, x->mv_col_max - fcenter_mv.col); - for (r = start_row; r <= end_row; ++r) { - for (c = start_col; c <= end_col; c += 4) { - if (c + 3 <= end_col) { - unsigned int sads[4]; - const uint8_t *addrs[4]; - for (i = 0; i < 4; ++i) { - const MV mv = {ref_mv->row + r, ref_mv->col + c + i}; - addrs[i] = get_buf_from_mv(in_what, &mv); - } - - fn_ptr->sdx4df(what->buf, what->stride, addrs, in_what->stride, sads); - - for (i = 0; i < 4; ++i) { - if (sads[i] < best_sad) { - const MV mv = {ref_mv->row + r, ref_mv->col + c + i}; - const unsigned int sad = sads[i] + - mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); - if (sad < best_sad) { - best_sad = sad; - *best_mv = mv; - } + for (r = start_row; r <= end_row; r += step) { + for (c = start_col; c <= end_col; c += col_step) { + // Step > 1 means we are not checking every location in this pass. 
+ if (step > 1) { + const MV mv = {fcenter_mv.row + r, fcenter_mv.col + c}; + unsigned int sad = fn_ptr->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &mv), in_what->stride); + if (sad < best_sad) { + sad += mvsad_err_cost(x, &mv, ref_mv, sad_per_bit); + if (sad < best_sad) { + best_sad = sad; + *best_mv = mv; } } } else { - for (i = 0; i < end_col - c; ++i) { - const MV mv = {ref_mv->row + r, ref_mv->col + c + i}; - unsigned int sad = fn_ptr->sdf(what->buf, what->stride, - get_buf_from_mv(in_what, &mv), in_what->stride); - if (sad < best_sad) { - sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); + // 4 sads in a single call if we are checking every location + if (c + 3 <= end_col) { + unsigned int sads[4]; + const uint8_t *addrs[4]; + for (i = 0; i < 4; ++i) { + const MV mv = {fcenter_mv.row + r, fcenter_mv.col + c + i}; + addrs[i] = get_buf_from_mv(in_what, &mv); + } + fn_ptr->sdx4df(what->buf, what->stride, addrs, + in_what->stride, sads); + + for (i = 0; i < 4; ++i) { + if (sads[i] < best_sad) { + const MV mv = {fcenter_mv.row + r, fcenter_mv.col + c + i}; + const unsigned int sad = sads[i] + + mvsad_err_cost(x, &mv, ref_mv, sad_per_bit); + if (sad < best_sad) { + best_sad = sad; + *best_mv = mv; + } + } + } + } else { + for (i = 0; i < end_col - c; ++i) { + const MV mv = {fcenter_mv.row + r, fcenter_mv.col + c + i}; + unsigned int sad = fn_ptr->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &mv), in_what->stride); if (sad < best_sad) { - best_sad = sad; - *best_mv = mv; + sad += mvsad_err_cost(x, &mv, ref_mv, sad_per_bit); + if (sad < best_sad) { + best_sad = sad; + *best_mv = mv; + } } } } @@ -2014,6 +2028,70 @@ int vp10_full_pixel_diamond(const VP10_COMP *cpi, MACROBLOCK *x, return bestsme; } +#define MIN_RANGE 7 +#define MAX_RANGE 256 +#define MIN_INTERVAL 1 +// Runs an limited range exhaustive mesh search using a pattern set +// according to the encode speed profile. +static int full_pixel_exhaustive(VP10_COMP *cpi, MACROBLOCK *x, + MV *centre_mv_full, int sadpb, int *cost_list, + const vp9_variance_fn_ptr_t *fn_ptr, + const MV *ref_mv, MV *dst_mv) { + const SPEED_FEATURES *const sf = &cpi->sf; + MV temp_mv = {centre_mv_full->row, centre_mv_full->col}; + MV f_ref_mv = {ref_mv->row >> 3, ref_mv->col >> 3}; + int bestsme; + int i; + int interval = sf->mesh_patterns[0].interval; + int range = sf->mesh_patterns[0].range; + int baseline_interval_divisor; + + // Keep track of number of exhaustive calls (this frame in this thread). + ++(*x->ex_search_count_ptr); + + // Trap illegal values for interval and range for this function. + if ((range < MIN_RANGE) || (range > MAX_RANGE) || + (interval < MIN_INTERVAL) || (interval > range)) + return INT_MAX; + + baseline_interval_divisor = range / interval; + + // Check size of proposed first range against magnitude of the centre + // value used as a starting point. + range = VPXMAX(range, (5 * VPXMAX(abs(temp_mv.row), abs(temp_mv.col))) / 4); + range = VPXMIN(range, MAX_RANGE); + interval = VPXMAX(interval, range / baseline_interval_divisor); + + // initial search + bestsme = exhuastive_mesh_search(x, &f_ref_mv, &temp_mv, range, + interval, sadpb, fn_ptr, &temp_mv); + + if ((interval > MIN_INTERVAL) && (range > MIN_RANGE)) { + // Progressive searches with range and step size decreasing each time + // till we reach a step size of 1. Then break out. 
+ for (i = 1; i < MAX_MESH_STEP; ++i) { + // First pass with coarser step and longer range + bestsme = exhuastive_mesh_search(x, &f_ref_mv, &temp_mv, + sf->mesh_patterns[i].range, + sf->mesh_patterns[i].interval, + sadpb, fn_ptr, &temp_mv); + + if (sf->mesh_patterns[i].interval == 1) + break; + } + } + + if (bestsme < INT_MAX) + bestsme = vp10_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1); + *dst_mv = temp_mv; + + // Return cost list. + if (cost_list) { + calc_int_cost_list(x, ref_mv, sadpb, fn_ptr, dst_mv, cost_list); + } + return bestsme; +} + int vp10_full_search_sad_c(const MACROBLOCK *x, const MV *ref_mv, int sad_per_bit, int distance, const vp9_variance_fn_ptr_t *fn_ptr, @@ -2327,6 +2405,18 @@ int vp10_refining_search_8p_c(const MACROBLOCK *x, return best_sad; } +#define MIN_EX_SEARCH_LIMIT 128 +static int is_exhaustive_allowed(VP10_COMP *cpi, MACROBLOCK *x) { + const SPEED_FEATURES *const sf = &cpi->sf; + const int max_ex = VPXMAX(MIN_EX_SEARCH_LIMIT, + (*x->m_search_count_ptr * sf->max_exaustive_pct) / 100); + + return sf->allow_exhaustive_searches && + (sf->exhaustive_searches_thresh < INT_MAX) && + (*x->ex_search_count_ptr <= max_ex) && + !cpi->rc.is_src_frame_alt_ref; +} + int vp10_full_pixel_search(VP10_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, MV *mvp_full, int step_param, int error_per_bit, @@ -2345,6 +2435,9 @@ int vp10_full_pixel_search(VP10_COMP *cpi, MACROBLOCK *x, cost_list[4] = INT_MAX; } + // Keep track of number of searches (this frame in this thread). + ++(*x->m_search_count_ptr); + switch (method) { case FAST_DIAMOND: var = vp10_fast_dia_search(x, mvp_full, step_param, error_per_bit, 0, @@ -2370,6 +2463,29 @@ int vp10_full_pixel_search(VP10_COMP *cpi, MACROBLOCK *x, var = vp10_full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit, MAX_MVSEARCH_STEPS - 1 - step_param, 1, cost_list, fn_ptr, ref_mv, tmp_mv); + + // Should we allow a follow on exhaustive search? + if (is_exhaustive_allowed(cpi, x)) { + int64_t exhuastive_thr = sf->exhaustive_searches_thresh; + exhuastive_thr >>= 8 - (b_width_log2_lookup[bsize] + + b_height_log2_lookup[bsize]); + + // Threshold variance for an exhaustive full search. + if (var > exhuastive_thr) { + int var_ex; + MV tmp_mv_ex; + var_ex = full_pixel_exhaustive(cpi, x, tmp_mv, + error_per_bit, cost_list, fn_ptr, + ref_mv, &tmp_mv_ex); + + if (var_ex < var) { + var = var_ex; + *tmp_mv = tmp_mv_ex; + } + } + } + break; + break; default: assert(0 && "Invalid search method."); diff --git a/vp10/encoder/rdopt.c b/vp10/encoder/rdopt.c index 813e13e0b..e4bc01846 100644 --- a/vp10/encoder/rdopt.c +++ b/vp10/encoder/rdopt.c @@ -3607,7 +3607,7 @@ static int64_t rd_pick_best_sub8x8_mode(VP10_COMP *cpi, MACROBLOCK *x, seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV) { MV *const new_mv = &mode_mv[NEWMV][0].as_mv; int step_param = 0; - int thissme, bestsme = INT_MAX; + int bestsme = INT_MAX; int sadpb = x->sadperbit4; MV mvp_full; int max_mv; @@ -3662,27 +3662,6 @@ static int64_t rd_pick_best_sub8x8_mode(VP10_COMP *cpi, MACROBLOCK *x, &bsi->ref_mv[0]->as_mv, new_mv, INT_MAX, 1); - // Should we do a full search (best quality only) - if (cpi->oxcf.mode == BEST) { - int_mv *const best_mv = &mi->bmi[i].as_mv[0]; - /* Check if mvp_full is within the range. 
*/ - clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max, - x->mv_row_min, x->mv_row_max); - thissme = cpi->full_search_sad(x, &mvp_full, - sadpb, 16, &cpi->fn_ptr[bsize], - &bsi->ref_mv[0]->as_mv, - &best_mv->as_mv); - cost_list[1] = cost_list[2] = cost_list[3] = cost_list[4] = INT_MAX; - if (thissme < bestsme) { - bestsme = thissme; - *new_mv = best_mv->as_mv; - } else { - // The full search result is actually worse so re-instate the - // previous best vector - best_mv->as_mv = *new_mv; - } - } - if (bestsme < INT_MAX) { int distortion; cpi->find_fractional_mv_step( diff --git a/vp10/encoder/speed_features.c b/vp10/encoder/speed_features.c index d40383f0e..ce0aebeab 100644 --- a/vp10/encoder/speed_features.c +++ b/vp10/encoder/speed_features.c @@ -16,6 +16,23 @@ #include "vpx_dsp/vpx_dsp_common.h" +// Mesh search patters for various speed settings +static MESH_PATTERN best_quality_mesh_pattern[MAX_MESH_STEP] = + {{64, 4}, {28, 2}, {15, 1}, {7, 1}}; + +#define MAX_MESH_SPEED 5 // Max speed setting for mesh motion method +static MESH_PATTERN good_quality_mesh_patterns[MAX_MESH_SPEED + 1] + [MAX_MESH_STEP] = + {{{64, 8}, {28, 4}, {15, 1}, {7, 1}}, + {{64, 8}, {28, 4}, {15, 1}, {7, 1}}, + {{64, 8}, {14, 2}, {7, 1}, {7, 1}}, + {{64, 16}, {24, 8}, {12, 4}, {7, 1}}, + {{64, 16}, {24, 8}, {12, 4}, {7, 1}}, + {{64, 16}, {24, 8}, {12, 4}, {7, 1}}, + }; +static unsigned char good_quality_max_mesh_pct[MAX_MESH_SPEED + 1] = + {50, 25, 15, 5, 1, 1}; + // Intra only frames, golden frames (except alt ref overlays) and // alt ref frames tend to be coded at a higher than ambient quality static int frame_is_boosted(const VP10_COMP *cpi) { @@ -251,6 +268,8 @@ static void set_rt_speed_feature(VP10_COMP *cpi, SPEED_FEATURES *sf, sf->static_segmentation = 0; sf->adaptive_rd_thresh = 1; sf->use_fast_coef_costing = 1; + sf->allow_exhaustive_searches = 0; + sf->exhaustive_searches_thresh = INT_MAX; if (speed >= 1) { sf->use_square_partition_only = !frame_is_intra_only(cm); @@ -498,8 +517,36 @@ void vp10_set_speed_features_framesize_independent(VP10_COMP *cpi) { set_good_speed_feature(cpi, cm, sf, oxcf->speed); cpi->full_search_sad = vp10_full_search_sad; - cpi->diamond_search_sad = oxcf->mode == BEST ? vp10_full_range_search - : vp10_diamond_search_sad; + cpi->diamond_search_sad = vp10_diamond_search_sad; + + sf->allow_exhaustive_searches = 1; + if (oxcf->mode == BEST) { + if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) + sf->exhaustive_searches_thresh = (1 << 20); + else + sf->exhaustive_searches_thresh = (1 << 21); + sf->max_exaustive_pct = 100; + for (i = 0; i < MAX_MESH_STEP; ++i) { + sf->mesh_patterns[i].range = best_quality_mesh_pattern[i].range; + sf->mesh_patterns[i].interval = best_quality_mesh_pattern[i].interval; + } + } else { + int speed = (oxcf->speed > MAX_MESH_SPEED) ? MAX_MESH_SPEED : oxcf->speed; + if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) + sf->exhaustive_searches_thresh = (1 << 22); + else + sf->exhaustive_searches_thresh = (1 << 23); + sf->max_exaustive_pct = good_quality_max_mesh_pct[speed]; + if (speed > 0) + sf->exhaustive_searches_thresh = sf->exhaustive_searches_thresh << 1; + + for (i = 0; i < MAX_MESH_STEP; ++i) { + sf->mesh_patterns[i].range = + good_quality_mesh_patterns[speed][i].range; + sf->mesh_patterns[i].interval = + good_quality_mesh_patterns[speed][i].interval; + } + } // Slow quant, dct and trellis not worthwhile for first pass // so make sure they are always turned off. 
diff --git a/vp10/encoder/speed_features.h b/vp10/encoder/speed_features.h index 3969a2ff8..3b9199929 100644 --- a/vp10/encoder/speed_features.h +++ b/vp10/encoder/speed_features.h @@ -195,6 +195,13 @@ typedef struct MV_SPEED_FEATURES { int fullpel_search_step_param; } MV_SPEED_FEATURES; +#define MAX_MESH_STEP 4 + +typedef struct MESH_PATTERN { + int range; + int interval; +} MESH_PATTERN; + typedef struct SPEED_FEATURES { MV_SPEED_FEATURES mv; @@ -290,6 +297,18 @@ typedef struct SPEED_FEATURES { // point for this motion search and limits the search range around it. int adaptive_motion_search; + // Flag for allowing some use of exhaustive searches; + int allow_exhaustive_searches; + + // Threshold for allowing exhaistive motion search. + int exhaustive_searches_thresh; + + // Maximum number of exhaustive searches for a frame. + int max_exaustive_pct; + + // Pattern to be used for any exhaustive mesh searches. + MESH_PATTERN mesh_patterns[MAX_MESH_STEP]; + int schedule_mode_search; // Allows sub 8x8 modes to use the prediction filter that was determined diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 45445df8d..8ab51cd20 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -1075,7 +1075,7 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, if (!cpi->refresh_alt_ref_frame) { active_best_quality = cq_level; } else { - active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth); + active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth); // Modify best quality for second level arfs. For mode VPX_Q this // becomes the baseline frame q. diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c index 16f9c8573..015dbc0ca 100644 --- a/vp9/encoder/vp9_temporal_filter.c +++ b/vp9/encoder/vp9_temporal_filter.c @@ -135,15 +135,38 @@ void vp9_temporal_filter_apply_c(uint8_t *frame1, for (i = 0, k = 0; i < block_height; i++) { for (j = 0; j < block_width; j++, k++) { - int src_byte = frame1[byte]; - int pixel_value = *frame2++; + int pixel_value = *frame2; + + // non-local mean approach + int diff_sse[9] = { 0 }; + int idx, idy, index = 0; + + for (idy = -1; idy <= 1; ++idy) { + for (idx = -1; idx <= 1; ++idx) { + int row = i + idy; + int col = j + idx; + + if (row >= 0 && row < (int)block_height && + col >= 0 && col < (int)block_width) { + int diff = frame1[byte + idy * (int)stride + idx] - + frame2[idy * (int)block_width + idx]; + diff_sse[index] = diff * diff; + ++index; + } + } + } + + assert(index > 0); + + modifier = 0; + for (idx = 0; idx < 9; ++idx) + modifier += diff_sse[idx]; + + modifier *= 3; + modifier /= index; + + ++frame2; - modifier = src_byte - pixel_value; - // This is an integer approximation of: - // float coeff = (3.0 * modifer * modifier) / pow(2, strength); - // modifier = (int)roundf(coeff > 16 ? 
0 : 16-coeff); - modifier *= modifier; - modifier *= 3; modifier += rounding; modifier >>= strength; @@ -182,15 +205,34 @@ void vp9_highbd_temporal_filter_apply_c(uint8_t *frame1_8, for (i = 0, k = 0; i < block_height; i++) { for (j = 0; j < block_width; j++, k++) { - int src_byte = frame1[byte]; - int pixel_value = *frame2++; + int pixel_value = *frame2; + int diff_sse[9] = { 0 }; + int idx, idy, index = 0; + + for (idy = -1; idy <= 1; ++idy) { + for (idx = -1; idx <= 1; ++idx) { + int row = i + idy; + int col = j + idx; + + if (row >= 0 && row < (int)block_height && + col >= 0 && col < (int)block_width) { + int diff = frame1[byte + idy * (int)stride + idx] - + frame2[idy * (int)block_width + idx]; + diff_sse[index] = diff * diff; + ++index; + } + } + } + assert(index > 0); + + modifier = 0; + for (idx = 0; idx < 9; ++idx) + modifier += diff_sse[idx]; - modifier = src_byte - pixel_value; - // This is an integer approximation of: - // float coeff = (3.0 * modifer * modifier) / pow(2, strength); - // modifier = (int)roundf(coeff > 16 ? 0 : 16-coeff); - modifier *= modifier; modifier *= 3; + modifier /= index; + + ++frame2; modifier += rounding; modifier >>= strength; @@ -383,55 +425,58 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { int adj_strength = strength + 2 * (mbd->bd - 8); // Apply the filter (YUV) - vp9_highbd_temporal_filter_apply(f->y_buffer + mb_y_offset, - f->y_stride, - predictor, 16, 16, adj_strength, - filter_weight, - accumulator, count); - vp9_highbd_temporal_filter_apply(f->u_buffer + mb_uv_offset, - f->uv_stride, predictor + 256, - mb_uv_width, mb_uv_height, - adj_strength, - filter_weight, accumulator + 256, - count + 256); - vp9_highbd_temporal_filter_apply(f->v_buffer + mb_uv_offset, - f->uv_stride, predictor + 512, - mb_uv_width, mb_uv_height, - adj_strength, filter_weight, - accumulator + 512, count + 512); + vp9_highbd_temporal_filter_apply_c(f->y_buffer + mb_y_offset, + f->y_stride, + predictor, 16, 16, adj_strength, + filter_weight, + accumulator, count); + vp9_highbd_temporal_filter_apply_c(f->u_buffer + mb_uv_offset, + f->uv_stride, predictor + 256, + mb_uv_width, mb_uv_height, + adj_strength, + filter_weight, accumulator + 256, + count + 256); + vp9_highbd_temporal_filter_apply_c(f->v_buffer + mb_uv_offset, + f->uv_stride, predictor + 512, + mb_uv_width, mb_uv_height, + adj_strength, filter_weight, + accumulator + 512, count + 512); } else { // Apply the filter (YUV) - vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride, + vp9_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride, + predictor, 16, 16, + strength, filter_weight, + accumulator, count); + vp9_temporal_filter_apply_c(f->u_buffer + mb_uv_offset, + f->uv_stride, + predictor + 256, + mb_uv_width, mb_uv_height, strength, + filter_weight, accumulator + 256, + count + 256); + vp9_temporal_filter_apply_c(f->v_buffer + mb_uv_offset, + f->uv_stride, + predictor + 512, + mb_uv_width, mb_uv_height, strength, + filter_weight, accumulator + 512, + count + 512); + } +#else + // Apply the filter (YUV) + // TODO(jingning): Need SIMD optimization for this. 
+ vp9_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride, predictor, 16, 16, strength, filter_weight, accumulator, count); - vp9_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride, + vp9_temporal_filter_apply_c(f->u_buffer + mb_uv_offset, f->uv_stride, predictor + 256, mb_uv_width, mb_uv_height, strength, filter_weight, accumulator + 256, count + 256); - vp9_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride, + vp9_temporal_filter_apply_c(f->v_buffer + mb_uv_offset, f->uv_stride, predictor + 512, mb_uv_width, mb_uv_height, strength, filter_weight, accumulator + 512, count + 512); - } -#else - // Apply the filter (YUV) - vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride, - predictor, 16, 16, - strength, filter_weight, - accumulator, count); - vp9_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride, - predictor + 256, - mb_uv_width, mb_uv_height, strength, - filter_weight, accumulator + 256, - count + 256); - vp9_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride, - predictor + 512, - mb_uv_width, mb_uv_height, strength, - filter_weight, accumulator + 512, - count + 512); #endif // CONFIG_VP9_HIGHBITDEPTH } } diff --git a/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm index 93df92a9e..22d52a2af 100644 --- a/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm +++ b/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm @@ -123,8 +123,10 @@ SECTION .text %define sec_str sec_stridemp ; Store bilin_filter and pw_8 location in stack - GET_GOT eax - add esp, 4 ; restore esp + %if GET_GOT_DEFINED == 1 + GET_GOT eax + add esp, 4 ; restore esp + %endif lea ecx, [GLOBAL(bilin_filter_m)] mov g_bilin_filterm, ecx @@ -140,8 +142,10 @@ SECTION .text %define block_height heightd ; Store bilin_filter and pw_8 location in stack - GET_GOT eax - add esp, 4 ; restore esp + %if GET_GOT_DEFINED == 1 + GET_GOT eax + add esp, 4 ; restore esp + %endif lea ecx, [GLOBAL(bilin_filter_m)] mov g_bilin_filterm, ecx diff --git a/vpx_dsp/x86/intrapred_sse2.asm b/vpx_dsp/x86/intrapred_sse2.asm index edbf05e33..1809a3637 100644 --- a/vpx_dsp/x86/intrapred_sse2.asm +++ b/vpx_dsp/x86/intrapred_sse2.asm @@ -47,9 +47,9 @@ cglobal dc_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset INIT_XMM sse2 cglobal dc_left_predictor_4x4, 2, 5, 2, dst, stride, above, left, goffset + movifnidn leftq, leftmp GET_GOT goffsetq - movifnidn leftq, leftmp pxor m1, m1 movd m0, [leftq] psadbw m0, m1 @@ -143,9 +143,9 @@ cglobal dc_top_predictor_8x8, 3, 5, 2, dst, stride, above, left, goffset INIT_XMM sse2 cglobal dc_left_predictor_8x8, 2, 5, 2, dst, stride, above, left, goffset + movifnidn leftq, leftmp GET_GOT goffsetq - movifnidn leftq, leftmp pxor m1, m1 movq m0, [leftq] DEFINE_ARGS dst, stride, stride3 @@ -239,14 +239,11 @@ cglobal dc_top_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset GET_GOT goffsetq pxor m1, m1 - pxor m2, m2 mova m0, [aboveq] DEFINE_ARGS dst, stride, stride3, lines4 lea stride3q, [strideq*3] mov lines4d, 4 psadbw m0, m1 - psadbw m2, m1 - paddw m0, m2 movhlps m2, m0 paddw m0, m2 paddw m0, [GLOBAL(pw2_16)] @@ -271,14 +268,11 @@ cglobal dc_left_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset GET_GOT goffsetq pxor m1, m1 - pxor m2, m2 mova m0, [leftq] DEFINE_ARGS dst, stride, stride3, lines4 lea stride3q, [strideq*3] mov lines4d, 4 psadbw m0, m1 - psadbw m2, m1 - paddw m0, m2 movhlps m2, m0 paddw m0, m2 paddw m0, [GLOBAL(pw2_16)] diff --git a/vpx_dsp/x86/subpel_variance_sse2.asm 
b/vpx_dsp/x86/subpel_variance_sse2.asm index 05dcff75e..c655e4b34 100644 --- a/vpx_dsp/x86/subpel_variance_sse2.asm +++ b/vpx_dsp/x86/subpel_variance_sse2.asm @@ -139,8 +139,10 @@ SECTION .text %define sec_str sec_stridemp ;Store bilin_filter and pw_8 location in stack - GET_GOT eax - add esp, 4 ; restore esp + %if GET_GOT_DEFINED == 1 + GET_GOT eax + add esp, 4 ; restore esp + %endif lea ecx, [GLOBAL(bilin_filter_m)] mov g_bilin_filterm, ecx @@ -156,8 +158,10 @@ SECTION .text %define block_height heightd ;Store bilin_filter and pw_8 location in stack - GET_GOT eax - add esp, 4 ; restore esp + %if GET_GOT_DEFINED == 1 + GET_GOT eax + add esp, 4 ; restore esp + %endif lea ecx, [GLOBAL(bilin_filter_m)] mov g_bilin_filterm, ecx diff --git a/vpx_ports/x86_abi_support.asm b/vpx_ports/x86_abi_support.asm index c94b76a06..708fa101c 100644 --- a/vpx_ports/x86_abi_support.asm +++ b/vpx_ports/x86_abi_support.asm @@ -189,7 +189,6 @@ %if ABI_IS_32BIT %if CONFIG_PIC=1 %ifidn __OUTPUT_FORMAT__,elf32 - %define GET_GOT_SAVE_ARG 1 %define WRT_PLT wrt ..plt %macro GET_GOT 1 extern _GLOBAL_OFFSET_TABLE_ @@ -208,7 +207,6 @@ %define RESTORE_GOT pop %1 %endmacro %elifidn __OUTPUT_FORMAT__,macho32 - %define GET_GOT_SAVE_ARG 1 %macro GET_GOT 1 push %1 call %%get_got
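
For reference, the gating that rations the new follow-on exhaustive mesh search (is_exhaustive_allowed() plus the counter updates in vp10/encoder/mcomp.c) can be read in isolation as the minimal sketch below. It restates the same arithmetic with plain integer parameters standing in for the SPEED_FEATURES / MACROBLOCK fields, and it leaves out the sf->allow_exhaustive_searches on/off flag; the function and parameter names are illustrative only, not libvpx identifiers.

    #include <limits.h>

    #define MIN_EX_SEARCH_LIMIT 128  /* same per-frame floor as in the patch */

    /* Sketch: may another exhaustive mesh search run for this frame/thread?
     *   m_search_count     - ordinary motion searches done so far
     *   ex_search_count    - exhaustive mesh searches done so far
     *   max_exhaustive_pct - cap as a percentage of m_search_count
     *                        (sf->max_exaustive_pct in the patch)
     *   searches_thresh    - sf->exhaustive_searches_thresh; INT_MAX disables
     *   is_src_alt_ref     - nonzero when the source frame is an alt-ref overlay
     */
    static int mesh_search_allowed(int m_search_count, int ex_search_count,
                                   int max_exhaustive_pct, int searches_thresh,
                                   int is_src_alt_ref) {
      int budget = (m_search_count * max_exhaustive_pct) / 100;
      if (budget < MIN_EX_SEARCH_LIMIT)
        budget = MIN_EX_SEARCH_LIMIT;  /* keep a small fixed allowance */
      return searches_thresh < INT_MAX &&
             ex_search_count <= budget &&
             !is_src_alt_ref;
    }

Tying the budget to a percentage of the ordinary search count keeps the added cost roughly proportional to how much motion search the frame already performs, while the fixed floor still lets the first few difficult blocks trigger a mesh search.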