Implement SSE2 block_error.

Change vp9_block_error() to return a 64bit error variable, change all
callers to expect a 64bit return value (this will prevent overflows,
which we basically don't check for at all right now). Remove duplicate
block_error() function, which fixed that through truncation. Remove
old (incompatible) mmx/sse2 block_error SIMD versions and replace with
a new one that returns a 64bit value.

Encoding time of first 50 frames of bus @ 1500kbps goes from 3min29 to
3min23, i.e. a 3% overall speedup.

Change-Id: Ib71ac5508b5ee8a80f1753cd85d72df1629abe68
This commit is contained in:
Ronald S. Bultje 2013-06-21 12:54:52 -07:00
parent 7756e9892b
commit 54b2a59623
7 changed files with 158 additions and 219 deletions

View File

@ -529,9 +529,8 @@ prototype unsigned int vp9_get_mb_ss "const int16_t *"
specialize vp9_get_mb_ss mmx sse2
# ENCODEMB INVOKE
prototype int vp9_block_error "int16_t *coeff, int16_t *dqcoeff, int block_size"
specialize vp9_block_error mmx sse2
vp9_block_error_sse2=vp9_block_error_xmm
prototype int64_t vp9_block_error "int16_t *coeff, int16_t *dqcoeff, intptr_t block_size"
specialize vp9_block_error sse2
prototype void vp9_subtract_block "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"
specialize vp9_subtract_block sse2

View File

@ -582,7 +582,7 @@ static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col,
}
static void pick_sb_modes(VP9_COMP *cpi, int mi_row, int mi_col,
TOKENEXTRA **tp, int *totalrate, int *totaldist,
TOKENEXTRA **tp, int *totalrate, int64_t *totaldist,
BLOCK_SIZE_TYPE bsize, PICK_MODE_CONTEXT *ctx) {
VP9_COMMON * const cm = &cpi->common;
MACROBLOCK * const x = &cpi->mb;
@ -1195,7 +1195,7 @@ static void choose_partitioning(VP9_COMP *cpi, MODE_INFO *m, int mi_row,
}
static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp,
int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize,
int *rate, int *dist) {
int *rate, int64_t *dist) {
VP9_COMMON * const cm = &cpi->common;
MACROBLOCK * const x = &cpi->mb;
MACROBLOCKD *xd = &cpi->mb.e_mbd;
@ -1211,7 +1211,8 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp,
BLOCK_SIZE_TYPE subsize;
ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
PARTITION_CONTEXT sl[8], sa[8];
int r = 0, d = 0;
int r = 0;
int64_t d = 0;
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
@ -1252,7 +1253,8 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp,
pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, subsize,
get_block_context(x, subsize));
if (mi_row + (bh >> 1) <= cm->mi_rows) {
int rt, dt;
int rt;
int64_t dt;
update_state(cpi, get_block_context(x, subsize), subsize, 0);
encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
*(get_sb_index(xd, subsize)) = 1;
@ -1270,7 +1272,8 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp,
pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, subsize,
get_block_context(x, subsize));
if (mi_col + (bs >> 1) <= cm->mi_cols) {
int rt, dt;
int rt;
int64_t dt;
update_state(cpi, get_block_context(x, subsize), subsize, 0);
encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
*(get_sb_index(xd, subsize)) = 1;
@ -1289,7 +1292,8 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp,
int x_idx = (i & 1) * (bs >> 2);
int y_idx = (i >> 1) * (bs >> 2);
int jj = i >> 1, ii = i & 0x01;
int rt, dt;
int rt;
int64_t dt;
if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
continue;
@ -1323,7 +1327,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp,
// results, for encoding speed-up.
static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
int mi_col, BLOCK_SIZE_TYPE bsize, int *rate,
int *dist) {
int64_t *dist) {
VP9_COMMON * const cm = &cpi->common;
MACROBLOCK * const x = &cpi->mb;
MACROBLOCKD * const xd = &x->e_mbd;
@ -1334,7 +1338,8 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
TOKENEXTRA *tp_orig = *tp;
int i, pl;
BLOCK_SIZE_TYPE subsize;
int srate = INT_MAX, sdist = INT_MAX;
int srate = INT_MAX;
int64_t sdist = INT_MAX;
if (bsize < BLOCK_SIZE_SB8X8)
if (xd->ab_index != 0) {
@ -1351,14 +1356,16 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
|| (cpi->sf.use_partitions_greater_than
&& bsize > cpi->sf.greater_than_block_size)) {
if (bsize >= BLOCK_SIZE_SB8X8) {
int r4 = 0, d4 = 0;
int r4 = 0;
int64_t d4 = 0;
subsize = get_subsize(bsize, PARTITION_SPLIT);
*(get_sb_partitioning(x, bsize)) = subsize;
for (i = 0; i < 4; ++i) {
int x_idx = (i & 1) * (ms >> 1);
int y_idx = (i >> 1) * (ms >> 1);
int r = 0, d = 0;
int r = 0;
int64_t d = 0;
if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
continue;
@ -1386,8 +1393,8 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
&& bsize <= cpi->sf.less_than_block_size)) {
// PARTITION_HORZ
if (bsize >= BLOCK_SIZE_SB8X8 && mi_col + (ms >> 1) < cm->mi_cols) {
int r2, d2;
int r = 0, d = 0;
int r2, r = 0;
int64_t d2, d = 0;
subsize = get_subsize(bsize, PARTITION_HORZ);
*(get_sb_index(xd, subsize)) = 0;
pick_sb_modes(cpi, mi_row, mi_col, tp, &r2, &d2, subsize,
@ -1418,13 +1425,15 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
// PARTITION_VERT
if (bsize >= BLOCK_SIZE_SB8X8 && mi_row + (ms >> 1) < cm->mi_rows) {
int r2, d2;
int r2;
int64_t d2;
subsize = get_subsize(bsize, PARTITION_VERT);
*(get_sb_index(xd, subsize)) = 0;
pick_sb_modes(cpi, mi_row, mi_col, tp, &r2, &d2, subsize,
get_block_context(x, subsize));
if (mi_col + (ms >> 1) < cm->mi_cols) {
int r = 0, d = 0;
int r = 0;
int64_t d = 0;
update_state(cpi, get_block_context(x, subsize), subsize, 0);
encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
@ -1450,7 +1459,8 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
// PARTITION_NONE
if ((mi_row + (ms >> 1) < cm->mi_rows) &&
(mi_col + (ms >> 1) < cm->mi_cols)) {
int r, d;
int r;
int64_t d;
pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, bsize,
get_block_context(x, bsize));
if (bsize >= BLOCK_SIZE_SB8X8) {
@ -1497,7 +1507,8 @@ static void encode_sb_row(VP9_COMP *cpi, int mi_row, TOKENEXTRA **tp,
// Code each SB in the row
for (mi_col = cm->cur_tile_mi_col_start; mi_col < cm->cur_tile_mi_col_end;
mi_col += 64 / MI_SIZE) {
int dummy_rate, dummy_dist;
int dummy_rate;
int64_t dummy_dist;
if (cpi->sf.partition_by_variance || cpi->sf.use_lastframe_partitioning ||
cpi->sf.use_one_partition_size_always ) {
const int idx_str = cm->mode_info_stride * mi_row + mi_col;

View File

@ -274,12 +274,14 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) {
}
}
int vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff, int block_size) {
int i, error = 0;
int64_t vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff,
intptr_t block_size) {
int i;
int64_t error = 0;
for (i = 0; i < block_size; i++) {
int this_diff = coeff[i] - dqcoeff[i];
error += this_diff * this_diff;
error += (unsigned)this_diff * this_diff;
}
return error;
@ -417,7 +419,7 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
int (*r)[2], int *rate,
int *d, int *distortion,
int64_t *d, int64_t *distortion,
int *s, int *skip,
int64_t txfm_cache[NB_TXFM_MODES],
TX_SIZE max_txfm_size) {
@ -496,27 +498,15 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
rd[TX_4X4][1] : rd[TX_8X8][1];
}
/* Sum of squared differences between the first block_size entries of
 * coeff[] and dqcoeff[], right-shifted by `shift` and saturated to INT_MAX
 * so that the result fits in an int. */
static int block_error(int16_t *coeff, int16_t *dqcoeff,
                       int block_size, int shift) {
  int64_t sse = 0;
  int k = 0;

  while (k < block_size) {
    const int delta = coeff[k] - dqcoeff[k];
    /* Multiply as unsigned: |delta| can reach 65535, whose square does not
     * fit in a signed 32-bit int but does fit in an unsigned one. */
    sse += (unsigned)delta * delta;
    ++k;
  }

  sse >>= shift;
  if (sse > INT_MAX)
    return INT_MAX;
  return (int)sse;
}
static int block_error_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int shift) {
static int64_t block_error_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize,
int shift) {
const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
return block_error(x->plane[0].coeff, x->e_mbd.plane[0].dqcoeff,
16 << (bwl + bhl), shift);
return vp9_block_error(x->plane[0].coeff, x->e_mbd.plane[0].dqcoeff,
16 << (bwl + bhl)) >> shift;
}
static int block_error_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int shift) {
static int64_t block_error_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize,
int shift) {
const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
int64_t sum = 0;
int plane;
@ -524,11 +514,10 @@ static int block_error_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int shift) {
for (plane = 1; plane < MAX_MB_PLANE; plane++) {
const int subsampling = x->e_mbd.plane[plane].subsampling_x +
x->e_mbd.plane[plane].subsampling_y;
sum += block_error(x->plane[plane].coeff, x->e_mbd.plane[plane].dqcoeff,
16 << (bwl + bhl - subsampling), 0);
sum += vp9_block_error(x->plane[plane].coeff, x->e_mbd.plane[plane].dqcoeff,
16 << (bwl + bhl - subsampling));
}
sum >>= shift;
return sum > INT_MAX ? INT_MAX : (int)sum;
return sum >> shift;
}
struct rdcost_block_args {
@ -586,7 +575,8 @@ static int rdcost_uv(VP9_COMMON *const cm, MACROBLOCK *x,
}
static void super_block_yrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,
int *rate, int *distortion, int *skippable,
int *rate, int64_t *distortion,
int *skippable,
BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {
MACROBLOCKD *const xd = &x->e_mbd;
xd->mode_info_context->mbmi.txfm_size = tx_size;
@ -602,11 +592,12 @@ static void super_block_yrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,
}
static void super_block_yrd(VP9_COMP *cpi,
MACROBLOCK *x, int *rate, int *distortion,
MACROBLOCK *x, int *rate, int64_t *distortion,
int *skip, BLOCK_SIZE_TYPE bs,
int64_t txfm_cache[NB_TXFM_MODES]) {
VP9_COMMON *const cm = &cpi->common;
int r[TX_SIZE_MAX_SB][2], d[TX_SIZE_MAX_SB], s[TX_SIZE_MAX_SB];
int r[TX_SIZE_MAX_SB][2], s[TX_SIZE_MAX_SB];
int64_t d[TX_SIZE_MAX_SB];
MACROBLOCKD *xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
@ -651,13 +642,13 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
int *bmode_costs,
ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
int *bestrate, int *bestratey,
int *bestdistortion,
int64_t *bestdistortion,
BLOCK_SIZE_TYPE bsize) {
MB_PREDICTION_MODE mode;
MACROBLOCKD *xd = &x->e_mbd;
int64_t best_rd = INT64_MAX;
int rate = 0;
int distortion;
int64_t distortion;
VP9_COMMON *const cm = &cpi->common;
const int src_stride = x->plane[0].src.stride;
uint8_t *src, *dst;
@ -777,7 +768,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
int *Rate, int *rate_y,
int *Distortion, int64_t best_rd) {
int64_t *Distortion, int64_t best_rd) {
int i, j;
MACROBLOCKD *const xd = &mb->e_mbd;
BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
@ -785,7 +776,7 @@ static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
int bh = 1 << b_height_log2(bsize);
int idx, idy;
int cost = 0;
int distortion = 0;
int64_t distortion = 0;
int tot_rate_y = 0;
int64_t total_rd = 0;
ENTROPY_CONTEXT t_above[4], t_left[4];
@ -802,7 +793,7 @@ static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
const int mis = xd->mode_info_stride;
MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry);
int UNINITIALIZED_IS_SAFE(d);
int64_t UNINITIALIZED_IS_SAFE(d);
i = idy * 2 + idx;
if (xd->frame_type == KEY_FRAME) {
@ -844,14 +835,14 @@ static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
int *rate, int *rate_tokenonly,
int *distortion, int *skippable,
int64_t *distortion, int *skippable,
BLOCK_SIZE_TYPE bsize,
int64_t txfm_cache[NB_TXFM_MODES]) {
MB_PREDICTION_MODE mode;
MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
MACROBLOCKD *const xd = &x->e_mbd;
int this_rate, this_rate_tokenonly;
int this_distortion, s;
int this_rate, this_rate_tokenonly, s;
int64_t this_distortion;
int64_t best_rd = INT64_MAX, this_rd;
TX_SIZE UNINITIALIZED_IS_SAFE(best_tx);
int i;
@ -912,7 +903,7 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
}
static void super_block_uvrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,
int *rate, int *distortion,
int *rate, int64_t *distortion,
int *skippable, BLOCK_SIZE_TYPE bsize,
TX_SIZE uv_tx_size) {
MACROBLOCKD *const xd = &x->e_mbd;
@ -927,7 +918,7 @@ static void super_block_uvrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,
}
static void super_block_uvrd(VP9_COMMON *const cm, MACROBLOCK *x,
int *rate, int *distortion, int *skippable,
int *rate, int64_t *distortion, int *skippable,
BLOCK_SIZE_TYPE bsize) {
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
@ -952,13 +943,13 @@ static void super_block_uvrd(VP9_COMMON *const cm, MACROBLOCK *x,
static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
int *rate, int *rate_tokenonly,
int *distortion, int *skippable,
int64_t *distortion, int *skippable,
BLOCK_SIZE_TYPE bsize) {
MB_PREDICTION_MODE mode;
MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
int64_t best_rd = INT64_MAX, this_rd;
int this_rate_tokenonly, this_rate;
int this_distortion, s;
int this_rate_tokenonly, this_rate, s;
int64_t this_distortion;
for (mode = DC_PRED; mode <= TM_PRED; mode++) {
x->e_mbd.mode_info_context->mbmi.uv_mode = mode;
@ -1101,7 +1092,7 @@ static int64_t encode_inter_mb_segment(VP9_COMMON *const cm,
MACROBLOCK *x,
int i,
int *labelyrate,
int *distortion,
int64_t *distortion,
ENTROPY_CONTEXT *ta,
ENTROPY_CONTEXT *tl) {
int k;
@ -1126,7 +1117,7 @@ static int64_t encode_inter_mb_segment(VP9_COMMON *const cm,
raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
xd->plane[0].dst.buf,
xd->plane[0].dst.stride);
int thisdistortion = 0;
int64_t thisdistortion = 0;
int thisrate = 0;
*labelyrate = 0;
@ -1189,7 +1180,7 @@ typedef struct {
int64_t segment_rd;
int r;
int d;
int64_t d;
int segment_yrate;
MB_PREDICTION_MODE modes[4];
int_mv mvs[4], second_mvs[4];
@ -1281,21 +1272,18 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
BEST_SEG_INFO *bsi,
int_mv seg_mvs[4][MAX_REF_FRAMES],
int mi_row, int mi_col) {
int i, j;
int br = 0, bd = 0;
int i, j, br = 0, rate = 0, sbr = 0, idx, idy;
int64_t bd = 0, sbd = 0;
MB_PREDICTION_MODE this_mode;
MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
const int label_count = 4;
int64_t this_segment_rd = 0, other_segment_rd;
int label_mv_thresh;
int rate = 0;
int sbr = 0, sbd = 0;
int segmentyrate = 0;
int best_eobs[4] = { 0 };
BLOCK_SIZE_TYPE bsize = mbmi->sb_type;
int bwl = b_width_log2(bsize), bw = 1 << bwl;
int bhl = b_height_log2(bsize), bh = 1 << bhl;
int idx, idy;
vp9_variance_fn_ptr_t *v_fn_ptr;
ENTROPY_CONTEXT t_above[4], t_left[4];
ENTROPY_CONTEXT t_above_b[4], t_left_b[4];
@ -1340,7 +1328,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
// search for the best motion vector on this segment
for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
int64_t this_rd;
int distortion;
int64_t distortion;
int labelyrate;
ENTROPY_CONTEXT t_above_s[4], t_left_s[4];
const struct buf_2d orig_src = x->plane[0].src;
@ -1527,7 +1515,7 @@ static int rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
int64_t best_rd,
int *returntotrate,
int *returnyrate,
int *returndistortion,
int64_t *returndistortion,
int *skippable, int mvthresh,
int_mv seg_mvs[4][MAX_REF_FRAMES],
int mi_row, int mi_col) {
@ -1921,7 +1909,7 @@ static double model_dist_norm(double x) {
}
static void model_rd_from_var_lapndz(int var, int n, int qstep,
int *rate, int *dist) {
int *rate, int64_t *dist) {
// This function models the rate and distortion for a Laplacian
// source with given variance when quantized with a uniform quantizer
// with given stepsize. The closed form expression is:
@ -1958,12 +1946,13 @@ static enum BlockSize get_plane_block_size(BLOCK_SIZE_TYPE bsize,
static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize,
MACROBLOCK *x, MACROBLOCKD *xd,
int *out_rate_sum, int *out_dist_sum) {
int *out_rate_sum, int64_t *out_dist_sum) {
// Note our transform coeffs are 8 times an orthogonal transform.
// Hence quantizer step is also 8 times. To get effective quantizer
// we need to divide by 8 before sending to modeling function.
unsigned int sse;
int i, rate_sum = 0, dist_sum = 0;
int i, rate_sum = 0;
int64_t dist_sum = 0;
for (i = 0; i < MAX_MB_PLANE; ++i) {
struct macroblock_plane *const p = &x->plane[i];
@ -1973,7 +1962,8 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize,
const int bw = plane_block_width(bsize, pd);
const int bh = plane_block_height(bsize, pd);
const enum BlockSize bs = get_block_size(bw, bh);
int rate, dist;
int rate;
int64_t dist;
cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride,
pd->dst.buf, pd->dst.stride, &sse);
model_rd_from_var_lapndz(sse, bw * bh, pd->dequant[1] >> 3, &rate, &dist);
@ -2238,9 +2228,10 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
BLOCK_SIZE_TYPE bsize,
int64_t txfm_cache[],
int *rate2, int *distortion, int *skippable,
int *rate_y, int *distortion_y,
int *rate_uv, int *distortion_uv,
int *rate2, int64_t *distortion,
int *skippable,
int *rate_y, int64_t *distortion_y,
int *rate_uv, int64_t *distortion_uv,
int *mode_excluded, int *disable_skip,
INTERPOLATIONFILTERTYPE *best_filter,
int_mv *frame_mv,
@ -2344,7 +2335,8 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
*best_filter = EIGHTTAP;
} else {
int i, newbest;
int tmp_rate_sum = 0, tmp_dist_sum = 0;
int tmp_rate_sum = 0;
int64_t tmp_dist_sum = 0;
for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
int rs = 0;
const INTERPOLATIONFILTERTYPE filter = vp9_switchable_interp[i];
@ -2359,7 +2351,8 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
if (interpolating_intpel_seen && is_intpel_interp) {
rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate_sum, tmp_dist_sum);
} else {
int rate_sum = 0, dist_sum = 0;
int rate_sum = 0;
int64_t dist_sum = 0;
vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum);
rd = RDCOST(x->rdmult, x->rddiv, rs + rate_sum, dist_sum);
@ -2503,19 +2496,20 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
}
void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
int *returnrate, int *returndist,
int *returnrate, int64_t *returndist,
BLOCK_SIZE_TYPE bsize,
PICK_MODE_CONTEXT *ctx) {
VP9_COMMON *cm = &cpi->common;
MACROBLOCKD *xd = &x->e_mbd;
int rate_y = 0, rate_uv;
int rate_y_tokenonly = 0, rate_uv_tokenonly;
int dist_y = 0, dist_uv;
int y_skip = 0, uv_skip;
int rate_y = 0, rate_uv = 0;
int rate_y_tokenonly = 0, rate_uv_tokenonly = 0;
int64_t dist_y = 0, dist_uv = 0;
int y_skip = 0, uv_skip = 0;
int64_t txfm_cache[NB_TXFM_MODES], err;
MB_PREDICTION_MODE mode;
TX_SIZE txfm_size;
int rate4x4_y, rate4x4_y_tokenonly, dist4x4_y;
int rate4x4_y, rate4x4_y_tokenonly;
int64_t dist4x4_y;
int64_t err4x4 = INT64_MAX;
int i;
@ -2566,7 +2560,7 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
int mi_row, int mi_col,
int *returnrate,
int *returndistortion,
int64_t *returndistortion,
BLOCK_SIZE_TYPE bsize,
PICK_MODE_CONTEXT *ctx) {
VP9_COMMON *cm = &cpi->common;
@ -2601,7 +2595,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
INTERPOLATIONFILTERTYPE best_filter = SWITCHABLE;
INTERPOLATIONFILTERTYPE tmp_best_filter = SWITCHABLE;
int rate_uv_intra[TX_SIZE_MAX_SB], rate_uv_tokenonly[TX_SIZE_MAX_SB];
int dist_uv[TX_SIZE_MAX_SB], skip_uv[TX_SIZE_MAX_SB];
int64_t dist_uv[TX_SIZE_MAX_SB];
int skip_uv[TX_SIZE_MAX_SB];
MB_PREDICTION_MODE mode_uv[TX_SIZE_MAX_SB];
struct scale_factors scale_factor[4];
unsigned int ref_frame_mask = 0;
@ -2704,7 +2699,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
int disable_skip = 0;
int compmode_cost = 0;
int rate2 = 0, rate_y = 0, rate_uv = 0;
int distortion2 = 0, distortion_y = 0, distortion_uv = 0;
int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0;
int skippable;
int64_t txfm_cache[NB_TXFM_MODES];
int i;
@ -2891,11 +2886,13 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
distortion2 = distortion_y + distortion_uv;
} else if (this_mode == SPLITMV) {
const int is_comp_pred = mbmi->ref_frame[1] > 0;
int rate, distortion;
int rate;
int64_t distortion;
int64_t this_rd_thresh;
int64_t tmp_rd, tmp_best_rd = INT64_MAX, tmp_best_rdu = INT64_MAX;
int tmp_best_rate = INT_MAX, tmp_best_ratey = INT_MAX;
int tmp_best_distortion = INT_MAX, tmp_best_skippable = 0;
int64_t tmp_best_distortion = INT_MAX;
int tmp_best_skippable = 0;
int switchable_filter_index;
int_mv *second_ref = is_comp_pred ?
&mbmi->ref_mvs[mbmi->ref_frame[1]][0] : NULL;

View File

@ -20,12 +20,12 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex);
void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex);
void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
int *r, int *d, BLOCK_SIZE_TYPE bsize,
int *r, int64_t *d, BLOCK_SIZE_TYPE bsize,
PICK_MODE_CONTEXT *ctx);
int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
int mi_row, int mi_col,
int *r, int *d, BLOCK_SIZE_TYPE bsize,
int *r, int64_t *d, BLOCK_SIZE_TYPE bsize,
PICK_MODE_CONTEXT *ctx);
void vp9_init_me_luts();

View File

@ -1,125 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
;int vp9_block_error_xmm(short *coeff_ptr, short *dcoef_ptr)
;
; Sum of squared differences between coeff_ptr[] and dcoef_ptr[].
; Only offsets 0 and 16 of each buffer are read, i.e. a fixed 16 16-bit
; coefficients; there is no block-size argument.
; movdqa is used for every load, so both pointers must be 16-byte aligned.
global sym(vp9_block_error_xmm) PRIVATE
sym(vp9_block_error_xmm):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 2
push rsi
push rdi
; end prologue
mov rsi, arg(0) ;coeff_ptr
mov rdi, arg(1) ;dcoef_ptr
; load coefficients 0-7 and 8-15 from both buffers
movdqa xmm0, [rsi]
movdqa xmm1, [rdi]
movdqa xmm2, [rsi+16]
movdqa xmm3, [rdi+16]
; 16-bit differences
psubw xmm0, xmm1
psubw xmm2, xmm3
; square each difference and add adjacent pairs -> four 32-bit sums each
pmaddwd xmm0, xmm0
pmaddwd xmm2, xmm2
paddd xmm0, xmm2
; interleave the four dword sums with zeros (xmm5) and add the two halves
pxor xmm5, xmm5
movdqa xmm1, xmm0
punpckldq xmm0, xmm5
punpckhdq xmm1, xmm5
; NOTE(review): the additions below are 32-bit lane adds (paddd), so a
; carry out of bit 31 is not propagated into the zeroed upper dword.
paddd xmm0, xmm1
; fold the upper 8 bytes onto the lower 8 bytes
movdqa xmm1, xmm0
psrldq xmm0, 8
paddd xmm0, xmm1
; result is returned from the low quadword
movq rax, xmm0
pop rdi
pop rsi
; begin epilog
UNSHADOW_ARGS
pop rbp
ret
;int vp9_block_error_mmx(short *coeff_ptr, short *dcoef_ptr)
;
; Sum of squared differences between coeff_ptr[] and dcoef_ptr[].
; Reads byte offsets 0, 8, 16 and 24 of each buffer, i.e. a fixed 16 16-bit
; coefficients; there is no block-size argument.
; NOTE(review): no emms is issued in this routine -- confirm whether callers
; are expected to clear the MMX state.
global sym(vp9_block_error_mmx) PRIVATE
sym(vp9_block_error_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 2
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;coeff_ptr
pxor mm7, mm7
mov rdi, arg(1) ;dcoef_ptr
; coefficients 0-3 (mm3/mm4) and 4-7 (mm5/mm6)
movq mm3, [rsi]
movq mm4, [rdi]
movq mm5, [rsi+8]
movq mm6, [rdi+8]
; mm1 and mm2 are both zero here, so after por/pcmpeqw below mm1 becomes an
; all-ones mask and the pand leaves the coefficient 0-3 differences unchanged.
pxor mm1, mm1 ; from movd mm1, dc ; dc =0
movq mm2, mm7
psubw mm5, mm6
por mm1, mm2
pmaddwd mm5, mm5
pcmpeqw mm1, mm7
psubw mm3, mm4
pand mm1, mm3
pmaddwd mm1, mm1
; mm1 = two 32-bit partial sums for coefficients 0-7
paddd mm1, mm5
; coefficients 8-11 (mm3/mm4) and 12-15 (mm5/mm6)
movq mm3, [rsi+16]
movq mm4, [rdi+16]
movq mm5, [rsi+24]
movq mm6, [rdi+24]
psubw mm5, mm6
pmaddwd mm5, mm5
psubw mm3, mm4
pmaddwd mm3, mm3
paddd mm3, mm5
; mm1 = two 32-bit partial sums for all 16 coefficients
paddd mm1, mm3
; add the upper dword onto the lower dword; the total is in the low 32 bits.
; The upper dword of mm0 still holds the upper partial sum, so only the low
; 32 bits of rax carry the result.
movq mm0, mm1
psrlq mm1, 32
paddd mm0, mm1
movq rax, mm0
pop rdi
pop rsi
; begin epilog
UNSHADOW_ARGS
pop rbp
ret

View File

@ -0,0 +1,57 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "third_party/x86inc/x86inc.asm"
SECTION .text
; int64_t vp9_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size)
;
; Returns the sum of squared differences between coeff[] and dqcoeff[] over
; block_size 16-bit elements, accumulated in 64 bits.
; Each loop iteration consumes 2*mmsize bytes from each buffer with mova and
; the loop body always runs at least once, so this presumably requires
; block_size to be a positive multiple of 16 and both buffers to be
; mmsize-aligned -- confirm against callers.
INIT_XMM sse2
; x86inc cglobal: 3 arguments, 3 general-purpose registers, 6 vector
; registers; the arguments are named uqc (coeff), dqc (dqcoeff), size.
cglobal block_error, 3, 3, 6, uqc, dqc, size
pxor m4, m4 ; accumulator
pxor m5, m5 ; dedicated zero register
; point both pointers at the end of their buffers (size is in int16 units,
; hence the *2) and negate size so it counts up from -block_size to 0
lea uqcq, [uqcq+sizeq*2]
lea dqcq, [dqcq+sizeq*2]
neg sizeq
.loop:
; load two vectors of coefficients from each buffer
mova m0, [uqcq+sizeq*2]
mova m2, [dqcq+sizeq*2]
mova m1, [uqcq+sizeq*2+mmsize]
mova m3, [dqcq+sizeq*2+mmsize]
; 16-bit differences coeff - dqcoeff
psubw m0, m2
psubw m1, m3
; individual errors are max. 15bit+sign, so squares are 30bit, and
; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
pmaddwd m0, m0
pmaddwd m1, m1
; accumulate in 64bit
; (interleaving with the zero register widens each 32-bit sum to 64 bits)
punpckldq m2, m0, m5
punpckhdq m0, m5
punpckldq m3, m1, m5
punpckhdq m1, m5
paddq m4, m2
paddq m4, m0
paddq m4, m3
paddq m4, m1
; size counts elements while addresses scale it by 2, so adding mmsize
; advances by the 2*mmsize bytes consumed above
add sizeq, mmsize
jl .loop
; accumulate horizontally and store in return value
movhlps m5, m4
paddq m4, m5
%if ARCH_X86_64
movq rax, m4
%else
; 32-bit build: low dword of the sum in eax, high dword in edx
pshufd m5, m4, 0x1
movd eax, m4
movd edx, m5
%endif
RET

View File

@ -85,12 +85,12 @@ VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_fwalsh_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance_impl_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/vp9_sad_sse3.asm
VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_sad_ssse3.asm
VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/vp9_sad_sse4.asm
VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_encodeopt.asm
VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.c