Make coefficient skip condition an explicit RD choice.

This commit replaces zrun_zbin_boost, a mechanism that biases non-zero
coefficients following runs of zero coefficients towards being rounded to
zero, with an explicit skip-block choice in the RD loop.

The logic is basically that if individual coefficients should be rounded
towards zero (from an RD point of view), the trellis/optimize loop should
take care of it. If whole blocks should be zero (from an RD point of
view), a single RD check is much more efficient than the complete
serialization of the quantization loop that zrun_zbin_boost forces, where
each coefficient's zero-bin depends on the run of zeros before it.
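
To make the trade-off concrete, below is a minimal sketch of the decision
the RD loop now makes. The cost model is paraphrased from vp9's RDCOST
macro; should_skip_block and its parameter names are illustrative, not
encoder API.

  #include <stdint.h>

  /* Rate-distortion cost, modeled on vp9's RDCOST macro: rate is weighted
   * by lambda (rdmult) and distortion is scaled by a shift (rddiv). */
  static int64_t rd_cost(int rdmult, int rddiv, int rate, int64_t dist) {
    return ((128 + (int64_t)rate * rdmult) >> 8) + (dist << rddiv);
  }

  /* Returns 1 when signaling "skip" (code no coefficients; distortion
   * becomes the full source SSE) is cheaper than coding the coefficients. */
  static int should_skip_block(int rdmult, int rddiv,
                               int rate_coeffs,    /* bits to code coeffs */
                               int64_t dist_coded, /* distortion if coded */
                               int64_t sse) {      /* distortion if skipped */
    return rd_cost(rdmult, rddiv, rate_coeffs, dist_coded) >=
           rd_cost(rdmult, rddiv, 0, sse);
  }

The actual check added in vp9_rd_pick_inter_mode_sb (see the vp9_rdopt.c
hunks below) compares RDCOST(rate_y + rate_uv, distortion2) against
RDCOST(0, total_sse); when skipping wins, it zeroes the coefficient rates
and substitutes total_sse for the distortion.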

Quality change: derf +0.5% psnr, +1.6% ssim; yt +0.6% psnr, +1.1% ssim.
SIMD for quantize will follow in a separate patch. Results for other
test sets pending.

Change-Id: Ife5fa641163ac5150ac428011e87188f1937c1f4
Ronald S. Bultje 2013-06-27 17:41:54 -07:00
parent 1b5421f3c5
commit af660715c0
6 changed files with 115 additions and 91 deletions

vp9/common/vp9_rtcd_defs.sh

@ -558,7 +558,7 @@ prototype unsigned int vp9_get_mb_ss "const int16_t *"
specialize vp9_get_mb_ss mmx sse2
# ENCODEMB INVOKE
- prototype int64_t vp9_block_error "int16_t *coeff, int16_t *dqcoeff, intptr_t block_size"
+ prototype int64_t vp9_block_error "int16_t *coeff, int16_t *dqcoeff, intptr_t block_size, int64_t *ssz"
specialize vp9_block_error sse2
prototype void vp9_subtract_block "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"
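
In C terms, the updated vp9_block_error prototype behaves as follows
(declaration restated for reference; the comment is descriptive, not from
the source):

  #include <stdint.h>

  /* Returns sum((coeff[i] - dqcoeff[i])^2) over the block and stores
   * sum(coeff[i]^2), the distortion the block would incur if every
   * coefficient were zeroed, i.e. if the block were skipped, into *ssz. */
  int64_t vp9_block_error(int16_t *coeff, int16_t *dqcoeff,
                          intptr_t block_size, int64_t *ssz);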

vp9/encoder/vp9_block.h

@ -68,7 +68,6 @@ struct macroblock_plane {
int16_t *quant;
uint8_t *quant_shift;
int16_t *zbin;
- int16_t *zrun_zbin_boost;
int16_t *round;
// Zbin Over Quant value

vp9/encoder/vp9_onyx_int.h

@ -268,11 +268,7 @@ typedef struct VP9_COMP {
DECLARE_ALIGNED(16, unsigned char, a_quant_shift[QINDEX_RANGE][16]);
DECLARE_ALIGNED(16, short, a_zbin[QINDEX_RANGE][16]);
DECLARE_ALIGNED(16, short, a_round[QINDEX_RANGE][16]);
- DECLARE_ALIGNED(16, short, zrun_zbin_boost_a[QINDEX_RANGE][16]);
#endif
- DECLARE_ALIGNED(16, short, zrun_zbin_boost_y[QINDEX_RANGE][16]);
- DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv[QINDEX_RANGE][16]);
MACROBLOCK mb;
VP9_COMMON common;

vp9/encoder/vp9_quantize.c

@ -21,8 +21,7 @@
extern int enc_debug;
#endif
- static void quantize(int16_t *zbin_boost_orig_ptr,
-                      int16_t *coeff_ptr, int n_coeffs, int skip_block,
+ static void quantize(int16_t *coeff_ptr, int n_coeffs, int skip_block,
int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr,
uint8_t *quant_shift_ptr,
int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
@ -31,8 +30,6 @@ static void quantize(int16_t *zbin_boost_orig_ptr,
int i, rc, eob;
int zbins[2], nzbins[2], zbin;
int x, y, z, sz;
- int zero_run = 0;
- int16_t *zbin_boost_ptr = zbin_boost_orig_ptr;
int zero_flag = n_coeffs;
vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
@ -65,8 +62,7 @@ static void quantize(int16_t *zbin_boost_orig_ptr,
rc = scan[i];
z = coeff_ptr[rc];
- zbin = (zbins[rc != 0] + zbin_boost_ptr[zero_run]);
- zero_run += (zero_run < 15);
+ zbin = (zbins[rc != 0]);
sz = (z >> 31); // sign of z
x = (z ^ sz) - sz;
@ -81,7 +77,6 @@ static void quantize(int16_t *zbin_boost_orig_ptr,
if (y) {
eob = i; // last nonzero coeffs
- zero_run = 0; // set zero_run
}
}
}
@ -90,8 +85,7 @@ static void quantize(int16_t *zbin_boost_orig_ptr,
}
// This function works well for large transform size.
- static void quantize_sparse(int16_t *zbin_boost_orig_ptr,
-                             int16_t *coeff_ptr, int n_coeffs, int skip_block,
+ static void quantize_sparse(int16_t *coeff_ptr, int n_coeffs, int skip_block,
int16_t *zbin_ptr, int16_t *round_ptr,
int16_t *quant_ptr, uint8_t *quant_shift_ptr,
int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
@ -101,10 +95,7 @@ static void quantize_sparse(int16_t *zbin_boost_orig_ptr,
int i, rc, eob;
int zbins[2], nzbins[2], zbin;
int x, y, z, sz;
- int zero_run = 0;
- int16_t *zbin_boost_ptr = zbin_boost_orig_ptr;
int idx = 0;
- int pre_idx = 0;
vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
vpx_memset(dqcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
@ -135,11 +126,8 @@ static void quantize_sparse(int16_t *zbin_boost_orig_ptr,
rc = scan[idx_arr[i]];
// Calculate ZBIN
- zero_run += idx_arr[i] - pre_idx;
- if(zero_run > 15) zero_run = 15;
- zbin = (zbins[rc != 0] + zbin_boost_ptr[zero_run]);
+ zbin = (zbins[rc != 0]);
- pre_idx = idx_arr[i];
z = coeff_ptr[rc] * 2;
sz = (z >> 31); // sign of z
x = (z ^ sz) - sz; // x = abs(z)
@ -155,7 +143,6 @@ static void quantize_sparse(int16_t *zbin_boost_orig_ptr,
if (y) {
eob = idx_arr[i]; // last nonzero coeffs
- zero_run = -1; // set zero_run
}
}
}
@ -189,8 +176,7 @@ void vp9_quantize(MACROBLOCK *mb, int plane, int block, int n_coeffs,
// Save index of picked coefficient in pre-scan pass.
int idx_arr[1024];
- quantize_sparse(mb->plane[plane].zrun_zbin_boost,
-                 BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
+ quantize_sparse(BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
n_coeffs, mb->skip_block,
mb->plane[plane].zbin,
mb->plane[plane].round,
@ -204,8 +190,7 @@ void vp9_quantize(MACROBLOCK *mb, int plane, int block, int n_coeffs,
scan, idx_arr);
}
else {
- quantize(mb->plane[plane].zrun_zbin_boost,
-          BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
+ quantize(BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
n_coeffs, mb->skip_block,
mb->plane[plane].zbin,
mb->plane[plane].round,
@ -226,8 +211,7 @@ void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
const struct plane_block_idx pb_idx = plane_block_idx(y_blocks, b_idx);
const int *pt_scan = get_scan_4x4(tx_type);
- quantize(mb->plane[pb_idx.plane].zrun_zbin_boost,
-          BLOCK_OFFSET(mb->plane[pb_idx.plane].coeff, pb_idx.block, 16),
+ quantize(BLOCK_OFFSET(mb->plane[pb_idx.plane].coeff, pb_idx.block, 16),
16, mb->skip_block,
mb->plane[pb_idx.plane].zbin,
mb->plane[pb_idx.plane].round,
@ -261,9 +245,6 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
#endif
int q;
- static const int zbin_boost[16] = { 0, 0, 0, 8, 8, 8, 10, 12,
-                                     14, 16, 20, 24, 28, 32, 36, 40 };
for (q = 0; q < QINDEX_RANGE; q++) {
int qzbin_factor = (vp9_dc_quant(q, 0) < 148) ? 84 : 80;
int qrounding_factor = 48;
@ -277,14 +258,12 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
cpi->y_zbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
cpi->y_round[q][0] = (qrounding_factor * quant_val) >> 7;
cpi->common.y_dequant[q][0] = quant_val;
- cpi->zrun_zbin_boost_y[q][0] = (quant_val * zbin_boost[0]) >> 7;
quant_val = vp9_dc_quant(q, cpi->common.uv_dc_delta_q);
invert_quant(cpi->uv_quant[q] + 0, cpi->uv_quant_shift[q] + 0, quant_val);
cpi->uv_zbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
cpi->uv_round[q][0] = (qrounding_factor * quant_val) >> 7;
cpi->common.uv_dequant[q][0] = quant_val;
- cpi->zrun_zbin_boost_uv[q][0] = (quant_val * zbin_boost[0]) >> 7;
#if CONFIG_ALPHA
quant_val = vp9_dc_quant(q, cpi->common.a_dc_delta_q);
@ -292,7 +271,6 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
cpi->a_zbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
cpi->a_round[q][0] = (qrounding_factor * quant_val) >> 7;
cpi->common.a_dequant[q][0] = quant_val;
- cpi->zrun_zbin_boost_a[q][0] = (quant_val * zbin_boost[0]) >> 7;
#endif
quant_val = vp9_ac_quant(q, 0);
@ -310,15 +288,11 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
invert_quant(cpi->y_quant[q] + rc, cpi->y_quant_shift[q] + rc, quant_val);
cpi->y_zbin[q][rc] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
cpi->y_round[q][rc] = (qrounding_factor * quant_val) >> 7;
- cpi->zrun_zbin_boost_y[q][i] =
-     ROUND_POWER_OF_TWO(quant_val * zbin_boost[i], 7);
invert_quant(cpi->uv_quant[q] + rc, cpi->uv_quant_shift[q] + rc,
quant_uv_val);
cpi->uv_zbin[q][rc] = ROUND_POWER_OF_TWO(qzbin_factor * quant_uv_val, 7);
cpi->uv_round[q][rc] = (qrounding_factor * quant_uv_val) >> 7;
- cpi->zrun_zbin_boost_uv[q][i] =
-     ROUND_POWER_OF_TWO(quant_uv_val * zbin_boost[i], 7);
#if CONFIG_ALPHA
invert_quant(cpi->a_quant[q] + rc, cpi->a_quant_shift[q] + rc,
@ -326,8 +300,6 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
cpi->a_zbin[q][rc] =
ROUND_POWER_OF_TWO(qzbin_factor * quant_alpha_val, 7);
cpi->a_round[q][rc] = (qrounding_factor * quant_alpha_val) >> 7;
- cpi->zrun_zbin_boost_a[q][i] =
-     ROUND_POWER_OF_TWO(quant_alpha_val * zbin_boost[i], 7);
#endif
}
}
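
For orientation, the zbin values computed above are simple fixed-point
products; a worked example with a hypothetical quant_val
(ROUND_POWER_OF_TWO is vpx's round-half-up shift):

  #include <assert.h>

  #define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

  /* Hypothetical numbers: quant_val = 32 with qzbin_factor = 84 (the
   * dc_quant < 148 case above) puts zbin at 84/128, roughly 0.66 of a
   * quantization step; with zrun_zbin_boost gone, it no longer grows with
   * the length of the preceding zero run. */
  static void zbin_example(void) {
    const int qzbin_factor = 84, quant_val = 32;
    assert(ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7) == 21);  /* 2752 >> 7 */
  }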
@ -348,7 +320,6 @@ void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) {
x->plane[0].quant_shift = cpi->y_quant_shift[qindex];
x->plane[0].zbin = cpi->y_zbin[qindex];
x->plane[0].round = cpi->y_round[qindex];
- x->plane[0].zrun_zbin_boost = cpi->zrun_zbin_boost_y[qindex];
x->plane[0].zbin_extra = (int16_t)zbin_extra;
x->e_mbd.plane[0].dequant = cpi->common.y_dequant[qindex];
@ -361,7 +332,6 @@ void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) {
x->plane[i].quant_shift = cpi->uv_quant_shift[qindex];
x->plane[i].zbin = cpi->uv_zbin[qindex];
x->plane[i].round = cpi->uv_round[qindex];
- x->plane[i].zrun_zbin_boost = cpi->zrun_zbin_boost_uv[qindex];
x->plane[i].zbin_extra = (int16_t)zbin_extra;
x->e_mbd.plane[i].dequant = cpi->common.uv_dequant[qindex];
}
@ -371,7 +341,6 @@ void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) {
x->plane[3].quant_shift = cpi->a_quant_shift[qindex];
x->plane[3].zbin = cpi->a_zbin[qindex];
x->plane[3].round = cpi->a_round[qindex];
- x->plane[3].zrun_zbin_boost = cpi->zrun_zbin_boost_a[qindex];
x->plane[3].zbin_extra = (int16_t)zbin_extra;
x->e_mbd.plane[3].dequant = cpi->common.a_dequant[qindex];
#endif

vp9/encoder/vp9_rdopt.c

@ -283,15 +283,17 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) {
}
int64_t vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff,
- intptr_t block_size) {
+ intptr_t block_size, int64_t *ssz) {
int i;
- int64_t error = 0;
+ int64_t error = 0, sqcoeff = 0;
for (i = 0; i < block_size; i++) {
int this_diff = coeff[i] - dqcoeff[i];
error += (unsigned)this_diff * this_diff;
+ sqcoeff += (unsigned) coeff[i] * coeff[i];
}
+ *ssz = sqcoeff;
return error;
}
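
One property of the function above is worth spelling out: if the
dequantized block is all zeros, the returned error equals *ssz, which is
exactly why ssz can stand in for the distortion of a skipped block. A
small hypothetical check:

  #include <assert.h>
  #include <stdint.h>

  int64_t vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff,
                            intptr_t block_size, int64_t *ssz);

  /* Hypothetical sanity check: with dqcoeff zeroed, the reconstruction
   * error and the sum of squared source coefficients coincide. */
  static void check_skip_distortion(void) {
    int16_t coeff[16] = { 3, -7, 1, 0, 5, -2, 0, 0, 9, 0, 0, 0, -1, 0, 0, 0 };
    int16_t dqcoeff[16] = { 0 };
    int64_t ssz;
    assert(vp9_block_error_c(coeff, dqcoeff, 16, &ssz) == ssz);
  }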
@ -501,27 +503,31 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
}
static int64_t block_error_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize,
- int shift) {
+ int shift, int64_t *sse) {
struct macroblockd_plane *p = &x->e_mbd.plane[0];
const int bw = plane_block_width(bsize, p);
const int bh = plane_block_height(bsize, p);
- return vp9_block_error(x->plane[0].coeff, x->e_mbd.plane[0].dqcoeff,
-                        bw * bh) >> shift;
+ int64_t e = vp9_block_error(x->plane[0].coeff, x->e_mbd.plane[0].dqcoeff,
+                             bw * bh, sse) >> shift;
+ *sse >>= shift;
+ return e;
}
static int64_t block_error_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize,
- int shift) {
- int64_t sum = 0;
+ int shift, int64_t *sse) {
+ int64_t sum = 0, this_sse;
int plane;
+ *sse = 0;
for (plane = 1; plane < MAX_MB_PLANE; plane++) {
struct macroblockd_plane *p = &x->e_mbd.plane[plane];
const int bw = plane_block_width(bsize, p);
const int bh = plane_block_height(bsize, p);
sum += vp9_block_error(x->plane[plane].coeff, x->e_mbd.plane[plane].dqcoeff,
- bw * bh);
+ bw * bh, &this_sse);
+ *sse += this_sse;
}
+ *sse >>= shift;
return sum >> shift;
}
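
Note that block_error_sby and block_error_sbuv apply the same right-shift
to the returned error and to the sse out-parameter; if only one were
scaled, the later RDCOST comparison between coded and skipped distortion
would be biased. A reduced sketch of the pattern (helper name
hypothetical):

  #include <stdint.h>

  int64_t vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff,
                            intptr_t block_size, int64_t *ssz);

  /* Scale the coded-block error and the skip (all-zero) error identically
   * so the two remain directly comparable under RDCOST. */
  static int64_t scaled_block_error(int16_t *coeff, int16_t *dqcoeff,
                                    intptr_t n, int shift, int64_t *sse) {
    const int64_t e = vp9_block_error_c(coeff, dqcoeff, n, sse) >> shift;
    *sse >>= shift;
    return e;
  }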
@ -581,7 +587,7 @@ static int rdcost_uv(VP9_COMMON *const cm, MACROBLOCK *x,
static void super_block_yrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,
int *rate, int64_t *distortion,
- int *skippable,
+ int *skippable, int64_t *sse,
BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {
MACROBLOCKD *const xd = &x->e_mbd;
xd->mode_info_context->mbmi.txfm_size = tx_size;
@ -591,18 +597,18 @@ static void super_block_yrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,
else
vp9_xform_quant_sby(cm, x, bsize);
- *distortion = block_error_sby(x, bsize, tx_size == TX_32X32 ? 0 : 2);
+ *distortion = block_error_sby(x, bsize, tx_size == TX_32X32 ? 0 : 2, sse);
*rate = rdcost_plane(cm, x, 0, bsize, tx_size);
*skippable = vp9_sby_is_skippable(xd, bsize);
}
static void super_block_yrd(VP9_COMP *cpi,
MACROBLOCK *x, int *rate, int64_t *distortion,
- int *skip, BLOCK_SIZE_TYPE bs,
+ int *skip, int64_t *psse, BLOCK_SIZE_TYPE bs,
int64_t txfm_cache[NB_TXFM_MODES]) {
VP9_COMMON *const cm = &cpi->common;
int r[TX_SIZE_MAX_SB][2], s[TX_SIZE_MAX_SB];
- int64_t d[TX_SIZE_MAX_SB];
+ int64_t d[TX_SIZE_MAX_SB], sse[TX_SIZE_MAX_SB];
MACROBLOCKD *xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
@ -621,25 +627,27 @@ static void super_block_yrd(VP9_COMP *cpi,
mbmi->txfm_size = TX_4X4;
}
vpx_memset(txfm_cache, 0, NB_TXFM_MODES * sizeof(int64_t));
- super_block_yrd_for_txfm(cm, x, rate, distortion, skip, bs,
+ super_block_yrd_for_txfm(cm, x, rate, distortion, skip, &sse[0], bs,
mbmi->txfm_size);
return;
}
if (bs >= BLOCK_SIZE_SB32X32)
super_block_yrd_for_txfm(cm, x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32],
- bs, TX_32X32);
+ &sse[TX_32X32], bs, TX_32X32);
if (bs >= BLOCK_SIZE_MB16X16)
super_block_yrd_for_txfm(cm, x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16],
- bs, TX_16X16);
- super_block_yrd_for_txfm(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8], bs,
-                          TX_8X8);
- super_block_yrd_for_txfm(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4], bs,
-                          TX_4X4);
+ &sse[TX_16X16], bs, TX_16X16);
+ super_block_yrd_for_txfm(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8],
+                          &sse[TX_8X8], bs, TX_8X8);
+ super_block_yrd_for_txfm(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4],
+                          &sse[TX_4X4], bs, TX_4X4);
choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s,
skip, txfm_cache,
TX_32X32 - (bs < BLOCK_SIZE_SB32X32)
- (bs < BLOCK_SIZE_MB16X16));
+ if (psse)
+ *psse = sse[mbmi->txfm_size];
}
static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
@ -688,6 +696,8 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
for (idy = 0; idy < bh; ++idy) {
for (idx = 0; idx < bw; ++idx) {
+ int64_t ssz;
block = ib + idy * 2 + idx;
xd->mode_info_context->bmi[block].as_mode.first = mode;
src = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, block,
@ -718,7 +728,8 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
ratey += cost_coeffs(cm, x, 0, block, PLANE_TYPE_Y_WITH_DC,
tempa + idx, templ + idy, TX_4X4, 16);
distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff,
- block, 16), 16) >> 2;
+ block, 16),
+ 16, &ssz) >> 2;
if (best_tx_type != DCT_DCT)
vp9_short_iht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block, 16),
@ -881,7 +892,7 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
}
x->e_mbd.mode_info_context->mbmi.mode = mode;
- super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s,
+ super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s, NULL,
bsize, local_txfm_cache);
this_rate = this_rate_tokenonly + bmode_costs[mode];
@ -914,22 +925,25 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
static void super_block_uvrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,
int *rate, int64_t *distortion,
- int *skippable, BLOCK_SIZE_TYPE bsize,
+ int *skippable, int64_t *sse,
+ BLOCK_SIZE_TYPE bsize,
TX_SIZE uv_tx_size) {
MACROBLOCKD *const xd = &x->e_mbd;
+ int64_t dummy;
if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME)
vp9_encode_intra_block_uv(cm, x, bsize);
else
vp9_xform_quant_sbuv(cm, x, bsize);
- *distortion = block_error_sbuv(x, bsize, uv_tx_size == TX_32X32 ? 0 : 2);
+ *distortion = block_error_sbuv(x, bsize, uv_tx_size == TX_32X32 ? 0 : 2,
+                                sse ? sse : &dummy);
*rate = rdcost_uv(cm, x, bsize, uv_tx_size);
*skippable = vp9_sbuv_is_skippable(xd, bsize);
}
static void super_block_uvrd(VP9_COMMON *const cm, MACROBLOCK *x,
int *rate, int64_t *distortion, int *skippable,
- BLOCK_SIZE_TYPE bsize) {
+ int64_t *sse, BLOCK_SIZE_TYPE bsize) {
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
TX_SIZE uv_txfm_size = get_uv_tx_size(mbmi);
@ -937,7 +951,7 @@ static void super_block_uvrd(VP9_COMMON *const cm, MACROBLOCK *x,
if (mbmi->ref_frame[0] > INTRA_FRAME)
vp9_subtract_sbuv(x, bsize);
- super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, bsize,
+ super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, sse, bsize,
uv_txfm_size);
}
@ -954,7 +968,7 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
for (mode = DC_PRED; mode <= TM_PRED; mode++) {
x->e_mbd.mode_info_context->mbmi.uv_mode = mode;
super_block_uvrd(&cpi->common, x, &this_rate_tokenonly,
- &this_distortion, &s, bsize);
+ &this_distortion, &s, NULL, bsize);
this_rate = this_rate_tokenonly +
x->intra_uv_mode_cost[x->e_mbd.frame_type][mode];
this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
@ -1151,6 +1165,8 @@ static int64_t encode_inter_mb_segment(VP9_COMMON *const cm,
k = i;
for (idy = 0; idy < bh / 4; ++idy) {
for (idx = 0; idx < bw / 4; ++idx) {
+ int64_t ssz;
k += (idy * 2 + idx);
src_diff = raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, k,
x->plane[0].src_diff);
@ -1159,7 +1175,7 @@ static int64_t encode_inter_mb_segment(VP9_COMMON *const cm,
x->quantize_b_4x4(x, k, DCT_DCT, 16);
thisdistortion += vp9_block_error(coeff,
BLOCK_OFFSET(xd->plane[0].dqcoeff,
- k, 16), 16);
+ k, 16), 16, &ssz);
thisrate += cost_coeffs(cm, x, 0, k, PLANE_TYPE_Y_WITH_DC,
ta + (k & 1),
tl + (k >> 1), TX_4X4, 16);
@ -2238,7 +2254,8 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
INTERPOLATIONFILTERTYPE *best_filter,
int_mv *frame_mv,
int mi_row, int mi_col,
- int_mv single_newmv[MAX_REF_FRAMES]) {
+ int_mv single_newmv[MAX_REF_FRAMES],
+ int64_t *psse) {
VP9_COMMON *cm = &cpi->common;
MACROBLOCKD *xd = &x->e_mbd;
MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
@ -2467,17 +2484,19 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
if (!x->skip) {
int skippable_y, skippable_uv;
+ int64_t sseuv = INT_MAX;
// Y cost and distortion
- super_block_yrd(cpi, x, rate_y, distortion_y, &skippable_y,
+ super_block_yrd(cpi, x, rate_y, distortion_y, &skippable_y, psse,
bsize, txfm_cache);
*rate2 += *rate_y;
*distortion += *distortion_y;
super_block_uvrd(cm, x, rate_uv, distortion_uv,
- &skippable_uv, bsize);
+ &skippable_uv, &sseuv, bsize);
+ *psse += sseuv;
*rate2 += *rate_uv;
*distortion += *distortion_uv;
*skippable = skippable_y && skippable_uv;
@ -2611,6 +2630,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
int bws = (1 << bwsl) / 4; // mode_info step for subsize
int bhsl = b_height_log2(bsize);
int bhs = (1 << bhsl) / 4; // mode_info step for subsize
+ int best_skip2 = 0;
for (i = 0; i < 4; i++) {
int j;
@ -2702,6 +2722,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
int skippable;
int64_t txfm_cache[NB_TXFM_MODES];
int i;
+ int this_skip2 = 0;
+ int64_t total_sse = INT_MAX;
for (i = 0; i < NB_TXFM_MODES; ++i)
txfm_cache[i] = INT64_MAX;
@ -2863,7 +2885,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
txfm_cache[i] = txfm_cache[ONLY_4X4];
} else if (ref_frame == INTRA_FRAME) {
TX_SIZE uv_tx;
- super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable,
+ super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL,
bsize, txfm_cache);
uv_tx = mbmi->txfm_size;
@ -2989,7 +3011,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
BLOCK_SIZE_SB8X8);
vp9_subtract_sbuv(x, BLOCK_SIZE_SB8X8);
super_block_uvrd_for_txfm(cm, x, &rate_uv, &distortion_uv,
- &uv_skippable, BLOCK_SIZE_SB8X8, TX_4X4);
+ &uv_skippable, NULL, BLOCK_SIZE_SB8X8, TX_4X4);
rate2 += rate_uv;
distortion2 += distortion_uv;
skippable = skippable && uv_skippable;
@ -3017,7 +3039,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
&mode_excluded, &disable_skip,
&tmp_best_filter, frame_mv[this_mode],
mi_row, mi_col,
- single_newmv);
+ single_newmv, &total_sse);
if (this_rd == INT64_MAX)
continue;
}
@ -3062,10 +3084,29 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
rate2 += prob_skip_cost;
}
}
+ } else if (mb_skip_allowed && ref_frame != INTRA_FRAME &&
+            this_mode != SPLITMV) {
+ if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
+     RDCOST(x->rdmult, x->rddiv, 0, total_sse)) {
+ // Add in the cost of the no skip flag.
+ int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob(cm, xd,
+                                   PRED_MBSKIP), 0);
+ rate2 += prob_skip_cost;
+ } else {
+ int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob(cm, xd,
+                                   PRED_MBSKIP), 1);
+ rate2 += prob_skip_cost;
+ distortion2 = total_sse;
+ assert(total_sse >= 0);
+ rate2 -= (rate_y + rate_uv);
+ rate_y = 0;
+ rate_uv = 0;
+ this_skip2 = 1;
+ }
} else if (mb_skip_allowed) {
// Add in the cost of the no skip flag.
int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob(cm, xd,
PRED_MBSKIP), 0);
rate2 += prob_skip_cost;
}
@ -3119,6 +3160,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
*returndistortion = distortion2;
best_rd = this_rd;
best_mbmode = *mbmi;
+ best_skip2 = this_skip2;
best_partition = *x->partition_info;
if (this_mode == I4X4_PRED || this_mode == SPLITMV)
@ -3301,6 +3343,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
// macroblock modes
*mbmi = best_mbmode;
+ x->skip |= best_skip2;
if (best_mbmode.ref_frame[0] == INTRA_FRAME &&
best_mbmode.sb_type < BLOCK_SIZE_SB8X8) {
for (i = 0; i < 4; i++)

vp9/encoder/x86/vp9_error_sse2.asm

@ -12,45 +12,62 @@
SECTION .text
- ; void vp9_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size)
+ ; int64_t vp9_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size,
+ ;                         int64_t *ssz)
INIT_XMM sse2
- cglobal block_error, 3, 3, 6, uqc, dqc, size
- pxor m4, m4 ; accumulator
+ cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
+ pxor m4, m4 ; sse accumulator
+ pxor m6, m6 ; ssz accumulator
pxor m5, m5 ; dedicated zero register
lea uqcq, [uqcq+sizeq*2]
lea dqcq, [dqcq+sizeq*2]
neg sizeq
.loop:
- mova m0, [uqcq+sizeq*2]
- mova m2, [dqcq+sizeq*2]
- mova m1, [uqcq+sizeq*2+mmsize]
- mova m3, [dqcq+sizeq*2+mmsize]
+ mova m2, [uqcq+sizeq*2]
+ mova m0, [dqcq+sizeq*2]
+ mova m3, [uqcq+sizeq*2+mmsize]
+ mova m1, [dqcq+sizeq*2+mmsize]
psubw m0, m2
psubw m1, m3
; individual errors are max. 15bit+sign, so squares are 30bit, and
; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
pmaddwd m0, m0
pmaddwd m1, m1
+ pmaddwd m2, m2
+ pmaddwd m3, m3
; accumulate in 64bit
- punpckldq m2, m0, m5
+ punpckldq m7, m0, m5
punpckhdq m0, m5
- punpckldq m3, m1, m5
- punpckhdq m1, m5
- paddq m4, m2
+ paddq m4, m7
+ punpckldq m7, m1, m5
paddq m4, m0
- paddq m4, m3
+ punpckhdq m1, m5
+ paddq m4, m7
+ punpckldq m7, m2, m5
paddq m4, m1
+ punpckhdq m2, m5
+ paddq m6, m7
+ punpckldq m7, m3, m5
+ paddq m6, m2
+ punpckhdq m3, m5
+ paddq m6, m7
+ paddq m6, m3
add sizeq, mmsize
jl .loop
; accumulate horizontally and store in return value
movhlps m5, m4
+ movhlps m7, m6
paddq m4, m5
+ paddq m6, m7
%if ARCH_X86_64
movq rax, m4
+ movq [sszq], m6
%else
+ mov eax, sszm
pshufd m5, m4, 0x1
+ movq [eax], m6
movd eax, m4
movd edx, m5
%endif
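
For readers more comfortable with intrinsics than with x86inc assembly,
here is a C-intrinsics rendering of the same kernel. It is illustrative,
not the shipped code: it processes 8 coefficients per iteration (the
assembly does 16), assumes block_size is a multiple of 8 with 16-byte
aligned buffers, and uses _mm_cvtsi128_si64, so it targets x86-64.

  #include <emmintrin.h>
  #include <stdint.h>

  static int64_t block_error_sse2_sketch(const int16_t *coeff,
                                         const int16_t *dqcoeff,
                                         intptr_t block_size, int64_t *ssz) {
    const __m128i zero = _mm_setzero_si128();
    __m128i err_acc = _mm_setzero_si128();  /* two 64-bit error lanes */
    __m128i ssz_acc = _mm_setzero_si128();  /* two 64-bit ssz lanes */
    intptr_t i;
    for (i = 0; i < block_size; i += 8) {
      const __m128i c = _mm_load_si128((const __m128i *)(coeff + i));
      const __m128i d = _mm_load_si128((const __m128i *)(dqcoeff + i));
      const __m128i diff = _mm_sub_epi16(d, c);
      /* pmaddwd: pairs of 16-bit squares summed into 32 bits; as the
       * assembly comment notes, each sum fits in 31 bits, so the sign bit
       * stays clear. */
      const __m128i sq_diff = _mm_madd_epi16(diff, diff);
      const __m128i sq_c = _mm_madd_epi16(c, c);
      /* Zero-extend the 32-bit partial sums to 64 bits and accumulate. */
      err_acc = _mm_add_epi64(err_acc, _mm_unpacklo_epi32(sq_diff, zero));
      err_acc = _mm_add_epi64(err_acc, _mm_unpackhi_epi32(sq_diff, zero));
      ssz_acc = _mm_add_epi64(ssz_acc, _mm_unpacklo_epi32(sq_c, zero));
      ssz_acc = _mm_add_epi64(ssz_acc, _mm_unpackhi_epi32(sq_c, zero));
    }
    /* Fold the two 64-bit lanes of each accumulator. */
    err_acc = _mm_add_epi64(err_acc, _mm_srli_si128(err_acc, 8));
    ssz_acc = _mm_add_epi64(ssz_acc, _mm_srli_si128(ssz_acc, 8));
    *ssz = _mm_cvtsi128_si64(ssz_acc);
    return _mm_cvtsi128_si64(err_acc);
  }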