Merge "Refactor block_yrd function for RTC coding mode"

This commit is contained in:
Jingning Han 2015-04-01 14:54:24 -07:00 committed by Gerrit Code Review
commit 7acb2a8795
4 changed files with 101 additions and 19 deletions

View File

@ -1171,6 +1171,9 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
# Coefficient error: takes tran_low_t inputs and an extra int64_t *ssz
# argument; specialized for avx2 and whatever $sse2_x86inc expands to.
add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
specialize qw/vp9_block_error avx2/, "$sse2_x86inc";
# Variant operating directly on int16_t coefficients, with no ssz argument.
# NOTE(review): block_size is declared as int here, while vp9_block_error
# above uses intptr_t -- confirm the sse2 implementation agrees.
add_proto qw/int64_t vp9_block_error_fp/, "const int16_t *coeff, const int16_t *dqcoeff, int block_size";
specialize qw/vp9_block_error_fp sse2/;
# Quantizer entry point; specialized for neon, sse2 and whatever
# $ssse3_x86_64 expands to.
add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_quantize_fp neon sse2/, "$ssse3_x86_64";

View File

@ -557,6 +557,20 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize,
*out_dist_sum += dist << 4;
}
#if CONFIG_VP9_HIGHBITDEPTH
// High-bitdepth build of block_yrd.
//
// No transform-domain estimate is made here: rate and dist are filled in by
// model_rd_for_sb_y(), whose variance and SSE outputs are discarded.  The
// block is always reported as not skippable and *sse is set to INT_MAX.
//
//   cpi, x     - encoder and macroblock state, forwarded to the model.
//   rate, dist - outputs, written by model_rd_for_sb_y().
//   skippable  - output, always set to 0.
//   sse        - output, always set to INT_MAX.
//   plane, tx_size - unused in this build.
//   bsize      - block size, forwarded to the model.
static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
                      int *skippable, int64_t *sse, int plane,
                      BLOCK_SIZE bsize, TX_SIZE tx_size) {
  unsigned int model_var;
  unsigned int model_sse;
  MACROBLOCKD *const mbd = &x->e_mbd;

  // Present in the signature but not consulted by this variant.
  (void)tx_size;
  (void)plane;

  model_rd_for_sb_y(cpi, bsize, x, mbd, rate, dist, &model_var, &model_sse);

  *skippable = 0;
  *sse = INT_MAX;
}
#else
static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
int *skippable, int64_t *sse, int plane,
BLOCK_SIZE bsize, TX_SIZE tx_size) {
@ -574,23 +588,9 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 :
xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
#if CONFIG_VP9_HIGHBITDEPTH
unsigned int var_y, sse_y;
model_rd_for_sb_y(cpi, bsize, x, xd, rate, dist, &var_y, &sse_y);
*sse = INT_MAX;
*skippable = 0;
return;
#else
(void)cpi;
#endif
vp9_subtract_plane(x, bsize, plane);
*skippable = 1;
*rate = 0;
*dist = 0;
*sse = 0;
// Keep track of the row and column of the blocks we use so that we know
// if we are in the unrestricted motion border.
for (r = 0; r < max_blocks_high; r += block_step) {
@ -604,7 +604,6 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
const int diff_stride = 4 * num_4x4_blocks_wide_lookup[bsize];
int i, j;
const int16_t *src_diff;
int64_t this_sse;
txfrm_block_to_raster_xy(bsize, tx_size, block, &i, &j);
src_diff = &p->src_diff[4 * (j * diff_stride + i)];
@ -641,16 +640,36 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
assert(0);
break;
}
*skippable &= (*eob == 0);
}
block += step;
}
}
*dist += vp9_block_error(coeff, dqcoeff, step << 4, &this_sse) >> shift;
if (*skippable && *sse < INT64_MAX) {
*dist = (*sse << 6) >> shift;
*sse = *dist;
return;
}
block = 0;
*rate = 0;
*dist = 0;
*sse = (*sse << 6) >> shift;
for (r = 0; r < max_blocks_high; r += block_step) {
for (c = 0; c < num_4x4_w; c += block_step) {
if (c < max_blocks_wide) {
tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
uint16_t *const eob = &p->eobs[block];
if (*eob == 1)
*rate += (int)abs(qcoeff[0]);
else if (*eob > 1)
*rate += (int)vp9_satd((const int16_t *)qcoeff, step << 4);
*sse += (this_sse >> shift);
*skippable &= (*eob == 0);
*dist += vp9_block_error_fp(coeff, dqcoeff, step << 4) >> shift;
}
block += step;
}
@ -659,6 +678,7 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
*rate <<= 8;
*rate *= 6;
}
#endif
static void model_rd_for_sb_uv(VP9_COMP *cpi, BLOCK_SIZE bsize,
MACROBLOCK *x, MACROBLOCKD *xd,
@ -866,7 +886,7 @@ static void estimate_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
int i, j;
int rate;
int64_t dist;
int64_t this_sse;
int64_t this_sse = INT64_MAX;
int is_skippable;
txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
@ -1328,6 +1348,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
}
if (bsize <= BLOCK_16X16) {
this_sse = (int64_t)sse_y;
block_yrd(cpi, x, &this_rdc.rate, &this_rdc.dist, &is_skippable,
&this_sse, 0, bsize, mbmi->tx_size);
x->skip_txfm[0] = is_skippable;

View File

@ -292,6 +292,18 @@ int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
return error;
}
// Returns the sum of squared differences between two int16_t coefficient
// arrays.
//
//   coeff      - first array, at least block_size entries.
//   dqcoeff    - second array, at least block_size entries.
//   block_size - number of entries to compare; values <= 0 yield 0.
//
// The result is accumulated in 64 bits.
int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff,
                             int block_size) {
  int i;
  int64_t error = 0;

  for (i = 0; i < block_size; i++) {
    const int diff = coeff[i] - dqcoeff[i];
    // Widen before multiplying: |diff| can reach 65535 and its square
    // (4294836225) does not fit in a 32-bit int, so an int multiply would
    // be signed overflow.
    error += (int64_t)diff * diff;
  }

  return error;
}
#if CONFIG_VP9_HIGHBITDEPTH
int64_t vp9_highbd_block_error_c(const tran_low_t *coeff,

View File

@ -72,3 +72,49 @@ cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
movd edx, m5
%endif
RET
; Compute the sum of squared difference between two int16_t vectors.
; int64_t vp9_block_error_fp(int16_t *coeff, int16_t *dqcoeff,
; intptr_t block_size)
;
; Arguments (x86inc names): uqc = coeff, dqc = dqcoeff, size = block_size.
; The result is returned in rax on x86-64 and in edx:eax otherwise.
;
; The loop body always runs at least once and consumes mmsize int16_t
; elements from each input per iteration, so block_size is expected to be a
; positive multiple of mmsize (16 with INIT_XMM).
; NOTE(review): the RTCD prototype declares block_size as int, while this
; code uses the full-width sizeq register -- confirm the upper bits are
; valid on x86-64.
; NOTE(review): mova is x86inc's aligned move; both inputs are presumably
; 16-byte aligned -- confirm with callers.
INIT_XMM sse2
cglobal block_error_fp, 3, 3, 8, uqc, dqc, size
pxor m4, m4 ; sse accumulator
pxor m5, m5 ; dedicated zero register
; Point both inputs one past their last element (2 bytes per element) and
; negate the count, so the index runs from -block_size up to 0.
lea uqcq, [uqcq+sizeq*2]
lea dqcq, [dqcq+sizeq*2]
neg sizeq
.loop:
; Load two registers' worth of int16_t values from each input.
mova m2, [uqcq+sizeq*2]
mova m0, [dqcq+sizeq*2]
mova m3, [uqcq+sizeq*2+mmsize]
mova m1, [dqcq+sizeq*2+mmsize]
; m0/m1 = dqcoeff - coeff, per 16-bit lane.
; NOTE(review): psubw wraps at 16 bits, so the bound stated below relies on
; each difference fitting in an int16_t -- confirm the input range.
psubw m0, m2
psubw m1, m3
; individual errors are max. 15bit+sign, so squares are 30bit, and
; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
pmaddwd m0, m0
pmaddwd m1, m1
; accumulate in 64bit
; Interleaving with the zero register widens each 32-bit sum to 64 bits
; before it is added into the two 64-bit lanes of m4.
punpckldq m7, m0, m5
punpckhdq m0, m5
paddq m4, m7
punpckldq m7, m1, m5
paddq m4, m0
punpckhdq m1, m5
paddq m4, m7
paddq m4, m1
; 2*mmsize bytes per input were consumed, i.e. mmsize elements.
add sizeq, mmsize
jl .loop
; accumulate horizontally and store in return value
; Fold the upper 64-bit lane of m4 onto the lower one.
movhlps m5, m4
paddq m4, m5
%if ARCH_X86_64
movq rax, m4
%else
; 32-bit: split the 64-bit total into eax (low dword) and edx (high dword).
pshufd m5, m4, 0x1
movd eax, m4
movd edx, m5
%endif
RET