Merge "Refactor block_yrd function for RTC coding mode".
This change is contained in commit 7acb2a8795.
@ -1171,6 +1171,9 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
|
||||
add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
|
||||
specialize qw/vp9_block_error avx2/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/int64_t vp9_block_error_fp/, "const int16_t *coeff, const int16_t *dqcoeff, int block_size";
|
||||
specialize qw/vp9_block_error_fp sse2/;
|
||||
|
||||
add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
|
||||
specialize qw/vp9_quantize_fp neon sse2/, "$ssse3_x86_64";
|
||||
|
||||
|
@ -557,6 +557,20 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize,
|
||||
*out_dist_sum += dist << 4;
|
||||
}
|
||||
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
|
||||
int *skippable, int64_t *sse, int plane,
|
||||
BLOCK_SIZE bsize, TX_SIZE tx_size) {
|
||||
MACROBLOCKD *xd = &x->e_mbd;
|
||||
unsigned int var_y, sse_y;
|
||||
(void)plane;
|
||||
(void)tx_size;
|
||||
model_rd_for_sb_y(cpi, bsize, x, xd, rate, dist, &var_y, &sse_y);
|
||||
*sse = INT_MAX;
|
||||
*skippable = 0;
|
||||
return;
|
||||
}
|
||||
#else
|
||||
static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
|
||||
int *skippable, int64_t *sse, int plane,
|
||||
BLOCK_SIZE bsize, TX_SIZE tx_size) {
|
||||
@ -574,23 +588,9 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
|
||||
const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 :
|
||||
xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
|
||||
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
unsigned int var_y, sse_y;
|
||||
model_rd_for_sb_y(cpi, bsize, x, xd, rate, dist, &var_y, &sse_y);
|
||||
*sse = INT_MAX;
|
||||
*skippable = 0;
|
||||
return;
|
||||
#else
|
||||
(void)cpi;
|
||||
#endif
|
||||
|
||||
vp9_subtract_plane(x, bsize, plane);
|
||||
|
||||
*skippable = 1;
|
||||
*rate = 0;
|
||||
*dist = 0;
|
||||
*sse = 0;
|
||||
|
||||
// Keep track of the row and column of the blocks we use so that we know
|
||||
// if we are in the unrestricted motion border.
|
||||
for (r = 0; r < max_blocks_high; r += block_step) {
|
||||
@ -604,7 +604,6 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
|
||||
const int diff_stride = 4 * num_4x4_blocks_wide_lookup[bsize];
|
||||
int i, j;
|
||||
const int16_t *src_diff;
|
||||
int64_t this_sse;
|
||||
txfrm_block_to_raster_xy(bsize, tx_size, block, &i, &j);
|
||||
src_diff = &p->src_diff[4 * (j * diff_stride + i)];
|
||||
|
||||
@ -641,16 +640,36 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
|
||||
assert(0);
|
||||
break;
|
||||
}
|
||||
*skippable &= (*eob == 0);
|
||||
}
|
||||
block += step;
|
||||
}
|
||||
}
|
||||
|
||||
*dist += vp9_block_error(coeff, dqcoeff, step << 4, &this_sse) >> shift;
|
||||
if (*skippable && *sse < INT64_MAX) {
|
||||
*dist = (*sse << 6) >> shift;
|
||||
*sse = *dist;
|
||||
return;
|
||||
}
|
||||
|
||||
block = 0;
|
||||
*rate = 0;
|
||||
*dist = 0;
|
||||
*sse = (*sse << 6) >> shift;
|
||||
for (r = 0; r < max_blocks_high; r += block_step) {
|
||||
for (c = 0; c < num_4x4_w; c += block_step) {
|
||||
if (c < max_blocks_wide) {
|
||||
tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
|
||||
tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
|
||||
tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
|
||||
uint16_t *const eob = &p->eobs[block];
|
||||
|
||||
if (*eob == 1)
|
||||
*rate += (int)abs(qcoeff[0]);
|
||||
else if (*eob > 1)
|
||||
*rate += (int)vp9_satd((const int16_t *)qcoeff, step << 4);
|
||||
|
||||
*sse += (this_sse >> shift);
|
||||
*skippable &= (*eob == 0);
|
||||
*dist += vp9_block_error_fp(coeff, dqcoeff, step << 4) >> shift;
|
||||
}
|
||||
block += step;
|
||||
}
|
||||
@ -659,6 +678,7 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
|
||||
*rate <<= 8;
|
||||
*rate *= 6;
|
||||
}
|
||||
#endif
|
||||
|
||||
static void model_rd_for_sb_uv(VP9_COMP *cpi, BLOCK_SIZE bsize,
|
||||
MACROBLOCK *x, MACROBLOCKD *xd,
|
||||
@ -866,7 +886,7 @@ static void estimate_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
|
||||
int i, j;
|
||||
int rate;
|
||||
int64_t dist;
|
||||
int64_t this_sse;
|
||||
int64_t this_sse = INT64_MAX;
|
||||
int is_skippable;
|
||||
|
||||
txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
|
||||
@ -1328,6 +1348,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
|
||||
}
|
||||
|
||||
if (bsize <= BLOCK_16X16) {
|
||||
this_sse = (int64_t)sse_y;
|
||||
block_yrd(cpi, x, &this_rdc.rate, &this_rdc.dist, &is_skippable,
|
||||
&this_sse, 0, bsize, mbmi->tx_size);
|
||||
x->skip_txfm[0] = is_skippable;
|
||||
|
@ -292,6 +292,18 @@ int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
|
||||
return error;
|
||||
}
|
||||
|
||||
/* Sum of squared differences between two int16_t coefficient vectors.
 *
 * coeff      - original coefficients.
 * dqcoeff    - dequantized coefficients.
 * block_size - number of elements to compare.
 *
 * Returns the accumulated squared error as a 64-bit value.
 */
int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff,
                             int block_size) {
  int i;
  int64_t error = 0;

  for (i = 0; i < block_size; i++) {
    /* Widen before squaring: the difference of two int16_t values spans
     * [-65535, 65535], so squaring it in plain int can exceed INT_MAX,
     * which is undefined behavior for signed arithmetic in C. */
    const int64_t diff = coeff[i] - dqcoeff[i];
    error += diff * diff;
  }

  return error;
}
|
||||
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
int64_t vp9_highbd_block_error_c(const tran_low_t *coeff,
|
||||
|
@ -72,3 +72,49 @@ cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
|
||||
movd edx, m5
|
||||
%endif
|
||||
RET
|
||||
|
||||
; Compute the sum of squared difference between two int16_t vectors.
; int64_t vp9_block_error_fp(const int16_t *coeff, const int16_t *dqcoeff,
;                            int block_size)
; Registers (via x264asm cglobal): uqcq = coeff, dqcq = dqcoeff, sizeq = count.
INIT_XMM sse2
cglobal block_error_fp, 3, 3, 8, uqc, dqc, size
  pxor      m4, m4                 ; sse accumulator
  pxor      m5, m5                 ; dedicated zero register
  ; Point both bases one-past-the-end, then count sizeq up from the negative
  ; element offset so a single add+jl drives the loop.
  lea       uqcq, [uqcq+sizeq*2]
  lea       dqcq, [dqcq+sizeq*2]
  neg       sizeq
.loop:
  ; Process 2*mmsize bytes (16 int16_t values) per iteration.
  mova      m2, [uqcq+sizeq*2]
  mova      m0, [dqcq+sizeq*2]
  mova      m3, [uqcq+sizeq*2+mmsize]
  mova      m1, [dqcq+sizeq*2+mmsize]
  psubw     m0, m2
  psubw     m1, m3
  ; individual errors are max. 15bit+sign, so squares are 30bit, and
  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
  ; NOTE(review): this assumes |coeff - dqcoeff| stays within 15 bits;
  ; callers must guarantee the coefficient range.
  pmaddwd   m0, m0
  pmaddwd   m1, m1
  ; accumulate in 64bit: zero-extend each 32-bit lane (interleave with the
  ; zero register) before the quadword adds.
  punpckldq m7, m0, m5
  punpckhdq m0, m5
  paddq     m4, m7
  punpckldq m7, m1, m5
  paddq     m4, m0
  punpckhdq m1, m5
  paddq     m4, m7
  paddq     m4, m1
  add       sizeq, mmsize
  jl .loop

  ; accumulate horizontally and store in return value
  movhlps   m5, m4
  paddq     m4, m5
%if ARCH_X86_64
  movq    rax, m4
%else
  ; 32-bit ABI: return the 64-bit result in edx:eax.
  pshufd   m5, m4, 0x1
  movd    eax, m4
  movd    edx, m5
%endif
  RET
|
||||
|
Loading…
x
Reference in New Issue
Block a user