Add a vector form of routine vp9_model_rd_from_var_lapndz
Add routine vp9_model_rd_from_var_lapndz_vec and call it from model_rd_for_sb to model the rate and distortion for MAX_MB_PLANE Laplacian sources in parallel. The caller ensures that all sources have non-zero variance. Measured an 18% to 25% reduction in retired instructions, and a 17% to 24% reduction in instruction execution cost with different compilers for the Laplacian modeling. No change in behavior. TEST=Verified that encoded files match bit for bit, with and without this change. BUG=b/33678225 Change-Id: I6b76947f21c659a349adb896e13e99f6e3f951e6
This commit is contained in:
parent
83ba1880bf
commit
976ddb61d3
@ -312,63 +312,62 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi) {
|
||||
}
|
||||
}
|
||||
|
||||
// NOTE: The tables below must be of the same size.

// The functions described below are sampled at the four most significant
// bits of x^2 + 8 / 256.

// Normalized rate:
// This table models the rate for a Laplacian source with given variance
// when quantized with a uniform quantizer with given stepsize. The
// closed form expression is:
// Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)],
// where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance),
// and H(x) is the binary entropy function.
// Entries are in Q10 fixed point (per the _q10 suffix and the >> 10 applied
// by the interpolation code that reads this table).
static const int rate_tab_q10[] = {
  65536, 6086, 5574, 5275, 5063, 4899, 4764, 4651, 4553, 4389, 4255, 4142, 4044,
  3958,  3881, 3811, 3748, 3635, 3538, 3453, 3376, 3307, 3244, 3186, 3133, 3037,
  2952,  2877, 2809, 2747, 2690, 2638, 2589, 2501, 2423, 2353, 2290, 2232, 2179,
  2130,  2084, 2001, 1928, 1862, 1802, 1748, 1698, 1651, 1608, 1530, 1460, 1398,
  1342,  1290, 1243, 1199, 1159, 1086, 1021, 963,  911,  864,  821,  781,  745,
  680,   623,  574,  530,  490,  455,  424,  395,  345,  304,  269,  239,  213,
  190,   171,  154,  126,  104,  87,   73,   61,   52,   44,   38,   28,   21,
  16,    12,   10,   8,    6,    5,    3,    2,    1,    1,    1,    0,    0,
};
|
||||
|
||||
// Normalized distortion:
// This table models the normalized distortion for a Laplacian source
// with given variance when quantized with a uniform quantizer
// with given stepsize. The closed form expression is:
// Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2))
// where x = qpstep / sqrt(variance).
// Note the actual distortion is Dn * variance.
// Entries are in Q10 fixed point, so the final value 1024 corresponds to
// Dn = 1.
static const int dist_tab_q10[] = {
  0,    0,    1,    1,    1,    2,    2,    2,    3,    3,    4,    5,    5,
  6,    7,    7,    8,    9,    11,   12,   13,   15,   16,   17,   18,   21,
  24,   26,   29,   31,   34,   36,   39,   44,   49,   54,   59,   64,   69,
  73,   78,   88,   97,   106,  115,  124,  133,  142,  151,  167,  184,  200,
  215,  231,  245,  260,  274,  301,  327,  351,  375,  397,  418,  439,  458,
  495,  528,  559,  587,  613,  637,  659,  680,  717,  749,  777,  801,  823,
  842,  859,  874,  899,  919,  936,  949,  960,  969,  977,  983,  994,  1001,
  1006, 1010, 1013, 1015, 1017, 1018, 1020, 1022, 1022, 1023, 1023, 1023, 1024,
};
|
||||
// Sample points (x^2 in Q10) at which the rate and distortion tables are
// tabulated. The interpolation code subtracts xsq_iq_q10[xq] from the input
// x^2 to obtain the fractional position between entries xq and xq + 1.
static const int xsq_iq_q10[] = {
  0,      4,      8,      12,     16,     20,     24,     28,     32,
  40,     48,     56,     64,     72,     80,     88,     96,     112,
  128,    144,    160,    176,    192,    208,    224,    256,    288,
  320,    352,    384,    416,    448,    480,    544,    608,    672,
  736,    800,    864,    928,    992,    1120,   1248,   1376,   1504,
  1632,   1760,   1888,   2016,   2272,   2528,   2784,   3040,   3296,
  3552,   3808,   4064,   4576,   5088,   5600,   6112,   6624,   7136,
  7648,   8160,   9184,   10208,  11232,  12256,  13280,  14304,  15328,
  16352,  18400,  20448,  22496,  24544,  26592,  28640,  30688,  32736,
  36832,  40928,  45024,  49120,  53216,  57312,  61408,  65504,  73696,
  81888,  90080,  98272,  106464, 114656, 122848, 131040, 147424, 163808,
  180192, 196576, 212960, 229344, 245728,
};
|
||||
|
||||
static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) {
|
||||
// NOTE: The tables below must be of the same size.
|
||||
|
||||
// The functions described below are sampled at the four most significant
|
||||
// bits of x^2 + 8 / 256.
|
||||
|
||||
// Normalized rate:
|
||||
// This table models the rate for a Laplacian source with given variance
|
||||
// when quantized with a uniform quantizer with given stepsize. The
|
||||
// closed form expression is:
|
||||
// Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)],
|
||||
// where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance),
|
||||
// and H(x) is the binary entropy function.
|
||||
static const int rate_tab_q10[] = {
|
||||
65536, 6086, 5574, 5275, 5063, 4899, 4764, 4651, 4553, 4389, 4255, 4142,
|
||||
4044, 3958, 3881, 3811, 3748, 3635, 3538, 3453, 3376, 3307, 3244, 3186,
|
||||
3133, 3037, 2952, 2877, 2809, 2747, 2690, 2638, 2589, 2501, 2423, 2353,
|
||||
2290, 2232, 2179, 2130, 2084, 2001, 1928, 1862, 1802, 1748, 1698, 1651,
|
||||
1608, 1530, 1460, 1398, 1342, 1290, 1243, 1199, 1159, 1086, 1021, 963,
|
||||
911, 864, 821, 781, 745, 680, 623, 574, 530, 490, 455, 424,
|
||||
395, 345, 304, 269, 239, 213, 190, 171, 154, 126, 104, 87,
|
||||
73, 61, 52, 44, 38, 28, 21, 16, 12, 10, 8, 6,
|
||||
5, 3, 2, 1, 1, 1, 0, 0,
|
||||
};
|
||||
|
||||
// Normalized distortion:
|
||||
// This table models the normalized distortion for a Laplacian source
|
||||
// with given variance when quantized with a uniform quantizer
|
||||
// with given stepsize. The closed form expression is:
|
||||
// Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2))
|
||||
// where x = qpstep / sqrt(variance).
|
||||
// Note the actual distortion is Dn * variance.
|
||||
static const int dist_tab_q10[] = {
|
||||
0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5,
|
||||
5, 6, 7, 7, 8, 9, 11, 12, 13, 15, 16, 17,
|
||||
18, 21, 24, 26, 29, 31, 34, 36, 39, 44, 49, 54,
|
||||
59, 64, 69, 73, 78, 88, 97, 106, 115, 124, 133, 142,
|
||||
151, 167, 184, 200, 215, 231, 245, 260, 274, 301, 327, 351,
|
||||
375, 397, 418, 439, 458, 495, 528, 559, 587, 613, 637, 659,
|
||||
680, 717, 749, 777, 801, 823, 842, 859, 874, 899, 919, 936,
|
||||
949, 960, 969, 977, 983, 994, 1001, 1006, 1010, 1013, 1015, 1017,
|
||||
1018, 1020, 1022, 1022, 1023, 1023, 1023, 1024,
|
||||
};
|
||||
static const int xsq_iq_q10[] = {
|
||||
0, 4, 8, 12, 16, 20, 24, 28, 32,
|
||||
40, 48, 56, 64, 72, 80, 88, 96, 112,
|
||||
128, 144, 160, 176, 192, 208, 224, 256, 288,
|
||||
320, 352, 384, 416, 448, 480, 544, 608, 672,
|
||||
736, 800, 864, 928, 992, 1120, 1248, 1376, 1504,
|
||||
1632, 1760, 1888, 2016, 2272, 2528, 2784, 3040, 3296,
|
||||
3552, 3808, 4064, 4576, 5088, 5600, 6112, 6624, 7136,
|
||||
7648, 8160, 9184, 10208, 11232, 12256, 13280, 14304, 15328,
|
||||
16352, 18400, 20448, 22496, 24544, 26592, 28640, 30688, 32736,
|
||||
36832, 40928, 45024, 49120, 53216, 57312, 61408, 65504, 73696,
|
||||
81888, 90080, 98272, 106464, 114656, 122848, 131040, 147424, 163808,
|
||||
180192, 196576, 212960, 229344, 245728,
|
||||
};
|
||||
const int tmp = (xsq_q10 >> 2) + 8;
|
||||
const int k = get_msb(tmp) - 3;
|
||||
const int xq = (k << 3) + ((tmp >> k) & 0x7);
|
||||
@ -379,6 +378,24 @@ static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) {
|
||||
*d_q10 = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10;
|
||||
}
|
||||
|
||||
static void model_rd_norm_vec(int xsq_q10[MAX_MB_PLANE],
|
||||
int r_q10[MAX_MB_PLANE],
|
||||
int d_q10[MAX_MB_PLANE]) {
|
||||
int i;
|
||||
const int one_q10 = 1 << 10;
|
||||
for (i = 0; i < MAX_MB_PLANE; ++i) {
|
||||
const int tmp = (xsq_q10[i] >> 2) + 8;
|
||||
const int k = get_msb(tmp) - 3;
|
||||
const int xq = (k << 3) + ((tmp >> k) & 0x7);
|
||||
const int a_q10 = ((xsq_q10[i] - xsq_iq_q10[xq]) << 10) >> (2 + k);
|
||||
const int b_q10 = one_q10 - a_q10;
|
||||
r_q10[i] = (rate_tab_q10[xq] * b_q10 + rate_tab_q10[xq + 1] * a_q10) >> 10;
|
||||
d_q10[i] = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10;
|
||||
}
|
||||
}
|
||||
|
||||
static const uint32_t MAX_XSQ_Q10 = 245727;
|
||||
|
||||
void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2,
|
||||
unsigned int qstep, int *rate,
|
||||
int64_t *dist) {
|
||||
@ -393,7 +410,6 @@ void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2,
|
||||
*dist = 0;
|
||||
} else {
|
||||
int d_q10, r_q10;
|
||||
static const uint32_t MAX_XSQ_Q10 = 245727;
|
||||
const uint64_t xsq_q10_64 =
|
||||
(((uint64_t)qstep * qstep << (n_log2 + 10)) + (var >> 1)) / var;
|
||||
const int xsq_q10 = (int)VPXMIN(xsq_q10_64, MAX_XSQ_Q10);
|
||||
@ -403,6 +419,30 @@ void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2,
|
||||
}
|
||||
}
|
||||
|
||||
// Implements a fixed length vector form of vp9_model_rd_from_var_lapndz where
|
||||
// vectors are of length MAX_MB_PLANE and all elements of var are non-zero.
|
||||
void vp9_model_rd_from_var_lapndz_vec(unsigned int var[MAX_MB_PLANE],
|
||||
unsigned int n_log2[MAX_MB_PLANE],
|
||||
unsigned int qstep[MAX_MB_PLANE],
|
||||
int64_t *rate_sum, int64_t *dist_sum) {
|
||||
int i;
|
||||
int xsq_q10[MAX_MB_PLANE], d_q10[MAX_MB_PLANE], r_q10[MAX_MB_PLANE];
|
||||
for (i = 0; i < MAX_MB_PLANE; ++i) {
|
||||
const uint64_t xsq_q10_64 =
|
||||
(((uint64_t)qstep[i] * qstep[i] << (n_log2[i] + 10)) + (var[i] >> 1)) /
|
||||
var[i];
|
||||
xsq_q10[i] = (int)VPXMIN(xsq_q10_64, MAX_XSQ_Q10);
|
||||
}
|
||||
model_rd_norm_vec(xsq_q10, r_q10, d_q10);
|
||||
for (i = 0; i < MAX_MB_PLANE; ++i) {
|
||||
int rate =
|
||||
ROUND_POWER_OF_TWO(r_q10[i] << n_log2[i], 10 - VP9_PROB_COST_SHIFT);
|
||||
int64_t dist = (var[i] * (int64_t)d_q10[i] + 512) >> 10;
|
||||
*rate_sum += rate;
|
||||
*dist_sum += dist;
|
||||
}
|
||||
}
|
||||
|
||||
void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size,
|
||||
const struct macroblockd_plane *pd,
|
||||
ENTROPY_CONTEXT t_above[16],
|
||||
|
@ -140,6 +140,11 @@ void vp9_initialize_me_consts(struct VP9_COMP *cpi, MACROBLOCK *x, int qindex);
|
||||
void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n,
|
||||
unsigned int qstep, int *rate, int64_t *dist);
|
||||
|
||||
void vp9_model_rd_from_var_lapndz_vec(unsigned int var[MAX_MB_PLANE],
|
||||
unsigned int n_log2[MAX_MB_PLANE],
|
||||
unsigned int qstep[MAX_MB_PLANE],
|
||||
int64_t *rate_sum, int64_t *dist_sum);
|
||||
|
||||
int vp9_get_switchable_rate(const struct VP9_COMP *cpi,
|
||||
const MACROBLOCKD *const xd);
|
||||
|
||||
|
@ -164,17 +164,19 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
|
||||
const int ref = xd->mi[0]->ref_frame[0];
|
||||
unsigned int sse;
|
||||
unsigned int var = 0;
|
||||
unsigned int sum_sse = 0;
|
||||
int64_t total_sse = 0;
|
||||
int skip_flag = 1;
|
||||
const int shift = 6;
|
||||
int rate;
|
||||
int64_t dist;
|
||||
const int dequant_shift =
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
(xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 :
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
3;
|
||||
unsigned int qstep_vec[MAX_MB_PLANE];
|
||||
unsigned int nlog2_vec[MAX_MB_PLANE];
|
||||
unsigned int sum_sse_vec[MAX_MB_PLANE];
|
||||
int any_zero_sum_sse = 0;
|
||||
|
||||
x->pred_sse[ref] = 0;
|
||||
|
||||
@ -186,6 +188,7 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
|
||||
const BLOCK_SIZE unit_size = txsize_to_bsize[max_tx_size];
|
||||
const int64_t dc_thr = p->quant_thred[0] >> shift;
|
||||
const int64_t ac_thr = p->quant_thred[1] >> shift;
|
||||
unsigned int sum_sse = 0;
|
||||
// The low thresholds are used to measure if the prediction errors are
|
||||
// low enough so that we can skip the mode search.
|
||||
const int64_t low_dc_thr = VPXMIN(50, dc_thr >> 2);
|
||||
@ -196,8 +199,6 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
|
||||
int lw = b_width_log2_lookup[unit_size] + 2;
|
||||
int lh = b_height_log2_lookup[unit_size] + 2;
|
||||
|
||||
sum_sse = 0;
|
||||
|
||||
for (idy = 0; idy < bh; ++idy) {
|
||||
for (idx = 0; idx < bw; ++idx) {
|
||||
uint8_t *src = p->src.buf + (idy * p->src.stride << lh) + (idx << lw);
|
||||
@ -233,12 +234,18 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
|
||||
}
|
||||
|
||||
total_sse += sum_sse;
|
||||
sum_sse_vec[i] = sum_sse;
|
||||
any_zero_sum_sse = any_zero_sum_sse || (sum_sse == 0);
|
||||
qstep_vec[i] = pd->dequant[1] >> dequant_shift;
|
||||
nlog2_vec[i] = num_pels_log2_lookup[bs];
|
||||
}
|
||||
|
||||
// Fast approximate the modelling function.
|
||||
if (cpi->sf.simple_model_rd_from_var) {
|
||||
// Fast approximate the modelling function.
|
||||
if (cpi->sf.simple_model_rd_from_var) {
|
||||
for (i = 0; i < MAX_MB_PLANE; ++i) {
|
||||
int64_t rate;
|
||||
const int64_t square_error = sum_sse;
|
||||
int quantizer = (pd->dequant[1] >> dequant_shift);
|
||||
const int64_t square_error = sum_sse_vec[i];
|
||||
int quantizer = qstep_vec[i];
|
||||
|
||||
if (quantizer < 120)
|
||||
rate = (square_error * (280 - quantizer)) >> (16 - VP9_PROB_COST_SHIFT);
|
||||
@ -247,12 +254,19 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
|
||||
dist = (square_error * quantizer) >> 8;
|
||||
rate_sum += rate;
|
||||
dist_sum += dist;
|
||||
}
|
||||
} else {
|
||||
if (any_zero_sum_sse) {
|
||||
for (i = 0; i < MAX_MB_PLANE; ++i) {
|
||||
int rate;
|
||||
vp9_model_rd_from_var_lapndz(sum_sse_vec[i], nlog2_vec[i], qstep_vec[i],
|
||||
&rate, &dist);
|
||||
rate_sum += rate;
|
||||
dist_sum += dist;
|
||||
}
|
||||
} else {
|
||||
vp9_model_rd_from_var_lapndz(sum_sse, num_pels_log2_lookup[bs],
|
||||
pd->dequant[1] >> dequant_shift, &rate,
|
||||
&dist);
|
||||
rate_sum += rate;
|
||||
dist_sum += dist;
|
||||
vp9_model_rd_from_var_lapndz_vec(sum_sse_vec, nlog2_vec, qstep_vec,
|
||||
&rate_sum, &dist_sum);
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user