Expand unconstrained nodes in pack_mb_tokens and loop on zeros.

Reduces the Linux perf estimated cycle count for pack_mb_tokens on a
lossless encode on my desktop from 61858501855 to 48154040219, i.e.
from 26% of the overall profile to 21%.

Change-Id: I9ca3426d7e3272bc7f7030abda4f0d0cec87fb4a
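
For context on what the change does: instead of walking the full
coefficient probability tree once per token, the rewritten loop writes
the first (unconstrained) tree decisions as explicit vpx_write() calls
and consumes runs of ZERO_TOKENs in a tight inner loop. A minimal
standalone sketch of that control flow follows; write_bit, tok_t, and
the token values are illustrative stand-ins, not libvpx definitions.

#include <stdio.h>

/* Illustrative stand-ins: the real encoder writes arithmetic-coded
 * bits with vpx_write() using per-context probabilities derived from
 * vp9_coef_tree. */
enum { EOB_TOK, ZERO_TOK, ONE_TOK, BIG_TOK };
typedef struct { int token; } tok_t;

static void write_bit(int bit) { putchar(bit ? '1' : '0'); }

/* Expanded unconstrained nodes: each of the first tree decisions is
 * an explicit branch, and runs of zeros are consumed in one
 * predictable loop instead of re-entering a generic tree walk for
 * every token. */
static void pack_tokens(const tok_t *p, const tok_t *stop) {
  while (p < stop) {
    if (p->token == EOB_TOK) {      /* node 0: end-of-block? */
      write_bit(0);
      ++p;
      continue;
    }
    write_bit(1);
    while (p->token == ZERO_TOK) {  /* node 1: run of zero coeffs */
      write_bit(0);
      if (++p == stop) return;
    }
    write_bit(1);                   /* nonzero; in VP9 an EOB never
                                       follows a zero run */
    if (p->token == ONE_TOK) {      /* node 2: magnitude one? */
      write_bit(0);
    } else {
      write_bit(1);                 /* larger magnitudes: the real code
                                       finishes with a second,
                                       constrained tree write */
    }
    ++p;
  }
}

int main(void) {
  const tok_t toks[] = { { BIG_TOK }, { ZERO_TOK }, { ZERO_TOK },
                         { ONE_TOK }, { EOB_TOK } };
  pack_tokens(toks, toks + sizeof(toks) / sizeof(toks[0]));
  putchar('\n');
  return 0;
}

The win measured above comes from replacing a table-driven tree walk
with straight-line branches the CPU can predict; the real function
also writes the constrained nodes (vp9_coef_con_tree) and the extra
magnitude bits, which the sketch omits.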
Alex Converse
2015-11-02 16:28:10 -08:00
parent 57cae22c1e
commit 70eb870cfe


@@ -123,72 +123,66 @@ static void update_switchable_interp_probs(VP9_COMMON *cm, vpx_writer *w,
 static void pack_mb_tokens(vpx_writer *w,
                            TOKENEXTRA **tp, const TOKENEXTRA *const stop,
                            vpx_bit_depth_t bit_depth) {
-  TOKENEXTRA *p = *tp;
-
-  while (p < stop && p->token != EOSB_TOKEN) {
-    const int t = p->token;
-    const struct vp9_token *const a = &vp9_coef_encodings[t];
-    int i = 0;
-    int v = a->value;
-    int n = a->len;
+  const TOKENEXTRA *p;
+  const vp9_extra_bit *const extra_bits =
 #if CONFIG_VP9_HIGHBITDEPTH
-    const vp9_extra_bit *b;
-    if (bit_depth == VPX_BITS_12)
-      b = &vp9_extra_bits_high12[t];
-    else if (bit_depth == VPX_BITS_10)
-      b = &vp9_extra_bits_high10[t];
-    else
-      b = &vp9_extra_bits[t];
+      (bit_depth == VPX_BITS_12) ? vp9_extra_bits_high12 :
+      (bit_depth == VPX_BITS_10) ? vp9_extra_bits_high10 :
+      vp9_extra_bits;
 #else
-    const vp9_extra_bit *const b = &vp9_extra_bits[t];
-    (void) bit_depth;
+      vp9_extra_bits;
+  (void) bit_depth;
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-    /* skip one or two nodes */
-    if (p->skip_eob_node) {
-      n -= p->skip_eob_node;
-      i = 2 * p->skip_eob_node;
-    }
-
-    // TODO(jbb): expanding this can lead to big gains.  It allows
-    // much better branch prediction and would enable us to avoid numerous
-    // lookups and compares.
-
-    // If we have a token that's in the constrained set, the coefficient tree
-    // is split into two treed writes.  The first treed write takes care of the
-    // unconstrained nodes.  The second treed write takes care of the
-    // constrained nodes.
-    if (t >= TWO_TOKEN && t < EOB_TOKEN) {
-      int len = UNCONSTRAINED_NODES - p->skip_eob_node;
-      int bits = v >> (n - len);
-      vp9_write_tree(w, vp9_coef_tree, p->context_tree, bits, len, i);
-      vp9_write_tree(w, vp9_coef_con_tree,
-                     vp9_pareto8_full[p->context_tree[PIVOT_NODE] - 1],
-                     v, n - len, 0);
-    } else {
-      vp9_write_tree(w, vp9_coef_tree, p->context_tree, v, n, i);
-    }
-
-    if (b->base_val) {
-      const int e = p->extra, l = b->len;
-
-      if (l) {
-        const unsigned char *pb = b->prob;
-        int v = e >> 1;
-        int n = l;              /* number of bits in v, assumed nonzero */
-
-        do {
-          const int bb = (v >> --n) & 1;
-          vpx_write(w, bb, *pb++);
-        } while (n);
-      }
-
-      vpx_write_bit(w, e & 1);
-    }
-    ++p;
-  }
-
-  *tp = p + (p->token == EOSB_TOKEN);
+  for (p = *tp; p < stop && p->token != EOSB_TOKEN; ++p) {
+    if (p->token == EOB_TOKEN) {
+      vpx_write(w, 0, p->context_tree[0]);
+      continue;
+    }
+    vpx_write(w, 1, p->context_tree[0]);
+    while (p->token == ZERO_TOKEN) {
+      vpx_write(w, 0, p->context_tree[1]);
+      ++p;
+      if (p == stop || p->token == EOSB_TOKEN) {
+        *tp = (TOKENEXTRA*)(uintptr_t)p + (p->token == EOSB_TOKEN);
+        return;
+      }
+    }
+
+    {
+      const int t = p->token;
+      const vpx_prob *const context_tree = p->context_tree;
+      assert(t != ZERO_TOKEN);
+      assert(t != EOB_TOKEN);
+      assert(t != EOSB_TOKEN);
+      vpx_write(w, 1, context_tree[1]);
+      if (t == ONE_TOKEN) {
+        vpx_write(w, 0, context_tree[2]);
+        vpx_write_bit(w, p->extra & 1);
+      } else {  // t >= TWO_TOKEN && t < EOB_TOKEN
+        const struct vp9_token *const a = &vp9_coef_encodings[t];
+        const int v = a->value;
+        const int n = a->len;
+        const int e = p->extra;
+        vpx_write(w, 1, context_tree[2]);
+        vp9_write_tree(w, vp9_coef_con_tree,
+                       vp9_pareto8_full[context_tree[PIVOT_NODE] - 1], v,
+                       n - UNCONSTRAINED_NODES, 0);
+        if (t >= CATEGORY1_TOKEN) {
+          const vp9_extra_bit *const b = &extra_bits[t];
+          const unsigned char *pb = b->prob;
+          int v = e >> 1;
+          int n = b->len;  // number of bits in v, assumed nonzero
+          do {
+            const int bb = (v >> --n) & 1;
+            vpx_write(w, bb, *pb++);
+          } while (n);
+        }
+        vpx_write_bit(w, e & 1);
+      }
+    }
+  }
+  *tp = (TOKENEXTRA*)(uintptr_t)p + (p->token == EOSB_TOKEN);
 }
 
 static void write_segment_id(vpx_writer *w, const struct segmentation *seg,