Add a vector form of routine vp9_model_rd_from_var_lapndz

Add routine vp9_model_rd_from_var_lapndz_vec and call it from model_rd_for_sb to model the rate and distortion for MAX_MB_PLANE Laplacian sources in parallel. The caller ensures that all sources have non-zero variance. Measured an 18% to 25% reduction in retired instructions, and a 17% to 24% reduction in instruction execution cost with different compilers for the Laplacian modeling. No change in behavior. TEST=Verified that encoded files match bit for bit, with and without this change. BUG=b/33678225 Change-Id: I6b76947f21c659a349adb896e13e99f6e3f951e6
2016-12-14 12:07:34 -08:00 · 2016-12-14 12:07:34 -08:00 · 976ddb61d3
commit 976ddb61d3
parent 83ba1880bf
3 changed files with 129 additions and 70 deletions
--- a/vp9/encoder/vp9_rd.c
+++ b/vp9/encoder/vp9_rd.c
@ -312,63 +312,62 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi) {
  }
 }

+// NOTE: The tables below must be of the same size.
+
+// The functions described below are sampled at the four most significant
+// bits of x^2 + 8 / 256.
+
+// Normalized rate:
+// This table models the rate for a Laplacian source with given variance
+// when quantized with a uniform quantizer with given stepsize. The
+// closed form expression is:
+// Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)],
+// where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance),
+// and H(x) is the binary entropy function.
+static const int rate_tab_q10[] = {  // Q10 (scaled by 1 << 10); read at xq and xq + 1
+  65536, 6086, 5574, 5275, 5063, 4899, 4764, 4651, 4553, 4389, 4255, 4142, 4044,
+  3958,  3881, 3811, 3748, 3635, 3538, 3453, 3376, 3307, 3244, 3186, 3133, 3037,
+  2952,  2877, 2809, 2747, 2690, 2638, 2589, 2501, 2423, 2353, 2290, 2232, 2179,
+  2130,  2084, 2001, 1928, 1862, 1802, 1748, 1698, 1651, 1608, 1530, 1460, 1398,
+  1342,  1290, 1243, 1199, 1159, 1086, 1021, 963,  911,  864,  821,  781,  745,
+  680,   623,  574,  530,  490,  455,  424,  395,  345,  304,  269,  239,  213,
+  190,   171,  154,  126,  104,  87,   73,   61,   52,   44,   38,   28,   21,
+  16,    12,   10,   8,    6,    5,    3,    2,    1,    1,    1,    0,    0,
+};
+
+// Normalized distortion:
+// This table models the normalized distortion for a Laplacian source
+// with given variance when quantized with a uniform quantizer
+// with given stepsize. The closed form expression is:
+// Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2))
+// where x = qpstep / sqrt(variance).
+// Note the actual distortion is Dn * variance.
+static const int dist_tab_q10[] = {  // Q10 (1024 == 1.0); read at xq and xq + 1
+  0,    0,    1,    1,    1,    2,    2,    2,    3,    3,    4,    5,    5,
+  6,    7,    7,    8,    9,    11,   12,   13,   15,   16,   17,   18,   21,
+  24,   26,   29,   31,   34,   36,   39,   44,   49,   54,   59,   64,   69,
+  73,   78,   88,   97,   106,  115,  124,  133,  142,  151,  167,  184,  200,
+  215,  231,  245,  260,  274,  301,  327,  351,  375,  397,  418,  439,  458,
+  495,  528,  559,  587,  613,  637,  659,  680,  717,  749,  777,  801,  823,
+  842,  859,  874,  899,  919,  936,  949,  960,  969,  977,  983,  994,  1001,
+  1006, 1010, 1013, 1015, 1017, 1018, 1020, 1022, 1022, 1023, 1023, 1023, 1024,
+};
+static const int xsq_iq_q10[] = {  // x^2 (Q10) at which table entry xq is sampled; base for interpolation
+  0,      4,      8,      12,     16,     20,     24,     28,     32,
+  40,     48,     56,     64,     72,     80,     88,     96,     112,
+  128,    144,    160,    176,    192,    208,    224,    256,    288,
+  320,    352,    384,    416,    448,    480,    544,    608,    672,
+  736,    800,    864,    928,    992,    1120,   1248,   1376,   1504,
+  1632,   1760,   1888,   2016,   2272,   2528,   2784,   3040,   3296,
+  3552,   3808,   4064,   4576,   5088,   5600,   6112,   6624,   7136,
+  7648,   8160,   9184,   10208,  11232,  12256,  13280,  14304,  15328,
+  16352,  18400,  20448,  22496,  24544,  26592,  28640,  30688,  32736,
+  36832,  40928,  45024,  49120,  53216,  57312,  61408,  65504,  73696,
+  81888,  90080,  98272,  106464, 114656, 122848, 131040, 147424, 163808,
+  180192, 196576, 212960, 229344, 245728,
+};
+
 static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) {
-  // NOTE: The tables below must be of the same size.
-
-  // The functions described below are sampled at the four most significant
-  // bits of x^2 + 8 / 256.
-
-  // Normalized rate:
-  // This table models the rate for a Laplacian source with given variance
-  // when quantized with a uniform quantizer with given stepsize. The
-  // closed form expression is:
-  // Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)],
-  // where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance),
-  // and H(x) is the binary entropy function.
-  static const int rate_tab_q10[] = {
-    65536, 6086, 5574, 5275, 5063, 4899, 4764, 4651, 4553, 4389, 4255, 4142,
-    4044,  3958, 3881, 3811, 3748, 3635, 3538, 3453, 3376, 3307, 3244, 3186,
-    3133,  3037, 2952, 2877, 2809, 2747, 2690, 2638, 2589, 2501, 2423, 2353,
-    2290,  2232, 2179, 2130, 2084, 2001, 1928, 1862, 1802, 1748, 1698, 1651,
-    1608,  1530, 1460, 1398, 1342, 1290, 1243, 1199, 1159, 1086, 1021, 963,
-    911,   864,  821,  781,  745,  680,  623,  574,  530,  490,  455,  424,
-    395,   345,  304,  269,  239,  213,  190,  171,  154,  126,  104,  87,
-    73,    61,   52,   44,   38,   28,   21,   16,   12,   10,   8,    6,
-    5,     3,    2,    1,    1,    1,    0,    0,
-  };
-
-  // Normalized distortion:
-  // This table models the normalized distortion for a Laplacian source
-  // with given variance when quantized with a uniform quantizer
-  // with given stepsize. The closed form expression is:
-  // Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2))
-  // where x = qpstep / sqrt(variance).
-  // Note the actual distortion is Dn * variance.
-  static const int dist_tab_q10[] = {
-    0,    0,    1,    1,    1,    2,    2,    2,    3,    3,    4,    5,
-    5,    6,    7,    7,    8,    9,    11,   12,   13,   15,   16,   17,
-    18,   21,   24,   26,   29,   31,   34,   36,   39,   44,   49,   54,
-    59,   64,   69,   73,   78,   88,   97,   106,  115,  124,  133,  142,
-    151,  167,  184,  200,  215,  231,  245,  260,  274,  301,  327,  351,
-    375,  397,  418,  439,  458,  495,  528,  559,  587,  613,  637,  659,
-    680,  717,  749,  777,  801,  823,  842,  859,  874,  899,  919,  936,
-    949,  960,  969,  977,  983,  994,  1001, 1006, 1010, 1013, 1015, 1017,
-    1018, 1020, 1022, 1022, 1023, 1023, 1023, 1024,
-  };
-  static const int xsq_iq_q10[] = {
-    0,      4,      8,      12,     16,     20,     24,     28,     32,
-    40,     48,     56,     64,     72,     80,     88,     96,     112,
-    128,    144,    160,    176,    192,    208,    224,    256,    288,
-    320,    352,    384,    416,    448,    480,    544,    608,    672,
-    736,    800,    864,    928,    992,    1120,   1248,   1376,   1504,
-    1632,   1760,   1888,   2016,   2272,   2528,   2784,   3040,   3296,
-    3552,   3808,   4064,   4576,   5088,   5600,   6112,   6624,   7136,
-    7648,   8160,   9184,   10208,  11232,  12256,  13280,  14304,  15328,
-    16352,  18400,  20448,  22496,  24544,  26592,  28640,  30688,  32736,
-    36832,  40928,  45024,  49120,  53216,  57312,  61408,  65504,  73696,
-    81888,  90080,  98272,  106464, 114656, 122848, 131040, 147424, 163808,
-    180192, 196576, 212960, 229344, 245728,
-  };
  const int tmp = (xsq_q10 >> 2) + 8;
  const int k = get_msb(tmp) - 3;
  const int xq = (k << 3) + ((tmp >> k) & 0x7);
@ -379,6 +378,24 @@ static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) {
  *d_q10 = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10;
 }

+static void model_rd_norm_vec(int xsq_q10[MAX_MB_PLANE],  // in: x^2 in Q10, one per plane
+                              int r_q10[MAX_MB_PLANE],  // out: normalized rate, Q10
+                              int d_q10[MAX_MB_PLANE]) {  // out: normalized distortion, Q10
+  int i;
+  const int one_q10 = 1 << 10;  // 1.0 in Q10
+  for (i = 0; i < MAX_MB_PLANE; ++i) {  // same table interpolation as model_rd_norm, per element
+    const int tmp = (xsq_q10[i] >> 2) + 8;
+    const int k = get_msb(tmp) - 3;  // presumably get_msb is the index of the top set bit — confirm
+    const int xq = (k << 3) + ((tmp >> k) & 0x7);  // table index: 8 entries per value of k
+    const int a_q10 = ((xsq_q10[i] - xsq_iq_q10[xq]) << 10) >> (2 + k);  // Q10 weight of entry xq + 1
+    const int b_q10 = one_q10 - a_q10;  // Q10 weight of entry xq
+    r_q10[i] = (rate_tab_q10[xq] * b_q10 + rate_tab_q10[xq + 1] * a_q10) >> 10;  // linear interpolation
+    d_q10[i] = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10;  // linear interpolation
+  }
+}
+
+static const uint32_t MAX_XSQ_Q10 = 245727;  // last xsq_iq_q10 entry minus 1; keeps xq + 1 inside the tables
+
 void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2,
                                  unsigned int qstep, int *rate,
                                  int64_t *dist) {
@ -393,7 +410,6 @@ void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2,
    *dist = 0;
  } else {
    int d_q10, r_q10;
-    static const uint32_t MAX_XSQ_Q10 = 245727;
    const uint64_t xsq_q10_64 =
        (((uint64_t)qstep * qstep << (n_log2 + 10)) + (var >> 1)) / var;
    const int xsq_q10 = (int)VPXMIN(xsq_q10_64, MAX_XSQ_Q10);
@ -403,6 +419,30 @@ void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2,
  }
 }

+// Implements a fixed length vector form of vp9_model_rd_from_var_lapndz where
+// vectors are of length MAX_MB_PLANE and all elements of var are non-zero.
+void vp9_model_rd_from_var_lapndz_vec(unsigned int var[MAX_MB_PLANE],  // in: every element must be non-zero
+                                      unsigned int n_log2[MAX_MB_PLANE],  // in: shift applied to qstep^2 and to the rate
+                                      unsigned int qstep[MAX_MB_PLANE],  // in: quantizer step per element
+                                      int64_t *rate_sum, int64_t *dist_sum) {  // in/out: added to, not overwritten
+  int i;
+  int xsq_q10[MAX_MB_PLANE], d_q10[MAX_MB_PLANE], r_q10[MAX_MB_PLANE];
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    const uint64_t xsq_q10_64 =
+        (((uint64_t)qstep[i] * qstep[i] << (n_log2[i] + 10)) + (var[i] >> 1)) /  // + var / 2 rounds the quotient
+        var[i];  // no zero check here: var[i] == 0 would divide by zero
+    xsq_q10[i] = (int)VPXMIN(xsq_q10_64, MAX_XSQ_Q10);  // clamp to the range covered by the lookup tables
+  }
+  model_rd_norm_vec(xsq_q10, r_q10, d_q10);
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    int rate =
+        ROUND_POWER_OF_TWO(r_q10[i] << n_log2[i], 10 - VP9_PROB_COST_SHIFT);  // scale by 2^n_log2, drop Q10 down to the cost scale
+    int64_t dist = (var[i] * (int64_t)d_q10[i] + 512) >> 10;  // var * Dn, rounded out of Q10
+    *rate_sum += rate;
+    *dist_sum += dist;
+  }
+}
+
 void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size,
                              const struct macroblockd_plane *pd,
                              ENTROPY_CONTEXT t_above[16],
--- a/vp9/encoder/vp9_rd.h
+++ b/vp9/encoder/vp9_rd.h
@ -140,6 +140,11 @@ void vp9_initialize_me_consts(struct VP9_COMP *cpi, MACROBLOCK *x, int qindex);
 void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n,
                                  unsigned int qstep, int *rate, int64_t *dist);

+void vp9_model_rd_from_var_lapndz_vec(unsigned int var[MAX_MB_PLANE],  // every element must be non-zero
+                                      unsigned int n_log2[MAX_MB_PLANE],
+                                      unsigned int qstep[MAX_MB_PLANE],
+                                      int64_t *rate_sum, int64_t *dist_sum);  // both are added to, not overwritten
+
 int vp9_get_switchable_rate(const struct VP9_COMP *cpi,
                            const MACROBLOCKD *const xd);

--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@ -164,17 +164,19 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
  const int ref = xd->mi[0]->ref_frame[0];
  unsigned int sse;
  unsigned int var = 0;
-  unsigned int sum_sse = 0;
  int64_t total_sse = 0;
  int skip_flag = 1;
  const int shift = 6;
-  int rate;
  int64_t dist;
  const int dequant_shift =
 #if CONFIG_VP9_HIGHBITDEPTH
      (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 :
 #endif  // CONFIG_VP9_HIGHBITDEPTH
                                                    3;
+  unsigned int qstep_vec[MAX_MB_PLANE];
+  unsigned int nlog2_vec[MAX_MB_PLANE];
+  unsigned int sum_sse_vec[MAX_MB_PLANE];
+  int any_zero_sum_sse = 0;

  x->pred_sse[ref] = 0;

@ -186,6 +188,7 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
    const BLOCK_SIZE unit_size = txsize_to_bsize[max_tx_size];
    const int64_t dc_thr = p->quant_thred[0] >> shift;
    const int64_t ac_thr = p->quant_thred[1] >> shift;
+    unsigned int sum_sse = 0;
    // The low thresholds are used to measure if the prediction errors are
    // low enough so that we can skip the mode search.
    const int64_t low_dc_thr = VPXMIN(50, dc_thr >> 2);
@ -196,8 +199,6 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
    int lw = b_width_log2_lookup[unit_size] + 2;
    int lh = b_height_log2_lookup[unit_size] + 2;

-    sum_sse = 0;
-
    for (idy = 0; idy < bh; ++idy) {
      for (idx = 0; idx < bw; ++idx) {
        uint8_t *src = p->src.buf + (idy * p->src.stride << lh) + (idx << lw);
@ -233,12 +234,18 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
    }

    total_sse += sum_sse;
+    sum_sse_vec[i] = sum_sse;
+    any_zero_sum_sse = any_zero_sum_sse || (sum_sse == 0);
+    qstep_vec[i] = pd->dequant[1] >> dequant_shift;
+    nlog2_vec[i] = num_pels_log2_lookup[bs];
+  }

-    // Fast approximate the modelling function.
-    if (cpi->sf.simple_model_rd_from_var) {
+  // Fast approximate the modelling function.
+  if (cpi->sf.simple_model_rd_from_var) {
+    for (i = 0; i < MAX_MB_PLANE; ++i) {
      int64_t rate;
-      const int64_t square_error = sum_sse;
-      int quantizer = (pd->dequant[1] >> dequant_shift);
+      const int64_t square_error = sum_sse_vec[i];
+      int quantizer = qstep_vec[i];

      if (quantizer < 120)
        rate = (square_error * (280 - quantizer)) >> (16 - VP9_PROB_COST_SHIFT);
@ -247,12 +254,19 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
      dist = (square_error * quantizer) >> 8;
      rate_sum += rate;
      dist_sum += dist;
+    }
+  } else {
+    if (any_zero_sum_sse) {
+      for (i = 0; i < MAX_MB_PLANE; ++i) {
+        int rate;
+        vp9_model_rd_from_var_lapndz(sum_sse_vec[i], nlog2_vec[i], qstep_vec[i],
+                                     &rate, &dist);
+        rate_sum += rate;
+        dist_sum += dist;
+      }
    } else {
-      vp9_model_rd_from_var_lapndz(sum_sse, num_pels_log2_lookup[bs],
-                                   pd->dequant[1] >> dequant_shift, &rate,
-                                   &dist);
-      rate_sum += rate;
-      dist_sum += dist;
+      vp9_model_rd_from_var_lapndz_vec(sum_sse_vec, nlog2_vec, qstep_vec,
+                                       &rate_sum, &dist_sum);
    }
  }