Use variance metric for integral projection vector match

This commit replaces SAD with variance as the metric for the
integral projection vector match. Because the variance of the
difference vector discounts its mean, the match is insensitive to a
uniform offset, which improves search accuracy in the presence of
slight lighting changes. Average speed -6 compression performance
on the RTC set improves by 1.7%. No speed change is observed on the
test clips.

Change-Id: I71c1d27e42de2aa429fb3564e6549bba1c7d6d4d
Author: Jingning Han
Date:   2015-02-27 13:35:22 -08:00
Parent: f4e0eb17e8
Commit: 1790d45252

5 changed files with 53 additions and 45 deletions
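To make the motivation concrete, here is a small standalone sketch (mine, not part of the patch) of both metrics under a pure brightness offset; vector_sad and vector_var are local stand-ins for the removed vp9_vector_sad_c and the new vp9_vector_var_c shown in the diff below:

  #include <stdint.h>
  #include <stdio.h>
  #include <stdlib.h>

  /* Local copy of the old metric: sum of absolute differences. */
  static int vector_sad(const int16_t *ref, const int16_t *src, int width) {
    int i, sad = 0;
    for (i = 0; i < width; ++i) sad += abs(ref[i] - src[i]);
    return sad;
  }

  /* Local copy of the new metric: variance of the differences,
   * scaled by the vector length (width == 4 << bwl). */
  static int vector_var(const int16_t *ref, const int16_t *src, int bwl) {
    const int width = 4 << bwl;
    int i, mean = 0, sse = 0;
    for (i = 0; i < width; ++i) {
      const int diff = ref[i] - src[i];
      mean += diff;
      sse += diff * diff;
    }
    /* sum((d - avg)^2) == sum(d^2) - (sum d)^2 / width */
    return sse - ((mean * mean) >> (bwl + 2));
  }

  int main(void) {
    int16_t src[16], ref[16];
    int i;
    for (i = 0; i < 16; ++i) {
      src[i] = (int16_t)(i * 3 % 7);
      ref[i] = (int16_t)(src[i] + 5);  /* same projection, +5 DC offset */
    }
    /* Prints "sad = 80, var = 0": SAD penalizes the perfectly aligned
     * position, while variance still scores it as an exact match. */
    printf("sad = %d, var = %d\n",
           vector_sad(ref, src, 16), vector_var(ref, src, 2));
    return 0;
  }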

@@ -1115,8 +1115,8 @@ specialize qw/vp9_int_pro_row sse2/;
 add_proto qw/int16_t vp9_int_pro_col/, "uint8_t const *ref, const int width";
 specialize qw/vp9_int_pro_col sse2/;
-add_proto qw/int vp9_vector_sad/, "int16_t const *ref, int16_t const *src, const int width";
-specialize qw/vp9_vector_sad sse2/;
+add_proto qw/int vp9_vector_var/, "int16_t const *ref, int16_t const *src, const int bwl";
+specialize qw/vp9_vector_var sse2/;

 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/unsigned int vp9_highbd_avg_8x8/, "const uint8_t *, int p";

@@ -37,6 +37,7 @@ void vp9_int_pro_row_c(int16_t *hbuf, uint8_t const *ref,
     hbuf[idx] = 0;
     for (i = 0; i < height; ++i)
       hbuf[idx] += ref[i * ref_stride];
+    hbuf[idx] /= 32;
     ++ref;
   }
 }
@@ -46,16 +47,23 @@ int16_t vp9_int_pro_col_c(uint8_t const *ref, const int width) {
   int16_t sum = 0;
   for (idx = 0; idx < width; ++idx)
     sum += ref[idx];
-  return sum;
+  return sum / 32;
 }

-int vp9_vector_sad_c(int16_t const *ref, int16_t const *src,
-                     const int width) {
+int vp9_vector_var_c(int16_t const *ref, int16_t const *src,
+                     const int bwl) {
   int i;
-  int this_sad = 0;
-  for (i = 0; i < width; ++i)
-    this_sad += abs(ref[i] - src[i]);
-  return this_sad;
+  int width = 4 << bwl;
+  int sse = 0, mean = 0, var;
+
+  for (i = 0; i < width; ++i) {
+    int diff = ref[i] - src[i];
+    mean += diff;
+    sse += diff * diff;
+  }
+
+  var = sse - ((mean * mean) >> (bwl + 2));
+  return var;
 }

 #if CONFIG_VP9_HIGHBITDEPTH
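A note on the arithmetic in vp9_vector_var_c: since width == 4 << bwl, the >> (bwl + 2) shift is an exact division by width, so the return value is sum(d^2) - (sum d)^2 / width, i.e. the variance of the differences scaled by width. A tiny self-check of that identity (my sketch, not from the patch):

  #include <assert.h>
  #include <stdint.h>

  int main(void) {
    enum { BWL = 2, N = 4 << BWL };  /* N == 16 */
    int16_t ref[N], src[N];
    int i, mean = 0, sse = 0, centered = 0;
    for (i = 0; i < N; ++i) {
      ref[i] = (int16_t)(i * i);  /* arbitrary data; mean / N is exact here */
      src[i] = (int16_t)(3 * i);
    }
    for (i = 0; i < N; ++i) {
      const int d = ref[i] - src[i];
      mean += d;
      sse += d * d;
    }
    for (i = 0; i < N; ++i) {
      const int c = (ref[i] - src[i]) - mean / N;
      centered += c * c;
    }
    /* sse - (sum d)^2 / N == sum((d - avg)^2), i.e. N * variance. */
    assert(sse - ((mean * mean) >> (BWL + 2)) == centered);
    return 0;
  }

The /32 normalization added to the projections above appears to provide the headroom this needs: each projection entry is a sum of at most 64 8-bit pixels (at most 16320), so after /32 it is at most 510, and a sum of up to 64 such differences stays within int16_t, which matters for the 16-bit accumulation in the SSE2 version below.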

@@ -519,14 +519,14 @@ void vp9_set_vbp_thresholds(VP9_COMP *cpi, int q) {
 #endif

 #if GLOBAL_MOTION
-static int vector_match(int16_t *ref, int16_t *src, int length) {
+static int vector_match(int16_t *ref, int16_t *src, int bwl) {
   int best_sad = INT_MAX;
   int this_sad;
   int d;
   int center, offset = 0;
-  int bw = length;  // redundant variable, to be changed in the experiments.
+  int bw = 4 << bwl;  // redundant variable, to be changed in the experiments.

   for (d = 0; d <= bw; d += 16) {
-    this_sad = vp9_vector_sad(&ref[d], src, length);
+    this_sad = vp9_vector_var(&ref[d], src, bwl);
     if (this_sad < best_sad) {
       best_sad = this_sad;
       offset = d;
@@ -539,7 +539,7 @@ static int vector_match(int16_t *ref, int16_t *src, int length) {
     // check limit
     if (this_pos < 0 || this_pos > bw)
       continue;
-    this_sad = vp9_vector_sad(&ref[this_pos], src, length);
+    this_sad = vp9_vector_var(&ref[this_pos], src, bwl);
     if (this_sad < best_sad) {
       best_sad = this_sad;
       center = this_pos;
@@ -552,7 +552,7 @@ static int vector_match(int16_t *ref, int16_t *src, int length) {
     // check limit
     if (this_pos < 0 || this_pos > bw)
       continue;
-    this_sad = vp9_vector_sad(&ref[this_pos], src, length);
+    this_sad = vp9_vector_var(&ref[this_pos], src, bwl);
     if (this_sad < best_sad) {
       best_sad = this_sad;
       center = this_pos;
@@ -565,7 +565,7 @@ static int vector_match(int16_t *ref, int16_t *src, int length) {
     // check limit
     if (this_pos < 0 || this_pos > bw)
       continue;
-    this_sad = vp9_vector_sad(&ref[this_pos], src, length);
+    this_sad = vp9_vector_var(&ref[this_pos], src, bwl);
     if (this_sad < best_sad) {
       best_sad = this_sad;
       center = this_pos;
@@ -578,7 +578,7 @@ static int vector_match(int16_t *ref, int16_t *src, int length) {
     // check limit
     if (this_pos < 0 || this_pos > bw)
       continue;
-    this_sad = vp9_vector_sad(&ref[this_pos], src, length);
+    this_sad = vp9_vector_var(&ref[this_pos], src, bwl);
     if (this_sad < best_sad) {
       best_sad = this_sad;
       center = this_pos;
@@ -638,8 +638,8 @@ static void motion_estimation(VP9_COMP *cpi, MACROBLOCK *x,
   }

   // Find the best match per 1-D search
-  tmp_mv->col = vector_match(hbuf, src_hbuf, bw);
-  tmp_mv->row = vector_match(vbuf, src_vbuf, bh);
+  tmp_mv->col = vector_match(hbuf, src_hbuf, b_width_log2_lookup[bsize]);
+  tmp_mv->row = vector_match(vbuf, src_vbuf, b_height_log2_lookup[bsize]);

   best_sad = INT_MAX;
   this_mv = *tmp_mv;
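For context, vector_match runs a coarse-to-fine 1-D search over projection offsets: one coarse pass every 16 positions, then the four nearly identical hunks above are successive refinement passes that halve the probe distance around the best candidate. A simplified skeleton of that pattern (my paraphrase; search_1d and the metric callback are my names, and the exact return convention is an assumption since it is not visible in the hunks):

  #include <limits.h>
  #include <stdint.h>

  typedef int (*metric_fn)(const int16_t *ref, const int16_t *src, int bwl);

  static int search_1d(const int16_t *ref, const int16_t *src, int bwl,
                       metric_fn metric) {
    const int bw = 4 << bwl;
    int best = INT_MAX, center = 0, d, step;

    /* Coarse pass: probe every 16th offset in [0, bw]. */
    for (d = 0; d <= bw; d += 16) {
      const int score = metric(&ref[d], src, bwl);
      if (score < best) { best = score; center = d; }
    }
    /* Refinement passes: halve the probe distance (8, 4, 2, 1), checking
     * anchor - step and anchor + step around the best offset so far. */
    for (step = 8; step >= 1; step >>= 1) {
      const int anchor = center;
      for (d = anchor - step; d <= anchor + step; d += 2 * step) {
        int score;
        if (d < 0 || d > bw) continue;  /* stay inside the search window */
        score = metric(&ref[d], src, bwl);
        if (score < best) { best = score; center = d; }
      }
    }
    return center - (bw >> 1);  /* offset relative to the window center */
  }

Because each pass shrinks the step, the whole search costs O(log bw) metric evaluations after the coarse scan, which is why swapping the metric itself leaves the measured encoder speed unchanged.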

@@ -192,7 +192,6 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
                                  cond_cost_list(cpi, cost_list),
                                  x->nmvjointcost, x->mvcost,
                                  &dis, &x->pred_sse[ref], NULL, 0, 0);
-    x->pred_mv[ref] = tmp_mv->as_mv;
   }

   if (scaled_ref_frame) {

@@ -90,6 +90,9 @@ void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref,
   s0 = _mm_adds_epu16(s0, t0);
   s1 = _mm_adds_epu16(s1, t1);
+
+  s0 = _mm_srai_epi16(s0, 5);
+  s1 = _mm_srai_epi16(s1, 5);
   _mm_store_si128((__m128i *)hbuf, s0);
   hbuf += 8;
   _mm_store_si128((__m128i *)hbuf, s1);
@@ -112,51 +115,49 @@ int16_t vp9_int_pro_col_sse2(uint8_t const *ref, const int width) {
   s1 = _mm_srli_si128(s0, 8);
   s0 = _mm_adds_epu16(s0, s1);

-  return _mm_extract_epi16(s0, 0);
+  return (_mm_extract_epi16(s0, 0)) >> 5;
 }

-int vp9_vector_sad_sse2(int16_t const *ref, int16_t const *src,
-                        const int width) {
+int vp9_vector_var_sse2(int16_t const *ref, int16_t const *src,
+                        const int bwl) {
   int idx;
-  __m128i zero = _mm_setzero_si128();
-  __m128i sum;
+  int width = 4 << bwl;
+  int16_t mean;
   __m128i v0 = _mm_loadu_si128((const __m128i *)ref);
   __m128i v1 = _mm_load_si128((const __m128i *)src);
   __m128i diff = _mm_subs_epi16(v0, v1);
-  __m128i sign = _mm_srai_epi16(diff, 15);
-
-  diff = _mm_xor_si128(diff, sign);
-  sum = _mm_sub_epi16(diff, sign);
+  __m128i sum = diff;
+  __m128i sse = _mm_madd_epi16(diff, diff);

   ref += 8;
   src += 8;

-  v0 = _mm_unpacklo_epi16(sum, zero);
-  v1 = _mm_unpackhi_epi16(sum, zero);
-  sum = _mm_add_epi32(v0, v1);
-
   for (idx = 8; idx < width; idx += 8) {
     v0 = _mm_loadu_si128((const __m128i *)ref);
     v1 = _mm_load_si128((const __m128i *)src);
     diff = _mm_subs_epi16(v0, v1);
-    sign = _mm_srai_epi16(diff, 15);
-    diff = _mm_xor_si128(diff, sign);
-    diff = _mm_sub_epi16(diff, sign);
-
-    v0 = _mm_unpacklo_epi16(diff, zero);
-    v1 = _mm_unpackhi_epi16(diff, zero);
-    sum = _mm_add_epi32(sum, v0);
-    sum = _mm_add_epi32(sum, v1);
+
+    sum = _mm_add_epi16(sum, diff);
+    v0 = _mm_madd_epi16(diff, diff);
+    sse = _mm_add_epi32(sse, v0);

     ref += 8;
     src += 8;
   }

-  v0 = _mm_srli_si128(sum, 8);
-  sum = _mm_add_epi32(sum, v0);
-  v0 = _mm_srli_epi64(sum, 32);
-  sum = _mm_add_epi32(sum, v0);
+  v0 = _mm_srli_si128(sum, 8);
+  sum = _mm_add_epi16(sum, v0);
+  v0 = _mm_srli_epi64(sum, 32);
+  sum = _mm_add_epi16(sum, v0);
+  v0 = _mm_srli_epi32(sum, 16);
+  sum = _mm_add_epi16(sum, v0);

-  return _mm_cvtsi128_si32(sum);
+  v1 = _mm_srli_si128(sse, 8);
+  sse = _mm_add_epi32(sse, v1);
+  v1 = _mm_srli_epi64(sse, 32);
+  sse = _mm_add_epi32(sse, v1);
+
+  mean = _mm_extract_epi16(sum, 0);
+  return _mm_cvtsi128_si32(sse) - ((mean * mean) >> (bwl + 2));
 }
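Two details carry the SSE2 version: the signed differences are accumulated directly in 16-bit lanes (_mm_add_epi16), which is safe only because of the >> 5 projection scaling added above, and each squared group goes through _mm_madd_epi16 straight into 32-bit lanes before the horizontal reductions at the end. A hypothetical cross-check harness against the C reference (my sketch, not part of the patch; it assumes linking against both kernels, and uses GCC/Clang alignment syntax since the kernel loads src with the aligned _mm_load_si128):

  #include <assert.h>
  #include <stdint.h>
  #include <stdlib.h>

  int vp9_vector_var_c(int16_t const *ref, int16_t const *src, const int bwl);
  int vp9_vector_var_sse2(int16_t const *ref, int16_t const *src, const int bwl);

  int main(void) {
    /* 128 ref entries: enough for a bwl == 4 (width 64) read at offset 64. */
    static int16_t ref[128];
    static int16_t src[64] __attribute__((aligned(16)));
    int bwl, off, i;
    srand(0);
    /* 0..510 mirrors the range of the /32-normalized projections. */
    for (i = 0; i < 128; ++i) ref[i] = (int16_t)(rand() % 511);
    for (i = 0; i < 64; ++i) src[i] = (int16_t)(rand() % 511);
    for (bwl = 2; bwl <= 4; ++bwl)          /* widths 16, 32, 64 */
      for (off = 0; off <= 64; off += 16)   /* offsets as vector_match probes */
        assert(vp9_vector_var_c(&ref[off], src, bwl) ==
               vp9_vector_var_sse2(&ref[off], src, bwl));
    return 0;
  }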