Merge "Refactor block_yrd function for RTC coding mode".
This change is contained in commit 7acb2a8795.
@ -1171,6 +1171,9 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
|
||||
add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
|
||||
specialize qw/vp9_block_error avx2/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/int64_t vp9_block_error_fp/, "const int16_t *coeff, const int16_t *dqcoeff, int block_size";
|
||||
specialize qw/vp9_block_error_fp sse2/;
|
||||
|
||||
add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
|
||||
specialize qw/vp9_quantize_fp neon sse2/, "$ssse3_x86_64";
|
||||
|
||||
|
@ -557,6 +557,20 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize,
|
||||
*out_dist_sum += dist << 4;
|
||||
}
|
||||
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
|
||||
int *skippable, int64_t *sse, int plane,
|
||||
BLOCK_SIZE bsize, TX_SIZE tx_size) {
|
||||
MACROBLOCKD *xd = &x->e_mbd;
|
||||
unsigned int var_y, sse_y;
|
||||
(void)plane;
|
||||
(void)tx_size;
|
||||
model_rd_for_sb_y(cpi, bsize, x, xd, rate, dist, &var_y, &sse_y);
|
||||
*sse = INT_MAX;
|
||||
*skippable = 0;
|
||||
return;
|
||||
}
|
||||
#else
|
||||
static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
|
||||
int *skippable, int64_t *sse, int plane,
|
||||
BLOCK_SIZE bsize, TX_SIZE tx_size) {
|
||||
@ -574,23 +588,9 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
|
||||
const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 :
|
||||
xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
|
||||
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
unsigned int var_y, sse_y;
|
||||
model_rd_for_sb_y(cpi, bsize, x, xd, rate, dist, &var_y, &sse_y);
|
||||
*sse = INT_MAX;
|
||||
*skippable = 0;
|
||||
return;
|
||||
#else
|
||||
(void)cpi;
|
||||
#endif
|
||||
|
||||
vp9_subtract_plane(x, bsize, plane);
|
||||
|
||||
*skippable = 1;
|
||||
*rate = 0;
|
||||
*dist = 0;
|
||||
*sse = 0;
|
||||
|
||||
// Keep track of the row and column of the blocks we use so that we know
|
||||
// if we are in the unrestricted motion border.
|
||||
for (r = 0; r < max_blocks_high; r += block_step) {
|
||||
@ -604,7 +604,6 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
|
||||
const int diff_stride = 4 * num_4x4_blocks_wide_lookup[bsize];
|
||||
int i, j;
|
||||
const int16_t *src_diff;
|
||||
int64_t this_sse;
|
||||
txfrm_block_to_raster_xy(bsize, tx_size, block, &i, &j);
|
||||
src_diff = &p->src_diff[4 * (j * diff_stride + i)];
|
||||
|
||||
@ -641,16 +640,36 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
|
||||
assert(0);
|
||||
break;
|
||||
}
|
||||
*skippable &= (*eob == 0);
|
||||
}
|
||||
block += step;
|
||||
}
|
||||
}
|
||||
|
||||
*dist += vp9_block_error(coeff, dqcoeff, step << 4, &this_sse) >> shift;
|
||||
if (*skippable && *sse < INT64_MAX) {
|
||||
*dist = (*sse << 6) >> shift;
|
||||
*sse = *dist;
|
||||
return;
|
||||
}
|
||||
|
||||
block = 0;
|
||||
*rate = 0;
|
||||
*dist = 0;
|
||||
*sse = (*sse << 6) >> shift;
|
||||
for (r = 0; r < max_blocks_high; r += block_step) {
|
||||
for (c = 0; c < num_4x4_w; c += block_step) {
|
||||
if (c < max_blocks_wide) {
|
||||
tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
|
||||
tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
|
||||
tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
|
||||
uint16_t *const eob = &p->eobs[block];
|
||||
|
||||
if (*eob == 1)
|
||||
*rate += (int)abs(qcoeff[0]);
|
||||
else if (*eob > 1)
|
||||
*rate += (int)vp9_satd((const int16_t *)qcoeff, step << 4);
|
||||
|
||||
*sse += (this_sse >> shift);
|
||||
*skippable &= (*eob == 0);
|
||||
*dist += vp9_block_error_fp(coeff, dqcoeff, step << 4) >> shift;
|
||||
}
|
||||
block += step;
|
||||
}
|
||||
@ -659,6 +678,7 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
|
||||
*rate <<= 8;
|
||||
*rate *= 6;
|
||||
}
|
||||
#endif
|
||||
|
||||
static void model_rd_for_sb_uv(VP9_COMP *cpi, BLOCK_SIZE bsize,
|
||||
MACROBLOCK *x, MACROBLOCKD *xd,
|
||||
@ -866,7 +886,7 @@ static void estimate_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
|
||||
int i, j;
|
||||
int rate;
|
||||
int64_t dist;
|
||||
int64_t this_sse;
|
||||
int64_t this_sse = INT64_MAX;
|
||||
int is_skippable;
|
||||
|
||||
txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
|
||||
@ -1328,6 +1348,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
|
||||
}
|
||||
|
||||
if (bsize <= BLOCK_16X16) {
|
||||
this_sse = (int64_t)sse_y;
|
||||
block_yrd(cpi, x, &this_rdc.rate, &this_rdc.dist, &is_skippable,
|
||||
&this_sse, 0, bsize, mbmi->tx_size);
|
||||
x->skip_txfm[0] = is_skippable;
|
||||
|
@ -292,6 +292,18 @@ int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
|
||||
return error;
|
||||
}
|
||||
|
||||
/* Sum of squared differences between two int16_t coefficient vectors.
 *
 * coeff      - original coefficients.
 * dqcoeff    - dequantized coefficients.
 * block_size - number of elements to compare.
 *
 * Returns the accumulated squared error as a 64-bit value.
 */
int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff,
                             int block_size) {
  int i;
  int64_t error = 0;

  for (i = 0; i < block_size; i++) {
    /* Widen before squaring: the difference of two int16_t values spans
     * [-65535, 65535], so squaring it in plain int can exceed INT_MAX,
     * which is undefined behavior for signed arithmetic in C. */
    const int64_t diff = coeff[i] - dqcoeff[i];
    error += diff * diff;
  }

  return error;
}
|
||||
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
int64_t vp9_highbd_block_error_c(const tran_low_t *coeff,
|
||||
|
@ -72,3 +72,49 @@ cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
|
||||
movd edx, m5
|
||||
%endif
|
||||
RET
|
||||
|
||||
; Compute the sum of squared difference between two int16_t vectors.
; int64_t vp9_block_error_fp(const int16_t *coeff, const int16_t *dqcoeff,
;                            int block_size)
; Registers (via x264asm cglobal): uqcq = coeff, dqcq = dqcoeff, sizeq = count.
INIT_XMM sse2
cglobal block_error_fp, 3, 3, 8, uqc, dqc, size
  pxor      m4, m4                 ; sse accumulator
  pxor      m5, m5                 ; dedicated zero register
  ; Point both bases one-past-the-end, then count sizeq up from the negative
  ; element offset so a single add+jl drives the loop.
  lea       uqcq, [uqcq+sizeq*2]
  lea       dqcq, [dqcq+sizeq*2]
  neg       sizeq
.loop:
  ; Process 2*mmsize bytes (16 int16_t values) per iteration.
  mova      m2, [uqcq+sizeq*2]
  mova      m0, [dqcq+sizeq*2]
  mova      m3, [uqcq+sizeq*2+mmsize]
  mova      m1, [dqcq+sizeq*2+mmsize]
  psubw     m0, m2
  psubw     m1, m3
  ; individual errors are max. 15bit+sign, so squares are 30bit, and
  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
  ; NOTE(review): this assumes |coeff - dqcoeff| stays within 15 bits;
  ; callers must guarantee the coefficient range.
  pmaddwd   m0, m0
  pmaddwd   m1, m1
  ; accumulate in 64bit: zero-extend each 32-bit lane (interleave with the
  ; zero register) before the quadword adds.
  punpckldq m7, m0, m5
  punpckhdq m0, m5
  paddq     m4, m7
  punpckldq m7, m1, m5
  paddq     m4, m0
  punpckhdq m1, m5
  paddq     m4, m7
  paddq     m4, m1
  add       sizeq, mmsize
  jl .loop

  ; accumulate horizontally and store in return value
  movhlps   m5, m4
  paddq     m4, m5
%if ARCH_X86_64
  movq    rax, m4
%else
  ; 32-bit ABI: return the 64-bit result in edx:eax.
  pshufd   m5, m4, 0x1
  movd    eax, m4
  movd    edx, m5
%endif
  RET
|
||||
|
Loading…
x
Reference in New Issue
Block a user