Alternate rounding

Improves performance on derf by 0.89% for 10-bit internal bit depth and by 0.55% for 12-bit internal bit depth, both for 8-bit sources.

Change-Id: I181fd9fb10e2259233d67cdd7933fb3cae334afc
2014-06-05 03:46:57 -07:00
parent 091829d376
commit e91d29dea3
5 changed files with 8 additions and 16 deletions
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -72,7 +72,7 @@ void vp9_high_subtract_block_c(int rows, int cols,
 #if CONFIG_HIGH_TRANSFORMS
      diff[c] = src[c] - pred[c];
 #else
-      diff[c] = ((src[c] + rnd) >> shift) - ((pred[c] + rnd) >> shift);
+      diff[c] = (src[c] - pred[c] + rnd) >> shift;
 #endif
    }

@@ -93,7 +93,7 @@ void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
 #if CONFIG_VP9_HIGH
  if (x->e_mbd.cur_buf->flags&YV12_FLAG_HIGH) {
    vp9_high_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
-                     pd->dst.buf, pd->dst.stride, x->e_mbd.bps);
+                            pd->dst.buf, pd->dst.stride, x->e_mbd.bps);
    return;
  }
 #endif
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -1781,16 +1781,12 @@ static int64_t high_get_sse_shift(const uint8_t *a8, int a_stride,
                                  unsigned int input_shift) {
  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
-  const int offset = (1 << (input_shift - 1));
-  const int max_val = (1 << bit_depth) - 1;
  int64_t total_sse = 0;
  int x, y;
  for (y = 0; y < height; ++y) {
    for (x = 0; x < width; ++x) {
      int64_t diff;
-      int16_t pix_val = (b[x] + offset);
-      pix_val = pix_val > max_val ? max_val : pix_val;
-      diff = (a[x] >> input_shift) - (pix_val >> input_shift);
+      diff = (a[x] >> input_shift) - (b[x] >> input_shift);
      total_sse += diff * diff;
    }
    a += a_stride;
--- a/vp9/encoder/vp9_ssim.c
+++ b/vp9/encoder/vp9_ssim.c
@@ -67,13 +67,11 @@ void vp9_high_ssim_parms_8x8_shift_c(uint16_t *s, int sp, uint16_t *r, int rp,
                                     uint32_t *sum_sxr, unsigned int bps,
                                     unsigned int shift) {
  int i, j;
-  const int offset = (1 << (shift - 1));
  const int max_val = (1 << bps) - 1;
  for (i = 0; i < 8; i++, s += sp, r += rp) {
    for (j = 0; j < 8; j++) {
      int sj = s[j];
-      int rj = r[j] + offset;
-      rj = ((rj > max_val ? max_val : rj) >> shift) << shift;
+      int rj = r[j];
      *sum_s += sj;
      *sum_r += rj;
      *sum_sq_s += sj * sj;
--- a/vpxdec.c
+++ b/vpxdec.c
@@ -522,9 +522,6 @@ static FILE *open_outfile(const char *name) {
 static void img_convert_16_to_8(vpx_image_t *dst, vpx_image_t *src,
                                int output_shift) {
  int plane;
-  int offset = 0;
-  if (output_shift  > 0)
-    offset = 1 << (output_shift - 1);
  if (src->fmt != dst->fmt + VPX_IMG_FMT_HIGH ||
      dst->d_w != src->d_w || dst->d_h != src->d_h ||
      dst->x_chroma_shift != src->x_chroma_shift ||
@@ -553,8 +550,7 @@ static void img_convert_16_to_8(vpx_image_t *dst, vpx_image_t *src,
      uint16_t *p_src = (uint16_t *)(src->planes[plane] +
                                     y * src->stride[plane]);
      for (x = 0; x < w; x++) {
-        int pix_val = (*p_src++ + offset) >> output_shift;
-        *p_dst++ = pix_val > 255 ? 255 : pix_val;
+        *p_dst++ = *p_src++ >> output_shift;
      }
    }
  }
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -1630,6 +1630,8 @@ static float usec_to_fps(uint64_t usec, unsigned int frames) {
 #if CONFIG_VP9_HIGH
 static void img_convert_8_to_16(vpx_image_t  *dst, vpx_image_t *src,
                                int input_shift) {
+  // Note: the offset is 1 less than half of (1 << input_shift).
+  const int offset = input_shift > 0 ? (1 << (input_shift - 1)) - 1 : 0;
  int plane;
  if (dst->fmt != src->fmt + VPX_IMG_FMT_HIGH ||
      dst->w != src->w || dst->h != src->h ||
@@ -1659,7 +1661,7 @@ static void img_convert_8_to_16(vpx_image_t  *dst, vpx_image_t *src,
      uint16_t *p_dst = (uint16_t *)(dst->planes[plane] +
                                     y * dst->stride[plane]);
      for (x = 0; x < w; x++) {
-        *p_dst++ = *p_src++ << input_shift;
+        *p_dst++ = (*p_src++ << input_shift) + offset;
      }
    }
  }