Reduce memory footprint for CLPF decoding.
Instead of having CLPF write to an entire new frame and copy the result back into the original frame, make the filter work in-place by keeping a buffer of size frame_width * filter_block_size and delaying the write-back by one filter_block_size row. This reduces the cycles spent in the filter to about 75% of what they were.

Change-Id: I78ca74380c45492daa8935d08d766851edb5fbc1
Committed by: Yaowu Xu
Parent: 34dac00adc
Commit: e8224c7ad5
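For orientation, a minimal, self-contained sketch of the delayed write-back scheme this patch implements. All names here (filter_one_block, filter_frame_inplace, BS) are hypothetical stand-ins, not libaom API; the real code below stages 8x8 blocks the same way via cache, cache_ptr and cache_dst. The sketch assumes width and height are multiples of BS and width / BS <= 1024.

    #include <stdint.h>
    #include <string.h>

    #define BS 8 /* filter block size, 8x8 as in CLPF */

    /* Hypothetical stand-in for the real filter kernel (identity here).
       The real kernel reads pixels above and below the block, which is why
       its output cannot be written straight back into the frame. */
    void filter_one_block(const uint8_t *src, int sstride, uint8_t *dst,
                          int dstride) {
      int x, y;
      for (y = 0; y < BS; y++)
        for (x = 0; x < BS; x++) dst[y * dstride + x] = src[y * sstride + x];
    }

    /* Filter a frame in-place: each block is filtered into a cache slot and
       only copied back when the slot is about to be reused, i.e. one row of
       blocks later, when no later block reads the original pixels any more. */
    void filter_frame_inplace(uint8_t *frame, int stride, int width,
                              int height) {
      const int cache_blocks = width / BS; /* one block row, as in the commit */
      uint8_t cache[1024 * BS * BS];       /* frame_width * filter_block_size */
      uint8_t *cache_dst[1024] = { 0 };    /* write-back address per slot */
      int x, y, r, i, cache_idx = 0;

      for (y = 0; y < height; y += BS) {
        for (x = 0; x < width; x += BS) {
          uint8_t *slot = cache + cache_idx * BS * BS;
          if (cache_dst[cache_idx]) /* slot in use: flush the old block first */
            for (r = 0; r < BS; r++)
              memcpy(cache_dst[cache_idx] + r * stride, slot + r * BS, BS);
          filter_one_block(frame + y * stride + x, stride, slot, BS);
          cache_dst[cache_idx] = frame + y * stride + x;
          if (++cache_idx >= cache_blocks) cache_idx = 0;
        }
      }
      /* Flush the last row of blocks still waiting in the cache. */
      for (i = 0; i < cache_blocks; i++)
        if (cache_dst[i])
          for (r = 0; r < BS; r++)
            memcpy(cache_dst[i] + r * stride, cache + i * BS * BS + r * BS, BS);
    }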
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -587,7 +587,7 @@ add_proto qw/void aom_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint
 specialize qw/aom_lpf_horizontal_4_dual sse2 neon dspr2 msa/;
 
 if (aom_config("CONFIG_CLPF") eq "yes") {
-  add_proto qw/void aom_clpf_block/, "const uint8_t *src, uint8_t *dst, int stride, int x0, int y0, int sizex, int sizey, int width, int height, unsigned int strength";
+  add_proto qw/void aom_clpf_block/, "const uint8_t *src, uint8_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, int width, int height, unsigned int strength";
   specialize qw/aom_clpf_block sse2 ssse3 sse4_1 neon/;
   add_proto qw/void aom_clpf_detect/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength";
   specialize qw/aom_clpf_detect sse2 ssse3 sse4_1 neon/;
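The only functional change in the prototype is that the single stride splits into sstride (source) and dstride (destination), so the destination no longer has to be a full frame. A sketch of the two call shapes this enables, using the names from the diff; tmp is a hypothetical 8x8 scratch block and the offset trick mirrors the cache code added in clpf.c below:

    void clpf_block_examples(const YV12_BUFFER_CONFIG *rec,
                             YV12_BUFFER_CONFIG *dst, int xpos, int ypos,
                             int width, int height, unsigned int strength) {
      const int bs = 8;
      uint8_t tmp[8 * 8];
      /* 1. Frame to frame, as before: both strides are frame strides. */
      aom_clpf_block(rec->y_buffer, dst->y_buffer, rec->y_stride, dst->y_stride,
                     xpos, ypos, bs, bs, width, height, strength);
      /* 2. Frame to scratch: dstride == bs. Offsetting dst by -ypos * bs - xpos
         makes the kernel's write to dst[y * dstride + x] land at
         tmp[(y - ypos) * bs + (x - xpos)], filling tmp compactly. */
      aom_clpf_block(rec->y_buffer, tmp - ypos * bs - xpos, rec->y_stride, bs,
                     xpos, ypos, bs, bs, width, height, strength);
    }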
--- a/av1/common/clpf.c
+++ b/av1/common/clpf.c
@@ -27,30 +27,30 @@ int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int b) {
   return (8 + delta - (delta < 0)) >> 4;
 }
 
-void aom_clpf_block_c(const uint8_t *src, uint8_t *dst, int stride, int x0,
-                      int y0, int sizex, int sizey, int width, int height,
-                      unsigned int strength) {
+void aom_clpf_block_c(const uint8_t *src, uint8_t *dst, int sstride,
+                      int dstride, int x0, int y0, int sizex, int sizey,
+                      int width, int height, unsigned int strength) {
   int x, y;
   for (y = y0; y < y0 + sizey; y++) {
     for (x = x0; x < x0 + sizex; x++) {
-      int X = src[y * stride + x];
-      int A = src[AOMMAX(0, y - 1) * stride + x];
-      int B = src[y * stride + AOMMAX(0, x - 2)];
-      int C = src[y * stride + AOMMAX(0, x - 1)];
-      int D = src[y * stride + AOMMIN(width - 1, x + 1)];
-      int E = src[y * stride + AOMMIN(width - 1, x + 2)];
-      int F = src[AOMMIN(height - 1, y + 1) * stride + x];
+      int X = src[y * sstride + x];
+      int A = src[AOMMAX(0, y - 1) * sstride + x];
+      int B = src[y * sstride + AOMMAX(0, x - 2)];
+      int C = src[y * sstride + AOMMAX(0, x - 1)];
+      int D = src[y * sstride + AOMMIN(width - 1, x + 1)];
+      int E = src[y * sstride + AOMMIN(width - 1, x + 2)];
+      int F = src[AOMMIN(height - 1, y + 1) * sstride + x];
       int delta;
      delta = av1_clpf_sample(X, A, B, C, D, E, F, strength);
-      dst[y * stride + x] = X + delta;
+      dst[y * dstride + x] = X + delta;
     }
   }
 }
 
 // Return number of filtered blocks
-int av1_clpf_frame(const YV12_BUFFER_CONFIG *dst, const YV12_BUFFER_CONFIG *rec,
-                   const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
-                   int enable_fb_flag, unsigned int strength,
+int av1_clpf_frame(const YV12_BUFFER_CONFIG *orig_dst,
+                   const YV12_BUFFER_CONFIG *rec, const YV12_BUFFER_CONFIG *org,
+                   AV1_COMMON *cm, int enable_fb_flag, unsigned int strength,
                    unsigned int fb_size_log2, uint8_t *blocks,
                    int (*decision)(int, int, const YV12_BUFFER_CONFIG *,
                                    const YV12_BUFFER_CONFIG *,
@@ -59,23 +59,45 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *dst, const YV12_BUFFER_CONFIG *rec,
   /* Constrained low-pass filter (CLPF) */
   int c, k, l, m, n;
   const int bs = MI_SIZE;
-  int width = rec->y_crop_width;
-  int height = rec->y_crop_height;
+  const int width = rec->y_crop_width;
+  const int height = rec->y_crop_height;
   int xpos, ypos;
-  int stride_y = rec->y_stride;
-  int num_fb_hor = (width + (1 << fb_size_log2) - 1) >> fb_size_log2;
-  int num_fb_ver = (height + (1 << fb_size_log2) - 1) >> fb_size_log2;
+  const int sstride = rec->y_stride;
+  int dstride = orig_dst->y_stride;
+  const int num_fb_hor = (width + (1 << fb_size_log2) - 1) >> fb_size_log2;
+  const int num_fb_ver = (height + (1 << fb_size_log2) - 1) >> fb_size_log2;
   int block_index = 0;
+  uint8_t *cache = NULL;
+  uint8_t **cache_ptr = NULL;
+  uint8_t **cache_dst = NULL;
+  int cache_idx = 0;
+  const int cache_size = num_fb_hor << (2 * fb_size_log2);
+  const int cache_blocks = cache_size / (bs * bs);
+  YV12_BUFFER_CONFIG dst = *orig_dst;
+
+  // Make buffer space for in-place filtering
+  if (rec->y_buffer == dst.y_buffer) {
+    CHECK_MEM_ERROR(cm, cache, aom_malloc(cache_size));
+    CHECK_MEM_ERROR(cm, cache_ptr,
+                    aom_malloc(cache_blocks * sizeof(*cache_ptr)));
+    CHECK_MEM_ERROR(cm, cache_dst,
+                    aom_malloc(cache_blocks * sizeof(*cache_dst)));
+    memset(cache_ptr, 0, cache_blocks * sizeof(*cache_dst));
+    dst.y_buffer = cache;
+    dstride = bs;
+  }
 
   // Iterate over all filter blocks
   for (k = 0; k < num_fb_ver; k++) {
     for (l = 0; l < num_fb_hor; l++) {
       int h, w;
       int allskip = 1;
+      const int xoff = l << fb_size_log2;
+      const int yoff = k << fb_size_log2;
       for (m = 0; allskip && m < (1 << fb_size_log2) / bs; m++) {
         for (n = 0; allskip && n < (1 << fb_size_log2) / bs; n++) {
-          xpos = (l << fb_size_log2) + n * bs;
-          ypos = (k << fb_size_log2) + m * bs;
+          xpos = xoff + n * bs;
+          ypos = yoff + m * bs;
           if (xpos < width && ypos < height) {
             allskip &=
                 cm->mi_grid_visible[ypos / bs * cm->mi_stride + xpos / bs]
@@ -96,31 +118,57 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *dst, const YV12_BUFFER_CONFIG *rec,
         // Iterate over all smaller blocks inside the filter block
         for (m = 0; m < (h + bs - 1) / bs; m++) {
           for (n = 0; n < (w + bs - 1) / bs; n++) {
-            xpos = (l << fb_size_log2) + n * bs;
-            ypos = (k << fb_size_log2) + m * bs;
+            xpos = xoff + n * bs;
+            ypos = yoff + m * bs;
             if (!cm->mi_grid_visible[ypos / bs * cm->mi_stride + xpos / bs]
-                     ->mbmi.skip) {
-              // Not skip block, apply the filter
-              aom_clpf_block(rec->y_buffer, dst->y_buffer, stride_y, xpos, ypos,
-                             bs, bs, width, height, strength);
+                     ->mbmi.skip) {  // Not skip block
+              // Temporary buffering needed if filtering in-place
+              if (cache) {
+                if (cache_ptr[cache_idx]) {
+                  // Copy filtered block back into the frame
+                  for (c = 0; c < bs; c++)
+                    *(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
+                        *(uint64_t *)(cache_ptr[cache_idx] + c * bs);
+                }
+                cache_ptr[cache_idx] = cache + cache_idx * bs * bs;
+                dst.y_buffer = cache_ptr[cache_idx] - ypos * bs - xpos;
+                cache_dst[cache_idx] = rec->y_buffer + ypos * sstride + xpos;
+                if (++cache_idx >= cache_blocks) cache_idx = 0;
+              }
+
+              // Apply the filter
+              aom_clpf_block(rec->y_buffer, dst.y_buffer, sstride, dstride,
+                             xpos, ypos, bs, bs, width, height, strength);
+
             } else {  // Skip block, copy instead
-              for (c = 0; c < bs; c++)
-                *(uint64_t *)(dst->y_buffer + (ypos + c) * stride_y + xpos) =
-                    *(uint64_t *)(rec->y_buffer + (ypos + c) * stride_y + xpos);
+              if (!cache)
+                for (c = 0; c < bs; c++)
+                  *(uint64_t *)(dst.y_buffer + (ypos + c) * dstride + xpos) = *(
+                      uint64_t *)(rec->y_buffer + (ypos + c) * sstride + xpos);
             }
           }
         }
       } else {  // Entire filter block is skip, copy
-        for (m = 0; m < h; m++)
-          memcpy(dst->y_buffer + ((k << fb_size_log2) + m) * stride_y +
-                     (l << fb_size_log2),
-                 rec->y_buffer + ((k << fb_size_log2) + m) * stride_y +
-                     (l << fb_size_log2),
-                 w);
+        if (!cache)
+          for (m = 0; m < h; m++)
+            memcpy(dst.y_buffer + (yoff + m) * dstride + xoff,
+                   rec->y_buffer + (yoff + m) * sstride + xoff, w);
       }
       block_index += !allskip;  // Count number of blocks filtered
     }
   }
 
+  if (cache) {
+    // Copy remaining blocks into the frame
+    for (cache_idx = 0; cache_idx < cache_blocks && cache_ptr[cache_idx];
+         cache_idx++)
+      for (c = 0; c < bs; c++)
+        *(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
+            *(uint64_t *)(cache_ptr[cache_idx] + c * bs);
+
+    aom_free(cache);
+    aom_free(cache_ptr);
+  }
+
   return block_index;
 }
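A worked example of the cache sizing above, assuming a 1920-pixel-wide frame and 128x128 filter blocks (fb_size_log2 == 7):

    /* num_fb_hor   = (1920 + (1 << 7) - 1) >> 7 = 15
     * cache_size   = 15 << (2 * 7) = 15 * 16384 = 245760 bytes
     *              = frame_width * filter_block_size = 1920 * 128
     * cache_blocks = 245760 / (8 * 8)            = 3840 8x8 slots
     *
     * With dstride == bs, the redirection
     *   dst.y_buffer = cache_ptr[cache_idx] - ypos * bs - xpos;
     * makes the kernel's write to dst.y_buffer[y * bs + x] land at
     * cache_ptr[cache_idx][(y - ypos) * bs + (x - xpos)], i.e. each block
     * is stored compactly at the start of its cache slot. */

So the cache holds exactly one filter-block row, which is what delays the write-back by one filter_block_size row as the commit message says.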
--- a/av1/common/clpf.h
+++ b/av1/common/clpf.h
@@ -18,7 +18,7 @@
 int av1_clpf_maxbits(const AV1_COMMON *cm);
 int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int b);
 int av1_clpf_frame(const YV12_BUFFER_CONFIG *dst, const YV12_BUFFER_CONFIG *rec,
-                   const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
+                   const YV12_BUFFER_CONFIG *org, AV1_COMMON *cm,
                    int enable_fb_flag, unsigned int strength,
                    unsigned int fb_size_log2, uint8_t *blocks,
                    int (*decision)(int, int, const YV12_BUFFER_CONFIG *,
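Note why cm loses its const qualifier here: the in-place path in av1_clpf_frame now allocates its cache through CHECK_MEM_ERROR(cm, ...), which must be able to record an allocation failure in the error state carried inside cm.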
--- a/av1/common/clpf_simd.h
+++ b/av1/common/clpf_simd.h
@@ -11,11 +11,11 @@
 
 #include "./aom_dsp_rtcd.h"
 
-static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0,
-                       int y0, int sizey, int width, int height,
-                       unsigned int strength) {
-  dst += x0 + y0 * stride;
-  src += x0 + y0 * stride;
+static void clpf_block(const uint8_t *src, uint8_t *dst, int sstride,
+                       int dstride, int x0, int y0, int sizey, int width,
+                       int height, unsigned int strength) {
+  dst += x0 + y0 * dstride;
+  src += x0 + y0 * sstride;
   {
     int bottom = height - 2 - y0;
     const v128 sp = v128_dup_8(strength);
@@ -32,23 +32,23 @@ static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0,
 
       for (y = 0; y < sizey; y += 2) {
         const v64 l1 = v64_load_aligned(src);
-        const v64 l2 = v64_load_aligned(src + stride);
+        const v64 l2 = v64_load_aligned(src + sstride);
         v128 o = v128_from_v64(l1, l2);
         const v128 x = v128_add_8(c128, o);
         const v128 a = v128_add_8(
             c128,
-            v128_from_v64(v64_load_aligned(src - (y != -y0) * stride), l1));
+            v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1));
         const v128 b = v128_shuffle_8(x, b_shuff);
         const v128 c = v128_shuffle_8(x, c_shuff);
         const v128 d = v128_add_8(
             c128, v128_from_v64(v64_load_unaligned(src + 1),
-                                v64_load_unaligned(src + 1 + stride)));
+                                v64_load_unaligned(src + 1 + sstride)));
         const v128 e = v128_add_8(
             c128, v128_from_v64(v64_load_unaligned(src + 2),
-                                v64_load_unaligned(src + 2 + stride)));
+                                v64_load_unaligned(src + 2 + sstride)));
         const v128 f = v128_add_8(
             c128, v128_from_v64(l2, v64_load_aligned(
-                                        src + ((y != bottom) + 1) * stride)));
+                                        src + ((y != bottom) + 1) * sstride)));
 
         const v128 tmp =
             v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm),
@@ -70,9 +70,9 @@ static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0,
                                                         delta, v128_zero()))),
                 4));
         v64_store_aligned(dst, v128_high_v64(o));
-        v64_store_aligned(dst + stride, v128_low_v64(o));
-        src += stride * 2;
-        dst += stride * 2;
+        v64_store_aligned(dst + dstride, v128_low_v64(o));
+        src += sstride * 2;
+        dst += dstride * 2;
       }
     } else if (!(width - x0 - 8)) {  // Clip right
       const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0f0e0d0c0b0a09LL),
@@ -83,23 +83,23 @@ static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0,
 
       for (y = 0; y < sizey; y += 2) {
         const v64 l1 = v64_load_aligned(src);
-        const v64 l2 = v64_load_aligned(src + stride);
+        const v64 l2 = v64_load_aligned(src + sstride);
         v128 o = v128_from_v64(l1, l2);
         const v128 x = v128_add_8(c128, o);
         const v128 a = v128_add_8(
             c128,
-            v128_from_v64(v64_load_aligned(src - (y != -y0) * stride), l1));
+            v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1));
         const v128 b = v128_add_8(
             c128, v128_from_v64(v64_load_unaligned(src - 2),
-                                v64_load_unaligned(src - 2 + stride)));
+                                v64_load_unaligned(src - 2 + sstride)));
         const v128 c = v128_add_8(
             c128, v128_from_v64(v64_load_unaligned(src - 1),
-                                v64_load_unaligned(src - 1 + stride)));
+                                v64_load_unaligned(src - 1 + sstride)));
         const v128 d = v128_shuffle_8(x, d_shuff);
         const v128 e = v128_shuffle_8(x, e_shuff);
         const v128 f = v128_add_8(
             c128, v128_from_v64(l2, v64_load_aligned(
-                                        src + ((y != bottom) + 1) * stride)));
+                                        src + ((y != bottom) + 1) * sstride)));
 
         const v128 tmp =
             v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm),
@@ -121,35 +121,35 @@ static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0,
                                                         delta, v128_zero()))),
                 4));
         v64_store_aligned(dst, v128_high_v64(o));
-        v64_store_aligned(dst + stride, v128_low_v64(o));
-        src += stride * 2;
-        dst += stride * 2;
+        v64_store_aligned(dst + dstride, v128_low_v64(o));
+        src += sstride * 2;
+        dst += dstride * 2;
       }
     } else {  // No left/right clipping
       int y;
       for (y = 0; y < sizey; y += 2) {
         const v64 l1 = v64_load_aligned(src);
-        const v64 l2 = v64_load_aligned(src + stride);
+        const v64 l2 = v64_load_aligned(src + sstride);
         v128 o = v128_from_v64(l1, l2);
         const v128 x = v128_add_8(c128, o);
         const v128 a = v128_add_8(
             c128,
-            v128_from_v64(v64_load_aligned(src - (y != -y0) * stride), l1));
+            v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1));
         const v128 b = v128_add_8(
             c128, v128_from_v64(v64_load_unaligned(src - 2),
-                                v64_load_unaligned(src - 2 + stride)));
+                                v64_load_unaligned(src - 2 + sstride)));
         const v128 c = v128_add_8(
             c128, v128_from_v64(v64_load_unaligned(src - 1),
-                                v64_load_unaligned(src - 1 + stride)));
+                                v64_load_unaligned(src - 1 + sstride)));
         const v128 d = v128_add_8(
             c128, v128_from_v64(v64_load_unaligned(src + 1),
-                                v64_load_unaligned(src + 1 + stride)));
+                                v64_load_unaligned(src + 1 + sstride)));
         const v128 e = v128_add_8(
             c128, v128_from_v64(v64_load_unaligned(src + 2),
-                                v64_load_unaligned(src + 2 + stride)));
+                                v64_load_unaligned(src + 2 + sstride)));
         const v128 f = v128_add_8(
             c128, v128_from_v64(l2, v64_load_aligned(
-                                        src + ((y != bottom) + 1) * stride)));
+                                        src + ((y != bottom) + 1) * sstride)));
 
         const v128 tmp =
             v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm),
@@ -171,17 +171,18 @@ static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0,
                                                         delta, v128_zero()))),
                 4));
         v64_store_aligned(dst, v128_high_v64(o));
-        v64_store_aligned(dst + stride, v128_low_v64(o));
-        src += stride * 2;
-        dst += stride * 2;
+        v64_store_aligned(dst + dstride, v128_low_v64(o));
+        src += sstride * 2;
+        dst += dstride * 2;
       }
     }
   }
 }
 
-void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int stride,
-                               int x0, int y0, int sizex, int sizey, int width,
-                               int height, unsigned int strength) {
+void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int sstride,
+                               int dstride, int x0, int y0, int sizex,
+                               int sizey, int width, int height,
+                               unsigned int strength) {
   // TODO(stemidts):
   // A sizex different from 8 will only be needed if CLPF is extended to chroma.
   // This will only be used if 4:2:0 and width not a multiple of 16 and along
@@ -189,9 +190,10 @@ void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int stride,
   // this case. If not extended to chroma, this test will be redundant.
   if (sizex != 8 || width < 16 || y0 + 8 > height || x0 + 8 > width) {
     // Fallback to C for odd sizes
-    aom_clpf_block_c(src, dst, stride, x0, y0, sizex, sizey, width, height,
-                     strength);
+    aom_clpf_block_c(src, dst, sstride, dstride, x0, y0, sizex, sizey, width,
+                     height, strength);
   } else {
-    clpf_block(src, dst, stride, x0, y0, sizey, width, height, strength);
+    clpf_block(src, dst, sstride, dstride, x0, y0, sizey, width, height,
+               strength);
   }
 }
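The SIMD kernel processes two rows per iteration (one v128 holds two 8-pixel rows), so the stride split shows up in three places per loop: the paired loads step by sstride, the paired stores step by dstride, and the row advance becomes src += sstride * 2; dst += dstride * 2;. The filter arithmetic itself is unchanged.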
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -3929,19 +3929,10 @@ void av1_decode_frame(AV1Decoder *pbi, const uint8_t *data,
 
 #if CONFIG_CLPF
   if (cm->clpf_strength && !cm->skip_loop_filter) {
-    YV12_BUFFER_CONFIG dst;  // Buffer for the result
-
-    dst = pbi->cur_buf->buf;
-    CHECK_MEM_ERROR(cm, dst.y_buffer, aom_malloc(dst.y_stride * dst.y_height));
-
-    av1_clpf_frame(&dst, &pbi->cur_buf->buf, 0, cm, !!cm->clpf_size,
+    const YV12_BUFFER_CONFIG *const frame = &pbi->cur_buf->buf;
+    av1_clpf_frame(frame, frame, 0, cm, !!cm->clpf_size,
                    cm->clpf_strength + (cm->clpf_strength == 3),
                    4 + cm->clpf_size, cm->clpf_blocks, clpf_bit);
-
-    // Copy result
-    memcpy(pbi->cur_buf->buf.y_buffer, dst.y_buffer,
-           dst.y_height * dst.y_stride);
-    aom_free(dst.y_buffer);
   }
   if (cm->clpf_blocks) aom_free(cm->clpf_blocks);
 #endif
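With the frame passed as both destination and source, av1_clpf_frame detects in-place operation by pointer equality (rec->y_buffer == dst.y_buffer in clpf.c above) and takes the cache path; a caller that passes distinct buffers keeps the old direct-write behaviour. A sketch of both call shapes, where tmp is a hypothetical separate output frame and the remaining arguments follow the decoder call above:

    /* In-place, as in the decoder above: the cache path is taken. */
    av1_clpf_frame(frame, frame, 0, cm, !!cm->clpf_size,
                   cm->clpf_strength + (cm->clpf_strength == 3),
                   4 + cm->clpf_size, cm->clpf_blocks, clpf_bit);

    /* Out-of-place: rec->y_buffer != dst->y_buffer, so no cache is
       allocated and the filter writes straight into tmp. */
    av1_clpf_frame(&tmp, frame, 0, cm, !!cm->clpf_size,
                   cm->clpf_strength + (cm->clpf_strength == 3),
                   4 + cm->clpf_size, cm->clpf_blocks, clpf_bit);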
--- a/test/clpf_test.cc
+++ b/test/clpf_test.cc
@@ -26,9 +26,9 @@ using libaom_test::ACMRandom;
 
 namespace {
 
-typedef void (*clpf_block_t)(const uint8_t *src, uint8_t *dst, int stride,
-                             int x0, int y0, int sizex, int sizey, int width,
-                             int height, unsigned int strength);
+typedef void (*clpf_block_t)(const uint8_t *src, uint8_t *dst, int sstride,
+                             int dstride, int x0, int y0, int sizex, int sizey,
+                             int width, int height, unsigned int strength);
 
 typedef std::tr1::tuple<clpf_block_t, clpf_block_t, int, int>
     clpf_block_param_t;
@@ -85,10 +85,10 @@ TEST_P(ClpfBlockTest, TestSIMDNoMismatch) {
   for (ypos = 0; ypos < size && !error; ypos += h * !error) {
     for (xpos = 0; xpos < size && !error; xpos += w * !error) {
       for (strength = 0; strength < 3 && !error; strength += !error) {
-        ref_clpf(s, ref_d, size, xpos, ypos, w, h, size, size,
+        ref_clpf(s, ref_d, size, size, xpos, ypos, w, h, size, size,
                  1 << strength);
-        ASM_REGISTER_STATE_CHECK(
-            clpf(s, d, size, xpos, ypos, w, h, size, size, 1 << strength));
+        ASM_REGISTER_STATE_CHECK(clpf(s, d, size, size, xpos, ypos, w, h,
+                                      size, size, 1 << strength));
 
         for (pos = 0; pos < size * size && !error; pos++) {
           error = ref_d[pos] != d[pos];
@@ -137,7 +137,8 @@ TEST_P(ClpfSpeedTest, TestSpeed) {
   for (ypos = 0; ypos < size; ypos += h) {
     for (xpos = 0; xpos < size; xpos += w) {
       for (strength = 0; strength < 3; strength++) {
-        ref_clpf(s, d, size, xpos, ypos, w, h, size, size, 1 << strength);
+        ref_clpf(s, d, size, size, xpos, ypos, w, h, size, size,
+                 1 << strength);
       }
     }
   }
@@ -150,7 +151,7 @@ TEST_P(ClpfSpeedTest, TestSpeed) {
   for (ypos = 0; ypos < size; ypos += h) {
     for (xpos = 0; xpos < size; xpos += w) {
      for (strength = 0; strength < 3; strength++) {
-        clpf(s, d, size, xpos, ypos, w, h, size, size, 1 << strength);
+        clpf(s, d, size, size, xpos, ypos, w, h, size, size, 1 << strength);
       }
     }
   }