Reduce memory footprint for CLPF decoding.

Instead of having CLPF write to an entire new frame and copy the result back into the original frame, make the filter able to work in-place by keeping a buffer of size frame_width*filter_block_size and delaying the write-back by one filter_block_size row. This reduces the cycles spent in the filter to ~75% of the previous count. Change-Id: I78ca74380c45492daa8935d08d766851edb5fbc1
2016-08-24 13:00:04 +02:00
parent 34dac00adc
commit e8224c7ad5
6 changed files with 136 additions and 94 deletions
--- a/av1/common/clpf_simd.h
+++ b/av1/common/clpf_simd.h
@@ -11,11 +11,11 @@

 #include "./aom_dsp_rtcd.h"

-static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0,
-                       int y0, int sizey, int width, int height,
-                       unsigned int strength) {
-  dst += x0 + y0 * stride;
-  src += x0 + y0 * stride;
+static void clpf_block(const uint8_t *src, uint8_t *dst, int sstride,
+                       int dstride, int x0, int y0, int sizey, int width,
+                       int height, unsigned int strength) {
+  dst += x0 + y0 * dstride;
+  src += x0 + y0 * sstride;
  {
    int bottom = height - 2 - y0;
    const v128 sp = v128_dup_8(strength);
@@ -32,23 +32,23 @@ static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0,

      for (y = 0; y < sizey; y += 2) {
        const v64 l1 = v64_load_aligned(src);
-        const v64 l2 = v64_load_aligned(src + stride);
+        const v64 l2 = v64_load_aligned(src + sstride);
        v128 o = v128_from_v64(l1, l2);
        const v128 x = v128_add_8(c128, o);
        const v128 a = v128_add_8(
            c128,
-            v128_from_v64(v64_load_aligned(src - (y != -y0) * stride), l1));
+            v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1));
        const v128 b = v128_shuffle_8(x, b_shuff);
        const v128 c = v128_shuffle_8(x, c_shuff);
        const v128 d = v128_add_8(
            c128, v128_from_v64(v64_load_unaligned(src + 1),
-                                v64_load_unaligned(src + 1 + stride)));
+                                v64_load_unaligned(src + 1 + sstride)));
        const v128 e = v128_add_8(
            c128, v128_from_v64(v64_load_unaligned(src + 2),
-                                v64_load_unaligned(src + 2 + stride)));
+                                v64_load_unaligned(src + 2 + sstride)));
        const v128 f = v128_add_8(
            c128, v128_from_v64(l2, v64_load_aligned(
-                                        src + ((y != bottom) + 1) * stride)));
+                                        src + ((y != bottom) + 1) * sstride)));

        const v128 tmp =
            v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm),
@@ -70,9 +70,9 @@ static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0,
                                                        delta, v128_zero()))),
                   4));
        v64_store_aligned(dst, v128_high_v64(o));
-        v64_store_aligned(dst + stride, v128_low_v64(o));
-        src += stride * 2;
-        dst += stride * 2;
+        v64_store_aligned(dst + dstride, v128_low_v64(o));
+        src += sstride * 2;
+        dst += dstride * 2;
      }
    } else if (!(width - x0 - 8)) {  // Clip right
      const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0f0e0d0c0b0a09LL),
@@ -83,23 +83,23 @@ static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0,

      for (y = 0; y < sizey; y += 2) {
        const v64 l1 = v64_load_aligned(src);
-        const v64 l2 = v64_load_aligned(src + stride);
+        const v64 l2 = v64_load_aligned(src + sstride);
        v128 o = v128_from_v64(l1, l2);
        const v128 x = v128_add_8(c128, o);
        const v128 a = v128_add_8(
            c128,
-            v128_from_v64(v64_load_aligned(src - (y != -y0) * stride), l1));
+            v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1));
        const v128 b = v128_add_8(
            c128, v128_from_v64(v64_load_unaligned(src - 2),
-                                v64_load_unaligned(src - 2 + stride)));
+                                v64_load_unaligned(src - 2 + sstride)));
        const v128 c = v128_add_8(
            c128, v128_from_v64(v64_load_unaligned(src - 1),
-                                v64_load_unaligned(src - 1 + stride)));
+                                v64_load_unaligned(src - 1 + sstride)));
        const v128 d = v128_shuffle_8(x, d_shuff);
        const v128 e = v128_shuffle_8(x, e_shuff);
        const v128 f = v128_add_8(
            c128, v128_from_v64(l2, v64_load_aligned(
-                                        src + ((y != bottom) + 1) * stride)));
+                                        src + ((y != bottom) + 1) * sstride)));

        const v128 tmp =
            v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm),
@@ -121,35 +121,35 @@ static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0,
                                                        delta, v128_zero()))),
                   4));
        v64_store_aligned(dst, v128_high_v64(o));
-        v64_store_aligned(dst + stride, v128_low_v64(o));
-        src += stride * 2;
-        dst += stride * 2;
+        v64_store_aligned(dst + dstride, v128_low_v64(o));
+        src += sstride * 2;
+        dst += dstride * 2;
      }
    } else {  // No left/right clipping
      int y;
      for (y = 0; y < sizey; y += 2) {
        const v64 l1 = v64_load_aligned(src);
-        const v64 l2 = v64_load_aligned(src + stride);
+        const v64 l2 = v64_load_aligned(src + sstride);
        v128 o = v128_from_v64(l1, l2);
        const v128 x = v128_add_8(c128, o);
        const v128 a = v128_add_8(
            c128,
-            v128_from_v64(v64_load_aligned(src - (y != -y0) * stride), l1));
+            v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1));
        const v128 b = v128_add_8(
            c128, v128_from_v64(v64_load_unaligned(src - 2),
-                                v64_load_unaligned(src - 2 + stride)));
+                                v64_load_unaligned(src - 2 + sstride)));
        const v128 c = v128_add_8(
            c128, v128_from_v64(v64_load_unaligned(src - 1),
-                                v64_load_unaligned(src - 1 + stride)));
+                                v64_load_unaligned(src - 1 + sstride)));
        const v128 d = v128_add_8(
            c128, v128_from_v64(v64_load_unaligned(src + 1),
-                                v64_load_unaligned(src + 1 + stride)));
+                                v64_load_unaligned(src + 1 + sstride)));
        const v128 e = v128_add_8(
            c128, v128_from_v64(v64_load_unaligned(src + 2),
-                                v64_load_unaligned(src + 2 + stride)));
+                                v64_load_unaligned(src + 2 + sstride)));
        const v128 f = v128_add_8(
            c128, v128_from_v64(l2, v64_load_aligned(
-                                        src + ((y != bottom) + 1) * stride)));
+                                        src + ((y != bottom) + 1) * sstride)));

        const v128 tmp =
            v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm),
@@ -171,17 +171,18 @@ static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0,
                                                        delta, v128_zero()))),
                   4));
        v64_store_aligned(dst, v128_high_v64(o));
-        v64_store_aligned(dst + stride, v128_low_v64(o));
-        src += stride * 2;
-        dst += stride * 2;
+        v64_store_aligned(dst + dstride, v128_low_v64(o));
+        src += sstride * 2;
+        dst += dstride * 2;
      }
    }
  }
 }

-void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int stride,
-                               int x0, int y0, int sizex, int sizey, int width,
-                               int height, unsigned int strength) {
+void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int sstride,
+                               int dstride, int x0, int y0, int sizex,
+                               int sizey, int width, int height,
+                               unsigned int strength) {
  // TODO(stemidts):
  // A sizex different from 8 will only be needed if CLPF is extended to chroma.
  // This will only be used if 4:2:0 and width not a multiple of 16 and along
@@ -189,9 +190,10 @@ void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int stride,
  // this case.  If not extended to chroma, this test will be redundant.
  if (sizex != 8 || width < 16 || y0 + 8 > height || x0 + 8 > width) {
    // Fallback to C for odd sizes
-    aom_clpf_block_c(src, dst, stride, x0, y0, sizex, sizey, width, height,
-                     strength);
+    aom_clpf_block_c(src, dst, sstride, dstride, x0, y0, sizex, sizey, width,
+                     height, strength);
  } else {
-    clpf_block(src, dst, stride, x0, y0, sizey, width, height, strength);
+    clpf_block(src, dst, sstride, dstride, x0, y0, sizey, width, height,
+               strength);
  }
 }