Clean up and speed up CLPF clipping

* Move clipping tests from inside to outside the loops
* Let the sizex and sizey arguments to clpf_block() be the clipped
  block size rather than both just bs
* Make the fallback tests to C more accurate

Change-Id: Icdc57540ce21b41a95403fdcc37988a4ebf546c7
Author:    Steinar Midtskogen
Date:      2016-09-26 12:51:25 +02:00
Committer: Yaowu Xu
Parent:    6116141c23
Commit:    e66fc87c46
2 changed files with 116 additions and 79 deletions
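
The first two bullets amount to the pattern below. A minimal sketch (the
filter_pixel helper is hypothetical; AOMMIN is the macro actually used in the
patch): clamp the block dimensions once per block, then run the pixel loops
over the clamped range, so no per-pixel bounds test is needed.

    int sizex, sizey, x, y;
    sizex = AOMMIN(width - xpos, bs);   /* < bs only at the right frame edge  */
    sizey = AOMMIN(height - ypos, bs);  /* < bs only at the bottom frame edge */
    for (y = 0; y < sizey; y++)         /* no "ypos + y < height" test inside */
      for (x = 0; x < sizex; x++)       /* no "xpos + x < width" test inside  */
        filter_pixel(xpos + x, ypos + y);

For example, with bs == 8 and a 1366-pixel-wide frame, a block at xpos == 1360
gets sizex == AOMMIN(1366 - 1360, 8) == 6, and only such edge blocks pay any
clipping cost.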

@@ -153,8 +153,11 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
       // Iterate over all smaller blocks inside the filter block
       for (m = 0; m < ((h + bs - 1) >> bslog); m++) {
         for (n = 0; n < ((w + bs - 1) >> bslog); n++) {
+          int sizex, sizey;
           xpos = xoff + n * bs;
           ypos = yoff + m * bs;
+          sizex = AOMMIN(width - xpos, bs);
+          sizey = AOMMIN(height - ypos, bs);
           if (!cm->mi_grid_visible[(ypos << suby) / MI_SIZE * cm->mi_stride +
                                    (xpos << subx) / MI_SIZE]
                  ->mbmi.skip) {  // Not skip block
@@ -164,30 +167,49 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
 #if CONFIG_AOM_HIGHBITDEPTH
             if (cm->use_highbitdepth) {
               uint16_t *const d = CONVERT_TO_SHORTPTR(cache_dst[cache_idx]);
-              for (c = 0; c < bs; c++) {
-                *(uint64_t *)(d + c * sstride) =
-                    *(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2);
-                if (bs == 8)
-                  *(uint64_t *)(d + c * sstride + 4) =
-                      *(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2 + 8);
+              if (sizex == 8) {
+                for (c = 0; c < sizey; c++) {
+                  *(uint64_t *)(d + c * sstride) =
+                      *(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2);
+                  *(uint64_t *)(d + c * sstride + 4) =
+                      *(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2 + 8);
+                }
+              } else if (sizex == 4) {
+                for (c = 0; c < sizey; c++)
+                  *(uint64_t *)(d + c * sstride) =
+                      *(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2);
+              } else {
+                for (c = 0; c < sizey; c++)
+                  memcpy(d + c * sstride, cache_ptr[cache_idx] + c * bs * 2,
+                         sizex);
               }
             } else {
-              for (c = 0; c < bs; c++)
-                if (bs == 8)
+              if (sizex == 8)
+                for (c = 0; c < sizey; c++)
                   *(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
                       *(uint64_t *)(cache_ptr[cache_idx] + c * bs);
-                else
+              else if (sizex == 4)
+                for (c = 0; c < sizey; c++)
                   *(uint32_t *)(cache_dst[cache_idx] + c * sstride) =
                       *(uint32_t *)(cache_ptr[cache_idx] + c * bs);
+              else
+                for (c = 0; c < sizey; c++)
+                  memcpy(cache_dst[cache_idx] + c * sstride,
+                         cache_ptr[cache_idx] + c * bs, sizex);
             }
 #else
-            for (c = 0; c < bs; c++)
-              if (bs == 8)
+            if (sizex == 8)
+              for (c = 0; c < sizey; c++)
                 *(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
                     *(uint64_t *)(cache_ptr[cache_idx] + c * bs);
-              else
+            else if (sizex == 4)
+              for (c = 0; c < sizey; c++)
                 *(uint32_t *)(cache_dst[cache_idx] + c * sstride) =
                     *(uint32_t *)(cache_ptr[cache_idx] + c * bs);
+            else
+              for (c = 0; c < sizey; c++)
+                memcpy(cache_dst[cache_idx] + c * sstride,
+                       cache_ptr[cache_idx] + c * bs, sizex);
 #endif
           }
 #if CONFIG_AOM_HIGHBITDEPTH
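
The rewritten writeback dispatches on the clipped width once per block rather
than re-testing bs on every row. A simplified model of the low-bit-depth path
(assumption: memcpy with a constant length compiles to a single 64- or 32-bit
store, which is what the casts in the patch do by hand):

    #include <stdint.h>
    #include <string.h>

    static void writeback(uint8_t *dst, int dstride, const uint8_t *cache,
                          int bs, int sizex, int sizey) {
      int c;
      if (sizex == 8)        /* full 8-pixel rows: one 64-bit store per row */
        for (c = 0; c < sizey; c++)
          memcpy(dst + c * dstride, cache + c * bs, 8);
      else if (sizex == 4)   /* full 4-pixel rows: one 32-bit store per row */
        for (c = 0; c < sizey; c++)
          memcpy(dst + c * dstride, cache + c * bs, 4);
      else                   /* clipped widths: copy just the remainder */
        for (c = 0; c < sizey; c++)
          memcpy(dst + c * dstride, cache + c * bs, sizex);
    }

Only blocks clipped at the right frame edge take the variable-length memcpy;
interior blocks keep the fixed wide stores.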
@@ -211,15 +233,15 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
             if (cm->use_highbitdepth) {
               aom_clpf_block_hbd(CONVERT_TO_SHORTPTR(src_buffer),
                                  CONVERT_TO_SHORTPTR(dst_buffer), sstride,
-                                 dstride, xpos, ypos, bs, bs, width, height,
-                                 strength);
+                                 dstride, xpos, ypos, sizex, sizey, width,
+                                 height, strength);
             } else {
               aom_clpf_block(src_buffer, dst_buffer, sstride, dstride, xpos,
-                             ypos, bs, bs, width, height, strength);
+                             ypos, sizex, sizey, width, height, strength);
             }
 #else
             aom_clpf_block(src_buffer, dst_buffer, sstride, dstride, xpos,
-                           ypos, bs, bs, width, height, strength);
+                           ypos, sizex, sizey, width, height, strength);
 #endif
           }
         }

@@ -76,24 +76,27 @@ static void clpf_block8(const uint8_t *src, uint8_t *dst, int sstride,
     v128 o = v128_from_v64(l1, l2);
     const v128 a =
         v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1);
-    v128 b = v128_from_v64(v64_load_unaligned(src - 2 * !!x0),
-                           v64_load_unaligned(src - 2 * !!x0 + sstride));
-    v128 c = v128_from_v64(v64_load_unaligned(src - !!x0),
-                           v64_load_unaligned(src - !!x0 + sstride));
-    v128 d = v128_from_v64(v64_load_unaligned(src + !!right),
-                           v64_load_unaligned(src + !!right + sstride));
-    v128 e = v128_from_v64(v64_load_unaligned(src + 2 * !!right),
-                           v64_load_unaligned(src + 2 * !!right + sstride));
     const v128 f = v128_from_v64(
         l2, v64_load_aligned(src + ((y != bottom) + 1) * sstride));
+    v128 b, c, d, e;
 
-    if (!x0) {  // Left clipping
-      b = v128_shuffle_8(b, v128_load_aligned(b_shuff));
-      c = v128_shuffle_8(c, v128_load_aligned(c_shuff));
+    if (x0) {
+      b = v128_from_v64(v64_load_unaligned(src - 2),
+                        v64_load_unaligned(src - 2 + sstride));
+      c = v128_from_v64(v64_load_unaligned(src - 1),
+                        v64_load_unaligned(src - 1 + sstride));
+    } else {  // Left clipping
+      b = v128_shuffle_8(o, v128_load_aligned(b_shuff));
+      c = v128_shuffle_8(o, v128_load_aligned(c_shuff));
     }
-    if (!right) {  // Right clipping
-      d = v128_shuffle_8(d, v128_load_aligned(d_shuff));
-      e = v128_shuffle_8(e, v128_load_aligned(e_shuff));
+    if (right) {
+      d = v128_from_v64(v64_load_unaligned(src + 1),
+                        v64_load_unaligned(src + 1 + sstride));
+      e = v128_from_v64(v64_load_unaligned(src + 2),
+                        v64_load_unaligned(src + 2 + sstride));
+    } else {  // Right clipping
+      d = v128_shuffle_8(o, v128_load_aligned(d_shuff));
+      e = v128_shuffle_8(o, v128_load_aligned(e_shuff));
     }
 
     o = calc_delta(o, a, b, c, d, e, f, sp, sm);
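
The restructured kernels no longer load b..e speculatively and patch them up
afterwards: each neighbour vector is either loaded unaligned (interior blocks)
or synthesized from the already-loaded row vector o by a byte shuffle (frame
edges). In scalar form, the left-edge shuffle amounts to clamping the
neighbour index at the row start (assumed contents of b_shuff; the actual
tables are defined earlier in the file):

    /* Scalar model of b = v128_shuffle_8(o, b_shuff) for one 8-pixel row:
       the "two to the left" neighbour with the edge pixel replicated. */
    static void left2_clamped(unsigned char out[8], const unsigned char row[8]) {
      int x;
      for (x = 0; x < 8; x++)
        out[x] = row[x - 2 < 0 ? 0 : x - 2];
    }

Shuffling o, which is already in a register, also drops the extra unaligned
load the old code performed just to have something to shuffle.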
@@ -134,31 +137,34 @@ static void clpf_block4(const uint8_t *src, uint8_t *dst, int sstride,
     const uint32_t l5 = u32_load_aligned(src + ((y != bottom) + 3) * sstride);
     v128 o = v128_from_32(l1, l2, l3, l4);
     const v128 a = v128_from_32(l0, l1, l2, l3);
-    v128 b = v128_from_32(u32_load_unaligned(src - 2 * !!x0),
-                          u32_load_unaligned(src + sstride - 2 * !!x0),
-                          u32_load_unaligned(src + 2 * sstride - 2 * !!x0),
-                          u32_load_unaligned(src + 3 * sstride - 2 * !!x0));
-    v128 c = v128_from_32(u32_load_unaligned(src - !!x0),
-                          u32_load_unaligned(src + sstride - !!x0),
-                          u32_load_unaligned(src + 2 * sstride - !!x0),
-                          u32_load_unaligned(src + 3 * sstride - !!x0));
-    v128 d = v128_from_32(u32_load_unaligned(src + !!right),
-                          u32_load_unaligned(src + sstride + !!right),
-                          u32_load_unaligned(src + 2 * sstride + !!right),
-                          u32_load_unaligned(src + 3 * sstride + !!right));
-    v128 e = v128_from_32(u32_load_unaligned(src + 2 * !!right),
-                          u32_load_unaligned(src + sstride + 2 * !!right),
-                          u32_load_unaligned(src + 2 * sstride + 2 * !!right),
-                          u32_load_unaligned(src + 3 * sstride + 2 * !!right));
     const v128 f = v128_from_32(l2, l3, l4, l5);
+    v128 b, c, d, e;
 
-    if (!x0) {  // Left clipping
-      b = v128_shuffle_8(b, v128_load_aligned(b_shuff));
-      c = v128_shuffle_8(c, v128_load_aligned(c_shuff));
+    if (x0) {
+      b = v128_from_32(u32_load_unaligned(src - 2),
+                       u32_load_unaligned(src + sstride - 2),
+                       u32_load_unaligned(src + 2 * sstride - 2),
+                       u32_load_unaligned(src + 3 * sstride - 2));
+      c = v128_from_32(u32_load_unaligned(src - 1),
+                       u32_load_unaligned(src + sstride - 1),
+                       u32_load_unaligned(src + 2 * sstride - 1),
+                       u32_load_unaligned(src + 3 * sstride - 1));
+    } else {  // Left clipping
+      b = v128_shuffle_8(o, v128_load_aligned(b_shuff));
+      c = v128_shuffle_8(o, v128_load_aligned(c_shuff));
     }
-    if (!right) {  // Right clipping
-      d = v128_shuffle_8(d, v128_load_aligned(d_shuff));
-      e = v128_shuffle_8(e, v128_load_aligned(e_shuff));
+    if (right) {
+      d = v128_from_32(u32_load_unaligned(src + 1),
+                       u32_load_unaligned(src + sstride + 1),
+                       u32_load_unaligned(src + 2 * sstride + 1),
+                       u32_load_unaligned(src + 3 * sstride + 1));
+      e = v128_from_32(u32_load_unaligned(src + 2 * !!right),
+                       u32_load_unaligned(src + sstride + 2),
+                       u32_load_unaligned(src + 2 * sstride + 2),
+                       u32_load_unaligned(src + 3 * sstride + 2));
+    } else {  // Right clipping
+      d = v128_shuffle_8(o, v128_load_aligned(d_shuff));
+      e = v128_shuffle_8(o, v128_load_aligned(e_shuff));
    }
 
     o = calc_delta(o, a, b, c, d, e, f, sp, sm);
@@ -176,9 +182,10 @@ void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int sstride,
                                int dstride, int x0, int y0, int sizex,
                                int sizey, int width, int height,
                                unsigned int strength) {
-  if ((sizex != 4 && sizex != 8) || y0 + 4 > height ||
-      (sizey & 3 && sizex == 4) || x0 + 4 > width) {
-    // Fallback to C for odd sizes
+  if ((sizex != 4 && sizex != 8) || ((sizey & 3) && sizex == 4)) {
+    // Fallback to C for odd sizes:
+    // * block widths not 4 or 8
+    // * block heights not a multiple of 4 if the block width is 4
     aom_clpf_block_c(src, dst, sstride, dstride, x0, y0, sizex, sizey, width,
                      height, strength);
   } else {
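
The old border tests (y0 + 4 > height, x0 + 4 > width) are gone because the
caller now passes pre-clipped sizex/sizey and the kernels handle those shapes
directly; what remains is a pure shape restriction. The (sizey & 3) term
follows from the loop structure of clpf_block4 above, which packs four 4-pixel
rows into one 128-bit vector per iteration. A sketch of that loop shape (body
elided):

    for (y = 0; y < sizey; y += 4) {
      /* rows y .. y+3 are filtered as a single v128 */
    }

Hence a 4-wide block whose clipped height is not a multiple of 4 cannot be
expressed in whole iterations and must take the C path.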
@@ -255,24 +262,27 @@ SIMD_INLINE void clpf_block_hbd4(const uint16_t *src, uint16_t *dst,
     v128 o = v128_from_v64(l1, l2);
     const v128 a =
         v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1);
-    v128 b = v128_from_v64(v64_load_unaligned(src - 2 * !!x0),
-                           v64_load_unaligned(src - 2 * !!x0 + sstride));
-    v128 c = v128_from_v64(v64_load_unaligned(src - !!x0),
-                           v64_load_unaligned(src - !!x0 + sstride));
-    v128 d = v128_from_v64(v64_load_unaligned(src + !!right),
-                           v64_load_unaligned(src + !!right + sstride));
-    v128 e = v128_from_v64(v64_load_unaligned(src + 2 * !!right),
-                           v64_load_unaligned(src + 2 * !!right + sstride));
     const v128 f = v128_from_v64(
         l2, v64_load_aligned(src + ((y != bottom) + 1) * sstride));
+    v128 b, c, d, e;
 
-    if (!x0) {  // Left clipping
-      b = v128_shuffle_8(b, v128_load_aligned(b_shuff));
-      c = v128_shuffle_8(c, v128_load_aligned(c_shuff));
+    if (x0) {
+      b = v128_from_v64(v64_load_unaligned(src - 2),
+                        v64_load_unaligned(src - 2 + sstride));
+      c = v128_from_v64(v64_load_unaligned(src - 1),
+                        v64_load_unaligned(src - 1 + sstride));
+    } else {  // Left clipping
+      b = v128_shuffle_8(o, v128_load_aligned(b_shuff));
+      c = v128_shuffle_8(o, v128_load_aligned(c_shuff));
     }
-    if (!right) {  // Right clipping
-      d = v128_shuffle_8(d, v128_load_aligned(d_shuff));
-      e = v128_shuffle_8(e, v128_load_aligned(e_shuff));
+    if (right) {
+      d = v128_from_v64(v64_load_unaligned(src + 1),
+                        v64_load_unaligned(src + 1 + sstride));
+      e = v128_from_v64(v64_load_unaligned(src + 2),
+                        v64_load_unaligned(src + 2 + sstride));
+    } else {  // Right clipping
+      d = v128_shuffle_8(o, v128_load_aligned(d_shuff));
+      e = v128_shuffle_8(o, v128_load_aligned(e_shuff));
     }
     calc_delta_hbd4(o, a, b, c, d, e, f, dst, sp, sm, dstride);
     src += sstride * 2;
@@ -309,18 +319,21 @@ SIMD_INLINE void clpf_block_hbd(const uint16_t *src, uint16_t *dst, int sstride,
     const v128 o = v128_load_aligned(src);
     const v128 a = v128_load_aligned(src - (y != -y0) * sstride);
     const v128 f = v128_load_aligned(src + (y - 1 != bottom) * sstride);
-    v128 b = v128_load_unaligned(src - 2 * !!x0);
-    v128 c = v128_load_unaligned(src - !!x0);
-    v128 d = v128_load_unaligned(src + !!right);
-    v128 e = v128_load_unaligned(src + 2 * !!right);
+    v128 b, c, d, e;
 
-    if (!x0) {  // Left clipping
-      b = v128_shuffle_8(b, v128_load_aligned(b_shuff));
-      c = v128_shuffle_8(c, v128_load_aligned(c_shuff));
+    if (x0) {
+      b = v128_load_unaligned(src - 2);
+      c = v128_load_unaligned(src - 1);
+    } else {  // Left clipping
+      b = v128_shuffle_8(o, v128_load_aligned(b_shuff));
+      c = v128_shuffle_8(o, v128_load_aligned(c_shuff));
     }
-    if (!right) {  // Right clipping
-      d = v128_shuffle_8(d, v128_load_aligned(d_shuff));
-      e = v128_shuffle_8(e, v128_load_aligned(e_shuff));
+    if (right) {
+      d = v128_load_unaligned(src + 1);
+      e = v128_load_unaligned(src + 2);
+    } else {  // Right clipping
+      d = v128_shuffle_8(o, v128_load_aligned(d_shuff));
+      e = v128_shuffle_8(o, v128_load_aligned(e_shuff));
     }
     calc_delta_hbd8(o, a, b, c, d, e, f, dst, sp, sm);
     src += sstride;
@@ -332,8 +345,10 @@ void SIMD_FUNC(aom_clpf_block_hbd)(const uint16_t *src, uint16_t *dst,
                                    int sstride, int dstride, int x0, int y0,
                                    int sizex, int sizey, int width, int height,
                                    unsigned int strength) {
-  if ((sizex != 4 && sizex != 8) || y0 + 4 > height || x0 + 4 > width) {
-    // Fallback to C for odd sizes
+  if ((sizex != 4 && sizex != 8) || ((sizey & 1) && sizex == 4)) {
+    // Fallback to C for odd sizes:
+    // * block width not 4 or 8
+    // * block heights not a multiple of 2 if the block width is 4
     aom_clpf_block_hbd_c(src, dst, sstride, dstride, x0, y0, sizex, sizey,
                          width, height, strength);
   } else {
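
The high-bit-depth 4-wide kernel (clpf_block_hbd4 above) consumes two rows per
iteration, so its height restriction is looser: a multiple of 2 instead of 4.
A self-contained check of the two fallback predicates exactly as written in
this patch (helper names are illustrative):

    #include <assert.h>

    static int lbd_fallback(int sizex, int sizey) {
      return (sizex != 4 && sizex != 8) || ((sizey & 3) && sizex == 4);
    }
    static int hbd_fallback(int sizex, int sizey) {
      return (sizex != 4 && sizex != 8) || ((sizey & 1) && sizex == 4);
    }

    int main(void) {
      assert(lbd_fallback(4, 2) && !hbd_fallback(4, 2));  /* 4x2: C vs SIMD */
      assert(!lbd_fallback(8, 2) && !hbd_fallback(8, 2)); /* 8x2: SIMD both */
      assert(lbd_fallback(6, 8) && hbd_fallback(6, 8));   /* width 6: C both */
      return 0;
    }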