VP9: move loopfilter build masks to decode loop

The loopfilter masks are now built in the decode loop. This is done so we can eventually reduce the number of MODE_INFO structs required by the decoder. The encoder builds the masks for the entire frame prior to calling the loopfilter. Change-Id: Ia2146b07e0acb8c50203e586dfae0c4c5b316f11
2015-09-29 05:20:49 -07:00
parent af631e1f19
commit 7718117104
7 changed files with 334 additions and 165 deletions
--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c
@@ -115,6 +115,8 @@ void vp9_free_context_buffers(VP9_COMMON *cm) {
  cm->above_context = NULL;
  vpx_free(cm->above_seg_context);
  cm->above_seg_context = NULL;
  vpx_free(cm->lf.lfm);
  cm->lf.lfm = NULL;
 }
 int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) {
@@ -149,6 +151,16 @@ int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) {
    cm->above_context_alloc_cols = cm->mi_cols;
  }
  vpx_free(cm->lf.lfm);
  // Each lfm holds bit masks for all the 8x8 blocks in a 64x64 region.  The
  // stride and rows are rounded up / truncated to a multiple of 8.
  cm->lf.lfm_stride = (cm->mi_cols + (MI_BLOCK_SIZE - 1)) >> 3;
  cm->lf.lfm = (LOOP_FILTER_MASK *)vpx_calloc(
      ((cm->mi_rows + (MI_BLOCK_SIZE - 1)) >> 3) * cm->lf.lfm_stride,
      sizeof(*cm->lf.lfm));
  if (!cm->lf.lfm) goto fail;
  return 0;
 fail:
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -825,6 +825,120 @@ static void build_y_mask(const loop_filter_info_n *const lfi_n,
    *int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffffULL) << shift_y;
 }
 void vp9_adjust_mask(VP9_COMMON *const cm, const int mi_row,
                     const int mi_col, LOOP_FILTER_MASK *lfm) {
  int i;
  // The largest loopfilter we have is 16x16 so we use the 16x16 mask
  // for 32x32 transforms also.
  lfm->left_y[TX_16X16] |= lfm->left_y[TX_32X32];
  lfm->above_y[TX_16X16] |= lfm->above_y[TX_32X32];
  lfm->left_uv[TX_16X16] |= lfm->left_uv[TX_32X32];
  lfm->above_uv[TX_16X16] |= lfm->above_uv[TX_32X32];
  // We do at least 8 tap filter on every 32x32 even if the transform size
  // is 4x4. So if the 4x4 is set on a border pixel add it to the 8x8 and
  // remove it from the 4x4.
  lfm->left_y[TX_8X8] |= lfm->left_y[TX_4X4] & left_border;
  lfm->left_y[TX_4X4] &= ~left_border;
  lfm->above_y[TX_8X8] |= lfm->above_y[TX_4X4] & above_border;
  lfm->above_y[TX_4X4] &= ~above_border;
  lfm->left_uv[TX_8X8] |= lfm->left_uv[TX_4X4] & left_border_uv;
  lfm->left_uv[TX_4X4] &= ~left_border_uv;
  lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_4X4] & above_border_uv;
  lfm->above_uv[TX_4X4] &= ~above_border_uv;
  // We do some special edge handling.
  if (mi_row + MI_BLOCK_SIZE > cm->mi_rows) {
    const uint64_t rows = cm->mi_rows - mi_row;
    // Each pixel inside the border gets a 1,
    const uint64_t mask_y = (((uint64_t) 1 << (rows << 3)) - 1);
    const uint16_t mask_uv = (((uint16_t) 1 << (((rows + 1) >> 1) << 2)) - 1);
    // Remove values completely outside our border.
    for (i = 0; i < TX_32X32; i++) {
      lfm->left_y[i] &= mask_y;
      lfm->above_y[i] &= mask_y;
      lfm->left_uv[i] &= mask_uv;
      lfm->above_uv[i] &= mask_uv;
    }
    lfm->int_4x4_y &= mask_y;
    lfm->int_4x4_uv &= mask_uv;
    // We don't apply a wide loop filter on the last uv block row. If set
    // apply the shorter one instead.
    if (rows == 1) {
      lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_16X16];
      lfm->above_uv[TX_16X16] = 0;
    }
    if (rows == 5) {
      lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_16X16] & 0xff00;
      lfm->above_uv[TX_16X16] &= ~(lfm->above_uv[TX_16X16] & 0xff00);
    }
  }
  if (mi_col + MI_BLOCK_SIZE > cm->mi_cols) {
    const uint64_t columns = cm->mi_cols - mi_col;
    // Each pixel inside the border gets a 1, the multiply copies the border
    // to where we need it.
    const uint64_t mask_y  = (((1 << columns) - 1)) * 0x0101010101010101ULL;
    const uint16_t mask_uv = ((1 << ((columns + 1) >> 1)) - 1) * 0x1111;
    // Internal edges are not applied on the last column of the image so
    // we mask 1 more for the internal edges
    const uint16_t mask_uv_int = ((1 << (columns >> 1)) - 1) * 0x1111;
    // Remove the bits outside the image edge.
    for (i = 0; i < TX_32X32; i++) {
      lfm->left_y[i] &= mask_y;
      lfm->above_y[i] &= mask_y;
      lfm->left_uv[i] &= mask_uv;
      lfm->above_uv[i] &= mask_uv;
    }
    lfm->int_4x4_y &= mask_y;
    lfm->int_4x4_uv &= mask_uv_int;
    // We don't apply a wide loop filter on the last uv column. If set
    // apply the shorter one instead.
    if (columns == 1) {
      lfm->left_uv[TX_8X8] |= lfm->left_uv[TX_16X16];
      lfm->left_uv[TX_16X16] = 0;
    }
    if (columns == 5) {
      lfm->left_uv[TX_8X8] |= (lfm->left_uv[TX_16X16] & 0xcccc);
      lfm->left_uv[TX_16X16] &= ~(lfm->left_uv[TX_16X16] & 0xcccc);
    }
  }
  // We don't apply a loop filter on the first column in the image, mask that
  // out.
  if (mi_col == 0) {
    for (i = 0; i < TX_32X32; i++) {
      lfm->left_y[i] &= 0xfefefefefefefefeULL;
      lfm->left_uv[i] &= 0xeeee;
    }
  }
  // Assert if we try to apply 2 different loop filters at the same position.
  assert(!(lfm->left_y[TX_16X16] & lfm->left_y[TX_8X8]));
  assert(!(lfm->left_y[TX_16X16] & lfm->left_y[TX_4X4]));
  assert(!(lfm->left_y[TX_8X8] & lfm->left_y[TX_4X4]));
  assert(!(lfm->int_4x4_y & lfm->left_y[TX_16X16]));
  assert(!(lfm->left_uv[TX_16X16]&lfm->left_uv[TX_8X8]));
  assert(!(lfm->left_uv[TX_16X16] & lfm->left_uv[TX_4X4]));
  assert(!(lfm->left_uv[TX_8X8] & lfm->left_uv[TX_4X4]));
  assert(!(lfm->int_4x4_uv & lfm->left_uv[TX_16X16]));
  assert(!(lfm->above_y[TX_16X16] & lfm->above_y[TX_8X8]));
  assert(!(lfm->above_y[TX_16X16] & lfm->above_y[TX_4X4]));
  assert(!(lfm->above_y[TX_8X8] & lfm->above_y[TX_4X4]));
  assert(!(lfm->int_4x4_y & lfm->above_y[TX_16X16]));
  assert(!(lfm->above_uv[TX_16X16] & lfm->above_uv[TX_8X8]));
  assert(!(lfm->above_uv[TX_16X16] & lfm->above_uv[TX_4X4]));
  assert(!(lfm->above_uv[TX_8X8] & lfm->above_uv[TX_4X4]));
  assert(!(lfm->int_4x4_uv & lfm->above_uv[TX_16X16]));
 }
 // This function sets up the bit masks for the entire 64x64 region represented
 // by mi_row, mi_col.
 // TODO(JBB): This function only works for yv12.
@@ -855,7 +969,6 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
  const int shift_8_y[] = {0, 1, 8, 9};
  const int shift_32_uv[] = {0, 2, 8, 10};
  const int shift_16_uv[] = {0, 1, 4, 5};
  int i;
  const int max_rows = (mi_row + MI_BLOCK_SIZE > cm->mi_rows ?
                        cm->mi_rows - mi_row : MI_BLOCK_SIZE);
  const int max_cols = (mi_col + MI_BLOCK_SIZE > cm->mi_cols ?
@@ -970,114 +1083,8 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
      }
      break;
  }
  // The largest loopfilter we have is 16x16 so we use the 16x16 mask
  // for 32x32 transforms also.
  lfm->left_y[TX_16X16] |= lfm->left_y[TX_32X32];
  lfm->above_y[TX_16X16] |= lfm->above_y[TX_32X32];
  lfm->left_uv[TX_16X16] |= lfm->left_uv[TX_32X32];
  lfm->above_uv[TX_16X16] |= lfm->above_uv[TX_32X32];
-  // We do at least 8 tap filter on every 32x32 even if the transform size
+  vp9_adjust_mask(cm, mi_row, mi_col, lfm);
  // is 4x4. So if the 4x4 is set on a border pixel add it to the 8x8 and
  // remove it from the 4x4.
  lfm->left_y[TX_8X8] |= lfm->left_y[TX_4X4] & left_border;
  lfm->left_y[TX_4X4] &= ~left_border;
  lfm->above_y[TX_8X8] |= lfm->above_y[TX_4X4] & above_border;
  lfm->above_y[TX_4X4] &= ~above_border;
  lfm->left_uv[TX_8X8] |= lfm->left_uv[TX_4X4] & left_border_uv;
  lfm->left_uv[TX_4X4] &= ~left_border_uv;
  lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_4X4] & above_border_uv;
  lfm->above_uv[TX_4X4] &= ~above_border_uv;
  // We do some special edge handling.
  if (mi_row + MI_BLOCK_SIZE > cm->mi_rows) {
    const uint64_t rows = cm->mi_rows - mi_row;
    // Each pixel inside the border gets a 1,
    const uint64_t mask_y = (((uint64_t) 1 << (rows << 3)) - 1);
    const uint16_t mask_uv = (((uint16_t) 1 << (((rows + 1) >> 1) << 2)) - 1);
    // Remove values completely outside our border.
    for (i = 0; i < TX_32X32; i++) {
      lfm->left_y[i] &= mask_y;
      lfm->above_y[i] &= mask_y;
      lfm->left_uv[i] &= mask_uv;
      lfm->above_uv[i] &= mask_uv;
    }
    lfm->int_4x4_y &= mask_y;
    lfm->int_4x4_uv &= mask_uv;
    // We don't apply a wide loop filter on the last uv block row. If set
    // apply the shorter one instead.
    if (rows == 1) {
      lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_16X16];
      lfm->above_uv[TX_16X16] = 0;
    }
    if (rows == 5) {
      lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_16X16] & 0xff00;
      lfm->above_uv[TX_16X16] &= ~(lfm->above_uv[TX_16X16] & 0xff00);
    }
  }
  if (mi_col + MI_BLOCK_SIZE > cm->mi_cols) {
    const uint64_t columns = cm->mi_cols - mi_col;
    // Each pixel inside the border gets a 1, the multiply copies the border
    // to where we need it.
    const uint64_t mask_y  = (((1 << columns) - 1)) * 0x0101010101010101ULL;
    const uint16_t mask_uv = ((1 << ((columns + 1) >> 1)) - 1) * 0x1111;
    // Internal edges are not applied on the last column of the image so
    // we mask 1 more for the internal edges
    const uint16_t mask_uv_int = ((1 << (columns >> 1)) - 1) * 0x1111;
    // Remove the bits outside the image edge.
    for (i = 0; i < TX_32X32; i++) {
      lfm->left_y[i] &= mask_y;
      lfm->above_y[i] &= mask_y;
      lfm->left_uv[i] &= mask_uv;
      lfm->above_uv[i] &= mask_uv;
    }
    lfm->int_4x4_y &= mask_y;
    lfm->int_4x4_uv &= mask_uv_int;
    // We don't apply a wide loop filter on the last uv column. If set
    // apply the shorter one instead.
    if (columns == 1) {
      lfm->left_uv[TX_8X8] |= lfm->left_uv[TX_16X16];
      lfm->left_uv[TX_16X16] = 0;
    }
    if (columns == 5) {
      lfm->left_uv[TX_8X8] |= (lfm->left_uv[TX_16X16] & 0xcccc);
      lfm->left_uv[TX_16X16] &= ~(lfm->left_uv[TX_16X16] & 0xcccc);
    }
  }
  // We don't apply a loop filter on the first column in the image, mask that
  // out.
  if (mi_col == 0) {
    for (i = 0; i < TX_32X32; i++) {
      lfm->left_y[i] &= 0xfefefefefefefefeULL;
      lfm->left_uv[i] &= 0xeeee;
    }
  }
  // Assert if we try to apply 2 different loop filters at the same position.
  assert(!(lfm->left_y[TX_16X16] & lfm->left_y[TX_8X8]));
  assert(!(lfm->left_y[TX_16X16] & lfm->left_y[TX_4X4]));
  assert(!(lfm->left_y[TX_8X8] & lfm->left_y[TX_4X4]));
  assert(!(lfm->int_4x4_y & lfm->left_y[TX_16X16]));
  assert(!(lfm->left_uv[TX_16X16]&lfm->left_uv[TX_8X8]));
  assert(!(lfm->left_uv[TX_16X16] & lfm->left_uv[TX_4X4]));
  assert(!(lfm->left_uv[TX_8X8] & lfm->left_uv[TX_4X4]));
  assert(!(lfm->int_4x4_uv & lfm->left_uv[TX_16X16]));
  assert(!(lfm->above_y[TX_16X16] & lfm->above_y[TX_8X8]));
  assert(!(lfm->above_y[TX_16X16] & lfm->above_y[TX_4X4]));
  assert(!(lfm->above_y[TX_8X8] & lfm->above_y[TX_4X4]));
  assert(!(lfm->int_4x4_y & lfm->above_y[TX_16X16]));
  assert(!(lfm->above_uv[TX_16X16] & lfm->above_uv[TX_8X8]));
  assert(!(lfm->above_uv[TX_16X16] & lfm->above_uv[TX_4X4]));
  assert(!(lfm->above_uv[TX_8X8] & lfm->above_uv[TX_4X4]));
  assert(!(lfm->int_4x4_uv & lfm->above_uv[TX_16X16]));
 }
 static void filter_selectively_vert(uint8_t *s, int pitch,
@@ -1428,6 +1435,7 @@ void vp9_filter_block_plane_ss11(VP9_COMMON *const cm,
  struct buf_2d *const dst = &plane->dst;
  uint8_t *const dst0 = dst->buf;
  int r, c;
  uint8_t lfl_uv[16];
  uint16_t mask_16x16 = lfm->left_uv[TX_16X16];
  uint16_t mask_8x8 = lfm->left_uv[TX_8X8];
@@ -1440,8 +1448,8 @@ void vp9_filter_block_plane_ss11(VP9_COMMON *const cm,
  for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 4) {
    if (plane->plane_type == 1) {
      for (c = 0; c < (MI_BLOCK_SIZE >> 1); c++) {
-        lfm->lfl_uv[(r << 1) + c] = lfm->lfl_y[(r << 3) + (c << 1)];
+        lfl_uv[(r << 1) + c] = lfm->lfl_y[(r << 3) + (c << 1)];
-        lfm->lfl_uv[((r + 2) << 1) + c] = lfm->lfl_y[((r + 2) << 3) + (c << 1)];
+        lfl_uv[((r + 2) << 1) + c] = lfm->lfl_y[((r + 2) << 3) + (c << 1)];
      }
    }
@@ -1457,18 +1465,18 @@ void vp9_filter_block_plane_ss11(VP9_COMMON *const cm,
        highbd_filter_selectively_vert_row2(
            plane->subsampling_x, CONVERT_TO_SHORTPTR(dst->buf), dst->stride,
            mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
-            &lfm->lfl_uv[r << 1], (int)cm->bit_depth);
+            &lfl_uv[r << 1], (int)cm->bit_depth);
      } else {
        filter_selectively_vert_row2(
            plane->subsampling_x, dst->buf, dst->stride,
            mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
-            &lfm->lfl_uv[r << 1]);
+            &lfl_uv[r << 1]);
      }
 #else
      filter_selectively_vert_row2(
          plane->subsampling_x, dst->buf, dst->stride,
          mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
-          &lfm->lfl_uv[r << 1]);
+          &lfl_uv[r << 1]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
      dst->buf += 16 * dst->stride;
@@ -1509,16 +1517,16 @@ void vp9_filter_block_plane_ss11(VP9_COMMON *const cm,
      highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf),
                                      dst->stride, mask_16x16_r, mask_8x8_r,
                                      mask_4x4_r, mask_4x4_int_r, &cm->lf_info,
-                                      &lfm->lfl_uv[r << 1], (int)cm->bit_depth);
+                                      &lfl_uv[r << 1], (int)cm->bit_depth);
    } else {
      filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
                               mask_4x4_r, mask_4x4_int_r, &cm->lf_info,
-                               &lfm->lfl_uv[r << 1]);
+                               &lfl_uv[r << 1]);
    }
 #else
    filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
                             mask_4x4_r, mask_4x4_int_r, &cm->lf_info,
-                             &lfm->lfl_uv[r << 1]);
+                             &lfl_uv[r << 1]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
    dst->buf += 8 * dst->stride;
@@ -1529,13 +1537,11 @@ void vp9_filter_block_plane_ss11(VP9_COMMON *const cm,
  }
 }
-void vp9_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer,
+static void loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, VP9_COMMON *cm,
                          VP9_COMMON *cm,
                             struct macroblockd_plane planes[MAX_MB_PLANE],
                             int start, int stop, int y_only) {
  const int num_planes = y_only ? 1 : MAX_MB_PLANE;
  enum lf_path path;
  LOOP_FILTER_MASK lfm;
  int mi_row, mi_col;
  if (y_only)
@@ -1549,24 +1555,24 @@ void vp9_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer,
  for (mi_row = start; mi_row < stop; mi_row += MI_BLOCK_SIZE) {
    MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
    LOOP_FILTER_MASK *lfm = get_lfm(&cm->lf, mi_row, 0);
-    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE, ++lfm) {
      int plane;
      vp9_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
      // TODO(JBB): Make setup_mask work for non 420.
-      vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride,
+      vp9_adjust_mask(cm, mi_row, mi_col, lfm);
                     &lfm);
-      vp9_filter_block_plane_ss00(cm, &planes[0], mi_row, &lfm);
+      vp9_filter_block_plane_ss00(cm, &planes[0], mi_row, lfm);
      for (plane = 1; plane < num_planes; ++plane) {
        switch (path) {
          case LF_PATH_420:
-            vp9_filter_block_plane_ss11(cm, &planes[plane], mi_row, &lfm);
+            vp9_filter_block_plane_ss11(cm, &planes[plane], mi_row, lfm);
            break;
          case LF_PATH_444:
-            vp9_filter_block_plane_ss00(cm, &planes[plane], mi_row, &lfm);
+            vp9_filter_block_plane_ss00(cm, &planes[plane], mi_row, lfm);
            break;
          case LF_PATH_SLOW:
            vp9_filter_block_plane_non420(cm, &planes[plane], mi + mi_col,
@@ -1592,10 +1598,132 @@ void vp9_loop_filter_frame(YV12_BUFFER_CONFIG *frame,
    mi_rows_to_filter = VPXMAX(cm->mi_rows / 8, 8);
  }
  end_mi_row = start_mi_row + mi_rows_to_filter;
  loop_filter_rows(frame, cm, xd->plane, start_mi_row, end_mi_row, y_only);
 }
 // Used by the encoder to build the loopfilter masks.
 void vp9_build_mask_frame(VP9_COMMON *cm, int frame_filter_level,
                          int partial_frame) {
  int start_mi_row, end_mi_row, mi_rows_to_filter;
  int mi_col, mi_row;
  if (!frame_filter_level) return;
  start_mi_row = 0;
  mi_rows_to_filter = cm->mi_rows;
  if (partial_frame && cm->mi_rows > 8) {
    start_mi_row = cm->mi_rows >> 1;
    start_mi_row &= 0xfffffff8;
    mi_rows_to_filter = VPXMAX(cm->mi_rows / 8, 8);
  }
  end_mi_row = start_mi_row + mi_rows_to_filter;
  vp9_loop_filter_frame_init(cm, frame_filter_level);
-  vp9_loop_filter_rows(frame, cm, xd->plane,
+
-                       start_mi_row, end_mi_row,
+  for (mi_row = start_mi_row; mi_row < end_mi_row; mi_row += MI_BLOCK_SIZE) {
-                       y_only);
+    MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
      // vp9_setup_mask() zeros lfm
      vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride,
                     get_lfm(&cm->lf, mi_row, mi_col));
    }
  }
 }
 // 8x8 blocks in a superblock.  A "1" represents the first block in a 16x16
 // or greater area.
 static const uint8_t first_block_in_16x16[8][8] = {
  {1, 0, 1, 0, 1, 0, 1, 0},
  {0, 0, 0, 0, 0, 0, 0, 0},
  {1, 0, 1, 0, 1, 0, 1, 0},
  {0, 0, 0, 0, 0, 0, 0, 0},
  {1, 0, 1, 0, 1, 0, 1, 0},
  {0, 0, 0, 0, 0, 0, 0, 0},
  {1, 0, 1, 0, 1, 0, 1, 0},
  {0, 0, 0, 0, 0, 0, 0, 0}
 };
 // This function sets up the bit masks for a block represented
 // by mi_row, mi_col in a 64x64 region.
 // TODO(SJL): This function only works for yv12.
 void vp9_build_mask(VP9_COMMON *cm, const MB_MODE_INFO *mbmi, int mi_row,
                    int mi_col, int bw, int bh) {
  const BLOCK_SIZE block_size = mbmi->sb_type;
  const TX_SIZE tx_size_y = mbmi->tx_size;
  const loop_filter_info_n *const lfi_n = &cm->lf_info;
  const int filter_level = get_filter_level(lfi_n, mbmi);
  const TX_SIZE tx_size_uv = get_uv_tx_size_impl(tx_size_y, block_size, 1, 1);
  LOOP_FILTER_MASK *const lfm = get_lfm(&cm->lf, mi_row, mi_col);
  uint64_t *const left_y = &lfm->left_y[tx_size_y];
  uint64_t *const above_y = &lfm->above_y[tx_size_y];
  uint64_t *const int_4x4_y = &lfm->int_4x4_y;
  uint16_t *const left_uv = &lfm->left_uv[tx_size_uv];
  uint16_t *const above_uv = &lfm->above_uv[tx_size_uv];
  uint16_t *const int_4x4_uv = &lfm->int_4x4_uv;
  const int row_in_sb = (mi_row & 7);
  const int col_in_sb = (mi_col & 7);
  const int shift_y = col_in_sb + (row_in_sb << 3);
  const int shift_uv = (col_in_sb >> 1) + ((row_in_sb >> 1) << 2);
  const int build_uv = first_block_in_16x16[row_in_sb][col_in_sb];
  if (!filter_level) {
    return;
  } else {
    int index = shift_y;
    int i;
    for (i = 0; i < bh; i++) {
      memset(&lfm->lfl_y[index], filter_level, bw);
      index += 8;
    }
  }
  // These set 1 in the current block size for the block size edges.
  // For instance if the block size is 32x16, we'll set:
  //    above =   1111
  //              0000
  //    and
  //    left  =   1000
  //          =   1000
  // NOTE : In this example the low bit is left most ( 1000 ) is stored as
  //        1,  not 8...
  //
  // U and V set things on a 16 bit scale.
  //
  *above_y |= above_prediction_mask[block_size] << shift_y;
  *left_y |= left_prediction_mask[block_size] << shift_y;
  if (build_uv) {
    *above_uv |= above_prediction_mask_uv[block_size] << shift_uv;
    *left_uv |= left_prediction_mask_uv[block_size] << shift_uv;
  }
  // If the block has no coefficients and is not intra we skip applying
  // the loop filter on block edges.
  if (mbmi->skip && is_inter_block(mbmi))
    return;
  // Add a mask for the transform size. The transform size mask is set to
  // be correct for a 64x64 prediction block size. Mask to match the size of
  // the block we are working on and then shift it into place.
  *above_y |= (size_mask[block_size] &
               above_64x64_txform_mask[tx_size_y]) << shift_y;
  *left_y |= (size_mask[block_size] &
              left_64x64_txform_mask[tx_size_y]) << shift_y;
  if (build_uv) {
    *above_uv |= (size_mask_uv[block_size] &
                  above_64x64_txform_mask_uv[tx_size_uv]) << shift_uv;
    *left_uv |= (size_mask_uv[block_size] &
                 left_64x64_txform_mask_uv[tx_size_uv]) << shift_uv;
  }
  // Try to determine what to do with the internal 4x4 block boundaries.  These
  // differ from the 4x4 boundaries on the outside edge of an 8x8 in that the
  // internal ones can be skipped and don't depend on the prediction block size.
  if (tx_size_y == TX_4X4)
    *int_4x4_y |= (size_mask[block_size] & -1ULL) << shift_y;
  if (build_uv && tx_size_uv == TX_4X4)
    *int_4x4_uv |= (size_mask_uv[block_size] & 0xffff) << shift_uv;
 }
 void vp9_loop_filter_data_reset(
@@ -1609,9 +1737,17 @@ void vp9_loop_filter_data_reset(
  memcpy(lf_data->planes, planes, sizeof(lf_data->planes));
 }
 void vp9_reset_lfm(VP9_COMMON *cm) {
  if (cm->lf.filter_level) {
    memset(cm->lf.lfm, 0,
           ((cm->mi_rows + (MI_BLOCK_SIZE - 1)) >> 3) * cm->lf.lfm_stride *
            sizeof(*cm->lf.lfm));
  }
 }
 int vp9_loop_filter_worker(LFWorkerData *const lf_data, void *unused) {
  (void)unused;
-  vp9_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
+  loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
                   lf_data->start, lf_data->stop, lf_data->y_only);
  return 1;
 }
--- a/vp9/common/vp9_loopfilter.h
+++ b/vp9/common/vp9_loopfilter.h
@@ -35,24 +35,6 @@ enum lf_path {
  LF_PATH_SLOW,
 };
 struct loopfilter {
  int filter_level;
  int sharpness_level;
  int last_sharpness_level;
  uint8_t mode_ref_delta_enabled;
  uint8_t mode_ref_delta_update;
  // 0 = Intra, Last, GF, ARF
  signed char ref_deltas[MAX_REF_LF_DELTAS];
  signed char last_ref_deltas[MAX_REF_LF_DELTAS];
  // 0 = ZERO_MV, MV
  signed char mode_deltas[MAX_MODE_LF_DELTAS];
  signed char last_mode_deltas[MAX_MODE_LF_DELTAS];
 };
 // Need to align this structure so when it is declared and
 // passed it can be loaded into vector registers.
 typedef struct {
@@ -83,9 +65,29 @@ typedef struct {
  uint16_t above_uv[TX_SIZES];
  uint16_t int_4x4_uv;
  uint8_t lfl_y[64];
  uint8_t lfl_uv[16];
 } LOOP_FILTER_MASK;
 struct loopfilter {
  int filter_level;
  int sharpness_level;
  int last_sharpness_level;
  uint8_t mode_ref_delta_enabled;
  uint8_t mode_ref_delta_update;
  // 0 = Intra, Last, GF, ARF
  signed char ref_deltas[MAX_REF_LF_DELTAS];
  signed char last_ref_deltas[MAX_REF_LF_DELTAS];
  // 0 = ZERO_MV, MV
  signed char mode_deltas[MAX_MODE_LF_DELTAS];
  signed char last_mode_deltas[MAX_MODE_LF_DELTAS];
  LOOP_FILTER_MASK *lfm;
  int lfm_stride;
 };
 /* assorted loopfilter functions which get used elsewhere */
 struct VP9Common;
 struct macroblockd;
@@ -116,7 +118,7 @@ void vp9_filter_block_plane_non420(struct VP9Common *cm,
 void vp9_loop_filter_init(struct VP9Common *cm);
 // Update the loop filter for the current frame.
-// This should be called before vp9_loop_filter_rows(), vp9_loop_filter_frame()
+// This should be called before vp9_loop_filter_frame(), vp9_build_mask_frame()
 // calls this function directly.
 void vp9_loop_filter_frame_init(struct VP9Common *cm, int default_filt_lvl);
@@ -126,11 +128,19 @@ void vp9_loop_filter_frame(YV12_BUFFER_CONFIG *frame,
                           int filter_level,
                           int y_only, int partial_frame);
-// Apply the loop filter to [start, stop) macro block rows in frame_buffer.
+// Get the superblock lfm for a given mi_row, mi_col.
-void vp9_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer,
+static INLINE LOOP_FILTER_MASK *get_lfm(const struct loopfilter *lf,
-                          struct VP9Common *cm,
+                                        const int mi_row, const int mi_col) {
-                          struct macroblockd_plane planes[MAX_MB_PLANE],
+  return &lf->lfm[(mi_col >> 3) + ((mi_row >> 3) * lf->lfm_stride)];
-                          int start, int stop, int y_only);
+}
 void vp9_build_mask(struct VP9Common *cm, const MB_MODE_INFO *mbmi, int mi_row,
                    int mi_col, int bw, int bh);
 void vp9_adjust_mask(struct VP9Common *const cm, const int mi_row,
                     const int mi_col, LOOP_FILTER_MASK *lfm);
 void vp9_build_mask_frame(struct VP9Common *cm, int frame_filter_level,
                          int partial_frame);
 void vp9_reset_lfm(struct VP9Common *const cm);
 typedef struct LoopFilterWorkerData {
  YV12_BUFFER_CONFIG *frame_buffer;
--- a/vp9/common/vp9_thread_common.c
+++ b/vp9/common/vp9_thread_common.c
@@ -109,29 +109,27 @@ void thread_loop_filter_rows(const YV12_BUFFER_CONFIG *const frame_buffer,
  for (mi_row = start; mi_row < stop;
       mi_row += lf_sync->num_workers * MI_BLOCK_SIZE) {
    MODE_INFO **const mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
    LOOP_FILTER_MASK *lfm = get_lfm(&cm->lf, mi_row, 0);
-    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE, ++lfm) {
      const int r = mi_row >> MI_BLOCK_SIZE_LOG2;
      const int c = mi_col >> MI_BLOCK_SIZE_LOG2;
      LOOP_FILTER_MASK lfm;
      int plane;
      sync_read(lf_sync, r, c);
      vp9_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
-      // TODO(JBB): Make setup_mask work for non 420.
+      vp9_adjust_mask(cm, mi_row, mi_col, lfm);
      vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride,
                     &lfm);
-      vp9_filter_block_plane_ss00(cm, &planes[0], mi_row, &lfm);
+      vp9_filter_block_plane_ss00(cm, &planes[0], mi_row, lfm);
      for (plane = 1; plane < num_planes; ++plane) {
        switch (path) {
          case LF_PATH_420:
-            vp9_filter_block_plane_ss11(cm, &planes[plane], mi_row, &lfm);
+            vp9_filter_block_plane_ss11(cm, &planes[plane], mi_row, lfm);
            break;
          case LF_PATH_444:
-            vp9_filter_block_plane_ss00(cm, &planes[plane], mi_row, &lfm);
+            vp9_filter_block_plane_ss00(cm, &planes[plane], mi_row, lfm);
            break;
          case LF_PATH_SLOW:
            vp9_filter_block_plane_non420(cm, &planes[plane], mi + mi_col,
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -896,6 +896,10 @@ static void decode_block(VP9Decoder *const pbi, MACROBLOCKD *const xd,
  }
  xd->corrupted |= vpx_reader_has_error(r);
  if (cm->lf.filter_level) {
    vp9_build_mask(cm, mbmi, mi_row, mi_col, bw, bh);
  }
 }
 static INLINE int dec_partition_plane_context(const MACROBLOCKD *xd,
@@ -1464,6 +1468,8 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
  memset(cm->above_seg_context, 0,
         sizeof(*cm->above_seg_context) * aligned_cols);
  vp9_reset_lfm(cm);
  get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows, tile_buffers);
  if (pbi->tile_data == NULL ||
@@ -1662,6 +1668,8 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi,
  memset(cm->above_seg_context, 0,
         sizeof(*cm->above_seg_context) * aligned_mi_cols);
  vp9_reset_lfm(cm);
  // Load tile data into tile_buffers
  get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows, tile_buffers);
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -2755,6 +2755,7 @@ void vp9_update_reference_frames(VP9_COMP *cpi) {
 static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) {
  MACROBLOCKD *xd = &cpi->td.mb.e_mbd;
  struct loopfilter *lf = &cm->lf;
  if (xd->lossless) {
      lf->filter_level = 0;
  } else {
@@ -2771,6 +2772,8 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) {
  }
  if (lf->filter_level > 0) {
    vp9_build_mask_frame(cm, lf->filter_level, 0);
    if (cpi->num_workers > 1)
      vp9_loop_filter_frame_mt(cm->frame_to_show, cm, xd->plane,
                               lf->filter_level, 0, 0,
--- a/vp9/encoder/vp9_picklpf.c
+++ b/vp9/encoder/vp9_picklpf.c
@@ -40,6 +40,8 @@ static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd,
  VP9_COMMON *const cm = &cpi->common;
  int64_t filt_err;
  vp9_build_mask_frame(cm, filt_level, partial_frame);
  if (cpi->num_workers > 1)
    vp9_loop_filter_frame_mt(cm->frame_to_show, cm, cpi->td.mb.e_mbd.plane,
                             filt_level, 1, partial_frame,