diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl index a2b9a75d7..5f7384be7 100644 --- a/aom_dsp/aom_dsp_rtcd_defs.pl +++ b/aom_dsp/aom_dsp_rtcd_defs.pl @@ -587,7 +587,7 @@ add_proto qw/void aom_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint specialize qw/aom_lpf_horizontal_4_dual sse2 neon dspr2 msa/; if (aom_config("CONFIG_CLPF") eq "yes") { - add_proto qw/void aom_clpf_block/, "const uint8_t *src, uint8_t *dst, int stride, int x0, int y0, int sizex, int sizey, int width, int height, unsigned int strength"; + add_proto qw/void aom_clpf_block/, "const uint8_t *src, uint8_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, int width, int height, unsigned int strength"; specialize qw/aom_clpf_block sse2 ssse3 sse4_1 neon/; add_proto qw/void aom_clpf_detect/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength"; specialize qw/aom_clpf_detect sse2 ssse3 sse4_1 neon/; diff --git a/av1/common/clpf.c b/av1/common/clpf.c index 799af0184..1ca60e056 100644 --- a/av1/common/clpf.c +++ b/av1/common/clpf.c @@ -27,30 +27,30 @@ int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int b) { return (8 + delta - (delta < 0)) >> 4; } -void aom_clpf_block_c(const uint8_t *src, uint8_t *dst, int stride, int x0, - int y0, int sizex, int sizey, int width, int height, - unsigned int strength) { +void aom_clpf_block_c(const uint8_t *src, uint8_t *dst, int sstride, + int dstride, int x0, int y0, int sizex, int sizey, + int width, int height, unsigned int strength) { int x, y; for (y = y0; y < y0 + sizey; y++) { for (x = x0; x < x0 + sizex; x++) { - int X = src[y * stride + x]; - int A = src[AOMMAX(0, y - 1) * stride + x]; - int B = src[y * stride + AOMMAX(0, x - 2)]; - int C = src[y * stride + AOMMAX(0, x - 1)]; - int D = src[y * stride + AOMMIN(width - 1, x + 1)]; - int E = src[y * stride + AOMMIN(width - 1, x + 2)]; - int 
F = src[AOMMIN(height - 1, y + 1) * stride + x]; + int X = src[y * sstride + x]; + int A = src[AOMMAX(0, y - 1) * sstride + x]; + int B = src[y * sstride + AOMMAX(0, x - 2)]; + int C = src[y * sstride + AOMMAX(0, x - 1)]; + int D = src[y * sstride + AOMMIN(width - 1, x + 1)]; + int E = src[y * sstride + AOMMIN(width - 1, x + 2)]; + int F = src[AOMMIN(height - 1, y + 1) * sstride + x]; int delta; delta = av1_clpf_sample(X, A, B, C, D, E, F, strength); - dst[y * stride + x] = X + delta; + dst[y * dstride + x] = X + delta; } } } // Return number of filtered blocks -int av1_clpf_frame(const YV12_BUFFER_CONFIG *dst, const YV12_BUFFER_CONFIG *rec, - const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm, - int enable_fb_flag, unsigned int strength, +int av1_clpf_frame(const YV12_BUFFER_CONFIG *orig_dst, + const YV12_BUFFER_CONFIG *rec, const YV12_BUFFER_CONFIG *org, + AV1_COMMON *cm, int enable_fb_flag, unsigned int strength, unsigned int fb_size_log2, uint8_t *blocks, int (*decision)(int, int, const YV12_BUFFER_CONFIG *, const YV12_BUFFER_CONFIG *, @@ -59,23 +59,45 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *dst, const YV12_BUFFER_CONFIG *rec, /* Constrained low-pass filter (CLPF) */ int c, k, l, m, n; const int bs = MI_SIZE; - int width = rec->y_crop_width; - int height = rec->y_crop_height; + const int width = rec->y_crop_width; + const int height = rec->y_crop_height; int xpos, ypos; - int stride_y = rec->y_stride; - int num_fb_hor = (width + (1 << fb_size_log2) - 1) >> fb_size_log2; - int num_fb_ver = (height + (1 << fb_size_log2) - 1) >> fb_size_log2; + const int sstride = rec->y_stride; + int dstride = orig_dst->y_stride; + const int num_fb_hor = (width + (1 << fb_size_log2) - 1) >> fb_size_log2; + const int num_fb_ver = (height + (1 << fb_size_log2) - 1) >> fb_size_log2; int block_index = 0; + uint8_t *cache = NULL; + uint8_t **cache_ptr = NULL; + uint8_t **cache_dst = NULL; + int cache_idx = 0; + const int cache_size = num_fb_hor << (2 * fb_size_log2); + const 
int cache_blocks = cache_size / (bs * bs); + YV12_BUFFER_CONFIG dst = *orig_dst; + + // Make buffer space for in-place filtering + if (rec->y_buffer == dst.y_buffer) { + CHECK_MEM_ERROR(cm, cache, aom_malloc(cache_size)); + CHECK_MEM_ERROR(cm, cache_ptr, + aom_malloc(cache_blocks * sizeof(*cache_ptr))); + CHECK_MEM_ERROR(cm, cache_dst, + aom_malloc(cache_blocks * sizeof(*cache_dst))); + memset(cache_ptr, 0, cache_blocks * sizeof(*cache_dst)); + dst.y_buffer = cache; + dstride = bs; + } // Iterate over all filter blocks for (k = 0; k < num_fb_ver; k++) { for (l = 0; l < num_fb_hor; l++) { int h, w; int allskip = 1; + const int xoff = l << fb_size_log2; + const int yoff = k << fb_size_log2; for (m = 0; allskip && m < (1 << fb_size_log2) / bs; m++) { for (n = 0; allskip && n < (1 << fb_size_log2) / bs; n++) { - xpos = (l << fb_size_log2) + n * bs; - ypos = (k << fb_size_log2) + m * bs; + xpos = xoff + n * bs; + ypos = yoff + m * bs; if (xpos < width && ypos < height) { allskip &= cm->mi_grid_visible[ypos / bs * cm->mi_stride + xpos / bs] @@ -96,31 +118,57 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *dst, const YV12_BUFFER_CONFIG *rec, // Iterate over all smaller blocks inside the filter block for (m = 0; m < (h + bs - 1) / bs; m++) { for (n = 0; n < (w + bs - 1) / bs; n++) { - xpos = (l << fb_size_log2) + n * bs; - ypos = (k << fb_size_log2) + m * bs; + xpos = xoff + n * bs; + ypos = yoff + m * bs; if (!cm->mi_grid_visible[ypos / bs * cm->mi_stride + xpos / bs] - ->mbmi.skip) { - // Not skip block, apply the filter - aom_clpf_block(rec->y_buffer, dst->y_buffer, stride_y, xpos, ypos, - bs, bs, width, height, strength); + ->mbmi.skip) { // Not skip block + // Temporary buffering needed if filtering in-place + if (cache) { + if (cache_ptr[cache_idx]) { + // Copy filtered block back into the frame + for (c = 0; c < bs; c++) + *(uint64_t *)(cache_dst[cache_idx] + c * sstride) = + *(uint64_t *)(cache_ptr[cache_idx] + c * bs); + } + cache_ptr[cache_idx] = cache + 
cache_idx * bs * bs; + dst.y_buffer = cache_ptr[cache_idx] - ypos * bs - xpos; + cache_dst[cache_idx] = rec->y_buffer + ypos * sstride + xpos; + if (++cache_idx >= cache_blocks) cache_idx = 0; + } + + // Apply the filter + aom_clpf_block(rec->y_buffer, dst.y_buffer, sstride, dstride, + xpos, ypos, bs, bs, width, height, strength); + } else { // Skip block, copy instead - for (c = 0; c < bs; c++) - *(uint64_t *)(dst->y_buffer + (ypos + c) * stride_y + xpos) = - *(uint64_t *)(rec->y_buffer + (ypos + c) * stride_y + xpos); + if (!cache) + for (c = 0; c < bs; c++) + *(uint64_t *)(dst.y_buffer + (ypos + c) * dstride + xpos) = *( + uint64_t *)(rec->y_buffer + (ypos + c) * sstride + xpos); } } } else { // Entire filter block is skip, copy - for (m = 0; m < h; m++) - memcpy(dst->y_buffer + ((k << fb_size_log2) + m) * stride_y + - (l << fb_size_log2), - rec->y_buffer + ((k << fb_size_log2) + m) * stride_y + - (l << fb_size_log2), - w); + if (!cache) + for (m = 0; m < h; m++) + memcpy(dst.y_buffer + (yoff + m) * dstride + xoff, + rec->y_buffer + (yoff + m) * sstride + xoff, w); } block_index += !allskip; // Count number of blocks filtered } } + if (cache) { + // Copy remaining blocks into the frame + for (cache_idx = 0; cache_idx < cache_blocks && cache_ptr[cache_idx]; + cache_idx++) + for (c = 0; c < bs; c++) + *(uint64_t *)(cache_dst[cache_idx] + c * sstride) = + *(uint64_t *)(cache_ptr[cache_idx] + c * bs); + + aom_free(cache); + aom_free(cache_ptr); aom_free(cache_dst); // free all three in-place buffers (cache_dst was leaked) + } + return block_index; } diff --git a/av1/common/clpf.h index 21671a1c1..2fb12d6c6 100644 --- a/av1/common/clpf.h +++ b/av1/common/clpf.h @@ -18,7 +18,7 @@ int av1_clpf_maxbits(const AV1_COMMON *cm); int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int b); int av1_clpf_frame(const YV12_BUFFER_CONFIG *dst, const YV12_BUFFER_CONFIG *rec, - const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm, + const YV12_BUFFER_CONFIG *org, AV1_COMMON *cm, int enable_fb_flag, unsigned int strength, 
unsigned int fb_size_log2, uint8_t *blocks, int (*decision)(int, int, const YV12_BUFFER_CONFIG *, diff --git a/av1/common/clpf_simd.h b/av1/common/clpf_simd.h index 0df6cd74e..544aa36f7 100644 --- a/av1/common/clpf_simd.h +++ b/av1/common/clpf_simd.h @@ -11,11 +11,11 @@ #include "./aom_dsp_rtcd.h" -static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0, - int y0, int sizey, int width, int height, - unsigned int strength) { - dst += x0 + y0 * stride; - src += x0 + y0 * stride; +static void clpf_block(const uint8_t *src, uint8_t *dst, int sstride, + int dstride, int x0, int y0, int sizey, int width, + int height, unsigned int strength) { + dst += x0 + y0 * dstride; + src += x0 + y0 * sstride; { int bottom = height - 2 - y0; const v128 sp = v128_dup_8(strength); @@ -32,23 +32,23 @@ static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0, for (y = 0; y < sizey; y += 2) { const v64 l1 = v64_load_aligned(src); - const v64 l2 = v64_load_aligned(src + stride); + const v64 l2 = v64_load_aligned(src + sstride); v128 o = v128_from_v64(l1, l2); const v128 x = v128_add_8(c128, o); const v128 a = v128_add_8( c128, - v128_from_v64(v64_load_aligned(src - (y != -y0) * stride), l1)); + v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1)); const v128 b = v128_shuffle_8(x, b_shuff); const v128 c = v128_shuffle_8(x, c_shuff); const v128 d = v128_add_8( c128, v128_from_v64(v64_load_unaligned(src + 1), - v64_load_unaligned(src + 1 + stride))); + v64_load_unaligned(src + 1 + sstride))); const v128 e = v128_add_8( c128, v128_from_v64(v64_load_unaligned(src + 2), - v64_load_unaligned(src + 2 + stride))); + v64_load_unaligned(src + 2 + sstride))); const v128 f = v128_add_8( c128, v128_from_v64(l2, v64_load_aligned( - src + ((y != bottom) + 1) * stride))); + src + ((y != bottom) + 1) * sstride))); const v128 tmp = v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm), @@ -70,9 +70,9 @@ static void clpf_block(const uint8_t *src, 
uint8_t *dst, int stride, int x0, delta, v128_zero()))), 4)); v64_store_aligned(dst, v128_high_v64(o)); - v64_store_aligned(dst + stride, v128_low_v64(o)); - src += stride * 2; - dst += stride * 2; + v64_store_aligned(dst + dstride, v128_low_v64(o)); + src += sstride * 2; + dst += dstride * 2; } } else if (!(width - x0 - 8)) { // Clip right const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0f0e0d0c0b0a09LL), @@ -83,23 +83,23 @@ static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0, for (y = 0; y < sizey; y += 2) { const v64 l1 = v64_load_aligned(src); - const v64 l2 = v64_load_aligned(src + stride); + const v64 l2 = v64_load_aligned(src + sstride); v128 o = v128_from_v64(l1, l2); const v128 x = v128_add_8(c128, o); const v128 a = v128_add_8( c128, - v128_from_v64(v64_load_aligned(src - (y != -y0) * stride), l1)); + v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1)); const v128 b = v128_add_8( c128, v128_from_v64(v64_load_unaligned(src - 2), - v64_load_unaligned(src - 2 + stride))); + v64_load_unaligned(src - 2 + sstride))); const v128 c = v128_add_8( c128, v128_from_v64(v64_load_unaligned(src - 1), - v64_load_unaligned(src - 1 + stride))); + v64_load_unaligned(src - 1 + sstride))); const v128 d = v128_shuffle_8(x, d_shuff); const v128 e = v128_shuffle_8(x, e_shuff); const v128 f = v128_add_8( c128, v128_from_v64(l2, v64_load_aligned( - src + ((y != bottom) + 1) * stride))); + src + ((y != bottom) + 1) * sstride))); const v128 tmp = v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm), @@ -121,35 +121,35 @@ static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0, delta, v128_zero()))), 4)); v64_store_aligned(dst, v128_high_v64(o)); - v64_store_aligned(dst + stride, v128_low_v64(o)); - src += stride * 2; - dst += stride * 2; + v64_store_aligned(dst + dstride, v128_low_v64(o)); + src += sstride * 2; + dst += dstride * 2; } } else { // No left/right clipping int y; for (y = 0; y < sizey; y += 2) { 
const v64 l1 = v64_load_aligned(src); - const v64 l2 = v64_load_aligned(src + stride); + const v64 l2 = v64_load_aligned(src + sstride); v128 o = v128_from_v64(l1, l2); const v128 x = v128_add_8(c128, o); const v128 a = v128_add_8( c128, - v128_from_v64(v64_load_aligned(src - (y != -y0) * stride), l1)); + v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1)); const v128 b = v128_add_8( c128, v128_from_v64(v64_load_unaligned(src - 2), - v64_load_unaligned(src - 2 + stride))); + v64_load_unaligned(src - 2 + sstride))); const v128 c = v128_add_8( c128, v128_from_v64(v64_load_unaligned(src - 1), - v64_load_unaligned(src - 1 + stride))); + v64_load_unaligned(src - 1 + sstride))); const v128 d = v128_add_8( c128, v128_from_v64(v64_load_unaligned(src + 1), - v64_load_unaligned(src + 1 + stride))); + v64_load_unaligned(src + 1 + sstride))); const v128 e = v128_add_8( c128, v128_from_v64(v64_load_unaligned(src + 2), - v64_load_unaligned(src + 2 + stride))); + v64_load_unaligned(src + 2 + sstride))); const v128 f = v128_add_8( c128, v128_from_v64(l2, v64_load_aligned( - src + ((y != bottom) + 1) * stride))); + src + ((y != bottom) + 1) * sstride))); const v128 tmp = v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm), @@ -171,17 +171,18 @@ static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0, delta, v128_zero()))), 4)); v64_store_aligned(dst, v128_high_v64(o)); - v64_store_aligned(dst + stride, v128_low_v64(o)); - src += stride * 2; - dst += stride * 2; + v64_store_aligned(dst + dstride, v128_low_v64(o)); + src += sstride * 2; + dst += dstride * 2; } } } } -void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int stride, - int x0, int y0, int sizex, int sizey, int width, - int height, unsigned int strength) { +void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int sstride, + int dstride, int x0, int y0, int sizex, + int sizey, int width, int height, + unsigned int strength) { // TODO(stemidts): // A 
sizex different from 8 will only be needed if CLPF is extended to chroma. // This will only be used if 4:2:0 and width not a multiple of 16 and along @@ -189,9 +190,10 @@ void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int stride, // this case. If not extended to chroma, this test will be redundant. if (sizex != 8 || width < 16 || y0 + 8 > height || x0 + 8 > width) { // Fallback to C for odd sizes - aom_clpf_block_c(src, dst, stride, x0, y0, sizex, sizey, width, height, - strength); + aom_clpf_block_c(src, dst, sstride, dstride, x0, y0, sizex, sizey, width, + height, strength); } else { - clpf_block(src, dst, stride, x0, y0, sizey, width, height, strength); + clpf_block(src, dst, sstride, dstride, x0, y0, sizey, width, height, + strength); } } diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c index dc18944b3..6b2de8c59 100644 --- a/av1/decoder/decodeframe.c +++ b/av1/decoder/decodeframe.c @@ -3929,19 +3929,10 @@ void av1_decode_frame(AV1Decoder *pbi, const uint8_t *data, #if CONFIG_CLPF if (cm->clpf_strength && !cm->skip_loop_filter) { - YV12_BUFFER_CONFIG dst; // Buffer for the result - - dst = pbi->cur_buf->buf; - CHECK_MEM_ERROR(cm, dst.y_buffer, aom_malloc(dst.y_stride * dst.y_height)); - - av1_clpf_frame(&dst, &pbi->cur_buf->buf, 0, cm, !!cm->clpf_size, + const YV12_BUFFER_CONFIG *const frame = &pbi->cur_buf->buf; + av1_clpf_frame(frame, frame, 0, cm, !!cm->clpf_size, cm->clpf_strength + (cm->clpf_strength == 3), 4 + cm->clpf_size, cm->clpf_blocks, clpf_bit); - - // Copy result - memcpy(pbi->cur_buf->buf.y_buffer, dst.y_buffer, - dst.y_height * dst.y_stride); - aom_free(dst.y_buffer); } if (cm->clpf_blocks) aom_free(cm->clpf_blocks); #endif diff --git a/test/clpf_test.cc b/test/clpf_test.cc index 786180b6a..755d1f146 100644 --- a/test/clpf_test.cc +++ b/test/clpf_test.cc @@ -26,9 +26,9 @@ using libaom_test::ACMRandom; namespace { -typedef void (*clpf_block_t)(const uint8_t *src, uint8_t *dst, int stride, - int x0, int y0, int 
sizex, int sizey, int width, - int height, unsigned int strength); +typedef void (*clpf_block_t)(const uint8_t *src, uint8_t *dst, int sstride, + int dstride, int x0, int y0, int sizex, int sizey, + int width, int height, unsigned int strength); typedef std::tr1::tuple clpf_block_param_t; @@ -85,10 +85,10 @@ TEST_P(ClpfBlockTest, TestSIMDNoMismatch) { for (ypos = 0; ypos < size && !error; ypos += h * !error) { for (xpos = 0; xpos < size && !error; xpos += w * !error) { for (strength = 0; strength < 3 && !error; strength += !error) { - ref_clpf(s, ref_d, size, xpos, ypos, w, h, size, size, + ref_clpf(s, ref_d, size, size, xpos, ypos, w, h, size, size, 1 << strength); - ASM_REGISTER_STATE_CHECK( - clpf(s, d, size, xpos, ypos, w, h, size, size, 1 << strength)); + ASM_REGISTER_STATE_CHECK(clpf(s, d, size, size, xpos, ypos, w, h, + size, size, 1 << strength)); for (pos = 0; pos < size * size && !error; pos++) { error = ref_d[pos] != d[pos]; @@ -137,7 +137,8 @@ TEST_P(ClpfSpeedTest, TestSpeed) { for (ypos = 0; ypos < size; ypos += h) { for (xpos = 0; xpos < size; xpos += w) { for (strength = 0; strength < 3; strength++) { - ref_clpf(s, d, size, xpos, ypos, w, h, size, size, 1 << strength); + ref_clpf(s, d, size, size, xpos, ypos, w, h, size, size, + 1 << strength); } } } @@ -150,7 +151,7 @@ TEST_P(ClpfSpeedTest, TestSpeed) { for (ypos = 0; ypos < size; ypos += h) { for (xpos = 0; xpos < size; xpos += w) { for (strength = 0; strength < 3; strength++) { - clpf(s, d, size, xpos, ypos, w, h, size, size, 1 << strength); + clpf(s, d, size, size, xpos, ypos, w, h, size, size, 1 << strength); } } }