diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl index a2b9a75d7..5f7384be7 100644 --- a/aom_dsp/aom_dsp_rtcd_defs.pl +++ b/aom_dsp/aom_dsp_rtcd_defs.pl @@ -587,7 +587,7 @@ add_proto qw/void aom_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint specialize qw/aom_lpf_horizontal_4_dual sse2 neon dspr2 msa/; if (aom_config("CONFIG_CLPF") eq "yes") { - add_proto qw/void aom_clpf_block/, "const uint8_t *src, uint8_t *dst, int stride, int x0, int y0, int sizex, int sizey, int width, int height, unsigned int strength"; + add_proto qw/void aom_clpf_block/, "const uint8_t *src, uint8_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, int width, int height, unsigned int strength"; specialize qw/aom_clpf_block sse2 ssse3 sse4_1 neon/; add_proto qw/void aom_clpf_detect/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength"; specialize qw/aom_clpf_detect sse2 ssse3 sse4_1 neon/; diff --git a/av1/common/clpf.c b/av1/common/clpf.c index 799af0184..1ca60e056 100644 --- a/av1/common/clpf.c +++ b/av1/common/clpf.c @@ -27,30 +27,30 @@ int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int b) { return (8 + delta - (delta < 0)) >> 4; } -void aom_clpf_block_c(const uint8_t *src, uint8_t *dst, int stride, int x0, - int y0, int sizex, int sizey, int width, int height, - unsigned int strength) { +void aom_clpf_block_c(const uint8_t *src, uint8_t *dst, int sstride, + int dstride, int x0, int y0, int sizex, int sizey, + int width, int height, unsigned int strength) { int x, y; for (y = y0; y < y0 + sizey; y++) { for (x = x0; x < x0 + sizex; x++) { - int X = src[y * stride + x]; - int A = src[AOMMAX(0, y - 1) * stride + x]; - int B = src[y * stride + AOMMAX(0, x - 2)]; - int C = src[y * stride + AOMMAX(0, x - 1)]; - int D = src[y * stride + AOMMIN(width - 1, x + 1)]; - int E = src[y * stride + AOMMIN(width - 1, x + 2)]; - int 
F = src[AOMMIN(height - 1, y + 1) * stride + x]; + int X = src[y * sstride + x]; + int A = src[AOMMAX(0, y - 1) * sstride + x]; + int B = src[y * sstride + AOMMAX(0, x - 2)]; + int C = src[y * sstride + AOMMAX(0, x - 1)]; + int D = src[y * sstride + AOMMIN(width - 1, x + 1)]; + int E = src[y * sstride + AOMMIN(width - 1, x + 2)]; + int F = src[AOMMIN(height - 1, y + 1) * sstride + x]; int delta; delta = av1_clpf_sample(X, A, B, C, D, E, F, strength); - dst[y * stride + x] = X + delta; + dst[y * dstride + x] = X + delta; } } } // Return number of filtered blocks -int av1_clpf_frame(const YV12_BUFFER_CONFIG *dst, const YV12_BUFFER_CONFIG *rec, - const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm, - int enable_fb_flag, unsigned int strength, +int av1_clpf_frame(const YV12_BUFFER_CONFIG *orig_dst, + const YV12_BUFFER_CONFIG *rec, const YV12_BUFFER_CONFIG *org, + AV1_COMMON *cm, int enable_fb_flag, unsigned int strength, unsigned int fb_size_log2, uint8_t *blocks, int (*decision)(int, int, const YV12_BUFFER_CONFIG *, const YV12_BUFFER_CONFIG *, @@ -59,23 +59,45 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *dst, const YV12_BUFFER_CONFIG *rec, /* Constrained low-pass filter (CLPF) */ int c, k, l, m, n; const int bs = MI_SIZE; - int width = rec->y_crop_width; - int height = rec->y_crop_height; + const int width = rec->y_crop_width; + const int height = rec->y_crop_height; int xpos, ypos; - int stride_y = rec->y_stride; - int num_fb_hor = (width + (1 << fb_size_log2) - 1) >> fb_size_log2; - int num_fb_ver = (height + (1 << fb_size_log2) - 1) >> fb_size_log2; + const int sstride = rec->y_stride; + int dstride = orig_dst->y_stride; + const int num_fb_hor = (width + (1 << fb_size_log2) - 1) >> fb_size_log2; + const int num_fb_ver = (height + (1 << fb_size_log2) - 1) >> fb_size_log2; int block_index = 0; + uint8_t *cache = NULL; + uint8_t **cache_ptr = NULL; + uint8_t **cache_dst = NULL; + int cache_idx = 0; + const int cache_size = num_fb_hor << (2 * fb_size_log2); + const 
int cache_blocks = cache_size / (bs * bs); + YV12_BUFFER_CONFIG dst = *orig_dst; + + // Make buffer space for in-place filtering + if (rec->y_buffer == dst.y_buffer) { + CHECK_MEM_ERROR(cm, cache, aom_malloc(cache_size)); + CHECK_MEM_ERROR(cm, cache_ptr, + aom_malloc(cache_blocks * sizeof(*cache_ptr))); + CHECK_MEM_ERROR(cm, cache_dst, + aom_malloc(cache_blocks * sizeof(*cache_dst))); + memset(cache_ptr, 0, cache_blocks * sizeof(*cache_dst)); + dst.y_buffer = cache; + dstride = bs; + } // Iterate over all filter blocks for (k = 0; k < num_fb_ver; k++) { for (l = 0; l < num_fb_hor; l++) { int h, w; int allskip = 1; + const int xoff = l << fb_size_log2; + const int yoff = k << fb_size_log2; for (m = 0; allskip && m < (1 << fb_size_log2) / bs; m++) { for (n = 0; allskip && n < (1 << fb_size_log2) / bs; n++) { - xpos = (l << fb_size_log2) + n * bs; - ypos = (k << fb_size_log2) + m * bs; + xpos = xoff + n * bs; + ypos = yoff + m * bs; if (xpos < width && ypos < height) { allskip &= cm->mi_grid_visible[ypos / bs * cm->mi_stride + xpos / bs] @@ -96,31 +118,57 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *dst, const YV12_BUFFER_CONFIG *rec, // Iterate over all smaller blocks inside the filter block for (m = 0; m < (h + bs - 1) / bs; m++) { for (n = 0; n < (w + bs - 1) / bs; n++) { - xpos = (l << fb_size_log2) + n * bs; - ypos = (k << fb_size_log2) + m * bs; + xpos = xoff + n * bs; + ypos = yoff + m * bs; if (!cm->mi_grid_visible[ypos / bs * cm->mi_stride + xpos / bs] - ->mbmi.skip) { - // Not skip block, apply the filter - aom_clpf_block(rec->y_buffer, dst->y_buffer, stride_y, xpos, ypos, - bs, bs, width, height, strength); + ->mbmi.skip) { // Not skip block + // Temporary buffering needed if filtering in-place + if (cache) { + if (cache_ptr[cache_idx]) { + // Copy filtered block back into the frame + for (c = 0; c < bs; c++) + *(uint64_t *)(cache_dst[cache_idx] + c * sstride) = + *(uint64_t *)(cache_ptr[cache_idx] + c * bs); + } + cache_ptr[cache_idx] = cache + 
cache_idx * bs * bs; + dst.y_buffer = cache_ptr[cache_idx] - ypos * bs - xpos; + cache_dst[cache_idx] = rec->y_buffer + ypos * sstride + xpos; + if (++cache_idx >= cache_blocks) cache_idx = 0; + } + + // Apply the filter + aom_clpf_block(rec->y_buffer, dst.y_buffer, sstride, dstride, + xpos, ypos, bs, bs, width, height, strength); + } else { // Skip block, copy instead - for (c = 0; c < bs; c++) - *(uint64_t *)(dst->y_buffer + (ypos + c) * stride_y + xpos) = - *(uint64_t *)(rec->y_buffer + (ypos + c) * stride_y + xpos); + if (!cache) + for (c = 0; c < bs; c++) + *(uint64_t *)(dst.y_buffer + (ypos + c) * dstride + xpos) = *( + uint64_t *)(rec->y_buffer + (ypos + c) * sstride + xpos); } } } else { // Entire filter block is skip, copy - for (m = 0; m < h; m++) - memcpy(dst->y_buffer + ((k << fb_size_log2) + m) * stride_y + - (l << fb_size_log2), - rec->y_buffer + ((k << fb_size_log2) + m) * stride_y + - (l << fb_size_log2), - w); + if (!cache) + for (m = 0; m < h; m++) + memcpy(dst.y_buffer + (yoff + m) * dstride + xoff, + rec->y_buffer + (yoff + m) * sstride + xoff, w); } block_index += !allskip; // Count number of blocks filtered } } + if (cache) { + // Copy remaining blocks into the frame + for (cache_idx = 0; cache_idx < cache_blocks && cache_ptr[cache_idx]; + cache_idx++) + for (c = 0; c < bs; c++) + *(uint64_t *)(cache_dst[cache_idx] + c * sstride) = + *(uint64_t *)(cache_ptr[cache_idx] + c * bs); + + aom_free(cache); + aom_free(cache_ptr); aom_free(cache_dst); // free all three in-place buffers (cache_dst was leaked) + } + return block_index; } diff --git a/av1/common/clpf.h index 21671a1c1..2fb12d6c6 100644 --- a/av1/common/clpf.h +++ b/av1/common/clpf.h @@ -18,7 +18,7 @@ int av1_clpf_maxbits(const AV1_COMMON *cm); int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int b); int av1_clpf_frame(const YV12_BUFFER_CONFIG *dst, const YV12_BUFFER_CONFIG *rec, - const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm, + const YV12_BUFFER_CONFIG *org, AV1_COMMON *cm, int enable_fb_flag, unsigned int strength, 
unsigned int fb_size_log2, uint8_t *blocks, int (*decision)(int, int, const YV12_BUFFER_CONFIG *, diff --git a/av1/common/clpf_simd.h b/av1/common/clpf_simd.h index 0df6cd74e..544aa36f7 100644 --- a/av1/common/clpf_simd.h +++ b/av1/common/clpf_simd.h @@ -11,11 +11,11 @@ #include "./aom_dsp_rtcd.h" -static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0, - int y0, int sizey, int width, int height, - unsigned int strength) { - dst += x0 + y0 * stride; - src += x0 + y0 * stride; +static void clpf_block(const uint8_t *src, uint8_t *dst, int sstride, + int dstride, int x0, int y0, int sizey, int width, + int height, unsigned int strength) { + dst += x0 + y0 * dstride; + src += x0 + y0 * sstride; { int bottom = height - 2 - y0; const v128 sp = v128_dup_8(strength); @@ -32,23 +32,23 @@ static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0, for (y = 0; y < sizey; y += 2) { const v64 l1 = v64_load_aligned(src); - const v64 l2 = v64_load_aligned(src + stride); + const v64 l2 = v64_load_aligned(src + sstride); v128 o = v128_from_v64(l1, l2); const v128 x = v128_add_8(c128, o); const v128 a = v128_add_8( c128, - v128_from_v64(v64_load_aligned(src - (y != -y0) * stride), l1)); + v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1)); const v128 b = v128_shuffle_8(x, b_shuff); const v128 c = v128_shuffle_8(x, c_shuff); const v128 d = v128_add_8( c128, v128_from_v64(v64_load_unaligned(src + 1), - v64_load_unaligned(src + 1 + stride))); + v64_load_unaligned(src + 1 + sstride))); const v128 e = v128_add_8( c128, v128_from_v64(v64_load_unaligned(src + 2), - v64_load_unaligned(src + 2 + stride))); + v64_load_unaligned(src + 2 + sstride))); const v128 f = v128_add_8( c128, v128_from_v64(l2, v64_load_aligned( - src + ((y != bottom) + 1) * stride))); + src + ((y != bottom) + 1) * sstride))); const v128 tmp = v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm), @@ -70,9 +70,9 @@ static void clpf_block(const uint8_t *src, 
uint8_t *dst, int stride, int x0, delta, v128_zero()))), 4)); v64_store_aligned(dst, v128_high_v64(o)); - v64_store_aligned(dst + stride, v128_low_v64(o)); - src += stride * 2; - dst += stride * 2; + v64_store_aligned(dst + dstride, v128_low_v64(o)); + src += sstride * 2; + dst += dstride * 2; } } else if (!(width - x0 - 8)) { // Clip right const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0f0e0d0c0b0a09LL), @@ -83,23 +83,23 @@ static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0, for (y = 0; y < sizey; y += 2) { const v64 l1 = v64_load_aligned(src); - const v64 l2 = v64_load_aligned(src + stride); + const v64 l2 = v64_load_aligned(src + sstride); v128 o = v128_from_v64(l1, l2); const v128 x = v128_add_8(c128, o); const v128 a = v128_add_8( c128, - v128_from_v64(v64_load_aligned(src - (y != -y0) * stride), l1)); + v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1)); const v128 b = v128_add_8( c128, v128_from_v64(v64_load_unaligned(src - 2), - v64_load_unaligned(src - 2 + stride))); + v64_load_unaligned(src - 2 + sstride))); const v128 c = v128_add_8( c128, v128_from_v64(v64_load_unaligned(src - 1), - v64_load_unaligned(src - 1 + stride))); + v64_load_unaligned(src - 1 + sstride))); const v128 d = v128_shuffle_8(x, d_shuff); const v128 e = v128_shuffle_8(x, e_shuff); const v128 f = v128_add_8( c128, v128_from_v64(l2, v64_load_aligned( - src + ((y != bottom) + 1) * stride))); + src + ((y != bottom) + 1) * sstride))); const v128 tmp = v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm), @@ -121,35 +121,35 @@ static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0, delta, v128_zero()))), 4)); v64_store_aligned(dst, v128_high_v64(o)); - v64_store_aligned(dst + stride, v128_low_v64(o)); - src += stride * 2; - dst += stride * 2; + v64_store_aligned(dst + dstride, v128_low_v64(o)); + src += sstride * 2; + dst += dstride * 2; } } else { // No left/right clipping int y; for (y = 0; y < sizey; y += 2) { 
const v64 l1 = v64_load_aligned(src); - const v64 l2 = v64_load_aligned(src + stride); + const v64 l2 = v64_load_aligned(src + sstride); v128 o = v128_from_v64(l1, l2); const v128 x = v128_add_8(c128, o); const v128 a = v128_add_8( c128, - v128_from_v64(v64_load_aligned(src - (y != -y0) * stride), l1)); + v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1)); const v128 b = v128_add_8( c128, v128_from_v64(v64_load_unaligned(src - 2), - v64_load_unaligned(src - 2 + stride))); + v64_load_unaligned(src - 2 + sstride))); const v128 c = v128_add_8( c128, v128_from_v64(v64_load_unaligned(src - 1), - v64_load_unaligned(src - 1 + stride))); + v64_load_unaligned(src - 1 + sstride))); const v128 d = v128_add_8( c128, v128_from_v64(v64_load_unaligned(src + 1), - v64_load_unaligned(src + 1 + stride))); + v64_load_unaligned(src + 1 + sstride))); const v128 e = v128_add_8( c128, v128_from_v64(v64_load_unaligned(src + 2), - v64_load_unaligned(src + 2 + stride))); + v64_load_unaligned(src + 2 + sstride))); const v128 f = v128_add_8( c128, v128_from_v64(l2, v64_load_aligned( - src + ((y != bottom) + 1) * stride))); + src + ((y != bottom) + 1) * sstride))); const v128 tmp = v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm), @@ -171,17 +171,18 @@ static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0, delta, v128_zero()))), 4)); v64_store_aligned(dst, v128_high_v64(o)); - v64_store_aligned(dst + stride, v128_low_v64(o)); - src += stride * 2; - dst += stride * 2; + v64_store_aligned(dst + dstride, v128_low_v64(o)); + src += sstride * 2; + dst += dstride * 2; } } } } -void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int stride, - int x0, int y0, int sizex, int sizey, int width, - int height, unsigned int strength) { +void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int sstride, + int dstride, int x0, int y0, int sizex, + int sizey, int width, int height, + unsigned int strength) { // TODO(stemidts): // A 
sizex different from 8 will only be needed if CLPF is extended to chroma. // This will only be used if 4:2:0 and width not a multiple of 16 and along @@ -189,9 +190,10 @@ void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int stride, // this case. If not extended to chroma, this test will be redundant. if (sizex != 8 || width < 16 || y0 + 8 > height || x0 + 8 > width) { // Fallback to C for odd sizes - aom_clpf_block_c(src, dst, stride, x0, y0, sizex, sizey, width, height, - strength); + aom_clpf_block_c(src, dst, sstride, dstride, x0, y0, sizex, sizey, width, + height, strength); } else { - clpf_block(src, dst, stride, x0, y0, sizey, width, height, strength); + clpf_block(src, dst, sstride, dstride, x0, y0, sizey, width, height, + strength); } } diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c index dc18944b3..6b2de8c59 100644 --- a/av1/decoder/decodeframe.c +++ b/av1/decoder/decodeframe.c @@ -3929,19 +3929,10 @@ void av1_decode_frame(AV1Decoder *pbi, const uint8_t *data, #if CONFIG_CLPF if (cm->clpf_strength && !cm->skip_loop_filter) { - YV12_BUFFER_CONFIG dst; // Buffer for the result - - dst = pbi->cur_buf->buf; - CHECK_MEM_ERROR(cm, dst.y_buffer, aom_malloc(dst.y_stride * dst.y_height)); - - av1_clpf_frame(&dst, &pbi->cur_buf->buf, 0, cm, !!cm->clpf_size, + const YV12_BUFFER_CONFIG *const frame = &pbi->cur_buf->buf; + av1_clpf_frame(frame, frame, 0, cm, !!cm->clpf_size, cm->clpf_strength + (cm->clpf_strength == 3), 4 + cm->clpf_size, cm->clpf_blocks, clpf_bit); - - // Copy result - memcpy(pbi->cur_buf->buf.y_buffer, dst.y_buffer, - dst.y_height * dst.y_stride); - aom_free(dst.y_buffer); } if (cm->clpf_blocks) aom_free(cm->clpf_blocks); #endif diff --git a/test/clpf_test.cc b/test/clpf_test.cc index 786180b6a..755d1f146 100644 --- a/test/clpf_test.cc +++ b/test/clpf_test.cc @@ -26,9 +26,9 @@ using libaom_test::ACMRandom; namespace { -typedef void (*clpf_block_t)(const uint8_t *src, uint8_t *dst, int stride, - int x0, int y0, int 
sizex, int sizey, int width, - int height, unsigned int strength); +typedef void (*clpf_block_t)(const uint8_t *src, uint8_t *dst, int sstride, + int dstride, int x0, int y0, int sizex, int sizey, + int width, int height, unsigned int strength); typedef std::tr1::tuple clpf_block_param_t; @@ -85,10 +85,10 @@ TEST_P(ClpfBlockTest, TestSIMDNoMismatch) { for (ypos = 0; ypos < size && !error; ypos += h * !error) { for (xpos = 0; xpos < size && !error; xpos += w * !error) { for (strength = 0; strength < 3 && !error; strength += !error) { - ref_clpf(s, ref_d, size, xpos, ypos, w, h, size, size, + ref_clpf(s, ref_d, size, size, xpos, ypos, w, h, size, size, 1 << strength); - ASM_REGISTER_STATE_CHECK( - clpf(s, d, size, xpos, ypos, w, h, size, size, 1 << strength)); + ASM_REGISTER_STATE_CHECK(clpf(s, d, size, size, xpos, ypos, w, h, + size, size, 1 << strength)); for (pos = 0; pos < size * size && !error; pos++) { error = ref_d[pos] != d[pos]; @@ -137,7 +137,8 @@ TEST_P(ClpfSpeedTest, TestSpeed) { for (ypos = 0; ypos < size; ypos += h) { for (xpos = 0; xpos < size; xpos += w) { for (strength = 0; strength < 3; strength++) { - ref_clpf(s, d, size, xpos, ypos, w, h, size, size, 1 << strength); + ref_clpf(s, d, size, size, xpos, ypos, w, h, size, size, + 1 << strength); } } } @@ -150,7 +151,7 @@ TEST_P(ClpfSpeedTest, TestSpeed) { for (ypos = 0; ypos < size; ypos += h) { for (xpos = 0; xpos < size; xpos += w) { for (strength = 0; strength < 3; strength++) { - clpf(s, d, size, xpos, ypos, w, h, size, size, 1 << strength); + clpf(s, d, size, size, xpos, ypos, w, h, size, size, 1 << strength); } } }