Extend CLPF to chroma.

Objective quality impact (low latency):

PSNR YCbCr:      0.13%     -1.37%     -1.79%
   PSNRHVS:      0.03%
      SSIM:      0.24%
    MSSSIM:      0.10%
 CIEDE2000:     -0.83%

Change-Id: I8ddf0def569286775f0f9d4d4005932766a7fc27
This commit is contained in:
Steinar Midtskogen
2016-09-13 16:37:13 +02:00
committed by Yaowu Xu
parent 9021d09f9a
commit ecf9a0c821
12 changed files with 697 additions and 887 deletions

View File

@@ -590,16 +590,16 @@ if (aom_config("CONFIG_CLPF") eq "yes") {
if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") { if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
add_proto qw/void aom_clpf_block_hbd/, "const uint16_t *src, uint16_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, int width, int height, unsigned int strength"; add_proto qw/void aom_clpf_block_hbd/, "const uint16_t *src, uint16_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, int width, int height, unsigned int strength";
specialize qw/aom_clpf_block_hbd sse2 ssse3 sse4_1 neon/; specialize qw/aom_clpf_block_hbd sse2 ssse3 sse4_1 neon/;
add_proto qw/void aom_clpf_detect_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int shift"; add_proto qw/void aom_clpf_detect_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int shift, int size";
specialize qw/aom_clpf_detect_hbd sse2 ssse3 sse4_1 neon/; specialize qw/aom_clpf_detect_hbd sse2 ssse3 sse4_1 neon/;
add_proto qw/void aom_clpf_detect_multi_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int shift"; add_proto qw/void aom_clpf_detect_multi_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int shift, int size";
specialize qw/aom_clpf_detect_multi_hbd sse2 ssse3 sse4_1 neon/; specialize qw/aom_clpf_detect_multi_hbd sse2 ssse3 sse4_1 neon/;
} }
add_proto qw/void aom_clpf_block/, "const uint8_t *src, uint8_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, int width, int height, unsigned int strength"; add_proto qw/void aom_clpf_block/, "const uint8_t *src, uint8_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, int width, int height, unsigned int strength";
specialize qw/aom_clpf_block sse2 ssse3 sse4_1 neon/; specialize qw/aom_clpf_block sse2 ssse3 sse4_1 neon/;
add_proto qw/void aom_clpf_detect/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength"; add_proto qw/void aom_clpf_detect/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int size";
specialize qw/aom_clpf_detect sse2 ssse3 sse4_1 neon/; specialize qw/aom_clpf_detect sse2 ssse3 sse4_1 neon/;
add_proto qw/void aom_clpf_detect_multi/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum"; add_proto qw/void aom_clpf_detect_multi/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int size";
specialize qw/aom_clpf_detect_multi sse2 ssse3 sse4_1 neon/; specialize qw/aom_clpf_detect_multi sse2 ssse3 sse4_1 neon/;
} }

View File

@@ -8,9 +8,10 @@
* Media Patent License 1.0 was not distributed with this source code in the * Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent. * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/ */
#include <assert.h>
#include "av1/common/clpf.h" #include "av1/common/clpf.h"
#include "./aom_dsp_rtcd.h" #include "./aom_dsp_rtcd.h"
#include "aom/aom_image.h"
#include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_dsp_common.h"
int av1_clpf_maxbits(const AV1_COMMON *cm) { int av1_clpf_maxbits(const AV1_COMMON *cm) {
@@ -72,21 +73,24 @@ void aom_clpf_block_hbd_c(const uint16_t *src, uint16_t *dst, int sstride,
#endif #endif
// Return number of filtered blocks // Return number of filtered blocks
int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame, int av1_clpf_frame(
const YV12_BUFFER_CONFIG *org, AV1_COMMON *cm, const YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *org,
int enable_fb_flag, unsigned int strength, AV1_COMMON *cm, int enable_fb_flag, unsigned int strength,
unsigned int fb_size_log2, uint8_t *blocks, unsigned int fb_size_log2, uint8_t *blocks, int plane,
int (*decision)(int, int, const YV12_BUFFER_CONFIG *, int (*decision)(int, int, const YV12_BUFFER_CONFIG *,
const YV12_BUFFER_CONFIG *, const YV12_BUFFER_CONFIG *, const AV1_COMMON *cm, int, int,
const AV1_COMMON *cm, int, int, int, int, unsigned int, unsigned int, uint8_t *, int)) {
unsigned int, unsigned int, uint8_t *)) {
/* Constrained low-pass filter (CLPF) */ /* Constrained low-pass filter (CLPF) */
int c, k, l, m, n; int c, k, l, m, n;
const int bs = MI_SIZE; const int subx = plane != AOM_PLANE_Y && frame->subsampling_x;
const int width = frame->y_crop_width; const int suby = plane != AOM_PLANE_Y && frame->subsampling_y;
const int height = frame->y_crop_height; const int bs = (subx || suby) ? 4 : 8;
const int bslog = get_msb(bs);
int width = plane != AOM_PLANE_Y ? frame->uv_crop_width : frame->y_crop_width;
int height =
plane != AOM_PLANE_Y ? frame->uv_crop_height : frame->y_crop_height;
int xpos, ypos; int xpos, ypos;
const int sstride = frame->y_stride; const int sstride = plane != AOM_PLANE_Y ? frame->uv_stride : frame->y_stride;
int dstride = bs; int dstride = bs;
const int num_fb_hor = (width + (1 << fb_size_log2) - 1) >> fb_size_log2; const int num_fb_hor = (width + (1 << fb_size_log2) - 1) >> fb_size_log2;
const int num_fb_ver = (height + (1 << fb_size_log2) - 1) >> fb_size_log2; const int num_fb_ver = (height + (1 << fb_size_log2) - 1) >> fb_size_log2;
@@ -97,9 +101,11 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
int cache_idx = 0; int cache_idx = 0;
const int cache_size = num_fb_hor << (2 * fb_size_log2); const int cache_size = num_fb_hor << (2 * fb_size_log2);
const int cache_blocks = cache_size / (bs * bs); const int cache_blocks = cache_size / (bs * bs);
YV12_BUFFER_CONFIG dst = *frame; uint8_t *src_buffer =
plane != AOM_PLANE_Y
assert(bs == 8); // Optimised code assumes this. ? (plane == AOM_PLANE_U ? frame->u_buffer : frame->v_buffer)
: frame->y_buffer;
uint8_t *dst_buffer;
#if CONFIG_AOM_HIGHBITDEPTH #if CONFIG_AOM_HIGHBITDEPTH
strength <<= (cm->bit_depth - 8); strength <<= (cm->bit_depth - 8);
@@ -108,10 +114,10 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
// Make buffer space for in-place filtering // Make buffer space for in-place filtering
#if CONFIG_AOM_HIGHBITDEPTH #if CONFIG_AOM_HIGHBITDEPTH
CHECK_MEM_ERROR(cm, cache, aom_malloc(cache_size << !!cm->use_highbitdepth)); CHECK_MEM_ERROR(cm, cache, aom_malloc(cache_size << !!cm->use_highbitdepth));
dst.y_buffer = cm->use_highbitdepth ? CONVERT_TO_BYTEPTR(cache) : cache; dst_buffer = cm->use_highbitdepth ? CONVERT_TO_BYTEPTR(cache) : cache;
#else #else
CHECK_MEM_ERROR(cm, cache, aom_malloc(cache_size)); CHECK_MEM_ERROR(cm, cache, aom_malloc(cache_size));
dst.y_buffer = cache; dst_buffer = cache;
#endif #endif
CHECK_MEM_ERROR(cm, cache_ptr, aom_malloc(cache_blocks * sizeof(*cache_ptr))); CHECK_MEM_ERROR(cm, cache_ptr, aom_malloc(cache_blocks * sizeof(*cache_ptr)));
CHECK_MEM_ERROR(cm, cache_dst, aom_malloc(cache_blocks * sizeof(*cache_dst))); CHECK_MEM_ERROR(cm, cache_dst, aom_malloc(cache_blocks * sizeof(*cache_dst)));
@@ -130,7 +136,8 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
ypos = yoff + m * bs; ypos = yoff + m * bs;
if (xpos < width && ypos < height) { if (xpos < width && ypos < height) {
allskip &= allskip &=
cm->mi_grid_visible[ypos / bs * cm->mi_stride + xpos / bs] cm->mi_grid_visible[(ypos << suby) / MI_SIZE * cm->mi_stride +
(xpos << subx) / MI_SIZE]
->mbmi.skip; ->mbmi.skip;
} }
} }
@@ -144,13 +151,14 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
if (!allskip && // Do not filter the block if all is skip encoded if (!allskip && // Do not filter the block if all is skip encoded
(!enable_fb_flag || (!enable_fb_flag ||
decision(k, l, frame, org, cm, bs, w / bs, h / bs, strength, decision(k, l, frame, org, cm, bs, w / bs, h / bs, strength,
fb_size_log2, blocks + block_index))) { fb_size_log2, blocks + block_index, plane))) {
// Iterate over all smaller blocks inside the filter block // Iterate over all smaller blocks inside the filter block
for (m = 0; m < (h + bs - 1) / bs; m++) { for (m = 0; m < ((h + bs - 1) >> bslog); m++) {
for (n = 0; n < (w + bs - 1) / bs; n++) { for (n = 0; n < ((w + bs - 1) >> bslog); n++) {
xpos = xoff + n * bs; xpos = xoff + n * bs;
ypos = yoff + m * bs; ypos = yoff + m * bs;
if (!cm->mi_grid_visible[ypos / bs * cm->mi_stride + xpos / bs] if (!cm->mi_grid_visible[(ypos << suby) / MI_SIZE * cm->mi_stride +
(xpos << subx) / MI_SIZE]
->mbmi.skip) { // Not skip block ->mbmi.skip) { // Not skip block
// Temporary buffering needed if filtering in-place // Temporary buffering needed if filtering in-place
if (cache_ptr[cache_idx]) { if (cache_ptr[cache_idx]) {
@@ -161,50 +169,59 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
for (c = 0; c < bs; c++) { for (c = 0; c < bs; c++) {
*(uint64_t *)(d + c * sstride) = *(uint64_t *)(d + c * sstride) =
*(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2); *(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2);
if (bs == 8)
*(uint64_t *)(d + c * sstride + 4) = *(uint64_t *)(d + c * sstride + 4) =
*(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2 + 8); *(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2 + 8);
} }
} else { } else {
for (c = 0; c < bs; c++) for (c = 0; c < bs; c++)
if (bs == 8)
*(uint64_t *)(cache_dst[cache_idx] + c * sstride) = *(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
*(uint64_t *)(cache_ptr[cache_idx] + c * bs); *(uint64_t *)(cache_ptr[cache_idx] + c * bs);
else
*(uint32_t *)(cache_dst[cache_idx] + c * sstride) =
*(uint32_t *)(cache_ptr[cache_idx] + c * bs);
} }
#else #else
for (c = 0; c < bs; c++) for (c = 0; c < bs; c++)
if (bs == 8)
*(uint64_t *)(cache_dst[cache_idx] + c * sstride) = *(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
*(uint64_t *)(cache_ptr[cache_idx] + c * bs); *(uint64_t *)(cache_ptr[cache_idx] + c * bs);
else
*(uint32_t *)(cache_dst[cache_idx] + c * sstride) =
*(uint32_t *)(cache_ptr[cache_idx] + c * bs);
#endif #endif
} }
#if CONFIG_AOM_HIGHBITDEPTH #if CONFIG_AOM_HIGHBITDEPTH
if (cm->use_highbitdepth) { if (cm->use_highbitdepth) {
cache_ptr[cache_idx] = cache + cache_idx * bs * bs * 2; cache_ptr[cache_idx] = cache + cache_idx * bs * bs * 2;
dst.y_buffer = dst_buffer =
CONVERT_TO_BYTEPTR(cache_ptr[cache_idx]) - ypos * bs - xpos; CONVERT_TO_BYTEPTR(cache_ptr[cache_idx]) - ypos * bs - xpos;
} else { } else {
cache_ptr[cache_idx] = cache + cache_idx * bs * bs; cache_ptr[cache_idx] = cache + cache_idx * bs * bs;
dst.y_buffer = cache_ptr[cache_idx] - ypos * bs - xpos; dst_buffer = cache_ptr[cache_idx] - ypos * bs - xpos;
} }
#else #else
cache_ptr[cache_idx] = cache + cache_idx * bs * bs; cache_ptr[cache_idx] = cache + cache_idx * bs * bs;
dst.y_buffer = cache_ptr[cache_idx] - ypos * bs - xpos; dst_buffer = cache_ptr[cache_idx] - ypos * bs - xpos;
#endif #endif
cache_dst[cache_idx] = frame->y_buffer + ypos * sstride + xpos; cache_dst[cache_idx] = src_buffer + ypos * sstride + xpos;
if (++cache_idx >= cache_blocks) cache_idx = 0; if (++cache_idx >= cache_blocks) cache_idx = 0;
// Apply the filter // Apply the filter
#if CONFIG_AOM_HIGHBITDEPTH #if CONFIG_AOM_HIGHBITDEPTH
if (cm->use_highbitdepth) { if (cm->use_highbitdepth) {
aom_clpf_block_hbd(CONVERT_TO_SHORTPTR(frame->y_buffer), aom_clpf_block_hbd(CONVERT_TO_SHORTPTR(src_buffer),
CONVERT_TO_SHORTPTR(dst.y_buffer), sstride, CONVERT_TO_SHORTPTR(dst_buffer), sstride,
dstride, xpos, ypos, bs, bs, width, height, dstride, xpos, ypos, bs, bs, width, height,
strength); strength);
} else { } else {
aom_clpf_block(frame->y_buffer, dst.y_buffer, sstride, dstride, aom_clpf_block(src_buffer, dst_buffer, sstride, dstride, xpos,
xpos, ypos, bs, bs, width, height, strength); ypos, bs, bs, width, height, strength);
} }
#else #else
aom_clpf_block(frame->y_buffer, dst.y_buffer, sstride, dstride, aom_clpf_block(src_buffer, dst_buffer, sstride, dstride, xpos,
xpos, ypos, bs, bs, width, height, strength); ypos, bs, bs, width, height, strength);
#endif #endif
} }
} }
@@ -223,16 +240,25 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
for (c = 0; c < bs; c++) { for (c = 0; c < bs; c++) {
*(uint64_t *)(d + c * sstride) = *(uint64_t *)(d + c * sstride) =
*(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2); *(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2);
if (bs == 8)
*(uint64_t *)(d + c * sstride + 4) = *(uint64_t *)(d + c * sstride + 4) =
*(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2 + 8); *(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2 + 8);
} }
} else { } else {
for (c = 0; c < bs; c++) for (c = 0; c < bs; c++)
if (bs == 4)
*(uint32_t *)(cache_dst[cache_idx] + c * sstride) =
*(uint32_t *)(cache_ptr[cache_idx] + c * bs);
else
*(uint64_t *)(cache_dst[cache_idx] + c * sstride) = *(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
*(uint64_t *)(cache_ptr[cache_idx] + c * bs); *(uint64_t *)(cache_ptr[cache_idx] + c * bs);
} }
#else #else
for (c = 0; c < bs; c++) for (c = 0; c < bs; c++)
if (bs == 4)
*(uint32_t *)(cache_dst[cache_idx] + c * sstride) =
*(uint32_t *)(cache_ptr[cache_idx] + c * bs);
else
*(uint64_t *)(cache_dst[cache_idx] + c * sstride) = *(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
*(uint64_t *)(cache_ptr[cache_idx] + c * bs); *(uint64_t *)(cache_ptr[cache_idx] + c * bs);
#endif #endif

View File

@@ -20,10 +20,10 @@ int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int b);
int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame, int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
const YV12_BUFFER_CONFIG *org, AV1_COMMON *cm, const YV12_BUFFER_CONFIG *org, AV1_COMMON *cm,
int enable_fb_flag, unsigned int strength, int enable_fb_flag, unsigned int strength,
unsigned int fb_size_log2, uint8_t *blocks, unsigned int fb_size_log2, uint8_t *blocks, int plane,
int (*decision)(int, int, const YV12_BUFFER_CONFIG *, int (*decision)(int, int, const YV12_BUFFER_CONFIG *,
const YV12_BUFFER_CONFIG *, const YV12_BUFFER_CONFIG *,
const AV1_COMMON *cm, int, int, int, const AV1_COMMON *cm, int, int, int,
unsigned int, unsigned int, uint8_t *)); unsigned int, unsigned int, uint8_t *, int));
#endif #endif

View File

@@ -10,131 +10,165 @@
*/ */
#include "./aom_dsp_rtcd.h" #include "./aom_dsp_rtcd.h"
#include "aom_ports/mem.h"
SIMD_INLINE void calc_delta(v128 o, v128 x, v128 a, v128 b, v128 c, v128 d, // delta = 4/16 * clamp(a - o, -s, s) + 1/16 * clamp(b - o, -s, s) +
v128 e, v128 f, uint8_t *dst, v128 sp, v128 sm, // 3/16 * clamp(c - o, -s, s) + 3/16 * clamp(d - o, -s, s) +
int dstride) { // 1/16 * clamp(e - o, -s, s) + 4/16 * clamp(f - o, -s, s)
SIMD_INLINE v128 calc_delta(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
v128 f, v128 sp, v128 sm) {
// The difference will be 9 bit, offset by 128 so we can use saturated
// sub to avoid going to 16 bit temporarily before "strength" clipping.
const v128 c128 = v128_dup_8(128);
const v128 x = v128_add_8(c128, o);
const v128 c8 = v128_dup_8(8); const v128 c8 = v128_dup_8(8);
const v128 tmp = const v128 tmp = v128_add_8(
v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm), v128_max_s8(v128_min_s8(v128_ssub_s8(v128_add_8(c128, c), x), sp), sm),
v128_max_s8(v128_min_s8(v128_ssub_s8(d, x), sp), sm)); v128_max_s8(v128_min_s8(v128_ssub_s8(v128_add_8(c128, d), x), sp), sm));
const v128 delta = v128_add_8( const v128 delta = v128_add_8(
v128_add_8( v128_add_8(
v128_shl_8( v128_shl_8(
v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(a, x), sp), sm), v128_add_8(
v128_max_s8(v128_min_s8(v128_ssub_s8(f, x), sp), sm)), v128_max_s8(
v128_min_s8(v128_ssub_s8(v128_add_8(c128, a), x), sp),
sm),
v128_max_s8(
v128_min_s8(v128_ssub_s8(v128_add_8(c128, f), x), sp),
sm)),
2), 2),
v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(b, x), sp), sm), v128_add_8(
v128_max_s8(v128_min_s8(v128_ssub_s8(e, x), sp), sm))), v128_max_s8(v128_min_s8(v128_ssub_s8(v128_add_8(c128, b), x), sp),
sm),
v128_max_s8(v128_min_s8(v128_ssub_s8(v128_add_8(c128, e), x), sp),
sm))),
v128_add_8(v128_add_8(tmp, tmp), tmp)); v128_add_8(v128_add_8(tmp, tmp), tmp));
o = v128_add_8( return v128_add_8(
o, o,
v128_shr_s8( v128_shr_s8(
v128_add_8(c8, v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()))), v128_add_8(c8, v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()))),
4)); 4));
v64_store_aligned(dst, v128_high_v64(o));
v64_store_aligned(dst + dstride, v128_low_v64(o));
} }
static void clpf_block(const uint8_t *src, uint8_t *dst, int sstride, // Process blocks of width 8, two lines at a time, 8 bit.
static void clpf_block8(const uint8_t *src, uint8_t *dst, int sstride,
int dstride, int x0, int y0, int sizey, int width, int dstride, int x0, int y0, int sizey, int width,
int height, unsigned int strength) { int height, unsigned int strength) {
int bottom = height - 2 - y0; const int bottom = height - 2 - y0;
const int right = width - 8 - x0;
const v128 sp = v128_dup_8(strength); const v128 sp = v128_dup_8(strength);
const v128 sm = v128_dup_8(-(int)strength); const v128 sm = v128_dup_8(-(int)strength);
const v128 c128 = v128_dup_8(128); DECLARE_ALIGNED(16, static const uint64_t,
b_shuff[]) = { 0x0504030201000000LL, 0x0d0c0b0a09080808LL };
DECLARE_ALIGNED(16, static const uint64_t,
c_shuff[]) = { 0x0605040302010000LL, 0x0e0d0c0b0a090808LL };
DECLARE_ALIGNED(16, static const uint64_t,
d_shuff[]) = { 0x0707060504030201LL, 0x0f0f0e0d0c0b0a09LL };
DECLARE_ALIGNED(16, static const uint64_t,
e_shuff[]) = { 0x0707070605040302LL, 0x0f0f0f0e0d0c0b0aLL };
int y;
dst += x0 + y0 * dstride; dst += x0 + y0 * dstride;
src += x0 + y0 * sstride; src += x0 + y0 * sstride;
if (!x0) { // Clip left for (y = 0; y < sizey; y += 2) {
const v128 b_shuff = v128_from_v64(v64_from_64(0x0d0c0b0a09080808LL), const v64 l1 = v64_load_aligned(src);
v64_from_64(0x0504030201000000LL)); const v64 l2 = v64_load_aligned(src + sstride);
const v128 c_shuff = v128_from_v64(v64_from_64(0x0e0d0c0b0a090808LL), v128 o = v128_from_v64(l1, l2);
v64_from_64(0x0605040302010000LL)); const v128 a =
v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1);
v128 b = v128_from_v64(v64_load_unaligned(src - 2 * !!x0),
v64_load_unaligned(src - 2 * !!x0 + sstride));
v128 c = v128_from_v64(v64_load_unaligned(src - !!x0),
v64_load_unaligned(src - !!x0 + sstride));
v128 d = v128_from_v64(v64_load_unaligned(src + !!right),
v64_load_unaligned(src + !!right + sstride));
v128 e = v128_from_v64(v64_load_unaligned(src + 2 * !!right),
v64_load_unaligned(src + 2 * !!right + sstride));
const v128 f = v128_from_v64(
l2, v64_load_aligned(src + ((y != bottom) + 1) * sstride));
if (!x0) { // Left clipping
b = v128_shuffle_8(b, v128_load_aligned(b_shuff));
c = v128_shuffle_8(c, v128_load_aligned(c_shuff));
}
if (!right) { // Right clipping
d = v128_shuffle_8(d, v128_load_aligned(d_shuff));
e = v128_shuffle_8(e, v128_load_aligned(e_shuff));
}
o = calc_delta(o, a, b, c, d, e, f, sp, sm);
v64_store_aligned(dst, v128_high_v64(o));
v64_store_aligned(dst + dstride, v128_low_v64(o));
src += sstride * 2;
dst += dstride * 2;
}
}
// Process blocks of width 4, four lines at a time, 8 bit.
static void clpf_block4(const uint8_t *src, uint8_t *dst, int sstride,
int dstride, int x0, int y0, int sizey, int width,
int height, unsigned int strength) {
const v128 sp = v128_dup_8(strength);
const v128 sm = v128_dup_8(-(int)strength);
const int right = width - 4 - x0;
const int bottom = height - 4 - y0;
DECLARE_ALIGNED(16, static const uint64_t,
b_shuff[]) = { 0x0504040401000000LL, 0x0d0c0c0c09080808LL };
DECLARE_ALIGNED(16, static const uint64_t,
c_shuff[]) = { 0x0605040402010000LL, 0x0e0d0c0c0a090808LL };
DECLARE_ALIGNED(16, static const uint64_t,
d_shuff[]) = { 0x0707060503030201LL, 0x0f0f0e0d0b0b0a09LL };
DECLARE_ALIGNED(16, static const uint64_t,
e_shuff[]) = { 0x0707070603030302LL, 0x0f0f0f0e0b0b0b0aLL };
int y; int y;
for (y = 0; y < sizey; y += 2) { dst += x0 + y0 * dstride;
const v64 l1 = v64_load_aligned(src); src += x0 + y0 * sstride;
const v64 l2 = v64_load_aligned(src + sstride);
v128 o = v128_from_v64(l1, l2);
const v128 x = v128_add_8(c128, o);
const v128 a = v128_add_8(
c128,
v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1));
const v128 b = v128_shuffle_8(x, b_shuff);
const v128 c = v128_shuffle_8(x, c_shuff);
const v128 d = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(src + 1),
v64_load_unaligned(src + 1 + sstride)));
const v128 e = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(src + 2),
v64_load_unaligned(src + 2 + sstride)));
const v128 f = v128_add_8(
c128, v128_from_v64(
l2, v64_load_aligned(src + ((y != bottom) + 1) * sstride)));
calc_delta(o, x, a, b, c, d, e, f, dst, sp, sm, dstride);
src += sstride * 2;
dst += dstride * 2;
}
} else if (!(width - x0 - 8)) { // Clip right
const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0f0e0d0c0b0a09LL),
v64_from_64(0x0707060504030201LL));
const v128 e_shuff = v128_from_v64(v64_from_64(0x0f0f0f0e0d0c0b0aLL),
v64_from_64(0x0707070605040302LL));
int y;
for (y = 0; y < sizey; y += 2) { for (y = 0; y < sizey; y += 4) {
const v64 l1 = v64_load_aligned(src); const uint32_t l0 = u32_load_aligned(src - (y != -y0) * sstride);
const v64 l2 = v64_load_aligned(src + sstride); const uint32_t l1 = u32_load_aligned(src);
v128 o = v128_from_v64(l1, l2); const uint32_t l2 = u32_load_aligned(src + sstride);
const v128 x = v128_add_8(c128, o); const uint32_t l3 = u32_load_aligned(src + 2 * sstride);
const v128 a = v128_add_8( const uint32_t l4 = u32_load_aligned(src + 3 * sstride);
c128, const uint32_t l5 = u32_load_aligned(src + ((y != bottom) + 3) * sstride);
v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1)); v128 o = v128_from_32(l1, l2, l3, l4);
const v128 b = v128_add_8( const v128 a = v128_from_32(l0, l1, l2, l3);
c128, v128_from_v64(v64_load_unaligned(src - 2), v128 b = v128_from_32(u32_load_unaligned(src - 2 * !!x0),
v64_load_unaligned(src - 2 + sstride))); u32_load_unaligned(src + sstride - 2 * !!x0),
const v128 c = v128_add_8( u32_load_unaligned(src + 2 * sstride - 2 * !!x0),
c128, v128_from_v64(v64_load_unaligned(src - 1), u32_load_unaligned(src + 3 * sstride - 2 * !!x0));
v64_load_unaligned(src - 1 + sstride))); v128 c = v128_from_32(u32_load_unaligned(src - !!x0),
const v128 d = v128_shuffle_8(x, d_shuff); u32_load_unaligned(src + sstride - !!x0),
const v128 e = v128_shuffle_8(x, e_shuff); u32_load_unaligned(src + 2 * sstride - !!x0),
const v128 f = v128_add_8( u32_load_unaligned(src + 3 * sstride - !!x0));
c128, v128_from_v64( v128 d = v128_from_32(u32_load_unaligned(src + !!right),
l2, v64_load_aligned(src + ((y != bottom) + 1) * sstride))); u32_load_unaligned(src + sstride + !!right),
calc_delta(o, x, a, b, c, d, e, f, dst, sp, sm, dstride); u32_load_unaligned(src + 2 * sstride + !!right),
src += sstride * 2; u32_load_unaligned(src + 3 * sstride + !!right));
dst += dstride * 2; v128 e = v128_from_32(u32_load_unaligned(src + 2 * !!right),
u32_load_unaligned(src + sstride + 2 * !!right),
u32_load_unaligned(src + 2 * sstride + 2 * !!right),
u32_load_unaligned(src + 3 * sstride + 2 * !!right));
const v128 f = v128_from_32(l2, l3, l4, l5);
if (!x0) { // Left clipping
b = v128_shuffle_8(b, v128_load_aligned(b_shuff));
c = v128_shuffle_8(c, v128_load_aligned(c_shuff));
} }
} else { // No left/right clipping if (!right) { // Right clipping
int y; d = v128_shuffle_8(d, v128_load_aligned(d_shuff));
for (y = 0; y < sizey; y += 2) { e = v128_shuffle_8(e, v128_load_aligned(e_shuff));
const v64 l1 = v64_load_aligned(src);
const v64 l2 = v64_load_aligned(src + sstride);
v128 o = v128_from_v64(l1, l2);
const v128 x = v128_add_8(c128, o);
const v128 a = v128_add_8(
c128,
v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1));
const v128 b = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(src - 2),
v64_load_unaligned(src - 2 + sstride)));
const v128 c = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(src - 1),
v64_load_unaligned(src - 1 + sstride)));
const v128 d = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(src + 1),
v64_load_unaligned(src + 1 + sstride)));
const v128 e = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(src + 2),
v64_load_unaligned(src + 2 + sstride)));
const v128 f = v128_add_8(
c128, v128_from_v64(
l2, v64_load_aligned(src + ((y != bottom) + 1) * sstride)));
calc_delta(o, x, a, b, c, d, e, f, dst, sp, sm, dstride);
src += sstride * 2;
dst += dstride * 2;
} }
o = calc_delta(o, a, b, c, d, e, f, sp, sm);
u32_store_aligned(dst, v128_low_u32(v128_shr_n_byte(o, 12)));
u32_store_aligned(dst + dstride, v128_low_u32(v128_shr_n_byte(o, 8)));
u32_store_aligned(dst + 2 * dstride, v128_low_u32(v128_shr_n_byte(o, 4)));
u32_store_aligned(dst + 3 * dstride, v128_low_u32(o));
dst += 4 * dstride;
src += 4 * sstride;
} }
} }
@@ -142,24 +176,23 @@ void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int sstride,
int dstride, int x0, int y0, int sizex, int dstride, int x0, int y0, int sizex,
int sizey, int width, int height, int sizey, int width, int height,
unsigned int strength) { unsigned int strength) {
// TODO(stemidts): if ((sizex != 4 && sizex != 8) || y0 + 4 > height ||
// A sizex different from 8 will only be needed if CLPF is extended to chroma. (sizey & 3 && sizex == 4) || x0 + 4 > width) {
// This will only be used if 4:2:0 and width not a multiple of 16 and along
// the right edge only, so we can fall back to the plain C implementation in
// this case. If not extended to chroma, this test will be redundant.
if (sizex != 8 || width < 16 || y0 + 8 > height || x0 + 8 > width) {
// Fallback to C for odd sizes // Fallback to C for odd sizes
aom_clpf_block_c(src, dst, sstride, dstride, x0, y0, sizex, sizey, width, aom_clpf_block_c(src, dst, sstride, dstride, x0, y0, sizex, sizey, width,
height, strength); height, strength);
} else { } else {
clpf_block(src, dst, sstride, dstride, x0, y0, sizey, width, height, (sizex == 4 ? clpf_block4 : clpf_block8)(src, dst, sstride, dstride, x0, y0,
strength); sizey, width, height, strength);
} }
} }
#if CONFIG_AOM_HIGHBITDEPTH #if CONFIG_AOM_HIGHBITDEPTH
static void calc_delta_hbd(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e, // delta = 4/16 * clamp(a - o, -s, s) + 1/16 * clamp(b - o, -s, s) +
v128 f, uint16_t *dst, v128 sp, v128 sm) { // 3/16 * clamp(c - o, -s, s) + 3/16 * clamp(d - o, -s, s) +
// 1/16 * clamp(e - o, -s, s) + 4/16 * clamp(f - o, -s, s)
SIMD_INLINE v128 calc_delta_hbd(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
v128 f, v128 sp, v128 sm) {
const v128 c8 = v128_dup_16(8); const v128 c8 = v128_dup_16(8);
const v128 tmp = const v128 tmp =
v128_add_16(v128_max_s16(v128_min_s16(v128_sub_16(c, o), sp), sm), v128_add_16(v128_max_s16(v128_min_s16(v128_sub_16(c, o), sp), sm),
@@ -174,87 +207,138 @@ static void calc_delta_hbd(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
v128_add_16(v128_max_s16(v128_min_s16(v128_sub_16(b, o), sp), sm), v128_add_16(v128_max_s16(v128_min_s16(v128_sub_16(b, o), sp), sm),
v128_max_s16(v128_min_s16(v128_sub_16(e, o), sp), sm))), v128_max_s16(v128_min_s16(v128_sub_16(e, o), sp), sm))),
v128_add_16(v128_add_16(tmp, tmp), tmp)); v128_add_16(v128_add_16(tmp, tmp), tmp));
v128_store_aligned( return v128_add_16(
dst,
v128_add_16(
o, v128_shr_s16( o, v128_shr_s16(
v128_add_16(c8, v128_add_16(delta, v128_cmplt_s16( v128_add_16(
delta, v128_zero()))), c8, v128_add_16(delta, v128_cmplt_s16(delta, v128_zero()))),
4))); 4));
} }
SIMD_INLINE void clpf_block_hbd(const uint16_t *src, uint16_t *dst, int sstride, static void calc_delta_hbd4(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
int dstride, int x0, int y0, int sizey, v128 f, uint16_t *dst, v128 sp, v128 sm,
int width, int height, unsigned int strength) { int dstride) {
int y; o = calc_delta_hbd(o, a, b, c, d, e, f, sp, sm);
int bottom = height - 2 - y0; v64_store_aligned(dst, v128_high_v64(o));
v64_store_aligned(dst + dstride, v128_low_v64(o));
}
static void calc_delta_hbd8(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
v128 f, uint16_t *dst, v128 sp, v128 sm) {
v128_store_aligned(dst, calc_delta_hbd(o, a, b, c, d, e, f, sp, sm));
}
// Process blocks of width 4, two lines at time.
SIMD_INLINE void clpf_block_hbd4(const uint16_t *src, uint16_t *dst,
int sstride, int dstride, int x0, int y0,
int sizey, int width, int height,
unsigned int strength) {
const v128 sp = v128_dup_16(strength); const v128 sp = v128_dup_16(strength);
const v128 sm = v128_dup_16(-(int)strength); const v128 sm = v128_dup_16(-(int)strength);
const int right = width - 4 - x0;
const int bottom = height - 2 - y0;
DECLARE_ALIGNED(16, static const uint64_t,
b_shuff[]) = { 0x0302010001000100LL, 0x0b0a090809080908LL };
DECLARE_ALIGNED(16, static const uint64_t,
c_shuff[]) = { 0x0504030201000100LL, 0x0d0c0b0a09080908LL };
DECLARE_ALIGNED(16, static const uint64_t,
d_shuff[]) = { 0x0706070605040302LL, 0x0f0e0f0e0d0c0b0aLL };
DECLARE_ALIGNED(16, static const uint64_t,
e_shuff[]) = { 0x0706070607060504LL, 0x0f0e0f0e0f0e0d0cLL };
int y;
dst += x0 + y0 * dstride; dst += x0 + y0 * dstride;
src += x0 + y0 * sstride; src += x0 + y0 * sstride;
if (!x0) { // Clip left for (y = 0; y < sizey; y += 2) {
const v128 b_shuff = v128_from_v64(v64_from_64(0x0b0a090807060504LL), const v64 l1 = v64_load_aligned(src);
v64_from_64(0x0302010001000100LL)); const v64 l2 = v64_load_aligned(src + sstride);
const v128 c_shuff = v128_from_v64(v64_from_64(0x0d0c0b0a09080706LL), v128 o = v128_from_v64(l1, l2);
v64_from_64(0x0504030201000100LL)); const v128 a =
v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1);
v128 b = v128_from_v64(v64_load_unaligned(src - 2 * !!x0),
v64_load_unaligned(src - 2 * !!x0 + sstride));
v128 c = v128_from_v64(v64_load_unaligned(src - !!x0),
v64_load_unaligned(src - !!x0 + sstride));
v128 d = v128_from_v64(v64_load_unaligned(src + !!right),
v64_load_unaligned(src + !!right + sstride));
v128 e = v128_from_v64(v64_load_unaligned(src + 2 * !!right),
v64_load_unaligned(src + 2 * !!right + sstride));
const v128 f = v128_from_v64(
l2, v64_load_aligned(src + ((y != bottom) + 1) * sstride));
if (!x0) { // Left clipping
b = v128_shuffle_8(b, v128_load_aligned(b_shuff));
c = v128_shuffle_8(c, v128_load_aligned(c_shuff));
}
if (!right) { // Right clipping
d = v128_shuffle_8(d, v128_load_aligned(d_shuff));
e = v128_shuffle_8(e, v128_load_aligned(e_shuff));
}
calc_delta_hbd4(o, a, b, c, d, e, f, dst, sp, sm, dstride);
src += sstride * 2;
dst += dstride * 2;
}
}
// The most simple case. Start here if you need to understand the functions.
SIMD_INLINE void clpf_block_hbd(const uint16_t *src, uint16_t *dst, int sstride,
int dstride, int x0, int y0, int sizey,
int width, int height, unsigned int strength) {
const v128 sp = v128_dup_16(strength);
const v128 sm = v128_dup_16(-(int)strength);
const int right = width - 8 - x0;
const int bottom = height - 2 - y0;
DECLARE_ALIGNED(16, static const uint64_t,
b_shuff[]) = { 0x0302010001000100LL, 0x0b0a090807060504LL };
DECLARE_ALIGNED(16, static const uint64_t,
c_shuff[]) = { 0x0504030201000100LL, 0x0d0c0b0a09080706LL };
DECLARE_ALIGNED(16, static const uint64_t,
d_shuff[]) = { 0x0908070605040302LL, 0x0f0e0f0e0d0c0b0aLL };
DECLARE_ALIGNED(16, static const uint64_t,
e_shuff[]) = { 0x0b0a090807060504LL, 0x0f0e0f0e0f0e0d0cLL };
int y;
dst += x0 + y0 * dstride;
src += x0 + y0 * sstride;
// Read 8 set of pixels at a time. Clipping along upper and lower
// edges is handled by reading the upper or lower line twice.
// Clipping along the left and right edges is handled by shuffle
// instructions doing shift and pad.
for (y = 0; y < sizey; y++) { for (y = 0; y < sizey; y++) {
const v128 o = v128_load_aligned(src); const v128 o = v128_load_aligned(src);
const v128 a = v128_load_aligned(src - (y != -y0) * sstride); const v128 a = v128_load_aligned(src - (y != -y0) * sstride);
const v128 b = v128_shuffle_8(o, b_shuff);
const v128 c = v128_shuffle_8(o, c_shuff);
const v128 d = v128_load_unaligned(src + 1);
const v128 e = v128_load_unaligned(src + 2);
const v128 f = v128_load_aligned(src + (y - 1 != bottom) * sstride); const v128 f = v128_load_aligned(src + (y - 1 != bottom) * sstride);
calc_delta_hbd(o, a, b, c, d, e, f, dst, sp, sm); v128 b = v128_load_unaligned(src - 2 * !!x0);
v128 c = v128_load_unaligned(src - !!x0);
v128 d = v128_load_unaligned(src + !!right);
v128 e = v128_load_unaligned(src + 2 * !!right);
if (!x0) { // Left clipping
b = v128_shuffle_8(b, v128_load_aligned(b_shuff));
c = v128_shuffle_8(c, v128_load_aligned(c_shuff));
}
if (!right) { // Right clipping
d = v128_shuffle_8(d, v128_load_aligned(d_shuff));
e = v128_shuffle_8(e, v128_load_aligned(e_shuff));
}
calc_delta_hbd8(o, a, b, c, d, e, f, dst, sp, sm);
src += sstride; src += sstride;
dst += dstride; dst += dstride;
} }
} else if (!(width - x0 - 8)) { // Clip right
const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0e0f0e0d0c0b0aLL),
v64_from_64(0x0908070605040302LL));
const v128 e_shuff = v128_from_v64(v64_from_64(0x0f0e0f0e0f0e0d0cLL),
v64_from_64(0x0b0a090807060504LL));
for (y = 0; y < sizey; y++) {
const v128 o = v128_load_aligned(src);
const v128 a = v128_load_aligned(src - (y != -y0) * sstride);
const v128 b = v128_load_unaligned(src - 2);
const v128 c = v128_load_unaligned(src - 1);
const v128 d = v128_shuffle_8(o, d_shuff);
const v128 e = v128_shuffle_8(o, e_shuff);
const v128 f = v128_load_aligned(src + (y - 1 != bottom) * sstride);
calc_delta_hbd(o, a, b, c, d, e, f, dst, sp, sm);
src += sstride;
dst += dstride;
}
} else { // No left/right clipping
for (y = 0; y < sizey; y++) {
const v128 o = v128_load_aligned(src);
const v128 a = v128_load_aligned(src - (y != -y0) * sstride);
const v128 b = v128_load_unaligned(src - 2);
const v128 c = v128_load_unaligned(src - 1);
const v128 d = v128_load_unaligned(src + 1);
const v128 e = v128_load_unaligned(src + 2);
const v128 f = v128_load_aligned(src + (y - 1 != bottom) * sstride);
calc_delta_hbd(o, a, b, c, d, e, f, dst, sp, sm);
src += sstride;
dst += dstride;
}
}
} }
void SIMD_FUNC(aom_clpf_block_hbd)(const uint16_t *src, uint16_t *dst, void SIMD_FUNC(aom_clpf_block_hbd)(const uint16_t *src, uint16_t *dst,
int sstride, int dstride, int x0, int y0, int sstride, int dstride, int x0, int y0,
int sizex, int sizey, int width, int height, int sizex, int sizey, int width, int height,
unsigned int strength) { unsigned int strength) {
if (sizex != 8 || width < 16 || y0 + 8 > height || x0 + 8 > width) { if ((sizex != 4 && sizex != 8) || y0 + 4 > height || x0 + 4 > width) {
// Fallback to C for odd sizes // Fallback to C for odd sizes
aom_clpf_block_hbd_c(src, dst, sstride, dstride, x0, y0, sizex, sizey, aom_clpf_block_hbd_c(src, dst, sstride, dstride, x0, y0, sizex, sizey,
width, height, strength); width, height, strength);
} else { } else {
clpf_block_hbd(src, dst, sstride, dstride, x0, y0, sizey, width, height, (sizex == 4 ? clpf_block_hbd4 : clpf_block_hbd)(
strength); src, dst, sstride, dstride, x0, y0, sizey, width, height, strength);
} }
} }
#endif #endif

View File

@@ -153,7 +153,9 @@ typedef struct AV1Common {
#if CONFIG_CLPF #if CONFIG_CLPF
int clpf_numblocks; int clpf_numblocks;
int clpf_size; int clpf_size;
int clpf_strength; int clpf_strength_y;
int clpf_strength_u;
int clpf_strength_v;
uint8_t *clpf_blocks; uint8_t *clpf_blocks;
#endif #endif

View File

@@ -29,6 +29,7 @@
#include "av1/common/alloccommon.h" #include "av1/common/alloccommon.h"
#if CONFIG_CLPF #if CONFIG_CLPF
#include "aom/aom_image.h"
#include "av1/common/clpf.h" #include "av1/common/clpf.h"
#endif #endif
#include "av1/common/common.h" #include "av1/common/common.h"
@@ -2046,8 +2047,10 @@ static void setup_loopfilter(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
#if CONFIG_CLPF #if CONFIG_CLPF
static void setup_clpf(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) { static void setup_clpf(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
cm->clpf_blocks = 0; cm->clpf_blocks = 0;
cm->clpf_strength = aom_rb_read_literal(rb, 2); cm->clpf_strength_y = aom_rb_read_literal(rb, 2);
if (cm->clpf_strength) { cm->clpf_strength_u = aom_rb_read_literal(rb, 2);
cm->clpf_strength_v = aom_rb_read_literal(rb, 2);
if (cm->clpf_strength_y) {
cm->clpf_size = aom_rb_read_literal(rb, 2); cm->clpf_size = aom_rb_read_literal(rb, 2);
if (cm->clpf_size) { if (cm->clpf_size) {
int i; int i;
@@ -2065,7 +2068,8 @@ static int clpf_bit(UNUSED int k, UNUSED int l,
UNUSED const YV12_BUFFER_CONFIG *org, UNUSED const YV12_BUFFER_CONFIG *org,
UNUSED const AV1_COMMON *cm, UNUSED int block_size, UNUSED const AV1_COMMON *cm, UNUSED int block_size,
UNUSED int w, UNUSED int h, UNUSED unsigned int strength, UNUSED int w, UNUSED int h, UNUSED unsigned int strength,
UNUSED unsigned int fb_size_log2, uint8_t *bit) { UNUSED unsigned int fb_size_log2, uint8_t *bit,
UNUSED int comp) {
return *bit; return *bit;
} }
#endif #endif
@@ -3928,10 +3932,23 @@ void av1_decode_frame(AV1Decoder *pbi, const uint8_t *data,
#endif // CONFIG_LOOP_RESTORATION #endif // CONFIG_LOOP_RESTORATION
#if CONFIG_CLPF #if CONFIG_CLPF
if (cm->clpf_strength && !cm->skip_loop_filter) { if (!cm->skip_loop_filter) {
av1_clpf_frame(&pbi->cur_buf->buf, 0, cm, !!cm->clpf_size, const YV12_BUFFER_CONFIG *const frame = &pbi->cur_buf->buf;
cm->clpf_strength + (cm->clpf_strength == 3), if (cm->clpf_strength_y) {
4 + cm->clpf_size, cm->clpf_blocks, clpf_bit); av1_clpf_frame(frame, NULL, cm, !!cm->clpf_size,
cm->clpf_strength_y + (cm->clpf_strength_y == 3),
4 + cm->clpf_size, cm->clpf_blocks, AOM_PLANE_Y, clpf_bit);
}
if (cm->clpf_strength_u) {
av1_clpf_frame(frame, NULL, cm, 0,
cm->clpf_strength_u + (cm->clpf_strength_u == 3), 4, NULL,
AOM_PLANE_U, NULL);
}
if (cm->clpf_strength_v) {
av1_clpf_frame(frame, NULL, cm, 0,
cm->clpf_strength_v + (cm->clpf_strength_v == 3), 4, NULL,
AOM_PLANE_V, NULL);
}
} }
if (cm->clpf_blocks) aom_free(cm->clpf_blocks); if (cm->clpf_blocks) aom_free(cm->clpf_blocks);
#endif #endif

View File

@@ -2590,8 +2590,10 @@ static void encode_loopfilter(AV1_COMMON *cm, struct aom_write_bit_buffer *wb) {
#if CONFIG_CLPF #if CONFIG_CLPF
static void encode_clpf(const AV1_COMMON *cm, struct aom_write_bit_buffer *wb) { static void encode_clpf(const AV1_COMMON *cm, struct aom_write_bit_buffer *wb) {
aom_wb_write_literal(wb, cm->clpf_strength, 2); aom_wb_write_literal(wb, cm->clpf_strength_y, 2);
if (cm->clpf_strength) { aom_wb_write_literal(wb, cm->clpf_strength_u, 2);
aom_wb_write_literal(wb, cm->clpf_strength_v, 2);
if (cm->clpf_strength_y) {
aom_wb_write_literal(wb, cm->clpf_size, 2); aom_wb_write_literal(wb, cm->clpf_size, 2);
if (cm->clpf_size) { if (cm->clpf_size) {
int i; int i;

View File

@@ -11,16 +11,17 @@
#include "av1/common/clpf.h" #include "av1/common/clpf.h"
#include "./aom_dsp_rtcd.h" #include "./aom_dsp_rtcd.h"
#include "aom/aom_image.h"
#include "aom/aom_integer.h" #include "aom/aom_integer.h"
#include "av1/common/quant_common.h" #include "av1/common/quant_common.h"
// Calculate the error of a filtered and unfiltered block // Calculate the error of a filtered and unfiltered block
void aom_clpf_detect_c(const uint8_t *rec, const uint8_t *org, int rstride, void aom_clpf_detect_c(const uint8_t *rec, const uint8_t *org, int rstride,
int ostride, int x0, int y0, int width, int height, int ostride, int x0, int y0, int width, int height,
int *sum0, int *sum1, unsigned int strength) { int *sum0, int *sum1, unsigned int strength, int size) {
int x, y; int x, y;
for (y = y0; y < y0 + 8; y++) { for (y = y0; y < y0 + size; y++) {
for (x = x0; x < x0 + 8; x++) { for (x = x0; x < x0 + size; x++) {
int O = org[y * ostride + x]; int O = org[y * ostride + x];
int X = rec[y * rstride + x]; int X = rec[y * rstride + x];
int A = rec[AOMMAX(0, y - 1) * rstride + x]; int A = rec[AOMMAX(0, y - 1) * rstride + x];
@@ -39,11 +40,11 @@ void aom_clpf_detect_c(const uint8_t *rec, const uint8_t *org, int rstride,
void aom_clpf_detect_multi_c(const uint8_t *rec, const uint8_t *org, void aom_clpf_detect_multi_c(const uint8_t *rec, const uint8_t *org,
int rstride, int ostride, int x0, int y0, int rstride, int ostride, int x0, int y0,
int width, int height, int *sum) { int width, int height, int *sum, int size) {
int x, y; int x, y;
for (y = y0; y < y0 + 8; y++) { for (y = y0; y < y0 + size; y++) {
for (x = x0; x < x0 + 8; x++) { for (x = x0; x < x0 + size; x++) {
int O = org[y * ostride + x]; int O = org[y * ostride + x];
int X = rec[y * rstride + x]; int X = rec[y * rstride + x];
int A = rec[AOMMAX(0, y - 1) * rstride + x]; int A = rec[AOMMAX(0, y - 1) * rstride + x];
@@ -71,10 +72,10 @@ void aom_clpf_detect_multi_c(const uint8_t *rec, const uint8_t *org,
void aom_clpf_detect_hbd_c(const uint16_t *rec, const uint16_t *org, void aom_clpf_detect_hbd_c(const uint16_t *rec, const uint16_t *org,
int rstride, int ostride, int x0, int y0, int width, int rstride, int ostride, int x0, int y0, int width,
int height, int *sum0, int *sum1, int height, int *sum0, int *sum1,
unsigned int strength, int shift) { unsigned int strength, int shift, int size) {
int x, y; int x, y;
for (y = y0; y < y0 + 8; y++) { for (y = y0; y < y0 + size; y++) {
for (x = x0; x < x0 + 8; x++) { for (x = x0; x < x0 + size; x++) {
int O = org[y * ostride + x] >> shift; int O = org[y * ostride + x] >> shift;
int X = rec[y * rstride + x] >> shift; int X = rec[y * rstride + x] >> shift;
int A = rec[AOMMAX(0, y - 1) * rstride + x] >> shift; int A = rec[AOMMAX(0, y - 1) * rstride + x] >> shift;
@@ -94,11 +95,12 @@ void aom_clpf_detect_hbd_c(const uint16_t *rec, const uint16_t *org,
// aom_clpf_detect_multi_c() apart from "rec" and "org". // aom_clpf_detect_multi_c() apart from "rec" and "org".
void aom_clpf_detect_multi_hbd_c(const uint16_t *rec, const uint16_t *org, void aom_clpf_detect_multi_hbd_c(const uint16_t *rec, const uint16_t *org,
int rstride, int ostride, int x0, int y0, int rstride, int ostride, int x0, int y0,
int width, int height, int *sum, int shift) { int width, int height, int *sum, int shift,
int size) {
int x, y; int x, y;
for (y = y0; y < y0 + 8; y++) { for (y = y0; y < y0 + size; y++) {
for (x = x0; x < x0 + 8; x++) { for (x = x0; x < x0 + size; x++) {
int O = org[y * ostride + x] >> shift; int O = org[y * ostride + x] >> shift;
int X = rec[y * rstride + x] >> shift; int X = rec[y * rstride + x] >> shift;
int A = rec[AOMMAX(0, y - 1) * rstride + x] >> shift; int A = rec[AOMMAX(0, y - 1) * rstride + x] >> shift;
@@ -125,31 +127,45 @@ void aom_clpf_detect_multi_hbd_c(const uint16_t *rec, const uint16_t *org,
int av1_clpf_decision(int k, int l, const YV12_BUFFER_CONFIG *rec, int av1_clpf_decision(int k, int l, const YV12_BUFFER_CONFIG *rec,
const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm, const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
int block_size, int w, int h, unsigned int strength, int block_size, int w, int h, unsigned int strength,
unsigned int fb_size_log2, uint8_t *res) { unsigned int fb_size_log2, uint8_t *res, int plane) {
int m, n, sum0 = 0, sum1 = 0; int m, n, sum0 = 0, sum1 = 0;
const int subx = plane != AOM_PLANE_Y && rec->subsampling_x;
const int suby = plane != AOM_PLANE_Y && rec->subsampling_y;
uint8_t *rec_buffer =
plane != AOM_PLANE_Y
? (plane == AOM_PLANE_U ? rec->u_buffer : rec->v_buffer)
: rec->y_buffer;
uint8_t *org_buffer =
plane != AOM_PLANE_Y
? (plane == AOM_PLANE_U ? org->u_buffer : org->v_buffer)
: org->y_buffer;
int rec_width = plane != AOM_PLANE_Y ? rec->uv_crop_width : rec->y_crop_width;
int rec_height =
plane != AOM_PLANE_Y ? rec->uv_crop_height : rec->y_crop_height;
int rec_stride = plane != AOM_PLANE_Y ? rec->uv_stride : rec->y_stride;
int org_stride = plane != AOM_PLANE_Y ? org->uv_stride : org->y_stride;
for (m = 0; m < h; m++) { for (m = 0; m < h; m++) {
for (n = 0; n < w; n++) { for (n = 0; n < w; n++) {
int xpos = (l << fb_size_log2) + n * block_size; int xpos = (l << fb_size_log2) + n * block_size;
int ypos = (k << fb_size_log2) + m * block_size; int ypos = (k << fb_size_log2) + m * block_size;
const int bs = MAX_MIB_SIZE; if (!cm->mi_grid_visible[(ypos << suby) / MI_SIZE * cm->mi_stride +
if (!cm->mi_grid_visible[ypos / bs * cm->mi_stride + xpos / bs] (xpos << subx) / MI_SIZE]
->mbmi.skip) { ->mbmi.skip) {
#if CONFIG_AOM_HIGHBITDEPTH #if CONFIG_AOM_HIGHBITDEPTH
if (cm->use_highbitdepth) { if (cm->use_highbitdepth) {
aom_clpf_detect_hbd(CONVERT_TO_SHORTPTR(rec->y_buffer), aom_clpf_detect_hbd(
CONVERT_TO_SHORTPTR(org->y_buffer), rec->y_stride, CONVERT_TO_SHORTPTR(rec_buffer), CONVERT_TO_SHORTPTR(org_buffer),
org->y_stride, xpos, ypos, rec->y_crop_width, rec_stride, org_stride, xpos, ypos, rec_width, rec_height, &sum0,
rec->y_crop_height, &sum0, &sum1, strength, &sum1, strength, cm->bit_depth - 8, block_size);
cm->bit_depth - 8);
} else { } else {
aom_clpf_detect(rec->y_buffer, org->y_buffer, rec->y_stride, aom_clpf_detect(rec_buffer, org_buffer, rec_stride, org_stride, xpos,
org->y_stride, xpos, ypos, rec->y_crop_width, ypos, rec_width, rec_height, &sum0, &sum1, strength,
rec->y_crop_height, &sum0, &sum1, strength); block_size);
} }
#else #else
aom_clpf_detect(rec->y_buffer, org->y_buffer, rec->y_stride, aom_clpf_detect(rec_buffer, org_buffer, rec_stride, org_stride, xpos,
org->y_stride, xpos, ypos, rec->y_crop_width, ypos, rec_width, rec_height, &sum0, &sum1, strength,
rec->y_crop_height, &sum0, &sum1, strength); block_size);
#endif #endif
} }
} }
@@ -161,6 +177,7 @@ int av1_clpf_decision(int k, int l, const YV12_BUFFER_CONFIG *rec,
// Calculate the square error of all filter settings. Result: // Calculate the square error of all filter settings. Result:
// res[0][0] : unfiltered // res[0][0] : unfiltered
// res[0][1-3] : strength=1,2,4, no signals // res[0][1-3] : strength=1,2,4, no signals
// (Only for luma:)
// res[1][0] : (bit count, fb size = 128) // res[1][0] : (bit count, fb size = 128)
// res[1][1-3] : strength=1,2,4, fb size = 128 // res[1][1-3] : strength=1,2,4, fb size = 128
// res[2][0] : (bit count, fb size = 64) // res[2][0] : (bit count, fb size = 64)
@@ -170,12 +187,28 @@ int av1_clpf_decision(int k, int l, const YV12_BUFFER_CONFIG *rec,
static int clpf_rdo(int y, int x, const YV12_BUFFER_CONFIG *rec, static int clpf_rdo(int y, int x, const YV12_BUFFER_CONFIG *rec,
const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm, const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
unsigned int block_size, unsigned int fb_size_log2, int w, unsigned int block_size, unsigned int fb_size_log2, int w,
int h, int64_t res[4][4]) { int h, int64_t res[4][4], int plane) {
int c, m, n, filtered = 0; int c, m, n, filtered = 0;
int sum[4]; int sum[4];
const int subx = plane != AOM_PLANE_Y && rec->subsampling_x;
const int suby = plane != AOM_PLANE_Y && rec->subsampling_y;
int bslog = get_msb(block_size); int bslog = get_msb(block_size);
uint8_t *rec_buffer =
plane != AOM_PLANE_Y
? (plane == AOM_PLANE_U ? rec->u_buffer : rec->v_buffer)
: rec->y_buffer;
uint8_t *org_buffer =
plane != AOM_PLANE_Y
? (plane == AOM_PLANE_U ? org->u_buffer : org->v_buffer)
: org->y_buffer;
int rec_width = plane != AOM_PLANE_Y ? rec->uv_crop_width : rec->y_crop_width;
int rec_height =
plane != AOM_PLANE_Y ? rec->uv_crop_height : rec->y_crop_height;
int rec_stride = plane != AOM_PLANE_Y ? rec->uv_stride : rec->y_stride;
int org_stride = plane != AOM_PLANE_Y ? org->uv_stride : org->y_stride;
sum[0] = sum[1] = sum[2] = sum[3] = 0; sum[0] = sum[1] = sum[2] = sum[3] = 0;
if (fb_size_log2 > (unsigned int)get_msb(MAX_FB_SIZE) - 3) { if (plane == AOM_PLANE_Y &&
fb_size_log2 > (unsigned int)get_msb(MAX_FB_SIZE) - 3) {
int w1, h1, w2, h2, i, sum1, sum2, sum3, oldfiltered; int w1, h1, w2, h2, i, sum1, sum2, sum3, oldfiltered;
fb_size_log2--; fb_size_log2--;
@@ -190,16 +223,17 @@ static int clpf_rdo(int y, int x, const YV12_BUFFER_CONFIG *rec,
oldfiltered = res[i][0]; oldfiltered = res[i][0];
res[i][0] = 0; res[i][0] = 0;
filtered = filtered = clpf_rdo(y, x, rec, org, cm, block_size, fb_size_log2, w1, h1,
clpf_rdo(y, x, rec, org, cm, block_size, fb_size_log2, w1, h1, res); res, plane);
if (1 << (fb_size_log2 - bslog) < w) if (1 << (fb_size_log2 - bslog) < w)
filtered |= clpf_rdo(y, x + (1 << fb_size_log2), rec, org, cm, block_size, filtered |= clpf_rdo(y, x + (1 << fb_size_log2), rec, org, cm, block_size,
fb_size_log2, w2, h1, res); fb_size_log2, w2, h1, res, plane);
if (1 << (fb_size_log2 - bslog) < h) { if (1 << (fb_size_log2 - bslog) < h) {
filtered |= clpf_rdo(y + (1 << fb_size_log2), x, rec, org, cm, block_size, filtered |= clpf_rdo(y + (1 << fb_size_log2), x, rec, org, cm, block_size,
fb_size_log2, w1, h2, res); fb_size_log2, w1, h2, res, plane);
filtered |= clpf_rdo(y + (1 << fb_size_log2), x + (1 << fb_size_log2), filtered |=
rec, org, cm, block_size, fb_size_log2, w2, h2, res); clpf_rdo(y + (1 << fb_size_log2), x + (1 << fb_size_log2), rec, org,
cm, block_size, fb_size_log2, w2, h2, res, plane);
} }
res[i][1] = AOMMIN(sum1 + res[i][0], res[i][1]); res[i][1] = AOMMIN(sum1 + res[i][0], res[i][1]);
@@ -213,32 +247,31 @@ static int clpf_rdo(int y, int x, const YV12_BUFFER_CONFIG *rec,
for (n = 0; n < w; n++) { for (n = 0; n < w; n++) {
int xpos = x + n * block_size; int xpos = x + n * block_size;
int ypos = y + m * block_size; int ypos = y + m * block_size;
if (!cm->mi_grid_visible[ypos / MAX_MIB_SIZE * cm->mi_stride + if (!cm->mi_grid_visible[(ypos << suby) / MI_SIZE * cm->mi_stride +
xpos / MAX_MIB_SIZE] (xpos << subx) / MI_SIZE]
->mbmi.skip) { ->mbmi.skip) {
#if CONFIG_AOM_HIGHBITDEPTH #if CONFIG_AOM_HIGHBITDEPTH
if (cm->use_highbitdepth) { if (cm->use_highbitdepth) {
aom_clpf_detect_multi_hbd(CONVERT_TO_SHORTPTR(rec->y_buffer), aom_clpf_detect_multi_hbd(
CONVERT_TO_SHORTPTR(org->y_buffer), CONVERT_TO_SHORTPTR(rec_buffer), CONVERT_TO_SHORTPTR(org_buffer),
rec->y_stride, org->y_stride, xpos, ypos, rec_stride, org_stride, xpos, ypos, rec_width, rec_height, sum,
rec->y_crop_width, rec->y_crop_height, sum, cm->bit_depth - 8, block_size);
cm->bit_depth - 8);
} else { } else {
aom_clpf_detect_multi(rec->y_buffer, org->y_buffer, rec->y_stride, aom_clpf_detect_multi(rec_buffer, org_buffer, rec_stride, org_stride,
org->y_stride, xpos, ypos, rec->y_crop_width, xpos, ypos, rec_width, rec_height, sum,
rec->y_crop_height, sum); block_size);
} }
#else #else
aom_clpf_detect_multi(rec->y_buffer, org->y_buffer, rec->y_stride, aom_clpf_detect_multi(rec_buffer, org_buffer, rec_stride, org_stride,
org->y_stride, xpos, ypos, rec->y_crop_width, xpos, ypos, rec_width, rec_height, sum,
rec->y_crop_height, sum); block_size);
#endif #endif
filtered = 1; filtered = 1;
} }
} }
} }
for (c = 0; c < 4; c++) { for (c = 0; c < (plane == AOM_PLANE_Y ? 4 : 1); c++) {
res[c][0] += sum[0]; res[c][0] += sum[0];
res[c][1] += sum[1]; res[c][1] += sum[1];
res[c][2] += sum[2]; res[c][2] += sum[2];
@@ -249,17 +282,26 @@ static int clpf_rdo(int y, int x, const YV12_BUFFER_CONFIG *rec,
void av1_clpf_test_frame(const YV12_BUFFER_CONFIG *rec, void av1_clpf_test_frame(const YV12_BUFFER_CONFIG *rec,
const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm, const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
int *best_strength, int *best_bs) { int *best_strength, int *best_bs, int plane) {
int c, j, k, l; int c, j, k, l;
int64_t best, sums[4][4]; int64_t best, sums[4][4];
int width = rec->y_crop_width, height = rec->y_crop_height; int width = plane != AOM_PLANE_Y ? rec->uv_crop_width : rec->y_crop_width;
const int bs = MAX_MIB_SIZE; int height = plane != AOM_PLANE_Y ? rec->uv_crop_height : rec->y_crop_height;
const int bs = MI_SIZE;
const int bslog = get_msb(bs);
int fb_size_log2 = get_msb(MAX_FB_SIZE); int fb_size_log2 = get_msb(MAX_FB_SIZE);
int num_fb_ver = (height + (1 << fb_size_log2) - bs) >> fb_size_log2; int num_fb_ver = (height + (1 << fb_size_log2) - bs) >> fb_size_log2;
int num_fb_hor = (width + (1 << fb_size_log2) - bs) >> fb_size_log2; int num_fb_hor = (width + (1 << fb_size_log2) - bs) >> fb_size_log2;
memset(sums, 0, sizeof(sums)); memset(sums, 0, sizeof(sums));
if (plane != AOM_PLANE_Y)
// Use a block size of MI_SIZE regardless of the subsampling. This
// This is accurate enough to determine the best strength and
// we don't need to add SIMD optimisations for 4x4 blocks.
clpf_rdo(0, 0, rec, org, cm, bs, fb_size_log2, width >> bslog,
height >> bslog, sums, plane);
else
for (k = 0; k < num_fb_ver; k++) { for (k = 0; k < num_fb_ver; k++) {
for (l = 0; l < num_fb_hor; l++) { for (l = 0; l < num_fb_hor; l++) {
// Calculate the block size after frame border clipping // Calculate the block size after frame border clipping
@@ -269,11 +311,14 @@ void av1_clpf_test_frame(const YV12_BUFFER_CONFIG *rec,
AOMMIN(width, (l + 1) << fb_size_log2) & ((1 << fb_size_log2) - 1); AOMMIN(width, (l + 1) << fb_size_log2) & ((1 << fb_size_log2) - 1);
h += !h << fb_size_log2; h += !h << fb_size_log2;
w += !w << fb_size_log2; w += !w << fb_size_log2;
clpf_rdo(k << fb_size_log2, l << fb_size_log2, rec, org, cm, bs, clpf_rdo(k << fb_size_log2, l << fb_size_log2, rec, org, cm, MI_SIZE,
fb_size_log2, w / bs, h / bs, sums); fb_size_log2, w >> bslog, h >> bslog, sums, plane);
} }
} }
if (plane != AOM_PLANE_Y) // Slightly favour unfiltered chroma
sums[0][0] -= sums[0][0] >> 7;
for (j = 0; j < 4; j++) { for (j = 0; j < 4; j++) {
static const double lambda_square[] = { static const double lambda_square[] = {
// exp(x / 8.5) // exp(x / 8.5)
@@ -290,13 +335,13 @@ void av1_clpf_test_frame(const YV12_BUFFER_CONFIG *rec,
// Estimate the bit costs and adjust the square errors // Estimate the bit costs and adjust the square errors
double lambda = double lambda =
lambda_square[av1_get_qindex(&cm->seg, 0, cm->base_qindex) >> 2]; lambda_square[av1_get_qindex(&cm->seg, 0, cm->base_qindex) >> 2];
int i, cost = (int)((lambda * (sums[j][0] + 2 + 2 * (j > 0)) + 0.5)); int i, cost = (int)((lambda * (sums[j][0] + 6 + 2 * (j > 0)) + 0.5));
for (i = 0; i < 4; i++) for (i = 0; i < 4; i++)
sums[j][i] = ((sums[j][i] + (i && j) * cost) << 4) + j * 4 + i; sums[j][i] = ((sums[j][i] + (i && j) * cost) << 4) + j * 4 + i;
} }
best = (int64_t)1 << 62; best = (int64_t)1 << 62;
for (c = 0; c < 4; c++) for (c = 0; c < (plane == AOM_PLANE_Y ? 4 : 1); c++)
for (j = 0; j < 4; j++) for (j = 0; j < 4; j++)
if ((!c || j) && sums[c][j] < best) best = sums[c][j]; if ((!c || j) && sums[c][j] < best) best = sums[c][j];
best &= 15; best &= 15;

View File

@@ -17,10 +17,10 @@
int av1_clpf_decision(int k, int l, const YV12_BUFFER_CONFIG *rec, int av1_clpf_decision(int k, int l, const YV12_BUFFER_CONFIG *rec,
const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm, const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
int block_size, int w, int h, unsigned int strength, int block_size, int w, int h, unsigned int strength,
unsigned int fb_size_log2, uint8_t *res); unsigned int fb_size_log2, uint8_t *res, int plane);
void av1_clpf_test_frame(const YV12_BUFFER_CONFIG *rec, void av1_clpf_test_frame(const YV12_BUFFER_CONFIG *rec,
const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm, const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
int *best_strength, int *best_bs); int *best_strength, int *best_bs, int plane);
#endif #endif

View File

@@ -9,343 +9,172 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent. * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/ */
#include "./aom_dsp_rtcd.h"
#include "aom_dsp/aom_simd.h" #include "aom_dsp/aom_simd.h"
#include "aom_ports/mem.h"
SIMD_INLINE v128 calc_delta(v128 x, v128 a, v128 b, v128 c, v128 d, v128 e, SIMD_INLINE void calc_diff(v128 o, v128 *a, v128 *b, v128 *c, v128 *d, v128 *e,
v128 *f) {
// The difference will be 9 bit, offset by 128 so we can use saturated
// sub to avoid going to 16 bit temporarily before "strength" clipping.
const v128 c128 = v128_dup_8(128);
v128 x = v128_add_8(c128, o);
*a = v128_ssub_s8(v128_add_8(c128, *a), x);
*b = v128_ssub_s8(v128_add_8(c128, *b), x);
*c = v128_ssub_s8(v128_add_8(c128, *c), x);
*d = v128_ssub_s8(v128_add_8(c128, *d), x);
*e = v128_ssub_s8(v128_add_8(c128, *e), x);
*f = v128_ssub_s8(v128_add_8(c128, *f), x);
}
SIMD_INLINE v128 delta_kernel(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
v128 f, v128 sp, v128 sm) { v128 f, v128 sp, v128 sm) {
const v128 tmp = const v128 tmp = v128_add_8(v128_max_s8(v128_min_s8(c, sp), sm),
v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm), v128_max_s8(v128_min_s8(d, sp), sm));
v128_max_s8(v128_min_s8(v128_ssub_s8(d, x), sp), sm)); const v128 delta = v128_add_8(
v128 delta = v128_add_8( v128_add_8(v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, sp), sm),
v128_add_8( v128_max_s8(v128_min_s8(f, sp), sm)),
v128_shl_8(
v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(a, x), sp), sm),
v128_max_s8(v128_min_s8(v128_ssub_s8(f, x), sp), sm)),
2), 2),
v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(b, x), sp), sm), v128_add_8(v128_max_s8(v128_min_s8(b, sp), sm),
v128_max_s8(v128_min_s8(v128_ssub_s8(e, x), sp), sm))), v128_max_s8(v128_min_s8(e, sp), sm))),
v128_add_8(v128_add_8(tmp, tmp), tmp)); v128_add_8(v128_add_8(tmp, tmp), tmp));
return v128_shr_s8( return v128_add_8(
o, v128_shr_s8(
v128_add_8(v128_dup_8(8), v128_add_8(v128_dup_8(8),
v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()))), v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()))),
4); 4));
}
SIMD_INLINE v128 calc_delta(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
v128 f, v128 sp, v128 sm) {
calc_diff(o, &a, &b, &c, &d, &e, &f);
return delta_kernel(o, a, b, c, d, e, f, sp, sm);
}
SIMD_INLINE void clip_sides(v128 *b, v128 *c, v128 *d, v128 *e, int left,
int right) {
DECLARE_ALIGNED(16, static const uint64_t,
b_shuff[]) = { 0x0504030201000000LL, 0x0d0c0b0a09080808LL };
DECLARE_ALIGNED(16, static const uint64_t,
c_shuff[]) = { 0x0605040302010000LL, 0x0e0d0c0b0a090808LL };
DECLARE_ALIGNED(16, static const uint64_t,
d_shuff[]) = { 0x0707060504030201LL, 0x0f0f0e0d0c0b0a09LL };
DECLARE_ALIGNED(16, static const uint64_t,
e_shuff[]) = { 0x0707070605040302LL, 0x0f0f0f0e0d0c0b0aLL };
if (!left) { // Left clipping
*b = v128_shuffle_8(*b, v128_load_aligned(b_shuff));
*c = v128_shuffle_8(*c, v128_load_aligned(c_shuff));
}
if (!right) { // Right clipping
*d = v128_shuffle_8(*d, v128_load_aligned(d_shuff));
*e = v128_shuffle_8(*e, v128_load_aligned(e_shuff));
}
}
SIMD_INLINE void read_two_lines(const uint8_t *rec, const uint8_t *org,
int rstride, int ostride, int x0, int y0,
int bottom, int right, int y, v128 *o, v128 *r,
v128 *a, v128 *b, v128 *c, v128 *d, v128 *e,
v128 *f) {
const v64 k1 = v64_load_aligned(org);
const v64 k2 = v64_load_aligned(org + ostride);
const v64 l1 = v64_load_aligned(rec);
const v64 l2 = v64_load_aligned(rec + rstride);
*o = v128_from_v64(k1, k2);
*r = v128_from_v64(l1, l2);
*a = v128_from_v64(v64_load_aligned(rec - (y != -y0) * rstride), l1);
*f = v128_from_v64(l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride));
*b = v128_from_v64(v64_load_unaligned(rec - 2 * !!x0),
v64_load_unaligned(rec - 2 * !!x0 + rstride));
*c = v128_from_v64(v64_load_unaligned(rec - !!x0),
v64_load_unaligned(rec - !!x0 + rstride));
*d = v128_from_v64(v64_load_unaligned(rec + !!right),
v64_load_unaligned(rec + !!right + rstride));
*e = v128_from_v64(v64_load_unaligned(rec + 2 * !!right),
v64_load_unaligned(rec + 2 * !!right + rstride));
clip_sides(b, c, d, e, x0, right);
} }
void SIMD_FUNC(aom_clpf_detect)(const uint8_t *rec, const uint8_t *org, void SIMD_FUNC(aom_clpf_detect)(const uint8_t *rec, const uint8_t *org,
int rstride, int ostride, int x0, int y0, int rstride, int ostride, int x0, int y0,
int width, int height, int *sum0, int *sum1, int width, int height, int *sum0, int *sum1,
unsigned int strength) { unsigned int strength, int size) {
ssd128_internal ssd0 = v128_ssd_u8_init();
ssd128_internal ssd1 = v128_ssd_u8_init();
const v128 c128 = v128_dup_8(128);
const v128 sp = v128_dup_8(strength); const v128 sp = v128_dup_8(strength);
const v128 sm = v128_dup_8(-(int)strength); const v128 sm = v128_dup_8(-(int)strength);
const int right = width - 8 - x0;
const int bottom = height - 2 - y0; const int bottom = height - 2 - y0;
ssd128_internal ssd0 = v128_ssd_u8_init();
ssd128_internal ssd1 = v128_ssd_u8_init();
int y;
if (size != 8) { // Fallback to plain C
aom_clpf_detect_c(rec, org, rstride, ostride, x0, y0, width, height, sum0,
sum1, strength, size);
return;
}
rec += x0 + y0 * rstride; rec += x0 + y0 * rstride;
org += x0 + y0 * ostride; org += x0 + y0 * ostride;
if (!x0) { // Clip left
const v128 b_shuff = v128_from_v64(v64_from_64(0x0d0c0b0a09080808LL),
v64_from_64(0x0504030201000000LL));
const v128 c_shuff = v128_from_v64(v64_from_64(0x0e0d0c0b0a090808LL),
v64_from_64(0x0605040302010000LL));
int y;
for (y = 0; y < 8; y += 2) { for (y = 0; y < 8; y += 2) {
const v64 k1 = v64_load_aligned(org); v128 a, b, c, d, e, f, o, r;
const v64 k2 = v64_load_aligned(org + ostride); read_two_lines(rec, org, rstride, ostride, x0, y0, bottom, right, y, &o, &r,
const v64 l1 = v64_load_aligned(rec); &a, &b, &c, &d, &e, &f);
const v64 l2 = v64_load_aligned(rec + rstride); ssd0 = v128_ssd_u8(ssd0, o, r);
v128 o = v128_from_v64(k1, k2); ssd1 = v128_ssd_u8(ssd1, o, calc_delta(r, a, b, c, d, e, f, sp, sm));
const v128 q = v128_from_v64(l1, l2);
const v128 x = v128_add_8(c128, q);
const v128 a = v128_add_8(
c128,
v128_from_v64(v64_load_aligned(rec - (y != -y0) * rstride), l1));
const v128 b = v128_shuffle_8(x, b_shuff);
const v128 c = v128_shuffle_8(x, c_shuff);
const v128 d = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(rec + 1),
v64_load_unaligned(rec + 1 + rstride)));
const v128 e = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(rec + 2),
v64_load_unaligned(rec + 2 + rstride)));
const v128 f = v128_add_8(
c128, v128_from_v64(
l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride)));
ssd0 = v128_ssd_u8(ssd0, o, q);
ssd1 = v128_ssd_u8(
ssd1, o, v128_add_8(q, calc_delta(x, a, b, c, d, e, f, sp, sm)));
rec += rstride * 2; rec += rstride * 2;
org += ostride * 2; org += ostride * 2;
} }
} else if (!(width - x0 - 8)) { // Clip right
const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0f0e0d0c0b0a09LL),
v64_from_64(0x0707060504030201LL));
const v128 e_shuff = v128_from_v64(v64_from_64(0x0f0f0f0e0d0c0b0aLL),
v64_from_64(0x0707070605040302LL));
int y;
for (y = 0; y < 8; y += 2) {
const v64 k1 = v64_load_aligned(org);
const v64 k2 = v64_load_aligned(org + ostride);
const v64 l1 = v64_load_aligned(rec);
const v64 l2 = v64_load_aligned(rec + rstride);
v128 o = v128_from_v64(k1, k2);
const v128 q = v128_from_v64(l1, l2);
const v128 x = v128_add_8(c128, q);
const v128 a = v128_add_8(
c128,
v128_from_v64(v64_load_aligned(rec - (y != -y0) * rstride), l1));
const v128 b = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(rec - 2),
v64_load_unaligned(rec - 2 + rstride)));
const v128 c = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(rec - 1),
v64_load_unaligned(rec - 1 + rstride)));
const v128 d = v128_shuffle_8(x, d_shuff);
const v128 e = v128_shuffle_8(x, e_shuff);
const v128 f = v128_add_8(
c128, v128_from_v64(
l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride)));
ssd0 = v128_ssd_u8(ssd0, o, q);
ssd1 = v128_ssd_u8(
ssd1, o, v128_add_8(q, calc_delta(x, a, b, c, d, e, f, sp, sm)));
rec += rstride * 2;
org += ostride * 2;
}
} else { // No left/right clipping
int y;
for (y = 0; y < 8; y += 2) {
const v64 k1 = v64_load_aligned(org);
const v64 k2 = v64_load_aligned(org + ostride);
const v64 l1 = v64_load_aligned(rec);
const v64 l2 = v64_load_aligned(rec + rstride);
v128 o = v128_from_v64(k1, k2);
const v128 q = v128_from_v64(l1, l2);
const v128 x = v128_add_8(c128, q);
const v128 a = v128_add_8(
c128,
v128_from_v64(v64_load_aligned(rec - (y != -y0) * rstride), l1));
const v128 b = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(rec - 2),
v64_load_unaligned(rec - 2 + rstride)));
const v128 c = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(rec - 1),
v64_load_unaligned(rec - 1 + rstride)));
const v128 d = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(rec + 1),
v64_load_unaligned(rec + 1 + rstride)));
const v128 e = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(rec + 2),
v64_load_unaligned(rec + 2 + rstride)));
const v128 f = v128_add_8(
c128, v128_from_v64(
l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride)));
ssd0 = v128_ssd_u8(ssd0, o, q);
ssd1 = v128_ssd_u8(
ssd1, o, v128_add_8(q, calc_delta(x, a, b, c, d, e, f, sp, sm)));
rec += rstride * 2;
org += ostride * 2;
}
}
*sum0 += v128_ssd_u8_sum(ssd0); *sum0 += v128_ssd_u8_sum(ssd0);
*sum1 += v128_ssd_u8_sum(ssd1); *sum1 += v128_ssd_u8_sum(ssd1);
} }
SIMD_INLINE void calc_delta_multi(v128 x, v128 q, v128 o, v128 a, v128 b, SIMD_INLINE void calc_delta_multi(v128 r, v128 o, v128 a, v128 b, v128 c,
v128 c, v128 d, v128 e, v128 f, v128 cp1, v128 d, v128 e, v128 f, ssd128_internal *ssd1,
v128 cm1, v128 cp2, v128 cm2, v128 cp4,
v128 cm4, ssd128_internal *ssd1,
ssd128_internal *ssd2, ssd128_internal *ssd2,
ssd128_internal *ssd3) { ssd128_internal *ssd3) {
v128 tmp, delta1, delta2, delta3; calc_diff(r, &a, &b, &c, &d, &e, &f);
const v128 c8 = v128_dup_8(8); *ssd1 = v128_ssd_u8(*ssd1, o, delta_kernel(r, a, b, c, d, e, f, v128_dup_8(1),
v128_dup_8(-1)));
a = v128_ssub_s8(a, x); *ssd2 = v128_ssd_u8(*ssd2, o, delta_kernel(r, a, b, c, d, e, f, v128_dup_8(2),
b = v128_ssub_s8(b, x); v128_dup_8(-2)));
c = v128_ssub_s8(c, x); *ssd3 = v128_ssd_u8(*ssd3, o, delta_kernel(r, a, b, c, d, e, f, v128_dup_8(4),
d = v128_ssub_s8(d, x); v128_dup_8(-4)));
e = v128_ssub_s8(e, x);
f = v128_ssub_s8(f, x);
tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp1), cm1),
v128_max_s8(v128_min_s8(d, cp1), cm1));
delta1 = v128_add_8(
v128_add_8(v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp1), cm1),
v128_max_s8(v128_min_s8(f, cp1), cm1)),
2),
v128_add_8(v128_max_s8(v128_min_s8(b, cp1), cm1),
v128_max_s8(v128_min_s8(e, cp1), cm1))),
v128_add_8(v128_add_8(tmp, tmp), tmp));
tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp2), cm2),
v128_max_s8(v128_min_s8(d, cp2), cm2));
delta2 = v128_add_8(
v128_add_8(v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp2), cm2),
v128_max_s8(v128_min_s8(f, cp2), cm2)),
2),
v128_add_8(v128_max_s8(v128_min_s8(b, cp2), cm2),
v128_max_s8(v128_min_s8(e, cp2), cm2))),
v128_add_8(v128_add_8(tmp, tmp), tmp));
tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp4), cm4),
v128_max_s8(v128_min_s8(d, cp4), cm4));
delta3 = v128_add_8(
v128_add_8(v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp4), cm4),
v128_max_s8(v128_min_s8(f, cp4), cm4)),
2),
v128_add_8(v128_max_s8(v128_min_s8(b, cp4), cm4),
v128_max_s8(v128_min_s8(e, cp4), cm4))),
v128_add_8(v128_add_8(tmp, tmp), tmp));
*ssd1 = v128_ssd_u8(
*ssd1, o,
v128_add_8(
q, v128_shr_s8(
v128_add_8(c8, v128_add_8(delta1,
v128_cmplt_s8(delta1, v128_zero()))),
4)));
*ssd2 = v128_ssd_u8(
*ssd2, o,
v128_add_8(
q, v128_shr_s8(
v128_add_8(c8, v128_add_8(delta2,
v128_cmplt_s8(delta2, v128_zero()))),
4)));
*ssd3 = v128_ssd_u8(
*ssd3, o,
v128_add_8(
q, v128_shr_s8(
v128_add_8(c8, v128_add_8(delta3,
v128_cmplt_s8(delta3, v128_zero()))),
4)));
} }
// Test multiple filter strengths at once. // Test multiple filter strengths at once.
void SIMD_FUNC(aom_clpf_detect_multi)(const uint8_t *rec, const uint8_t *org, void SIMD_FUNC(aom_clpf_detect_multi)(const uint8_t *rec, const uint8_t *org,
int rstride, int ostride, int x0, int y0, int rstride, int ostride, int x0, int y0,
int width, int height, int *sum) { int width, int height, int *sum,
const v128 c128 = v128_dup_8(128); int size) {
const v128 cp1 = v128_dup_8(1);
const v128 cm1 = v128_dup_8(-1);
const v128 cp2 = v128_dup_8(2);
const v128 cm2 = v128_dup_8(-2);
const v128 cp4 = v128_dup_8(4);
const v128 cm4 = v128_dup_8(-4);
const int bottom = height - 2 - y0; const int bottom = height - 2 - y0;
const int right = width - 8 - x0;
ssd128_internal ssd0 = v128_ssd_u8_init(); ssd128_internal ssd0 = v128_ssd_u8_init();
ssd128_internal ssd1 = v128_ssd_u8_init(); ssd128_internal ssd1 = v128_ssd_u8_init();
ssd128_internal ssd2 = v128_ssd_u8_init(); ssd128_internal ssd2 = v128_ssd_u8_init();
ssd128_internal ssd3 = v128_ssd_u8_init(); ssd128_internal ssd3 = v128_ssd_u8_init();
int y;
if (size != 8) { // Fallback to plain C
aom_clpf_detect_multi_c(rec, org, rstride, ostride, x0, y0, width, height,
sum, size);
return;
}
rec += x0 + y0 * rstride; rec += x0 + y0 * rstride;
org += x0 + y0 * ostride; org += x0 + y0 * ostride;
if (!x0) { // Clip left
const v128 b_shuff = v128_from_v64(v64_from_64(0x0d0c0b0a09080808LL),
v64_from_64(0x0504030201000000LL));
const v128 c_shuff = v128_from_v64(v64_from_64(0x0e0d0c0b0a090808LL),
v64_from_64(0x0605040302010000LL));
int y;
for (y = 0; y < 8; y += 2) { for (y = 0; y < 8; y += 2) {
const v64 k1 = v64_load_aligned(org); v128 a, b, c, d, e, f, o, r;
const v64 k2 = v64_load_aligned(org + ostride); read_two_lines(rec, org, rstride, ostride, x0, y0, bottom, right, y, &o, &r,
const v64 l1 = v64_load_aligned(rec); &a, &b, &c, &d, &e, &f);
const v64 l2 = v64_load_aligned(rec + rstride); ssd0 = v128_ssd_u8(ssd0, o, r);
v128 o = v128_from_v64(k1, k2); calc_delta_multi(r, o, a, b, c, d, e, f, &ssd1, &ssd2, &ssd3);
const v128 q = v128_from_v64(l1, l2);
const v128 x = v128_add_8(c128, q);
v128 a = v128_add_8(
c128,
v128_from_v64(v64_load_aligned(rec - (y != -y0) * rstride), l1));
v128 b = v128_shuffle_8(x, b_shuff);
v128 c = v128_shuffle_8(x, c_shuff);
v128 d = v128_add_8(c128,
v128_from_v64(v64_load_unaligned(rec + 1),
v64_load_unaligned(rec + 1 + rstride)));
v128 e = v128_add_8(c128,
v128_from_v64(v64_load_unaligned(rec + 2),
v64_load_unaligned(rec + 2 + rstride)));
v128 f = v128_add_8(
c128, v128_from_v64(
l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride)));
ssd0 = v128_ssd_u8(ssd0, o, q);
calc_delta_multi(x, q, o, a, b, c, d, e, f, cp1, cm1, cp2, cm2, cp4, cm4,
&ssd1, &ssd2, &ssd3);
rec += 2 * rstride; rec += 2 * rstride;
org += 2 * ostride; org += 2 * ostride;
} }
} else if (!(width - x0 - 8)) { // Clip right
const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0f0e0d0c0b0a09LL),
v64_from_64(0x0707060504030201LL));
const v128 e_shuff = v128_from_v64(v64_from_64(0x0f0f0f0e0d0c0b0aLL),
v64_from_64(0x0707070605040302LL));
int y;
for (y = 0; y < 8; y += 2) {
const v64 k1 = v64_load_aligned(org);
const v64 k2 = v64_load_aligned(org + ostride);
const v64 l1 = v64_load_aligned(rec);
const v64 l2 = v64_load_aligned(rec + rstride);
v128 o = v128_from_v64(k1, k2);
const v128 q = v128_from_v64(l1, l2);
const v128 x = v128_add_8(c128, q);
v128 a = v128_add_8(
c128,
v128_from_v64(v64_load_aligned(rec - (y != -y0) * rstride), l1));
v128 b = v128_add_8(c128,
v128_from_v64(v64_load_unaligned(rec - 2),
v64_load_unaligned(rec - 2 + rstride)));
v128 c = v128_add_8(c128,
v128_from_v64(v64_load_unaligned(rec - 1),
v64_load_unaligned(rec - 1 + rstride)));
v128 d = v128_shuffle_8(x, d_shuff);
v128 e = v128_shuffle_8(x, e_shuff);
v128 f = v128_add_8(
c128, v128_from_v64(
l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride)));
ssd0 = v128_ssd_u8(ssd0, o, q);
calc_delta_multi(x, q, o, a, b, c, d, e, f, cp1, cm1, cp2, cm2, cp4, cm4,
&ssd1, &ssd2, &ssd3);
rec += 2 * rstride;
org += 2 * ostride;
}
} else { // No left/right clipping
int y;
for (y = 0; y < 8; y += 2) {
const v64 k1 = v64_load_aligned(org);
const v64 k2 = v64_load_aligned(org + ostride);
const v64 l1 = v64_load_aligned(rec);
const v64 l2 = v64_load_aligned(rec + rstride);
v128 o = v128_from_v64(k1, k2);
const v128 q = v128_from_v64(l1, l2);
const v128 x = v128_add_8(c128, q);
v128 a = v128_add_8(
c128,
v128_from_v64(v64_load_aligned(rec - (y != -y0) * rstride), l1));
v128 b = v128_add_8(c128,
v128_from_v64(v64_load_unaligned(rec - 2),
v64_load_unaligned(rec - 2 + rstride)));
v128 c = v128_add_8(c128,
v128_from_v64(v64_load_unaligned(rec - 1),
v64_load_unaligned(rec - 1 + rstride)));
v128 d = v128_add_8(c128,
v128_from_v64(v64_load_unaligned(rec + 1),
v64_load_unaligned(rec + 1 + rstride)));
v128 e = v128_add_8(c128,
v128_from_v64(v64_load_unaligned(rec + 2),
v64_load_unaligned(rec + 2 + rstride)));
v128 f = v128_add_8(
c128, v128_from_v64(
l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride)));
ssd0 = v128_ssd_u8(ssd0, o, q);
calc_delta_multi(x, q, o, a, b, c, d, e, f, cp1, cm1, cp2, cm2, cp4, cm4,
&ssd1, &ssd2, &ssd3);
rec += 2 * rstride;
org += 2 * ostride;
}
}
sum[0] += v128_ssd_u8_sum(ssd0); sum[0] += v128_ssd_u8_sum(ssd0);
sum[1] += v128_ssd_u8_sum(ssd1); sum[1] += v128_ssd_u8_sum(ssd1);
sum[2] += v128_ssd_u8_sum(ssd2); sum[2] += v128_ssd_u8_sum(ssd2);
@@ -353,155 +182,67 @@ void SIMD_FUNC(aom_clpf_detect_multi)(const uint8_t *rec, const uint8_t *org,
} }
#if CONFIG_AOM_HIGHBITDEPTH #if CONFIG_AOM_HIGHBITDEPTH
// Read two rows of 8 high-bitdepth (16-bit) pixels each from the original
// and reconstructed buffers and assemble the CLPF filter neighbourhood as
// packed 8-bit vectors.  Each 16-bit pixel is first right-shifted by `shift`
// (reducing it to 8 significant bits) and v128_unziplo_8 then packs the two
// shifted rows into a single 128-bit vector of bytes.
//
// Outputs:
//   *o      - original pixels for rows y and y+1
//   *r      - reconstructed pixels for rows y and y+1
//   *a      - pixels one row above (row y-1 over row y)
//   *f      - pixels one row below (row y+1 over row y+2)
//   *b, *c  - pixels two / one positions to the left
//   *d, *e  - pixels one / two positions to the right
//
// Frame-border handling:
//   (y != -y0)          - at the top frame edge the "above" row is replaced
//                         by the current row (offset multiplies to 0)
//   (y != bottom) + 1   - at the bottom frame edge the row two below is
//                         replaced by the row one below
//   !!x0 / !!right      - at the left/right frame edge the horizontal
//                         offsets collapse to 0 so the loads stay in bounds;
//                         clip_sides() then fixes up the border duplication
//                         inside the vectors (body not visible here —
//                         presumably shuffles in the edge pixel; confirm
//                         against its definition).
SIMD_INLINE void read_two_lines_hbd(const uint16_t *rec, const uint16_t *org,
                                    int rstride, int ostride, int x0, int y0,
                                    int bottom, int right, int y, v128 *o,
                                    v128 *r, v128 *a, v128 *b, v128 *c, v128 *d,
                                    v128 *e, v128 *f, int shift) {
  // Current two reconstructed rows, shifted down to 8 significant bits.
  const v128 n1 = v128_shr_u16(v128_load_aligned(rec), shift);
  const v128 n2 = v128_shr_u16(v128_load_aligned(rec + rstride), shift);
  *o = v128_unziplo_8(v128_shr_u16(v128_load_aligned(org), shift),
                      v128_shr_u16(v128_load_aligned(org + ostride), shift));
  *r = v128_unziplo_8(n1, n2);
  // Above: row y-1 paired with row y; at the top edge (y == -y0) the offset
  // is 0 so row y is duplicated instead.
  *a = v128_unziplo_8(
      v128_shr_u16(v128_load_aligned(rec - (y != -y0) * rstride), shift), n1);
  // Below: row y+1 paired with row y+2; at the bottom edge (y == bottom) the
  // offset becomes 1 so row y+1 is duplicated instead.
  *f = v128_unziplo_8(
      n2, v128_shr_u16(v128_load_unaligned(rec + ((y != bottom) + 1) * rstride),
                       shift));
  // Left neighbours: offsets -2/-1, clamped to 0 at the left frame edge
  // (x0 == 0) to avoid reading outside the frame.
  *b = v128_unziplo_8(
      v128_shr_u16(v128_load_unaligned(rec - 2 * !!x0), shift),
      v128_shr_u16(v128_load_unaligned(rec - 2 * !!x0 + rstride), shift));
  *c = v128_unziplo_8(
      v128_shr_u16(v128_load_unaligned(rec - !!x0), shift),
      v128_shr_u16(v128_load_unaligned(rec - !!x0 + rstride), shift));
  // Right neighbours: offsets +1/+2, clamped to 0 at the right frame edge
  // (right == 0).
  *d = v128_unziplo_8(
      v128_shr_u16(v128_load_unaligned(rec + !!right), shift),
      v128_shr_u16(v128_load_unaligned(rec + !!right + rstride), shift));
  *e = v128_unziplo_8(
      v128_shr_u16(v128_load_unaligned(rec + 2 * !!right), shift),
      v128_shr_u16(v128_load_unaligned(rec + 2 * !!right + rstride), shift));
  // Repair the in-vector pixel duplication introduced by the clamped offsets.
  clip_sides(b, c, d, e, x0, right);
}
void SIMD_FUNC(aom_clpf_detect_hbd)(const uint16_t *rec, const uint16_t *org, void SIMD_FUNC(aom_clpf_detect_hbd)(const uint16_t *rec, const uint16_t *org,
int rstride, int ostride, int x0, int y0, int rstride, int ostride, int x0, int y0,
int width, int height, int *sum0, int *sum1, int width, int height, int *sum0, int *sum1,
unsigned int strength, int shift) { unsigned int strength, int shift,
ssd128_internal ssd0 = v128_ssd_u8_init(); int size) {
ssd128_internal ssd1 = v128_ssd_u8_init();
const v128 c128 = v128_dup_8(128);
const v128 sp = v128_dup_8(strength >> shift); const v128 sp = v128_dup_8(strength >> shift);
const v128 sm = v128_dup_8(-(int)(strength >> shift)); const v128 sm = v128_dup_8(-(int)(strength >> shift));
const int bottom = height - 2 - y0; const int bottom = height - 2 - y0;
const int right = width - 8 - x0;
ssd128_internal ssd0 = v128_ssd_u8_init();
ssd128_internal ssd1 = v128_ssd_u8_init();
int y;
if (size != 8) { // Fallback to plain C
aom_clpf_detect_hbd_c(rec, org, rstride, ostride, x0, y0, width, height,
sum0, sum1, strength, shift, size);
return;
}
rec += x0 + y0 * rstride; rec += x0 + y0 * rstride;
org += x0 + y0 * ostride; org += x0 + y0 * ostride;
if (!x0) { // Clip left
const v128 b_shuff = v128_from_v64(v64_from_64(0x0d0c0b0a09080808LL),
v64_from_64(0x0504030201000000LL));
const v128 c_shuff = v128_from_v64(v64_from_64(0x0e0d0c0b0a090808LL),
v64_from_64(0x0605040302010000LL));
int y;
for (y = 0; y < 8; y += 2) { for (y = 0; y < 8; y += 2) {
const v128 n1 = v128_shr_u16(v128_load_aligned(rec), shift); v128 a, b, c, d, e, f, o, r;
const v128 n2 = v128_shr_u16(v128_load_aligned(rec + rstride), shift); read_two_lines_hbd(rec, org, rstride, ostride, x0, y0, bottom, right, y, &o,
const v128 o = &r, &a, &b, &c, &d, &e, &f, shift);
v128_unziplo_8(v128_shr_u16(v128_load_aligned(org), shift), ssd0 = v128_ssd_u8(ssd0, o, r);
v128_shr_u16(v128_load_aligned(org + ostride), shift)); ssd1 = v128_ssd_u8(ssd1, o, calc_delta(r, a, b, c, d, e, f, sp, sm));
const v128 q = v128_unziplo_8(n1, n2);
const v128 x = v128_add_8(c128, q);
const v128 a = v128_add_8(
c128, v128_unziplo_8(
v128_shr_u16(v128_load_aligned(rec - (y != -y0) * rstride),
shift),
n1));
const v128 b = v128_shuffle_8(x, b_shuff);
const v128 c = v128_shuffle_8(x, c_shuff);
const v128 d = v128_add_8(
c128,
v128_unziplo_8(
v128_shr_u16(v128_load_unaligned(rec + 1), shift),
v128_shr_u16(v128_load_unaligned(rec + 1 + rstride), shift)));
const v128 e = v128_add_8(
c128,
v128_unziplo_8(
v128_shr_u16(v128_load_unaligned(rec + 2), shift),
v128_shr_u16(v128_load_unaligned(rec + 2 + rstride), shift)));
const v128 f = v128_add_8(
c128, v128_unziplo_8(
n2, v128_shr_u16(v128_load_unaligned(
rec + ((y != bottom) + 1) * rstride),
shift)));
ssd0 = v128_ssd_u8(ssd0, o, q);
ssd1 = v128_ssd_u8(
ssd1, o, v128_add_8(q, calc_delta(x, a, b, c, d, e, f, sp, sm)));
rec += rstride * 2; rec += rstride * 2;
org += ostride * 2; org += ostride * 2;
} }
} else if (!(width - x0 - 8)) { // Clip right
const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0f0e0d0c0b0a09LL),
v64_from_64(0x0707060504030201LL));
const v128 e_shuff = v128_from_v64(v64_from_64(0x0f0f0f0e0d0c0b0aLL),
v64_from_64(0x0707070605040302LL));
int y;
for (y = 0; y < 8; y += 2) {
const v128 n1 = v128_shr_u16(v128_load_aligned(rec), shift);
const v128 n2 = v128_shr_u16(v128_load_aligned(rec + rstride), shift);
const v128 o =
v128_unziplo_8(v128_shr_u16(v128_load_aligned(org), shift),
v128_shr_u16(v128_load_aligned(org + ostride), shift));
const v128 q = v128_unziplo_8(n1, n2);
const v128 x = v128_add_8(c128, q);
const v128 a = v128_add_8(
c128, v128_unziplo_8(
v128_shr_u16(v128_load_aligned(rec - (y != -y0) * rstride),
shift),
n1));
const v128 b = v128_add_8(
c128,
v128_unziplo_8(
v128_shr_u16(v128_load_unaligned(rec - 2), shift),
v128_shr_u16(v128_load_unaligned(rec - 2 + rstride), shift)));
const v128 c = v128_add_8(
c128,
v128_unziplo_8(
v128_shr_u16(v128_load_unaligned(rec - 1), shift),
v128_shr_u16(v128_load_unaligned(rec - 1 + rstride), shift)));
const v128 d = v128_shuffle_8(x, d_shuff);
const v128 e = v128_shuffle_8(x, e_shuff);
const v128 f = v128_add_8(
c128, v128_unziplo_8(
n2, v128_shr_u16(v128_load_unaligned(
rec + ((y != bottom) + 1) * rstride),
shift)));
ssd0 = v128_ssd_u8(ssd0, o, q);
ssd1 = v128_ssd_u8(
ssd1, o, v128_add_8(q, calc_delta(x, a, b, c, d, e, f, sp, sm)));
rec += rstride * 2;
org += ostride * 2;
}
} else { // No left/right clipping
int y;
for (y = 0; y < 8; y += 2) {
const v128 n1 = v128_shr_u16(v128_load_aligned(rec), shift);
const v128 n2 = v128_shr_u16(v128_load_aligned(rec + rstride), shift);
const v128 o =
v128_unziplo_8(v128_shr_u16(v128_load_aligned(org), shift),
v128_shr_u16(v128_load_aligned(org + ostride), shift));
const v128 q = v128_unziplo_8(n1, n2);
const v128 x = v128_add_8(c128, q);
const v128 a = v128_add_8(
c128, v128_unziplo_8(
v128_shr_u16(v128_load_aligned(rec - (y != -y0) * rstride),
shift),
n1));
const v128 b = v128_add_8(
c128,
v128_unziplo_8(
v128_shr_u16(v128_load_unaligned(rec - 2), shift),
v128_shr_u16(v128_load_unaligned(rec - 2 + rstride), shift)));
const v128 c = v128_add_8(
c128,
v128_unziplo_8(
v128_shr_u16(v128_load_unaligned(rec - 1), shift),
v128_shr_u16(v128_load_unaligned(rec - 1 + rstride), shift)));
const v128 d = v128_add_8(
c128,
v128_unziplo_8(
v128_shr_u16(v128_load_unaligned(rec + 1), shift),
v128_shr_u16(v128_load_unaligned(rec + 1 + rstride), shift)));
const v128 e = v128_add_8(
c128,
v128_unziplo_8(
v128_shr_u16(v128_load_unaligned(rec + 2), shift),
v128_shr_u16(v128_load_unaligned(rec + 2 + rstride), shift)));
const v128 f = v128_add_8(
c128, v128_unziplo_8(
n2, v128_shr_u16(v128_load_unaligned(
rec + ((y != bottom) + 1) * rstride),
shift)));
ssd0 = v128_ssd_u8(ssd0, o, q);
ssd1 = v128_ssd_u8(
ssd1, o, v128_add_8(q, calc_delta(x, a, b, c, d, e, f, sp, sm)));
rec += rstride * 2;
org += ostride * 2;
}
}
*sum0 += v128_ssd_u8_sum(ssd0); *sum0 += v128_ssd_u8_sum(ssd0);
*sum1 += v128_ssd_u8_sum(ssd1); *sum1 += v128_ssd_u8_sum(ssd1);
} }
@@ -510,159 +251,33 @@ void SIMD_FUNC(aom_clpf_detect_multi_hbd)(const uint16_t *rec,
const uint16_t *org, int rstride, const uint16_t *org, int rstride,
int ostride, int x0, int y0, int ostride, int x0, int y0,
int width, int height, int *sum, int width, int height, int *sum,
int shift) { int shift, int size) {
const v128 c128 = v128_dup_8(128);
const v128 cp1 = v128_dup_8(1);
const v128 cm1 = v128_dup_8(-1);
const v128 cp2 = v128_dup_8(2);
const v128 cm2 = v128_dup_8(-2);
const v128 cp4 = v128_dup_8(4);
const v128 cm4 = v128_dup_8(-4);
const int bottom = height - 2 - y0; const int bottom = height - 2 - y0;
const int right = width - 8 - x0;
ssd128_internal ssd0 = v128_ssd_u8_init(); ssd128_internal ssd0 = v128_ssd_u8_init();
ssd128_internal ssd1 = v128_ssd_u8_init(); ssd128_internal ssd1 = v128_ssd_u8_init();
ssd128_internal ssd2 = v128_ssd_u8_init(); ssd128_internal ssd2 = v128_ssd_u8_init();
ssd128_internal ssd3 = v128_ssd_u8_init(); ssd128_internal ssd3 = v128_ssd_u8_init();
int y;
if (size != 8) { // Fallback to plain C
aom_clpf_detect_multi_hbd_c(rec, org, rstride, ostride, x0, y0, width,
height, sum, shift, size);
return;
}
rec += x0 + y0 * rstride; rec += x0 + y0 * rstride;
org += x0 + y0 * ostride; org += x0 + y0 * ostride;
if (!x0) { // Clip left
const v128 b_shuff = v128_from_v64(v64_from_64(0x0d0c0b0a09080808LL),
v64_from_64(0x0504030201000000LL));
const v128 c_shuff = v128_from_v64(v64_from_64(0x0e0d0c0b0a090808LL),
v64_from_64(0x0605040302010000LL));
int y;
for (y = 0; y < 8; y += 2) { for (y = 0; y < 8; y += 2) {
const v128 n1 = v128_shr_u16(v128_load_aligned(rec), shift); v128 a, b, c, d, e, f, o, r;
const v128 n2 = v128_shr_u16(v128_load_aligned(rec + rstride), shift); read_two_lines_hbd(rec, org, rstride, ostride, x0, y0, bottom, right, y, &o,
const v128 o = &r, &a, &b, &c, &d, &e, &f, shift);
v128_unziplo_8(v128_shr_u16(v128_load_aligned(org), shift), ssd0 = v128_ssd_u8(ssd0, o, r);
v128_shr_u16(v128_load_aligned(org + ostride), shift)); calc_delta_multi(r, o, a, b, c, d, e, f, &ssd1, &ssd2, &ssd3);
const v128 q = v128_unziplo_8(n1, n2);
const v128 x = v128_add_8(c128, q);
const v128 a = v128_add_8(
c128, v128_unziplo_8(
v128_shr_u16(v128_load_aligned(rec - (y != -y0) * rstride),
shift),
n1));
const v128 b = v128_shuffle_8(x, b_shuff);
const v128 c = v128_shuffle_8(x, c_shuff);
const v128 d = v128_add_8(
c128,
v128_unziplo_8(
v128_shr_u16(v128_load_unaligned(rec + 1), shift),
v128_shr_u16(v128_load_unaligned(rec + 1 + rstride), shift)));
const v128 e = v128_add_8(
c128,
v128_unziplo_8(
v128_shr_u16(v128_load_unaligned(rec + 2), shift),
v128_shr_u16(v128_load_unaligned(rec + 2 + rstride), shift)));
const v128 f = v128_add_8(
c128, v128_unziplo_8(
n2, v128_shr_u16(v128_load_unaligned(
rec + ((y != bottom) + 1) * rstride),
shift)));
ssd0 = v128_ssd_u8(ssd0, o, q);
calc_delta_multi(x, q, o, a, b, c, d, e, f, cp1, cm1, cp2, cm2, cp4, cm4,
&ssd1, &ssd2, &ssd3);
rec += 2 * rstride; rec += 2 * rstride;
org += 2 * ostride; org += 2 * ostride;
} }
} else if (!(width - x0 - 8)) { // Clip right
const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0f0e0d0c0b0a09LL),
v64_from_64(0x0707060504030201LL));
const v128 e_shuff = v128_from_v64(v64_from_64(0x0f0f0f0e0d0c0b0aLL),
v64_from_64(0x0707070605040302LL));
int y;
for (y = 0; y < 8; y += 2) {
const v128 n1 = v128_shr_u16(v128_load_aligned(rec), shift);
const v128 n2 = v128_shr_u16(v128_load_aligned(rec + rstride), shift);
const v128 o =
v128_unziplo_8(v128_shr_u16(v128_load_aligned(org), shift),
v128_shr_u16(v128_load_aligned(org + ostride), shift));
const v128 q = v128_unziplo_8(n1, n2);
const v128 x = v128_add_8(c128, q);
const v128 a = v128_add_8(
c128, v128_unziplo_8(
v128_shr_u16(v128_load_aligned(rec - (y != -y0) * rstride),
shift),
n1));
const v128 b = v128_add_8(
c128,
v128_unziplo_8(
v128_shr_u16(v128_load_unaligned(rec - 2), shift),
v128_shr_u16(v128_load_unaligned(rec - 2 + rstride), shift)));
const v128 c = v128_add_8(
c128,
v128_unziplo_8(
v128_shr_u16(v128_load_unaligned(rec - 1), shift),
v128_shr_u16(v128_load_unaligned(rec - 1 + rstride), shift)));
const v128 d = v128_shuffle_8(x, d_shuff);
const v128 e = v128_shuffle_8(x, e_shuff);
const v128 f = v128_add_8(
c128, v128_unziplo_8(
n2, v128_shr_u16(v128_load_unaligned(
rec + ((y != bottom) + 1) * rstride),
shift)));
ssd0 = v128_ssd_u8(ssd0, o, q);
calc_delta_multi(x, q, o, a, b, c, d, e, f, cp1, cm1, cp2, cm2, cp4, cm4,
&ssd1, &ssd2, &ssd3);
rec += 2 * rstride;
org += 2 * ostride;
}
} else { // No left/right clipping
int y;
for (y = 0; y < 8; y += 2) {
const v128 n1 = v128_shr_u16(v128_load_aligned(rec), shift);
const v128 n2 = v128_shr_u16(v128_load_aligned(rec + rstride), shift);
const v128 o =
v128_unziplo_8(v128_shr_u16(v128_load_aligned(org), shift),
v128_shr_u16(v128_load_aligned(org + ostride), shift));
const v128 q = v128_unziplo_8(n1, n2);
const v128 x = v128_add_8(c128, q);
const v128 a = v128_add_8(
c128, v128_unziplo_8(
v128_shr_u16(v128_load_aligned(rec - (y != -y0) * rstride),
shift),
n1));
const v128 b = v128_add_8(
c128,
v128_unziplo_8(
v128_shr_u16(v128_load_unaligned(rec - 2), shift),
v128_shr_u16(v128_load_unaligned(rec - 2 + rstride), shift)));
const v128 c = v128_add_8(
c128,
v128_unziplo_8(
v128_shr_u16(v128_load_unaligned(rec - 1), shift),
v128_shr_u16(v128_load_unaligned(rec - 1 + rstride), shift)));
const v128 d = v128_add_8(
c128,
v128_unziplo_8(
v128_shr_u16(v128_load_unaligned(rec + 1), shift),
v128_shr_u16(v128_load_unaligned(rec + 1 + rstride), shift)));
const v128 e = v128_add_8(
c128,
v128_unziplo_8(
v128_shr_u16(v128_load_unaligned(rec + 2), shift),
v128_shr_u16(v128_load_unaligned(rec + 2 + rstride), shift)));
const v128 f = v128_add_8(
c128, v128_unziplo_8(
n2, v128_shr_u16(v128_load_unaligned(
rec + ((y != bottom) + 1) * rstride),
shift)));
ssd0 = v128_ssd_u8(ssd0, o, q);
calc_delta_multi(x, q, o, a, b, c, d, e, f, cp1, cm1, cp2, cm2, cp4, cm4,
&ssd1, &ssd2, &ssd3);
rec += 2 * rstride;
org += 2 * ostride;
}
}
sum[0] += v128_ssd_u8_sum(ssd0); sum[0] += v128_ssd_u8_sum(ssd0);
sum[1] += v128_ssd_u8_sum(ssd1); sum[1] += v128_ssd_u8_sum(ssd1);
sum[2] += v128_ssd_u8_sum(ssd2); sum[2] += v128_ssd_u8_sum(ssd2);

View File

@@ -16,6 +16,7 @@
#include "av1/common/alloccommon.h" #include "av1/common/alloccommon.h"
#if CONFIG_CLPF #if CONFIG_CLPF
#include "aom/aom_image.h"
#include "av1/common/clpf.h" #include "av1/common/clpf.h"
#include "av1/encoder/clpf_rdo.h" #include "av1/encoder/clpf_rdo.h"
#endif #endif
@@ -3422,7 +3423,7 @@ static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) {
#endif #endif
} }
#if CONFIG_CLPF #if CONFIG_CLPF
cm->clpf_strength = 0; cm->clpf_strength_y = cm->clpf_strength_u = cm->clpf_strength_v = 0;
cm->clpf_size = 2; cm->clpf_size = 2;
CHECK_MEM_ERROR( CHECK_MEM_ERROR(
cm, cm->clpf_blocks, cm, cm->clpf_blocks,
@@ -3430,21 +3431,37 @@ static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) {
((cm->frame_to_show->y_crop_height + 31) & ~31) >> ((cm->frame_to_show->y_crop_height + 31) & ~31) >>
10)); 10));
if (!is_lossless_requested(&cpi->oxcf)) { if (!is_lossless_requested(&cpi->oxcf)) {
const YV12_BUFFER_CONFIG *const frame = cm->frame_to_show;
// Find the best strength and block size for the entire frame // Find the best strength and block size for the entire frame
int fb_size_log2, strength; int fb_size_log2, strength_y, strength_u, strength_v;
av1_clpf_test_frame(cm->frame_to_show, cpi->Source, cm, &strength, av1_clpf_test_frame(frame, cpi->Source, cm, &strength_y, &fb_size_log2,
&fb_size_log2); AOM_PLANE_Y);
av1_clpf_test_frame(frame, cpi->Source, cm, &strength_u, &fb_size_log2,
AOM_PLANE_U);
av1_clpf_test_frame(frame, cpi->Source, cm, &strength_v, &fb_size_log2,
AOM_PLANE_V);
if (!fb_size_log2) fb_size_log2 = get_msb(MAX_FB_SIZE); if (!fb_size_log2) fb_size_log2 = get_msb(MAX_FB_SIZE);
if (strength) { if (strength_y) {
// Apply the filter using the chosen strength // Apply the filter using the chosen strength
cm->clpf_strength = strength - (strength == 4); cm->clpf_strength_y = strength_y - (strength_y == 4);
cm->clpf_size = cm->clpf_size =
fb_size_log2 ? fb_size_log2 - get_msb(MAX_FB_SIZE) + 3 : 0; fb_size_log2 ? fb_size_log2 - get_msb(MAX_FB_SIZE) + 3 : 0;
cm->clpf_numblocks = av1_clpf_frame( cm->clpf_numblocks = av1_clpf_frame(
cm->frame_to_show, cpi->Source, cm, !!cm->clpf_size, strength, frame, cpi->Source, cm, !!cm->clpf_size, strength_y,
4 + cm->clpf_size, cm->clpf_blocks, av1_clpf_decision); 4 + cm->clpf_size, cm->clpf_blocks, AOM_PLANE_Y, av1_clpf_decision);
}
if (strength_u) {
cm->clpf_strength_u = strength_u - (strength_u == 4);
av1_clpf_frame(frame, NULL, cm, 0, strength_u, 4, NULL, AOM_PLANE_U,
NULL);
}
if (strength_v) {
cm->clpf_strength_v = strength_v - (strength_v == 4);
av1_clpf_frame(frame, NULL, cm, 0, strength_v, 4, NULL, AOM_PLANE_V,
NULL);
} }
} }
#endif #endif

View File

@@ -147,6 +147,8 @@ void test_clpf(int w, int h, int depth, int iterations,
<< "strength: " << (1 << strength) << std::endl << "strength: " << (1 << strength) << std::endl
<< "xpos: " << xpos << std::endl << "xpos: " << xpos << std::endl
<< "ypos: " << ypos << std::endl << "ypos: " << ypos << std::endl
<< "w: " << w << std::endl
<< "h: " << h << std::endl
<< "A=" << (pos > size ? (int16_t)s[pos - size] : -1) << std::endl << "A=" << (pos > size ? (int16_t)s[pos - size] : -1) << std::endl
<< "B=" << (pos % size - 2 >= 0 ? (int16_t)s[pos - 2] : -1) << std::endl << "B=" << (pos % size - 2 >= 0 ? (int16_t)s[pos - 2] : -1) << std::endl
<< "C=" << (pos % size - 1 >= 0 ? (int16_t)s[pos - 1] : -1) << std::endl << "C=" << (pos % size - 1 >= 0 ? (int16_t)s[pos - 1] : -1) << std::endl