Extend CLPF to chroma.

Objective quality impact (low latency):

                    Y         Cb         Cr
      PSNR:      0.13%     -1.37%     -1.79%
   PSNRHVS:      0.03%
      SSIM:      0.24%
    MSSSIM:      0.10%
 CIEDE2000:     -0.83%

Change-Id: I8ddf0def569286775f0f9d4d4005932766a7fc27
Steinar Midtskogen
2016-09-13 16:37:13 +02:00
committed by Yaowu Xu
parent 9021d09f9a
commit ecf9a0c821
12 changed files with 697 additions and 887 deletions

View File

@@ -590,16 +590,16 @@ if (aom_config("CONFIG_CLPF") eq "yes") {
if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
add_proto qw/void aom_clpf_block_hbd/, "const uint16_t *src, uint16_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, int width, int height, unsigned int strength";
specialize qw/aom_clpf_block_hbd sse2 ssse3 sse4_1 neon/;
add_proto qw/void aom_clpf_detect_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int shift";
add_proto qw/void aom_clpf_detect_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int shift, int size";
specialize qw/aom_clpf_detect_hbd sse2 ssse3 sse4_1 neon/;
add_proto qw/void aom_clpf_detect_multi_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int shift";
add_proto qw/void aom_clpf_detect_multi_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int shift, int size";
specialize qw/aom_clpf_detect_multi_hbd sse2 ssse3 sse4_1 neon/;
}
add_proto qw/void aom_clpf_block/, "const uint8_t *src, uint8_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, int width, int height, unsigned int strength";
specialize qw/aom_clpf_block sse2 ssse3 sse4_1 neon/;
add_proto qw/void aom_clpf_detect/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength";
add_proto qw/void aom_clpf_detect/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int size";
specialize qw/aom_clpf_detect sse2 ssse3 sse4_1 neon/;
add_proto qw/void aom_clpf_detect_multi/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum";
add_proto qw/void aom_clpf_detect_multi/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int size";
specialize qw/aom_clpf_detect_multi sse2 ssse3 sse4_1 neon/;
}
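
The functional change in these prototypes is the trailing `size` argument, which lets the detect functions score 4x4 chroma blocks as well as the 8x8 luma blocks they previously assumed. A hedged sketch of a call site (the wrapper and its parameters are hypothetical; the prototype matches the C version registered above):

#include <stdint.h>

void aom_clpf_detect_c(const uint8_t *rec, const uint8_t *org, int rstride,
                       int ostride, int x0, int y0, int width, int height,
                       int *sum0, int *sum1, unsigned int strength, int size);

/* Accumulate SSD for one block: *sum0 without filtering, *sum1 with. */
static void score_block(const uint8_t *rec, const uint8_t *org, int stride,
                        int width, int height, int x0, int y0, int is_chroma,
                        unsigned int strength, int *sum0, int *sum1) {
  const int size = is_chroma ? 4 : 8; /* subsampled planes use 4x4 blocks */
  aom_clpf_detect_c(rec, org, stride, stride, x0, y0, width, height, sum0,
                    sum1, strength, size);
}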

View File

@@ -8,9 +8,10 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <assert.h>
#include "av1/common/clpf.h"
#include "./aom_dsp_rtcd.h"
#include "aom/aom_image.h"
#include "aom_dsp/aom_dsp_common.h"
int av1_clpf_maxbits(const AV1_COMMON *cm) {
@@ -72,21 +73,24 @@ void aom_clpf_block_hbd_c(const uint16_t *src, uint16_t *dst, int sstride,
#endif
// Return number of filtered blocks
int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
const YV12_BUFFER_CONFIG *org, AV1_COMMON *cm,
int enable_fb_flag, unsigned int strength,
unsigned int fb_size_log2, uint8_t *blocks,
int (*decision)(int, int, const YV12_BUFFER_CONFIG *,
const YV12_BUFFER_CONFIG *,
const AV1_COMMON *cm, int, int, int,
unsigned int, unsigned int, uint8_t *)) {
int av1_clpf_frame(
const YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *org,
AV1_COMMON *cm, int enable_fb_flag, unsigned int strength,
unsigned int fb_size_log2, uint8_t *blocks, int plane,
int (*decision)(int, int, const YV12_BUFFER_CONFIG *,
const YV12_BUFFER_CONFIG *, const AV1_COMMON *cm, int, int,
int, unsigned int, unsigned int, uint8_t *, int)) {
/* Constrained low-pass filter (CLPF) */
int c, k, l, m, n;
const int bs = MI_SIZE;
const int width = frame->y_crop_width;
const int height = frame->y_crop_height;
const int subx = plane != AOM_PLANE_Y && frame->subsampling_x;
const int suby = plane != AOM_PLANE_Y && frame->subsampling_y;
const int bs = (subx || suby) ? 4 : 8;
const int bslog = get_msb(bs);
int width = plane != AOM_PLANE_Y ? frame->uv_crop_width : frame->y_crop_width;
int height =
plane != AOM_PLANE_Y ? frame->uv_crop_height : frame->y_crop_height;
int xpos, ypos;
const int sstride = frame->y_stride;
const int sstride = plane != AOM_PLANE_Y ? frame->uv_stride : frame->y_stride;
int dstride = bs;
const int num_fb_hor = (width + (1 << fb_size_log2) - 1) >> fb_size_log2;
const int num_fb_ver = (height + (1 << fb_size_log2) - 1) >> fb_size_log2;
@@ -97,9 +101,11 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
int cache_idx = 0;
const int cache_size = num_fb_hor << (2 * fb_size_log2);
const int cache_blocks = cache_size / (bs * bs);
YV12_BUFFER_CONFIG dst = *frame;
assert(bs == 8); // Optimised code assumes this.
uint8_t *src_buffer =
plane != AOM_PLANE_Y
? (plane == AOM_PLANE_U ? frame->u_buffer : frame->v_buffer)
: frame->y_buffer;
uint8_t *dst_buffer;
#if CONFIG_AOM_HIGHBITDEPTH
strength <<= (cm->bit_depth - 8);
@@ -108,10 +114,10 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
// Make buffer space for in-place filtering
#if CONFIG_AOM_HIGHBITDEPTH
CHECK_MEM_ERROR(cm, cache, aom_malloc(cache_size << !!cm->use_highbitdepth));
dst.y_buffer = cm->use_highbitdepth ? CONVERT_TO_BYTEPTR(cache) : cache;
dst_buffer = cm->use_highbitdepth ? CONVERT_TO_BYTEPTR(cache) : cache;
#else
CHECK_MEM_ERROR(cm, cache, aom_malloc(cache_size));
dst.y_buffer = cache;
dst_buffer = cache;
#endif
CHECK_MEM_ERROR(cm, cache_ptr, aom_malloc(cache_blocks * sizeof(*cache_ptr)));
CHECK_MEM_ERROR(cm, cache_dst, aom_malloc(cache_blocks * sizeof(*cache_dst)));
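
The cache allocated above is what makes in-place filtering safe: filter output is staged in a ring of block-sized slots, and a slot is flushed back into the frame only when it is about to be reused (plus a final drain pass at the end), so blocks filtered later still read unfiltered neighbours. A minimal scalar sketch of the pattern (ring size and helper names are illustrative only, not the file's actual layout):

#include <stdint.h>
#include <string.h>

#define BS 8     /* 8 for luma; the chroma path uses 4 */
#define SLOTS 16 /* illustrative; the real count depends on fb_size_log2 */

typedef struct {
  uint8_t data[BS * BS]; /* staged filter output */
  uint8_t *dst;          /* frame position it belongs to, NULL if free */
} slot_t;

/* Claim the next ring slot, flushing its previous occupant first. */
static uint8_t *claim_slot(slot_t ring[SLOTS], int *idx, uint8_t *frame_pos,
                           int stride) {
  slot_t *s = &ring[*idx];
  if (s->dst) /* evict: copy the staged rows back into the frame */
    for (int r = 0; r < BS; r++)
      memcpy(s->dst + r * stride, s->data + r * BS, BS);
  s->dst = frame_pos;
  *idx = (*idx + 1) % SLOTS;
  return s->data; /* the filter writes its output here */
}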
@@ -130,7 +136,8 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
ypos = yoff + m * bs;
if (xpos < width && ypos < height) {
allskip &=
cm->mi_grid_visible[ypos / bs * cm->mi_stride + xpos / bs]
cm->mi_grid_visible[(ypos << suby) / MI_SIZE * cm->mi_stride +
(xpos << subx) / MI_SIZE]
->mbmi.skip;
}
}
@@ -144,13 +151,14 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
if (!allskip && // Do not filter the block if all is skip encoded
(!enable_fb_flag ||
decision(k, l, frame, org, cm, bs, w / bs, h / bs, strength,
fb_size_log2, blocks + block_index))) {
fb_size_log2, blocks + block_index, plane))) {
// Iterate over all smaller blocks inside the filter block
for (m = 0; m < (h + bs - 1) / bs; m++) {
for (n = 0; n < (w + bs - 1) / bs; n++) {
for (m = 0; m < ((h + bs - 1) >> bslog); m++) {
for (n = 0; n < ((w + bs - 1) >> bslog); n++) {
xpos = xoff + n * bs;
ypos = yoff + m * bs;
if (!cm->mi_grid_visible[ypos / bs * cm->mi_stride + xpos / bs]
if (!cm->mi_grid_visible[(ypos << suby) / MI_SIZE * cm->mi_stride +
(xpos << subx) / MI_SIZE]
->mbmi.skip) { // Not skip block
// Temporary buffering needed if filtering in-place
if (cache_ptr[cache_idx]) {
@@ -161,50 +169,59 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
for (c = 0; c < bs; c++) {
*(uint64_t *)(d + c * sstride) =
*(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2);
*(uint64_t *)(d + c * sstride + 4) =
*(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2 + 8);
if (bs == 8)
*(uint64_t *)(d + c * sstride + 4) =
*(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2 + 8);
}
} else {
for (c = 0; c < bs; c++)
*(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
*(uint64_t *)(cache_ptr[cache_idx] + c * bs);
if (bs == 8)
*(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
*(uint64_t *)(cache_ptr[cache_idx] + c * bs);
else
*(uint32_t *)(cache_dst[cache_idx] + c * sstride) =
*(uint32_t *)(cache_ptr[cache_idx] + c * bs);
}
#else
for (c = 0; c < bs; c++)
*(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
*(uint64_t *)(cache_ptr[cache_idx] + c * bs);
if (bs == 8)
*(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
*(uint64_t *)(cache_ptr[cache_idx] + c * bs);
else
*(uint32_t *)(cache_dst[cache_idx] + c * sstride) =
*(uint32_t *)(cache_ptr[cache_idx] + c * bs);
#endif
}
#if CONFIG_AOM_HIGHBITDEPTH
if (cm->use_highbitdepth) {
cache_ptr[cache_idx] = cache + cache_idx * bs * bs * 2;
dst.y_buffer =
dst_buffer =
CONVERT_TO_BYTEPTR(cache_ptr[cache_idx]) - ypos * bs - xpos;
} else {
cache_ptr[cache_idx] = cache + cache_idx * bs * bs;
dst.y_buffer = cache_ptr[cache_idx] - ypos * bs - xpos;
dst_buffer = cache_ptr[cache_idx] - ypos * bs - xpos;
}
#else
cache_ptr[cache_idx] = cache + cache_idx * bs * bs;
dst.y_buffer = cache_ptr[cache_idx] - ypos * bs - xpos;
dst_buffer = cache_ptr[cache_idx] - ypos * bs - xpos;
#endif
cache_dst[cache_idx] = frame->y_buffer + ypos * sstride + xpos;
cache_dst[cache_idx] = src_buffer + ypos * sstride + xpos;
if (++cache_idx >= cache_blocks) cache_idx = 0;
// Apply the filter
#if CONFIG_AOM_HIGHBITDEPTH
if (cm->use_highbitdepth) {
aom_clpf_block_hbd(CONVERT_TO_SHORTPTR(frame->y_buffer),
CONVERT_TO_SHORTPTR(dst.y_buffer), sstride,
aom_clpf_block_hbd(CONVERT_TO_SHORTPTR(src_buffer),
CONVERT_TO_SHORTPTR(dst_buffer), sstride,
dstride, xpos, ypos, bs, bs, width, height,
strength);
} else {
aom_clpf_block(frame->y_buffer, dst.y_buffer, sstride, dstride,
xpos, ypos, bs, bs, width, height, strength);
aom_clpf_block(src_buffer, dst_buffer, sstride, dstride, xpos,
ypos, bs, bs, width, height, strength);
}
#else
aom_clpf_block(frame->y_buffer, dst.y_buffer, sstride, dstride,
xpos, ypos, bs, bs, width, height, strength);
aom_clpf_block(src_buffer, dst_buffer, sstride, dstride, xpos,
ypos, bs, bs, width, height, strength);
#endif
}
}
@@ -223,18 +240,27 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
for (c = 0; c < bs; c++) {
*(uint64_t *)(d + c * sstride) =
*(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2);
*(uint64_t *)(d + c * sstride + 4) =
*(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2 + 8);
if (bs == 8)
*(uint64_t *)(d + c * sstride + 4) =
*(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2 + 8);
}
} else {
for (c = 0; c < bs; c++)
*(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
*(uint64_t *)(cache_ptr[cache_idx] + c * bs);
if (bs == 4)
*(uint32_t *)(cache_dst[cache_idx] + c * sstride) =
*(uint32_t *)(cache_ptr[cache_idx] + c * bs);
else
*(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
*(uint64_t *)(cache_ptr[cache_idx] + c * bs);
}
#else
for (c = 0; c < bs; c++)
*(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
*(uint64_t *)(cache_ptr[cache_idx] + c * bs);
if (bs == 4)
*(uint32_t *)(cache_dst[cache_idx] + c * sstride) =
*(uint32_t *)(cache_ptr[cache_idx] + c * bs);
else
*(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
*(uint64_t *)(cache_ptr[cache_idx] + c * bs);
#endif
}
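
Two details above are worth restating. First, for subsampled chroma the filter operates on 4x4 blocks (bs = 4) instead of 8x8, which is why the row copies fall back to 32-bit stores. Second, skip flags live in the luma-domain mode-info grid, so chroma positions are scaled back up before indexing it. A small sketch of that mapping (assuming MI_SIZE is 8, as elsewhere in this tree):

/* Map a plane-local block position to its mode-info grid index. */
static int mi_grid_index(int xpos, int ypos, int subx, int suby,
                         int mi_stride) {
  const int mi_size = 8; /* assumption: MI_SIZE == 8 */
  /* Scale chroma coordinates up by the subsampling factor so the skip
   * flag is read from the co-located luma MI cell. */
  return ((ypos << suby) / mi_size) * mi_stride + ((xpos << subx) / mi_size);
}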

View File

@@ -20,10 +20,10 @@ int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int b);
int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
const YV12_BUFFER_CONFIG *org, AV1_COMMON *cm,
int enable_fb_flag, unsigned int strength,
unsigned int fb_size_log2, uint8_t *blocks,
unsigned int fb_size_log2, uint8_t *blocks, int plane,
int (*decision)(int, int, const YV12_BUFFER_CONFIG *,
const YV12_BUFFER_CONFIG *,
const AV1_COMMON *cm, int, int, int,
unsigned int, unsigned int, uint8_t *));
unsigned int, unsigned int, uint8_t *, int));
#endif

View File

@@ -10,131 +10,165 @@
*/
#include "./aom_dsp_rtcd.h"
#include "aom_ports/mem.h"
SIMD_INLINE void calc_delta(v128 o, v128 x, v128 a, v128 b, v128 c, v128 d,
v128 e, v128 f, uint8_t *dst, v128 sp, v128 sm,
int dstride) {
// delta = 4/16 * clamp(a - o, -s, s) + 1/16 * clamp(b - o, -s, s) +
// 3/16 * clamp(c - o, -s, s) + 3/16 * clamp(d - o, -s, s) +
// 1/16 * clamp(e - o, -s, s) + 4/16 * clamp(f - o, -s, s)
SIMD_INLINE v128 calc_delta(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
v128 f, v128 sp, v128 sm) {
// The difference will be 9 bits; offset by 128 so we can use a saturated
// subtract and avoid going to 16 bits temporarily before "strength" clipping.
const v128 c128 = v128_dup_8(128);
const v128 x = v128_add_8(c128, o);
const v128 c8 = v128_dup_8(8);
const v128 tmp =
v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm),
v128_max_s8(v128_min_s8(v128_ssub_s8(d, x), sp), sm));
const v128 tmp = v128_add_8(
v128_max_s8(v128_min_s8(v128_ssub_s8(v128_add_8(c128, c), x), sp), sm),
v128_max_s8(v128_min_s8(v128_ssub_s8(v128_add_8(c128, d), x), sp), sm));
const v128 delta = v128_add_8(
v128_add_8(
v128_shl_8(
v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(a, x), sp), sm),
v128_max_s8(v128_min_s8(v128_ssub_s8(f, x), sp), sm)),
v128_add_8(
v128_max_s8(
v128_min_s8(v128_ssub_s8(v128_add_8(c128, a), x), sp),
sm),
v128_max_s8(
v128_min_s8(v128_ssub_s8(v128_add_8(c128, f), x), sp),
sm)),
2),
v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(b, x), sp), sm),
v128_max_s8(v128_min_s8(v128_ssub_s8(e, x), sp), sm))),
v128_add_8(
v128_max_s8(v128_min_s8(v128_ssub_s8(v128_add_8(c128, b), x), sp),
sm),
v128_max_s8(v128_min_s8(v128_ssub_s8(v128_add_8(c128, e), x), sp),
sm))),
v128_add_8(v128_add_8(tmp, tmp), tmp));
o = v128_add_8(
return v128_add_8(
o,
v128_shr_s8(
v128_add_8(c8, v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()))),
4));
v64_store_aligned(dst, v128_high_v64(o));
v64_store_aligned(dst + dstride, v128_low_v64(o));
}
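
For reference, a scalar rendering of the kernel described in the comment above (a sketch only; the tree's actual scalar path is av1_clpf_sample(), and clampi() here is an ad hoc helper):

#include <stdio.h>

static int clampi(int v, int lo, int hi) {
  return v < lo ? lo : (v > hi ? hi : v);
}

/* One pixel X with neighbours a..f and strength s, as in the comment:
 * delta = (4*A + 1*B + 3*C + 3*D + 1*E + 4*F clamped diffs) / 16. */
static int clpf_pixel(int X, int A, int B, int C, int D, int E, int F, int s) {
  const int delta = 4 * clampi(A - X, -s, s) + clampi(B - X, -s, s) +
                    3 * clampi(C - X, -s, s) + 3 * clampi(D - X, -s, s) +
                    clampi(E - X, -s, s) + 4 * clampi(F - X, -s, s);
  /* (8 + delta - (delta < 0)) >> 4 rounds delta/16 to nearest, ties away
   * from zero -- matching the SIMD rounding above. */
  return X + ((8 + delta - (delta < 0)) >> 4);
}

int main(void) {
  /* The +128 bias in the SIMD code keeps the 9-bit differences usable in
   * saturated signed 8-bit arithmetic; scalar code needs no such trick. */
  printf("%d\n", clpf_pixel(250, 16, 248, 249, 251, 252, 250, 2)); /* 249 */
  return 0;
}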
static void clpf_block(const uint8_t *src, uint8_t *dst, int sstride,
int dstride, int x0, int y0, int sizey, int width,
int height, unsigned int strength) {
int bottom = height - 2 - y0;
// Process blocks of width 8, two lines at a time, 8 bit.
static void clpf_block8(const uint8_t *src, uint8_t *dst, int sstride,
int dstride, int x0, int y0, int sizey, int width,
int height, unsigned int strength) {
const int bottom = height - 2 - y0;
const int right = width - 8 - x0;
const v128 sp = v128_dup_8(strength);
const v128 sm = v128_dup_8(-(int)strength);
const v128 c128 = v128_dup_8(128);
DECLARE_ALIGNED(16, static const uint64_t,
b_shuff[]) = { 0x0504030201000000LL, 0x0d0c0b0a09080808LL };
DECLARE_ALIGNED(16, static const uint64_t,
c_shuff[]) = { 0x0605040302010000LL, 0x0e0d0c0b0a090808LL };
DECLARE_ALIGNED(16, static const uint64_t,
d_shuff[]) = { 0x0707060504030201LL, 0x0f0f0e0d0c0b0a09LL };
DECLARE_ALIGNED(16, static const uint64_t,
e_shuff[]) = { 0x0707070605040302LL, 0x0f0f0f0e0d0c0b0aLL };
int y;
dst += x0 + y0 * dstride;
src += x0 + y0 * sstride;
if (!x0) { // Clip left
const v128 b_shuff = v128_from_v64(v64_from_64(0x0d0c0b0a09080808LL),
v64_from_64(0x0504030201000000LL));
const v128 c_shuff = v128_from_v64(v64_from_64(0x0e0d0c0b0a090808LL),
v64_from_64(0x0605040302010000LL));
int y;
for (y = 0; y < sizey; y += 2) {
const v64 l1 = v64_load_aligned(src);
const v64 l2 = v64_load_aligned(src + sstride);
v128 o = v128_from_v64(l1, l2);
const v128 a =
v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1);
v128 b = v128_from_v64(v64_load_unaligned(src - 2 * !!x0),
v64_load_unaligned(src - 2 * !!x0 + sstride));
v128 c = v128_from_v64(v64_load_unaligned(src - !!x0),
v64_load_unaligned(src - !!x0 + sstride));
v128 d = v128_from_v64(v64_load_unaligned(src + !!right),
v64_load_unaligned(src + !!right + sstride));
v128 e = v128_from_v64(v64_load_unaligned(src + 2 * !!right),
v64_load_unaligned(src + 2 * !!right + sstride));
const v128 f = v128_from_v64(
l2, v64_load_aligned(src + ((y != bottom) + 1) * sstride));
for (y = 0; y < sizey; y += 2) {
const v64 l1 = v64_load_aligned(src);
const v64 l2 = v64_load_aligned(src + sstride);
v128 o = v128_from_v64(l1, l2);
const v128 x = v128_add_8(c128, o);
const v128 a = v128_add_8(
c128,
v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1));
const v128 b = v128_shuffle_8(x, b_shuff);
const v128 c = v128_shuffle_8(x, c_shuff);
const v128 d = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(src + 1),
v64_load_unaligned(src + 1 + sstride)));
const v128 e = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(src + 2),
v64_load_unaligned(src + 2 + sstride)));
const v128 f = v128_add_8(
c128, v128_from_v64(
l2, v64_load_aligned(src + ((y != bottom) + 1) * sstride)));
calc_delta(o, x, a, b, c, d, e, f, dst, sp, sm, dstride);
src += sstride * 2;
dst += dstride * 2;
if (!x0) { // Left clipping
b = v128_shuffle_8(b, v128_load_aligned(b_shuff));
c = v128_shuffle_8(c, v128_load_aligned(c_shuff));
}
if (!right) { // Right clipping
d = v128_shuffle_8(d, v128_load_aligned(d_shuff));
e = v128_shuffle_8(e, v128_load_aligned(e_shuff));
}
} else if (!(width - x0 - 8)) { // Clip right
const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0f0e0d0c0b0a09LL),
v64_from_64(0x0707060504030201LL));
const v128 e_shuff = v128_from_v64(v64_from_64(0x0f0f0f0e0d0c0b0aLL),
v64_from_64(0x0707070605040302LL));
int y;
for (y = 0; y < sizey; y += 2) {
const v64 l1 = v64_load_aligned(src);
const v64 l2 = v64_load_aligned(src + sstride);
v128 o = v128_from_v64(l1, l2);
const v128 x = v128_add_8(c128, o);
const v128 a = v128_add_8(
c128,
v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1));
const v128 b = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(src - 2),
v64_load_unaligned(src - 2 + sstride)));
const v128 c = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(src - 1),
v64_load_unaligned(src - 1 + sstride)));
const v128 d = v128_shuffle_8(x, d_shuff);
const v128 e = v128_shuffle_8(x, e_shuff);
const v128 f = v128_add_8(
c128, v128_from_v64(
l2, v64_load_aligned(src + ((y != bottom) + 1) * sstride)));
calc_delta(o, x, a, b, c, d, e, f, dst, sp, sm, dstride);
src += sstride * 2;
dst += dstride * 2;
o = calc_delta(o, a, b, c, d, e, f, sp, sm);
v64_store_aligned(dst, v128_high_v64(o));
v64_store_aligned(dst + dstride, v128_low_v64(o));
src += sstride * 2;
dst += dstride * 2;
}
}
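
The shuffle tables above implement edge clamping without branches in the pixel loop: at the left frame edge the b/c vectors are rebuilt from the block itself, with the leading indices repeated so the leftmost pixel is duplicated ("shift and pad"), and d/e get the mirror-image treatment at the right edge. A standalone sketch of what one 8-byte lane of b_shuff (0x0504030201000000, read little-endian) does:

#include <stdint.h>
#include <stdio.h>

int main(void) {
  const uint8_t lane[8] = { 10, 11, 12, 13, 14, 15, 16, 17 };
  /* Byte i of the output takes input byte shuf[i]; the repeated 0s
   * replicate the leftmost pixel two positions to the right. */
  const uint8_t shuf[8] = { 0, 0, 0, 1, 2, 3, 4, 5 };
  uint8_t out[8];
  int i;
  for (i = 0; i < 8; i++) out[i] = lane[shuf[i]];
  for (i = 0; i < 8; i++) printf("%d ", out[i]); /* 10 10 10 11 12 13 14 15 */
  printf("\n");
  return 0;
}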
// Process blocks of width 4, four lines at a time, 8 bit.
static void clpf_block4(const uint8_t *src, uint8_t *dst, int sstride,
int dstride, int x0, int y0, int sizey, int width,
int height, unsigned int strength) {
const v128 sp = v128_dup_8(strength);
const v128 sm = v128_dup_8(-(int)strength);
const int right = width - 4 - x0;
const int bottom = height - 4 - y0;
DECLARE_ALIGNED(16, static const uint64_t,
b_shuff[]) = { 0x0504040401000000LL, 0x0d0c0c0c09080808LL };
DECLARE_ALIGNED(16, static const uint64_t,
c_shuff[]) = { 0x0605040402010000LL, 0x0e0d0c0c0a090808LL };
DECLARE_ALIGNED(16, static const uint64_t,
d_shuff[]) = { 0x0707060503030201LL, 0x0f0f0e0d0b0b0a09LL };
DECLARE_ALIGNED(16, static const uint64_t,
e_shuff[]) = { 0x0707070603030302LL, 0x0f0f0f0e0b0b0b0aLL };
int y;
dst += x0 + y0 * dstride;
src += x0 + y0 * sstride;
for (y = 0; y < sizey; y += 4) {
const uint32_t l0 = u32_load_aligned(src - (y != -y0) * sstride);
const uint32_t l1 = u32_load_aligned(src);
const uint32_t l2 = u32_load_aligned(src + sstride);
const uint32_t l3 = u32_load_aligned(src + 2 * sstride);
const uint32_t l4 = u32_load_aligned(src + 3 * sstride);
const uint32_t l5 = u32_load_aligned(src + ((y != bottom) + 3) * sstride);
v128 o = v128_from_32(l1, l2, l3, l4);
const v128 a = v128_from_32(l0, l1, l2, l3);
v128 b = v128_from_32(u32_load_unaligned(src - 2 * !!x0),
u32_load_unaligned(src + sstride - 2 * !!x0),
u32_load_unaligned(src + 2 * sstride - 2 * !!x0),
u32_load_unaligned(src + 3 * sstride - 2 * !!x0));
v128 c = v128_from_32(u32_load_unaligned(src - !!x0),
u32_load_unaligned(src + sstride - !!x0),
u32_load_unaligned(src + 2 * sstride - !!x0),
u32_load_unaligned(src + 3 * sstride - !!x0));
v128 d = v128_from_32(u32_load_unaligned(src + !!right),
u32_load_unaligned(src + sstride + !!right),
u32_load_unaligned(src + 2 * sstride + !!right),
u32_load_unaligned(src + 3 * sstride + !!right));
v128 e = v128_from_32(u32_load_unaligned(src + 2 * !!right),
u32_load_unaligned(src + sstride + 2 * !!right),
u32_load_unaligned(src + 2 * sstride + 2 * !!right),
u32_load_unaligned(src + 3 * sstride + 2 * !!right));
const v128 f = v128_from_32(l2, l3, l4, l5);
if (!x0) { // Left clipping
b = v128_shuffle_8(b, v128_load_aligned(b_shuff));
c = v128_shuffle_8(c, v128_load_aligned(c_shuff));
}
} else { // No left/right clipping
int y;
for (y = 0; y < sizey; y += 2) {
const v64 l1 = v64_load_aligned(src);
const v64 l2 = v64_load_aligned(src + sstride);
v128 o = v128_from_v64(l1, l2);
const v128 x = v128_add_8(c128, o);
const v128 a = v128_add_8(
c128,
v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1));
const v128 b = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(src - 2),
v64_load_unaligned(src - 2 + sstride)));
const v128 c = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(src - 1),
v64_load_unaligned(src - 1 + sstride)));
const v128 d = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(src + 1),
v64_load_unaligned(src + 1 + sstride)));
const v128 e = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(src + 2),
v64_load_unaligned(src + 2 + sstride)));
const v128 f = v128_add_8(
c128, v128_from_v64(
l2, v64_load_aligned(src + ((y != bottom) + 1) * sstride)));
calc_delta(o, x, a, b, c, d, e, f, dst, sp, sm, dstride);
src += sstride * 2;
dst += dstride * 2;
if (!right) { // Right clipping
d = v128_shuffle_8(d, v128_load_aligned(d_shuff));
e = v128_shuffle_8(e, v128_load_aligned(e_shuff));
}
o = calc_delta(o, a, b, c, d, e, f, sp, sm);
u32_store_aligned(dst, v128_low_u32(v128_shr_n_byte(o, 12)));
u32_store_aligned(dst + dstride, v128_low_u32(v128_shr_n_byte(o, 8)));
u32_store_aligned(dst + 2 * dstride, v128_low_u32(v128_shr_n_byte(o, 4)));
u32_store_aligned(dst + 3 * dstride, v128_low_u32(o));
dst += 4 * dstride;
src += 4 * sstride;
}
}
@@ -142,24 +176,23 @@ void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int sstride,
int dstride, int x0, int y0, int sizex,
int sizey, int width, int height,
unsigned int strength) {
// TODO(stemidts):
// A sizex different from 8 will only be needed if CLPF is extended to chroma.
// This will only be used if 4:2:0 and width not a multiple of 16 and along
// the right edge only, so we can fall back to the plain C implementation in
// this case. If not extended to chroma, this test will be redundant.
if (sizex != 8 || width < 16 || y0 + 8 > height || x0 + 8 > width) {
if ((sizex != 4 && sizex != 8) || y0 + 4 > height ||
(sizey & 3 && sizex == 4) || x0 + 4 > width) {
// Fallback to C for odd sizes
aom_clpf_block_c(src, dst, sstride, dstride, x0, y0, sizex, sizey, width,
height, strength);
} else {
clpf_block(src, dst, sstride, dstride, x0, y0, sizey, width, height,
strength);
(sizex == 4 ? clpf_block4 : clpf_block8)(src, dst, sstride, dstride, x0, y0,
sizey, width, height, strength);
}
}
#if CONFIG_AOM_HIGHBITDEPTH
static void calc_delta_hbd(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
v128 f, uint16_t *dst, v128 sp, v128 sm) {
// delta = 4/16 * clamp(a - o, -s, s) + 1/16 * clamp(b - o, -s, s) +
// 3/16 * clamp(c - o, -s, s) + 3/16 * clamp(d - o, -s, s) +
// 1/16 * clamp(e - o, -s, s) + 4/16 * clamp(f - o, -s, s)
SIMD_INLINE v128 calc_delta_hbd(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
v128 f, v128 sp, v128 sm) {
const v128 c8 = v128_dup_16(8);
const v128 tmp =
v128_add_16(v128_max_s16(v128_min_s16(v128_sub_16(c, o), sp), sm),
@@ -174,73 +207,124 @@ static void calc_delta_hbd(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
v128_add_16(v128_max_s16(v128_min_s16(v128_sub_16(b, o), sp), sm),
v128_max_s16(v128_min_s16(v128_sub_16(e, o), sp), sm))),
v128_add_16(v128_add_16(tmp, tmp), tmp));
v128_store_aligned(
dst,
v128_add_16(
o, v128_shr_s16(
v128_add_16(c8, v128_add_16(delta, v128_cmplt_s16(
delta, v128_zero()))),
4)));
return v128_add_16(
o, v128_shr_s16(
v128_add_16(
c8, v128_add_16(delta, v128_cmplt_s16(delta, v128_zero()))),
4));
}
SIMD_INLINE void clpf_block_hbd(const uint16_t *src, uint16_t *dst, int sstride,
int dstride, int x0, int y0, int sizey,
int width, int height, unsigned int strength) {
int y;
int bottom = height - 2 - y0;
static void calc_delta_hbd4(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
v128 f, uint16_t *dst, v128 sp, v128 sm,
int dstride) {
o = calc_delta_hbd(o, a, b, c, d, e, f, sp, sm);
v64_store_aligned(dst, v128_high_v64(o));
v64_store_aligned(dst + dstride, v128_low_v64(o));
}
static void calc_delta_hbd8(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
v128 f, uint16_t *dst, v128 sp, v128 sm) {
v128_store_aligned(dst, calc_delta_hbd(o, a, b, c, d, e, f, sp, sm));
}
// Process blocks of width 4, two lines at a time.
SIMD_INLINE void clpf_block_hbd4(const uint16_t *src, uint16_t *dst,
int sstride, int dstride, int x0, int y0,
int sizey, int width, int height,
unsigned int strength) {
const v128 sp = v128_dup_16(strength);
const v128 sm = v128_dup_16(-(int)strength);
const int right = width - 4 - x0;
const int bottom = height - 2 - y0;
DECLARE_ALIGNED(16, static const uint64_t,
b_shuff[]) = { 0x0302010001000100LL, 0x0b0a090809080908LL };
DECLARE_ALIGNED(16, static const uint64_t,
c_shuff[]) = { 0x0504030201000100LL, 0x0d0c0b0a09080908LL };
DECLARE_ALIGNED(16, static const uint64_t,
d_shuff[]) = { 0x0706070605040302LL, 0x0f0e0f0e0d0c0b0aLL };
DECLARE_ALIGNED(16, static const uint64_t,
e_shuff[]) = { 0x0706070607060504LL, 0x0f0e0f0e0f0e0d0cLL };
int y;
dst += x0 + y0 * dstride;
src += x0 + y0 * sstride;
if (!x0) { // Clip left
const v128 b_shuff = v128_from_v64(v64_from_64(0x0b0a090807060504LL),
v64_from_64(0x0302010001000100LL));
const v128 c_shuff = v128_from_v64(v64_from_64(0x0d0c0b0a09080706LL),
v64_from_64(0x0504030201000100LL));
for (y = 0; y < sizey; y++) {
const v128 o = v128_load_aligned(src);
const v128 a = v128_load_aligned(src - (y != -y0) * sstride);
const v128 b = v128_shuffle_8(o, b_shuff);
const v128 c = v128_shuffle_8(o, c_shuff);
const v128 d = v128_load_unaligned(src + 1);
const v128 e = v128_load_unaligned(src + 2);
const v128 f = v128_load_aligned(src + (y - 1 != bottom) * sstride);
calc_delta_hbd(o, a, b, c, d, e, f, dst, sp, sm);
src += sstride;
dst += dstride;
for (y = 0; y < sizey; y += 2) {
const v64 l1 = v64_load_aligned(src);
const v64 l2 = v64_load_aligned(src + sstride);
v128 o = v128_from_v64(l1, l2);
const v128 a =
v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1);
v128 b = v128_from_v64(v64_load_unaligned(src - 2 * !!x0),
v64_load_unaligned(src - 2 * !!x0 + sstride));
v128 c = v128_from_v64(v64_load_unaligned(src - !!x0),
v64_load_unaligned(src - !!x0 + sstride));
v128 d = v128_from_v64(v64_load_unaligned(src + !!right),
v64_load_unaligned(src + !!right + sstride));
v128 e = v128_from_v64(v64_load_unaligned(src + 2 * !!right),
v64_load_unaligned(src + 2 * !!right + sstride));
const v128 f = v128_from_v64(
l2, v64_load_aligned(src + ((y != bottom) + 1) * sstride));
if (!x0) { // Left clipping
b = v128_shuffle_8(b, v128_load_aligned(b_shuff));
c = v128_shuffle_8(c, v128_load_aligned(c_shuff));
}
} else if (!(width - x0 - 8)) { // Clip right
const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0e0f0e0d0c0b0aLL),
v64_from_64(0x0908070605040302LL));
const v128 e_shuff = v128_from_v64(v64_from_64(0x0f0e0f0e0f0e0d0cLL),
v64_from_64(0x0b0a090807060504LL));
for (y = 0; y < sizey; y++) {
const v128 o = v128_load_aligned(src);
const v128 a = v128_load_aligned(src - (y != -y0) * sstride);
const v128 b = v128_load_unaligned(src - 2);
const v128 c = v128_load_unaligned(src - 1);
const v128 d = v128_shuffle_8(o, d_shuff);
const v128 e = v128_shuffle_8(o, e_shuff);
const v128 f = v128_load_aligned(src + (y - 1 != bottom) * sstride);
calc_delta_hbd(o, a, b, c, d, e, f, dst, sp, sm);
src += sstride;
dst += dstride;
if (!right) { // Right clipping
d = v128_shuffle_8(d, v128_load_aligned(d_shuff));
e = v128_shuffle_8(e, v128_load_aligned(e_shuff));
}
} else { // No left/right clipping
for (y = 0; y < sizey; y++) {
const v128 o = v128_load_aligned(src);
const v128 a = v128_load_aligned(src - (y != -y0) * sstride);
const v128 b = v128_load_unaligned(src - 2);
const v128 c = v128_load_unaligned(src - 1);
const v128 d = v128_load_unaligned(src + 1);
const v128 e = v128_load_unaligned(src + 2);
const v128 f = v128_load_aligned(src + (y - 1 != bottom) * sstride);
calc_delta_hbd(o, a, b, c, d, e, f, dst, sp, sm);
src += sstride;
dst += dstride;
calc_delta_hbd4(o, a, b, c, d, e, f, dst, sp, sm, dstride);
src += sstride * 2;
dst += dstride * 2;
}
}
// The simplest case. Start here if you need to understand the functions.
SIMD_INLINE void clpf_block_hbd(const uint16_t *src, uint16_t *dst, int sstride,
int dstride, int x0, int y0, int sizey,
int width, int height, unsigned int strength) {
const v128 sp = v128_dup_16(strength);
const v128 sm = v128_dup_16(-(int)strength);
const int right = width - 8 - x0;
const int bottom = height - 2 - y0;
DECLARE_ALIGNED(16, static const uint64_t,
b_shuff[]) = { 0x0302010001000100LL, 0x0b0a090807060504LL };
DECLARE_ALIGNED(16, static const uint64_t,
c_shuff[]) = { 0x0504030201000100LL, 0x0d0c0b0a09080706LL };
DECLARE_ALIGNED(16, static const uint64_t,
d_shuff[]) = { 0x0908070605040302LL, 0x0f0e0f0e0d0c0b0aLL };
DECLARE_ALIGNED(16, static const uint64_t,
e_shuff[]) = { 0x0b0a090807060504LL, 0x0f0e0f0e0f0e0d0cLL };
int y;
dst += x0 + y0 * dstride;
src += x0 + y0 * sstride;
// Read 8 sets of pixels at a time. Clipping along the upper and lower
// edges is handled by reading the upper or lower line twice.
// Clipping along the left and right edges is handled by shuffle
// instructions doing shift and pad.
for (y = 0; y < sizey; y++) {
const v128 o = v128_load_aligned(src);
const v128 a = v128_load_aligned(src - (y != -y0) * sstride);
const v128 f = v128_load_aligned(src + (y - 1 != bottom) * sstride);
v128 b = v128_load_unaligned(src - 2 * !!x0);
v128 c = v128_load_unaligned(src - !!x0);
v128 d = v128_load_unaligned(src + !!right);
v128 e = v128_load_unaligned(src + 2 * !!right);
if (!x0) { // Left clipping
b = v128_shuffle_8(b, v128_load_aligned(b_shuff));
c = v128_shuffle_8(c, v128_load_aligned(c_shuff));
}
if (!right) { // Right clipping
d = v128_shuffle_8(d, v128_load_aligned(d_shuff));
e = v128_shuffle_8(e, v128_load_aligned(e_shuff));
}
calc_delta_hbd8(o, a, b, c, d, e, f, dst, sp, sm);
src += sstride;
dst += dstride;
}
}
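
Vertical clamping in all of these kernels is branch-free too: the "a" (above) and "f" (below) row loads use a multiplier that collapses to zero at the frame edge, re-reading the current line instead of stepping out of the frame. A one-function sketch of the idiom:

#include <stdint.h>

/* At the frame top y == -y0, so the multiplier is 0 and the current row
 * is re-read in place of the out-of-frame row above. */
static const uint16_t *row_above(const uint16_t *src, int sstride, int y,
                                 int y0) {
  return src - (y != -y0) * sstride;
}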
@@ -248,13 +332,13 @@ void SIMD_FUNC(aom_clpf_block_hbd)(const uint16_t *src, uint16_t *dst,
int sstride, int dstride, int x0, int y0,
int sizex, int sizey, int width, int height,
unsigned int strength) {
if (sizex != 8 || width < 16 || y0 + 8 > height || x0 + 8 > width) {
if ((sizex != 4 && sizex != 8) || y0 + 4 > height || x0 + 4 > width) {
// Fallback to C for odd sizes
aom_clpf_block_hbd_c(src, dst, sstride, dstride, x0, y0, sizex, sizey,
width, height, strength);
} else {
clpf_block_hbd(src, dst, sstride, dstride, x0, y0, sizey, width, height,
strength);
(sizex == 4 ? clpf_block_hbd4 : clpf_block_hbd)(
src, dst, sstride, dstride, x0, y0, sizey, width, height, strength);
}
}
#endif

View File

@@ -153,7 +153,9 @@ typedef struct AV1Common {
#if CONFIG_CLPF
int clpf_numblocks;
int clpf_size;
int clpf_strength;
int clpf_strength_y;
int clpf_strength_u;
int clpf_strength_v;
uint8_t *clpf_blocks;
#endif

View File

@@ -29,6 +29,7 @@
#include "av1/common/alloccommon.h"
#if CONFIG_CLPF
#include "aom/aom_image.h"
#include "av1/common/clpf.h"
#endif
#include "av1/common/common.h"
@@ -2046,8 +2047,10 @@ static void setup_loopfilter(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
#if CONFIG_CLPF
static void setup_clpf(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
cm->clpf_blocks = 0;
cm->clpf_strength = aom_rb_read_literal(rb, 2);
if (cm->clpf_strength) {
cm->clpf_strength_y = aom_rb_read_literal(rb, 2);
cm->clpf_strength_u = aom_rb_read_literal(rb, 2);
cm->clpf_strength_v = aom_rb_read_literal(rb, 2);
if (cm->clpf_strength_y) {
cm->clpf_size = aom_rb_read_literal(rb, 2);
if (cm->clpf_size) {
int i;
@@ -2065,7 +2068,8 @@ static int clpf_bit(UNUSED int k, UNUSED int l,
UNUSED const YV12_BUFFER_CONFIG *org,
UNUSED const AV1_COMMON *cm, UNUSED int block_size,
UNUSED int w, UNUSED int h, UNUSED unsigned int strength,
UNUSED unsigned int fb_size_log2, uint8_t *bit) {
UNUSED unsigned int fb_size_log2, uint8_t *bit,
UNUSED int comp) {
return *bit;
}
#endif
@@ -3928,10 +3932,23 @@ void av1_decode_frame(AV1Decoder *pbi, const uint8_t *data,
#endif // CONFIG_LOOP_RESTORATION
#if CONFIG_CLPF
if (cm->clpf_strength && !cm->skip_loop_filter) {
av1_clpf_frame(&pbi->cur_buf->buf, 0, cm, !!cm->clpf_size,
cm->clpf_strength + (cm->clpf_strength == 3),
4 + cm->clpf_size, cm->clpf_blocks, clpf_bit);
if (!cm->skip_loop_filter) {
const YV12_BUFFER_CONFIG *const frame = &pbi->cur_buf->buf;
if (cm->clpf_strength_y) {
av1_clpf_frame(frame, NULL, cm, !!cm->clpf_size,
cm->clpf_strength_y + (cm->clpf_strength_y == 3),
4 + cm->clpf_size, cm->clpf_blocks, AOM_PLANE_Y, clpf_bit);
}
if (cm->clpf_strength_u) {
av1_clpf_frame(frame, NULL, cm, 0,
cm->clpf_strength_u + (cm->clpf_strength_u == 3), 4, NULL,
AOM_PLANE_U, NULL);
}
if (cm->clpf_strength_v) {
av1_clpf_frame(frame, NULL, cm, 0,
cm->clpf_strength_v + (cm->clpf_strength_v == 3), 4, NULL,
AOM_PLANE_V, NULL);
}
}
if (cm->clpf_blocks) aom_free(cm->clpf_blocks);
#endif
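
The strength arithmetic repeated three times above decodes the 2-bit codes non-linearly: signalled {0,1,2,3} become filter strengths {0,1,2,4} (and the high-bitdepth path later shifts the strength up by bit_depth - 8). A sketch of the mapping as a helper:

/* Signalled 2-bit code -> filter strength: 0,1,2,3 -> 0,1,2,4. */
static unsigned int clpf_strength_from_code(int code) {
  return (unsigned int)(code + (code == 3));
}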

View File

@@ -2590,8 +2590,10 @@ static void encode_loopfilter(AV1_COMMON *cm, struct aom_write_bit_buffer *wb) {
#if CONFIG_CLPF
static void encode_clpf(const AV1_COMMON *cm, struct aom_write_bit_buffer *wb) {
aom_wb_write_literal(wb, cm->clpf_strength, 2);
if (cm->clpf_strength) {
aom_wb_write_literal(wb, cm->clpf_strength_y, 2);
aom_wb_write_literal(wb, cm->clpf_strength_u, 2);
aom_wb_write_literal(wb, cm->clpf_strength_v, 2);
if (cm->clpf_strength_y) {
aom_wb_write_literal(wb, cm->clpf_size, 2);
if (cm->clpf_size) {
int i;

View File

@@ -11,16 +11,17 @@
#include "av1/common/clpf.h"
#include "./aom_dsp_rtcd.h"
#include "aom/aom_image.h"
#include "aom/aom_integer.h"
#include "av1/common/quant_common.h"
// Calculate the error of a filtered and unfiltered block
void aom_clpf_detect_c(const uint8_t *rec, const uint8_t *org, int rstride,
int ostride, int x0, int y0, int width, int height,
int *sum0, int *sum1, unsigned int strength) {
int *sum0, int *sum1, unsigned int strength, int size) {
int x, y;
for (y = y0; y < y0 + 8; y++) {
for (x = x0; x < x0 + 8; x++) {
for (y = y0; y < y0 + size; y++) {
for (x = x0; x < x0 + size; x++) {
int O = org[y * ostride + x];
int X = rec[y * rstride + x];
int A = rec[AOMMAX(0, y - 1) * rstride + x];
@@ -39,11 +40,11 @@ void aom_clpf_detect_c(const uint8_t *rec, const uint8_t *org, int rstride,
void aom_clpf_detect_multi_c(const uint8_t *rec, const uint8_t *org,
int rstride, int ostride, int x0, int y0,
int width, int height, int *sum) {
int width, int height, int *sum, int size) {
int x, y;
for (y = y0; y < y0 + 8; y++) {
for (x = x0; x < x0 + 8; x++) {
for (y = y0; y < y0 + size; y++) {
for (x = x0; x < x0 + size; x++) {
int O = org[y * ostride + x];
int X = rec[y * rstride + x];
int A = rec[AOMMAX(0, y - 1) * rstride + x];
@@ -71,10 +72,10 @@ void aom_clpf_detect_multi_c(const uint8_t *rec, const uint8_t *org,
void aom_clpf_detect_hbd_c(const uint16_t *rec, const uint16_t *org,
int rstride, int ostride, int x0, int y0, int width,
int height, int *sum0, int *sum1,
unsigned int strength, int shift) {
unsigned int strength, int shift, int size) {
int x, y;
for (y = y0; y < y0 + 8; y++) {
for (x = x0; x < x0 + 8; x++) {
for (y = y0; y < y0 + size; y++) {
for (x = x0; x < x0 + size; x++) {
int O = org[y * ostride + x] >> shift;
int X = rec[y * rstride + x] >> shift;
int A = rec[AOMMAX(0, y - 1) * rstride + x] >> shift;
@@ -94,11 +95,12 @@ void aom_clpf_detect_hbd_c(const uint16_t *rec, const uint16_t *org,
// aom_clpf_detect_multi_c() apart from "rec" and "org".
void aom_clpf_detect_multi_hbd_c(const uint16_t *rec, const uint16_t *org,
int rstride, int ostride, int x0, int y0,
int width, int height, int *sum, int shift) {
int width, int height, int *sum, int shift,
int size) {
int x, y;
for (y = y0; y < y0 + 8; y++) {
for (x = x0; x < x0 + 8; x++) {
for (y = y0; y < y0 + size; y++) {
for (x = x0; x < x0 + size; x++) {
int O = org[y * ostride + x] >> shift;
int X = rec[y * rstride + x] >> shift;
int A = rec[AOMMAX(0, y - 1) * rstride + x] >> shift;
@@ -125,31 +127,45 @@ void aom_clpf_detect_multi_hbd_c(const uint16_t *rec, const uint16_t *org,
int av1_clpf_decision(int k, int l, const YV12_BUFFER_CONFIG *rec,
const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
int block_size, int w, int h, unsigned int strength,
unsigned int fb_size_log2, uint8_t *res) {
unsigned int fb_size_log2, uint8_t *res, int plane) {
int m, n, sum0 = 0, sum1 = 0;
const int subx = plane != AOM_PLANE_Y && rec->subsampling_x;
const int suby = plane != AOM_PLANE_Y && rec->subsampling_y;
uint8_t *rec_buffer =
plane != AOM_PLANE_Y
? (plane == AOM_PLANE_U ? rec->u_buffer : rec->v_buffer)
: rec->y_buffer;
uint8_t *org_buffer =
plane != AOM_PLANE_Y
? (plane == AOM_PLANE_U ? org->u_buffer : org->v_buffer)
: org->y_buffer;
int rec_width = plane != AOM_PLANE_Y ? rec->uv_crop_width : rec->y_crop_width;
int rec_height =
plane != AOM_PLANE_Y ? rec->uv_crop_height : rec->y_crop_height;
int rec_stride = plane != AOM_PLANE_Y ? rec->uv_stride : rec->y_stride;
int org_stride = plane != AOM_PLANE_Y ? org->uv_stride : org->y_stride;
for (m = 0; m < h; m++) {
for (n = 0; n < w; n++) {
int xpos = (l << fb_size_log2) + n * block_size;
int ypos = (k << fb_size_log2) + m * block_size;
const int bs = MAX_MIB_SIZE;
if (!cm->mi_grid_visible[ypos / bs * cm->mi_stride + xpos / bs]
if (!cm->mi_grid_visible[(ypos << suby) / MI_SIZE * cm->mi_stride +
(xpos << subx) / MI_SIZE]
->mbmi.skip) {
#if CONFIG_AOM_HIGHBITDEPTH
if (cm->use_highbitdepth) {
aom_clpf_detect_hbd(CONVERT_TO_SHORTPTR(rec->y_buffer),
CONVERT_TO_SHORTPTR(org->y_buffer), rec->y_stride,
org->y_stride, xpos, ypos, rec->y_crop_width,
rec->y_crop_height, &sum0, &sum1, strength,
cm->bit_depth - 8);
aom_clpf_detect_hbd(
CONVERT_TO_SHORTPTR(rec_buffer), CONVERT_TO_SHORTPTR(org_buffer),
rec_stride, org_stride, xpos, ypos, rec_width, rec_height, &sum0,
&sum1, strength, cm->bit_depth - 8, block_size);
} else {
aom_clpf_detect(rec->y_buffer, org->y_buffer, rec->y_stride,
org->y_stride, xpos, ypos, rec->y_crop_width,
rec->y_crop_height, &sum0, &sum1, strength);
aom_clpf_detect(rec_buffer, org_buffer, rec_stride, org_stride, xpos,
ypos, rec_width, rec_height, &sum0, &sum1, strength,
block_size);
}
#else
aom_clpf_detect(rec->y_buffer, org->y_buffer, rec->y_stride,
org->y_stride, xpos, ypos, rec->y_crop_width,
rec->y_crop_height, &sum0, &sum1, strength);
aom_clpf_detect(rec_buffer, org_buffer, rec_stride, org_stride, xpos,
ypos, rec_width, rec_height, &sum0, &sum1, strength,
block_size);
#endif
}
}
@@ -161,6 +177,7 @@ int av1_clpf_decision(int k, int l, const YV12_BUFFER_CONFIG *rec,
// Calculate the square error of all filter settings. Result:
// res[0][0] : unfiltered
// res[0][1-3] : strength=1,2,4, no signals
// (Only for luma:)
// res[1][0] : (bit count, fb size = 128)
// res[1][1-3] : strength=1,2,4, fb size = 128
// res[2][0] : (bit count, fb size = 64)
@@ -170,12 +187,28 @@ int av1_clpf_decision(int k, int l, const YV12_BUFFER_CONFIG *rec,
static int clpf_rdo(int y, int x, const YV12_BUFFER_CONFIG *rec,
const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
unsigned int block_size, unsigned int fb_size_log2, int w,
int h, int64_t res[4][4]) {
int h, int64_t res[4][4], int plane) {
int c, m, n, filtered = 0;
int sum[4];
const int subx = plane != AOM_PLANE_Y && rec->subsampling_x;
const int suby = plane != AOM_PLANE_Y && rec->subsampling_y;
int bslog = get_msb(block_size);
uint8_t *rec_buffer =
plane != AOM_PLANE_Y
? (plane == AOM_PLANE_U ? rec->u_buffer : rec->v_buffer)
: rec->y_buffer;
uint8_t *org_buffer =
plane != AOM_PLANE_Y
? (plane == AOM_PLANE_U ? org->u_buffer : org->v_buffer)
: org->y_buffer;
int rec_width = plane != AOM_PLANE_Y ? rec->uv_crop_width : rec->y_crop_width;
int rec_height =
plane != AOM_PLANE_Y ? rec->uv_crop_height : rec->y_crop_height;
int rec_stride = plane != AOM_PLANE_Y ? rec->uv_stride : rec->y_stride;
int org_stride = plane != AOM_PLANE_Y ? org->uv_stride : org->y_stride;
sum[0] = sum[1] = sum[2] = sum[3] = 0;
if (fb_size_log2 > (unsigned int)get_msb(MAX_FB_SIZE) - 3) {
if (plane == AOM_PLANE_Y &&
fb_size_log2 > (unsigned int)get_msb(MAX_FB_SIZE) - 3) {
int w1, h1, w2, h2, i, sum1, sum2, sum3, oldfiltered;
fb_size_log2--;
@@ -190,16 +223,17 @@ static int clpf_rdo(int y, int x, const YV12_BUFFER_CONFIG *rec,
oldfiltered = res[i][0];
res[i][0] = 0;
filtered =
clpf_rdo(y, x, rec, org, cm, block_size, fb_size_log2, w1, h1, res);
filtered = clpf_rdo(y, x, rec, org, cm, block_size, fb_size_log2, w1, h1,
res, plane);
if (1 << (fb_size_log2 - bslog) < w)
filtered |= clpf_rdo(y, x + (1 << fb_size_log2), rec, org, cm, block_size,
fb_size_log2, w2, h1, res);
fb_size_log2, w2, h1, res, plane);
if (1 << (fb_size_log2 - bslog) < h) {
filtered |= clpf_rdo(y + (1 << fb_size_log2), x, rec, org, cm, block_size,
fb_size_log2, w1, h2, res);
filtered |= clpf_rdo(y + (1 << fb_size_log2), x + (1 << fb_size_log2),
rec, org, cm, block_size, fb_size_log2, w2, h2, res);
fb_size_log2, w1, h2, res, plane);
filtered |=
clpf_rdo(y + (1 << fb_size_log2), x + (1 << fb_size_log2), rec, org,
cm, block_size, fb_size_log2, w2, h2, res, plane);
}
res[i][1] = AOMMIN(sum1 + res[i][0], res[i][1]);
@@ -213,32 +247,31 @@ static int clpf_rdo(int y, int x, const YV12_BUFFER_CONFIG *rec,
for (n = 0; n < w; n++) {
int xpos = x + n * block_size;
int ypos = y + m * block_size;
if (!cm->mi_grid_visible[ypos / MAX_MIB_SIZE * cm->mi_stride +
xpos / MAX_MIB_SIZE]
if (!cm->mi_grid_visible[(ypos << suby) / MI_SIZE * cm->mi_stride +
(xpos << subx) / MI_SIZE]
->mbmi.skip) {
#if CONFIG_AOM_HIGHBITDEPTH
if (cm->use_highbitdepth) {
aom_clpf_detect_multi_hbd(CONVERT_TO_SHORTPTR(rec->y_buffer),
CONVERT_TO_SHORTPTR(org->y_buffer),
rec->y_stride, org->y_stride, xpos, ypos,
rec->y_crop_width, rec->y_crop_height, sum,
cm->bit_depth - 8);
aom_clpf_detect_multi_hbd(
CONVERT_TO_SHORTPTR(rec_buffer), CONVERT_TO_SHORTPTR(org_buffer),
rec_stride, org_stride, xpos, ypos, rec_width, rec_height, sum,
cm->bit_depth - 8, block_size);
} else {
aom_clpf_detect_multi(rec->y_buffer, org->y_buffer, rec->y_stride,
org->y_stride, xpos, ypos, rec->y_crop_width,
rec->y_crop_height, sum);
aom_clpf_detect_multi(rec_buffer, org_buffer, rec_stride, org_stride,
xpos, ypos, rec_width, rec_height, sum,
block_size);
}
#else
aom_clpf_detect_multi(rec->y_buffer, org->y_buffer, rec->y_stride,
org->y_stride, xpos, ypos, rec->y_crop_width,
rec->y_crop_height, sum);
aom_clpf_detect_multi(rec_buffer, org_buffer, rec_stride, org_stride,
xpos, ypos, rec_width, rec_height, sum,
block_size);
#endif
filtered = 1;
}
}
}
for (c = 0; c < 4; c++) {
for (c = 0; c < (plane == AOM_PLANE_Y ? 4 : 1); c++) {
res[c][0] += sum[0];
res[c][1] += sum[1];
res[c][2] += sum[2];
@@ -249,30 +282,42 @@ static int clpf_rdo(int y, int x, const YV12_BUFFER_CONFIG *rec,
void av1_clpf_test_frame(const YV12_BUFFER_CONFIG *rec,
const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
int *best_strength, int *best_bs) {
int *best_strength, int *best_bs, int plane) {
int c, j, k, l;
int64_t best, sums[4][4];
int width = rec->y_crop_width, height = rec->y_crop_height;
const int bs = MAX_MIB_SIZE;
int width = plane != AOM_PLANE_Y ? rec->uv_crop_width : rec->y_crop_width;
int height = plane != AOM_PLANE_Y ? rec->uv_crop_height : rec->y_crop_height;
const int bs = MI_SIZE;
const int bslog = get_msb(bs);
int fb_size_log2 = get_msb(MAX_FB_SIZE);
int num_fb_ver = (height + (1 << fb_size_log2) - bs) >> fb_size_log2;
int num_fb_hor = (width + (1 << fb_size_log2) - bs) >> fb_size_log2;
memset(sums, 0, sizeof(sums));
for (k = 0; k < num_fb_ver; k++) {
for (l = 0; l < num_fb_hor; l++) {
// Calculate the block size after frame border clipping
int h =
AOMMIN(height, (k + 1) << fb_size_log2) & ((1 << fb_size_log2) - 1);
int w =
AOMMIN(width, (l + 1) << fb_size_log2) & ((1 << fb_size_log2) - 1);
h += !h << fb_size_log2;
w += !w << fb_size_log2;
clpf_rdo(k << fb_size_log2, l << fb_size_log2, rec, org, cm, bs,
fb_size_log2, w / bs, h / bs, sums);
if (plane != AOM_PLANE_Y)
// Use a block size of MI_SIZE regardless of the subsampling. This
// is accurate enough to determine the best strength and
// we don't need to add SIMD optimisations for 4x4 blocks.
clpf_rdo(0, 0, rec, org, cm, bs, fb_size_log2, width >> bslog,
height >> bslog, sums, plane);
else
for (k = 0; k < num_fb_ver; k++) {
for (l = 0; l < num_fb_hor; l++) {
// Calculate the block size after frame border clipping
int h =
AOMMIN(height, (k + 1) << fb_size_log2) & ((1 << fb_size_log2) - 1);
int w =
AOMMIN(width, (l + 1) << fb_size_log2) & ((1 << fb_size_log2) - 1);
h += !h << fb_size_log2;
w += !w << fb_size_log2;
clpf_rdo(k << fb_size_log2, l << fb_size_log2, rec, org, cm, MI_SIZE,
fb_size_log2, w >> bslog, h >> bslog, sums, plane);
}
}
}
if (plane != AOM_PLANE_Y) // Slightly favour unfiltered chroma
sums[0][0] -= sums[0][0] >> 7;
for (j = 0; j < 4; j++) {
static const double lambda_square[] = {
@@ -290,13 +335,13 @@ void av1_clpf_test_frame(const YV12_BUFFER_CONFIG *rec,
// Estimate the bit costs and adjust the square errors
double lambda =
lambda_square[av1_get_qindex(&cm->seg, 0, cm->base_qindex) >> 2];
int i, cost = (int)((lambda * (sums[j][0] + 2 + 2 * (j > 0)) + 0.5));
int i, cost = (int)((lambda * (sums[j][0] + 6 + 2 * (j > 0)) + 0.5));
for (i = 0; i < 4; i++)
sums[j][i] = ((sums[j][i] + (i && j) * cost) << 4) + j * 4 + i;
}
best = (int64_t)1 << 62;
for (c = 0; c < 4; c++)
for (c = 0; c < (plane == AOM_PLANE_Y ? 4 : 1); c++)
for (j = 0; j < 4; j++)
if ((!c || j) && sums[c][j] < best) best = sums[c][j];
best &= 15;
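
Two tricks at the end of av1_clpf_test_frame are easy to miss. The >> 7 adjustment shaves 1/128 (about 0.8%) off the unfiltered chroma SSD, so chroma filtering must pay for itself. And the selection packs the decision into the cost: each sum is shifted left by 4 with the candidate's index (j * 4 + i) in the low bits, so minimising the cost also yields the argmin, recovered by best &= 15. A standalone sketch of the packed-argmin idea (costs are hypothetical):

#include <stdint.h>
#include <stdio.h>

int main(void) {
  int64_t best = (int64_t)1 << 62;
  const int64_t cost[4] = { 900, 850, 870, 910 }; /* hypothetical RD costs */
  int i;
  for (i = 0; i < 4; i++) {
    /* Low 4 bits carry the candidate index; comparisons are unaffected
     * because every cost shares the same << 4 scaling. */
    const int64_t packed = (cost[i] << 4) + i;
    if (packed < best) best = packed;
  }
  printf("winner: %d\n", (int)(best & 15)); /* prints "winner: 1" */
  return 0;
}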

View File

@@ -17,10 +17,10 @@
int av1_clpf_decision(int k, int l, const YV12_BUFFER_CONFIG *rec,
const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
int block_size, int w, int h, unsigned int strength,
unsigned int fb_size_log2, uint8_t *res);
unsigned int fb_size_log2, uint8_t *res, int plane);
void av1_clpf_test_frame(const YV12_BUFFER_CONFIG *rec,
const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
int *best_strength, int *best_bs);
int *best_strength, int *best_bs, int plane);
#endif

View File

@@ -9,342 +9,171 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include "./aom_dsp_rtcd.h"
#include "aom_dsp/aom_simd.h"
#include "aom_ports/mem.h"
SIMD_INLINE v128 calc_delta(v128 x, v128 a, v128 b, v128 c, v128 d, v128 e,
v128 f, v128 sp, v128 sm) {
const v128 tmp =
v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm),
v128_max_s8(v128_min_s8(v128_ssub_s8(d, x), sp), sm));
v128 delta = v128_add_8(
v128_add_8(
v128_shl_8(
v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(a, x), sp), sm),
v128_max_s8(v128_min_s8(v128_ssub_s8(f, x), sp), sm)),
2),
v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(b, x), sp), sm),
v128_max_s8(v128_min_s8(v128_ssub_s8(e, x), sp), sm))),
SIMD_INLINE void calc_diff(v128 o, v128 *a, v128 *b, v128 *c, v128 *d, v128 *e,
v128 *f) {
// The difference will be 9 bits; offset by 128 so we can use a saturated
// subtract and avoid going to 16 bits temporarily before "strength" clipping.
const v128 c128 = v128_dup_8(128);
v128 x = v128_add_8(c128, o);
*a = v128_ssub_s8(v128_add_8(c128, *a), x);
*b = v128_ssub_s8(v128_add_8(c128, *b), x);
*c = v128_ssub_s8(v128_add_8(c128, *c), x);
*d = v128_ssub_s8(v128_add_8(c128, *d), x);
*e = v128_ssub_s8(v128_add_8(c128, *e), x);
*f = v128_ssub_s8(v128_add_8(c128, *f), x);
}
SIMD_INLINE v128 delta_kernel(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
v128 f, v128 sp, v128 sm) {
const v128 tmp = v128_add_8(v128_max_s8(v128_min_s8(c, sp), sm),
v128_max_s8(v128_min_s8(d, sp), sm));
const v128 delta = v128_add_8(
v128_add_8(v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, sp), sm),
v128_max_s8(v128_min_s8(f, sp), sm)),
2),
v128_add_8(v128_max_s8(v128_min_s8(b, sp), sm),
v128_max_s8(v128_min_s8(e, sp), sm))),
v128_add_8(v128_add_8(tmp, tmp), tmp));
return v128_shr_s8(
v128_add_8(v128_dup_8(8),
v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()))),
4);
return v128_add_8(
o, v128_shr_s8(
v128_add_8(v128_dup_8(8),
v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()))),
4));
}
SIMD_INLINE v128 calc_delta(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
v128 f, v128 sp, v128 sm) {
calc_diff(o, &a, &b, &c, &d, &e, &f);
return delta_kernel(o, a, b, c, d, e, f, sp, sm);
}
SIMD_INLINE void clip_sides(v128 *b, v128 *c, v128 *d, v128 *e, int left,
int right) {
DECLARE_ALIGNED(16, static const uint64_t,
b_shuff[]) = { 0x0504030201000000LL, 0x0d0c0b0a09080808LL };
DECLARE_ALIGNED(16, static const uint64_t,
c_shuff[]) = { 0x0605040302010000LL, 0x0e0d0c0b0a090808LL };
DECLARE_ALIGNED(16, static const uint64_t,
d_shuff[]) = { 0x0707060504030201LL, 0x0f0f0e0d0c0b0a09LL };
DECLARE_ALIGNED(16, static const uint64_t,
e_shuff[]) = { 0x0707070605040302LL, 0x0f0f0f0e0d0c0b0aLL };
if (!left) { // Left clipping
*b = v128_shuffle_8(*b, v128_load_aligned(b_shuff));
*c = v128_shuffle_8(*c, v128_load_aligned(c_shuff));
}
if (!right) { // Right clipping
*d = v128_shuffle_8(*d, v128_load_aligned(d_shuff));
*e = v128_shuffle_8(*e, v128_load_aligned(e_shuff));
}
}
SIMD_INLINE void read_two_lines(const uint8_t *rec, const uint8_t *org,
int rstride, int ostride, int x0, int y0,
int bottom, int right, int y, v128 *o, v128 *r,
v128 *a, v128 *b, v128 *c, v128 *d, v128 *e,
v128 *f) {
const v64 k1 = v64_load_aligned(org);
const v64 k2 = v64_load_aligned(org + ostride);
const v64 l1 = v64_load_aligned(rec);
const v64 l2 = v64_load_aligned(rec + rstride);
*o = v128_from_v64(k1, k2);
*r = v128_from_v64(l1, l2);
*a = v128_from_v64(v64_load_aligned(rec - (y != -y0) * rstride), l1);
*f = v128_from_v64(l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride));
*b = v128_from_v64(v64_load_unaligned(rec - 2 * !!x0),
v64_load_unaligned(rec - 2 * !!x0 + rstride));
*c = v128_from_v64(v64_load_unaligned(rec - !!x0),
v64_load_unaligned(rec - !!x0 + rstride));
*d = v128_from_v64(v64_load_unaligned(rec + !!right),
v64_load_unaligned(rec + !!right + rstride));
*e = v128_from_v64(v64_load_unaligned(rec + 2 * !!right),
v64_load_unaligned(rec + 2 * !!right + rstride));
clip_sides(b, c, d, e, x0, right);
}
void SIMD_FUNC(aom_clpf_detect)(const uint8_t *rec, const uint8_t *org,
int rstride, int ostride, int x0, int y0,
int width, int height, int *sum0, int *sum1,
unsigned int strength) {
ssd128_internal ssd0 = v128_ssd_u8_init();
ssd128_internal ssd1 = v128_ssd_u8_init();
const v128 c128 = v128_dup_8(128);
unsigned int strength, int size) {
const v128 sp = v128_dup_8(strength);
const v128 sm = v128_dup_8(-(int)strength);
const int right = width - 8 - x0;
const int bottom = height - 2 - y0;
ssd128_internal ssd0 = v128_ssd_u8_init();
ssd128_internal ssd1 = v128_ssd_u8_init();
int y;
if (size != 8) { // Fallback to plain C
aom_clpf_detect_c(rec, org, rstride, ostride, x0, y0, width, height, sum0,
sum1, strength, size);
return;
}
rec += x0 + y0 * rstride;
org += x0 + y0 * ostride;
if (!x0) { // Clip left
const v128 b_shuff = v128_from_v64(v64_from_64(0x0d0c0b0a09080808LL),
v64_from_64(0x0504030201000000LL));
const v128 c_shuff = v128_from_v64(v64_from_64(0x0e0d0c0b0a090808LL),
v64_from_64(0x0605040302010000LL));
int y;
for (y = 0; y < 8; y += 2) {
const v64 k1 = v64_load_aligned(org);
const v64 k2 = v64_load_aligned(org + ostride);
const v64 l1 = v64_load_aligned(rec);
const v64 l2 = v64_load_aligned(rec + rstride);
v128 o = v128_from_v64(k1, k2);
const v128 q = v128_from_v64(l1, l2);
const v128 x = v128_add_8(c128, q);
const v128 a = v128_add_8(
c128,
v128_from_v64(v64_load_aligned(rec - (y != -y0) * rstride), l1));
const v128 b = v128_shuffle_8(x, b_shuff);
const v128 c = v128_shuffle_8(x, c_shuff);
const v128 d = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(rec + 1),
v64_load_unaligned(rec + 1 + rstride)));
const v128 e = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(rec + 2),
v64_load_unaligned(rec + 2 + rstride)));
const v128 f = v128_add_8(
c128, v128_from_v64(
l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride)));
ssd0 = v128_ssd_u8(ssd0, o, q);
ssd1 = v128_ssd_u8(
ssd1, o, v128_add_8(q, calc_delta(x, a, b, c, d, e, f, sp, sm)));
rec += rstride * 2;
org += ostride * 2;
}
} else if (!(width - x0 - 8)) { // Clip right
const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0f0e0d0c0b0a09LL),
v64_from_64(0x0707060504030201LL));
const v128 e_shuff = v128_from_v64(v64_from_64(0x0f0f0f0e0d0c0b0aLL),
v64_from_64(0x0707070605040302LL));
int y;
for (y = 0; y < 8; y += 2) {
const v64 k1 = v64_load_aligned(org);
const v64 k2 = v64_load_aligned(org + ostride);
const v64 l1 = v64_load_aligned(rec);
const v64 l2 = v64_load_aligned(rec + rstride);
v128 o = v128_from_v64(k1, k2);
const v128 q = v128_from_v64(l1, l2);
const v128 x = v128_add_8(c128, q);
const v128 a = v128_add_8(
c128,
v128_from_v64(v64_load_aligned(rec - (y != -y0) * rstride), l1));
const v128 b = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(rec - 2),
v64_load_unaligned(rec - 2 + rstride)));
const v128 c = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(rec - 1),
v64_load_unaligned(rec - 1 + rstride)));
const v128 d = v128_shuffle_8(x, d_shuff);
const v128 e = v128_shuffle_8(x, e_shuff);
const v128 f = v128_add_8(
c128, v128_from_v64(
l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride)));
ssd0 = v128_ssd_u8(ssd0, o, q);
ssd1 = v128_ssd_u8(
ssd1, o, v128_add_8(q, calc_delta(x, a, b, c, d, e, f, sp, sm)));
rec += rstride * 2;
org += ostride * 2;
}
} else { // No left/right clipping
int y;
for (y = 0; y < 8; y += 2) {
const v64 k1 = v64_load_aligned(org);
const v64 k2 = v64_load_aligned(org + ostride);
const v64 l1 = v64_load_aligned(rec);
const v64 l2 = v64_load_aligned(rec + rstride);
v128 o = v128_from_v64(k1, k2);
const v128 q = v128_from_v64(l1, l2);
const v128 x = v128_add_8(c128, q);
const v128 a = v128_add_8(
c128,
v128_from_v64(v64_load_aligned(rec - (y != -y0) * rstride), l1));
const v128 b = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(rec - 2),
v64_load_unaligned(rec - 2 + rstride)));
const v128 c = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(rec - 1),
v64_load_unaligned(rec - 1 + rstride)));
const v128 d = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(rec + 1),
v64_load_unaligned(rec + 1 + rstride)));
const v128 e = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(rec + 2),
v64_load_unaligned(rec + 2 + rstride)));
const v128 f = v128_add_8(
c128, v128_from_v64(
l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride)));
ssd0 = v128_ssd_u8(ssd0, o, q);
ssd1 = v128_ssd_u8(
ssd1, o, v128_add_8(q, calc_delta(x, a, b, c, d, e, f, sp, sm)));
rec += rstride * 2;
org += ostride * 2;
}
for (y = 0; y < 8; y += 2) {
v128 a, b, c, d, e, f, o, r;
read_two_lines(rec, org, rstride, ostride, x0, y0, bottom, right, y, &o, &r,
&a, &b, &c, &d, &e, &f);
ssd0 = v128_ssd_u8(ssd0, o, r);
ssd1 = v128_ssd_u8(ssd1, o, calc_delta(r, a, b, c, d, e, f, sp, sm));
rec += rstride * 2;
org += ostride * 2;
}
*sum0 += v128_ssd_u8_sum(ssd0);
*sum1 += v128_ssd_u8_sum(ssd1);
}
SIMD_INLINE void calc_delta_multi(v128 x, v128 q, v128 o, v128 a, v128 b,
v128 c, v128 d, v128 e, v128 f, v128 cp1,
v128 cm1, v128 cp2, v128 cm2, v128 cp4,
v128 cm4, ssd128_internal *ssd1,
SIMD_INLINE void calc_delta_multi(v128 r, v128 o, v128 a, v128 b, v128 c,
v128 d, v128 e, v128 f, ssd128_internal *ssd1,
ssd128_internal *ssd2,
ssd128_internal *ssd3) {
v128 tmp, delta1, delta2, delta3;
const v128 c8 = v128_dup_8(8);
a = v128_ssub_s8(a, x);
b = v128_ssub_s8(b, x);
c = v128_ssub_s8(c, x);
d = v128_ssub_s8(d, x);
e = v128_ssub_s8(e, x);
f = v128_ssub_s8(f, x);
tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp1), cm1),
v128_max_s8(v128_min_s8(d, cp1), cm1));
delta1 = v128_add_8(
v128_add_8(v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp1), cm1),
v128_max_s8(v128_min_s8(f, cp1), cm1)),
2),
v128_add_8(v128_max_s8(v128_min_s8(b, cp1), cm1),
v128_max_s8(v128_min_s8(e, cp1), cm1))),
v128_add_8(v128_add_8(tmp, tmp), tmp));
tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp2), cm2),
v128_max_s8(v128_min_s8(d, cp2), cm2));
delta2 = v128_add_8(
v128_add_8(v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp2), cm2),
v128_max_s8(v128_min_s8(f, cp2), cm2)),
2),
v128_add_8(v128_max_s8(v128_min_s8(b, cp2), cm2),
v128_max_s8(v128_min_s8(e, cp2), cm2))),
v128_add_8(v128_add_8(tmp, tmp), tmp));
tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp4), cm4),
v128_max_s8(v128_min_s8(d, cp4), cm4));
delta3 = v128_add_8(
v128_add_8(v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp4), cm4),
v128_max_s8(v128_min_s8(f, cp4), cm4)),
2),
v128_add_8(v128_max_s8(v128_min_s8(b, cp4), cm4),
v128_max_s8(v128_min_s8(e, cp4), cm4))),
v128_add_8(v128_add_8(tmp, tmp), tmp));
*ssd1 = v128_ssd_u8(
*ssd1, o,
v128_add_8(
q, v128_shr_s8(
v128_add_8(c8, v128_add_8(delta1,
v128_cmplt_s8(delta1, v128_zero()))),
4)));
*ssd2 = v128_ssd_u8(
*ssd2, o,
v128_add_8(
q, v128_shr_s8(
v128_add_8(c8, v128_add_8(delta2,
v128_cmplt_s8(delta2, v128_zero()))),
4)));
*ssd3 = v128_ssd_u8(
*ssd3, o,
v128_add_8(
q, v128_shr_s8(
v128_add_8(c8, v128_add_8(delta3,
v128_cmplt_s8(delta3, v128_zero()))),
4)));
calc_diff(r, &a, &b, &c, &d, &e, &f);
*ssd1 = v128_ssd_u8(*ssd1, o, delta_kernel(r, a, b, c, d, e, f, v128_dup_8(1),
v128_dup_8(-1)));
*ssd2 = v128_ssd_u8(*ssd2, o, delta_kernel(r, a, b, c, d, e, f, v128_dup_8(2),
v128_dup_8(-2)));
*ssd3 = v128_ssd_u8(*ssd3, o, delta_kernel(r, a, b, c, d, e, f, v128_dup_8(4),
v128_dup_8(-4)));
}
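
The helpers above evaluate, per pixel, the same clipped 7-tap kernel as the
scalar CLPF filter, once per candidate strength (1, 2 and 4). A minimal scalar
sketch of one evaluation (clamp_s and clpf_delta are illustrative names, not
aom API; X is the centre pixel, A/F the pixels above/below, B/C and D/E the
two left and two right neighbours):

static int clamp_s(int v, int s) { return v < -s ? -s : v > s ? s : v; }

// Correction to add to X at clipping strength s.
static int clpf_delta(int X, int A, int B, int C, int D, int E, int F, int s) {
  const int delta = 4 * clamp_s(A - X, s) + clamp_s(B - X, s) +
                    3 * clamp_s(C - X, s) + 3 * clamp_s(D - X, s) +
                    clamp_s(E - X, s) + 4 * clamp_s(F - X, s);
  return (8 + delta - (delta < 0)) >> 4;  // round to nearest, ties away from 0
}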
// Test multiple filter strengths at once.
void SIMD_FUNC(aom_clpf_detect_multi)(const uint8_t *rec, const uint8_t *org,
int rstride, int ostride, int x0, int y0,
int width, int height, int *sum,
int size) {
const int bottom = height - 2 - y0;
const int right = width - 8 - x0;
ssd128_internal ssd0 = v128_ssd_u8_init();
ssd128_internal ssd1 = v128_ssd_u8_init();
ssd128_internal ssd2 = v128_ssd_u8_init();
ssd128_internal ssd3 = v128_ssd_u8_init();
int y;
if (size != 8) { // Fallback to plain C
aom_clpf_detect_multi_c(rec, org, rstride, ostride, x0, y0, width, height,
sum, size);
return;
}
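// Note: the SIMD path below assumes an 8-pixel-wide, 8-row block, processing
// two rows per 128-bit register; other sizes (e.g. subsampled chroma blocks)
// take the C fallback above.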
rec += x0 + y0 * rstride;
org += x0 + y0 * ostride;
for (y = 0; y < 8; y += 2) {
v128 a, b, c, d, e, f, o, r;
read_two_lines(rec, org, rstride, ostride, x0, y0, bottom, right, y, &o, &r,
&a, &b, &c, &d, &e, &f);
ssd0 = v128_ssd_u8(ssd0, o, r);
calc_delta_multi(r, o, a, b, c, d, e, f, &ssd1, &ssd2, &ssd3);
rec += 2 * rstride;
org += 2 * ostride;
}
sum[0] += v128_ssd_u8_sum(ssd0);
sum[1] += v128_ssd_u8_sum(ssd1);
@@ -353,154 +182,66 @@ void SIMD_FUNC(aom_clpf_detect_multi)(const uint8_t *rec, const uint8_t *org,
}
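
On return, sum[0] holds the SSD between the unfiltered block and the source,
and sum[1]..sum[3] the SSD after filtering at strengths 1, 2 and 4. A caller
can then pick the best strength along these lines (an illustrative sketch,
not the actual rdo code):

  int k, best = 0;
  for (k = 1; k < 4; k++)
    if (sum[k] < sum[best]) best = k;
  strength = best ? 1 << (best - 1) : 0;  // 0: leave the block unfiltered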
#if CONFIG_AOM_HIGHBITDEPTH
SIMD_INLINE void read_two_lines_hbd(const uint16_t *rec, const uint16_t *org,
int rstride, int ostride, int x0, int y0,
int bottom, int right, int y, v128 *o,
v128 *r, v128 *a, v128 *b, v128 *c, v128 *d,
v128 *e, v128 *f, int shift) {
const v128 n1 = v128_shr_u16(v128_load_aligned(rec), shift);
const v128 n2 = v128_shr_u16(v128_load_aligned(rec + rstride), shift);
*o = v128_unziplo_8(v128_shr_u16(v128_load_aligned(org), shift),
v128_shr_u16(v128_load_aligned(org + ostride), shift));
*r = v128_unziplo_8(n1, n2);
*a = v128_unziplo_8(
v128_shr_u16(v128_load_aligned(rec - (y != -y0) * rstride), shift), n1);
*f = v128_unziplo_8(
n2, v128_shr_u16(v128_load_unaligned(rec + ((y != bottom) + 1) * rstride),
shift));
*b = v128_unziplo_8(
v128_shr_u16(v128_load_unaligned(rec - 2 * !!x0), shift),
v128_shr_u16(v128_load_unaligned(rec - 2 * !!x0 + rstride), shift));
*c = v128_unziplo_8(
v128_shr_u16(v128_load_unaligned(rec - !!x0), shift),
v128_shr_u16(v128_load_unaligned(rec - !!x0 + rstride), shift));
*d = v128_unziplo_8(
v128_shr_u16(v128_load_unaligned(rec + !!right), shift),
v128_shr_u16(v128_load_unaligned(rec + !!right + rstride), shift));
*e = v128_unziplo_8(
v128_shr_u16(v128_load_unaligned(rec + 2 * !!right), shift),
v128_shr_u16(v128_load_unaligned(rec + 2 * !!right + rstride), shift));
clip_sides(b, c, d, e, x0, right);
}
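
read_two_lines_hbd mirrors read_two_lines, but first scales the 16-bit
samples down to 8 bits (v128_shr_u16 by shift) and packs the low bytes of
each pair of rows together (v128_unziplo_8), so that the 8-bit SSD and delta
kernels above can be reused for the high-bitdepth filter decision. Per sample
the packing amounts to (scalar sketch, not aom API):

  row8[i] = (uint8_t)(row16[i] >> shift);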
void SIMD_FUNC(aom_clpf_detect_hbd)(const uint16_t *rec, const uint16_t *org,
int rstride, int ostride, int x0, int y0,
int width, int height, int *sum0, int *sum1,
unsigned int strength, int shift,
int size) {
const v128 sp = v128_dup_8(strength >> shift);
const v128 sm = v128_dup_8(-(int)(strength >> shift));
const int bottom = height - 2 - y0;
const int right = width - 8 - x0;
ssd128_internal ssd0 = v128_ssd_u8_init();
ssd128_internal ssd1 = v128_ssd_u8_init();
int y;
if (size != 8) { // Fallback to plain C
aom_clpf_detect_hbd_c(rec, org, rstride, ostride, x0, y0, width, height,
sum0, sum1, strength, shift, size);
return;
}
rec += x0 + y0 * rstride;
org += x0 + y0 * ostride;
for (y = 0; y < 8; y += 2) {
v128 a, b, c, d, e, f, o, r;
read_two_lines_hbd(rec, org, rstride, ostride, x0, y0, bottom, right, y, &o,
&r, &a, &b, &c, &d, &e, &f, shift);
ssd0 = v128_ssd_u8(ssd0, o, r);
ssd1 = v128_ssd_u8(ssd1, o, calc_delta(r, a, b, c, d, e, f, sp, sm));
rec += rstride * 2;
org += ostride * 2;
}
*sum0 += v128_ssd_u8_sum(ssd0);
*sum1 += v128_ssd_u8_sum(ssd1);
@@ -510,158 +251,32 @@ void SIMD_FUNC(aom_clpf_detect_multi_hbd)(const uint16_t *rec,
const uint16_t *org, int rstride,
int ostride, int x0, int y0,
int width, int height, int *sum,
int shift, int size) {
const int bottom = height - 2 - y0;
const int right = width - 8 - x0;
ssd128_internal ssd0 = v128_ssd_u8_init();
ssd128_internal ssd1 = v128_ssd_u8_init();
ssd128_internal ssd2 = v128_ssd_u8_init();
ssd128_internal ssd3 = v128_ssd_u8_init();
int y;
if (size != 8) { // Fallback to plain C
aom_clpf_detect_multi_hbd_c(rec, org, rstride, ostride, x0, y0, width,
height, sum, shift, size);
return;
}
rec += x0 + y0 * rstride;
org += x0 + y0 * ostride;
for (y = 0; y < 8; y += 2) {
v128 a, b, c, d, e, f, o, r;
read_two_lines_hbd(rec, org, rstride, ostride, x0, y0, bottom, right, y, &o,
&r, &a, &b, &c, &d, &e, &f, shift);
ssd0 = v128_ssd_u8(ssd0, o, r);
calc_delta_multi(r, o, a, b, c, d, e, f, &ssd1, &ssd2, &ssd3);
rec += 2 * rstride;
org += 2 * ostride;
}
sum[0] += v128_ssd_u8_sum(ssd0);
sum[1] += v128_ssd_u8_sum(ssd1);

View File

@@ -16,6 +16,7 @@
#include "av1/common/alloccommon.h"
#if CONFIG_CLPF
#include "aom/aom_image.h"
#include "av1/common/clpf.h"
#include "av1/encoder/clpf_rdo.h"
#endif
@@ -3422,7 +3423,7 @@ static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) {
#endif
}
#if CONFIG_CLPF
cm->clpf_strength_y = cm->clpf_strength_u = cm->clpf_strength_v = 0;
cm->clpf_size = 2;
CHECK_MEM_ERROR(
cm, cm->clpf_blocks,
@@ -3430,21 +3431,37 @@ static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) {
((cm->frame_to_show->y_crop_height + 31) & ~31) >>
10));
if (!is_lossless_requested(&cpi->oxcf)) {
const YV12_BUFFER_CONFIG *const frame = cm->frame_to_show;
// Find the best strength and block size for the entire frame
int fb_size_log2, strength_y, strength_u, strength_v;
av1_clpf_test_frame(frame, cpi->Source, cm, &strength_y, &fb_size_log2,
AOM_PLANE_Y);
av1_clpf_test_frame(frame, cpi->Source, cm, &strength_u, &fb_size_log2,
AOM_PLANE_U);
av1_clpf_test_frame(frame, cpi->Source, cm, &strength_v, &fb_size_log2,
AOM_PLANE_V);
if (!fb_size_log2) fb_size_log2 = get_msb(MAX_FB_SIZE);
if (strength_y) {
// Apply the filter using the chosen strength
cm->clpf_strength_y = strength_y - (strength_y == 4);
cm->clpf_size =
fb_size_log2 ? fb_size_log2 - get_msb(MAX_FB_SIZE) + 3 : 0;
cm->clpf_numblocks = av1_clpf_frame(
frame, cpi->Source, cm, !!cm->clpf_size, strength_y,
4 + cm->clpf_size, cm->clpf_blocks, AOM_PLANE_Y, av1_clpf_decision);
}
if (strength_u) {
cm->clpf_strength_u = strength_u - (strength_u == 4);
av1_clpf_frame(frame, NULL, cm, 0, strength_u, 4, NULL, AOM_PLANE_U,
NULL);
}
if (strength_v) {
cm->clpf_strength_v = strength_v - (strength_v == 4);
av1_clpf_frame(frame, NULL, cm, 0, strength_v, 4, NULL, AOM_PLANE_V,
NULL);
}
}
#endif
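
The strength - (strength == 4) adjustment above packs the legal strengths 1,
2 and 4 into the codes 1..3, leaving 0 to mean that the filter is off for the
plane. A sketch of the mapping and its presumed inverse (illustrative
helpers, not part of the codebase):

static int clpf_strength_to_code(int strength) {  // 1, 2, 4 -> 1, 2, 3
  return strength - (strength == 4);
}

static int clpf_code_to_strength(int code) {  // 1, 2, 3 -> 1, 2, 4
  return code + (code == 3);
}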

View File

@@ -147,6 +147,8 @@ void test_clpf(int w, int h, int depth, int iterations,
<< "strength: " << (1 << strength) << std::endl
<< "xpos: " << xpos << std::endl
<< "ypos: " << ypos << std::endl
<< "w: " << w << std::endl
<< "h: " << h << std::endl
<< "A=" << (pos > size ? (int16_t)s[pos - size] : -1) << std::endl
<< "B=" << (pos % size - 2 >= 0 ? (int16_t)s[pos - 2] : -1) << std::endl
<< "C=" << (pos % size - 1 >= 0 ? (int16_t)s[pos - 1] : -1) << std::endl