Extend CLPF to chroma.
Objective quality impact (low latency): PSNR YCbCr: 0.13% -1.37% -1.79% PSNRHVS: 0.03% SSIM: 0.24% MSSSIM: 0.10% CIEDE2000: -0.83% Change-Id: I8ddf0def569286775f0f9d4d4005932766a7fc27
This commit is contained in:

committed by
Yaowu Xu

parent
9021d09f9a
commit
ecf9a0c821
@@ -590,16 +590,16 @@ if (aom_config("CONFIG_CLPF") eq "yes") {
|
|||||||
if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
|
if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
|
||||||
add_proto qw/void aom_clpf_block_hbd/, "const uint16_t *src, uint16_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, int width, int height, unsigned int strength";
|
add_proto qw/void aom_clpf_block_hbd/, "const uint16_t *src, uint16_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, int width, int height, unsigned int strength";
|
||||||
specialize qw/aom_clpf_block_hbd sse2 ssse3 sse4_1 neon/;
|
specialize qw/aom_clpf_block_hbd sse2 ssse3 sse4_1 neon/;
|
||||||
add_proto qw/void aom_clpf_detect_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int shift";
|
add_proto qw/void aom_clpf_detect_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int shift, int size";
|
||||||
specialize qw/aom_clpf_detect_hbd sse2 ssse3 sse4_1 neon/;
|
specialize qw/aom_clpf_detect_hbd sse2 ssse3 sse4_1 neon/;
|
||||||
add_proto qw/void aom_clpf_detect_multi_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int shift";
|
add_proto qw/void aom_clpf_detect_multi_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int shift, int size";
|
||||||
specialize qw/aom_clpf_detect_multi_hbd sse2 ssse3 sse4_1 neon/;
|
specialize qw/aom_clpf_detect_multi_hbd sse2 ssse3 sse4_1 neon/;
|
||||||
}
|
}
|
||||||
add_proto qw/void aom_clpf_block/, "const uint8_t *src, uint8_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, int width, int height, unsigned int strength";
|
add_proto qw/void aom_clpf_block/, "const uint8_t *src, uint8_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, int width, int height, unsigned int strength";
|
||||||
specialize qw/aom_clpf_block sse2 ssse3 sse4_1 neon/;
|
specialize qw/aom_clpf_block sse2 ssse3 sse4_1 neon/;
|
||||||
add_proto qw/void aom_clpf_detect/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength";
|
add_proto qw/void aom_clpf_detect/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int size";
|
||||||
specialize qw/aom_clpf_detect sse2 ssse3 sse4_1 neon/;
|
specialize qw/aom_clpf_detect sse2 ssse3 sse4_1 neon/;
|
||||||
add_proto qw/void aom_clpf_detect_multi/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum";
|
add_proto qw/void aom_clpf_detect_multi/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int size";
|
||||||
specialize qw/aom_clpf_detect_multi sse2 ssse3 sse4_1 neon/;
|
specialize qw/aom_clpf_detect_multi sse2 ssse3 sse4_1 neon/;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -8,9 +8,10 @@
|
|||||||
* Media Patent License 1.0 was not distributed with this source code in the
|
* Media Patent License 1.0 was not distributed with this source code in the
|
||||||
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
|
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
|
||||||
*/
|
*/
|
||||||
#include <assert.h>
|
|
||||||
#include "av1/common/clpf.h"
|
#include "av1/common/clpf.h"
|
||||||
#include "./aom_dsp_rtcd.h"
|
#include "./aom_dsp_rtcd.h"
|
||||||
|
#include "aom/aom_image.h"
|
||||||
#include "aom_dsp/aom_dsp_common.h"
|
#include "aom_dsp/aom_dsp_common.h"
|
||||||
|
|
||||||
int av1_clpf_maxbits(const AV1_COMMON *cm) {
|
int av1_clpf_maxbits(const AV1_COMMON *cm) {
|
||||||
@@ -72,21 +73,24 @@ void aom_clpf_block_hbd_c(const uint16_t *src, uint16_t *dst, int sstride,
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Return number of filtered blocks
|
// Return number of filtered blocks
|
||||||
int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
|
int av1_clpf_frame(
|
||||||
const YV12_BUFFER_CONFIG *org, AV1_COMMON *cm,
|
const YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *org,
|
||||||
int enable_fb_flag, unsigned int strength,
|
AV1_COMMON *cm, int enable_fb_flag, unsigned int strength,
|
||||||
unsigned int fb_size_log2, uint8_t *blocks,
|
unsigned int fb_size_log2, uint8_t *blocks, int plane,
|
||||||
int (*decision)(int, int, const YV12_BUFFER_CONFIG *,
|
int (*decision)(int, int, const YV12_BUFFER_CONFIG *,
|
||||||
const YV12_BUFFER_CONFIG *,
|
const YV12_BUFFER_CONFIG *, const AV1_COMMON *cm, int, int,
|
||||||
const AV1_COMMON *cm, int, int, int,
|
int, unsigned int, unsigned int, uint8_t *, int)) {
|
||||||
unsigned int, unsigned int, uint8_t *)) {
|
|
||||||
/* Constrained low-pass filter (CLPF) */
|
/* Constrained low-pass filter (CLPF) */
|
||||||
int c, k, l, m, n;
|
int c, k, l, m, n;
|
||||||
const int bs = MI_SIZE;
|
const int subx = plane != AOM_PLANE_Y && frame->subsampling_x;
|
||||||
const int width = frame->y_crop_width;
|
const int suby = plane != AOM_PLANE_Y && frame->subsampling_y;
|
||||||
const int height = frame->y_crop_height;
|
const int bs = (subx || suby) ? 4 : 8;
|
||||||
|
const int bslog = get_msb(bs);
|
||||||
|
int width = plane != AOM_PLANE_Y ? frame->uv_crop_width : frame->y_crop_width;
|
||||||
|
int height =
|
||||||
|
plane != AOM_PLANE_Y ? frame->uv_crop_height : frame->y_crop_height;
|
||||||
int xpos, ypos;
|
int xpos, ypos;
|
||||||
const int sstride = frame->y_stride;
|
const int sstride = plane != AOM_PLANE_Y ? frame->uv_stride : frame->y_stride;
|
||||||
int dstride = bs;
|
int dstride = bs;
|
||||||
const int num_fb_hor = (width + (1 << fb_size_log2) - 1) >> fb_size_log2;
|
const int num_fb_hor = (width + (1 << fb_size_log2) - 1) >> fb_size_log2;
|
||||||
const int num_fb_ver = (height + (1 << fb_size_log2) - 1) >> fb_size_log2;
|
const int num_fb_ver = (height + (1 << fb_size_log2) - 1) >> fb_size_log2;
|
||||||
@@ -97,9 +101,11 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
|
|||||||
int cache_idx = 0;
|
int cache_idx = 0;
|
||||||
const int cache_size = num_fb_hor << (2 * fb_size_log2);
|
const int cache_size = num_fb_hor << (2 * fb_size_log2);
|
||||||
const int cache_blocks = cache_size / (bs * bs);
|
const int cache_blocks = cache_size / (bs * bs);
|
||||||
YV12_BUFFER_CONFIG dst = *frame;
|
uint8_t *src_buffer =
|
||||||
|
plane != AOM_PLANE_Y
|
||||||
assert(bs == 8); // Optimised code assumes this.
|
? (plane == AOM_PLANE_U ? frame->u_buffer : frame->v_buffer)
|
||||||
|
: frame->y_buffer;
|
||||||
|
uint8_t *dst_buffer;
|
||||||
|
|
||||||
#if CONFIG_AOM_HIGHBITDEPTH
|
#if CONFIG_AOM_HIGHBITDEPTH
|
||||||
strength <<= (cm->bit_depth - 8);
|
strength <<= (cm->bit_depth - 8);
|
||||||
@@ -108,10 +114,10 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
|
|||||||
// Make buffer space for in-place filtering
|
// Make buffer space for in-place filtering
|
||||||
#if CONFIG_AOM_HIGHBITDEPTH
|
#if CONFIG_AOM_HIGHBITDEPTH
|
||||||
CHECK_MEM_ERROR(cm, cache, aom_malloc(cache_size << !!cm->use_highbitdepth));
|
CHECK_MEM_ERROR(cm, cache, aom_malloc(cache_size << !!cm->use_highbitdepth));
|
||||||
dst.y_buffer = cm->use_highbitdepth ? CONVERT_TO_BYTEPTR(cache) : cache;
|
dst_buffer = cm->use_highbitdepth ? CONVERT_TO_BYTEPTR(cache) : cache;
|
||||||
#else
|
#else
|
||||||
CHECK_MEM_ERROR(cm, cache, aom_malloc(cache_size));
|
CHECK_MEM_ERROR(cm, cache, aom_malloc(cache_size));
|
||||||
dst.y_buffer = cache;
|
dst_buffer = cache;
|
||||||
#endif
|
#endif
|
||||||
CHECK_MEM_ERROR(cm, cache_ptr, aom_malloc(cache_blocks * sizeof(*cache_ptr)));
|
CHECK_MEM_ERROR(cm, cache_ptr, aom_malloc(cache_blocks * sizeof(*cache_ptr)));
|
||||||
CHECK_MEM_ERROR(cm, cache_dst, aom_malloc(cache_blocks * sizeof(*cache_dst)));
|
CHECK_MEM_ERROR(cm, cache_dst, aom_malloc(cache_blocks * sizeof(*cache_dst)));
|
||||||
@@ -130,7 +136,8 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
|
|||||||
ypos = yoff + m * bs;
|
ypos = yoff + m * bs;
|
||||||
if (xpos < width && ypos < height) {
|
if (xpos < width && ypos < height) {
|
||||||
allskip &=
|
allskip &=
|
||||||
cm->mi_grid_visible[ypos / bs * cm->mi_stride + xpos / bs]
|
cm->mi_grid_visible[(ypos << suby) / MI_SIZE * cm->mi_stride +
|
||||||
|
(xpos << subx) / MI_SIZE]
|
||||||
->mbmi.skip;
|
->mbmi.skip;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -144,13 +151,14 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
|
|||||||
if (!allskip && // Do not filter the block if all is skip encoded
|
if (!allskip && // Do not filter the block if all is skip encoded
|
||||||
(!enable_fb_flag ||
|
(!enable_fb_flag ||
|
||||||
decision(k, l, frame, org, cm, bs, w / bs, h / bs, strength,
|
decision(k, l, frame, org, cm, bs, w / bs, h / bs, strength,
|
||||||
fb_size_log2, blocks + block_index))) {
|
fb_size_log2, blocks + block_index, plane))) {
|
||||||
// Iterate over all smaller blocks inside the filter block
|
// Iterate over all smaller blocks inside the filter block
|
||||||
for (m = 0; m < (h + bs - 1) / bs; m++) {
|
for (m = 0; m < ((h + bs - 1) >> bslog); m++) {
|
||||||
for (n = 0; n < (w + bs - 1) / bs; n++) {
|
for (n = 0; n < ((w + bs - 1) >> bslog); n++) {
|
||||||
xpos = xoff + n * bs;
|
xpos = xoff + n * bs;
|
||||||
ypos = yoff + m * bs;
|
ypos = yoff + m * bs;
|
||||||
if (!cm->mi_grid_visible[ypos / bs * cm->mi_stride + xpos / bs]
|
if (!cm->mi_grid_visible[(ypos << suby) / MI_SIZE * cm->mi_stride +
|
||||||
|
(xpos << subx) / MI_SIZE]
|
||||||
->mbmi.skip) { // Not skip block
|
->mbmi.skip) { // Not skip block
|
||||||
// Temporary buffering needed if filtering in-place
|
// Temporary buffering needed if filtering in-place
|
||||||
if (cache_ptr[cache_idx]) {
|
if (cache_ptr[cache_idx]) {
|
||||||
@@ -161,50 +169,59 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
|
|||||||
for (c = 0; c < bs; c++) {
|
for (c = 0; c < bs; c++) {
|
||||||
*(uint64_t *)(d + c * sstride) =
|
*(uint64_t *)(d + c * sstride) =
|
||||||
*(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2);
|
*(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2);
|
||||||
|
if (bs == 8)
|
||||||
*(uint64_t *)(d + c * sstride + 4) =
|
*(uint64_t *)(d + c * sstride + 4) =
|
||||||
*(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2 + 8);
|
*(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2 + 8);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for (c = 0; c < bs; c++)
|
for (c = 0; c < bs; c++)
|
||||||
|
if (bs == 8)
|
||||||
*(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
|
*(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
|
||||||
*(uint64_t *)(cache_ptr[cache_idx] + c * bs);
|
*(uint64_t *)(cache_ptr[cache_idx] + c * bs);
|
||||||
|
else
|
||||||
|
*(uint32_t *)(cache_dst[cache_idx] + c * sstride) =
|
||||||
|
*(uint32_t *)(cache_ptr[cache_idx] + c * bs);
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
for (c = 0; c < bs; c++)
|
for (c = 0; c < bs; c++)
|
||||||
|
if (bs == 8)
|
||||||
*(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
|
*(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
|
||||||
*(uint64_t *)(cache_ptr[cache_idx] + c * bs);
|
*(uint64_t *)(cache_ptr[cache_idx] + c * bs);
|
||||||
|
else
|
||||||
|
*(uint32_t *)(cache_dst[cache_idx] + c * sstride) =
|
||||||
|
*(uint32_t *)(cache_ptr[cache_idx] + c * bs);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
#if CONFIG_AOM_HIGHBITDEPTH
|
#if CONFIG_AOM_HIGHBITDEPTH
|
||||||
if (cm->use_highbitdepth) {
|
if (cm->use_highbitdepth) {
|
||||||
cache_ptr[cache_idx] = cache + cache_idx * bs * bs * 2;
|
cache_ptr[cache_idx] = cache + cache_idx * bs * bs * 2;
|
||||||
dst.y_buffer =
|
dst_buffer =
|
||||||
CONVERT_TO_BYTEPTR(cache_ptr[cache_idx]) - ypos * bs - xpos;
|
CONVERT_TO_BYTEPTR(cache_ptr[cache_idx]) - ypos * bs - xpos;
|
||||||
} else {
|
} else {
|
||||||
cache_ptr[cache_idx] = cache + cache_idx * bs * bs;
|
cache_ptr[cache_idx] = cache + cache_idx * bs * bs;
|
||||||
dst.y_buffer = cache_ptr[cache_idx] - ypos * bs - xpos;
|
dst_buffer = cache_ptr[cache_idx] - ypos * bs - xpos;
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
cache_ptr[cache_idx] = cache + cache_idx * bs * bs;
|
cache_ptr[cache_idx] = cache + cache_idx * bs * bs;
|
||||||
dst.y_buffer = cache_ptr[cache_idx] - ypos * bs - xpos;
|
dst_buffer = cache_ptr[cache_idx] - ypos * bs - xpos;
|
||||||
#endif
|
#endif
|
||||||
cache_dst[cache_idx] = frame->y_buffer + ypos * sstride + xpos;
|
cache_dst[cache_idx] = src_buffer + ypos * sstride + xpos;
|
||||||
if (++cache_idx >= cache_blocks) cache_idx = 0;
|
if (++cache_idx >= cache_blocks) cache_idx = 0;
|
||||||
|
|
||||||
// Apply the filter
|
// Apply the filter
|
||||||
#if CONFIG_AOM_HIGHBITDEPTH
|
#if CONFIG_AOM_HIGHBITDEPTH
|
||||||
if (cm->use_highbitdepth) {
|
if (cm->use_highbitdepth) {
|
||||||
aom_clpf_block_hbd(CONVERT_TO_SHORTPTR(frame->y_buffer),
|
aom_clpf_block_hbd(CONVERT_TO_SHORTPTR(src_buffer),
|
||||||
CONVERT_TO_SHORTPTR(dst.y_buffer), sstride,
|
CONVERT_TO_SHORTPTR(dst_buffer), sstride,
|
||||||
dstride, xpos, ypos, bs, bs, width, height,
|
dstride, xpos, ypos, bs, bs, width, height,
|
||||||
strength);
|
strength);
|
||||||
} else {
|
} else {
|
||||||
aom_clpf_block(frame->y_buffer, dst.y_buffer, sstride, dstride,
|
aom_clpf_block(src_buffer, dst_buffer, sstride, dstride, xpos,
|
||||||
xpos, ypos, bs, bs, width, height, strength);
|
ypos, bs, bs, width, height, strength);
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
aom_clpf_block(frame->y_buffer, dst.y_buffer, sstride, dstride,
|
aom_clpf_block(src_buffer, dst_buffer, sstride, dstride, xpos,
|
||||||
xpos, ypos, bs, bs, width, height, strength);
|
ypos, bs, bs, width, height, strength);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -223,16 +240,25 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
|
|||||||
for (c = 0; c < bs; c++) {
|
for (c = 0; c < bs; c++) {
|
||||||
*(uint64_t *)(d + c * sstride) =
|
*(uint64_t *)(d + c * sstride) =
|
||||||
*(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2);
|
*(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2);
|
||||||
|
if (bs == 8)
|
||||||
*(uint64_t *)(d + c * sstride + 4) =
|
*(uint64_t *)(d + c * sstride + 4) =
|
||||||
*(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2 + 8);
|
*(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2 + 8);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for (c = 0; c < bs; c++)
|
for (c = 0; c < bs; c++)
|
||||||
|
if (bs == 4)
|
||||||
|
*(uint32_t *)(cache_dst[cache_idx] + c * sstride) =
|
||||||
|
*(uint32_t *)(cache_ptr[cache_idx] + c * bs);
|
||||||
|
else
|
||||||
*(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
|
*(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
|
||||||
*(uint64_t *)(cache_ptr[cache_idx] + c * bs);
|
*(uint64_t *)(cache_ptr[cache_idx] + c * bs);
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
for (c = 0; c < bs; c++)
|
for (c = 0; c < bs; c++)
|
||||||
|
if (bs == 4)
|
||||||
|
*(uint32_t *)(cache_dst[cache_idx] + c * sstride) =
|
||||||
|
*(uint32_t *)(cache_ptr[cache_idx] + c * bs);
|
||||||
|
else
|
||||||
*(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
|
*(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
|
||||||
*(uint64_t *)(cache_ptr[cache_idx] + c * bs);
|
*(uint64_t *)(cache_ptr[cache_idx] + c * bs);
|
||||||
#endif
|
#endif
|
||||||
|
@@ -20,10 +20,10 @@ int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int b);
|
|||||||
int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
|
int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
|
||||||
const YV12_BUFFER_CONFIG *org, AV1_COMMON *cm,
|
const YV12_BUFFER_CONFIG *org, AV1_COMMON *cm,
|
||||||
int enable_fb_flag, unsigned int strength,
|
int enable_fb_flag, unsigned int strength,
|
||||||
unsigned int fb_size_log2, uint8_t *blocks,
|
unsigned int fb_size_log2, uint8_t *blocks, int plane,
|
||||||
int (*decision)(int, int, const YV12_BUFFER_CONFIG *,
|
int (*decision)(int, int, const YV12_BUFFER_CONFIG *,
|
||||||
const YV12_BUFFER_CONFIG *,
|
const YV12_BUFFER_CONFIG *,
|
||||||
const AV1_COMMON *cm, int, int, int,
|
const AV1_COMMON *cm, int, int, int,
|
||||||
unsigned int, unsigned int, uint8_t *));
|
unsigned int, unsigned int, uint8_t *, int));
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@@ -10,131 +10,165 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
#include "./aom_dsp_rtcd.h"
|
#include "./aom_dsp_rtcd.h"
|
||||||
|
#include "aom_ports/mem.h"
|
||||||
|
|
||||||
SIMD_INLINE void calc_delta(v128 o, v128 x, v128 a, v128 b, v128 c, v128 d,
|
// delta = 4/16 * clamp(a - o, -s, s) + 1/16 * clamp(b - o, -s, s) +
|
||||||
v128 e, v128 f, uint8_t *dst, v128 sp, v128 sm,
|
// 3/16 * clamp(c - o, -s, s) + 3/16 * clamp(d - o, -s, s) +
|
||||||
int dstride) {
|
// 1/16 * clamp(e - o, -s, s) + 4/16 * clamp(f - o, -s, s)
|
||||||
|
SIMD_INLINE v128 calc_delta(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
|
||||||
|
v128 f, v128 sp, v128 sm) {
|
||||||
|
// The difference will be 9 bit, offset by 128 so we can use saturated
|
||||||
|
// sub to avoid going to 16 bit temporarily before "strength" clipping.
|
||||||
|
const v128 c128 = v128_dup_8(128);
|
||||||
|
const v128 x = v128_add_8(c128, o);
|
||||||
const v128 c8 = v128_dup_8(8);
|
const v128 c8 = v128_dup_8(8);
|
||||||
const v128 tmp =
|
const v128 tmp = v128_add_8(
|
||||||
v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm),
|
v128_max_s8(v128_min_s8(v128_ssub_s8(v128_add_8(c128, c), x), sp), sm),
|
||||||
v128_max_s8(v128_min_s8(v128_ssub_s8(d, x), sp), sm));
|
v128_max_s8(v128_min_s8(v128_ssub_s8(v128_add_8(c128, d), x), sp), sm));
|
||||||
const v128 delta = v128_add_8(
|
const v128 delta = v128_add_8(
|
||||||
v128_add_8(
|
v128_add_8(
|
||||||
v128_shl_8(
|
v128_shl_8(
|
||||||
v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(a, x), sp), sm),
|
v128_add_8(
|
||||||
v128_max_s8(v128_min_s8(v128_ssub_s8(f, x), sp), sm)),
|
v128_max_s8(
|
||||||
|
v128_min_s8(v128_ssub_s8(v128_add_8(c128, a), x), sp),
|
||||||
|
sm),
|
||||||
|
v128_max_s8(
|
||||||
|
v128_min_s8(v128_ssub_s8(v128_add_8(c128, f), x), sp),
|
||||||
|
sm)),
|
||||||
2),
|
2),
|
||||||
v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(b, x), sp), sm),
|
v128_add_8(
|
||||||
v128_max_s8(v128_min_s8(v128_ssub_s8(e, x), sp), sm))),
|
v128_max_s8(v128_min_s8(v128_ssub_s8(v128_add_8(c128, b), x), sp),
|
||||||
|
sm),
|
||||||
|
v128_max_s8(v128_min_s8(v128_ssub_s8(v128_add_8(c128, e), x), sp),
|
||||||
|
sm))),
|
||||||
v128_add_8(v128_add_8(tmp, tmp), tmp));
|
v128_add_8(v128_add_8(tmp, tmp), tmp));
|
||||||
o = v128_add_8(
|
return v128_add_8(
|
||||||
o,
|
o,
|
||||||
v128_shr_s8(
|
v128_shr_s8(
|
||||||
v128_add_8(c8, v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()))),
|
v128_add_8(c8, v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()))),
|
||||||
4));
|
4));
|
||||||
v64_store_aligned(dst, v128_high_v64(o));
|
|
||||||
v64_store_aligned(dst + dstride, v128_low_v64(o));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void clpf_block(const uint8_t *src, uint8_t *dst, int sstride,
|
// Process blocks of width 8, two lines at a time, 8 bit.
|
||||||
|
static void clpf_block8(const uint8_t *src, uint8_t *dst, int sstride,
|
||||||
int dstride, int x0, int y0, int sizey, int width,
|
int dstride, int x0, int y0, int sizey, int width,
|
||||||
int height, unsigned int strength) {
|
int height, unsigned int strength) {
|
||||||
int bottom = height - 2 - y0;
|
const int bottom = height - 2 - y0;
|
||||||
|
const int right = width - 8 - x0;
|
||||||
const v128 sp = v128_dup_8(strength);
|
const v128 sp = v128_dup_8(strength);
|
||||||
const v128 sm = v128_dup_8(-(int)strength);
|
const v128 sm = v128_dup_8(-(int)strength);
|
||||||
const v128 c128 = v128_dup_8(128);
|
DECLARE_ALIGNED(16, static const uint64_t,
|
||||||
|
b_shuff[]) = { 0x0504030201000000LL, 0x0d0c0b0a09080808LL };
|
||||||
|
DECLARE_ALIGNED(16, static const uint64_t,
|
||||||
|
c_shuff[]) = { 0x0605040302010000LL, 0x0e0d0c0b0a090808LL };
|
||||||
|
DECLARE_ALIGNED(16, static const uint64_t,
|
||||||
|
d_shuff[]) = { 0x0707060504030201LL, 0x0f0f0e0d0c0b0a09LL };
|
||||||
|
DECLARE_ALIGNED(16, static const uint64_t,
|
||||||
|
e_shuff[]) = { 0x0707070605040302LL, 0x0f0f0f0e0d0c0b0aLL };
|
||||||
|
int y;
|
||||||
|
|
||||||
dst += x0 + y0 * dstride;
|
dst += x0 + y0 * dstride;
|
||||||
src += x0 + y0 * sstride;
|
src += x0 + y0 * sstride;
|
||||||
|
|
||||||
if (!x0) { // Clip left
|
for (y = 0; y < sizey; y += 2) {
|
||||||
const v128 b_shuff = v128_from_v64(v64_from_64(0x0d0c0b0a09080808LL),
|
const v64 l1 = v64_load_aligned(src);
|
||||||
v64_from_64(0x0504030201000000LL));
|
const v64 l2 = v64_load_aligned(src + sstride);
|
||||||
const v128 c_shuff = v128_from_v64(v64_from_64(0x0e0d0c0b0a090808LL),
|
v128 o = v128_from_v64(l1, l2);
|
||||||
v64_from_64(0x0605040302010000LL));
|
const v128 a =
|
||||||
|
v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1);
|
||||||
|
v128 b = v128_from_v64(v64_load_unaligned(src - 2 * !!x0),
|
||||||
|
v64_load_unaligned(src - 2 * !!x0 + sstride));
|
||||||
|
v128 c = v128_from_v64(v64_load_unaligned(src - !!x0),
|
||||||
|
v64_load_unaligned(src - !!x0 + sstride));
|
||||||
|
v128 d = v128_from_v64(v64_load_unaligned(src + !!right),
|
||||||
|
v64_load_unaligned(src + !!right + sstride));
|
||||||
|
v128 e = v128_from_v64(v64_load_unaligned(src + 2 * !!right),
|
||||||
|
v64_load_unaligned(src + 2 * !!right + sstride));
|
||||||
|
const v128 f = v128_from_v64(
|
||||||
|
l2, v64_load_aligned(src + ((y != bottom) + 1) * sstride));
|
||||||
|
|
||||||
|
if (!x0) { // Left clipping
|
||||||
|
b = v128_shuffle_8(b, v128_load_aligned(b_shuff));
|
||||||
|
c = v128_shuffle_8(c, v128_load_aligned(c_shuff));
|
||||||
|
}
|
||||||
|
if (!right) { // Right clipping
|
||||||
|
d = v128_shuffle_8(d, v128_load_aligned(d_shuff));
|
||||||
|
e = v128_shuffle_8(e, v128_load_aligned(e_shuff));
|
||||||
|
}
|
||||||
|
|
||||||
|
o = calc_delta(o, a, b, c, d, e, f, sp, sm);
|
||||||
|
v64_store_aligned(dst, v128_high_v64(o));
|
||||||
|
v64_store_aligned(dst + dstride, v128_low_v64(o));
|
||||||
|
src += sstride * 2;
|
||||||
|
dst += dstride * 2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Process blocks of width 4, four lines at a time, 8 bit.
|
||||||
|
static void clpf_block4(const uint8_t *src, uint8_t *dst, int sstride,
|
||||||
|
int dstride, int x0, int y0, int sizey, int width,
|
||||||
|
int height, unsigned int strength) {
|
||||||
|
const v128 sp = v128_dup_8(strength);
|
||||||
|
const v128 sm = v128_dup_8(-(int)strength);
|
||||||
|
const int right = width - 4 - x0;
|
||||||
|
const int bottom = height - 4 - y0;
|
||||||
|
DECLARE_ALIGNED(16, static const uint64_t,
|
||||||
|
b_shuff[]) = { 0x0504040401000000LL, 0x0d0c0c0c09080808LL };
|
||||||
|
DECLARE_ALIGNED(16, static const uint64_t,
|
||||||
|
c_shuff[]) = { 0x0605040402010000LL, 0x0e0d0c0c0a090808LL };
|
||||||
|
DECLARE_ALIGNED(16, static const uint64_t,
|
||||||
|
d_shuff[]) = { 0x0707060503030201LL, 0x0f0f0e0d0b0b0a09LL };
|
||||||
|
DECLARE_ALIGNED(16, static const uint64_t,
|
||||||
|
e_shuff[]) = { 0x0707070603030302LL, 0x0f0f0f0e0b0b0b0aLL };
|
||||||
int y;
|
int y;
|
||||||
|
|
||||||
for (y = 0; y < sizey; y += 2) {
|
dst += x0 + y0 * dstride;
|
||||||
const v64 l1 = v64_load_aligned(src);
|
src += x0 + y0 * sstride;
|
||||||
const v64 l2 = v64_load_aligned(src + sstride);
|
|
||||||
v128 o = v128_from_v64(l1, l2);
|
|
||||||
const v128 x = v128_add_8(c128, o);
|
|
||||||
const v128 a = v128_add_8(
|
|
||||||
c128,
|
|
||||||
v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1));
|
|
||||||
const v128 b = v128_shuffle_8(x, b_shuff);
|
|
||||||
const v128 c = v128_shuffle_8(x, c_shuff);
|
|
||||||
const v128 d = v128_add_8(
|
|
||||||
c128, v128_from_v64(v64_load_unaligned(src + 1),
|
|
||||||
v64_load_unaligned(src + 1 + sstride)));
|
|
||||||
const v128 e = v128_add_8(
|
|
||||||
c128, v128_from_v64(v64_load_unaligned(src + 2),
|
|
||||||
v64_load_unaligned(src + 2 + sstride)));
|
|
||||||
const v128 f = v128_add_8(
|
|
||||||
c128, v128_from_v64(
|
|
||||||
l2, v64_load_aligned(src + ((y != bottom) + 1) * sstride)));
|
|
||||||
calc_delta(o, x, a, b, c, d, e, f, dst, sp, sm, dstride);
|
|
||||||
src += sstride * 2;
|
|
||||||
dst += dstride * 2;
|
|
||||||
}
|
|
||||||
} else if (!(width - x0 - 8)) { // Clip right
|
|
||||||
const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0f0e0d0c0b0a09LL),
|
|
||||||
v64_from_64(0x0707060504030201LL));
|
|
||||||
const v128 e_shuff = v128_from_v64(v64_from_64(0x0f0f0f0e0d0c0b0aLL),
|
|
||||||
v64_from_64(0x0707070605040302LL));
|
|
||||||
int y;
|
|
||||||
|
|
||||||
for (y = 0; y < sizey; y += 2) {
|
for (y = 0; y < sizey; y += 4) {
|
||||||
const v64 l1 = v64_load_aligned(src);
|
const uint32_t l0 = u32_load_aligned(src - (y != -y0) * sstride);
|
||||||
const v64 l2 = v64_load_aligned(src + sstride);
|
const uint32_t l1 = u32_load_aligned(src);
|
||||||
v128 o = v128_from_v64(l1, l2);
|
const uint32_t l2 = u32_load_aligned(src + sstride);
|
||||||
const v128 x = v128_add_8(c128, o);
|
const uint32_t l3 = u32_load_aligned(src + 2 * sstride);
|
||||||
const v128 a = v128_add_8(
|
const uint32_t l4 = u32_load_aligned(src + 3 * sstride);
|
||||||
c128,
|
const uint32_t l5 = u32_load_aligned(src + ((y != bottom) + 3) * sstride);
|
||||||
v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1));
|
v128 o = v128_from_32(l1, l2, l3, l4);
|
||||||
const v128 b = v128_add_8(
|
const v128 a = v128_from_32(l0, l1, l2, l3);
|
||||||
c128, v128_from_v64(v64_load_unaligned(src - 2),
|
v128 b = v128_from_32(u32_load_unaligned(src - 2 * !!x0),
|
||||||
v64_load_unaligned(src - 2 + sstride)));
|
u32_load_unaligned(src + sstride - 2 * !!x0),
|
||||||
const v128 c = v128_add_8(
|
u32_load_unaligned(src + 2 * sstride - 2 * !!x0),
|
||||||
c128, v128_from_v64(v64_load_unaligned(src - 1),
|
u32_load_unaligned(src + 3 * sstride - 2 * !!x0));
|
||||||
v64_load_unaligned(src - 1 + sstride)));
|
v128 c = v128_from_32(u32_load_unaligned(src - !!x0),
|
||||||
const v128 d = v128_shuffle_8(x, d_shuff);
|
u32_load_unaligned(src + sstride - !!x0),
|
||||||
const v128 e = v128_shuffle_8(x, e_shuff);
|
u32_load_unaligned(src + 2 * sstride - !!x0),
|
||||||
const v128 f = v128_add_8(
|
u32_load_unaligned(src + 3 * sstride - !!x0));
|
||||||
c128, v128_from_v64(
|
v128 d = v128_from_32(u32_load_unaligned(src + !!right),
|
||||||
l2, v64_load_aligned(src + ((y != bottom) + 1) * sstride)));
|
u32_load_unaligned(src + sstride + !!right),
|
||||||
calc_delta(o, x, a, b, c, d, e, f, dst, sp, sm, dstride);
|
u32_load_unaligned(src + 2 * sstride + !!right),
|
||||||
src += sstride * 2;
|
u32_load_unaligned(src + 3 * sstride + !!right));
|
||||||
dst += dstride * 2;
|
v128 e = v128_from_32(u32_load_unaligned(src + 2 * !!right),
|
||||||
|
u32_load_unaligned(src + sstride + 2 * !!right),
|
||||||
|
u32_load_unaligned(src + 2 * sstride + 2 * !!right),
|
||||||
|
u32_load_unaligned(src + 3 * sstride + 2 * !!right));
|
||||||
|
const v128 f = v128_from_32(l2, l3, l4, l5);
|
||||||
|
|
||||||
|
if (!x0) { // Left clipping
|
||||||
|
b = v128_shuffle_8(b, v128_load_aligned(b_shuff));
|
||||||
|
c = v128_shuffle_8(c, v128_load_aligned(c_shuff));
|
||||||
}
|
}
|
||||||
} else { // No left/right clipping
|
if (!right) { // Right clipping
|
||||||
int y;
|
d = v128_shuffle_8(d, v128_load_aligned(d_shuff));
|
||||||
for (y = 0; y < sizey; y += 2) {
|
e = v128_shuffle_8(e, v128_load_aligned(e_shuff));
|
||||||
const v64 l1 = v64_load_aligned(src);
|
|
||||||
const v64 l2 = v64_load_aligned(src + sstride);
|
|
||||||
v128 o = v128_from_v64(l1, l2);
|
|
||||||
const v128 x = v128_add_8(c128, o);
|
|
||||||
const v128 a = v128_add_8(
|
|
||||||
c128,
|
|
||||||
v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1));
|
|
||||||
const v128 b = v128_add_8(
|
|
||||||
c128, v128_from_v64(v64_load_unaligned(src - 2),
|
|
||||||
v64_load_unaligned(src - 2 + sstride)));
|
|
||||||
const v128 c = v128_add_8(
|
|
||||||
c128, v128_from_v64(v64_load_unaligned(src - 1),
|
|
||||||
v64_load_unaligned(src - 1 + sstride)));
|
|
||||||
const v128 d = v128_add_8(
|
|
||||||
c128, v128_from_v64(v64_load_unaligned(src + 1),
|
|
||||||
v64_load_unaligned(src + 1 + sstride)));
|
|
||||||
const v128 e = v128_add_8(
|
|
||||||
c128, v128_from_v64(v64_load_unaligned(src + 2),
|
|
||||||
v64_load_unaligned(src + 2 + sstride)));
|
|
||||||
const v128 f = v128_add_8(
|
|
||||||
c128, v128_from_v64(
|
|
||||||
l2, v64_load_aligned(src + ((y != bottom) + 1) * sstride)));
|
|
||||||
calc_delta(o, x, a, b, c, d, e, f, dst, sp, sm, dstride);
|
|
||||||
src += sstride * 2;
|
|
||||||
dst += dstride * 2;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
o = calc_delta(o, a, b, c, d, e, f, sp, sm);
|
||||||
|
u32_store_aligned(dst, v128_low_u32(v128_shr_n_byte(o, 12)));
|
||||||
|
u32_store_aligned(dst + dstride, v128_low_u32(v128_shr_n_byte(o, 8)));
|
||||||
|
u32_store_aligned(dst + 2 * dstride, v128_low_u32(v128_shr_n_byte(o, 4)));
|
||||||
|
u32_store_aligned(dst + 3 * dstride, v128_low_u32(o));
|
||||||
|
|
||||||
|
dst += 4 * dstride;
|
||||||
|
src += 4 * sstride;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -142,24 +176,23 @@ void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int sstride,
|
|||||||
int dstride, int x0, int y0, int sizex,
|
int dstride, int x0, int y0, int sizex,
|
||||||
int sizey, int width, int height,
|
int sizey, int width, int height,
|
||||||
unsigned int strength) {
|
unsigned int strength) {
|
||||||
// TODO(stemidts):
|
if ((sizex != 4 && sizex != 8) || y0 + 4 > height ||
|
||||||
// A sizex different from 8 will only be needed if CLPF is extended to chroma.
|
(sizey & 3 && sizex == 4) || x0 + 4 > width) {
|
||||||
// This will only be used if 4:2:0 and width not a multiple of 16 and along
|
|
||||||
// the right edge only, so we can fall back to the plain C implementation in
|
|
||||||
// this case. If not extended to chroma, this test will be redundant.
|
|
||||||
if (sizex != 8 || width < 16 || y0 + 8 > height || x0 + 8 > width) {
|
|
||||||
// Fallback to C for odd sizes
|
// Fallback to C for odd sizes
|
||||||
aom_clpf_block_c(src, dst, sstride, dstride, x0, y0, sizex, sizey, width,
|
aom_clpf_block_c(src, dst, sstride, dstride, x0, y0, sizex, sizey, width,
|
||||||
height, strength);
|
height, strength);
|
||||||
} else {
|
} else {
|
||||||
clpf_block(src, dst, sstride, dstride, x0, y0, sizey, width, height,
|
(sizex == 4 ? clpf_block4 : clpf_block8)(src, dst, sstride, dstride, x0, y0,
|
||||||
strength);
|
sizey, width, height, strength);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#if CONFIG_AOM_HIGHBITDEPTH
|
#if CONFIG_AOM_HIGHBITDEPTH
|
||||||
static void calc_delta_hbd(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
|
// delta = 4/16 * clamp(a - o, -s, s) + 1/16 * clamp(b - o, -s, s) +
|
||||||
v128 f, uint16_t *dst, v128 sp, v128 sm) {
|
// 3/16 * clamp(c - o, -s, s) + 3/16 * clamp(d - o, -s, s) +
|
||||||
|
// 1/16 * clamp(e - o, -s, s) + 4/16 * clamp(f - o, -s, s)
|
||||||
|
SIMD_INLINE v128 calc_delta_hbd(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
|
||||||
|
v128 f, v128 sp, v128 sm) {
|
||||||
const v128 c8 = v128_dup_16(8);
|
const v128 c8 = v128_dup_16(8);
|
||||||
const v128 tmp =
|
const v128 tmp =
|
||||||
v128_add_16(v128_max_s16(v128_min_s16(v128_sub_16(c, o), sp), sm),
|
v128_add_16(v128_max_s16(v128_min_s16(v128_sub_16(c, o), sp), sm),
|
||||||
@@ -174,87 +207,138 @@ static void calc_delta_hbd(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
|
|||||||
v128_add_16(v128_max_s16(v128_min_s16(v128_sub_16(b, o), sp), sm),
|
v128_add_16(v128_max_s16(v128_min_s16(v128_sub_16(b, o), sp), sm),
|
||||||
v128_max_s16(v128_min_s16(v128_sub_16(e, o), sp), sm))),
|
v128_max_s16(v128_min_s16(v128_sub_16(e, o), sp), sm))),
|
||||||
v128_add_16(v128_add_16(tmp, tmp), tmp));
|
v128_add_16(v128_add_16(tmp, tmp), tmp));
|
||||||
v128_store_aligned(
|
return v128_add_16(
|
||||||
dst,
|
|
||||||
v128_add_16(
|
|
||||||
o, v128_shr_s16(
|
o, v128_shr_s16(
|
||||||
v128_add_16(c8, v128_add_16(delta, v128_cmplt_s16(
|
v128_add_16(
|
||||||
delta, v128_zero()))),
|
c8, v128_add_16(delta, v128_cmplt_s16(delta, v128_zero()))),
|
||||||
4)));
|
4));
|
||||||
}
|
}
|
||||||
|
|
||||||
SIMD_INLINE void clpf_block_hbd(const uint16_t *src, uint16_t *dst, int sstride,
|
static void calc_delta_hbd4(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
|
||||||
int dstride, int x0, int y0, int sizey,
|
v128 f, uint16_t *dst, v128 sp, v128 sm,
|
||||||
int width, int height, unsigned int strength) {
|
int dstride) {
|
||||||
int y;
|
o = calc_delta_hbd(o, a, b, c, d, e, f, sp, sm);
|
||||||
int bottom = height - 2 - y0;
|
v64_store_aligned(dst, v128_high_v64(o));
|
||||||
|
v64_store_aligned(dst + dstride, v128_low_v64(o));
|
||||||
|
}
|
||||||
|
|
||||||
|
static void calc_delta_hbd8(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
|
||||||
|
v128 f, uint16_t *dst, v128 sp, v128 sm) {
|
||||||
|
v128_store_aligned(dst, calc_delta_hbd(o, a, b, c, d, e, f, sp, sm));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Process blocks of width 4, two lines at time.
|
||||||
|
SIMD_INLINE void clpf_block_hbd4(const uint16_t *src, uint16_t *dst,
|
||||||
|
int sstride, int dstride, int x0, int y0,
|
||||||
|
int sizey, int width, int height,
|
||||||
|
unsigned int strength) {
|
||||||
const v128 sp = v128_dup_16(strength);
|
const v128 sp = v128_dup_16(strength);
|
||||||
const v128 sm = v128_dup_16(-(int)strength);
|
const v128 sm = v128_dup_16(-(int)strength);
|
||||||
|
const int right = width - 4 - x0;
|
||||||
|
const int bottom = height - 2 - y0;
|
||||||
|
DECLARE_ALIGNED(16, static const uint64_t,
|
||||||
|
b_shuff[]) = { 0x0302010001000100LL, 0x0b0a090809080908LL };
|
||||||
|
DECLARE_ALIGNED(16, static const uint64_t,
|
||||||
|
c_shuff[]) = { 0x0504030201000100LL, 0x0d0c0b0a09080908LL };
|
||||||
|
DECLARE_ALIGNED(16, static const uint64_t,
|
||||||
|
d_shuff[]) = { 0x0706070605040302LL, 0x0f0e0f0e0d0c0b0aLL };
|
||||||
|
DECLARE_ALIGNED(16, static const uint64_t,
|
||||||
|
e_shuff[]) = { 0x0706070607060504LL, 0x0f0e0f0e0f0e0d0cLL };
|
||||||
|
int y;
|
||||||
|
|
||||||
dst += x0 + y0 * dstride;
|
dst += x0 + y0 * dstride;
|
||||||
src += x0 + y0 * sstride;
|
src += x0 + y0 * sstride;
|
||||||
|
|
||||||
if (!x0) { // Clip left
|
for (y = 0; y < sizey; y += 2) {
|
||||||
const v128 b_shuff = v128_from_v64(v64_from_64(0x0b0a090807060504LL),
|
const v64 l1 = v64_load_aligned(src);
|
||||||
v64_from_64(0x0302010001000100LL));
|
const v64 l2 = v64_load_aligned(src + sstride);
|
||||||
const v128 c_shuff = v128_from_v64(v64_from_64(0x0d0c0b0a09080706LL),
|
v128 o = v128_from_v64(l1, l2);
|
||||||
v64_from_64(0x0504030201000100LL));
|
const v128 a =
|
||||||
|
v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1);
|
||||||
|
v128 b = v128_from_v64(v64_load_unaligned(src - 2 * !!x0),
|
||||||
|
v64_load_unaligned(src - 2 * !!x0 + sstride));
|
||||||
|
v128 c = v128_from_v64(v64_load_unaligned(src - !!x0),
|
||||||
|
v64_load_unaligned(src - !!x0 + sstride));
|
||||||
|
v128 d = v128_from_v64(v64_load_unaligned(src + !!right),
|
||||||
|
v64_load_unaligned(src + !!right + sstride));
|
||||||
|
v128 e = v128_from_v64(v64_load_unaligned(src + 2 * !!right),
|
||||||
|
v64_load_unaligned(src + 2 * !!right + sstride));
|
||||||
|
const v128 f = v128_from_v64(
|
||||||
|
l2, v64_load_aligned(src + ((y != bottom) + 1) * sstride));
|
||||||
|
|
||||||
|
if (!x0) { // Left clipping
|
||||||
|
b = v128_shuffle_8(b, v128_load_aligned(b_shuff));
|
||||||
|
c = v128_shuffle_8(c, v128_load_aligned(c_shuff));
|
||||||
|
}
|
||||||
|
if (!right) { // Right clipping
|
||||||
|
d = v128_shuffle_8(d, v128_load_aligned(d_shuff));
|
||||||
|
e = v128_shuffle_8(e, v128_load_aligned(e_shuff));
|
||||||
|
}
|
||||||
|
calc_delta_hbd4(o, a, b, c, d, e, f, dst, sp, sm, dstride);
|
||||||
|
src += sstride * 2;
|
||||||
|
dst += dstride * 2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// The most simple case. Start here if you need to understand the functions.
|
||||||
|
SIMD_INLINE void clpf_block_hbd(const uint16_t *src, uint16_t *dst, int sstride,
|
||||||
|
int dstride, int x0, int y0, int sizey,
|
||||||
|
int width, int height, unsigned int strength) {
|
||||||
|
const v128 sp = v128_dup_16(strength);
|
||||||
|
const v128 sm = v128_dup_16(-(int)strength);
|
||||||
|
const int right = width - 8 - x0;
|
||||||
|
const int bottom = height - 2 - y0;
|
||||||
|
DECLARE_ALIGNED(16, static const uint64_t,
|
||||||
|
b_shuff[]) = { 0x0302010001000100LL, 0x0b0a090807060504LL };
|
||||||
|
DECLARE_ALIGNED(16, static const uint64_t,
|
||||||
|
c_shuff[]) = { 0x0504030201000100LL, 0x0d0c0b0a09080706LL };
|
||||||
|
DECLARE_ALIGNED(16, static const uint64_t,
|
||||||
|
d_shuff[]) = { 0x0908070605040302LL, 0x0f0e0f0e0d0c0b0aLL };
|
||||||
|
DECLARE_ALIGNED(16, static const uint64_t,
|
||||||
|
e_shuff[]) = { 0x0b0a090807060504LL, 0x0f0e0f0e0f0e0d0cLL };
|
||||||
|
int y;
|
||||||
|
|
||||||
|
dst += x0 + y0 * dstride;
|
||||||
|
src += x0 + y0 * sstride;
|
||||||
|
|
||||||
|
// Read 8 set of pixels at a time. Clipping along upper and lower
|
||||||
|
// edges is handled by reading the upper or lower line twice.
|
||||||
|
// Clipping along the left and right edges is handled by shuffle
|
||||||
|
// instructions doing shift and pad.
|
||||||
for (y = 0; y < sizey; y++) {
|
for (y = 0; y < sizey; y++) {
|
||||||
const v128 o = v128_load_aligned(src);
|
const v128 o = v128_load_aligned(src);
|
||||||
const v128 a = v128_load_aligned(src - (y != -y0) * sstride);
|
const v128 a = v128_load_aligned(src - (y != -y0) * sstride);
|
||||||
const v128 b = v128_shuffle_8(o, b_shuff);
|
|
||||||
const v128 c = v128_shuffle_8(o, c_shuff);
|
|
||||||
const v128 d = v128_load_unaligned(src + 1);
|
|
||||||
const v128 e = v128_load_unaligned(src + 2);
|
|
||||||
const v128 f = v128_load_aligned(src + (y - 1 != bottom) * sstride);
|
const v128 f = v128_load_aligned(src + (y - 1 != bottom) * sstride);
|
||||||
calc_delta_hbd(o, a, b, c, d, e, f, dst, sp, sm);
|
v128 b = v128_load_unaligned(src - 2 * !!x0);
|
||||||
|
v128 c = v128_load_unaligned(src - !!x0);
|
||||||
|
v128 d = v128_load_unaligned(src + !!right);
|
||||||
|
v128 e = v128_load_unaligned(src + 2 * !!right);
|
||||||
|
|
||||||
|
if (!x0) { // Left clipping
|
||||||
|
b = v128_shuffle_8(b, v128_load_aligned(b_shuff));
|
||||||
|
c = v128_shuffle_8(c, v128_load_aligned(c_shuff));
|
||||||
|
}
|
||||||
|
if (!right) { // Right clipping
|
||||||
|
d = v128_shuffle_8(d, v128_load_aligned(d_shuff));
|
||||||
|
e = v128_shuffle_8(e, v128_load_aligned(e_shuff));
|
||||||
|
}
|
||||||
|
calc_delta_hbd8(o, a, b, c, d, e, f, dst, sp, sm);
|
||||||
src += sstride;
|
src += sstride;
|
||||||
dst += dstride;
|
dst += dstride;
|
||||||
}
|
}
|
||||||
} else if (!(width - x0 - 8)) { // Clip right
|
|
||||||
const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0e0f0e0d0c0b0aLL),
|
|
||||||
v64_from_64(0x0908070605040302LL));
|
|
||||||
const v128 e_shuff = v128_from_v64(v64_from_64(0x0f0e0f0e0f0e0d0cLL),
|
|
||||||
v64_from_64(0x0b0a090807060504LL));
|
|
||||||
for (y = 0; y < sizey; y++) {
|
|
||||||
const v128 o = v128_load_aligned(src);
|
|
||||||
const v128 a = v128_load_aligned(src - (y != -y0) * sstride);
|
|
||||||
const v128 b = v128_load_unaligned(src - 2);
|
|
||||||
const v128 c = v128_load_unaligned(src - 1);
|
|
||||||
const v128 d = v128_shuffle_8(o, d_shuff);
|
|
||||||
const v128 e = v128_shuffle_8(o, e_shuff);
|
|
||||||
const v128 f = v128_load_aligned(src + (y - 1 != bottom) * sstride);
|
|
||||||
calc_delta_hbd(o, a, b, c, d, e, f, dst, sp, sm);
|
|
||||||
src += sstride;
|
|
||||||
dst += dstride;
|
|
||||||
}
|
|
||||||
} else { // No left/right clipping
|
|
||||||
for (y = 0; y < sizey; y++) {
|
|
||||||
const v128 o = v128_load_aligned(src);
|
|
||||||
const v128 a = v128_load_aligned(src - (y != -y0) * sstride);
|
|
||||||
const v128 b = v128_load_unaligned(src - 2);
|
|
||||||
const v128 c = v128_load_unaligned(src - 1);
|
|
||||||
const v128 d = v128_load_unaligned(src + 1);
|
|
||||||
const v128 e = v128_load_unaligned(src + 2);
|
|
||||||
const v128 f = v128_load_aligned(src + (y - 1 != bottom) * sstride);
|
|
||||||
calc_delta_hbd(o, a, b, c, d, e, f, dst, sp, sm);
|
|
||||||
src += sstride;
|
|
||||||
dst += dstride;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void SIMD_FUNC(aom_clpf_block_hbd)(const uint16_t *src, uint16_t *dst,
|
void SIMD_FUNC(aom_clpf_block_hbd)(const uint16_t *src, uint16_t *dst,
|
||||||
int sstride, int dstride, int x0, int y0,
|
int sstride, int dstride, int x0, int y0,
|
||||||
int sizex, int sizey, int width, int height,
|
int sizex, int sizey, int width, int height,
|
||||||
unsigned int strength) {
|
unsigned int strength) {
|
||||||
if (sizex != 8 || width < 16 || y0 + 8 > height || x0 + 8 > width) {
|
if ((sizex != 4 && sizex != 8) || y0 + 4 > height || x0 + 4 > width) {
|
||||||
// Fallback to C for odd sizes
|
// Fallback to C for odd sizes
|
||||||
aom_clpf_block_hbd_c(src, dst, sstride, dstride, x0, y0, sizex, sizey,
|
aom_clpf_block_hbd_c(src, dst, sstride, dstride, x0, y0, sizex, sizey,
|
||||||
width, height, strength);
|
width, height, strength);
|
||||||
} else {
|
} else {
|
||||||
clpf_block_hbd(src, dst, sstride, dstride, x0, y0, sizey, width, height,
|
(sizex == 4 ? clpf_block_hbd4 : clpf_block_hbd)(
|
||||||
strength);
|
src, dst, sstride, dstride, x0, y0, sizey, width, height, strength);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
@@ -153,7 +153,9 @@ typedef struct AV1Common {
|
|||||||
#if CONFIG_CLPF
|
#if CONFIG_CLPF
|
||||||
int clpf_numblocks;
|
int clpf_numblocks;
|
||||||
int clpf_size;
|
int clpf_size;
|
||||||
int clpf_strength;
|
int clpf_strength_y;
|
||||||
|
int clpf_strength_u;
|
||||||
|
int clpf_strength_v;
|
||||||
uint8_t *clpf_blocks;
|
uint8_t *clpf_blocks;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@@ -29,6 +29,7 @@
|
|||||||
|
|
||||||
#include "av1/common/alloccommon.h"
|
#include "av1/common/alloccommon.h"
|
||||||
#if CONFIG_CLPF
|
#if CONFIG_CLPF
|
||||||
|
#include "aom/aom_image.h"
|
||||||
#include "av1/common/clpf.h"
|
#include "av1/common/clpf.h"
|
||||||
#endif
|
#endif
|
||||||
#include "av1/common/common.h"
|
#include "av1/common/common.h"
|
||||||
@@ -2046,8 +2047,10 @@ static void setup_loopfilter(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
|
|||||||
#if CONFIG_CLPF
|
#if CONFIG_CLPF
|
||||||
static void setup_clpf(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
|
static void setup_clpf(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
|
||||||
cm->clpf_blocks = 0;
|
cm->clpf_blocks = 0;
|
||||||
cm->clpf_strength = aom_rb_read_literal(rb, 2);
|
cm->clpf_strength_y = aom_rb_read_literal(rb, 2);
|
||||||
if (cm->clpf_strength) {
|
cm->clpf_strength_u = aom_rb_read_literal(rb, 2);
|
||||||
|
cm->clpf_strength_v = aom_rb_read_literal(rb, 2);
|
||||||
|
if (cm->clpf_strength_y) {
|
||||||
cm->clpf_size = aom_rb_read_literal(rb, 2);
|
cm->clpf_size = aom_rb_read_literal(rb, 2);
|
||||||
if (cm->clpf_size) {
|
if (cm->clpf_size) {
|
||||||
int i;
|
int i;
|
||||||
@@ -2065,7 +2068,8 @@ static int clpf_bit(UNUSED int k, UNUSED int l,
|
|||||||
UNUSED const YV12_BUFFER_CONFIG *org,
|
UNUSED const YV12_BUFFER_CONFIG *org,
|
||||||
UNUSED const AV1_COMMON *cm, UNUSED int block_size,
|
UNUSED const AV1_COMMON *cm, UNUSED int block_size,
|
||||||
UNUSED int w, UNUSED int h, UNUSED unsigned int strength,
|
UNUSED int w, UNUSED int h, UNUSED unsigned int strength,
|
||||||
UNUSED unsigned int fb_size_log2, uint8_t *bit) {
|
UNUSED unsigned int fb_size_log2, uint8_t *bit,
|
||||||
|
UNUSED int comp) {
|
||||||
return *bit;
|
return *bit;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
@@ -3928,10 +3932,23 @@ void av1_decode_frame(AV1Decoder *pbi, const uint8_t *data,
|
|||||||
#endif // CONFIG_LOOP_RESTORATION
|
#endif // CONFIG_LOOP_RESTORATION
|
||||||
|
|
||||||
#if CONFIG_CLPF
|
#if CONFIG_CLPF
|
||||||
if (cm->clpf_strength && !cm->skip_loop_filter) {
|
if (!cm->skip_loop_filter) {
|
||||||
av1_clpf_frame(&pbi->cur_buf->buf, 0, cm, !!cm->clpf_size,
|
const YV12_BUFFER_CONFIG *const frame = &pbi->cur_buf->buf;
|
||||||
cm->clpf_strength + (cm->clpf_strength == 3),
|
if (cm->clpf_strength_y) {
|
||||||
4 + cm->clpf_size, cm->clpf_blocks, clpf_bit);
|
av1_clpf_frame(frame, NULL, cm, !!cm->clpf_size,
|
||||||
|
cm->clpf_strength_y + (cm->clpf_strength_y == 3),
|
||||||
|
4 + cm->clpf_size, cm->clpf_blocks, AOM_PLANE_Y, clpf_bit);
|
||||||
|
}
|
||||||
|
if (cm->clpf_strength_u) {
|
||||||
|
av1_clpf_frame(frame, NULL, cm, 0,
|
||||||
|
cm->clpf_strength_u + (cm->clpf_strength_u == 3), 4, NULL,
|
||||||
|
AOM_PLANE_U, NULL);
|
||||||
|
}
|
||||||
|
if (cm->clpf_strength_v) {
|
||||||
|
av1_clpf_frame(frame, NULL, cm, 0,
|
||||||
|
cm->clpf_strength_v + (cm->clpf_strength_v == 3), 4, NULL,
|
||||||
|
AOM_PLANE_V, NULL);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if (cm->clpf_blocks) aom_free(cm->clpf_blocks);
|
if (cm->clpf_blocks) aom_free(cm->clpf_blocks);
|
||||||
#endif
|
#endif
|
||||||
|
@@ -2590,8 +2590,10 @@ static void encode_loopfilter(AV1_COMMON *cm, struct aom_write_bit_buffer *wb) {
|
|||||||
|
|
||||||
#if CONFIG_CLPF
|
#if CONFIG_CLPF
|
||||||
static void encode_clpf(const AV1_COMMON *cm, struct aom_write_bit_buffer *wb) {
|
static void encode_clpf(const AV1_COMMON *cm, struct aom_write_bit_buffer *wb) {
|
||||||
aom_wb_write_literal(wb, cm->clpf_strength, 2);
|
aom_wb_write_literal(wb, cm->clpf_strength_y, 2);
|
||||||
if (cm->clpf_strength) {
|
aom_wb_write_literal(wb, cm->clpf_strength_u, 2);
|
||||||
|
aom_wb_write_literal(wb, cm->clpf_strength_v, 2);
|
||||||
|
if (cm->clpf_strength_y) {
|
||||||
aom_wb_write_literal(wb, cm->clpf_size, 2);
|
aom_wb_write_literal(wb, cm->clpf_size, 2);
|
||||||
if (cm->clpf_size) {
|
if (cm->clpf_size) {
|
||||||
int i;
|
int i;
|
||||||
|
@@ -11,16 +11,17 @@
|
|||||||
|
|
||||||
#include "av1/common/clpf.h"
|
#include "av1/common/clpf.h"
|
||||||
#include "./aom_dsp_rtcd.h"
|
#include "./aom_dsp_rtcd.h"
|
||||||
|
#include "aom/aom_image.h"
|
||||||
#include "aom/aom_integer.h"
|
#include "aom/aom_integer.h"
|
||||||
#include "av1/common/quant_common.h"
|
#include "av1/common/quant_common.h"
|
||||||
|
|
||||||
// Calculate the error of a filtered and unfiltered block
|
// Calculate the error of a filtered and unfiltered block
|
||||||
void aom_clpf_detect_c(const uint8_t *rec, const uint8_t *org, int rstride,
|
void aom_clpf_detect_c(const uint8_t *rec, const uint8_t *org, int rstride,
|
||||||
int ostride, int x0, int y0, int width, int height,
|
int ostride, int x0, int y0, int width, int height,
|
||||||
int *sum0, int *sum1, unsigned int strength) {
|
int *sum0, int *sum1, unsigned int strength, int size) {
|
||||||
int x, y;
|
int x, y;
|
||||||
for (y = y0; y < y0 + 8; y++) {
|
for (y = y0; y < y0 + size; y++) {
|
||||||
for (x = x0; x < x0 + 8; x++) {
|
for (x = x0; x < x0 + size; x++) {
|
||||||
int O = org[y * ostride + x];
|
int O = org[y * ostride + x];
|
||||||
int X = rec[y * rstride + x];
|
int X = rec[y * rstride + x];
|
||||||
int A = rec[AOMMAX(0, y - 1) * rstride + x];
|
int A = rec[AOMMAX(0, y - 1) * rstride + x];
|
||||||
@@ -39,11 +40,11 @@ void aom_clpf_detect_c(const uint8_t *rec, const uint8_t *org, int rstride,
|
|||||||
|
|
||||||
void aom_clpf_detect_multi_c(const uint8_t *rec, const uint8_t *org,
|
void aom_clpf_detect_multi_c(const uint8_t *rec, const uint8_t *org,
|
||||||
int rstride, int ostride, int x0, int y0,
|
int rstride, int ostride, int x0, int y0,
|
||||||
int width, int height, int *sum) {
|
int width, int height, int *sum, int size) {
|
||||||
int x, y;
|
int x, y;
|
||||||
|
|
||||||
for (y = y0; y < y0 + 8; y++) {
|
for (y = y0; y < y0 + size; y++) {
|
||||||
for (x = x0; x < x0 + 8; x++) {
|
for (x = x0; x < x0 + size; x++) {
|
||||||
int O = org[y * ostride + x];
|
int O = org[y * ostride + x];
|
||||||
int X = rec[y * rstride + x];
|
int X = rec[y * rstride + x];
|
||||||
int A = rec[AOMMAX(0, y - 1) * rstride + x];
|
int A = rec[AOMMAX(0, y - 1) * rstride + x];
|
||||||
@@ -71,10 +72,10 @@ void aom_clpf_detect_multi_c(const uint8_t *rec, const uint8_t *org,
|
|||||||
void aom_clpf_detect_hbd_c(const uint16_t *rec, const uint16_t *org,
|
void aom_clpf_detect_hbd_c(const uint16_t *rec, const uint16_t *org,
|
||||||
int rstride, int ostride, int x0, int y0, int width,
|
int rstride, int ostride, int x0, int y0, int width,
|
||||||
int height, int *sum0, int *sum1,
|
int height, int *sum0, int *sum1,
|
||||||
unsigned int strength, int shift) {
|
unsigned int strength, int shift, int size) {
|
||||||
int x, y;
|
int x, y;
|
||||||
for (y = y0; y < y0 + 8; y++) {
|
for (y = y0; y < y0 + size; y++) {
|
||||||
for (x = x0; x < x0 + 8; x++) {
|
for (x = x0; x < x0 + size; x++) {
|
||||||
int O = org[y * ostride + x] >> shift;
|
int O = org[y * ostride + x] >> shift;
|
||||||
int X = rec[y * rstride + x] >> shift;
|
int X = rec[y * rstride + x] >> shift;
|
||||||
int A = rec[AOMMAX(0, y - 1) * rstride + x] >> shift;
|
int A = rec[AOMMAX(0, y - 1) * rstride + x] >> shift;
|
||||||
@@ -94,11 +95,12 @@ void aom_clpf_detect_hbd_c(const uint16_t *rec, const uint16_t *org,
|
|||||||
// aom_clpf_detect_multi_c() apart from "rec" and "org".
|
// aom_clpf_detect_multi_c() apart from "rec" and "org".
|
||||||
void aom_clpf_detect_multi_hbd_c(const uint16_t *rec, const uint16_t *org,
|
void aom_clpf_detect_multi_hbd_c(const uint16_t *rec, const uint16_t *org,
|
||||||
int rstride, int ostride, int x0, int y0,
|
int rstride, int ostride, int x0, int y0,
|
||||||
int width, int height, int *sum, int shift) {
|
int width, int height, int *sum, int shift,
|
||||||
|
int size) {
|
||||||
int x, y;
|
int x, y;
|
||||||
|
|
||||||
for (y = y0; y < y0 + 8; y++) {
|
for (y = y0; y < y0 + size; y++) {
|
||||||
for (x = x0; x < x0 + 8; x++) {
|
for (x = x0; x < x0 + size; x++) {
|
||||||
int O = org[y * ostride + x] >> shift;
|
int O = org[y * ostride + x] >> shift;
|
||||||
int X = rec[y * rstride + x] >> shift;
|
int X = rec[y * rstride + x] >> shift;
|
||||||
int A = rec[AOMMAX(0, y - 1) * rstride + x] >> shift;
|
int A = rec[AOMMAX(0, y - 1) * rstride + x] >> shift;
|
||||||
@@ -125,31 +127,45 @@ void aom_clpf_detect_multi_hbd_c(const uint16_t *rec, const uint16_t *org,
|
|||||||
int av1_clpf_decision(int k, int l, const YV12_BUFFER_CONFIG *rec,
|
int av1_clpf_decision(int k, int l, const YV12_BUFFER_CONFIG *rec,
|
||||||
const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
|
const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
|
||||||
int block_size, int w, int h, unsigned int strength,
|
int block_size, int w, int h, unsigned int strength,
|
||||||
unsigned int fb_size_log2, uint8_t *res) {
|
unsigned int fb_size_log2, uint8_t *res, int plane) {
|
||||||
int m, n, sum0 = 0, sum1 = 0;
|
int m, n, sum0 = 0, sum1 = 0;
|
||||||
|
const int subx = plane != AOM_PLANE_Y && rec->subsampling_x;
|
||||||
|
const int suby = plane != AOM_PLANE_Y && rec->subsampling_y;
|
||||||
|
uint8_t *rec_buffer =
|
||||||
|
plane != AOM_PLANE_Y
|
||||||
|
? (plane == AOM_PLANE_U ? rec->u_buffer : rec->v_buffer)
|
||||||
|
: rec->y_buffer;
|
||||||
|
uint8_t *org_buffer =
|
||||||
|
plane != AOM_PLANE_Y
|
||||||
|
? (plane == AOM_PLANE_U ? org->u_buffer : org->v_buffer)
|
||||||
|
: org->y_buffer;
|
||||||
|
int rec_width = plane != AOM_PLANE_Y ? rec->uv_crop_width : rec->y_crop_width;
|
||||||
|
int rec_height =
|
||||||
|
plane != AOM_PLANE_Y ? rec->uv_crop_height : rec->y_crop_height;
|
||||||
|
int rec_stride = plane != AOM_PLANE_Y ? rec->uv_stride : rec->y_stride;
|
||||||
|
int org_stride = plane != AOM_PLANE_Y ? org->uv_stride : org->y_stride;
|
||||||
for (m = 0; m < h; m++) {
|
for (m = 0; m < h; m++) {
|
||||||
for (n = 0; n < w; n++) {
|
for (n = 0; n < w; n++) {
|
||||||
int xpos = (l << fb_size_log2) + n * block_size;
|
int xpos = (l << fb_size_log2) + n * block_size;
|
||||||
int ypos = (k << fb_size_log2) + m * block_size;
|
int ypos = (k << fb_size_log2) + m * block_size;
|
||||||
const int bs = MAX_MIB_SIZE;
|
if (!cm->mi_grid_visible[(ypos << suby) / MI_SIZE * cm->mi_stride +
|
||||||
if (!cm->mi_grid_visible[ypos / bs * cm->mi_stride + xpos / bs]
|
(xpos << subx) / MI_SIZE]
|
||||||
->mbmi.skip) {
|
->mbmi.skip) {
|
||||||
#if CONFIG_AOM_HIGHBITDEPTH
|
#if CONFIG_AOM_HIGHBITDEPTH
|
||||||
if (cm->use_highbitdepth) {
|
if (cm->use_highbitdepth) {
|
||||||
aom_clpf_detect_hbd(CONVERT_TO_SHORTPTR(rec->y_buffer),
|
aom_clpf_detect_hbd(
|
||||||
CONVERT_TO_SHORTPTR(org->y_buffer), rec->y_stride,
|
CONVERT_TO_SHORTPTR(rec_buffer), CONVERT_TO_SHORTPTR(org_buffer),
|
||||||
org->y_stride, xpos, ypos, rec->y_crop_width,
|
rec_stride, org_stride, xpos, ypos, rec_width, rec_height, &sum0,
|
||||||
rec->y_crop_height, &sum0, &sum1, strength,
|
&sum1, strength, cm->bit_depth - 8, block_size);
|
||||||
cm->bit_depth - 8);
|
|
||||||
} else {
|
} else {
|
||||||
aom_clpf_detect(rec->y_buffer, org->y_buffer, rec->y_stride,
|
aom_clpf_detect(rec_buffer, org_buffer, rec_stride, org_stride, xpos,
|
||||||
org->y_stride, xpos, ypos, rec->y_crop_width,
|
ypos, rec_width, rec_height, &sum0, &sum1, strength,
|
||||||
rec->y_crop_height, &sum0, &sum1, strength);
|
block_size);
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
aom_clpf_detect(rec->y_buffer, org->y_buffer, rec->y_stride,
|
aom_clpf_detect(rec_buffer, org_buffer, rec_stride, org_stride, xpos,
|
||||||
org->y_stride, xpos, ypos, rec->y_crop_width,
|
ypos, rec_width, rec_height, &sum0, &sum1, strength,
|
||||||
rec->y_crop_height, &sum0, &sum1, strength);
|
block_size);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -161,6 +177,7 @@ int av1_clpf_decision(int k, int l, const YV12_BUFFER_CONFIG *rec,
|
|||||||
// Calculate the square error of all filter settings. Result:
|
// Calculate the square error of all filter settings. Result:
|
||||||
// res[0][0] : unfiltered
|
// res[0][0] : unfiltered
|
||||||
// res[0][1-3] : strength=1,2,4, no signals
|
// res[0][1-3] : strength=1,2,4, no signals
|
||||||
|
// (Only for luma:)
|
||||||
// res[1][0] : (bit count, fb size = 128)
|
// res[1][0] : (bit count, fb size = 128)
|
||||||
// res[1][1-3] : strength=1,2,4, fb size = 128
|
// res[1][1-3] : strength=1,2,4, fb size = 128
|
||||||
// res[2][0] : (bit count, fb size = 64)
|
// res[2][0] : (bit count, fb size = 64)
|
||||||
@@ -170,12 +187,28 @@ int av1_clpf_decision(int k, int l, const YV12_BUFFER_CONFIG *rec,
 static int clpf_rdo(int y, int x, const YV12_BUFFER_CONFIG *rec,
                     const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
                     unsigned int block_size, unsigned int fb_size_log2, int w,
-                    int h, int64_t res[4][4]) {
+                    int h, int64_t res[4][4], int plane) {
   int c, m, n, filtered = 0;
   int sum[4];
+  const int subx = plane != AOM_PLANE_Y && rec->subsampling_x;
+  const int suby = plane != AOM_PLANE_Y && rec->subsampling_y;
   int bslog = get_msb(block_size);
+  uint8_t *rec_buffer =
+      plane != AOM_PLANE_Y
+          ? (plane == AOM_PLANE_U ? rec->u_buffer : rec->v_buffer)
+          : rec->y_buffer;
+  uint8_t *org_buffer =
+      plane != AOM_PLANE_Y
+          ? (plane == AOM_PLANE_U ? org->u_buffer : org->v_buffer)
+          : org->y_buffer;
+  int rec_width = plane != AOM_PLANE_Y ? rec->uv_crop_width : rec->y_crop_width;
+  int rec_height =
+      plane != AOM_PLANE_Y ? rec->uv_crop_height : rec->y_crop_height;
+  int rec_stride = plane != AOM_PLANE_Y ? rec->uv_stride : rec->y_stride;
+  int org_stride = plane != AOM_PLANE_Y ? org->uv_stride : org->y_stride;
   sum[0] = sum[1] = sum[2] = sum[3] = 0;
-  if (fb_size_log2 > (unsigned int)get_msb(MAX_FB_SIZE) - 3) {
+  if (plane == AOM_PLANE_Y &&
+      fb_size_log2 > (unsigned int)get_msb(MAX_FB_SIZE) - 3) {
     int w1, h1, w2, h2, i, sum1, sum2, sum3, oldfiltered;

    fb_size_log2--;
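The ternaries added to clpf_rdo() select a per-plane view of the frame: buffer pointer, cropped dimensions and strides all switch between the luma and chroma fields of the YV12 buffer. The same selection in isolation, assuming a simplified buffer layout (PlaneView and plane_view are illustrative names, not from the patch):

    #include <stdint.h>

    typedef struct {
      uint8_t *buf;
      int width, height, stride;
    } PlaneView;

    /* Mirrors the logic above: plane 0 is luma, 1 is U, 2 is V. */
    static PlaneView plane_view(uint8_t *y, uint8_t *u, uint8_t *v, int yw,
                                int yh, int ys, int uvw, int uvh, int uvs,
                                int plane) {
      PlaneView pv;
      pv.buf = plane == 0 ? y : (plane == 1 ? u : v);
      pv.width = plane == 0 ? yw : uvw;
      pv.height = plane == 0 ? yh : uvh;
      pv.stride = plane == 0 ? ys : uvs;
      return pv;
    }

    int main(void) {
      uint8_t y[64], u[16], v[16];
      PlaneView pv = plane_view(y, u, v, 8, 8, 8, 4, 4, 4, 1);
      return pv.buf == u ? 0 : 1; /* picking plane 1 yields the U buffer */
    }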
@@ -190,16 +223,17 @@ static int clpf_rdo(int y, int x, const YV12_BUFFER_CONFIG *rec,
     oldfiltered = res[i][0];
     res[i][0] = 0;

-    filtered =
-        clpf_rdo(y, x, rec, org, cm, block_size, fb_size_log2, w1, h1, res);
+    filtered = clpf_rdo(y, x, rec, org, cm, block_size, fb_size_log2, w1, h1,
+                        res, plane);
     if (1 << (fb_size_log2 - bslog) < w)
       filtered |= clpf_rdo(y, x + (1 << fb_size_log2), rec, org, cm, block_size,
-                           fb_size_log2, w2, h1, res);
+                           fb_size_log2, w2, h1, res, plane);
     if (1 << (fb_size_log2 - bslog) < h) {
       filtered |= clpf_rdo(y + (1 << fb_size_log2), x, rec, org, cm, block_size,
-                           fb_size_log2, w1, h2, res);
-      filtered |= clpf_rdo(y + (1 << fb_size_log2), x + (1 << fb_size_log2),
-                           rec, org, cm, block_size, fb_size_log2, w2, h2, res);
+                           fb_size_log2, w1, h2, res, plane);
+      filtered |=
+          clpf_rdo(y + (1 << fb_size_log2), x + (1 << fb_size_log2), rec, org,
+                   cm, block_size, fb_size_log2, w2, h2, res, plane);
     }

     res[i][1] = AOMMIN(sum1 + res[i][0], res[i][1]);
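The four recursive calls above walk a top-down search over filter-block sizes: fb_size_log2 is halved and each quadrant that actually lies inside the frame is re-evaluated. A much-simplified skeleton of that traversal (rdo_recurse is an illustrative stand-in; the cost bookkeeping is omitted):

    /* Sketch under assumptions: 32x32 is treated as the smallest filter
       block, and res[] accumulation is left out. */
    static int rdo_recurse(int y, int x, int fb_size_log2, int w, int h,
                           long long res[4][4]) {
      if (fb_size_log2 > 5) {
        const int half = 1 << --fb_size_log2;
        int filtered = rdo_recurse(y, x, fb_size_log2, w, h, res);
        if (x + half < w)
          filtered |= rdo_recurse(y, x + half, fb_size_log2, w, h, res);
        if (y + half < h) {
          filtered |= rdo_recurse(y + half, x, fb_size_log2, w, h, res);
          if (x + half < w)
            filtered |= rdo_recurse(y + half, x + half, fb_size_log2, w, h, res);
        }
        return filtered;
      }
      return 0; /* leaf: this is where per-block SSDs would be measured */
    }

    int main(void) {
      long long res[4][4] = { { 0 } };
      return rdo_recurse(0, 0, 7, 128, 128, res);
    }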
@@ -213,32 +247,31 @@ static int clpf_rdo(int y, int x, const YV12_BUFFER_CONFIG *rec,
     for (n = 0; n < w; n++) {
       int xpos = x + n * block_size;
       int ypos = y + m * block_size;
-      if (!cm->mi_grid_visible[ypos / MAX_MIB_SIZE * cm->mi_stride +
-                               xpos / MAX_MIB_SIZE]
+      if (!cm->mi_grid_visible[(ypos << suby) / MI_SIZE * cm->mi_stride +
+                               (xpos << subx) / MI_SIZE]
               ->mbmi.skip) {
 #if CONFIG_AOM_HIGHBITDEPTH
         if (cm->use_highbitdepth) {
-          aom_clpf_detect_multi_hbd(CONVERT_TO_SHORTPTR(rec->y_buffer),
-                                    CONVERT_TO_SHORTPTR(org->y_buffer),
-                                    rec->y_stride, org->y_stride, xpos, ypos,
-                                    rec->y_crop_width, rec->y_crop_height, sum,
-                                    cm->bit_depth - 8);
+          aom_clpf_detect_multi_hbd(
+              CONVERT_TO_SHORTPTR(rec_buffer), CONVERT_TO_SHORTPTR(org_buffer),
+              rec_stride, org_stride, xpos, ypos, rec_width, rec_height, sum,
+              cm->bit_depth - 8, block_size);
         } else {
-          aom_clpf_detect_multi(rec->y_buffer, org->y_buffer, rec->y_stride,
-                                org->y_stride, xpos, ypos, rec->y_crop_width,
-                                rec->y_crop_height, sum);
+          aom_clpf_detect_multi(rec_buffer, org_buffer, rec_stride, org_stride,
+                                xpos, ypos, rec_width, rec_height, sum,
+                                block_size);
         }
 #else
-        aom_clpf_detect_multi(rec->y_buffer, org->y_buffer, rec->y_stride,
-                              org->y_stride, xpos, ypos, rec->y_crop_width,
-                              rec->y_crop_height, sum);
+        aom_clpf_detect_multi(rec_buffer, org_buffer, rec_stride, org_stride,
+                              xpos, ypos, rec_width, rec_height, sum,
+                              block_size);
 #endif
         filtered = 1;
       }
     }
   }

-  for (c = 0; c < 4; c++) {
+  for (c = 0; c < (plane == AOM_PLANE_Y ? 4 : 1); c++) {
     res[c][0] += sum[0];
     res[c][1] += sum[1];
     res[c][2] += sum[2];
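The rewritten skip test scales a chroma block position back to luma coordinates before indexing the mode-info grid, so the skip flag is read from the co-located luma block. In scalar form, with the MI_SIZE value assumed here:

    #include <stdio.h>

    #define MI_SIZE 8 /* assumed mode-info unit in luma pixels */

    static int mi_index(int xpos, int ypos, int subx, int suby,
                        int mi_stride) {
      /* (xpos << subx, ypos << suby) is the luma-plane position of the
         chroma sample; dividing by MI_SIZE lands on the mi-grid cell. */
      return (ypos << suby) / MI_SIZE * mi_stride + (xpos << subx) / MI_SIZE;
    }

    int main(void) {
      /* 4:2:0 chroma (12, 20) maps to luma (24, 40), i.e. mi cell (3, 5). */
      printf("mi index: %d\n", mi_index(12, 20, 1, 1, 100)); /* 503 */
      return 0;
    }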
@@ -249,17 +282,26 @@ static int clpf_rdo(int y, int x, const YV12_BUFFER_CONFIG *rec,

 void av1_clpf_test_frame(const YV12_BUFFER_CONFIG *rec,
                          const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
-                         int *best_strength, int *best_bs) {
+                         int *best_strength, int *best_bs, int plane) {
   int c, j, k, l;
   int64_t best, sums[4][4];
-  int width = rec->y_crop_width, height = rec->y_crop_height;
-  const int bs = MAX_MIB_SIZE;
+  int width = plane != AOM_PLANE_Y ? rec->uv_crop_width : rec->y_crop_width;
+  int height = plane != AOM_PLANE_Y ? rec->uv_crop_height : rec->y_crop_height;
+  const int bs = MI_SIZE;
+  const int bslog = get_msb(bs);
   int fb_size_log2 = get_msb(MAX_FB_SIZE);
   int num_fb_ver = (height + (1 << fb_size_log2) - bs) >> fb_size_log2;
   int num_fb_hor = (width + (1 << fb_size_log2) - bs) >> fb_size_log2;

   memset(sums, 0, sizeof(sums));

+  if (plane != AOM_PLANE_Y)
+    // Use a block size of MI_SIZE regardless of the subsampling.  This
+    // is accurate enough to determine the best strength, and we don't
+    // need to add SIMD optimisations for 4x4 blocks.
+    clpf_rdo(0, 0, rec, org, cm, bs, fb_size_log2, width >> bslog,
+             height >> bslog, sums, plane);
+  else
   for (k = 0; k < num_fb_ver; k++) {
     for (l = 0; l < num_fb_hor; l++) {
       // Calculate the block size after frame border clipping
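The filter-block counts above fold a trailing remainder narrower than one MI unit (bs) into the previous filter block instead of counting it separately. The arithmetic in isolation:

    #include <stdio.h>

    /* Sketch of the count formula used above. */
    static int num_fb(int length, int fb_size_log2, int bs) {
      return (length + (1 << fb_size_log2) - bs) >> fb_size_log2;
    }

    int main(void) {
      /* 360 rows with 128-pixel filter blocks: (360 + 128 - 8) >> 7 = 3. */
      printf("%d\n", num_fb(360, 7, 8));
      return 0;
    }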
@@ -269,11 +311,14 @@ void av1_clpf_test_frame(const YV12_BUFFER_CONFIG *rec,
           AOMMIN(width, (l + 1) << fb_size_log2) & ((1 << fb_size_log2) - 1);
       h += !h << fb_size_log2;
       w += !w << fb_size_log2;
-      clpf_rdo(k << fb_size_log2, l << fb_size_log2, rec, org, cm, bs,
-               fb_size_log2, w / bs, h / bs, sums);
+      clpf_rdo(k << fb_size_log2, l << fb_size_log2, rec, org, cm, MI_SIZE,
+               fb_size_log2, w >> bslog, h >> bslog, sums, plane);
     }
   }

+  if (plane != AOM_PLANE_Y)  // Slightly favour unfiltered chroma
+    sums[0][0] -= sums[0][0] >> 7;
+
   for (j = 0; j < 4; j++) {
     static const double lambda_square[] = {
       // exp(x / 8.5)
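The new chroma bias multiplies the unfiltered SSD by 127/128 using only a shift and a subtract, so filtering has to win by roughly 0.8% before it is enabled for a chroma plane. The same trick in isolation:

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
      int64_t ssd = 100000;
      ssd -= ssd >> 7; /* ssd *= 127/128, no division needed */
      printf("%lld\n", (long long)ssd); /* 99219 */
      return 0;
    }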
@@ -290,13 +335,13 @@ void av1_clpf_test_frame(const YV12_BUFFER_CONFIG *rec,
     // Estimate the bit costs and adjust the square errors
     double lambda =
         lambda_square[av1_get_qindex(&cm->seg, 0, cm->base_qindex) >> 2];
-    int i, cost = (int)((lambda * (sums[j][0] + 2 + 2 * (j > 0)) + 0.5));
+    int i, cost = (int)((lambda * (sums[j][0] + 6 + 2 * (j > 0)) + 0.5));
     for (i = 0; i < 4; i++)
       sums[j][i] = ((sums[j][i] + (i && j) * cost) << 4) + j * 4 + i;
   }

   best = (int64_t)1 << 62;
-  for (c = 0; c < 4; c++)
+  for (c = 0; c < (plane == AOM_PLANE_Y ? 4 : 1); c++)
     for (j = 0; j < 4; j++)
       if ((!c || j) && sums[c][j] < best) best = sums[c][j];
   best &= 15;
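The "<< 4" plus "best &= 15" pattern above packs each candidate's rate-adjusted error and its decision index (j * 4 + i) into one integer, so a plain minimum search returns both the winning cost and the winning decision. A reduced sketch of the same packing:

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
      int64_t sums[2] = { 5000, 4900 };
      int64_t best = INT64_MAX;
      int idx;
      for (idx = 0; idx < 2; idx++) {
        int64_t packed = (sums[idx] << 4) + idx; /* cost in high bits */
        if (packed < best) best = packed;
      }
      printf("winner: %d\n", (int)(best & 15)); /* 1: the cheaper candidate */
      return 0;
    }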
@@ -17,10 +17,10 @@
 int av1_clpf_decision(int k, int l, const YV12_BUFFER_CONFIG *rec,
                       const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
                       int block_size, int w, int h, unsigned int strength,
-                      unsigned int fb_size_log2, uint8_t *res);
+                      unsigned int fb_size_log2, uint8_t *res, int plane);

 void av1_clpf_test_frame(const YV12_BUFFER_CONFIG *rec,
                          const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
-                         int *best_strength, int *best_bs);
+                         int *best_strength, int *best_bs, int plane);

 #endif
@@ -9,343 +9,172 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */

+#include "./aom_dsp_rtcd.h"
 #include "aom_dsp/aom_simd.h"
+#include "aom_ports/mem.h"

-SIMD_INLINE v128 calc_delta(v128 x, v128 a, v128 b, v128 c, v128 d, v128 e,
-                            v128 f, v128 sp, v128 sm) {
-  const v128 tmp =
-      v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm),
-                 v128_max_s8(v128_min_s8(v128_ssub_s8(d, x), sp), sm));
-  v128 delta = v128_add_8(
-      v128_add_8(
-          v128_shl_8(
-              v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(a, x), sp), sm),
-                         v128_max_s8(v128_min_s8(v128_ssub_s8(f, x), sp), sm)),
-              2),
-          v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(b, x), sp), sm),
-                     v128_max_s8(v128_min_s8(v128_ssub_s8(e, x), sp), sm))),
-      v128_add_8(v128_add_8(tmp, tmp), tmp));
-  return v128_shr_s8(
-      v128_add_8(v128_dup_8(8),
-                 v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()))),
-      4);
-}
+SIMD_INLINE void calc_diff(v128 o, v128 *a, v128 *b, v128 *c, v128 *d, v128 *e,
+                           v128 *f) {
+  // The difference will be 9 bit, offset by 128 so we can use saturated
+  // sub to avoid going to 16 bit temporarily before "strength" clipping.
+  const v128 c128 = v128_dup_8(128);
+  v128 x = v128_add_8(c128, o);
+  *a = v128_ssub_s8(v128_add_8(c128, *a), x);
+  *b = v128_ssub_s8(v128_add_8(c128, *b), x);
+  *c = v128_ssub_s8(v128_add_8(c128, *c), x);
+  *d = v128_ssub_s8(v128_add_8(c128, *d), x);
+  *e = v128_ssub_s8(v128_add_8(c128, *e), x);
+  *f = v128_ssub_s8(v128_add_8(c128, *f), x);
+}
+
+SIMD_INLINE v128 delta_kernel(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
+                              v128 f, v128 sp, v128 sm) {
+  const v128 tmp = v128_add_8(v128_max_s8(v128_min_s8(c, sp), sm),
+                              v128_max_s8(v128_min_s8(d, sp), sm));
+  const v128 delta = v128_add_8(
+      v128_add_8(v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, sp), sm),
+                                       v128_max_s8(v128_min_s8(f, sp), sm)),
+                            2),
+                 v128_add_8(v128_max_s8(v128_min_s8(b, sp), sm),
+                            v128_max_s8(v128_min_s8(e, sp), sm))),
+      v128_add_8(v128_add_8(tmp, tmp), tmp));
+  return v128_add_8(
+      o, v128_shr_s8(
+             v128_add_8(v128_dup_8(8),
+                        v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()))),
+             4));
+}
+
+SIMD_INLINE v128 calc_delta(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
+                            v128 f, v128 sp, v128 sm) {
+  calc_diff(o, &a, &b, &c, &d, &e, &f);
+  return delta_kernel(o, a, b, c, d, e, f, sp, sm);
+}
+
+SIMD_INLINE void clip_sides(v128 *b, v128 *c, v128 *d, v128 *e, int left,
+                            int right) {
+  DECLARE_ALIGNED(16, static const uint64_t,
+                  b_shuff[]) = { 0x0504030201000000LL, 0x0d0c0b0a09080808LL };
+  DECLARE_ALIGNED(16, static const uint64_t,
+                  c_shuff[]) = { 0x0605040302010000LL, 0x0e0d0c0b0a090808LL };
+  DECLARE_ALIGNED(16, static const uint64_t,
+                  d_shuff[]) = { 0x0707060504030201LL, 0x0f0f0e0d0c0b0a09LL };
+  DECLARE_ALIGNED(16, static const uint64_t,
+                  e_shuff[]) = { 0x0707070605040302LL, 0x0f0f0f0e0d0c0b0aLL };
+
+  if (!left) {  // Left clipping
+    *b = v128_shuffle_8(*b, v128_load_aligned(b_shuff));
+    *c = v128_shuffle_8(*c, v128_load_aligned(c_shuff));
+  }
+  if (!right) {  // Right clipping
+    *d = v128_shuffle_8(*d, v128_load_aligned(d_shuff));
+    *e = v128_shuffle_8(*e, v128_load_aligned(e_shuff));
+  }
+}
+
+SIMD_INLINE void read_two_lines(const uint8_t *rec, const uint8_t *org,
+                                int rstride, int ostride, int x0, int y0,
+                                int bottom, int right, int y, v128 *o, v128 *r,
+                                v128 *a, v128 *b, v128 *c, v128 *d, v128 *e,
+                                v128 *f) {
+  const v64 k1 = v64_load_aligned(org);
+  const v64 k2 = v64_load_aligned(org + ostride);
+  const v64 l1 = v64_load_aligned(rec);
+  const v64 l2 = v64_load_aligned(rec + rstride);
+  *o = v128_from_v64(k1, k2);
+  *r = v128_from_v64(l1, l2);
+  *a = v128_from_v64(v64_load_aligned(rec - (y != -y0) * rstride), l1);
+  *f = v128_from_v64(l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride));
+  *b = v128_from_v64(v64_load_unaligned(rec - 2 * !!x0),
+                     v64_load_unaligned(rec - 2 * !!x0 + rstride));
+  *c = v128_from_v64(v64_load_unaligned(rec - !!x0),
+                     v64_load_unaligned(rec - !!x0 + rstride));
+  *d = v128_from_v64(v64_load_unaligned(rec + !!right),
+                     v64_load_unaligned(rec + !!right + rstride));
+  *e = v128_from_v64(v64_load_unaligned(rec + 2 * !!right),
+                     v64_load_unaligned(rec + 2 * !!right + rstride));
+  clip_sides(b, c, d, e, x0, right);
+}
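Read together, calc_diff() and delta_kernel() compute a clipped 7-tap filter: the differences to the vertical neighbours (a, f) and the four horizontal neighbours (b, c, d, e) are clipped to plus/minus the strength, weighted 4-1-3-3-1-4, and the weighted sum is rounded to sixteenths. One plausible scalar rendering of the same arithmetic (clamp_s and clpf_sample are illustrative names):

    #include <stdio.h>

    static int clamp_s(int v, int s) {
      return v < -s ? -s : (v > s ? s : v);
    }

    /* x: reconstructed pixel; a above, f below; b,c left; d,e right. */
    static int clpf_sample(int x, int a, int b, int c, int d, int e, int f,
                           int s) {
      int delta = 4 * (clamp_s(a - x, s) + clamp_s(f - x, s)) +
                  clamp_s(b - x, s) + clamp_s(e - x, s) +
                  3 * (clamp_s(c - x, s) + clamp_s(d - x, s));
      /* Round to nearest; the extra -1 keeps rounding symmetric around 0. */
      return x + ((8 + delta - (delta < 0)) >> 4);
    }

    int main(void) {
      /* A flat area with one bright pixel: the filter pulls it down. */
      printf("%d\n", clpf_sample(80, 70, 70, 70, 70, 70, 70, 4)); /* 76 */
      return 0;
    }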
 void SIMD_FUNC(aom_clpf_detect)(const uint8_t *rec, const uint8_t *org,
                                 int rstride, int ostride, int x0, int y0,
                                 int width, int height, int *sum0, int *sum1,
-                                unsigned int strength) {
-  ssd128_internal ssd0 = v128_ssd_u8_init();
-  ssd128_internal ssd1 = v128_ssd_u8_init();
-  const v128 c128 = v128_dup_8(128);
+                                unsigned int strength, int size) {
   const v128 sp = v128_dup_8(strength);
   const v128 sm = v128_dup_8(-(int)strength);
+  const int right = width - 8 - x0;
   const int bottom = height - 2 - y0;
+  ssd128_internal ssd0 = v128_ssd_u8_init();
+  ssd128_internal ssd1 = v128_ssd_u8_init();
+  int y;
+
+  if (size != 8) {  // Fallback to plain C
+    aom_clpf_detect_c(rec, org, rstride, ostride, x0, y0, width, height, sum0,
+                      sum1, strength, size);
+    return;
+  }
+
   rec += x0 + y0 * rstride;
   org += x0 + y0 * ostride;

-  if (!x0) {  // Clip left
-    const v128 b_shuff = v128_from_v64(v64_from_64(0x0d0c0b0a09080808LL),
-                                       v64_from_64(0x0504030201000000LL));
-    const v128 c_shuff = v128_from_v64(v64_from_64(0x0e0d0c0b0a090808LL),
-                                       v64_from_64(0x0605040302010000LL));
-    int y;
-
-    for (y = 0; y < 8; y += 2) {
-      const v64 k1 = v64_load_aligned(org);
-      const v64 k2 = v64_load_aligned(org + ostride);
-      const v64 l1 = v64_load_aligned(rec);
-      const v64 l2 = v64_load_aligned(rec + rstride);
-      v128 o = v128_from_v64(k1, k2);
-      const v128 q = v128_from_v64(l1, l2);
-      const v128 x = v128_add_8(c128, q);
-      const v128 a = v128_add_8(
-          c128,
-          v128_from_v64(v64_load_aligned(rec - (y != -y0) * rstride), l1));
-      const v128 b = v128_shuffle_8(x, b_shuff);
-      const v128 c = v128_shuffle_8(x, c_shuff);
-      const v128 d = v128_add_8(
-          c128, v128_from_v64(v64_load_unaligned(rec + 1),
-                              v64_load_unaligned(rec + 1 + rstride)));
-      const v128 e = v128_add_8(
-          c128, v128_from_v64(v64_load_unaligned(rec + 2),
-                              v64_load_unaligned(rec + 2 + rstride)));
-      const v128 f = v128_add_8(
-          c128, v128_from_v64(
-                    l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride)));
-      ssd0 = v128_ssd_u8(ssd0, o, q);
-      ssd1 = v128_ssd_u8(
-          ssd1, o, v128_add_8(q, calc_delta(x, a, b, c, d, e, f, sp, sm)));
-      rec += rstride * 2;
-      org += ostride * 2;
-    }
-  } else if (!(width - x0 - 8)) {  // Clip right
-    const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0f0e0d0c0b0a09LL),
-                                       v64_from_64(0x0707060504030201LL));
-    const v128 e_shuff = v128_from_v64(v64_from_64(0x0f0f0f0e0d0c0b0aLL),
-                                       v64_from_64(0x0707070605040302LL));
-    int y;
-
-    for (y = 0; y < 8; y += 2) {
-      const v64 k1 = v64_load_aligned(org);
-      const v64 k2 = v64_load_aligned(org + ostride);
-      const v64 l1 = v64_load_aligned(rec);
-      const v64 l2 = v64_load_aligned(rec + rstride);
-      v128 o = v128_from_v64(k1, k2);
-      const v128 q = v128_from_v64(l1, l2);
-      const v128 x = v128_add_8(c128, q);
-      const v128 a = v128_add_8(
-          c128,
-          v128_from_v64(v64_load_aligned(rec - (y != -y0) * rstride), l1));
-      const v128 b = v128_add_8(
-          c128, v128_from_v64(v64_load_unaligned(rec - 2),
-                              v64_load_unaligned(rec - 2 + rstride)));
-      const v128 c = v128_add_8(
-          c128, v128_from_v64(v64_load_unaligned(rec - 1),
-                              v64_load_unaligned(rec - 1 + rstride)));
-      const v128 d = v128_shuffle_8(x, d_shuff);
-      const v128 e = v128_shuffle_8(x, e_shuff);
-      const v128 f = v128_add_8(
-          c128, v128_from_v64(
-                    l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride)));
-      ssd0 = v128_ssd_u8(ssd0, o, q);
-      ssd1 = v128_ssd_u8(
-          ssd1, o, v128_add_8(q, calc_delta(x, a, b, c, d, e, f, sp, sm)));
-      rec += rstride * 2;
-      org += ostride * 2;
-    }
-  } else {  // No left/right clipping
-    int y;
-    for (y = 0; y < 8; y += 2) {
-      const v64 k1 = v64_load_aligned(org);
-      const v64 k2 = v64_load_aligned(org + ostride);
-      const v64 l1 = v64_load_aligned(rec);
-      const v64 l2 = v64_load_aligned(rec + rstride);
-      v128 o = v128_from_v64(k1, k2);
-      const v128 q = v128_from_v64(l1, l2);
-      const v128 x = v128_add_8(c128, q);
-      const v128 a = v128_add_8(
-          c128,
-          v128_from_v64(v64_load_aligned(rec - (y != -y0) * rstride), l1));
-      const v128 b = v128_add_8(
-          c128, v128_from_v64(v64_load_unaligned(rec - 2),
-                              v64_load_unaligned(rec - 2 + rstride)));
-      const v128 c = v128_add_8(
-          c128, v128_from_v64(v64_load_unaligned(rec - 1),
-                              v64_load_unaligned(rec - 1 + rstride)));
-      const v128 d = v128_add_8(
-          c128, v128_from_v64(v64_load_unaligned(rec + 1),
-                              v64_load_unaligned(rec + 1 + rstride)));
-      const v128 e = v128_add_8(
-          c128, v128_from_v64(v64_load_unaligned(rec + 2),
-                              v64_load_unaligned(rec + 2 + rstride)));
-      const v128 f = v128_add_8(
-          c128, v128_from_v64(
-                    l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride)));
-      ssd0 = v128_ssd_u8(ssd0, o, q);
-      ssd1 = v128_ssd_u8(
-          ssd1, o, v128_add_8(q, calc_delta(x, a, b, c, d, e, f, sp, sm)));
-      rec += rstride * 2;
-      org += ostride * 2;
-    }
-  }
+
+  for (y = 0; y < 8; y += 2) {
+    v128 a, b, c, d, e, f, o, r;
+    read_two_lines(rec, org, rstride, ostride, x0, y0, bottom, right, y, &o, &r,
+                   &a, &b, &c, &d, &e, &f);
+    ssd0 = v128_ssd_u8(ssd0, o, r);
+    ssd1 = v128_ssd_u8(ssd1, o, calc_delta(r, a, b, c, d, e, f, sp, sm));
+    rec += rstride * 2;
+    org += ostride * 2;
+  }
   *sum0 += v128_ssd_u8_sum(ssd0);
   *sum1 += v128_ssd_u8_sum(ssd1);
 }
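read_two_lines() plus clip_sides() replace the three hand-specialised loops (clip left, clip right, interior) each detect function used to carry: the !!x0 and !!right offsets keep the loads inside the frame at an edge, and the shuffles then clamp the two horizontal taps to the edge sample. The clamping in scalar terms (illustrative helpers):

    #include <stdio.h>

    static unsigned char tap_left2(const unsigned char *row, int i) {
      return row[i >= 2 ? i - 2 : 0]; /* "b" tap: two left, edge-clamped */
    }
    static unsigned char tap_right2(const unsigned char *row, int i, int w) {
      return row[i + 2 < w ? i + 2 : w - 1]; /* "e" tap: two right */
    }

    int main(void) {
      const unsigned char row[8] = { 10, 20, 30, 40, 50, 60, 70, 80 };
      printf("%d %d\n", tap_left2(row, 1), tap_right2(row, 7, 8)); /* 10 80 */
      return 0;
    }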
-SIMD_INLINE void calc_delta_multi(v128 x, v128 q, v128 o, v128 a, v128 b,
-                                  v128 c, v128 d, v128 e, v128 f, v128 cp1,
-                                  v128 cm1, v128 cp2, v128 cm2, v128 cp4,
-                                  v128 cm4, ssd128_internal *ssd1,
+SIMD_INLINE void calc_delta_multi(v128 r, v128 o, v128 a, v128 b, v128 c,
+                                  v128 d, v128 e, v128 f, ssd128_internal *ssd1,
                                   ssd128_internal *ssd2,
                                   ssd128_internal *ssd3) {
-  v128 tmp, delta1, delta2, delta3;
-  const v128 c8 = v128_dup_8(8);
-  a = v128_ssub_s8(a, x);
-  b = v128_ssub_s8(b, x);
-  c = v128_ssub_s8(c, x);
-  d = v128_ssub_s8(d, x);
-  e = v128_ssub_s8(e, x);
-  f = v128_ssub_s8(f, x);
-  tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp1), cm1),
-                   v128_max_s8(v128_min_s8(d, cp1), cm1));
-  delta1 = v128_add_8(
-      v128_add_8(v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp1), cm1),
-                                       v128_max_s8(v128_min_s8(f, cp1), cm1)),
-                            2),
-                 v128_add_8(v128_max_s8(v128_min_s8(b, cp1), cm1),
-                            v128_max_s8(v128_min_s8(e, cp1), cm1))),
-      v128_add_8(v128_add_8(tmp, tmp), tmp));
-  tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp2), cm2),
-                   v128_max_s8(v128_min_s8(d, cp2), cm2));
-  delta2 = v128_add_8(
-      v128_add_8(v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp2), cm2),
-                                       v128_max_s8(v128_min_s8(f, cp2), cm2)),
-                            2),
-                 v128_add_8(v128_max_s8(v128_min_s8(b, cp2), cm2),
-                            v128_max_s8(v128_min_s8(e, cp2), cm2))),
-      v128_add_8(v128_add_8(tmp, tmp), tmp));
-  tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp4), cm4),
-                   v128_max_s8(v128_min_s8(d, cp4), cm4));
-  delta3 = v128_add_8(
-      v128_add_8(v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp4), cm4),
-                                       v128_max_s8(v128_min_s8(f, cp4), cm4)),
-                            2),
-                 v128_add_8(v128_max_s8(v128_min_s8(b, cp4), cm4),
-                            v128_max_s8(v128_min_s8(e, cp4), cm4))),
-      v128_add_8(v128_add_8(tmp, tmp), tmp));
-
-  *ssd1 = v128_ssd_u8(
-      *ssd1, o,
-      v128_add_8(
-          q, v128_shr_s8(
-                 v128_add_8(c8, v128_add_8(delta1,
-                                           v128_cmplt_s8(delta1, v128_zero()))),
-                 4)));
-  *ssd2 = v128_ssd_u8(
-      *ssd2, o,
-      v128_add_8(
-          q, v128_shr_s8(
-                 v128_add_8(c8, v128_add_8(delta2,
-                                           v128_cmplt_s8(delta2, v128_zero()))),
-                 4)));
-  *ssd3 = v128_ssd_u8(
-      *ssd3, o,
-      v128_add_8(
-          q, v128_shr_s8(
-                 v128_add_8(c8, v128_add_8(delta3,
-                                           v128_cmplt_s8(delta3, v128_zero()))),
-                 4)));
+  calc_diff(r, &a, &b, &c, &d, &e, &f);
+  *ssd1 = v128_ssd_u8(*ssd1, o, delta_kernel(r, a, b, c, d, e, f, v128_dup_8(1),
+                                             v128_dup_8(-1)));
+  *ssd2 = v128_ssd_u8(*ssd2, o, delta_kernel(r, a, b, c, d, e, f, v128_dup_8(2),
+                                             v128_dup_8(-2)));
+  *ssd3 = v128_ssd_u8(*ssd3, o, delta_kernel(r, a, b, c, d, e, f, v128_dup_8(4),
+                                             v128_dup_8(-4)));
 }
 // Test multiple filter strengths at once.
 void SIMD_FUNC(aom_clpf_detect_multi)(const uint8_t *rec, const uint8_t *org,
                                       int rstride, int ostride, int x0, int y0,
-                                      int width, int height, int *sum) {
-  const v128 c128 = v128_dup_8(128);
-  const v128 cp1 = v128_dup_8(1);
-  const v128 cm1 = v128_dup_8(-1);
-  const v128 cp2 = v128_dup_8(2);
-  const v128 cm2 = v128_dup_8(-2);
-  const v128 cp4 = v128_dup_8(4);
-  const v128 cm4 = v128_dup_8(-4);
+                                      int width, int height, int *sum,
+                                      int size) {
   const int bottom = height - 2 - y0;
+  const int right = width - 8 - x0;
   ssd128_internal ssd0 = v128_ssd_u8_init();
   ssd128_internal ssd1 = v128_ssd_u8_init();
   ssd128_internal ssd2 = v128_ssd_u8_init();
   ssd128_internal ssd3 = v128_ssd_u8_init();
+  int y;
+
+  if (size != 8) {  // Fallback to plain C
+    aom_clpf_detect_multi_c(rec, org, rstride, ostride, x0, y0, width, height,
+                            sum, size);
+    return;
+  }
+
   rec += x0 + y0 * rstride;
   org += x0 + y0 * ostride;

-  if (!x0) {  // Clip left
-    const v128 b_shuff = v128_from_v64(v64_from_64(0x0d0c0b0a09080808LL),
-                                       v64_from_64(0x0504030201000000LL));
-    const v128 c_shuff = v128_from_v64(v64_from_64(0x0e0d0c0b0a090808LL),
-                                       v64_from_64(0x0605040302010000LL));
-    int y;
-
-    for (y = 0; y < 8; y += 2) {
-      const v64 k1 = v64_load_aligned(org);
-      const v64 k2 = v64_load_aligned(org + ostride);
-      const v64 l1 = v64_load_aligned(rec);
-      const v64 l2 = v64_load_aligned(rec + rstride);
-      v128 o = v128_from_v64(k1, k2);
-      const v128 q = v128_from_v64(l1, l2);
-      const v128 x = v128_add_8(c128, q);
-      v128 a = v128_add_8(
-          c128,
-          v128_from_v64(v64_load_aligned(rec - (y != -y0) * rstride), l1));
-      v128 b = v128_shuffle_8(x, b_shuff);
-      v128 c = v128_shuffle_8(x, c_shuff);
-      v128 d = v128_add_8(c128,
-                          v128_from_v64(v64_load_unaligned(rec + 1),
-                                        v64_load_unaligned(rec + 1 + rstride)));
-      v128 e = v128_add_8(c128,
-                          v128_from_v64(v64_load_unaligned(rec + 2),
-                                        v64_load_unaligned(rec + 2 + rstride)));
-      v128 f = v128_add_8(
-          c128, v128_from_v64(
-                    l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride)));
-      ssd0 = v128_ssd_u8(ssd0, o, q);
-      calc_delta_multi(x, q, o, a, b, c, d, e, f, cp1, cm1, cp2, cm2, cp4, cm4,
-                       &ssd1, &ssd2, &ssd3);
-      rec += 2 * rstride;
-      org += 2 * ostride;
-    }
-  } else if (!(width - x0 - 8)) {  // Clip right
-    const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0f0e0d0c0b0a09LL),
-                                       v64_from_64(0x0707060504030201LL));
-    const v128 e_shuff = v128_from_v64(v64_from_64(0x0f0f0f0e0d0c0b0aLL),
-                                       v64_from_64(0x0707070605040302LL));
-    int y;
-
-    for (y = 0; y < 8; y += 2) {
-      const v64 k1 = v64_load_aligned(org);
-      const v64 k2 = v64_load_aligned(org + ostride);
-      const v64 l1 = v64_load_aligned(rec);
-      const v64 l2 = v64_load_aligned(rec + rstride);
-      v128 o = v128_from_v64(k1, k2);
-      const v128 q = v128_from_v64(l1, l2);
-      const v128 x = v128_add_8(c128, q);
-      v128 a = v128_add_8(
-          c128,
-          v128_from_v64(v64_load_aligned(rec - (y != -y0) * rstride), l1));
-      v128 b = v128_add_8(c128,
-                          v128_from_v64(v64_load_unaligned(rec - 2),
-                                        v64_load_unaligned(rec - 2 + rstride)));
-      v128 c = v128_add_8(c128,
-                          v128_from_v64(v64_load_unaligned(rec - 1),
-                                        v64_load_unaligned(rec - 1 + rstride)));
-      v128 d = v128_shuffle_8(x, d_shuff);
-      v128 e = v128_shuffle_8(x, e_shuff);
-      v128 f = v128_add_8(
-          c128, v128_from_v64(
-                    l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride)));
-      ssd0 = v128_ssd_u8(ssd0, o, q);
-      calc_delta_multi(x, q, o, a, b, c, d, e, f, cp1, cm1, cp2, cm2, cp4, cm4,
-                       &ssd1, &ssd2, &ssd3);
-      rec += 2 * rstride;
-      org += 2 * ostride;
-    }
-  } else {  // No left/right clipping
-    int y;
-    for (y = 0; y < 8; y += 2) {
-      const v64 k1 = v64_load_aligned(org);
-      const v64 k2 = v64_load_aligned(org + ostride);
-      const v64 l1 = v64_load_aligned(rec);
-      const v64 l2 = v64_load_aligned(rec + rstride);
-      v128 o = v128_from_v64(k1, k2);
-      const v128 q = v128_from_v64(l1, l2);
-      const v128 x = v128_add_8(c128, q);
-      v128 a = v128_add_8(
-          c128,
-          v128_from_v64(v64_load_aligned(rec - (y != -y0) * rstride), l1));
-      v128 b = v128_add_8(c128,
-                          v128_from_v64(v64_load_unaligned(rec - 2),
-                                        v64_load_unaligned(rec - 2 + rstride)));
-      v128 c = v128_add_8(c128,
-                          v128_from_v64(v64_load_unaligned(rec - 1),
-                                        v64_load_unaligned(rec - 1 + rstride)));
-      v128 d = v128_add_8(c128,
-                          v128_from_v64(v64_load_unaligned(rec + 1),
-                                        v64_load_unaligned(rec + 1 + rstride)));
-      v128 e = v128_add_8(c128,
-                          v128_from_v64(v64_load_unaligned(rec + 2),
-                                        v64_load_unaligned(rec + 2 + rstride)));
-      v128 f = v128_add_8(
-          c128, v128_from_v64(
-                    l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride)));
-      ssd0 = v128_ssd_u8(ssd0, o, q);
-      calc_delta_multi(x, q, o, a, b, c, d, e, f, cp1, cm1, cp2, cm2, cp4, cm4,
-                       &ssd1, &ssd2, &ssd3);
-      rec += 2 * rstride;
-      org += 2 * ostride;
-    }
-  }
+
+  for (y = 0; y < 8; y += 2) {
+    v128 a, b, c, d, e, f, o, r;
+    read_two_lines(rec, org, rstride, ostride, x0, y0, bottom, right, y, &o, &r,
+                   &a, &b, &c, &d, &e, &f);
+    ssd0 = v128_ssd_u8(ssd0, o, r);
+    calc_delta_multi(r, o, a, b, c, d, e, f, &ssd1, &ssd2, &ssd3);
+    rec += 2 * rstride;
+    org += 2 * ostride;
+  }
   sum[0] += v128_ssd_u8_sum(ssd0);
   sum[1] += v128_ssd_u8_sum(ssd1);
   sum[2] += v128_ssd_u8_sum(ssd2);
@@ -353,155 +182,67 @@ void SIMD_FUNC(aom_clpf_detect_multi)(const uint8_t *rec, const uint8_t *org,
 }

 #if CONFIG_AOM_HIGHBITDEPTH
+SIMD_INLINE void read_two_lines_hbd(const uint16_t *rec, const uint16_t *org,
+                                    int rstride, int ostride, int x0, int y0,
+                                    int bottom, int right, int y, v128 *o,
+                                    v128 *r, v128 *a, v128 *b, v128 *c, v128 *d,
+                                    v128 *e, v128 *f, int shift) {
+  const v128 n1 = v128_shr_u16(v128_load_aligned(rec), shift);
+  const v128 n2 = v128_shr_u16(v128_load_aligned(rec + rstride), shift);
+  *o = v128_unziplo_8(v128_shr_u16(v128_load_aligned(org), shift),
+                      v128_shr_u16(v128_load_aligned(org + ostride), shift));
+  *r = v128_unziplo_8(n1, n2);
+  *a = v128_unziplo_8(
+      v128_shr_u16(v128_load_aligned(rec - (y != -y0) * rstride), shift), n1);
+  *f = v128_unziplo_8(
+      n2, v128_shr_u16(v128_load_unaligned(rec + ((y != bottom) + 1) * rstride),
+                       shift));
+  *b = v128_unziplo_8(
+      v128_shr_u16(v128_load_unaligned(rec - 2 * !!x0), shift),
+      v128_shr_u16(v128_load_unaligned(rec - 2 * !!x0 + rstride), shift));
+  *c = v128_unziplo_8(
+      v128_shr_u16(v128_load_unaligned(rec - !!x0), shift),
+      v128_shr_u16(v128_load_unaligned(rec - !!x0 + rstride), shift));
+  *d = v128_unziplo_8(
+      v128_shr_u16(v128_load_unaligned(rec + !!right), shift),
+      v128_shr_u16(v128_load_unaligned(rec + !!right + rstride), shift));
+  *e = v128_unziplo_8(
+      v128_shr_u16(v128_load_unaligned(rec + 2 * !!right), shift),
+      v128_shr_u16(v128_load_unaligned(rec + 2 * !!right + rstride), shift));
+  clip_sides(b, c, d, e, x0, right);
+}
+
 void SIMD_FUNC(aom_clpf_detect_hbd)(const uint16_t *rec, const uint16_t *org,
                                     int rstride, int ostride, int x0, int y0,
                                     int width, int height, int *sum0, int *sum1,
-                                    unsigned int strength, int shift) {
-  ssd128_internal ssd0 = v128_ssd_u8_init();
-  ssd128_internal ssd1 = v128_ssd_u8_init();
-  const v128 c128 = v128_dup_8(128);
+                                    unsigned int strength, int shift,
+                                    int size) {
   const v128 sp = v128_dup_8(strength >> shift);
   const v128 sm = v128_dup_8(-(int)(strength >> shift));
   const int bottom = height - 2 - y0;
+  const int right = width - 8 - x0;
+  ssd128_internal ssd0 = v128_ssd_u8_init();
+  ssd128_internal ssd1 = v128_ssd_u8_init();
+  int y;
+
+  if (size != 8) {  // Fallback to plain C
+    aom_clpf_detect_hbd_c(rec, org, rstride, ostride, x0, y0, width, height,
+                          sum0, sum1, strength, shift, size);
+    return;
+  }
+
   rec += x0 + y0 * rstride;
   org += x0 + y0 * ostride;

-  if (!x0) {  // Clip left
-    const v128 b_shuff = v128_from_v64(v64_from_64(0x0d0c0b0a09080808LL),
-                                       v64_from_64(0x0504030201000000LL));
-    const v128 c_shuff = v128_from_v64(v64_from_64(0x0e0d0c0b0a090808LL),
-                                       v64_from_64(0x0605040302010000LL));
-    int y;
-
-    for (y = 0; y < 8; y += 2) {
-      const v128 n1 = v128_shr_u16(v128_load_aligned(rec), shift);
-      const v128 n2 = v128_shr_u16(v128_load_aligned(rec + rstride), shift);
-      const v128 o =
-          v128_unziplo_8(v128_shr_u16(v128_load_aligned(org), shift),
-                         v128_shr_u16(v128_load_aligned(org + ostride), shift));
-      const v128 q = v128_unziplo_8(n1, n2);
-      const v128 x = v128_add_8(c128, q);
-      const v128 a = v128_add_8(
-          c128, v128_unziplo_8(
-                    v128_shr_u16(v128_load_aligned(rec - (y != -y0) * rstride),
-                                 shift),
-                    n1));
-      const v128 b = v128_shuffle_8(x, b_shuff);
-      const v128 c = v128_shuffle_8(x, c_shuff);
-      const v128 d = v128_add_8(
-          c128,
-          v128_unziplo_8(
-              v128_shr_u16(v128_load_unaligned(rec + 1), shift),
-              v128_shr_u16(v128_load_unaligned(rec + 1 + rstride), shift)));
-      const v128 e = v128_add_8(
-          c128,
-          v128_unziplo_8(
-              v128_shr_u16(v128_load_unaligned(rec + 2), shift),
-              v128_shr_u16(v128_load_unaligned(rec + 2 + rstride), shift)));
-      const v128 f = v128_add_8(
-          c128, v128_unziplo_8(
-                    n2, v128_shr_u16(v128_load_unaligned(
-                                         rec + ((y != bottom) + 1) * rstride),
-                                     shift)));
-
-      ssd0 = v128_ssd_u8(ssd0, o, q);
-      ssd1 = v128_ssd_u8(
-          ssd1, o, v128_add_8(q, calc_delta(x, a, b, c, d, e, f, sp, sm)));
-      rec += rstride * 2;
-      org += ostride * 2;
-    }
-  } else if (!(width - x0 - 8)) {  // Clip right
-    const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0f0e0d0c0b0a09LL),
-                                       v64_from_64(0x0707060504030201LL));
-    const v128 e_shuff = v128_from_v64(v64_from_64(0x0f0f0f0e0d0c0b0aLL),
-                                       v64_from_64(0x0707070605040302LL));
-    int y;
-
-    for (y = 0; y < 8; y += 2) {
-      const v128 n1 = v128_shr_u16(v128_load_aligned(rec), shift);
-      const v128 n2 = v128_shr_u16(v128_load_aligned(rec + rstride), shift);
-      const v128 o =
-          v128_unziplo_8(v128_shr_u16(v128_load_aligned(org), shift),
-                         v128_shr_u16(v128_load_aligned(org + ostride), shift));
-      const v128 q = v128_unziplo_8(n1, n2);
-      const v128 x = v128_add_8(c128, q);
-      const v128 a = v128_add_8(
-          c128, v128_unziplo_8(
-                    v128_shr_u16(v128_load_aligned(rec - (y != -y0) * rstride),
-                                 shift),
-                    n1));
-      const v128 b = v128_add_8(
-          c128,
-          v128_unziplo_8(
-              v128_shr_u16(v128_load_unaligned(rec - 2), shift),
-              v128_shr_u16(v128_load_unaligned(rec - 2 + rstride), shift)));
-      const v128 c = v128_add_8(
-          c128,
-          v128_unziplo_8(
-              v128_shr_u16(v128_load_unaligned(rec - 1), shift),
-              v128_shr_u16(v128_load_unaligned(rec - 1 + rstride), shift)));
-      const v128 d = v128_shuffle_8(x, d_shuff);
-      const v128 e = v128_shuffle_8(x, e_shuff);
-      const v128 f = v128_add_8(
-          c128, v128_unziplo_8(
-                    n2, v128_shr_u16(v128_load_unaligned(
-                                         rec + ((y != bottom) + 1) * rstride),
-                                     shift)));
-
-      ssd0 = v128_ssd_u8(ssd0, o, q);
-      ssd1 = v128_ssd_u8(
-          ssd1, o, v128_add_8(q, calc_delta(x, a, b, c, d, e, f, sp, sm)));
-      rec += rstride * 2;
-      org += ostride * 2;
-    }
-  } else {  // No left/right clipping
-    int y;
-    for (y = 0; y < 8; y += 2) {
-      const v128 n1 = v128_shr_u16(v128_load_aligned(rec), shift);
-      const v128 n2 = v128_shr_u16(v128_load_aligned(rec + rstride), shift);
-      const v128 o =
-          v128_unziplo_8(v128_shr_u16(v128_load_aligned(org), shift),
-                         v128_shr_u16(v128_load_aligned(org + ostride), shift));
-      const v128 q = v128_unziplo_8(n1, n2);
-      const v128 x = v128_add_8(c128, q);
-      const v128 a = v128_add_8(
-          c128, v128_unziplo_8(
-                    v128_shr_u16(v128_load_aligned(rec - (y != -y0) * rstride),
-                                 shift),
-                    n1));
-      const v128 b = v128_add_8(
-          c128,
-          v128_unziplo_8(
-              v128_shr_u16(v128_load_unaligned(rec - 2), shift),
-              v128_shr_u16(v128_load_unaligned(rec - 2 + rstride), shift)));
-      const v128 c = v128_add_8(
-          c128,
-          v128_unziplo_8(
-              v128_shr_u16(v128_load_unaligned(rec - 1), shift),
-              v128_shr_u16(v128_load_unaligned(rec - 1 + rstride), shift)));
-      const v128 d = v128_add_8(
-          c128,
-          v128_unziplo_8(
-              v128_shr_u16(v128_load_unaligned(rec + 1), shift),
-              v128_shr_u16(v128_load_unaligned(rec + 1 + rstride), shift)));
-      const v128 e = v128_add_8(
-          c128,
-          v128_unziplo_8(
-              v128_shr_u16(v128_load_unaligned(rec + 2), shift),
-              v128_shr_u16(v128_load_unaligned(rec + 2 + rstride), shift)));
-      const v128 f = v128_add_8(
-          c128, v128_unziplo_8(
-                    n2, v128_shr_u16(v128_load_unaligned(
-                                         rec + ((y != bottom) + 1) * rstride),
-                                     shift)));
-      ssd0 = v128_ssd_u8(ssd0, o, q);
-      ssd1 = v128_ssd_u8(
-          ssd1, o, v128_add_8(q, calc_delta(x, a, b, c, d, e, f, sp, sm)));
-      rec += rstride * 2;
-      org += ostride * 2;
-    }
-  }
+
+  for (y = 0; y < 8; y += 2) {
+    v128 a, b, c, d, e, f, o, r;
+    read_two_lines_hbd(rec, org, rstride, ostride, x0, y0, bottom, right, y, &o,
+                       &r, &a, &b, &c, &d, &e, &f, shift);
+    ssd0 = v128_ssd_u8(ssd0, o, r);
+    ssd1 = v128_ssd_u8(ssd1, o, calc_delta(r, a, b, c, d, e, f, sp, sm));
+    rec += rstride * 2;
+    org += ostride * 2;
+  }
   *sum0 += v128_ssd_u8_sum(ssd0);
   *sum1 += v128_ssd_u8_sum(ssd1);
 }
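read_two_lines_hbd() lets the strength search share the 8-bit kernel with high-bitdepth input: each 16-bit sample is shifted down by bit_depth - 8 and the low bytes are zipped together, so the decision is taken on 8-bit-precision data. The per-row idea in scalar form (pack_row_to_8bit is an illustrative name):

    #include <stdint.h>
    #include <stdio.h>

    static void pack_row_to_8bit(const uint16_t *src, uint8_t *dst, int n,
                                 int shift) {
      int i;
      for (i = 0; i < n; i++) dst[i] = (uint8_t)(src[i] >> shift);
    }

    int main(void) {
      const uint16_t hbd[4] = { 0, 256, 512, 1023 }; /* 10-bit samples */
      uint8_t lbd[4];
      pack_row_to_8bit(hbd, lbd, 4, 2); /* shift = 10 - 8 */
      printf("%d %d %d %d\n", lbd[0], lbd[1], lbd[2], lbd[3]); /* 0 64 128 255 */
      return 0;
    }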
@@ -510,159 +251,33 @@ void SIMD_FUNC(aom_clpf_detect_multi_hbd)(const uint16_t *rec,
                                           const uint16_t *org, int rstride,
                                           int ostride, int x0, int y0,
                                           int width, int height, int *sum,
-                                          int shift) {
-  const v128 c128 = v128_dup_8(128);
-  const v128 cp1 = v128_dup_8(1);
-  const v128 cm1 = v128_dup_8(-1);
-  const v128 cp2 = v128_dup_8(2);
-  const v128 cm2 = v128_dup_8(-2);
-  const v128 cp4 = v128_dup_8(4);
-  const v128 cm4 = v128_dup_8(-4);
+                                          int shift, int size) {
   const int bottom = height - 2 - y0;
+  const int right = width - 8 - x0;
   ssd128_internal ssd0 = v128_ssd_u8_init();
   ssd128_internal ssd1 = v128_ssd_u8_init();
   ssd128_internal ssd2 = v128_ssd_u8_init();
   ssd128_internal ssd3 = v128_ssd_u8_init();
+  int y;
+
+  if (size != 8) {  // Fallback to plain C
+    aom_clpf_detect_multi_hbd_c(rec, org, rstride, ostride, x0, y0, width,
+                                height, sum, shift, size);
+    return;
+  }
+
   rec += x0 + y0 * rstride;
   org += x0 + y0 * ostride;

-  if (!x0) {  // Clip left
-    const v128 b_shuff = v128_from_v64(v64_from_64(0x0d0c0b0a09080808LL),
-                                       v64_from_64(0x0504030201000000LL));
-    const v128 c_shuff = v128_from_v64(v64_from_64(0x0e0d0c0b0a090808LL),
-                                       v64_from_64(0x0605040302010000LL));
-    int y;
-
-    for (y = 0; y < 8; y += 2) {
-      const v128 n1 = v128_shr_u16(v128_load_aligned(rec), shift);
-      const v128 n2 = v128_shr_u16(v128_load_aligned(rec + rstride), shift);
-      const v128 o =
-          v128_unziplo_8(v128_shr_u16(v128_load_aligned(org), shift),
-                         v128_shr_u16(v128_load_aligned(org + ostride), shift));
-      const v128 q = v128_unziplo_8(n1, n2);
-      const v128 x = v128_add_8(c128, q);
-      const v128 a = v128_add_8(
-          c128, v128_unziplo_8(
-                    v128_shr_u16(v128_load_aligned(rec - (y != -y0) * rstride),
-                                 shift),
-                    n1));
-      const v128 b = v128_shuffle_8(x, b_shuff);
-      const v128 c = v128_shuffle_8(x, c_shuff);
-      const v128 d = v128_add_8(
-          c128,
-          v128_unziplo_8(
-              v128_shr_u16(v128_load_unaligned(rec + 1), shift),
-              v128_shr_u16(v128_load_unaligned(rec + 1 + rstride), shift)));
-      const v128 e = v128_add_8(
-          c128,
-          v128_unziplo_8(
-              v128_shr_u16(v128_load_unaligned(rec + 2), shift),
-              v128_shr_u16(v128_load_unaligned(rec + 2 + rstride), shift)));
-      const v128 f = v128_add_8(
-          c128, v128_unziplo_8(
-                    n2, v128_shr_u16(v128_load_unaligned(
-                                         rec + ((y != bottom) + 1) * rstride),
-                                     shift)));
-
-      ssd0 = v128_ssd_u8(ssd0, o, q);
-      calc_delta_multi(x, q, o, a, b, c, d, e, f, cp1, cm1, cp2, cm2, cp4, cm4,
-                       &ssd1, &ssd2, &ssd3);
-      rec += 2 * rstride;
-      org += 2 * ostride;
-    }
-  } else if (!(width - x0 - 8)) {  // Clip right
-    const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0f0e0d0c0b0a09LL),
-                                       v64_from_64(0x0707060504030201LL));
-    const v128 e_shuff = v128_from_v64(v64_from_64(0x0f0f0f0e0d0c0b0aLL),
-                                       v64_from_64(0x0707070605040302LL));
-    int y;
-
-    for (y = 0; y < 8; y += 2) {
-      const v128 n1 = v128_shr_u16(v128_load_aligned(rec), shift);
-      const v128 n2 = v128_shr_u16(v128_load_aligned(rec + rstride), shift);
-      const v128 o =
-          v128_unziplo_8(v128_shr_u16(v128_load_aligned(org), shift),
-                         v128_shr_u16(v128_load_aligned(org + ostride), shift));
-      const v128 q = v128_unziplo_8(n1, n2);
-      const v128 x = v128_add_8(c128, q);
-      const v128 a = v128_add_8(
-          c128, v128_unziplo_8(
-                    v128_shr_u16(v128_load_aligned(rec - (y != -y0) * rstride),
-                                 shift),
-                    n1));
-      const v128 b = v128_add_8(
-          c128,
-          v128_unziplo_8(
-              v128_shr_u16(v128_load_unaligned(rec - 2), shift),
-              v128_shr_u16(v128_load_unaligned(rec - 2 + rstride), shift)));
-      const v128 c = v128_add_8(
-          c128,
-          v128_unziplo_8(
-              v128_shr_u16(v128_load_unaligned(rec - 1), shift),
-              v128_shr_u16(v128_load_unaligned(rec - 1 + rstride), shift)));
-      const v128 d = v128_shuffle_8(x, d_shuff);
-      const v128 e = v128_shuffle_8(x, e_shuff);
-      const v128 f = v128_add_8(
-          c128, v128_unziplo_8(
-                    n2, v128_shr_u16(v128_load_unaligned(
-                                         rec + ((y != bottom) + 1) * rstride),
-                                     shift)));
-
-      ssd0 = v128_ssd_u8(ssd0, o, q);
-      calc_delta_multi(x, q, o, a, b, c, d, e, f, cp1, cm1, cp2, cm2, cp4, cm4,
-                       &ssd1, &ssd2, &ssd3);
-      rec += 2 * rstride;
-      org += 2 * ostride;
-    }
-  } else {  // No left/right clipping
-    int y;
-    for (y = 0; y < 8; y += 2) {
-      const v128 n1 = v128_shr_u16(v128_load_aligned(rec), shift);
-      const v128 n2 = v128_shr_u16(v128_load_aligned(rec + rstride), shift);
-      const v128 o =
-          v128_unziplo_8(v128_shr_u16(v128_load_aligned(org), shift),
-                         v128_shr_u16(v128_load_aligned(org + ostride), shift));
-      const v128 q = v128_unziplo_8(n1, n2);
-      const v128 x = v128_add_8(c128, q);
-      const v128 a = v128_add_8(
-          c128, v128_unziplo_8(
-                    v128_shr_u16(v128_load_aligned(rec - (y != -y0) * rstride),
-                                 shift),
-                    n1));
-      const v128 b = v128_add_8(
-          c128,
-          v128_unziplo_8(
-              v128_shr_u16(v128_load_unaligned(rec - 2), shift),
-              v128_shr_u16(v128_load_unaligned(rec - 2 + rstride), shift)));
-      const v128 c = v128_add_8(
-          c128,
-          v128_unziplo_8(
-              v128_shr_u16(v128_load_unaligned(rec - 1), shift),
-              v128_shr_u16(v128_load_unaligned(rec - 1 + rstride), shift)));
-      const v128 d = v128_add_8(
-          c128,
-          v128_unziplo_8(
-              v128_shr_u16(v128_load_unaligned(rec + 1), shift),
-              v128_shr_u16(v128_load_unaligned(rec + 1 + rstride), shift)));
-      const v128 e = v128_add_8(
-          c128,
-          v128_unziplo_8(
-              v128_shr_u16(v128_load_unaligned(rec + 2), shift),
-              v128_shr_u16(v128_load_unaligned(rec + 2 + rstride), shift)));
-      const v128 f = v128_add_8(
-          c128, v128_unziplo_8(
-                    n2, v128_shr_u16(v128_load_unaligned(
-                                         rec + ((y != bottom) + 1) * rstride),
-                                     shift)));
-
-      ssd0 = v128_ssd_u8(ssd0, o, q);
-      calc_delta_multi(x, q, o, a, b, c, d, e, f, cp1, cm1, cp2, cm2, cp4, cm4,
-                       &ssd1, &ssd2, &ssd3);
-      rec += 2 * rstride;
-      org += 2 * ostride;
-    }
-  }
+
+  for (y = 0; y < 8; y += 2) {
+    v128 a, b, c, d, e, f, o, r;
+    read_two_lines_hbd(rec, org, rstride, ostride, x0, y0, bottom, right, y, &o,
+                       &r, &a, &b, &c, &d, &e, &f, shift);
+    ssd0 = v128_ssd_u8(ssd0, o, r);
+    calc_delta_multi(r, o, a, b, c, d, e, f, &ssd1, &ssd2, &ssd3);
+    rec += 2 * rstride;
+    org += 2 * ostride;
+  }
   sum[0] += v128_ssd_u8_sum(ssd0);
   sum[1] += v128_ssd_u8_sum(ssd1);
   sum[2] += v128_ssd_u8_sum(ssd2);
@@ -16,6 +16,7 @@

 #include "av1/common/alloccommon.h"
 #if CONFIG_CLPF
+#include "aom/aom_image.h"
 #include "av1/common/clpf.h"
 #include "av1/encoder/clpf_rdo.h"
 #endif
@@ -3422,7 +3423,7 @@ static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) {
 #endif
   }
 #if CONFIG_CLPF
-  cm->clpf_strength = 0;
+  cm->clpf_strength_y = cm->clpf_strength_u = cm->clpf_strength_v = 0;
   cm->clpf_size = 2;
   CHECK_MEM_ERROR(
       cm, cm->clpf_blocks,
@@ -3430,21 +3431,37 @@ static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) {
                       ((cm->frame_to_show->y_crop_height + 31) & ~31) >>
                           10));
   if (!is_lossless_requested(&cpi->oxcf)) {
+    const YV12_BUFFER_CONFIG *const frame = cm->frame_to_show;
+
     // Find the best strength and block size for the entire frame
-    int fb_size_log2, strength;
-    av1_clpf_test_frame(cm->frame_to_show, cpi->Source, cm, &strength,
-                        &fb_size_log2);
+    int fb_size_log2, strength_y, strength_u, strength_v;
+    av1_clpf_test_frame(frame, cpi->Source, cm, &strength_y, &fb_size_log2,
+                        AOM_PLANE_Y);
+    av1_clpf_test_frame(frame, cpi->Source, cm, &strength_u, &fb_size_log2,
+                        AOM_PLANE_U);
+    av1_clpf_test_frame(frame, cpi->Source, cm, &strength_v, &fb_size_log2,
+                        AOM_PLANE_V);
+
     if (!fb_size_log2) fb_size_log2 = get_msb(MAX_FB_SIZE);

-    if (strength) {
+    if (strength_y) {
       // Apply the filter using the chosen strength
-      cm->clpf_strength = strength - (strength == 4);
+      cm->clpf_strength_y = strength_y - (strength_y == 4);
       cm->clpf_size =
           fb_size_log2 ? fb_size_log2 - get_msb(MAX_FB_SIZE) + 3 : 0;
       cm->clpf_numblocks = av1_clpf_frame(
-          cm->frame_to_show, cpi->Source, cm, !!cm->clpf_size, strength,
-          4 + cm->clpf_size, cm->clpf_blocks, av1_clpf_decision);
+          frame, cpi->Source, cm, !!cm->clpf_size, strength_y,
+          4 + cm->clpf_size, cm->clpf_blocks, AOM_PLANE_Y, av1_clpf_decision);
+    }
+    if (strength_u) {
+      cm->clpf_strength_u = strength_u - (strength_u == 4);
+      av1_clpf_frame(frame, NULL, cm, 0, strength_u, 4, NULL, AOM_PLANE_U,
+                     NULL);
+    }
+    if (strength_v) {
+      cm->clpf_strength_v = strength_v - (strength_v == 4);
+      av1_clpf_frame(frame, NULL, cm, 0, strength_v, 4, NULL, AOM_PLANE_V,
+                     NULL);
     }
   }
 #endif
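The expression strength - (strength == 4) used above packs the allowed strengths {0, 1, 2, 4} into the 2-bit codes {0, 1, 2, 3}. A round-trip sketch of both mappings (code_to_strength is the assumed inverse, not taken from this patch):

    #include <stdio.h>

    static int strength_to_code(int s) { return s - (s == 4); }
    static int code_to_strength(int c) { return c + (c == 3); }

    int main(void) {
      int s;
      for (s = 0; s <= 4; s++)
        if (s != 3) /* 3 is not a valid strength */
          printf("strength %d -> code %d -> %d\n", s, strength_to_code(s),
                 code_to_strength(strength_to_code(s)));
      return 0;
    }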
@@ -147,6 +147,8 @@ void test_clpf(int w, int h, int depth, int iterations,
         << "strength: " << (1 << strength) << std::endl
         << "xpos: " << xpos << std::endl
         << "ypos: " << ypos << std::endl
+        << "w: " << w << std::endl
+        << "h: " << h << std::endl
         << "A=" << (pos > size ? (int16_t)s[pos - size] : -1) << std::endl
         << "B=" << (pos % size - 2 >= 0 ? (int16_t)s[pos - 2] : -1) << std::endl
         << "C=" << (pos % size - 1 >= 0 ? (int16_t)s[pos - 1] : -1) << std::endl