Reduce memory footprint for CLPF decoding.
Instead of having CLPF write to an entire new frame and copy the result back into the original frame, make the filter work in-place by keeping a buffer of size frame_width * filter_block_size and delaying the write-back by one filter_block_size row. This reduces the cycles spent in the filter to about 75% of what they were.

Change-Id: I78ca74380c45492daa8935d08d766851edb5fbc1
Committed by: Yaowu Xu
Parent: 34dac00adc
Commit: e8224c7ad5
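For orientation, a minimal, self-contained sketch of the delayed write-back scheme this patch implements. All names here (filter_one_block, filter_frame_inplace, BS) are hypothetical stand-ins, not libaom API; the real code below stages 8x8 blocks the same way via cache, cache_ptr and cache_dst. The sketch assumes width and height are multiples of BS and width / BS <= 1024.

    #include <stdint.h>
    #include <string.h>

    #define BS 8 /* filter block size, 8x8 as in CLPF */

    /* Hypothetical stand-in for the real filter kernel (identity here).
       The real kernel reads pixels above and below the block, which is why
       its output cannot be written straight back into the frame. */
    void filter_one_block(const uint8_t *src, int sstride, uint8_t *dst,
                          int dstride) {
      int x, y;
      for (y = 0; y < BS; y++)
        for (x = 0; x < BS; x++) dst[y * dstride + x] = src[y * sstride + x];
    }

    /* Filter a frame in-place: each block is filtered into a cache slot and
       only copied back when the slot is about to be reused, i.e. one row of
       blocks later, when no later block reads the original pixels any more. */
    void filter_frame_inplace(uint8_t *frame, int stride, int width,
                              int height) {
      const int cache_blocks = width / BS; /* one block row, as in the commit */
      uint8_t cache[1024 * BS * BS];       /* frame_width * filter_block_size */
      uint8_t *cache_dst[1024] = { 0 };    /* write-back address per slot */
      int x, y, r, i, cache_idx = 0;

      for (y = 0; y < height; y += BS) {
        for (x = 0; x < width; x += BS) {
          uint8_t *slot = cache + cache_idx * BS * BS;
          if (cache_dst[cache_idx]) /* slot in use: flush the old block first */
            for (r = 0; r < BS; r++)
              memcpy(cache_dst[cache_idx] + r * stride, slot + r * BS, BS);
          filter_one_block(frame + y * stride + x, stride, slot, BS);
          cache_dst[cache_idx] = frame + y * stride + x;
          if (++cache_idx >= cache_blocks) cache_idx = 0;
        }
      }
      /* Flush the last row of blocks still waiting in the cache. */
      for (i = 0; i < cache_blocks; i++)
        if (cache_dst[i])
          for (r = 0; r < BS; r++)
            memcpy(cache_dst[i] + r * stride, cache + i * BS * BS + r * BS, BS);
    }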
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -587,7 +587,7 @@ add_proto qw/void aom_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint
 specialize qw/aom_lpf_horizontal_4_dual sse2 neon dspr2 msa/;
 
 if (aom_config("CONFIG_CLPF") eq "yes") {
-  add_proto qw/void aom_clpf_block/, "const uint8_t *src, uint8_t *dst, int stride, int x0, int y0, int sizex, int sizey, int width, int height, unsigned int strength";
+  add_proto qw/void aom_clpf_block/, "const uint8_t *src, uint8_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, int width, int height, unsigned int strength";
   specialize qw/aom_clpf_block sse2 ssse3 sse4_1 neon/;
   add_proto qw/void aom_clpf_detect/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength";
   specialize qw/aom_clpf_detect sse2 ssse3 sse4_1 neon/;
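The only functional change in the prototype is that the single stride splits into sstride (source) and dstride (destination), so the destination no longer has to be a full frame. A sketch of the two call shapes this enables, using the names from the diff; tmp is a hypothetical 8x8 scratch block and the offset trick mirrors the cache code added in clpf.c below:

    void clpf_block_examples(const YV12_BUFFER_CONFIG *rec,
                             YV12_BUFFER_CONFIG *dst, int xpos, int ypos,
                             int width, int height, unsigned int strength) {
      const int bs = 8;
      uint8_t tmp[8 * 8];
      /* 1. Frame to frame, as before: both strides are frame strides. */
      aom_clpf_block(rec->y_buffer, dst->y_buffer, rec->y_stride, dst->y_stride,
                     xpos, ypos, bs, bs, width, height, strength);
      /* 2. Frame to scratch: dstride == bs. Offsetting dst by -ypos * bs - xpos
         makes the kernel's write to dst[y * dstride + x] land at
         tmp[(y - ypos) * bs + (x - xpos)], filling tmp compactly. */
      aom_clpf_block(rec->y_buffer, tmp - ypos * bs - xpos, rec->y_stride, bs,
                     xpos, ypos, bs, bs, width, height, strength);
    }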
--- a/av1/common/clpf.c
+++ b/av1/common/clpf.c
@@ -27,30 +27,30 @@ int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int b) {
   return (8 + delta - (delta < 0)) >> 4;
 }
 
-void aom_clpf_block_c(const uint8_t *src, uint8_t *dst, int stride, int x0,
-                      int y0, int sizex, int sizey, int width, int height,
-                      unsigned int strength) {
+void aom_clpf_block_c(const uint8_t *src, uint8_t *dst, int sstride,
+                      int dstride, int x0, int y0, int sizex, int sizey,
+                      int width, int height, unsigned int strength) {
   int x, y;
   for (y = y0; y < y0 + sizey; y++) {
     for (x = x0; x < x0 + sizex; x++) {
-      int X = src[y * stride + x];
-      int A = src[AOMMAX(0, y - 1) * stride + x];
-      int B = src[y * stride + AOMMAX(0, x - 2)];
-      int C = src[y * stride + AOMMAX(0, x - 1)];
-      int D = src[y * stride + AOMMIN(width - 1, x + 1)];
-      int E = src[y * stride + AOMMIN(width - 1, x + 2)];
-      int F = src[AOMMIN(height - 1, y + 1) * stride + x];
+      int X = src[y * sstride + x];
+      int A = src[AOMMAX(0, y - 1) * sstride + x];
+      int B = src[y * sstride + AOMMAX(0, x - 2)];
+      int C = src[y * sstride + AOMMAX(0, x - 1)];
+      int D = src[y * sstride + AOMMIN(width - 1, x + 1)];
+      int E = src[y * sstride + AOMMIN(width - 1, x + 2)];
+      int F = src[AOMMIN(height - 1, y + 1) * sstride + x];
       int delta;
      delta = av1_clpf_sample(X, A, B, C, D, E, F, strength);
-      dst[y * stride + x] = X + delta;
+      dst[y * dstride + x] = X + delta;
     }
   }
 }
 
 // Return number of filtered blocks
-int av1_clpf_frame(const YV12_BUFFER_CONFIG *dst, const YV12_BUFFER_CONFIG *rec,
-                   const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
-                   int enable_fb_flag, unsigned int strength,
+int av1_clpf_frame(const YV12_BUFFER_CONFIG *orig_dst,
+                   const YV12_BUFFER_CONFIG *rec, const YV12_BUFFER_CONFIG *org,
+                   AV1_COMMON *cm, int enable_fb_flag, unsigned int strength,
                    unsigned int fb_size_log2, uint8_t *blocks,
                    int (*decision)(int, int, const YV12_BUFFER_CONFIG *,
                                    const YV12_BUFFER_CONFIG *,
@@ -59,23 +59,45 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *dst, const YV12_BUFFER_CONFIG *rec,
   /* Constrained low-pass filter (CLPF) */
   int c, k, l, m, n;
   const int bs = MI_SIZE;
-  int width = rec->y_crop_width;
-  int height = rec->y_crop_height;
+  const int width = rec->y_crop_width;
+  const int height = rec->y_crop_height;
   int xpos, ypos;
-  int stride_y = rec->y_stride;
-  int num_fb_hor = (width + (1 << fb_size_log2) - 1) >> fb_size_log2;
-  int num_fb_ver = (height + (1 << fb_size_log2) - 1) >> fb_size_log2;
+  const int sstride = rec->y_stride;
+  int dstride = orig_dst->y_stride;
+  const int num_fb_hor = (width + (1 << fb_size_log2) - 1) >> fb_size_log2;
+  const int num_fb_ver = (height + (1 << fb_size_log2) - 1) >> fb_size_log2;
   int block_index = 0;
+  uint8_t *cache = NULL;
+  uint8_t **cache_ptr = NULL;
+  uint8_t **cache_dst = NULL;
+  int cache_idx = 0;
+  const int cache_size = num_fb_hor << (2 * fb_size_log2);
+  const int cache_blocks = cache_size / (bs * bs);
+  YV12_BUFFER_CONFIG dst = *orig_dst;
+
+  // Make buffer space for in-place filtering
+  if (rec->y_buffer == dst.y_buffer) {
+    CHECK_MEM_ERROR(cm, cache, aom_malloc(cache_size));
+    CHECK_MEM_ERROR(cm, cache_ptr,
+                    aom_malloc(cache_blocks * sizeof(*cache_ptr)));
+    CHECK_MEM_ERROR(cm, cache_dst,
+                    aom_malloc(cache_blocks * sizeof(*cache_dst)));
+    memset(cache_ptr, 0, cache_blocks * sizeof(*cache_dst));
+    dst.y_buffer = cache;
+    dstride = bs;
+  }
 
   // Iterate over all filter blocks
   for (k = 0; k < num_fb_ver; k++) {
     for (l = 0; l < num_fb_hor; l++) {
       int h, w;
       int allskip = 1;
+      const int xoff = l << fb_size_log2;
+      const int yoff = k << fb_size_log2;
       for (m = 0; allskip && m < (1 << fb_size_log2) / bs; m++) {
         for (n = 0; allskip && n < (1 << fb_size_log2) / bs; n++) {
-          xpos = (l << fb_size_log2) + n * bs;
-          ypos = (k << fb_size_log2) + m * bs;
+          xpos = xoff + n * bs;
+          ypos = yoff + m * bs;
           if (xpos < width && ypos < height) {
             allskip &=
                 cm->mi_grid_visible[ypos / bs * cm->mi_stride + xpos / bs]
@@ -96,31 +118,57 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *dst, const YV12_BUFFER_CONFIG *rec,
         // Iterate over all smaller blocks inside the filter block
         for (m = 0; m < (h + bs - 1) / bs; m++) {
           for (n = 0; n < (w + bs - 1) / bs; n++) {
-            xpos = (l << fb_size_log2) + n * bs;
-            ypos = (k << fb_size_log2) + m * bs;
+            xpos = xoff + n * bs;
+            ypos = yoff + m * bs;
             if (!cm->mi_grid_visible[ypos / bs * cm->mi_stride + xpos / bs]
-                     ->mbmi.skip) {
-              // Not skip block, apply the filter
-              aom_clpf_block(rec->y_buffer, dst->y_buffer, stride_y, xpos, ypos,
-                             bs, bs, width, height, strength);
+                     ->mbmi.skip) {  // Not skip block
+              // Temporary buffering needed if filtering in-place
+              if (cache) {
+                if (cache_ptr[cache_idx]) {
+                  // Copy filtered block back into the frame
+                  for (c = 0; c < bs; c++)
+                    *(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
+                        *(uint64_t *)(cache_ptr[cache_idx] + c * bs);
+                }
+                cache_ptr[cache_idx] = cache + cache_idx * bs * bs;
+                dst.y_buffer = cache_ptr[cache_idx] - ypos * bs - xpos;
+                cache_dst[cache_idx] = rec->y_buffer + ypos * sstride + xpos;
+                if (++cache_idx >= cache_blocks) cache_idx = 0;
+              }
+
+              // Apply the filter
+              aom_clpf_block(rec->y_buffer, dst.y_buffer, sstride, dstride,
+                             xpos, ypos, bs, bs, width, height, strength);
+
             } else {  // Skip block, copy instead
-              for (c = 0; c < bs; c++)
-                *(uint64_t *)(dst->y_buffer + (ypos + c) * stride_y + xpos) =
-                    *(uint64_t *)(rec->y_buffer + (ypos + c) * stride_y + xpos);
+              if (!cache)
+                for (c = 0; c < bs; c++)
+                  *(uint64_t *)(dst.y_buffer + (ypos + c) * dstride + xpos) = *(
+                      uint64_t *)(rec->y_buffer + (ypos + c) * sstride + xpos);
             }
           }
         }
       } else {  // Entire filter block is skip, copy
-        for (m = 0; m < h; m++)
-          memcpy(dst->y_buffer + ((k << fb_size_log2) + m) * stride_y +
-                     (l << fb_size_log2),
-                 rec->y_buffer + ((k << fb_size_log2) + m) * stride_y +
-                     (l << fb_size_log2),
-                 w);
+        if (!cache)
+          for (m = 0; m < h; m++)
+            memcpy(dst.y_buffer + (yoff + m) * dstride + xoff,
+                   rec->y_buffer + (yoff + m) * sstride + xoff, w);
       }
       block_index += !allskip;  // Count number of blocks filtered
     }
   }
 
+  if (cache) {
+    // Copy remaining blocks into the frame
+    for (cache_idx = 0; cache_idx < cache_blocks && cache_ptr[cache_idx];
+         cache_idx++)
+      for (c = 0; c < bs; c++)
+        *(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
+            *(uint64_t *)(cache_ptr[cache_idx] + c * bs);
+
+    aom_free(cache);
+    aom_free(cache_ptr);
+  }
+
   return block_index;
 }
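A worked example of the cache sizing above, assuming a 1920-pixel-wide frame and 128x128 filter blocks (fb_size_log2 == 7):

    /* num_fb_hor   = (1920 + (1 << 7) - 1) >> 7 = 15
     * cache_size   = 15 << (2 * 7) = 15 * 16384 = 245760 bytes
     *              = frame_width * filter_block_size = 1920 * 128
     * cache_blocks = 245760 / (8 * 8)            = 3840 8x8 slots
     *
     * With dstride == bs, the redirection
     *   dst.y_buffer = cache_ptr[cache_idx] - ypos * bs - xpos;
     * makes the kernel's write to dst.y_buffer[y * bs + x] land at
     * cache_ptr[cache_idx][(y - ypos) * bs + (x - xpos)], i.e. each block
     * is stored compactly at the start of its cache slot. */

So the cache holds exactly one filter-block row, which is what delays the write-back by one filter_block_size row as the commit message says.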
--- a/av1/common/clpf.h
+++ b/av1/common/clpf.h
@@ -18,7 +18,7 @@
 int av1_clpf_maxbits(const AV1_COMMON *cm);
 int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int b);
 int av1_clpf_frame(const YV12_BUFFER_CONFIG *dst, const YV12_BUFFER_CONFIG *rec,
-                   const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
+                   const YV12_BUFFER_CONFIG *org, AV1_COMMON *cm,
                    int enable_fb_flag, unsigned int strength,
                    unsigned int fb_size_log2, uint8_t *blocks,
                    int (*decision)(int, int, const YV12_BUFFER_CONFIG *,
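Note why cm loses its const qualifier here: the in-place path in av1_clpf_frame now allocates its cache through CHECK_MEM_ERROR(cm, ...), which must be able to record an allocation failure in the error state carried inside cm.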
--- a/av1/common/clpf_simd.h
+++ b/av1/common/clpf_simd.h
@@ -11,11 +11,11 @@
 
 #include "./aom_dsp_rtcd.h"
 
-static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0,
-                       int y0, int sizey, int width, int height,
-                       unsigned int strength) {
-  dst += x0 + y0 * stride;
-  src += x0 + y0 * stride;
+static void clpf_block(const uint8_t *src, uint8_t *dst, int sstride,
+                       int dstride, int x0, int y0, int sizey, int width,
+                       int height, unsigned int strength) {
+  dst += x0 + y0 * dstride;
+  src += x0 + y0 * sstride;
   {
     int bottom = height - 2 - y0;
     const v128 sp = v128_dup_8(strength);
@@ -32,23 +32,23 @@ static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0,
 
       for (y = 0; y < sizey; y += 2) {
         const v64 l1 = v64_load_aligned(src);
-        const v64 l2 = v64_load_aligned(src + stride);
+        const v64 l2 = v64_load_aligned(src + sstride);
         v128 o = v128_from_v64(l1, l2);
         const v128 x = v128_add_8(c128, o);
         const v128 a = v128_add_8(
             c128,
-            v128_from_v64(v64_load_aligned(src - (y != -y0) * stride), l1));
+            v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1));
         const v128 b = v128_shuffle_8(x, b_shuff);
         const v128 c = v128_shuffle_8(x, c_shuff);
         const v128 d = v128_add_8(
             c128, v128_from_v64(v64_load_unaligned(src + 1),
-                                v64_load_unaligned(src + 1 + stride)));
+                                v64_load_unaligned(src + 1 + sstride)));
         const v128 e = v128_add_8(
             c128, v128_from_v64(v64_load_unaligned(src + 2),
-                                v64_load_unaligned(src + 2 + stride)));
+                                v64_load_unaligned(src + 2 + sstride)));
         const v128 f = v128_add_8(
             c128, v128_from_v64(l2, v64_load_aligned(
-                                        src + ((y != bottom) + 1) * stride)));
+                                        src + ((y != bottom) + 1) * sstride)));
 
         const v128 tmp =
             v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm),
@@ -70,9 +70,9 @@ static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0,
                                                         delta, v128_zero()))),
                 4));
         v64_store_aligned(dst, v128_high_v64(o));
-        v64_store_aligned(dst + stride, v128_low_v64(o));
-        src += stride * 2;
-        dst += stride * 2;
+        v64_store_aligned(dst + dstride, v128_low_v64(o));
+        src += sstride * 2;
+        dst += dstride * 2;
       }
     } else if (!(width - x0 - 8)) {  // Clip right
       const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0f0e0d0c0b0a09LL),
@@ -83,23 +83,23 @@ static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0,
 
       for (y = 0; y < sizey; y += 2) {
         const v64 l1 = v64_load_aligned(src);
-        const v64 l2 = v64_load_aligned(src + stride);
+        const v64 l2 = v64_load_aligned(src + sstride);
         v128 o = v128_from_v64(l1, l2);
         const v128 x = v128_add_8(c128, o);
         const v128 a = v128_add_8(
             c128,
-            v128_from_v64(v64_load_aligned(src - (y != -y0) * stride), l1));
+            v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1));
         const v128 b = v128_add_8(
             c128, v128_from_v64(v64_load_unaligned(src - 2),
-                                v64_load_unaligned(src - 2 + stride)));
+                                v64_load_unaligned(src - 2 + sstride)));
         const v128 c = v128_add_8(
             c128, v128_from_v64(v64_load_unaligned(src - 1),
-                                v64_load_unaligned(src - 1 + stride)));
+                                v64_load_unaligned(src - 1 + sstride)));
         const v128 d = v128_shuffle_8(x, d_shuff);
         const v128 e = v128_shuffle_8(x, e_shuff);
         const v128 f = v128_add_8(
             c128, v128_from_v64(l2, v64_load_aligned(
-                                        src + ((y != bottom) + 1) * stride)));
+                                        src + ((y != bottom) + 1) * sstride)));
 
         const v128 tmp =
             v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm),
@@ -121,35 +121,35 @@ static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0,
                                                         delta, v128_zero()))),
                 4));
         v64_store_aligned(dst, v128_high_v64(o));
-        v64_store_aligned(dst + stride, v128_low_v64(o));
-        src += stride * 2;
-        dst += stride * 2;
+        v64_store_aligned(dst + dstride, v128_low_v64(o));
+        src += sstride * 2;
+        dst += dstride * 2;
       }
     } else {  // No left/right clipping
       int y;
       for (y = 0; y < sizey; y += 2) {
         const v64 l1 = v64_load_aligned(src);
-        const v64 l2 = v64_load_aligned(src + stride);
+        const v64 l2 = v64_load_aligned(src + sstride);
         v128 o = v128_from_v64(l1, l2);
         const v128 x = v128_add_8(c128, o);
         const v128 a = v128_add_8(
             c128,
-            v128_from_v64(v64_load_aligned(src - (y != -y0) * stride), l1));
+            v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1));
         const v128 b = v128_add_8(
             c128, v128_from_v64(v64_load_unaligned(src - 2),
-                                v64_load_unaligned(src - 2 + stride)));
+                                v64_load_unaligned(src - 2 + sstride)));
         const v128 c = v128_add_8(
             c128, v128_from_v64(v64_load_unaligned(src - 1),
-                                v64_load_unaligned(src - 1 + stride)));
+                                v64_load_unaligned(src - 1 + sstride)));
         const v128 d = v128_add_8(
             c128, v128_from_v64(v64_load_unaligned(src + 1),
-                                v64_load_unaligned(src + 1 + stride)));
+                                v64_load_unaligned(src + 1 + sstride)));
         const v128 e = v128_add_8(
             c128, v128_from_v64(v64_load_unaligned(src + 2),
-                                v64_load_unaligned(src + 2 + stride)));
+                                v64_load_unaligned(src + 2 + sstride)));
         const v128 f = v128_add_8(
             c128, v128_from_v64(l2, v64_load_aligned(
-                                        src + ((y != bottom) + 1) * stride)));
+                                        src + ((y != bottom) + 1) * sstride)));
 
         const v128 tmp =
             v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm),
@@ -171,17 +171,18 @@ static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0,
                                                         delta, v128_zero()))),
                 4));
         v64_store_aligned(dst, v128_high_v64(o));
-        v64_store_aligned(dst + stride, v128_low_v64(o));
-        src += stride * 2;
-        dst += stride * 2;
+        v64_store_aligned(dst + dstride, v128_low_v64(o));
+        src += sstride * 2;
+        dst += dstride * 2;
       }
     }
   }
 }
 
-void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int stride,
-                               int x0, int y0, int sizex, int sizey, int width,
-                               int height, unsigned int strength) {
+void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int sstride,
+                               int dstride, int x0, int y0, int sizex,
+                               int sizey, int width, int height,
+                               unsigned int strength) {
   // TODO(stemidts):
   // A sizex different from 8 will only be needed if CLPF is extended to chroma.
   // This will only be used if 4:2:0 and width not a multiple of 16 and along
@@ -189,9 +190,10 @@ void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int stride,
   // this case. If not extended to chroma, this test will be redundant.
   if (sizex != 8 || width < 16 || y0 + 8 > height || x0 + 8 > width) {
     // Fallback to C for odd sizes
-    aom_clpf_block_c(src, dst, stride, x0, y0, sizex, sizey, width, height,
-                     strength);
+    aom_clpf_block_c(src, dst, sstride, dstride, x0, y0, sizex, sizey, width,
+                     height, strength);
   } else {
-    clpf_block(src, dst, stride, x0, y0, sizey, width, height, strength);
+    clpf_block(src, dst, sstride, dstride, x0, y0, sizey, width, height,
+               strength);
   }
 }
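The SIMD kernel processes two rows per iteration (one v128 holds two 8-pixel rows), so the stride split shows up in three places per loop: the paired loads step by sstride, the paired stores step by dstride, and the row advance becomes src += sstride * 2; dst += dstride * 2;. The filter arithmetic itself is unchanged.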
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -3929,19 +3929,10 @@ void av1_decode_frame(AV1Decoder *pbi, const uint8_t *data,
 
 #if CONFIG_CLPF
   if (cm->clpf_strength && !cm->skip_loop_filter) {
-    YV12_BUFFER_CONFIG dst;  // Buffer for the result
-
-    dst = pbi->cur_buf->buf;
-    CHECK_MEM_ERROR(cm, dst.y_buffer, aom_malloc(dst.y_stride * dst.y_height));
-
-    av1_clpf_frame(&dst, &pbi->cur_buf->buf, 0, cm, !!cm->clpf_size,
+    const YV12_BUFFER_CONFIG *const frame = &pbi->cur_buf->buf;
+    av1_clpf_frame(frame, frame, 0, cm, !!cm->clpf_size,
                    cm->clpf_strength + (cm->clpf_strength == 3),
                    4 + cm->clpf_size, cm->clpf_blocks, clpf_bit);
-
-    // Copy result
-    memcpy(pbi->cur_buf->buf.y_buffer, dst.y_buffer,
-           dst.y_height * dst.y_stride);
-    aom_free(dst.y_buffer);
   }
   if (cm->clpf_blocks) aom_free(cm->clpf_blocks);
 #endif
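With the frame passed as both destination and source, av1_clpf_frame detects in-place operation by pointer equality (rec->y_buffer == dst.y_buffer in clpf.c above) and takes the cache path; a caller that passes distinct buffers keeps the old direct-write behaviour. A sketch of both call shapes, where tmp is a hypothetical separate output frame and the remaining arguments follow the decoder call above:

    /* In-place, as in the decoder above: the cache path is taken. */
    av1_clpf_frame(frame, frame, 0, cm, !!cm->clpf_size,
                   cm->clpf_strength + (cm->clpf_strength == 3),
                   4 + cm->clpf_size, cm->clpf_blocks, clpf_bit);

    /* Out-of-place: rec->y_buffer != dst->y_buffer, so no cache is
       allocated and the filter writes straight into tmp. */
    av1_clpf_frame(&tmp, frame, 0, cm, !!cm->clpf_size,
                   cm->clpf_strength + (cm->clpf_strength == 3),
                   4 + cm->clpf_size, cm->clpf_blocks, clpf_bit);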
--- a/test/clpf_test.cc
+++ b/test/clpf_test.cc
@@ -26,9 +26,9 @@ using libaom_test::ACMRandom;
 
 namespace {
 
-typedef void (*clpf_block_t)(const uint8_t *src, uint8_t *dst, int stride,
-                             int x0, int y0, int sizex, int sizey, int width,
-                             int height, unsigned int strength);
+typedef void (*clpf_block_t)(const uint8_t *src, uint8_t *dst, int sstride,
+                             int dstride, int x0, int y0, int sizex, int sizey,
+                             int width, int height, unsigned int strength);
 
 typedef std::tr1::tuple<clpf_block_t, clpf_block_t, int, int>
     clpf_block_param_t;
@@ -85,10 +85,10 @@ TEST_P(ClpfBlockTest, TestSIMDNoMismatch) {
   for (ypos = 0; ypos < size && !error; ypos += h * !error) {
     for (xpos = 0; xpos < size && !error; xpos += w * !error) {
       for (strength = 0; strength < 3 && !error; strength += !error) {
-        ref_clpf(s, ref_d, size, xpos, ypos, w, h, size, size,
+        ref_clpf(s, ref_d, size, size, xpos, ypos, w, h, size, size,
                  1 << strength);
-        ASM_REGISTER_STATE_CHECK(
-            clpf(s, d, size, xpos, ypos, w, h, size, size, 1 << strength));
+        ASM_REGISTER_STATE_CHECK(clpf(s, d, size, size, xpos, ypos, w, h,
+                                      size, size, 1 << strength));
 
         for (pos = 0; pos < size * size && !error; pos++) {
           error = ref_d[pos] != d[pos];
@@ -137,7 +137,8 @@ TEST_P(ClpfSpeedTest, TestSpeed) {
   for (ypos = 0; ypos < size; ypos += h) {
     for (xpos = 0; xpos < size; xpos += w) {
       for (strength = 0; strength < 3; strength++) {
-        ref_clpf(s, d, size, xpos, ypos, w, h, size, size, 1 << strength);
+        ref_clpf(s, d, size, size, xpos, ypos, w, h, size, size,
+                 1 << strength);
       }
     }
   }
@@ -150,7 +151,7 @@ TEST_P(ClpfSpeedTest, TestSpeed) {
   for (ypos = 0; ypos < size; ypos += h) {
     for (xpos = 0; xpos < size; xpos += w) {
      for (strength = 0; strength < 3; strength++) {
-        clpf(s, d, size, xpos, ypos, w, h, size, size, 1 << strength);
+        clpf(s, d, size, size, xpos, ypos, w, h, size, size, 1 << strength);
       }
     }
   }