Clean up and speed up CLPF clipping
* Move clipping tests from inside the loops to outside them * Let the sizex and sizey arguments to clpf_block() be the clipped block size rather than both just bs * Make the fallback tests to C more accurate Change-Id: Icdc57540ce21b41a95403fdcc37988a4ebf546c7
This commit is contained in:
committed by
Yaowu Xu
parent
6116141c23
commit
e66fc87c46
@@ -153,8 +153,11 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
|
|||||||
// Iterate over all smaller blocks inside the filter block
|
// Iterate over all smaller blocks inside the filter block
|
||||||
for (m = 0; m < ((h + bs - 1) >> bslog); m++) {
|
for (m = 0; m < ((h + bs - 1) >> bslog); m++) {
|
||||||
for (n = 0; n < ((w + bs - 1) >> bslog); n++) {
|
for (n = 0; n < ((w + bs - 1) >> bslog); n++) {
|
||||||
|
int sizex, sizey;
|
||||||
xpos = xoff + n * bs;
|
xpos = xoff + n * bs;
|
||||||
ypos = yoff + m * bs;
|
ypos = yoff + m * bs;
|
||||||
|
sizex = AOMMIN(width - xpos, bs);
|
||||||
|
sizey = AOMMIN(height - ypos, bs);
|
||||||
if (!cm->mi_grid_visible[(ypos << suby) / MI_SIZE * cm->mi_stride +
|
if (!cm->mi_grid_visible[(ypos << suby) / MI_SIZE * cm->mi_stride +
|
||||||
(xpos << subx) / MI_SIZE]
|
(xpos << subx) / MI_SIZE]
|
||||||
->mbmi.skip) { // Not skip block
|
->mbmi.skip) { // Not skip block
|
||||||
@@ -164,30 +167,49 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
|
|||||||
#if CONFIG_AOM_HIGHBITDEPTH
|
#if CONFIG_AOM_HIGHBITDEPTH
|
||||||
if (cm->use_highbitdepth) {
|
if (cm->use_highbitdepth) {
|
||||||
uint16_t *const d = CONVERT_TO_SHORTPTR(cache_dst[cache_idx]);
|
uint16_t *const d = CONVERT_TO_SHORTPTR(cache_dst[cache_idx]);
|
||||||
for (c = 0; c < bs; c++) {
|
if (sizex == 8) {
|
||||||
*(uint64_t *)(d + c * sstride) =
|
for (c = 0; c < sizey; c++) {
|
||||||
*(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2);
|
*(uint64_t *)(d + c * sstride) =
|
||||||
if (bs == 8)
|
*(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2);
|
||||||
*(uint64_t *)(d + c * sstride + 4) =
|
*(uint64_t *)(d + c * sstride + 4) =
|
||||||
*(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2 + 8);
|
*(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2 + 8);
|
||||||
|
}
|
||||||
|
} else if (sizex == 4) {
|
||||||
|
for (c = 0; c < sizey; c++)
|
||||||
|
*(uint64_t *)(d + c * sstride) =
|
||||||
|
*(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2);
|
||||||
|
} else {
|
||||||
|
for (c = 0; c < sizey; c++)
|
||||||
|
memcpy(d + c * sstride, cache_ptr[cache_idx] + c * bs * 2,
|
||||||
|
sizex);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for (c = 0; c < bs; c++)
|
if (sizex == 8)
|
||||||
if (bs == 8)
|
for (c = 0; c < sizey; c++)
|
||||||
*(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
|
*(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
|
||||||
*(uint64_t *)(cache_ptr[cache_idx] + c * bs);
|
*(uint64_t *)(cache_ptr[cache_idx] + c * bs);
|
||||||
else
|
else if (sizex == 4)
|
||||||
|
for (c = 0; c < sizey; c++)
|
||||||
*(uint32_t *)(cache_dst[cache_idx] + c * sstride) =
|
*(uint32_t *)(cache_dst[cache_idx] + c * sstride) =
|
||||||
*(uint32_t *)(cache_ptr[cache_idx] + c * bs);
|
*(uint32_t *)(cache_ptr[cache_idx] + c * bs);
|
||||||
|
else
|
||||||
|
for (c = 0; c < sizey; c++)
|
||||||
|
memcpy(cache_dst[cache_idx] + c * sstride,
|
||||||
|
cache_ptr[cache_idx] + c * bs, sizex);
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
for (c = 0; c < bs; c++)
|
if (sizex == 8)
|
||||||
if (bs == 8)
|
for (c = 0; c < sizey; c++)
|
||||||
*(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
|
*(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
|
||||||
*(uint64_t *)(cache_ptr[cache_idx] + c * bs);
|
*(uint64_t *)(cache_ptr[cache_idx] + c * bs);
|
||||||
else
|
else if (sizex == 4)
|
||||||
|
for (c = 0; c < sizey; c++)
|
||||||
*(uint32_t *)(cache_dst[cache_idx] + c * sstride) =
|
*(uint32_t *)(cache_dst[cache_idx] + c * sstride) =
|
||||||
*(uint32_t *)(cache_ptr[cache_idx] + c * bs);
|
*(uint32_t *)(cache_ptr[cache_idx] + c * bs);
|
||||||
|
else
|
||||||
|
for (c = 0; c < sizey; c++)
|
||||||
|
memcpy(cache_dst[cache_idx] + c * sstride,
|
||||||
|
cache_ptr[cache_idx] + c * bs, sizex);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
#if CONFIG_AOM_HIGHBITDEPTH
|
#if CONFIG_AOM_HIGHBITDEPTH
|
||||||
@@ -211,15 +233,15 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
|
|||||||
if (cm->use_highbitdepth) {
|
if (cm->use_highbitdepth) {
|
||||||
aom_clpf_block_hbd(CONVERT_TO_SHORTPTR(src_buffer),
|
aom_clpf_block_hbd(CONVERT_TO_SHORTPTR(src_buffer),
|
||||||
CONVERT_TO_SHORTPTR(dst_buffer), sstride,
|
CONVERT_TO_SHORTPTR(dst_buffer), sstride,
|
||||||
dstride, xpos, ypos, bs, bs, width, height,
|
dstride, xpos, ypos, sizex, sizey, width,
|
||||||
strength);
|
height, strength);
|
||||||
} else {
|
} else {
|
||||||
aom_clpf_block(src_buffer, dst_buffer, sstride, dstride, xpos,
|
aom_clpf_block(src_buffer, dst_buffer, sstride, dstride, xpos,
|
||||||
ypos, bs, bs, width, height, strength);
|
ypos, sizex, sizey, width, height, strength);
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
aom_clpf_block(src_buffer, dst_buffer, sstride, dstride, xpos,
|
aom_clpf_block(src_buffer, dst_buffer, sstride, dstride, xpos,
|
||||||
ypos, bs, bs, width, height, strength);
|
ypos, sizex, sizey, width, height, strength);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -76,24 +76,27 @@ static void clpf_block8(const uint8_t *src, uint8_t *dst, int sstride,
|
|||||||
v128 o = v128_from_v64(l1, l2);
|
v128 o = v128_from_v64(l1, l2);
|
||||||
const v128 a =
|
const v128 a =
|
||||||
v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1);
|
v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1);
|
||||||
v128 b = v128_from_v64(v64_load_unaligned(src - 2 * !!x0),
|
|
||||||
v64_load_unaligned(src - 2 * !!x0 + sstride));
|
|
||||||
v128 c = v128_from_v64(v64_load_unaligned(src - !!x0),
|
|
||||||
v64_load_unaligned(src - !!x0 + sstride));
|
|
||||||
v128 d = v128_from_v64(v64_load_unaligned(src + !!right),
|
|
||||||
v64_load_unaligned(src + !!right + sstride));
|
|
||||||
v128 e = v128_from_v64(v64_load_unaligned(src + 2 * !!right),
|
|
||||||
v64_load_unaligned(src + 2 * !!right + sstride));
|
|
||||||
const v128 f = v128_from_v64(
|
const v128 f = v128_from_v64(
|
||||||
l2, v64_load_aligned(src + ((y != bottom) + 1) * sstride));
|
l2, v64_load_aligned(src + ((y != bottom) + 1) * sstride));
|
||||||
|
v128 b, c, d, e;
|
||||||
|
|
||||||
if (!x0) { // Left clipping
|
if (x0) {
|
||||||
b = v128_shuffle_8(b, v128_load_aligned(b_shuff));
|
b = v128_from_v64(v64_load_unaligned(src - 2),
|
||||||
c = v128_shuffle_8(c, v128_load_aligned(c_shuff));
|
v64_load_unaligned(src - 2 + sstride));
|
||||||
|
c = v128_from_v64(v64_load_unaligned(src - 1),
|
||||||
|
v64_load_unaligned(src - 1 + sstride));
|
||||||
|
} else { // Left clipping
|
||||||
|
b = v128_shuffle_8(o, v128_load_aligned(b_shuff));
|
||||||
|
c = v128_shuffle_8(o, v128_load_aligned(c_shuff));
|
||||||
}
|
}
|
||||||
if (!right) { // Right clipping
|
if (right) {
|
||||||
d = v128_shuffle_8(d, v128_load_aligned(d_shuff));
|
d = v128_from_v64(v64_load_unaligned(src + 1),
|
||||||
e = v128_shuffle_8(e, v128_load_aligned(e_shuff));
|
v64_load_unaligned(src + 1 + sstride));
|
||||||
|
e = v128_from_v64(v64_load_unaligned(src + 2),
|
||||||
|
v64_load_unaligned(src + 2 + sstride));
|
||||||
|
} else { // Right clipping
|
||||||
|
d = v128_shuffle_8(o, v128_load_aligned(d_shuff));
|
||||||
|
e = v128_shuffle_8(o, v128_load_aligned(e_shuff));
|
||||||
}
|
}
|
||||||
|
|
||||||
o = calc_delta(o, a, b, c, d, e, f, sp, sm);
|
o = calc_delta(o, a, b, c, d, e, f, sp, sm);
|
||||||
@@ -134,31 +137,34 @@ static void clpf_block4(const uint8_t *src, uint8_t *dst, int sstride,
|
|||||||
const uint32_t l5 = u32_load_aligned(src + ((y != bottom) + 3) * sstride);
|
const uint32_t l5 = u32_load_aligned(src + ((y != bottom) + 3) * sstride);
|
||||||
v128 o = v128_from_32(l1, l2, l3, l4);
|
v128 o = v128_from_32(l1, l2, l3, l4);
|
||||||
const v128 a = v128_from_32(l0, l1, l2, l3);
|
const v128 a = v128_from_32(l0, l1, l2, l3);
|
||||||
v128 b = v128_from_32(u32_load_unaligned(src - 2 * !!x0),
|
|
||||||
u32_load_unaligned(src + sstride - 2 * !!x0),
|
|
||||||
u32_load_unaligned(src + 2 * sstride - 2 * !!x0),
|
|
||||||
u32_load_unaligned(src + 3 * sstride - 2 * !!x0));
|
|
||||||
v128 c = v128_from_32(u32_load_unaligned(src - !!x0),
|
|
||||||
u32_load_unaligned(src + sstride - !!x0),
|
|
||||||
u32_load_unaligned(src + 2 * sstride - !!x0),
|
|
||||||
u32_load_unaligned(src + 3 * sstride - !!x0));
|
|
||||||
v128 d = v128_from_32(u32_load_unaligned(src + !!right),
|
|
||||||
u32_load_unaligned(src + sstride + !!right),
|
|
||||||
u32_load_unaligned(src + 2 * sstride + !!right),
|
|
||||||
u32_load_unaligned(src + 3 * sstride + !!right));
|
|
||||||
v128 e = v128_from_32(u32_load_unaligned(src + 2 * !!right),
|
|
||||||
u32_load_unaligned(src + sstride + 2 * !!right),
|
|
||||||
u32_load_unaligned(src + 2 * sstride + 2 * !!right),
|
|
||||||
u32_load_unaligned(src + 3 * sstride + 2 * !!right));
|
|
||||||
const v128 f = v128_from_32(l2, l3, l4, l5);
|
const v128 f = v128_from_32(l2, l3, l4, l5);
|
||||||
|
v128 b, c, d, e;
|
||||||
|
|
||||||
if (!x0) { // Left clipping
|
if (x0) {
|
||||||
b = v128_shuffle_8(b, v128_load_aligned(b_shuff));
|
b = v128_from_32(u32_load_unaligned(src - 2),
|
||||||
c = v128_shuffle_8(c, v128_load_aligned(c_shuff));
|
u32_load_unaligned(src + sstride - 2),
|
||||||
|
u32_load_unaligned(src + 2 * sstride - 2),
|
||||||
|
u32_load_unaligned(src + 3 * sstride - 2));
|
||||||
|
c = v128_from_32(u32_load_unaligned(src - 1),
|
||||||
|
u32_load_unaligned(src + sstride - 1),
|
||||||
|
u32_load_unaligned(src + 2 * sstride - 1),
|
||||||
|
u32_load_unaligned(src + 3 * sstride - 1));
|
||||||
|
} else { // Left clipping
|
||||||
|
b = v128_shuffle_8(o, v128_load_aligned(b_shuff));
|
||||||
|
c = v128_shuffle_8(o, v128_load_aligned(c_shuff));
|
||||||
}
|
}
|
||||||
if (!right) { // Right clipping
|
if (right) {
|
||||||
d = v128_shuffle_8(d, v128_load_aligned(d_shuff));
|
d = v128_from_32(u32_load_unaligned(src + 1),
|
||||||
e = v128_shuffle_8(e, v128_load_aligned(e_shuff));
|
u32_load_unaligned(src + sstride + 1),
|
||||||
|
u32_load_unaligned(src + 2 * sstride + 1),
|
||||||
|
u32_load_unaligned(src + 3 * sstride + 1));
|
||||||
|
e = v128_from_32(u32_load_unaligned(src + 2 * !!right),
|
||||||
|
u32_load_unaligned(src + sstride + 2),
|
||||||
|
u32_load_unaligned(src + 2 * sstride + 2),
|
||||||
|
u32_load_unaligned(src + 3 * sstride + 2));
|
||||||
|
} else { // Right clipping
|
||||||
|
d = v128_shuffle_8(o, v128_load_aligned(d_shuff));
|
||||||
|
e = v128_shuffle_8(o, v128_load_aligned(e_shuff));
|
||||||
}
|
}
|
||||||
|
|
||||||
o = calc_delta(o, a, b, c, d, e, f, sp, sm);
|
o = calc_delta(o, a, b, c, d, e, f, sp, sm);
|
||||||
@@ -176,9 +182,10 @@ void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int sstride,
|
|||||||
int dstride, int x0, int y0, int sizex,
|
int dstride, int x0, int y0, int sizex,
|
||||||
int sizey, int width, int height,
|
int sizey, int width, int height,
|
||||||
unsigned int strength) {
|
unsigned int strength) {
|
||||||
if ((sizex != 4 && sizex != 8) || y0 + 4 > height ||
|
if ((sizex != 4 && sizex != 8) || ((sizey & 3) && sizex == 4)) {
|
||||||
(sizey & 3 && sizex == 4) || x0 + 4 > width) {
|
// Fallback to C for odd sizes:
|
||||||
// Fallback to C for odd sizes
|
// * block widths not 4 or 8
|
||||||
|
// * block heights not a multiple of 4 if the block width is 4
|
||||||
aom_clpf_block_c(src, dst, sstride, dstride, x0, y0, sizex, sizey, width,
|
aom_clpf_block_c(src, dst, sstride, dstride, x0, y0, sizex, sizey, width,
|
||||||
height, strength);
|
height, strength);
|
||||||
} else {
|
} else {
|
||||||
@@ -255,24 +262,27 @@ SIMD_INLINE void clpf_block_hbd4(const uint16_t *src, uint16_t *dst,
|
|||||||
v128 o = v128_from_v64(l1, l2);
|
v128 o = v128_from_v64(l1, l2);
|
||||||
const v128 a =
|
const v128 a =
|
||||||
v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1);
|
v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1);
|
||||||
v128 b = v128_from_v64(v64_load_unaligned(src - 2 * !!x0),
|
|
||||||
v64_load_unaligned(src - 2 * !!x0 + sstride));
|
|
||||||
v128 c = v128_from_v64(v64_load_unaligned(src - !!x0),
|
|
||||||
v64_load_unaligned(src - !!x0 + sstride));
|
|
||||||
v128 d = v128_from_v64(v64_load_unaligned(src + !!right),
|
|
||||||
v64_load_unaligned(src + !!right + sstride));
|
|
||||||
v128 e = v128_from_v64(v64_load_unaligned(src + 2 * !!right),
|
|
||||||
v64_load_unaligned(src + 2 * !!right + sstride));
|
|
||||||
const v128 f = v128_from_v64(
|
const v128 f = v128_from_v64(
|
||||||
l2, v64_load_aligned(src + ((y != bottom) + 1) * sstride));
|
l2, v64_load_aligned(src + ((y != bottom) + 1) * sstride));
|
||||||
|
v128 b, c, d, e;
|
||||||
|
|
||||||
if (!x0) { // Left clipping
|
if (x0) {
|
||||||
b = v128_shuffle_8(b, v128_load_aligned(b_shuff));
|
b = v128_from_v64(v64_load_unaligned(src - 2),
|
||||||
c = v128_shuffle_8(c, v128_load_aligned(c_shuff));
|
v64_load_unaligned(src - 2 + sstride));
|
||||||
|
c = v128_from_v64(v64_load_unaligned(src - 1),
|
||||||
|
v64_load_unaligned(src - 1 + sstride));
|
||||||
|
} else { // Left clipping
|
||||||
|
b = v128_shuffle_8(o, v128_load_aligned(b_shuff));
|
||||||
|
c = v128_shuffle_8(o, v128_load_aligned(c_shuff));
|
||||||
}
|
}
|
||||||
if (!right) { // Right clipping
|
if (right) {
|
||||||
d = v128_shuffle_8(d, v128_load_aligned(d_shuff));
|
d = v128_from_v64(v64_load_unaligned(src + 1),
|
||||||
e = v128_shuffle_8(e, v128_load_aligned(e_shuff));
|
v64_load_unaligned(src + 1 + sstride));
|
||||||
|
e = v128_from_v64(v64_load_unaligned(src + 2),
|
||||||
|
v64_load_unaligned(src + 2 + sstride));
|
||||||
|
} else { // Right clipping
|
||||||
|
d = v128_shuffle_8(o, v128_load_aligned(d_shuff));
|
||||||
|
e = v128_shuffle_8(o, v128_load_aligned(e_shuff));
|
||||||
}
|
}
|
||||||
calc_delta_hbd4(o, a, b, c, d, e, f, dst, sp, sm, dstride);
|
calc_delta_hbd4(o, a, b, c, d, e, f, dst, sp, sm, dstride);
|
||||||
src += sstride * 2;
|
src += sstride * 2;
|
||||||
@@ -309,18 +319,21 @@ SIMD_INLINE void clpf_block_hbd(const uint16_t *src, uint16_t *dst, int sstride,
|
|||||||
const v128 o = v128_load_aligned(src);
|
const v128 o = v128_load_aligned(src);
|
||||||
const v128 a = v128_load_aligned(src - (y != -y0) * sstride);
|
const v128 a = v128_load_aligned(src - (y != -y0) * sstride);
|
||||||
const v128 f = v128_load_aligned(src + (y - 1 != bottom) * sstride);
|
const v128 f = v128_load_aligned(src + (y - 1 != bottom) * sstride);
|
||||||
v128 b = v128_load_unaligned(src - 2 * !!x0);
|
v128 b, c, d, e;
|
||||||
v128 c = v128_load_unaligned(src - !!x0);
|
|
||||||
v128 d = v128_load_unaligned(src + !!right);
|
|
||||||
v128 e = v128_load_unaligned(src + 2 * !!right);
|
|
||||||
|
|
||||||
if (!x0) { // Left clipping
|
if (x0) {
|
||||||
b = v128_shuffle_8(b, v128_load_aligned(b_shuff));
|
b = v128_load_unaligned(src - 2);
|
||||||
c = v128_shuffle_8(c, v128_load_aligned(c_shuff));
|
c = v128_load_unaligned(src - 1);
|
||||||
|
} else { // Left clipping
|
||||||
|
b = v128_shuffle_8(o, v128_load_aligned(b_shuff));
|
||||||
|
c = v128_shuffle_8(o, v128_load_aligned(c_shuff));
|
||||||
}
|
}
|
||||||
if (!right) { // Right clipping
|
if (right) {
|
||||||
d = v128_shuffle_8(d, v128_load_aligned(d_shuff));
|
d = v128_load_unaligned(src + 1);
|
||||||
e = v128_shuffle_8(e, v128_load_aligned(e_shuff));
|
e = v128_load_unaligned(src + 2);
|
||||||
|
} else { // Right clipping
|
||||||
|
d = v128_shuffle_8(o, v128_load_aligned(d_shuff));
|
||||||
|
e = v128_shuffle_8(o, v128_load_aligned(e_shuff));
|
||||||
}
|
}
|
||||||
calc_delta_hbd8(o, a, b, c, d, e, f, dst, sp, sm);
|
calc_delta_hbd8(o, a, b, c, d, e, f, dst, sp, sm);
|
||||||
src += sstride;
|
src += sstride;
|
||||||
@@ -332,8 +345,10 @@ void SIMD_FUNC(aom_clpf_block_hbd)(const uint16_t *src, uint16_t *dst,
|
|||||||
int sstride, int dstride, int x0, int y0,
|
int sstride, int dstride, int x0, int y0,
|
||||||
int sizex, int sizey, int width, int height,
|
int sizex, int sizey, int width, int height,
|
||||||
unsigned int strength) {
|
unsigned int strength) {
|
||||||
if ((sizex != 4 && sizex != 8) || y0 + 4 > height || x0 + 4 > width) {
|
if ((sizex != 4 && sizex != 8) || ((sizey & 1) && sizex == 4)) {
|
||||||
// Fallback to C for odd sizes
|
// Fallback to C for odd sizes:
|
||||||
|
// * block width not 4 or 8
|
||||||
|
// * block heights not a multiple of 2 if the block width is 4
|
||||||
aom_clpf_block_hbd_c(src, dst, sstride, dstride, x0, y0, sizex, sizey,
|
aom_clpf_block_hbd_c(src, dst, sstride, dstride, x0, y0, sizex, sizey,
|
||||||
width, height, strength);
|
width, height, strength);
|
||||||
} else {
|
} else {
|
||||||
|
|||||||
Reference in New Issue
Block a user