Clean up and speed up CLPF clipping

* Move clipping tests from inside to outside the loops
* Let the sizex and sizey arguments to clpf_block() be the clipped
  block size rather than both just bs
* Make the fallback tests to C more accurate

Change-Id: Icdc57540ce21b41a95403fdcc37988a4ebf546c7
Author:    Steinar Midtskogen
Date:      2016-09-26 12:51:25 +02:00
Committer: Yaowu Xu
Parent:    6116141c23
Commit:    e66fc87c46
2 changed files with 116 additions and 79 deletions
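
The first two bullets amount to the pattern below. A minimal sketch (the
filter_pixel helper is hypothetical; AOMMIN is the macro actually used in the
patch): clamp the block dimensions once per block, then run the pixel loops
over the clamped range, so no per-pixel bounds test is needed.

    int sizex, sizey, x, y;
    sizex = AOMMIN(width - xpos, bs);   /* < bs only at the right frame edge  */
    sizey = AOMMIN(height - ypos, bs);  /* < bs only at the bottom frame edge */
    for (y = 0; y < sizey; y++)         /* no "ypos + y < height" test inside */
      for (x = 0; x < sizex; x++)       /* no "xpos + x < width" test inside  */
        filter_pixel(xpos + x, ypos + y);

For example, with bs == 8 and a 1366-pixel-wide frame, a block at xpos == 1360
gets sizex == AOMMIN(1366 - 1360, 8) == 6, and only such edge blocks pay any
clipping cost.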

@@ -153,8 +153,11 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
       // Iterate over all smaller blocks inside the filter block
       for (m = 0; m < ((h + bs - 1) >> bslog); m++) {
         for (n = 0; n < ((w + bs - 1) >> bslog); n++) {
+          int sizex, sizey;
           xpos = xoff + n * bs;
           ypos = yoff + m * bs;
+          sizex = AOMMIN(width - xpos, bs);
+          sizey = AOMMIN(height - ypos, bs);
           if (!cm->mi_grid_visible[(ypos << suby) / MI_SIZE * cm->mi_stride +
                                    (xpos << subx) / MI_SIZE]
                  ->mbmi.skip) {  // Not skip block
@@ -164,30 +167,49 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
 #if CONFIG_AOM_HIGHBITDEPTH
             if (cm->use_highbitdepth) {
               uint16_t *const d = CONVERT_TO_SHORTPTR(cache_dst[cache_idx]);
-              for (c = 0; c < bs; c++) {
-                *(uint64_t *)(d + c * sstride) =
-                    *(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2);
-                if (bs == 8)
-                  *(uint64_t *)(d + c * sstride + 4) =
-                      *(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2 + 8);
+              if (sizex == 8) {
+                for (c = 0; c < sizey; c++) {
+                  *(uint64_t *)(d + c * sstride) =
+                      *(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2);
+                  *(uint64_t *)(d + c * sstride + 4) =
+                      *(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2 + 8);
+                }
+              } else if (sizex == 4) {
+                for (c = 0; c < sizey; c++)
+                  *(uint64_t *)(d + c * sstride) =
+                      *(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2);
+              } else {
+                for (c = 0; c < sizey; c++)
+                  memcpy(d + c * sstride, cache_ptr[cache_idx] + c * bs * 2,
+                         sizex);
               }
             } else {
-              for (c = 0; c < bs; c++)
-                if (bs == 8)
+              if (sizex == 8)
+                for (c = 0; c < sizey; c++)
                   *(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
                       *(uint64_t *)(cache_ptr[cache_idx] + c * bs);
-                else
+              else if (sizex == 4)
+                for (c = 0; c < sizey; c++)
                   *(uint32_t *)(cache_dst[cache_idx] + c * sstride) =
                       *(uint32_t *)(cache_ptr[cache_idx] + c * bs);
+              else
+                for (c = 0; c < sizey; c++)
+                  memcpy(cache_dst[cache_idx] + c * sstride,
+                         cache_ptr[cache_idx] + c * bs, sizex);
             }
 #else
-            for (c = 0; c < bs; c++)
-              if (bs == 8)
+            if (sizex == 8)
+              for (c = 0; c < sizey; c++)
                 *(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
                     *(uint64_t *)(cache_ptr[cache_idx] + c * bs);
-              else
+            else if (sizex == 4)
+              for (c = 0; c < sizey; c++)
                 *(uint32_t *)(cache_dst[cache_idx] + c * sstride) =
                     *(uint32_t *)(cache_ptr[cache_idx] + c * bs);
+            else
+              for (c = 0; c < sizey; c++)
+                memcpy(cache_dst[cache_idx] + c * sstride,
+                       cache_ptr[cache_idx] + c * bs, sizex);
 #endif
           }
 #if CONFIG_AOM_HIGHBITDEPTH
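
The rewritten writeback dispatches on the clipped width once per block rather
than re-testing bs on every row. A simplified model of the low-bit-depth path
(assumption: memcpy with a constant length compiles to a single 64- or 32-bit
store, which is what the casts in the patch do by hand):

    #include <stdint.h>
    #include <string.h>

    static void writeback(uint8_t *dst, int dstride, const uint8_t *cache,
                          int bs, int sizex, int sizey) {
      int c;
      if (sizex == 8)        /* full 8-pixel rows: one 64-bit store per row */
        for (c = 0; c < sizey; c++)
          memcpy(dst + c * dstride, cache + c * bs, 8);
      else if (sizex == 4)   /* full 4-pixel rows: one 32-bit store per row */
        for (c = 0; c < sizey; c++)
          memcpy(dst + c * dstride, cache + c * bs, 4);
      else                   /* clipped widths: copy just the remainder */
        for (c = 0; c < sizey; c++)
          memcpy(dst + c * dstride, cache + c * bs, sizex);
    }

Only blocks clipped at the right frame edge take the variable-length memcpy;
interior blocks keep the fixed wide stores.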
@@ -211,15 +233,15 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
             if (cm->use_highbitdepth) {
               aom_clpf_block_hbd(CONVERT_TO_SHORTPTR(src_buffer),
                                  CONVERT_TO_SHORTPTR(dst_buffer), sstride,
-                                 dstride, xpos, ypos, bs, bs, width, height,
-                                 strength);
+                                 dstride, xpos, ypos, sizex, sizey, width,
+                                 height, strength);
             } else {
               aom_clpf_block(src_buffer, dst_buffer, sstride, dstride, xpos,
-                             ypos, bs, bs, width, height, strength);
+                             ypos, sizex, sizey, width, height, strength);
             }
 #else
             aom_clpf_block(src_buffer, dst_buffer, sstride, dstride, xpos,
-                           ypos, bs, bs, width, height, strength);
+                           ypos, sizex, sizey, width, height, strength);
 #endif
           }
         }

@@ -76,24 +76,27 @@ static void clpf_block8(const uint8_t *src, uint8_t *dst, int sstride,
     v128 o = v128_from_v64(l1, l2);
     const v128 a =
         v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1);
-    v128 b = v128_from_v64(v64_load_unaligned(src - 2 * !!x0),
-                           v64_load_unaligned(src - 2 * !!x0 + sstride));
-    v128 c = v128_from_v64(v64_load_unaligned(src - !!x0),
-                           v64_load_unaligned(src - !!x0 + sstride));
-    v128 d = v128_from_v64(v64_load_unaligned(src + !!right),
-                           v64_load_unaligned(src + !!right + sstride));
-    v128 e = v128_from_v64(v64_load_unaligned(src + 2 * !!right),
-                           v64_load_unaligned(src + 2 * !!right + sstride));
     const v128 f = v128_from_v64(
         l2, v64_load_aligned(src + ((y != bottom) + 1) * sstride));
+    v128 b, c, d, e;
 
-    if (!x0) {  // Left clipping
-      b = v128_shuffle_8(b, v128_load_aligned(b_shuff));
-      c = v128_shuffle_8(c, v128_load_aligned(c_shuff));
+    if (x0) {
+      b = v128_from_v64(v64_load_unaligned(src - 2),
+                        v64_load_unaligned(src - 2 + sstride));
+      c = v128_from_v64(v64_load_unaligned(src - 1),
+                        v64_load_unaligned(src - 1 + sstride));
+    } else {  // Left clipping
+      b = v128_shuffle_8(o, v128_load_aligned(b_shuff));
+      c = v128_shuffle_8(o, v128_load_aligned(c_shuff));
     }
-    if (!right) {  // Right clipping
-      d = v128_shuffle_8(d, v128_load_aligned(d_shuff));
-      e = v128_shuffle_8(e, v128_load_aligned(e_shuff));
+    if (right) {
+      d = v128_from_v64(v64_load_unaligned(src + 1),
+                        v64_load_unaligned(src + 1 + sstride));
+      e = v128_from_v64(v64_load_unaligned(src + 2),
+                        v64_load_unaligned(src + 2 + sstride));
+    } else {  // Right clipping
+      d = v128_shuffle_8(o, v128_load_aligned(d_shuff));
+      e = v128_shuffle_8(o, v128_load_aligned(e_shuff));
     }
 
     o = calc_delta(o, a, b, c, d, e, f, sp, sm);
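
The restructured kernels no longer load b..e speculatively and patch them up
afterwards: each neighbour vector is either loaded unaligned (interior blocks)
or synthesized from the already-loaded row vector o by a byte shuffle (frame
edges). In scalar form, the left-edge shuffle amounts to clamping the
neighbour index at the row start (assumed contents of b_shuff; the actual
tables are defined earlier in the file):

    /* Scalar model of b = v128_shuffle_8(o, b_shuff) for one 8-pixel row:
       the "two to the left" neighbour with the edge pixel replicated. */
    static void left2_clamped(unsigned char out[8], const unsigned char row[8]) {
      int x;
      for (x = 0; x < 8; x++)
        out[x] = row[x - 2 < 0 ? 0 : x - 2];
    }

Shuffling o, which is already in a register, also drops the extra unaligned
load the old code performed just to have something to shuffle.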
@@ -134,31 +137,34 @@ static void clpf_block4(const uint8_t *src, uint8_t *dst, int sstride,
     const uint32_t l5 = u32_load_aligned(src + ((y != bottom) + 3) * sstride);
     v128 o = v128_from_32(l1, l2, l3, l4);
     const v128 a = v128_from_32(l0, l1, l2, l3);
-    v128 b = v128_from_32(u32_load_unaligned(src - 2 * !!x0),
-                          u32_load_unaligned(src + sstride - 2 * !!x0),
-                          u32_load_unaligned(src + 2 * sstride - 2 * !!x0),
-                          u32_load_unaligned(src + 3 * sstride - 2 * !!x0));
-    v128 c = v128_from_32(u32_load_unaligned(src - !!x0),
-                          u32_load_unaligned(src + sstride - !!x0),
-                          u32_load_unaligned(src + 2 * sstride - !!x0),
-                          u32_load_unaligned(src + 3 * sstride - !!x0));
-    v128 d = v128_from_32(u32_load_unaligned(src + !!right),
-                          u32_load_unaligned(src + sstride + !!right),
-                          u32_load_unaligned(src + 2 * sstride + !!right),
-                          u32_load_unaligned(src + 3 * sstride + !!right));
-    v128 e = v128_from_32(u32_load_unaligned(src + 2 * !!right),
-                          u32_load_unaligned(src + sstride + 2 * !!right),
-                          u32_load_unaligned(src + 2 * sstride + 2 * !!right),
-                          u32_load_unaligned(src + 3 * sstride + 2 * !!right));
     const v128 f = v128_from_32(l2, l3, l4, l5);
+    v128 b, c, d, e;
 
-    if (!x0) {  // Left clipping
-      b = v128_shuffle_8(b, v128_load_aligned(b_shuff));
-      c = v128_shuffle_8(c, v128_load_aligned(c_shuff));
+    if (x0) {
+      b = v128_from_32(u32_load_unaligned(src - 2),
+                       u32_load_unaligned(src + sstride - 2),
+                       u32_load_unaligned(src + 2 * sstride - 2),
+                       u32_load_unaligned(src + 3 * sstride - 2));
+      c = v128_from_32(u32_load_unaligned(src - 1),
+                       u32_load_unaligned(src + sstride - 1),
+                       u32_load_unaligned(src + 2 * sstride - 1),
+                       u32_load_unaligned(src + 3 * sstride - 1));
+    } else {  // Left clipping
+      b = v128_shuffle_8(o, v128_load_aligned(b_shuff));
+      c = v128_shuffle_8(o, v128_load_aligned(c_shuff));
     }
-    if (!right) {  // Right clipping
-      d = v128_shuffle_8(d, v128_load_aligned(d_shuff));
-      e = v128_shuffle_8(e, v128_load_aligned(e_shuff));
+    if (right) {
+      d = v128_from_32(u32_load_unaligned(src + 1),
+                       u32_load_unaligned(src + sstride + 1),
+                       u32_load_unaligned(src + 2 * sstride + 1),
+                       u32_load_unaligned(src + 3 * sstride + 1));
+      e = v128_from_32(u32_load_unaligned(src + 2 * !!right),
+                       u32_load_unaligned(src + sstride + 2),
+                       u32_load_unaligned(src + 2 * sstride + 2),
+                       u32_load_unaligned(src + 3 * sstride + 2));
+    } else {  // Right clipping
+      d = v128_shuffle_8(o, v128_load_aligned(d_shuff));
+      e = v128_shuffle_8(o, v128_load_aligned(e_shuff));
    }
 
     o = calc_delta(o, a, b, c, d, e, f, sp, sm);
@@ -176,9 +182,10 @@ void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int sstride,
                                int dstride, int x0, int y0, int sizex,
                                int sizey, int width, int height,
                                unsigned int strength) {
-  if ((sizex != 4 && sizex != 8) || y0 + 4 > height ||
-      (sizey & 3 && sizex == 4) || x0 + 4 > width) {
-    // Fallback to C for odd sizes
+  if ((sizex != 4 && sizex != 8) || ((sizey & 3) && sizex == 4)) {
+    // Fallback to C for odd sizes:
+    // * block widths not 4 or 8
+    // * block heights not a multiple of 4 if the block width is 4
     aom_clpf_block_c(src, dst, sstride, dstride, x0, y0, sizex, sizey, width,
                      height, strength);
   } else {
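
The old border tests (y0 + 4 > height, x0 + 4 > width) are gone because the
caller now passes pre-clipped sizex/sizey and the kernels handle those shapes
directly; what remains is a pure shape restriction. The (sizey & 3) term
follows from the loop structure of clpf_block4 above, which packs four 4-pixel
rows into one 128-bit vector per iteration. A sketch of that loop shape (body
elided):

    for (y = 0; y < sizey; y += 4) {
      /* rows y .. y+3 are filtered as a single v128 */
    }

Hence a 4-wide block whose clipped height is not a multiple of 4 cannot be
expressed in whole iterations and must take the C path.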
@@ -255,24 +262,27 @@ SIMD_INLINE void clpf_block_hbd4(const uint16_t *src, uint16_t *dst,
     v128 o = v128_from_v64(l1, l2);
     const v128 a =
         v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1);
-    v128 b = v128_from_v64(v64_load_unaligned(src - 2 * !!x0),
-                           v64_load_unaligned(src - 2 * !!x0 + sstride));
-    v128 c = v128_from_v64(v64_load_unaligned(src - !!x0),
-                           v64_load_unaligned(src - !!x0 + sstride));
-    v128 d = v128_from_v64(v64_load_unaligned(src + !!right),
-                           v64_load_unaligned(src + !!right + sstride));
-    v128 e = v128_from_v64(v64_load_unaligned(src + 2 * !!right),
-                           v64_load_unaligned(src + 2 * !!right + sstride));
     const v128 f = v128_from_v64(
         l2, v64_load_aligned(src + ((y != bottom) + 1) * sstride));
+    v128 b, c, d, e;
 
-    if (!x0) {  // Left clipping
-      b = v128_shuffle_8(b, v128_load_aligned(b_shuff));
-      c = v128_shuffle_8(c, v128_load_aligned(c_shuff));
+    if (x0) {
+      b = v128_from_v64(v64_load_unaligned(src - 2),
+                        v64_load_unaligned(src - 2 + sstride));
+      c = v128_from_v64(v64_load_unaligned(src - 1),
+                        v64_load_unaligned(src - 1 + sstride));
+    } else {  // Left clipping
+      b = v128_shuffle_8(o, v128_load_aligned(b_shuff));
+      c = v128_shuffle_8(o, v128_load_aligned(c_shuff));
     }
-    if (!right) {  // Right clipping
-      d = v128_shuffle_8(d, v128_load_aligned(d_shuff));
-      e = v128_shuffle_8(e, v128_load_aligned(e_shuff));
+    if (right) {
+      d = v128_from_v64(v64_load_unaligned(src + 1),
+                        v64_load_unaligned(src + 1 + sstride));
+      e = v128_from_v64(v64_load_unaligned(src + 2),
+                        v64_load_unaligned(src + 2 + sstride));
+    } else {  // Right clipping
+      d = v128_shuffle_8(o, v128_load_aligned(d_shuff));
+      e = v128_shuffle_8(o, v128_load_aligned(e_shuff));
     }
     calc_delta_hbd4(o, a, b, c, d, e, f, dst, sp, sm, dstride);
     src += sstride * 2;
@@ -309,18 +319,21 @@ SIMD_INLINE void clpf_block_hbd(const uint16_t *src, uint16_t *dst, int sstride,
     const v128 o = v128_load_aligned(src);
     const v128 a = v128_load_aligned(src - (y != -y0) * sstride);
     const v128 f = v128_load_aligned(src + (y - 1 != bottom) * sstride);
-    v128 b = v128_load_unaligned(src - 2 * !!x0);
-    v128 c = v128_load_unaligned(src - !!x0);
-    v128 d = v128_load_unaligned(src + !!right);
-    v128 e = v128_load_unaligned(src + 2 * !!right);
+    v128 b, c, d, e;
 
-    if (!x0) {  // Left clipping
-      b = v128_shuffle_8(b, v128_load_aligned(b_shuff));
-      c = v128_shuffle_8(c, v128_load_aligned(c_shuff));
+    if (x0) {
+      b = v128_load_unaligned(src - 2);
+      c = v128_load_unaligned(src - 1);
+    } else {  // Left clipping
+      b = v128_shuffle_8(o, v128_load_aligned(b_shuff));
+      c = v128_shuffle_8(o, v128_load_aligned(c_shuff));
     }
-    if (!right) {  // Right clipping
-      d = v128_shuffle_8(d, v128_load_aligned(d_shuff));
-      e = v128_shuffle_8(e, v128_load_aligned(e_shuff));
+    if (right) {
+      d = v128_load_unaligned(src + 1);
+      e = v128_load_unaligned(src + 2);
+    } else {  // Right clipping
+      d = v128_shuffle_8(o, v128_load_aligned(d_shuff));
+      e = v128_shuffle_8(o, v128_load_aligned(e_shuff));
     }
     calc_delta_hbd8(o, a, b, c, d, e, f, dst, sp, sm);
     src += sstride;
@@ -332,8 +345,10 @@ void SIMD_FUNC(aom_clpf_block_hbd)(const uint16_t *src, uint16_t *dst,
                                    int sstride, int dstride, int x0, int y0,
                                    int sizex, int sizey, int width, int height,
                                    unsigned int strength) {
-  if ((sizex != 4 && sizex != 8) || y0 + 4 > height || x0 + 4 > width) {
-    // Fallback to C for odd sizes
+  if ((sizex != 4 && sizex != 8) || ((sizey & 1) && sizex == 4)) {
+    // Fallback to C for odd sizes:
+    // * block width not 4 or 8
+    // * block heights not a multiple of 2 if the block width is 4
     aom_clpf_block_hbd_c(src, dst, sstride, dstride, x0, y0, sizex, sizey,
                          width, height, strength);
   } else {
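
The high-bit-depth 4-wide kernel (clpf_block_hbd4 above) consumes two rows per
iteration, so its height restriction is looser: a multiple of 2 instead of 4.
A self-contained check of the two fallback predicates exactly as written in
this patch (helper names are illustrative):

    #include <assert.h>

    static int lbd_fallback(int sizex, int sizey) {
      return (sizex != 4 && sizex != 8) || ((sizey & 3) && sizex == 4);
    }
    static int hbd_fallback(int sizex, int sizey) {
      return (sizex != 4 && sizex != 8) || ((sizey & 1) && sizex == 4);
    }

    int main(void) {
      assert(lbd_fallback(4, 2) && !hbd_fallback(4, 2));  /* 4x2: C vs SIMD */
      assert(!lbd_fallback(8, 2) && !hbd_fallback(8, 2)); /* 8x2: SIMD both */
      assert(lbd_fallback(6, 8) && hbd_fallback(6, 8));   /* width 6: C both */
      return 0;
    }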