Added high bit-depth support in CLPF.

Change-Id: Ic5eadb323227a820ad876c32d4dc296e05db6ece
This commit is contained in:
Steinar Midtskogen
2016-09-09 15:23:35 +02:00
committed by Yaowu Xu
parent 9351b2f792
commit 3dbd55a6c4
6 changed files with 1038 additions and 479 deletions

View File

@@ -587,6 +587,14 @@ add_proto qw/void aom_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint
specialize qw/aom_lpf_horizontal_4_dual sse2 neon dspr2 msa/;
if (aom_config("CONFIG_CLPF") eq "yes") {
if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
add_proto qw/void aom_clpf_block_hbd/, "const uint16_t *src, uint16_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, int width, int height, unsigned int strength";
specialize qw/aom_clpf_block_hbd sse2 ssse3 sse4_1 neon/;
add_proto qw/void aom_clpf_detect_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int shift";
specialize qw/aom_clpf_detect_hbd sse2 ssse3 sse4_1 neon/;
add_proto qw/void aom_clpf_detect_multi_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int shift";
specialize qw/aom_clpf_detect_multi_hbd sse2 ssse3 sse4_1 neon/;
}
add_proto qw/void aom_clpf_block/, "const uint8_t *src, uint8_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, int width, int height, unsigned int strength";
specialize qw/aom_clpf_block sse2 ssse3 sse4_1 neon/;
add_proto qw/void aom_clpf_detect/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength";

View File

@@ -8,6 +8,7 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <assert.h>
#include "av1/common/clpf.h"
#include "./aom_dsp_rtcd.h"
#include "aom_dsp/aom_dsp_common.h"
@@ -47,6 +48,29 @@ void aom_clpf_block_c(const uint8_t *src, uint8_t *dst, int sstride,
}
}
#if CONFIG_AOM_HIGHBITDEPTH
// Identical to aom_clpf_block_c() apart from "src" and "dst" being 16 bit.
void aom_clpf_block_hbd_c(const uint16_t *src, uint16_t *dst, int sstride,
                          int dstride, int x0, int y0, int sizex, int sizey,
                          int width, int height, unsigned int strength) {
  int x, y;
  // Filter the sizex * sizey block at (x0, y0).  All neighbour reads are
  // clamped to the frame so border pixels reuse the nearest valid sample.
  for (y = y0; y < y0 + sizey; y++) {
    const uint16_t *const curr = src + y * sstride;
    const uint16_t *const above = src + AOMMAX(0, y - 1) * sstride;
    const uint16_t *const below = src + AOMMIN(height - 1, y + 1) * sstride;
    for (x = x0; x < x0 + sizex; x++) {
      const int X = curr[x];                         // centre sample
      const int A = above[x];                        // one row up
      const int B = curr[AOMMAX(0, x - 2)];          // two columns left
      const int C = curr[AOMMAX(0, x - 1)];          // one column left
      const int D = curr[AOMMIN(width - 1, x + 1)];  // one column right
      const int E = curr[AOMMIN(width - 1, x + 2)];  // two columns right
      const int F = below[x];                        // one row down
      dst[y * dstride + x] = X + av1_clpf_sample(X, A, B, C, D, E, F, strength);
    }
  }
}
#endif
// Return number of filtered blocks
int av1_clpf_frame(const YV12_BUFFER_CONFIG *orig_dst,
const YV12_BUFFER_CONFIG *rec, const YV12_BUFFER_CONFIG *org,
@@ -75,15 +99,27 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *orig_dst,
const int cache_blocks = cache_size / (bs * bs);
YV12_BUFFER_CONFIG dst = *orig_dst;
assert(bs == 8); // Optimised code assumes this.
#if CONFIG_AOM_HIGHBITDEPTH
strength <<= (cm->bit_depth - 8);
#endif
// Make buffer space for in-place filtering
if (rec->y_buffer == dst.y_buffer) {
#if CONFIG_AOM_HIGHBITDEPTH
CHECK_MEM_ERROR(cm, cache,
aom_malloc(cache_size << !!cm->use_highbitdepth));
dst.y_buffer = cm->use_highbitdepth ? CONVERT_TO_BYTEPTR(cache) : cache;
#else
CHECK_MEM_ERROR(cm, cache, aom_malloc(cache_size));
dst.y_buffer = cache;
#endif
CHECK_MEM_ERROR(cm, cache_ptr,
aom_malloc(cache_blocks * sizeof(*cache_ptr)));
CHECK_MEM_ERROR(cm, cache_dst,
aom_malloc(cache_blocks * sizeof(*cache_dst)));
memset(cache_ptr, 0, cache_blocks * sizeof(*cache_dst));
dst.y_buffer = cache;
dstride = bs;
}
@@ -125,34 +161,108 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *orig_dst,
// Temporary buffering needed if filtering in-place
if (cache) {
if (cache_ptr[cache_idx]) {
// Copy filtered block back into the frame
// Copy filtered block back into the frame
#if CONFIG_AOM_HIGHBITDEPTH
if (cm->use_highbitdepth) {
uint16_t *const d =
CONVERT_TO_SHORTPTR(cache_dst[cache_idx]);
for (c = 0; c < bs; c++) {
*(uint64_t *)(d + c * sstride) =
*(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2);
*(uint64_t *)(d + c * sstride + 4) =
*(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2 + 8);
}
} else {
for (c = 0; c < bs; c++)
*(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
*(uint64_t *)(cache_ptr[cache_idx] + c * bs);
}
#else
for (c = 0; c < bs; c++)
*(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
*(uint64_t *)(cache_ptr[cache_idx] + c * bs);
#endif
}
#if CONFIG_AOM_HIGHBITDEPTH
if (cm->use_highbitdepth) {
cache_ptr[cache_idx] = cache + cache_idx * bs * bs * 2;
dst.y_buffer = CONVERT_TO_BYTEPTR(cache_ptr[cache_idx]) -
ypos * bs - xpos;
} else {
cache_ptr[cache_idx] = cache + cache_idx * bs * bs;
dst.y_buffer = cache_ptr[cache_idx] - ypos * bs - xpos;
}
#else
cache_ptr[cache_idx] = cache + cache_idx * bs * bs;
dst.y_buffer = cache_ptr[cache_idx] - ypos * bs - xpos;
#endif
cache_dst[cache_idx] = rec->y_buffer + ypos * sstride + xpos;
if (++cache_idx >= cache_blocks) cache_idx = 0;
}
// Apply the filter
// Apply the filter
#if CONFIG_AOM_HIGHBITDEPTH
if (cm->use_highbitdepth) {
aom_clpf_block_hbd(CONVERT_TO_SHORTPTR(rec->y_buffer),
CONVERT_TO_SHORTPTR(dst.y_buffer), sstride,
dstride, xpos, ypos, bs, bs, width, height,
strength);
} else {
aom_clpf_block(rec->y_buffer, dst.y_buffer, sstride, dstride,
xpos, ypos, bs, bs, width, height, strength);
}
#else
aom_clpf_block(rec->y_buffer, dst.y_buffer, sstride, dstride,
xpos, ypos, bs, bs, width, height, strength);
#endif
} else { // Skip block, copy instead
if (!cache)
if (!cache) {
#if CONFIG_AOM_HIGHBITDEPTH
if (cm->use_highbitdepth) {
uint16_t *const d = CONVERT_TO_SHORTPTR(dst.y_buffer);
const uint16_t *const s = CONVERT_TO_SHORTPTR(rec->y_buffer);
for (c = 0; c < bs; c++) {
*(uint64_t *)(d + (ypos + c) * dstride + xpos) =
*(uint64_t *)(s + (ypos + c) * sstride + xpos);
*(uint64_t *)(d + (ypos + c) * dstride + xpos + 4) =
*(uint64_t *)(s + (ypos + c) * sstride + xpos + 4);
}
} else {
for (c = 0; c < bs; c++)
*(uint64_t *)(dst.y_buffer + (ypos + c) * dstride + xpos) =
*(uint64_t *)(rec->y_buffer + (ypos + c) * sstride +
xpos);
}
#else
for (c = 0; c < bs; c++)
*(uint64_t *)(dst.y_buffer + (ypos + c) * dstride + xpos) = *(
uint64_t *)(rec->y_buffer + (ypos + c) * sstride + xpos);
#endif
}
}
}
}
} else { // Entire filter block is skip, copy
if (!cache)
if (!cache) {
#if CONFIG_AOM_HIGHBITDEPTH
if (cm->use_highbitdepth) {
for (m = 0; m < h; m++)
memcpy(CONVERT_TO_SHORTPTR(dst.y_buffer) + (yoff + m) * dstride +
xoff,
CONVERT_TO_SHORTPTR(rec->y_buffer) + (yoff + m) * sstride +
xoff,
w * 2);
} else {
for (m = 0; m < h; m++)
memcpy(dst.y_buffer + (yoff + m) * dstride + xoff,
rec->y_buffer + (yoff + m) * sstride + xoff, w);
}
#else
for (m = 0; m < h; m++)
memcpy(dst.y_buffer + (yoff + m) * dstride + xoff,
rec->y_buffer + (yoff + m) * sstride + xoff, w);
#endif
}
}
block_index += !allskip; // Count number of blocks filtered
}
@@ -161,10 +271,27 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *orig_dst,
if (cache) {
// Copy remaining blocks into the frame
for (cache_idx = 0; cache_idx < cache_blocks && cache_ptr[cache_idx];
cache_idx++)
cache_idx++) {
#if CONFIG_AOM_HIGHBITDEPTH
if (cm->use_highbitdepth) {
uint16_t *const d = CONVERT_TO_SHORTPTR(cache_dst[cache_idx]);
for (c = 0; c < bs; c++) {
*(uint64_t *)(d + c * sstride) =
*(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2);
*(uint64_t *)(d + c * sstride + 4) =
*(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2 + 8);
}
} else {
for (c = 0; c < bs; c++)
*(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
*(uint64_t *)(cache_ptr[cache_idx] + c * bs);
}
#else
for (c = 0; c < bs; c++)
*(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
*(uint64_t *)(cache_ptr[cache_idx] + c * bs);
#endif
}
aom_free(cache);
aom_free(cache_ptr);

View File

@@ -11,170 +11,129 @@
#include "./aom_dsp_rtcd.h"
// Filter two rows of eight 8-bit pixels packed into one 128-bit vector
// and store the result.
//   o       the two original rows (first row in the high 64 bits)
//   x       o with 128 added to each byte so signed saturating
//           arithmetic can be used on pixel differences
//   a, f    the rows above and below (same bias as x)
//   b, c    the pixels two and one positions to the left
//   d, e    the pixels one and two positions to the right
//   sp, sm  clamp vectors holding +strength and -strength
SIMD_INLINE void calc_delta(v128 o, v128 x, v128 a, v128 b, v128 c, v128 d,
                            v128 e, v128 f, uint8_t *dst, v128 sp, v128 sm,
                            int dstride) {
  const v128 c8 = v128_dup_8(8);
  // Each neighbour difference is clamped to [-strength, strength].
  // Weights: c/d get 3 (tmp added three times), a/f get 4 (shift left
  // by 2) and b/e get 1.
  const v128 tmp =
      v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm),
                 v128_max_s8(v128_min_s8(v128_ssub_s8(d, x), sp), sm));
  const v128 delta = v128_add_8(
      v128_add_8(
          v128_shl_8(
              v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(a, x), sp), sm),
                         v128_max_s8(v128_min_s8(v128_ssub_s8(f, x), sp), sm)),
              2),
          v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(b, x), sp), sm),
                     v128_max_s8(v128_min_s8(v128_ssub_s8(e, x), sp), sm))),
      v128_add_8(v128_add_8(tmp, tmp), tmp));
  // Add (8 + delta - (delta < 0)) >> 4, i.e. delta / 16 rounded to
  // nearest with ties away from zero (v128_cmplt_s8 yields -1 lanes
  // where delta is negative).
  o = v128_add_8(
      o,
      v128_shr_s8(
          v128_add_8(c8, v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()))),
          4));
  v64_store_aligned(dst, v128_high_v64(o));
  v64_store_aligned(dst + dstride, v128_low_v64(o));
}
static void clpf_block(const uint8_t *src, uint8_t *dst, int sstride,
int dstride, int x0, int y0, int sizey, int width,
int height, unsigned int strength) {
int bottom = height - 2 - y0;
const v128 sp = v128_dup_8(strength);
const v128 sm = v128_dup_8(-(int)strength);
const v128 c128 = v128_dup_8(128);
dst += x0 + y0 * dstride;
src += x0 + y0 * sstride;
{
int bottom = height - 2 - y0;
const v128 sp = v128_dup_8(strength);
const v128 sm = v128_dup_8(-(int)strength);
const v128 c8 = v128_dup_8(8);
const v128 c128 = v128_dup_8(128);
if (!x0) { // Clip left
const v128 b_shuff = v128_from_v64(v64_from_64(0x0d0c0b0a09080808LL),
v64_from_64(0x0504030201000000LL));
const v128 c_shuff = v128_from_v64(v64_from_64(0x0e0d0c0b0a090808LL),
v64_from_64(0x0605040302010000LL));
int y;
if (!x0) { // Clip left
const v128 b_shuff = v128_from_v64(v64_from_64(0x0d0c0b0a09080808LL),
v64_from_64(0x0504030201000000LL));
const v128 c_shuff = v128_from_v64(v64_from_64(0x0e0d0c0b0a090808LL),
v64_from_64(0x0605040302010000LL));
int y;
for (y = 0; y < sizey; y += 2) {
const v64 l1 = v64_load_aligned(src);
const v64 l2 = v64_load_aligned(src + sstride);
v128 o = v128_from_v64(l1, l2);
const v128 x = v128_add_8(c128, o);
const v128 a = v128_add_8(
c128,
v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1));
const v128 b = v128_shuffle_8(x, b_shuff);
const v128 c = v128_shuffle_8(x, c_shuff);
const v128 d = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(src + 1),
v64_load_unaligned(src + 1 + sstride)));
const v128 e = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(src + 2),
v64_load_unaligned(src + 2 + sstride)));
const v128 f = v128_add_8(
c128, v128_from_v64(l2, v64_load_aligned(
src + ((y != bottom) + 1) * sstride)));
for (y = 0; y < sizey; y += 2) {
const v64 l1 = v64_load_aligned(src);
const v64 l2 = v64_load_aligned(src + sstride);
v128 o = v128_from_v64(l1, l2);
const v128 x = v128_add_8(c128, o);
const v128 a = v128_add_8(
c128,
v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1));
const v128 b = v128_shuffle_8(x, b_shuff);
const v128 c = v128_shuffle_8(x, c_shuff);
const v128 d = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(src + 1),
v64_load_unaligned(src + 1 + sstride)));
const v128 e = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(src + 2),
v64_load_unaligned(src + 2 + sstride)));
const v128 f = v128_add_8(
c128, v128_from_v64(
l2, v64_load_aligned(src + ((y != bottom) + 1) * sstride)));
calc_delta(o, x, a, b, c, d, e, f, dst, sp, sm, dstride);
src += sstride * 2;
dst += dstride * 2;
}
} else if (!(width - x0 - 8)) { // Clip right
const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0f0e0d0c0b0a09LL),
v64_from_64(0x0707060504030201LL));
const v128 e_shuff = v128_from_v64(v64_from_64(0x0f0f0f0e0d0c0b0aLL),
v64_from_64(0x0707070605040302LL));
int y;
const v128 tmp =
v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm),
v128_max_s8(v128_min_s8(v128_ssub_s8(d, x), sp), sm));
const v128 delta = v128_add_8(
v128_add_8(
v128_shl_8(
v128_add_8(
v128_max_s8(v128_min_s8(v128_ssub_s8(a, x), sp), sm),
v128_max_s8(v128_min_s8(v128_ssub_s8(f, x), sp), sm)),
2),
v128_add_8(
v128_max_s8(v128_min_s8(v128_ssub_s8(b, x), sp), sm),
v128_max_s8(v128_min_s8(v128_ssub_s8(e, x), sp), sm))),
v128_add_8(v128_add_8(tmp, tmp), tmp));
o = v128_add_8(
o, v128_shr_s8(
v128_add_8(c8, v128_add_8(delta, v128_cmplt_s8(
delta, v128_zero()))),
4));
v64_store_aligned(dst, v128_high_v64(o));
v64_store_aligned(dst + dstride, v128_low_v64(o));
src += sstride * 2;
dst += dstride * 2;
}
} else if (!(width - x0 - 8)) { // Clip right
const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0f0e0d0c0b0a09LL),
v64_from_64(0x0707060504030201LL));
const v128 e_shuff = v128_from_v64(v64_from_64(0x0f0f0f0e0d0c0b0aLL),
v64_from_64(0x0707070605040302LL));
int y;
for (y = 0; y < sizey; y += 2) {
const v64 l1 = v64_load_aligned(src);
const v64 l2 = v64_load_aligned(src + sstride);
v128 o = v128_from_v64(l1, l2);
const v128 x = v128_add_8(c128, o);
const v128 a = v128_add_8(
c128,
v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1));
const v128 b = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(src - 2),
v64_load_unaligned(src - 2 + sstride)));
const v128 c = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(src - 1),
v64_load_unaligned(src - 1 + sstride)));
const v128 d = v128_shuffle_8(x, d_shuff);
const v128 e = v128_shuffle_8(x, e_shuff);
const v128 f = v128_add_8(
c128, v128_from_v64(l2, v64_load_aligned(
src + ((y != bottom) + 1) * sstride)));
const v128 tmp =
v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm),
v128_max_s8(v128_min_s8(v128_ssub_s8(d, x), sp), sm));
const v128 delta = v128_add_8(
v128_add_8(
v128_shl_8(
v128_add_8(
v128_max_s8(v128_min_s8(v128_ssub_s8(a, x), sp), sm),
v128_max_s8(v128_min_s8(v128_ssub_s8(f, x), sp), sm)),
2),
v128_add_8(
v128_max_s8(v128_min_s8(v128_ssub_s8(b, x), sp), sm),
v128_max_s8(v128_min_s8(v128_ssub_s8(e, x), sp), sm))),
v128_add_8(v128_add_8(tmp, tmp), tmp));
o = v128_add_8(
o, v128_shr_s8(
v128_add_8(c8, v128_add_8(delta, v128_cmplt_s8(
delta, v128_zero()))),
4));
v64_store_aligned(dst, v128_high_v64(o));
v64_store_aligned(dst + dstride, v128_low_v64(o));
src += sstride * 2;
dst += dstride * 2;
}
} else { // No left/right clipping
int y;
for (y = 0; y < sizey; y += 2) {
const v64 l1 = v64_load_aligned(src);
const v64 l2 = v64_load_aligned(src + sstride);
v128 o = v128_from_v64(l1, l2);
const v128 x = v128_add_8(c128, o);
const v128 a = v128_add_8(
c128,
v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1));
const v128 b = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(src - 2),
v64_load_unaligned(src - 2 + sstride)));
const v128 c = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(src - 1),
v64_load_unaligned(src - 1 + sstride)));
const v128 d = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(src + 1),
v64_load_unaligned(src + 1 + sstride)));
const v128 e = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(src + 2),
v64_load_unaligned(src + 2 + sstride)));
const v128 f = v128_add_8(
c128, v128_from_v64(l2, v64_load_aligned(
src + ((y != bottom) + 1) * sstride)));
const v128 tmp =
v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm),
v128_max_s8(v128_min_s8(v128_ssub_s8(d, x), sp), sm));
const v128 delta = v128_add_8(
v128_add_8(
v128_shl_8(
v128_add_8(
v128_max_s8(v128_min_s8(v128_ssub_s8(a, x), sp), sm),
v128_max_s8(v128_min_s8(v128_ssub_s8(f, x), sp), sm)),
2),
v128_add_8(
v128_max_s8(v128_min_s8(v128_ssub_s8(b, x), sp), sm),
v128_max_s8(v128_min_s8(v128_ssub_s8(e, x), sp), sm))),
v128_add_8(v128_add_8(tmp, tmp), tmp));
o = v128_add_8(
o, v128_shr_s8(
v128_add_8(c8, v128_add_8(delta, v128_cmplt_s8(
delta, v128_zero()))),
4));
v64_store_aligned(dst, v128_high_v64(o));
v64_store_aligned(dst + dstride, v128_low_v64(o));
src += sstride * 2;
dst += dstride * 2;
}
for (y = 0; y < sizey; y += 2) {
const v64 l1 = v64_load_aligned(src);
const v64 l2 = v64_load_aligned(src + sstride);
v128 o = v128_from_v64(l1, l2);
const v128 x = v128_add_8(c128, o);
const v128 a = v128_add_8(
c128,
v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1));
const v128 b = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(src - 2),
v64_load_unaligned(src - 2 + sstride)));
const v128 c = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(src - 1),
v64_load_unaligned(src - 1 + sstride)));
const v128 d = v128_shuffle_8(x, d_shuff);
const v128 e = v128_shuffle_8(x, e_shuff);
const v128 f = v128_add_8(
c128, v128_from_v64(
l2, v64_load_aligned(src + ((y != bottom) + 1) * sstride)));
calc_delta(o, x, a, b, c, d, e, f, dst, sp, sm, dstride);
src += sstride * 2;
dst += dstride * 2;
}
} else { // No left/right clipping
int y;
for (y = 0; y < sizey; y += 2) {
const v64 l1 = v64_load_aligned(src);
const v64 l2 = v64_load_aligned(src + sstride);
v128 o = v128_from_v64(l1, l2);
const v128 x = v128_add_8(c128, o);
const v128 a = v128_add_8(
c128,
v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1));
const v128 b = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(src - 2),
v64_load_unaligned(src - 2 + sstride)));
const v128 c = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(src - 1),
v64_load_unaligned(src - 1 + sstride)));
const v128 d = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(src + 1),
v64_load_unaligned(src + 1 + sstride)));
const v128 e = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(src + 2),
v64_load_unaligned(src + 2 + sstride)));
const v128 f = v128_add_8(
c128, v128_from_v64(
l2, v64_load_aligned(src + ((y != bottom) + 1) * sstride)));
calc_delta(o, x, a, b, c, d, e, f, dst, sp, sm, dstride);
src += sstride * 2;
dst += dstride * 2;
}
}
}
@@ -197,3 +156,105 @@ void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int sstride,
strength);
}
}
#if CONFIG_AOM_HIGHBITDEPTH
// Filter one row of eight 16-bit pixels and store the result.
//   o       the original pixels (no bias needed with 16-bit lanes)
//   a, f    the rows above and below
//   b, c    the pixels two and one positions to the left
//   d, e    the pixels one and two positions to the right
//   sp, sm  clamp vectors holding +strength and -strength
static void calc_delta_hbd(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
                           v128 f, uint16_t *dst, v128 sp, v128 sm) {
  const v128 c8 = v128_dup_16(8);
  // Same tap weighting as the 8-bit calc_delta(): c/d weighted 3,
  // a/f weighted 4 (shift left by 2), b/e weighted 1, each difference
  // clamped to [-strength, strength] first.
  const v128 tmp =
      v128_add_16(v128_max_s16(v128_min_s16(v128_sub_16(c, o), sp), sm),
                  v128_max_s16(v128_min_s16(v128_sub_16(d, o), sp), sm));
  const v128 delta = v128_add_16(
      v128_add_16(
          v128_shl_16(
              v128_add_16(
                  v128_max_s16(v128_min_s16(v128_sub_16(a, o), sp), sm),
                  v128_max_s16(v128_min_s16(v128_sub_16(f, o), sp), sm)),
              2),
          v128_add_16(v128_max_s16(v128_min_s16(v128_sub_16(b, o), sp), sm),
                      v128_max_s16(v128_min_s16(v128_sub_16(e, o), sp), sm))),
      v128_add_16(v128_add_16(tmp, tmp), tmp));
  // Store o + (8 + delta - (delta < 0)) >> 4: delta / 16 rounded to
  // nearest, ties away from zero.
  v128_store_aligned(
      dst,
      v128_add_16(
          o, v128_shr_s16(
                 v128_add_16(c8, v128_add_16(delta, v128_cmplt_s16(
                                                        delta, v128_zero()))),
                 4)));
}
// Filter an 8-wide column of high bit-depth pixels, one 8-pixel row per
// iteration.  Horizontal frame-border clamping is handled by byte
// shuffles (left/right cases); vertical clamping by zeroing the row
// offset at the top and bottom of the frame.
SIMD_INLINE void clpf_block_hbd(const uint16_t *src, uint16_t *dst, int sstride,
                                int dstride, int x0, int y0, int sizey,
                                int width, int height, unsigned int strength) {
  int y;
  int bottom = height - 2 - y0;  // y - 1 == bottom at the last frame row
  const v128 sp = v128_dup_16(strength);
  const v128 sm = v128_dup_16(-(int)strength);
  dst += x0 + y0 * dstride;
  src += x0 + y0 * sstride;
  if (!x0) {  // Clip left
    // Shuffle masks (byte indices into the 16-bit lanes) that replicate
    // the leftmost pixel in place of the out-of-frame x-2 / x-1 taps.
    const v128 b_shuff = v128_from_v64(v64_from_64(0x0b0a090807060504LL),
                                       v64_from_64(0x0302010001000100LL));
    const v128 c_shuff = v128_from_v64(v64_from_64(0x0d0c0b0a09080706LL),
                                       v64_from_64(0x0504030201000100LL));
    for (y = 0; y < sizey; y++) {
      const v128 o = v128_load_aligned(src);
      // (y != -y0) is 0 only on the very first frame row: clamp above.
      const v128 a = v128_load_aligned(src - (y != -y0) * sstride);
      const v128 b = v128_shuffle_8(o, b_shuff);
      const v128 c = v128_shuffle_8(o, c_shuff);
      const v128 d = v128_load_unaligned(src + 1);
      const v128 e = v128_load_unaligned(src + 2);
      // (y - 1 != bottom) is 0 only on the last frame row: clamp below.
      const v128 f = v128_load_aligned(src + (y - 1 != bottom) * sstride);
      calc_delta_hbd(o, a, b, c, d, e, f, dst, sp, sm);
      src += sstride;
      dst += dstride;
    }
  } else if (!(width - x0 - 8)) {  // Clip right
    // Masks replicating the rightmost pixel for the x+1 / x+2 taps.
    const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0e0f0e0d0c0b0aLL),
                                       v64_from_64(0x0908070605040302LL));
    const v128 e_shuff = v128_from_v64(v64_from_64(0x0f0e0f0e0f0e0d0cLL),
                                       v64_from_64(0x0b0a090807060504LL));
    for (y = 0; y < sizey; y++) {
      const v128 o = v128_load_aligned(src);
      const v128 a = v128_load_aligned(src - (y != -y0) * sstride);
      const v128 b = v128_load_unaligned(src - 2);
      const v128 c = v128_load_unaligned(src - 1);
      const v128 d = v128_shuffle_8(o, d_shuff);
      const v128 e = v128_shuffle_8(o, e_shuff);
      const v128 f = v128_load_aligned(src + (y - 1 != bottom) * sstride);
      calc_delta_hbd(o, a, b, c, d, e, f, dst, sp, sm);
      src += sstride;
      dst += dstride;
    }
  } else {  // No left/right clipping
    for (y = 0; y < sizey; y++) {
      const v128 o = v128_load_aligned(src);
      const v128 a = v128_load_aligned(src - (y != -y0) * sstride);
      const v128 b = v128_load_unaligned(src - 2);
      const v128 c = v128_load_unaligned(src - 1);
      const v128 d = v128_load_unaligned(src + 1);
      const v128 e = v128_load_unaligned(src + 2);
      const v128 f = v128_load_aligned(src + (y - 1 != bottom) * sstride);
      calc_delta_hbd(o, a, b, c, d, e, f, dst, sp, sm);
      src += sstride;
      dst += dstride;
    }
  }
}
// Dispatch the high bit-depth CLPF block filter: use the SIMD kernel
// when the block geometry allows it, otherwise fall back to C.
void SIMD_FUNC(aom_clpf_block_hbd)(const uint16_t *src, uint16_t *dst,
                                   int sstride, int dstride, int x0, int y0,
                                   int sizex, int sizey, int width, int height,
                                   unsigned int strength) {
  // The vector code processes exactly 8 pixels per row and needs a full
  // 8x8 block inside a frame at least 16 pixels wide.
  const int can_simd =
      sizex == 8 && width >= 16 && y0 + 8 <= height && x0 + 8 <= width;
  if (can_simd) {
    clpf_block_hbd(src, dst, sstride, dstride, x0, y0, sizey, width, height,
                   strength);
  } else {
    // Fallback to C for odd sizes
    aom_clpf_block_hbd_c(src, dst, sstride, dstride, x0, y0, sizex, sizey,
                         width, height, strength);
  }
}
#endif

View File

@@ -66,6 +66,62 @@ void aom_clpf_detect_multi_c(const uint8_t *rec, const uint8_t *org,
}
}
#if CONFIG_AOM_HIGHBITDEPTH
// Identical to aom_clpf_detect_c() apart from "rec" and "org".
void aom_clpf_detect_hbd_c(const uint16_t *rec, const uint16_t *org,
                           int rstride, int ostride, int x0, int y0, int width,
                           int height, int *sum0, int *sum1,
                           unsigned int strength, int shift) {
  int x, y;
  // Accumulate the squared error against the source for the unfiltered
  // (sum0) and filtered (sum1) 8x8 block.  All samples (and the
  // strength) are scaled down by "shift" before comparison.
  for (y = y0; y < y0 + 8; y++) {
    const uint16_t *const row = rec + y * rstride;
    for (x = x0; x < x0 + 8; x++) {
      const int O = org[y * ostride + x] >> shift;
      const int X = row[x] >> shift;
      const int A = rec[AOMMAX(0, y - 1) * rstride + x] >> shift;
      const int B = row[AOMMAX(0, x - 2)] >> shift;
      const int C = row[AOMMAX(0, x - 1)] >> shift;
      const int D = row[AOMMIN(width - 1, x + 1)] >> shift;
      const int E = row[AOMMIN(width - 1, x + 2)] >> shift;
      const int F = rec[AOMMIN(height - 1, y + 1) * rstride + x] >> shift;
      const int Y =
          X + av1_clpf_sample(X, A, B, C, D, E, F, strength >> shift);
      *sum0 += (O - X) * (O - X);
      *sum1 += (O - Y) * (O - Y);
    }
  }
}
// Identical to aom_clpf_detect_multi_c() apart from "rec" and "org".
void aom_clpf_detect_multi_hbd_c(const uint16_t *rec, const uint16_t *org,
                                 int rstride, int ostride, int x0, int y0,
                                 int width, int height, int *sum, int shift) {
  int x, y;
  // Evaluate filter strengths 1, 2 and 4 in one pass.  sum[0] gets the
  // unfiltered squared error; sum[1..3] the error for each strength.
  for (y = y0; y < y0 + 8; y++) {
    const uint16_t *const row = rec + y * rstride;
    for (x = x0; x < x0 + 8; x++) {
      const int O = org[y * ostride + x] >> shift;
      const int X = row[x] >> shift;
      const int A = rec[AOMMAX(0, y - 1) * rstride + x] >> shift;
      const int B = row[AOMMAX(0, x - 2)] >> shift;
      const int C = row[AOMMAX(0, x - 1)] >> shift;
      const int D = row[AOMMIN(width - 1, x + 1)] >> shift;
      const int E = row[AOMMIN(width - 1, x + 2)] >> shift;
      const int F = rec[AOMMIN(height - 1, y + 1) * rstride + x] >> shift;
      const int F1 = X + av1_clpf_sample(X, A, B, C, D, E, F, 1);
      const int F2 = X + av1_clpf_sample(X, A, B, C, D, E, F, 2);
      const int F3 = X + av1_clpf_sample(X, A, B, C, D, E, F, 4);
      sum[0] += (O - X) * (O - X);
      sum[1] += (O - F1) * (O - F1);
      sum[2] += (O - F2) * (O - F2);
      sum[3] += (O - F3) * (O - F3);
    }
  }
}
#endif
int av1_clpf_decision(int k, int l, const YV12_BUFFER_CONFIG *rec,
const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
int block_size, int w, int h, unsigned int strength,
@@ -77,10 +133,25 @@ int av1_clpf_decision(int k, int l, const YV12_BUFFER_CONFIG *rec,
int ypos = (k << fb_size_log2) + m * block_size;
const int bs = MAX_MIB_SIZE;
if (!cm->mi_grid_visible[ypos / bs * cm->mi_stride + xpos / bs]
->mbmi.skip)
->mbmi.skip) {
#if CONFIG_AOM_HIGHBITDEPTH
if (cm->use_highbitdepth) {
aom_clpf_detect_hbd(CONVERT_TO_SHORTPTR(rec->y_buffer),
CONVERT_TO_SHORTPTR(org->y_buffer), rec->y_stride,
org->y_stride, xpos, ypos, rec->y_crop_width,
rec->y_crop_height, &sum0, &sum1, strength,
cm->bit_depth - 8);
} else {
aom_clpf_detect(rec->y_buffer, org->y_buffer, rec->y_stride,
org->y_stride, xpos, ypos, rec->y_crop_width,
rec->y_crop_height, &sum0, &sum1, strength);
}
#else
aom_clpf_detect(rec->y_buffer, org->y_buffer, rec->y_stride,
org->y_stride, xpos, ypos, rec->y_crop_width,
rec->y_crop_height, &sum0, &sum1, strength);
#endif
}
}
}
*res = sum1 < sum0;
@@ -145,9 +216,23 @@ static int clpf_rdo(int y, int x, const YV12_BUFFER_CONFIG *rec,
if (!cm->mi_grid_visible[ypos / MAX_MIB_SIZE * cm->mi_stride +
xpos / MAX_MIB_SIZE]
->mbmi.skip) {
#if CONFIG_AOM_HIGHBITDEPTH
if (cm->use_highbitdepth) {
aom_clpf_detect_multi_hbd(CONVERT_TO_SHORTPTR(rec->y_buffer),
CONVERT_TO_SHORTPTR(org->y_buffer),
rec->y_stride, org->y_stride, xpos, ypos,
rec->y_crop_width, rec->y_crop_height, sum,
cm->bit_depth - 8);
} else {
aom_clpf_detect_multi(rec->y_buffer, org->y_buffer, rec->y_stride,
org->y_stride, xpos, ypos, rec->y_crop_width,
rec->y_crop_height, sum);
}
#else
aom_clpf_detect_multi(rec->y_buffer, org->y_buffer, rec->y_stride,
org->y_stride, xpos, ypos, rec->y_crop_width,
rec->y_crop_height, sum);
#endif
filtered = 1;
}
}

View File

@@ -11,6 +11,27 @@
#include "aom_dsp/aom_simd.h"
// Compute the CLPF adjustment for 16 packed 8-bit pixels and return it
// (the detect path adds it to the pixels itself instead of storing).
//   x       the pixels, biased by +128 for signed saturating arithmetic
//   a, f    rows above/below; b, c left neighbours; d, e right
//           neighbours (all with the same bias)
//   sp, sm  clamp vectors holding +strength and -strength
SIMD_INLINE v128 calc_delta(v128 x, v128 a, v128 b, v128 c, v128 d, v128 e,
                            v128 f, v128 sp, v128 sm) {
  // Clamped differences weighted 3 for c/d (tmp added three times),
  // 4 for a/f (shift left by 2) and 1 for b/e.
  const v128 tmp =
      v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm),
                 v128_max_s8(v128_min_s8(v128_ssub_s8(d, x), sp), sm));
  v128 delta = v128_add_8(
      v128_add_8(
          v128_shl_8(
              v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(a, x), sp), sm),
                         v128_max_s8(v128_min_s8(v128_ssub_s8(f, x), sp), sm)),
              2),
          v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(b, x), sp), sm),
                     v128_max_s8(v128_min_s8(v128_ssub_s8(e, x), sp), sm))),
      v128_add_8(v128_add_8(tmp, tmp), tmp));
  // (8 + delta - (delta < 0)) >> 4: delta / 16 rounded to nearest,
  // ties away from zero.
  return v128_shr_s8(
      v128_add_8(v128_dup_8(8),
                 v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()))),
      4);
}
void SIMD_FUNC(aom_clpf_detect)(const uint8_t *rec, const uint8_t *org,
int rstride, int ostride, int x0, int y0,
int width, int height, int *sum0, int *sum1,
@@ -54,27 +75,9 @@ void SIMD_FUNC(aom_clpf_detect)(const uint8_t *rec, const uint8_t *org,
const v128 f = v128_add_8(
c128, v128_from_v64(
l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride)));
const v128 tmp =
v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm),
v128_max_s8(v128_min_s8(v128_ssub_s8(d, x), sp), sm));
v128 delta = v128_add_8(
v128_add_8(
v128_shl_8(
v128_add_8(
v128_max_s8(v128_min_s8(v128_ssub_s8(a, x), sp), sm),
v128_max_s8(v128_min_s8(v128_ssub_s8(f, x), sp), sm)),
2),
v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(b, x), sp), sm),
v128_max_s8(v128_min_s8(v128_ssub_s8(e, x), sp), sm))),
v128_add_8(v128_add_8(tmp, tmp), tmp));
delta = v128_shr_s8(
v128_add_8(v128_dup_8(8),
v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()))),
4);
ssd0 = v128_ssd_u8(ssd0, o, q);
ssd1 = v128_ssd_u8(ssd1, o, v128_add_8(q, delta));
ssd1 = v128_ssd_u8(
ssd1, o, v128_add_8(q, calc_delta(x, a, b, c, d, e, f, sp, sm)));
rec += rstride * 2;
org += ostride * 2;
}
@@ -107,26 +110,9 @@ void SIMD_FUNC(aom_clpf_detect)(const uint8_t *rec, const uint8_t *org,
const v128 f = v128_add_8(
c128, v128_from_v64(
l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride)));
const v128 tmp =
v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm),
v128_max_s8(v128_min_s8(v128_ssub_s8(d, x), sp), sm));
v128 delta = v128_add_8(
v128_add_8(
v128_shl_8(
v128_add_8(
v128_max_s8(v128_min_s8(v128_ssub_s8(a, x), sp), sm),
v128_max_s8(v128_min_s8(v128_ssub_s8(f, x), sp), sm)),
2),
v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(b, x), sp), sm),
v128_max_s8(v128_min_s8(v128_ssub_s8(e, x), sp), sm))),
v128_add_8(v128_add_8(tmp, tmp), tmp));
delta = v128_shr_s8(
v128_add_8(v128_dup_8(8),
v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()))),
4);
ssd0 = v128_ssd_u8(ssd0, o, q);
ssd1 = v128_ssd_u8(ssd1, o, v128_add_8(q, delta));
ssd1 = v128_ssd_u8(
ssd1, o, v128_add_8(q, calc_delta(x, a, b, c, d, e, f, sp, sm)));
rec += rstride * 2;
org += ostride * 2;
}
@@ -158,27 +144,9 @@ void SIMD_FUNC(aom_clpf_detect)(const uint8_t *rec, const uint8_t *org,
const v128 f = v128_add_8(
c128, v128_from_v64(
l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride)));
const v128 tmp =
v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm),
v128_max_s8(v128_min_s8(v128_ssub_s8(d, x), sp), sm));
v128 delta = v128_add_8(
v128_add_8(
v128_shl_8(
v128_add_8(
v128_max_s8(v128_min_s8(v128_ssub_s8(a, x), sp), sm),
v128_max_s8(v128_min_s8(v128_ssub_s8(f, x), sp), sm)),
2),
v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(b, x), sp), sm),
v128_max_s8(v128_min_s8(v128_ssub_s8(e, x), sp), sm))),
v128_add_8(v128_add_8(tmp, tmp), tmp));
delta = v128_shr_s8(
v128_add_8(v128_dup_8(8),
v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()))),
4);
ssd0 = v128_ssd_u8(ssd0, o, q);
ssd1 = v128_ssd_u8(ssd1, o, v128_add_8(q, delta));
ssd1 = v128_ssd_u8(
ssd1, o, v128_add_8(q, calc_delta(x, a, b, c, d, e, f, sp, sm)));
rec += rstride * 2;
org += ostride * 2;
}
@@ -187,8 +155,73 @@ void SIMD_FUNC(aom_clpf_detect)(const uint8_t *rec, const uint8_t *org,
*sum1 += v128_ssd_u8_sum(ssd1);
}
// Test multiple filter strengths at once. Use a simpler filter (4 tap, every
// second line).
// Evaluate three filter strengths at once for 16 packed 8-bit pixels
// and fold the squared error of each variant into its accumulator.
//   x          the reconstructed pixels biased by +128
//   q          the same pixels without the bias (what delta is added to)
//   o          the source pixels to compare against
//   a..f       the six neighbour vectors (same bias as x)
//   cp1/cm1    +/-1 clamp vectors (strength 1); cp2/cm2 and cp4/cm4
//              likewise for strengths 2 and 4
//   ssd1..3    SSD accumulators, one per strength
SIMD_INLINE void calc_delta_multi(v128 x, v128 q, v128 o, v128 a, v128 b,
                                  v128 c, v128 d, v128 e, v128 f, v128 cp1,
                                  v128 cm1, v128 cp2, v128 cm2, v128 cp4,
                                  v128 cm4, ssd128_internal *ssd1,
                                  ssd128_internal *ssd2,
                                  ssd128_internal *ssd3) {
  v128 tmp, delta1, delta2, delta3;
  const v128 c8 = v128_dup_8(8);
  // Convert the neighbours to differences once; each strength then only
  // needs its own clamp.
  a = v128_ssub_s8(a, x);
  b = v128_ssub_s8(b, x);
  c = v128_ssub_s8(c, x);
  d = v128_ssub_s8(d, x);
  e = v128_ssub_s8(e, x);
  f = v128_ssub_s8(f, x);
  // Weights as in calc_delta(): c/d x3, a/f x4 (shift by 2), b/e x1.
  tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp1), cm1),
                   v128_max_s8(v128_min_s8(d, cp1), cm1));
  delta1 = v128_add_8(
      v128_add_8(v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp1), cm1),
                                       v128_max_s8(v128_min_s8(f, cp1), cm1)),
                            2),
                 v128_add_8(v128_max_s8(v128_min_s8(b, cp1), cm1),
                            v128_max_s8(v128_min_s8(e, cp1), cm1))),
      v128_add_8(v128_add_8(tmp, tmp), tmp));
  tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp2), cm2),
                   v128_max_s8(v128_min_s8(d, cp2), cm2));
  delta2 = v128_add_8(
      v128_add_8(v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp2), cm2),
                                       v128_max_s8(v128_min_s8(f, cp2), cm2)),
                            2),
                 v128_add_8(v128_max_s8(v128_min_s8(b, cp2), cm2),
                            v128_max_s8(v128_min_s8(e, cp2), cm2))),
      v128_add_8(v128_add_8(tmp, tmp), tmp));
  tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp4), cm4),
                   v128_max_s8(v128_min_s8(d, cp4), cm4));
  delta3 = v128_add_8(
      v128_add_8(v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp4), cm4),
                                       v128_max_s8(v128_min_s8(f, cp4), cm4)),
                            2),
                 v128_add_8(v128_max_s8(v128_min_s8(b, cp4), cm4),
                            v128_max_s8(v128_min_s8(e, cp4), cm4))),
      v128_add_8(v128_add_8(tmp, tmp), tmp));
  // Each SSD gets |o - (q + round(delta / 16))|^2, with the same
  // nearest/ties-away rounding as calc_delta().
  *ssd1 = v128_ssd_u8(
      *ssd1, o,
      v128_add_8(
          q, v128_shr_s8(
                 v128_add_8(c8, v128_add_8(delta1,
                                           v128_cmplt_s8(delta1, v128_zero()))),
                 4)));
  *ssd2 = v128_ssd_u8(
      *ssd2, o,
      v128_add_8(
          q, v128_shr_s8(
                 v128_add_8(c8, v128_add_8(delta2,
                                           v128_cmplt_s8(delta2, v128_zero()))),
                 4)));
  *ssd3 = v128_ssd_u8(
      *ssd3, o,
      v128_add_8(
          q, v128_shr_s8(
                 v128_add_8(c8, v128_add_8(delta3,
                                           v128_cmplt_s8(delta3, v128_zero()))),
                 4)));
}
// Test multiple filter strengths at once.
void SIMD_FUNC(aom_clpf_detect_multi)(const uint8_t *rec, const uint8_t *org,
int rstride, int ostride, int x0, int y0,
int width, int height, int *sum) {
@@ -199,7 +232,6 @@ void SIMD_FUNC(aom_clpf_detect_multi)(const uint8_t *rec, const uint8_t *org,
const v128 cm2 = v128_dup_8(-2);
const v128 cp4 = v128_dup_8(4);
const v128 cm4 = v128_dup_8(-4);
const v128 c8 = v128_dup_8(8);
const int bottom = height - 2 - y0;
ssd128_internal ssd0 = v128_ssd_u8_init();
ssd128_internal ssd1 = v128_ssd_u8_init();
@@ -238,70 +270,9 @@ void SIMD_FUNC(aom_clpf_detect_multi)(const uint8_t *rec, const uint8_t *org,
v128 f = v128_add_8(
c128, v128_from_v64(
l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride)));
v128 tmp, delta1, delta2, delta3;
a = v128_ssub_s8(a, x);
b = v128_ssub_s8(b, x);
c = v128_ssub_s8(c, x);
d = v128_ssub_s8(d, x);
e = v128_ssub_s8(e, x);
f = v128_ssub_s8(f, x);
tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp1), cm1),
v128_max_s8(v128_min_s8(d, cp1), cm1));
delta1 = v128_add_8(
v128_add_8(
v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp1), cm1),
v128_max_s8(v128_min_s8(f, cp1), cm1)),
2),
v128_add_8(v128_max_s8(v128_min_s8(b, cp1), cm1),
v128_max_s8(v128_min_s8(e, cp1), cm1))),
v128_add_8(v128_add_8(tmp, tmp), tmp));
tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp2), cm2),
v128_max_s8(v128_min_s8(d, cp2), cm2));
delta2 = v128_add_8(
v128_add_8(
v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp2), cm2),
v128_max_s8(v128_min_s8(f, cp2), cm2)),
2),
v128_add_8(v128_max_s8(v128_min_s8(b, cp2), cm2),
v128_max_s8(v128_min_s8(e, cp2), cm2))),
v128_add_8(v128_add_8(tmp, tmp), tmp));
tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp4), cm4),
v128_max_s8(v128_min_s8(d, cp4), cm4));
delta3 = v128_add_8(
v128_add_8(
v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp4), cm4),
v128_max_s8(v128_min_s8(f, cp4), cm4)),
2),
v128_add_8(v128_max_s8(v128_min_s8(b, cp4), cm4),
v128_max_s8(v128_min_s8(e, cp4), cm4))),
v128_add_8(v128_add_8(tmp, tmp), tmp));
ssd0 = v128_ssd_u8(ssd0, o, q);
ssd1 = v128_ssd_u8(
ssd1, o,
v128_add_8(
q,
v128_shr_s8(
v128_add_8(c8, v128_add_8(delta1, v128_cmplt_s8(
delta1, v128_zero()))),
4)));
ssd2 = v128_ssd_u8(
ssd2, o,
v128_add_8(
q,
v128_shr_s8(
v128_add_8(c8, v128_add_8(delta2, v128_cmplt_s8(
delta2, v128_zero()))),
4)));
ssd3 = v128_ssd_u8(
ssd3, o,
v128_add_8(
q,
v128_shr_s8(
v128_add_8(c8, v128_add_8(delta3, v128_cmplt_s8(
delta3, v128_zero()))),
4)));
calc_delta_multi(x, q, o, a, b, c, d, e, f, cp1, cm1, cp2, cm2, cp4, cm4,
&ssd1, &ssd2, &ssd3);
rec += 2 * rstride;
org += 2 * ostride;
}
@@ -334,70 +305,9 @@ void SIMD_FUNC(aom_clpf_detect_multi)(const uint8_t *rec, const uint8_t *org,
v128 f = v128_add_8(
c128, v128_from_v64(
l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride)));
v128 tmp, delta1, delta2, delta3;
a = v128_ssub_s8(a, x);
b = v128_ssub_s8(b, x);
c = v128_ssub_s8(c, x);
d = v128_ssub_s8(d, x);
e = v128_ssub_s8(e, x);
f = v128_ssub_s8(f, x);
tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp1), cm1),
v128_max_s8(v128_min_s8(d, cp1), cm1));
delta1 = v128_add_8(
v128_add_8(
v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp1), cm1),
v128_max_s8(v128_min_s8(f, cp1), cm1)),
2),
v128_add_8(v128_max_s8(v128_min_s8(b, cp1), cm1),
v128_max_s8(v128_min_s8(e, cp1), cm1))),
v128_add_8(v128_add_8(tmp, tmp), tmp));
tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp2), cm2),
v128_max_s8(v128_min_s8(d, cp2), cm2));
delta2 = v128_add_8(
v128_add_8(
v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp2), cm2),
v128_max_s8(v128_min_s8(f, cp2), cm2)),
2),
v128_add_8(v128_max_s8(v128_min_s8(b, cp2), cm2),
v128_max_s8(v128_min_s8(e, cp2), cm2))),
v128_add_8(v128_add_8(tmp, tmp), tmp));
tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp4), cm4),
v128_max_s8(v128_min_s8(d, cp4), cm4));
delta3 = v128_add_8(
v128_add_8(
v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp4), cm4),
v128_max_s8(v128_min_s8(f, cp4), cm4)),
2),
v128_add_8(v128_max_s8(v128_min_s8(b, cp4), cm4),
v128_max_s8(v128_min_s8(e, cp4), cm4))),
v128_add_8(v128_add_8(tmp, tmp), tmp));
ssd0 = v128_ssd_u8(ssd0, o, q);
ssd1 = v128_ssd_u8(
ssd1, o,
v128_add_8(
q,
v128_shr_s8(
v128_add_8(c8, v128_add_8(delta1, v128_cmplt_s8(
delta1, v128_zero()))),
4)));
ssd2 = v128_ssd_u8(
ssd2, o,
v128_add_8(
q,
v128_shr_s8(
v128_add_8(c8, v128_add_8(delta2, v128_cmplt_s8(
delta2, v128_zero()))),
4)));
ssd3 = v128_ssd_u8(
ssd3, o,
v128_add_8(
q,
v128_shr_s8(
v128_add_8(c8, v128_add_8(delta3, v128_cmplt_s8(
delta3, v128_zero()))),
4)));
calc_delta_multi(x, q, o, a, b, c, d, e, f, cp1, cm1, cp2, cm2, cp4, cm4,
&ssd1, &ssd2, &ssd3);
rec += 2 * rstride;
org += 2 * ostride;
}
@@ -429,70 +339,9 @@ void SIMD_FUNC(aom_clpf_detect_multi)(const uint8_t *rec, const uint8_t *org,
v128 f = v128_add_8(
c128, v128_from_v64(
l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride)));
v128 tmp, delta1, delta2, delta3;
a = v128_ssub_s8(a, x);
b = v128_ssub_s8(b, x);
c = v128_ssub_s8(c, x);
d = v128_ssub_s8(d, x);
e = v128_ssub_s8(e, x);
f = v128_ssub_s8(f, x);
tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp1), cm1),
v128_max_s8(v128_min_s8(d, cp1), cm1));
delta1 = v128_add_8(
v128_add_8(
v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp1), cm1),
v128_max_s8(v128_min_s8(f, cp1), cm1)),
2),
v128_add_8(v128_max_s8(v128_min_s8(b, cp1), cm1),
v128_max_s8(v128_min_s8(e, cp1), cm1))),
v128_add_8(v128_add_8(tmp, tmp), tmp));
tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp2), cm2),
v128_max_s8(v128_min_s8(d, cp2), cm2));
delta2 = v128_add_8(
v128_add_8(
v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp2), cm2),
v128_max_s8(v128_min_s8(f, cp2), cm2)),
2),
v128_add_8(v128_max_s8(v128_min_s8(b, cp2), cm2),
v128_max_s8(v128_min_s8(e, cp2), cm2))),
v128_add_8(v128_add_8(tmp, tmp), tmp));
tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp4), cm4),
v128_max_s8(v128_min_s8(d, cp4), cm4));
delta3 = v128_add_8(
v128_add_8(
v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp4), cm4),
v128_max_s8(v128_min_s8(f, cp4), cm4)),
2),
v128_add_8(v128_max_s8(v128_min_s8(b, cp4), cm4),
v128_max_s8(v128_min_s8(e, cp4), cm4))),
v128_add_8(v128_add_8(tmp, tmp), tmp));
ssd0 = v128_ssd_u8(ssd0, o, q);
ssd1 = v128_ssd_u8(
ssd1, o,
v128_add_8(
q,
v128_shr_s8(
v128_add_8(c8, v128_add_8(delta1, v128_cmplt_s8(
delta1, v128_zero()))),
4)));
ssd2 = v128_ssd_u8(
ssd2, o,
v128_add_8(
q,
v128_shr_s8(
v128_add_8(c8, v128_add_8(delta2, v128_cmplt_s8(
delta2, v128_zero()))),
4)));
ssd3 = v128_ssd_u8(
ssd3, o,
v128_add_8(
q,
v128_shr_s8(
v128_add_8(c8, v128_add_8(delta3, v128_cmplt_s8(
delta3, v128_zero()))),
4)));
calc_delta_multi(x, q, o, a, b, c, d, e, f, cp1, cm1, cp2, cm2, cp4, cm4,
&ssd1, &ssd2, &ssd3);
rec += 2 * rstride;
org += 2 * ostride;
}
@@ -502,3 +351,321 @@ void SIMD_FUNC(aom_clpf_detect_multi)(const uint8_t *rec, const uint8_t *org,
sum[2] += v128_ssd_u8_sum(ssd2);
sum[3] += v128_ssd_u8_sum(ssd3);
}
#if CONFIG_AOM_HIGHBITDEPTH
// High bitdepth variant of aom_clpf_detect(): measures, for one 8x8 block at
// (x0, y0), the sum of squared differences against the source both without
// filtering (*sum0) and with the CLPF filter applied at 'strength' (*sum1).
// The 16-bit samples are down-shifted by 'shift' and packed to 8 bits
// (v128_shr_u16 + v128_unziplo_8) so the 8-bit SIMD kernel can be reused;
// 'strength' is scaled by the same shift to match.  Two rows are processed
// per iteration, packed into one 128-bit vector.
void SIMD_FUNC(aom_clpf_detect_hbd)(const uint16_t *rec, const uint16_t *org,
                                    int rstride, int ostride, int x0, int y0,
                                    int width, int height, int *sum0, int *sum1,
                                    unsigned int strength, int shift) {
  ssd128_internal ssd0 = v128_ssd_u8_init();  // SSD of unfiltered block
  ssd128_internal ssd1 = v128_ssd_u8_init();  // SSD of filtered block
  const v128 c128 = v128_dup_8(128);  // bias into signed 8-bit domain
  const v128 sp = v128_dup_8(strength >> shift);  // clip bounds, shifted
  const v128 sm = v128_dup_8(-(int)(strength >> shift));
  const int bottom = height - 2 - y0;  // last y with a valid row below
  rec += x0 + y0 * rstride;
  org += x0 + y0 * ostride;
  if (!x0) {  // Clip left
    // Shuffle masks that replicate the leftmost byte of each packed row,
    // emulating edge extension for the b (x-2) and c (x-1) taps.
    const v128 b_shuff = v128_from_v64(v64_from_64(0x0d0c0b0a09080808LL),
                                       v64_from_64(0x0504030201000000LL));
    const v128 c_shuff = v128_from_v64(v64_from_64(0x0e0d0c0b0a090808LL),
                                       v64_from_64(0x0605040302010000LL));
    int y;
    for (y = 0; y < 8; y += 2) {
      // Two rows of reconstruction, down-shifted to 8 significant bits.
      const v128 n1 = v128_shr_u16(v128_load_aligned(rec), shift);
      const v128 n2 = v128_shr_u16(v128_load_aligned(rec + rstride), shift);
      // Matching two rows of the source, packed to bytes.
      const v128 o =
          v128_unziplo_8(v128_shr_u16(v128_load_aligned(org), shift),
                         v128_shr_u16(v128_load_aligned(org + ostride), shift));
      const v128 q = v128_unziplo_8(n1, n2);  // packed unfiltered pixels
      const v128 x = v128_add_8(c128, q);     // biased centre pixels
      // a = pixel above; (y != -y0) clamps at the frame/block top edge.
      const v128 a = v128_add_8(
          c128, v128_unziplo_8(
                    v128_shr_u16(v128_load_aligned(rec - (y != -y0) * rstride),
                                 shift),
                    n1));
      const v128 b = v128_shuffle_8(x, b_shuff);  // x-2, left-extended
      const v128 c = v128_shuffle_8(x, c_shuff);  // x-1, left-extended
      const v128 d = v128_add_8(
          c128,
          v128_unziplo_8(
              v128_shr_u16(v128_load_unaligned(rec + 1), shift),
              v128_shr_u16(v128_load_unaligned(rec + 1 + rstride), shift)));
      const v128 e = v128_add_8(
          c128,
          v128_unziplo_8(
              v128_shr_u16(v128_load_unaligned(rec + 2), shift),
              v128_shr_u16(v128_load_unaligned(rec + 2 + rstride), shift)));
      // f = pixel below; (y != bottom) clamps at the bottom edge.
      const v128 f = v128_add_8(
          c128, v128_unziplo_8(
                    n2, v128_shr_u16(v128_load_unaligned(
                                         rec + ((y != bottom) + 1) * rstride),
                                     shift)));
      ssd0 = v128_ssd_u8(ssd0, o, q);
      ssd1 = v128_ssd_u8(
          ssd1, o, v128_add_8(q, calc_delta(x, a, b, c, d, e, f, sp, sm)));
      rec += rstride * 2;
      org += ostride * 2;
    }
  } else if (!(width - x0 - 8)) {  // Clip right
    // Shuffle masks replicating the rightmost byte of each packed row for
    // the d (x+1) and e (x+2) taps at the right edge.
    const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0f0e0d0c0b0a09LL),
                                       v64_from_64(0x0707060504030201LL));
    const v128 e_shuff = v128_from_v64(v64_from_64(0x0f0f0f0e0d0c0b0aLL),
                                       v64_from_64(0x0707070605040302LL));
    int y;
    for (y = 0; y < 8; y += 2) {
      const v128 n1 = v128_shr_u16(v128_load_aligned(rec), shift);
      const v128 n2 = v128_shr_u16(v128_load_aligned(rec + rstride), shift);
      const v128 o =
          v128_unziplo_8(v128_shr_u16(v128_load_aligned(org), shift),
                         v128_shr_u16(v128_load_aligned(org + ostride), shift));
      const v128 q = v128_unziplo_8(n1, n2);
      const v128 x = v128_add_8(c128, q);
      const v128 a = v128_add_8(
          c128, v128_unziplo_8(
                    v128_shr_u16(v128_load_aligned(rec - (y != -y0) * rstride),
                                 shift),
                    n1));
      const v128 b = v128_add_8(
          c128,
          v128_unziplo_8(
              v128_shr_u16(v128_load_unaligned(rec - 2), shift),
              v128_shr_u16(v128_load_unaligned(rec - 2 + rstride), shift)));
      const v128 c = v128_add_8(
          c128,
          v128_unziplo_8(
              v128_shr_u16(v128_load_unaligned(rec - 1), shift),
              v128_shr_u16(v128_load_unaligned(rec - 1 + rstride), shift)));
      const v128 d = v128_shuffle_8(x, d_shuff);  // x+1, right-extended
      const v128 e = v128_shuffle_8(x, e_shuff);  // x+2, right-extended
      const v128 f = v128_add_8(
          c128, v128_unziplo_8(
                    n2, v128_shr_u16(v128_load_unaligned(
                                         rec + ((y != bottom) + 1) * rstride),
                                     shift)));
      ssd0 = v128_ssd_u8(ssd0, o, q);
      ssd1 = v128_ssd_u8(
          ssd1, o, v128_add_8(q, calc_delta(x, a, b, c, d, e, f, sp, sm)));
      rec += rstride * 2;
      org += ostride * 2;
    }
  } else {  // No left/right clipping
    int y;
    for (y = 0; y < 8; y += 2) {
      const v128 n1 = v128_shr_u16(v128_load_aligned(rec), shift);
      const v128 n2 = v128_shr_u16(v128_load_aligned(rec + rstride), shift);
      const v128 o =
          v128_unziplo_8(v128_shr_u16(v128_load_aligned(org), shift),
                         v128_shr_u16(v128_load_aligned(org + ostride), shift));
      const v128 q = v128_unziplo_8(n1, n2);
      const v128 x = v128_add_8(c128, q);
      const v128 a = v128_add_8(
          c128, v128_unziplo_8(
                    v128_shr_u16(v128_load_aligned(rec - (y != -y0) * rstride),
                                 shift),
                    n1));
      // All four horizontal taps loaded directly (no edge extension needed).
      const v128 b = v128_add_8(
          c128,
          v128_unziplo_8(
              v128_shr_u16(v128_load_unaligned(rec - 2), shift),
              v128_shr_u16(v128_load_unaligned(rec - 2 + rstride), shift)));
      const v128 c = v128_add_8(
          c128,
          v128_unziplo_8(
              v128_shr_u16(v128_load_unaligned(rec - 1), shift),
              v128_shr_u16(v128_load_unaligned(rec - 1 + rstride), shift)));
      const v128 d = v128_add_8(
          c128,
          v128_unziplo_8(
              v128_shr_u16(v128_load_unaligned(rec + 1), shift),
              v128_shr_u16(v128_load_unaligned(rec + 1 + rstride), shift)));
      const v128 e = v128_add_8(
          c128,
          v128_unziplo_8(
              v128_shr_u16(v128_load_unaligned(rec + 2), shift),
              v128_shr_u16(v128_load_unaligned(rec + 2 + rstride), shift)));
      const v128 f = v128_add_8(
          c128, v128_unziplo_8(
                    n2, v128_shr_u16(v128_load_unaligned(
                                         rec + ((y != bottom) + 1) * rstride),
                                     shift)));
      ssd0 = v128_ssd_u8(ssd0, o, q);
      ssd1 = v128_ssd_u8(
          ssd1, o, v128_add_8(q, calc_delta(x, a, b, c, d, e, f, sp, sm)));
      rec += rstride * 2;
      org += ostride * 2;
    }
  }
  *sum0 += v128_ssd_u8_sum(ssd0);
  *sum1 += v128_ssd_u8_sum(ssd1);
}
// High bitdepth variant of aom_clpf_detect_multi(): evaluates three filter
// strengths (1, 2 and 4, represented by the +/-1, +/-2 and +/-4 clip
// constants) in one pass over an 8x8 block.  sum[0] accumulates the SSD of
// the unfiltered block; sum[1..3] the SSDs for each strength.  As in
// aom_clpf_detect_hbd(), the 16-bit samples are down-shifted by 'shift' and
// packed to 8 bits so that the shared 8-bit kernel (calc_delta_multi) can
// be reused; two rows are processed per loop iteration.
void SIMD_FUNC(aom_clpf_detect_multi_hbd)(const uint16_t *rec,
                                          const uint16_t *org, int rstride,
                                          int ostride, int x0, int y0,
                                          int width, int height, int *sum,
                                          int shift) {
  const v128 c128 = v128_dup_8(128);  // bias into signed 8-bit domain
  const v128 cp1 = v128_dup_8(1);     // clip bounds for strength 1
  const v128 cm1 = v128_dup_8(-1);
  const v128 cp2 = v128_dup_8(2);     // clip bounds for strength 2
  const v128 cm2 = v128_dup_8(-2);
  const v128 cp4 = v128_dup_8(4);     // clip bounds for strength 4
  const v128 cm4 = v128_dup_8(-4);
  const int bottom = height - 2 - y0;  // last y with a valid row below
  ssd128_internal ssd0 = v128_ssd_u8_init();  // unfiltered
  ssd128_internal ssd1 = v128_ssd_u8_init();  // strength 1
  ssd128_internal ssd2 = v128_ssd_u8_init();  // strength 2
  ssd128_internal ssd3 = v128_ssd_u8_init();  // strength 4
  rec += x0 + y0 * rstride;
  org += x0 + y0 * ostride;
  if (!x0) {  // Clip left
    // Shuffle masks replicating the leftmost byte of each packed row for
    // the b (x-2) and c (x-1) taps.
    const v128 b_shuff = v128_from_v64(v64_from_64(0x0d0c0b0a09080808LL),
                                       v64_from_64(0x0504030201000000LL));
    const v128 c_shuff = v128_from_v64(v64_from_64(0x0e0d0c0b0a090808LL),
                                       v64_from_64(0x0605040302010000LL));
    int y;
    for (y = 0; y < 8; y += 2) {
      // Two rows of reconstruction and source, down-shifted and packed.
      const v128 n1 = v128_shr_u16(v128_load_aligned(rec), shift);
      const v128 n2 = v128_shr_u16(v128_load_aligned(rec + rstride), shift);
      const v128 o =
          v128_unziplo_8(v128_shr_u16(v128_load_aligned(org), shift),
                         v128_shr_u16(v128_load_aligned(org + ostride), shift));
      const v128 q = v128_unziplo_8(n1, n2);  // packed unfiltered pixels
      const v128 x = v128_add_8(c128, q);     // biased centre pixels
      // a = pixel above; (y != -y0) clamps at the top edge.
      const v128 a = v128_add_8(
          c128, v128_unziplo_8(
                    v128_shr_u16(v128_load_aligned(rec - (y != -y0) * rstride),
                                 shift),
                    n1));
      const v128 b = v128_shuffle_8(x, b_shuff);  // x-2, left-extended
      const v128 c = v128_shuffle_8(x, c_shuff);  // x-1, left-extended
      const v128 d = v128_add_8(
          c128,
          v128_unziplo_8(
              v128_shr_u16(v128_load_unaligned(rec + 1), shift),
              v128_shr_u16(v128_load_unaligned(rec + 1 + rstride), shift)));
      const v128 e = v128_add_8(
          c128,
          v128_unziplo_8(
              v128_shr_u16(v128_load_unaligned(rec + 2), shift),
              v128_shr_u16(v128_load_unaligned(rec + 2 + rstride), shift)));
      // f = pixel below; (y != bottom) clamps at the bottom edge.
      const v128 f = v128_add_8(
          c128, v128_unziplo_8(
                    n2, v128_shr_u16(v128_load_unaligned(
                                         rec + ((y != bottom) + 1) * rstride),
                                     shift)));
      ssd0 = v128_ssd_u8(ssd0, o, q);
      calc_delta_multi(x, q, o, a, b, c, d, e, f, cp1, cm1, cp2, cm2, cp4, cm4,
                       &ssd1, &ssd2, &ssd3);
      rec += 2 * rstride;
      org += 2 * ostride;
    }
  } else if (!(width - x0 - 8)) {  // Clip right
    // Shuffle masks replicating the rightmost byte of each packed row for
    // the d (x+1) and e (x+2) taps.
    const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0f0e0d0c0b0a09LL),
                                       v64_from_64(0x0707060504030201LL));
    const v128 e_shuff = v128_from_v64(v64_from_64(0x0f0f0f0e0d0c0b0aLL),
                                       v64_from_64(0x0707070605040302LL));
    int y;
    for (y = 0; y < 8; y += 2) {
      const v128 n1 = v128_shr_u16(v128_load_aligned(rec), shift);
      const v128 n2 = v128_shr_u16(v128_load_aligned(rec + rstride), shift);
      const v128 o =
          v128_unziplo_8(v128_shr_u16(v128_load_aligned(org), shift),
                         v128_shr_u16(v128_load_aligned(org + ostride), shift));
      const v128 q = v128_unziplo_8(n1, n2);
      const v128 x = v128_add_8(c128, q);
      const v128 a = v128_add_8(
          c128, v128_unziplo_8(
                    v128_shr_u16(v128_load_aligned(rec - (y != -y0) * rstride),
                                 shift),
                    n1));
      const v128 b = v128_add_8(
          c128,
          v128_unziplo_8(
              v128_shr_u16(v128_load_unaligned(rec - 2), shift),
              v128_shr_u16(v128_load_unaligned(rec - 2 + rstride), shift)));
      const v128 c = v128_add_8(
          c128,
          v128_unziplo_8(
              v128_shr_u16(v128_load_unaligned(rec - 1), shift),
              v128_shr_u16(v128_load_unaligned(rec - 1 + rstride), shift)));
      const v128 d = v128_shuffle_8(x, d_shuff);  // x+1, right-extended
      const v128 e = v128_shuffle_8(x, e_shuff);  // x+2, right-extended
      const v128 f = v128_add_8(
          c128, v128_unziplo_8(
                    n2, v128_shr_u16(v128_load_unaligned(
                                         rec + ((y != bottom) + 1) * rstride),
                                     shift)));
      ssd0 = v128_ssd_u8(ssd0, o, q);
      calc_delta_multi(x, q, o, a, b, c, d, e, f, cp1, cm1, cp2, cm2, cp4, cm4,
                       &ssd1, &ssd2, &ssd3);
      rec += 2 * rstride;
      org += 2 * ostride;
    }
  } else {  // No left/right clipping
    int y;
    for (y = 0; y < 8; y += 2) {
      const v128 n1 = v128_shr_u16(v128_load_aligned(rec), shift);
      const v128 n2 = v128_shr_u16(v128_load_aligned(rec + rstride), shift);
      const v128 o =
          v128_unziplo_8(v128_shr_u16(v128_load_aligned(org), shift),
                         v128_shr_u16(v128_load_aligned(org + ostride), shift));
      const v128 q = v128_unziplo_8(n1, n2);
      const v128 x = v128_add_8(c128, q);
      const v128 a = v128_add_8(
          c128, v128_unziplo_8(
                    v128_shr_u16(v128_load_aligned(rec - (y != -y0) * rstride),
                                 shift),
                    n1));
      // All four horizontal taps loaded directly (no edge extension needed).
      const v128 b = v128_add_8(
          c128,
          v128_unziplo_8(
              v128_shr_u16(v128_load_unaligned(rec - 2), shift),
              v128_shr_u16(v128_load_unaligned(rec - 2 + rstride), shift)));
      const v128 c = v128_add_8(
          c128,
          v128_unziplo_8(
              v128_shr_u16(v128_load_unaligned(rec - 1), shift),
              v128_shr_u16(v128_load_unaligned(rec - 1 + rstride), shift)));
      const v128 d = v128_add_8(
          c128,
          v128_unziplo_8(
              v128_shr_u16(v128_load_unaligned(rec + 1), shift),
              v128_shr_u16(v128_load_unaligned(rec + 1 + rstride), shift)));
      const v128 e = v128_add_8(
          c128,
          v128_unziplo_8(
              v128_shr_u16(v128_load_unaligned(rec + 2), shift),
              v128_shr_u16(v128_load_unaligned(rec + 2 + rstride), shift)));
      const v128 f = v128_add_8(
          c128, v128_unziplo_8(
                    n2, v128_shr_u16(v128_load_unaligned(
                                         rec + ((y != bottom) + 1) * rstride),
                                     shift)));
      ssd0 = v128_ssd_u8(ssd0, o, q);
      calc_delta_multi(x, q, o, a, b, c, d, e, f, cp1, cm1, cp2, cm2, cp4, cm4,
                       &ssd1, &ssd2, &ssd3);
      rec += 2 * rstride;
      org += 2 * ostride;
    }
  }
  sum[0] += v128_ssd_u8_sum(ssd0);
  sum[1] += v128_ssd_u8_sum(ssd1);
  sum[2] += v128_ssd_u8_sum(ssd2);
  sum[3] += v128_ssd_u8_sum(ssd3);
}
#endif

View File

@@ -54,44 +54,84 @@ class ClpfBlockTest : public ::testing::TestWithParam<clpf_block_param_t> {
typedef ClpfBlockTest ClpfSpeedTest;
TEST_P(ClpfBlockTest, TestSIMDNoMismatch) {
int w = sizex;
int h = sizey;
const int size = 32;
ACMRandom rnd(ACMRandom::DeterministicSeed());
DECLARE_ALIGNED(16, uint8_t, s[size * size]);
DECLARE_ALIGNED(16, uint8_t, d[size * size]);
DECLARE_ALIGNED(16, uint8_t, ref_d[size * size]);
memset(ref_d, 0, size * size);
memset(d, 0, size * size);
#if CONFIG_AOM_HIGHBITDEPTH
typedef void (*clpf_block_hbd_t)(const uint16_t *src, uint16_t *dst,
int sstride, int dstride, int x0, int y0,
int sizex, int sizey, int width, int height,
unsigned int strength);
int error = 0;
int pos = 0;
int strength = 0;
int xpos = 0, ypos = 0;
int bits;
int level;
typedef std::tr1::tuple<clpf_block_hbd_t, clpf_block_hbd_t, int, int>
clpf_block_hbd_param_t;
// Parameterised fixture for the high bitdepth CLPF block-filter tests.
// The parameter tuple is (function under test, C reference function,
// block width, block height).
class ClpfBlockHbdTest
    : public ::testing::TestWithParam<clpf_block_hbd_param_t> {
 public:
  virtual ~ClpfBlockHbdTest() {}
  virtual void SetUp() {
    clpf = GET_PARAM(0);      // SIMD implementation under test
    ref_clpf = GET_PARAM(1);  // C reference implementation
    sizex = GET_PARAM(2);     // filter block width
    sizey = GET_PARAM(3);     // filter block height
  }

  virtual void TearDown() { libaom_test::ClearSystemState(); }

 protected:
  int sizex;                  // block width under test
  int sizey;                  // block height under test
  clpf_block_hbd_t clpf;      // tested function
  clpf_block_hbd_t ref_clpf;  // reference function
};
typedef ClpfBlockHbdTest ClpfHbdSpeedTest;
#endif
template <typename pixel>
void test_clpf(int w, int h, int depth, int iterations,
void (*clpf)(const pixel *src, pixel *dst, int sstride,
int dstride, int x0, int y0, int sizex, int sizey,
int width, int height, unsigned int strength),
void (*ref_clpf)(const pixel *src, pixel *dst, int sstride,
int dstride, int x0, int y0, int sizex,
int sizey, int width, int height,
unsigned int strength)) {
const int size = 24;
ACMRandom rnd(ACMRandom::DeterministicSeed());
DECLARE_ALIGNED(16, pixel, s[size * size]);
DECLARE_ALIGNED(16, pixel, d[size * size]);
DECLARE_ALIGNED(16, pixel, ref_d[size * size]);
memset(ref_d, 0, size * size * sizeof(*ref_d));
memset(d, 0, size * size * sizeof(*d));
int error = 0, pos = 0, strength = 0, xpos = 0, ypos = 0;
int bits, level, count;
// Test every combination of:
// * Input with 1-8 bits of noise
// * Noise level around every value from 0 to 255
// * Input with up to <depth> bits of noise
// * Noise level around every value from 0 to (1<<depth)-1
// * Blocks anywhere in the frame (along all egdes and also fully inside)
// * All strengths
for (level = 0; level < 256 && !error; level++) {
for (bits = 1; bits < 9 && !error; bits++) {
for (int i = 0; i < size * size; i++)
s[i] = clamp((rnd.Rand8() & ((1 << bits) - 1)) + level, 0, 255);
// If clpf and ref_clpf are the same, we're just testing speed
for (count = 0; count < iterations; count++) {
for (level = 0; level < (1 << depth) && !error; level++) {
for (bits = 1; bits <= depth && !error; bits++) {
for (int i = 0; i < size * size; i++)
s[i] = clamp((rnd.Rand16() & ((1 << bits) - 1)) + level, 0,
(1 << depth) - 1);
for (ypos = 0; ypos < size && !error; ypos += h * !error) {
for (xpos = 0; xpos < size && !error; xpos += w * !error) {
for (strength = 0; strength < 3 && !error; strength += !error) {
ref_clpf(s, ref_d, size, size, xpos, ypos, w, h, size, size,
1 << strength);
ASM_REGISTER_STATE_CHECK(clpf(s, d, size, size, xpos, ypos, w, h,
size, size, 1 << strength));
for (pos = 0; pos < size * size && !error; pos++) {
error = ref_d[pos] != d[pos];
for (ypos = 0; ypos < size && !error; ypos += h * !error) {
for (xpos = 0; xpos < size && !error; xpos += w * !error) {
for (strength = depth - 8; strength < depth - 5 && !error;
strength += !error) {
ref_clpf(s, ref_d, size, size, xpos, ypos, w, h, size, size,
1 << strength);
if (clpf != ref_clpf)
ASM_REGISTER_STATE_CHECK(clpf(s, d, size, size, xpos, ypos, w,
h, size, size, 1 << strength));
if (ref_clpf != clpf)
for (pos = 0; pos < size * size && !error; pos++) {
error = ref_d[pos] != d[pos];
}
}
}
}
@@ -116,46 +156,26 @@ TEST_P(ClpfBlockTest, TestSIMDNoMismatch) {
<< std::endl;
}
TEST_P(ClpfSpeedTest, TestSpeed) {
int w = sizex;
int h = sizey;
const int size = 32;
ACMRandom rnd(ACMRandom::DeterministicSeed());
DECLARE_ALIGNED(16, uint8_t, s[size * size]);
DECLARE_ALIGNED(16, uint8_t, d[size * size]);
int strength;
int xpos, ypos;
for (int i = 0; i < size * size; i++) s[i] = rnd.Rand8();
template <typename pixel>
void test_clpf_speed(int w, int h, int depth, int iterations,
void (*clpf)(const pixel *src, pixel *dst, int sstride,
int dstride, int x0, int y0, int sizex,
int sizey, int width, int height,
unsigned int strength),
void (*ref_clpf)(const pixel *src, pixel *dst, int sstride,
int dstride, int x0, int y0, int sizex,
int sizey, int width, int height,
unsigned int strength)) {
aom_usec_timer ref_timer;
aom_usec_timer timer;
aom_usec_timer_start(&ref_timer);
for (int c = 0; c < 65536; c++) {
for (ypos = 0; ypos < size; ypos += h) {
for (xpos = 0; xpos < size; xpos += w) {
for (strength = 0; strength < 3; strength++) {
ref_clpf(s, d, size, size, xpos, ypos, w, h, size, size,
1 << strength);
}
}
}
}
test_clpf(w, h, depth, iterations, ref_clpf, ref_clpf);
aom_usec_timer_mark(&ref_timer);
int ref_elapsed_time = aom_usec_timer_elapsed(&ref_timer);
aom_usec_timer_start(&timer);
for (int c = 0; c < 65536; c++) {
for (ypos = 0; ypos < size; ypos += h) {
for (xpos = 0; xpos < size; xpos += w) {
for (strength = 0; strength < 3; strength++) {
clpf(s, d, size, size, xpos, ypos, w, h, size, size, 1 << strength);
}
}
}
}
test_clpf(w, h, depth, iterations, clpf, clpf);
aom_usec_timer_mark(&timer);
int elapsed_time = aom_usec_timer_elapsed(&timer);
@@ -170,6 +190,24 @@ TEST_P(ClpfSpeedTest, TestSpeed) {
<< "SIMD time: " << elapsed_time << "ms" << std::endl;
}
// 8-bit correctness: exhaustive comparison of the SIMD implementation
// against the C reference (1 iteration, depth 8).
TEST_P(ClpfBlockTest, TestSIMDNoMismatch) {
  test_clpf(sizex, sizey, 8, 1, clpf, ref_clpf);
}

// 8-bit speed: times the SIMD version against the reference (16 iterations).
TEST_P(ClpfSpeedTest, TestSpeed) {
  test_clpf_speed(sizex, sizey, 8, 16, clpf, ref_clpf);
}

#if CONFIG_AOM_HIGHBITDEPTH
// 12-bit correctness test for the high bitdepth filter.
TEST_P(ClpfBlockHbdTest, TestSIMDNoMismatch) {
  test_clpf(sizex, sizey, 12, 1, clpf, ref_clpf);
}

// 12-bit speed test (a single iteration keeps the runtime reasonable).
TEST_P(ClpfHbdSpeedTest, TestSpeed) {
  test_clpf_speed(sizex, sizey, 12, 1, clpf, ref_clpf);
}
#endif
using std::tr1::make_tuple;
// Test all supported architectures and block sizes
@@ -213,6 +251,48 @@ INSTANTIATE_TEST_CASE_P(
4)));
#endif
#if CONFIG_AOM_HIGHBITDEPTH
// Instantiate the high bitdepth correctness tests for every SIMD flavour
// the build supports, over all four block geometries.
#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(
    SSE2, ClpfBlockHbdTest,
    ::testing::Values(
        make_tuple(&aom_clpf_block_hbd_sse2, &aom_clpf_block_hbd_c, 8, 8),
        make_tuple(&aom_clpf_block_hbd_sse2, &aom_clpf_block_hbd_c, 8, 4),
        make_tuple(&aom_clpf_block_hbd_sse2, &aom_clpf_block_hbd_c, 4, 8),
        make_tuple(&aom_clpf_block_hbd_sse2, &aom_clpf_block_hbd_c, 4, 4)));
#endif

#if HAVE_SSSE3
INSTANTIATE_TEST_CASE_P(
    SSSE3, ClpfBlockHbdTest,
    ::testing::Values(
        make_tuple(&aom_clpf_block_hbd_ssse3, &aom_clpf_block_hbd_c, 8, 8),
        make_tuple(&aom_clpf_block_hbd_ssse3, &aom_clpf_block_hbd_c, 8, 4),
        make_tuple(&aom_clpf_block_hbd_ssse3, &aom_clpf_block_hbd_c, 4, 8),
        make_tuple(&aom_clpf_block_hbd_ssse3, &aom_clpf_block_hbd_c, 4, 4)));
#endif

#if HAVE_SSE4_1
// Fixed instantiation prefix: was "SSSE4_1" (copy-paste typo from the SSSE3
// block); it must match the HAVE_SSE4_1 guard and the tested functions.
INSTANTIATE_TEST_CASE_P(
    SSE4_1, ClpfBlockHbdTest,
    ::testing::Values(
        make_tuple(&aom_clpf_block_hbd_sse4_1, &aom_clpf_block_hbd_c, 8, 8),
        make_tuple(&aom_clpf_block_hbd_sse4_1, &aom_clpf_block_hbd_c, 8, 4),
        make_tuple(&aom_clpf_block_hbd_sse4_1, &aom_clpf_block_hbd_c, 4, 8),
        make_tuple(&aom_clpf_block_hbd_sse4_1, &aom_clpf_block_hbd_c, 4, 4)));
#endif

#if HAVE_NEON
INSTANTIATE_TEST_CASE_P(
    NEON, ClpfBlockHbdTest,
    ::testing::Values(
        make_tuple(&aom_clpf_block_hbd_neon, &aom_clpf_block_hbd_c, 8, 8),
        make_tuple(&aom_clpf_block_hbd_neon, &aom_clpf_block_hbd_c, 8, 4),
        make_tuple(&aom_clpf_block_hbd_neon, &aom_clpf_block_hbd_c, 4, 8),
        make_tuple(&aom_clpf_block_hbd_neon, &aom_clpf_block_hbd_c, 4, 4)));
#endif
#endif
// Test speed for all supported architectures
#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(SSE2, ClpfSpeedTest,
@@ -237,4 +317,35 @@ INSTANTIATE_TEST_CASE_P(NEON, ClpfSpeedTest,
::testing::Values(make_tuple(&aom_clpf_block_neon,
&aom_clpf_block_c, 8, 8)));
#endif
#if CONFIG_AOM_HIGHBITDEPTH
// Instantiate the high bitdepth speed tests for each supported SIMD flavour.
#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(SSE2, ClpfHbdSpeedTest,
                        ::testing::Values(make_tuple(&aom_clpf_block_hbd_sse2,
                                                     &aom_clpf_block_hbd_c, 8,
                                                     8)));
#endif

#if HAVE_SSSE3
INSTANTIATE_TEST_CASE_P(SSSE3, ClpfHbdSpeedTest,
                        ::testing::Values(make_tuple(&aom_clpf_block_hbd_ssse3,
                                                     &aom_clpf_block_hbd_c, 8,
                                                     8)));
#endif

#if HAVE_SSE4_1
// Bug fix: this instantiation registered &aom_clpf_block_hbd_ssse3 (and the
// misspelt prefix "SSSE4_1") under the HAVE_SSE4_1 guard, so the SSE4.1
// speed test was actually timing the SSSE3 implementation.  Point it at the
// SSE4.1 function and use the matching prefix.
INSTANTIATE_TEST_CASE_P(SSE4_1, ClpfHbdSpeedTest,
                        ::testing::Values(
                            make_tuple(&aom_clpf_block_hbd_sse4_1,
                                       &aom_clpf_block_hbd_c, 8, 8)));
#endif

#if HAVE_NEON
INSTANTIATE_TEST_CASE_P(NEON, ClpfHbdSpeedTest,
                        ::testing::Values(make_tuple(&aom_clpf_block_hbd_neon,
                                                     &aom_clpf_block_hbd_c, 8,
                                                     8)));
#endif
#endif
} // namespace