From 3dbd55a6c46dc60e6d8f439a1c2044d7bd79111d Mon Sep 17 00:00:00 2001 From: Steinar Midtskogen Date: Fri, 9 Sep 2016 15:23:35 +0200 Subject: [PATCH] Added high bit-depth support in CLPF. Change-Id: Ic5eadb323227a820ad876c32d4dc296e05db6ece --- aom_dsp/aom_dsp_rtcd_defs.pl | 8 + av1/common/clpf.c | 141 +++++++- av1/common/clpf_simd.h | 373 +++++++++++-------- av1/encoder/clpf_rdo.c | 87 ++++- av1/encoder/clpf_rdo_simd.h | 669 ++++++++++++++++++++++------------- test/clpf_test.cc | 239 +++++++++---- 6 files changed, 1038 insertions(+), 479 deletions(-) diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl index 5f7384be7..14089bb48 100644 --- a/aom_dsp/aom_dsp_rtcd_defs.pl +++ b/aom_dsp/aom_dsp_rtcd_defs.pl @@ -587,6 +587,14 @@ add_proto qw/void aom_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint specialize qw/aom_lpf_horizontal_4_dual sse2 neon dspr2 msa/; if (aom_config("CONFIG_CLPF") eq "yes") { + if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") { + add_proto qw/void aom_clpf_block_hbd/, "const uint16_t *src, uint16_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, int width, int height, unsigned int strength"; + specialize qw/aom_clpf_block_hbd sse2 ssse3 sse4_1 neon/; + add_proto qw/void aom_clpf_detect_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int shift"; + specialize qw/aom_clpf_detect_hbd sse2 ssse3 sse4_1 neon/; + add_proto qw/void aom_clpf_detect_multi_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int shift"; + specialize qw/aom_clpf_detect_multi_hbd sse2 ssse3 sse4_1 neon/; + } add_proto qw/void aom_clpf_block/, "const uint8_t *src, uint8_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, int width, int height, unsigned int strength"; specialize qw/aom_clpf_block sse2 ssse3 sse4_1 neon/; 
add_proto qw/void aom_clpf_detect/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength"; diff --git a/av1/common/clpf.c b/av1/common/clpf.c index f4dfa1fa3..57ba0a9b1 100644 --- a/av1/common/clpf.c +++ b/av1/common/clpf.c @@ -8,6 +8,7 @@ * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ +#include <assert.h> #include "av1/common/clpf.h" #include "./aom_dsp_rtcd.h" #include "aom_dsp/aom_dsp_common.h" @@ -47,6 +48,29 @@ void aom_clpf_block_c(const uint8_t *src, uint8_t *dst, int sstride, } } +#if CONFIG_AOM_HIGHBITDEPTH +// Identical to aom_clpf_block_c() apart from "src" and "dst". +void aom_clpf_block_hbd_c(const uint16_t *src, uint16_t *dst, int sstride, + int dstride, int x0, int y0, int sizex, int sizey, + int width, int height, unsigned int strength) { + int x, y; + for (y = y0; y < y0 + sizey; y++) { + for (x = x0; x < x0 + sizex; x++) { + int X = src[y * sstride + x]; + int A = src[AOMMAX(0, y - 1) * sstride + x]; + int B = src[y * sstride + AOMMAX(0, x - 2)]; + int C = src[y * sstride + AOMMAX(0, x - 1)]; + int D = src[y * sstride + AOMMIN(width - 1, x + 1)]; + int E = src[y * sstride + AOMMIN(width - 1, x + 2)]; + int F = src[AOMMIN(height - 1, y + 1) * sstride + x]; + int delta; + delta = av1_clpf_sample(X, A, B, C, D, E, F, strength); + dst[y * dstride + x] = X + delta; + } + } +} +#endif + // Return number of filtered blocks int av1_clpf_frame(const YV12_BUFFER_CONFIG *orig_dst, const YV12_BUFFER_CONFIG *rec, const YV12_BUFFER_CONFIG *org, @@ -75,15 +99,27 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *orig_dst, const int cache_blocks = cache_size / (bs * bs); YV12_BUFFER_CONFIG dst = *orig_dst; + assert(bs == 8); // Optimised code assumes this. 
+ +#if CONFIG_AOM_HIGHBITDEPTH + strength <<= (cm->bit_depth - 8); +#endif + // Make buffer space for in-place filtering if (rec->y_buffer == dst.y_buffer) { +#if CONFIG_AOM_HIGHBITDEPTH + CHECK_MEM_ERROR(cm, cache, + aom_malloc(cache_size << !!cm->use_highbitdepth)); + dst.y_buffer = cm->use_highbitdepth ? CONVERT_TO_BYTEPTR(cache) : cache; +#else CHECK_MEM_ERROR(cm, cache, aom_malloc(cache_size)); + dst.y_buffer = cache; +#endif CHECK_MEM_ERROR(cm, cache_ptr, aom_malloc(cache_blocks * sizeof(*cache_ptr))); CHECK_MEM_ERROR(cm, cache_dst, aom_malloc(cache_blocks * sizeof(*cache_dst))); memset(cache_ptr, 0, cache_blocks * sizeof(*cache_dst)); - dst.y_buffer = cache; dstride = bs; } @@ -125,34 +161,108 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *orig_dst, // Temporary buffering needed if filtering in-place if (cache) { if (cache_ptr[cache_idx]) { - // Copy filtered block back into the frame +// Copy filtered block back into the frame +#if CONFIG_AOM_HIGHBITDEPTH + if (cm->use_highbitdepth) { + uint16_t *const d = + CONVERT_TO_SHORTPTR(cache_dst[cache_idx]); + for (c = 0; c < bs; c++) { + *(uint64_t *)(d + c * sstride) = + *(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2); + *(uint64_t *)(d + c * sstride + 4) = + *(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2 + 8); + } + } else { + for (c = 0; c < bs; c++) + *(uint64_t *)(cache_dst[cache_idx] + c * sstride) = + *(uint64_t *)(cache_ptr[cache_idx] + c * bs); + } +#else for (c = 0; c < bs; c++) *(uint64_t *)(cache_dst[cache_idx] + c * sstride) = *(uint64_t *)(cache_ptr[cache_idx] + c * bs); +#endif } +#if CONFIG_AOM_HIGHBITDEPTH + if (cm->use_highbitdepth) { + cache_ptr[cache_idx] = cache + cache_idx * bs * bs * 2; + dst.y_buffer = CONVERT_TO_BYTEPTR(cache_ptr[cache_idx]) - + ypos * bs - xpos; + } else { + cache_ptr[cache_idx] = cache + cache_idx * bs * bs; + dst.y_buffer = cache_ptr[cache_idx] - ypos * bs - xpos; + } +#else cache_ptr[cache_idx] = cache + cache_idx * bs * bs; dst.y_buffer = cache_ptr[cache_idx] - 
ypos * bs - xpos; +#endif cache_dst[cache_idx] = rec->y_buffer + ypos * sstride + xpos; if (++cache_idx >= cache_blocks) cache_idx = 0; } - // Apply the filter +// Apply the filter +#if CONFIG_AOM_HIGHBITDEPTH + if (cm->use_highbitdepth) { + aom_clpf_block_hbd(CONVERT_TO_SHORTPTR(rec->y_buffer), + CONVERT_TO_SHORTPTR(dst.y_buffer), sstride, + dstride, xpos, ypos, bs, bs, width, height, + strength); + } else { + aom_clpf_block(rec->y_buffer, dst.y_buffer, sstride, dstride, + xpos, ypos, bs, bs, width, height, strength); + } +#else aom_clpf_block(rec->y_buffer, dst.y_buffer, sstride, dstride, xpos, ypos, bs, bs, width, height, strength); - +#endif } else { // Skip block, copy instead - if (!cache) + if (!cache) { +#if CONFIG_AOM_HIGHBITDEPTH + if (cm->use_highbitdepth) { + uint16_t *const d = CONVERT_TO_SHORTPTR(dst.y_buffer); + const uint16_t *const s = CONVERT_TO_SHORTPTR(rec->y_buffer); + for (c = 0; c < bs; c++) { + *(uint64_t *)(d + (ypos + c) * dstride + xpos) = + *(uint64_t *)(s + (ypos + c) * sstride + xpos); + *(uint64_t *)(d + (ypos + c) * dstride + xpos + 4) = + *(uint64_t *)(s + (ypos + c) * sstride + xpos + 4); + } + } else { + for (c = 0; c < bs; c++) + *(uint64_t *)(dst.y_buffer + (ypos + c) * dstride + xpos) = + *(uint64_t *)(rec->y_buffer + (ypos + c) * sstride + + xpos); + } +#else for (c = 0; c < bs; c++) *(uint64_t *)(dst.y_buffer + (ypos + c) * dstride + xpos) = *( uint64_t *)(rec->y_buffer + (ypos + c) * sstride + xpos); +#endif + } } } } } else { // Entire filter block is skip, copy - if (!cache) + if (!cache) { +#if CONFIG_AOM_HIGHBITDEPTH + if (cm->use_highbitdepth) { + for (m = 0; m < h; m++) + memcpy(CONVERT_TO_SHORTPTR(dst.y_buffer) + (yoff + m) * dstride + + xoff, + CONVERT_TO_SHORTPTR(rec->y_buffer) + (yoff + m) * sstride + + xoff, + w * 2); + } else { + for (m = 0; m < h; m++) + memcpy(dst.y_buffer + (yoff + m) * dstride + xoff, + rec->y_buffer + (yoff + m) * sstride + xoff, w); + } +#else for (m = 0; m < h; m++) memcpy(dst.y_buffer + 
(yoff + m) * dstride + xoff, rec->y_buffer + (yoff + m) * sstride + xoff, w); +#endif + } } block_index += !allskip; // Count number of blocks filtered } @@ -161,10 +271,27 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *orig_dst, if (cache) { // Copy remaining blocks into the frame for (cache_idx = 0; cache_idx < cache_blocks && cache_ptr[cache_idx]; - cache_idx++) + cache_idx++) { +#if CONFIG_AOM_HIGHBITDEPTH + if (cm->use_highbitdepth) { + uint16_t *const d = CONVERT_TO_SHORTPTR(cache_dst[cache_idx]); + for (c = 0; c < bs; c++) { + *(uint64_t *)(d + c * sstride) = + *(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2); + *(uint64_t *)(d + c * sstride + 4) = + *(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2 + 8); + } + } else { + for (c = 0; c < bs; c++) + *(uint64_t *)(cache_dst[cache_idx] + c * sstride) = + *(uint64_t *)(cache_ptr[cache_idx] + c * bs); + } +#else for (c = 0; c < bs; c++) *(uint64_t *)(cache_dst[cache_idx] + c * sstride) = *(uint64_t *)(cache_ptr[cache_idx] + c * bs); +#endif + } aom_free(cache); aom_free(cache_ptr); diff --git a/av1/common/clpf_simd.h b/av1/common/clpf_simd.h index 544aa36f7..2507c151a 100644 --- a/av1/common/clpf_simd.h +++ b/av1/common/clpf_simd.h @@ -11,170 +11,129 @@ #include "./aom_dsp_rtcd.h" +SIMD_INLINE void calc_delta(v128 o, v128 x, v128 a, v128 b, v128 c, v128 d, + v128 e, v128 f, uint8_t *dst, v128 sp, v128 sm, + int dstride) { + const v128 c8 = v128_dup_8(8); + const v128 tmp = + v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm), + v128_max_s8(v128_min_s8(v128_ssub_s8(d, x), sp), sm)); + const v128 delta = v128_add_8( + v128_add_8( + v128_shl_8( + v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(a, x), sp), sm), + v128_max_s8(v128_min_s8(v128_ssub_s8(f, x), sp), sm)), + 2), + v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(b, x), sp), sm), + v128_max_s8(v128_min_s8(v128_ssub_s8(e, x), sp), sm))), + v128_add_8(v128_add_8(tmp, tmp), tmp)); + o = v128_add_8( + o, + v128_shr_s8( + v128_add_8(c8, 
v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()))), + 4)); + v64_store_aligned(dst, v128_high_v64(o)); + v64_store_aligned(dst + dstride, v128_low_v64(o)); +} + static void clpf_block(const uint8_t *src, uint8_t *dst, int sstride, int dstride, int x0, int y0, int sizey, int width, int height, unsigned int strength) { + int bottom = height - 2 - y0; + const v128 sp = v128_dup_8(strength); + const v128 sm = v128_dup_8(-(int)strength); + const v128 c128 = v128_dup_8(128); dst += x0 + y0 * dstride; src += x0 + y0 * sstride; - { - int bottom = height - 2 - y0; - const v128 sp = v128_dup_8(strength); - const v128 sm = v128_dup_8(-(int)strength); - const v128 c8 = v128_dup_8(8); - const v128 c128 = v128_dup_8(128); - if (!x0) { // Clip left - const v128 b_shuff = v128_from_v64(v64_from_64(0x0d0c0b0a09080808LL), - v64_from_64(0x0504030201000000LL)); - const v128 c_shuff = v128_from_v64(v64_from_64(0x0e0d0c0b0a090808LL), - v64_from_64(0x0605040302010000LL)); - int y; + if (!x0) { // Clip left + const v128 b_shuff = v128_from_v64(v64_from_64(0x0d0c0b0a09080808LL), + v64_from_64(0x0504030201000000LL)); + const v128 c_shuff = v128_from_v64(v64_from_64(0x0e0d0c0b0a090808LL), + v64_from_64(0x0605040302010000LL)); + int y; - for (y = 0; y < sizey; y += 2) { - const v64 l1 = v64_load_aligned(src); - const v64 l2 = v64_load_aligned(src + sstride); - v128 o = v128_from_v64(l1, l2); - const v128 x = v128_add_8(c128, o); - const v128 a = v128_add_8( - c128, - v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1)); - const v128 b = v128_shuffle_8(x, b_shuff); - const v128 c = v128_shuffle_8(x, c_shuff); - const v128 d = v128_add_8( - c128, v128_from_v64(v64_load_unaligned(src + 1), - v64_load_unaligned(src + 1 + sstride))); - const v128 e = v128_add_8( - c128, v128_from_v64(v64_load_unaligned(src + 2), - v64_load_unaligned(src + 2 + sstride))); - const v128 f = v128_add_8( - c128, v128_from_v64(l2, v64_load_aligned( - src + ((y != bottom) + 1) * sstride))); + for (y = 0; y 
< sizey; y += 2) { + const v64 l1 = v64_load_aligned(src); + const v64 l2 = v64_load_aligned(src + sstride); + v128 o = v128_from_v64(l1, l2); + const v128 x = v128_add_8(c128, o); + const v128 a = v128_add_8( + c128, + v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1)); + const v128 b = v128_shuffle_8(x, b_shuff); + const v128 c = v128_shuffle_8(x, c_shuff); + const v128 d = v128_add_8( + c128, v128_from_v64(v64_load_unaligned(src + 1), + v64_load_unaligned(src + 1 + sstride))); + const v128 e = v128_add_8( + c128, v128_from_v64(v64_load_unaligned(src + 2), + v64_load_unaligned(src + 2 + sstride))); + const v128 f = v128_add_8( + c128, v128_from_v64( + l2, v64_load_aligned(src + ((y != bottom) + 1) * sstride))); + calc_delta(o, x, a, b, c, d, e, f, dst, sp, sm, dstride); + src += sstride * 2; + dst += dstride * 2; + } + } else if (!(width - x0 - 8)) { // Clip right + const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0f0e0d0c0b0a09LL), + v64_from_64(0x0707060504030201LL)); + const v128 e_shuff = v128_from_v64(v64_from_64(0x0f0f0f0e0d0c0b0aLL), + v64_from_64(0x0707070605040302LL)); + int y; - const v128 tmp = - v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm), - v128_max_s8(v128_min_s8(v128_ssub_s8(d, x), sp), sm)); - const v128 delta = v128_add_8( - v128_add_8( - v128_shl_8( - v128_add_8( - v128_max_s8(v128_min_s8(v128_ssub_s8(a, x), sp), sm), - v128_max_s8(v128_min_s8(v128_ssub_s8(f, x), sp), sm)), - 2), - v128_add_8( - v128_max_s8(v128_min_s8(v128_ssub_s8(b, x), sp), sm), - v128_max_s8(v128_min_s8(v128_ssub_s8(e, x), sp), sm))), - v128_add_8(v128_add_8(tmp, tmp), tmp)); - o = v128_add_8( - o, v128_shr_s8( - v128_add_8(c8, v128_add_8(delta, v128_cmplt_s8( - delta, v128_zero()))), - 4)); - v64_store_aligned(dst, v128_high_v64(o)); - v64_store_aligned(dst + dstride, v128_low_v64(o)); - src += sstride * 2; - dst += dstride * 2; - } - } else if (!(width - x0 - 8)) { // Clip right - const v128 d_shuff = 
v128_from_v64(v64_from_64(0x0f0f0e0d0c0b0a09LL), - v64_from_64(0x0707060504030201LL)); - const v128 e_shuff = v128_from_v64(v64_from_64(0x0f0f0f0e0d0c0b0aLL), - v64_from_64(0x0707070605040302LL)); - int y; - - for (y = 0; y < sizey; y += 2) { - const v64 l1 = v64_load_aligned(src); - const v64 l2 = v64_load_aligned(src + sstride); - v128 o = v128_from_v64(l1, l2); - const v128 x = v128_add_8(c128, o); - const v128 a = v128_add_8( - c128, - v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1)); - const v128 b = v128_add_8( - c128, v128_from_v64(v64_load_unaligned(src - 2), - v64_load_unaligned(src - 2 + sstride))); - const v128 c = v128_add_8( - c128, v128_from_v64(v64_load_unaligned(src - 1), - v64_load_unaligned(src - 1 + sstride))); - const v128 d = v128_shuffle_8(x, d_shuff); - const v128 e = v128_shuffle_8(x, e_shuff); - const v128 f = v128_add_8( - c128, v128_from_v64(l2, v64_load_aligned( - src + ((y != bottom) + 1) * sstride))); - - const v128 tmp = - v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm), - v128_max_s8(v128_min_s8(v128_ssub_s8(d, x), sp), sm)); - const v128 delta = v128_add_8( - v128_add_8( - v128_shl_8( - v128_add_8( - v128_max_s8(v128_min_s8(v128_ssub_s8(a, x), sp), sm), - v128_max_s8(v128_min_s8(v128_ssub_s8(f, x), sp), sm)), - 2), - v128_add_8( - v128_max_s8(v128_min_s8(v128_ssub_s8(b, x), sp), sm), - v128_max_s8(v128_min_s8(v128_ssub_s8(e, x), sp), sm))), - v128_add_8(v128_add_8(tmp, tmp), tmp)); - o = v128_add_8( - o, v128_shr_s8( - v128_add_8(c8, v128_add_8(delta, v128_cmplt_s8( - delta, v128_zero()))), - 4)); - v64_store_aligned(dst, v128_high_v64(o)); - v64_store_aligned(dst + dstride, v128_low_v64(o)); - src += sstride * 2; - dst += dstride * 2; - } - } else { // No left/right clipping - int y; - for (y = 0; y < sizey; y += 2) { - const v64 l1 = v64_load_aligned(src); - const v64 l2 = v64_load_aligned(src + sstride); - v128 o = v128_from_v64(l1, l2); - const v128 x = v128_add_8(c128, o); - const v128 a = 
v128_add_8( - c128, - v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1)); - const v128 b = v128_add_8( - c128, v128_from_v64(v64_load_unaligned(src - 2), - v64_load_unaligned(src - 2 + sstride))); - const v128 c = v128_add_8( - c128, v128_from_v64(v64_load_unaligned(src - 1), - v64_load_unaligned(src - 1 + sstride))); - const v128 d = v128_add_8( - c128, v128_from_v64(v64_load_unaligned(src + 1), - v64_load_unaligned(src + 1 + sstride))); - const v128 e = v128_add_8( - c128, v128_from_v64(v64_load_unaligned(src + 2), - v64_load_unaligned(src + 2 + sstride))); - const v128 f = v128_add_8( - c128, v128_from_v64(l2, v64_load_aligned( - src + ((y != bottom) + 1) * sstride))); - - const v128 tmp = - v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm), - v128_max_s8(v128_min_s8(v128_ssub_s8(d, x), sp), sm)); - const v128 delta = v128_add_8( - v128_add_8( - v128_shl_8( - v128_add_8( - v128_max_s8(v128_min_s8(v128_ssub_s8(a, x), sp), sm), - v128_max_s8(v128_min_s8(v128_ssub_s8(f, x), sp), sm)), - 2), - v128_add_8( - v128_max_s8(v128_min_s8(v128_ssub_s8(b, x), sp), sm), - v128_max_s8(v128_min_s8(v128_ssub_s8(e, x), sp), sm))), - v128_add_8(v128_add_8(tmp, tmp), tmp)); - o = v128_add_8( - o, v128_shr_s8( - v128_add_8(c8, v128_add_8(delta, v128_cmplt_s8( - delta, v128_zero()))), - 4)); - v64_store_aligned(dst, v128_high_v64(o)); - v64_store_aligned(dst + dstride, v128_low_v64(o)); - src += sstride * 2; - dst += dstride * 2; - } + for (y = 0; y < sizey; y += 2) { + const v64 l1 = v64_load_aligned(src); + const v64 l2 = v64_load_aligned(src + sstride); + v128 o = v128_from_v64(l1, l2); + const v128 x = v128_add_8(c128, o); + const v128 a = v128_add_8( + c128, + v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1)); + const v128 b = v128_add_8( + c128, v128_from_v64(v64_load_unaligned(src - 2), + v64_load_unaligned(src - 2 + sstride))); + const v128 c = v128_add_8( + c128, v128_from_v64(v64_load_unaligned(src - 1), + v64_load_unaligned(src - 
1 + sstride))); + const v128 d = v128_shuffle_8(x, d_shuff); + const v128 e = v128_shuffle_8(x, e_shuff); + const v128 f = v128_add_8( + c128, v128_from_v64( + l2, v64_load_aligned(src + ((y != bottom) + 1) * sstride))); + calc_delta(o, x, a, b, c, d, e, f, dst, sp, sm, dstride); + src += sstride * 2; + dst += dstride * 2; + } + } else { // No left/right clipping + int y; + for (y = 0; y < sizey; y += 2) { + const v64 l1 = v64_load_aligned(src); + const v64 l2 = v64_load_aligned(src + sstride); + v128 o = v128_from_v64(l1, l2); + const v128 x = v128_add_8(c128, o); + const v128 a = v128_add_8( + c128, + v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1)); + const v128 b = v128_add_8( + c128, v128_from_v64(v64_load_unaligned(src - 2), + v64_load_unaligned(src - 2 + sstride))); + const v128 c = v128_add_8( + c128, v128_from_v64(v64_load_unaligned(src - 1), + v64_load_unaligned(src - 1 + sstride))); + const v128 d = v128_add_8( + c128, v128_from_v64(v64_load_unaligned(src + 1), + v64_load_unaligned(src + 1 + sstride))); + const v128 e = v128_add_8( + c128, v128_from_v64(v64_load_unaligned(src + 2), + v64_load_unaligned(src + 2 + sstride))); + const v128 f = v128_add_8( + c128, v128_from_v64( + l2, v64_load_aligned(src + ((y != bottom) + 1) * sstride))); + calc_delta(o, x, a, b, c, d, e, f, dst, sp, sm, dstride); + src += sstride * 2; + dst += dstride * 2; } } } @@ -197,3 +156,105 @@ void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int sstride, strength); } } + +#if CONFIG_AOM_HIGHBITDEPTH +static void calc_delta_hbd(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e, + v128 f, uint16_t *dst, v128 sp, v128 sm) { + const v128 c8 = v128_dup_16(8); + const v128 tmp = + v128_add_16(v128_max_s16(v128_min_s16(v128_sub_16(c, o), sp), sm), + v128_max_s16(v128_min_s16(v128_sub_16(d, o), sp), sm)); + const v128 delta = v128_add_16( + v128_add_16( + v128_shl_16( + v128_add_16( + v128_max_s16(v128_min_s16(v128_sub_16(a, o), sp), sm), + 
v128_max_s16(v128_min_s16(v128_sub_16(f, o), sp), sm)), + 2), + v128_add_16(v128_max_s16(v128_min_s16(v128_sub_16(b, o), sp), sm), + v128_max_s16(v128_min_s16(v128_sub_16(e, o), sp), sm))), + v128_add_16(v128_add_16(tmp, tmp), tmp)); + v128_store_aligned( + dst, + v128_add_16( + o, v128_shr_s16( + v128_add_16(c8, v128_add_16(delta, v128_cmplt_s16( + delta, v128_zero()))), + 4))); +} + +SIMD_INLINE void clpf_block_hbd(const uint16_t *src, uint16_t *dst, int sstride, + int dstride, int x0, int y0, int sizey, + int width, int height, unsigned int strength) { + int y; + int bottom = height - 2 - y0; + const v128 sp = v128_dup_16(strength); + const v128 sm = v128_dup_16(-(int)strength); + + dst += x0 + y0 * dstride; + src += x0 + y0 * sstride; + + if (!x0) { // Clip left + const v128 b_shuff = v128_from_v64(v64_from_64(0x0b0a090807060504LL), + v64_from_64(0x0302010001000100LL)); + const v128 c_shuff = v128_from_v64(v64_from_64(0x0d0c0b0a09080706LL), + v64_from_64(0x0504030201000100LL)); + for (y = 0; y < sizey; y++) { + const v128 o = v128_load_aligned(src); + const v128 a = v128_load_aligned(src - (y != -y0) * sstride); + const v128 b = v128_shuffle_8(o, b_shuff); + const v128 c = v128_shuffle_8(o, c_shuff); + const v128 d = v128_load_unaligned(src + 1); + const v128 e = v128_load_unaligned(src + 2); + const v128 f = v128_load_aligned(src + (y - 1 != bottom) * sstride); + calc_delta_hbd(o, a, b, c, d, e, f, dst, sp, sm); + src += sstride; + dst += dstride; + } + } else if (!(width - x0 - 8)) { // Clip right + const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0e0f0e0d0c0b0aLL), + v64_from_64(0x0908070605040302LL)); + const v128 e_shuff = v128_from_v64(v64_from_64(0x0f0e0f0e0f0e0d0cLL), + v64_from_64(0x0b0a090807060504LL)); + for (y = 0; y < sizey; y++) { + const v128 o = v128_load_aligned(src); + const v128 a = v128_load_aligned(src - (y != -y0) * sstride); + const v128 b = v128_load_unaligned(src - 2); + const v128 c = v128_load_unaligned(src - 1); + const v128 d = 
v128_shuffle_8(o, d_shuff); + const v128 e = v128_shuffle_8(o, e_shuff); + const v128 f = v128_load_aligned(src + (y - 1 != bottom) * sstride); + calc_delta_hbd(o, a, b, c, d, e, f, dst, sp, sm); + src += sstride; + dst += dstride; + } + } else { // No left/right clipping + for (y = 0; y < sizey; y++) { + const v128 o = v128_load_aligned(src); + const v128 a = v128_load_aligned(src - (y != -y0) * sstride); + const v128 b = v128_load_unaligned(src - 2); + const v128 c = v128_load_unaligned(src - 1); + const v128 d = v128_load_unaligned(src + 1); + const v128 e = v128_load_unaligned(src + 2); + const v128 f = v128_load_aligned(src + (y - 1 != bottom) * sstride); + calc_delta_hbd(o, a, b, c, d, e, f, dst, sp, sm); + src += sstride; + dst += dstride; + } + } +} + +void SIMD_FUNC(aom_clpf_block_hbd)(const uint16_t *src, uint16_t *dst, + int sstride, int dstride, int x0, int y0, + int sizex, int sizey, int width, int height, + unsigned int strength) { + if (sizex != 8 || width < 16 || y0 + 8 > height || x0 + 8 > width) { + // Fallback to C for odd sizes + aom_clpf_block_hbd_c(src, dst, sstride, dstride, x0, y0, sizex, sizey, + width, height, strength); + } else { + clpf_block_hbd(src, dst, sstride, dstride, x0, y0, sizey, width, height, + strength); + } +} +#endif diff --git a/av1/encoder/clpf_rdo.c b/av1/encoder/clpf_rdo.c index 4221505dd..7bb64ae5c 100644 --- a/av1/encoder/clpf_rdo.c +++ b/av1/encoder/clpf_rdo.c @@ -66,6 +66,62 @@ void aom_clpf_detect_multi_c(const uint8_t *rec, const uint8_t *org, } } +#if CONFIG_AOM_HIGHBITDEPTH +// Identical to aom_clpf_detect_c() apart from "rec" and "org". 
+void aom_clpf_detect_hbd_c(const uint16_t *rec, const uint16_t *org, + int rstride, int ostride, int x0, int y0, int width, + int height, int *sum0, int *sum1, + unsigned int strength, int shift) { + int x, y; + for (y = y0; y < y0 + 8; y++) { + for (x = x0; x < x0 + 8; x++) { + int O = org[y * ostride + x] >> shift; + int X = rec[y * rstride + x] >> shift; + int A = rec[AOMMAX(0, y - 1) * rstride + x] >> shift; + int B = rec[y * rstride + AOMMAX(0, x - 2)] >> shift; + int C = rec[y * rstride + AOMMAX(0, x - 1)] >> shift; + int D = rec[y * rstride + AOMMIN(width - 1, x + 1)] >> shift; + int E = rec[y * rstride + AOMMIN(width - 1, x + 2)] >> shift; + int F = rec[AOMMIN(height - 1, y + 1) * rstride + x] >> shift; + int delta = av1_clpf_sample(X, A, B, C, D, E, F, strength >> shift); + int Y = X + delta; + *sum0 += (O - X) * (O - X); + *sum1 += (O - Y) * (O - Y); + } + } +} + +// aom_clpf_detect_multi_c() apart from "rec" and "org". +void aom_clpf_detect_multi_hbd_c(const uint16_t *rec, const uint16_t *org, + int rstride, int ostride, int x0, int y0, + int width, int height, int *sum, int shift) { + int x, y; + + for (y = y0; y < y0 + 8; y++) { + for (x = x0; x < x0 + 8; x++) { + int O = org[y * ostride + x] >> shift; + int X = rec[y * rstride + x] >> shift; + int A = rec[AOMMAX(0, y - 1) * rstride + x] >> shift; + int B = rec[y * rstride + AOMMAX(0, x - 2)] >> shift; + int C = rec[y * rstride + AOMMAX(0, x - 1)] >> shift; + int D = rec[y * rstride + AOMMIN(width - 1, x + 1)] >> shift; + int E = rec[y * rstride + AOMMIN(width - 1, x + 2)] >> shift; + int F = rec[AOMMIN(height - 1, y + 1) * rstride + x] >> shift; + int delta1 = av1_clpf_sample(X, A, B, C, D, E, F, 1); + int delta2 = av1_clpf_sample(X, A, B, C, D, E, F, 2); + int delta3 = av1_clpf_sample(X, A, B, C, D, E, F, 4); + int F1 = X + delta1; + int F2 = X + delta2; + int F3 = X + delta3; + sum[0] += (O - X) * (O - X); + sum[1] += (O - F1) * (O - F1); + sum[2] += (O - F2) * (O - F2); + sum[3] += (O - F3) * (O - 
F3); + } + } +} +#endif + int av1_clpf_decision(int k, int l, const YV12_BUFFER_CONFIG *rec, const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm, int block_size, int w, int h, unsigned int strength, @@ -77,10 +133,25 @@ int av1_clpf_decision(int k, int l, const YV12_BUFFER_CONFIG *rec, int ypos = (k << fb_size_log2) + m * block_size; const int bs = MAX_MIB_SIZE; if (!cm->mi_grid_visible[ypos / bs * cm->mi_stride + xpos / bs] - ->mbmi.skip) + ->mbmi.skip) { +#if CONFIG_AOM_HIGHBITDEPTH + if (cm->use_highbitdepth) { + aom_clpf_detect_hbd(CONVERT_TO_SHORTPTR(rec->y_buffer), + CONVERT_TO_SHORTPTR(org->y_buffer), rec->y_stride, + org->y_stride, xpos, ypos, rec->y_crop_width, + rec->y_crop_height, &sum0, &sum1, strength, + cm->bit_depth - 8); + } else { + aom_clpf_detect(rec->y_buffer, org->y_buffer, rec->y_stride, + org->y_stride, xpos, ypos, rec->y_crop_width, + rec->y_crop_height, &sum0, &sum1, strength); + } +#else aom_clpf_detect(rec->y_buffer, org->y_buffer, rec->y_stride, org->y_stride, xpos, ypos, rec->y_crop_width, rec->y_crop_height, &sum0, &sum1, strength); +#endif + } } } *res = sum1 < sum0; @@ -145,9 +216,23 @@ static int clpf_rdo(int y, int x, const YV12_BUFFER_CONFIG *rec, if (!cm->mi_grid_visible[ypos / MAX_MIB_SIZE * cm->mi_stride + xpos / MAX_MIB_SIZE] ->mbmi.skip) { +#if CONFIG_AOM_HIGHBITDEPTH + if (cm->use_highbitdepth) { + aom_clpf_detect_multi_hbd(CONVERT_TO_SHORTPTR(rec->y_buffer), + CONVERT_TO_SHORTPTR(org->y_buffer), + rec->y_stride, org->y_stride, xpos, ypos, + rec->y_crop_width, rec->y_crop_height, sum, + cm->bit_depth - 8); + } else { + aom_clpf_detect_multi(rec->y_buffer, org->y_buffer, rec->y_stride, + org->y_stride, xpos, ypos, rec->y_crop_width, + rec->y_crop_height, sum); + } +#else aom_clpf_detect_multi(rec->y_buffer, org->y_buffer, rec->y_stride, org->y_stride, xpos, ypos, rec->y_crop_width, rec->y_crop_height, sum); +#endif filtered = 1; } } diff --git a/av1/encoder/clpf_rdo_simd.h b/av1/encoder/clpf_rdo_simd.h index 
abbbe7c07..1bc5af647 100644 --- a/av1/encoder/clpf_rdo_simd.h +++ b/av1/encoder/clpf_rdo_simd.h @@ -11,6 +11,27 @@ #include "aom_dsp/aom_simd.h" +SIMD_INLINE v128 calc_delta(v128 x, v128 a, v128 b, v128 c, v128 d, v128 e, + v128 f, v128 sp, v128 sm) { + const v128 tmp = + v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm), + v128_max_s8(v128_min_s8(v128_ssub_s8(d, x), sp), sm)); + v128 delta = v128_add_8( + v128_add_8( + v128_shl_8( + v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(a, x), sp), sm), + v128_max_s8(v128_min_s8(v128_ssub_s8(f, x), sp), sm)), + 2), + v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(b, x), sp), sm), + v128_max_s8(v128_min_s8(v128_ssub_s8(e, x), sp), sm))), + v128_add_8(v128_add_8(tmp, tmp), tmp)); + + return v128_shr_s8( + v128_add_8(v128_dup_8(8), + v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()))), + 4); +} + void SIMD_FUNC(aom_clpf_detect)(const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, @@ -54,27 +75,9 @@ void SIMD_FUNC(aom_clpf_detect)(const uint8_t *rec, const uint8_t *org, const v128 f = v128_add_8( c128, v128_from_v64( l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride))); - - const v128 tmp = - v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm), - v128_max_s8(v128_min_s8(v128_ssub_s8(d, x), sp), sm)); - v128 delta = v128_add_8( - v128_add_8( - v128_shl_8( - v128_add_8( - v128_max_s8(v128_min_s8(v128_ssub_s8(a, x), sp), sm), - v128_max_s8(v128_min_s8(v128_ssub_s8(f, x), sp), sm)), - 2), - v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(b, x), sp), sm), - v128_max_s8(v128_min_s8(v128_ssub_s8(e, x), sp), sm))), - v128_add_8(v128_add_8(tmp, tmp), tmp)); - - delta = v128_shr_s8( - v128_add_8(v128_dup_8(8), - v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()))), - 4); ssd0 = v128_ssd_u8(ssd0, o, q); - ssd1 = v128_ssd_u8(ssd1, o, v128_add_8(q, delta)); + ssd1 = v128_ssd_u8( + ssd1, o, v128_add_8(q, calc_delta(x, a, b, c, d, 
e, f, sp, sm))); rec += rstride * 2; org += ostride * 2; } @@ -107,26 +110,9 @@ void SIMD_FUNC(aom_clpf_detect)(const uint8_t *rec, const uint8_t *org, const v128 f = v128_add_8( c128, v128_from_v64( l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride))); - - const v128 tmp = - v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm), - v128_max_s8(v128_min_s8(v128_ssub_s8(d, x), sp), sm)); - v128 delta = v128_add_8( - v128_add_8( - v128_shl_8( - v128_add_8( - v128_max_s8(v128_min_s8(v128_ssub_s8(a, x), sp), sm), - v128_max_s8(v128_min_s8(v128_ssub_s8(f, x), sp), sm)), - 2), - v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(b, x), sp), sm), - v128_max_s8(v128_min_s8(v128_ssub_s8(e, x), sp), sm))), - v128_add_8(v128_add_8(tmp, tmp), tmp)); - delta = v128_shr_s8( - v128_add_8(v128_dup_8(8), - v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()))), - 4); ssd0 = v128_ssd_u8(ssd0, o, q); - ssd1 = v128_ssd_u8(ssd1, o, v128_add_8(q, delta)); + ssd1 = v128_ssd_u8( + ssd1, o, v128_add_8(q, calc_delta(x, a, b, c, d, e, f, sp, sm))); rec += rstride * 2; org += ostride * 2; } @@ -158,27 +144,9 @@ void SIMD_FUNC(aom_clpf_detect)(const uint8_t *rec, const uint8_t *org, const v128 f = v128_add_8( c128, v128_from_v64( l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride))); - - const v128 tmp = - v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm), - v128_max_s8(v128_min_s8(v128_ssub_s8(d, x), sp), sm)); - v128 delta = v128_add_8( - v128_add_8( - v128_shl_8( - v128_add_8( - v128_max_s8(v128_min_s8(v128_ssub_s8(a, x), sp), sm), - v128_max_s8(v128_min_s8(v128_ssub_s8(f, x), sp), sm)), - 2), - v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(b, x), sp), sm), - v128_max_s8(v128_min_s8(v128_ssub_s8(e, x), sp), sm))), - v128_add_8(v128_add_8(tmp, tmp), tmp)); - delta = v128_shr_s8( - v128_add_8(v128_dup_8(8), - v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()))), - 4); - ssd0 = v128_ssd_u8(ssd0, o, q); - ssd1 = v128_ssd_u8(ssd1, o, v128_add_8(q, delta)); + 
ssd1 = v128_ssd_u8( + ssd1, o, v128_add_8(q, calc_delta(x, a, b, c, d, e, f, sp, sm))); rec += rstride * 2; org += ostride * 2; } @@ -187,8 +155,73 @@ void SIMD_FUNC(aom_clpf_detect)(const uint8_t *rec, const uint8_t *org, *sum1 += v128_ssd_u8_sum(ssd1); } -// Test multiple filter strengths at once. Use a simpler filter (4 tap, every -// second line). +SIMD_INLINE void calc_delta_multi(v128 x, v128 q, v128 o, v128 a, v128 b, + v128 c, v128 d, v128 e, v128 f, v128 cp1, + v128 cm1, v128 cp2, v128 cm2, v128 cp4, + v128 cm4, ssd128_internal *ssd1, + ssd128_internal *ssd2, + ssd128_internal *ssd3) { + v128 tmp, delta1, delta2, delta3; + const v128 c8 = v128_dup_8(8); + + a = v128_ssub_s8(a, x); + b = v128_ssub_s8(b, x); + c = v128_ssub_s8(c, x); + d = v128_ssub_s8(d, x); + e = v128_ssub_s8(e, x); + f = v128_ssub_s8(f, x); + tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp1), cm1), + v128_max_s8(v128_min_s8(d, cp1), cm1)); + delta1 = v128_add_8( + v128_add_8(v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp1), cm1), + v128_max_s8(v128_min_s8(f, cp1), cm1)), + 2), + v128_add_8(v128_max_s8(v128_min_s8(b, cp1), cm1), + v128_max_s8(v128_min_s8(e, cp1), cm1))), + v128_add_8(v128_add_8(tmp, tmp), tmp)); + tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp2), cm2), + v128_max_s8(v128_min_s8(d, cp2), cm2)); + delta2 = v128_add_8( + v128_add_8(v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp2), cm2), + v128_max_s8(v128_min_s8(f, cp2), cm2)), + 2), + v128_add_8(v128_max_s8(v128_min_s8(b, cp2), cm2), + v128_max_s8(v128_min_s8(e, cp2), cm2))), + v128_add_8(v128_add_8(tmp, tmp), tmp)); + tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp4), cm4), + v128_max_s8(v128_min_s8(d, cp4), cm4)); + delta3 = v128_add_8( + v128_add_8(v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp4), cm4), + v128_max_s8(v128_min_s8(f, cp4), cm4)), + 2), + v128_add_8(v128_max_s8(v128_min_s8(b, cp4), cm4), + v128_max_s8(v128_min_s8(e, cp4), cm4))), + v128_add_8(v128_add_8(tmp, tmp), tmp)); + + *ssd1 = v128_ssd_u8( 
+ *ssd1, o, + v128_add_8( + q, v128_shr_s8( + v128_add_8(c8, v128_add_8(delta1, + v128_cmplt_s8(delta1, v128_zero()))), + 4))); + *ssd2 = v128_ssd_u8( + *ssd2, o, + v128_add_8( + q, v128_shr_s8( + v128_add_8(c8, v128_add_8(delta2, + v128_cmplt_s8(delta2, v128_zero()))), + 4))); + *ssd3 = v128_ssd_u8( + *ssd3, o, + v128_add_8( + q, v128_shr_s8( + v128_add_8(c8, v128_add_8(delta3, + v128_cmplt_s8(delta3, v128_zero()))), + 4))); +} + +// Test multiple filter strengths at once. void SIMD_FUNC(aom_clpf_detect_multi)(const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum) { @@ -199,7 +232,6 @@ void SIMD_FUNC(aom_clpf_detect_multi)(const uint8_t *rec, const uint8_t *org, const v128 cm2 = v128_dup_8(-2); const v128 cp4 = v128_dup_8(4); const v128 cm4 = v128_dup_8(-4); - const v128 c8 = v128_dup_8(8); const int bottom = height - 2 - y0; ssd128_internal ssd0 = v128_ssd_u8_init(); ssd128_internal ssd1 = v128_ssd_u8_init(); @@ -238,70 +270,9 @@ void SIMD_FUNC(aom_clpf_detect_multi)(const uint8_t *rec, const uint8_t *org, v128 f = v128_add_8( c128, v128_from_v64( l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride))); - v128 tmp, delta1, delta2, delta3; - - a = v128_ssub_s8(a, x); - b = v128_ssub_s8(b, x); - c = v128_ssub_s8(c, x); - d = v128_ssub_s8(d, x); - e = v128_ssub_s8(e, x); - f = v128_ssub_s8(f, x); - tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp1), cm1), - v128_max_s8(v128_min_s8(d, cp1), cm1)); - delta1 = v128_add_8( - v128_add_8( - v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp1), cm1), - v128_max_s8(v128_min_s8(f, cp1), cm1)), - 2), - v128_add_8(v128_max_s8(v128_min_s8(b, cp1), cm1), - v128_max_s8(v128_min_s8(e, cp1), cm1))), - v128_add_8(v128_add_8(tmp, tmp), tmp)); - tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp2), cm2), - v128_max_s8(v128_min_s8(d, cp2), cm2)); - delta2 = v128_add_8( - v128_add_8( - v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp2), cm2), - v128_max_s8(v128_min_s8(f, 
cp2), cm2)), - 2), - v128_add_8(v128_max_s8(v128_min_s8(b, cp2), cm2), - v128_max_s8(v128_min_s8(e, cp2), cm2))), - v128_add_8(v128_add_8(tmp, tmp), tmp)); - tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp4), cm4), - v128_max_s8(v128_min_s8(d, cp4), cm4)); - delta3 = v128_add_8( - v128_add_8( - v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp4), cm4), - v128_max_s8(v128_min_s8(f, cp4), cm4)), - 2), - v128_add_8(v128_max_s8(v128_min_s8(b, cp4), cm4), - v128_max_s8(v128_min_s8(e, cp4), cm4))), - v128_add_8(v128_add_8(tmp, tmp), tmp)); - ssd0 = v128_ssd_u8(ssd0, o, q); - ssd1 = v128_ssd_u8( - ssd1, o, - v128_add_8( - q, - v128_shr_s8( - v128_add_8(c8, v128_add_8(delta1, v128_cmplt_s8( - delta1, v128_zero()))), - 4))); - ssd2 = v128_ssd_u8( - ssd2, o, - v128_add_8( - q, - v128_shr_s8( - v128_add_8(c8, v128_add_8(delta2, v128_cmplt_s8( - delta2, v128_zero()))), - 4))); - ssd3 = v128_ssd_u8( - ssd3, o, - v128_add_8( - q, - v128_shr_s8( - v128_add_8(c8, v128_add_8(delta3, v128_cmplt_s8( - delta3, v128_zero()))), - 4))); + calc_delta_multi(x, q, o, a, b, c, d, e, f, cp1, cm1, cp2, cm2, cp4, cm4, + &ssd1, &ssd2, &ssd3); rec += 2 * rstride; org += 2 * ostride; } @@ -334,70 +305,9 @@ void SIMD_FUNC(aom_clpf_detect_multi)(const uint8_t *rec, const uint8_t *org, v128 f = v128_add_8( c128, v128_from_v64( l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride))); - v128 tmp, delta1, delta2, delta3; - - a = v128_ssub_s8(a, x); - b = v128_ssub_s8(b, x); - c = v128_ssub_s8(c, x); - d = v128_ssub_s8(d, x); - e = v128_ssub_s8(e, x); - f = v128_ssub_s8(f, x); - tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp1), cm1), - v128_max_s8(v128_min_s8(d, cp1), cm1)); - delta1 = v128_add_8( - v128_add_8( - v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp1), cm1), - v128_max_s8(v128_min_s8(f, cp1), cm1)), - 2), - v128_add_8(v128_max_s8(v128_min_s8(b, cp1), cm1), - v128_max_s8(v128_min_s8(e, cp1), cm1))), - v128_add_8(v128_add_8(tmp, tmp), tmp)); - tmp = v128_add_8(v128_max_s8(v128_min_s8(c, 
cp2), cm2), - v128_max_s8(v128_min_s8(d, cp2), cm2)); - delta2 = v128_add_8( - v128_add_8( - v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp2), cm2), - v128_max_s8(v128_min_s8(f, cp2), cm2)), - 2), - v128_add_8(v128_max_s8(v128_min_s8(b, cp2), cm2), - v128_max_s8(v128_min_s8(e, cp2), cm2))), - v128_add_8(v128_add_8(tmp, tmp), tmp)); - tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp4), cm4), - v128_max_s8(v128_min_s8(d, cp4), cm4)); - delta3 = v128_add_8( - v128_add_8( - v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp4), cm4), - v128_max_s8(v128_min_s8(f, cp4), cm4)), - 2), - v128_add_8(v128_max_s8(v128_min_s8(b, cp4), cm4), - v128_max_s8(v128_min_s8(e, cp4), cm4))), - v128_add_8(v128_add_8(tmp, tmp), tmp)); - ssd0 = v128_ssd_u8(ssd0, o, q); - ssd1 = v128_ssd_u8( - ssd1, o, - v128_add_8( - q, - v128_shr_s8( - v128_add_8(c8, v128_add_8(delta1, v128_cmplt_s8( - delta1, v128_zero()))), - 4))); - ssd2 = v128_ssd_u8( - ssd2, o, - v128_add_8( - q, - v128_shr_s8( - v128_add_8(c8, v128_add_8(delta2, v128_cmplt_s8( - delta2, v128_zero()))), - 4))); - ssd3 = v128_ssd_u8( - ssd3, o, - v128_add_8( - q, - v128_shr_s8( - v128_add_8(c8, v128_add_8(delta3, v128_cmplt_s8( - delta3, v128_zero()))), - 4))); + calc_delta_multi(x, q, o, a, b, c, d, e, f, cp1, cm1, cp2, cm2, cp4, cm4, + &ssd1, &ssd2, &ssd3); rec += 2 * rstride; org += 2 * ostride; } @@ -429,70 +339,9 @@ void SIMD_FUNC(aom_clpf_detect_multi)(const uint8_t *rec, const uint8_t *org, v128 f = v128_add_8( c128, v128_from_v64( l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride))); - v128 tmp, delta1, delta2, delta3; - - a = v128_ssub_s8(a, x); - b = v128_ssub_s8(b, x); - c = v128_ssub_s8(c, x); - d = v128_ssub_s8(d, x); - e = v128_ssub_s8(e, x); - f = v128_ssub_s8(f, x); - tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp1), cm1), - v128_max_s8(v128_min_s8(d, cp1), cm1)); - delta1 = v128_add_8( - v128_add_8( - v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp1), cm1), - v128_max_s8(v128_min_s8(f, cp1), cm1)), - 2), 
- v128_add_8(v128_max_s8(v128_min_s8(b, cp1), cm1), - v128_max_s8(v128_min_s8(e, cp1), cm1))), - v128_add_8(v128_add_8(tmp, tmp), tmp)); - tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp2), cm2), - v128_max_s8(v128_min_s8(d, cp2), cm2)); - delta2 = v128_add_8( - v128_add_8( - v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp2), cm2), - v128_max_s8(v128_min_s8(f, cp2), cm2)), - 2), - v128_add_8(v128_max_s8(v128_min_s8(b, cp2), cm2), - v128_max_s8(v128_min_s8(e, cp2), cm2))), - v128_add_8(v128_add_8(tmp, tmp), tmp)); - tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp4), cm4), - v128_max_s8(v128_min_s8(d, cp4), cm4)); - delta3 = v128_add_8( - v128_add_8( - v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp4), cm4), - v128_max_s8(v128_min_s8(f, cp4), cm4)), - 2), - v128_add_8(v128_max_s8(v128_min_s8(b, cp4), cm4), - v128_max_s8(v128_min_s8(e, cp4), cm4))), - v128_add_8(v128_add_8(tmp, tmp), tmp)); - ssd0 = v128_ssd_u8(ssd0, o, q); - ssd1 = v128_ssd_u8( - ssd1, o, - v128_add_8( - q, - v128_shr_s8( - v128_add_8(c8, v128_add_8(delta1, v128_cmplt_s8( - delta1, v128_zero()))), - 4))); - ssd2 = v128_ssd_u8( - ssd2, o, - v128_add_8( - q, - v128_shr_s8( - v128_add_8(c8, v128_add_8(delta2, v128_cmplt_s8( - delta2, v128_zero()))), - 4))); - ssd3 = v128_ssd_u8( - ssd3, o, - v128_add_8( - q, - v128_shr_s8( - v128_add_8(c8, v128_add_8(delta3, v128_cmplt_s8( - delta3, v128_zero()))), - 4))); + calc_delta_multi(x, q, o, a, b, c, d, e, f, cp1, cm1, cp2, cm2, cp4, cm4, + &ssd1, &ssd2, &ssd3); rec += 2 * rstride; org += 2 * ostride; } @@ -502,3 +351,321 @@ void SIMD_FUNC(aom_clpf_detect_multi)(const uint8_t *rec, const uint8_t *org, sum[2] += v128_ssd_u8_sum(ssd2); sum[3] += v128_ssd_u8_sum(ssd3); } + +#if CONFIG_AOM_HIGHBITDEPTH +void SIMD_FUNC(aom_clpf_detect_hbd)(const uint16_t *rec, const uint16_t *org, + int rstride, int ostride, int x0, int y0, + int width, int height, int *sum0, int *sum1, + unsigned int strength, int shift) { + ssd128_internal ssd0 = v128_ssd_u8_init(); + 
ssd128_internal ssd1 = v128_ssd_u8_init(); + const v128 c128 = v128_dup_8(128); + const v128 sp = v128_dup_8(strength >> shift); + const v128 sm = v128_dup_8(-(int)(strength >> shift)); + const int bottom = height - 2 - y0; + + rec += x0 + y0 * rstride; + org += x0 + y0 * ostride; + + if (!x0) { // Clip left + const v128 b_shuff = v128_from_v64(v64_from_64(0x0d0c0b0a09080808LL), + v64_from_64(0x0504030201000000LL)); + const v128 c_shuff = v128_from_v64(v64_from_64(0x0e0d0c0b0a090808LL), + v64_from_64(0x0605040302010000LL)); + int y; + + for (y = 0; y < 8; y += 2) { + const v128 n1 = v128_shr_u16(v128_load_aligned(rec), shift); + const v128 n2 = v128_shr_u16(v128_load_aligned(rec + rstride), shift); + const v128 o = + v128_unziplo_8(v128_shr_u16(v128_load_aligned(org), shift), + v128_shr_u16(v128_load_aligned(org + ostride), shift)); + const v128 q = v128_unziplo_8(n1, n2); + const v128 x = v128_add_8(c128, q); + const v128 a = v128_add_8( + c128, v128_unziplo_8( + v128_shr_u16(v128_load_aligned(rec - (y != -y0) * rstride), + shift), + n1)); + const v128 b = v128_shuffle_8(x, b_shuff); + const v128 c = v128_shuffle_8(x, c_shuff); + const v128 d = v128_add_8( + c128, + v128_unziplo_8( + v128_shr_u16(v128_load_unaligned(rec + 1), shift), + v128_shr_u16(v128_load_unaligned(rec + 1 + rstride), shift))); + const v128 e = v128_add_8( + c128, + v128_unziplo_8( + v128_shr_u16(v128_load_unaligned(rec + 2), shift), + v128_shr_u16(v128_load_unaligned(rec + 2 + rstride), shift))); + const v128 f = v128_add_8( + c128, v128_unziplo_8( + n2, v128_shr_u16(v128_load_unaligned( + rec + ((y != bottom) + 1) * rstride), + shift))); + + ssd0 = v128_ssd_u8(ssd0, o, q); + ssd1 = v128_ssd_u8( + ssd1, o, v128_add_8(q, calc_delta(x, a, b, c, d, e, f, sp, sm))); + rec += rstride * 2; + org += ostride * 2; + } + } else if (!(width - x0 - 8)) { // Clip right + const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0f0e0d0c0b0a09LL), + v64_from_64(0x0707060504030201LL)); + const v128 e_shuff = 
v128_from_v64(v64_from_64(0x0f0f0f0e0d0c0b0aLL), + v64_from_64(0x0707070605040302LL)); + int y; + + for (y = 0; y < 8; y += 2) { + const v128 n1 = v128_shr_u16(v128_load_aligned(rec), shift); + const v128 n2 = v128_shr_u16(v128_load_aligned(rec + rstride), shift); + const v128 o = + v128_unziplo_8(v128_shr_u16(v128_load_aligned(org), shift), + v128_shr_u16(v128_load_aligned(org + ostride), shift)); + const v128 q = v128_unziplo_8(n1, n2); + const v128 x = v128_add_8(c128, q); + const v128 a = v128_add_8( + c128, v128_unziplo_8( + v128_shr_u16(v128_load_aligned(rec - (y != -y0) * rstride), + shift), + n1)); + const v128 b = v128_add_8( + c128, + v128_unziplo_8( + v128_shr_u16(v128_load_unaligned(rec - 2), shift), + v128_shr_u16(v128_load_unaligned(rec - 2 + rstride), shift))); + const v128 c = v128_add_8( + c128, + v128_unziplo_8( + v128_shr_u16(v128_load_unaligned(rec - 1), shift), + v128_shr_u16(v128_load_unaligned(rec - 1 + rstride), shift))); + const v128 d = v128_shuffle_8(x, d_shuff); + const v128 e = v128_shuffle_8(x, e_shuff); + const v128 f = v128_add_8( + c128, v128_unziplo_8( + n2, v128_shr_u16(v128_load_unaligned( + rec + ((y != bottom) + 1) * rstride), + shift))); + + ssd0 = v128_ssd_u8(ssd0, o, q); + ssd1 = v128_ssd_u8( + ssd1, o, v128_add_8(q, calc_delta(x, a, b, c, d, e, f, sp, sm))); + rec += rstride * 2; + org += ostride * 2; + } + } else { // No left/right clipping + int y; + for (y = 0; y < 8; y += 2) { + const v128 n1 = v128_shr_u16(v128_load_aligned(rec), shift); + const v128 n2 = v128_shr_u16(v128_load_aligned(rec + rstride), shift); + const v128 o = + v128_unziplo_8(v128_shr_u16(v128_load_aligned(org), shift), + v128_shr_u16(v128_load_aligned(org + ostride), shift)); + const v128 q = v128_unziplo_8(n1, n2); + const v128 x = v128_add_8(c128, q); + const v128 a = v128_add_8( + c128, v128_unziplo_8( + v128_shr_u16(v128_load_aligned(rec - (y != -y0) * rstride), + shift), + n1)); + const v128 b = v128_add_8( + c128, + v128_unziplo_8( + 
v128_shr_u16(v128_load_unaligned(rec - 2), shift), + v128_shr_u16(v128_load_unaligned(rec - 2 + rstride), shift))); + const v128 c = v128_add_8( + c128, + v128_unziplo_8( + v128_shr_u16(v128_load_unaligned(rec - 1), shift), + v128_shr_u16(v128_load_unaligned(rec - 1 + rstride), shift))); + const v128 d = v128_add_8( + c128, + v128_unziplo_8( + v128_shr_u16(v128_load_unaligned(rec + 1), shift), + v128_shr_u16(v128_load_unaligned(rec + 1 + rstride), shift))); + const v128 e = v128_add_8( + c128, + v128_unziplo_8( + v128_shr_u16(v128_load_unaligned(rec + 2), shift), + v128_shr_u16(v128_load_unaligned(rec + 2 + rstride), shift))); + const v128 f = v128_add_8( + c128, v128_unziplo_8( + n2, v128_shr_u16(v128_load_unaligned( + rec + ((y != bottom) + 1) * rstride), + shift))); + ssd0 = v128_ssd_u8(ssd0, o, q); + ssd1 = v128_ssd_u8( + ssd1, o, v128_add_8(q, calc_delta(x, a, b, c, d, e, f, sp, sm))); + rec += rstride * 2; + org += ostride * 2; + } + } + *sum0 += v128_ssd_u8_sum(ssd0); + *sum1 += v128_ssd_u8_sum(ssd1); +} + +void SIMD_FUNC(aom_clpf_detect_multi_hbd)(const uint16_t *rec, + const uint16_t *org, int rstride, + int ostride, int x0, int y0, + int width, int height, int *sum, + int shift) { + const v128 c128 = v128_dup_8(128); + const v128 cp1 = v128_dup_8(1); + const v128 cm1 = v128_dup_8(-1); + const v128 cp2 = v128_dup_8(2); + const v128 cm2 = v128_dup_8(-2); + const v128 cp4 = v128_dup_8(4); + const v128 cm4 = v128_dup_8(-4); + const int bottom = height - 2 - y0; + ssd128_internal ssd0 = v128_ssd_u8_init(); + ssd128_internal ssd1 = v128_ssd_u8_init(); + ssd128_internal ssd2 = v128_ssd_u8_init(); + ssd128_internal ssd3 = v128_ssd_u8_init(); + + rec += x0 + y0 * rstride; + org += x0 + y0 * ostride; + + if (!x0) { // Clip left + const v128 b_shuff = v128_from_v64(v64_from_64(0x0d0c0b0a09080808LL), + v64_from_64(0x0504030201000000LL)); + const v128 c_shuff = v128_from_v64(v64_from_64(0x0e0d0c0b0a090808LL), + v64_from_64(0x0605040302010000LL)); + int y; + + for (y = 
0; y < 8; y += 2) { + const v128 n1 = v128_shr_u16(v128_load_aligned(rec), shift); + const v128 n2 = v128_shr_u16(v128_load_aligned(rec + rstride), shift); + const v128 o = + v128_unziplo_8(v128_shr_u16(v128_load_aligned(org), shift), + v128_shr_u16(v128_load_aligned(org + ostride), shift)); + const v128 q = v128_unziplo_8(n1, n2); + const v128 x = v128_add_8(c128, q); + const v128 a = v128_add_8( + c128, v128_unziplo_8( + v128_shr_u16(v128_load_aligned(rec - (y != -y0) * rstride), + shift), + n1)); + const v128 b = v128_shuffle_8(x, b_shuff); + const v128 c = v128_shuffle_8(x, c_shuff); + const v128 d = v128_add_8( + c128, + v128_unziplo_8( + v128_shr_u16(v128_load_unaligned(rec + 1), shift), + v128_shr_u16(v128_load_unaligned(rec + 1 + rstride), shift))); + const v128 e = v128_add_8( + c128, + v128_unziplo_8( + v128_shr_u16(v128_load_unaligned(rec + 2), shift), + v128_shr_u16(v128_load_unaligned(rec + 2 + rstride), shift))); + const v128 f = v128_add_8( + c128, v128_unziplo_8( + n2, v128_shr_u16(v128_load_unaligned( + rec + ((y != bottom) + 1) * rstride), + shift))); + + ssd0 = v128_ssd_u8(ssd0, o, q); + calc_delta_multi(x, q, o, a, b, c, d, e, f, cp1, cm1, cp2, cm2, cp4, cm4, + &ssd1, &ssd2, &ssd3); + rec += 2 * rstride; + org += 2 * ostride; + } + } else if (!(width - x0 - 8)) { // Clip right + const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0f0e0d0c0b0a09LL), + v64_from_64(0x0707060504030201LL)); + const v128 e_shuff = v128_from_v64(v64_from_64(0x0f0f0f0e0d0c0b0aLL), + v64_from_64(0x0707070605040302LL)); + int y; + + for (y = 0; y < 8; y += 2) { + const v128 n1 = v128_shr_u16(v128_load_aligned(rec), shift); + const v128 n2 = v128_shr_u16(v128_load_aligned(rec + rstride), shift); + const v128 o = + v128_unziplo_8(v128_shr_u16(v128_load_aligned(org), shift), + v128_shr_u16(v128_load_aligned(org + ostride), shift)); + const v128 q = v128_unziplo_8(n1, n2); + const v128 x = v128_add_8(c128, q); + const v128 a = v128_add_8( + c128, v128_unziplo_8( + 
v128_shr_u16(v128_load_aligned(rec - (y != -y0) * rstride), + shift), + n1)); + const v128 b = v128_add_8( + c128, + v128_unziplo_8( + v128_shr_u16(v128_load_unaligned(rec - 2), shift), + v128_shr_u16(v128_load_unaligned(rec - 2 + rstride), shift))); + const v128 c = v128_add_8( + c128, + v128_unziplo_8( + v128_shr_u16(v128_load_unaligned(rec - 1), shift), + v128_shr_u16(v128_load_unaligned(rec - 1 + rstride), shift))); + const v128 d = v128_shuffle_8(x, d_shuff); + const v128 e = v128_shuffle_8(x, e_shuff); + const v128 f = v128_add_8( + c128, v128_unziplo_8( + n2, v128_shr_u16(v128_load_unaligned( + rec + ((y != bottom) + 1) * rstride), + shift))); + + ssd0 = v128_ssd_u8(ssd0, o, q); + calc_delta_multi(x, q, o, a, b, c, d, e, f, cp1, cm1, cp2, cm2, cp4, cm4, + &ssd1, &ssd2, &ssd3); + rec += 2 * rstride; + org += 2 * ostride; + } + } else { // No left/right clipping + int y; + for (y = 0; y < 8; y += 2) { + const v128 n1 = v128_shr_u16(v128_load_aligned(rec), shift); + const v128 n2 = v128_shr_u16(v128_load_aligned(rec + rstride), shift); + const v128 o = + v128_unziplo_8(v128_shr_u16(v128_load_aligned(org), shift), + v128_shr_u16(v128_load_aligned(org + ostride), shift)); + const v128 q = v128_unziplo_8(n1, n2); + const v128 x = v128_add_8(c128, q); + const v128 a = v128_add_8( + c128, v128_unziplo_8( + v128_shr_u16(v128_load_aligned(rec - (y != -y0) * rstride), + shift), + n1)); + const v128 b = v128_add_8( + c128, + v128_unziplo_8( + v128_shr_u16(v128_load_unaligned(rec - 2), shift), + v128_shr_u16(v128_load_unaligned(rec - 2 + rstride), shift))); + const v128 c = v128_add_8( + c128, + v128_unziplo_8( + v128_shr_u16(v128_load_unaligned(rec - 1), shift), + v128_shr_u16(v128_load_unaligned(rec - 1 + rstride), shift))); + const v128 d = v128_add_8( + c128, + v128_unziplo_8( + v128_shr_u16(v128_load_unaligned(rec + 1), shift), + v128_shr_u16(v128_load_unaligned(rec + 1 + rstride), shift))); + const v128 e = v128_add_8( + c128, + v128_unziplo_8( + 
v128_shr_u16(v128_load_unaligned(rec + 2), shift), + v128_shr_u16(v128_load_unaligned(rec + 2 + rstride), shift))); + const v128 f = v128_add_8( + c128, v128_unziplo_8( + n2, v128_shr_u16(v128_load_unaligned( + rec + ((y != bottom) + 1) * rstride), + shift))); + + ssd0 = v128_ssd_u8(ssd0, o, q); + calc_delta_multi(x, q, o, a, b, c, d, e, f, cp1, cm1, cp2, cm2, cp4, cm4, + &ssd1, &ssd2, &ssd3); + rec += 2 * rstride; + org += 2 * ostride; + } + } + sum[0] += v128_ssd_u8_sum(ssd0); + sum[1] += v128_ssd_u8_sum(ssd1); + sum[2] += v128_ssd_u8_sum(ssd2); + sum[3] += v128_ssd_u8_sum(ssd3); +} +#endif diff --git a/test/clpf_test.cc b/test/clpf_test.cc index 755d1f146..608a8ca9f 100644 --- a/test/clpf_test.cc +++ b/test/clpf_test.cc @@ -54,44 +54,84 @@ class ClpfBlockTest : public ::testing::TestWithParam { typedef ClpfBlockTest ClpfSpeedTest; -TEST_P(ClpfBlockTest, TestSIMDNoMismatch) { - int w = sizex; - int h = sizey; - const int size = 32; - ACMRandom rnd(ACMRandom::DeterministicSeed()); - DECLARE_ALIGNED(16, uint8_t, s[size * size]); - DECLARE_ALIGNED(16, uint8_t, d[size * size]); - DECLARE_ALIGNED(16, uint8_t, ref_d[size * size]); - memset(ref_d, 0, size * size); - memset(d, 0, size * size); +#if CONFIG_AOM_HIGHBITDEPTH +typedef void (*clpf_block_hbd_t)(const uint16_t *src, uint16_t *dst, + int sstride, int dstride, int x0, int y0, + int sizex, int sizey, int width, int height, + unsigned int strength); - int error = 0; - int pos = 0; - int strength = 0; - int xpos = 0, ypos = 0; - int bits; - int level; +typedef std::tr1::tuple + clpf_block_hbd_param_t; + +class ClpfBlockHbdTest + : public ::testing::TestWithParam { + public: + virtual ~ClpfBlockHbdTest() {} + virtual void SetUp() { + clpf = GET_PARAM(0); + ref_clpf = GET_PARAM(1); + sizex = GET_PARAM(2); + sizey = GET_PARAM(3); + } + + virtual void TearDown() { libaom_test::ClearSystemState(); } + + protected: + int sizex; + int sizey; + clpf_block_hbd_t clpf; + clpf_block_hbd_t ref_clpf; +}; + +typedef 
ClpfBlockHbdTest ClpfHbdSpeedTest; +#endif + +template +void test_clpf(int w, int h, int depth, int iterations, + void (*clpf)(const pixel *src, pixel *dst, int sstride, + int dstride, int x0, int y0, int sizex, int sizey, + int width, int height, unsigned int strength), + void (*ref_clpf)(const pixel *src, pixel *dst, int sstride, + int dstride, int x0, int y0, int sizex, + int sizey, int width, int height, + unsigned int strength)) { + const int size = 24; + ACMRandom rnd(ACMRandom::DeterministicSeed()); + DECLARE_ALIGNED(16, pixel, s[size * size]); + DECLARE_ALIGNED(16, pixel, d[size * size]); + DECLARE_ALIGNED(16, pixel, ref_d[size * size]); + memset(ref_d, 0, size * size * sizeof(*ref_d)); + memset(d, 0, size * size * sizeof(*d)); + + int error = 0, pos = 0, strength = 0, xpos = 0, ypos = 0; + int bits, level, count; // Test every combination of: - // * Input with 1-8 bits of noise - // * Noise level around every value from 0 to 255 + // * Input with up to bits of noise + // * Noise level around every value from 0 to (1< +void test_clpf_speed(int w, int h, int depth, int iterations, + void (*clpf)(const pixel *src, pixel *dst, int sstride, + int dstride, int x0, int y0, int sizex, + int sizey, int width, int height, + unsigned int strength), + void (*ref_clpf)(const pixel *src, pixel *dst, int sstride, + int dstride, int x0, int y0, int sizex, + int sizey, int width, int height, + unsigned int strength)) { aom_usec_timer ref_timer; aom_usec_timer timer; aom_usec_timer_start(&ref_timer); - for (int c = 0; c < 65536; c++) { - for (ypos = 0; ypos < size; ypos += h) { - for (xpos = 0; xpos < size; xpos += w) { - for (strength = 0; strength < 3; strength++) { - ref_clpf(s, d, size, size, xpos, ypos, w, h, size, size, - 1 << strength); - } - } - } - } + test_clpf(w, h, depth, iterations, ref_clpf, ref_clpf); aom_usec_timer_mark(&ref_timer); int ref_elapsed_time = aom_usec_timer_elapsed(&ref_timer); aom_usec_timer_start(&timer); - for (int c = 0; c < 65536; c++) { - 
for (ypos = 0; ypos < size; ypos += h) { - for (xpos = 0; xpos < size; xpos += w) { - for (strength = 0; strength < 3; strength++) { - clpf(s, d, size, size, xpos, ypos, w, h, size, size, 1 << strength); - } - } - } - } + test_clpf(w, h, depth, iterations, clpf, clpf); aom_usec_timer_mark(&timer); int elapsed_time = aom_usec_timer_elapsed(&timer); @@ -170,6 +190,24 @@ TEST_P(ClpfSpeedTest, TestSpeed) { << "SIMD time: " << elapsed_time << "ms" << std::endl; } +TEST_P(ClpfBlockTest, TestSIMDNoMismatch) { + test_clpf(sizex, sizey, 8, 1, clpf, ref_clpf); +} + +TEST_P(ClpfSpeedTest, TestSpeed) { + test_clpf_speed(sizex, sizey, 8, 16, clpf, ref_clpf); +} + +#if CONFIG_AOM_HIGHBITDEPTH +TEST_P(ClpfBlockHbdTest, TestSIMDNoMismatch) { + test_clpf(sizex, sizey, 12, 1, clpf, ref_clpf); +} + +TEST_P(ClpfHbdSpeedTest, TestSpeed) { + test_clpf_speed(sizex, sizey, 12, 1, clpf, ref_clpf); +} +#endif + using std::tr1::make_tuple; // Test all supported architectures and block sizes @@ -213,6 +251,48 @@ INSTANTIATE_TEST_CASE_P( 4))); #endif + +#if CONFIG_AOM_HIGHBITDEPTH +#if HAVE_SSE2 +INSTANTIATE_TEST_CASE_P( + SSE2, ClpfBlockHbdTest, + ::testing::Values( + make_tuple(&aom_clpf_block_hbd_sse2, &aom_clpf_block_hbd_c, 8, 8), + make_tuple(&aom_clpf_block_hbd_sse2, &aom_clpf_block_hbd_c, 8, 4), + make_tuple(&aom_clpf_block_hbd_sse2, &aom_clpf_block_hbd_c, 4, 8), + make_tuple(&aom_clpf_block_hbd_sse2, &aom_clpf_block_hbd_c, 4, 4))); +#endif + +#if HAVE_SSSE3 +INSTANTIATE_TEST_CASE_P( + SSSE3, ClpfBlockHbdTest, + ::testing::Values( + make_tuple(&aom_clpf_block_hbd_ssse3, &aom_clpf_block_hbd_c, 8, 8), + make_tuple(&aom_clpf_block_hbd_ssse3, &aom_clpf_block_hbd_c, 8, 4), + make_tuple(&aom_clpf_block_hbd_ssse3, &aom_clpf_block_hbd_c, 4, 8), + make_tuple(&aom_clpf_block_hbd_ssse3, &aom_clpf_block_hbd_c, 4, 4))); +#endif + +#if HAVE_SSE4_1 +INSTANTIATE_TEST_CASE_P( + SSE4_1, ClpfBlockHbdTest, + ::testing::Values( + make_tuple(&aom_clpf_block_hbd_sse4_1, &aom_clpf_block_hbd_c, 8, 8), + 
make_tuple(&aom_clpf_block_hbd_sse4_1, &aom_clpf_block_hbd_c, 8, 4), + make_tuple(&aom_clpf_block_hbd_sse4_1, &aom_clpf_block_hbd_c, 4, 8), + make_tuple(&aom_clpf_block_hbd_sse4_1, &aom_clpf_block_hbd_c, 4, 4))); +#endif + +#if HAVE_NEON +INSTANTIATE_TEST_CASE_P( + NEON, ClpfBlockHbdTest, + ::testing::Values( + make_tuple(&aom_clpf_block_hbd_neon, &aom_clpf_block_hbd_c, 8, 8), + make_tuple(&aom_clpf_block_hbd_neon, &aom_clpf_block_hbd_c, 8, 4), + make_tuple(&aom_clpf_block_hbd_neon, &aom_clpf_block_hbd_c, 4, 8), + make_tuple(&aom_clpf_block_hbd_neon, &aom_clpf_block_hbd_c, 4, 4))); +#endif +#endif + // Test speed for all supported architectures #if HAVE_SSE2 INSTANTIATE_TEST_CASE_P(SSE2, ClpfSpeedTest, @@ -237,4 +317,35 @@ INSTANTIATE_TEST_CASE_P(NEON, ClpfSpeedTest, ::testing::Values(make_tuple(&aom_clpf_block_neon, &aom_clpf_block_c, 8, 8))); #endif + +#if CONFIG_AOM_HIGHBITDEPTH +#if HAVE_SSE2 +INSTANTIATE_TEST_CASE_P(SSE2, ClpfHbdSpeedTest, + ::testing::Values(make_tuple(&aom_clpf_block_hbd_sse2, + &aom_clpf_block_hbd_c, 8, + 8))); +#endif + +#if HAVE_SSSE3 +INSTANTIATE_TEST_CASE_P(SSSE3, ClpfHbdSpeedTest, + ::testing::Values(make_tuple(&aom_clpf_block_hbd_ssse3, + &aom_clpf_block_hbd_c, 8, + 8))); +#endif + +#if HAVE_SSE4_1 +INSTANTIATE_TEST_CASE_P(SSE4_1, ClpfHbdSpeedTest, + ::testing::Values(make_tuple(&aom_clpf_block_hbd_sse4_1, + &aom_clpf_block_hbd_c, 8, + 8))); +#endif + +#if HAVE_NEON +INSTANTIATE_TEST_CASE_P(NEON, ClpfHbdSpeedTest, + ::testing::Values(make_tuple(&aom_clpf_block_hbd_neon, + &aom_clpf_block_hbd_c, 8, + 8))); +#endif +#endif + } // namespace