From c4b69eb0eb9fc7e6147db88189036deff718f221 Mon Sep 17 00:00:00 2001
From: hui su
Date: Thu, 4 Feb 2016 14:05:22 -0800
Subject: [PATCH] Speed-up for ext-intra

-Avoid unnecessary calculations
-Use SIMD when possible

Encoder is about 5% faster with the extra intra prediction
angles enabled.

Change-Id: I131056befe327cedab217ad4a40d5f2a11318acc
---
 vp10/common/filter.c     |   2 +-
 vp10/common/reconintra.c | 219 ++++++++++++++++++++++++++++++++-------
 2 files changed, 183 insertions(+), 38 deletions(-)

diff --git a/vp10/common/filter.c b/vp10/common/filter.c
index a3aa3cf67..a5987f1f2 100644
--- a/vp10/common/filter.c
+++ b/vp10/common/filter.c
@@ -220,7 +220,7 @@ const InterpKernel *vp10_filter_kernels[SWITCHABLE_FILTERS + 1] = {
 
 #if CONFIG_EXT_INTRA
 const InterpKernel *vp10_intra_filter_kernels[INTRA_FILTERS] = {
-  NULL,                     // INTRA_FILTER_LINEAR
+  bilinear_filters,         // INTRA_FILTER_LINEAR
   sub_pel_filters_8,        // INTRA_FILTER_8TAP
   sub_pel_filters_8sharp,   // INTRA_FILTER_8TAP_SHARP
   sub_pel_filters_8smooth,  // INTRA_FILTER_8TAP_SMOOTH
diff --git a/vp10/common/reconintra.c b/vp10/common/reconintra.c
index f25720010..99c33403a 100644
--- a/vp10/common/reconintra.c
+++ b/vp10/common/reconintra.c
@@ -276,28 +276,98 @@ static int intra_subpel_interp(int base, int shift, const uint8_t *ref,
 static void dr_prediction_z1(uint8_t *dst, ptrdiff_t stride, int bs,
                              const uint8_t *above, const uint8_t *left,
                              int dx, int dy, INTRA_FILTER filter_type) {
-  int r, c, x, y, base, shift, val;
+  int r, c, x, base, shift, val;
 
   (void)left;
   (void)dy;
   assert(dy == 1);
   assert(dx < 0);
 
-  for (r = 0; r < bs; ++r) {
-    y = r + 1;
-    for (c = 0; c < bs; ++c) {
-      x = (c << 8) - y * dx;
+  if (filter_type != INTRA_FILTER_LINEAR) {
+    const int pad_size = SUBPEL_TAPS >> 1;
+    int len;
+    DECLARE_ALIGNED(16, uint8_t, buf[SUBPEL_SHIFTS][64]);
+    DECLARE_ALIGNED(16, uint8_t, src[64 + SUBPEL_TAPS]);
+    uint8_t flags[SUBPEL_SHIFTS];
+
+    memset(flags, 0, SUBPEL_SHIFTS * sizeof(flags[0]));
+    memset(src, above[0], pad_size * sizeof(above[0]));
+    memcpy(src + pad_size, above, 2 * bs * sizeof(above[0]));
+    memset(src + pad_size + 2 * bs, above[2 * bs - 1],
+           pad_size * sizeof(above[0]));
+    flags[0] = 1;
+    x = -dx;
+    for (r = 0; r < bs; ++r, dst += stride, x -= dx) {
       base = x >> 8;
-      shift = x - (base << 8);
+      shift = x & 0xFF;
+      shift = ROUND_POWER_OF_TWO(shift, 8 - SUBPEL_BITS);
+      if (shift == SUBPEL_SHIFTS) {
+        base += 1;
+        shift = 0;
+      }
+      len = VPXMIN(bs, 2 * bs - 1 - base);
+      if (len <= 0) {
+        int i;
+        for (i = r; i < bs; ++i) {
+          memset(dst, above[2 * bs - 1], bs * sizeof(dst[0]));
+          dst += stride;
+        }
+        return;
+      }
+
+      if (len <= (bs >> 1) && !flags[shift]) {
+        base = x >> 8;
+        shift = x & 0xFF;
+        for (c = 0; c < len; ++c) {
+          val = intra_subpel_interp(base, shift, above, 0, 2 * bs - 1,
+                                    filter_type);
+          dst[c] = clip_pixel(val);
+          ++base;
+        }
+      } else {
+        if (!flags[shift]) {
+          vpx_convolve8_horiz(src + pad_size, 2 * bs, buf[shift], 2 * bs,
+                              vp10_intra_filter_kernels[filter_type][shift],
+                              16, NULL, 16, 2 * bs, 2 * bs < 16 ? 2 : 1);
+          flags[shift] = 1;
+        }
+        memcpy(dst, shift == 0 ? src + pad_size + base : &buf[shift][base],
+               len * sizeof(dst[0]));
+      }
+
+      if (len < bs)
+        memset(dst + len, above[2 * bs - 1], (bs - len) * sizeof(dst[0]));
+    }
+    return;
+  }
+
+  // For linear filter, C code is faster.
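+  // Linear interpolation needs only two taps per pixel (weighted by
+  // 256 - shift and shift), so the scalar loop below wins over paying
+  // the convolve call's setup cost and buffering whole filtered rows.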
+  x = -dx;
+  for (r = 0; r < bs; ++r, dst += stride, x -= dx) {
+    base = x >> 8;
+    shift = x & 0xFF;
+
+    if (base >= 2 * bs - 1) {
+      int i;
+      for (i = r; i < bs; ++i) {
+        memset(dst, above[2 * bs - 1], bs * sizeof(dst[0]));
+        dst += stride;
+      }
+      return;
+    }
+
+    for (c = 0; c < bs; ++c, ++base) {
       if (base < 2 * bs - 1) {
-        val = intra_subpel_interp(base, shift, above, 0, 2 * bs - 1,
-                                  filter_type);
+        val = above[base] * (256 - shift) + above[base + 1] * shift;
+        val = ROUND_POWER_OF_TWO(val, 8);
         dst[c] = clip_pixel(val);
       } else {
         dst[c] = above[2 * bs - 1];
       }
     }
-    dst += stride;
   }
 }
@@ -305,33 +375,32 @@ static void dr_prediction_z1(uint8_t *dst, ptrdiff_t stride, int bs,
 static void dr_prediction_z2(uint8_t *dst, ptrdiff_t stride, int bs,
                              const uint8_t *above, const uint8_t *left,
                              int dx, int dy, INTRA_FILTER filter_type) {
-  int r, c, x, y, shift, val, base;
+  int r, c, x, y, shift1, shift2, val, base1, base2;
 
   assert(dx > 0);
   assert(dy > 0);
 
-  for (r = 0; r < bs; ++r) {
-    for (c = 0; c < bs; ++c) {
-      y = r + 1;
-      x = (c << 8) - y * dx;
-      base = x >> 8;
-      if (base >= -1) {
-        shift = x - (base << 8);
-        val = intra_subpel_interp(base, shift, above, -1, bs - 1, filter_type);
+  x = -dx;
+  for (r = 0; r < bs; ++r, x -= dx, dst += stride) {
+    base1 = x >> 8;
+    y = (r << 8) - dy;
+    for (c = 0; c < bs; ++c, ++base1, y -= dy) {
+      if (base1 >= -1) {
+        shift1 = x & 0xFF;
+        val = intra_subpel_interp(base1, shift1, above, -1, bs - 1,
+                                  filter_type);
       } else {
-        x = c + 1;
-        y = (r << 8) - x * dy;
-        base = y >> 8;
-        if (base >= 0) {
-          shift = y - (base << 8);
-          val = intra_subpel_interp(base, shift, left, 0, bs - 1, filter_type);
+        base2 = y >> 8;
+        if (base2 >= 0) {
+          shift2 = y & 0xFF;
+          val = intra_subpel_interp(base2, shift2, left, 0, bs - 1,
+                                    filter_type);
         } else {
           val = left[0];
         }
       }
       dst[c] = clip_pixel(val);
     }
-    dst += stride;
   }
 }
@@ -339,7 +408,7 @@ static void dr_prediction_z2(uint8_t *dst, ptrdiff_t stride, int bs,
 static void dr_prediction_z3(uint8_t *dst, ptrdiff_t stride, int bs,
                              const uint8_t *above, const uint8_t *left,
                              int dx, int dy, INTRA_FILTER filter_type) {
-  int r, c, x, y, base, shift, val;
+  int r, c, y, base, shift, val;
 
   (void)above;
   (void)dx;
@@ -347,21 +416,97 @@ static void dr_prediction_z3(uint8_t *dst, ptrdiff_t stride, int bs,
   assert(dx == 1);
   assert(dy < 0);
 
-  for (r = 0; r < bs; ++r) {
-    for (c = 0; c < bs; ++c) {
-      x = c + 1;
-      y = (r << 8) - x * dy;
+  if (filter_type != INTRA_FILTER_LINEAR) {
+    const int pad_size = SUBPEL_TAPS >> 1;
+    int len, i;
+    DECLARE_ALIGNED(16, uint8_t, buf[64][4 * SUBPEL_SHIFTS]);
+    DECLARE_ALIGNED(16, uint8_t, src[(64 + SUBPEL_TAPS) * 4]);
+    uint8_t flags[SUBPEL_SHIFTS];
+
+    memset(flags, 0, SUBPEL_SHIFTS * sizeof(flags[0]));
+    for (i = 0; i < pad_size; ++i)
+      src[4 * i] = left[0];
+    for (i = 0; i < 2 * bs; ++i)
+      src[4 * (i + pad_size)] = left[i];
+    for (i = 0; i < pad_size; ++i)
+      src[4 * (i + 2 * bs + pad_size)] = left[2 * bs - 1];
+    flags[0] = 1;
+    y = -dy;
+    for (c = 0; c < bs; ++c, y -= dy) {
       base = y >> 8;
-      shift = y - (base << 8);
-      if (base < 2 * bs - 1) {
-        val = intra_subpel_interp(base, shift, left, 0, 2 * bs - 1,
-                                  filter_type);
-        dst[c] = clip_pixel(val);
+      shift = y & 0xFF;
+      shift = ROUND_POWER_OF_TWO(shift, 8 - SUBPEL_BITS);
+      if (shift == SUBPEL_SHIFTS) {
+        base += 1;
+        shift = 0;
+      }
+      len = VPXMIN(bs, 2 * bs - 1 - base);
+
+      if (len <= 0) {
+        for (r = 0; r < bs; ++r) {
+          dst[r * stride + c] = left[2 * bs - 1];
+        }
+        continue;
+      }
+
+      if (len <= (bs >> 1) && !flags[shift]) {
+        base = y >> 8;
+        shift = y & 0xFF;
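+        // Re-derive base/shift at full 1/256-pel precision: the rounded
+        // 1/16-pel values above are only for indexing the convolve
+        // kernels, while intra_subpel_interp() takes the 8-bit fraction.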
+        for (r = 0; r < len; ++r) {
+          val = intra_subpel_interp(base, shift, left, 0, 2 * bs - 1,
+                                    filter_type);
+          dst[r * stride + c] = clip_pixel(val);
+          ++base;
+        }
       } else {
-        dst[c] = left[ 2 * bs - 1];
+        if (!flags[shift]) {
+          vpx_convolve8_vert(src + 4 * pad_size, 4,
+                             buf[0] + 4 * shift, 4 * SUBPEL_SHIFTS, NULL, 16,
+                             vp10_intra_filter_kernels[filter_type][shift], 16,
+                             4, 2 * bs);
+          flags[shift] = 1;
+        }
+
+        if (shift == 0) {
+          for (r = 0; r < len; ++r) {
+            dst[r * stride + c] = left[r + base];
+          }
+        } else {
+          for (r = 0; r < len; ++r) {
+            dst[r * stride + c] = buf[r + base][4 * shift];
+          }
+        }
+      }
+
+      if (len < bs) {
+        for (r = len; r < bs; ++r) {
+          dst[r * stride + c] = left[2 * bs - 1];
+        }
+      }
+    }
+    return;
+  }
+
+  // For linear filter, C code is faster.
+  y = -dy;
+  for (c = 0; c < bs; ++c, y -= dy) {
+    base = y >> 8;
+    shift = y & 0xFF;
+
+    for (r = 0; r < bs; ++r, ++base) {
+      if (base < 2 * bs - 1) {
+        val = left[base] * (256 - shift) + left[base + 1] * shift;
+        val = ROUND_POWER_OF_TWO(val, 8);
+        dst[r * stride + c] = clip_pixel(val);
+      } else {
+        for (; r < bs; ++r)
+          dst[r * stride + c] = left[2 * bs - 1];
+        break;
+      }
     }
-    dst += stride;
   }
 }
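
Note: the following standalone sketch (not part of the patch) illustrates the
8-bit fixed-point scheme the z1 linear path relies on. For dx < 0 and dy == 1,
row r samples the above row at x = (c << 8) - (r + 1) * dx in 1/256-pel units,
so base = x >> 8 selects the reference pixel and shift = x & 0xFF the sub-pel
fraction. clip_pixel() and ROUND_POWER_OF_TWO() are inlined stand-ins for the
vpx_dsp versions; positions past the reference replicate above[2 * bs - 1],
exactly as in the patch.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

static uint8_t clip_pixel(int val) {
  return (uint8_t)(val > 255 ? 255 : (val < 0 ? 0 : val));
}

/* z1 linear path: predict a bs x bs block from the row above. */
static void dr_z1_linear(uint8_t *dst, ptrdiff_t stride, int bs,
                         const uint8_t *above, int dx) {
  int r, c, base, shift, val;
  int x = -dx;  /* position of column 0, row 0, in 1/256-pel units */
  for (r = 0; r < bs; ++r, dst += stride, x -= dx) {
    base = x >> 8;     /* integer reference position */
    shift = x & 0xFF;  /* sub-pel fraction, constant across the row */
    for (c = 0; c < bs; ++c, ++base) {
      if (base < 2 * bs - 1) {
        /* two-tap interpolation between neighboring reference pixels */
        val = above[base] * (256 - shift) + above[base + 1] * shift;
        dst[c] = clip_pixel(ROUND_POWER_OF_TWO(val, 8));
      } else {
        dst[c] = above[2 * bs - 1];  /* replicate the last reference pixel */
      }
    }
  }
}

int main(void) {
  const uint8_t above[8] = { 10, 20, 30, 40, 50, 60, 70, 80 };
  uint8_t dst[4 * 4];
  int r;
  dr_z1_linear(dst, 4, 4, above, -128);  /* dx = -128: half-pel step per row */
  for (r = 0; r < 4; ++r)
    printf("%3d %3d %3d %3d\n", dst[4 * r], dst[4 * r + 1], dst[4 * r + 2],
           dst[4 * r + 3]);
  return 0;
}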