Add AVX2 optimization to copy/avg functions

Change-Id: Ibcef70e4fead74e2c2909330a7044a29381a8074
Yi Luo 2017-03-28 15:30:07 -07:00 committed by James Zern
parent 6bff6cb5a9
commit aa5a941992
4 changed files with 280 additions and 5 deletions

test/convolve_test.cc

@@ -25,6 +25,7 @@
#include "vpx_dsp/vpx_filter.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem.h"
#include "vpx_ports/vpx_timer.h"
namespace {
@@ -539,6 +540,46 @@ uint16_t *ConvolveTest::output16_ref_ = NULL;
TEST_P(ConvolveTest, GuardBlocks) { CheckGuardBlocks(); }
TEST_P(ConvolveTest, DISABLED_Copy_Speed) {
  const uint8_t *const in = input();
  uint8_t *const out = output();
  const int kNumTests = 5000000;
  const int width = Width();
  const int height = Height();
  vpx_usec_timer timer;

  vpx_usec_timer_start(&timer);
  for (int n = 0; n < kNumTests; ++n) {
    UUT_->copy_[0](in, kInputStride, out, kOutputStride, NULL, 0, NULL, 0,
                   width, height);
  }
  vpx_usec_timer_mark(&timer);

  const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
  printf("convolve_copy_%dx%d_%d: %d us\n", width, height,
         UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
}
TEST_P(ConvolveTest, DISABLED_Avg_Speed) {
  const uint8_t *const in = input();
  uint8_t *const out = output();
  const int kNumTests = 5000000;
  const int width = Width();
  const int height = Height();
  vpx_usec_timer timer;

  vpx_usec_timer_start(&timer);
  for (int n = 0; n < kNumTests; ++n) {
    UUT_->copy_[1](in, kInputStride, out, kOutputStride, NULL, 0, NULL, 0,
                   width, height);
  }
  vpx_usec_timer_mark(&timer);

  const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
  printf("convolve_avg_%dx%d_%d: %d us\n", width, height,
         UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
}
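Note: both timing tests carry gtest's DISABLED_ prefix, so they are skipped in normal runs; they can be run explicitly with --gtest_also_run_disabled_tests, optionally narrowed with --gtest_filter, to collect the timings.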
TEST_P(ConvolveTest, Copy) {
  uint8_t *const in = input();
  uint8_t *const out = output();
@@ -912,6 +953,17 @@ WRAP(convolve8_sse2, 12)
WRAP(convolve8_avg_sse2, 12)
#endif // HAVE_SSE2 && ARCH_X86_64
#if HAVE_AVX2
WRAP(convolve_copy_avx2, 8)
WRAP(convolve_avg_avx2, 8)
WRAP(convolve_copy_avx2, 10)
WRAP(convolve_avg_avx2, 10)
WRAP(convolve_copy_avx2, 12)
WRAP(convolve_avg_avx2, 12)
#endif // HAVE_AVX2
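These WRAP lines adapt the new 11-argument high-bitdepth kernels to the 10-argument signature the test harness stores in ConvolveFunctions, baking the bit depth into the generated name. As a rough sketch of what the macro (defined earlier in convolve_test.cc) expands to — reproduced from memory rather than from this diff:

// Approximate shape of the WRAP adapter: generates wrap_<func>_<bd>() and
// forwards every argument to vpx_highbd_<func>(), appending the bit depth.
#define WRAP(func, bd)                                                    \
  void wrap_##func##_##bd(const uint8_t *src, ptrdiff_t src_stride,       \
                          uint8_t *dst, ptrdiff_t dst_stride,             \
                          const int16_t *filter_x, int filter_x_stride,   \
                          const int16_t *filter_y, int filter_y_stride,   \
                          int w, int h) {                                 \
    vpx_highbd_##func(src, src_stride, dst, dst_stride, filter_x,         \
                      filter_x_stride, filter_y, filter_y_stride, w, h,   \
                      bd);                                                \
  }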
#if HAVE_NEON
WRAP(convolve_copy_neon, 8)
WRAP(convolve_avg_neon, 8)
@@ -1057,18 +1109,48 @@ INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest,
                        ::testing::ValuesIn(kArrayConvolve8_ssse3));
#endif
-#if HAVE_AVX2 && HAVE_SSSE3
+#if HAVE_AVX2
#if CONFIG_VP9_HIGHBITDEPTH
const ConvolveFunctions convolve8_avx2(
    wrap_convolve_copy_avx2_8, wrap_convolve_avg_avx2_8,
    wrap_convolve8_horiz_c_8, wrap_convolve8_avg_horiz_c_8,
    wrap_convolve8_vert_c_8, wrap_convolve8_avg_vert_c_8, wrap_convolve8_c_8,
    wrap_convolve8_avg_c_8, wrap_convolve8_horiz_c_8,
    wrap_convolve8_avg_horiz_c_8, wrap_convolve8_vert_c_8,
    wrap_convolve8_avg_vert_c_8, wrap_convolve8_c_8, wrap_convolve8_avg_c_8, 8);
const ConvolveFunctions convolve10_avx2(
    wrap_convolve_copy_avx2_10, wrap_convolve_avg_avx2_10,
    wrap_convolve8_horiz_c_10, wrap_convolve8_avg_horiz_c_10,
    wrap_convolve8_vert_c_10, wrap_convolve8_avg_vert_c_10, wrap_convolve8_c_10,
    wrap_convolve8_avg_c_10, wrap_convolve8_horiz_c_10,
    wrap_convolve8_avg_horiz_c_10, wrap_convolve8_vert_c_10,
    wrap_convolve8_avg_vert_c_10, wrap_convolve8_c_10, wrap_convolve8_avg_c_10,
    10);
const ConvolveFunctions convolve12_avx2(
    wrap_convolve_copy_avx2_12, wrap_convolve_avg_avx2_12,
    wrap_convolve8_horiz_c_12, wrap_convolve8_avg_horiz_c_12,
    wrap_convolve8_vert_c_12, wrap_convolve8_avg_vert_c_12, wrap_convolve8_c_12,
    wrap_convolve8_avg_c_12, wrap_convolve8_horiz_c_12,
    wrap_convolve8_avg_horiz_c_12, wrap_convolve8_vert_c_12,
    wrap_convolve8_avg_vert_c_12, wrap_convolve8_c_12, wrap_convolve8_avg_c_12,
    12);
const ConvolveParam kArrayConvolve8_avx2[] = { ALL_SIZES(convolve8_avx2),
                                               ALL_SIZES(convolve10_avx2),
                                               ALL_SIZES(convolve12_avx2) };
INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest,
                        ::testing::ValuesIn(kArrayConvolve8_avx2));
#else  // !CONFIG_VP9_HIGHBITDEPTH
const ConvolveFunctions convolve8_avx2(
    vpx_convolve_copy_c, vpx_convolve_avg_c, vpx_convolve8_horiz_avx2,
    vpx_convolve8_avg_horiz_ssse3, vpx_convolve8_vert_avx2,
    vpx_convolve8_avg_vert_ssse3, vpx_convolve8_avx2, vpx_convolve8_avg_ssse3,
    vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c,
    vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
const ConvolveParam kArrayConvolve8_avx2[] = { ALL_SIZES(convolve8_avx2) };
INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest,
                        ::testing::ValuesIn(kArrayConvolve8_avx2));
-#endif  // HAVE_AVX2 && HAVE_SSSE3
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_AVX2
#if HAVE_NEON
#if CONFIG_VP9_HIGHBITDEPTH

vpx_dsp/vpx_dsp.mk

@@ -95,6 +95,7 @@ DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_intrin_ssse3.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/vpx_high_subpixel_8t_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/vpx_high_subpixel_bilinear_sse2.asm
DSP_SRCS-$(HAVE_AVX2) += x86/highbd_convolve_avx2.c
DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve_copy_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve_avg_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve8_neon.c

vpx_dsp/vpx_dsp_rtcd.pl

@@ -373,10 +373,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
# Sub Pixel Filters
#
add_proto qw/void vpx_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-specialize qw/vpx_highbd_convolve_copy sse2 neon/;
+specialize qw/vpx_highbd_convolve_copy sse2 avx2 neon/;
add_proto qw/void vpx_highbd_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-specialize qw/vpx_highbd_convolve_avg sse2 neon/;
+specialize qw/vpx_highbd_convolve_avg sse2 avx2 neon/;
add_proto qw/void vpx_highbd_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
specialize qw/vpx_highbd_convolve8 neon/, "$sse2_x86_64";
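With avx2 added to the specialize lists, the RTCD dispatcher resolves vpx_highbd_convolve_copy and vpx_highbd_convolve_avg to the new kernels on AVX2-capable CPUs. A minimal, hypothetical C call site (buffer names and sizes are illustrative only; the filter arguments are unused by copy/avg):

/* src16/dst16 are uint16_t buffers holding, e.g., 10-bit pixels; strides are
 * in pixels. Copy ignores the NULL/0 filter arguments; the trailing 10 is the
 * bit depth (bps). */
vpx_highbd_convolve_copy(CONVERT_TO_BYTEPTR(src16), src_stride,
                         CONVERT_TO_BYTEPTR(dst16), dst_stride, NULL, 0, NULL,
                         0, 64, 64, 10);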

vpx_dsp/x86/highbd_convolve_avx2.c

@@ -0,0 +1,192 @@
/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>  // both kernels below assert() on the width
#include <immintrin.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/x86/convolve.h"
// -----------------------------------------------------------------------------
// Copy and average
void vpx_highbd_convolve_copy_avx2(const uint8_t *src8, ptrdiff_t src_stride,
                                   uint8_t *dst8, ptrdiff_t dst_stride,
                                   const int16_t *filter_x,
                                   int filter_x_stride,
                                   const int16_t *filter_y,
                                   int filter_y_stride, int width, int h,
                                   int bd) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  (void)filter_x;
  (void)filter_y;
  (void)filter_x_stride;
  (void)filter_y_stride;
  (void)bd;

  assert(width % 4 == 0);
  if (width > 32) {  // width = 64
    do {
      const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
      const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
      const __m256i p2 = _mm256_loadu_si256((const __m256i *)(src + 32));
      const __m256i p3 = _mm256_loadu_si256((const __m256i *)(src + 48));
      src += src_stride;
      _mm256_storeu_si256((__m256i *)dst, p0);
      _mm256_storeu_si256((__m256i *)(dst + 16), p1);
      _mm256_storeu_si256((__m256i *)(dst + 32), p2);
      _mm256_storeu_si256((__m256i *)(dst + 48), p3);
      dst += dst_stride;
      h--;
    } while (h > 0);
  } else if (width > 16) {  // width = 32
    do {
      const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
      const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
      src += src_stride;
      _mm256_storeu_si256((__m256i *)dst, p0);
      _mm256_storeu_si256((__m256i *)(dst + 16), p1);
      dst += dst_stride;
      h--;
    } while (h > 0);
  } else if (width > 8) {  // width = 16
    __m256i p0, p1;
    do {
      p0 = _mm256_loadu_si256((const __m256i *)src);
      src += src_stride;
      p1 = _mm256_loadu_si256((const __m256i *)src);
      src += src_stride;
      _mm256_storeu_si256((__m256i *)dst, p0);
      dst += dst_stride;
      _mm256_storeu_si256((__m256i *)dst, p1);
      dst += dst_stride;
      h -= 2;
    } while (h > 0);
  } else if (width > 4) {  // width = 8
    __m128i p0, p1;
    do {
      p0 = _mm_loadu_si128((const __m128i *)src);
      src += src_stride;
      p1 = _mm_loadu_si128((const __m128i *)src);
      src += src_stride;
      _mm_storeu_si128((__m128i *)dst, p0);
      dst += dst_stride;
      _mm_storeu_si128((__m128i *)dst, p1);
      dst += dst_stride;
      h -= 2;
    } while (h > 0);
  } else {  // width = 4
    __m128i p0, p1;
    do {
      p0 = _mm_loadl_epi64((const __m128i *)src);
      src += src_stride;
      p1 = _mm_loadl_epi64((const __m128i *)src);
      src += src_stride;
      _mm_storel_epi64((__m128i *)dst, p0);
      dst += dst_stride;
      _mm_storel_epi64((__m128i *)dst, p1);
      dst += dst_stride;
      h -= 2;
    } while (h > 0);
  }
}
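For reference, every branch above vectorizes the same row-by-row copy, only at different register widths. A minimal scalar sketch of the equivalent behavior (hypothetical helper, not part of this commit):

/* Scalar equivalent of the copy kernel: copy `width` 16-bit pixels per row;
 * strides are in units of uint16_t. Assumes <string.h> for memcpy. */
static void highbd_copy_ref(const uint16_t *src, ptrdiff_t src_stride,
                            uint16_t *dst, ptrdiff_t dst_stride, int width,
                            int h) {
  int y;
  for (y = 0; y < h; ++y) {
    memcpy(dst, src, sizeof(*src) * width);
    src += src_stride;
    dst += dst_stride;
  }
}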
void vpx_highbd_convolve_avg_avx2(const uint8_t *src8, ptrdiff_t src_stride,
                                  uint8_t *dst8, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int filter_x_stride,
                                  const int16_t *filter_y, int filter_y_stride,
                                  int width, int h, int bd) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  (void)filter_x;
  (void)filter_y;
  (void)filter_x_stride;
  (void)filter_y_stride;
  (void)bd;

  assert(width % 4 == 0);
  if (width > 32) {  // width = 64
    __m256i p0, p1, p2, p3, u0, u1, u2, u3;
    do {
      p0 = _mm256_loadu_si256((const __m256i *)src);
      p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
      p2 = _mm256_loadu_si256((const __m256i *)(src + 32));
      p3 = _mm256_loadu_si256((const __m256i *)(src + 48));
      src += src_stride;
      u0 = _mm256_loadu_si256((const __m256i *)dst);
      u1 = _mm256_loadu_si256((const __m256i *)(dst + 16));
      u2 = _mm256_loadu_si256((const __m256i *)(dst + 32));
      u3 = _mm256_loadu_si256((const __m256i *)(dst + 48));
      _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
      _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1));
      _mm256_storeu_si256((__m256i *)(dst + 32), _mm256_avg_epu16(p2, u2));
      _mm256_storeu_si256((__m256i *)(dst + 48), _mm256_avg_epu16(p3, u3));
      dst += dst_stride;
      h--;
    } while (h > 0);
  } else if (width > 16) {  // width = 32
    __m256i p0, p1, u0, u1;
    do {
      p0 = _mm256_loadu_si256((const __m256i *)src);
      p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
      src += src_stride;
      u0 = _mm256_loadu_si256((const __m256i *)dst);
      u1 = _mm256_loadu_si256((const __m256i *)(dst + 16));
      _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
      _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1));
      dst += dst_stride;
      h--;
    } while (h > 0);
  } else if (width > 8) {  // width = 16
    __m256i p0, p1, u0, u1;
    do {
      p0 = _mm256_loadu_si256((const __m256i *)src);
      p1 = _mm256_loadu_si256((const __m256i *)(src + src_stride));
      src += src_stride << 1;
      u0 = _mm256_loadu_si256((const __m256i *)dst);
      u1 = _mm256_loadu_si256((const __m256i *)(dst + dst_stride));
      _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
      _mm256_storeu_si256((__m256i *)(dst + dst_stride),
                          _mm256_avg_epu16(p1, u1));
      dst += dst_stride << 1;
      h -= 2;
    } while (h > 0);
  } else if (width > 4) {  // width = 8
    __m128i p0, p1, u0, u1;
    do {
      p0 = _mm_loadu_si128((const __m128i *)src);
      p1 = _mm_loadu_si128((const __m128i *)(src + src_stride));
      src += src_stride << 1;
      u0 = _mm_loadu_si128((const __m128i *)dst);
      u1 = _mm_loadu_si128((const __m128i *)(dst + dst_stride));
      _mm_storeu_si128((__m128i *)dst, _mm_avg_epu16(p0, u0));
      _mm_storeu_si128((__m128i *)(dst + dst_stride), _mm_avg_epu16(p1, u1));
      dst += dst_stride << 1;
      h -= 2;
    } while (h > 0);
  } else {  // width = 4
    __m128i p0, p1, u0, u1;
    do {
      p0 = _mm_loadl_epi64((const __m128i *)src);
      p1 = _mm_loadl_epi64((const __m128i *)(src + src_stride));
      src += src_stride << 1;
      u0 = _mm_loadl_epi64((const __m128i *)dst);
      u1 = _mm_loadl_epi64((const __m128i *)(dst + dst_stride));
      _mm_storel_epi64((__m128i *)dst, _mm_avg_epu16(p0, u0));
      _mm_storel_epi64((__m128i *)(dst + dst_stride), _mm_avg_epu16(p1, u1));
      dst += dst_stride << 1;
      h -= 2;
    } while (h > 0);
  }
}
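All of the averaging paths reduce to the same scalar recurrence. Note that _mm_avg_epu16/_mm256_avg_epu16 compute (a + b + 1) >> 1, i.e. the rounded average, which this hypothetical reference helper (not part of this commit) mirrors:

/* Scalar equivalent of the averaging kernel: dst[x] becomes the rounded mean
 * of dst[x] and src[x], matching the epu16 average intrinsics above. */
static void highbd_avg_ref(const uint16_t *src, ptrdiff_t src_stride,
                           uint16_t *dst, ptrdiff_t dst_stride, int width,
                           int h) {
  int x, y;
  for (y = 0; y < h; ++y) {
    for (x = 0; x < width; ++x) {
      dst[x] = (uint16_t)((dst[x] + src[x] + 1) >> 1);
    }
    src += src_stride;
    dst += dst_stride;
  }
}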