Add AVX2 optimization to copy/avg functions
Change-Id: Ibcef70e4fead74e2c2909330a7044a29381a8074
commit aa5a941992 (parent 6bff6cb5a9)
test/convolve_test.cc
@@ -25,6 +25,7 @@
#include "vpx_dsp/vpx_filter.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem.h"
+#include "vpx_ports/vpx_timer.h"

namespace {

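The new vpx_ports/vpx_timer.h include provides the vpx_usec_timer used by the speed tests added below.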
@@ -539,6 +540,46 @@ uint16_t *ConvolveTest::output16_ref_ = NULL;

TEST_P(ConvolveTest, GuardBlocks) { CheckGuardBlocks(); }

+TEST_P(ConvolveTest, DISABLED_Copy_Speed) {
+  const uint8_t *const in = input();
+  uint8_t *const out = output();
+  const int kNumTests = 5000000;
+  const int width = Width();
+  const int height = Height();
+  vpx_usec_timer timer;
+
+  vpx_usec_timer_start(&timer);
+  for (int n = 0; n < kNumTests; ++n) {
+    UUT_->copy_[0](in, kInputStride, out, kOutputStride, NULL, 0, NULL, 0,
+                   width, height);
+  }
+  vpx_usec_timer_mark(&timer);
+
+  const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+  printf("convolve_copy_%dx%d_%d: %d us\n", width, height,
+         UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
+}
+
+TEST_P(ConvolveTest, DISABLED_Avg_Speed) {
+  const uint8_t *const in = input();
+  uint8_t *const out = output();
+  const int kNumTests = 5000000;
+  const int width = Width();
+  const int height = Height();
+  vpx_usec_timer timer;
+
+  vpx_usec_timer_start(&timer);
+  for (int n = 0; n < kNumTests; ++n) {
+    UUT_->copy_[1](in, kInputStride, out, kOutputStride, NULL, 0, NULL, 0,
+                   width, height);
+  }
+  vpx_usec_timer_mark(&timer);
+
+  const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+  printf("convolve_avg_%dx%d_%d: %d us\n", width, height,
+         UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
+}
+
TEST_P(ConvolveTest, Copy) {
  uint8_t *const in = input();
  uint8_t *const out = output();
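Both speed tests carry the DISABLED_ prefix, so Google Test skips them by default; they can be run on demand with --gtest_also_run_disabled_tests, optionally narrowed with --gtest_filter.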
@@ -912,6 +953,17 @@ WRAP(convolve8_sse2, 12)
WRAP(convolve8_avg_sse2, 12)
#endif  // HAVE_SSE2 && ARCH_X86_64

+#if HAVE_AVX2
+WRAP(convolve_copy_avx2, 8)
+WRAP(convolve_avg_avx2, 8)
+
+WRAP(convolve_copy_avx2, 10)
+WRAP(convolve_avg_avx2, 10)
+
+WRAP(convolve_copy_avx2, 12)
+WRAP(convolve_avg_avx2, 12)
+#endif  // HAVE_AVX2
+
#if HAVE_NEON
WRAP(convolve_copy_neon, 8)
WRAP(convolve_avg_neon, 8)
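The WRAP macro is defined earlier in convolve_test.cc; it generates a thin wrapper that pins a vpx_highbd_* function to a fixed bit depth so it fits the same signature as the 8-bit convolve functions used in the test tables. A rough sketch of the idea follows; WRAP_SKETCH and its exact parameter list are illustrative, not the macro's real text.

/* Sketch only: the real WRAP macro lives in convolve_test.cc and may differ
 * in detail.  It assumes the vpx_highbd_* prototypes are in scope. */
#define WRAP_SKETCH(func, bd)                                                \
  static void wrap_##func##_##bd(                                           \
      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,               \
      ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride,   \
      const int16_t *filter_y, int filter_y_stride, int w, int h) {         \
    vpx_highbd_##func(src, src_stride, dst, dst_stride, filter_x,           \
                      filter_x_stride, filter_y, filter_y_stride, w, h,     \
                      bd);                                                   \
  }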
@@ -1057,18 +1109,48 @@ INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest,
                        ::testing::ValuesIn(kArrayConvolve8_ssse3));
#endif

-#if HAVE_AVX2 && HAVE_SSSE3
+#if HAVE_AVX2
+#if CONFIG_VP9_HIGHBITDEPTH
+const ConvolveFunctions convolve8_avx2(
+    wrap_convolve_copy_avx2_8, wrap_convolve_avg_avx2_8,
+    wrap_convolve8_horiz_c_8, wrap_convolve8_avg_horiz_c_8,
+    wrap_convolve8_vert_c_8, wrap_convolve8_avg_vert_c_8, wrap_convolve8_c_8,
+    wrap_convolve8_avg_c_8, wrap_convolve8_horiz_c_8,
+    wrap_convolve8_avg_horiz_c_8, wrap_convolve8_vert_c_8,
+    wrap_convolve8_avg_vert_c_8, wrap_convolve8_c_8, wrap_convolve8_avg_c_8, 8);
+const ConvolveFunctions convolve10_avx2(
+    wrap_convolve_copy_avx2_10, wrap_convolve_avg_avx2_10,
+    wrap_convolve8_horiz_c_10, wrap_convolve8_avg_horiz_c_10,
+    wrap_convolve8_vert_c_10, wrap_convolve8_avg_vert_c_10, wrap_convolve8_c_10,
+    wrap_convolve8_avg_c_10, wrap_convolve8_horiz_c_10,
+    wrap_convolve8_avg_horiz_c_10, wrap_convolve8_vert_c_10,
+    wrap_convolve8_avg_vert_c_10, wrap_convolve8_c_10, wrap_convolve8_avg_c_10,
+    10);
+const ConvolveFunctions convolve12_avx2(
+    wrap_convolve_copy_avx2_12, wrap_convolve_avg_avx2_12,
+    wrap_convolve8_horiz_c_12, wrap_convolve8_avg_horiz_c_12,
+    wrap_convolve8_vert_c_12, wrap_convolve8_avg_vert_c_12, wrap_convolve8_c_12,
+    wrap_convolve8_avg_c_12, wrap_convolve8_horiz_c_12,
+    wrap_convolve8_avg_horiz_c_12, wrap_convolve8_vert_c_12,
+    wrap_convolve8_avg_vert_c_12, wrap_convolve8_c_12, wrap_convolve8_avg_c_12,
+    12);
+const ConvolveParam kArrayConvolve8_avx2[] = { ALL_SIZES(convolve8_avx2),
+                                               ALL_SIZES(convolve10_avx2),
+                                               ALL_SIZES(convolve12_avx2) };
+INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest,
+                        ::testing::ValuesIn(kArrayConvolve8_avx2));
+#else  // !CONFIG_VP9_HIGHBITDEPTH
const ConvolveFunctions convolve8_avx2(
    vpx_convolve_copy_c, vpx_convolve_avg_c, vpx_convolve8_horiz_avx2,
    vpx_convolve8_avg_horiz_ssse3, vpx_convolve8_vert_avx2,
    vpx_convolve8_avg_vert_ssse3, vpx_convolve8_avx2, vpx_convolve8_avg_ssse3,
    vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c,
    vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);

const ConvolveParam kArrayConvolve8_avx2[] = { ALL_SIZES(convolve8_avx2) };
INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest,
                        ::testing::ValuesIn(kArrayConvolve8_avx2));
-#endif  // HAVE_AVX2 && HAVE_SSSE3
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_AVX2

#if HAVE_NEON
#if CONFIG_VP9_HIGHBITDEPTH
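With CONFIG_VP9_HIGHBITDEPTH enabled, the AVX2 copy/avg kernels are now exercised at 8, 10, and 12 bits through the wrappers above, while the horizontal/vertical/2D entries still point at the C wrappers, since this change only adds high-bit-depth AVX2 versions of copy and avg.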
vpx_dsp/vpx_dsp.mk
@@ -95,6 +95,7 @@ DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_intrin_ssse3.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/vpx_high_subpixel_8t_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/vpx_high_subpixel_bilinear_sse2.asm
+DSP_SRCS-$(HAVE_AVX2) += x86/highbd_convolve_avx2.c
DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve_copy_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve_avg_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve8_neon.c
vpx_dsp/vpx_dsp_rtcd.pl
@@ -373,10 +373,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
  # Sub Pixel Filters
  #
  add_proto qw/void vpx_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/vpx_highbd_convolve_copy sse2 neon/;
+  specialize qw/vpx_highbd_convolve_copy sse2 avx2 neon/;

  add_proto qw/void vpx_highbd_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/vpx_highbd_convolve_avg sse2 neon/;
+  specialize qw/vpx_highbd_convolve_avg sse2 avx2 neon/;

  add_proto qw/void vpx_highbd_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
  specialize qw/vpx_highbd_convolve8 neon/, "$sse2_x86_64";
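For context: the specialize lines drive libvpx's run-time CPU detection (RTCD). On builds with RTCD enabled, the generated ./vpx_dsp_rtcd.h exposes vpx_highbd_convolve_copy as a pointer that is re-bound at startup to the best available implementation. The sketch below illustrates that dispatch pattern only; setup_rtcd_sketch, highbd_copy, and the have_* flags are hypothetical names, and the real flag query in libvpx is x86_simd_caps() from vpx_ports.

/* Sketch of RTCD-style dispatch implied by
 * "specialize qw/vpx_highbd_convolve_copy sse2 avx2 neon/".
 * Assumes the per-ISA prototypes from the generated ./vpx_dsp_rtcd.h. */
#include <stddef.h>
#include <stdint.h>
#include "./vpx_dsp_rtcd.h"

typedef void (*highbd_convolve_fn)(const uint8_t *, ptrdiff_t, uint8_t *,
                                   ptrdiff_t, const int16_t *, int,
                                   const int16_t *, int, int, int, int);

static highbd_convolve_fn highbd_copy = vpx_highbd_convolve_copy_c;

static void setup_rtcd_sketch(int have_sse2, int have_avx2) {
  highbd_copy = vpx_highbd_convolve_copy_c;                        /* baseline C */
  if (have_sse2) highbd_copy = vpx_highbd_convolve_copy_sse2;
  if (have_avx2) highbd_copy = vpx_highbd_convolve_copy_avx2;      /* new in this change */
}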
vpx_dsp/x86/highbd_convolve_avx2.c (new file, 192 lines)
@@ -0,0 +1,192 @@
/*
 * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <immintrin.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/x86/convolve.h"

// -----------------------------------------------------------------------------
// Copy and average

void vpx_highbd_convolve_copy_avx2(const uint8_t *src8, ptrdiff_t src_stride,
                                   uint8_t *dst8, ptrdiff_t dst_stride,
                                   const int16_t *filter_x, int filter_x_stride,
                                   const int16_t *filter_y, int filter_y_stride,
                                   int width, int h, int bd) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  (void)filter_x;
  (void)filter_y;
  (void)filter_x_stride;
  (void)filter_y_stride;
  (void)bd;

  assert(width % 4 == 0);
  if (width > 32) {  // width = 64
    do {
      const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
      const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
      const __m256i p2 = _mm256_loadu_si256((const __m256i *)(src + 32));
      const __m256i p3 = _mm256_loadu_si256((const __m256i *)(src + 48));
      src += src_stride;
      _mm256_storeu_si256((__m256i *)dst, p0);
      _mm256_storeu_si256((__m256i *)(dst + 16), p1);
      _mm256_storeu_si256((__m256i *)(dst + 32), p2);
      _mm256_storeu_si256((__m256i *)(dst + 48), p3);
      dst += dst_stride;
      h--;
    } while (h > 0);
  } else if (width > 16) {  // width = 32
    do {
      const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
      const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
      src += src_stride;
      _mm256_storeu_si256((__m256i *)dst, p0);
      _mm256_storeu_si256((__m256i *)(dst + 16), p1);
      dst += dst_stride;
      h--;
    } while (h > 0);
  } else if (width > 8) {  // width = 16
    __m256i p0, p1;
    do {
      p0 = _mm256_loadu_si256((const __m256i *)src);
      src += src_stride;
      p1 = _mm256_loadu_si256((const __m256i *)src);
      src += src_stride;

      _mm256_storeu_si256((__m256i *)dst, p0);
      dst += dst_stride;
      _mm256_storeu_si256((__m256i *)dst, p1);
      dst += dst_stride;
      h -= 2;
    } while (h > 0);
  } else if (width > 4) {  // width = 8
    __m128i p0, p1;
    do {
      p0 = _mm_loadu_si128((const __m128i *)src);
      src += src_stride;
      p1 = _mm_loadu_si128((const __m128i *)src);
      src += src_stride;

      _mm_storeu_si128((__m128i *)dst, p0);
      dst += dst_stride;
      _mm_storeu_si128((__m128i *)dst, p1);
      dst += dst_stride;
      h -= 2;
    } while (h > 0);
  } else {  // width = 4
    __m128i p0, p1;
    do {
      p0 = _mm_loadl_epi64((const __m128i *)src);
      src += src_stride;
      p1 = _mm_loadl_epi64((const __m128i *)src);
      src += src_stride;

      _mm_storel_epi64((__m128i *)dst, p0);
      dst += dst_stride;
      _mm_storel_epi64((__m128i *)dst, p1);
      dst += dst_stride;
      h -= 2;
    } while (h > 0);
  }
}

void vpx_highbd_convolve_avg_avx2(const uint8_t *src8, ptrdiff_t src_stride,
                                  uint8_t *dst8, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int filter_x_stride,
                                  const int16_t *filter_y, int filter_y_stride,
                                  int width, int h, int bd) {
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  (void)filter_x;
  (void)filter_y;
  (void)filter_x_stride;
  (void)filter_y_stride;
  (void)bd;

  assert(width % 4 == 0);
  if (width > 32) {  // width = 64
    __m256i p0, p1, p2, p3, u0, u1, u2, u3;
    do {
      p0 = _mm256_loadu_si256((const __m256i *)src);
      p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
      p2 = _mm256_loadu_si256((const __m256i *)(src + 32));
      p3 = _mm256_loadu_si256((const __m256i *)(src + 48));
      src += src_stride;
      u0 = _mm256_loadu_si256((const __m256i *)dst);
      u1 = _mm256_loadu_si256((const __m256i *)(dst + 16));
      u2 = _mm256_loadu_si256((const __m256i *)(dst + 32));
      u3 = _mm256_loadu_si256((const __m256i *)(dst + 48));
      _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
      _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1));
      _mm256_storeu_si256((__m256i *)(dst + 32), _mm256_avg_epu16(p2, u2));
      _mm256_storeu_si256((__m256i *)(dst + 48), _mm256_avg_epu16(p3, u3));
      dst += dst_stride;
      h--;
    } while (h > 0);
  } else if (width > 16) {  // width = 32
    __m256i p0, p1, u0, u1;
    do {
      p0 = _mm256_loadu_si256((const __m256i *)src);
      p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
      src += src_stride;
      u0 = _mm256_loadu_si256((const __m256i *)dst);
      u1 = _mm256_loadu_si256((const __m256i *)(dst + 16));
      _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
      _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1));
      dst += dst_stride;
      h--;
    } while (h > 0);
  } else if (width > 8) {  // width = 16
    __m256i p0, p1, u0, u1;
    do {
      p0 = _mm256_loadu_si256((const __m256i *)src);
      p1 = _mm256_loadu_si256((const __m256i *)(src + src_stride));
      src += src_stride << 1;
      u0 = _mm256_loadu_si256((const __m256i *)dst);
      u1 = _mm256_loadu_si256((const __m256i *)(dst + dst_stride));

      _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
      _mm256_storeu_si256((__m256i *)(dst + dst_stride),
                          _mm256_avg_epu16(p1, u1));
      dst += dst_stride << 1;
      h -= 2;
    } while (h > 0);
  } else if (width > 4) {  // width = 8
    __m128i p0, p1, u0, u1;
    do {
      p0 = _mm_loadu_si128((const __m128i *)src);
      p1 = _mm_loadu_si128((const __m128i *)(src + src_stride));
      src += src_stride << 1;
      u0 = _mm_loadu_si128((const __m128i *)dst);
      u1 = _mm_loadu_si128((const __m128i *)(dst + dst_stride));

      _mm_storeu_si128((__m128i *)dst, _mm_avg_epu16(p0, u0));
      _mm_storeu_si128((__m128i *)(dst + dst_stride), _mm_avg_epu16(p1, u1));
      dst += dst_stride << 1;
      h -= 2;
    } while (h > 0);
  } else {  // width = 4
    __m128i p0, p1, u0, u1;
    do {
      p0 = _mm_loadl_epi64((const __m128i *)src);
      p1 = _mm_loadl_epi64((const __m128i *)(src + src_stride));
      src += src_stride << 1;
      u0 = _mm_loadl_epi64((const __m128i *)dst);
      u1 = _mm_loadl_epi64((const __m128i *)(dst + dst_stride));

      _mm_storel_epi64((__m128i *)dst, _mm_avg_epu16(u0, p0));
      _mm_storel_epi64((__m128i *)(dst + dst_stride), _mm_avg_epu16(u1, p1));
      dst += dst_stride << 1;
      h -= 2;
    } while (h > 0);
  }
}