Add AVX2 version of vpx_convolve8_avg.
vpx_convolve8_avg works by first running a normal horizontal filter, then a vertical filter that averages at the end. The added vpx_convolve8_avg_avx2 calls pre-existing AVX2 code for the horizontal step. vpx_convolve8_avg_vert_avx2 is also added, but only uses SSSE3 code. Change-Id: If5160c0c8e778e10de61ee9bf42ee4be5975c983
This commit is contained in:
parent
807248ec81
commit
9ca06bcdd2
@ -603,6 +603,29 @@ TEST_P(ConvolveTest, DISABLED_Scale_Speed) {
|
|||||||
UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
|
UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST_P(ConvolveTest, DISABLED_8Tap_Avg_Speed) {
|
||||||
|
const uint8_t *const in = input();
|
||||||
|
uint8_t *const out = output();
|
||||||
|
const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP_SHARP];
|
||||||
|
const int kNumTests = 5000000;
|
||||||
|
const int width = Width();
|
||||||
|
const int height = Height();
|
||||||
|
vpx_usec_timer timer;
|
||||||
|
|
||||||
|
SetConstantInput(127);
|
||||||
|
|
||||||
|
vpx_usec_timer_start(&timer);
|
||||||
|
for (int n = 0; n < kNumTests; ++n) {
|
||||||
|
UUT_->hv8_[1](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16,
|
||||||
|
width, height);
|
||||||
|
}
|
||||||
|
vpx_usec_timer_mark(&timer);
|
||||||
|
|
||||||
|
const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
|
||||||
|
printf("convolve8_avg_%dx%d_%d: %d us\n", width, height,
|
||||||
|
UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
|
||||||
|
}
|
||||||
|
|
||||||
TEST_P(ConvolveTest, Copy) {
|
TEST_P(ConvolveTest, Copy) {
|
||||||
uint8_t *const in = input();
|
uint8_t *const in = input();
|
||||||
uint8_t *const out = output();
|
uint8_t *const out = output();
|
||||||
@ -1178,7 +1201,7 @@ INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest,
|
|||||||
const ConvolveFunctions convolve8_avx2(
|
const ConvolveFunctions convolve8_avx2(
|
||||||
vpx_convolve_copy_c, vpx_convolve_avg_c, vpx_convolve8_horiz_avx2,
|
vpx_convolve_copy_c, vpx_convolve_avg_c, vpx_convolve8_horiz_avx2,
|
||||||
vpx_convolve8_avg_horiz_ssse3, vpx_convolve8_vert_avx2,
|
vpx_convolve8_avg_horiz_ssse3, vpx_convolve8_vert_avx2,
|
||||||
vpx_convolve8_avg_vert_ssse3, vpx_convolve8_avx2, vpx_convolve8_avg_ssse3,
|
vpx_convolve8_avg_vert_avx2, vpx_convolve8_avx2, vpx_convolve8_avg_avx2,
|
||||||
vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c,
|
vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c,
|
||||||
vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
|
vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
|
||||||
const ConvolveParam kArrayConvolve8_avx2[] = { ALL_SIZES(convolve8_avx2) };
|
const ConvolveParam kArrayConvolve8_avx2[] = { ALL_SIZES(convolve8_avx2) };
|
||||||
|
@ -364,13 +364,13 @@ add_proto qw/void vpx_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride
|
|||||||
specialize qw/vpx_convolve8_vert sse2 ssse3 avx2 neon dspr2 msa vsx/;
|
specialize qw/vpx_convolve8_vert sse2 ssse3 avx2 neon dspr2 msa vsx/;
|
||||||
|
|
||||||
add_proto qw/void vpx_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
|
add_proto qw/void vpx_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
|
||||||
specialize qw/vpx_convolve8_avg sse2 ssse3 neon dspr2 msa vsx/;
|
specialize qw/vpx_convolve8_avg sse2 ssse3 avx2 neon dspr2 msa vsx/;
|
||||||
|
|
||||||
add_proto qw/void vpx_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
|
add_proto qw/void vpx_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
|
||||||
specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 neon dspr2 msa vsx/;
|
specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 neon dspr2 msa vsx/;
|
||||||
|
|
||||||
add_proto qw/void vpx_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
|
add_proto qw/void vpx_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
|
||||||
specialize qw/vpx_convolve8_avg_vert sse2 ssse3 neon dspr2 msa vsx/;
|
specialize qw/vpx_convolve8_avg_vert sse2 ssse3 avx2 neon dspr2 msa vsx/;
|
||||||
|
|
||||||
add_proto qw/void vpx_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
|
add_proto qw/void vpx_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
|
||||||
specialize qw/vpx_scaled_2d ssse3 neon/;
|
specialize qw/vpx_scaled_2d ssse3 neon/;
|
||||||
|
@ -539,6 +539,12 @@ filter8_1dfunction vpx_filter_block1d4_h8_ssse3;
|
|||||||
#define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_ssse3
|
#define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_ssse3
|
||||||
#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_ssse3
|
#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_ssse3
|
||||||
#endif // ARCH_X86_64
|
#endif // ARCH_X86_64
|
||||||
|
filter8_1dfunction vpx_filter_block1d16_v8_avg_ssse3;
|
||||||
|
filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3;
|
||||||
|
filter8_1dfunction vpx_filter_block1d4_v8_avg_ssse3;
|
||||||
|
#define vpx_filter_block1d16_v8_avg_avx2 vpx_filter_block1d16_v8_avg_ssse3
|
||||||
|
#define vpx_filter_block1d8_v8_avg_avx2 vpx_filter_block1d8_v8_avg_ssse3
|
||||||
|
#define vpx_filter_block1d4_v8_avg_avx2 vpx_filter_block1d4_v8_avg_ssse3
|
||||||
filter8_1dfunction vpx_filter_block1d16_v2_ssse3;
|
filter8_1dfunction vpx_filter_block1d16_v2_ssse3;
|
||||||
filter8_1dfunction vpx_filter_block1d16_h2_ssse3;
|
filter8_1dfunction vpx_filter_block1d16_h2_ssse3;
|
||||||
filter8_1dfunction vpx_filter_block1d8_v2_ssse3;
|
filter8_1dfunction vpx_filter_block1d8_v2_ssse3;
|
||||||
@ -552,6 +558,12 @@ filter8_1dfunction vpx_filter_block1d4_h2_ssse3;
|
|||||||
#define vpx_filter_block1d8_h2_avx2 vpx_filter_block1d8_h2_ssse3
|
#define vpx_filter_block1d8_h2_avx2 vpx_filter_block1d8_h2_ssse3
|
||||||
#define vpx_filter_block1d4_v2_avx2 vpx_filter_block1d4_v2_ssse3
|
#define vpx_filter_block1d4_v2_avx2 vpx_filter_block1d4_v2_ssse3
|
||||||
#define vpx_filter_block1d4_h2_avx2 vpx_filter_block1d4_h2_ssse3
|
#define vpx_filter_block1d4_h2_avx2 vpx_filter_block1d4_h2_ssse3
|
||||||
|
filter8_1dfunction vpx_filter_block1d16_v2_avg_ssse3;
|
||||||
|
filter8_1dfunction vpx_filter_block1d8_v2_avg_ssse3;
|
||||||
|
filter8_1dfunction vpx_filter_block1d4_v2_avg_ssse3;
|
||||||
|
#define vpx_filter_block1d16_v2_avg_avx2 vpx_filter_block1d16_v2_avg_ssse3
|
||||||
|
#define vpx_filter_block1d8_v2_avg_avx2 vpx_filter_block1d8_v2_avg_ssse3
|
||||||
|
#define vpx_filter_block1d4_v2_avg_avx2 vpx_filter_block1d4_v2_avg_ssse3
|
||||||
// void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
|
// void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
|
||||||
// uint8_t *dst, ptrdiff_t dst_stride,
|
// uint8_t *dst, ptrdiff_t dst_stride,
|
||||||
// const InterpKernel *filter, int x0_q4,
|
// const InterpKernel *filter, int x0_q4,
|
||||||
@ -562,13 +574,25 @@ filter8_1dfunction vpx_filter_block1d4_h2_ssse3;
|
|||||||
// const InterpKernel *filter, int x0_q4,
|
// const InterpKernel *filter, int x0_q4,
|
||||||
// int32_t x_step_q4, int y0_q4, int y_step_q4,
|
// int32_t x_step_q4, int y0_q4, int y_step_q4,
|
||||||
// int w, int h);
|
// int w, int h);
|
||||||
|
// void vpx_convolve8_avg_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
|
||||||
|
// uint8_t *dst, ptrdiff_t dst_stride,
|
||||||
|
// const InterpKernel *filter, int x0_q4,
|
||||||
|
// int32_t x_step_q4, int y0_q4,
|
||||||
|
// int y_step_q4, int w, int h);
|
||||||
FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2);
|
FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2);
|
||||||
FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , avx2);
|
FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , avx2);
|
||||||
|
FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, avx2);
|
||||||
|
|
||||||
// void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
|
// void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
|
||||||
// uint8_t *dst, ptrdiff_t dst_stride,
|
// uint8_t *dst, ptrdiff_t dst_stride,
|
||||||
// const InterpKernel *filter, int x0_q4,
|
// const InterpKernel *filter, int x0_q4,
|
||||||
// int32_t x_step_q4, int y0_q4, int y_step_q4,
|
// int32_t x_step_q4, int y0_q4, int y_step_q4,
|
||||||
// int w, int h);
|
// int w, int h);
|
||||||
|
// void vpx_convolve8_avg_avx2(const uint8_t *src, ptrdiff_t src_stride,
|
||||||
|
// uint8_t *dst, ptrdiff_t dst_stride,
|
||||||
|
// const InterpKernel *filter, int x0_q4,
|
||||||
|
// int32_t x_step_q4, int y0_q4, int y_step_q4,
|
||||||
|
// int w, int h);
|
||||||
FUN_CONV_2D(, avx2);
|
FUN_CONV_2D(, avx2);
|
||||||
|
FUN_CONV_2D(avg_, avx2);
|
||||||
#endif // HAVE_AVX2 && HAVE_SSSE3
|
#endif // HAVE_AVX2 && HAVE_SSSE3
|
||||||
|
Loading…
x
Reference in New Issue
Block a user