diff --git a/test/convolve_test.cc b/test/convolve_test.cc index 6aaacefa2..ba48ebb99 100644 --- a/test/convolve_test.cc +++ b/test/convolve_test.cc @@ -884,6 +884,7 @@ using std::tr1::make_tuple; vpx_highbd_##func(src, src_stride, dst, dst_stride, filter_x, \ filter_x_stride, filter_y, filter_y_stride, w, h, bd); \ } + #if HAVE_SSE2 && ARCH_X86_64 WRAP(convolve_copy_sse2, 8) WRAP(convolve_avg_sse2, 8) @@ -911,6 +912,15 @@ WRAP(convolve8_sse2, 12) WRAP(convolve8_avg_sse2, 12) #endif // HAVE_SSE2 && ARCH_X86_64 +#if HAVE_NEON +WRAP(convolve_copy_neon, 8) +WRAP(convolve_avg_neon, 8) +WRAP(convolve_copy_neon, 10) +WRAP(convolve_avg_neon, 10) +WRAP(convolve_copy_neon, 12) +WRAP(convolve_avg_neon, 12) +#endif // HAVE_NEON + WRAP(convolve_copy_c, 8) WRAP(convolve_avg_c, 8) WRAP(convolve8_horiz_c, 8) @@ -1043,6 +1053,34 @@ INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest, #endif // HAVE_AVX2 && HAVE_SSSE3 #if HAVE_NEON +#if CONFIG_VP9_HIGHBITDEPTH +const ConvolveFunctions convolve8_neon( + wrap_convolve_copy_neon_8, wrap_convolve_avg_neon_8, + wrap_convolve8_horiz_c_8, wrap_convolve8_avg_horiz_c_8, + wrap_convolve8_vert_c_8, wrap_convolve8_avg_vert_c_8, wrap_convolve8_c_8, + wrap_convolve8_avg_c_8, wrap_convolve8_horiz_c_8, + wrap_convolve8_avg_horiz_c_8, wrap_convolve8_vert_c_8, + wrap_convolve8_avg_vert_c_8, wrap_convolve8_c_8, wrap_convolve8_avg_c_8, 8); +const ConvolveFunctions convolve10_neon( + wrap_convolve_copy_neon_10, wrap_convolve_avg_neon_10, + wrap_convolve8_horiz_c_10, wrap_convolve8_avg_horiz_c_10, + wrap_convolve8_vert_c_10, wrap_convolve8_avg_vert_c_10, wrap_convolve8_c_10, + wrap_convolve8_avg_c_10, wrap_convolve8_horiz_c_10, + wrap_convolve8_avg_horiz_c_10, wrap_convolve8_vert_c_10, + wrap_convolve8_avg_vert_c_10, wrap_convolve8_c_10, wrap_convolve8_avg_c_10, + 10); +const ConvolveFunctions convolve12_neon( + wrap_convolve_copy_neon_12, wrap_convolve_avg_neon_12, + wrap_convolve8_horiz_c_12, wrap_convolve8_avg_horiz_c_12, + wrap_convolve8_vert_c_12, 
wrap_convolve8_avg_vert_c_12, wrap_convolve8_c_12, + wrap_convolve8_avg_c_12, wrap_convolve8_horiz_c_12, + wrap_convolve8_avg_horiz_c_12, wrap_convolve8_vert_c_12, + wrap_convolve8_avg_vert_c_12, wrap_convolve8_c_12, wrap_convolve8_avg_c_12, + 12); +const ConvolveParam kArrayConvolve_neon[] = { ALL_SIZES(convolve8_neon), + ALL_SIZES(convolve10_neon), + ALL_SIZES(convolve12_neon) }; +#else const ConvolveFunctions convolve8_neon( vpx_convolve_copy_neon, vpx_convolve_avg_neon, vpx_convolve8_horiz_neon, vpx_convolve8_avg_horiz_neon, vpx_convolve8_vert_neon, @@ -1050,9 +1088,10 @@ const ConvolveFunctions convolve8_neon( vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c, vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0); -const ConvolveParam kArrayConvolve8_neon[] = { ALL_SIZES(convolve8_neon) }; +const ConvolveParam kArrayConvolve_neon[] = { ALL_SIZES(convolve8_neon) }; +#endif // CONFIG_VP9_HIGHBITDEPTH INSTANTIATE_TEST_CASE_P(NEON, ConvolveTest, - ::testing::ValuesIn(kArrayConvolve8_neon)); + ::testing::ValuesIn(kArrayConvolve_neon)); #endif // HAVE_NEON #if HAVE_DSPR2 diff --git a/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c b/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c new file mode 100644 index 000000000..f4d70761e --- /dev/null +++ b/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +void vpx_highbd_convolve_avg_neon(const uint8_t *src8, ptrdiff_t src_stride, + uint8_t *dst8, ptrdiff_t dst_stride, + const int16_t *filter_x, int filter_x_stride, + const int16_t *filter_y, int filter_y_stride, + int w, int h, int bd) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + + (void)filter_x; + (void)filter_x_stride; + (void)filter_y; + (void)filter_y_stride; + (void)bd; + + if (w < 8) { // avg4 + uint16x4_t s0, s1, d0, d1; + uint16x8_t s01, d01; + do { + s0 = vld1_u16(src); + d0 = vld1_u16(dst); + src += src_stride; + s1 = vld1_u16(src); + d1 = vld1_u16(dst + dst_stride); + src += src_stride; + s01 = vcombine_u16(s0, s1); + d01 = vcombine_u16(d0, d1); + d01 = vrhaddq_u16(s01, d01); + vst1_u16(dst, vget_low_u16(d01)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(d01)); + dst += dst_stride; + h -= 2; + } while (h > 0); + } else if (w == 8) { // avg8 + uint16x8_t s0, s1, d0, d1; + do { + s0 = vld1q_u16(src); + d0 = vld1q_u16(dst); + src += src_stride; + s1 = vld1q_u16(src); + d1 = vld1q_u16(dst + dst_stride); + src += src_stride; + + d0 = vrhaddq_u16(s0, d0); + d1 = vrhaddq_u16(s1, d1); + + vst1q_u16(dst, d0); + dst += dst_stride; + vst1q_u16(dst, d1); + dst += dst_stride; + h -= 2; + } while (h > 0); + } else if (w < 32) { // avg16 + uint16x8_t s0l, s0h, s1l, s1h, d0l, d0h, d1l, d1h; + do { + s0l = vld1q_u16(src); + s0h = vld1q_u16(src + 8); + d0l = vld1q_u16(dst); + d0h = vld1q_u16(dst + 8); + src += src_stride; + s1l = vld1q_u16(src); + s1h = vld1q_u16(src + 8); + d1l = vld1q_u16(dst + dst_stride); + d1h = vld1q_u16(dst + dst_stride + 8); + src += src_stride; + + d0l = vrhaddq_u16(s0l, d0l); + d0h = vrhaddq_u16(s0h, d0h); + d1l = vrhaddq_u16(s1l, d1l); + d1h = vrhaddq_u16(s1h, d1h); + + vst1q_u16(dst, d0l); + vst1q_u16(dst + 8, d0h); + dst += dst_stride; + vst1q_u16(dst, d1l); + vst1q_u16(dst + 8, d1h); + dst += 
dst_stride; + h -= 2; + } while (h > 0); + } else if (w == 32) { // avg32 + uint16x8_t s0, s1, s2, s3, d0, d1, d2, d3; + do { + s0 = vld1q_u16(src); + s1 = vld1q_u16(src + 8); + s2 = vld1q_u16(src + 16); + s3 = vld1q_u16(src + 24); + d0 = vld1q_u16(dst); + d1 = vld1q_u16(dst + 8); + d2 = vld1q_u16(dst + 16); + d3 = vld1q_u16(dst + 24); + src += src_stride; + + d0 = vrhaddq_u16(s0, d0); + d1 = vrhaddq_u16(s1, d1); + d2 = vrhaddq_u16(s2, d2); + d3 = vrhaddq_u16(s3, d3); + + vst1q_u16(dst, d0); + vst1q_u16(dst + 8, d1); + vst1q_u16(dst + 16, d2); + vst1q_u16(dst + 24, d3); + dst += dst_stride; + + s0 = vld1q_u16(src); + s1 = vld1q_u16(src + 8); + s2 = vld1q_u16(src + 16); + s3 = vld1q_u16(src + 24); + d0 = vld1q_u16(dst); + d1 = vld1q_u16(dst + 8); + d2 = vld1q_u16(dst + 16); + d3 = vld1q_u16(dst + 24); + src += src_stride; + + d0 = vrhaddq_u16(s0, d0); + d1 = vrhaddq_u16(s1, d1); + d2 = vrhaddq_u16(s2, d2); + d3 = vrhaddq_u16(s3, d3); + + vst1q_u16(dst, d0); + vst1q_u16(dst + 8, d1); + vst1q_u16(dst + 16, d2); + vst1q_u16(dst + 24, d3); + dst += dst_stride; + h -= 2; + } while (h > 0); + } else { // avg64 + uint16x8_t s0, s1, s2, s3, d0, d1, d2, d3; + do { + s0 = vld1q_u16(src); + s1 = vld1q_u16(src + 8); + s2 = vld1q_u16(src + 16); + s3 = vld1q_u16(src + 24); + d0 = vld1q_u16(dst); + d1 = vld1q_u16(dst + 8); + d2 = vld1q_u16(dst + 16); + d3 = vld1q_u16(dst + 24); + + d0 = vrhaddq_u16(s0, d0); + d1 = vrhaddq_u16(s1, d1); + d2 = vrhaddq_u16(s2, d2); + d3 = vrhaddq_u16(s3, d3); + + vst1q_u16(dst, d0); + vst1q_u16(dst + 8, d1); + vst1q_u16(dst + 16, d2); + vst1q_u16(dst + 24, d3); + + s0 = vld1q_u16(src + 32); + s1 = vld1q_u16(src + 40); + s2 = vld1q_u16(src + 48); + s3 = vld1q_u16(src + 56); + d0 = vld1q_u16(dst + 32); + d1 = vld1q_u16(dst + 40); + d2 = vld1q_u16(dst + 48); + d3 = vld1q_u16(dst + 56); + + d0 = vrhaddq_u16(s0, d0); + d1 = vrhaddq_u16(s1, d1); + d2 = vrhaddq_u16(s2, d2); + d3 = vrhaddq_u16(s3, d3); + + vst1q_u16(dst + 32, d0); + vst1q_u16(dst + 40, d1); 
+ vst1q_u16(dst + 48, d2); + vst1q_u16(dst + 56, d3); + src += src_stride; + dst += dst_stride; + } while (--h); + } +} diff --git a/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c b/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c new file mode 100644 index 000000000..68d57779b --- /dev/null +++ b/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +void vpx_highbd_convolve_copy_neon(const uint8_t *src8, ptrdiff_t src_stride, + uint8_t *dst8, ptrdiff_t dst_stride, + const int16_t *filter_x, int filter_x_stride, + const int16_t *filter_y, int filter_y_stride, + int w, int h, int bd) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + + (void)filter_x; + (void)filter_x_stride; + (void)filter_y; + (void)filter_y_stride; + (void)bd; + + if (w < 8) { // copy4 + do { + vst1_u16(dst, vld1_u16(src)); + src += src_stride; + dst += dst_stride; + vst1_u16(dst, vld1_u16(src)); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h > 0); + } else if (w == 8) { // copy8 + do { + vst1q_u16(dst, vld1q_u16(src)); + src += src_stride; + dst += dst_stride; + vst1q_u16(dst, vld1q_u16(src)); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h > 0); + } else if (w < 32) { // copy16 + do { + vst1q_u16(dst, vld1q_u16(src)); + vst1q_u16(dst + 8, vld1q_u16(src + 8)); + src += src_stride; + dst += dst_stride; + vst1q_u16(dst, vld1q_u16(src)); + vst1q_u16(dst + 8, vld1q_u16(src + 8)); + src += src_stride; + dst += dst_stride; + h -= 2; + } 
while (h > 0); + } else if (w == 32) { // copy32 + do { + vst1q_u16(dst, vld1q_u16(src)); + vst1q_u16(dst + 8, vld1q_u16(src + 8)); + vst1q_u16(dst + 16, vld1q_u16(src + 16)); + vst1q_u16(dst + 24, vld1q_u16(src + 24)); + src += src_stride; + dst += dst_stride; + vst1q_u16(dst, vld1q_u16(src)); + vst1q_u16(dst + 8, vld1q_u16(src + 8)); + vst1q_u16(dst + 16, vld1q_u16(src + 16)); + vst1q_u16(dst + 24, vld1q_u16(src + 24)); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h > 0); + } else { // copy64 + do { + vst1q_u16(dst, vld1q_u16(src)); + vst1q_u16(dst + 8, vld1q_u16(src + 8)); + vst1q_u16(dst + 16, vld1q_u16(src + 16)); + vst1q_u16(dst + 24, vld1q_u16(src + 24)); + vst1q_u16(dst + 32, vld1q_u16(src + 32)); + vst1q_u16(dst + 40, vld1q_u16(src + 40)); + vst1q_u16(dst + 48, vld1q_u16(src + 48)); + vst1q_u16(dst + 56, vld1q_u16(src + 56)); + src += src_stride; + dst += dst_stride; + } while (--h); + } +} diff --git a/vpx_dsp/vpx_convolve.c b/vpx_dsp/vpx_convolve.c index f17281b36..45097c8f4 100644 --- a/vpx_dsp/vpx_convolve.c +++ b/vpx_dsp/vpx_convolve.c @@ -25,6 +25,7 @@ static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride, int x_step_q4, int w, int h) { int x, y; src -= SUBPEL_TAPS / 2 - 1; + for (y = 0; y < h; ++y) { int x_q4 = x0_q4; for (x = 0; x < w; ++x) { @@ -46,6 +47,7 @@ static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride, int x_step_q4, int w, int h) { int x, y; src -= SUBPEL_TAPS / 2 - 1; + for (y = 0; y < h; ++y) { int x_q4 = x0_q4; for (x = 0; x < w; ++x) { @@ -72,7 +74,7 @@ static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride, for (x = 0; x < w; ++x) { int y_q4 = y0_q4; for (y = 0; y < h; ++y) { - const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; int k, sum = 0; for (k = 0; k < SUBPEL_TAPS; ++k) @@ -95,7 +97,7 @@ static void 
convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride, for (x = 0; x < w; ++x) { int y_q4 = y0_q4; for (y = 0; y < h; ++y) { - const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; int k, sum = 0; for (k = 0; k < SUBPEL_TAPS; ++k) @@ -128,7 +130,7 @@ static void convolve(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, // --Must round-up because block may be located at sub-pixel position. // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. - uint8_t temp[135 * 64]; + uint8_t temp[64 * 135]; int intermediate_height = (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; @@ -219,7 +221,6 @@ void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, int w, int h) { const InterpKernel *const filters_x = get_filter_base(filter_x); const int x0_q4 = get_filter_offset(filter_x, filters_x); - const InterpKernel *const filters_y = get_filter_base(filter_y); const int y0_q4 = get_filter_offset(filter_y, filters_y); @@ -231,7 +232,7 @@ void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { - /* Fixed size intermediate buffer places limits on parameters. */ + // Fixed size intermediate buffer places limits on parameters. 
DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]); assert(w <= 64); assert(h <= 64); @@ -272,7 +273,6 @@ void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, for (y = 0; y < h; ++y) { for (x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1); - src += src_stride; dst += dst_stride; } @@ -334,9 +334,10 @@ static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride, const InterpKernel *x_filters, int x0_q4, int x_step_q4, int w, int h, int bd) { int x, y; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); src -= SUBPEL_TAPS / 2 - 1; + for (y = 0; y < h; ++y) { int x_q4 = x0_q4; for (x = 0; x < w; ++x) { @@ -357,9 +358,10 @@ static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride, const InterpKernel *x_filters, int x0_q4, int x_step_q4, int w, int h, int bd) { int x, y; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); src -= SUBPEL_TAPS / 2 - 1; + for (y = 0; y < h; ++y) { int x_q4 = x0_q4; for (x = 0; x < w; ++x) { @@ -382,9 +384,10 @@ static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride, const InterpKernel *y_filters, int y0_q4, int y_step_q4, int w, int h, int bd) { int x, y; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); src -= src_stride * (SUBPEL_TAPS / 2 - 1); + for (x = 0; x < w; ++x) { int y_q4 = y0_q4; for (y = 0; y < h; ++y) { @@ -407,9 +410,10 @@ static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride, const InterpKernel *y_filters, int y0_q4, int y_step_q4, int w, int h, int bd) { int x, y; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); src -= src_stride * (SUBPEL_TAPS / 2 - 1); + 
for (x = 0; x < w; ++x) { int y_q4 = y0_q4; for (y = 0; y < h; ++y) { @@ -470,6 +474,7 @@ void vpx_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, int h, int bd) { const InterpKernel *const filters_x = get_filter_base(filter_x); const int x0_q4 = get_filter_offset(filter_x, filters_x); + (void)filter_y; (void)y_step_q4; @@ -484,6 +489,7 @@ void vpx_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, int w, int h, int bd) { const InterpKernel *const filters_x = get_filter_base(filter_x); const int x0_q4 = get_filter_offset(filter_x, filters_x); + (void)filter_y; (void)y_step_q4; @@ -498,6 +504,7 @@ void vpx_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, int h, int bd) { const InterpKernel *const filters_y = get_filter_base(filter_y); const int y0_q4 = get_filter_offset(filter_y, filters_y); + (void)filter_x; (void)x_step_q4; @@ -512,6 +519,7 @@ void vpx_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, int w, int h, int bd) { const InterpKernel *const filters_y = get_filter_base(filter_y); const int y0_q4 = get_filter_offset(filter_y, filters_y); + (void)filter_x; (void)x_step_q4; @@ -526,7 +534,6 @@ void vpx_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, int h, int bd) { const InterpKernel *const filters_x = get_filter_base(filter_x); const int x0_q4 = get_filter_offset(filter_x, filters_x); - const InterpKernel *const filters_y = get_filter_base(filter_y); const int y0_q4 = get_filter_offset(filter_y, filters_y); @@ -556,11 +563,12 @@ void vpx_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride, const int16_t *filter_y, int filter_y_stride, int w, int h, int bd) { int r; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + (void)filter_x; - (void)filter_y; (void)filter_x_stride; + (void)filter_y; (void)filter_y_stride; (void)bd; @@ -577,18 +585,17 @@ void 
vpx_highbd_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride, const int16_t *filter_y, int filter_y_stride, int w, int h, int bd) { int x, y; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + (void)filter_x; - (void)filter_y; (void)filter_x_stride; + (void)filter_y; (void)filter_y_stride; (void)bd; for (y = 0; y < h; ++y) { - for (x = 0; x < w; ++x) { - dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1); - } + for (x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1); src += src_stride; dst += dst_stride; } diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index f0f5bf878..1a5f182e1 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -86,6 +86,8 @@ DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_intrin_ssse3.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/vpx_high_subpixel_8t_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/vpx_high_subpixel_bilinear_sse2.asm +DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve_copy_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve_avg_neon.c endif DSP_SRCS-$(HAVE_SSE2) += x86/vpx_convolve_copy_sse2.asm diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index cd5d46884..7bc9f48be 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -392,10 +392,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # Sub Pixel Filters # add_proto qw/void vpx_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; - specialize qw/vpx_highbd_convolve_copy sse2/; + specialize qw/vpx_highbd_convolve_copy sse2 neon/; add_proto qw/void vpx_highbd_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, 
int h, int bps"; - specialize qw/vpx_highbd_convolve_avg sse2/; + specialize qw/vpx_highbd_convolve_avg sse2 neon/; add_proto qw/void vpx_highbd_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; specialize qw/vpx_highbd_convolve8/, "$sse2_x86_64";