diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp
index 0aecb6995..1eeb0ecdb 100644
--- a/modules/core/src/convert.cpp
+++ b/modules/core/src/convert.cpp
@@ -1541,6 +1541,20 @@ cvtScale_<short, short, float>( const short* src, size_t sstep,
                 _mm_storeu_si128((__m128i*)(dst + x), r0);
             }
         }
+        #elif CV_NEON
+        float32x4_t v_shift = vdupq_n_f32(shift);
+        for(; x <= size.width - 8; x += 8 )
+        {
+            int16x8_t v_src = vld1q_s16(src + x);
+            float32x4_t v_tmp1 = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src)));
+            float32x4_t v_tmp2 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src)));
+
+            v_tmp1 = vaddq_f32(vmulq_n_f32(v_tmp1, scale), v_shift);
+            v_tmp2 = vaddq_f32(vmulq_n_f32(v_tmp2, scale), v_shift);
+
+            vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_tmp1)),
+                                            vqmovn_s32(cv_vrndq_s32_f32(v_tmp2))));
+        }
         #endif

         for(; x < size.width; x++ )
@@ -1580,6 +1594,20 @@ cvtScale_<short, int, float>( const short* src, size_t sstep,
                 _mm_storeu_si128((__m128i*)(dst + x + 4), r1);
             }
         }
+        #elif CV_NEON
+        float32x4_t v_shift = vdupq_n_f32(shift);
+        for(; x <= size.width - 8; x += 8 )
+        {
+            int16x8_t v_src = vld1q_s16(src + x);
+            float32x4_t v_tmp1 = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src)));
+            float32x4_t v_tmp2 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src)));
+
+            v_tmp1 = vaddq_f32(vmulq_n_f32(v_tmp1, scale), v_shift);
+            v_tmp2 = vaddq_f32(vmulq_n_f32(v_tmp2, scale), v_shift);
+
+            vst1q_s32(dst + x, cv_vrndq_s32_f32(v_tmp1));
+            vst1q_s32(dst + x + 4, cv_vrndq_s32_f32(v_tmp2));
+        }
         #endif

         //We will wait Haswell
@@ -2134,6 +2162,14 @@ cvt_<float, short>( const float* src, size_t sstep,
                 _mm_storeu_si128((__m128i*)(dst + x),src1_int128);
             }
         }
+        #elif CV_NEON
+        for( ; x <= size.width - 8; x += 8 )
+        {
+            float32x4_t v_src1 = vld1q_f32(src + x), v_src2 = vld1q_f32(src + x + 4);
+            int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_src1)),
+                                           vqmovn_s32(cv_vrndq_s32_f32(v_src2)));
+            vst1q_s16(dst + x, v_dst);
+        }
         #endif
         for( ; x < size.width; x++ )
             dst[x] = saturate_cast<short>(src[x]);
diff --git a/modules/core/src/copy.cpp b/modules/core/src/copy.cpp
index 484a39aa0..7ffb8c2f8 100644
--- a/modules/core/src/copy.cpp
+++ b/modules/core/src/copy.cpp
@@ -107,6 +107,14 @@ copyMask_<uchar>(const uchar* _src, size_t sstep, const uchar* mask, size_t mste
                 _mm_storeu_si128((__m128i*)(dst + x), rDst);
             }
         }
+        #elif CV_NEON
+        uint8x16_t v_zero = vdupq_n_u8(0);
+        for( ; x <= size.width - 16; x += 16 )
+        {
+            uint8x16_t v_mask = vcgtq_u8(vld1q_u8(mask + x), v_zero);
+            uint8x16_t v_dst = vld1q_u8(dst + x), v_src = vld1q_u8(src + x);
+            vst1q_u8(dst + x, vbslq_u8(v_mask, v_src, v_dst));
+        }
         #endif
         for( ; x < size.width; x++ )
             if( mask[x] )
@@ -143,6 +151,17 @@ copyMask_<ushort>(const uchar* _src, size_t sstep, const uchar* mask, size_t mst
                 _mm_storeu_si128((__m128i*)(dst + x), rDst);
             }
         }
+        #elif CV_NEON
+        uint8x8_t v_zero = vdup_n_u8(0);
+        for( ; x <= size.width - 8; x += 8 )
+        {
+            uint8x8_t v_mask = vcgt_u8(vld1_u8(mask + x), v_zero);
+            uint8x8x2_t v_mask2 = vzip_u8(v_mask, v_mask);
+            uint16x8_t v_mask_res = vreinterpretq_u16_u8(vcombine_u8(v_mask2.val[0], v_mask2.val[1]));
+
+            uint16x8_t v_src = vld1q_u16(src + x), v_dst = vld1q_u16(dst + x);
+            vst1q_u16(dst + x, vbslq_u16(v_mask_res, v_src, v_dst));
+        }
         #endif
         for( ; x < size.width; x++ )
             if( mask[x] )
diff --git a/modules/core/src/mathfuncs.cpp b/modules/core/src/mathfuncs.cpp
index 45161f9d5..e240e5714 100644
--- a/modules/core/src/mathfuncs.cpp
+++ b/modules/core/src/mathfuncs.cpp
@@ -261,6 +261,19 @@ static void Magnitude_32f(const float* x, const float* y, float* mag, int len)
             _mm_storeu_ps(mag + i, x0); _mm_storeu_ps(mag + i + 4, x1);
         }
     }
+#elif CV_NEON
+    float CV_DECL_ALIGNED(16) m[4];
+
+    for( ; i <= len - 4; i += 4 )
+    {
+        float32x4_t v_x = vld1q_f32(x + i), v_y = vld1q_f32(y + i);
+        vst1q_f32(m, vaddq_f32(vmulq_f32(v_x, v_x), vmulq_f32(v_y, v_y)));
+
+        mag[i] = std::sqrt(m[0]);
+        mag[i+1] = std::sqrt(m[1]);
+        mag[i+2] = std::sqrt(m[2]);
+        mag[i+3] = std::sqrt(m[3]);
+    }
 #endif

     for( ; i < len; i++ )
@@ -2554,12 +2567,14 @@ void patchNaNs( InputOutputArray _a, double _val )
     NAryMatIterator it(arrays, (uchar**)ptrs);
     size_t len = it.size*a.channels();
     Cv32suf val;
-    float fval = (float)_val;
-    val.f = fval;
+    val.f = (float)_val;

 #if CV_SSE2
     __m128i v_mask1 = _mm_set1_epi32(0x7fffffff), v_mask2 = _mm_set1_epi32(0x7f800000);
     __m128i v_val = _mm_set1_epi32(val.i);
+#elif CV_NEON
+    int32x4_t v_mask1 = vdupq_n_s32(0x7fffffff), v_mask2 = vdupq_n_s32(0x7f800000),
+        v_val = vdupq_n_s32(val.i);
 #endif

     for( size_t i = 0; i < it.nplanes; i++, ++it )
@@ -2570,7 +2585,7 @@ void patchNaNs( InputOutputArray _a, double _val )
 #if CV_SSE2
         if (USE_SSE2)
         {
-            for ( ; j < len; j += 4)
+            for ( ; j + 4 <= len; j += 4)
             {
                 __m128i v_src = _mm_loadu_si128((__m128i const *)(tptr + j));
                 __m128i v_cmp_mask = _mm_cmplt_epi32(v_mask2, _mm_and_si128(v_src, v_mask1));
@@ -2578,6 +2593,14 @@ void patchNaNs( InputOutputArray _a, double _val )
                 _mm_storeu_si128((__m128i *)(tptr + j), v_res);
             }
         }
+#elif CV_NEON
+        for ( ; j + 4 <= len; j += 4)
+        {
+            int32x4_t v_src = vld1q_s32(tptr + j);
+            uint32x4_t v_cmp_mask = vcltq_s32(v_mask2, vandq_s32(v_src, v_mask1));
+            int32x4_t v_dst = vbslq_s32(v_cmp_mask, v_val, v_src);
+            vst1q_s32(tptr + j, v_dst);
+        }
 #endif

         for( ; j < len; j++ )
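
Implementation notes on the NEON paths above (the helper names in the sketches below are illustrative, not part of the patch):

The three convert.cpp hunks share one lane flow: widen each int16x4 half with vmovl_s16, convert to float, apply the scale and shift, round back to int32 with cv_vrndq_s32_f32, and, when the destination is 16-bit, saturate-narrow with vqmovn_s32 (the vector counterpart of saturate_cast<short>). cv_vrndq_s32_f32 is assumed here to be OpenCV's round-to-nearest helper for armv7, where vcvtq_s32_f32 truncates toward zero; a minimal stand-in with those assumed semantics (ties rounded away from zero) could look like this:

    #include <arm_neon.h>

    // Assumed semantics of cv_vrndq_s32_f32: round to nearest, ties away from
    // zero. vcvtq_s32_f32 truncates toward zero, so add a 0.5 bias first,
    // copying the sign of each input lane onto the bias.
    static inline int32x4_t round_f32_to_s32(float32x4_t v)
    {
        const int32x4_t v_sign = vreinterpretq_s32_u32(vdupq_n_u32(0x80000000u));
        const int32x4_t v_half = vreinterpretq_s32_f32(vdupq_n_f32(0.5f));
        int32x4_t v_bias = vorrq_s32(v_half, vandq_s32(v_sign, vreinterpretq_s32_f32(v)));
        return vcvtq_s32_f32(vaddq_f32(v, vreinterpretq_f32_s32(v_bias)));
    }

With a helper like that, each vector lane matches the scalar tail's saturate_cast<short>(src[x]*scale + shift) (or the plain int store in the second hunk), up to possible off-by-one differences on exact .5 ties versus the SSE path, which rounds ties to even.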
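The copy.cpp hunks are plain bit-selects: vcgtq_u8 against zero turns each mask byte into an all-ones or all-zero lane, and vbslq_u8 then picks src where the mask is set and dst elsewhere. In the 16-bit variant each mask byte has to cover a whole uint16 lane, which the patch does by zipping the mask with itself so every byte is duplicated. A self-contained sketch of that widening step, assuming mask bytes are simply zero/nonzero:

    #include <arm_neon.h>
    #include <cstdint>

    // Expand an 8-byte mask (0 / nonzero per element) into a 16-bit lane mask
    // usable with vbslq_u16: compare against zero, then duplicate each byte so
    // both bytes of the corresponding uint16 lane are 0x00 or 0xFF.
    static inline uint16x8_t widen_mask_u8_to_u16(uint8x8_t mask_bytes)
    {
        uint8x8_t m = vcgt_u8(mask_bytes, vdup_n_u8(0));
        uint8x8x2_t z = vzip_u8(m, m);
        return vreinterpretq_u16_u8(vcombine_u8(z.val[0], z.val[1]));
    }

    // Masked copy of 8 ushort elements: dst[i] = mask[i] ? src[i] : dst[i].
    static inline void copy_masked_u16x8(const uint16_t* src, uint16_t* dst,
                                         const uint8_t* mask)
    {
        uint16x8_t vm = widen_mask_u8_to_u16(vld1_u8(mask));
        vst1q_u16(dst, vbslq_u16(vm, vld1q_u16(src), vld1q_u16(dst)));
    }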
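In Magnitude_32f the sums of squares are computed in NEON registers but stored to an aligned temporary so the square roots can be taken with scalar std::sqrt: armv7 NEON has no vector sqrt instruction, only the reciprocal square-root estimate vrsqrteq_f32. An all-vector alternative (not what this patch does) would refine that estimate with Newton-Raphson steps, trading exactness for throughput and needing a special case for zero lanes:

    #include <arm_neon.h>

    // Approximate vector sqrt: reciprocal-sqrt estimate plus two Newton-Raphson
    // refinements, then multiply back by x. Note vrsqrteq_f32(0) is +inf, so a
    // zero lane yields NaN (0 * inf) and would need masking in real code.
    static inline float32x4_t sqrt_approx_f32(float32x4_t x)
    {
        float32x4_t e = vrsqrteq_f32(x);                    // ~1/sqrt(x)
        e = vmulq_f32(e, vrsqrtsq_f32(vmulq_f32(x, e), e)); // refinement step 1
        e = vmulq_f32(e, vrsqrtsq_f32(vmulq_f32(x, e), e)); // refinement step 2
        return vmulq_f32(x, e);                             // x * 1/sqrt(x)
    }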
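The patchNaNs hunks also tighten the SSE loop bound: with "j < len" the last iteration could load four ints starting fewer than four elements from the end, while "j + 4 <= len" leaves the remainder to the scalar loop. The NaN test itself is the same bit trick in both SIMD paths: clear the sign bit and compare the result against 0x7f800000; only values with an all-ones exponent and a nonzero mantissa compare greater, and the signed comparison is safe because the masked value is never negative. A scalar sketch of that test:

    #include <cstdint>
    #include <cstring>
    #include <cstddef>

    // A float is NaN iff (bits & 0x7fffffff) > 0x7f800000: sign cleared,
    // exponent all ones, mantissa nonzero. Infinities equal 0x7f800000 exactly
    // and are left untouched, matching the SSE/NEON paths.
    static inline bool is_nan_bits(float f)
    {
        std::int32_t bits;
        std::memcpy(&bits, &f, sizeof(bits));
        return (bits & 0x7fffffff) > 0x7f800000;
    }

    static void patch_nans_scalar(float* data, std::size_t len, float val)
    {
        for (std::size_t i = 0; i < len; ++i)
            if (is_nan_bits(data[i]))
                data[i] = val;
    }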