diff --git a/modules/core/include/opencv2/core/hal/hal.hpp b/modules/core/include/opencv2/core/hal/hal.hpp index 09bcd72d5..80bed397b 100644 --- a/modules/core/include/opencv2/core/hal/hal.hpp +++ b/modules/core/include/opencv2/core/hal/hal.hpp @@ -85,7 +85,8 @@ CV_EXPORTS void exp64f(const double* src, double* dst, int n); CV_EXPORTS void log32f(const float* src, float* dst, int n); CV_EXPORTS void log64f(const double* src, double* dst, int n); -CV_EXPORTS void fastAtan2(const float* y, const float* x, float* dst, int n, bool angleInDegrees); +CV_EXPORTS void fastAtan32f(const float* y, const float* x, float* dst, int n, bool angleInDegrees); +CV_EXPORTS void fastAtan64f(const double* y, const double* x, double* dst, int n, bool angleInDegrees); CV_EXPORTS void magnitude32f(const float* x, const float* y, float* dst, int n); CV_EXPORTS void magnitude64f(const double* x, const double* y, double* dst, int n); CV_EXPORTS void sqrt32f(const float* src, float* dst, int len); @@ -228,6 +229,7 @@ CV_EXPORTS void exp(const double* src, double* dst, int n); CV_EXPORTS void log(const float* src, float* dst, int n); CV_EXPORTS void log(const double* src, double* dst, int n); +CV_EXPORTS void fastAtan2(const float* y, const float* x, float* dst, int n, bool angleInDegrees); CV_EXPORTS void magnitude(const float* x, const float* y, float* dst, int n); CV_EXPORTS void magnitude(const double* x, const double* y, double* dst, int n); CV_EXPORTS void sqrt(const float* src, float* dst, int len); diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp index 33e14b486..6da8fdfd1 100644 --- a/modules/core/include/opencv2/core/hal/intrin.hpp +++ b/modules/core/include/opencv2/core/hal/intrin.hpp @@ -317,4 +317,98 @@ template struct V_SIMD128Traits //! @} +//================================================================================================== + +//! @cond IGNORED + +namespace cv { + +template struct V_RegTrait128; + +template <> struct V_RegTrait128 { + typedef v_uint8x16 reg; + typedef v_uint16x8 w_reg; + typedef v_uint32x4 q_reg; + typedef v_uint8x16 u_reg; + static v_uint8x16 zero() { return v_setzero_u8(); } + static v_uint8x16 all(uchar val) { return v_setall_u8(val); } +}; + +template <> struct V_RegTrait128 { + typedef v_int8x16 reg; + typedef v_int16x8 w_reg; + typedef v_int32x4 q_reg; + typedef v_uint8x16 u_reg; + static v_int8x16 zero() { return v_setzero_s8(); } + static v_int8x16 all(schar val) { return v_setall_s8(val); } +}; + +template <> struct V_RegTrait128 { + typedef v_uint16x8 reg; + typedef v_uint32x4 w_reg; + typedef v_int16x8 int_reg; + typedef v_uint16x8 u_reg; + static v_uint16x8 zero() { return v_setzero_u16(); } + static v_uint16x8 all(ushort val) { return v_setall_u16(val); } +}; + +template <> struct V_RegTrait128 { + typedef v_int16x8 reg; + typedef v_int32x4 w_reg; + typedef v_uint16x8 u_reg; + static v_int16x8 zero() { return v_setzero_s16(); } + static v_int16x8 all(short val) { return v_setall_s16(val); } +}; + +template <> struct V_RegTrait128 { + typedef v_uint32x4 reg; + typedef v_uint64x2 w_reg; + typedef v_int32x4 int_reg; + typedef v_uint32x4 u_reg; + static v_uint32x4 zero() { return v_setzero_u32(); } + static v_uint32x4 all(unsigned val) { return v_setall_u32(val); } +}; + +template <> struct V_RegTrait128 { + typedef v_int32x4 reg; + typedef v_int64x2 w_reg; + typedef v_uint32x4 u_reg; + static v_int32x4 zero() { return v_setzero_s32(); } + static v_int32x4 all(int val) { return v_setall_s32(val); } +}; + +template <> struct V_RegTrait128 { + typedef v_uint64x2 reg; + static v_uint64x2 zero() { return v_setzero_u64(); } + static v_uint64x2 all(uint64 val) { return v_setall_u64(val); } +}; + +template <> struct V_RegTrait128 { + typedef v_int64x2 reg; + static v_int64x2 zero() { return v_setzero_s64(); } + static v_int64x2 all(int64 val) { return v_setall_s64(val); } +}; + +template <> struct V_RegTrait128 { + typedef v_float32x4 reg; + typedef v_int32x4 int_reg; + typedef v_float32x4 u_reg; + static v_float32x4 zero() { return v_setzero_f32(); } + static v_float32x4 all(float val) { return v_setall_f32(val); } +}; + +#if CV_SIMD128_64F +template <> struct V_RegTrait128 { + typedef v_float64x2 reg; + typedef v_int32x4 int_reg; + typedef v_float64x2 u_reg; + static v_float64x2 zero() { return v_setzero_f64(); } + static v_float64x2 all(double val) { return v_setall_f64(val); } +}; +#endif + +} // cv:: + +//! @endcond + #endif diff --git a/modules/core/perf/perf_math.cpp b/modules/core/perf/perf_math.cpp index 267cc9c40..eb3fbb0b2 100644 --- a/modules/core/perf/perf_math.cpp +++ b/modules/core/perf/perf_math.cpp @@ -25,6 +25,20 @@ PERF_TEST_P(VectorLength, phase32f, testing::Values(128, 1000, 128*1024, 512*102 SANITY_CHECK(angle, 5e-5); } +PERF_TEST_P(VectorLength, phase64f, testing::Values(128, 1000, 128*1024, 512*1024, 1024*1024)) +{ + size_t length = GetParam(); + vector X(length); + vector Y(length); + vector angle(length); + + declare.in(X, Y, WARMUP_RNG).out(angle); + + TEST_CYCLE_N(200) cv::phase(X, Y, angle, true); + + SANITY_CHECK(angle, 5e-5); +} + PERF_TEST_P( MaxDim_MaxPoints, kmeans, testing::Combine( testing::Values( 16, 32, 64 ), testing::Values( 300, 400, 500) ) ) diff --git a/modules/core/src/hal_replacement.hpp b/modules/core/src/hal_replacement.hpp index 93476c459..b3766dca4 100644 --- a/modules/core/src/hal_replacement.hpp +++ b/modules/core/src/hal_replacement.hpp @@ -376,6 +376,110 @@ inline int hal_ni_merge64s(const int64 **src_data, int64 *dst_data, int len, int #define cv_hal_merge64s hal_ni_merge64s //! @endcond + +/** +@param y,x source Y and X arrays +@param dst destination array +@param len length of arrays +@param angleInDegrees if set to true return angles in degrees, otherwise in radians + */ +//! @addtogroup core_hal_interface_fastAtan Atan calculation +//! @{ +inline int hal_ni_fastAtan32f(const float* y, const float* x, float* dst, int len, bool angleInDegrees) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } +inline int hal_ni_fastAtan64f(const double* y, const double* x, double* dst, int len, bool angleInDegrees) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } +//! @} + +//! @cond IGNORED +#define cv_hal_fastAtan32f hal_ni_fastAtan32f +#define cv_hal_fastAtan64f hal_ni_fastAtan64f +//! @endcond + + +/** +@param x,y source X and Y arrays +@param dst destination array +@param len length of arrays + */ +//! @addtogroup core_hal_interface_magnitude Magnitude calculation +//! @{ +inline int hal_ni_magnitude32f(const float *x, const float *y, float *dst, int len) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } +inline int hal_ni_magnitude64f(const double *x, const double *y, double *dst, int len) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } +//! @} + +//! @cond IGNORED +#define cv_hal_magnitude32f hal_ni_magnitude32f +#define cv_hal_magnitude64f hal_ni_magnitude64f +//! @endcond + + +/** +@param src source array +@param dst destination array +@param len length of arrays + */ +//! @addtogroup core_hal_interface_invSqrt Inverse square root calculation +//! @{ +inline int hal_ni_invSqrt32f(const float* src, float* dst, int len) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } +inline int hal_ni_invSqrt64f(const double* src, double* dst, int len) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } +//! @} + +//! @cond IGNORED +#define cv_hal_invSqrt32f hal_ni_invSqrt32f +#define cv_hal_invSqrt64f hal_ni_invSqrt64f +//! @endcond + + +/** +@param src source array +@param dst destination array +@param len length of arrays + */ +//! @addtogroup core_hal_interface_sqrt Square root calculation +//! @{ +inline int hal_ni_sqrt32f(const float* src, float* dst, int len) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } +inline int hal_ni_sqrt64f(const double* src, double* dst, int len) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } +//! @} + +//! @cond IGNORED +#define cv_hal_sqrt32f hal_ni_sqrt32f +#define cv_hal_sqrt64f hal_ni_sqrt64f +//! @endcond + + +/** +@param src source array +@param dst destination array +@param len length of arrays + */ +//! @addtogroup core_hal_interface_log Natural logarithm calculation +//! @{ +inline int hal_ni_log32f(const float* src, float* dst, int len) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } +inline int hal_ni_log64f(const double* src, double* dst, int len) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } +//! @} + +//! @cond IGNORED +#define cv_hal_log32f hal_ni_log32f +#define cv_hal_log64f hal_ni_log64f +//! @endcond + + +/** +@param src source array +@param dst destination array +@param len length of arrays + */ +//! @addtogroup core_hal_interface_exp Exponent calculation +//! @{ +inline int hal_ni_exp32f(const float* src, float* dst, int len) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } +inline int hal_ni_exp64f(const double* src, double* dst, int len) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } +//! @} + +//! @cond IGNORED +#define cv_hal_exp32f hal_ni_exp32f +#define cv_hal_exp64f hal_ni_exp64f +//! @endcond + + /** @brief Dummy structure storing DFT/DCT context diff --git a/modules/core/src/mathfuncs.cpp b/modules/core/src/mathfuncs.cpp index 495711f8d..e974776c8 100644 --- a/modules/core/src/mathfuncs.cpp +++ b/modules/core/src/mathfuncs.cpp @@ -51,11 +51,6 @@ namespace cv typedef void (*MathFunc)(const void* src, void* dst, int len); -static const float atan2_p1 = 0.9997878412794807f*(float)(180/CV_PI); -static const float atan2_p3 = -0.3258083974640975f*(float)(180/CV_PI); -static const float atan2_p5 = 0.1555786518463281f*(float)(180/CV_PI); -static const float atan2_p7 = -0.04432655554792128f*(float)(180/CV_PI); - #ifdef HAVE_OPENCL enum { OCL_OP_LOG=0, OCL_OP_EXP=1, OCL_OP_MAG=2, OCL_OP_PHASE_DEGREES=3, OCL_OP_PHASE_RADIANS=4 }; @@ -100,29 +95,6 @@ static bool ocl_math_op(InputArray _src1, InputArray _src2, OutputArray _dst, in #endif -float fastAtan2( float y, float x ) -{ - float ax = std::abs(x), ay = std::abs(y); - float a, c, c2; - if( ax >= ay ) - { - c = ay/(ax + (float)DBL_EPSILON); - c2 = c*c; - a = (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c; - } - else - { - c = ax/(ay + (float)DBL_EPSILON); - c2 = c*c; - a = 90.f - (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c; - } - if( x < 0 ) - a = 180.f - a; - if( y < 0 ) - a = 360.f - a; - return a; -} - /* ************************************************************************** *\ Fast cube root by Ken Turkowski (http://www.worldserver.com/turk/computergraphics/papers.html) @@ -202,7 +174,6 @@ void magnitude( InputArray src1, InputArray src2, OutputArray dst ) } } - void phase( InputArray src1, InputArray src2, OutputArray dst, bool angleInDegrees ) { int type = src1.type(), depth = src1.depth(), cn = src1.channels(); @@ -218,19 +189,8 @@ void phase( InputArray src1, InputArray src2, OutputArray dst, bool angleInDegre const Mat* arrays[] = {&X, &Y, &Angle, 0}; uchar* ptrs[3]; NAryMatIterator it(arrays, ptrs); - cv::AutoBuffer _buf; - float* buf[2] = {0, 0}; - int j, k, total = (int)(it.size*cn), blockSize = total; + int j, total = (int)(it.size*cn), blockSize = total; size_t esz1 = X.elemSize1(); - - if( depth == CV_64F ) - { - blockSize = std::min(blockSize, ((BLOCK_SIZE+cn-1)/cn)*cn); - _buf.allocate(blockSize*2); - buf[0] = _buf; - buf[1] = buf[0] + blockSize; - } - for( size_t i = 0; i < it.nplanes; i++, ++it ) { for( j = 0; j < total; j += blockSize ) @@ -240,53 +200,13 @@ void phase( InputArray src1, InputArray src2, OutputArray dst, bool angleInDegre { const float *x = (const float*)ptrs[0], *y = (const float*)ptrs[1]; float *angle = (float*)ptrs[2]; - hal::fastAtan2( y, x, angle, len, angleInDegrees ); + hal::fastAtan32f( y, x, angle, len, angleInDegrees ); } else { const double *x = (const double*)ptrs[0], *y = (const double*)ptrs[1]; double *angle = (double*)ptrs[2]; - k = 0; - -#if CV_SSE2 - if (USE_SSE2) - { - for ( ; k <= len - 4; k += 4) - { - __m128 v_dst0 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(x + k)), - _mm_cvtpd_ps(_mm_loadu_pd(x + k + 2))); - __m128 v_dst1 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(y + k)), - _mm_cvtpd_ps(_mm_loadu_pd(y + k + 2))); - - _mm_storeu_ps(buf[0] + k, v_dst0); - _mm_storeu_ps(buf[1] + k, v_dst1); - } - } -#endif - - for( ; k < len; k++ ) - { - buf[0][k] = (float)x[k]; - buf[1][k] = (float)y[k]; - } - - hal::fastAtan2( buf[1], buf[0], buf[0], len, angleInDegrees ); - k = 0; - -#if CV_SSE2 - if (USE_SSE2) - { - for ( ; k <= len - 4; k += 4) - { - __m128 v_src = _mm_loadu_ps(buf[0] + k); - _mm_storeu_pd(angle + k, _mm_cvtps_pd(v_src)); - _mm_storeu_pd(angle + k + 2, _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8)))); - } - } -#endif - - for( ; k < len; k++ ) - angle[k] = buf[0][k]; + hal::fastAtan64f(y, x, angle, len, angleInDegrees); } ptrs[0] += len*esz1; ptrs[1] += len*esz1; @@ -353,18 +273,9 @@ void cartToPolar( InputArray src1, InputArray src2, const Mat* arrays[] = {&X, &Y, &Mag, &Angle, 0}; uchar* ptrs[4]; NAryMatIterator it(arrays, ptrs); - cv::AutoBuffer _buf; - float* buf[2] = {0, 0}; - int j, k, total = (int)(it.size*cn), blockSize = std::min(total, ((BLOCK_SIZE+cn-1)/cn)*cn); + int j, total = (int)(it.size*cn), blockSize = std::min(total, ((BLOCK_SIZE+cn-1)/cn)*cn); size_t esz1 = X.elemSize1(); - if( depth == CV_64F ) - { - _buf.allocate(blockSize*2); - buf[0] = _buf; - buf[1] = buf[0] + blockSize; - } - for( size_t i = 0; i < it.nplanes; i++, ++it ) { for( j = 0; j < total; j += blockSize ) @@ -375,55 +286,14 @@ void cartToPolar( InputArray src1, InputArray src2, const float *x = (const float*)ptrs[0], *y = (const float*)ptrs[1]; float *mag = (float*)ptrs[2], *angle = (float*)ptrs[3]; hal::magnitude32f( x, y, mag, len ); - hal::fastAtan2( y, x, angle, len, angleInDegrees ); + hal::fastAtan32f( y, x, angle, len, angleInDegrees ); } else { const double *x = (const double*)ptrs[0], *y = (const double*)ptrs[1]; double *angle = (double*)ptrs[3]; - hal::magnitude64f(x, y, (double*)ptrs[2], len); - k = 0; - -#if CV_SSE2 - if (USE_SSE2) - { - for ( ; k <= len - 4; k += 4) - { - __m128 v_dst0 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(x + k)), - _mm_cvtpd_ps(_mm_loadu_pd(x + k + 2))); - __m128 v_dst1 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(y + k)), - _mm_cvtpd_ps(_mm_loadu_pd(y + k + 2))); - - _mm_storeu_ps(buf[0] + k, v_dst0); - _mm_storeu_ps(buf[1] + k, v_dst1); - } - } -#endif - - for( ; k < len; k++ ) - { - buf[0][k] = (float)x[k]; - buf[1][k] = (float)y[k]; - } - - hal::fastAtan2( buf[1], buf[0], buf[0], len, angleInDegrees ); - k = 0; - -#if CV_SSE2 - if (USE_SSE2) - { - for ( ; k <= len - 4; k += 4) - { - __m128 v_src = _mm_loadu_ps(buf[0] + k); - _mm_storeu_pd(angle + k, _mm_cvtps_pd(v_src)); - _mm_storeu_pd(angle + k + 2, _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8)))); - } - } -#endif - - for( ; k < len; k++ ) - angle[k] = buf[0][k]; + hal::fastAtan64f(y, x, angle, len, angleInDegrees); } ptrs[0] += len*esz1; ptrs[1] += len*esz1; diff --git a/modules/core/src/mathfuncs_core.cpp b/modules/core/src/mathfuncs_core.cpp index 7b3ec319c..de292fae9 100644 --- a/modules/core/src/mathfuncs_core.cpp +++ b/modules/core/src/mathfuncs_core.cpp @@ -42,116 +42,188 @@ #include "precomp.hpp" +using namespace std; + #undef HAVE_IPP -namespace cv { namespace hal { +namespace { -///////////////////////////////////// ATAN2 //////////////////////////////////// static const float atan2_p1 = 0.9997878412794807f*(float)(180/CV_PI); static const float atan2_p3 = -0.3258083974640975f*(float)(180/CV_PI); static const float atan2_p5 = 0.1555786518463281f*(float)(180/CV_PI); static const float atan2_p7 = -0.04432655554792128f*(float)(180/CV_PI); -void fastAtan2(const float *Y, const float *X, float *angle, int len, bool angleInDegrees ) -{ - int i = 0; - float scale = angleInDegrees ? 1 : (float)(CV_PI/180); +using namespace cv; -#ifdef HAVE_TEGRA_OPTIMIZATION - if (tegra::useTegra() && tegra::FastAtan2_32f(Y, X, angle, len, scale)) - return; +#if CV_SIMD128 + +template +struct v_atan +{ + typedef V_RegTrait128 Trait; + typedef typename Trait::reg VT; // vector type + enum { WorkWidth = VT::nlanes * 2 }; + + v_atan(const T & scale) + : s(Trait::all(scale)) + { + eps = Trait::all(DBL_EPSILON); + z = Trait::zero(); + p7 = Trait::all(atan2_p7); + p5 = Trait::all(atan2_p5); + p3 = Trait::all(atan2_p3); + p1 = Trait::all(atan2_p1); + val90 = Trait::all(90.f); + val180 = Trait::all(180.f); + val360 = Trait::all(360.f); + } + + inline int operator()(int len, const T * Y, const T * X, T * angle) + { + int i = 0; + const int c = VT::nlanes; + for ( ; i <= len - c * 2; i += c * 2) + { + VT x1 = v_load(X + i); + VT x2 = v_load(X + i + c); + VT y1 = v_load(Y + i); + VT y2 = v_load(Y + i + c); + v_store(&angle[i], s * one(x1, y1)); + v_store(&angle[i + c], s * one(x2, y2)); + } + return i; + } + +private: + inline VT one(VT & x, VT & y) + { + VT ax = v_abs(x); + VT ay = v_abs(y); + VT c = v_min(ax, ay) / (v_max(ax, ay) + eps); + VT cc = c * c; + VT a = (((p7 * cc + p5) * cc + p3) * cc + p1) * c; + a = v_select(ax >= ay, a, val90 - a); + a = v_select(x < z, val180 - a, a); + a = v_select(y < z, val360 - a, a); + return a; + } + +private: + VT eps; + VT z; + VT p7; + VT p5; + VT p3; + VT p1; + VT val90; + VT val180; + VT val360; + VT s; +}; + +#if !CV_SIMD128_64F + +// emulation +template <> +struct v_atan +{ + v_atan(double scale) : impl(static_cast(scale)) {} + inline int operator()(int len, const double * Y, const double * X, double * angle) + { + int i = 0; + const int c = v_atan::WorkWidth; + float bufY[c]; + float bufX[c]; + float bufA[c]; + for ( ; i <= len - c ; i += c) + { + for (int j = 0; j < c; ++j) + { + bufY[j] = static_cast(Y[i + j]); + bufX[j] = static_cast(X[i + j]); + } + impl(c, bufY, bufX, bufA); + for (int j = 0; j < c; ++j) + { + angle[i + j] = bufA[j]; + } + } + return i; + } +private: + v_atan impl; +}; #endif -#if CV_SSE2 - Cv32suf iabsmask; iabsmask.i = 0x7fffffff; - __m128 eps = _mm_set1_ps((float)DBL_EPSILON), absmask = _mm_set1_ps(iabsmask.f); - __m128 _90 = _mm_set1_ps(90.f), _180 = _mm_set1_ps(180.f), _360 = _mm_set1_ps(360.f); - __m128 z = _mm_setzero_ps(), scale4 = _mm_set1_ps(scale); - __m128 p1 = _mm_set1_ps(atan2_p1), p3 = _mm_set1_ps(atan2_p3); - __m128 p5 = _mm_set1_ps(atan2_p5), p7 = _mm_set1_ps(atan2_p7); +#endif - for( ; i <= len - 4; i += 4 ) +template +static inline T atanImpl(T y, T x) +{ + T ax = std::abs(x), ay = std::abs(y); + T a, c, c2; + if( ax >= ay ) { - __m128 x = _mm_loadu_ps(X + i), y = _mm_loadu_ps(Y + i); - __m128 ax = _mm_and_ps(x, absmask), ay = _mm_and_ps(y, absmask); - __m128 mask = _mm_cmplt_ps(ax, ay); - __m128 tmin = _mm_min_ps(ax, ay), tmax = _mm_max_ps(ax, ay); - __m128 c = _mm_div_ps(tmin, _mm_add_ps(tmax, eps)); - __m128 c2 = _mm_mul_ps(c, c); - __m128 a = _mm_mul_ps(c2, p7); - a = _mm_mul_ps(_mm_add_ps(a, p5), c2); - a = _mm_mul_ps(_mm_add_ps(a, p3), c2); - a = _mm_mul_ps(_mm_add_ps(a, p1), c); - - __m128 b = _mm_sub_ps(_90, a); - a = _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(a, b), mask)); - - b = _mm_sub_ps(_180, a); - mask = _mm_cmplt_ps(x, z); - a = _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(a, b), mask)); - - b = _mm_sub_ps(_360, a); - mask = _mm_cmplt_ps(y, z); - a = _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(a, b), mask)); - - a = _mm_mul_ps(a, scale4); - _mm_storeu_ps(angle + i, a); + c = ay/(ax + static_cast(DBL_EPSILON)); + c2 = c*c; + a = (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c; } -#elif CV_NEON - float32x4_t eps = vdupq_n_f32((float)DBL_EPSILON); - float32x4_t _90 = vdupq_n_f32(90.f), _180 = vdupq_n_f32(180.f), _360 = vdupq_n_f32(360.f); - float32x4_t z = vdupq_n_f32(0.0f), scale4 = vdupq_n_f32(scale); - float32x4_t p1 = vdupq_n_f32(atan2_p1), p3 = vdupq_n_f32(atan2_p3); - float32x4_t p5 = vdupq_n_f32(atan2_p5), p7 = vdupq_n_f32(atan2_p7); - - for( ; i <= len - 4; i += 4 ) + else { - float32x4_t x = vld1q_f32(X + i), y = vld1q_f32(Y + i); - float32x4_t ax = vabsq_f32(x), ay = vabsq_f32(y); - float32x4_t tmin = vminq_f32(ax, ay), tmax = vmaxq_f32(ax, ay); - float32x4_t c = vmulq_f32(tmin, cv_vrecpq_f32(vaddq_f32(tmax, eps))); - float32x4_t c2 = vmulq_f32(c, c); - float32x4_t a = vmulq_f32(c2, p7); - a = vmulq_f32(vaddq_f32(a, p5), c2); - a = vmulq_f32(vaddq_f32(a, p3), c2); - a = vmulq_f32(vaddq_f32(a, p1), c); - - a = vbslq_f32(vcgeq_f32(ax, ay), a, vsubq_f32(_90, a)); - a = vbslq_f32(vcltq_f32(x, z), vsubq_f32(_180, a), a); - a = vbslq_f32(vcltq_f32(y, z), vsubq_f32(_360, a), a); - - vst1q_f32(angle + i, vmulq_f32(a, scale4)); + c = ax/(ay + static_cast(DBL_EPSILON)); + c2 = c*c; + a = 90.f - (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c; } + if( x < 0 ) + a = 180.f - a; + if( y < 0 ) + a = 360.f - a; + return a; +} + +template +static inline void atanImpl(const T *Y, const T *X, T *angle, int len, bool angleInDegrees) +{ + int i = 0; + T scale = angleInDegrees ? 1 : static_cast(CV_PI/180); + +#if CV_SIMD128 + i = v_atan(scale)(len, Y, X, angle); #endif for( ; i < len; i++ ) { - float x = X[i], y = Y[i]; - float ax = std::abs(x), ay = std::abs(y); - float a, c, c2; - if( ax >= ay ) - { - c = ay/(ax + (float)DBL_EPSILON); - c2 = c*c; - a = (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c; - } - else - { - c = ax/(ay + (float)DBL_EPSILON); - c2 = c*c; - a = 90.f - (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c; - } - if( x < 0 ) - a = 180.f - a; - if( y < 0 ) - a = 360.f - a; - angle[i] = (float)(a*scale); + angle[i] = atanImpl(Y[i], X[i]) * scale; } } +} // anonymous:: + +namespace cv { namespace hal { + +///////////////////////////////////// ATAN2 //////////////////////////////////// + +void fastAtan32f(const float *Y, const float *X, float *angle, int len, bool angleInDegrees ) +{ + CALL_HAL(fastAtan32f, cv_hal_fastAtan32f, Y, X, angle, len, angleInDegrees); + atanImpl(Y, X, angle, len, angleInDegrees); +} + +void fastAtan64f(const double *Y, const double *X, double *angle, int len, bool angleInDegrees) +{ + CALL_HAL(fastAtan64f, cv_hal_fastAtan64f, Y, X, angle, len, angleInDegrees); + atanImpl(Y, X, angle, len, angleInDegrees); +} + +// deprecated +void fastAtan2(const float *Y, const float *X, float *angle, int len, bool angleInDegrees ) +{ + fastAtan32f(Y, X, angle, len, angleInDegrees); +} void magnitude32f(const float* x, const float* y, float* mag, int len) { + CALL_HAL(magnitude32f, cv_hal_magnitude32f, x, y, mag, len); #if defined HAVE_IPP CV_IPP_CHECK() { @@ -188,6 +260,7 @@ void magnitude32f(const float* x, const float* y, float* mag, int len) void magnitude64f(const double* x, const double* y, double* mag, int len) { + CALL_HAL(magnitude64f, cv_hal_magnitude64f, x, y, mag, len); #if defined(HAVE_IPP) CV_IPP_CHECK() { @@ -225,6 +298,7 @@ void magnitude64f(const double* x, const double* y, double* mag, int len) void invSqrt32f(const float* src, float* dst, int len) { + CALL_HAL(invSqrt32f, cv_hal_invSqrt32f, src, dst, len); #if defined(HAVE_IPP) CV_IPP_CHECK() { @@ -256,6 +330,7 @@ void invSqrt32f(const float* src, float* dst, int len) void invSqrt64f(const double* src, double* dst, int len) { + CALL_HAL(invSqrt64f, cv_hal_invSqrt64f, src, dst, len); int i = 0; #if CV_SSE2 @@ -271,6 +346,7 @@ void invSqrt64f(const double* src, double* dst, int len) void sqrt32f(const float* src, float* dst, int len) { + CALL_HAL(sqrt32f, cv_hal_sqrt32f, src, dst, len); #if defined(HAVE_IPP) CV_IPP_CHECK() { @@ -302,6 +378,7 @@ void sqrt32f(const float* src, float* dst, int len) void sqrt64f(const double* src, double* dst, int len) { + CALL_HAL(sqrt64f, cv_hal_sqrt64f, src, dst, len); #if defined(HAVE_IPP) CV_IPP_CHECK() { @@ -433,6 +510,7 @@ static const double exp_max_val = 3000.*(1 << EXPTAB_SCALE); // log10(DBL_MAX) < void exp32f( const float *_x, float *y, int n ) { + CALL_HAL(exp32f, cv_hal_exp32f, _x, y, n); static const float A4 = (float)(1.000000000000002438532970795181890933776 / EXPPOLY_32F_A0), A3 = (float)(.6931471805521448196800669615864773144641 / EXPPOLY_32F_A0), @@ -632,6 +710,7 @@ void exp32f( const float *_x, float *y, int n ) void exp64f( const double *_x, double *y, int n ) { + CALL_HAL(exp64f, cv_hal_exp64f, _x, y, n); static const double A5 = .99999999999999999998285227504999 / EXPPOLY_32F_A0, A4 = .69314718055994546743029643825322 / EXPPOLY_32F_A0, @@ -1076,6 +1155,7 @@ static const double ln_2 = 0.69314718055994530941723212145818; void log32f( const float *_x, float *y, int n ) { + CALL_HAL(log32f, cv_hal_log32f, _x, y, n); static const float shift[] = { 0, -1.f/512 }; static const float A0 = 0.3333333333333333333333333f, @@ -1220,6 +1300,7 @@ void log32f( const float *_x, float *y, int n ) void log64f( const double *x, double *y, int n ) { + CALL_HAL(log64f, cv_hal_log64f, x, y, n); static const double shift[] = { 0, -1./512 }; static const double A7 = 1.0, @@ -1457,4 +1538,10 @@ void invSqrt(const double* src, double* dst, int len) } -}} // cv::hal:: +} // cv::hal:: +} // cv:: + +float cv::fastAtan2( float y, float x ) +{ + return atanImpl(y, x); +} diff --git a/modules/core/test/test_intrin.cpp b/modules/core/test/test_intrin.cpp index 42b0cfcfd..ca9d3dc7b 100644 --- a/modules/core/test/test_intrin.cpp +++ b/modules/core/test/test_intrin.cpp @@ -69,8 +69,8 @@ template struct TheTest EXPECT_EQ(d, res); // zero, all - Data resZ = RegTrait::zero(); - Data resV = RegTrait::all(8); + Data resZ = V_RegTrait128::zero(); + Data resV = V_RegTrait128::all(8); for (int i = 0; i < R::nlanes; ++i) { EXPECT_EQ((LaneType)0, resZ[i]); @@ -135,7 +135,7 @@ template struct TheTest // v_expand and v_load_expand TheTest & test_expand() { - typedef typename RegTrait::w_reg Rx2; + typedef typename V_RegTrait128::w_reg Rx2; Data dataA; R a = dataA; @@ -158,7 +158,7 @@ template struct TheTest TheTest & test_expand_q() { - typedef typename RegTrait::q_reg Rx4; + typedef typename V_RegTrait128::q_reg Rx4; Data data; Data out = v_load_expand_q(data.d); const int n = Rx4::nlanes; @@ -232,7 +232,7 @@ template struct TheTest TheTest & test_mul_expand() { - typedef typename RegTrait::w_reg Rx2; + typedef typename V_RegTrait128::w_reg Rx2; Data dataA, dataB(2); R a = dataA, b = dataB; Rx2 c, d; @@ -295,7 +295,7 @@ template struct TheTest TheTest & test_dot_prod() { - typedef typename RegTrait::w_reg Rx2; + typedef typename V_RegTrait128::w_reg Rx2; Data dataA, dataB(2); R a = dataA, b = dataB; @@ -361,7 +361,7 @@ template struct TheTest TheTest & test_absdiff() { - typedef typename RegTrait::u_reg Ru; + typedef typename V_RegTrait128::u_reg Ru; typedef typename Ru::lane_type u_type; Data dataA(std::numeric_limits::max()), dataB(std::numeric_limits::min()); @@ -445,7 +445,7 @@ template struct TheTest template TheTest & test_pack() { - typedef typename RegTrait::w_reg Rx2; + typedef typename V_RegTrait128::w_reg Rx2; typedef typename Rx2::lane_type w_type; Data dataA, dataB; dataA += std::numeric_limits::is_signed ? -10 : 10; @@ -480,8 +480,8 @@ template struct TheTest template TheTest & test_pack_u() { - typedef typename RegTrait::w_reg Rx2; - typedef typename RegTrait::int_reg Ri2; + typedef typename V_TypeTraits::w_type LaneType_w; + typedef typename V_RegTrait128::int_reg Ri2; typedef typename Ri2::lane_type w_type; Data dataA, dataB; @@ -572,7 +572,7 @@ template struct TheTest TheTest & test_float_math() { - typedef typename RegTrait::int_reg Ri; + typedef typename V_RegTrait128::int_reg Ri; Data data1, data2, data3; data1 *= 1.1; data2 += 10; diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp index a0eab56a5..1f8a78d98 100644 --- a/modules/core/test/test_intrin_utils.hpp +++ b/modules/core/test/test_intrin_utils.hpp @@ -155,80 +155,4 @@ template std::ostream & operator<<(std::ostream & out, const Data struct RegTrait; - -template <> struct RegTrait { - typedef cv::v_uint16x8 w_reg; - typedef cv::v_uint32x4 q_reg; - typedef cv::v_uint8x16 u_reg; - static cv::v_uint8x16 zero() { return cv::v_setzero_u8(); } - static cv::v_uint8x16 all(uchar val) { return cv::v_setall_u8(val); } -}; -template <> struct RegTrait { - typedef cv::v_int16x8 w_reg; - typedef cv::v_int32x4 q_reg; - typedef cv::v_uint8x16 u_reg; - static cv::v_int8x16 zero() { return cv::v_setzero_s8(); } - static cv::v_int8x16 all(schar val) { return cv::v_setall_s8(val); } -}; - -template <> struct RegTrait { - typedef cv::v_uint32x4 w_reg; - typedef cv::v_int16x8 int_reg; - typedef cv::v_uint16x8 u_reg; - static cv::v_uint16x8 zero() { return cv::v_setzero_u16(); } - static cv::v_uint16x8 all(ushort val) { return cv::v_setall_u16(val); } -}; - -template <> struct RegTrait { - typedef cv::v_int32x4 w_reg; - typedef cv::v_uint16x8 u_reg; - static cv::v_int16x8 zero() { return cv::v_setzero_s16(); } - static cv::v_int16x8 all(short val) { return cv::v_setall_s16(val); } -}; - -template <> struct RegTrait { - typedef cv::v_uint64x2 w_reg; - typedef cv::v_int32x4 int_reg; - typedef cv::v_uint32x4 u_reg; - static cv::v_uint32x4 zero() { return cv::v_setzero_u32(); } - static cv::v_uint32x4 all(unsigned val) { return cv::v_setall_u32(val); } -}; - -template <> struct RegTrait { - typedef cv::v_int64x2 w_reg; - typedef cv::v_uint32x4 u_reg; - static cv::v_int32x4 zero() { return cv::v_setzero_s32(); } - static cv::v_int32x4 all(int val) { return cv::v_setall_s32(val); } -}; - -template <> struct RegTrait { - static cv::v_uint64x2 zero() { return cv::v_setzero_u64(); } - static cv::v_uint64x2 all(uint64 val) { return cv::v_setall_u64(val); } -}; - -template <> struct RegTrait { - static cv::v_int64x2 zero() { return cv::v_setzero_s64(); } - static cv::v_int64x2 all(int64 val) { return cv::v_setall_s64(val); } -}; - -template <> struct RegTrait { - typedef cv::v_int32x4 int_reg; - typedef cv::v_float32x4 u_reg; - static cv::v_float32x4 zero() { return cv::v_setzero_f32(); } - static cv::v_float32x4 all(float val) { return cv::v_setall_f32(val); } -}; - -#if CV_SIMD128_64F -template <> struct RegTrait { - typedef cv::v_int32x4 int_reg; - typedef cv::v_float64x2 u_reg; - static cv::v_float64x2 zero() { return cv::v_setzero_f64(); } - static cv::v_float64x2 all(double val) { return cv::v_setall_f64(val); } -}; - -#endif - #endif diff --git a/modules/core/test/test_math.cpp b/modules/core/test/test_math.cpp index 65e3861ed..288c0c1f4 100644 --- a/modules/core/test/test_math.cpp +++ b/modules/core/test/test_math.cpp @@ -2411,8 +2411,9 @@ TEST(Core_SolvePoly, regression_5599) class Core_PhaseTest : public cvtest::BaseTest { + int t; public: - Core_PhaseTest() {} + Core_PhaseTest(int t_) : t(t_) {} ~Core_PhaseTest() {} protected: virtual void run(int) @@ -2421,9 +2422,9 @@ protected: const int axisCount = 8; const int dim = theRNG().uniform(1,10); const float scale = theRNG().uniform(1.f, 100.f); - Mat x(axisCount + 1, dim, CV_32FC1), - y(axisCount + 1, dim, CV_32FC1); - Mat anglesInDegrees(axisCount + 1, dim, CV_32FC1); + Mat x(axisCount + 1, dim, t), + y(axisCount + 1, dim, t); + Mat anglesInDegrees(axisCount + 1, dim, t); // fill the data x.row(0).setTo(Scalar(0)); @@ -2696,8 +2697,8 @@ TEST(Core_SVD, accuracy) { Core_SVDTest test; test.safe_run(); } TEST(Core_SVBkSb, accuracy) { Core_SVBkSbTest test; test.safe_run(); } TEST(Core_Trace, accuracy) { Core_TraceTest test; test.safe_run(); } TEST(Core_SolvePoly, accuracy) { Core_SolvePolyTest test; test.safe_run(); } -TEST(Core_Phase, accuracy) { Core_PhaseTest test; test.safe_run(); } - +TEST(Core_Phase, accuracy32f) { Core_PhaseTest test(CV_32FC1); test.safe_run(); } +TEST(Core_Phase, accuracy64f) { Core_PhaseTest test(CV_64FC1); test.safe_run(); } TEST(Core_SVD, flt) { diff --git a/modules/features2d/src/kaze/AKAZEFeatures.cpp b/modules/features2d/src/kaze/AKAZEFeatures.cpp index 6f1b610cc..d5a029944 100644 --- a/modules/features2d/src/kaze/AKAZEFeatures.cpp +++ b/modules/features2d/src/kaze/AKAZEFeatures.cpp @@ -812,7 +812,7 @@ void AKAZEFeatures::Compute_Main_Orientation(KeyPoint& kpt, const std::vector(float)(2.0*CV_PI) ? ang1 - (float)(5.0*CV_PI / 3.0) : ang1 + (float)(CV_PI / 3.0));