HAL math interfaces: fastAtan2, magnitude, sqrt, invSqrt, log, exp

2016-05-24 13:57:27 +03:00
parent f6fa1cee2b
commit 1e667de1f3
10 changed files with 412 additions and 316 deletions
--- a/modules/core/src/hal_replacement.hpp
+++ b/modules/core/src/hal_replacement.hpp
@@ -376,6 +376,110 @@ inline int hal_ni_merge64s(const int64 **src_data, int64 *dst_data, int len, int
 #define cv_hal_merge64s hal_ni_merge64s
 //! @endcond

+
+/**
+@param y,x source Y and X arrays
+@param dst destination array
+@param len length of arrays
+@param angleInDegrees if set to true return angles in degrees, otherwise in radians
+ */
+//! @addtogroup core_hal_interface_fastAtan Atan calculation
+//! @{
+inline int hal_ni_fastAtan32f(const float* y, const float* x, float* dst, int len, bool angleInDegrees) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_fastAtan64f(const double* y, const double* x, double* dst, int len, bool angleInDegrees) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+//! @}
+
+//! @cond IGNORED
+#define cv_hal_fastAtan32f hal_ni_fastAtan32f
+#define cv_hal_fastAtan64f hal_ni_fastAtan64f
+//! @endcond
+
+
+/**
+@param x,y source X and Y arrays
+@param dst destination array
+@param len length of arrays
+ */
+//! @addtogroup core_hal_interface_magnitude Magnitude calculation
+//! @{
+inline int hal_ni_magnitude32f(const float *x, const float *y, float *dst, int len) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_magnitude64f(const double *x, const double  *y, double *dst, int len) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+//! @}
+
+//! @cond IGNORED
+#define cv_hal_magnitude32f hal_ni_magnitude32f
+#define cv_hal_magnitude64f hal_ni_magnitude64f
+//! @endcond
+
+
+/**
+@param src source array
+@param dst destination array
+@param len length of arrays
+ */
+//! @addtogroup core_hal_interface_invSqrt Inverse square root calculation
+//! @{
+inline int hal_ni_invSqrt32f(const float* src, float* dst, int len) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_invSqrt64f(const double* src, double* dst, int len) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+//! @}
+
+//! @cond IGNORED
+#define cv_hal_invSqrt32f hal_ni_invSqrt32f
+#define cv_hal_invSqrt64f hal_ni_invSqrt64f
+//! @endcond
+
+
+/**
+@param src source array
+@param dst destination array
+@param len length of arrays
+ */
+//! @addtogroup core_hal_interface_sqrt Square root calculation
+//! @{
+inline int hal_ni_sqrt32f(const float* src, float* dst, int len) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_sqrt64f(const double* src, double* dst, int len) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+//! @}
+
+//! @cond IGNORED
+#define cv_hal_sqrt32f hal_ni_sqrt32f
+#define cv_hal_sqrt64f hal_ni_sqrt64f
+//! @endcond
+
+
+/**
+@param src source array
+@param dst destination array
+@param len length of arrays
+ */
+//! @addtogroup core_hal_interface_log Natural logarithm calculation
+//! @{
+inline int hal_ni_log32f(const float* src, float* dst, int len) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_log64f(const double* src, double* dst, int len) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+//! @}
+
+//! @cond IGNORED
+#define cv_hal_log32f hal_ni_log32f
+#define cv_hal_log64f hal_ni_log64f
+//! @endcond
+
+
+/**
+@param src source array
+@param dst destination array
+@param len length of arrays
+ */
+//! @addtogroup core_hal_interface_exp Exponent calculation
+//! @{
+inline int hal_ni_exp32f(const float* src, float* dst, int len) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+inline int hal_ni_exp64f(const double* src, double* dst, int len) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+//! @}
+
+//! @cond IGNORED
+#define cv_hal_exp32f hal_ni_exp32f
+#define cv_hal_exp64f hal_ni_exp64f
+//! @endcond
+
+
 /**
@brief Dummy structure storing DFT/DCT context

--- a/modules/core/src/mathfuncs.cpp
+++ b/modules/core/src/mathfuncs.cpp
@@ -51,11 +51,6 @@ namespace cv

 typedef void (*MathFunc)(const void* src, void* dst, int len);

-static const float atan2_p1 = 0.9997878412794807f*(float)(180/CV_PI);
-static const float atan2_p3 = -0.3258083974640975f*(float)(180/CV_PI);
-static const float atan2_p5 = 0.1555786518463281f*(float)(180/CV_PI);
-static const float atan2_p7 = -0.04432655554792128f*(float)(180/CV_PI);
-
 #ifdef HAVE_OPENCL

 enum { OCL_OP_LOG=0, OCL_OP_EXP=1, OCL_OP_MAG=2, OCL_OP_PHASE_DEGREES=3, OCL_OP_PHASE_RADIANS=4 };
@@ -100,29 +95,6 @@ static bool ocl_math_op(InputArray _src1, InputArray _src2, OutputArray _dst, in

 #endif

-float fastAtan2( float y, float x )
-{
-    float ax = std::abs(x), ay = std::abs(y);
-    float a, c, c2;
-    if( ax >= ay )
-    {
-        c = ay/(ax + (float)DBL_EPSILON);
-        c2 = c*c;
-        a = (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c;
-    }
-    else
-    {
-        c = ax/(ay + (float)DBL_EPSILON);
-        c2 = c*c;
-        a = 90.f - (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c;
-    }
-    if( x < 0 )
-        a = 180.f - a;
-    if( y < 0 )
-        a = 360.f - a;
-    return a;
-}
-
 /* ************************************************************************** *\
   Fast cube root by Ken Turkowski
   (http://www.worldserver.com/turk/computergraphics/papers.html)
@@ -202,7 +174,6 @@ void magnitude( InputArray src1, InputArray src2, OutputArray dst )
    }
 }

-
 void phase( InputArray src1, InputArray src2, OutputArray dst, bool angleInDegrees )
 {
    int type = src1.type(), depth = src1.depth(), cn = src1.channels();
@@ -218,19 +189,8 @@ void phase( InputArray src1, InputArray src2, OutputArray dst, bool angleInDegre
    const Mat* arrays[] = {&X, &Y, &Angle, 0};
    uchar* ptrs[3];
    NAryMatIterator it(arrays, ptrs);
-    cv::AutoBuffer<float> _buf;
-    float* buf[2] = {0, 0};
-    int j, k, total = (int)(it.size*cn), blockSize = total;
+    int j, total = (int)(it.size*cn), blockSize = total;
    size_t esz1 = X.elemSize1();
-
-    if( depth == CV_64F )
-    {
-        blockSize = std::min(blockSize, ((BLOCK_SIZE+cn-1)/cn)*cn);
-        _buf.allocate(blockSize*2);
-        buf[0] = _buf;
-        buf[1] = buf[0] + blockSize;
-    }
-
    for( size_t i = 0; i < it.nplanes; i++, ++it )
    {
        for( j = 0; j < total; j += blockSize )
@@ -240,53 +200,13 @@ void phase( InputArray src1, InputArray src2, OutputArray dst, bool angleInDegre
            {
                const float *x = (const float*)ptrs[0], *y = (const float*)ptrs[1];
                float *angle = (float*)ptrs[2];
-                hal::fastAtan2( y, x, angle, len, angleInDegrees );
+                hal::fastAtan32f( y, x, angle, len, angleInDegrees );
            }
            else
            {
                const double *x = (const double*)ptrs[0], *y = (const double*)ptrs[1];
                double *angle = (double*)ptrs[2];
-                k = 0;
-
-#if CV_SSE2
-                if (USE_SSE2)
-                {
-                    for ( ; k <= len - 4; k += 4)
-                    {
-                        __m128 v_dst0 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(x + k)),
-                                                      _mm_cvtpd_ps(_mm_loadu_pd(x + k + 2)));
-                        __m128 v_dst1 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(y + k)),
-                                                      _mm_cvtpd_ps(_mm_loadu_pd(y + k + 2)));
-
-                        _mm_storeu_ps(buf[0] + k, v_dst0);
-                        _mm_storeu_ps(buf[1] + k, v_dst1);
-                    }
-                }
-#endif
-
-                for( ; k < len; k++ )
-                {
-                    buf[0][k] = (float)x[k];
-                    buf[1][k] = (float)y[k];
-                }
-
-                hal::fastAtan2( buf[1], buf[0], buf[0], len, angleInDegrees );
-                k = 0;
-
-#if CV_SSE2
-                if (USE_SSE2)
-                {
-                    for ( ; k <= len - 4; k += 4)
-                    {
-                        __m128 v_src = _mm_loadu_ps(buf[0] + k);
-                        _mm_storeu_pd(angle + k, _mm_cvtps_pd(v_src));
-                        _mm_storeu_pd(angle + k + 2, _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8))));
-                    }
-                }
-#endif
-
-                for( ; k < len; k++ )
-                    angle[k] = buf[0][k];
+                hal::fastAtan64f(y, x, angle, len, angleInDegrees);
            }
            ptrs[0] += len*esz1;
            ptrs[1] += len*esz1;
@@ -353,18 +273,9 @@ void cartToPolar( InputArray src1, InputArray src2,
    const Mat* arrays[] = {&X, &Y, &Mag, &Angle, 0};
    uchar* ptrs[4];
    NAryMatIterator it(arrays, ptrs);
-    cv::AutoBuffer<float> _buf;
-    float* buf[2] = {0, 0};
-    int j, k, total = (int)(it.size*cn), blockSize = std::min(total, ((BLOCK_SIZE+cn-1)/cn)*cn);
+    int j, total = (int)(it.size*cn), blockSize = std::min(total, ((BLOCK_SIZE+cn-1)/cn)*cn);
    size_t esz1 = X.elemSize1();

-    if( depth == CV_64F )
-    {
-        _buf.allocate(blockSize*2);
-        buf[0] = _buf;
-        buf[1] = buf[0] + blockSize;
-    }
-
    for( size_t i = 0; i < it.nplanes; i++, ++it )
    {
        for( j = 0; j < total; j += blockSize )
@@ -375,55 +286,14 @@ void cartToPolar( InputArray src1, InputArray src2,
                const float *x = (const float*)ptrs[0], *y = (const float*)ptrs[1];
                float *mag = (float*)ptrs[2], *angle = (float*)ptrs[3];
                hal::magnitude32f( x, y, mag, len );
-                hal::fastAtan2( y, x, angle, len, angleInDegrees );
+                hal::fastAtan32f( y, x, angle, len, angleInDegrees );
            }
            else
            {
                const double *x = (const double*)ptrs[0], *y = (const double*)ptrs[1];
                double *angle = (double*)ptrs[3];
-
                hal::magnitude64f(x, y, (double*)ptrs[2], len);
-                k = 0;
-
-#if CV_SSE2
-                if (USE_SSE2)
-                {
-                    for ( ; k <= len - 4; k += 4)
-                    {
-                        __m128 v_dst0 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(x + k)),
-                                                      _mm_cvtpd_ps(_mm_loadu_pd(x + k + 2)));
-                        __m128 v_dst1 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(y + k)),
-                                                      _mm_cvtpd_ps(_mm_loadu_pd(y + k + 2)));
-
-                        _mm_storeu_ps(buf[0] + k, v_dst0);
-                        _mm_storeu_ps(buf[1] + k, v_dst1);
-                    }
-                }
-#endif
-
-                for( ; k < len; k++ )
-                {
-                    buf[0][k] = (float)x[k];
-                    buf[1][k] = (float)y[k];
-                }
-
-                hal::fastAtan2( buf[1], buf[0], buf[0], len, angleInDegrees );
-                k = 0;
-
-#if CV_SSE2
-                if (USE_SSE2)
-                {
-                    for ( ; k <= len - 4; k += 4)
-                    {
-                        __m128 v_src = _mm_loadu_ps(buf[0] + k);
-                        _mm_storeu_pd(angle + k, _mm_cvtps_pd(v_src));
-                        _mm_storeu_pd(angle + k + 2, _mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8))));
-                    }
-                }
-#endif
-
-                for( ; k < len; k++ )
-                    angle[k] = buf[0][k];
+                hal::fastAtan64f(y, x, angle, len, angleInDegrees);
            }
            ptrs[0] += len*esz1;
            ptrs[1] += len*esz1;
--- a/modules/core/src/mathfuncs_core.cpp
+++ b/modules/core/src/mathfuncs_core.cpp
@@ -42,116 +42,188 @@

 #include "precomp.hpp"

+using namespace std;
+
 #undef HAVE_IPP

-namespace cv { namespace hal {
+namespace {

-///////////////////////////////////// ATAN2 ////////////////////////////////////
 static const float atan2_p1 = 0.9997878412794807f*(float)(180/CV_PI);
 static const float atan2_p3 = -0.3258083974640975f*(float)(180/CV_PI);
 static const float atan2_p5 = 0.1555786518463281f*(float)(180/CV_PI);
 static const float atan2_p7 = -0.04432655554792128f*(float)(180/CV_PI);

-void fastAtan2(const float *Y, const float *X, float *angle, int len, bool angleInDegrees )
-{
-    int i = 0;
-    float scale = angleInDegrees ? 1 : (float)(CV_PI/180);
+using namespace cv;

-#ifdef HAVE_TEGRA_OPTIMIZATION
-    if (tegra::useTegra() && tegra::FastAtan2_32f(Y, X, angle, len, scale))
-        return;
+#if CV_SIMD128
+
+template <typename T>
+struct v_atan
+{
+    typedef V_RegTrait128<T> Trait;
+    typedef typename Trait::reg VT; // vector type
+    enum { WorkWidth = VT::nlanes * 2 };
+
+    v_atan(const T & scale)
+        : s(Trait::all(scale))
+    {
+        eps = Trait::all(DBL_EPSILON);
+        z = Trait::zero();
+        p7 = Trait::all(atan2_p7);
+        p5 = Trait::all(atan2_p5);
+        p3 = Trait::all(atan2_p3);
+        p1 = Trait::all(atan2_p1);
+        val90 = Trait::all(90.f);
+        val180 = Trait::all(180.f);
+        val360 = Trait::all(360.f);
+    }
+
+    inline int operator()(int len, const T * Y, const T * X, T * angle)
+    {
+        int i = 0;
+        const int c = VT::nlanes;
+        for ( ; i <= len - c * 2; i += c * 2)
+        {
+            VT x1 = v_load(X + i);
+            VT x2 = v_load(X + i + c);
+            VT y1 = v_load(Y + i);
+            VT y2 = v_load(Y + i + c);
+            v_store(&angle[i], s * one(x1, y1));
+            v_store(&angle[i + c], s * one(x2, y2));
+        }
+        return i;
+    }
+
+private:
+    inline VT one(VT & x, VT & y)
+    {
+        VT ax = v_abs(x);
+        VT ay = v_abs(y);
+        VT c = v_min(ax, ay) / (v_max(ax, ay) + eps);
+        VT cc = c * c;
+        VT a = (((p7 * cc + p5) * cc + p3) * cc + p1) * c;
+        a = v_select(ax >= ay, a, val90 - a);
+        a = v_select(x < z, val180 - a, a);
+        a = v_select(y < z, val360 - a, a);
+        return a;
+    }
+
+private:
+    VT eps;
+    VT z;
+    VT p7;
+    VT p5;
+    VT p3;
+    VT p1;
+    VT val90;
+    VT val180;
+    VT val360;
+    VT s;
+};
+
+#if !CV_SIMD128_64F
+
+// emulation
+template <>
+struct v_atan<double>
+{
+    v_atan(double scale) : impl(static_cast<float>(scale)) {}
+    inline int operator()(int len, const double * Y, const double * X, double * angle)
+    {
+        int i = 0;
+        const int c = v_atan<float>::WorkWidth;
+        float bufY[c];
+        float bufX[c];
+        float bufA[c];
+        for ( ; i <= len - c ; i += c)
+        {
+            for (int j = 0; j < c; ++j)
+            {
+                bufY[j] = static_cast<float>(Y[i + j]);
+                bufX[j] = static_cast<float>(X[i + j]);
+            }
+            impl(c, bufY, bufX, bufA);
+            for (int j = 0; j < c; ++j)
+            {
+                angle[i + j] = bufA[j];
+            }
+        }
+        return i;
+    }
+private:
+    v_atan<float> impl;
+};
 #endif

-#if CV_SSE2
-    Cv32suf iabsmask; iabsmask.i = 0x7fffffff;
-    __m128 eps = _mm_set1_ps((float)DBL_EPSILON), absmask = _mm_set1_ps(iabsmask.f);
-    __m128 _90 = _mm_set1_ps(90.f), _180 = _mm_set1_ps(180.f), _360 = _mm_set1_ps(360.f);
-    __m128 z = _mm_setzero_ps(), scale4 = _mm_set1_ps(scale);
-    __m128 p1 = _mm_set1_ps(atan2_p1), p3 = _mm_set1_ps(atan2_p3);
-    __m128 p5 = _mm_set1_ps(atan2_p5), p7 = _mm_set1_ps(atan2_p7);
+#endif

-    for( ; i <= len - 4; i += 4 )
+template <typename T>
+static inline T atanImpl(T y, T x)
+{
+    T ax = std::abs(x), ay = std::abs(y);
+    T a, c, c2;
+    if( ax >= ay )
    {
-        __m128 x = _mm_loadu_ps(X + i), y = _mm_loadu_ps(Y + i);
-        __m128 ax = _mm_and_ps(x, absmask), ay = _mm_and_ps(y, absmask);
-        __m128 mask = _mm_cmplt_ps(ax, ay);
-        __m128 tmin = _mm_min_ps(ax, ay), tmax = _mm_max_ps(ax, ay);
-        __m128 c = _mm_div_ps(tmin, _mm_add_ps(tmax, eps));
-        __m128 c2 = _mm_mul_ps(c, c);
-        __m128 a = _mm_mul_ps(c2, p7);
-        a = _mm_mul_ps(_mm_add_ps(a, p5), c2);
-        a = _mm_mul_ps(_mm_add_ps(a, p3), c2);
-        a = _mm_mul_ps(_mm_add_ps(a, p1), c);
-
-        __m128 b = _mm_sub_ps(_90, a);
-        a = _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(a, b), mask));
-
-        b = _mm_sub_ps(_180, a);
-        mask = _mm_cmplt_ps(x, z);
-        a = _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(a, b), mask));
-
-        b = _mm_sub_ps(_360, a);
-        mask = _mm_cmplt_ps(y, z);
-        a = _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(a, b), mask));
-
-        a = _mm_mul_ps(a, scale4);
-        _mm_storeu_ps(angle + i, a);
+        c = ay/(ax + static_cast<T>(DBL_EPSILON));
+        c2 = c*c;
+        a = (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c;
    }
-#elif CV_NEON
-    float32x4_t eps = vdupq_n_f32((float)DBL_EPSILON);
-    float32x4_t _90 = vdupq_n_f32(90.f), _180 = vdupq_n_f32(180.f), _360 = vdupq_n_f32(360.f);
-    float32x4_t z = vdupq_n_f32(0.0f), scale4 = vdupq_n_f32(scale);
-    float32x4_t p1 = vdupq_n_f32(atan2_p1), p3 = vdupq_n_f32(atan2_p3);
-    float32x4_t p5 = vdupq_n_f32(atan2_p5), p7 = vdupq_n_f32(atan2_p7);
-
-    for( ; i <= len - 4; i += 4 )
+    else
    {
-        float32x4_t x = vld1q_f32(X + i), y = vld1q_f32(Y + i);
-        float32x4_t ax = vabsq_f32(x), ay = vabsq_f32(y);
-        float32x4_t tmin = vminq_f32(ax, ay), tmax = vmaxq_f32(ax, ay);
-        float32x4_t c = vmulq_f32(tmin, cv_vrecpq_f32(vaddq_f32(tmax, eps)));
-        float32x4_t c2 = vmulq_f32(c, c);
-        float32x4_t a = vmulq_f32(c2, p7);
-        a = vmulq_f32(vaddq_f32(a, p5), c2);
-        a = vmulq_f32(vaddq_f32(a, p3), c2);
-        a = vmulq_f32(vaddq_f32(a, p1), c);
-
-        a = vbslq_f32(vcgeq_f32(ax, ay), a, vsubq_f32(_90, a));
-        a = vbslq_f32(vcltq_f32(x, z), vsubq_f32(_180, a), a);
-        a = vbslq_f32(vcltq_f32(y, z), vsubq_f32(_360, a), a);
-
-        vst1q_f32(angle + i, vmulq_f32(a, scale4));
+        c = ax/(ay + static_cast<T>(DBL_EPSILON));
+        c2 = c*c;
+        a = 90.f - (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c;
    }
+    if( x < 0 )
+        a = 180.f - a;
+    if( y < 0 )
+        a = 360.f - a;
+    return a;
+}
+
+template <typename T>
+static inline void atanImpl(const T *Y, const T *X, T *angle, int len, bool angleInDegrees)
+{
+    int i = 0;
+    T scale = angleInDegrees ? 1 : static_cast<T>(CV_PI/180);
+
+#if CV_SIMD128
+    i = v_atan<T>(scale)(len, Y, X, angle);
 #endif

    for( ; i < len; i++ )
    {
-        float x = X[i], y = Y[i];
-        float ax = std::abs(x), ay = std::abs(y);
-        float a, c, c2;
-        if( ax >= ay )
-        {
-            c = ay/(ax + (float)DBL_EPSILON);
-            c2 = c*c;
-            a = (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c;
-        }
-        else
-        {
-            c = ax/(ay + (float)DBL_EPSILON);
-            c2 = c*c;
-            a = 90.f - (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c;
-        }
-        if( x < 0 )
-            a = 180.f - a;
-        if( y < 0 )
-            a = 360.f - a;
-        angle[i] = (float)(a*scale);
+        angle[i] = atanImpl<T>(Y[i], X[i]) * scale;
    }
 }

+} // anonymous::
+
+namespace cv { namespace hal {
+
+///////////////////////////////////// ATAN2 ////////////////////////////////////
+
+void fastAtan32f(const float *Y, const float *X, float *angle, int len, bool angleInDegrees )
+{
+    CALL_HAL(fastAtan32f, cv_hal_fastAtan32f, Y, X, angle, len, angleInDegrees);
+    atanImpl<float>(Y, X, angle, len, angleInDegrees);
+}
+
+void fastAtan64f(const double *Y, const double *X, double *angle, int len, bool angleInDegrees)
+{
+    CALL_HAL(fastAtan64f, cv_hal_fastAtan64f, Y, X, angle, len, angleInDegrees);
+    atanImpl<double>(Y, X, angle, len, angleInDegrees);
+}
+
+// deprecated
+void fastAtan2(const float *Y, const float *X, float *angle, int len, bool angleInDegrees )
+{
+    fastAtan32f(Y, X, angle, len, angleInDegrees);
+}

 void magnitude32f(const float* x, const float* y, float* mag, int len)
 {
+    CALL_HAL(magnitude32f, cv_hal_magnitude32f, x, y, mag, len);
 #if defined HAVE_IPP
    CV_IPP_CHECK()
    {
@@ -188,6 +260,7 @@ void magnitude32f(const float* x, const float* y, float* mag, int len)

 void magnitude64f(const double* x, const double* y, double* mag, int len)
 {
+    CALL_HAL(magnitude64f, cv_hal_magnitude64f, x, y, mag, len);
 #if defined(HAVE_IPP)
    CV_IPP_CHECK()
    {
@@ -225,6 +298,7 @@ void magnitude64f(const double* x, const double* y, double* mag, int len)

 void invSqrt32f(const float* src, float* dst, int len)
 {
+    CALL_HAL(invSqrt32f, cv_hal_invSqrt32f, src, dst, len);
 #if defined(HAVE_IPP)
    CV_IPP_CHECK()
    {
@@ -256,6 +330,7 @@ void invSqrt32f(const float* src, float* dst, int len)

 void invSqrt64f(const double* src, double* dst, int len)
 {
+    CALL_HAL(invSqrt64f, cv_hal_invSqrt64f, src, dst, len);
    int i = 0;

 #if CV_SSE2
@@ -271,6 +346,7 @@ void invSqrt64f(const double* src, double* dst, int len)

 void sqrt32f(const float* src, float* dst, int len)
 {
+    CALL_HAL(sqrt32f, cv_hal_sqrt32f, src, dst, len);
 #if defined(HAVE_IPP)
    CV_IPP_CHECK()
    {
@@ -302,6 +378,7 @@ void sqrt32f(const float* src, float* dst, int len)

 void sqrt64f(const double* src, double* dst, int len)
 {
+    CALL_HAL(sqrt64f, cv_hal_sqrt64f, src, dst, len);
 #if defined(HAVE_IPP)
    CV_IPP_CHECK()
    {
@@ -433,6 +510,7 @@ static const double exp_max_val = 3000.*(1 << EXPTAB_SCALE); // log10(DBL_MAX) <

 void exp32f( const float *_x, float *y, int n )
 {
+    CALL_HAL(exp32f, cv_hal_exp32f, _x, y, n);
    static const float
    A4 = (float)(1.000000000000002438532970795181890933776 / EXPPOLY_32F_A0),
    A3 = (float)(.6931471805521448196800669615864773144641 / EXPPOLY_32F_A0),
@@ -632,6 +710,7 @@ void exp32f( const float *_x, float *y, int n )

 void exp64f( const double *_x, double *y, int n )
 {
+    CALL_HAL(exp64f, cv_hal_exp64f, _x, y, n);
    static const double
    A5 = .99999999999999999998285227504999 / EXPPOLY_32F_A0,
    A4 = .69314718055994546743029643825322 / EXPPOLY_32F_A0,
@@ -1076,6 +1155,7 @@ static const double ln_2 = 0.69314718055994530941723212145818;

 void log32f( const float *_x, float *y, int n )
 {
+    CALL_HAL(log32f, cv_hal_log32f, _x, y, n);
    static const float shift[] = { 0, -1.f/512 };
    static const float
    A0 = 0.3333333333333333333333333f,
@@ -1220,6 +1300,7 @@ void log32f( const float *_x, float *y, int n )

 void log64f( const double *x, double *y, int n )
 {
+    CALL_HAL(log64f, cv_hal_log64f, x, y, n);
    static const double shift[] = { 0, -1./512 };
    static const double
    A7 = 1.0,
@@ -1457,4 +1538,10 @@ void invSqrt(const double* src, double* dst, int len)
 }


-}} // cv::hal::
+} // cv::hal::
+} // cv::
+
+float cv::fastAtan2( float y, float x )
+{
+    return atanImpl<float>(y, x);
+}