fully implemented SSE and NEON cases of intrin.hpp; extended the HAL with some basic math functions

2015-04-16 23:00:26 +03:00
parent a2bba1b9e6
commit ee11a2d266
18 changed files with 2460 additions and 2003 deletions
--- a/modules/calib3d/test/test_fisheye.cpp
+++ b/modules/calib3d/test/test_fisheye.cpp
@@ -381,7 +381,7 @@ TEST_F(fisheyeTest, EtimateUncertainties)
    EXPECT_MAT_NEAR(errors.c, cv::Vec2d(0.890439368129246, 0.816096854937896), 1e-10);
    EXPECT_MAT_NEAR(errors.k, cv::Vec4d(0.00516248605191506, 0.0168181467500934, 0.0213118690274604, 0.00916010877545648), 1e-10);
    EXPECT_MAT_NEAR(err_std, cv::Vec2d(0.187475975266883, 0.185678953263995), 1e-10);
-    CV_Assert(abs(rms - 0.263782587133546) < 1e-10);
+    CV_Assert(fabs(rms - 0.263782587133546) < 1e-10);
    CV_Assert(errors.alpha == 0);
 }
--- a/modules/core/include/opencv2/core/base.hpp
+++ b/modules/core/include/opencv2/core/base.hpp
@@ -53,6 +53,7 @@
 #include "opencv2/core/cvdef.h"
 #include "opencv2/core/cvstd.hpp"
 #include "opencv2/hal.hpp"
 namespace cv
 {
@@ -419,6 +420,12 @@ typedef Hamming HammingLUT;
 /////////////////////////////////// inline norms ////////////////////////////////////
 template<typename _Tp> inline _Tp cv_abs(_Tp x) { return std::abs(x); }
 inline int cv_abs(uchar x) { return x; }
 inline int cv_abs(schar x) { return std::abs(x); }
 inline int cv_abs(ushort x) { return x; }
 inline int cv_abs(short x) { return std::abs(x); }
 template<typename _Tp, typename _AccTp> static inline
 _AccTp normL2Sqr(const _Tp* a, int n)
 {
@@ -447,12 +454,12 @@ _AccTp normL1(const _Tp* a, int n)
 #if CV_ENABLE_UNROLLED
    for(; i <= n - 4; i += 4 )
    {
-        s += (_AccTp)std::abs(a[i]) + (_AccTp)std::abs(a[i+1]) +
+        s += (_AccTp)cv_abs(a[i]) + (_AccTp)cv_abs(a[i+1]) +
-            (_AccTp)std::abs(a[i+2]) + (_AccTp)std::abs(a[i+3]);
+            (_AccTp)cv_abs(a[i+2]) + (_AccTp)cv_abs(a[i+3]);
    }
 #endif
    for( ; i < n; i++ )
-        s += std::abs(a[i]);
+        s += cv_abs(a[i]);
    return s;
 }
@@ -461,7 +468,7 @@ _AccTp normInf(const _Tp* a, int n)
 {
    _AccTp s = 0;
    for( int i = 0; i < n; i++ )
-        s = std::max(s, (_AccTp)std::abs(a[i]));
+        s = std::max(s, (_AccTp)cv_abs(a[i]));
    return s;
 }
@@ -485,11 +492,10 @@ _AccTp normL2Sqr(const _Tp* a, const _Tp* b, int n)
    return s;
 }
-template<> inline
+inline float normL2Sqr(const float* a, const float* b, int n)
 float normL2Sqr(const float* a, const float* b, int n)
 {
    if( n >= 8 )
-        return normL2Sqr_(a, b, n);
+        return hal::normL2Sqr_(a, b, n);
    float s = 0;
    for( int i = 0; i < n; i++ )
    {
@@ -519,11 +525,10 @@ _AccTp normL1(const _Tp* a, const _Tp* b, int n)
    return s;
 }
-template<> inline
+inline float normL1(const float* a, const float* b, int n)
 float normL1(const float* a, const float* b, int n)
 {
    if( n >= 8 )
-        return normL1_(a, b, n);
+        return hal::normL1_(a, b, n);
    float s = 0;
    for( int i = 0; i < n; i++ )
    {
@@ -533,10 +538,9 @@ float normL1(const float* a, const float* b, int n)
    return s;
 }
-template<> inline
+inline int normL1(const uchar* a, const uchar* b, int n)
 int normL1(const uchar* a, const uchar* b, int n)
 {
-    return normL1_(a, b, n);
+    return hal::normL1_(a, b, n);
 }
 template<typename _Tp, typename _AccTp> static inline
@@ -551,6 +555,23 @@ _AccTp normInf(const _Tp* a, const _Tp* b, int n)
    return s;
 }
 /** @brief Computes the cube root of an argument.
 The function cubeRoot computes \f$\sqrt[3]{\texttt{val}}\f$. Negative arguments are handled correctly.
 NaN and Inf are not handled. The accuracy approaches the maximum possible accuracy for
 single-precision data.
 @param val A function argument.
 */
 CV_EXPORTS_W float cubeRoot(float val);
 /** @brief Calculates the angle of a 2D vector in degrees.
 The function fastAtan2 calculates the full-range angle of an input 2D vector. The angle is measured
 in degrees and varies from 0 to 360 degrees. The accuracy is about 0.3 degrees.
 @param x x-coordinate of the vector.
 @param y y-coordinate of the vector.
 */
 CV_EXPORTS_W float fastAtan2(float y, float x);
 ////////////////// forward declarations for important OpenCV types //////////////////
--- a/modules/core/include/opencv2/core/matx.hpp
+++ b/modules/core/include/opencv2/core/matx.hpp
@@ -427,7 +427,7 @@ template<typename _Tp, int m> struct Matx_DetOp
    double operator ()(const Matx<_Tp, m, m>& a) const
    {
        Matx<_Tp, m, m> temp = a;
-        double p = LU(temp.val, m*sizeof(_Tp), m, 0, 0, 0);
+        double p = hal::LU(temp.val, m*sizeof(_Tp), m, 0, 0, 0);
        if( p == 0 )
            return p;
        for( int i = 0; i < m; i++ )
--- a/modules/core/include/opencv2/core/operations.hpp
+++ b/modules/core/include/opencv2/core/operations.hpp
@@ -72,9 +72,9 @@ template<typename _Tp, int m> struct Matx_FastInvOp
            b(i, i) = (_Tp)1;
        if( method == DECOMP_CHOLESKY )
-            return Cholesky(temp.val, m*sizeof(_Tp), m, b.val, m*sizeof(_Tp), m);
+            return hal::Cholesky(temp.val, m*sizeof(_Tp), m, b.val, m*sizeof(_Tp), m);
-        return LU(temp.val, m*sizeof(_Tp), m, b.val, m*sizeof(_Tp), m) != 0;
+        return hal::LU(temp.val, m*sizeof(_Tp), m, b.val, m*sizeof(_Tp), m) != 0;
    }
 };
--- a/modules/core/src/kmeans.cpp
+++ b/modules/core/src/kmeans.cpp
@@ -79,7 +79,7 @@ public:
        for ( int i = begin; i<end; i++ )
        {
-            tdist2[i] = std::min(normL2Sqr_(data + step*i, data + stepci, dims), dist[i]);
+            tdist2[i] = std::min(normL2Sqr(data + step*i, data + stepci, dims), dist[i]);
        }
    }
@@ -114,7 +114,7 @@ static void generateCentersPP(const Mat& _data, Mat& _out_centers,
    for( i = 0; i < N; i++ )
    {
-        dist[i] = normL2Sqr_(data + step*i, data + step*centers[0], dims);
+        dist[i] = normL2Sqr(data + step*i, data + step*centers[0], dims);
        sum0 += dist[i];
    }
@@ -189,7 +189,7 @@ public:
            for( int k = 0; k < K; k++ )
            {
                const float* center = centers.ptr<float>(k);
-                const double dist = normL2Sqr_(sample, center, dims);
+                const double dist = normL2Sqr(sample, center, dims);
                if( min_dist > dist )
                {
@@ -384,7 +384,7 @@ double cv::kmeans( InputArray _data, int K,
                        if( labels[i] != max_k )
                            continue;
                        sample = data.ptr<float>(i);
-                        double dist = normL2Sqr_(sample, _old_center, dims);
+                        double dist = normL2Sqr(sample, _old_center, dims);
                        if( max_dist <= dist )
                        {
--- a/modules/core/src/lapack.cpp
+++ b/modules/core/src/lapack.cpp
@@ -50,168 +50,6 @@
 namespace cv
 {
 /****************************************************************************************\
 *                     LU & Cholesky implementation for small matrices                    *
 \****************************************************************************************/
 template<typename _Tp> static inline int
 LUImpl(_Tp* A, size_t astep, int m, _Tp* b, size_t bstep, int n)
 {
    int i, j, k, p = 1;
    astep /= sizeof(A[0]);
    bstep /= sizeof(b[0]);
    for( i = 0; i < m; i++ )
    {
        k = i;
        for( j = i+1; j < m; j++ )
            if( std::abs(A[j*astep + i]) > std::abs(A[k*astep + i]) )
                k = j;
        if( std::abs(A[k*astep + i]) < std::numeric_limits<_Tp>::epsilon() )
            return 0;
        if( k != i )
        {
            for( j = i; j < m; j++ )
                std::swap(A[i*astep + j], A[k*astep + j]);
            if( b )
                for( j = 0; j < n; j++ )
                    std::swap(b[i*bstep + j], b[k*bstep + j]);
            p = -p;
        }
        _Tp d = -1/A[i*astep + i];
        for( j = i+1; j < m; j++ )
        {
            _Tp alpha = A[j*astep + i]*d;
            for( k = i+1; k < m; k++ )
                A[j*astep + k] += alpha*A[i*astep + k];
            if( b )
                for( k = 0; k < n; k++ )
                    b[j*bstep + k] += alpha*b[i*bstep + k];
        }
        A[i*astep + i] = -d;
    }
    if( b )
    {
        for( i = m-1; i >= 0; i-- )
            for( j = 0; j < n; j++ )
            {
                _Tp s = b[i*bstep + j];
                for( k = i+1; k < m; k++ )
                    s -= A[i*astep + k]*b[k*bstep + j];
                b[i*bstep + j] = s*A[i*astep + i];
            }
    }
    return p;
 }
 int LU(float* A, size_t astep, int m, float* b, size_t bstep, int n)
 {
    return LUImpl(A, astep, m, b, bstep, n);
 }
 int LU(double* A, size_t astep, int m, double* b, size_t bstep, int n)
 {
    return LUImpl(A, astep, m, b, bstep, n);
 }
 template<typename _Tp> static inline bool
 CholImpl(_Tp* A, size_t astep, int m, _Tp* b, size_t bstep, int n)
 {
    _Tp* L = A;
    int i, j, k;
    double s;
    astep /= sizeof(A[0]);
    bstep /= sizeof(b[0]);
    for( i = 0; i < m; i++ )
    {
        for( j = 0; j < i; j++ )
        {
            s = A[i*astep + j];
            for( k = 0; k < j; k++ )
                s -= L[i*astep + k]*L[j*astep + k];
            L[i*astep + j] = (_Tp)(s*L[j*astep + j]);
        }
        s = A[i*astep + i];
        for( k = 0; k < j; k++ )
        {
            double t = L[i*astep + k];
            s -= t*t;
        }
        if( s < std::numeric_limits<_Tp>::epsilon() )
            return false;
        L[i*astep + i] = (_Tp)(1./std::sqrt(s));
    }
    if( !b )
        return true;
    // LLt x = b
    // 1: L y = b
    // 2. Lt x = y
    /*
     [ L00             ]  y0   b0
     [ L10 L11         ]  y1 = b1
     [ L20 L21 L22     ]  y2   b2
     [ L30 L31 L32 L33 ]  y3   b3
     [ L00 L10 L20 L30 ]  x0   y0
     [     L11 L21 L31 ]  x1 = y1
     [         L22 L32 ]  x2   y2
     [             L33 ]  x3   y3
    */
    for( i = 0; i < m; i++ )
    {
        for( j = 0; j < n; j++ )
        {
            s = b[i*bstep + j];
            for( k = 0; k < i; k++ )
                s -= L[i*astep + k]*b[k*bstep + j];
            b[i*bstep + j] = (_Tp)(s*L[i*astep + i]);
        }
    }
    for( i = m-1; i >= 0; i-- )
    {
        for( j = 0; j < n; j++ )
        {
            s = b[i*bstep + j];
            for( k = m-1; k > i; k-- )
                s -= L[k*astep + i]*b[k*bstep + j];
            b[i*bstep + j] = (_Tp)(s*L[i*astep + i]);
        }
    }
    return true;
 }
 bool Cholesky(float* A, size_t astep, int m, float* b, size_t bstep, int n)
 {
    return CholImpl(A, astep, m, b, bstep, n);
 }
 bool Cholesky(double* A, size_t astep, int m, double* b, size_t bstep, int n)
 {
    return CholImpl(A, astep, m, b, bstep, n);
 }
 template<typename _Tp> static inline _Tp hypot(_Tp a, _Tp b)
 {
    a = std::abs(a);
@@ -882,7 +720,7 @@ double cv::determinant( InputArray _mat )
            Mat a(rows, rows, CV_32F, (uchar*)buffer);
            mat.copyTo(a);
-            result = LU(a.ptr<float>(), a.step, rows, 0, 0, 0);
+            result = hal::LU(a.ptr<float>(), a.step, rows, 0, 0, 0);
            if( result )
            {
                for( int i = 0; i < rows; i++ )
@@ -906,7 +744,7 @@ double cv::determinant( InputArray _mat )
            Mat a(rows, rows, CV_64F, (uchar*)buffer);
            mat.copyTo(a);
-            result = LU(a.ptr<double>(), a.step, rows, 0, 0, 0);
+            result = hal::LU(a.ptr<double>(), a.step, rows, 0, 0, 0);
            if( result )
            {
                for( int i = 0; i < rows; i++ )
@@ -1169,13 +1007,13 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )
    setIdentity(dst);
    if( method == DECOMP_LU && type == CV_32F )
-        result = LU(src1.ptr<float>(), src1.step, n, dst.ptr<float>(), dst.step, n) != 0;
+        result = hal::LU(src1.ptr<float>(), src1.step, n, dst.ptr<float>(), dst.step, n) != 0;
    else if( method == DECOMP_LU && type == CV_64F )
-        result = LU(src1.ptr<double>(), src1.step, n, dst.ptr<double>(), dst.step, n) != 0;
+        result = hal::LU(src1.ptr<double>(), src1.step, n, dst.ptr<double>(), dst.step, n) != 0;
    else if( method == DECOMP_CHOLESKY && type == CV_32F )
-        result = Cholesky(src1.ptr<float>(), src1.step, n, dst.ptr<float>(), dst.step, n);
+        result = hal::Cholesky(src1.ptr<float>(), src1.step, n, dst.ptr<float>(), dst.step, n);
    else
-        result = Cholesky(src1.ptr<double>(), src1.step, n, dst.ptr<double>(), dst.step, n);
+        result = hal::Cholesky(src1.ptr<double>(), src1.step, n, dst.ptr<double>(), dst.step, n);
    if( !result )
        dst = Scalar(0);
@@ -1407,16 +1245,16 @@ bool cv::solve( InputArray _src, InputArray _src2arg, OutputArray _dst, int meth
    if( method == DECOMP_LU )
    {
        if( type == CV_32F )
-            result = LU(a.ptr<float>(), a.step, n, dst.ptr<float>(), dst.step, nb) != 0;
+            result = hal::LU(a.ptr<float>(), a.step, n, dst.ptr<float>(), dst.step, nb) != 0;
        else
-            result = LU(a.ptr<double>(), a.step, n, dst.ptr<double>(), dst.step, nb) != 0;
+            result = hal::LU(a.ptr<double>(), a.step, n, dst.ptr<double>(), dst.step, nb) != 0;
    }
    else if( method == DECOMP_CHOLESKY )
    {
        if( type == CV_32F )
-            result = Cholesky(a.ptr<float>(), a.step, n, dst.ptr<float>(), dst.step, nb);
+            result = hal::Cholesky(a.ptr<float>(), a.step, n, dst.ptr<float>(), dst.step, nb);
        else
-            result = Cholesky(a.ptr<double>(), a.step, n, dst.ptr<double>(), dst.step, nb);
+            result = hal::Cholesky(a.ptr<double>(), a.step, n, dst.ptr<double>(), dst.step, nb);
    }
    else
    {
--- a/modules/core/src/mathfuncs.cpp
+++ b/modules/core/src/mathfuncs.cpp
--- a/modules/core/src/stat.cpp
+++ b/modules/core/src/stat.cpp
@@ -2416,140 +2416,6 @@ void cv::minMaxLoc( InputArray _img, double* minVal, double* maxVal,
 namespace cv
 {
 float normL2Sqr_(const float* a, const float* b, int n)
 {
    int j = 0; float d = 0.f;
 #if CV_SSE
    if( USE_SSE2 )
    {
        float CV_DECL_ALIGNED(16) buf[4];
        __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();
        for( ; j <= n - 8; j += 8 )
        {
            __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
            __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
            d0 = _mm_add_ps(d0, _mm_mul_ps(t0, t0));
            d1 = _mm_add_ps(d1, _mm_mul_ps(t1, t1));
        }
        _mm_store_ps(buf, _mm_add_ps(d0, d1));
        d = buf[0] + buf[1] + buf[2] + buf[3];
    }
    else
 #endif
    {
        for( ; j <= n - 4; j += 4 )
        {
            float t0 = a[j] - b[j], t1 = a[j+1] - b[j+1], t2 = a[j+2] - b[j+2], t3 = a[j+3] - b[j+3];
            d += t0*t0 + t1*t1 + t2*t2 + t3*t3;
        }
    }
    for( ; j < n; j++ )
    {
        float t = a[j] - b[j];
        d += t*t;
    }
    return d;
 }
 float normL1_(const float* a, const float* b, int n)
 {
    int j = 0; float d = 0.f;
 #if CV_SSE
    if( USE_SSE2 )
    {
        float CV_DECL_ALIGNED(16) buf[4];
        static const int CV_DECL_ALIGNED(16) absbuf[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
        __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();
        __m128 absmask = _mm_load_ps((const float*)absbuf);
        for( ; j <= n - 8; j += 8 )
        {
            __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
            __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
            d0 = _mm_add_ps(d0, _mm_and_ps(t0, absmask));
            d1 = _mm_add_ps(d1, _mm_and_ps(t1, absmask));
        }
        _mm_store_ps(buf, _mm_add_ps(d0, d1));
        d = buf[0] + buf[1] + buf[2] + buf[3];
    }
    else
 #elif CV_NEON
    float32x4_t v_sum = vdupq_n_f32(0.0f);
    for ( ; j <= n - 4; j += 4)
        v_sum = vaddq_f32(v_sum, vabdq_f32(vld1q_f32(a + j), vld1q_f32(b + j)));
    float CV_DECL_ALIGNED(16) buf[4];
    vst1q_f32(buf, v_sum);
    d = buf[0] + buf[1] + buf[2] + buf[3];
 #endif
    {
        for( ; j <= n - 4; j += 4 )
        {
            d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
                    std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
        }
    }
    for( ; j < n; j++ )
        d += std::abs(a[j] - b[j]);
    return d;
 }
 int normL1_(const uchar* a, const uchar* b, int n)
 {
    int j = 0, d = 0;
 #if CV_SSE
    if( USE_SSE2 )
    {
        __m128i d0 = _mm_setzero_si128();
        for( ; j <= n - 16; j += 16 )
        {
            __m128i t0 = _mm_loadu_si128((const __m128i*)(a + j));
            __m128i t1 = _mm_loadu_si128((const __m128i*)(b + j));
            d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
        }
        for( ; j <= n - 4; j += 4 )
        {
            __m128i t0 = _mm_cvtsi32_si128(*(const int*)(a + j));
            __m128i t1 = _mm_cvtsi32_si128(*(const int*)(b + j));
            d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
        }
        d = _mm_cvtsi128_si32(_mm_add_epi32(d0, _mm_unpackhi_epi64(d0, d0)));
    }
    else
 #elif CV_NEON
    uint32x4_t v_sum = vdupq_n_u32(0.0f);
    for ( ; j <= n - 16; j += 16)
    {
        uint8x16_t v_dst = vabdq_u8(vld1q_u8(a + j), vld1q_u8(b + j));
        uint16x8_t v_low = vmovl_u8(vget_low_u8(v_dst)), v_high = vmovl_u8(vget_high_u8(v_dst));
        v_sum = vaddq_u32(v_sum, vaddl_u16(vget_low_u16(v_low), vget_low_u16(v_high)));
        v_sum = vaddq_u32(v_sum, vaddl_u16(vget_high_u16(v_low), vget_high_u16(v_high)));
    }
    uint CV_DECL_ALIGNED(16) buf[4];
    vst1q_u32(buf, v_sum);
    d = buf[0] + buf[1] + buf[2] + buf[3];
 #endif
    {
        for( ; j <= n - 4; j += 4 )
        {
            d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
                    std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
        }
    }
    for( ; j < n; j++ )
        d += std::abs(a[j] - b[j]);
    return d;
 }
 template<typename T, typename ST> int
 normInf_(const T* src, const uchar* mask, ST* _result, int len, int cn)
 {
@@ -2564,7 +2430,7 @@ normInf_(const T* src, const uchar* mask, ST* _result, int len, int cn)
            if( mask[i] )
            {
                for( int k = 0; k < cn; k++ )
-                    result = std::max(result, ST(std::abs(src[k])));
+                    result = std::max(result, ST(cv_abs(src[k])));
            }
    }
    *_result = result;
@@ -2585,7 +2451,7 @@ normL1_(const T* src, const uchar* mask, ST* _result, int len, int cn)
            if( mask[i] )
            {
                for( int k = 0; k < cn; k++ )
-                    result += std::abs(src[k]);
+                    result += cv_abs(src[k]);
            }
    }
    *_result = result;
@@ -2684,9 +2550,7 @@ normDiffL2_(const T* src1, const T* src2, const uchar* mask, ST* _result, int le
 Hamming::ResultType Hamming::operator()( const unsigned char* a, const unsigned char* b, int size ) const
 {
-    int result = 0;
+    return cv::hal::normHamming(a, b, size);
    cv::hal::normHamming(a, b, size, result);
    return result;
 }
 #define CV_DEF_NORM_FUNC(L, suffix, type, ntype) \
@@ -3037,16 +2901,12 @@ double cv::norm( InputArray _src, int normType, InputArray _mask )
                if( normType == NORM_HAMMING )
                {
-                    int result = 0;
+                    return hal::normHamming(data, (int)len);
                    cv::hal::normHamming(data, (int)len, result);
                    return result;
                }
                if( normType == NORM_HAMMING2 )
                {
-                    int result = 0;
+                    return hal::normHamming(data, (int)len, 2);
                    hal::normHamming(data, (int)len, 2, result);
                    return result;
                }
            }
        }
@@ -3072,9 +2932,7 @@ double cv::norm( InputArray _src, int normType, InputArray _mask )
        for( size_t i = 0; i < it.nplanes; i++, ++it )
        {
-            int one = 0;
+            result += hal::normHamming(ptrs[0], total, cellSize);
            cv::hal::normHamming(ptrs[0], total, cellSize, one);
            result += one;
        }
        return result;
@@ -3558,9 +3416,7 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m
        for( size_t i = 0; i < it.nplanes; i++, ++it )
        {
-            int one = 0;
+            result += hal::normHamming(ptrs[0], ptrs[1], total, cellSize);
            hal::normHamming(ptrs[0], ptrs[1], total, cellSize, one);
            result += one;
        }
        return result;
@@ -3698,7 +3554,7 @@ static void batchDistHamming(const uchar* src1, const uchar* src2, size_t step2,
    if( !mask )
    {
        for( int i = 0; i < nvecs; i++ )
-             hal::normHamming(src1, src2 + step2*i, len, dist[i]);
+             dist[i] = hal::normHamming(src1, src2 + step2*i, len);
    }
    else
    {
@@ -3706,7 +3562,7 @@ static void batchDistHamming(const uchar* src1, const uchar* src2, size_t step2,
        for( int i = 0; i < nvecs; i++ )
        {
            if (mask[i])
-                hal::normHamming(src1, src2 + step2*i, len, dist[i]);
+                dist[i] = hal::normHamming(src1, src2 + step2*i, len);
            else
                dist[i] = val0;
        }
@@ -3720,7 +3576,7 @@ static void batchDistHamming2(const uchar* src1, const uchar* src2, size_t step2
    if( !mask )
    {
        for( int i = 0; i < nvecs; i++ )
-            hal::normHamming(src1, src2 + step2*i, len, 2, dist[i]);
+            dist[i] = hal::normHamming(src1, src2 + step2*i, len, 2);
    }
    else
    {
@@ -3728,7 +3584,7 @@ static void batchDistHamming2(const uchar* src1, const uchar* src2, size_t step2
        for( int i = 0; i < nvecs; i++ )
        {
            if (mask[i])
-                hal::normHamming(src1, src2 + step2*i, len, 2, dist[i]);
+                dist[i] = hal::normHamming(src1, src2 + step2*i, len, 2);
            else
                dist[i] = val0;
        }
--- a/modules/features2d/src/kaze/AKAZEFeatures.cpp
+++ b/modules/features2d/src/kaze/AKAZEFeatures.cpp
@@ -812,7 +812,7 @@ void AKAZEFeatures::Compute_Main_Orientation(KeyPoint& kpt, const std::vector<TE
      }
    }
  }
-  fastAtan2(resY, resX, Ang, ang_size, false);
+  hal::fastAtan2(resY, resX, Ang, ang_size, false);
  // Loop slides pi/3 window around feature point
  for (ang1 = 0; ang1 < (float)(2.0 * CV_PI); ang1 += 0.15f) {
    ang2 = (ang1 + (float)(CV_PI / 3.0) >(float)(2.0*CV_PI) ? ang1 - (float)(5.0*CV_PI / 3.0) : ang1 + (float)(CV_PI / 3.0));
--- a/modules/hal/include/opencv2/hal.hpp
+++ b/modules/hal/include/opencv2/hal.hpp
@@ -81,28 +81,17 @@ float normL1_(const float* a, const float* b, int n);
 float normL2Sqr_(const float* a, const float* b, int n);
 void exp(const float* src, float* dst, int n);
 void exp(const double* src, double* dst, int n);
 void log(const float* src, float* dst, int n);
 void log(const double* src, double* dst, int n);
 void fastAtan2(const float* y, const float* x, float* dst, int n, bool angleInDegrees);
 void magnitude(const float* x, const float* y, float* dst, int n);
-
+void magnitude(const double* x, const double* y, double* dst, int n);
-/** @brief Computes the cube root of an argument.
+void sqrt(const float* src, float* dst, int len);
-
+void sqrt(const double* src, double* dst, int len);
- The function cubeRoot computes \f$\sqrt[3]{\texttt{val}}\f$. Negative arguments are handled correctly.
+void invSqrt(const float* src, float* dst, int len);
- NaN and Inf are not handled. The accuracy approaches the maximum possible accuracy for
+void invSqrt(const double* src, double* dst, int len);
 single-precision data.
 @param val A function argument.
 */
 float cubeRoot(float val);
 /** @brief Calculates the angle of a 2D vector in degrees.
 The function fastAtan2 calculates the full-range angle of an input 2D vector. The angle is measured
 in degrees and varies from 0 to 360 degrees. The accuracy is about 0.3 degrees.
 @param x x-coordinate of the vector.
 @param y y-coordinate of the vector.
 */
 float fastAtan2(float y, float x);
 }} //cv::hal
--- a/modules/hal/include/opencv2/hal/defs.h
+++ b/modules/hal/include/opencv2/hal/defs.h
@@ -380,7 +380,7 @@ cvRound( double value )
    TEGRA_ROUND_DBL(value);
 #elif defined CV_ICC || defined __GNUC__
 # if CV_VFP
-    ARM_ROUND_DBL(value)
+    ARM_ROUND_DBL(value);
 # else
    return (int)lrint(value);
 # endif
@@ -488,7 +488,7 @@ CV_INLINE int cvRound(float value)
    TEGRA_ROUND_FLT(value);
 #elif defined CV_ICC || defined __GNUC__
 # if CV_VFP
-    ARM_ROUND_FLT(value)
+    ARM_ROUND_FLT(value);
 # else
    return (int)lrintf(value);
 # endif
--- a/modules/hal/include/opencv2/hal/intrin.hpp
+++ b/modules/hal/include/opencv2/hal/intrin.hpp
--- a/modules/hal/src/mathfuncs.cpp
+++ b/modules/hal/src/mathfuncs.cpp
--- a/modules/hal/src/matrix.cpp
+++ b/modules/hal/src/matrix.cpp
@@ -44,4 +44,165 @@
 namespace cv { namespace hal {
 /****************************************************************************************\
 *                     LU & Cholesky implementation for small matrices                    *
 \****************************************************************************************/
 template<typename _Tp> static inline int
 LUImpl(_Tp* A, size_t astep, int m, _Tp* b, size_t bstep, int n)
 {
    int i, j, k, p = 1;
    astep /= sizeof(A[0]);
    bstep /= sizeof(b[0]);
    for( i = 0; i < m; i++ )
    {
        k = i;
        for( j = i+1; j < m; j++ )
            if( std::abs(A[j*astep + i]) > std::abs(A[k*astep + i]) )
                k = j;
        if( std::abs(A[k*astep + i]) < std::numeric_limits<_Tp>::epsilon() )
            return 0;
        if( k != i )
        {
            for( j = i; j < m; j++ )
                std::swap(A[i*astep + j], A[k*astep + j]);
            if( b )
                for( j = 0; j < n; j++ )
                    std::swap(b[i*bstep + j], b[k*bstep + j]);
            p = -p;
        }
        _Tp d = -1/A[i*astep + i];
        for( j = i+1; j < m; j++ )
        {
            _Tp alpha = A[j*astep + i]*d;
            for( k = i+1; k < m; k++ )
                A[j*astep + k] += alpha*A[i*astep + k];
            if( b )
                for( k = 0; k < n; k++ )
                    b[j*bstep + k] += alpha*b[i*bstep + k];
        }
        A[i*astep + i] = -d;
    }
    if( b )
    {
        for( i = m-1; i >= 0; i-- )
            for( j = 0; j < n; j++ )
            {
                _Tp s = b[i*bstep + j];
                for( k = i+1; k < m; k++ )
                    s -= A[i*astep + k]*b[k*bstep + j];
                b[i*bstep + j] = s*A[i*astep + i];
            }
    }
    return p;
 }
 int LU(float* A, size_t astep, int m, float* b, size_t bstep, int n)
 {
    return LUImpl(A, astep, m, b, bstep, n);
 }
 int LU(double* A, size_t astep, int m, double* b, size_t bstep, int n)
 {
    return LUImpl(A, astep, m, b, bstep, n);
 }
 template<typename _Tp> static inline bool
 CholImpl(_Tp* A, size_t astep, int m, _Tp* b, size_t bstep, int n)
 {
    _Tp* L = A;
    int i, j, k;
    double s;
    astep /= sizeof(A[0]);
    bstep /= sizeof(b[0]);
    for( i = 0; i < m; i++ )
    {
        for( j = 0; j < i; j++ )
        {
            s = A[i*astep + j];
            for( k = 0; k < j; k++ )
                s -= L[i*astep + k]*L[j*astep + k];
            L[i*astep + j] = (_Tp)(s*L[j*astep + j]);
        }
        s = A[i*astep + i];
        for( k = 0; k < j; k++ )
        {
            double t = L[i*astep + k];
            s -= t*t;
        }
        if( s < std::numeric_limits<_Tp>::epsilon() )
            return false;
        L[i*astep + i] = (_Tp)(1./std::sqrt(s));
    }
    if( !b )
        return true;
    // LLt x = b
    // 1: L y = b
    // 2. Lt x = y
    /*
     [ L00             ]  y0   b0
     [ L10 L11         ]  y1 = b1
     [ L20 L21 L22     ]  y2   b2
     [ L30 L31 L32 L33 ]  y3   b3
     [ L00 L10 L20 L30 ]  x0   y0
     [     L11 L21 L31 ]  x1 = y1
     [         L22 L32 ]  x2   y2
     [             L33 ]  x3   y3
     */
    for( i = 0; i < m; i++ )
    {
        for( j = 0; j < n; j++ )
        {
            s = b[i*bstep + j];
            for( k = 0; k < i; k++ )
                s -= L[i*astep + k]*b[k*bstep + j];
            b[i*bstep + j] = (_Tp)(s*L[i*astep + i]);
        }
    }
    for( i = m-1; i >= 0; i-- )
    {
        for( j = 0; j < n; j++ )
        {
            s = b[i*bstep + j];
            for( k = m-1; k > i; k-- )
                s -= L[k*astep + i]*b[k*bstep + j];
            b[i*bstep + j] = (_Tp)(s*L[i*astep + i]);
        }
    }
    return true;
 }
 bool Cholesky(float* A, size_t astep, int m, float* b, size_t bstep, int n)
 {
    return CholImpl(A, astep, m, b, bstep, n);
 }
 bool Cholesky(double* A, size_t astep, int m, double* b, size_t bstep, int n)
 {
    return CholImpl(A, astep, m, b, bstep, n);
 }
 }}
--- a/modules/hal/src/precomp.hpp
+++ b/modules/hal/src/precomp.hpp
@@ -42,3 +42,7 @@
 #include "opencv2/hal.hpp"
 #include "opencv2/hal/intrin.hpp"
 #include <algorithm>
 #include <cmath>
 #include <cstdlib>
 #include <float.h>
--- a/modules/hal/src/stat.cpp
+++ b/modules/hal/src/stat.cpp
@@ -80,10 +80,10 @@ static const uchar popCountTable4[] =
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
 };
-Error::Code normHamming(const uchar* a, int n, int & result)
+int normHamming(const uchar* a, int n)
 {
    int i = 0;
-    result = 0;
+    int result = 0;
 #if CV_NEON
    {
        uint32x4_t bits = vmovq_n_u32(0);
@@ -104,13 +104,13 @@ Error::Code normHamming(const uchar* a, int n, int & result)
            popCountTable[a[i+2]] + popCountTable[a[i+3]];
    for( ; i < n; i++ )
        result += popCountTable[a[i]];
-    return Error::Ok;
+    return result;
 }
-Error::Code normHamming(const uchar* a, const uchar* b, int n, int & result)
+int normHamming(const uchar* a, const uchar* b, int n)
 {
    int i = 0;
-    result = 0;
+    int result = 0;
 #if CV_NEON
    {
        uint32x4_t bits = vmovq_n_u32(0);
@@ -133,44 +133,44 @@ Error::Code normHamming(const uchar* a, const uchar* b, int n, int & result)
                    popCountTable[a[i+2] ^ b[i+2]] + popCountTable[a[i+3] ^ b[i+3]];
    for( ; i < n; i++ )
        result += popCountTable[a[i] ^ b[i]];
-    return Error::Ok;
+    return result;
 }
-Error::Code normHamming(const uchar* a, int n, int cellSize, int & result)
+int normHamming(const uchar* a, int n, int cellSize)
 {
    if( cellSize == 1 )
-        return normHamming(a, n, result);
+        return normHamming(a, n);
    const uchar* tab = 0;
    if( cellSize == 2 )
        tab = popCountTable2;
    else if( cellSize == 4 )
        tab = popCountTable4;
    else
-        return Error::Unknown;
+        return -1;
    int i = 0;
-    result = 0;
+    int result = 0;
 #if CV_ENABLE_UNROLLED
    for( ; i <= n - 4; i += 4 )
        result += tab[a[i]] + tab[a[i+1]] + tab[a[i+2]] + tab[a[i+3]];
 #endif
    for( ; i < n; i++ )
        result += tab[a[i]];
-    return Error::Ok;
+    return result;
 }
-Error::Code normHamming(const uchar* a, const uchar* b, int n, int cellSize, int & result)
+int normHamming(const uchar* a, const uchar* b, int n, int cellSize)
 {
    if( cellSize == 1 )
-        return normHamming(a, b, n, result);
+        return normHamming(a, b, n);
    const uchar* tab = 0;
    if( cellSize == 2 )
        tab = popCountTable2;
    else if( cellSize == 4 )
        tab = popCountTable4;
    else
-        return Error::Unknown;
+        return -1;
    int i = 0;
-    result = 0;
+    int result = 0;
    #if CV_ENABLE_UNROLLED
    for( ; i <= n - 4; i += 4 )
        result += tab[a[i] ^ b[i]] + tab[a[i+1] ^ b[i+1]] +
@@ -178,7 +178,129 @@ Error::Code normHamming(const uchar* a, const uchar* b, int n, int cellSize, int
    #endif
    for( ; i < n; i++ )
        result += tab[a[i] ^ b[i]];
-    return Error::Ok;
+    return result;
 }
 float normL2Sqr_(const float* a, const float* b, int n)
 {
    int j = 0; float d = 0.f;
 #if CV_SSE
    float CV_DECL_ALIGNED(16) buf[4];
    __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();
    for( ; j <= n - 8; j += 8 )
    {
        __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
        __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
        d0 = _mm_add_ps(d0, _mm_mul_ps(t0, t0));
        d1 = _mm_add_ps(d1, _mm_mul_ps(t1, t1));
    }
    _mm_store_ps(buf, _mm_add_ps(d0, d1));
    d = buf[0] + buf[1] + buf[2] + buf[3];
 #endif
    {
        for( ; j <= n - 4; j += 4 )
        {
            float t0 = a[j] - b[j], t1 = a[j+1] - b[j+1], t2 = a[j+2] - b[j+2], t3 = a[j+3] - b[j+3];
            d += t0*t0 + t1*t1 + t2*t2 + t3*t3;
        }
    }
    for( ; j < n; j++ )
    {
        float t = a[j] - b[j];
        d += t*t;
    }
    return d;
 }
 float normL1_(const float* a, const float* b, int n)
 {
    int j = 0; float d = 0.f;
 #if CV_SSE
    float CV_DECL_ALIGNED(16) buf[4];
    static const int CV_DECL_ALIGNED(16) absbuf[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
    __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();
    __m128 absmask = _mm_load_ps((const float*)absbuf);
    for( ; j <= n - 8; j += 8 )
    {
        __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
        __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
        d0 = _mm_add_ps(d0, _mm_and_ps(t0, absmask));
        d1 = _mm_add_ps(d1, _mm_and_ps(t1, absmask));
    }
    _mm_store_ps(buf, _mm_add_ps(d0, d1));
    d = buf[0] + buf[1] + buf[2] + buf[3];
 #elif CV_NEON
    float32x4_t v_sum = vdupq_n_f32(0.0f);
    for ( ; j <= n - 4; j += 4)
        v_sum = vaddq_f32(v_sum, vabdq_f32(vld1q_f32(a + j), vld1q_f32(b + j)));
    float CV_DECL_ALIGNED(16) buf[4];
    vst1q_f32(buf, v_sum);
    d = buf[0] + buf[1] + buf[2] + buf[3];
 #endif
    {
        for( ; j <= n - 4; j += 4 )
        {
            d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
            std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
        }
    }
    for( ; j < n; j++ )
        d += std::abs(a[j] - b[j]);
    return d;
 }
 int normL1_(const uchar* a, const uchar* b, int n)
 {
    int j = 0, d = 0;
 #if CV_SSE
    __m128i d0 = _mm_setzero_si128();
    for( ; j <= n - 16; j += 16 )
    {
        __m128i t0 = _mm_loadu_si128((const __m128i*)(a + j));
        __m128i t1 = _mm_loadu_si128((const __m128i*)(b + j));
        d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
    }
    for( ; j <= n - 4; j += 4 )
    {
        __m128i t0 = _mm_cvtsi32_si128(*(const int*)(a + j));
        __m128i t1 = _mm_cvtsi32_si128(*(const int*)(b + j));
        d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
    }
    d = _mm_cvtsi128_si32(_mm_add_epi32(d0, _mm_unpackhi_epi64(d0, d0)));
 #elif CV_NEON
    uint32x4_t v_sum = vdupq_n_u32(0.0f);
    for ( ; j <= n - 16; j += 16)
    {
        uint8x16_t v_dst = vabdq_u8(vld1q_u8(a + j), vld1q_u8(b + j));
        uint16x8_t v_low = vmovl_u8(vget_low_u8(v_dst)), v_high = vmovl_u8(vget_high_u8(v_dst));
        v_sum = vaddq_u32(v_sum, vaddl_u16(vget_low_u16(v_low), vget_low_u16(v_high)));
        v_sum = vaddq_u32(v_sum, vaddl_u16(vget_high_u16(v_low), vget_high_u16(v_high)));
    }
    uint CV_DECL_ALIGNED(16) buf[4];
    vst1q_u32(buf, v_sum);
    d = buf[0] + buf[1] + buf[2] + buf[3];
 #endif
    {
        for( ; j <= n - 4; j += 4 )
        {
            d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
            std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
        }
    }
    for( ; j < n; j++ )
        d += std::abs(a[j] - b[j]);
    return d;
 }
 }} //cv::hal
--- a/modules/photo/src/arrays.hpp
+++ b/modules/photo/src/arrays.hpp
@@ -44,6 +44,9 @@
 #ifndef __OPENCV_DENOISING_ARRAYS_HPP__
 #define __OPENCV_DENOISING_ARRAYS_HPP__
 namespace cv
 {
 template <class T>
 struct Array2d
 {
@@ -176,4 +179,6 @@ struct Array4d
    }
 };
 }
 #endif
--- a/modules/stitching/src/autocalib.cpp
+++ b/modules/stitching/src/autocalib.cpp
@@ -49,7 +49,7 @@ namespace {
 template<typename _Tp> static inline bool
 decomposeCholesky(_Tp* A, size_t astep, int m)
 {
-    if (!Cholesky(A, astep, m, 0, 0, 0))
+    if (!hal::Cholesky(A, astep, m, 0, 0, 0))
        return false;
    astep /= sizeof(A[0]);
    for (int i = 0; i < m; ++i)