fully implemented SSE and NEON cases of intrin.hpp; extended the HAL with some basic math functions

2015-04-16 23:00:26 +03:00
parent a2bba1b9e6
commit ee11a2d266
18 changed files with 2460 additions and 2003 deletions
--- a/modules/core/src/lapack.cpp
+++ b/modules/core/src/lapack.cpp
@@ -50,168 +50,6 @@
 namespace cv
 {

-/****************************************************************************************\
-*                     LU & Cholesky implementation for small matrices                    *
-\****************************************************************************************/
-
-template<typename _Tp> static inline int
-LUImpl(_Tp* A, size_t astep, int m, _Tp* b, size_t bstep, int n)
-{
-    int i, j, k, p = 1;
-    astep /= sizeof(A[0]);
-    bstep /= sizeof(b[0]);
-
-    for( i = 0; i < m; i++ )
-    {
-        k = i;
-
-        for( j = i+1; j < m; j++ )
-            if( std::abs(A[j*astep + i]) > std::abs(A[k*astep + i]) )
-                k = j;
-
-        if( std::abs(A[k*astep + i]) < std::numeric_limits<_Tp>::epsilon() )
-            return 0;
-
-        if( k != i )
-        {
-            for( j = i; j < m; j++ )
-                std::swap(A[i*astep + j], A[k*astep + j]);
-            if( b )
-                for( j = 0; j < n; j++ )
-                    std::swap(b[i*bstep + j], b[k*bstep + j]);
-            p = -p;
-        }
-
-        _Tp d = -1/A[i*astep + i];
-
-        for( j = i+1; j < m; j++ )
-        {
-            _Tp alpha = A[j*astep + i]*d;
-
-            for( k = i+1; k < m; k++ )
-                A[j*astep + k] += alpha*A[i*astep + k];
-
-            if( b )
-                for( k = 0; k < n; k++ )
-                    b[j*bstep + k] += alpha*b[i*bstep + k];
-        }
-
-        A[i*astep + i] = -d;
-    }
-
-    if( b )
-    {
-        for( i = m-1; i >= 0; i-- )
-            for( j = 0; j < n; j++ )
-            {
-                _Tp s = b[i*bstep + j];
-                for( k = i+1; k < m; k++ )
-                    s -= A[i*astep + k]*b[k*bstep + j];
-                b[i*bstep + j] = s*A[i*astep + i];
-            }
-    }
-
-    return p;
-}
-
-
-int LU(float* A, size_t astep, int m, float* b, size_t bstep, int n)
-{
-    return LUImpl(A, astep, m, b, bstep, n);
-}
-
-
-int LU(double* A, size_t astep, int m, double* b, size_t bstep, int n)
-{
-    return LUImpl(A, astep, m, b, bstep, n);
-}
-
-
-template<typename _Tp> static inline bool
-CholImpl(_Tp* A, size_t astep, int m, _Tp* b, size_t bstep, int n)
-{
-    _Tp* L = A;
-    int i, j, k;
-    double s;
-    astep /= sizeof(A[0]);
-    bstep /= sizeof(b[0]);
-
-    for( i = 0; i < m; i++ )
-    {
-        for( j = 0; j < i; j++ )
-        {
-            s = A[i*astep + j];
-            for( k = 0; k < j; k++ )
-                s -= L[i*astep + k]*L[j*astep + k];
-            L[i*astep + j] = (_Tp)(s*L[j*astep + j]);
-        }
-        s = A[i*astep + i];
-        for( k = 0; k < j; k++ )
-        {
-            double t = L[i*astep + k];
-            s -= t*t;
-        }
-        if( s < std::numeric_limits<_Tp>::epsilon() )
-            return false;
-        L[i*astep + i] = (_Tp)(1./std::sqrt(s));
-    }
-
-    if( !b )
-        return true;
-
-    // LLt x = b
-    // 1: L y = b
-    // 2. Lt x = y
-
-    /*
-     [ L00             ]  y0   b0
-     [ L10 L11         ]  y1 = b1
-     [ L20 L21 L22     ]  y2   b2
-     [ L30 L31 L32 L33 ]  y3   b3
-
-     [ L00 L10 L20 L30 ]  x0   y0
-     [     L11 L21 L31 ]  x1 = y1
-     [         L22 L32 ]  x2   y2
-     [             L33 ]  x3   y3
-    */
-
-    for( i = 0; i < m; i++ )
-    {
-        for( j = 0; j < n; j++ )
-        {
-            s = b[i*bstep + j];
-            for( k = 0; k < i; k++ )
-                s -= L[i*astep + k]*b[k*bstep + j];
-            b[i*bstep + j] = (_Tp)(s*L[i*astep + i]);
-        }
-    }
-
-    for( i = m-1; i >= 0; i-- )
-    {
-        for( j = 0; j < n; j++ )
-        {
-            s = b[i*bstep + j];
-            for( k = m-1; k > i; k-- )
-                s -= L[k*astep + i]*b[k*bstep + j];
-            b[i*bstep + j] = (_Tp)(s*L[i*astep + i]);
-        }
-    }
-
-    return true;
-}
-
-
-bool Cholesky(float* A, size_t astep, int m, float* b, size_t bstep, int n)
-{
-    return CholImpl(A, astep, m, b, bstep, n);
-}
-
-bool Cholesky(double* A, size_t astep, int m, double* b, size_t bstep, int n)
-{
-    return CholImpl(A, astep, m, b, bstep, n);
-}
-
-
 template<typename _Tp> static inline _Tp hypot(_Tp a, _Tp b)
 {
    a = std::abs(a);
@@ -882,7 +720,7 @@ double cv::determinant( InputArray _mat )
            Mat a(rows, rows, CV_32F, (uchar*)buffer);
            mat.copyTo(a);

-            result = LU(a.ptr<float>(), a.step, rows, 0, 0, 0);
+            result = hal::LU(a.ptr<float>(), a.step, rows, 0, 0, 0);
            if( result )
            {
                for( int i = 0; i < rows; i++ )
@@ -906,7 +744,7 @@ double cv::determinant( InputArray _mat )
            Mat a(rows, rows, CV_64F, (uchar*)buffer);
            mat.copyTo(a);

-            result = LU(a.ptr<double>(), a.step, rows, 0, 0, 0);
+            result = hal::LU(a.ptr<double>(), a.step, rows, 0, 0, 0);
            if( result )
            {
                for( int i = 0; i < rows; i++ )
@@ -1169,13 +1007,13 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )
    setIdentity(dst);

    if( method == DECOMP_LU && type == CV_32F )
-        result = LU(src1.ptr<float>(), src1.step, n, dst.ptr<float>(), dst.step, n) != 0;
+        result = hal::LU(src1.ptr<float>(), src1.step, n, dst.ptr<float>(), dst.step, n) != 0;
    else if( method == DECOMP_LU && type == CV_64F )
-        result = LU(src1.ptr<double>(), src1.step, n, dst.ptr<double>(), dst.step, n) != 0;
+        result = hal::LU(src1.ptr<double>(), src1.step, n, dst.ptr<double>(), dst.step, n) != 0;
    else if( method == DECOMP_CHOLESKY && type == CV_32F )
-        result = Cholesky(src1.ptr<float>(), src1.step, n, dst.ptr<float>(), dst.step, n);
+        result = hal::Cholesky(src1.ptr<float>(), src1.step, n, dst.ptr<float>(), dst.step, n);
    else
-        result = Cholesky(src1.ptr<double>(), src1.step, n, dst.ptr<double>(), dst.step, n);
+        result = hal::Cholesky(src1.ptr<double>(), src1.step, n, dst.ptr<double>(), dst.step, n);

    if( !result )
        dst = Scalar(0);
@@ -1407,16 +1245,16 @@ bool cv::solve( InputArray _src, InputArray _src2arg, OutputArray _dst, int meth
    if( method == DECOMP_LU )
    {
        if( type == CV_32F )
-            result = LU(a.ptr<float>(), a.step, n, dst.ptr<float>(), dst.step, nb) != 0;
+            result = hal::LU(a.ptr<float>(), a.step, n, dst.ptr<float>(), dst.step, nb) != 0;
        else
-            result = LU(a.ptr<double>(), a.step, n, dst.ptr<double>(), dst.step, nb) != 0;
+            result = hal::LU(a.ptr<double>(), a.step, n, dst.ptr<double>(), dst.step, nb) != 0;
    }
    else if( method == DECOMP_CHOLESKY )
    {
        if( type == CV_32F )
-            result = Cholesky(a.ptr<float>(), a.step, n, dst.ptr<float>(), dst.step, nb);
+            result = hal::Cholesky(a.ptr<float>(), a.step, n, dst.ptr<float>(), dst.step, nb);
        else
-            result = Cholesky(a.ptr<double>(), a.step, n, dst.ptr<double>(), dst.step, nb);
+            result = hal::Cholesky(a.ptr<double>(), a.step, n, dst.ptr<double>(), dst.step, nb);
    }
    else
    {