fully implemented SSE and NEON cases of intrin.hpp; extended the HAL with some basic math functions

2015-04-16 23:00:26 +03:00
parent a2bba1b9e6
commit ee11a2d266
18 changed files with 2460 additions and 2003 deletions
--- a/modules/hal/include/opencv2/hal.hpp
+++ b/modules/hal/include/opencv2/hal.hpp
@@ -81,28 +81,17 @@ float normL1_(const float* a, const float* b, int n);
 float normL2Sqr_(const float* a, const float* b, int n);

 void exp(const float* src, float* dst, int n);
+void exp(const double* src, double* dst, int n);
 void log(const float* src, float* dst, int n);
+void log(const double* src, double* dst, int n);

 void fastAtan2(const float* y, const float* x, float* dst, int n, bool angleInDegrees);
 void magnitude(const float* x, const float* y, float* dst, int n);
-
-/** @brief Computes the cube root of an argument.
-
- The function cubeRoot computes \f$\sqrt[3]{\texttt{val}}\f$. Negative arguments are handled correctly.
- NaN and Inf are not handled. The accuracy approaches the maximum possible accuracy for
- single-precision data.
- @param val A function argument.
- */
-float cubeRoot(float val);
-
-/** @brief Calculates the angle of a 2D vector in degrees.
-
- The function fastAtan2 calculates the full-range angle of an input 2D vector. The angle is measured
- in degrees and varies from 0 to 360 degrees. The accuracy is about 0.3 degrees.
- @param x x-coordinate of the vector.
- @param y y-coordinate of the vector.
- */
-float fastAtan2(float y, float x);
+void magnitude(const double* x, const double* y, double* dst, int n);
+void sqrt(const float* src, float* dst, int len);
+void sqrt(const double* src, double* dst, int len);
+void invSqrt(const float* src, float* dst, int len);
+void invSqrt(const double* src, double* dst, int len);

 }} //cv::hal

--- a/modules/hal/include/opencv2/hal/defs.h
+++ b/modules/hal/include/opencv2/hal/defs.h
@@ -380,7 +380,7 @@ cvRound( double value )
    TEGRA_ROUND_DBL(value);
 #elif defined CV_ICC || defined __GNUC__
 # if CV_VFP
-    ARM_ROUND_DBL(value)
+    ARM_ROUND_DBL(value);
 # else
    return (int)lrint(value);
 # endif
@@ -488,7 +488,7 @@ CV_INLINE int cvRound(float value)
    TEGRA_ROUND_FLT(value);
 #elif defined CV_ICC || defined __GNUC__
 # if CV_VFP
-    ARM_ROUND_FLT(value)
+    ARM_ROUND_FLT(value);
 # else
    return (int)lrintf(value);
 # endif
--- a/modules/hal/include/opencv2/hal/intrin.hpp
+++ b/modules/hal/include/opencv2/hal/intrin.hpp
--- a/modules/hal/src/mathfuncs.cpp
+++ b/modules/hal/src/mathfuncs.cpp
--- a/modules/hal/src/matrix.cpp
+++ b/modules/hal/src/matrix.cpp
@@ -44,4 +44,165 @@

 namespace cv { namespace hal {

+/****************************************************************************************\
+*                     LU & Cholesky implementation for small matrices                    *
+\****************************************************************************************/
+
+template<typename _Tp> static inline int // In-place Gaussian elimination with row pivoting on A; when b is non-null it is overwritten with the solution of A*x = b.
+LUImpl(_Tp* A, size_t astep, int m, _Tp* b, size_t bstep, int n) // A is indexed as m x m, b as m x n; returns 0 on a too-small pivot, otherwise +1 or -1.
+{
+    int i, j, k, p = 1; // p starts at +1 and flips sign on every row swap
+    astep /= sizeof(A[0]); // strides arrive in bytes; convert them to element counts
+    bstep /= sizeof(b[0]); // sizeof does not evaluate b[0], so a null b is fine here
+
+    for( i = 0; i < m; i++ )
+    {
+        k = i;
+
+        for( j = i+1; j < m; j++ ) // pick the row at or below i with the largest |value| in column i
+            if( std::abs(A[j*astep + i]) > std::abs(A[k*astep + i]) )
+                k = j;
+
+        if( std::abs(A[k*astep + i]) < std::numeric_limits<_Tp>::epsilon() )
+            return 0; // pivot below epsilon: give up (A and b keep the partial modifications made so far)
+
+        if( k != i )
+        {
+            for( j = i; j < m; j++ ) // swap rows i and k starting at column i; earlier columns are not touched
+                std::swap(A[i*astep + j], A[k*astep + j]);
+            if( b )
+                for( j = 0; j < n; j++ )
+                    std::swap(b[i*bstep + j], b[k*bstep + j]);
+            p = -p;
+        }
+
+        _Tp d = -1/A[i*astep + i]; // negated reciprocal of the pivot
+
+        for( j = i+1; j < m; j++ )
+        {
+            _Tp alpha = A[j*astep + i]*d; // multiplier that cancels column i in row j
+
+            for( k = i+1; k < m; k++ )
+                A[j*astep + k] += alpha*A[i*astep + k];
+
+            if( b )
+                for( k = 0; k < n; k++ )
+                    b[j*bstep + k] += alpha*b[i*bstep + k];
+        }
+
+        A[i*astep + i] = -d; // diagonal now stores 1/pivot, which the back substitution below multiplies by
+    }
+
+    if( b )
+    {
+        for( i = m-1; i >= 0; i-- ) // back substitution from the last row upwards; results overwrite b
+            for( j = 0; j < n; j++ )
+            {
+                _Tp s = b[i*bstep + j];
+                for( k = i+1; k < m; k++ )
+                    s -= A[i*astep + k]*b[k*bstep + j];
+                b[i*bstep + j] = s*A[i*astep + i]; // multiply by the stored reciprocal instead of dividing
+            }
+    }
+
+    return p;
+}
+
+
+int LU(float* A, size_t astep, int m, float* b, size_t bstep, int n) // single-precision entry point
+{
+    return LUImpl(A, astep, m, b, bstep, n); // forwards every argument unchanged to the templated implementation
+}
+
+
+int LU(double* A, size_t astep, int m, double* b, size_t bstep, int n) // double-precision entry point
+{
+    return LUImpl(A, astep, m, b, bstep, n); // forwards every argument unchanged to the templated implementation
+}
+
+
+template<typename _Tp> static inline bool // In-place Cholesky factorisation of A (lower triangle); when b is non-null it is overwritten with the solution of L*Lt*x = b.
+CholImpl(_Tp* A, size_t astep, int m, _Tp* b, size_t bstep, int n) // A is indexed as m x m, b as m x n; returns false when a diagonal term falls below epsilon.
+{
+    _Tp* L = A; // the factor is written over A's own storage
+    int i, j, k;
+    double s; // accumulator is double regardless of _Tp
+    astep /= sizeof(A[0]); // strides arrive in bytes; convert them to element counts
+    bstep /= sizeof(b[0]); // sizeof does not evaluate b[0], so a null b is fine here
+
+    for( i = 0; i < m; i++ )
+    {
+        for( j = 0; j < i; j++ ) // off-diagonal entries of row i
+        {
+            s = A[i*astep + j];
+            for( k = 0; k < j; k++ )
+                s -= L[i*astep + k]*L[j*astep + k];
+            L[i*astep + j] = (_Tp)(s*L[j*astep + j]); // L[j][j] already holds a reciprocal, hence the multiply
+        }
+        s = A[i*astep + i];
+        for( k = 0; k < j; k++ ) // j == i here, left over from the loop above
+        {
+            double t = L[i*astep + k];
+            s -= t*t;
+        }
+        if( s < std::numeric_limits<_Tp>::epsilon() )
+            return false; // remaining diagonal term too small (or negative): abort with A partially overwritten
+        L[i*astep + i] = (_Tp)(1./std::sqrt(s)); // store the reciprocal of the diagonal factor
+    }
+
+    if( !b )
+        return true; // factorisation only, nothing to solve
+
+    // LLt x = b
+    // 1: L y = b
+    // 2. Lt x = y
+
+    /*
+     [ L00             ]  y0   b0
+     [ L10 L11         ]  y1 = b1
+     [ L20 L21 L22     ]  y2   b2
+     [ L30 L31 L32 L33 ]  y3   b3
+
+     [ L00 L10 L20 L30 ]  x0   y0
+     [     L11 L21 L31 ]  x1 = y1
+     [         L22 L32 ]  x2   y2
+     [             L33 ]  x3   y3
+     */
+
+    for( i = 0; i < m; i++ ) // step 1: forward substitution, y overwrites b
+    {
+        for( j = 0; j < n; j++ )
+        {
+            s = b[i*bstep + j];
+            for( k = 0; k < i; k++ )
+                s -= L[i*astep + k]*b[k*bstep + j];
+            b[i*bstep + j] = (_Tp)(s*L[i*astep + i]); // multiply by the stored reciprocal diagonal
+        }
+    }
+
+    for( i = m-1; i >= 0; i-- ) // step 2: back substitution with the transposed factor, x overwrites b
+    {
+        for( j = 0; j < n; j++ )
+        {
+            s = b[i*bstep + j];
+            for( k = m-1; k > i; k-- )
+                s -= L[k*astep + i]*b[k*bstep + j]; // L[k][i] read as the (i,k) entry of Lt
+            b[i*bstep + j] = (_Tp)(s*L[i*astep + i]);
+        }
+    }
+
+    return true;
+}
+
+
+bool Cholesky(float* A, size_t astep, int m, float* b, size_t bstep, int n) // single-precision entry point
+{
+    return CholImpl(A, astep, m, b, bstep, n); // forwards every argument unchanged to the templated implementation
+}
+
+bool Cholesky(double* A, size_t astep, int m, double* b, size_t bstep, int n) // double-precision entry point
+{
+    return CholImpl(A, astep, m, b, bstep, n); // forwards every argument unchanged to the templated implementation
+}
+
 }}
--- a/modules/hal/src/precomp.hpp
+++ b/modules/hal/src/precomp.hpp
@@ -42,3 +42,7 @@

 #include "opencv2/hal.hpp"
 #include "opencv2/hal/intrin.hpp"
+#include <algorithm>
+#include <cmath>
+#include <cstdlib>
+#include <float.h>
--- a/modules/hal/src/stat.cpp
+++ b/modules/hal/src/stat.cpp
@@ -80,10 +80,10 @@ static const uchar popCountTable4[] =
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
 };

-Error::Code normHamming(const uchar* a, int n, int & result)
+int normHamming(const uchar* a, int n)
 {
    int i = 0;
-    result = 0;
+    int result = 0;
 #if CV_NEON
    {
        uint32x4_t bits = vmovq_n_u32(0);
@@ -104,13 +104,13 @@ Error::Code normHamming(const uchar* a, int n, int & result)
            popCountTable[a[i+2]] + popCountTable[a[i+3]];
    for( ; i < n; i++ )
        result += popCountTable[a[i]];
-    return Error::Ok;
+    return result;
 }

-Error::Code normHamming(const uchar* a, const uchar* b, int n, int & result)
+int normHamming(const uchar* a, const uchar* b, int n)
 {
    int i = 0;
-    result = 0;
+    int result = 0;
 #if CV_NEON
    {
        uint32x4_t bits = vmovq_n_u32(0);
@@ -133,44 +133,44 @@ Error::Code normHamming(const uchar* a, const uchar* b, int n, int & result)
                    popCountTable[a[i+2] ^ b[i+2]] + popCountTable[a[i+3] ^ b[i+3]];
    for( ; i < n; i++ )
        result += popCountTable[a[i] ^ b[i]];
-    return Error::Ok;
+    return result;
 }

-Error::Code normHamming(const uchar* a, int n, int cellSize, int & result)
+int normHamming(const uchar* a, int n, int cellSize)
 {
    if( cellSize == 1 )
-        return normHamming(a, n, result);
+        return normHamming(a, n);
    const uchar* tab = 0;
    if( cellSize == 2 )
        tab = popCountTable2;
    else if( cellSize == 4 )
        tab = popCountTable4;
    else
-        return Error::Unknown;
+        return -1;
    int i = 0;
-    result = 0;
+    int result = 0;
 #if CV_ENABLE_UNROLLED
    for( ; i <= n - 4; i += 4 )
        result += tab[a[i]] + tab[a[i+1]] + tab[a[i+2]] + tab[a[i+3]];
 #endif
    for( ; i < n; i++ )
        result += tab[a[i]];
-    return Error::Ok;
+    return result;
 }

-Error::Code normHamming(const uchar* a, const uchar* b, int n, int cellSize, int & result)
+int normHamming(const uchar* a, const uchar* b, int n, int cellSize)
 {
    if( cellSize == 1 )
-        return normHamming(a, b, n, result);
+        return normHamming(a, b, n);
    const uchar* tab = 0;
    if( cellSize == 2 )
        tab = popCountTable2;
    else if( cellSize == 4 )
        tab = popCountTable4;
    else
-        return Error::Unknown;
+        return -1;
    int i = 0;
-    result = 0;
+    int result = 0;
    #if CV_ENABLE_UNROLLED
    for( ; i <= n - 4; i += 4 )
        result += tab[a[i] ^ b[i]] + tab[a[i+1] ^ b[i+1]] +
@@ -178,7 +178,129 @@ Error::Code normHamming(const uchar* a, const uchar* b, int n, int cellSize, int
    #endif
    for( ; i < n; i++ )
        result += tab[a[i] ^ b[i]];
-    return Error::Ok;
+    return result;
+}
+
+float normL2Sqr_(const float* a, const float* b, int n) // returns the sum over j in [0, n) of (a[j] - b[j])^2, accumulated in float
+{
+    int j = 0; float d = 0.f; // j is shared by all loops below so each one resumes where the previous stopped
+#if CV_SSE
+    float CV_DECL_ALIGNED(16) buf[4]; // scratch for the horizontal sum; written with the aligned store below
+    __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps(); // two independent 4-lane accumulators
+
+    for( ; j <= n - 8; j += 8 ) // 8 floats per iteration, unaligned loads
+    {
+        __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
+        __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
+        d0 = _mm_add_ps(d0, _mm_mul_ps(t0, t0));
+        d1 = _mm_add_ps(d1, _mm_mul_ps(t1, t1));
+    }
+    _mm_store_ps(buf, _mm_add_ps(d0, d1));
+    d = buf[0] + buf[1] + buf[2] + buf[3]; // collapse the four lanes into the scalar accumulator
+#endif
+    {
+        for( ; j <= n - 4; j += 4 ) // scalar loop unrolled by 4; covers everything when the SSE path is compiled out
+        {
+            float t0 = a[j] - b[j], t1 = a[j+1] - b[j+1], t2 = a[j+2] - b[j+2], t3 = a[j+3] - b[j+3];
+            d += t0*t0 + t1*t1 + t2*t2 + t3*t3;
+        }
+    }
+
+    for( ; j < n; j++ ) // remaining tail elements
+    {
+        float t = a[j] - b[j];
+        d += t*t;
+    }
+    return d;
+}
+
+
+float normL1_(const float* a, const float* b, int n) // returns the sum over j in [0, n) of |a[j] - b[j]|, accumulated in float
+{
+    int j = 0; float d = 0.f; // j is shared by all loops below so each one resumes where the previous stopped
+#if CV_SSE
+    float CV_DECL_ALIGNED(16) buf[4]; // scratch for the horizontal sum; written with the aligned store below
+    static const int CV_DECL_ALIGNED(16) absbuf[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff}; // every bit set except the top (sign) bit
+    __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps(); // two independent 4-lane accumulators
+    __m128 absmask = _mm_load_ps((const float*)absbuf); // AND-ing with this clears the sign bit, i.e. takes |x|
+
+    for( ; j <= n - 8; j += 8 ) // 8 floats per iteration, unaligned loads
+    {
+        __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
+        __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
+        d0 = _mm_add_ps(d0, _mm_and_ps(t0, absmask));
+        d1 = _mm_add_ps(d1, _mm_and_ps(t1, absmask));
+    }
+    _mm_store_ps(buf, _mm_add_ps(d0, d1));
+    d = buf[0] + buf[1] + buf[2] + buf[3]; // collapse the four lanes into the scalar accumulator
+#elif CV_NEON
+    float32x4_t v_sum = vdupq_n_f32(0.0f);
+    for ( ; j <= n - 4; j += 4) // 4 floats per iteration
+        v_sum = vaddq_f32(v_sum, vabdq_f32(vld1q_f32(a + j), vld1q_f32(b + j))); // vabdq_f32 yields the per-lane absolute difference
+
+    float CV_DECL_ALIGNED(16) buf[4];
+    vst1q_f32(buf, v_sum);
+    d = buf[0] + buf[1] + buf[2] + buf[3]; // collapse the four lanes into the scalar accumulator
+#endif
+    {
+        for( ; j <= n - 4; j += 4 ) // scalar loop unrolled by 4; covers everything when no SIMD path is compiled in
+        {
+            d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
+            std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
+        }
+    }
+
+    for( ; j < n; j++ ) // remaining tail elements
+        d += std::abs(a[j] - b[j]);
+    return d;
+}
+
+int normL1_(const uchar* a, const uchar* b, int n) // returns the sum over j in [0, n) of |a[j] - b[j]| as an int
+{
+    int j = 0, d = 0; // j is shared by all loops below so each one resumes where the previous stopped
+#if CV_SSE
+    __m128i d0 = _mm_setzero_si128(); // running partial sums produced by _mm_sad_epu8
+
+    for( ; j <= n - 16; j += 16 ) // 16 bytes per iteration, unaligned loads
+    {
+        __m128i t0 = _mm_loadu_si128((const __m128i*)(a + j));
+        __m128i t1 = _mm_loadu_si128((const __m128i*)(b + j));
+
+        d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
+    }
+
+    for( ; j <= n - 4; j += 4 ) // 4 bytes per iteration via a 32-bit load; the other lanes are zero in both operands
+    {
+        __m128i t0 = _mm_cvtsi32_si128(*(const int*)(a + j)); // NOTE(review): int read through a uchar pointer - confirm unaligned access is acceptable on the SSE targets
+        __m128i t1 = _mm_cvtsi32_si128(*(const int*)(b + j));
+
+        d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
+    }
+    d = _mm_cvtsi128_si32(_mm_add_epi32(d0, _mm_unpackhi_epi64(d0, d0))); // add the upper 64-bit half onto the lower one and extract the low 32 bits
+#elif CV_NEON
+    uint32x4_t v_sum = vdupq_n_u32(0); // integer zero: the intrinsic takes a uint32_t, so no float literal is needed
+    for ( ; j <= n - 16; j += 16) // 16 bytes per iteration
+    {
+        uint8x16_t v_dst = vabdq_u8(vld1q_u8(a + j), vld1q_u8(b + j)); // per-byte absolute difference
+        uint16x8_t v_low = vmovl_u8(vget_low_u8(v_dst)), v_high = vmovl_u8(vget_high_u8(v_dst)); // widen to 16 bits
+        v_sum = vaddq_u32(v_sum, vaddl_u16(vget_low_u16(v_low), vget_low_u16(v_high))); // widening add into the 32-bit accumulator
+        v_sum = vaddq_u32(v_sum, vaddl_u16(vget_high_u16(v_low), vget_high_u16(v_high)));
+    }
+
+    uint CV_DECL_ALIGNED(16) buf[4];
+    vst1q_u32(buf, v_sum);
+    d = buf[0] + buf[1] + buf[2] + buf[3]; // collapse the four lanes into the scalar accumulator
+#endif
+    {
+        for( ; j <= n - 4; j += 4 ) // scalar loop unrolled by 4; covers everything when no SIMD path is compiled in
+        {
+            d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
+            std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
+        }
+    }
+    for( ; j < n; j++ ) // remaining tail elements
+        d += std::abs(a[j] - b[j]);
+    return d;
 }

 }} //cv::hal