Merge pull request #3935 from vpisarev:extending_hal_part1

2015-04-21 14:02:02 +00:00
parent dce0405c4d 926754a66e
commit 063e4004ba
30 changed files with 5964 additions and 2173 deletions
--- a/modules/core/include/opencv2/core/base.hpp
+++ b/modules/core/include/opencv2/core/base.hpp
@@ -53,6 +53,7 @@

 #include "opencv2/core/cvdef.h"
 #include "opencv2/core/cvstd.hpp"
+#include "opencv2/hal.hpp"

 namespace cv
 {
@@ -400,136 +401,6 @@ configurations while CV_DbgAssert is only retained in the Debug configuration.
 #  define CV_DbgAssert(expr)
 #endif

-
-/////////////// saturate_cast (used in image & signal processing) ///////////////////
-
-/**
-Template function for accurate conversion from one primitive type to another.
-
-The functions saturate_cast resemble the standard C++ cast operations, such as static_cast\<T\>()
-and others. They perform an efficient and accurate conversion from one primitive type to another
-(see the introduction chapter). saturate in the name means that when the input value v is out of the
-range of the target type, the result is not formed just by taking low bits of the input, but instead
-the value is clipped. For example:
-@code
-    uchar a = saturate_cast<uchar>(-100); // a = 0 (UCHAR_MIN)
-    short b = saturate_cast<short>(33333.33333); // b = 32767 (SHRT_MAX)
-@endcode
-Such clipping is done when the target type is unsigned char , signed char , unsigned short or
-signed short . For 32-bit integers, no clipping is done.
-
-When the parameter is a floating-point value and the target type is an integer (8-, 16- or 32-bit),
-the floating-point value is first rounded to the nearest integer and then clipped if needed (when
-the target type is 8- or 16-bit).
-
-This operation is used in the simplest or most complex image processing functions in OpenCV.
-
-@param v Function parameter.
-@sa add, subtract, multiply, divide, Mat::convertTo
-*/
-template<typename _Tp> static inline _Tp saturate_cast(uchar v)    { return _Tp(v); }
-/** @overload */
-template<typename _Tp> static inline _Tp saturate_cast(schar v)    { return _Tp(v); }
-/** @overload */
-template<typename _Tp> static inline _Tp saturate_cast(ushort v)   { return _Tp(v); }
-/** @overload */
-template<typename _Tp> static inline _Tp saturate_cast(short v)    { return _Tp(v); }
-/** @overload */
-template<typename _Tp> static inline _Tp saturate_cast(unsigned v) { return _Tp(v); }
-/** @overload */
-template<typename _Tp> static inline _Tp saturate_cast(int v)      { return _Tp(v); }
-/** @overload */
-template<typename _Tp> static inline _Tp saturate_cast(float v)    { return _Tp(v); }
-/** @overload */
-template<typename _Tp> static inline _Tp saturate_cast(double v)   { return _Tp(v); }
-/** @overload */
-template<typename _Tp> static inline _Tp saturate_cast(int64 v)    { return _Tp(v); }
-/** @overload */
-template<typename _Tp> static inline _Tp saturate_cast(uint64 v)   { return _Tp(v); }
-
-//! @cond IGNORED
-
-template<> inline uchar saturate_cast<uchar>(schar v)        { return (uchar)std::max((int)v, 0); }
-template<> inline uchar saturate_cast<uchar>(ushort v)       { return (uchar)std::min((unsigned)v, (unsigned)UCHAR_MAX); }
-template<> inline uchar saturate_cast<uchar>(int v)          { return (uchar)((unsigned)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); }
-template<> inline uchar saturate_cast<uchar>(short v)        { return saturate_cast<uchar>((int)v); }
-template<> inline uchar saturate_cast<uchar>(unsigned v)     { return (uchar)std::min(v, (unsigned)UCHAR_MAX); }
-template<> inline uchar saturate_cast<uchar>(float v)        { int iv = cvRound(v); return saturate_cast<uchar>(iv); }
-template<> inline uchar saturate_cast<uchar>(double v)       { int iv = cvRound(v); return saturate_cast<uchar>(iv); }
-template<> inline uchar saturate_cast<uchar>(int64 v)        { return (uchar)((uint64)v <= (uint64)UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); }
-template<> inline uchar saturate_cast<uchar>(uint64 v)       { return (uchar)std::min(v, (uint64)UCHAR_MAX); }
-
-template<> inline schar saturate_cast<schar>(uchar v)        { return (schar)std::min((int)v, SCHAR_MAX); }
-template<> inline schar saturate_cast<schar>(ushort v)       { return (schar)std::min((unsigned)v, (unsigned)SCHAR_MAX); }
-template<> inline schar saturate_cast<schar>(int v)          { return (schar)((unsigned)(v-SCHAR_MIN) <= (unsigned)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); }
-template<> inline schar saturate_cast<schar>(short v)        { return saturate_cast<schar>((int)v); }
-template<> inline schar saturate_cast<schar>(unsigned v)     { return (schar)std::min(v, (unsigned)SCHAR_MAX); }
-template<> inline schar saturate_cast<schar>(float v)        { int iv = cvRound(v); return saturate_cast<schar>(iv); }
-template<> inline schar saturate_cast<schar>(double v)       { int iv = cvRound(v); return saturate_cast<schar>(iv); }
-template<> inline schar saturate_cast<schar>(int64 v)        { return (schar)((uint64)((int64)v-SCHAR_MIN) <= (uint64)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); }
-template<> inline schar saturate_cast<schar>(uint64 v)       { return (schar)std::min(v, (uint64)SCHAR_MAX); }
-
-template<> inline ushort saturate_cast<ushort>(schar v)      { return (ushort)std::max((int)v, 0); }
-template<> inline ushort saturate_cast<ushort>(short v)      { return (ushort)std::max((int)v, 0); }
-template<> inline ushort saturate_cast<ushort>(int v)        { return (ushort)((unsigned)v <= (unsigned)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); }
-template<> inline ushort saturate_cast<ushort>(unsigned v)   { return (ushort)std::min(v, (unsigned)USHRT_MAX); }
-template<> inline ushort saturate_cast<ushort>(float v)      { int iv = cvRound(v); return saturate_cast<ushort>(iv); }
-template<> inline ushort saturate_cast<ushort>(double v)     { int iv = cvRound(v); return saturate_cast<ushort>(iv); }
-template<> inline ushort saturate_cast<ushort>(int64 v)      { return (ushort)((uint64)v <= (uint64)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); }
-template<> inline ushort saturate_cast<ushort>(uint64 v)     { return (ushort)std::min(v, (uint64)USHRT_MAX); }
-
-template<> inline short saturate_cast<short>(ushort v)       { return (short)std::min((int)v, SHRT_MAX); }
-template<> inline short saturate_cast<short>(int v)          { return (short)((unsigned)(v - SHRT_MIN) <= (unsigned)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); }
-template<> inline short saturate_cast<short>(unsigned v)     { return (short)std::min(v, (unsigned)SHRT_MAX); }
-template<> inline short saturate_cast<short>(float v)        { int iv = cvRound(v); return saturate_cast<short>(iv); }
-template<> inline short saturate_cast<short>(double v)       { int iv = cvRound(v); return saturate_cast<short>(iv); }
-template<> inline short saturate_cast<short>(int64 v)        { return (short)((uint64)((int64)v - SHRT_MIN) <= (uint64)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); }
-template<> inline short saturate_cast<short>(uint64 v)       { return (short)std::min(v, (uint64)SHRT_MAX); }
-
-template<> inline int saturate_cast<int>(float v)            { return cvRound(v); }
-template<> inline int saturate_cast<int>(double v)           { return cvRound(v); }
-
-// we intentionally do not clip negative numbers, to make -1 become 0xffffffff etc.
-template<> inline unsigned saturate_cast<unsigned>(float v)  { return cvRound(v); }
-template<> inline unsigned saturate_cast<unsigned>(double v) { return cvRound(v); }
-
-//! @endcond
-
-//////////////////////////////// low-level functions ////////////////////////////////
-
-CV_EXPORTS int LU(float* A, size_t astep, int m, float* b, size_t bstep, int n);
-CV_EXPORTS int LU(double* A, size_t astep, int m, double* b, size_t bstep, int n);
-CV_EXPORTS bool Cholesky(float* A, size_t astep, int m, float* b, size_t bstep, int n);
-CV_EXPORTS bool Cholesky(double* A, size_t astep, int m, double* b, size_t bstep, int n);
-
-CV_EXPORTS int normL1_(const uchar* a, const uchar* b, int n);
-CV_EXPORTS float normL1_(const float* a, const float* b, int n);
-CV_EXPORTS float normL2Sqr_(const float* a, const float* b, int n);
-
-CV_EXPORTS void exp(const float* src, float* dst, int n);
-CV_EXPORTS void log(const float* src, float* dst, int n);
-
-CV_EXPORTS void fastAtan2(const float* y, const float* x, float* dst, int n, bool angleInDegrees);
-CV_EXPORTS void magnitude(const float* x, const float* y, float* dst, int n);
-
-/** @brief Computes the cube root of an argument.
-
-The function cubeRoot computes \f$\sqrt[3]{\texttt{val}}\f$. Negative arguments are handled correctly.
-NaN and Inf are not handled. The accuracy approaches the maximum possible accuracy for
-single-precision data.
-@param val A function argument.
- */
-CV_EXPORTS_W float cubeRoot(float val);
-
-/** @brief Calculates the angle of a 2D vector in degrees.
-
-The function fastAtan2 calculates the full-range angle of an input 2D vector. The angle is measured
-in degrees and varies from 0 to 360 degrees. The accuracy is about 0.3 degrees.
-@param x x-coordinate of the vector.
-@param y y-coordinate of the vector.
- */
-CV_EXPORTS_W float fastAtan2(float y, float x);
-
 /*
 * Hamming distance functor - counts the bit differences between two strings - useful for the Brief descriptor
 * bit count of A exclusive XOR'ed with B
@@ -549,6 +420,11 @@ typedef Hamming HammingLUT;

 /////////////////////////////////// inline norms ////////////////////////////////////

+template<typename _Tp> inline _Tp cv_abs(_Tp x) { return std::abs(x); } // generic case: defer to std::abs, result keeps the argument type
+inline int cv_abs(uchar x) { return x; } // no std::abs call: x is returned as-is, widened to int
+inline int cv_abs(schar x) { return std::abs(x); } // result is returned as int rather than schar
+inline int cv_abs(ushort x) { return x; } // no std::abs call: x is returned as-is, widened to int
+inline int cv_abs(short x) { return std::abs(x); } // result is returned as int rather than short

 template<typename _Tp, typename _AccTp> static inline
 _AccTp normL2Sqr(const _Tp* a, int n)
@@ -578,12 +454,12 @@ _AccTp normL1(const _Tp* a, int n)
 #if CV_ENABLE_UNROLLED
    for(; i <= n - 4; i += 4 )
    {
-        s += (_AccTp)std::abs(a[i]) + (_AccTp)std::abs(a[i+1]) +
-            (_AccTp)std::abs(a[i+2]) + (_AccTp)std::abs(a[i+3]);
+        s += (_AccTp)cv_abs(a[i]) + (_AccTp)cv_abs(a[i+1]) +
+            (_AccTp)cv_abs(a[i+2]) + (_AccTp)cv_abs(a[i+3]);
    }
 #endif
    for( ; i < n; i++ )
-        s += std::abs(a[i]);
+        s += cv_abs(a[i]);
    return s;
 }

@@ -592,7 +468,7 @@ _AccTp normInf(const _Tp* a, int n)
 {
    _AccTp s = 0;
    for( int i = 0; i < n; i++ )
-        s = std::max(s, (_AccTp)std::abs(a[i]));
+        s = std::max(s, (_AccTp)cv_abs(a[i]));
    return s;
 }

@@ -616,11 +492,10 @@ _AccTp normL2Sqr(const _Tp* a, const _Tp* b, int n)
    return s;
 }

-template<> inline
-float normL2Sqr(const float* a, const float* b, int n)
+inline float normL2Sqr(const float* a, const float* b, int n)
 {
    if( n >= 8 )
-        return normL2Sqr_(a, b, n);
+        return hal::normL2Sqr_(a, b, n);
    float s = 0;
    for( int i = 0; i < n; i++ )
    {
@@ -650,11 +525,10 @@ _AccTp normL1(const _Tp* a, const _Tp* b, int n)
    return s;
 }

-template<> inline
-float normL1(const float* a, const float* b, int n)
+inline float normL1(const float* a, const float* b, int n)
 {
    if( n >= 8 )
-        return normL1_(a, b, n);
+        return hal::normL1_(a, b, n);
    float s = 0;
    for( int i = 0; i < n; i++ )
    {
@@ -664,10 +538,9 @@ float normL1(const float* a, const float* b, int n)
    return s;
 }

-template<> inline
-int normL1(const uchar* a, const uchar* b, int n)
+inline int normL1(const uchar* a, const uchar* b, int n)
 {
-    return normL1_(a, b, n);
+    return hal::normL1_(a, b, n);
 }

 template<typename _Tp, typename _AccTp> static inline
@@ -682,6 +555,23 @@ _AccTp normInf(const _Tp* a, const _Tp* b, int n)
    return s;
 }

+/** @brief Computes the cube root of an argument.
+
+ The function cubeRoot computes \f$\sqrt[3]{\texttt{val}}\f$. Negative arguments are handled correctly.
+ NaN and Inf are not handled. The accuracy approaches the maximum possible accuracy for
+ single-precision data.
+ @param val A function argument.
+ */
+CV_EXPORTS_W float cubeRoot(float val);
+
+/** @brief Calculates the angle of a 2D vector in degrees.
+
+ The function fastAtan2 calculates the full-range angle of an input 2D vector. The angle is measured
+ in degrees and varies from 0 to 360 degrees. The accuracy is about 0.3 degrees.
+ @param x x-coordinate of the vector.
+ @param y y-coordinate of the vector.
+ */
+CV_EXPORTS_W float fastAtan2(float y, float x);

 ////////////////// forward declarations for important OpenCV types //////////////////

--- a/modules/core/include/opencv2/core/cvdef.h
+++ b/modules/core/include/opencv2/core/cvdef.h
@@ -70,16 +70,6 @@
 #  define CV_EXPORTS
 #endif

-#ifndef CV_INLINE
-#  if defined __cplusplus
-#    define CV_INLINE static inline
-#  elif defined _MSC_VER
-#    define CV_INLINE __inline
-#  else
-#    define CV_INLINE static
-#  endif
-#endif
-
 #ifndef CV_EXTERN_C
 #  ifdef __cplusplus
 #    define CV_EXTERN_C extern "C"
@@ -186,19 +176,6 @@
 #define CV_ELEM_SIZE(type) \
    (CV_MAT_CN(type) << ((((sizeof(size_t)/4+1)*16384|0x3a50) >> CV_MAT_DEPTH(type)*2) & 3))

-
-/****************************************************************************************\
-*                                      fast math                                         *
-\****************************************************************************************/
-
-#if defined __BORLANDC__
-#  include <fastmath.h>
-#elif defined __cplusplus
-#  include <cmath>
-#else
-#  include <math.h>
-#endif
-
 #ifndef MIN
 #  define MIN(a,b)  ((a) > (b) ? (b) : (a))
 #endif
@@ -207,164 +184,6 @@
 #  define MAX(a,b)  ((a) < (b) ? (b) : (a))
 #endif

-#ifdef HAVE_TEGRA_OPTIMIZATION
-#  include "tegra_round.hpp"
-#endif
-
-//! @addtogroup core_utils
-//! @{
-
-#if CV_VFP
-// 1. general scheme
-#define ARM_ROUND(_value, _asm_string) \
-    int res; \
-    float temp; \
-    asm(_asm_string : [res] "=r" (res), [temp] "=w" (temp) : [value] "w" (_value)); \
-    return res;
-// 2. version for double
-#ifdef __clang__
-#define ARM_ROUND_DBL(value) ARM_ROUND(value, "vcvtr.s32.f64 %[temp], %[value] \n vmov %[res], %[temp]")
-#else
-#define ARM_ROUND_DBL(value) ARM_ROUND(value, "vcvtr.s32.f64 %[temp], %P[value] \n vmov %[res], %[temp]")
-#endif
-// 3. version for float
-#define ARM_ROUND_FLT(value) ARM_ROUND(value, "vcvtr.s32.f32 %[temp], %[value]\n vmov %[res], %[temp]")
-#endif // CV_VFP
-
-/** @brief Rounds floating-point number to the nearest integer
-
-@param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the
-result is not defined.
- */
-CV_INLINE int cvRound( double value )
-{
-#if ((defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __x86_64__ && defined __SSE2__ && !defined __APPLE__)) && !defined(__CUDACC__)
-    __m128d t = _mm_set_sd( value );
-    return _mm_cvtsd_si32(t);
-#elif defined _MSC_VER && defined _M_IX86
-    int t;
-    __asm
-    {
-        fld value;
-        fistp t;
-    }
-    return t;
-#elif ((defined _MSC_VER && defined _M_ARM) || defined CV_ICC || defined __GNUC__) && defined HAVE_TEGRA_OPTIMIZATION
-    TEGRA_ROUND_DBL(value);
-#elif defined CV_ICC || defined __GNUC__
-# if CV_VFP
-    ARM_ROUND_DBL(value)
-# else
-    return (int)lrint(value);
-# endif
-#else
-    double intpart, fractpart;
-    fractpart = modf(value, &intpart);
-    if ((fabs(fractpart) != 0.5) || ((((int)intpart) % 2) != 0))
-        return (int)(value + (value >= 0 ? 0.5 : -0.5));
-    else
-        return (int)intpart;
-#endif
-}
-
-#ifdef __cplusplus
-
-/** @overload */
-CV_INLINE int cvRound(float value)
-{
-#if defined ANDROID && (defined CV_ICC || defined __GNUC__) && defined HAVE_TEGRA_OPTIMIZATION
-    TEGRA_ROUND_FLT(value);
-#elif CV_VFP && !defined HAVE_TEGRA_OPTIMIZATION
-    ARM_ROUND_FLT(value)
-#else
-    return cvRound((double)value);
-#endif
-}
-
-/** @overload */
-CV_INLINE int cvRound(int value)
-{
-    return value;
-}
-
-#endif // __cplusplus
-
-/** @brief Rounds floating-point number to the nearest integer not larger than the original.
-
-The function computes an integer i such that:
-\f[i \le \texttt{value} < i+1\f]
-@param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the
-result is not defined.
- */
-CV_INLINE int cvFloor( double value )
-{
-#if (defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__ && !defined __APPLE__)) && !defined(__CUDACC__)
-    __m128d t = _mm_set_sd( value );
-    int i = _mm_cvtsd_si32(t);
-    return i - _mm_movemask_pd(_mm_cmplt_sd(t, _mm_cvtsi32_sd(t,i)));
-#elif defined __GNUC__
-    int i = (int)value;
-    return i - (i > value);
-#else
-    int i = cvRound(value);
-    float diff = (float)(value - i);
-    return i - (diff < 0);
-#endif
-}
-
-/** @brief Rounds floating-point number to the nearest integer not larger than the original.
-
-The function computes an integer i such that:
-\f[i \le \texttt{value} < i+1\f]
-@param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the
-result is not defined.
-*/
-CV_INLINE int cvCeil( double value )
-{
-#if (defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__&& !defined __APPLE__)) && !defined(__CUDACC__)
-    __m128d t = _mm_set_sd( value );
-    int i = _mm_cvtsd_si32(t);
-    return i + _mm_movemask_pd(_mm_cmplt_sd(_mm_cvtsi32_sd(t,i), t));
-#elif defined __GNUC__
-    int i = (int)value;
-    return i + (i < value);
-#else
-    int i = cvRound(value);
-    float diff = (float)(i - value);
-    return i + (diff < 0);
-#endif
-}
-
-/** @brief Determines if the argument is Not A Number.
-
-@param value The input floating-point value
-
-The function returns 1 if the argument is Not A Number (as defined by IEEE754 standard), 0
-otherwise. */
-CV_INLINE int cvIsNaN( double value )
-{
-    union { uint64 u; double f; } ieee754;
-    ieee754.f = value;
-    return ((unsigned)(ieee754.u >> 32) & 0x7fffffff) +
-           ((unsigned)ieee754.u != 0) > 0x7ff00000;
-}
-
-/** @brief Determines if the argument is Infinity.
-
-@param value The input floating-point value
-
-The function returns 1 if the argument is a plus or minus infinity (as defined by IEEE754 standard)
-and 0 otherwise. */
-CV_INLINE int cvIsInf( double value )
-{
-    union { uint64 u; double f; } ieee754;
-    ieee754.f = value;
-    return ((unsigned)(ieee754.u >> 32) & 0x7fffffff) == 0x7ff00000 &&
-           (unsigned)ieee754.u == 0;
-}
-
-//! @} core_utils
-
 /****************************************************************************************\
 *          exchange-add operation for atomic operations on reference counters            *
 \****************************************************************************************/
--- a/modules/core/include/opencv2/core/matx.hpp
+++ b/modules/core/include/opencv2/core/matx.hpp
@@ -427,7 +427,7 @@ template<typename _Tp, int m> struct Matx_DetOp
    double operator ()(const Matx<_Tp, m, m>& a) const
    {
        Matx<_Tp, m, m> temp = a;
-        double p = LU(temp.val, m*sizeof(_Tp), m, 0, 0, 0);
+        double p = hal::LU(temp.val, m*sizeof(_Tp), m, 0, 0, 0);
        if( p == 0 )
            return p;
        for( int i = 0; i < m; i++ )
--- a/modules/core/include/opencv2/core/operations.hpp
+++ b/modules/core/include/opencv2/core/operations.hpp
@@ -72,9 +72,9 @@ template<typename _Tp, int m> struct Matx_FastInvOp
            b(i, i) = (_Tp)1;

        if( method == DECOMP_CHOLESKY )
-            return Cholesky(temp.val, m*sizeof(_Tp), m, b.val, m*sizeof(_Tp), m);
+            return hal::Cholesky(temp.val, m*sizeof(_Tp), m, b.val, m*sizeof(_Tp), m);

-        return LU(temp.val, m*sizeof(_Tp), m, b.val, m*sizeof(_Tp), m) != 0;
+        return hal::LU(temp.val, m*sizeof(_Tp), m, b.val, m*sizeof(_Tp), m) != 0;
    }
 };

--- a/modules/core/include/opencv2/core/private.hpp
+++ b/modules/core/include/opencv2/core/private.hpp
@@ -136,14 +136,6 @@ namespace cv
 /* the alignment of all the allocated buffers */
 #define  CV_MALLOC_ALIGN    16

-#ifdef __GNUC__
-#  define CV_DECL_ALIGNED(x) __attribute__ ((aligned (x)))
-#elif defined _MSC_VER
-#  define CV_DECL_ALIGNED(x) __declspec(align(x))
-#else
-#  define CV_DECL_ALIGNED(x)
-#endif
-
 /* IEEE754 constants and macros */
 #define  CV_TOGGLE_FLT(x) ((x)^((int)(x) < 0 ? 0x7fffffff : 0))
 #define  CV_TOGGLE_DBL(x) ((x)^((int64)(x) < 0 ? CV_BIG_INT(0x7fffffffffffffff) : 0))
--- a/modules/core/include/opencv2/core/types_c.h
+++ b/modules/core/include/opencv2/core/types_c.h
@@ -113,22 +113,6 @@ bytes of the header. In C++ interface the role of CvArr is played by InputArray
 */
 typedef void CvArr;

-typedef union Cv32suf
-{
-    int i;
-    unsigned u;
-    float f;
-}
-Cv32suf;
-
-typedef union Cv64suf
-{
-    int64 i;
-    uint64 u;
-    double f;
-}
-Cv64suf;
-
 typedef int CVStatus;

 /** @see cv::Error::Code */
--- a/modules/core/src/kmeans.cpp
+++ b/modules/core/src/kmeans.cpp
@@ -79,7 +79,7 @@ public:

        for ( int i = begin; i<end; i++ )
        {
-            tdist2[i] = std::min(normL2Sqr_(data + step*i, data + stepci, dims), dist[i]);
+            tdist2[i] = std::min(normL2Sqr(data + step*i, data + stepci, dims), dist[i]);
        }
    }

@@ -114,7 +114,7 @@ static void generateCentersPP(const Mat& _data, Mat& _out_centers,

    for( i = 0; i < N; i++ )
    {
-        dist[i] = normL2Sqr_(data + step*i, data + step*centers[0], dims);
+        dist[i] = normL2Sqr(data + step*i, data + step*centers[0], dims);
        sum0 += dist[i];
    }

@@ -189,7 +189,7 @@ public:
            for( int k = 0; k < K; k++ )
            {
                const float* center = centers.ptr<float>(k);
-                const double dist = normL2Sqr_(sample, center, dims);
+                const double dist = normL2Sqr(sample, center, dims);

                if( min_dist > dist )
                {
@@ -384,7 +384,7 @@ double cv::kmeans( InputArray _data, int K,
                        if( labels[i] != max_k )
                            continue;
                        sample = data.ptr<float>(i);
-                        double dist = normL2Sqr_(sample, _old_center, dims);
+                        double dist = normL2Sqr(sample, _old_center, dims);

                        if( max_dist <= dist )
                        {
--- a/modules/core/src/lapack.cpp
+++ b/modules/core/src/lapack.cpp
@@ -50,168 +50,6 @@
 namespace cv
 {

-/****************************************************************************************\
-*                     LU & Cholesky implementation for small matrices                    *
-\****************************************************************************************/
-
-template<typename _Tp> static inline int
-LUImpl(_Tp* A, size_t astep, int m, _Tp* b, size_t bstep, int n)
-{
-    int i, j, k, p = 1;
-    astep /= sizeof(A[0]);
-    bstep /= sizeof(b[0]);
-
-    for( i = 0; i < m; i++ )
-    {
-        k = i;
-
-        for( j = i+1; j < m; j++ )
-            if( std::abs(A[j*astep + i]) > std::abs(A[k*astep + i]) )
-                k = j;
-
-        if( std::abs(A[k*astep + i]) < std::numeric_limits<_Tp>::epsilon() )
-            return 0;
-
-        if( k != i )
-        {
-            for( j = i; j < m; j++ )
-                std::swap(A[i*astep + j], A[k*astep + j]);
-            if( b )
-                for( j = 0; j < n; j++ )
-                    std::swap(b[i*bstep + j], b[k*bstep + j]);
-            p = -p;
-        }
-
-        _Tp d = -1/A[i*astep + i];
-
-        for( j = i+1; j < m; j++ )
-        {
-            _Tp alpha = A[j*astep + i]*d;
-
-            for( k = i+1; k < m; k++ )
-                A[j*astep + k] += alpha*A[i*astep + k];
-
-            if( b )
-                for( k = 0; k < n; k++ )
-                    b[j*bstep + k] += alpha*b[i*bstep + k];
-        }
-
-        A[i*astep + i] = -d;
-    }
-
-    if( b )
-    {
-        for( i = m-1; i >= 0; i-- )
-            for( j = 0; j < n; j++ )
-            {
-                _Tp s = b[i*bstep + j];
-                for( k = i+1; k < m; k++ )
-                    s -= A[i*astep + k]*b[k*bstep + j];
-                b[i*bstep + j] = s*A[i*astep + i];
-            }
-    }
-
-    return p;
-}
-
-
-int LU(float* A, size_t astep, int m, float* b, size_t bstep, int n)
-{
-    return LUImpl(A, astep, m, b, bstep, n);
-}
-
-
-int LU(double* A, size_t astep, int m, double* b, size_t bstep, int n)
-{
-    return LUImpl(A, astep, m, b, bstep, n);
-}
-
-
-template<typename _Tp> static inline bool
-CholImpl(_Tp* A, size_t astep, int m, _Tp* b, size_t bstep, int n)
-{
-    _Tp* L = A;
-    int i, j, k;
-    double s;
-    astep /= sizeof(A[0]);
-    bstep /= sizeof(b[0]);
-
-    for( i = 0; i < m; i++ )
-    {
-        for( j = 0; j < i; j++ )
-        {
-            s = A[i*astep + j];
-            for( k = 0; k < j; k++ )
-                s -= L[i*astep + k]*L[j*astep + k];
-            L[i*astep + j] = (_Tp)(s*L[j*astep + j]);
-        }
-        s = A[i*astep + i];
-        for( k = 0; k < j; k++ )
-        {
-            double t = L[i*astep + k];
-            s -= t*t;
-        }
-        if( s < std::numeric_limits<_Tp>::epsilon() )
-            return false;
-        L[i*astep + i] = (_Tp)(1./std::sqrt(s));
-    }
-
-    if( !b )
-        return true;
-
-    // LLt x = b
-    // 1: L y = b
-    // 2. Lt x = y
-
-    /*
-     [ L00             ]  y0   b0
-     [ L10 L11         ]  y1 = b1
-     [ L20 L21 L22     ]  y2   b2
-     [ L30 L31 L32 L33 ]  y3   b3
-
-     [ L00 L10 L20 L30 ]  x0   y0
-     [     L11 L21 L31 ]  x1 = y1
-     [         L22 L32 ]  x2   y2
-     [             L33 ]  x3   y3
-    */
-
-    for( i = 0; i < m; i++ )
-    {
-        for( j = 0; j < n; j++ )
-        {
-            s = b[i*bstep + j];
-            for( k = 0; k < i; k++ )
-                s -= L[i*astep + k]*b[k*bstep + j];
-            b[i*bstep + j] = (_Tp)(s*L[i*astep + i]);
-        }
-    }
-
-    for( i = m-1; i >= 0; i-- )
-    {
-        for( j = 0; j < n; j++ )
-        {
-            s = b[i*bstep + j];
-            for( k = m-1; k > i; k-- )
-                s -= L[k*astep + i]*b[k*bstep + j];
-            b[i*bstep + j] = (_Tp)(s*L[i*astep + i]);
-        }
-    }
-
-    return true;
-}
-
-
-bool Cholesky(float* A, size_t astep, int m, float* b, size_t bstep, int n)
-{
-    return CholImpl(A, astep, m, b, bstep, n);
-}
-
-bool Cholesky(double* A, size_t astep, int m, double* b, size_t bstep, int n)
-{
-    return CholImpl(A, astep, m, b, bstep, n);
-}
-
-
 template<typename _Tp> static inline _Tp hypot(_Tp a, _Tp b)
 {
    a = std::abs(a);
@@ -882,7 +720,7 @@ double cv::determinant( InputArray _mat )
            Mat a(rows, rows, CV_32F, (uchar*)buffer);
            mat.copyTo(a);

-            result = LU(a.ptr<float>(), a.step, rows, 0, 0, 0);
+            result = hal::LU(a.ptr<float>(), a.step, rows, 0, 0, 0);
            if( result )
            {
                for( int i = 0; i < rows; i++ )
@@ -906,7 +744,7 @@ double cv::determinant( InputArray _mat )
            Mat a(rows, rows, CV_64F, (uchar*)buffer);
            mat.copyTo(a);

-            result = LU(a.ptr<double>(), a.step, rows, 0, 0, 0);
+            result = hal::LU(a.ptr<double>(), a.step, rows, 0, 0, 0);
            if( result )
            {
                for( int i = 0; i < rows; i++ )
@@ -1169,13 +1007,13 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )
    setIdentity(dst);

    if( method == DECOMP_LU && type == CV_32F )
-        result = LU(src1.ptr<float>(), src1.step, n, dst.ptr<float>(), dst.step, n) != 0;
+        result = hal::LU(src1.ptr<float>(), src1.step, n, dst.ptr<float>(), dst.step, n) != 0;
    else if( method == DECOMP_LU && type == CV_64F )
-        result = LU(src1.ptr<double>(), src1.step, n, dst.ptr<double>(), dst.step, n) != 0;
+        result = hal::LU(src1.ptr<double>(), src1.step, n, dst.ptr<double>(), dst.step, n) != 0;
    else if( method == DECOMP_CHOLESKY && type == CV_32F )
-        result = Cholesky(src1.ptr<float>(), src1.step, n, dst.ptr<float>(), dst.step, n);
+        result = hal::Cholesky(src1.ptr<float>(), src1.step, n, dst.ptr<float>(), dst.step, n);
    else
-        result = Cholesky(src1.ptr<double>(), src1.step, n, dst.ptr<double>(), dst.step, n);
+        result = hal::Cholesky(src1.ptr<double>(), src1.step, n, dst.ptr<double>(), dst.step, n);

    if( !result )
        dst = Scalar(0);
@@ -1407,16 +1245,16 @@ bool cv::solve( InputArray _src, InputArray _src2arg, OutputArray _dst, int meth
    if( method == DECOMP_LU )
    {
        if( type == CV_32F )
-            result = LU(a.ptr<float>(), a.step, n, dst.ptr<float>(), dst.step, nb) != 0;
+            result = hal::LU(a.ptr<float>(), a.step, n, dst.ptr<float>(), dst.step, nb) != 0;
        else
-            result = LU(a.ptr<double>(), a.step, n, dst.ptr<double>(), dst.step, nb) != 0;
+            result = hal::LU(a.ptr<double>(), a.step, n, dst.ptr<double>(), dst.step, nb) != 0;
    }
    else if( method == DECOMP_CHOLESKY )
    {
        if( type == CV_32F )
-            result = Cholesky(a.ptr<float>(), a.step, n, dst.ptr<float>(), dst.step, nb);
+            result = hal::Cholesky(a.ptr<float>(), a.step, n, dst.ptr<float>(), dst.step, nb);
        else
-            result = Cholesky(a.ptr<double>(), a.step, n, dst.ptr<double>(), dst.step, nb);
+            result = hal::Cholesky(a.ptr<double>(), a.step, n, dst.ptr<double>(), dst.step, nb);
    }
    else
    {
--- a/modules/core/src/mathfuncs.cpp
+++ b/modules/core/src/mathfuncs.cpp
--- a/modules/core/src/stat.cpp
+++ b/modules/core/src/stat.cpp
@@ -2416,140 +2416,6 @@ void cv::minMaxLoc( InputArray _img, double* minVal, double* maxVal,
 namespace cv
 {

-float normL2Sqr_(const float* a, const float* b, int n)
-{
-    int j = 0; float d = 0.f;
-#if CV_SSE
-    if( USE_SSE2 )
-    {
-        float CV_DECL_ALIGNED(16) buf[4];
-        __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();
-
-        for( ; j <= n - 8; j += 8 )
-        {
-            __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
-            __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
-            d0 = _mm_add_ps(d0, _mm_mul_ps(t0, t0));
-            d1 = _mm_add_ps(d1, _mm_mul_ps(t1, t1));
-        }
-        _mm_store_ps(buf, _mm_add_ps(d0, d1));
-        d = buf[0] + buf[1] + buf[2] + buf[3];
-    }
-    else
-#endif
-    {
-        for( ; j <= n - 4; j += 4 )
-        {
-            float t0 = a[j] - b[j], t1 = a[j+1] - b[j+1], t2 = a[j+2] - b[j+2], t3 = a[j+3] - b[j+3];
-            d += t0*t0 + t1*t1 + t2*t2 + t3*t3;
-        }
-    }
-
-    for( ; j < n; j++ )
-    {
-        float t = a[j] - b[j];
-        d += t*t;
-    }
-    return d;
-}
-
-
-float normL1_(const float* a, const float* b, int n)
-{
-    int j = 0; float d = 0.f;
-#if CV_SSE
-    if( USE_SSE2 )
-    {
-        float CV_DECL_ALIGNED(16) buf[4];
-        static const int CV_DECL_ALIGNED(16) absbuf[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
-        __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();
-        __m128 absmask = _mm_load_ps((const float*)absbuf);
-
-        for( ; j <= n - 8; j += 8 )
-        {
-            __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
-            __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
-            d0 = _mm_add_ps(d0, _mm_and_ps(t0, absmask));
-            d1 = _mm_add_ps(d1, _mm_and_ps(t1, absmask));
-        }
-        _mm_store_ps(buf, _mm_add_ps(d0, d1));
-        d = buf[0] + buf[1] + buf[2] + buf[3];
-    }
-    else
-#elif CV_NEON
-    float32x4_t v_sum = vdupq_n_f32(0.0f);
-    for ( ; j <= n - 4; j += 4)
-        v_sum = vaddq_f32(v_sum, vabdq_f32(vld1q_f32(a + j), vld1q_f32(b + j)));
-
-    float CV_DECL_ALIGNED(16) buf[4];
-    vst1q_f32(buf, v_sum);
-    d = buf[0] + buf[1] + buf[2] + buf[3];
-#endif
-    {
-        for( ; j <= n - 4; j += 4 )
-        {
-            d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
-                    std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
-        }
-    }
-
-    for( ; j < n; j++ )
-        d += std::abs(a[j] - b[j]);
-    return d;
-}
-
-int normL1_(const uchar* a, const uchar* b, int n)
-{
-    int j = 0, d = 0;
-#if CV_SSE
-    if( USE_SSE2 )
-    {
-        __m128i d0 = _mm_setzero_si128();
-
-        for( ; j <= n - 16; j += 16 )
-        {
-            __m128i t0 = _mm_loadu_si128((const __m128i*)(a + j));
-            __m128i t1 = _mm_loadu_si128((const __m128i*)(b + j));
-
-            d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
-        }
-
-        for( ; j <= n - 4; j += 4 )
-        {
-            __m128i t0 = _mm_cvtsi32_si128(*(const int*)(a + j));
-            __m128i t1 = _mm_cvtsi32_si128(*(const int*)(b + j));
-
-            d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
-        }
-        d = _mm_cvtsi128_si32(_mm_add_epi32(d0, _mm_unpackhi_epi64(d0, d0)));
-    }
-    else
-#elif CV_NEON
-    uint32x4_t v_sum = vdupq_n_u32(0.0f);
-    for ( ; j <= n - 16; j += 16)
-    {
-        uint8x16_t v_dst = vabdq_u8(vld1q_u8(a + j), vld1q_u8(b + j));
-        uint16x8_t v_low = vmovl_u8(vget_low_u8(v_dst)), v_high = vmovl_u8(vget_high_u8(v_dst));
-        v_sum = vaddq_u32(v_sum, vaddl_u16(vget_low_u16(v_low), vget_low_u16(v_high)));
-        v_sum = vaddq_u32(v_sum, vaddl_u16(vget_high_u16(v_low), vget_high_u16(v_high)));
-    }
-
-    uint CV_DECL_ALIGNED(16) buf[4];
-    vst1q_u32(buf, v_sum);
-    d = buf[0] + buf[1] + buf[2] + buf[3];
-#endif
-    {
-        for( ; j <= n - 4; j += 4 )
-        {
-            d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
-                    std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
-        }
-    }
-    for( ; j < n; j++ )
-        d += std::abs(a[j] - b[j]);
-    return d;
-}
-
 template<typename T, typename ST> int
 normInf_(const T* src, const uchar* mask, ST* _result, int len, int cn)
 {
@@ -2564,7 +2430,7 @@ normInf_(const T* src, const uchar* mask, ST* _result, int len, int cn)
            if( mask[i] )
            {
                for( int k = 0; k < cn; k++ )
-                    result = std::max(result, ST(std::abs(src[k])));
+                    result = std::max(result, ST(cv_abs(src[k])));
            }
    }
    *_result = result;
@@ -2585,7 +2451,7 @@ normL1_(const T* src, const uchar* mask, ST* _result, int len, int cn)
            if( mask[i] )
            {
                for( int k = 0; k < cn; k++ )
-                    result += std::abs(src[k]);
+                    result += cv_abs(src[k]);
            }
    }
    *_result = result;
@@ -2684,9 +2550,7 @@ normDiffL2_(const T* src1, const T* src2, const uchar* mask, ST* _result, int le

 Hamming::ResultType Hamming::operator()( const unsigned char* a, const unsigned char* b, int size ) const
 {
-    int result = 0;
-    cv::hal::normHamming(a, b, size, result);
-    return result;
+    return cv::hal::normHamming(a, b, size);
 }

 #define CV_DEF_NORM_FUNC(L, suffix, type, ntype) \
@@ -3037,16 +2901,12 @@ double cv::norm( InputArray _src, int normType, InputArray _mask )

                if( normType == NORM_HAMMING )
                {
-                    int result = 0;
-                    cv::hal::normHamming(data, (int)len, result);
-                    return result;
+                    return hal::normHamming(data, (int)len);
                }

                if( normType == NORM_HAMMING2 )
                {
-                    int result = 0;
-                    hal::normHamming(data, (int)len, 2, result);
-                    return result;
+                    return hal::normHamming(data, (int)len, 2);
                }
            }
        }
@@ -3072,9 +2932,7 @@ double cv::norm( InputArray _src, int normType, InputArray _mask )

        for( size_t i = 0; i < it.nplanes; i++, ++it )
        {
-            int one = 0;
-            cv::hal::normHamming(ptrs[0], total, cellSize, one);
-            result += one;
+            result += hal::normHamming(ptrs[0], total, cellSize);
        }

        return result;
@@ -3558,9 +3416,7 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m

        for( size_t i = 0; i < it.nplanes; i++, ++it )
        {
-            int one = 0;
-            hal::normHamming(ptrs[0], ptrs[1], total, cellSize, one);
-            result += one;
+            result += hal::normHamming(ptrs[0], ptrs[1], total, cellSize);
        }

        return result;
@@ -3698,7 +3554,7 @@ static void batchDistHamming(const uchar* src1, const uchar* src2, size_t step2,
    if( !mask )
    {
        for( int i = 0; i < nvecs; i++ )
-             hal::normHamming(src1, src2 + step2*i, len, dist[i]);
+             dist[i] = hal::normHamming(src1, src2 + step2*i, len);
    }
    else
    {
@@ -3706,7 +3562,7 @@ static void batchDistHamming(const uchar* src1, const uchar* src2, size_t step2,
        for( int i = 0; i < nvecs; i++ )
        {
            if (mask[i])
-                hal::normHamming(src1, src2 + step2*i, len, dist[i]);
+                dist[i] = hal::normHamming(src1, src2 + step2*i, len);
            else
                dist[i] = val0;
        }
@@ -3720,7 +3576,7 @@ static void batchDistHamming2(const uchar* src1, const uchar* src2, size_t step2
    if( !mask )
    {
        for( int i = 0; i < nvecs; i++ )
-            hal::normHamming(src1, src2 + step2*i, len, 2, dist[i]);
+            dist[i] = hal::normHamming(src1, src2 + step2*i, len, 2);
    }
    else
    {
@@ -3728,7 +3584,7 @@ static void batchDistHamming2(const uchar* src1, const uchar* src2, size_t step2
        for( int i = 0; i < nvecs; i++ )
        {
            if (mask[i])
-                hal::normHamming(src1, src2 + step2*i, len, 2, dist[i]);
+                dist[i] = hal::normHamming(src1, src2 + step2*i, len, 2);
            else
                dist[i] = val0;
        }