Merge pull request #5810 from mshabunin:hal_interface
This commit is contained in:
File diff suppressed because it is too large
Load Diff
607
modules/core/src/arithm_core.hpp
Normal file
607
modules/core/src/arithm_core.hpp
Normal file
@@ -0,0 +1,607 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
|
||||
// Copyright (C) 2015, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef __OPENCV_ARITHM_CORE_HPP__
|
||||
#define __OPENCV_ARITHM_CORE_HPP__
|
||||
|
||||
#include "arithm_simd.hpp"
|
||||
|
||||
namespace cv {
|
||||
|
||||
template<typename T1, typename T2=T1, typename T3=T1> struct OpAdd
|
||||
{
|
||||
typedef T1 type1;
|
||||
typedef T2 type2;
|
||||
typedef T3 rtype;
|
||||
T3 operator ()(const T1 a, const T2 b) const { return saturate_cast<T3>(a + b); }
|
||||
};
|
||||
|
||||
template<typename T1, typename T2=T1, typename T3=T1> struct OpSub
|
||||
{
|
||||
typedef T1 type1;
|
||||
typedef T2 type2;
|
||||
typedef T3 rtype;
|
||||
T3 operator ()(const T1 a, const T2 b) const { return saturate_cast<T3>(a - b); }
|
||||
};
|
||||
|
||||
template<typename T1, typename T2=T1, typename T3=T1> struct OpRSub
|
||||
{
|
||||
typedef T1 type1;
|
||||
typedef T2 type2;
|
||||
typedef T3 rtype;
|
||||
T3 operator ()(const T1 a, const T2 b) const { return saturate_cast<T3>(b - a); }
|
||||
};
|
||||
|
||||
// Functor: element-wise minimum of two values.
template<typename T> struct OpMin
{
    typedef T type1;
    typedef T type2;
    typedef T rtype;
    T operator ()(const T a, const T b) const
    {
        return std::min(a, b);
    }
};
|
||||
|
||||
// Functor: element-wise maximum of two values.
template<typename T> struct OpMax
{
    typedef T type1;
    typedef T type2;
    typedef T rtype;
    T operator ()(const T a, const T b) const
    {
        return std::max(a, b);
    }
};
|
||||
|
||||
// Functor: absolute difference |a - b| computed without calling abs(),
// so it also works for unsigned types (the larger operand is always
// the minuend, hence the subtraction never wraps).
template<typename T> struct OpAbsDiff
{
    typedef T type1;
    typedef T type2;
    typedef T rtype;
    T operator()(T a, T b) const
    {
        return a > b ? a - b : b - a;
    }
};
|
||||
|
||||
// Functor: bitwise AND of two values.
template<typename T> struct OpAnd
{
    typedef T type1;
    typedef T type2;
    typedef T rtype;
    T operator()( T a, T b ) const
    {
        return a & b;
    }
};
|
||||
|
||||
// Functor: bitwise OR of two values.
template<typename T> struct OpOr
{
    typedef T type1;
    typedef T type2;
    typedef T rtype;
    T operator()( T a, T b ) const
    {
        return a | b;
    }
};
|
||||
|
||||
// Functor: bitwise XOR of two values.
template<typename T> struct OpXor
{
    typedef T type1;
    typedef T type2;
    typedef T rtype;
    T operator()( T a, T b ) const
    {
        return a ^ b;
    }
};
|
||||
|
||||
// Functor: bitwise NOT of the first operand; the second operand is a dummy
// kept only so the functor matches the binary-operation interface.
template<typename T> struct OpNot
{
    typedef T type1;
    typedef T type2;
    typedef T rtype;
    T operator()( T a, T ) const
    {
        return ~a;
    }
};
|
||||
|
||||
//=============================================================================
|
||||
|
||||
// Apply a binary element-wise operation to two 2D arrays.
//
// @param src1,step1  first source array and its row stride in BYTES
// @param src2,step2  second source array and its row stride in BYTES
// @param dst,step    destination array and its row stride in BYTES
// @param width       row length in elements
// @param height      number of rows
//
// Op  is the scalar functor; VOp is the SIMD counterpart used on the
// widest available register path (AVX2 > SSE2/NEON), followed by
// progressively narrower cleanup passes and a scalar tail.
template<typename T, class Op, class VOp>
void vBinOp(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, size_t step, int width, int height)
{
#if CV_SSE2 || CV_NEON
    VOp vop;   // vector flavour of the operation
#endif
    Op op;     // scalar flavour of the operation

    // Strides are in bytes, hence the byte-pointer arithmetic per row.
    for( ; height--;
         src1 = (const T *)((const unsigned char *)src1 + step1),
         src2 = (const T *)((const unsigned char *)src2 + step2),
         dst  = (T *)((unsigned char *)dst + step) )
    {
        int i = 0;

#if CV_NEON || CV_SSE2
#if CV_AVX2
        if( USE_AVX2 )
        {
            // 32 bytes (one 256-bit register) per iteration
            for( ; i <= width - 32/(int)sizeof(T); i += 32/sizeof(T) )
            {
                typename VLoadStore256<T>::reg_type v0 = VLoadStore256<T>::load(src1 + i);
                v0 = vop(v0, VLoadStore256<T>::load(src2 + i));
                VLoadStore256<T>::store(dst + i, v0);
            }
        }
#else
#if CV_SSE2
        if( USE_SSE2 )
        {
#endif // CV_SSE2
            // 32 bytes per iteration as two 128-bit registers
            for( ; i <= width - 32/(int)sizeof(T); i += 32/sizeof(T) )
            {
                typename VLoadStore128<T>::reg_type v0 = VLoadStore128<T>::load(src1 + i                );
                typename VLoadStore128<T>::reg_type v1 = VLoadStore128<T>::load(src1 + i + 16/sizeof(T));
                v0 = vop(v0, VLoadStore128<T>::load(src2 + i                ));
                v1 = vop(v1, VLoadStore128<T>::load(src2 + i + 16/sizeof(T)));
                VLoadStore128<T>::store(dst + i                , v0);
                VLoadStore128<T>::store(dst + i + 16/sizeof(T), v1);
            }
#if CV_SSE2
        }
#endif // CV_SSE2
#endif // CV_AVX2
#endif // CV_NEON || CV_SSE2

#if CV_AVX2
        // nothing: the AVX2 pass above already handled the widest stride
#elif CV_SSE2
        if( USE_SSE2 )
        {
            // 8-byte (half 128-bit register) cleanup pass
            for( ; i <= width - 8/(int)sizeof(T); i += 8/sizeof(T) )
            {
                typename VLoadStore64<T>::reg_type v = VLoadStore64<T>::load(src1 + i);
                v = vop(v, VLoadStore64<T>::load(src2 + i));
                VLoadStore64<T>::store(dst + i, v);
            }
        }
#endif

#if CV_ENABLE_UNROLLED
        // 4x unrolled scalar pass
        for( ; i <= width - 4; i += 4 )
        {
            T t0 = op(src1[i], src2[i]);
            T t1 = op(src1[i+1], src2[i+1]);
            dst[i] = t0; dst[i+1] = t1;
            t0 = op(src1[i+2], src2[i+2]);
            t1 = op(src1[i+3], src2[i+3]);
            dst[i+2] = t0; dst[i+3] = t1;
        }
#endif

        // scalar tail
        for( ; i < width; i++ )
            dst[i] = op(src1[i], src2[i]);
    }
}
|
||||
|
||||
// Variant of vBinOp tuned for 32-bit element types (int / float).
//
// @param src1,step1  first source array and its row stride in BYTES
// @param src2,step2  second source array and its row stride in BYTES
// @param dst,step    destination array and its row stride in BYTES
// @param width       row length in elements
// @param height      number of rows
//
// When all three pointers are suitably aligned, an aligned-load SIMD
// pass runs first; an unaligned SIMD pass and scalar tail handle the rest.
template<typename T, class Op, class Op32>
void vBinOp32(const T* src1, size_t step1, const T* src2, size_t step2,
              T* dst, size_t step, int width, int height)
{
#if CV_SSE2 || CV_NEON
    Op32 op32;  // vector flavour of the operation
#endif
    Op op;      // scalar flavour of the operation

    // Strides are in bytes, hence the byte-pointer arithmetic per row.
    for( ; height--;
         src1 = (const T *)((const unsigned char *)src1 + step1),
         src2 = (const T *)((const unsigned char *)src2 + step2),
         dst  = (T *)((unsigned char *)dst + step) )
    {
        int i = 0;

#if CV_AVX2
        if( USE_AVX2 )
        {
            // 32-byte-aligned fast path (8 elements per iteration)
            if( (((size_t)src1|(size_t)src2|(size_t)dst)&31) == 0 )
            {
                for( ; i <= width - 8; i += 8 )
                {
                    typename VLoadStore256Aligned<T>::reg_type v0 = VLoadStore256Aligned<T>::load(src1 + i);
                    v0 = op32(v0, VLoadStore256Aligned<T>::load(src2 + i));
                    VLoadStore256Aligned<T>::store(dst + i, v0);
                }
            }
        }
#elif CV_SSE2
        if( USE_SSE2 )
        {
            // 16-byte-aligned fast path (two 128-bit registers per iteration)
            if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 )
            {
                for( ; i <= width - 8; i += 8 )
                {
                    typename VLoadStore128Aligned<T>::reg_type v0 = VLoadStore128Aligned<T>::load(src1 + i    );
                    typename VLoadStore128Aligned<T>::reg_type v1 = VLoadStore128Aligned<T>::load(src1 + i + 4);
                    v0 = op32(v0, VLoadStore128Aligned<T>::load(src2 + i    ));
                    v1 = op32(v1, VLoadStore128Aligned<T>::load(src2 + i + 4));
                    VLoadStore128Aligned<T>::store(dst + i    , v0);
                    VLoadStore128Aligned<T>::store(dst + i + 4, v1);
                }
            }
        }
#endif // CV_AVX2

#if CV_NEON || CV_SSE2
#if CV_AVX2
        if( USE_AVX2 )
        {
            // unaligned 256-bit pass
            for( ; i <= width - 8; i += 8 )
            {
                typename VLoadStore256<T>::reg_type v0 = VLoadStore256<T>::load(src1 + i);
                v0 = op32(v0, VLoadStore256<T>::load(src2 + i));
                VLoadStore256<T>::store(dst + i, v0);
            }
        }
#else
#if CV_SSE2
        if( USE_SSE2 )
        {
#endif // CV_SSE2
            // unaligned 128-bit pass (two registers per iteration)
            for( ; i <= width - 8; i += 8 )
            {
                typename VLoadStore128<T>::reg_type v0 = VLoadStore128<T>::load(src1 + i    );
                typename VLoadStore128<T>::reg_type v1 = VLoadStore128<T>::load(src1 + i + 4);
                v0 = op32(v0, VLoadStore128<T>::load(src2 + i    ));
                v1 = op32(v1, VLoadStore128<T>::load(src2 + i + 4));
                VLoadStore128<T>::store(dst + i    , v0);
                VLoadStore128<T>::store(dst + i + 4, v1);
            }
#if CV_SSE2
        }
#endif // CV_SSE2
#endif // CV_AVX2
#endif // CV_NEON || CV_SSE2

#if CV_ENABLE_UNROLLED
        // 4x unrolled scalar pass
        for( ; i <= width - 4; i += 4 )
        {
            T t0 = op(src1[i], src2[i]);
            T t1 = op(src1[i+1], src2[i+1]);
            dst[i] = t0; dst[i+1] = t1;
            t0 = op(src1[i+2], src2[i+2]);
            t1 = op(src1[i+3], src2[i+3]);
            dst[i+2] = t0; dst[i+3] = t1;
        }
#endif

        // scalar tail
        for( ; i < width; i++ )
            dst[i] = op(src1[i], src2[i]);
    }
}
|
||||
|
||||
|
||||
// Variant of vBinOp tuned for 64-bit element types (double).
//
// @param src1,step1  first source array and its row stride in BYTES
// @param src2,step2  second source array and its row stride in BYTES
// @param dst,step    destination array and its row stride in BYTES
// @param width       row length in elements
// @param height      number of rows
//
// Only an aligned SIMD fast path is attempted; everything else goes
// through a 4x unrolled scalar loop plus a scalar tail.
template<typename T, class Op, class Op64>
void vBinOp64(const T* src1, size_t step1, const T* src2, size_t step2,
              T* dst, size_t step, int width, int height)
{
#if CV_SSE2
    Op64 op64;  // vector flavour of the operation
#endif
    Op op;      // scalar flavour of the operation

    // Strides are in bytes, hence the byte-pointer arithmetic per row.
    for( ; height--;
         src1 = (const T *)((const unsigned char *)src1 + step1),
         src2 = (const T *)((const unsigned char *)src2 + step2),
         dst  = (T *)((unsigned char *)dst + step) )
    {
        int i = 0;

#if CV_AVX2
        if( USE_AVX2 )
        {
            // 32-byte-aligned fast path (4 elements per iteration)
            if( (((size_t)src1|(size_t)src2|(size_t)dst)&31) == 0 )
            {
                for( ; i <= width - 4; i += 4 )
                {
                    typename VLoadStore256Aligned<T>::reg_type v0 = VLoadStore256Aligned<T>::load(src1 + i);
                    v0 = op64(v0, VLoadStore256Aligned<T>::load(src2 + i));
                    VLoadStore256Aligned<T>::store(dst + i, v0);
                }
            }
        }
#elif CV_SSE2
        if( USE_SSE2 )
        {
            // 16-byte-aligned fast path (two 128-bit registers per iteration)
            if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 )
            {
                for( ; i <= width - 4; i += 4 )
                {
                    typename VLoadStore128Aligned<T>::reg_type v0 = VLoadStore128Aligned<T>::load(src1 + i    );
                    typename VLoadStore128Aligned<T>::reg_type v1 = VLoadStore128Aligned<T>::load(src1 + i + 2);
                    v0 = op64(v0, VLoadStore128Aligned<T>::load(src2 + i    ));
                    v1 = op64(v1, VLoadStore128Aligned<T>::load(src2 + i + 2));
                    VLoadStore128Aligned<T>::store(dst + i    , v0);
                    VLoadStore128Aligned<T>::store(dst + i + 2, v1);
                }
            }
        }
#endif

        // 4x unrolled scalar pass (always compiled in for 64-bit types)
        for( ; i <= width - 4; i += 4 )
        {
            T t0 = op(src1[i], src2[i]);
            T t1 = op(src1[i+1], src2[i+1]);
            dst[i] = t0; dst[i+1] = t1;
            t0 = op(src1[i+2], src2[i+2]);
            t1 = op(src1[i+3], src2[i+3]);
            dst[i+2] = t0; dst[i+3] = t1;
        }

        // scalar tail
        for( ; i < width; i++ )
            dst[i] = op(src1[i], src2[i]);
    }
}
|
||||
|
||||
template<typename T> static void
|
||||
cmp_(const T* src1, size_t step1, const T* src2, size_t step2,
|
||||
uchar* dst, size_t step, int width, int height, int code)
|
||||
{
|
||||
step1 /= sizeof(src1[0]);
|
||||
step2 /= sizeof(src2[0]);
|
||||
if( code == CMP_GE || code == CMP_LT )
|
||||
{
|
||||
std::swap(src1, src2);
|
||||
std::swap(step1, step2);
|
||||
code = code == CMP_GE ? CMP_LE : CMP_GT;
|
||||
}
|
||||
|
||||
Cmp_SIMD<T> vop(code);
|
||||
|
||||
if( code == CMP_GT || code == CMP_LE )
|
||||
{
|
||||
int m = code == CMP_GT ? 0 : 255;
|
||||
for( ; height--; src1 += step1, src2 += step2, dst += step )
|
||||
{
|
||||
int x = vop(src1, src2, dst, width);
|
||||
#if CV_ENABLE_UNROLLED
|
||||
for( ; x <= width - 4; x += 4 )
|
||||
{
|
||||
int t0, t1;
|
||||
t0 = -(src1[x] > src2[x]) ^ m;
|
||||
t1 = -(src1[x+1] > src2[x+1]) ^ m;
|
||||
dst[x] = (uchar)t0; dst[x+1] = (uchar)t1;
|
||||
t0 = -(src1[x+2] > src2[x+2]) ^ m;
|
||||
t1 = -(src1[x+3] > src2[x+3]) ^ m;
|
||||
dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1;
|
||||
}
|
||||
#endif
|
||||
for( ; x < width; x++ )
|
||||
dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m);
|
||||
}
|
||||
}
|
||||
else if( code == CMP_EQ || code == CMP_NE )
|
||||
{
|
||||
int m = code == CMP_EQ ? 0 : 255;
|
||||
for( ; height--; src1 += step1, src2 += step2, dst += step )
|
||||
{
|
||||
int x = 0;
|
||||
#if CV_ENABLE_UNROLLED
|
||||
for( ; x <= width - 4; x += 4 )
|
||||
{
|
||||
int t0, t1;
|
||||
t0 = -(src1[x] == src2[x]) ^ m;
|
||||
t1 = -(src1[x+1] == src2[x+1]) ^ m;
|
||||
dst[x] = (uchar)t0; dst[x+1] = (uchar)t1;
|
||||
t0 = -(src1[x+2] == src2[x+2]) ^ m;
|
||||
t1 = -(src1[x+3] == src2[x+3]) ^ m;
|
||||
dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1;
|
||||
}
|
||||
#endif
|
||||
for( ; x < width; x++ )
|
||||
dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T, typename WT> static void
|
||||
mul_( const T* src1, size_t step1, const T* src2, size_t step2,
|
||||
T* dst, size_t step, int width, int height, WT scale )
|
||||
{
|
||||
step1 /= sizeof(src1[0]);
|
||||
step2 /= sizeof(src2[0]);
|
||||
step /= sizeof(dst[0]);
|
||||
|
||||
Mul_SIMD<T, WT> vop;
|
||||
|
||||
if( scale == (WT)1. )
|
||||
{
|
||||
for( ; height--; src1 += step1, src2 += step2, dst += step )
|
||||
{
|
||||
int i = vop(src1, src2, dst, width, scale);
|
||||
#if CV_ENABLE_UNROLLED
|
||||
for(; i <= width - 4; i += 4 )
|
||||
{
|
||||
T t0;
|
||||
T t1;
|
||||
t0 = saturate_cast<T>(src1[i ] * src2[i ]);
|
||||
t1 = saturate_cast<T>(src1[i+1] * src2[i+1]);
|
||||
dst[i ] = t0;
|
||||
dst[i+1] = t1;
|
||||
|
||||
t0 = saturate_cast<T>(src1[i+2] * src2[i+2]);
|
||||
t1 = saturate_cast<T>(src1[i+3] * src2[i+3]);
|
||||
dst[i+2] = t0;
|
||||
dst[i+3] = t1;
|
||||
}
|
||||
#endif
|
||||
for( ; i < width; i++ )
|
||||
dst[i] = saturate_cast<T>(src1[i] * src2[i]);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for( ; height--; src1 += step1, src2 += step2, dst += step )
|
||||
{
|
||||
int i = vop(src1, src2, dst, width, scale);
|
||||
#if CV_ENABLE_UNROLLED
|
||||
for(; i <= width - 4; i += 4 )
|
||||
{
|
||||
T t0 = saturate_cast<T>(scale*(WT)src1[i]*src2[i]);
|
||||
T t1 = saturate_cast<T>(scale*(WT)src1[i+1]*src2[i+1]);
|
||||
dst[i] = t0; dst[i+1] = t1;
|
||||
|
||||
t0 = saturate_cast<T>(scale*(WT)src1[i+2]*src2[i+2]);
|
||||
t1 = saturate_cast<T>(scale*(WT)src1[i+3]*src2[i+3]);
|
||||
dst[i+2] = t0; dst[i+3] = t1;
|
||||
}
|
||||
#endif
|
||||
for( ; i < width; i++ )
|
||||
dst[i] = saturate_cast<T>(scale*(WT)src1[i]*src2[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<typename T> static void
|
||||
div_i( const T* src1, size_t step1, const T* src2, size_t step2,
|
||||
T* dst, size_t step, int width, int height, double scale )
|
||||
{
|
||||
step1 /= sizeof(src1[0]);
|
||||
step2 /= sizeof(src2[0]);
|
||||
step /= sizeof(dst[0]);
|
||||
|
||||
Div_SIMD<T> vop;
|
||||
float scale_f = (float)scale;
|
||||
|
||||
for( ; height--; src1 += step1, src2 += step2, dst += step )
|
||||
{
|
||||
int i = vop(src1, src2, dst, width, scale);
|
||||
for( ; i < width; i++ )
|
||||
{
|
||||
T num = src1[i], denom = src2[i];
|
||||
dst[i] = denom != 0 ? saturate_cast<T>(num*scale_f/denom) : (T)0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T> static void
|
||||
div_f( const T* src1, size_t step1, const T* src2, size_t step2,
|
||||
T* dst, size_t step, int width, int height, double scale )
|
||||
{
|
||||
T scale_f = (T)scale;
|
||||
step1 /= sizeof(src1[0]);
|
||||
step2 /= sizeof(src2[0]);
|
||||
step /= sizeof(dst[0]);
|
||||
|
||||
Div_SIMD<T> vop;
|
||||
|
||||
for( ; height--; src1 += step1, src2 += step2, dst += step )
|
||||
{
|
||||
int i = vop(src1, src2, dst, width, scale);
|
||||
for( ; i < width; i++ )
|
||||
{
|
||||
T num = src1[i], denom = src2[i];
|
||||
dst[i] = denom != 0 ? saturate_cast<T>(num*scale_f/denom) : (T)0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T> static void
|
||||
recip_i( const T*, size_t, const T* src2, size_t step2,
|
||||
T* dst, size_t step, int width, int height, double scale )
|
||||
{
|
||||
step2 /= sizeof(src2[0]);
|
||||
step /= sizeof(dst[0]);
|
||||
|
||||
Recip_SIMD<T> vop;
|
||||
float scale_f = (float)scale;
|
||||
|
||||
for( ; height--; src2 += step2, dst += step )
|
||||
{
|
||||
int i = vop(src2, dst, width, scale);
|
||||
for( ; i < width; i++ )
|
||||
{
|
||||
T denom = src2[i];
|
||||
dst[i] = denom != 0 ? saturate_cast<T>(scale_f/denom) : (T)0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T> static void
|
||||
recip_f( const T*, size_t, const T* src2, size_t step2,
|
||||
T* dst, size_t step, int width, int height, double scale )
|
||||
{
|
||||
T scale_f = (T)scale;
|
||||
step2 /= sizeof(src2[0]);
|
||||
step /= sizeof(dst[0]);
|
||||
|
||||
Recip_SIMD<T> vop;
|
||||
|
||||
for( ; height--; src2 += step2, dst += step )
|
||||
{
|
||||
int i = vop(src2, dst, width, scale);
|
||||
for( ; i < width; i++ )
|
||||
{
|
||||
T denom = src2[i];
|
||||
dst[i] = denom != 0 ? saturate_cast<T>(scale_f/denom) : (T)0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T, typename WT> static void
|
||||
addWeighted_( const T* src1, size_t step1, const T* src2, size_t step2,
|
||||
T* dst, size_t step, int width, int height, void* _scalars )
|
||||
{
|
||||
const double* scalars = (const double*)_scalars;
|
||||
WT alpha = (WT)scalars[0], beta = (WT)scalars[1], gamma = (WT)scalars[2];
|
||||
step1 /= sizeof(src1[0]);
|
||||
step2 /= sizeof(src2[0]);
|
||||
step /= sizeof(dst[0]);
|
||||
|
||||
AddWeighted_SIMD<T, WT> vop;
|
||||
|
||||
for( ; height--; src1 += step1, src2 += step2, dst += step )
|
||||
{
|
||||
int x = vop(src1, src2, dst, width, alpha, beta, gamma);
|
||||
#if CV_ENABLE_UNROLLED
|
||||
for( ; x <= width - 4; x += 4 )
|
||||
{
|
||||
T t0 = saturate_cast<T>(src1[x]*alpha + src2[x]*beta + gamma);
|
||||
T t1 = saturate_cast<T>(src1[x+1]*alpha + src2[x+1]*beta + gamma);
|
||||
dst[x] = t0; dst[x+1] = t1;
|
||||
|
||||
t0 = saturate_cast<T>(src1[x+2]*alpha + src2[x+2]*beta + gamma);
|
||||
t1 = saturate_cast<T>(src1[x+3]*alpha + src2[x+3]*beta + gamma);
|
||||
dst[x+2] = t0; dst[x+3] = t1;
|
||||
}
|
||||
#endif
|
||||
for( ; x < width; x++ )
|
||||
dst[x] = saturate_cast<T>(src1[x]*alpha + src2[x]*beta + gamma);
|
||||
}
|
||||
}
|
||||
|
||||
} // cv::
|
||||
|
||||
|
||||
#endif // __OPENCV_ARITHM_CORE_HPP__
|
2025
modules/core/src/arithm_simd.hpp
Normal file
2025
modules/core/src/arithm_simd.hpp
Normal file
File diff suppressed because it is too large
Load Diff
228
modules/core/src/hal_replacement.hpp
Normal file
228
modules/core/src/hal_replacement.hpp
Normal file
@@ -0,0 +1,228 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
|
||||
// Copyright (C) 2015, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef __OPENCV_CORE_HAL_REPLACEMENT_HPP__
|
||||
#define __OPENCV_CORE_HAL_REPLACEMENT_HPP__
|
||||
|
||||
#include "opencv2/core/hal/interface.h"
|
||||
|
||||
inline int hal_ni_add8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_add8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_add16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_add16s(const short*, size_t, const short*, size_t, short*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_add32s(const int*, size_t, const int*, size_t, int*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_add32f(const float*, size_t, const float*, size_t, float*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_add64f(const double*, size_t, const double*, size_t, double*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_sub8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_sub8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_sub16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_sub16s(const short*, size_t, const short*, size_t, short*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_sub32s(const int*, size_t, const int*, size_t, int*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_sub32f(const float*, size_t, const float*, size_t, float*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_sub64f(const double*, size_t, const double*, size_t, double*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_max8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_max8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_max16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_max16s(const short*, size_t, const short*, size_t, short*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_max32s(const int*, size_t, const int*, size_t, int*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_max32f(const float*, size_t, const float*, size_t, float*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_max64f(const double*, size_t, const double*, size_t, double*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_min8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_min8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_min16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_min16s(const short*, size_t, const short*, size_t, short*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_min32s(const int*, size_t, const int*, size_t, int*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_min32f(const float*, size_t, const float*, size_t, float*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_min64f(const double*, size_t, const double*, size_t, double*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_absdiff8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_absdiff8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_absdiff16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_absdiff16s(const short*, size_t, const short*, size_t, short*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_absdiff32s(const int*, size_t, const int*, size_t, int*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_absdiff32f(const float*, size_t, const float*, size_t, float*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_absdiff64f(const double*, size_t, const double*, size_t, double*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_and8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_or8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_xor8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_not8u(const uchar*, size_t, uchar*, size_t, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
|
||||
#define cv_hal_add8u hal_ni_add8u
|
||||
#define cv_hal_add8s hal_ni_add8s
|
||||
#define cv_hal_add16u hal_ni_add16u
|
||||
#define cv_hal_add16s hal_ni_add16s
|
||||
#define cv_hal_add32s hal_ni_add32s
|
||||
#define cv_hal_add32f hal_ni_add32f
|
||||
#define cv_hal_add64f hal_ni_add64f
|
||||
#define cv_hal_sub8u hal_ni_sub8u
|
||||
#define cv_hal_sub8s hal_ni_sub8s
|
||||
#define cv_hal_sub16u hal_ni_sub16u
|
||||
#define cv_hal_sub16s hal_ni_sub16s
|
||||
#define cv_hal_sub32s hal_ni_sub32s
|
||||
#define cv_hal_sub32f hal_ni_sub32f
|
||||
#define cv_hal_sub64f hal_ni_sub64f
|
||||
#define cv_hal_max8u hal_ni_max8u
|
||||
#define cv_hal_max8s hal_ni_max8s
|
||||
#define cv_hal_max16u hal_ni_max16u
|
||||
#define cv_hal_max16s hal_ni_max16s
|
||||
#define cv_hal_max32s hal_ni_max32s
|
||||
#define cv_hal_max32f hal_ni_max32f
|
||||
#define cv_hal_max64f hal_ni_max64f
|
||||
#define cv_hal_min8u hal_ni_min8u
|
||||
#define cv_hal_min8s hal_ni_min8s
|
||||
#define cv_hal_min16u hal_ni_min16u
|
||||
#define cv_hal_min16s hal_ni_min16s
|
||||
#define cv_hal_min32s hal_ni_min32s
|
||||
#define cv_hal_min32f hal_ni_min32f
|
||||
#define cv_hal_min64f hal_ni_min64f
|
||||
#define cv_hal_absdiff8u hal_ni_absdiff8u
|
||||
#define cv_hal_absdiff8s hal_ni_absdiff8s
|
||||
#define cv_hal_absdiff16u hal_ni_absdiff16u
|
||||
#define cv_hal_absdiff16s hal_ni_absdiff16s
|
||||
#define cv_hal_absdiff32s hal_ni_absdiff32s
|
||||
#define cv_hal_absdiff32f hal_ni_absdiff32f
|
||||
#define cv_hal_absdiff64f hal_ni_absdiff64f
|
||||
#define cv_hal_and8u hal_ni_and8u
|
||||
#define cv_hal_or8u hal_ni_or8u
|
||||
#define cv_hal_xor8u hal_ni_xor8u
|
||||
#define cv_hal_not8u hal_ni_not8u
|
||||
|
||||
inline int hal_ni_cmp8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_cmp8s(const schar*, size_t, const schar*, size_t, uchar*, size_t, int, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_cmp16u(const ushort*, size_t, const ushort*, size_t, uchar*, size_t, int, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_cmp16s(const short*, size_t, const short*, size_t, uchar*, size_t, int, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_cmp32s(const int*, size_t, const int*, size_t, uchar*, size_t, int, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_cmp32f(const float*, size_t, const float*, size_t, uchar*, size_t, int, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_cmp64f(const double*, size_t, const double*, size_t, uchar*, size_t, int, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
|
||||
#define cv_hal_cmp8u hal_ni_cmp8u
|
||||
#define cv_hal_cmp8s hal_ni_cmp8s
|
||||
#define cv_hal_cmp16u hal_ni_cmp16u
|
||||
#define cv_hal_cmp16s hal_ni_cmp16s
|
||||
#define cv_hal_cmp32s hal_ni_cmp32s
|
||||
#define cv_hal_cmp32f hal_ni_cmp32f
|
||||
#define cv_hal_cmp64f hal_ni_cmp64f
|
||||
|
||||
inline int hal_ni_mul8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_mul8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_mul16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_mul16s(const short*, size_t, const short*, size_t, short*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_mul32s(const int*, size_t, const int*, size_t, int*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_mul32f(const float*, size_t, const float*, size_t, float*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_mul64f(const double*, size_t, const double*, size_t, double*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_div8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_div8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_div16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_div16s(const short*, size_t, const short*, size_t, short*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_div32s(const int*, size_t, const int*, size_t, int*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_div32f(const float*, size_t, const float*, size_t, float*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_div64f(const double*, size_t, const double*, size_t, double*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_recip8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_recip8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_recip16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_recip16s(const short*, size_t, const short*, size_t, short*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_recip32s(const int*, size_t, const int*, size_t, int*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_recip32f(const float*, size_t, const float*, size_t, float*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_recip64f(const double*, size_t, const double*, size_t, double*, size_t, int, int, double) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
|
||||
#define cv_hal_mul8u hal_ni_mul8u
|
||||
#define cv_hal_mul8s hal_ni_mul8s
|
||||
#define cv_hal_mul16u hal_ni_mul16u
|
||||
#define cv_hal_mul16s hal_ni_mul16s
|
||||
#define cv_hal_mul32s hal_ni_mul32s
|
||||
#define cv_hal_mul32f hal_ni_mul32f
|
||||
#define cv_hal_mul64f hal_ni_mul64f
|
||||
#define cv_hal_div8u hal_ni_div8u
|
||||
#define cv_hal_div8s hal_ni_div8s
|
||||
#define cv_hal_div16u hal_ni_div16u
|
||||
#define cv_hal_div16s hal_ni_div16s
|
||||
#define cv_hal_div32s hal_ni_div32s
|
||||
#define cv_hal_div32f hal_ni_div32f
|
||||
#define cv_hal_div64f hal_ni_div64f
|
||||
#define cv_hal_recip8u hal_ni_recip8u
|
||||
#define cv_hal_recip8s hal_ni_recip8s
|
||||
#define cv_hal_recip16u hal_ni_recip16u
|
||||
#define cv_hal_recip16s hal_ni_recip16s
|
||||
#define cv_hal_recip32s hal_ni_recip32s
|
||||
#define cv_hal_recip32f hal_ni_recip32f
|
||||
#define cv_hal_recip64f hal_ni_recip64f
|
||||
|
||||
inline int hal_ni_addWeighted8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int, const double*) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_addWeighted8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int, const double*) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_addWeighted16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int, const double*) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_addWeighted16s(const short*, size_t, const short*, size_t, short*, size_t, int, int, const double*) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_addWeighted32s(const int*, size_t, const int*, size_t, int*, size_t, int, int, const double*) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_addWeighted32f(const float*, size_t, const float*, size_t, float*, size_t, int, int, const double*) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_addWeighted64f(const double*, size_t, const double*, size_t, double*, size_t, int, int, const double*) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
|
||||
#define cv_hal_addWeighted8u hal_ni_addWeighted8u
|
||||
#define cv_hal_addWeighted8s hal_ni_addWeighted8s
|
||||
#define cv_hal_addWeighted16u hal_ni_addWeighted16u
|
||||
#define cv_hal_addWeighted16s hal_ni_addWeighted16s
|
||||
#define cv_hal_addWeighted32s hal_ni_addWeighted32s
|
||||
#define cv_hal_addWeighted32f hal_ni_addWeighted32f
|
||||
#define cv_hal_addWeighted64f hal_ni_addWeighted64f
|
||||
|
||||
inline int hal_ni_split8u(const uchar*, uchar**, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_split16u(const ushort*, ushort**, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_split32s(const int*, int**, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_split64s(const int64*, int64**, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
|
||||
#define cv_hal_split8u hal_ni_split8u
|
||||
#define cv_hal_split16u hal_ni_split16u
|
||||
#define cv_hal_split32s hal_ni_split32s
|
||||
#define cv_hal_split64s hal_ni_split64s
|
||||
|
||||
inline int hal_ni_merge8u(const uchar**, uchar*, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_merge16u(const ushort**, ushort*, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_merge32s(const int**, int*, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
inline int hal_ni_merge64s(const int64**, int64*, int, int) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
|
||||
|
||||
#define cv_hal_merge8u hal_ni_merge8u
|
||||
#define cv_hal_merge16u hal_ni_merge16u
|
||||
#define cv_hal_merge32s hal_ni_merge32s
|
||||
#define cv_hal_merge64s hal_ni_merge64s
|
||||
|
||||
#include "custom_hal.hpp"
|
||||
|
||||
#endif
|
@@ -52,22 +52,22 @@ namespace cv
|
||||
|
||||
int LU(float* A, size_t astep, int m, float* b, size_t bstep, int n)
|
||||
{
|
||||
return hal::LU(A, astep, m, b, bstep, n);
|
||||
return hal::LU32f(A, astep, m, b, bstep, n);
|
||||
}
|
||||
|
||||
int LU(double* A, size_t astep, int m, double* b, size_t bstep, int n)
|
||||
{
|
||||
return hal::LU(A, astep, m, b, bstep, n);
|
||||
return hal::LU64f(A, astep, m, b, bstep, n);
|
||||
}
|
||||
|
||||
bool Cholesky(float* A, size_t astep, int m, float* b, size_t bstep, int n)
|
||||
{
|
||||
return hal::Cholesky(A, astep, m, b, bstep, n);
|
||||
return hal::Cholesky32f(A, astep, m, b, bstep, n);
|
||||
}
|
||||
|
||||
bool Cholesky(double* A, size_t astep, int m, double* b, size_t bstep, int n)
|
||||
{
|
||||
return hal::Cholesky(A, astep, m, b, bstep, n);
|
||||
return hal::Cholesky64f(A, astep, m, b, bstep, n);
|
||||
}
|
||||
|
||||
template<typename _Tp> static inline _Tp hypot(_Tp a, _Tp b)
|
||||
@@ -740,7 +740,7 @@ double cv::determinant( InputArray _mat )
|
||||
Mat a(rows, rows, CV_32F, (uchar*)buffer);
|
||||
mat.copyTo(a);
|
||||
|
||||
result = hal::LU(a.ptr<float>(), a.step, rows, 0, 0, 0);
|
||||
result = hal::LU32f(a.ptr<float>(), a.step, rows, 0, 0, 0);
|
||||
if( result )
|
||||
{
|
||||
for( int i = 0; i < rows; i++ )
|
||||
@@ -764,7 +764,7 @@ double cv::determinant( InputArray _mat )
|
||||
Mat a(rows, rows, CV_64F, (uchar*)buffer);
|
||||
mat.copyTo(a);
|
||||
|
||||
result = hal::LU(a.ptr<double>(), a.step, rows, 0, 0, 0);
|
||||
result = hal::LU64f(a.ptr<double>(), a.step, rows, 0, 0, 0);
|
||||
if( result )
|
||||
{
|
||||
for( int i = 0; i < rows; i++ )
|
||||
@@ -1027,13 +1027,13 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )
|
||||
setIdentity(dst);
|
||||
|
||||
if( method == DECOMP_LU && type == CV_32F )
|
||||
result = hal::LU(src1.ptr<float>(), src1.step, n, dst.ptr<float>(), dst.step, n) != 0;
|
||||
result = hal::LU32f(src1.ptr<float>(), src1.step, n, dst.ptr<float>(), dst.step, n) != 0;
|
||||
else if( method == DECOMP_LU && type == CV_64F )
|
||||
result = hal::LU(src1.ptr<double>(), src1.step, n, dst.ptr<double>(), dst.step, n) != 0;
|
||||
result = hal::LU64f(src1.ptr<double>(), src1.step, n, dst.ptr<double>(), dst.step, n) != 0;
|
||||
else if( method == DECOMP_CHOLESKY && type == CV_32F )
|
||||
result = hal::Cholesky(src1.ptr<float>(), src1.step, n, dst.ptr<float>(), dst.step, n);
|
||||
result = hal::Cholesky32f(src1.ptr<float>(), src1.step, n, dst.ptr<float>(), dst.step, n);
|
||||
else
|
||||
result = hal::Cholesky(src1.ptr<double>(), src1.step, n, dst.ptr<double>(), dst.step, n);
|
||||
result = hal::Cholesky64f(src1.ptr<double>(), src1.step, n, dst.ptr<double>(), dst.step, n);
|
||||
|
||||
if( !result )
|
||||
dst = Scalar(0);
|
||||
@@ -1265,16 +1265,16 @@ bool cv::solve( InputArray _src, InputArray _src2arg, OutputArray _dst, int meth
|
||||
if( method == DECOMP_LU )
|
||||
{
|
||||
if( type == CV_32F )
|
||||
result = hal::LU(a.ptr<float>(), a.step, n, dst.ptr<float>(), dst.step, nb) != 0;
|
||||
result = hal::LU32f(a.ptr<float>(), a.step, n, dst.ptr<float>(), dst.step, nb) != 0;
|
||||
else
|
||||
result = hal::LU(a.ptr<double>(), a.step, n, dst.ptr<double>(), dst.step, nb) != 0;
|
||||
result = hal::LU64f(a.ptr<double>(), a.step, n, dst.ptr<double>(), dst.step, nb) != 0;
|
||||
}
|
||||
else if( method == DECOMP_CHOLESKY )
|
||||
{
|
||||
if( type == CV_32F )
|
||||
result = hal::Cholesky(a.ptr<float>(), a.step, n, dst.ptr<float>(), dst.step, nb);
|
||||
result = hal::Cholesky32f(a.ptr<float>(), a.step, n, dst.ptr<float>(), dst.step, nb);
|
||||
else
|
||||
result = hal::Cholesky(a.ptr<double>(), a.step, n, dst.ptr<double>(), dst.step, nb);
|
||||
result = hal::Cholesky64f(a.ptr<double>(), a.step, n, dst.ptr<double>(), dst.step, nb);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@@ -191,13 +191,13 @@ void magnitude( InputArray src1, InputArray src2, OutputArray dst )
|
||||
{
|
||||
const float *x = (const float*)ptrs[0], *y = (const float*)ptrs[1];
|
||||
float *mag = (float*)ptrs[2];
|
||||
hal::magnitude( x, y, mag, len );
|
||||
hal::magnitude32f( x, y, mag, len );
|
||||
}
|
||||
else
|
||||
{
|
||||
const double *x = (const double*)ptrs[0], *y = (const double*)ptrs[1];
|
||||
double *mag = (double*)ptrs[2];
|
||||
hal::magnitude( x, y, mag, len );
|
||||
hal::magnitude64f( x, y, mag, len );
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -374,7 +374,7 @@ void cartToPolar( InputArray src1, InputArray src2,
|
||||
{
|
||||
const float *x = (const float*)ptrs[0], *y = (const float*)ptrs[1];
|
||||
float *mag = (float*)ptrs[2], *angle = (float*)ptrs[3];
|
||||
hal::magnitude( x, y, mag, len );
|
||||
hal::magnitude32f( x, y, mag, len );
|
||||
hal::fastAtan2( y, x, angle, len, angleInDegrees );
|
||||
}
|
||||
else
|
||||
@@ -382,7 +382,7 @@ void cartToPolar( InputArray src1, InputArray src2,
|
||||
const double *x = (const double*)ptrs[0], *y = (const double*)ptrs[1];
|
||||
double *angle = (double*)ptrs[3];
|
||||
|
||||
hal::magnitude(x, y, (double*)ptrs[2], len);
|
||||
hal::magnitude64f(x, y, (double*)ptrs[2], len);
|
||||
k = 0;
|
||||
|
||||
#if CV_SSE2
|
||||
@@ -760,7 +760,7 @@ static void Exp_32f_ipp(const float *x, float *y, int n)
|
||||
}
|
||||
setIppErrorStatus();
|
||||
}
|
||||
hal::exp(x, y, n);
|
||||
hal::exp32f(x, y, n);
|
||||
}
|
||||
|
||||
static void Exp_64f_ipp(const double *x, double *y, int n)
|
||||
@@ -774,14 +774,14 @@ static void Exp_64f_ipp(const double *x, double *y, int n)
|
||||
}
|
||||
setIppErrorStatus();
|
||||
}
|
||||
hal::exp(x, y, n);
|
||||
hal::exp64f(x, y, n);
|
||||
}
|
||||
|
||||
#define Exp_32f Exp_32f_ipp
|
||||
#define Exp_64f Exp_64f_ipp
|
||||
#else
|
||||
#define Exp_32f hal::exp
|
||||
#define Exp_64f hal::exp
|
||||
#define Exp_32f hal::exp32f
|
||||
#define Exp_64f hal::exp64f
|
||||
#endif
|
||||
|
||||
|
||||
@@ -828,7 +828,7 @@ static void Log_32f_ipp(const float *x, float *y, int n)
|
||||
}
|
||||
setIppErrorStatus();
|
||||
}
|
||||
hal::log(x, y, n);
|
||||
hal::log32f(x, y, n);
|
||||
}
|
||||
|
||||
static void Log_64f_ipp(const double *x, double *y, int n)
|
||||
@@ -842,14 +842,14 @@ static void Log_64f_ipp(const double *x, double *y, int n)
|
||||
}
|
||||
setIppErrorStatus();
|
||||
}
|
||||
hal::log(x, y, n);
|
||||
hal::log64f(x, y, n);
|
||||
}
|
||||
|
||||
#define Log_32f Log_32f_ipp
|
||||
#define Log_64f Log_64f_ipp
|
||||
#else
|
||||
#define Log_32f hal::log
|
||||
#define Log_64f hal::log
|
||||
#define Log_32f hal::log32f
|
||||
#define Log_64f hal::log64f
|
||||
#endif
|
||||
|
||||
void log( InputArray _src, OutputArray _dst )
|
||||
@@ -1356,10 +1356,10 @@ static bool ocl_pow(InputArray _src, double power, OutputArray _dst,
|
||||
|
||||
#endif
|
||||
|
||||
static void InvSqrt_32f(const float* src, float* dst, int n) { hal::invSqrt(src, dst, n); }
|
||||
static void InvSqrt_64f(const double* src, double* dst, int n) { hal::invSqrt(src, dst, n); }
|
||||
static void Sqrt_32f(const float* src, float* dst, int n) { hal::sqrt(src, dst, n); }
|
||||
static void Sqrt_64f(const double* src, double* dst, int n) { hal::sqrt(src, dst, n); }
|
||||
static void InvSqrt_32f(const float* src, float* dst, int n) { hal::invSqrt32f(src, dst, n); }
|
||||
static void InvSqrt_64f(const double* src, double* dst, int n) { hal::invSqrt64f(src, dst, n); }
|
||||
static void Sqrt_32f(const float* src, float* dst, int n) { hal::sqrt32f(src, dst, n); }
|
||||
static void Sqrt_64f(const double* src, double* dst, int n) { hal::sqrt64f(src, dst, n); }
|
||||
|
||||
void pow( InputArray _src, double power, OutputArray _dst )
|
||||
{
|
||||
|
1460
modules/core/src/mathfuncs_core.cpp
Normal file
1460
modules/core/src/mathfuncs_core.cpp
Normal file
File diff suppressed because it is too large
Load Diff
231
modules/core/src/matrix_decomp.cpp
Normal file
231
modules/core/src/matrix_decomp.cpp
Normal file
@@ -0,0 +1,231 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#include "precomp.hpp"
|
||||
|
||||
namespace cv { namespace hal {
|
||||
|
||||
/****************************************************************************************\
|
||||
* LU & Cholesky implementation for small matrices *
|
||||
\****************************************************************************************/
|
||||
|
||||
template<typename _Tp> static inline int
|
||||
LUImpl(_Tp* A, size_t astep, int m, _Tp* b, size_t bstep, int n, _Tp eps)
|
||||
{
|
||||
int i, j, k, p = 1;
|
||||
astep /= sizeof(A[0]);
|
||||
bstep /= sizeof(b[0]);
|
||||
|
||||
for( i = 0; i < m; i++ )
|
||||
{
|
||||
k = i;
|
||||
|
||||
for( j = i+1; j < m; j++ )
|
||||
if( std::abs(A[j*astep + i]) > std::abs(A[k*astep + i]) )
|
||||
k = j;
|
||||
|
||||
if( std::abs(A[k*astep + i]) < eps )
|
||||
return 0;
|
||||
|
||||
if( k != i )
|
||||
{
|
||||
for( j = i; j < m; j++ )
|
||||
std::swap(A[i*astep + j], A[k*astep + j]);
|
||||
if( b )
|
||||
for( j = 0; j < n; j++ )
|
||||
std::swap(b[i*bstep + j], b[k*bstep + j]);
|
||||
p = -p;
|
||||
}
|
||||
|
||||
_Tp d = -1/A[i*astep + i];
|
||||
|
||||
for( j = i+1; j < m; j++ )
|
||||
{
|
||||
_Tp alpha = A[j*astep + i]*d;
|
||||
|
||||
for( k = i+1; k < m; k++ )
|
||||
A[j*astep + k] += alpha*A[i*astep + k];
|
||||
|
||||
if( b )
|
||||
for( k = 0; k < n; k++ )
|
||||
b[j*bstep + k] += alpha*b[i*bstep + k];
|
||||
}
|
||||
|
||||
A[i*astep + i] = -d;
|
||||
}
|
||||
|
||||
if( b )
|
||||
{
|
||||
for( i = m-1; i >= 0; i-- )
|
||||
for( j = 0; j < n; j++ )
|
||||
{
|
||||
_Tp s = b[i*bstep + j];
|
||||
for( k = i+1; k < m; k++ )
|
||||
s -= A[i*astep + k]*b[k*bstep + j];
|
||||
b[i*bstep + j] = s*A[i*astep + i];
|
||||
}
|
||||
}
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
|
||||
int LU32f(float* A, size_t astep, int m, float* b, size_t bstep, int n)
|
||||
{
|
||||
return LUImpl(A, astep, m, b, bstep, n, FLT_EPSILON*10);
|
||||
}
|
||||
|
||||
|
||||
int LU64f(double* A, size_t astep, int m, double* b, size_t bstep, int n)
|
||||
{
|
||||
return LUImpl(A, astep, m, b, bstep, n, DBL_EPSILON*100);
|
||||
}
|
||||
|
||||
template<typename _Tp> static inline bool
|
||||
CholImpl(_Tp* A, size_t astep, int m, _Tp* b, size_t bstep, int n)
|
||||
{
|
||||
_Tp* L = A;
|
||||
int i, j, k;
|
||||
double s;
|
||||
astep /= sizeof(A[0]);
|
||||
bstep /= sizeof(b[0]);
|
||||
|
||||
for( i = 0; i < m; i++ )
|
||||
{
|
||||
for( j = 0; j < i; j++ )
|
||||
{
|
||||
s = A[i*astep + j];
|
||||
for( k = 0; k < j; k++ )
|
||||
s -= L[i*astep + k]*L[j*astep + k];
|
||||
L[i*astep + j] = (_Tp)(s*L[j*astep + j]);
|
||||
}
|
||||
s = A[i*astep + i];
|
||||
for( k = 0; k < j; k++ )
|
||||
{
|
||||
double t = L[i*astep + k];
|
||||
s -= t*t;
|
||||
}
|
||||
if( s < std::numeric_limits<_Tp>::epsilon() )
|
||||
return false;
|
||||
L[i*astep + i] = (_Tp)(1./std::sqrt(s));
|
||||
}
|
||||
|
||||
if( !b )
|
||||
return true;
|
||||
|
||||
// LLt x = b
|
||||
// 1: L y = b
|
||||
// 2. Lt x = y
|
||||
|
||||
/*
|
||||
[ L00 ] y0 b0
|
||||
[ L10 L11 ] y1 = b1
|
||||
[ L20 L21 L22 ] y2 b2
|
||||
[ L30 L31 L32 L33 ] y3 b3
|
||||
|
||||
[ L00 L10 L20 L30 ] x0 y0
|
||||
[ L11 L21 L31 ] x1 = y1
|
||||
[ L22 L32 ] x2 y2
|
||||
[ L33 ] x3 y3
|
||||
*/
|
||||
|
||||
for( i = 0; i < m; i++ )
|
||||
{
|
||||
for( j = 0; j < n; j++ )
|
||||
{
|
||||
s = b[i*bstep + j];
|
||||
for( k = 0; k < i; k++ )
|
||||
s -= L[i*astep + k]*b[k*bstep + j];
|
||||
b[i*bstep + j] = (_Tp)(s*L[i*astep + i]);
|
||||
}
|
||||
}
|
||||
|
||||
for( i = m-1; i >= 0; i-- )
|
||||
{
|
||||
for( j = 0; j < n; j++ )
|
||||
{
|
||||
s = b[i*bstep + j];
|
||||
for( k = m-1; k > i; k-- )
|
||||
s -= L[k*astep + i]*b[k*bstep + j];
|
||||
b[i*bstep + j] = (_Tp)(s*L[i*astep + i]);
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
bool Cholesky32f(float* A, size_t astep, int m, float* b, size_t bstep, int n)
|
||||
{
|
||||
return CholImpl(A, astep, m, b, bstep, n);
|
||||
}
|
||||
|
||||
bool Cholesky64f(double* A, size_t astep, int m, double* b, size_t bstep, int n)
|
||||
{
|
||||
return CholImpl(A, astep, m, b, bstep, n);
|
||||
}
|
||||
|
||||
//=============================================================================
|
||||
// for compatibility with 3.0
|
||||
|
||||
int LU(float* A, size_t astep, int m, float* b, size_t bstep, int n)
|
||||
{
|
||||
return LUImpl(A, astep, m, b, bstep, n, FLT_EPSILON*10);
|
||||
}
|
||||
|
||||
int LU(double* A, size_t astep, int m, double* b, size_t bstep, int n)
|
||||
{
|
||||
return LUImpl(A, astep, m, b, bstep, n, DBL_EPSILON*100);
|
||||
}
|
||||
|
||||
bool Cholesky(float* A, size_t astep, int m, float* b, size_t bstep, int n)
|
||||
{
|
||||
return CholImpl(A, astep, m, b, bstep, n);
|
||||
}
|
||||
|
||||
bool Cholesky(double* A, size_t astep, int m, double* b, size_t bstep, int n)
|
||||
{
|
||||
return CholImpl(A, astep, m, b, bstep, n);
|
||||
}
|
||||
|
||||
|
||||
}}
|
412
modules/core/src/merge.cpp
Normal file
412
modules/core/src/merge.cpp
Normal file
@@ -0,0 +1,412 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#include "precomp.hpp"
|
||||
|
||||
namespace cv { namespace hal {
|
||||
|
||||
#if CV_NEON
|
||||
template<typename T> struct VMerge2;
|
||||
template<typename T> struct VMerge3;
|
||||
template<typename T> struct VMerge4;
|
||||
|
||||
#define MERGE2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
|
||||
template<> \
|
||||
struct name<data_type>{ \
|
||||
void operator()(const data_type* src0, const data_type* src1, \
|
||||
data_type* dst){ \
|
||||
reg_type r; \
|
||||
r.val[0] = load_func(src0); \
|
||||
r.val[1] = load_func(src1); \
|
||||
store_func(dst, r); \
|
||||
} \
|
||||
}
|
||||
|
||||
#define MERGE3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
|
||||
template<> \
|
||||
struct name<data_type>{ \
|
||||
void operator()(const data_type* src0, const data_type* src1, \
|
||||
const data_type* src2, data_type* dst){ \
|
||||
reg_type r; \
|
||||
r.val[0] = load_func(src0); \
|
||||
r.val[1] = load_func(src1); \
|
||||
r.val[2] = load_func(src2); \
|
||||
store_func(dst, r); \
|
||||
} \
|
||||
}
|
||||
|
||||
#define MERGE4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
|
||||
template<> \
|
||||
struct name<data_type>{ \
|
||||
void operator()(const data_type* src0, const data_type* src1, \
|
||||
const data_type* src2, const data_type* src3, \
|
||||
data_type* dst){ \
|
||||
reg_type r; \
|
||||
r.val[0] = load_func(src0); \
|
||||
r.val[1] = load_func(src1); \
|
||||
r.val[2] = load_func(src2); \
|
||||
r.val[3] = load_func(src3); \
|
||||
store_func(dst, r); \
|
||||
} \
|
||||
}
|
||||
|
||||
MERGE2_KERNEL_TEMPLATE(VMerge2, uchar , uint8x16x2_t, vld1q_u8 , vst2q_u8 );
|
||||
MERGE2_KERNEL_TEMPLATE(VMerge2, ushort, uint16x8x2_t, vld1q_u16, vst2q_u16);
|
||||
MERGE2_KERNEL_TEMPLATE(VMerge2, int , int32x4x2_t, vld1q_s32, vst2q_s32);
|
||||
MERGE2_KERNEL_TEMPLATE(VMerge2, int64 , int64x1x2_t, vld1_s64 , vst2_s64 );
|
||||
|
||||
MERGE3_KERNEL_TEMPLATE(VMerge3, uchar , uint8x16x3_t, vld1q_u8 , vst3q_u8 );
|
||||
MERGE3_KERNEL_TEMPLATE(VMerge3, ushort, uint16x8x3_t, vld1q_u16, vst3q_u16);
|
||||
MERGE3_KERNEL_TEMPLATE(VMerge3, int , int32x4x3_t, vld1q_s32, vst3q_s32);
|
||||
MERGE3_KERNEL_TEMPLATE(VMerge3, int64 , int64x1x3_t, vld1_s64 , vst3_s64 );
|
||||
|
||||
MERGE4_KERNEL_TEMPLATE(VMerge4, uchar , uint8x16x4_t, vld1q_u8 , vst4q_u8 );
|
||||
MERGE4_KERNEL_TEMPLATE(VMerge4, ushort, uint16x8x4_t, vld1q_u16, vst4q_u16);
|
||||
MERGE4_KERNEL_TEMPLATE(VMerge4, int , int32x4x4_t, vld1q_s32, vst4q_s32);
|
||||
MERGE4_KERNEL_TEMPLATE(VMerge4, int64 , int64x1x4_t, vld1_s64 , vst4_s64 );
|
||||
|
||||
#elif CV_SSE2
|
||||
|
||||
template <typename T>
|
||||
struct VMerge2
|
||||
{
|
||||
VMerge2() : support(false) { }
|
||||
void operator()(const T *, const T *, T *) const { }
|
||||
|
||||
bool support;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct VMerge3
|
||||
{
|
||||
VMerge3() : support(false) { }
|
||||
void operator()(const T *, const T *, const T *, T *) const { }
|
||||
|
||||
bool support;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct VMerge4
|
||||
{
|
||||
VMerge4() : support(false) { }
|
||||
void operator()(const T *, const T *, const T *, const T *, T *) const { }
|
||||
|
||||
bool support;
|
||||
};
|
||||
|
||||
#define MERGE2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \
|
||||
template <> \
|
||||
struct VMerge2<data_type> \
|
||||
{ \
|
||||
enum \
|
||||
{ \
|
||||
ELEMS_IN_VEC = 16 / sizeof(data_type) \
|
||||
}; \
|
||||
\
|
||||
VMerge2() \
|
||||
{ \
|
||||
support = checkHardwareSupport(se); \
|
||||
} \
|
||||
\
|
||||
void operator()(const data_type * src0, const data_type * src1, \
|
||||
data_type * dst) const \
|
||||
{ \
|
||||
reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \
|
||||
reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \
|
||||
reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \
|
||||
reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \
|
||||
\
|
||||
_mm_interleave(v_src0, v_src1, v_src2, v_src3); \
|
||||
\
|
||||
_mm_storeu_##flavor((cast_type *)(dst), v_src0); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \
|
||||
} \
|
||||
\
|
||||
bool support; \
|
||||
}
|
||||
|
||||
#define MERGE3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \
|
||||
template <> \
|
||||
struct VMerge3<data_type> \
|
||||
{ \
|
||||
enum \
|
||||
{ \
|
||||
ELEMS_IN_VEC = 16 / sizeof(data_type) \
|
||||
}; \
|
||||
\
|
||||
VMerge3() \
|
||||
{ \
|
||||
support = checkHardwareSupport(se); \
|
||||
} \
|
||||
\
|
||||
void operator()(const data_type * src0, const data_type * src1, const data_type * src2,\
|
||||
data_type * dst) const \
|
||||
{ \
|
||||
reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \
|
||||
reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \
|
||||
reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \
|
||||
reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \
|
||||
reg_type v_src4 = _mm_loadu_##flavor((const cast_type *)(src2)); \
|
||||
reg_type v_src5 = _mm_loadu_##flavor((const cast_type *)(src2 + ELEMS_IN_VEC)); \
|
||||
\
|
||||
_mm_interleave(v_src0, v_src1, v_src2, \
|
||||
v_src3, v_src4, v_src5); \
|
||||
\
|
||||
_mm_storeu_##flavor((cast_type *)(dst), v_src0); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 4), v_src4); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 5), v_src5); \
|
||||
} \
|
||||
\
|
||||
bool support; \
|
||||
}
|
||||
|
||||
// Same pattern as MERGE3_KERNEL_TEMPLATE, for four channels: each call
// consumes 2 * ELEMS_IN_VEC elements from each of the four sources and
// writes eight interleaved registers (c0,c1,c2,c3,...) to dst.
// `support` is set from checkHardwareSupport(se) and must be checked by
// the caller before operator() is used.
#define MERGE4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \
template <>                                                                                \
struct VMerge4<data_type>                                                                  \
{                                                                                          \
    enum                                                                                   \
    {                                                                                      \
        ELEMS_IN_VEC = 16 / sizeof(data_type)                                              \
    };                                                                                     \
                                                                                           \
    VMerge4()                                                                              \
    {                                                                                      \
        support = checkHardwareSupport(se);                                                \
    }                                                                                      \
                                                                                           \
    void operator()(const data_type * src0, const data_type * src1,                        \
                    const data_type * src2, const data_type * src3,                        \
                    data_type * dst) const                                                 \
    {                                                                                      \
        reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0));                   \
        reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC));    \
        reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1));                   \
        reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC));    \
        reg_type v_src4 = _mm_loadu_##flavor((const cast_type *)(src2));                   \
        reg_type v_src5 = _mm_loadu_##flavor((const cast_type *)(src2 + ELEMS_IN_VEC));    \
        reg_type v_src6 = _mm_loadu_##flavor((const cast_type *)(src3));                   \
        reg_type v_src7 = _mm_loadu_##flavor((const cast_type *)(src3 + ELEMS_IN_VEC));    \
                                                                                           \
        _mm_interleave(v_src0, v_src1, v_src2, v_src3,                                     \
                       v_src4, v_src5, v_src6, v_src7);                                    \
                                                                                           \
        _mm_storeu_##flavor((cast_type *)(dst), v_src0);                                   \
        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1);                    \
        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2);                \
        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3);                \
        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 4), v_src4);                \
        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 5), v_src5);                \
        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 6), v_src6);                \
        _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 7), v_src7);                \
    }                                                                                      \
                                                                                           \
    bool support;                                                                          \
}
|
||||
|
||||
// Instantiate the SSE merge kernels for the element sizes used by the HAL
// entry points below.  8-bit and 32-bit interleaving only needs SSE2; the
// 16-bit interleave helpers require SSE4.1, so those specializations are
// compiled (and runtime-gated) only when CV_SSE4_1 is available.
MERGE2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2);
MERGE3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2);
MERGE4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2);

#if CV_SSE4_1
MERGE2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1);
MERGE3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1);
MERGE4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1);
#endif

// 32-bit ints are moved through float registers; this is a pure bit copy.
MERGE2_KERNEL_TEMPLATE(   int,  __m128,   float,   _mm_interleave_ps,    ps, CV_CPU_SSE2);
MERGE3_KERNEL_TEMPLATE(   int,  __m128,   float,   _mm_interleave_ps,    ps, CV_CPU_SSE2);
MERGE4_KERNEL_TEMPLATE(   int,  __m128,   float,   _mm_interleave_ps,    ps, CV_CPU_SSE2);

// Closes the SIMD-kernel conditional opened above (outside this excerpt).
#endif
|
||||
|
||||
// Generic channel merge: interleaves `cn` planar sources (src[0..cn-1],
// each `len` elements) into a single packed buffer `dst` of len*cn
// elements.  The first k = (cn % 4 ? cn % 4 : 4) channels go through
// specialized 1/2/3/4-channel loops — vectorized with VMerge2/3/4 only
// when cn itself is exactly 2, 3 or 4 — and any remaining channels are
// appended four at a time by the generic tail loop at the bottom.
template<typename T> static void
merge_( const T** src, T* dst, int len, int cn )
{
    int k = cn % 4 ? cn % 4 : 4;
    int i, j;
    if( k == 1 )
    {
        // Single channel: plain strided copy.
        const T* src0 = src[0];
        for( i = j = 0; i < len; i++, j += cn )
            dst[j] = src0[i];
    }
    else if( k == 2 )
    {
        const T *src0 = src[0], *src1 = src[1];
        i = j = 0;
#if CV_NEON
        if(cn == 2)
        {
            // 64-bit elements use the 1-element (d-register) kernel;
            // everything else processes a full 16-byte q-register.
            int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
            int inc_j = 2 * inc_i;

            VMerge2<T> vmerge;
            // NOTE(review): strict bound (i < len - inc_i) always leaves a
            // scalar tail, even when len is an exact multiple of inc_i.
            for( ; i < len - inc_i; i += inc_i, j += inc_j)
                vmerge(src0 + i, src1 + i, dst + j);
        }
#elif CV_SSE2
        if(cn == 2)
        {
            // SSE kernels consume two 16-byte registers per channel.
            int inc_i = 32/sizeof(T);
            int inc_j = 2 * inc_i;

            VMerge2<T> vmerge;
            if (vmerge.support)  // runtime CPU-feature check
                for( ; i < len - inc_i; i += inc_i, j += inc_j)
                    vmerge(src0 + i, src1 + i, dst + j);
        }
#endif
        // Scalar tail (the whole loop when no SIMD path ran).
        for( ; i < len; i++, j += cn )
        {
            dst[j] = src0[i];
            dst[j+1] = src1[i];
        }
    }
    else if( k == 3 )
    {
        const T *src0 = src[0], *src1 = src[1], *src2 = src[2];
        i = j = 0;
#if CV_NEON
        if(cn == 3)
        {
            int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
            int inc_j = 3 * inc_i;

            VMerge3<T> vmerge;
            for( ; i < len - inc_i; i += inc_i, j += inc_j)
                vmerge(src0 + i, src1 + i, src2 + i, dst + j);
        }
#elif CV_SSE2
        if(cn == 3)
        {
            int inc_i = 32/sizeof(T);
            int inc_j = 3 * inc_i;

            VMerge3<T> vmerge;
            if (vmerge.support)
                for( ; i < len - inc_i; i += inc_i, j += inc_j)
                    vmerge(src0 + i, src1 + i, src2 + i, dst + j);
        }
#endif
        for( ; i < len; i++, j += cn )
        {
            dst[j] = src0[i];
            dst[j+1] = src1[i];
            dst[j+2] = src2[i];
        }
    }
    else
    {
        const T *src0 = src[0], *src1 = src[1], *src2 = src[2], *src3 = src[3];
        i = j = 0;
#if CV_NEON
        if(cn == 4)
        {
            int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
            int inc_j = 4 * inc_i;

            VMerge4<T> vmerge;
            for( ; i < len - inc_i; i += inc_i, j += inc_j)
                vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j);
        }
#elif CV_SSE2
        if(cn == 4)
        {
            int inc_i = 32/sizeof(T);
            int inc_j = 4 * inc_i;

            VMerge4<T> vmerge;
            if (vmerge.support)
                for( ; i < len - inc_i; i += inc_i, j += inc_j)
                    vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j);
        }
#endif
        for( ; i < len; i++, j += cn )
        {
            dst[j] = src0[i]; dst[j+1] = src1[i];
            dst[j+2] = src2[i]; dst[j+3] = src3[i];
        }
    }

    // Remaining channels beyond the first k, four at a time (k stepped by
    // 4 always lands exactly on cn because of how k was chosen above).
    for( ; k < cn; k += 4 )
    {
        const T *src0 = src[k], *src1 = src[k+1], *src2 = src[k+2], *src3 = src[k+3];
        for( i = 0, j = k; i < len; i++, j += cn )
        {
            dst[j] = src0[i]; dst[j+1] = src1[i];
            dst[j+2] = src2[i]; dst[j+3] = src3[i];
        }
    }
}
|
||||
|
||||
|
||||
// HAL entry points for channel merging, one per element size.  CALL_HAL
// dispatches to an externally-plugged HAL implementation (cv_hal_merge*)
// when one is present and returns from this function; otherwise execution
// falls through to the generic template implementation above.

void merge8u(const uchar** src, uchar* dst, int len, int cn )
{
    CALL_HAL(merge8u, cv_hal_merge8u, src, dst, len, cn)
    merge_(src, dst, len, cn);
}

void merge16u(const ushort** src, ushort* dst, int len, int cn )
{
    CALL_HAL(merge16u, cv_hal_merge16u, src, dst, len, cn)
    merge_(src, dst, len, cn);
}

void merge32s(const int** src, int* dst, int len, int cn )
{
    CALL_HAL(merge32s, cv_hal_merge32s, src, dst, len, cn)
    merge_(src, dst, len, cn);
}

void merge64s(const int64** src, int64* dst, int len, int cn )
{
    CALL_HAL(merge64s, cv_hal_merge64s, src, dst, len, cn)
    merge_(src, dst, len, cn);
}
|
||||
|
||||
}}
|
@@ -58,8 +58,6 @@
|
||||
#include "opencv2/core/ocl.hpp"
|
||||
#endif
|
||||
|
||||
#include "opencv2/hal.hpp"
|
||||
|
||||
#include <assert.h>
|
||||
#include <ctype.h>
|
||||
#include <float.h>
|
||||
@@ -69,6 +67,27 @@
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <cstdlib>
|
||||
#include <limits>
|
||||
#include <float.h>
|
||||
#include <cstring>
|
||||
#include <cassert>
|
||||
|
||||
#define USE_SSE2 (cv::checkHardwareSupport(CV_CPU_SSE))
|
||||
#define USE_SSE4_2 (cv::checkHardwareSupport(CV_CPU_SSE4_2))
|
||||
#define USE_AVX (cv::checkHardwareSupport(CV_CPU_AVX))
|
||||
#define USE_AVX2 (cv::checkHardwareSupport(CV_CPU_AVX2))
|
||||
|
||||
#include "opencv2/core/hal/hal.hpp"
|
||||
#include "opencv2/core/hal/intrin.hpp"
|
||||
#include "opencv2/core/sse_utils.hpp"
|
||||
#include "opencv2/core/neon_utils.hpp"
|
||||
|
||||
#include "arithm_core.hpp"
|
||||
#include "hal_replacement.hpp"
|
||||
|
||||
#ifdef HAVE_TEGRA_OPTIMIZATION
|
||||
#include "opencv2/core/core_tegra.hpp"
|
||||
#else
|
||||
@@ -78,6 +97,34 @@
|
||||
namespace cv
|
||||
{
|
||||
|
||||
// -128.f ... 255.f
|
||||
extern const float g_8x32fTab[];
|
||||
#define CV_8TO32F(x) cv::g_8x32fTab[(x)+128]
|
||||
|
||||
extern const ushort g_8x16uSqrTab[];
|
||||
#define CV_SQR_8U(x) cv::g_8x16uSqrTab[(x)+255]
|
||||
|
||||
extern const uchar g_Saturate8u[];
|
||||
#define CV_FAST_CAST_8U(t) (assert(-256 <= (t) && (t) <= 512), cv::g_Saturate8u[(t)+256])
|
||||
#define CV_MIN_8U(a,b) ((a) - CV_FAST_CAST_8U((a) - (b)))
|
||||
#define CV_MAX_8U(a,b) ((a) + CV_FAST_CAST_8U((b) - (a)))
|
||||
|
||||
template<> inline uchar OpAdd<uchar>::operator ()(uchar a, uchar b) const
|
||||
{ return CV_FAST_CAST_8U(a + b); }
|
||||
|
||||
template<> inline uchar OpSub<uchar>::operator ()(uchar a, uchar b) const
|
||||
{ return CV_FAST_CAST_8U(a - b); }
|
||||
|
||||
template<> inline short OpAbsDiff<short>::operator ()(short a, short b) const
|
||||
{ return saturate_cast<short>(std::abs(a - b)); }
|
||||
|
||||
template<> inline schar OpAbsDiff<schar>::operator ()(schar a, schar b) const
|
||||
{ return saturate_cast<schar>(std::abs(a - b)); }
|
||||
|
||||
template<> inline uchar OpMin<uchar>::operator ()(uchar a, uchar b) const { return CV_MIN_8U(a, b); }
|
||||
|
||||
template<> inline uchar OpMax<uchar>::operator ()(uchar a, uchar b) const { return CV_MAX_8U(a, b); }
|
||||
|
||||
typedef void (*BinaryFunc)(const uchar* src1, size_t step1,
|
||||
const uchar* src2, size_t step2,
|
||||
uchar* dst, size_t step, Size sz,
|
||||
@@ -100,21 +147,6 @@ BinaryFunc getCopyMaskFunc(size_t esz);
|
||||
/* maximal average node_count/hash_size ratio beyond which hash table is resized */
|
||||
#define CV_SPARSE_HASH_RATIO 3
|
||||
|
||||
|
||||
|
||||
// -128.f ... 255.f
|
||||
extern const float g_8x32fTab[];
|
||||
#define CV_8TO32F(x) cv::g_8x32fTab[(x)+128]
|
||||
|
||||
extern const ushort g_8x16uSqrTab[];
|
||||
#define CV_SQR_8U(x) cv::g_8x16uSqrTab[(x)+255]
|
||||
|
||||
extern const uchar g_Saturate8u[];
|
||||
#define CV_FAST_CAST_8U(t) (assert(-256 <= (t) && (t) <= 512), cv::g_Saturate8u[(t)+256])
|
||||
#define CV_MIN_8U(a,b) ((a) - CV_FAST_CAST_8U((a) - (b)))
|
||||
#define CV_MAX_8U(a,b) ((a) + CV_FAST_CAST_8U((b) - (a)))
|
||||
|
||||
|
||||
#if defined WIN32 || defined _WIN32
|
||||
void deleteThreadAllocData();
|
||||
#endif
|
||||
@@ -282,6 +314,4 @@ cv::Mutex& getInitializationMutex();
|
||||
|
||||
}
|
||||
|
||||
#include "opencv2/hal/intrin.hpp"
|
||||
|
||||
#endif /*_CXCORE_INTERNAL_H_*/
|
||||
|
428
modules/core/src/split.cpp
Normal file
428
modules/core/src/split.cpp
Normal file
@@ -0,0 +1,428 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#include "precomp.hpp"
|
||||
|
||||
namespace cv { namespace hal {
|
||||
|
||||
#if CV_NEON
|
||||
template<typename T> struct VSplit2;
|
||||
template<typename T> struct VSplit3;
|
||||
template<typename T> struct VSplit4;
|
||||
|
||||
// NEON de-interleave kernels.  Each macro generates a specialization of
// VSplit2/3/4 whose operator() loads one packed vector-of-structures
// register set (vldNq de-interleaves on load) and stores each lane group
// to its own planar destination.  No `support` flag here: when CV_NEON is
// compiled in, the instructions are assumed available.
// Comments cannot appear inside the macro bodies (line-continuation
// backslashes would end up inside a `//` comment).
#define SPLIT2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func)  \
    template<>                                                                    \
    struct name<data_type>                                                        \
    {                                                                             \
        void operator()(const data_type* src, data_type* dst0,                    \
                        data_type* dst1) const                                    \
        {                                                                         \
            reg_type r = load_func(src);                                          \
            store_func(dst0, r.val[0]);                                           \
            store_func(dst1, r.val[1]);                                           \
        }                                                                         \
    }

#define SPLIT3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func)  \
    template<>                                                                    \
    struct name<data_type>                                                        \
    {                                                                             \
        void operator()(const data_type* src, data_type* dst0, data_type* dst1,   \
                        data_type* dst2) const                                    \
        {                                                                         \
            reg_type r = load_func(src);                                          \
            store_func(dst0, r.val[0]);                                           \
            store_func(dst1, r.val[1]);                                           \
            store_func(dst2, r.val[2]);                                           \
        }                                                                         \
    }

#define SPLIT4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func)  \
    template<>                                                                    \
    struct name<data_type>                                                        \
    {                                                                             \
        void operator()(const data_type* src, data_type* dst0, data_type* dst1,   \
                        data_type* dst2, data_type* dst3) const                   \
        {                                                                         \
            reg_type r = load_func(src);                                          \
            store_func(dst0, r.val[0]);                                           \
            store_func(dst1, r.val[1]);                                           \
            store_func(dst2, r.val[2]);                                           \
            store_func(dst3, r.val[3]);                                           \
        }                                                                         \
    }
|
||||
|
||||
SPLIT2_KERNEL_TEMPLATE(VSplit2, uchar , uint8x16x2_t, vld2q_u8 , vst1q_u8 );
|
||||
SPLIT2_KERNEL_TEMPLATE(VSplit2, ushort, uint16x8x2_t, vld2q_u16, vst1q_u16);
|
||||
SPLIT2_KERNEL_TEMPLATE(VSplit2, int , int32x4x2_t, vld2q_s32, vst1q_s32);
|
||||
SPLIT2_KERNEL_TEMPLATE(VSplit2, int64 , int64x1x2_t, vld2_s64 , vst1_s64 );
|
||||
|
||||
SPLIT3_KERNEL_TEMPLATE(VSplit3, uchar , uint8x16x3_t, vld3q_u8 , vst1q_u8 );
|
||||
SPLIT3_KERNEL_TEMPLATE(VSplit3, ushort, uint16x8x3_t, vld3q_u16, vst1q_u16);
|
||||
SPLIT3_KERNEL_TEMPLATE(VSplit3, int , int32x4x3_t, vld3q_s32, vst1q_s32);
|
||||
SPLIT3_KERNEL_TEMPLATE(VSplit3, int64 , int64x1x3_t, vld3_s64 , vst1_s64 );
|
||||
|
||||
SPLIT4_KERNEL_TEMPLATE(VSplit4, uchar , uint8x16x4_t, vld4q_u8 , vst1q_u8 );
|
||||
SPLIT4_KERNEL_TEMPLATE(VSplit4, ushort, uint16x8x4_t, vld4q_u16, vst1q_u16);
|
||||
SPLIT4_KERNEL_TEMPLATE(VSplit4, int , int32x4x4_t, vld4q_s32, vst1q_s32);
|
||||
SPLIT4_KERNEL_TEMPLATE(VSplit4, int64 , int64x1x4_t, vld4_s64 , vst1_s64 );
|
||||
|
||||
#elif CV_SSE2
|
||||
|
||||
// Generic (no-SIMD) placeholders for the channel-splitting kernels used by
// split_().  The `support` flag stays false, so the vectorized code paths
// are never entered for element types without a real specialization; the
// empty operator() bodies exist only so those paths still compile.
template <typename T>
struct VSplit2
{
    bool support;

    VSplit2() { support = false; }
    void operator()(const T *, T *, T *) const { }
};

template <typename T>
struct VSplit3
{
    bool support;

    VSplit3() { support = false; }
    void operator()(const T *, T *, T *, T *) const { }
};

template <typename T>
struct VSplit4
{
    bool support;

    VSplit4() { support = false; }
    void operator()(const T *, T *, T *, T *, T *) const { }
};
|
||||
|
||||
// SSE de-interleave kernels.  Each macro generates a VSplit2/3/4
// specialization: operator() loads N * 2 packed 16-byte registers from
// `src`, de-interleaves them in registers via the _mm_deinterleave helper
// (from sse_utils), and stores 2 * ELEMS_IN_VEC contiguous elements to
// each planar destination.  `support` is set from a runtime SSE2 check and
// must be tested by the caller.  Comments cannot appear inside the macro
// bodies (they would swallow the line-continuation backslashes).
#define SPLIT2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor)   \
template <>                                                                                \
struct VSplit2<data_type>                                                                  \
{                                                                                          \
    enum                                                                                   \
    {                                                                                      \
        ELEMS_IN_VEC = 16 / sizeof(data_type)                                              \
    };                                                                                     \
                                                                                           \
    VSplit2()                                                                              \
    {                                                                                      \
        support = checkHardwareSupport(CV_CPU_SSE2);                                       \
    }                                                                                      \
                                                                                           \
    void operator()(const data_type * src,                                                 \
                    data_type * dst0, data_type * dst1) const                              \
    {                                                                                      \
        reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src));                    \
        reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC));     \
        reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
        reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
                                                                                           \
        _mm_deinterleave(v_src0, v_src1, v_src2, v_src3);                                  \
                                                                                           \
        _mm_storeu_##flavor((cast_type *)(dst0), v_src0);                                  \
        _mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1);                   \
        _mm_storeu_##flavor((cast_type *)(dst1), v_src2);                                  \
        _mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3);                   \
    }                                                                                      \
                                                                                           \
    bool support;                                                                          \
}

#define SPLIT3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor)   \
template <>                                                                                \
struct VSplit3<data_type>                                                                  \
{                                                                                          \
    enum                                                                                   \
    {                                                                                      \
        ELEMS_IN_VEC = 16 / sizeof(data_type)                                              \
    };                                                                                     \
                                                                                           \
    VSplit3()                                                                              \
    {                                                                                      \
        support = checkHardwareSupport(CV_CPU_SSE2);                                       \
    }                                                                                      \
                                                                                           \
    void operator()(const data_type * src,                                                 \
                    data_type * dst0, data_type * dst1, data_type * dst2) const            \
    {                                                                                      \
        reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src));                    \
        reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC));     \
        reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
        reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
        reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \
        reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \
                                                                                           \
        _mm_deinterleave(v_src0, v_src1, v_src2,                                           \
                         v_src3, v_src4, v_src5);                                          \
                                                                                           \
        _mm_storeu_##flavor((cast_type *)(dst0), v_src0);                                  \
        _mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1);                   \
        _mm_storeu_##flavor((cast_type *)(dst1), v_src2);                                  \
        _mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3);                   \
        _mm_storeu_##flavor((cast_type *)(dst2), v_src4);                                  \
        _mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5);                   \
    }                                                                                      \
                                                                                           \
    bool support;                                                                          \
}

#define SPLIT4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor)   \
template <>                                                                                \
struct VSplit4<data_type>                                                                  \
{                                                                                          \
    enum                                                                                   \
    {                                                                                      \
        ELEMS_IN_VEC = 16 / sizeof(data_type)                                              \
    };                                                                                     \
                                                                                           \
    VSplit4()                                                                              \
    {                                                                                      \
        support = checkHardwareSupport(CV_CPU_SSE2);                                       \
    }                                                                                      \
                                                                                           \
    void operator()(const data_type * src, data_type * dst0, data_type * dst1,             \
                    data_type * dst2, data_type * dst3) const                              \
    {                                                                                      \
        reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src));                    \
        reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC));     \
        reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
        reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
        reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \
        reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \
        reg_type v_src6 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 6)); \
        reg_type v_src7 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 7)); \
                                                                                           \
        _mm_deinterleave(v_src0, v_src1, v_src2, v_src3,                                   \
                         v_src4, v_src5, v_src6, v_src7);                                  \
                                                                                           \
        _mm_storeu_##flavor((cast_type *)(dst0), v_src0);                                  \
        _mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1);                   \
        _mm_storeu_##flavor((cast_type *)(dst1), v_src2);                                  \
        _mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3);                   \
        _mm_storeu_##flavor((cast_type *)(dst2), v_src4);                                  \
        _mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5);                   \
        _mm_storeu_##flavor((cast_type *)(dst3), v_src6);                                  \
        _mm_storeu_##flavor((cast_type *)(dst3 + ELEMS_IN_VEC), v_src7);                   \
    }                                                                                      \
                                                                                           \
    bool support;                                                                          \
}

// Instantiations for the element sizes reachable from the split* HAL entry
// points; all use SSE2-level deinterleave helpers (32-bit ints go through
// float registers as a pure bit copy).
SPLIT2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128);
SPLIT2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128);
SPLIT2_KERNEL_TEMPLATE(   int,  __m128,   float, _mm_deinterleave_ps, ps);

SPLIT3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128);
SPLIT3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128);
SPLIT3_KERNEL_TEMPLATE(   int,  __m128,   float, _mm_deinterleave_ps, ps);

SPLIT4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128);
SPLIT4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128);
SPLIT4_KERNEL_TEMPLATE(   int,  __m128,   float, _mm_deinterleave_ps, ps);

#endif
|
||||
|
||||
// Generic channel split: the inverse of merge_().  De-interleaves a packed
// buffer `src` of len*cn elements into `cn` planar destinations
// (dst[0..cn-1], each `len` elements).  The first k = (cn % 4 ? cn % 4 : 4)
// channels are handled by specialized 1/2/3/4-channel loops (vectorized via
// VSplit2/3/4 only when cn itself is 2, 3 or 4); remaining channels are
// extracted four at a time by the generic tail loop at the bottom.
template<typename T> static void
split_( const T* src, T** dst, int len, int cn )
{
    int k = cn % 4 ? cn % 4 : 4;
    int i, j;
    if( k == 1 )
    {
        T* dst0 = dst[0];

        if(cn == 1)
        {
            // Contiguous single channel: one bulk copy.
            memcpy(dst0, src, len * sizeof(T));
        }
        else
        {
            // Strided gather of channel 0 from a multi-channel buffer.
            for( i = 0, j = 0 ; i < len; i++, j += cn )
                dst0[i] = src[j];
        }
    }
    else if( k == 2 )
    {
        T *dst0 = dst[0], *dst1 = dst[1];
        i = j = 0;

#if CV_NEON
        if(cn == 2)
        {
            // 64-bit elements use 1-element (d-register) kernels.
            int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
            int inc_j = 2 * inc_i;

            VSplit2<T> vsplit;
            // NOTE(review): this NEON loop uses a strict bound
            // (i < len - inc_i) unlike the 3/4-channel NEON paths and all
            // SSE paths below (i <= len - inc_i); the extra elements just
            // fall through to the scalar tail, so the result is the same.
            for( ; i < len - inc_i; i += inc_i, j += inc_j)
                vsplit(src + j, dst0 + i, dst1 + i);
        }
#elif CV_SSE2
        if (cn == 2)
        {
            int inc_i = 32/sizeof(T);
            int inc_j = 2 * inc_i;

            VSplit2<T> vsplit;
            if (vsplit.support)  // runtime CPU-feature check
            {
                for( ; i <= len - inc_i; i += inc_i, j += inc_j)
                    vsplit(src + j, dst0 + i, dst1 + i);
            }
        }
#endif
        // Scalar tail (the whole loop when no SIMD path ran).
        for( ; i < len; i++, j += cn )
        {
            dst0[i] = src[j];
            dst1[i] = src[j+1];
        }
    }
    else if( k == 3 )
    {
        T *dst0 = dst[0], *dst1 = dst[1], *dst2 = dst[2];
        i = j = 0;

#if CV_NEON
        if(cn == 3)
        {
            int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
            int inc_j = 3 * inc_i;

            VSplit3<T> vsplit;
            for( ; i <= len - inc_i; i += inc_i, j += inc_j)
                vsplit(src + j, dst0 + i, dst1 + i, dst2 + i);
        }
#elif CV_SSE2
        if (cn == 3)
        {
            int inc_i = 32/sizeof(T);
            int inc_j = 3 * inc_i;

            VSplit3<T> vsplit;

            if (vsplit.support)
            {
                for( ; i <= len - inc_i; i += inc_i, j += inc_j)
                    vsplit(src + j, dst0 + i, dst1 + i, dst2 + i);
            }
        }
#endif
        for( ; i < len; i++, j += cn )
        {
            dst0[i] = src[j];
            dst1[i] = src[j+1];
            dst2[i] = src[j+2];
        }
    }
    else
    {
        T *dst0 = dst[0], *dst1 = dst[1], *dst2 = dst[2], *dst3 = dst[3];
        i = j = 0;

#if CV_NEON
        if(cn == 4)
        {
            int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
            int inc_j = 4 * inc_i;

            VSplit4<T> vsplit;
            for( ; i <= len - inc_i; i += inc_i, j += inc_j)
                vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i);
        }
#elif CV_SSE2
        if (cn == 4)
        {
            int inc_i = 32/sizeof(T);
            int inc_j = 4 * inc_i;

            VSplit4<T> vsplit;
            if (vsplit.support)
            {
                for( ; i <= len - inc_i; i += inc_i, j += inc_j)
                    vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i);
            }
        }
#endif
        for( ; i < len; i++, j += cn )
        {
            dst0[i] = src[j]; dst1[i] = src[j+1];
            dst2[i] = src[j+2]; dst3[i] = src[j+3];
        }
    }

    // Remaining channels beyond the first k, four at a time (k stepped by 4
    // always lands exactly on cn because of how k was chosen above).
    for( ; k < cn; k += 4 )
    {
        T *dst0 = dst[k], *dst1 = dst[k+1], *dst2 = dst[k+2], *dst3 = dst[k+3];
        for( i = 0, j = k; i < len; i++, j += cn )
        {
            dst0[i] = src[j]; dst1[i] = src[j+1];
            dst2[i] = src[j+2]; dst3[i] = src[j+3];
        }
    }
}
|
||||
|
||||
// HAL entry points for channel splitting, one per element size.  CALL_HAL
// dispatches to an externally-plugged HAL implementation (cv_hal_split*)
// when one is present and returns; otherwise execution falls through to
// the generic template implementation above.

void split8u(const uchar* src, uchar** dst, int len, int cn )
{
    CALL_HAL(split8u, cv_hal_split8u, src,dst, len, cn)
    split_(src, dst, len, cn);
}

void split16u(const ushort* src, ushort** dst, int len, int cn )
{
    CALL_HAL(split16u, cv_hal_split16u, src,dst, len, cn)
    split_(src, dst, len, cn);
}

void split32s(const int* src, int** dst, int len, int cn )
{
    CALL_HAL(split32s, cv_hal_split32s, src,dst, len, cn)
    split_(src, dst, len, cn);
}

void split64s(const int64* src, int64** dst, int len, int cn )
{
    CALL_HAL(split64s, cv_hal_split64s, src,dst, len, cn)
    split_(src, dst, len, cn);
}
|
||||
|
||||
}}
|
@@ -3996,3 +3996,266 @@ cvNorm( const void* imgA, const void* imgB, int normType, const void* maskarr )
|
||||
|
||||
return !maskarr ? cv::norm(a, b, normType) : cv::norm(a, b, normType, mask);
|
||||
}
|
||||
|
||||
namespace cv { namespace hal {
|
||||
|
||||
// Lookup tables for Hamming-distance kernels, indexed by a byte value:
//   popCountTable[x]  — number of set bits in x            (cellSize == 1)
//   popCountTable2[x] — number of 2-bit cells of x with any bit set (cellSize == 2)
//   popCountTable4[x] — number of 4-bit cells of x with any bit set (cellSize == 4)
static const uchar popCountTable[] =
{
    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
};

static const uchar popCountTable2[] =
{
    0, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3,
    1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3,
    1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
    2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
    1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
    2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
    1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
    2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4
};

static const uchar popCountTable4[] =
{
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
};
|
||||
|
||||
// Number of set bits in the n-byte buffer `a` (Hamming weight).
// NEON path: per-byte popcount (vcntq_u8) pairwise-accumulated up to two
// 64-bit lanes; lanes 0 and 2 of the s32 reinterpretation are the low
// 32 bits of each 64-bit partial sum.  Scalar tails use the byte table.
int normHamming(const uchar* a, int n)
{
    int i = 0;
    int result = 0;
#if CV_NEON
    {
        uint32x4_t bits = vmovq_n_u32(0);
        for (; i <= n - 16; i += 16) {
            uint8x16_t A_vec = vld1q_u8 (a + i);
            uint8x16_t bitsSet = vcntq_u8 (A_vec);
            uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
            uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
            bits = vaddq_u32(bits, bitSet4);
        }
        uint64x2_t bitSet2 = vpaddlq_u32 (bits);
        result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0);
        result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2);
    }
#endif
    // Unrolled scalar loop over remaining bytes (all bytes when no NEON).
    for( ; i <= n - 4; i += 4 )
        result += popCountTable[a[i]] + popCountTable[a[i+1]] +
                popCountTable[a[i+2]] + popCountTable[a[i+3]];
    for( ; i < n; i++ )
        result += popCountTable[a[i]];
    return result;
}
|
||||
|
||||
// Hamming distance between two n-byte buffers: popcount of a XOR b.
// Same accumulation scheme as normHamming(a, n), with an extra veorq_u8
// per 16-byte chunk on the NEON path and a per-byte XOR in the tails.
int normHamming(const uchar* a, const uchar* b, int n)
{
    int i = 0;
    int result = 0;
#if CV_NEON
    {
        uint32x4_t bits = vmovq_n_u32(0);
        for (; i <= n - 16; i += 16) {
            uint8x16_t A_vec = vld1q_u8 (a + i);
            uint8x16_t B_vec = vld1q_u8 (b + i);
            uint8x16_t AxorB = veorq_u8 (A_vec, B_vec);
            uint8x16_t bitsSet = vcntq_u8 (AxorB);
            uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
            uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
            bits = vaddq_u32(bits, bitSet4);
        }
        uint64x2_t bitSet2 = vpaddlq_u32 (bits);
        result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0);
        result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2);
    }
#endif
    // Unrolled scalar loop over remaining bytes (all bytes when no NEON).
    for( ; i <= n - 4; i += 4 )
        result += popCountTable[a[i] ^ b[i]] + popCountTable[a[i+1] ^ b[i+1]] +
                popCountTable[a[i+2] ^ b[i+2]] + popCountTable[a[i+3] ^ b[i+3]];
    for( ; i < n; i++ )
        result += popCountTable[a[i] ^ b[i]];
    return result;
}
|
||||
|
||||
int normHamming(const uchar* a, int n, int cellSize)
|
||||
{
|
||||
if( cellSize == 1 )
|
||||
return normHamming(a, n);
|
||||
const uchar* tab = 0;
|
||||
if( cellSize == 2 )
|
||||
tab = popCountTable2;
|
||||
else if( cellSize == 4 )
|
||||
tab = popCountTable4;
|
||||
else
|
||||
return -1;
|
||||
int i = 0;
|
||||
int result = 0;
|
||||
#if CV_ENABLE_UNROLLED
|
||||
for( ; i <= n - 4; i += 4 )
|
||||
result += tab[a[i]] + tab[a[i+1]] + tab[a[i+2]] + tab[a[i+3]];
|
||||
#endif
|
||||
for( ; i < n; i++ )
|
||||
result += tab[a[i]];
|
||||
return result;
|
||||
}
|
||||
|
||||
int normHamming(const uchar* a, const uchar* b, int n, int cellSize)
|
||||
{
|
||||
if( cellSize == 1 )
|
||||
return normHamming(a, b, n);
|
||||
const uchar* tab = 0;
|
||||
if( cellSize == 2 )
|
||||
tab = popCountTable2;
|
||||
else if( cellSize == 4 )
|
||||
tab = popCountTable4;
|
||||
else
|
||||
return -1;
|
||||
int i = 0;
|
||||
int result = 0;
|
||||
#if CV_ENABLE_UNROLLED
|
||||
for( ; i <= n - 4; i += 4 )
|
||||
result += tab[a[i] ^ b[i]] + tab[a[i+1] ^ b[i+1]] +
|
||||
tab[a[i+2] ^ b[i+2]] + tab[a[i+3] ^ b[i+3]];
|
||||
#endif
|
||||
for( ; i < n; i++ )
|
||||
result += tab[a[i] ^ b[i]];
|
||||
return result;
|
||||
}
|
||||
|
||||
// Squared Euclidean distance between two float vectors of length n:
// sum over j of (a[j] - b[j])^2, accumulated in single precision.
// The SSE path processes 8 elements per iteration in two accumulators;
// the scalar loops below it handle the tail (or everything without SSE).
float normL2Sqr_(const float* a, const float* b, int n)
{
    int j = 0; float d = 0.f;
#if CV_SSE
    float CV_DECL_ALIGNED(16) buf[4];
    __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();

    for( ; j <= n - 8; j += 8 )
    {
        __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
        __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
        d0 = _mm_add_ps(d0, _mm_mul_ps(t0, t0));
        d1 = _mm_add_ps(d1, _mm_mul_ps(t1, t1));
    }
    // Horizontal reduce: spill both accumulators and sum the four lanes.
    _mm_store_ps(buf, _mm_add_ps(d0, d1));
    d = buf[0] + buf[1] + buf[2] + buf[3];
#endif
    {
        // Unrolled scalar loop; with SSE enabled j is already past the
        // vectorized region, so this only covers 4-element remainders.
        for( ; j <= n - 4; j += 4 )
        {
            float t0 = a[j] - b[j], t1 = a[j+1] - b[j+1], t2 = a[j+2] - b[j+2], t3 = a[j+3] - b[j+3];
            d += t0*t0 + t1*t1 + t2*t2 + t3*t3;
        }
    }

    // Final scalar tail (fewer than 4 elements remain).
    for( ; j < n; j++ )
    {
        float t = a[j] - b[j];
        d += t*t;
    }
    return d;
}
|
||||
|
||||
|
||||
// L1 distance (sum of absolute differences) between float vectors a and b
// of length n.  SSE clears the IEEE-754 sign bit for a branch-free fabs;
// NEON uses the absolute-difference instruction; the scalar loops pick up
// whatever the SIMD paths did not consume.
float normL1_(const float* a, const float* b, int n)
{
    int k = 0; float acc = 0.f;
#if CV_SSE
    float CV_DECL_ALIGNED(16) tmp[4];
    // All-ones-except-sign-bit mask: ANDing clears the sign, i.e. fabs.
    static const int CV_DECL_ALIGNED(16) signMask[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
    __m128 s0 = _mm_setzero_ps(), s1 = _mm_setzero_ps();
    __m128 absmask = _mm_load_ps((const float*)signMask);

    for( ; k <= n - 8; k += 8 )
    {
        __m128 u = _mm_sub_ps(_mm_loadu_ps(a + k), _mm_loadu_ps(b + k));
        __m128 v = _mm_sub_ps(_mm_loadu_ps(a + k + 4), _mm_loadu_ps(b + k + 4));
        s0 = _mm_add_ps(s0, _mm_and_ps(u, absmask));
        s1 = _mm_add_ps(s1, _mm_and_ps(v, absmask));
    }
    _mm_store_ps(tmp, _mm_add_ps(s0, s1));
    acc = tmp[0] + tmp[1] + tmp[2] + tmp[3];
#elif CV_NEON
    float32x4_t v_sum = vdupq_n_f32(0.0f);
    for( ; k <= n - 4; k += 4 )
        v_sum = vaddq_f32(v_sum, vabdq_f32(vld1q_f32(a + k), vld1q_f32(b + k)));

    float CV_DECL_ALIGNED(16) tmp[4];
    vst1q_f32(tmp, v_sum);
    acc = tmp[0] + tmp[1] + tmp[2] + tmp[3];
#endif
    // Scalar 4-way unrolled cleanup (no-op after the NEON loop above).
    for( ; k <= n - 4; k += 4 )
    {
        acc += std::abs(a[k] - b[k]) + std::abs(a[k+1] - b[k+1]) +
               std::abs(a[k+2] - b[k+2]) + std::abs(a[k+3] - b[k+3]);
    }

    // Per-element tail.
    for( ; k < n; k++ )
        acc += std::abs(a[k] - b[k]);
    return acc;
}
|
||||
|
||||
// L1 distance (sum of absolute differences) between two uchar buffers of
// length n.  The SSE path uses PSADBW (_mm_sad_epu8), which sums |a-b|
// over each 8-byte half into the low/high 64-bit lanes; the NEON path
// widens 8-bit absolute differences up to 32-bit accumulators.  Scalar
// loops below handle whatever the SIMD paths did not consume.
int normL1_(const uchar* a, const uchar* b, int n)
{
    int j = 0, d = 0;
#if CV_SSE
    __m128i d0 = _mm_setzero_si128();

    for( ; j <= n - 16; j += 16 )
    {
        __m128i t0 = _mm_loadu_si128((const __m128i*)(a + j));
        __m128i t1 = _mm_loadu_si128((const __m128i*)(b + j));

        d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
    }

    // 4-byte chunks of the tail, still via PSADBW on a mostly-zero vector.
    for( ; j <= n - 4; j += 4 )
    {
        __m128i t0 = _mm_cvtsi32_si128(*(const int*)(a + j));
        __m128i t1 = _mm_cvtsi32_si128(*(const int*)(b + j));

        d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
    }
    // Fold the two 64-bit SAD lanes together and extract the low 32 bits.
    d = _mm_cvtsi128_si32(_mm_add_epi32(d0, _mm_unpackhi_epi64(d0, d0)));
#elif CV_NEON
    // Fix: was vdupq_n_u32(0.0f) — a float literal implicitly converted to
    // initialize an integer vector; use an unsigned integer zero instead.
    uint32x4_t v_sum = vdupq_n_u32(0u);
    for ( ; j <= n - 16; j += 16)
    {
        uint8x16_t v_dst = vabdq_u8(vld1q_u8(a + j), vld1q_u8(b + j));
        uint16x8_t v_low = vmovl_u8(vget_low_u8(v_dst)), v_high = vmovl_u8(vget_high_u8(v_dst));
        v_sum = vaddq_u32(v_sum, vaddl_u16(vget_low_u16(v_low), vget_low_u16(v_high)));
        v_sum = vaddq_u32(v_sum, vaddl_u16(vget_high_u16(v_low), vget_high_u16(v_high)));
    }

    uint CV_DECL_ALIGNED(16) buf[4];
    vst1q_u32(buf, v_sum);
    d = buf[0] + buf[1] + buf[2] + buf[3];
#endif
    {
        // Scalar 4-way unrolled cleanup; uchar operands promote to int,
        // so a[j] - b[j] cannot wrap.
        for( ; j <= n - 4; j += 4 )
        {
            d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
                    std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
        }
    }
    // Per-element tail.
    for( ; j < n; j++ )
        d += std::abs(a[j] - b[j]);
    return d;
}
|
||||
|
||||
}} //cv::hal
|
||||
|
@@ -86,6 +86,45 @@ Mutex* __initialization_mutex_initializer = &getInitializationMutex();
|
||||
#undef max
|
||||
#undef abs
|
||||
#include <tchar.h>
|
||||
#if defined _MSC_VER
|
||||
#if _MSC_VER >= 1400
|
||||
#include <intrin.h>
|
||||
#elif defined _M_IX86
|
||||
// Fallback for the MSVC __cpuid intrinsic on old 32-bit compilers
// (_MSC_VER < 1400): executes CPUID and stores EAX,EBX,ECX,EDX into
// cpuid_data[0..3].
// NOTE(review): the second parameter (the requested leaf) is ignored —
// the leaf is hard-coded to 1, which is the only value callers pass here.
static void __cpuid(int* cpuid_data, int)
{
    __asm
    {
        push ebx                 // EBX is callee-saved; CPUID clobbers it
        push edi
        mov edi, cpuid_data      // EDI -> output array
        mov eax, 1               // leaf 1: processor info and feature bits
        cpuid
        mov [edi], eax
        mov [edi + 4], ebx
        mov [edi + 8], ecx
        mov [edi + 12], edx
        pop edi
        pop ebx
    }
}
|
||||
// Fallback for the MSVC __cpuidex intrinsic on old 32-bit compilers:
// executes CPUID with an explicit subleaf and stores EAX,EBX,ECX,EDX into
// cpuid_data[0..3].
// NOTE(review): both parameters (leaf, subleaf) are ignored — hard-coded to
// leaf 7 / subleaf 0 (extended features), the only values used below.
// NOTE(review): unlike __cpuid above, EBX is not saved/restored here even
// though CPUID clobbers it — verify this is safe for the callers involved.
static void __cpuidex(int* cpuid_data, int, int)
{
    __asm
    {
        push edi
        mov edi, cpuid_data      // EDI -> output array
        mov eax, 7               // leaf 7: structured extended feature flags
        mov ecx, 0               // subleaf 0
        cpuid
        mov [edi], eax
        mov [edi + 4], ebx
        mov [edi + 8], ecx
        mov [edi + 12], edx
        pop edi
    }
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef WINRT
|
||||
#include <wrl/client.h>
|
||||
@@ -198,15 +237,154 @@ void Exception::formatMessage()
|
||||
msg = format("%s:%d: error: (%d) %s\n", file.c_str(), line, code, err.c_str());
|
||||
}
|
||||
|
||||
// Cached description of the host CPU: the x86 family id plus one boolean
// per CV_CPU_* feature.  Filled in once by initialize() and queried via
// checkHardwareSupport(); a default-constructed instance (all features
// false) serves as the "optimizations disabled" state.
struct HWFeatures
{
    enum { MAX_FEATURE = CV_HARDWARE_MAX_FEATURE };

    // All features off, family unknown.
    HWFeatures(void)
    {
        memset( have, 0, sizeof(have) );
        x86_family = 0;
    }

    // Probes the CPU (CPUID on x86, /proc/self/auxv or compile-time macros
    // on ARM) and returns the populated feature table.
    static HWFeatures initialize(void)
    {
        HWFeatures f;
        int cpuid_data[4] = { 0, 0, 0, 0 };

        // --- CPUID leaf 1: basic feature bits (results land in
        // cpuid_data[0..3] = EAX,EBX,ECX,EDX) ---
    #if defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
        __cpuid(cpuid_data, 1);
    #elif defined __GNUC__ && (defined __i386__ || defined __x86_64__)
        #ifdef __x86_64__
        asm __volatile__
        (
         "movl $1, %%eax\n\t"
         "cpuid\n\t"
         :[eax]"=a"(cpuid_data[0]),[ebx]"=b"(cpuid_data[1]),[ecx]"=c"(cpuid_data[2]),[edx]"=d"(cpuid_data[3])
         :
         : "cc"
        );
        #else
        // 32-bit PIC code may reserve EBX, so save/restore it by hand
        // instead of declaring it as an asm output.
        asm volatile
        (
         "pushl %%ebx\n\t"
         "movl $1,%%eax\n\t"
         "cpuid\n\t"
         "popl %%ebx\n\t"
         : "=a"(cpuid_data[0]), "=c"(cpuid_data[2]), "=d"(cpuid_data[3])
         :
         : "cc"
        );
        #endif
    #endif

        f.x86_family = (cpuid_data[0] >> 8) & 15;  // family id: EAX bits 8..11
        if( f.x86_family >= 6 )
        {
            // Leaf-1 feature bits: EDX = cpuid_data[3], ECX = cpuid_data[2].
            f.have[CV_CPU_MMX] = (cpuid_data[3] & (1 << 23)) != 0;
            f.have[CV_CPU_SSE] = (cpuid_data[3] & (1<<25)) != 0;
            f.have[CV_CPU_SSE2] = (cpuid_data[3] & (1<<26)) != 0;
            f.have[CV_CPU_SSE3] = (cpuid_data[2] & (1<<0)) != 0;
            f.have[CV_CPU_SSSE3] = (cpuid_data[2] & (1<<9)) != 0;
            f.have[CV_CPU_FMA3] = (cpuid_data[2] & (1<<12)) != 0;
            f.have[CV_CPU_SSE4_1] = (cpuid_data[2] & (1<<19)) != 0;
            f.have[CV_CPU_SSE4_2] = (cpuid_data[2] & (1<<20)) != 0;
            f.have[CV_CPU_POPCNT] = (cpuid_data[2] & (1<<23)) != 0;
            f.have[CV_CPU_AVX] = (((cpuid_data[2] & (1<<28)) != 0)&&((cpuid_data[2] & (1<<27)) != 0));//OS uses XSAVE_XRSTORE and CPU support AVX

            // make the second call to the cpuid command in order to get
            // information about extended features like AVX2
    #if defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
            __cpuidex(cpuid_data, 7, 0);
    #elif defined __GNUC__ && (defined __i386__ || defined __x86_64__)
        #ifdef __x86_64__
            asm __volatile__
            (
             "movl $7, %%eax\n\t"
             "movl $0, %%ecx\n\t"
             "cpuid\n\t"
             :[eax]"=a"(cpuid_data[0]),[ebx]"=b"(cpuid_data[1]),[ecx]"=c"(cpuid_data[2]),[edx]"=d"(cpuid_data[3])
             :
             : "cc"
            );
        #else
            // NOTE(review): CPUID modifies EAX and EDX here but they are
            // declared neither as outputs nor as clobbers — verify this
            // constraint list against callers before reuse.
            asm volatile
            (
             "pushl %%ebx\n\t"
             "movl $7,%%eax\n\t"
             "movl $0,%%ecx\n\t"
             "cpuid\n\t"
             "movl %%ebx, %0\n\t"
             "popl %%ebx\n\t"
             : "=r"(cpuid_data[1]), "=c"(cpuid_data[2])
             :
             : "cc"
            );
        #endif
    #endif
            // Leaf-7 bits: EBX = cpuid_data[1], ECX = cpuid_data[2].
            f.have[CV_CPU_AVX2] = (cpuid_data[1] & (1<<5)) != 0;

            f.have[CV_CPU_AVX_512F] = (cpuid_data[1] & (1<<16)) != 0;
            f.have[CV_CPU_AVX_512DQ] = (cpuid_data[1] & (1<<17)) != 0;
            f.have[CV_CPU_AVX_512IFMA512] = (cpuid_data[1] & (1<<21)) != 0;
            f.have[CV_CPU_AVX_512PF] = (cpuid_data[1] & (1<<26)) != 0;
            f.have[CV_CPU_AVX_512ER] = (cpuid_data[1] & (1<<27)) != 0;
            f.have[CV_CPU_AVX_512CD] = (cpuid_data[1] & (1<<28)) != 0;
            f.have[CV_CPU_AVX_512BW] = (cpuid_data[1] & (1<<30)) != 0;
            // NOTE(review): (1<<31) shifts into the sign bit of a signed
            // int, which is undefined behavior in C++ — consider 1u<<31.
            f.have[CV_CPU_AVX_512VL] = (cpuid_data[1] & (1<<31)) != 0;
            f.have[CV_CPU_AVX_512VBMI] = (cpuid_data[2] & (1<<1)) != 0;
        }

    #if defined ANDROID || defined __linux__
    #ifdef __aarch64__
        // NEON (ASIMD) is mandatory on AArch64.
        f.have[CV_CPU_NEON] = true;
    #else
        // 32-bit ARM Linux: scan the ELF auxiliary vector for AT_HWCAP.
        int cpufile = open("/proc/self/auxv", O_RDONLY);

        if (cpufile >= 0)
        {
            Elf32_auxv_t auxv;
            const size_t size_auxv_t = sizeof(auxv);

            while ((size_t)read(cpufile, &auxv, size_auxv_t) == size_auxv_t)
            {
                if (auxv.a_type == AT_HWCAP)
                {
                    // Bit 12 (4096) — presumably HWCAP_NEON; verify
                    // against <asm/hwcap.h> for the target kernel.
                    f.have[CV_CPU_NEON] = (auxv.a_un.a_val & 4096) != 0;
                    break;
                }
            }

            close(cpufile);
        }
    #endif
    #elif (defined __clang__ || defined __APPLE__) && (defined __ARM_NEON__ || (defined __ARM_NEON && defined __aarch64__))
        // Compile-time NEON availability on Apple/clang ARM targets.
        f.have[CV_CPU_NEON] = true;
    #endif

        return f;
    }

    int x86_family;             // x86 family id from CPUID leaf 1 (0 if unknown)
    bool have[MAX_FEATURE+1];   // indexed by CV_CPU_* feature ids
};
|
||||
|
||||
// Feature sets captured once at startup.  'featuresDisabled' is the
// default-constructed (all-false) set, selected when optimizations are
// turned off; 'currentFeatures' points at whichever set is active.
static HWFeatures featuresEnabled = HWFeatures::initialize(), featuresDisabled = HWFeatures();
static HWFeatures* currentFeatures = &featuresEnabled;
|
||||
|
||||
bool checkHardwareSupport(int feature)
|
||||
{
|
||||
CV_DbgAssert( 0 <= feature && feature <= CV_HARDWARE_MAX_FEATURE );
|
||||
return cv::hal::checkHardwareSupport(feature);
|
||||
return currentFeatures->have[feature];
|
||||
}
|
||||
|
||||
|
||||
// Legacy global mirror of the "use optimized code paths" switch; kept in
// sync by setUseOptimized().  NOTE(review): 'volatile' does not provide
// thread synchronization — confirm single-threaded access is assumed.
volatile bool useOptimizedFlag = true;
|
||||
|
||||
void setUseOptimized( bool flag )
|
||||
{
|
||||
cv::hal::setUseOptimized(flag);
|
||||
useOptimizedFlag = flag;
|
||||
currentFeatures = flag ? &featuresEnabled : &featuresDisabled;
|
||||
|
||||
ipp::setUseIPP(flag);
|
||||
#ifdef HAVE_OPENCL
|
||||
@@ -219,7 +397,7 @@ void setUseOptimized( bool flag )
|
||||
|
||||
bool useOptimized(void)
|
||||
{
|
||||
return cv::hal::useOptimized();
|
||||
return useOptimizedFlag;
|
||||
}
|
||||
|
||||
int64 getTickCount(void)
|
||||
@@ -499,12 +677,12 @@ redirectError( CvErrorCallback errCallback, void* userdata, void** prevUserdata)
|
||||
// C-API wrapper: reports whether the given CV_CPU_* feature is available,
// delegating to the HAL layer.
CV_IMPL int cvCheckHardwareSupport(int feature)
{
    CV_DbgAssert( 0 <= feature && feature <= CV_HARDWARE_MAX_FEATURE );
    // Removed: an unreachable 'return cv::currentFeatures->have[feature];'
    // that followed this statement (dead code left over from a merge).
    return cv::hal::checkHardwareSupport(feature);
}
|
||||
|
||||
// C-API wrapper: sets the "use optimized code paths" flag and returns the
// previous value.
CV_IMPL int cvUseOptimized( int flag )
{
    // Fixed: the block contained a second, conflicting declaration
    // 'int prevMode = cv::useOptimizedFlag;' (a redefinition that would
    // not compile); the HAL-backed query is kept.
    int prevMode = cv::useOptimized();
    cv::setUseOptimized( flag != 0 );
    return prevMode;
}
|
||||
|
Reference in New Issue
Block a user