Merge remote-tracking branch 'master' into stitch-fix

* 'master' of github.com:itseez/opencv: (82 commits) moved part of video to contrib/{outflow, bgsegm}; moved matlab to contrib added some basic functionality needed by the new face module (moved from the old "contrib") moved to the new opencv_contrib/face module fixed various warnings and obvious errors reported by clang compiler and the coverity tool. Fixed review comment from Vadim Pisarevsky modified farneback sample to use T-API ECC patch by the author (G. Evangelidis); fixed some OCL Farneback optical flow test failures on Mac small fix for GaussianBlur ocl test fix binary package build small fix for ocl_resize fix IOS framework fixed test ocl_MatchTemplate for sparse matrix Fixed typos fixing error, wrong template method param. fixing Mac build some formal changes (generally adding constness) Fixed choice of kercn and rowsPerWI for non-Intel device. fixed nDiffs for CalcBackProject fixed tests for ocl_filter2d, ocl_matchTemplate, ocl_histogram.cpp Fixed issue: Mat::copyTo(UMat) if device copy is obsolete. Added test. ... Conflicts: modules/core/include/opencv2/core/mat.inl.hpp
2014-08-11 14:50:08 +04:00
parent f908c2b0e3 07744af790
commit c54b8f9b00
346 changed files with 26116 additions and 422526 deletions
--- a/modules/core/CMakeLists.txt
+++ b/modules/core/CMakeLists.txt
@@ -1,6 +1,5 @@
 set(the_description "The Core Functionality")
 ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES} "${OPENCL_LIBRARIES}" OPTIONAL opencv_cudev)
-ocv_module_include_directories(${ZLIB_INCLUDE_DIRS})

 if(HAVE_WINRT_CX)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /ZW")
@@ -19,11 +18,11 @@ file(GLOB lib_cuda_hdrs_detail "include/opencv2/${name}/cuda/detail/*.hpp" "incl
 source_group("Cuda Headers"         FILES ${lib_cuda_hdrs})
 source_group("Cuda Headers\\Detail" FILES ${lib_cuda_hdrs_detail})

-ocv_glob_module_sources(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc"
+ocv_glob_module_sources(SOURCES "${OPENCV_MODULE_opencv_core_BINARY_DIR}/version_string.inc"
                        HEADERS ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail})

+ocv_module_include_directories(${the_module} ${ZLIB_INCLUDE_DIRS})
 ocv_create_module()
-ocv_add_precompiled_headers(${the_module})

 ocv_add_accuracy_tests()
 ocv_add_perf_tests()
--- a/modules/core/doc/basic_structures.rst
+++ b/modules/core/doc/basic_structures.rst
@@ -845,7 +845,6 @@ For convenience, the following types from the OpenCV C API already have such a s
 that calls the appropriate release function:

 * ``CvCapture``
-* :ocv:struct:`CvDTreeSplit`
 * :ocv:struct:`CvFileStorage`
 * ``CvHaarClassifierCascade``
 * :ocv:struct:`CvMat`
@@ -2326,6 +2325,69 @@ Returns the matrix iterator and sets it to the after-last matrix element.

 The methods return the matrix read-only or read-write iterators, set to the point following the last matrix element.

+
+Mat::forEach
+------------
+Runs the given functor over all matrix elements.
+
+.. ocv:function:: template<typename _Tp, typename Functor> void Mat::forEach(const Functor& operation)
+
+.. ocv:function:: template<typename _Tp, typename Functor> void Mat::forEach(const Functor& operation) const
+
+The method runs the operation in parallel. The operation is passed as an argument and has to be a function pointer, a function object or a lambda (C++11).
+
+All of the operations below are equivalent: they put 0xFF into the first channel of every matrix element. ::
+
+    Mat image(1920, 1080, CV_8UC3);
+    typedef cv::Point3_<uint8_t> Pixel;
+
+    // first. raw pointer access.
+    for (int r = 0; r < image.rows; ++r) {
+        Pixel* ptr = image.ptr<Pixel>(r, 0);
+        const Pixel* ptr_end = ptr + image.cols;
+        for (; ptr != ptr_end; ++ptr) {
+            ptr->x = 255;
+        }
+    }
+
+
+    // Using MatIterator. (Simple, but the iterator adds overhead.)
+    for (Pixel &p : cv::Mat_<Pixel>(image)) {
+        p.x = 255;
+    }
+
+
+    // Parallel execution with function object.
+    struct Operator {
+        void operator ()(Pixel &pixel, const int * position) {
+            pixel.x = 255;
+        }
+    };
+    image.forEach<Pixel>(Operator());
+
+
+    // Parallel execution using C++11 lambda.
+    image.forEach<Pixel>([](Pixel &p, const int * position) -> void {
+        p.x = 255;
+    });
+
+The position parameter is the index of the current pixel. ::
+
+    // Create a 3D matrix (255 x 255 x 255) of uint8_t triplets
+    //  and initialize every element with the value of its own position,
+    //  i.e. the pixel at (x,y,z) = (1,2,3) gets (b,g,r) = (1,2,3).
+
+    int sizes[] = { 255, 255, 255 };
+    typedef cv::Point3_<uint8_t> Pixel;
+
+    Mat_<Pixel> image = Mat::zeros(3, sizes, CV_8UC3);
+
+    image.forEach([&](Pixel& pixel, const int position[]) -> void {
+        pixel.x = position[0];
+        pixel.y = position[1];
+        pixel.z = position[2];
+    });
+
 Mat\_
 -----
 .. ocv:class:: Mat_
--- a/modules/core/include/opencv2/core.hpp
+++ b/modules/core/include/opencv2/core.hpp
@@ -690,7 +690,61 @@ public:
    Mat mean; //!< mean value subtracted before the projection and added after the back projection
 };

+// Linear Discriminant Analysis (LDA), with projection into and reconstruction from the LDA subspace.
+class CV_EXPORTS LDA
+{
+public:
+    // Initializes an LDA with num_components (default 0).
+    // NOTE(review): no constructor takes a dataAsRow argument although _dataAsRow is a member - confirm.
+    explicit LDA(int num_components = 0);

+    // Initializes and performs a Discriminant Analysis with Fisher's
+    // Optimization Criterion on the given data in src and the corresponding
+    // labels in labels. If 0 (or fewer) components are requested, their
+    // number is determined automatically from the given data.
+    LDA(InputArrayOfArrays src, InputArray labels, int num_components = 0);
+
+    // Serializes this object to a given filename.
+    void save(const String& filename) const;
+
+    // Deserializes this object from a given filename.
+    void load(const String& filename);
+
+    // Serializes this object to a given cv::FileStorage.
+    void save(FileStorage& fs) const;
+
+    // Deserializes this object from a given cv::FileStorage.
+    void load(const FileStorage& node);
+
+    // Destructor.
+    ~LDA();
+
+    //! Computes the discriminants for the data in src and the labels in labels.
+    void compute(InputArrayOfArrays src, InputArray labels);
+
+    // Projects samples into the LDA subspace.
+    Mat project(InputArray src);
+
+    // Reconstructs projections from the LDA subspace.
+    Mat reconstruct(InputArray src);
+
+    // Returns the eigenvectors of this LDA (a Mat header copy of _eigenvectors).
+    Mat eigenvectors() const { return _eigenvectors; }
+
+    // Returns the eigenvalues of this LDA (a Mat header copy of _eigenvalues).
+    Mat eigenvalues() const { return _eigenvalues; }
+
+    static Mat subspaceProject(InputArray W, InputArray mean, InputArray src); // NOTE(review): presumably projects src using basis W and mean - confirm
+    static Mat subspaceReconstruct(InputArray W, InputArray mean, InputArray src); // NOTE(review): presumably the inverse of subspaceProject - confirm
+
+protected:
+    bool _dataAsRow; // NOTE(review): sample layout flag; not settable through any constructor declared here
+    int _num_components; // requested number of components (see the constructors)
+    Mat _eigenvectors; // exposed through eigenvectors()
+    Mat _eigenvalues; // exposed through eigenvalues()
+
+    void lda(InputArrayOfArrays src, InputArray labels); // NOTE(review): presumably the worker behind compute() - confirm
+};

 /*!
    Singular Value Decomposition class
--- a/modules/core/include/opencv2/core/cuda.hpp
+++ b/modules/core/include/opencv2/core/cuda.hpp
@@ -261,8 +261,8 @@ public:
    int* refcount;

    //! helper fields used in locateROI and adjustROI
-    uchar* datastart;
-    uchar* dataend;
+    const uchar* datastart;
+    const uchar* dataend;

    //! allocator
    Allocator* allocator;
@@ -349,8 +349,8 @@ public:
    uchar* data;
    int* refcount;

-    uchar* datastart;
-    uchar* dataend;
+    const uchar* datastart;
+    const uchar* dataend;

    AllocType alloc_type;
 };
--- a/modules/core/include/opencv2/core/mat.hpp
+++ b/modules/core/include/opencv2/core/mat.hpp
@@ -395,7 +395,7 @@ struct CV_EXPORTS UMatData

 struct CV_EXPORTS UMatDataAutoLock
 {
-    UMatDataAutoLock(UMatData* u);
+    explicit UMatDataAutoLock(UMatData* u);
    ~UMatDataAutoLock();
    UMatData* u;
 };
@@ -403,7 +403,7 @@ struct CV_EXPORTS UMatDataAutoLock

 struct CV_EXPORTS MatSize
 {
-    MatSize(int* _p);
+    explicit MatSize(int* _p);
    Size operator()() const;
    const int& operator[](int i) const;
    int& operator[](int i);
@@ -417,7 +417,7 @@ struct CV_EXPORTS MatSize
 struct CV_EXPORTS MatStep
 {
    MatStep();
-    MatStep(size_t s);
+    explicit MatStep(size_t s);
    const size_t& operator[](int i) const;
    size_t& operator[](int i);
    operator size_t() const;
@@ -900,6 +900,11 @@ public:
    template<typename _Tp> MatConstIterator_<_Tp> begin() const;
    template<typename _Tp> MatConstIterator_<_Tp> end() const;

+    //! template methods for operation over all matrix elements.
+    // the operations take care of skipping gaps in the end of rows (if any)
+    template<typename _Tp, typename Functor> void forEach(const Functor& operation);
+    template<typename _Tp, typename Functor> void forEach(const Functor& operation) const;
+
    enum { MAGIC_VAL  = 0x42FF0000, AUTO_STEP = 0, CONTINUOUS_FLAG = CV_MAT_CONT_FLAG, SUBMATRIX_FLAG = CV_SUBMAT_FLAG };
    enum { MAGIC_MASK = 0xFFFF0000, TYPE_MASK = 0x00000FFF, DEPTH_MASK = 7 };

@@ -918,9 +923,9 @@ public:
    uchar* data;

    //! helper fields used in locateROI and adjustROI
-    uchar* datastart;
-    uchar* dataend;
-    uchar* datalimit;
+    const uchar* datastart;
+    const uchar* dataend;
+    const uchar* datalimit;

    //! custom allocator
    MatAllocator* allocator;
@@ -934,6 +939,7 @@ public:
    MatStep step;

 protected:
+    template<typename _Tp, typename Functor> void forEach_impl(const Functor& operation);
 };


@@ -1043,6 +1049,11 @@ public:
    const_iterator begin() const;
    const_iterator end() const;

+    //! template methods for operation over all matrix elements.
+    // the operations take care of skipping gaps in the end of rows (if any)
+    template<typename Functor> void forEach(const Functor& operation);
+    template<typename Functor> void forEach(const Functor& operation) const;
+
    //! equivalent to Mat::create(_rows, _cols, DataType<_Tp>::type)
    void create(int _rows, int _cols);
    //! equivalent to Mat::create(_size, DataType<_Tp>::type)
@@ -1804,9 +1815,9 @@ public:
    //! copy operator
    MatConstIterator& operator = (const MatConstIterator& it);
    //! returns the current matrix element
-    uchar* operator *() const;
+    const uchar* operator *() const;
    //! returns the i-th matrix element, relative to the current
-    uchar* operator [](ptrdiff_t i) const;
+    const uchar* operator [](ptrdiff_t i) const;

    //! shifts the iterator forward by the specified number of elements
    MatConstIterator& operator += (ptrdiff_t ofs);
@@ -1831,9 +1842,9 @@ public:

    const Mat* m;
    size_t elemSize;
-    uchar* ptr;
-    uchar* sliceStart;
-    uchar* sliceEnd;
+    const uchar* ptr;
+    const uchar* sliceStart;
+    const uchar* sliceEnd;
 };


@@ -1917,9 +1928,9 @@ public:
    //! constructor that sets the iterator to the specified element of the matrix
    MatIterator_(Mat_<_Tp>* _m, int _row, int _col=0);
    //! constructor that sets the iterator to the specified element of the matrix
-    MatIterator_(const Mat_<_Tp>* _m, Point _pt);
+    MatIterator_(Mat_<_Tp>* _m, Point _pt);
    //! constructor that sets the iterator to the specified element of the matrix
-    MatIterator_(const Mat_<_Tp>* _m, const int* _idx);
+    MatIterator_(Mat_<_Tp>* _m, const int* _idx);
    //! copy constructor
    MatIterator_(const MatIterator_& it);
    //! copy operator
--- a/modules/core/include/opencv2/core/mat.inl.hpp
+++ b/modules/core/include/opencv2/core/mat.inl.hpp
@@ -438,7 +438,7 @@ Mat::Mat(const std::vector<_Tp>& vec, bool copyData)
    if( !copyData )
    {
        step[0] = step[1] = sizeof(_Tp);
-        data = datastart = (uchar*)&vec[0];
+        datastart = data = (uchar*)&vec[0];
        datalimit = dataend = datastart + rows * step[0];
    }
    else
@@ -453,7 +453,7 @@ Mat::Mat(const Vec<_Tp, n>& vec, bool copyData)
    if( !copyData )
    {
        step[0] = step[1] = sizeof(_Tp);
-        data = datastart = (uchar*)vec.val;
+        datastart = data = (uchar*)vec.val;
        datalimit = dataend = datastart + rows * step[0];
    }
    else
@@ -470,7 +470,7 @@ Mat::Mat(const Matx<_Tp,m,n>& M, bool copyData)
    {
        step[0] = cols * sizeof(_Tp);
        step[1] = sizeof(_Tp);
-        data = datastart = (uchar*)M.val;
+        datastart = data = (uchar*)M.val;
        datalimit = dataend = datastart + rows * step[0];
    }
    else
@@ -485,7 +485,7 @@ Mat::Mat(const Point_<_Tp>& pt, bool copyData)
    if( !copyData )
    {
        step[0] = step[1] = sizeof(_Tp);
-        data = datastart = (uchar*)&pt.x;
+        datastart = data = (uchar*)&pt.x;
        datalimit = dataend = datastart + rows * step[0];
    }
    else
@@ -504,7 +504,7 @@ Mat::Mat(const Point3_<_Tp>& pt, bool copyData)
    if( !copyData )
    {
        step[0] = step[1] = sizeof(_Tp);
-        data = datastart = (uchar*)&pt.x;
+        datastart = data = (uchar*)&pt.x;
        datalimit = dataend = datastart + rows * step[0];
    }
    else
@@ -642,7 +642,7 @@ inline void Mat::release()
    if( u && CV_XADD(&u->refcount, -1) == 1 )
        deallocate();
    u = NULL;
-    data = datastart = dataend = datalimit = 0;
+    datastart = dataend = datalimit = data = 0;
    for(int i = 0; i < dims; i++)
        size.p[i] = 0;
 }
@@ -1000,6 +1000,17 @@ MatIterator_<_Tp> Mat::end()
    return it;
 }

+template<typename _Tp, typename Functor> inline
+void Mat::forEach(const Functor& operation) {
+    this->forEach_impl<_Tp>(operation); // forEach_impl performs the traversal, viewing elements as _Tp
+}
+
+template<typename _Tp, typename Functor> inline
+void Mat::forEach(const Functor& operation) const {
+    // Reuse the non-const traversal; elements are handed to the functor as const _Tp.
+    (const_cast<Mat*>(this))->forEach<const _Tp>(operation);
+}
+
 template<typename _Tp> inline
 Mat::operator std::vector<_Tp>() const
 {
@@ -1045,7 +1056,7 @@ void Mat::push_back(const _Tp& elem)
    }
    CV_Assert(DataType<_Tp>::type == type() && cols == 1
              /* && dims == 2 (cols == 1 implies dims == 2) */);
-    uchar* tmp = dataend + step[0];
+    const uchar* tmp = dataend + step[0];
    if( !isSubmatrix() && isContinuous() && tmp <= datalimit )
    {
        *(_Tp*)(data + (size.p[0]++) * step.p[0]) = elem;
@@ -1585,6 +1596,15 @@ MatIterator_<_Tp> Mat_<_Tp>::end()
    return Mat::end<_Tp>();
 }

+template<typename _Tp> template<typename Functor> inline
+void Mat_<_Tp>::forEach(const Functor& operation) {
+    Mat::forEach<_Tp>(operation); // element type comes from this Mat_; Functor is deduced from the argument
+}
+
+template<typename _Tp> template<typename Functor> inline
+void Mat_<_Tp>::forEach(const Functor& operation) const {
+    Mat::forEach<_Tp>(operation); // const overload of Mat::forEach is selected; Functor is deduced
+}

 ///////////////////////////// SparseMat /////////////////////////////

@@ -2149,7 +2169,7 @@ MatConstIterator& MatConstIterator::operator = (const MatConstIterator& it )
 }

 inline
-uchar* MatConstIterator::operator *() const
+const uchar* MatConstIterator::operator *() const
 {
    return ptr;
 }
@@ -2282,7 +2302,7 @@ MatConstIterator operator - (const MatConstIterator& a, ptrdiff_t ofs)


 inline
-uchar* MatConstIterator::operator [](ptrdiff_t i) const
+const uchar* MatConstIterator::operator [](ptrdiff_t i) const
 {
    return *(*this + i);
 }
@@ -2454,12 +2474,12 @@ MatIterator_<_Tp>::MatIterator_(Mat_<_Tp>* _m, int _row, int _col)
 {}

 template<typename _Tp> inline
-MatIterator_<_Tp>::MatIterator_(const Mat_<_Tp>* _m, Point _pt)
+MatIterator_<_Tp>::MatIterator_(Mat_<_Tp>* _m, Point _pt)
    : MatConstIterator_<_Tp>(_m, _pt)
 {}

 template<typename _Tp> inline
-MatIterator_<_Tp>::MatIterator_(const Mat_<_Tp>* _m, const int* _idx)
+MatIterator_<_Tp>::MatIterator_(Mat_<_Tp>* _m, const int* _idx)
    : MatConstIterator_<_Tp>(_m, _idx)
 {}

@@ -2593,7 +2613,7 @@ inline SparseMatConstIterator& SparseMatConstIterator::operator = (const SparseM
 template<typename _Tp> inline
 const _Tp& SparseMatConstIterator::value() const
 {
-    return *(_Tp*)ptr;
+    return *(const _Tp*)ptr;
 }

 inline
--- a/modules/core/include/opencv2/core/ocl.hpp
+++ b/modules/core/include/opencv2/core/ocl.hpp
@@ -636,6 +636,9 @@ protected:

 CV_EXPORTS MatAllocator* getOpenCLAllocator();

+CV_EXPORTS_W bool isPerformanceCheckBypassed();
+#define OCL_PERFORMANCE_CHECK(condition) (cv::ocl::isPerformanceCheckBypassed() || (condition))
+
 }}

 #endif
--- a/modules/core/include/opencv2/core/utility.hpp
+++ b/modules/core/include/opencv2/core/utility.hpp
@@ -274,6 +274,102 @@ public:

 CV_EXPORTS void parallel_for_(const Range& range, const ParallelLoopBody& body, double nstripes=-1.);

+/////////////////////////////// forEach method of cv::Mat ////////////////////////////
+template<typename _Tp, typename Functor> inline
+void Mat::forEach_impl(const Functor& operation) { // calls `operation` on every element via parallel_for_, one task unit per innermost line
+    if (false) {
+        operation(*reinterpret_cast<_Tp*>(0), reinterpret_cast<int*>(NULL));
+        // Never executed: this call only exists so that a functor with the wrong
+        // signature fails to compile right here. The functor must be callable as
+        //     (_Tp&, const int*)   <- element and its multidimensional index
+        //  or (_Tp&, void*)        <- when the current index is not needed.
+    }
+    if (this->empty()) return; // nothing to visit; also keeps the divisions below away from a zero-sized last dimension
+    CV_Assert(this->total() / this->size[this->dims - 1] <= INT_MAX);
+    const int LINES = static_cast<int>(this->total() / this->size[this->dims - 1]);
+
+    class PixelOperationWrapper :public ParallelLoopBody
+    {
+    public:
+        PixelOperationWrapper(Mat_<_Tp>* const frame, const Functor& _operation)
+            : mat(frame), op(_operation) {}
+        virtual ~PixelOperationWrapper(){}
+        // Overloaded virtual operator:
+        // converts a range of line numbers into one call per line.
+        virtual void operator()(const Range &range) const {
+            const int DIMS = mat->dims;
+            const int COLS = mat->size[DIMS - 1];
+            if (DIMS <= 2) {
+                for (int row = range.start; row < range.end; ++row) {
+                    this->rowCall2(row, COLS);
+                }
+            } else {
+                std::vector<int> idx(DIMS); /// one entry per dimension (not per column); idx is modified in this->rowCall
+                idx[DIMS - 2] = range.start - 1; // the first increment below lands on range.start
+
+                for (int line_num = range.start; line_num < range.end; ++line_num) {
+                    idx[DIMS - 2]++;
+                    for (int i = DIMS - 2; i >= 0; --i) {
+                        if (idx[i] >= mat->size[i]) {
+                            idx[i - 1] += idx[i] / mat->size[i];
+                            idx[i] %= mat->size[i];
+                            continue; // carry over into the next outer dimension
+                        }
+                        else {
+                            break;
+                        }
+                    }
+                    this->rowCall(&idx[0], COLS, DIMS);
+                }
+            }
+        }
+    private:
+        Mat_<_Tp>* const mat;
+        const Functor op;
+        // Calls the functor for each element of the line addressed by idx (n-dimensional case).
+        inline void rowCall(int* const idx, const int COLS, const int DIMS) const {
+            int &col = idx[DIMS - 1];
+            col = 0;
+            _Tp* pixel = &(mat->template at<_Tp>(idx));
+
+            while (col < COLS) {
+                op(*pixel, const_cast<const int*>(idx));
+                pixel++; col++;
+            }
+            col = 0;
+        }
+        // Calls the functor for each element of this row. 2d mat special version.
+        inline void rowCall2(const int row, const int COLS) const {
+            union Index{
+                int body[2];
+                operator const int*() const {
+                    return reinterpret_cast<const int*>(this);
+                }
+                int& operator[](const int i) {
+                    return body[i];
+                }
+            } idx = {{row, 0}};
+            // The special union is needed to avoid
+            // "error: array subscript is above array bounds [-Werror=array-bounds]"
+            // when the functor `op` is written to access idx[3].
+
+            _Tp* pixel = &(mat->template at<_Tp>(idx));
+            const _Tp* const pixel_end = pixel + COLS;
+            while(pixel < pixel_end) {
+                op(*pixel++, static_cast<const int*>(idx));
+                idx[1]++;
+            }
+        }
+        PixelOperationWrapper& operator=(const PixelOperationWrapper &) {
+            CV_Assert(false);
+            // This implementation cannot be removed because of Visual Studio warning C4822.
+            return *this;
+        }
+    };
+
+    parallel_for_(cv::Range(0, LINES), PixelOperationWrapper(reinterpret_cast<Mat_<_Tp>*>(this), operation));
+}
+
 /////////////////////////// Synchronization Primitives ///////////////////////////////

 class CV_EXPORTS Mutex
--- a/modules/core/perf/opencl/perf_arithm.cpp
+++ b/modules/core/perf/opencl/perf_arithm.cpp
@@ -39,7 +39,7 @@
 //
 //M*/

-#include "perf_precomp.hpp"
+#include "../perf_precomp.hpp"
 #include "opencv2/ts/ocl_perf.hpp"

 #ifdef HAVE_OPENCL
--- a/modules/core/perf/opencl/perf_bufferpool.cpp
+++ b/modules/core/perf/opencl/perf_bufferpool.cpp
@@ -4,7 +4,7 @@
 //
 // Copyright (C) 2014, Advanced Micro Devices, Inc., all rights reserved.

-#include "perf_precomp.hpp"
+#include "../perf_precomp.hpp"
 #include "opencv2/ts/ocl_perf.hpp"

 #ifdef HAVE_OPENCL
--- a/modules/core/perf/opencl/perf_channels.cpp
+++ b/modules/core/perf/opencl/perf_channels.cpp
@@ -44,7 +44,7 @@
 //
 //M*/

-#include "perf_precomp.hpp"
+#include "../perf_precomp.hpp"
 #include "opencv2/ts/ocl_perf.hpp"

 #ifdef HAVE_OPENCL
--- a/modules/core/perf/opencl/perf_dxt.cpp
+++ b/modules/core/perf/opencl/perf_dxt.cpp
@@ -44,7 +44,7 @@
 //
 //M*/

-#include "perf_precomp.hpp"
+#include "../perf_precomp.hpp"
 #include "opencv2/ts/ocl_perf.hpp"

 #ifdef HAVE_OPENCL
--- a/modules/core/perf/opencl/perf_gemm.cpp
+++ b/modules/core/perf/opencl/perf_gemm.cpp
@@ -44,7 +44,7 @@
 //
 //M*/

-#include "perf_precomp.hpp"
+#include "../perf_precomp.hpp"
 #include "opencv2/ts/ocl_perf.hpp"

 #ifdef HAVE_OPENCL
--- a/modules/core/perf/opencl/perf_matop.cpp
+++ b/modules/core/perf/opencl/perf_matop.cpp
@@ -5,7 +5,7 @@
 // Copyright (C) 2014, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.

-#include "perf_precomp.hpp"
+#include "../perf_precomp.hpp"
 #include "opencv2/ts/ocl_perf.hpp"

 #ifdef HAVE_OPENCL
--- a/modules/core/perf/opencl/perf_usage_flags.cpp
+++ b/modules/core/perf/opencl/perf_usage_flags.cpp
@@ -4,7 +4,7 @@
 //
 // Copyright (C) 2014, Advanced Micro Devices, Inc., all rights reserved.

-#include "perf_precomp.hpp"
+#include "../perf_precomp.hpp"
 #include "opencv2/ts/ocl_perf.hpp"

 #ifdef HAVE_OPENCL
--- a/modules/core/src/arithm.cpp
+++ b/modules/core/src/arithm.cpp
@@ -47,7 +47,7 @@
 // */

 #include "precomp.hpp"
-#include "opencl_kernels.hpp"
+#include "opencl_kernels_core.hpp"

 namespace cv
 {
@@ -1607,7 +1607,7 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
    Size sz1 = dims1 <= 2 ? psrc1->size() : Size();
    Size sz2 = dims2 <= 2 ? psrc2->size() : Size();
 #ifdef HAVE_OPENCL
-    bool use_opencl = _dst.isUMat() && dims1 <= 2 && dims2 <= 2;
+    bool use_opencl = OCL_PERFORMANCE_CHECK(_dst.isUMat()) && dims1 <= 2 && dims2 <= 2;
 #endif
    bool src1Scalar = checkScalar(*psrc1, type2, kind1, kind2);
    bool src2Scalar = checkScalar(*psrc2, type1, kind2, kind1);
@@ -2440,6 +2440,34 @@ addWeighted8u( const uchar* src1, size_t step1,
                _mm_storel_epi64((__m128i*)(dst + x), u);
            }
        }
+#elif CV_NEON
+        float32x4_t g = vdupq_n_f32 (gamma);
+
+        for( ; x <= size.width - 8; x += 8 )
+        {
+            uint8x8_t in1 = vld1_u8(src1+x);
+            uint16x8_t in1_16 = vmovl_u8(in1);
+            float32x4_t in1_f_l = vcvtq_f32_u32(vmovl_u16(vget_low_u16(in1_16)));
+            float32x4_t in1_f_h = vcvtq_f32_u32(vmovl_u16(vget_high_u16(in1_16)));
+
+            uint8x8_t in2 = vld1_u8(src2+x);
+            uint16x8_t in2_16 = vmovl_u8(in2);
+            float32x4_t in2_f_l = vcvtq_f32_u32(vmovl_u16(vget_low_u16(in2_16)));
+            float32x4_t in2_f_h = vcvtq_f32_u32(vmovl_u16(vget_high_u16(in2_16)));
+
+            float32x4_t out_f_l = vaddq_f32(vmulq_n_f32(in1_f_l, alpha), vmulq_n_f32(in2_f_l, beta));
+            float32x4_t out_f_h = vaddq_f32(vmulq_n_f32(in1_f_h, alpha), vmulq_n_f32(in2_f_h, beta));
+            out_f_l = vaddq_f32(out_f_l, g);
+            out_f_h = vaddq_f32(out_f_h, g);
+
+            uint16x4_t out_16_l = vqmovun_s32(vcvtq_s32_f32(out_f_l));
+            uint16x4_t out_16_h = vqmovun_s32(vcvtq_s32_f32(out_f_h));
+
+            uint16x8_t out_16 = vcombine_u16(out_16_l, out_16_h);
+            uint8x8_t out = vqmovn_u16(out_16);
+
+            vst1_u8(dst+x, out);
+        }
 #endif
        #if CV_ENABLE_UNROLLED
        for( ; x <= size.width - 4; x += 4 )
@@ -2650,6 +2678,14 @@ static void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t ste

                }
            }
+           #elif CV_NEON
+            uint8x16_t mask = code == CMP_GT ? vdupq_n_u8(0) : vdupq_n_u8(255);
+
+            for( ; x <= size.width - 16; x += 16 )
+            {
+                vst1q_u8(dst+x, veorq_u8(vcgtq_u8(vld1q_u8(src1+x), vld1q_u8(src2+x)), mask));
+            }
+
           #endif

            for( ; x < size.width; x++ ){
@@ -2674,6 +2710,13 @@ static void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t ste
                    _mm_storeu_si128((__m128i*)(dst + x), r00);
                }
            }
+           #elif CV_NEON
+            uint8x16_t mask = code == CMP_EQ ? vdupq_n_u8(0) : vdupq_n_u8(255);
+
+            for( ; x <= size.width - 16; x += 16 )
+            {
+                vst1q_u8(dst+x, veorq_u8(vceqq_u8(vld1q_u8(src1+x), vld1q_u8(src2+x)), mask));
+            }
           #endif
           for( ; x < size.width; x++ )
                dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m);
@@ -2759,6 +2802,22 @@ static void cmp16s(const short* src1, size_t step1, const short* src2, size_t st
                    x += 8;
                }
            }
+           #elif CV_NEON
+            uint8x16_t mask = code == CMP_GT ? vdupq_n_u8(0) : vdupq_n_u8(255);
+
+            for( ; x <= size.width - 16; x += 16 )
+            {
+                int16x8_t in1 = vld1q_s16(src1 + x);
+                int16x8_t in2 = vld1q_s16(src2 + x);
+                uint8x8_t t1 = vmovn_u16(vcgtq_s16(in1, in2));
+
+                in1 = vld1q_s16(src1 + x + 8);
+                in2 = vld1q_s16(src2 + x + 8);
+                uint8x8_t t2 = vmovn_u16(vcgtq_s16(in1, in2));
+
+                vst1q_u8(dst+x, veorq_u8(vcombine_u8(t1, t2), mask));
+            }
+
           #endif

            for( ; x < size.width; x++ ){
@@ -2797,6 +2856,21 @@ static void cmp16s(const short* src1, size_t step1, const short* src2, size_t st
                    x += 8;
                }
            }
+           #elif CV_NEON
+            uint8x16_t mask = code == CMP_EQ ? vdupq_n_u8(0) : vdupq_n_u8(255);
+
+            for( ; x <= size.width - 16; x += 16 )
+            {
+                int16x8_t in1 = vld1q_s16(src1 + x);
+                int16x8_t in2 = vld1q_s16(src2 + x);
+                uint8x8_t t1 = vmovn_u16(vceqq_s16(in1, in2));
+
+                in1 = vld1q_s16(src1 + x + 8);
+                in2 = vld1q_s16(src2 + x + 8);
+                uint8x8_t t2 = vmovn_u16(vceqq_s16(in1, in2));
+
+                vst1q_u8(dst+x, veorq_u8(vcombine_u8(t1, t2), mask));
+            }
           #endif
           for( ; x < size.width; x++ )
                dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m);
@@ -2982,7 +3056,7 @@ void cv::compare(InputArray _src1, InputArray _src2, OutputArray _dst, int op)
        haveScalar = true;
    }

-    CV_OCL_RUN(_src1.dims() <= 2 && _src2.dims() <= 2 && _dst.isUMat(),
+    CV_OCL_RUN(_src1.dims() <= 2 && _src2.dims() <= 2 && OCL_PERFORMANCE_CHECK(_dst.isUMat()),
               ocl_compare(_src1, _src2, _dst, op, haveScalar))

    int kind1 = _src1.kind(), kind2 = _src2.kind();
@@ -3085,7 +3159,7 @@ namespace cv
 {

 template <typename T>
-struct InRange_SSE
+struct InRange_SIMD
 {
    int operator () (const T *, const T *, const T *, uchar *, int) const
    {
@@ -3096,7 +3170,7 @@ struct InRange_SSE
 #if CV_SSE2

 template <>
-struct InRange_SSE<uchar>
+struct InRange_SIMD<uchar>
 {
    int operator () (const uchar * src1, const uchar * src2, const uchar * src3,
                     uchar * dst, int len) const
@@ -3121,7 +3195,7 @@ struct InRange_SSE<uchar>
 };

 template <>
-struct InRange_SSE<schar>
+struct InRange_SIMD<schar>
 {
    int operator () (const schar * src1, const schar * src2, const schar * src3,
                     uchar * dst, int len) const
@@ -3146,7 +3220,7 @@ struct InRange_SSE<schar>
 };

 template <>
-struct InRange_SSE<ushort>
+struct InRange_SIMD<ushort>
 {
    int operator () (const ushort * src1, const ushort * src2, const ushort * src3,
                     uchar * dst, int len) const
@@ -3172,7 +3246,7 @@ struct InRange_SSE<ushort>
 };

 template <>
-struct InRange_SSE<short>
+struct InRange_SIMD<short>
 {
    int operator () (const short * src1, const short * src2, const short * src3,
                     uchar * dst, int len) const
@@ -3198,7 +3272,7 @@ struct InRange_SSE<short>
 };

 template <>
-struct InRange_SSE<int>
+struct InRange_SIMD<int>
 {
    int operator () (const int * src1, const int * src2, const int * src3,
                     uchar * dst, int len) const
@@ -3230,7 +3304,7 @@ struct InRange_SSE<int>
 };

 template <>
-struct InRange_SSE<float>
+struct InRange_SIMD<float>
 {
    int operator () (const float * src1, const float * src2, const float * src3,
                     uchar * dst, int len) const
@@ -3261,6 +3335,160 @@ struct InRange_SSE<float>
    }
 };

+#elif CV_NEON
+
+// NEON specialization for uchar. Each output byte becomes 0xFF when the matching src1 value
+// lies within [src2, src3] (both bounds inclusive) and 0 otherwise.
+template <>
+struct InRange_SIMD<uchar>
+{
+    int operator () (const uchar * src1, const uchar * src2, const uchar * src3,
+                     uchar * dst, int len) const
+    {
+        int pos = 0;
+        for ( const int lastStart = len - 16; pos <= lastStart; pos += 16 )
+        {
+            const uint8x16_t v = vld1q_u8(src1 + pos);
+            const uint8x16_t notBelow = vcgeq_u8(v, vld1q_u8(src2 + pos));
+            const uint8x16_t notAbove = vcgeq_u8(vld1q_u8(src3 + pos), v);
+            vst1q_u8(dst + pos, vandq_u8(notBelow, notAbove));
+        }
+        return pos;   // count of leading elements handled by the vector loop
+    }
+};
+
+// NEON specialization for schar. Each output byte becomes 0xFF when the matching src1 value
+// lies within [src2, src3] (signed comparison, both bounds inclusive) and 0 otherwise.
+template <>
+struct InRange_SIMD<schar>
+{
+    int operator () (const schar * src1, const schar * src2, const schar * src3,
+                     uchar * dst, int len) const
+    {
+        int pos = 0;
+        for ( const int lastStart = len - 16; pos <= lastStart; pos += 16 )
+        {
+            const int8x16_t v = vld1q_s8(src1 + pos);
+            const uint8x16_t notBelow = vcgeq_s8(v, vld1q_s8(src2 + pos));
+            const uint8x16_t notAbove = vcgeq_s8(vld1q_s8(src3 + pos), v);
+            vst1q_u8(dst + pos, vandq_u8(notBelow, notAbove));
+        }
+        return pos;   // count of leading elements handled by the vector loop
+    }
+};
+
+// NEON specialization for ushort: dst byte is 0xFF when src2 <= src1 <= src3, otherwise 0.
+template <>
+struct InRange_SIMD<ushort>
+{
+    int operator () (const ushort * src1, const ushort * src2, const ushort * src3,
+                     uchar * dst, int len) const
+    {
+        int pos = 0;
+        for ( const int lastStart = len - 16; pos <= lastStart; pos += 16 )
+        {
+            // Two groups of 8 elements are tested in turn; each 16-bit mask is narrowed to a byte.
+            uint8x8_t narrowed[2];
+            for ( int k = 0; k < 2; k++ )
+            {
+                const int off = pos + 8 * k;
+                const uint16x8_t v = vld1q_u16((const uint16_t*)(src1 + off));
+                const uint16x8_t lo = vld1q_u16((const uint16_t*)(src2 + off));
+                const uint16x8_t hi = vld1q_u16((const uint16_t*)(src3 + off));
+                narrowed[k] = vmovn_u16(vandq_u16(vcgeq_u16(v, lo), vcgeq_u16(hi, v)));
+            }
+            vst1q_u8(dst + pos, vcombine_u8(narrowed[0], narrowed[1]));
+        }
+        return pos;
+    }
+};
+
+// NEON specialization for short: dst byte is 0xFF when src2 <= src1 <= src3 (signed), else 0.
+template <>
+struct InRange_SIMD<short>
+{
+    int operator () (const short * src1, const short * src2, const short * src3,
+                     uchar * dst, int len) const
+    {
+        int pos = 0;
+        for ( const int lastStart = len - 16; pos <= lastStart; pos += 16 )
+        {
+            // Two groups of 8 elements are tested in turn; each 16-bit mask is narrowed to a byte.
+            uint8x8_t narrowed[2];
+            for ( int k = 0; k < 2; k++ )
+            {
+                const int off = pos + 8 * k;
+                const int16x8_t v = vld1q_s16((const int16_t*)(src1 + off));
+                const int16x8_t lo = vld1q_s16((const int16_t*)(src2 + off));
+                const int16x8_t hi = vld1q_s16((const int16_t*)(src3 + off));
+                narrowed[k] = vmovn_u16(vandq_u16(vcgeq_s16(v, lo), vcgeq_s16(hi, v)));
+            }
+            vst1q_u8(dst + pos, vcombine_u8(narrowed[0], narrowed[1]));
+        }
+        return pos;
+    }
+};
+
+// NEON specialization for int: dst byte is 0xFF when src2 <= src1 <= src3 (signed), else 0.
+template <>
+struct InRange_SIMD<int>
+{
+    int operator () (const int * src1, const int * src2, const int * src3,
+                     uchar * dst, int len) const
+    {
+        int pos = 0;
+        for ( const int lastStart = len - 8; pos <= lastStart; pos += 8 )
+        {
+            // Two groups of 4 elements are tested in turn; each 32-bit mask is narrowed to 16 bits.
+            uint16x4_t narrowed[2];
+            for ( int k = 0; k < 2; k++ )
+            {
+                const int off = pos + 4 * k;
+                const int32x4_t v = vld1q_s32((const int32_t*)(src1 + off));
+                const int32x4_t lo = vld1q_s32((const int32_t*)(src2 + off));
+                const int32x4_t hi = vld1q_s32((const int32_t*)(src3 + off));
+                const uint32x4_t inside = vandq_u32(vcgeq_s32(v, lo), vcgeq_s32(hi, v));
+                narrowed[k] = vmovn_u32(inside);
+            }
+
+            // Join both groups and narrow once more so that one byte per element is stored.
+            const uint16x8_t mask16 = vcombine_u16(narrowed[0], narrowed[1]);
+            vst1_u8(dst + pos, vmovn_u16(mask16));
+        }
+        return pos;
+    }
+};
+
+// NEON specialization for float: dst byte is 0xFF when src2 <= src1 <= src3, otherwise 0.
+template <>
+struct InRange_SIMD<float>
+{
+    int operator () (const float * src1, const float * src2, const float * src3,
+                     uchar * dst, int len) const
+    {
+        int pos = 0;
+        for ( const int lastStart = len - 8; pos <= lastStart; pos += 8 )
+        {
+            // Two groups of 4 elements are tested in turn; each 32-bit mask is narrowed to 16 bits.
+            uint16x4_t narrowed[2];
+            for ( int k = 0; k < 2; k++ )
+            {
+                const int off = pos + 4 * k;
+                const float32x4_t v = vld1q_f32((const float32_t*)(src1 + off));
+                const float32x4_t lo = vld1q_f32((const float32_t*)(src2 + off));
+                const float32x4_t hi = vld1q_f32((const float32_t*)(src3 + off));
+                const uint32x4_t inside = vandq_u32(vcgeq_f32(v, lo), vcgeq_f32(hi, v));
+                narrowed[k] = vmovn_u32(inside);
+            }
+
+            // Join both groups and narrow once more so that one byte per element is stored.
+            const uint16x8_t mask16 = vcombine_u16(narrowed[0], narrowed[1]);
+            vst1_u8(dst + pos, vmovn_u16(mask16));
+        }
+        return pos;
+    }
+};
+
 #endif

 template <typename T>
@@ -3272,7 +3500,7 @@ static void inRange_(const T* src1, size_t step1, const T* src2, size_t step2,
    step2 /= sizeof(src2[0]);
    step3 /= sizeof(src3[0]);

-    InRange_SSE<T> vop;
+    InRange_SIMD<T> vop;

    for( ; size.height--; src1 += step1, src2 += step2, src3 += step3, dst += step )
    {
@@ -3500,7 +3728,7 @@ void cv::inRange(InputArray _src, InputArray _lowerb,
                 InputArray _upperb, OutputArray _dst)
 {
    CV_OCL_RUN(_src.dims() <= 2 && _lowerb.dims() <= 2 &&
-               _upperb.dims() <= 2 && _dst.isUMat(),
+               _upperb.dims() <= 2 && OCL_PERFORMANCE_CHECK(_dst.isUMat()),
               ocl_inRange(_src, _lowerb, _upperb, _dst))

    int skind = _src.kind(), lkind = _lowerb.kind(), ukind = _upperb.kind();
--- a/modules/core/src/convert.cpp
+++ b/modules/core/src/convert.cpp
@@ -41,7 +41,7 @@
 //M*/

 #include "precomp.hpp"
-#include "opencl_kernels.hpp"
+#include "opencl_kernels_core.hpp"

 namespace cv
 {
@@ -50,6 +50,71 @@ namespace cv
 *                                       split & merge                                    *
 \****************************************************************************************/

+#if CV_NEON
+template<typename T> struct VSplit2;
+template<typename T> struct VSplit3;
+template<typename T> struct VSplit4;
+
+// Emits name<data_type>: operator() stores val[0], val[1] of load_func(src) into dst0, dst1.
+#define SPLIT2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func)  \
+    template<> struct name<data_type> {                                           \
+        void operator()(const data_type* src, data_type* dst0, data_type* dst1) { \
+            const reg_type planes = load_func(src);                               \
+            store_func(dst0, planes.val[0]);                                      \
+            store_func(dst1, planes.val[1]);                                      \
+        }                                                                         \
+    }
+
+// Emits name<data_type>: operator() stores val[0..2] of load_func(src) into dst0..dst2.
+#define SPLIT3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func)  \
+    template<> struct name<data_type> {                                           \
+        void operator()(const data_type* src, data_type* dst0, data_type* dst1,   \
+                        data_type* dst2) {                                        \
+            const reg_type planes = load_func(src);                               \
+            store_func(dst0, planes.val[0]);                                      \
+            store_func(dst1, planes.val[1]);                                      \
+            store_func(dst2, planes.val[2]);                                      \
+        }                                                                         \
+    }
+
+// Emits name<data_type>: operator() stores val[0..3] of load_func(src) into dst0..dst3.
+#define SPLIT4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func)  \
+    template<> struct name<data_type> {                                           \
+        void operator()(const data_type* src, data_type* dst0, data_type* dst1,   \
+                        data_type* dst2, data_type* dst3) {                       \
+            const reg_type planes = load_func(src);                               \
+            store_func(dst0, planes.val[0]);                                      \
+            store_func(dst1, planes.val[1]);                                      \
+            store_func(dst2, planes.val[2]);                                      \
+            store_func(dst3, planes.val[3]);                                      \
+        }                                                                         \
+    }
+// VSplit2 specializations: 128-bit vectors for 8/16/32-bit element types, one 64-bit lane for int64.
+SPLIT2_KERNEL_TEMPLATE(VSplit2, uchar ,  uint8x16x2_t, vld2q_u8 , vst1q_u8 );
+SPLIT2_KERNEL_TEMPLATE(VSplit2, schar ,   int8x16x2_t, vld2q_s8 , vst1q_s8 );
+SPLIT2_KERNEL_TEMPLATE(VSplit2, ushort,  uint16x8x2_t, vld2q_u16, vst1q_u16);
+SPLIT2_KERNEL_TEMPLATE(VSplit2, short ,   int16x8x2_t, vld2q_s16, vst1q_s16);
+SPLIT2_KERNEL_TEMPLATE(VSplit2, int   ,   int32x4x2_t, vld2q_s32, vst1q_s32);
+SPLIT2_KERNEL_TEMPLATE(VSplit2, float , float32x4x2_t, vld2q_f32, vst1q_f32);
+SPLIT2_KERNEL_TEMPLATE(VSplit2, int64 ,   int64x1x2_t, vld2_s64 , vst1_s64 );
+// VSplit3 specializations: same element types, three-vector register structures.
+SPLIT3_KERNEL_TEMPLATE(VSplit3, uchar ,  uint8x16x3_t, vld3q_u8 , vst1q_u8 );
+SPLIT3_KERNEL_TEMPLATE(VSplit3, schar ,   int8x16x3_t, vld3q_s8 , vst1q_s8 );
+SPLIT3_KERNEL_TEMPLATE(VSplit3, ushort,  uint16x8x3_t, vld3q_u16, vst1q_u16);
+SPLIT3_KERNEL_TEMPLATE(VSplit3, short ,   int16x8x3_t, vld3q_s16, vst1q_s16);
+SPLIT3_KERNEL_TEMPLATE(VSplit3, int   ,   int32x4x3_t, vld3q_s32, vst1q_s32);
+SPLIT3_KERNEL_TEMPLATE(VSplit3, float , float32x4x3_t, vld3q_f32, vst1q_f32);
+SPLIT3_KERNEL_TEMPLATE(VSplit3, int64 ,   int64x1x3_t, vld3_s64 , vst1_s64 );
+// VSplit4 specializations: same element types, four-vector register structures.
+SPLIT4_KERNEL_TEMPLATE(VSplit4, uchar ,  uint8x16x4_t, vld4q_u8 , vst1q_u8 );
+SPLIT4_KERNEL_TEMPLATE(VSplit4, schar ,   int8x16x4_t, vld4q_s8 , vst1q_s8 );
+SPLIT4_KERNEL_TEMPLATE(VSplit4, ushort,  uint16x8x4_t, vld4q_u16, vst1q_u16);
+SPLIT4_KERNEL_TEMPLATE(VSplit4, short ,   int16x8x4_t, vld4q_s16, vst1q_s16);
+SPLIT4_KERNEL_TEMPLATE(VSplit4, int   ,   int32x4x4_t, vld4q_s32, vst1q_s32);
+SPLIT4_KERNEL_TEMPLATE(VSplit4, float , float32x4x4_t, vld4q_f32, vst1q_f32);
+SPLIT4_KERNEL_TEMPLATE(VSplit4, int64 ,   int64x1x4_t, vld4_s64 , vst1_s64 );
+#endif
+
 template<typename T> static void
 split_( const T* src, T** dst, int len, int cn )
 {
@@ -58,13 +123,34 @@ split_( const T* src, T** dst, int len, int cn )
    if( k == 1 )
    {
        T* dst0 = dst[0];
-        for( i = j = 0; i < len; i++, j += cn )
-            dst0[i] = src[j];
+
+        if(cn == 1)
+        {
+            memcpy(dst0, src, len * sizeof(T));
+        }
+        else
+        {
+            for( i = 0, j = 0 ; i < len; i++, j += cn )
+                dst0[i] = src[j];
+        }
    }
    else if( k == 2 )
    {
        T *dst0 = dst[0], *dst1 = dst[1];
-        for( i = j = 0; i < len; i++, j += cn )
+        i = j = 0;
+
+#if CV_NEON
+        if(cn == 2)
+        {
+            int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
+            int inc_j = 2 * inc_i;
+
+            VSplit2<T> vsplit;
+            for( ; i < len - inc_i; i += inc_i, j += inc_j)
+                vsplit(src + j, dst0 + i, dst1 + i);
+        }
+#endif
+        for( ; i < len; i++, j += cn )
        {
            dst0[i] = src[j];
            dst1[i] = src[j+1];
@@ -73,7 +159,20 @@ split_( const T* src, T** dst, int len, int cn )
    else if( k == 3 )
    {
        T *dst0 = dst[0], *dst1 = dst[1], *dst2 = dst[2];
-        for( i = j = 0; i < len; i++, j += cn )
+        i = j = 0;
+
+#if CV_NEON
+        if(cn == 3)
+        {
+            int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
+            int inc_j = 3 * inc_i;
+
+            VSplit3<T> vsplit;
+            for( ; i < len - inc_i; i += inc_i, j += inc_j)
+                vsplit(src + j, dst0 + i, dst1 + i, dst2 + i);
+        }
+#endif
+        for( ; i < len; i++, j += cn )
        {
            dst0[i] = src[j];
            dst1[i] = src[j+1];
@@ -83,7 +182,20 @@ split_( const T* src, T** dst, int len, int cn )
    else
    {
        T *dst0 = dst[0], *dst1 = dst[1], *dst2 = dst[2], *dst3 = dst[3];
-        for( i = j = 0; i < len; i++, j += cn )
+        i = j = 0;
+
+#if CV_NEON
+        if(cn == 4)
+        {
+            int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
+            int inc_j = 4 * inc_i;
+
+            VSplit4<T> vsplit;
+            for( ; i < len - inc_i; i += inc_i, j += inc_j)
+                vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i);
+        }
+#endif
+        for( ; i < len; i++, j += cn )
        {
            dst0[i] = src[j]; dst1[i] = src[j+1];
            dst2[i] = src[j+2]; dst3[i] = src[j+3];
@@ -101,6 +213,77 @@ split_( const T* src, T** dst, int len, int cn )
    }
 }

+
+#if CV_NEON
+template<typename T> struct VMerge2;
+template<typename T> struct VMerge3;
+template<typename T> struct VMerge4;
+
+// Emits name<data_type>: operator() loads src0, src1 into val[0], val[1] and stores them to dst.
+#define MERGE2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func)  \
+    template<> struct name<data_type> {                                           \
+        void operator()(const data_type* src0, const data_type* src1,             \
+                        data_type* dst) {                                         \
+            reg_type packed;                                                      \
+            packed.val[0] = load_func(src0);                                      \
+            packed.val[1] = load_func(src1);                                      \
+            store_func(dst, packed);                                              \
+        }                                                                         \
+    }
+
+// Emits name<data_type>: operator() loads src0..src2 into val[0..2] and stores them to dst.
+#define MERGE3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func)  \
+    template<> struct name<data_type> {                                           \
+        void operator()(const data_type* src0, const data_type* src1,             \
+                        const data_type* src2, data_type* dst) {                  \
+            reg_type packed;                                                      \
+            packed.val[0] = load_func(src0);                                      \
+            packed.val[1] = load_func(src1);                                      \
+            packed.val[2] = load_func(src2);                                      \
+            store_func(dst, packed);                                              \
+        }                                                                         \
+    }
+
+// Emits name<data_type>: operator() loads src0..src3 into val[0..3] and stores them to dst.
+#define MERGE4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func)  \
+    template<> struct name<data_type> {                                           \
+        void operator()(const data_type* src0, const data_type* src1,             \
+                        const data_type* src2, const data_type* src3,             \
+                        data_type* dst) {                                         \
+            reg_type packed;                                                      \
+            packed.val[0] = load_func(src0);                                      \
+            packed.val[1] = load_func(src1);                                      \
+            packed.val[2] = load_func(src2);                                      \
+            packed.val[3] = load_func(src3);                                      \
+            store_func(dst, packed);                                              \
+        }                                                                         \
+    }
+// VMerge2 specializations: 128-bit vectors for 8/16/32-bit element types, one 64-bit lane for int64.
+MERGE2_KERNEL_TEMPLATE(VMerge2, uchar ,  uint8x16x2_t, vld1q_u8 , vst2q_u8 );
+MERGE2_KERNEL_TEMPLATE(VMerge2, schar ,   int8x16x2_t, vld1q_s8 , vst2q_s8 );
+MERGE2_KERNEL_TEMPLATE(VMerge2, ushort,  uint16x8x2_t, vld1q_u16, vst2q_u16);
+MERGE2_KERNEL_TEMPLATE(VMerge2, short ,   int16x8x2_t, vld1q_s16, vst2q_s16);
+MERGE2_KERNEL_TEMPLATE(VMerge2, int   ,   int32x4x2_t, vld1q_s32, vst2q_s32);
+MERGE2_KERNEL_TEMPLATE(VMerge2, float , float32x4x2_t, vld1q_f32, vst2q_f32);
+MERGE2_KERNEL_TEMPLATE(VMerge2, int64 ,   int64x1x2_t, vld1_s64 , vst2_s64 );
+// VMerge3 specializations: same element types, three-vector register structures.
+MERGE3_KERNEL_TEMPLATE(VMerge3, uchar ,  uint8x16x3_t, vld1q_u8 , vst3q_u8 );
+MERGE3_KERNEL_TEMPLATE(VMerge3, schar ,   int8x16x3_t, vld1q_s8 , vst3q_s8 );
+MERGE3_KERNEL_TEMPLATE(VMerge3, ushort,  uint16x8x3_t, vld1q_u16, vst3q_u16);
+MERGE3_KERNEL_TEMPLATE(VMerge3, short ,   int16x8x3_t, vld1q_s16, vst3q_s16);
+MERGE3_KERNEL_TEMPLATE(VMerge3, int   ,   int32x4x3_t, vld1q_s32, vst3q_s32);
+MERGE3_KERNEL_TEMPLATE(VMerge3, float , float32x4x3_t, vld1q_f32, vst3q_f32);
+MERGE3_KERNEL_TEMPLATE(VMerge3, int64 ,   int64x1x3_t, vld1_s64 , vst3_s64 );
+// VMerge4 specializations: same element types, four-vector register structures.
+MERGE4_KERNEL_TEMPLATE(VMerge4, uchar ,  uint8x16x4_t, vld1q_u8 , vst4q_u8 );
+MERGE4_KERNEL_TEMPLATE(VMerge4, schar ,   int8x16x4_t, vld1q_s8 , vst4q_s8 );
+MERGE4_KERNEL_TEMPLATE(VMerge4, ushort,  uint16x8x4_t, vld1q_u16, vst4q_u16);
+MERGE4_KERNEL_TEMPLATE(VMerge4, short ,   int16x8x4_t, vld1q_s16, vst4q_s16);
+MERGE4_KERNEL_TEMPLATE(VMerge4, int   ,   int32x4x4_t, vld1q_s32, vst4q_s32);
+MERGE4_KERNEL_TEMPLATE(VMerge4, float , float32x4x4_t, vld1q_f32, vst4q_f32);
+MERGE4_KERNEL_TEMPLATE(VMerge4, int64 ,   int64x1x4_t, vld1_s64 , vst4_s64 );
+#endif
+
 template<typename T> static void
 merge_( const T** src, T* dst, int len, int cn )
 {
@@ -115,7 +298,19 @@ merge_( const T** src, T* dst, int len, int cn )
    else if( k == 2 )
    {
        const T *src0 = src[0], *src1 = src[1];
-        for( i = j = 0; i < len; i++, j += cn )
+        i = j = 0;
+#if CV_NEON
+        if(cn == 2)
+        {
+            int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
+            int inc_j = 2 * inc_i;
+
+            VMerge2<T> vmerge;
+            for( ; i < len - inc_i; i += inc_i, j += inc_j)
+                vmerge(src0 + i, src1 + i, dst + j);
+        }
+#endif
+        for( ; i < len; i++, j += cn )
        {
            dst[j] = src0[i];
            dst[j+1] = src1[i];
@@ -124,7 +319,19 @@ merge_( const T** src, T* dst, int len, int cn )
    else if( k == 3 )
    {
        const T *src0 = src[0], *src1 = src[1], *src2 = src[2];
-        for( i = j = 0; i < len; i++, j += cn )
+        i = j = 0;
+#if CV_NEON
+        if(cn == 3)
+        {
+            int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
+            int inc_j = 3 * inc_i;
+
+            VMerge3<T> vmerge;
+            for( ; i < len - inc_i; i += inc_i, j += inc_j)
+                vmerge(src0 + i, src1 + i, src2 + i, dst + j);
+        }
+#endif
+        for( ; i < len; i++, j += cn )
        {
            dst[j] = src0[i];
            dst[j+1] = src1[i];
@@ -134,7 +341,19 @@ merge_( const T** src, T* dst, int len, int cn )
    else
    {
        const T *src0 = src[0], *src1 = src[1], *src2 = src[2], *src3 = src[3];
-        for( i = j = 0; i < len; i++, j += cn )
+        i = j = 0;
+#if CV_NEON
+        if(cn == 4)
+        {
+            int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
+            int inc_j = 4 * inc_i;
+
+            VMerge4<T> vmerge;
+            for( ; i < len - inc_i; i += inc_i, j += inc_j)
+                vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j);
+        }
+#endif
+        for( ; i < len; i++, j += cn )
        {
            dst[j] = src0[i]; dst[j+1] = src1[i];
            dst[j+2] = src2[i]; dst[j+3] = src3[i];
--- a/modules/core/src/copy.cpp
+++ b/modules/core/src/copy.cpp
@@ -46,7 +46,7 @@
 // */

 #include "precomp.hpp"
-#include "opencl_kernels.hpp"
+#include "opencl_kernels_core.hpp"

 namespace cv
 {
--- a/modules/core/src/dxt.cpp
+++ b/modules/core/src/dxt.cpp
@@ -42,7 +42,7 @@
 #include "precomp.hpp"
 #include "opencv2/core/opencl/runtime/opencl_clamdfft.hpp"
 #include "opencv2/core/opencl/runtime/opencl_core.hpp"
-#include "opencl_kernels.hpp"
+#include "opencl_kernels_core.hpp"
 #include <map>

 namespace cv
@@ -1801,11 +1801,11 @@ private:
    UMat twiddles;
    String buildOptions;
    int thread_count;
-    bool status;
    int dft_size;
+    bool status;

 public:
-    OCL_FftPlan(int _size): dft_size(_size), status(true)
+    OCL_FftPlan(int _size) : dft_size(_size), status(true)
    {
        int min_radix;
        std::vector<int> radixes, blocks;
@@ -2635,8 +2635,8 @@ void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows )

            for( i = 0; i < nonzero_rows; i++ )
            {
-                uchar* sptr = src.data + i*src.step;
-                uchar* dptr0 = dst.data + i*dst.step;
+                const uchar* sptr = src.ptr(i);
+                uchar* dptr0 = dst.ptr(i);
                uchar* dptr = dptr0;

                if( tmp_buf )
@@ -2649,7 +2649,7 @@ void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows )

            for( ; i < count; i++ )
            {
-                uchar* dptr0 = dst.data + i*dst.step;
+                uchar* dptr0 = dst.ptr(i);
                memset( dptr0, 0, dst_full_len );
            }

@@ -2661,7 +2661,7 @@ void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows )
        {
            int a = 0, b = count;
            uchar *buf0, *buf1, *dbuf0, *dbuf1;
-            uchar* sptr0 = src.data;
+            const uchar* sptr0 = src.data;
            uchar* dptr0 = dst.data;
            buf0 = ptr;
            ptr += len*complex_elem_size;
@@ -2800,7 +2800,7 @@ void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows )
                    int n = dst.cols;
                    if( elem_size == (int)sizeof(float) )
                    {
-                        float* p0 = (float*)dst.data;
+                        float* p0 = dst.ptr<float>();
                        size_t dstep = dst.step/sizeof(p0[0]);
                        for( i = 0; i < len; i++ )
                        {
@@ -2816,7 +2816,7 @@ void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows )
                    }
                    else
                    {
-                        double* p0 = (double*)dst.data;
+                        double* p0 = dst.ptr<double>();
                        size_t dstep = dst.step/sizeof(p0[0]);
                        for( i = 0; i < len; i++ )
                        {
--- a/modules/core/src/lapack.cpp
+++ b/modules/core/src/lapack.cpp
@@ -955,10 +955,10 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )
        SVD::compute(src, w, u, vt);
        SVD::backSubst(w, u, vt, Mat(), _dst);
        return type == CV_32F ?
-            (((float*)w.data)[0] >= FLT_EPSILON ?
-             ((float*)w.data)[n-1]/((float*)w.data)[0] : 0) :
-            (((double*)w.data)[0] >= DBL_EPSILON ?
-             ((double*)w.data)[n-1]/((double*)w.data)[0] : 0);
+            (w.ptr<float>()[0] >= FLT_EPSILON ?
+             w.ptr<float>()[n-1]/w.ptr<float>()[0] : 0) :
+            (w.ptr<double>()[0] >= DBL_EPSILON ?
+             w.ptr<double>()[n-1]/w.ptr<double>()[0] : 0);
    }

    CV_Assert( m == n );
@@ -975,10 +975,10 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )
        transpose(vt, u);
        SVD::backSubst(w, u, vt, Mat(), _dst);
        return type == CV_32F ?
-        (((float*)w.data)[0] >= FLT_EPSILON ?
-         ((float*)w.data)[n-1]/((float*)w.data)[0] : 0) :
-        (((double*)w.data)[0] >= DBL_EPSILON ?
-         ((double*)w.data)[n-1]/((double*)w.data)[0] : 0);
+        (w.ptr<float>()[0] >= FLT_EPSILON ?
+         w.ptr<float>()[n-1]/w.ptr<float>()[0] : 0) :
+        (w.ptr<double>()[0] >= DBL_EPSILON ?
+         w.ptr<double>()[n-1]/w.ptr<double>()[0] : 0);
    }

    CV_Assert( method == DECOMP_LU || method == DECOMP_CHOLESKY );
@@ -988,7 +988,7 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )

    if( n <= 3 )
    {
-        uchar* srcdata = src.data;
+        const uchar* srcdata = src.data;
        uchar* dstdata = dst.data;
        size_t srcstep = src.step;
        size_t dststep = dst.step;
@@ -1212,8 +1212,8 @@ bool cv::solve( InputArray _src, InputArray _src2arg, OutputArray _dst, int meth
        #define bf(y) ((float*)(bdata + y*src2step))[0]
        #define bd(y) ((double*)(bdata + y*src2step))[0]

-        uchar* srcdata = src.data;
-        uchar* bdata = _src2.data;
+        const uchar* srcdata = src.data;
+        const uchar* bdata = _src2.data;
        uchar* dstdata = dst.data;
        size_t srcstep = src.step;
        size_t src2step = _src2.step;
@@ -1557,13 +1557,17 @@ static void _SVDcompute( InputArray _aarr, OutputArray _w,
    {
        if( !at )
        {
-            transpose(temp_u, _u);
-            temp_v.copyTo(_vt);
+            if( _u.needed() )
+                transpose(temp_u, _u);
+            if( _vt.needed() )
+                temp_v.copyTo(_vt);
        }
        else
        {
-            transpose(temp_v, _u);
-            temp_u.copyTo(_vt);
+            if( _u.needed() )
+                transpose(temp_v, _u);
+            if( _vt.needed() )
+                temp_u.copyTo(_vt);
        }
    }
 }
@@ -1705,7 +1709,7 @@ cvEigenVV( CvArr* srcarr, CvArr* evectsarr, CvArr* evalsarr, double,
        eigen(src, evals, evects);
        if( evects0.data != evects.data )
        {
-            uchar* p = evects0.data;
+            const uchar* p = evects0.data;
            evects.convertTo(evects0, evects0.type());
            CV_Assert( p == evects0.data );
        }
@@ -1714,7 +1718,7 @@ cvEigenVV( CvArr* srcarr, CvArr* evectsarr, CvArr* evalsarr, double,
        eigen(src, evals);
    if( evals0.data != evals.data )
    {
-        uchar* p = evals0.data;
+        const uchar* p = evals0.data;
        if( evals0.size() == evals.size() )
            evals.convertTo(evals0, evals0.type());
        else if( evals0.type() == evals.type() )
--- a/modules/core/src/lda.cpp
+++ b/modules/core/src/lda.cpp
--- a/modules/core/src/mathfuncs.cpp
+++ b/modules/core/src/mathfuncs.cpp
@@ -41,7 +41,7 @@
 //M*/

 #include "precomp.hpp"
-#include "opencl_kernels.hpp"
+#include "opencl_kernels_core.hpp"

 namespace cv
 {
--- a/modules/core/src/matmul.cpp
+++ b/modules/core/src/matmul.cpp
@@ -41,7 +41,7 @@
 //M*/

 #include "precomp.hpp"
-#include "opencl_kernels.hpp"
+#include "opencl_kernels_core.hpp"
 #include "opencv2/core/opencl/runtime/opencl_clamdblas.hpp"

 namespace cv
@@ -3295,7 +3295,6 @@ void cv::PCABackProject(InputArray data, InputArray mean,
    pca.backProject(data, result);
 }

-
 /****************************************************************************************\
 *                                    Earlier API                                         *
 \****************************************************************************************/
--- a/modules/core/src/matrix.cpp
+++ b/modules/core/src/matrix.cpp
@@ -41,7 +41,7 @@
 //M*/

 #include "precomp.hpp"
-#include "opencl_kernels.hpp"
+#include "opencl_kernels_core.hpp"

 #include "bufferpool.impl.hpp"

@@ -346,7 +346,7 @@ static void finalizeHdr(Mat& m)
    if( d > 2 )
        m.rows = m.cols = -1;
    if(m.u)
-        m.data = m.datastart = m.u->data;
+        m.datastart = m.data = m.u->data;
    if( m.data )
    {
        m.datalimit = m.datastart + m.size[0]*m.step[0];
@@ -510,7 +510,7 @@ Mat::Mat(int _dims, const int* _sizes, int _type, void* _data, const size_t* _st
      datalimit(0), allocator(0), u(0), size(&rows)
 {
    flags |= CV_MAT_TYPE(_type);
-    data = datastart = (uchar*)_data;
+    datastart = data = (uchar*)_data;
    setSize(*this, _dims, _sizes, _steps, true);
    finalizeHdr(*this);
 }
@@ -549,7 +549,7 @@ static Mat cvMatNDToMat(const CvMatND* m, bool copyData)

    if( !m )
        return thiz;
-    thiz.data = thiz.datastart = m->data.ptr;
+    thiz.datastart = thiz.data = m->data.ptr;
    thiz.flags |= CV_MAT_TYPE(m->type);
    int _sizes[CV_MAX_DIM];
    size_t _steps[CV_MAX_DIM];
@@ -587,7 +587,7 @@ static Mat cvMatToMat(const CvMat* m, bool copyData)
        thiz.dims = 2;
        thiz.rows = m->rows;
        thiz.cols = m->cols;
-        thiz.data = thiz.datastart = m->data.ptr;
+        thiz.datastart = thiz.data = m->data.ptr;
        size_t esz = CV_ELEM_SIZE(m->type), minstep = thiz.cols*esz, _step = m->step;
        if( _step == 0 )
            _step = minstep;
@@ -597,7 +597,7 @@ static Mat cvMatToMat(const CvMat* m, bool copyData)
    }
    else
    {
-        thiz.data = thiz.datastart = thiz.dataend = 0;
+        thiz.datastart = thiz.dataend = thiz.data = 0;
        Mat(m->rows, m->cols, m->type, m->data.ptr, m->step).copyTo(thiz);
    }

@@ -636,7 +636,7 @@ static Mat iplImageToMat(const IplImage* img, bool copyData)
        m.rows = img->roi->height;
        m.cols = img->roi->width;
        esz = CV_ELEM_SIZE(m.flags);
-        m.data = m.datastart = (uchar*)img->imageData +
+        m.datastart = m.data = (uchar*)img->imageData +
            (selectedPlane ? (img->roi->coi - 1)*m.step*img->height : 0) +
            img->roi->yOffset*m.step[0] + img->roi->xOffset*esz;
    }
@@ -2758,15 +2758,18 @@ namespace cv {

 static bool ocl_setIdentity( InputOutputArray _m, const Scalar& s )
 {
-    int type = _m.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), kercn = cn;
-    if (cn == 1)
+    int type = _m.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), kercn = cn, rowsPerWI = 1;
+    int sctype = CV_MAKE_TYPE(depth, cn == 3 ? 4 : cn);
+    if (ocl::Device::getDefault().isIntel())
    {
-        kercn = std::min(ocl::predictOptimalVectorWidth(_m), 4);
-        if (kercn != 4)
-            kercn = 1;
+        rowsPerWI = 4;
+        if (cn == 1)
+        {
+            kercn = std::min(ocl::predictOptimalVectorWidth(_m), 4);
+            if (kercn != 4)
+                kercn = 1;
+        }
    }
-    int sctype = CV_MAKE_TYPE(depth, cn == 3 ? 4 : cn),
-            rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1;

    ocl::Kernel k("setIdentity", ocl::core::set_identity_oclsrc,
                  format("-D T=%s -D T1=%s -D cn=%d -D ST=%s -D kercn=%d -D rowsPerWI=%d",
@@ -5529,14 +5532,14 @@ double norm( const SparseMat& src, int normType )
    {
        if( normType == NORM_INF )
            for( i = 0; i < N; i++, ++it )
-                result = std::max(result, std::abs((double)*(const float*)it.ptr));
+                result = std::max(result, std::abs((double)it.value<float>()));
        else if( normType == NORM_L1 )
            for( i = 0; i < N; i++, ++it )
-                result += std::abs(*(const float*)it.ptr);
+                result += std::abs(it.value<float>());
        else
            for( i = 0; i < N; i++, ++it )
            {
-                double v = *(const float*)it.ptr;
+                double v = it.value<float>();
                result += v*v;
            }
    }
@@ -5544,14 +5547,14 @@ double norm( const SparseMat& src, int normType )
    {
        if( normType == NORM_INF )
            for( i = 0; i < N; i++, ++it )
-                result = std::max(result, std::abs(*(const double*)it.ptr));
+                result = std::max(result, std::abs(it.value<double>()));
        else if( normType == NORM_L1 )
            for( i = 0; i < N; i++, ++it )
-                result += std::abs(*(const double*)it.ptr);
+                result += std::abs(it.value<double>());
        else
            for( i = 0; i < N; i++, ++it )
            {
-                double v = *(const double*)it.ptr;
+                double v = it.value<double>();
                result += v*v;
            }
    }
@@ -5575,7 +5578,7 @@ void minMaxLoc( const SparseMat& src, double* _minval, double* _maxval, int* _mi
        float minval = FLT_MAX, maxval = -FLT_MAX;
        for( i = 0; i < N; i++, ++it )
        {
-            float v = *(const float*)it.ptr;
+            float v = it.value<float>();
            if( v < minval )
            {
                minval = v;
@@ -5597,7 +5600,7 @@ void minMaxLoc( const SparseMat& src, double* _minval, double* _maxval, int* _mi
        double minval = DBL_MAX, maxval = -DBL_MAX;
        for( i = 0; i < N; i++, ++it )
        {
-            double v = *(const double*)it.ptr;
+            double v = it.value<double>();
            if( v < minval )
            {
                minval = v;
--- a/modules/core/src/ocl.cpp
+++ b/modules/core/src/ocl.cpp
@@ -57,6 +57,28 @@
 # endif
 #endif

+
+// TODO Move to some common place
+static bool getBoolParameter(const char* name, bool defaultValue) // Parses environment variable `name` as a boolean flag.
+{
+    const char* envValue = getenv(name);
+    if (envValue == NULL) // variable is not set: fall back to the caller-supplied default
+    {
+        return defaultValue;
+    }
+    cv::String value = envValue;
+    if (value == "1" || value == "True" || value == "true" || value == "TRUE") // the only spellings accepted as "on"
+    {
+        return true;
+    }
+    if (value == "0" || value == "False" || value == "false" || value == "FALSE") // the only spellings accepted as "off"
+    {
+        return false;
+    }
+    CV_ErrorNoReturn(cv::Error::StsBadArg, cv::format("Invalid value for %s parameter: %s", name, value.c_str())); // any other text, including an empty string, is rejected with StsBadArg
+}
+
+
 // TODO Move to some common place
 static size_t getConfigurationParameterForSize(const char* name, size_t defaultValue)
 {
@@ -1305,7 +1327,18 @@ OCL_FUNC(cl_int, clReleaseEvent, (cl_event event), (event))
 #ifdef _DEBUG
 #define CV_OclDbgAssert CV_DbgAssert
 #else
-#define CV_OclDbgAssert(expr) (void)(expr)
+static bool isRaiseError() // Returns the OPENCV_OPENCL_RAISE_ERROR flag; the environment is read on the first call only.
+{
+    static bool initialized = false; // becomes true once the parameter has been read
+    static bool value = false; // cached result handed back by every later call
+    if (!initialized) // NOTE(review): plain flag with no locking visible here - confirm the first call cannot race across threads
+    {
+        value = getBoolParameter("OPENCV_OPENCL_RAISE_ERROR", false); // false is the default passed for an unset variable
+        initialized = true;
+    }
+    return value;
+}
+#define CV_OclDbgAssert(expr) do { if (isRaiseError()) { CV_Assert(expr); } else { (void)(expr); } } while ((void)0, 0) // expr is evaluated in both branches; it is only checked when the flag is on
 #endif

 namespace cv { namespace ocl {
@@ -4711,4 +4744,16 @@ void* Image2D::ptr() const
    return p ? p->handle : 0;
 }

+bool isPerformanceCheckBypassed() // Returns the OPENCV_OPENCL_PERF_CHECK_BYPASS flag; the environment is read on the first call only.
+{
+    static bool initialized = false; // becomes true once the parameter has been read
+    static bool value = false; // cached result handed back by every later call
+    if (!initialized) // NOTE(review): plain flag with no locking visible here - confirm the first call cannot race across threads
+    {
+        value = getBoolParameter("OPENCV_OPENCL_PERF_CHECK_BYPASS", false); // false is the default passed for an unset variable
+        initialized = true;
+    }
+    return value;
+}
+
 }}
--- a/modules/core/src/opencl/fft.cl
+++ b/modules/core/src/opencl/fft.cl
@@ -424,7 +424,7 @@ void fft_radix3_B3(__local float2* smem, __global const float2* twiddles, const
    const int x3 = x2 + t/3;
    float2 a0, a1, a2, a3, a4, a5, a6, a7, a8;

-    if (x1 < t/2)
+    if (x1 < t/3)
    {
        a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t];
        a3 = smem[x2]; a4 = smem[x2+t]; a5 = smem[x2+2*t];
@@ -433,7 +433,7 @@ void fft_radix3_B3(__local float2* smem, __global const float2* twiddles, const

    barrier(CLK_LOCAL_MEM_FENCE);

-    if (x1 < t/2)
+    if (x1 < t/3)
    {
        butterfly3(a0, a1, a2, smem, twiddles, x1, block_size);
        butterfly3(a3, a4, a5, smem, twiddles, x2, block_size);
--- a/modules/core/src/opencl/meanstddev.cl
+++ b/modules/core/src/opencl/meanstddev.cl
@@ -59,7 +59,7 @@ __kernel void meanStdDev(__global const uchar * srcptr, int src_step, int src_of
    for (int grain = groups * WGS; id < total; id += grain)
    {
 #ifdef HAVE_MASK
-#ifdef HAVE_SRC_CONT
+#ifdef HAVE_MASK_CONT
        int mask_index = id;
 #else
        int mask_index = mad24(id / cols, mask_step, id % cols);
--- a/modules/core/src/opencl/runtime/opencl_clamdblas.cpp
+++ b/modules/core/src/opencl/runtime/opencl_clamdblas.cpp
@@ -39,7 +39,7 @@
 //
 //M*/

-#include "precomp.hpp"
+#include "../../precomp.hpp"

 #ifdef HAVE_CLAMDBLAS

--- a/modules/core/src/opencl/runtime/opencl_clamdfft.cpp
+++ b/modules/core/src/opencl/runtime/opencl_clamdfft.cpp
@@ -39,7 +39,7 @@
 //
 //M*/

-#include "precomp.hpp"
+#include "../../precomp.hpp"

 #ifdef HAVE_CLAMDFFT

--- a/modules/core/src/opencl/runtime/opencl_core.cpp
+++ b/modules/core/src/opencl/runtime/opencl_core.cpp
@@ -39,7 +39,7 @@
 //
 //M*/

-#include "precomp.hpp"
+#include "../../precomp.hpp"

 #if defined(HAVE_OPENCL) && !defined(HAVE_OPENCL_STATIC)

--- a/modules/core/src/stat.cpp
+++ b/modules/core/src/stat.cpp
@@ -44,7 +44,7 @@
 #include <climits>
 #include <limits>

-#include "opencl_kernels.hpp"
+#include "opencl_kernels_core.hpp"

 namespace cv
 {
@@ -568,7 +568,7 @@ cv::Scalar cv::sum( InputArray _src )
 {
 #ifdef HAVE_OPENCL
    Scalar _res;
-    CV_OCL_RUN_(_src.isUMat() && _src.dims() <= 2,
+    CV_OCL_RUN_(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2,
                ocl_sum(_src, _res, OCL_OP_SUM),
                _res)
 #endif
@@ -719,7 +719,7 @@ int cv::countNonZero( InputArray _src )

 #ifdef HAVE_OPENCL
    int res = -1;
-    CV_OCL_RUN_(_src.isUMat() && _src.dims() <= 2,
+    CV_OCL_RUN_(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2,
                ocl_countNonZero(_src, res),
                res)
 #endif
@@ -782,7 +782,7 @@ cv::Scalar cv::mean( InputArray _src, InputArray _mask )
        int type = src.type();
        if( !mask.empty() )
        {
-            typedef IppStatus (CV_STDCALL* ippiMaskMeanFuncC1)(const void *, int, void *, int, IppiSize, Ipp64f *);
+            typedef IppStatus (CV_STDCALL* ippiMaskMeanFuncC1)(const void *, int, const void *, int, IppiSize, Ipp64f *);
            ippiMaskMeanFuncC1 ippFuncC1 =
            type == CV_8UC1 ? (ippiMaskMeanFuncC1)ippiMean_8u_C1MR :
            type == CV_16UC1 ? (ippiMaskMeanFuncC1)ippiMean_16u_C1MR :
@@ -795,7 +795,7 @@ cv::Scalar cv::mean( InputArray _src, InputArray _mask )
                    return Scalar(res);
                setIppErrorStatus();
            }
-            typedef IppStatus (CV_STDCALL* ippiMaskMeanFuncC3)(const void *, int, void *, int, IppiSize, int, Ipp64f *);
+            typedef IppStatus (CV_STDCALL* ippiMaskMeanFuncC3)(const void *, int, const void *, int, IppiSize, int, Ipp64f *);
            ippiMaskMeanFuncC3 ippFuncC3 =
            type == CV_8UC3 ? (ippiMaskMeanFuncC3)ippiMean_8u_C3CMR :
            type == CV_16UC3 ? (ippiMaskMeanFuncC3)ippiMean_16u_C3CMR :
@@ -918,7 +918,8 @@ static bool ocl_meanStdDev( InputArray _src, OutputArray _mean, OutputArray _sdv
    {
        int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
        bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0,
-                isContinuous = _src.isContinuous();
+                isContinuous = _src.isContinuous(),
+                isMaskContinuous = _mask.isContinuous();
        const ocl::Device &defDev = ocl::Device::getDefault();
        int groups = defDev.maxComputeUnits();
        if (defDev.isIntel())
@@ -943,13 +944,14 @@ static bool ocl_meanStdDev( InputArray _src, OutputArray _mean, OutputArray _sdv

        char cvt[2][40];
        String opts = format("-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D sqddepth=%d"
-                             " -D sqdstT=%s -D sqdstT1=%s -D convertToSDT=%s -D cn=%d%s"
+                             " -D sqdstT=%s -D sqdstT1=%s -D convertToSDT=%s -D cn=%d%s%s"
                             " -D convertToDT=%s -D WGS=%d -D WGS2_ALIGNED=%d%s%s",
                             ocl::typeToStr(type), ocl::typeToStr(depth),
                             ocl::typeToStr(dtype), ocl::typeToStr(ddepth), sqddepth,
                             ocl::typeToStr(sqdtype), ocl::typeToStr(sqddepth),
                             ocl::convertTypeStr(depth, sqddepth, cn, cvt[0]),
                             cn, isContinuous ? " -D HAVE_SRC_CONT" : "",
+                             isMaskContinuous ? " -D HAVE_MASK_CONT" : "",
                             ocl::convertTypeStr(depth, ddepth, cn, cvt[1]),
                             (int)wgs, wgs2_aligned, haveMask ? " -D HAVE_MASK" : "",
                             doubleSupport ? " -D DOUBLE_SUPPORT" : "");
@@ -1025,7 +1027,7 @@ static bool ocl_meanStdDev( InputArray _src, OutputArray _mean, OutputArray _sdv

 void cv::meanStdDev( InputArray _src, OutputArray _mean, OutputArray _sdv, InputArray _mask )
 {
-    CV_OCL_RUN(_src.isUMat() && _src.dims() <= 2,
+    CV_OCL_RUN(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2,
               ocl_meanStdDev(_src, _mean, _sdv, _mask))

    Mat src = _src.getMat(), mask = _mask.getMat();
@@ -1069,7 +1071,7 @@ void cv::meanStdDev( InputArray _src, OutputArray _mean, OutputArray _sdv, Input
        int type = src.type();
        if( !mask.empty() )
        {
-            typedef IppStatus (CV_STDCALL* ippiMaskMeanStdDevFuncC1)(const void *, int, void *, int, IppiSize, Ipp64f *, Ipp64f *);
+            typedef IppStatus (CV_STDCALL* ippiMaskMeanStdDevFuncC1)(const void *, int, const void *, int, IppiSize, Ipp64f *, Ipp64f *);
            ippiMaskMeanStdDevFuncC1 ippFuncC1 =
            type == CV_8UC1 ? (ippiMaskMeanStdDevFuncC1)ippiMean_StdDev_8u_C1MR :
            type == CV_16UC1 ? (ippiMaskMeanStdDevFuncC1)ippiMean_StdDev_16u_C1MR :
@@ -1081,7 +1083,7 @@ void cv::meanStdDev( InputArray _src, OutputArray _mean, OutputArray _sdv, Input
                    return;
                setIppErrorStatus();
            }
-            typedef IppStatus (CV_STDCALL* ippiMaskMeanStdDevFuncC3)(const void *, int, void *, int, IppiSize, int, Ipp64f *, Ipp64f *);
+            typedef IppStatus (CV_STDCALL* ippiMaskMeanStdDevFuncC3)(const void *, int, const void *, int, IppiSize, int, Ipp64f *, Ipp64f *);
            ippiMaskMeanStdDevFuncC3 ippFuncC3 =
            type == CV_8UC3 ? (ippiMaskMeanStdDevFuncC3)ippiMean_StdDev_8u_C3CMR :
            type == CV_16UC3 ? (ippiMaskMeanStdDevFuncC3)ippiMean_StdDev_16u_C3CMR :
@@ -1571,7 +1573,7 @@ void cv::minMaxIdx(InputArray _src, double* minVal,
    CV_Assert( (cn == 1 && (_mask.empty() || _mask.type() == CV_8U)) ||
        (cn > 1 && _mask.empty() && !minIdx && !maxIdx) );

-    CV_OCL_RUN(_src.isUMat() && _src.dims() <= 2  && (_mask.empty() || _src.size() == _mask.size()),
+    CV_OCL_RUN(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2  && (_mask.empty() || _src.size() == _mask.size()),
               ocl_minMaxIdx(_src, minVal, maxVal, minIdx, maxIdx, _mask))

    Mat src = _src.getMat(), mask = _mask.getMat();
@@ -2234,7 +2236,7 @@ double cv::norm( InputArray _src, int normType, InputArray _mask )

 #ifdef HAVE_OPENCL
    double _result = 0;
-    CV_OCL_RUN_(_src.isUMat() && _src.dims() <= 2,
+    CV_OCL_RUN_(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2,
                ocl_norm(_src, normType, _mask, _result),
                _result)
 #endif
@@ -2594,7 +2596,7 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m

 #ifdef HAVE_OPENCL
    double _result = 0;
-    CV_OCL_RUN_(_src1.isUMat(),
+    CV_OCL_RUN_(OCL_PERFORMANCE_CHECK(_src1.isUMat()),
                ocl_norm(_src1, _src2, normType, _mask, _result),
                _result)
 #endif
--- a/modules/core/src/umatrix.cpp
+++ b/modules/core/src/umatrix.cpp
@@ -41,7 +41,7 @@
 //M*/

 #include "precomp.hpp"
-#include "opencl_kernels.hpp"
+#include "opencl_kernels_core.hpp"

 ///////////////////////////////// UMat implementation ///////////////////////////////

@@ -582,7 +582,7 @@ Mat UMat::getMat(int accessFlags) const
    hdr.flags = flags;
    hdr.u = u;
    hdr.datastart = u->data;
-    hdr.data = hdr.datastart + offset;
+    hdr.data = u->data + offset;
    hdr.datalimit = hdr.dataend = u->data + u->size;
    CV_XADD(&hdr.u->refcount, 1);
    return hdr;
@@ -593,15 +593,16 @@ void* UMat::handle(int accessFlags) const
    if( !u )
        return 0;

-    if ((accessFlags & ACCESS_WRITE) != 0)
-        u->markHostCopyObsolete(true);
-
    // check flags: if CPU copy is newer, copy it back to GPU.
    if( u->deviceCopyObsolete() )
    {
        CV_Assert(u->refcount == 0);
        u->currAllocator->unmap(u);
    }
+
+    if ((accessFlags & ACCESS_WRITE) != 0)
+        u->markHostCopyObsolete(true);
+
    return u->handle;
 }

--- a/modules/core/test/ocl/test_arithm.cpp
+++ b/modules/core/test/ocl/test_arithm.cpp
@@ -39,7 +39,7 @@
 //
 //M*/

-#include "test_precomp.hpp"
+#include "../test_precomp.hpp"
 #include "opencv2/ts/ocl_test.hpp"

 #include <cmath>
@@ -157,6 +157,7 @@ PARAM_TEST_CASE(ArithmTestBase, MatDepth, Channels, bool)
        Border maskBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
        randomSubMat(mask, mask_roi, roiSize, maskBorder, CV_8UC1, 0, 2);
        cv::threshold(mask, mask, 0.5, 255., CV_8UC1);
+        *mask.ptr(0) = 255; // prevent test case with mask filled 0 only

        val = cv::Scalar(rng.uniform(-100.0, 100.0), rng.uniform(-100.0, 100.0),
                         rng.uniform(-100.0, 100.0), rng.uniform(-100.0, 100.0));
@@ -1419,7 +1420,7 @@ OCL_TEST_P(UMatDot, Mat)
        OCL_OFF(const double cpuRes = src1_roi.dot(src2_roi));
        OCL_ON(const double gpuRes = usrc1_roi.dot(usrc2_roi));

-        EXPECT_PRED3(relativeError, cpuRes, gpuRes, 1e-6);
+        EXPECT_PRED3(relativeError, cpuRes, gpuRes, 1e-5);
    }
 }

@@ -1749,7 +1750,7 @@ OCL_TEST_P(ReduceAvg, Mat)
        OCL_OFF(cv::reduce(src_roi, dst_roi, dim, CV_REDUCE_AVG, dtype));
        OCL_ON(cv::reduce(usrc_roi, udst_roi, dim, CV_REDUCE_AVG, dtype));

-        double eps = ddepth <= CV_32S ? 1 : 5e-6;
+        double eps = ddepth <= CV_32S ? 1 : 6e-6;
        OCL_EXPECT_MATS_NEAR(dst, eps);
    }
 }
--- a/modules/core/test/ocl/test_channels.cpp
+++ b/modules/core/test/ocl/test_channels.cpp
@@ -44,7 +44,7 @@
 //
 //M*/

-#include "test_precomp.hpp"
+#include "../test_precomp.hpp"
 #include "opencv2/ts/ocl_test.hpp"

 #ifdef HAVE_OPENCL
@@ -105,6 +105,7 @@ PARAM_TEST_CASE(Merge, MatDepth, int, bool)
        UMAT_UPLOAD_INPUT_PARAMETER(src3);
        UMAT_UPLOAD_INPUT_PARAMETER(src4);

+        src_roi.clear(); usrc_roi.clear(); // for test_loop_times > 1
        src_roi.push_back(src1_roi), usrc_roi.push_back(usrc1_roi);
        if (nsrc >= 2)
            src_roi.push_back(src2_roi), usrc_roi.push_back(usrc2_roi);
--- a/modules/core/test/ocl/test_dft.cpp
+++ b/modules/core/test/ocl/test_dft.cpp
@@ -43,7 +43,7 @@
 //
 //M*/

-#include "test_precomp.hpp"
+#include "../test_precomp.hpp"
 #include "opencv2/ts/ocl_test.hpp"

 #ifdef HAVE_OPENCL
@@ -108,7 +108,7 @@ OCL_TEST_P(Dft, Mat)
 {
    generateTestData();

-    int nonzero_rows = hint ? src.cols - randomInt(1, src.rows-1) : 0;
+    int nonzero_rows = hint ? src.rows - randomInt(1, src.rows-1) : 0;
    OCL_OFF(cv::dft(src, dst, dft_flags, nonzero_rows));
    OCL_ON(cv::dft(usrc, udst, dft_flags, nonzero_rows));

@@ -175,7 +175,7 @@ OCL_TEST_P(MulSpectrums, Mat)

 OCL_INSTANTIATE_TEST_CASE_P(OCL_ImgProc, MulSpectrums, testing::Combine(Bool(), Bool()));

-OCL_INSTANTIATE_TEST_CASE_P(Core, Dft, Combine(Values(cv::Size(10, 10), cv::Size(36, 36), cv::Size(512, 1), cv::Size(1280, 768)),
+OCL_INSTANTIATE_TEST_CASE_P(Core, Dft, Combine(Values(cv::Size(45, 72), cv::Size(36, 36), cv::Size(512, 1), cv::Size(1280, 768)),
                                               Values((OCL_FFT_TYPE) R2C, (OCL_FFT_TYPE) C2C, (OCL_FFT_TYPE) R2R, (OCL_FFT_TYPE) C2R),
                                               Bool(), // DFT_INVERSE
                                               Bool(), // DFT_ROWS
--- a/modules/core/test/ocl/test_gemm.cpp
+++ b/modules/core/test/ocl/test_gemm.cpp
@@ -42,7 +42,7 @@
 //
 //M*/

-#include "test_precomp.hpp"
+#include "../test_precomp.hpp"
 #include "opencv2/ts/ocl_test.hpp"

 #ifdef HAVE_OPENCL
--- a/modules/core/test/ocl/test_matrix_expr.cpp
+++ b/modules/core/test/ocl/test_matrix_expr.cpp
@@ -5,7 +5,7 @@
 // Copyright (C) 2014, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.

-#include "test_precomp.hpp"
+#include "../test_precomp.hpp"
 #include "opencv2/ts/ocl_test.hpp"

 #ifdef HAVE_OPENCL
--- a/modules/core/test/ocl/test_matrix_operation.cpp
+++ b/modules/core/test/ocl/test_matrix_operation.cpp
@@ -44,7 +44,7 @@
 //
 //M*/

-#include "test_precomp.hpp"
+#include "../test_precomp.hpp"
 #include "opencv2/ts/ocl_test.hpp"

 #ifdef HAVE_OPENCL
@@ -96,7 +96,7 @@ OCL_TEST_P(ConvertTo, Accuracy)
        OCL_OFF(src_roi.convertTo(dst_roi, dstType, alpha, beta));
        OCL_ON(usrc_roi.convertTo(udst_roi, dstType, alpha, beta));

-        double eps = src_depth >= CV_32F || CV_MAT_DEPTH(dstType) >= CV_32F ? 1e-4 : 1;
+        double eps = CV_MAT_DEPTH(dstType) >= CV_32F ? 2e-4 : 1;
        OCL_EXPECT_MATS_NEAR(dst, eps);
    }
 }
@@ -121,7 +121,7 @@ PARAM_TEST_CASE(CopyTo, MatDepth, Channels, bool, bool)
        use_mask = GET_PARAM(3);
    }

-    void generateTestData()
+    void generateTestData(bool one_cn_mask = false)
    {
        const int type = CV_MAKE_TYPE(depth, cn);

@@ -132,9 +132,11 @@ PARAM_TEST_CASE(CopyTo, MatDepth, Channels, bool, bool)
        if (use_mask)
        {
            Border maskBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
-            int mask_cn = randomDouble(0.0, 2.0) > 1.0 ? cn : 1;
+            int mask_cn = 1;
+            if (!one_cn_mask && randomDouble(0.0, 2.0) > 1.0)
+                mask_cn = cn;
            randomSubMat(mask, mask_roi, roiSize, maskBorder, CV_8UC(mask_cn), 0, 2);
-            cv::threshold(mask, mask, 0.5, 255., CV_8UC1);
+            cv::threshold(mask, mask, 0.5, 255., THRESH_BINARY);
        }

        Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
@@ -177,7 +179,7 @@ OCL_TEST_P(SetTo, Accuracy)
 {
    for (int j = 0; j < test_loop_times; j++)
    {
-        generateTestData();
+        generateTestData(true); // see modules/core/src/umatrix.cpp Ln:791 => CV_Assert( mask.size() == size() && mask.type() == CV_8UC1 );

        if (use_mask)
        {
--- a/modules/core/test/test_mat.cpp
+++ b/modules/core/test/test_mat.cpp
@@ -649,6 +649,16 @@ static void setValue(SparseMat& M, const int* idx, double value, RNG& rng)
        CV_Error(CV_StsUnsupportedFormat, "");
 }

+template<typename Pixel>
+struct InitializerFunctor{
+    /// Initializer for cv::Mat::forEach test: writes the index it is called with into the element itself
+    void operator()(Pixel & pixel, const int * idx) const {
+        pixel.x = idx[0]; // idx is read at positions 0..2, so it must hold at least three entries
+        pixel.y = idx[1];
+        pixel.z = idx[2];
+    }
+};
+
 void Core_ArrayOpTest::run( int /* start_from */)
 {
    int errcount = 0;
@@ -686,6 +696,45 @@ void Core_ArrayOpTest::run( int /* start_from */)
            errcount++;
        }
    }
+    // test cv::Mat::forEach: store each element's own index in it, then verify every element and the overall sum
+    {
+        const int dims[3] = { 101, 107, 7 };
+        typedef cv::Point3i Pixel;
+
+        cv::Mat a = cv::Mat::zeros(3, dims, CV_32SC3);
+        InitializerFunctor<Pixel> initializer;
+
+        a.forEach<Pixel>(initializer);
+
+        uint64 total = 0; // running sum of every stored component, cross-checked against total2 below
+        bool error_reported = false; // keeps the log to the first mismatch; errcount still counts all of them
+        for (int i0 = 0; i0 < dims[0]; ++i0) {
+            for (int i1 = 0; i1 < dims[1]; ++i1) {
+                for (int i2 = 0; i2 < dims[2]; ++i2) {
+                    Pixel& pixel = a.at<Pixel>(i0, i1, i2);
+                    if (pixel.x != i0 || pixel.y != i1 || pixel.z != i2) {
+                        if (!error_reported) {
+                            ts->printf(cvtest::TS::LOG, "forEach is not correct.\n"
+                                "First error detected at (%d, %d, %d).\n", i0, i1, i2); // log the failing position, not the wrong values stored there
+                            error_reported = true;
+                        }
+                        errcount++;
+                    }
+                    total += pixel.x;
+                    total += pixel.y;
+                    total += pixel.z;
+                }
+            }
+        }
+        uint64 total2 = 0; // expected sum, computed in closed form instead of by iteration
+        for (size_t i = 0; i < sizeof(dims) / sizeof(dims[0]); ++i) {
+            total2 += ((dims[i] - 1) * dims[i] / 2) * dims[0] * dims[1] * dims[2] / dims[i]; // (0 + ... + dims[i]-1) times the product of the other two dimensions
+        }
+        if (total != total2) {
+            ts->printf(cvtest::TS::LOG, "forEach is not correct because total is invalid.\n");
+            errcount++;
+        }
+    }

    RNG rng;
    const int MAX_DIM = 5, MAX_DIM_SZ = 10;
--- a/modules/core/test/test_umat.cpp
+++ b/modules/core/test/test_umat.cpp
@@ -745,6 +745,24 @@ TEST(UMat, Sync)
    EXPECT_EQ(0, cvtest::norm(um.getMat(ACCESS_READ), cv::Mat(um.size(), um.type(), 19), NORM_INF));
 }

+TEST(UMat, CopyToIfDeviceCopyIsObsolete) // Checks Mat::copyTo(UMat) followed by UMat::setTo after the UMat was written through a host mapping.
+{
+    UMat um(7, 2, CV_8UC1);
+    Mat m(um.size(), um.type());
+    m.setTo(Scalar::all(0));
+
+    {
+        // make obsolete device copy of UMat
+        Mat temp = um.getMat(ACCESS_WRITE);
+        temp.setTo(Scalar::all(10));
+    }
+
+    m.copyTo(um);
+    um.setTo(Scalar::all(17));
+
+    EXPECT_EQ(0, cvtest::norm(um.getMat(ACCESS_READ), Mat(um.size(), um.type(), 17), NORM_INF)); // zero NORM_INF distance means every element reads back as 17
+}
+
 TEST(UMat, setOpenCL)
 {
    // save the current state