Merge remote-tracking branch 'original/master' into constrained-hough-lines

2014-01-31 14:27:51 -08:00
parent b074c67a56 983799d398
commit aaa9be7c65
573 changed files with 16052 additions and 94518 deletions
--- a/modules/imgproc/doc/filtering.rst
+++ b/modules/imgproc/doc/filtering.rst
@@ -412,29 +412,6 @@ http://www.dai.ed.ac.uk/CVonline/LOCAL\_COPIES/MANDUCHI1/Bilateral\_Filtering.ht
 This filter does not work inplace.


-adaptiveBilateralFilter
-----------------------
-Applies the adaptive bilateral filter to an image.
-
-.. ocv:function:: void adaptiveBilateralFilter( InputArray src, OutputArray dst, Size ksize, double sigmaSpace, double maxSigmaColor = 20.0, Point anchor=Point(-1, -1), int borderType=BORDER_DEFAULT )
-
-.. ocv:pyfunction:: cv2.adaptiveBilateralFilter(src, ksize, sigmaSpace[, dst[, anchor[, borderType]]]) -> dst
-
-    :param src: The source image
-
-    :param dst: The destination image; will have the same size and the same type as src
-
-    :param ksize: The kernel size. This is the neighborhood where the local variance will be calculated, and where pixels will contribute (in a weighted manner).
-
-    :param sigmaSpace: Filter sigma in the coordinate space. Larger value of the parameter means that farther pixels will influence each other (as long as their colors are close enough; see sigmaColor). Then d>0, it specifies the neighborhood size regardless of sigmaSpace, otherwise d is proportional to sigmaSpace.
-
-    :param maxSigmaColor: Maximum allowed sigma color (will clamp the value calculated in the ksize neighborhood. Larger value of the parameter means that more dissimilar pixels will influence each other (as long as their colors are close enough; see sigmaColor). Then d>0, it specifies the neighborhood size regardless of sigmaSpace, otherwise d is proportional to sigmaSpace.
-
-    :param borderType: Pixel extrapolation method.
-
-A main part of our strategy will be to load each raw pixel once, and reuse it to calculate all pixels in the output (filtered) image that need this pixel value. The math of the filter is that of the usual bilateral filter, except that the sigma color is calculated in the neighborhood, and clamped by the optional input value.
-
-
 blur
 ----
 Blurs an image using the normalized box filter.
--- a/modules/imgproc/doc/miscellaneous_transformations.rst
+++ b/modules/imgproc/doc/miscellaneous_transformations.rst
@@ -110,6 +110,8 @@ But in case of a non-linear transformation, an input RGB image should be normali

 If you use ``cvtColor`` with 8-bit images, the conversion will have some information lost. For many applications, this will not be noticeable but it is recommended to use 32-bit images in applications that need the full range of colors or that convert an image before an operation and then convert back.

+If conversion adds the alpha channel, its value will set to the maximum of corresponding channel range: 255 for ``CV_8U``, 65535 for ``CV_16U``, 1 for ``CV_32F``.
+
 The function can do the following transformations:

 *
@@ -124,7 +126,7 @@ The function can do the following transformations:

    .. math::

-        \text{Gray to RGB[A]:} \quad R  \leftarrow Y, G  \leftarrow Y, B  \leftarrow Y, A  \leftarrow 0
+        \text{Gray to RGB[A]:} \quad R  \leftarrow Y, G  \leftarrow Y, B  \leftarrow Y, A  \leftarrow \max (ChannelRange)

    The conversion from a RGB image to gray is done with:

--- a/modules/imgproc/include/opencv2/imgproc.hpp
+++ b/modules/imgproc/include/opencv2/imgproc.hpp
@@ -203,7 +203,8 @@ enum { HISTCMP_CORREL        = 0,
       HISTCMP_CHISQR        = 1,
       HISTCMP_INTERSECT     = 2,
       HISTCMP_BHATTACHARYYA = 3,
-       HISTCMP_HELLINGER     = HISTCMP_BHATTACHARYYA
+       HISTCMP_HELLINGER     = HISTCMP_BHATTACHARYYA,
+       HISTCMP_CHISQR_ALT    = 4
     };

 //! the color conversion code
@@ -952,7 +953,7 @@ public:
 */
    CV_WRAP virtual int compareSegments(const Size& size, InputArray lines1, InputArray lines2, InputOutputArray _image = noArray()) = 0;

-    virtual ~LineSegmentDetector() {};
+    virtual ~LineSegmentDetector() { }
 };

 //! Returns a pointer to a LineSegmentDetector class.
@@ -1065,11 +1066,6 @@ CV_EXPORTS_W void bilateralFilter( InputArray src, OutputArray dst, int d,
                                   double sigmaColor, double sigmaSpace,
                                   int borderType = BORDER_DEFAULT );

-//! smooths the image using adaptive bilateral filter
-CV_EXPORTS_W void adaptiveBilateralFilter( InputArray src, OutputArray dst, Size ksize,
-                                           double sigmaSpace, double maxSigmaColor = 20.0, Point anchor=Point(-1, -1),
-                                           int borderType=BORDER_DEFAULT );
-
 //! smooths the image using the box filter. Each pixel is processed in O(1) time
 CV_EXPORTS_W void boxFilter( InputArray src, OutputArray dst, int ddepth,
                             Size ksize, Point anchor = Point(-1,-1),
--- a/modules/imgproc/perf/opencl/perf_accumulate.cpp
+++ b/modules/imgproc/perf/opencl/perf_accumulate.cpp
@@ -0,0 +1,140 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Nathan, liujun@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "perf_precomp.hpp"
+#include "opencv2/ts/ocl_perf.hpp"
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest {
+namespace ocl {
+
+/////////////////////////////////// Accumulate ///////////////////////////////////
+
+typedef Size_MatType AccumulateFixture;
+
+OCL_PERF_TEST_P(AccumulateFixture, Accumulate,
+                ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES))
+{
+    Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int srcType = get<1>(params), cn = CV_MAT_CN(srcType), dstType = CV_32FC(cn);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, dstType);
+
+    UMat src(srcSize, srcType), dst(srcSize, dstType);
+    declare.in(src, dst, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::accumulate(src, dst);
+
+    SANITY_CHECK_NOTHING();
+}
+
+/////////////////////////////////// AccumulateSquare ///////////////////////////////////
+
+typedef Size_MatType AccumulateSquareFixture;
+
+OCL_PERF_TEST_P(AccumulateSquareFixture, AccumulateSquare,
+                ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES))
+{
+    Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int srcType = get<1>(params), cn = CV_MAT_CN(srcType), dstType = CV_32FC(cn);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, dstType);
+
+    UMat src(srcSize, srcType), dst(srcSize, dstType);
+    declare.in(src, dst, WARMUP_RNG);
+
+    OCL_TEST_CYCLE() cv::accumulateSquare(src, dst);
+
+    SANITY_CHECK_NOTHING();
+}
+
+/////////////////////////////////// AccumulateProduct ///////////////////////////////////
+
+typedef Size_MatType AccumulateProductFixture;
+
+OCL_PERF_TEST_P(AccumulateProductFixture, AccumulateProduct,
+                ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES))
+{
+    Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int srcType = get<1>(params), cn = CV_MAT_CN(srcType), dstType = CV_32FC(cn);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, dstType);
+
+    UMat src1(srcSize, srcType), src2(srcSize, srcType), dst(srcSize, dstType);
+    declare.in(src1, src2, dst, WARMUP_RNG);
+
+    OCL_TEST_CYCLE() cv::accumulateProduct(src1, src2, dst);
+
+    SANITY_CHECK_NOTHING();
+}
+
+/////////////////////////////////// AccumulateWeighted ///////////////////////////////////
+
+typedef Size_MatType AccumulateWeightedFixture;
+
+OCL_PERF_TEST_P(AccumulateWeightedFixture, AccumulateWeighted,
+                ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES))
+{
+    Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int srcType = get<1>(params), cn = CV_MAT_CN(srcType), dstType = CV_32FC(cn);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, dstType);
+
+    UMat src(srcSize, srcType), dst(srcSize, dstType);
+    declare.in(src, dst, WARMUP_RNG);
+
+    OCL_TEST_CYCLE() cv::accumulateWeighted(src, dst, 2.0);
+
+    SANITY_CHECK_NOTHING();
+}
+
+} } // namespace cvtest::ocl
+
+#endif
--- a/modules/imgproc/perf/opencl/perf_blend.cpp
+++ b/modules/imgproc/perf/opencl/perf_blend.cpp
@@ -0,0 +1,82 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Fangfang Bai, fangfang@multicorewareinc.com
+//    Jin Ma,       jin@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "perf_precomp.hpp"
+#include "opencv2/ts/ocl_perf.hpp"
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest {
+namespace ocl {
+
+///////////// BlendLinear ////////////////////////
+
+typedef Size_MatType BlendLinearFixture;
+
+OCL_PERF_TEST_P(BlendLinearFixture, BlendLinear, ::testing::Combine(OCL_TEST_SIZES, OCL_PERF_ENUM(CV_32FC1, CV_32FC4)))
+{
+    Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int srcType = get<1>(params);
+    const double eps = CV_MAT_DEPTH(srcType) <= CV_32S ? 1.0 : 0.2;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, srcType);
+
+    UMat src1(srcSize, srcType), src2(srcSize, srcType), dst(srcSize, srcType);
+    UMat weights1(srcSize, CV_32FC1), weights2(srcSize, CV_32FC1);
+
+    declare.in(src1, src2, WARMUP_RNG).in(weights1, weights2, WARMUP_READ).out(dst);
+    randu(weights1, 0, 1);
+    randu(weights2, 0, 1);
+
+    OCL_TEST_CYCLE() cv::blendLinear(src1, src2, weights1, weights2, dst);
+
+    SANITY_CHECK(dst, eps);
+}
+
+} } // namespace cvtest::ocl
+
+#endif // HAVE_OPENCL
--- a/modules/imgproc/perf/opencl/perf_color.cpp
+++ b/modules/imgproc/perf/opencl/perf_color.cpp
@@ -0,0 +1,104 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Fangfang Bai, fangfang@multicorewareinc.com
+//    Jin Ma,       jin@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "perf_precomp.hpp"
+#include "opencv2/ts/ocl_perf.hpp"
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest {
+namespace ocl {
+
+using std::tr1::make_tuple;
+
+///////////// cvtColor////////////////////////
+
+CV_ENUM(ConversionTypes, COLOR_RGB2GRAY, COLOR_RGB2BGR, COLOR_RGB2YUV, COLOR_YUV2RGB, COLOR_RGB2YCrCb,
+        COLOR_YCrCb2RGB, COLOR_RGB2XYZ, COLOR_XYZ2RGB, COLOR_RGB2HSV, COLOR_HSV2RGB, COLOR_RGB2HLS,
+        COLOR_HLS2RGB, COLOR_BGR5652BGR, COLOR_BGR2BGR565, COLOR_RGBA2mRGBA, COLOR_mRGBA2RGBA, COLOR_YUV2RGB_NV12)
+
+typedef tuple<Size, tuple<ConversionTypes, int, int> > CvtColorParams;
+typedef TestBaseWithParam<CvtColorParams> CvtColorFixture;
+
+OCL_PERF_TEST_P(CvtColorFixture, CvtColor, testing::Combine(
+                OCL_TEST_SIZES,
+                testing::Values(
+                    make_tuple(ConversionTypes(COLOR_RGB2GRAY), 3, 1),
+                    make_tuple(ConversionTypes(COLOR_RGB2BGR), 3, 3),
+                    make_tuple(ConversionTypes(COLOR_RGB2YUV), 3, 3),
+                    make_tuple(ConversionTypes(COLOR_YUV2RGB), 3, 3),
+                    make_tuple(ConversionTypes(COLOR_RGB2YCrCb), 3, 3),
+                    make_tuple(ConversionTypes(COLOR_YCrCb2RGB), 3, 3),
+                    make_tuple(ConversionTypes(COLOR_RGB2XYZ), 3, 3),
+                    make_tuple(ConversionTypes(COLOR_XYZ2RGB), 3, 3),
+                    make_tuple(ConversionTypes(COLOR_RGB2HSV), 3, 3),
+                    make_tuple(ConversionTypes(COLOR_HSV2RGB), 3, 3),
+                    make_tuple(ConversionTypes(COLOR_RGB2HLS), 3, 3),
+                    make_tuple(ConversionTypes(COLOR_HLS2RGB), 3, 3),
+                    make_tuple(ConversionTypes(COLOR_BGR5652BGR), 2, 3),
+                    make_tuple(ConversionTypes(COLOR_BGR2BGR565), 3, 2),
+                    make_tuple(ConversionTypes(COLOR_RGBA2mRGBA), 4, 4),
+                    make_tuple(ConversionTypes(COLOR_mRGBA2RGBA), 4, 4),
+                    make_tuple(ConversionTypes(COLOR_YUV2RGB_NV12), 1, 3)
+                    )))
+{
+    CvtColorParams params = GetParam();
+    const Size srcSize = get<0>(params);
+    const tuple<int, int, int> conversionParams = get<1>(params);
+    const int code = get<0>(conversionParams), scn = get<1>(conversionParams),
+            dcn = get<2>(conversionParams);
+
+    UMat src(srcSize, CV_8UC(scn)), dst(srcSize, CV_8UC(scn));
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::cvtColor(src, dst, code, dcn);
+
+    SANITY_CHECK(dst, 1);
+}
+
+} } // namespace cvtest::ocl
+
+#endif // HAVE_OPENCL
--- a/modules/imgproc/perf/opencl/perf_filters.cpp
+++ b/modules/imgproc/perf/opencl/perf_filters.cpp
@@ -0,0 +1,302 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Fangfang Bai, fangfang@multicorewareinc.com
+//    Jin Ma,       jin@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "perf_precomp.hpp"
+#include "opencv2/ts/ocl_perf.hpp"
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest {
+namespace ocl {
+
+typedef tuple<Size, MatType, int> FilterParams;
+typedef TestBaseWithParam<FilterParams> FilterFixture;
+
+///////////// Blur ////////////////////////
+
+typedef FilterFixture BlurFixture;
+
+OCL_PERF_TEST_P(BlurFixture, Blur,
+                ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES, OCL_PERF_ENUM(3, 5)))
+{
+    const FilterParams params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params), ksize = get<2>(params), bordertype = BORDER_CONSTANT;
+    const double eps = CV_MAT_DEPTH(type) <= CV_32S ? 1 : 1e-5;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src(srcSize, type), dst(srcSize, type);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::blur(src, dst, Size(ksize, ksize), Point(-1, -1), bordertype);
+
+    SANITY_CHECK(dst, eps);
+}
+
+///////////// Laplacian////////////////////////
+
+typedef FilterFixture LaplacianFixture;
+
+OCL_PERF_TEST_P(LaplacianFixture, Laplacian,
+                ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES, OCL_PERF_ENUM(3, 5)))
+{
+    const FilterParams params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params), ksize = get<2>(params);
+    const double eps = CV_MAT_DEPTH(type) <= CV_32S ? 1 : 1e-5;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src(srcSize, type), dst(srcSize, type);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::Laplacian(src, dst, -1, ksize, 1);
+
+    SANITY_CHECK(dst, eps);
+}
+
+///////////// Erode ////////////////////
+
+typedef FilterFixture ErodeFixture;
+
+OCL_PERF_TEST_P(ErodeFixture, Erode,
+            ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES, OCL_PERF_ENUM(3, 5)))
+{
+    const FilterParams params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params), ksize = get<2>(params);
+    const Mat ker = getStructuringElement(MORPH_RECT, Size(ksize, ksize));
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src(srcSize, type), dst(srcSize, type);
+    declare.in(src, WARMUP_RNG).out(dst).in(ker);
+
+    OCL_TEST_CYCLE() cv::erode(src, dst, ker);
+
+    SANITY_CHECK(dst);
+}
+
+///////////// Dilate ////////////////////
+
+typedef FilterFixture DilateFixture;
+
+OCL_PERF_TEST_P(DilateFixture, Dilate,
+            ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES, OCL_PERF_ENUM(3, 5)))
+{
+    const FilterParams params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params), ksize = get<2>(params);
+    const Mat ker = getStructuringElement(MORPH_RECT, Size(ksize, ksize));
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src(srcSize, type), dst(srcSize, type);
+    declare.in(src, WARMUP_RNG).out(dst).in(ker);
+
+    OCL_TEST_CYCLE() cv::dilate(src, dst, ker);
+
+    SANITY_CHECK(dst);
+}
+
+///////////// MorphologyEx ////////////////////////
+
+CV_ENUM(MorphOp, MORPH_OPEN, MORPH_CLOSE, MORPH_GRADIENT, MORPH_TOPHAT, MORPH_BLACKHAT)
+
+typedef tuple<Size, MatType, MorphOp, int> MorphologyExParams;
+typedef TestBaseWithParam<MorphologyExParams> MorphologyExFixture;
+
+OCL_PERF_TEST_P(MorphologyExFixture, MorphologyEx,
+                ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES, MorphOp::all(), OCL_PERF_ENUM(3, 5)))
+{
+    const MorphologyExParams params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params), op = get<2>(params), ksize = get<3>(params);
+    const Mat ker = getStructuringElement(MORPH_RECT, Size(ksize, ksize));
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src(srcSize, type), dst(srcSize, type);
+    declare.in(src, WARMUP_RNG).out(dst).in(ker);
+
+    OCL_TEST_CYCLE() cv::morphologyEx(src, dst, op, ker);
+
+    SANITY_CHECK(dst);
+}
+
+///////////// Sobel ////////////////////////
+
+typedef Size_MatType SobelFixture;
+
+OCL_PERF_TEST_P(SobelFixture, Sobel,
+            ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params), dx = 1, dy = 1;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type, sizeof(float) * 2);
+
+    UMat src(srcSize, type), dst(srcSize, type);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::Sobel(src, dst, -1, dx, dy);
+
+    SANITY_CHECK(dst);
+}
+
+///////////// Scharr ////////////////////////
+
+typedef Size_MatType ScharrFixture;
+
+OCL_PERF_TEST_P(ScharrFixture, Scharr,
+            ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params), dx = 1, dy = 0;
+    const double eps = CV_MAT_DEPTH(type) <= CV_32S ? 1 : 1e-5;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type, sizeof(float) * 2);
+
+    UMat src(srcSize, type), dst(srcSize, type);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::Scharr(src, dst, -1, dx, dy);
+
+    SANITY_CHECK(dst, eps);
+}
+
+///////////// GaussianBlur ////////////////////////
+
+typedef FilterFixture GaussianBlurFixture;
+
+OCL_PERF_TEST_P(GaussianBlurFixture, GaussianBlur,
+            ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES, OCL_PERF_ENUM(3, 5, 7)))
+{
+    const FilterParams params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params), ksize = get<2>(params);
+    const double eps = CV_MAT_DEPTH(type) <= CV_32S ? 1 + DBL_EPSILON : 3e-4;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src(srcSize, type), dst(srcSize, type);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::GaussianBlur(src, dst, Size(ksize, ksize), 0);
+
+    SANITY_CHECK(dst, eps);
+}
+
+///////////// Filter2D ////////////////////////
+
+typedef FilterFixture Filter2DFixture;
+
+OCL_PERF_TEST_P(Filter2DFixture, Filter2D,
+            ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES, OCL_PERF_ENUM(3, 5)))
+{
+    const FilterParams params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params), ksize = get<2>(params);
+    const double eps = CV_MAT_DEPTH(type) <= CV_32S ? 1 : 1e-5;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src(srcSize, type), dst(srcSize, type), kernel(ksize, ksize, CV_32SC1);
+    declare.in(src, WARMUP_RNG).in(kernel).out(dst);
+    randu(kernel, -3.0, 3.0);
+
+    OCL_TEST_CYCLE() cv::filter2D(src, dst, -1, kernel);
+
+    SANITY_CHECK(dst, eps);
+}
+
+///////////// Bilateral ////////////////////////
+
+typedef TestBaseWithParam<Size> BilateralFixture;
+
+OCL_PERF_TEST_P(BilateralFixture, Bilateral, OCL_TEST_SIZES)
+{
+    const Size srcSize = GetParam();
+    const int d = 7;
+    const double sigmacolor = 50.0, sigmaspace = 50.0;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, CV_8UC1);
+
+    UMat src(srcSize, CV_8UC1), dst(srcSize, CV_8UC1);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::bilateralFilter(src, dst, d, sigmacolor, sigmaspace);
+
+    SANITY_CHECK(dst);
+}
+
+///////////// MedianBlur ////////////////////////
+
+typedef tuple<Size, int> MedianBlurParams;
+typedef TestBaseWithParam<MedianBlurParams> MedianBlurFixture;
+
+OCL_PERF_TEST_P(MedianBlurFixture, Bilateral, ::testing::Combine(OCL_TEST_SIZES, OCL_PERF_ENUM(3, 5)))
+{
+    MedianBlurParams params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int ksize = get<1>(params);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, CV_8UC1);
+
+    UMat src(srcSize, CV_8UC1), dst(srcSize, CV_8UC1);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::medianBlur(src, dst, ksize);
+
+    SANITY_CHECK(dst);
+}
+
+} } // namespace cvtest::ocl
+
+#endif // HAVE_OPENCL
--- a/modules/imgproc/perf/opencl/perf_gftt.cpp
+++ b/modules/imgproc/perf/opencl/perf_gftt.cpp
@@ -0,0 +1,87 @@
+///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "perf_precomp.hpp"
+#include "opencv2/ts/ocl_perf.hpp"
+
+#include <sstream>
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest {
+namespace ocl {
+
+//////////////////////////// GoodFeaturesToTrack //////////////////////////
+
+typedef tuple<String, double, bool> GoodFeaturesToTrackParams;
+typedef TestBaseWithParam<GoodFeaturesToTrackParams> GoodFeaturesToTrackFixture;
+
+OCL_PERF_TEST_P(GoodFeaturesToTrackFixture, GoodFeaturesToTrack,
+                ::testing::Combine(OCL_PERF_ENUM(String("gpu/opticalflow/rubberwhale1.png")),
+                                   OCL_PERF_ENUM(0.0, 3.0), Bool()))
+{
+    GoodFeaturesToTrackParams params = GetParam();
+    const String fileName = get<0>(params);
+    const double minDistance = get<1>(params), qualityLevel = 0.01;
+    const bool harrisDetector = get<2>(params);
+    const int maxCorners = 1000;
+
+    Mat img = imread(getDataPath(fileName), cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(img.empty()) << "could not load " << fileName;
+
+    checkDeviceMaxMemoryAllocSize(img.size(), img.type());
+
+    UMat src(img.size(), img.type()), dst(1, maxCorners, CV_32FC2);
+    img.copyTo(src);
+
+    declare.in(src, WARMUP_READ).out(dst);
+
+    OCL_TEST_CYCLE() cv::goodFeaturesToTrack(src, dst, maxCorners, qualityLevel,
+                                             minDistance, noArray(), 3, harrisDetector, 0.04);
+
+    SANITY_CHECK(dst);
+}
+
+} } // namespace cvtest::ocl
+
+#endif
--- a/modules/imgproc/perf/opencl/perf_imgproc.cpp
+++ b/modules/imgproc/perf/opencl/perf_imgproc.cpp
@@ -0,0 +1,302 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Fangfang Bai, fangfang@multicorewareinc.com
+//    Jin Ma,       jin@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "perf_precomp.hpp"
+#include "opencv2/ts/ocl_perf.hpp"
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest {
+namespace ocl {
+
+///////////// equalizeHist ////////////////////////
+
+typedef TestBaseWithParam<Size> EqualizeHistFixture;
+
+OCL_PERF_TEST_P(EqualizeHistFixture, EqualizeHist, OCL_TEST_SIZES)
+{
+    const Size srcSize = GetParam();
+    const double eps = 1;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, CV_8UC1);
+
+    UMat src(srcSize, CV_8UC1), dst(srcSize, CV_8UC1);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::equalizeHist(src, dst);
+
+    SANITY_CHECK(dst, eps);
+}
+
+///////////// calcHist ////////////////////////
+
+typedef TestBaseWithParam<Size> CalcHistFixture;
+
+OCL_PERF_TEST_P(CalcHistFixture, CalcHist, OCL_TEST_SIZES)
+{
+    const Size srcSize = GetParam();
+
+    const std::vector<int> channels(1, 0);
+    std::vector<float> ranges(2);
+    std::vector<int> histSize(1, 256);
+    ranges[0] = 0;
+    ranges[1] = 256;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, CV_8UC1);
+
+    UMat src(srcSize, CV_8UC1), hist(256, 1, CV_32FC1);
+    declare.in(src, WARMUP_RNG).out(hist);
+
+    OCL_TEST_CYCLE() cv::calcHist(std::vector<UMat>(1, src), channels, noArray(), hist, histSize, ranges, false);
+
+    SANITY_CHECK(hist);
+}
+
+/////////// CopyMakeBorder //////////////////////
+
+CV_ENUM(Border, BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT, BORDER_WRAP, BORDER_REFLECT_101)
+
+typedef tuple<Size, MatType, Border> CopyMakeBorderParamType;
+typedef TestBaseWithParam<CopyMakeBorderParamType> CopyMakeBorderFixture;
+
+OCL_PERF_TEST_P(CopyMakeBorderFixture, CopyMakeBorder,
+            ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES, Border::all()))
+{
+    const CopyMakeBorderParamType params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params), borderType = get<2>(params);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src(srcSize, type), dst;
+    const Size dstSize = srcSize + Size(12, 12);
+    dst.create(dstSize, type);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::copyMakeBorder(src, dst, 7, 5, 5, 7, borderType, cv::Scalar(1.0));
+
+    SANITY_CHECK(dst);
+}
+
+///////////// CornerMinEigenVal ////////////////////////
+
+typedef Size_MatType CornerMinEigenValFixture;
+
+OCL_PERF_TEST_P(CornerMinEigenValFixture, CornerMinEigenVal,
+            ::testing::Combine(OCL_TEST_SIZES, OCL_PERF_ENUM(CV_8UC1, CV_32FC1)))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params), borderType = BORDER_REFLECT;
+    const int blockSize = 7, apertureSize = 1 + 2 * 3;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src(srcSize, type), dst(srcSize, CV_32FC1);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::cornerMinEigenVal(src, dst, blockSize, apertureSize, borderType);
+
+    SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE);
+}
+
+///////////// CornerHarris ////////////////////////
+
+typedef Size_MatType CornerHarrisFixture;
+
+OCL_PERF_TEST_P(CornerHarrisFixture, CornerHarris,
+            ::testing::Combine(OCL_TEST_SIZES, OCL_PERF_ENUM(CV_8UC1, CV_32FC1)))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params), borderType = BORDER_REFLECT;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src(srcSize, type), dst(srcSize, CV_32FC1);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::cornerHarris(src, dst, 5, 7, 0.1, borderType);
+
+    SANITY_CHECK(dst, 5e-6, ERROR_RELATIVE);
+}
+
+///////////// PreCornerDetect ////////////////////////
+
+typedef Size_MatType PreCornerDetectFixture;
+
+OCL_PERF_TEST_P(PreCornerDetectFixture, PreCornerDetect,
+            ::testing::Combine(OCL_TEST_SIZES, OCL_PERF_ENUM(CV_8UC1, CV_32FC1)))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params), borderType = BORDER_REFLECT;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src(srcSize, type), dst(srcSize, CV_32FC1);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::preCornerDetect(src, dst, 3, borderType);
+
+    SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE);
+}
+
+///////////// Integral ////////////////////////
+
+typedef tuple<Size, MatDepth> IntegralParams;
+typedef TestBaseWithParam<IntegralParams> IntegralFixture;
+
+OCL_PERF_TEST_P(IntegralFixture, Integral1, ::testing::Combine(OCL_TEST_SIZES, OCL_PERF_ENUM(CV_32S, CV_32F)))
+{
+    const IntegralParams params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int ddepth = get<1>(params);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, ddepth);
+
+    UMat src(srcSize, CV_8UC1), dst(srcSize + Size(1, 1), ddepth);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::integral(src, dst, ddepth);
+
+    SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE);
+}
+
+///////////// Threshold ////////////////////////
+
+CV_ENUM(ThreshType, THRESH_BINARY, THRESH_BINARY_INV, THRESH_TRUNC, THRESH_TOZERO_INV)
+
+typedef tuple<Size, MatType, ThreshType> ThreshParams;
+typedef TestBaseWithParam<ThreshParams> ThreshFixture;
+
+OCL_PERF_TEST_P(ThreshFixture, Threshold,
+            ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES, ThreshType::all()))
+{
+    const ThreshParams params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int srcType = get<1>(params);
+    const int threshType = get<2>(params);
+    const double maxValue = 220.0, threshold = 50;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, srcType);
+
+    UMat src(srcSize, srcType), dst(srcSize, srcType);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::threshold(src, dst, threshold, maxValue, threshType);
+
+    SANITY_CHECK(dst);
+}
+
+///////////// CLAHE ////////////////////////
+
+typedef TestBaseWithParam<Size> CLAHEFixture;
+
+OCL_PERF_TEST_P(CLAHEFixture, CLAHE, OCL_TEST_SIZES)
+{
+    const Size srcSize = GetParam();
+
+    checkDeviceMaxMemoryAllocSize(srcSize, CV_8UC1);
+
+    UMat src(srcSize, CV_8UC1), dst(srcSize, CV_8UC1);
+    const double clipLimit = 40.0;
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    cv::Ptr<cv::CLAHE> clahe = cv::createCLAHE(clipLimit);
+    OCL_TEST_CYCLE() clahe->apply(src, dst);
+
+    SANITY_CHECK(dst);
+}
+
+///////////// SqrBoxFilter ////////////////////////
+
+typedef TestBaseWithParam<Size> SqrBoxFilterFixture;
+
+OCL_PERF_TEST_P(SqrBoxFilterFixture, SqrBoxFilter, OCL_TEST_SIZES)
+{
+    const Size srcSize = GetParam();
+
+    UMat src(srcSize, CV_8UC1), dst(srcSize, CV_32SC1);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::sqrBoxFilter(src, dst, CV_32S, Size(3, 3));
+
+    SANITY_CHECK(dst);
+}
+
+///////////// Canny ////////////////////////
+
+typedef tuple<int, bool> CannyParams;
+typedef TestBaseWithParam<CannyParams> CannyFixture;
+
+OCL_PERF_TEST_P(CannyFixture, Canny, ::testing::Combine(OCL_PERF_ENUM(3, 5), Bool()))
+{
+    const CannyParams params = GetParam();
+    int apertureSize = get<0>(params);
+    bool L2Grad = get<1>(params);
+
+    Mat _img = imread(getDataPath("gpu/stereobm/aloe-L.png"), cv::IMREAD_GRAYSCALE);
+    ASSERT_TRUE(!_img.empty()) << "can't open aloe-L.png";
+
+    UMat img;
+    _img.copyTo(img);
+    UMat edges(img.size(), CV_8UC1);
+
+    declare.in(img, WARMUP_RNG).out(edges);
+
+    OCL_TEST_CYCLE() cv::Canny(img, edges, 50.0, 100.0, apertureSize, L2Grad);
+
+    if (apertureSize == 3)
+        SANITY_CHECK(edges);
+    else
+        SANITY_CHECK_NOTHING();
+}
+
+
+} } // namespace cvtest::ocl
+
+#endif // HAVE_OPENCL
--- a/modules/imgproc/perf/opencl/perf_imgwarp.cpp
+++ b/modules/imgproc/perf/opencl/perf_imgwarp.cpp
@@ -0,0 +1,210 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Fangfang Bai, fangfang@multicorewareinc.com
+//    Jin Ma,       jin@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "perf_precomp.hpp"
+#include "opencv2/ts/ocl_perf.hpp"
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest {
+namespace ocl {
+
+///////////// WarpAffine ////////////////////////
+
+CV_ENUM(InterType, INTER_NEAREST, INTER_LINEAR)
+
+typedef tuple<Size, MatType, InterType> WarpAffineParams;
+typedef TestBaseWithParam<WarpAffineParams> WarpAffineFixture;
+
+OCL_PERF_TEST_P(WarpAffineFixture, WarpAffine,
+            ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES, InterType::all()))
+{
+    static const double coeffs[2][3] =
+    {
+        { cos(CV_PI / 6), -sin(CV_PI / 6), 100.0  },
+        { sin(CV_PI / 6), cos(CV_PI / 6) , -100.0 }
+    };
+    Mat M(2, 3, CV_64F, (void *)coeffs);
+
+    const WarpAffineParams params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params), interpolation = get<2>(params);
+    const double eps = CV_MAT_DEPTH(type) <= CV_32S ? 1 : 1e-4;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src(srcSize, type), dst(srcSize, type);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::warpAffine(src, dst, M, srcSize, interpolation);
+
+    SANITY_CHECK(dst, eps);
+}
+
+///////////// WarpPerspective ////////////////////////
+
+typedef WarpAffineParams WarpPerspectiveParams;
+typedef TestBaseWithParam<WarpPerspectiveParams> WarpPerspectiveFixture;
+
+OCL_PERF_TEST_P(WarpPerspectiveFixture, WarpPerspective,
+            ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES, InterType::all()))
+{
+    static const double coeffs[3][3] =
+    {
+        {cos(CV_PI / 6), -sin(CV_PI / 6), 100.0},
+        {sin(CV_PI / 6), cos(CV_PI / 6), -100.0},
+        {0.0, 0.0, 1.0}
+    };
+    Mat M(3, 3, CV_64F, (void *)coeffs);
+
+    const WarpPerspectiveParams params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params), interpolation = get<2>(params);
+    const double eps = CV_MAT_DEPTH(type) <= CV_32S ? 1 : 1e-4;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src(srcSize, type), dst(srcSize, type);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::warpPerspective(src, dst, M, srcSize, interpolation);
+
+    SANITY_CHECK(dst, eps);
+}
+
+///////////// Resize ////////////////////////
+
+typedef tuple<Size, MatType, InterType, double> ResizeParams;
+typedef TestBaseWithParam<ResizeParams> ResizeFixture;
+
+OCL_PERF_TEST_P(ResizeFixture, Resize,
+            ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES,
+                               InterType::all(), ::testing::Values(0.5, 2.0)))
+{
+    const ResizeParams params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params), interType = get<2>(params);
+    double scale = get<3>(params);
+    const Size dstSize(cvRound(srcSize.width * scale), cvRound(srcSize.height * scale));
+    const double eps = CV_MAT_DEPTH(type) <= CV_32S ? 1 : 1e-4;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+    checkDeviceMaxMemoryAllocSize(dstSize, type);
+
+    UMat src(srcSize, type), dst(dstSize, type);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::resize(src, dst, Size(), scale, scale, interType);
+
+    SANITY_CHECK(dst, eps);
+}
+
+typedef tuple<Size, MatType, double> ResizeAreaParams;
+typedef TestBaseWithParam<ResizeAreaParams> ResizeAreaFixture;
+
+OCL_PERF_TEST_P(ResizeAreaFixture, Resize,
+            ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES, ::testing::Values(0.3, 0.5, 0.6)))
+{
+    const ResizeAreaParams params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params);
+    double scale = get<2>(params);
+    const Size dstSize(cvRound(srcSize.width * scale), cvRound(srcSize.height * scale));
+    const double eps = CV_MAT_DEPTH(type) <= CV_32S ? 1 : 1e-4;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+    checkDeviceMaxMemoryAllocSize(dstSize, type);
+
+    UMat src(srcSize, type), dst(dstSize, type);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::resize(src, dst, Size(), scale, scale, cv::INTER_AREA);
+
+    SANITY_CHECK(dst, eps);
+}
+
+///////////// Remap ////////////////////////
+
+typedef tuple<Size, MatType, InterType> RemapParams;
+typedef TestBaseWithParam<RemapParams> RemapFixture;
+
+OCL_PERF_TEST_P(RemapFixture, Remap,
+            ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES, InterType::all()))
+{
+    const RemapParams params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params), interpolation = get<2>(params), borderMode = BORDER_CONSTANT;
+    const double eps = CV_MAT_DEPTH(type) <= CV_32S ? 1 : 1e-4;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src(srcSize, type), dst(srcSize, type);
+    UMat xmap(srcSize, CV_32FC1), ymap(srcSize, CV_32FC1);
+
+    {
+        Mat _xmap = xmap.getMat(ACCESS_WRITE), _ymap = ymap.getMat(ACCESS_WRITE);
+        for (int i = 0; i < srcSize.height; ++i)
+        {
+            float * const xmap_row = _xmap.ptr<float>(i);
+            float * const ymap_row = _ymap.ptr<float>(i);
+
+            for (int j = 0; j < srcSize.width; ++j)
+            {
+                xmap_row[j] = (j - srcSize.width * 0.5f) * 0.75f + srcSize.width * 0.5f;
+                ymap_row[j] = (i - srcSize.height * 0.5f) * 0.75f + srcSize.height * 0.5f;
+            }
+        }
+    }
+    declare.in(src, WARMUP_RNG).in(xmap, ymap, WARMUP_READ).out(dst);
+
+    OCL_TEST_CYCLE() cv::remap(src, dst, xmap, ymap, interpolation, borderMode);
+
+    SANITY_CHECK(dst, eps);
+}
+
+} } // namespace cvtest::ocl
+
+#endif // HAVE_OPENCL
--- a/modules/imgproc/perf/opencl/perf_matchTemplate.cpp
+++ b/modules/imgproc/perf/opencl/perf_matchTemplate.cpp
@@ -0,0 +1,57 @@
+#include "perf_precomp.hpp"
+#include "opencv2/ts/ocl_perf.hpp"
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest {
+
+namespace ocl {
+
+    CV_ENUM(MethodType, TM_SQDIFF, TM_SQDIFF_NORMED, TM_CCORR, TM_CCORR_NORMED, TM_CCOEFF, TM_CCOEFF_NORMED)
+
+    typedef std::tr1::tuple<Size, Size, MethodType> ImgSize_TmplSize_Method_t;
+    typedef TestBaseWithParam<ImgSize_TmplSize_Method_t> ImgSize_TmplSize_Method;
+
+    OCL_PERF_TEST_P(ImgSize_TmplSize_Method, MatchTemplate,
+            ::testing::Combine(
+                testing::Values(szSmall128, cv::Size(320, 240),
+                                cv::Size(640, 480), cv::Size(800, 600),
+                                cv::Size(1024, 768), cv::Size(1280, 1024)),
+                testing::Values(cv::Size(12, 12), cv::Size(28, 9),
+                                cv::Size(8, 30), cv::Size(16, 16)),
+                MethodType::all()
+                )
+            )
+    {
+        Size imgSz = get<0>(GetParam());
+        Size tmplSz = get<1>(GetParam());
+        int method = get<2>(GetParam());
+
+        UMat img(imgSz, CV_8UC1);
+        UMat tmpl(tmplSz, CV_8UC1);
+        UMat result(imgSz - tmplSz + Size(1,1), CV_32F);
+
+        declare
+            .in(img, WARMUP_RNG)
+            .in(tmpl, WARMUP_RNG)
+            .out(result)
+            .time(30);
+
+        OCL_TEST_CYCLE() matchTemplate(img, tmpl, result, method);
+
+        bool isNormed =
+            method == TM_CCORR_NORMED ||
+            method == TM_SQDIFF_NORMED ||
+            method == TM_CCOEFF_NORMED;
+        double eps = isNormed ? 3e-2
+            : 255 * 255 * tmpl.total() * 1e-4;
+
+        if (isNormed)
+            SANITY_CHECK(result,eps,ERROR_RELATIVE);
+        else
+            SANITY_CHECK(result, eps);
+    }
+}
+}
+
+#endif // HAVE_OPENCL
--- a/modules/imgproc/perf/opencl/perf_moments.cpp
+++ b/modules/imgproc/perf/opencl/perf_moments.cpp
@@ -0,0 +1,78 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Fangfang Bai, fangfang@multicorewareinc.com
+//    Jin Ma,       jin@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other Materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "perf_precomp.hpp"
+#include "opencv2/ts/ocl_perf.hpp"
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest {
+namespace ocl {
+
+///////////// Moments ////////////////////////
+
+typedef tuple<Size, bool> MomentsParams;
+typedef TestBaseWithParam<MomentsParams> MomentsFixture;
+
+OCL_PERF_TEST_P(MomentsFixture, Moments,
+    ::testing::Combine(OCL_TEST_SIZES, ::testing::Bool()))
+{
+    const MomentsParams params = GetParam();
+    const Size srcSize = get<0>(params);
+    const bool binaryImage = get<1>(params);
+
+    cv::Moments m;
+    UMat src(srcSize, CV_8UC1);
+    declare.in(src, WARMUP_RNG);
+
+    OCL_TEST_CYCLE() m = cv::moments(src, binaryImage);
+
+    SANITY_CHECK_MOMENTS(m, 1e-6, ERROR_RELATIVE);
+}
+
+} } // namespace cvtest::ocl
+
+#endif // HAVE_OPENCL
--- a/modules/imgproc/perf/opencl/perf_pyramid.cpp
+++ b/modules/imgproc/perf/opencl/perf_pyramid.cpp
@@ -0,0 +1,134 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Fangfang Bai, fangfang@multicorewareinc.com
+//    Jin Ma,       jin@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "perf_precomp.hpp"
+#include "opencv2/ts/ocl_perf.hpp"
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest {
+namespace ocl {
+
+///////////// PyrDown //////////////////////
+
+typedef Size_MatType PyrDownFixture;
+
+OCL_PERF_TEST_P(PyrDownFixture, PyrDown,
+            ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params);
+    const Size dstSize((srcSize.height + 1) >> 1, (srcSize.width + 1) >> 1);
+    const double eps = CV_MAT_DEPTH(type) <= CV_32S ? 1 : 1e-5;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+    checkDeviceMaxMemoryAllocSize(dstSize, type);
+
+    UMat src(srcSize, type), dst(dstSize, type);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::pyrDown(src, dst);
+
+    SANITY_CHECK(dst, eps);
+}
+
+///////////// PyrUp ////////////////////////
+
+typedef Size_MatType PyrUpFixture;
+
+OCL_PERF_TEST_P(PyrUpFixture, PyrUp,
+            ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params);
+    const Size dstSize(srcSize.height << 1, srcSize.width << 1);
+    const double eps = CV_MAT_DEPTH(type) <= CV_32S ? 1 : 1e-5;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+    checkDeviceMaxMemoryAllocSize(dstSize, type);
+
+    UMat src(srcSize, type), dst(dstSize, type);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::pyrDown(src, dst);
+
+    SANITY_CHECK(dst, eps);
+}
+
+///////////// buildPyramid ////////////////////////
+
+typedef Size_MatType BuildPyramidFixture;
+
+OCL_PERF_TEST_P(BuildPyramidFixture, BuildPyramid,
+                ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params), maxLevel = 5;
+    const double eps = CV_MAT_DEPTH(type) <= CV_32S ? 1 : 1e-5;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    std::vector<UMat> dst(maxLevel);
+    UMat src(srcSize, type);
+    declare.in(src, WARMUP_RNG);
+
+    OCL_TEST_CYCLE() cv::buildPyramid(src, dst, maxLevel);
+
+    UMat dst0 = dst[0], dst1 = dst[1], dst2 = dst[2], dst3 = dst[3], dst4 = dst[4];
+
+    SANITY_CHECK(dst0, eps);
+    SANITY_CHECK(dst1, eps);
+    SANITY_CHECK(dst2, eps);
+    SANITY_CHECK(dst3, eps);
+    SANITY_CHECK(dst4, eps);
+}
+
+} } // namespace cvtest::ocl
+
+#endif // HAVE_OPENCL
--- a/modules/imgproc/perf/perf_filter2d.cpp
+++ b/modules/imgproc/perf/perf_filter2d.cpp
@@ -8,7 +8,7 @@ using std::tr1::make_tuple;
 using std::tr1::get;


-CV_ENUM(BorderMode, BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT_101);
+CV_ENUM(BorderMode, BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT_101)

 typedef TestBaseWithParam< tr1::tuple<Size, int, BorderMode> > TestFilter2d;
 typedef TestBaseWithParam< tr1::tuple<string, int> > Image_KernelSize;
--- a/modules/imgproc/perf/perf_phasecorr.cpp
+++ b/modules/imgproc/perf/perf_phasecorr.cpp
@@ -0,0 +1,22 @@
+#include "perf_precomp.hpp"
+
+using namespace std;
+using namespace cv;
+using namespace perf;
+using namespace testing;
+using std::tr1::make_tuple;
+using std::tr1::get;
+
+typedef TestBaseWithParam<Size > CreateHanningWindowFixture;
+
+PERF_TEST_P( CreateHanningWindowFixture, CreateHanningWindow, Values(szVGA, sz1080p))
+{
+    const Size size = GetParam();
+    Mat dst(size, CV_32FC1);
+
+    declare.in(dst, WARMUP_RNG).out(dst);
+
+    TEST_CYCLE() cv::createHanningWindow(dst, size, CV_32FC1);
+
+    SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE);
+}
--- a/modules/imgproc/src/accum.cpp
+++ b/modules/imgproc/src/accum.cpp
@@ -41,6 +41,7 @@
 //M*/

 #include "precomp.hpp"
+#include "opencl_kernels.hpp"

 namespace cv
 {
@@ -352,15 +353,83 @@ inline int getAccTabIdx(int sdepth, int ddepth)
           sdepth == CV_64F && ddepth == CV_64F ? 6 : -1;
 }

+#ifdef HAVE_OPENCL
+
+enum
+{
+    ACCUMULATE = 0,
+    ACCUMULATE_SQUARE = 1,
+    ACCUMULATE_PRODUCT = 2,
+    ACCUMULATE_WEIGHTED = 3
+};
+
+static bool ocl_accumulate( InputArray _src, InputArray _src2, InputOutputArray _dst, double alpha,
+                            InputArray _mask, int op_type )
+{
+    CV_Assert(op_type == ACCUMULATE || op_type == ACCUMULATE_SQUARE ||
+              op_type == ACCUMULATE_PRODUCT || op_type == ACCUMULATE_WEIGHTED);
+
+    int stype = _src.type(), cn = CV_MAT_CN(stype);
+    int sdepth = CV_MAT_DEPTH(stype), ddepth = _dst.depth();
+
+    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0,
+            haveMask = !_mask.empty();
+
+    if (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F))
+        return false;
+
+    const char * const opMap[4] = { "ACCUMULATE", "ACCUMULATE_SQUARE", "ACCUMULATE_PRODUCT",
+                                   "ACCUMULATE_WEIGHTED" };
+
+    ocl::Kernel k("accumulate", ocl::imgproc::accumulate_oclsrc,
+                  format("-D %s%s -D srcT=%s -D cn=%d -D dstT=%s%s",
+                         opMap[op_type], haveMask ? " -D HAVE_MASK" : "",
+                         ocl::typeToStr(sdepth), cn, ocl::typeToStr(ddepth),
+                         doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
+    if (k.empty())
+        return false;
+
+    UMat src = _src.getUMat(), src2 = _src2.getUMat(), dst = _dst.getUMat(), mask = _mask.getUMat();
+
+    ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
+            src2arg = ocl::KernelArg::ReadOnlyNoSize(src2),
+            dstarg = ocl::KernelArg::ReadWrite(dst),
+            maskarg = ocl::KernelArg::ReadOnlyNoSize(mask);
+
+    int argidx = k.set(0, srcarg);
+    if (op_type == ACCUMULATE_PRODUCT)
+        argidx = k.set(argidx, src2arg);
+    argidx = k.set(argidx, dstarg);
+    if (op_type == ACCUMULATE_WEIGHTED)
+    {
+        if (ddepth == CV_32F)
+            argidx = k.set(argidx, (float)alpha);
+        else
+            argidx = k.set(argidx, alpha);
+    }
+    if (haveMask)
+        argidx = k.set(argidx, maskarg);
+
+    size_t globalsize[2] = { src.cols, src.rows };
+    return k.run(2, globalsize, NULL, false);
+}
+
+#endif
+
 }

 void cv::accumulate( InputArray _src, InputOutputArray _dst, InputArray _mask )
 {
-    Mat src = _src.getMat(), dst = _dst.getMat(), mask = _mask.getMat();
-    int sdepth = src.depth(), ddepth = dst.depth(), cn = src.channels();
+    int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), scn = CV_MAT_CN(stype);
+    int dtype = _dst.type(), ddepth = CV_MAT_DEPTH(dtype), dcn = CV_MAT_CN(dtype);

-    CV_Assert( dst.size == src.size && dst.channels() == cn );
-    CV_Assert( mask.empty() || (mask.size == src.size && mask.type() == CV_8U) );
+    CV_Assert( _src.sameSize(_dst) && dcn == scn );
+    CV_Assert( _mask.empty() || (_src.sameSize(_mask) && _mask.type() == CV_8U) );
+
+    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
+               ocl_accumulate(_src, noArray(), _dst, 0.0, _mask, ACCUMULATE))
+
+    Mat src = _src.getMat(), dst = _dst.getMat(), mask = _mask.getMat();

    int fidx = getAccTabIdx(sdepth, ddepth);
    AccFunc func = fidx >= 0 ? accTab[fidx] : 0;
@@ -372,17 +441,21 @@ void cv::accumulate( InputArray _src, InputOutputArray _dst, InputArray _mask )
    int len = (int)it.size;

    for( size_t i = 0; i < it.nplanes; i++, ++it )
-        func(ptrs[0], ptrs[1], ptrs[2], len, cn);
+        func(ptrs[0], ptrs[1], ptrs[2], len, scn);
 }

-
 void cv::accumulateSquare( InputArray _src, InputOutputArray _dst, InputArray _mask )
 {
-    Mat src = _src.getMat(), dst = _dst.getMat(), mask = _mask.getMat();
-    int sdepth = src.depth(), ddepth = dst.depth(), cn = src.channels();
+    int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), scn = CV_MAT_CN(stype);
+    int dtype = _dst.type(), ddepth = CV_MAT_DEPTH(dtype), dcn = CV_MAT_CN(dtype);

-    CV_Assert( dst.size == src.size && dst.channels() == cn );
-    CV_Assert( mask.empty() || (mask.size == src.size && mask.type() == CV_8U) );
+    CV_Assert( _src.sameSize(_dst) && dcn == scn );
+    CV_Assert( _mask.empty() || (_src.sameSize(_mask) && _mask.type() == CV_8U) );
+
+    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
+               ocl_accumulate(_src, noArray(), _dst, 0.0, _mask, ACCUMULATE_SQUARE))
+
+    Mat src = _src.getMat(), dst = _dst.getMat(), mask = _mask.getMat();

    int fidx = getAccTabIdx(sdepth, ddepth);
    AccFunc func = fidx >= 0 ? accSqrTab[fidx] : 0;
@@ -394,18 +467,23 @@ void cv::accumulateSquare( InputArray _src, InputOutputArray _dst, InputArray _m
    int len = (int)it.size;

    for( size_t i = 0; i < it.nplanes; i++, ++it )
-        func(ptrs[0], ptrs[1], ptrs[2], len, cn);
+        func(ptrs[0], ptrs[1], ptrs[2], len, scn);
 }

 void cv::accumulateProduct( InputArray _src1, InputArray _src2,
                            InputOutputArray _dst, InputArray _mask )
 {
-    Mat src1 = _src1.getMat(), src2 = _src2.getMat(), dst = _dst.getMat(), mask = _mask.getMat();
-    int sdepth = src1.depth(), ddepth = dst.depth(), cn = src1.channels();
+    int stype = _src1.type(), sdepth = CV_MAT_DEPTH(stype), scn = CV_MAT_CN(stype);
+    int dtype = _dst.type(), ddepth = CV_MAT_DEPTH(dtype), dcn = CV_MAT_CN(dtype);

-    CV_Assert( src2.size && src1.size && src2.type() == src1.type() );
-    CV_Assert( dst.size == src1.size && dst.channels() == cn );
-    CV_Assert( mask.empty() || (mask.size == src1.size && mask.type() == CV_8U) );
+    CV_Assert( _src1.sameSize(_src2) && stype == _src2.type() );
+    CV_Assert( _src1.sameSize(_dst) && dcn == scn );
+    CV_Assert( _mask.empty() || (_src1.sameSize(_mask) && _mask.type() == CV_8U) );
+
+    CV_OCL_RUN(_src1.dims() <= 2 && _dst.isUMat(),
+               ocl_accumulate(_src1, _src2, _dst, 0.0, _mask, ACCUMULATE_PRODUCT))
+
+    Mat src1 = _src1.getMat(), src2 = _src2.getMat(), dst = _dst.getMat(), mask = _mask.getMat();

    int fidx = getAccTabIdx(sdepth, ddepth);
    AccProdFunc func = fidx >= 0 ? accProdTab[fidx] : 0;
@@ -417,18 +495,22 @@ void cv::accumulateProduct( InputArray _src1, InputArray _src2,
    int len = (int)it.size;

    for( size_t i = 0; i < it.nplanes; i++, ++it )
-        func(ptrs[0], ptrs[1], ptrs[2], ptrs[3], len, cn);
+        func(ptrs[0], ptrs[1], ptrs[2], ptrs[3], len, scn);
 }

-
 void cv::accumulateWeighted( InputArray _src, InputOutputArray _dst,
                             double alpha, InputArray _mask )
 {
-    Mat src = _src.getMat(), dst = _dst.getMat(), mask = _mask.getMat();
-    int sdepth = src.depth(), ddepth = dst.depth(), cn = src.channels();
+    int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), scn = CV_MAT_CN(stype);
+    int dtype = _dst.type(), ddepth = CV_MAT_DEPTH(dtype), dcn = CV_MAT_CN(dtype);

-    CV_Assert( dst.size == src.size && dst.channels() == cn );
-    CV_Assert( mask.empty() || (mask.size == src.size && mask.type() == CV_8U) );
+    CV_Assert( _src.sameSize(_dst) && dcn == scn );
+    CV_Assert( _mask.empty() || (_src.sameSize(_mask) && _mask.type() == CV_8U) );
+
+    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
+               ocl_accumulate(_src, noArray(), _dst, alpha, _mask, ACCUMULATE_WEIGHTED))
+
+    Mat src = _src.getMat(), dst = _dst.getMat(), mask = _mask.getMat();

    int fidx = getAccTabIdx(sdepth, ddepth);
    AccWFunc func = fidx >= 0 ? accWTab[fidx] : 0;
@@ -440,7 +522,7 @@ void cv::accumulateWeighted( InputArray _src, InputOutputArray _dst,
    int len = (int)it.size;

    for( size_t i = 0; i < it.nplanes; i++, ++it )
-        func(ptrs[0], ptrs[1], ptrs[2], len, cn, alpha);
+        func(ptrs[0], ptrs[1], ptrs[2], len, scn, alpha);
 }


--- a/modules/imgproc/src/blend.cpp
+++ b/modules/imgproc/src/blend.cpp
@@ -91,6 +91,8 @@ private:
    Mat * dst;
 };

+#ifdef HAVE_OPENCL
+
 static bool ocl_blendLinear( InputArray _src1, InputArray _src2, InputArray _weights1, InputArray _weights2, OutputArray _dst )
 {
    int type = _src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
@@ -113,6 +115,8 @@ static bool ocl_blendLinear( InputArray _src1, InputArray _src2, InputArray _wei
    return k.run(2, globalsize, NULL, false);
 }

+#endif
+
 }

 void cv::blendLinear( InputArray _src1, InputArray _src2, InputArray _weights1, InputArray _weights2, OutputArray _dst )
@@ -126,8 +130,8 @@ void cv::blendLinear( InputArray _src1, InputArray _src2, InputArray _weights1,

    _dst.create(size, type);

-    if (ocl::useOpenCL() && _dst.isUMat() && ocl_blendLinear(_src1, _src2, _weights1, _weights2, _dst))
-        return;
+    CV_OCL_RUN(_dst.isUMat(),
+               ocl_blendLinear(_src1, _src2, _weights1, _weights2, _dst))

    Mat src1 = _src1.getMat(), src2 = _src2.getMat(), weights1 = _weights1.getMat(),
            weights2 = _weights2.getMat(), dst = _dst.getMat();
--- a/modules/imgproc/src/canny.cpp
+++ b/modules/imgproc/src/canny.cpp
@@ -40,6 +40,7 @@
 //M*/

 #include "precomp.hpp"
+#include "opencl_kernels.hpp"

 /*
 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
@@ -48,9 +49,11 @@
 #undef USE_IPP_CANNY
 #endif
 */
-#ifdef USE_IPP_CANNY
+
 namespace cv
 {
+
+#ifdef USE_IPP_CANNY
 static bool ippCanny(const Mat& _src, Mat& _dst, float low,  float high)
 {
    int size = 0, size1 = 0;
@@ -83,22 +86,169 @@ static bool ippCanny(const Mat& _src, Mat& _dst, float low,  float high)
        return false;
    return true;
 }
-}
 #endif

+#ifdef HAVE_OPENCL
+
+static bool ocl_Canny(InputArray _src, OutputArray _dst, float low_thresh, float high_thresh,
+                      int aperture_size, bool L2gradient, int cn, const Size & size)
+{
+    UMat dx(size, CV_16SC(cn)), dy(size, CV_16SC(cn));
+
+    if (L2gradient)
+    {
+        low_thresh = std::min(32767.0f, low_thresh);
+        high_thresh = std::min(32767.0f, high_thresh);
+
+        if (low_thresh > 0) low_thresh *= low_thresh;
+        if (high_thresh > 0) high_thresh *= high_thresh;
+    }
+    int low = cvFloor(low_thresh), high = cvFloor(high_thresh);
+    Size esize(size.width + 2, size.height + 2);
+
+    UMat mag;
+    size_t globalsize[2] = { size.width * cn, size.height }, localsize[2] = { 16, 16 };
+
+    if (aperture_size == 3 && !_src.isSubmatrix())
+    {
+        // Sobel calculation
+        ocl::Kernel calcSobelRowPassKernel("calcSobelRowPass", ocl::imgproc::canny_oclsrc);
+        if (calcSobelRowPassKernel.empty())
+            return false;
+
+        UMat src = _src.getUMat(), dxBuf(size, CV_16SC(cn)), dyBuf(size, CV_16SC(cn));
+        calcSobelRowPassKernel.args(ocl::KernelArg::ReadOnly(src),
+                                    ocl::KernelArg::WriteOnlyNoSize(dxBuf),
+                                    ocl::KernelArg::WriteOnlyNoSize(dyBuf));
+
+        if (!calcSobelRowPassKernel.run(2, globalsize, localsize, false))
+            return false;
+
+        // magnitude calculation
+        ocl::Kernel magnitudeKernel("calcMagnitude_buf", ocl::imgproc::canny_oclsrc,
+                                    L2gradient ? " -D L2GRAD" : "");
+        if (magnitudeKernel.empty())
+            return false;
+
+        mag = UMat(esize, CV_32SC(cn), Scalar::all(0));
+        dx.create(size, CV_16SC(cn));
+        dy.create(size, CV_16SC(cn));
+
+        magnitudeKernel.args(ocl::KernelArg::ReadOnlyNoSize(dxBuf), ocl::KernelArg::ReadOnlyNoSize(dyBuf),
+                             ocl::KernelArg::WriteOnlyNoSize(dx), ocl::KernelArg::WriteOnlyNoSize(dy),
+                             ocl::KernelArg::WriteOnlyNoSize(mag, cn), size.height, size.width);
+
+        if (!magnitudeKernel.run(2, globalsize, localsize, false))
+            return false;
+    }
+    else
+    {
+        dx.create(size, CV_16SC(cn));
+        dy.create(size, CV_16SC(cn));
+
+        Sobel(_src, dx, CV_16SC1, 1, 0, aperture_size, 1, 0, BORDER_REPLICATE);
+        Sobel(_src, dy, CV_16SC1, 0, 1, aperture_size, 1, 0, BORDER_REPLICATE);
+
+        // magnitude calculation
+        ocl::Kernel magnitudeKernel("calcMagnitude", ocl::imgproc::canny_oclsrc,
+                                    L2gradient ? " -D L2GRAD" : "");
+        if (magnitudeKernel.empty())
+            return false;
+
+        mag = UMat(esize, CV_32SC(cn), Scalar::all(0));
+        magnitudeKernel.args(ocl::KernelArg::ReadOnlyNoSize(dx), ocl::KernelArg::ReadOnlyNoSize(dy),
+                             ocl::KernelArg::WriteOnlyNoSize(mag, cn), size.height, size.width);
+
+        if (!magnitudeKernel.run(2, globalsize, NULL, false))
+            return false;
+    }
+
+    // map calculation
+    ocl::Kernel calcMapKernel("calcMap", ocl::imgproc::canny_oclsrc);
+    if (calcMapKernel.empty())
+        return false;
+
+    UMat map(esize, CV_32SC(cn));
+    calcMapKernel.args(ocl::KernelArg::ReadOnlyNoSize(dx), ocl::KernelArg::ReadOnlyNoSize(dy),
+                       ocl::KernelArg::ReadOnlyNoSize(mag), ocl::KernelArg::WriteOnlyNoSize(map, cn),
+                       size.height, size.width, low, high);
+
+    if (!calcMapKernel.run(2, globalsize, localsize, false))
+        return false;
+
+    // local hysteresis thresholding
+    ocl::Kernel edgesHysteresisLocalKernel("edgesHysteresisLocal", ocl::imgproc::canny_oclsrc);
+    if (edgesHysteresisLocalKernel.empty())
+        return false;
+
+    UMat stack(1, size.area(), CV_16UC2), counter(1, 1, CV_32SC1, Scalar::all(0));
+    edgesHysteresisLocalKernel.args(ocl::KernelArg::ReadOnlyNoSize(map), ocl::KernelArg::PtrReadWrite(stack),
+                                    ocl::KernelArg::PtrReadWrite(counter), size.height, size.width);
+    if (!edgesHysteresisLocalKernel.run(2, globalsize, localsize, false))
+        return false;
+
+    // global hysteresis thresholding
+    UMat stack2(1, size.area(), CV_16UC2);
+    int count;
+
+    for ( ; ; )
+    {
+        ocl::Kernel edgesHysteresisGlobalKernel("edgesHysteresisGlobal", ocl::imgproc::canny_oclsrc);
+        if (edgesHysteresisGlobalKernel.empty())
+            return false;
+
+        {
+            Mat _counter = counter.getMat(ACCESS_RW);
+            count = _counter.at<int>(0, 0);
+            if (count == 0)
+                break;
+
+            _counter.at<int>(0, 0) = 0;
+        }
+
+        edgesHysteresisGlobalKernel.args(ocl::KernelArg::ReadOnlyNoSize(map), ocl::KernelArg::PtrReadWrite(stack),
+                                         ocl::KernelArg::PtrReadWrite(stack2), ocl::KernelArg::PtrReadWrite(counter),
+                                         size.height, size.width, count);
+
+#define divUp(total, grain) ((total + grain - 1) / grain)
+        size_t localsize2[2] = { 128, 1 }, globalsize2[2] = { std::min(count, 65535) * 128, divUp(count, 65535) };
+#undef divUp
+
+        if (!edgesHysteresisGlobalKernel.run(2, globalsize2, localsize2, false))
+            return false;
+
+        std::swap(stack, stack2);
+    }
+
+    // get edges
+    ocl::Kernel getEdgesKernel("getEdges", ocl::imgproc::canny_oclsrc);
+    if (getEdgesKernel.empty())
+        return false;
+
+    _dst.create(size, CV_8UC(cn));
+    UMat dst = _dst.getUMat();
+
+    getEdgesKernel.args(ocl::KernelArg::ReadOnlyNoSize(map), ocl::KernelArg::WriteOnly(dst));
+    return getEdgesKernel.run(2, globalsize, NULL, false);
+}
+
+#endif
+
+}
+
 void cv::Canny( InputArray _src, OutputArray _dst,
                double low_thresh, double high_thresh,
                int aperture_size, bool L2gradient )
 {
-    Mat src = _src.getMat();
-    CV_Assert( src.depth() == CV_8U );
+    const int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
+    const Size size = _src.size();

-    _dst.create(src.size(), CV_8U);
-    Mat dst = _dst.getMat();
+    CV_Assert( depth == CV_8U );
+    _dst.create(size, CV_8U);

    if (!L2gradient && (aperture_size & CV_CANNY_L2_GRADIENT) == CV_CANNY_L2_GRADIENT)
    {
-        //backward compatibility
+        // backward compatibility
        aperture_size &= ~CV_CANNY_L2_GRADIENT;
        L2gradient = true;
    }
@@ -109,6 +259,11 @@ void cv::Canny( InputArray _src, OutputArray _dst,
    if (low_thresh > high_thresh)
        std::swap(low_thresh, high_thresh);

+    CV_OCL_RUN(_dst.isUMat() && cn == 1,
+               ocl_Canny(_src, _dst, (float)low_thresh, (float)high_thresh, aperture_size, L2gradient, cn, size))
+
+    Mat src = _src.getMat(), dst = _dst.getMat();
+
 #ifdef HAVE_TEGRA_OPTIMIZATION
    if (tegra::canny(src, dst, low_thresh, high_thresh, aperture_size, L2gradient))
        return;
@@ -120,12 +275,11 @@ void cv::Canny( InputArray _src, OutputArray _dst,
        return;
 #endif

-    const int cn = src.channels();
    Mat dx(src.rows, src.cols, CV_16SC(cn));
    Mat dy(src.rows, src.cols, CV_16SC(cn));

-    Sobel(src, dx, CV_16S, 1, 0, aperture_size, 1, 0, cv::BORDER_REPLICATE);
-    Sobel(src, dy, CV_16S, 0, 1, aperture_size, 1, 0, cv::BORDER_REPLICATE);
+    Sobel(src, dx, CV_16S, 1, 0, aperture_size, 1, 0, BORDER_REPLICATE);
+    Sobel(src, dy, CV_16S, 0, 1, aperture_size, 1, 0, BORDER_REPLICATE);

    if (L2gradient)
    {
--- a/modules/imgproc/src/clahe.cpp
+++ b/modules/imgproc/src/clahe.cpp
@@ -45,6 +45,8 @@
 // ----------------------------------------------------------------------
 // CLAHE

+#ifdef HAVE_OPENCL
+
 namespace clahe
 {
    static bool calcLut(cv::InputArray _src, cv::OutputArray _dst,
@@ -88,7 +90,7 @@ namespace clahe
        return true;
    }

-    static bool transform(const cv::InputArray _src, cv::OutputArray _dst, const cv::InputArray _lut,
+    static bool transform(cv::InputArray _src, cv::OutputArray _dst, cv::InputArray _lut,
        const int tilesX, const int tilesY, const cv::Size & tileSize)
    {

@@ -124,6 +126,8 @@ namespace clahe
    }
 }

+#endif
+
 namespace
 {
    class CLAHE_CalcLut_Body : public cv::ParallelLoopBody
@@ -321,9 +325,12 @@ namespace
        int tilesY_;

        cv::Mat srcExt_;
-        cv::UMat usrcExt_;
        cv::Mat lut_;
+
+#ifdef HAVE_OPENCL
+        cv::UMat usrcExt_;
        cv::UMat ulut_;
+#endif
    };

    CLAHE_Impl::CLAHE_Impl(double clipLimit, int tilesX, int tilesY) :
@@ -340,7 +347,9 @@ namespace
    {
        CV_Assert( _src.type() == CV_8UC1 );

+#ifdef HAVE_OPENCL
        bool useOpenCL = cv::ocl::useOpenCL() && _src.isUMat() && _src.dims()<=2;
+#endif

        const int histSize = 256;

@@ -354,6 +363,7 @@ namespace
        }
        else
        {
+#ifdef HAVE_OPENCL
            if(useOpenCL)
            {
                cv::copyMakeBorder(_src, usrcExt_, 0, tilesY_ - (_src.size().height % tilesY_), 0, tilesX_ - (_src.size().width % tilesX_), cv::BORDER_REFLECT_101);
@@ -361,6 +371,7 @@ namespace
                _srcForLut = usrcExt_;
            }
            else
+#endif
            {
                cv::copyMakeBorder(_src, srcExt_, 0, tilesY_ - (_src.size().height % tilesY_), 0, tilesX_ - (_src.size().width % tilesX_), cv::BORDER_REFLECT_101);
                tileSize = cv::Size(srcExt_.size().width / tilesX_, srcExt_.size().height / tilesY_);
@@ -378,9 +389,11 @@ namespace
            clipLimit = std::max(clipLimit, 1);
        }

-        if(useOpenCL && clahe::calcLut(_srcForLut, ulut_, tilesX_, tilesY_, tileSize, clipLimit, lutScale) )
+#ifdef HAVE_OPENCL
+        if (useOpenCL && clahe::calcLut(_srcForLut, ulut_, tilesX_, tilesY_, tileSize, clipLimit, lutScale) )
            if( clahe::transform(_src, _dst, ulut_, tilesX_, tilesY_, tileSize) )
                return;
+#endif

        cv::Mat src = _src.getMat();
        _dst.create( src.size(), src.type() );
@@ -420,8 +433,10 @@ namespace
    {
        srcExt_.release();
        lut_.release();
+#ifdef HAVE_OPENCL
        usrcExt_.release();
        ulut_.release();
+#endif
    }
 }

--- a/modules/imgproc/src/color.cpp
+++ b/modules/imgproc/src/color.cpp
@@ -2688,6 +2688,7 @@ struct mRGBA2RGBA
    }
 };

+#ifdef HAVE_OPENCL

 static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
 {
@@ -2996,7 +2997,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
        }
        else
            k.create(kernelName.c_str(), ocl::imgproc::cvtcolor_oclsrc,
-                     format("-D depth=%d -D hscale=%f -D bidx=%d -D scn=%d -D dcn=3", depth, hrange*(1.f/360.f), bidx, scn));
+                     format("-D depth=%d -D hscale=%ff -D bidx=%d -D scn=%d -D dcn=3", depth, hrange*(1.f/360.f), bidx, scn));
        break;
    }
    case COLOR_HSV2BGR: case COLOR_HSV2RGB: case COLOR_HSV2BGR_FULL: case COLOR_HSV2RGB_FULL:
@@ -3014,7 +3015,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )

        String kernelName = String(is_hsv ? "HSV" : "HLS") + "2RGB";
        k.create(kernelName.c_str(), ocl::imgproc::cvtcolor_oclsrc,
-                 format("-D depth=%d -D dcn=%d -D scn=3 -D bidx=%d -D hrange=%d -D hscale=%f",
+                 format("-D depth=%d -D dcn=%d -D scn=3 -D bidx=%d -D hrange=%d -D hscale=%ff",
                        depth, dcn, bidx, hrange, 6.f/hrange));
        break;
    }
@@ -3041,6 +3042,8 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
    return ok;
 }

+#endif
+
 }//namespace cv

 //////////////////////////////////////////////////////////////////////////////////////////
@@ -3049,12 +3052,11 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )

 void cv::cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
 {
-    bool use_opencl = ocl::useOpenCL() && _dst.kind() == _InputArray::UMAT;
    int stype = _src.type();
    int scn = CV_MAT_CN(stype), depth = CV_MAT_DEPTH(stype), bidx;

-    if( use_opencl && ocl_cvtColor(_src, _dst, code, dcn) )
-        return;
+    CV_OCL_RUN( _src.dims() <= 2 && _dst.isUMat(),
+                ocl_cvtColor(_src, _dst, code, dcn) )

    Mat src = _src.getMat(), dst;
    Size sz = src.size();
@@ -3658,7 +3660,7 @@ void cv::cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
                int ustepIdx = 0;
                int vstepIdx = dstSz.height % 4 == 2 ? 1 : 0;

-                if(uIdx == 1) { std::swap(u ,v), std::swap(ustepIdx, vstepIdx); };
+                if(uIdx == 1) { std::swap(u ,v), std::swap(ustepIdx, vstepIdx); }

                switch(dcn*10 + bIdx)
                {
--- a/modules/imgproc/src/corner.cpp
+++ b/modules/imgproc/src/corner.cpp
@@ -41,14 +41,12 @@
 //M*/

 #include "precomp.hpp"
-#include <stdio.h>
-
+#include "opencl_kernels.hpp"

 namespace cv
 {

-static void
-calcMinEigenVal( const Mat& _cov, Mat& _dst )
+static void calcMinEigenVal( const Mat& _cov, Mat& _dst )
 {
    int i, j;
    Size size = _cov.size();
@@ -104,8 +102,7 @@ calcMinEigenVal( const Mat& _cov, Mat& _dst )
 }


-static void
-calcHarris( const Mat& _cov, Mat& _dst, double k )
+static void calcHarris( const Mat& _cov, Mat& _dst, double k )
 {
    int i, j;
    Size size = _cov.size();
@@ -219,8 +216,7 @@ static void eigen2x2( const float* cov, float* dst, int n )
    }
 }

-static void
-calcEigenValsVecs( const Mat& _cov, Mat& _dst )
+static void calcEigenValsVecs( const Mat& _cov, Mat& _dst )
 {
    Size size = _cov.size();
    if( _cov.isContinuous() && _dst.isContinuous() )
@@ -306,12 +302,110 @@ cornerEigenValsVecs( const Mat& src, Mat& eigenv, int block_size,
        calcEigenValsVecs( cov, eigenv );
 }

+#ifdef HAVE_OPENCL
+
+static bool ocl_cornerMinEigenValVecs(InputArray _src, OutputArray _dst, int block_size,
+                                      int aperture_size, double k, int borderType, int op_type)
+{
+    CV_Assert(op_type == HARRIS || op_type == MINEIGENVAL);
+
+    if ( !(borderType == BORDER_CONSTANT || borderType == BORDER_REPLICATE ||
+           borderType == BORDER_REFLECT || borderType == BORDER_REFLECT_101) )
+        return false;
+
+    int type = _src.type(), depth = CV_MAT_DEPTH(type);
+    double scale = (double)(1 << ((aperture_size > 0 ? aperture_size : 3) - 1)) * block_size;
+    if( aperture_size < 0 )
+        scale *= 2.0;
+    if( depth == CV_8U )
+        scale *= 255.0;
+    scale = 1.0 / scale;
+
+    if ( !(type == CV_8UC1 || type == CV_32FC1) )
+        return false;
+
+    UMat Dx, Dy;
+    if (aperture_size > 0)
+    {
+        Sobel(_src, Dx, CV_32F, 1, 0, aperture_size, scale, 0, borderType);
+        Sobel(_src, Dy, CV_32F, 0, 1, aperture_size, scale, 0, borderType);
+    }
+    else
+    {
+        Scharr(_src, Dx, CV_32F, 1, 0, scale, 0, borderType);
+        Scharr(_src, Dy, CV_32F, 0, 1, scale, 0, borderType);
+    }
+
+    const char * const borderTypes[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT",
+                                         0, "BORDER_REFLECT101" };
+    const char * const cornerType[] = { "CORNER_MINEIGENVAL", "CORNER_HARRIS", 0 };
+
+    ocl::Kernel cornelKernel("corner", ocl::imgproc::corner_oclsrc,
+                             format("-D anX=%d -D anY=%d -D ksX=%d -D ksY=%d -D %s -D %s",
+                                    block_size / 2, block_size / 2, block_size, block_size,
+                                    borderTypes[borderType], cornerType[op_type]));
+    if (cornelKernel.empty())
+        return false;
+
+    _dst.createSameSize(_src, CV_32FC1);
+    UMat dst = _dst.getUMat();
+
+    cornelKernel.args(ocl::KernelArg::ReadOnly(Dx), ocl::KernelArg::ReadOnly(Dy),
+                      ocl::KernelArg::WriteOnly(dst), (float)k);
+
+    size_t blockSizeX = 256, blockSizeY = 1;
+    size_t gSize = blockSizeX - block_size / 2 * 2;
+    size_t globalSizeX = (Dx.cols) % gSize == 0 ? Dx.cols / gSize * blockSizeX : (Dx.cols / gSize + 1) * blockSizeX;
+    size_t rows_per_thread = 2;
+    size_t globalSizeY = ((Dx.rows + rows_per_thread - 1) / rows_per_thread) % blockSizeY == 0 ?
+                         ((Dx.rows + rows_per_thread - 1) / rows_per_thread) :
+                         (((Dx.rows + rows_per_thread - 1) / rows_per_thread) / blockSizeY + 1) * blockSizeY;
+
+    size_t globalsize[2] = { globalSizeX, globalSizeY }, localsize[2] = { blockSizeX, blockSizeY };
+    return cornelKernel.run(2, globalsize, localsize, false);
+}
+
+static bool ocl_preCornerDetect( InputArray _src, OutputArray _dst, int ksize, int borderType, int depth )
+{
+    UMat Dx, Dy, D2x, D2y, Dxy;
+
+    Sobel( _src, Dx, CV_32F, 1, 0, ksize, 1, 0, borderType );
+    Sobel( _src, Dy, CV_32F, 0, 1, ksize, 1, 0, borderType );
+    Sobel( _src, D2x, CV_32F, 2, 0, ksize, 1, 0, borderType );
+    Sobel( _src, D2y, CV_32F, 0, 2, ksize, 1, 0, borderType );
+    Sobel( _src, Dxy, CV_32F, 1, 1, ksize, 1, 0, borderType );
+
+    _dst.create( _src.size(), CV_32FC1 );
+    UMat dst = _dst.getUMat();
+
+    double factor = 1 << (ksize - 1);
+    if( depth == CV_8U )
+        factor *= 255;
+    factor = 1./(factor * factor * factor);
+
+    ocl::Kernel k("preCornerDetect", ocl::imgproc::precornerdetect_oclsrc);
+    if (k.empty())
+        return false;
+
+    k.args(ocl::KernelArg::ReadOnlyNoSize(Dx), ocl::KernelArg::ReadOnlyNoSize(Dy),
+           ocl::KernelArg::ReadOnlyNoSize(D2x), ocl::KernelArg::ReadOnlyNoSize(D2y),
+           ocl::KernelArg::ReadOnlyNoSize(Dxy), ocl::KernelArg::WriteOnly(dst), (float)factor);
+
+    size_t globalsize[2] = { dst.cols, dst.rows };
+    return k.run(2, globalsize, NULL, false);
+}
+
+#endif
+
 }

 void cv::cornerMinEigenVal( InputArray _src, OutputArray _dst, int blockSize, int ksize, int borderType )
 {
+    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
+               ocl_cornerMinEigenValVecs(_src, _dst, blockSize, ksize, 0.0, borderType, MINEIGENVAL))
+
    Mat src = _src.getMat();
-    _dst.create( src.size(), CV_32F );
+    _dst.create( src.size(), CV_32FC1 );
    Mat dst = _dst.getMat();
    cornerEigenValsVecs( src, dst, blockSize, ksize, MINEIGENVAL, 0, borderType );
 }
@@ -319,8 +413,11 @@ void cv::cornerMinEigenVal( InputArray _src, OutputArray _dst, int blockSize, in

 void cv::cornerHarris( InputArray _src, OutputArray _dst, int blockSize, int ksize, double k, int borderType )
 {
+    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
+               ocl_cornerMinEigenValVecs(_src, _dst, blockSize, ksize, k, borderType, HARRIS))
+
    Mat src = _src.getMat();
-    _dst.create( src.size(), CV_32F );
+    _dst.create( src.size(), CV_32FC1 );
    Mat dst = _dst.getMat();
    cornerEigenValsVecs( src, dst, blockSize, ksize, HARRIS, k, borderType );
 }
@@ -341,10 +438,14 @@ void cv::cornerEigenValsAndVecs( InputArray _src, OutputArray _dst, int blockSiz

 void cv::preCornerDetect( InputArray _src, OutputArray _dst, int ksize, int borderType )
 {
-    Mat Dx, Dy, D2x, D2y, Dxy, src = _src.getMat();
+    int type = _src.type();
+    CV_Assert( type == CV_8UC1 || type == CV_32FC1 );

-    CV_Assert( src.type() == CV_8UC1 || src.type() == CV_32FC1 );
-    _dst.create( src.size(), CV_32F );
+    CV_OCL_RUN( _src.dims() <= 2 && _dst.isUMat(),
+                ocl_preCornerDetect(_src, _dst, ksize, borderType, CV_MAT_DEPTH(type)))
+
+    Mat Dx, Dy, D2x, D2y, Dxy, src = _src.getMat();
+    _dst.create( src.size(), CV_32FC1 );
    Mat dst = _dst.getMat();

    Sobel( src, Dx, CV_32F, 1, 0, ksize, 1, 0, borderType );
--- a/modules/imgproc/src/featureselect.cpp
+++ b/modules/imgproc/src/featureselect.cpp
@@ -38,18 +38,183 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
+
 #include "precomp.hpp"
+#include "opencl_kernels.hpp"
+
 #include <cstdio>
 #include <vector>
+#include <iostream>

 namespace cv
 {

-template<typename T> struct greaterThanPtr
+struct greaterThanPtr :
+        public std::binary_function<const float *, const float *, bool>
 {
-    bool operator()(const T* a, const T* b) const { return *a > *b; }
+    bool operator () (const float * a, const float * b) const
+    { return *a > *b; }
 };

+struct Corner
+{
+    float val;
+    short y;
+    short x;
+
+    bool operator < (const Corner & c) const
+    {  return val > c.val; }
+};
+
+#ifdef HAVE_OPENCL
+
+static bool ocl_goodFeaturesToTrack( InputArray _image, OutputArray _corners,
+                                     int maxCorners, double qualityLevel, double minDistance,
+                                     InputArray _mask, int blockSize,
+                                     bool useHarrisDetector, double harrisK )
+{
+    UMat eig, tmp;
+    if( useHarrisDetector )
+        cornerHarris( _image, eig, blockSize, 3, harrisK );
+    else
+        cornerMinEigenVal( _image, eig, blockSize, 3 );
+
+    double maxVal = 0;
+    minMaxLoc( eig, NULL, &maxVal, NULL, NULL, _mask );
+    threshold( eig, eig, maxVal*qualityLevel, 0, THRESH_TOZERO );
+    dilate( eig, tmp, Mat());
+
+    Size imgsize = _image.size();
+    std::vector<Corner> tmpCorners;
+    size_t total, i, j, ncorners = 0, possibleCornersCount =
+            std::max(1024, static_cast<int>(imgsize.area() * 0.1));
+    bool haveMask = !_mask.empty();
+
+    // collect list of pointers to features - put them into temporary image
+    {
+        ocl::Kernel k("findCorners", ocl::imgproc::gftt_oclsrc,
+                      format(haveMask ? "-D HAVE_MASK" : ""));
+        if (k.empty())
+            return false;
+
+        UMat counter(1, 1, CV_32SC1, Scalar::all(0)),
+                corners(1, (int)(possibleCornersCount * sizeof(Corner)), CV_8UC1);
+        ocl::KernelArg eigarg = ocl::KernelArg::ReadOnlyNoSize(eig),
+                tmparg = ocl::KernelArg::ReadOnlyNoSize(tmp),
+                cornersarg = ocl::KernelArg::PtrWriteOnly(corners),
+                counterarg = ocl::KernelArg::PtrReadWrite(counter);
+
+        if (!haveMask)
+            k.args(eigarg, tmparg, cornersarg, counterarg,
+                   imgsize.height - 2, imgsize.width - 2);
+        else
+        {
+            UMat mask = _mask.getUMat();
+            k.args(eigarg, ocl::KernelArg::ReadOnlyNoSize(mask), tmparg,
+                   cornersarg, counterarg, imgsize.height - 2, imgsize.width - 2);
+        }
+
+        size_t globalsize[2] = { imgsize.width - 2, imgsize.height - 2 };
+        if (!k.run(2, globalsize, NULL, false))
+            return false;
+
+        total = counter.getMat(ACCESS_READ).at<int>(0, 0);
+        int totalb = (int)(sizeof(Corner) * total);
+
+        tmpCorners.resize(total);
+        Mat mcorners(1, totalb, CV_8UC1, &tmpCorners[0]);
+        corners.colRange(0, totalb).getMat(ACCESS_READ).copyTo(mcorners);
+    }
+
+    std::sort( tmpCorners.begin(), tmpCorners.end() );
+    std::vector<Point2f> corners;
+    corners.reserve(total);
+
+    if (minDistance >= 1)
+    {
+         // Partition the image into larger grids
+        int w = imgsize.width, h = imgsize.height;
+
+        const int cell_size = cvRound(minDistance);
+        const int grid_width = (w + cell_size - 1) / cell_size;
+        const int grid_height = (h + cell_size - 1) / cell_size;
+
+        std::vector<std::vector<Point2f> > grid(grid_width*grid_height);
+        minDistance *= minDistance;
+
+        for( i = 0; i < total; i++ )
+        {
+            const Corner & c = tmpCorners[i];
+            bool good = true;
+
+            int x_cell = c.x / cell_size;
+            int y_cell = c.y / cell_size;
+
+            int x1 = x_cell - 1;
+            int y1 = y_cell - 1;
+            int x2 = x_cell + 1;
+            int y2 = y_cell + 1;
+
+            // boundary check
+            x1 = std::max(0, x1);
+            y1 = std::max(0, y1);
+            x2 = std::min(grid_width-1, x2);
+            y2 = std::min(grid_height-1, y2);
+
+            for( int yy = y1; yy <= y2; yy++ )
+                for( int xx = x1; xx <= x2; xx++ )
+                {
+                    std::vector<Point2f> &m = grid[yy*grid_width + xx];
+
+                    if( m.size() )
+                    {
+                        for(j = 0; j < m.size(); j++)
+                        {
+                            float dx = c.x - m[j].x;
+                            float dy = c.y - m[j].y;
+
+                            if( dx*dx + dy*dy < minDistance )
+                            {
+                                good = false;
+                                goto break_out;
+                            }
+                        }
+                    }
+                }
+
+            break_out:
+
+            if (good)
+            {
+                grid[y_cell*grid_width + x_cell].push_back(Point2f((float)c.x, (float)c.y));
+
+                corners.push_back(Point2f((float)c.x, (float)c.y));
+                ++ncorners;
+
+                if( maxCorners > 0 && (int)ncorners == maxCorners )
+                    break;
+            }
+        }
+    }
+    else
+    {
+        for( i = 0; i < total; i++ )
+        {
+            const Corner & c = tmpCorners[i];
+
+            corners.push_back(Point2f((float)c.x, (float)c.y));
+            ++ncorners;
+            if( maxCorners > 0 && (int)ncorners == maxCorners )
+                break;
+        }
+    }
+
+    Mat(corners).convertTo(_corners, _corners.fixedType() ? _corners.type() : CV_32F);
+    return true;
+}
+
+#endif
+
 }

 void cv::goodFeaturesToTrack( InputArray _image, OutputArray _corners,
@@ -57,27 +222,29 @@ void cv::goodFeaturesToTrack( InputArray _image, OutputArray _corners,
                              InputArray _mask, int blockSize,
                              bool useHarrisDetector, double harrisK )
 {
-    Mat image = _image.getMat(), mask = _mask.getMat();
-
    CV_Assert( qualityLevel > 0 && minDistance >= 0 && maxCorners >= 0 );
-    CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == image.size()) );
+    CV_Assert( _mask.empty() || (_mask.type() == CV_8UC1 && _mask.sameSize(_image)) );

-    Mat eig, tmp;
+    CV_OCL_RUN(_image.dims() <= 2 && _image.isUMat(),
+               ocl_goodFeaturesToTrack(_image, _corners, maxCorners, qualityLevel, minDistance,
+                                    _mask, blockSize, useHarrisDetector, harrisK))
+
+    Mat image = _image.getMat(), eig, tmp;
    if( useHarrisDetector )
        cornerHarris( image, eig, blockSize, 3, harrisK );
    else
        cornerMinEigenVal( image, eig, blockSize, 3 );

    double maxVal = 0;
-    minMaxLoc( eig, 0, &maxVal, 0, 0, mask );
+    minMaxLoc( eig, 0, &maxVal, 0, 0, _mask );
    threshold( eig, eig, maxVal*qualityLevel, 0, THRESH_TOZERO );
    dilate( eig, tmp, Mat());

    Size imgsize = image.size();
-
    std::vector<const float*> tmpCorners;

    // collect list of pointers to features - put them into temporary image
+    Mat mask = _mask.getMat();
    for( int y = 1; y < imgsize.height - 1; y++ )
    {
        const float* eig_data = (const float*)eig.ptr(y);
@@ -92,11 +259,11 @@ void cv::goodFeaturesToTrack( InputArray _image, OutputArray _corners,
        }
    }

-    std::sort( tmpCorners.begin(), tmpCorners.end(), greaterThanPtr<float>() );
+    std::sort( tmpCorners.begin(), tmpCorners.end(), greaterThanPtr() );
    std::vector<Point2f> corners;
    size_t i, j, total = tmpCorners.size(), ncorners = 0;

-    if(minDistance >= 1)
+    if (minDistance >= 1)
    {
         // Partition the image into larger grids
        int w = image.cols;
@@ -133,7 +300,6 @@ void cv::goodFeaturesToTrack( InputArray _image, OutputArray _corners,
            y2 = std::min(grid_height-1, y2);

            for( int yy = y1; yy <= y2; yy++ )
-            {
                for( int xx = x1; xx <= x2; xx++ )
                {
                    std::vector <Point2f> &m = grid[yy*grid_width + xx];
@@ -153,14 +319,11 @@ void cv::goodFeaturesToTrack( InputArray _image, OutputArray _corners,
                        }
                    }
                }
-            }

            break_out:

-            if(good)
+            if (good)
            {
-                // printf("%d: %d %d -> %d %d, %d, %d -- %d %d %d %d, %d %d, c=%d\n",
-                //    i,x, y, x_cell, y_cell, (int)minDistance, cell_size,x1,y1,x2,y2, grid_width,grid_height,c);
                grid[y_cell*grid_width + x_cell].push_back(Point2f((float)x, (float)y));

                corners.push_back(Point2f((float)x, (float)y));
@@ -187,33 +350,6 @@ void cv::goodFeaturesToTrack( InputArray _image, OutputArray _corners,
    }

    Mat(corners).convertTo(_corners, _corners.fixedType() ? _corners.type() : CV_32F);
-
-    /*
-    for( i = 0; i < total; i++ )
-    {
-        int ofs = (int)((const uchar*)tmpCorners[i] - eig.data);
-        int y = (int)(ofs / eig.step);
-        int x = (int)((ofs - y*eig.step)/sizeof(float));
-
-        if( minDistance > 0 )
-        {
-            for( j = 0; j < ncorners; j++ )
-            {
-                float dx = x - corners[j].x;
-                float dy = y - corners[j].y;
-                if( dx*dx + dy*dy < minDistance )
-                    break;
-            }
-            if( j < ncorners )
-                continue;
-        }
-
-        corners.push_back(Point2f((float)x, (float)y));
-        ++ncorners;
-        if( maxCorners > 0 && (int)ncorners == maxCorners )
-            break;
-    }
-*/
 }

 CV_IMPL void
--- a/modules/imgproc/src/filter.cpp
+++ b/modules/imgproc/src/filter.cpp
@@ -3115,10 +3115,7 @@ template<typename ST, class CastOp, class VecOp> struct Filter2D : public BaseFi
    VecOp vecOp;
 };

-}
-
-namespace cv
-{
+#ifdef HAVE_OPENCL

 #define DIVUP(total, grain) (((total) + (grain) - 1) / (grain))
 #define ROUNDUP(sz, n)      ((sz) + (n) - 1 - (((sz) + (n) - 1) % (n)))
@@ -3381,6 +3378,7 @@ static bool ocl_sepRowFilter2D( UMat &src, UMat &buf, Mat &kernelX, int anchor,
        btype,
        extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION",
        isIsolatedBorder ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED");
+    build_options += ocl::kernelToStr(kernelX, CV_32F);

    Size srcWholeSize; Point srcOffset;
    src.locateROI(srcWholeSize, srcOffset);
@@ -3393,7 +3391,8 @@ static bool ocl_sepRowFilter2D( UMat &src, UMat &buf, Mat &kernelX, int anchor,
        strKernel << "_D" << sdepth;

    ocl::Kernel kernelRow;
-    if (!kernelRow.create(strKernel.str().c_str(), cv::ocl::imgproc::filterSepRow_oclsrc, build_options))
+    if (!kernelRow.create(strKernel.str().c_str(), cv::ocl::imgproc::filterSepRow_oclsrc,
+                          build_options))
        return false;

    int idxArg = 0;
@@ -3412,7 +3411,6 @@ static bool ocl_sepRowFilter2D( UMat &src, UMat &buf, Mat &kernelX, int anchor,
    idxArg = kernelRow.set(idxArg, buf.cols);
    idxArg = kernelRow.set(idxArg, buf.rows);
    idxArg = kernelRow.set(idxArg, radiusY);
-    idxArg = kernelRow.set(idxArg, ocl::KernelArg::PtrReadOnly(kernelX.getUMat(ACCESS_READ)));

    return kernelRow.run(2, globalsize, localsize, sync);
 }
@@ -3482,6 +3480,8 @@ static bool ocl_sepColFilter2D(UMat &buf, UMat &dst, Mat &kernelY, int anchor, b
        }
    }

+    build_options += ocl::kernelToStr(kernelY, CV_32F);
+
    ocl::Kernel kernelCol;
    if (!kernelCol.create("col_filter", cv::ocl::imgproc::filterSepCol_oclsrc, build_options))
        return false;
@@ -3497,7 +3497,6 @@ static bool ocl_sepColFilter2D(UMat &buf, UMat &dst, Mat &kernelY, int anchor, b
    idxArg = kernelCol.set(idxArg, (int)(dst.step / dst.elemSize()));
    idxArg = kernelCol.set(idxArg, dst.cols);
    idxArg = kernelCol.set(idxArg, dst.rows);
-    idxArg = kernelCol.set(idxArg, ocl::KernelArg::PtrReadOnly(kernelY.getUMat(ACCESS_READ)));

    return kernelCol.run(2, globalsize, localsize, sync);
 }
@@ -3510,8 +3509,8 @@ static bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth,
        return false;

    int type = _src.type();
-    if ((CV_8UC1 != type) && (CV_8UC4 == type) &&
-        (CV_32FC1 != type) && (CV_32FC4 == type))
+    if ( !( (CV_8UC1 == type || CV_8UC4 == type || CV_32FC1 == type || CV_32FC4 == type) &&
+            (ddepth == CV_32F || ddepth == CV_8U || ddepth < 0) ) )
        return false;

    int cn = CV_MAT_CN(type);
@@ -3531,8 +3530,6 @@ static bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth,

    if( ddepth < 0 )
        ddepth = sdepth;
-    else if (ddepth != sdepth)
-        return false;

    UMat src = _src.getUMat();
    Size srcWholeSize; Point srcOffset;
@@ -3541,20 +3538,21 @@ static bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth,
         (0 != (src.cols % 4))      ||
         (0 != ((src.step / src.elemSize()) % 4))
       )
-    {
        return false;
-    }

    Size srcSize = src.size();
    Size bufSize(srcSize.width, srcSize.height + kernelY.cols - 1);
    UMat buf; buf.create(bufSize, CV_MAKETYPE(CV_32F, cn));
-    if (!ocl_sepRowFilter2D(src, buf, kernelX, anchor.x, borderType, true))
+    if (!ocl_sepRowFilter2D(src, buf, kernelX, anchor.x, borderType, false))
        return false;

    _dst.create(srcSize, CV_MAKETYPE(ddepth, cn));
    UMat dst = _dst.getUMat();
-    return ocl_sepColFilter2D(buf, dst, kernelY, anchor.y, true);
+    return ocl_sepColFilter2D(buf, dst, kernelY, anchor.y, false);
 }
+
+#endif
+
 }

 cv::Ptr<cv::BaseFilter> cv::getLinearFilter(int srcType, int dstType,
@@ -3672,9 +3670,8 @@ void cv::filter2D( InputArray _src, OutputArray _dst, int ddepth,
                   InputArray _kernel, Point anchor,
                   double delta, int borderType )
 {
-    bool use_opencl = ocl::useOpenCL() && _dst.isUMat();
-    if( use_opencl && ocl_filter2D(_src, _dst, ddepth, _kernel, anchor, delta, borderType))
-        return;
+    CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2,
+               ocl_filter2D(_src, _dst, ddepth, _kernel, anchor, delta, borderType))

    Mat src = _src.getMat(), kernel = _kernel.getMat();

@@ -3722,9 +3719,8 @@ void cv::sepFilter2D( InputArray _src, OutputArray _dst, int ddepth,
                      InputArray _kernelX, InputArray _kernelY, Point anchor,
                      double delta, int borderType )
 {
-    bool use_opencl = ocl::useOpenCL() && _dst.isUMat();
-    if( use_opencl && ocl_sepFilter2D(_src, _dst, ddepth, _kernelX, _kernelY, anchor, delta, borderType))
-        return;
+    CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2,
+               ocl_sepFilter2D(_src, _dst, ddepth, _kernelX, _kernelY, anchor, delta, borderType))

    Mat src = _src.getMat(), kernelX = _kernelX.getMat(), kernelY = _kernelY.getMat();

--- a/modules/imgproc/src/gcgraph.hpp
+++ b/modules/imgproc/src/gcgraph.hpp
@@ -380,6 +380,6 @@ bool GCGraph<TWeight>::inSourceSegment( int i )
 {
    CV_Assert( i>=0 && i<(int)vtcs.size() );
    return vtcs[i].t == 0;
-};
+}

 #endif
--- a/modules/imgproc/src/histogram.cpp
+++ b/modules/imgproc/src/histogram.cpp
@@ -1399,6 +1399,61 @@ static void calcHist( const Mat* images, int nimages, const int* channels,
    }
 }

+#ifdef HAVE_OPENCL
+
+enum
+{
+    BINS = 256
+};
+
+static bool ocl_calcHist1(InputArray _src, OutputArray _hist, int ddepth = CV_32S)
+{
+    int compunits = ocl::Device::getDefault().maxComputeUnits();
+    size_t wgs = ocl::Device::getDefault().maxWorkGroupSize();
+
+    ocl::Kernel k1("calculate_histogram", ocl::imgproc::histogram_oclsrc,
+                  format("-D BINS=%d -D HISTS_COUNT=%d -D WGS=%d", BINS, compunits, wgs));
+    if (k1.empty())
+        return false;
+
+    _hist.create(BINS, 1, ddepth);
+    UMat src = _src.getUMat(), ghist(1, BINS * compunits, CV_32SC1),
+            hist = ddepth == CV_32S ? _hist.getUMat() : UMat(BINS, 1, CV_32SC1);
+
+    k1.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::PtrWriteOnly(ghist),
+            (int)src.total());
+
+    size_t globalsize = compunits * wgs;
+    if (!k1.run(1, &globalsize, &wgs, false))
+        return false;
+
+    ocl::Kernel k2("merge_histogram", ocl::imgproc::histogram_oclsrc,
+                   format("-D BINS=%d -D HISTS_COUNT=%d -D WGS=%d", BINS, compunits, (int)wgs));
+    if (k2.empty())
+        return false;
+
+    k2.args(ocl::KernelArg::PtrReadOnly(ghist), ocl::KernelArg::PtrWriteOnly(hist));
+    if (!k2.run(1, &wgs, &wgs, false))
+        return false;
+
+    if (hist.depth() != ddepth)
+        hist.convertTo(_hist, ddepth);
+    else
+        _hist.getUMatRef() = hist;
+
+    return true;
+}
+
+static bool ocl_calcHist(InputArrayOfArrays images, OutputArray hist)
+{
+    std::vector<UMat> v;
+    images.getUMatVector(v);
+
+    return ocl_calcHist1(v[0], hist, CV_32F);
+}
+
+#endif
+
 }

 void cv::calcHist( const Mat* images, int nimages, const int* channels,
@@ -1417,6 +1472,12 @@ void cv::calcHist( InputArrayOfArrays images, const std::vector<int>& channels,
                   const std::vector<float>& ranges,
                   bool accumulate )
 {
+    CV_OCL_RUN(images.total() == 1 && channels.size() == 1 && images.channels(0) == 1 &&
+               channels[0] == 0 && images.isUMatVector() && mask.empty() && !accumulate &&
+               histSize.size() == 1 && histSize[0] == BINS && ranges.size() == 2 &&
+               ranges[0] == 0 && ranges[1] == 256,
+               ocl_calcHist(images, hist))
+
    int i, dims = (int)histSize.size(), rsz = (int)ranges.size(), csz = (int)channels.size();
    int nimages = (int)images.total();

@@ -1929,6 +1990,7 @@ void cv::calcBackProject( const Mat* images, int nimages, const int* channels,
        CV_Error(CV_StsUnsupportedFormat, "");
 }

+#ifdef HAVE_OPENCL

 namespace cv {

@@ -1962,7 +2024,9 @@ static bool ocl_calcBackProject( InputArrayOfArrays _images, std::vector<int> ch
                                 const std::vector<float>& ranges,
                                 float scale, size_t histdims )
 {
-    const std::vector<UMat> & images = *(const std::vector<UMat> *)_images.getObj();
+    std::vector<UMat> images;
+    _images.getUMatVector(images);
+
    size_t nimages = images.size(), totalcn = images[0].channels();

    CV_Assert(nimages > 0);
@@ -2066,19 +2130,22 @@ static bool ocl_calcBackProject( InputArrayOfArrays _images, std::vector<int> ch

 }

+#endif
+
 void cv::calcBackProject( InputArrayOfArrays images, const std::vector<int>& channels,
                          InputArray hist, OutputArray dst,
                          const std::vector<float>& ranges,
                          double scale )
 {
    Size histSize = hist.size();
+#ifdef HAVE_OPENCL
    bool _1D = histSize.height == 1 || histSize.width == 1;
    size_t histdims = _1D ? 1 : hist.dims();
+#endif

-    if (ocl::useOpenCL() && images.isUMatVector() && dst.isUMat() && hist.type() == CV_32FC1 &&
-            histdims <= 2 && ranges.size() == histdims * 2 && histdims == channels.size() &&
-            ocl_calcBackProject(images, channels, hist, dst, ranges, (float)scale, histdims))
-        return;
+    CV_OCL_RUN(dst.isUMat() && hist.type() == CV_32FC1 &&
+               histdims <= 2 && ranges.size() == histdims * 2 && histdims == channels.size(),
+               ocl_calcBackProject(images, channels, hist, dst, ranges, (float)scale, histdims))

    Mat H0 = hist.getMat(), H;
    int hcn = H0.channels();
@@ -3280,49 +3347,17 @@ CV_IMPL void cvEqualizeHist( const CvArr* srcarr, CvArr* dstarr )
    cv::equalizeHist(cv::cvarrToMat(srcarr), cv::cvarrToMat(dstarr));
 }

+#ifdef HAVE_OPENCL
+
 namespace cv {

-enum
-{
-    BINS = 256
-};
-
-static bool ocl_calcHist(InputArray _src, OutputArray _hist)
-{
-    int compunits = ocl::Device::getDefault().maxComputeUnits();
-    size_t wgs = ocl::Device::getDefault().maxWorkGroupSize();
-
-    ocl::Kernel k1("calculate_histogram", ocl::imgproc::histogram_oclsrc,
-                  format("-D BINS=%d -D HISTS_COUNT=%d -D WGS=%d", BINS, compunits, wgs));
-    if (k1.empty())
-        return false;
-
-    _hist.create(1, BINS, CV_32SC1);
-    UMat src = _src.getUMat(), hist = _hist.getUMat(), ghist(1, BINS * compunits, CV_32SC1);
-
-    k1.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::PtrWriteOnly(ghist),
-            (int)src.total());
-
-    size_t globalsize = compunits * wgs;
-    if (!k1.run(1, &globalsize, &wgs, false))
-        return false;
-
-    ocl::Kernel k2("merge_histogram", ocl::imgproc::histogram_oclsrc,
-                   format("-D BINS=%d -D HISTS_COUNT=%d -D WGS=%d", BINS, compunits, (int)wgs));
-    if (k2.empty())
-        return false;
-
-    k2.args(ocl::KernelArg::PtrReadOnly(ghist), ocl::KernelArg::PtrWriteOnly(hist));
-    return k2.run(1, &wgs, &wgs, false);
-}
-
 static bool ocl_equalizeHist(InputArray _src, OutputArray _dst)
 {
    size_t wgs = std::min<size_t>(ocl::Device::getDefault().maxWorkGroupSize(), BINS);

    // calculation of histogram
    UMat hist;
-    if (!ocl_calcHist(_src, hist))
+    if (!ocl_calcHist1(_src, hist))
        return false;

    UMat lut(1, 256, CV_8UC1);
@@ -3340,6 +3375,8 @@ static bool ocl_equalizeHist(InputArray _src, OutputArray _dst)

 }

+#endif
+
 void cv::equalizeHist( InputArray _src, OutputArray _dst )
 {
    CV_Assert( _src.type() == CV_8UC1 );
@@ -3347,8 +3384,8 @@ void cv::equalizeHist( InputArray _src, OutputArray _dst )
    if (_src.empty())
        return;

-    if (ocl::useOpenCL() && _dst.isUMat() && ocl_equalizeHist(_src, _dst))
-        return;
+    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
+               ocl_equalizeHist(_src, _dst))

    Mat src = _src.getMat();
    _dst.create( src.size(), src.type() );
--- a/modules/imgproc/src/imgwarp.cpp
+++ b/modules/imgproc/src/imgwarp.cpp
@@ -1900,6 +1900,8 @@ private:
 };
 #endif

+#ifdef HAVE_OPENCL
+
 static void ocl_computeResizeAreaTabs(int ssize, int dsize, double scale, int * const map_tab,
                                          float * const alpha_tab, int * const ofs_tab)
 {
@@ -2004,7 +2006,7 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
        {
            int wdepth2 = std::max(CV_32F, depth), wtype2 = CV_MAKE_TYPE(wdepth2, cn);
            buildOption = buildOption + format(" -D convertToT=%s -D WT2V=%s -D convertToWT2V=%s -D INTER_AREA_FAST"
-                                               " -D XSCALE=%d -D YSCALE=%d -D SCALE=%f",
+                                               " -D XSCALE=%d -D YSCALE=%d -D SCALE=%ff",
                                               ocl::convertTypeStr(wdepth2, depth, cn, cvt[0]),
                                               ocl::typeToStr(wtype2), ocl::convertTypeStr(wdepth, wdepth2, cn, cvt[1]),
                                  iscale_x, iscale_y, 1.0f / (iscale_x * iscale_y));
@@ -2069,6 +2071,8 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
    return k.run(2, globalsize, 0, false);
 }

+#endif
+
 }

 //////////////////////////////////////////////////////////////////////////////////////////
@@ -2196,9 +2200,8 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
        inv_scale_y = (double)dsize.height/ssize.height;
    }

-    if( ocl::useOpenCL() && _dst.kind() == _InputArray::UMAT &&
-            ocl_resize(_src, _dst, dsize, inv_scale_x, inv_scale_y, interpolation))
-        return;
+    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
+               ocl_resize(_src, _dst, dsize, inv_scale_x, inv_scale_y, interpolation))

    Mat src = _src.getMat();
    _dst.create(dsize, src.type());
@@ -2565,15 +2568,15 @@ struct RemapVec_8u
    int operator()( const Mat& _src, void* _dst, const short* XY,
                    const ushort* FXY, const void* _wtab, int width ) const
    {
-        int cn = _src.channels();
+        int cn = _src.channels(), x = 0, sstep = (int)_src.step;

-        if( (cn != 1 && cn != 3 && cn != 4) || !checkHardwareSupport(CV_CPU_SSE2) )
+        if( (cn != 1 && cn != 3 && cn != 4) || !checkHardwareSupport(CV_CPU_SSE2) ||
+                sstep > 0x8000 )
            return 0;

        const uchar *S0 = _src.data, *S1 = _src.data + _src.step;
        const short* wtab = cn == 1 ? (const short*)_wtab : &BilinearTab_iC4[0][0][0];
        uchar* D = (uchar*)_dst;
-        int x = 0, sstep = (int)_src.step;
        __m128i delta = _mm_set1_epi32(INTER_REMAP_COEF_SCALE/2);
        __m128i xy2ofs = _mm_set1_epi32(cn + (sstep << 16));
        __m128i z = _mm_setzero_si128();
@@ -3390,6 +3393,8 @@ private:
    const void *ctab;
 };

+#ifdef HAVE_OPENCL
+
 static bool ocl_remap(InputArray _src, OutputArray _dst, InputArray _map1, InputArray _map2,
                      int interpolation, int borderType, const Scalar& borderValue)
 {
@@ -3462,6 +3467,8 @@ static bool ocl_remap(InputArray _src, OutputArray _dst, InputArray _map1, Input
    return k.run(2, globalThreads, NULL, false);
 }

+#endif
+
 }

 void cv::remap( InputArray _src, OutputArray _dst,
@@ -3504,8 +3511,8 @@ void cv::remap( InputArray _src, OutputArray _dst,
    CV_Assert( _map1.size().area() > 0 );
    CV_Assert( _map2.empty() || (_map2.size() == _map1.size()));

-    if (ocl::useOpenCL() && _dst.isUMat() && ocl_remap(_src, _dst, _map1, _map2, interpolation, borderType, borderValue))
-        return;
+    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
+               ocl_remap(_src, _dst, _map1, _map2, interpolation, borderType, borderValue))

    Mat src = _src.getMat(), map1 = _map1.getMat(), map2 = _map2.getMat();
    _dst.create( map1.size(), src.type() );
@@ -3870,6 +3877,8 @@ private:
 };
 #endif

+#ifdef HAVE_OPENCL
+
 enum { OCL_OP_PERSPECTIVE = 1, OCL_OP_AFFINE = 0 };

 static bool ocl_warpTransform(InputArray _src, OutputArray _dst, InputArray _M0,
@@ -3953,6 +3962,8 @@ static bool ocl_warpTransform(InputArray _src, OutputArray _dst, InputArray _M0,
    return k.run(2, globalThreads, NULL, false);
 }

+#endif
+
 }


@@ -3960,10 +3971,9 @@ void cv::warpAffine( InputArray _src, OutputArray _dst,
                     InputArray _M0, Size dsize,
                     int flags, int borderType, const Scalar& borderValue )
 {
-    if (ocl::useOpenCL() && _dst.isUMat() &&
-        ocl_warpTransform(_src, _dst, _M0, dsize, flags, borderType,
-                                            borderValue, OCL_OP_AFFINE))
-        return;
+    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
+               ocl_warpTransform(_src, _dst, _M0, dsize, flags, borderType,
+                                 borderValue, OCL_OP_AFFINE))

    Mat src = _src.getMat(), M0 = _M0.getMat();
    _dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() );
@@ -4206,10 +4216,9 @@ void cv::warpPerspective( InputArray _src, OutputArray _dst, InputArray _M0,
 {
    CV_Assert( _src.total() > 0 );

-    if (ocl::useOpenCL() && _dst.isUMat() &&
-            ocl_warpTransform(_src, _dst, _M0, dsize, flags, borderType, borderValue,
+    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
+               ocl_warpTransform(_src, _dst, _M0, dsize, flags, borderType, borderValue,
                              OCL_OP_PERSPECTIVE))
-        return;

    Mat src = _src.getMat(), M0 = _M0.getMat();
    _dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() );
--- a/modules/imgproc/src/intersection.cpp
+++ b/modules/imgproc/src/intersection.cpp
@@ -231,7 +231,7 @@ int rotatedRectangleIntersection( const RotatedRect& rect1, const RotatedRect& r
                // Found a dupe, remove it
                std::swap(intersection[j], intersection.back());
                intersection.pop_back();
-                i--; // restart check
+                j--; // restart check
            }
        }
    }
--- a/modules/imgproc/src/min_enclosing_triangle.cpp
+++ b/modules/imgproc/src/min_enclosing_triangle.cpp
@@ -287,7 +287,7 @@ static void updateSidesCA(const std::vector<cv::Point2f> &polygon,
                          cv::Point2f &sideAStartVertex, cv::Point2f &sideAEndVertex,
                          cv::Point2f &sideCStartVertex, cv::Point2f &sideCEndVertex);

-};
+}


 ///////////////////////////////////// Main functions /////////////////////////////////////
@@ -1560,4 +1560,4 @@ static bool lessOrEqual(double number1, double number2) {
    return ((number1 < number2) || (almostEqual(number1, number2)));
 }

-};
+}
--- a/modules/imgproc/src/moments.cpp
+++ b/modules/imgproc/src/moments.cpp
@@ -363,6 +363,8 @@ Moments::Moments( double _m00, double _m10, double _m01, double _m20, double _m1
    nu30 = mu30*s3; nu21 = mu21*s3; nu12 = mu12*s3; nu03 = mu03*s3;
 }

+#ifdef HAVE_OPENCL
+
 static bool ocl_moments( InputArray _src, Moments& m)
 {
    const int TILE_SIZE = 32;
@@ -427,6 +429,8 @@ static bool ocl_moments( InputArray _src, Moments& m)
    return true;
 }

+#endif
+
 }


@@ -444,10 +448,12 @@ cv::Moments cv::moments( InputArray _src, bool binary )
    if( size.width <= 0 || size.height <= 0 )
        return m;

+#ifdef HAVE_OPENCL
    if( ocl::useOpenCL() && type == CV_8UC1 && !binary &&
        _src.isUMat() && ocl_moments(_src, m) )
        ;
    else
+#endif
    {
        Mat mat = _src.getMat();
        if( mat.checkVector(2) >= 0 && (depth == CV_32F || depth == CV_32S))
--- a/modules/imgproc/src/morph.cpp
+++ b/modules/imgproc/src/morph.cpp
@@ -1284,7 +1284,7 @@ static bool IPPMorphOp(int op, InputArray _src, OutputArray _dst,
 }
 #endif

-static const char* op2str[] = {"ERODE", "DILATE"};
+#ifdef HAVE_OPENCL

 static bool ocl_morphology_op(InputArray _src, OutputArray _dst, InputArray _kernel, Size &ksize, const Point anchor, int iterations, int op)
 {
@@ -1315,6 +1315,7 @@ static bool ocl_morphology_op(InputArray _src, OutputArray _dst, InputArray _ker
        return false;

    char compile_option[128];
+    static const char* op2str[] = {"ERODE", "DILATE"};
    sprintf(compile_option, "-D RADIUSX=%d -D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D %s %s %s -D GENTYPE=%s -D DEPTH_%d",
        anchor.x, anchor.y, (int)localThreads[0], (int)localThreads[1], op2str[op], doubleSupport?"-D DOUBLE_SUPPORT" :"", rectKernel?"-D RECTKERNEL":"",
        ocl::typeToStr(_src.type()), _src.depth() );
@@ -1396,19 +1397,17 @@ static bool ocl_morphology_op(InputArray _src, OutputArray _dst, InputArray _ker
    return true;
 }

+#endif
+
 static void morphOp( int op, InputArray _src, OutputArray _dst,
                     InputArray _kernel,
                     Point anchor, int iterations,
                     int borderType, const Scalar& borderValue )
 {
+#ifdef HAVE_OPENCL
    int src_type = _src.type(), dst_type = _dst.type(),
        src_cn = CV_MAT_CN(src_type), src_depth = CV_MAT_DEPTH(src_type);
-
-    bool useOpenCL = cv::ocl::useOpenCL() && _src.isUMat() && _src.size() == _dst.size() && src_type == dst_type &&
-        _src.dims()<=2 && (src_cn == 1 || src_cn == 4) && (anchor.x == -1) && (anchor.y == -1) &&
-        (src_depth == CV_8U || src_depth == CV_32F || src_depth == CV_64F ) &&
-        (borderType == cv::BORDER_CONSTANT) && (borderValue == morphologyDefaultBorderValue()) &&
-        (op == MORPH_ERODE || op == MORPH_DILATE);
+#endif

    Mat kernel = _kernel.getMat();
    Size ksize = kernel.data ? kernel.size() : Size(3,3);
@@ -1423,10 +1422,7 @@ static void morphOp( int op, InputArray _src, OutputArray _dst,

    if( iterations == 0 || kernel.rows*kernel.cols == 1 )
    {
-        Mat src = _src.getMat();
-        _dst.create( src.size(), src.type() );
-        Mat dst = _dst.getMat();
-        src.copyTo(dst);
+        _src.copyTo(_dst);
        return;
    }

@@ -1446,8 +1442,12 @@ static void morphOp( int op, InputArray _src, OutputArray _dst,
        iterations = 1;
    }

-    if (useOpenCL && ocl_morphology_op(_src, _dst, kernel, ksize, anchor, iterations, op) )
-        return;
+    CV_OCL_RUN(_dst.isUMat() && _src.size() == _dst.size() && src_type == dst_type &&
+               _src.dims() <= 2 && (src_cn == 1 || src_cn == 4) && anchor.x == -1 && anchor.y == -1 &&
+               (src_depth == CV_8U || src_depth == CV_32F || src_depth == CV_64F ) &&
+               borderType == cv::BORDER_CONSTANT && borderValue == morphologyDefaultBorderValue() &&
+               (op == MORPH_ERODE || op == MORPH_DILATE),
+               ocl_morphology_op(_src, _dst, kernel, ksize, anchor, iterations, op) )

    Mat src = _src.getMat();

--- a/modules/imgproc/src/opencl/accumulate.cl
+++ b/modules/imgproc/src/opencl/accumulate.cl
@@ -0,0 +1,65 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+// Copyright (C) 2014, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+
+#ifdef DOUBLE_SUPPORT
+#ifdef cl_amd_fp64
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#elif defined (cl_khr_fp64)
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#endif
+#endif
+
+__kernel void accumulate(__global const uchar * srcptr, int src_step, int src_offset,
+#ifdef ACCUMULATE_PRODUCT
+                         __global const uchar * src2ptr, int src2_step, int src2_offset,
+#endif
+                         __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols
+#ifdef ACCUMULATE_WEIGHTED
+                         , dstT alpha
+#endif
+#ifdef HAVE_MASK
+                         , __global const uchar * mask, int mask_step, int mask_offset
+#endif
+                         )
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if (x < dst_cols && y < dst_rows)
+    {
+        int src_index = mad24(y, src_step, src_offset + x * cn * (int)sizeof(srcT));
+#ifdef HAVE_MASK
+        int mask_index = mad24(y, mask_step, mask_offset + x);
+        mask += mask_index;
+#endif
+        int dst_index = mad24(y, dst_step, dst_offset + x * cn * (int)sizeof(dstT));
+
+        __global const srcT * src = (__global const srcT *)(srcptr + src_index);
+#ifdef ACCUMULATE_PRODUCT
+        int src2_index = mad24(y, src2_step, src2_offset + x * cn * (int)sizeof(srcT));
+        __global const srcT * src2 = (__global const srcT *)(src2ptr + src2_index);
+#endif
+        __global dstT * dst = (__global dstT *)(dstptr + dst_index);
+
+        #pragma unroll
+        for (int c = 0; c < cn; ++c)
+#ifdef HAVE_MASK
+            if (mask[0])
+#endif
+#ifdef ACCUMULATE
+                dst[c] += src[c];
+#elif defined ACCUMULATE_SQUARE
+                dst[c] += src[c] * src[c];
+#elif defined ACCUMULATE_PRODUCT
+                dst[c] += src[c] * src2[c];
+#elif defined ACCUMULATE_WEIGHTED
+                dst[c] = (1 - alpha) * dst[c] + src[c] * alpha;
+#else
+#error "Unknown accumulation type"
+#endif
+    }
+}
--- a/modules/imgproc/src/opencl/boxFilter.cl
+++ b/modules/imgproc/src/opencl/boxFilter.cl
@@ -39,45 +39,15 @@
 //
 //M*/

-///////////////////////////////////////////////////////////////////////////////////////////////////
-/////////////////////////////////Macro for border type////////////////////////////////////////////
-/////////////////////////////////////////////////////////////////////////////////////////////////
-#ifdef BORDER_REPLICATE
-//BORDER_REPLICATE:     aaaaaa|abcdefgh|hhhhhhh
-#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? (l_edge)   : (i))
-#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? (r_edge)-1 : (addr))
-#define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? (t_edge)   :(i))
-#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? (b_edge)-1 :(addr))
+#ifdef DOUBLE_SUPPORT
+#ifdef cl_amd_fp64
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#elif defined (cl_khr_fp64)
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#endif
 #endif

-#ifdef BORDER_REFLECT
-//BORDER_REFLECT:       fedcba|abcdefgh|hgfedcb
-#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? -(i)-1               : (i))
-#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr))
-#define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? -(i)-1 : (i))
-#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr))
-#endif
-
-#ifdef BORDER_REFLECT_101
-//BORDER_REFLECT_101:   gfedcb|abcdefgh|gfedcba
-#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? -(i)                 : (i))
-#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr))
-#define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? -(i)                 : (i))
-#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr))
-#endif
-
-//blur function does not support BORDER_WRAP
-#ifdef BORDER_WRAP
-//BORDER_WRAP:          cdefgh|abcdefgh|abcdefg
-#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? (i)+(r_edge) : (i))
-#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? (i)-(r_edge) : (addr))
-#define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? (i)+(b_edge) : (i))
-#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? (i)-(b_edge) : (addr))
-#endif
-
-#ifdef EXTRA_EXTRAPOLATION // border > src image size
 #ifdef BORDER_CONSTANT
-// None
 #elif defined BORDER_REPLICATE
 #define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \
    { \
@@ -131,248 +101,110 @@
 #else
 #error No extrapolation method
 #endif
-#else
-#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \
-    { \
-        int _row = y - minY, _col = x - minX; \
-        _row = ADDR_H(_row, 0, maxY - minY); \
-        _row = ADDR_B(_row, maxY - minY, _row); \
-        y = _row + minY; \
-        \
-        _col = ADDR_L(_col, 0, maxX - minX); \
-        _col = ADDR_R(_col, maxX - minX, _col); \
-        x = _col + minX; \
-    }
-#endif

-#if USE_DOUBLE
-#ifdef cl_amd_fp64
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-#define FPTYPE double
-#define CONVERT_TO_FPTYPE CAT(convert_double, VEC_SIZE)
-#else
-#define FPTYPE float
-#define CONVERT_TO_FPTYPE CAT(convert_float, VEC_SIZE)
-#endif
-
-#if DATA_DEPTH == 0
-#define BASE_TYPE uchar
-#elif DATA_DEPTH == 1
-#define BASE_TYPE char
-#elif DATA_DEPTH == 2
-#define BASE_TYPE ushort
-#elif DATA_DEPTH == 3
-#define BASE_TYPE short
-#elif DATA_DEPTH == 4
-#define BASE_TYPE int
-#elif DATA_DEPTH == 5
-#define BASE_TYPE float
-#elif DATA_DEPTH == 6
-#define BASE_TYPE double
-#else
-#error data_depth
-#endif
-
-#define __CAT(x, y) x##y
-#define CAT(x, y) __CAT(x, y)
-
-#define uchar1 uchar
-#define char1 char
-#define ushort1 ushort
-#define short1 short
-#define int1 int
-#define float1 float
-#define double1 double
-
-#define convert_uchar1_sat_rte convert_uchar_sat_rte
-#define convert_char1_sat_rte convert_char_sat_rte
-#define convert_ushort1_sat_rte convert_ushort_sat_rte
-#define convert_short1_sat_rte convert_short_sat_rte
-#define convert_int1_sat_rte convert_int_sat_rte
-#define convert_float1
-#define convert_double1
-
-#if DATA_DEPTH == 5 || DATA_DEPTH == 6
-#define CONVERT_TO_TYPE CAT(CAT(convert_, BASE_TYPE), VEC_SIZE)
-#else
-#define CONVERT_TO_TYPE CAT(CAT(CAT(convert_, BASE_TYPE), VEC_SIZE), _sat_rte)
-#endif
-
-#define VEC_SIZE DATA_CHAN
-
-#define VEC_TYPE CAT(BASE_TYPE, VEC_SIZE)
-#define TYPE VEC_TYPE
-
-#define SCALAR_TYPE CAT(FPTYPE, VEC_SIZE)
-
-#define INTERMEDIATE_TYPE CAT(FPTYPE, VEC_SIZE)
-
-#define TYPE_SIZE   (VEC_SIZE*sizeof(BASE_TYPE))
+#define noconvert

 struct RectCoords
 {
    int x1, y1, x2, y2;
 };

-//#define DEBUG
-#ifdef DEBUG
-#define DEBUG_ONLY(x) x
-#define ASSERT(condition) do { if (!(condition)) { printf("BUG in boxFilter kernel (global=%d,%d): " #condition "\n", get_global_id(0), get_global_id(1)); } } while (0)
-#else
-#define DEBUG_ONLY(x)
-#define ASSERT(condition)
-#endif
-
-
-inline INTERMEDIATE_TYPE readSrcPixel(int2 pos, __global const uchar* srcptr, int srcstep, const struct RectCoords srcCoords
-#ifdef BORDER_CONSTANT
-               , SCALAR_TYPE borderValue
-#endif
-    )
+inline WT readSrcPixel(int2 pos, __global const uchar * srcptr, int src_step, const struct RectCoords srcCoords)
 {
 #ifdef BORDER_ISOLATED
-    if(pos.x >= srcCoords.x1 && pos.y >= srcCoords.y1 && pos.x < srcCoords.x2 && pos.y < srcCoords.y2)
+    if (pos.x >= srcCoords.x1 && pos.y >= srcCoords.y1 && pos.x < srcCoords.x2 && pos.y < srcCoords.y2)
 #else
-    if(pos.x >= 0 && pos.y >= 0 && pos.x < srcCoords.x2 && pos.y < srcCoords.y2)
+    if (pos.x >= 0 && pos.y >= 0 && pos.x < srcCoords.x2 && pos.y < srcCoords.y2)
 #endif
    {
-        __global TYPE* ptr = (__global TYPE*)(srcptr + pos.y * srcstep + pos.x * sizeof(TYPE));
-        return CONVERT_TO_FPTYPE(*ptr);
+        int src_index = mad24(pos.y, src_step, pos.x * (int)sizeof(ST));
+        return convertToWT(*(__global const ST *)(srcptr + src_index));
    }
    else
    {
 #ifdef BORDER_CONSTANT
-        return borderValue;
+        return (WT)(0);
 #else
-        int selected_col = pos.x;
-        int selected_row = pos.y;
+        int selected_col = pos.x, selected_row = pos.y;

        EXTRAPOLATE(selected_col, selected_row,
 #ifdef BORDER_ISOLATED
-                srcCoords.x1, srcCoords.y1,
+            srcCoords.x1, srcCoords.y1,
 #else
-                0, 0,
+            0, 0,
 #endif
-                srcCoords.x2, srcCoords.y2
-         );
+            srcCoords.x2, srcCoords.y2);

-        // debug border mapping
-        //printf("pos=%d,%d --> %d, %d\n", pos.x, pos.y, selected_col, selected_row);
-
-        pos = (int2)(selected_col, selected_row);
-        if(pos.x >= 0 && pos.y >= 0 && pos.x < srcCoords.x2 && pos.y < srcCoords.y2)
-        {
-            __global TYPE* ptr = (__global TYPE*)(srcptr + pos.y * srcstep + pos.x * sizeof(TYPE));
-            return CONVERT_TO_FPTYPE(*ptr);
-        }
-        else
-        {
-            // for debug only
-            DEBUG_ONLY(printf("BUG in boxFilter kernel\n"));
-            return (FPTYPE)(0.0f);
-        }
+        int src_index = mad24(selected_row, src_step, selected_col * (int)sizeof(ST));
+        return convertToWT(*(__global const ST *)(srcptr + src_index));
 #endif
    }
 }

-// INPUT PARAMETER: BLOCK_SIZE_Y (via defines)
-
-__kernel
-__attribute__((reqd_work_group_size(LOCAL_SIZE, 1, 1)))
-void boxFilter(__global const uchar* srcptr, int srcstep, int srcOffsetX, int srcOffsetY, int srcEndX, int srcEndY,
-                __global uchar* dstptr, int dststep, int dstoffset,
-               int rows, int cols,
-#ifdef BORDER_CONSTANT
-               SCALAR_TYPE borderValue,
+__kernel void boxFilter(__global const uchar * srcptr, int src_step, int srcOffsetX, int srcOffsetY, int srcEndX, int srcEndY,
+                        __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols
+#ifdef NORMALIZE
+                        , float alpha
 #endif
-               FPTYPE alpha
-               )
+                       )
 {
-    const struct RectCoords srcCoords = {srcOffsetX, srcOffsetY, srcEndX, srcEndY}; // for non-isolated border: offsetX, offsetY, wholeX, wholeY
+    const struct RectCoords srcCoords = { srcOffsetX, srcOffsetY, srcEndX, srcEndY }; // for non-isolated border: offsetX, offsetY, wholeX, wholeY

-    const int x = get_local_id(0) + (LOCAL_SIZE - (KERNEL_SIZE_X - 1)) * get_group_id(0) - ANCHOR_X;
-    const int y = get_global_id(1) * BLOCK_SIZE_Y;
-
-    const int local_id = get_local_id(0);
-
-    INTERMEDIATE_TYPE data[KERNEL_SIZE_Y];
-    __local INTERMEDIATE_TYPE sumOfCols[LOCAL_SIZE];
+    int x = get_local_id(0) + (LOCAL_SIZE_X - (KERNEL_SIZE_X - 1)) * get_group_id(0) - ANCHOR_X;
+    int y = get_global_id(1) * BLOCK_SIZE_Y;
+    int local_id = get_local_id(0);

+    WT data[KERNEL_SIZE_Y];
+    __local WT sumOfCols[LOCAL_SIZE_X];
    int2 srcPos = (int2)(srcCoords.x1 + x, srcCoords.y1 + y - ANCHOR_Y);
-    for(int sy = 0; sy < KERNEL_SIZE_Y; sy++, srcPos.y++)
-    {
-        data[sy] = readSrcPixel(srcPos, srcptr, srcstep, srcCoords
-#ifdef BORDER_CONSTANT
-                , borderValue
-#endif
-                );
-    }

-    INTERMEDIATE_TYPE tmp_sum = 0;
-    for(int sy = 0; sy < KERNEL_SIZE_Y; sy++)
-    {
-        tmp_sum += (data[sy]);
-    }
+    #pragma unroll
+    for (int sy = 0; sy < KERNEL_SIZE_Y; sy++, srcPos.y++)
+        data[sy] = readSrcPixel(srcPos, srcptr, src_step, srcCoords);
+
+    WT tmp_sum = (WT)(0);
+    #pragma unroll
+    for (int sy = 0; sy < KERNEL_SIZE_Y; sy++)
+        tmp_sum += data[sy];

    sumOfCols[local_id] = tmp_sum;
    barrier(CLK_LOCAL_MEM_FENCE);

-    int2 pos = (int2)(x, y);
-    __global TYPE* dstPtr = (__global TYPE*)(dstptr + pos.y * dststep + dstoffset + pos.x * TYPE_SIZE/*sizeof(TYPE)*/); // Pointer can be out of bounds!
+    int dst_index = mad24(y, dst_step, x * (int)sizeof(DT) + dst_offset);
+    __global DT * dst = (__global DT *)(dstptr + dst_index);

    int sy_index = 0; // current index in data[] array
-    int stepsY = min(rows - pos.y, BLOCK_SIZE_Y);
-    ASSERT(stepsY > 0);
-    for (; ;)
+    for (int i = 0, stepY = min(rows - y, BLOCK_SIZE_Y); i < stepY; ++i)
    {
-        ASSERT(pos.y < rows);
-
-        if(local_id >= ANCHOR_X && local_id < LOCAL_SIZE - (KERNEL_SIZE_X - 1 - ANCHOR_X) &&
-            pos.x >= 0 && pos.x < cols)
+        if (local_id >= ANCHOR_X && local_id < LOCAL_SIZE_X - (KERNEL_SIZE_X - 1 - ANCHOR_X) &&
+            x >= 0 && x < cols)
        {
-            ASSERT(pos.y >= 0 && pos.y < rows);
+            WT total_sum = (WT)(0);

-            INTERMEDIATE_TYPE total_sum = 0;
-#pragma unroll
+            #pragma unroll
            for (int sx = 0; sx < KERNEL_SIZE_X; sx++)
-            {
                total_sum += sumOfCols[local_id + sx - ANCHOR_X];
-            }
-            *dstPtr = CONVERT_TO_TYPE(((INTERMEDIATE_TYPE)alpha) * total_sum);
-        }

-#if BLOCK_SIZE_Y == 1
-        break;
+#ifdef NORMALIZE
+            dst[0] = convertToDT((WT)(alpha) * total_sum);
 #else
-        if (--stepsY == 0)
-            break;
-
+            dst[0] = convertToDT(total_sum);
+#endif
+        }
        barrier(CLK_LOCAL_MEM_FENCE);

-        tmp_sum = sumOfCols[local_id]; // TODO FIX IT: workaround for BUG in OpenCL compiler
-        // only works with scalars: ASSERT(fabs(tmp_sum - sumOfCols[local_id]) < (INTERMEDIATE_TYPE)1e-6);
+        tmp_sum = sumOfCols[local_id];
        tmp_sum -= data[sy_index];

-        data[sy_index] = readSrcPixel(srcPos, srcptr, srcstep, srcCoords
-#ifdef BORDER_CONSTANT
-                , borderValue
-#endif
-                );
+        data[sy_index] = readSrcPixel(srcPos, srcptr, src_step, srcCoords);
        srcPos.y++;

        tmp_sum += data[sy_index];
        sumOfCols[local_id] = tmp_sum;

-        sy_index = (sy_index + 1 < KERNEL_SIZE_Y) ? sy_index + 1 : 0;
-
+        sy_index = sy_index + 1 < KERNEL_SIZE_Y ? sy_index + 1 : 0;
        barrier(CLK_LOCAL_MEM_FENCE);

-        // next line
-        DEBUG_ONLY(pos.y++);
-        dstPtr = (__global TYPE*)((__global char*)dstPtr + dststep); // Pointer can be out of bounds!
-#endif // BLOCK_SIZE_Y == 1
+        dst = (__global DT *)((__global uchar *)dst + dst_step);
    }
 }
--- a/modules/imgproc/src/opencl/canny.cl
+++ b/modules/imgproc/src/opencl/canny.cl
@@ -0,0 +1,514 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Peng Xiao, pengxiao@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+// Smoothing perpendicular to the derivative direction with a triangle filter
+// only support 3x3 Sobel kernel
+// h (-1) =  1, h (0) =  2, h (1) =  1
+// h'(-1) = -1, h'(0) =  0, h'(1) =  1
+// thus sobel 2D operator can be calculated as:
+// h'(x, y) = h'(x)h(y) for x direction
+//
+// src		input 8bit single channel image data
+// dx_buf	output dx buffer
+// dy_buf	output dy buffer
+
+__kernel void __attribute__((reqd_work_group_size(16, 16, 1)))
+calcSobelRowPass
+    (__global const uchar * src, int src_step, int src_offset, int rows, int cols,
+     __global uchar * dx_buf, int dx_buf_step, int dx_buf_offset,
+     __global uchar * dy_buf, int dy_buf_step, int dy_buf_offset)
+{
+    int gidx = get_global_id(0);
+    int gidy = get_global_id(1);
+
+    int lidx = get_local_id(0);
+    int lidy = get_local_id(1);
+
+    __local int smem[16][18];
+
+    smem[lidy][lidx + 1] = src[mad24(src_step, min(gidy, rows - 1), gidx + src_offset)];
+    if (lidx == 0)
+    {
+        smem[lidy][0]  = src[mad24(src_step, min(gidy, rows - 1), max(gidx - 1,  0)        + src_offset)];
+        smem[lidy][17] = src[mad24(src_step, min(gidy, rows - 1), min(gidx + 16, cols - 1) + src_offset)];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (gidy < rows && gidx < cols)
+    {
+        *(__global short *)(dx_buf + mad24(gidy, dx_buf_step, gidx * (int)sizeof(short) + dx_buf_offset)) =
+            smem[lidy][lidx + 2] - smem[lidy][lidx];
+        *(__global short *)(dy_buf + mad24(gidy, dy_buf_step, gidx * (int)sizeof(short) + dy_buf_offset)) =
+            smem[lidy][lidx] + 2 * smem[lidy][lidx + 1] + smem[lidy][lidx + 2];
+    }
+}
+
+inline int calc(short x, short y)
+{
+#ifdef L2GRAD
+    return x * x + y * y;
+#else
+    return (x >= 0 ? x : -x) + (y >= 0 ? y : -y);
+#endif
+}
+
+// calculate the magnitude of the filter pass combining both x and y directions
+// This is the non-buffered version(non-3x3 sobel)
+//
+// dx_buf		dx buffer, calculated from calcSobelRowPass
+// dy_buf		dy buffer, calculated from calcSobelRowPass
+// dx			direvitive in x direction output
+// dy			direvitive in y direction output
+// mag			magnitude direvitive of xy output
+
+__kernel void calcMagnitude(__global const uchar * dxptr, int dx_step, int dx_offset,
+                            __global const uchar * dyptr, int dy_step, int dy_offset,
+                            __global uchar * magptr, int mag_step, int mag_offset, int rows, int cols)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if (y < rows && x < cols)
+    {
+        int dx_index = mad24(dx_step, y, x * (int)sizeof(short) + dx_offset);
+        int dy_index = mad24(dy_step, y, x * (int)sizeof(short) + dy_offset);
+        int mag_index = mad24(mag_step, y + 1, (x + 1) * (int)sizeof(int) + mag_offset);
+
+        __global const short * dx = (__global const short *)(dxptr + dx_index);
+        __global const short * dy = (__global const short *)(dyptr + dy_index);
+        __global int * mag = (__global int *)(magptr + mag_index);
+
+        mag[0] = calc(dx[0], dy[0]);
+    }
+}
+
+// calculate the magnitude of the filter pass combining both x and y directions
+// This is the buffered version(3x3 sobel)
+//
+// dx_buf		dx buffer, calculated from calcSobelRowPass
+// dy_buf		dy buffer, calculated from calcSobelRowPass
+// dx			direvitive in x direction output
+// dy			direvitive in y direction output
+// mag			magnitude direvitive of xy output
+__kernel void __attribute__((reqd_work_group_size(16, 16, 1)))
+calcMagnitude_buf
+    (__global const short * dx_buf, int dx_buf_step, int dx_buf_offset,
+     __global const short * dy_buf, int dy_buf_step, int dy_buf_offset,
+     __global short * dx, int dx_step, int dx_offset,
+     __global short * dy, int dy_step, int dy_offset,
+     __global int * mag, int mag_step, int mag_offset,
+     int rows, int cols)
+{
+    dx_buf_step    /= sizeof(*dx_buf);
+    dx_buf_offset  /= sizeof(*dx_buf);
+    dy_buf_step    /= sizeof(*dy_buf);
+    dy_buf_offset  /= sizeof(*dy_buf);
+    dx_step    /= sizeof(*dx);
+    dx_offset  /= sizeof(*dx);
+    dy_step    /= sizeof(*dy);
+    dy_offset  /= sizeof(*dy);
+    mag_step   /= sizeof(*mag);
+    mag_offset /= sizeof(*mag);
+
+    int gidx = get_global_id(0);
+    int gidy = get_global_id(1);
+
+    int lidx = get_local_id(0);
+    int lidy = get_local_id(1);
+
+    __local short sdx[18][16];
+    __local short sdy[18][16];
+
+    sdx[lidy + 1][lidx] = dx_buf[gidx + min(gidy, rows - 1) * dx_buf_step + dx_buf_offset];
+    sdy[lidy + 1][lidx] = dy_buf[gidx + min(gidy, rows - 1) * dy_buf_step + dy_buf_offset];
+    if (lidy == 0)
+    {
+        sdx[0][lidx]  = dx_buf[gidx + min(max(gidy - 1, 0), rows - 1) * dx_buf_step + dx_buf_offset];
+        sdx[17][lidx] = dx_buf[gidx + min(gidy + 16, rows - 1)        * dx_buf_step + dx_buf_offset];
+
+        sdy[0][lidx]  = dy_buf[gidx + min(max(gidy - 1, 0), rows - 1) * dy_buf_step + dy_buf_offset];
+        sdy[17][lidx] = dy_buf[gidx + min(gidy + 16, rows - 1)        * dy_buf_step + dy_buf_offset];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (gidx < cols && gidy < rows)
+    {
+        short x =  sdx[lidy][lidx] + 2 * sdx[lidy + 1][lidx] + sdx[lidy + 2][lidx];
+        short y = -sdy[lidy][lidx] + sdy[lidy + 2][lidx];
+
+        dx[gidx + gidy * dx_step + dx_offset] = x;
+        dy[gidx + gidy * dy_step + dy_offset] = y;
+
+        mag[(gidx + 1) + (gidy + 1) * mag_step + mag_offset] = calc(x, y);
+    }
+}
+
+
+//////////////////////////////////////////////////////////////////////////////////////////
+// 0.4142135623730950488016887242097 is tan(22.5)
+
+#define CANNY_SHIFT 15
+#define TG22        (int)(0.4142135623730950488016887242097f*(1<<CANNY_SHIFT) + 0.5f)
+
+// First pass of edge detection and non-maximum suppression
+// edgetype is set to for each pixel:
+// 0 - below low thres, not an edge
+// 1 - maybe an edge
+// 2 - is an edge, either magnitude is greater than high thres, or
+//     Given estimates of the image gradients, a search is then carried out
+//     to determine if the gradient magnitude assumes a local maximum in the gradient direction.
+//     if the rounded gradient angle is zero degrees (i.e. the edge is in the north-south direction) the point will be considered to be on the edge if its gradient magnitude is greater than the magnitudes in the west and east directions,
+//     if the rounded gradient angle is 90 degrees (i.e. the edge is in the east-west direction) the point will be considered to be on the edge if its gradient magnitude is greater than the magnitudes in the north and south directions,
+//     if the rounded gradient angle is 135 degrees (i.e. the edge is in the north east-south west direction) the point will be considered to be on the edge if its gradient magnitude is greater than the magnitudes in the north west and south east directions,
+//     if the rounded gradient angle is 45 degrees (i.e. the edge is in the north west-south east direction)the point will be considered to be on the edge if its gradient magnitude is greater than the magnitudes in the north east and south west directions.
+//
+// dx, dy		direvitives of x and y direction
+// mag			magnitudes calculated from calcMagnitude function
+// map			output containing raw edge types
+
+__kernel void __attribute__((reqd_work_group_size(16,16,1)))
+calcMap(
+    __global const uchar * dx, int dx_step, int dx_offset,
+    __global const uchar * dy, int dy_step, int dy_offset,
+    __global const uchar * mag, int mag_step, int mag_offset,
+    __global uchar * map, int map_step, int map_offset,
+    int rows, int cols, int low_thresh, int high_thresh)
+{
+    __local int smem[18][18];
+
+    int gidx = get_global_id(0);
+    int gidy = get_global_id(1);
+
+    int lidx = get_local_id(0);
+    int lidy = get_local_id(1);
+
+    int grp_idx = get_global_id(0) & 0xFFFFF0;
+    int grp_idy = get_global_id(1) & 0xFFFFF0;
+
+    int tid = lidx + lidy * 16;
+    int lx = tid % 18;
+    int ly = tid / 18;
+
+    mag += mag_offset;
+    if (ly < 14)
+        smem[ly][lx] = *(__global const int *)(mag +
+            mad24(mag_step, min(grp_idy + ly, rows - 1), (int)sizeof(int) * (grp_idx + lx)));
+    if (ly < 4 && grp_idy + ly + 14 <= rows && grp_idx + lx <= cols)
+        smem[ly + 14][lx] = *(__global const int *)(mag +
+            mad24(mag_step, min(grp_idy + ly + 14, rows - 1), (int)sizeof(int) * (grp_idx + lx)));
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (gidy < rows && gidx < cols)
+    {
+        // 0 - the pixel can not belong to an edge
+        // 1 - the pixel might belong to an edge
+        // 2 - the pixel does belong to an edge
+        int edge_type = 0;
+        int m = smem[lidy + 1][lidx + 1];
+
+        if (m > low_thresh)
+        {
+            short xs = *(__global const short *)(dx + mad24(gidy, dx_step, dx_offset + (int)sizeof(short) * gidx));
+            short ys = *(__global const short *)(dy + mad24(gidy, dy_step, dy_offset + (int)sizeof(short) * gidx));
+            int x = abs(xs), y = abs(ys);
+
+            int tg22x = x * TG22;
+            y <<= CANNY_SHIFT;
+
+            if (y < tg22x)
+            {
+                if (m > smem[lidy + 1][lidx] && m >= smem[lidy + 1][lidx + 2])
+                    edge_type = 1 + (int)(m > high_thresh);
+            }
+            else
+            {
+                int tg67x = tg22x + (x << (1 + CANNY_SHIFT));
+                if (y > tg67x)
+                {
+                    if (m > smem[lidy][lidx + 1]&& m >= smem[lidy + 2][lidx + 1])
+                        edge_type = 1 + (int)(m > high_thresh);
+                }
+                else
+                {
+                    int s = (xs ^ ys) < 0 ? -1 : 1;
+                    if (m > smem[lidy][lidx + 1 - s]&& m > smem[lidy + 2][lidx + 1 + s])
+                        edge_type = 1 + (int)(m > high_thresh);
+                }
+            }
+        }
+        *(__global int *)(map + mad24(map_step, gidy + 1, (gidx + 1) * (int)sizeof(int) + map_offset)) = edge_type;
+    }
+}
+
+#undef CANNY_SHIFT
+#undef TG22
+
+struct PtrStepSz
+{
+    __global uchar * ptr;
+    int step, rows, cols;
+};
+
+inline int get(struct PtrStepSz data, int y, int x)
+{
+    return *(__global int *)(data.ptr + mad24(data.step, y + 1, (int)sizeof(int) * (x + 1)));
+}
+
+inline void set(struct PtrStepSz data, int y, int x, int value)
+{
+    *(__global int *)(data.ptr + mad24(data.step, y + 1, (int)sizeof(int) * (x + 1))) = value;
+}
+
+// perform Hysteresis for pixel whose edge type is 1
+//
+// If candidate pixel (edge type is 1) has a neighbour pixel (in 3x3 area) with type 2, it is believed to be part of an edge and
+// marked as edge. Each thread will iterate for 16 times to connect local edges.
+// Candidate pixel being identified as edge will then be tested if there is nearby potiential edge points. If there is, counter will
+// be incremented by 1 and the point location is stored. These potiential candidates will be processed further in next kernel.
+//
+// map		raw edge type results calculated from calcMap.
+// stack	the potiential edge points found in this kernel call
+// counter	the number of potiential edge points
+
+__kernel void __attribute__((reqd_work_group_size(16,16,1)))
+edgesHysteresisLocal
+    (__global uchar * map_ptr, int map_step, int map_offset,
+     __global ushort2 * st, __global unsigned int * counter,
+    int rows, int cols)
+{
+    struct PtrStepSz map = { map_ptr + map_offset, map_step, rows + 1, cols + 1 };
+
+    __local int smem[18][18];
+
+    int2 blockIdx = (int2)(get_group_id(0), get_group_id(1));
+    int2 blockDim = (int2)(get_local_size(0), get_local_size(1));
+    int2 threadIdx = (int2)(get_local_id(0), get_local_id(1));
+
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    smem[threadIdx.y + 1][threadIdx.x + 1] = x < map.cols && y < map.rows ? get(map, y, x) : 0;
+    if (threadIdx.y == 0)
+        smem[0][threadIdx.x + 1] = x < map.cols ? get(map, y - 1, x) : 0;
+    if (threadIdx.y == blockDim.y - 1)
+        smem[blockDim.y + 1][threadIdx.x + 1] = y + 1 < map.rows ? get(map, y + 1, x) : 0;
+    if (threadIdx.x == 0)
+        smem[threadIdx.y + 1][0] = y < map.rows ? get(map, y, x - 1) : 0;
+    if (threadIdx.x == blockDim.x - 1)
+        smem[threadIdx.y + 1][blockDim.x + 1] = x + 1 < map.cols && y < map.rows ? get(map, y, x + 1) : 0;
+    if (threadIdx.x == 0 && threadIdx.y == 0)
+        smem[0][0] = y > 0 && x > 0 ? get(map, y - 1, x - 1) : 0;
+    if (threadIdx.x == blockDim.x - 1 && threadIdx.y == 0)
+        smem[0][blockDim.x + 1] = y > 0 && x + 1 < map.cols ? get(map, y - 1, x + 1) : 0;
+    if (threadIdx.x == 0 && threadIdx.y == blockDim.y - 1)
+        smem[blockDim.y + 1][0] = y + 1 < map.rows && x > 0 ? get(map, y + 1, x - 1) : 0;
+    if (threadIdx.x == blockDim.x - 1 && threadIdx.y == blockDim.y - 1)
+        smem[blockDim.y + 1][blockDim.x + 1] = y + 1 < map.rows && x + 1 < map.cols ? get(map, y + 1, x + 1) : 0;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (x >= cols || y >= rows)
+        return;
+
+    int n;
+
+    #pragma unroll
+    for (int k = 0; k < 16; ++k)
+    {
+        n = 0;
+
+        if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1)
+        {
+            n += smem[threadIdx.y    ][threadIdx.x    ] == 2;
+            n += smem[threadIdx.y    ][threadIdx.x + 1] == 2;
+            n += smem[threadIdx.y    ][threadIdx.x + 2] == 2;
+
+            n += smem[threadIdx.y + 1][threadIdx.x    ] == 2;
+            n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2;
+
+            n += smem[threadIdx.y + 2][threadIdx.x    ] == 2;
+            n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2;
+            n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2;
+        }
+
+        if (n > 0)
+            smem[threadIdx.y + 1][threadIdx.x + 1] = 2;
+    }
+
+    const int e = smem[threadIdx.y + 1][threadIdx.x + 1];
+    set(map, y, x, e);
+    n = 0;
+
+    if (e == 2)
+    {
+        n += smem[threadIdx.y    ][threadIdx.x    ] == 1;
+        n += smem[threadIdx.y    ][threadIdx.x + 1] == 1;
+        n += smem[threadIdx.y    ][threadIdx.x + 2] == 1;
+
+        n += smem[threadIdx.y + 1][threadIdx.x    ] == 1;
+        n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1;
+
+        n += smem[threadIdx.y + 2][threadIdx.x    ] == 1;
+        n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1;
+        n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1;
+    }
+
+    if (n > 0)
+    {
+        const int ind = atomic_inc(counter);
+        st[ind] = (ushort2)(x + 1, y + 1);
+    }
+}
+
+__constant int c_dx[8] = {-1,  0,  1, -1, 1, -1, 0, 1};
+__constant int c_dy[8] = {-1, -1, -1,  0, 0,  1, 1, 1};
+
+
+#define stack_size 512
+#define map_index mad24(map_step, pos.y, pos.x * (int)sizeof(int))
+
+__kernel void __attribute__((reqd_work_group_size(128, 1, 1)))
+edgesHysteresisGlobal(__global uchar * map, int map_step, int map_offset,
+    __global ushort2 * st1, __global ushort2 * st2, __global int * counter,
+    int rows, int cols, int count)
+{
+    map += map_offset;
+
+    int lidx = get_local_id(0);
+
+    int grp_idx = get_group_id(0);
+    int grp_idy = get_group_id(1);
+
+    __local unsigned int s_counter, s_ind;
+    __local ushort2 s_st[stack_size];
+
+    if (lidx == 0)
+        s_counter = 0;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    int ind = mad24(grp_idy, (int)get_local_size(0), grp_idx);
+
+    if (ind < count)
+    {
+        ushort2 pos = st1[ind];
+        if (lidx < 8)
+        {
+            pos.x += c_dx[lidx];
+            pos.y += c_dy[lidx];
+            if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows && *(__global int *)(map + map_index) == 1)
+            {
+                *(__global int *)(map + map_index) = 2;
+                ind = atomic_inc(&s_counter);
+                s_st[ind] = pos;
+            }
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        while (s_counter > 0 && s_counter <= stack_size - get_local_size(0))
+        {
+            const int subTaskIdx = lidx >> 3;
+            const int portion = min(s_counter, (uint)(get_local_size(0)>> 3));
+
+            if (subTaskIdx < portion)
+                pos = s_st[s_counter - 1 - subTaskIdx];
+            barrier(CLK_LOCAL_MEM_FENCE);
+
+            if (lidx == 0)
+                s_counter -= portion;
+            barrier(CLK_LOCAL_MEM_FENCE);
+
+            if (subTaskIdx < portion)
+            {
+                pos.x += c_dx[lidx & 7];
+                pos.y += c_dy[lidx & 7];
+                if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows && *(__global int *)(map + map_index) == 1)
+                {
+                    *(__global int *)(map + map_index) = 2;
+                    ind = atomic_inc(&s_counter);
+                    s_st[ind] = pos;
+                }
+            }
+            barrier(CLK_LOCAL_MEM_FENCE);
+        }
+
+        if (s_counter > 0)
+        {
+            if (lidx == 0)
+            {
+                ind = atomic_add(counter, s_counter);
+                s_ind = ind - s_counter;
+            }
+            barrier(CLK_LOCAL_MEM_FENCE);
+
+            ind = s_ind;
+            for (int i = lidx; i < (int)s_counter; i += get_local_size(0))
+                st2[ind + i] = s_st[i];
+        }
+    }
+}
+
+#undef map_index
+#undef stack_size
+
+// Get the edge result. egde type of value 2 will be marked as an edge point and set to 255. Otherwise 0.
+// map		edge type mappings
+// dst		edge output
+
+__kernel void getEdges(__global const uchar * mapptr, int map_step, int map_offset,
+                       __global uchar * dst, int dst_step, int dst_offset, int rows, int cols)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if (y < rows && x < cols)
+    {
+        int map_index = mad24(map_step, y + 1, (x + 1) * (int)sizeof(int) + map_offset);
+        int dst_index = mad24(dst_step, y, x + dst_offset);
+
+        __global const int * map = (__global const int *)(mapptr + map_index);
+
+        dst[dst_index] = (uchar)(-(map[0] >> 1));
+    }
+}
--- a/modules/imgproc/src/opencl/corner.cl
+++ b/modules/imgproc/src/opencl/corner.cl
@@ -0,0 +1,227 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Shengen Yan,yanshengen@gmail.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////Macro for border type////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#ifdef BORDER_CONSTANT
+#elif defined BORDER_REPLICATE
+#define EXTRAPOLATE(x, maxV) \
+    { \
+        x = max(min(x, maxV - 1), 0); \
+    }
+#elif defined BORDER_WRAP
+#define EXTRAPOLATE(x, maxV) \
+    { \
+        if (x < 0) \
+            x -= ((x - maxV + 1) / maxV) * maxV; \
+        if (x >= maxV) \
+            x %= maxV; \
+    }
+#elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT101)
+#define EXTRAPOLATE_(x, maxV, delta) \
+    { \
+        if (maxV == 1) \
+            x = 0; \
+        else \
+            do \
+            { \
+                if ( x < 0 ) \
+                    x = -x - 1 + delta; \
+                else \
+                    x = maxV - 1 - (x - maxV) - delta; \
+            } \
+            while (x >= maxV || x < 0); \
+    }
+#ifdef BORDER_REFLECT
+#define EXTRAPOLATE(x, maxV) EXTRAPOLATE_(x, maxV, 0)
+#else
+#define EXTRAPOLATE(x, maxV) EXTRAPOLATE_(x, maxV, 1)
+#endif
+#else
+#error No extrapolation method
+#endif
+
+#define THREADS 256
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////calcHarris////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+__kernel void corner(__global const float * Dx, int dx_step, int dx_offset, int dx_whole_rows, int dx_whole_cols,
+                     __global const float * Dy, int dy_step, int dy_offset, int dy_whole_rows, int dy_whole_cols,
+                     __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, float k)
+{
+    int col = get_local_id(0);
+    int gX = get_group_id(0);
+    int gY = get_group_id(1);
+    int gly = get_global_id(1);
+
+    int dx_x_off = (dx_offset % dx_step) >> 2;
+    int dx_y_off = dx_offset / dx_step;
+    int dy_x_off = (dy_offset % dy_step) >> 2;
+    int dy_y_off = dy_offset / dy_step;
+    int dst_x_off = (dst_offset % dst_step) >> 2;
+    int dst_y_off = dst_offset / dst_step;
+
+    int dx_startX = gX * (THREADS-ksX+1) - anX + dx_x_off;
+    int dx_startY = (gY << 1) - anY + dx_y_off;
+    int dy_startX = gX * (THREADS-ksX+1) - anX + dy_x_off;
+    int dy_startY = (gY << 1) - anY + dy_y_off;
+    int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;
+    int dst_startY = (gY << 1) + dst_y_off;
+
+    float dx_data[ksY+1],dy_data[ksY+1], data[3][ksY+1];
+    __local float temp[6][THREADS];
+
+#ifdef BORDER_CONSTANT
+    for (int i=0; i < ksY+1; i++)
+    {
+        bool dx_con = dx_startX+col >= 0 && dx_startX+col < dx_whole_cols && dx_startY+i >= 0 && dx_startY+i < dx_whole_rows;
+        int indexDx = (dx_startY+i)*(dx_step>>2)+(dx_startX+col);
+        float dx_s = dx_con ? Dx[indexDx] : 0.0f;
+        dx_data[i] = dx_s;
+
+        bool dy_con = dy_startX+col >= 0 && dy_startX+col < dy_whole_cols && dy_startY+i >= 0 && dy_startY+i < dy_whole_rows;
+        int indexDy = (dy_startY+i)*(dy_step>>2)+(dy_startX+col);
+        float dy_s = dy_con ? Dy[indexDy] : 0.0f;
+        dy_data[i] = dy_s;
+
+        data[0][i] = dx_data[i] * dx_data[i];
+        data[1][i] = dx_data[i] * dy_data[i];
+        data[2][i] = dy_data[i] * dy_data[i];
+    }
+#else
+    int clamped_col = min(2*dst_cols, col);
+    for (int i=0; i < ksY+1; i++)
+    {
+        int dx_selected_row = dx_startY+i, dx_selected_col = dx_startX+clamped_col;
+        EXTRAPOLATE(dx_selected_row, dx_whole_rows)
+        EXTRAPOLATE(dx_selected_col, dx_whole_cols)
+        dx_data[i] = Dx[dx_selected_row * (dx_step>>2) + dx_selected_col];
+
+        int dy_selected_row = dy_startY+i, dy_selected_col = dy_startX+clamped_col;
+        EXTRAPOLATE(dy_selected_row, dy_whole_rows)
+        EXTRAPOLATE(dy_selected_col, dy_whole_cols)
+        dy_data[i] = Dy[dy_selected_row * (dy_step>>2) + dy_selected_col];
+
+        data[0][i] = dx_data[i] * dx_data[i];
+        data[1][i] = dx_data[i] * dy_data[i];
+        data[2][i] = dy_data[i] * dy_data[i];
+    }
+#endif
+    float sum0 = 0.0f, sum1 = 0.0f, sum2 = 0.0f;
+    for (int i=1; i < ksY; i++)
+    {
+        sum0 += data[0][i];
+        sum1 += data[1][i];
+        sum2 += data[2][i];
+    }
+
+    float sum01 = sum0 + data[0][0];
+    float sum02 = sum0 + data[0][ksY];
+    temp[0][col] = sum01;
+    temp[1][col] = sum02;
+    float sum11 = sum1 + data[1][0];
+    float sum12 = sum1 + data[1][ksY];
+    temp[2][col] = sum11;
+    temp[3][col] = sum12;
+    float sum21 = sum2 + data[2][0];
+    float sum22 = sum2 + data[2][ksY];
+    temp[4][col] = sum21;
+    temp[5][col] = sum22;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (col < (THREADS - (ksX - 1)))
+    {
+        col += anX;
+        int posX = dst_startX - dst_x_off + col - anX;
+        int posY = (gly << 1);
+        int till = (ksX + 1)%2;
+        float tmp_sum[6] = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
+        for (int k=0; k<6; k++)
+        {
+            float temp_sum = 0;
+            for (int i=-anX; i<=anX - till; i++)
+                temp_sum += temp[k][col+i];
+            tmp_sum[k] = temp_sum;
+        }
+
+#ifdef CORNER_HARRIS
+        if (posX < dst_cols && (posY) < dst_rows)
+        {
+            int dst_index = mad24(dst_step, dst_startY, (int)sizeof(float) * (dst_startX + col - anX));
+            *(__global float *)(dst + dst_index) =
+                    tmp_sum[0] * tmp_sum[4] - tmp_sum[2] * tmp_sum[2] - k * (tmp_sum[0] + tmp_sum[4]) * (tmp_sum[0] + tmp_sum[4]);
+        }
+        if (posX < dst_cols && (posY + 1) < dst_rows)
+        {
+            int dst_index = mad24(dst_step, dst_startY + 1, (int)sizeof(float) * (dst_startX + col - anX));
+            *(__global float *)(dst + dst_index) =
+                    tmp_sum[1] * tmp_sum[5] - tmp_sum[3] * tmp_sum[3] - k * (tmp_sum[1] + tmp_sum[5]) * (tmp_sum[1] + tmp_sum[5]);
+        }
+#elif defined CORNER_MINEIGENVAL
+        if (posX < dst_cols && (posY) < dst_rows)
+        {
+            int dst_index = mad24(dst_step, dst_startY, (int)sizeof(float) * (dst_startX + col - anX));
+            float a = tmp_sum[0] * 0.5f;
+            float b = tmp_sum[2];
+            float c = tmp_sum[4] * 0.5f;
+            *(__global float *)(dst + dst_index) = (float)((a+c) - sqrt((a-c)*(a-c) + b*b));
+        }
+        if (posX < dst_cols && (posY + 1) < dst_rows)
+        {
+            int dst_index = mad24(dst_step, dst_startY + 1, (int)sizeof(float) * (dst_startX + col - anX));
+            float a = tmp_sum[1] * 0.5f;
+            float b = tmp_sum[3];
+            float c = tmp_sum[5] * 0.5f;
+            *(__global float *)(dst + dst_index) = (float)((a+c) - sqrt((a-c)*(a-c) + b*b));
+        }
+#else
+#error "No such corners type"
+#endif
+    }
+}
--- a/modules/imgproc/src/opencl/cvtcolor.cl
+++ b/modules/imgproc/src/opencl/cvtcolor.cl
@@ -472,7 +472,7 @@ __kernel void RGB(__global const uchar* srcptr, int src_step, int src_offset,
        dst[0] = src[2];
        dst[1] = src[1];
        dst[2] = src[0];
-#elif defined ORDER
+#else
        dst[0] = src[0];
        dst[1] = src[1];
        dst[2] = src[2];
@@ -728,7 +728,7 @@ __kernel void RGB2HSV(__global const uchar* srcptr, int src_step, int src_offset

        diff = v - vmin;
        s = diff/(float)(fabs(v) + FLT_EPSILON);
-        diff = (float)(60./(diff + FLT_EPSILON));
+        diff = (float)(60.f/(diff + FLT_EPSILON));
        if( v == r )
            h = (g - b)*diff;
        else if( v == g )
--- a/modules/imgproc/src/opencl/filterSepCol.cl
+++ b/modules/imgproc/src/opencl/filterSepCol.cl
@@ -60,6 +60,8 @@ Niko
 The info above maybe obsolete.
 ***********************************************************************************/

+#define DIG(a) a,
+__constant float mat_kernel[] = { COEFF };

 __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void col_filter
                        (__global const GENTYPE_SRC * restrict src,
@@ -70,8 +72,7 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void col_filter
                         const int dst_offset_in_pixel,
                         const int dst_step_in_pixel,
                         const int dst_cols,
-                         const int dst_rows,
-                         __constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSY+1)))))
+                         const int dst_rows)
 {
    int x = get_global_id(0);
    int y = get_global_id(1);
--- a/modules/imgproc/src/opencl/filterSepRow.cl
+++ b/modules/imgproc/src/opencl/filterSepRow.cl
@@ -144,6 +144,9 @@ Niko
 The info above maybe obsolete.
 ***********************************************************************************/

+#define DIG(a) a,
+__constant float mat_kernel[] = { COEFF };
+
 __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C1_D0
    (__global uchar * restrict src,
     int src_step_in_pixel,
@@ -153,8 +156,7 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
     __global float * dst,
     int dst_step_in_pixel,
     int dst_cols, int dst_rows,
-     int radiusy,
-     __constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSX+1)))))
+     int radiusy)
 {
    int x = get_global_id(0)<<2;
    int y = get_global_id(1);
@@ -297,8 +299,7 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
     __global float4 * dst,
     int dst_step_in_pixel,
     int dst_cols, int dst_rows,
-     int radiusy,
-     __constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSX+1)))))
+     int radiusy)
 {
    int x = get_global_id(0);
    int y = get_global_id(1);
@@ -391,8 +392,7 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
     __global float * dst,
     int dst_step_in_pixel,
     int dst_cols, int dst_rows,
-     int radiusy,
-     __constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSX+1)))))
+     int radiusy)
 {
    int x = get_global_id(0);
    int y = get_global_id(1);
@@ -484,8 +484,7 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
     __global float4 * dst,
     int dst_step_in_pixel,
     int dst_cols, int dst_rows,
-     int radiusy,
-     __constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSX+1)))))
+     int radiusy)
 {
    int x = get_global_id(0);
    int y = get_global_id(1);
--- a/modules/imgproc/src/opencl/gftt.cl
+++ b/modules/imgproc/src/opencl/gftt.cl
@@ -0,0 +1,81 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Zhang Ying, zhangying913@gmail.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+__kernel void findCorners(__global const uchar * eigptr, int eig_step, int eig_offset,
+#ifdef HAVE_MASK
+                          __global const uchar * mask, int mask_step, int mask_offset,
+#endif
+                          __global const uchar * tmpptr, int tmp_step, int tmp_offset,
+                          __global uchar * cornersptr, __global int * counter,
+                          int rows, int cols)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if (x < cols && y < rows)
+    {
+        ++x, ++y;
+
+        int eig_index = mad24(y, eig_step, eig_offset + x * (int)sizeof(float));
+        int tmp_index = mad24(y, tmp_step, tmp_offset + x * (int)sizeof(float));
+#ifdef HAVE_MASK
+        int mask_index = mad24(y, mask_step, mask_offset + x);
+        mask += mask_index;
+#endif
+
+        float val = *(__global const float *)(eigptr + eig_index);
+        float tmp = *(__global const float *)(tmpptr + tmp_index);
+
+        if (val != 0 && val == tmp
+#ifdef HAVE_MASK
+            && mask[0] != 0
+#endif
+            )
+        {
+            __global float2 * corners = (__global float2 *)(cornersptr + (int)sizeof(float2) * atomic_inc(counter));
+            corners[0] = (float2)(val, as_float( (x<<16) | y ));
+        }
+    }
+}
--- a/modules/imgproc/src/opencl/match_template.cl
+++ b/modules/imgproc/src/opencl/match_template.cl
@@ -0,0 +1,397 @@
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+
+#define DATA_SIZE ((int)sizeof(type))
+#define ELEM_TYPE elem_type
+#define ELEM_SIZE ((int)sizeof(elem_type))
+#define CN cn
+
+#define SQSUMS_PTR(ox, oy) mad24(gidy + oy, img_sqsums_step, gidx*CN + img_sqsums_offset + ox*CN)
+#define SUMS_PTR(ox, oy) mad24(gidy + oy, img_sums_step,   gidx*CN + img_sums_offset + ox*CN)
+
+inline float normAcc(float num, float denum)
+{
+    if(fabs(num) < denum)
+    {
+        return num / denum;
+    }
+    if(fabs(num) < denum * 1.125f)
+    {
+        return num > 0 ? 1 : -1;
+    }
+    return 0;
+}
+
+inline float normAcc_SQDIFF(float num, float denum)
+{
+    if(fabs(num) < denum)
+    {
+        return num / denum;
+    }
+    if(fabs(num) < denum * 1.125f)
+    {
+        return num > 0 ? 1 : -1;
+    }
+    return 1;
+}
+
+//////////////////////////////////////////CCORR/////////////////////////////////////////////////////////////////////////
+
+__kernel void matchTemplate_Naive_CCORR (__global const uchar * img,int img_step,int img_offset,
+                                         __global const uchar * tpl,int tpl_step,int tpl_offset,int tpl_rows, int tpl_cols,
+                                         __global uchar * res,int res_step,int res_offset,int res_rows,int res_cols)
+{
+    int gidx = get_global_id(0);
+    int gidy = get_global_id(1);
+    int i,j;
+    float sum = 0;
+
+    int res_idx = mad24(gidy, res_step, res_offset + gidx * (int)sizeof(float));
+
+    if(gidx < res_cols && gidy < res_rows)
+    {
+        for(i = 0; i < tpl_rows; i ++)
+        {
+            __global const ELEM_TYPE * img_ptr = (__global const ELEM_TYPE *)(img + mad24(gidy + i, img_step, gidx*DATA_SIZE + img_offset));
+            __global const ELEM_TYPE * tpl_ptr = (__global const ELEM_TYPE *)(tpl + mad24(i, tpl_step, tpl_offset));
+
+            for(j = 0; j < tpl_cols; j ++)
+
+#pragma unroll
+                for (int c = 0; c < CN; c++)
+
+                    sum += (float)(img_ptr[j*CN+c] * tpl_ptr[j*CN+c]);
+
+        }
+        __global float * result = (__global float *)(res+res_idx);
+        *result = sum;
+    }
+}
+
+__kernel void matchTemplate_CCORR_NORMED ( __global const uchar * img_sqsums, int img_sqsums_step, int img_sqsums_offset,
+                                           __global uchar * res, int res_step, int res_offset, int res_rows, int res_cols,
+                                           int tpl_rows, int tpl_cols, ulong tpl_sqsum)
+{
+    int gidx = get_global_id(0);
+    int gidy = get_global_id(1);
+
+    img_sqsums_step /= sizeof(float);
+    img_sqsums_offset /= sizeof(float);
+
+    int res_idx = mad24(gidy, res_step, res_offset + gidx * (int)sizeof(float));
+
+    if(gidx < res_cols && gidy < res_rows)
+    {
+        __global float * sqsum = (__global float*)(img_sqsums);
+        float image_sqsum_ = (float)(
+                                 (sqsum[SQSUMS_PTR(tpl_cols, tpl_rows)] - sqsum[SQSUMS_PTR(tpl_cols, 0)]) -
+                                 (sqsum[SQSUMS_PTR(0, tpl_rows)] - sqsum[SQSUMS_PTR(0, 0)]));
+
+        __global float * result = (__global float *)(res+res_idx);
+        *result = normAcc(*result, sqrt(image_sqsum_ * tpl_sqsum));
+    }
+}
+
+////////////////////////////////////////////SQDIFF////////////////////////////////////////////////////////////////////////
+
+__kernel void matchTemplate_Naive_SQDIFF(__global const uchar * img,int img_step,int img_offset,
+                                         __global const uchar * tpl,int tpl_step,int tpl_offset,int tpl_rows, int tpl_cols,
+                                         __global uchar * res,int res_step,int res_offset,int res_rows,int res_cols)
+{
+    int gidx = get_global_id(0);
+    int gidy = get_global_id(1);
+    int i,j;
+    float delta;
+    float sum = 0;
+
+    int res_idx = mad24(gidy, res_step, res_offset + gidx * (int)sizeof(float));
+
+    if(gidx < res_cols && gidy < res_rows)
+    {
+        for(i = 0; i < tpl_rows; i ++)
+        {
+            __global const ELEM_TYPE * img_ptr = (__global const ELEM_TYPE *)(img + mad24(gidy + i, img_step, gidx*DATA_SIZE + img_offset));
+            __global const ELEM_TYPE * tpl_ptr = (__global const ELEM_TYPE *)(tpl + mad24(i, tpl_step, tpl_offset));
+
+            for(j = 0; j < tpl_cols; j ++)
+
+#pragma unroll
+                for (int c = 0; c < CN; c++)
+                {
+                    delta = (float)(img_ptr[j*CN+c] - tpl_ptr[j*CN+c]);
+                    sum += delta*delta;
+                }
+        }
+        __global float * result = (__global float *)(res+res_idx);
+        *result = sum;
+    }
+}
+
+__kernel void matchTemplate_SQDIFF_NORMED ( __global const uchar * img_sqsums, int img_sqsums_step, int img_sqsums_offset,
+                                            __global uchar * res, int res_step, int res_offset, int res_rows, int res_cols,
+                                            int tpl_rows, int tpl_cols, ulong tpl_sqsum)
+{
+    int gidx = get_global_id(0);
+    int gidy = get_global_id(1);
+
+    img_sqsums_step /= sizeof(float);
+    img_sqsums_offset /= sizeof(float);
+
+    int res_idx = mad24(gidy, res_step, res_offset + gidx * (int)sizeof(float));
+
+    if(gidx < res_cols && gidy < res_rows)
+    {
+        __global float * sqsum = (__global float*)(img_sqsums);
+        float image_sqsum_ = (float)(
+                                 (sqsum[SQSUMS_PTR(tpl_cols, tpl_rows)] - sqsum[SQSUMS_PTR(tpl_cols, 0)]) -
+                                 (sqsum[SQSUMS_PTR(0, tpl_rows)] - sqsum[SQSUMS_PTR(0, 0)]));
+
+        __global float * result = (__global float *)(res+res_idx);
+
+        *result = normAcc_SQDIFF(image_sqsum_ - 2.f * result[0] + tpl_sqsum, sqrt(image_sqsum_ * tpl_sqsum));
+    }
+}
+
+////////////////////////////////////////////CCOEFF/////////////////////////////////////////////////////////////////
+
+__kernel void matchTemplate_Prepared_CCOEFF_C1 (__global const uchar * img_sums, int img_sums_step, int img_sums_offset,
+                                                  __global uchar * res, int res_step, int res_offset, int res_rows, int res_cols,
+                                                  int tpl_rows, int tpl_cols, float tpl_sum)
+{
+    int gidx = get_global_id(0);
+    int gidy = get_global_id(1);
+
+    img_sums_step /= ELEM_SIZE;
+    img_sums_offset /= ELEM_SIZE;
+
+    int res_idx = mad24(gidy, res_step, res_offset + gidx * (int)sizeof(float));
+    float image_sum_ = 0;
+
+    if(gidx < res_cols && gidy < res_rows)
+    {
+        __global ELEM_TYPE* sum = (__global ELEM_TYPE*)(img_sums);
+
+        image_sum_ += (float)((sum[SUMS_PTR(tpl_cols, tpl_rows)] - sum[SUMS_PTR(tpl_cols, 0)])-
+                              (sum[SUMS_PTR(0, tpl_rows)] - sum[SUMS_PTR(0, 0)])) * tpl_sum;
+
+        __global float * result = (__global float *)(res+res_idx);
+        *result -= image_sum_;
+    }
+}
+
+__kernel void matchTemplate_Prepared_CCOEFF_C2 (__global const uchar * img_sums, int img_sums_step, int img_sums_offset,
+                                                  __global uchar * res, int res_step, int res_offset, int res_rows, int res_cols,
+                                                  int tpl_rows, int tpl_cols, float tpl_sum_0,float tpl_sum_1)
+{
+    int gidx = get_global_id(0);
+    int gidy = get_global_id(1);
+
+    img_sums_step /= ELEM_SIZE;
+    img_sums_offset /= ELEM_SIZE;
+
+    int res_idx = mad24(gidy, res_step, res_offset + gidx * (int)sizeof(float));
+    float image_sum_ = 0;
+
+    if(gidx < res_cols && gidy < res_rows)
+    {
+        __global ELEM_TYPE* sum = (__global ELEM_TYPE*)(img_sums);
+
+        image_sum_ += tpl_sum_0 * (float)((sum[SUMS_PTR(tpl_cols, tpl_rows)] - sum[SUMS_PTR(tpl_cols, 0)])    -(sum[SUMS_PTR(0, tpl_rows)] - sum[SUMS_PTR(0, 0)]));
+        image_sum_ += tpl_sum_1 * (float)((sum[SUMS_PTR(tpl_cols, tpl_rows)+1] - sum[SUMS_PTR(tpl_cols, 0)+1])-(sum[SUMS_PTR(0, tpl_rows)+1] - sum[SUMS_PTR(0, 0)+1]));
+
+        __global float * result = (__global float *)(res+res_idx);
+
+        *result -= image_sum_;
+    }
+}
+
+__kernel void matchTemplate_Prepared_CCOEFF_C4 (__global const uchar * img_sums, int img_sums_step, int img_sums_offset,
+                                                __global uchar * res, int res_step, int res_offset, int res_rows, int res_cols,
+                                                int tpl_rows, int tpl_cols, float tpl_sum_0,float tpl_sum_1,float tpl_sum_2,float tpl_sum_3)
+{
+    int gidx = get_global_id(0);
+    int gidy = get_global_id(1);
+
+    img_sums_step /= ELEM_SIZE;
+    img_sums_offset /= ELEM_SIZE;
+
+    int res_idx = mad24(gidy, res_step, res_offset + gidx * (int)sizeof(float));
+    float image_sum_ = 0;
+
+    if(gidx < res_cols && gidy < res_rows)
+    {
+        __global ELEM_TYPE* sum = (__global ELEM_TYPE*)(img_sums);
+
+        int c_r = SUMS_PTR(tpl_cols, tpl_rows);
+        int c_o = SUMS_PTR(tpl_cols, 0);
+        int o_r = SUMS_PTR(0,tpl_rows);
+        int oo = SUMS_PTR(0, 0);
+
+        image_sum_ += tpl_sum_0 * (float)((sum[c_r]   - sum[c_o])  -(sum[o_r]   - sum[oo]));
+        image_sum_ += tpl_sum_1 * (float)((sum[c_r+1] - sum[c_o+1])-(sum[o_r+1] - sum[oo+1]));
+        image_sum_ += tpl_sum_2 * (float)((sum[c_r+2] - sum[c_o+2])-(sum[o_r+2] - sum[oo+2]));
+        image_sum_ += tpl_sum_3 * (float)((sum[c_r+3] - sum[c_o+3])-(sum[o_r+3] - sum[oo+3]));
+
+        __global float * result = (__global float *)(res+res_idx);
+
+        *result -= image_sum_;
+    }
+}
+
+__kernel void matchTemplate_CCOEFF_NORMED_C1 (__global const uchar * img_sums, int img_sums_step, int img_sums_offset,
+                                              __global const uchar * img_sqsums, int img_sqsums_step, int img_sqsums_offset,
+                                              __global uchar * res, int res_step, int res_offset, int res_rows, int res_cols,
+                                              int t_rows, int t_cols, float weight, float tpl_sum, float tpl_sqsum)
+{
+    int gidx = get_global_id(0);
+    int gidy = get_global_id(1);
+
+    img_sums_offset   /= ELEM_SIZE;
+    img_sums_step     /= ELEM_SIZE;
+    img_sqsums_step   /= sizeof(float);
+    img_sqsums_offset /= sizeof(float);
+
+    int res_idx = mad24(gidy, res_step, res_offset + gidx * (int)sizeof(float));
+
+    if(gidx < res_cols && gidy < res_rows)
+    {
+        __global ELEM_TYPE* sum = (__global ELEM_TYPE*)(img_sums);
+        __global float * sqsum = (__global float*)(img_sqsums);
+
+        float image_sum_ =  (float)((sum[SUMS_PTR(t_cols, t_rows)] - sum[SUMS_PTR(t_cols, 0)]) -
+                                    (sum[SUMS_PTR(0, t_rows)] - sum[SUMS_PTR(0, 0)]));
+
+        float image_sqsum_ = (float)((sqsum[SQSUMS_PTR(t_cols, t_rows)] - sqsum[SQSUMS_PTR(t_cols, 0)]) -
+                                     (sqsum[SQSUMS_PTR(0, t_rows)] - sqsum[SQSUMS_PTR(0, 0)]));
+
+        __global float * result = (__global float *)(res+res_idx);
+
+        *result = normAcc((*result) - image_sum_ * tpl_sum,
+                          sqrt(tpl_sqsum * (image_sqsum_ - weight * image_sum_ * image_sum_)));
+    }
+}
+
+__kernel void matchTemplate_CCOEFF_NORMED_C2 (__global const uchar * img_sums, int img_sums_step, int img_sums_offset,
+                                              __global const uchar * img_sqsums, int img_sqsums_step, int img_sqsums_offset,
+                                              __global uchar * res, int res_step, int res_offset, int res_rows, int res_cols,
+                                              int t_rows, int t_cols, float weight, float tpl_sum_0, float tpl_sum_1, float tpl_sqsum)
+{
+    int gidx = get_global_id(0);
+    int gidy = get_global_id(1);
+
+    img_sums_offset   /= ELEM_SIZE;
+    img_sums_step     /= ELEM_SIZE;
+    img_sqsums_step   /= sizeof(float);
+    img_sqsums_offset /= sizeof(float);
+
+    int res_idx = mad24(gidy, res_step, res_offset + gidx * (int)sizeof(float));
+
+    float sum_[2];
+    float sqsum_[2];
+
+    if(gidx < res_cols && gidy < res_rows)
+    {
+        __global ELEM_TYPE* sum = (__global ELEM_TYPE*)(img_sums);
+        __global float * sqsum = (__global float*)(img_sqsums);
+
+        sum_[0] =  (float)((sum[SUMS_PTR(t_cols, t_rows)] - sum[SUMS_PTR(t_cols, 0)])-(sum[SUMS_PTR(0, t_rows)] - sum[SUMS_PTR(0, 0)]));
+        sum_[1] =  (float)((sum[SUMS_PTR(t_cols, t_rows)+1] - sum[SUMS_PTR(t_cols, 0)+1])-(sum[SUMS_PTR(0, t_rows)+1] - sum[SUMS_PTR(0, 0)+1]));
+
+        sqsum_[0] = (float)((sqsum[SQSUMS_PTR(t_cols, t_rows)] - sqsum[SQSUMS_PTR(t_cols, 0)])-(sqsum[SQSUMS_PTR(0, t_rows)] - sqsum[SQSUMS_PTR(0, 0)]));
+        sqsum_[1] = (float)((sqsum[SQSUMS_PTR(t_cols, t_rows)+1] - sqsum[SQSUMS_PTR(t_cols, 0)+1])-(sqsum[SQSUMS_PTR(0, t_rows)+1] - sqsum[SQSUMS_PTR(0, 0)+1]));
+
+        float num = sum_[0]*tpl_sum_0 + sum_[1]*tpl_sum_1;
+
+        float denum = sqrt( tpl_sqsum * (sqsum_[0] - weight * sum_[0]* sum_[0] +
+                                         sqsum_[1] - weight * sum_[1]* sum_[1]));
+
+        __global float * result = (__global float *)(res+res_idx);
+        *result = normAcc((*result) - num, denum);
+    }
+}
+
+__kernel void matchTemplate_CCOEFF_NORMED_C4 (__global const uchar * img_sums, int img_sums_step, int img_sums_offset,
+                                              __global const uchar * img_sqsums, int img_sqsums_step, int img_sqsums_offset,
+                                              __global uchar * res, int res_step, int res_offset, int res_rows, int res_cols,
+                                              int t_rows, int t_cols, float weight,
+                                              float tpl_sum_0,float tpl_sum_1,float tpl_sum_2,float tpl_sum_3,
+                                              float tpl_sqsum)
+{
+    int gidx = get_global_id(0);
+    int gidy = get_global_id(1);
+
+    img_sums_offset   /= ELEM_SIZE;
+    img_sums_step     /= ELEM_SIZE;
+    img_sqsums_step   /= sizeof(float);
+    img_sqsums_offset /= sizeof(float);
+
+    int res_idx = mad24(gidy, res_step, res_offset + gidx * (int)sizeof(float));
+
+    float sum_[4];
+    float sqsum_[4];
+
+    if(gidx < res_cols && gidy < res_rows)
+    {
+        __global ELEM_TYPE* sum = (__global ELEM_TYPE*)(img_sums);
+        __global float * sqsum = (__global float*)(img_sqsums);
+
+        int c_r = SUMS_PTR(t_cols, t_rows);
+        int c_o = SUMS_PTR(t_cols, 0);
+        int o_r = SUMS_PTR(0, t_rows);
+        int o_o = SUMS_PTR(0, 0);
+
+        sum_[0] =  (float)((sum[c_r]   - sum[c_o])  -(sum[o_r]   - sum[o_o ]));
+        sum_[1] =  (float)((sum[c_r+1] - sum[c_o+1])-(sum[o_r+1] - sum[o_o +1]));
+        sum_[2] =  (float)((sum[c_r+2] - sum[c_o+2])-(sum[o_r+2] - sum[o_o +2]));
+        sum_[3] =  (float)((sum[c_r+3] - sum[c_o+3])-(sum[o_r+3] - sum[o_o +3]));
+
+        c_r = SQSUMS_PTR(t_cols, t_rows);
+        c_o = SQSUMS_PTR(t_cols, 0);
+        o_r = SQSUMS_PTR(0, t_rows);
+        o_o = SQSUMS_PTR(0, 0);
+
+        sqsum_[0] = (float)((sqsum[c_r]   - sqsum[c_o])  -(sqsum[o_r]   - sqsum[o_o]));
+        sqsum_[1] = (float)((sqsum[c_r+1] - sqsum[c_o+1])-(sqsum[o_r+1] - sqsum[o_o+1]));
+        sqsum_[2] = (float)((sqsum[c_r+2] - sqsum[c_o+2])-(sqsum[o_r+2] - sqsum[o_o+2]));
+        sqsum_[3] = (float)((sqsum[c_r+3] - sqsum[c_o+3])-(sqsum[o_r+3] - sqsum[o_o+3]));
+
+        float num = sum_[0]*tpl_sum_0 + sum_[1]*tpl_sum_1 + sum_[2]*tpl_sum_2 + sum_[3]*tpl_sum_3;
+
+        float denum = sqrt( tpl_sqsum * (
+                                sqsum_[0] - weight * sum_[0]* sum_[0] +
+                                sqsum_[1] - weight * sum_[1]* sum_[1] +
+                                sqsum_[2] - weight * sum_[2]* sum_[2] +
+                                sqsum_[3] - weight * sum_[3]* sum_[3] ));
+
+        __global float * result = (__global float *)(res+res_idx);
+        *result = normAcc((*result) - num, denum);
+    }
+}
--- a/modules/imgproc/src/opencl/precornerdetect.cl
+++ b/modules/imgproc/src/opencl/precornerdetect.cl
@@ -0,0 +1,75 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Shengen Yan,yanshengen@gmail.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+__kernel void preCornerDetect(__global const uchar * Dxptr, int dx_step, int dx_offset,
+                              __global const uchar * Dyptr, int dy_step, int dy_offset,
+                              __global const uchar * D2xptr, int d2x_step, int d2x_offset,
+                              __global const uchar * D2yptr, int d2y_step, int d2y_offset,
+                              __global const uchar * Dxyptr, int dxy_step, int dxy_offset,
+                              __global uchar * dstptr, int dst_step, int dst_offset,
+                              int dst_rows, int dst_cols, float factor)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if (x < dst_cols && y < dst_rows)
+    {
+        int dx_index = mad24(dx_step, y, (int)sizeof(float) * x + dx_offset);
+        int dy_index = mad24(dy_step, y, (int)sizeof(float) * x + dy_offset);
+        int d2x_index = mad24(d2x_step, y, (int)sizeof(float) * x + d2x_offset);
+        int d2y_index = mad24(d2y_step, y, (int)sizeof(float) * x + d2y_offset);
+        int dxy_index = mad24(dxy_step, y, (int)sizeof(float) * x + dxy_offset);
+        int dst_index = mad24(dst_step, y, (int)sizeof(float) * x + dst_offset);
+
+        float dx = *(__global const float *)(Dxptr + dx_index);
+        float dy = *(__global const float *)(Dyptr + dy_index);
+        float d2x = *(__global const float *)(D2xptr + d2x_index);
+        float d2y = *(__global const float *)(D2yptr + d2y_index);
+        float dxy = *(__global const float *)(Dxyptr + dxy_index);
+        __global float * dst = (__global float *)(dstptr + dst_index);
+
+        dst[0] = factor * (dx*dx*d2y + dy*dy*d2x - 2*dx*dy*dxy);
+    }
+}
--- a/modules/imgproc/src/opencl/resize.cl
+++ b/modules/imgproc/src/opencl/resize.cl
@@ -50,7 +50,6 @@
 #define INTER_RESIZE_COEF_BITS 11
 #define INTER_RESIZE_COEF_SCALE (1 << INTER_RESIZE_COEF_BITS)
 #define CAST_BITS (INTER_RESIZE_COEF_BITS << 1)
-#define CAST_SCALE (1.0f/(1<<CAST_BITS))
 #define INC(x,l) min(x+1,l-1)

 #define PIXSIZE ((int)sizeof(PIXTYPE))
@@ -79,7 +78,6 @@ __kernel void resizeLN(__global const uchar* srcptr, int srcstep, int srcoffset,

    int y_ = INC(y,srcrows);
    int x_ = INC(x,srccols);
-    __global const PIXTYPE* src = (__global const PIXTYPE*)(srcptr + mad24(y, srcstep, srcoffset + x*PIXSIZE));

 #if depth <= 4

--- a/modules/imgproc/src/phasecorr.cpp
+++ b/modules/imgproc/src/phasecorr.cpp
@@ -576,20 +576,23 @@ void cv::createHanningWindow(OutputArray _dst, cv::Size winSize, int type)
    _dst.create(winSize, type);
    Mat dst = _dst.getMat();

-    int rows = dst.rows;
-    int cols = dst.cols;
+    int rows = dst.rows, cols = dst.cols;
+
+    AutoBuffer<double> _wc(cols);
+    double * const wc = (double *)_wc;
+
+    double coeff0 = 2.0 * CV_PI / (double)(cols - 1), coeff1 = 2.0f * CV_PI / (double)(rows - 1);
+    for(int j = 0; j < cols; j++)
+        wc[j] = 0.5 * (1.0 - cos(coeff0 * j));

    if(dst.depth() == CV_32F)
    {
        for(int i = 0; i < rows; i++)
        {
            float* dstData = dst.ptr<float>(i);
-            double wr = 0.5 * (1.0f - cos(2.0f * CV_PI * (double)i / (double)(rows - 1)));
+            double wr = 0.5 * (1.0 - cos(coeff1 * i));
            for(int j = 0; j < cols; j++)
-            {
-                double wc = 0.5 * (1.0f - cos(2.0f * CV_PI * (double)j / (double)(cols - 1)));
-                dstData[j] = (float)(wr * wc);
-            }
+                dstData[j] = (float)(wr * wc[j]);
        }
    }
    else
@@ -597,12 +600,9 @@ void cv::createHanningWindow(OutputArray _dst, cv::Size winSize, int type)
        for(int i = 0; i < rows; i++)
        {
            double* dstData = dst.ptr<double>(i);
-            double wr = 0.5 * (1.0 - cos(2.0 * CV_PI * (double)i / (double)(rows - 1)));
+            double wr = 0.5 * (1.0 - cos(coeff1 * i));
            for(int j = 0; j < cols; j++)
-            {
-                double wc = 0.5 * (1.0 - cos(2.0 * CV_PI * (double)j / (double)(cols - 1)));
-                dstData[j] = wr * wc;
-            }
+                dstData[j] = wr * wc[j];
        }
    }

--- a/modules/imgproc/src/pyramids.cpp
+++ b/modules/imgproc/src/pyramids.cpp
@@ -401,6 +401,8 @@ pyrUp_( const Mat& _src, Mat& _dst, int)

 typedef void (*PyrFunc)(const Mat&, Mat&, int);

+#ifdef HAVE_OPENCL
+
 static bool ocl_pyrDown( InputArray _src, OutputArray _dst, const Size& _dsz, int borderType)
 {
    int type = _src.type(), depth = CV_MAT_DEPTH(type), channels = CV_MAT_CN(type);
@@ -484,13 +486,14 @@ static bool ocl_pyrUp( InputArray _src, OutputArray _dst, const Size& _dsz, int
    return k.run(2, globalThreads, localThreads, false);
 }

+#endif
+
 }

 void cv::pyrDown( InputArray _src, OutputArray _dst, const Size& _dsz, int borderType )
 {
-    if (ocl::useOpenCL() && _dst.isUMat() &&
-        ocl_pyrDown(_src, _dst, _dsz, borderType))
-        return;
+    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
+               ocl_pyrDown(_src, _dst, _dsz, borderType))

    Mat src = _src.getMat();
    Size dsz = _dsz.area() == 0 ? Size((src.cols + 1)/2, (src.rows + 1)/2) : _dsz;
@@ -522,9 +525,8 @@ void cv::pyrDown( InputArray _src, OutputArray _dst, const Size& _dsz, int borde

 void cv::pyrUp( InputArray _src, OutputArray _dst, const Size& _dsz, int borderType )
 {
-    if (ocl::useOpenCL() && _dst.isUMat() &&
-        ocl_pyrUp(_src, _dst, _dsz, borderType))
-        return;
+    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
+               ocl_pyrUp(_src, _dst, _dsz, borderType))

    Mat src = _src.getMat();
    Size dsz = _dsz.area() == 0 ? Size(src.cols*2, src.rows*2) : _dsz;
@@ -556,6 +558,16 @@ void cv::pyrUp( InputArray _src, OutputArray _dst, const Size& _dsz, int borderT

 void cv::buildPyramid( InputArray _src, OutputArrayOfArrays _dst, int maxlevel, int borderType )
 {
+    if (_src.dims() <= 2 && _dst.isUMatVector())
+    {
+        UMat src = _src.getUMat();
+        _dst.create( maxlevel + 1, 1, 0 );
+        _dst.getUMatRef(0) = src;
+        for( int i = 1; i <= maxlevel; i++ )
+            pyrDown( _dst.getUMatRef(i-1), _dst.getUMatRef(i), Size(), borderType );
+        return;
+    }
+
    Mat src = _src.getMat();
    _dst.create( maxlevel + 1, 1, 0 );
    _dst.getMatRef(0) = src;
--- a/modules/imgproc/src/smooth.cpp
+++ b/modules/imgproc/src/smooth.cpp
@@ -611,155 +611,114 @@ template<> struct ColumnSum<int, ushort> : public BaseColumnFilter
    std::vector<int> sum;
 };

+#ifdef HAVE_OPENCL
+
 #define DIVUP(total, grain) ((total + grain - 1) / (grain))

 static bool ocl_boxFilter( InputArray _src, OutputArray _dst, int ddepth,
-                           Size ksize, Point anchor, int borderType )
+                           Size ksize, Point anchor, int borderType, bool normalize )
 {
-    int type = _src.type();
-    int cn = CV_MAT_CN(type);
-    if ((1 != cn) && (2 != cn) && (4 != cn))
-        return false;//TODO
+    int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), esz = CV_ELEM_SIZE(type);
+    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;

-    int sdepth = CV_MAT_DEPTH(type);
-    if( ddepth < 0 )
+    if (ddepth < 0)
        ddepth = sdepth;
-    else if (ddepth != sdepth)
+
+    if (!(cn == 1 || cn == 2 || cn == 4) || (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F)) ||
+        _src.offset() % esz != 0 || _src.step() % esz != 0)
        return false;
-    if( anchor.x < 0 )
+
+    if (anchor.x < 0)
        anchor.x = ksize.width / 2;
-    if( anchor.y < 0 )
+    if (anchor.y < 0)
        anchor.y = ksize.height / 2;

-    ocl::Kernel kernel;
-
-    //Normalize the result by default
+    int computeUnits = ocl::Device::getDefault().maxComputeUnits();
    float alpha = 1.0f / (ksize.height * ksize.width);
-    bool isIsolatedBorder = (borderType & BORDER_ISOLATED) != 0;
-    bool useDouble = (CV_64F == sdepth);
-    const cv::ocl::Device &device = cv::ocl::Device::getDefault();
-    int doubleFPConfig = device.doubleFPConfig();
-    if (useDouble && (0 == doubleFPConfig))
-        return false;// may be we have to check is  (0 != (CL_FP_SOFT_FLOAT & doubleFPConfig)) ?
+    Size size = _src.size(), wholeSize;
+    bool isolated = (borderType & BORDER_ISOLATED) != 0;
+    borderType &= ~BORDER_ISOLATED;
+    int wdepth = std::max(CV_32F, std::max(ddepth, sdepth));

-    const char* btype = NULL;
-    switch (borderType & ~BORDER_ISOLATED)
+    const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", 0, "BORDER_REFLECT_101" };
+    size_t globalsize[2] = { size.width, size.height };
+    size_t localsize[2] = { 0, 1 };
+
+    UMat src = _src.getUMat();
+    if (!isolated)
    {
-    case BORDER_CONSTANT:
-        btype = "BORDER_CONSTANT";
-        break;
-    case BORDER_REPLICATE:
-        btype = "BORDER_REPLICATE";
-        break;
-    case BORDER_REFLECT:
-        btype = "BORDER_REFLECT";
-        break;
-    case BORDER_WRAP:
-        //CV_Error(CV_StsUnsupportedFormat, "BORDER_WRAP is not supported!");
-        return false;
-    case BORDER_REFLECT101:
-        btype = "BORDER_REFLECT_101";
-        break;
-    }
-
-    cv::Size sz = _src.size();
-
-    size_t globalsize[2] = {sz.width, sz.height};
-    size_t localsize[2] = {0, 1};
-
-    UMat src; Size wholeSize;
-    if (!isIsolatedBorder)
-    {
-        src = _src.getUMat();
        Point ofs;
        src.locateROI(wholeSize, ofs);
    }

-    size_t maxWorkItemSizes[32]; device.maxWorkItemSizes(maxWorkItemSizes);
-    size_t tryWorkItems = maxWorkItemSizes[0];
-    for (;;)
+    int h = isolated ? size.height : wholeSize.height;
+    int w = isolated ? size.width : wholeSize.width;
+
+    size_t maxWorkItemSizes[32];
+    ocl::Device::getDefault().maxWorkItemSizes(maxWorkItemSizes);
+    int tryWorkItems = (int)maxWorkItemSizes[0];
+
+    ocl::Kernel kernel;
+    for ( ; ; )
    {
-        size_t BLOCK_SIZE = tryWorkItems;
-        while (BLOCK_SIZE > 32 && BLOCK_SIZE >= (size_t)ksize.width * 2 && BLOCK_SIZE > (size_t)sz.width * 2)
-            BLOCK_SIZE /= 2;
-        size_t BLOCK_SIZE_Y = 8; // TODO Check heuristic value on devices
-        while (BLOCK_SIZE_Y < BLOCK_SIZE / 8 && BLOCK_SIZE_Y * device.maxComputeUnits() * 32 < (size_t)sz.height)
+        int BLOCK_SIZE_X = tryWorkItems, BLOCK_SIZE_Y = 8;
+
+        while (BLOCK_SIZE_X > 32 && BLOCK_SIZE_X >= ksize.width * 2 && BLOCK_SIZE_X > size.width * 2)
+            BLOCK_SIZE_X /= 2;
+        while (BLOCK_SIZE_Y < BLOCK_SIZE_X / 8 && BLOCK_SIZE_Y * computeUnits * 32 < size.height)
            BLOCK_SIZE_Y *= 2;

-        if ((size_t)ksize.width > BLOCK_SIZE)
+        if (ksize.width > BLOCK_SIZE_X || w < ksize.width || h < ksize.height)
            return false;

-        int requiredTop = anchor.y;
-        int requiredLeft = (int)BLOCK_SIZE; // not this: anchor.x;
-        int requiredBottom = ksize.height - 1 - anchor.y;
-        int requiredRight = (int)BLOCK_SIZE; // not this: ksize.width - 1 - anchor.x;
-        int h = isIsolatedBorder ? sz.height : wholeSize.height;
-        int w = isIsolatedBorder ? sz.width : wholeSize.width;
+        char cvt[2][50];
+        String opts = format("-D LOCAL_SIZE_X=%d -D BLOCK_SIZE_Y=%d -D ST=%s -D DT=%s -D WT=%s -D convertToDT=%s -D convertToWT=%s "
+                             "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D %s%s%s%s",
+                             BLOCK_SIZE_X, BLOCK_SIZE_Y, ocl::typeToStr(type), ocl::typeToStr(CV_MAKE_TYPE(ddepth, cn)),
+                             ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)),
+                             ocl::convertTypeStr(wdepth, ddepth, cn, cvt[0]),
+                             ocl::convertTypeStr(sdepth, wdepth, cn, cvt[1]),
+                             anchor.x, anchor.y, ksize.width, ksize.height, borderMap[borderType],
+                             isolated ? " -D BORDER_ISOLATED" : "", doubleSupport ? " -D DOUBLE_SUPPORT" : "",
+                             normalize ? " -D NORMALIZE" : "");

-        bool extra_extrapolation = h < requiredTop || h < requiredBottom || w < requiredLeft || w < requiredRight;
+        localsize[0] = BLOCK_SIZE_X;
+        globalsize[0] = DIVUP(size.width, BLOCK_SIZE_X - (ksize.width - 1)) * BLOCK_SIZE_X;
+        globalsize[1] = DIVUP(size.height, BLOCK_SIZE_Y);

-        if ((w < ksize.width) || (h < ksize.height))
-            return false;
-
-        char build_options[1024];
-        sprintf(build_options, "-D LOCAL_SIZE=%d -D BLOCK_SIZE_Y=%d -D DATA_DEPTH=%d -D DATA_CHAN=%d -D USE_DOUBLE=%d -D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D %s -D %s -D %s",
-                (int)BLOCK_SIZE, (int)BLOCK_SIZE_Y,
-                sdepth, cn, useDouble ? 1 : 0,
-                anchor.x, anchor.y, ksize.width, ksize.height,
-                btype,
-                extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION",
-                isIsolatedBorder ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED");
-
-        localsize[0] = BLOCK_SIZE;
-        globalsize[0] = DIVUP(sz.width, BLOCK_SIZE - (ksize.width - 1)) * BLOCK_SIZE;
-        globalsize[1] = DIVUP(sz.height, BLOCK_SIZE_Y);
-
-        cv::String errmsg;
-        kernel.create("boxFilter", cv::ocl::imgproc::boxFilter_oclsrc, build_options);
+        kernel.create("boxFilter", cv::ocl::imgproc::boxFilter_oclsrc, opts);

        size_t kernelWorkGroupSize = kernel.workGroupSize();
        if (localsize[0] <= kernelWorkGroupSize)
            break;
-
-        if (BLOCK_SIZE < kernelWorkGroupSize)
+        if (BLOCK_SIZE_X < (int)kernelWorkGroupSize)
            return false;
-        tryWorkItems = kernelWorkGroupSize;
+
+        tryWorkItems = (int)kernelWorkGroupSize;
    }

-    _dst.create(sz, CV_MAKETYPE(ddepth, cn));
+    _dst.create(size, CV_MAKETYPE(ddepth, cn));
    UMat dst = _dst.getUMat();
-    if (src.empty())
-        src = _src.getUMat();
-    int idxArg = 0;
-    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(src));
+
+    int idxArg = kernel.set(0, ocl::KernelArg::PtrReadOnly(src));
    idxArg = kernel.set(idxArg, (int)src.step);
    int srcOffsetX = (int)((src.offset % src.step) / src.elemSize());
    int srcOffsetY = (int)(src.offset / src.step);
-    int srcEndX = (isIsolatedBorder ? (srcOffsetX + sz.width) : wholeSize.width);
-    int srcEndY = (isIsolatedBorder ? (srcOffsetY + sz.height) : wholeSize.height);
+    int srcEndX = isolated ? srcOffsetX + size.width : wholeSize.width;
+    int srcEndY = isolated ? srcOffsetY + size.height : wholeSize.height;
    idxArg = kernel.set(idxArg, srcOffsetX);
    idxArg = kernel.set(idxArg, srcOffsetY);
    idxArg = kernel.set(idxArg, srcEndX);
    idxArg = kernel.set(idxArg, srcEndY);
    idxArg = kernel.set(idxArg, ocl::KernelArg::WriteOnly(dst));
-    float borderValue[4] = {0, 0, 0, 0};
-    double borderValueDouble[4] = {0, 0, 0, 0};
-    if ((borderType & ~BORDER_ISOLATED) == BORDER_CONSTANT)
-    {
-        int cnocl = (3 == cn) ? 4 : cn;
-        if (useDouble)
-            idxArg = kernel.set(idxArg, (void *)&borderValueDouble[0], sizeof(double) * cnocl);
-        else
-            idxArg = kernel.set(idxArg, (void *)&borderValue[0], sizeof(float) * cnocl);
-    }
-    if (useDouble)
-        idxArg = kernel.set(idxArg, (double)alpha);
-    else
+    if (normalize)
        idxArg = kernel.set(idxArg, (float)alpha);

-    return kernel.run(2, globalsize, localsize, true);
+    return kernel.run(2, globalsize, localsize, false);
 }

+#endif
+
 }


@@ -862,9 +821,7 @@ void cv::boxFilter( InputArray _src, OutputArray _dst, int ddepth,
                Size ksize, Point anchor,
                bool normalize, int borderType )
 {
-    bool use_opencl = ocl::useOpenCL() && _dst.isUMat() && normalize;
-    if( use_opencl && ocl_boxFilter(_src, _dst, ddepth, ksize, anchor, borderType) )
-        return;
+    CV_OCL_RUN(_dst.isUMat(), ocl_boxFilter(_src, _dst, ddepth, ksize, anchor, borderType, normalize))

    Mat src = _src.getMat();
    int sdepth = src.depth(), cn = src.channels();
@@ -1920,39 +1877,41 @@ medianBlur_SortNet( const Mat& _src, Mat& _dst, int m )
    }
 }

+#ifdef HAVE_OPENCL
+
+static bool ocl_medianFilter ( InputArray _src, OutputArray _dst, int m)
+{
+    int type = _src.type();
+    int depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
+
+    if (!((depth == CV_8U || depth == CV_16U || depth == CV_16S || depth == CV_32F) && (cn != 3 && cn <= 4)))
+        return false;
+
+    const char * kernelName;
+
+    if (m == 3)
+        kernelName = "medianFilter3";
+    else if (m == 5)
+        kernelName = "medianFilter5";
+    else
+        return false;
+
+    ocl::Kernel k(kernelName,ocl::imgproc::medianFilter_oclsrc,format("-D type=%s",ocl::typeToStr(type)));
+    if (k.empty())
+        return false;
+
+    UMat src = _src.getUMat();
+    _dst.create(_src.size(),type);
+    UMat dst = _dst.getUMat();
+
+    size_t globalsize[2] = {(src.cols + 18) / 16 * 16, (src.rows + 15) / 16 * 16};
+    size_t localsize[2] = {16, 16};
+
+    return k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst)).run(2,globalsize,localsize,false);
 }

-namespace cv
-{
-    static bool ocl_medianFilter ( InputArray _src, OutputArray _dst, int m)
-    {
-        int type = _src.type();
-        int depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
+#endif

-        if (!((depth == CV_8U || depth == CV_16U || depth == CV_16S || depth == CV_32F) && (cn != 3 && cn <= 4)))
-            return false;
-
-        const char * kernelName;
-
-        if (m==3)
-            kernelName = "medianFilter3";
-        else if (m==5)
-            kernelName = "medianFilter5";
-        else
-            return false;
-
-        ocl::Kernel k(kernelName,ocl::imgproc::medianFilter_oclsrc,format("-D type=%s",ocl::typeToStr(type)));
-        if (k.empty())
-            return false;
-
-        _dst.create(_src.size(),type);
-        UMat src = _src.getUMat(), dst = _dst.getUMat();
-
-        size_t globalsize[2] = {(src.cols + 18) / 16 * 16, (src.rows + 15) / 16 * 16};
-        size_t localsize[2] = {16, 16};
-
-        return k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst)).run(2,globalsize,localsize,false);
-    }
 }

 void cv::medianBlur( InputArray _src0, OutputArray _dst, int ksize )
@@ -1961,16 +1920,12 @@ void cv::medianBlur( InputArray _src0, OutputArray _dst, int ksize )

    if( ksize <= 1 )
    {
-        Mat src0 = _src0.getMat();
-        _dst.create( src0.size(), src0.type() );
-        Mat dst = _dst.getMat();
-        src0.copyTo(dst);
+        _src0.copyTo(_dst);
        return;
    }

-    bool use_opencl = ocl::useOpenCL() && _dst.isUMat();
-    if ( use_opencl && ocl_medianFilter(_src0,_dst, ksize))
-        return;
+    CV_OCL_RUN(_src0.dims() <= 2 && _dst.isUMat(),
+               ocl_medianFilter(_src0,_dst, ksize))

    Mat src0 = _src0.getMat();
    _dst.create( src0.size(), src0.type() );
@@ -2226,6 +2181,8 @@ private:
 };
 #endif

+#ifdef HAVE_OPENCL
+
 static bool ocl_bilateralFilter_8u(InputArray _src, OutputArray _dst, int d,
                                   double sigma_color, double sigma_space,
                                   int borderType)
@@ -2301,6 +2258,8 @@ static bool ocl_bilateralFilter_8u(InputArray _src, OutputArray _dst, int d,
    return k.run(2, globalsize, NULL, false);
 }

+#endif
+
 static void
 bilateralFilter_8u( const Mat& src, Mat& dst, int d,
    double sigma_color, double sigma_space,
@@ -2651,9 +2610,8 @@ void cv::bilateralFilter( InputArray _src, OutputArray _dst, int d,
 {
    _dst.create( _src.size(), _src.type() );

-    if (ocl::useOpenCL() && _src.dims() <= 2 && _dst.isUMat() &&
-            ocl_bilateralFilter_8u(_src, _dst, d, sigmaColor, sigmaSpace, borderType))
-        return;
+    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
+               ocl_bilateralFilter_8u(_src, _dst, d, sigmaColor, sigmaSpace, borderType))

    Mat src = _src.getMat(), dst = _dst.getMat();

@@ -2666,285 +2624,6 @@ void cv::bilateralFilter( InputArray _src, OutputArray _dst, int d,
        "Bilateral filtering is only implemented for 8u and 32f images" );
 }

-
-/****************************************************************************************\
-                                  Adaptive Bilateral Filtering
-\****************************************************************************************/
-
-namespace cv
-{
-#ifndef ABF_CALCVAR
-#define ABF_CALCVAR 1
-#endif
-
-#ifndef ABF_FIXED_WEIGHT
-#define ABF_FIXED_WEIGHT 0
-#endif
-
-#ifndef ABF_GAUSSIAN
-#define ABF_GAUSSIAN 1
-#endif
-
-class adaptiveBilateralFilter_8u_Invoker :
-    public ParallelLoopBody
-{
-public:
-    adaptiveBilateralFilter_8u_Invoker(Mat& _dest, const Mat& _temp, Size _ksize, double _sigma_space, double _maxSigmaColor, Point _anchor) :
-        temp(&_temp), dest(&_dest), ksize(_ksize), sigma_space(_sigma_space), maxSigma_Color(_maxSigmaColor), anchor(_anchor)
-    {
-        if( sigma_space <= 0 )
-            sigma_space = 1;
-        CV_Assert((ksize.width & 1) && (ksize.height & 1));
-        space_weight.resize(ksize.width * ksize.height);
-        double sigma2 = sigma_space * sigma_space;
-        int idx = 0;
-        int w = ksize.width / 2;
-        int h = ksize.height / 2;
-        for(int y=-h; y<=h; y++)
-            for(int x=-w; x<=w; x++)
-        {
-#if ABF_GAUSSIAN
-            space_weight[idx++] = (float)exp ( -0.5*(x * x + y * y)/sigma2);
-#else
-            space_weight[idx++] = (float)(sigma2 / (sigma2 + x * x + y * y));
-#endif
-        }
-    }
-    virtual void operator()(const Range& range) const
-    {
-        int cn = dest->channels();
-        int anX = anchor.x;
-
-        const uchar *tptr;
-
-        for(int i = range.start;i < range.end; i++)
-        {
-            int startY = i;
-            if(cn == 1)
-            {
-                float var;
-                int currVal;
-                int sumVal = 0;
-                int sumValSqr = 0;
-                int currValCenter;
-                int currWRTCenter;
-                float weight;
-                float totalWeight = 0.;
-                float tmpSum = 0.;
-
-                for(int j = 0;j < dest->cols *cn; j+=cn)
-                {
-                    sumVal = 0;
-                    sumValSqr= 0;
-                    totalWeight = 0.;
-                    tmpSum = 0.;
-
-                    // Top row: don't sum the very last element
-                    int startLMJ = 0;
-                    int endLMJ  = ksize.width  - 1;
-                    int howManyAll = (anX *2 +1)*(ksize.width );
-#if ABF_CALCVAR
-                    for(int x = startLMJ; x< endLMJ; x++)
-                    {
-                        tptr = temp->ptr(startY + x) +j;
-                        for(int y=-anX; y<=anX; y++)
-                        {
-                            currVal = tptr[cn*(y+anX)];
-                            sumVal += currVal;
-                            sumValSqr += (currVal *currVal);
-                        }
-                    }
-                    var = ( (sumValSqr * howManyAll)- sumVal * sumVal )  /  ( (float)(howManyAll*howManyAll));
-
-                    if(var < 0.01)
-                        var = 0.01f;
-                    else if(var > (float)(maxSigma_Color*maxSigma_Color) )
-                        var =  (float)(maxSigma_Color*maxSigma_Color) ;
-
-#else
-                    var = maxSigmaColor*maxSigmaColor;
-#endif
-                    startLMJ = 0;
-                    endLMJ = ksize.width;
-                    tptr = temp->ptr(startY + (startLMJ+ endLMJ)/2);
-                    currValCenter =tptr[j+cn*anX];
-                    for(int x = startLMJ; x< endLMJ; x++)
-                    {
-                        tptr = temp->ptr(startY + x) +j;
-                        for(int y=-anX; y<=anX; y++)
-                        {
-#if ABF_FIXED_WEIGHT
-                            weight = 1.0;
-#else
-                            currVal = tptr[cn*(y+anX)];
-                            currWRTCenter = currVal - currValCenter;
-
-#if ABF_GAUSSIAN
-                            weight = exp ( -0.5f * currWRTCenter * currWRTCenter/var ) * space_weight[x*ksize.width+y+anX];
-#else
-                            weight = var / ( var + (currWRTCenter * currWRTCenter) ) * space_weight[x*ksize.width+y+anX];
-#endif
-
-#endif
-                            tmpSum += ((float)tptr[cn*(y+anX)] * weight);
-                            totalWeight += weight;
-                        }
-                    }
-                    tmpSum /= totalWeight;
-
-                   dest->at<uchar>(startY ,j)= static_cast<uchar>(tmpSum);
-                }
-            }
-            else
-            {
-                assert(cn == 3);
-                float var_b, var_g, var_r;
-                int currVal_b, currVal_g, currVal_r;
-                int sumVal_b= 0, sumVal_g= 0, sumVal_r= 0;
-                int sumValSqr_b= 0, sumValSqr_g= 0, sumValSqr_r= 0;
-                int currValCenter_b= 0, currValCenter_g= 0, currValCenter_r= 0;
-                int currWRTCenter_b, currWRTCenter_g, currWRTCenter_r;
-                float weight_b, weight_g, weight_r;
-                float totalWeight_b= 0., totalWeight_g= 0., totalWeight_r= 0.;
-                float tmpSum_b = 0., tmpSum_g= 0., tmpSum_r = 0.;
-
-                for(int j = 0;j < dest->cols *cn; j+=cn)
-                {
-                    sumVal_b= 0, sumVal_g= 0, sumVal_r= 0;
-                    sumValSqr_b= 0, sumValSqr_g= 0, sumValSqr_r= 0;
-                    totalWeight_b= 0., totalWeight_g= 0., totalWeight_r= 0.;
-                    tmpSum_b = 0., tmpSum_g= 0., tmpSum_r = 0.;
-
-                    // Top row: don't sum the very last element
-                    int startLMJ = 0;
-                    int endLMJ  = ksize.width - 1;
-                    int howManyAll = (anX *2 +1)*(ksize.width);
-#if ABF_CALCVAR
-                    float max_var = (float)( maxSigma_Color*maxSigma_Color);
-                    for(int x = startLMJ; x< endLMJ; x++)
-                    {
-                        tptr = temp->ptr(startY + x) +j;
-                        for(int y=-anX; y<=anX; y++)
-                        {
-                            currVal_b = tptr[cn*(y+anX)], currVal_g = tptr[cn*(y+anX)+1], currVal_r =tptr[cn*(y+anX)+2];
-                            sumVal_b += currVal_b;
-                            sumVal_g += currVal_g;
-                            sumVal_r += currVal_r;
-                            sumValSqr_b += (currVal_b *currVal_b);
-                            sumValSqr_g += (currVal_g *currVal_g);
-                            sumValSqr_r += (currVal_r *currVal_r);
-                        }
-                    }
-                    var_b =  ( (sumValSqr_b * howManyAll)- sumVal_b * sumVal_b )  /  ( (float)(howManyAll*howManyAll));
-                    var_g =  ( (sumValSqr_g * howManyAll)- sumVal_g * sumVal_g )  /  ( (float)(howManyAll*howManyAll));
-                    var_r =  ( (sumValSqr_r * howManyAll)- sumVal_r * sumVal_r )  /  ( (float)(howManyAll*howManyAll));
-
-                    if(var_b < 0.01)
-                        var_b = 0.01f;
-                    else if(var_b > max_var )
-                        var_b =  (float)(max_var) ;
-
-                    if(var_g < 0.01)
-                        var_g = 0.01f;
-                    else if(var_g > max_var )
-                        var_g =  (float)(max_var) ;
-
-                    if(var_r < 0.01)
-                        var_r = 0.01f;
-                    else if(var_r > max_var )
-                        var_r =  (float)(max_var) ;
-
-#else
-                    var_b = maxSigma_Color*maxSigma_Color; var_g = maxSigma_Color*maxSigma_Color; var_r = maxSigma_Color*maxSigma_Color;
-#endif
-                    startLMJ = 0;
-                    endLMJ = ksize.width;
-                    tptr = temp->ptr(startY + (startLMJ+ endLMJ)/2) + j;
-                    currValCenter_b =tptr[cn*anX], currValCenter_g =tptr[cn*anX+1], currValCenter_r =tptr[cn*anX+2];
-                    for(int x = startLMJ; x< endLMJ; x++)
-                    {
-                        tptr = temp->ptr(startY + x) +j;
-                        for(int y=-anX; y<=anX; y++)
-                        {
-#if ABF_FIXED_WEIGHT
-                            weight_b = 1.0;
-                            weight_g = 1.0;
-                            weight_r = 1.0;
-#else
-                            currVal_b = tptr[cn*(y+anX)];currVal_g=tptr[cn*(y+anX)+1];currVal_r=tptr[cn*(y+anX)+2];
-                            currWRTCenter_b = currVal_b - currValCenter_b;
-                            currWRTCenter_g = currVal_g - currValCenter_g;
-                            currWRTCenter_r = currVal_r - currValCenter_r;
-
-                            float cur_spw = space_weight[x*ksize.width+y+anX];
-
-#if ABF_GAUSSIAN
-                            weight_b = exp( -0.5f * currWRTCenter_b * currWRTCenter_b/ var_b ) * cur_spw;
-                            weight_g = exp( -0.5f * currWRTCenter_g * currWRTCenter_g/ var_g ) * cur_spw;
-                            weight_r = exp( -0.5f * currWRTCenter_r * currWRTCenter_r/ var_r ) * cur_spw;
-#else
-                            weight_b = var_b / ( var_b + (currWRTCenter_b * currWRTCenter_b) ) * cur_spw;
-                            weight_g = var_g / ( var_g + (currWRTCenter_g * currWRTCenter_g) ) * cur_spw;
-                            weight_r = var_r / ( var_r + (currWRTCenter_r * currWRTCenter_r) ) * cur_spw;
-#endif
-#endif
-                            tmpSum_b += ((float)tptr[cn*(y+anX)]   * weight_b);
-                            tmpSum_g += ((float)tptr[cn*(y+anX)+1] * weight_g);
-                            tmpSum_r += ((float)tptr[cn*(y+anX)+2] * weight_r);
-                            totalWeight_b += weight_b, totalWeight_g += weight_g, totalWeight_r += weight_r;
-                        }
-                    }
-                    tmpSum_b /= totalWeight_b;
-                    tmpSum_g /= totalWeight_g;
-                    tmpSum_r /= totalWeight_r;
-
-                    dest->at<uchar>(startY,j  )= static_cast<uchar>(tmpSum_b);
-                    dest->at<uchar>(startY,j+1)= static_cast<uchar>(tmpSum_g);
-                    dest->at<uchar>(startY,j+2)= static_cast<uchar>(tmpSum_r);
-                }
-            }
-        }
-    }
-private:
-    const Mat *temp;
-    Mat *dest;
-    Size ksize;
-    double sigma_space;
-    double maxSigma_Color;
-    Point anchor;
-    std::vector<float> space_weight;
-};
-static void adaptiveBilateralFilter_8u( const Mat& src, Mat& dst, Size ksize, double sigmaSpace, double maxSigmaColor, Point anchor, int borderType )
-{
-    Size size = src.size();
-
-    CV_Assert( (src.type() == CV_8UC1 || src.type() == CV_8UC3) &&
-              src.type() == dst.type() && src.size() == dst.size() &&
-              src.data != dst.data );
-    Mat temp;
-    copyMakeBorder(src, temp, anchor.x, anchor.y, anchor.x, anchor.y, borderType);
-
-    adaptiveBilateralFilter_8u_Invoker body(dst, temp, ksize, sigmaSpace, maxSigmaColor, anchor);
-    parallel_for_(Range(0, size.height), body, dst.total()/(double)(1<<16));
-}
-}
-void cv::adaptiveBilateralFilter( InputArray _src, OutputArray _dst, Size ksize,
-                                  double sigmaSpace, double maxSigmaColor, Point anchor, int borderType )
-{
-    Mat src = _src.getMat();
-    _dst.create(src.size(), src.type());
-    Mat dst = _dst.getMat();
-
-    CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8UC3);
-
-    anchor = normalizeAnchor(anchor,ksize);
-    if( src.depth() == CV_8U )
-        adaptiveBilateralFilter_8u( src, dst, ksize, sigmaSpace, maxSigmaColor, anchor, borderType );
-    else
-        CV_Error( CV_StsUnsupportedFormat,
-        "Adaptive Bilateral filtering is only implemented for 8u images" );
-}
-
 //////////////////////////////////////////////////////////////////////////////////////////

 CV_IMPL void
--- a/modules/imgproc/src/sumpixels.cpp
+++ b/modules/imgproc/src/sumpixels.cpp
@@ -231,6 +231,8 @@ typedef void (*IntegralFunc)(const uchar* src, size_t srcstep, uchar* sum, size_
                             uchar* sqsum, size_t sqsumstep, uchar* tilted, size_t tstep,
                             Size size, int cn );

+#ifdef HAVE_OPENCL
+
 enum { vlen = 4 };

 static bool ocl_integral( InputArray _src, OutputArray _sum, int sdepth )
@@ -324,6 +326,8 @@ static bool ocl_integral( InputArray _src, OutputArray _sum, OutputArray _sqsum,
    return k2.run(1, &gt2, &lt2, false);
 }

+#endif
+
 }


@@ -336,19 +340,17 @@ void cv::integral( InputArray _src, OutputArray _sum, OutputArray _sqsum, Output
         sqdepth = CV_64F;
    sdepth = CV_MAT_DEPTH(sdepth), sqdepth = CV_MAT_DEPTH(sqdepth);

+#ifdef HAVE_OPENCL
    if (ocl::useOpenCL() && _sum.isUMat() && !_tilted.needed())
    {
        if (!_sqsum.needed())
        {
-            if (ocl_integral(_src, _sum, sdepth))
-                return;
+            CV_OCL_RUN(ocl::useOpenCL(), ocl_integral(_src, _sum, sdepth))
        }
        else if (_sqsum.isUMat())
-        {
-            if (ocl_integral(_src, _sum, _sqsum, sdepth, sqdepth))
-                return;
-        }
+            CV_OCL_RUN(ocl::useOpenCL(), ocl_integral(_src, _sum, _sqsum, sdepth, sqdepth))
    }
+#endif

    Size ssize = _src.size(), isize(ssize.width + 1, ssize.height + 1);
    _sum.create( isize, CV_MAKETYPE(sdepth, cn) );
--- a/modules/imgproc/src/templmatch.cpp
+++ b/modules/imgproc/src/templmatch.cpp
@@ -40,10 +40,329 @@
 //M*/

 #include "precomp.hpp"
+#include "opencl_kernels.hpp"
+
+////////////////////////////////////////////////// matchTemplate //////////////////////////////////////////////////////////

 namespace cv
 {

+#ifdef HAVE_OPENCL
+
+static bool useNaive(int method, int depth, Size size)
+{
+#ifdef HAVE_CLAMDFFT
+    if (method == TM_SQDIFF && depth == CV_32F)
+        return true;
+    else if(method == TM_CCORR || (method == TM_SQDIFF && depth == CV_8U))
+        return size.height < 18 && size.width < 18;
+    else
+        return false;
+#else
+#define UNUSED(x) (void)(x);
+        UNUSED(method) UNUSED(depth) UNUSED(size)
+#undef  UNUSED
+        return true;
+#endif
+}
+
+/////////////////////////////////////////////////// CCORR //////////////////////////////////////////////////////////////
+
+static bool matchTemplateNaive_CCORR (InputArray _image, InputArray _templ, OutputArray _result, int cn)
+{
+    int type = _image.type();
+    int depth = CV_MAT_DEPTH(type);
+
+    const char * kernelName = "matchTemplate_Naive_CCORR";
+
+    ocl::Kernel k (kernelName, ocl::imgproc::match_template_oclsrc, format("-D type=%s -D elem_type=%s -D cn=%d",ocl::typeToStr(type), ocl::typeToStr(depth), cn));
+    if (k.empty())
+        return false;
+
+    UMat image = _image.getUMat();
+    UMat templ = _templ.getUMat(), result;
+    _result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
+    result = _result.getUMat();
+
+    size_t globalsize[2] = {result.cols, result.rows};
+
+    return k.args(ocl::KernelArg::ReadOnlyNoSize(image), ocl::KernelArg::ReadOnly(templ), ocl::KernelArg::WriteOnly(result)).run(2,globalsize,NULL,false);
+}
+
+static bool matchTemplate_CCORR_NORMED(InputArray _image, InputArray _templ, OutputArray _result)
+{
+    matchTemplate(_image, _templ, _result, CV_TM_CCORR);
+
+    int type = _image.type();
+    int depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
+
+    const char * kernelName = "matchTemplate_CCORR_NORMED";
+
+    ocl::Kernel k(kernelName, ocl::imgproc::match_template_oclsrc, format("-D type=%s -D elem_type=%s -D cn=%d",ocl::typeToStr(type), ocl::typeToStr(depth), cn));
+    if (k.empty())
+        return false;
+
+    UMat image = _image.getUMat();
+    UMat templ = _templ.getUMat(), result;
+    _result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
+    result = _result.getUMat();
+
+    UMat image_sums, image_sqsums;
+    integral(image.reshape(1), image_sums, image_sqsums, CV_32F, CV_32F);
+
+    UMat templ_resh, temp;
+    templ.reshape(1).convertTo(templ_resh, CV_32F);
+
+    multiply(templ_resh, templ_resh, temp);
+    unsigned long long templ_sqsum = (unsigned long long)sum(temp)[0];
+
+    size_t globalsize[2] = {result.cols, result.rows};
+
+    return k.args(ocl::KernelArg::ReadOnlyNoSize(image_sqsums), ocl::KernelArg::WriteOnly(result), templ.rows, templ.cols, templ_sqsum).run(2,globalsize,NULL,false);
+}
+
+static bool matchTemplate_CCORR(InputArray _image, InputArray _templ, OutputArray _result)
+{
+    if (useNaive(TM_CCORR, _image.depth(), _templ.size())  )
+        return matchTemplateNaive_CCORR(_image, _templ, _result, _image.channels());
+    else
+        return false;
+}
+
+////////////////////////////////////// SQDIFF //////////////////////////////////////////////////////////////
+
+static bool matchTemplateNaive_SQDIFF(InputArray _image, InputArray _templ, OutputArray _result, int cn)
+{
+    int type = _image.type();
+    int depth = CV_MAT_DEPTH(type);
+
+    const char * kernelName = "matchTemplate_Naive_SQDIFF";
+
+    ocl::Kernel k (kernelName, ocl::imgproc::match_template_oclsrc, format("-D type=%s -D elem_type=%s -D cn=%d",ocl::typeToStr(type), ocl::typeToStr(depth), cn));
+    if (k.empty())
+        return false;
+
+    UMat image = _image.getUMat();
+    UMat templ = _templ.getUMat(), result;
+    _result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
+    result = _result.getUMat();
+
+    size_t globalsize[2] = {result.cols, result.rows};
+
+    return k.args(ocl::KernelArg::ReadOnlyNoSize(image), ocl::KernelArg::ReadOnly(templ), ocl::KernelArg::WriteOnly(result)).run(2,globalsize,NULL,false);
+}
+
+static bool matchTemplate_SQDIFF_NORMED (InputArray _image, InputArray _templ, OutputArray _result)
+{
+    matchTemplate(_image, _templ, _result, CV_TM_CCORR);
+
+    int type = _image.type();
+    int depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
+
+    const char * kernelName = "matchTemplate_SQDIFF_NORMED";
+
+    ocl::Kernel k(kernelName, ocl::imgproc::match_template_oclsrc, format("-D type=%s -D elem_type=%s -D cn=%d",ocl::typeToStr(type), ocl::typeToStr(depth), cn));
+    if (k.empty())
+        return false;
+
+    UMat image = _image.getUMat();
+    UMat templ = _templ.getUMat(), result;
+    _result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
+    result = _result.getUMat();
+
+    UMat image_sums, image_sqsums;
+    integral(image.reshape(1), image_sums, image_sqsums, CV_32F, CV_32F);
+
+    UMat temp, templ_resh;
+    templ.reshape(1).convertTo(templ_resh, CV_32F);
+
+    multiply(templ_resh, templ_resh, temp);
+    unsigned long long templ_sqsum = (unsigned long long)sum(temp)[0];
+
+    size_t globalsize[2] = {result.cols, result.rows};
+
+    return k.args(ocl::KernelArg::ReadOnlyNoSize(image_sqsums), ocl::KernelArg::WriteOnly(result), templ.rows, templ.cols, templ_sqsum).run(2,globalsize,NULL,false);
+}
+
+static bool matchTemplate_SQDIFF(InputArray _image, InputArray _templ, OutputArray _result)
+{
+    if (useNaive(TM_SQDIFF, _image.depth(), _templ.size()))
+        return matchTemplateNaive_SQDIFF(_image, _templ, _result, _image.channels());
+    else
+        return false;
+}
+
+///////////////////////////////////// CCOEFF /////////////////////////////////////////////////////////////////
+
+static bool matchTemplate_CCOEFF(InputArray _image, InputArray _templ, OutputArray _result)
+{
+    matchTemplate(_image, _templ, _result, CV_TM_CCORR);
+
+    UMat image_sums;
+    integral(_image, image_sums);
+
+    int type = image_sums.type();
+    int depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
+
+    const char * kernelName;
+
+    if (cn==1)
+        kernelName = "matchTemplate_Prepared_CCOEFF_C1";
+    else if (cn==2)
+        kernelName = "matchTemplate_Prepared_CCOEFF_C2";
+    else
+        kernelName = "matchTemplate_Prepared_CCOEFF_C4";
+
+    ocl::Kernel k(kernelName, ocl::imgproc::match_template_oclsrc, format("-D type=%s -D elem_type=%s -D cn=%d",ocl::typeToStr(type), ocl::typeToStr(depth), cn));
+    if (k.empty())
+        return false;
+
+    UMat templ = _templ.getUMat(), result;
+    Size size = _image.size();
+    _result.create(size.height - templ.rows + 1, size.width - templ.cols + 1, CV_32F);
+    result = _result.getUMat();
+
+    size_t globalsize[2] = {result.cols, result.rows};
+
+    if (cn==1)
+    {
+        float templ_sum = (float)sum(_templ)[0]/ _templ.size().area();
+        return k.args(ocl::KernelArg::ReadOnlyNoSize(image_sums), ocl::KernelArg::WriteOnly(result), templ.rows, templ.cols, templ_sum).run(2,globalsize,NULL,false);
+    }
+    else
+    {
+        Vec4f templ_sum = Vec4f::all(0);
+        templ_sum = sum(templ)/ templ.size().area();
+        if (cn==2)
+            return k.args(ocl::KernelArg::ReadOnlyNoSize(image_sums), ocl::KernelArg::WriteOnly(result), templ.rows, templ.cols,
+            templ_sum[0],templ_sum[1]).run(2,globalsize,NULL,false);
+
+        return k.args(ocl::KernelArg::ReadOnlyNoSize(image_sums), ocl::KernelArg::WriteOnly(result), templ.rows, templ.cols,
+        templ_sum[0],templ_sum[1],templ_sum[2],templ_sum[3]).run(2,globalsize,NULL,false);
+    }
+}
+
+static bool matchTemplate_CCOEFF_NORMED(InputArray _image, InputArray _templ, OutputArray _result)
+{
+    UMat imagef, templf;
+
+    _image.getUMat().convertTo(imagef, CV_32F);
+    _templ.getUMat().convertTo(templf, CV_32F);
+
+    matchTemplate(imagef, templf, _result, CV_TM_CCORR);
+
+    const char * kernelName;
+
+    UMat temp, image_sums, image_sqsums;
+    integral(_image,image_sums, image_sqsums, CV_32F, CV_32F);
+
+    int type = image_sums.type();
+    int depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
+
+    if (cn== 1)
+        kernelName = "matchTemplate_CCOEFF_NORMED_C1";
+    else if (cn==2)
+        kernelName = "matchTemplate_CCOEFF_NORMED_C2";
+    else
+        kernelName = "matchTemplate_CCOEFF_NORMED_C4";
+
+    ocl::Kernel k(kernelName, ocl::imgproc::match_template_oclsrc,
+        format("-D type=%s -D elem_type=%s -D cn=%d", ocl::typeToStr(type), ocl::typeToStr(depth), cn));
+    if (k.empty())
+        return false;
+
+    UMat image = _image.getUMat();
+    UMat templ = _templ.getUMat(), result;
+    int image_rows = _image.size().height, image_cols = _image.size().width;
+    _result.create(image_rows - templ.rows + 1, image_cols - templ.cols + 1, CV_32F);
+    result = _result.getUMat();
+
+    size_t globalsize[2] = {result.cols, result.rows};
+
+    float scale = 1.f / templ.size().area();
+
+    if (cn==1)
+    {
+        float templ_sum = (float)sum(templ)[0];
+
+        multiply(templf, templf, temp);
+        float templ_sqsum = (float)sum(temp)[0];
+
+        templ_sqsum -= scale * templ_sum * templ_sum;
+        templ_sum   *= scale;
+
+        if (templ_sqsum < DBL_EPSILON)
+        {
+            result = Scalar::all(1);
+            return true;
+        }
+
+        return k.args(ocl::KernelArg::ReadOnlyNoSize(image_sums),ocl::KernelArg::ReadOnlyNoSize(image_sqsums),
+                      ocl::KernelArg::WriteOnly(result), templ.rows, templ.cols, scale, templ_sum, templ_sqsum)
+                     .run(2,globalsize,NULL,false);
+    }
+    else
+    {
+        Vec4f templ_sum = Vec4f::all(0);
+        Vec4f templ_sqsum = Vec4f::all(0);
+
+        templ_sum = sum(templ);
+
+        multiply(templf, templf, temp);
+        templ_sqsum = sum(temp);
+
+        float templ_sqsum_sum = 0;
+            for(int i = 0; i < cn; i ++)
+            {
+                templ_sqsum_sum += templ_sqsum[i] - scale * templ_sum[i] * templ_sum[i];
+            }
+
+        templ_sum   *= scale;
+
+        if (templ_sqsum_sum < DBL_EPSILON)
+        {
+            result = Scalar::all(1);
+            return true;
+        }
+
+        if (cn==2)
+             return k.args(ocl::KernelArg::ReadOnlyNoSize(image_sums), ocl::KernelArg::ReadOnlyNoSize(image_sqsums),
+                           ocl::KernelArg::WriteOnly(result), templ.rows, templ.cols, scale,
+                           templ_sum[0],templ_sum[1], templ_sqsum_sum)
+                           .run(2,globalsize,NULL,false);
+
+        return k.args(ocl::KernelArg::ReadOnlyNoSize(image_sums), ocl::KernelArg::ReadOnlyNoSize(image_sqsums),
+                      ocl::KernelArg::WriteOnly(result), templ.rows, templ.cols, scale,
+                      templ_sum[0],templ_sum[1],templ_sum[2],templ_sum[3], templ_sqsum_sum)
+                     .run(2,globalsize,NULL,false);
+    }
+
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+static bool ocl_matchTemplate( InputArray _img, InputArray _templ, OutputArray _result, int method)
+{
+    int cn = CV_MAT_CN(_img.type());
+
+    if (cn == 3 || cn > 4)
+        return false;
+
+    typedef bool (*Caller)(InputArray _img, InputArray _templ, OutputArray _result);
+
+    const Caller callers[] =
+    {
+        matchTemplate_SQDIFF, matchTemplate_SQDIFF_NORMED, matchTemplate_CCORR,
+        matchTemplate_CCORR_NORMED, matchTemplate_CCOEFF, matchTemplate_CCOEFF_NORMED
+    };
+
+    Caller caller = callers[method];
+
+    return caller(_img, _templ, _result);
+}
+
+#endif
+
 void crossCorr( const Mat& img, const Mat& _templ, Mat& corr,
                Size corrsize, int ctype,
                Point anchor, double delta, int borderType )
@@ -226,14 +545,24 @@ void crossCorr( const Mat& img, const Mat& _templ, Mat& corr,
        }
    }
 }
-
 }

-/*****************************************************************************************/
+////////////////////////////////////////////////////////////////////////////////////////////////////////

 void cv::matchTemplate( InputArray _img, InputArray _templ, OutputArray _result, int method )
 {
    CV_Assert( CV_TM_SQDIFF <= method && method <= CV_TM_CCOEFF_NORMED );
+    CV_Assert( (_img.depth() == CV_8U || _img.depth() == CV_32F) && _img.type() == _templ.type() );
+    CV_Assert(_img.dims() <= 2);
+
+    bool swapNotNeed = (_img.size().height >= _templ.size().height && _img.size().width >= _templ.size().width);
+    if (!swapNotNeed)
+    {
+        CV_Assert(_img.size().height <= _templ.size().height && _img.size().width <= _templ.size().width);
+    }
+
+    CV_OCL_RUN(_img.dims() <= 2 && _result.isUMat(),
+               (swapNotNeed ? ocl_matchTemplate(_img,_templ,_result,method) : ocl_matchTemplate(_templ,_img,_result,method)))

    int numType = method == CV_TM_CCORR || method == CV_TM_CCORR_NORMED ? 0 :
                  method == CV_TM_CCOEFF || method == CV_TM_CCOEFF_NORMED ? 1 : 2;
@@ -242,14 +571,9 @@ void cv::matchTemplate( InputArray _img, InputArray _templ, OutputArray _result,
                    method == CV_TM_CCOEFF_NORMED;

    Mat img = _img.getMat(), templ = _templ.getMat();
-    if( img.rows < templ.rows || img.cols < templ.cols )
+    if(!swapNotNeed )
        std::swap(img, templ);

-    CV_Assert( (img.depth() == CV_8U || img.depth() == CV_32F) &&
-               img.type() == templ.type() );
-
-    CV_Assert( img.rows >= templ.rows && img.cols >= templ.cols);
-
    Size corrSize(img.cols - templ.cols + 1, img.rows - templ.rows + 1);
    _result.create(corrSize, CV_32F);
    Mat result = _result.getMat();
--- a/modules/imgproc/src/thresh.cpp
+++ b/modules/imgproc/src/thresh.cpp
@@ -706,6 +706,8 @@ private:
    int thresholdType;
 };

+#ifdef HAVE_OPENCL
+
 static bool ocl_threshold( InputArray _src, OutputArray _dst, double & thresh, double maxval, int thresh_type )
 {
    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), ktype = CV_MAKE_TYPE(depth, 1);
@@ -739,13 +741,14 @@ static bool ocl_threshold( InputArray _src, OutputArray _dst, double & thresh, d
    return k.run(2, globalsize, NULL, false);
 }

+#endif
+
 }

 double cv::threshold( InputArray _src, OutputArray _dst, double thresh, double maxval, int type )
 {
-    if (ocl::useOpenCL() && _src.dims() <= 2 && _dst.isUMat() &&
-            ocl_threshold(_src, _dst, thresh, maxval, type))
-        return thresh;
+    CV_OCL_RUN_(_src.dims() <= 2 && _dst.isUMat(),
+                ocl_threshold(_src, _dst, thresh, maxval, type), thresh)

    Mat src = _src.getMat();
    bool use_otsu = (type & THRESH_OTSU) != 0;
--- a/modules/imgproc/test/ocl/test_accumulate.cpp
+++ b/modules/imgproc/test/ocl/test_accumulate.cpp
@@ -0,0 +1,240 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Nathan, liujun@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "test_precomp.hpp"
+#include "cvconfig.h"
+#include "opencv2/ts/ocl_test.hpp"
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest {
+namespace ocl {
+
+PARAM_TEST_CASE(AccumulateBase, std::pair<MatDepth, MatDepth>, Channels, bool)
+{
+    int sdepth, ddepth, channels;
+    bool useRoi;
+    double alpha;
+
+    TEST_DECLARE_INPUT_PARAMETER(src)
+    TEST_DECLARE_INPUT_PARAMETER(mask)
+    TEST_DECLARE_INPUT_PARAMETER(src2)
+    TEST_DECLARE_OUTPUT_PARAMETER(dst)
+
+    virtual void SetUp()
+    {
+        const std::pair<MatDepth, MatDepth> depths = GET_PARAM(0);
+        sdepth = depths.first, ddepth = depths.second;
+        channels = GET_PARAM(1);
+        useRoi = GET_PARAM(2);
+    }
+
+    void random_roi()
+    {
+        const int stype = CV_MAKE_TYPE(sdepth, channels),
+                dtype = CV_MAKE_TYPE(ddepth, channels);
+
+        Size roiSize = randomSize(1, 10);
+        Border srcBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
+        randomSubMat(src, src_roi, roiSize, srcBorder, stype, -MAX_VALUE, MAX_VALUE);
+
+        Border maskBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
+        randomSubMat(mask, mask_roi, roiSize, maskBorder, CV_8UC1, -MAX_VALUE, MAX_VALUE);
+        threshold(mask, mask, 80, 255, THRESH_BINARY);
+
+        Border src2Border = randomBorder(0, useRoi ? MAX_VALUE : 0);
+        randomSubMat(src2, src2_roi, roiSize, src2Border, stype, -MAX_VALUE, MAX_VALUE);
+
+        Border dstBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
+        randomSubMat(dst, dst_roi, roiSize, dstBorder, dtype, -MAX_VALUE, MAX_VALUE);
+
+        UMAT_UPLOAD_INPUT_PARAMETER(src)
+        UMAT_UPLOAD_INPUT_PARAMETER(mask)
+        UMAT_UPLOAD_INPUT_PARAMETER(src2)
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst)
+
+        alpha = randomDouble(-5, 5);
+    }
+};
+
+/////////////////////////////////// Accumulate ///////////////////////////////////
+
+typedef AccumulateBase Accumulate;
+
+OCL_TEST_P(Accumulate, Mat)
+{
+    for (int i = 0; i < test_loop_times; ++i)
+    {
+        random_roi();
+
+        OCL_OFF(cv::accumulate(src_roi, dst_roi));
+        OCL_ON(cv::accumulate(usrc_roi, udst_roi));
+
+        OCL_EXPECT_MATS_NEAR(dst, 1e-6);
+    }
+}
+
+OCL_TEST_P(Accumulate, Mask)
+{
+    for (int i = 0; i < test_loop_times; ++i)
+    {
+        random_roi();
+
+        OCL_OFF(cv::accumulate(src_roi, dst_roi, mask_roi));
+        OCL_ON(cv::accumulate(usrc_roi, udst_roi, umask_roi));
+
+        OCL_EXPECT_MATS_NEAR(dst, 1e-6);
+    }
+}
+
+/////////////////////////////////// AccumulateSquare ///////////////////////////////////
+
+typedef AccumulateBase AccumulateSquare;
+
+OCL_TEST_P(AccumulateSquare, Mat)
+{
+    for (int i = 0; i < test_loop_times; ++i)
+    {
+        random_roi();
+
+        OCL_OFF(cv::accumulateSquare(src_roi, dst_roi));
+        OCL_ON(cv::accumulateSquare(usrc_roi, udst_roi));
+
+        OCL_EXPECT_MATS_NEAR(dst, 1e-2);
+    }
+}
+
+OCL_TEST_P(AccumulateSquare, Mask)
+{
+    for (int i = 0; i < test_loop_times; ++i)
+    {
+        random_roi();
+
+        OCL_OFF(cv::accumulateSquare(src_roi, dst_roi, mask_roi));
+        OCL_ON(cv::accumulateSquare(usrc_roi, udst_roi, umask_roi));
+
+        OCL_EXPECT_MATS_NEAR(dst, 1e-2);
+    }
+}
+
+/////////////////////////////////// AccumulateProduct ///////////////////////////////////
+
+typedef AccumulateBase AccumulateProduct;
+
+OCL_TEST_P(AccumulateProduct, Mat)
+{
+    for (int i = 0; i < test_loop_times; ++i)
+    {
+        random_roi();
+
+        OCL_OFF(cv::accumulateProduct(src_roi, src2_roi, dst_roi));
+        OCL_ON(cv::accumulateProduct(usrc_roi, usrc2_roi, udst_roi));
+
+        OCL_EXPECT_MATS_NEAR(dst, 1e-2);
+    }
+}
+
+OCL_TEST_P(AccumulateProduct, Mask)
+{
+    for (int i = 0; i < test_loop_times; ++i)
+    {
+        random_roi();
+
+        OCL_OFF(cv::accumulateProduct(src_roi, src2_roi, dst_roi, mask_roi));
+        OCL_ON(cv::accumulateProduct(usrc_roi, usrc2_roi, udst_roi, umask_roi));
+
+        OCL_EXPECT_MATS_NEAR(dst, 1e-2);
+    }
+}
+
+/////////////////////////////////// AccumulateWeighted ///////////////////////////////////
+
+typedef AccumulateBase AccumulateWeighted;
+
+OCL_TEST_P(AccumulateWeighted, Mat)
+{
+    for (int i = 0; i < test_loop_times; ++i)
+    {
+        random_roi();
+
+        OCL_OFF(cv::accumulateWeighted(src_roi, dst_roi, alpha));
+        OCL_ON(cv::accumulateWeighted(usrc_roi, udst_roi, alpha));
+
+        OCL_EXPECT_MATS_NEAR(dst, 1e-2);
+    }
+}
+
+OCL_TEST_P(AccumulateWeighted, Mask)
+{
+    for (int i = 0; i < test_loop_times; ++i)
+    {
+        random_roi();
+
+        OCL_OFF(cv::accumulateWeighted(src_roi, dst_roi, alpha));
+        OCL_ON(cv::accumulateWeighted(usrc_roi, udst_roi, alpha));
+
+        OCL_EXPECT_MATS_NEAR(dst, 1e-2);
+    }
+}
+
+/////////////////////////////////// Instantiation ///////////////////////////////////
+
+#define OCL_DEPTH_ALL_COMBINATIONS \
+    testing::Values(std::make_pair<MatDepth, MatDepth>(CV_8U, CV_32F), \
+    std::make_pair<MatDepth, MatDepth>(CV_16U, CV_32F), \
+    std::make_pair<MatDepth, MatDepth>(CV_32F, CV_32F), \
+    std::make_pair<MatDepth, MatDepth>(CV_8U, CV_64F), \
+    std::make_pair<MatDepth, MatDepth>(CV_16U, CV_64F), \
+    std::make_pair<MatDepth, MatDepth>(CV_32F, CV_64F), \
+    std::make_pair<MatDepth, MatDepth>(CV_64F, CV_64F))
+
+OCL_INSTANTIATE_TEST_CASE_P(ImgProc, Accumulate, Combine(OCL_DEPTH_ALL_COMBINATIONS, OCL_ALL_CHANNELS, Bool()));
+OCL_INSTANTIATE_TEST_CASE_P(ImgProc, AccumulateSquare, Combine(OCL_DEPTH_ALL_COMBINATIONS, OCL_ALL_CHANNELS, Bool()));
+OCL_INSTANTIATE_TEST_CASE_P(ImgProc, AccumulateProduct, Combine(OCL_DEPTH_ALL_COMBINATIONS, OCL_ALL_CHANNELS, Bool()));
+OCL_INSTANTIATE_TEST_CASE_P(ImgProc, AccumulateWeighted, Combine(OCL_DEPTH_ALL_COMBINATIONS, OCL_ALL_CHANNELS, Bool()));
+
+} } // namespace cvtest::ocl
+
+#endif
--- a/modules/imgproc/test/ocl/test_blend.cpp
+++ b/modules/imgproc/test/ocl/test_blend.cpp
@@ -75,7 +75,7 @@ PARAM_TEST_CASE(BlendLinear, MatDepth, Channels, bool)
        const int type = CV_MAKE_TYPE(depth, channels);
        const double upValue = 256;

-        Size roiSize = randomSize(1, 20);
+        Size roiSize = randomSize(1, MAX_VALUE);
        Border src1Border = randomBorder(0, useRoi ? MAX_VALUE : 0);
        randomSubMat(src1, src1_roi, roiSize, src1Border, type, -upValue, upValue);

@@ -104,8 +104,7 @@ PARAM_TEST_CASE(BlendLinear, MatDepth, Channels, bool)

    void Near(double eps = 0.0)
    {
-        EXPECT_MAT_NEAR(dst, udst, eps);
-        EXPECT_MAT_NEAR(dst_roi, udst_roi, eps);
+        OCL_EXPECT_MATS_NEAR(dst, eps)
    }
 };

--- a/modules/imgproc/test/ocl/test_boxfilter.cpp
+++ b/modules/imgproc/test/ocl/test_boxfilter.cpp
@@ -56,32 +56,34 @@ enum

 /////////////////////////////////////////////////////////////////////////////////////////////////
 // boxFilter
-PARAM_TEST_CASE(BoxFilter, MatDepth, Channels, BorderType, bool)
+PARAM_TEST_CASE(BoxFilter, MatDepth, Channels, BorderType, bool, bool)
 {
    static const int kernelMinSize = 2;
    static const int kernelMaxSize = 10;

-    int type;
+    int depth, cn;
    Size ksize;
    Size dsize;
    Point anchor;
    int borderType;
-    bool useRoi;
+    bool normalize, useRoi;

    TEST_DECLARE_INPUT_PARAMETER(src)
    TEST_DECLARE_OUTPUT_PARAMETER(dst)

    virtual void SetUp()
    {
-        type = CV_MAKE_TYPE(GET_PARAM(0), GET_PARAM(1));
+        depth = GET_PARAM(0);
+        cn = GET_PARAM(1);
        borderType = GET_PARAM(2); // only not isolated border tested, because CPU module doesn't support isolated border case.
-        useRoi = GET_PARAM(3);
+        normalize = GET_PARAM(3);
+        useRoi = GET_PARAM(4);
    }

    void random_roi()
    {
+        int type = CV_MAKE_TYPE(depth, cn);
        dsize = randomSize(1, MAX_VALUE);
-
        ksize = randomSize(kernelMinSize, kernelMaxSize);

        Size roiSize = randomSize(ksize.width, MAX_VALUE, ksize.height, MAX_VALUE);
@@ -100,8 +102,7 @@ PARAM_TEST_CASE(BoxFilter, MatDepth, Channels, BorderType, bool)

    void Near(double threshold = 0.0)
    {
-        EXPECT_MAT_NEAR(dst, udst, threshold);
-        EXPECT_MAT_NEAR(dst_roi, udst_roi, threshold);
+        OCL_EXPECT_MATS_NEAR(dst, threshold)
    }
 };

@@ -111,10 +112,10 @@ OCL_TEST_P(BoxFilter, Mat)
    {
        random_roi();

-        OCL_OFF(cv::boxFilter(src_roi, dst_roi, -1, ksize, anchor, true, borderType));
-        OCL_ON(cv::boxFilter(usrc_roi, udst_roi, -1, ksize, anchor, true, borderType));
+        OCL_OFF(cv::boxFilter(src_roi, dst_roi, -1, ksize, anchor, normalize, borderType));
+        OCL_ON(cv::boxFilter(usrc_roi, udst_roi, -1, ksize, anchor, normalize, borderType));

-        Near(1.0);
+        Near(depth <= CV_32S ? 1 : 1e-3);
    }
 }

@@ -127,6 +128,7 @@ OCL_INSTANTIATE_TEST_CASE_P(ImageProc, BoxFilter,
                                       (BorderType)BORDER_REPLICATE,
                                       (BorderType)BORDER_REFLECT,
                                       (BorderType)BORDER_REFLECT_101),
+                                Bool(),
                                Bool()  // ROI
                                )
                           );
--- a/modules/imgproc/test/ocl/test_canny.cpp
+++ b/modules/imgproc/test/ocl/test_canny.cpp
@@ -0,0 +1,117 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Peng Xiao, pengxiao@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "test_precomp.hpp"
+#include "opencv2/ts/ocl_test.hpp"
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest {
+namespace ocl {
+
+////////////////////////////////////////////////////////
+// Canny
+
+IMPLEMENT_PARAM_CLASS(AppertureSize, int)
+IMPLEMENT_PARAM_CLASS(L2gradient, bool)
+IMPLEMENT_PARAM_CLASS(UseRoi, bool)
+
+PARAM_TEST_CASE(Canny, AppertureSize, L2gradient, UseRoi)
+{
+    int apperture_size;
+    bool useL2gradient, use_roi;
+
+    TEST_DECLARE_INPUT_PARAMETER(src)
+    TEST_DECLARE_OUTPUT_PARAMETER(dst)
+
+    virtual void SetUp()
+    {
+        apperture_size = GET_PARAM(0);
+        useL2gradient = GET_PARAM(1);
+        use_roi = GET_PARAM(2);
+    }
+
+    void generateTestData()
+    {
+        Mat img = readImage("shared/fruits.png", IMREAD_GRAYSCALE);
+        ASSERT_FALSE(img.empty()) << "cann't load shared/fruits.png";
+
+        Size roiSize = img.size();
+        int type = img.type();
+        ASSERT_EQ(CV_8UC1, type);
+
+        Border srcBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
+        randomSubMat(src, src_roi, roiSize, srcBorder, type, 2, 100);
+        img.copyTo(src_roi);
+
+        Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
+        randomSubMat(dst, dst_roi, roiSize, dstBorder, type, 5, 16);
+
+        UMAT_UPLOAD_INPUT_PARAMETER(src)
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst)
+    }
+};
+
+OCL_TEST_P(Canny, Accuracy)
+{
+    generateTestData();
+
+    const double low_thresh = 50.0, high_thresh = 100.0;
+
+    OCL_OFF(cv::Canny(src_roi, dst_roi, low_thresh, high_thresh, apperture_size, useL2gradient));
+    OCL_ON(cv::Canny(usrc_roi, udst_roi, low_thresh, high_thresh, apperture_size, useL2gradient));
+
+    EXPECT_MAT_SIMILAR(dst_roi, udst_roi, 1e-2);
+    EXPECT_MAT_SIMILAR(dst, udst, 1e-2);
+}
+
+OCL_INSTANTIATE_TEST_CASE_P(ImgProc, Canny, testing::Combine(
+                                testing::Values(AppertureSize(3), AppertureSize(5)),
+                                testing::Values(L2gradient(false), L2gradient(true)),
+                                testing::Values(UseRoi(false), UseRoi(true))));
+
+} } // namespace cvtest::ocl
+
+#endif // HAVE_OPENCL
--- a/modules/imgproc/test/ocl/test_gftt.cpp
+++ b/modules/imgproc/test/ocl/test_gftt.cpp
@@ -0,0 +1,143 @@
+///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "test_precomp.hpp"
+#include "opencv2/ts/ocl_test.hpp"
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest {
+namespace ocl {
+
+//////////////////////////// GoodFeaturesToTrack //////////////////////////
+
+
+PARAM_TEST_CASE(GoodFeaturesToTrack, double, bool)
+{
+    double minDistance;
+    bool useRoi;
+
+    static const int maxCorners;
+    static const double qualityLevel;
+
+    TEST_DECLARE_INPUT_PARAMETER(src)
+    UMat points, upoints;
+
+    virtual void SetUp()
+    {
+        minDistance = GET_PARAM(0);
+        useRoi = GET_PARAM(1);
+    }
+
+    void generateTestData()
+    {
+        Mat frame = readImage("../gpu/opticalflow/rubberwhale1.png", IMREAD_GRAYSCALE);
+        ASSERT_FALSE(frame.empty()) << "could not load gpu/opticalflow/rubberwhale1.png";
+
+        Size roiSize = frame.size();
+        Border srcBorder = randomBorder(0, useRoi ? 2 : 0);
+        randomSubMat(src, src_roi, roiSize, srcBorder, frame.type(), 5, 256);
+        src_roi.copyTo(frame);
+
+        UMAT_UPLOAD_INPUT_PARAMETER(src)
+    }
+
+    void UMatToVector(const UMat & um, std::vector<Point2f> & v) const
+    {
+        v.resize(points.cols);
+        um.getMat(ACCESS_READ).copyTo(v);
+    }
+};
+
+const int GoodFeaturesToTrack::maxCorners = 1000;
+const double GoodFeaturesToTrack::qualityLevel = 0.01;
+
+OCL_TEST_P(GoodFeaturesToTrack, Accuracy)
+{
+    for (int j = 0; j < test_loop_times; ++j)
+    {
+        generateTestData();
+
+        std::vector<Point2f> upts, pts;
+
+        OCL_OFF(cv::goodFeaturesToTrack(src_roi, points, maxCorners, qualityLevel, minDistance, noArray()));
+        ASSERT_FALSE(points.empty());
+        UMatToVector(points, pts);
+
+        OCL_ON(cv::goodFeaturesToTrack(usrc_roi, upoints, maxCorners, qualityLevel, minDistance));
+        ASSERT_FALSE(upoints.empty());
+        UMatToVector(upoints, upts);
+
+        ASSERT_EQ(upts.size(), pts.size());
+
+        int mistmatch = 0;
+        for (size_t i = 0; i < pts.size(); ++i)
+        {
+            Point2i a = upts[i], b = pts[i];
+            bool eq = std::abs(a.x - b.x) < 1 && std::abs(a.y - b.y) < 1;
+
+            if (!eq)
+                ++mistmatch;
+        }
+
+        double bad_ratio = static_cast<double>(mistmatch) / pts.size();
+        ASSERT_GE(1e-2, bad_ratio);
+    }
+}
+
+OCL_TEST_P(GoodFeaturesToTrack, EmptyCorners)
+{
+    generateTestData();
+    usrc_roi.setTo(Scalar::all(0));
+
+    OCL_ON(cv::goodFeaturesToTrack(usrc_roi, upoints, maxCorners, qualityLevel, minDistance));
+
+    ASSERT_TRUE(upoints.empty());
+}
+
+OCL_INSTANTIATE_TEST_CASE_P(Imgproc, GoodFeaturesToTrack,
+                            ::testing::Combine(testing::Values(0.0, 3.0), Bool()));
+
+} } // namespace cvtest::ocl
+
+#endif
--- a/modules/imgproc/test/ocl/test_histogram.cpp
+++ b/modules/imgproc/test/ocl/test_histogram.cpp
@@ -144,11 +144,6 @@ PARAM_TEST_CASE(CalcBackProject, MatDepth, int, bool)

        scale = randomDouble(0.1, 1);
    }
-
-    void Near()
-    {
-        OCL_EXPECT_MATS_NEAR(dst, 0.0)
-    }
 };

 //////////////////////////////// CalcBackProject //////////////////////////////////////////////
@@ -162,13 +157,62 @@ OCL_TEST_P(CalcBackProject, Mat)
        OCL_OFF(cv::calcBackProject(images_roi, channels, hist_roi, dst_roi, ranges, scale));
        OCL_ON(cv::calcBackProject(uimages_roi, channels, uhist_roi, udst_roi, ranges, scale));

-        Near();
+        OCL_EXPECT_MATS_NEAR(dst, 0.0)
+    }
+}
+
+//////////////////////////////// CalcHist //////////////////////////////////////////////
+
+PARAM_TEST_CASE(CalcHist, bool)
+{
+    bool useRoi;
+
+    TEST_DECLARE_INPUT_PARAMETER(src)
+    TEST_DECLARE_OUTPUT_PARAMETER(hist)
+
+    virtual void SetUp()
+    {
+        useRoi = GET_PARAM(0);
+    }
+
+    virtual void random_roi()
+    {
+        Size roiSize = randomSize(1, MAX_VALUE);
+
+        Border srcBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
+        randomSubMat(src, src_roi, roiSize, srcBorder, CV_8UC1, 0, 256);
+
+        Border histBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
+        randomSubMat(hist, hist_roi, Size(1, 256), histBorder, CV_32SC1, 0, MAX_VALUE);
+
+        UMAT_UPLOAD_INPUT_PARAMETER(src)
+        UMAT_UPLOAD_OUTPUT_PARAMETER(hist)
+    }
+};
+
+OCL_TEST_P(CalcHist, Mat)
+{
+    const std::vector<int> channels(1, 0);
+    std::vector<float> ranges(2);
+    std::vector<int> histSize(1, 256);
+    ranges[0] = 0;
+    ranges[1] = 256;
+
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        random_roi();
+
+        OCL_OFF(cv::calcHist(std::vector<Mat>(1, src_roi), channels, noArray(), hist_roi, histSize, ranges, false));
+        OCL_ON(cv::calcHist(std::vector<UMat>(1, usrc_roi), channels, noArray(), uhist_roi, histSize, ranges, false));
+
+        OCL_EXPECT_MATS_NEAR(hist, 0.0)
    }
 }

 /////////////////////////////////////////////////////////////////////////////////////

 OCL_INSTANTIATE_TEST_CASE_P(Imgproc, CalcBackProject, Combine(Values((MatDepth)CV_8U), Values(1, 2), Bool()));
+OCL_INSTANTIATE_TEST_CASE_P(Imgproc, CalcHist, Values(true, false));

 } } // namespace cvtest::ocl

--- a/modules/imgproc/test/ocl/test_imgproc.cpp
+++ b/modules/imgproc/test/ocl/test_imgproc.cpp
@@ -103,7 +103,7 @@ PARAM_TEST_CASE(ImgprocTestBase, MatType,
    }
 };

-////////////////////////////////copyMakeBorder////////////////////////////////////////////
+//////////////////////////////// copyMakeBorder ////////////////////////////////////////////

 PARAM_TEST_CASE(CopyMakeBorder, MatDepth, // depth
                Channels, // channels
@@ -171,7 +171,7 @@ OCL_TEST_P(CopyMakeBorder, Mat)
    }
 }

-////////////////////////////////equalizeHist//////////////////////////////////////////////
+//////////////////////////////// equalizeHist //////////////////////////////////////////////

 typedef ImgprocTestBase EqualizeHist;

@@ -188,14 +188,14 @@ OCL_TEST_P(EqualizeHist, Mat)
    }
 }

-////////////////////////////////cornerMinEigenVal//////////////////////////////////////////
+//////////////////////////////// Corners test //////////////////////////////////////////

 struct CornerTestBase :
        public ImgprocTestBase
 {
    virtual void random_roi()
    {
-        Mat image = readImageType("gpu/stereobm/aloe-L.png", type);
+        Mat image = readImageType("../gpu/stereobm/aloe-L.png", type);
        ASSERT_FALSE(image.empty());

        bool isFP = CV_MAT_DEPTH(type) >= CV_32F;
@@ -224,7 +224,7 @@ struct CornerTestBase :

 typedef CornerTestBase CornerMinEigenVal;

-OCL_TEST_P(CornerMinEigenVal, DISABLED_Mat)
+OCL_TEST_P(CornerMinEigenVal, Mat)
 {
    for (int j = 0; j < test_loop_times; j++)
    {
@@ -239,11 +239,11 @@ OCL_TEST_P(CornerMinEigenVal, DISABLED_Mat)
    }
 }

-////////////////////////////////cornerHarris//////////////////////////////////////////
+//////////////////////////////// cornerHarris //////////////////////////////////////////

 typedef CornerTestBase CornerHarris;

-OCL_TEST_P(CornerHarris, DISABLED_Mat)
+OCL_TEST_P(CornerHarris, Mat)
 {
    for (int j = 0; j < test_loop_times; j++)
    {
@@ -255,11 +255,31 @@ OCL_TEST_P(CornerHarris, DISABLED_Mat)
        OCL_OFF(cv::cornerHarris(src_roi, dst_roi, blockSize, apertureSize, k, borderType));
        OCL_ON(cv::cornerHarris(usrc_roi, udst_roi, blockSize, apertureSize, k, borderType));

-        Near(1e-5, true);
+        Near(1e-6, true);
    }
 }

-//////////////////////////////////integral/////////////////////////////////////////////////
+//////////////////////////////// preCornerDetect //////////////////////////////////////////
+
+typedef ImgprocTestBase PreCornerDetect;
+
+OCL_TEST_P(PreCornerDetect, Mat)
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        random_roi();
+
+        const int apertureSize = blockSize;
+
+        OCL_OFF(cv::preCornerDetect(src_roi, dst_roi, apertureSize, borderType));
+        OCL_ON(cv::preCornerDetect(usrc_roi, udst_roi, apertureSize, borderType));
+
+        Near(1e-6, true);
+    }
+}
+
+
+////////////////////////////////// integral /////////////////////////////////////////////////

 struct Integral :
        public ImgprocTestBase
@@ -331,8 +351,7 @@ OCL_TEST_P(Integral, Mat2)
    }
 }

-///////////////////////////////////////////////////////////////////////////////////////////////////
-//// threshold
+////////////////////////////////////////  threshold //////////////////////////////////////////////

 struct Threshold :
        public ImgprocTestBase
@@ -342,7 +361,6 @@ struct Threshold :
    virtual void SetUp()
    {
        type = GET_PARAM(0);
-        blockSize = GET_PARAM(1);
        thresholdType = GET_PARAM(2);
        useRoi = GET_PARAM(3);
    }
@@ -364,9 +382,7 @@ OCL_TEST_P(Threshold, Mat)
    }
 }

-
-/////////////////////////////////////////////////////////////////////////////////////////////////////////
-//// CLAHE
+/////////////////////////////////////////// CLAHE //////////////////////////////////////////////////

 PARAM_TEST_CASE(CLAHETest, Size, double, bool)
 {
@@ -440,6 +456,13 @@ OCL_INSTANTIATE_TEST_CASE_P(Imgproc, CornerHarris, Combine(
                                    (BorderType)BORDER_REFLECT, (BorderType)BORDER_REFLECT_101),
                            Bool()));

+OCL_INSTANTIATE_TEST_CASE_P(Imgproc, PreCornerDetect, Combine(
+                            Values((MatType)CV_8UC1, CV_32FC1),
+                            Values(3, 5),
+                            Values( (BorderType)BORDER_CONSTANT, (BorderType)BORDER_REPLICATE,
+                                    (BorderType)BORDER_REFLECT, (BorderType)BORDER_REFLECT_101),
+                            Bool()));
+
 OCL_INSTANTIATE_TEST_CASE_P(Imgproc, Integral, Combine(
                            Values((MatType)CV_8UC1), // TODO does not work with CV_32F, CV_64F
                            Values(CV_32SC1, CV_32FC1), // desired sdepth
--- a/modules/imgproc/test/ocl/test_match_template.cpp
+++ b/modules/imgproc/test/ocl/test_match_template.cpp
@@ -0,0 +1,128 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "test_precomp.hpp"
+#include "opencv2/ts/ocl_test.hpp"
+#include "iostream"
+#include "fstream"
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest {
+namespace ocl {
+
+/////////////////////////////////////////////matchTemplate//////////////////////////////////////////////////////////
+
+PARAM_TEST_CASE(MatchTemplate, MatDepth, Channels, int, bool)
+{
+    int type;
+    int depth;
+    int method;
+    bool use_roi;
+
+    TEST_DECLARE_INPUT_PARAMETER(image)
+    TEST_DECLARE_INPUT_PARAMETER(templ)
+    TEST_DECLARE_OUTPUT_PARAMETER(result)
+
+    virtual void SetUp()
+    {
+        type = CV_MAKE_TYPE(GET_PARAM(0), GET_PARAM(1));
+        depth = GET_PARAM(0);
+        method = GET_PARAM(2);
+        use_roi = GET_PARAM(3);
+    }
+
+    virtual void generateTestData()
+    {
+        Size image_roiSize = randomSize(2, 100);
+        Size templ_roiSize = Size(randomInt(1, image_roiSize.width), randomInt(1, image_roiSize.height));
+        Size result_roiSize = Size(image_roiSize.width - templ_roiSize.width + 1,
+                                   image_roiSize.height - templ_roiSize.height + 1);
+
+        const double upValue = 256;
+
+        Border imageBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
+        randomSubMat(image, image_roi, image_roiSize, imageBorder, type, -upValue, upValue);
+
+        Border templBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
+        randomSubMat(templ, templ_roi, templ_roiSize, templBorder, type, -upValue, upValue);
+
+        Border resultBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
+        randomSubMat(result, result_roi, result_roiSize, resultBorder, CV_32F, -upValue, upValue);
+
+        UMAT_UPLOAD_INPUT_PARAMETER(image)
+        UMAT_UPLOAD_INPUT_PARAMETER(templ)
+        UMAT_UPLOAD_OUTPUT_PARAMETER(result)
+    }
+
+    void Near(double threshold = 0.0)
+    {
+        OCL_EXPECT_MATS_NEAR(result,threshold);
+    }
+};
+
+OCL_TEST_P(MatchTemplate, Mat)
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        generateTestData();
+
+        OCL_OFF(cv::matchTemplate(image_roi,templ_roi,result_roi, method));
+        OCL_ON(cv::matchTemplate(uimage_roi,utempl_roi,uresult_roi, method));
+
+       if (method == 0)
+           Near(10.0f);
+       else
+           Near(method % 2 == 1 ? 0.001f : 1.0f);
+    }
+}
+
+OCL_INSTANTIATE_TEST_CASE_P(ImageProc, MatchTemplate, Combine(
+                                Values(CV_8U, CV_32F),
+                                Values(1, 2, 4),
+                                Values(0,1,2,3,4,5),
+                                Bool())
+                           );
+} } // namespace cvtest::ocl
+
+#endif
--- a/modules/imgproc/test/test_cvtyuv.cpp
+++ b/modules/imgproc/test/test_cvtyuv.cpp
@@ -603,7 +603,7 @@ CV_ENUM(YUVCVTS, CV_YUV2RGB_NV12, CV_YUV2BGR_NV12, CV_YUV2RGB_NV21, CV_YUV2BGR_N
                 CV_YUV2RGBA_YUY2, CV_YUV2BGRA_YUY2, CV_YUV2RGBA_YVYU, CV_YUV2BGRA_YVYU,
                 CV_YUV2GRAY_420, CV_YUV2GRAY_UYVY, CV_YUV2GRAY_YUY2,
                 CV_YUV2BGR, CV_YUV2RGB, CV_RGB2YUV_YV12, CV_BGR2YUV_YV12, CV_RGBA2YUV_YV12,
-                 CV_BGRA2YUV_YV12, CV_RGB2YUV_I420, CV_BGR2YUV_I420, CV_RGBA2YUV_I420, CV_BGRA2YUV_I420);
+                 CV_BGRA2YUV_YV12, CV_RGB2YUV_I420, CV_BGR2YUV_I420, CV_RGBA2YUV_I420, CV_BGRA2YUV_I420)

 typedef ::testing::TestWithParam<YUVCVTS> Imgproc_ColorYUV;

--- a/modules/imgproc/test/test_filter.cpp
+++ b/modules/imgproc/test/test_filter.cpp
@@ -898,8 +898,8 @@ struct median_pair
 {
    int col;
    int val;
-    median_pair() {};
-    median_pair( int _col, int _val ) : col(_col), val(_val) {};
+    median_pair() { }
+    median_pair( int _col, int _val ) : col(_col), val(_val) { }
 };


--- a/modules/imgproc/test/test_lsd.cpp
+++ b/modules/imgproc/test/test_lsd.cpp
@@ -12,7 +12,7 @@ const int EPOCHS = 20;
 class LSDBase : public testing::Test
 {
 public:
-    LSDBase() {};
+    LSDBase() { }

 protected:
    Mat test_image;
@@ -30,7 +30,7 @@ protected:
 class Imgproc_LSD_ADV: public LSDBase
 {
 public:
-    Imgproc_LSD_ADV() {};
+    Imgproc_LSD_ADV() { }
 protected:

 };
@@ -38,7 +38,7 @@ protected:
 class Imgproc_LSD_STD: public LSDBase
 {
 public:
-    Imgproc_LSD_STD() {};
+    Imgproc_LSD_STD() { }
 protected:

 };
@@ -46,7 +46,7 @@ protected:
 class Imgproc_LSD_NONE: public LSDBase
 {
 public:
-    Imgproc_LSD_NONE() {};
+    Imgproc_LSD_NONE() { }
 protected:

 };