Merge remote-tracking branch 'origin/2.4'

Pull requests: #943 from jet47:cuda-5.5-support #944 from jet47:cmake-2.8.11-cuda-fix #912 from SpecLad:contributing #934 from SpecLad:parallel-for #931 from jet47:gpu-test-fixes #932 from bitwangyaoyao:2.4_fixBFM #918 from bitwangyaoyao:2.4_samples #924 from pengx17:2.4_arithm_fix #925 from pengx17:2.4_canny_tmp_fix #927 from bitwangyaoyao:2.4_perf #930 from pengx17:2.4_haar_ext #928 from apavlenko:bugfix_3027 #920 from asmorkalov:android_move #910 from pengx17:2.4_oclgfft #913 from janm399:2.4 #916 from bitwangyaoyao:2.4_fixPyrLK #919 from abidrahmank:2.4 #923 from pengx17:2.4_macfix Conflicts: modules/calib3d/src/stereobm.cpp modules/features2d/src/detectors.cpp modules/gpu/src/error.cpp modules/gpu/src/precomp.hpp modules/imgproc/src/distransform.cpp modules/imgproc/src/morph.cpp modules/ocl/include/opencv2/ocl/ocl.hpp modules/ocl/perf/perf_color.cpp modules/ocl/perf/perf_imgproc.cpp modules/ocl/perf/perf_match_template.cpp modules/ocl/perf/precomp.cpp modules/ocl/perf/precomp.hpp modules/ocl/src/arithm.cpp modules/ocl/src/canny.cpp modules/ocl/src/filtering.cpp modules/ocl/src/haar.cpp modules/ocl/src/hog.cpp modules/ocl/src/imgproc.cpp modules/ocl/src/opencl/haarobjectdetect.cl modules/ocl/src/pyrlk.cpp modules/video/src/bgfg_gaussmix2.cpp modules/video/src/lkpyramid.cpp platforms/linux/scripts/cmake_arm_gnueabi_hardfp.sh platforms/linux/scripts/cmake_arm_gnueabi_softfp.sh platforms/scripts/ABI_compat_generator.py samples/ocl/facedetect.cpp
2013-06-04 18:31:51 +04:00
parent d81d3fc830 75cf5cc4ee
commit bae85660da
236 changed files with 5549 additions and 3276 deletions
--- a/modules/androidcamera/CMakeLists.txt
+++ b/modules/androidcamera/CMakeLists.txt
@@ -6,7 +6,7 @@ set(the_description "Auxiliary module for Android native camera support")
 set(OPENCV_MODULE_TYPE STATIC)

 ocv_define_module(androidcamera INTERNAL opencv_core log dl)
-ocv_include_directories("${CMAKE_CURRENT_SOURCE_DIR}/camera_wrapper" "${OpenCV_SOURCE_DIR}/android/service/engine/jni/include")
+ocv_include_directories("${CMAKE_CURRENT_SOURCE_DIR}/camera_wrapper" "${OpenCV_SOURCE_DIR}/platforms/android/service/engine/jni/include")

 # Android source tree for native camera
 SET (ANDROID_SOURCE_TREE "ANDROID_SOURCE_TREE-NOTFOUND" CACHE PATH
--- a/modules/calib3d/src/solvepnp.cpp
+++ b/modules/calib3d/src/solvepnp.cpp
@@ -117,31 +117,6 @@ namespace cv
            transform(points, modif_points, transformation);
        }

-        class Mutex
-        {
-        public:
-            Mutex() {
-            }
-            void lock()
-            {
-#ifdef HAVE_TBB
-                resultsMutex.lock();
-#endif
-            }
-
-            void unlock()
-            {
-#ifdef HAVE_TBB
-                resultsMutex.unlock();
-#endif
-            }
-
-        private:
-#ifdef HAVE_TBB
-            tbb::mutex resultsMutex;
-#endif
-        };
-
        struct CameraParameters
        {
            void init(Mat _intrinsics, Mat _distCoeffs)
--- a/modules/core/include/opencv2/core/cuda/detail/color_detail.hpp
+++ b/modules/core/include/opencv2/core/cuda/detail/color_detail.hpp
@@ -120,11 +120,8 @@ namespace cv { namespace gpu { namespace cudev
                return dst;
            }

-            __device__ __forceinline__ RGB2RGB()
-                : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
-
-            __device__ __forceinline__ RGB2RGB(const RGB2RGB& other_)
-                :unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
+            __host__ __device__ __forceinline__ RGB2RGB() {}
+            __host__ __device__ __forceinline__ RGB2RGB(const RGB2RGB&) {}
        };

        template <> struct RGB2RGB<uchar, 4, 4, 2> : unary_function<uint, uint>
@@ -141,8 +138,8 @@ namespace cv { namespace gpu { namespace cudev
                return dst;
            }

-            __device__ __forceinline__ RGB2RGB():unary_function<uint, uint>(){}
-            __device__ __forceinline__ RGB2RGB(const RGB2RGB& other_):unary_function<uint, uint>(){}
+            __host__ __device__ __forceinline__ RGB2RGB() {}
+            __host__ __device__ __forceinline__ RGB2RGB(const RGB2RGB&) {}
        };
    }

@@ -203,8 +200,8 @@ namespace cv { namespace gpu { namespace cudev
                return RGB2RGB5x5Converter<green_bits, bidx>::cvt(src);
            }

-            __device__ __forceinline__ RGB2RGB5x5():unary_function<uchar3, ushort>(){}
-            __device__ __forceinline__ RGB2RGB5x5(const RGB2RGB5x5& other_):unary_function<uchar3, ushort>(){}
+            __host__ __device__ __forceinline__ RGB2RGB5x5() {}
+            __host__ __device__ __forceinline__ RGB2RGB5x5(const RGB2RGB5x5&) {}
        };

        template<int bidx, int green_bits> struct RGB2RGB5x5<4, bidx,green_bits> : unary_function<uint, ushort>
@@ -214,8 +211,8 @@ namespace cv { namespace gpu { namespace cudev
                return RGB2RGB5x5Converter<green_bits, bidx>::cvt(src);
            }

-            __device__ __forceinline__ RGB2RGB5x5():unary_function<uint, ushort>(){}
-            __device__ __forceinline__ RGB2RGB5x5(const RGB2RGB5x5& other_):unary_function<uint, ushort>(){}
+            __host__ __device__ __forceinline__ RGB2RGB5x5() {}
+            __host__ __device__ __forceinline__ RGB2RGB5x5(const RGB2RGB5x5&) {}
        };
    }

@@ -282,8 +279,8 @@ namespace cv { namespace gpu { namespace cudev
                RGB5x52RGBConverter<green_bits, bidx>::cvt(src, dst);
                return dst;
            }
-            __device__ __forceinline__ RGB5x52RGB():unary_function<ushort, uchar3>(){}
-            __device__ __forceinline__ RGB5x52RGB(const RGB5x52RGB& other_):unary_function<ushort, uchar3>(){}
+            __host__ __device__ __forceinline__ RGB5x52RGB() {}
+            __host__ __device__ __forceinline__ RGB5x52RGB(const RGB5x52RGB&) {}

        };

@@ -295,8 +292,8 @@ namespace cv { namespace gpu { namespace cudev
                RGB5x52RGBConverter<green_bits, bidx>::cvt(src, dst);
                return dst;
            }
-            __device__ __forceinline__ RGB5x52RGB():unary_function<ushort, uint>(){}
-            __device__ __forceinline__ RGB5x52RGB(const RGB5x52RGB& other_):unary_function<ushort, uint>(){}
+            __host__ __device__ __forceinline__ RGB5x52RGB() {}
+            __host__ __device__ __forceinline__ RGB5x52RGB(const RGB5x52RGB&) {}
        };
    }

@@ -325,9 +322,8 @@ namespace cv { namespace gpu { namespace cudev

                return dst;
            }
-            __device__ __forceinline__ Gray2RGB():unary_function<T, typename TypeVec<T, dcn>::vec_type>(){}
-            __device__ __forceinline__ Gray2RGB(const Gray2RGB& other_)
-                : unary_function<T, typename TypeVec<T, dcn>::vec_type>(){}
+            __host__ __device__ __forceinline__ Gray2RGB() {}
+            __host__ __device__ __forceinline__ Gray2RGB(const Gray2RGB&) {}
        };

        template <> struct Gray2RGB<uchar, 4> : unary_function<uchar, uint>
@@ -342,8 +338,8 @@ namespace cv { namespace gpu { namespace cudev

                return dst;
            }
-            __device__ __forceinline__ Gray2RGB():unary_function<uchar, uint>(){}
-            __device__ __forceinline__ Gray2RGB(const Gray2RGB& other_):unary_function<uchar, uint>(){}
+            __host__ __device__ __forceinline__ Gray2RGB() {}
+            __host__ __device__ __forceinline__ Gray2RGB(const Gray2RGB&) {}
        };
    }

@@ -384,8 +380,8 @@ namespace cv { namespace gpu { namespace cudev
                return Gray2RGB5x5Converter<green_bits>::cvt(src);
            }

-            __device__ __forceinline__ Gray2RGB5x5():unary_function<uchar, ushort>(){}
-            __device__ __forceinline__ Gray2RGB5x5(const Gray2RGB5x5& other_):unary_function<uchar, ushort>(){}
+            __host__ __device__ __forceinline__ Gray2RGB5x5() {}
+            __host__ __device__ __forceinline__ Gray2RGB5x5(const Gray2RGB5x5&) {}
        };
    }

@@ -426,8 +422,8 @@ namespace cv { namespace gpu { namespace cudev
            {
                return RGB5x52GrayConverter<green_bits>::cvt(src);
            }
-            __device__ __forceinline__ RGB5x52Gray() : unary_function<ushort, uchar>(){}
-            __device__ __forceinline__ RGB5x52Gray(const RGB5x52Gray& other_) : unary_function<ushort, uchar>(){}
+            __host__ __device__ __forceinline__ RGB5x52Gray() {}
+            __host__ __device__ __forceinline__ RGB5x52Gray(const RGB5x52Gray&) {}
        };
    }

@@ -467,9 +463,8 @@ namespace cv { namespace gpu { namespace cudev
            {
                return RGB2GrayConvert<bidx>(&src.x);
            }
-            __device__ __forceinline__ RGB2Gray() : unary_function<typename TypeVec<T, scn>::vec_type, T>(){}
-            __device__ __forceinline__ RGB2Gray(const RGB2Gray& other_)
-                : unary_function<typename TypeVec<T, scn>::vec_type, T>(){}
+            __host__ __device__ __forceinline__ RGB2Gray() {}
+            __host__ __device__ __forceinline__ RGB2Gray(const RGB2Gray&) {}
        };

        template <int bidx> struct RGB2Gray<uchar, 4, bidx> : unary_function<uint, uchar>
@@ -478,8 +473,8 @@ namespace cv { namespace gpu { namespace cudev
            {
                return RGB2GrayConvert<bidx>(src);
            }
-            __device__ __forceinline__ RGB2Gray() : unary_function<uint, uchar>(){}
-            __device__ __forceinline__ RGB2Gray(const RGB2Gray& other_) : unary_function<uint, uchar>(){}
+            __host__ __device__ __forceinline__ RGB2Gray() {}
+            __host__ __device__ __forceinline__ RGB2Gray(const RGB2Gray&) {}
        };
    }

@@ -529,10 +524,8 @@ namespace cv { namespace gpu { namespace cudev
                RGB2YUVConvert<bidx>(&src.x, dst);
                return dst;
            }
-            __device__ __forceinline__ RGB2YUV()
-                : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
-            __device__ __forceinline__ RGB2YUV(const RGB2YUV& other_)
-                : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
+            __host__ __device__ __forceinline__ RGB2YUV() {}
+            __host__ __device__ __forceinline__ RGB2YUV(const RGB2YUV&) {}
        };
    }

@@ -609,10 +602,8 @@ namespace cv { namespace gpu { namespace cudev

                return dst;
            }
-            __device__ __forceinline__ YUV2RGB()
-                : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
-            __device__ __forceinline__ YUV2RGB(const YUV2RGB& other_)
-                : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
+            __host__ __device__ __forceinline__ YUV2RGB() {}
+            __host__ __device__ __forceinline__ YUV2RGB(const YUV2RGB&) {}
        };

        template <int bidx> struct YUV2RGB<uchar, 4, 4, bidx> : unary_function<uint, uint>
@@ -621,8 +612,8 @@ namespace cv { namespace gpu { namespace cudev
            {
                return YUV2RGBConvert<bidx>(src);
            }
-            __device__ __forceinline__ YUV2RGB() : unary_function<uint, uint>(){}
-            __device__ __forceinline__ YUV2RGB(const YUV2RGB& other_) : unary_function<uint, uint>(){}
+            __host__ __device__ __forceinline__ YUV2RGB() {}
+            __host__ __device__ __forceinline__ YUV2RGB(const YUV2RGB&) {}
        };
    }

@@ -689,10 +680,8 @@ namespace cv { namespace gpu { namespace cudev
                RGB2YCrCbConvert<bidx>(&src.x, dst);
                return dst;
            }
-            __device__ __forceinline__ RGB2YCrCb()
-                : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
-            __device__ __forceinline__ RGB2YCrCb(const RGB2YCrCb& other_)
-                : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
+            __host__ __device__ __forceinline__ RGB2YCrCb() {}
+            __host__ __device__ __forceinline__ RGB2YCrCb(const RGB2YCrCb&) {}
        };

        template <int bidx> struct RGB2YCrCb<uchar, 4, 4, bidx> : unary_function<uint, uint>
@@ -702,8 +691,8 @@ namespace cv { namespace gpu { namespace cudev
                return RGB2YCrCbConvert<bidx>(src);
            }

-            __device__ __forceinline__ RGB2YCrCb() : unary_function<uint, uint>(){}
-            __device__ __forceinline__ RGB2YCrCb(const RGB2YCrCb& other_) : unary_function<uint, uint>(){}
+            __host__ __device__ __forceinline__ RGB2YCrCb() {}
+            __host__ __device__ __forceinline__ RGB2YCrCb(const RGB2YCrCb&) {}
        };
    }

@@ -771,10 +760,8 @@ namespace cv { namespace gpu { namespace cudev

                return dst;
            }
-            __device__ __forceinline__ YCrCb2RGB()
-                : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
-            __device__ __forceinline__ YCrCb2RGB(const YCrCb2RGB& other_)
-                : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
+            __host__ __device__ __forceinline__ YCrCb2RGB() {}
+            __host__ __device__ __forceinline__ YCrCb2RGB(const YCrCb2RGB&) {}
        };

        template <int bidx> struct YCrCb2RGB<uchar, 4, 4, bidx> : unary_function<uint, uint>
@@ -783,8 +770,8 @@ namespace cv { namespace gpu { namespace cudev
            {
                return YCrCb2RGBConvert<bidx>(src);
            }
-            __device__ __forceinline__ YCrCb2RGB() : unary_function<uint, uint>(){}
-            __device__ __forceinline__ YCrCb2RGB(const YCrCb2RGB& other_) : unary_function<uint, uint>(){}
+            __host__ __device__ __forceinline__ YCrCb2RGB() {}
+            __host__ __device__ __forceinline__ YCrCb2RGB(const YCrCb2RGB&) {}
        };
    }

@@ -849,10 +836,8 @@ namespace cv { namespace gpu { namespace cudev

                return dst;
            }
-            __device__ __forceinline__ RGB2XYZ()
-                : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
-            __device__ __forceinline__ RGB2XYZ(const RGB2XYZ& other_)
-                : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
+            __host__ __device__ __forceinline__ RGB2XYZ() {}
+            __host__ __device__ __forceinline__ RGB2XYZ(const RGB2XYZ&) {}
        };

        template <int bidx> struct RGB2XYZ<uchar, 4, 4, bidx> : unary_function<uint, uint>
@@ -861,8 +846,8 @@ namespace cv { namespace gpu { namespace cudev
            {
                return RGB2XYZConvert<bidx>(src);
            }
-            __device__ __forceinline__ RGB2XYZ() : unary_function<uint, uint>(){}
-            __device__ __forceinline__ RGB2XYZ(const RGB2XYZ& other_) : unary_function<uint, uint>(){}
+            __host__ __device__ __forceinline__ RGB2XYZ() {}
+            __host__ __device__ __forceinline__ RGB2XYZ(const RGB2XYZ&) {}
        };
    }

@@ -926,10 +911,8 @@ namespace cv { namespace gpu { namespace cudev

                return dst;
            }
-            __device__ __forceinline__ XYZ2RGB()
-                : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
-            __device__ __forceinline__ XYZ2RGB(const XYZ2RGB& other_)
-                : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
+            __host__ __device__ __forceinline__ XYZ2RGB() {}
+            __host__ __device__ __forceinline__ XYZ2RGB(const XYZ2RGB&) {}
        };

        template <int bidx> struct XYZ2RGB<uchar, 4, 4, bidx> : unary_function<uint, uint>
@@ -938,8 +921,8 @@ namespace cv { namespace gpu { namespace cudev
            {
                return XYZ2RGBConvert<bidx>(src);
            }
-            __device__ __forceinline__ XYZ2RGB() : unary_function<uint, uint>(){}
-            __device__ __forceinline__ XYZ2RGB(const XYZ2RGB& other_) : unary_function<uint, uint>(){}
+            __host__ __device__ __forceinline__ XYZ2RGB() {}
+            __host__ __device__ __forceinline__ XYZ2RGB(const XYZ2RGB&) {}
        };
    }

@@ -1066,10 +1049,8 @@ namespace cv { namespace gpu { namespace cudev

                return dst;
            }
-            __device__ __forceinline__ RGB2HSV()
-                : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
-            __device__ __forceinline__ RGB2HSV(const RGB2HSV& other_)
-                : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
+            __host__ __device__ __forceinline__ RGB2HSV() {}
+            __host__ __device__ __forceinline__ RGB2HSV(const RGB2HSV&) {}
        };

        template <int bidx, int hr> struct RGB2HSV<uchar, 4, 4, bidx, hr> : unary_function<uint, uint>
@@ -1078,8 +1059,8 @@ namespace cv { namespace gpu { namespace cudev
            {
                return RGB2HSVConvert<bidx, hr>(src);
            }
-            __device__ __forceinline__ RGB2HSV():unary_function<uint, uint>(){}
-            __device__ __forceinline__ RGB2HSV(const RGB2HSV& other_):unary_function<uint, uint>(){}
+            __host__ __device__ __forceinline__ RGB2HSV() {}
+            __host__ __device__ __forceinline__ RGB2HSV(const RGB2HSV&) {}
        };
    }

@@ -1208,10 +1189,8 @@ namespace cv { namespace gpu { namespace cudev

                return dst;
            }
-            __device__ __forceinline__ HSV2RGB()
-                : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
-            __device__ __forceinline__ HSV2RGB(const HSV2RGB& other_)
-                : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
+            __host__ __device__ __forceinline__ HSV2RGB() {}
+            __host__ __device__ __forceinline__ HSV2RGB(const HSV2RGB&) {}
        };

        template <int bidx, int hr> struct HSV2RGB<uchar, 4, 4, bidx, hr> : unary_function<uint, uint>
@@ -1220,8 +1199,8 @@ namespace cv { namespace gpu { namespace cudev
            {
                return HSV2RGBConvert<bidx, hr>(src);
            }
-            __device__ __forceinline__ HSV2RGB():unary_function<uint, uint>(){}
-            __device__ __forceinline__ HSV2RGB(const HSV2RGB& other_):unary_function<uint, uint>(){}
+            __host__ __device__ __forceinline__ HSV2RGB() {}
+            __host__ __device__ __forceinline__ HSV2RGB(const HSV2RGB&) {}
        };
    }

@@ -1343,10 +1322,8 @@ namespace cv { namespace gpu { namespace cudev

                return dst;
            }
-            __device__ __forceinline__ RGB2HLS()
-                : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
-            __device__ __forceinline__ RGB2HLS(const RGB2HLS& other_)
-                : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
+            __host__ __device__ __forceinline__ RGB2HLS() {}
+            __host__ __device__ __forceinline__ RGB2HLS(const RGB2HLS&) {}
        };

        template <int bidx, int hr> struct RGB2HLS<uchar, 4, 4, bidx, hr> : unary_function<uint, uint>
@@ -1355,8 +1332,8 @@ namespace cv { namespace gpu { namespace cudev
            {
                return RGB2HLSConvert<bidx, hr>(src);
            }
-            __device__ __forceinline__ RGB2HLS() : unary_function<uint, uint>(){}
-            __device__ __forceinline__ RGB2HLS(const RGB2HLS& other_) : unary_function<uint, uint>(){}
+            __host__ __device__ __forceinline__ RGB2HLS() {}
+            __host__ __device__ __forceinline__ RGB2HLS(const RGB2HLS&) {}
        };
    }

@@ -1485,10 +1462,8 @@ namespace cv { namespace gpu { namespace cudev

                return dst;
            }
-            __device__ __forceinline__ HLS2RGB()
-                : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
-            __device__ __forceinline__ HLS2RGB(const HLS2RGB& other_)
-                : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
+            __host__ __device__ __forceinline__ HLS2RGB() {}
+            __host__ __device__ __forceinline__ HLS2RGB(const HLS2RGB&) {}
        };

        template <int bidx, int hr> struct HLS2RGB<uchar, 4, 4, bidx, hr> : unary_function<uint, uint>
@@ -1497,8 +1472,8 @@ namespace cv { namespace gpu { namespace cudev
            {
                return HLS2RGBConvert<bidx, hr>(src);
            }
-            __device__ __forceinline__ HLS2RGB() : unary_function<uint, uint>(){}
-            __device__ __forceinline__ HLS2RGB(const HLS2RGB& other_) : unary_function<uint, uint>(){}
+            __host__ __device__ __forceinline__ HLS2RGB() {}
+            __host__ __device__ __forceinline__ HLS2RGB(const HLS2RGB&) {}
        };
    }

@@ -1651,8 +1626,8 @@ namespace cv { namespace gpu { namespace cudev

                return dst;
            }
-            __device__ __forceinline__ RGB2Lab() {}
-            __device__ __forceinline__ RGB2Lab(const RGB2Lab& other_) {}
+            __host__ __device__ __forceinline__ RGB2Lab() {}
+            __host__ __device__ __forceinline__ RGB2Lab(const RGB2Lab&) {}
        };
        template <int scn, int dcn, bool srgb, int blueIdx>
        struct RGB2Lab<float, scn, dcn, srgb, blueIdx>
@@ -1666,8 +1641,8 @@ namespace cv { namespace gpu { namespace cudev

                return dst;
            }
-            __device__ __forceinline__ RGB2Lab() {}
-            __device__ __forceinline__ RGB2Lab(const RGB2Lab& other_) {}
+            __host__ __device__ __forceinline__ RGB2Lab() {}
+            __host__ __device__ __forceinline__ RGB2Lab(const RGB2Lab&) {}
        };
    }

@@ -1764,8 +1739,8 @@ namespace cv { namespace gpu { namespace cudev

                return dst;
            }
-            __device__ __forceinline__ Lab2RGB() {}
-            __device__ __forceinline__ Lab2RGB(const Lab2RGB& other_) {}
+            __host__ __device__ __forceinline__ Lab2RGB() {}
+            __host__ __device__ __forceinline__ Lab2RGB(const Lab2RGB&) {}
        };
        template <int scn, int dcn, bool srgb, int blueIdx>
        struct Lab2RGB<float, scn, dcn, srgb, blueIdx>
@@ -1779,8 +1754,8 @@ namespace cv { namespace gpu { namespace cudev

                return dst;
            }
-            __device__ __forceinline__ Lab2RGB() {}
-            __device__ __forceinline__ Lab2RGB(const Lab2RGB& other_) {}
+            __host__ __device__ __forceinline__ Lab2RGB() {}
+            __host__ __device__ __forceinline__ Lab2RGB(const Lab2RGB&) {}
        };
    }

@@ -1863,8 +1838,8 @@ namespace cv { namespace gpu { namespace cudev

                return dst;
            }
-            __device__ __forceinline__ RGB2Luv() {}
-            __device__ __forceinline__ RGB2Luv(const RGB2Luv& other_) {}
+            __host__ __device__ __forceinline__ RGB2Luv() {}
+            __host__ __device__ __forceinline__ RGB2Luv(const RGB2Luv&) {}
        };
        template <int scn, int dcn, bool srgb, int blueIdx>
        struct RGB2Luv<float, scn, dcn, srgb, blueIdx>
@@ -1878,8 +1853,8 @@ namespace cv { namespace gpu { namespace cudev

                return dst;
            }
-            __device__ __forceinline__ RGB2Luv() {}
-            __device__ __forceinline__ RGB2Luv(const RGB2Luv& other_) {}
+            __host__ __device__ __forceinline__ RGB2Luv() {}
+            __host__ __device__ __forceinline__ RGB2Luv(const RGB2Luv&) {}
        };
    }

@@ -1964,8 +1939,8 @@ namespace cv { namespace gpu { namespace cudev

                return dst;
            }
-            __device__ __forceinline__ Luv2RGB() {}
-            __device__ __forceinline__ Luv2RGB(const Luv2RGB& other_) {}
+            __host__ __device__ __forceinline__ Luv2RGB() {}
+            __host__ __device__ __forceinline__ Luv2RGB(const Luv2RGB&) {}
        };
        template <int scn, int dcn, bool srgb, int blueIdx>
        struct Luv2RGB<float, scn, dcn, srgb, blueIdx>
@@ -1979,8 +1954,8 @@ namespace cv { namespace gpu { namespace cudev

                return dst;
            }
-            __device__ __forceinline__ Luv2RGB() {}
-            __device__ __forceinline__ Luv2RGB(const Luv2RGB& other_) {}
+            __host__ __device__ __forceinline__ Luv2RGB() {}
+            __host__ __device__ __forceinline__ Luv2RGB(const Luv2RGB&) {}
        };
    }

--- a/modules/core/include/opencv2/core/cuda/functional.hpp
+++ b/modules/core/include/opencv2/core/cuda/functional.hpp
@@ -63,8 +63,8 @@ namespace cv { namespace gpu { namespace cudev
        {
            return a + b;
        }
-        __device__ __forceinline__ plus(const plus& other):binary_function<T,T,T>(){}
-        __device__ __forceinline__ plus():binary_function<T,T,T>(){}
+        __host__ __device__ __forceinline__ plus() {}
+        __host__ __device__ __forceinline__ plus(const plus&) {}
    };

    template <typename T> struct minus : binary_function<T, T, T>
@@ -74,8 +74,8 @@ namespace cv { namespace gpu { namespace cudev
        {
            return a - b;
        }
-        __device__ __forceinline__ minus(const minus& other):binary_function<T,T,T>(){}
-        __device__ __forceinline__ minus():binary_function<T,T,T>(){}
+        __host__ __device__ __forceinline__ minus() {}
+        __host__ __device__ __forceinline__ minus(const minus&) {}
    };

    template <typename T> struct multiplies : binary_function<T, T, T>
@@ -85,8 +85,8 @@ namespace cv { namespace gpu { namespace cudev
        {
            return a * b;
        }
-        __device__ __forceinline__ multiplies(const multiplies& other):binary_function<T,T,T>(){}
-        __device__ __forceinline__ multiplies():binary_function<T,T,T>(){}
+        __host__ __device__ __forceinline__ multiplies() {}
+        __host__ __device__ __forceinline__ multiplies(const multiplies&) {}
    };

    template <typename T> struct divides : binary_function<T, T, T>
@@ -96,8 +96,8 @@ namespace cv { namespace gpu { namespace cudev
        {
            return a / b;
        }
-        __device__ __forceinline__ divides(const divides& other):binary_function<T,T,T>(){}
-        __device__ __forceinline__ divides():binary_function<T,T,T>(){}
+        __host__ __device__ __forceinline__ divides() {}
+        __host__ __device__ __forceinline__ divides(const divides&) {}
    };

    template <typename T> struct modulus : binary_function<T, T, T>
@@ -107,8 +107,8 @@ namespace cv { namespace gpu { namespace cudev
        {
            return a % b;
        }
-        __device__ __forceinline__ modulus(const modulus& other):binary_function<T,T,T>(){}
-        __device__ __forceinline__ modulus():binary_function<T,T,T>(){}
+        __host__ __device__ __forceinline__ modulus() {}
+        __host__ __device__ __forceinline__ modulus(const modulus&) {}
    };

    template <typename T> struct negate : unary_function<T, T>
@@ -117,8 +117,8 @@ namespace cv { namespace gpu { namespace cudev
        {
            return -a;
        }
-        __device__ __forceinline__ negate(const negate& other):unary_function<T,T>(){}
-        __device__ __forceinline__ negate():unary_function<T,T>(){}
+        __host__ __device__ __forceinline__ negate() {}
+        __host__ __device__ __forceinline__ negate(const negate&) {}
    };

    // Comparison Operations
@@ -129,8 +129,8 @@ namespace cv { namespace gpu { namespace cudev
        {
            return a == b;
        }
-        __device__ __forceinline__ equal_to(const equal_to& other):binary_function<T,T,bool>(){}
-        __device__ __forceinline__ equal_to():binary_function<T,T,bool>(){}
+        __host__ __device__ __forceinline__ equal_to() {}
+        __host__ __device__ __forceinline__ equal_to(const equal_to&) {}
    };

    template <typename T> struct not_equal_to : binary_function<T, T, bool>
@@ -140,8 +140,8 @@ namespace cv { namespace gpu { namespace cudev
        {
            return a != b;
        }
-        __device__ __forceinline__ not_equal_to(const not_equal_to& other):binary_function<T,T,bool>(){}
-        __device__ __forceinline__ not_equal_to():binary_function<T,T,bool>(){}
+        __host__ __device__ __forceinline__ not_equal_to() {}
+        __host__ __device__ __forceinline__ not_equal_to(const not_equal_to&) {}
    };

    template <typename T> struct greater : binary_function<T, T, bool>
@@ -151,8 +151,8 @@ namespace cv { namespace gpu { namespace cudev
        {
            return a > b;
        }
-        __device__ __forceinline__ greater(const greater& other):binary_function<T,T,bool>(){}
-        __device__ __forceinline__ greater():binary_function<T,T,bool>(){}
+        __host__ __device__ __forceinline__ greater() {}
+        __host__ __device__ __forceinline__ greater(const greater&) {}
    };

    template <typename T> struct less : binary_function<T, T, bool>
@@ -162,8 +162,8 @@ namespace cv { namespace gpu { namespace cudev
        {
            return a < b;
        }
-        __device__ __forceinline__ less(const less& other):binary_function<T,T,bool>(){}
-        __device__ __forceinline__ less():binary_function<T,T,bool>(){}
+        __host__ __device__ __forceinline__ less() {}
+        __host__ __device__ __forceinline__ less(const less&) {}
    };

    template <typename T> struct greater_equal : binary_function<T, T, bool>
@@ -173,8 +173,8 @@ namespace cv { namespace gpu { namespace cudev
        {
            return a >= b;
        }
-        __device__ __forceinline__ greater_equal(const greater_equal& other):binary_function<T,T,bool>(){}
-        __device__ __forceinline__ greater_equal():binary_function<T,T,bool>(){}
+        __host__ __device__ __forceinline__ greater_equal() {}
+        __host__ __device__ __forceinline__ greater_equal(const greater_equal&) {}
    };

    template <typename T> struct less_equal : binary_function<T, T, bool>
@@ -184,8 +184,8 @@ namespace cv { namespace gpu { namespace cudev
        {
            return a <= b;
        }
-        __device__ __forceinline__ less_equal(const less_equal& other):binary_function<T,T,bool>(){}
-        __device__ __forceinline__ less_equal():binary_function<T,T,bool>(){}
+        __host__ __device__ __forceinline__ less_equal() {}
+        __host__ __device__ __forceinline__ less_equal(const less_equal&) {}
    };

    // Logical Operations
@@ -196,8 +196,8 @@ namespace cv { namespace gpu { namespace cudev
        {
            return a && b;
        }
-        __device__ __forceinline__ logical_and(const logical_and& other):binary_function<T,T,bool>(){}
-        __device__ __forceinline__ logical_and():binary_function<T,T,bool>(){}
+        __host__ __device__ __forceinline__ logical_and() {}
+        __host__ __device__ __forceinline__ logical_and(const logical_and&) {}
    };

    template <typename T> struct logical_or : binary_function<T, T, bool>
@@ -207,8 +207,8 @@ namespace cv { namespace gpu { namespace cudev
        {
            return a || b;
        }
-        __device__ __forceinline__ logical_or(const logical_or& other):binary_function<T,T,bool>(){}
-        __device__ __forceinline__ logical_or():binary_function<T,T,bool>(){}
+        __host__ __device__ __forceinline__ logical_or() {}
+        __host__ __device__ __forceinline__ logical_or(const logical_or&) {}
    };

    template <typename T> struct logical_not : unary_function<T, bool>
@@ -217,8 +217,8 @@ namespace cv { namespace gpu { namespace cudev
        {
            return !a;
        }
-        __device__ __forceinline__ logical_not(const logical_not& other):unary_function<T,bool>(){}
-        __device__ __forceinline__ logical_not():unary_function<T,bool>(){}
+        __host__ __device__ __forceinline__ logical_not() {}
+        __host__ __device__ __forceinline__ logical_not(const logical_not&) {}
    };

    // Bitwise Operations
@@ -229,8 +229,8 @@ namespace cv { namespace gpu { namespace cudev
        {
            return a & b;
        }
-        __device__ __forceinline__ bit_and(const bit_and& other):binary_function<T,T,T>(){}
-        __device__ __forceinline__ bit_and():binary_function<T,T,T>(){}
+        __host__ __device__ __forceinline__ bit_and() {}
+        __host__ __device__ __forceinline__ bit_and(const bit_and&) {}
    };

    template <typename T> struct bit_or : binary_function<T, T, T>
@@ -240,8 +240,8 @@ namespace cv { namespace gpu { namespace cudev
        {
            return a | b;
        }
-        __device__ __forceinline__ bit_or(const bit_or& other):binary_function<T,T,T>(){}
-        __device__ __forceinline__ bit_or():binary_function<T,T,T>(){}
+        __host__ __device__ __forceinline__ bit_or() {}
+        __host__ __device__ __forceinline__ bit_or(const bit_or&) {}
    };

    template <typename T> struct bit_xor : binary_function<T, T, T>
@@ -251,8 +251,8 @@ namespace cv { namespace gpu { namespace cudev
        {
            return a ^ b;
        }
-        __device__ __forceinline__ bit_xor(const bit_xor& other):binary_function<T,T,T>(){}
-        __device__ __forceinline__ bit_xor():binary_function<T,T,T>(){}
+        __host__ __device__ __forceinline__ bit_xor() {}
+        __host__ __device__ __forceinline__ bit_xor(const bit_xor&) {}
    };

    template <typename T> struct bit_not : unary_function<T, T>
@@ -261,8 +261,8 @@ namespace cv { namespace gpu { namespace cudev
        {
            return ~v;
        }
-        __device__ __forceinline__ bit_not(const bit_not& other):unary_function<T,T>(){}
-        __device__ __forceinline__ bit_not():unary_function<T,T>(){}
+        __host__ __device__ __forceinline__ bit_not() {}
+        __host__ __device__ __forceinline__ bit_not(const bit_not&) {}
    };

    // Generalized Identity Operations
@@ -272,8 +272,8 @@ namespace cv { namespace gpu { namespace cudev
        {
            return x;
        }
-        __device__ __forceinline__ identity(const identity& other):unary_function<T,T>(){}
-        __device__ __forceinline__ identity():unary_function<T,T>(){}
+        __host__ __device__ __forceinline__ identity() {}
+        __host__ __device__ __forceinline__ identity(const identity&) {}
    };

    template <typename T1, typename T2> struct project1st : binary_function<T1, T2, T1>
@@ -282,8 +282,8 @@ namespace cv { namespace gpu { namespace cudev
        {
            return lhs;
        }
-        __device__ __forceinline__ project1st(const project1st& other):binary_function<T1,T2,T1>(){}
-        __device__ __forceinline__ project1st():binary_function<T1,T2,T1>(){}
+        __host__ __device__ __forceinline__ project1st() {}
+        __host__ __device__ __forceinline__ project1st(const project1st&) {}
    };

    template <typename T1, typename T2> struct project2nd : binary_function<T1, T2, T2>
@@ -292,8 +292,8 @@ namespace cv { namespace gpu { namespace cudev
        {
            return rhs;
        }
-        __device__ __forceinline__ project2nd(const project2nd& other):binary_function<T1,T2,T2>(){}
-        __device__ __forceinline__ project2nd():binary_function<T1,T2,T2>(){}
+        __host__ __device__ __forceinline__ project2nd() {}
+        __host__ __device__ __forceinline__ project2nd(const project2nd&) {}
    };

    // Min/Max Operations
@@ -302,8 +302,8 @@ namespace cv { namespace gpu { namespace cudev
    template <> struct name<type> : binary_function<type, type, type> \
    { \
        __device__ __forceinline__ type operator()(type lhs, type rhs) const {return op(lhs, rhs);} \
-        __device__ __forceinline__ name() {}\
-        __device__ __forceinline__ name(const name&) {}\
+        __host__ __device__ __forceinline__ name() {}\
+        __host__ __device__ __forceinline__ name(const name&) {}\
    };

    template <typename T> struct maximum : binary_function<T, T, T>
@@ -312,8 +312,8 @@ namespace cv { namespace gpu { namespace cudev
        {
            return max(lhs, rhs);
        }
-        __device__ __forceinline__ maximum() {}
-        __device__ __forceinline__ maximum(const maximum&) {}
+        __host__ __device__ __forceinline__ maximum() {}
+        __host__ __device__ __forceinline__ maximum(const maximum&) {}
    };

    OPENCV_GPU_IMPLEMENT_MINMAX(maximum, uchar, ::max)
@@ -332,8 +332,8 @@ namespace cv { namespace gpu { namespace cudev
        {
            return min(lhs, rhs);
        }
-        __device__ __forceinline__ minimum() {}
-        __device__ __forceinline__ minimum(const minimum&) {}
+        __host__ __device__ __forceinline__ minimum() {}
+        __host__ __device__ __forceinline__ minimum(const minimum&) {}
    };

    OPENCV_GPU_IMPLEMENT_MINMAX(minimum, uchar, ::min)
@@ -349,7 +349,6 @@ namespace cv { namespace gpu { namespace cudev
 #undef OPENCV_GPU_IMPLEMENT_MINMAX

    // Math functions
-///bound=========================================

    template <typename T> struct abs_func : unary_function<T, T>
    {
@@ -358,8 +357,8 @@ namespace cv { namespace gpu { namespace cudev
            return abs(x);
        }

-        __device__ __forceinline__ abs_func() {}
-        __device__ __forceinline__ abs_func(const abs_func&) {}
+        __host__ __device__ __forceinline__ abs_func() {}
+        __host__ __device__ __forceinline__ abs_func(const abs_func&) {}
    };
    template <> struct abs_func<unsigned char> : unary_function<unsigned char, unsigned char>
    {
@@ -368,8 +367,8 @@ namespace cv { namespace gpu { namespace cudev
            return x;
        }

-        __device__ __forceinline__ abs_func() {}
-        __device__ __forceinline__ abs_func(const abs_func&) {}
+        __host__ __device__ __forceinline__ abs_func() {}
+        __host__ __device__ __forceinline__ abs_func(const abs_func&) {}
    };
    template <> struct abs_func<signed char> : unary_function<signed char, signed char>
    {
@@ -378,8 +377,8 @@ namespace cv { namespace gpu { namespace cudev
            return ::abs((int)x);
        }

-        __device__ __forceinline__ abs_func() {}
-        __device__ __forceinline__ abs_func(const abs_func&) {}
+        __host__ __device__ __forceinline__ abs_func() {}
+        __host__ __device__ __forceinline__ abs_func(const abs_func&) {}
    };
    template <> struct abs_func<char> : unary_function<char, char>
    {
@@ -388,8 +387,8 @@ namespace cv { namespace gpu { namespace cudev
            return ::abs((int)x);
        }

-        __device__ __forceinline__ abs_func() {}
-        __device__ __forceinline__ abs_func(const abs_func&) {}
+        __host__ __device__ __forceinline__ abs_func() {}
+        __host__ __device__ __forceinline__ abs_func(const abs_func&) {}
    };
    template <> struct abs_func<unsigned short> : unary_function<unsigned short, unsigned short>
    {
@@ -398,8 +397,8 @@ namespace cv { namespace gpu { namespace cudev
            return x;
        }

-        __device__ __forceinline__ abs_func() {}
-        __device__ __forceinline__ abs_func(const abs_func&) {}
+        __host__ __device__ __forceinline__ abs_func() {}
+        __host__ __device__ __forceinline__ abs_func(const abs_func&) {}
    };
    template <> struct abs_func<short> : unary_function<short, short>
    {
@@ -408,8 +407,8 @@ namespace cv { namespace gpu { namespace cudev
            return ::abs((int)x);
        }

-        __device__ __forceinline__ abs_func() {}
-        __device__ __forceinline__ abs_func(const abs_func&) {}
+        __host__ __device__ __forceinline__ abs_func() {}
+        __host__ __device__ __forceinline__ abs_func(const abs_func&) {}
    };
    template <> struct abs_func<unsigned int> : unary_function<unsigned int, unsigned int>
    {
@@ -418,8 +417,8 @@ namespace cv { namespace gpu { namespace cudev
            return x;
        }

-        __device__ __forceinline__ abs_func() {}
-        __device__ __forceinline__ abs_func(const abs_func&) {}
+        __host__ __device__ __forceinline__ abs_func() {}
+        __host__ __device__ __forceinline__ abs_func(const abs_func&) {}
    };
    template <> struct abs_func<int> : unary_function<int, int>
    {
@@ -428,8 +427,8 @@ namespace cv { namespace gpu { namespace cudev
            return ::abs(x);
        }

-        __device__ __forceinline__ abs_func() {}
-        __device__ __forceinline__ abs_func(const abs_func&) {}
+        __host__ __device__ __forceinline__ abs_func() {}
+        __host__ __device__ __forceinline__ abs_func(const abs_func&) {}
    };
    template <> struct abs_func<float> : unary_function<float, float>
    {
@@ -438,8 +437,8 @@ namespace cv { namespace gpu { namespace cudev
            return ::fabsf(x);
        }

-        __device__ __forceinline__ abs_func() {}
-        __device__ __forceinline__ abs_func(const abs_func&) {}
+        __host__ __device__ __forceinline__ abs_func() {}
+        __host__ __device__ __forceinline__ abs_func(const abs_func&) {}
    };
    template <> struct abs_func<double> : unary_function<double, double>
    {
@@ -448,8 +447,8 @@ namespace cv { namespace gpu { namespace cudev
            return ::fabs(x);
        }

-        __device__ __forceinline__ abs_func() {}
-        __device__ __forceinline__ abs_func(const abs_func&) {}
+        __host__ __device__ __forceinline__ abs_func() {}
+        __host__ __device__ __forceinline__ abs_func(const abs_func&) {}
    };

 #define OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(name, func) \
@@ -459,8 +458,8 @@ namespace cv { namespace gpu { namespace cudev
        { \
            return func ## f(v); \
        } \
-        __device__ __forceinline__ name ## _func() {} \
-        __device__ __forceinline__ name ## _func(const name ## _func&) {} \
+        __host__ __device__ __forceinline__ name ## _func() {} \
+        __host__ __device__ __forceinline__ name ## _func(const name ## _func&) {} \
    }; \
    template <> struct name ## _func<double> : unary_function<double, double> \
    { \
@@ -468,8 +467,8 @@ namespace cv { namespace gpu { namespace cudev
        { \
            return func(v); \
        } \
-        __device__ __forceinline__ name ## _func() {} \
-        __device__ __forceinline__ name ## _func(const name ## _func&) {} \
+        __host__ __device__ __forceinline__ name ## _func() {} \
+        __host__ __device__ __forceinline__ name ## _func(const name ## _func&) {} \
    };

 #define OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(name, func) \
@@ -479,6 +478,8 @@ namespace cv { namespace gpu { namespace cudev
        { \
            return func ## f(v1, v2); \
        } \
+        __host__ __device__ __forceinline__ name ## _func() {} \
+        __host__ __device__ __forceinline__ name ## _func(const name ## _func&) {} \
    }; \
    template <> struct name ## _func<double> : binary_function<double, double, double> \
    { \
@@ -486,6 +487,8 @@ namespace cv { namespace gpu { namespace cudev
        { \
            return func(v1, v2); \
        } \
+        __host__ __device__ __forceinline__ name ## _func() {} \
+        __host__ __device__ __forceinline__ name ## _func(const name ## _func&) {} \
    };

    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(sqrt, ::sqrt)
@@ -522,8 +525,8 @@ namespace cv { namespace gpu { namespace cudev
        {
            return src1 * src1 + src2 * src2;
        }
-        __device__ __forceinline__ hypot_sqr_func(const hypot_sqr_func& other) : binary_function<T, T, float>(){}
-        __device__ __forceinline__ hypot_sqr_func() : binary_function<T, T, float>(){}
+        __host__ __device__ __forceinline__ hypot_sqr_func() {}
+        __host__ __device__ __forceinline__ hypot_sqr_func(const hypot_sqr_func&) {}
    };

    // Saturate Cast Functor
@@ -533,8 +536,8 @@ namespace cv { namespace gpu { namespace cudev
        {
            return saturate_cast<D>(v);
        }
-        __device__ __forceinline__ saturate_cast_func(const saturate_cast_func& other):unary_function<T, D>(){}
-        __device__ __forceinline__ saturate_cast_func():unary_function<T, D>(){}
+        __host__ __device__ __forceinline__ saturate_cast_func() {}
+        __host__ __device__ __forceinline__ saturate_cast_func(const saturate_cast_func&) {}
    };

    // Threshold Functors
@@ -547,10 +550,9 @@ namespace cv { namespace gpu { namespace cudev
            return (src > thresh) * maxVal;
        }

-        __device__ __forceinline__ thresh_binary_func(const thresh_binary_func& other)
-            : unary_function<T, T>(), thresh(other.thresh), maxVal(other.maxVal){}
-
-        __device__ __forceinline__ thresh_binary_func():unary_function<T, T>(){}
+        __host__ __device__ __forceinline__ thresh_binary_func() {}
+        __host__ __device__ __forceinline__ thresh_binary_func(const thresh_binary_func& other)
+            : thresh(other.thresh), maxVal(other.maxVal) {}

        const T thresh;
        const T maxVal;
@@ -565,10 +567,9 @@ namespace cv { namespace gpu { namespace cudev
            return (src <= thresh) * maxVal;
        }

-        __device__ __forceinline__ thresh_binary_inv_func(const thresh_binary_inv_func& other)
-            : unary_function<T, T>(), thresh(other.thresh), maxVal(other.maxVal){}
-
-        __device__ __forceinline__ thresh_binary_inv_func():unary_function<T, T>(){}
+        __host__ __device__ __forceinline__ thresh_binary_inv_func() {}
+        __host__ __device__ __forceinline__ thresh_binary_inv_func(const thresh_binary_inv_func& other)
+            : thresh(other.thresh), maxVal(other.maxVal) {}

        const T thresh;
        const T maxVal;
@@ -583,10 +584,9 @@ namespace cv { namespace gpu { namespace cudev
            return minimum<T>()(src, thresh);
        }

-        __device__ __forceinline__ thresh_trunc_func(const thresh_trunc_func& other)
-            : unary_function<T, T>(), thresh(other.thresh){}
-
-        __device__ __forceinline__ thresh_trunc_func():unary_function<T, T>(){}
+        __host__ __device__ __forceinline__ thresh_trunc_func() {}
+        __host__ __device__ __forceinline__ thresh_trunc_func(const thresh_trunc_func& other)
+            : thresh(other.thresh) {}

        const T thresh;
    };
@@ -599,10 +599,10 @@ namespace cv { namespace gpu { namespace cudev
        {
            return (src > thresh) * src;
        }
-        __device__ __forceinline__ thresh_to_zero_func(const thresh_to_zero_func& other)
-            : unary_function<T, T>(), thresh(other.thresh){}

-        __device__ __forceinline__ thresh_to_zero_func():unary_function<T, T>(){}
+        __host__ __device__ __forceinline__ thresh_to_zero_func() {}
+       __host__  __device__ __forceinline__ thresh_to_zero_func(const thresh_to_zero_func& other)
+            : thresh(other.thresh) {}

        const T thresh;
    };
@@ -615,14 +615,14 @@ namespace cv { namespace gpu { namespace cudev
        {
            return (src <= thresh) * src;
        }
-        __device__ __forceinline__ thresh_to_zero_inv_func(const thresh_to_zero_inv_func& other)
-            : unary_function<T, T>(), thresh(other.thresh){}

-        __device__ __forceinline__ thresh_to_zero_inv_func():unary_function<T, T>(){}
+        __host__ __device__ __forceinline__ thresh_to_zero_inv_func() {}
+        __host__ __device__ __forceinline__ thresh_to_zero_inv_func(const thresh_to_zero_inv_func& other)
+            : thresh(other.thresh) {}

        const T thresh;
    };
-//bound!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ============>
+
    // Function Object Adaptors
    template <typename Predicate> struct unary_negate : unary_function<typename Predicate::argument_type, bool>
    {
@@ -633,8 +633,8 @@ namespace cv { namespace gpu { namespace cudev
          return !pred(x);
      }

-        __device__ __forceinline__ unary_negate(const unary_negate& other) : unary_function<typename Predicate::argument_type, bool>(){}
-        __device__ __forceinline__ unary_negate() : unary_function<typename Predicate::argument_type, bool>(){}
+      __host__ __device__ __forceinline__ unary_negate() {}
+      __host__ __device__ __forceinline__ unary_negate(const unary_negate& other) : pred(other.pred) {}

      const Predicate pred;
    };
@@ -653,11 +653,9 @@ namespace cv { namespace gpu { namespace cudev
        {
            return !pred(x,y);
        }
-        __device__ __forceinline__ binary_negate(const binary_negate& other)
-        : binary_function<typename Predicate::first_argument_type, typename Predicate::second_argument_type, bool>(){}

-        __device__ __forceinline__ binary_negate() :
-        binary_function<typename Predicate::first_argument_type, typename Predicate::second_argument_type, bool>(){}
+        __host__ __device__ __forceinline__ binary_negate() {}
+        __host__ __device__ __forceinline__ binary_negate(const binary_negate& other) : pred(other.pred) {}

        const Predicate pred;
    };
@@ -676,8 +674,8 @@ namespace cv { namespace gpu { namespace cudev
            return op(arg1, a);
        }

-        __device__ __forceinline__ binder1st(const binder1st& other) :
-        unary_function<typename Op::second_argument_type, typename Op::result_type>(){}
+        __host__ __device__ __forceinline__ binder1st() {}
+        __host__ __device__ __forceinline__ binder1st(const binder1st& other) : op(other.op), arg1(other.arg1) {}

        const Op op;
        const typename Op::first_argument_type arg1;
@@ -697,8 +695,8 @@ namespace cv { namespace gpu { namespace cudev
            return op(a, arg2);
        }

-         __device__ __forceinline__ binder2nd(const binder2nd& other) :
-        unary_function<typename Op::first_argument_type, typename Op::result_type>(), op(other.op), arg2(other.arg2){}
+        __host__ __device__ __forceinline__ binder2nd() {}
+        __host__ __device__ __forceinline__ binder2nd(const binder2nd& other) : op(other.op), arg2(other.arg2) {}

        const Op op;
        const typename Op::second_argument_type arg2;
--- a/modules/core/include/opencv2/core/cuda/utility.hpp
+++ b/modules/core/include/opencv2/core/cuda/utility.hpp
@@ -124,8 +124,8 @@ namespace cv { namespace gpu { namespace cudev

    struct WithOutMask
    {
-        __device__ __forceinline__ WithOutMask(){}
-        __device__ __forceinline__ WithOutMask(const WithOutMask& mask){}
+        __host__ __device__ __forceinline__ WithOutMask(){}
+        __host__ __device__ __forceinline__ WithOutMask(const WithOutMask&){}

        __device__ __forceinline__ void next() const
        {
--- a/modules/features2d/src/detectors.cpp
+++ b/modules/features2d/src/detectors.cpp
@@ -212,7 +212,7 @@ static void keepStrongest( int N, std::vector<KeyPoint>& keypoints )
 }

 namespace {
-class GridAdaptedFeatureDetectorInvoker
+class GridAdaptedFeatureDetectorInvoker : public ParallelLoopBody
 {
 private:
    int gridRows_, gridCols_;
@@ -221,29 +221,24 @@ private:
    const Mat& image_;
    const Mat& mask_;
    const Ptr<FeatureDetector>& detector_;
-#ifdef HAVE_TBB
-    tbb::mutex* kptLock_;
-#endif
+    Mutex* kptLock_;

    GridAdaptedFeatureDetectorInvoker& operator=(const GridAdaptedFeatureDetectorInvoker&); // to quiet MSVC

 public:

-    GridAdaptedFeatureDetectorInvoker(const Ptr<FeatureDetector>& detector, const Mat& image, const Mat& mask, std::vector<KeyPoint>& keypoints, int maxPerCell, int gridRows, int gridCols
-#ifdef HAVE_TBB
-        , tbb::mutex* kptLock
-#endif
-        ) : gridRows_(gridRows), gridCols_(gridCols), maxPerCell_(maxPerCell),
-            keypoints_(keypoints), image_(image), mask_(mask), detector_(detector)
-#ifdef HAVE_TBB
-            , kptLock_(kptLock)
-#endif
+    GridAdaptedFeatureDetectorInvoker(const Ptr<FeatureDetector>& detector, const Mat& image, const Mat& mask,
+                                      std::vector<KeyPoint>& keypoints, int maxPerCell, int gridRows, int gridCols,
+                                      cv::Mutex* kptLock)
+        : gridRows_(gridRows), gridCols_(gridCols), maxPerCell_(maxPerCell),
+          keypoints_(keypoints), image_(image), mask_(mask), detector_(detector),
+          kptLock_(kptLock)
    {
    }

-    void operator() (const BlockedRange& range) const
+    void operator() (const Range& range) const
    {
-        for (int i = range.begin(); i < range.end(); ++i)
+        for (int i = range.start; i < range.end; ++i)
        {
            int celly = i / gridCols_;
            int cellx = i - celly * gridCols_;
@@ -268,9 +263,8 @@ public:
                it->pt.x += col_range.start;
                it->pt.y += row_range.start;
            }
-#ifdef HAVE_TBB
-            tbb::mutex::scoped_lock join_keypoints(*kptLock_);
-#endif
+
+            cv::AutoLock join_keypoints(*kptLock_);
            keypoints_.insert( keypoints_.end(), sub_keypoints.begin(), sub_keypoints.end() );
        }
    }
@@ -287,13 +281,9 @@ void GridAdaptedFeatureDetector::detectImpl( const Mat& image, std::vector<KeyPo
    keypoints.reserve(maxTotalKeypoints);
    int maxPerCell = maxTotalKeypoints / (gridRows * gridCols);

-#ifdef HAVE_TBB
-    tbb::mutex kptLock;
-    cv::parallel_for(cv::BlockedRange(0, gridRows * gridCols),
+    cv::Mutex kptLock;
+    cv::parallel_for_(cv::Range(0, gridRows * gridCols),
        GridAdaptedFeatureDetectorInvoker(detector, image, mask, keypoints, maxPerCell, gridRows, gridCols, &kptLock));
-#else
-    GridAdaptedFeatureDetectorInvoker(detector, image, mask, keypoints, maxPerCell, gridRows, gridCols)(cv::BlockedRange(0, gridRows * gridCols));
-#endif
 }

 /*
--- a/modules/gpu/CMakeLists.txt
+++ b/modules/gpu/CMakeLists.txt
@@ -38,7 +38,7 @@ if(HAVE_CUDA)

  ocv_cuda_compile(cuda_objs ${lib_cuda} ${ncv_cuda})

-  set(cuda_link_libs ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})
+  set(cuda_link_libs ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY} ${CUDA_npp_LIBRARY})

  if(WITH_NVCUVID)
    set(cuda_link_libs ${cuda_link_libs} ${CUDA_nvcuvid_LIBRARY})
--- a/modules/gpu/src/calib3d.cpp
+++ b/modules/gpu/src/calib3d.cpp
@@ -150,7 +150,7 @@ namespace
    }

    // Computes rotation, translation pair for small subsets if the input data
-    class TransformHypothesesGenerator
+    class TransformHypothesesGenerator : public ParallelLoopBody
    {
    public:
        TransformHypothesesGenerator(const Mat& object_, const Mat& image_, const Mat& dist_coef_,
@@ -160,7 +160,7 @@ namespace
                  num_points(num_points_), subset_size(subset_size_), rot_matrices(rot_matrices_),
                  transl_vectors(transl_vectors_) {}

-        void operator()(const BlockedRange& range) const
+        void operator()(const Range& range) const
        {
            // Input data for generation of the current hypothesis
            std::vector<int> subset_indices(subset_size);
@@ -172,7 +172,7 @@ namespace
            Mat rot_mat(3, 3, CV_64F);
            Mat transl_vec(1, 3, CV_64F);

-            for (int iter = range.begin(); iter < range.end(); ++iter)
+            for (int iter = range.start; iter < range.end; ++iter)
            {
                selectRandom(subset_size, num_points, subset_indices);
                for (int i = 0; i < subset_size; ++i)
@@ -238,7 +238,7 @@ void cv::gpu::solvePnPRansac(const Mat& object, const Mat& image, const Mat& cam
    // Generate set of hypotheses using small subsets of the input data
    TransformHypothesesGenerator body(object, image_normalized, empty_dist_coef, eye_camera_mat,
                                      num_points, subset_size, rot_matrices, transl_vectors);
-    parallel_for(BlockedRange(0, num_iters), body);
+    parallel_for_(Range(0, num_iters), body);

    // Compute scores (i.e. number of inliers) for each hypothesis
    GpuMat d_object(object);
--- a/modules/gpu/src/cuda/calib3d.cu
+++ b/modules/gpu/src/cuda/calib3d.cu
@@ -67,8 +67,8 @@ namespace cv { namespace gpu { namespace cudev
                        crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y,
                        crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z);
            }
-            __device__ __forceinline__ TransformOp() {}
-            __device__ __forceinline__ TransformOp(const TransformOp&) {}
+            __host__ __device__ __forceinline__ TransformOp() {}
+            __host__ __device__ __forceinline__ TransformOp(const TransformOp&) {}
        };

        void call(const PtrStepSz<float3> src, const float* rot,
@@ -106,8 +106,8 @@ namespace cv { namespace gpu { namespace cudev
                        (cproj0.x * t.x + cproj0.y * t.y) / t.z + cproj0.z,
                        (cproj1.x * t.x + cproj1.y * t.y) / t.z + cproj1.z);
            }
-            __device__ __forceinline__ ProjectOp() {}
-            __device__ __forceinline__ ProjectOp(const ProjectOp&) {}
+            __host__ __device__ __forceinline__ ProjectOp() {}
+            __host__ __device__ __forceinline__ ProjectOp(const ProjectOp&) {}
        };

        void call(const PtrStepSz<float3> src, const float* rot,
--- a/modules/gpu/src/cuda/canny.cu
+++ b/modules/gpu/src/cuda/canny.cu
@@ -62,8 +62,8 @@ namespace canny
            return ::abs(x) + ::abs(y);
        }

-        __device__ __forceinline__ L1() {}
-        __device__ __forceinline__ L1(const L1&) {}
+        __host__ __device__ __forceinline__ L1() {}
+        __host__ __device__ __forceinline__ L1(const L1&) {}
    };
    struct L2 : binary_function<int, int, float>
    {
@@ -72,8 +72,8 @@ namespace canny
            return ::sqrtf(x * x + y * y);
        }

-        __device__ __forceinline__ L2() {}
-        __device__ __forceinline__ L2(const L2&) {}
+        __host__ __device__ __forceinline__ L2() {}
+        __host__ __device__ __forceinline__ L2(const L2&) {}
    };
 }

@@ -470,8 +470,8 @@ namespace canny
            return (uchar)(-(e >> 1));
        }

-        __device__ __forceinline__ GetEdges() {}
-        __device__ __forceinline__ GetEdges(const GetEdges&) {}
+        __host__ __device__ __forceinline__ GetEdges() {}
+        __host__ __device__ __forceinline__ GetEdges(const GetEdges&) {}
    };
 }

--- a/modules/gpu/src/cuda/element_operations.cu
+++ b/modules/gpu/src/cuda/element_operations.cu
@@ -162,8 +162,8 @@ namespace arithm
            return vadd4(a, b);
        }

-        __device__ __forceinline__ VAdd4() {}
-        __device__ __forceinline__ VAdd4(const VAdd4& other) {}
+        __host__ __device__ __forceinline__ VAdd4() {}
+        __host__ __device__ __forceinline__ VAdd4(const VAdd4&) {}
    };

    ////////////////////////////////////
@@ -175,8 +175,8 @@ namespace arithm
            return vadd2(a, b);
        }

-        __device__ __forceinline__ VAdd2() {}
-        __device__ __forceinline__ VAdd2(const VAdd2& other) {}
+        __host__ __device__ __forceinline__ VAdd2() {}
+        __host__ __device__ __forceinline__ VAdd2(const VAdd2&) {}
    };

    ////////////////////////////////////
@@ -188,8 +188,8 @@ namespace arithm
            return saturate_cast<D>(a + b);
        }

-        __device__ __forceinline__ AddMat() {}
-        __device__ __forceinline__ AddMat(const AddMat& other) {}
+        __host__ __device__ __forceinline__ AddMat() {}
+        __host__ __device__ __forceinline__ AddMat(const AddMat&) {}
    };
 }

@@ -397,8 +397,8 @@ namespace arithm
            return vsub4(a, b);
        }

-        __device__ __forceinline__ VSub4() {}
-        __device__ __forceinline__ VSub4(const VSub4& other) {}
+        __host__ __device__ __forceinline__ VSub4() {}
+        __host__ __device__ __forceinline__ VSub4(const VSub4&) {}
    };

    ////////////////////////////////////
@@ -410,8 +410,8 @@ namespace arithm
            return vsub2(a, b);
        }

-        __device__ __forceinline__ VSub2() {}
-        __device__ __forceinline__ VSub2(const VSub2& other) {}
+        __host__ __device__ __forceinline__ VSub2() {}
+        __host__ __device__ __forceinline__ VSub2(const VSub2&) {}
    };

    ////////////////////////////////////
@@ -423,8 +423,8 @@ namespace arithm
            return saturate_cast<D>(a - b);
        }

-        __device__ __forceinline__ SubMat() {}
-        __device__ __forceinline__ SubMat(const SubMat& other) {}
+        __host__ __device__ __forceinline__ SubMat() {}
+        __host__ __device__ __forceinline__ SubMat(const SubMat&) {}
    };
 }

@@ -617,8 +617,8 @@ namespace arithm
            return res;
        }

-        __device__ __forceinline__ Mul_8uc4_32f() {}
-        __device__ __forceinline__ Mul_8uc4_32f(const Mul_8uc4_32f& other) {}
+        __host__ __device__ __forceinline__ Mul_8uc4_32f() {}
+        __host__ __device__ __forceinline__ Mul_8uc4_32f(const Mul_8uc4_32f&) {}
    };

    struct Mul_16sc4_32f : binary_function<short4, float, short4>
@@ -629,8 +629,8 @@ namespace arithm
                               saturate_cast<short>(a.z * b), saturate_cast<short>(a.w * b));
        }

-        __device__ __forceinline__ Mul_16sc4_32f() {}
-        __device__ __forceinline__ Mul_16sc4_32f(const Mul_16sc4_32f& other) {}
+        __host__ __device__ __forceinline__ Mul_16sc4_32f() {}
+        __host__ __device__ __forceinline__ Mul_16sc4_32f(const Mul_16sc4_32f&) {}
    };

    template <typename T, typename D> struct Mul : binary_function<T, T, D>
@@ -640,8 +640,8 @@ namespace arithm
            return saturate_cast<D>(a * b);
        }

-        __device__ __forceinline__ Mul() {}
-        __device__ __forceinline__ Mul(const Mul& other) {}
+        __host__ __device__ __forceinline__ Mul() {}
+        __host__ __device__ __forceinline__ Mul(const Mul&) {}
    };

    template <typename T, typename S, typename D> struct MulScale : binary_function<T, T, D>
@@ -888,8 +888,8 @@ namespace arithm
            return b != 0 ? saturate_cast<D>(a / b) : 0;
        }

-        __device__ __forceinline__ Div() {}
-        __device__ __forceinline__ Div(const Div& other) {}
+        __host__ __device__ __forceinline__ Div() {}
+        __host__ __device__ __forceinline__ Div(const Div&) {}
    };
    template <typename T> struct Div<T, float> : binary_function<T, T, float>
    {
@@ -898,8 +898,8 @@ namespace arithm
            return b != 0 ? static_cast<float>(a) / b : 0;
        }

-        __device__ __forceinline__ Div() {}
-        __device__ __forceinline__ Div(const Div& other) {}
+        __host__ __device__ __forceinline__ Div() {}
+        __host__ __device__ __forceinline__ Div(const Div&) {}
    };
    template <typename T> struct Div<T, double> : binary_function<T, T, double>
    {
@@ -908,8 +908,8 @@ namespace arithm
            return b != 0 ? static_cast<double>(a) / b : 0;
        }

-        __device__ __forceinline__ Div() {}
-        __device__ __forceinline__ Div(const Div& other) {}
+        __host__ __device__ __forceinline__ Div() {}
+        __host__ __device__ __forceinline__ Div(const Div&) {}
    };

    template <typename T, typename S, typename D> struct DivScale : binary_function<T, T, D>
@@ -1196,8 +1196,8 @@ namespace arithm
            return vabsdiff4(a, b);
        }

-        __device__ __forceinline__ VAbsDiff4() {}
-        __device__ __forceinline__ VAbsDiff4(const VAbsDiff4& other) {}
+        __host__ __device__ __forceinline__ VAbsDiff4() {}
+        __host__ __device__ __forceinline__ VAbsDiff4(const VAbsDiff4&) {}
    };

    ////////////////////////////////////
@@ -1209,8 +1209,8 @@ namespace arithm
            return vabsdiff2(a, b);
        }

-        __device__ __forceinline__ VAbsDiff2() {}
-        __device__ __forceinline__ VAbsDiff2(const VAbsDiff2& other) {}
+        __host__ __device__ __forceinline__ VAbsDiff2() {}
+        __host__ __device__ __forceinline__ VAbsDiff2(const VAbsDiff2&) {}
    };

    ////////////////////////////////////
@@ -1235,8 +1235,8 @@ namespace arithm
            return saturate_cast<T>(_abs(a - b));
        }

-        __device__ __forceinline__ AbsDiffMat() {}
-        __device__ __forceinline__ AbsDiffMat(const AbsDiffMat& other) {}
+        __host__ __device__ __forceinline__ AbsDiffMat() {}
+        __host__ __device__ __forceinline__ AbsDiffMat(const AbsDiffMat&) {}
    };
 }

@@ -1370,8 +1370,8 @@ namespace arithm
            return saturate_cast<T>(x * x);
        }

-        __device__ __forceinline__ Sqr() {}
-        __device__ __forceinline__ Sqr(const Sqr& other) {}
+        __host__ __device__ __forceinline__ Sqr() {}
+        __host__ __device__ __forceinline__ Sqr(const Sqr&) {}
    };
 }

@@ -1466,8 +1466,8 @@ namespace arithm
            return saturate_cast<T>(f(x));
        }

-        __device__ __forceinline__ Exp() {}
-        __device__ __forceinline__ Exp(const Exp& other) {}
+        __host__ __device__ __forceinline__ Exp() {}
+        __host__ __device__ __forceinline__ Exp(const Exp&) {}
    };
 }

@@ -1507,8 +1507,8 @@ namespace arithm
            return vcmpeq4(a, b);
        }

-        __device__ __forceinline__ VCmpEq4() {}
-        __device__ __forceinline__ VCmpEq4(const VCmpEq4& other) {}
+        __host__ __device__ __forceinline__ VCmpEq4() {}
+        __host__ __device__ __forceinline__ VCmpEq4(const VCmpEq4&) {}
    };
    struct VCmpNe4 : binary_function<uint, uint, uint>
    {
@@ -1517,8 +1517,8 @@ namespace arithm
            return vcmpne4(a, b);
        }

-        __device__ __forceinline__ VCmpNe4() {}
-        __device__ __forceinline__ VCmpNe4(const VCmpNe4& other) {}
+        __host__ __device__ __forceinline__ VCmpNe4() {}
+        __host__ __device__ __forceinline__ VCmpNe4(const VCmpNe4&) {}
    };
    struct VCmpLt4 : binary_function<uint, uint, uint>
    {
@@ -1527,8 +1527,8 @@ namespace arithm
            return vcmplt4(a, b);
        }

-        __device__ __forceinline__ VCmpLt4() {}
-        __device__ __forceinline__ VCmpLt4(const VCmpLt4& other) {}
+        __host__ __device__ __forceinline__ VCmpLt4() {}
+        __host__ __device__ __forceinline__ VCmpLt4(const VCmpLt4&) {}
    };
    struct VCmpLe4 : binary_function<uint, uint, uint>
    {
@@ -1537,8 +1537,8 @@ namespace arithm
            return vcmple4(a, b);
        }

-        __device__ __forceinline__ VCmpLe4() {}
-        __device__ __forceinline__ VCmpLe4(const VCmpLe4& other) {}
+        __host__ __device__ __forceinline__ VCmpLe4() {}
+        __host__ __device__ __forceinline__ VCmpLe4(const VCmpLe4&) {}
    };

    ////////////////////////////////////
@@ -2008,8 +2008,8 @@ namespace arithm
            return vmin4(a, b);
        }

-        __device__ __forceinline__ VMin4() {}
-        __device__ __forceinline__ VMin4(const VMin4& other) {}
+        __host__ __device__ __forceinline__ VMin4() {}
+        __host__ __device__ __forceinline__ VMin4(const VMin4&) {}
    };

    ////////////////////////////////////
@@ -2021,8 +2021,8 @@ namespace arithm
            return vmin2(a, b);
        }

-        __device__ __forceinline__ VMin2() {}
-        __device__ __forceinline__ VMin2(const VMin2& other) {}
+        __host__ __device__ __forceinline__ VMin2() {}
+        __host__ __device__ __forceinline__ VMin2(const VMin2&) {}
    };
 }

@@ -2100,8 +2100,8 @@ namespace arithm
            return vmax4(a, b);
        }

-        __device__ __forceinline__ VMax4() {}
-        __device__ __forceinline__ VMax4(const VMax4& other) {}
+        __host__ __device__ __forceinline__ VMax4() {}
+        __host__ __device__ __forceinline__ VMax4(const VMax4&) {}
    };

    ////////////////////////////////////
@@ -2113,8 +2113,8 @@ namespace arithm
            return vmax2(a, b);
        }

-        __device__ __forceinline__ VMax2() {}
-        __device__ __forceinline__ VMax2(const VMax2& other) {}
+        __host__ __device__ __forceinline__ VMax2() {}
+        __host__ __device__ __forceinline__ VMax2(const VMax2&) {}
    };
 }

--- a/modules/gpu/src/matrix_reductions.cpp
+++ b/modules/gpu/src/matrix_reductions.cpp
@@ -188,10 +188,20 @@ double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType)
    CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
    CV_Assert(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2);

-    typedef NppStatus (*npp_norm_diff_func_t)(const Npp8u* pSrc1, int nSrcStep1, const Npp8u* pSrc2, int nSrcStep2,
-        NppiSize oSizeROI, Npp64f* pRetVal);
+#if CUDA_VERSION < 5050
+    typedef NppStatus (*func_t)(const Npp8u* pSrc1, int nSrcStep1, const Npp8u* pSrc2, int nSrcStep2, NppiSize oSizeROI, Npp64f* pRetVal);

-    static const npp_norm_diff_func_t npp_norm_diff_func[] = {nppiNormDiff_Inf_8u_C1R, nppiNormDiff_L1_8u_C1R, nppiNormDiff_L2_8u_C1R};
+    static const func_t funcs[] = {nppiNormDiff_Inf_8u_C1R, nppiNormDiff_L1_8u_C1R, nppiNormDiff_L2_8u_C1R};
+#else
+    typedef NppStatus (*func_t)(const Npp8u* pSrc1, int nSrcStep1, const Npp8u* pSrc2, int nSrcStep2,
+        NppiSize oSizeROI, Npp64f* pRetVal, Npp8u * pDeviceBuffer);
+
+    typedef NppStatus (*buf_size_func_t)(NppiSize oSizeROI, int* hpBufferSize);
+
+    static const func_t funcs[] = {nppiNormDiff_Inf_8u_C1R, nppiNormDiff_L1_8u_C1R, nppiNormDiff_L2_8u_C1R};
+
+    static const buf_size_func_t buf_size_funcs[] = {nppiNormDiffInfGetBufferHostSize_8u_C1R, nppiNormDiffL1GetBufferHostSize_8u_C1R, nppiNormDiffL2GetBufferHostSize_8u_C1R};
+#endif

    NppiSize sz;
    sz.width  = src1.cols;
@@ -203,7 +213,16 @@ double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType)

    DeviceBuffer dbuf;

-    nppSafeCall( npp_norm_diff_func[funcIdx](src1.ptr<Npp8u>(), static_cast<int>(src1.step), src2.ptr<Npp8u>(), static_cast<int>(src2.step), sz, dbuf) );
+#if CUDA_VERSION < 5050
+    nppSafeCall( funcs[funcIdx](src1.ptr<Npp8u>(), static_cast<int>(src1.step), src2.ptr<Npp8u>(), static_cast<int>(src2.step), sz, dbuf) );
+#else
+    int bufSize;
+    buf_size_funcs[funcIdx](sz, &bufSize);
+
+    GpuMat buf(1, bufSize, CV_8UC1);
+
+    nppSafeCall( funcs[funcIdx](src1.ptr<Npp8u>(), static_cast<int>(src1.step), src2.ptr<Npp8u>(), static_cast<int>(src2.step), sz, dbuf, buf.data) );
+#endif

    cudaSafeCall( cudaDeviceSynchronize() );

--- a/modules/gpu/test/test_core.cpp
+++ b/modules/gpu/test/test_core.cpp
@@ -352,7 +352,7 @@ GPU_TEST_P(Add_Scalar, WithOutMask)
        cv::Mat dst_gold(size, depth.second, cv::Scalar::all(0));
        cv::add(mat, val, dst_gold, cv::noArray(), depth.second);

-        EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 1e-4 : 0.0);
+        EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 1e-4 : 1.0);
    }
 }

@@ -383,7 +383,7 @@ GPU_TEST_P(Add_Scalar, WithMask)
        cv::Mat dst_gold(size, depth.second, cv::Scalar::all(0));
        cv::add(mat, val, dst_gold, mask, depth.second);

-        EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 1e-4 : 0.0);
+        EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 1e-4 : 1.0);
    }
 }

@@ -567,7 +567,7 @@ GPU_TEST_P(Subtract_Scalar, WithOutMask)
        cv::Mat dst_gold(size, depth.second, cv::Scalar::all(0));
        cv::subtract(mat, val, dst_gold, cv::noArray(), depth.second);

-        EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 1e-4 : 0.0);
+        EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 1e-4 : 1.0);
    }
 }

@@ -598,7 +598,7 @@ GPU_TEST_P(Subtract_Scalar, WithMask)
        cv::Mat dst_gold(size, depth.second, cv::Scalar::all(0));
        cv::subtract(mat, val, dst_gold, mask, depth.second);

-        EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 1e-4 : 0.0);
+        EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 1e-4 : 1.0);
    }
 }

@@ -2148,7 +2148,7 @@ GPU_TEST_P(Min, Scalar)

        cv::Mat dst_gold = cv::min(src, val);

-        EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
+        EXPECT_MAT_NEAR(dst_gold, dst, depth < CV_32F ? 1.0 : 1e-5);
    }
 }

@@ -2231,7 +2231,7 @@ GPU_TEST_P(Max, Scalar)

        cv::Mat dst_gold = cv::max(src, val);

-        EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
+        EXPECT_MAT_NEAR(dst_gold, dst, depth < CV_32F ? 1.0 : 1e-5);
    }
 }

--- a/modules/imgproc/doc/miscellaneous_transformations.rst
+++ b/modules/imgproc/doc/miscellaneous_transformations.rst
@@ -113,6 +113,7 @@ If you use ``cvtColor`` with 8-bit images, the conversion will have some informa
 The function can do the following transformations:

 *
+    RGB :math:`\leftrightarrow` GRAY ( ``CV_BGR2GRAY, CV_RGB2GRAY, CV_GRAY2BGR, CV_GRAY2RGB``     )
    Transformations within RGB space like adding/removing the alpha channel, reversing the channel order, conversion to/from 16-bit RGB color (R5:G6:B5 or R5:G5:B5), as well as conversion to/from grayscale using:

    .. math::
@@ -755,7 +756,7 @@ Runs the GrabCut algorithm.

        * **GC_PR_BGD** defines a possible background pixel.

-        * **GC_PR_BGD** defines a possible foreground pixel.
+        * **GC_PR_FGD** defines a possible foreground pixel.

    :param rect: ROI containing a segmented object. The pixels outside of the ROI are marked as "obvious background". The parameter is only used when  ``mode==GC_INIT_WITH_RECT`` .

--- a/modules/imgproc/src/clahe.cpp
+++ b/modules/imgproc/src/clahe.cpp
@@ -0,0 +1,334 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2013, NVIDIA Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the copyright holders or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+// ----------------------------------------------------------------------
+// CLAHE
+
+namespace
+{
+    class CLAHE_CalcLut_Body : public cv::ParallelLoopBody
+    {
+    public:
+        CLAHE_CalcLut_Body(const cv::Mat& src, cv::Mat& lut, cv::Size tileSize, int tilesX, int tilesY, int clipLimit, float lutScale) :
+            src_(src), lut_(lut), tileSize_(tileSize), tilesX_(tilesX), tilesY_(tilesY), clipLimit_(clipLimit), lutScale_(lutScale)
+        {
+        }
+
+        void operator ()(const cv::Range& range) const;
+
+    private:
+        cv::Mat src_;
+        mutable cv::Mat lut_;
+
+        cv::Size tileSize_;
+        int tilesX_;
+        int tilesY_;
+        int clipLimit_;
+        float lutScale_;
+    };
+
+    void CLAHE_CalcLut_Body::operator ()(const cv::Range& range) const
+    {
+        const int histSize = 256;
+
+        uchar* tileLut = lut_.ptr(range.start);
+        const size_t lut_step = lut_.step;
+
+        for (int k = range.start; k < range.end; ++k, tileLut += lut_step)
+        {
+            const int ty = k / tilesX_;
+            const int tx = k % tilesX_;
+
+            // retrieve tile submatrix
+
+            cv::Rect tileROI;
+            tileROI.x = tx * tileSize_.width;
+            tileROI.y = ty * tileSize_.height;
+            tileROI.width = tileSize_.width;
+            tileROI.height = tileSize_.height;
+
+            const cv::Mat tile = src_(tileROI);
+
+            // calc histogram
+
+            int tileHist[histSize] = {0, };
+
+            int height = tileROI.height;
+            const size_t sstep = tile.step;
+            for (const uchar* ptr = tile.ptr<uchar>(0); height--; ptr += sstep)
+            {
+                int x = 0;
+                for (; x <= tileROI.width - 4; x += 4)
+                {
+                    int t0 = ptr[x], t1 = ptr[x+1];
+                    tileHist[t0]++; tileHist[t1]++;
+                    t0 = ptr[x+2]; t1 = ptr[x+3];
+                    tileHist[t0]++; tileHist[t1]++;
+                }
+
+                for (; x < tileROI.width; ++x)
+                    tileHist[ptr[x]]++;
+            }
+
+            // clip histogram
+
+            if (clipLimit_ > 0)
+            {
+                // how many pixels were clipped
+                int clipped = 0;
+                for (int i = 0; i < histSize; ++i)
+                {
+                    if (tileHist[i] > clipLimit_)
+                    {
+                        clipped += tileHist[i] - clipLimit_;
+                        tileHist[i] = clipLimit_;
+                    }
+                }
+
+                // redistribute clipped pixels
+                int redistBatch = clipped / histSize;
+                int residual = clipped - redistBatch * histSize;
+
+                for (int i = 0; i < histSize; ++i)
+                    tileHist[i] += redistBatch;
+
+                for (int i = 0; i < residual; ++i)
+                    tileHist[i]++;
+            }
+
+            // calc Lut
+
+            int sum = 0;
+            for (int i = 0; i < histSize; ++i)
+            {
+                sum += tileHist[i];
+                tileLut[i] = cv::saturate_cast<uchar>(sum * lutScale_);
+            }
+        }
+    }
+
+    class CLAHE_Interpolation_Body : public cv::ParallelLoopBody
+    {
+    public:
+        CLAHE_Interpolation_Body(const cv::Mat& src, cv::Mat& dst, const cv::Mat& lut, cv::Size tileSize, int tilesX, int tilesY) :
+            src_(src), dst_(dst), lut_(lut), tileSize_(tileSize), tilesX_(tilesX), tilesY_(tilesY)
+        {
+        }
+
+        void operator ()(const cv::Range& range) const;
+
+    private:
+        cv::Mat src_;
+        mutable cv::Mat dst_;
+        cv::Mat lut_;
+
+        cv::Size tileSize_;
+        int tilesX_;
+        int tilesY_;
+    };
+
+    void CLAHE_Interpolation_Body::operator ()(const cv::Range& range) const
+    {
+        const size_t lut_step = lut_.step;
+
+        for (int y = range.start; y < range.end; ++y)
+        {
+            const uchar* srcRow = src_.ptr<uchar>(y);
+            uchar* dstRow = dst_.ptr<uchar>(y);
+
+            const float tyf = (static_cast<float>(y) / tileSize_.height) - 0.5f;
+
+            int ty1 = cvFloor(tyf);
+            int ty2 = ty1 + 1;
+
+            const float ya = tyf - ty1;
+
+            ty1 = std::max(ty1, 0);
+            ty2 = std::min(ty2, tilesY_ - 1);
+
+            const uchar* lutPlane1 = lut_.ptr(ty1 * tilesX_);
+            const uchar* lutPlane2 = lut_.ptr(ty2 * tilesX_);
+
+            for (int x = 0; x < src_.cols; ++x)
+            {
+                const float txf = (static_cast<float>(x) / tileSize_.width) - 0.5f;
+
+                int tx1 = cvFloor(txf);
+                int tx2 = tx1 + 1;
+
+                const float xa = txf - tx1;
+
+                tx1 = std::max(tx1, 0);
+                tx2 = std::min(tx2, tilesX_ - 1);
+
+                const int srcVal = srcRow[x];
+
+                const size_t ind1 = tx1 * lut_step + srcVal;
+                const size_t ind2 = tx2 * lut_step + srcVal;
+
+                float res = 0;
+
+                res += lutPlane1[ind1] * ((1.0f - xa) * (1.0f - ya));
+                res += lutPlane1[ind2] * ((xa) * (1.0f - ya));
+                res += lutPlane2[ind1] * ((1.0f - xa) * (ya));
+                res += lutPlane2[ind2] * ((xa) * (ya));
+
+                dstRow[x] = cv::saturate_cast<uchar>(res);
+            }
+        }
+    }
+
+    class CLAHE_Impl : public cv::CLAHE
+    {
+    public:
+        CLAHE_Impl(double clipLimit = 40.0, int tilesX = 8, int tilesY = 8);
+
+        cv::AlgorithmInfo* info() const;
+
+        void apply(cv::InputArray src, cv::OutputArray dst);
+
+        void setClipLimit(double clipLimit);
+        double getClipLimit() const;
+
+        void setTilesGridSize(cv::Size tileGridSize);
+        cv::Size getTilesGridSize() const;
+
+        void collectGarbage();
+
+    private:
+        double clipLimit_;
+        int tilesX_;
+        int tilesY_;
+
+        cv::Mat srcExt_;
+        cv::Mat lut_;
+    };
+
+    CLAHE_Impl::CLAHE_Impl(double clipLimit, int tilesX, int tilesY) :
+        clipLimit_(clipLimit), tilesX_(tilesX), tilesY_(tilesY)
+    {
+    }
+
+    CV_INIT_ALGORITHM(CLAHE_Impl, "CLAHE",
+        obj.info()->addParam(obj, "clipLimit", obj.clipLimit_);
+        obj.info()->addParam(obj, "tilesX", obj.tilesX_);
+        obj.info()->addParam(obj, "tilesY", obj.tilesY_))
+
+    void CLAHE_Impl::apply(cv::InputArray _src, cv::OutputArray _dst)
+    {
+        cv::Mat src = _src.getMat();
+
+        CV_Assert( src.type() == CV_8UC1 );
+
+        _dst.create( src.size(), src.type() );
+        cv::Mat dst = _dst.getMat();
+
+        const int histSize = 256;
+
+        lut_.create(tilesX_ * tilesY_, histSize, CV_8UC1);
+
+        cv::Size tileSize;
+        cv::Mat srcForLut;
+
+        if (src.cols % tilesX_ == 0 && src.rows % tilesY_ == 0)
+        {
+            tileSize = cv::Size(src.cols / tilesX_, src.rows / tilesY_);
+            srcForLut = src;
+        }
+        else
+        {
+            cv::copyMakeBorder(src, srcExt_, 0, tilesY_ - (src.rows % tilesY_), 0, tilesX_ - (src.cols % tilesX_), cv::BORDER_REFLECT_101);
+
+            tileSize = cv::Size(srcExt_.cols / tilesX_, srcExt_.rows / tilesY_);
+            srcForLut = srcExt_;
+        }
+
+        const int tileSizeTotal = tileSize.area();
+        const float lutScale = static_cast<float>(histSize - 1) / tileSizeTotal;
+
+        int clipLimit = 0;
+        if (clipLimit_ > 0.0)
+        {
+            clipLimit = static_cast<int>(clipLimit_ * tileSizeTotal / histSize);
+            clipLimit = std::max(clipLimit, 1);
+        }
+
+        CLAHE_CalcLut_Body calcLutBody(srcForLut, lut_, tileSize, tilesX_, tilesY_, clipLimit, lutScale);
+        cv::parallel_for_(cv::Range(0, tilesX_ * tilesY_), calcLutBody);
+
+        CLAHE_Interpolation_Body interpolationBody(src, dst, lut_, tileSize, tilesX_, tilesY_);
+        cv::parallel_for_(cv::Range(0, src.rows), interpolationBody);
+    }
+
+    void CLAHE_Impl::setClipLimit(double clipLimit)
+    {
+        clipLimit_ = clipLimit;
+    }
+
+    double CLAHE_Impl::getClipLimit() const
+    {
+        return clipLimit_;
+    }
+
+    void CLAHE_Impl::setTilesGridSize(cv::Size tileGridSize)
+    {
+        tilesX_ = tileGridSize.width;
+        tilesY_ = tileGridSize.height;
+    }
+
+    cv::Size CLAHE_Impl::getTilesGridSize() const
+    {
+        return cv::Size(tilesX_, tilesY_);
+    }
+
+    void CLAHE_Impl::collectGarbage()
+    {
+        srcExt_.release();
+        lut_.release();
+    }
+}
+
+cv::Ptr<cv::CLAHE> cv::createCLAHE(double clipLimit, cv::Size tileGridSize)
+{
+    return new CLAHE_Impl(clipLimit, tileGridSize.width, tileGridSize.height);
+}
--- a/modules/imgproc/src/color.cpp
+++ b/modules/imgproc/src/color.cpp
@@ -1815,7 +1815,7 @@ const int ITUR_BT_601_CGV = -385875;
 const int ITUR_BT_601_CBV = -74448;

 template<int bIdx, int uIdx>
-struct YUV420sp2RGB888Invoker
+struct YUV420sp2RGB888Invoker : ParallelLoopBody
 {
    Mat* dst;
    const uchar* my1, *muv;
@@ -1824,10 +1824,10 @@ struct YUV420sp2RGB888Invoker
    YUV420sp2RGB888Invoker(Mat* _dst, int _stride, const uchar* _y1, const uchar* _uv)
        : dst(_dst), my1(_y1), muv(_uv), width(_dst->cols), stride(_stride) {}

-    void operator()(const BlockedRange& range) const
+    void operator()(const Range& range) const
    {
-        int rangeBegin = range.begin() * 2;
-        int rangeEnd = range.end() * 2;
+        int rangeBegin = range.start * 2;
+        int rangeEnd = range.end * 2;

        //R = 1.164(Y - 16) + 1.596(V - 128)
        //G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128)
@@ -1884,7 +1884,7 @@ struct YUV420sp2RGB888Invoker
 };

 template<int bIdx, int uIdx>
-struct YUV420sp2RGBA8888Invoker
+struct YUV420sp2RGBA8888Invoker : ParallelLoopBody
 {
    Mat* dst;
    const uchar* my1, *muv;
@@ -1893,10 +1893,10 @@ struct YUV420sp2RGBA8888Invoker
    YUV420sp2RGBA8888Invoker(Mat* _dst, int _stride, const uchar* _y1, const uchar* _uv)
        : dst(_dst), my1(_y1), muv(_uv), width(_dst->cols), stride(_stride) {}

-    void operator()(const BlockedRange& range) const
+    void operator()(const Range& range) const
    {
-        int rangeBegin = range.begin() * 2;
-        int rangeEnd = range.end() * 2;
+        int rangeBegin = range.start * 2;
+        int rangeEnd = range.end * 2;

        //R = 1.164(Y - 16) + 1.596(V - 128)
        //G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128)
@@ -1957,7 +1957,7 @@ struct YUV420sp2RGBA8888Invoker
 };

 template<int bIdx>
-struct YUV420p2RGB888Invoker
+struct YUV420p2RGB888Invoker : ParallelLoopBody
 {
    Mat* dst;
    const uchar* my1, *mu, *mv;
@@ -1967,19 +1967,19 @@ struct YUV420p2RGB888Invoker
    YUV420p2RGB888Invoker(Mat* _dst, int _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int _ustepIdx, int _vstepIdx)
        : dst(_dst), my1(_y1), mu(_u), mv(_v), width(_dst->cols), stride(_stride), ustepIdx(_ustepIdx), vstepIdx(_vstepIdx) {}

-    void operator()(const BlockedRange& range) const
+    void operator()(const Range& range) const
    {
-        const int rangeBegin = range.begin() * 2;
-        const int rangeEnd = range.end() * 2;
+        const int rangeBegin = range.start * 2;
+        const int rangeEnd = range.end * 2;

        size_t uvsteps[2] = {width/2, stride - width/2};
        int usIdx = ustepIdx, vsIdx = vstepIdx;

        const uchar* y1 = my1 + rangeBegin * stride;
-        const uchar* u1 = mu + (range.begin() / 2) * stride;
-        const uchar* v1 = mv + (range.begin() / 2) * stride;
+        const uchar* u1 = mu + (range.start / 2) * stride;
+        const uchar* v1 = mv + (range.start / 2) * stride;

-        if(range.begin() % 2 == 1)
+        if(range.start % 2 == 1)
        {
            u1 += uvsteps[(usIdx++) & 1];
            v1 += uvsteps[(vsIdx++) & 1];
@@ -2025,7 +2025,7 @@ struct YUV420p2RGB888Invoker
 };

 template<int bIdx>
-struct YUV420p2RGBA8888Invoker
+struct YUV420p2RGBA8888Invoker : ParallelLoopBody
 {
    Mat* dst;
    const uchar* my1, *mu, *mv;
@@ -2035,19 +2035,19 @@ struct YUV420p2RGBA8888Invoker
    YUV420p2RGBA8888Invoker(Mat* _dst, int _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int _ustepIdx, int _vstepIdx)
        : dst(_dst), my1(_y1), mu(_u), mv(_v), width(_dst->cols), stride(_stride), ustepIdx(_ustepIdx), vstepIdx(_vstepIdx) {}

-    void operator()(const BlockedRange& range) const
+    void operator()(const Range& range) const
    {
-        int rangeBegin = range.begin() * 2;
-        int rangeEnd = range.end() * 2;
+        int rangeBegin = range.start * 2;
+        int rangeEnd = range.end * 2;

        size_t uvsteps[2] = {width/2, stride - width/2};
        int usIdx = ustepIdx, vsIdx = vstepIdx;

        const uchar* y1 = my1 + rangeBegin * stride;
-        const uchar* u1 = mu + (range.begin() / 2) * stride;
-        const uchar* v1 = mv + (range.begin() / 2) * stride;
+        const uchar* u1 = mu + (range.start / 2) * stride;
+        const uchar* v1 = mv + (range.start / 2) * stride;

-        if(range.begin() % 2 == 1)
+        if(range.start % 2 == 1)
        {
            u1 += uvsteps[(usIdx++) & 1];
            v1 += uvsteps[(vsIdx++) & 1];
@@ -2102,48 +2102,40 @@ template<int bIdx, int uIdx>
 inline void cvtYUV420sp2RGB(Mat& _dst, int _stride, const uchar* _y1, const uchar* _uv)
 {
    YUV420sp2RGB888Invoker<bIdx, uIdx> converter(&_dst, _stride, _y1,  _uv);
-#ifdef HAVE_TBB
    if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION)
-        parallel_for(BlockedRange(0, _dst.rows/2), converter);
+        parallel_for_(Range(0, _dst.rows/2), converter);
    else
-#endif
-        converter(BlockedRange(0, _dst.rows/2));
+        converter(Range(0, _dst.rows/2));
 }

 template<int bIdx, int uIdx>
 inline void cvtYUV420sp2RGBA(Mat& _dst, int _stride, const uchar* _y1, const uchar* _uv)
 {
    YUV420sp2RGBA8888Invoker<bIdx, uIdx> converter(&_dst, _stride, _y1,  _uv);
-#ifdef HAVE_TBB
    if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION)
-        parallel_for(BlockedRange(0, _dst.rows/2), converter);
+        parallel_for_(Range(0, _dst.rows/2), converter);
    else
-#endif
-        converter(BlockedRange(0, _dst.rows/2));
+        converter(Range(0, _dst.rows/2));
 }

 template<int bIdx>
 inline void cvtYUV420p2RGB(Mat& _dst, int _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int ustepIdx, int vstepIdx)
 {
    YUV420p2RGB888Invoker<bIdx> converter(&_dst, _stride, _y1,  _u, _v, ustepIdx, vstepIdx);
-#ifdef HAVE_TBB
    if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION)
-        parallel_for(BlockedRange(0, _dst.rows/2), converter);
+        parallel_for_(Range(0, _dst.rows/2), converter);
    else
-#endif
-        converter(BlockedRange(0, _dst.rows/2));
+        converter(Range(0, _dst.rows/2));
 }

 template<int bIdx>
 inline void cvtYUV420p2RGBA(Mat& _dst, int _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int ustepIdx, int vstepIdx)
 {
    YUV420p2RGBA8888Invoker<bIdx> converter(&_dst, _stride, _y1,  _u, _v, ustepIdx, vstepIdx);
-#ifdef HAVE_TBB
    if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION)
-        parallel_for(BlockedRange(0, _dst.rows/2), converter);
+        parallel_for_(Range(0, _dst.rows/2), converter);
    else
-#endif
-        converter(BlockedRange(0, _dst.rows/2));
+        converter(Range(0, _dst.rows/2));
 }

 ///////////////////////////////////// RGB -> YUV420p /////////////////////////////////////
@@ -2227,7 +2219,7 @@ static void cvtRGBtoYUV420p(const Mat& src, Mat& dst)
 ///////////////////////////////////// YUV422 -> RGB /////////////////////////////////////

 template<int bIdx, int uIdx, int yIdx>
-struct YUV422toRGB888Invoker
+struct YUV422toRGB888Invoker : ParallelLoopBody
 {
    Mat* dst;
    const uchar* src;
@@ -2236,10 +2228,10 @@ struct YUV422toRGB888Invoker
    YUV422toRGB888Invoker(Mat* _dst, int _stride, const uchar* _yuv)
        : dst(_dst), src(_yuv), width(_dst->cols), stride(_stride) {}

-    void operator()(const BlockedRange& range) const
+    void operator()(const Range& range) const
    {
-        int rangeBegin = range.begin();
-        int rangeEnd = range.end();
+        int rangeBegin = range.start;
+        int rangeEnd = range.end;

        const int uidx = 1 - yIdx + uIdx * 2;
        const int vidx = (2 + uidx) % 4;
@@ -2273,7 +2265,7 @@ struct YUV422toRGB888Invoker
 };

 template<int bIdx, int uIdx, int yIdx>
-struct YUV422toRGBA8888Invoker
+struct YUV422toRGBA8888Invoker : ParallelLoopBody
 {
    Mat* dst;
    const uchar* src;
@@ -2282,10 +2274,10 @@ struct YUV422toRGBA8888Invoker
    YUV422toRGBA8888Invoker(Mat* _dst, int _stride, const uchar* _yuv)
        : dst(_dst), src(_yuv), width(_dst->cols), stride(_stride) {}

-    void operator()(const BlockedRange& range) const
+    void operator()(const Range& range) const
    {
-        int rangeBegin = range.begin();
-        int rangeEnd = range.end();
+        int rangeBegin = range.start;
+        int rangeEnd = range.end;

        const int uidx = 1 - yIdx + uIdx * 2;
        const int vidx = (2 + uidx) % 4;
@@ -2326,24 +2318,20 @@ template<int bIdx, int uIdx, int yIdx>
 inline void cvtYUV422toRGB(Mat& _dst, int _stride, const uchar* _yuv)
 {
    YUV422toRGB888Invoker<bIdx, uIdx, yIdx> converter(&_dst, _stride, _yuv);
-#ifdef HAVE_TBB
    if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV422_CONVERSION)
-        parallel_for(BlockedRange(0, _dst.rows), converter);
+        parallel_for_(Range(0, _dst.rows), converter);
    else
-#endif
-        converter(BlockedRange(0, _dst.rows));
+        converter(Range(0, _dst.rows));
 }

 template<int bIdx, int uIdx, int yIdx>
 inline void cvtYUV422toRGBA(Mat& _dst, int _stride, const uchar* _yuv)
 {
    YUV422toRGBA8888Invoker<bIdx, uIdx, yIdx> converter(&_dst, _stride, _yuv);
-#ifdef HAVE_TBB
    if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV422_CONVERSION)
-        parallel_for(BlockedRange(0, _dst.rows), converter);
+        parallel_for_(Range(0, _dst.rows), converter);
    else
-#endif
-        converter(BlockedRange(0, _dst.rows));
+        converter(Range(0, _dst.rows));
 }

 /////////////////////////// RGBA <-> mRGBA (alpha premultiplied) //////////////
--- a/modules/imgproc/src/distransform.cpp
+++ b/modules/imgproc/src/distransform.cpp
@@ -442,7 +442,7 @@ static void getDistanceTransformMask( int maskType, float *metrics )
    }
 }

-struct DTColumnInvoker
+struct DTColumnInvoker : ParallelLoopBody
 {
    DTColumnInvoker( const Mat* _src, Mat* _dst, const int* _sat_tab, const float* _sqr_tab)
    {
@@ -452,9 +452,9 @@ struct DTColumnInvoker
        sqr_tab = _sqr_tab;
    }

-    void operator()( const BlockedRange& range ) const
+    void operator()( const Range& range ) const
    {
-        int i, i1 = range.begin(), i2 = range.end();
+        int i, i1 = range.start, i2 = range.end;
        int m = src->rows;
        size_t sstep = src->step, dstep = dst->step/sizeof(float);
        AutoBuffer<int> _d(m);
@@ -489,7 +489,7 @@ struct DTColumnInvoker
 };


-struct DTRowInvoker
+struct DTRowInvoker : ParallelLoopBody
 {
    DTRowInvoker( Mat* _dst, const float* _sqr_tab, const float* _inv_tab )
    {
@@ -498,10 +498,10 @@ struct DTRowInvoker
        inv_tab = _inv_tab;
    }

-    void operator()( const BlockedRange& range ) const
+    void operator()( const Range& range ) const
    {
        const float inf = 1e15f;
-        int i, i1 = range.begin(), i2 = range.end();
+        int i, i1 = range.start, i2 = range.end;
        int n = dst->cols;
        AutoBuffer<uchar> _buf((n+2)*2*sizeof(float) + (n+2)*sizeof(int));
        float* f = (float*)(uchar*)_buf;
@@ -578,7 +578,7 @@ trueDistTrans( const Mat& src, Mat& dst )
    for( ; i <= m*3; i++ )
        sat_tab[i] = i - shift;

-    cv::parallel_for(cv::BlockedRange(0, n), cv::DTColumnInvoker(&src, &dst, sat_tab, sqr_tab));
+    cv::parallel_for_(cv::Range(0, n), cv::DTColumnInvoker(&src, &dst, sat_tab, sqr_tab));

    // stage 2: compute modified distance transform for each row
    float* inv_tab = sqr_tab + n;
@@ -590,7 +590,7 @@ trueDistTrans( const Mat& src, Mat& dst )
        sqr_tab[i] = (float)(i*i);
    }

-    cv::parallel_for(cv::BlockedRange(0, m), cv::DTRowInvoker(&dst, sqr_tab, inv_tab));
+    cv::parallel_for_(cv::Range(0, m), cv::DTRowInvoker(&dst, sqr_tab, inv_tab));
 }


@@ -664,7 +664,7 @@ distanceATS_L1_8u( const Mat& src, Mat& dst )
        // do right edge
        a = lut[dbase[width-1+dststep]];
        dbase[width-1] = (uchar)(MIN(a, dbase[width-1]));
-        
+
        for( x = width - 2; x >= 0; x-- )
        {
            int b = dbase[x+dststep];
--- a/modules/imgproc/src/histogram.cpp
+++ b/modules/imgproc/src/histogram.cpp
@@ -2985,29 +2985,23 @@ cvCalcProbDensity( const CvHistogram* hist, const CvHistogram* hist_mask,
    }
 }

-class EqualizeHistCalcHist_Invoker
+class EqualizeHistCalcHist_Invoker : public cv::ParallelLoopBody
 {
 public:
    enum {HIST_SZ = 256};

-#ifdef HAVE_TBB
-    typedef tbb::mutex* MutextPtr;
-#else
-    typedef void* MutextPtr;
-#endif
-
-    EqualizeHistCalcHist_Invoker(cv::Mat& src, int* histogram, MutextPtr histogramLock)
+    EqualizeHistCalcHist_Invoker(cv::Mat& src, int* histogram, cv::Mutex* histogramLock)
        : src_(src), globalHistogram_(histogram), histogramLock_(histogramLock)
    { }

-    void operator()( const cv::BlockedRange& rowRange ) const
+    void operator()( const cv::Range& rowRange ) const
    {
        int localHistogram[HIST_SZ] = {0, };

        const size_t sstep = src_.step;

        int width = src_.cols;
-        int height = rowRange.end() - rowRange.begin();
+        int height = rowRange.end - rowRange.start;

        if (src_.isContinuous())
        {
@@ -3015,7 +3009,7 @@ public:
            height = 1;
        }

-        for (const uchar* ptr = src_.ptr<uchar>(rowRange.begin()); height--; ptr += sstep)
+        for (const uchar* ptr = src_.ptr<uchar>(rowRange.start); height--; ptr += sstep)
        {
            int x = 0;
            for (; x <= width - 4; x += 4)
@@ -3030,9 +3024,7 @@ public:
                localHistogram[ptr[x]]++;
        }

-#ifdef HAVE_TBB
-        tbb::mutex::scoped_lock lock(*histogramLock_);
-#endif
+        cv::AutoLock lock(*histogramLock_);

        for( int i = 0; i < HIST_SZ; i++ )
            globalHistogram_[i] += localHistogram[i];
@@ -3040,12 +3032,7 @@ public:

    static bool isWorthParallel( const cv::Mat& src )
    {
-#ifdef HAVE_TBB
        return ( src.total() >= 640*480 );
-#else
-        (void)src;
-        return false;
-#endif
    }

 private:
@@ -3053,10 +3040,10 @@ private:

    cv::Mat& src_;
    int* globalHistogram_;
-    MutextPtr histogramLock_;
+    cv::Mutex* histogramLock_;
 };

-class EqualizeHistLut_Invoker
+class EqualizeHistLut_Invoker : public cv::ParallelLoopBody
 {
 public:
    EqualizeHistLut_Invoker( cv::Mat& src, cv::Mat& dst, int* lut )
@@ -3065,13 +3052,13 @@ public:
          lut_(lut)
    { }

-    void operator()( const cv::BlockedRange& rowRange ) const
+    void operator()( const cv::Range& rowRange ) const
    {
        const size_t sstep = src_.step;
        const size_t dstep = dst_.step;

        int width = src_.cols;
-        int height = rowRange.end() - rowRange.begin();
+        int height = rowRange.end - rowRange.start;
        int* lut = lut_;

        if (src_.isContinuous() && dst_.isContinuous())
@@ -3080,8 +3067,8 @@ public:
            height = 1;
        }

-        const uchar* sptr = src_.ptr<uchar>(rowRange.begin());
-        uchar* dptr = dst_.ptr<uchar>(rowRange.begin());
+        const uchar* sptr = src_.ptr<uchar>(rowRange.start);
+        uchar* dptr = dst_.ptr<uchar>(rowRange.start);

        for (; height--; sptr += sstep, dptr += dstep)
        {
@@ -3110,12 +3097,7 @@ public:

    static bool isWorthParallel( const cv::Mat& src )
    {
-#ifdef HAVE_TBB
        return ( src.total() >= 640*480 );
-#else
-        (void)src;
-        return false;
-#endif
    }

 private:
@@ -3142,23 +3124,18 @@ void cv::equalizeHist( InputArray _src, OutputArray _dst )
    if(src.empty())
        return;

-#ifdef HAVE_TBB
-    tbb::mutex histogramLockInstance;
-    EqualizeHistCalcHist_Invoker::MutextPtr histogramLock = &histogramLockInstance;
-#else
-    EqualizeHistCalcHist_Invoker::MutextPtr histogramLock = 0;
-#endif
+    Mutex histogramLockInstance;

    const int hist_sz = EqualizeHistCalcHist_Invoker::HIST_SZ;
    int hist[hist_sz] = {0,};
    int lut[hist_sz];

-    EqualizeHistCalcHist_Invoker calcBody(src, hist, histogramLock);
+    EqualizeHistCalcHist_Invoker calcBody(src, hist, &histogramLockInstance);
    EqualizeHistLut_Invoker      lutBody(src, dst, lut);
-    cv::BlockedRange heightRange(0, src.rows);
+    cv::Range heightRange(0, src.rows);

    if(EqualizeHistCalcHist_Invoker::isWorthParallel(src))
-        parallel_for(heightRange, calcBody);
+        parallel_for_(heightRange, calcBody);
    else
        calcBody(heightRange);

@@ -3182,303 +3159,11 @@ void cv::equalizeHist( InputArray _src, OutputArray _dst )
    }

    if(EqualizeHistLut_Invoker::isWorthParallel(src))
-        parallel_for(heightRange, lutBody);
+        parallel_for_(heightRange, lutBody);
    else
        lutBody(heightRange);
 }

-// ----------------------------------------------------------------------
-// CLAHE
-
-namespace
-{
-    class CLAHE_CalcLut_Body : public cv::ParallelLoopBody
-    {
-    public:
-        CLAHE_CalcLut_Body(const cv::Mat& src, cv::Mat& lut, cv::Size tileSize, int tilesX, int tilesY, int clipLimit, float lutScale) :
-            src_(src), lut_(lut), tileSize_(tileSize), tilesX_(tilesX), tilesY_(tilesY), clipLimit_(clipLimit), lutScale_(lutScale)
-        {
-        }
-
-        void operator ()(const cv::Range& range) const;
-
-    private:
-        cv::Mat src_;
-        mutable cv::Mat lut_;
-
-        cv::Size tileSize_;
-        int tilesX_;
-        int tilesY_;
-        int clipLimit_;
-        float lutScale_;
-    };
-
-    void CLAHE_CalcLut_Body::operator ()(const cv::Range& range) const
-    {
-        const int histSize = 256;
-
-        uchar* tileLut = lut_.ptr(range.start);
-        const size_t lut_step = lut_.step;
-
-        for (int k = range.start; k < range.end; ++k, tileLut += lut_step)
-        {
-            const int ty = k / tilesX_;
-            const int tx = k % tilesX_;
-
-            // retrieve tile submatrix
-
-            cv::Rect tileROI;
-            tileROI.x = tx * tileSize_.width;
-            tileROI.y = ty * tileSize_.height;
-            tileROI.width = tileSize_.width;
-            tileROI.height = tileSize_.height;
-
-            const cv::Mat tile = src_(tileROI);
-
-            // calc histogram
-
-            int tileHist[histSize] = {0, };
-
-            int height = tileROI.height;
-            const size_t sstep = tile.step;
-            for (const uchar* ptr = tile.ptr<uchar>(0); height--; ptr += sstep)
-            {
-                int x = 0;
-                for (; x <= tileROI.width - 4; x += 4)
-                {
-                    int t0 = ptr[x], t1 = ptr[x+1];
-                    tileHist[t0]++; tileHist[t1]++;
-                    t0 = ptr[x+2]; t1 = ptr[x+3];
-                    tileHist[t0]++; tileHist[t1]++;
-                }
-
-                for (; x < tileROI.width; ++x)
-                    tileHist[ptr[x]]++;
-            }
-
-            // clip histogram
-
-            if (clipLimit_ > 0)
-            {
-                // how many pixels were clipped
-                int clipped = 0;
-                for (int i = 0; i < histSize; ++i)
-                {
-                    if (tileHist[i] > clipLimit_)
-                    {
-                        clipped += tileHist[i] - clipLimit_;
-                        tileHist[i] = clipLimit_;
-                    }
-                }
-
-                // redistribute clipped pixels
-                int redistBatch = clipped / histSize;
-                int residual = clipped - redistBatch * histSize;
-
-                for (int i = 0; i < histSize; ++i)
-                    tileHist[i] += redistBatch;
-
-                for (int i = 0; i < residual; ++i)
-                    tileHist[i]++;
-            }
-
-            // calc Lut
-
-            int sum = 0;
-            for (int i = 0; i < histSize; ++i)
-            {
-                sum += tileHist[i];
-                tileLut[i] = cv::saturate_cast<uchar>(sum * lutScale_);
-            }
-        }
-    }
-
-    class CLAHE_Interpolation_Body : public cv::ParallelLoopBody
-    {
-    public:
-        CLAHE_Interpolation_Body(const cv::Mat& src, cv::Mat& dst, const cv::Mat& lut, cv::Size tileSize, int tilesX, int tilesY) :
-            src_(src), dst_(dst), lut_(lut), tileSize_(tileSize), tilesX_(tilesX), tilesY_(tilesY)
-        {
-        }
-
-        void operator ()(const cv::Range& range) const;
-
-    private:
-        cv::Mat src_;
-        mutable cv::Mat dst_;
-        cv::Mat lut_;
-
-        cv::Size tileSize_;
-        int tilesX_;
-        int tilesY_;
-    };
-
-    void CLAHE_Interpolation_Body::operator ()(const cv::Range& range) const
-    {
-        const size_t lut_step = lut_.step;
-
-        for (int y = range.start; y < range.end; ++y)
-        {
-            const uchar* srcRow = src_.ptr<uchar>(y);
-            uchar* dstRow = dst_.ptr<uchar>(y);
-
-            const float tyf = (static_cast<float>(y) / tileSize_.height) - 0.5f;
-
-            int ty1 = cvFloor(tyf);
-            int ty2 = ty1 + 1;
-
-            const float ya = tyf - ty1;
-
-            ty1 = std::max(ty1, 0);
-            ty2 = std::min(ty2, tilesY_ - 1);
-
-            const uchar* lutPlane1 = lut_.ptr(ty1 * tilesX_);
-            const uchar* lutPlane2 = lut_.ptr(ty2 * tilesX_);
-
-            for (int x = 0; x < src_.cols; ++x)
-            {
-                const float txf = (static_cast<float>(x) / tileSize_.width) - 0.5f;
-
-                int tx1 = cvFloor(txf);
-                int tx2 = tx1 + 1;
-
-                const float xa = txf - tx1;
-
-                tx1 = std::max(tx1, 0);
-                tx2 = std::min(tx2, tilesX_ - 1);
-
-                const int srcVal = srcRow[x];
-
-                const size_t ind1 = tx1 * lut_step + srcVal;
-                const size_t ind2 = tx2 * lut_step + srcVal;
-
-                float res = 0;
-
-                res += lutPlane1[ind1] * ((1.0f - xa) * (1.0f - ya));
-                res += lutPlane1[ind2] * ((xa) * (1.0f - ya));
-                res += lutPlane2[ind1] * ((1.0f - xa) * (ya));
-                res += lutPlane2[ind2] * ((xa) * (ya));
-
-                dstRow[x] = cv::saturate_cast<uchar>(res);
-            }
-        }
-    }
-
-    class CLAHE_Impl : public cv::CLAHE
-    {
-    public:
-        CLAHE_Impl(double clipLimit = 40.0, int tilesX = 8, int tilesY = 8);
-
-        cv::AlgorithmInfo* info() const;
-
-        void apply(cv::InputArray src, cv::OutputArray dst);
-
-        void setClipLimit(double clipLimit);
-        double getClipLimit() const;
-
-        void setTilesGridSize(cv::Size tileGridSize);
-        cv::Size getTilesGridSize() const;
-
-        void collectGarbage();
-
-    private:
-        double clipLimit_;
-        int tilesX_;
-        int tilesY_;
-
-        cv::Mat srcExt_;
-        cv::Mat lut_;
-    };
-
-    CLAHE_Impl::CLAHE_Impl(double clipLimit, int tilesX, int tilesY) :
-        clipLimit_(clipLimit), tilesX_(tilesX), tilesY_(tilesY)
-    {
-    }
-
-    CV_INIT_ALGORITHM(CLAHE_Impl, "CLAHE",
-        obj.info()->addParam(obj, "clipLimit", obj.clipLimit_);
-        obj.info()->addParam(obj, "tilesX", obj.tilesX_);
-        obj.info()->addParam(obj, "tilesY", obj.tilesY_))
-
-    void CLAHE_Impl::apply(cv::InputArray _src, cv::OutputArray _dst)
-    {
-        cv::Mat src = _src.getMat();
-
-        CV_Assert( src.type() == CV_8UC1 );
-
-        _dst.create( src.size(), src.type() );
-        cv::Mat dst = _dst.getMat();
-
-        const int histSize = 256;
-
-        lut_.create(tilesX_ * tilesY_, histSize, CV_8UC1);
-
-        cv::Size tileSize;
-        cv::Mat srcForLut;
-
-        if (src.cols % tilesX_ == 0 && src.rows % tilesY_ == 0)
-        {
-            tileSize = cv::Size(src.cols / tilesX_, src.rows / tilesY_);
-            srcForLut = src;
-        }
-        else
-        {
-            cv::copyMakeBorder(src, srcExt_, 0, tilesY_ - (src.rows % tilesY_), 0, tilesX_ - (src.cols % tilesX_), cv::BORDER_REFLECT_101);
-
-            tileSize = cv::Size(srcExt_.cols / tilesX_, srcExt_.rows / tilesY_);
-            srcForLut = srcExt_;
-        }
-
-        const int tileSizeTotal = tileSize.area();
-        const float lutScale = static_cast<float>(histSize - 1) / tileSizeTotal;
-
-        int clipLimit = 0;
-        if (clipLimit_ > 0.0)
-        {
-            clipLimit = static_cast<int>(clipLimit_ * tileSizeTotal / histSize);
-            clipLimit = std::max(clipLimit, 1);
-        }
-
-        CLAHE_CalcLut_Body calcLutBody(srcForLut, lut_, tileSize, tilesX_, tilesY_, clipLimit, lutScale);
-        cv::parallel_for_(cv::Range(0, tilesX_ * tilesY_), calcLutBody);
-
-        CLAHE_Interpolation_Body interpolationBody(src, dst, lut_, tileSize, tilesX_, tilesY_);
-        cv::parallel_for_(cv::Range(0, src.rows), interpolationBody);
-    }
-
-    void CLAHE_Impl::setClipLimit(double clipLimit)
-    {
-        clipLimit_ = clipLimit;
-    }
-
-    double CLAHE_Impl::getClipLimit() const
-    {
-        return clipLimit_;
-    }
-
-    void CLAHE_Impl::setTilesGridSize(cv::Size tileGridSize)
-    {
-        tilesX_ = tileGridSize.width;
-        tilesY_ = tileGridSize.height;
-    }
-
-    cv::Size CLAHE_Impl::getTilesGridSize() const
-    {
-        return cv::Size(tilesX_, tilesY_);
-    }
-
-    void CLAHE_Impl::collectGarbage()
-    {
-        srcExt_.release();
-        lut_.release();
-    }
-}
-
-cv::Ptr<cv::CLAHE> cv::createCLAHE(double clipLimit, cv::Size tileGridSize)
-{
-    return new CLAHE_Impl(clipLimit, tileGridSize.width, tileGridSize.height);
-}
-
 // ----------------------------------------------------------------------

 /* Implementation of RTTI and Generic Functions for CvHistogram */
--- a/modules/imgproc/src/morph.cpp
+++ b/modules/imgproc/src/morph.cpp
@@ -1081,7 +1081,7 @@ cv::Mat cv::getStructuringElement(int shape, Size ksize, Point anchor)
 namespace cv
 {

-class MorphologyRunner
+class MorphologyRunner : public ParallelLoopBody
 {
 public:
    MorphologyRunner(Mat _src, Mat _dst, int _nStripes, int _iterations,
@@ -1102,14 +1102,14 @@ public:
        columnBorderType = _columnBorderType;
    }

-    void operator () ( const BlockedRange& range ) const
+    void operator () ( const Range& range ) const
    {
-        int row0 = std::min(cvRound(range.begin() * src.rows / nStripes), src.rows);
-        int row1 = std::min(cvRound(range.end() * src.rows / nStripes), src.rows);
+        int row0 = std::min(cvRound(range.start * src.rows / nStripes), src.rows);
+        int row1 = std::min(cvRound(range.end * src.rows / nStripes), src.rows);

        /*if(0)
            printf("Size = (%d, %d), range[%d,%d), row0 = %d, row1 = %d\n",
-                   src.rows, src.cols, range.begin(), range.end(), row0, row1);*/
+                   src.rows, src.cols, range.start, range.end, row0, row1);*/

        Mat srcStripe = src.rowRange(row0, row1);
        Mat dstStripe = dst.rowRange(row0, row1);
@@ -1173,15 +1173,15 @@ static void morphOp( int op, InputArray _src, OutputArray _dst,
    }

    int nStripes = 1;
-#if defined HAVE_TBB && defined HAVE_TEGRA_OPTIMIZATION
+#if defined HAVE_TEGRA_OPTIMIZATION
    if (src.data != dst.data && iterations == 1 &&  //NOTE: threads are not used for inplace processing
        (borderType & BORDER_ISOLATED) == 0 && //TODO: check border types
        src.rows >= 64 ) //NOTE: just heuristics
        nStripes = 4;
 #endif

-    parallel_for(BlockedRange(0, nStripes),
-                 MorphologyRunner(src, dst, nStripes, iterations, op, kernel, anchor, borderType, borderType, borderValue));
+    parallel_for_(Range(0, nStripes),
+                  MorphologyRunner(src, dst, nStripes, iterations, op, kernel, anchor, borderType, borderType, borderValue));

    //Ptr<FilterEngine> f = createMorphologyFilter(op, src.type(),
    //                                             kernel, anchor, borderType, borderType, borderValue );
--- a/modules/ml/src/ann_mlp.cpp
+++ b/modules/ml/src/ann_mlp.cpp
@@ -40,10 +40,6 @@

 #include "precomp.hpp"

-#ifdef HAVE_TBB
-#include <tbb/tbb.h>
-#endif
-
 CvANN_MLP_TrainParams::CvANN_MLP_TrainParams()
 {
    term_crit = cvTermCriteria( CV_TERMCRIT_ITER + CV_TERMCRIT_EPS, 1000, 0.01 );
@@ -1022,7 +1018,7 @@ int CvANN_MLP::train_backprop( CvVectors x0, CvVectors u, const double* sw )
    return iter;
 }

-struct rprop_loop {
+struct rprop_loop : cv::ParallelLoopBody {
  rprop_loop(const CvANN_MLP* _point, double**& _weights, int& _count, int& _ivcount, CvVectors* _x0,
     int& _l_count, CvMat*& _layer_sizes, int& _ovcount, int& _max_count,
     CvVectors* _u, const double*& _sw, double& _inv_count, CvMat*& _dEdw, int& _dcount0, double* _E, int _buf_sz)
@@ -1063,7 +1059,7 @@ struct rprop_loop {
  int buf_sz;


-  void operator()( const cv::BlockedRange& range ) const
+  void operator()( const cv::Range& range ) const
  {
    double* buf_ptr;
    double** x = 0;
@@ -1084,7 +1080,7 @@ struct rprop_loop {
        buf_ptr += (df[i] - x[i])*2;
    }

-    for(int si = range.begin(); si < range.end(); si++ )
+    for(int si = range.start; si < range.end; si++ )
    {
        if (si % dcount0 != 0) continue;
        int n1, n2, k;
@@ -1170,36 +1166,33 @@ struct rprop_loop {
            }

        // backward pass, update dEdw
-        #ifdef HAVE_TBB
-        static tbb::spin_mutex mutex;
-        tbb::spin_mutex::scoped_lock lock;
-        #endif
+        static cv::Mutex mutex;
+
        for(int i = l_count-1; i > 0; i-- )
        {
            n1 = layer_sizes->data.i[i-1]; n2 = layer_sizes->data.i[i];
            cvInitMatHeader( &_df, dcount, n2, CV_64F, df[i] );
            cvMul( grad1, &_df, grad1 );
-            #ifdef HAVE_TBB
-            lock.acquire(mutex);
-            #endif
-            cvInitMatHeader( &_dEdw, n1, n2, CV_64F, dEdw->data.db+(weights[i]-weights[0]) );
-            cvInitMatHeader( x1, dcount, n1, CV_64F, x[i-1] );
-            cvGEMM( x1, grad1, 1, &_dEdw, 1, &_dEdw, CV_GEMM_A_T );

-            // update bias part of dEdw
-           for( k = 0; k < dcount; k++ )
-           {
-               double* dst = _dEdw.data.db + n1*n2;
-               const double* src = grad1->data.db + k*n2;
-               for(int j = 0; j < n2; j++ )
-                   dst[j] += src[j];
+            {
+                cv::AutoLock lock(mutex);
+                cvInitMatHeader( &_dEdw, n1, n2, CV_64F, dEdw->data.db+(weights[i]-weights[0]) );
+                cvInitMatHeader( x1, dcount, n1, CV_64F, x[i-1] );
+                cvGEMM( x1, grad1, 1, &_dEdw, 1, &_dEdw, CV_GEMM_A_T );
+
+                // update bias part of dEdw
+                for( k = 0; k < dcount; k++ )
+                {
+                    double* dst = _dEdw.data.db + n1*n2;
+                    const double* src = grad1->data.db + k*n2;
+                    for(int j = 0; j < n2; j++ )
+                        dst[j] += src[j];
+                }
+
+                if (i > 1)
+                    cvInitMatHeader( &_w, n1, n2, CV_64F, weights[i] );
           }

-           if (i > 1)
-               cvInitMatHeader( &_w, n1, n2, CV_64F, weights[i] );
-           #ifdef HAVE_TBB
-           lock.release();
-           #endif
           cvInitMatHeader( grad2, dcount, n1, CV_64F, grad2->data.db );
           if( i > 1 )
               cvGEMM( grad1, &_w, 1, 0, 0, grad2, CV_GEMM_B_T );
@@ -1297,7 +1290,7 @@ int CvANN_MLP::train_rprop( CvVectors x0, CvVectors u, const double* sw )
        double E = 0;

        // first, iterate through all the samples and compute dEdw
-        cv::parallel_for(cv::BlockedRange(0, count),
+        cv::parallel_for_(cv::Range(0, count),
            rprop_loop(this, weights, count, ivcount, &x0, l_count, layer_sizes,
                       ovcount, max_count, &u, sw, inv_count, dEdw, dcount0, &E, buf_sz)
        );
--- a/modules/ml/src/gbt.cpp
+++ b/modules/ml/src/gbt.cpp
@@ -884,7 +884,7 @@ float CvGBTrees::predict_serial( const CvMat* _sample, const CvMat* _missing,
 }


-class Tree_predictor
+class Tree_predictor : public cv::ParallelLoopBody
 {
 private:
    pCvSeq* weak;
@@ -894,9 +894,7 @@ private:
    const CvMat* missing;
    const float shrinkage;

-#ifdef HAVE_TBB
-    static tbb::spin_mutex SumMutex;
-#endif
+    static cv::Mutex SumMutex;


 public:
@@ -915,14 +913,11 @@ public:
    Tree_predictor& operator=( const Tree_predictor& )
    { return *this; }

-    virtual void operator()(const cv::BlockedRange& range) const
+    virtual void operator()(const cv::Range& range) const
    {
-#ifdef HAVE_TBB
-        tbb::spin_mutex::scoped_lock lock;
-#endif
        CvSeqReader reader;
-        int begin = range.begin();
-        int end = range.end();
+        int begin = range.start;
+        int end = range.end;

        int weak_count = end - begin;
        CvDTree* tree;
@@ -940,13 +935,11 @@ public:
                    tmp_sum += shrinkage*(float)(tree->predict(sample, missing)->value);
                }
            }
-#ifdef HAVE_TBB
-            lock.acquire(SumMutex);
-            sum[i] += tmp_sum;
-            lock.release();
-#else
-            sum[i] += tmp_sum;
-#endif
+
+            {
+                cv::AutoLock lock(SumMutex);
+                sum[i] += tmp_sum;
+            }
        }
    } // Tree_predictor::operator()

@@ -954,11 +947,7 @@ public:

 }; // class Tree_predictor

-
-#ifdef HAVE_TBB
-tbb::spin_mutex Tree_predictor::SumMutex;
-#endif
-
+cv::Mutex Tree_predictor::SumMutex;


 float CvGBTrees::predict( const CvMat* _sample, const CvMat* _missing,
@@ -976,12 +965,7 @@ float CvGBTrees::predict( const CvMat* _sample, const CvMat* _missing,
        Tree_predictor predictor = Tree_predictor(weak_seq, class_count,
                                    params.shrinkage, _sample, _missing, sum);

-//#ifdef HAVE_TBB
-//      tbb::parallel_for(cv::BlockedRange(begin, end), predictor,
-//                          tbb::auto_partitioner());
-//#else
-        cv::parallel_for(cv::BlockedRange(begin, end), predictor);
-//#endif
+        cv::parallel_for_(cv::Range(begin, end), predictor);

        for (int i=0; i<class_count; ++i)
            sum[i] = sum[i] /** params.shrinkage*/ + base_value;
@@ -1210,7 +1194,7 @@ void CvGBTrees::read( CvFileStorage* fs, CvFileNode* node )

 //===========================================================================

-class Sample_predictor
+class Sample_predictor : public cv::ParallelLoopBody
 {
 private:
    const CvGBTrees* gbt;
@@ -1240,10 +1224,10 @@ public:
    {}


-    virtual void operator()(const cv::BlockedRange& range) const
+    virtual void operator()(const cv::Range& range) const
    {
-        int begin = range.begin();
-        int end = range.end();
+        int begin = range.start;
+        int end = range.end;

        CvMat x;
        CvMat miss;
@@ -1299,11 +1283,7 @@ CvGBTrees::calc_error( CvMLData* _data, int type, std::vector<float> *resp )
    Sample_predictor predictor = Sample_predictor(this, pred_resp, _data->get_values(),
            _data->get_missing(), _sample_idx);

-//#ifdef HAVE_TBB
-//    tbb::parallel_for(cv::BlockedRange(0,n), predictor, tbb::auto_partitioner());
-//#else
-    cv::parallel_for(cv::BlockedRange(0,n), predictor);
-//#endif
+    cv::parallel_for_(cv::Range(0,n), predictor);

    int* sidx = _sample_idx ? _sample_idx->data.i : 0;
    int r_step = CV_IS_MAT_CONT(response->type) ?
--- a/modules/ml/src/knearest.cpp
+++ b/modules/ml/src/knearest.cpp
@@ -306,7 +306,7 @@ float CvKNearest::write_results( int k, int k1, int start, int end,
    return result;
 }

-struct P1 {
+struct P1 : cv::ParallelLoopBody {
  P1(const CvKNearest* _pointer, int _buf_sz, int _k, const CvMat* __samples, const float** __neighbors,
     int _k1, CvMat* __results, CvMat* __neighbor_responses, CvMat* __dist, float* _result)
  {
@@ -333,10 +333,10 @@ struct P1 {
  float* result;
  int buf_sz;

-  void operator()( const cv::BlockedRange& range ) const
+  void operator()( const cv::Range& range ) const
  {
    cv::AutoBuffer<float> buf(buf_sz);
-    for(int i = range.begin(); i < range.end(); i += 1 )
+    for(int i = range.start; i < range.end; i += 1 )
    {
        float* neighbor_responses = &buf[0];
        float* dist = neighbor_responses + 1*k;
@@ -410,8 +410,8 @@ float CvKNearest::find_nearest( const CvMat* _samples, int k, CvMat* _results,
    int k1 = get_sample_count();
    k1 = MIN( k1, k );

-    cv::parallel_for(cv::BlockedRange(0, count), P1(this, buf_sz, k, _samples, _neighbors, k1,
-                                                    _results, _neighbor_responses, _dist, &result)
+    cv::parallel_for_(cv::Range(0, count), P1(this, buf_sz, k, _samples, _neighbors, k1,
+                                             _results, _neighbor_responses, _dist, &result)
    );

    return result;
--- a/modules/ml/src/nbayes.cpp
+++ b/modules/ml/src/nbayes.cpp
@@ -277,7 +277,7 @@ bool CvNormalBayesClassifier::train( const CvMat* _train_data, const CvMat* _res
    return result;
 }

-struct predict_body {
+struct predict_body : cv::ParallelLoopBody {
  predict_body(CvMat* _c, CvMat** _cov_rotate_mats, CvMat** _inv_eigen_values, CvMat** _avg,
     const CvMat* _samples, const int* _vidx, CvMat* _cls_labels,
     CvMat* _results, float* _value, int _var_count1
@@ -307,7 +307,7 @@ struct predict_body {
  float* value;
  int var_count1;

-  void operator()( const cv::BlockedRange& range ) const
+  void operator()( const cv::Range& range ) const
  {

    int cls = -1;
@@ -324,7 +324,7 @@ struct predict_body {
    cv::AutoBuffer<double> buffer(nclasses + var_count1);
    CvMat diff = cvMat( 1, var_count1, CV_64FC1, &buffer[0] );

-    for(int k = range.begin(); k < range.end(); k += 1 )
+    for(int k = range.start; k < range.end; k += 1 )
    {
        int ival;
        double opt = FLT_MAX;
@@ -397,9 +397,9 @@ float CvNormalBayesClassifier::predict( const CvMat* samples, CvMat* results ) c

    const int* vidx = var_idx ? var_idx->data.i : 0;

-    cv::parallel_for(cv::BlockedRange(0, samples->rows), predict_body(c, cov_rotate_mats, inv_eigen_values, avg, samples,
-                                                                      vidx, cls_labels, results, &value, var_count
-    ));
+    cv::parallel_for_(cv::Range(0, samples->rows),
+                      predict_body(c, cov_rotate_mats, inv_eigen_values, avg, samples,
+                                   vidx, cls_labels, results, &value, var_count));

    return value;
 }
--- a/modules/ml/src/svm.cpp
+++ b/modules/ml/src/svm.cpp
@@ -2192,7 +2192,7 @@ float CvSVM::predict( const CvMat* sample, bool returnDFVal ) const
    return result;
 }

-struct predict_body_svm {
+struct predict_body_svm : ParallelLoopBody {
    predict_body_svm(const CvSVM* _pointer, float* _result, const CvMat* _samples, CvMat* _results)
    {
        pointer = _pointer;
@@ -2206,9 +2206,9 @@ struct predict_body_svm {
    const CvMat* samples;
    CvMat* results;

-    void operator()( const cv::BlockedRange& range ) const
+    void operator()( const cv::Range& range ) const
    {
-        for(int i = range.begin(); i < range.end(); i++ )
+        for(int i = range.start; i < range.end; i++ )
        {
            CvMat sample;
            cvGetRow( samples, &sample, i );
@@ -2224,7 +2224,7 @@ struct predict_body_svm {
 float CvSVM::predict(const CvMat* samples, CV_OUT CvMat* results) const
 {
    float result = 0;
-    cv::parallel_for(cv::BlockedRange(0, samples->rows),
+    cv::parallel_for_(cv::Range(0, samples->rows),
             predict_body_svm(this, &result, samples, results)
    );
    return result;
--- a/modules/nonfree/src/surf.cpp
+++ b/modules/nonfree/src/surf.cpp
@@ -258,7 +258,7 @@ interpolateKeypoint( float N9[3][9], int dx, int dy, int ds, KeyPoint& kpt )
 }

 // Multi-threaded construction of the scale-space pyramid
-struct SURFBuildInvoker
+struct SURFBuildInvoker : ParallelLoopBody
 {
    SURFBuildInvoker( const Mat& _sum, const std::vector<int>& _sizes,
                      const std::vector<int>& _sampleSteps,
@@ -271,9 +271,9 @@ struct SURFBuildInvoker
        traces = &_traces;
    }

-    void operator()(const BlockedRange& range) const
+    void operator()(const Range& range) const
    {
-        for( int i=range.begin(); i<range.end(); i++ )
+        for( int i=range.start; i<range.end; i++ )
            calcLayerDetAndTrace( *sum, (*sizes)[i], (*sampleSteps)[i], (*dets)[i], (*traces)[i] );
    }

@@ -285,7 +285,7 @@ struct SURFBuildInvoker
 };

 // Multi-threaded search of the scale-space pyramid for keypoints
-struct SURFFindInvoker
+struct SURFFindInvoker : ParallelLoopBody
 {
    SURFFindInvoker( const Mat& _sum, const Mat& _mask_sum,
                     const std::vector<Mat>& _dets, const std::vector<Mat>& _traces,
@@ -310,9 +310,9 @@ struct SURFFindInvoker
                   const std::vector<int>& sizes, std::vector<KeyPoint>& keypoints,
                   int octave, int layer, float hessianThreshold, int sampleStep );

-    void operator()(const BlockedRange& range) const
+    void operator()(const Range& range) const
    {
-        for( int i=range.begin(); i<range.end(); i++ )
+        for( int i=range.start; i<range.end; i++ )
        {
            int layer = (*middleIndices)[i];
            int octave = i / nOctaveLayers;
@@ -333,14 +333,10 @@ struct SURFFindInvoker
    int nOctaveLayers;
    float hessianThreshold;

-#ifdef HAVE_TBB
-    static tbb::mutex findMaximaInLayer_m;
-#endif
+    static Mutex findMaximaInLayer_m;
 };

-#ifdef HAVE_TBB
-tbb::mutex SURFFindInvoker::findMaximaInLayer_m;
-#endif
+Mutex SURFFindInvoker::findMaximaInLayer_m;


 /*
@@ -437,9 +433,7 @@ void SURFFindInvoker::findMaximaInLayer( const Mat& sum, const Mat& mask_sum,
                    if( interp_ok  )
                    {
                        /*printf( "KeyPoint %f %f %d\n", point.pt.x, point.pt.y, point.size );*/
-#ifdef HAVE_TBB
-                        tbb::mutex::scoped_lock lock(findMaximaInLayer_m);
-#endif
+                        cv::AutoLock lock(findMaximaInLayer_m);
                        keypoints.push_back(kpt);
                    }
                }
@@ -505,20 +499,20 @@ static void fastHessianDetector( const Mat& sum, const Mat& mask_sum, std::vecto
    }

    // Calculate hessian determinant and trace samples in each layer
-    parallel_for( BlockedRange(0, nTotalLayers),
-                      SURFBuildInvoker(sum, sizes, sampleSteps, dets, traces) );
+    parallel_for_( Range(0, nTotalLayers),
+                   SURFBuildInvoker(sum, sizes, sampleSteps, dets, traces) );

    // Find maxima in the determinant of the hessian
-    parallel_for( BlockedRange(0, nMiddleLayers),
-                      SURFFindInvoker(sum, mask_sum, dets, traces, sizes,
-                                      sampleSteps, middleIndices, keypoints,
-                                      nOctaveLayers, hessianThreshold) );
+    parallel_for_( Range(0, nMiddleLayers),
+                   SURFFindInvoker(sum, mask_sum, dets, traces, sizes,
+                                   sampleSteps, middleIndices, keypoints,
+                                   nOctaveLayers, hessianThreshold) );

    std::sort(keypoints.begin(), keypoints.end(), KeypointGreater());
 }


-struct SURFInvoker
+struct SURFInvoker : ParallelLoopBody
 {
    enum { ORI_RADIUS = 6, ORI_WIN = 60, PATCH_SZ = 20 };

@@ -566,7 +560,7 @@ struct SURFInvoker
        }
    }

-    void operator()(const BlockedRange& range) const
+    void operator()(const Range& range) const
    {
        /* X and Y gradient wavelet data */
        const int NX=2, NY=2;
@@ -584,7 +578,7 @@ struct SURFInvoker

        int dsize = extended ? 128 : 64;

-        int k, k1 = range.begin(), k2 = range.end();
+        int k, k1 = range.start, k2 = range.end;
        float maxSize = 0;
        for( k = k1; k < k2; k++ )
        {
@@ -952,7 +946,7 @@ void SURF::operator()(InputArray _img, InputArray _mask,

        // we call SURFInvoker in any case, even if we do not need descriptors,
        // since it computes orientation of each feature.
-        parallel_for(BlockedRange(0, N), SURFInvoker(img, sum, keypoints, descriptors, extended, upright) );
+        parallel_for_(Range(0, N), SURFInvoker(img, sum, keypoints, descriptors, extended, upright) );

        // remove keypoints that were marked for deletion
        for( i = j = 0; i < N; i++ )
--- a/modules/objdetect/src/cascadedetect.cpp
+++ b/modules/objdetect/src/cascadedetect.cpp
@@ -1164,15 +1164,10 @@ void CascadeClassifier::detectMultiScale( const Mat& image, std::vector<Rect>& o

        int stripCount, stripSize;

-    #ifdef HAVE_TBB
        const int PTS_PER_THREAD = 1000;
        stripCount = ((processingRectSize.width/yStep)*(processingRectSize.height + yStep-1)/yStep + PTS_PER_THREAD/2)/PTS_PER_THREAD;
        stripCount = std::min(std::max(stripCount, 1), 100);
        stripSize = (((processingRectSize.height + stripCount - 1)/stripCount + yStep-1)/yStep)*yStep;
-    #else
-        stripCount = 1;
-        stripSize = processingRectSize.height;
-    #endif

        if( !detectSingleScale( scaledImage, stripCount, processingRectSize, stripSize, yStep, factor, candidates,
            rejectLevels, levelWeights, outputRejectLevels ) )
--- a/modules/objdetect/src/latentsvm.cpp
+++ b/modules/objdetect/src/latentsvm.cpp
@@ -582,7 +582,6 @@ int searchObjectThresholdSomeComponents(const CvLSVMFeaturePyramid *H,
    // For each component perform searching
    for (i = 0; i < kComponents; i++)
    {
-#ifdef HAVE_TBB
        int error = searchObjectThreshold(H, &(filters[componentIndex]), kPartFilters[i],
            b[i], maxXBorder, maxYBorder, scoreThreshold,
            &(pointsArr[i]), &(levelsArr[i]), &(kPointsArr[i]),
@@ -598,13 +597,6 @@ int searchObjectThresholdSomeComponents(const CvLSVMFeaturePyramid *H,
            free(partsDisplacementArr);
            return LATENT_SVM_SEARCH_OBJECT_FAILED;
        }
-#else
-    (void)numThreads;
-        searchObjectThreshold(H, &(filters[componentIndex]), kPartFilters[i],
-            b[i], maxXBorder, maxYBorder, scoreThreshold,
-            &(pointsArr[i]), &(levelsArr[i]), &(kPointsArr[i]),
-            &(scoreArr[i]), &(partsDisplacementArr[i]));
-#endif
        estimateBoxes(pointsArr[i], levelsArr[i], kPointsArr[i],
            filters[componentIndex]->sizeX, filters[componentIndex]->sizeY, &(oppPointsArr[i]));
        componentIndex += (kPartFilters[i] + 1);
--- a/modules/ocl/include/opencv2/ocl.hpp
+++ b/modules/ocl/include/opencv2/ocl.hpp
@@ -121,8 +121,9 @@ namespace cv
        CV_EXPORTS  void setBinpath(const char *path);

        //The two functions below enable other opencl program to use ocl module's cl_context and cl_command_queue
+        //returns cl_context *
        CV_EXPORTS void* getoclContext();
-
+        //returns cl_command_queue *
        CV_EXPORTS void* getoclCommandQueue();

        //explicit call clFinish. The global command queue will be used.
@@ -460,6 +461,7 @@ namespace cv
        // support all C1 types

        CV_EXPORTS void minMax(const oclMat &src, double *minVal, double *maxVal = 0, const oclMat &mask = oclMat());
+        CV_EXPORTS void minMax_buf(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask, oclMat& buf);

        //! finds global minimum and maximum array elements and returns their values with locations
        // support all C1 types
@@ -808,7 +810,11 @@ namespace cv
        CV_EXPORTS void integral(const oclMat &src, oclMat &sum, oclMat &sqsum);
        CV_EXPORTS void integral(const oclMat &src, oclMat &sum);
        CV_EXPORTS void cornerHarris(const oclMat &src, oclMat &dst, int blockSize, int ksize, double k, int bordertype = cv::BORDER_DEFAULT);
+        CV_EXPORTS void cornerHarris_dxdy(const oclMat &src, oclMat &dst, oclMat &Dx, oclMat &Dy,
+            int blockSize, int ksize, double k, int bordertype = cv::BORDER_DEFAULT);
        CV_EXPORTS void cornerMinEigenVal(const oclMat &src, oclMat &dst, int blockSize, int ksize, int bordertype = cv::BORDER_DEFAULT);
+        CV_EXPORTS void cornerMinEigenVal_dxdy(const oclMat &src, oclMat &dst, oclMat &Dx, oclMat &Dy,
+            int blockSize, int ksize, int bordertype = cv::BORDER_DEFAULT);

        ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
        ///////////////////////////////////////////CascadeClassifier//////////////////////////////////////////////////////////////////
@@ -826,13 +832,14 @@ namespace cv
        };
 #endif

+#if 0
        class CV_EXPORTS OclCascadeClassifierBuf : public  cv::CascadeClassifier
        {
        public:
            OclCascadeClassifierBuf() :
                m_flags(0), initialized(false), m_scaleFactor(0), buffers(NULL) {}

-            ~OclCascadeClassifierBuf() {}
+            ~OclCascadeClassifierBuf() { release(); }

            void detectMultiScale(oclMat &image, CV_OUT std::vector<cv::Rect>& faces,
                                  double scaleFactor = 1.1, int minNeighbors = 3, int flags = 0,
@@ -864,6 +871,7 @@ namespace cv
            oclMat gimg1, gsum, gsqsum;
            void * buffers;
        };
+#endif

        /////////////////////////////// Pyramid /////////////////////////////////////
        CV_EXPORTS void pyrDown(const oclMat &src, oclMat &dst);
@@ -1388,6 +1396,51 @@ namespace cv
            explicit BFMatcher_OCL(int norm = NORM_L2) : BruteForceMatcher_OCL_base(norm == NORM_L1 ? L1Dist : norm == NORM_L2 ? L2Dist : HammingDist) {}
        };

+        class CV_EXPORTS GoodFeaturesToTrackDetector_OCL
+        {
+        public:
+            explicit GoodFeaturesToTrackDetector_OCL(int maxCorners = 1000, double qualityLevel = 0.01, double minDistance = 0.0,
+                int blockSize = 3, bool useHarrisDetector = false, double harrisK = 0.04);
+
+            //! return 1 rows matrix with CV_32FC2 type
+            void operator ()(const oclMat& image, oclMat& corners, const oclMat& mask = oclMat());
+            //! download points of type Point2f to a vector. the vector's content will be erased
+            void downloadPoints(const oclMat &points, std::vector<Point2f> &points_v);
+
+            int maxCorners;
+            double qualityLevel;
+            double minDistance;
+
+            int blockSize;
+            bool useHarrisDetector;
+            double harrisK;
+            void releaseMemory()
+            {
+                Dx_.release();
+                Dy_.release();
+                eig_.release();
+                minMaxbuf_.release();
+                tmpCorners_.release();
+            }
+        private:
+            oclMat Dx_;
+            oclMat Dy_;
+            oclMat eig_;
+            oclMat minMaxbuf_;
+            oclMat tmpCorners_;
+        };
+
+        inline GoodFeaturesToTrackDetector_OCL::GoodFeaturesToTrackDetector_OCL(int maxCorners_, double qualityLevel_, double minDistance_,
+            int blockSize_, bool useHarrisDetector_, double harrisK_)
+        {
+            maxCorners = maxCorners_;
+            qualityLevel = qualityLevel_;
+            minDistance = minDistance_;
+            blockSize = blockSize_;
+            useHarrisDetector = useHarrisDetector_;
+            harrisK = harrisK_;
+        }
+
        /////////////////////////////// PyrLKOpticalFlow /////////////////////////////////////

        class CV_EXPORTS PyrLKOpticalFlow
--- a/modules/ocl/include/opencv2/ocl/private/util.hpp
+++ b/modules/ocl/include/opencv2/ocl/private/util.hpp
@@ -47,7 +47,7 @@
 #define __OPENCV_OCL_PRIVATE_UTIL__

 #if defined __APPLE__
-#include <OpenCL/OpenCL.h>
+#include <OpenCL/opencl.h>
 #else
 #include <CL/opencl.h>
 #endif
@@ -121,6 +121,33 @@ namespace cv
        cl_mem CV_EXPORTS bindTexture(const oclMat &mat);
        void CV_EXPORTS releaseTexture(cl_mem& texture);

+        //Represents an image texture object
+        class CV_EXPORTS TextureCL
+        {
+        public:
+            TextureCL(cl_mem tex, int r, int c, int t)
+                : tex_(tex), rows(r), cols(c), type(t) {}
+            ~TextureCL()
+            {
+                openCLFree(tex_);
+            }
+            operator cl_mem() 
+            {
+                return tex_;
+            }
+            cl_mem const tex_;
+            const int rows;
+            const int cols;
+            const int type;
+        private:
+            //disable assignment
+            void operator=(const TextureCL&);
+        };
+        // bind oclMat to OpenCL image textures and retunrs an TextureCL object
+        // note:
+        //   for faster clamping, there is no buffer padding for the constructed texture
+        Ptr<TextureCL> CV_EXPORTS bindTexturePtr(const oclMat &mat);
+
        // returns whether the current context supports image2d_t format or not
        bool CV_EXPORTS support_image2d(Context *clCxt = Context::getContext());

@@ -132,7 +159,7 @@ namespace cv
        };
        template<DEVICE_INFO _it, typename _ty>
        _ty queryDeviceInfo(cl_kernel kernel = NULL);
-        //info should have been pre-allocated
+
        template<>
        int CV_EXPORTS queryDeviceInfo<WAVEFRONT_SIZE, int>(cl_kernel kernel);
        template<>
--- a/modules/ocl/perf/perf_arithm.cpp
+++ b/modules/ocl/perf/perf_arithm.cpp
@@ -48,7 +48,7 @@
 ///////////// Lut ////////////////////////
 PERFTEST(lut)
 {
-    Mat src, lut, dst;
+    Mat src, lut, dst, ocl_dst;
    ocl::oclMat d_src, d_lut, d_dst;

    int all_type[] = {CV_8UC1, CV_8UC3};
@@ -77,11 +77,6 @@ PERFTEST(lut)
            ocl::LUT(d_src, d_lut, d_dst);
            WARMUP_OFF;

-            cv::Mat ocl_mat_dst;
-            d_dst.download(ocl_mat_dst);
-
-            TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst, dst, 0));
-
            GPU_ON;
            ocl::LUT(d_src, d_lut, d_dst);
            GPU_OFF;
@@ -90,9 +85,10 @@ PERFTEST(lut)
            d_src.upload(src);
            d_lut.upload(lut);
            ocl::LUT(d_src, d_lut, d_dst);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
            GPU_FULL_OFF;

+            TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 0);
        }

    }
@@ -101,7 +97,7 @@ PERFTEST(lut)
 ///////////// Exp ////////////////////////
 PERFTEST(Exp)
 {
-    Mat src, dst;
+    Mat src, dst, ocl_dst;
    ocl::oclMat d_src, d_dst;

    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
@@ -121,11 +117,6 @@ PERFTEST(Exp)
        ocl::exp(d_src, d_dst);
        WARMUP_OFF;

-        cv::Mat ocl_mat_dst;
-        d_dst.download(ocl_mat_dst);
-
-        TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst, dst, 2));
-
        GPU_ON;
        ocl::exp(d_src, d_dst);
        GPU_OFF;
@@ -133,15 +124,17 @@ PERFTEST(Exp)
        GPU_FULL_ON;
        d_src.upload(src);
        ocl::exp(d_src, d_dst);
-        d_dst.download(dst);
+        d_dst.download(ocl_dst);
        GPU_FULL_OFF;
+
+        TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 2);
    }
 }

 ///////////// LOG ////////////////////////
 PERFTEST(Log)
 {
-    Mat src, dst;
+    Mat src, dst, ocl_dst;
    ocl::oclMat d_src, d_dst;

    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
@@ -161,11 +154,6 @@ PERFTEST(Log)
        ocl::log(d_src, d_dst);
        WARMUP_OFF;

-        cv::Mat ocl_mat_dst;
-        d_dst.download(ocl_mat_dst);
-
-        TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst, dst, 1));
-
        GPU_ON;
        ocl::log(d_src, d_dst);
        GPU_OFF;
@@ -173,15 +161,17 @@ PERFTEST(Log)
        GPU_FULL_ON;
        d_src.upload(src);
        ocl::log(d_src, d_dst);
-        d_dst.download(dst);
+        d_dst.download(ocl_dst);
        GPU_FULL_OFF;
+
+        TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1);
    }
 }

 ///////////// Add ////////////////////////
 PERFTEST(Add)
 {
-    Mat src1, src2, dst;
+    Mat src1, src2, dst, ocl_dst;
    ocl::oclMat d_src1, d_src2, d_dst;

    int all_type[] = {CV_8UC1, CV_32FC1};
@@ -201,6 +191,7 @@ PERFTEST(Add)
            CPU_ON;
            add(src1, src2, dst);
            CPU_OFF;
+
            d_src1.upload(src1);
            d_src2.upload(src2);

@@ -208,11 +199,6 @@ PERFTEST(Add)
            ocl::add(d_src1, d_src2, d_dst);
            WARMUP_OFF;

-            cv::Mat ocl_mat_dst;
-            d_dst.download(ocl_mat_dst);
-
-            TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst, dst, 0.0));
-
            GPU_ON;
            ocl::add(d_src1, d_src2, d_dst);
            GPU_OFF;
@@ -221,8 +207,10 @@ PERFTEST(Add)
            d_src1.upload(src1);
            d_src2.upload(src2);
            ocl::add(d_src1, d_src2, d_dst);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
            GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 0.0);
        }

    }
@@ -231,7 +219,7 @@ PERFTEST(Add)
 ///////////// Mul ////////////////////////
 PERFTEST(Mul)
 {
-    Mat src1, src2, dst;
+    Mat src1, src2, dst, ocl_dst;
    ocl::oclMat d_src1, d_src2, d_dst;

    int all_type[] = {CV_8UC1, CV_8UC4};
@@ -260,11 +248,6 @@ PERFTEST(Mul)
            ocl::multiply(d_src1, d_src2, d_dst);
            WARMUP_OFF;

-            cv::Mat ocl_mat_dst;
-            d_dst.download(ocl_mat_dst);
-
-            TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst, dst, 0.0));
-
            GPU_ON;
            ocl::multiply(d_src1, d_src2, d_dst);
            GPU_OFF;
@@ -273,8 +256,10 @@ PERFTEST(Mul)
            d_src1.upload(src1);
            d_src2.upload(src2);
            ocl::multiply(d_src1, d_src2, d_dst);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
            GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 0.0);
        }

    }
@@ -283,7 +268,7 @@ PERFTEST(Mul)
 ///////////// Div ////////////////////////
 PERFTEST(Div)
 {
-    Mat src1, src2, dst;
+    Mat src1, src2, dst, ocl_dst;
    ocl::oclMat d_src1, d_src2, d_dst;
    int all_type[] = {CV_8UC1, CV_8UC4};
    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
@@ -304,6 +289,7 @@ PERFTEST(Div)
            CPU_ON;
            divide(src1, src2, dst);
            CPU_OFF;
+
            d_src1.upload(src1);
            d_src2.upload(src2);

@@ -311,11 +297,6 @@ PERFTEST(Div)
            ocl::divide(d_src1, d_src2, d_dst);
            WARMUP_OFF;

-            cv::Mat ocl_mat_dst;
-            d_dst.download(ocl_mat_dst);
-
-            TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst, dst, 1));
-
            GPU_ON;
            ocl::divide(d_src1, d_src2, d_dst);
            GPU_OFF;
@@ -324,8 +305,10 @@ PERFTEST(Div)
            d_src1.upload(src1);
            d_src2.upload(src2);
            ocl::divide(d_src1, d_src2, d_dst);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
            GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1);
        }

    }
@@ -334,7 +317,7 @@ PERFTEST(Div)
 ///////////// Absdiff ////////////////////////
 PERFTEST(Absdiff)
 {
-    Mat src1, src2, dst;
+    Mat src1, src2, dst, ocl_dst;
    ocl::oclMat d_src1, d_src2, d_dst;

    int all_type[] = {CV_8UC1, CV_8UC4};
@@ -355,6 +338,7 @@ PERFTEST(Absdiff)
            CPU_ON;
            absdiff(src1, src2, dst);
            CPU_OFF;
+
            d_src1.upload(src1);
            d_src2.upload(src2);

@@ -362,11 +346,6 @@ PERFTEST(Absdiff)
            ocl::absdiff(d_src1, d_src2, d_dst);
            WARMUP_OFF;

-            cv::Mat ocl_mat_dst;
-            d_dst.download(ocl_mat_dst);
-
-            TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst, dst, 0.0));
-
            GPU_ON;
            ocl::absdiff(d_src1, d_src2, d_dst);
            GPU_OFF;
@@ -375,8 +354,10 @@ PERFTEST(Absdiff)
            d_src1.upload(src1);
            d_src2.upload(src2);
            ocl::absdiff(d_src1, d_src2, d_dst);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
            GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 0.0);
        }

    }
@@ -385,7 +366,7 @@ PERFTEST(Absdiff)
 ///////////// CartToPolar ////////////////////////
 PERFTEST(CartToPolar)
 {
-    Mat src1, src2, dst, dst1;
+    Mat src1, src2, dst, dst1, ocl_dst, ocl_dst1;
    ocl::oclMat d_src1, d_src2, d_dst, d_dst1;

    int all_type[] = {CV_32FC1};
@@ -408,6 +389,7 @@ PERFTEST(CartToPolar)
            CPU_ON;
            cartToPolar(src1, src2, dst, dst1, 1);
            CPU_OFF;
+
            d_src1.upload(src1);
            d_src2.upload(src2);

@@ -415,14 +397,6 @@ PERFTEST(CartToPolar)
            ocl::cartToPolar(d_src1, d_src2, d_dst, d_dst1, 1);
            WARMUP_OFF;

-            cv::Mat ocl_mat_dst;
-            d_dst.download(ocl_mat_dst);
-
-            cv::Mat ocl_mat_dst1;
-            d_dst1.download(ocl_mat_dst1);
-
-            TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst1, dst1, 0.5)&&ExpectedMatNear(ocl_mat_dst, dst, 0.5));
-
            GPU_ON;
            ocl::cartToPolar(d_src1, d_src2, d_dst, d_dst1, 1);
            GPU_OFF;
@@ -431,9 +405,15 @@ PERFTEST(CartToPolar)
            d_src1.upload(src1);
            d_src2.upload(src2);
            ocl::cartToPolar(d_src1, d_src2, d_dst, d_dst1, 1);
-            d_dst.download(dst);
-            d_dst1.download(dst1);
+            d_dst.download(ocl_dst);
+            d_dst1.download(ocl_dst1);
            GPU_FULL_OFF;
+
+            double diff1 = checkNorm(ocl_dst1, dst1);
+            double diff2 = checkNorm(ocl_dst, dst);
+            double max_diff = max(diff1, diff2);
+            TestSystem::instance().setAccurate(max_diff<=.5?1:0, max_diff);
+
        }

    }
@@ -442,7 +422,7 @@ PERFTEST(CartToPolar)
 ///////////// PolarToCart ////////////////////////
 PERFTEST(PolarToCart)
 {
-    Mat src1, src2, dst, dst1;
+    Mat src1, src2, dst, dst1, ocl_dst, ocl_dst1;
    ocl::oclMat d_src1, d_src2, d_dst, d_dst1;

    int all_type[] = {CV_32FC1};
@@ -472,14 +452,6 @@ PERFTEST(PolarToCart)
            ocl::polarToCart(d_src1, d_src2, d_dst, d_dst1, 1);
            WARMUP_OFF;

-            cv::Mat ocl_mat_dst;
-            d_dst.download(ocl_mat_dst);
-
-            cv::Mat ocl_mat_dst1;
-            d_dst1.download(ocl_mat_dst1);
-
-            TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst1, dst1, 0.5)&&ExpectedMatNear(ocl_mat_dst, dst, 0.5));
-
            GPU_ON;
            ocl::polarToCart(d_src1, d_src2, d_dst, d_dst1, 1);
            GPU_OFF;
@@ -488,9 +460,15 @@ PERFTEST(PolarToCart)
            d_src1.upload(src1);
            d_src2.upload(src2);
            ocl::polarToCart(d_src1, d_src2, d_dst, d_dst1, 1);
-            d_dst.download(dst);
-            d_dst1.download(dst1);
+            d_dst.download(ocl_dst);
+            d_dst1.download(ocl_dst1);
            GPU_FULL_OFF;
+
+            double diff1 = checkNorm(ocl_dst1, dst1);
+            double diff2 = checkNorm(ocl_dst, dst);
+            double max_diff = max(diff1, diff2);
+            TestSystem::instance().setAccurate(max_diff<=.5?1:0, max_diff);
+
        }

    }
@@ -499,7 +477,7 @@ PERFTEST(PolarToCart)
 ///////////// Magnitude ////////////////////////
 PERFTEST(magnitude)
 {
-    Mat x, y, mag;
+    Mat x, y, mag, ocl_mag;
    ocl::oclMat d_x, d_y, d_mag;

    int all_type[] = {CV_32FC1};
@@ -526,11 +504,6 @@ PERFTEST(magnitude)
            ocl::magnitude(d_x, d_y, d_mag);
            WARMUP_OFF;

-            cv::Mat ocl_mat_dst;
-            d_mag.download(ocl_mat_dst);
-
-            TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst, mag, 1e-5));
-
            GPU_ON;
            ocl::magnitude(d_x, d_y, d_mag);
            GPU_OFF;
@@ -539,8 +512,10 @@ PERFTEST(magnitude)
            d_x.upload(x);
            d_y.upload(y);
            ocl::magnitude(d_x, d_y, d_mag);
-            d_mag.download(mag);
+            d_mag.download(ocl_mag);
            GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(ocl_mag, mag, 1e-5);
        }

    }
@@ -549,7 +524,7 @@ PERFTEST(magnitude)
 ///////////// Transpose ////////////////////////
 PERFTEST(Transpose)
 {
-    Mat src, dst;
+    Mat src, dst, ocl_dst;
    ocl::oclMat d_src, d_dst;

    int all_type[] = {CV_8UC1, CV_8UC4};
@@ -575,11 +550,6 @@ PERFTEST(Transpose)
            ocl::transpose(d_src, d_dst);
            WARMUP_OFF;

-            cv::Mat ocl_mat_dst;
-            d_dst.download(ocl_mat_dst);
-
-            TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst, dst, 1e-5));
-
            GPU_ON;
            ocl::transpose(d_src, d_dst);
            GPU_OFF;
@@ -587,8 +557,10 @@ PERFTEST(Transpose)
            GPU_FULL_ON;
            d_src.upload(src);
            ocl::transpose(d_src, d_dst);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
            GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1e-5);
        }

    }
@@ -597,7 +569,7 @@ PERFTEST(Transpose)
 ///////////// Flip ////////////////////////
 PERFTEST(Flip)
 {
-    Mat src, dst;
+    Mat src, dst, ocl_dst;
    ocl::oclMat d_src, d_dst;

    int all_type[] = {CV_8UC1, CV_8UC4};
@@ -623,11 +595,6 @@ PERFTEST(Flip)
            ocl::flip(d_src, d_dst, 0);
            WARMUP_OFF;

-            cv::Mat ocl_mat_dst;
-            d_dst.download(ocl_mat_dst);
-
-            TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst, dst, 1e-5));
-
            GPU_ON;
            ocl::flip(d_src, d_dst, 0);
            GPU_OFF;
@@ -635,8 +602,10 @@ PERFTEST(Flip)
            GPU_FULL_ON;
            d_src.upload(src);
            ocl::flip(d_src, d_dst, 0);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
            GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1e-5);
        }

    }
@@ -671,7 +640,10 @@ PERFTEST(minMax)
            ocl::minMax(d_src, &min_val_, &max_val_);
            WARMUP_OFF;

-            TestSystem::instance().setAccurate(EeceptDoubleEQ<double>(max_val_, max_val)&&EeceptDoubleEQ<double>(min_val_, min_val));
+            if(EeceptDoubleEQ<double>(max_val_, max_val) && EeceptDoubleEQ<double>(min_val_, min_val))
+                TestSystem::instance().setAccurate(1, max(fabs(max_val_-max_val), fabs(min_val_-min_val)));
+            else
+                TestSystem::instance().setAccurate(0, max(fabs(max_val_-max_val), fabs(min_val_-min_val)));

            GPU_ON;
            ocl::minMax(d_src, &min_val, &max_val);
@@ -724,8 +696,6 @@ PERFTEST(minMaxLoc)
                minlocVal_ = src.at<unsigned char>(min_loc_);
                maxlocVal = src.at<unsigned char>(max_loc);
                maxlocVal_ = src.at<unsigned char>(max_loc_);
-                error0 = ::abs(src.at<unsigned char>(min_loc_) - src.at<unsigned char>(min_loc));
-                error1 = ::abs(src.at<unsigned char>(max_loc_) - src.at<unsigned char>(max_loc));
            }
            if(src.depth() == 1)
            {
@@ -733,8 +703,6 @@ PERFTEST(minMaxLoc)
                minlocVal_ = src.at<signed char>(min_loc_);
                maxlocVal = src.at<signed char>(max_loc);
                maxlocVal_ = src.at<signed char>(max_loc_);
-                error0 = ::abs(src.at<signed char>(min_loc_) - src.at<signed char>(min_loc));
-                error1 = ::abs(src.at<signed char>(max_loc_) - src.at<signed char>(max_loc));
            }
            if(src.depth() == 2)
            {
@@ -742,8 +710,6 @@ PERFTEST(minMaxLoc)
                minlocVal_ = src.at<unsigned short>(min_loc_);
                maxlocVal = src.at<unsigned short>(max_loc);
                maxlocVal_ = src.at<unsigned short>(max_loc_);
-                error0 = ::abs(src.at<unsigned short>(min_loc_) - src.at<unsigned short>(min_loc));
-                error1 = ::abs(src.at<unsigned short>(max_loc_) - src.at<unsigned short>(max_loc));
            }
            if(src.depth() == 3)
            {
@@ -751,8 +717,6 @@ PERFTEST(minMaxLoc)
                minlocVal_ = src.at<signed short>(min_loc_);
                maxlocVal = src.at<signed short>(max_loc);
                maxlocVal_ = src.at<signed short>(max_loc_);
-                error0 = ::abs(src.at<signed short>(min_loc_) - src.at<signed short>(min_loc));
-                error1 = ::abs(src.at<signed short>(max_loc_) - src.at<signed short>(max_loc));
            }
            if(src.depth() == 4)
            {
@@ -760,8 +724,6 @@ PERFTEST(minMaxLoc)
                minlocVal_ = src.at<int>(min_loc_);
                maxlocVal = src.at<int>(max_loc);
                maxlocVal_ = src.at<int>(max_loc_);
-                error0 = ::abs(src.at<int>(min_loc_) - src.at<int>(min_loc));
-                error1 = ::abs(src.at<int>(max_loc_) - src.at<int>(max_loc));
            }
            if(src.depth() == 5)
            {
@@ -769,8 +731,6 @@ PERFTEST(minMaxLoc)
                minlocVal_ = src.at<float>(min_loc_);
                maxlocVal = src.at<float>(max_loc);
                maxlocVal_ = src.at<float>(max_loc_);
-                error0 = ::abs(src.at<float>(min_loc_) - src.at<float>(min_loc));
-                error1 = ::abs(src.at<float>(max_loc_) - src.at<float>(max_loc));
            }
            if(src.depth() == 6)
            {
@@ -778,16 +738,16 @@ PERFTEST(minMaxLoc)
                minlocVal_ = src.at<double>(min_loc_);
                maxlocVal = src.at<double>(max_loc);
                maxlocVal_ = src.at<double>(max_loc_);
-                error0 = ::abs(src.at<double>(min_loc_) - src.at<double>(min_loc));
-                error1 = ::abs(src.at<double>(max_loc_) - src.at<double>(max_loc));
            }
-
-            TestSystem::instance().setAccurate(EeceptDoubleEQ<double>(error1, 0.0)
-                &&EeceptDoubleEQ<double>(error0, 0.0)
-                &&EeceptDoubleEQ<double>(maxlocVal_, maxlocVal)
+            error0 = ::abs(minlocVal_ - minlocVal);
+            error1 = ::abs(maxlocVal_ - maxlocVal);
+            if( EeceptDoubleEQ<double>(maxlocVal_, maxlocVal)
                &&EeceptDoubleEQ<double>(minlocVal_, minlocVal)
                &&EeceptDoubleEQ<double>(max_val_, max_val)
-                &&EeceptDoubleEQ<double>(min_val_, min_val));
+                &&EeceptDoubleEQ<double>(min_val_, min_val))
+                TestSystem::instance().setAccurate(1, 0.);
+            else
+                TestSystem::instance().setAccurate(0, max(error0, error1));

            GPU_ON;
            ocl::minMaxLoc(d_src, &min_val, &max_val, &min_loc, &max_loc);
@@ -831,11 +791,13 @@ PERFTEST(Sum)
            gpures = ocl::sum(d_src);
            WARMUP_OFF;

-            TestSystem::instance().setAccurate(ExceptDoubleNear(cpures[3], gpures[3], 0.1)
-                &&ExceptDoubleNear(cpures[2], gpures[2], 0.1)
-                &&ExceptDoubleNear(cpures[1], gpures[1], 0.1)
-                &&ExceptDoubleNear(cpures[0], gpures[0], 0.1));
-
+            vector<double> diffs(4);
+            diffs[3] = fabs(cpures[3] - gpures[3]);
+            diffs[2] = fabs(cpures[2] - gpures[2]);
+            diffs[1] = fabs(cpures[1] - gpures[1]);
+            diffs[0] = fabs(cpures[0] - gpures[0]);
+            double max_diff = *max_element(diffs.begin(), diffs.end());
+            TestSystem::instance().setAccurate(max_diff<0.1?1:0, max_diff);

            GPU_ON;
            gpures = ocl::sum(d_src);
@@ -879,7 +841,11 @@ PERFTEST(countNonZero)
            gpures = ocl::countNonZero(d_src);
            WARMUP_OFF;

-            TestSystem::instance().setAccurate((EeceptDoubleEQ<double>((double)cpures, (double)gpures)));
+            int diff = abs(cpures - gpures);
+            if(diff == 0)
+                TestSystem::instance().setAccurate(1, 0);
+            else
+                TestSystem::instance().setAccurate(0, diff);

            GPU_ON;
            ocl::countNonZero(d_src);
@@ -897,7 +863,7 @@ PERFTEST(countNonZero)
 ///////////// Phase ////////////////////////
 PERFTEST(Phase)
 {
-    Mat src1, src2, dst;
+    Mat src1, src2, dst, ocl_dst;
    ocl::oclMat d_src1, d_src2, d_dst;

    int all_type[] = {CV_32FC1};
@@ -913,12 +879,12 @@ PERFTEST(Phase)
            gen(src2, size, size, all_type[j], 0, 256);
            gen(dst, size, size, all_type[j], 0, 256);

-
            phase(src1, src2, dst, 1);

            CPU_ON;
            phase(src1, src2, dst, 1);
            CPU_OFF;
+
            d_src1.upload(src1);
            d_src2.upload(src2);

@@ -926,11 +892,6 @@ PERFTEST(Phase)
            ocl::phase(d_src1, d_src2, d_dst, 1);
            WARMUP_OFF;

-            cv::Mat ocl_mat_dst;
-            d_dst.download(ocl_mat_dst);
-
-            TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst, dst, 1e-2));
-
            GPU_ON;
            ocl::phase(d_src1, d_src2, d_dst, 1);
            GPU_OFF;
@@ -939,8 +900,10 @@ PERFTEST(Phase)
            d_src1.upload(src1);
            d_src2.upload(src2);
            ocl::phase(d_src1, d_src2, d_dst, 1);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
            GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1e-2);
        }

    }
@@ -949,7 +912,7 @@ PERFTEST(Phase)
 ///////////// bitwise_and////////////////////////
 PERFTEST(bitwise_and)
 {
-    Mat src1, src2, dst;
+    Mat src1, src2, dst, ocl_dst;
    ocl::oclMat d_src1, d_src2, d_dst;

    int all_type[] = {CV_8UC1, CV_32SC1};
@@ -965,7 +928,6 @@ PERFTEST(bitwise_and)
            gen(src2, size, size, all_type[j], 0, 256);
            gen(dst, size, size, all_type[j], 0, 256);

-
            bitwise_and(src1, src2, dst);

            CPU_ON;
@@ -978,11 +940,6 @@ PERFTEST(bitwise_and)
            ocl::bitwise_and(d_src1, d_src2, d_dst);
            WARMUP_OFF;

-            cv::Mat ocl_mat_dst;
-            d_dst.download(ocl_mat_dst);
-
-            TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst, dst, 0.0));
-
            GPU_ON;
            ocl::bitwise_and(d_src1, d_src2, d_dst);
            GPU_OFF;
@@ -991,8 +948,10 @@ PERFTEST(bitwise_and)
            d_src1.upload(src1);
            d_src2.upload(src2);
            ocl::bitwise_and(d_src1, d_src2, d_dst);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
            GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 0.0);
        }

    }
@@ -1001,7 +960,7 @@ PERFTEST(bitwise_and)
 ///////////// bitwise_not////////////////////////
 PERFTEST(bitwise_not)
 {
-    Mat src1, dst;
+    Mat src1, dst, ocl_dst;
    ocl::oclMat d_src1, d_dst;

    int all_type[] = {CV_8UC1, CV_32SC1};
@@ -1016,7 +975,6 @@ PERFTEST(bitwise_not)
            gen(src1, size, size, all_type[j], 0, 256);
            gen(dst, size, size, all_type[j], 0, 256);

-
            bitwise_not(src1, dst);

            CPU_ON;
@@ -1028,11 +986,6 @@ PERFTEST(bitwise_not)
            ocl::bitwise_not(d_src1, d_dst);
            WARMUP_OFF;

-            cv::Mat ocl_mat_dst;
-            d_dst.download(ocl_mat_dst);
-
-            TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst, dst, 0.0));
-
            GPU_ON;
            ocl::bitwise_not(d_src1, d_dst);
            GPU_OFF;
@@ -1040,8 +993,10 @@ PERFTEST(bitwise_not)
            GPU_FULL_ON;
            d_src1.upload(src1);
            ocl::bitwise_not(d_src1, d_dst);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
            GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 0.0);
        }

    }
@@ -1050,7 +1005,7 @@ PERFTEST(bitwise_not)
 ///////////// compare////////////////////////
 PERFTEST(compare)
 {
-    Mat src1, src2, dst;
+    Mat src1, src2, dst, ocl_dst;
    ocl::oclMat d_src1, d_src2, d_dst;

    int CMP_EQ = 0;
@@ -1067,12 +1022,12 @@ PERFTEST(compare)
            gen(src2, size, size, all_type[j], 0, 256);
            gen(dst, size, size, all_type[j], 0, 256);

-
            compare(src1, src2, dst, CMP_EQ);

            CPU_ON;
            compare(src1, src2, dst, CMP_EQ);
            CPU_OFF;
+
            d_src1.upload(src1);
            d_src2.upload(src2);

@@ -1080,11 +1035,6 @@ PERFTEST(compare)
            ocl::compare(d_src1, d_src2, d_dst, CMP_EQ);
            WARMUP_OFF;

-            cv::Mat ocl_mat_dst;
-            d_dst.download(ocl_mat_dst);
-
-            TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst, dst, 0.0));
-
            GPU_ON;
            ocl::compare(d_src1, d_src2, d_dst, CMP_EQ);
            GPU_OFF;
@@ -1093,8 +1043,10 @@ PERFTEST(compare)
            d_src1.upload(src1);
            d_src2.upload(src2);
            ocl::compare(d_src1, d_src2, d_dst, CMP_EQ);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
            GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 0.0);
        }

    }
@@ -1103,7 +1055,7 @@ PERFTEST(compare)
 ///////////// pow ////////////////////////
 PERFTEST(pow)
 {
-    Mat src, dst;
+    Mat src, dst, ocl_dst;
    ocl::oclMat d_src, d_dst;

    int all_type[] = {CV_32FC1};
@@ -1129,11 +1081,6 @@ PERFTEST(pow)
            ocl::pow(d_src, -2.0, d_dst);
            WARMUP_OFF;

-            cv::Mat ocl_mat_dst;
-            d_dst.download(ocl_mat_dst);
-
-            TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst, dst, 1.0));
-
            GPU_ON;
            ocl::pow(d_src, -2.0, d_dst);
            GPU_OFF;
@@ -1141,8 +1088,10 @@ PERFTEST(pow)
            GPU_FULL_ON;
            d_src.upload(src);
            ocl::pow(d_src, -2.0, d_dst);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
            GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1.0);
        }

    }
@@ -1151,7 +1100,7 @@ PERFTEST(pow)
 ///////////// MagnitudeSqr////////////////////////
 PERFTEST(MagnitudeSqr)
 {
-    Mat src1, src2, dst;
+    Mat src1, src2, dst, ocl_dst;
    ocl::oclMat d_src1, d_src2, d_dst;

    int all_type[] = {CV_32FC1};
@@ -1167,53 +1116,36 @@ PERFTEST(MagnitudeSqr)
            gen(src2, size, size, all_type[t], 0, 256);
            gen(dst, size, size, all_type[t], 0, 256);

-
+            CPU_ON;
            for (int i = 0; i < src1.rows; ++i)
-
                for (int j = 0; j < src1.cols; ++j)
                {
                    float val1 = src1.at<float>(i, j);
                    float val2 = src2.at<float>(i, j);
-
                    ((float *)(dst.data))[i * dst.step / 4 + j] = val1 * val1 + val2 * val2;

                }
+            CPU_OFF;

-                CPU_ON;
+            d_src1.upload(src1);
+            d_src2.upload(src2);

-                for (int i = 0; i < src1.rows; ++i)
-                    for (int j = 0; j < src1.cols; ++j)
-                    {
-                        float val1 = src1.at<float>(i, j);
-                        float val2 = src2.at<float>(i, j);
+            WARMUP_ON;
+            ocl::magnitudeSqr(d_src1, d_src2, d_dst);
+            WARMUP_OFF;

-                        ((float *)(dst.data))[i * dst.step / 4 + j] = val1 * val1 + val2 * val2;
+            GPU_ON;
+            ocl::magnitudeSqr(d_src1, d_src2, d_dst);
+            GPU_OFF;

-                    }
+            GPU_FULL_ON;
+            d_src1.upload(src1);
+            d_src2.upload(src2);
+            ocl::magnitudeSqr(d_src1, d_src2, d_dst);
+            d_dst.download(ocl_dst);
+            GPU_FULL_OFF;

-                    CPU_OFF;
-                    d_src1.upload(src1);
-                    d_src2.upload(src2);
-
-                    WARMUP_ON;
-                    ocl::magnitudeSqr(d_src1, d_src2, d_dst);
-                    WARMUP_OFF;
-
-                    cv::Mat ocl_mat_dst;
-                    d_dst.download(ocl_mat_dst);
-
-                    TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst, dst, 1.0));
-
-                    GPU_ON;
-                    ocl::magnitudeSqr(d_src1, d_src2, d_dst);
-                    GPU_OFF;
-
-                    GPU_FULL_ON;
-                    d_src1.upload(src1);
-                    d_src2.upload(src2);
-                    ocl::magnitudeSqr(d_src1, d_src2, d_dst);
-                    d_dst.download(dst);
-                    GPU_FULL_OFF;
+            TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1.0);
        }

    }
@@ -1222,7 +1154,7 @@ PERFTEST(MagnitudeSqr)
 ///////////// AddWeighted////////////////////////
 PERFTEST(AddWeighted)
 {
-    Mat src1, src2, dst;
+    Mat src1, src2, dst, ocl_dst;
    ocl::oclMat d_src1, d_src2, d_dst;

    double alpha = 2.0, beta = 1.0, gama = 3.0;
@@ -1252,11 +1184,6 @@ PERFTEST(AddWeighted)
            ocl::addWeighted(d_src1, alpha, d_src2, beta, gama, d_dst);
            WARMUP_OFF;

-            cv::Mat ocl_mat_dst;
-            d_dst.download(ocl_mat_dst);
-
-            TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat_dst, dst, 1e-5));
-
            GPU_ON;
            ocl::addWeighted(d_src1, alpha, d_src2, beta, gama, d_dst);
            GPU_OFF;
@@ -1265,8 +1192,10 @@ PERFTEST(AddWeighted)
            d_src1.upload(src1);
            d_src2.upload(src2);
            ocl::addWeighted(d_src1, alpha, d_src2, beta, gama, d_dst);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
            GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1e-5);
        }

    }
--- a/modules/ocl/perf/perf_blend.cpp
+++ b/modules/ocl/perf/perf_blend.cpp
@@ -71,7 +71,7 @@ void blendLinearGold(const cv::Mat &img1, const cv::Mat &img2, const cv::Mat &we
 }
 PERFTEST(blend)
 {
-    Mat src1, src2, weights1, weights2, dst;
+    Mat src1, src2, weights1, weights2, dst, ocl_dst;
    ocl::oclMat d_src1, d_src2, d_weights1, d_weights2, d_dst;

    int all_type[] = {CV_8UC1, CV_8UC4};
@@ -103,10 +103,6 @@ PERFTEST(blend)
            ocl::blendLinear(d_src1, d_src2, d_weights1, d_weights2, d_dst);
            WARMUP_OFF;

-            cv::Mat ocl_mat;
-            d_dst.download(ocl_mat);
-            TestSystem::instance().setAccurate(ExpectedMatNear(dst, ocl_mat, 1.f));
-
            GPU_ON;
            ocl::blendLinear(d_src1, d_src2, d_weights1, d_weights2, d_dst);
            GPU_OFF;
@@ -117,8 +113,10 @@ PERFTEST(blend)
            d_weights1.upload(weights1);
            d_weights2.upload(weights2);
            ocl::blendLinear(d_src1, d_src2, d_weights1, d_weights2, d_dst);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
            GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 1.f);
        }
    }
 }
--- a/modules/ocl/perf/perf_brute_force_matcher.cpp
+++ b/modules/ocl/perf/perf_brute_force_matcher.cpp
@@ -88,9 +88,6 @@ PERFTEST(BruteForceMatcher)
        d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance);
        WARMUP_OFF;

-        d_matcher.match(d_query, d_train, d_matches[0]);
-        TestSystem::instance().setAccurate(AssertEQ<size_t>(d_matches[0].size(), matches[0].size()));
-
        GPU_ON;
        d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance);
        GPU_OFF;
@@ -98,9 +95,15 @@ PERFTEST(BruteForceMatcher)
        GPU_FULL_ON;
        d_query.upload(query);
        d_train.upload(train);
-        d_matcher.match(d_query, d_train, matches[0]);
+        d_matcher.match(d_query, d_train, d_matches[0]);
        GPU_FULL_OFF;

+        int diff = abs((int)d_matches[0].size() - (int)matches[0].size());
+        if(diff == 0)
+            TestSystem::instance().setAccurate(1, 0);
+        else
+            TestSystem::instance().setAccurate(0, diff);
+
        SUBTEST << size << "; knnMatch";

        matcher.knnMatch(query, train, matches, 2);
@@ -123,7 +126,11 @@ PERFTEST(BruteForceMatcher)
        d_matcher.knnMatch(d_query, d_train, d_matches, 2);
        GPU_FULL_OFF;

-        TestSystem::instance().setAccurate(AssertEQ<size_t>(d_matches[0].size(), matches[0].size()));
+        diff = abs((int)d_matches[0].size() - (int)matches[0].size());
+        if(diff == 0)
+            TestSystem::instance().setAccurate(1, 0);
+        else
+            TestSystem::instance().setAccurate(0, diff);

        SUBTEST << size << "; radiusMatch";

@@ -151,6 +158,10 @@ PERFTEST(BruteForceMatcher)
        d_matcher.radiusMatch(d_query, d_train, d_matches, max_distance);
        GPU_FULL_OFF;

-        TestSystem::instance().setAccurate(AssertEQ<size_t>(d_matches[0].size(), matches[0].size()));
+        diff = abs((int)d_matches[0].size() - (int)matches[0].size());
+        if(diff == 0)
+            TestSystem::instance().setAccurate(1, 0);
+        else
+            TestSystem::instance().setAccurate(0, diff);
    }
 }
--- a/modules/ocl/perf/perf_canny.cpp
+++ b/modules/ocl/perf/perf_canny.cpp
@@ -57,7 +57,7 @@ PERFTEST(Canny)

    SUBTEST << img.cols << 'x' << img.rows << "; aloeL.jpg" << "; edges" << "; CV_8UC1";

-    Mat edges(img.size(), CV_8UC1);
+    Mat edges(img.size(), CV_8UC1), ocl_edges;

    CPU_ON;
    Canny(img, edges, 50.0, 100.0);
@@ -71,8 +71,6 @@ PERFTEST(Canny)
    ocl::Canny(d_img, d_buf, d_edges, 50.0, 100.0);
    WARMUP_OFF;

-    TestSystem::instance().setAccurate(ExceptedMatSimilar(edges, d_edges, 2e-2));
-
    GPU_ON;
    ocl::Canny(d_img, d_buf, d_edges, 50.0, 100.0);
    GPU_OFF;
@@ -80,6 +78,8 @@ PERFTEST(Canny)
    GPU_FULL_ON;
    d_img.upload(img);
    ocl::Canny(d_img, d_buf, d_edges, 50.0, 100.0);
-    d_edges.download(edges);
+    d_edges.download(ocl_edges);
    GPU_FULL_OFF;
+
+    TestSystem::instance().ExceptedMatSimilar(edges, ocl_edges, 2e-2);
 }
--- a/modules/ocl/perf/perf_color.cpp
+++ b/modules/ocl/perf/perf_color.cpp
@@ -48,7 +48,7 @@
 ///////////// cvtColor////////////////////////
 PERFTEST(cvtColor)
 {
-    Mat src, dst;
+    Mat src, dst, ocl_dst;
    ocl::oclMat d_src, d_dst;

    int all_type[] = {CV_8UC4};
@@ -73,10 +73,6 @@ PERFTEST(cvtColor)
            ocl::cvtColor(d_src, d_dst, COLOR_RGBA2GRAY, 4);
            WARMUP_OFF;

-            cv::Mat ocl_mat;
-            d_dst.download(ocl_mat);
-            TestSystem::instance().setAccurate(ExceptedMatSimilar(dst, ocl_mat, 1e-5));
-
            GPU_ON;
            ocl::cvtColor(d_src, d_dst, COLOR_RGBA2GRAY, 4);
            GPU_OFF;
@@ -84,8 +80,10 @@ PERFTEST(cvtColor)
            GPU_FULL_ON;
            d_src.upload(src);
            ocl::cvtColor(d_src, d_dst, COLOR_RGBA2GRAY, 4);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
            GPU_FULL_OFF;
+
+            TestSystem::instance().ExceptedMatSimilar(dst, ocl_dst, 1e-5);
        }


--- a/modules/ocl/perf/perf_columnsum.cpp
+++ b/modules/ocl/perf/perf_columnsum.cpp
@@ -48,7 +48,7 @@
 ///////////// columnSum////////////////////////
 PERFTEST(columnSum)
 {
-    Mat src, dst;
+    Mat src, dst, ocl_dst;
    ocl::oclMat d_src, d_dst;

    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
@@ -63,23 +63,16 @@ PERFTEST(columnSum)
            dst.at<float>(0, j) = src.at<float>(0, j);

        for (int i = 1; i < src.rows; ++i)
-        {for (int j = 0; j < src.cols; ++j)
-            {
+            for (int j = 0; j < src.cols; ++j)
                dst.at<float>(i, j) = dst.at<float>(i - 1 , j) + src.at<float>(i , j);
-            }
-        }
-
        CPU_OFF;

        d_src.upload(src);
+
        WARMUP_ON;
        ocl::columnSum(d_src, d_dst);
        WARMUP_OFF;

-        cv::Mat ocl_mat;
-        d_dst.download(ocl_mat);
-        TestSystem::instance().setAccurate(ExpectedMatNear(dst, ocl_mat, 5e-1));
-
        GPU_ON;
        ocl::columnSum(d_src, d_dst);
        GPU_OFF;
@@ -87,7 +80,9 @@ PERFTEST(columnSum)
        GPU_FULL_ON;
        d_src.upload(src);
        ocl::columnSum(d_src, d_dst);
-        d_dst.download(dst);
+        d_dst.download(ocl_dst);
        GPU_FULL_OFF;
+
+        TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 5e-1);
    }
 }
--- a/modules/ocl/perf/perf_fft.cpp
+++ b/modules/ocl/perf/perf_fft.cpp
@@ -48,7 +48,7 @@
 ///////////// dft ////////////////////////
 PERFTEST(dft)
 {
-    Mat src, dst;
+    Mat src, dst, ocl_dst;
    ocl::oclMat d_src, d_dst;

    int all_type[] = {CV_32FC2};
@@ -74,8 +74,6 @@ PERFTEST(dft)
            ocl::dft(d_src, d_dst, Size(size, size));
            WARMUP_OFF;

-            TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), src.size().area() * 1e-4));
-
            GPU_ON;
            ocl::dft(d_src, d_dst, Size(size, size));
            GPU_OFF;
@@ -83,8 +81,10 @@ PERFTEST(dft)
            GPU_FULL_ON;
            d_src.upload(src);
            ocl::dft(d_src, d_dst, Size(size, size));
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
            GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(dst, ocl_dst, src.size().area() * 1e-4);
        }

    }
--- a/modules/ocl/perf/perf_filters.cpp
+++ b/modules/ocl/perf/perf_filters.cpp
@@ -48,7 +48,7 @@
 ///////////// Blur////////////////////////
 PERFTEST(Blur)
 {
-    Mat src1, dst;
+    Mat src1, dst, ocl_dst;
    ocl::oclMat d_src1, d_dst;

    Size ksize = Size(3, 3);
@@ -65,7 +65,6 @@ PERFTEST(Blur)
            gen(src1, size, size, all_type[j], 0, 256);
            gen(dst, size, size, all_type[j], 0, 256);

-
            blur(src1, dst, ksize, Point(-1, -1), bordertype);

            CPU_ON;
@@ -78,8 +77,6 @@ PERFTEST(Blur)
            ocl::blur(d_src1, d_dst, ksize, Point(-1, -1), bordertype);
            WARMUP_OFF;

-            TestSystem::instance().setAccurate(ExpectedMatNear(cv::Mat(d_dst), dst, 1.0));
-
            GPU_ON;
            ocl::blur(d_src1, d_dst, ksize, Point(-1, -1), bordertype);
            GPU_OFF;
@@ -87,8 +84,10 @@ PERFTEST(Blur)
            GPU_FULL_ON;
            d_src1.upload(src1);
            ocl::blur(d_src1, d_dst, ksize, Point(-1, -1), bordertype);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
            GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1.0);
        }

    }
@@ -96,7 +95,7 @@ PERFTEST(Blur)
 ///////////// Laplacian////////////////////////
 PERFTEST(Laplacian)
 {
-    Mat src1, dst;
+    Mat src1, dst, ocl_dst;
    ocl::oclMat d_src1, d_dst;

    int ksize = 3;
@@ -112,7 +111,6 @@ PERFTEST(Laplacian)
            gen(src1, size, size, all_type[j], 0, 256);
            gen(dst, size, size, all_type[j], 0, 256);

-
            Laplacian(src1, dst, -1, ksize, 1);

            CPU_ON;
@@ -125,8 +123,6 @@ PERFTEST(Laplacian)
            ocl::Laplacian(d_src1, d_dst, -1, ksize, 1);
            WARMUP_OFF;

-            TestSystem::instance().setAccurate(ExpectedMatNear(cv::Mat(d_dst), dst, 1e-5));
-
            GPU_ON;
            ocl::Laplacian(d_src1, d_dst, -1, ksize, 1);
            GPU_OFF;
@@ -134,8 +130,10 @@ PERFTEST(Laplacian)
            GPU_FULL_ON;
            d_src1.upload(src1);
            ocl::Laplacian(d_src1, d_dst, -1, ksize, 1);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
            GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1e-5);
        }

    }
@@ -144,7 +142,7 @@ PERFTEST(Laplacian)
 ///////////// Erode ////////////////////
 PERFTEST(Erode)
 {
-    Mat src, dst, ker;
+    Mat src, dst, ker, ocl_dst;
    ocl::oclMat d_src, d_dst;

    int all_type[] = {CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4};
@@ -171,8 +169,6 @@ PERFTEST(Erode)
            ocl::erode(d_src, d_dst, ker);
            WARMUP_OFF;

-            TestSystem::instance().setAccurate(ExpectedMatNear(cv::Mat(d_dst), dst, 1e-5));            
-
            GPU_ON;
            ocl::erode(d_src, d_dst, ker);
            GPU_OFF;
@@ -180,8 +176,10 @@ PERFTEST(Erode)
            GPU_FULL_ON;
            d_src.upload(src);
            ocl::erode(d_src, d_dst, ker);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
            GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1e-5);
        }

    }
@@ -190,7 +188,7 @@ PERFTEST(Erode)
 ///////////// Sobel ////////////////////////
 PERFTEST(Sobel)
 {
-    Mat src, dst;
+    Mat src, dst, ocl_dst;
    ocl::oclMat d_src, d_dst;

    int dx = 1;
@@ -218,8 +216,6 @@ PERFTEST(Sobel)
            ocl::Sobel(d_src, d_dst, -1, dx, dy);
            WARMUP_OFF;

-            TestSystem::instance().setAccurate(ExpectedMatNear(cv::Mat(d_dst), dst, 1));
-
            GPU_ON;
            ocl::Sobel(d_src, d_dst, -1, dx, dy);
            GPU_OFF;
@@ -227,8 +223,10 @@ PERFTEST(Sobel)
            GPU_FULL_ON;
            d_src.upload(src);
            ocl::Sobel(d_src, d_dst, -1, dx, dy);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
            GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1);
        }

    }
@@ -236,7 +234,7 @@ PERFTEST(Sobel)
 ///////////// Scharr ////////////////////////
 PERFTEST(Scharr)
 {
-    Mat src, dst;
+    Mat src, dst, ocl_dst;
    ocl::oclMat d_src, d_dst;

    int dx = 1;
@@ -264,8 +262,6 @@ PERFTEST(Scharr)
            ocl::Scharr(d_src, d_dst, -1, dx, dy);
            WARMUP_OFF;

-            TestSystem::instance().setAccurate(ExpectedMatNear(cv::Mat(d_dst), dst, 1));
-
            GPU_ON;
            ocl::Scharr(d_src, d_dst, -1, dx, dy);
            GPU_OFF;
@@ -273,8 +269,10 @@ PERFTEST(Scharr)
            GPU_FULL_ON;
            d_src.upload(src);
            ocl::Scharr(d_src, d_dst, -1, dx, dy);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
            GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1);
        }

    }
@@ -283,7 +281,7 @@ PERFTEST(Scharr)
 ///////////// GaussianBlur ////////////////////////
 PERFTEST(GaussianBlur)
 {
-    Mat src, dst;
+    Mat src, dst, ocl_dst;
    int all_type[] = {CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4};
    std::string type_name[] = {"CV_8UC1", "CV_8UC4", "CV_32FC1", "CV_32FC4"};

@@ -311,9 +309,6 @@ PERFTEST(GaussianBlur)
            ocl::GaussianBlur(d_src, d_dst, Size(9, 9), 0);
            WARMUP_OFF;

-            TestSystem::instance().setAccurate(ExpectedMatNear(cv::Mat(d_dst), dst, 1.0));
-
-
            GPU_ON;
            ocl::GaussianBlur(d_src, d_dst, Size(9, 9), 0);
            GPU_OFF;
@@ -321,8 +316,10 @@ PERFTEST(GaussianBlur)
            GPU_FULL_ON;
            d_src.upload(src);
            ocl::GaussianBlur(d_src, d_dst, Size(9, 9), 0);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
            GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1.0);
        }

    }
@@ -349,7 +346,7 @@ PERFTEST(filter2D)
                Mat kernel;
                gen(kernel, ksize, ksize, CV_32FC1, 0.0, 1.0);

-				Mat dst(src);
+				Mat dst, ocl_dst;
 				dst.setTo(0);
                cv::filter2D(src, dst, -1, kernel);

@@ -357,17 +354,12 @@ PERFTEST(filter2D)
                cv::filter2D(src, dst, -1, kernel);
                CPU_OFF;

-                ocl::oclMat d_src(src);
-                ocl::oclMat d_dst(d_src);
-				d_dst.setTo(0);
+                ocl::oclMat d_src(src), d_dst;

                WARMUP_ON;
                ocl::filter2D(d_src, d_dst, -1, kernel);
                WARMUP_OFF;

-                TestSystem::instance().setAccurate(ExpectedMatNear(cv::Mat(d_dst), dst, 1e-5));
-
-
                GPU_ON;
                ocl::filter2D(d_src, d_dst, -1, kernel);
                GPU_OFF;
@@ -375,8 +367,10 @@ PERFTEST(filter2D)
                GPU_FULL_ON;
                d_src.upload(src);
                ocl::filter2D(d_src, d_dst, -1, kernel);
-                d_dst.download(dst);
+                d_dst.download(ocl_dst);
                GPU_FULL_OFF;
+
+                TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1e-5);
            }

        }
--- a/modules/ocl/perf/perf_gemm.cpp
+++ b/modules/ocl/perf/perf_gemm.cpp
@@ -48,7 +48,7 @@
 ///////////// gemm ////////////////////////
 PERFTEST(gemm)
 {
-    Mat src1, src2, src3, dst;
+    Mat src1, src2, src3, dst, ocl_dst;
    ocl::oclMat d_src1, d_src2, d_src3, d_dst;

    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
@@ -72,7 +72,6 @@ PERFTEST(gemm)
        WARMUP_ON;
        ocl::gemm(d_src1, d_src2, 1.0, d_src3, 1.0, d_dst);
        WARMUP_OFF;
-        TestSystem::instance().setAccurate(ExpectedMatNear(cv::Mat(d_dst), dst, src1.cols * src1.rows * 1e-4));

        GPU_ON;
        ocl::gemm(d_src1, d_src2, 1.0, d_src3, 1.0, d_dst);
@@ -83,7 +82,9 @@ PERFTEST(gemm)
        d_src2.upload(src2);
        d_src3.upload(src3);
        ocl::gemm(d_src1, d_src2, 1.0, d_src3, 1.0, d_dst);
-        d_dst.download(dst);
+        d_dst.download(ocl_dst);
        GPU_FULL_OFF;
+
+        TestSystem::instance().ExpectedMatNear(ocl_dst, dst, src1.cols * src1.rows * 1e-4);
    }
 }
--- a/modules/ocl/perf/perf_haar.cpp
+++ b/modules/ocl/perf/perf_haar.cpp
@@ -125,8 +125,10 @@ PERFTEST(Haar)
                                 1.1, 2, 0 | CV_HAAR_SCALE_IMAGE, Size(30, 30));
    WARMUP_OFF;

-    //Testing whether the expected is equal to the actual.
-    TestSystem::instance().setAccurate(ExpectedEQ<vector<Rect>::size_type, vector<Rect>::size_type>(faces.size(), oclfaces.size()));
+    if(faces.size() == oclfaces.size())
+        TestSystem::instance().setAccurate(1, 0);
+    else
+        TestSystem::instance().setAccurate(0, abs((int)faces.size() - (int)oclfaces.size()));

    faces.clear();

--- a/modules/ocl/perf/perf_hog.cpp
+++ b/modules/ocl/perf/perf_hog.cpp
@@ -146,10 +146,8 @@ PERFTEST(HOG)
        }
    }

-    cv::Mat ocl_mat;
-    ocl_mat = cv::Mat(d_comp);
-    ocl_mat.convertTo(ocl_mat, cv::Mat(comp).type());
-    TestSystem::instance().setAccurate(ExpectedMatNear(ocl_mat, cv::Mat(comp), 3));
+    cv::Mat gpu_rst(d_comp), cpu_rst(comp);
+    TestSystem::instance().ExpectedMatNear(gpu_rst, cpu_rst, 3);

    GPU_ON;
    ocl_hog.detectMultiScale(d_src, found_locations);
--- a/modules/ocl/perf/perf_imgproc.cpp
+++ b/modules/ocl/perf/perf_imgproc.cpp
@@ -48,7 +48,7 @@
 ///////////// equalizeHist ////////////////////////
 PERFTEST(equalizeHist)
 {
-    Mat src, dst;
+    Mat src, dst, ocl_dst;
    int all_type[] = {CV_8UC1};
    std::string type_name[] = {"CV_8UC1"};

@@ -75,9 +75,6 @@ PERFTEST(equalizeHist)
            ocl::equalizeHist(d_src, d_dst);
            WARMUP_OFF;

-            TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), 1.1));
-
-
            GPU_ON;
            ocl::equalizeHist(d_src, d_dst);
            GPU_OFF;
@@ -85,8 +82,10 @@ PERFTEST(equalizeHist)
            GPU_FULL_ON;
            d_src.upload(src);
            ocl::equalizeHist(d_src, d_dst);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
            GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 1.1);
        }

    }
@@ -94,7 +93,7 @@ PERFTEST(equalizeHist)
 /////////// CopyMakeBorder //////////////////////
 PERFTEST(CopyMakeBorder)
 {
-    Mat src, dst;
+    Mat src, dst, ocl_dst;
    ocl::oclMat d_dst;

    int bordertype = BORDER_CONSTANT;
@@ -122,9 +121,6 @@ PERFTEST(CopyMakeBorder)
            ocl::copyMakeBorder(d_src, d_dst, 7, 5, 5, 7, bordertype, cv::Scalar(1.0));
            WARMUP_OFF;

-            TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), 0.0));
-
-
            GPU_ON;
            ocl::copyMakeBorder(d_src, d_dst, 7, 5, 5, 7, bordertype, cv::Scalar(1.0));
            GPU_OFF;
@@ -132,8 +128,10 @@ PERFTEST(CopyMakeBorder)
            GPU_FULL_ON;
            d_src.upload(src);
            ocl::copyMakeBorder(d_src, d_dst, 7, 5, 5, 7, bordertype, cv::Scalar(1.0));
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
            GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 0.0);
        }

    }
@@ -141,7 +139,7 @@ PERFTEST(CopyMakeBorder)
 ///////////// cornerMinEigenVal ////////////////////////
 PERFTEST(cornerMinEigenVal)
 {
-    Mat src, dst;
+    Mat src, dst, ocl_dst;
    ocl::oclMat d_dst;

    int blockSize = 7, apertureSize = 1 + 2 * (rand() % 4);
@@ -155,7 +153,6 @@ PERFTEST(cornerMinEigenVal)
        {
            SUBTEST << size << 'x' << size << "; " << type_name[j] ;

-
            gen(src, size, size, all_type[j], 0, 256);

            cornerMinEigenVal(src, dst, blockSize, apertureSize, borderType);
@@ -170,9 +167,6 @@ PERFTEST(cornerMinEigenVal)
            ocl::cornerMinEigenVal(d_src, d_dst, blockSize, apertureSize, borderType);
            WARMUP_OFF;

-            TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), 1.0));
-
-
            GPU_ON;
            ocl::cornerMinEigenVal(d_src, d_dst, blockSize, apertureSize, borderType);
            GPU_OFF;
@@ -180,8 +174,10 @@ PERFTEST(cornerMinEigenVal)
            GPU_FULL_ON;
            d_src.upload(src);
            ocl::cornerMinEigenVal(d_src, d_dst, blockSize, apertureSize, borderType);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
            GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 1.0);
        }

    }
@@ -189,7 +185,7 @@ PERFTEST(cornerMinEigenVal)
 ///////////// cornerHarris ////////////////////////
 PERFTEST(cornerHarris)
 {
-    Mat src, dst;
+    Mat src, dst, ocl_dst;
    ocl::oclMat d_src, d_dst;

    int all_type[] = {CV_8UC1, CV_32FC1};
@@ -215,8 +211,6 @@ PERFTEST(cornerHarris)
            ocl::cornerHarris(d_src, d_dst, 5, 7, 0.1, BORDER_REFLECT);
            WARMUP_OFF;

-            TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), 1.0));
-
            GPU_ON;
            ocl::cornerHarris(d_src, d_dst, 5, 7, 0.1, BORDER_REFLECT);
            GPU_OFF;
@@ -224,8 +218,10 @@ PERFTEST(cornerHarris)
            GPU_FULL_ON;
            d_src.upload(src);
            ocl::cornerHarris(d_src, d_dst, 5, 7, 0.1, BORDER_REFLECT);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
            GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 1.0);
        }


@@ -234,7 +230,7 @@ PERFTEST(cornerHarris)
 ///////////// integral ////////////////////////
 PERFTEST(integral)
 {
-    Mat src, sum;
+    Mat src, sum, ocl_sum;
    ocl::oclMat d_src, d_sum, d_buf;

    int all_type[] = {CV_8UC1};
@@ -260,12 +256,6 @@ PERFTEST(integral)
            ocl::integral(d_src, d_sum);
            WARMUP_OFF;

-            cv::Mat ocl_mat;
-            d_sum.download(ocl_mat);
-            if(sum.type() == ocl_mat.type()) //we won't test accuracy when cpu function overlow
-                TestSystem::instance().setAccurate(ExpectedMatNear(sum, ocl_mat, 0.0));
-
-
            GPU_ON;
            ocl::integral(d_src, d_sum);
            GPU_OFF;
@@ -273,8 +263,12 @@ PERFTEST(integral)
            GPU_FULL_ON;
            d_src.upload(src);
            ocl::integral(d_src, d_sum);
-            d_sum.download(sum);
+            d_sum.download(ocl_sum);
            GPU_FULL_OFF;
+
+            if(sum.type() == ocl_sum.type()) //we won't test accuracy when cpu function overlow
+                TestSystem::instance().ExpectedMatNear(sum, ocl_sum, 0.0);
+
        }

    }
@@ -282,7 +276,7 @@ PERFTEST(integral)
 ///////////// WarpAffine ////////////////////////
 PERFTEST(WarpAffine)
 {
-    Mat src, dst;
+    Mat src, dst, ocl_dst;
    ocl::oclMat d_src, d_dst;

    static const double coeffs[2][3] =
@@ -319,8 +313,6 @@ PERFTEST(WarpAffine)
            ocl::warpAffine(d_src, d_dst, M, size1, interpolation);
            WARMUP_OFF;

-            TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), 1.0));
-
            GPU_ON;
            ocl::warpAffine(d_src, d_dst, M, size1, interpolation);
            GPU_OFF;
@@ -328,8 +320,10 @@ PERFTEST(WarpAffine)
            GPU_FULL_ON;
            d_src.upload(src);
            ocl::warpAffine(d_src, d_dst, M, size1, interpolation);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
            GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 1.0);
        }

    }
@@ -337,7 +331,7 @@ PERFTEST(WarpAffine)
 ///////////// WarpPerspective ////////////////////////
 PERFTEST(WarpPerspective)
 {
-    Mat src, dst;
+    Mat src, dst, ocl_dst;
    ocl::oclMat d_src, d_dst;

    static const double coeffs[3][3] =
@@ -374,8 +368,6 @@ PERFTEST(WarpPerspective)
            ocl::warpPerspective(d_src, d_dst, M, size1, interpolation);
            WARMUP_OFF;

-            TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), 1.0));
-
            GPU_ON;
            ocl::warpPerspective(d_src, d_dst, M, size1, interpolation);
            GPU_OFF;
@@ -383,8 +375,10 @@ PERFTEST(WarpPerspective)
            GPU_FULL_ON;
            d_src.upload(src);
            ocl::warpPerspective(d_src, d_dst, M, size1, interpolation);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
            GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 1.0);
        }

    }
@@ -393,7 +387,7 @@ PERFTEST(WarpPerspective)
 ///////////// resize ////////////////////////
 PERFTEST(resize)
 {
-    Mat src, dst;
+    Mat src, dst, ocl_dst;
    ocl::oclMat d_src, d_dst;


@@ -420,9 +414,6 @@ PERFTEST(resize)
            ocl::resize(d_src, d_dst, Size(), 2.0, 2.0);
            WARMUP_OFF;

-            TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), 1.0));
-
-
            GPU_ON;
            ocl::resize(d_src, d_dst, Size(), 2.0, 2.0);
            GPU_OFF;
@@ -430,8 +421,10 @@ PERFTEST(resize)
            GPU_FULL_ON;
            d_src.upload(src);
            ocl::resize(d_src, d_dst, Size(), 2.0, 2.0);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
            GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 1.0);
        }

    }
@@ -456,8 +449,6 @@ PERFTEST(resize)
            ocl::resize(d_src, d_dst, Size(), 0.5, 0.5);
            WARMUP_OFF;

-            TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), 1.0));
-
            GPU_ON;
            ocl::resize(d_src, d_dst, Size(), 0.5, 0.5);
            GPU_OFF;
@@ -465,8 +456,10 @@ PERFTEST(resize)
            GPU_FULL_ON;
            d_src.upload(src);
            ocl::resize(d_src, d_dst, Size(), 0.5, 0.5);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
            GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 1.0);
        }

    }
@@ -474,10 +467,9 @@ PERFTEST(resize)
 ///////////// threshold////////////////////////
 PERFTEST(threshold)
 {
-    Mat src, dst;
+    Mat src, dst, ocl_dst;
    ocl::oclMat d_src, d_dst;

-
    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
    {
        SUBTEST << size << 'x' << size << "; 8UC1; THRESH_BINARY";
@@ -496,9 +488,6 @@ PERFTEST(threshold)
        ocl::threshold(d_src, d_dst, 50.0, 0.0, THRESH_BINARY);
        WARMUP_OFF;

-        TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), 1.0));
-
-
        GPU_ON;
        ocl::threshold(d_src, d_dst, 50.0, 0.0, THRESH_BINARY);
        GPU_OFF;
@@ -506,9 +495,10 @@ PERFTEST(threshold)
        GPU_FULL_ON;
        d_src.upload(src);
        ocl::threshold(d_src, d_dst, 50.0, 0.0, THRESH_BINARY);
-        d_dst.download(dst);
+        d_dst.download(ocl_dst);
        GPU_FULL_OFF;

+        TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 1.0);
    }

    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
@@ -529,8 +519,6 @@ PERFTEST(threshold)
        ocl::threshold(d_src, d_dst, 50.0, 0.0, THRESH_TRUNC);
        WARMUP_OFF;

-        TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), 1.0));
-
        GPU_ON;
        ocl::threshold(d_src, d_dst, 50.0, 0.0, THRESH_TRUNC);
        GPU_OFF;
@@ -538,8 +526,10 @@ PERFTEST(threshold)
        GPU_FULL_ON;
        d_src.upload(src);
        ocl::threshold(d_src, d_dst, 50.0, 0.0, THRESH_TRUNC);
-        d_dst.download(dst);
+        d_dst.download(ocl_dst);
        GPU_FULL_OFF;
+
+        TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 1.0);
    }
 }
 ///////////// meanShiftFiltering////////////////////////
@@ -726,7 +716,7 @@ void meanShiftFiltering_(const Mat &src_roi, Mat &dst_roi, int sp, int sr, cv::T
 PERFTEST(meanShiftFiltering)
 {
    int sp = 5, sr = 6;
-    Mat src, dst;
+    Mat src, dst, ocl_dst;

    ocl::oclMat d_src, d_dst;

@@ -753,11 +743,6 @@ PERFTEST(meanShiftFiltering)
        ocl::meanShiftFiltering(d_src, d_dst, sp, sr, crit);
        WARMUP_OFF;

-        cv::Mat ocl_mat;
-        d_dst.download(ocl_mat);
-
-        TestSystem::instance().setAccurate(ExpectedMatNear(dst, ocl_mat, 0.0));
-
        GPU_ON;
        ocl::meanShiftFiltering(d_src, d_dst, sp, sr);
        GPU_OFF;
@@ -765,8 +750,10 @@ PERFTEST(meanShiftFiltering)
        GPU_FULL_ON;
        d_src.upload(src);
        ocl::meanShiftFiltering(d_src, d_dst, sp, sr);
-        d_dst.download(dst);
+        d_dst.download(ocl_dst);
        GPU_FULL_OFF;
+
+        TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 0.0);
    }
 }
 ///////////// meanShiftProc////////////////////////
@@ -1010,8 +997,9 @@ void meanShiftProc_(const Mat &src_roi, Mat &dst_roi, Mat &dstCoor_roi, int sp,
 }
 PERFTEST(meanShiftProc)
 {
-    Mat src, dst, dstCoor_roi;
-    ocl::oclMat d_src, d_dst, d_dstCoor_roi;
+    Mat src;
+    vector<Mat> dst(2), ocl_dst(2);
+    ocl::oclMat d_src, d_dst, d_dstCoor;

    TermCriteria crit(TermCriteria::COUNT + TermCriteria::EPS, 5, 1);

@@ -1020,42 +1008,41 @@ PERFTEST(meanShiftProc)
        SUBTEST << size << 'x' << size << "; 8UC4 and CV_16SC2 ";

        gen(src, size, size, CV_8UC4, Scalar::all(0), Scalar::all(256));
-        gen(dst, size, size, CV_8UC4, Scalar::all(0), Scalar::all(256));
-        gen(dstCoor_roi, size, size, CV_16SC2, Scalar::all(0), Scalar::all(256));
+        gen(dst[0], size, size, CV_8UC4, Scalar::all(0), Scalar::all(256));
+        gen(dst[1], size, size, CV_16SC2, Scalar::all(0), Scalar::all(256));

-        meanShiftProc_(src, dst, dstCoor_roi, 5, 6, crit);
+        meanShiftProc_(src, dst[0], dst[1], 5, 6, crit);

        CPU_ON;
-        meanShiftProc_(src, dst, dstCoor_roi, 5, 6, crit);
+        meanShiftProc_(src, dst[0], dst[1], 5, 6, crit);
        CPU_OFF;

        d_src.upload(src);

        WARMUP_ON;
-        ocl::meanShiftProc(d_src, d_dst, d_dstCoor_roi, 5, 6, crit);
+        ocl::meanShiftProc(d_src, d_dst, d_dstCoor, 5, 6, crit);
        WARMUP_OFF;

-        TestSystem::instance().setAccurate(ExpectedMatNear(dstCoor_roi, cv::Mat(d_dstCoor_roi), 0.0)
-            &&ExpectedMatNear(dst, cv::Mat(d_dst), 0.0));
-
        GPU_ON;
-        ocl::meanShiftProc(d_src, d_dst, d_dstCoor_roi, 5, 6, crit);
+        ocl::meanShiftProc(d_src, d_dst, d_dstCoor, 5, 6, crit);
        GPU_OFF;

        GPU_FULL_ON;
        d_src.upload(src);
-        ocl::meanShiftProc(d_src, d_dst, d_dstCoor_roi, 5, 6, crit);
-        d_dst.download(dst);
-        d_dstCoor_roi.download(dstCoor_roi);
+        ocl::meanShiftProc(d_src, d_dst, d_dstCoor, 5, 6, crit);
+        d_dst.download(ocl_dst[0]);
+        d_dstCoor.download(ocl_dst[1]);
        GPU_FULL_OFF;

+        vector<double> eps(2, 0.);
+        TestSystem::instance().ExpectMatsNear(dst, ocl_dst, eps);
    }
 }

 ///////////// remap////////////////////////
 PERFTEST(remap)
 {
-    Mat src, dst, xmap, ymap;
+    Mat src, dst, xmap, ymap, ocl_dst;
    ocl::oclMat d_src, d_dst, d_xmap, d_ymap;

    int all_type[] = {CV_8UC1, CV_8UC4};
@@ -1088,7 +1075,6 @@ PERFTEST(remap)
                }
            }

-
            remap(src, dst, xmap, ymap, interpolation, borderMode);

            CPU_ON;
@@ -1104,12 +1090,6 @@ PERFTEST(remap)
            ocl::remap(d_src, d_dst, d_xmap, d_ymap, interpolation, borderMode);
            WARMUP_OFF;

-            if(interpolation == 0)
-                TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), 1.0));
-            else
-                TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), 2.0));
-
-
            GPU_ON;
            ocl::remap(d_src, d_dst, d_xmap, d_ymap, interpolation, borderMode);
            GPU_OFF;
@@ -1117,8 +1097,10 @@ PERFTEST(remap)
            GPU_FULL_ON;
            d_src.upload(src);
            ocl::remap(d_src, d_dst, d_xmap, d_ymap, interpolation, borderMode);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
            GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 2.0);
        }

    }
--- a/modules/ocl/perf/perf_match_template.cpp
+++ b/modules/ocl/perf/perf_match_template.cpp
@@ -56,11 +56,9 @@
 PERFTEST(matchTemplate)
 {
    //InitMatchTemplate();
-
-    Mat src, templ, dst;
+    Mat src, templ, dst, ocl_dst;
    int templ_size = 5;

-
    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
    {
        int all_type[] = {CV_32FC1, CV_32FC4};
@@ -82,16 +80,12 @@ PERFTEST(matchTemplate)
                matchTemplate(src, templ, dst, TM_CCORR);
                CPU_OFF;

-                ocl::oclMat d_src(src), d_templ, d_dst;
-
-                d_templ.upload(templ);
+                ocl::oclMat d_src(src), d_templ(templ), d_dst;

                WARMUP_ON;
                ocl::matchTemplate(d_src, d_templ, d_dst, TM_CCORR);
                WARMUP_OFF;

-                TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), templ.rows * templ.cols * 1e-1));
-
                GPU_ON;
                ocl::matchTemplate(d_src, d_templ, d_dst, TM_CCORR);
                GPU_OFF;
@@ -100,8 +94,10 @@ PERFTEST(matchTemplate)
                d_src.upload(src);
                d_templ.upload(templ);
                ocl::matchTemplate(d_src, d_templ, d_dst, TM_CCORR);
-                d_dst.download(dst);
+                d_dst.download(ocl_dst);
                GPU_FULL_OFF;
+
+                TestSystem::instance().ExpectedMatNear(dst, ocl_dst, templ.rows * templ.cols * 1e-1);
            }
        }

@@ -131,8 +127,6 @@ PERFTEST(matchTemplate)
                ocl::matchTemplate(d_src, d_templ, d_dst, TM_CCORR_NORMED);
                WARMUP_OFF;

-                TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), templ.rows * templ.cols * 1e-1));
-
                GPU_ON;
                ocl::matchTemplate(d_src, d_templ, d_dst, TM_CCORR_NORMED);
                GPU_OFF;
@@ -141,8 +135,10 @@ PERFTEST(matchTemplate)
                d_src.upload(src);
                d_templ.upload(templ);
                ocl::matchTemplate(d_src, d_templ, d_dst, TM_CCORR_NORMED);
-                d_dst.download(dst);
+                d_dst.download(ocl_dst);
                GPU_FULL_OFF;
+
+                TestSystem::instance().ExpectedMatNear(dst, ocl_dst, templ.rows * templ.cols * 1e-1);
            }
        }
    }
--- a/modules/ocl/perf/perf_matrix_operation.cpp
+++ b/modules/ocl/perf/perf_matrix_operation.cpp
@@ -48,7 +48,7 @@
 ///////////// ConvertTo////////////////////////
 PERFTEST(ConvertTo)
 {
-    Mat src, dst;
+    Mat src, dst, ocl_dst;
    ocl::oclMat d_src, d_dst;

    int all_type[] = {CV_8UC1, CV_8UC4};
@@ -77,9 +77,6 @@ PERFTEST(ConvertTo)
            d_src.convertTo(d_dst, CV_32FC1);
            WARMUP_OFF;

-            TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), 0.0));            
-
-
            GPU_ON;
            d_src.convertTo(d_dst, CV_32FC1);
            GPU_OFF;
@@ -87,8 +84,10 @@ PERFTEST(ConvertTo)
            GPU_FULL_ON;
            d_src.upload(src);
            d_src.convertTo(d_dst, CV_32FC1);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
            GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 0.0);
        }

    }
@@ -96,7 +95,7 @@ PERFTEST(ConvertTo)
 ///////////// copyTo////////////////////////
 PERFTEST(copyTo)
 {
-    Mat src, dst;
+    Mat src, dst, ocl_dst;
    ocl::oclMat d_src, d_dst;

    int all_type[] = {CV_8UC1, CV_8UC4};
@@ -125,9 +124,6 @@ PERFTEST(copyTo)
            d_src.copyTo(d_dst);
            WARMUP_OFF;

-            TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), 0.0));            
-
-
            GPU_ON;
            d_src.copyTo(d_dst);
            GPU_OFF;
@@ -135,8 +131,10 @@ PERFTEST(copyTo)
            GPU_FULL_ON;
            d_src.upload(src);
            d_src.copyTo(d_dst);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
            GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 0.0);
        }

    }
@@ -144,9 +142,9 @@ PERFTEST(copyTo)
 ///////////// setTo////////////////////////
 PERFTEST(setTo)
 {
-    Mat src, dst;
+    Mat src, ocl_src;
    Scalar val(1, 2, 3, 4);
-    ocl::oclMat d_src, d_dst;
+    ocl::oclMat d_src;

    int all_type[] = {CV_8UC1, CV_8UC4};
    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
@@ -171,10 +169,10 @@ PERFTEST(setTo)
            d_src.setTo(val);
            WARMUP_OFF;

-            TestSystem::instance().setAccurate(ExpectedMatNear(src, cv::Mat(d_src), 1.0));            
+            d_src.download(ocl_src);
+            TestSystem::instance().ExpectedMatNear(src, ocl_src, 1.0);

-
-            GPU_ON;
+            GPU_ON;;
            d_src.setTo(val);
            GPU_OFF;

--- a/modules/ocl/perf/perf_norm.cpp
+++ b/modules/ocl/perf/perf_norm.cpp
@@ -48,39 +48,40 @@
 ///////////// norm////////////////////////
 PERFTEST(norm)
 {
-    Mat src, buf;
-    ocl::oclMat d_src, d_buf;
-
+    Mat src1, src2, ocl_src1;
+    ocl::oclMat d_src1, d_src2;

    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
    {
        SUBTEST << size << 'x' << size << "; CV_8UC1; NORM_INF";

-        gen(src, size, size, CV_8UC1, Scalar::all(0), Scalar::all(1));
-        gen(buf, size, size, CV_8UC1, Scalar::all(0), Scalar::all(1));
+        gen(src1, size, size, CV_8UC1, Scalar::all(0), Scalar::all(1));
+        gen(src2, size, size, CV_8UC1, Scalar::all(0), Scalar::all(1));

-        norm(src, NORM_INF);
+        norm(src1, src2, NORM_INF);

        CPU_ON;
-        norm(src, NORM_INF);
+        norm(src1, src2, NORM_INF);
        CPU_OFF;

-        d_src.upload(src);
-        d_buf.upload(buf);
+        d_src1.upload(src1);
+        d_src2.upload(src2);

        WARMUP_ON;
-        ocl::norm(d_src, d_buf, NORM_INF);
+        ocl::norm(d_src1, d_src2, NORM_INF);
        WARMUP_OFF;

-        TestSystem::instance().setAccurate(ExpectedMatNear(src, cv::Mat(d_buf), .5));                        
+        d_src1.download(ocl_src1);
+        TestSystem::instance().ExpectedMatNear(src1, ocl_src1, .5);                        

        GPU_ON;
-        ocl::norm(d_src, d_buf, NORM_INF);
+        ocl::norm(d_src1, d_src2, NORM_INF);
        GPU_OFF;

        GPU_FULL_ON;
-        d_src.upload(src);
-        ocl::norm(d_src, d_buf, NORM_INF);
+        d_src1.upload(src1);
+        d_src2.upload(src2);
+        ocl::norm(d_src1, d_src2, NORM_INF);
        GPU_FULL_OFF;
    }
 }
--- a/modules/ocl/perf/perf_opticalflow.cpp
+++ b/modules/ocl/perf/perf_opticalflow.cpp
@@ -82,8 +82,8 @@ PERFTEST(PyrLKOpticalFlow)
                SUBTEST << frame0.cols << "x" << frame0.rows << "; color; " << points << " points";
            else
                SUBTEST << frame0.cols << "x" << frame0.rows << "; gray; " << points << " points";
-            Mat nextPts_cpu;
-            Mat status_cpu;
+            Mat ocl_nextPts;
+            Mat ocl_status;

            vector<Point2f> pts;
            goodFeaturesToTrack(i == 0 ? gray_frame : frame0, pts, points, 0.01, 0.0);
@@ -116,12 +116,6 @@ PERFTEST(PyrLKOpticalFlow)
            d_pyrLK.sparse(d_frame0, d_frame1, d_pts, d_nextPts, d_status, &d_err);
            WARMUP_OFF;

-            std::vector<cv::Point2f> ocl_nextPts(d_nextPts.cols);
-            std::vector<unsigned char> ocl_status(d_status.cols);
-            TestSystem::instance().setAccurate(AssertEQ<size_t>(nextPts.size(), ocl_nextPts.size()));
-            TestSystem::instance().setAccurate(AssertEQ<size_t>(status.size(), ocl_status.size()));                        
-
-
            GPU_ON;
            d_pyrLK.sparse(d_frame0, d_frame1, d_pts, d_nextPts, d_status, &d_err);
            GPU_OFF;
@@ -133,17 +127,102 @@ PERFTEST(PyrLKOpticalFlow)
            d_pyrLK.sparse(d_frame0, d_frame1, d_pts, d_nextPts, d_status, &d_err);

            if (!d_nextPts.empty())
-            {
-                d_nextPts.download(nextPts_cpu);
-            }
+                d_nextPts.download(ocl_nextPts);

            if (!d_status.empty())
-            {
-                d_status.download(status_cpu);
-            }
-
+                d_status.download(ocl_status);
            GPU_FULL_OFF;
+
+            size_t mismatch = 0;
+            for (int i = 0; i < (int)nextPts.size(); ++i)
+            {
+                if(status[i] != ocl_status.at<unsigned char>(0, i)){
+                    mismatch++;
+                    continue;
+                }
+                if(status[i]){
+                    Point2f gpu_rst = ocl_nextPts.at<Point2f>(0, i);
+                    Point2f cpu_rst = nextPts[i];
+                    if(fabs(gpu_rst.x - cpu_rst.x) >= 1. || fabs(gpu_rst.y - cpu_rst.y) >= 1.)
+                        mismatch++;
+                }
+            }
+            double ratio = (double)mismatch / (double)nextPts.size();
+            if(ratio < .02)
+                TestSystem::instance().setAccurate(1, ratio);
+            else
+                TestSystem::instance().setAccurate(0, ratio);
        }

    }
 }
+
+
+PERFTEST(tvl1flow)
+{
+    cv::Mat frame0 = imread("rubberwhale1.png", cv::IMREAD_GRAYSCALE);
+    assert(!frame0.empty());
+
+    cv::Mat frame1 = imread("rubberwhale2.png", cv::IMREAD_GRAYSCALE);
+    assert(!frame1.empty());
+
+    cv::ocl::OpticalFlowDual_TVL1_OCL d_alg;
+    cv::ocl::oclMat d_flowx(frame0.size(), CV_32FC1);
+    cv::ocl::oclMat d_flowy(frame1.size(), CV_32FC1);
+
+    cv::Ptr<cv::DenseOpticalFlow> alg = cv::createOptFlow_DualTVL1();
+    cv::Mat flow;
+
+
+    SUBTEST << frame0.cols << 'x' << frame0.rows << "; rubberwhale1.png; "<<frame1.cols<<'x'<<frame1.rows<<"; rubberwhale2.png";
+
+    alg->calc(frame0, frame1, flow);
+
+    CPU_ON;
+    alg->calc(frame0, frame1, flow);
+    CPU_OFF;
+
+    cv::Mat gold[2];
+    cv::split(flow, gold);
+
+    cv::ocl::oclMat d0(frame0.size(), CV_32FC1);
+    d0.upload(frame0);
+    cv::ocl::oclMat d1(frame1.size(), CV_32FC1);
+    d1.upload(frame1);
+
+    WARMUP_ON;
+    d_alg(d0, d1, d_flowx, d_flowy);
+    WARMUP_OFF;
+/*
+    double diff1 = 0.0, diff2 = 0.0;
+    if(ExceptedMatSimilar(gold[0], cv::Mat(d_flowx), 3e-3, diff1) == 1
+        &&ExceptedMatSimilar(gold[1], cv::Mat(d_flowy), 3e-3, diff2) == 1)
+        TestSystem::instance().setAccurate(1);
+    else
+        TestSystem::instance().setAccurate(0);
+
+    TestSystem::instance().setDiff(diff1);
+    TestSystem::instance().setDiff(diff2);
+*/
+
+
+    GPU_ON;
+    d_alg(d0, d1, d_flowx, d_flowy);
+    d_alg.collectGarbage();
+    GPU_OFF;
+    
+
+    cv::Mat flowx, flowy;
+
+    GPU_FULL_ON;
+    d0.upload(frame0);
+    d1.upload(frame1);
+    d_alg(d0, d1, d_flowx, d_flowy);
+    d_alg.collectGarbage();
+    d_flowx.download(flowx);
+    d_flowy.download(flowy);
+    GPU_FULL_OFF;
+
+    TestSystem::instance().ExceptedMatSimilar(gold[0], flowx, 3e-3);
+    TestSystem::instance().ExceptedMatSimilar(gold[1], flowy, 3e-3);
+}
--- a/modules/ocl/perf/perf_pyramid.cpp
+++ b/modules/ocl/perf/perf_pyramid.cpp
@@ -48,7 +48,7 @@
 ///////////// pyrDown //////////////////////
 PERFTEST(pyrDown)
 {
-    Mat src, dst;
+    Mat src, dst, ocl_dst;
    int all_type[] = {CV_8UC1, CV_8UC4};
    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};

@@ -73,9 +73,6 @@ PERFTEST(pyrDown)
            ocl::pyrDown(d_src, d_dst);
            WARMUP_OFF;

-            TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), dst.depth() == CV_32F ? 1e-4f : 1.0f));                        
-
-
            GPU_ON;
            ocl::pyrDown(d_src, d_dst);
            GPU_OFF;
@@ -83,8 +80,53 @@ PERFTEST(pyrDown)
            GPU_FULL_ON;
            d_src.upload(src);
            ocl::pyrDown(d_src, d_dst);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
            GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(dst, ocl_dst, dst.depth() == CV_32F ? 1e-4f : 1.0f);
+        }
+    }
+}
+
+///////////// pyrUp ////////////////////////
+PERFTEST(pyrUp)
+{
+    Mat src, dst, ocl_dst;
+    int all_type[] = {CV_8UC1, CV_8UC4};
+    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
+
+    for (int size = 500; size <= 2000; size *= 2)
+    {
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
+        {
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
+
+            gen(src, size, size, all_type[j], 0, 256);
+
+            pyrUp(src, dst);
+
+            CPU_ON;
+            pyrUp(src, dst);
+            CPU_OFF;
+
+            ocl::oclMat d_src(src);
+            ocl::oclMat d_dst;
+
+            WARMUP_ON;
+            ocl::pyrUp(d_src, d_dst);
+            WARMUP_OFF;
+
+            GPU_ON;
+            ocl::pyrUp(d_src, d_dst);
+            GPU_OFF;
+
+            GPU_FULL_ON;
+            d_src.upload(src);
+            ocl::pyrUp(d_src, d_dst);
+            d_dst.download(ocl_dst);
+            GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(dst, ocl_dst, (src.depth() == CV_32F ? 1e-4f : 1.0));
        }
    }
 }
--- a/modules/ocl/perf/perf_pyrup.cpp
+++ b/modules/ocl/perf/perf_pyrup.cpp
@@ -1,89 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Fangfang Bai, fangfang@multicorewareinc.com
-//    Jin Ma,       jin@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other oclMaterials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-#include "precomp.hpp"
-
-///////////// pyrUp ////////////////////////
-PERFTEST(pyrUp)
-{
-    Mat src, dst;
-    int all_type[] = {CV_8UC1, CV_8UC4};
-    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
-
-    for (int size = 500; size <= 2000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
-
-            gen(src, size, size, all_type[j], 0, 256);
-
-            pyrUp(src, dst);
-
-            CPU_ON;
-            pyrUp(src, dst);
-            CPU_OFF;
-
-            ocl::oclMat d_src(src);
-            ocl::oclMat d_dst;
-
-            WARMUP_ON;
-            ocl::pyrUp(d_src, d_dst);
-            WARMUP_OFF;
-
-            TestSystem::instance().setAccurate(ExpectedMatNear(dst, cv::Mat(d_dst), (src.depth() == CV_32F ? 1e-4f : 1.0)));                     
-
-            GPU_ON;
-            ocl::pyrUp(d_src, d_dst);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src.upload(src);
-            ocl::pyrUp(d_src, d_dst);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-        }
-    }
-}
--- a/modules/ocl/perf/perf_split_merge.cpp
+++ b/modules/ocl/perf/perf_split_merge.cpp
@@ -48,7 +48,7 @@
 ///////////// Merge////////////////////////
 PERFTEST(Merge)
 {
-    Mat dst;
+    Mat dst, ocl_dst;
    ocl::oclMat d_dst;

    int channels = 4;
@@ -85,22 +85,20 @@ PERFTEST(Merge)
            ocl::merge(d_src, d_dst);
            WARMUP_OFF;

-            TestSystem::instance().setAccurate(ExpectedMatNear(cv::Mat(dst), cv::Mat(d_dst), 0.0));                     
-
            GPU_ON;
            ocl::merge(d_src, d_dst);
            GPU_OFF;

            GPU_FULL_ON;
-
            for (int i = 0; i < channels; ++i)
            {
-                d_src[i] = ocl::oclMat(size1, CV_8U, cv::Scalar::all(i));
+                d_src[i] = ocl::oclMat(size1, all_type[j], cv::Scalar::all(i));
            }
-
            ocl::merge(d_src, d_dst);
-            d_dst.download(dst);
+            d_dst.download(ocl_dst);
            GPU_FULL_OFF;
+
+            TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 0.0);
        }

    }
@@ -122,7 +120,7 @@ PERFTEST(Split)

            Mat src(size1, CV_MAKE_TYPE(all_type[j], 4), cv::Scalar(1, 2, 3, 4));

-            std::vector<cv::Mat> dst;
+            std::vector<cv::Mat> dst, ocl_dst(4);

            split(src, dst);

@@ -135,22 +133,7 @@ PERFTEST(Split)

            WARMUP_ON;
            ocl::split(d_src, d_dst);
-            WARMUP_OFF;
-
-            if(d_dst.size() == dst.size())
-            {
-                TestSystem::instance().setAccurate(1);
-                for(size_t i = 0; i < dst.size(); i++)
-                {
-                    if(ExpectedMatNear(dst[i], cv::Mat(d_dst[i]), 0.0) == 0)
-                    {
-                        TestSystem::instance().setAccurate(0);
-                        break;
-                    }
-                }
-            }else
-                TestSystem::instance().setAccurate(0);
-                                
+            WARMUP_OFF;         

            GPU_ON;
            ocl::split(d_src, d_dst);
@@ -159,7 +142,12 @@ PERFTEST(Split)
            GPU_FULL_ON;
            d_src.upload(src);
            ocl::split(d_src, d_dst);
+            for(size_t i = 0; i < dst.size(); i++)
+                d_dst[i].download(ocl_dst[i]);
            GPU_FULL_OFF;
+
+            vector<double> eps(4, 0.);
+            TestSystem::instance().ExpectMatsNear(dst, ocl_dst, eps);
        }

    }
--- a/modules/ocl/perf/precomp.cpp
+++ b/modules/ocl/perf/precomp.cpp
@@ -114,7 +114,6 @@ void TestSystem::finishCurrentSubtest()
        return;
    }

-    int is_accurate = is_accurate_;
    double cpu_time = cpu_elapsed_ / getTickFrequency() * 1000.0;
    double gpu_time = gpu_elapsed_ / getTickFrequency() * 1000.0;
    double gpu_full_time = gpu_full_elapsed_ / getTickFrequency() * 1000.0;
@@ -171,8 +170,8 @@ void TestSystem::finishCurrentSubtest()
        deviation = std::sqrt(sum / gpu_times_.size());
    }

-    printMetrics(is_accurate, cpu_time, gpu_time, gpu_full_time, speedup, fullspeedup);
-    writeMetrics(is_accurate, cpu_time, gpu_time, gpu_full_time, speedup, fullspeedup, gpu_min, gpu_max, deviation);
+    printMetrics(is_accurate_, cpu_time, gpu_time, gpu_full_time, speedup, fullspeedup);
+    writeMetrics(cpu_time, gpu_time, gpu_full_time, speedup, fullspeedup, gpu_min, gpu_max, deviation);

    num_subtests_called_++;
    resetCurrentSubtest();
@@ -219,7 +218,7 @@ void TestSystem::writeHeading()
        }
    }

-    fprintf(record_, "NAME,DESCRIPTION,ACCURACY,CPU (ms),GPU (ms),SPEEDUP,GPUTOTAL (ms),TOTALSPEEDUP,GPU Min (ms),GPU Max (ms), Standard deviation (ms)\n");
+    fprintf(record_, "NAME,DESCRIPTION,ACCURACY,DIFFERENCE,CPU (ms),GPU (ms),SPEEDUP,GPUTOTAL (ms),TOTALSPEEDUP,GPU Min (ms),GPU Max (ms), Standard deviation (ms)\n");

    fflush(record_);
 }
@@ -392,7 +391,7 @@ void TestSystem::printMetrics(int is_accurate, double cpu_time, double gpu_time,
 #endif
 }

-void TestSystem::writeMetrics(int is_accurate, double cpu_time, double gpu_time, double gpu_full_time, double speedup, double fullspeedup, double gpu_min, double gpu_max, double std_dev)
+void TestSystem::writeMetrics(double cpu_time, double gpu_time, double gpu_full_time, double speedup, double fullspeedup, double gpu_min, double gpu_max, double std_dev)
 {
    if (!record_)
    {
@@ -402,21 +401,24 @@ void TestSystem::writeMetrics(int is_accurate, double cpu_time, double gpu_time,

    string _is_accurate_;

-    if(is_accurate == 1)
+    if(is_accurate_ == 1)
        _is_accurate_ = "Pass";
-    else if(is_accurate == 0)
+    else if(is_accurate_ == 0)
        _is_accurate_ = "Fail";
-    else if(is_accurate == -1)
+    else if(is_accurate_ == -1)
        _is_accurate_ = " ";
    else
    {
-        std::cout<<"is_accurate errer: "<<is_accurate<<"\n";
+        std::cout<<"is_accurate errer: "<<is_accurate_<<"\n";
        exit(-1);
    }

-    fprintf(record_, "%s,%s,%s,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f\n", itname_changed_ ? itname_.c_str() : "",
+    fprintf(record_, "%s,%s,%s,%.2f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f\n",
+        itname_changed_ ? itname_.c_str() : "",
        cur_subtest_description_.str().c_str(),
-        _is_accurate_.c_str(), cpu_time, gpu_time, speedup, gpu_full_time, fullspeedup,
+        _is_accurate_.c_str(),
+        accurate_diff_,
+        cpu_time, gpu_time, speedup, gpu_full_time, fullspeedup,
        gpu_min, gpu_max, std_dev);

    if (itname_changed_)
@@ -469,134 +471,6 @@ void gen(Mat &mat, int rows, int cols, int type, Scalar low, Scalar high)
    RNG rng(0);
    rng.fill(mat, RNG::UNIFORM, low, high);
 }
-#if 0
-void gen(Mat &mat, int rows, int cols, int type, int low, int high, int n)
-{
-    assert(n > 0&&n <= cols * rows);
-    assert(type == CV_8UC1||type == CV_8UC3||type == CV_8UC4
-        ||type == CV_32FC1||type == CV_32FC3||type == CV_32FC4);
-
-    RNG rng;
-    //generate random position without duplication
-    std::vector<int> pos;
-    for(int i = 0; i < cols * rows; i++)
-    {
-        pos.push_back(i);
-    }
-
-    for(int i = 0; i < cols * rows; i++)
-    {
-        int temp = i + rng.uniform(0, cols * rows - 1 - i);
-        int temp1 = pos[temp];
-        pos[temp]= pos[i];
-        pos[i] = temp1;
-    }
-
-    std::vector<int> selected_pos;
-    for(int i = 0; i < n; i++)
-    {
-        selected_pos.push_back(pos[i]);
-    }
-
-    pos.clear();
-    //end of generating random y without duplication
-
-    if(type == CV_8UC1)
-    {
-        typedef struct coorStruct_
-        {
-            int x;
-            int y;
-            uchar xy;
-        }coorStruct;
-
-        coorStruct coor_struct;
-
-        std::vector<coorStruct> coor;
-
-        for(int i = 0; i < n; i++)
-        {
-            coor_struct.x = -1;
-            coor_struct.y = -1;
-            coor_struct.xy = (uchar)rng.uniform(low, high);
-            coor.push_back(coor_struct);
-        }
-
-        for(int i = 0; i < n; i++)
-        {
-            coor[i].y = selected_pos[i]/cols;
-            coor[i].x = selected_pos[i]%cols;
-        }
-        selected_pos.clear();
-
-        mat.create(rows, cols, type);
-        mat.setTo(0);
-
-        for(int i = 0; i < n; i++)
-        {
-            mat.at<unsigned char>(coor[i].y, coor[i].x) = coor[i].xy;
-        }
-    }
-
-    if(type == CV_8UC4 || type == CV_8UC3)
-    {
-        mat.create(rows, cols, type);
-        mat.setTo(0);
-
-        typedef struct Coor
-        {
-            int x;
-            int y;
-
-            uchar r;
-            uchar g;
-            uchar b;
-            uchar alpha;
-        }coor;
-
-        std::vector<coor> coor_vect;
-
-        coor xy_coor;
-
-        for(int i = 0; i < n; i++)
-        {
-            xy_coor.r = (uchar)rng.uniform(low, high);
-            xy_coor.g = (uchar)rng.uniform(low, high);
-            xy_coor.b = (uchar)rng.uniform(low, high);
-            if(type == CV_8UC4)
-                xy_coor.alpha = (uchar)rng.uniform(low, high);
-
-            coor_vect.push_back(xy_coor);
-        }
-
-        for(int i = 0; i < n; i++)
-        {
-            coor_vect[i].y = selected_pos[i]/((int)mat.step1()/mat.elemSize());
-            coor_vect[i].x = selected_pos[i]%((int)mat.step1()/mat.elemSize());
-            //printf("coor_vect[%d] = (%d, %d)\n", i, coor_vect[i].y, coor_vect[i].x);
-        }
-
-        if(type == CV_8UC4)
-        {
-            for(int i = 0; i < n; i++)
-            {
-                mat.at<unsigned char>(coor_vect[i].y, 4 * coor_vect[i].x) = coor_vect[i].r;
-                mat.at<unsigned char>(coor_vect[i].y, 4 * coor_vect[i].x + 1) = coor_vect[i].g;
-                mat.at<unsigned char>(coor_vect[i].y, 4 * coor_vect[i].x + 2) = coor_vect[i].b;
-                mat.at<unsigned char>(coor_vect[i].y, 4 * coor_vect[i].x + 3) = coor_vect[i].alpha;
-            }
-        }else if(type == CV_8UC3)
-        {
-            for(int i = 0; i < n; i++)
-            {
-                mat.at<unsigned char>(coor_vect[i].y, 3 * coor_vect[i].x) = coor_vect[i].r;
-                mat.at<unsigned char>(coor_vect[i].y, 3 * coor_vect[i].x + 1) = coor_vect[i].g;
-                mat.at<unsigned char>(coor_vect[i].y, 3 * coor_vect[i].x + 2) = coor_vect[i].b;
-            }
-        }
-    }
-}
-#endif

 string abspath(const string &relpath)
 {
@@ -619,31 +493,3 @@ double checkSimilarity(const Mat &m1, const Mat &m2)
    matchTemplate(m1, m2, diff, TM_CCORR_NORMED);
    return std::abs(diff.at<float>(0, 0) - 1.f);
 }
-
-
-int ExpectedMatNear(cv::Mat dst, cv::Mat cpu_dst, double eps)
-{
-    assert(dst.type() == cpu_dst.type());
-    assert(dst.size() == cpu_dst.size());
-    if(checkNorm(cv::Mat(dst), cv::Mat(cpu_dst)) < eps ||checkNorm(cv::Mat(dst), cv::Mat(cpu_dst)) == eps)
-        return 1;
-    return 0;
-}
-
-int ExceptDoubleNear(double val1, double val2, double abs_error)
-{
-    const double diff = fabs(val1 - val2);
-    if (diff <= abs_error)
-        return 1;
-
-    return 0;
-}
-
-int ExceptedMatSimilar(cv::Mat dst, cv::Mat cpu_dst, double eps)
-{
-    assert(dst.type() == cpu_dst.type());
-    assert(dst.size() == cpu_dst.size());
-    if(checkSimilarity(cv::Mat(cpu_dst), cv::Mat(dst)) <= eps)
-        return 1;
-    return 0;
-}
--- a/modules/ocl/perf/precomp.hpp
+++ b/modules/ocl/perf/precomp.hpp
@@ -322,9 +322,46 @@ public:
        itname_changed_ = true;
    }

-    void setAccurate(int is_accurate = -1)
+    void setAccurate(int accurate, double diff)
    {
-        is_accurate_ = is_accurate;
+        is_accurate_ = accurate;
+        accurate_diff_ = diff;
+    }
+
+    void ExpectMatsNear(vector<Mat>& dst, vector<Mat>& cpu_dst, vector<double>& eps)
+    {
+        assert(dst.size() == cpu_dst.size());
+        assert(cpu_dst.size() == eps.size());
+        is_accurate_ = 1;
+        for(size_t i=0; i<dst.size(); i++)
+        {
+            double cur_diff = checkNorm(dst[i], cpu_dst[i]);
+            accurate_diff_ = max(accurate_diff_, cur_diff);
+            if(cur_diff > eps[i])
+                is_accurate_ = 0;
+        }
+    }
+
+    void ExpectedMatNear(cv::Mat& dst, cv::Mat& cpu_dst, double eps)
+    {
+        assert(dst.type() == cpu_dst.type());
+        assert(dst.size() == cpu_dst.size());
+        accurate_diff_ = checkNorm(dst, cpu_dst);
+        if(accurate_diff_ <= eps)
+            is_accurate_ = 1;
+        else
+            is_accurate_ = 0;
+    }
+
+    void ExceptedMatSimilar(cv::Mat& dst, cv::Mat& cpu_dst, double eps)
+    {
+        assert(dst.type() == cpu_dst.type());
+        assert(dst.size() == cpu_dst.size());
+        accurate_diff_ = checkSimilarity(cpu_dst, dst);
+        if(accurate_diff_ <= eps)
+            is_accurate_ = 1;
+        else
+            is_accurate_ = 0;
    }

    std::stringstream &getCurSubtestDescription()
@@ -342,7 +379,7 @@ private:
        num_iters_(10), cpu_num_iters_(2),
        gpu_warmup_iters_(1), cur_iter_idx_(0), cur_warmup_idx_(0),
        record_(0), recordname_("performance"), itname_changed_(true),
-        is_accurate_(-1)
+        is_accurate_(-1), accurate_diff_(0.)
    {
        cpu_times_.reserve(num_iters_);
        gpu_times_.reserve(num_iters_);
@@ -363,6 +400,7 @@ private:
        gpu_times_.clear();
        gpu_full_times_.clear();
        is_accurate_ = -1;
+        accurate_diff_ = 0.;
    }

    double meanTime(const std::vector<int64> &samples);
@@ -373,7 +411,7 @@ private:

    void writeHeading();
    void writeSummary();
-    void writeMetrics(int is_accurate, double cpu_time, double gpu_time = 0.0f, double gpu_full_time = 0.0f,
+    void writeMetrics(double cpu_time, double gpu_time = 0.0f, double gpu_full_time = 0.0f,
                      double speedup = 0.0f, double fullspeedup = 0.0f,
                      double gpu_min = 0.0f, double gpu_max = 0.0f, double std_dev = 0.0f);

@@ -425,6 +463,7 @@ private:
    bool itname_changed_;

    int is_accurate_;
+    double accurate_diff_;
 };


--- a/modules/ocl/src/arithm.cpp
+++ b/modules/ocl/src/arithm.cpp
@@ -412,11 +412,11 @@ static void arithmetic_scalar_run(const oclMat &src, oclMat &dst, String kernelN
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols ));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step1 ));

+    float f_scalar = (float)scalar;
    if(src.clCxt->supportsFeature(Context::CL_DOUBLE))
        args.push_back( std::make_pair( sizeof(cl_double), (void *)&scalar ));
    else
    {
-        float f_scalar = (float)scalar;
        args.push_back( std::make_pair( sizeof(cl_float), (void *)&f_scalar));
    }

@@ -783,45 +783,55 @@ static void arithmetic_minMax_mask_run(const oclMat &src, const oclMat &mask, cl
    }
 }

-template <typename T> void arithmetic_minMax(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask)
+template <typename T> void arithmetic_minMax(const oclMat &src, double *minVal, double *maxVal,
+                                             const oclMat &mask, oclMat &buf)
 {
    size_t groupnum = src.clCxt->computeUnits();
    CV_Assert(groupnum != 0);
    groupnum = groupnum * 2;
    int vlen = 8;
    int dbsize = groupnum * 2 * vlen * sizeof(T) ;
-    Context *clCxt = src.clCxt;
-    cl_mem dstBuffer = openCLCreateBuffer(clCxt, CL_MEM_WRITE_ONLY, dbsize);
-    *minVal = std::numeric_limits<double>::max() , *maxVal = -std::numeric_limits<double>::max();
+
+    ensureSizeIsEnough(1, dbsize, CV_8UC1, buf);
+
+    cl_mem buf_data = reinterpret_cast<cl_mem>(buf.data);
+
    if (mask.empty())
    {
-        arithmetic_minMax_run(src, mask, dstBuffer, vlen, groupnum, "arithm_op_minMax");
+        arithmetic_minMax_run(src, mask, buf_data, vlen, groupnum, "arithm_op_minMax");
    }
    else
    {
-        arithmetic_minMax_mask_run(src, mask, dstBuffer, vlen, groupnum, "arithm_op_minMax_mask");
+        arithmetic_minMax_mask_run(src, mask, buf_data, vlen, groupnum, "arithm_op_minMax_mask");
    }
-    T *p = new T[groupnum * vlen * 2];
-    memset(p, 0, dbsize);
-    openCLReadBuffer(clCxt, dstBuffer, (void *)p, dbsize);
-    if(minVal != NULL){
+
+    Mat matbuf = Mat(buf);
+    T *p = matbuf.ptr<T>();
+    if(minVal != NULL)
+    {
+        *minVal = std::numeric_limits<double>::max();
        for(int i = 0; i < vlen * (int)groupnum; i++)
        {
            *minVal = *minVal < p[i] ? *minVal : p[i];
        }
    }
-    if(maxVal != NULL){
+    if(maxVal != NULL)
+    {
+        *maxVal = -std::numeric_limits<double>::max();
        for(int i = vlen * (int)groupnum; i < 2 * vlen * (int)groupnum; i++)
        {
            *maxVal = *maxVal > p[i] ? *maxVal : p[i];
        }
    }
-    delete[] p;
-    openCLFree(dstBuffer);
 }

-typedef void (*minMaxFunc)(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask);
+typedef void (*minMaxFunc)(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask, oclMat &buf);
 void cv::ocl::minMax(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask)
+{
+    oclMat buf;
+    minMax_buf(src, minVal, maxVal, mask, buf);
+}
+void cv::ocl::minMax_buf(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask, oclMat &buf)
 {
    CV_Assert(src.oclchannels() == 1);
    if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
@@ -841,7 +851,7 @@ void cv::ocl::minMax(const oclMat &src, double *minVal, double *maxVal, const oc
    };
    minMaxFunc func;
    func = functab[src.depth()];
-    func(src, minVal, maxVal, mask);
+    func(src, minVal, maxVal, mask, buf);
 }

 //////////////////////////////////////////////////////////////////////////////
@@ -1688,10 +1698,11 @@ void bitwise_run(const oclMat &src1, const oclMat &src2, oclMat &dst, String ker
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols ));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step1 ));

+    T scalar;
    if(_scalar != NULL)
    {
        double scalar1 = *((double *)_scalar);
-        T scalar = (T)scalar1;
+        scalar = (T)scalar1;
        args.push_back( std::make_pair( sizeof(T), (void *)&scalar ));
    }

@@ -2308,9 +2319,9 @@ static void arithmetic_pow_run(const oclMat &src1, double p, oclMat &dst, String
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.rows ));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols ));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step1 ));
+    float pf = p;
    if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE))
    {
-        float pf = p;
        args.push_back( std::make_pair( sizeof(cl_float), (void *)&pf ));
    }
    else
--- a/modules/ocl/src/brute_force_matcher.cpp
+++ b/modules/ocl/src/brute_force_matcher.cpp
@@ -244,11 +244,12 @@ static void matchDispatcher(const oclMat &query, const oclMat &train, const oclM
 {
    const oclMat zeroMask;
    const oclMat &tempMask = mask.data ? mask : zeroMask;
+    bool is_cpu = queryDeviceInfo<IS_CPU_DEVICE, bool>();
    if (query.cols <= 64)
    {
        matchUnrolledCached<16, 64>(query, train, tempMask, trainIdx, distance, distType);
    }
-    else if (query.cols <= 128)
+    else if (query.cols <= 128 && !is_cpu)
    {
        matchUnrolledCached<16, 128>(query, train, tempMask, trainIdx,  distance, distType);
    }
@@ -263,11 +264,12 @@ static void matchDispatcher(const oclMat &query, const oclMat *trains, int n, co
 {
    const oclMat zeroMask;
    const oclMat &tempMask = mask.data ? mask : zeroMask;
+    bool is_cpu = queryDeviceInfo<IS_CPU_DEVICE, bool>();
    if (query.cols <= 64)
    {
        matchUnrolledCached<16, 64>(query, trains, n, tempMask, trainIdx, imgIdx, distance, distType);
    }
-    else if (query.cols <= 128)
+    else if (query.cols <= 128 && !is_cpu)
    {
        matchUnrolledCached<16, 128>(query, trains, n, tempMask, trainIdx, imgIdx, distance, distType);
    }
@@ -283,11 +285,12 @@ static void matchDispatcher(const oclMat &query, const oclMat &train, float maxD
 {
    const oclMat zeroMask;
    const oclMat &tempMask = mask.data ? mask : zeroMask;
+    bool is_cpu = queryDeviceInfo<IS_CPU_DEVICE, bool>();
    if (query.cols <= 64)
    {
        matchUnrolledCached<16, 64>(query, train, maxDistance, tempMask, trainIdx, distance, nMatches, distType);
    }
-    else if (query.cols <= 128)
+    else if (query.cols <= 128 && !is_cpu)
    {
        matchUnrolledCached<16, 128>(query, train, maxDistance, tempMask, trainIdx, distance, nMatches, distType);
    }
@@ -465,11 +468,12 @@ static void calcDistanceDispatcher(const oclMat &query, const oclMat &train, con
 static void match2Dispatcher(const oclMat &query, const oclMat &train, const oclMat &mask,
                      const oclMat &trainIdx, const oclMat &distance, int distType)
 {
+    bool is_cpu = queryDeviceInfo<IS_CPU_DEVICE, bool>();
    if (query.cols <= 64)
    {
        knn_matchUnrolledCached<16, 64>(query, train, mask, trainIdx, distance, distType);
    }
-    else if (query.cols <= 128)
+    else if (query.cols <= 128 && !is_cpu)
    {
        knn_matchUnrolledCached<16, 128>(query, train, mask, trainIdx, distance, distType);
    }
--- a/modules/ocl/src/canny.cpp
+++ b/modules/ocl/src/canny.cpp
@@ -239,7 +239,7 @@ void canny::calcSobelRowPass_gpu(const oclMat &src, oclMat &dx_buf, oclMat &dy_b

    size_t globalThreads[3] = {cols, rows, 1};
    size_t localThreads[3]  = {16, 16, 1};
-    openCLExecuteKernel2(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
+    openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
 }

 void canny::calcMagnitude_gpu(const oclMat &dx_buf, const oclMat &dy_buf, oclMat &dx, oclMat &dy, oclMat &mag, int rows, int cols, bool L2Grad)
@@ -269,12 +269,8 @@ void canny::calcMagnitude_gpu(const oclMat &dx_buf, const oclMat &dy_buf, oclMat
    size_t globalThreads[3] = {cols, rows, 1};
    size_t localThreads[3]  = {16, 16, 1};

-    char build_options [15] = "";
-    if(L2Grad)
-    {
-        strcat(build_options, "-D L2GRAD");
-    }
-    openCLExecuteKernel2(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1, build_options);
+    const char * build_options = L2Grad ? "-D L2GRAD":"";
+    openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1, build_options);
 }
 void canny::calcMagnitude_gpu(const oclMat &dx, const oclMat &dy, oclMat &mag, int rows, int cols, bool L2Grad)
 {
@@ -297,12 +293,8 @@ void canny::calcMagnitude_gpu(const oclMat &dx, const oclMat &dy, oclMat &mag, i
    size_t globalThreads[3] = {cols, rows, 1};
    size_t localThreads[3]  = {16, 16, 1};

-    char build_options [15] = "";
-    if(L2Grad)
-    {
-        strcat(build_options, "-D L2GRAD");
-    }
-    openCLExecuteKernel2(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1, build_options);
+    const char * build_options = L2Grad ? "-D L2GRAD":"";
+    openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1, build_options);
 }

 void canny::calcMap_gpu(oclMat &dx, oclMat &dy, oclMat &mag, oclMat &map, int rows, int cols, float low_thresh, float high_thresh)
@@ -333,7 +325,7 @@ void canny::calcMap_gpu(oclMat &dx, oclMat &dy, oclMat &mag, oclMat &map, int ro
    String kernelName = "calcMap";
    size_t localThreads[3]  = {16, 16, 1};

-    openCLExecuteKernel2(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
+    openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
 }

 void canny::edgesHysteresisLocal_gpu(oclMat &map, oclMat &st1, void *counter, int rows, int cols)
@@ -353,7 +345,7 @@ void canny::edgesHysteresisLocal_gpu(oclMat &map, oclMat &st1, void *counter, in
    size_t globalThreads[3] = {cols, rows, 1};
    size_t localThreads[3]  = {16, 16, 1};

-    openCLExecuteKernel2(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
+    openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
 }

 void canny::edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, void *counter, int rows, int cols)
@@ -383,7 +375,7 @@ void canny::edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, voi
        args.push_back( std::make_pair( sizeof(cl_int), (void *)&map.step));
        args.push_back( std::make_pair( sizeof(cl_int), (void *)&map.offset));

-        openCLExecuteKernel2(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1, DISABLE);
+        openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
        openCLSafeCall(clEnqueueReadBuffer(*(cl_command_queue*)getoclCommandQueue(), (cl_mem)counter, 1, 0, sizeof(int), &count, 0, NULL, NULL));
        std::swap(st1, st2);
    }
@@ -408,5 +400,5 @@ void canny::getEdges_gpu(oclMat &map, oclMat &dst, int rows, int cols)
    size_t globalThreads[3] = {cols, rows, 1};
    size_t localThreads[3]  = {16, 16, 1};

-    openCLExecuteKernel2(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
+    openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
 }
--- a/modules/ocl/src/filtering.cpp
+++ b/modules/ocl/src/filtering.cpp
@@ -356,8 +356,7 @@ static void GPUDilate(const oclMat &src, oclMat &dst, oclMat &mat_kernel,
    char compile_option[128];
    sprintf(compile_option, "-D RADIUSX=%d -D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D DILATE %s %s",
        anchor.x, anchor.y, (int)localThreads[0], (int)localThreads[1],
-        rectKernel?"-D RECTKERNEL":"",
-        s);
+        s, rectKernel?"-D RECTKERNEL":"");
    std::vector< std::pair<size_t, const void *> > args;
    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&src.data));
    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&dst.data));
--- a/modules/ocl/src/gfft.cpp
+++ b/modules/ocl/src/gfft.cpp
@@ -0,0 +1,349 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Peng Xiao, pengxiao@outlook.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#include <iomanip>
+#include "precomp.hpp"
+
+using namespace cv;
+using namespace cv::ocl;
+
+static bool use_cpu_sorter = true;
+
+namespace cv
+{
+    namespace ocl
+    {
+        ///////////////////////////OpenCL kernel strings///////////////////////////
+        extern const char *imgproc_gfft;
+    }
+}
+
+namespace
+{
+enum SortMethod
+{
+    CPU_STL,
+    BITONIC,
+    SELECTION
+};
+
+const int GROUP_SIZE = 256;
+
+template<SortMethod method>
+struct Sorter
+{
+    //typedef EigType;
+};
+
+//TODO(pengx): optimize GPU sorter's performance thus CPU sorter is removed.
+template<>
+struct Sorter<CPU_STL>
+{
+    typedef oclMat EigType;
+    static cv::Mutex cs;
+    static Mat mat_eig;
+
+    //prototype
+    static int clfloat2Gt(cl_float2 pt1, cl_float2 pt2)
+    {
+        float v1 = mat_eig.at<float>(cvRound(pt1.s[1]), cvRound(pt1.s[0]));
+        float v2 = mat_eig.at<float>(cvRound(pt2.s[1]), cvRound(pt2.s[0]));
+        return v1 > v2;
+    }
+    static void sortCorners_caller(const EigType& eig_tex, oclMat& corners, const int count)
+    {
+        cv::AutoLock lock(cs);
+        //temporarily use STL's sort function
+        Mat mat_corners = corners;
+        mat_eig = eig_tex;
+        std::sort(mat_corners.begin<cl_float2>(), mat_corners.begin<cl_float2>() + count, clfloat2Gt);
+        corners = mat_corners;
+    }
+};
+cv::Mutex Sorter<CPU_STL>::cs;
+cv::Mat   Sorter<CPU_STL>::mat_eig;
+
+template<>
+struct Sorter<BITONIC>
+{
+    typedef TextureCL EigType;
+
+    static void sortCorners_caller(const EigType& eig_tex, oclMat& corners, const int count)
+    {
+        Context * cxt = Context::getContext();
+        size_t globalThreads[3] = {count / 2, 1, 1};
+        size_t localThreads[3]  = {GROUP_SIZE, 1, 1};
+
+        // 2^numStages should be equal to count or the output is invalid
+        int numStages = 0;
+        for(int i = count; i > 1; i >>= 1)
+        {
+            ++numStages;
+        }
+        const int argc = 5;
+        std::vector< std::pair<size_t, const void *> > args(argc);
+        String kernelname = "sortCorners_bitonicSort";
+        args[0] = std::make_pair(sizeof(cl_mem), (void *)&eig_tex);
+        args[1] = std::make_pair(sizeof(cl_mem), (void *)&corners.data);
+        args[2] = std::make_pair(sizeof(cl_int), (void *)&count);
+        for(int stage = 0; stage < numStages; ++stage)
+        {
+            args[3] = std::make_pair(sizeof(cl_int), (void *)&stage);
+            for(int passOfStage = 0; passOfStage < stage + 1; ++passOfStage)
+            {
+                args[4] = std::make_pair(sizeof(cl_int), (void *)&passOfStage);
+                openCLExecuteKernel(cxt, &imgproc_gfft, kernelname, globalThreads, localThreads, args, -1, -1);
+            }
+        }
+    }
+};
+
+template<>
+struct Sorter<SELECTION>
+{
+    typedef TextureCL EigType;
+
+    static void sortCorners_caller(const EigType& eig_tex, oclMat& corners, const int count)
+    {
+        Context * cxt = Context::getContext();
+
+        size_t globalThreads[3] = {count, 1, 1};
+        size_t localThreads[3]  = {GROUP_SIZE, 1, 1};
+
+        std::vector< std::pair<size_t, const void *> > args;
+        //local
+        String kernelname = "sortCorners_selectionSortLocal";
+        int lds_size = GROUP_SIZE * sizeof(cl_float2);
+        args.push_back( std::make_pair( sizeof(cl_mem), (void*)&eig_tex) );
+        args.push_back( std::make_pair( sizeof(cl_mem), (void*)&corners.data) );
+        args.push_back( std::make_pair( sizeof(cl_int), (void*)&count) );
+        args.push_back( std::make_pair( lds_size,       (void*)NULL) );
+
+        openCLExecuteKernel(cxt, &imgproc_gfft, kernelname, globalThreads, localThreads, args, -1, -1);
+
+        //final
+        kernelname = "sortCorners_selectionSortFinal";
+        args.pop_back();
+        openCLExecuteKernel(cxt, &imgproc_gfft, kernelname, globalThreads, localThreads, args, -1, -1);
+    }
+};
+
+int findCorners_caller(
+    const TextureCL& eig,
+    const float threshold,
+    const oclMat& mask,
+    oclMat& corners,
+    const int max_count)
+{
+    std::vector<int> k;
+    Context * cxt = Context::getContext();
+
+    std::vector< std::pair<size_t, const void*> > args;
+    String kernelname = "findCorners";
+
+    const int mask_strip = mask.step / mask.elemSize1();
+
+    oclMat g_counter(1, 1, CV_32SC1);
+    g_counter.setTo(0);
+
+    args.push_back(std::make_pair( sizeof(cl_mem),   (void*)&eig  ));
+    args.push_back(std::make_pair( sizeof(cl_mem),   (void*)&mask.data ));
+    args.push_back(std::make_pair( sizeof(cl_mem),   (void*)&corners.data ));
+    args.push_back(std::make_pair( sizeof(cl_int),   (void*)&mask_strip));
+    args.push_back(std::make_pair( sizeof(cl_float), (void*)&threshold ));
+    args.push_back(std::make_pair( sizeof(cl_int), (void*)&eig.rows ));
+    args.push_back(std::make_pair( sizeof(cl_int), (void*)&eig.cols ));
+    args.push_back(std::make_pair( sizeof(cl_int), (void*)&max_count ));
+    args.push_back(std::make_pair( sizeof(cl_mem), (void*)&g_counter.data ));
+
+    size_t globalThreads[3] = {eig.cols, eig.rows, 1};
+    size_t localThreads[3]  = {16, 16, 1};
+
+    const char * opt = mask.empty() ? "" : "-D WITH_MASK";
+    openCLExecuteKernel(cxt, &imgproc_gfft, kernelname, globalThreads, localThreads, args, -1, -1, opt);
+    return std::min(Mat(g_counter).at<int>(0), max_count);
+}
+}//unnamed namespace
+
+void cv::ocl::GoodFeaturesToTrackDetector_OCL::operator ()(const oclMat& image, oclMat& corners, const oclMat& mask)
+{
+    CV_Assert(qualityLevel > 0 && minDistance >= 0 && maxCorners >= 0);
+    CV_Assert(mask.empty() || (mask.type() == CV_8UC1 && mask.size() == image.size()));
+
+    CV_DbgAssert(support_image2d());
+
+    ensureSizeIsEnough(image.size(), CV_32F, eig_);
+
+    if (useHarrisDetector)
+        cornerMinEigenVal_dxdy(image, eig_, Dx_, Dy_, blockSize, 3, harrisK);
+    else
+        cornerMinEigenVal_dxdy(image, eig_, Dx_, Dy_, blockSize, 3);
+
+    double maxVal = 0;
+    minMax_buf(eig_, 0, &maxVal, oclMat(), minMaxbuf_);
+
+    ensureSizeIsEnough(1, std::max(1000, static_cast<int>(image.size().area() * 0.05)), CV_32FC2, tmpCorners_);
+
+    Ptr<TextureCL> eig_tex = bindTexturePtr(eig_);
+    int total = findCorners_caller(
+        *eig_tex,
+        static_cast<float>(maxVal * qualityLevel),
+        mask,
+        tmpCorners_,
+        tmpCorners_.cols);
+
+    if (total == 0)
+    {
+        corners.release();
+        return;
+    }
+    if(use_cpu_sorter)
+    {
+        Sorter<CPU_STL>::sortCorners_caller(eig_, tmpCorners_, total);
+    }
+    else
+    {
+        //if total is power of 2
+        if(((total - 1) & (total)) == 0)
+        {
+            Sorter<BITONIC>::sortCorners_caller(*eig_tex, tmpCorners_, total);
+        }
+        else
+        {
+            Sorter<SELECTION>::sortCorners_caller(*eig_tex, tmpCorners_, total);
+        }
+    }
+
+    if (minDistance < 1)
+    {
+        corners = tmpCorners_(Rect(0, 0, maxCorners > 0 ? std::min(maxCorners, total) : total, 1));
+    }
+    else
+    {
+        std::vector<Point2f> tmp(total);
+        downloadPoints(tmpCorners_, tmp);
+
+        std::vector<Point2f> tmp2;
+        tmp2.reserve(total);
+
+        const int cell_size = cvRound(minDistance);
+        const int grid_width = (image.cols + cell_size - 1) / cell_size;
+        const int grid_height = (image.rows + cell_size - 1) / cell_size;
+
+        std::vector< std::vector<Point2f> > grid(grid_width * grid_height);
+
+        for (int i = 0; i < total; ++i)
+        {
+            Point2f p = tmp[i];
+
+            bool good = true;
+
+            int x_cell = static_cast<int>(p.x / cell_size);
+            int y_cell = static_cast<int>(p.y / cell_size);
+
+            int x1 = x_cell - 1;
+            int y1 = y_cell - 1;
+            int x2 = x_cell + 1;
+            int y2 = y_cell + 1;
+
+            // boundary check
+            x1 = std::max(0, x1);
+            y1 = std::max(0, y1);
+            x2 = std::min(grid_width - 1, x2);
+            y2 = std::min(grid_height - 1, y2);
+
+            for (int yy = y1; yy <= y2; yy++)
+            {
+                for (int xx = x1; xx <= x2; xx++)
+                {
+                    std::vector<Point2f>& m = grid[yy * grid_width + xx];
+
+                    if (!m.empty())
+                    {
+                        for(size_t j = 0; j < m.size(); j++)
+                        {
+                            float dx = p.x - m[j].x;
+                            float dy = p.y - m[j].y;
+
+                            if (dx * dx + dy * dy < minDistance * minDistance)
+                            {
+                                good = false;
+                                goto break_out;
+                            }
+                        }
+                    }
+                }
+            }
+
+            break_out:
+
+            if(good)
+            {
+                grid[y_cell * grid_width + x_cell].push_back(p);
+
+                tmp2.push_back(p);
+
+                if (maxCorners > 0 && tmp2.size() == static_cast<size_t>(maxCorners))
+                    break;
+            }
+        }
+
+        corners.upload(Mat(1, static_cast<int>(tmp2.size()), CV_32FC2, &tmp2[0]));
+    }
+}
+void cv::ocl::GoodFeaturesToTrackDetector_OCL::downloadPoints(const oclMat &points, std::vector<Point2f> &points_v)
+{
+    CV_DbgAssert(points.type() == CV_32FC2);
+    points_v.resize(points.cols);
+    openCLSafeCall(clEnqueueReadBuffer(
+        *reinterpret_cast<cl_command_queue*>(getoclCommandQueue()),
+        reinterpret_cast<cl_mem>(points.data),
+        CL_TRUE,
+        0,
+        points.cols * sizeof(Point2f),
+        &points_v[0],
+        0,
+        NULL,
+        NULL));
+}
--- a/modules/ocl/src/haar.cpp
+++ b/modules/ocl/src/haar.cpp
@@ -136,47 +136,22 @@ struct CvHidHaarClassifierCascade
 };
 typedef struct
 {
-    //int rows;
-    //int ystep;
    int width_height;
-    //int height;
    int grpnumperline_totalgrp;
-    //int totalgrp;
    int imgoff;
    float factor;
 } detect_piramid_info;
-
-#if defined WIN32 && !defined __MINGW__ && !defined __MINGW32__
+#ifdef WIN32
 #define _ALIGNED_ON(_ALIGNMENT) __declspec(align(_ALIGNMENT))
-typedef _ALIGNED_ON(128) struct  GpuHidHaarFeature
-{
-    _ALIGNED_ON(32) struct
-    {
-        _ALIGNED_ON(4)  int    p0 ;
-        _ALIGNED_ON(4)  int    p1 ;
-        _ALIGNED_ON(4)  int    p2 ;
-        _ALIGNED_ON(4)  int    p3 ;
-        _ALIGNED_ON(4)  float weight  ;
-    }
-    /*_ALIGNED_ON(32)*/ rect[CV_HAAR_FEATURE_MAX] ;
-}
-GpuHidHaarFeature;
-

 typedef _ALIGNED_ON(128) struct  GpuHidHaarTreeNode
 {
    _ALIGNED_ON(64) int p[CV_HAAR_FEATURE_MAX][4];
-    //_ALIGNED_ON(16) int p1[CV_HAAR_FEATURE_MAX] ;
-    //_ALIGNED_ON(16) int p2[CV_HAAR_FEATURE_MAX] ;
-    //_ALIGNED_ON(16) int p3[CV_HAAR_FEATURE_MAX] ;
-    /*_ALIGNED_ON(16)*/
    float weight[CV_HAAR_FEATURE_MAX] ;
-    /*_ALIGNED_ON(4)*/
    float threshold ;
-    _ALIGNED_ON(8) float alpha[2] ;
+    _ALIGNED_ON(16) float alpha[3] ;
    _ALIGNED_ON(4) int left ;
    _ALIGNED_ON(4) int right ;
-    // GpuHidHaarFeature feature __attribute__((aligned (128)));
 }
 GpuHidHaarTreeNode;

@@ -184,7 +159,6 @@ GpuHidHaarTreeNode;
 typedef  _ALIGNED_ON(32) struct  GpuHidHaarClassifier
 {
    _ALIGNED_ON(4) int count;
-    //CvHaarFeature* orig_feature;
    _ALIGNED_ON(8) GpuHidHaarTreeNode *node ;
    _ALIGNED_ON(8) float *alpha ;
 }
@@ -219,32 +193,16 @@ typedef _ALIGNED_ON(64) struct  GpuHidHaarClassifierCascade
    _ALIGNED_ON(4) int p2 ;
    _ALIGNED_ON(4) int p3 ;
    _ALIGNED_ON(4) float inv_window_area ;
-    // GpuHidHaarStageClassifier* stage_classifier __attribute__((aligned (8)));
 } GpuHidHaarClassifierCascade;
 #else
 #define _ALIGNED_ON(_ALIGNMENT) __attribute__((aligned(_ALIGNMENT) ))

-typedef struct _ALIGNED_ON(128) GpuHidHaarFeature
-{
-    struct _ALIGNED_ON(32)
-{
-    int    p0 _ALIGNED_ON(4);
-    int    p1 _ALIGNED_ON(4);
-    int    p2 _ALIGNED_ON(4);
-    int    p3 _ALIGNED_ON(4);
-    float weight  _ALIGNED_ON(4);
-}
-rect[CV_HAAR_FEATURE_MAX] _ALIGNED_ON(32);
-}
-GpuHidHaarFeature;
-
-
 typedef struct _ALIGNED_ON(128) GpuHidHaarTreeNode
 {
    int p[CV_HAAR_FEATURE_MAX][4] _ALIGNED_ON(64);
    float weight[CV_HAAR_FEATURE_MAX];// _ALIGNED_ON(16);
    float threshold;// _ALIGNED_ON(4);
-    float alpha[2] _ALIGNED_ON(8);
+    float alpha[3] _ALIGNED_ON(16);
    int left _ALIGNED_ON(4);
    int right _ALIGNED_ON(4);
 }
@@ -287,7 +245,6 @@ typedef struct _ALIGNED_ON(64) GpuHidHaarClassifierCascade
    int p2 _ALIGNED_ON(4);
    int p3 _ALIGNED_ON(4);
    float inv_window_area _ALIGNED_ON(4);
-    // GpuHidHaarStageClassifier* stage_classifier __attribute__((aligned (8)));
 } GpuHidHaarClassifierCascade;
 #endif

@@ -295,36 +252,6 @@ const int icv_object_win_border = 1;
 const float icv_stage_threshold_bias = 0.0001f;
 double globaltime = 0;

-
-// static CvHaarClassifierCascade * gpuCreateHaarClassifierCascade( int stage_count )
-// {
-//     CvHaarClassifierCascade *cascade = 0;
-
-//     int block_size = sizeof(*cascade) + stage_count * sizeof(*cascade->stage_classifier);
-
-//     if( stage_count <= 0 )
-//         CV_Error( CV_StsOutOfRange, "Number of stages should be positive" );
-
-//     cascade = (CvHaarClassifierCascade *)cvAlloc( block_size );
-//     memset( cascade, 0, block_size );
-
-//     cascade->stage_classifier = (CvHaarStageClassifier *)(cascade + 1);
-//     cascade->flags = CV_HAAR_MAGIC_VAL;
-//     cascade->count = stage_count;
-
-//     return cascade;
-// }
-
-//static int globalcounter = 0;
-
-// static void gpuReleaseHidHaarClassifierCascade( GpuHidHaarClassifierCascade **_cascade )
-// {
-//     if( _cascade && *_cascade )
-//     {
-//         cvFree( _cascade );
-//     }
-// }
-
 /* create more efficient internal representation of haar classifier cascade */
 static GpuHidHaarClassifierCascade * gpuCreateHidHaarClassifierCascade( CvHaarClassifierCascade *cascade, int *size, int *totalclassifier)
 {
@@ -440,24 +367,12 @@ static GpuHidHaarClassifierCascade * gpuCreateHidHaarClassifierCascade( CvHaarCl
        hid_stage_classifier->two_rects = 1;
        haar_classifier_ptr += stage_classifier->count;

-        /*
-        hid_stage_classifier->parent = (stage_classifier->parent == -1)
-        ? NULL : stage_classifier_ptr + stage_classifier->parent;
-        hid_stage_classifier->next = (stage_classifier->next == -1)
-        ? NULL : stage_classifier_ptr + stage_classifier->next;
-        hid_stage_classifier->child = (stage_classifier->child == -1)
-        ? NULL : stage_classifier_ptr + stage_classifier->child;
-
-        out->is_tree |= hid_stage_classifier->next != NULL;
-        */
-
        for( j = 0; j < stage_classifier->count; j++ )
        {
            CvHaarClassifier *classifier         = stage_classifier->classifier + j;
            GpuHidHaarClassifier *hid_classifier = hid_stage_classifier->classifier + j;
            int node_count = classifier->count;

-            //   float* alpha_ptr = (float*)(haar_node_ptr + node_count);
            float *alpha_ptr = &haar_node_ptr->alpha[0];

            hid_classifier->count = node_count;
@@ -484,16 +399,12 @@ static GpuHidHaarClassifierCascade * gpuCreateHidHaarClassifierCascade( CvHaarCl
                    node->p[2][3] = 0;
                    node->weight[2] = 0;
                }
-                //   memset( &(node->feature.rect[2]), 0, sizeof(node->feature.rect[2]) );
                else
                    hid_stage_classifier->two_rects = 0;
+
+                memcpy( node->alpha, classifier->alpha, (node_count + 1)*sizeof(alpha_ptr[0]));
+                haar_node_ptr = haar_node_ptr + 1;
            }
-
-            memcpy( alpha_ptr, classifier->alpha, (node_count + 1)*sizeof(alpha_ptr[0]));
-            haar_node_ptr = haar_node_ptr + 1;
-            // (GpuHidHaarTreeNode*)cvAlignPtr(alpha_ptr+node_count+1, sizeof(void*));
-            //   (GpuHidHaarTreeNode*)(alpha_ptr+node_count+1);
-
            out->is_stump_based &= node_count == 1;
        }
    }
@@ -506,25 +417,19 @@ static GpuHidHaarClassifierCascade * gpuCreateHidHaarClassifierCascade( CvHaarCl


 #define sum_elem_ptr(sum,row,col)  \
-	((sumtype*)CV_MAT_ELEM_PTR_FAST((sum),(row),(col),sizeof(sumtype)))
+    ((sumtype*)CV_MAT_ELEM_PTR_FAST((sum),(row),(col),sizeof(sumtype)))

 #define sqsum_elem_ptr(sqsum,row,col)  \
-	((sqsumtype*)CV_MAT_ELEM_PTR_FAST((sqsum),(row),(col),sizeof(sqsumtype)))
+    ((sqsumtype*)CV_MAT_ELEM_PTR_FAST((sqsum),(row),(col),sizeof(sqsumtype)))

 #define calc_sum(rect,offset) \
-	((rect).p0[offset] - (rect).p1[offset] - (rect).p2[offset] + (rect).p3[offset])
+    ((rect).p0[offset] - (rect).p1[offset] - (rect).p2[offset] + (rect).p3[offset])


 static void gpuSetImagesForHaarClassifierCascade( CvHaarClassifierCascade *_cascade,
-                                      /*   const CvArr* _sum,
-                                      const CvArr* _sqsum,
-                                      const CvArr* _tilted_sum,*/
                                      double scale,
                                      int step)
 {
-    //   CvMat sum_stub, *sum = (CvMat*)_sum;
-    //   CvMat sqsum_stub, *sqsum = (CvMat*)_sqsum;
-    //   CvMat tilted_stub, *tilted = (CvMat*)_tilted_sum;
    GpuHidHaarClassifierCascade *cascade;
    int coi0 = 0, coi1 = 0;
    int i;
@@ -540,61 +445,25 @@ static void gpuSetImagesForHaarClassifierCascade( CvHaarClassifierCascade *_casc
    if( scale <= 0 )
        CV_Error( CV_StsOutOfRange, "Scale must be positive" );

-    //   sum = cvGetMat( sum, &sum_stub, &coi0 );
-    //   sqsum = cvGetMat( sqsum, &sqsum_stub, &coi1 );
-
    if( coi0 || coi1 )
        CV_Error( CV_BadCOI, "COI is not supported" );

-    //   if( !CV_ARE_SIZES_EQ( sum, sqsum ))
-    //       CV_Error( CV_StsUnmatchedSizes, "All integral images must have the same size" );
-
-    //   if( CV_MAT_TYPE(sqsum->type) != CV_64FC1 ||
-    //       CV_MAT_TYPE(sum->type) != CV_32SC1 )
-    //       CV_Error( CV_StsUnsupportedFormat,
-    //       "Only (32s, 64f, 32s) combination of (sum,sqsum,tilted_sum) formats is allowed" );
-
    if( !_cascade->hid_cascade )
        gpuCreateHidHaarClassifierCascade(_cascade, &datasize, &total);

    cascade = (GpuHidHaarClassifierCascade *) _cascade->hid_cascade;
    stage_classifier = (GpuHidHaarStageClassifier *) (cascade + 1);

-    if( cascade->has_tilted_features )
-    {
-        //    tilted = cvGetMat( tilted, &tilted_stub, &coi1 );
-
-        //    if( CV_MAT_TYPE(tilted->type) != CV_32SC1 )
-        //        CV_Error( CV_StsUnsupportedFormat,
-        //        "Only (32s, 64f, 32s) combination of (sum,sqsum,tilted_sum) formats is allowed" );
-
-        //    if( sum->step != tilted->step )
-        //        CV_Error( CV_StsUnmatchedSizes,
-        //        "Sum and tilted_sum must have the same stride (step, widthStep)" );
-
-        //    if( !CV_ARE_SIZES_EQ( sum, tilted ))
-        //        CV_Error( CV_StsUnmatchedSizes, "All integral images must have the same size" );
-        //  cascade->tilted = *tilted;
-    }
-
    _cascade->scale = scale;
    _cascade->real_window_size.width = cvRound( _cascade->orig_window_size.width * scale );
    _cascade->real_window_size.height = cvRound( _cascade->orig_window_size.height * scale );

-    //cascade->sum = *sum;
-    //cascade->sqsum = *sqsum;
-
    equRect.x = equRect.y = cvRound(scale);
    equRect.width = cvRound((_cascade->orig_window_size.width - 2) * scale);
    equRect.height = cvRound((_cascade->orig_window_size.height - 2) * scale);
    weight_scale = 1. / (equRect.width * equRect.height);
    cascade->inv_window_area = weight_scale;

-    //	cascade->pq0 = equRect.y * step + equRect.x;
-    //	cascade->pq1 = equRect.y * step + equRect.x + equRect.width ;
-    //	cascade->pq2 = (equRect.y + equRect.height)*step + equRect.x;
-    //	cascade->pq3 = (equRect.y + equRect.height)*step + equRect.x + equRect.width ;
-
    cascade->pq0 = equRect.x;
    cascade->pq1 = equRect.y;
    cascade->pq2 = equRect.x + equRect.width;
@@ -617,10 +486,6 @@ static void gpuSetImagesForHaarClassifierCascade( CvHaarClassifierCascade *_casc
            {
                CvHaarFeature *feature =
                    &_cascade->stage_classifier[i].classifier[j].haar_feature[l];
-                /*  GpuHidHaarClassifier* classifier =
-                cascade->stage_classifier[i].classifier + j; */
-                //GpuHidHaarFeature* hidfeature =
-                //    &cascade->stage_classifier[i].classifier[j].node[l].feature;
                GpuHidHaarTreeNode *hidnode = &stage_classifier[i].classifier[j].node[l];
                double sum0 = 0, area0 = 0;
                CvRect r[3];
@@ -635,8 +500,6 @@ static void gpuSetImagesForHaarClassifierCascade( CvHaarClassifierCascade *_casc
                /* align blocks */
                for( k = 0; k < CV_HAAR_FEATURE_MAX; k++ )
                {
-                    //if( !hidfeature->rect[k].p0 )
-                    //    break;
                    if(!hidnode->p[k][0])
                        break;
                    r[k] = feature->rect[k].r;
@@ -716,15 +579,6 @@ static void gpuSetImagesForHaarClassifierCascade( CvHaarClassifierCascade *_casc

                    if( !feature->tilted )
                    {
-                        /*     hidfeature->rect[k].p0 = tr.y * sum->cols + tr.x;
-                        hidfeature->rect[k].p1 = tr.y * sum->cols + tr.x + tr.width;
-                        hidfeature->rect[k].p2 = (tr.y + tr.height) * sum->cols + tr.x;
-                        hidfeature->rect[k].p3 = (tr.y + tr.height) * sum->cols + tr.x + tr.width;
-                        */
-                        /*hidnode->p0[k] = tr.y * step + tr.x;
-                        hidnode->p1[k] = tr.y * step + tr.x + tr.width;
-                        hidnode->p2[k] = (tr.y + tr.height) * step + tr.x;
-                        hidnode->p3[k] = (tr.y + tr.height) * step + tr.x + tr.width;*/
                        hidnode->p[k][0] = tr.x;
                        hidnode->p[k][1] = tr.y;
                        hidnode->p[k][2] = tr.x + tr.width;
@@ -732,37 +586,24 @@ static void gpuSetImagesForHaarClassifierCascade( CvHaarClassifierCascade *_casc
                    }
                    else
                    {
-                        /*    hidfeature->rect[k].p2 = (tr.y + tr.width) * tilted->cols + tr.x + tr.width;
-                        hidfeature->rect[k].p3 = (tr.y + tr.width + tr.height) * tilted->cols + tr.x + tr.width - tr.height;
-                        hidfeature->rect[k].p0 = tr.y * tilted->cols + tr.x;
-                        hidfeature->rect[k].p1 = (tr.y + tr.height) * tilted->cols + tr.x - tr.height;
-                        */
-
                        hidnode->p[k][2] = (tr.y + tr.width) * step + tr.x + tr.width;
                        hidnode->p[k][3] = (tr.y + tr.width + tr.height) * step + tr.x + tr.width - tr.height;
                        hidnode->p[k][0] = tr.y * step + tr.x;
                        hidnode->p[k][1] = (tr.y + tr.height) * step + tr.x - tr.height;
                    }
-
-                    //hidfeature->rect[k].weight = (float)(feature->rect[k].weight * correction_ratio);
                    hidnode->weight[k] = (float)(feature->rect[k].weight * correction_ratio);
                    if( k == 0 )
                        area0 = tr.width * tr.height;
                    else
-                        //sum0 += hidfeature->rect[k].weight * tr.width * tr.height;
                        sum0 += hidnode->weight[k] * tr.width * tr.height;
                }
-
-                // hidfeature->rect[0].weight = (float)(-sum0/area0);
                hidnode->weight[0] = (float)(-sum0 / area0);
            } /* l */
        } /* j */
    }
 }

-static void gpuSetHaarClassifierCascade( CvHaarClassifierCascade *_cascade
-                             /*double scale=0.0,*/
-                             /*int step*/)
+static void gpuSetHaarClassifierCascade( CvHaarClassifierCascade *_cascade)
 {
    GpuHidHaarClassifierCascade *cascade;
    int i;
@@ -816,11 +657,7 @@ static void gpuSetHaarClassifierCascade( CvHaarClassifierCascade *_cascade
                    if(!hidnode->p[k][0])
                        break;
                    r[k] = feature->rect[k].r;
-                    // 					base_w = (int)CV_IMIN( (unsigned)base_w, (unsigned)(r[k].width-1) );
-                    // 					base_w = (int)CV_IMIN( (unsigned)base_w, (unsigned)(r[k].x - r[0].x-1) );
-                    // 					base_h = (int)CV_IMIN( (unsigned)base_h, (unsigned)(r[k].height-1) );
-                    // 					base_h = (int)CV_IMIN( (unsigned)base_h, (unsigned)(r[k].y - r[0].y-1) );
-                }
+               }

                nr = k;
                for( k = 0; k < nr; k++ )
@@ -838,7 +675,6 @@ static void gpuSetHaarClassifierCascade( CvHaarClassifierCascade *_cascade
                    hidnode->p[k][3] = tr.height;
                    hidnode->weight[k] = (float)(feature->rect[k].weight * correction_ratio);
                }
-                //hidnode->weight[0]=(float)(-sum0/area0);
            } /* l */
        } /* j */
    }
@@ -851,7 +687,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS

    const double GROUP_EPS = 0.2;
    CvSeq *result_seq = 0;
-    cv::Ptr<CvMemStorage> temp_storage;

    cv::ConcurrentRectVector allCandidates;
    std::vector<cv::Rect> rectList;
@@ -909,6 +744,7 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
    if( gimg.cols < minSize.width || gimg.rows < minSize.height )
        CV_Error(CV_StsError, "Image too small");

+    cl_command_queue qu = reinterpret_cast<cl_command_queue>(Context::getContext()->oclCommandQueue());
    if( (flags & CV_HAAR_SCALE_IMAGE) )
    {
        CvSize winSize0 = cascade->orig_window_size;
@@ -951,7 +787,7 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS

        size_t blocksize = 8;
        size_t localThreads[3] = { blocksize, blocksize , 1 };
-        size_t globalThreads[3] = { grp_per_CU * gsum.clCxt->computeUnits() *localThreads[0],
+        size_t globalThreads[3] = { grp_per_CU *(gsum.clCxt->computeUnits()) *localThreads[0],
                                    localThreads[1], 1
                                  };
        int outputsz = 256 * globalThreads[0] / localThreads[0];
@@ -996,7 +832,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
        gpuSetImagesForHaarClassifierCascade( cascade, 1., gsum.step / 4 );

        stagebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(GpuHidHaarStageClassifier) * gcascade->count);
-        cl_command_queue qu = (cl_command_queue)gsum.clCxt->oclCommandQueue();
        openCLSafeCall(clEnqueueWriteBuffer(qu, stagebuffer, 1, 0, sizeof(GpuHidHaarStageClassifier)*gcascade->count, stage, 0, NULL, NULL));

        nodebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, nodenum * sizeof(GpuHidHaarTreeNode));
@@ -1043,7 +878,9 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
        args.push_back ( std::make_pair(sizeof(cl_int4) , (void *)&pq ));
        args.push_back ( std::make_pair(sizeof(cl_float) , (void *)&correction ));

-        openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascade", globalThreads, localThreads, args, -1, -1);
+        const char * build_options = gcascade->is_stump_based ? "-D STUMP_BASED=1" : "-D STUMP_BASED=0";
+
+        openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascade", globalThreads, localThreads, args, -1, -1, build_options);

        openCLReadBuffer( gsum.clCxt, candidatebuffer, candidate, 4 * sizeof(int)*outputsz );

@@ -1058,6 +895,7 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
        openCLSafeCall(clReleaseMemObject(scaleinfobuffer));
        openCLSafeCall(clReleaseMemObject(nodebuffer));
        openCLSafeCall(clReleaseMemObject(candidatebuffer));
+
    }
    else
    {
@@ -1115,7 +953,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
                       sizeof(GpuHidHaarStageClassifier) * gcascade->count - sizeof(GpuHidHaarClassifier) * totalclassifier) / sizeof(GpuHidHaarTreeNode);
        nodebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY,
                                        nodenum * sizeof(GpuHidHaarTreeNode));
-        cl_command_queue qu = (cl_command_queue)gsum.clCxt->oclCommandQueue();
        openCLSafeCall(clEnqueueWriteBuffer(qu, nodebuffer, 1, 0,
                                            nodenum * sizeof(GpuHidHaarTreeNode),
                                            node, 0, NULL, NULL));
@@ -1157,7 +994,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
            args1.push_back ( std::make_pair(sizeof(cl_int) , (void *)&startnodenum ));

            size_t globalThreads2[3] = {nodenum, 1, 1};
-
            openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuscaleclassifier", globalThreads2, NULL/*localThreads2*/, args1, -1, -1);
        }

@@ -1193,7 +1029,7 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
        args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&correctionbuffer ));
        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&nodenum ));

-        openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuRunHaarClassifierCascade_scaled2", globalThreads, localThreads, args, -1, -1);
+        openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuRunHaarClassifierCascade_scaled2", globalThreads, localThreads, args, -1, -1, build_options);

        candidate = (int *)clEnqueueMapBuffer(qu, candidatebuffer, 1, CL_MAP_READ, 0, 4 * sizeof(int) * outputsz, 0, 0, 0, &status);

@@ -1281,7 +1117,7 @@ void cv::ocl::OclCascadeClassifierBuf::detectMultiScale(oclMat &gimg, CV_OUT std
    int blocksize = 8;
    int grp_per_CU = 12;
    size_t localThreads[3] = { blocksize, blocksize, 1 };
-    size_t globalThreads[3] = { grp_per_CU * Context::getContext()->computeUnits() * localThreads[0],
+    size_t globalThreads[3] = { grp_per_CU * cv::ocl::Context::getContext()->computeUnits() *localThreads[0],
        localThreads[1],
        1 };
    int outputsz = 256 * globalThreads[0] / localThreads[0];
@@ -1297,8 +1133,6 @@ void cv::ocl::OclCascadeClassifierBuf::detectMultiScale(oclMat &gimg, CV_OUT std
    CvHaarClassifierCascade      *cascade = oldCascade;
    GpuHidHaarClassifierCascade  *gcascade;
    GpuHidHaarStageClassifier    *stage;
-    GpuHidHaarClassifier         *classifier;
-    GpuHidHaarTreeNode           *node;

    if( CV_MAT_DEPTH(gimg.type()) != CV_8U )
        CV_Error( CV_StsUnsupportedFormat, "Only 8-bit images are supported" );
@@ -1311,7 +1145,7 @@ void cv::ocl::OclCascadeClassifierBuf::detectMultiScale(oclMat &gimg, CV_OUT std
    }

    int *candidate;
-
+    cl_command_queue qu = reinterpret_cast<cl_command_queue>(Context::getContext()->oclCommandQueue());
    if( (flags & CV_HAAR_SCALE_IMAGE) )
    {
        int indexy = 0;
@@ -1337,19 +1171,6 @@ void cv::ocl::OclCascadeClassifierBuf::detectMultiScale(oclMat &gimg, CV_OUT std

        gcascade   = (GpuHidHaarClassifierCascade *)(cascade->hid_cascade);
        stage      = (GpuHidHaarStageClassifier *)(gcascade + 1);
-        classifier = (GpuHidHaarClassifier *)(stage + gcascade->count);
-        node       = (GpuHidHaarTreeNode *)(classifier->node);
-
-        gpuSetImagesForHaarClassifierCascade( cascade, 1., gsum.step / 4 );
-
-        cl_command_queue qu = (cl_command_queue)gsum.clCxt->oclCommandQueue();
-        openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->stagebuffer, 1, 0,
-                                            sizeof(GpuHidHaarStageClassifier) * gcascade->count,
-                                            stage, 0, NULL, NULL));
-
-        openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->nodebuffer, 1, 0,
-                                            m_nodenum * sizeof(GpuHidHaarTreeNode),
-                                            node, 0, NULL, NULL));

        int startstage = 0;
        int endstage = gcascade->count;
@@ -1386,17 +1207,23 @@ void cv::ocl::OclCascadeClassifierBuf::detectMultiScale(oclMat &gimg, CV_OUT std
        args.push_back ( make_pair(sizeof(cl_int4) , (void *)&pq ));
        args.push_back ( make_pair(sizeof(cl_float) , (void *)&correction ));

-        openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascade", globalThreads, localThreads, args, -1, -1);
+        const char * build_options = gcascade->is_stump_based ? "-D STUMP_BASED=1" : "-D STUMP_BASED=0";
+
+        openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascade", globalThreads, localThreads, args, -1, -1, build_options);

        candidate = (int *)malloc(4 * sizeof(int) * outputsz);
        memset(candidate, 0, 4 * sizeof(int) * outputsz);
+
        openCLReadBuffer( gsum.clCxt, ((OclBuffers *)buffers)->candidatebuffer, candidate, 4 * sizeof(int)*outputsz );

        for(int i = 0; i < outputsz; i++)
+        {
            if(candidate[4 * i + 2] != 0)
+            {
                allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1],
                candidate[4 * i + 2], candidate[4 * i + 3]));
-
+            }
+        }
        free((void *)candidate);
        candidate = NULL;
    }
@@ -1404,6 +1231,132 @@ void cv::ocl::OclCascadeClassifierBuf::detectMultiScale(oclMat &gimg, CV_OUT std
    {
        cv::ocl::integral(gimg, gsum, gsqsum);

+        gcascade   = (GpuHidHaarClassifierCascade *)cascade->hid_cascade;
+
+        int step = gsum.step / 4;
+        int startnode = 0;
+        int splitstage = 3;
+
+        int startstage = 0;
+        int endstage = gcascade->count;
+
+        vector<pair<size_t, const void *> > args;
+        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->stagebuffer ));
+        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->scaleinfobuffer ));
+        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->newnodebuffer ));
+        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&gsum.data ));
+        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&gsqsum.data ));
+        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->candidatebuffer ));
+        args.push_back ( make_pair(sizeof(cl_int) , (void *)&gsum.rows ));
+        args.push_back ( make_pair(sizeof(cl_int) , (void *)&gsum.cols ));
+        args.push_back ( make_pair(sizeof(cl_int) , (void *)&step ));
+        args.push_back ( make_pair(sizeof(cl_int) , (void *)&m_loopcount ));
+        args.push_back ( make_pair(sizeof(cl_int) , (void *)&startstage ));
+        args.push_back ( make_pair(sizeof(cl_int) , (void *)&splitstage ));
+        args.push_back ( make_pair(sizeof(cl_int) , (void *)&endstage ));
+        args.push_back ( make_pair(sizeof(cl_int) , (void *)&startnode ));
+        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->pbuffer ));
+        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->correctionbuffer ));
+        args.push_back ( make_pair(sizeof(cl_int) , (void *)&m_nodenum ));
+
+        const char * build_options = gcascade->is_stump_based ? "-D STUMP_BASED=1" : "-D STUMP_BASED=0";
+        openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuRunHaarClassifierCascade_scaled2", globalThreads, localThreads, args, -1, -1, build_options);
+
+        candidate = (int *)clEnqueueMapBuffer(qu, ((OclBuffers *)buffers)->candidatebuffer, 1, CL_MAP_READ, 0, 4 * sizeof(int) * outputsz, 0, 0, 0, NULL);
+
+        for(int i = 0; i < outputsz; i++)
+        {
+            if(candidate[4 * i + 2] != 0)
+                allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1],
+                candidate[4 * i + 2], candidate[4 * i + 3]));
+        }
+        clEnqueueUnmapMemObject(qu, ((OclBuffers *)buffers)->candidatebuffer, candidate, 0, 0, 0);
+    }
+    rectList.resize(allCandidates.size());
+    if(!allCandidates.empty())
+        std::copy(allCandidates.begin(), allCandidates.end(), rectList.begin());
+
+    if( minNeighbors != 0 || findBiggestObject )
+        groupRectangles(rectList, rweights, std::max(minNeighbors, 1), GROUP_EPS);
+    else
+        rweights.resize(rectList.size(), 0);
+
+    GenResult(faces, rectList, rweights);
+}
+
+void cv::ocl::OclCascadeClassifierBuf::Init(const int rows, const int cols,
+    double scaleFactor, int flags,
+    const int outputsz, const size_t localThreads[],
+    CvSize minSize, CvSize maxSize)
+{
+    if(initialized)
+    {
+        return; // we only allow one time initialization
+    }
+    CvHaarClassifierCascade      *cascade = oldCascade;
+
+    if( !CV_IS_HAAR_CLASSIFIER(cascade) )
+        CV_Error( !cascade ? CV_StsNullPtr : CV_StsBadArg, "Invalid classifier cascade" );
+
+    if( scaleFactor <= 1 )
+        CV_Error( CV_StsOutOfRange, "scale factor must be > 1" );
+
+    if( cols < minSize.width || rows < minSize.height )
+        CV_Error(CV_StsError, "Image too small");
+
+    int datasize=0;
+    int totalclassifier=0;
+
+    if( !cascade->hid_cascade )
+    {
+        gpuCreateHidHaarClassifierCascade(cascade, &datasize, &totalclassifier);
+    }
+
+    if( maxSize.height == 0 || maxSize.width == 0 )
+    {
+        maxSize.height = rows;
+        maxSize.width = cols;
+    }
+
+    findBiggestObject = (flags & CV_HAAR_FIND_BIGGEST_OBJECT) != 0;
+    if( findBiggestObject )
+        flags &= ~(CV_HAAR_SCALE_IMAGE | CV_HAAR_DO_CANNY_PRUNING);
+
+    CreateBaseBufs(datasize, totalclassifier, flags, outputsz);
+    CreateFactorRelatedBufs(rows, cols, flags, scaleFactor, localThreads, minSize, maxSize);
+
+    m_scaleFactor = scaleFactor;
+    m_rows = rows;
+    m_cols = cols;
+    m_flags = flags;
+    m_minSize = minSize;
+    m_maxSize = maxSize;
+
+    // initialize nodes
+    GpuHidHaarClassifierCascade  *gcascade;
+    GpuHidHaarStageClassifier    *stage;
+    GpuHidHaarClassifier         *classifier;
+    GpuHidHaarTreeNode           *node;
+    cl_command_queue qu = reinterpret_cast<cl_command_queue>(Context::getContext()->oclCommandQueue());
+    if( (flags & CV_HAAR_SCALE_IMAGE) )
+    {
+        gcascade   = (GpuHidHaarClassifierCascade *)(cascade->hid_cascade);
+        stage      = (GpuHidHaarStageClassifier *)(gcascade + 1);
+        classifier = (GpuHidHaarClassifier *)(stage + gcascade->count);
+        node       = (GpuHidHaarTreeNode *)(classifier->node);
+
+        gpuSetImagesForHaarClassifierCascade( cascade, 1., gsum.step / 4 );
+
+        openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->stagebuffer, 1, 0,
+            sizeof(GpuHidHaarStageClassifier) * gcascade->count,
+            stage, 0, NULL, NULL));
+
+        openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->nodebuffer, 1, 0,
+                                            m_nodenum * sizeof(GpuHidHaarTreeNode),
+                                            node, 0, NULL, NULL));
+    }
+    else
+    {
        gpuSetHaarClassifierCascade(cascade);

        gcascade   = (GpuHidHaarClassifierCascade *)cascade->hid_cascade;
@@ -1411,15 +1364,12 @@ void cv::ocl::OclCascadeClassifierBuf::detectMultiScale(oclMat &gimg, CV_OUT std
        classifier = (GpuHidHaarClassifier *)(stage + gcascade->count);
        node       = (GpuHidHaarTreeNode *)(classifier->node);

-        cl_command_queue qu = (cl_command_queue)gsum.clCxt->oclCommandQueue();
        openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->nodebuffer, 1, 0,
-                                            m_nodenum * sizeof(GpuHidHaarTreeNode),
-                                            node, 0, NULL, NULL));
+            m_nodenum * sizeof(GpuHidHaarTreeNode),
+            node, 0, NULL, NULL));

        cl_int4 *p = (cl_int4 *)malloc(sizeof(cl_int4) * m_loopcount);
        float *correction = (float *)malloc(sizeof(float) * m_loopcount);
-        int startstage = 0;
-        int endstage = gcascade->count;
        double factor;
        for(int i = 0; i < m_loopcount; i++)
        {
@@ -1445,105 +1395,15 @@ void cv::ocl::OclCascadeClassifierBuf::detectMultiScale(oclMat &gimg, CV_OUT std

            size_t globalThreads2[3] = {m_nodenum, 1, 1};

-            openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuscaleclassifier", globalThreads2, NULL/*localThreads2*/, args1, -1, -1);
+            openCLExecuteKernel(Context::getContext(), &haarobjectdetect_scaled2, "gpuscaleclassifier", globalThreads2, NULL/*localThreads2*/, args1, -1, -1);
        }
-
-        int step = gsum.step / 4;
-        int startnode = 0;
-        int splitstage = 3;
        openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->stagebuffer, 1, 0, sizeof(GpuHidHaarStageClassifier)*gcascade->count, stage, 0, NULL, NULL));
        openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->pbuffer, 1, 0, sizeof(cl_int4)*m_loopcount, p, 0, NULL, NULL));
        openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->correctionbuffer, 1, 0, sizeof(cl_float)*m_loopcount, correction, 0, NULL, NULL));

-        vector<pair<size_t, const void *> > args;
-        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->stagebuffer ));
-        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->scaleinfobuffer ));
-        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->newnodebuffer ));
-        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&gsum.data ));
-        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&gsqsum.data ));
-        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->candidatebuffer ));
-        args.push_back ( make_pair(sizeof(cl_int) , (void *)&gsum.rows ));
-        args.push_back ( make_pair(sizeof(cl_int) , (void *)&gsum.cols ));
-        args.push_back ( make_pair(sizeof(cl_int) , (void *)&step ));
-        args.push_back ( make_pair(sizeof(cl_int) , (void *)&m_loopcount ));
-        args.push_back ( make_pair(sizeof(cl_int) , (void *)&startstage ));
-        args.push_back ( make_pair(sizeof(cl_int) , (void *)&splitstage ));
-        args.push_back ( make_pair(sizeof(cl_int) , (void *)&endstage ));
-        args.push_back ( make_pair(sizeof(cl_int) , (void *)&startnode ));
-        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->pbuffer ));
-        args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->correctionbuffer ));
-        args.push_back ( make_pair(sizeof(cl_int) , (void *)&m_nodenum ));
-
-        openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuRunHaarClassifierCascade_scaled2", globalThreads, localThreads, args, -1, -1);
-
-        candidate = (int *)clEnqueueMapBuffer(qu, ((OclBuffers *)buffers)->candidatebuffer, 1, CL_MAP_READ, 0, 4 * sizeof(int) * outputsz, 0, 0, 0, NULL);
-
-        for(int i = 0; i < outputsz; i++)
-        {
-            if(candidate[4 * i + 2] != 0)
-                allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1],
-                candidate[4 * i + 2], candidate[4 * i + 3]));
-        }
-
        free(p);
        free(correction);
-        clEnqueueUnmapMemObject(qu, ((OclBuffers *)buffers)->candidatebuffer, candidate, 0, 0, 0);
    }
-
-    rectList.resize(allCandidates.size());
-    if(!allCandidates.empty())
-        std::copy(allCandidates.begin(), allCandidates.end(), rectList.begin());
-
-    if( minNeighbors != 0 || findBiggestObject )
-        groupRectangles(rectList, rweights, std::max(minNeighbors, 1), GROUP_EPS);
-    else
-        rweights.resize(rectList.size(), 0);
-
-    GenResult(faces, rectList, rweights);
-}
-
-void cv::ocl::OclCascadeClassifierBuf::Init(const int rows, const int cols,
-    double scaleFactor, int flags,
-    const int outputsz, const size_t localThreads[],
-    CvSize minSize, CvSize maxSize)
-{
-    CvHaarClassifierCascade      *cascade = oldCascade;
-
-    if( !CV_IS_HAAR_CLASSIFIER(cascade) )
-        CV_Error( !cascade ? CV_StsNullPtr : CV_StsBadArg, "Invalid classifier cascade" );
-
-    if( scaleFactor <= 1 )
-        CV_Error( CV_StsOutOfRange, "scale factor must be > 1" );
-
-    if( cols < minSize.width || rows < minSize.height )
-        CV_Error(CV_StsError, "Image too small");
-
-    int datasize=0;
-    int totalclassifier=0;
-
-    if( !cascade->hid_cascade )
-        gpuCreateHidHaarClassifierCascade(cascade, &datasize, &totalclassifier);
-
-    if( maxSize.height == 0 || maxSize.width == 0 )
-    {
-        maxSize.height = rows;
-        maxSize.width = cols;
-    }
-
-    findBiggestObject = (flags & CV_HAAR_FIND_BIGGEST_OBJECT) != 0;
-    if( findBiggestObject )
-        flags &= ~(CV_HAAR_SCALE_IMAGE | CV_HAAR_DO_CANNY_PRUNING);
-
-    CreateBaseBufs(datasize, totalclassifier, flags, outputsz);
-    CreateFactorRelatedBufs(rows, cols, flags, scaleFactor, localThreads, minSize, maxSize);
-
-    m_scaleFactor = scaleFactor;
-    m_rows = rows;
-    m_cols = cols;
-    m_flags = flags;
-    m_minSize = minSize;
-    m_maxSize = maxSize;
-
    initialized = true;
 }

@@ -1642,6 +1502,7 @@ void cv::ocl::OclCascadeClassifierBuf::CreateFactorRelatedBufs(
    CvSize sz;
    CvSize winSize0 = oldCascade->orig_window_size;
    detect_piramid_info *scaleinfo;
+    cl_command_queue qu = reinterpret_cast<cl_command_queue>(Context::getContext()->oclCommandQueue());
    if (flags & CV_HAAR_SCALE_IMAGE)
    {
        for(factor = 1.f;; factor *= scaleFactor)
@@ -1743,7 +1604,7 @@ void cv::ocl::OclCascadeClassifierBuf::CreateFactorRelatedBufs(
        ((OclBuffers *)buffers)->scaleinfobuffer = openCLCreateBuffer(cv::ocl::Context::getContext(), CL_MEM_READ_ONLY, sizeof(detect_piramid_info) * loopcount);
    }

-    openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)cv::ocl::Context::getContext()->oclCommandQueue(), ((OclBuffers *)buffers)->scaleinfobuffer, 1, 0,
+    openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->scaleinfobuffer, 1, 0,
        sizeof(detect_piramid_info)*loopcount,
        scaleinfo, 0, NULL, NULL));
    free(scaleinfo);
@@ -1755,7 +1616,8 @@ void cv::ocl::OclCascadeClassifierBuf::GenResult(CV_OUT std::vector<cv::Rect>& f
                                                 const std::vector<cv::Rect> &rectList,
                                                 const std::vector<int> &rweights)
 {
-    CvSeq *result_seq = cvCreateSeq( 0, sizeof(CvSeq), sizeof(CvAvgComp), cvCreateMemStorage(0) );
+    MemStorage tempStorage(cvCreateMemStorage(0));
+    CvSeq *result_seq = cvCreateSeq( 0, sizeof(CvSeq), sizeof(CvAvgComp), tempStorage );

    if( findBiggestObject && rectList.size() )
    {
@@ -1791,168 +1653,32 @@ void cv::ocl::OclCascadeClassifierBuf::GenResult(CV_OUT std::vector<cv::Rect>& f

 void cv::ocl::OclCascadeClassifierBuf::release()
 {
-    openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->stagebuffer));
-    openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->scaleinfobuffer));
-    openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->nodebuffer));
-    openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->candidatebuffer));
-
-    if( (m_flags & CV_HAAR_SCALE_IMAGE) )
+    if(initialized)
    {
-        cvFree(&oldCascade->hid_cascade);
-    }
-    else
-    {
-        openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->newnodebuffer));
-        openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->correctionbuffer));
-        openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->pbuffer));
-    }
+        openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->stagebuffer));
+        openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->scaleinfobuffer));
+        openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->nodebuffer));
+        openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->candidatebuffer));

-    free(buffers);
-    buffers = NULL;
+        if( (m_flags & CV_HAAR_SCALE_IMAGE) )
+        {
+            cvFree(&oldCascade->hid_cascade);
+        }
+        else
+        {
+            openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->newnodebuffer));
+            openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->correctionbuffer));
+            openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->pbuffer));
+        }
+
+        free(buffers);
+        buffers = NULL;
+        initialized = false;
+    }
 }

 #ifndef _MAX_PATH
 #define _MAX_PATH 1024
 #endif

-
-/****************************************************************************************\
-*                                  Persistence functions                                 *
-\****************************************************************************************/
-
-/* field names */
-
-#define ICV_HAAR_SIZE_NAME            "size"
-#define ICV_HAAR_STAGES_NAME          "stages"
-#define ICV_HAAR_TREES_NAME             "trees"
-#define ICV_HAAR_FEATURE_NAME             "feature"
-#define ICV_HAAR_RECTS_NAME                 "rects"
-#define ICV_HAAR_TILTED_NAME                "tilted"
-#define ICV_HAAR_THRESHOLD_NAME           "threshold"
-#define ICV_HAAR_LEFT_NODE_NAME           "left_node"
-#define ICV_HAAR_LEFT_VAL_NAME            "left_val"
-#define ICV_HAAR_RIGHT_NODE_NAME          "right_node"
-#define ICV_HAAR_RIGHT_VAL_NAME           "right_val"
-#define ICV_HAAR_STAGE_THRESHOLD_NAME   "stage_threshold"
-#define ICV_HAAR_PARENT_NAME            "parent"
-#define ICV_HAAR_NEXT_NAME              "next"
-
-static int gpuRunHaarClassifierCascade( /*const CvHaarClassifierCascade *_cascade, CvPoint pt, int start_stage */)
-{
-    return 1;
-}
-
-namespace cv
-{
-namespace ocl
-{
-
-struct gpuHaarDetectObjects_ScaleImage_Invoker
-{
-    gpuHaarDetectObjects_ScaleImage_Invoker( const CvHaarClassifierCascade *_cascade,
-            int _stripSize, double _factor,
-            const Mat &_sum1, const Mat &_sqsum1, Mat *_norm1,
-            Mat *_mask1, Rect _equRect, ConcurrentRectVector &_vec )
-    {
-        cascade = _cascade;
-        stripSize = _stripSize;
-        factor = _factor;
-        sum1 = _sum1;
-        sqsum1 = _sqsum1;
-        norm1 = _norm1;
-        mask1 = _mask1;
-        equRect = _equRect;
-        vec = &_vec;
-    }
-
-    void operator()( const BlockedRange &range ) const
-    {
-        Size winSize0 = cascade->orig_window_size;
-        Size winSize(cvRound(winSize0.width * factor), cvRound(winSize0.height * factor));
-        int y1 = range.begin() * stripSize, y2 = std::min(range.end() * stripSize, sum1.rows - 1 - winSize0.height);
-        Size ssz(sum1.cols - 1 - winSize0.width, y2 - y1);
-        int x, y, ystep = factor > 2 ? 1 : 2;
-
-        for( y = y1; y < y2; y += ystep )
-            for( x = 0; x < ssz.width; x += ystep )
-            {
-                if( gpuRunHaarClassifierCascade( /*cascade, cvPoint(x, y), 0*/ ) > 0 )
-                    vec->push_back(Rect(cvRound(x * factor), cvRound(y * factor),
-                                        winSize.width, winSize.height));
-            }
-    }
-
-    const CvHaarClassifierCascade *cascade;
-    int stripSize;
-    double factor;
-    Mat sum1, sqsum1, *norm1, *mask1;
-    Rect equRect;
-    ConcurrentRectVector *vec;
-};
-
-
-struct gpuHaarDetectObjects_ScaleCascade_Invoker
-{
-    gpuHaarDetectObjects_ScaleCascade_Invoker( const CvHaarClassifierCascade *_cascade,
-            Size _winsize, const Range &_xrange, double _ystep,
-            size_t _sumstep, const int **_p, const int **_pq,
-            ConcurrentRectVector &_vec )
-    {
-        cascade = _cascade;
-        winsize = _winsize;
-        xrange = _xrange;
-        ystep = _ystep;
-        sumstep = _sumstep;
-        p = _p;
-        pq = _pq;
-        vec = &_vec;
-    }
-
-    void operator()( const BlockedRange &range ) const
-    {
-        int iy, startY = range.begin(), endY = range.end();
-        const int *p0 = p[0], *p1 = p[1], *p2 = p[2], *p3 = p[3];
-        const int *pq0 = pq[0], *pq1 = pq[1], *pq2 = pq[2], *pq3 = pq[3];
-        bool doCannyPruning = p0 != 0;
-        int sstep = (int)(sumstep / sizeof(p0[0]));
-
-        for( iy = startY; iy < endY; iy++ )
-        {
-            int ix, y = cvRound(iy * ystep), ixstep = 1;
-            for( ix = xrange.start; ix < xrange.end; ix += ixstep )
-            {
-                int x = cvRound(ix * ystep); // it should really be ystep, not ixstep
-
-                if( doCannyPruning )
-                {
-                    int offset = y * sstep + x;
-                    int s = p0[offset] - p1[offset] - p2[offset] + p3[offset];
-                    int sq = pq0[offset] - pq1[offset] - pq2[offset] + pq3[offset];
-                    if( s < 100 || sq < 20 )
-                    {
-                        ixstep = 2;
-                        continue;
-                    }
-                }
-
-                int result = gpuRunHaarClassifierCascade(/* cascade, cvPoint(x, y), 0 */);
-                if( result > 0 )
-                    vec->push_back(Rect(x, y, winsize.width, winsize.height));
-                ixstep = result != 0 ? 1 : 2;
-            }
-        }
-    }
-
-    const CvHaarClassifierCascade *cascade;
-    double ystep;
-    size_t sumstep;
-    Size winsize;
-    Range xrange;
-    const int **p;
-    const int **pq;
-    ConcurrentRectVector *vec;
-};
-
-}
-}
 #endif
--- a/modules/ocl/src/imgproc.cpp
+++ b/modules/ocl/src/imgproc.cpp
@@ -270,7 +270,7 @@ namespace cv
            size_t globalThreads[3] = {glbSizeX, glbSizeY, 1};
            size_t localThreads[3] = {blkSizeX, blkSizeY, 1};

-
+            float borderFloat[4] = {(float)borderValue[0], (float)borderValue[1], (float)borderValue[2], (float)borderValue[3]};
            std::vector< std::pair<size_t, const void *> > args;
            if(map1.channels() == 2)
            {
@@ -292,7 +292,7 @@ namespace cv
                args.push_back( std::make_pair(sizeof(cl_int), (void *)&cols));
                float borderFloat[4] = {(float)borderValue[0], (float)borderValue[1], (float)borderValue[2], (float)borderValue[3]};

-               if(src.clCxt->supportsFeature(Context::CL_DOUBLE))
+                if(src.clCxt->supportsFeature(Context::CL_DOUBLE))
                {
                    args.push_back( std::make_pair(sizeof(cl_double4), (void *)&borderValue));
                }
@@ -326,7 +326,6 @@ namespace cv
                }
                else
                {
-                    float borderFloat[4] = {(float)borderValue[0], (float)borderValue[1], (float)borderValue[2], (float)borderValue[3]};
                    args.push_back( std::make_pair(sizeof(cl_float4), (void *)&borderFloat));
                }
            }
@@ -1210,30 +1209,41 @@ namespace cv
        void cornerHarris(const oclMat &src, oclMat &dst, int blockSize, int ksize,
                          double k, int borderType)
        {
-            if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
-            {
-                CV_Error(Error::GpuNotSupported, "select device don't support double");
-            }
-            CV_Assert(src.cols >= blockSize / 2 && src.rows >= blockSize / 2);
-            oclMat Dx, Dy;
-            CV_Assert(borderType == cv::BORDER_CONSTANT || borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE || borderType == cv::BORDER_REFLECT);
-            extractCovData(src, Dx, Dy, blockSize, ksize, borderType);
-            dst.create(src.size(), CV_32F);
-            corner_ocl(imgproc_calcHarris, "calcHarris", blockSize, static_cast<float>(k), Dx, Dy, dst, borderType);
+            oclMat dx, dy;
+            cornerHarris_dxdy(src, dst, dx, dy, blockSize, ksize, k, borderType);
        }

-        void cornerMinEigenVal(const oclMat &src, oclMat &dst, int blockSize, int ksize, int borderType)
+        void cornerHarris_dxdy(const oclMat &src, oclMat &dst, oclMat &dx, oclMat &dy, int blockSize, int ksize,
+                          double k, int borderType)
        {
            if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
            {
                CV_Error(Error::GpuNotSupported, "select device don't support double");
            }
            CV_Assert(src.cols >= blockSize / 2 && src.rows >= blockSize / 2);
-            oclMat Dx, Dy;
            CV_Assert(borderType == cv::BORDER_CONSTANT || borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE || borderType == cv::BORDER_REFLECT);
-            extractCovData(src, Dx, Dy, blockSize, ksize, borderType);
+            extractCovData(src, dx, dy, blockSize, ksize, borderType);
            dst.create(src.size(), CV_32F);
-            corner_ocl(imgproc_calcMinEigenVal, "calcMinEigenVal", blockSize, 0, Dx, Dy, dst, borderType);
+            corner_ocl(imgproc_calcHarris, "calcHarris", blockSize, static_cast<float>(k), dx, dy, dst, borderType);
+        }
+
+        void cornerMinEigenVal(const oclMat &src, oclMat &dst, int blockSize, int ksize, int borderType)
+        {
+            oclMat dx, dy;
+            cornerMinEigenVal_dxdy(src, dst, dx, dy, blockSize, ksize, borderType);
+        }
+
+        void cornerMinEigenVal_dxdy(const oclMat &src, oclMat &dst, oclMat &dx, oclMat &dy, int blockSize, int ksize, int borderType)
+        {
+            if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
+            {
+                CV_Error(Error::GpuNotSupported, "select device don't support double");
+            }
+            CV_Assert(src.cols >= blockSize / 2 && src.rows >= blockSize / 2);
+            CV_Assert(borderType == cv::BORDER_CONSTANT || borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE || borderType == cv::BORDER_REFLECT);
+            extractCovData(src, dx, dy, blockSize, ksize, borderType);
+            dst.create(src.size(), CV_32F);
+            corner_ocl(imgproc_calcMinEigenVal, "calcMinEigenVal", blockSize, 0, dx, dy, dst, borderType);
        }
        /////////////////////////////////// MeanShiftfiltering ///////////////////////////////////////////////
        static void meanShiftFiltering_gpu(const oclMat &src, oclMat dst, int sp, int sr, int maxIter, float eps)
--- a/modules/ocl/src/mcwutil.cpp
+++ b/modules/ocl/src/mcwutil.cpp
@@ -43,9 +43,28 @@
 //
 //M*/

-#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
 #include "precomp.hpp"

+#ifdef __GNUC__
+#if ((__GNUC__ * 100) + __GNUC_MINOR__) >= 402
+#define GCC_DIAG_STR(s) #s
+#define GCC_DIAG_JOINSTR(x,y) GCC_DIAG_STR(x ## y)
+# define GCC_DIAG_DO_PRAGMA(x) _Pragma (#x)
+# define GCC_DIAG_PRAGMA(x) GCC_DIAG_DO_PRAGMA(GCC diagnostic x)
+# if ((__GNUC__ * 100) + __GNUC_MINOR__) >= 406
+#  define GCC_DIAG_OFF(x) GCC_DIAG_PRAGMA(push) \
+GCC_DIAG_PRAGMA(ignored GCC_DIAG_JOINSTR(-W,x))
+#  define GCC_DIAG_ON(x) GCC_DIAG_PRAGMA(pop)
+# else
+#  define GCC_DIAG_OFF(x) GCC_DIAG_PRAGMA(ignored GCC_DIAG_JOINSTR(-W,x))
+#  define GCC_DIAG_ON(x)  GCC_DIAG_PRAGMA(warning GCC_DIAG_JOINSTR(-W,x))
+# endif
+#else
+# define GCC_DIAG_OFF(x)
+# define GCC_DIAG_ON(x)
+#endif
+#endif /* __GNUC__ */
+
 using namespace std;

 namespace cv
@@ -121,6 +140,9 @@ namespace cv
                                  build_options, finish_mode);
        }

+#ifdef __GNUC__
+        GCC_DIAG_OFF(deprecated-declarations)
+#endif
        cl_mem bindTexture(const oclMat &mat)
        {
            cl_mem texture;
@@ -156,7 +178,7 @@ namespace cv
                format.image_channel_order     = CL_RGBA;
                break;
            default:
-                CV_Error(-1, "Image forma is not supported");
+                CV_Error(-1, "Image format is not supported");
                break;
            }
 #ifdef CL_VERSION_1_2
@@ -180,10 +202,6 @@ namespace cv
            else
 #endif
            {
-#ifdef __GNUC__
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-#endif
                texture = clCreateImage2D(
                    (cl_context)mat.clCxt->oclContext(),
                    CL_MEM_READ_WRITE,
@@ -193,9 +211,6 @@ namespace cv
                    0,
                    NULL,
                    &err);
-#ifdef __GNUC__
-#pragma GCC diagnostic pop
-#endif
            }
            size_t origin[] = { 0, 0, 0 };
            size_t region[] = { mat.cols, mat.rows, 1 };
@@ -225,6 +240,14 @@ namespace cv
            openCLSafeCall(err);
            return texture;
        }
+#ifdef __GNUC__
+        GCC_DIAG_ON(deprecated-declarations)
+#endif
+
+        Ptr<TextureCL> bindTexturePtr(const oclMat &mat)
+        {
+            return Ptr<TextureCL>(new TextureCL(bindTexture(mat), mat.rows, mat.cols, mat.type()));
+        }
        void releaseTexture(cl_mem& texture)
        {
            openCLFree(texture);
--- a/modules/ocl/src/opencl/arithm_add.cl
+++ b/modules/ocl/src/opencl/arithm_add.cl
@@ -127,7 +127,7 @@ __kernel void arithm_add_D2 (__global ushort *src1, int src1_step, int src1_offs
 #ifdef dst_align
 #undef dst_align
 #endif
-#define dst_align ((dst_offset >> 1) & 3)
+#define dst_align ((dst_offset / 2) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

@@ -165,7 +165,7 @@ __kernel void arithm_add_D3 (__global short *src1, int src1_step, int src1_offse
 #ifdef dst_align
 #undef dst_align
 #endif
-#define dst_align ((dst_offset >> 1) & 3)
+#define dst_align ((dst_offset / 2) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

@@ -335,7 +335,7 @@ __kernel void arithm_add_with_mask_C1_D2 (__global ushort *src1, int src1_step,
 #ifdef dst_align
 #undef dst_align
 #endif
-#define dst_align ((dst_offset >> 1) & 1)
+#define dst_align ((dst_offset / 2) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -375,7 +375,7 @@ __kernel void arithm_add_with_mask_C1_D3 (__global short *src1, int src1_step, i
 #ifdef dst_align
 #undef dst_align
 #endif
-#define dst_align ((dst_offset >> 1) & 1)
+#define dst_align ((dst_offset / 2) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -507,7 +507,7 @@ __kernel void arithm_add_with_mask_C2_D0 (__global uchar *src1, int src1_step, i
 #ifdef dst_align
 #undef dst_align
 #endif
-#define dst_align ((dst_offset >> 1) & 1)
+#define dst_align ((dst_offset / 2) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
--- a/modules/ocl/src/opencl/arithm_add_scalar_mask.cl
+++ b/modules/ocl/src/opencl/arithm_add_scalar_mask.cl
@@ -126,7 +126,7 @@ __kernel void arithm_s_add_with_mask_C1_D2 (__global   ushort *src1, int src1_st
 #ifdef dst_align
 #undef dst_align
 #endif
-#define dst_align ((dst_offset >> 1) & 1)
+#define dst_align ((dst_offset / 2) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

@@ -164,7 +164,7 @@ __kernel void arithm_s_add_with_mask_C1_D3 (__global   short *src1, int src1_ste
 #ifdef dst_align
 #undef dst_align
 #endif
-#define dst_align ((dst_offset >> 1) & 1)
+#define dst_align ((dst_offset / 2) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

@@ -288,7 +288,7 @@ __kernel void arithm_s_add_with_mask_C2_D0 (__global   uchar *src1, int src1_ste
 #ifdef dst_align
 #undef dst_align
 #endif
-#define dst_align ((dst_offset >> 1) & 1)
+#define dst_align ((dst_offset / 2) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);

--- a/modules/ocl/src/opencl/filtering_morph.cl
+++ b/modules/ocl/src/opencl/filtering_morph.cl
@@ -120,7 +120,7 @@ __kernel void morph_C1_D0(__global const uchar * restrict src,
    int gidy = get_global_id(1);
    int out_addr = mad24(gidy,dst_step_in_pixel,gidx+dst_offset_in_pixel);

-    if(gidx+3<cols && gidy<rows && (dst_offset_in_pixel&3)==0)
+    if(gidx+3<cols && gidy<rows && ((dst_offset_in_pixel&3)==0))
    {
        *(__global uchar4*)&dst[out_addr] = res;
    }
--- a/modules/ocl/src/opencl/haarobjectdetect.cl
+++ b/modules/ocl/src/opencl/haarobjectdetect.cl
@@ -10,6 +10,7 @@
 //    Wang Weiyan, wangweiyanster@gmail.com
 //    Jia Haipeng, jiahaipeng95@gmail.com
 //    Nathan, liujun@multicorewareinc.com
+//    Peng Xiao, pengxiao@outlook.com
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
@@ -45,27 +46,16 @@
 typedef int   sumtype;
 typedef float sqsumtype;

-typedef struct  __attribute__((aligned (128)))  GpuHidHaarFeature
-{
-    struct __attribute__((aligned (32)))
-{
-    int p0 __attribute__((aligned (4)));
-    int p1 __attribute__((aligned (4)));
-    int p2 __attribute__((aligned (4)));
-    int p3 __attribute__((aligned (4)));
-    float weight __attribute__((aligned (4)));
-}
-rect[CV_HAAR_FEATURE_MAX] __attribute__((aligned (32)));
-}
-GpuHidHaarFeature;
-
+#ifndef STUMP_BASED
+#define STUMP_BASED 1
+#endif

 typedef struct __attribute__((aligned (128) )) GpuHidHaarTreeNode
 {
    int p[CV_HAAR_FEATURE_MAX][4] __attribute__((aligned (64)));
-    float weight[CV_HAAR_FEATURE_MAX] /*__attribute__((aligned (16)))*/;
-    float threshold /*__attribute__((aligned (4)))*/;
-    float alpha[2] __attribute__((aligned (8)));
+    float weight[CV_HAAR_FEATURE_MAX];
+    float threshold;
+    float alpha[3] __attribute__((aligned (16)));
    int left __attribute__((aligned (4)));
    int right __attribute__((aligned (4)));
 }
@@ -111,7 +101,6 @@ typedef struct __attribute__((aligned (64))) GpuHidHaarClassifierCascade
    float inv_window_area __attribute__((aligned (4)));
 } GpuHidHaarClassifierCascade;

-
 __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCascade(
    global GpuHidHaarStageClassifier * stagecascadeptr,
    global int4 * info,
@@ -234,7 +223,7 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
                float stage_sum = 0.f;
                int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop);
                float stagethreshold = as_float(stageinfo.y);
-                for(int nodeloop = 0; nodeloop < stageinfo.x; nodeloop++ )
+                for(int nodeloop = 0; nodeloop < stageinfo.x; )
                {
                    __global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter);

@@ -242,7 +231,8 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
                    int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
                    int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
                    float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
-                    float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0]));
+                    float3 alpha3 = *(__global float3*)(&(currentnodeptr->alpha[0]));
+
                    float nodethreshold  = w.w * variance_norm_factor;

                    info1.x +=lcl_off;
@@ -261,8 +251,34 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
                    classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] -
                                    lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z;

-                    stage_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
+                    bool passThres = classsum >= nodethreshold;
+#if STUMP_BASED
+                    stage_sum += passThres ? alpha3.y : alpha3.x;
                    nodecounter++;
+                    nodeloop++;
+#else
+                    bool isRootNode = (nodecounter & 1) == 0;
+                    if(isRootNode)
+                    {
+                        if( (passThres && currentnodeptr->right) ||
+                            (!passThres && currentnodeptr->left))
+                        {
+                            nodecounter ++;
+                        }
+                        else
+                        {
+                            stage_sum += alpha3.x;
+                            nodecounter += 2;
+                            nodeloop ++;
+                        }
+                    }
+                    else
+                    {
+                        stage_sum += passThres ? alpha3.z : alpha3.y;
+                        nodecounter ++;
+                        nodeloop ++;
+                    }
+#endif
                }

                result = (stage_sum >= stagethreshold);
@@ -301,18 +317,20 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa

                    if(lcl_compute_win_id < queuecount)
                    {
-
                        int tempnodecounter = lcl_compute_id;
                        float part_sum = 0.f;
-                        for(int lcl_loop=0; lcl_loop<lcl_loops && tempnodecounter<stageinfo.x; lcl_loop++)
+                        const int stump_factor = STUMP_BASED ? 1 : 2;
+                        int root_offset = 0;
+                        for(int lcl_loop=0; lcl_loop<lcl_loops && tempnodecounter<stageinfo.x;)
                        {
-                            __global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter + tempnodecounter);
+                            __global GpuHidHaarTreeNode* currentnodeptr =
+                                nodeptr + (nodecounter + tempnodecounter) * stump_factor + root_offset;

                            int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
                            int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
                            int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
                            float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
-                            float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0]));
+                            float3 alpha3 = *(__global float3*)(&(currentnodeptr->alpha[0]));
                            float nodethreshold  = w.w * variance_norm_factor;

                            info1.x +=queue_pixel;
@@ -332,8 +350,34 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
                            classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] -
                                            lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z;

-                            part_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
-                            tempnodecounter +=lcl_compute_win;
+                            bool passThres = classsum >= nodethreshold;
+#if STUMP_BASED
+                            part_sum += passThres ? alpha3.y : alpha3.x;
+                            tempnodecounter += lcl_compute_win;
+                            lcl_loop++;
+#else
+                            if(root_offset == 0)
+                            {
+                                if( (passThres && currentnodeptr->right) ||
+                                    (!passThres && currentnodeptr->left))
+                                {
+                                    root_offset = 1;
+                                }
+                                else
+                                {
+                                    part_sum += alpha3.x;
+                                    tempnodecounter += lcl_compute_win;
+                                    lcl_loop++;
+                                }
+                            }
+                            else
+                            {
+                                part_sum += passThres ? alpha3.z : alpha3.y;
+                                tempnodecounter += lcl_compute_win;
+                                lcl_loop++;
+                                root_offset = 0;
+                            }
+#endif
                        }//end for(int lcl_loop=0;lcl_loop<lcl_loops;lcl_loop++)
                        partialsum[lcl_id]=part_sum;
                    }
@@ -377,155 +421,3 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
        }//end for(int grploop=grpidx;grploop<totalgrp;grploop+=grpnumx)
    }//end for(int scalei = 0; scalei <loopcount; scalei++)
 }
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-/*
-if(stagecascade->two_rects)
-{
-    #pragma unroll
-    for( n = 0; n < stagecascade->count; n++ )
-    {
-        t1 = *(node + counter);
-        t = t1.threshold * variance_norm_factor;
-        classsum = calc_sum1(t1,p_offset,0) * t1.weight[0];
-
-        classsum  += calc_sum1(t1, p_offset,1) * t1.weight[1];
-        stage_sum += classsum >= t ? t1.alpha[1]:t1.alpha[0];
-
-        counter++;
-    }
-}
-else
-{
-    #pragma unroll
-    for( n = 0; n < stagecascade->count; n++ )
-    {
-        t = node[counter].threshold*variance_norm_factor;
-        classsum = calc_sum1(node[counter],p_offset,0) * node[counter].weight[0];
-        classsum += calc_sum1(node[counter],p_offset,1) * node[counter].weight[1];
-
-        if( node[counter].p0[2] )
-            classsum += calc_sum1(node[counter],p_offset,2) * node[counter].weight[2];
-
-        stage_sum += classsum >= t ? node[counter].alpha[1]:node[counter].alpha[0];// modify
-
-        counter++;
-    }
-}
-*/
-/*
-__kernel void gpuRunHaarClassifierCascade_ScaleWindow(
-                          constant GpuHidHaarClassifierCascade * _cascade,
-                          global GpuHidHaarStageClassifier * stagecascadeptr,
-                          //global GpuHidHaarClassifier * classifierptr,
-                          global GpuHidHaarTreeNode * nodeptr,
-                          global int * sum,
-                          global float * sqsum,
-                          global int * _candidate,
-                          int pixel_step,
-                          int cols,
-                          int rows,
-                          int start_stage,
-                          int end_stage,
-                          //int counts,
-                          int nodenum,
-                          int ystep,
-                          int detect_width,
-                          //int detect_height,
-                          int loopcount,
-                          int outputstep)
-                          //float scalefactor)
-{
-unsigned int x1 = get_global_id(0);
-unsigned int y1 = get_global_id(1);
-int p_offset;
-int m, n;
-int result;
-int counter;
-float mean, variance_norm_factor;
-for(int i=0;i<loopcount;i++)
-{
-constant GpuHidHaarClassifierCascade * cascade = _cascade + i;
-global int * candidate = _candidate + i*outputstep;
-int window_width = cascade->p1 - cascade->p0;
-int window_height = window_width;
-result = 1;
-counter = 0;
-unsigned int x = mul24(x1,ystep);
-unsigned int y = mul24(y1,ystep);
-if((x < cols - window_width - 1) && (y < rows - window_height -1))
-{
-global GpuHidHaarStageClassifier *stagecascade = stagecascadeptr +cascade->count*i+ start_stage;
-//global GpuHidHaarClassifier      *classifier   = classifierptr;
-global GpuHidHaarTreeNode        *node         = nodeptr + nodenum*i;
-
-p_offset = mad24(y, pixel_step, x);// modify
-
-mean = (*(sum + p_offset + (int)cascade->p0) - *(sum + p_offset + (int)cascade->p1) -
-    *(sum + p_offset + (int)cascade->p2) + *(sum + p_offset + (int)cascade->p3))
-    *cascade->inv_window_area;
-
-variance_norm_factor = *(sqsum + p_offset + cascade->p0) - *(sqsum + cascade->p1 + p_offset) -
-                    *(sqsum + p_offset + cascade->p2) + *(sqsum + cascade->p3 + p_offset);
-variance_norm_factor = variance_norm_factor * cascade->inv_window_area - mean * mean;
-variance_norm_factor = variance_norm_factor >=0.f ? sqrt(variance_norm_factor) : 1;//modify
-
-// if( cascade->is_stump_based )
-//{
-for( m = start_stage; m < end_stage; m++ )
-{
-float stage_sum = 0.f;
-float t,  classsum;
-GpuHidHaarTreeNode t1;
-
-//#pragma unroll
-for( n = 0; n < stagecascade->count; n++ )
-{
-     t1 = *(node + counter);
-     t  = t1.threshold * variance_norm_factor;
-     classsum = calc_sum1(t1, p_offset ,0) * t1.weight[0] + calc_sum1(t1, p_offset ,1) * t1.weight[1];
-
-     if((t1.p0[2]) && (!stagecascade->two_rects))
-         classsum += calc_sum1(t1, p_offset, 2) * t1.weight[2];
-
-     stage_sum += classsum >= t ? t1.alpha[1] : t1.alpha[0];// modify
-     counter++;
-}
-
-if (stage_sum < stagecascade->threshold)
-{
-    result = 0;
-    break;
-}
-
-stagecascade++;
-
-}
-if(result)
-{
-    candidate[4 * (y1 * detect_width + x1)]     = x;
-    candidate[4 * (y1 * detect_width + x1) + 1] = y;
-    candidate[4 * (y1 * detect_width + x1)+2]     = window_width;
-    candidate[4 * (y1 * detect_width + x1) + 3] = window_height;
-}
-//}
-}
-}
-}
-*/
--- a/modules/ocl/src/opencl/haarobjectdetect_scaled2.cl
+++ b/modules/ocl/src/opencl/haarobjectdetect_scaled2.cl
@@ -17,7 +17,7 @@
 // @Authors
 //    Wu Xinglong, wxl370@126.com
 //    Sen Liu, swjtuls1987@126.com
-//
+//    Peng Xiao, pengxiao@outlook.com
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
@@ -49,25 +49,13 @@
 #define CV_HAAR_FEATURE_MAX           3
 typedef int   sumtype;
 typedef float sqsumtype;
-typedef struct  __attribute__((aligned(128)))  GpuHidHaarFeature
-{
-    struct __attribute__((aligned(32)))
-{
-    int p0 __attribute__((aligned(4)));
-    int p1 __attribute__((aligned(4)));
-    int p2 __attribute__((aligned(4)));
-    int p3 __attribute__((aligned(4)));
-    float weight __attribute__((aligned(4)));
-}
-rect[CV_HAAR_FEATURE_MAX] __attribute__((aligned(32)));
-}
-GpuHidHaarFeature;
+
 typedef struct __attribute__((aligned(128))) GpuHidHaarTreeNode
 {
    int p[CV_HAAR_FEATURE_MAX][4] __attribute__((aligned(64)));
    float weight[CV_HAAR_FEATURE_MAX] /*__attribute__((aligned (16)))*/;
    float threshold /*__attribute__((aligned (4)))*/;
-    float alpha[2] __attribute__((aligned(8)));
+    float alpha[3] __attribute__((aligned(16)));
    int left __attribute__((aligned(4)));
    int right __attribute__((aligned(4)));
 }
@@ -174,45 +162,83 @@ __kernel void gpuRunHaarClassifierCascade_scaled2(
                const int p_offset = mad24(y, step, x);
                cascadeinfo.x += p_offset;
                cascadeinfo.z += p_offset;
-                mean = (sum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.x), 0, max_idx)] - sum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.z), 0, max_idx)] -
-                        sum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.x), 0, max_idx)] + sum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.z), 0, max_idx)])
+                mean = (sum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.x), 0, max_idx)]
+                - sum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.z), 0, max_idx)] -
+                        sum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.x), 0, max_idx)]
+                + sum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.z), 0, max_idx)])
                       * correction_t;
-                variance_norm_factor = sqsum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.x), 0, max_idx)] - sqsum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.z), 0, max_idx)] -
-                                       sqsum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.x), 0, max_idx)] + sqsum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.z), 0, max_idx)];
+                variance_norm_factor = sqsum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.x), 0, max_idx)]
+                - sqsum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.z), 0, max_idx)] -
+                                       sqsum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.x), 0, max_idx)]
+                + sqsum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.z), 0, max_idx)];
                variance_norm_factor = variance_norm_factor * correction_t - mean * mean;
                variance_norm_factor = variance_norm_factor >= 0.f ? sqrt(variance_norm_factor) : 1.f;
                bool result = true;
                nodecounter = startnode + nodecount * scalei;
-
                for (int stageloop = start_stage; (stageloop < end_stage) && result; stageloop++)
                {
                    float stage_sum = 0.f;
                    int   stagecount = stagecascadeptr[stageloop].count;
-                    for (int nodeloop = 0; nodeloop < stagecount; nodeloop++)
+                    for (int nodeloop = 0; nodeloop < stagecount;)
                    {
                        __global GpuHidHaarTreeNode *currentnodeptr = (nodeptr + nodecounter);
                        int4 info1 = *(__global int4 *)(&(currentnodeptr->p[0][0]));
                        int4 info2 = *(__global int4 *)(&(currentnodeptr->p[1][0]));
                        int4 info3 = *(__global int4 *)(&(currentnodeptr->p[2][0]));
                        float4 w = *(__global float4 *)(&(currentnodeptr->weight[0]));
-                        float2 alpha2 = *(__global float2 *)(&(currentnodeptr->alpha[0]));
+                        float3 alpha3 = *(__global float3 *)(&(currentnodeptr->alpha[0]));
                        float nodethreshold  = w.w * variance_norm_factor;
+
                        info1.x += p_offset;
                        info1.z += p_offset;
                        info2.x += p_offset;
                        info2.z += p_offset;
-                        float classsum = (sum[clamp(mad24(info1.y, step, info1.x), 0, max_idx)] - sum[clamp(mad24(info1.y, step, info1.z), 0, max_idx)] -
-                                          sum[clamp(mad24(info1.w, step, info1.x), 0, max_idx)] + sum[clamp(mad24(info1.w, step, info1.z), 0, max_idx)]) * w.x;
-                        classsum += (sum[clamp(mad24(info2.y, step, info2.x), 0, max_idx)] - sum[clamp(mad24(info2.y, step, info2.z), 0, max_idx)] -
-                                     sum[clamp(mad24(info2.w, step, info2.x), 0, max_idx)] + sum[clamp(mad24(info2.w, step, info2.z), 0, max_idx)]) * w.y;
                        info3.x += p_offset;
                        info3.z += p_offset;
-                        classsum += (sum[clamp(mad24(info3.y, step, info3.x), 0, max_idx)] - sum[clamp(mad24(info3.y, step, info3.z), 0, max_idx)] -
-                                     sum[clamp(mad24(info3.w, step, info3.x), 0, max_idx)] + sum[clamp(mad24(info3.w, step, info3.z), 0, max_idx)]) * w.z;
-                        stage_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
+                        float classsum = (sum[clamp(mad24(info1.y, step, info1.x), 0, max_idx)]
+                        - sum[clamp(mad24(info1.y, step, info1.z), 0, max_idx)] -
+                                          sum[clamp(mad24(info1.w, step, info1.x), 0, max_idx)]
+                        + sum[clamp(mad24(info1.w, step, info1.z), 0, max_idx)]) * w.x;
+                        classsum += (sum[clamp(mad24(info2.y, step, info2.x), 0, max_idx)]
+                        - sum[clamp(mad24(info2.y, step, info2.z), 0, max_idx)] -
+                                     sum[clamp(mad24(info2.w, step, info2.x), 0, max_idx)]
+                        + sum[clamp(mad24(info2.w, step, info2.z), 0, max_idx)]) * w.y;
+                        classsum += (sum[clamp(mad24(info3.y, step, info3.x), 0, max_idx)]
+                        - sum[clamp(mad24(info3.y, step, info3.z), 0, max_idx)] -
+                                     sum[clamp(mad24(info3.w, step, info3.x), 0, max_idx)]
+                        + sum[clamp(mad24(info3.w, step, info3.z), 0, max_idx)]) * w.z;
+                        
+                        bool passThres = classsum >= nodethreshold;
+
+#if STUMP_BASED
+                        stage_sum += passThres ? alpha3.y : alpha3.x;
                        nodecounter++;
+                        nodeloop++;
+#else
+                        bool isRootNode = (nodecounter & 1) == 0;
+                        if(isRootNode)
+                        {
+                            if( (passThres && currentnodeptr->right) ||
+                                (!passThres && currentnodeptr->left))
+                            {
+                                nodecounter ++;
+                            }
+                            else
+                            {
+                                stage_sum += alpha3.x;
+                                nodecounter += 2;
+                                nodeloop ++;
+                            }
+                        }
+                        else
+                        {
+                            stage_sum += (passThres ? alpha3.z : alpha3.y);
+                            nodecounter ++;
+                            nodeloop ++;
+                        }
+#endif
                    }
-                    result = (bool)(stage_sum >= stagecascadeptr[stageloop].threshold);
+                    result = (int)(stage_sum >= stagecascadeptr[stageloop].threshold);
                }

                barrier(CLK_LOCAL_MEM_FENCE);
@@ -222,7 +248,6 @@ __kernel void gpuRunHaarClassifierCascade_scaled2(
                    int queueindex = atomic_inc(lclcount);
                    lcloutindex[queueindex] = (y << 16) | x;
                }
-
                barrier(CLK_LOCAL_MEM_FENCE);
                int queuecount = lclcount[0];

@@ -277,5 +302,6 @@ __kernel void gpuscaleclassifier(global GpuHidHaarTreeNode *orinode, global GpuH
    newnode[counter].threshold = t1.threshold;
    newnode[counter].alpha[0] = t1.alpha[0];
    newnode[counter].alpha[1] = t1.alpha[1];
+    newnode[counter].alpha[2] = t1.alpha[2];
 }

--- a/modules/ocl/src/opencl/imgproc_canny.cl
+++ b/modules/ocl/src/opencl/imgproc_canny.cl
@@ -297,6 +297,9 @@ calcMap
    map_step   /= sizeof(*map);
    map_offset /= sizeof(*map);

+    mag += mag_offset;
+    map += map_offset;
+
    __local float smem[18][18];

    int gidx = get_global_id(0);
@@ -389,7 +392,7 @@ edgesHysteresisLocal
 (
    __global int * map,
    __global ushort2 * st,
-    volatile __global unsigned int * counter,
+    __global unsigned int * counter,
    int rows,
    int cols,
    int map_step,
@@ -399,6 +402,8 @@ edgesHysteresisLocal
    map_step   /= sizeof(*map);
    map_offset /= sizeof(*map);

+    map += map_offset;
+
    __local int smem[18][18];

    int gidx = get_global_id(0);
@@ -416,12 +421,12 @@ edgesHysteresisLocal
    if(ly < 14)
    {
        smem[ly][lx] =
-            map[grp_idx + lx + min(grp_idy + ly, rows - 1) * map_step + map_offset];
+            map[grp_idx + lx + min(grp_idy + ly, rows - 1) * map_step];
    }
    if(ly < 4 && grp_idy + ly + 14 <= rows && grp_idx + lx <= cols)
    {
        smem[ly + 14][lx] =
-            map[grp_idx + lx + min(grp_idy + ly + 14, rows - 1) * map_step + map_offset];
+            map[grp_idx + lx + min(grp_idy + ly + 14, rows - 1) * map_step];
    }

    barrier(CLK_LOCAL_MEM_FENCE);
@@ -482,14 +487,17 @@ edgesHysteresisLocal
 __constant int c_dx[8] = {-1,  0,  1, -1, 1, -1, 0, 1};
 __constant int c_dy[8] = {-1, -1, -1,  0, 0,  1, 1, 1};

+
 #define stack_size 512
 __kernel
-void edgesHysteresisGlobal
+void
+__attribute__((reqd_work_group_size(128,1,1)))
+edgesHysteresisGlobal
 (
    __global int * map,
    __global ushort2 * st1,
    __global ushort2 * st2,
-    volatile __global int * counter,
+    __global int * counter,
    int rows,
    int cols,
    int count,
@@ -501,6 +509,8 @@ void edgesHysteresisGlobal
    map_step   /= sizeof(*map);
    map_offset /= sizeof(*map);

+    map += map_offset;
+
    int gidx = get_global_id(0);
    int gidy = get_global_id(1);

@@ -510,7 +520,7 @@ void edgesHysteresisGlobal
    int grp_idx = get_group_id(0);
    int grp_idy = get_group_id(1);

-    volatile __local unsigned int s_counter;
+    __local unsigned int s_counter;
    __local unsigned int s_ind;

    __local ushort2 s_st[stack_size];
@@ -564,9 +574,9 @@ void edgesHysteresisGlobal
                    pos.x += c_dx[lidx & 7];
                    pos.y += c_dy[lidx & 7];

-                    if (map[pos.x + map_offset + pos.y * map_step] == 1)
+                    if (map[pos.x + pos.y * map_step] == 1)
                    {
-                        map[pos.x + map_offset + pos.y * map_step] = 2;
+                        map[pos.x + pos.y * map_step] = 2;

                        ind = atomic_inc(&s_counter);

@@ -621,6 +631,6 @@ void getEdges

    if(gidy < rows && gidx < cols)
    {
-        dst[gidx + gidy * dst_step] = (uchar)(-(map[gidx + 1 + (gidy + 1) * map_step] >> 1));
+        dst[gidx + gidy * dst_step] = (uchar)(-(map[gidx + 1 + (gidy + 1) * map_step + map_offset] >> 1));
    }
 }
--- a/modules/ocl/src/opencl/imgproc_gfft.cl
+++ b/modules/ocl/src/opencl/imgproc_gfft.cl
@@ -0,0 +1,276 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Peng Xiao, pengxiao@outlook.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef WITH_MASK
+#define WITH_MASK 0
+#endif
+
+__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
+
+inline float ELEM_INT2(image2d_t _eig, int _x, int _y) 
+{
+    return read_imagef(_eig, sampler, (int2)(_x, _y)).x;
+}
+
+inline float ELEM_FLT2(image2d_t _eig, float2 pt) 
+{
+    return read_imagef(_eig, sampler, pt).x;
+}
+
+__kernel
+    void findCorners
+    (
+        image2d_t eig,
+        __global const char * mask,
+        __global float2 * corners,
+        const int mask_strip,// in pixels
+        const float threshold,
+        const int rows,
+        const int cols,
+        const int max_count,
+        __global int * g_counter
+    )
+{
+    const int j = get_global_id(0);
+    const int i = get_global_id(1);
+
+    if (i > 0 && i < rows - 1 && j > 0 && j < cols - 1
+#if WITH_MASK
+        && mask[i * mask_strip + j] != 0
+#endif
+        )
+    {
+        const float val = ELEM_INT2(eig, j, i);
+
+        if (val > threshold)
+        {
+            float maxVal = val;
+
+            maxVal = fmax(ELEM_INT2(eig, j - 1, i - 1), maxVal);
+            maxVal = fmax(ELEM_INT2(eig, j    , i - 1), maxVal);
+            maxVal = fmax(ELEM_INT2(eig, j + 1, i - 1), maxVal);
+
+            maxVal = fmax(ELEM_INT2(eig, j - 1, i), maxVal);
+            maxVal = fmax(ELEM_INT2(eig, j + 1, i), maxVal);
+
+            maxVal = fmax(ELEM_INT2(eig, j - 1, i + 1), maxVal);
+            maxVal = fmax(ELEM_INT2(eig, j    , i + 1), maxVal);
+            maxVal = fmax(ELEM_INT2(eig, j + 1, i + 1), maxVal);
+
+            if (val == maxVal)
+            {
+                const int ind = atomic_inc(g_counter);
+
+                if (ind < max_count)
+                    corners[ind] = (float2)(j, i);
+            }
+        }
+    }
+}
+
+//bitonic sort
+__kernel
+    void sortCorners_bitonicSort
+    (
+        image2d_t eig,
+        __global float2 * corners,
+        const int count,
+        const int stage,
+        const int passOfStage
+    )
+{
+    const int threadId = get_global_id(0);
+    if(threadId >= count / 2)
+    {
+        return;
+    }
+
+    const int sortOrder = (((threadId/(1 << stage)) % 2)) == 1 ? 1 : 0; // 0 is descent
+
+    const int pairDistance = 1 << (stage - passOfStage);
+    const int blockWidth   = 2 * pairDistance;
+
+    const int leftId = min( (threadId % pairDistance) 
+                   + (threadId / pairDistance) * blockWidth, count );
+
+    const int rightId = min( leftId + pairDistance, count );
+
+    const float2 leftPt  = corners[leftId];
+    const float2 rightPt = corners[rightId];
+
+    const float leftVal  = ELEM_FLT2(eig, leftPt);
+    const float rightVal = ELEM_FLT2(eig, rightPt);
+
+    const bool compareResult = leftVal > rightVal;
+
+    float2 greater = compareResult ? leftPt:rightPt;
+    float2 lesser  = compareResult ? rightPt:leftPt;
+    
+    corners[leftId]  = sortOrder ? lesser : greater;
+    corners[rightId] = sortOrder ? greater : lesser;
+}
+
+//selection sort for gfft
+//kernel is ported from Bolt library:
+//https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/sort_kernels.cl
+//  Local sort will firstly sort elements of each workgroup using selection sort
+//  its performance is O(n)
+__kernel
+    void sortCorners_selectionSortLocal
+    (
+        image2d_t eig,
+        __global float2 * corners,
+        const int count,
+        __local float2 * scratch
+    )
+{
+    int          i  = get_local_id(0); // index in workgroup
+    int numOfGroups = get_num_groups(0); // index in workgroup
+    int groupID     = get_group_id(0);
+    int         wg  = get_local_size(0); // workgroup size = block size
+    int n; // number of elements to be processed for this work group
+
+    int offset   = groupID * wg;
+    int same     = 0;
+    corners      += offset;
+    n = (groupID == (numOfGroups-1))? (count - wg*(numOfGroups-1)) : wg;
+    float2 pt1, pt2;
+
+    pt1 = corners[min(i, n)];
+    scratch[i] = pt1;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if(i >= n)
+    {
+        return;
+    }
+
+    float val1 = ELEM_FLT2(eig, pt1);
+    float val2;
+
+    int pos = 0;
+    for (int j=0;j<n;++j)
+    {
+        pt2  = scratch[j];
+        val2 = ELEM_FLT2(eig, pt2);
+        if(val2 > val1) 
+            pos++;//calculate the rank of this element in this work group
+        else 
+        {
+            if(val1 > val2)
+                continue;
+            else 
+            {
+                // val1 and val2 are same
+                same++;
+            }
+        }
+    }
+    for (int j=0; j< same; j++)      
+        corners[pos + j] = pt1;
+}
+__kernel
+    void sortCorners_selectionSortFinal
+    (
+        image2d_t eig,
+        __global float2 * corners,
+        const int count
+    )
+{
+    const int          i  = get_local_id(0); // index in workgroup
+    const int numOfGroups = get_num_groups(0); // index in workgroup
+    const int groupID     = get_group_id(0);
+    const int         wg  = get_local_size(0); // workgroup size = block size
+    int pos = 0, same = 0;
+    const int offset = get_group_id(0) * wg;
+    const int remainder = count - wg*(numOfGroups-1);
+
+    if((offset + i ) >= count)
+        return;
+    float2 pt1, pt2;
+    pt1 = corners[groupID*wg + i];
+
+    float val1 = ELEM_FLT2(eig, pt1);
+    float val2;
+
+    for(int j=0; j<numOfGroups-1; j++ )
+    {
+        for(int k=0; k<wg; k++)
+        {
+            pt2  = corners[j*wg + k];
+            val2 = ELEM_FLT2(eig, pt2); 
+            if(val1 > val2)
+                break;
+            else
+            {
+                //Increment only if the value is not the same. 
+                if( val2 > val1 )
+                    pos++;
+                else 
+                    same++;
+            }
+        }
+    }
+
+    for(int k=0; k<remainder; k++)
+    {
+        pt2  = corners[(numOfGroups-1)*wg + k];
+        val2 = ELEM_FLT2(eig, pt2); 
+        if(val1 > val2)
+            break;
+        else
+        {
+            //Don't increment if the value is the same. 
+            //Two elements are same if (*userComp)(jData, iData)  and (*userComp)(iData, jData) are both false
+            if(val2 > val1)
+                pos++;
+            else 
+                same++;
+        }
+    }  
+    for (int j=0; j< same; j++)      
+        corners[pos + j] = pt1;
+}
+
--- a/modules/ocl/src/opencl/imgproc_threshold.cl
+++ b/modules/ocl/src/opencl/imgproc_threshold.cl
@@ -143,7 +143,7 @@ __kernel void threshold_C1_D5(__global const float * restrict src, __global floa
        int4 dpos = (int4)(dstart, dstart+1, dstart+2, dstart+3);
        float4 dVal = *(__global float4*)(dst+dst_offset+gy*dst_step+dstart);
        int4 con = dpos >= 0 && dpos < dst_cols;
-        ddata = convert_float4(con) != 0 ? ddata : dVal;
+        ddata = convert_float4(con) != (float4)(0) ? ddata : dVal;
        if(dstart < dst_cols)
        {
            *(__global float4*)(dst+dst_offset+gy*dst_step+dstart) = ddata;
--- a/modules/ocl/src/opencl/pyrlk.cl
+++ b/modules/ocl/src/opencl/pyrlk.cl
@@ -46,145 +46,10 @@

 //#pragma OPENCL EXTENSION cl_amd_printf : enable

-__kernel void calcSharrDeriv_vertical_C1_D0(__global const uchar* src, int srcStep, int rows, int cols, int cn, __global short* dx_buf, int dx_bufStep, __global short* dy_buf, int dy_bufStep)
-{
-    const int x = get_global_id(0);
-    const int y = get_global_id(1);
-
-    if (y < rows && x < cols * cn)
-    {
-        const uchar src_val0 = (src + (y > 0 ? y-1 : rows > 1 ? 1 : 0) * srcStep)[x];
-        const uchar src_val1 = (src + y * srcStep)[x];
-        const uchar src_val2 = (src + (y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0) * srcStep)[x];
-
-        ((__global short*)((__global char*)dx_buf + y * dx_bufStep / 2))[x] = (src_val0 + src_val2) * 3 + src_val1 * 10;
-        ((__global short*)((__global char*)dy_buf + y * dy_bufStep / 2))[x] = src_val2 - src_val0;
-    }
-}
-
-__kernel void calcSharrDeriv_vertical_C4_D0(__global const uchar* src, int srcStep, int rows, int cols, int cn, __global short* dx_buf, int dx_bufStep, __global short* dy_buf, int dy_bufStep)
-{
-    const int x = get_global_id(0);
-    const int y = get_global_id(1);
-
-    if (y < rows && x < cols * cn)
-    {
-        const uchar src_val0 = (src + (y > 0 ? y - 1 : 1) * srcStep)[x];
-        const uchar src_val1 = (src + y * srcStep)[x];
-        const uchar src_val2 = (src + (y < rows - 1 ? y + 1 : rows - 2) * srcStep)[x];
-
-        ((__global short*)((__global char*)dx_buf + y * dx_bufStep / 2))[x] = (src_val0 + src_val2) * 3 + src_val1 * 10;
-        ((__global short*)((__global char*)dy_buf + y * dy_bufStep / 2))[x] = src_val2 - src_val0;
-    }
-}
-
-__kernel void calcSharrDeriv_horizontal_C1_D0(int rows, int cols, int cn, __global const short* dx_buf, int dx_bufStep, __global const short* dy_buf, int dy_bufStep, __global short* dIdx, int dIdxStep, __global short* dIdy, int dIdyStep)
-{
-    const int x = get_global_id(0);
-    const int y = get_global_id(1);
-
-    const int colsn = cols * cn;
-
-    if (y < rows && x < colsn)
-    {
-        __global const short* dx_buf_row = dx_buf + y * dx_bufStep;
-        __global const short* dy_buf_row = dy_buf + y * dy_bufStep;
-
-        const int xr = x + cn < colsn ? x + cn : (cols - 2) * cn + x + cn - colsn;
-        const int xl = x - cn >= 0 ? x - cn : cn + x;
-
-        ((__global short*)((__global char*)dIdx + y * dIdxStep / 2))[x] = dx_buf_row[xr] - dx_buf_row[xl];
-        ((__global short*)((__global char*)dIdy + y * dIdyStep / 2))[x] = (dy_buf_row[xr] + dy_buf_row[xl]) * 3 + dy_buf_row[x] * 10;
-    }
-}
-
-__kernel void calcSharrDeriv_horizontal_C4_D0(int rows, int cols, int cn, __global const short* dx_buf, int dx_bufStep, __global const short* dy_buf, int dy_bufStep, __global short* dIdx, int dIdxStep, __global short* dIdy, int dIdyStep)
-{
-    const int x = get_global_id(0);
-    const int y = get_global_id(1);
-
-    const int colsn = cols * cn;
-
-    if (y < rows && x < colsn)
-    {
-        __global const short* dx_buf_row = dx_buf + y * dx_bufStep;
-        __global const short* dy_buf_row = dy_buf + y * dy_bufStep;
-
-        const int xr = x + cn < colsn ? x + cn : (cols - 2) * cn + x + cn - colsn;
-        const int xl = x - cn >= 0 ? x - cn : cn + x;
-
-        ((__global short*)((__global char*)dIdx + y * dIdxStep / 2))[x] = dx_buf_row[xr] - dx_buf_row[xl];
-        ((__global short*)((__global char*)dIdy + y * dIdyStep / 2))[x] = (dy_buf_row[xr] + dy_buf_row[xl]) * 3 + dy_buf_row[x] * 10;
-    }
-}
-
-#define W_BITS 14
-#define W_BITS1 14
-
-#define  CV_DESCALE(x, n)     (((x) + (1 << ((n)-1))) >> (n))
-
-int linearFilter_uchar(__global const uchar* src, int srcStep, int cn, float2 pt, int x, int y)
-{
-    int2 ipt;
-    ipt.x = convert_int_sat_rtn(pt.x);
-    ipt.y = convert_int_sat_rtn(pt.y);
-
-    float a = pt.x - ipt.x;
-    float b = pt.y - ipt.y;
-
-    int iw00 = convert_int_sat_rte((1.0f - a) * (1.0f - b) * (1 << W_BITS));
-    int iw01 = convert_int_sat_rte(a * (1.0f - b) * (1 << W_BITS));
-    int iw10 = convert_int_sat_rte((1.0f - a) * b * (1 << W_BITS));
-    int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
-
-    __global const uchar* src_row = src + (ipt.y + y) * srcStep + ipt.x * cn;
-    __global const uchar* src_row1 = src + (ipt.y + y + 1) * srcStep + ipt.x * cn;
-
-    return CV_DESCALE(src_row[x] * iw00 + src_row[x + cn] * iw01 + src_row1[x] * iw10 + src_row1[x + cn] * iw11, W_BITS1 - 5);
-}
-
-int linearFilter_short(__global const short* src, int srcStep, int cn, float2 pt, int x, int y)
-{
-    int2 ipt;
-    ipt.x = convert_int_sat_rtn(pt.x);
-    ipt.y = convert_int_sat_rtn(pt.y);
-
-    float a = pt.x - ipt.x;
-    float b = pt.y - ipt.y;
-
-    int iw00 = convert_int_sat_rte((1.0f - a) * (1.0f - b) * (1 << W_BITS));
-    int iw01 = convert_int_sat_rte(a * (1.0f - b) * (1 << W_BITS));
-    int iw10 = convert_int_sat_rte((1.0f - a) * b * (1 << W_BITS));
-    int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
-
-    __global const short* src_row = src + (ipt.y + y) * srcStep + ipt.x * cn;
-    __global const short* src_row1 = src + (ipt.y + y + 1) * srcStep + ipt.x * cn;
-
-    return CV_DESCALE(src_row[x] * iw00 + src_row[x + cn] * iw01 + src_row1[x] * iw10 + src_row1[x + cn] * iw11, W_BITS1);
-}
-
-float linearFilter_float(__global const float* src, int srcStep, int cn, float2 pt, float x, float y)
-{
-    int2 ipt;
-    ipt.x = convert_int_sat_rtn(pt.x);
-    ipt.y = convert_int_sat_rtn(pt.y);
-
-    float a = pt.x - ipt.x;
-    float b = pt.y - ipt.y;
-
-    float iw00 = ((1.0f - a) * (1.0f - b) * (1 << W_BITS));
-    float iw01 = (a * (1.0f - b) * (1 << W_BITS));
-    float iw10 = ((1.0f - a) * b * (1 << W_BITS));
-    float iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
-
-    __global const float* src_row = src + (int)(ipt.y + y) * srcStep / 4 + ipt.x * cn;
-    __global const float* src_row1 = src + (int)(ipt.y + y + 1) * srcStep / 4 + ipt.x * cn;
-
-    return src_row[(int)x] * iw00 + src_row[(int)x + cn] * iw01 + src_row1[(int)x] * iw10 + src_row1[(int)x + cn] * iw11, W_BITS1 - 5;
-}
-
 #define	BUFFER	64
-
+#ifndef WAVE_SIZE
+#define WAVE_SIZE 1
+#endif
 #ifdef CPU
 void reduce3(float val1, float val2, float val3,  __local float* smem1,  __local float* smem2,  __local float* smem3, int tid)
 {
@@ -193,71 +58,51 @@ void reduce3(float val1, float val2, float val3,  __local float* smem1,  __local
    smem3[tid] = val3;
    barrier(CLK_LOCAL_MEM_FENCE);

-#if	BUFFER > 128
-    if (tid < 128)
-    {
-        smem1[tid] = val1 += smem1[tid + 128];
-        smem2[tid] = val2 += smem2[tid + 128];
-        smem3[tid] = val3 += smem3[tid + 128];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-#endif
-
-#if	BUFFER > 64
-    if (tid < 64)
-    {
-        smem1[tid] = val1 += smem1[tid + 64];
-        smem2[tid] = val2 += smem2[tid + 64];
-        smem3[tid] = val3 += smem3[tid + 64];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-#endif
-
    if (tid < 32)
    {
-        smem1[tid] = val1 += smem1[tid + 32];
-        smem2[tid] = val2 += smem2[tid + 32];
-        smem3[tid] = val3 += smem3[tid + 32];
+        smem1[tid] += smem1[tid + 32];
+        smem2[tid] += smem2[tid + 32];
+        smem3[tid] += smem3[tid + 32];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    if (tid < 16)
    {
-        smem1[tid] = val1 += smem1[tid + 16];
-        smem2[tid] = val2 += smem2[tid + 16];
-        smem3[tid] = val3 += smem3[tid + 16];
+        smem1[tid] += smem1[tid + 16];
+        smem2[tid] += smem2[tid + 16];
+        smem3[tid] += smem3[tid + 16];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    if (tid < 8)
    {
-        smem1[tid] = val1 += smem1[tid + 8];
-        smem2[tid] = val2 += smem2[tid + 8];
-        smem3[tid] = val3 += smem3[tid + 8];
+        smem1[tid] += smem1[tid + 8];
+        smem2[tid] += smem2[tid + 8];
+        smem3[tid] += smem3[tid + 8];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    if (tid < 4)
    {
-        smem1[tid] = val1 += smem1[tid + 4];
-        smem2[tid] = val2 += smem2[tid + 4];
-        smem3[tid] = val3 += smem3[tid + 4];
+        smem1[tid] += smem1[tid + 4];
+        smem2[tid] += smem2[tid + 4];
+        smem3[tid] += smem3[tid + 4];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    if (tid < 2)
    {
-        smem1[tid] = val1 += smem1[tid + 2];
-        smem2[tid] = val2 += smem2[tid + 2];
-        smem3[tid] = val3 += smem3[tid + 2];
+        smem1[tid] += smem1[tid + 2];
+        smem2[tid] += smem2[tid + 2];
+        smem3[tid] += smem3[tid + 2];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    if (tid < 1)
    {
-        smem1[BUFFER] = val1 += smem1[tid + 1];
-        smem2[BUFFER] = val2 += smem2[tid + 1];
-        smem3[BUFFER] = val3 += smem3[tid + 1];
+        smem1[BUFFER] = smem1[tid] + smem1[tid + 1];
+        smem2[BUFFER] = smem2[tid] + smem2[tid + 1];
+        smem3[BUFFER] = smem3[tid] + smem3[tid + 1];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
 }
@@ -268,63 +113,45 @@ void reduce2(float val1, float val2, volatile __local float* smem1, volatile __l
    smem2[tid] = val2;
    barrier(CLK_LOCAL_MEM_FENCE);

-#if	BUFFER > 128
-    if (tid < 128)
-    {
-        smem1[tid] = (val1 += smem1[tid + 128]);
-        smem2[tid] = (val2 += smem2[tid + 128]);
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-#endif
-
-#if	BUFFER > 64
-    if (tid < 64)
-    {
-        smem1[tid] = (val1 += smem1[tid + 64]);
-        smem2[tid] = (val2 += smem2[tid + 64]);
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-#endif
-
    if (tid < 32)
    {
-        smem1[tid] = (val1 += smem1[tid + 32]);
-        smem2[tid] = (val2 += smem2[tid + 32]);
+        smem1[tid] += smem1[tid + 32];
+        smem2[tid] += smem2[tid + 32];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    if (tid < 16)
    {
-        smem1[tid] = (val1 += smem1[tid + 16]);
-        smem2[tid] = (val2 += smem2[tid + 16]);
+        smem1[tid] += smem1[tid + 16];
+        smem2[tid] += smem2[tid + 16];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    if (tid < 8)
    {
-        smem1[tid] = (val1 += smem1[tid + 8]);
-        smem2[tid] = (val2 += smem2[tid + 8]);
+        smem1[tid] += smem1[tid + 8];
+        smem2[tid] += smem2[tid + 8];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    if (tid < 4)
    {
-        smem1[tid] = (val1 += smem1[tid + 4]);
-        smem2[tid] = (val2 += smem2[tid + 4]);
+        smem1[tid] += smem1[tid + 4];
+        smem2[tid] += smem2[tid + 4];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    if (tid < 2)
    {
-        smem1[tid] = (val1 += smem1[tid + 2]);
-        smem2[tid] = (val2 += smem2[tid + 2]);
+        smem1[tid] += smem1[tid + 2];
+        smem2[tid] += smem2[tid + 2];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    if (tid < 1)
    {
-        smem1[BUFFER] = (val1 += smem1[tid + 1]);
-        smem2[BUFFER] = (val2 += smem2[tid + 1]);
+        smem1[BUFFER] = smem1[tid] + smem1[tid + 1];
+        smem2[BUFFER] = smem2[tid] + smem2[tid + 1];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
 }
@@ -334,205 +161,146 @@ void reduce1(float val1, volatile __local float* smem1, int tid)
    smem1[tid] = val1;
    barrier(CLK_LOCAL_MEM_FENCE);

-#if	BUFFER > 128
-    if (tid < 128)
-    {
-        smem1[tid] = (val1 += smem1[tid + 128]);
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-#endif
-
-#if	BUFFER > 64
-    if (tid < 64)
-    {
-        smem1[tid] = (val1 += smem1[tid + 64]);
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-#endif
-
    if (tid < 32)
    {
-        smem1[tid] = (val1 += smem1[tid + 32]);
+        smem1[tid] += smem1[tid + 32];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    if (tid < 16)
    {
-        smem1[tid] = (val1 += smem1[tid + 16]);
+        smem1[tid] += smem1[tid + 16];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    if (tid < 8)
    {
-        smem1[tid] = (val1 += smem1[tid + 8]);
+        smem1[tid] += smem1[tid + 8];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    if (tid < 4)
    {
-        smem1[tid] = (val1 += smem1[tid + 4]);
+        smem1[tid] += smem1[tid + 4];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    if (tid < 2)
    {
-        smem1[tid] = (val1 += smem1[tid + 2]);
+        smem1[tid] += smem1[tid + 2];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    if (tid < 1)
    {
-        smem1[BUFFER] = (val1 += smem1[tid + 1]);
+        smem1[BUFFER] = smem1[tid] + smem1[tid + 1];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
 }
 #else
-void reduce3(float val1, float val2, float val3, __local float* smem1, __local float* smem2, __local float* smem3, int tid)
+void reduce3(float val1, float val2, float val3, 
+__local volatile float* smem1, __local volatile float* smem2, __local volatile float* smem3, int tid)
 {
    smem1[tid] = val1;
    smem2[tid] = val2;
    smem3[tid] = val3;
    barrier(CLK_LOCAL_MEM_FENCE);

-#if	BUFFER > 128
-    if (tid < 128)
-    {
-        smem1[tid] = val1 += smem1[tid + 128];
-        smem2[tid] = val2 += smem2[tid + 128];
-        smem3[tid] = val3 += smem3[tid + 128];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-#endif
-
-#if	BUFFER > 64
-    if (tid < 64)
-    {
-        smem1[tid] = val1 += smem1[tid + 64];
-        smem2[tid] = val2 += smem2[tid + 64];
-        smem3[tid] = val3 += smem3[tid + 64];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-#endif
-
    if (tid < 32)
    {
-        volatile __local float* vmem1 = smem1;
-        volatile __local float* vmem2 = smem2;
-        volatile __local float* vmem3 = smem3;
+        smem1[tid] += smem1[tid + 32];
+        smem2[tid] += smem2[tid + 32];
+        smem3[tid] += smem3[tid + 32];
+#if WAVE_SIZE < 32
+	} barrier(CLK_LOCAL_MEM_FENCE);
+	if (tid < 16) {
+#endif
+        smem1[tid] += smem1[tid + 16];
+        smem2[tid] += smem2[tid + 16];
+        smem3[tid] += smem3[tid + 16];
+#if WAVE_SIZE <16
+	} barrier(CLK_LOCAL_MEM_FENCE);
+	if (tid < 8) {
+#endif
+        smem1[tid] += smem1[tid + 8];
+        smem2[tid] += smem2[tid + 8];
+        smem3[tid] += smem3[tid + 8];

-        vmem1[tid] = val1 += vmem1[tid + 32];
-        vmem2[tid] = val2 += vmem2[tid + 32];
-        vmem3[tid] = val3 += vmem3[tid + 32];
+        smem1[tid] += smem1[tid + 4];
+        smem2[tid] += smem2[tid + 4];
+        smem3[tid] += smem3[tid + 4];

-        vmem1[tid] = val1 += vmem1[tid + 16];
-        vmem2[tid] = val2 += vmem2[tid + 16];
-        vmem3[tid] = val3 += vmem3[tid + 16];
+        smem1[tid] += smem1[tid + 2];
+        smem2[tid] += smem2[tid + 2];
+        smem3[tid] += smem3[tid + 2];

-        vmem1[tid] = val1 += vmem1[tid + 8];
-        vmem2[tid] = val2 += vmem2[tid + 8];
-        vmem3[tid] = val3 += vmem3[tid + 8];
-
-        vmem1[tid] = val1 += vmem1[tid + 4];
-        vmem2[tid] = val2 += vmem2[tid + 4];
-        vmem3[tid] = val3 += vmem3[tid + 4];
-
-        vmem1[tid] = val1 += vmem1[tid + 2];
-        vmem2[tid] = val2 += vmem2[tid + 2];
-        vmem3[tid] = val3 += vmem3[tid + 2];
-
-        vmem1[tid] = val1 += vmem1[tid + 1];
-        vmem2[tid] = val2 += vmem2[tid + 1];
-        vmem3[tid] = val3 += vmem3[tid + 1];
+        smem1[tid] += smem1[tid + 1];
+        smem2[tid] += smem2[tid + 1];
+        smem3[tid] += smem3[tid + 1];
    }
 }

-void reduce2(float val1, float val2, __local float* smem1, __local float* smem2, int tid)
+void reduce2(float val1, float val2, __local volatile float* smem1, __local volatile float* smem2, int tid)
 {
    smem1[tid] = val1;
    smem2[tid] = val2;
    barrier(CLK_LOCAL_MEM_FENCE);

-#if	BUFFER > 128
-    if (tid < 128)
-    {
-        smem1[tid] = val1 += smem1[tid + 128];
-        smem2[tid] = val2 += smem2[tid + 128];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-#endif
-
-#if	BUFFER > 64
-    if (tid < 64)
-    {
-        smem1[tid] = val1 += smem1[tid + 64];
-        smem2[tid] = val2 += smem2[tid + 64];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-#endif
-
    if (tid < 32)
    {
-        volatile __local float* vmem1 = smem1;
-        volatile __local float* vmem2 = smem2;
+        smem1[tid] += smem1[tid + 32];
+        smem2[tid] += smem2[tid + 32];
+#if WAVE_SIZE < 32
+	} barrier(CLK_LOCAL_MEM_FENCE);
+	if (tid < 16) {
+#endif
+        smem1[tid] += smem1[tid + 16];
+        smem2[tid] += smem2[tid + 16];
+#if WAVE_SIZE <16
+	} barrier(CLK_LOCAL_MEM_FENCE);
+	if (tid < 8) {
+#endif
+        smem1[tid] += smem1[tid + 8];
+        smem2[tid] += smem2[tid + 8];

-        vmem1[tid] = val1 += vmem1[tid + 32];
-        vmem2[tid] = val2 += vmem2[tid + 32];
+        smem1[tid] += smem1[tid + 4];
+        smem2[tid] += smem2[tid + 4];

-        vmem1[tid] = val1 += vmem1[tid + 16];
-        vmem2[tid] = val2 += vmem2[tid + 16];
+        smem1[tid] += smem1[tid + 2];
+        smem2[tid] += smem2[tid + 2];

-        vmem1[tid] = val1 += vmem1[tid + 8];
-        vmem2[tid] = val2 += vmem2[tid + 8];
-
-        vmem1[tid] = val1 += vmem1[tid + 4];
-        vmem2[tid] = val2 += vmem2[tid + 4];
-
-        vmem1[tid] = val1 += vmem1[tid + 2];
-        vmem2[tid] = val2 += vmem2[tid + 2];
-
-        vmem1[tid] = val1 += vmem1[tid + 1];
-        vmem2[tid] = val2 += vmem2[tid + 1];
+        smem1[tid] += smem1[tid + 1];
+        smem2[tid] += smem2[tid + 1];
    }
 }

-void reduce1(float val1, __local float* smem1, int tid)
+void reduce1(float val1, __local volatile float* smem1, int tid)
 {
    smem1[tid] = val1;
    barrier(CLK_LOCAL_MEM_FENCE);

-#if	BUFFER > 128
-    if (tid < 128)
-    {
-        smem1[tid] = val1 += smem1[tid + 128];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-#endif
-
-#if	BUFFER > 64
-    if (tid < 64)
-    {
-        smem1[tid] = val1 += smem1[tid + 64];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-#endif
-
    if (tid < 32)
    {
-        volatile __local float* vmem1 = smem1;
-
-        vmem1[tid] = val1 += vmem1[tid + 32];
-        vmem1[tid] = val1 += vmem1[tid + 16];
-        vmem1[tid] = val1 += vmem1[tid + 8];
-        vmem1[tid] = val1 += vmem1[tid + 4];
-        vmem1[tid] = val1 += vmem1[tid + 2];
-        vmem1[tid] = val1 += vmem1[tid + 1];
+        smem1[tid] += smem1[tid + 32];
+#if WAVE_SIZE < 32
+	} barrier(CLK_LOCAL_MEM_FENCE);
+	if (tid < 16) {
+#endif
+        smem1[tid] += smem1[tid + 16];
+#if WAVE_SIZE <16
+	} barrier(CLK_LOCAL_MEM_FENCE);
+	if (tid < 8) {
+#endif
+        smem1[tid] += smem1[tid + 8];
+        smem1[tid] += smem1[tid + 4];
+        smem1[tid] += smem1[tid + 2];
+        smem1[tid] += smem1[tid + 1];
    }
 }
 #endif

 #define SCALE (1.0f / (1 << 20))
 #define	THRESHOLD	0.01f
-#define	DIMENSION	21

 // Image read mode
 __constant sampler_t sampler    = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;
--- a/modules/ocl/src/precomp.hpp
+++ b/modules/ocl/src/precomp.hpp
@@ -61,6 +61,8 @@
 #include <exception>
 #include <stdio.h>

+#undef OPENCV_NOSTL
+
 #include "opencv2/imgproc.hpp"
 #include "opencv2/objdetect.hpp"
 #include "opencv2/ocl.hpp"
@@ -74,6 +76,7 @@

 #if defined (HAVE_OPENCL)

+#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
 #include "opencv2/ocl/private/util.hpp"
 #include "safe_call.hpp"

--- a/modules/ocl/src/pyrlk.cpp
+++ b/modules/ocl/src/pyrlk.cpp
@@ -15,8 +15,8 @@
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//		Dachuan Zhao, dachuan@multicorewareinc.com
-//		Yao Wang, yao@multicorewareinc.com
+//      Dachuan Zhao, dachuan@multicorewareinc.com
+//      Yao Wang, yao@multicorewareinc.com
 //      Nathan, liujun@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -54,35 +54,20 @@ namespace cv
 {
 namespace ocl
 {
-///////////////////////////OpenCL kernel strings///////////////////////////
 extern const char *pyrlk;
 extern const char *pyrlk_no_image;
 extern const char *operator_setTo;
 extern const char *operator_convertTo;
 extern const char *operator_copyToM;
-extern const char *arithm_mul;
 extern const char *pyr_down;
 }
 }
-
 struct dim3
 {
    unsigned int x, y, z;
 };

-struct float2
-{
-    float x, y;
-};
-
-struct int2
-{
-    int x, y;
-};
-
-namespace
-{
-void calcPatchSize(cv::Size winSize, int cn, dim3 &block, dim3 &patch, bool isDeviceArch11)
+static void calcPatchSize(cv::Size winSize, int cn, dim3 &block, dim3 &patch, bool isDeviceArch11)
 {
    winSize.width *= cn;

@@ -102,12 +87,6 @@ void calcPatchSize(cv::Size winSize, int cn, dim3 &block, dim3 &patch, bool isDe

    block.z = patch.z = 1;
 }
-}
-
-inline int divUp(int total, int grain)
-{
-    return (total + grain - 1) / grain;
-}

 ///////////////////////////////////////////////////////////////////////////
 //////////////////////////////// ConvertTo ////////////////////////////////
@@ -448,89 +427,6 @@ static void copyTo(const oclMat &src, oclMat &m )
                       src.data, src.step, src.cols * src.elemSize(), src.rows, src.offset);
 }

-// static void copyTo(const oclMat &src, oclMat &mat, const oclMat &mask)
-// {
-//     if (mask.empty())
-//     {
-//         copyTo(src, mat);
-//     }
-//     else
-//     {
-//         mat.create(src.size(), src.type());
-//         copy_to_with_mask_cus(src, mat, mask, "copy_to_with_mask");
-//     }
-// }
-
-static void arithmetic_run(const oclMat &src1, oclMat &dst, String kernelName, const char **kernelString, void *_scalar)
-{
-    if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
-    {
-        CV_Error(Error::GpuNotSupported, "Selected device don't support double\r\n");
-        return;
-    }
-
-    //dst.create(src1.size(), src1.type());
-    //CV_Assert(src1.cols == src2.cols && src2.cols == dst.cols &&
-    //          src1.rows == src2.rows && src2.rows == dst.rows);
-    CV_Assert(src1.cols == dst.cols &&
-              src1.rows == dst.rows);
-
-    CV_Assert(src1.type() == dst.type());
-    CV_Assert(src1.depth() != CV_8S);
-
-    Context  *clCxt = src1.clCxt;
-    //int channels = dst.channels();
-    //int depth = dst.depth();
-
-    //int vector_lengths[4][7] = {{4, 0, 4, 4, 1, 1, 1},
-    //    {4, 0, 4, 4, 1, 1, 1},
-    //    {4, 0, 4, 4, 1, 1, 1},
-    //    {4, 0, 4, 4, 1, 1, 1}
-    //};
-
-    //size_t vector_length = vector_lengths[channels-1][depth];
-    //int offset_cols = (dst.offset / dst.elemSize1()) & (vector_length - 1);
-    //int cols = divUp(dst.cols * channels + offset_cols, vector_length);
-
-    size_t localThreads[3]  = { 16, 16, 1 };
-    //size_t globalThreads[3] = { divUp(cols, localThreads[0]) * localThreads[0],
-    //                               divUp(dst.rows, localThreads[1]) * localThreads[1],
-    //                               1
-    //                             };
-    size_t globalThreads[3] = { src1.cols,
-                                src1.rows,
-                                1
-                              };
-
-    int dst_step1 = dst.cols * dst.elemSize();
-    std::vector<std::pair<size_t , const void *> > args;
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src1.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.offset ));
-    //args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src2.data ));
-    //args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2.step ));
-    //args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2.offset ));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.offset ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.rows ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.cols ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step1 ));
-
-    //if(_scalar != NULL)
-    //{
-    float scalar1 = *((float *)_scalar);
-    args.push_back( std::make_pair( sizeof(float), (float *)&scalar1 ));
-    //}
-
-    openCLExecuteKernel2(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, src1.depth(), CLFLUSH);
-}
-
-static void multiply_cus(const oclMat &src1, oclMat &dst, float scalar)
-{
-    arithmetic_run(src1, dst, "arithm_muls", &arithm_mul, (void *)(&scalar));
-}
-
 static void pyrdown_run_cus(const oclMat &src, const oclMat &dst)
 {

@@ -576,15 +472,7 @@ static void lkSparse_run(oclMat &I, oclMat &J,
    size_t localThreads[3]  = { 8, isImageSupported ? 8 : 32, 1 };
    size_t globalThreads[3] = { 8 * ptcount, isImageSupported ? 8 : 32, 1};
    int cn = I.oclchannels();
-    char calcErr;
-    if (level == 0)
-    {
-        calcErr = 1;
-    }
-    else
-    {
-        calcErr = 0;
-    }
+    char calcErr = level==0?1:0;

    std::vector<std::pair<size_t , const void *> > args;

@@ -614,7 +502,16 @@ static void lkSparse_run(oclMat &I, oclMat &J,

    if(isImageSupported)
    {
-        openCLExecuteKernel2(clCxt, &pyrlk, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth(), CLFLUSH);
+        std::stringstream idxStr;
+        idxStr << kernelName.c_str() << "_C" << I.oclchannels() << "_D" << I.depth();
+        cl_kernel kernel = openCLGetKernelFromSource(clCxt, &pyrlk, idxStr.str().c_str());
+        int wave_size = queryDeviceInfo<WAVEFRONT_SIZE, int>(kernel);
+        openCLSafeCall(clReleaseKernel(kernel));
+
+        static char opt[16] = {0};
+        sprintf(opt, " -D WAVE_SIZE=%d", wave_size);
+
+        openCLExecuteKernel2(clCxt, &pyrlk, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth(), opt, CLFLUSH);
        releaseTexture(ITex);
        releaseTexture(JTex);
    }
@@ -656,9 +553,7 @@ void cv::ocl::PyrLKOpticalFlow::sparse(const oclMat &prevImg, const oclMat &next

    oclMat temp1 = (useInitialFlow ? nextPts : prevPts).reshape(1);
    oclMat temp2 = nextPts.reshape(1);
-    //oclMat scalar(temp1.rows, temp1.cols, temp1.type(), Scalar(1.0f / (1 << maxLevel) / 2.0f));
-    multiply_cus(temp1, temp2, 1.0f / (1 << maxLevel) / 2.0f);
-    //::multiply(temp1, 1.0f / (1 << maxLevel) / 2.0f, temp2);
+    multiply(1.0f/(1<<maxLevel)/2.0f, temp1, temp2);

    ensureSizeIsEnough(1, prevPts.cols, CV_8UC1, status);
    //status.setTo(Scalar::all(1));
@@ -675,7 +570,6 @@ void cv::ocl::PyrLKOpticalFlow::sparse(const oclMat &prevImg, const oclMat &next
    //ensureSizeIsEnough(1, prevPts.cols, CV_32FC1, err);

    // build the image pyramids.
-
    prevPyr_.resize(maxLevel + 1);
    nextPyr_.resize(maxLevel + 1);

@@ -703,7 +597,6 @@ void cv::ocl::PyrLKOpticalFlow::sparse(const oclMat &prevImg, const oclMat &next
    }

    // dI/dx ~ Ix, dI/dy ~ Iy
-
    for (int level = maxLevel; level >= 0; level--)
    {
        lkSparse_run(prevPyr_[level], nextPyr_[level],
--- a/modules/ocl/src/safe_call.hpp
+++ b/modules/ocl/src/safe_call.hpp
@@ -47,7 +47,7 @@
 #define __OPENCV_OPENCL_SAFE_CALL_HPP__

 #if defined __APPLE__
-#include <OpenCL/OpenCL.h>
+#include <OpenCL/opencl.h>
 #else
 #include <CL/cl.h>
 #endif
--- a/modules/ocl/test/test_canny.cpp
+++ b/modules/ocl/test/test_canny.cpp
@@ -73,7 +73,6 @@ TEST_P(Canny, Accuracy)
    double low_thresh = 50.0;
    double high_thresh = 100.0;

-    cv::resize(img, img, cv::Size(512, 384));
    cv::ocl::oclMat ocl_img = cv::ocl::oclMat(img);

    cv::ocl::oclMat edges;
--- a/modules/ocl/test/test_haar.cpp
+++ b/modules/ocl/test/test_haar.cpp
@@ -55,6 +55,12 @@ using namespace testing;
 using namespace std;
 using namespace cv;
 extern string workdir;
+
+namespace
+{
+IMPLEMENT_PARAM_CLASS(CascadeName, std::string);
+CascadeName cascade_frontalface_alt(std::string("haarcascade_frontalface_alt.xml"));
+CascadeName cascade_frontalface_alt2(std::string("haarcascade_frontalface_alt2.xml"));
 struct getRect
 {
    Rect operator ()(const CvAvgComp &e) const
@@ -62,23 +68,24 @@ struct getRect
        return e.rect;
    }
 };
+}

-PARAM_TEST_CASE(Haar, double, int)
+PARAM_TEST_CASE(Haar, double, int, CascadeName)
 {
    cv::ocl::OclCascadeClassifier cascade, nestedCascade;
-    cv::ocl::OclCascadeClassifierBuf cascadebuf;
    cv::CascadeClassifier cpucascade, cpunestedCascade;

    double scale;
    int flags;
+    std::string cascadeName;

    virtual void SetUp()
    {
        scale = GET_PARAM(0);
        flags = GET_PARAM(1);
-        string cascadeName = workdir + "../../data/haarcascades/haarcascade_frontalface_alt.xml";
+        cascadeName = (workdir + "../../data/haarcascades/").append(GET_PARAM(2));

-        if( (!cascade.load( cascadeName )) || (!cpucascade.load(cascadeName)) || (!cascadebuf.load( cascadeName )))
+        if( (!cascade.load( cascadeName )) || (!cpucascade.load(cascadeName)) )
        {
            cout << "ERROR: Could not load classifier cascade" << endl;
            return;
@@ -115,7 +122,7 @@ TEST_P(Haar, FaceDetect)
    Seq<CvAvgComp>(_objects).copyTo(vecAvgComp);
    oclfaces.resize(vecAvgComp.size());
    std::transform(vecAvgComp.begin(), vecAvgComp.end(), oclfaces.begin(), getRect());
-
+    
    cpucascade.detectMultiScale( smallImg, faces,  1.1, 3,
                                 flags,
                                 Size(30, 30), Size(0, 0) );
@@ -136,7 +143,6 @@ TEST_P(Haar, FaceDetectUseBuf)
    vector<Rect> faces, oclfaces;

    Mat gray, smallImg(cvRound (img.rows / scale), cvRound(img.cols / scale), CV_8UC1 );
-    MemStorage storage(cvCreateMemStorage(0));
    cvtColor( img, gray, CV_BGR2GRAY );
    resize( gray, smallImg, smallImg.size(), 0, 0, INTER_LINEAR );
    equalizeHist( smallImg, smallImg );
@@ -144,19 +150,31 @@ TEST_P(Haar, FaceDetectUseBuf)
    cv::ocl::oclMat image;
    image.upload(smallImg);

+    cv::ocl::OclCascadeClassifierBuf cascadebuf;
+    if( !cascadebuf.load( cascadeName ) )
+    {
+        cout << "ERROR: Could not load classifier cascade for FaceDetectUseBuf!" << endl;
+        return;
+    }
    cascadebuf.detectMultiScale( image, oclfaces,  1.1, 3,
                                 flags,
                                 Size(30, 30), Size(0, 0) );
-    cascadebuf.release();

    cpucascade.detectMultiScale( smallImg, faces,  1.1, 3,
                                 flags,
                                 Size(30, 30), Size(0, 0) );
    EXPECT_EQ(faces.size(), oclfaces.size());
+
+    // intentionally run ocl facedetect again and check if it still works after the first run
+    cascadebuf.detectMultiScale( image, oclfaces,  1.1, 3,
+        flags,
+        Size(30, 30));
+    cascadebuf.release();
+    EXPECT_EQ(faces.size(), oclfaces.size());
 }

 INSTANTIATE_TEST_CASE_P(FaceDetect, Haar,
    Combine(Values(1.0),
-            Values(CV_HAAR_SCALE_IMAGE, 0)));
+            Values(CV_HAAR_SCALE_IMAGE, 0), Values(cascade_frontalface_alt, cascade_frontalface_alt2)));

 #endif // HAVE_OPENCL
--- a/modules/ocl/test/test_optflow.cpp
+++ b/modules/ocl/test/test_optflow.cpp
@@ -55,6 +55,83 @@ using namespace testing;
 using namespace std;

 extern string workdir;
+
+
+//////////////////////////////////////////////////////
+// GoodFeaturesToTrack
+namespace
+{
+    IMPLEMENT_PARAM_CLASS(MinDistance, double)
+}
+PARAM_TEST_CASE(GoodFeaturesToTrack, MinDistance)
+{
+    double minDistance;
+
+    virtual void SetUp()
+    {
+        minDistance = GET_PARAM(0);
+    }
+};
+
+TEST_P(GoodFeaturesToTrack, Accuracy)
+{
+    cv::Mat frame = readImage(workdir + "../gpu/rubberwhale1.png", cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame.empty());
+
+    int maxCorners = 1000;
+    double qualityLevel = 0.01;
+
+    cv::ocl::GoodFeaturesToTrackDetector_OCL detector(maxCorners, qualityLevel, minDistance);
+
+    cv::ocl::oclMat d_pts;
+    detector(oclMat(frame), d_pts);
+
+    ASSERT_FALSE(d_pts.empty());
+
+    std::vector<cv::Point2f> pts(d_pts.cols);
+    
+    detector.downloadPoints(d_pts, pts);
+
+    std::vector<cv::Point2f> pts_gold;
+    cv::goodFeaturesToTrack(frame, pts_gold, maxCorners, qualityLevel, minDistance);
+
+    ASSERT_EQ(pts_gold.size(), pts.size());
+
+    size_t mistmatch = 0;
+    for (size_t i = 0; i < pts.size(); ++i)
+    {
+        cv::Point2i a = pts_gold[i];
+        cv::Point2i b = pts[i];
+
+        bool eq = std::abs(a.x - b.x) < 1 && std::abs(a.y - b.y) < 1;
+
+        if (!eq)
+            ++mistmatch;
+    }
+
+    double bad_ratio = static_cast<double>(mistmatch) / pts.size();
+
+    ASSERT_LE(bad_ratio, 0.01);
+}
+
+TEST_P(GoodFeaturesToTrack, EmptyCorners)
+{
+    int maxCorners = 1000;
+    double qualityLevel = 0.01;
+
+    cv::ocl::GoodFeaturesToTrackDetector_OCL detector(maxCorners, qualityLevel, minDistance);
+
+    cv::ocl::oclMat src(100, 100, CV_8UC1, cv::Scalar::all(0));
+    cv::ocl::oclMat corners(1, maxCorners, CV_32FC2);
+
+    detector(src, corners);
+
+    ASSERT_TRUE(corners.empty());
+}
+
+INSTANTIATE_TEST_CASE_P(OCL_Video, GoodFeaturesToTrack, 
+    testing::Values(MinDistance(0.0), MinDistance(3.0)));
+
 //////////////////////////////////////////////////////////////////////////
 PARAM_TEST_CASE(TVL1, bool)
 {
--- a/modules/photo/src/denoising.cpp
+++ b/modules/photo/src/denoising.cpp
@@ -59,17 +59,17 @@ void cv::fastNlMeansDenoising( InputArray _src, OutputArray _dst, float h,

    switch (src.type()) {
        case CV_8U:
-            parallel_for(cv::BlockedRange(0, src.rows),
+            parallel_for_(cv::Range(0, src.rows),
                FastNlMeansDenoisingInvoker<uchar>(
                    src, dst, templateWindowSize, searchWindowSize, h));
            break;
        case CV_8UC2:
-            parallel_for(cv::BlockedRange(0, src.rows),
+            parallel_for_(cv::Range(0, src.rows),
                FastNlMeansDenoisingInvoker<cv::Vec2b>(
                    src, dst, templateWindowSize, searchWindowSize, h));
            break;
        case CV_8UC3:
-            parallel_for(cv::BlockedRange(0, src.rows),
+            parallel_for_(cv::Range(0, src.rows),
                FastNlMeansDenoisingInvoker<cv::Vec3b>(
                    src, dst, templateWindowSize, searchWindowSize, h));
            break;
@@ -159,19 +159,19 @@ void cv::fastNlMeansDenoisingMulti( InputArrayOfArrays _srcImgs, OutputArray _ds

    switch (srcImgs[0].type()) {
        case CV_8U:
-            parallel_for(cv::BlockedRange(0, srcImgs[0].rows),
+            parallel_for_(cv::Range(0, srcImgs[0].rows),
                FastNlMeansMultiDenoisingInvoker<uchar>(
                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
                    dst, templateWindowSize, searchWindowSize, h));
            break;
        case CV_8UC2:
-            parallel_for(cv::BlockedRange(0, srcImgs[0].rows),
+            parallel_for_(cv::Range(0, srcImgs[0].rows),
                FastNlMeansMultiDenoisingInvoker<cv::Vec2b>(
                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
                    dst, templateWindowSize, searchWindowSize, h));
            break;
        case CV_8UC3:
-            parallel_for(cv::BlockedRange(0, srcImgs[0].rows),
+            parallel_for_(cv::Range(0, srcImgs[0].rows),
                FastNlMeansMultiDenoisingInvoker<cv::Vec3b>(
                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
                    dst, templateWindowSize, searchWindowSize, h));
--- a/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
@@ -51,12 +51,12 @@
 using namespace cv;

 template <typename T>
-struct FastNlMeansDenoisingInvoker {
+struct FastNlMeansDenoisingInvoker : ParallelLoopBody {
    public:
        FastNlMeansDenoisingInvoker(const Mat& src, Mat& dst,
            int template_window_size, int search_window_size, const float h);

-        void operator() (const BlockedRange& range) const;
+        void operator() (const Range& range) const;

    private:
        void operator= (const FastNlMeansDenoisingInvoker&);
@@ -152,9 +152,9 @@ FastNlMeansDenoisingInvoker<T>::FastNlMeansDenoisingInvoker(
 }

 template <class T>
-void FastNlMeansDenoisingInvoker<T>::operator() (const BlockedRange& range) const {
-    int row_from = range.begin();
-    int row_to = range.end() - 1;
+void FastNlMeansDenoisingInvoker<T>::operator() (const Range& range) const {
+    int row_from = range.start;
+    int row_to = range.end - 1;

    Array2d<int> dist_sums(search_window_size_, search_window_size_);

--- a/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
+++ b/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
@@ -51,13 +51,13 @@
 using namespace cv;

 template <typename T>
-struct FastNlMeansMultiDenoisingInvoker {
+struct FastNlMeansMultiDenoisingInvoker : ParallelLoopBody {
    public:
        FastNlMeansMultiDenoisingInvoker(
            const std::vector<Mat>& srcImgs, int imgToDenoiseIndex, int temporalWindowSize,
            Mat& dst, int template_window_size, int search_window_size, const float h);

-        void operator() (const BlockedRange& range) const;
+        void operator() (const Range& range) const;

    private:
        void operator= (const FastNlMeansMultiDenoisingInvoker&);
@@ -171,9 +171,9 @@ FastNlMeansMultiDenoisingInvoker<T>::FastNlMeansMultiDenoisingInvoker(
 }

 template <class T>
-void FastNlMeansMultiDenoisingInvoker<T>::operator() (const BlockedRange& range) const {
-    int row_from = range.begin();
-    int row_to = range.end() - 1;
+void FastNlMeansMultiDenoisingInvoker<T>::operator() (const Range& range) const {
+    int row_from = range.start;
+    int row_to = range.end - 1;

    Array3d<int> dist_sums(temporal_window_size_, search_window_size_, search_window_size_);

--- a/modules/photo/src/precomp.hpp
+++ b/modules/photo/src/precomp.hpp
@@ -43,8 +43,9 @@
 #ifndef __OPENCV_PRECOMP_H__
 #define __OPENCV_PRECOMP_H__

-#include "opencv2/photo.hpp"
 #include "opencv2/core/private.hpp"
+#include "opencv2/core/utility.hpp"
+#include "opencv2/photo.hpp"

 #ifdef HAVE_TEGRA_OPTIMIZATION
 #include "opencv2/photo/photo_tegra.hpp"
--- a/modules/stitching/src/matchers.cpp
+++ b/modules/stitching/src/matchers.cpp
@@ -65,7 +65,7 @@ struct DistIdxPair
 };


-struct MatchPairsBody
+struct MatchPairsBody : ParallelLoopBody
 {
    MatchPairsBody(const MatchPairsBody& other)
            : matcher(other.matcher), features(other.features),
@@ -76,10 +76,10 @@ struct MatchPairsBody
            : matcher(_matcher), features(_features),
              pairwise_matches(_pairwise_matches), near_pairs(_near_pairs) {}

-    void operator ()(const BlockedRange &r) const
+    void operator ()(const Range &r) const
    {
        const int num_images = static_cast<int>(features.size());
-        for (int i = r.begin(); i < r.end(); ++i)
+        for (int i = r.start; i < r.end; ++i)
        {
            int from = near_pairs[i].first;
            int to = near_pairs[i].second;
@@ -525,9 +525,9 @@ void FeaturesMatcher::operator ()(const std::vector<ImageFeatures> &features, st
    MatchPairsBody body(*this, features, pairwise_matches, near_pairs);

    if (is_thread_safe_)
-        parallel_for(BlockedRange(0, static_cast<int>(near_pairs.size())), body);
+        parallel_for_(Range(0, static_cast<int>(near_pairs.size())), body);
    else
-        body(BlockedRange(0, static_cast<int>(near_pairs.size())));
+        body(Range(0, static_cast<int>(near_pairs.size())));
    LOGLN_CHAT("");
 }

--- a/modules/stitching/src/precomp.hpp
+++ b/modules/stitching/src/precomp.hpp
@@ -53,6 +53,7 @@
 #include <sstream>
 #include <cmath>
 #include "opencv2/core.hpp"
+#include "opencv2/core/utility.hpp"
 #include "opencv2/stitching.hpp"
 #include "opencv2/stitching/detail/autocalib.hpp"
 #include "opencv2/stitching/detail/blenders.hpp"
--- a/modules/video/src/bgfg_gaussmix2.cpp
+++ b/modules/video/src/bgfg_gaussmix2.cpp
@@ -702,14 +702,14 @@ void BackgroundSubtractorMOG2Impl::apply(InputArray _image, OutputArray _fgmask,

    parallel_for_(Range(0, image.rows),
                  MOG2Invoker(image, fgmask,
-                             (GMM*)bgmodel.data,
-                             (float*)(bgmodel.data + sizeof(GMM)*nmixtures*image.rows*image.cols),
-                             bgmodelUsedModes.data, nmixtures, (float)learningRate,
-                             (float)varThreshold,
-                             backgroundRatio, varThresholdGen,
-                             fVarInit, fVarMin, fVarMax, float(-learningRate*fCT), fTau,
-                             bShadowDetection, nShadowDetection),
-                  image.total()/(double)(1 << 16));
+                              (GMM*)bgmodel.data,
+                              (float*)(bgmodel.data + sizeof(GMM)*nmixtures*image.rows*image.cols),
+                              bgmodelUsedModes.data, nmixtures, (float)learningRate,
+                              (float)varThreshold,
+                              backgroundRatio, varThresholdGen,
+                              fVarInit, fVarMin, fVarMax, float(-learningRate*fCT), fTau,
+                              bShadowDetection, nShadowDetection),
+                              image.total()/(double)(1 << 16));
 }

 void BackgroundSubtractorMOG2Impl::getBackgroundImage(OutputArray backgroundImage) const
--- a/modules/video/src/lkpyramid.cpp
+++ b/modules/video/src/lkpyramid.cpp
@@ -167,7 +167,7 @@ typedef float acctype;
 typedef float itemtype;
 #endif

-void cv::detail::LKTrackerInvoker::operator()(const BlockedRange& range) const
+void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
 {
    Point2f halfWin((winSize.width-1)*0.5f, (winSize.height-1)*0.5f);
    const Mat& I = *prevImg;
@@ -181,7 +181,7 @@ void cv::detail::LKTrackerInvoker::operator()(const BlockedRange& range) const
    Mat IWinBuf(winSize, CV_MAKETYPE(derivDepth, cn), (deriv_type*)_buf);
    Mat derivIWinBuf(winSize, CV_MAKETYPE(derivDepth, cn2), (deriv_type*)_buf + winSize.area()*cn);

-    for( int ptidx = range.begin(); ptidx < range.end(); ptidx++ )
+    for( int ptidx = range.start; ptidx < range.end; ptidx++ )
    {
        Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level));
        Point2f nextPt;
@@ -746,11 +746,11 @@ void cv::calcOpticalFlowPyrLK( InputArray _prevImg, InputArray _nextImg,
        typedef cv::detail::LKTrackerInvoker LKTrackerInvoker;
 #endif

-        parallel_for(BlockedRange(0, npoints), LKTrackerInvoker(prevPyr[level * lvlStep1], derivI,
-                                                                nextPyr[level * lvlStep2], prevPts, nextPts,
-                                                                status, err,
-                                                                winSize, criteria, level, maxLevel,
-                                                                flags, (float)minEigThreshold));
+        parallel_for_(Range(0, npoints), LKTrackerInvoker(prevPyr[level * lvlStep1], derivI,
+                                                          nextPyr[level * lvlStep2], prevPts, nextPts,
+                                                          status, err,
+                                                          winSize, criteria, level, maxLevel,
+                                                          flags, (float)minEigThreshold));
    }
 }

--- a/modules/video/src/lkpyramid.hpp
+++ b/modules/video/src/lkpyramid.hpp
@@ -7,7 +7,7 @@ namespace detail

    typedef short deriv_type;

-    struct LKTrackerInvoker
+    struct LKTrackerInvoker : ParallelLoopBody
    {
        LKTrackerInvoker( const Mat& _prevImg, const Mat& _prevDeriv, const Mat& _nextImg,
                          const Point2f* _prevPts, Point2f* _nextPts,
@@ -15,7 +15,7 @@ namespace detail
                          Size _winSize, TermCriteria _criteria,
                          int _level, int _maxLevel, int _flags, float _minEigThreshold );

-        void operator()(const BlockedRange& range) const;
+        void operator()(const Range& range) const;

        const Mat* prevImg;
        const Mat* nextImg;