diff --git a/modules/ocl/CMakeLists.txt b/modules/ocl/CMakeLists.txt
index 24595770d..a6496aef2 100644
--- a/modules/ocl/CMakeLists.txt
+++ b/modules/ocl/CMakeLists.txt
@@ -64,8 +64,8 @@ ocv_add_accuracy_tests(FILES "Include" ${test_hdrs}
 ################################################################################################################
 ################################   OpenCL Module Performance  ##################################################
 ################################################################################################################
-#file(GLOB perf_srcs "perf/*.cpp")
-#file(GLOB perf_hdrs "perf/*.hpp" "perf/*.h")
+file(GLOB perf_srcs "perf/*.cpp")
+file(GLOB perf_hdrs "perf/*.hpp" "perf/*.h")
 
-#ocv_add_perf_tests(FILES "Include" ${perf_hdrs}
-#                       FILES "Src" ${perf_srcs})
+ocv_add_perf_tests(FILES "Include" ${perf_hdrs}
+                       FILES "Src" ${perf_srcs})
diff --git a/modules/ocl/perf/interpolation.hpp b/modules/ocl/perf/interpolation.hpp
new file mode 100644
index 000000000..d9180048e
--- /dev/null
+++ b/modules/ocl/perf/interpolation.hpp
@@ -0,0 +1,120 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                        Intel License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of Intel Corporation may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_TEST_INTERPOLATION_HPP__
+#define __OPENCV_TEST_INTERPOLATION_HPP__
+
+template <typename T> T readVal(const cv::Mat& src, int y, int x, int c, int border_type, cv::Scalar borderVal = cv::Scalar()) // reads channel c of pixel (y, x), honouring border_type
+{
+    if (border_type != cv::BORDER_CONSTANT) // any non-constant mode: remap both coordinates into the image first
+        return src.at<T>(cv::borderInterpolate(y, src.rows, border_type), cv::borderInterpolate(x, src.cols, border_type) * src.channels() + c);
+    // constant border: pixels outside the image yield the saturated borderVal component instead
+    const bool inside = y >= 0 && y < src.rows && x >= 0 && x < src.cols;
+    return inside ? src.at<T>(y, x * src.channels() + c) : cv::saturate_cast<T>(borderVal.val[c]);
+}
+
+template <typename T> struct NearestInterpolator // nearest-neighbour sampler
+{
+    static T getValue(const cv::Mat& src, float y, float x, int c, int border_type, cv::Scalar borderVal = cv::Scalar())
+    {
+        return readVal<T>(src, cvFloor(y), cvFloor(x), c, border_type, borderVal); // floor both coordinates, read that single pixel
+    }
+};
+
+template <typename T> struct LinearInterpolator // bilinear sampler over the 2x2 pixels around (y - 0.5, x - 0.5)
+{
+    static T getValue(const cv::Mat& src, float y, float x, int c, int border_type, cv::Scalar borderVal = cv::Scalar())
+    {
+        x -= 0.5f;
+        y -= 0.5f;
+        // (x1, y1) is the floored corner of the 2x2 cell, (x2, y2) the opposite corner
+        int x1 = cvFloor(x);
+        int y1 = cvFloor(y);
+        int x2 = x1 + 1;
+        int y2 = y1 + 1;
+
+        float res = 0;
+        // each sample is weighted by the product of its distances to the opposite corner along x and y
+        res += readVal<T>(src, y1, x1, c, border_type, borderVal) * ((x2 - x) * (y2 - y));
+        res += readVal<T>(src, y1, x2, c, border_type, borderVal) * ((x - x1) * (y2 - y));
+        res += readVal<T>(src, y2, x1, c, border_type, borderVal) * ((x2 - x) * (y - y1));
+        res += readVal<T>(src, y2, x2, c, border_type, borderVal) * ((x - x1) * (y - y1));
+
+        return cv::saturate_cast<T>(res);
+    }
+};
+
+template <typename T> struct CubicInterpolator // bicubic sampler over a 4x4 pixel window
+{
+    static float getValue(float p[4], float x) // 1-D: cubic in x whose coefficients come from the samples p[0..3]; equals p[1] at x = 0
+    {
+        return p[1] + 0.5 * x * (p[2] - p[0] + x*(2.0*p[0] - 5.0*p[1] + 4.0*p[2] - p[3] + x*(3.0*(p[1] - p[2]) + p[3] - p[0])));
+    }
+
+    static float getValue(float p[4][4], float x, float y) // 2-D: evaluate each row at x, then the four row results at y
+    {
+        float arr[4];
+
+        arr[0] = getValue(p[0], x);
+        arr[1] = getValue(p[1], x);
+        arr[2] = getValue(p[2], x);
+        arr[3] = getValue(p[3], x);
+
+        return getValue(arr, y);
+    }
+
+    static T getValue(const cv::Mat& src, float y, float x, int c, int border_type, cv::Scalar borderVal = cv::Scalar())
+    {
+        int ix = cvRound(x);
+        int iy = cvRound(y);
+        // gather the 4x4 window covering rows iy-2..iy+1 and columns ix-2..ix+1
+        float vals[4][4] =
+        {
+            {readVal<T>(src, iy - 2, ix - 2, c, border_type, borderVal), readVal<T>(src, iy - 2, ix - 1, c, border_type, borderVal), readVal<T>(src, iy - 2, ix, c, border_type, borderVal), readVal<T>(src, iy - 2, ix + 1, c, border_type, borderVal)},
+            {readVal<T>(src, iy - 1, ix - 2, c, border_type, borderVal), readVal<T>(src, iy - 1, ix - 1, c, border_type, borderVal), readVal<T>(src, iy - 1, ix, c, border_type, borderVal), readVal<T>(src, iy - 1, ix + 1, c, border_type, borderVal)},
+            {readVal<T>(src, iy    , ix - 2, c, border_type, borderVal), readVal<T>(src, iy    , ix - 1, c, border_type, borderVal), readVal<T>(src, iy    , ix, c, border_type, borderVal), readVal<T>(src, iy    , ix + 1, c, border_type, borderVal)},
+            {readVal<T>(src, iy + 1, ix - 2, c, border_type, borderVal), readVal<T>(src, iy + 1, ix - 1, c, border_type, borderVal), readVal<T>(src, iy + 1, ix, c, border_type, borderVal), readVal<T>(src, iy + 1, ix + 1, c, border_type, borderVal)},
+        };
+        // the evaluation point is the offset of (x, y) from (ix - 2, iy - 2), divided by 4
+        return cv::saturate_cast<T>(getValue(vals, (x - ix + 2.0) / 4.0, (y - iy + 2.0) / 4.0));
+    }
+};
+
+#endif // __OPENCV_TEST_INTERPOLATION_HPP__
diff --git a/modules/ocl/perf/main.cpp b/modules/ocl/perf/main.cpp
new file mode 100644
index 000000000..0d9d96791
--- /dev/null
+++ b/modules/ocl/perf/main.cpp
@@ -0,0 +1,108 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                        Intel License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of Intel Corporation may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+#ifdef HAVE_OPENCL
+
+using namespace std;
+using namespace cv;
+using namespace cv::ocl;
+using namespace cvtest;
+using namespace testing;
+
+void print_info() // prints a blank line, then the host OS family and bitness when the platform is recognised
+{
+    printf("\n");
+#if defined _WIN32
+#   if defined _WIN64
+        puts("OS: Windows 64");
+#   else
+        puts("OS: Windows 32");
+#   endif
+#elif defined __linux__ // the bare `linux` macro is a GNU extension that strict ISO modes do not define
+#   if defined _LP64
+        puts("OS: Linux 64");
+#   else
+        puts("OS: Linux 32");
+#   endif
+#elif defined __APPLE__
+#   if defined _LP64
+        puts("OS: Apple 64");
+#   else
+        puts("OS: Apple 32");
+#   endif
+#endif
+
+}
+
+#if PERF_TEST_OCL
+int main(int, char**) // stand-alone perf runner (PERF_TEST_OCL enabled); command-line arguments are not used
+{
+    // Fill the device list before any benchmark runs (presumably this also initialises the OpenCL runtime - confirm).
+    static std::vector<Info> ocl_info;
+    ocl::getDevice(ocl_info);
+
+    run_perf_test();
+    return 0;
+}
+#else
+int main(int argc, char** argv) // gtest-based entry point, compiled when PERF_TEST_OCL is zero or undefined
+{
+    TS::ptr()->init("ocl");
+    InitGoogleTest(&argc, argv);
+
+    print_info(); // print the host OS line before the tests run
+
+    return RUN_ALL_TESTS();
+}
+#endif // PERF_TEST_OCL
+
+#else // HAVE_OPENCL
+
+int main() // fallback entry point for builds without HAVE_OPENCL: reports the missing support and exits with 0
+{
+    printf("OpenCV was built without OpenCL support\n");
+    return 0;
+}
+
+
+#endif // HAVE_OPENCL
diff --git a/modules/ocl/perf/perf_test_ocl.cpp b/modules/ocl/perf/perf_test_ocl.cpp
new file mode 100644
index 000000000..67f20a33d
--- /dev/null
+++ b/modules/ocl/perf/perf_test_ocl.cpp
@@ -0,0 +1,1191 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicore Ware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Peng Xiao, pengxiao@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+#include <ctime>
+
+#if PERF_TEST_OCL
+
+#ifdef HAVE_OPENCL
+
+#define SHOW_CPU false
+#define REPEAT   1000 // number of timed iterations executed by R_T
+#define COUNT_U  0 // count the uploading execution time for ocl mat structures
+#define COUNT_D  0 // count the downloading execution time for ocl mat structures
+
+
+// P_TEST_FULL measures the performance of the target function (kernel_call).
+// `upload` is the code snippet that converts cv::Mat to cv::ocl::oclMat;
+// `download` is the code snippet that converts cv::ocl::oclMat back to cv::Mat.
+// Set COUNT_U and/or COUNT_D to non-zero to include uploading and/or downloading time in the measurement.
+#define P_TEST_FULL( upload, kernel_call, download ) \
+{ \
+    std::cout<< "\n" #kernel_call "\n----------------------"; \
+    {upload;} \
+    R_TEST( kernel_call, 15 ); \
+    double t = (double)cvGetTickCount(); \
+    R_T( { \
+            if( COUNT_U ) {upload;} \
+            kernel_call; \
+            if( COUNT_D ) {download;} \
+            } ); \
+    t = (double)cvGetTickCount() - t; \
+    std::cout << "runtime is  " << t/((double)cvGetTickFrequency()* 1000.) << "ms" << std::endl; \
+}
+
+
+#define R_T2( test ) /* warm up with 15 runs, then time REPEAT runs with clock() and print the elapsed milliseconds */ \
+{ \
+    std::cout<< "\n" #test "\n----------------------"; \
+    R_TEST( test, 15 ) \
+    clock_t st = clock(); \
+    R_T( test ) \
+    std::cout<< (clock() - st) * 1000.0 / CLOCKS_PER_SEC << "ms\n"; \
+}
+#define R_T( test ) \
+    R_TEST( test, REPEAT )
+#define R_TEST( test, repeat ) \
+    try{ \
+        for( int i = 0; i < repeat; i ++ ) { test; } \
+    } catch( ... ) { std::cout << "||||| Exception catched! |||||\n"; return; }
+
+#define FILTER_TEST_IMAGE "C:/Windows/Web/Wallpaper/Landscapes/img9.jpg"
+#define WARN_NRUN( name ) \
+    std::cout << "Warning: " #name " is not runnable!\n";
+
+
+void print_info();
+
+struct PerfTest // performance base class: Run() executes one benchmark, SetUp() prepares its inputs
+{
+    virtual ~PerfTest() {} // polymorphic base: keeps deletion through a PerfTest* well-defined
+    virtual void Run()   = 0;
+    protected:
+    virtual void SetUp() = 0;
+};
+///////////////////////////////////////
+// Arithm
+struct ArithmTestP : PerfTest // shared fixture for the arithmetic benchmarks: two random inputs, a mask and a scalar
+{
+    int type; // OpenCV type used for mat1/mat2; CV_8UC4 unless a subclass changes it before calling SetUp()
+    cv::Scalar val;
+
+    cv::Size size;
+    cv::Mat mat1, mat2;
+    cv::Mat mask;
+    cv::Mat dst;
+    cv::ocl::oclMat oclRes, oclmat1, oclmat2;
+    cv::ocl::oclMat oclmask;
+    std::vector<cv::Mat> dstv;
+    protected:
+    ArithmTestP() : type( CV_8UC4 ) {}
+    virtual void SetUp() // fills the host matrices with random data and uploads them to the device
+    {
+        cv::RNG& rng = cvtest::TS::ptr()->get_rng();
+        size = cv::Size( 3000, 3000 ); // big input image
+        mat1 = cvtest::randomMat(rng, size, type, 1, 255, false);
+        mat2 = cvtest::randomMat(rng, size, type, 1, 255, false);
+        mask = cvtest::randomMat(rng, size, CV_8UC1, 0, 2,  false);
+        // The last argument is a threshold *type*; CV_8UC1 only worked because it has the same value (0) as THRESH_BINARY.
+        cv::threshold(mask, mask, 0.5, 255., cv::THRESH_BINARY);
+
+        val = cv::Scalar(rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0));
+
+        oclmat1 = cv::ocl::oclMat(mat1);
+        oclmat2 = cv::ocl::oclMat(mat2);
+        oclmask = cv::ocl::oclMat(mask);
+    }
+};
+
+struct AddArrayP : ArithmTestP // benchmarks cv::ocl::add on the two fixture matrices
+{
+    virtual void Run()
+    {
+        SetUp();
+        P_TEST_FULL(
+                oclmat1 = cv::ocl::oclMat(mat1);oclmat2 = cv::ocl::oclMat(mat2),
+                cv::ocl::add(oclmat1, oclmat2, oclRes),
+                oclRes.download(dst);
+                );
+    }
+};
+
+struct SubtractArrayP : ArithmTestP // benchmarks cv::ocl::subtract on the two fixture matrices
+{
+    virtual void Run()
+    {
+        SetUp();
+        P_TEST_FULL(
+                oclmat1 = cv::ocl::oclMat(mat1);oclmat2 = cv::ocl::oclMat(mat2),
+                cv::ocl::subtract(oclmat1, oclmat2, oclRes),
+                oclRes.download(dst);
+                );
+    }
+};
+
+struct MultiplyArrayP : ArithmTestP // benchmarks cv::ocl::multiply on the two fixture matrices
+{
+    virtual void Run()
+    {
+        SetUp();
+        // timing happens inside P_TEST_FULL, so no local clock() sample is taken here
+        P_TEST_FULL(
+                oclmat1 = cv::ocl::oclMat(mat1);oclmat2 = cv::ocl::oclMat(mat2),
+                cv::ocl::multiply(oclmat1, oclmat2, oclRes),
+                oclRes.download(dst);
+                );
+    }
+};
+
+struct DivideArrayP : ArithmTestP // benchmarks cv::ocl::divide on the two fixture matrices
+{
+    virtual void Run()
+    {
+        SetUp();
+        P_TEST_FULL(
+                oclmat1 = cv::ocl::oclMat(mat1);oclmat2 = cv::ocl::oclMat(mat2),
+                cv::ocl::divide(oclmat1, oclmat2, oclRes),
+                oclRes.download(dst);
+                );
+    }
+};
+
+struct ExpP : ArithmTestP // benchmarks cv::ocl::exp on a CV_32FC1 matrix
+{
+    void Run()
+    {
+        type = CV_32FC1; // must be set before SetUp(), which reads `type`
+        SetUp();
+        P_TEST_FULL(
+                oclmat1 = cv::ocl::oclMat(mat1),
+                cv::ocl::exp(oclmat1, oclRes),
+                oclRes.download(dst);
+                );
+    }
+};
+
+struct LogP : ArithmTestP // benchmarks cv::ocl::log on a CV_32FC1 matrix
+{
+    void Run()
+    {
+        type = CV_32FC1; // must be set before SetUp(), which reads `type`
+        SetUp();
+        P_TEST_FULL(
+                oclmat1 = cv::ocl::oclMat(mat1),
+                cv::ocl::log(oclmat1, oclRes),
+                oclRes.download(dst);
+                );
+    }
+};
+
+struct CompareP : ArithmTestP // benchmarks cv::ocl::compare with cv::CMP_EQ on two CV_32FC1 matrices
+{
+    virtual void Run()
+    {
+        type = CV_32FC1; // must be set before SetUp(), which reads `type`
+        SetUp();
+        P_TEST_FULL(
+                oclmat1 = cv::ocl::oclMat(mat1);oclmat2 = cv::ocl::oclMat(mat2),
+                cv::ocl::compare(oclmat1, oclmat2, oclRes, cv::CMP_EQ),
+                oclRes.download(dst);
+                );
+    }
+};
+
+struct FlipP : ArithmTestP // benchmarks cv::ocl::flip with flip code 0 on a CV_8UC4 matrix
+{
+    virtual void Run()
+    {
+        SetUp();
+        P_TEST_FULL(
+                oclmat1 = cv::ocl::oclMat(mat1),
+                cv::ocl::flip(oclmat1, oclRes, 0),
+                oclRes.download(dst);
+                );
+    }
+    protected:
+    virtual void SetUp() // replaces the base fixture: only mat1/oclmat1 are prepared
+    {
+        type = CV_8UC4;
+        cv::RNG& rng = cvtest::TS::ptr()->get_rng();
+        size = cv::Size(3000, 3000);
+        mat1 = cvtest::randomMat(rng, size, type, 1, 255, false);
+        oclmat1 = cv::ocl::oclMat(mat1);
+    }
+};
+
+struct MagnitudeP : ArithmTestP // benchmarks cv::ocl::magnitude on two CV_32F matrices
+{
+    virtual void Run()
+    {
+        type = CV_32F; // must be set before SetUp(), which reads `type`
+        SetUp();
+        P_TEST_FULL(
+                oclmat1 = cv::ocl::oclMat(mat1);oclmat2 = cv::ocl::oclMat(mat2),
+                cv::ocl::magnitude(oclmat1, oclmat2, oclRes), // use both uploaded inputs (oclmat1 was previously passed twice)
+                oclRes.download(dst);
+                );
+    }
+};
+
+struct LUTP : ArithmTestP // benchmarks cv::ocl::LUT on a CV_8UC1 matrix with a 256-entry table
+{
+    virtual void Run()
+    {
+        SetUp();
+        P_TEST_FULL(
+                oclmat1 = cv::ocl::oclMat(mat1);ocllut  = cv::ocl::oclMat(lut),
+                cv::ocl::LUT(oclmat1, ocllut, oclRes),
+                oclRes.download(dst);
+                );
+    }
+    protected:
+    cv::Mat lut;
+    cv::ocl::oclMat ocllut;
+    virtual void SetUp() // replaces the base fixture: prepares mat1 and the lookup table only
+    {
+        type = CV_8UC1;
+        cv::RNG& rng = cvtest::TS::ptr()->get_rng();
+        size = cv::Size(3000, 3000);
+        mat1 = cvtest::randomMat(rng, size, type, 1, 255, false);
+        lut = cvtest::randomMat(rng, cv::Size(256, 1), CV_8UC1, 100, 200, false);
+        oclmat1 = cv::ocl::oclMat(mat1);
+        ocllut  = cv::ocl::oclMat(lut);
+    }
+};
+
+struct MinMaxP : ArithmTestP // benchmarks cv::ocl::minMax on a CV_64F matrix
+{
+    double minVal_gold, minVal;
+    double maxVal_gold, maxVal;
+
+    virtual void Run()
+    {
+        SetUp();
+        P_TEST_FULL(
+                oclmat1 = cv::ocl::oclMat(mat1);oclmat2 = cv::ocl::oclMat(mat2),
+                cv::ocl::minMax(oclmat1, &minVal, &maxVal, oclmat2),
+                {};
+                );
+    }
+
+    protected:
+    virtual void SetUp() // replaces the base fixture
+    {
+        type = CV_64F;
+
+        cv::RNG& rng = cvtest::TS::ptr()->get_rng();
+
+        size = cv::Size(3000, 3000);
+
+        mat1 = cvtest::randomMat(rng, size, type, 0.0, 127.0, false);
+        mat2 = cvtest::randomMat(rng, size, CV_8UC1, 0, 2, false); // passed as the last argument of minMax (presumably the mask - confirm)
+
+        oclmat1 = cv::ocl::oclMat(mat1);
+        oclmat2 = cv::ocl::oclMat(mat2);
+    }
+};
+
+struct MinMaxLocP : MinMaxP // benchmarks cv::ocl::minMaxLoc using the MinMaxP fixture
+{
+    cv::Point minLoc_gold; // receives the minimum location written by minMaxLoc
+    cv::Point maxLoc_gold; // receives the maximum location written by minMaxLoc
+    virtual void Run()
+    {
+        SetUp();
+        P_TEST_FULL(
+                oclmat1 = cv::ocl::oclMat(mat1);oclmat2 = cv::ocl::oclMat(mat2),
+                cv::ocl::minMaxLoc(oclmat1, &minVal, &maxVal, &minLoc_gold, &maxLoc_gold, oclmat2),
+                {}
+                );
+    }
+};
+
+struct CountNonZeroP : ArithmTestP // benchmarks cv::ocl::countNonZero on a CV_64F matrix
+{
+    int n; // receives the count returned by countNonZero
+    virtual void Run()
+    {
+        SetUp();
+        P_TEST_FULL(
+                oclmat1 = cv::ocl::oclMat(mat1),
+                n = cv::ocl::countNonZero(oclmat1),
+                {}
+                );
+    }
+    protected:
+    virtual void SetUp() // replaces the base fixture: only mat1/oclmat1 are prepared
+    {
+        type = CV_64F; // named constant instead of the former magic number 6 (same value)
+
+        cv::RNG& rng = cvtest::TS::ptr()->get_rng();
+
+        size = cv::Size( 3000, 3000 );
+
+        cv::Mat matBase = cvtest::randomMat(rng, size, CV_8U, 0.0, 1.0, false);
+        matBase.convertTo(mat1, type);
+
+        oclmat1 = cv::ocl::oclMat(mat1);
+    }
+};
+
+struct SumP : ArithmTestP // benchmarks cv::ocl::sum on the first fixture matrix
+{
+    virtual void Run()
+    {
+        SetUp();
+        cv::Scalar n; // receives the value returned by cv::ocl::sum
+        P_TEST_FULL(
+                oclmat1 = cv::ocl::oclMat(mat1),
+                n = cv::ocl::sum(oclmat1),
+                {}
+                );
+    }
+};
+
+struct BitwiseP : ArithmTestP // fixture for the bitwise benchmarks: two CV_8UC4 matrices filled byte-wise with random data
+{
+    protected:
+        virtual void SetUp()
+        {
+            type = CV_8UC4;
+
+            cv::RNG& rng = cvtest::TS::ptr()->get_rng();
+
+            size = cv::Size( 3000, 3000 );
+
+            mat1.create(size, type);
+            mat2.create(size, type);
+
+            for (int i = 0; i < mat1.rows; ++i)
+            {
+                cv::Mat row1(1, static_cast<int>(mat1.cols * mat1.elemSize()), CV_8U, (void*)mat1.ptr(i)); // row i viewed as raw bytes, no copy
+                rng.fill(row1, cv::RNG::UNIFORM, cv::Scalar(0), cv::Scalar(255));
+
+                cv::Mat row2(1, static_cast<int>(mat2.cols * mat2.elemSize()), CV_8U, (void*)mat2.ptr(i)); // same byte view for mat2
+                rng.fill(row2, cv::RNG::UNIFORM, cv::Scalar(0), cv::Scalar(255));
+            }
+            oclmat1 = cv::ocl::oclMat(mat1);
+            oclmat2 = cv::ocl::oclMat(mat2);
+        }
+};
+
+struct BitwiseNotP : BitwiseP // benchmarks cv::ocl::bitwise_not
+{
+    virtual void Run()
+    {
+        SetUp();
+        P_TEST_FULL(
+                oclmat1 = cv::ocl::oclMat(mat1),
+                cv::ocl::bitwise_not(oclmat1, oclRes),
+                oclRes.download(dst)
+                );
+    }
+};
+
+struct BitwiseAndP : BitwiseP // benchmarks cv::ocl::bitwise_and, first matrix-matrix, then matrix-scalar
+{
+    virtual void Run()
+    {
+        SetUp();
+        P_TEST_FULL(
+                oclmat1 = cv::ocl::oclMat(mat1);oclmat2 = cv::ocl::oclMat(mat2),
+                cv::ocl::bitwise_and(oclmat1, oclmat2, oclRes),
+                oclRes.download(dst)
+                );
+        P_TEST_FULL(
+                oclmat1 = cv::ocl::oclMat(mat1),
+                cv::ocl::bitwise_and(oclmat1, val, oclRes), // NOTE(review): BitwiseP::SetUp never assigns `val` - confirm the default scalar is intended
+                oclRes.download(dst)
+                );
+    }
+};
+
+struct BitwiseXorP : BitwiseP // benchmarks cv::ocl::bitwise_xor, first matrix-matrix, then matrix-scalar
+{
+    virtual void Run()
+    {
+        SetUp();
+        P_TEST_FULL(
+                oclmat1 = cv::ocl::oclMat(mat1);oclmat2 = cv::ocl::oclMat(mat2),
+                cv::ocl::bitwise_xor(oclmat1, oclmat2, oclRes),
+                oclRes.download(dst)
+                );
+        P_TEST_FULL(
+                oclmat1 = cv::ocl::oclMat(mat1),
+                cv::ocl::bitwise_xor(oclmat1, val, oclRes), // NOTE(review): BitwiseP::SetUp never assigns `val` - confirm the default scalar is intended
+                oclRes.download(dst)
+                );
+
+    }
+};
+
+struct BitwiseOrP : BitwiseP // benchmarks cv::ocl::bitwise_or, first matrix-matrix, then matrix-scalar
+{
+    virtual void Run()
+    {
+        SetUp();
+        P_TEST_FULL(
+                oclmat1 = cv::ocl::oclMat(mat1);oclmat2 = cv::ocl::oclMat(mat2),
+                cv::ocl::bitwise_or(oclmat1, oclmat2, oclRes),
+                oclRes.download(dst)
+                );
+        P_TEST_FULL(
+                oclmat1 = cv::ocl::oclMat(mat1),
+                cv::ocl::bitwise_or(oclmat1, val, oclRes), // NOTE(review): BitwiseP::SetUp never assigns `val` - confirm the default scalar is intended
+                oclRes.download(dst)
+                );
+    }
+};
+
+struct TransposeP : ArithmTestP // benchmarks cv::ocl::transpose on the first fixture matrix
+{
+    virtual void Run()
+    {
+        SetUp();
+        P_TEST_FULL(
+                oclmat1 = cv::ocl::oclMat(mat1),
+                cv::ocl::transpose(oclmat1, oclRes),
+                oclRes.download(dst)
+                );
+    }
+};
+
+struct AbsdiffArrayP : ArithmTestP // benchmarks cv::ocl::absdiff on two CV_32FC1 matrices
+{
+    virtual void Run()
+    {
+        type = CV_32FC1; // must be set before SetUp(), which reads `type`
+        SetUp();
+        P_TEST_FULL(
+                oclmat1 = cv::ocl::oclMat(mat1);oclmat2 = cv::ocl::oclMat(mat2),
+                cv::ocl::absdiff(oclmat1, oclmat2, oclRes),
+                oclRes.download(dst)
+                );
+    }
+};
+
+struct PhaseP : ArithmTestP // benchmarks cv::ocl::phase on two CV_32F matrices
+{
+    virtual void Run()
+    {
+        type = CV_32F; // must be set before SetUp(), which reads `type`
+        SetUp();
+        P_TEST_FULL(
+                oclmat1 = cv::ocl::oclMat(mat1);oclmat2 = cv::ocl::oclMat(mat2),
+                cv::ocl::phase(oclmat1,oclmat2,oclRes,1),
+                oclRes.download(dst)
+                );
+    }
+};
+
+struct CartToPolarP : ArithmTestP // times 5 runs of cv::ocl::cartToPolar with clock() instead of P_TEST_FULL
+{
+    cv::ocl::oclMat oclRes1; // second output of cartToPolar
+    virtual void Run()
+    {
+        type = CV_64FC4;
+        SetUp();
+        clock_t start = clock();
+        R_TEST(
+                cv::ocl::cartToPolar(oclmat1,oclmat2,oclRes, oclRes1, 1);
+                if( COUNT_D ) {oclRes.download(dst);oclRes1.download(dst);}
+                , 5);
+        std::cout<< "ocl::CartToPolar -- " << (clock() - start) * 1000.0 / CLOCKS_PER_SEC << "ms\n"; // clock() ticks converted to milliseconds
+    }
+};
+
+struct PolarToCartP : ArithmTestP // times 2 runs of cv::ocl::polarToCart with clock() instead of P_TEST_FULL
+{
+    cv::ocl::oclMat oclRes1; // second output of polarToCart
+    virtual void Run()
+    {
+        type = CV_64FC4;
+        SetUp();
+        clock_t start = clock();
+        R_TEST(
+                cv::ocl::polarToCart(oclmat1,oclmat2,oclRes, oclRes1, 1);
+                if( COUNT_D ) {oclRes.download(dst);oclRes1.download(dst);}
+                , 2);
+        std::cout<< "ocl::polarToCart -- " << (clock() - start) * 1000.0 / CLOCKS_PER_SEC << "ms\n"; // clock() ticks converted to milliseconds
+    }
+};
+
+///////////////////////////////////////
+// split & merge
+struct SplitP : ArithmTestP // benchmarks cv::ocl::split on a constant-filled matrix
+{
+    virtual void Run()
+    {
+        SetUp();
+        P_TEST_FULL(
+                oclmat1 = cv::ocl::oclMat(mat1),
+                cv::ocl::split(oclmat1, dev_dst),
+                {
+                dstv.resize(dev_dst.size());
+                for (size_t i = 0; i < dev_dst.size(); ++i)
+                {
+                dev_dst[i].download(dstv[i]);
+                }
+                }
+                );
+    }
+    protected:
+    std::vector<cv::ocl::oclMat> dev_dst; // device-side planes written by split
+    virtual void SetUp() // replaces the base fixture: mat1 is filled with (1, 2, 3, 4) rather than random data
+    {
+        size = cv::Size( 3000, 3000 );
+
+        mat1.create(size, type);
+        mat1.setTo(cv::Scalar(1.0, 2.0, 3.0, 4.0));
+
+        oclmat1 = cv::ocl::oclMat(mat1);
+    }
+};
+
+struct MergeP : SplitP // benchmarks cv::ocl::merge on the planes produced by a prior split
+{
+    virtual void Run()
+    {
+        SetUp();
+        cv::ocl::split(oclmat1, dev_dst); // prepare the device planes that merge consumes
+        cv::split(mat1, dstv);
+        P_TEST_FULL(
+                oclmat1 = cv::ocl::oclMat(mat1),
+                cv::ocl::merge(dev_dst, oclmat2),
+                oclmat2.download(dst)
+                );
+    }
+};
+
+struct SetToP : ArithmTestP // benchmarks oclMat::setTo with a scalar and oclmat2 as second argument
+{
+    virtual void Run()
+    {
+        SetUp();
+        static cv::Scalar s = cv::Scalar(1, 2, 3, 4);
+        P_TEST_FULL(
+                oclmat2 = cv::ocl::oclMat(mat2),
+                oclmat1.setTo( s, oclmat2 ),
+                oclmat1.download(dst);
+                );
+    }
+    protected:
+    virtual void SetUp() // replaces the base fixture
+    {
+        type = CV_32FC4;
+        size = cv::Size(3000, 3000);
+
+        mat1.create(size, type); // allocated only; no values are written to mat1 here
+        oclmat1.create(size, type);
+
+        cv::RNG& rng = cvtest::TS::ptr()->get_rng();
+        mat2 = cvtest::randomMat(rng, size, CV_8UC1, 0.0, 1.5, false);
+        oclmat2 = cv::ocl::oclMat(mat2);
+    }
+};
+
+struct CopyToP : SetToP // benchmarks oclMat::copyTo with oclmat2 as second argument, using the SetToP fixture
+{
+    virtual void Run()
+    {
+        SetUp();
+        P_TEST_FULL(
+                oclmat1 = cv::ocl::oclMat(mat1),
+                oclmat1.copyTo( oclRes, oclmat2 ),
+                oclRes.download(dst)
+                );
+    }
+};
+
+struct ConvertToP : ArithmTestP // benchmarks oclMat::convertTo from a CV_32FC1 source with target type CV_32FC4
+{
+    virtual void Run()
+    {
+        type = CV_32FC1;
+        SetUp();
+        cv::RNG& rng = cvtest::TS::ptr()->get_rng();
+        const double a = rng.uniform(0.0, 1.0);
+        const double b = rng.uniform(-10.0, 10.0);
+        (void)a; (void)b; // kept for the scaled convertTo variant below, which is currently disabled
+        int type2 = CV_32FC4;
+
+        P_TEST_FULL(
+                oclmat1 = cv::ocl::oclMat(mat1),
+                oclmat1.convertTo( oclRes, type2 /*, a, b */ ), // fails when scaling factors a and b are specified
+                oclRes.download(dst)
+                );
+    }
+};
+
+////////////////////////////////////////////
+// Filters
+
+struct FilterTestP : PerfTest // shared fixture for the filter benchmarks: one image loaded as BGRA and as grayscale
+{
+    protected:
+        int ksize;
+        int dx, dy;
+
+        cv::Mat img_rgba;
+        cv::Mat img_gray;
+
+        cv::ocl::oclMat ocl_img_rgba;
+        cv::ocl::oclMat ocl_img_gray;
+
+        cv::ocl::oclMat dev_dst_rgba;
+        cv::ocl::oclMat dev_dst_gray;
+
+        cv::Mat dst_rgba;
+        cv::Mat dst_gray;
+
+        cv::Mat kernel;
+
+        int bordertype;
+
+        virtual void SetUp() // sets the 7x7 kernel parameters, loads FILTER_TEST_IMAGE and uploads both variants
+        {
+            bordertype = (int)cv::BORDER_DEFAULT;
+            ksize = 7;
+            dx = ksize/2; dy = ksize/2;
+
+            kernel = cv::Mat::ones(ksize, ksize, CV_8U);
+
+            cv::Mat img = readImage(FILTER_TEST_IMAGE);
+            ASSERT_FALSE(img.empty()); // NOTE(review): on failure this only returns from SetUp; Run() would continue with empty images - confirm
+
+            cv::cvtColor(img, img_rgba, CV_BGR2BGRA);
+            cv::cvtColor(img, img_gray, CV_BGR2GRAY);
+
+            ocl_img_rgba = cv::ocl::oclMat(img_rgba);
+            ocl_img_gray = cv::ocl::oclMat(img_gray);
+        }
+};
+
+struct BlurP : FilterTestP // benchmarks cv::ocl::blur on the BGRA and grayscale images
+{
+    virtual void Run()
+    {
+        SetUp();
+        P_TEST_FULL(
+                {
+                ocl_img_rgba = cv::ocl::oclMat(img_rgba);
+                ocl_img_gray = cv::ocl::oclMat(img_gray);
+                },
+                {
+                cv::ocl::blur(ocl_img_rgba, dev_dst_rgba, cv::Size(ksize, ksize), cv::Point(-1,-1), bordertype);
+                cv::ocl::blur(ocl_img_gray, dev_dst_gray, cv::Size(ksize, ksize), cv::Point(-1,-1), bordertype);
+                },
+                {
+                dev_dst_rgba.download(dst_rgba);
+                dev_dst_gray.download(dst_gray);
+                });
+    }
+};
+
+struct SobelP : FilterTestP // benchmarks cv::ocl::Sobel on the BGRA and grayscale images
+{
+    virtual void Run()
+    {
+        SetUp(); // leaves ksize = 7 and dx = dy = ksize/2
+        P_TEST_FULL(
+                {
+                ocl_img_rgba = cv::ocl::oclMat(img_rgba);
+                ocl_img_gray = cv::ocl::oclMat(img_gray);
+                },
+                {
+                cv::ocl::Sobel(ocl_img_rgba, dev_dst_rgba, -1, dx, dy, ksize, 1, 0, bordertype);
+                cv::ocl::Sobel(ocl_img_gray, dev_dst_gray, -1, dx, dy, ksize, 1, 0, bordertype);
+                },
+                {
+                dev_dst_rgba.download(dst_rgba);
+                dev_dst_gray.download(dst_gray);
+                });
+    }
+};
+
+struct ScharrP : FilterTestP // benchmarks cv::ocl::Scharr on the BGRA and grayscale images
+{
+    virtual void Run()
+    {
+        SetUp();
+        dx = 0; dy = 1; // override the derivative orders assigned by SetUp
+        P_TEST_FULL(
+                {
+                ocl_img_rgba = cv::ocl::oclMat(img_rgba);
+                ocl_img_gray = cv::ocl::oclMat(img_gray);
+                },
+                {
+                cv::ocl::Scharr(ocl_img_rgba, dev_dst_rgba, -1, dx, dy, 1, 0, bordertype);
+                cv::ocl::Scharr(ocl_img_gray, dev_dst_gray, -1, dx, dy, 1, 0, bordertype);
+                },
+                {
+                dev_dst_rgba.download(dst_rgba);
+                dev_dst_gray.download(dst_gray);
+                });
+    }
+};
+
+struct GaussianBlurP : FilterTestP // benchmarks cv::ocl::GaussianBlur on the BGRA and grayscale images
+{
+    virtual void Run()
+    {
+        double sigma1 = 3, sigma2 = 3; // passed as the two sigma arguments of GaussianBlur
+        SetUp();
+        P_TEST_FULL(
+                {
+                ocl_img_rgba = cv::ocl::oclMat(img_rgba);
+                ocl_img_gray = cv::ocl::oclMat(img_gray);
+                },
+                {
+                cv::ocl::GaussianBlur(ocl_img_rgba, dev_dst_rgba, cv::Size(ksize, ksize), sigma1, sigma2);
+                cv::ocl::GaussianBlur(ocl_img_gray, dev_dst_gray, cv::Size(ksize, ksize), sigma1, sigma2);
+                },
+                {
+                dev_dst_rgba.download(dst_rgba);
+                dev_dst_gray.download(dst_gray);
+                });
+    }
+};
+
+struct DilateP : FilterTestP // Perf case for cv::ocl::dilate on the RGBA and grayscale fixture images.
+{
+    virtual void Run() // Prepares the fixture, then hands three statement groups to P_TEST_FULL (macro defined outside this view).
+    {
+        SetUp();
+        P_TEST_FULL(
+                { // group 1: build the device inputs from the host images
+                ocl_img_rgba = cv::ocl::oclMat(img_rgba);
+                ocl_img_gray = cv::ocl::oclMat(img_gray);
+                },
+                { // group 2: operation under test; `kernel` is the fixture's structuring element
+                cv::ocl::dilate(ocl_img_rgba, dev_dst_rgba, kernel);
+                cv::ocl::dilate(ocl_img_gray, dev_dst_gray, kernel);
+                },
+                { // group 3: download the device results into the host destinations
+                dev_dst_rgba.download(dst_rgba);
+                dev_dst_gray.download(dst_gray);
+                });
+    }
+};
+
+struct ErodeP : FilterTestP // Perf case for cv::ocl::erode on the RGBA and grayscale fixture images.
+{
+    virtual void Run() // Prepares the fixture, then hands three statement groups to P_TEST_FULL (macro defined outside this view).
+    {
+        SetUp();
+        P_TEST_FULL(
+                { // group 1: build the device inputs from the host images
+                ocl_img_rgba = cv::ocl::oclMat(img_rgba);
+                ocl_img_gray = cv::ocl::oclMat(img_gray);
+                },
+                { // group 2: operation under test; `kernel` is the fixture's structuring element
+                cv::ocl::erode(ocl_img_rgba, dev_dst_rgba, kernel);
+                cv::ocl::erode(ocl_img_gray, dev_dst_gray, kernel);
+                },
+                { // group 3: download the device results into the host destinations
+                dev_dst_rgba.download(dst_rgba);
+                dev_dst_gray.download(dst_gray);
+                });
+    }
+};
+
+struct MorphExP : FilterTestP // Perf case for cv::ocl::morphologyEx on the RGBA and grayscale fixture images.
+{
+    virtual void Run() // Prepares the fixture, then hands three statement groups to P_TEST_FULL (macro defined outside this view).
+    {
+        SetUp();
+        cv::ocl::oclMat okernel; // device copy of the structuring element, filled in group 1
+        P_TEST_FULL(
+                { // group 1: build the device kernel and inputs from the host data
+                okernel      = cv::ocl::oclMat(kernel);
+                ocl_img_rgba = cv::ocl::oclMat(img_rgba);
+                ocl_img_gray = cv::ocl::oclMat(img_gray);
+                },
+                { // group 2: operation under test. NOTE(review): op code 3 is presumably cv::MORPH_CLOSE - confirm
+                cv::ocl::morphologyEx(ocl_img_rgba, dev_dst_rgba, 3, okernel);
+                cv::ocl::morphologyEx(ocl_img_gray, dev_dst_gray, 3, okernel);
+                },
+                { // group 3: download the device results into the host destinations
+                dev_dst_rgba.download(dst_rgba);
+                dev_dst_gray.download(dst_gray);
+                });
+    }
+};
+
+struct LaplacianP : FilterTestP // Perf case for cv::ocl::Laplacian on the RGBA and grayscale fixture images.
+{
+    void Run() // Prepares the fixture, then hands three statement groups to P_TEST_FULL (macro defined outside this view).
+    {
+        SetUp();
+        P_TEST_FULL(
+                { // group 1: build the device inputs from the host images
+                ocl_img_rgba = cv::ocl::oclMat(img_rgba);
+                ocl_img_gray = cv::ocl::oclMat(img_gray);
+                },
+                { // group 2: operation under test, called with -1 and a fixed 3 (not the fixture's ksize)
+                cv::ocl::Laplacian(ocl_img_rgba, dev_dst_rgba, -1, 3 );
+                cv::ocl::Laplacian(ocl_img_gray, dev_dst_gray, -1, 3 );
+                },
+                { // group 3: download the device results into the host destinations
+                dev_dst_rgba.download(dst_rgba);
+                dev_dst_gray.download(dst_gray);
+                });
+    }
+};
+
+////////////////////
+// histograms
+struct CalcHistP : PerfTest // Perf case for cv::ocl::calcHist; also the base fixture of EqualizeHistP and ThresholdP.
+{
+    virtual void Run() // Prepares the fixture, then hands three expressions to P_TEST_FULL (macro defined outside this view).
+    {
+        SetUp();
+        P_TEST_FULL(
+                oclmat = cv::ocl::oclMat( src ), // build the device input from the host image
+                cv::ocl::calcHist(oclmat, oclRes), // operation under test
+                oclRes.download(hist) // download the device result into `hist`
+                );
+    }
+    protected:
+    cv::Size size; // image size, assigned in SetUp()
+    cv::Mat src, hist; // host input image and host copy of the result
+
+    cv::ocl::oclMat oclmat; // device input
+    cv::ocl::oclMat oclRes; // device result
+
+    virtual void SetUp() // Creates a 3000x3000 random CV_8UC1 image and its device copy.
+    {
+        cv::RNG& rng = cvtest::TS::ptr()->get_rng();
+        size = cv::Size(3000, 3000);
+        src = cvtest::randomMat(rng, size, CV_8UC1, 0, 255, false);
+        oclmat = cv::ocl::oclMat( src ); // Run() assigns oclmat again inside P_TEST_FULL
+    }
+};
+
+struct EqualizeHistP : CalcHistP // Perf case for cv::ocl::equalizeHist, reusing the CalcHistP fixture image.
+{
+    virtual void Run() // Prepares the fixture, then hands three expressions to P_TEST_FULL (macro defined outside this view).
+    {
+        SetUp();
+        P_TEST_FULL(
+                oclmat = cv::ocl::oclMat( src ), // build the device input from the host image
+                cv::ocl::equalizeHist(oclmat, oclRes), // operation under test
+                oclRes.download(hist) // download the device result; `hist` receives it here
+                );
+    }
+};
+
+// Perf case for cv::ocl::threshold (THRESH_TOZERO_INV), reusing the CalcHistP fixture image.
+struct ThresholdP : CalcHistP
+{
+    // Prepares the fixture, then hands three expressions to P_TEST_FULL
+    // (macro defined outside this view): device input, operation, download.
+    virtual void Run()
+    {
+        SetUp();
+        const int threshOp = (int)cv::THRESH_TOZERO_INV;
+        const double maxVal = 200;
+        const double thresh = 125;
+        P_TEST_FULL(
+                oclmat = cv::ocl::oclMat( src ),
+                cv::ocl::threshold(oclmat, oclRes, thresh, maxVal, threshOp ),
+                oclRes.download(hist)
+                );
+    }
+};
+
+struct ResizeP : ArithmTestP // Perf case for cv::ocl::resize on the ArithmTestP fixture matrix mat1.
+{
+    virtual void Run() // Prepares the fixture, then hands three expressions to P_TEST_FULL (macro defined outside this view).
+    {
+        SetUp();
+        P_TEST_FULL(
+                oclmat1 = cv::ocl::oclMat( mat1 ), // build the device input from the host matrix
+                cv::ocl::resize(oclmat1, oclRes, cv::Size(), 2.0, 2.0), // empty size plus two 2.0 factors
+                oclRes.download(dst) // download the device result into `dst`
+                );
+    }
+};
+
+struct CvtColorP : PerfTest // Perf case for cv::ocl::cvtColor with the CV_BGR2GRAY conversion code.
+{
+    virtual void Run() // Prepares the fixture, then hands three expressions to P_TEST_FULL (macro defined outside this view).
+    {
+        SetUp();
+        P_TEST_FULL(
+                oclmat = cv::ocl::oclMat( img ), // build the device input from the host image
+                cv::ocl::cvtColor(oclmat, ocldst, cvtcode), // operation under test
+                ocldst.download(dst) // download the device result into `dst`
+                );
+    }
+    protected:
+    int type; // depth the source image is converted to, assigned in SetUp()
+    int cvtcode; // conversion code passed to cvtColor, assigned in SetUp()
+
+    cv::Mat img, dst; // host input and host copy of the result
+    cv::ocl::oclMat oclmat, ocldst; // device input and device result
+    virtual void SetUp() // Loads FILTER_TEST_IMAGE, converts it to `type` and creates its device copy.
+    {
+        type = CV_8U;
+        cvtcode = CV_BGR2GRAY;
+        cv::Mat imgBase = readImage(FILTER_TEST_IMAGE);
+        ASSERT_FALSE(imgBase.empty());
+
+        imgBase.convertTo(img, type, type == CV_32F ? 1.0 / 255.0 : 1.0); // type is CV_8U here, so the scale is 1.0
+        oclmat = cv::ocl::oclMat( img ); // Run() assigns oclmat again inside P_TEST_FULL
+    };
+};
+
+
+// Perf case for cv::ocl::warpAffine: 45-degree rotation matrix with an x offset of mat1.cols / 2.
+struct WarpAffineP : ArithmTestP
+{
+    void Run()
+    {
+        SetUp();
+        const double alpha = CV_PI / 4;
+        // The cast is explicit because an int inside a braced double initializer is a C++11 narrowing conversion.
+        double mat[2][3] = { {std::cos(alpha), -std::sin(alpha), static_cast<double>(mat1.cols / 2)},
+                             {std::sin(alpha),  std::cos(alpha), 0.0} };
+        cv::Mat M(2, 3, CV_64F, (void*) mat); // header over the local array; `mat` must outlive M
+        P_TEST_FULL(
+                oclmat1 = cv::ocl::oclMat( mat1 ),
+                cv::ocl::warpAffine( oclmat1, oclRes, M, cv::Size(1500, 1500) ),
+                oclRes.download(dst)
+                );
+    }
+};
+
+// Perf case for cv::ocl::warpPerspective: 45-degree rotation, x offset of mat1.cols / 2, identity last row.
+struct WarpPerspectiveP : ArithmTestP
+{
+    void Run()
+    {
+        SetUp();
+        const double alpha = CV_PI / 4; // the cast below avoids a C++11 narrowing conversion (int -> double in braces)
+        double mat[3][3] = { {std::cos(alpha), -std::sin(alpha), static_cast<double>(mat1.cols / 2)},
+                             {std::sin(alpha),  std::cos(alpha), 0.0},
+                             {0.0,              0.0,             1.0} };
+        cv::Mat M(3, 3, CV_64F, (void*) mat); // header over the local array; `mat` must outlive M
+        P_TEST_FULL(
+                oclmat1 = cv::ocl::oclMat( mat1 ),
+                cv::ocl::warpPerspective( oclmat1, oclRes, M, cv::Size(1500, 1500) ),
+                oclRes.download(dst)
+                );
+    }
+};
+
+
+struct CornerHarrisP : FilterTestP // Perf case for cv::ocl::cornerHarris on the grayscale fixture image only.
+{
+    void Run() // Prepares the fixture, then hands three statement groups to P_TEST_FULL (macro defined outside this view).
+    {
+        SetUp();
+        bordertype = 2; // NOTE(review): 2 is presumably cv::BORDER_REFLECT - confirm; replaces the fixture's value
+        P_TEST_FULL(
+                { // group 1: build the device input from the host image
+                ocl_img_gray = cv::ocl::oclMat(img_gray);
+                },
+                { // group 2: operation under test, called with 3, the fixture's ksize and 0.5
+                cv::ocl::cornerHarris(ocl_img_gray, dev_dst_gray, 3, ksize, 0.5, bordertype );
+                },
+                { // group 3: download the device result into the host destination
+                dev_dst_gray.download(dst_gray);
+                });
+    }
+};
+
+// Driver for the hand-rolled perf cases; every case is currently commented out, so only the banners and the total are printed.
+void test()
+{
+    clock_t start = clock(); // processor time in clock ticks, converted to ms when printed below
+    std::cout << ">>>>>>>> Performance test started <<<<<<<<\n";
+    /*
+       {
+       AddArrayP AddArrayP;
+       AddArrayP.Run();
+       SubtractArrayP subarray;
+       subarray.Run();
+       MultiplyArrayP MultiplyArrayP;
+       MultiplyArrayP.Run();
+       DivideArrayP DivideArrayP;
+       DivideArrayP.Run();
+       }
+       std::cout.flush();
+       {
+       CompareP comp;
+       comp.Run();
+       MagnitudeP magnitude;
+       magnitude.Run();
+       LUTP lut;
+       lut.Run();
+       FlipP FlipP;
+       FlipP.Run();
+       MinMaxP minmax;
+       minmax.Run();
+       MinMaxLocP minmaxloc;
+       minmaxloc.Run();
+       CountNonZeroP cnz;
+       cnz.Run();
+       SumP sum;
+       sum.Run();
+       }*/
+      /* std::cout.flush();
+       {
+       BitwiseNotP bn;
+       bn.Run();
+       BitwiseOrP bo;
+       bo.Run();
+       BitwiseAndP ba;
+       ba.Run();
+       BitwiseXorP bx;
+       bx.Run();
+       }*/
+    std::cout.flush();
+    {
+        //   TransposeP transpose;
+        //  transpose.Run();
+        // AbsdiffArrayP absdiff;
+        // absdiff.Run();
+        // SplitP split;
+        // split.Run();
+       // MergeP merge;
+       // merge.Run();
+        /*
+           SetToP setto;
+           setto.Run();
+           CopyToP copyto;
+           copyto.Run();
+           ConvertToP convertto;
+           convertto.Run();
+           */
+    }
+    /*
+       std::cout.flush();
+       {
+       BlurP blur;
+       blur.Run();
+       SobelP sobel;
+       sobel.Run();
+       ScharrP scharr;
+       scharr.Run();
+       GaussianBlurP gblur;
+       gblur.Run();
+       DilateP dilate;
+       dilate.Run();
+       ErodeP erode;
+       erode.Run();
+       }
+       std::cout.flush();
+       {
+       MorphExP morphex;
+       morphex.Run();
+       CalcHistP calchist;
+       calchist.Run();
+       EqualizeHistP eqhist;
+       eqhist.Run();
+       ThresholdP threshold;
+       threshold.Run();
+       ResizeP resize;
+       resize.Run();
+       CvtColorP cvtcolor;
+       cvtcolor.Run();
+       }
+
+       {
+       LogP log;
+       log.Run();
+       ExpP exp;
+       exp.Run();
+       }
+
+       std::cout.flush();
+       {
+    //PhaseP phase;
+    //phase.Run();
+    }
+    std::cout.flush();
+    {
+    CartToPolarP ctop;
+    ctop.Run();
+    }
+    std::cout.flush();
+    {
+    PolarToCartP ptoc;
+    ptoc.Run();
+    }
+    {
+    WarpAffineP warpA;
+    warpA.Run();
+    WarpPerspectiveP warpP;
+    warpP.Run();	
+    }
+
+    {
+    CornerHarrisP ch;
+    ch.Run();
+    }
+
+    {
+    LaplacianP laplacian;
+    laplacian.Run();
+    }
+
+
+    */
+    std::cout << ">>>>>>>> Performance test ended <<<<<<<<\ntotal - " << (clock() - start) * 1000.0 / CLOCKS_PER_SEC << "ms\n"; // ticks -> ms
+    std::cout.flush();
+}
+
+void  run_perf_test() // Entry point of the hand-rolled perf run: the three calls below, in this order.
+{
+    print_info(); // defined outside this view
+    cvtest::TS::ptr()->init("ocl"); // initialise the shared cvtest::TS instance with the name "ocl"
+    test(); // run the perf driver defined above
+}
+
+#endif // WITH_OPENCL
+
+#endif // PREF_TEST_OCL
diff --git a/modules/ocl/perf/precomp.cpp b/modules/ocl/perf/precomp.cpp
new file mode 100644
index 000000000..f505dac9f
--- /dev/null
+++ b/modules/ocl/perf/precomp.cpp
@@ -0,0 +1,45 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                        Intel License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of Intel Corporation may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+
+	
\ No newline at end of file
diff --git a/modules/ocl/perf/precomp.hpp b/modules/ocl/perf/precomp.hpp
new file mode 100644
index 000000000..cad26fc8d
--- /dev/null
+++ b/modules/ocl/perf/precomp.hpp
@@ -0,0 +1,72 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                        Intel License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of Intel Corporation may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#ifndef __OPENCV_TEST_PRECOMP_HPP__
+#define __OPENCV_TEST_PRECOMP_HPP__
+
+#include <cmath>
+#include <cstdio>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <string>
+#include <limits>
+#include <algorithm>
+#include <iterator>
+#include <string>
+#include <cstdarg>
+#include "cvconfig.h"
+#include "opencv2/core/core.hpp"
+#include "opencv2/highgui/highgui.hpp"
+#include "opencv2/calib3d/calib3d.hpp"
+#include "opencv2/imgproc/imgproc.hpp"
+#include "opencv2/video/video.hpp"
+#include "opencv2/ts/ts.hpp"
+#include "opencv2/ts/ts_perf.hpp"
+#include "opencv2/ocl/ocl.hpp"
+#include "opencv2/nonfree/nonfree.hpp"
+
+#include "utility.hpp"
+#include "interpolation.hpp"
+//#include "add_test_info.h"
+//#define  PERF_TEST_OCL 1
+
+#endif
+
diff --git a/modules/ocl/perf/test_arithm.cpp b/modules/ocl/perf/test_arithm.cpp
new file mode 100644
index 000000000..0e6cf6e4b
--- /dev/null
+++ b/modules/ocl/perf/test_arithm.cpp
@@ -0,0 +1,3658 @@
+///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Niko Li, newlife20080214@gmail.com
+//    Jia Haipeng, jiahaipeng95@gmail.com
+//    Shengen Yan, yanshengen@gmail.com
+//    Jiang Liyuan,jlyuan001.good@163.com
+//    Rock Li, Rock.Li@amd.com
+//    Zailong Wu, bullet@yeah.net
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+
+#include "precomp.hpp"
+#include <iomanip>
+
+#ifdef HAVE_OPENCL
+using namespace cv;
+using namespace cv::ocl;
+using namespace cvtest;
+using namespace testing;
+using namespace std;
+// Shared fixture for the arithmetic perf tests in this file. It owns random host
+// matrices, ROI views over them and the oclMat handles the tests assign into.
+// Parameter 0 selects the matrix type; parameter 1 is not read by the fixture.
+PARAM_TEST_CASE(ArithmTestBase, MatType, bool)
+{
+	int type;
+	cv::Scalar val; // random scalar operand (used by the Scalar test variants)
+
+	// whole host matrices
+	cv::Mat mat1;
+	cv::Mat mat2;
+	cv::Mat mask;
+	cv::Mat dst;
+	cv::Mat dst1; // second destination, for operations with two outputs
+
+	// ROI geometry, assigned by Has_roi()
+	int roicols;
+	int roirows;
+	int src1x;
+	int src1y;
+	int src2x;
+	int src2y;
+	int dstx;
+	int dsty;
+	int maskx;
+	int masky;
+
+	// ROI views into the host matrices
+	cv::Mat mat1_roi;
+	cv::Mat mat2_roi;
+	cv::Mat mask_roi;
+	cv::Mat dst_roi;
+	cv::Mat dst1_roi; // second destination
+	std::vector<cv::ocl::Info> oclinfo;
+
+	// whole device destinations
+	cv::ocl::oclMat gdst_whole;
+	cv::ocl::oclMat gdst1_whole; // second destination
+
+	// device matrices restricted to the ROI
+	cv::ocl::oclMat gmat1;
+	cv::ocl::oclMat gmat2;
+	cv::ocl::oclMat gdst;
+	cv::ocl::oclMat gdst1;   // second destination
+	cv::ocl::oclMat gmask;
+
+	// Fills the host matrices with random data, builds a binary mask, queries
+	// the OpenCL devices and sets the kernel binary path.
+	virtual void SetUp()
+	{
+		type = GET_PARAM(0);
+
+		cv::RNG& rng = TS::ptr()->get_rng();
+		cv::Size size(MWIDTH, MHEIGHT);
+
+		mat1 = randomMat(rng, size, type, 5, 16, false);
+		mat2 = randomMat(rng, size, type, 5, 16, false);
+		dst  = randomMat(rng, size, type, 5, 16, false);
+		dst1 = randomMat(rng, size, type, 5, 16, false);
+		mask = randomMat(rng, size, CV_8UC1, 0, 2,  false);
+
+		// The last argument of cv::threshold is a threshold type, not a matrix
+		// type. Passing CV_8UC1 there only works because it has the same numeric
+		// value (0) as cv::THRESH_BINARY, so the intended constant is spelled out.
+		cv::threshold(mask, mask, 0.5, 255., cv::THRESH_BINARY);
+
+		val = cv::Scalar(rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0));
+		int devnums = getDevice(oclinfo);
+		CV_Assert(devnums>0);
+		// To run on a non-default device, select it here:
+		//setDevice(oclinfo[0]);
+		setBinpath(CLBINPATH);
+	}
+
+	// Points the *_roi views at the whole matrices (b == 0) or at a fixed
+	// sub-rectangle that starts at (1,1) and is one row and one column smaller
+	// (b != 0). The ROI is deterministic, not random.
+	void Has_roi(int b)
+	{
+		if(b)
+		{
+			roicols = mat1.cols-1;
+			roirows = mat1.rows-1;
+			src1x   = 1;
+			src2x   = 1;
+			src1y   = 1;
+			src2y   = 1;
+			dstx    = 1;
+			dsty    = 1;
+			maskx   = 1;
+			masky   = 1;
+		}
+		else
+		{
+			roicols = mat1.cols;
+			roirows = mat1.rows;
+			src1x   = 0;
+			src2x   = 0;
+			src1y   = 0;
+			src2y   = 0;
+			dstx    = 0;
+			dsty    = 0;
+			maskx   = 0;
+			masky   = 0;
+		}
+
+		mat1_roi = mat1(Rect(src1x,src1y,roicols,roirows));
+		mat2_roi = mat2(Rect(src2x,src2y,roicols,roirows));
+		mask_roi = mask(Rect(maskx,masky,roicols,roirows));
+		dst_roi  = dst(Rect(dstx,dsty,roicols,roirows));
+		dst1_roi = dst1(Rect(dstx,dsty,roicols,roirows));
+
+		// The device matrices are not assigned here: each test assigns them
+		// itself, inside its own timed region.
+	}
+
+};
+////////////////////////////////lut/////////////////////////////////////////////////
+
+struct Lut : ArithmTestBase {}; // empty subclass: gives the LUT tests their own gtest fixture name
+
+TEST_P(Lut, Mat)
+{
+	// Times cv::LUT (host) against cv::ocl::LUT (device) in two passes: pass 0
+	// uses the whole matrices, pass 1 the fixture's ROI. The lookup table is a
+	// 256x1 window cut from a 512x3 random matrix that is regenerated on every
+	// iteration. Each pass runs LOOP_TIMES+1 iterations; iteration 0 is a
+	// warm-up and is excluded from the averages. The "gpu" figure spans the
+	// device assignments, the call and the download.
+	cv::Mat lutSource(3, 512, CV_8UC1);
+	cv::RNG& rng = TS::ptr()->get_rng();
+	rng.fill(lutSource, cv::RNG::UNIFORM, cv::Scalar::all(0), cv::Scalar::all(256));
+
+#ifndef PRINT_KERNEL_RUN_TIME
+	// Default build: measure host time, device time with transfers, and the
+	// device call on its own.
+	for (int useRoi = 0; useRoi < 2; ++useRoi)
+	{
+		double cpuTicks = 0, gpuTicks = 0, kernelTicks = 0;
+		for (int iter = 0; iter < LOOP_TIMES+1; ++iter)
+		{
+			Has_roi(useRoi);
+
+			// replace the second-operand view chosen by Has_roi() with the table
+			lutSource = randomMat(rng, cv::Size(512,3), type, 5, 16, false);
+			mat2_roi = lutSource(Rect(src2x,src2y,256,1));
+
+			// host reference
+			const double cpuBegin = (double)cvGetTickCount();
+			cv::LUT(mat1_roi, mat2_roi, dst_roi);
+			const double cpuSpan = (double)cvGetTickCount() - cpuBegin;
+
+			// device path, transfers included
+			const double gpuBegin = (double)cvGetTickCount();
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+			gmat1 = mat1_roi;
+			gmat2 = mat2_roi;
+
+			// device call alone
+			const double kernelBegin = (double)cvGetTickCount();
+			cv::ocl::LUT(gmat1, gmat2, gdst);
+			const double kernelSpan = (double)cvGetTickCount() - kernelBegin;
+			cv::Mat cpu_dst;
+			gdst_whole.download(cpu_dst);
+			const double gpuSpan = (double)cvGetTickCount() - gpuBegin;
+
+			if (iter != 0) // skip the warm-up iteration
+			{
+				gpuTicks += gpuSpan;
+				cpuTicks += cpuSpan;
+				kernelTicks += kernelSpan;
+			}
+		}
+
+		// accumulated ticks -> average milliseconds per measured iteration
+		const double divisor = (double)cvGetTickFrequency()* LOOP_TIMES *1000.;
+		cout << (useRoi == 0 ? "no roi\n" : "with roi\n");
+		cout << "average cpu runtime is  " << cpuTicks/divisor << "ms" << endl;
+		cout << "average gpu runtime is  " << gpuTicks/divisor << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << kernelTicks/divisor << "ms" << endl;
+	}
+#else
+	// Build with PRINT_KERNEL_RUN_TIME defined: no timing is done here, the
+	// device call simply runs once per pass behind a label.
+	for (int useRoi = 0; useRoi < 2; ++useRoi)
+	{
+		Has_roi(useRoi);
+		lutSource = randomMat(rng, cv::Size(512,3), type, 5, 16, false);
+		mat2_roi = lutSource(Rect(src2x,src2y,256,1));
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+		gmat1 = mat1_roi;
+		gmat2 = mat2_roi;
+
+		cout << (useRoi == 0 ? "no roi:" : "\nwith roi:");
+		cv::ocl::LUT(gmat1, gmat2, gdst);
+	}
+#endif
+}
+
+
+
+////////////////////////////////exp/////////////////////////////////////////////////
+
+struct Exp : ArithmTestBase {}; // empty subclass: gives the exp tests their own gtest fixture name
+
+TEST_P(Exp, Mat)
+{
+	// Times cv::exp (host) against cv::ocl::exp (device) in two passes: pass 0
+	// uses the whole matrices, pass 1 the fixture's ROI. Every pass runs
+	// LOOP_TIMES+1 iterations; iteration 0 is a warm-up and is excluded from the
+	// averages. The "gpu" figure spans the device assignments, the call and the
+	// download; the "without data transfer" figure spans the call only.
+#ifndef PRINT_KERNEL_RUN_TIME
+	for (int useRoi = 0; useRoi < 2; ++useRoi)
+	{
+		double cpuTicks = 0, gpuTicks = 0, kernelTicks = 0;
+		for (int iter = 0; iter < LOOP_TIMES+1; ++iter)
+		{
+			Has_roi(useRoi);
+
+			// host reference
+			const double cpuBegin = (double)cvGetTickCount();
+			cv::exp(mat1_roi, dst_roi);
+			const double cpuSpan = (double)cvGetTickCount() - cpuBegin;
+
+			// device path, transfers included
+			const double gpuBegin = (double)cvGetTickCount();
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+			gmat1 = mat1_roi;
+
+			const double kernelBegin = (double)cvGetTickCount();
+			cv::ocl::exp(gmat1, gdst);
+			const double kernelSpan = (double)cvGetTickCount() - kernelBegin;
+			cv::Mat cpu_dst;
+			gdst_whole.download(cpu_dst);
+			const double gpuSpan = (double)cvGetTickCount() - gpuBegin;
+
+			if (iter != 0) // skip the warm-up iteration
+			{
+				gpuTicks += gpuSpan;
+				cpuTicks += cpuSpan;
+				kernelTicks += kernelSpan;
+			}
+		}
+
+		// accumulated ticks -> average milliseconds per measured iteration
+		const double divisor = (double)cvGetTickFrequency()* LOOP_TIMES *1000.;
+		cout << (useRoi == 0 ? "no roi\n" : "with roi\n");
+		cout << "average cpu runtime is  " << cpuTicks/divisor << "ms" << endl;
+		cout << "average gpu runtime is  " << gpuTicks/divisor << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << kernelTicks/divisor << "ms" << endl;
+	}
+#else
+	for (int useRoi = 0; useRoi < 2; ++useRoi)
+	{
+		Has_roi(useRoi);
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+		gmat1 = mat1_roi;
+		cout << (useRoi == 0 ? "no roi:" : "\nwith roi:");
+		cv::ocl::exp(gmat1, gdst);
+	}
+#endif
+}
+
+
+////////////////////////////////log/////////////////////////////////////////////////
+
+struct Log : ArithmTestBase {}; // empty subclass: gives the log tests their own gtest fixture name
+
+TEST_P(Log, Mat)
+{
+	// Times cv::log (host) against cv::ocl::log (device) in two passes: pass 0
+	// uses the whole matrices, pass 1 the fixture's ROI. Every pass runs
+	// LOOP_TIMES+1 iterations; iteration 0 is a warm-up and is excluded from the
+	// averages. The "gpu" figure spans device assignments, call and download.
+#ifndef PRINT_KERNEL_RUN_TIME
+	for (int useRoi = 0; useRoi < 2; ++useRoi)
+	{
+		double cpuTicks = 0, gpuTicks = 0, kernelTicks = 0;
+		for (int iter = 0; iter < LOOP_TIMES+1; ++iter)
+		{
+			Has_roi(useRoi);
+
+			// host reference
+			const double cpuBegin = (double)cvGetTickCount();
+			cv::log(mat1_roi, dst_roi);
+			const double cpuSpan = (double)cvGetTickCount() - cpuBegin;
+
+			// device path, transfers included
+			const double gpuBegin = (double)cvGetTickCount();
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+			gmat1 = mat1_roi;
+
+			const double kernelBegin = (double)cvGetTickCount();
+			cv::ocl::log(gmat1, gdst);
+			const double kernelSpan = (double)cvGetTickCount() - kernelBegin;
+			cv::Mat cpu_dst;
+			gdst_whole.download(cpu_dst);
+			const double gpuSpan = (double)cvGetTickCount() - gpuBegin;
+
+			if (iter != 0) // skip the warm-up iteration
+			{
+				gpuTicks += gpuSpan;
+				cpuTicks += cpuSpan;
+				kernelTicks += kernelSpan;
+			}
+		}
+
+		// accumulated ticks -> average milliseconds per measured iteration
+		const double divisor = (double)cvGetTickFrequency()* LOOP_TIMES *1000.;
+		cout << (useRoi == 0 ? "no roi\n" : "with roi\n");
+		cout << "average cpu runtime is  " << cpuTicks/divisor << "ms" << endl;
+		cout << "average gpu runtime is  " << gpuTicks/divisor << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << kernelTicks/divisor << "ms" << endl;
+	}
+#else
+	for (int useRoi = 0; useRoi < 2; ++useRoi)
+	{
+		Has_roi(useRoi);
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+		gmat1 = mat1_roi;
+		cout << (useRoi == 0 ? "no roi:" : "\nwith roi:");
+		cv::ocl::log(gmat1, gdst);
+	}
+#endif
+}
+
+
+
+
+////////////////////////////////add/////////////////////////////////////////////////
+
+struct Add : ArithmTestBase {}; // empty subclass: gives the add tests their own gtest fixture name
+
+TEST_P(Add, Mat)
+{
+	// Times matrix + matrix cv::add (host) against cv::ocl::add (device), first
+	// on the whole matrices and then on the fixture's ROI. Iteration 0 of each
+	// pass is a warm-up and is excluded from the averages.
+#ifndef PRINT_KERNEL_RUN_TIME
+	for (int useRoi = 0; useRoi < 2; ++useRoi)
+	{
+		double cpuTicks = 0, gpuTicks = 0, kernelTicks = 0;
+		for (int iter = 0; iter < LOOP_TIMES+1; ++iter)
+		{
+			Has_roi(useRoi);
+
+			// host reference
+			const double cpuBegin = (double)cvGetTickCount();
+			cv::add(mat1_roi, mat2_roi, dst_roi);
+			const double cpuSpan = (double)cvGetTickCount() - cpuBegin;
+
+			// device path, transfers included
+			const double gpuBegin = (double)cvGetTickCount();
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+			gmat1 = mat1_roi;
+			gmat2 = mat2_roi;
+
+			const double kernelBegin = (double)cvGetTickCount();
+			cv::ocl::add(gmat1, gmat2, gdst);
+			const double kernelSpan = (double)cvGetTickCount() - kernelBegin;
+			cv::Mat cpu_dst;
+			gdst_whole.download(cpu_dst);
+			const double gpuSpan = (double)cvGetTickCount() - gpuBegin;
+
+			if (iter != 0) // skip the warm-up iteration
+			{
+				gpuTicks += gpuSpan;
+				cpuTicks += cpuSpan;
+				kernelTicks += kernelSpan;
+			}
+		}
+
+		// accumulated ticks -> average milliseconds per measured iteration
+		const double divisor = (double)cvGetTickFrequency()* LOOP_TIMES *1000.;
+		cout << (useRoi == 0 ? "no roi\n" : "with roi\n");
+		cout << "average cpu runtime is  " << cpuTicks/divisor << "ms" << endl;
+		cout << "average gpu runtime is  " << gpuTicks/divisor << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << kernelTicks/divisor << "ms" << endl;
+	}
+#else
+	for (int useRoi = 0; useRoi < 2; ++useRoi)
+	{
+		Has_roi(useRoi);
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+		gmat1 = mat1_roi;
+		gmat2 = mat2_roi;
+		cout << (useRoi == 0 ? "no roi:" : "\nwith roi:");
+		cv::ocl::add(gmat1, gmat2, gdst);
+	}
+#endif
+}
+
+TEST_P(Add, Mat_Mask)
+{
+	// Masked matrix + matrix add: host cv::add against device cv::ocl::add, whole
+	// matrices first, then the fixture's ROI. Iteration 0 of each pass is a warm-up.
+#ifndef PRINT_KERNEL_RUN_TIME
+	for (int useRoi = 0; useRoi < 2; ++useRoi)
+	{
+		double cpuTicks = 0, gpuTicks = 0, kernelTicks = 0;
+		for (int iter = 0; iter < LOOP_TIMES+1; ++iter)
+		{
+			Has_roi(useRoi);
+
+			// host reference
+			const double cpuBegin = (double)cvGetTickCount();
+			cv::add(mat1_roi, mat2_roi, dst_roi, mask_roi);
+			const double cpuSpan = (double)cvGetTickCount() - cpuBegin;
+
+			// device path, transfers included
+			const double gpuBegin = (double)cvGetTickCount();
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+			gmat1 = mat1_roi;
+			gmat2 = mat2_roi;
+			gmask = mask_roi;
+			const double kernelBegin = (double)cvGetTickCount();
+			cv::ocl::add(gmat1, gmat2, gdst, gmask);
+			const double kernelSpan = (double)cvGetTickCount() - kernelBegin;
+			cv::Mat cpu_dst;
+			gdst_whole.download(cpu_dst);
+			const double gpuSpan = (double)cvGetTickCount() - gpuBegin;
+
+			if (iter != 0) // skip the warm-up iteration
+			{
+				gpuTicks += gpuSpan;
+				cpuTicks += cpuSpan;
+				kernelTicks += kernelSpan;
+			}
+		}
+
+		// accumulated ticks -> average milliseconds per measured iteration
+		const double divisor = (double)cvGetTickFrequency()* LOOP_TIMES *1000.;
+		cout << (useRoi == 0 ? "no roi\n" : "with roi\n");
+		cout << "average cpu runtime is  " << cpuTicks/divisor << "ms" << endl;
+		cout << "average gpu runtime is  " << gpuTicks/divisor << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << kernelTicks/divisor << "ms" << endl;
+	}
+#else
+	for (int useRoi = 0; useRoi < 2; ++useRoi)
+	{
+		Has_roi(useRoi);
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+		gmat1 = mat1_roi;
+		gmat2 = mat2_roi;
+		gmask = mask_roi;
+		cout << (useRoi == 0 ? "no roi:" : "\nwith roi:");
+		cv::ocl::add(gmat1, gmat2, gdst, gmask);
+	}
+#endif
+}
+TEST_P(Add, Scalar)
+{
+	// Matrix + scalar add (the fixture's random `val`): host cv::add against
+	// device cv::ocl::add, whole matrices first, then the fixture's ROI.
+	// Iteration 0 of each pass is a warm-up and is excluded from the averages.
+#ifndef PRINT_KERNEL_RUN_TIME
+	for (int useRoi = 0; useRoi < 2; ++useRoi)
+	{
+		double cpuTicks = 0, gpuTicks = 0, kernelTicks = 0;
+		for (int iter = 0; iter < LOOP_TIMES+1; ++iter)
+		{
+			Has_roi(useRoi);
+			// host reference
+			const double cpuBegin = (double)cvGetTickCount();
+			cv::add(mat1_roi, val, dst_roi);
+			const double cpuSpan = (double)cvGetTickCount() - cpuBegin;
+
+			// device path, transfers included
+			const double gpuBegin = (double)cvGetTickCount();
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+			gmat1 = mat1_roi;
+			const double kernelBegin = (double)cvGetTickCount();
+			cv::ocl::add(gmat1, val, gdst);
+			const double kernelSpan = (double)cvGetTickCount() - kernelBegin;
+			cv::Mat cpu_dst;
+			gdst_whole.download(cpu_dst);
+			const double gpuSpan = (double)cvGetTickCount() - gpuBegin;
+			if (iter != 0) // skip the warm-up iteration
+			{
+				gpuTicks += gpuSpan;
+				cpuTicks += cpuSpan;
+				kernelTicks += kernelSpan;
+			}
+		}
+
+		// accumulated ticks -> average milliseconds per measured iteration
+		const double divisor = (double)cvGetTickFrequency()* LOOP_TIMES *1000.;
+		cout << (useRoi == 0 ? "no roi\n" : "with roi\n");
+		cout << "average cpu runtime is  " << cpuTicks/divisor << "ms" << endl;
+		cout << "average gpu runtime is  " << gpuTicks/divisor << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << kernelTicks/divisor << "ms" << endl;
+	}
+#else
+	for (int useRoi = 0; useRoi < 2; ++useRoi)
+	{
+		Has_roi(useRoi);
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+		gmat1 = mat1_roi;
+		cout << (useRoi == 0 ? "no roi:" : "\nwith roi:");
+		cv::ocl::add(gmat1, val, gdst);
+	}
+#endif
+}
+
+TEST_P(Add, Scalar_Mask) // Runs the masked scalar cv::add and cv::ocl::add for k = 0 and 1 and prints average times.
+{   
+#ifndef PRINT_KERNEL_RUN_TIME   // this branch measures elapsed host ticks with cvGetTickCount
+	double totalcputick=0;
+	double totalgputick=0;
+	double totalgputick_kernel=0;
+	double t0=0; // ticks spent in the cv::add call
+	double t1=0; // ticks for the whole GPU path, from gdst_whole assignment through download
+	double t2=0;	// ticks around the cv::ocl::add call only
+	for(int k=0;k<2;k++){ // k is handed to Has_roi; output is labelled "no roi" for k==0, "with roi" otherwise
+		totalcputick=0; // accumulators restart for each k
+		totalgputick=0;
+		totalgputick_kernel=0;
+		for(int j = 0; j < LOOP_TIMES+1; j ++) // one pass more than LOOP_TIMES because pass j==0 is not accumulated
+		{
+			Has_roi(k);       
+
+			t0 = (double)cvGetTickCount();//cpu start
+			cv::add(mat1_roi, val, dst_roi, mask_roi);
+			t0 = (double)cvGetTickCount() - t0;//cpu end
+
+			t1 = (double)cvGetTickCount();//gpu start: timed span runs through the download below
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+			gmat1 = mat1_roi;
+			gmask = mask_roi; // mask assignment is inside the t1 span but outside the t2 span
+			t2=(double)cvGetTickCount();//kernel timer start
+			cv::ocl::add(gmat1, val, gdst, gmask);
+			t2 = (double)cvGetTickCount() - t2;//kernel timer stop; NOTE(review): host ticks around the call - confirm the call waits for the kernel
+			cv::Mat cpu_dst;
+			gdst_whole.download (cpu_dst);//download gdst_whole into cpu_dst
+			t1 = (double)cvGetTickCount() - t1;//gpu end
+			if(j == 0) // first pass is left out of the totals (NOTE(review): presumably a warm-up - confirm)
+				continue;
+			totalgputick=t1+totalgputick;
+			totalcputick=t0+totalcputick;	
+			totalgputick_kernel=t2+totalgputick_kernel;	
+
+		}
+		if(k==0){cout<<"no roi\n";}else{cout<<"with roi\n";};
+		cout << "average cpu runtime is  " << totalcputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl; // ticks / (ticks-per-microsecond * 1000) gives ms; LOOP_TIMES makes it a per-pass mean
+		cout << "average gpu runtime is  " << totalgputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << totalgputick_kernel/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	}
+#else // PRINT_KERNEL_RUN_TIME defined: no host timing here, one cv::ocl::add call per Has_roi mode
+	for(int j = 0; j < 2; j ++)
+	{
+		Has_roi(j);
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+		gmat1 = mat1_roi;
+		gmask = mask_roi; 
+		if(j==0){cout<<"no roi:";}else{cout<<"\nwith roi:";}; // prints only the label; NOTE(review): the run time is presumably printed elsewhere - confirm
+		cv::ocl::add(gmat1, val, gdst, gmask);
+	};
+#endif // PRINT_KERNEL_RUN_TIME
+}
+
+
+////////////////////////////////sub/////////////////////////////////////////////////
+struct Sub : ArithmTestBase {}; // empty fixture: the Sub tests reuse ArithmTestBase without adding members
+
+TEST_P(Sub, Mat) // Runs cv::subtract and cv::ocl::subtract on two matrices for k = 0 and 1 and prints average times.
+{ 
+#ifndef PRINT_KERNEL_RUN_TIME   // this branch measures elapsed host ticks with cvGetTickCount
+	double totalcputick=0;
+	double totalgputick=0;
+	double totalgputick_kernel=0;
+	double t0=0; // ticks spent in the cv::subtract call
+	double t1=0; // ticks for the whole GPU path, from gdst_whole assignment through download
+	double t2=0;	// ticks around the cv::ocl::subtract call only
+	for(int k=0;k<2;k++){ // k is handed to Has_roi; output is labelled "no roi" for k==0, "with roi" otherwise
+		totalcputick=0; // accumulators restart for each k
+		totalgputick=0;
+		totalgputick_kernel=0;
+		for(int j = 0; j < LOOP_TIMES+1; j ++) // one pass more than LOOP_TIMES because pass j==0 is not accumulated
+		{
+			Has_roi(k);       
+
+			t0 = (double)cvGetTickCount();//cpu start
+			cv::subtract(mat1_roi, mat2_roi, dst_roi);
+			t0 = (double)cvGetTickCount() - t0;//cpu end
+
+			t1 = (double)cvGetTickCount();//gpu start: timed span runs through the download below
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+
+			gmat1 = mat1_roi;
+			gmat2 = mat2_roi;
+			t2=(double)cvGetTickCount();//kernel timer start
+			cv::ocl::subtract(gmat1, gmat2, gdst);
+			t2 = (double)cvGetTickCount() - t2;//kernel timer stop; NOTE(review): host ticks around the call - confirm the call waits for the kernel
+			cv::Mat cpu_dst;
+			gdst_whole.download (cpu_dst);//download gdst_whole into cpu_dst
+			t1 = (double)cvGetTickCount() - t1;//gpu end
+			if(j == 0) // first pass is left out of the totals (NOTE(review): presumably a warm-up - confirm)
+				continue;
+			totalgputick=t1+totalgputick;
+			totalcputick=t0+totalcputick;	
+			totalgputick_kernel=t2+totalgputick_kernel;	
+
+		}
+		if(k==0){cout<<"no roi\n";}else{cout<<"with roi\n";};
+		cout << "average cpu runtime is  " << totalcputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl; // ticks / (ticks-per-microsecond * 1000) gives ms; LOOP_TIMES makes it a per-pass mean
+		cout << "average gpu runtime is  " << totalgputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << totalgputick_kernel/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	}
+#else // PRINT_KERNEL_RUN_TIME defined: no host timing here, one cv::ocl::subtract call per Has_roi mode
+	for(int j = 0; j < 2; j ++)
+	{
+		Has_roi(j);
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+		gmat1 = mat1_roi;
+		gmat2 = mat2_roi;
+		if(j==0){cout<<"no roi:";}else{cout<<"\nwith roi:";}; // prints only the label; NOTE(review): the run time is presumably printed elsewhere - confirm
+		cv::ocl::subtract(gmat1, gmat2, gdst);
+	};
+#endif // PRINT_KERNEL_RUN_TIME
+}
+
+TEST_P(Sub, Mat_Mask) // Runs the masked matrix cv::subtract and cv::ocl::subtract for k = 0 and 1 and prints average times.
+{  
+#ifndef PRINT_KERNEL_RUN_TIME   // this branch measures elapsed host ticks with cvGetTickCount
+	double totalcputick=0;
+	double totalgputick=0;
+	double totalgputick_kernel=0;
+	double t0=0; // ticks spent in the cv::subtract call
+	double t1=0; // ticks for the whole GPU path, from gdst_whole assignment through download
+	double t2=0;	// ticks around the cv::ocl::subtract call only
+	for(int k=0;k<2;k++){ // k is handed to Has_roi; output is labelled "no roi" for k==0, "with roi" otherwise
+		totalcputick=0; // accumulators restart for each k
+		totalgputick=0;
+		totalgputick_kernel=0;
+		for(int j = 0; j < LOOP_TIMES+1; j ++) // one pass more than LOOP_TIMES because pass j==0 is not accumulated
+		{
+			Has_roi(k);       
+
+			t0 = (double)cvGetTickCount();//cpu start
+			cv::subtract(mat1_roi, mat2_roi, dst_roi, mask_roi);
+			t0 = (double)cvGetTickCount() - t0;//cpu end
+
+			t1 = (double)cvGetTickCount();//gpu start: timed span runs through the download below
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+
+			gmat1 = mat1_roi;
+			gmat2 = mat2_roi;
+			gmask = mask_roi; // mask assignment is inside the t1 span but outside the t2 span
+			t2=(double)cvGetTickCount();//kernel timer start
+			cv::ocl::subtract(gmat1, gmat2, gdst, gmask);
+			t2 = (double)cvGetTickCount() - t2;//kernel timer stop; NOTE(review): host ticks around the call - confirm the call waits for the kernel
+			cv::Mat cpu_dst;
+			gdst_whole.download (cpu_dst);//download gdst_whole into cpu_dst
+			t1 = (double)cvGetTickCount() - t1;//gpu end
+			if(j == 0) // first pass is left out of the totals (NOTE(review): presumably a warm-up - confirm)
+				continue;
+			totalgputick=t1+totalgputick;
+			totalcputick=t0+totalcputick;	
+			totalgputick_kernel=t2+totalgputick_kernel;	
+
+		}
+		if(k==0){cout<<"no roi\n";}else{cout<<"with roi\n";};
+		cout << "average cpu runtime is  " << totalcputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl; // ticks / (ticks-per-microsecond * 1000) gives ms; LOOP_TIMES makes it a per-pass mean
+		cout << "average gpu runtime is  " << totalgputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << totalgputick_kernel/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	}
+#else // PRINT_KERNEL_RUN_TIME defined: no host timing here, one cv::ocl::subtract call per Has_roi mode
+	for(int j = 0; j < 2; j ++)
+	{
+		Has_roi(j);
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+		gmat1 = mat1_roi;
+		gmat2 = mat2_roi;
+		gmask = mask_roi;
+		if(j==0){cout<<"no roi:";}else{cout<<"\nwith roi:";}; // prints only the label; NOTE(review): the run time is presumably printed elsewhere - confirm
+		cv::ocl::subtract(gmat1, gmat2, gdst, gmask);
+	};
+#endif // PRINT_KERNEL_RUN_TIME
+}
+TEST_P(Sub, Scalar) // Runs cv::subtract(mat1_roi, val, ...) and cv::ocl::subtract(gmat1, val, ...) for k = 0 and 1 and prints average times.
+{   
+#ifndef PRINT_KERNEL_RUN_TIME   // this branch measures elapsed host ticks with cvGetTickCount
+	double totalcputick=0;
+	double totalgputick=0;
+	double totalgputick_kernel=0;
+	double t0=0; // ticks spent in the cv::subtract call
+	double t1=0; // ticks for the whole GPU path, from gdst_whole assignment through download
+	double t2=0;	// ticks around the cv::ocl::subtract call only
+	for(int k=0;k<2;k++){ // k is handed to Has_roi; output is labelled "no roi" for k==0, "with roi" otherwise
+		totalcputick=0; // accumulators restart for each k
+		totalgputick=0;
+		totalgputick_kernel=0;
+		for(int j = 0; j < LOOP_TIMES+1; j ++) // one pass more than LOOP_TIMES because pass j==0 is not accumulated
+		{
+			Has_roi(k);       
+
+			t0 = (double)cvGetTickCount();//cpu start
+			cv::subtract(mat1_roi, val, dst_roi);
+			t0 = (double)cvGetTickCount() - t0;//cpu end
+
+			t1 = (double)cvGetTickCount();//gpu start: timed span runs through the download below
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+
+			gmat1 = mat1_roi;
+			t2=(double)cvGetTickCount();//kernel timer start
+			cv::ocl::subtract(gmat1, val, gdst);
+			t2 = (double)cvGetTickCount() - t2;//kernel timer stop; NOTE(review): host ticks around the call - confirm the call waits for the kernel
+			cv::Mat cpu_dst;
+			gdst_whole.download (cpu_dst);//download gdst_whole into cpu_dst
+			t1 = (double)cvGetTickCount() - t1;//gpu end
+			if(j == 0) // first pass is left out of the totals (NOTE(review): presumably a warm-up - confirm)
+				continue;
+			totalgputick=t1+totalgputick;
+			totalcputick=t0+totalcputick;	
+			totalgputick_kernel=t2+totalgputick_kernel;	
+
+		}
+		if(k==0){cout<<"no roi\n";}else{cout<<"with roi\n";};
+		cout << "average cpu runtime is  " << totalcputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl; // ticks / (ticks-per-microsecond * 1000) gives ms; LOOP_TIMES makes it a per-pass mean
+		cout << "average gpu runtime is  " << totalgputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << totalgputick_kernel/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	}
+#else // PRINT_KERNEL_RUN_TIME defined: no host timing here, one cv::ocl::subtract call per Has_roi mode
+	for(int j = 0; j < 2; j ++)
+	{
+		Has_roi(j);
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+		gmat1 = mat1_roi;
+		if(j==0){cout<<"no roi:";}else{cout<<"\nwith roi:";}; // prints only the label; NOTE(review): the run time is presumably printed elsewhere - confirm
+		cv::ocl::subtract(gmat1, val, gdst);
+	};
+#endif // PRINT_KERNEL_RUN_TIME
+}
+
+TEST_P(Sub, Scalar_Mask) // Runs the masked scalar cv::subtract and cv::ocl::subtract for k = 0 and 1 and prints average times.
+{    
+#ifndef PRINT_KERNEL_RUN_TIME   // this branch measures elapsed host ticks with cvGetTickCount
+	double totalcputick=0;
+	double totalgputick=0;
+	double totalgputick_kernel=0;
+	double t0=0; // ticks spent in the cv::subtract call
+	double t1=0; // ticks for the whole GPU path, from gdst_whole assignment through download
+	double t2=0;	// ticks around the cv::ocl::subtract call only
+	for(int k=0;k<2;k++){ // k is handed to Has_roi; output is labelled "no roi" for k==0, "with roi" otherwise
+		totalcputick=0; // accumulators restart for each k
+		totalgputick=0;
+		totalgputick_kernel=0;
+		for(int j = 0; j < LOOP_TIMES+1; j ++) // one pass more than LOOP_TIMES because pass j==0 is not accumulated
+		{
+			Has_roi(k);       
+
+			t0 = (double)cvGetTickCount();//cpu start
+			cv::subtract(mat1_roi, val, dst_roi, mask_roi);
+			t0 = (double)cvGetTickCount() - t0;//cpu end
+
+			t1 = (double)cvGetTickCount();//gpu start: timed span runs through the download below
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+
+			gmat1 = mat1_roi;
+			gmask = mask_roi; // mask assignment is inside the t1 span but outside the t2 span
+			t2=(double)cvGetTickCount();//kernel timer start
+			cv::ocl::subtract(gmat1, val, gdst, gmask);
+			t2 = (double)cvGetTickCount() - t2;//kernel timer stop; NOTE(review): host ticks around the call - confirm the call waits for the kernel
+			cv::Mat cpu_dst;
+			gdst_whole.download (cpu_dst);//download gdst_whole into cpu_dst
+			t1 = (double)cvGetTickCount() - t1;//gpu end
+			if(j == 0) // first pass is left out of the totals (NOTE(review): presumably a warm-up - confirm)
+				continue;
+			totalgputick=t1+totalgputick;
+			totalcputick=t0+totalcputick;	
+			totalgputick_kernel=t2+totalgputick_kernel;	
+
+		}
+		if(k==0){cout<<"no roi\n";}else{cout<<"with roi\n";};
+		cout << "average cpu runtime is  " << totalcputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl; // ticks / (ticks-per-microsecond * 1000) gives ms; LOOP_TIMES makes it a per-pass mean
+		cout << "average gpu runtime is  " << totalgputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << totalgputick_kernel/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	}
+#else // PRINT_KERNEL_RUN_TIME defined: no host timing here, one cv::ocl::subtract call per Has_roi mode
+	for(int j = 0; j < 2; j ++)
+	{
+		Has_roi(j);
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+		gmat1 = mat1_roi;
+		gmask = mask_roi;
+		if(j==0){cout<<"no roi:";}else{cout<<"\nwith roi:";}; // prints only the label; NOTE(review): the run time is presumably printed elsewhere - confirm
+		cv::ocl::subtract(gmat1, val, gdst, gmask);
+	};
+#endif // PRINT_KERNEL_RUN_TIME
+}
+
+
+////////////////////////////////Mul/////////////////////////////////////////////////
+struct Mul : ArithmTestBase {}; // empty fixture: the Mul tests reuse ArithmTestBase without adding members
+
+TEST_P(Mul, Mat) // Runs cv::multiply and cv::ocl::multiply on two matrices for k = 0 and 1 and prints average times.
+{    
+#ifndef PRINT_KERNEL_RUN_TIME   // this branch measures elapsed host ticks with cvGetTickCount
+	double totalcputick=0;
+	double totalgputick=0;
+	double totalgputick_kernel=0;
+	double t0=0; // ticks spent in the cv::multiply call
+	double t1=0; // ticks for the whole GPU path, from gdst_whole assignment through download
+	double t2=0;	// ticks around the cv::ocl::multiply call only
+	for(int k=0;k<2;k++){ // k is handed to Has_roi; output is labelled "no roi" for k==0, "with roi" otherwise
+		totalcputick=0; // accumulators restart for each k
+		totalgputick=0;
+		totalgputick_kernel=0;
+		for(int j = 0; j < LOOP_TIMES+1; j ++) // one pass more than LOOP_TIMES because pass j==0 is not accumulated
+		{
+			Has_roi(k);       
+
+			t0 = (double)cvGetTickCount();//cpu start
+			cv::multiply(mat1_roi, mat2_roi, dst_roi);
+			t0 = (double)cvGetTickCount() - t0;//cpu end
+
+			t1 = (double)cvGetTickCount();//gpu start: timed span runs through the download below
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+
+			gmat1 = mat1_roi;
+			gmat2 = mat2_roi;
+			t2=(double)cvGetTickCount();//kernel timer start
+			cv::ocl::multiply(gmat1, gmat2, gdst);
+			t2 = (double)cvGetTickCount() - t2;//kernel timer stop; NOTE(review): host ticks around the call - confirm the call waits for the kernel
+			cv::Mat cpu_dst;
+			gdst_whole.download (cpu_dst);//download gdst_whole into cpu_dst
+			t1 = (double)cvGetTickCount() - t1;//gpu end
+			if(j == 0) // first pass is left out of the totals (NOTE(review): presumably a warm-up - confirm)
+				continue;
+			totalgputick=t1+totalgputick;
+			totalcputick=t0+totalcputick;	
+			totalgputick_kernel=t2+totalgputick_kernel;	
+
+		}
+		if(k==0){cout<<"no roi\n";}else{cout<<"with roi\n";};
+		cout << "average cpu runtime is  " << totalcputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl; // ticks / (ticks-per-microsecond * 1000) gives ms; LOOP_TIMES makes it a per-pass mean
+		cout << "average gpu runtime is  " << totalgputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << totalgputick_kernel/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	}
+#else // PRINT_KERNEL_RUN_TIME defined: no host timing here, one cv::ocl::multiply call per Has_roi mode
+	for(int j = 0; j < 2; j ++)
+	{
+		Has_roi(j);
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+		gmat1 = mat1_roi;
+		gmat2 = mat2_roi;
+		if(j==0){cout<<"no roi:";}else{cout<<"\nwith roi:";}; // prints only the label; NOTE(review): the run time is presumably printed elsewhere - confirm
+		cv::ocl::multiply(gmat1, gmat2, gdst);
+	};
+#endif // PRINT_KERNEL_RUN_TIME
+}
+
+TEST_P(Mul, Mat_Scalar) // Runs cv::multiply and cv::ocl::multiply with a random fourth argument s for k = 0 and 1 and prints average times.
+{    
+#ifndef PRINT_KERNEL_RUN_TIME   // this branch measures elapsed host ticks with cvGetTickCount
+	double totalcputick=0;
+	double totalgputick=0;
+	double totalgputick_kernel=0;
+	double t0=0; // ticks spent in the cv::multiply call
+	double t1=0; // ticks for the whole GPU path, from gdst_whole assignment through download
+	double t2=0;	// ticks around the cv::ocl::multiply call only
+	for(int k=0;k<2;k++){ // k is handed to Has_roi; output is labelled "no roi" for k==0, "with roi" otherwise
+		totalcputick=0; // accumulators restart for each k
+		totalgputick=0;
+		totalgputick_kernel=0;
+		for(int j = 0; j < LOOP_TIMES+1; j ++) // one pass more than LOOP_TIMES because pass j==0 is not accumulated
+		{
+			Has_roi(k);       
+			cv::RNG& rng = TS::ptr()->get_rng();
+			double s = rng.uniform(-10.0, 10.0);    // drawn again on every pass, outside both timed spans; the same s feeds the CPU and ocl calls
+			t0 = (double)cvGetTickCount();//cpu start
+			cv::multiply(mat1_roi, mat2_roi, dst_roi, s);
+			t0 = (double)cvGetTickCount() - t0;//cpu end
+
+			t1 = (double)cvGetTickCount();//gpu start: timed span runs through the download below
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+
+			gmat1 = mat1_roi;
+			gmat2 = mat2_roi;
+			t2=(double)cvGetTickCount();//kernel timer start
+			cv::ocl::multiply(gmat1, gmat2, gdst, s);
+			t2 = (double)cvGetTickCount() - t2;//kernel timer stop; NOTE(review): host ticks around the call - confirm the call waits for the kernel
+			cv::Mat cpu_dst;
+			gdst_whole.download (cpu_dst);//download gdst_whole into cpu_dst
+			t1 = (double)cvGetTickCount() - t1;//gpu end
+			if(j == 0) // first pass is left out of the totals (NOTE(review): presumably a warm-up - confirm)
+				continue;
+			totalgputick=t1+totalgputick;
+			totalcputick=t0+totalcputick;	
+			totalgputick_kernel=t2+totalgputick_kernel;	
+
+		}
+		if(k==0){cout<<"no roi\n";}else{cout<<"with roi\n";};
+		cout << "average cpu runtime is  " << totalcputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl; // ticks / (ticks-per-microsecond * 1000) gives ms; LOOP_TIMES makes it a per-pass mean
+		cout << "average gpu runtime is  " << totalgputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << totalgputick_kernel/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	}
+#else // PRINT_KERNEL_RUN_TIME defined: no host timing here, one cv::ocl::multiply call per Has_roi mode
+	for(int j = 0; j < 2; j ++)
+	{
+		Has_roi(j);
+		cv::RNG& rng = TS::ptr()->get_rng();
+		double s = rng.uniform(-10.0, 10.0); // random fourth argument for the multiply call below
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+		gmat1 = mat1_roi;
+		gmat2 = mat2_roi;
+		if(j==0){cout<<"no roi:";}else{cout<<"\nwith roi:";}; // prints only the label; NOTE(review): the run time is presumably printed elsewhere - confirm
+		cv::ocl::multiply(gmat1, gmat2, gdst, s);
+	};
+#endif // PRINT_KERNEL_RUN_TIME
+}
+
+
+struct Div : ArithmTestBase {}; // empty fixture: the Div tests reuse ArithmTestBase without adding members
+
+TEST_P(Div, Mat) // Runs cv::divide and cv::ocl::divide on two matrices for k = 0 and 1 and prints average times.
+{   
+#ifndef PRINT_KERNEL_RUN_TIME   // this branch measures elapsed host ticks with cvGetTickCount
+	double totalcputick=0;
+	double totalgputick=0;
+	double totalgputick_kernel=0;
+	double t0=0; // ticks spent in the cv::divide call
+	double t1=0; // ticks for the whole GPU path, from gdst_whole assignment through download
+	double t2=0;	// ticks around the cv::ocl::divide call only
+	for(int k=0;k<2;k++){ // k is handed to Has_roi; output is labelled "no roi" for k==0, "with roi" otherwise
+		totalcputick=0; // accumulators restart for each k
+		totalgputick=0;
+		totalgputick_kernel=0;
+		for(int j = 0; j < LOOP_TIMES+1; j ++) // one pass more than LOOP_TIMES because pass j==0 is not accumulated
+		{
+			Has_roi(k);       
+
+			t0 = (double)cvGetTickCount();//cpu start
+			cv::divide(mat1_roi, mat2_roi, dst_roi);
+			t0 = (double)cvGetTickCount() - t0;//cpu end
+
+			t1 = (double)cvGetTickCount();//gpu start: timed span runs through the download below
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+
+			gmat1 = mat1_roi;
+			gmat2 = mat2_roi;
+			t2=(double)cvGetTickCount();//kernel timer start
+			cv::ocl::divide(gmat1, gmat2, gdst);
+			t2 = (double)cvGetTickCount() - t2;//kernel timer stop; NOTE(review): host ticks around the call - confirm the call waits for the kernel
+			cv::Mat cpu_dst;
+			gdst_whole.download (cpu_dst);//download gdst_whole into cpu_dst
+			t1 = (double)cvGetTickCount() - t1;//gpu end
+			if(j == 0) // first pass is left out of the totals (NOTE(review): presumably a warm-up - confirm)
+				continue;
+			totalgputick=t1+totalgputick;
+			totalcputick=t0+totalcputick;	
+			totalgputick_kernel=t2+totalgputick_kernel;	
+
+		}
+		if(k==0){cout<<"no roi\n";}else{cout<<"with roi\n";};
+		cout << "average cpu runtime is  " << totalcputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl; // ticks / (ticks-per-microsecond * 1000) gives ms; LOOP_TIMES makes it a per-pass mean
+		cout << "average gpu runtime is  " << totalgputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << totalgputick_kernel/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	}
+#else // PRINT_KERNEL_RUN_TIME defined: no host timing here, one cv::ocl::divide call per Has_roi mode
+	for(int j = 0; j < 2; j ++)
+	{
+		Has_roi(j);
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+		gmat1 = mat1_roi;
+		gmat2 = mat2_roi;
+		if(j==0){cout<<"no roi:";}else{cout<<"\nwith roi:";}; // prints only the label; NOTE(review): the run time is presumably printed elsewhere - confirm
+		cv::ocl::divide(gmat1, gmat2, gdst);
+	};
+#endif // PRINT_KERNEL_RUN_TIME
+}
+
+TEST_P(Div, Mat_Scalar) // Runs cv::divide and cv::ocl::divide with a random fourth argument s for k = 0 and 1 and prints average times.
+{    
+#ifndef PRINT_KERNEL_RUN_TIME   // this branch measures elapsed host ticks with cvGetTickCount
+	double totalcputick=0;
+	double totalgputick=0;
+	double totalgputick_kernel=0;
+	double t0=0; // ticks spent in the cv::divide call
+	double t1=0; // ticks for the whole GPU path, from gdst_whole assignment through download
+	double t2=0;	// ticks around the cv::ocl::divide call only
+	for(int k=0;k<2;k++){ // k is handed to Has_roi; output is labelled "no roi" for k==0, "with roi" otherwise
+		totalcputick=0; // accumulators restart for each k
+		totalgputick=0;
+		totalgputick_kernel=0;
+		for(int j = 0; j < LOOP_TIMES+1; j ++) // one pass more than LOOP_TIMES because pass j==0 is not accumulated
+		{
+			Has_roi(k);       
+			cv::RNG& rng = TS::ptr()->get_rng();
+			double s = rng.uniform(-10.0, 10.0);  // drawn again on every pass, outside both timed spans; the same s feeds the CPU and ocl calls
+			t0 = (double)cvGetTickCount();//cpu start
+			cv::divide(mat1_roi, mat2_roi, dst_roi, s);
+			t0 = (double)cvGetTickCount() - t0;//cpu end
+
+			t1 = (double)cvGetTickCount();//gpu start: timed span runs through the download below
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+
+			gmat1 = mat1_roi;
+			gmat2 = mat2_roi;
+			t2=(double)cvGetTickCount();//kernel timer start
+			cv::ocl::divide(gmat1, gmat2, gdst, s);
+			t2 = (double)cvGetTickCount() - t2;//kernel timer stop; NOTE(review): host ticks around the call - confirm the call waits for the kernel
+			cv::Mat cpu_dst;
+			gdst_whole.download (cpu_dst);//download gdst_whole into cpu_dst
+			t1 = (double)cvGetTickCount() - t1;//gpu end
+			if(j == 0) // first pass is left out of the totals (NOTE(review): presumably a warm-up - confirm)
+				continue;
+			totalgputick=t1+totalgputick;
+			totalcputick=t0+totalcputick;	
+			totalgputick_kernel=t2+totalgputick_kernel;	
+
+		}
+		if(k==0){cout<<"no roi\n";}else{cout<<"with roi\n";};
+		cout << "average cpu runtime is  " << totalcputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl; // ticks / (ticks-per-microsecond * 1000) gives ms; LOOP_TIMES makes it a per-pass mean
+		cout << "average gpu runtime is  " << totalgputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << totalgputick_kernel/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	}
+#else // PRINT_KERNEL_RUN_TIME defined: no host timing here, one cv::ocl::divide call per Has_roi mode
+	for(int j = 0; j < 2; j ++)
+	{
+		Has_roi(j);
+		cv::RNG& rng = TS::ptr()->get_rng();
+		double s = rng.uniform(-10.0, 10.0); // random fourth argument for the divide call below
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+		gmat1 = mat1_roi;
+		gmat2 = mat2_roi;
+		if(j==0){cout<<"no roi:";}else{cout<<"\nwith roi:";}; // prints only the label; NOTE(review): the run time is presumably printed elsewhere - confirm
+		cv::ocl::divide(gmat1, gmat2, gdst, s);
+	};
+#endif // PRINT_KERNEL_RUN_TIME
+}
+
+
+struct Absdiff : ArithmTestBase {}; // empty fixture: the Absdiff tests reuse ArithmTestBase without adding members
+
+TEST_P(Absdiff, Mat) // Runs cv::absdiff and cv::ocl::absdiff on two matrices for k = 0 and 1 and prints average times.
+{ 
+
+#ifndef PRINT_KERNEL_RUN_TIME   // this branch measures elapsed host ticks with cvGetTickCount
+	double totalcputick=0;
+	double totalgputick=0;
+	double totalgputick_kernel=0;
+	double t0=0; // ticks spent in the cv::absdiff call
+	double t1=0; // ticks for the whole GPU path, from gdst_whole assignment through download
+	double t2=0;	// ticks around the cv::ocl::absdiff call only
+	for(int k=0;k<2;k++){ // k is handed to Has_roi; output is labelled "no roi" for k==0, "with roi" otherwise
+		totalcputick=0; // accumulators restart for each k
+		totalgputick=0;
+		totalgputick_kernel=0;
+		for(int j = 0; j < LOOP_TIMES+1; j ++) // one pass more than LOOP_TIMES because pass j==0 is not accumulated
+		{
+			Has_roi(k);       
+
+			t0 = (double)cvGetTickCount();//cpu start
+			cv::absdiff(mat1_roi, mat2_roi, dst_roi);
+			t0 = (double)cvGetTickCount() - t0;//cpu end
+
+			t1 = (double)cvGetTickCount();//gpu start: timed span runs through the download below
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+
+			gmat1 = mat1_roi;
+			gmat2 = mat2_roi;
+			t2=(double)cvGetTickCount();//kernel timer start
+			cv::ocl::absdiff(gmat1, gmat2, gdst);
+			t2 = (double)cvGetTickCount() - t2;//kernel timer stop; NOTE(review): host ticks around the call - confirm the call waits for the kernel
+			cv::Mat cpu_dst;
+			gdst_whole.download (cpu_dst);//download gdst_whole into cpu_dst
+			t1 = (double)cvGetTickCount() - t1;//gpu end
+			if(j == 0) // first pass is left out of the totals (NOTE(review): presumably a warm-up - confirm)
+				continue;
+			totalgputick=t1+totalgputick;
+			totalcputick=t0+totalcputick;	
+			totalgputick_kernel=t2+totalgputick_kernel;	
+
+		}
+		if(k==0){cout<<"no roi\n";}else{cout<<"with roi\n";};
+		cout << "average cpu runtime is  " << totalcputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl; // ticks / (ticks-per-microsecond * 1000) gives ms; LOOP_TIMES makes it a per-pass mean
+		cout << "average gpu runtime is  " << totalgputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << totalgputick_kernel/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	}
+#else // PRINT_KERNEL_RUN_TIME defined: no host timing here, one cv::ocl::absdiff call per Has_roi mode
+	for(int j = 0; j < 2; j ++)
+	{
+		Has_roi(j);
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+		gmat1 = mat1_roi;
+		gmat2 = mat2_roi;
+		if(j==0){cout<<"no roi:";}else{cout<<"\nwith roi:";}; // prints only the label; NOTE(review): the run time is presumably printed elsewhere - confirm
+		cv::ocl::absdiff(gmat1, gmat2, gdst);
+	};
+#endif // PRINT_KERNEL_RUN_TIME
+}
+
+TEST_P(Absdiff, Mat_Scalar) // Runs cv::absdiff(mat1_roi, val, ...) and cv::ocl::absdiff(gmat1, val, ...) for k = 0 and 1 and prints average times.
+{    
+#ifndef PRINT_KERNEL_RUN_TIME   // this branch measures elapsed host ticks with cvGetTickCount
+	double totalcputick=0;
+	double totalgputick=0;
+	double totalgputick_kernel=0;
+	double t0=0; // ticks spent in the cv::absdiff call
+	double t1=0; // ticks for the whole GPU path, from gdst_whole assignment through download
+	double t2=0;	// ticks around the cv::ocl::absdiff call only
+	for(int k=0;k<2;k++){ // k is handed to Has_roi; output is labelled "no roi" for k==0, "with roi" otherwise
+		totalcputick=0; // accumulators restart for each k
+		totalgputick=0;
+		totalgputick_kernel=0;
+		for(int j = 0; j < LOOP_TIMES+1; j ++) // one pass more than LOOP_TIMES because pass j==0 is not accumulated
+		{
+			Has_roi(k);       
+
+			t0 = (double)cvGetTickCount();//cpu start
+			cv::absdiff(mat1_roi, val, dst_roi);
+			t0 = (double)cvGetTickCount() - t0;//cpu end
+
+			t1 = (double)cvGetTickCount();//gpu start: timed span runs through the download below
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+
+			gmat1 = mat1_roi;
+			t2=(double)cvGetTickCount();//kernel timer start
+			cv::ocl::absdiff(gmat1, val, gdst);
+			t2 = (double)cvGetTickCount() - t2;//kernel timer stop; NOTE(review): host ticks around the call - confirm the call waits for the kernel
+			cv::Mat cpu_dst;
+			gdst_whole.download (cpu_dst);//download gdst_whole into cpu_dst
+			t1 = (double)cvGetTickCount() - t1;//gpu end
+			if(j == 0) // first pass is left out of the totals (NOTE(review): presumably a warm-up - confirm)
+				continue;
+			totalgputick=t1+totalgputick;
+			totalcputick=t0+totalcputick;	
+			totalgputick_kernel=t2+totalgputick_kernel;	
+
+		}
+		if(k==0){cout<<"no roi\n";}else{cout<<"with roi\n";};
+		cout << "average cpu runtime is  " << totalcputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl; // ticks / (ticks-per-microsecond * 1000) gives ms; LOOP_TIMES makes it a per-pass mean
+		cout << "average gpu runtime is  " << totalgputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << totalgputick_kernel/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	}
+#else // PRINT_KERNEL_RUN_TIME defined: no host timing here, one cv::ocl::absdiff call per Has_roi mode
+	for(int j = 0; j < 2; j ++)
+	{
+		Has_roi(j);
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+		gmat1 = mat1_roi;
+		if(j==0){cout<<"no roi:";}else{cout<<"\nwith roi:";}; // prints only the label; NOTE(review): the run time is presumably printed elsewhere - confirm
+		cv::ocl::absdiff(gmat1, val, gdst);
+	};
+#endif // PRINT_KERNEL_RUN_TIME
+}
+
+
+
+struct CartToPolar : ArithmTestBase {}; // empty fixture: the CartToPolar tests reuse ArithmTestBase without adding members
+
+TEST_P(CartToPolar, angleInDegree) // Runs cv::cartToPolar and cv::ocl::cartToPolar with last argument 1 for k = 0 and 1 and prints average times.
+{    
+#ifndef PRINT_KERNEL_RUN_TIME   // this branch measures elapsed host ticks with cvGetTickCount
+	double totalcputick=0;
+	double totalgputick=0;
+	double totalgputick_kernel=0;
+	double t0=0; // ticks spent in the cv::cartToPolar call
+	double t1=0; // ticks for the whole GPU path, from gdst_whole assignment through both downloads
+	double t2=0;	// ticks around the cv::ocl::cartToPolar call only
+	for(int k=0;k<2;k++){ // k is handed to Has_roi; output is labelled "no roi" for k==0, "with roi" otherwise
+		totalcputick=0; // accumulators restart for each k
+		totalgputick=0;
+		totalgputick_kernel=0;
+		for(int j = 0; j < LOOP_TIMES+1; j ++) // one pass more than LOOP_TIMES because pass j==0 is not accumulated
+		{
+			Has_roi(k);       
+
+			t0 = (double)cvGetTickCount();//cpu start
+			cv::cartToPolar(mat1_roi, mat2_roi, dst_roi, dst1_roi, 1); // two outputs (dst_roi, dst1_roi); final 1 is the angle-in-degrees flag per the test name
+			t0 = (double)cvGetTickCount() - t0;//cpu end
+
+			t1 = (double)cvGetTickCount();//gpu start: timed span runs through the downloads below
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+
+			gmat1 = mat1_roi;
+			gmat2 = mat2_roi;
+			gdst1_whole = dst1; // second destination, set up the same way as gdst_whole/gdst
+			gdst1 = gdst1_whole(Rect(dstx,dsty,roicols,roirows));
+			t2=(double)cvGetTickCount();//kernel timer start
+			cv::ocl::cartToPolar(gmat1, gmat2, gdst, gdst1, 1);
+			t2 = (double)cvGetTickCount() - t2;//kernel timer stop; NOTE(review): host ticks around the call - confirm the call waits for the kernel
+			cv::Mat cpu_dst;
+			gdst_whole.download (cpu_dst);//download gdst_whole into cpu_dst
+			cv::Mat cpu_dst1;
+			gdst1_whole.download(cpu_dst1); // second download, also inside the t1 span
+			t1 = (double)cvGetTickCount() - t1;//gpu end
+			if(j == 0) // first pass is left out of the totals (NOTE(review): presumably a warm-up - confirm)
+				continue;
+			totalgputick=t1+totalgputick;
+			totalcputick=t0+totalcputick;	
+			totalgputick_kernel=t2+totalgputick_kernel;	
+
+		}
+		if(k==0){cout<<"no roi\n";}else{cout<<"with roi\n";};
+		cout << "average cpu runtime is  " << totalcputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl; // ticks / (ticks-per-microsecond * 1000) gives ms; LOOP_TIMES makes it a per-pass mean
+		cout << "average gpu runtime is  " << totalgputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << totalgputick_kernel/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	}
+#else // PRINT_KERNEL_RUN_TIME defined: no host timing here, one cv::ocl::cartToPolar call per Has_roi mode
+	for(int j = 0; j < 2; j ++)
+	{
+		Has_roi(j);
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+		gdst1_whole = dst1;
+		gdst1 = gdst1_whole(Rect(dstx,dsty,roicols,roirows));
+		gmat1 = mat1_roi;
+		gmat2 = mat2_roi;
+		if(j==0){cout<<"no roi:";}else{cout<<"\nwith roi:";}; // prints only the label; NOTE(review): the run time is presumably printed elsewhere - confirm
+		cv::ocl::cartToPolar(gmat1, gmat2, gdst, gdst1, 1);
+	};
+#endif // PRINT_KERNEL_RUN_TIME
+}
+
+TEST_P(CartToPolar, angleInRadians) // Runs cv::cartToPolar and cv::ocl::cartToPolar with last argument 0 for k = 0 and 1 and prints average times.
+{    
+#ifndef PRINT_KERNEL_RUN_TIME   // this branch measures elapsed host ticks with cvGetTickCount
+	double totalcputick=0;
+	double totalgputick=0;
+	double totalgputick_kernel=0;
+	double t0=0; // ticks spent in the cv::cartToPolar call
+	double t1=0; // ticks for the whole GPU path, from gdst_whole assignment through both downloads
+	double t2=0;	// ticks around the cv::ocl::cartToPolar call only
+	for(int k=0;k<2;k++){ // k is handed to Has_roi; output is labelled "no roi" for k==0, "with roi" otherwise
+		totalcputick=0; // accumulators restart for each k
+		totalgputick=0;
+		totalgputick_kernel=0;
+		for(int j = 0; j < LOOP_TIMES+1; j ++) // one pass more than LOOP_TIMES because pass j==0 is not accumulated
+		{
+			Has_roi(k);       
+
+			t0 = (double)cvGetTickCount();//cpu start
+			cv::cartToPolar(mat1_roi, mat2_roi, dst_roi, dst1_roi, 0); // two outputs (dst_roi, dst1_roi); final 0 leaves angles in radians per the test name
+			t0 = (double)cvGetTickCount() - t0;//cpu end
+
+			t1 = (double)cvGetTickCount();//gpu start: timed span runs through the downloads below
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+			gdst1_whole = dst1; // second destination, set up the same way as gdst_whole/gdst
+			gdst1 = gdst1_whole(Rect(dstx,dsty,roicols,roirows));
+			gmat1 = mat1_roi;
+			gmat2 = mat2_roi;
+			t2=(double)cvGetTickCount();//kernel timer start
+			cv::ocl::cartToPolar(gmat1, gmat2, gdst, gdst1, 0);
+			t2 = (double)cvGetTickCount() - t2;//kernel timer stop; NOTE(review): host ticks around the call - confirm the call waits for the kernel
+			cv::Mat cpu_dst;
+			gdst_whole.download (cpu_dst);//download gdst_whole into cpu_dst
+			cv::Mat cpu_dst1;
+			gdst1_whole.download(cpu_dst1); // second download, also inside the t1 span
+			t1 = (double)cvGetTickCount() - t1;//gpu end
+			if(j == 0) // first pass is left out of the totals (NOTE(review): presumably a warm-up - confirm)
+				continue;
+			totalgputick=t1+totalgputick;
+			totalcputick=t0+totalcputick;	
+			totalgputick_kernel=t2+totalgputick_kernel;	
+
+		}
+		if(k==0){cout<<"no roi\n";}else{cout<<"with roi\n";};
+		cout << "average cpu runtime is  " << totalcputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl; // ticks / (ticks-per-microsecond * 1000) gives ms; LOOP_TIMES makes it a per-pass mean
+		cout << "average gpu runtime is  " << totalgputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << totalgputick_kernel/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	}
+#else // PRINT_KERNEL_RUN_TIME defined: no host timing here, one cv::ocl::cartToPolar call per Has_roi mode
+	for(int j = 0; j < 2; j ++)
+	{
+		Has_roi(j);
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+		gdst1_whole = dst1;
+		gdst1 = gdst1_whole(Rect(dstx,dsty,roicols,roirows));
+		gmat1 = mat1_roi;
+		gmat2 = mat2_roi;
+		if(j==0){cout<<"no roi:";}else{cout<<"\nwith roi:";}; // prints only the label; NOTE(review): the run time is presumably printed elsewhere - confirm
+		cv::ocl::cartToPolar(gmat1, gmat2, gdst, gdst1, 0);
+	};
+#endif // PRINT_KERNEL_RUN_TIME
+}
+
+
+struct PolarToCart : ArithmTestBase {}; // empty fixture: the PolarToCart tests reuse ArithmTestBase without adding members
+
+TEST_P(PolarToCart, angleInDegree) // Runs cv::polarToCart and cv::ocl::polarToCart with last argument 1 for k = 0 and 1 and prints average times.
+{    
+#ifndef PRINT_KERNEL_RUN_TIME   // this branch measures elapsed host ticks with cvGetTickCount
+	double totalcputick=0;
+	double totalgputick=0;
+	double totalgputick_kernel=0;
+	double t0=0; // ticks spent in the cv::polarToCart call
+	double t1=0; // ticks for the whole GPU path, from gdst_whole assignment through both downloads
+	double t2=0;	// ticks around the cv::ocl::polarToCart call only
+	for(int k=0;k<2;k++){ // k is handed to Has_roi; output is labelled "no roi" for k==0, "with roi" otherwise
+		totalcputick=0; // accumulators restart for each k
+		totalgputick=0;
+		totalgputick_kernel=0;
+		for(int j = 0; j < LOOP_TIMES+1; j ++) // one pass more than LOOP_TIMES because pass j==0 is not accumulated
+		{
+			Has_roi(k);       
+
+			t0 = (double)cvGetTickCount();//cpu start
+			cv::polarToCart(mat1_roi, mat2_roi, dst_roi, dst1_roi, 1); // two outputs (dst_roi, dst1_roi); final 1 is the angle-in-degrees flag per the test name
+			t0 = (double)cvGetTickCount() - t0;//cpu end
+
+			t1 = (double)cvGetTickCount();//gpu start: timed span runs through the downloads below
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+
+			gmat1 = mat1_roi;
+			gmat2 = mat2_roi;
+			gdst1_whole = dst1; // second destination, set up the same way as gdst_whole/gdst
+			gdst1 = gdst1_whole(Rect(dstx,dsty,roicols,roirows));
+			t2=(double)cvGetTickCount();//kernel timer start
+			cv::ocl::polarToCart(gmat1, gmat2, gdst, gdst1, 1);
+			t2 = (double)cvGetTickCount() - t2;//kernel timer stop; NOTE(review): host ticks around the call - confirm the call waits for the kernel
+			cv::Mat cpu_dst;
+			gdst_whole.download (cpu_dst);//download gdst_whole into cpu_dst
+			cv::Mat cpu_dst1;
+			gdst1_whole.download(cpu_dst1); // second download, also inside the t1 span
+			t1 = (double)cvGetTickCount() - t1;//gpu end
+			if(j == 0) // first pass is left out of the totals (NOTE(review): presumably a warm-up - confirm)
+				continue;
+			totalgputick=t1+totalgputick;
+			totalcputick=t0+totalcputick;	
+			totalgputick_kernel=t2+totalgputick_kernel;	
+
+		}
+		if(k==0){cout<<"no roi\n";}else{cout<<"with roi\n";};
+		cout << "average cpu runtime is  " << totalcputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl; // ticks / (ticks-per-microsecond * 1000) gives ms; LOOP_TIMES makes it a per-pass mean
+		cout << "average gpu runtime is  " << totalgputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << totalgputick_kernel/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	}
+#else // PRINT_KERNEL_RUN_TIME defined: no host timing here, one cv::ocl::polarToCart call per Has_roi mode
+	for(int j = 0; j < 2; j ++)
+	{
+		Has_roi(j);
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+		gdst1_whole = dst1;
+		gdst1 = gdst1_whole(Rect(dstx,dsty,roicols,roirows));
+		gmat1 = mat1_roi;
+		gmat2 = mat2_roi;
+		if(j==0){cout<<"no roi:";}else{cout<<"\nwith roi:";}; // prints only the label; NOTE(review): the run time is presumably printed elsewhere - confirm
+		cv::ocl::polarToCart(gmat1, gmat2, gdst, gdst1, 1);
+	};
+#endif // PRINT_KERNEL_RUN_TIME
+}
+
+TEST_P(PolarToCart, angleInRadians) // Runs cv::polarToCart and cv::ocl::polarToCart with last argument 0 for k = 0 and 1 and prints average times.
+{   
+#ifndef PRINT_KERNEL_RUN_TIME   // this branch measures elapsed host ticks with cvGetTickCount
+	double totalcputick=0;
+	double totalgputick=0;
+	double totalgputick_kernel=0;
+	double t0=0; // ticks spent in the cv::polarToCart call
+	double t1=0; // ticks for the whole GPU path, from gdst_whole assignment through both downloads
+	double t2=0;	// ticks around the cv::ocl::polarToCart call only
+	for(int k=0;k<2;k++){ // k is handed to Has_roi; output is labelled "no roi" for k==0, "with roi" otherwise
+		totalcputick=0; // accumulators restart for each k
+		totalgputick=0;
+		totalgputick_kernel=0;
+		for(int j = 0; j < LOOP_TIMES+1; j ++) // one pass more than LOOP_TIMES because pass j==0 is not accumulated
+		{
+			Has_roi(k);       
+
+			t0 = (double)cvGetTickCount();//cpu start
+			cv::polarToCart(mat1_roi, mat2_roi, dst_roi, dst1_roi, 0); // two outputs (dst_roi, dst1_roi); final 0 treats angles as radians per the test name
+			t0 = (double)cvGetTickCount() - t0;//cpu end
+
+			t1 = (double)cvGetTickCount();//gpu start: timed span runs through the downloads below
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+
+			gmat1 = mat1_roi;
+			gmat2 = mat2_roi;
+			gdst1_whole = dst1; // second destination, set up the same way as gdst_whole/gdst
+			gdst1 = gdst1_whole(Rect(dstx,dsty,roicols,roirows));
+			t2=(double)cvGetTickCount();//kernel timer start
+			cv::ocl::polarToCart(gmat1, gmat2, gdst, gdst1, 0);
+			t2 = (double)cvGetTickCount() - t2;//kernel timer stop; NOTE(review): host ticks around the call - confirm the call waits for the kernel
+			cv::Mat cpu_dst;
+			gdst_whole.download (cpu_dst);//download gdst_whole into cpu_dst
+			cv::Mat cpu_dst1;
+			gdst1_whole.download(cpu_dst1); // second download, also inside the t1 span
+			t1 = (double)cvGetTickCount() - t1;//gpu end
+			if(j == 0) // first pass is left out of the totals (NOTE(review): presumably a warm-up - confirm)
+				continue;
+			totalgputick=t1+totalgputick;
+			totalcputick=t0+totalcputick;	
+			totalgputick_kernel=t2+totalgputick_kernel;	
+
+		}
+		if(k==0){cout<<"no roi\n";}else{cout<<"with roi\n";};
+		cout << "average cpu runtime is  " << totalcputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl; // ticks / (ticks-per-microsecond * 1000) gives ms; LOOP_TIMES makes it a per-pass mean
+		cout << "average gpu runtime is  " << totalgputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << totalgputick_kernel/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	}
+#else // PRINT_KERNEL_RUN_TIME defined: no host timing here, one cv::ocl::polarToCart call per Has_roi mode
+	for(int j = 0; j < 2; j ++)
+	{
+		Has_roi(j);
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+		gmat1 = mat1_roi;
+		gmat2 = mat2_roi;
+		gdst1_whole = dst1;
+		gdst1 = gdst1_whole(Rect(dstx,dsty,roicols,roirows));
+		if(j==0){cout<<"no roi:";}else{cout<<"\nwith roi:";}; // prints only the label; NOTE(review): the run time is presumably printed elsewhere - confirm
+		cv::ocl::polarToCart(gmat1, gmat2, gdst, gdst1, 0);
+	};
+#endif // PRINT_KERNEL_RUN_TIME
+}
+
+
+
+struct Magnitude : ArithmTestBase {}; // empty fixture: the Magnitude tests reuse ArithmTestBase without adding members
+
+TEST_P(Magnitude, Mat) // Runs cv::magnitude and cv::ocl::magnitude on two matrices for k = 0 and 1 and prints average times.
+{    
+#ifndef PRINT_KERNEL_RUN_TIME   // this branch measures elapsed host ticks with cvGetTickCount
+	double totalcputick=0;
+	double totalgputick=0;
+	double totalgputick_kernel=0;
+	double t0=0; // ticks spent in the cv::magnitude call
+	double t1=0; // ticks for the whole GPU path, from gdst_whole assignment through download
+	double t2=0;	// ticks around the cv::ocl::magnitude call only
+	for(int k=0;k<2;k++){ // k is handed to Has_roi; output is labelled "no roi" for k==0, "with roi" otherwise
+		totalcputick=0; // accumulators restart for each k
+		totalgputick=0;
+		totalgputick_kernel=0;
+		for(int j = 0; j < LOOP_TIMES+1; j ++) // one pass more than LOOP_TIMES because pass j==0 is not accumulated
+		{
+			Has_roi(k);       
+
+			t0 = (double)cvGetTickCount();//cpu start
+			cv::magnitude(mat1_roi, mat2_roi, dst_roi);
+			t0 = (double)cvGetTickCount() - t0;//cpu end
+
+			t1 = (double)cvGetTickCount();//gpu start: timed span runs through the download below
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+
+			gmat1 = mat1_roi;
+			gmat2 = mat2_roi;
+			t2=(double)cvGetTickCount();//kernel timer start
+			cv::ocl::magnitude(gmat1, gmat2, gdst);
+			t2 = (double)cvGetTickCount() - t2;//kernel timer stop; NOTE(review): host ticks around the call - confirm the call waits for the kernel
+			cv::Mat cpu_dst;
+			gdst_whole.download (cpu_dst);//download gdst_whole into cpu_dst
+			t1 = (double)cvGetTickCount() - t1;//gpu end
+			if(j == 0) // first pass is left out of the totals (NOTE(review): presumably a warm-up - confirm)
+				continue;
+			totalgputick=t1+totalgputick;
+			totalcputick=t0+totalcputick;	
+			totalgputick_kernel=t2+totalgputick_kernel;	
+
+		}
+		if(k==0){cout<<"no roi\n";}else{cout<<"with roi\n";};
+		cout << "average cpu runtime is  " << totalcputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl; // ticks / (ticks-per-microsecond * 1000) gives ms; LOOP_TIMES makes it a per-pass mean
+		cout << "average gpu runtime is  " << totalgputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << totalgputick_kernel/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	}
+#else // PRINT_KERNEL_RUN_TIME defined: no host timing here, one cv::ocl::magnitude call per Has_roi mode
+	for(int j = 0; j < 2; j ++)
+	{
+		Has_roi(j);
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+		gmat1 = mat1_roi;
+		gmat2 = mat2_roi;
+		if(j==0){cout<<"no roi:";}else{cout<<"\nwith roi:";}; // prints only the label; NOTE(review): the run time is presumably printed elsewhere - confirm
+		cv::ocl::magnitude(gmat1, gmat2, gdst);
+	};
+#endif // PRINT_KERNEL_RUN_TIME
+}
+
+struct Transpose : ArithmTestBase {}; // empty fixture: the Transpose tests reuse ArithmTestBase without adding members
+
+TEST_P(Transpose, Mat) // Runs cv::transpose and cv::ocl::transpose on mat1_roi for k = 0 and 1 and prints average times.
+{    
+#ifndef PRINT_KERNEL_RUN_TIME   // this branch measures elapsed host ticks with cvGetTickCount
+	double totalcputick=0;
+	double totalgputick=0;
+	double totalgputick_kernel=0;
+	double t0=0; // ticks spent in the cv::transpose call
+	double t1=0; // ticks for the whole GPU path, from gdst_whole assignment through download
+	double t2=0;	// ticks around the cv::ocl::transpose call only
+	for(int k=0;k<2;k++){ // k is handed to Has_roi; output is labelled "no roi" for k==0, "with roi" otherwise
+		totalcputick=0; // accumulators restart for each k
+		totalgputick=0;
+		totalgputick_kernel=0;
+		for(int j = 0; j < LOOP_TIMES+1; j ++) // one pass more than LOOP_TIMES because pass j==0 is not accumulated
+		{
+			Has_roi(k);       
+
+			t0 = (double)cvGetTickCount();//cpu start
+			cv::transpose(mat1_roi, dst_roi);
+			t0 = (double)cvGetTickCount() - t0;//cpu end
+
+			t1 = (double)cvGetTickCount();//gpu start: timed span runs through the download below
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows)); // NOTE(review): same roicols x roirows rectangle as the element-wise tests - confirm it fits a transposed result when the ROI is not square
+
+			gmat1 = mat1_roi;
+			t2=(double)cvGetTickCount();//kernel timer start
+			cv::ocl::transpose(gmat1, gdst);
+			t2 = (double)cvGetTickCount() - t2;//kernel timer stop; NOTE(review): host ticks around the call - confirm the call waits for the kernel
+			cv::Mat cpu_dst;
+			gdst_whole.download (cpu_dst);//download gdst_whole into cpu_dst
+			t1 = (double)cvGetTickCount() - t1;//gpu end
+			if(j == 0) // first pass is left out of the totals (NOTE(review): presumably a warm-up - confirm)
+				continue;
+			totalgputick=t1+totalgputick;
+			totalcputick=t0+totalcputick;	
+			totalgputick_kernel=t2+totalgputick_kernel;	
+
+		}
+		if(k==0){cout<<"no roi\n";}else{cout<<"with roi\n";};
+		cout << "average cpu runtime is  " << totalcputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl; // ticks / (ticks-per-microsecond * 1000) gives ms; LOOP_TIMES makes it a per-pass mean
+		cout << "average gpu runtime is  " << totalgputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << totalgputick_kernel/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	}
+#else // PRINT_KERNEL_RUN_TIME defined: no host timing here, one cv::ocl::transpose call per Has_roi mode
+	for(int j = 0; j < 2; j ++)
+	{
+		Has_roi(j);
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+		gmat1 = mat1_roi;
+		if(j==0){cout<<"no roi:";}else{cout<<"\nwith roi:";}; // prints only the label; NOTE(review): the run time is presumably printed elsewhere - confirm
+		cv::ocl::transpose(gmat1, gdst);
+	};
+#endif // PRINT_KERNEL_RUN_TIME
+}
+
+
+struct Flip : ArithmTestBase {}; // Fixture tag shared by the Flip X / Y / BOTH benchmarks; adds nothing to ArithmTestBase.
+
+TEST_P(Flip, X) // Benchmark: cv::flip vs cv::ocl::flip with flip code 0, once per Has_roi mode (0 and 1).
+{   
+#ifndef PRINT_KERNEL_RUN_TIME   // default build: time both paths with cvGetTickCount and print averages
+	double totalcputick=0; // ticks summed over counted passes: CPU call
+	double totalgputick=0; // ticks summed over counted passes: OCL path including transfers
+	double totalgputick_kernel=0; // ticks summed over counted passes: OCL call only
+	double t0=0; // per-pass CPU interval
+	double t1=0; // per-pass OCL interval (assignments + call + download)
+	double t2=0; // per-pass interval around the OCL call
+	for(int k=0;k<2;k++){ // k is handed to Has_roi; the report labels k==0 "no roi", k==1 "with roi"
+		totalcputick=0;
+		totalgputick=0;
+		totalgputick_kernel=0;
+		for(int j = 0; j < LOOP_TIMES+1; j ++) // LOOP_TIMES+1 passes; pass j==0 runs but is not accumulated
+		{
+			Has_roi(k);       
+			// CPU reference call, timed on its own (t0).
+			t0 = (double)cvGetTickCount();//cpu start
+			cv::flip(mat1_roi, dst_roi, 0);
+			t0 = (double)cvGetTickCount() - t0;//cpu end
+			// OCL path: t1 brackets the assignments into the g* objects, the call, and the download.
+			t1 = (double)cvGetTickCount();//gpu start1		
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+
+			gmat1 = mat1_roi;
+			t2=(double)cvGetTickCount();//kernel: t2 brackets only the cv::ocl call
+			cv::ocl::flip(gmat1, gdst, 0); // NOTE(review): host-side ticks; whether the device work has finished when this returns is not visible here - confirm
+			t2 = (double)cvGetTickCount() - t2;//kernel
+			cv::Mat cpu_dst;
+			gdst_whole.download (cpu_dst);//download of the whole destination, counted in t1 only
+			t1 = (double)cvGetTickCount() - t1;//gpu end1		
+			if(j == 0) // first pass is not counted (presumably warm-up / one-time setup cost)
+				continue;
+			totalgputick=t1+totalgputick;
+			totalcputick=t0+totalcputick;	
+			totalgputick_kernel=t2+totalgputick_kernel;	
+
+		}
+		if(k==0){cout<<"no roi\n";}else{cout<<"with roi\n";};
+		cout << "average cpu runtime is  " << totalcputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl; // average = total ticks / (tick frequency * LOOP_TIMES * 1000)
+		cout << "average gpu runtime is  " << totalgputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << totalgputick_kernel/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	}
+#else // PRINT_KERNEL_RUN_TIME build: no host-side timing here, one OCL call per ROI mode
+	for(int j = 0; j < 2; j ++)
+	{
+		Has_roi(j);
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+		gmat1 = mat1_roi;
+		if(j==0){cout<<"no roi:";}else{cout<<"\nwith roi:";}; // label only; NOTE(review): the run time itself is presumably printed elsewhere when this macro is defined - confirm
+		cv::ocl::flip(gmat1, gdst, 0);
+	};
+#endif
+}
+
+TEST_P(Flip, Y) // Benchmark: cv::flip vs cv::ocl::flip with flip code 1, once per Has_roi mode (0 and 1).
+{    
+#ifndef PRINT_KERNEL_RUN_TIME   // default build: time both paths with cvGetTickCount and print averages
+	double totalcputick=0; // ticks summed over counted passes: CPU call
+	double totalgputick=0; // ticks summed over counted passes: OCL path including transfers
+	double totalgputick_kernel=0; // ticks summed over counted passes: OCL call only
+	double t0=0; // per-pass CPU interval
+	double t1=0; // per-pass OCL interval (assignments + call + download)
+	double t2=0; // per-pass interval around the OCL call
+	for(int k=0;k<2;k++){ // k is handed to Has_roi; the report labels k==0 "no roi", k==1 "with roi"
+		totalcputick=0;
+		totalgputick=0;
+		totalgputick_kernel=0;
+		for(int j = 0; j < LOOP_TIMES+1; j ++) // LOOP_TIMES+1 passes; pass j==0 runs but is not accumulated
+		{
+			Has_roi(k);       
+			// CPU reference call, timed on its own (t0).
+			t0 = (double)cvGetTickCount();//cpu start
+			cv::flip(mat1_roi, dst_roi, 1);
+			t0 = (double)cvGetTickCount() - t0;//cpu end
+			// OCL path: t1 brackets the assignments into the g* objects, the call, and the download.
+			t1 = (double)cvGetTickCount();//gpu start1		
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+
+			gmat1 = mat1_roi;
+			t2=(double)cvGetTickCount();//kernel: t2 brackets only the cv::ocl call
+			cv::ocl::flip(gmat1, gdst, 1); // NOTE(review): host-side ticks; whether the device work has finished when this returns is not visible here - confirm
+			t2 = (double)cvGetTickCount() - t2;//kernel
+			cv::Mat cpu_dst;
+			gdst_whole.download (cpu_dst);//download of the whole destination, counted in t1 only
+			t1 = (double)cvGetTickCount() - t1;//gpu end1
+			if(j == 0) // first pass is not counted (presumably warm-up / one-time setup cost)
+				continue;
+			totalgputick=t1+totalgputick;
+			totalcputick=t0+totalcputick;	
+			totalgputick_kernel=t2+totalgputick_kernel;	
+
+		}
+		if(k==0){cout<<"no roi\n";}else{cout<<"with roi\n";};
+		cout << "average cpu runtime is  " << totalcputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl; // average = total ticks / (tick frequency * LOOP_TIMES * 1000)
+		cout << "average gpu runtime is  " << totalgputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << totalgputick_kernel/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	}
+#else // PRINT_KERNEL_RUN_TIME build: no host-side timing here, one OCL call per ROI mode
+	for(int j = 0; j < 2; j ++)
+	{
+		Has_roi(j);
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+		gmat1 = mat1_roi;
+		if(j==0){cout<<"no roi:";}else{cout<<"\nwith roi:";}; // label only; NOTE(review): the run time itself is presumably printed elsewhere when this macro is defined - confirm
+		cv::ocl::flip(gmat1, gdst, 1);
+	};
+#endif
+}
+
+TEST_P(Flip, BOTH) // Benchmark: cv::flip vs cv::ocl::flip with flip code -1, once per Has_roi mode (0 and 1).
+{    
+#ifndef PRINT_KERNEL_RUN_TIME   // default build: time both paths with cvGetTickCount and print averages
+	double totalcputick=0; // ticks summed over counted passes: CPU call
+	double totalgputick=0; // ticks summed over counted passes: OCL path including transfers
+	double totalgputick_kernel=0; // ticks summed over counted passes: OCL call only
+	double t0=0; // per-pass CPU interval
+	double t1=0; // per-pass OCL interval (assignments + call + download)
+	double t2=0; // per-pass interval around the OCL call
+	for(int k=0;k<2;k++){ // k is handed to Has_roi; the report labels k==0 "no roi", k==1 "with roi"
+		totalcputick=0;
+		totalgputick=0;
+		totalgputick_kernel=0;
+		for(int j = 0; j < LOOP_TIMES+1; j ++) // LOOP_TIMES+1 passes; pass j==0 runs but is not accumulated
+		{
+			Has_roi(k);       
+			// CPU reference call, timed on its own (t0).
+			t0 = (double)cvGetTickCount();//cpu start
+			cv::flip(mat1_roi, dst_roi, -1);
+			t0 = (double)cvGetTickCount() - t0;//cpu end
+			// OCL path: t1 brackets the assignments into the g* objects, the call, and the download.
+			t1 = (double)cvGetTickCount();//gpu start1		
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+
+			gmat1 = mat1_roi;
+			t2=(double)cvGetTickCount();//kernel: t2 brackets only the cv::ocl call
+			cv::ocl::flip(gmat1, gdst, -1); // NOTE(review): host-side ticks; whether the device work has finished when this returns is not visible here - confirm
+			t2 = (double)cvGetTickCount() - t2;//kernel
+			cv::Mat cpu_dst;
+			gdst_whole.download (cpu_dst);//download of the whole destination, counted in t1 only
+			t1 = (double)cvGetTickCount() - t1;//gpu end1	
+			if(j == 0) // first pass is not counted (presumably warm-up / one-time setup cost)
+				continue;
+			totalgputick=t1+totalgputick;
+			totalcputick=t0+totalcputick;	
+			totalgputick_kernel=t2+totalgputick_kernel;	
+
+		}
+		if(k==0){cout<<"no roi\n";}else{cout<<"with roi\n";};
+		cout << "average cpu runtime is  " << totalcputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl; // average = total ticks / (tick frequency * LOOP_TIMES * 1000)
+		cout << "average gpu runtime is  " << totalgputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << totalgputick_kernel/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	}
+#else // PRINT_KERNEL_RUN_TIME build: no host-side timing here, one OCL call per ROI mode
+	for(int j = 0; j < 2; j ++)
+	{
+		Has_roi(j);
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+		gmat1 = mat1_roi;
+		if(j==0){cout<<"no roi:";}else{cout<<"\nwith roi:";}; // label only; NOTE(review): the run time itself is presumably printed elsewhere when this macro is defined - confirm
+		cv::ocl::flip(gmat1, gdst, -1);
+	};
+#endif
+}
+
+
+
+struct MinMax : ArithmTestBase {}; // Fixture tag shared by the MinMax MAT / MASK benchmarks; adds nothing to ArithmTestBase.
+
+TEST_P(MinMax, MAT) // Benchmark: CPU min/max (cv::minMaxLoc, or a manual scan for CV_8S) vs cv::ocl::minMax, once per Has_roi mode.
+{    
+#ifndef PRINT_KERNEL_RUN_TIME   // default build: time both paths with cvGetTickCount and print averages
+	double totalcputick=0; // ticks summed over counted passes: CPU reference
+	double totalgputick=0; // ticks summed over counted passes: OCL path including the input assignment
+	double totalgputick_kernel=0; // ticks summed over counted passes: OCL call only
+	double t0=0; // per-pass CPU interval
+	double t1=0; // per-pass OCL interval (input assignment + call)
+	double t2=0; // per-pass interval around the OCL call
+	for(int k=0;k<2;k++){ // k is handed to Has_roi; the report labels k==0 "no roi", k==1 "with roi"
+		totalcputick=0;
+		totalgputick=0;
+		totalgputick_kernel=0;
+		for(int j = 0; j < LOOP_TIMES+1; j ++) // LOOP_TIMES+1 passes; pass j==0 runs but is not accumulated
+		{
+			Has_roi(k);       
+			double minVal, maxVal;
+			cv::Point minLoc, maxLoc;
+			t0 = (double)cvGetTickCount();//cpu start
+			if (mat1.depth() != CV_8S)
+			{
+				cv::minMaxLoc(mat1_roi, &minVal, &maxVal, &minLoc, &maxLoc);
+			}
+			else // CV_8S reference is a manual scan (NOTE(review): presumably because cv::minMaxLoc is not usable for this depth here - confirm)
+			{
+				minVal = std::numeric_limits<double>::max();
+				maxVal = -std::numeric_limits<double>::max();
+				for (int row = 0; row < mat1_roi.rows; ++row)
+					for (int col = 0; col < mat1_roi.cols; ++col) // distinct names: must not shadow the pass counter j tested below
+					{
+						signed char sample = mat1_roi.at<signed char>(row, col); // not called val, to avoid hiding the fixture's val
+						if (sample < minVal) minVal = sample;
+						if (sample > maxVal) maxVal = sample;
+					}
+			}
+
+			t0 = (double)cvGetTickCount() - t0;//cpu end
+			// OCL path: t1 brackets the input assignment and the call; nothing is downloaded.
+			t1 = (double)cvGetTickCount();//gpu start1		
+			gmat1 = mat1_roi;
+			double minVal_, maxVal_;  
+			t2=(double)cvGetTickCount();//kernel: t2 brackets only the cv::ocl call
+			cv::ocl::minMax(gmat1, &minVal_, &maxVal_);
+			t2 = (double)cvGetTickCount() - t2;//kernel
+			t1 = (double)cvGetTickCount() - t1;//gpu end1		
+			if(j == 0) // first pass is not counted (presumably warm-up / one-time setup cost)
+				continue;
+			totalgputick=t1+totalgputick;
+			totalcputick=t0+totalcputick;	
+			totalgputick_kernel=t2+totalgputick_kernel;	
+
+		}
+		if(k==0){cout<<"no roi\n";}else{cout<<"with roi\n";};
+		cout << "average cpu runtime is  " << totalcputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl; // average = total ticks / (tick frequency * LOOP_TIMES * 1000)
+		cout << "average gpu runtime is  " << totalgputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << totalgputick_kernel/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	}
+#else // PRINT_KERNEL_RUN_TIME build: no host-side timing here, one OCL call per ROI mode
+	for(int j = 0; j < 2; j ++)
+	{
+		Has_roi(j);
+		gmat1 = mat1_roi;
+		double minVal_, maxVal_;  
+		if(j==0){cout<<"no roi:";}else{cout<<"\nwith roi:";}; // label only; NOTE(review): the run time itself is presumably printed elsewhere when this macro is defined - confirm
+		cv::ocl::minMax(gmat1, &minVal_, &maxVal_);
+	};
+#endif
+}
+
+TEST_P(MinMax, MASK) // Benchmark: masked CPU min/max (cv::minMaxLoc, or a manual scan for CV_8S) vs masked cv::ocl::minMax, once per Has_roi mode.
+{    
+#ifndef PRINT_KERNEL_RUN_TIME   // default build: time both paths with cvGetTickCount and print averages
+	double totalcputick=0; // ticks summed over counted passes: CPU reference
+	double totalgputick=0; // ticks summed over counted passes: OCL path including the input/mask assignments
+	double totalgputick_kernel=0; // ticks summed over counted passes: OCL call only
+	double t0=0; // per-pass CPU interval
+	double t1=0; // per-pass OCL interval (assignments + call)
+	double t2=0; // per-pass interval around the OCL call
+	for(int k=0;k<2;k++){ // k is handed to Has_roi; the report labels k==0 "no roi", k==1 "with roi"
+		totalcputick=0;
+		totalgputick=0;
+		totalgputick_kernel=0;
+		for(int j = 0; j < LOOP_TIMES+1; j ++) // LOOP_TIMES+1 passes; pass j==0 runs but is not accumulated
+		{
+			Has_roi(k);       
+			double minVal, maxVal;
+			cv::Point minLoc, maxLoc;
+			t0 = (double)cvGetTickCount();//cpu start
+			if (mat1.depth() != CV_8S)
+			{
+				cv::minMaxLoc(mat1_roi, &minVal, &maxVal, &minLoc, &maxLoc,mask_roi);
+			}
+			else // CV_8S reference is a manual scan (NOTE(review): presumably because cv::minMaxLoc is not usable for this depth here - confirm)
+			{
+				minVal = std::numeric_limits<double>::max();
+				maxVal = -std::numeric_limits<double>::max();
+				for (int row = 0; row < mat1_roi.rows; ++row)
+					for (int col = 0; col < mat1_roi.cols; ++col) // distinct names: must not shadow the pass counter j tested below
+					{
+						signed char sample = mat1_roi.at<signed char>(row, col); // not called val, to avoid hiding the fixture's val
+						unsigned char m = mask_roi.at<unsigned char>(row, col); // only elements with a non-zero mask byte take part
+						if (sample < minVal && m) minVal = sample;
+						if (sample > maxVal && m) maxVal = sample;
+					}
+			}
+
+			t0 = (double)cvGetTickCount() - t0;//cpu end
+			// OCL path: t1 brackets the input/mask assignments and the call; nothing is downloaded.
+			t1 = (double)cvGetTickCount();//gpu start1		
+			gmat1 = mat1_roi;
+			gmask = mask_roi;
+			double minVal_, maxVal_;  
+			t2=(double)cvGetTickCount();//kernel: t2 brackets only the cv::ocl call
+			cv::ocl::minMax(gmat1, &minVal_, &maxVal_,gmask);
+			t2 = (double)cvGetTickCount() - t2;//kernel
+			t1 = (double)cvGetTickCount() - t1;//gpu end1		
+			if(j == 0) // first pass is not counted (presumably warm-up / one-time setup cost)
+				continue;
+			totalgputick=t1+totalgputick;
+			totalcputick=t0+totalcputick;	
+			totalgputick_kernel=t2+totalgputick_kernel;	
+
+		}
+		if(k==0){cout<<"no roi\n";}else{cout<<"with roi\n";};
+		cout << "average cpu runtime is  " << totalcputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl; // average = total ticks / (tick frequency * LOOP_TIMES * 1000)
+		cout << "average gpu runtime is  " << totalgputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << totalgputick_kernel/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	}
+#else // PRINT_KERNEL_RUN_TIME build: no host-side timing here, one OCL call per ROI mode
+	for(int j = 0; j < 2; j ++)
+	{
+		Has_roi(j);
+		gmat1 = mat1_roi;
+		gmask = mask_roi;
+		double minVal_, maxVal_;  
+		if(j==0){cout<<"no roi:";}else{cout<<"\nwith roi:";}; // label only; NOTE(review): the run time itself is presumably printed elsewhere when this macro is defined - confirm
+		cv::ocl::minMax(gmat1, &minVal_, &maxVal_,gmask);
+	};
+#endif
+}
+
+
+struct MinMaxLoc : ArithmTestBase {}; // Fixture tag shared by the MinMaxLoc MAT / MASK benchmarks; adds nothing to ArithmTestBase.
+
+TEST_P(MinMaxLoc, MAT) // Benchmark: CPU min/max with locations (cv::minMaxLoc, or a manual scan for CV_8S) vs cv::ocl::minMaxLoc with an empty mask.
+{   
+#ifndef PRINT_KERNEL_RUN_TIME   // default build: time both paths with cvGetTickCount and print averages
+	double totalcputick=0; // ticks summed over counted passes: CPU reference
+	double totalgputick=0; // ticks summed over counted passes: OCL path including the input assignment
+	double totalgputick_kernel=0; // ticks summed over counted passes: OCL call only
+	double t0=0; // per-pass CPU interval
+	double t1=0; // per-pass OCL interval (input assignment + call)
+	double t2=0; // per-pass interval around the OCL call
+	for(int k=0;k<2;k++){ // k is handed to Has_roi; the report labels k==0 "no roi", k==1 "with roi"
+		totalcputick=0;
+		totalgputick=0;
+		totalgputick_kernel=0;
+		for(int j = 0; j < LOOP_TIMES+1; j ++) // LOOP_TIMES+1 passes; pass j==0 runs but is not accumulated
+		{
+			Has_roi(k);       
+			double minVal, maxVal;
+			cv::Point minLoc, maxLoc;
+			int depth = mat1.depth();
+			t0 = (double)cvGetTickCount();//cpu start
+			if (depth != CV_8S)
+			{
+				cv::minMaxLoc(mat1_roi, &minVal, &maxVal, &minLoc, &maxLoc);
+			}
+			else // CV_8S reference is a manual scan (NOTE(review): presumably because cv::minMaxLoc is not usable for this depth here - confirm)
+			{
+				minVal = std::numeric_limits<double>::max();
+				maxVal = -std::numeric_limits<double>::max();
+				for (int row = 0; row < mat1_roi.rows; ++row)
+					for (int col = 0; col < mat1_roi.cols; ++col) // distinct names: must not shadow the pass counter j tested below
+					{
+						signed char sample = mat1_roi.at<signed char>(row, col); // not called val, to avoid hiding the fixture's val
+						if (sample < minVal) {
+							minVal = sample;
+							minLoc.x = col;
+							minLoc.y = row;
+						}
+						if (sample > maxVal) {
+							maxVal = sample;
+							maxLoc.x = col;
+							maxLoc.y = row;
+						} 
+					}
+			}
+
+
+			t0 = (double)cvGetTickCount() - t0;//cpu end
+			// OCL path: t1 brackets the input assignment and the call; nothing is downloaded.
+			t1 = (double)cvGetTickCount();//gpu start1		
+			gmat1 = mat1_roi;
+			double minVal_, maxVal_;  
+			cv::Point minLoc_, maxLoc_;    
+			t2=(double)cvGetTickCount();//kernel: t2 brackets only the cv::ocl call
+			cv::ocl::minMaxLoc(gmat1, &minVal_, &maxVal_,&minLoc_, &maxLoc_, cv::ocl::oclMat());
+			t2 = (double)cvGetTickCount() - t2;//kernel
+			t1 = (double)cvGetTickCount() - t1;//gpu end1		
+			if(j == 0) // first pass is not counted (presumably warm-up / one-time setup cost)
+				continue;
+			totalgputick=t1+totalgputick;
+			totalcputick=t0+totalcputick;	
+			totalgputick_kernel=t2+totalgputick_kernel;	
+
+		}
+		if(k==0){cout<<"no roi\n";}else{cout<<"with roi\n";};
+		cout << "average cpu runtime is  " << totalcputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl; // average = total ticks / (tick frequency * LOOP_TIMES * 1000)
+		cout << "average gpu runtime is  " << totalgputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << totalgputick_kernel/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	}
+#else // PRINT_KERNEL_RUN_TIME build: no host-side timing here, one OCL call per ROI mode
+	for(int j = 0; j < 2; j ++)
+	{
+		Has_roi(j);
+		gmat1 = mat1_roi;
+		double minVal_, maxVal_;  
+		cv::Point minLoc_, maxLoc_;    
+		if(j==0){cout<<"no roi:";}else{cout<<"\nwith roi:";}; // label only; NOTE(review): the run time itself is presumably printed elsewhere when this macro is defined - confirm
+		cv::ocl::minMaxLoc(gmat1, &minVal_, &maxVal_,&minLoc_, &maxLoc_, cv::ocl::oclMat());
+	};
+#endif
+
+}
+
+
+TEST_P(MinMaxLoc, MASK) // Benchmark: masked CPU min/max with locations (cv::minMaxLoc, or a manual scan for CV_8S) vs masked cv::ocl::minMaxLoc.
+{    
+
+#ifndef PRINT_KERNEL_RUN_TIME   // default build: time both paths with cvGetTickCount and print averages
+	double totalcputick=0; // ticks summed over counted passes: CPU reference
+	double totalgputick=0; // ticks summed over counted passes: OCL path including the input/mask assignments
+	double totalgputick_kernel=0; // ticks summed over counted passes: OCL call only
+	double t0=0; // per-pass CPU interval
+	double t1=0; // per-pass OCL interval (assignments + call)
+	double t2=0; // per-pass interval around the OCL call
+	for(int k=0;k<2;k++){ // k is handed to Has_roi; the report labels k==0 "no roi", k==1 "with roi"
+		totalcputick=0;
+		totalgputick=0;
+		totalgputick_kernel=0;
+		for(int j = 0; j < LOOP_TIMES+1; j ++) // LOOP_TIMES+1 passes; pass j==0 runs but is not accumulated
+		{
+			Has_roi(k);       
+			double minVal, maxVal;
+			cv::Point minLoc, maxLoc;
+			int depth = mat1.depth();
+			t0 = (double)cvGetTickCount();//cpu start
+			if (depth != CV_8S)
+			{
+				cv::minMaxLoc(mat1_roi, &minVal, &maxVal, &minLoc, &maxLoc,mask_roi);
+			}
+			else // CV_8S reference is a manual scan (NOTE(review): presumably because cv::minMaxLoc is not usable for this depth here - confirm)
+			{
+				minVal = std::numeric_limits<double>::max();
+				maxVal = -std::numeric_limits<double>::max();
+				for (int row = 0; row < mat1_roi.rows; ++row)
+					for (int col = 0; col < mat1_roi.cols; ++col) // distinct names: must not shadow the pass counter j tested below
+					{
+						signed char sample = mat1_roi.at<signed char>(row, col); // not called val, to avoid hiding the fixture's val
+						unsigned char m = mask_roi.at<unsigned char>(row ,col); // only elements with a non-zero mask byte take part
+						if (sample < minVal && m) {
+							minVal = sample;
+							minLoc.x = col;
+							minLoc.y = row;
+						}
+						if (sample > maxVal && m) {
+							maxVal = sample;
+							maxLoc.x = col;
+							maxLoc.y = row;
+						} 
+					}
+			}
+
+
+			t0 = (double)cvGetTickCount() - t0;//cpu end
+			// OCL path: t1 brackets the input/mask assignments and the call; nothing is downloaded.
+			t1 = (double)cvGetTickCount();//gpu start1		
+			gmat1 = mat1_roi;
+			gmask = mask_roi;
+			double minVal_, maxVal_;  
+			cv::Point minLoc_, maxLoc_;    
+			t2=(double)cvGetTickCount();//kernel: t2 brackets only the cv::ocl call
+			cv::ocl::minMaxLoc(gmat1, &minVal_, &maxVal_,&minLoc_, &maxLoc_, gmask);
+			t2 = (double)cvGetTickCount() - t2;//kernel
+			t1 = (double)cvGetTickCount() - t1;//gpu end1		
+			if(j == 0) // first pass is not counted (presumably warm-up / one-time setup cost)
+				continue;
+			totalgputick=t1+totalgputick;
+			totalcputick=t0+totalcputick;	
+			totalgputick_kernel=t2+totalgputick_kernel;	
+
+		}
+		if(k==0){cout<<"no roi\n";}else{cout<<"with roi\n";};
+		cout << "average cpu runtime is  " << totalcputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl; // average = total ticks / (tick frequency * LOOP_TIMES * 1000)
+		cout << "average gpu runtime is  " << totalgputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << totalgputick_kernel/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	}
+#else // PRINT_KERNEL_RUN_TIME build: no host-side timing here, one OCL call per ROI mode
+	for(int j = 0; j < 2; j ++)
+	{
+		Has_roi(j);
+		gmat1 = mat1_roi;
+		gmask = mask_roi;
+		double minVal_, maxVal_;  
+		cv::Point minLoc_, maxLoc_;    
+		if(j==0){cout<<"no roi:";}else{cout<<"\nwith roi:";}; // label only; NOTE(review): the run time itself is presumably printed elsewhere when this macro is defined - confirm
+		cv::ocl::minMaxLoc(gmat1, &minVal_, &maxVal_,&minLoc_, &maxLoc_, gmask);
+	};
+#endif
+}
+
+
+struct Sum : ArithmTestBase {}; // Fixture tag for the sum benchmark; adds nothing to ArithmTestBase.
+
+TEST_P(Sum, MAT) // Benchmark: cv::sum vs cv::ocl::sum, once per Has_roi mode (0 and 1).
+{    
+
+#ifndef PRINT_KERNEL_RUN_TIME   // default build: time both paths with cvGetTickCount and print averages
+	double totalcputick=0; // ticks summed over counted passes: CPU call
+	double totalgputick=0; // ticks summed over counted passes: OCL path including the input assignment
+	double totalgputick_kernel=0; // ticks summed over counted passes: OCL call only
+	double t0=0; // per-pass CPU interval
+	double t1=0; // per-pass OCL interval (input assignment + call)
+	double t2=0; // per-pass interval around the OCL call
+	for(int k=0;k<2;k++){ // k is handed to Has_roi; the report labels k==0 "no roi", k==1 "with roi"
+		totalcputick=0;
+		totalgputick=0;
+		totalgputick_kernel=0;
+		for(int j = 0; j < LOOP_TIMES+1; j ++) // LOOP_TIMES+1 passes; pass j==0 runs but is not accumulated
+		{
+			Has_roi(k);       
+			// CPU reference call, timed on its own (t0).
+			t0 = (double)cvGetTickCount();//cpu start
+			Scalar cpures =cv::sum(mat1_roi); // result is not read afterwards; only the elapsed ticks are used
+			t0 = (double)cvGetTickCount() - t0;//cpu end
+			// OCL path: t1 brackets the input assignment and the call; nothing is downloaded.
+			t1 = (double)cvGetTickCount();//gpu start1		
+			gmat1 = mat1_roi;
+			t2=(double)cvGetTickCount();//kernel: t2 brackets only the cv::ocl call
+			Scalar gpures=cv::ocl::sum(gmat1); // result is not compared with cpures in this test
+			t2 = (double)cvGetTickCount() - t2;//kernel
+			t1 = (double)cvGetTickCount() - t1;//gpu end1	
+			if(j == 0) // first pass is not counted (presumably warm-up / one-time setup cost)
+				continue;
+			totalgputick=t1+totalgputick;
+			totalcputick=t0+totalcputick;	
+			totalgputick_kernel=t2+totalgputick_kernel;	
+
+		}
+		if(k==0){cout<<"no roi\n";}else{cout<<"with roi\n";};
+		cout << "average cpu runtime is  " << totalcputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl; // average = total ticks / (tick frequency * LOOP_TIMES * 1000)
+		cout << "average gpu runtime is  " << totalgputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << totalgputick_kernel/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	}
+#else // PRINT_KERNEL_RUN_TIME build: no host-side timing here, one OCL call per ROI mode
+	for(int j = 0; j < 2; j ++)
+	{
+		Has_roi(j);
+		gmat1 = mat1_roi;
+		if(j==0){cout<<"no roi:";}else{cout<<"\nwith roi:";}; // label only; NOTE(review): the run time itself is presumably printed elsewhere when this macro is defined - confirm
+		Scalar gpures=cv::ocl::sum(gmat1);
+	};
+#endif
+}
+
+//TEST_P(Sum, MASK) 
+//{    
+//    for(int j=0; j<LOOP_TIMES; j++)
+//    {
+//       
+//    }
+//}
+
+struct CountNonZero : ArithmTestBase {}; // Fixture tag for the countNonZero benchmark; adds nothing to ArithmTestBase.
+
+TEST_P(CountNonZero, MAT) // Benchmark: cv::countNonZero vs cv::ocl::countNonZero, once per Has_roi mode (0 and 1).
+{    
+#ifndef PRINT_KERNEL_RUN_TIME   // default build: time both paths with cvGetTickCount and print averages
+	double totalcputick=0; // ticks summed over counted passes: CPU call
+	double totalgputick=0; // ticks summed over counted passes: OCL path including the input assignment
+	double totalgputick_kernel=0; // ticks summed over counted passes: OCL call only
+	double t0=0; // per-pass CPU interval
+	double t1=0; // per-pass OCL interval (input assignment + call)
+	double t2=0; // per-pass interval around the OCL call
+	for(int k=0;k<2;k++){ // k is handed to Has_roi; the report labels k==0 "no roi", k==1 "with roi"
+		totalcputick=0;
+		totalgputick=0;
+		totalgputick_kernel=0;
+		for(int j = 0; j < LOOP_TIMES+1; j ++) // LOOP_TIMES+1 passes; pass j==0 runs but is not accumulated
+		{
+			Has_roi(k);       
+			// CPU reference call, timed on its own (t0).
+			t0 = (double)cvGetTickCount();//cpu start
+			int cpures =cv::countNonZero(mat1_roi); // result is not read afterwards; only the elapsed ticks are used
+			t0 = (double)cvGetTickCount() - t0;//cpu end
+			// OCL path: t1 brackets the input assignment and the call; nothing is downloaded.
+			t1 = (double)cvGetTickCount();//gpu start1		
+			gmat1 = mat1_roi;
+			t2=(double)cvGetTickCount();//kernel: t2 brackets only the cv::ocl call
+			int gpures=cv::ocl::countNonZero(gmat1); // result is not compared with cpures in this test
+			t2 = (double)cvGetTickCount() - t2;//kernel
+			t1 = (double)cvGetTickCount() - t1;//gpu end1	
+			if(j == 0) // first pass is not counted (presumably warm-up / one-time setup cost)
+				continue;
+			totalgputick=t1+totalgputick;
+			totalcputick=t0+totalcputick;	
+			totalgputick_kernel=t2+totalgputick_kernel;	
+
+		}
+		if(k==0){cout<<"no roi\n";}else{cout<<"with roi\n";};
+		cout << "average cpu runtime is  " << totalcputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl; // average = total ticks / (tick frequency * LOOP_TIMES * 1000)
+		cout << "average gpu runtime is  " << totalgputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << totalgputick_kernel/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	}
+#else // PRINT_KERNEL_RUN_TIME build: no host-side timing here, one OCL call per ROI mode
+	for(int j = 0; j < 2; j ++)
+	{
+		Has_roi(j);
+		gmat1 = mat1_roi;
+		if(j==0){cout<<"no roi:";}else{cout<<"\nwith roi:";}; // label only; NOTE(review): the run time itself is presumably printed elsewhere when this macro is defined - confirm
+		int gpures=cv::ocl::countNonZero(gmat1);
+	};
+#endif
+
+}
+
+
+
+////////////////////////////////phase/////////////////////////////////////////////////
+struct Phase : ArithmTestBase {}; // Fixture tag for the phase benchmark; adds nothing to ArithmTestBase.
+
+TEST_P(Phase, Mat) // Benchmark: cv::phase vs cv::ocl::phase (last argument 0), once per Has_roi mode; CV_32F / CV_64F inputs only.
+{
+	if(mat1.depth()!=CV_32F && mat1.depth()!=CV_64F)
+	{
+		cout<<"\tUnsupported type\t\n";
+		return; // nothing to measure: do not run the benchmark on a depth that was just reported as unsupported
+	}
+#ifndef PRINT_KERNEL_RUN_TIME   // default build: time both paths with cvGetTickCount and print averages
+	double totalcputick=0; // ticks summed over counted passes: CPU call
+	double totalgputick=0; // ticks summed over counted passes: OCL path including transfers
+	double totalgputick_kernel=0; // ticks summed over counted passes: OCL call only
+	double t0=0; // per-pass CPU interval
+	double t1=0; // per-pass OCL interval (assignments + call + download)
+	double t2=0; // per-pass interval around the OCL call
+	for(int k=0;k<2;k++){ // k is handed to Has_roi; the report labels k==0 "no roi", k==1 "with roi"
+		totalcputick=0;
+		totalgputick=0;
+		totalgputick_kernel=0;
+		for(int j = 0; j < LOOP_TIMES+1; j ++) // LOOP_TIMES+1 passes; pass j==0 runs but is not accumulated
+		{
+			Has_roi(k);       
+			// CPU reference call, timed on its own (t0).
+			t0 = (double)cvGetTickCount();//cpu start
+			cv::phase(mat1_roi,mat2_roi,dst_roi,0);
+			t0 = (double)cvGetTickCount() - t0;//cpu end
+			// OCL path: t1 brackets the assignments into the g* objects, the call, and the download.
+			t1 = (double)cvGetTickCount();//gpu start1		
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+
+			gmat1 = mat1_roi;
+			gmat2 = mat2_roi;
+			t2=(double)cvGetTickCount();//kernel: t2 brackets only the cv::ocl call
+			cv::ocl::phase(gmat1,gmat2,gdst,0);
+			t2 = (double)cvGetTickCount() - t2;//kernel
+			cv::Mat cpu_dst;
+			gdst_whole.download (cpu_dst);//download of the whole destination, counted in t1 only
+			t1 = (double)cvGetTickCount() - t1;//gpu end1	
+			if(j == 0) // first pass is not counted (presumably warm-up / one-time setup cost)
+				continue;
+			totalgputick=t1+totalgputick;
+			totalcputick=t0+totalcputick;	
+			totalgputick_kernel=t2+totalgputick_kernel;	
+
+		}
+		if(k==0){cout<<"no roi\n";}else{cout<<"with roi\n";};
+		cout << "average cpu runtime is  " << totalcputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl; // average = total ticks / (tick frequency * LOOP_TIMES * 1000)
+		cout << "average gpu runtime is  " << totalgputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << totalgputick_kernel/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	}
+#else // PRINT_KERNEL_RUN_TIME build: no host-side timing here, one OCL call per ROI mode
+	for(int j = 0; j < 2; j ++)
+	{
+		Has_roi(j);
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+		gmat1 = mat1_roi;
+		gmat2 = mat2_roi;
+		if(j==0){cout<<"no roi:";}else{cout<<"\nwith roi:";}; // label only; NOTE(review): the run time itself is presumably printed elsewhere when this macro is defined - confirm
+		cv::ocl::phase(gmat1,gmat2,gdst,0);
+	};
+#endif
+
+}
+
+
+////////////////////////////////bitwise_and/////////////////////////////////////////////////
+struct Bitwise_and : ArithmTestBase {}; // Fixture tag shared by the four Bitwise_and benchmarks; adds nothing to ArithmTestBase.
+
+TEST_P(Bitwise_and, Mat) // Benchmark: cv::bitwise_and vs cv::ocl::bitwise_and on two matrices, once per Has_roi mode (0 and 1).
+{    
+
+#ifndef PRINT_KERNEL_RUN_TIME   // default build: time both paths with cvGetTickCount and print averages
+	double totalcputick=0; // ticks summed over counted passes: CPU call
+	double totalgputick=0; // ticks summed over counted passes: OCL path including transfers
+	double totalgputick_kernel=0; // ticks summed over counted passes: OCL call only
+	double t0=0; // per-pass CPU interval
+	double t1=0; // per-pass OCL interval (assignments + call + download)
+	double t2=0; // per-pass interval around the OCL call
+	for(int k=0;k<2;k++){ // k is handed to Has_roi; the report labels k==0 "no roi", k==1 "with roi"
+		totalcputick=0;
+		totalgputick=0;
+		totalgputick_kernel=0;
+		for(int j = 0; j < LOOP_TIMES+1; j ++) // LOOP_TIMES+1 passes; pass j==0 runs but is not accumulated
+		{
+			Has_roi(k);       
+			// CPU reference call, timed on its own (t0).
+			t0 = (double)cvGetTickCount();//cpu start
+			cv::bitwise_and(mat1_roi, mat2_roi, dst_roi);
+			t0 = (double)cvGetTickCount() - t0;//cpu end
+			// OCL path: t1 brackets the assignments into the g* objects, the call, and the download.
+			t1 = (double)cvGetTickCount();//gpu start1		
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+
+			gmat1 = mat1_roi;
+			gmat2 = mat2_roi;
+			t2=(double)cvGetTickCount();//kernel: t2 brackets only the cv::ocl call
+			cv::ocl::bitwise_and(gmat1, gmat2, gdst); // NOTE(review): host-side ticks; whether the device work has finished when this returns is not visible here - confirm
+			t2 = (double)cvGetTickCount() - t2;//kernel
+			cv::Mat cpu_dst;
+			gdst_whole.download (cpu_dst);//download of the whole destination, counted in t1 only
+			t1 = (double)cvGetTickCount() - t1;//gpu end1	
+			if(j == 0) // first pass is not counted (presumably warm-up / one-time setup cost)
+				continue;
+			totalgputick=t1+totalgputick;
+			totalcputick=t0+totalcputick;	
+			totalgputick_kernel=t2+totalgputick_kernel;	
+
+		}
+		if(k==0){cout<<"no roi\n";}else{cout<<"with roi\n";};
+		cout << "average cpu runtime is  " << totalcputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl; // average = total ticks / (tick frequency * LOOP_TIMES * 1000)
+		cout << "average gpu runtime is  " << totalgputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << totalgputick_kernel/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	}
+#else // PRINT_KERNEL_RUN_TIME build: no host-side timing here, one OCL call per ROI mode
+	for(int j = 0; j < 2; j ++)
+	{
+		Has_roi(j);
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+		gmat1 = mat1_roi;
+		gmat2 = mat2_roi;
+
+		if(j==0){cout<<"no roi:";}else{cout<<"\nwith roi:";}; // label only; NOTE(review): the run time itself is presumably printed elsewhere when this macro is defined - confirm
+		cv::ocl::bitwise_and(gmat1, gmat2, gdst);
+	};
+#endif
+
+}
+
+TEST_P(Bitwise_and, Mat_Mask) // Benchmark: masked cv::bitwise_and vs masked cv::ocl::bitwise_and on two matrices, once per Has_roi mode.
+{    
+#ifndef PRINT_KERNEL_RUN_TIME   // default build: time both paths with cvGetTickCount and print averages
+	double totalcputick=0; // ticks summed over counted passes: CPU call
+	double totalgputick=0; // ticks summed over counted passes: OCL path including transfers
+	double totalgputick_kernel=0; // ticks summed over counted passes: OCL call only
+	double t0=0; // per-pass CPU interval
+	double t1=0; // per-pass OCL interval (assignments + call + download)
+	double t2=0; // per-pass interval around the OCL call
+	for(int k=0;k<2;k++){ // k is handed to Has_roi; the report labels k==0 "no roi", k==1 "with roi"
+		totalcputick=0;
+		totalgputick=0;
+		totalgputick_kernel=0;
+		for(int j = 0; j < LOOP_TIMES+1; j ++) // LOOP_TIMES+1 passes; pass j==0 runs but is not accumulated
+		{
+			Has_roi(k);       
+			// CPU reference call, timed on its own (t0).
+			t0 = (double)cvGetTickCount();//cpu start
+			cv::bitwise_and(mat1_roi, mat2_roi, dst_roi, mask_roi);
+			t0 = (double)cvGetTickCount() - t0;//cpu end
+			// OCL path: t1 brackets the assignments into the g* objects, the call, and the download.
+			t1 = (double)cvGetTickCount();//gpu start1		
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+
+			gmat1 = mat1_roi;
+			gmat2 = mat2_roi;
+			gmask = mask_roi;
+			t2=(double)cvGetTickCount();//kernel: t2 brackets only the cv::ocl call
+			cv::ocl::bitwise_and(gmat1, gmat2, gdst, gmask); // NOTE(review): host-side ticks; whether the device work has finished when this returns is not visible here - confirm
+			t2 = (double)cvGetTickCount() - t2;//kernel
+			cv::Mat cpu_dst;
+			gdst_whole.download (cpu_dst);//download of the whole destination, counted in t1 only
+			t1 = (double)cvGetTickCount() - t1;//gpu end1
+			if(j == 0) // first pass is not counted (presumably warm-up / one-time setup cost)
+				continue;
+			totalgputick=t1+totalgputick;
+			totalcputick=t0+totalcputick;	
+			totalgputick_kernel=t2+totalgputick_kernel;	
+
+		}
+		if(k==0){cout<<"no roi\n";}else{cout<<"with roi\n";};
+		cout << "average cpu runtime is  " << totalcputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl; // average = total ticks / (tick frequency * LOOP_TIMES * 1000)
+		cout << "average gpu runtime is  " << totalgputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << totalgputick_kernel/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	}
+#else // PRINT_KERNEL_RUN_TIME build: no host-side timing here, one OCL call per ROI mode
+	for(int j = 0; j < 2; j ++)
+	{
+		Has_roi(j);
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+		gmat1 = mat1_roi;
+		gmat2 = mat2_roi;
+		gmask = mask_roi;
+
+		if(j==0){cout<<"no roi:";}else{cout<<"\nwith roi:";}; // label only; NOTE(review): the run time itself is presumably printed elsewhere when this macro is defined - confirm
+		cv::ocl::bitwise_and(gmat1, gmat2, gdst, gmask);
+	};
+#endif
+}
+
+TEST_P(Bitwise_and, Scalar) // Benchmark: cv::bitwise_and vs cv::ocl::bitwise_and of a matrix with the fixture's val, once per Has_roi mode.
+{   
+#ifndef PRINT_KERNEL_RUN_TIME   // default build: time both paths with cvGetTickCount and print averages
+	double totalcputick=0; // ticks summed over counted passes: CPU call
+	double totalgputick=0; // ticks summed over counted passes: OCL path including transfers
+	double totalgputick_kernel=0; // ticks summed over counted passes: OCL call only
+	double t0=0; // per-pass CPU interval
+	double t1=0; // per-pass OCL interval (assignments + call + download)
+	double t2=0; // per-pass interval around the OCL call
+	for(int k=0;k<2;k++){ // k is handed to Has_roi; the report labels k==0 "no roi", k==1 "with roi"
+		totalcputick=0;
+		totalgputick=0;
+		totalgputick_kernel=0;
+		for(int j = 0; j < LOOP_TIMES+1; j ++) // LOOP_TIMES+1 passes; pass j==0 runs but is not accumulated
+		{
+			Has_roi(k);       
+			// CPU reference call, timed on its own (t0).
+			t0 = (double)cvGetTickCount();//cpu start
+			cv::bitwise_and(mat1_roi, val, dst_roi);
+			t0 = (double)cvGetTickCount() - t0;//cpu end
+			// OCL path: t1 brackets the assignments into the g* objects, the call, and the download.
+			t1 = (double)cvGetTickCount();//gpu start1		
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+
+			gmat1 = mat1_roi;
+			t2=(double)cvGetTickCount();//kernel: t2 brackets only the cv::ocl call
+			cv::ocl::bitwise_and(gmat1, val, gdst); // NOTE(review): host-side ticks; whether the device work has finished when this returns is not visible here - confirm
+			t2 = (double)cvGetTickCount() - t2;//kernel
+			cv::Mat cpu_dst;
+			gdst_whole.download (cpu_dst);//download of the whole destination, counted in t1 only
+			t1 = (double)cvGetTickCount() - t1;//gpu end1		
+			if(j == 0) // first pass is not counted (presumably warm-up / one-time setup cost)
+				continue;
+			totalgputick=t1+totalgputick;
+			totalcputick=t0+totalcputick;	
+			totalgputick_kernel=t2+totalgputick_kernel;	
+
+		}
+		if(k==0){cout<<"no roi\n";}else{cout<<"with roi\n";};
+		cout << "average cpu runtime is  " << totalcputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl; // average = total ticks / (tick frequency * LOOP_TIMES * 1000)
+		cout << "average gpu runtime is  " << totalgputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << totalgputick_kernel/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	}
+#else // PRINT_KERNEL_RUN_TIME build: no host-side timing here, one OCL call per ROI mode
+	for(int j = 0; j < 2; j ++)
+	{
+		Has_roi(j);
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+		gmat1 = mat1_roi;
+
+		if(j==0){cout<<"no roi:";}else{cout<<"\nwith roi:";}; // label only; NOTE(review): the run time itself is presumably printed elsewhere when this macro is defined - confirm
+		cv::ocl::bitwise_and(gmat1, val, gdst);
+	};
+#endif
+}
+
+TEST_P(Bitwise_and, Scalar_Mask) // Benchmark: masked cv::bitwise_and vs masked cv::ocl::bitwise_and of a matrix with the fixture's val, once per Has_roi mode.
+{   
+
+#ifndef PRINT_KERNEL_RUN_TIME   // default build: time both paths with cvGetTickCount and print averages
+	double totalcputick=0; // ticks summed over counted passes: CPU call
+	double totalgputick=0; // ticks summed over counted passes: OCL path including transfers
+	double totalgputick_kernel=0; // ticks summed over counted passes: OCL call only
+	double t0=0; // per-pass CPU interval
+	double t1=0; // per-pass OCL interval (assignments + call + download)
+	double t2=0; // per-pass interval around the OCL call
+	for(int k=0;k<2;k++){ // k is handed to Has_roi; the report labels k==0 "no roi", k==1 "with roi"
+		totalcputick=0;
+		totalgputick=0;
+		totalgputick_kernel=0;
+		for(int j = 0; j < LOOP_TIMES+1; j ++) // LOOP_TIMES+1 passes; pass j==0 runs but is not accumulated
+		{
+			Has_roi(k);       
+			// CPU reference call, timed on its own (t0).
+			t0 = (double)cvGetTickCount();//cpu start
+			cv::bitwise_and(mat1_roi, val, dst_roi, mask_roi);
+			t0 = (double)cvGetTickCount() - t0;//cpu end
+			// OCL path: t1 brackets the assignments into the g* objects, the call, and the download.
+			t1 = (double)cvGetTickCount();//gpu start1		
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+
+			gmat1 = mat1_roi;
+			gmask = mask_roi; // refresh the mask the call below actually reads (was gmat2 = mat2_roi, which this call never uses)
+			t2=(double)cvGetTickCount();//kernel: t2 brackets only the cv::ocl call
+			cv::ocl::bitwise_and(gmat1, val, gdst, gmask);
+			t2 = (double)cvGetTickCount() - t2;//kernel
+			cv::Mat cpu_dst;
+			gdst_whole.download (cpu_dst);//download of the whole destination, counted in t1 only
+			t1 = (double)cvGetTickCount() - t1;//gpu end1	
+			if(j == 0) // first pass is not counted (presumably warm-up / one-time setup cost)
+				continue;
+			totalgputick=t1+totalgputick;
+			totalcputick=t0+totalcputick;	
+			totalgputick_kernel=t2+totalgputick_kernel;	
+
+		}
+		if(k==0){cout<<"no roi\n";}else{cout<<"with roi\n";};
+		cout << "average cpu runtime is  " << totalcputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl; // average = total ticks / (tick frequency * LOOP_TIMES * 1000)
+		cout << "average gpu runtime is  " << totalgputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << totalgputick_kernel/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	}
+#else // PRINT_KERNEL_RUN_TIME build: no host-side timing here, one OCL call per ROI mode
+	for(int j = 0; j < 2; j ++)
+	{
+		Has_roi(j);
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+		gmat1 = mat1_roi;
+		gmask = mask_roi;
+
+		if(j==0){cout<<"no roi:";}else{cout<<"\nwith roi:";}; // label only; NOTE(review): the run time itself is presumably printed elsewhere when this macro is defined - confirm
+		cv::ocl::bitwise_and(gmat1, val, gdst, gmask);
+	};
+#endif
+}
+
+
+
+////////////////////////////////bitwise_or/////////////////////////////////////////////////
+
+struct Bitwise_or : ArithmTestBase {}; // Fixture tag shared by the Bitwise_or benchmarks; adds nothing to ArithmTestBase.
+
+TEST_P(Bitwise_or, Mat) // Benchmark: cv::bitwise_or vs cv::ocl::bitwise_or on two matrices, once per Has_roi mode (0 and 1).
+{    
+
+#ifndef PRINT_KERNEL_RUN_TIME   // default build: time both paths with cvGetTickCount and print averages
+	double totalcputick=0; // ticks summed over counted passes: CPU call
+	double totalgputick=0; // ticks summed over counted passes: OCL path including transfers
+	double totalgputick_kernel=0; // ticks summed over counted passes: OCL call only
+	double t0=0; // per-pass CPU interval
+	double t1=0; // per-pass OCL interval (assignments + call + download)
+	double t2=0; // per-pass interval around the OCL call
+	for(int k=0;k<2;k++){ // k is handed to Has_roi; the report labels k==0 "no roi", k==1 "with roi"
+		totalcputick=0;
+		totalgputick=0;
+		totalgputick_kernel=0;
+		for(int j = 0; j < LOOP_TIMES+1; j ++) // LOOP_TIMES+1 passes; pass j==0 runs but is not accumulated
+		{
+			Has_roi(k);       
+			// CPU reference call, timed on its own (t0).
+			t0 = (double)cvGetTickCount();//cpu start
+			cv::bitwise_or(mat1_roi, mat2_roi, dst_roi);
+			t0 = (double)cvGetTickCount() - t0;//cpu end
+			// OCL path: t1 brackets the assignments into the g* objects, the call, and the download.
+			t1 = (double)cvGetTickCount();//gpu start1		
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+
+			gmat1 = mat1_roi;
+			gmat2 = mat2_roi;
+			t2=(double)cvGetTickCount();//kernel: t2 brackets only the cv::ocl call
+			cv::ocl::bitwise_or(gmat1, gmat2, gdst); // NOTE(review): host-side ticks; whether the device work has finished when this returns is not visible here - confirm
+			t2 = (double)cvGetTickCount() - t2;//kernel
+			cv::Mat cpu_dst;
+			gdst_whole.download (cpu_dst);//download of the whole destination, counted in t1 only
+			t1 = (double)cvGetTickCount() - t1;//gpu end1		
+			if(j == 0) // first pass is not counted (presumably warm-up / one-time setup cost)
+				continue;
+			totalgputick=t1+totalgputick;
+			totalcputick=t0+totalcputick;	
+			totalgputick_kernel=t2+totalgputick_kernel;	
+
+		}
+		if(k==0){cout<<"no roi\n";}else{cout<<"with roi\n";};
+		cout << "average cpu runtime is  " << totalcputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl; // average = total ticks / (tick frequency * LOOP_TIMES * 1000)
+		cout << "average gpu runtime is  " << totalgputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << totalgputick_kernel/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	}
+#else // PRINT_KERNEL_RUN_TIME build: no host-side timing here, one OCL call per ROI mode
+	for(int j = 0; j < 2; j ++)
+	{
+		Has_roi(j);
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+		gmat1 = mat1_roi;
+		gmat2 = mat2_roi;
+
+		if(j==0){cout<<"no roi:";}else{cout<<"\nwith roi:";}; // label only; NOTE(review): the run time itself is presumably printed elsewhere when this macro is defined - confirm
+		cv::ocl::bitwise_or(gmat1, gmat2, gdst);
+	};
+#endif
+}
+
+TEST_P(Bitwise_or, Mat_Mask) // Benchmark: masked cv::bitwise_or vs masked cv::ocl::bitwise_or on two matrices, once per Has_roi mode.
+{    
+
+#ifndef PRINT_KERNEL_RUN_TIME   // default build: time both paths with cvGetTickCount and print averages
+	double totalcputick=0; // ticks summed over counted passes: CPU call
+	double totalgputick=0; // ticks summed over counted passes: OCL path including transfers
+	double totalgputick_kernel=0; // ticks summed over counted passes: OCL call only
+	double t0=0; // per-pass CPU interval
+	double t1=0; // per-pass OCL interval (assignments + call + download)
+	double t2=0; // per-pass interval around the OCL call
+	for(int k=0;k<2;k++){ // k is handed to Has_roi; the report labels k==0 "no roi", k==1 "with roi"
+		totalcputick=0;
+		totalgputick=0;
+		totalgputick_kernel=0;
+		for(int j = 0; j < LOOP_TIMES+1; j ++) // LOOP_TIMES+1 passes; pass j==0 runs but is not accumulated
+		{
+			Has_roi(k);       
+			// CPU reference call, timed on its own (t0).
+			t0 = (double)cvGetTickCount();//cpu start
+			cv::bitwise_or(mat1_roi, mat2_roi, dst_roi, mask_roi);
+			t0 = (double)cvGetTickCount() - t0;//cpu end
+			// OCL path: t1 brackets the assignments into the g* objects, the call, and the download.
+			t1 = (double)cvGetTickCount();//gpu start1		
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+
+			gmat1 = mat1_roi;
+			gmat2 = mat2_roi;
+			gmask = mask_roi;
+			t2=(double)cvGetTickCount();//kernel: t2 brackets only the cv::ocl call
+			cv::ocl::bitwise_or(gmat1, gmat2, gdst, gmask); // NOTE(review): host-side ticks; whether the device work has finished when this returns is not visible here - confirm
+			t2 = (double)cvGetTickCount() - t2;//kernel
+			cv::Mat cpu_dst;
+			gdst_whole.download (cpu_dst);//download of the whole destination, counted in t1 only
+			t1 = (double)cvGetTickCount() - t1;//gpu end1		
+			if(j == 0) // first pass is not counted (presumably warm-up / one-time setup cost)
+				continue;
+			totalgputick=t1+totalgputick;
+			totalcputick=t0+totalcputick;	
+			totalgputick_kernel=t2+totalgputick_kernel;	
+
+		}
+		if(k==0){cout<<"no roi\n";}else{cout<<"with roi\n";};
+		cout << "average cpu runtime is  " << totalcputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl; // average = total ticks / (tick frequency * LOOP_TIMES * 1000)
+		cout << "average gpu runtime is  " << totalgputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << totalgputick_kernel/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	}
+#else // PRINT_KERNEL_RUN_TIME build: no host-side timing here, one OCL call per ROI mode
+	for(int j = 0; j < 2; j ++)
+	{
+		Has_roi(j);
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+		gmat1 = mat1_roi;
+		gmat2 = mat2_roi;
+		gmask = mask_roi;
+
+		if(j==0){cout<<"no roi:";}else{cout<<"\nwith roi:";}; // label only; NOTE(review): the run time itself is presumably printed elsewhere when this macro is defined - confirm
+		cv::ocl::bitwise_or(gmat1, gmat2, gdst, gmask);
+	};
+#endif
+}
+TEST_P(Bitwise_or, Scalar)
+{	// Unmasked matrix|scalar OR: cv::bitwise_or (CPU) timed against cv::ocl::bitwise_or.
+#ifndef PRINT_KERNEL_RUN_TIME
+	double cpuTotal = 0;    // summed CPU ticks over the counted passes
+	double gpuTotal = 0;    // summed OpenCL ticks, assignments and download included
+	double kernelTotal = 0; // summed ticks around the ocl call alone
+	double cpuLap = 0;
+	double gpuLap = 0;
+	double kernelLap = 0;
+	for(int roiMode = 0; roiMode < 2; ++roiMode) {
+		cpuTotal = 0;
+		gpuTotal = 0;
+		kernelTotal = 0;
+		for(int pass = 0; pass < LOOP_TIMES+1; ++pass)
+		{
+			Has_roi(roiMode);
+			// CPU reference run.
+			cpuLap = (double)cvGetTickCount();
+			cv::bitwise_or(mat1_roi, val, dst_roi);
+			cpuLap = (double)cvGetTickCount() - cpuLap;
+			// OpenCL run: gpuLap spans the assignments below, the call and the download.
+			gpuLap = (double)cvGetTickCount();
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+
+			gmat1 = mat1_roi;
+			kernelLap = (double)cvGetTickCount();
+			cv::ocl::bitwise_or(gmat1, val, gdst);
+			kernelLap = (double)cvGetTickCount() - kernelLap;
+			cv::Mat cpu_dst;
+			gdst_whole.download(cpu_dst);
+			gpuLap = (double)cvGetTickCount() - gpuLap;
+			if(pass != 0) // pass 0 is excluded from the averages
+			{
+				gpuTotal += gpuLap;
+				cpuTotal += cpuLap;
+				kernelTotal += kernelLap;
+			}
+		}
+		cout << (roiMode == 0 ? "no roi\n" : "with roi\n");
+		cout << "average cpu runtime is  " << cpuTotal/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime is  " << gpuTotal/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << kernelTotal/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	}
+#else
+	for(int roiMode = 0; roiMode < 2; ++roiMode)
+	{
+		Has_roi(roiMode);
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+		gmat1 = mat1_roi;
+		// Label the run, then issue the OpenCL call once per ROI mode.
+		cout << (roiMode == 0 ? "no roi:" : "\nwith roi:");
+		cv::ocl::bitwise_or(gmat1, val, gdst);
+	}
+#endif
+}
+
+TEST_P(Bitwise_or, Scalar_Mask)
+{	// Masked matrix|scalar OR: cv::bitwise_or (CPU) timed against cv::ocl::bitwise_or.
+#ifndef PRINT_KERNEL_RUN_TIME
+	double cpuTotal = 0;    // summed CPU ticks over the counted passes
+	double gpuTotal = 0;    // summed OpenCL ticks, assignments and download included
+	double kernelTotal = 0; // summed ticks around the ocl call alone
+	double cpuLap = 0;
+	double gpuLap = 0;
+	double kernelLap = 0;
+	for(int roiMode = 0; roiMode < 2; ++roiMode) {
+		cpuTotal = 0;
+		gpuTotal = 0;
+		kernelTotal = 0;
+		for(int pass = 0; pass < LOOP_TIMES+1; ++pass)
+		{
+			Has_roi(roiMode);
+			// CPU reference run.
+			cpuLap = (double)cvGetTickCount();
+			cv::bitwise_or(mat1_roi, val, dst_roi, mask_roi);
+			cpuLap = (double)cvGetTickCount() - cpuLap;
+			// OpenCL run: gpuLap spans the assignments below, the call and the download.
+			gpuLap = (double)cvGetTickCount();
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+
+			gmat1 = mat1_roi;
+			gmask = mask_roi;
+			kernelLap = (double)cvGetTickCount();
+			cv::ocl::bitwise_or(gmat1, val, gdst, gmask);
+			kernelLap = (double)cvGetTickCount() - kernelLap;
+			cv::Mat cpu_dst;
+			gdst_whole.download(cpu_dst);
+			gpuLap = (double)cvGetTickCount() - gpuLap;
+			if(pass != 0) // pass 0 is excluded from the averages
+			{
+				gpuTotal += gpuLap;
+				cpuTotal += cpuLap;
+				kernelTotal += kernelLap;
+			}
+		}
+		cout << (roiMode == 0 ? "no roi\n" : "with roi\n");
+		cout << "average cpu runtime is  " << cpuTotal/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime is  " << gpuTotal/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << kernelTotal/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	}
+#else
+	for(int roiMode = 0; roiMode < 2; ++roiMode)
+	{
+		Has_roi(roiMode);
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+		gmat1 = mat1_roi;
+		gmask = mask_roi;
+		// Label the run, then issue the OpenCL call once per ROI mode.
+		cout << (roiMode == 0 ? "no roi:" : "\nwith roi:");
+		cv::ocl::bitwise_or(gmat1, val, gdst, gmask);
+	}
+#endif
+}
+
+
+////////////////////////////////bitwise_xor/////////////////////////////////////////////////
+
+struct Bitwise_xor : ArithmTestBase {}; // fixture name for the Bitwise_xor benchmarks; adds no members to ArithmTestBase
+
+TEST_P(Bitwise_xor, Mat)
+{	// Unmasked matrix^matrix XOR: cv::bitwise_xor (CPU) timed against cv::ocl::bitwise_xor.
+#ifndef PRINT_KERNEL_RUN_TIME
+	double cpuTotal = 0;    // summed CPU ticks over the counted passes
+	double gpuTotal = 0;    // summed OpenCL ticks, assignments and download included
+	double kernelTotal = 0; // summed ticks around the ocl call alone
+	double cpuLap = 0;
+	double gpuLap = 0;
+	double kernelLap = 0;
+	for(int roiMode = 0; roiMode < 2; ++roiMode) {
+		cpuTotal = 0;
+		gpuTotal = 0;
+		kernelTotal = 0;
+		for(int pass = 0; pass < LOOP_TIMES+1; ++pass)
+		{
+			Has_roi(roiMode);
+			// CPU reference run.
+			cpuLap = (double)cvGetTickCount();
+			cv::bitwise_xor(mat1_roi, mat2_roi, dst_roi);
+			cpuLap = (double)cvGetTickCount() - cpuLap;
+			// OpenCL run: gpuLap spans the assignments below, the call and the download.
+			gpuLap = (double)cvGetTickCount();
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+
+			gmat1 = mat1_roi;
+			gmat2 = mat2_roi;
+			kernelLap = (double)cvGetTickCount();
+			cv::ocl::bitwise_xor(gmat1, gmat2, gdst);
+			kernelLap = (double)cvGetTickCount() - kernelLap;
+			cv::Mat cpu_dst;
+			gdst_whole.download(cpu_dst);
+			gpuLap = (double)cvGetTickCount() - gpuLap;
+			if(pass != 0) // pass 0 is excluded from the averages
+			{
+				gpuTotal += gpuLap;
+				cpuTotal += cpuLap;
+				kernelTotal += kernelLap;
+			}
+		}
+		cout << (roiMode == 0 ? "no roi\n" : "with roi\n");
+		cout << "average cpu runtime is  " << cpuTotal/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime is  " << gpuTotal/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << kernelTotal/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	}
+#else
+	for(int roiMode = 0; roiMode < 2; ++roiMode)
+	{
+		Has_roi(roiMode);
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+		gmat1 = mat1_roi;
+		gmat2 = mat2_roi;
+		// Label the run, then issue the OpenCL call once per ROI mode.
+		cout << (roiMode == 0 ? "no roi:" : "\nwith roi:");
+		cv::ocl::bitwise_xor(gmat1, gmat2, gdst);
+	}
+#endif
+}
+
+TEST_P(Bitwise_xor, Mat_Mask)
+{	// Masked matrix^matrix XOR: cv::bitwise_xor (CPU) timed against cv::ocl::bitwise_xor.
+#ifndef PRINT_KERNEL_RUN_TIME
+	double cpuTotal = 0;    // summed CPU ticks over the counted passes
+	double gpuTotal = 0;    // summed OpenCL ticks, assignments and download included
+	double kernelTotal = 0; // summed ticks around the ocl call alone
+	double cpuLap = 0;
+	double gpuLap = 0;
+	double kernelLap = 0;
+	for(int roiMode = 0; roiMode < 2; ++roiMode) {
+		cpuTotal = 0;
+		gpuTotal = 0;
+		kernelTotal = 0;
+		for(int pass = 0; pass < LOOP_TIMES+1; ++pass)
+		{
+			Has_roi(roiMode);
+			// CPU reference run.
+			cpuLap = (double)cvGetTickCount();
+			cv::bitwise_xor(mat1_roi, mat2_roi, dst_roi, mask_roi);
+			cpuLap = (double)cvGetTickCount() - cpuLap;
+			// OpenCL run: gpuLap spans the assignments below, the call and the download.
+			gpuLap = (double)cvGetTickCount();
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+
+			gmat1 = mat1_roi;
+			gmat2 = mat2_roi;
+			gmask = mask_roi;
+			kernelLap = (double)cvGetTickCount();
+			cv::ocl::bitwise_xor(gmat1, gmat2, gdst, gmask);
+			kernelLap = (double)cvGetTickCount() - kernelLap;
+			cv::Mat cpu_dst;
+			gdst_whole.download(cpu_dst);
+			gpuLap = (double)cvGetTickCount() - gpuLap;
+			if(pass != 0) // pass 0 is excluded from the averages
+			{
+				gpuTotal += gpuLap;
+				cpuTotal += cpuLap;
+				kernelTotal += kernelLap;
+			}
+		}
+		cout << (roiMode == 0 ? "no roi\n" : "with roi\n");
+		cout << "average cpu runtime is  " << cpuTotal/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime is  " << gpuTotal/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << kernelTotal/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	}
+#else
+	for(int roiMode = 0; roiMode < 2; ++roiMode)
+	{
+		Has_roi(roiMode);
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+		gmat1 = mat1_roi;
+		gmat2 = mat2_roi;
+		gmask = mask_roi;
+		// Label the run, then issue the OpenCL call once per ROI mode.
+		cout << (roiMode == 0 ? "no roi:" : "\nwith roi:");
+		cv::ocl::bitwise_xor(gmat1, gmat2, gdst, gmask);
+	}
+#endif
+}
+
+TEST_P(Bitwise_xor, Scalar)
+{	// Unmasked matrix^scalar XOR: cv::bitwise_xor (CPU) timed against cv::ocl::bitwise_xor.
+#ifndef PRINT_KERNEL_RUN_TIME
+	double cpuTotal = 0;    // summed CPU ticks over the counted passes
+	double gpuTotal = 0;    // summed OpenCL ticks, assignments and download included
+	double kernelTotal = 0; // summed ticks around the ocl call alone
+	double cpuLap = 0;
+	double gpuLap = 0;
+	double kernelLap = 0;
+	for(int roiMode = 0; roiMode < 2; ++roiMode) {
+		cpuTotal = 0;
+		gpuTotal = 0;
+		kernelTotal = 0;
+		for(int pass = 0; pass < LOOP_TIMES+1; ++pass)
+		{
+			Has_roi(roiMode);
+			// CPU reference run.
+			cpuLap = (double)cvGetTickCount();
+			cv::bitwise_xor(mat1_roi, val, dst_roi);
+			cpuLap = (double)cvGetTickCount() - cpuLap;
+			// OpenCL run: gpuLap spans the assignments below, the call and the download.
+			gpuLap = (double)cvGetTickCount();
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+
+			gmat1 = mat1_roi;
+			kernelLap = (double)cvGetTickCount();
+			cv::ocl::bitwise_xor(gmat1, val, gdst);
+			kernelLap = (double)cvGetTickCount() - kernelLap;
+			cv::Mat cpu_dst;
+			gdst_whole.download(cpu_dst);
+			gpuLap = (double)cvGetTickCount() - gpuLap;
+			if(pass != 0) // pass 0 is excluded from the averages
+			{
+				gpuTotal += gpuLap;
+				cpuTotal += cpuLap;
+				kernelTotal += kernelLap;
+			}
+		}
+		cout << (roiMode == 0 ? "no roi\n" : "with roi\n");
+		cout << "average cpu runtime is  " << cpuTotal/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime is  " << gpuTotal/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << kernelTotal/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	}
+#else
+	for(int roiMode = 0; roiMode < 2; ++roiMode)
+	{
+		Has_roi(roiMode);
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+		gmat1 = mat1_roi;
+		// Label the run, then issue the OpenCL call once per ROI mode.
+		cout << (roiMode == 0 ? "no roi:" : "\nwith roi:");
+		cv::ocl::bitwise_xor(gmat1, val, gdst);
+	}
+#endif
+}
+
+TEST_P(Bitwise_xor, Scalar_Mask)
+{	// Masked matrix^scalar XOR: cv::bitwise_xor (CPU) timed against cv::ocl::bitwise_xor.
+#ifndef PRINT_KERNEL_RUN_TIME
+	double cpuTotal = 0;    // summed CPU ticks over the counted passes
+	double gpuTotal = 0;    // summed OpenCL ticks, assignments and download included
+	double kernelTotal = 0; // summed ticks around the ocl call alone
+	double cpuLap = 0;
+	double gpuLap = 0;
+	double kernelLap = 0;
+	for(int roiMode = 0; roiMode < 2; ++roiMode) {
+		cpuTotal = 0;
+		gpuTotal = 0;
+		kernelTotal = 0;
+		for(int pass = 0; pass < LOOP_TIMES+1; ++pass)
+		{
+			Has_roi(roiMode);
+			// CPU reference run.
+			cpuLap = (double)cvGetTickCount();
+			cv::bitwise_xor(mat1_roi, val, dst_roi, mask_roi);
+			cpuLap = (double)cvGetTickCount() - cpuLap;
+			// OpenCL run: gpuLap spans the assignments below, the call and the download.
+			gpuLap = (double)cvGetTickCount();
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+
+			gmat1 = mat1_roi;
+			gmask = mask_roi;
+			kernelLap = (double)cvGetTickCount();
+			cv::ocl::bitwise_xor(gmat1, val, gdst, gmask);
+			kernelLap = (double)cvGetTickCount() - kernelLap;
+			cv::Mat cpu_dst;
+			gdst_whole.download(cpu_dst);
+			gpuLap = (double)cvGetTickCount() - gpuLap;
+			if(pass != 0) // pass 0 is excluded from the averages
+			{
+				gpuTotal += gpuLap;
+				cpuTotal += cpuLap;
+				kernelTotal += kernelLap;
+			}
+		}
+		cout << (roiMode == 0 ? "no roi\n" : "with roi\n");
+		cout << "average cpu runtime is  " << cpuTotal/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime is  " << gpuTotal/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << kernelTotal/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	}
+#else
+	for(int roiMode = 0; roiMode < 2; ++roiMode)
+	{
+		Has_roi(roiMode);
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+		gmat1 = mat1_roi;
+		gmask = mask_roi;
+		// Label the run, then issue the OpenCL call once per ROI mode.
+		cout << (roiMode == 0 ? "no roi:" : "\nwith roi:");
+		cv::ocl::bitwise_xor(gmat1, val, gdst, gmask);
+	}
+#endif
+}
+
+
+////////////////////////////////bitwise_not/////////////////////////////////////////////////
+
+struct Bitwise_not : ArithmTestBase {}; // fixture name for the Bitwise_not benchmark; adds no members to ArithmTestBase
+
+TEST_P(Bitwise_not, Mat)
+{	// Bitwise inversion of one matrix: cv::bitwise_not (CPU) timed against cv::ocl::bitwise_not.
+#ifndef PRINT_KERNEL_RUN_TIME
+	double cpuTotal = 0;    // summed CPU ticks over the counted passes
+	double gpuTotal = 0;    // summed OpenCL ticks, assignments and download included
+	double kernelTotal = 0; // summed ticks around the ocl call alone
+	double cpuLap = 0;
+	double gpuLap = 0;
+	double kernelLap = 0;
+	for(int roiMode = 0; roiMode < 2; ++roiMode) {
+		cpuTotal = 0;
+		gpuTotal = 0;
+		kernelTotal = 0;
+		for(int pass = 0; pass < LOOP_TIMES+1; ++pass)
+		{
+			Has_roi(roiMode);
+			// CPU reference run.
+			cpuLap = (double)cvGetTickCount();
+			cv::bitwise_not(mat1_roi, dst_roi);
+			cpuLap = (double)cvGetTickCount() - cpuLap;
+			// OpenCL run: gpuLap spans the assignments below, the call and the download.
+			gpuLap = (double)cvGetTickCount();
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+
+			gmat1 = mat1_roi;
+			kernelLap = (double)cvGetTickCount();
+			cv::ocl::bitwise_not(gmat1, gdst);
+			kernelLap = (double)cvGetTickCount() - kernelLap;
+			cv::Mat cpu_dst;
+			gdst_whole.download(cpu_dst);
+			gpuLap = (double)cvGetTickCount() - gpuLap;
+			if(pass != 0) // pass 0 is excluded from the averages
+			{
+				gpuTotal += gpuLap;
+				cpuTotal += cpuLap;
+				kernelTotal += kernelLap;
+			}
+		}
+		cout << (roiMode == 0 ? "no roi\n" : "with roi\n");
+		cout << "average cpu runtime is  " << cpuTotal/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime is  " << gpuTotal/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << kernelTotal/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	}
+#else
+	for(int roiMode = 0; roiMode < 2; ++roiMode)
+	{
+		Has_roi(roiMode);
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+		gmat1 = mat1_roi;
+		// Label the run, then issue the OpenCL call once per ROI mode.
+		cout << (roiMode == 0 ? "no roi:" : "\nwith roi:");
+		cv::ocl::bitwise_not(gmat1, gdst);
+	}
+#endif
+}
+
+////////////////////////////////compare/////////////////////////////////////////////////
+PARAM_TEST_CASE ( CompareTestBase, MatType, bool)
+{
+	int type;        // element type of the test matrices; fixed to CV_8UC1 in SetUp()
+	cv::Scalar val;  // random four-component scalar generated in SetUp()
+
+	// full-size host matrices
+	cv::Mat mat1;
+	cv::Mat mat2;
+	cv::Mat mask;
+	cv::Mat dst;
+	cv::Mat dst1; // second destination, kept for operations with two outputs
+
+	// ROI geometry, assigned by Has_roi()
+	int roicols;
+	int roirows;
+	int src1x;
+	int src1y;
+	int src2x;
+	int src2y;
+	int dstx;
+	int dsty;
+	int maskx;
+	int masky;
+
+
+	// host-side ROI views into the matrices above, rebuilt by Has_roi()
+	cv::Mat mat1_roi;
+	cv::Mat mat2_roi;
+	cv::Mat mask_roi;
+	cv::Mat dst_roi;
+	cv::Mat dst1_roi; // view into dst1
+	std::vector<cv::ocl::Info> oclinfo;
+	// whole-matrix OpenCL destinations
+	cv::ocl::oclMat gdst_whole;
+	cv::ocl::oclMat gdst1_whole; // counterpart of dst1
+
+	// OpenCL matrices the tests fill from the ROI views
+	cv::ocl::oclMat gmat1;
+	cv::ocl::oclMat gmat2;
+	cv::ocl::oclMat gdst;
+	cv::ocl::oclMat gdst1;   // counterpart of dst1_roi
+	cv::ocl::oclMat gmask;
+
+	virtual void SetUp()
+	{
+		// NOTE: the MatType test parameter is not read (GET_PARAM(0) is disabled); every run uses CV_8UC1.
+		type = CV_8UC1;
+
+		cv::RNG& rng = TS::ptr()->get_rng();
+
+		cv::Size size(MWIDTH, MHEIGHT);
+
+		mat1 = randomMat(rng, size, type, 5, 16, false);
+		// mat2 has the same size and type as mat1.
+		mat2 = randomMat(rng, size, type, 5, 16, false);
+		dst  = randomMat(rng, size, type, 5, 16, false);
+		dst1 = randomMat(rng, size, type, 5, 16, false);
+		mask = randomMat(rng, size, CV_8UC1, 0, 2,  false);
+		// Binarise the mask to {0, 255}. The last argument is a threshold type, not a matrix type.
+		cv::threshold(mask, mask, 0.5, 255., cv::THRESH_BINARY);
+
+		val = cv::Scalar(rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0));
+		int devnums = getDevice(oclinfo);
+		CV_Assert(devnums>0);
+		//if you want to use undefault device, set it here
+		//setDevice(oclinfo[0]);
+		setBinpath(CLBINPATH);
+	}
+
+	void Has_roi(int b)
+	{
+		// b != 0 selects the ROI layout, b == 0 the whole-matrix layout.
+		if(b)
+		{
+			// Fixed (not random) ROI: skip the first row and column of every matrix.
+			roicols = mat1.cols-1;
+			roirows = mat1.rows-1;
+			src1x   = 1;
+			src2x   = 1;
+			src1y   = 1;
+			src2y   = 1;
+			dstx    = 1;
+			dsty    = 1;
+			maskx   = 1;
+			masky   = 1;
+		}else
+		{
+			roicols = mat1.cols;
+			roirows = mat1.rows;
+			src1x = 0;
+			src2x = 0;
+			src1y = 0;
+			src2y = 0;
+			dstx = 0;
+			dsty = 0;
+			maskx = 0;
+			masky = 0;
+		}
+
+		mat1_roi = mat1(Rect(src1x,src1y,roicols,roirows));
+		// All views share the same roicols x roirows extent.
+		mat2_roi = mat2(Rect(src2x,src2y,roicols,roirows));
+		mask_roi = mask(Rect(maskx,masky,roicols,roirows));
+		dst_roi  = dst(Rect(dstx,dsty,roicols,roirows));
+		dst1_roi = dst1(Rect(dstx,dsty,roicols,roirows));
+		// Left disabled: the oclMat members are not assigned here; Compare.Mat assigns them itself.
+		//gdst_whole = dst;
+		//gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+
+		//gdst1_whole = dst1;
+		//gdst1 = gdst1_whole(Rect(dstx,dsty,roicols,roirows));
+
+		//gmat1 = mat1_roi;
+		//gmat2 = mat2_roi;
+		//gmask = mask_roi; 
+	}
+
+};
+struct Compare : CompareTestBase {}; // fixture name for the Compare benchmark; adds no members to CompareTestBase
+
+TEST_P(Compare, Mat)
+{	// Times cv::compare (CPU) against cv::ocl::compare for each of the six comparison codes in turn.
+	if(mat1.type()==CV_8SC1)
+	{
+		cout << "\tUnsupported type\t\n";
+		return; // skip the benchmark instead of running it on the type just reported as unsupported
+	}
+	int cmp_codes[] = {CMP_EQ, CMP_GT, CMP_GE, CMP_LT, CMP_LE, CMP_NE};
+	// The code under test is not printed; result groups appear in cmp_codes order.
+	int cmp_num = sizeof(cmp_codes) / sizeof(int);
+	for (int i = 0; i < cmp_num; ++i)
+	{
+
+#ifndef PRINT_KERNEL_RUN_TIME
+		double cpuTotal = 0;    // summed CPU ticks over the counted passes
+		double gpuTotal = 0;    // summed OpenCL ticks, assignments and download included
+		double kernelTotal = 0; // summed ticks around the ocl call alone
+		double cpuLap = 0;
+		double gpuLap = 0;
+		double kernelLap = 0;
+		for(int roiMode = 0; roiMode < 2; ++roiMode) { // starts at 0 so the "no roi" case is timed too
+			cpuTotal = 0;
+			gpuTotal = 0;
+			kernelTotal = 0;
+			for(int pass = 0; pass < LOOP_TIMES+1; ++pass)
+			{
+				Has_roi(roiMode);
+				// CPU reference run.
+				cpuLap = (double)cvGetTickCount();
+				cv::compare(mat1_roi, mat2_roi, dst_roi, cmp_codes[i]);
+				cpuLap = (double)cvGetTickCount() - cpuLap;
+				// OpenCL run: gpuLap spans the assignments below, the call and the download.
+				gpuLap = (double)cvGetTickCount();
+				gdst_whole = dst;
+				gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+
+				gmat1 = mat1_roi;
+				gmat2 = mat2_roi;
+				kernelLap = (double)cvGetTickCount();
+				cv::ocl::compare(gmat1, gmat2, gdst, cmp_codes[i]);
+				kernelLap = (double)cvGetTickCount() - kernelLap;
+				cv::Mat cpu_dst;
+				gdst_whole.download(cpu_dst);
+				gpuLap = (double)cvGetTickCount() - gpuLap;
+				if(pass != 0) // pass 0 is excluded from the averages
+				{
+					gpuTotal += gpuLap;
+					cpuTotal += cpuLap;
+					kernelTotal += kernelLap;
+				}
+			}
+			cout << (roiMode == 0 ? "no roi\n" : "with roi\n");
+			cout << "average cpu runtime is  " << cpuTotal/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+			cout << "average gpu runtime is  " << gpuTotal/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+			cout << "average gpu runtime without data transfer is  " << kernelTotal/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		}
+#else
+		for(int roiMode = 0; roiMode < 2; ++roiMode)
+		{
+			Has_roi(roiMode);
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+			gmat1 = mat1_roi;
+			gmat2 = mat2_roi;
+			cout << (roiMode == 0 ? "no roi:" : "\nwith roi:");
+			cv::ocl::compare(gmat1, gmat2, gdst, cmp_codes[i]);
+		}
+#endif
+	}
+
+}
+
+struct Pow : ArithmTestBase {}; // fixture name for the Pow benchmark; adds no members to ArithmTestBase
+
+TEST_P(Pow, Mat)
+{	// Raises a matrix to the power 4.5: cv::pow (CPU) timed against cv::ocl::pow.
+	if(mat1.depth()!=CV_32F && mat1.depth()!=CV_64F)
+	{
+		cout<<"\tUnsupported type\t\n";
+		return; // skip the benchmark instead of running it on the depth just reported as unsupported
+	}
+#ifndef PRINT_KERNEL_RUN_TIME
+	double cpuTotal = 0;    // summed CPU ticks over the counted passes
+	double gpuTotal = 0;    // summed OpenCL ticks, assignments and download included
+	double kernelTotal = 0; // summed ticks around the ocl call alone
+	double cpuLap = 0;
+	double gpuLap = 0;
+	double kernelLap = 0;
+	for(int roiMode = 0; roiMode < 2; ++roiMode) {
+		cpuTotal = 0;
+		gpuTotal = 0;
+		kernelTotal = 0;
+		for(int pass = 0; pass < LOOP_TIMES+1; ++pass)
+		{
+			Has_roi(roiMode);
+			const double p = 4.5; // exponent used by both implementations
+			cpuLap = (double)cvGetTickCount();
+			cv::pow(mat1_roi, p, dst_roi);
+			cpuLap = (double)cvGetTickCount() - cpuLap;
+			// OpenCL run: gpuLap spans the assignments below, the call and the download.
+			gpuLap = (double)cvGetTickCount();
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+
+			gmat1 = mat1_roi;
+			kernelLap = (double)cvGetTickCount();
+			cv::ocl::pow(gmat1, p, gdst);
+			kernelLap = (double)cvGetTickCount() - kernelLap;
+			cv::Mat cpu_dst;
+			gdst_whole.download(cpu_dst);
+			gpuLap = (double)cvGetTickCount() - gpuLap;
+			if(pass != 0) // pass 0 is excluded from the averages
+			{
+				gpuTotal += gpuLap;
+				cpuTotal += cpuLap;
+				kernelTotal += kernelLap;
+			}
+		}
+		cout << (roiMode == 0 ? "no roi\n" : "with roi\n");
+		cout << "average cpu runtime is  " << cpuTotal/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime is  " << gpuTotal/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << kernelTotal/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	}
+#else
+	for(int roiMode = 0; roiMode < 2; ++roiMode)
+	{
+		Has_roi(roiMode);
+		const double p = 4.5;
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+		gmat1 = mat1_roi;
+		cout << (roiMode == 0 ? "no roi:" : "\nwith roi:");
+		cv::ocl::pow(gmat1, p, gdst);
+	}
+#endif
+}
+
+
+struct MagnitudeSqr : ArithmTestBase {}; // fixture name for the MagnitudeSqr benchmark; adds no members to ArithmTestBase
+
+TEST_P(MagnitudeSqr, Mat)
+{	// Per-element a*a + b*b: a hand-written CPU loop timed against cv::ocl::magnitudeSqr.
+	if(mat1.type() != CV_32FC1 || mat2.type() != CV_32FC1 || dst.type() != CV_32FC1) { cout << "\tUnsupported type\t\n"; return; } // the CPU loop below accesses elements as single floats
+#ifndef PRINT_KERNEL_RUN_TIME
+	double cpuTotal = 0;    // summed CPU ticks over the counted passes
+	double gpuTotal = 0;    // summed OpenCL ticks, oclMat construction and download included
+	double kernelTotal = 0; // summed ticks around the ocl call alone
+	double cpuLap = 0;
+	double gpuLap = 0;
+	double kernelLap = 0;
+	for(int roiMode = 0; roiMode < 2; ++roiMode) {
+		cpuTotal = 0;
+		gpuTotal = 0;
+		kernelTotal = 0;
+		for(int pass = 0; pass < LOOP_TIMES+1; ++pass)
+		{
+			Has_roi(roiMode);
+			// NOTE: both paths below work on the full mat1/mat2, so the two ROI modes time the same data.
+			cpuLap = (double)cvGetTickCount();
+			for(int row = 0; row < mat1.rows; ++row)
+				for(int col = 0; col < mat1.cols; ++col) // distinct names: no shadowing of the pass counter
+				{
+					const float a = mat1.at<float>(row, col);
+					const float b = mat2.at<float>(row, col);
+
+					dst.at<float>(row, col) = a * a + b * b;
+
+				}
+			cpuLap = (double)cvGetTickCount() - cpuLap;
+			// OpenCL run: gpuLap spans the oclMat construction, the call and the download.
+			gpuLap = (double)cvGetTickCount();
+			cv::ocl::oclMat clmat1(mat1), clmat2(mat2), cldst;
+			kernelLap = (double)cvGetTickCount();
+			cv::ocl::magnitudeSqr(clmat1, clmat2, cldst);
+			kernelLap = (double)cvGetTickCount() - kernelLap;
+			cv::Mat cpu_dst;
+			cldst.download(cpu_dst);
+			gpuLap = (double)cvGetTickCount() - gpuLap;
+			if(pass != 0) // pass 0 is excluded from the averages
+			{
+				gpuTotal += gpuLap;
+				cpuTotal += cpuLap;
+				kernelTotal += kernelLap;
+			}
+		}
+		cout << (roiMode == 0 ? "no roi\n" : "with roi\n");
+		cout << "average cpu runtime is  " << cpuTotal/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime is  " << gpuTotal/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << kernelTotal/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	}
+#else
+	for(int roiMode = 0; roiMode < 2; ++roiMode)
+	{
+		Has_roi(roiMode);
+		cv::ocl::oclMat clmat1(mat1), clmat2(mat2), cldst;
+		cout << (roiMode == 0 ? "no roi:" : "\nwith roi:");
+		cv::ocl::magnitudeSqr(clmat1, clmat2, cldst);
+	}
+#endif
+
+}
+
+
+struct AddWeighted : ArithmTestBase {}; // fixture name for the AddWeighted benchmark; adds no members to ArithmTestBase
+
+TEST_P(AddWeighted, Mat)
+{	// Weighted sum of two matrices plus an offset: cv::addWeighted (CPU) timed against cv::ocl::addWeighted.
+#ifndef PRINT_KERNEL_RUN_TIME
+	double cpuTotal = 0;    // summed CPU ticks over the counted passes
+	double gpuTotal = 0;    // summed OpenCL ticks, assignments and download included
+	double kernelTotal = 0; // summed ticks around the ocl call alone
+	double cpuLap = 0;
+	double gpuLap = 0;
+	double kernelLap = 0;
+	for(int roiMode = 0; roiMode < 2; ++roiMode) {
+		cpuTotal = 0;
+		gpuTotal = 0;
+		kernelTotal = 0;
+		for(int pass = 0; pass < LOOP_TIMES+1; ++pass)
+		{
+			Has_roi(roiMode);
+			double weight1 = 2.0, weight2 = 1.0, offset = 3.0;
+			// CPU reference run.
+			cpuLap = (double)cvGetTickCount();
+			cv::addWeighted(mat1_roi, weight1, mat2_roi, weight2, offset, dst_roi);
+			cpuLap = (double)cvGetTickCount() - cpuLap;
+			// OpenCL run: gpuLap spans the assignments below, the call and the download.
+			gpuLap = (double)cvGetTickCount();
+
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+
+			gmat1 = mat1_roi;
+			gmat2 = mat2_roi;
+
+			kernelLap = (double)cvGetTickCount();
+			cv::ocl::addWeighted(gmat1, weight1, gmat2, weight2, offset, gdst);
+			kernelLap = (double)cvGetTickCount() - kernelLap;
+			cv::Mat cpu_dst;
+			gdst_whole.download(cpu_dst);
+			gpuLap = (double)cvGetTickCount() - gpuLap;
+			if(pass != 0) // pass 0 is excluded from the averages
+			{
+				gpuTotal += gpuLap;
+				cpuTotal += cpuLap;
+				kernelTotal += kernelLap;
+			}
+		}
+
+		cout << (roiMode == 0 ? "no roi\n" : "with roi\n");
+		cout << "average cpu runtime is  " << cpuTotal/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime is  " << gpuTotal/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << kernelTotal/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	}
+#else
+	for(int roiMode = 0; roiMode < 2; ++roiMode)
+	{
+		// One OpenCL call per ROI mode, working on the ROI views
+		// prepared by Has_roi() rather than on the whole matrices.
+		Has_roi(roiMode);
+		double weight1 = 2.0, weight2 = 1.0, offset = 3.0;
+
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+		gmat1 = mat1_roi;
+		gmat2 = mat2_roi;
+
+		cout << (roiMode == 0 ? "no roi:" : "\nwith roi:");
+		cv::ocl::addWeighted(gmat1, weight1, gmat2, weight2, offset, gdst);
+	}
+#endif
+
+}
+/*
+struct AddWeighted : ArithmTestBase {};
+
+TEST_P(AddWeighted, Mat) 
+{    
+#ifndef PRINT_KERNEL_RUN_TIME   
+	double totalcputick=0;
+	double totalgputick=0;
+	double totalgputick_kernel=0;
+	double t0=0;
+	double t1=0;
+	double t2=0;	
+	for(int j = 0; j < LOOP_TIMES+1; j ++)
+	{
+		double alpha=2.0,beta=1.0,gama=3.0;      
+
+		t0 = (double)cvGetTickCount();//cpu start
+		cv::addWeighted(mat1,alpha,mat2,beta,gama,dst);
+		t0 = (double)cvGetTickCount() - t0;//cpu end
+
+		t1 = (double)cvGetTickCount();//gpu start1		
+		cv::ocl::oclMat clmat1(mat1),clmat2(mat2),cldst;
+
+		t2=(double)cvGetTickCount();//kernel
+		cv::ocl::addWeighted(clmat1,alpha,clmat2,beta,gama, cldst);
+		t2 = (double)cvGetTickCount() - t2;//kernel
+		cv::Mat cpu_dst;
+		cldst.download(cpu_dst);
+		t1 = (double)cvGetTickCount() - t1;//gpu end1
+		if(j == 0)
+			continue;
+		totalgputick=t1+totalgputick;
+		totalcputick=t0+totalcputick;	
+		totalgputick_kernel=t2+totalgputick_kernel;	
+
+	}
+	cout << "average cpu runtime is  " << totalcputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	cout << "average gpu runtime is  " << totalgputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	cout << "average gpu runtime without data transfer is  " << totalgputick_kernel/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+
+#else
+	//for(int j = 0; j < 2; j ++)
+	//	{
+	double alpha=2.0,beta=1.0,gama=3.0;   
+	cv::ocl::oclMat clmat1(mat1),clmat2(mat2),cldst;
+	//if(j==0){cout<<"no roi:";}else{cout<<"\nwith roi:";};
+	cv::ocl::addWeighted(clmat1,alpha,clmat2,beta,gama, cldst);
+	//	};
+#endif
+
+}
+
+*/
+//********test****************
+
+INSTANTIATE_TEST_CASE_P(Arithm, Lut, Combine( // each instantiation pairs a list of matrix types with one bool value
+						Values(CV_8UC1, CV_8UC4),
+						Values(false))); // the bool parameter is reserved; only false is supplied
+
+INSTANTIATE_TEST_CASE_P(Arithm, Exp, Combine(
+						Values(CV_32FC1, CV_64FC1),
+						Values(false))); // the bool parameter is reserved; only false is supplied
+
+INSTANTIATE_TEST_CASE_P(Arithm, Log, Combine(
+						Values(CV_32FC1, CV_64FC1),
+						Values(false))); // the bool parameter is reserved; only false is supplied
+
+INSTANTIATE_TEST_CASE_P(Arithm, Add, Combine(
+						Values(CV_8UC1, CV_8UC4, CV_32FC1,  CV_32FC4),
+						Values(false))); // the bool parameter is reserved; only false is supplied
+
+INSTANTIATE_TEST_CASE_P(Arithm, Mul, Combine(
+						Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
+						Values(false))); // the bool parameter is reserved; only false is supplied
+
+INSTANTIATE_TEST_CASE_P(Arithm, Div, Combine(
+						Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
+						Values(false))); // the bool parameter is reserved; only false is supplied
+
+
+INSTANTIATE_TEST_CASE_P(Arithm, Absdiff, Combine(
+						Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
+						Values(false))); // the bool parameter is reserved; only false is supplied
+
+INSTANTIATE_TEST_CASE_P(Arithm, CartToPolar, Combine(
+						Values(CV_32FC1, CV_32FC4),
+						Values(false))); // the bool parameter is reserved; only false is supplied
+
+INSTANTIATE_TEST_CASE_P(Arithm, PolarToCart, Combine(
+						Values(CV_32FC1, CV_32FC4),
+						Values(false))); // the bool parameter is reserved; only false is supplied
+
+INSTANTIATE_TEST_CASE_P(Arithm, Magnitude, Combine(
+						Values(CV_32FC1, CV_32FC4),
+						Values(false))); // the bool parameter is reserved; only false is supplied
+
+INSTANTIATE_TEST_CASE_P(Arithm, Transpose, Combine(
+						Values(CV_8UC1, CV_8UC4, CV_32FC1),
+						Values(false))); // the bool parameter is reserved; only false is supplied
+
+INSTANTIATE_TEST_CASE_P(Arithm, Flip, Combine(
+						Values(CV_8UC1, CV_8UC4, CV_32SC1, CV_32FC1, CV_32FC4),
+						Values(false))); // the bool parameter is reserved; only false is supplied
+
+INSTANTIATE_TEST_CASE_P(Arithm, MinMax, Combine(
+						Values(CV_8UC1, CV_32FC1),
+						Values(false))); // the bool parameter is reserved; only false is supplied
+
+INSTANTIATE_TEST_CASE_P(Arithm, MinMaxLoc, Combine(
+						Values(CV_8UC1, CV_32FC1),
+						Values(false))); // the bool parameter is reserved; only false is supplied
+
+INSTANTIATE_TEST_CASE_P(Arithm, Sum, Combine(
+						Values(CV_8U, CV_32S, CV_32F), // depth constants rather than CV_xxC1 type constants
+						Values(false))); // the bool parameter is reserved; only false is supplied
+
+INSTANTIATE_TEST_CASE_P(Arithm, CountNonZero, Combine(
+						Values(CV_8U, CV_32S, CV_32F), // depth constants rather than CV_xxC1 type constants
+						Values(false))); // the bool parameter is reserved; only false is supplied
+
+
+INSTANTIATE_TEST_CASE_P(Arithm, Phase, Combine(Values(CV_32FC1, CV_32FC4), Values(false)));
+// the bool parameter is reserved; only false is supplied
+
+
+INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_and, Combine(
+						Values(CV_8UC1, CV_32SC1, CV_32SC4, CV_32FC1, CV_32FC4), Values(false)));
+// the bool parameter is reserved; only false is supplied
+
+INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_or, Combine(
+						Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(false)));
+// the bool parameter is reserved; only false is supplied
+
+INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_xor, Combine(
+						Values(CV_8UC1, CV_32SC1, CV_32FC1, CV_32FC4), Values(false)));
+// the bool parameter is reserved; only false is supplied
+
+INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_not, Combine(
+						Values(CV_8UC1, CV_32SC1, CV_32FC1, CV_32FC4), Values(false)));
+// the bool parameter is reserved; only false is supplied
+
+INSTANTIATE_TEST_CASE_P(Arithm, Compare, Combine(Values(CV_8UC1,CV_16UC1,CV_16SC1,CV_32SC1,CV_32FC1,CV_64FC1), Values(false)));
+// NOTE: CompareTestBase::SetUp() sets type = CV_8UC1 without reading the parameter, so all six instances use CV_8UC1 matrices.
+
+INSTANTIATE_TEST_CASE_P(Arithm, Pow, Combine(Values(CV_32FC1, CV_32FC4), Values(false)));
+// the bool parameter is reserved; only false is supplied
+
+// NOTE(review): no instantiation of MagnitudeSqr appears in this list; confirm whether that is intended.
+
+INSTANTIATE_TEST_CASE_P(Arithm, AddWeighted, Combine(
+						Values(CV_8UC1, CV_32SC1, CV_32FC1),
+						Values(false))); // the bool parameter is reserved; only false is supplied
+
+
+
+
+#endif // HAVE_OPENCL
diff --git a/modules/ocl/perf/test_filters.cpp b/modules/ocl/perf/test_filters.cpp
new file mode 100644
index 000000000..ac9a86573
--- /dev/null
+++ b/modules/ocl/perf/test_filters.cpp
@@ -0,0 +1,1096 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Niko Li, newlife20080214@gmail.com
+//    Jia Haipeng, jiahaipeng95@gmail.com
+//    Zero Lin, Zero.Lin@amd.com
+//    Zhang Ying, zhangying913@gmail.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+#ifdef HAVE_OPENCL
+
+using namespace cvtest;
+using namespace testing;
+using namespace std;
+//using namespace cv::ocl;
+
+PARAM_TEST_CASE(FilterTestBase, MatType, bool)
+{ // Common fixture state: random matrices, a binary mask and randomly placed regions of interest.
+	int type;        // matrix type (parameter 0); the bool parameter is not read by this fixture
+	cv::Scalar val;  // random four-component scalar drawn in SetUp()
+
+	// full-size host matrices
+	cv::Mat mat1; 
+	cv::Mat mat2;
+	cv::Mat mask;
+	cv::Mat dst;
+	cv::Mat dst1; // second destination, for operations with two outputs
+
+	// extent and origins of the regions chosen by random_roi()
+	int roicols;
+	int roirows;
+	int src1x;
+	int src1y;
+	int src2x;
+	int src2y;
+	int dstx;
+	int dsty;
+	int maskx;
+	int masky;
+
+	// host-side views of those regions
+	cv::Mat mat1_roi;
+	cv::Mat mat2_roi;
+	cv::Mat mask_roi;
+	cv::Mat dst_roi;
+	cv::Mat dst1_roi; // second destination view
+	std::vector<cv::ocl::Info> oclinfo; // NOTE(review): neither filled nor read inside this fixture
+	// OpenCL matrices holding the whole destinations
+	cv::ocl::oclMat gdst_whole;
+	cv::ocl::oclMat gdst1_whole; // second destination
+
+	// OpenCL matrices restricted to the regions
+	cv::ocl::oclMat gmat1;
+	cv::ocl::oclMat gmat2;
+	cv::ocl::oclMat gdst;
+	cv::ocl::oclMat gdst1;   // second destination
+	cv::ocl::oclMat gmask;
+
+	virtual void SetUp()
+	{
+		type = GET_PARAM(0);
+		// all matrices are MWIDTH x MHEIGHT and filled from the shared test RNG
+		cv::RNG& rng = TS::ptr()->get_rng();
+		cv::Size size(MWIDTH, MHEIGHT);
+
+		mat1 = randomMat(rng, size, type, 5, 16, false);
+		mat2 = randomMat(rng, size, type, 5, 16, false);
+		dst  = randomMat(rng, size, type, 5, 16, false);
+		dst1  = randomMat(rng, size, type, 5, 16, false);
+		mask = randomMat(rng, size, CV_8UC1, 0, 2,  false);
+		// binarize the mask (entries above 0.5 become 255, the rest 0); the last argument is the
+		cv::threshold(mask, mask, 0.5, 255., cv::THRESH_BINARY); // thresholding type, not a matrix type
+
+		val = cv::Scalar(rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0));
+	}
+
+	void random_roi()
+	{
+		cv::RNG& rng = TS::ptr()->get_rng();
+
+		// draw the ROI size first, then one origin per matrix that keeps the ROI inside it
+		roicols = rng.uniform(1, mat1.cols);
+		roirows = rng.uniform(1, mat1.rows);
+		src1x   = rng.uniform(0, mat1.cols - roicols);
+		src1y   = rng.uniform(0, mat1.rows - roirows);
+		src2x   = rng.uniform(0, mat2.cols - roicols);
+		src2y   = rng.uniform(0, mat2.rows - roirows);
+		dstx    = rng.uniform(0, dst.cols  - roicols);
+		dsty    = rng.uniform(0, dst.rows  - roirows);
+		maskx   = rng.uniform(0, mask.cols - roicols);
+		masky   = rng.uniform(0, mask.rows - roirows);
+
+		mat1_roi = mat1(Rect(src1x,src1y,roicols,roirows));
+		mat2_roi = mat2(Rect(src2x,src2y,roicols,roirows));
+		mask_roi = mask(Rect(maskx,masky,roicols,roirows));
+		dst_roi  = dst(Rect(dstx,dsty,roicols,roirows));
+		dst1_roi = dst1(Rect(dstx,dsty,roicols,roirows));
+		// destinations: assign the whole matrix, then take the ROI from the OpenCL matrix
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+
+		gdst1_whole = dst1;
+		gdst1 = gdst1_whole(Rect(dstx,dsty,roicols,roirows));
+		// sources and mask: assigned from the ROI-sized host views
+		gmat1 = mat1_roi;
+		gmat2 = mat2_roi;
+		gmask = mask_roi;
+	}
+
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// blur
+
+PARAM_TEST_CASE(Blur, MatType, cv::Size, int)
+{ // fixture for the blur benchmark: random source/destination plus the region picked by Has_roi()
+	int type;        // matrix type (parameter 0)
+	cv::Size ksize;  // blur kernel size (parameter 1)
+	int bordertype;  // border mode (parameter 2)
+
+	// full-size host matrices
+	cv::Mat mat1;
+	cv::Mat dst;
+
+	// extent and origin of the region selected by Has_roi()
+	int roicols;
+	int roirows;
+	int src1x;
+	int src1y;
+	int dstx;
+	int dsty;
+
+	// host-side views of that region
+	cv::Mat mat1_roi;
+	cv::Mat dst_roi;
+	std::vector<cv::ocl::Info> oclinfo; // filled by getDevice() in SetUp()
+	// OpenCL matrix holding the whole destination (assigned by the test body)
+	cv::ocl::oclMat gdst_whole;
+
+	// OpenCL matrices the filter is run on
+	cv::ocl::oclMat gmat1;
+	cv::ocl::oclMat gdst;
+
+	virtual void SetUp()
+	{
+		// unpack the test parameters
+		type = GET_PARAM(0);
+		ksize = GET_PARAM(1);
+		bordertype = GET_PARAM(2);
+		// random MWIDTH x MHEIGHT source and destination of the requested type
+		cv::RNG& generator = TS::ptr()->get_rng();
+		cv::Size matSize = cv::Size(MWIDTH, MHEIGHT);
+		mat1 = randomMat(generator, matSize, type, 5, 16, false);
+		dst = randomMat(generator, matSize, type, 5, 16, false);
+
+		// an OpenCL device is required; call setDevice(oclinfo[i]) here to use a non-default one
+		int devnums = getDevice(oclinfo);
+		CV_Assert(devnums > 0);
+		cv::ocl::setBinpath(CLBINPATH);
+	}
+
+
+	void Has_roi(int b)
+	{
+		// Chooses the region the test body works on:
+		// b != 0 leaves out the first row and column, b == 0 takes the whole matrix.
+		const int margin = b ? 1 : 0;
+
+		roicols = mat1.cols - margin;
+		roirows = mat1.rows - margin;
+
+		// source and destination regions start at the same offset
+		src1x = margin;
+		src1y = margin;
+		dstx = margin;
+		dsty = margin;
+
+		// Rect takes (x, y, width, height)
+		const Rect srcRegion(src1x, src1y, roicols, roirows);
+		const Rect dstRegion(dstx, dsty, roicols, roirows);
+
+		// only the host-side (cv::Mat) views are created here
+		mat1_roi = mat1(srcRegion);
+		dst_roi = dst(dstRegion);
+	}
+
+};
+
+TEST_P(Blur, Mat)
+{
+#ifndef PRINT_KERNEL_RUN_TIME
+	// Times cv::blur against cv::ocl::blur and prints the average duration of each.
+	double cpuTotal = 0, gpuTotal = 0, kernelTotal = 0;   // summed ticks: CPU call, GPU path with transfers, OpenCL call alone
+	double cpuTicks = 0, gpuTicks = 0, kernelTicks = 0;   // ticks of the current iteration
+
+	// pass 0 works on the whole matrices, pass 1 on the region chosen by Has_roi(1)
+	const int passCount = 2;
+	for (int pass = 0; pass < passCount; ++pass) {
+		cpuTotal = 0;
+		gpuTotal = 0;
+		kernelTotal = 0;
+		for (int iter = 0; iter < LOOP_TIMES + 1; ++iter)
+		{
+			Has_roi(pass);
+
+			cpuTicks = static_cast<double>(cvGetTickCount()); // CPU reference begins
+			cv::blur(mat1_roi, dst_roi, ksize, Point(-1, -1), bordertype);
+			cpuTicks = static_cast<double>(cvGetTickCount()) - cpuTicks; // CPU reference ends
+
+			gpuTicks = static_cast<double>(cvGetTickCount()); // GPU path begins (host-to-device assignments included)
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+			gmat1 = mat1_roi;
+
+			kernelTicks = static_cast<double>(cvGetTickCount()); // OpenCL call begins
+			cv::ocl::blur(gmat1, gdst, ksize, Point(-1, -1), bordertype);
+			kernelTicks = static_cast<double>(cvGetTickCount()) - kernelTicks; // OpenCL call ends
+			cv::Mat downloaded;
+			gdst_whole.download(downloaded); // copy the result back to the host
+			gpuTicks = static_cast<double>(cvGetTickCount()) - gpuTicks; // GPU path ends
+
+			// iteration 0 is timed but kept out of the totals (presumably a warm-up run)
+			if (iter != 0) {
+				cpuTotal += cpuTicks;
+				gpuTotal += gpuTicks;
+				kernelTotal += kernelTicks;
+			}
+
+		}
+		cout << (pass == 0 ? "no roi\n" : "with roi\n");
+		cout << "average cpu runtime is  " << cpuTotal / (static_cast<double>(cvGetTickFrequency()) * LOOP_TIMES * 1000.) << "ms" << endl;
+		cout << "average gpu runtime is  " << gpuTotal / (static_cast<double>(cvGetTickFrequency()) * LOOP_TIMES * 1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << kernelTotal / (static_cast<double>(cvGetTickFrequency()) * LOOP_TIMES * 1000.) << "ms" << endl;
+	}
+#else
+	// PRINT_KERNEL_RUN_TIME build: run each variant once, without host-side timing
+	for (int pass = 0; pass < 2; ++pass) {
+		Has_roi(pass);
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+		gmat1 = mat1_roi;
+		cout << (pass == 0 ? "no roi:" : "\nwith roi:");
+		cv::ocl::blur(gmat1, gdst, ksize, Point(-1, -1), bordertype);
+	}
+#endif
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//Laplacian 
+
+PARAM_TEST_CASE(LaplacianTestBase, MatType, int)
+{ // fixture for the Laplacian benchmark: random source/destination plus the region picked by Has_roi()
+	int type;   // matrix type (parameter 0)
+	int ksize;  // aperture size handed to Laplacian (parameter 1)
+
+	// full-size host matrices
+	cv::Mat mat;
+	cv::Mat dst;
+
+	// extent and origin of the region selected by Has_roi()
+	int roicols;
+	int roirows;
+	int srcx;
+	int srcy;
+	int dstx;
+	int dsty;
+
+	// host-side views of that region
+	cv::Mat mat_roi;
+	cv::Mat dst_roi;
+	std::vector<cv::ocl::Info> oclinfo; // filled by getDevice() in SetUp()
+	// OpenCL matrix holding the whole destination (assigned by the test body)
+	cv::ocl::oclMat gdst_whole;
+
+	// OpenCL matrices the filter is run on
+	cv::ocl::oclMat gmat;
+	cv::ocl::oclMat gdst;
+
+	virtual void SetUp()
+	{
+		// unpack the test parameters
+		type = GET_PARAM(0);
+		ksize = GET_PARAM(1);
+		// random 2560x2560 source and destination of the requested type
+		cv::RNG& generator = TS::ptr()->get_rng();
+		cv::Size matSize(2560, 2560);
+		mat = randomMat(generator, matSize, type, 5, 16, false);
+		dst = randomMat(generator, matSize, type, 5, 16, false);
+
+		// an OpenCL device is required; call setDevice(oclinfo[i]) here to use a non-default one
+		int devnums = getDevice(oclinfo);
+		CV_Assert(devnums > 0);
+		cv::ocl::setBinpath(CLBINPATH);
+	}
+
+	void Has_roi(int b)
+	{
+		// Chooses the region the test body works on:
+		// b != 0 leaves out the first row and column, b == 0 takes the whole matrix.
+		const int margin = b ? 1 : 0;
+
+		roicols = mat.cols - margin;
+		roirows = mat.rows - margin;
+
+		// source and destination regions start at the same offset
+		srcx = margin;
+		srcy = margin;
+		dstx = margin;
+		dsty = margin;
+
+		// Rect takes (x, y, width, height)
+		const Rect srcRegion(srcx, srcy, roicols, roirows);
+		const Rect dstRegion(dstx, dsty, roicols, roirows);
+
+		// only the host-side (cv::Mat) views are created here
+		mat_roi = mat(srcRegion);
+		dst_roi = dst(dstRegion);
+	}
+
+};
+
+struct Laplacian : LaplacianTestBase {};
+
+TEST_P(Laplacian, Accuracy)
+{
+
+#ifndef PRINT_KERNEL_RUN_TIME
+	// Times cv::Laplacian against cv::ocl::Laplacian and prints the average duration of each.
+	double cpuTotal = 0, gpuTotal = 0, kernelTotal = 0;   // summed ticks: CPU call, GPU path with transfers, OpenCL call alone
+	double cpuTicks = 0, gpuTicks = 0, kernelTicks = 0;   // ticks of the current iteration
+
+	// pass 0 works on the whole matrices, pass 1 on the region chosen by Has_roi(1)
+	const int passCount = 2;
+	for (int pass = 0; pass < passCount; ++pass) {
+		cpuTotal = 0;
+		gpuTotal = 0;
+		kernelTotal = 0;
+		for (int iter = 0; iter < LOOP_TIMES + 1; ++iter)
+		{
+			Has_roi(pass);
+
+			cpuTicks = static_cast<double>(cvGetTickCount()); // CPU reference begins
+			cv::Laplacian(mat_roi, dst_roi, -1, ksize, 1);
+			cpuTicks = static_cast<double>(cvGetTickCount()) - cpuTicks; // CPU reference ends
+
+			gpuTicks = static_cast<double>(cvGetTickCount()); // GPU path begins (host-to-device assignments included)
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+			gmat = mat_roi;
+
+			kernelTicks = static_cast<double>(cvGetTickCount()); // OpenCL call begins
+			cv::ocl::Laplacian(gmat, gdst, -1, ksize, 1);
+			kernelTicks = static_cast<double>(cvGetTickCount()) - kernelTicks; // OpenCL call ends
+			cv::Mat downloaded;
+			gdst_whole.download(downloaded); // copy the result back to the host
+			gpuTicks = static_cast<double>(cvGetTickCount()) - gpuTicks; // GPU path ends
+
+			// iteration 0 is timed but kept out of the totals (presumably a warm-up run)
+			if (iter != 0) {
+				cpuTotal += cpuTicks;
+				gpuTotal += gpuTicks;
+				kernelTotal += kernelTicks;
+			}
+
+		}
+		cout << (pass == 0 ? "no roi\n" : "with roi\n");
+		cout << "average cpu runtime is  " << cpuTotal / (static_cast<double>(cvGetTickFrequency()) * LOOP_TIMES * 1000.) << "ms" << endl;
+		cout << "average gpu runtime is  " << gpuTotal / (static_cast<double>(cvGetTickFrequency()) * LOOP_TIMES * 1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << kernelTotal / (static_cast<double>(cvGetTickFrequency()) * LOOP_TIMES * 1000.) << "ms" << endl;
+	}
+#else
+	// PRINT_KERNEL_RUN_TIME build: run each variant once, without host-side timing;
+	// only a label is printed here before the OpenCL call
+	for (int pass = 0; pass < 2; ++pass) {
+		Has_roi(pass);
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+		gmat = mat_roi;
+
+		cout << (pass == 0 ? "no roi:" : "\nwith roi:");
+		cv::ocl::Laplacian(gmat, gdst, -1, ksize, 1);
+	}
+#endif
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// erode & dilate 
+
+PARAM_TEST_CASE(ErodeDilateBase, MatType, bool)
+{ // shared fixture of the erode and dilate benchmarks
+	int type; // matrix type (parameter 0); the bool parameter is not read
+	// an iteration count is not part of the parameters at the moment
+
+	// structuring element used by both erode and dilate
+	cv::Mat kernel;
+
+	// full-size host matrices
+	cv::Mat mat1;
+	cv::Mat dst;
+
+	// extent and origin of the region selected by Has_roi()
+	int roicols;
+	int roirows;
+	int src1x;
+	int src1y;
+	int dstx;
+	int dsty;
+
+	// host-side views of that region
+	cv::Mat mat1_roi;
+	cv::Mat dst_roi;
+	std::vector<cv::ocl::Info> oclinfo; // filled by getDevice() in SetUp()
+	// OpenCL matrix holding the whole destination (assigned by the test body)
+	cv::ocl::oclMat gdst_whole;
+
+	// OpenCL matrices the filter is run on
+	cv::ocl::oclMat gmat1;
+	cv::ocl::oclMat gdst;
+
+	virtual void SetUp()
+	{
+		// only the matrix type is taken from the parameters
+		type = GET_PARAM(0);
+
+		// random 2560x2560 source and destination of the requested type
+		cv::RNG& generator = TS::ptr()->get_rng();
+		cv::Size matSize(2560, 2560);
+		mat1 = randomMat(generator, matSize, type, 5, 16, false);
+		dst = randomMat(generator, matSize, type, 5, 16, false);
+
+		// random 3x3 8-bit structuring element, drawn after the matrices
+		kernel = randomMat(generator, Size(3, 3), CV_8UC1, 0, 3, false);
+		// an OpenCL device is required; call setDevice(oclinfo[i]) here to use a non-default one
+		int devnums = getDevice(oclinfo);
+		CV_Assert(devnums > 0);
+		cv::ocl::setBinpath(CLBINPATH);
+	}
+
+	void Has_roi(int b)
+	{
+		// Chooses the region the test body works on:
+		// b != 0 leaves out the first row and column, b == 0 takes the whole matrix.
+		const int margin = b ? 1 : 0;
+
+		roicols = mat1.cols - margin;
+		roirows = mat1.rows - margin;
+
+		// source and destination regions start at the same offset
+		src1x = margin;
+		src1y = margin;
+		dstx = margin;
+		dsty = margin;
+
+		// Rect takes (x, y, width, height)
+		const Rect srcRegion(src1x, src1y, roicols, roirows);
+		const Rect dstRegion(dstx, dsty, roicols, roirows);
+
+		// only the host-side (cv::Mat) views are created here
+		mat1_roi = mat1(srcRegion);
+		dst_roi = dst(dstRegion);
+	}
+
+};
+
+// erode 
+
+struct Erode : ErodeDilateBase{};
+
+TEST_P(Erode, Mat)
+{
+#ifndef PRINT_KERNEL_RUN_TIME
+	// Times cv::erode against cv::ocl::erode and prints the average duration of each.
+	double cpuTotal = 0, gpuTotal = 0, kernelTotal = 0;   // summed ticks: CPU call, GPU path with transfers, OpenCL call alone
+	double cpuTicks = 0, gpuTicks = 0, kernelTicks = 0;   // ticks of the current iteration
+
+	// pass 0 works on the whole matrices, pass 1 on the region chosen by Has_roi(1)
+	const int passCount = 2;
+	for (int pass = 0; pass < passCount; ++pass) {
+		cpuTotal = 0;
+		gpuTotal = 0;
+		kernelTotal = 0;
+		for (int iter = 0; iter < LOOP_TIMES + 1; ++iter)
+		{
+			Has_roi(pass);
+
+			cpuTicks = static_cast<double>(cvGetTickCount()); // CPU reference begins
+			cv::erode(mat1_roi, dst_roi, kernel);
+			cpuTicks = static_cast<double>(cvGetTickCount()) - cpuTicks; // CPU reference ends
+
+			gpuTicks = static_cast<double>(cvGetTickCount()); // GPU path begins (host-to-device assignments included)
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+
+			gmat1 = mat1_roi;
+
+			kernelTicks = static_cast<double>(cvGetTickCount()); // OpenCL call begins
+			cv::ocl::erode(gmat1, gdst, kernel);
+			kernelTicks = static_cast<double>(cvGetTickCount()) - kernelTicks; // OpenCL call ends
+			cv::Mat downloaded;
+			gdst_whole.download(downloaded); // copy the result back to the host
+			gpuTicks = static_cast<double>(cvGetTickCount()) - gpuTicks; // GPU path ends
+
+			// iteration 0 is timed but kept out of the totals (presumably a warm-up run)
+			if (iter != 0) {
+				cpuTotal += cpuTicks;
+				gpuTotal += gpuTicks;
+				kernelTotal += kernelTicks;
+			}
+
+		}
+		cout << (pass == 0 ? "no roi\n" : "with roi\n");
+		cout << "average cpu runtime is  " << cpuTotal / (static_cast<double>(cvGetTickFrequency()) * LOOP_TIMES * 1000.) << "ms" << endl;
+		cout << "average gpu runtime is  " << gpuTotal / (static_cast<double>(cvGetTickFrequency()) * LOOP_TIMES * 1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << kernelTotal / (static_cast<double>(cvGetTickFrequency()) * LOOP_TIMES * 1000.) << "ms" << endl;
+	}
+#else
+	// PRINT_KERNEL_RUN_TIME build: run each variant once, without host-side timing
+	for (int pass = 0; pass < 2; ++pass) {
+		Has_roi(pass);
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+		gmat1 = mat1_roi;
+
+		cout << (pass == 0 ? "no roi:" : "\nwith roi:");
+		cv::ocl::erode(gmat1, gdst, kernel);
+	}
+#endif
+
+}
+
+// dilate
+
+struct Dilate : ErodeDilateBase{};
+
+TEST_P(Dilate, Mat)
+{
+
+#ifndef PRINT_KERNEL_RUN_TIME
+	// Times cv::dilate against cv::ocl::dilate and prints the average duration of each.
+	double cpuTotal = 0, gpuTotal = 0, kernelTotal = 0;   // summed ticks: CPU call, GPU path with transfers, OpenCL call alone
+	double cpuTicks = 0, gpuTicks = 0, kernelTicks = 0;   // ticks of the current iteration
+
+	// pass 0 works on the whole matrices, pass 1 on the region chosen by Has_roi(1)
+	const int passCount = 2;
+	for (int pass = 0; pass < passCount; ++pass) {
+		cpuTotal = 0;
+		gpuTotal = 0;
+		kernelTotal = 0;
+		for (int iter = 0; iter < LOOP_TIMES + 1; ++iter)
+		{
+			Has_roi(pass);
+			cpuTicks = static_cast<double>(cvGetTickCount()); // CPU reference begins
+			cv::dilate(mat1_roi, dst_roi, kernel);
+			cpuTicks = static_cast<double>(cvGetTickCount()) - cpuTicks; // CPU reference ends
+
+			gpuTicks = static_cast<double>(cvGetTickCount()); // GPU path begins (host-to-device assignments included)
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+			gmat1 = mat1_roi;
+
+			kernelTicks = static_cast<double>(cvGetTickCount()); // OpenCL call begins
+			cv::ocl::dilate(gmat1, gdst, kernel);
+			kernelTicks = static_cast<double>(cvGetTickCount()) - kernelTicks; // OpenCL call ends
+			cv::Mat downloaded;
+			gdst_whole.download(downloaded); // copy the result back to the host
+			gpuTicks = static_cast<double>(cvGetTickCount()) - gpuTicks; // GPU path ends
+
+			// iteration 0 is timed but kept out of the totals (presumably a warm-up run)
+			if (iter != 0) {
+				cpuTotal += cpuTicks;
+				gpuTotal += gpuTicks;
+				kernelTotal += kernelTicks;
+			}
+
+		}
+		cout << (pass == 0 ? "no roi\n" : "with roi\n");
+		cout << "average cpu runtime is  " << cpuTotal / (static_cast<double>(cvGetTickFrequency()) * LOOP_TIMES * 1000.) << "ms" << endl;
+		cout << "average gpu runtime is  " << gpuTotal / (static_cast<double>(cvGetTickFrequency()) * LOOP_TIMES * 1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << kernelTotal / (static_cast<double>(cvGetTickFrequency()) * LOOP_TIMES * 1000.) << "ms" << endl;
+	}
+#else
+	// PRINT_KERNEL_RUN_TIME build: run each variant once, without host-side timing
+	for (int pass = 0; pass < 2; ++pass) {
+		Has_roi(pass);
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+		gmat1 = mat1_roi;
+		cout << (pass == 0 ? "no roi:" : "\nwith roi:");
+		cv::ocl::dilate(gmat1, gdst, kernel);
+	}
+#endif
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// Sobel 
+
+PARAM_TEST_CASE(Sobel, MatType, int, int, int, int)
+{ // fixture for the Sobel benchmark: random source/destination plus the region picked by Has_roi()
+	int type; // matrix type (parameter 0)
+	int dx, dy, ksize, bordertype; // derivative orders, aperture size, border mode (parameters 1-4)
+
+	// full-size host matrices
+	cv::Mat mat1;
+	cv::Mat dst;
+
+	// extent and origin of the region selected by Has_roi()
+	int roicols;
+	int roirows;
+	int src1x;
+	int src1y;
+	int dstx;
+	int dsty;
+
+	// host-side views of that region
+	cv::Mat mat1_roi;
+	cv::Mat dst_roi;
+	std::vector<cv::ocl::Info> oclinfo; // filled by getDevice() in SetUp()
+	// OpenCL matrix holding the whole destination (assigned by the test body)
+	cv::ocl::oclMat gdst_whole;
+
+	// OpenCL matrices the filter is run on
+	cv::ocl::oclMat gmat1;
+	cv::ocl::oclMat gdst;
+
+	virtual void SetUp()
+	{
+		type = GET_PARAM(0);
+		dx = GET_PARAM(1);
+		dy = GET_PARAM(2);
+		ksize = GET_PARAM(3);
+		bordertype = GET_PARAM(4);
+		// NOTE(review): the derivative orders read above are replaced by fixed values, so every dx/dy combination runs the same case
+		dx = 2;
+		dy = 0;
+		// random 2560x2560 source and destination of the requested type
+		cv::RNG& generator = TS::ptr()->get_rng();
+		cv::Size matSize(2560, 2560);
+		mat1 = randomMat(generator, matSize, type, 5, 16, false);
+		dst = randomMat(generator, matSize, type, 5, 16, false);
+		// an OpenCL device is required; call setDevice(oclinfo[i]) here to use a non-default one
+		int devnums = getDevice(oclinfo);
+		CV_Assert(devnums > 0);
+		cv::ocl::setBinpath(CLBINPATH);
+	}
+
+	void Has_roi(int b)
+	{
+		// Chooses the region the test body works on:
+		// b != 0 leaves out the first row and column, b == 0 takes the whole matrix.
+		const int margin = b ? 1 : 0;
+
+		roicols = mat1.cols - margin;
+		roirows = mat1.rows - margin;
+
+		// source and destination regions start at the same offset
+		src1x = margin;
+		src1y = margin;
+		dstx = margin;
+		dsty = margin;
+
+		// Rect takes (x, y, width, height)
+		const Rect srcRegion(src1x, src1y, roicols, roirows);
+		const Rect dstRegion(dstx, dsty, roicols, roirows);
+
+		// only the host-side (cv::Mat) views are created here
+		mat1_roi = mat1(srcRegion);
+		dst_roi = dst(dstRegion);
+	}
+
+};
+
+TEST_P(Sobel, Mat)
+{
+#ifndef PRINT_KERNEL_RUN_TIME
+	// Times cv::Sobel against cv::ocl::Sobel and prints the average duration of each.
+	double cpuTotal = 0, gpuTotal = 0, kernelTotal = 0;   // summed ticks: CPU call, GPU path with transfers, OpenCL call alone
+	double cpuTicks = 0, gpuTicks = 0, kernelTicks = 0;   // ticks of the current iteration
+
+	// pass 0 works on the whole matrices, pass 1 on the region chosen by Has_roi(1)
+	const int passCount = 2;
+	for (int pass = 0; pass < passCount; ++pass) {
+		cpuTotal = 0;
+		gpuTotal = 0;
+		kernelTotal = 0;
+		for (int iter = 0; iter < LOOP_TIMES + 1; ++iter)
+		{
+			Has_roi(pass);
+
+			cpuTicks = static_cast<double>(cvGetTickCount()); // CPU reference begins
+			cv::Sobel(mat1_roi, dst_roi, -1, dx, dy, ksize, /*scale*/ 0.00001, /*delta*/ 0, bordertype);
+			cpuTicks = static_cast<double>(cvGetTickCount()) - cpuTicks; // CPU reference ends
+
+			gpuTicks = static_cast<double>(cvGetTickCount()); // GPU path begins (host-to-device assignments included)
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+			gmat1 = mat1_roi;
+
+			kernelTicks = static_cast<double>(cvGetTickCount()); // OpenCL call begins
+			cv::ocl::Sobel(gmat1, gdst, -1, dx, dy, ksize, /*scale*/ 0.00001, /*delta*/ 0, bordertype);
+			kernelTicks = static_cast<double>(cvGetTickCount()) - kernelTicks; // OpenCL call ends
+			cv::Mat downloaded;
+			gdst_whole.download(downloaded); // copy the result back to the host
+			gpuTicks = static_cast<double>(cvGetTickCount()) - gpuTicks; // GPU path ends
+
+			// iteration 0 is timed but kept out of the totals (presumably a warm-up run)
+			if (iter != 0) {
+				cpuTotal += cpuTicks;
+				gpuTotal += gpuTicks;
+				kernelTotal += kernelTicks;
+			}
+
+		}
+		cout << (pass == 0 ? "no roi\n" : "with roi\n");
+		cout << "average cpu runtime is  " << cpuTotal / (static_cast<double>(cvGetTickFrequency()) * LOOP_TIMES * 1000.) << "ms" << endl;
+		cout << "average gpu runtime is  " << gpuTotal / (static_cast<double>(cvGetTickFrequency()) * LOOP_TIMES * 1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << kernelTotal / (static_cast<double>(cvGetTickFrequency()) * LOOP_TIMES * 1000.) << "ms" << endl;
+	}
+#else
+	// PRINT_KERNEL_RUN_TIME build: run each variant once, without host-side timing
+	for (int pass = 0; pass < 2; ++pass) {
+		Has_roi(pass);
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+		gmat1 = mat1_roi;
+		cout << (pass == 0 ? "no roi:" : "\nwith roi:");
+		cv::ocl::Sobel(gmat1, gdst, -1, dx, dy, ksize, /*scale*/ 0.00001, /*delta*/ 0, bordertype);
+	}
+#endif
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// Scharr 
+
+PARAM_TEST_CASE(Scharr, MatType, int, int, int)
+{ // fixture for the Scharr benchmark: random source/destination plus the region picked by Has_roi()
+	int type; // matrix type (parameter 0)
+	int dx, dy, bordertype; // derivative orders and border mode (parameters 1-3)
+
+	// full-size host matrices
+	cv::Mat mat1;
+	cv::Mat dst;
+
+	// extent and origin of the region selected by Has_roi()
+	int roicols;
+	int roirows;
+	int src1x;
+	int src1y;
+	int dstx;
+	int dsty;
+
+	// host-side views of that region
+	cv::Mat mat1_roi;
+	cv::Mat dst_roi;
+	std::vector<cv::ocl::Info> oclinfo; // filled by getDevice() in SetUp()
+	// OpenCL matrix holding the whole destination (assigned by the test body)
+	cv::ocl::oclMat gdst_whole;
+
+	// OpenCL matrices the filter is run on
+	cv::ocl::oclMat gmat1;
+	cv::ocl::oclMat gdst;
+
+	virtual void SetUp()
+	{
+		type = GET_PARAM(0);
+		dx = GET_PARAM(1);
+		dy = GET_PARAM(2);
+		bordertype = GET_PARAM(3);
+		// NOTE(review): the derivative orders read above are replaced by fixed values, so every dx/dy combination runs the same case
+		dx = 1;
+		dy = 0;
+		// random 2560x2560 source and destination of the requested type
+		cv::RNG& generator = TS::ptr()->get_rng();
+		cv::Size matSize(2560, 2560);
+		mat1 = randomMat(generator, matSize, type, 5, 16, false);
+		dst = randomMat(generator, matSize, type, 5, 16, false);
+		// an OpenCL device is required; call setDevice(oclinfo[i]) here to use a non-default one
+		int devnums = getDevice(oclinfo);
+		CV_Assert(devnums > 0);
+		cv::ocl::setBinpath(CLBINPATH);
+	}
+
+	void Has_roi(int b)
+	{
+		// Chooses the region the test body works on:
+		// b != 0 leaves out the first row and column, b == 0 takes the whole matrix.
+		const int margin = b ? 1 : 0;
+
+		roicols = mat1.cols - margin;
+		roirows = mat1.rows - margin;
+
+		// source and destination regions start at the same offset
+		src1x = margin;
+		src1y = margin;
+		dstx = margin;
+		dsty = margin;
+
+		// Rect takes (x, y, width, height)
+		const Rect srcRegion(src1x, src1y, roicols, roirows);
+		const Rect dstRegion(dstx, dsty, roicols, roirows);
+
+		// only the host-side (cv::Mat) views are created here
+		mat1_roi = mat1(srcRegion);
+		dst_roi = dst(dstRegion);
+	}
+};
+
+TEST_P(Scharr, Mat)
+{
+#ifndef PRINT_KERNEL_RUN_TIME
+	// Times cv::Scharr against cv::ocl::Scharr and prints the average duration of each.
+	double cpuTotal = 0, gpuTotal = 0, kernelTotal = 0;   // summed ticks: CPU call, GPU path with transfers, OpenCL call alone
+	double cpuTicks = 0, gpuTicks = 0, kernelTicks = 0;   // ticks of the current iteration
+
+	// pass 0 works on the whole matrices, pass 1 on the region chosen by Has_roi(1)
+	const int passCount = 2;
+	for (int pass = 0; pass < passCount; ++pass) {
+		cpuTotal = 0;
+		gpuTotal = 0;
+		kernelTotal = 0;
+		for (int iter = 0; iter < LOOP_TIMES + 1; ++iter)
+		{
+			Has_roi(pass);
+
+			cpuTicks = static_cast<double>(cvGetTickCount()); // CPU reference begins
+			cv::Scharr(mat1_roi, dst_roi, -1, dx, dy, /*scale*/ 1, /*delta*/ 0, bordertype);
+			cpuTicks = static_cast<double>(cvGetTickCount()) - cpuTicks; // CPU reference ends
+
+			gpuTicks = static_cast<double>(cvGetTickCount()); // GPU path begins (host-to-device assignments included)
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+			gmat1 = mat1_roi;
+
+			kernelTicks = static_cast<double>(cvGetTickCount()); // OpenCL call begins
+			cv::ocl::Scharr(gmat1, gdst, -1, dx, dy, /*scale*/ 1, /*delta*/ 0, bordertype);
+			kernelTicks = static_cast<double>(cvGetTickCount()) - kernelTicks; // OpenCL call ends
+			cv::Mat downloaded;
+			gdst_whole.download(downloaded); // copy the result back to the host
+			gpuTicks = static_cast<double>(cvGetTickCount()) - gpuTicks; // GPU path ends
+
+			// iteration 0 is timed but kept out of the totals (presumably a warm-up run)
+			if (iter != 0) {
+				cpuTotal += cpuTicks;
+				gpuTotal += gpuTicks;
+				kernelTotal += kernelTicks;
+			}
+
+		}
+		cout << (pass == 0 ? "no roi\n" : "with roi\n");
+		cout << "average cpu runtime is  " << cpuTotal / (static_cast<double>(cvGetTickFrequency()) * LOOP_TIMES * 1000.) << "ms" << endl;
+		cout << "average gpu runtime is  " << gpuTotal / (static_cast<double>(cvGetTickFrequency()) * LOOP_TIMES * 1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << kernelTotal / (static_cast<double>(cvGetTickFrequency()) * LOOP_TIMES * 1000.) << "ms" << endl;
+	}
+#else
+	// PRINT_KERNEL_RUN_TIME build: run each variant once, without host-side timing
+	for (int pass = 0; pass < 2; ++pass) {
+		Has_roi(pass);
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+		gmat1 = mat1_roi;
+
+		cout << (pass == 0 ? "no roi:" : "\nwith roi:");
+		cv::ocl::Scharr(gmat1, gdst, -1, dx, dy, /*scale*/ 1, /*delta*/ 0, bordertype);
+	}
+#endif
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// GaussianBlur
+
+PARAM_TEST_CASE(GaussianBlur, MatType, cv::Size, int)
+{ // fixture for the GaussianBlur benchmark: random source/destination plus the region picked by Has_roi()
+	int type;        // matrix type (parameter 0)
+	cv::Size ksize;  // Gaussian kernel size (parameter 1)
+	int bordertype;  // border mode (parameter 2)
+
+	double sigma1, sigma2; // the two sigma arguments of GaussianBlur, drawn at random in SetUp()
+
+	// full-size host matrices
+	cv::Mat mat1;
+	cv::Mat dst;
+
+	// extent and origin of the region selected by Has_roi()
+	int roicols;
+	int roirows;
+	int src1x;
+	int src1y;
+	int dstx;
+	int dsty;
+
+	// host-side views of that region
+	cv::Mat mat1_roi;
+	cv::Mat dst_roi;
+	std::vector<cv::ocl::Info> oclinfo; // filled by getDevice() in SetUp()
+	// OpenCL matrix holding the whole destination (assigned by the test body)
+	cv::ocl::oclMat gdst_whole;
+
+	// OpenCL matrices the filter is run on
+	cv::ocl::oclMat gmat1;
+	cv::ocl::oclMat gdst;
+
+	virtual void SetUp()
+	{
+		// unpack the test parameters
+		type = GET_PARAM(0);
+		ksize = GET_PARAM(1);
+		bordertype = GET_PARAM(2);
+
+		cv::RNG& generator = TS::ptr()->get_rng();
+		// the sigmas are drawn from the generator before the matrices are filled
+		sigma1 = generator.uniform(0.1, 1.0);
+		sigma2 = generator.uniform(0.1, 1.0);
+		// random 2560x2560 source and destination of the requested type
+		cv::Size matSize(2560, 2560);
+		mat1 = randomMat(generator, matSize, type, 5, 16, false);
+		dst = randomMat(generator, matSize, type, 5, 16, false);
+		// an OpenCL device is required; call setDevice(oclinfo[i]) here to use a non-default one
+		int devnums = getDevice(oclinfo);
+		CV_Assert(devnums > 0);
+		cv::ocl::setBinpath(CLBINPATH);
+	}
+
+	void Has_roi(int b)
+	{
+		// Chooses the region the test body works on:
+		// b != 0 leaves out the first row and column, b == 0 takes the whole matrix.
+		const int margin = b ? 1 : 0;
+
+		roicols = mat1.cols - margin;
+		roirows = mat1.rows - margin;
+
+		// source and destination regions start at the same offset
+		src1x = margin;
+		src1y = margin;
+		dstx = margin;
+		dsty = margin;
+
+		// Rect takes (x, y, width, height)
+		const Rect srcRegion(src1x, src1y, roicols, roirows);
+		const Rect dstRegion(dstx, dsty, roicols, roirows);
+
+		// only the host-side (cv::Mat) views are created here
+		mat1_roi = mat1(srcRegion);
+		dst_roi = dst(dstRegion);
+	}
+
+};
+
+TEST_P(GaussianBlur, Mat)
+{
+#ifndef PRINT_KERNEL_RUN_TIME
+	// Times cv::GaussianBlur against cv::ocl::GaussianBlur and prints the average duration of each.
+	double cpuTotal = 0, gpuTotal = 0, kernelTotal = 0;   // summed ticks: CPU call, GPU path with transfers, OpenCL call alone
+	double cpuTicks = 0, gpuTicks = 0, kernelTicks = 0;   // ticks of the current iteration
+
+	// pass 0 works on the whole matrices, pass 1 on the region chosen by Has_roi(1)
+	const int passCount = 2;
+	for (int pass = 0; pass < passCount; ++pass) {
+		cpuTotal = 0;
+		gpuTotal = 0;
+		kernelTotal = 0;
+		for (int iter = 0; iter < LOOP_TIMES + 1; ++iter)
+		{
+			Has_roi(pass);
+
+			cpuTicks = static_cast<double>(cvGetTickCount()); // CPU reference begins
+			cv::GaussianBlur(mat1_roi, dst_roi, ksize, sigma1, sigma2, bordertype);
+			cpuTicks = static_cast<double>(cvGetTickCount()) - cpuTicks; // CPU reference ends
+
+			gpuTicks = static_cast<double>(cvGetTickCount()); // GPU path begins (host-to-device assignments included)
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+			gmat1 = mat1_roi;
+
+			kernelTicks = static_cast<double>(cvGetTickCount()); // OpenCL call begins
+			cv::ocl::GaussianBlur(gmat1, gdst, ksize, sigma1, sigma2, bordertype);
+			kernelTicks = static_cast<double>(cvGetTickCount()) - kernelTicks; // OpenCL call ends
+			cv::Mat downloaded;
+			gdst_whole.download(downloaded); // copy the result back to the host
+			gpuTicks = static_cast<double>(cvGetTickCount()) - gpuTicks; // GPU path ends
+
+			// iteration 0 is timed but kept out of the totals (presumably a warm-up run)
+			if (iter != 0) {
+				cpuTotal += cpuTicks;
+				gpuTotal += gpuTicks;
+				kernelTotal += kernelTicks;
+			}
+
+
+		}
+		cout << (pass == 0 ? "no roi\n" : "with roi\n");
+		cout << "average cpu runtime is  " << cpuTotal / (static_cast<double>(cvGetTickFrequency()) * LOOP_TIMES * 1000.) << "ms" << endl;
+		cout << "average gpu runtime is  " << gpuTotal / (static_cast<double>(cvGetTickFrequency()) * LOOP_TIMES * 1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << kernelTotal / (static_cast<double>(cvGetTickFrequency()) * LOOP_TIMES * 1000.) << "ms" << endl;
+	}
+#else
+	// PRINT_KERNEL_RUN_TIME build: run each variant once, without host-side timing
+	for (int pass = 0; pass < 2; ++pass) {
+		Has_roi(pass);
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+		gmat1 = mat1_roi;
+		cout << (pass == 0 ? "no roi:" : "\nwith roi:");
+		cv::ocl::GaussianBlur(gmat1, gdst, ksize, sigma1, sigma2, bordertype);
+	}
+#endif
+
+}
+
+//************test**********
+
+INSTANTIATE_TEST_CASE_P(Filter, Blur, Combine(
+						Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
+						Values(cv::Size(3, 3)), // only the 3x3 kernel is enabled; 5x5 and 7x7 are left out
+						Values((MatType)cv::BORDER_CONSTANT, (MatType)cv::BORDER_REPLICATE, (MatType)cv::BORDER_REFLECT, (MatType)cv::BORDER_REFLECT_101)));
+
+INSTANTIATE_TEST_CASE_P(Filters, Laplacian,
+						Combine(Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
+								Values(1))); // only aperture size 1 is enabled; 3 is left out
+
+// Erode and Dilate share ErodeDilateBase; the bool value is a placeholder that the fixture does not read.
+INSTANTIATE_TEST_CASE_P(Filter, Erode,
+						Combine(Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(false)));
+
+INSTANTIATE_TEST_CASE_P(Filter, Dilate,
+						Combine(Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(false)));
+
+
+// Value sets below follow the fixture parameter order: type, dx, dy, [ksize,] border mode.
+INSTANTIATE_TEST_CASE_P(Filter, Sobel, Combine(
+						Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
+						Values(1, 2), Values(0, 1), Values(3, 5, 7),
+						Values((MatType)cv::BORDER_CONSTANT, (MatType)cv::BORDER_REPLICATE, (MatType)cv::BORDER_REFLECT, (MatType)cv::BORDER_REFLECT_101)));
+
+INSTANTIATE_TEST_CASE_P(Filter, Scharr, Combine(
+						Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(0, 1), Values(0, 1),
+						Values((MatType)cv::BORDER_CONSTANT, (MatType)cv::BORDER_REPLICATE, (MatType)cv::BORDER_REFLECT, (MatType)cv::BORDER_REFLECT_101)));
+
+INSTANTIATE_TEST_CASE_P(Filter, GaussianBlur,
+						Combine(Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
+								Values(cv::Size(3, 3), cv::Size(5, 5), cv::Size(7, 7)),
+								Values((MatType)cv::BORDER_CONSTANT, (MatType)cv::BORDER_REPLICATE, (MatType)cv::BORDER_REFLECT, (MatType)cv::BORDER_REFLECT_101)));
+
+
+#endif // HAVE_OPENCL
diff --git a/modules/ocl/perf/test_haar.cpp b/modules/ocl/perf/test_haar.cpp
new file mode 100644
index 000000000..8aabd67d6
--- /dev/null
+++ b/modules/ocl/perf/test_haar.cpp
@@ -0,0 +1,198 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Jia Haipeng, jiahaipeng95@gmail.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "opencv2/objdetect/objdetect.hpp"
+#include "precomp.hpp"
+
+#ifdef HAVE_OPENCL
+
+using namespace cvtest;
+using namespace testing;
+using namespace std;
+using namespace cv;
+
+// Unary functor for std::transform: maps a CvAvgComp to a copy of its rect member.
+struct getRect { Rect operator ()(const CvAvgComp& comp) const { const Rect& box = comp.rect; return box; } };
+// Fixture for the Haar perf test: loads one cascade file into both the OpenCL and the CPU classifier.
+PARAM_TEST_CASE(HaarTestBase, int, int)
+{
+	std::vector<cv::ocl::Info> oclinfo;  // filled by getDevice() in SetUp()
+    cv::ocl::OclCascadeClassifier cascade, nestedCascade;  // only `cascade` is loaded
+	cv::CascadeClassifier cpucascade, cpunestedCascade;  // only `cpucascade` is loaded
+
+    double scale;  // set to 1.1 in SetUp()
+    int index;     // never assigned by this fixture
+    // Loads the cascade into both classifiers, then enumerates OpenCL devices and sets the kernel binary path.
+    virtual void SetUp()
+    {
+        scale = 1.1;
+        // NOTE(review): both cascade paths are hard-coded for the author's machine / working directory.
+#if WIN32
+        string cascadeName="E:\\opencvbuffer\\trunk\\data\\haarcascades\\haarcascade_frontalface_alt.xml";
+#else
+        string cascadeName="../data/haarcascades/haarcascade_frontalface_alt.xml";
+#endif
+
+        if( (!cascade.load( cascadeName )) || (!cpucascade.load(cascadeName)))
+        {
+            cout << "ERROR: Could not load classifier cascade" << endl;
+            cout << "Usage: facedetect [--cascade=<cascade_path>]\n"
+                "   [--nested-cascade[=nested_cascade_path]]\n"
+                "   [--scale[=<image scale>\n"
+                "   [filename|camera_index]\n" << endl ;
+            // NOTE(review): returns without failing the test, and skips the device setup below.
+            return;
+        }
+	int devnums = getDevice(oclinfo);
+	CV_Assert(devnums>0);
+	//to use a non-default device, select it here
+	//setDevice(oclinfo[0]);
+	cv::ocl::setBinpath(CLBINPATH);  // same binary path as the other perf fixtures (was a hard-coded "E:\\")
+    }
+};
+
+////////////////////////////////faceDetect/////////////////////////////////////////////////
+
+struct Haar : HaarTestBase {};
+// Times cascade.oclHaarDetectObjects on one input image and saves an annotated copy; contains no assertions.
+TEST_P(Haar, FaceDetect) 
+{    
+    for(int index = 1;index < 2; index++)  // runs once (index == 1); this local hides the fixture's `index` member
+    {
+        Mat img;
+        char buff[256];  // file-name buffer reused for the input and the output path
+#if WIN32
+        sprintf(buff,"E:\\myDataBase\\%d.jpg",index);
+        img = imread( buff, 1 );
+#else 
+        sprintf(buff,"%d.jpg",index);
+        img = imread( buff, 1 );
+        std::cout << "Now test " << index << ".jpg" <<std::endl;
+#endif
+        if(img.empty())  // a missing image is reported and skipped rather than failing the test
+        { 
+            std::cout << "Couldn't read test" << index <<".jpg" << std::endl;
+            continue;
+        }
+
+        int i = 0;  // detection counter, used only to pick a colour
+        double t = 0;
+        vector<Rect> faces;
+
+        const static Scalar colors[] =  { CV_RGB(0,0,255),
+            CV_RGB(0,128,255),
+            CV_RGB(0,255,255),
+            CV_RGB(0,255,0),
+            CV_RGB(255,128,0),
+            CV_RGB(255,255,0),
+            CV_RGB(255,0,0),
+            CV_RGB(255,0,255)} ;
+        // Build the detector input: grayscale, shrunk by `scale`, histogram-equalized.
+        Mat gray, smallImg(cvRound (img.rows/scale), cvRound(img.cols/scale), CV_8UC1 );
+        MemStorage storage(cvCreateMemStorage(0));
+        cvtColor( img, gray, CV_BGR2GRAY );
+        resize( gray, smallImg, smallImg.size(), 0, 0, INTER_LINEAR );
+        equalizeHist( smallImg, smallImg );
+        CvMat _image = smallImg;
+
+        Mat tempimg(&_image, false);
+
+        cv::ocl::oclMat image(tempimg);
+        CvSeq* _objects;
+        // Ten timed detector runs; each prints its own duration and _objects keeps the last result.
+#if 1
+        for(int k= 0; k<10; k++)
+        {
+            t = (double)cvGetTickCount();
+            _objects = cascade.oclHaarDetectObjects( image, storage, 1.1,
+                    2, 0
+                    |CV_HAAR_SCALE_IMAGE
+                    , Size(30,30), Size(0, 0) );
+
+            t = (double)cvGetTickCount() - t ;
+            printf( "detection time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) );
+        }
+
+#else  // disabled CPU path, never compiled. NOTE(review): it would leave _objects unassigned although it is read below
+        cpucascade.detectMultiScale( image, faces,  1.1,
+                2, 0
+                |CV_HAAR_SCALE_IMAGE
+                , Size(30,30), Size(0, 0) );
+
+#endif
+        vector<CvAvgComp> vecAvgComp;  // the CvSeq result is copied here, then reduced to plain rects in `faces`
+        Seq<CvAvgComp>(_objects).copyTo(vecAvgComp);
+        faces.resize(vecAvgComp.size());
+        std::transform(vecAvgComp.begin(), vecAvgComp.end(), faces.begin(), getRect());
+        // Circle each detection; centre and radius are multiplied by `scale` to map back onto `img`.
+        for( vector<Rect>::const_iterator r = faces.begin(); r != faces.end(); r++, i++ )
+        { 
+            Mat smallImgROI;  // unused
+            vector<Rect> nestedObjects;  // unused
+            Point center;
+            Scalar color = colors[i%8];  // cycles through the 8 colours above
+            int radius;
+            center.x = cvRound((r->x + r->width*0.5)*scale);
+            center.y = cvRound((r->y + r->height*0.5)*scale);
+            radius = cvRound((r->width + r->height)*0.25*scale);
+            circle( img, center, radius, color, 3, 8, 0 );
+        }  
+        // Save the annotated image under a platform-specific, hard-coded name.
+#if WIN32
+        sprintf(buff,"E:\\result1\\%d.jpg",index);
+        imwrite(buff,img);
+#else 
+        sprintf(buff,"testdet_%d.jpg",index);
+        imwrite(buff,img);
+#endif
+    }
+}
+
+
+//INSTANTIATE_TEST_CASE_P(HaarTestBase, Haar, Combine(Values(1),
+//            Values(1)));
+
+
+#endif // HAVE_OPENCL
diff --git a/modules/ocl/perf/test_imgproc.cpp b/modules/ocl/perf/test_imgproc.cpp
new file mode 100644
index 000000000..e01e97681
--- /dev/null
+++ b/modules/ocl/perf/test_imgproc.cpp
@@ -0,0 +1,1551 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Niko Li, newlife20080214@gmail.com
+//    Jia Haipeng, jiahaipeng95@gmail.com
+//    Shengen Yan, yanshengen@gmail.com
+//    Jiang Liyuan, lyuan001.good@163.com
+//    Rock Li, Rock.Li@amd.com
+//    Zailong Wu, bullet@yeah.net
+//    Xu Pang, pangxu010@163.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+#ifdef HAVE_OPENCL
+
+using namespace cvtest;
+using namespace testing;
+using namespace std;
+
+
+MatType nulltype = -1;  // sentinel: ImgprocTestBase skips creating any matrix whose type parameter equals this
+// Parameter generators holding exactly one MatType: a caller-supplied type, or the nulltype sentinel.
+#define ONE_TYPE(type)  testing::ValuesIn(typeVector(type))
+#define NULL_TYPE  testing::ValuesIn(typeVector(nulltype))
+
+
+// Returns a one-element vector holding `type`; ONE_TYPE / NULL_TYPE pass it to testing::ValuesIn.
+vector<MatType> typeVector(MatType type)
+{
+	// fill constructor: exactly one copy of `type`
+	return vector<MatType>(1, type);
+}
+
+// Shared fixture for the imgproc perf tests: two sources, two destinations and a mask, each as a host Mat and an oclMat.
+PARAM_TEST_CASE(ImgprocTestBase, MatType,MatType,MatType,MatType,MatType, bool)
+{  // NOTE: the sixth (bool) parameter is not read anywhere in this fixture
+	int type1,type2,type3,type4,type5;  // params 0-4: types of mat1, mat2, dst, dst1; type5 is the threshold type used to build mask
+	cv::Scalar val;  // four random components, drawn at the end of SetUp()
+	// set up roi
+	int roicols;
+	int roirows;
+	int src1x;
+	int src1y;
+	int src2x;
+	int src2y;
+	int dstx;
+	int dsty;
+	int dst1x;
+	int dst1y;
+	int maskx;
+	int masky;
+
+	//mat
+	cv::Mat mat1; 
+	cv::Mat mat2;
+	cv::Mat mask;
+	cv::Mat dst;
+	cv::Mat dst1; //bak, for two outputs
+
+	//mat with roi
+	cv::Mat mat1_roi;
+	cv::Mat mat2_roi;
+	cv::Mat mask_roi;
+	cv::Mat dst_roi;
+	cv::Mat dst1_roi; //bak
+	std::vector<cv::ocl::Info> oclinfo;  // filled by getDevice() in SetUp()
+	//ocl mat
+	cv::ocl::oclMat clmat1;
+	cv::ocl::oclMat clmat2;
+	cv::ocl::oclMat clmask;
+	cv::ocl::oclMat cldst;
+	cv::ocl::oclMat cldst1; //bak
+
+	//ocl mat with roi (not assigned by this fixture; the test bodies set the ones they need)
+	cv::ocl::oclMat clmat1_roi;
+	cv::ocl::oclMat clmat2_roi;
+	cv::ocl::oclMat clmask_roi;
+	cv::ocl::oclMat cldst_roi;
+	cv::ocl::oclMat cldst1_roi;
+
+	virtual void SetUp()  // creates every matrix whose type parameter is not nulltype and assigns it to its oclMat twin
+	{
+		type1 = GET_PARAM(0);
+		type2 = GET_PARAM(1);
+		type3 = GET_PARAM(2);
+		type4 = GET_PARAM(3);
+		type5 = GET_PARAM(4);
+		cv::RNG& rng = TS::ptr()->get_rng();
+		cv::Size size(MWIDTH, MHEIGHT);  // every matrix below is created with this one size
+		double min = 1,max = 20;  // value bounds passed to randomMat for the sources and destinations
+		int devnums = getDevice(oclinfo);
+		CV_Assert(devnums>0);
+		//to use a non-default device, select it here
+		//setDevice(oclinfo[0]);
+		cv::ocl::setBinpath(CLBINPATH);
+		if(type1!=nulltype)
+		{
+			mat1 = randomMat(rng, size, type1, min, max, false);
+			clmat1 = mat1;
+		}
+		if(type2!=nulltype)
+		{
+			mat2 = randomMat(rng, size, type2, min, max, false);
+			clmat2 = mat2;
+		}
+		if(type3!=nulltype)
+		{
+			dst  = randomMat(rng, size, type3, min, max, false);
+			cldst = dst;
+		}
+		if(type4!=nulltype)
+		{
+			dst1 = randomMat(rng, size, type4, min, max, false);
+			cldst1 = dst1;
+		}
+		if(type5!=nulltype)
+		{
+			mask = randomMat(rng, size, CV_8UC1, 0, 2,  false);  // random CV_8UC1 data ...
+			cv::threshold(mask, mask, 0.5, 255., type5);  // ... thresholded in place at 0.5 with max value 255; type5 selects the threshold type
+			clmask = mask;
+		}
+		val = cv::Scalar(rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0));
+	}
+
+	// Re-slices the host ROI views: b != 0 selects a fixed inset region, b == 0 the whole matrix. Sizes come from mat1.
+	void Has_roi(int b)
+	{
+		//cv::RNG& rng = TS::ptr()->get_rng();
+		if(b)
+		{
+			//fixed (not random) ROI: origin (1,1), one column and one row smaller than mat1
+			roicols =  mat1.cols-1; //start
+			roirows = mat1.rows-1;
+			src1x   = 1;
+			src2x   = 1;
+			src1y   = 1;
+			src2y   = 1;
+			dstx    = 1;
+			dsty    =1;
+			dst1x    = 1;
+			dst1y    =1;
+			maskx	 =1;
+			masky	=1;
+		}else
+		{
+			roicols = mat1.cols;
+			roirows = mat1.rows;
+			src1x = 0;
+			src2x = 0;
+			src1y = 0;
+			src2y = 0;
+			dstx = 0;
+			dsty = 0;
+			dst1x  =0;
+			dst1y  =0;
+			maskx	 =0;
+			masky	=0;
+		};
+		// Only matrices that SetUp() actually created get an ROI view.
+		if(type1!=nulltype)
+		{
+			mat1_roi = mat1(Rect(src1x,src1y,roicols,roirows));
+			//clmat1_roi = clmat1(Rect(src1x,src1y,roicols,roirows));
+		}
+		if(type2!=nulltype)
+		{
+			mat2_roi = mat2(Rect(src2x,src2y,roicols,roirows));
+			//clmat2_roi = clmat2(Rect(src2x,src2y,roicols,roirows));
+		}
+		if(type3!=nulltype)
+		{
+			dst_roi  = dst(Rect(dstx,dsty,roicols,roirows));
+			//cldst_roi = cldst(Rect(dstx,dsty,roicols,roirows));
+		}
+		if(type4!=nulltype)
+		{
+			dst1_roi = dst1(Rect(dst1x,dst1y,roicols,roirows));
+			//cldst1_roi = cldst1(Rect(dst1x,dst1y,roicols,roirows));
+		}
+		if(type5!=nulltype)
+		{
+			mask_roi = mask(Rect(maskx,masky,roicols,roirows));
+			//clmask_roi = clmask(Rect(maskx,masky,roicols,roirows));
+		}
+	}
+	// Same as Has_roi(), but the ROI size and every origin are drawn from the test-system RNG.
+	void random_roi()
+	{
+		cv::RNG& rng = TS::ptr()->get_rng();
+
+		//randomize ROI: the size is bounded by mat1, each origin by its own matrix
+		roicols = rng.uniform(1, mat1.cols);
+		roirows = rng.uniform(1, mat1.rows);
+		src1x   = rng.uniform(0, mat1.cols - roicols);
+		src1y   = rng.uniform(0, mat1.rows - roirows);
+		src2x   = rng.uniform(0, mat2.cols - roicols);
+		src2y   = rng.uniform(0, mat2.rows - roirows);
+		dstx    = rng.uniform(0, dst.cols  - roicols);
+		dsty    = rng.uniform(0, dst.rows  - roirows);
+		dst1x    = rng.uniform(0, dst1.cols  - roicols);
+		dst1y    = rng.uniform(0, dst1.rows  - roirows);
+		maskx   = rng.uniform(0, mask.cols - roicols);
+		masky   = rng.uniform(0, mask.rows - roirows);
+		// NOTE(review): for a matrix SetUp() did not create, the bound above is presumably negative (empty Mat); that origin is not used below.
+		if(type1!=nulltype)
+		{
+			mat1_roi = mat1(Rect(src1x,src1y,roicols,roirows));
+			//clmat1_roi = clmat1(Rect(src1x,src1y,roicols,roirows));
+		}
+		if(type2!=nulltype)
+		{
+			mat2_roi = mat2(Rect(src2x,src2y,roicols,roirows));
+			//clmat2_roi = clmat2(Rect(src2x,src2y,roicols,roirows));
+		}
+		if(type3!=nulltype)
+		{
+			dst_roi  = dst(Rect(dstx,dsty,roicols,roirows));
+			//cldst_roi = cldst(Rect(dstx,dsty,roicols,roirows));
+		}
+		if(type4!=nulltype)
+		{
+			dst1_roi = dst1(Rect(dst1x,dst1y,roicols,roirows));
+			//cldst1_roi = cldst1(Rect(dst1x,dst1y,roicols,roirows));
+		}
+		if(type5!=nulltype)
+		{
+			mask_roi = mask(Rect(maskx,masky,roicols,roirows));
+			//clmask_roi = clmask(Rect(maskx,masky,roicols,roirows));
+		}
+	}
+};
+////////////////////////////////equalizeHist//////////////////////////////////////////
+
+struct equalizeHist : ImgprocTestBase {};
+// Times cv::equalizeHist against cv::ocl::equalizeHist, first on the whole matrix (k == 0), then on an ROI (k == 1).
+TEST_P(equalizeHist, MatType) 
+{ 
+	if (mat1.type() != CV_8UC1 || mat1.type() != dst.type())
+	{
+		cout<<"Unsupported type"<<endl;  // only CV_8UC1 -> CV_8UC1 is measured; anything else passes trivially
+		EXPECT_DOUBLE_EQ(0.0, 0.0);
+	}
+	else
+	{
+#ifndef PRINT_KERNEL_RUN_TIME   
+		double totalcputick=0;
+		double totalgputick=0;
+		double totalgputick_kernel=0;
+		double t0=0;  // ticks spent in the CPU call
+		double t1=0;  // ticks for the whole OpenCL section (ROI setup + call)
+		double t2=0;  // ticks spent in the OpenCL call alone
+		for(int k=0;k<2;k++){
+			totalcputick=0;
+			totalgputick=0;
+			totalgputick_kernel=0;
+			for(int j = 0; j < LOOP_TIMES+1; j ++)  // j == 0 is a warm-up run excluded from the totals
+			{
+				Has_roi(k);       
+
+				t0 = (double)cvGetTickCount();//cpu start
+				cv::equalizeHist(mat1_roi, dst_roi);
+				t0 = (double)cvGetTickCount() - t0;//cpu end
+
+				t1 = (double)cvGetTickCount();//gpu start1		
+				if(type1!=nulltype)
+				{
+					clmat1_roi = clmat1(Rect(src1x,src1y,roicols,roirows));
+				}
+				cldst_roi = cldst(Rect(dstx,dsty,roicols,roirows));
+				t2=(double)cvGetTickCount();//kernel
+				cv::ocl::equalizeHist(clmat1_roi, cldst_roi);
+				t2 = (double)cvGetTickCount() - t2;//kernel
+				// NOTE(review): unlike the sibling tests, the result is not downloaded here, so t1 excludes read-back -- confirm.
+				t1 = (double)cvGetTickCount() - t1;//gpu end1		
+
+				if(j == 0)
+					continue;
+
+				totalgputick=t1+totalgputick;
+				totalcputick=t0+totalcputick;	
+				totalgputick_kernel=t2+totalgputick_kernel;	
+
+			}
+			if(k==0){cout<<"no roi\n";}else{cout<<"with roi\n";};
+			cout << "average cpu runtime is  " << totalcputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+			cout << "average gpu runtime is  " << totalgputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+			cout << "average gpu runtime without data transfer is  " << totalgputick_kernel/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		}
+#else  // PRINT_KERNEL_RUN_TIME defined: one untimed OpenCL call per ROI mode
+		for(int j = 0; j < 2; j ++)
+		{
+			Has_roi(j);
+			if(type1!=nulltype)
+			{
+				clmat1_roi = clmat1(Rect(src1x,src1y,roicols,roirows));
+			}
+			cldst_roi = cldst(Rect(dstx,dsty,roicols,roirows));  // bind the destination ROI exactly as the timed branch does
+			if(j==0){cout<<"no roi:";}else{cout<<"\nwith roi:";};
+			cv::ocl::equalizeHist(clmat1_roi, cldst_roi);
+		};
+#endif
+	}
+}
+
+
+////////////////////////////////bilateralFilter////////////////////////////////////////////
+
+struct bilateralFilter : ImgprocTestBase {};
+// Times cv::bilateralFilter against cv::ocl::bilateralFilter for each listed border mode, without and with an ROI.
+TEST_P(bilateralFilter, Mat) 
+{    
+	double sigmacolor = 50.0;
+	int radius = 9;
+	int d = 2*radius+1;  // filter diameter: 19
+	double sigmaspace = 20.0;
+	int bordertype[] = {cv::BORDER_CONSTANT,cv::BORDER_REPLICATE/*,BORDER_REFLECT,BORDER_WRAP,BORDER_REFLECT_101*/};
+	//const char* borderstr[]={"BORDER_CONSTANT", "BORDER_REPLICATE"/*, "BORDER_REFLECT","BORDER_WRAP","BORDER_REFLECT_101"*/};
+	if (mat1.type() != CV_8UC1 || mat1.type() != dst.type())
+	{
+		cout<<"Unsupported type"<<endl;  // only CV_8UC1 -> CV_8UC1 is measured; anything else passes trivially
+		EXPECT_DOUBLE_EQ(0.0, 0.0);
+	}
+	else
+	{
+		for(size_t i=0;i<sizeof(bordertype)/sizeof(int);i++){  // size_t index: matches the unsigned sizeof bound
+#ifndef PRINT_KERNEL_RUN_TIME   
+			double totalcputick=0;
+			double totalgputick=0;
+			double totalgputick_kernel=0;
+			double t0=0;  // ticks spent in the CPU call
+			double t1=0;  // ticks for the whole OpenCL section (ROI setup + call + download)
+			double t2=0;  // ticks spent in the OpenCL call alone
+			for(int k=0;k<2;k++){
+				totalcputick=0;
+				totalgputick=0;
+				totalgputick_kernel=0;
+				for(int j = 0; j < LOOP_TIMES+1; j ++)  // j == 0 is a warm-up run excluded from the totals
+				{
+					Has_roi(k);       
+
+					t0 = (double)cvGetTickCount();//cpu start
+					cv::bilateralFilter(mat1_roi, dst_roi, d,sigmacolor,sigmaspace, bordertype[i]);
+					t0 = (double)cvGetTickCount() - t0;//cpu end
+
+					t1 = (double)cvGetTickCount();//gpu start1		
+					if(type1!=nulltype)
+					{
+						clmat1_roi = clmat1(Rect(src1x,src1y,roicols,roirows));
+					}
+					t2=(double)cvGetTickCount();//kernel
+					cv::ocl::bilateralFilter(clmat1_roi, cldst_roi, d,sigmacolor,sigmaspace, bordertype[i]);
+					t2 = (double)cvGetTickCount() - t2;//kernel
+					cv::Mat cpu_cldst;
+					cldst.download(cpu_cldst);// NOTE(review): downloads cldst, while the filter wrote to cldst_roi (never bound to cldst here) -- confirm
+					t1 = (double)cvGetTickCount() - t1;//gpu end1		
+
+					if(j == 0)
+						continue;
+
+					totalgputick=t1+totalgputick;
+					totalcputick=t0+totalcputick;	
+					totalgputick_kernel=t2+totalgputick_kernel;	
+
+				}
+				if(k==0){cout<<"no roi\n";}else{cout<<"with roi\n";};
+				cout << "average cpu runtime is  " << totalcputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+				cout << "average gpu runtime is  " << totalgputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+				cout << "average gpu runtime without data transfer is  " << totalgputick_kernel/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+			}
+
+#else  // PRINT_KERNEL_RUN_TIME defined: one untimed OpenCL call per ROI mode
+			for(int j = 0; j < 2; j ++)
+			{
+				Has_roi(j);
+				if(type1!=nulltype)
+				{
+					clmat1_roi = clmat1(Rect(src1x,src1y,roicols,roirows));
+				};
+				if(j==0){cout<<"no roi:";}else{cout<<"\nwith roi:";};
+				cv::ocl::bilateralFilter(clmat1_roi, cldst_roi, d,sigmacolor,sigmaspace, bordertype[i]);
+			};
+
+#endif
+		};
+
+	}
+}
+
+////////////////////////////////copyMakeBorder////////////////////////////////////////////
+
+struct CopyMakeBorder : ImgprocTestBase {};
+// Times cv::copyMakeBorder against cv::ocl::copyMakeBorder for each listed border mode, without and with an ROI.
+TEST_P(CopyMakeBorder, Mat) 
+{    
+	int bordertype[] = {cv::BORDER_CONSTANT,cv::BORDER_REPLICATE/*,BORDER_REFLECT,BORDER_WRAP,BORDER_REFLECT_101*/};
+	//const char* borderstr[]={"BORDER_CONSTANT", "BORDER_REPLICATE"/*, "BORDER_REFLECT","BORDER_WRAP","BORDER_REFLECT_101"*/};
+	// Measured only for CV_8UC1, CV_8UC4 and CV_32SC1 with a destination of the same type.
+	if ((mat1.type() != CV_8UC1 && mat1.type() != CV_8UC4 && mat1.type() != CV_32SC1) || mat1.type() != dst.type())
+	{
+		cout<<"Unsupported type"<<endl;
+		EXPECT_DOUBLE_EQ(0.0, 0.0);
+	}
+	else
+	{
+		for(size_t i=0;i<sizeof(bordertype)/sizeof(int);i++){  // size_t index: matches the unsigned sizeof bound
+#ifndef PRINT_KERNEL_RUN_TIME   
+			double totalcputick=0;
+			double totalgputick=0;
+			double totalgputick_kernel=0;
+			double t0=0;  // ticks spent in the CPU call
+			double t1=0;  // ticks for the whole OpenCL section (ROI setup + call + download)
+			double t2=0;  // ticks spent in the OpenCL call alone
+			for(int k=0;k<2;k++){
+				totalcputick=0;
+				totalgputick=0;
+				totalgputick_kernel=0;
+				for(int j = 0; j < LOOP_TIMES+1; j ++)  // j == 0 is a warm-up run excluded from the totals
+				{
+					Has_roi(k);       
+
+					t0 = (double)cvGetTickCount();//cpu start
+					cv::copyMakeBorder(mat1_roi, dst_roi, 7,5,5,7, bordertype[i],cv::Scalar(1.0));
+					t0 = (double)cvGetTickCount() - t0;//cpu end
+
+					t1 = (double)cvGetTickCount();//gpu start1		
+					if(type1!=nulltype)
+					{
+						clmat1_roi = clmat1(Rect(src1x,src1y,roicols,roirows));
+					}
+					t2=(double)cvGetTickCount();//kernel
+					cv::ocl::copyMakeBorder(clmat1_roi, cldst_roi,7,5,5,7,  bordertype[i],cv::Scalar(1.0));
+					t2 = (double)cvGetTickCount() - t2;//kernel
+					cv::Mat cpu_cldst;
+					cldst.download(cpu_cldst);// NOTE(review): downloads cldst, while the call wrote to cldst_roi (never bound to cldst here) -- confirm
+					t1 = (double)cvGetTickCount() - t1;//gpu end1		
+
+					if(j == 0)
+						continue;
+
+					totalgputick=t1+totalgputick;
+					totalcputick=t0+totalcputick;	
+					totalgputick_kernel=t2+totalgputick_kernel;	
+
+				}
+				if(k==0){cout<<"no roi\n";}else{cout<<"with roi\n";};
+				cout << "average cpu runtime is  " << totalcputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+				cout << "average gpu runtime is  " << totalgputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+				cout << "average gpu runtime without data transfer is  " << totalgputick_kernel/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+			}
+#else  // PRINT_KERNEL_RUN_TIME defined: one untimed OpenCL call per ROI mode
+			for(int j = 0; j < 2; j ++)
+			{
+				Has_roi(j);
+				if(type1!=nulltype)
+				{
+					clmat1_roi = clmat1(Rect(src1x,src1y,roicols,roirows));
+				};
+				if(j==0){cout<<"no roi:";}else{cout<<"\nwith roi:";};
+				cv::ocl::copyMakeBorder(clmat1_roi, cldst_roi,7,5,5,7,  bordertype[i],cv::Scalar(1.0));
+			};
+#endif
+		};
+	}
+}
+
+////////////////////////////////cornerMinEigenVal//////////////////////////////////////////
+
+struct cornerMinEigenVal : ImgprocTestBase {};
+// Times cv::cornerMinEigenVal against cv::ocl::cornerMinEigenVal on the whole matrix (k == 0) and on an ROI (k == 1).
+TEST_P(cornerMinEigenVal, Mat) 
+{    	
+#ifndef PRINT_KERNEL_RUN_TIME   
+	double totalcputick=0;
+	double totalgputick=0;
+	double totalgputick_kernel=0;
+	double t0=0;  // ticks spent in the CPU call
+	double t1=0;  // ticks for the whole OpenCL section (ROI setup + call + download)
+	double t2=0;  // ticks spent in the OpenCL call alone
+	for(int k=0;k<2;k++){
+		totalcputick=0;
+		totalgputick=0;
+		totalgputick_kernel=0;
+		for(int j = 0; j < LOOP_TIMES+1; j ++)  // j == 0 is a warm-up run excluded from the totals
+		{
+			Has_roi(k);       
+			int blockSize = 7, apertureSize= 1 + 2 * (rand() % 4);  // aperture is one of 1, 3, 5, 7, re-drawn every iteration
+			int borderType = cv::BORDER_REFLECT;
+			t0 = (double)cvGetTickCount();//cpu start
+			cv::cornerMinEigenVal(mat1_roi, dst_roi, blockSize, apertureSize, borderType); 
+			t0 = (double)cvGetTickCount() - t0;//cpu end
+
+			t1 = (double)cvGetTickCount();//gpu start1		
+			if(type1!=nulltype)
+			{
+				clmat1_roi = clmat1(Rect(src1x,src1y,roicols,roirows));
+			}
+			t2=(double)cvGetTickCount();//kernel
+			cv::ocl::cornerMinEigenVal(clmat1_roi, cldst_roi, blockSize, apertureSize, borderType);
+			t2 = (double)cvGetTickCount() - t2;//kernel
+			cv::Mat cpu_cldst;
+			cldst.download(cpu_cldst);// NOTE(review): downloads cldst, while the call wrote to cldst_roi (never bound to cldst here) -- confirm
+			t1 = (double)cvGetTickCount() - t1;//gpu end1		
+
+			if(j == 0)
+				continue;
+
+			totalgputick=t1+totalgputick;
+			totalcputick=t0+totalcputick;	
+			totalgputick_kernel=t2+totalgputick_kernel;	
+
+		}
+		if(k==0){cout<<"no roi\n";}else{cout<<"with roi\n";};
+		cout << "average cpu runtime is  " << totalcputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime is  " << totalgputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << totalgputick_kernel/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	}
+#else  // PRINT_KERNEL_RUN_TIME defined: one untimed OpenCL call per ROI mode
+	for(int j = 0; j < 2; j ++)
+	{
+		Has_roi(j);
+		int blockSize = 7, apertureSize= 1 + 2 * (rand() % 4);
+		int borderType = cv::BORDER_REFLECT;
+		if(type1!=nulltype)
+		{
+			clmat1_roi = clmat1(Rect(src1x,src1y,roicols,roirows));
+		};
+		if(j==0){cout<<"no roi:";}else{cout<<"\nwith roi:";};
+		cv::ocl::cornerMinEigenVal(clmat1_roi, cldst_roi, blockSize, apertureSize, borderType);
+	};
+#endif
+}
+
+
+////////////////////////////////cornerHarris//////////////////////////////////////////
+
+struct cornerHarris : ImgprocTestBase {};
+// Times cv::cornerHarris against cv::ocl::cornerHarris on the whole matrix (k == 0) and on an ROI (k == 1).
+TEST_P(cornerHarris, Mat) 
+{    
+#ifndef PRINT_KERNEL_RUN_TIME   
+	double totalcputick=0;
+	double totalgputick=0;
+	double totalgputick_kernel=0;
+	double t0=0;  // ticks spent in the CPU call
+	double t1=0;  // ticks for the whole OpenCL section (ROI setup + call + download)
+	double t2=0;  // ticks spent in the OpenCL call alone
+	for(int k=0;k<2;k++){
+		totalcputick=0;
+		totalgputick=0;
+		totalgputick_kernel=0;
+		for(int j = 0; j < LOOP_TIMES+1; j ++)  // j == 0 is a warm-up run excluded from the totals
+		{
+			Has_roi(k);   
+			int blockSize = 7, apertureSize= 3;
+			int borderType = cv::BORDER_REFLECT;
+			double kk = 2;  // passed as the fifth argument of both cornerHarris calls
+			t0 = (double)cvGetTickCount();//cpu start
+			cv::cornerHarris(mat1_roi, dst_roi, blockSize, apertureSize, kk, borderType); 
+			t0 = (double)cvGetTickCount() - t0;//cpu end
+
+			t1 = (double)cvGetTickCount();//gpu start1		
+			if(type1!=nulltype)
+			{
+				clmat1_roi = clmat1(Rect(src1x,src1y,roicols,roirows));
+			}
+			t2=(double)cvGetTickCount();//kernel
+			cv::ocl::cornerHarris(clmat1_roi, cldst_roi, blockSize, apertureSize, kk, borderType);
+			t2 = (double)cvGetTickCount() - t2;//kernel
+			cv::Mat cpu_cldst;
+			cldst.download(cpu_cldst);// NOTE(review): downloads cldst, while the call wrote to cldst_roi (never bound to cldst here) -- confirm
+			t1 = (double)cvGetTickCount() - t1;//gpu end1		
+
+			if(j == 0)
+				continue;
+
+			totalgputick=t1+totalgputick;
+			totalcputick=t0+totalcputick;	
+			totalgputick_kernel=t2+totalgputick_kernel;	
+
+		}
+		if(k==0){cout<<"no roi\n";}else{cout<<"with roi\n";};
+		cout << "average cpu runtime is  " << totalcputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime is  " << totalgputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << totalgputick_kernel/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	}
+#else  // PRINT_KERNEL_RUN_TIME defined: one untimed OpenCL call per ROI mode
+	for(int j = 0; j < 2; j ++)
+	{
+		Has_roi(j);
+		double kk = 2;
+		int blockSize = 7, apertureSize= 3;
+		int borderType = cv::BORDER_REFLECT;
+		if(type1!=nulltype)
+		{
+			clmat1_roi = clmat1(Rect(src1x,src1y,roicols,roirows));
+		};
+		if(j==0){cout<<"no roi:";}else{cout<<"\nwith roi:";};
+		cv::ocl::cornerHarris(clmat1_roi, cldst_roi, blockSize, apertureSize, kk, borderType);
+	};
+#endif
+
+}
+
+
+////////////////////////////////integral/////////////////////////////////////////////////
+
+struct integral : ImgprocTestBase {};
+// Times the two-output cv::integral against cv::ocl::integral on the whole matrix (k == 0) and on an ROI (k == 1).
+TEST_P(integral, Mat) 
+{    
+#ifndef PRINT_KERNEL_RUN_TIME   
+	double totalcputick=0;
+	double totalgputick=0;
+	double totalgputick_kernel=0;
+	double t0=0;  // ticks spent in the CPU call
+	double t1=0;  // ticks for the whole OpenCL section (ROI setup + call + two downloads)
+	double t2=0;  // ticks spent in the OpenCL call alone
+	for(int k=0;k<2;k++){
+		totalcputick=0;
+		totalgputick=0;
+		totalgputick_kernel=0;
+		for(int j = 0; j < LOOP_TIMES+1; j ++)  // j == 0 is a warm-up run excluded from the totals
+		{
+			Has_roi(k);   
+			t0 = (double)cvGetTickCount();//cpu start
+			cv::integral(mat1_roi, dst_roi, dst1_roi);
+			t0 = (double)cvGetTickCount() - t0;//cpu end
+
+			t1 = (double)cvGetTickCount();//gpu start1		
+			if(type1!=nulltype)
+			{
+				clmat1_roi = clmat1(Rect(src1x,src1y,roicols,roirows));
+			}
+			t2=(double)cvGetTickCount();//kernel
+			cv::ocl::integral(clmat1_roi, cldst_roi, cldst1_roi);
+			t2 = (double)cvGetTickCount() - t2;//kernel
+			cv::Mat cpu_cldst;
+			cv::Mat cpu_cldst1;
+			cldst.download(cpu_cldst);// NOTE(review): downloads cldst / cldst1, while the call wrote to cldst_roi / cldst1_roi -- confirm
+			cldst1.download(cpu_cldst1);
+			t1 = (double)cvGetTickCount() - t1;//gpu end1	
+
+			if(j == 0)
+				continue;
+
+			totalgputick=t1+totalgputick;
+			totalcputick=t0+totalcputick;	
+			totalgputick_kernel=t2+totalgputick_kernel;	
+
+		}
+		if(k==0){cout<<"no roi\n";}else{cout<<"with roi\n";};
+		cout << "average cpu runtime is  " << totalcputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime is  " << totalgputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << totalgputick_kernel/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	}
+#else  // PRINT_KERNEL_RUN_TIME defined: one untimed OpenCL call per ROI mode
+	for(int j = 0; j < 2; j ++)
+	{
+		Has_roi(j);
+		if(type1!=nulltype)
+		{
+			clmat1_roi = clmat1(Rect(src1x,src1y,roicols,roirows));
+		};
+		if(j==0){cout<<"no roi:";}else{cout<<"\nwith roi:";};
+		cv::ocl::integral(clmat1_roi, cldst_roi, cldst1_roi);
+	};
+#endif
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// warpAffine  & warpPerspective
+
+// Shared fixture for the WarpAffine / WarpPerspective perf tests.
+// Test parameters: (0) matrix type for source and destination, (1) interpolation flag.
+// Holds the host matrices, their ROI views and the OpenCL matrices that the
+// test bodies fill in.
+PARAM_TEST_CASE(WarpTestBase, MatType, int)
+{
+	// values taken from the test parameters, plus the common matrix size
+	int type;
+	cv::Size size;
+	int interpolation;
+
+	// full-size host matrices created in SetUp()
+	cv::Mat mat1;
+	cv::Mat dst;
+
+	// ROI extents and origins; Has_roi() writes all eight
+	int src_roicols;
+	int src_roirows;
+	int dst_roicols;
+	int dst_roirows;
+	int src1x;
+	int src1y;
+	int dstx;
+	int dsty;
+
+	// host-side views of mat1 / dst restricted to the current ROI
+	cv::Mat mat1_roi;
+	cv::Mat dst_roi;
+
+	// device list filled by getDevice()
+	std::vector<cv::ocl::Info> oclinfo;
+
+	// device matrices: only declared here, the test bodies assign them
+	cv::ocl::oclMat gdst_whole;
+	cv::ocl::oclMat gmat1;
+	cv::ocl::oclMat gdst;
+
+	// Reads the parameters, builds the two host matrices and prepares
+	// the OpenCL side (device enumeration and kernel binary path).
+	virtual void SetUp()
+	{
+		type = GET_PARAM(0);
+		interpolation = GET_PARAM(1);
+		size = cv::Size(MWIDTH, MHEIGHT);
+
+		// mat1 first, then dst: both draw from the same test-system RNG
+		cv::RNG& generator = TS::ptr()->get_rng();
+		mat1 = randomMat(generator, size, type, 5, 16, false);
+		dst  = randomMat(generator, size, type, 5, 16, false);
+
+		// CV_Assert fails unless at least one OpenCL device is reported
+		const int devnums = getDevice(oclinfo);
+		CV_Assert(devnums > 0);
+		// to run on a non-default device, call setDevice(oclinfo[n]) here
+		cv::ocl::setBinpath(CLBINPATH);
+	}
+
+	// Selects the region the next run operates on and refreshes
+	// mat1_roi / dst_roi accordingly.
+	//   b != 0 : origin (1,1); one column and one row fewer than the matrix
+	//   b == 0 : origin (0,0); the whole matrix
+	void Has_roi(int b)
+	{
+		const int inset = b ? 1 : 0;
+
+		// ROI origins
+		src1x = inset;
+		src1y = inset;
+		dstx  = inset;
+		dsty  = inset;
+
+		// ROI extents shrink by the inset so the ROI stays inside the matrix
+		src_roicols = mat1.cols - inset;
+		src_roirows = mat1.rows - inset;
+		dst_roicols = dst.cols - inset;
+		dst_roirows = dst.rows - inset;
+
+		// re-slice the host matrices
+		const Rect srcRegion(src1x, src1y, src_roicols, src_roirows);
+		const Rect dstRegion(dstx, dsty, dst_roicols, dst_roirows);
+		mat1_roi = mat1(srcRegion);
+		dst_roi  = dst(dstRegion);
+	}
+};
+
+/////warpAffine
+
+struct WarpAffine : WarpTestBase{};
+// Times cv::warpAffine against cv::ocl::warpAffine on the whole matrix (k == 0) and on an ROI (k == 1).
+TEST_P(WarpAffine, Mat)
+{
+	static const double coeffs[2][3] =  // 2x3 affine matrix: rotation by 3.14/6 rad combined with a (100, -100) translation
+	{
+		{cos(3.14 / 6), -sin(3.14 / 6), 100.0},
+		{sin(3.14 / 6), cos(3.14 / 6), -100.0}
+	};
+	Mat M(2, 3, CV_64F, (void*)coeffs);  // Mat header constructed over the static array above
+
+#ifndef PRINT_KERNEL_RUN_TIME   
+	double totalcputick=0;
+	double totalgputick=0;
+	double totalgputick_kernel=0;
+	double t0=0;  // ticks spent in the CPU call
+	double t1=0;  // ticks for the whole OpenCL section (oclMat assignments + call + download)
+	double t2=0;  // ticks spent in the OpenCL call alone
+	for(int k=0;k<2;k++){
+		totalcputick=0;
+		totalgputick=0;
+		totalgputick_kernel=0;
+		for(int j = 0; j < LOOP_TIMES+1; j ++)  // j == 0 is a warm-up run excluded from the totals
+		{
+			Has_roi(k);       
+
+			t0 = (double)cvGetTickCount();//cpu start
+			cv::warpAffine(mat1_roi, dst_roi, M, size, interpolation);
+			t0 = (double)cvGetTickCount() - t0;//cpu end
+
+			t1 = (double)cvGetTickCount();//gpu start1		
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx,dsty,dst_roicols,dst_roirows));
+			// the source oclMat is assigned from the host ROI view, not sliced from a full device matrix
+			gmat1 = mat1_roi;
+			t2=(double)cvGetTickCount();//kernel
+			cv::ocl::warpAffine(gmat1, gdst, M, size, interpolation);
+			t2 = (double)cvGetTickCount() - t2;//kernel
+			cv::Mat cpu_dst;
+			gdst_whole.download (cpu_dst);//download
+			t1 = (double)cvGetTickCount() - t1;//gpu end1		
+
+			if(j == 0)
+				continue;
+
+			totalgputick=t1+totalgputick;
+			totalcputick=t0+totalcputick;	
+			totalgputick_kernel=t2+totalgputick_kernel;	
+
+		}
+		if(k==0){cout<<"no roi\n";}else{cout<<"with roi\n";};
+		cout << "average cpu runtime is  " << totalcputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime is  " << totalgputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << totalgputick_kernel/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	}
+#else  // PRINT_KERNEL_RUN_TIME defined: one untimed OpenCL call per ROI mode
+	for(int j = 0; j < 2; j ++)
+	{
+		Has_roi(j);
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx,dsty,dst_roicols,dst_roirows));
+		gmat1 = mat1_roi;
+		if(j==0){cout<<"no roi:";}else{cout<<"\nwith roi:";};
+		cv::ocl::warpAffine(gmat1, gdst, M, size, interpolation);
+	};
+#endif
+
+}
+
+
+// warpPerspective
+
+// Parameterized fixture for the warpPerspective benchmark; everything is inherited from WarpTestBase.
+struct WarpPerspective : public WarpTestBase {};
+
+// Times cv::warpPerspective against cv::ocl::warpPerspective, first on whole matrices and then on ROIs.
+TEST_P(WarpPerspective, Mat)
+{
+	// 3x3 transform: the same rotation/translation as the affine test with a (0, 0, 1) last row.
+	static const double coeffs[3][3] =
+	{
+		{cos(3.14 / 6), -sin(3.14 / 6), 100.0},
+		{sin(3.14 / 6), cos(3.14 / 6), -100.0},
+		{0.0, 0.0, 1.0}
+	};
+	Mat M(3, 3, CV_64F, (void*)coeffs);
+
+#ifndef PRINT_KERNEL_RUN_TIME
+	// roiMode 0: whole matrices; roiMode 1: the ROIs selected by Has_roi(1).
+	for(int roiMode = 0; roiMode < 2; ++roiMode)
+	{
+		double cpuTicks = 0;
+		double gpuTicks = 0;
+		double kernelTicks = 0;
+
+		// LOOP_TIMES+1 iterations: the first one is excluded from the totals
+		// (presumably to keep one-time setup cost out of the averages).
+		for(int iter = 0; iter < LOOP_TIMES+1; ++iter)
+		{
+			Has_roi(roiMode);
+
+			// CPU reference.
+			const double cpuStart = (double)cvGetTickCount();
+			cv::warpPerspective(mat1_roi, dst_roi, M, size, interpolation);
+			const double cpuElapsed = (double)cvGetTickCount() - cpuStart;
+
+			// "gpu" window: upload + ocl call + download.
+			const double gpuStart = (double)cvGetTickCount();
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx,dsty,dst_roicols,dst_roirows));
+			gmat1 = mat1_roi;
+
+			// "kernel" window: the ocl call only.
+			const double kernelStart = (double)cvGetTickCount();
+			cv::ocl::warpPerspective(gmat1, gdst, M, size, interpolation);
+			const double kernelElapsed = (double)cvGetTickCount() - kernelStart;
+
+			cv::Mat cpu_dst;
+			gdst_whole.download(cpu_dst);
+			const double gpuElapsed = (double)cvGetTickCount() - gpuStart;
+
+			if(iter == 0)
+				continue;
+
+			gpuTicks += gpuElapsed;
+			cpuTicks += cpuElapsed;
+			kernelTicks += kernelElapsed;
+		}
+
+		// Turns an accumulated tick total into the per-iteration average printed as "ms".
+		const double ticksToAvgMs = (double)cvGetTickFrequency()* LOOP_TIMES *1000.;
+
+		cout << (roiMode == 0 ? "no roi\n" : "with roi\n");
+		cout << "average cpu runtime is  " << cpuTicks/ticksToAvgMs << "ms" << endl;
+		cout << "average gpu runtime is  " << gpuTicks/ticksToAvgMs << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << kernelTicks/ticksToAvgMs << "ms" << endl;
+	}
+#else
+	// PRINT_KERNEL_RUN_TIME build: no host-side timing, only a label before each ocl call.
+	for(int roiMode = 0; roiMode < 2; ++roiMode)
+	{
+		Has_roi(roiMode);
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx,dsty,dst_roicols,dst_roirows));
+		gmat1 = mat1_roi;
+		cout << (roiMode == 0 ? "no roi:" : "\nwith roi:");
+		cv::ocl::warpPerspective(gmat1, gdst, M, size, interpolation);
+	}
+#endif
+
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// resize
+
+// Fixture for the resize benchmark.
+// Parameters: matrix type, destination size, x scale, y scale, interpolation flag.
+PARAM_TEST_CASE(Resize, MatType, cv::Size, double, double, int)
+{
+	int type;
+	cv::Size dsize;
+	double fx, fy;
+	int interpolation;
+
+	// host source and destination (whole matrices)
+	cv::Mat mat1;
+	cv::Mat dst;
+
+	// ROI geometry, filled in by Has_roi()
+	int src_roicols;
+	int src_roirows;
+	int dst_roicols;
+	int dst_roirows;
+	int src1x;
+	int src1y;
+	int dstx;
+	int dsty;
+
+
+	// host ROI views into mat1 / dst
+	cv::Mat mat1_roi;
+	cv::Mat dst_roi;
+	std::vector<cv::ocl::Info> oclinfo;
+	// whole destination on the device
+	cv::ocl::oclMat gdst_whole;
+
+	// device source and destination ROI, assigned by the test body
+	cv::ocl::oclMat gmat1;
+	cv::ocl::oclMat gdst;
+
+	// Reads the test parameters, creates random host matrices and initialises the OpenCL device.
+	virtual void SetUp()
+	{
+		type = GET_PARAM(0);
+		dsize = GET_PARAM(1);
+		fx = GET_PARAM(2);
+		fy = GET_PARAM(3);
+		interpolation = GET_PARAM(4);
+
+		cv::RNG& rng = TS::ptr()->get_rng();
+		cv::Size size(MWIDTH, MHEIGHT);
+
+		const bool noExplicitSize = (dsize == cv::Size());
+
+		// Without an explicit size both scale factors must be positive. Previously this
+		// case only printed a message and returned, leaving mat1/dst empty while the test
+		// body still ran; a fatal failure here makes gtest skip the body instead.
+		if(noExplicitSize && !(fx > 0 && fy > 0))
+		{
+			FAIL() << "invalid dsize and fx fy";
+		}
+
+		// Derive the destination size from the scale factors when none was given.
+		if(noExplicitSize)
+		{
+			dsize.width = (int)(size.width * fx);
+			dsize.height = (int)(size.height * fy);
+		}
+
+		mat1 = randomMat(rng, size, type, 5, 16, false);
+		dst  = randomMat(rng, dsize, type, 5, 16, false);
+
+		int devnums = getDevice(oclinfo);
+		CV_Assert(devnums > 0);
+		// to use a non-default device, select it here
+		//setDevice(oclinfo[0]);
+		cv::ocl::setBinpath(CLBINPATH);
+	}
+
+	// Selects the regions the test works on.
+	// b != 0: ROIs start at (1,1) and are one row and one column smaller than the matrices;
+	// b == 0: ROIs cover the whole matrices.
+	void Has_roi(int b)
+	{
+		const int offset = b ? 1 : 0;
+
+		src_roicols = mat1.cols - offset;
+		src_roirows = mat1.rows - offset;
+		dst_roicols = dst.cols - offset;
+		dst_roirows = dst.rows - offset;
+		src1x = offset;
+		src1y = offset;
+		dstx = offset;
+		dsty = offset;
+
+		mat1_roi = mat1(Rect(src1x,src1y,src_roicols,src_roirows));
+		dst_roi  = dst(Rect(dstx,dsty,dst_roicols,dst_roirows));
+	}
+
+};
+
+// Times cv::resize against cv::ocl::resize, first on whole matrices and then on ROIs.
+TEST_P(Resize, Mat)
+{
+#ifndef PRINT_KERNEL_RUN_TIME
+	// roiMode 0: whole matrices; roiMode 1: the ROIs selected by Has_roi(1).
+	for(int roiMode = 0; roiMode < 2; ++roiMode)
+	{
+		double cpuTicks = 0;
+		double gpuTicks = 0;
+		double kernelTicks = 0;
+
+		// LOOP_TIMES+1 iterations: the first one is excluded from the totals
+		// (presumably to keep one-time setup cost out of the averages).
+		for(int iter = 0; iter < LOOP_TIMES+1; ++iter)
+		{
+			Has_roi(roiMode);
+
+			// CPU reference.
+			// NOTE(review): dsize is passed unchanged in ROI mode although dst_roi is one
+			// row/column smaller - confirm whether resize reallocates the destination here.
+			const double cpuStart = (double)cvGetTickCount();
+			cv::resize(mat1_roi, dst_roi, dsize, fx, fy, interpolation);
+			const double cpuElapsed = (double)cvGetTickCount() - cpuStart;
+
+			// "gpu" window: upload + ocl call + download.
+			const double gpuStart = (double)cvGetTickCount();
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx,dsty,dst_roicols,dst_roirows));
+			gmat1 = mat1_roi;
+
+			// "kernel" window: the ocl call only.
+			const double kernelStart = (double)cvGetTickCount();
+			cv::ocl::resize(gmat1, gdst, dsize, fx, fy, interpolation);
+			const double kernelElapsed = (double)cvGetTickCount() - kernelStart;
+
+			cv::Mat cpu_dst;
+			gdst_whole.download(cpu_dst);
+			const double gpuElapsed = (double)cvGetTickCount() - gpuStart;
+
+			if(iter == 0)
+				continue;
+
+			gpuTicks += gpuElapsed;
+			cpuTicks += cpuElapsed;
+			kernelTicks += kernelElapsed;
+		}
+
+		// Turns an accumulated tick total into the per-iteration average printed as "ms".
+		const double ticksToAvgMs = (double)cvGetTickFrequency()* LOOP_TIMES *1000.;
+
+		cout << (roiMode == 0 ? "no roi\n" : "with roi\n");
+		cout << "average cpu runtime is  " << cpuTicks/ticksToAvgMs << "ms" << endl;
+		cout << "average gpu runtime is  " << gpuTicks/ticksToAvgMs << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << kernelTicks/ticksToAvgMs << "ms" << endl;
+	}
+#else
+	// PRINT_KERNEL_RUN_TIME build: no host-side timing, only a label before each ocl call.
+	for(int roiMode = 0; roiMode < 2; ++roiMode)
+	{
+		Has_roi(roiMode);
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx,dsty,dst_roicols,dst_roirows));
+		gmat1 = mat1_roi;
+		cout << (roiMode == 0 ? "no roi:" : "\nwith roi:");
+		cv::ocl::resize(gmat1, gdst, dsize, fx, fy, interpolation);
+	}
+#endif
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//threshold 
+
+// Fixture for the threshold benchmark.
+// Parameters: matrix type and threshold operation.
+PARAM_TEST_CASE(Threshold, MatType, ThreshOp)
+{
+	int type;
+	int threshOp;
+
+	// host source and destination (whole matrices)
+	cv::Mat mat1;
+	cv::Mat dst;
+
+	// ROI geometry, filled in by Has_roi()
+	int roicols;
+	int roirows;
+	int src1x;
+	int src1y;
+	int dstx;
+	int dsty;
+
+	// host ROI views into mat1 / dst
+	cv::Mat mat1_roi;
+	cv::Mat dst_roi;
+	std::vector<cv::ocl::Info> oclinfo;
+	// whole destination on the device
+	cv::ocl::oclMat gdst_whole;
+
+	// device source and destination ROI, assigned by the test body
+	cv::ocl::oclMat gmat1;
+	cv::ocl::oclMat gdst;
+
+	// Reads the test parameters, creates random host matrices and initialises the OpenCL device.
+	virtual void SetUp()
+	{
+		type = GET_PARAM(0);
+		threshOp = GET_PARAM(1);
+
+		cv::RNG& rng = TS::ptr()->get_rng();
+		cv::Size size(MWIDTH, MHEIGHT);
+
+		mat1 = randomMat(rng, size, type, 5, 16, false);
+		dst  = randomMat(rng, size, type, 5, 16, false);
+
+		int devnums = getDevice(oclinfo);
+		CV_Assert(devnums > 0);
+		// to use a non-default device, select it here
+		//setDevice(oclinfo[0]);
+		cv::ocl::setBinpath(CLBINPATH);
+	}
+
+	// Selects the regions the test works on.
+	// b != 0: ROIs start at (1,1) and are one row and one column smaller than mat1;
+	// b == 0: ROIs cover the whole matrices.
+	void Has_roi(int b)
+	{
+		const int offset = b ? 1 : 0;
+
+		roicols = mat1.cols - offset;
+		roirows = mat1.rows - offset;
+		src1x = offset;
+		src1y = offset;
+		dstx = offset;
+		dsty = offset;
+
+		mat1_roi = mat1(Rect(src1x,src1y,roicols,roirows));
+		dst_roi  = dst(Rect(dstx,dsty,roicols,roirows));
+	}
+};
+
+// Times cv::threshold against cv::ocl::threshold, first on whole matrices and then on ROIs.
+TEST_P(Threshold, Mat)
+{
+#ifndef PRINT_KERNEL_RUN_TIME
+	// roiMode 0: whole matrices; roiMode 1: the ROIs selected by Has_roi(1).
+	for(int roiMode = 0; roiMode < 2; ++roiMode)
+	{
+		double cpuTicks = 0;
+		double gpuTicks = 0;
+		double kernelTicks = 0;
+
+		// LOOP_TIMES+1 iterations: the first one is excluded from the totals
+		// (presumably to keep one-time setup cost out of the averages).
+		for(int iter = 0; iter < LOOP_TIMES+1; ++iter)
+		{
+			Has_roi(roiMode);
+
+			// Fresh random limits every iteration; thresh is drawn from [0, maxVal].
+			double maxVal = randomDouble(20.0, 127.0);
+			double thresh = randomDouble(0.0, maxVal);
+
+			// CPU reference.
+			const double cpuStart = (double)cvGetTickCount();
+			cv::threshold(mat1_roi, dst_roi, thresh, maxVal, threshOp);
+			const double cpuElapsed = (double)cvGetTickCount() - cpuStart;
+
+			// "gpu" window: upload + ocl call + download.
+			const double gpuStart = (double)cvGetTickCount();
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+			gmat1 = mat1_roi;
+
+			// "kernel" window: the ocl call only.
+			const double kernelStart = (double)cvGetTickCount();
+			cv::ocl::threshold(gmat1, gdst, thresh, maxVal, threshOp);
+			const double kernelElapsed = (double)cvGetTickCount() - kernelStart;
+
+			cv::Mat cpu_dst;
+			gdst_whole.download(cpu_dst);
+			const double gpuElapsed = (double)cvGetTickCount() - gpuStart;
+
+			if(iter == 0)
+				continue;
+
+			gpuTicks += gpuElapsed;
+			cpuTicks += cpuElapsed;
+			kernelTicks += kernelElapsed;
+		}
+
+		// Turns an accumulated tick total into the per-iteration average printed as "ms".
+		const double ticksToAvgMs = (double)cvGetTickFrequency()* LOOP_TIMES *1000.;
+
+		cout << (roiMode == 0 ? "no roi\n" : "with roi\n");
+		cout << "average cpu runtime is  " << cpuTicks/ticksToAvgMs << "ms" << endl;
+		cout << "average gpu runtime is  " << gpuTicks/ticksToAvgMs << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << kernelTicks/ticksToAvgMs << "ms" << endl;
+	}
+#else
+	// PRINT_KERNEL_RUN_TIME build: no host-side timing, only a label before each ocl call.
+	for(int roiMode = 0; roiMode < 2; ++roiMode)
+	{
+		Has_roi(roiMode);
+		double maxVal = randomDouble(20.0, 127.0);
+		double thresh = randomDouble(0.0, maxVal);
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+		gmat1 = mat1_roi;
+
+		cout << (roiMode == 0 ? "no roi:" : "\nwith roi:");
+		cv::ocl::threshold(gmat1, gdst, thresh, maxVal, threshOp);
+	}
+#endif
+
+}
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//meanShift
+
+// Shared fixture for the meanShiftFiltering / meanShiftProc benchmarks.
+// Parameters: image type, coordinate-map type, sp, sr, termination criteria.
+PARAM_TEST_CASE(meanShiftTestBase, MatType, MatType, int, int, cv::TermCriteria)
+{
+	int type, typeCoor;
+	int sp, sr;
+	cv::TermCriteria crit;
+
+	// host source, destination and coordinate destination (whole matrices)
+	cv::Mat src;
+	cv::Mat dst;
+	cv::Mat dstCoor;
+
+	// ROI geometry, filled in by Has_roi()
+	int roicols;
+	int roirows;
+	int srcx;
+	int srcy;
+	int dstx;
+	int dsty;
+
+	// host ROI views
+	cv::Mat src_roi;
+	cv::Mat dst_roi;
+	cv::Mat dstCoor_roi;
+
+	// whole destinations on the device, refreshed by Has_roi()
+	cv::ocl::oclMat gdst;
+	cv::ocl::oclMat gdstCoor;
+
+	std::vector<cv::ocl::Info> oclinfo;
+	// device ROIs, assigned by the test bodies
+	cv::ocl::oclMat gsrc_roi;
+	cv::ocl::oclMat gdst_roi;
+	cv::ocl::oclMat gdstCoor_roi;
+
+	// Reads the test parameters, creates random host matrices and initialises the OpenCL device.
+	virtual void SetUp()
+	{
+		type     = GET_PARAM(0);
+		typeCoor = GET_PARAM(1);
+		sp       = GET_PARAM(2);
+		sr       = GET_PARAM(3);
+		crit     = GET_PARAM(4);
+
+		cv::RNG &rng = TS::ptr()->get_rng();
+
+		// MWIDTH=256, MHEIGHT=256. defined in utility.hpp
+		cv::Size size = cv::Size(MWIDTH, MHEIGHT);
+
+		src = randomMat(rng, size, type, 5, 16, false);
+		dst = randomMat(rng, size, type, 5, 16, false);
+		dstCoor = randomMat(rng, size, typeCoor, 5, 16, false);
+
+		int devnums = getDevice(oclinfo);
+		CV_Assert(devnums > 0);
+		// to use a non-default device, select it here
+		//setDevice(oclinfo[0]);
+		cv::ocl::setBinpath(CLBINPATH);
+	}
+
+	// Selects the regions the tests work on and re-assigns both device destinations.
+	// b != 0: ROIs start at (1,1) and are one row and one column smaller than src;
+	// b == 0: ROIs cover the whole matrices.
+	void Has_roi(int b)
+	{
+		const int offset = b ? 1 : 0;
+
+		roicols = src.cols - offset;
+		roirows = src.rows - offset;
+		srcx = offset;
+		srcy = offset;
+		dstx = offset;
+		dsty = offset;
+
+		src_roi = src(Rect(srcx, srcy, roicols, roirows));
+		dst_roi = dst(Rect(dstx, dsty, roicols, roirows));
+		dstCoor_roi = dstCoor(Rect(dstx, dsty, roicols, roirows));
+
+		// Copy the host destinations to the device; the tests slice their ROIs from these.
+		gdst = dst;
+		gdstCoor = dstCoor;
+	}
+};
+
+/////////////////////////meanShiftFiltering/////////////////////////////
+// Parameterized fixture for the meanShiftFiltering benchmark; everything is inherited from meanShiftTestBase.
+struct meanShiftFiltering : public meanShiftTestBase {};
+
+// Times cv::ocl::meanShiftFiltering on whole matrices and on ROIs (no CPU reference is run).
+TEST_P(meanShiftFiltering, Mat)
+{
+
+#ifndef PRINT_KERNEL_RUN_TIME
+	// roiMode 0: whole matrices; roiMode 1: the ROIs selected by Has_roi(1).
+	for(int roiMode = 0; roiMode < 2; ++roiMode)
+	{
+		double gpuTicks = 0;
+		double kernelTicks = 0;
+
+		// LOOP_TIMES+1 iterations: the first one is excluded from the totals
+		// (presumably to keep one-time setup cost out of the averages).
+		for(int iter = 0; iter < LOOP_TIMES+1; ++iter)
+		{
+			// Has_roi also assigns gdst from dst; that happens before the timed window.
+			Has_roi(roiMode);
+
+			// "gpu" window: source upload + ocl call + download.
+			const double gpuStart = (double)cvGetTickCount();
+			gsrc_roi = src_roi;
+			gdst_roi = gdst(Rect(dstx, dsty, roicols, roirows));
+
+			// "kernel" window: the ocl call only.
+			const double kernelStart = (double)cvGetTickCount();
+			cv::ocl::meanShiftFiltering(gsrc_roi, gdst_roi, sp, sr, crit);
+			const double kernelElapsed = (double)cvGetTickCount() - kernelStart;
+
+			cv::Mat cpu_gdst;
+			gdst.download(cpu_gdst);
+			const double gpuElapsed = (double)cvGetTickCount() - gpuStart;
+
+			if(iter == 0)
+				continue;
+
+			gpuTicks += gpuElapsed;
+			kernelTicks += kernelElapsed;
+		}
+
+		// Turns an accumulated tick total into the per-iteration average printed as "ms".
+		const double ticksToAvgMs = (double)cvGetTickFrequency()* LOOP_TIMES *1000.;
+
+		cout << (roiMode == 0 ? "no roi\n" : "with roi\n");
+		cout << "average gpu runtime is  " << gpuTicks/ticksToAvgMs << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << kernelTicks/ticksToAvgMs << "ms" << endl;
+	}
+#else
+	// PRINT_KERNEL_RUN_TIME build: no host-side timing, only a label before each ocl call.
+	for(int roiMode = 0; roiMode < 2; ++roiMode)
+	{
+		Has_roi(roiMode);
+
+		gsrc_roi = src_roi;
+		gdst_roi = gdst(Rect(dstx, dsty, roicols, roirows));
+
+		cout << (roiMode == 0 ? "no roi:" : "\nwith roi:");
+		cv::ocl::meanShiftFiltering(gsrc_roi, gdst_roi, sp, sr, crit);
+	}
+#endif
+
+}
+
+///////////////////////////meanShiftProc//////////////////////////////////
+// Parameterized fixture for the meanShiftProc benchmark; everything is inherited from meanShiftTestBase.
+struct meanShiftProc : public meanShiftTestBase {};
+
+// Times cv::ocl::meanShiftProc on whole matrices and on ROIs (no CPU reference is run).
+TEST_P(meanShiftProc, Mat)
+{
+
+#ifndef PRINT_KERNEL_RUN_TIME
+	// roiMode 0: whole matrices; roiMode 1: the ROIs selected by Has_roi(1).
+	for(int roiMode = 0; roiMode < 2; ++roiMode)
+	{
+		double gpuTicks = 0;
+		double kernelTicks = 0;
+
+		// LOOP_TIMES+1 iterations: the first one is excluded from the totals
+		// (presumably to keep one-time setup cost out of the averages).
+		for(int iter = 0; iter < LOOP_TIMES+1; ++iter)
+		{
+			// Has_roi also assigns gdst/gdstCoor from the host; that happens before the timed window.
+			Has_roi(roiMode);
+
+			// "gpu" window: source upload + ocl call + download of the coordinate map.
+			const double gpuStart = (double)cvGetTickCount();
+			gsrc_roi = src_roi;
+			gdst_roi = gdst(Rect(dstx, dsty, roicols, roirows));
+			gdstCoor_roi = gdstCoor(Rect(dstx, dsty, roicols, roirows));
+
+			// "kernel" window: the ocl call only.
+			const double kernelStart = (double)cvGetTickCount();
+			cv::ocl::meanShiftProc(gsrc_roi, gdst_roi, gdstCoor_roi, sp, sr, crit);
+			const double kernelElapsed = (double)cvGetTickCount() - kernelStart;
+
+			// Only gdstCoor is downloaded inside the timed window; gdst stays on the device.
+			cv::Mat cpu_gdstCoor;
+			gdstCoor.download(cpu_gdstCoor);
+			const double gpuElapsed = (double)cvGetTickCount() - gpuStart;
+
+			if(iter == 0)
+				continue;
+
+			gpuTicks += gpuElapsed;
+			kernelTicks += kernelElapsed;
+		}
+
+		// Turns an accumulated tick total into the per-iteration average printed as "ms".
+		const double ticksToAvgMs = (double)cvGetTickFrequency()* LOOP_TIMES *1000.;
+
+		cout << (roiMode == 0 ? "no roi\n" : "with roi\n");
+		cout << "average gpu runtime is  " << gpuTicks/ticksToAvgMs << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << kernelTicks/ticksToAvgMs << "ms" << endl;
+	}
+#else
+	// PRINT_KERNEL_RUN_TIME build: no host-side timing, only a label before each ocl call.
+	for(int roiMode = 0; roiMode < 2; ++roiMode)
+	{
+		Has_roi(roiMode);
+
+		gsrc_roi = src_roi;
+		gdst_roi = gdst(Rect(dstx, dsty, roicols, roirows));
+		gdstCoor_roi = gdstCoor(Rect(dstx, dsty, roicols, roirows));
+
+		cout << (roiMode == 0 ? "no roi:" : "\nwith roi:");
+		cv::ocl::meanShiftProc(gsrc_roi, gdst_roi, gdstCoor_roi, sp, sr, crit);
+	}
+#endif
+
+}
+
+
+
+//************test*******************
+
+// equalizeHist: a single CV_8UC1 combination; the parameter layout comes from the
+// equalizeHist fixture, which is declared elsewhere in this file.
+INSTANTIATE_TEST_CASE_P(ImgprocTestBase, equalizeHist,
+	Combine(
+		ONE_TYPE(CV_8UC1),
+		NULL_TYPE,
+		ONE_TYPE(CV_8UC1),
+		NULL_TYPE,
+		NULL_TYPE,
+		Values(false))); // Values(false) is the reserved parameter
+
+//INSTANTIATE_TEST_CASE_P(ImgprocTestBase, bilateralFilter, Combine(
+//	ONE_TYPE(CV_8UC1),
+//	NULL_TYPE,
+//	ONE_TYPE(CV_8UC1),
+//	NULL_TYPE,
+//	NULL_TYPE,
+//	Values(false))); // Values(false) is the reserved parameter
+//
+//
+//INSTANTIATE_TEST_CASE_P(ImgprocTestBase, CopyMakeBorder, Combine(
+//	Values(CV_8UC1, CV_8UC4/*, CV_32SC1*/),
+//	NULL_TYPE,
+//	Values(CV_8UC1,CV_8UC4/*,CV_32SC1*/),
+//	NULL_TYPE,
+//	NULL_TYPE,
+//	Values(false))); // Values(false) is the reserved parameter
+
+//INSTANTIATE_TEST_CASE_P(ImgprocTestBase, cornerMinEigenVal, Combine(
+//	Values(CV_8UC1,CV_32FC1),
+//	NULL_TYPE,
+//	ONE_TYPE(CV_32FC1),
+//	NULL_TYPE,
+//	NULL_TYPE,
+//	Values(false))); // Values(false) is the reserved parameter
+//
+//INSTANTIATE_TEST_CASE_P(ImgprocTestBase, cornerHarris, Combine(
+//	Values(CV_8UC1,CV_32FC1),
+//	NULL_TYPE,
+//	ONE_TYPE(CV_32FC1),
+//	NULL_TYPE,
+//	NULL_TYPE,
+//	Values(false))); // Values(false) is the reserved parameter
+
+
+// integral: a single type combination; the parameter layout comes from the integral
+// fixture, which is declared elsewhere in this file.
+INSTANTIATE_TEST_CASE_P(ImgprocTestBase, integral,
+	Combine(
+		ONE_TYPE(CV_8UC1),
+		NULL_TYPE,
+		ONE_TYPE(CV_32SC1),
+		ONE_TYPE(CV_32FC1),
+		NULL_TYPE,
+		Values(false))); // Values(false) is the reserved parameter
+
+// warpAffine: four matrix types x six interpolation settings (three modes, each with and
+// without WARP_INVERSE_MAP). The interpolation flags are passed through MatType.
+INSTANTIATE_TEST_CASE_P(Imgproc, WarpAffine,
+	Combine(
+		Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
+		Values(
+			(MatType)cv::INTER_NEAREST,
+			(MatType)cv::INTER_LINEAR,
+			(MatType)cv::INTER_CUBIC,
+			(MatType)(cv::INTER_NEAREST | cv::WARP_INVERSE_MAP),
+			(MatType)(cv::INTER_LINEAR | cv::WARP_INVERSE_MAP),
+			(MatType)(cv::INTER_CUBIC | cv::WARP_INVERSE_MAP))));
+
+
+// warpPerspective: same type / interpolation grid as warpAffine.
+INSTANTIATE_TEST_CASE_P(Imgproc, WarpPerspective,
+	Combine(
+		Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
+		Values(
+			(MatType)cv::INTER_NEAREST,
+			(MatType)cv::INTER_LINEAR,
+			(MatType)cv::INTER_CUBIC,
+			(MatType)(cv::INTER_NEAREST | cv::WARP_INVERSE_MAP),
+			(MatType)(cv::INTER_LINEAR | cv::WARP_INVERSE_MAP),
+			(MatType)(cv::INTER_CUBIC | cv::WARP_INVERSE_MAP))));
+
+
+// resize: default-constructed dsize, so the fixture derives the destination size from fx/fy.
+INSTANTIATE_TEST_CASE_P(Imgproc, Resize,
+	Combine(
+		Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
+		Values(cv::Size()),
+		Values(0.5/*, 1.5, 2*/),
+		Values(0.5/*, 1.5, 2*/),
+		Values((MatType)cv::INTER_NEAREST, (MatType)cv::INTER_LINEAR)));
+
+
+// threshold: two matrix types x five threshold operations.
+INSTANTIATE_TEST_CASE_P(Imgproc, Threshold,
+	Combine(
+		Values(CV_8UC1, CV_32FC1),
+		Values(
+			ThreshOp(cv::THRESH_BINARY),
+			ThreshOp(cv::THRESH_BINARY_INV),
+			ThreshOp(cv::THRESH_TRUNC),
+			ThreshOp(cv::THRESH_TOZERO),
+			ThreshOp(cv::THRESH_TOZERO_INV))));
+
+// meanShiftFiltering: one CV_8UC4 configuration with sp=5, sr=6.
+INSTANTIATE_TEST_CASE_P(Imgproc, meanShiftFiltering,
+	Combine(
+		ONE_TYPE(CV_8UC4),
+		ONE_TYPE(CV_16SC2), // not used by meanShiftFiltering
+		Values(5),
+		Values(6),
+		Values(cv::TermCriteria(cv::TermCriteria::COUNT + cv::TermCriteria::EPS, 5, 1))
+	));
+
+// meanShiftProc: same configuration; CV_16SC2 is the coordinate-map type.
+INSTANTIATE_TEST_CASE_P(Imgproc, meanShiftProc,
+	Combine(
+		ONE_TYPE(CV_8UC4),
+		ONE_TYPE(CV_16SC2),
+		Values(5),
+		Values(6),
+		Values(cv::TermCriteria(cv::TermCriteria::COUNT+cv::TermCriteria::EPS, 5, 1))
+	));
+
+
+#endif // HAVE_OPENCL
diff --git a/modules/ocl/perf/test_matrix_operation.cpp b/modules/ocl/perf/test_matrix_operation.cpp
new file mode 100644
index 000000000..cc9a142a6
--- /dev/null
+++ b/modules/ocl/perf/test_matrix_operation.cpp
@@ -0,0 +1,616 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Jia Haipeng, jiahaipeng95@gmail.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+#ifdef HAVE_OPENCL
+
+using namespace cvtest;
+using namespace testing;
+using namespace std;
+using namespace cv::ocl;
+////////////////////////////////convertTo////////////////////////////////////////////////
+// Fixture for the convertTo benchmark.
+// Parameters: source matrix type and destination matrix type.
+PARAM_TEST_CASE(ConvertToTestBase, MatType, MatType)
+{
+	int type;
+	int dst_type;
+
+	// host source and destination (whole matrices)
+	cv::Mat mat;
+	cv::Mat dst;
+
+	// ROI geometry, filled in by Has_roi()
+	int roicols;
+	int roirows;
+	int srcx;
+	int srcy;
+	int dstx;
+	int dsty;
+
+	// host ROI views into mat / dst
+	cv::Mat mat_roi;
+	cv::Mat dst_roi;
+	std::vector<cv::ocl::Info> oclinfo;
+	// whole destination on the device
+	cv::ocl::oclMat gdst_whole;
+
+	// device source and destination ROI, assigned by the test body
+	cv::ocl::oclMat gmat;
+	cv::ocl::oclMat gdst;
+
+	// Reads the test parameters, creates random host matrices and initialises the OpenCL device.
+	virtual void SetUp()
+	{
+		type     = GET_PARAM(0);
+		dst_type = GET_PARAM(1);
+
+		cv::RNG& rng = TS::ptr()->get_rng();
+		cv::Size size(MWIDTH, MHEIGHT);
+
+		mat = randomMat(rng, size, type, 5, 16, false);
+		// The destination is created with dst_type (it used to be `type`): convertTo is
+		// called with dst_type, so a destination of a different type would have to be
+		// reallocated on every call and the ROI of dst / gdst_whole would never receive
+		// the result that the timed download transfers.
+		dst = randomMat(rng, size, dst_type, 5, 16, false);
+		int devnums = getDevice(oclinfo);
+		CV_Assert(devnums > 0);
+		// to use a non-default device, select it here
+		//setDevice(oclinfo[0]);
+		setBinpath(CLBINPATH);
+	}
+
+	// Selects the host regions the test works on; device matrices are set up by the test body.
+	// b != 0: ROIs start at (1,1) and are one row and one column smaller than mat;
+	// b == 0: ROIs cover the whole matrices.
+	void Has_roi(int b)
+	{
+		const int offset = b ? 1 : 0;
+
+		roicols = mat.cols - offset;
+		roirows = mat.rows - offset;
+		srcx = offset;
+		srcy = offset;
+		dstx = offset;
+		dsty = offset;
+
+		mat_roi = mat(Rect(srcx,srcy,roicols,roirows));
+		dst_roi = dst(Rect(dstx,dsty,roicols,roirows));
+	}
+};
+
+
+// Parameterized fixture for the convertTo benchmark; everything is inherited from ConvertToTestBase.
+struct ConvertTo : public ConvertToTestBase {};
+
+// Times Mat::convertTo against oclMat::convertTo, first on whole matrices and then on ROIs.
+TEST_P(ConvertTo, Accuracy)
+{
+#ifndef PRINT_KERNEL_RUN_TIME
+	// roiMode 0: whole matrices; roiMode 1: the ROIs selected by Has_roi(1).
+	for(int roiMode = 0; roiMode < 2; ++roiMode)
+	{
+		double cpuTicks = 0;
+		double gpuTicks = 0;
+		double kernelTicks = 0;
+
+		// LOOP_TIMES+1 iterations: the first one is excluded from the totals
+		// (presumably to keep one-time setup cost out of the averages).
+		for(int iter = 0; iter < LOOP_TIMES+1; ++iter)
+		{
+			Has_roi(roiMode);
+
+			// CPU reference.
+			const double cpuStart = (double)cvGetTickCount();
+			mat_roi.convertTo(dst_roi, dst_type);
+			const double cpuElapsed = (double)cvGetTickCount() - cpuStart;
+
+			// "gpu" window: upload + ocl call + download.
+			const double gpuStart = (double)cvGetTickCount();
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+			gmat = mat_roi;
+
+			// "kernel" window: the ocl call only.
+			const double kernelStart = (double)cvGetTickCount();
+			gmat.convertTo(gdst, dst_type);
+			const double kernelElapsed = (double)cvGetTickCount() - kernelStart;
+
+			cv::Mat cpu_dst;
+			gdst_whole.download(cpu_dst);
+			const double gpuElapsed = (double)cvGetTickCount() - gpuStart;
+
+			if(iter == 0)
+				continue;
+
+			gpuTicks += gpuElapsed;
+			cpuTicks += cpuElapsed;
+			kernelTicks += kernelElapsed;
+		}
+
+		// Turns an accumulated tick total into the per-iteration average printed as "ms".
+		const double ticksToAvgMs = (double)cvGetTickFrequency()* LOOP_TIMES *1000.;
+
+		cout << (roiMode == 0 ? "no roi\n" : "with roi\n");
+		cout << "average cpu runtime is  " << cpuTicks/ticksToAvgMs << "ms" << endl;
+		cout << "average gpu runtime is  " << gpuTicks/ticksToAvgMs << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << kernelTicks/ticksToAvgMs << "ms" << endl;
+	}
+#else
+	// PRINT_KERNEL_RUN_TIME build: no host-side timing, only a label before each ocl call.
+	for(int roiMode = 0; roiMode < 2; ++roiMode)
+	{
+		Has_roi(roiMode);
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+
+		gmat = mat_roi;
+		cout << (roiMode == 0 ? "no roi:" : "\nwith roi:");
+		gmat.convertTo(gdst, dst_type);
+	}
+#endif
+
+}
+
+
+///////////////////////////////////////////copyto/////////////////////////////////////////////////////////////
+
+// Fixture for the copyTo benchmarks (with and without a mask).
+// Parameters: matrix type and a bool that this fixture does not read.
+PARAM_TEST_CASE(CopyToTestBase, MatType, bool)
+{
+	int type;
+
+	// host source, mask and destination (whole matrices)
+	cv::Mat mat;
+	cv::Mat mask;
+	cv::Mat dst;
+
+	// ROI geometry, filled in by Has_roi()
+	int roicols;
+	int roirows;
+	int srcx;
+	int srcy;
+	int dstx;
+	int dsty;
+	int maskx;
+	int masky;
+
+	// host ROI views into mat / mask / dst
+	cv::Mat mat_roi;
+	cv::Mat mask_roi;
+	cv::Mat dst_roi;
+	std::vector<cv::ocl::Info> oclinfo;
+	// whole destination on the device
+	cv::ocl::oclMat gdst_whole;
+
+	// device source, destination ROI and mask, assigned by the test bodies
+	cv::ocl::oclMat gmat;
+	cv::ocl::oclMat gdst;
+	cv::ocl::oclMat gmask;
+
+	// Reads the test parameters, creates random host matrices and initialises the OpenCL device.
+	virtual void SetUp()
+	{
+		type = GET_PARAM(0);
+
+		cv::RNG& rng = TS::ptr()->get_rng();
+		cv::Size size(MWIDTH, MHEIGHT);
+
+		mat = randomMat(rng, size, type, 5, 16, false);
+		dst  = randomMat(rng, size, type, 5, 16, false);
+		mask = randomMat(rng, size, CV_8UC1, 0, 2,  false);
+
+		// Binarise the mask to 0/255. The last argument is a threshold type, not a matrix
+		// type: the original passed CV_8UC1, which only worked because it has the same
+		// numeric value (0) as cv::THRESH_BINARY.
+		cv::threshold(mask, mask, 0.5, 255., cv::THRESH_BINARY);
+		int devnums = getDevice(oclinfo);
+		CV_Assert(devnums > 0);
+		// to use a non-default device, select it here
+		//setDevice(oclinfo[0]);
+		setBinpath(CLBINPATH);
+	}
+
+	// Selects the host regions the tests work on; device matrices are set up by the test bodies.
+	// b != 0: ROIs start at (1,1) and are one row and one column smaller than mat;
+	// b == 0: ROIs cover the whole matrices.
+	void Has_roi(int b)
+	{
+		const int offset = b ? 1 : 0;
+
+		roicols = mat.cols - offset;
+		roirows = mat.rows - offset;
+		srcx = offset;
+		srcy = offset;
+		dstx = offset;
+		dsty = offset;
+		maskx = offset;
+		masky = offset;
+
+		mat_roi = mat(Rect(srcx,srcy,roicols,roirows));
+		mask_roi = mask(Rect(maskx,masky,roicols,roirows));
+		dst_roi  = dst(Rect(dstx,dsty,roicols,roirows));
+	}
+};
+
+// Parameterized fixture for the copyTo benchmarks; everything is inherited from CopyToTestBase.
+struct CopyTo : public CopyToTestBase {};
+
+// Times Mat::copyTo against oclMat::copyTo without a mask, on whole matrices and on ROIs.
+TEST_P(CopyTo, Without_mask)
+{
+#ifndef PRINT_KERNEL_RUN_TIME
+	// roiMode 0: whole matrices; roiMode 1: the ROIs selected by Has_roi(1).
+	for(int roiMode = 0; roiMode < 2; ++roiMode)
+	{
+		double cpuTicks = 0;
+		double gpuTicks = 0;
+		double kernelTicks = 0;
+
+		// LOOP_TIMES+1 iterations: the first one is excluded from the totals
+		// (presumably to keep one-time setup cost out of the averages).
+		for(int iter = 0; iter < LOOP_TIMES+1; ++iter)
+		{
+			Has_roi(roiMode);
+
+			// CPU reference.
+			const double cpuStart = (double)cvGetTickCount();
+			mat_roi.copyTo(dst_roi);
+			const double cpuElapsed = (double)cvGetTickCount() - cpuStart;
+
+			// "gpu" window: upload + ocl call + download.
+			const double gpuStart = (double)cvGetTickCount();
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+			gmat = mat_roi;
+
+			// "kernel" window: the ocl call only.
+			const double kernelStart = (double)cvGetTickCount();
+			gmat.copyTo(gdst);
+			const double kernelElapsed = (double)cvGetTickCount() - kernelStart;
+
+			cv::Mat cpu_dst;
+			gdst_whole.download(cpu_dst);
+			const double gpuElapsed = (double)cvGetTickCount() - gpuStart;
+
+			if(iter == 0)
+				continue;
+
+			gpuTicks += gpuElapsed;
+			cpuTicks += cpuElapsed;
+			kernelTicks += kernelElapsed;
+		}
+
+		// Turns an accumulated tick total into the per-iteration average printed as "ms".
+		const double ticksToAvgMs = (double)cvGetTickFrequency()* LOOP_TIMES *1000.;
+
+		cout << (roiMode == 0 ? "no roi\n" : "with roi\n");
+		cout << "average cpu runtime is  " << cpuTicks/ticksToAvgMs << "ms" << endl;
+		cout << "average gpu runtime is  " << gpuTicks/ticksToAvgMs << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << kernelTicks/ticksToAvgMs << "ms" << endl;
+	}
+#else
+	// PRINT_KERNEL_RUN_TIME build: no host-side timing, only a label before each ocl call.
+	for(int roiMode = 0; roiMode < 2; ++roiMode)
+	{
+		Has_roi(roiMode);
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+
+		gmat = mat_roi;
+
+		cout << (roiMode == 0 ? "no roi:" : "\nwith roi:");
+		gmat.copyTo(gdst);
+	}
+#endif
+}
+
+// Times Mat::copyTo against oclMat::copyTo with a mask, on whole matrices and on ROIs.
+TEST_P(CopyTo, With_mask)
+{
+#ifndef PRINT_KERNEL_RUN_TIME
+	// roiMode 0: whole matrices; roiMode 1: the ROIs selected by Has_roi(1).
+	for(int roiMode = 0; roiMode < 2; ++roiMode)
+	{
+		double cpuTicks = 0;
+		double gpuTicks = 0;
+		double kernelTicks = 0;
+
+		// LOOP_TIMES+1 iterations: the first one is excluded from the totals
+		// (presumably to keep one-time setup cost out of the averages).
+		for(int iter = 0; iter < LOOP_TIMES+1; ++iter)
+		{
+			Has_roi(roiMode);
+
+			// CPU reference.
+			const double cpuStart = (double)cvGetTickCount();
+			mat_roi.copyTo(dst_roi,mask_roi);
+			const double cpuElapsed = (double)cvGetTickCount() - cpuStart;
+
+			// "gpu" window: uploads (destination, source, mask) + ocl call + download.
+			const double gpuStart = (double)cvGetTickCount();
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+			gmat = mat_roi;
+			gmask = mask_roi;
+
+			// "kernel" window: the ocl call only.
+			const double kernelStart = (double)cvGetTickCount();
+			gmat.copyTo(gdst, gmask);
+			const double kernelElapsed = (double)cvGetTickCount() - kernelStart;
+
+			cv::Mat cpu_dst;
+			gdst_whole.download(cpu_dst);
+			const double gpuElapsed = (double)cvGetTickCount() - gpuStart;
+
+			if(iter == 0)
+				continue;
+
+			gpuTicks += gpuElapsed;
+			cpuTicks += cpuElapsed;
+			kernelTicks += kernelElapsed;
+		}
+
+		// Turns an accumulated tick total into the per-iteration average printed as "ms".
+		const double ticksToAvgMs = (double)cvGetTickFrequency()* LOOP_TIMES *1000.;
+
+		cout << (roiMode == 0 ? "no roi\n" : "with roi\n");
+		cout << "average cpu runtime is  " << cpuTicks/ticksToAvgMs << "ms" << endl;
+		cout << "average gpu runtime is  " << gpuTicks/ticksToAvgMs << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << kernelTicks/ticksToAvgMs << "ms" << endl;
+	}
+#else
+	// PRINT_KERNEL_RUN_TIME build: no host-side timing, only a label before each ocl call.
+	for(int roiMode = 0; roiMode < 2; ++roiMode)
+	{
+		Has_roi(roiMode);
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+
+		gmat = mat_roi;
+		gmask = mask_roi;
+
+		cout << (roiMode == 0 ? "no roi:" : "\nwith roi:");
+		gmat.copyTo(gdst, gmask);
+	}
+#endif
+}
+
+///////////////////////////////////////////setTo//////////////////////////////////////////////////////////////
+
+// Fixture for the setTo benchmarks (with and without a mask).
+// Parameters: matrix type and a bool that this fixture does not read.
+PARAM_TEST_CASE(SetToTestBase, MatType, bool)
+{
+	int type;
+	cv::Scalar val;
+
+	// host matrix that is written to, and its mask (whole matrices)
+	cv::Mat mat;
+	cv::Mat mask;
+
+	// ROI geometry, filled in by Has_roi()
+	int roicols;
+	int roirows;
+	int srcx;
+	int srcy;
+	int maskx;
+	int masky;
+
+	// host ROI views into mat / mask
+	cv::Mat mat_roi;
+	cv::Mat mask_roi;
+	std::vector<cv::ocl::Info> oclinfo;
+	// whole matrix on the device
+	cv::ocl::oclMat gmat_whole;
+
+	// device ROI and mask, assigned by the test bodies
+	cv::ocl::oclMat gmat;
+	cv::ocl::oclMat gmask;
+
+	// Reads the test parameters, creates random host data and initialises the OpenCL device.
+	virtual void SetUp()
+	{
+		type = GET_PARAM(0);
+
+		cv::RNG& rng = TS::ptr()->get_rng();
+		cv::Size size(MWIDTH, MHEIGHT);
+
+		mat = randomMat(rng, size, type, 5, 16, false);
+		mask = randomMat(rng, size, CV_8UC1, 0, 2,  false);
+
+		// Binarise the mask to 0/255. The last argument is a threshold type, not a matrix
+		// type: the original passed CV_8UC1, which only worked because it has the same
+		// numeric value (0) as cv::THRESH_BINARY.
+		cv::threshold(mask, mask, 0.5, 255., cv::THRESH_BINARY);
+		// Fill value: four independent components drawn from [-10, 10).
+		val = cv::Scalar(rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0));
+		int devnums = getDevice(oclinfo);
+		CV_Assert(devnums > 0);
+		// to use a non-default device, select it here
+		//setDevice(oclinfo[0]);
+		setBinpath(CLBINPATH);
+	}
+
+	// Selects the host regions the tests work on; device matrices are set up by the test bodies.
+	// b != 0: ROIs start at (1,1) and are one row and one column smaller than mat;
+	// b == 0: ROIs cover the whole matrices.
+	void Has_roi(int b)
+	{
+		const int offset = b ? 1 : 0;
+
+		roicols = mat.cols - offset;
+		roirows = mat.rows - offset;
+		srcx = offset;
+		srcy = offset;
+		maskx = offset;
+		masky = offset;
+
+		mat_roi = mat(Rect(srcx,srcy,roicols,roirows));
+		mask_roi = mask(Rect(maskx,masky,roicols,roirows));
+	}
+};
+
+// Parameterized fixture for the setTo benchmarks; everything is inherited from SetToTestBase.
+struct SetTo : public SetToTestBase {};
+
+// Times Mat::setTo against oclMat::setTo without a mask, on the whole matrix and on an ROI.
+TEST_P(SetTo, Without_mask)
+{
+#ifndef PRINT_KERNEL_RUN_TIME
+	// roiMode 0: whole matrix; roiMode 1: the ROI selected by Has_roi(1).
+	for(int roiMode = 0; roiMode < 2; ++roiMode)
+	{
+		double cpuTicks = 0;
+		double gpuTicks = 0;
+		double kernelTicks = 0;
+
+		// LOOP_TIMES+1 iterations: the first one is excluded from the totals
+		// (presumably to keep one-time setup cost out of the averages).
+		for(int iter = 0; iter < LOOP_TIMES+1; ++iter)
+		{
+			Has_roi(roiMode);
+
+			// CPU reference; this writes into mat through mat_roi.
+			const double cpuStart = (double)cvGetTickCount();
+			mat_roi.setTo(val);
+			const double cpuElapsed = (double)cvGetTickCount() - cpuStart;
+
+			// "gpu" window: upload + ocl call + download.
+			const double gpuStart = (double)cvGetTickCount();
+			gmat_whole = mat;
+			gmat = gmat_whole(Rect(srcx,srcy,roicols,roirows));
+
+			// "kernel" window: the ocl call only.
+			const double kernelStart = (double)cvGetTickCount();
+			gmat.setTo(val);
+			const double kernelElapsed = (double)cvGetTickCount() - kernelStart;
+
+			cv::Mat cpu_dst;
+			gmat_whole.download(cpu_dst);
+			const double gpuElapsed = (double)cvGetTickCount() - gpuStart;
+
+			if(iter == 0)
+				continue;
+
+			gpuTicks += gpuElapsed;
+			cpuTicks += cpuElapsed;
+			kernelTicks += kernelElapsed;
+		}
+
+		// Turns an accumulated tick total into the per-iteration average printed as "ms".
+		const double ticksToAvgMs = (double)cvGetTickFrequency()* LOOP_TIMES *1000.;
+
+		cout << (roiMode == 0 ? "no roi\n" : "with roi\n");
+		cout << "average cpu runtime is  " << cpuTicks/ticksToAvgMs << "ms" << endl;
+		cout << "average gpu runtime is  " << gpuTicks/ticksToAvgMs << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << kernelTicks/ticksToAvgMs << "ms" << endl;
+	}
+#else
+	// PRINT_KERNEL_RUN_TIME build: no host-side timing, only a label before each ocl call.
+	for(int roiMode = 0; roiMode < 2; ++roiMode)
+	{
+		Has_roi(roiMode);
+		gmat_whole = mat;
+		gmat = gmat_whole(Rect(srcx,srcy,roicols,roirows));
+
+		cout << (roiMode == 0 ? "no roi:" : "\nwith roi:");
+		gmat.setTo(val);
+	}
+#endif
+}
+
+TEST_P(SetTo, With_mask) // times cv::Mat::setTo(val, mask) against oclMat::setTo(val, mask); prints timings, asserts nothing
+{    
+#ifndef PRINT_KERNEL_RUN_TIME   // default path: measure with tick counts and print averages
+	double totalcputick=0;        // summed CPU ticks over the measured iterations
+	double totalgputick=0;        // summed GPU-path ticks, including the copy to and from the device
+	double totalgputick_kernel=0; // summed ticks around the oclMat::setTo call only
+	double t0=0;
+	double t1=0;
+	double t2=0;	
+	for(int k=0;k<2;k++){ // k == 0: whole matrix, k == 1: ROI (see Has_roi)
+		totalcputick=0;
+		totalgputick=0;
+		totalgputick_kernel=0;
+		for(int j = 0; j < LOOP_TIMES+1; j ++)
+		{
+			Has_roi(k);       
+			// iteration j == 0 is executed but excluded from the totals (see the continue below)
+			t0 = (double)cvGetTickCount();//cpu start
+			mat_roi.setTo(val, mask_roi);
+			t0 = (double)cvGetTickCount() - t0;//cpu end
+
+			t1 = (double)cvGetTickCount();//gpu start (this interval includes the copies to the device and the download)
+			gmat_whole = mat;
+			gmat = gmat_whole(Rect(srcx,srcy,roicols,roirows));
+			// the mask is copied to the device inside the t1 interval but before the t2 interval starts
+			gmask = mask_roi;
+			t2=(double)cvGetTickCount();//start of the interval around the device setTo call
+			gmat.setTo(val, gmask);
+			t2 = (double)cvGetTickCount() - t2;//end of that interval; NOTE(review): only meaningful if setTo blocks until done - confirm
+			cv::Mat cpu_dst;
+			gmat_whole.download(cpu_dst);//download
+			t1 = (double)cvGetTickCount() - t1;//gpu end
+			if(j == 0)
+				continue;
+			totalgputick=t1+totalgputick;
+			totalcputick=t0+totalcputick;	
+			totalgputick_kernel=t2+totalgputick_kernel;	
+
+		}
+		if(k==0){cout<<"no roi\n";}else{cout<<"with roi\n";};
+		cout << "average cpu runtime is  " << totalcputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime is  " << totalgputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << totalgputick_kernel/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	}
+#else
+	for(int j = 0; j < 2; j ++) // one untimed run per mode: j == 0 whole matrix, j == 1 ROI
+	{
+		Has_roi(j);
+		gmat_whole = mat;
+		gmat = gmat_whole(Rect(srcx,srcy,roicols,roirows));
+
+		gmask = mask_roi;
+		// presumably the library prints the kernel time itself in this mode - TODO confirm
+		if(j==0){cout<<"no roi:";}else{cout<<"\nwith roi:";};
+		gmat.setTo(val, gmask);
+	};
+#endif
+}
+
+//**********test************	
+
+INSTANTIATE_TEST_CASE_P(MatrixOperation, ConvertTo, Combine( // every pairing of the two type lists; presumably (source type, destination type) - fixture is outside this view
+						Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
+						Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4)));
+// CopyTo: one instance per matrix type; the second tuple element is a fixed placeholder.
+INSTANTIATE_TEST_CASE_P(MatrixOperation, CopyTo, Combine(
+						Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
+						Values(false))); // Values(false) is the reserved parameter
+// SetTo: one instance per matrix type; SetToTestBase::SetUp reads only GET_PARAM(0).
+INSTANTIATE_TEST_CASE_P(MatrixOperation, SetTo, Combine(
+						Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
+						Values(false))); // Values(false) is the reserved parameter
+#endif
diff --git a/modules/ocl/perf/test_split_merge.cpp b/modules/ocl/perf/test_split_merge.cpp
new file mode 100644
index 000000000..e3e8ee445
--- /dev/null
+++ b/modules/ocl/perf/test_split_merge.cpp
@@ -0,0 +1,455 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Jia Haipeng, jiahaipeng95@gmail.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+#ifdef HAVE_OPENCL
+
+using namespace cvtest;
+using namespace testing;
+using namespace std;
+using namespace cv::ocl;
+PARAM_TEST_CASE(MergeTestBase, MatType, int)
+{
+	int type;     // GET_PARAM(0); only used through CV_MAKETYPE(type, n) in SetUp
+	int channels; // GET_PARAM(1); channel count of the initial dst matrix
+
+	//four source matrices, each created with one channel in SetUp
+	cv::Mat mat1; 
+	cv::Mat mat2;
+	cv::Mat mat3;
+	cv::Mat mat4;
+
+	//destination matrix, created with `channels` channels in SetUp
+	cv::Mat dst;
+
+	// ROI geometry, recomputed by every Has_roi() call
+	int roicols;
+	int roirows;
+	int src1x;
+	int src1y;
+	int src2x;
+	int src2y;
+	int src3x;
+	int src3y;
+	int src4x;
+	int src4y;
+	int dstx;
+	int dsty;
+
+	//host-side ROI views into mat1..mat4
+	cv::Mat mat1_roi;
+	cv::Mat mat2_roi;
+	cv::Mat mat3_roi;
+	cv::Mat mat4_roi;
+
+	//host-side ROI view into dst
+	cv::Mat dst_roi;
+	std::vector<cv::ocl::Info> oclinfo;
+	//device copy of the whole dst (assigned by the test, not by this fixture)
+	cv::ocl::oclMat gdst_whole;
+
+	//device-side sources and destination ROI (assigned by the test, not by this fixture)
+	cv::ocl::oclMat gmat1;
+	cv::ocl::oclMat gmat2;
+	cv::ocl::oclMat gmat3;
+	cv::ocl::oclMat gmat4;
+	cv::ocl::oclMat gdst;
+
+	virtual void SetUp()
+	{
+		type = GET_PARAM(0);
+		channels = GET_PARAM(1);
+
+		cv::RNG& rng = TS::ptr()->get_rng();
+		cv::Size size(MWIDTH, MHEIGHT);
+		// all matrices are MWIDTH x MHEIGHT and filled by randomMat with the bounds 5 and 16
+		mat1 = randomMat(rng, size, CV_MAKETYPE(type, 1), 5, 16, false);
+		mat2 = randomMat(rng, size, CV_MAKETYPE(type, 1), 5, 16, false);
+		mat3 = randomMat(rng, size, CV_MAKETYPE(type, 1), 5, 16, false);
+		mat4 = randomMat(rng, size, CV_MAKETYPE(type, 1), 5, 16, false);
+		dst  = randomMat(rng, size, CV_MAKETYPE(type, channels), 5, 16, false);
+		int devnums = getDevice(oclinfo);
+		CV_Assert(devnums > 0);
+		//to use a non-default device, select it here
+		//setDevice(oclinfo[0]);
+		setBinpath(CLBINPATH);
+	}
+	void Has_roi(int b)
+	{
+		// b != 0 selects a fixed sub-rectangle, b == 0 selects the whole matrix.
+		if(b)
+		{
+			//fixed (not randomized) ROI: skip the first row and the first column
+			roicols =  mat1.cols-1; //one column narrower than the full matrix
+			roirows = mat1.rows-1;
+			src1x   = 1;
+			src1y   = 1;
+			src2x   = 1;
+			src2y   = 1;
+			src3x   = 1;
+			src3y   = 1;
+			src4x   = 1;
+			src4y   = 1;
+			dstx    = 1;
+			dsty    =1;
+
+		}else
+		{
+			roicols = mat1.cols;
+			roirows = mat1.rows;
+			src1x   = 0;
+			src1y   = 0;
+			src2x   = 0;
+			src2y   = 0;
+			src3x   = 0;
+			src3y   = 0;
+			src4x   = 0;
+			src4y   = 0;
+			dstx    = 0;
+			dsty    = 0;
+		};
+		// rebuild the host-side views; the device-side objects are assigned inside the test body
+		mat1_roi = mat1(Rect(src1x,src1y,roicols,roirows));
+		mat2_roi = mat2(Rect(src2x,src2y,roicols,roirows));
+		mat3_roi = mat3(Rect(src3x,src3y,roicols,roirows));
+		mat4_roi = mat4(Rect(src4x,src4y,roicols,roirows));
+
+
+		dst_roi = dst(Rect(dstx,dsty,roicols,roirows));
+	}
+
+};
+
+struct Merge : MergeTestBase {}; // empty derived fixture; this is the name used by TEST_P(Merge, ...) and INSTANTIATE_TEST_CASE_P below
+
+TEST_P(Merge, Accuracy) // despite the name, this only times cv::merge against cv::ocl::merge; it asserts nothing
+{    
+#ifndef PRINT_KERNEL_RUN_TIME   // default path: measure with tick counts and print averages
+	double totalcputick=0;        // summed CPU ticks over the measured iterations
+	double totalgputick=0;        // summed GPU-path ticks, including the copies to and from the device
+	double totalgputick_kernel=0; // summed ticks around the cv::ocl::merge call only
+	double t0=0;
+	double t1=0;
+	double t2=0;	
+	for(int k=0;k<2;k++){ // k == 0: whole matrix, k == 1: ROI (see Has_roi)
+		totalcputick=0;
+		totalgputick=0;
+		totalgputick_kernel=0;
+		for(int j = 0; j < LOOP_TIMES+1; j ++)
+		{
+			Has_roi(k);       
+			std::vector<cv::Mat> dev_src; // all four sources are always merged, whatever `channels` is
+			dev_src.push_back(mat1_roi);
+			dev_src.push_back(mat2_roi);
+			dev_src.push_back(mat3_roi);
+			dev_src.push_back(mat4_roi);   
+			t0 = (double)cvGetTickCount();//cpu start
+			cv::merge(dev_src, dst_roi);
+			t0 = (double)cvGetTickCount() - t0;//cpu end
+
+			t1 = (double)cvGetTickCount();//gpu start (this interval includes the copies to the device and the download)
+			gmat1 = mat1_roi;
+			gmat2 = mat2_roi;
+			gmat3 = mat3_roi;
+			gmat4 = mat4_roi;
+			gdst_whole = dst;
+			gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+			std::vector<cv::ocl::oclMat> dev_gsrc;
+			dev_gsrc.push_back(gmat1);
+			dev_gsrc.push_back(gmat2);
+			dev_gsrc.push_back(gmat3);
+			dev_gsrc.push_back(gmat4);
+			t2=(double)cvGetTickCount();//start of the interval around the device merge call
+			cv::ocl::merge(dev_gsrc, gdst); 
+			t2 = (double)cvGetTickCount() - t2;//end of that interval; NOTE(review): only meaningful if merge blocks until done - confirm
+			cv::Mat cpu_dst;
+			gdst_whole.download (cpu_dst);//download
+			t1 = (double)cvGetTickCount() - t1;//gpu end
+			// iteration j == 0 is executed but excluded from the totals
+			if(j == 0)
+				continue;
+
+			totalgputick=t1+totalgputick;
+			totalcputick=t0+totalcputick;	
+			totalgputick_kernel=t2+totalgputick_kernel;	
+
+		}
+		if(k==0){cout<<"no roi\n";}else{cout<<"with roi\n";};
+		cout << "average cpu runtime is  " << totalcputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime is  " << totalgputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << totalgputick_kernel/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	}
+#else
+	for(int j = 0; j < 2; j ++) // one untimed run per mode: j == 0 whole matrix, j == 1 ROI
+	{
+		Has_roi(j);
+		gmat1 = mat1_roi;
+		gmat2 = mat2_roi;
+		gmat3 = mat3_roi;
+		gmat4 = mat4_roi;
+		gdst_whole = dst;
+		gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
+		std::vector<cv::ocl::oclMat> dev_gsrc;
+		dev_gsrc.push_back(gmat1);
+		dev_gsrc.push_back(gmat2);
+		dev_gsrc.push_back(gmat3);
+		dev_gsrc.push_back(gmat4);
+		// presumably the library prints the kernel time itself in this mode - TODO confirm
+		if(j==0){cout<<"no roi:";}else{cout<<"\nwith roi:";};
+		cv::ocl::merge(dev_gsrc, gdst); 
+	};
+#endif
+}
+
+
+PARAM_TEST_CASE(SplitTestBase, MatType, int)
+{
+	int type;     // GET_PARAM(0); only used through CV_MAKETYPE(type, n) in SetUp
+	int channels; // GET_PARAM(1); channel count of the source matrix
+
+	//source matrix, created with `channels` channels in SetUp
+	cv::Mat mat; 
+
+	//four destination matrices, each created with one channel in SetUp
+	cv::Mat dst1;
+	cv::Mat dst2;
+	cv::Mat dst3;
+	cv::Mat dst4;
+
+	// ROI geometry, recomputed by every Has_roi() call
+	int roicols;
+	int roirows;
+	int srcx;
+	int srcy;
+	int dst1x;
+	int dst1y;
+	int dst2x;
+	int dst2y;
+	int dst3x;
+	int dst3y;
+	int dst4x;
+	int dst4y;
+
+	//host-side ROI view into mat
+	cv::Mat mat_roi;
+
+	//host-side ROI views into dst1..dst4
+	cv::Mat dst1_roi;
+	cv::Mat dst2_roi;
+	cv::Mat dst3_roi;
+	cv::Mat dst4_roi;
+	std::vector<cv::ocl::Info> oclinfo;
+	//device copies of the whole destinations (assigned by the test, not by this fixture)
+	cv::ocl::oclMat gdst1_whole;
+	cv::ocl::oclMat gdst2_whole;
+	cv::ocl::oclMat gdst3_whole;
+	cv::ocl::oclMat gdst4_whole;
+
+	//device-side source and destination ROIs (assigned by the test, not by this fixture)
+	cv::ocl::oclMat gmat;
+	cv::ocl::oclMat gdst1;
+	cv::ocl::oclMat gdst2;
+	cv::ocl::oclMat gdst3;
+	cv::ocl::oclMat gdst4;
+
+	virtual void SetUp()
+	{
+		type = GET_PARAM(0);
+		channels = GET_PARAM(1);
+
+		cv::RNG& rng = TS::ptr()->get_rng();
+		cv::Size size(MWIDTH, MHEIGHT);
+
+		mat  = randomMat(rng, size, CV_MAKETYPE(type, channels), 5, 16, false);
+		dst1 = randomMat(rng, size, CV_MAKETYPE(type, 1), 5, 16, false);
+		dst2 = randomMat(rng, size, CV_MAKETYPE(type, 1), 5, 16, false);
+		dst3 = randomMat(rng, size, CV_MAKETYPE(type, 1), 5, 16, false);
+		dst4 = randomMat(rng, size, CV_MAKETYPE(type, 1), 5, 16, false);
+		int devnums = getDevice(oclinfo);
+		CV_Assert(devnums > 0);
+		//to use a non-default device, select it here
+		//setDevice(oclinfo[0]);
+		setBinpath(CLBINPATH);
+	}
+
+	void Has_roi(int b)
+	{
+		// b != 0 selects a fixed sub-rectangle, b == 0 selects the whole matrix.
+		if(b)
+		{
+			//fixed (not randomized) ROI: skip the first row and the first column
+			roicols = mat.cols-1;
+			roirows = mat.rows-1;
+			srcx   = 1;
+			srcy   = 1; // was a second "srcx = 1", which left srcy stale or uninitialized
+			dst1x    = 1;
+			dst1y    =1;
+			dst2x    = 1;
+			dst2y    =1;
+			dst3x    = 1;
+			dst3y    =1;
+			dst4x    = 1;
+			dst4y    =1;
+		}else
+		{
+			roicols = mat.cols;
+			roirows = mat.rows;
+			srcx = 0;
+			srcy = 0;
+			dst1x = 0;
+			dst1y = 0;
+			dst2x    = 0;
+			dst2y    =0;
+			dst3x    = 0;
+			dst3y    =0;
+			dst4x    = 0;
+			dst4y    =0;
+		}
+
+		mat_roi = mat(Rect(srcx,srcy,roicols,roirows));
+		// rebuild the host-side views; the device-side objects are assigned inside the test body
+		dst1_roi = dst1(Rect(dst1x,dst1y,roicols,roirows));
+		dst2_roi = dst2(Rect(dst2x,dst2y,roicols,roirows));
+		dst3_roi = dst3(Rect(dst3x,dst3y,roicols,roirows));
+		dst4_roi = dst4(Rect(dst4x,dst4y,roicols,roirows));
+	}
+
+};
+
+struct Split :SplitTestBase {}; // empty derived fixture; this is the name used by TEST_P(Split, ...) and INSTANTIATE_TEST_CASE_P below
+
+TEST_P(Split, Accuracy) // despite the name, this only times cv::split against cv::ocl::split; it asserts nothing
+{    
+#ifndef PRINT_KERNEL_RUN_TIME   // default path: measure with tick counts and print averages
+	double totalcputick=0;        // summed CPU ticks over the measured iterations
+	double totalgputick=0;        // summed GPU-path ticks, including the copies to and from the device
+	double totalgputick_kernel=0; // summed ticks around the cv::ocl::split call only
+	double t0=0;
+	double t1=0;
+	double t2=0;	
+	for(int k=0;k<2;k++){ // k == 0: whole matrix, k == 1: ROI (see Has_roi)
+		totalcputick=0;
+		totalgputick=0;
+		totalgputick_kernel=0;
+		for(int j = 0; j < LOOP_TIMES+1; j ++)
+		{
+			Has_roi(k);       
+			cv::Mat         dev_dst[4]  = {dst1_roi, dst2_roi, dst3_roi, dst4_roi};
+			// dev_gdst is built further down, once gdst1..gdst4 refer to this iteration's destinations
+			t0 = (double)cvGetTickCount();//cpu start
+			cv::split(mat_roi, dev_dst);
+			t0 = (double)cvGetTickCount() - t0;//cpu end
+
+			t1 = (double)cvGetTickCount();//gpu start (this interval includes the copies to the device and the downloads)
+			gdst1_whole = dst1;
+			gdst1 = gdst1_whole(Rect(dst1x,dst1y,roicols,roirows));
+
+			gdst2_whole = dst2;
+			gdst2 = gdst2_whole(Rect(dst2x,dst2y,roicols,roirows));
+
+			gdst3_whole = dst3;
+			gdst3 = gdst3_whole(Rect(dst3x,dst3y,roicols,roirows));
+
+			gdst4_whole = dst4;
+			gdst4 = gdst4_whole(Rect(dst4x,dst4y,roicols,roirows));
+			cv::ocl::oclMat dev_gdst[4] = {gdst1, gdst2, gdst3, gdst4}; // copied after the assignments above, so split writes into the ROIs that are downloaded below
+			gmat = mat_roi;
+			t2=(double)cvGetTickCount();//start of the interval around the device split call
+			cv::ocl::split(gmat, dev_gdst); 
+			t2 = (double)cvGetTickCount() - t2;//end of that interval; NOTE(review): only meaningful if split blocks until done - confirm
+			cv::Mat cpu_dst1;
+			cv::Mat cpu_dst2;
+			cv::Mat cpu_dst3;
+			cv::Mat cpu_dst4;
+			gdst1_whole.download(cpu_dst1);
+			gdst2_whole.download(cpu_dst2);
+			gdst3_whole.download(cpu_dst3);
+			gdst4_whole.download(cpu_dst4);
+			t1 = (double)cvGetTickCount() - t1;//gpu end
+			if(j == 0) // iteration 0 is executed but excluded from the totals
+				continue;
+			totalgputick=t1+totalgputick;
+			totalcputick=t0+totalcputick;	
+			totalgputick_kernel=t2+totalgputick_kernel;	
+
+		}
+		if(k==0){cout<<"no roi\n";}else{cout<<"with roi\n";};
+		cout << "average cpu runtime is  " << totalcputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime is  " << totalgputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+		cout << "average gpu runtime without data transfer is  " << totalgputick_kernel/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
+	}
+#else
+	for(int j = 0; j < 2; j ++) // one untimed run per mode: j == 0 whole matrix, j == 1 ROI
+	{
+		Has_roi(j);
+		// no host-side split is run in this mode, so the unused host destination array is not built
+		gdst1_whole = dst1;
+		gdst1 = gdst1_whole(Rect(dst1x,dst1y,roicols,roirows));
+
+		gdst2_whole = dst2;
+		gdst2 = gdst2_whole(Rect(dst2x,dst2y,roicols,roirows));
+
+		gdst3_whole = dst3;
+		gdst3 = gdst3_whole(Rect(dst3x,dst3y,roicols,roirows));
+
+		gdst4_whole = dst4;
+		gdst4 = gdst4_whole(Rect(dst4x,dst4y,roicols,roirows));
+		gmat = mat_roi;
+		cv::ocl::oclMat dev_gdst[4] = {gdst1, gdst2, gdst3, gdst4}; // copied after the assignments above, not before
+		if(j==0){cout<<"no roi:";}else{cout<<"\nwith roi:";};
+		cv::ocl::split(gmat, dev_gdst); 
+	}
+#endif
+}
+
+//*************test*****************
+INSTANTIATE_TEST_CASE_P(SplitMerge, Merge, Combine( // (type, channels); the fixture only passes the type through CV_MAKETYPE(type, n)
+						Values(CV_8UC4, CV_32FC4), Values(1, 4)));
+// Split: (depth, channels) with the source having 1 or 4 channels
+INSTANTIATE_TEST_CASE_P(SplitMerge, Split , Combine(
+						Values(CV_8U, CV_32S, CV_32F), Values(1, 4)));     
+
+#endif // HAVE_OPENCL
diff --git a/modules/ocl/perf/utility.cpp b/modules/ocl/perf/utility.cpp
new file mode 100644
index 000000000..417f72f05
--- /dev/null
+++ b/modules/ocl/perf/utility.cpp
@@ -0,0 +1,265 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                        Intel License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of Intel Corporation may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+#define VARNAME(A) #A
+using namespace std;
+using namespace cv;
+using namespace cv::gpu;
+using namespace cvtest;
+
+
+//std::string generateVarList(int first,...)
+//{
+//	vector<std::string> varname;
+//
+//	va_list argp;
+//	string s;
+//	stringstream ss;
+//	va_start(argp,first);
+//	int i=first;
+//	while(i!=-1)
+//	{
+//		ss<<i<<",";
+//		i=va_arg(argp,int);
+//	};
+//	s=ss.str();
+//	va_end(argp);
+//	return s;
+//};
+
+//std::string generateVarList(int& p1,int& p2)
+//{
+//	stringstream ss;
+//	ss<<VARNAME(p1)<<":"<<src1x<<","<<VARNAME(p2)<<":"<<src1y;
+//	return ss.str();
+//};
+
+int randomInt(int minVal, int maxVal)
+{
+    // Forward to uniform() on the RNG handed out by the test system.
+    return TS::ptr()->get_rng().uniform(minVal, maxVal);
+}
+
+double randomDouble(double minVal, double maxVal)
+{
+    // Forward to uniform() on the RNG handed out by the test system.
+    return TS::ptr()->get_rng().uniform(minVal, maxVal);
+}
+
+// Builds a Size from two separate randomInt(minVal, maxVal) draws (width, height).
+Size randomSize(int minVal, int maxVal) {
+    return Size(randomInt(minVal, maxVal), randomInt(minVal, maxVal));
+}
+
+// Builds a Scalar whose four components are separate randomDouble(minVal, maxVal) draws.
+Scalar randomScalar(double minVal, double maxVal) {
+    return Scalar(randomDouble(minVal, maxVal), randomDouble(minVal, maxVal), randomDouble(minVal, maxVal), randomDouble(minVal, maxVal));
+}
+
+Mat randomMat(Size size, int type, double minVal, double maxVal) {
+    RNG& sharedRng = TS::ptr()->get_rng(); // same RNG the other random* helpers use
+    return randomMat(sharedRng, size, type, minVal, maxVal, false); // six-argument overload, last flag fixed to false
+}
+
+
+
+
+
+
+
+/*
+void showDiff(InputArray gold_, InputArray actual_, double eps)
+{
+    Mat gold;
+    if (gold_.kind() == _InputArray::MAT)
+        gold = gold_.getMat();
+    else
+        gold_.getGpuMat().download(gold);
+
+    Mat actual;
+    if (actual_.kind() == _InputArray::MAT)
+        actual = actual_.getMat();
+    else
+        actual_.getGpuMat().download(actual);
+
+    Mat diff;
+    absdiff(gold, actual, diff);
+    threshold(diff, diff, eps, 255.0, cv::THRESH_BINARY);
+
+    namedWindow("gold", WINDOW_NORMAL);
+    namedWindow("actual", WINDOW_NORMAL);
+    namedWindow("diff", WINDOW_NORMAL);
+
+    imshow("gold", gold);
+    imshow("actual", actual);
+    imshow("diff", diff);
+
+    waitKey();
+}
+*/
+
+/*
+bool supportFeature(const DeviceInfo& info, FeatureSet feature)
+{
+    return TargetArchs::builtWith(feature) && info.supports(feature);
+}
+
+const vector<DeviceInfo>& devices()
+{
+    static vector<DeviceInfo> devs;
+    static bool first = true;
+
+    if (first)
+    {
+        int deviceCount = getCudaEnabledDeviceCount();
+
+        devs.reserve(deviceCount);
+
+        for (int i = 0; i < deviceCount; ++i)
+        {
+            DeviceInfo info(i);
+            if (info.isCompatible())
+                devs.push_back(info);
+        }
+
+        first = false;
+    }
+
+    return devs;
+}
+
+vector<DeviceInfo> devices(FeatureSet feature)
+{
+    const vector<DeviceInfo>& d = devices();
+    
+    vector<DeviceInfo> devs_filtered;
+
+    if (TargetArchs::builtWith(feature))
+    {
+        devs_filtered.reserve(d.size());
+
+        for (size_t i = 0, size = d.size(); i < size; ++i)
+        {
+            const DeviceInfo& info = d[i];
+
+            if (info.supports(feature))
+                devs_filtered.push_back(info);
+        }
+    }
+
+    return devs_filtered;
+}
+*/
+
+vector<MatType> types(int depth_start, int depth_end, int cn_start, int cn_end)
+{
+    // Returns CV_MAKETYPE(depth, cn) for every depth in [depth_start, depth_end] and cn in [cn_start, cn_end], depth-major.
+    vector<MatType> v;
+    const int depth_count = depth_end - depth_start + 1;
+    const int cn_count = cn_end - cn_start + 1;
+    // Reserve only for a non-empty range: a negative product would convert to a huge size_t and make reserve() throw.
+    if (depth_count > 0 && cn_count > 0)
+        v.reserve(static_cast<size_t>(depth_count) * cn_count);
+
+    for (int depth = depth_start; depth <= depth_end; ++depth)
+        for (int cn = cn_start; cn <= cn_end; ++cn)
+            v.push_back(CV_MAKETYPE(depth, cn));
+
+    return v;
+}
+
+const vector<MatType>& all_types()
+{
+    // Function-local static: initialized from types(CV_8U, CV_64F, 1, 4) and returned by reference.
+    static vector<MatType> cached = types(CV_8U, CV_64F, 1, 4);
+    return cached;
+}
+
+Mat readImage(const string& fileName, int flags) {
+    const string fullPath = string(cvtest::TS::ptr()->get_data_path()) + fileName; // test-data path + relative name
+    return imread(fullPath, flags);
+}
+
+Mat readImageType(const string& fname, int type)
+{
+    const int channelCount = CV_MAT_CN(type); // single-channel types are read as grayscale, all others as colour
+    Mat image = readImage(fname, channelCount == 1 ? IMREAD_GRAYSCALE : IMREAD_COLOR);
+    if (channelCount == 4) {
+        Mat withAlpha;
+        cvtColor(image, withAlpha, cv::COLOR_BGR2BGRA); // expand the colour image to four channels
+        swap(image, withAlpha);
+    }
+    image.convertTo(image, CV_MAT_DEPTH(type)); // finally convert to the depth of the requested type
+    return image;
+}
+
+double checkNorm(const Mat& m) {
+    const double infNorm = norm(m, NORM_INF); // NORM_INF norm of the single matrix
+    return infNorm;
+}
+
+double checkNorm(const Mat& m1, const Mat& m2) {
+    const double infNormOfDiff = norm(m1, m2, NORM_INF); // NORM_INF norm taken over the pair (m1, m2)
+    return infNormOfDiff;
+}
+
+double checkSimilarity(const Mat& m1, const Mat& m2) {
+    Mat response; // CV_TM_CCORR_NORMED result; only element (0, 0) is read
+    matchTemplate(m1, m2, response, CV_TM_CCORR_NORMED);
+    const float deviation = response.at<float>(0, 0) - 1.f; // distance of that score from 1
+    return std::abs(deviation);
+}
+
+/*
+void cv::ocl::PrintTo(const DeviceInfo& info, ostream* os)
+{
+    (*os) << info.name();
+}
+*/
+
+// Writes a readable name for an Inverse value to *os:
+// "inverse" when it converts to true, "direct" otherwise.
+void PrintTo(const Inverse& inverse, std::ostream* os)
+{
+    const char* const label = inverse ? "inverse" : "direct";
+    (*os) << label;
+}
diff --git a/modules/ocl/perf/utility.hpp b/modules/ocl/perf/utility.hpp
new file mode 100644
index 000000000..0a0bfba6d
--- /dev/null
+++ b/modules/ocl/perf/utility.hpp
@@ -0,0 +1,177 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                        Intel License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of Intel Corporation may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_TEST_UTILITY_HPP__
+#define __OPENCV_TEST_UTILITY_HPP__
+//#define PRINT_KERNEL_RUN_TIME   // uncomment to make the perf tests take their #else (single untimed run) branch
+#ifdef PRINT_KERNEL_RUN_TIME
+#define LOOP_TIMES 1 // number of measured iterations; the timed loops run LOOP_TIMES+1 times and skip the first
+#else
+#define LOOP_TIMES 1 // NOTE(review): same value in both branches, so the #ifdef currently has no effect on it
+#endif
+#define MWIDTH 2557  // width passed to cv::Size(MWIDTH, MHEIGHT) by the fixtures
+#define MHEIGHT 2579 // height passed to cv::Size(MWIDTH, MHEIGHT) by the fixtures
+#define CLBINPATH ".\\" // argument of setBinpath(); NOTE(review): backslash separator looks Windows-specific - confirm on other platforms
+int randomInt(int minVal, int maxVal);             // rng.uniform(minVal, maxVal) on the RNG from TS::ptr()
+double randomDouble(double minVal, double maxVal); // same as randomInt, for double bounds
+// NOTE(review): both generateVarList implementations in utility.cpp are commented out; the declaration below has no definition there.
+//std::string generateVarList(int first,...);
+std::string generateVarList(int& p1,int& p2);
+cv::Size randomSize(int minVal, int maxVal);           // width and height are two randomInt draws
+cv::Scalar randomScalar(double minVal, double maxVal); // four randomDouble draws
+cv::Mat randomMat(cv::Size size, int type, double minVal = 0.0, double maxVal = 255.0); // forwards to the six-argument randomMat
+// NOTE(review): the showDiff implementation in utility.cpp is commented out; the declaration below has no definition there.
+void showDiff(cv::InputArray gold, cv::InputArray actual, double eps);
+
+//! (disabled) return true if device supports specified feature and gpu module was built with support for the feature.
+//bool supportFeature(const cv::gpu::DeviceInfo& info, cv::gpu::FeatureSet feature);
+
+//! (disabled) return all devices compatible with current gpu module build.
+//const std::vector<cv::ocl::DeviceInfo>& devices();
+//! (disabled) return all devices compatible with current gpu module build which support specified feature.
+//std::vector<cv::ocl::DeviceInfo> devices(cv::gpu::FeatureSet feature);
+
+//! read image from testdata folder: fileName is appended to cvtest::TS::ptr()->get_data_path() and passed to imread.
+cv::Mat readImage(const std::string& fileName, int flags = cv::IMREAD_COLOR);
+cv::Mat readImageType(const std::string& fname, int type); // readImage, then converted to the channel count and depth of type
+// checkNorm: NORM_INF norm of one matrix or of a pair; checkSimilarity: |CV_TM_CCORR_NORMED score at (0,0) - 1|
+double checkNorm(const cv::Mat& m);
+double checkNorm(const cv::Mat& m1, const cv::Mat& m2);
+double checkSimilarity(const cv::Mat& m1, const cv::Mat& m2);
+
+#define EXPECT_MAT_NORM(mat, eps) /* expects checkNorm(mat), the NORM_INF norm, to be <= eps */ \
+{ \
+    EXPECT_LE(checkNorm(cv::Mat(mat)), eps); \
+}
+
+//#define EXPECT_MAT_NEAR(mat1, mat2, eps) \
+//{ \
+//    ASSERT_EQ(mat1.type(), mat2.type()); \
+//    ASSERT_EQ(mat1.size(), mat2.size()); \
+//    EXPECT_LE(checkNorm(cv::Mat(mat1), cv::Mat(mat2)), eps); \
+//}
+
+#define EXPECT_MAT_NEAR(mat1, mat2, eps,s) /* s is streamed into the failure message of the norm check */ \
+{ \
+    ASSERT_EQ(mat1.type(), mat2.type()); /* type and size must match before the norm is taken */ \
+    ASSERT_EQ(mat1.size(), mat2.size()); \
+    EXPECT_LE(checkNorm(cv::Mat(mat1), cv::Mat(mat2)), eps)<<s; \
+}
+
+#define EXPECT_MAT_SIMILAR(mat1, mat2, eps) /* expects checkSimilarity(mat1, mat2) to be <= eps */ \
+{ \
+    ASSERT_EQ(mat1.type(), mat2.type()); /* type and size must match before the comparison */ \
+    ASSERT_EQ(mat1.size(), mat2.size()); \
+    EXPECT_LE(checkSimilarity(cv::Mat(mat1), cv::Mat(mat2)), eps); \
+}
+
+namespace cv 
+{ 
+    namespace ocl 
+    {
+        // (disabled) void PrintTo(const DeviceInfo& info, std::ostream* os); -- the definition in utility.cpp is commented out as well
+    }
+}
+
+using perf::MatDepth;
+using perf::MatType;
+
+//! return CV_MAKETYPE(depth, cn) for every depth in [depth_start, depth_end] and cn in [cn_start, cn_end] (both inclusive), depth-major.
+std::vector<MatType> types(int depth_start, int depth_end, int cn_start, int cn_end);
+
+//! return a reference to a function-local static vector holding all types (depth: CV_8U-CV_64F, channels: 1-4).
+const std::vector<MatType>& all_types();
+
+// Thin wrapper around a bool; it converts implicitly from and to bool.
+class Inverse
+{
+        bool val_;
+
+    public:
+        Inverse(bool val = false) : val_(val) {}
+
+        operator bool() const { return val_; }
+};
+
+void PrintTo(const Inverse& inverse, std::ostream* os); // prints "inverse" or "direct"; parameter renamed from the misleading "useRoi" to match the definition
+
+CV_ENUM(CmpCode, cv::CMP_EQ, cv::CMP_GT, cv::CMP_GE, cv::CMP_LT, cv::CMP_LE, cv::CMP_NE) // CV_ENUM/CV_FLAGS are not defined in this file; presumably they wrap the listed constants as printable test-parameter types - TODO confirm
+
+CV_ENUM(NormCode, cv::NORM_INF, cv::NORM_L1, cv::NORM_L2, cv::NORM_TYPE_MASK, cv::NORM_RELATIVE, cv::NORM_MINMAX)
+// NOTE(review): if FLIP_* are passed to cv::flip as flipCode, check the names against its convention (0: x-axis, >0: y-axis, <0: both).
+    enum {FLIP_BOTH = 0, FLIP_X = 1, FLIP_Y = -1};
+CV_ENUM(FlipCode, FLIP_BOTH, FLIP_X, FLIP_Y)
+
+CV_ENUM(ReduceOp, CV_REDUCE_SUM, CV_REDUCE_AVG, CV_REDUCE_MAX, CV_REDUCE_MIN)
+
+    CV_FLAGS(GemmFlags, cv::GEMM_1_T, cv::GEMM_2_T, cv::GEMM_3_T);
+
+CV_ENUM(MorphOp, cv::MORPH_OPEN, cv::MORPH_CLOSE, cv::MORPH_GRADIENT, cv::MORPH_TOPHAT, cv::MORPH_BLACKHAT)
+
+CV_ENUM(ThreshOp, cv::THRESH_BINARY, cv::THRESH_BINARY_INV, cv::THRESH_TRUNC, cv::THRESH_TOZERO, cv::THRESH_TOZERO_INV)
+
+CV_ENUM(Interpolation, cv::INTER_NEAREST, cv::INTER_LINEAR, cv::INTER_CUBIC)
+
+CV_ENUM(Border, cv::BORDER_REFLECT101, cv::BORDER_REPLICATE, cv::BORDER_CONSTANT, cv::BORDER_REFLECT, cv::BORDER_WRAP)
+
+CV_FLAGS(WarpFlags, cv::INTER_NEAREST, cv::INTER_LINEAR, cv::INTER_CUBIC, cv::WARP_INVERSE_MAP)
+
+CV_ENUM(TemplateMethod, cv::TM_SQDIFF, cv::TM_SQDIFF_NORMED, cv::TM_CCORR, cv::TM_CCORR_NORMED, cv::TM_CCOEFF, cv::TM_CCOEFF_NORMED)
+
+CV_FLAGS(DftFlags, cv::DFT_INVERSE, cv::DFT_SCALE, cv::DFT_ROWS, cv::DFT_COMPLEX_OUTPUT, cv::DFT_REAL_OUTPUT)
+// NOTE(review): run_perf_test is declared here but utility.cpp contains no definition for it.
+void  run_perf_test();
+
+#define PARAM_TEST_CASE(name, ...) struct name : testing::TestWithParam< std::tr1::tuple< __VA_ARGS__ > > // opens a fixture struct whose parameter is a tuple of the listed types
+// GET_PARAM(k): the k-th element of the current parameter tuple.
+#define GET_PARAM(k) std::tr1::get< k >(GetParam())
+// NOTE(review): the two macros below call devices(), whose declaration and definition are commented out, so using them will not compile.
+#define ALL_DEVICES testing::ValuesIn(devices())
+#define DEVICES(feature) testing::ValuesIn(devices(feature))
+// Parameter generators built on all_types() / types().
+#define ALL_TYPES testing::ValuesIn(all_types())
+#define TYPES(depth_start, depth_end, cn_start, cn_end) testing::ValuesIn(types(depth_start, depth_end, cn_start, cn_end))
+// Two fixed square sizes: 128x128 and 113x113.
+#define DIFFERENT_SIZES testing::Values(cv::Size(128, 128), cv::Size(113, 113))
+// Both values of the Inverse wrapper.
+#define DIRECT_INVERSE testing::Values(Inverse(false), Inverse(true))
+
+#endif // __OPENCV_TEST_UTILITY_HPP__