From 5539e85a1179c51da0b709fa48a516ca67370847 Mon Sep 17 00:00:00 2001
From: yao <bitwangyaoyao@gmail.com>
Date: Wed, 27 Mar 2013 12:04:48 +0800
Subject: [PATCH] use perf tests to replace the performance sample

---
 modules/ocl/perf/interpolation.hpp            |  120 -
 modules/ocl/perf/main.cpp                     |  200 +-
 modules/ocl/perf/perf_arithm.cpp              | 4977 +++--------------
 modules/ocl/perf/perf_blend.cpp               |  134 +-
 modules/ocl/perf/perf_brute_force_matcher.cpp |  150 +
 modules/ocl/perf/perf_canny.cpp               |  122 +-
 modules/ocl/perf/perf_color.cpp               |   91 +
 modules/ocl/perf/perf_columnsum.cpp           |  112 +-
 modules/ocl/perf/perf_fft.cpp                 |  105 +-
 modules/ocl/perf/perf_filters.cpp             | 1349 +----
 modules/ocl/perf/perf_gemm.cpp                |  105 +-
 modules/ocl/perf/perf_haar.cpp                |  198 +-
 modules/ocl/perf/perf_hog.cpp                 |  150 +-
 modules/ocl/perf/perf_imgproc.cpp             | 2683 +++------
 modules/ocl/perf/perf_match_template.cpp      |  278 +-
 modules/ocl/perf/perf_matrix_operation.cpp    |  781 +--
 modules/ocl/perf/perf_norm.cpp                |   84 +
 modules/ocl/perf/perf_pyrdown.cpp             |  126 +-
 modules/ocl/perf/perf_pyrlk.cpp               |  143 +
 modules/ocl/perf/perf_pyrup.cpp               |  109 +-
 modules/ocl/perf/perf_split_merge.cpp         |  519 +-
 modules/ocl/perf/precomp.cpp                  |  330 +-
 modules/ocl/perf/precomp.hpp                  |  386 +-
 modules/ocl/perf/utility.cpp                  |  265 -
 modules/ocl/perf/utility.hpp                  |  182 -
 samples/ocl/performance.cpp                   | 4397 ---------------
 26 files changed, 3791 insertions(+), 14305 deletions(-)
 delete mode 100644 modules/ocl/perf/interpolation.hpp
 create mode 100644 modules/ocl/perf/perf_brute_force_matcher.cpp
 create mode 100644 modules/ocl/perf/perf_color.cpp
 create mode 100644 modules/ocl/perf/perf_norm.cpp
 create mode 100644 modules/ocl/perf/perf_pyrlk.cpp
 delete mode 100644 modules/ocl/perf/utility.cpp
 delete mode 100644 modules/ocl/perf/utility.hpp
 delete mode 100644 samples/ocl/performance.cpp

diff --git a/modules/ocl/perf/interpolation.hpp b/modules/ocl/perf/interpolation.hpp
deleted file mode 100644
index fb89e701d..000000000
--- a/modules/ocl/perf/interpolation.hpp
+++ /dev/null
@@ -1,120 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                        Intel License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000, Intel Corporation, all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of Intel Corporation may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifndef __OPENCV_TEST_INTERPOLATION_HPP__
-#define __OPENCV_TEST_INTERPOLATION_HPP__
-
-template <typename T> T readVal(const cv::Mat &src, int y, int x, int c, int border_type, cv::Scalar borderVal = cv::Scalar())
-{
-    if (border_type == cv::BORDER_CONSTANT)
-        return (y >= 0 && y < src.rows && x >= 0 && x < src.cols) ? src.at<T>(y, x * src.channels() + c) : cv::saturate_cast<T>(borderVal.val[c]);
-
-    return src.at<T>(cv::borderInterpolate(y, src.rows, border_type), cv::borderInterpolate(x, src.cols, border_type) * src.channels() + c);
-}
-
-template <typename T> struct NearestInterpolator
-{
-    static T getValue(const cv::Mat &src, float y, float x, int c, int border_type, cv::Scalar borderVal = cv::Scalar())
-    {
-        return readVal<T>(src, cvFloor(y), cvFloor(x), c, border_type, borderVal);
-    }
-};
-
-template <typename T> struct LinearInterpolator
-{
-    static T getValue(const cv::Mat &src, float y, float x, int c, int border_type, cv::Scalar borderVal = cv::Scalar())
-    {
-        x -= 0.5f;
-        y -= 0.5f;
-
-        int x1 = cvFloor(x);
-        int y1 = cvFloor(y);
-        int x2 = x1 + 1;
-        int y2 = y1 + 1;
-
-        float res = 0;
-
-        res += readVal<T>(src, y1, x1, c, border_type, borderVal) * ((x2 - x) * (y2 - y));
-        res += readVal<T>(src, y1, x2, c, border_type, borderVal) * ((x - x1) * (y2 - y));
-        res += readVal<T>(src, y2, x1, c, border_type, borderVal) * ((x2 - x) * (y - y1));
-        res += readVal<T>(src, y2, x2, c, border_type, borderVal) * ((x - x1) * (y - y1));
-
-        return cv::saturate_cast<T>(res);
-    }
-};
-
-template <typename T> struct CubicInterpolator
-{
-    static float getValue(float p[4], float x)
-    {
-        return p[1] + 0.5 * x * (p[2] - p[0] + x * (2.0 * p[0] - 5.0 * p[1] + 4.0 * p[2] - p[3] + x * (3.0 * (p[1] - p[2]) + p[3] - p[0])));
-    }
-
-    static float getValue(float p[4][4], float x, float y)
-    {
-        float arr[4];
-
-        arr[0] = getValue(p[0], x);
-        arr[1] = getValue(p[1], x);
-        arr[2] = getValue(p[2], x);
-        arr[3] = getValue(p[3], x);
-
-        return getValue(arr, y);
-    }
-
-    static T getValue(const cv::Mat &src, float y, float x, int c, int border_type, cv::Scalar borderVal = cv::Scalar())
-    {
-        int ix = cvRound(x);
-        int iy = cvRound(y);
-
-        float vals[4][4] =
-        {
-            {readVal<T>(src, iy - 2, ix - 2, c, border_type, borderVal), readVal<T>(src, iy - 2, ix - 1, c, border_type, borderVal), readVal<T>(src, iy - 2, ix, c, border_type, borderVal), readVal<T>(src, iy - 2, ix + 1, c, border_type, borderVal)},
-            {readVal<T>(src, iy - 1, ix - 2, c, border_type, borderVal), readVal<T>(src, iy - 1, ix - 1, c, border_type, borderVal), readVal<T>(src, iy - 1, ix, c, border_type, borderVal), readVal<T>(src, iy - 1, ix + 1, c, border_type, borderVal)},
-            {readVal<T>(src, iy    , ix - 2, c, border_type, borderVal), readVal<T>(src, iy    , ix - 1, c, border_type, borderVal), readVal<T>(src, iy    , ix, c, border_type, borderVal), readVal<T>(src, iy    , ix + 1, c, border_type, borderVal)},
-            {readVal<T>(src, iy + 1, ix - 2, c, border_type, borderVal), readVal<T>(src, iy + 1, ix - 1, c, border_type, borderVal), readVal<T>(src, iy + 1, ix, c, border_type, borderVal), readVal<T>(src, iy + 1, ix + 1, c, border_type, borderVal)},
-        };
-
-        return cv::saturate_cast<T>(getValue(vals, (x - ix + 2.0) / 4.0, (y - iy + 2.0) / 4.0));
-    }
-};
-
-#endif // __OPENCV_TEST_INTERPOLATION_HPP__
diff --git a/modules/ocl/perf/main.cpp b/modules/ocl/perf/main.cpp
index e517a371d..2da17755e 100644
--- a/modules/ocl/perf/main.cpp
+++ b/modules/ocl/perf/main.cpp
@@ -7,12 +7,13 @@
 //  copy or use the software.
 //
 //
-//                        Intel License Agreement
+//                           License Agreement
 //                For Open Source Computer Vision Library
 //
-// Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
-//
+
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
@@ -21,12 +22,12 @@
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
+//     and/or other materials provided with the distribution.
 //
-//   * The name of Intel Corporation may not be used to endorse or promote products
+//   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
-// This software is provided by the copyright holders and contributors "as is" and
+// This software is provided by the copyright holders and contributors "as is" and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
@@ -41,129 +42,118 @@
 
 #include "precomp.hpp"
 
-#ifdef HAVE_OPENCL
-
-using namespace std;
-using namespace cv;
-using namespace cv::ocl;
-using namespace cvtest;
-using namespace testing;
-
-void print_info()
+int main(int argc, const char *argv[])
 {
-    printf("\n");
-#if defined _WIN32
-#   if defined _WIN64
-    puts("OS: Windows 64");
-#   else
-    puts("OS: Windows 32");
-#   endif
-#elif defined linux
-#   if defined _LP64
-    puts("OS: Linux 64");
-#   else
-    puts("OS: Linux 32");
-#   endif
-#elif defined __APPLE__
-#   if defined _LP64
-    puts("OS: Apple 64");
-#   else
-    puts("OS: Apple 32");
-#   endif
-#endif
+    vector<ocl::Info> oclinfo;
+    int num_devices = getDevice(oclinfo);
+
+    if (num_devices < 1)
+    {
+        cerr << "no device found\n";
+        return -1;
+    }
+
+    int devidx = 0;
+
+    for (size_t i = 0; i < oclinfo.size(); i++)
+    {
+        for (size_t j = 0; j < oclinfo[i].DeviceName.size(); j++)
+        {
+            printf("device %d: %s\n", devidx++, oclinfo[i].DeviceName[j].c_str());
+        }
+    }
+
+    redirectError(cvErrorCallback);
 
-}
-std::string workdir;
-int main(int argc, char **argv)
-{
-    TS::ptr()->init("ocl");
-    InitGoogleTest(&argc, argv);
     const char *keys =
-
-        "{ h | help     | false              | print help message }"
-
-        "{ w | workdir  | ../../../samples/c/| set working directory }"
-
-        "{ t | type     | gpu                | set device type:cpu or gpu}"
-
-        "{ p | platform | 0                  | set platform id }"
-
-        "{ d | device   | 0                  | set device id }";
-
-
+        "{ h | help    | false | print help message }"
+        "{ f | filter  |       | filter for test }"
+        "{ w | workdir |       | set working directory }"
+        "{ l | list    | false | show all tests }"
+        "{ d | device  | 0     | device id }"
+        "{ i | iters   | 10    | iteration count }"
+        "{ m | warmup  | 1     | gpu warm up iteration count}"
+        "{ t | xtop    | 1.1	  | xfactor top boundary}"
+        "{ b | xbottom | 0.9	  | xfactor bottom boundary}"
+        "{ v | verify  | false | only run gpu once to verify if problems occur}";
 
     CommandLineParser cmd(argc, argv, keys);
 
     if (cmd.get<bool>("help"))
-
     {
-
-        cout << "Avaible options besides goole test option:" << endl;
-
+        cout << "Available options:" << endl;
         cmd.printParams();
+        return 0;
     }
 
-    workdir = cmd.get<string>("workdir");
-
-    string type = cmd.get<string>("type");
-
-    unsigned int pid = cmd.get<unsigned int>("platform");
-
     int device = cmd.get<int>("device");
 
-
-    print_info();
-    // int flag = CVCL_DEVICE_TYPE_GPU;
-
-    // if(type == "cpu")
-
-    // {
-
-    //     flag = CVCL_DEVICE_TYPE_CPU;
-
-    // }
-    std::vector<cv::ocl::Info> oclinfo;
-    int devnums = getDevice(oclinfo);
-    if(devnums <= device || device < 0)
-
+    if (device < 0 || device >= num_devices)
     {
-
-        std::cout << "device invalid\n";
-
+        cerr << "Invalid device ID" << endl;
         return -1;
-
     }
 
-    if(pid >= oclinfo.size())
-
+    if (cmd.get<bool>("verify"))
     {
-
-        std::cout << "platform invalid\n";
-
-        return -1;
-
+        TestSystem::instance().setNumIters(1);
+        TestSystem::instance().setGPUWarmupIters(0);
+        TestSystem::instance().setCPUIters(0);
     }
 
-    if(pid != 0 || device != 0)
+    devidx = 0;
 
+    for (size_t i = 0; i < oclinfo.size(); i++)
     {
-
-        setDevice(oclinfo[pid], device);
-
+        for (size_t j = 0; j < oclinfo[i].DeviceName.size(); j++, devidx++)
+        {
+            if (device == devidx)
+            {
+                ocl::setDevice(oclinfo[i], (int)j);
+                TestSystem::instance().setRecordName(oclinfo[i].DeviceName[j]);
+                printf("\nuse %d: %s\n", devidx, oclinfo[i].DeviceName[j].c_str());
+                goto END_DEV;
+            }
+        }
     }
 
-    cout << "Device type:" << type << endl << "Device name:" << oclinfo[pid].DeviceName[device] << endl;
-    setBinpath(CLBINPATH);
-    return RUN_ALL_TESTS();
-}
+END_DEV:
 
-#else // DON'T HAVE_OPENCL
+    string filter = cmd.get<string>("filter");
+    string workdir = cmd.get<string>("workdir");
+    bool list = cmd.get<bool>("list");
+    int iters = cmd.get<bool>("verify") ? 1 : cmd.get<int>("iters"); // verify mode: exactly one run, so the later setNumIters() keeps it
+    int wu_iters = cmd.get<bool>("verify") ? 0 : cmd.get<int>("warmup"); // verify mode: no GPU warm-up, so the later setGPUWarmupIters() keeps it
+    double x_top = cmd.get<double>("xtop");
+    double x_bottom = cmd.get<double>("xbottom");
+
+    TestSystem::instance().setTopThreshold(x_top);
+    TestSystem::instance().setBottomThreshold(x_bottom);
+
+    if (!filter.empty())
+    {
+        TestSystem::instance().setTestFilter(filter);
+    }
+
+    if (!workdir.empty())
+    {
+        if (workdir[workdir.size() - 1] != '/' && workdir[workdir.size() - 1] != '\\')
+        {
+            workdir += '/';
+        }
+
+        TestSystem::instance().setWorkingDir(workdir);
+    }
+
+    if (list)
+    {
+        TestSystem::instance().setListMode(true);
+    }
+
+    TestSystem::instance().setNumIters(iters);
+    TestSystem::instance().setGPUWarmupIters(wu_iters);
+
+    TestSystem::instance().run();
 
-int main()
-{
-    printf("OpenCV was built without OpenCL support\n");
     return 0;
-}
-
-
-#endif // HAVE_OPENCL
+}
\ No newline at end of file
diff --git a/modules/ocl/perf/perf_arithm.cpp b/modules/ocl/perf/perf_arithm.cpp
index b7f82b685..e6e957641 100644
--- a/modules/ocl/perf/perf_arithm.cpp
+++ b/modules/ocl/perf/perf_arithm.cpp
@@ -1,4 +1,4 @@
-///////////////////////////////////////////////////////////////////////////////////////
+/*M///////////////////////////////////////////////////////////////////////////////////////
 //
 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
 //
@@ -10,17 +10,12 @@
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//    Niko Li, newlife20080214@gmail.com
-//    Jia Haipeng, jiahaipeng95@gmail.com
-//    Shengen Yan, yanshengen@gmail.com
-//    Jiang Liyuan,jlyuan001.good@163.com
-//    Rock Li, Rock.Li@amd.com
-//    Zailong Wu, bullet@yeah.net
+//    Fangfang Bai, fangfang@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -35,7 +30,7 @@
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
-// This software is provided by the copyright holders and contributors "as is" and
+// This software is provided by the copyright holders and contributors "as is" and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
@@ -48,4371 +43,1165 @@
 //
 //M*/
 
-
 #include "precomp.hpp"
-#include <iomanip>
-
-#ifdef HAVE_OPENCL
-using namespace cv;
-using namespace cv::ocl;
-using namespace cvtest;
-using namespace testing;
-using namespace std;
-PARAM_TEST_CASE(ArithmTestBase, MatType, bool)
+///////////// Lut ////////////////////////
+TEST(lut)
 {
-    int type;
-    cv::Scalar val;
+    Mat src, lut, dst;
+    ocl::oclMat d_src, d_lut, d_dst;
 
-    //src mat
-    cv::Mat mat1;
-    cv::Mat mat2;
-    cv::Mat mask;
-    cv::Mat dst;
-    cv::Mat dst1; //bak, for two outputs
+    int all_type[] = {CV_8UC1, CV_8UC3};
+    std::string type_name[] = {"CV_8UC1", "CV_8UC3"};
 
-    // set up roi
-    int roicols;
-    int roirows;
-    int src1x;
-    int src1y;
-    int src2x;
-    int src2y;
-    int dstx;
-    int dsty;
-    int maskx;
-    int masky;
-
-
-    //src mat with roi
-    cv::Mat mat1_roi;
-    cv::Mat mat2_roi;
-    cv::Mat mask_roi;
-    cv::Mat dst_roi;
-    cv::Mat dst1_roi; //bak
-    //std::vector<cv::ocl::Info> oclinfo;
-    //ocl dst mat for testing
-    cv::ocl::oclMat gdst_whole;
-    cv::ocl::oclMat gdst1_whole; //bak
-
-    //ocl mat with roi
-    cv::ocl::oclMat gmat1;
-    cv::ocl::oclMat gmat2;
-    cv::ocl::oclMat gdst;
-    cv::ocl::oclMat gdst1;   //bak
-    cv::ocl::oclMat gmask;
-
-    virtual void SetUp()
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        type = GET_PARAM(0);
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
+        {
+            SUBTEST << size << 'x' << size << "; " << type_name[j];
 
-        cv::RNG &rng = TS::ptr()->get_rng();
+            gen(src, size, size, all_type[j], 0, 256);
+            gen(lut, 1, 256, CV_8UC1, 0, 1);
+            gen(dst, size, size, all_type[j], 0, 256);
 
-        cv::Size size(MWIDTH, MHEIGHT);
+            LUT(src, lut, dst);
 
-        mat1 = randomMat(rng, size, type, 5, 16, false);
-        //mat2 = randomMat(rng, cv::Size(512,3), type, 5, 16, false);
-        mat2 = randomMat(rng, size, type, 5, 16, false);
-        dst  = randomMat(rng, size, type, 5, 16, false);
-        dst1  = randomMat(rng, size, type, 5, 16, false);
-        mask = randomMat(rng, size, CV_8UC1, 0, 2,  false);
+            CPU_ON;
+            LUT(src, lut, dst);
+            CPU_OFF;
 
-        cv::threshold(mask, mask, 0.5, 255., CV_8UC1);
+            d_src.upload(src);
+            d_lut.upload(lut);
+
+            WARMUP_ON;
+            ocl::LUT(d_src, d_lut, d_dst);
+            WARMUP_OFF;
+
+            GPU_ON;
+            ocl::LUT(d_src, d_lut, d_dst);
+             ;
+            GPU_OFF;
+
+            GPU_FULL_ON;
+            d_src.upload(src);
+            d_lut.upload(lut);
+            ocl::LUT(d_src, d_lut, d_dst);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
+
+        }
 
-        val = cv::Scalar(rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0));
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums>0);
-        ////if you want to use undefault device, set it here
-        ////setDevice(oclinfo[0]);
-        //setBinpath(CLBINPATH);
     }
-
-    void Has_roi(int b)
-    {
-        //cv::RNG& rng = TS::ptr()->get_rng();
-        if(b)
-        {
-            //randomize ROI
-            roicols =  mat1.cols - 1;
-            roirows = mat1.rows - 1;
-            src1x   = 1;
-            src2x   = 1;
-            src1y   = 1;
-            src2y   = 1;
-            dstx    = 1;
-            dsty    = 1;
-            maskx	 = 1;
-            masky	= 1;
-        }
-        else
-        {
-            roicols = mat1.cols;
-            roirows = mat1.rows;
-            src1x = 0;
-            src2x = 0;
-            src1y = 0;
-            src2y = 0;
-            dstx = 0;
-            dsty = 0;
-            maskx	 = 0;
-            masky	= 0;
-        };
-
-        mat1_roi = mat1(Rect(src1x, src1y, roicols, roirows));
-        //mat2_roi = mat2(Rect(src2x,src2y,256,1));
-        mat2_roi = mat2(Rect(src2x, src2y, roicols, roirows));
-        mask_roi = mask(Rect(maskx, masky, roicols, roirows));
-        dst_roi  = dst(Rect(dstx, dsty, roicols, roirows));
-        dst1_roi = dst1(Rect(dstx, dsty, roicols, roirows));
-
-        //gdst_whole = dst;
-        //gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
-
-        //gdst1_whole = dst1;
-        //gdst1 = gdst1_whole(Rect(dstx,dsty,roicols,roirows));
-
-        //gmat1 = mat1_roi;
-        //gmat2 = mat2_roi;
-        //gmask = mask_roi;
-    }
-
-};
-////////////////////////////////lut/////////////////////////////////////////////////
-
-struct Lut : ArithmTestBase {};
-
-TEST_P(Lut, Mat)
-{
-
-    cv::Mat mat2(3, 512, CV_8UC1);
-    cv::RNG &rng = TS::ptr()->get_rng();
-    rng.fill(mat2, cv::RNG::UNIFORM, cv::Scalar::all(0), cv::Scalar::all(256));
-
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-            mat2 = randomMat(rng, cv::Size(512, 3), type, 5, 16, false);
-            mat2_roi = mat2(Rect(src2x, src2y, 256, 1));
-
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::LUT(mat1_roi, mat2_roi, dst_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            gmat2 = mat2_roi;
-
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::LUT(gmat1, gmat2, gdst);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        // s=GetParam();
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        //  src2x = rng.uniform( 0,mat2.cols - 256);
-        // src2y = rng.uniform (0,mat2.rows - 1);
-
-        // cv::Mat mat2_roi = mat2(Rect(src2x,src2y,256,1));
-        mat2 = randomMat(rng, cv::Size(512, 3), type, 5, 16, false);
-        mat2_roi = mat2(Rect(src2x, src2y, 256, 1));
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        //   gdst1_whole = dst1;
-        //     gdst1 = gdst1_whole(Rect(dstx,dsty,roicols,roirows));
-        gmat1 = mat1_roi;
-        gmat2 = mat2_roi;
-        //     gmask = mask_roi;
-
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::LUT(gmat1, gmat2, gdst);
-    };
-#endif
-
 }
 
-
-
-////////////////////////////////exp/////////////////////////////////////////////////
-
-struct Exp : ArithmTestBase {};
-
-TEST_P(Exp, Mat)
+///////////// Exp ////////////////////////
+TEST(Exp)
 {
+    Mat src, dst;
+    ocl::oclMat d_src, d_dst;
 
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
+        SUBTEST << size << 'x' << size << "; CV_32FC1";
 
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::exp(mat1_roi, dst_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
+        gen(src, size, size, CV_32FC1, 0, 256);
+        gen(dst, size, size, CV_32FC1, 0, 256);
 
-            t1 = (double)cvGetTickCount();//gpu start1
+        exp(src, dst);
 
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-            gmat1 = mat1_roi;
+        CPU_ON;
+        exp(src, dst);
+        CPU_OFF;
+        d_src.upload(src);
 
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::exp(gmat1, gdst);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download(cpu_dst);
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-            //EXPECT_MAT_NEAR(dst, cpu_dst, 0,"");
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
+        WARMUP_ON;
+        ocl::exp(d_src, d_dst);
+        WARMUP_OFF;
 
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+        GPU_ON;
+        ocl::exp(d_src, d_dst);
+         ;
+        GPU_OFF;
+
+        GPU_FULL_ON;
+        d_src.upload(src);
+        ocl::exp(d_src, d_dst);
+        d_dst.download(dst);
+        GPU_FULL_OFF;
     }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::exp(gmat1, gdst);
-    };
-#endif
-
 }
 
-
-////////////////////////////////log/////////////////////////////////////////////////
-
-struct Log : ArithmTestBase {};
-
-TEST_P(Log, Mat)
+///////////// Log ////////////////////////
+TEST(Log)
 {
+    Mat src, dst;
+    ocl::oclMat d_src, d_dst;
 
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
+        SUBTEST << size << 'x' << size << "; 32F";
 
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::log(mat1_roi, dst_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
+        gen(src, size, size, CV_32F, 1, 10);
 
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+        log(src, dst);
 
-            gmat1 = mat1_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::log(gmat1, gdst);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+        CPU_ON;
+        log(src, dst);
+        CPU_OFF;
+        d_src.upload(src);
 
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+        WARMUP_ON;
+        ocl::log(d_src, d_dst);
+        WARMUP_OFF;
+
+        GPU_ON;
+        ocl::log(d_src, d_dst);
+         ;
+        GPU_OFF;
+
+        GPU_FULL_ON;
+        d_src.upload(src);
+        ocl::log(d_src, d_dst);
+        d_dst.download(dst);
+        GPU_FULL_OFF;
     }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::log(gmat1, gdst);
-    };
-#endif
-
 }
 
-
-
-
-////////////////////////////////add/////////////////////////////////////////////////
-
-struct Add : ArithmTestBase {};
-
-TEST_P(Add, Mat)
+///////////// Add: cv::add (Mat) vs. ocl::add (oclMat) ////////////////////////
+TEST(Add)
 {
+    Mat src1, src2, dst;
+    ocl::oclMat d_src1, d_src2, d_dst;
 
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
+    int all_type[] = {CV_8UC1, CV_32FC1};
+    std::string type_name[] = {"CV_8UC1", "CV_32FC1"};
+    // Walk square sizes Min_Size..Max_Size (size *= Multiple) and, per size, every entry of all_type.
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            Has_roi(k);
+            SUBTEST << size << 'x' << size << "; " << type_name[j];
 
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::add(mat1_roi, mat2_roi, dst_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
+            gen(src1, size, size, all_type[j], 0, 1); // NOTE(review): 0, 1 is used for CV_8UC1 too, other tests here pass 0, 256; gen() is defined elsewhere - confirm intended
+            gen(src2, size, size, all_type[j], 0, 1);
 
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+            add(src1, src2, dst); // extra call ahead of the CPU_ON/CPU_OFF bracket (presumably a warm-up; confirm)
 
-            gmat1 = mat1_roi;
-            gmat2 = mat2_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::add(gmat1, gmat2, gdst);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
+            CPU_ON; // NOTE(review): *_ON / *_OFF are macros from outside this hunk, presumably timer brackets; confirm
+            add(src1, src2, dst);
+            CPU_OFF;
+            d_src1.upload(src1);
+            d_src2.upload(src2);
 
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            WARMUP_ON;
+            ocl::add(d_src1, d_src2, d_dst);
+            WARMUP_OFF;
 
+            GPU_ON;
+            ocl::add(d_src1, d_src2, d_dst);
+             ; // empty statement; has no effect
+            GPU_OFF;
+            // Same call again, now with the uploads and the download inside the bracket.
+            GPU_FULL_ON;
+            d_src1.upload(src1);
+            d_src2.upload(src2);
+            ocl::add(d_src1, d_src2, d_dst);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+
     }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        gmat2 = mat2_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::add(gmat1, gmat2, gdst);
-    };
-#endif
 }
 
-TEST_P(Add, Mat_Mask)
+///////////// Mul: cv::multiply (Mat) vs. ocl::multiply (oclMat) ////////////////////////
+TEST(Mul)
 {
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
+    Mat src1, src2, dst;
+    ocl::oclMat d_src1, d_src2, d_dst;
+
+    int all_type[] = {CV_8UC1, CV_8UC4};
+    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
+    // Walk square sizes Min_Size..Max_Size (size *= Multiple) and, per size, every entry of all_type.
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            Has_roi(k);
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
 
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::add(mat1_roi, mat2_roi, dst_roi, mask_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
+            gen(src1, size, size, all_type[j], 0, 256);
+            gen(src2, size, size, all_type[j], 0, 256);
+            gen(dst, size, size, all_type[j], 0, 256); // dst is also the output argument of multiply() below
 
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
 
-            gmat1 = mat1_roi;
-            gmat2 = mat2_roi;
-            gmask = mask_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::add(gmat1, gmat2, gdst, gmask);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            multiply(src1, src2, dst); // extra call ahead of the CPU_ON/CPU_OFF bracket (presumably a warm-up; confirm)
 
+            CPU_ON; // NOTE(review): *_ON / *_OFF are macros from outside this hunk, presumably timer brackets; confirm
+            multiply(src1, src2, dst);
+            CPU_OFF;
+            d_src1.upload(src1);
+            d_src2.upload(src2);
+
+            WARMUP_ON;
+            ocl::multiply(d_src1, d_src2, d_dst);
+            WARMUP_OFF;
+
+            GPU_ON;
+            ocl::multiply(d_src1, d_src2, d_dst);
+             ; // empty statement; has no effect
+            GPU_OFF;
+            // Same call again, now with the uploads and the download inside the bracket.
+            GPU_FULL_ON;
+            d_src1.upload(src1);
+            d_src2.upload(src2);
+            ocl::multiply(d_src1, d_src2, d_dst);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+
     }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        gmat2 = mat2_roi;
-        gmask = mask_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::add(gmat1, gmat2, gdst, gmask);
-    };
-#endif
-}
-TEST_P(Add, Scalar)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::add(mat1_roi, val, dst_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-            gmat1 = mat1_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::add(gmat1, val, gdst);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::add(gmat1, val, gdst);
-    };
-#endif
 }
 
-TEST_P(Add, Scalar_Mask)
+///////////// Div: cv::divide (Mat) vs. ocl::divide (oclMat) ////////////////////////
+TEST(Div)
 {
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
+    Mat src1, src2, dst;
+    ocl::oclMat d_src1, d_src2, d_dst;
+    int all_type[] = {CV_8UC1, CV_8UC4};
+    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
+    // Walk square sizes Min_Size..Max_Size (size *= Multiple) and, per size, every entry of all_type.
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            Has_roi(k);
+            SUBTEST << size << 'x' << size << "; " << type_name[j];
 
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::add(mat1_roi, val, dst_roi, mask_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
+            gen(src1, size, size, all_type[j], 0, 256);
+            gen(src2, size, size, all_type[j], 0, 256);
+            gen(dst, size, size, all_type[j], 0, 256); // dst is also the output argument of divide() below
 
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-            gmat1 = mat1_roi;
-            gmask = mask_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::add(gmat1, val, gdst, gmask);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
 
+            divide(src1, src2, dst); // extra call ahead of the CPU_ON/CPU_OFF bracket (presumably a warm-up; confirm)
+
+            CPU_ON; // NOTE(review): *_ON / *_OFF are macros from outside this hunk, presumably timer brackets; confirm
+            divide(src1, src2, dst);
+            CPU_OFF;
+            d_src1.upload(src1);
+            d_src2.upload(src2);
+
+            WARMUP_ON;
+            ocl::divide(d_src1, d_src2, d_dst);
+            WARMUP_OFF;
+
+            GPU_ON;
+            ocl::divide(d_src1, d_src2, d_dst);
+             ; // empty statement; has no effect
+            GPU_OFF;
+            // Same call again, now with the uploads and the download inside the bracket.
+            GPU_FULL_ON;
+            d_src1.upload(src1);
+            d_src2.upload(src2);
+            ocl::divide(d_src1, d_src2, d_dst);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+
     }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        gmask = mask_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::add(gmat1, val, gdst, gmask);
-    };
-#endif
 }
 
-
-////////////////////////////////sub/////////////////////////////////////////////////
-struct Sub : ArithmTestBase {};
-
-TEST_P(Sub, Mat)
+///////////// Absdiff: cv::absdiff (Mat) vs. ocl::absdiff (oclMat) ////////////////////////
+TEST(Absdiff)
 {
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
+    Mat src1, src2, dst;
+    ocl::oclMat d_src1, d_src2, d_dst;
+
+    int all_type[] = {CV_8UC1, CV_8UC4};
+    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
+    // Walk square sizes Min_Size..Max_Size (size *= Multiple) and, per size, every entry of all_type.
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            Has_roi(k);
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
 
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::subtract(mat1_roi, mat2_roi, dst_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
+            gen(src1, size, size, all_type[j], 0, 256);
+            gen(src2, size, size, all_type[j], 0, 256);
+            gen(dst, size, size, all_type[j], 0, 256); // dst is also the output argument of absdiff() below
 
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
 
-            gmat1 = mat1_roi;
-            gmat2 = mat2_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::subtract(gmat1, gmat2, gdst);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            absdiff(src1, src2, dst); // extra call ahead of the CPU_ON/CPU_OFF bracket (presumably a warm-up; confirm)
 
+            CPU_ON; // NOTE(review): *_ON / *_OFF are macros from outside this hunk, presumably timer brackets; confirm
+            absdiff(src1, src2, dst);
+            CPU_OFF;
+            d_src1.upload(src1);
+            d_src2.upload(src2);
+
+            WARMUP_ON;
+            ocl::absdiff(d_src1, d_src2, d_dst);
+            WARMUP_OFF;
+
+            GPU_ON;
+            ocl::absdiff(d_src1, d_src2, d_dst);
+             ; // empty statement; has no effect
+            GPU_OFF;
+            // Same call again, now with the uploads and the download inside the bracket.
+            GPU_FULL_ON;
+            d_src1.upload(src1);
+            d_src2.upload(src2);
+            ocl::absdiff(d_src1, d_src2, d_dst);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+
     }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        gmat2 = mat2_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::subtract(gmat1, gmat2, gdst);
-    };
-#endif
 }
 
-TEST_P(Sub, Mat_Mask)
+///////////// CartToPolar: cv::cartToPolar (Mat) vs. ocl::cartToPolar (oclMat) ////////////////////////
+TEST(CartToPolar)
 {
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
+    Mat src1, src2, dst, dst1;
+    ocl::oclMat d_src1, d_src2, d_dst, d_dst1;
+
+    int all_type[] = {CV_32FC1};
+    std::string type_name[] = {"CV_32FC1"};
+    // Walk square sizes Min_Size..Max_Size (size *= Multiple) and, per size, every entry of all_type.
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            Has_roi(k);
+            SUBTEST << size << 'x' << size << "; " << type_name[j];
 
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::subtract(mat1_roi, mat2_roi, dst_roi, mask_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
+            gen(src1, size, size, all_type[j], 0, 256);
+            gen(src2, size, size, all_type[j], 0, 256);
+            gen(dst, size, size, all_type[j], 0, 256); // dst and dst1 are also the output arguments of cartToPolar() below
+            gen(dst1, size, size, all_type[j], 0, 256);
 
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
 
-            gmat1 = mat1_roi;
-            gmat2 = mat2_roi;
-            gmask = mask_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::subtract(gmat1, gmat2, gdst, gmask);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            cartToPolar(src1, src2, dst, dst1, 1); // extra call ahead of the CPU_ON/CPU_OFF bracket (presumably a warm-up; confirm)
 
+            CPU_ON; // NOTE(review): *_ON / *_OFF are macros from outside this hunk, presumably timer brackets; confirm
+            cartToPolar(src1, src2, dst, dst1, 1); // trailing 1 is the angleInDegrees flag of cv::cartToPolar
+            CPU_OFF;
+            d_src1.upload(src1);
+            d_src2.upload(src2);
+
+            WARMUP_ON;
+            ocl::cartToPolar(d_src1, d_src2, d_dst, d_dst1, 1);
+            WARMUP_OFF;
+
+            GPU_ON;
+            ocl::cartToPolar(d_src1, d_src2, d_dst, d_dst1, 1);
+             ; // empty statement; has no effect
+            GPU_OFF;
+            // Same call again, now with the uploads and both downloads inside the bracket.
+            GPU_FULL_ON;
+            d_src1.upload(src1);
+            d_src2.upload(src2);
+            ocl::cartToPolar(d_src1, d_src2, d_dst, d_dst1, 1);
+            d_dst.download(dst);
+            d_dst1.download(dst1);
+            GPU_FULL_OFF;
         }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+
     }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        gmat2 = mat2_roi;
-        gmask = mask_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::subtract(gmat1, gmat2, gdst, gmask);
-    };
-#endif
-}
-TEST_P(Sub, Scalar)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::subtract(mat1_roi, val, dst_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::subtract(gmat1, val, gdst);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::subtract(gmat1, val, gdst);
-    };
-#endif
 }
 
-TEST_P(Sub, Scalar_Mask)
+///////////// PolarToCart: cv::polarToCart (Mat) vs. ocl::polarToCart (oclMat) ////////////////////////
+TEST(PolarToCart)
 {
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
+    Mat src1, src2, dst, dst1;
+    ocl::oclMat d_src1, d_src2, d_dst, d_dst1;
+
+    int all_type[] = {CV_32FC1};
+    std::string type_name[] = {"CV_32FC1"};
+    // Walk square sizes Min_Size..Max_Size (size *= Multiple) and, per size, every entry of all_type.
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            Has_roi(k);
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
 
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::subtract(mat1_roi, val, dst_roi, mask_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
+            gen(src1, size, size, all_type[j], 0, 256);
+            gen(src2, size, size, all_type[j], 0, 256);
+            gen(dst, size, size, all_type[j], 0, 256); // dst and dst1 are also the output arguments of polarToCart() below
+            gen(dst1, size, size, all_type[j], 0, 256);
 
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
 
-            gmat1 = mat1_roi;
-            gmask = mask_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::subtract(gmat1, val, gdst, gmask);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            polarToCart(src1, src2, dst, dst1, 1); // extra call ahead of the CPU_ON/CPU_OFF bracket (presumably a warm-up; confirm)
 
+            CPU_ON; // NOTE(review): *_ON / *_OFF are macros from outside this hunk, presumably timer brackets; confirm
+            polarToCart(src1, src2, dst, dst1, 1); // trailing 1 is the angleInDegrees flag of cv::polarToCart
+            CPU_OFF;
+            d_src1.upload(src1);
+            d_src2.upload(src2);
+
+            WARMUP_ON;
+            ocl::polarToCart(d_src1, d_src2, d_dst, d_dst1, 1);
+            WARMUP_OFF;
+
+            GPU_ON;
+            ocl::polarToCart(d_src1, d_src2, d_dst, d_dst1, 1);
+             ; // empty statement; has no effect
+            GPU_OFF;
+            // Same call again, now with the uploads and both downloads inside the bracket.
+            GPU_FULL_ON;
+            d_src1.upload(src1);
+            d_src2.upload(src2);
+            ocl::polarToCart(d_src1, d_src2, d_dst, d_dst1, 1);
+            d_dst.download(dst);
+            d_dst1.download(dst1);
+            GPU_FULL_OFF;
         }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+
     }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        gmask = mask_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::subtract(gmat1, val, gdst, gmask);
-    };
-#endif
 }
 
-
-////////////////////////////////Mul/////////////////////////////////////////////////
-struct Mul : ArithmTestBase {};
-
-TEST_P(Mul, Mat)
+///////////// Magnitude: cv::magnitude (Mat) vs. ocl::magnitude (oclMat) ////////////////////////
+TEST(magnitude) // NOTE(review): lowercase, unlike the other test names in this hunk
 {
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
+    Mat x, y, mag;
+    ocl::oclMat d_x, d_y, d_mag;
+
+    int all_type[] = {CV_32FC1};
+    std::string type_name[] = {"CV_32FC1"};
+    // Walk square sizes Min_Size..Max_Size (size *= Multiple) and, per size, every entry of all_type.
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            Has_roi(k);
+            SUBTEST << size << 'x' << size << "; " << type_name[j];
 
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::multiply(mat1_roi, mat2_roi, dst_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
+            gen(x, size, size, all_type[j], 0, 1);
+            gen(y, size, size, all_type[j], 0, 1);
 
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+            magnitude(x, y, mag); // extra call ahead of the CPU_ON/CPU_OFF bracket (presumably a warm-up; confirm)
 
-            gmat1 = mat1_roi;
-            gmat2 = mat2_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::multiply(gmat1, gmat2, gdst);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            CPU_ON; // NOTE(review): *_ON / *_OFF are macros from outside this hunk, presumably timer brackets; confirm
+            magnitude(x, y, mag);
+            CPU_OFF;
+            d_x.upload(x);
+            d_y.upload(y);
 
+            WARMUP_ON;
+            ocl::magnitude(d_x, d_y, d_mag);
+            WARMUP_OFF;
+
+            GPU_ON;
+            ocl::magnitude(d_x, d_y, d_mag);
+             ; // empty statement; has no effect
+            GPU_OFF;
+            // Same call again, now with the uploads and the download inside the bracket.
+            GPU_FULL_ON;
+            d_x.upload(x);
+            d_y.upload(y);
+            ocl::magnitude(d_x, d_y, d_mag);
+            d_mag.download(mag);
+            GPU_FULL_OFF;
         }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+
     }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        gmat2 = mat2_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::multiply(gmat1, gmat2, gdst);
-    };
-#endif
 }
 
-TEST_P(Mul, Mat_Scalar)
+///////////// Transpose: cv::transpose (Mat) vs. ocl::transpose (oclMat) ////////////////////////
+TEST(Transpose)
 {
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
+    Mat src, dst;
+    ocl::oclMat d_src, d_dst;
+
+    int all_type[] = {CV_8UC1, CV_8UC4};
+    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
+    // Walk square sizes Min_Size..Max_Size (size *= Multiple) and, per size, every entry of all_type.
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            Has_roi(k);
-            cv::RNG &rng = TS::ptr()->get_rng();
-            double s = rng.uniform(-10.0, 10.0);
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::multiply(mat1_roi, mat2_roi, dst_roi, s);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
+            SUBTEST << size << 'x' << size << "; " << type_name[j];
 
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+            gen(src, size, size, all_type[j], 0, 256);
+            gen(dst, size, size, all_type[j], 0, 256); // dst is also the output argument of transpose() below
 
-            gmat1 = mat1_roi;
-            gmat2 = mat2_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::multiply(gmat1, gmat2, gdst, s);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            transpose(src, dst); // extra call ahead of the CPU_ON/CPU_OFF bracket (presumably a warm-up; confirm)
 
+            CPU_ON; // NOTE(review): *_ON / *_OFF are macros from outside this hunk, presumably timer brackets; confirm
+            transpose(src, dst);
+            CPU_OFF;
+            d_src.upload(src);
+
+            WARMUP_ON;
+            ocl::transpose(d_src, d_dst);
+            WARMUP_OFF;
+
+            GPU_ON;
+            ocl::transpose(d_src, d_dst);
+             ; // empty statement; has no effect
+            GPU_OFF;
+            // Same call again, now with the upload and the download inside the bracket.
+            GPU_FULL_ON;
+            d_src.upload(src);
+            ocl::transpose(d_src, d_dst);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+
     }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        cv::RNG &rng = TS::ptr()->get_rng();
-        double s = rng.uniform(-10.0, 10.0);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        gmat2 = mat2_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::multiply(gmat1, gmat2, gdst, s);
-    };
-#endif
 }
 
-
-struct Div : ArithmTestBase {};
-
-TEST_P(Div, Mat)
+///////////// Flip ////////////////////////
+TEST(Flip)
 {
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
+    Mat src, dst;
+    ocl::oclMat d_src, d_dst;
+
+    int all_type[] = {CV_8UC1, CV_8UC4};
+    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            Has_roi(k);
+            SUBTEST << size << 'x' << size << "; " << type_name[j] << " ; FLIP_BOTH";
 
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::divide(mat1_roi, mat2_roi, dst_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
+            gen(src, size, size, all_type[j], 0, 256);
+            gen(dst, size, size, all_type[j], 0, 256);
 
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+            flip(src, dst, 0);
 
-            gmat1 = mat1_roi;
-            gmat2 = mat2_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::divide(gmat1, gmat2, gdst);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            CPU_ON;
+            flip(src, dst, 0);
+            CPU_OFF;
+            d_src.upload(src);
 
+            WARMUP_ON;
+            ocl::flip(d_src, d_dst, 0);
+            WARMUP_OFF;
+
+            GPU_ON;
+            ocl::flip(d_src, d_dst, 0);
+             ;
+            GPU_OFF;
+
+            GPU_FULL_ON;
+            d_src.upload(src);
+            ocl::flip(d_src, d_dst, 0);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+
     }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        gmat2 = mat2_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::divide(gmat1, gmat2, gdst);
-    };
-#endif
 }
 
-TEST_P(Div, Mat_Scalar)
+///////////// minMax ////////////////////////
+TEST(minMax)
 {
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
+    Mat src;
+    ocl::oclMat d_src;
+
+    double min_val, max_val;
+    Point min_loc, max_loc;
+    int all_type[] = {CV_8UC1, CV_32FC1};
+    std::string type_name[] = {"CV_8UC1", "CV_32FC1"};
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            Has_roi(k);
-            cv::RNG &rng = TS::ptr()->get_rng();
-            double s = rng.uniform(-10.0, 10.0);
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::divide(mat1_roi, mat2_roi, dst_roi, s);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
+            SUBTEST << size << 'x' << size << "; " << type_name[j];
 
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+            gen(src, size, size, all_type[j], 0, 256);
 
-            gmat1 = mat1_roi;
-            gmat2 = mat2_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::divide(gmat1, gmat2, gdst, s);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            CPU_ON;
+            minMaxLoc(src, &min_val, &max_val, &min_loc, &max_loc);
+            CPU_OFF;
+            d_src.upload(src);
+
+            WARMUP_ON;
+            ocl::minMax(d_src, &min_val, &max_val);
+            WARMUP_OFF;
+
+            GPU_ON;
+            ocl::minMax(d_src, &min_val, &max_val);
+             ;
+            GPU_OFF;
+
+            GPU_FULL_ON;
+            d_src.upload(src);
+            ocl::minMax(d_src, &min_val, &max_val);
+            GPU_FULL_OFF;
 
         }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+
     }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        cv::RNG &rng = TS::ptr()->get_rng();
-        double s = rng.uniform(-10.0, 10.0);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        gmat2 = mat2_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::divide(gmat1, gmat2, gdst, s);
-    };
-#endif
 }
 
-
-struct Absdiff : ArithmTestBase {};
-
-TEST_P(Absdiff, Mat)
+///////////// minMaxLoc ////////////////////////
+TEST(minMaxLoc)
 {
+    Mat src;
+    ocl::oclMat d_src;
 
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
+    double min_val, max_val;
+    Point min_loc, max_loc;
+    int all_type[] = {CV_8UC1, CV_32FC1};
+    std::string type_name[] = {"CV_8UC1", "CV_32FC1"};
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            Has_roi(k);
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
 
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::absdiff(mat1_roi, mat2_roi, dst_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
+            gen(src, size, size, all_type[j], 0, 1);
 
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+            CPU_ON;
+            minMaxLoc(src, &min_val, &max_val, &min_loc, &max_loc);
+            CPU_OFF;
+            d_src.upload(src);
 
-            gmat1 = mat1_roi;
-            gmat2 = mat2_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::absdiff(gmat1, gmat2, gdst);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            WARMUP_ON;
+            ocl::minMaxLoc(d_src, &min_val, &max_val, &min_loc, &max_loc);
+            WARMUP_OFF;
 
+            GPU_ON;
+            ocl::minMaxLoc(d_src, &min_val, &max_val, &min_loc, &max_loc);
+             ;
+            GPU_OFF;
+
+            GPU_FULL_ON;
+            d_src.upload(src);
+            ocl::minMaxLoc(d_src, &min_val, &max_val, &min_loc, &max_loc);
+            GPU_FULL_OFF;
         }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+
     }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        gmat2 = mat2_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::absdiff(gmat1, gmat2, gdst);
-    };
-#endif
 }
 
-TEST_P(Absdiff, Mat_Scalar)
+///////////// Sum ////////////////////////
+TEST(Sum)
 {
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
+    Mat src;
+    Scalar cpures, gpures;
+    ocl::oclMat d_src;
+
+    int all_type[] = {CV_8UC1, CV_32SC1};
+    std::string type_name[] = {"CV_8UC1", "CV_32SC1"};
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            Has_roi(k);
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
 
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::absdiff(mat1_roi, val, dst_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
+            gen(src, size, size, all_type[j], 0, 256);
 
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+            cpures = sum(src);
 
-            gmat1 = mat1_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::absdiff(gmat1, val, gdst);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            CPU_ON;
+            cpures = sum(src);
+            CPU_OFF;
+            d_src.upload(src);
 
+            WARMUP_ON;
+            gpures = ocl::sum(d_src);
+            WARMUP_OFF;
+
+            GPU_ON;
+            gpures = ocl::sum(d_src);
+             ;
+            GPU_OFF;
+
+            GPU_FULL_ON;
+            d_src.upload(src);
+            gpures = ocl::sum(d_src);
+            GPU_FULL_OFF;
         }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+
     }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::absdiff(gmat1, val, gdst);
-    };
-#endif
 }
 
-
-
-struct CartToPolar : ArithmTestBase {};
-
-TEST_P(CartToPolar, angleInDegree)
+///////////// countNonZero ////////////////////////
+TEST(countNonZero)
 {
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
+    Mat src;
+    ocl::oclMat d_src;
+
+    int all_type[] = {CV_8UC1, CV_32FC1};
+    std::string type_name[] = {"CV_8UC1", "CV_32FC1"};
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            Has_roi(k);
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
 
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::cartToPolar(mat1_roi, mat2_roi, dst_roi, dst1_roi, 1);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
+            gen(src, size, size, all_type[j], 0, 256);
 
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+            countNonZero(src);
 
-            gmat1 = mat1_roi;
-            gmat2 = mat2_roi;
-            gdst1_whole = dst1;
-            gdst1 = gdst1_whole(Rect(dstx, dsty, roicols, roirows));
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::cartToPolar(gmat1, gmat2, gdst, gdst1, 1);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            cv::Mat cpu_dst1;
-            gdst1_whole.download(cpu_dst1);
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            CPU_ON;
+            countNonZero(src);
+            CPU_OFF;
+            d_src.upload(src);
 
+            WARMUP_ON;
+            ocl::countNonZero(d_src);
+            WARMUP_OFF;
+
+            GPU_ON;
+            ocl::countNonZero(d_src);
+             ;
+            GPU_OFF;
+
+            GPU_FULL_ON;
+            d_src.upload(src);
+            ocl::countNonZero(d_src);
+            GPU_FULL_OFF;
         }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+
     }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gdst1_whole = dst1;
-        gdst1 = gdst1_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        gmat2 = mat2_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::cartToPolar(gmat1, gmat2, gdst, gdst1, 1);
-    };
-#endif
 }
 
-TEST_P(CartToPolar, angleInRadians)
+///////////// Phase ////////////////////////
+TEST(Phase)
 {
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
+    Mat src1, src2, dst;
+    ocl::oclMat d_src1, d_src2, d_dst;
+
+    int all_type[] = {CV_32FC1};
+    std::string type_name[] = {"CV_32FC1"};
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            Has_roi(k);
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
 
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::cartToPolar(mat1_roi, mat2_roi, dst_roi, dst1_roi, 0);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
+            gen(src1, size, size, all_type[j], 0, 256);
+            gen(src2, size, size, all_type[j], 0, 256);
+            gen(dst, size, size, all_type[j], 0, 256);
 
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-            gdst1_whole = dst1;
-            gdst1 = gdst1_whole(Rect(dstx, dsty, roicols, roirows));
-            gmat1 = mat1_roi;
-            gmat2 = mat2_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::cartToPolar(gmat1, gmat2, gdst, gdst1, 0);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            cv::Mat cpu_dst1;
-            gdst1_whole.download(cpu_dst1);
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
 
+            phase(src1, src2, dst, 1);
+
+            CPU_ON;
+            phase(src1, src2, dst, 1);
+            CPU_OFF;
+            d_src1.upload(src1);
+            d_src2.upload(src2);
+
+            WARMUP_ON;
+            ocl::phase(d_src1, d_src2, d_dst, 1);
+            WARMUP_OFF;
+
+            GPU_ON;
+            ocl::phase(d_src1, d_src2, d_dst, 1);
+             ;
+            GPU_OFF;
+
+            GPU_FULL_ON;
+            d_src1.upload(src1);
+            d_src2.upload(src2);
+            ocl::phase(d_src1, d_src2, d_dst, 1);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+
     }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gdst1_whole = dst1;
-        gdst1 = gdst1_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        gmat2 = mat2_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::cartToPolar(gmat1, gmat2, gdst, gdst1, 0);
-    };
-#endif
 }
 
-
-struct PolarToCart : ArithmTestBase {};
-
-TEST_P(PolarToCart, angleInDegree)
+///////////// bitwise_and ////////////////////////
+TEST(bitwise_and)
 {
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
+    Mat src1, src2, dst;
+    ocl::oclMat d_src1, d_src2, d_dst;
+
+    int all_type[] = {CV_8UC1, CV_32SC1};
+    std::string type_name[] = {"CV_8UC1", "CV_32SC1"};
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            Has_roi(k);
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
 
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::polarToCart(mat1_roi, mat2_roi, dst_roi, dst1_roi, 1);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
+            gen(src1, size, size, all_type[j], 0, 256);
+            gen(src2, size, size, all_type[j], 0, 256);
+            gen(dst, size, size, all_type[j], 0, 256);
 
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
 
-            gmat1 = mat1_roi;
-            gmat2 = mat2_roi;
-            gdst1_whole = dst1;
-            gdst1 = gdst1_whole(Rect(dstx, dsty, roicols, roirows));
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::polarToCart(gmat1, gmat2, gdst, gdst1, 1);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            cv::Mat cpu_dst1;
-            gdst1_whole.download(cpu_dst1);
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            bitwise_and(src1, src2, dst);
 
+            CPU_ON;
+            bitwise_and(src1, src2, dst);
+            CPU_OFF;
+            d_src1.upload(src1);
+            d_src2.upload(src2);
+
+            WARMUP_ON;
+            ocl::bitwise_and(d_src1, d_src2, d_dst);
+            WARMUP_OFF;
+
+            GPU_ON;
+            ocl::bitwise_and(d_src1, d_src2, d_dst);
+             ;
+            GPU_OFF;
+
+            GPU_FULL_ON;
+            d_src1.upload(src1);
+            d_src2.upload(src2);
+            ocl::bitwise_and(d_src1, d_src2, d_dst);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+
     }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gdst1_whole = dst1;
-        gdst1 = gdst1_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        gmat2 = mat2_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::polarToCart(gmat1, gmat2, gdst, gdst1, 1);
-    };
-#endif
 }
 
-TEST_P(PolarToCart, angleInRadians)
+///////////// bitwise_or ////////////////////////
+TEST(bitwise_or)
 {
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
+    Mat src1, src2, dst;
+    ocl::oclMat d_src1, d_src2, d_dst;
+
+    int all_type[] = {CV_8UC1, CV_32SC1};
+    std::string type_name[] = {"CV_8UC1", "CV_32SC1"};
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            Has_roi(k);
+            SUBTEST << size << 'x' << size << "; " << type_name[j];
 
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::polarToCart(mat1_roi, mat2_roi, dst_roi, dst1_roi, 0);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
+            gen(src1, size, size, all_type[j], 0, 256);
+            gen(src2, size, size, all_type[j], 0, 256);
+            gen(dst, size, size, all_type[j], 0, 256);
 
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
 
-            gmat1 = mat1_roi;
-            gmat2 = mat2_roi;
-            gdst1_whole = dst1;
-            gdst1 = gdst1_whole(Rect(dstx, dsty, roicols, roirows));
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::polarToCart(gmat1, gmat2, gdst, gdst1, 0);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            cv::Mat cpu_dst1;
-            gdst1_whole.download(cpu_dst1);
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            bitwise_or(src1, src2, dst);
 
+            CPU_ON;
+            bitwise_or(src1, src2, dst);
+            CPU_OFF;
+            d_src1.upload(src1);
+            d_src2.upload(src2);
+
+            WARMUP_ON;
+            ocl::bitwise_or(d_src1, d_src2, d_dst);
+            WARMUP_OFF;
+
+            GPU_ON;
+            ocl::bitwise_or(d_src1, d_src2, d_dst);
+             ;
+            GPU_OFF;
+
+            GPU_FULL_ON;
+            d_src1.upload(src1);
+            d_src2.upload(src2);
+            ocl::bitwise_or(d_src1, d_src2, d_dst);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+
     }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        gmat2 = mat2_roi;
-        gdst1_whole = dst1;
-        gdst1 = gdst1_whole(Rect(dstx, dsty, roicols, roirows));
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::polarToCart(gmat1, gmat2, gdst, gdst1, 0);
-    };
-#endif
 }
 
-
-
-struct Magnitude : ArithmTestBase {};
-
-TEST_P(Magnitude, Mat)
+///////////// bitwise_xor ////////////////////////
+TEST(bitwise_xor)
 {
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
+    Mat src1, src2, dst;
+    ocl::oclMat d_src1, d_src2, d_dst;
+
+    int all_type[] = {CV_8UC1, CV_32SC1};
+    std::string type_name[] = {"CV_8UC1", "CV_32SC1"};
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            Has_roi(k);
+            SUBTEST << size << 'x' << size << "; " << type_name[j];
 
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::magnitude(mat1_roi, mat2_roi, dst_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
+            gen(src1, size, size, all_type[j], 0, 256);
+            gen(src2, size, size, all_type[j], 0, 256);
+            gen(dst, size, size, all_type[j], 0, 256);
 
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
 
-            gmat1 = mat1_roi;
-            gmat2 = mat2_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::magnitude(gmat1, gmat2, gdst);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            bitwise_xor(src1, src2, dst);
 
+            CPU_ON;
+            bitwise_xor(src1, src2, dst);
+            CPU_OFF;
+            d_src1.upload(src1);
+            d_src2.upload(src2);
+
+            WARMUP_ON;
+            ocl::bitwise_xor(d_src1, d_src2, d_dst);
+            WARMUP_OFF;
+
+            GPU_ON;
+            ocl::bitwise_xor(d_src1, d_src2, d_dst);
+             ;
+            GPU_OFF;
+
+            GPU_FULL_ON;
+            d_src1.upload(src1);
+            d_src2.upload(src2);
+            ocl::bitwise_xor(d_src1, d_src2, d_dst);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+
     }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        gmat2 = mat2_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::magnitude(gmat1, gmat2, gdst);
-    };
-#endif
 }
 
-struct Transpose : ArithmTestBase {};
-
-TEST_P(Transpose, Mat)
+///////////// bitwise_not: host bitwise_not on Mat vs. ocl::bitwise_not on oclMat ////////////////////////
+TEST(bitwise_not)
 {
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
+    Mat src1, dst;               // host-side input and result
+    ocl::oclMat d_src1, d_dst;   // device-side counterparts of the above
+
+    int all_type[] = {CV_8UC1, CV_32SC1};                // matrix types exercised by this test
+    std::string type_name[] = {"CV_8UC1", "CV_32SC1"};   // labels, index-parallel to all_type
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)  // square sizes; the three bounds are defined outside this chunk
     {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)  // one subtest per entry of all_type
         {
-            Has_roi(k);
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;  // streams the label "<size>x<size>; <type>"
 
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::transpose(mat1_roi, dst_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
+            gen(src1, size, size, all_type[j], 0, 256);  // gen() is defined elsewhere; presumably fills with values in [0, 256) - TODO confirm
+            gen(dst, size, size, all_type[j], 0, 256);   // dst is generated too, although it is overwritten below
 
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
 
-            gmat1 = mat1_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::transpose(gmat1, gdst);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            bitwise_not(src1, dst);  // extra call placed before the CPU_ON/CPU_OFF pair (presumably a warm-up outside the measured region)
 
+            CPU_ON;   // CPU_ON/CPU_OFF bracket the host call; presumably start/stop the CPU timer (macros not visible here)
+            bitwise_not(src1, dst);
+            CPU_OFF;
+            d_src1.upload(src1);  // upload the input once; the WARMUP and GPU sections below reuse it
+
+            WARMUP_ON;   // presumably an unmeasured first device run - TODO confirm against the macro definition
+            ocl::bitwise_not(d_src1, d_dst);
+            WARMUP_OFF;
+
+            GPU_ON;   // GPU_ON/GPU_OFF bracket only the ocl call (no upload/download inside)
+            ocl::bitwise_not(d_src1, d_dst);
+             ;  // empty statement; has no effect
+            GPU_OFF;
+
+            GPU_FULL_ON;   // GPU_FULL_ON/GPU_FULL_OFF bracket upload + ocl call + download
+            d_src1.upload(src1);
+            ocl::bitwise_not(d_src1, d_dst);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+
     }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::transpose(gmat1, gdst);
-    };
-#endif
 }
 
-
-struct Flip : ArithmTestBase {};
-
-TEST_P(Flip, X)
+///////////// compare: host compare on Mat vs. ocl::compare on oclMat ////////////////////////
+TEST(compare)
 {
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
+    Mat src1, src2, dst;                // host-side operands and result
+    ocl::oclMat d_src1, d_src2, d_dst;  // device-side counterparts of the above
+
+    int CMP_EQ = 0;  // comparison op passed to every compare() call below; NOTE(review): presumably mirrors (and shadows) cv::CMP_EQ - confirm
+    int all_type[] = {CV_8UC1, CV_32FC1};                // matrix types exercised by this test
+    std::string type_name[] = {"CV_8UC1", "CV_32FC1"};   // labels, index-parallel to all_type
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)  // square sizes; the three bounds are defined outside this chunk
     {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)  // one subtest per entry of all_type
         {
-            Has_roi(k);
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;  // streams the label "<size>x<size>; <type>"
 
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::flip(mat1_roi, dst_roi, 0);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
+            gen(src1, size, size, all_type[j], 0, 256);  // gen() is defined elsewhere; presumably fills with values in [0, 256) - TODO confirm
+            gen(src2, size, size, all_type[j], 0, 256);
+            gen(dst, size, size, all_type[j], 0, 256);   // dst is generated too, although it is overwritten below
 
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
 
-            gmat1 = mat1_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::flip(gmat1, gdst, 0);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            compare(src1, src2, dst, CMP_EQ);  // extra call placed before the CPU_ON/CPU_OFF pair (presumably a warm-up outside the measured region)
 
+            CPU_ON;   // CPU_ON/CPU_OFF bracket the host call; presumably start/stop the CPU timer (macros not visible here)
+            compare(src1, src2, dst, CMP_EQ);
+            CPU_OFF;
+            d_src1.upload(src1);  // upload inputs once; the WARMUP and GPU sections below reuse them
+            d_src2.upload(src2);
+
+            WARMUP_ON;   // presumably an unmeasured first device run - TODO confirm against the macro definition
+            ocl::compare(d_src1, d_src2, d_dst, CMP_EQ);
+            WARMUP_OFF;
+
+            GPU_ON;   // GPU_ON/GPU_OFF bracket only the ocl call (no upload/download inside)
+            ocl::compare(d_src1, d_src2, d_dst, CMP_EQ);
+             ;  // empty statement; has no effect
+            GPU_OFF;
+
+            GPU_FULL_ON;   // GPU_FULL_ON/GPU_FULL_OFF bracket upload + ocl call + download
+            d_src1.upload(src1);
+            d_src2.upload(src2);
+            ocl::compare(d_src1, d_src2, d_dst, CMP_EQ);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+
     }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::flip(gmat1, gdst, 0);
-    };
-#endif
 }
 
-TEST_P(Flip, Y)
+///////////// pow: host pow on Mat vs. ocl::pow on oclMat, exponent -2.0 ////////////////////////
+TEST(pow)
 {
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
+    Mat src, dst;               // host-side input and result
+    ocl::oclMat d_src, d_dst;   // device-side counterparts of the above
+
+    int all_type[] = {CV_32FC1};                // only one matrix type is exercised here
+    std::string type_name[] = {"CV_32FC1"};     // label, index-parallel to all_type
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)  // square sizes; the three bounds are defined outside this chunk
     {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)  // one subtest per entry of all_type
         {
-            Has_roi(k);
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;  // streams the label "<size>x<size>; <type>"
 
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::flip(mat1_roi, dst_roi, 1);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
+            gen(src, size, size, all_type[j], 0, 100);  // gen() is defined elsewhere; presumably fills with values in [0, 100) - TODO confirm
+            gen(dst, size, size, all_type[j], 0, 100);  // dst is generated too, although it is overwritten below
 
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+            pow(src, -2.0, dst);  // extra call placed before the CPU_ON/CPU_OFF pair (presumably a warm-up outside the measured region)
 
-            gmat1 = mat1_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::flip(gmat1, gdst, 1);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            CPU_ON;   // CPU_ON/CPU_OFF bracket the host call; presumably start/stop the CPU timer (macros not visible here)
+            pow(src, -2.0, dst);
+            CPU_OFF;
+            d_src.upload(src);  // upload the input once; the WARMUP and GPU sections below reuse it
+            d_dst.upload(dst);  // the destination is uploaded as well before the device runs
 
+            WARMUP_ON;   // presumably an unmeasured first device run - TODO confirm against the macro definition
+            ocl::pow(d_src, -2.0, d_dst);
+            WARMUP_OFF;
+
+            GPU_ON;   // GPU_ON/GPU_OFF bracket only the ocl call (no upload/download inside)
+            ocl::pow(d_src, -2.0, d_dst);
+             ;  // empty statement; has no effect
+            GPU_OFF;
+
+            GPU_FULL_ON;   // GPU_FULL_ON/GPU_FULL_OFF bracket upload + ocl call + download
+            d_src.upload(src);
+            ocl::pow(d_src, -2.0, d_dst);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+
     }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::flip(gmat1, gdst, 1);
-    };
-#endif
 }
 
-TEST_P(Flip, BOTH)
+///////////// MagnitudeSqr////////////////////////
+TEST(MagnitudeSqr)
 {
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
+    Mat src1, src2, dst;
+    ocl::oclMat d_src1, d_src2, d_dst;
+
+    int all_type[] = {CV_32FC1};
+    std::string type_name[] = {"CV_32FC1"};
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
+        for (size_t t = 0; t < sizeof(all_type) / sizeof(int); t++)
         {
-            Has_roi(k);
+            SUBTEST << size << 'x' << size << "; " << type_name[t];
 
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::flip(mat1_roi, dst_roi, -1);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
+            gen(src1, size, size, all_type[t], 0, 256);
+            gen(src2, size, size, all_type[t], 0, 256);
+            gen(dst, size, size, all_type[t], 0, 256);
 
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
 
-            gmat1 = mat1_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::flip(gmat1, gdst, -1);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            for (int i = 0; i < src1.rows; ++i)
 
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::flip(gmat1, gdst, -1);
-    };
-#endif
-}
-
-
-
-struct MinMax : ArithmTestBase {};
-
-TEST_P(MinMax, MAT)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-            double minVal, maxVal;
-            cv::Point minLoc, maxLoc;
-            t0 = (double)cvGetTickCount();//cpu start
-            if (mat1.depth() != CV_8S)
-            {
-                cv::minMaxLoc(mat1_roi, &minVal, &maxVal, &minLoc, &maxLoc);
-            }
-            else
-            {
-                minVal = std::numeric_limits<double>::max();
-                maxVal = -std::numeric_limits<double>::max();
-                for (int i = 0; i < mat1_roi.rows; ++i)
-                    for (int j = 0; j < mat1_roi.cols; ++j)
-                    {
-                        signed char val = mat1_roi.at<signed char>(i, j);
-                        if (val < minVal) minVal = val;
-                        if (val > maxVal) maxVal = val;
-                    }
-            }
-
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gmat1 = mat1_roi;
-            double minVal_, maxVal_;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::minMax(gmat1, &minVal_, &maxVal_);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gmat1 = mat1_roi;
-        double minVal_, maxVal_;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::minMax(gmat1, &minVal_, &maxVal_);
-    };
-#endif
-}
-
-TEST_P(MinMax, MASK)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-            double minVal, maxVal;
-            cv::Point minLoc, maxLoc;
-            t0 = (double)cvGetTickCount();//cpu start
-            if (mat1.depth() != CV_8S)
-            {
-                cv::minMaxLoc(mat1_roi, &minVal, &maxVal, &minLoc, &maxLoc, mask_roi);
-            }
-            else
-            {
-                minVal = std::numeric_limits<double>::max();
-                maxVal = -std::numeric_limits<double>::max();
-                for (int i = 0; i < mat1_roi.rows; ++i)
-                    for (int j = 0; j < mat1_roi.cols; ++j)
-                    {
-                        signed char val = mat1_roi.at<signed char>(i, j);
-                        unsigned char m = mask_roi.at<unsigned char>(i, j);
-                        if (val < minVal && m) minVal = val;
-                        if (val > maxVal && m) maxVal = val;
-                    }
-            }
-
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gmat1 = mat1_roi;
-            gmask = mask_roi;
-            double minVal_, maxVal_;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::minMax(gmat1, &minVal_, &maxVal_, gmask);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gmat1 = mat1_roi;
-        gmask = mask_roi;
-        double minVal_, maxVal_;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::minMax(gmat1, &minVal_, &maxVal_, gmask);
-    };
-#endif
-}
-
-
-struct MinMaxLoc : ArithmTestBase {};
-
-TEST_P(MinMaxLoc, MAT)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-            double minVal, maxVal;
-            cv::Point minLoc, maxLoc;
-            int depth = mat1.depth();
-            t0 = (double)cvGetTickCount();//cpu start
-            if (depth != CV_8S)
-            {
-                cv::minMaxLoc(mat1_roi, &minVal, &maxVal, &minLoc, &maxLoc);
-            }
-            else
-            {
-                minVal = std::numeric_limits<double>::max();
-                maxVal = -std::numeric_limits<double>::max();
-                for (int i = 0; i < mat1_roi.rows; ++i)
-                    for (int j = 0; j < mat1_roi.cols; ++j)
-                    {
-                        signed char val = mat1_roi.at<signed char>(i, j);
-                        if (val < minVal)
-                        {
-                            minVal = val;
-                            minLoc.x = j;
-                            minLoc.y = i;
-                        }
-                        if (val > maxVal)
-                        {
-                            maxVal = val;
-                            maxLoc.x = j;
-                            maxLoc.y = i;
-                        }
-                    }
-            }
-
-
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gmat1 = mat1_roi;
-            double minVal_, maxVal_;
-            cv::Point minLoc_, maxLoc_;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::minMaxLoc(gmat1, &minVal_, &maxVal_, &minLoc_, &maxLoc_, cv::ocl::oclMat());
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gmat1 = mat1_roi;
-        double minVal_, maxVal_;
-        cv::Point minLoc_, maxLoc_;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::minMaxLoc(gmat1, &minVal_, &maxVal_, &minLoc_, &maxLoc_, cv::ocl::oclMat());
-    };
-#endif
-
-}
-
-
-TEST_P(MinMaxLoc, MASK)
-{
-
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-            double minVal, maxVal;
-            cv::Point minLoc, maxLoc;
-            int depth = mat1.depth();
-            t0 = (double)cvGetTickCount();//cpu start
-            if (depth != CV_8S)
-            {
-                cv::minMaxLoc(mat1_roi, &minVal, &maxVal, &minLoc, &maxLoc, mask_roi);
-            }
-            else
-            {
-                minVal = std::numeric_limits<double>::max();
-                maxVal = -std::numeric_limits<double>::max();
-                for (int i = 0; i < mat1_roi.rows; ++i)
-                    for (int j = 0; j < mat1_roi.cols; ++j)
-                    {
-                        signed char val = mat1_roi.at<signed char>(i, j);
-                        unsigned char m = mask_roi.at<unsigned char>(i , j);
-                        if (val < minVal && m)
-                        {
-                            minVal = val;
-                            minLoc.x = j;
-                            minLoc.y = i;
-                        }
-                        if (val > maxVal && m)
-                        {
-                            maxVal = val;
-                            maxLoc.x = j;
-                            maxLoc.y = i;
-                        }
-                    }
-            }
-
-
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gmat1 = mat1_roi;
-            gmask = mask_roi;
-            double minVal_, maxVal_;
-            cv::Point minLoc_, maxLoc_;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::minMaxLoc(gmat1, &minVal_, &maxVal_, &minLoc_, &maxLoc_, gmask);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gmat1 = mat1_roi;
-        gmask = mask_roi;
-        double minVal_, maxVal_;
-        cv::Point minLoc_, maxLoc_;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::minMaxLoc(gmat1, &minVal_, &maxVal_, &minLoc_, &maxLoc_, gmask);
-    };
-#endif
-}
-
-
-struct Sum : ArithmTestBase {};
-
-TEST_P(Sum, MAT)
-{
-
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::sum(mat1_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gmat1 = mat1_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::sum(gmat1);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gmat1 = mat1_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        Scalar gpures = cv::ocl::sum(gmat1);
-    };
-#endif
-}
-
-//TEST_P(Sum, MASK)
-//{
-//    for(int j=0; j<LOOP_TIMES; j++)
-//    {
-//
-//    }
-//}
-
-struct CountNonZero : ArithmTestBase {};
-
-TEST_P(CountNonZero, MAT)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::countNonZero(mat1_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gmat1 = mat1_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::countNonZero(gmat1);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gmat1 = mat1_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::countNonZero(gmat1);
-    };
-#endif
-
-}
-
-
-
-////////////////////////////////phase/////////////////////////////////////////////////
-struct Phase : ArithmTestBase {};
-
-TEST_P(Phase, Mat)
-{
-    if(mat1.depth() != CV_32F && mat1.depth() != CV_64F)
-    {
-        cout << "\tUnsupported type\t\n";
-    }
-
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::phase(mat1_roi, mat2_roi, dst_roi, 0);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            gmat2 = mat2_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::phase(gmat1, gmat2, gdst, 0);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        gmat2 = mat2_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::phase(gmat1, gmat2, gdst, 0);
-    };
-#endif
-
-}
-
-
-////////////////////////////////bitwise_and/////////////////////////////////////////////////
-struct Bitwise_and : ArithmTestBase {};
-
-TEST_P(Bitwise_and, Mat)
-{
-
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::bitwise_and(mat1_roi, mat2_roi, dst_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            gmat2 = mat2_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::bitwise_and(gmat1, gmat2, gdst);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        gmat2 = mat2_roi;
-
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::bitwise_and(gmat1, gmat2, gdst);
-    };
-#endif
-
-}
-
-TEST_P(Bitwise_and, Mat_Mask)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::bitwise_and(mat1_roi, mat2_roi, dst_roi, mask_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            gmat2 = mat2_roi;
-            gmask = mask_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::bitwise_and(gmat1, gmat2, gdst, gmask);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        gmat2 = mat2_roi;
-        gmask = mask_roi;
-
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::bitwise_and(gmat1, gmat2, gdst, gmask);
-    };
-#endif
-}
-
-TEST_P(Bitwise_and, Scalar)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::bitwise_and(mat1_roi, val, dst_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::bitwise_and(gmat1, val, gdst);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::bitwise_and(gmat1, val, gdst);
-    };
-#endif
-}
-
-TEST_P(Bitwise_and, Scalar_Mask)
-{
-
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::bitwise_and(mat1_roi, val, dst_roi, mask_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            gmat2 = mat2_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::bitwise_and(gmat1, val, gdst, gmask);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        gmask = mask_roi;
-
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::bitwise_and(gmat1, val, gdst, gmask);
-    };
-#endif
-}
-
-
-
-////////////////////////////////bitwise_or/////////////////////////////////////////////////
-
-struct Bitwise_or : ArithmTestBase {};
-
-TEST_P(Bitwise_or, Mat)
-{
-
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::bitwise_or(mat1_roi, mat2_roi, dst_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            gmat2 = mat2_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::bitwise_or(gmat1, gmat2, gdst);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        gmat2 = mat2_roi;
-
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::bitwise_or(gmat1, gmat2, gdst);
-    };
-#endif
-}
-
-TEST_P(Bitwise_or, Mat_Mask)
-{
-
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::bitwise_or(mat1_roi, mat2_roi, dst_roi, mask_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            gmat2 = mat2_roi;
-            gmask = mask_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::bitwise_or(gmat1, gmat2, gdst, gmask);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        gmat2 = mat2_roi;
-        gmask = mask_roi;
-
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::bitwise_or(gmat1, gmat2, gdst, gmask);
-    };
-#endif
-}
-TEST_P(Bitwise_or, Scalar)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::bitwise_or(mat1_roi, val, dst_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::bitwise_or(gmat1, val, gdst);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::bitwise_or(gmat1, val, gdst);
-    };
-#endif
-}
-
-TEST_P(Bitwise_or, Scalar_Mask)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::bitwise_or(mat1_roi, val, dst_roi, mask_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            gmask = mask_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::bitwise_or(gmat1, val, gdst, gmask);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        gmask = mask_roi;
-
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::bitwise_or(gmat1, val, gdst, gmask);
-    };
-#endif
-}
-
-
-////////////////////////////////bitwise_xor/////////////////////////////////////////////////
-
-struct Bitwise_xor : ArithmTestBase {};
-
-TEST_P(Bitwise_xor, Mat)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::bitwise_xor(mat1_roi, mat2_roi, dst_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            gmat2 = mat2_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::bitwise_xor(gmat1, gmat2, gdst);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        gmat2 = mat2_roi;
-
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::bitwise_xor(gmat1, gmat2, gdst);
-    };
-#endif
-}
-
-TEST_P(Bitwise_xor, Mat_Mask)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::bitwise_xor(mat1_roi, mat2_roi, dst_roi, mask_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            gmat2 = mat2_roi;
-            gmask = mask_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::bitwise_xor(gmat1, gmat2, gdst, gmask);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        gmat2 = mat2_roi;
-        gmask = mask_roi;
-
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::bitwise_xor(gmat1, gmat2, gdst, gmask);
-    };
-#endif
-}
-
-TEST_P(Bitwise_xor, Scalar)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::bitwise_xor(mat1_roi, val, dst_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::bitwise_xor(gmat1, val, gdst);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::bitwise_xor(gmat1, val, gdst);
-    };
-#endif
-}
-
-TEST_P(Bitwise_xor, Scalar_Mask)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::bitwise_xor(mat1_roi, val, dst_roi, mask_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            gmask = mask_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::bitwise_xor(gmat1, val, gdst, gmask);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        gmask = mask_roi;
-
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::bitwise_xor(gmat1, val, gdst, gmask);
-    };
-#endif
-}
-
-
-////////////////////////////////bitwise_not/////////////////////////////////////////////////
-
-struct Bitwise_not : ArithmTestBase {};
-
-TEST_P(Bitwise_not, Mat)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::bitwise_not(mat1_roi, dst_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::bitwise_not(gmat1, gdst);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::bitwise_not(gmat1, gdst);
-    };
-#endif
-}
-
-////////////////////////////////compare/////////////////////////////////////////////////
-PARAM_TEST_CASE ( CompareTestBase, MatType, bool)
-{
-    int type;
-    cv::Scalar val;
-
-    //src mat
-    cv::Mat mat1;
-    cv::Mat mat2;
-    cv::Mat mask;
-    cv::Mat dst;
-    cv::Mat dst1; //bak, for two outputs
-
-    // set up roi
-    int roicols;
-    int roirows;
-    int src1x;
-    int src1y;
-    int src2x;
-    int src2y;
-    int dstx;
-    int dsty;
-    int maskx;
-    int masky;
-
-
-    //src mat with roi
-    cv::Mat mat1_roi;
-    cv::Mat mat2_roi;
-    cv::Mat mask_roi;
-    cv::Mat dst_roi;
-    cv::Mat dst1_roi; //bak
-    //std::vector<cv::ocl::Info> oclinfo;
-    //ocl dst mat for testing
-    cv::ocl::oclMat gdst_whole;
-    cv::ocl::oclMat gdst1_whole; //bak
-
-    //ocl mat with roi
-    cv::ocl::oclMat gmat1;
-    cv::ocl::oclMat gmat2;
-    cv::ocl::oclMat gdst;
-    cv::ocl::oclMat gdst1;   //bak
-    cv::ocl::oclMat gmask;
-
-    virtual void SetUp()
-    {
-        //type = GET_PARAM(0);
-        type = CV_8UC1;
-
-        cv::RNG &rng = TS::ptr()->get_rng();
-
-        cv::Size size(MWIDTH, MHEIGHT);
-
-        mat1 = randomMat(rng, size, type, 5, 16, false);
-        //mat2 = randomMat(rng, cv::Size(512,3), type, 5, 16, false);
-        mat2 = randomMat(rng, size, type, 5, 16, false);
-        dst  = randomMat(rng, size, type, 5, 16, false);
-        dst1  = randomMat(rng, size, type, 5, 16, false);
-        mask = randomMat(rng, size, CV_8UC1, 0, 2,  false);
-
-        cv::threshold(mask, mask, 0.5, 255., CV_8UC1);
-
-        val = cv::Scalar(rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0));
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums>0);
-        ////if you want to use undefault device, set it here
-        ////setDevice(oclinfo[0]);
-        //setBinpath(CLBINPATH);
-    }
-
-    void Has_roi(int b)
-    {
-        //cv::RNG& rng = TS::ptr()->get_rng();
-        if(b)
-        {
-            //randomize ROI
-            roicols =  mat1.cols - 1;
-            roirows = mat1.rows - 1;
-            src1x   = 1;
-            src2x   = 1;
-            src1y   = 1;
-            src2y   = 1;
-            dstx    = 1;
-            dsty    = 1;
-            maskx	 = 1;
-            masky	= 1;
-        }
-        else
-        {
-            roicols = mat1.cols;
-            roirows = mat1.rows;
-            src1x = 0;
-            src2x = 0;
-            src1y = 0;
-            src2y = 0;
-            dstx = 0;
-            dsty = 0;
-            maskx	 = 0;
-            masky	= 0;
-        };
-
-        mat1_roi = mat1(Rect(src1x, src1y, roicols, roirows));
-        //mat2_roi = mat2(Rect(src2x,src2y,256,1));
-        mat2_roi = mat2(Rect(src2x, src2y, roicols, roirows));
-        mask_roi = mask(Rect(maskx, masky, roicols, roirows));
-        dst_roi  = dst(Rect(dstx, dsty, roicols, roirows));
-        dst1_roi = dst1(Rect(dstx, dsty, roicols, roirows));
-
-        //gdst_whole = dst;
-        //gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
-
-        //gdst1_whole = dst1;
-        //gdst1 = gdst1_whole(Rect(dstx,dsty,roicols,roirows));
-
-        //gmat1 = mat1_roi;
-        //gmat2 = mat2_roi;
-        //gmask = mask_roi;
-    }
-
-};
-struct Compare : CompareTestBase {};
-
-TEST_P(Compare, Mat)
-{
-    if(mat1.type() == CV_8SC1)
-    {
-        cout << "\tUnsupported type\t\n";
-    }
-
-    int cmp_codes[] = {CMP_EQ, CMP_GT, CMP_GE, CMP_LT, CMP_LE, CMP_NE};
-    const char *cmp_str[] = {"CMP_EQ", "CMP_GT", "CMP_GE", "CMP_LT", "CMP_LE", "CMP_NE"};
-    int cmp_num = sizeof(cmp_codes) / sizeof(int);
-    for (int i = 0; i < cmp_num; ++i)
-    {
-
-#ifndef PRINT_KERNEL_RUN_TIME
-        double totalcputick = 0;
-        double totalgputick = 0;
-        double totalgputick_kernel = 0;
-        double t0 = 0;
-        double t1 = 0;
-        double t2 = 0;
-        for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-        {
-            totalcputick = 0;
-            totalgputick = 0;
-            totalgputick_kernel = 0;
-            for(int j = 0; j < LOOP_TIMES + 1; j ++)
-            {
-                Has_roi(k);
-
-                t0 = (double)cvGetTickCount();//cpu start
-                cv::compare(mat1_roi, mat2_roi, dst_roi, cmp_codes[i]);
-                t0 = (double)cvGetTickCount() - t0;//cpu end
-
-                t1 = (double)cvGetTickCount();//gpu start1
-                gdst_whole = dst;
-                gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-                gmat1 = mat1_roi;
-                gmat2 = mat2_roi;
-                t2 = (double)cvGetTickCount(); //kernel
-                cv::ocl::compare(gmat1, gmat2, gdst, cmp_codes[i]);
-                t2 = (double)cvGetTickCount() - t2;//kernel
-                cv::Mat cpu_dst;
-                gdst_whole.download (cpu_dst);//download
-                t1 = (double)cvGetTickCount() - t1;//gpu end1
-                if(j == 0)
-                    continue;
-                totalgputick = t1 + totalgputick;
-                totalcputick = t0 + totalcputick;
-                totalgputick_kernel = t2 + totalgputick_kernel;
-
-            }
-            cout << cmp_str[i] << endl;
-            if(k == 0)
-            {
-                cout << "no roi\n";
-            }
-            else
-            {
-                cout << "with roi\n";
-            };
-            cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-            cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-            cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        }
-#else
-        for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-        {
-            Has_roi(j);
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-            gmat1 = mat1_roi;
-            gmat2 = mat2_roi;
-            if(j == 0)
-            {
-                cout << "no roi:";
-            }
-            else
-            {
-                cout << "\nwith roi:";
-            };
-            cv::ocl::compare(gmat1, gmat2, gdst, cmp_codes[i]);
-        };
-#endif
-    }
-
-}
-
-struct Pow : ArithmTestBase {};
-
-TEST_P(Pow, Mat)
-{
-    if(mat1.depth() != CV_32F && mat1.depth() != CV_64F)
-    {
-        cout << "\tUnsupported type\t\n";
-    }
-
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-            double p = 4.5;
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::pow(mat1_roi, p, dst_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::pow(gmat1, p, gdst);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        double p = 4.5;
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::pow(gmat1, p, gdst);
-    };
-#endif
-}
-
-
-struct MagnitudeSqr : ArithmTestBase {};
-
-TEST_P(MagnitudeSqr, Mat)
-{
-
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            for(int i = 0; i < mat1.rows; ++i)
-                for(int j = 0; j < mat1.cols; ++j)
+                for (int j = 0; j < src1.cols; ++j)
                 {
-                    float val1 = mat1.at<float>(i, j);
-                    float val2 = mat2.at<float>(i, j);
+                    float val1 = src1.at<float>(i, j);
+                    float val2 = src2.at<float>(i, j);
 
                     ((float *)(dst.data))[i * dst.step / 4 + j] = val1 * val1 + val2 * val2;
 
                 }
-            t0 = (double)cvGetTickCount() - t0;//cpu end
 
-            t1 = (double)cvGetTickCount();//gpu start1
-            cv::ocl::oclMat clmat1(mat1), clmat2(mat2), cldst;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::magnitudeSqr(clmat1, clmat2, cldst);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            cldst.download(cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            CPU_ON;
 
+            for (int i = 0; i < src1.rows; ++i)
+                for (int j = 0; j < src1.cols; ++j)
+                {
+                    float val1 = src1.at<float>(i, j);
+                    float val2 = src2.at<float>(i, j);
+
+                    ((float *)(dst.data))[i * dst.step / 4 + j] = val1 * val1 + val2 * val2;
+
+                }
+
+            CPU_OFF;
+            d_src1.upload(src1);
+            d_src2.upload(src2);
+
+            WARMUP_ON;
+            ocl::magnitudeSqr(d_src1, d_src2, d_dst);
+            WARMUP_OFF;
+
+            GPU_ON;
+            ocl::magnitudeSqr(d_src1, d_src2, d_dst);
+             ;
+            GPU_OFF;
+
+            GPU_FULL_ON;
+            d_src1.upload(src1);
+            d_src2.upload(src2);
+            ocl::magnitudeSqr(d_src1, d_src2, d_dst);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+
     }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        cv::ocl::oclMat clmat1(mat1), clmat2(mat2), cldst;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::magnitudeSqr(clmat1, clmat2, cldst);
-    };
-#endif
-
 }
 
-
-struct AddWeighted : ArithmTestBase {};
-
-TEST_P(AddWeighted, Mat)
+///////////// AddWeighted////////////////////////
+TEST(AddWeighted)  // perf test: runs cv::addWeighted and ocl::addWeighted for every size/type combination below
 {
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
+    Mat src1, src2, dst;
+    ocl::oclMat d_src1, d_src2, d_dst;
+
+    double alpha = 2.0, beta = 1.0, gama = 3.0;  // passed to addWeighted as the src1 weight, the src2 weight and the added scalar
+    int all_type[] = {CV_8UC1, CV_32FC1};
+    std::string type_name[] = {"CV_8UC1", "CV_32FC1"};  // printable names, index-aligned with all_type
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)  // square inputs; size grows by a factor of Multiple each pass
     {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            Has_roi(k);
-            double alpha = 2.0, beta = 1.0, gama = 3.0;
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
 
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::addWeighted(mat1_roi, alpha, mat2_roi, beta, gama, dst_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
+            gen(src1, size, size, all_type[j], 0, 256);
+            gen(src2, size, size, all_type[j], 0, 256);
+            gen(dst, size, size, all_type[j], 0, 256);
 
-            t1 = (double)cvGetTickCount();//gpu start1
 
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+            addWeighted(src1, alpha, src2, beta, gama, dst);  // extra call placed before CPU_ON (timing macros are defined elsewhere; presumably an untimed warm-up - confirm)
 
-            gmat1 = mat1_roi;
-            gmat2 = mat2_roi;
+            CPU_ON;
+            addWeighted(src1, alpha, src2, beta, gama, dst);
+            CPU_OFF;
+            d_src1.upload(src1);  // inputs are uploaded before WARMUP_ON/GPU_ON, so those sections contain only the ocl::addWeighted call
+            d_src2.upload(src2);
 
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::addWeighted(gmat1, alpha, gmat2, beta, gama, gdst);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download(cpu_dst);
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            WARMUP_ON;
+            ocl::addWeighted(d_src1, alpha, d_src2, beta, gama, d_dst);
+            WARMUP_OFF;
 
+            GPU_ON;
+            ocl::addWeighted(d_src1, alpha, d_src2, beta, gama, d_dst);
+             ;  // stray empty statement; has no effect
+            GPU_OFF;
+
+            GPU_FULL_ON;  // unlike the GPU_ON section, this one also contains both uploads and the result download
+            d_src1.upload(src1);
+            d_src2.upload(src2);
+            ocl::addWeighted(d_src1, alpha, d_src2, beta, gama, d_dst);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
 
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        double alpha = 2.0, beta = 1.0, gama = 3.0;
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        gmat2 = mat2_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::addWeighted(gmat1, alpha, gmat2, beta, gama, gdst);
-        // double alpha=2.0,beta=1.0,gama=3.0;
-        // cv::ocl::oclMat clmat1(mat1),clmat2(mat2),cldst;
-        // if(j==0){cout<<"no roi:";}else{cout<<"\nwith roi:";};
-        // cv::ocl::addWeighted(clmat1,alpha,clmat2,beta,gama, cldst);
-    };
-#endif
-
-}
-/*
-struct AddWeighted : ArithmTestBase {};
-
-TEST_P(AddWeighted, Mat)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick=0;
-    double totalgputick=0;
-    double totalgputick_kernel=0;
-    double t0=0;
-    double t1=0;
-    double t2=0;
-    for(int j = 0; j < LOOP_TIMES+1; j ++)
-    {
-        double alpha=2.0,beta=1.0,gama=3.0;
-
-        t0 = (double)cvGetTickCount();//cpu start
-        cv::addWeighted(mat1,alpha,mat2,beta,gama,dst);
-        t0 = (double)cvGetTickCount() - t0;//cpu end
-
-        t1 = (double)cvGetTickCount();//gpu start1
-        cv::ocl::oclMat clmat1(mat1),clmat2(mat2),cldst;
-
-        t2=(double)cvGetTickCount();//kernel
-        cv::ocl::addWeighted(clmat1,alpha,clmat2,beta,gama, cldst);
-        t2 = (double)cvGetTickCount() - t2;//kernel
-        cv::Mat cpu_dst;
-        cldst.download(cpu_dst);
-        t1 = (double)cvGetTickCount() - t1;//gpu end1
-        if(j == 0)
-            continue;
-        totalgputick=t1+totalgputick;
-        totalcputick=t0+totalcputick;
-        totalgputick_kernel=t2+totalgputick_kernel;
-
     }
-    cout << "average cpu runtime is  " << totalcputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
-    cout << "average gpu runtime is  " << totalgputick/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
-    cout << "average gpu runtime without data transfer is  " << totalgputick_kernel/((double)cvGetTickFrequency()* LOOP_TIMES *1000.) << "ms" << endl;
-
-#else
-    //for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    //	{
-    double alpha=2.0,beta=1.0,gama=3.0;
-    cv::ocl::oclMat clmat1(mat1),clmat2(mat2),cldst;
-    //if(j==0){cout<<"no roi:";}else{cout<<"\nwith roi:";};
-    cv::ocl::addWeighted(clmat1,alpha,clmat2,beta,gama, cldst);
-    //	};
-#endif
-
-}
-
-*/
-//********test****************
-
-INSTANTIATE_TEST_CASE_P(Arithm, Lut, Combine(
-                            Values(CV_8UC1, CV_8UC4),
-                            Values(false))); // Values(false) is the reserved parameter
-
-INSTANTIATE_TEST_CASE_P(Arithm, Exp, Combine(
-                            Values(CV_32FC1, CV_32FC1),
-                            Values(false))); // Values(false) is the reserved parameter
-
-INSTANTIATE_TEST_CASE_P(Arithm, Log, Combine(
-                            Values(CV_32FC1, CV_32FC1),
-                            Values(false))); // Values(false) is the reserved parameter
-
-INSTANTIATE_TEST_CASE_P(Arithm, Add, Combine(
-                            Values(CV_8UC1, CV_8UC4, CV_32FC1,  CV_32FC4),
-                            Values(false)));
-
-INSTANTIATE_TEST_CASE_P(Arithm, Mul, Combine(
-                            Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
-                            Values(false))); // Values(false) is the reserved parameter
-
-INSTANTIATE_TEST_CASE_P(Arithm, Div, Combine(
-                            Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
-                            Values(false))); // Values(false) is the reserved parameter
-
-
-INSTANTIATE_TEST_CASE_P(Arithm, Absdiff, Combine(
-                            Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
-                            Values(false))); // Values(false) is the reserved parameter
-
-INSTANTIATE_TEST_CASE_P(Arithm, CartToPolar, Combine(
-                            Values(CV_32FC1, CV_32FC4),
-                            Values(false))); // Values(false) is the reserved parameter
-
-INSTANTIATE_TEST_CASE_P(Arithm, PolarToCart, Combine(
-                            Values(CV_32FC1, CV_32FC4),
-                            Values(false))); // Values(false) is the reserved parameter
-
-INSTANTIATE_TEST_CASE_P(Arithm, Magnitude, Combine(
-                            Values(CV_32FC1, CV_32FC4),
-                            Values(false))); // Values(false) is the reserved parameter
-
-INSTANTIATE_TEST_CASE_P(Arithm, Transpose, Combine(
-                            Values(CV_8UC1, CV_8UC4, CV_32FC1),
-                            Values(false))); // Values(false) is the reserved parameter
-
-INSTANTIATE_TEST_CASE_P(Arithm, Flip, Combine(
-                            Values(CV_8UC1, CV_8UC4, CV_32SC1, CV_32FC1, CV_32FC4),
-                            Values(false))); // Values(false) is the reserved parameter
-
-INSTANTIATE_TEST_CASE_P(Arithm, MinMax, Combine(
-                            Values(CV_8UC1, CV_32FC1),
-                            Values(false)));
-
-INSTANTIATE_TEST_CASE_P(Arithm, MinMaxLoc, Combine(
-                            Values(CV_8UC1, CV_32FC1),
-                            Values(false)));
-
-INSTANTIATE_TEST_CASE_P(Arithm, Sum, Combine(
-                            Values(CV_8U, CV_32S, CV_32F),
-                            Values(false)));
-
-INSTANTIATE_TEST_CASE_P(Arithm, CountNonZero, Combine(
-                            Values(CV_8U, CV_32S, CV_32F),
-                            Values(false)));
-
-
-INSTANTIATE_TEST_CASE_P(Arithm, Phase, Combine(Values(CV_32FC1, CV_32FC4), Values(false)));
-//Values(false) is the reserved parameter
-
-
-INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_and, Combine(
-                            Values(CV_8UC1, CV_32SC1, CV_32SC4, CV_32FC1, CV_32FC4), Values(false)));
-//Values(false) is the reserved parameter
-
-INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_or, Combine(
-                            Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(false)));
-//Values(false) is the reserved parameter
-
-INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_xor, Combine(
-                            Values(CV_8UC1, CV_32SC1, CV_32FC1, CV_32FC4), Values(false)));
-//Values(false) is the reserved parameter
-
-INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_not, Combine(
-                            Values(CV_8UC1, CV_32SC1, CV_32FC1, CV_32FC4), Values(false)));
-//Values(false) is the reserved parameter
-
-INSTANTIATE_TEST_CASE_P(Arithm, Compare, Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1, CV_32SC1, CV_32FC1, CV_64FC1), Values(false)));
-//Values(false) is the reserved parameter
-
-INSTANTIATE_TEST_CASE_P(Arithm, Pow, Combine(Values(CV_32FC1, CV_32FC4), Values(false)));
-//Values(false) is the reserved parameter
-
-INSTANTIATE_TEST_CASE_P(Arithm, MagnitudeSqr, Combine(
-                            Values(CV_32FC1, CV_32FC1),
-                            Values(false))); // Values(false) is the reserved parameter
-
-INSTANTIATE_TEST_CASE_P(Arithm, AddWeighted, Combine(
-                            Values(CV_8UC1, CV_32SC1, CV_32FC1),
-                            Values(false))); // Values(false) is the reserved parameter
-
-
-
-
-#endif // HAVE_OPENCL
+}
\ No newline at end of file
diff --git a/modules/ocl/perf/perf_blend.cpp b/modules/ocl/perf/perf_blend.cpp
index f78f7d6b2..00034700b 100644
--- a/modules/ocl/perf/perf_blend.cpp
+++ b/modules/ocl/perf/perf_blend.cpp
@@ -44,79 +44,77 @@
 //M*/
 
 #include "precomp.hpp"
-#include <iomanip>
-
-#ifdef HAVE_OPENCL
-using namespace cv;
-using namespace cv::ocl;
-using namespace cvtest;
-using namespace testing;
-using namespace std;
-
-PARAM_TEST_CASE(Blend, MatType, int)
+///////////// blend ////////////////////////
+template <typename T>  // T is the per-channel element type read from img1/img2 and written to result_gold
+void blendLinearGold(const cv::Mat &img1, const cv::Mat &img2, const cv::Mat &weights1, const cv::Mat &weights2, cv::Mat &result_gold)
 {
-    int type;
-    int channels;
-    std::vector<cv::ocl::Info> oclinfo;
+    result_gold.create(img1.size(), img1.type());  // output gets the same size and type as img1
 
-    virtual void SetUp()
+    int cn = img1.channels();  // each row is walked as img1.cols * cn elements of T
+
+    for (int y = 0; y < img1.rows; ++y)
     {
+        const float *weights1_row = weights1.ptr<float>(y);  // weight rows are read as float, indexed per pixel (x / cn below)
+        const float *weights2_row = weights2.ptr<float>(y);
+        const T *img1_row = img1.ptr<T>(y);
+        const T *img2_row = img2.ptr<T>(y);
+        T *result_gold_row = result_gold.ptr<T>(y);
 
-        type = GET_PARAM(0);
-        channels = GET_PARAM(1);
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
-        //cv::ocl::setBinpath(CLBINPATH);
-    }
-};
-
-TEST_P(Blend, Performance)
-{
-    cv::Size size(MWIDTH, MHEIGHT);
-    cv::Mat img1_host = randomMat(size, CV_MAKETYPE(type, channels), 0, type == CV_8U ? 255.0 : 1.0);
-    cv::Mat img2_host = randomMat(size, CV_MAKETYPE(type, channels), 0, type == CV_8U ? 255.0 : 1.0);
-    cv::Mat weights1 = randomMat(size, CV_32F, 0, 1);
-    cv::Mat weights2 = randomMat(size, CV_32F, 0, 1);
-    cv::ocl::oclMat gimg1(size, CV_MAKETYPE(type, channels)), gimg2(size, CV_MAKETYPE(type, channels)), gweights1(size, CV_32F), gweights2(size, CV_32F);
-    cv::ocl::oclMat gdst(size, CV_MAKETYPE(type, channels));
-
-
-    double totalgputick_all = 0;
-    double totalgputick_kernel = 0;
-    double t1 = 0;
-    double t2 = 0;
-
-    for (int j = 0; j < LOOP_TIMES + 1; j ++) //LOOP_TIMES=100
-    {
-        t1 = (double)cvGetTickCount();
-        cv::ocl::oclMat gimg1 = cv::ocl::oclMat(img1_host);
-        cv::ocl::oclMat gimg2 = cv::ocl::oclMat(img2_host);
-        cv::ocl::oclMat gweights1 = cv::ocl::oclMat(weights1);
-        cv::ocl::oclMat gweights2 = cv::ocl::oclMat(weights1);
-
-        t2 = (double)cvGetTickCount();
-        cv::ocl::blendLinear(gimg1, gimg2, gweights1, gweights2, gdst);
-        t2 = (double)cvGetTickCount() - t2;
-
-        cv::Mat m;
-        gdst.download(m);
-        t1 = (double)cvGetTickCount() - t1;
-
-        if (j == 0)
+        for (int x = 0; x < img1.cols * cn; ++x)  // x indexes channel elements, not pixels
         {
-            continue;
+            float w1 = weights1_row[x / cn];  // x / cn is the pixel index, so all channels of a pixel share one weight
+            float w2 = weights2_row[x / cn];
+            result_gold_row[x] = static_cast<T>((img1_row[x] * w1 + img2_row[x] * w2) / (w1 + w2 + 1e-5f));  // weighted average; the 1e-5f term keeps the divisor non-zero when both weights are 0
         }
-
-        totalgputick_all = t1 + totalgputick_all;
-        totalgputick_kernel = t2 + totalgputick_kernel;
-    };
-
-    cout << "average gpu total  runtime is  " << totalgputick_all / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-
-    cout << "average gpu runtime without data transfering  is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-
+    }
 }
+TEST(blend)  // perf test: runs the CPU reference blendLinearGold and ocl::blendLinear for every size/type combination below
+{
+    Mat src1, src2, weights1, weights2, dst;
+    ocl::oclMat d_src1, d_src2, d_weights1, d_weights2, d_dst;
 
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Blend, Combine(
-                            Values(CV_8U, CV_32F), Values(1, 4)));
-#endif
\ No newline at end of file
+    int all_type[] = {CV_8UC1, CV_8UC4};  // both types have uchar channels, matching the blendLinearGold<uchar> calls below
+    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};  // printable names, index-aligned with all_type
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)  // square inputs; size grows by a factor of Multiple each pass
+    {
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
+        {
+            SUBTEST << size << 'x' << size << "; " << type_name[j] << " and CV_32FC1";
+
+            gen(src1, size, size, all_type[j], 0, 256);
+            gen(src2, size, size, all_type[j], 0, 256);
+            gen(weights1, size, size, CV_32FC1, 0, 1);  // weights are always single-channel float, whatever the image type
+            gen(weights2, size, size, CV_32FC1, 0, 1);
+
+            blendLinearGold<uchar>(src1, src2, weights1, weights2, dst);  // extra call placed before CPU_ON (timing macros are defined elsewhere; presumably an untimed warm-up - confirm)
+
+            CPU_ON;
+            blendLinearGold<uchar>(src1, src2, weights1, weights2, dst);
+            CPU_OFF;
+
+            d_src1.upload(src1);  // inputs are uploaded before WARMUP_ON/GPU_ON, so those sections contain only the ocl::blendLinear call
+            d_src2.upload(src2);
+            d_weights1.upload(weights1);
+            d_weights2.upload(weights2);
+
+            WARMUP_ON;
+            ocl::blendLinear(d_src1, d_src2, d_weights1, d_weights2, d_dst);
+            WARMUP_OFF;
+
+            GPU_ON;
+            ocl::blendLinear(d_src1, d_src2, d_weights1, d_weights2, d_dst);
+             ;  // stray empty statement; has no effect
+            GPU_OFF;
+
+            GPU_FULL_ON;  // unlike the GPU_ON section, this one also contains all four uploads and the result download
+            d_src1.upload(src1);
+            d_src2.upload(src2);
+            d_weights1.upload(weights1);
+            d_weights2.upload(weights2);
+            ocl::blendLinear(d_src1, d_src2, d_weights1, d_weights2, d_dst);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
+        }
+    }
+}
\ No newline at end of file
diff --git a/modules/ocl/perf/perf_brute_force_matcher.cpp b/modules/ocl/perf/perf_brute_force_matcher.cpp
new file mode 100644
index 000000000..6562f91e4
--- /dev/null
+++ b/modules/ocl/perf/perf_brute_force_matcher.cpp
@@ -0,0 +1,150 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Fangfang Bai, fangfang@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#include "precomp.hpp"
+
+// Benchmark: brute-force descriptor matching (match, knnMatch, radiusMatch) with BFMatcher and the ocl matcher.
+TEST(BruteForceMatcher)
+{
+    Mat trainIdx_cpu; // NOTE(review): these four *_cpu matrices are never referenced below
+    Mat distance_cpu;
+    Mat allDist_cpu;
+    Mat nMatches_cpu;
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
+    {
+        // Init CPU matcher
+        int desc_len = 64; // third argument of gen() below -- presumably the column count (descriptor length)
+
+        BFMatcher matcher(NORM_L2);
+
+        Mat query;
+        gen(query, size, desc_len, CV_32F, 0, 1); // gen() is defined elsewhere; presumably a random fill in [0, 1) -- TODO confirm
+
+        Mat train;
+        gen(train, size, desc_len, CV_32F, 0, 1);
+        // Output, reused by all three sub-benchmarks (the match() calls are given matches[0] only)
+        vector< vector<DMatch> > matches(2);
+        // Init GPU matcher
+        ocl::BruteForceMatcher_OCL_base d_matcher(ocl::BruteForceMatcher_OCL_base::L2Dist);
+
+        ocl::oclMat d_query(query);
+        ocl::oclMat d_train(train);
+
+        ocl::oclMat d_trainIdx, d_distance, d_allDist, d_nMatches; // outputs of the *Single calls below
+
+        SUBTEST << size << "; match"; // SUBTEST and the *_ON / *_OFF macros are defined outside this file; presumably they label and time a region
+
+        matcher.match(query, train, matches[0]); // same call once more, outside the CPU_ON / CPU_OFF pair
+
+        CPU_ON;
+        matcher.match(query, train, matches[0]);
+        CPU_OFF;
+
+        WARMUP_ON;
+        d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance);
+        WARMUP_OFF;
+
+        GPU_ON;
+        d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance);
+         ; // empty statement, no effect
+        GPU_OFF;
+
+        GPU_FULL_ON; // this region also contains the uploads of query and train
+        d_query.upload(query);
+        d_train.upload(train);
+        d_matcher.match(d_query, d_train, matches[0]);
+        GPU_FULL_OFF;
+
+        SUBTEST << size << "; knnMatch";
+
+        matcher.knnMatch(query, train, matches, 2); // last argument 2 is passed to every knn call in this sub-benchmark
+
+        CPU_ON;
+        matcher.knnMatch(query, train, matches, 2);
+        CPU_OFF;
+
+        WARMUP_ON;
+        d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, 2);
+        WARMUP_OFF;
+
+        GPU_ON;
+        d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, 2);
+         ; // empty statement, no effect
+        GPU_OFF;
+
+        GPU_FULL_ON; // as above: uploads are inside the region
+        d_query.upload(query);
+        d_train.upload(train);
+        d_matcher.knnMatch(d_query, d_train, matches, 2);
+        GPU_FULL_OFF;
+
+        SUBTEST << size << "; radiusMatch";
+
+        float max_distance = 2.0f; // passed to every radiusMatch* call below
+
+        matcher.radiusMatch(query, train, matches, max_distance);
+
+        CPU_ON;
+        matcher.radiusMatch(query, train, matches, max_distance);
+        CPU_OFF;
+
+        d_trainIdx.release(); // drops the buffer last written by knnMatchSingle; presumably radiusMatchSingle needs a fresh one -- TODO confirm
+
+        WARMUP_ON;
+        d_matcher.radiusMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_nMatches, max_distance);
+        WARMUP_OFF;
+
+        GPU_ON;
+        d_matcher.radiusMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_nMatches, max_distance);
+         ; // empty statement, no effect
+        GPU_OFF;
+
+        GPU_FULL_ON; // as above: uploads are inside the region
+        d_query.upload(query);
+        d_train.upload(train);
+        d_matcher.radiusMatch(d_query, d_train, matches, max_distance);
+        GPU_FULL_OFF;
+    }
+}
\ No newline at end of file
diff --git a/modules/ocl/perf/perf_canny.cpp b/modules/ocl/perf/perf_canny.cpp
index eb895df5e..428e036d0 100644
--- a/modules/ocl/perf/perf_canny.cpp
+++ b/modules/ocl/perf/perf_canny.cpp
@@ -42,112 +42,42 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-
 #include "precomp.hpp"
-#include <iomanip>
-#ifdef HAVE_OPENCL
-using namespace cv;
-using namespace cv::ocl;
-using namespace cvtest;
-using namespace testing;
-using namespace std;
 
-#ifndef MWC_TEST_UTILITY
-#define MWC_TEST_UTILITY
-
-// Param class
-#ifndef IMPLEMENT_PARAM_CLASS
-#define IMPLEMENT_PARAM_CLASS(name, type) \
-class name \
-    { \
-    public: \
-    name ( type arg = type ()) : val_(arg) {} \
-    operator type () const {return val_;} \
-    private: \
-    type val_; \
-    }; \
-    inline void PrintTo( name param, std::ostream* os) \
-    { \
-    *os << #name <<  "(" << testing::PrintToString(static_cast< type >(param)) << ")"; \
-    }
-
-IMPLEMENT_PARAM_CLASS(Channels, int)
-#endif // IMPLEMENT_PARAM_CLASS
-#endif // MWC_TEST_UTILITY
-
-////////////////////////////////////////////////////////
-// Canny1
-extern std::string workdir;
-IMPLEMENT_PARAM_CLASS(AppertureSize, int);
-IMPLEMENT_PARAM_CLASS(L2gradient, bool);
-
-PARAM_TEST_CASE(Canny1, AppertureSize, L2gradient)
+// Benchmark: Canny edge detection on aloeL.jpg loaded as grayscale, Canny vs. ocl::Canny (thresholds 50 / 100).
+TEST(Canny)
 {
-    int apperture_size;
-    bool useL2gradient;
-    //std::vector<cv::ocl::Info> oclinfo;
+    Mat img = imread(abspath("aloeL.jpg"), CV_LOAD_IMAGE_GRAYSCALE); // abspath() is defined outside this file
 
-    virtual void SetUp()
+    if (img.empty()) // nothing was loaded: abort this benchmark with an exception
     {
-        apperture_size = GET_PARAM(0);
-        useL2gradient = GET_PARAM(1);
-
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
-    }
-};
-
-TEST_P(Canny1, Performance)
-{
-    cv::Mat img = readImage(workdir + "fruits.jpg", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img.empty());
-
-    double low_thresh = 100.0;
-    double high_thresh = 150.0;
-
-    cv::Mat edges_gold;
-    cv::ocl::oclMat edges;
-
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-
-    double t1 = 0;
-    double t2 = 0;
-    for(int j = 0; j < LOOP_TIMES + 1; j ++)
-    {
-
-        t1 = (double)cvGetTickCount();//gpu start1
-
-        cv::ocl::oclMat ocl_img = cv::ocl::oclMat(img);//upload
-
-        t2 = (double)cvGetTickCount(); //kernel
-        cv::ocl::Canny(ocl_img, edges, low_thresh, high_thresh, apperture_size, useL2gradient);
-        t2 = (double)cvGetTickCount() - t2;//kernel
-
-        cv::Mat cpu_dst;
-        edges.download (cpu_dst);//download
-
-        t1 = (double)cvGetTickCount() - t1;//gpu end1
-
-        if(j == 0)
-            continue;
-
-        totalgputick = t1 + totalgputick;
-
-        totalgputick_kernel = t2 + totalgputick_kernel;
-
+        throw runtime_error("can't open aloeL.jpg");
     }
 
-    cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+    SUBTEST << img.cols << 'x' << img.rows << "; aloeL.jpg" << "; edges" << "; CV_8UC1"; // SUBTEST and the *_ON / *_OFF macros are defined outside this file
 
+    Mat edges(img.size(), CV_8UC1); // host-side output, same size as the input image
 
-}
+    CPU_ON; // NOTE(review): no untimed CPU call precedes this region, unlike the other benchmarks in this patch
+    Canny(img, edges, 50.0, 100.0);
+    CPU_OFF;
 
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Canny1, testing::Combine(
-                            testing::Values(AppertureSize(3), AppertureSize(5)),
-                            testing::Values(L2gradient(false), L2gradient(true))));
+    ocl::oclMat d_img(img);
+    ocl::oclMat d_edges;
+    ocl::CannyBuf d_buf; // passed to every ocl::Canny call below
 
+    WARMUP_ON;
+    ocl::Canny(d_img, d_buf, d_edges, 50.0, 100.0);
+    WARMUP_OFF;
 
+    GPU_ON;
+    ocl::Canny(d_img, d_buf, d_edges, 50.0, 100.0);
+     ; // empty statement, no effect
+    GPU_OFF;
 
-#endif  //Have opencl
\ No newline at end of file
+    GPU_FULL_ON; // this region also contains the upload of img and the download of the result
+    d_img.upload(img);
+    ocl::Canny(d_img, d_buf, d_edges, 50.0, 100.0);
+    d_edges.download(edges);
+    GPU_FULL_OFF;
+}
\ No newline at end of file
diff --git a/modules/ocl/perf/perf_color.cpp b/modules/ocl/perf/perf_color.cpp
new file mode 100644
index 000000000..e32a1839d
--- /dev/null
+++ b/modules/ocl/perf/perf_color.cpp
@@ -0,0 +1,91 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Fangfang Bai, fangfang@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#include "precomp.hpp"
+
+// Benchmark: CV_RGBA2GRAY conversion of CV_8UC4 input, cvtColor vs. ocl::cvtColor.
+TEST(cvtColor)
+{
+    Mat src, dst;
+    ocl::oclMat d_src, d_dst;
+
+    int all_type[] = {CV_8UC4}; // must stay index-aligned with type_name below
+    std::string type_name[] = {"CV_8UC4"};
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
+    {
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++) // one pass per entry of all_type
+        {
+            gen(src, size, size, all_type[j], 0, 256); // gen() is defined elsewhere; presumably a random fill in [0, 256) -- TODO confirm
+            SUBTEST << size << "x" << size << "; " << type_name[j] << " ; CV_RGBA2GRAY"; // SUBTEST and the *_ON / *_OFF macros are defined outside this file
+
+            cvtColor(src, dst, CV_RGBA2GRAY, 4); // same call once more, outside the CPU_ON / CPU_OFF pair
+
+            CPU_ON;
+            cvtColor(src, dst, CV_RGBA2GRAY, 4);
+            CPU_OFF;
+
+            d_src.upload(src);
+
+            WARMUP_ON;
+            ocl::cvtColor(d_src, d_dst, CV_RGBA2GRAY, 4);
+            WARMUP_OFF;
+
+            GPU_ON;
+            ocl::cvtColor(d_src, d_dst, CV_RGBA2GRAY, 4);
+             ; // empty statement, no effect
+            GPU_OFF;
+
+            GPU_FULL_ON; // this region also contains the upload of src and the download of the result
+            d_src.upload(src);
+            ocl::cvtColor(d_src, d_dst, CV_RGBA2GRAY, 4);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
+        }
+
+
+    }
+
+
+}
\ No newline at end of file
diff --git a/modules/ocl/perf/perf_columnsum.cpp b/modules/ocl/perf/perf_columnsum.cpp
index 96ea26a50..d2e3b45e5 100644
--- a/modules/ocl/perf/perf_columnsum.cpp
+++ b/modules/ocl/perf/perf_columnsum.cpp
@@ -15,8 +15,7 @@
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//	   Fangfang Bai fangfang@multicorewareinc.com
-//
+//    Fangfang Bai, fangfang@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -31,7 +30,7 @@
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
-// This software is provided by the copyright holders and contributors "as is" and
+// This software is provided by the copyright holders and contributors "as is" and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
@@ -43,78 +42,47 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-
 #include "precomp.hpp"
-#include <iomanip>
 
-using namespace cv;
-using namespace cv::ocl;
-using namespace cvtest;
-using namespace testing;
-using namespace std;
-
-///////////////////////////////////////////////////////////////////////////////
-/// ColumnSum
-
-#ifdef HAVE_OPENCL
-
-////////////////////////////////////////////////////////////////////////
-// ColumnSum
-
-PARAM_TEST_CASE(ColumnSum)
+// Benchmark: running sum down each column of a CV_32FC1 matrix, hand-written CPU loop vs. ocl::columnSum.
+TEST(columnSum)
 {
-    cv::Mat src;
-    //std::vector<cv::ocl::Info> oclinfo;
+    Mat src, dst;
+    ocl::oclMat d_src, d_dst;
 
-    virtual void SetUp()
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
+        SUBTEST << size << 'x' << size << "; CV_32FC1"; // SUBTEST and the *_ON / *_OFF macros are defined outside this file
+
+        gen(src, size, size, CV_32FC1, 0, 256); // gen() is defined elsewhere; presumably a random fill in [0, 256) -- TODO confirm
+
+        CPU_ON;
+        dst.create(src.size(), src.type());
+
+        for (int i = 0; i < src.rows; ++i) // starts at row 0 so the first row of dst is written as well
+        {
+            for (int j = 0; j < src.cols; ++j)
+            {
+                dst.at<float>(i, j) = src.at<float>(i, j) + (i > 0 ? dst.at<float>(i - 1, j) : 0.f); // accumulate in dst only: src is uploaded below and must not be modified here
+            }
+        }
+
+        CPU_OFF;
+
+        d_src.upload(src);
+        WARMUP_ON;
+        ocl::columnSum(d_src, d_dst);
+        WARMUP_OFF;
+
+        GPU_ON;
+        ocl::columnSum(d_src, d_dst);
+         ; // empty statement, no effect
+        GPU_OFF;
+
+        GPU_FULL_ON; // this region also contains the upload of src and the download of the result
+        d_src.upload(src);
+        ocl::columnSum(d_src, d_dst);
+        d_dst.download(dst);
+        GPU_FULL_OFF;
     }
-};
-
-TEST_F(ColumnSum, Performance)
-{
-    cv::Size size(MWIDTH, MHEIGHT);
-    cv::Mat src = randomMat(size, CV_32FC1);
-    cv::ocl::oclMat d_dst;
-
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t1 = 0;
-    double t2 = 0;
-
-    for(int j = 0; j < LOOP_TIMES + 1; j ++)
-    {
-
-        t1 = (double)cvGetTickCount();//gpu start1
-
-        cv::ocl::oclMat d_src(src);
-
-        t2 = (double)cvGetTickCount(); //kernel
-        cv::ocl::columnSum(d_src, d_dst);
-        t2 = (double)cvGetTickCount() - t2;//kernel
-
-        cv::Mat cpu_dst;
-        d_dst.download (cpu_dst);//download
-
-        t1 = (double)cvGetTickCount() - t1;//gpu end1
-
-        if(j == 0)
-            continue;
-
-        totalgputick = t1 + totalgputick;
-        totalgputick_kernel = t2 + totalgputick_kernel;
-
-    }
-
-    cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-
-
-
-}
-
-
-
-#endif
\ No newline at end of file
+}
\ No newline at end of file
diff --git a/modules/ocl/perf/perf_fft.cpp b/modules/ocl/perf/perf_fft.cpp
index c9c19d0d4..50be2546e 100644
--- a/modules/ocl/perf/perf_fft.cpp
+++ b/modules/ocl/perf/perf_fft.cpp
@@ -15,7 +15,7 @@
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//    Fangfangbai, fangfang@multicorewareinc.com
+//    Fangfang Bai, fangfang@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -42,85 +42,48 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-
 #include "precomp.hpp"
-using namespace std;
-#ifdef HAVE_CLAMDFFT
-////////////////////////////////////////////////////////////////////////////
-// Dft
-PARAM_TEST_CASE(Dft, cv::Size, bool)
+
+// Benchmark: forward DFT of square CV_32FC1 / CV_32FC2 matrices, dft vs. ocl::dft.
+TEST(dft)
 {
-    cv::Size dft_size;
-    bool	 dft_rows;
-    vector<cv::ocl::Info> info;
-    virtual void SetUp()
+    Mat src, dst;
+    ocl::oclMat d_src, d_dst;
+
+    int all_type[] = {CV_32FC1, CV_32FC2}; // must stay index-aligned with type_name below
+    std::string type_name[] = {"CV_32FC1", "CV_32FC2"};
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        dft_size = GET_PARAM(0);
-        dft_rows = GET_PARAM(1);
-        cv::ocl::getDevice(info);
-    }
-};
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++) // one pass per entry of all_type
+        {
+            SUBTEST << size << 'x' << size << "; " << type_name[j] << " ; complex-to-complex"; // NOTE(review): the same label is printed for the single-channel CV_32FC1 pass
 
-TEST_P(Dft, C2C)
-{
-    cv::Mat a = randomMat(dft_size, CV_32FC2, 0.0, 10.0);
-    int flags = 0;
-    flags |= dft_rows ? cv::DFT_ROWS : 0;
+            gen(src, size, size, all_type[j], Scalar::all(0), Scalar::all(1)); // gen() is defined elsewhere; presumably a random fill between the two scalars -- TODO confirm
 
-    cv::ocl::oclMat d_b;
+            dft(src, dst); // same call once more, outside the CPU_ON / CPU_OFF pair
 
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t1 = 0;
-    double t2 = 0;
+            CPU_ON;
+            dft(src, dst);
+            CPU_OFF;
 
-    for(int j = 0; j < LOOP_TIMES + 1; j ++)
-    {
+            d_src.upload(src);
 
-        t1 = (double)cvGetTickCount();//gpu start1
+            WARMUP_ON;
+            ocl::dft(d_src, d_dst, Size(size, size));
+            WARMUP_OFF;
 
-        cv::ocl::oclMat ga = cv::ocl::oclMat(a); //upload
+            GPU_ON;
+            ocl::dft(d_src, d_dst, Size(size, size));
+             ; // empty statement, no effect
+            GPU_OFF;
 
-        t2 = (double)cvGetTickCount(); //kernel
-        cv::ocl::dft(ga, d_b, a.size(), flags);
-        t2 = (double)cvGetTickCount() - t2;//kernel
-
-        cv::Mat cpu_dst;
-        d_b.download (cpu_dst);//download
-
-        t1 = (double)cvGetTickCount() - t1;//gpu end1
-
-        if(j == 0)
-            continue;
-
-        totalgputick = t1 + totalgputick;
-        totalgputick_kernel = t2 + totalgputick_kernel;
+            GPU_FULL_ON; // this region also contains the upload of src and the download of the result
+            d_src.upload(src);
+            ocl::dft(d_src, d_dst, Size(size, size));
+            d_dst.download(dst);
+            GPU_FULL_OFF;
+        }
 
     }
-
-    cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-}
-
-
-
-TEST_P(Dft, R2CthenC2R)
-{
-    cv::Mat a = randomMat(dft_size, CV_32FC1, 0.0, 10.0);
-
-    int flags = 0;
-    //flags |= dft_rows ? cv::DFT_ROWS : 0; // not supported yet
-
-    cv::ocl::oclMat d_b, d_c;
-
-    cv::ocl::dft(cv::ocl::oclMat(a), d_b, a.size(), flags);
-    cv::ocl::dft(d_b, d_c, a.size(), flags + cv::DFT_INVERSE + cv::DFT_REAL_OUTPUT);
-
-    EXPECT_MAT_NEAR(a, d_c, a.size().area() * 1e-4, "");
-}
-
-//INSTANTIATE_TEST_CASE_P(ocl_DFT, Dft, testing::Combine(
-//						testing::Values(cv::Size(1280, 1024), cv::Size(1920, 1080),cv::Size(1800, 1500)),
-//						testing::Values(false, true)));
-
-#endif // HAVE_CLAMDFFT
+}
\ No newline at end of file
diff --git a/modules/ocl/perf/perf_filters.cpp b/modules/ocl/perf/perf_filters.cpp
index 100a1c59d..e9646c77e 100644
--- a/modules/ocl/perf/perf_filters.cpp
+++ b/modules/ocl/perf/perf_filters.cpp
@@ -10,15 +10,12 @@
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//    Niko Li, newlife20080214@gmail.com
-//    Jia Haipeng, jiahaipeng95@gmail.com
-//    Zero Lin, Zero.Lin@amd.com
-//    Zhang Ying, zhangying913@gmail.com
+//    Fangfang Bai, fangfang@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -33,7 +30,7 @@
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
-// This software is provided by the copyright holders and contributors "as is" and
+// This software is provided by the copyright holders and contributors "as is" and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
@@ -45,1165 +42,331 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-
 #include "precomp.hpp"
 
-#ifdef HAVE_OPENCL
-
-using namespace cvtest;
-using namespace testing;
-using namespace std;
-//using namespace cv::ocl;
-
-PARAM_TEST_CASE(FilterTestBase, MatType, bool)
+// Benchmark: 3x3 box blur with BORDER_CONSTANT on CV_8UC1 / CV_8UC4 input, blur vs. ocl::blur.
+TEST(Blur)
 {
-    int type;
-    cv::Scalar val;
+    Mat src1, dst;
+    ocl::oclMat d_src1, d_dst;
 
-    //src mat
-    cv::Mat mat1;
-    cv::Mat mat2;
-    cv::Mat mask;
-    cv::Mat dst;
-    cv::Mat dst1; //bak, for two outputs
+    Size ksize = Size(3, 3); // kernel size passed to every blur call below
+    int bordertype = BORDER_CONSTANT;
+    int all_type[] = {CV_8UC1, CV_8UC4}; // must stay index-aligned with type_name below
+    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
 
-    // set up roi
-    int roicols;
-    int roirows;
-    int src1x;
-    int src1y;
-    int src2x;
-    int src2y;
-    int dstx;
-    int dsty;
-    int maskx;
-    int masky;
-
-    //src mat with roi
-    cv::Mat mat1_roi;
-    cv::Mat mat2_roi;
-    cv::Mat mask_roi;
-    cv::Mat dst_roi;
-    cv::Mat dst1_roi; //bak
-    //std::vector<cv::ocl::Info> oclinfo;
-    //ocl dst mat for testing
-    cv::ocl::oclMat gdst_whole;
-    cv::ocl::oclMat gdst1_whole; //bak
-
-    //ocl mat with roi
-    cv::ocl::oclMat gmat1;
-    cv::ocl::oclMat gmat2;
-    cv::ocl::oclMat gdst;
-    cv::ocl::oclMat gdst1;   //bak
-    cv::ocl::oclMat gmask;
-
-    virtual void SetUp()
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        type = GET_PARAM(0);
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++) // one pass per entry of all_type
+        {
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ; // SUBTEST and the *_ON / *_OFF macros are defined outside this file
 
-        cv::RNG &rng = TS::ptr()->get_rng();
-        cv::Size size(MWIDTH, MHEIGHT);
+            gen(src1, size, size, all_type[j], 0, 256); // gen() is defined elsewhere; presumably a random fill in [0, 256) -- TODO confirm
+            gen(dst, size, size, all_type[j], 0, 256); // dst is generated as well before being passed as the blur output
 
-        mat1 = randomMat(rng, size, type, 5, 16, false);
-        mat2 = randomMat(rng, size, type, 5, 16, false);
-        dst  = randomMat(rng, size, type, 5, 16, false);
-        dst1  = randomMat(rng, size, type, 5, 16, false);
-        mask = randomMat(rng, size, CV_8UC1, 0, 2,  false);
 
-        cv::threshold(mask, mask, 0.5, 255., CV_8UC1);
+            blur(src1, dst, ksize, Point(-1, -1), bordertype); // same call once more, outside the CPU_ON / CPU_OFF pair
+
+            CPU_ON;
+            blur(src1, dst, ksize, Point(-1, -1), bordertype);
+            CPU_OFF;
+
+            d_src1.upload(src1);
+
+            WARMUP_ON;
+            ocl::blur(d_src1, d_dst, ksize, Point(-1, -1), bordertype);
+            WARMUP_OFF;
+
+            GPU_ON;
+            ocl::blur(d_src1, d_dst, ksize, Point(-1, -1), bordertype);
+             ; // empty statement, no effect
+            GPU_OFF;
+
+            GPU_FULL_ON; // this region also contains the upload of src1 and the download of the result
+            d_src1.upload(src1);
+            ocl::blur(d_src1, d_dst, ksize, Point(-1, -1), bordertype);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
+        }
 
-        val = cv::Scalar(rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0));
     }
-
-    void random_roi()
-    {
-        cv::RNG &rng = TS::ptr()->get_rng();
-
-        //randomize ROI
-        roicols = rng.uniform(1, mat1.cols);
-        roirows = rng.uniform(1, mat1.rows);
-        src1x   = rng.uniform(0, mat1.cols - roicols);
-        src1y   = rng.uniform(0, mat1.rows - roirows);
-        src2x   = rng.uniform(0, mat2.cols - roicols);
-        src2y   = rng.uniform(0, mat2.rows - roirows);
-        dstx    = rng.uniform(0, dst.cols  - roicols);
-        dsty    = rng.uniform(0, dst.rows  - roirows);
-        maskx   = rng.uniform(0, mask.cols - roicols);
-        masky   = rng.uniform(0, mask.rows - roirows);
-
-        mat1_roi = mat1(Rect(src1x, src1y, roicols, roirows));
-        mat2_roi = mat2(Rect(src2x, src2y, roicols, roirows));
-        mask_roi = mask(Rect(maskx, masky, roicols, roirows));
-        dst_roi  = dst(Rect(dstx, dsty, roicols, roirows));
-        dst1_roi = dst1(Rect(dstx, dsty, roicols, roirows));
-
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-        gdst1_whole = dst1;
-        gdst1 = gdst1_whole(Rect(dstx, dsty, roicols, roirows));
-
-        gmat1 = mat1_roi;
-        gmat2 = mat2_roi;
-        gmask = mask_roi;
-    }
-
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// blur
-
-PARAM_TEST_CASE(Blur, MatType, cv::Size, int)
+}
+// Benchmark: Laplacian with ksize 3 on CV_8UC1 / CV_8UC4 input, Laplacian vs. ocl::Laplacian.
+TEST(Laplacian)
 {
-    int type;
-    cv::Size ksize;
-    int bordertype;
+    Mat src1, dst;
+    ocl::oclMat d_src1, d_dst;
 
-    //src mat
-    cv::Mat mat1;
-    cv::Mat dst;
+    int ksize = 3; // passed as the fourth argument of every Laplacian call below
+    int all_type[] = {CV_8UC1, CV_8UC4}; // must stay index-aligned with type_name below
+    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
 
-    // set up roi
-    int roicols;
-    int roirows;
-    int src1x;
-    int src1y;
-    int dstx;
-    int dsty;
-
-    //src mat with roi
-    cv::Mat mat1_roi;
-    cv::Mat dst_roi;
-    //std::vector<cv::ocl::Info> oclinfo;
-    //ocl dst mat for testing
-    cv::ocl::oclMat gdst_whole;
-
-    //ocl mat with roi
-    cv::ocl::oclMat gmat1;
-    cv::ocl::oclMat gdst;
-
-    virtual void SetUp()
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        type = GET_PARAM(0);
-        ksize = GET_PARAM(1);
-        bordertype = GET_PARAM(2);
-
-        cv::RNG &rng = TS::ptr()->get_rng();
-        cv::Size size(MWIDTH, MHEIGHT);
-
-        mat1 = randomMat(rng, size, type, 5, 16, false);
-        dst  = randomMat(rng, size, type, 5, 16, false);
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
-        ////if you want to use undefault device, set it here
-        ////setDevice(oclinfo[0]);
-        //cv::ocl::setBinpath(CLBINPATH);
-    }
-
-
-    void Has_roi(int b)
-    {
-        if(b)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++) // one pass per entry of all_type
         {
-            roicols =  mat1.cols - 1;
-            roirows = mat1.rows - 1;
-            src1x   = 1;
-            src1y   = 1;
-            dstx    = 1;
-            dsty    = 1;
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ; // SUBTEST and the *_ON / *_OFF macros are defined outside this file
+
+            gen(src1, size, size, all_type[j], 0, 256); // gen() is defined elsewhere; presumably a random fill in [0, 256) -- TODO confirm
+            gen(dst, size, size, all_type[j], 0, 256); // dst is generated as well before being passed as the output
+
+
+            Laplacian(src1, dst, -1, ksize, 1); // same call once more, outside the CPU_ON / CPU_OFF pair
+
+            CPU_ON;
+            Laplacian(src1, dst, -1, ksize, 1);
+            CPU_OFF;
+
+            d_src1.upload(src1);
+
+            WARMUP_ON;
+            ocl::Laplacian(d_src1, d_dst, -1, ksize, 1);
+            WARMUP_OFF;
+
+            GPU_ON;
+            ocl::Laplacian(d_src1, d_dst, -1, ksize, 1);
+             ; // empty statement, no effect
+            GPU_OFF;
+
+            GPU_FULL_ON; // this region also contains the upload of src1 and the download of the result
+            d_src1.upload(src1);
+            ocl::Laplacian(d_src1, d_dst, -1, ksize, 1);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-        else
-        {
-            roicols = mat1.cols;
-            roirows = mat1.rows;
-            src1x = 0;
-            src1y = 0;
-            dstx = 0;
-            dsty = 0;
-        };
-
-        mat1_roi = mat1(Rect(src1x, src1y, roicols, roirows));
-        dst_roi  = dst(Rect(dstx, dsty, roicols, roirows));
 
     }
-
-};
-
-TEST_P(Blur, Mat)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::blur(mat1_roi, dst_roi, ksize, Point(-1, -1), bordertype);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::blur(gmat1, gdst, ksize, Point(-1, -1), bordertype);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-
-            if(j == 0)
-                continue;
-
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::blur(gmat1, gdst, ksize, Point(-1, -1), bordertype);
-    };
-#endif
-
 }
 
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//Laplacian
-
-PARAM_TEST_CASE(LaplacianTestBase, MatType, int)
+///////////// Erode ////////////////////
+TEST(Erode)
 {
-    int type;
-    int ksize;
+    Mat src, dst, ker;
+    ocl::oclMat d_src, d_dst;
 
-    //src mat
-    cv::Mat mat;
-    cv::Mat dst;
+    int all_type[] = {CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4};
+    std::string type_name[] = {"CV_8UC1", "CV_8UC4", "CV_32FC1", "CV_32FC4"};
 
-    // set up roi
-    int roicols;
-    int roirows;
-    int srcx;
-    int srcy;
-    int dstx;
-    int dsty;
-
-    //src mat with roi
-    cv::Mat mat_roi;
-    cv::Mat dst_roi;
-    std::vector<cv::ocl::Info> oclinfo;
-    //ocl dst mat for testing
-    cv::ocl::oclMat gdst_whole;
-
-    //ocl mat with roi
-    cv::ocl::oclMat gmat;
-    cv::ocl::oclMat gdst;
-
-    virtual void SetUp()
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        type = GET_PARAM(0);
-        ksize = GET_PARAM(1);
-
-        cv::RNG &rng = TS::ptr()->get_rng();
-        cv::Size size = cv::Size(MWIDTH, MHEIGHT);
-
-        mat  = randomMat(rng, size, type, 5, 16, false);
-        dst  = randomMat(rng, size, type, 5, 16, false);
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
-        ////if you want to use undefault device, set it here
-        ////setDevice(oclinfo[0]);
-        //cv::ocl::setBinpath(CLBINPATH);
-    }
-
-    void Has_roi(int b)
-    {
-        if(b)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            roicols =  mat.cols - 1;
-            roirows = mat.rows - 1;
-            srcx   = 1;
-            srcy   = 1;
-            dstx    = 1;
-            dsty    = 1;
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
+
+            gen(src, size, size, all_type[j], Scalar::all(0), Scalar::all(256));
+            ker = getStructuringElement(MORPH_RECT, Size(3, 3));
+
+            erode(src, dst, ker);
+
+            CPU_ON;
+            erode(src, dst, ker);
+            CPU_OFF;
+
+            d_src.upload(src);
+
+            WARMUP_ON;
+            ocl::erode(d_src, d_dst, ker);
+            WARMUP_OFF;
+
+            GPU_ON;
+            ocl::erode(d_src, d_dst, ker);
+             ;
+            GPU_OFF;
+
+            GPU_FULL_ON;
+            d_src.upload(src);
+            ocl::erode(d_src, d_dst, ker);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-        else
-        {
-            roicols = mat.cols;
-            roirows = mat.rows;
-            srcx = 0;
-            srcy = 0;
-            dstx = 0;
-            dsty = 0;
-        };
-
-        mat_roi = mat(Rect(srcx, srcy, roicols, roirows));
-        dst_roi  = dst(Rect(dstx, dsty, roicols, roirows));
 
     }
-
-};
-
-struct Laplacian : LaplacianTestBase {};
-
-TEST_P(Laplacian, Accuracy)
-{
-
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::Laplacian(mat_roi, dst_roi, -1, ksize, 1);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat = mat_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::Laplacian(gmat, gdst, -1, ksize, 1);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-
-            if(j == 0)
-                continue;
-
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat = mat_roi;
-
-
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::Laplacian(gmat, gdst, -1, ksize, 1);
-    };
-#endif
 }
 
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// erode & dilate
-
-PARAM_TEST_CASE(ErodeDilateBase, MatType, bool)
+///////////// Sobel ////////////////////////
+TEST(Sobel)
 {
-    int type;
-    //int iterations;
+    Mat src, dst;
+    ocl::oclMat d_src, d_dst;
 
-    //erode or dilate kernel
-    cv::Mat kernel;
+    int dx = 1;
+    int dy = 1;
+    int all_type[] = {CV_8UC1, CV_8UC4};
+    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
 
-    //src mat
-    cv::Mat mat1;
-    cv::Mat dst;
-
-    // set up roi
-    int roicols;
-    int roirows;
-    int src1x;
-    int src1y;
-    int dstx;
-    int dsty;
-
-    //src mat with roi
-    cv::Mat mat1_roi;
-    cv::Mat dst_roi;
-    std::vector<cv::ocl::Info> oclinfo;
-    //ocl dst mat for testing
-    cv::ocl::oclMat gdst_whole;
-
-    //ocl mat with roi
-    cv::ocl::oclMat gmat1;
-    cv::ocl::oclMat gdst;
-
-    virtual void SetUp()
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        type = GET_PARAM(0);
-        //  iterations = GET_PARAM(1);
-
-        cv::RNG &rng = TS::ptr()->get_rng();
-        cv::Size size = cv::Size(MWIDTH, MHEIGHT);
-
-        mat1 = randomMat(rng, size, type, 5, 16, false);
-        dst  = randomMat(rng, size, type, 5, 16, false);
-        //		rng.fill(kernel, cv::RNG::UNIFORM, cv::Scalar::all(0), cv::Scalar::all(3));
-        kernel = randomMat(rng, Size(3, 3), CV_8UC1, 0, 3, false);
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
-        ////if you want to use undefault device, set it here
-        ////setDevice(oclinfo[0]);
-        //cv::ocl::setBinpath(CLBINPATH);
-    }
-
-    void Has_roi(int b)
-    {
-        if(b)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            roicols =  mat1.cols - 1;
-            roirows = mat1.rows - 1;
-            src1x   = 1;
-            src1y   = 1;
-            dstx    = 1;
-            dsty    = 1;
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
+
+            gen(src, size, size, all_type[j], 0, 256);
+
+            Sobel(src, dst, -1, dx, dy);
+
+            CPU_ON;
+            Sobel(src, dst, -1, dx, dy);
+            CPU_OFF;
+
+            d_src.upload(src);
+
+            WARMUP_ON;
+            ocl::Sobel(d_src, d_dst, -1, dx, dy);
+            WARMUP_OFF;
+
+            GPU_ON;
+            ocl::Sobel(d_src, d_dst, -1, dx, dy);
+             ;
+            GPU_OFF;
+
+            GPU_FULL_ON;
+            d_src.upload(src);
+            ocl::Sobel(d_src, d_dst, -1, dx, dy);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-        else
-        {
-            roicols = mat1.cols;
-            roirows = mat1.rows;
-            src1x = 0;
-            src1y = 0;
-            dstx = 0;
-            dsty = 0;
-        };
-
-        mat1_roi = mat1(Rect(src1x, src1y, roicols, roirows));
-        dst_roi  = dst(Rect(dstx, dsty, roicols, roirows));
 
     }
-
-};
-
-// erode
-
-struct Erode : ErodeDilateBase {};
-
-TEST_P(Erode, Mat)
+}
+///////////// Scharr ////////////////////////
+TEST(Scharr)
 {
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
+    Mat src, dst;
+    ocl::oclMat d_src, d_dst;
+
+    int dx = 1;
+    int dy = 0;
+    int all_type[] = {CV_8UC1, CV_8UC4};
+    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            Has_roi(k);
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
 
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::erode(mat1_roi, dst_roi, kernel);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
+            gen(src, size, size, all_type[j], 0, 256);
 
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+            Scharr(src, dst, -1, dx, dy);
 
-            gmat1 = mat1_roi;
+            CPU_ON;
+            Scharr(src, dst, -1, dx, dy);
+            CPU_OFF;
 
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::erode(gmat1, gdst, kernel);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
+            d_src.upload(src);
 
-            if(j == 0)
-                continue;
+            WARMUP_ON;
+            ocl::Scharr(d_src, d_dst, -1, dx, dy);
+            WARMUP_OFF;
 
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            GPU_ON;
+            ocl::Scharr(d_src, d_dst, -1, dx, dy);
+             ;
+            GPU_OFF;
 
+            GPU_FULL_ON;
+            d_src.upload(src);
+            ocl::Scharr(d_src, d_dst, -1, dx, dy);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+
     }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::erode(gmat1, gdst, kernel);
-    };
-#endif
-
 }
 
-// dilate
-
-struct Dilate : ErodeDilateBase {};
-
-TEST_P(Dilate, Mat)
+///////////// GaussianBlur ////////////////////////
+TEST(GaussianBlur)
 {
+    Mat src, dst;
+    int all_type[] = {CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4};
+    std::string type_name[] = {"CV_8UC1", "CV_8UC4", "CV_32FC1", "CV_32FC4"};
 
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            Has_roi(k);
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::dilate(mat1_roi, dst_roi, kernel);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
 
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+            gen(src, size, size, all_type[j], 0, 256);
 
-            gmat1 = mat1_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::dilate(gmat1, gdst, kernel);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
+            GaussianBlur(src, dst, Size(9, 9), 0);
 
-            if(j == 0)
-                continue;
+            CPU_ON;
+            GaussianBlur(src, dst, Size(9, 9), 0);
+            CPU_OFF;
 
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            ocl::oclMat d_src(src);
+            ocl::oclMat d_dst(src.size(), src.type());
+            ocl::oclMat d_buf;
 
+            WARMUP_ON;
+            ocl::GaussianBlur(d_src, d_dst, Size(9, 9), 0);
+            WARMUP_OFF;
+
+            GPU_ON;
+            ocl::GaussianBlur(d_src, d_dst, Size(9, 9), 0);
+             ;
+            GPU_OFF;
+
+            GPU_FULL_ON;
+            d_src.upload(src);
+            ocl::GaussianBlur(d_src, d_dst, Size(9, 9), 0);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+
     }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::dilate(gmat1, gdst, kernel);
-    };
-#endif
-
 }
 
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// Sobel
-
-PARAM_TEST_CASE(Sobel, MatType, int, int, int, int)
+///////////// filter2D ////////////////////////
+TEST(filter2D)
 {
-    int type;
-    int dx, dy, ksize, bordertype;
+    Mat src;
 
-    //src mat
-    cv::Mat mat1;
-    cv::Mat dst;
-
-    // set up roi
-    int roicols;
-    int roirows;
-    int src1x;
-    int src1y;
-    int dstx;
-    int dsty;
-
-    //src mat with roi
-    cv::Mat mat1_roi;
-    cv::Mat dst_roi;
-    //std::vector<cv::ocl::Info> oclinfo;
-    //ocl dst mat for testing
-    cv::ocl::oclMat gdst_whole;
-
-    //ocl mat with roi
-    cv::ocl::oclMat gmat1;
-    cv::ocl::oclMat gdst;
-
-    virtual void SetUp()
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        type = GET_PARAM(0);
-        dx = GET_PARAM(1);
-        dy = GET_PARAM(2);
-        ksize = GET_PARAM(3);
-        bordertype = GET_PARAM(4);
-        dx = 2;
-        dy = 0;
+        int all_type[] = {CV_8UC1, CV_8UC4};
+        std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
 
-        cv::RNG &rng = TS::ptr()->get_rng();
-        cv::Size size = cv::Size(MWIDTH, MHEIGHT);
-
-        mat1 = randomMat(rng, size, type, 5, 16, false);
-        dst  = randomMat(rng, size, type, 5, 16, false);
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
-        ////if you want to use undefault device, set it here
-        ////setDevice(oclinfo[0]);
-        //cv::ocl::setBinpath(CLBINPATH);
-    }
-
-    void Has_roi(int b)
-    {
-        if(b)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            roicols =  mat1.cols - 1;
-            roirows = mat1.rows - 1;
-            src1x   = 1;
-            src1y   = 1;
-            dstx    = 1;
-            dsty    = 1;
+            gen(src, size, size, all_type[j], 0, 256);
+
+            for (int ksize = 3; ksize <= 15; ksize = 2*ksize+1)
+            {
+                SUBTEST << "ksize = " << ksize << "; " << size << 'x' << size << "; " << type_name[j] ;
+
+                Mat kernel;
+                gen(kernel, ksize, ksize, CV_32FC1, 0.0, 1.0);
+
+                Mat dst;
+                cv::filter2D(src, dst, -1, kernel);
+
+                CPU_ON;
+                cv::filter2D(src, dst, -1, kernel);
+                CPU_OFF;
+
+                ocl::oclMat d_src(src);
+                ocl::oclMat d_dst;
+
+                WARMUP_ON;
+                ocl::filter2D(d_src, d_dst, -1, kernel);
+                WARMUP_OFF;
+
+                GPU_ON;
+                ocl::filter2D(d_src, d_dst, -1, kernel);
+                 ;
+                GPU_OFF;
+
+                GPU_FULL_ON;
+                d_src.upload(src);
+                ocl::filter2D(d_src, d_dst, -1, kernel);
+                d_dst.download(dst);
+                GPU_FULL_OFF;
+            }
+
         }
-        else
-        {
-            roicols = mat1.cols;
-            roirows = mat1.rows;
-            src1x = 0;
-            src1y = 0;
-            dstx = 0;
-            dsty = 0;
-        };
 
-        mat1_roi = mat1(Rect(src1x, src1y, roicols, roirows));
-        dst_roi  = dst(Rect(dstx, dsty, roicols, roirows));
 
     }
-
-};
-
-TEST_P(Sobel, Mat)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::Sobel(mat1_roi, dst_roi, -1, dx, dy, ksize, /*scale*/0.00001,/*delta*/0, bordertype);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::Sobel(gmat1, gdst, -1, dx, dy, ksize,/*scale*/0.00001,/*delta*/0, bordertype);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-
-            if(j == 0)
-                continue;
-
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::Sobel(gmat1, gdst, -1, dx, dy, ksize,/*scale*/0.00001,/*delta*/0, bordertype);
-    };
-#endif
-
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// Scharr
-
-PARAM_TEST_CASE(Scharr, MatType, int, int, int)
-{
-    int type;
-    int dx, dy, bordertype;
-
-    //src mat
-    cv::Mat mat1;
-    cv::Mat dst;
-
-    // set up roi
-    int roicols;
-    int roirows;
-    int src1x;
-    int src1y;
-    int dstx;
-    int dsty;
-
-    //src mat with roi
-    cv::Mat mat1_roi;
-    cv::Mat dst_roi;
-    //std::vector<cv::ocl::Info> oclinfo;
-    //ocl dst mat for testing
-    cv::ocl::oclMat gdst_whole;
-
-    //ocl mat with roi
-    cv::ocl::oclMat gmat1;
-    cv::ocl::oclMat gdst;
-
-    virtual void SetUp()
-    {
-        type = GET_PARAM(0);
-        dx = GET_PARAM(1);
-        dy = GET_PARAM(2);
-        bordertype = GET_PARAM(3);
-        dx = 1;
-        dy = 0;
-
-        cv::RNG &rng = TS::ptr()->get_rng();
-        cv::Size size = cv::Size(MWIDTH, MHEIGHT);
-
-        mat1 = randomMat(rng, size, type, 5, 16, false);
-        dst  = randomMat(rng, size, type, 5, 16, false);
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
-        ////if you want to use undefault device, set it here
-        ////setDevice(oclinfo[0]);
-        //cv::ocl::setBinpath(CLBINPATH);
-    }
-
-    void Has_roi(int b)
-    {
-        if(b)
-        {
-            roicols =  mat1.cols - 1;
-            roirows = mat1.rows - 1;
-            src1x   = 1;
-            src1y   = 1;
-            dstx    = 1;
-            dsty    = 1;
-        }
-        else
-        {
-            roicols = mat1.cols;
-            roirows = mat1.rows;
-            src1x = 0;
-            src1y = 0;
-            dstx = 0;
-            dsty = 0;
-        };
-
-        mat1_roi = mat1(Rect(src1x, src1y, roicols, roirows));
-        dst_roi  = dst(Rect(dstx, dsty, roicols, roirows));
-
-    }
-};
-
-TEST_P(Scharr, Mat)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::Scharr(mat1_roi, dst_roi, -1, dx, dy, /*scale*/1,/*delta*/0, bordertype);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::Scharr(gmat1, gdst, -1, dx, dy,/*scale*/1,/*delta*/0, bordertype);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-
-            if(j == 0)
-                continue;
-
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::Scharr(gmat1, gdst, -1, dx, dy,/*scale*/1,/*delta*/0, bordertype);
-    };
-#endif
-
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// GaussianBlur
-
-PARAM_TEST_CASE(GaussianBlur, MatType, cv::Size, int)
-{
-    int type;
-    cv::Size ksize;
-    int bordertype;
-
-    double sigma1, sigma2;
-
-    //src mat
-    cv::Mat mat1;
-    cv::Mat dst;
-
-    // set up roi
-    int roicols;
-    int roirows;
-    int src1x;
-    int src1y;
-    int dstx;
-    int dsty;
-
-    //src mat with roi
-    cv::Mat mat1_roi;
-    cv::Mat dst_roi;
-    //std::vector<cv::ocl::Info> oclinfo;
-    //ocl dst mat for testing
-    cv::ocl::oclMat gdst_whole;
-
-    //ocl mat with roi
-    cv::ocl::oclMat gmat1;
-    cv::ocl::oclMat gdst;
-
-    virtual void SetUp()
-    {
-        type = GET_PARAM(0);
-        ksize = GET_PARAM(1);
-        bordertype = GET_PARAM(2);
-
-        cv::RNG &rng = TS::ptr()->get_rng();
-        cv::Size size = cv::Size(MWIDTH, MHEIGHT);
-
-        sigma1 = rng.uniform(0.1, 1.0);
-        sigma2 = rng.uniform(0.1, 1.0);
-
-        mat1 = randomMat(rng, size, type, 5, 16, false);
-        dst  = randomMat(rng, size, type, 5, 16, false);
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
-        ////if you want to use undefault device, set it here
-        ////setDevice(oclinfo[0]);
-        //cv::ocl::setBinpath(CLBINPATH);
-    }
-
-    void Has_roi(int b)
-    {
-        if(b)
-        {
-            roicols =  mat1.cols - 1;
-            roirows = mat1.rows - 1;
-            src1x   = 1;
-            src1y   = 1;
-            dstx    = 1;
-            dsty    = 1;
-        }
-        else
-        {
-            roicols = mat1.cols;
-            roirows = mat1.rows;
-            src1x = 0;
-            src1y = 0;
-            dstx = 0;
-            dsty = 0;
-        };
-
-        mat1_roi = mat1(Rect(src1x, src1y, roicols, roirows));
-        dst_roi  = dst(Rect(dstx, dsty, roicols, roirows));
-
-    }
-
-};
-
-TEST_P(GaussianBlur, Mat)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::GaussianBlur(mat1_roi, dst_roi, ksize, sigma1, sigma2, bordertype);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat1 = mat1_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::GaussianBlur(gmat1, gdst, ksize, sigma1, sigma2, bordertype);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-
-            if(j == 0)
-                continue;
-
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::GaussianBlur(gmat1, gdst, ksize, sigma1, sigma2, bordertype);
-    };
-#endif
-
-}
-
-//************test**********
-
-INSTANTIATE_TEST_CASE_P(Filter, Blur, Combine(Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
-                        Values(cv::Size(3, 3)/*, cv::Size(5, 5), cv::Size(7, 7)*/),
-                        Values((MatType)cv::BORDER_CONSTANT, (MatType)cv::BORDER_REPLICATE, (MatType)cv::BORDER_REFLECT, (MatType)cv::BORDER_REFLECT_101)));
-
-
-INSTANTIATE_TEST_CASE_P(Filters, Laplacian, Combine(
-                            Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
-                            Values(1/*, 3*/)));
-
-//INSTANTIATE_TEST_CASE_P(Filter, ErodeDilate, Combine(Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(1, 2, 3)));
-
-INSTANTIATE_TEST_CASE_P(Filter, Erode, Combine(Values(CV_8UC1, CV_8UC1), Values(false)));
-
-//INSTANTIATE_TEST_CASE_P(Filter, ErodeDilate, Combine(Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(1, 2, 3)));
-
-INSTANTIATE_TEST_CASE_P(Filter, Dilate, Combine(Values(CV_8UC1, CV_8UC1), Values(false)));
-
-
-INSTANTIATE_TEST_CASE_P(Filter, Sobel, Combine(Values(CV_8UC1, CV_32FC1),
-                        Values(1, 2), Values(0, 1), Values(3, 5), Values((MatType)cv::BORDER_CONSTANT,
-                                (MatType)cv::BORDER_REPLICATE)));
-
-
-INSTANTIATE_TEST_CASE_P(Filter, Scharr, Combine(
-                            Values(CV_8UC1,  CV_32FC1), Values(0, 1), Values(0, 1),
-                            Values((MatType)cv::BORDER_CONSTANT, (MatType)cv::BORDER_REPLICATE)));
-
-INSTANTIATE_TEST_CASE_P(Filter, GaussianBlur, Combine(
-                            Values(CV_8UC1,  CV_32FC1),
-                            Values(cv::Size(3, 3), cv::Size(5, 5)),
-                            Values((MatType)cv::BORDER_CONSTANT, (MatType)cv::BORDER_REPLICATE)));
-
-
-#endif // HAVE_OPENCL
+}
\ No newline at end of file
diff --git a/modules/ocl/perf/perf_gemm.cpp b/modules/ocl/perf/perf_gemm.cpp
index c3dcab34f..930ecb046 100644
--- a/modules/ocl/perf/perf_gemm.cpp
+++ b/modules/ocl/perf/perf_gemm.cpp
@@ -16,6 +16,7 @@
 //
 // @Authors
 //    Fangfang Bai, fangfang@multicorewareinc.com
+//
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
@@ -41,73 +42,47 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-
-
 #include "precomp.hpp"
-using namespace std;
-#ifdef HAVE_CLAMDBLAS
-////////////////////////////////////////////////////////////////////////////
-// GEMM
-PARAM_TEST_CASE(Gemm, int, cv::Size, int)
+
+///////////// gemm ////////////////////////
+TEST(gemm)
 {
-    int      type;
-    cv::Size mat_size;
-    int		 flags;
-    vector<cv::ocl::Info> info;
-    virtual void SetUp()
+    Mat src1, src2, src3, dst;
+    ocl::oclMat d_src1, d_src2, d_src3, d_dst;
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        type     = GET_PARAM(0);
-        mat_size = GET_PARAM(1);
-        flags    = GET_PARAM(2);
+        SUBTEST << size << 'x' << size;
 
-        cv::ocl::getDevice(info);
+        gen(src1, size, size, CV_32FC1, Scalar::all(-10), Scalar::all(10));
+        gen(src2, size, size, CV_32FC1, Scalar::all(-10), Scalar::all(10));
+        gen(src3, size, size, CV_32FC1, Scalar::all(-10), Scalar::all(10));
+
+        gemm(src1, src2, 1.0, src3, 1.0, dst);
+
+        CPU_ON;
+        gemm(src1, src2, 1.0, src3, 1.0, dst);
+        CPU_OFF;
+
+        d_src1.upload(src1);
+        d_src2.upload(src2);
+        d_src3.upload(src3);
+
+        WARMUP_ON;
+        ocl::gemm(d_src1, d_src2, 1.0, d_src3, 1.0, d_dst);
+        WARMUP_OFF;
+
+        GPU_ON;
+        ocl::gemm(d_src1, d_src2, 1.0, d_src3, 1.0, d_dst);
+         ;
+        GPU_OFF;
+
+        GPU_FULL_ON;
+        d_src1.upload(src1);
+        d_src2.upload(src2);
+        d_src3.upload(src3);
+        ocl::gemm(d_src1, d_src2, 1.0, d_src3, 1.0, d_dst);
+        d_dst.download(dst);
+        GPU_FULL_OFF;
     }
-};
-
-TEST_P(Gemm, Performance)
-{
-    cv::Mat a = randomMat(mat_size, type, 0.0, 10.0);
-    cv::Mat b = randomMat(mat_size, type, 0.0, 10.0);
-    cv::Mat c = randomMat(mat_size, type, 0.0, 10.0);
-    cv::ocl::oclMat ocl_dst;
-
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t1 = 0;
-    double t2 = 0;
-
-    for(int j = 0; j < LOOP_TIMES + 1; j ++)
-    {
-
-        t1 = (double)cvGetTickCount();//gpu start1
-
-        cv::ocl::oclMat ga = cv::ocl::oclMat(a);//upload
-        cv::ocl::oclMat gb = cv::ocl::oclMat(b);//upload
-        cv::ocl::oclMat gc = cv::ocl::oclMat(c);//upload
-
-        t2 = (double)cvGetTickCount(); //kernel
-        cv::ocl::gemm(ga, gb, 1.0, gc, 1.0, ocl_dst, flags);
-        t2 = (double)cvGetTickCount() - t2;//kernel
-
-        cv::Mat cpu_dst;
-        ocl_dst.download (cpu_dst);//download
-
-        t1 = (double)cvGetTickCount() - t1;//gpu end
-
-        if(j == 0)
-            continue;
-
-        totalgputick = t1 + totalgputick;
-        totalgputick_kernel = t2 + totalgputick_kernel;
-
-    }
-    cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-}
-
-
-INSTANTIATE_TEST_CASE_P(ocl_gemm, Gemm, testing::Combine(
-                            testing::Values(CV_32FC1, CV_32FC2/* , CV_64FC1, CV_64FC2*/),
-                            testing::Values(cv::Size(512, 512), cv::Size(1024, 1024)),
-                            testing::Values(0, (int)cv::GEMM_1_T, (int)cv::GEMM_2_T, (int)(cv::GEMM_1_T + cv::GEMM_2_T))));
-#endif
\ No newline at end of file
+}
\ No newline at end of file
diff --git a/modules/ocl/perf/perf_haar.cpp b/modules/ocl/perf/perf_haar.cpp
index 525b8fb49..5a909ace4 100644
--- a/modules/ocl/perf/perf_haar.cpp
+++ b/modules/ocl/perf/perf_haar.cpp
@@ -10,12 +10,12 @@
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//    Jia Haipeng, jiahaipeng95@gmail.com
+//    Fangfang Bai, fangfang@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -30,7 +30,7 @@
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
-// This software is provided by the copyright holders and contributors "as is" and
+// This software is provided by the copyright holders and contributors "as is" and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
@@ -42,133 +42,97 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-
-#include "opencv2/objdetect/objdetect.hpp"
 #include "precomp.hpp"
 
-#ifdef HAVE_OPENCL
+///////////// Haar ////////////////////////
+namespace cv
+{
+namespace ocl
+{
 
-using namespace cvtest;
-using namespace testing;
-using namespace std;
-using namespace cv;
-extern std::string workdir;
 struct getRect
 {
-    Rect operator ()(const CvAvgComp &e) const
+    Rect operator()(const CvAvgComp &e) const
     {
         return e.rect;
     }
 };
 
-PARAM_TEST_CASE(HaarTestBase, int, int)
+class CascadeClassifier_GPU : public OclCascadeClassifier
 {
-    //std::vector<cv::ocl::Info> oclinfo;
-    cv::ocl::OclCascadeClassifier cascade, nestedCascade;
-    cv::CascadeClassifier cpucascade, cpunestedCascade;
-    //    Mat img;
-
-    double scale;
-    int index;
-
-    virtual void SetUp()
+public:
+    void detectMultiScale(oclMat &image,
+                          CV_OUT std::vector<cv::Rect>& faces,
+                          double scaleFactor = 1.1,
+                          int minNeighbors = 3, int flags = 0,
+                          Size minSize = Size(),
+                          Size maxSize = Size())
     {
-        scale = 1.0;
-        index = 0;
-        string cascadeName = "../../../data/haarcascades/haarcascade_frontalface_alt.xml";
-
-        if( (!cascade.load( cascadeName )) || (!cpucascade.load(cascadeName)))
-        {
-            cout << "ERROR: Could not load classifier cascade" << endl;
-            return;
-        }
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums>0);
-        ////if you want to use undefault device, set it here
-        ////setDevice(oclinfo[0]);
-        //cv::ocl::setBinpath("E:\\");
+        (void)maxSize;
+        MemStorage storage(cvCreateMemStorage(0));
+        //CvMat img=image;
+        CvSeq *objs = oclHaarDetectObjects(image, storage, scaleFactor, minNeighbors, flags, minSize);
+        vector<CvAvgComp> vecAvgComp;
+        Seq<CvAvgComp>(objs).copyTo(vecAvgComp);
+        faces.resize(vecAvgComp.size());
+        std::transform(vecAvgComp.begin(), vecAvgComp.end(), faces.begin(), getRect());
     }
+
 };
 
-////////////////////////////////faceDetect/////////////////////////////////////////////////
-
-struct Haar : HaarTestBase {};
-
-TEST_F(Haar, FaceDetect)
-{
-    string imgName = workdir + "lena.jpg";
-    Mat img = imread( imgName, 1 );
-
-    if(img.empty())
-    {
-        std::cout << imgName << std::endl;
-        return ;
-    }
-
-    //int i = 0;
-    double t = 0;
-    vector<Rect> faces, oclfaces;
-
-    // const static Scalar colors[] =  { CV_RGB(0, 0, 255),
-    //                                   CV_RGB(0, 128, 255),
-    //                                   CV_RGB(0, 255, 255),
-    //                                   CV_RGB(0, 255, 0),
-    //                                   CV_RGB(255, 128, 0),
-    //                                   CV_RGB(255, 255, 0),
-    //                                   CV_RGB(255, 0, 0),
-    //                                   CV_RGB(255, 0, 255)
-    //                                 } ;
-
-    Mat gray, smallImg(cvRound (img.rows / scale), cvRound(img.cols / scale), CV_8UC1 );
-    MemStorage storage(cvCreateMemStorage(0));
-    cvtColor( img, gray, CV_BGR2GRAY );
-    resize( gray, smallImg, smallImg.size(), 0, 0, INTER_LINEAR );
-    equalizeHist( smallImg, smallImg );
-
-    t = (double)cvGetTickCount();
-    for(int k = 0; k < LOOP_TIMES; k++)
-    {
-        cpucascade.detectMultiScale( smallImg, faces,  1.1,
-                                     3, 0
-                                     | CV_HAAR_SCALE_IMAGE
-                                     , Size(30, 30), Size(0, 0) );
-    }
-    t = (double)cvGetTickCount() - t ;
-    printf( "cpudetection time = %g ms\n", t / (LOOP_TIMES * (double)cvGetTickFrequency() * 1000.) );
-
-    cv::ocl::oclMat image;
-    CvSeq *_objects=NULL;
-    t = (double)cvGetTickCount();
-    for(int k = 0; k < LOOP_TIMES; k++)
-    {
-        image.upload(smallImg);
-        _objects = cascade.oclHaarDetectObjects( image, storage, 1.1,
-                   3, 0
-                   | CV_HAAR_SCALE_IMAGE
-                   , Size(30, 30), Size(0, 0) );
-    }
-    t = (double)cvGetTickCount() - t ;
-    printf( "ocldetection time = %g ms\n", t / (LOOP_TIMES * (double)cvGetTickFrequency() * 1000.) );
-    vector<CvAvgComp> vecAvgComp;
-    Seq<CvAvgComp>(_objects).copyTo(vecAvgComp);
-    oclfaces.resize(vecAvgComp.size());
-    std::transform(vecAvgComp.begin(), vecAvgComp.end(), oclfaces.begin(), getRect());
-
-    //for( vector<Rect>::const_iterator r = faces.begin(); r != faces.end(); r++, i++ )
-    //{
-    //	Mat smallImgROI;
-    //	Point center;
-    //	Scalar color = colors[i%8];
-    //	int radius;
-    //	center.x = cvRound((r->x + r->width*0.5)*scale);
-    //	center.y = cvRound((r->y + r->height*0.5)*scale);
-    //	radius = cvRound((r->width + r->height)*0.25*scale);
-    //	circle( img, center, radius, color, 3, 8, 0 );
-    //}
-    //namedWindow("result");
-    //imshow("result",img);
-    //waitKey(0);
-    //destroyAllWindows();
-
 }
-#endif // HAVE_OPENCL
+}
+TEST(Haar) // Face-detection benchmark: CPU cascade vs. the OpenCL cascade wrapper on one grayscale image.
+{
+    Mat img = imread(abspath("basketball1.png"), CV_LOAD_IMAGE_GRAYSCALE);
+    // The benchmark cannot proceed without its input image, so fail with an exception.
+    if (img.empty())
+    {
+        throw runtime_error("can't open basketball1.png");
+    }
+
+    CascadeClassifier faceCascadeCPU;
+    // The same cascade file is loaded a second time further down for the OpenCL classifier.
+    if (!faceCascadeCPU.load(abspath("haarcascade_frontalface_alt.xml")))
+    {
+        throw runtime_error("can't load haarcascade_frontalface_alt.xml");
+    }
+
+    vector<Rect> faces;
+    // NOTE(review): SUBTEST and the *_ON/*_OFF macros come from the perf harness, not visible here -- presumably they label and time the enclosed calls.
+    SUBTEST << img.cols << "x" << img.rows << "; scale image";
+    CPU_ON;
+    faceCascadeCPU.detectMultiScale(img, faces,
+                                    1.1, 2, 0 | CV_HAAR_SCALE_IMAGE, Size(30, 30));
+    CPU_OFF;
+
+    ocl::CascadeClassifier_GPU faceCascade;
+
+    if (!faceCascade.load(abspath("haarcascade_frontalface_alt.xml")))
+    {
+        throw runtime_error("can't load haarcascade_frontalface_alt.xml");
+    }
+
+    ocl::oclMat d_img(img);
+
+    faces.clear();
+    // Same call as the GPU_ON section below, but bracketed by WARMUP_ON/WARMUP_OFF instead.
+    WARMUP_ON;
+    faceCascade.detectMultiScale(d_img, faces,
+                                 1.1, 2, 0 | CV_HAAR_SCALE_IMAGE, Size(30, 30));
+    WARMUP_OFF;
+
+    faces.clear();
+
+    GPU_ON;
+    faceCascade.detectMultiScale(d_img, faces,
+                                 1.1, 2, 0 | CV_HAAR_SCALE_IMAGE, Size(30, 30));
+    // d_img was built from img above, so this section contains no upload of the image.
+    GPU_OFF;
+    // Unlike GPU_ON/GPU_OFF, the section below also contains the upload of img into d_img.
+    GPU_FULL_ON;
+    d_img.upload(img);
+    faceCascade.detectMultiScale(d_img, faces,
+                                 1.1, 2, 0 | CV_HAAR_SCALE_IMAGE, Size(30, 30));
+    GPU_FULL_OFF;
+}
\ No newline at end of file
diff --git a/modules/ocl/perf/perf_hog.cpp b/modules/ocl/perf/perf_hog.cpp
index fd58808a8..b74077ff4 100644
--- a/modules/ocl/perf/perf_hog.cpp
+++ b/modules/ocl/perf/perf_hog.cpp
@@ -15,7 +15,7 @@
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//    Peng Xiao, pengxiao@multicorewareinc.com
+//    Fangfang Bai, fangfang@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -42,125 +42,47 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-
 #include "precomp.hpp"
-#include <iomanip>
 
-#ifdef HAVE_OPENCL
-
-using namespace cv;
-using namespace cv::ocl;
-using namespace cvtest;
-using namespace testing;
-using namespace std;
-extern std::string workdir;
-
-#ifndef MWC_TEST_UTILITY
-#define MWC_TEST_UTILITY
-
-// Param class
-#ifndef IMPLEMENT_PARAM_CLASS
-#define IMPLEMENT_PARAM_CLASS(name, type) \
-class name \
-    { \
-    public: \
-    name ( type arg = type ()) : val_(arg) {} \
-    operator type () const {return val_;} \
-    private: \
-    type val_; \
-    }; \
-    inline void PrintTo( name param, std::ostream* os) \
-    { \
-    *os << #name <<  "(" << testing::PrintToString(static_cast< type >(param)) << ")"; \
-    }
-
-#endif // IMPLEMENT_PARAM_CLASS
-#endif // MWC_TEST_UTILITY
-
-IMPLEMENT_PARAM_CLASS(WinSizw48, bool);
-
-PARAM_TEST_CASE(HOG, WinSizw48, bool)
+///////////// HOG ////////////////////////
+TEST(HOG) // People-detection benchmark: cv::HOGDescriptor vs. cv::ocl::HOGDescriptor on road.png.
 {
-    bool is48;
-    vector<float> detector;
-    virtual void SetUp()
+    Mat src = imread(abspath("road.png"), cv::IMREAD_GRAYSCALE);
+    // The benchmark cannot proceed without its input image, so fail with an exception.
+    if (src.empty())
     {
-        is48 = GET_PARAM(0);
-        if(is48)
-        {
-            detector = cv::ocl::HOGDescriptor::getPeopleDetector48x96();
-        }
-        else
-        {
-            detector = cv::ocl::HOGDescriptor::getPeopleDetector64x128();
-        }
-    }
-};
-
-TEST_P(HOG, Performance)
-{
-    cv::Mat img = readImage(workdir + "lena.jpg", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img.empty());
-
-    // define HOG related arguments
-    float scale = 1.05f;
-    //int nlevels = 13;
-    int gr_threshold = 8;
-    float hit_threshold = 1.4f;
-    //bool hit_threshold_auto = true;
-
-    int win_width = is48 ? 48 : 64;
-    int win_stride_width = 8;
-    int win_stride_height = 8;
-
-    bool gamma_corr = true;
-
-    Size win_size(win_width, win_width * 2); //(64, 128) or (48, 96)
-    Size win_stride(win_stride_width, win_stride_height);
-
-    cv::ocl::HOGDescriptor gpu_hog(win_size, Size(16, 16), Size(8, 8), Size(8, 8), 9,
-                                   cv::ocl::HOGDescriptor::DEFAULT_WIN_SIGMA, 0.2, gamma_corr,
-                                   cv::ocl::HOGDescriptor::DEFAULT_NLEVELS);
-
-    gpu_hog.setSVMDetector(detector);
-
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-
-    double t1 = 0;
-    double t2 = 0;
-    for(int j = 0; j < LOOP_TIMES + 1; j ++)
-    {
-        t1 = (double)cvGetTickCount();//gpu start1
-
-        ocl::oclMat d_src(img);//upload
-
-        t2 = (double)cvGetTickCount(); //kernel
-
-        vector<Rect> found;
-        gpu_hog.detectMultiScale(d_src, found, hit_threshold, win_stride,
-                                 Size(0, 0), scale, gr_threshold);
-
-        t2 = (double)cvGetTickCount() - t2;//kernel
-
-        // no download time for HOG
-
-        t1 = (double)cvGetTickCount() - t1;//gpu end1
-
-        if(j == 0)
-            continue;
-
-        totalgputick = t1 + totalgputick;
-
-        totalgputick_kernel = t2 + totalgputick_kernel;
-
+        throw runtime_error("can't open road.png");
     }
 
-    cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-}
 
+    cv::HOGDescriptor hog;
+    hog.setSVMDetector(hog.getDefaultPeopleDetector());
+    std::vector<cv::Rect> found_locations;
 
-INSTANTIATE_TEST_CASE_P(GPU_ObjDetect, HOG, testing::Combine(testing::Values(WinSizw48(false), WinSizw48(true)), testing::Values(false)));
+    SUBTEST << src.cols << 'x' << src.rows << "; road.png"; // label reports the size of the image actually loaded
 
-#endif  //Have opencl
\ No newline at end of file
+    hog.detectMultiScale(src, found_locations);
+    // The call above runs once outside CPU_ON/CPU_OFF; the call below is the bracketed one.
+    CPU_ON;
+    hog.detectMultiScale(src, found_locations);
+    CPU_OFF;
+    // NOTE(review): the *_ON/*_OFF macros come from the perf harness, not visible here -- presumably they time the enclosed calls.
+    cv::ocl::HOGDescriptor ocl_hog;
+    ocl_hog.setSVMDetector(ocl_hog.getDefaultPeopleDetector());
+    ocl::oclMat d_src;
+    d_src.upload(src);
+
+    WARMUP_ON;
+    ocl_hog.detectMultiScale(d_src, found_locations);
+    WARMUP_OFF;
+
+    GPU_ON;
+    ocl_hog.detectMultiScale(d_src, found_locations);
+    // src was uploaded into d_src before WARMUP_ON, so this section contains no upload.
+    GPU_OFF;
+    // Unlike GPU_ON/GPU_OFF, the section below also contains the upload of src into d_src.
+    GPU_FULL_ON;
+    d_src.upload(src);
+    ocl_hog.detectMultiScale(d_src, found_locations);
+    GPU_FULL_OFF;
+}
\ No newline at end of file
diff --git a/modules/ocl/perf/perf_imgproc.cpp b/modules/ocl/perf/perf_imgproc.cpp
index bc54cb275..756f69556 100644
--- a/modules/ocl/perf/perf_imgproc.cpp
+++ b/modules/ocl/perf/perf_imgproc.cpp
@@ -10,18 +10,12 @@
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//    Niko Li, newlife20080214@gmail.com
-//    Jia Haipeng, jiahaipeng95@gmail.com
-//    Shengen Yan, yanshengen@gmail.com
-//    Jiang Liyuan, lyuan001.good@163.com
-//    Rock Li, Rock.Li@amd.com
-//    Zailong Wu, bullet@yeah.net
-//    Xu Pang, pangxu010@163.com
+//    Fangfang Bai, fangfang@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -36,7 +30,7 @@
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
-// This software is provided by the copyright holders and contributors "as is" and
+// This software is provided by the copyright holders and contributors "as is" and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
@@ -48,949 +42,290 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-
 #include "precomp.hpp"
 
-#ifdef HAVE_OPENCL
-
-using namespace cvtest;
-using namespace testing;
-using namespace std;
-
-
-MatType nulltype = -1;
-
-#define ONE_TYPE(type)  testing::ValuesIn(typeVector(type))
-#define NULL_TYPE  testing::ValuesIn(typeVector(nulltype))
-
-
-vector<MatType> typeVector(MatType type)
+///////////// equalizeHist ////////////////////////
+TEST(equalizeHist)
 {
-    vector<MatType> v;
-    v.push_back(type);
-    return v;
-}
+    Mat src, dst;
+    int all_type[] = {CV_8UC1};
+    std::string type_name[] = {"CV_8UC1"};
 
-
-PARAM_TEST_CASE(ImgprocTestBase, MatType, MatType, MatType, MatType, MatType, bool)
-{
-    int type1, type2, type3, type4, type5;
-    cv::Scalar val;
-    // set up roi
-    int roicols;
-    int roirows;
-    int src1x;
-    int src1y;
-    int src2x;
-    int src2y;
-    int dstx;
-    int dsty;
-    int dst1x;
-    int dst1y;
-    int maskx;
-    int masky;
-
-    //mat
-    cv::Mat mat1;
-    cv::Mat mat2;
-    cv::Mat mask;
-    cv::Mat dst;
-    cv::Mat dst1; //bak, for two outputs
-
-    //mat with roi
-    cv::Mat mat1_roi;
-    cv::Mat mat2_roi;
-    cv::Mat mask_roi;
-    cv::Mat dst_roi;
-    cv::Mat dst1_roi; //bak
-    //std::vector<cv::ocl::Info> oclinfo;
-    //ocl mat
-    cv::ocl::oclMat clmat1;
-    cv::ocl::oclMat clmat2;
-    cv::ocl::oclMat clmask;
-    cv::ocl::oclMat cldst;
-    cv::ocl::oclMat cldst1; //bak
-
-    //ocl mat with roi
-    cv::ocl::oclMat clmat1_roi;
-    cv::ocl::oclMat clmat2_roi;
-    cv::ocl::oclMat clmask_roi;
-    cv::ocl::oclMat cldst_roi;
-    cv::ocl::oclMat cldst1_roi;
-
-    virtual void SetUp()
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        type1 = GET_PARAM(0);
-        type2 = GET_PARAM(1);
-        type3 = GET_PARAM(2);
-        type4 = GET_PARAM(3);
-        type5 = GET_PARAM(4);
-        cv::RNG &rng = TS::ptr()->get_rng();
-        cv::Size size(MWIDTH, MHEIGHT);
-        double min = 1, max = 20;
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums>0);
-        ////if you want to use undefault device, set it here
-        ////setDevice(oclinfo[0]);
-        //cv::ocl::setBinpath(CLBINPATH);
-        if(type1 != nulltype)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            mat1 = randomMat(rng, size, type1, min, max, false);
-            clmat1 = mat1;
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
+
+            gen(src, size, size, all_type[j], 0, 256);
+
+            equalizeHist(src, dst);
+
+            CPU_ON;
+            equalizeHist(src, dst);
+            CPU_OFF;
+
+            ocl::oclMat d_src(src);
+            ocl::oclMat d_dst;
+            ocl::oclMat d_hist;
+            ocl::oclMat d_buf;
+
+            WARMUP_ON;
+            ocl::equalizeHist(d_src, d_dst);
+            WARMUP_OFF;
+
+            GPU_ON;
+            ocl::equalizeHist(d_src, d_dst);
+             ;
+            GPU_OFF;
+
+            GPU_FULL_ON;
+            d_src.upload(src);
+            ocl::equalizeHist(d_src, d_dst);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-        if(type2 != nulltype)
-        {
-            mat2 = randomMat(rng, size, type2, min, max, false);
-            clmat2 = mat2;
-        }
-        if(type3 != nulltype)
-        {
-            dst  = randomMat(rng, size, type3, min, max, false);
-            cldst = dst;
-        }
-        if(type4 != nulltype)
-        {
-            dst1 = randomMat(rng, size, type4, min, max, false);
-            cldst1 = dst1;
-        }
-        if(type5 != nulltype)
-        {
-            mask = randomMat(rng, size, CV_8UC1, 0, 2,  false);
-            cv::threshold(mask, mask, 0.5, 255., type5);
-            clmask = mask;
-        }
-        val = cv::Scalar(rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0));
-    }
-
-
-    void Has_roi(int b)
-    {
-        //cv::RNG& rng = TS::ptr()->get_rng();
-        if(b)
-        {
-            //randomize ROI
-            roicols =  mat1.cols - 1; //start
-            roirows = mat1.rows - 1;
-            src1x   = 1;
-            src2x   = 1;
-            src1y   = 1;
-            src2y   = 1;
-            dstx    = 1;
-            dsty    = 1;
-            dst1x    = 1;
-            dst1y    = 1;
-            maskx	 = 1;
-            masky	= 1;
-        }
-        else
-        {
-            roicols = mat1.cols;
-            roirows = mat1.rows;
-            src1x = 0;
-            src2x = 0;
-            src1y = 0;
-            src2y = 0;
-            dstx = 0;
-            dsty = 0;
-            dst1x  = 0;
-            dst1y  = 0;
-            maskx	 = 0;
-            masky	= 0;
-        };
-
-        if(type1 != nulltype)
-        {
-            mat1_roi = mat1(Rect(src1x, src1y, roicols, roirows));
-            //clmat1_roi = clmat1(Rect(src1x,src1y,roicols,roirows));
-        }
-        if(type2 != nulltype)
-        {
-            mat2_roi = mat2(Rect(src2x, src2y, roicols, roirows));
-            //clmat2_roi = clmat2(Rect(src2x,src2y,roicols,roirows));
-        }
-        if(type3 != nulltype)
-        {
-            dst_roi  = dst(Rect(dstx, dsty, roicols, roirows));
-            //cldst_roi = cldst(Rect(dstx,dsty,roicols,roirows));
-        }
-        if(type4 != nulltype)
-        {
-            dst1_roi = dst1(Rect(dst1x, dst1y, roicols, roirows));
-            //cldst1_roi = cldst1(Rect(dst1x,dst1y,roicols,roirows));
-        }
-        if(type5 != nulltype)
-        {
-            mask_roi = mask(Rect(maskx, masky, roicols, roirows));
-            //clmask_roi = clmask(Rect(maskx,masky,roicols,roirows));
-        }
-    }
-
-    void random_roi()
-    {
-        cv::RNG &rng = TS::ptr()->get_rng();
-
-        //randomize ROI
-        roicols = rng.uniform(1, mat1.cols);
-        roirows = rng.uniform(1, mat1.rows);
-        src1x   = rng.uniform(0, mat1.cols - roicols);
-        src1y   = rng.uniform(0, mat1.rows - roirows);
-        src2x   = rng.uniform(0, mat2.cols - roicols);
-        src2y   = rng.uniform(0, mat2.rows - roirows);
-        dstx    = rng.uniform(0, dst.cols  - roicols);
-        dsty    = rng.uniform(0, dst.rows  - roirows);
-        dst1x    = rng.uniform(0, dst1.cols  - roicols);
-        dst1y    = rng.uniform(0, dst1.rows  - roirows);
-        maskx   = rng.uniform(0, mask.cols - roicols);
-        masky   = rng.uniform(0, mask.rows - roirows);
-
-        if(type1 != nulltype)
-        {
-            mat1_roi = mat1(Rect(src1x, src1y, roicols, roirows));
-            //clmat1_roi = clmat1(Rect(src1x,src1y,roicols,roirows));
-        }
-        if(type2 != nulltype)
-        {
-            mat2_roi = mat2(Rect(src2x, src2y, roicols, roirows));
-            //clmat2_roi = clmat2(Rect(src2x,src2y,roicols,roirows));
-        }
-        if(type3 != nulltype)
-        {
-            dst_roi  = dst(Rect(dstx, dsty, roicols, roirows));
-            //cldst_roi = cldst(Rect(dstx,dsty,roicols,roirows));
-        }
-        if(type4 != nulltype)
-        {
-            dst1_roi = dst1(Rect(dst1x, dst1y, roicols, roirows));
-            //cldst1_roi = cldst1(Rect(dst1x,dst1y,roicols,roirows));
-        }
-        if(type5 != nulltype)
-        {
-            mask_roi = mask(Rect(maskx, masky, roicols, roirows));
-            //clmask_roi = clmask(Rect(maskx,masky,roicols,roirows));
-        }
-    }
-};
-////////////////////////////////equalizeHist//////////////////////////////////////////
-
-struct equalizeHist : ImgprocTestBase {};
-
-TEST_P(equalizeHist, MatType)
-{
-    if (mat1.type() != CV_8UC1 || mat1.type() != dst.type())
-    {
-        cout << "Unsupported type" << endl;
-        EXPECT_DOUBLE_EQ(0.0, 0.0);
-    }
-    else
-    {
-#ifndef PRINT_KERNEL_RUN_TIME
-        double totalcputick = 0;
-        double totalgputick = 0;
-        double totalgputick_kernel = 0;
-        double t0 = 0;
-        double t1 = 0;
-        double t2 = 0;
-        for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-        {
-            totalcputick = 0;
-            totalgputick = 0;
-            totalgputick_kernel = 0;
-            for(int j = 0; j < LOOP_TIMES + 1; j ++)
-            {
-                Has_roi(k);
-
-                t0 = (double)cvGetTickCount();//cpu start
-                cv::equalizeHist(mat1_roi, dst_roi);
-                t0 = (double)cvGetTickCount() - t0;//cpu end
-
-                t1 = (double)cvGetTickCount();//gpu start1
-                if(type1 != nulltype)
-                {
-                    clmat1_roi = clmat1(Rect(src1x, src1y, roicols, roirows));
-                }
-                cldst_roi = cldst(Rect(dstx, dsty, roicols, roirows));
-                t2 = (double)cvGetTickCount(); //kernel
-                cv::ocl::equalizeHist(clmat1_roi, cldst_roi);
-                t2 = (double)cvGetTickCount() - t2;//kernel
-                cv::Mat cpu_cldst;
-                //cldst.download(cpu_cldst);//download
-                t1 = (double)cvGetTickCount() - t1;//gpu end1
-
-                if(j == 0)
-                    continue;
-
-                totalgputick = t1 + totalgputick;
-                totalcputick = t0 + totalcputick;
-                totalgputick_kernel = t2 + totalgputick_kernel;
-
-            }
-            if(k == 0)
-            {
-                cout << "no roi\n";
-            }
-            else
-            {
-                cout << "with roi\n";
-            };
-            cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-            cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-            cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        }
-#else
-        for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-        {
-            Has_roi(j);
-            if(type1 != nulltype)
-            {
-                clmat1_roi = clmat1(Rect(src1x, src1y, roicols, roirows));
-            }
-            if(j == 0)
-            {
-                cout << "no roi:";
-            }
-            else
-            {
-                cout << "\nwith roi:";
-            };
-            cv::ocl::equalizeHist(clmat1_roi, cldst_roi);
-        };
-#endif
-    }
-}
-
-
-////////////////////////////////bilateralFilter////////////////////////////////////////////
-
-struct bilateralFilter : ImgprocTestBase {};
-
-TEST_P(bilateralFilter, Mat)
-{
-    double sigmacolor = 50.0;
-    int radius = 9;
-    int d = 2 * radius + 1;
-    double sigmaspace = 20.0;
-    int bordertype[] = {cv::BORDER_CONSTANT, cv::BORDER_REPLICATE/*,cv::BORDER_REFLECT,cv::BORDER_WRAP,cv::BORDER_REFLECT_101*/};
-    const char *borderstr[] = {"BORDER_CONSTANT", "BORDER_REPLICATE"/*, "BORDER_REFLECT","BORDER_WRAP","BORDER_REFLECT_101"*/};
-
-    if (mat1.depth() != CV_8U || mat1.type() != dst.type())
-    {
-        cout << "Unsupported type" << endl;
-        EXPECT_DOUBLE_EQ(0.0, 0.0);
-    }
-    else
-    {
-        for(size_t i = 0; i < sizeof(bordertype) / sizeof(int); i++)
-        {
-            cout << borderstr[i] << endl;
-#ifndef PRINT_KERNEL_RUN_TIME
-            double totalcputick = 0;
-            double totalgputick = 0;
-            double totalgputick_kernel = 0;
-            double t0 = 0;
-            double t1 = 0;
-            double t2 = 0;
-            for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-            {
-                totalcputick = 0;
-                totalgputick = 0;
-                totalgputick_kernel = 0;
-                for(int j = 0; j < LOOP_TIMES + 1; j ++)
-                {
-                    Has_roi(k);
-                    if(((bordertype[i] != cv::BORDER_CONSTANT) && (bordertype[i] != cv::BORDER_REPLICATE) && (mat1_roi.cols <= radius)) || (mat1_roi.cols <= radius) || (mat1_roi.rows <= radius) || (mat1_roi.rows <= radius))
-                    {
-                        continue;
-                    }
-                    t0 = (double)cvGetTickCount();//cpu start
-                    cv::bilateralFilter(mat1_roi, dst_roi, d, sigmacolor, sigmaspace, bordertype[i]);
-                    t0 = (double)cvGetTickCount() - t0;//cpu end
-
-                    t1 = (double)cvGetTickCount();//gpu start1
-                    if(type1 != nulltype)
-                    {
-                        clmat1_roi = clmat1(Rect(src1x, src1y, roicols, roirows));
-                    }
-                    t2 = (double)cvGetTickCount(); //kernel
-                    cv::ocl::bilateralFilter(clmat1_roi, cldst_roi, d, sigmacolor, sigmaspace, bordertype[i]);
-                    t2 = (double)cvGetTickCount() - t2;//kernel
-                    cv::Mat cpu_cldst;
-                    cldst.download(cpu_cldst);//download
-                    t1 = (double)cvGetTickCount() - t1;//gpu end1
-
-                    if(j == 0)
-                        continue;
-
-                    totalgputick = t1 + totalgputick;
-                    totalcputick = t0 + totalcputick;
-                    totalgputick_kernel = t2 + totalgputick_kernel;
-
-                }
-                if(k == 0)
-                {
-                    cout << "no roi\n";
-                }
-                else
-                {
-                    cout << "with roi\n";
-                };
-                cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-                cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-                cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-            }
-
-#else
-            for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-            {
-                Has_roi(j);
-                if(type1 != nulltype)
-                {
-                    clmat1_roi = clmat1(Rect(src1x, src1y, roicols, roirows));
-                };
-                if(j == 0)
-                {
-                    cout << "no roi:";
-                }
-                else
-                {
-                    cout << "\nwith roi:";
-                };
-                cv::ocl::bilateralFilter(clmat1_roi, cldst_roi, d, sigmacolor, sigmaspace, bordertype[i]);
-            };
-
-#endif
-        };
 
     }
 }
-
-////////////////////////////////copyMakeBorder////////////////////////////////////////////
-
-struct CopyMakeBorder : ImgprocTestBase {};
-
-TEST_P(CopyMakeBorder, Mat)
+/////////// CopyMakeBorder //////////////////////
+TEST(CopyMakeBorder)
 {
-    int bordertype[] = {cv::BORDER_CONSTANT, cv::BORDER_REPLICATE, cv::BORDER_REFLECT, cv::BORDER_WRAP, cv::BORDER_REFLECT_101};
-    //const char* borderstr[]={"BORDER_CONSTANT", "BORDER_REPLICATE"/*, "BORDER_REFLECT","BORDER_WRAP","BORDER_REFLECT_101"*/};
-    int top = 5;
-    int bottom = 5;
-    int left = 6;
-    int right = 6;
-    if (mat1.type() != dst.type())
+    Mat src, dst;
+    ocl::oclMat d_dst;
+
+    int bordertype = BORDER_CONSTANT;
+    int all_type[] = {CV_8UC1, CV_8UC4};
+    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        cout << "Unsupported type" << endl;
-        EXPECT_DOUBLE_EQ(0.0, 0.0);
-    }
-    else
-    {
-        for(size_t i = 0; i < sizeof(bordertype) / sizeof(int); i++)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-#ifndef PRINT_KERNEL_RUN_TIME
-            double totalcputick = 0;
-            double totalgputick = 0;
-            double totalgputick_kernel = 0;
-            double t0 = 0;
-            double t1 = 0;
-            double t2 = 0;
-            for(int k = LOOPROISTART; k < 1; k++) //don't support roi perf test
-            {
-                totalcputick = 0;
-                totalgputick = 0;
-                totalgputick_kernel = 0;
-                for(int j = 0; j < LOOP_TIMES + 1; j ++)
-                {
-                    Has_roi(k);
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
 
-                    t0 = (double)cvGetTickCount();//cpu start
-                    cv::copyMakeBorder(mat1_roi, dst_roi, top, bottom, left, right, bordertype[i] | cv::BORDER_ISOLATED, cv::Scalar(1.0));
-                    t0 = (double)cvGetTickCount() - t0;//cpu end
 
-                    t1 = (double)cvGetTickCount();//gpu start1
-                    if(type1 != nulltype)
-                    {
-                        clmat1_roi = clmat1(Rect(src1x, src1y, roicols, roirows));
-                    }
-                    t2 = (double)cvGetTickCount(); //kernel
-                    cv::ocl::copyMakeBorder(clmat1_roi, cldst_roi, top, bottom, left, right,  bordertype[i] | cv::BORDER_ISOLATED, cv::Scalar(1.0));
-                    t2 = (double)cvGetTickCount() - t2;//kernel
-                    cv::Mat cpu_cldst;
-                    cldst.download(cpu_cldst);//download
-                    t1 = (double)cvGetTickCount() - t1;//gpu end1
+            gen(src, size, size, all_type[j], 0, 256);
 
-                    if(j == 0)
-                        continue;
+            copyMakeBorder(src, dst, 7, 5, 5, 7, bordertype, cv::Scalar(1.0));
 
-                    totalgputick = t1 + totalgputick;
-                    totalcputick = t0 + totalcputick;
-                    totalgputick_kernel = t2 + totalgputick_kernel;
+            CPU_ON;
+            copyMakeBorder(src, dst, 7, 5, 5, 7, bordertype, cv::Scalar(1.0));
+            CPU_OFF;
+
+            ocl::oclMat d_src(src);
+
+            WARMUP_ON;
+            ocl::copyMakeBorder(d_src, d_dst, 7, 5, 5, 7, bordertype, cv::Scalar(1.0));
+            WARMUP_OFF;
+
+            GPU_ON;
+            ocl::copyMakeBorder(d_src, d_dst, 7, 5, 5, 7, bordertype, cv::Scalar(1.0));
+             ;
+            GPU_OFF;
+
+            GPU_FULL_ON;
+            d_src.upload(src);
+            ocl::copyMakeBorder(d_src, d_dst, 7, 5, 5, 7, bordertype, cv::Scalar(1.0));
+            d_dst.download(dst);
+            GPU_FULL_OFF;
+        }
 
-                }
-                if(k == 0)
-                {
-                    cout << "no roi\n";
-                }
-                else
-                {
-                    cout << "with roi\n";
-                };
-                cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-                cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-                cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-            }
-#else
-            for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-            {
-                Has_roi(j);
-                if(type1 != nulltype)
-                {
-                    clmat1_roi = clmat1(Rect(src1x, src1y, roicols, roirows));
-                };
-                if(j == 0)
-                {
-                    cout << "no roi:";
-                }
-                else
-                {
-                    cout << "\nwith roi:";
-                };
-                cv::ocl::copyMakeBorder(clmat1_roi, cldst_roi, top, bottom, left, right,  bordertype[i] | cv::BORDER_ISOLATED, cv::Scalar(1.0));
-            };
-#endif
-        };
     }
 }
-
-////////////////////////////////cornerMinEigenVal//////////////////////////////////////////
-
-struct cornerMinEigenVal : ImgprocTestBase {};
-
-TEST_P(cornerMinEigenVal, Mat)
+///////////// cornerMinEigenVal ////////////////////////
+TEST(cornerMinEigenVal)
 {
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
+    Mat src, dst;
+    ocl::oclMat d_dst;
+
+    int blockSize = 7, apertureSize = 1 + 2 * (rand() % 4);
+    int borderType = BORDER_REFLECT;
+    int all_type[] = {CV_8UC1, CV_32FC1};
+    std::string type_name[] = {"CV_8UC1", "CV_32FC1"};
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            Has_roi(k);
-            int blockSize = 7, apertureSize = 3; //1 + 2 * (rand() % 4);
-            int borderType = cv::BORDER_REFLECT;
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::cornerMinEigenVal(mat1_roi, dst_roi, blockSize, apertureSize, borderType);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
 
-            t1 = (double)cvGetTickCount();//gpu start1
-            if(type1 != nulltype)
-            {
-                clmat1_roi = clmat1(Rect(src1x, src1y, roicols, roirows));
-            }
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::cornerMinEigenVal(clmat1_roi, cldst_roi, blockSize, apertureSize, borderType);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_cldst;
-            cldst.download(cpu_cldst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
 
-            if(j == 0)
-                continue;
+            gen(src, size, size, all_type[j], 0, 256);
 
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            cornerMinEigenVal(src, dst, blockSize, apertureSize, borderType);
 
+            CPU_ON;
+            cornerMinEigenVal(src, dst, blockSize, apertureSize, borderType);
+            CPU_OFF;
+
+            ocl::oclMat d_src(src);
+
+            WARMUP_ON;
+            ocl::cornerMinEigenVal(d_src, d_dst, blockSize, apertureSize, borderType);
+            WARMUP_OFF;
+
+            GPU_ON;
+            ocl::cornerMinEigenVal(d_src, d_dst, blockSize, apertureSize, borderType);
+             ;
+            GPU_OFF;
+
+            GPU_FULL_ON;
+            d_src.upload(src);
+            ocl::cornerMinEigenVal(d_src, d_dst, blockSize, apertureSize, borderType);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+
     }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        int blockSize = 7, apertureSize = 1 + 2 * (rand() % 4);
-        int borderType = cv::BORDER_REFLECT;
-        if(type1 != nulltype)
-        {
-            clmat1_roi = clmat1(Rect(src1x, src1y, roicols, roirows));
-        };
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::cornerMinEigenVal(clmat1_roi, cldst_roi, blockSize, apertureSize, borderType);
-    };
-#endif
 }
-
-
-////////////////////////////////cornerHarris//////////////////////////////////////////
-
-struct cornerHarris : ImgprocTestBase {};
-
-TEST_P(cornerHarris, Mat)
+///////////// cornerHarris ////////////////////////
+TEST(cornerHarris)
 {
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
+    Mat src, dst;
+    ocl::oclMat d_src, d_dst;
+
+    int all_type[] = {CV_8UC1, CV_32FC1};
+    std::string type_name[] = {"CV_8UC1", "CV_32FC1"};
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            Has_roi(k);
-            int blockSize = 7, apertureSize = 3;
-            int borderType = cv::BORDER_REFLECT;
-            double kk = 2;
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::cornerHarris(mat1_roi, dst_roi, blockSize, apertureSize, kk, borderType);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
+            SUBTEST << size << 'x' << size << "; " << type_name[j] << " ; BORDER_REFLECT";
 
-            t1 = (double)cvGetTickCount();//gpu start1
-            if(type1 != nulltype)
-            {
-                clmat1_roi = clmat1(Rect(src1x, src1y, roicols, roirows));
-            }
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::cornerHarris(clmat1_roi, cldst_roi, blockSize, apertureSize, kk, borderType);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_cldst;
-            cldst.download(cpu_cldst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
+            gen(src, size, size, all_type[j], 0, 1);
 
-            if(j == 0)
-                continue;
+            cornerHarris(src, dst, 5, 7, 0.1, BORDER_REFLECT);
 
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            CPU_ON;
+            cornerHarris(src, dst, 5, 7, 0.1, BORDER_REFLECT);
+            CPU_OFF;
 
+            d_src.upload(src);
+
+            WARMUP_ON;
+            ocl::cornerHarris(d_src, d_dst, 5, 7, 0.1, BORDER_REFLECT);
+            WARMUP_OFF;
+
+            GPU_ON;
+            ocl::cornerHarris(d_src, d_dst, 5, 7, 0.1, BORDER_REFLECT);
+             ;
+            GPU_OFF;
+
+            GPU_FULL_ON;
+            d_src.upload(src);
+            ocl::cornerHarris(d_src, d_dst, 5, 7, 0.1, BORDER_REFLECT);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+
+
     }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        double kk = 2;
-        int blockSize = 7, apertureSize = 3;
-        int borderType = cv::BORDER_REFLECT;
-        if(type1 != nulltype)
-        {
-            clmat1_roi = clmat1(Rect(src1x, src1y, roicols, roirows));
-        };
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::cornerHarris(clmat1_roi, cldst_roi, blockSize, apertureSize, kk, borderType);
-    };
-#endif
-
 }
-
-
-////////////////////////////////integral/////////////////////////////////////////////////
-
-struct integral : ImgprocTestBase {};
-
-TEST_P(integral, Mat)
+///////////// integral ////////////////////////
+TEST(integral)
 {
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
+    Mat src, sum;
+    ocl::oclMat d_src, d_sum, d_buf;
+
+    int all_type[] = {CV_8UC1};
+    std::string type_name[] = {"CV_8UC1"};
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            Has_roi(k);
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::integral(mat1_roi, dst_roi, dst1_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
+            SUBTEST << size << 'x' << size << "; " << type_name[j]  ;
 
-            t1 = (double)cvGetTickCount();//gpu start1
-            if(type1 != nulltype)
-            {
-                clmat1_roi = clmat1(Rect(src1x, src1y, roicols, roirows));
-            }
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::integral(clmat1_roi, cldst_roi, cldst1_roi);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_cldst;
-            cv::Mat cpu_cldst1;
-            cldst.download(cpu_cldst);//download
-            cldst1.download(cpu_cldst1);
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
+            gen(src, size, size, all_type[j], 0, 256);
 
-            if(j == 0)
-                continue;
+            integral(src, sum);
 
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            CPU_ON;
+            integral(src, sum);
+            CPU_OFF;
 
+            d_src.upload(src);
+
+            WARMUP_ON;
+            ocl::integral(d_src, d_sum);
+            WARMUP_OFF;
+
+            GPU_ON;
+            ocl::integral(d_src, d_sum);
+             ;
+            GPU_OFF;
+
+            GPU_FULL_ON;
+            d_src.upload(src);
+            ocl::integral(d_src, d_sum);
+            d_sum.download(sum);
+            GPU_FULL_OFF;
         }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+
     }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        if(type1 != nulltype)
-        {
-            clmat1_roi = clmat1(Rect(src1x, src1y, roicols, roirows));
-        };
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::integral(clmat1_roi, cldst_roi, cldst1_roi);
-    };
-#endif
 }
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// warpAffine  & warpPerspective
-
-PARAM_TEST_CASE(WarpTestBase, MatType, int)
+///////////// WarpAffine ////////////////////////
+TEST(WarpAffine)
 {
-    int type;
-    cv::Size size;
-    int interpolation;
+    Mat src, dst;
+    ocl::oclMat d_src, d_dst;
 
-    //src mat
-    cv::Mat mat1;
-    cv::Mat dst;
-
-    // set up roi
-    int src_roicols;
-    int src_roirows;
-    int dst_roicols;
-    int dst_roirows;
-    int src1x;
-    int src1y;
-    int dstx;
-    int dsty;
-
-
-    //src mat with roi
-    cv::Mat mat1_roi;
-    cv::Mat dst_roi;
-    //std::vector<cv::ocl::Info> oclinfo;
-    //ocl dst mat for testing
-    cv::ocl::oclMat gdst_whole;
-
-    //ocl mat with roi
-    cv::ocl::oclMat gmat1;
-    cv::ocl::oclMat gdst;
-
-    virtual void SetUp()
-    {
-        type = GET_PARAM(0);
-        //dsize = GET_PARAM(1);
-        interpolation = GET_PARAM(1);
-
-        cv::RNG &rng = TS::ptr()->get_rng();
-        size = cv::Size(MWIDTH, MHEIGHT);
-
-        mat1 = randomMat(rng, size, type, 5, 16, false);
-        dst  = randomMat(rng, size, type, 5, 16, false);
-
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
-        ////if you want to use undefault device, set it here
-        ////setDevice(oclinfo[0]);
-        //cv::ocl::setBinpath(CLBINPATH);
-    }
-    void Has_roi(int b)
-    {
-        //cv::RNG& rng = TS::ptr()->get_rng();
-        if(b)
-        {
-            //randomize ROI
-            src_roicols =  mat1.cols - 1; //start
-            src_roirows = mat1.rows - 1;
-            dst_roicols = dst.cols - 1;
-            dst_roirows = dst.rows - 1;
-            src1x   = 1;
-            src1y   = 1;
-            dstx    = 1;
-            dsty    = 1;
-
-        }
-        else
-        {
-            src_roicols = mat1.cols;
-            src_roirows = mat1.rows;
-            dst_roicols = dst.cols;
-            dst_roirows = dst.rows;
-            src1x = 0;
-            src1y = 0;
-            dstx = 0;
-            dsty = 0;
-
-        };
-        mat1_roi = mat1(Rect(src1x, src1y, src_roicols, src_roirows));
-        dst_roi  = dst(Rect(dstx, dsty, dst_roicols, dst_roirows));
-
-
-    }
-
-};
-
-/////warpAffine
-
-struct WarpAffine : WarpTestBase {};
-
-TEST_P(WarpAffine, Mat)
-{
     static const double coeffs[2][3] =
     {
         {cos(3.14 / 6), -sin(3.14 / 6), 100.0},
         {sin(3.14 / 6), cos(3.14 / 6), -100.0}
     };
     Mat M(2, 3, CV_64F, (void *)coeffs);
+    int interpolation = INTER_NEAREST;
 
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
+    int all_type[] = {CV_8UC1, CV_8UC4};
+    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
+
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            Has_roi(k);
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
 
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::warpAffine(mat1_roi, dst_roi, M, size, interpolation);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
+            gen(src, size, size, all_type[j], 0, 256);
+            gen(dst, size, size, all_type[j], 0, 256);
+            Size size1 = Size(size, size);
 
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, dst_roicols, dst_roirows));
+            warpAffine(src, dst, M, size1, interpolation);
 
-            gmat1 = mat1_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::warpAffine(gmat1, gdst, M, size, interpolation);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
+            CPU_ON;
+            warpAffine(src, dst, M, size1, interpolation);
+            CPU_OFF;
 
-            if(j == 0)
-                continue;
+            d_src.upload(src);
 
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            WARMUP_ON;
+            ocl::warpAffine(d_src, d_dst, M, size1, interpolation);
+            WARMUP_OFF;
 
+            GPU_ON;
+            ocl::warpAffine(d_src, d_dst, M, size1, interpolation);
+             ;
+            GPU_OFF;
+
+            GPU_FULL_ON;
+            d_src.upload(src);
+            ocl::warpAffine(d_src, d_dst, M, size1, interpolation);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+
     }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, dst_roicols, dst_roirows));
-        gmat1 = mat1_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::warpAffine(gmat1, gdst, M, size, interpolation);
-    };
-#endif
-
 }
-
-
-// warpPerspective
-
-struct WarpPerspective : WarpTestBase {};
-
-TEST_P(WarpPerspective, Mat)
+///////////// WarpPerspective ////////////////////////
+TEST(WarpPerspective)
 {
+    Mat src, dst;
+    ocl::oclMat d_src, d_dst;
+
     static const double coeffs[3][3] =
     {
         {cos(3.14 / 6), -sin(3.14 / 6), 100.0},
@@ -998,1154 +333,576 @@ TEST_P(WarpPerspective, Mat)
         {0.0, 0.0, 1.0}
     };
     Mat M(3, 3, CV_64F, (void *)coeffs);
+    int interpolation = INTER_NEAREST;
 
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
+    int all_type[] = {CV_8UC1, CV_8UC4};
+    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            Has_roi(k);
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
 
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::warpPerspective(mat1_roi, dst_roi, M, size, interpolation);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
+            gen(src, size, size, all_type[j], 0, 256);
+            gen(dst, size, size, all_type[j], 0, 256);
+            Size size1 = Size(size, size);
 
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, dst_roicols, dst_roirows));
+            warpPerspective(src, dst, M, size1, interpolation);
 
-            gmat1 = mat1_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::warpPerspective(gmat1, gdst, M, size, interpolation);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
+            CPU_ON;
+            warpPerspective(src, dst, M, size1, interpolation);
+            CPU_OFF;
 
-            if(j == 0)
-                continue;
+            d_src.upload(src);
 
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            WARMUP_ON;
+            ocl::warpPerspective(d_src, d_dst, M, size1, interpolation);
+            WARMUP_OFF;
 
+            GPU_ON;
+            ocl::warpPerspective(d_src, d_dst, M, size1, interpolation);
+             ;
+            GPU_OFF;
+
+            GPU_FULL_ON;
+            d_src.upload(src);
+            ocl::warpPerspective(d_src, d_dst, M, size1, interpolation);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+
     }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, dst_roicols, dst_roirows));
-        gmat1 = mat1_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::warpPerspective(gmat1, gdst, M, size, interpolation);
-    };
-#endif
-
 }
 
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// remap
-//////////////////////////////////////////////////////////////////////////////////////////////////
-
-PARAM_TEST_CASE(Remap, MatType, MatType, MatType, int, int)
+///////////// resize ////////////////////////
+TEST(resize)
 {
-    int srcType;
-    int map1Type;
-    int map2Type;
-    cv::Scalar val;
-
-    int interpolation;
-    int bordertype;
-
-    cv::Mat src;
-    cv::Mat dst;
-    cv::Mat map1;
-    cv::Mat map2;
+    Mat src, dst;
+    ocl::oclMat d_src, d_dst;
 
 
-    int src_roicols;
-    int src_roirows;
-    int dst_roicols;
-    int dst_roirows;
-    int map1_roicols;
-    int map1_roirows;
-    int map2_roicols;
-    int map2_roirows;
-    int srcx;
-    int srcy;
-    int dstx;
-    int dsty;
-    int map1x;
-    int map1y;
-    int map2x;
-    int map2y;
+    int all_type[] = {CV_8UC1, CV_8UC4};
+    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
 
-    cv::Mat src_roi;
-    cv::Mat dst_roi;
-    cv::Mat map1_roi;
-    cv::Mat map2_roi;
-
-    //ocl mat for testing
-    cv::ocl::oclMat gdst;
-
-    //ocl mat with roi
-    cv::ocl::oclMat gsrc_roi;
-    cv::ocl::oclMat gdst_roi;
-    cv::ocl::oclMat gmap1_roi;
-    cv::ocl::oclMat gmap2_roi;
-
-    virtual void SetUp()
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        srcType = GET_PARAM(0);
-        map1Type = GET_PARAM(1);
-        map2Type = GET_PARAM(2);
-        interpolation = GET_PARAM(3);
-        bordertype = GET_PARAM(4);
-
-        cv::RNG &rng = TS::ptr()->get_rng();
-        cv::Size srcSize = cv::Size(MWIDTH, MHEIGHT);
-        cv::Size map1Size = cv::Size(MWIDTH, MHEIGHT);
-        double min = 5, max = 16;
-
-        if(srcType != nulltype)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
         {
-            src = randomMat(rng, srcSize, srcType, min, max, false);
-        }
-        if((map1Type == CV_16SC2 && map2Type == nulltype) || (map1Type == CV_32FC2 && map2Type == nulltype))
-        {
-            map1 = randomMat(rng, map1Size, map1Type, min, max, false);
+            SUBTEST << size << 'x' << size << "; " << type_name[j] << " ; up";
 
-        }
-        else if (map1Type == CV_32FC1 && map2Type == CV_32FC1)
-        {
-            map1 = randomMat(rng, map1Size, map1Type, min, max, false);
-            map2 = randomMat(rng, map1Size, map1Type, min, max, false);
+            gen(src, size, size, all_type[j], 0, 256);
+
+            resize(src, dst, Size(), 2.0, 2.0);
+
+            CPU_ON;
+            resize(src, dst, Size(), 2.0, 2.0);
+            CPU_OFF;
+
+            d_src.upload(src);
+
+            WARMUP_ON;
+            ocl::resize(d_src, d_dst, Size(), 2.0, 2.0);
+            WARMUP_OFF;
+
+            GPU_ON;
+            ocl::resize(d_src, d_dst, Size(), 2.0, 2.0);
+             ;
+            GPU_OFF;
+
+            GPU_FULL_ON;
+            d_src.upload(src);
+            ocl::resize(d_src, d_dst, Size(), 2.0, 2.0);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
 
+    }
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
+    {
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
+        {
+            SUBTEST << size << 'x' << size << "; " << type_name[j] << " ; down";
+
+            gen(src, size, size, all_type[j], 0, 256);
+
+            resize(src, dst, Size(), 0.5, 0.5);
+
+            CPU_ON;
+            resize(src, dst, Size(), 0.5, 0.5);
+            CPU_OFF;
+
+            d_src.upload(src);
+
+            WARMUP_ON;
+            ocl::resize(d_src, d_dst, Size(), 0.5, 0.5);
+            WARMUP_OFF;
+
+            GPU_ON;
+            ocl::resize(d_src, d_dst, Size(), 0.5, 0.5);
+             ;
+            GPU_OFF;
+
+            GPU_FULL_ON;
+            d_src.upload(src);
+            ocl::resize(d_src, d_dst, Size(), 0.5, 0.5);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
+        }
+
+    }
+}
+///////////// threshold////////////////////////
+TEST(threshold)
+{
+    Mat src, dst;
+    ocl::oclMat d_src, d_dst;
+    // Times cv::threshold against ocl::threshold on size x size images:
+    // first 8-bit single-channel THRESH_BINARY, then 32-bit float THRESH_TRUNC.
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
+    {
+        SUBTEST << size << 'x' << size << "; 8UC1; THRESH_BINARY";
+
+        gen(src, size, size, CV_8U, 0, 100);
+
+        threshold(src, dst, 50.0, 0.0, THRESH_BINARY); // untimed run ahead of the CPU measurement; maxval is 0, so only the timing is of interest
+
+        CPU_ON;
+        threshold(src, dst, 50.0, 0.0, THRESH_BINARY);
+        CPU_OFF;
+
+        d_src.upload(src);
+
+        WARMUP_ON;
+        ocl::threshold(d_src, d_dst, 50.0, 0.0, THRESH_BINARY);
+        WARMUP_OFF;
+
+        GPU_ON;
+        ocl::threshold(d_src, d_dst, 50.0, 0.0, THRESH_BINARY);
+        // NOTE(review): confirm GPU_OFF waits for the kernel to finish before the timer is read.
+        GPU_OFF;
+
+        GPU_FULL_ON; // this section also times the upload and the download
+        d_src.upload(src);
+        ocl::threshold(d_src, d_dst, 50.0, 0.0, THRESH_BINARY);
+        d_dst.download(dst);
+        GPU_FULL_OFF;
+
+    }
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
+    {
+        SUBTEST << size << 'x' << size << "; 32FC1; THRESH_TRUNC";
+
+        gen(src, size, size, CV_32FC1, 0, 100);
+
+        threshold(src, dst, 50.0, 0.0, THRESH_TRUNC); // untimed run ahead of the CPU measurement
+
+        CPU_ON;
+        threshold(src, dst, 50.0, 0.0, THRESH_TRUNC);
+        CPU_OFF;
+
+        d_src.upload(src);
+
+        WARMUP_ON;
+        ocl::threshold(d_src, d_dst, 50.0, 0.0, THRESH_TRUNC);
+        WARMUP_OFF;
+
+        GPU_ON;
+        ocl::threshold(d_src, d_dst, 50.0, 0.0, THRESH_TRUNC);
+        // NOTE(review): confirm GPU_OFF waits for the kernel to finish before the timer is read.
+        GPU_OFF;
+
+        GPU_FULL_ON; // this section also times the upload and the download
+        d_src.upload(src);
+        ocl::threshold(d_src, d_dst, 50.0, 0.0, THRESH_TRUNC);
+        d_dst.download(dst);
+        GPU_FULL_OFF;
+    }
+}
+///////////// meanShiftFiltering////////////////////////
+TEST(meanShiftFiltering)
+{
+    int sp = 10, sr = 10; // the same sp/sr values are passed to the CPU and the OpenCL call
+    Mat src, dst;
+    // Times cv::pyrMeanShiftFiltering (8UC3 input) against ocl::meanShiftFiltering (8UC4 input).
+    ocl::oclMat d_src, d_dst;
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
+    {
+        SUBTEST << size << 'x' << size << "; 8UC3 vs 8UC4";
+
+        gen(src, size, size, CV_8UC3, Scalar::all(0), Scalar::all(256)); // 3-channel input for the CPU call
+
+        pyrMeanShiftFiltering(src, dst, sp, sr); // untimed run ahead of the CPU measurement
+
+        CPU_ON;
+        pyrMeanShiftFiltering(src, dst, sp, sr);
+        CPU_OFF;
+
+        gen(src, size, size, CV_8UC4, Scalar::all(0), Scalar::all(256)); // regenerate src as 4-channel for the OpenCL call
+
+        d_src.upload(src);
+
+        WARMUP_ON;
+        ocl::meanShiftFiltering(d_src, d_dst, sp, sr);
+        WARMUP_OFF;
+
+        GPU_ON;
+        ocl::meanShiftFiltering(d_src, d_dst, sp, sr);
+         ; // NOTE(review): empty statement; confirm GPU_OFF waits for the kernel to finish before the timer is read
+        GPU_OFF;
+
+        GPU_FULL_ON; // this section also times the upload and the download
+        d_src.upload(src);
+        ocl::meanShiftFiltering(d_src, d_dst, sp, sr);
+        d_dst.download(dst);
+        GPU_FULL_OFF;
+    }
+}
+///////////// meanShiftProc////////////////////////
+COOR do_meanShift(int x0, int y0, uchar *sptr, uchar *dptr, int sstep, cv::Size size, int sp, int sr, int maxIter, float eps, int *tab)
+{
+    // Mean shift for one pixel: starting at (x0, y0) with the colour at sptr, repeatedly moves to the mean position and colour of the accepted pixels in the window around the centre, then writes the final colour to dptr[0..3] and returns the final position.
+    int isr2 = sr * sr; // a pixel is accepted when the sum of its three tab[] lookups is <= isr2
+    int c0, c1, c2, c3; // centre colour: c0..c2 are updated every iteration, c3 is only copied through to dptr[3]
+    int iter;
+    uchar *ptr = NULL; // walks the pixels of the current window, 4 bytes per pixel
+    uchar *pstart = NULL; // source pixel at the current centre
+    int revx = 0, revy = 0; // how far the centre moved in the previous iteration
+    c0 = sptr[0];
+    c1 = sptr[1];
+    c2 = sptr[2];
+    c3 = sptr[3];
+
+    // iterate the mean-shift procedure, at most maxIter times
+    for (iter = 0; iter < maxIter; iter++)
+    {
+        int count = 0; // number of accepted pixels in this window
+        int s0 = 0, s1 = 0, s2 = 0, sx = 0, sy = 0; // sums of accepted colours and coordinates
+
+        // mean shift: process the pixels in the window [x0 - sp, x0 + sp] x [y0 - sp, y0 + sp]
+        int minx = x0 - sp;
+        int miny = y0 - sp;
+        int maxx = x0 + sp;
+        int maxy = y0 + sp;
+
+        // clip the window to the image boundary
+        if (minx < 0)
+        {
+            minx = 0;
+        }
+
+        if (miny < 0)
+        {
+            miny = 0;
+        }
+
+        if (maxx >= size.width)
+        {
+            maxx = size.width - 1;
+        }
+
+        if (maxy >= size.height)
+        {
+            maxy = size.height - 1;
+        }
+
+        if (iter == 0)
+        {
+            pstart = sptr;
+        }
         else
-            cout << "The wrong input type" << endl;
-
-        dst = randomMat(rng, map1Size, srcType, min, max, false);
-        switch (src.channels())
         {
-        case 1:
-            val = cv::Scalar(rng.uniform(0.0, 10.0), 0, 0, 0);
-            break;
-        case 2:
-            val = cv::Scalar(rng.uniform(0.0, 10.0), rng.uniform(0.0, 10.0), 0, 0);
-            break;
-        case 3:
-            val = cv::Scalar(rng.uniform(0.0, 10.0), rng.uniform(0.0, 10.0), rng.uniform(0.0, 10.0), 0);
-            break;
-        case 4:
-            val = cv::Scalar(rng.uniform(0.0, 10.0), rng.uniform(0.0, 10.0), rng.uniform(0.0, 10.0), rng.uniform(0.0, 10.0));
+            pstart = pstart + revy * sstep + (revx << 2); // point to the new centre; NOTE(review): revx can be negative, and << on a negative int is undefined before C++20
+        }
+
+        ptr = pstart;
+        ptr = ptr + (miny - y0) * sstep + ((minx - x0) << 2); // point to the top-left pixel of the clipped window
+
+        for (int y = miny; y <= maxy; y++, ptr += sstep - ((maxx - minx + 1) << 2)) // after each row, move ptr back to the window's first column on the next row
+        {
+            int rowCount = 0;
+            int x = minx;
+#if CV_ENABLE_UNROLLED
+
+            for (; x + 4 <= maxx; x += 4, ptr += 16) // manually unrolled: four pixels (16 bytes) per step
+            {
+                int t0, t1, t2;
+                t0 = ptr[0], t1 = ptr[1], t2 = ptr[2];
+
+                if (tab[t0 - c0 + 255] + tab[t1 - c1 + 255] + tab[t2 - c2 + 255] <= isr2) // the +255 offset keeps the index non-negative for differences down to -255
+                {
+                    s0 += t0;
+                    s1 += t1;
+                    s2 += t2;
+                    sx += x;
+                    rowCount++;
+                }
+
+                t0 = ptr[4], t1 = ptr[5], t2 = ptr[6];
+
+                if (tab[t0 - c0 + 255] + tab[t1 - c1 + 255] + tab[t2 - c2 + 255] <= isr2)
+                {
+                    s0 += t0;
+                    s1 += t1;
+                    s2 += t2;
+                    sx += x + 1;
+                    rowCount++;
+                }
+
+                t0 = ptr[8], t1 = ptr[9], t2 = ptr[10];
+
+                if (tab[t0 - c0 + 255] + tab[t1 - c1 + 255] + tab[t2 - c2 + 255] <= isr2)
+                {
+                    s0 += t0;
+                    s1 += t1;
+                    s2 += t2;
+                    sx += x + 2;
+                    rowCount++;
+                }
+
+                t0 = ptr[12], t1 = ptr[13], t2 = ptr[14];
+
+                if (tab[t0 - c0 + 255] + tab[t1 - c1 + 255] + tab[t2 - c2 + 255] <= isr2)
+                {
+                    s0 += t0;
+                    s1 += t1;
+                    s2 += t2;
+                    sx += x + 3;
+                    rowCount++;
+                }
+            }
+
+#endif
+
+            for (; x <= maxx; x++, ptr += 4) // remaining pixels of the row, one at a time
+            {
+                int t0 = ptr[0], t1 = ptr[1], t2 = ptr[2];
+
+                if (tab[t0 - c0 + 255] + tab[t1 - c1 + 255] + tab[t2 - c2 + 255] <= isr2)
+                {
+                    s0 += t0;
+                    s1 += t1;
+                    s2 += t2;
+                    sx += x;
+                    rowCount++;
+                }
+            }
+
+            if (rowCount == 0)
+            {
+                continue;
+            }
+
+            count += rowCount;
+            sy += y * rowCount; // every accepted pixel of this row has the same y
+        }
+
+        if (count == 0) // no pixel accepted: keep the current centre and stop
+        {
             break;
         }
 
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
-        //if you want to use undefault device, set it here
-        //setDevice(oclinfo[0]);
-        //cv::ocl::setBinpath(CLBINPATH);
-    }
-    void Has_roi(int b)
-    {
-        if(b)
+        int x1 = sx / count; // mean position and colour of the accepted pixels (integer division)
+        int y1 = sy / count;
+        s0 = s0 / count;
+        s1 = s1 / count;
+        s2 = s2 / count;
+
+        bool stopFlag = (x0 == x1 && y0 == y1) || (abs(x1 - x0) + abs(y1 - y0) +
+                        tab[s0 - c0 + 255] + tab[s1 - c1 + 255] + tab[s2 - c2 + 255] <= eps); // stop when the centre did not move, or when position shift plus the tab[] colour-shift terms is within eps
+
+        // remember the displacement so pstart can follow the centre in the next iteration
+        revx = x1 - x0;
+        revy = y1 - y0;
+
+        x0 = x1;
+        y0 = y1;
+        c0 = s0;
+        c1 = s1;
+        c2 = s2;
+
+        if (stopFlag)
         {
-            //randomize ROI
-            dst_roicols = dst.cols - 1;
-            dst_roirows = dst.rows - 1;
-
-            src_roicols = src.cols - 1;
-            src_roirows = src.rows - 1;
-
-
-            srcx = 1;
-            srcy = 1;
-            dstx = 1;
-            dsty = 1;
+            break;
         }
-        else
-        {
-            dst_roicols = dst.cols;
-            dst_roirows = dst.rows;
+    } //for iter
 
-            src_roicols = src.cols;
-            src_roirows = src.rows;
-
-
-            srcx = 0;
-            srcy = 0;
-            dstx = 0;
-            dsty = 0;
-        }
-        map1_roicols = dst_roicols;
-        map1_roirows = dst_roirows;
-        map2_roicols = dst_roicols;
-        map2_roirows = dst_roirows;
-        map1x = dstx;
-        map1y = dsty;
-        map2x = dstx;
-        map2y = dsty;
-
-        if((map1Type == CV_16SC2 && map2Type == nulltype) || (map1Type == CV_32FC2 && map2Type == nulltype))
-        {
-            map1_roi = map1(Rect(map1x, map1y, map1_roicols, map1_roirows));
-            gmap1_roi = map1_roi;
-        }
-
-        else if (map1Type == CV_32FC1 && map2Type == CV_32FC1)
-        {
-            map1_roi = map1(Rect(map1x, map1y, map1_roicols, map1_roirows));
-            map2_roi = map2(Rect(map2x, map2y, map2_roicols, map2_roirows));
-            gmap1_roi = map1_roi;
-            gmap2_roi = map2_roi;
-        }
-        dst_roi = dst(Rect(dstx, dsty, dst_roicols, dst_roirows));
-        src_roi = dst(Rect(srcx, srcy, src_roicols, src_roirows));
-
-    }
-};
-
-TEST_P(Remap, Mat)
-{
-    if((interpolation == 1 && map1Type == CV_16SC2) || (map1Type == CV_32FC1 && map2Type == nulltype) || (map1Type == CV_16SC2 && map2Type == CV_32FC1) || (map1Type == CV_32FC2 && map2Type == CV_32FC1))
-    {
-        cout << "LINEAR don't support the map1Type and map2Type" << endl;
-        return;
-    }
-    int bordertype[] = {cv::BORDER_CONSTANT, cv::BORDER_REPLICATE/*,BORDER_REFLECT,BORDER_WRAP,BORDER_REFLECT_101*/};
-    const char *borderstr[] = {"BORDER_CONSTANT", "BORDER_REPLICATE"/*, "BORDER_REFLECT","BORDER_WRAP","BORDER_REFLECT_101"*/};
-    cout << borderstr[0] << endl;
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = 0; k < 2; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::remap(src_roi, dst_roi, map1_roi, map2_roi, interpolation, bordertype[0], val);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start
-            gsrc_roi = src_roi;
-            gdst = dst;
-            gdst_roi = gdst(Rect(dstx, dsty, dst_roicols, dst_roirows));
-
-            t2 = (double)cvGetTickCount();//kernel
-            cv::ocl::remap(gsrc_roi, gdst_roi, gmap1_roi, gmap2_roi, interpolation, bordertype[0], val);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-
-            cv::Mat cpu_dst;
-            gdst.download(cpu_dst);
-
-            t1 = (double)cvGetTickCount() - t1;//gpu end
-
-            if (j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = 0; j < 2; j ++)
-    {
-        Has_roi(j);
-        gdst = dst;
-        gdst_roi = gdst(Rect(dstx, dsty, dst_roicols, dst_roirows));
-        gsrc_roi = src_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::remap(gsrc_roi, gdst_roi, gmap1_roi, gmap2_roi, interpolation, bordertype[0], val);
-    };
-#endif
+    dptr[0] = (uchar)c0; // write the final centre colour; the fourth channel is the input's, unchanged
+    dptr[1] = (uchar)c1;
+    dptr[2] = (uchar)c2;
+    dptr[3] = (uchar)c3;
 
+    COOR coor; // final centre position, narrowed to short
+    coor.x = static_cast<short>(x0);
+    coor.y = static_cast<short>(y0);
+    return coor;
 }
 
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// resize
-
-PARAM_TEST_CASE(Resize, MatType, cv::Size, double, double, int)
+void meanShiftProc_(const Mat &src_roi, Mat &dst_roi, Mat &dstCoor_roi, int sp, int sr, cv::TermCriteria crit) // CPU reference: runs do_meanShift on every pixel of src_roi, writing colours to dst_roi and positions to dstCoor_roi
 {
-    int type;
-    cv::Size dsize;
-    double fx, fy;
-    int interpolation;
 
-    //src mat
-    cv::Mat mat1;
-    cv::Mat dst;
-
-    // set up roi
-    int src_roicols;
-    int src_roirows;
-    int dst_roicols;
-    int dst_roirows;
-    int src1x;
-    int src1y;
-    int dstx;
-    int dsty;
-
-
-    //src mat with roi
-    cv::Mat mat1_roi;
-    cv::Mat dst_roi;
-    //std::vector<cv::ocl::Info> oclinfo;
-    //ocl dst mat for testing
-    cv::ocl::oclMat gdst_whole;
-
-    //ocl mat with roi
-    cv::ocl::oclMat gmat1;
-    cv::ocl::oclMat gdst;
-
-    virtual void SetUp()
+    if (src_roi.empty())
     {
-        type = GET_PARAM(0);
-        dsize = GET_PARAM(1);
-        fx = GET_PARAM(2);
-        fy = GET_PARAM(3);
-        interpolation = GET_PARAM(4);
-
-        cv::RNG &rng = TS::ptr()->get_rng();
-        cv::Size size(MWIDTH, MHEIGHT);
-
-        if(dsize == cv::Size() && !(fx > 0 && fy > 0))
-        {
-            cout << "invalid dsize and fx fy" << endl;
-            return;
-        }
-
-        if(dsize == cv::Size())
-        {
-            dsize.width = (int)(size.width * fx);
-            dsize.height = (int)(size.height * fy);
-        }
-
-        mat1 = randomMat(rng, size, type, 5, 16, false);
-        dst  = randomMat(rng, dsize, type, 5, 16, false);
-
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
-        ////if you want to use undefault device, set it here
-        ////setDevice(oclinfo[0]);
-        //cv::ocl::setBinpath(CLBINPATH);
-    }
-    void Has_roi(int b)
-    {
-        //cv::RNG& rng = TS::ptr()->get_rng();
-        if(b)
-        {
-            //randomize ROI
-            src_roicols =  mat1.cols - 1; //start
-            src_roirows = mat1.rows - 1;
-            dst_roicols = dst.cols - 1;
-            dst_roirows = dst.rows - 1;
-            src1x   = 1;
-            src1y   = 1;
-            dstx    = 1;
-            dsty    = 1;
-
-        }
-        else
-        {
-            src_roicols = mat1.cols;
-            src_roirows = mat1.rows;
-            dst_roicols = dst.cols;
-            dst_roirows = dst.rows;
-            src1x = 0;
-            src1y = 0;
-            dstx = 0;
-            dsty = 0;
-
-        };
-        mat1_roi = mat1(Rect(src1x, src1y, src_roicols, src_roirows));
-        dst_roi  = dst(Rect(dstx, dsty, dst_roicols, dst_roirows));
-
-
+        CV_Error(CV_StsBadArg, "The input image is empty");
     }
 
-};
-
-TEST_P(Resize, Mat)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
+    if (src_roi.depth() != CV_8U || src_roi.channels() != 4)
     {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::resize(mat1_roi, dst_roi, dsize, fx, fy, interpolation);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, dst_roicols, dst_roirows));
-
-            gmat1 = mat1_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::resize(gmat1, gdst, dsize, fx, fy, interpolation);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-
-            if(j == 0)
-                continue;
-
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+        CV_Error(CV_StsUnsupportedFormat, "Only 8-bit, 4-channel images are supported");
     }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
+    // Both outputs must already have the same width and height as the source.
+    CV_Assert((src_roi.cols == dst_roi.cols) && (src_roi.rows == dst_roi.rows) &&
+              (src_roi.cols == dstCoor_roi.cols) && (src_roi.rows == dstCoor_roi.rows));
+    CV_Assert(!(dstCoor_roi.step & 0x3)); // the coordinate row stride must be a multiple of 4 bytes
+    // Fall back to 5 iterations when the caller did not request an iteration limit.
+    if (!(crit.type & cv::TermCriteria::MAX_ITER))
     {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, dst_roicols, dst_roirows));
-        gmat1 = mat1_roi;
-        if(j == 0)
+        crit.maxCount = 5;
+    }
+
+    int maxIter = std::min(std::max(crit.maxCount, 1), 100); // clamp the iteration count to [1, 100]
+    // Default tolerance of 1 unless the caller requested EPS termination; only then
+    // use the caller's epsilon (clamped to be non-negative), so the default survives.
+    float eps = 1.f;
+
+    if (crit.type & cv::TermCriteria::EPS)
+    {
+        eps = (float)std::max(crit.epsilon, 0.0);
+    }
+
+    int tab[512]; // tab[d + 255] == d * d, for d in [-255, 256]
+
+    for (int i = 0; i < 512; i++)
+    {
+        tab[i] = (i - 255) * (i - 255);
+    }
+
+    uchar *sptr = src_roi.data;
+    uchar *dptr = dst_roi.data;
+    short *dCoorptr = (short *)dstCoor_roi.data;
+    int sstep = (int)src_roi.step;
+    int dstep = (int)dst_roi.step;
+    int dCoorstep = (int)dstCoor_roi.step >> 1; // coordinate row stride counted in shorts
+    cv::Size size = src_roi.size();
+    // Walk all pixels; the outer increments skip from the end of one row to the start of the next.
+    for (int i = 0; i < size.height; i++, sptr += sstep - (size.width << 2),
+            dptr += dstep - (size.width << 2), dCoorptr += dCoorstep - (size.width << 1))
+    {
+        for (int j = 0; j < size.width; j++, sptr += 4, dptr += 4, dCoorptr += 2)
         {
-            cout << "no roi:";
+            *((COOR *)dCoorptr) = do_meanShift(j, i, sptr, dptr, sstep, size, sp, sr, maxIter, eps, tab);
         }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::resize(gmat1, gdst, dsize, fx, fy, interpolation);
-    };
-#endif
+    }
 
 }
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//threshold
-
-PARAM_TEST_CASE(Threshold, MatType, ThreshOp)
+TEST(meanShiftProc) // times the CPU reference meanShiftProc_ against ocl::meanShiftProc on size x size 8UC4 images
 {
-    int type;
-    int threshOp;
+    Mat src, dst, dstCoor_roi;
+    ocl::oclMat d_src, d_dst, d_dstCoor_roi;
 
-    //src mat
-    cv::Mat mat1;
-    cv::Mat dst;
+    TermCriteria crit(TermCriteria::COUNT + TermCriteria::EPS, 5, 1); // at most 5 iterations, epsilon 1
 
-    // set up roi
-    int roicols;
-    int roirows;
-    int src1x;
-    int src1y;
-    int dstx;
-    int dsty;
-
-    //src mat with roi
-    cv::Mat mat1_roi;
-    cv::Mat dst_roi;
-    //std::vector<cv::ocl::Info> oclinfo;
-    //ocl dst mat for testing
-    cv::ocl::oclMat gdst_whole;
-
-    //ocl mat with roi
-    cv::ocl::oclMat gmat1;
-    cv::ocl::oclMat gdst;
-
-    virtual void SetUp()
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        type = GET_PARAM(0);
-        threshOp = GET_PARAM(1);
+        SUBTEST << size << 'x' << size << "; 8UC4 and CV_16SC2 ";
 
-        cv::RNG &rng = TS::ptr()->get_rng();
-        cv::Size size(MWIDTH, MHEIGHT);
+        gen(src, size, size, CV_8UC4, Scalar::all(0), Scalar::all(256));
+        gen(dst, size, size, CV_8UC4, Scalar::all(0), Scalar::all(256)); // the outputs are generated at full size; meanShiftProc_ asserts that their dimensions match src
+        gen(dstCoor_roi, size, size, CV_16SC2, Scalar::all(0), Scalar::all(256));
 
-        mat1 = randomMat(rng, size, type, 5, 16, false);
-        dst  = randomMat(rng, size, type, 5, 16, false);
+        meanShiftProc_(src, dst, dstCoor_roi, 5, 6, crit); // untimed run ahead of the CPU measurement (sp = 5, sr = 6)
 
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
-        ////if you want to use undefault device, set it here
-        ////setDevice(oclinfo[0]);
-        //cv::ocl::setBinpath(CLBINPATH);
-    }
-    void Has_roi(int b)
-    {
-        //cv::RNG& rng = TS::ptr()->get_rng();
-        if(b)
-        {
-            //randomize ROI
-            roicols =  mat1.cols - 1; //start
-            roirows = mat1.rows - 1;
-            src1x   = 1;
-            src1y   = 1;
-            dstx    = 1;
-            dsty    = 1;
+        CPU_ON;
+        meanShiftProc_(src, dst, dstCoor_roi, 5, 6, crit);
+        CPU_OFF;
 
-        }
-        else
-        {
-            roicols = mat1.cols;
-            roirows = mat1.rows;
-            src1x = 0;
-            src1y = 0;
-            dstx = 0;
-            dsty = 0;
+        d_src.upload(src);
 
-        };
-        mat1_roi = mat1(Rect(src1x, src1y, roicols, roirows));
-        dst_roi  = dst(Rect(dstx, dsty, roicols, roirows));
+        WARMUP_ON;
+        ocl::meanShiftProc(d_src, d_dst, d_dstCoor_roi, 5, 6, crit);
+        WARMUP_OFF;
 
+        GPU_ON;
+        ocl::meanShiftProc(d_src, d_dst, d_dstCoor_roi, 5, 6, crit);
+         ; // NOTE(review): empty statement; confirm GPU_OFF waits for the kernel to finish before the timer is read
+        GPU_OFF;
 
-    }
-};
+        GPU_FULL_ON; // this section also times the upload and both downloads
+        d_src.upload(src);
+        ocl::meanShiftProc(d_src, d_dst, d_dstCoor_roi, 5, 6, crit);
+        d_dst.download(dst);
+        d_dstCoor_roi.download(dstCoor_roi);
+        GPU_FULL_OFF;
 
-TEST_P(Threshold, Mat)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            double maxVal = randomDouble(20.0, 127.0);
-            double thresh = randomDouble(0.0, maxVal);
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::threshold(mat1_roi, dst_roi, thresh, maxVal, threshOp);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-            gmat1 = mat1_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::threshold(gmat1, gdst, thresh, maxVal, threshOp);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-
-            if(j == 0)
-                continue;
-
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        double maxVal = randomDouble(20.0, 127.0);
-        double thresh = randomDouble(0.0, maxVal);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        gmat1 = mat1_roi;
-
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::threshold(gmat1, gdst, thresh, maxVal, threshOp);
-    };
-#endif
-
-}
-///////////////////////////////////////////////////////////////////////////////////////////////////
-//meanShift
-
-PARAM_TEST_CASE(meanShiftTestBase, MatType, MatType, int, int, cv::TermCriteria)
-{
-    int type, typeCoor;
-    int sp, sr;
-    cv::TermCriteria crit;
-    //src mat
-    cv::Mat src;
-    cv::Mat dst;
-    cv::Mat dstCoor;
-
-    //set up roi
-    int roicols;
-    int roirows;
-    int srcx;
-    int srcy;
-    int dstx;
-    int dsty;
-
-    //src mat with roi
-    cv::Mat src_roi;
-    cv::Mat dst_roi;
-    cv::Mat dstCoor_roi;
-
-    //ocl dst mat
-    cv::ocl::oclMat gdst;
-    cv::ocl::oclMat gdstCoor;
-
-    //std::vector<cv::ocl::Info> oclinfo;
-    //ocl mat with roi
-    cv::ocl::oclMat gsrc_roi;
-    cv::ocl::oclMat gdst_roi;
-    cv::ocl::oclMat gdstCoor_roi;
-
-    virtual void SetUp()
-    {
-        type     = GET_PARAM(0);
-        typeCoor = GET_PARAM(1);
-        sp       = GET_PARAM(2);
-        sr       = GET_PARAM(3);
-        crit     = GET_PARAM(4);
-
-        cv::RNG &rng = TS::ptr()->get_rng();
-
-        // MWIDTH=256, MHEIGHT=256. defined in utility.hpp
-        cv::Size size = cv::Size(MWIDTH, MHEIGHT);
-
-        src = randomMat(rng, size, type, 5, 16, false);
-        dst = randomMat(rng, size, type, 5, 16, false);
-        dstCoor = randomMat(rng, size, typeCoor, 5, 16, false);
-
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
-        ////if you want to use undefault device, set it here
-        ////setDevice(oclinfo[0]);
-        //cv::ocl::setBinpath(CLBINPATH);
-    }
-
-    void Has_roi(int b)
-    {
-        if(b)
-        {
-            //randomize ROI
-            roicols = src.cols - 1;
-            roirows = src.rows - 1;
-            srcx = 1;
-            srcy = 1;
-            dstx = 1;
-            dsty = 1;
-        }
-        else
-        {
-            roicols = src.cols;
-            roirows = src.rows;
-            srcx = 0;
-            srcy = 0;
-            dstx = 0;
-            dsty = 0;
-        };
-
-        src_roi = src(Rect(srcx, srcy, roicols, roirows));
-        dst_roi = dst(Rect(dstx, dsty, roicols, roirows));
-        dstCoor_roi = dstCoor(Rect(dstx, dsty, roicols, roirows));
-
-        gdst = dst;
-        gdstCoor = dstCoor;
-    }
-};
-
-/////////////////////////meanShiftFiltering/////////////////////////////
-struct meanShiftFiltering : meanShiftTestBase {};
-
-TEST_P(meanShiftFiltering, Mat)
-{
-
-#ifndef PRINT_KERNEL_RUN_TIME
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = 0; k < 2; k++)
-    {
-        double totalgputick = 0;
-        double totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t1 = (double)cvGetTickCount();//gpu start1
-
-            gsrc_roi = src_roi;
-            gdst_roi = gdst(Rect(dstx, dsty, roicols, roirows));  //gdst_roi
-
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::meanShiftFiltering(gsrc_roi, gdst_roi, sp, sr, crit);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-
-            cv::Mat cpu_gdst;
-            gdst.download(cpu_gdst);//download
-
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-
-            if(j == 0)
-                continue;
-
-            totalgputick = t1 + totalgputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-
-        gsrc_roi = src_roi;
-        gdst_roi = gdst(Rect(dstx, dsty, roicols, roirows));  //gdst_roi
-
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::meanShiftFiltering(gsrc_roi, gdst_roi, sp, sr, crit);
-    };
-#endif
-
-}
-
-///////////////////////////meanShiftProc//////////////////////////////////
-struct meanShiftProc : meanShiftTestBase {};
-
-TEST_P(meanShiftProc, Mat)
-{
-
-#ifndef PRINT_KERNEL_RUN_TIME
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = 0; k < 2; k++)
-    {
-        double totalgputick = 0;
-        double totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t1 = (double)cvGetTickCount();//gpu start1
-
-            gsrc_roi = src_roi;
-            gdst_roi = gdst(Rect(dstx, dsty, roicols, roirows));  //gdst_roi
-            gdstCoor_roi = gdstCoor(Rect(dstx, dsty, roicols, roirows));
-
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::meanShiftProc(gsrc_roi, gdst_roi, gdstCoor_roi, sp, sr, crit);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-
-            cv::Mat cpu_gdstCoor;
-            gdstCoor.download(cpu_gdstCoor);//download
-
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-
-            if(j == 0)
-                continue;
-
-            totalgputick = t1 + totalgputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-
-        gsrc_roi = src_roi;
-        gdst_roi = gdst(Rect(dstx, dsty, roicols, roirows));  //gdst_roi
-        gdstCoor_roi = gdstCoor(Rect(dstx, dsty, roicols, roirows));
-
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::meanShiftProc(gsrc_roi, gdst_roi, gdstCoor_roi, sp, sr, crit);
-    };
-#endif
-
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////
-//hist
-
-void calcHistGold(const cv::Mat &src, cv::Mat &hist)
-{
-    hist.create(1, 256, CV_32SC1);
-    hist.setTo(cv::Scalar::all(0));
-
-    int *hist_row = hist.ptr<int>();
-    for (int y = 0; y < src.rows; ++y)
-    {
-        const uchar *src_row = src.ptr(y);
-
-        for (int x = 0; x < src.cols; ++x)
-            ++hist_row[src_row[x]];
     }
 }
 
-PARAM_TEST_CASE(histTestBase, MatType, MatType)
+///////////// remap ////////////////////////
+TEST(remap)
 {
-    int type_src;
+    Mat src, dst, xmap, ymap;
+    ocl::oclMat d_src, d_dst, d_xmap, d_ymap;
 
-    //src mat
-    cv::Mat src;
-    cv::Mat dst_hist;
-    //set up roi
-    int roicols;
-    int roirows;
-    int srcx;
-    int srcy;
-    //src mat with roi
-    cv::Mat src_roi;
-    //ocl dst mat, dst_hist and gdst_hist don't have roi
-    cv::ocl::oclMat gdst_hist;
+    int all_type[] = {CV_8UC1, CV_8UC4};
+    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
 
-    //ocl mat with roi
-    cv::ocl::oclMat gsrc_roi;
+    int interpolation = INTER_LINEAR;
+    int borderMode = BORDER_CONSTANT;
 
-    //    std::vector<cv::ocl::Info> oclinfo;
-
-    virtual void SetUp()
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        type_src   = GET_PARAM(0);
+        for (size_t t = 0; t < sizeof(all_type) / sizeof(int); t++)
+        {
+            SUBTEST << size << 'x' << size << "; src " << type_name[t] << "; map CV_32FC1";
 
-        cv::RNG &rng = TS::ptr()->get_rng();
-        cv::Size size = cv::Size(MWIDTH, MHEIGHT);
+            gen(src, size, size, all_type[t], 0, 256);
 
-        src = randomMat(rng, size, type_src, 0, 256, false);
+            xmap.create(size, size, CV_32FC1);
+            dst.create(size, size, CV_32FC1);
+            ymap.create(size, size, CV_32FC1);
+
+            for (int i = 0; i < size; ++i)
+            {
+                float *xmap_row = xmap.ptr<float>(i);
+                float *ymap_row = ymap.ptr<float>(i);
+
+                for (int j = 0; j < size; ++j)
+                {
+                    xmap_row[j] = (j - size * 0.5f) * 0.75f + size * 0.5f;
+                    ymap_row[j] = (i - size * 0.5f) * 0.75f + size * 0.5f;
+                }
+            }
+
+
+            remap(src, dst, xmap, ymap, interpolation, borderMode);
+
+            CPU_ON;
+            remap(src, dst, xmap, ymap, interpolation, borderMode);
+            CPU_OFF;
+
+            d_src.upload(src);
+            d_dst.upload(dst);
+            d_xmap.upload(xmap);
+            d_ymap.upload(ymap);
+
+            WARMUP_ON;
+            ocl::remap(d_src, d_dst, d_xmap, d_ymap, interpolation, borderMode);
+            WARMUP_OFF;
+
+            GPU_ON;
+            ocl::remap(d_src, d_dst, d_xmap, d_ymap, interpolation, borderMode);
+             ;
+            GPU_OFF;
+
+            GPU_FULL_ON;
+            d_src.upload(src);
+            ocl::remap(d_src, d_dst, d_xmap, d_ymap, interpolation, borderMode);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
+        }
 
-        //        int devnums = getDevice(oclinfo);
-        //        CV_Assert(devnums > 0);
-        //if you want to use undefault device, set it here
-        //setDevice(oclinfo[0]);
     }
-
-    void Has_roi(int b)
-    {
-        if(b)
-        {
-            //randomize ROI
-            roicols = src.cols - 1;
-            roirows = src.rows - 1;
-            srcx = 1;
-            srcy = 1;
-        }
-        else
-        {
-            roicols = src.cols;
-            roirows = src.rows;
-            srcx = 0;
-            srcy = 0;
-        };
-        src_roi = src(Rect(srcx, srcy, roicols, roirows));
-    }
-};
-
-///////////////////////////calcHist///////////////////////////////////////
-struct calcHist : histTestBase {};
-
-TEST_P(calcHist, Mat)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = 0; k < 2; k++)
-    {
-        double totalcputick = 0;
-        double totalgputick = 0;
-        double totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            calcHistGold(src_roi, dst_hist);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-
-            gsrc_roi = src_roi;
-
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::calcHist(gsrc_roi, gdst_hist);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-
-            cv::Mat cpu_hist;
-            gdst_hist.download(cpu_hist);//download
-
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-
-            if(j == 0)
-                continue;
-
-            totalcputick = t0 + totalcputick;
-            totalgputick = t1 + totalgputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = 0; j < 2; j ++)
-    {
-        Has_roi(j);
-
-        gsrc_roi = src_roi;
-
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::calcHist(gsrc_roi, gdst_hist);
-    };
-#endif
-}
-
-
-//************test*******************
-
-INSTANTIATE_TEST_CASE_P(ImgprocTestBase, equalizeHist, Combine(
-                            ONE_TYPE(CV_8UC1),
-                            NULL_TYPE,
-                            ONE_TYPE(CV_8UC1),
-                            NULL_TYPE,
-                            NULL_TYPE,
-                            Values(false))); // Values(false) is the reserved parameter
-
-INSTANTIATE_TEST_CASE_P(ImgprocTestBase, bilateralFilter, Combine(
-                            Values(CV_8UC1, CV_8UC3),
-                            NULL_TYPE,
-                            Values(CV_8UC1, CV_8UC3),
-                            NULL_TYPE,
-                            NULL_TYPE,
-                            Values(false))); // Values(false) is the reserved parameter
-
-
-INSTANTIATE_TEST_CASE_P(ImgprocTestBase, CopyMakeBorder, Combine(
-                            Values(CV_8UC1, CV_8UC4/*, CV_32SC1*/),
-                            NULL_TYPE,
-                            Values(CV_8UC1, CV_8UC4/*,CV_32SC1*/),
-                            NULL_TYPE,
-                            NULL_TYPE,
-                            Values(false))); // Values(false) is the reserved parameter
-INSTANTIATE_TEST_CASE_P(ImgprocTestBase, cornerMinEigenVal, Combine(
-                            Values(CV_8UC1, CV_32FC1),
-                            NULL_TYPE,
-                            ONE_TYPE(CV_32FC1),
-                            NULL_TYPE,
-                            NULL_TYPE,
-                            Values(false))); // Values(false) is the reserved parameter
-
-INSTANTIATE_TEST_CASE_P(ImgprocTestBase, cornerHarris, Combine(
-                            Values(CV_8UC1, CV_32FC1),
-                            NULL_TYPE,
-                            ONE_TYPE(CV_32FC1),
-                            NULL_TYPE,
-                            NULL_TYPE,
-                            Values(false))); // Values(false) is the reserved parameter
-
-
-INSTANTIATE_TEST_CASE_P(ImgprocTestBase, integral, Combine(
-                            ONE_TYPE(CV_8UC1),
-                            NULL_TYPE,
-                            ONE_TYPE(CV_32SC1),
-                            ONE_TYPE(CV_32FC1),
-                            NULL_TYPE,
-                            Values(false))); // Values(false) is the reserved parameter
-
-INSTANTIATE_TEST_CASE_P(Imgproc, WarpAffine, Combine(
-                            Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
-                            Values((MatType)cv::INTER_NEAREST, (MatType)cv::INTER_LINEAR,
-                                   (MatType)cv::INTER_CUBIC, (MatType)(cv::INTER_NEAREST | cv::WARP_INVERSE_MAP),
-                                   (MatType)(cv::INTER_LINEAR | cv::WARP_INVERSE_MAP), (MatType)(cv::INTER_CUBIC | cv::WARP_INVERSE_MAP))));
-
-
-INSTANTIATE_TEST_CASE_P(Imgproc, WarpPerspective, Combine
-                        (Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
-                         Values((MatType)cv::INTER_NEAREST, (MatType)cv::INTER_LINEAR,
-                                (MatType)cv::INTER_CUBIC, (MatType)(cv::INTER_NEAREST | cv::WARP_INVERSE_MAP),
-                                (MatType)(cv::INTER_LINEAR | cv::WARP_INVERSE_MAP), (MatType)(cv::INTER_CUBIC | cv::WARP_INVERSE_MAP))));
-
-
-INSTANTIATE_TEST_CASE_P(Imgproc, Resize, Combine(
-                            Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),  Values(cv::Size()),
-                            Values(0.5/*, 1.5, 2*/), Values(0.5/*, 1.5, 2*/), Values((MatType)cv::INTER_NEAREST, (MatType)cv::INTER_LINEAR)));
-
-
-INSTANTIATE_TEST_CASE_P(Imgproc, Threshold, Combine(
-                            Values(CV_8UC1, CV_32FC1), Values(ThreshOp(cv::THRESH_BINARY),
-                                    ThreshOp(cv::THRESH_BINARY_INV), ThreshOp(cv::THRESH_TRUNC),
-                                    ThreshOp(cv::THRESH_TOZERO), ThreshOp(cv::THRESH_TOZERO_INV))));
-
-INSTANTIATE_TEST_CASE_P(Imgproc, meanShiftFiltering, Combine(
-                            ONE_TYPE(CV_8UC4),
-                            ONE_TYPE(CV_16SC2),//it is no use in meanShiftFiltering
-                            Values(5),
-                            Values(6),
-                            Values(cv::TermCriteria(cv::TermCriteria::COUNT + cv::TermCriteria::EPS, 5, 1))
-                        ));
-
-INSTANTIATE_TEST_CASE_P(Imgproc, meanShiftProc, Combine(
-                            ONE_TYPE(CV_8UC4),
-                            ONE_TYPE(CV_16SC2),
-                            Values(5),
-                            Values(6),
-                            Values(cv::TermCriteria(cv::TermCriteria::COUNT + cv::TermCriteria::EPS, 5, 1))
-                        ));
-
-INSTANTIATE_TEST_CASE_P(Imgproc, Remap, Combine(
-                            Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
-                            Values(CV_32FC1, CV_16SC2, CV_32FC2), Values(-1, CV_32FC1),
-                            Values((int)cv::INTER_NEAREST, (int)cv::INTER_LINEAR),
-                            Values((int)cv::BORDER_CONSTANT)));
-
-INSTANTIATE_TEST_CASE_P(histTestBase, calcHist, Combine(
-                            ONE_TYPE(CV_8UC1),
-                            ONE_TYPE(CV_32SC1) //no use
-                        ));
-
-#endif // HAVE_OPENCL
+}
\ No newline at end of file
diff --git a/modules/ocl/perf/perf_match_template.cpp b/modules/ocl/perf/perf_match_template.cpp
index cb5e86bab..2828efe01 100644
--- a/modules/ocl/perf/perf_match_template.cpp
+++ b/modules/ocl/perf/perf_match_template.cpp
@@ -42,191 +42,105 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-
 #include "precomp.hpp"
-#include <iomanip>
-#ifdef HAVE_OPENCL
-using namespace cv;
-using namespace cv::ocl;
-using namespace cvtest;
-using namespace testing;
-using namespace std;
 
-#ifndef MWC_TEST_UTILITY
-#define MWC_TEST_UTILITY
-//////// Utility
-#ifndef DIFFERENT_SIZES
-#else
-#undef DIFFERENT_SIZES
-#endif
-#define DIFFERENT_SIZES testing::Values(cv::Size(256, 256), cv::Size(3000, 3000))
-
-// Param class
-#ifndef IMPLEMENT_PARAM_CLASS
-#define IMPLEMENT_PARAM_CLASS(name, type) \
-class name \
-{ \
-public: \
-    name ( type arg = type ()) : val_(arg) {} \
-    operator type () const {return val_;} \
-private: \
-    type val_; \
-}; \
-    inline void PrintTo( name param, std::ostream* os) \
-{ \
-    *os << #name <<  "(" << testing::PrintToString(static_cast< type >(param)) << ")"; \
-}
-
-IMPLEMENT_PARAM_CLASS(Channels, int)
-#endif // IMPLEMENT_PARAM_CLASS
-#endif // MWC_TEST_UTILITY
-
-////////////////////////////////////////////////////////////////////////////////
-// MatchTemplate
-#define ALL_TEMPLATE_METHODS testing::Values(TemplateMethod(cv::TM_SQDIFF), TemplateMethod(cv::TM_CCORR), TemplateMethod(cv::TM_CCOEFF), TemplateMethod(cv::TM_SQDIFF_NORMED), TemplateMethod(cv::TM_CCORR_NORMED), TemplateMethod(cv::TM_CCOEFF_NORMED))
-
-IMPLEMENT_PARAM_CLASS(TemplateSize, cv::Size);
-
-const char *TEMPLATE_METHOD_NAMES[6] = {"TM_SQDIFF", "TM_SQDIFF_NORMED", "TM_CCORR", "TM_CCORR_NORMED", "TM_CCOEFF", "TM_CCOEFF_NORMED"};
-
-PARAM_TEST_CASE(MatchTemplate, cv::Size, TemplateSize, Channels, TemplateMethod)
+/////////// matchTemplate ////////////////////////
+//void InitMatchTemplate()
+//{
+//	Mat src; gen(src, 500, 500, CV_32F, 0, 1);
+//	Mat templ; gen(templ, 500, 500, CV_32F, 0, 1);
+//	ocl::oclMat d_src(src), d_templ(templ), d_dst;
+//	ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR);
+//}
+TEST(matchTemplate)
 {
-    cv::Size size;
-    cv::Size templ_size;
-    int cn;
-    int method;
-    //vector<cv::ocl::Info> oclinfo;
+    //InitMatchTemplate();
 
-    virtual void SetUp()
+    Mat src, templ, dst;
+    int templ_size = 5;
+
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        size = GET_PARAM(0);
-        templ_size = GET_PARAM(1);
-        cn = GET_PARAM(2);
-        method = GET_PARAM(3);
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
+        int all_type[] = {CV_32FC1, CV_32FC4};
+        std::string type_name[] = {"CV_32FC1", "CV_32FC4"};
+
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
+        {
+            for(templ_size = 5; templ_size <= 5; templ_size *= 5)
+            {
+                gen(src, size, size, all_type[j], 0, 1);
+
+                SUBTEST << src.cols << 'x' << src.rows << "; " << type_name[j] << "; templ " << templ_size << 'x' << templ_size << "; CCORR";
+
+                gen(templ, templ_size, templ_size, all_type[j], 0, 1);
+
+                matchTemplate(src, templ, dst, CV_TM_CCORR);
+
+                CPU_ON;
+                matchTemplate(src, templ, dst, CV_TM_CCORR);
+                CPU_OFF;
+
+                ocl::oclMat d_src(src), d_templ, d_dst;
+
+                d_templ.upload(templ);
+
+                WARMUP_ON;
+                ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR);
+                WARMUP_OFF;
+
+                GPU_ON;
+                ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR);
+                 ;
+                GPU_OFF;
+
+                GPU_FULL_ON;
+                d_src.upload(src);
+                d_templ.upload(templ);
+                ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR);
+                d_dst.download(dst);
+                GPU_FULL_OFF;
+            }
+        }
+
+        int all_type_8U[] = {CV_8UC1};
+        std::string type_name_8U[] = {"CV_8UC1"};
+
+        for (size_t j = 0; j < sizeof(all_type_8U) / sizeof(int); j++)
+        {
+            for(templ_size = 5; templ_size <= 5; templ_size *= 5)
+            {
+                SUBTEST << src.cols << 'x' << src.rows << "; " << type_name_8U[j] << "; templ " << templ_size << 'x' << templ_size << "; CCORR_NORMED";
+
+                gen(src, size, size, all_type_8U[j], 0, 255);
+
+                gen(templ, templ_size, templ_size, all_type_8U[j], 0, 255);
+
+                matchTemplate(src, templ, dst, CV_TM_CCORR_NORMED);
+
+                CPU_ON;
+                matchTemplate(src, templ, dst, CV_TM_CCORR_NORMED);
+                CPU_OFF;
+
+                ocl::oclMat d_src(src);
+                ocl::oclMat d_templ(templ), d_dst;
+
+                WARMUP_ON;
+                ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR_NORMED);
+                WARMUP_OFF;
+
+                GPU_ON;
+                ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR_NORMED);
+                 ;
+                GPU_OFF;
+
+                GPU_FULL_ON;
+                d_src.upload(src);
+                d_templ.upload(templ);
+                ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR_NORMED);
+                d_dst.download(dst);
+                GPU_FULL_OFF;
+            }
+        }
     }
-};
-struct MatchTemplate8U : MatchTemplate {};
-
-TEST_P(MatchTemplate8U, Performance)
-{
-    std::cout << "Method: " << TEMPLATE_METHOD_NAMES[method] << std::endl;
-    std::cout << "Image Size: (" << size.width << ", " << size.height << ")" << std::endl;
-    std::cout << "Template Size: (" << templ_size.width << ", " << templ_size.height << ")" << std::endl;
-    std::cout << "Channels: " << cn << std::endl;
-
-    cv::Mat image = randomMat(size, CV_MAKETYPE(CV_8U, cn));
-    cv::Mat templ = randomMat(templ_size, CV_MAKETYPE(CV_8U, cn));
-    cv::Mat dst_gold;
-    cv::ocl::oclMat dst;
-
-
-
-
-
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-
-    double t1 = 0;
-    double t2 = 0;
-    for(int j = 0; j < LOOP_TIMES + 1; j ++)
-    {
-
-        t1 = (double)cvGetTickCount();//gpu start1
-
-        cv::ocl::oclMat ocl_image = cv::ocl::oclMat(image);//upload
-        cv::ocl::oclMat ocl_templ = cv::ocl::oclMat(templ);//upload
-
-        t2 = (double)cvGetTickCount(); //kernel
-        cv::ocl::matchTemplate(ocl_image, ocl_templ, dst, method);
-        t2 = (double)cvGetTickCount() - t2;//kernel
-
-        cv::Mat cpu_dst;
-        dst.download (cpu_dst);//download
-
-        t1 = (double)cvGetTickCount() - t1;//gpu end1
-
-        if(j == 0)
-            continue;
-
-        totalgputick = t1 + totalgputick;
-        totalgputick_kernel = t2 + totalgputick_kernel;
-
-    }
-
-    cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-
-
-}
-
-
-struct MatchTemplate32F : MatchTemplate {};
-TEST_P(MatchTemplate32F, Performance)
-{
-    std::cout << "Method: " << TEMPLATE_METHOD_NAMES[method] << std::endl;
-    std::cout << "Image Size: (" << size.width << ", " << size.height << ")" << std::endl;
-    std::cout << "Template Size: (" << templ_size.width << ", " << templ_size.height << ")" << std::endl;
-    std::cout << "Channels: " << cn << std::endl;
-    cv::Mat image = randomMat(size, CV_MAKETYPE(CV_32F, cn));
-    cv::Mat templ = randomMat(templ_size, CV_MAKETYPE(CV_32F, cn));
-
-    cv::Mat dst_gold;
-    cv::ocl::oclMat dst;
-
-
-
-
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-
-    double t1 = 0;
-    double t2 = 0;
-    for(int j = 0; j < LOOP_TIMES; j ++)
-    {
-
-        t1 = (double)cvGetTickCount();//gpu start1
-
-        cv::ocl::oclMat ocl_image = cv::ocl::oclMat(image);//upload
-        cv::ocl::oclMat ocl_templ = cv::ocl::oclMat(templ);//upload
-
-        t2 = (double)cvGetTickCount(); //kernel
-        cv::ocl::matchTemplate(ocl_image, ocl_templ, dst, method);
-        t2 = (double)cvGetTickCount() - t2;//kernel
-
-        cv::Mat cpu_dst;
-        dst.download (cpu_dst);//download
-
-        t1 = (double)cvGetTickCount() - t1;//gpu end1
-
-        totalgputick = t1 + totalgputick;
-
-        totalgputick_kernel = t2 + totalgputick_kernel;
-
-    }
-
-    cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-
-
-
-}
-
-
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplate8U,
-                        testing::Combine(
-                            testing::Values(cv::Size(1280, 1024), cv::Size(MWIDTH, MHEIGHT), cv::Size(1800, 1500)),
-                            testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16))/*, TemplateSize(cv::Size(30, 30))*/),
-                            testing::Values(Channels(1), Channels(4)/*, Channels(3)*/),
-                            ALL_TEMPLATE_METHODS
-                        )
-                       );
-
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplate32F, testing::Combine(
-                            testing::Values(cv::Size(1280, 1024), cv::Size(MWIDTH, MHEIGHT), cv::Size(1800, 1500)),
-                            testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16))/*, TemplateSize(cv::Size(30, 30))*/),
-                            testing::Values(Channels(1), Channels(4) /*, Channels(3)*/),
-                            testing::Values(TemplateMethod(cv::TM_SQDIFF), TemplateMethod(cv::TM_CCORR))));
-
-#endif //HAVE_OPENCL
\ No newline at end of file
+}
\ No newline at end of file
diff --git a/modules/ocl/perf/perf_matrix_operation.cpp b/modules/ocl/perf/perf_matrix_operation.cpp
index ba011f8df..495b2b82c 100644
--- a/modules/ocl/perf/perf_matrix_operation.cpp
+++ b/modules/ocl/perf/perf_matrix_operation.cpp
@@ -10,12 +10,12 @@
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//    Jia Haipeng, jiahaipeng95@gmail.com
+//    Fangfang Bai, fangfang@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -30,7 +30,7 @@
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
-// This software is provided by the copyright holders and contributors "as is" and
+// This software is provided by the copyright holders and contributors "as is" and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
@@ -42,697 +42,140 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-
 #include "precomp.hpp"
 
-#ifdef HAVE_OPENCL
-
-using namespace cvtest;
-using namespace testing;
-using namespace std;
-using namespace cv::ocl;
-////////////////////////////////converto/////////////////////////////////////////////////
-PARAM_TEST_CASE(ConvertToTestBase, MatType, MatType)
+///////////// ConvertTo ////////////////////////
+TEST(ConvertTo)
 {
-    int type;
-    int dst_type;
+    Mat src, dst;
+    ocl::oclMat d_src, d_dst;
 
-    //src mat
-    cv::Mat mat;
-    cv::Mat dst;
+    int all_type[] = {CV_8UC1, CV_8UC4};
+    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
 
-    // set up roi
-    int roicols;
-    int roirows;
-    int srcx;
-    int srcy;
-    int dstx;
-    int dsty;
-
-    //src mat with roi
-    cv::Mat mat_roi;
-    cv::Mat dst_roi;
-    //std::vector<cv::ocl::Info> oclinfo;
-    //ocl dst mat for testing
-    cv::ocl::oclMat gdst_whole;
-
-    //ocl mat with roi
-    cv::ocl::oclMat gmat;
-    cv::ocl::oclMat gdst;
-
-    virtual void SetUp()
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        type     = GET_PARAM(0);
-        dst_type = GET_PARAM(1);
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
+        {
+            SUBTEST << size << 'x' << size << "; " << type_name[j] << " to 32FC1";
 
-        cv::RNG &rng = TS::ptr()->get_rng();
-        cv::Size size(MWIDTH, MHEIGHT);
+            gen(src, size, size, all_type[j], 0, 256);
+            //gen(dst, size, size, all_type[j], 0, 256);
+
+            //d_dst.upload(dst);
+
+            src.convertTo(dst, CV_32FC1);
+
+            CPU_ON;
+            src.convertTo(dst, CV_32FC1);
+            CPU_OFF;
+
+            d_src.upload(src);
+
+            WARMUP_ON;
+            d_src.convertTo(d_dst, CV_32FC1);
+            WARMUP_OFF;
+
+            GPU_ON;
+            d_src.convertTo(d_dst, CV_32FC1);
+             ;
+            GPU_OFF;
+
+            GPU_FULL_ON;
+            d_src.upload(src);
+            d_src.convertTo(d_dst, CV_32FC1);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
+        }
 
-        mat = randomMat(rng, size, type, 5, 16, false);
-        dst  = randomMat(rng, size, type, 5, 16, false);
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
-        ////if you want to use undefault device, set it here
-        ////setDevice(oclinfo[0]);
-        //setBinpath(CLBINPATH);
     }
-
-    void Has_roi(int b)
-    {
-        //cv::RNG& rng = TS::ptr()->get_rng();
-        if(b)
-        {
-            //randomize ROI
-            roicols =  mat.cols - 1; //start
-            roirows = mat.rows - 1;
-            srcx   = 1;
-            srcy   = 1;
-            dstx    = 1;
-            dsty    = 1;
-        }
-        else
-        {
-            roicols = mat.cols;
-            roirows = mat.rows;
-            srcx   = 0;
-            srcy   = 0;
-            dstx   = 0;
-            dsty   = 0;
-        };
-
-        mat_roi = mat(Rect(srcx, srcy, roicols, roirows));
-        dst_roi  = dst(Rect(dstx, dsty, roicols, roirows));
-
-        //gdst_whole = dst;
-        //gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
-
-        //gmat = mat_roi;
-    }
-};
-
-
-struct ConvertTo : ConvertToTestBase {};
-
-TEST_P(ConvertTo, Accuracy)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            mat_roi.convertTo(dst_roi, dst_type);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat = mat_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            gmat.convertTo(gdst, dst_type);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-        gmat = mat_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        gmat.convertTo(gdst, dst_type);
-    };
-#endif
-
 }
-
-
-///////////////////////////////////////////copyto/////////////////////////////////////////////////////////////
-
-PARAM_TEST_CASE(CopyToTestBase, MatType, bool)
+///////////// copyTo ////////////////////////
+TEST(copyTo)
 {
-    int type;
+    Mat src, dst;
+    ocl::oclMat d_src, d_dst;
 
-    cv::Mat mat;
-    cv::Mat mask;
-    cv::Mat dst;
+    int all_type[] = {CV_8UC1, CV_8UC4};
+    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
 
-    // set up roi
-    int roicols;
-    int roirows;
-    int srcx;
-    int srcy;
-    int dstx;
-    int dsty;
-    int maskx;
-    int masky;
-
-    //src mat with roi
-    cv::Mat mat_roi;
-    cv::Mat mask_roi;
-    cv::Mat dst_roi;
-    //std::vector<cv::ocl::Info> oclinfo;
-    //ocl dst mat for testing
-    cv::ocl::oclMat gdst_whole;
-
-    //ocl mat with roi
-    cv::ocl::oclMat gmat;
-    cv::ocl::oclMat gdst;
-    cv::ocl::oclMat gmask;
-
-    virtual void SetUp()
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        type = GET_PARAM(0);
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
+        {
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
 
-        cv::RNG &rng = TS::ptr()->get_rng();
-        cv::Size size(MWIDTH, MHEIGHT);
+            gen(src, size, size, all_type[j], 0, 256);
+            //gen(dst, size, size, all_type[j], 0, 256);
 
-        mat = randomMat(rng, size, type, 5, 16, false);
-        dst  = randomMat(rng, size, type, 5, 16, false);
-        mask = randomMat(rng, size, CV_8UC1, 0, 2,  false);
+            //d_dst.upload(dst);
+
+            src.copyTo(dst);
+
+            CPU_ON;
+            src.copyTo(dst);
+            CPU_OFF;
+
+            d_src.upload(src);
+
+            WARMUP_ON;
+            d_src.copyTo(d_dst);
+            WARMUP_OFF;
+
+            GPU_ON;
+            d_src.copyTo(d_dst);
+             ;
+            GPU_OFF;
+
+            GPU_FULL_ON;
+            d_src.upload(src);
+            d_src.copyTo(d_dst);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
+        }
 
-        cv::threshold(mask, mask, 0.5, 255., CV_8UC1);
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
-        ////if you want to use undefault device, set it here
-        ////setDevice(oclinfo[0]);
-        //setBinpath(CLBINPATH);
     }
-
-    void Has_roi(int b)
-    {
-        //cv::RNG& rng = TS::ptr()->get_rng();
-        if(b)
-        {
-            //randomize ROI
-            roicols =  mat.cols - 1; //start
-            roirows = mat.rows - 1;
-            srcx   = 1;
-            srcy   = 1;
-            dstx    = 1;
-            dsty    = 1;
-            maskx   = 1;
-            masky   = 1;
-        }
-        else
-        {
-            roicols = mat.cols;
-            roirows = mat.rows;
-            srcx   = 0;
-            srcy   = 0;
-            dstx   = 0;
-            dsty   = 0;
-            maskx   = 0;
-            masky   = 0;
-        };
-
-        mat_roi = mat(Rect(srcx, srcy, roicols, roirows));
-        mask_roi = mask(Rect(maskx, masky, roicols, roirows));
-        dst_roi  = dst(Rect(dstx, dsty, roicols, roirows));
-
-        //gdst_whole = dst;
-        //gdst = gdst_whole(Rect(dstx,dsty,roicols,roirows));
-
-        //gmat = mat_roi;
-        //gmask = mask_roi;
-    }
-};
-
-struct CopyTo : CopyToTestBase {};
-
-TEST_P(CopyTo, Without_mask)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            mat_roi.copyTo(dst_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-            gmat = mat_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            gmat.copyTo(gdst);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-        gmat = mat_roi;
-
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        gmat.copyTo(gdst);
-    };
-#endif
 }
-
-TEST_P(CopyTo, With_mask)
+// setTo benchmark: Mat::setTo vs. oclMat::setTo with a fixed scalar on square CV_8UC1 and CV_8UC4 matrices.
+TEST(setTo)
 {
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
+    Mat src, dst; // NOTE(review): dst is not used in the visible body
+    Scalar val(1, 2, 3, 4); // value written by every setTo call below
+    ocl::oclMat d_src, d_dst; // NOTE(review): d_dst is not used in the visible body
+
+    int all_type[] = {CV_8UC1, CV_8UC4}; // element types swept by the inner loop
+    std::string type_name[] = {"CV_8UC1", "CV_8UC4"}; // labels for all_type, index-aligned
+
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple) // NOTE(review): sweep constants are defined outside this file
     {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++) // one subtest per entry of all_type
         {
-            Has_roi(k);
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ; // label: "<size>x<size>; <type>"
 
-            t0 = (double)cvGetTickCount();//cpu start
-            mat_roi.copyTo(dst_roi, mask_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
+            gen(src, size, size, all_type[j], 0, 256); // NOTE(review): gen() is defined elsewhere; presumably fills src with values in [0, 256) - confirm
 
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
+            src.setTo(val); // untimed call before the CPU_ON/CPU_OFF section
 
-            gmat = mat_roi;
-            gmask = mask_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            gmat.copyTo(gdst, gmask);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
+            CPU_ON; // NOTE(review): timing macros come from outside this file; presumably CPU_ON/CPU_OFF bracket the measured host call
+            src.setTo(val);
+            CPU_OFF;
 
+            d_src.upload(src); // device copy used by the warm-up and GPU_ON sections
+
+            WARMUP_ON; // same call as the GPU_ON section, presumably run first to absorb one-time costs - confirm
+            d_src.setTo(val);
+            WARMUP_OFF;
+
+            GPU_ON; // device call only; the operand is already on the device
+            d_src.setTo(val);
+             ; // empty statement, no effect
+            GPU_OFF;
+
+            GPU_FULL_ON; // upload + device call; this section performs no download
+            d_src.upload(src);
+            d_src.setTo(val);
+            GPU_FULL_OFF;
         }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
+
     }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-        gmat = mat_roi;
-        gmask = mask_roi;
-
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        gmat.copyTo(gdst, gmask);
-    };
-#endif
-}
-
-///////////////////////////////////////////copyto/////////////////////////////////////////////////////////////
-
-PARAM_TEST_CASE(SetToTestBase, MatType, bool)
-{
-    int type;
-    cv::Scalar val;
-
-    cv::Mat mat;
-    cv::Mat mask;
-
-    // set up roi
-    int roicols;
-    int roirows;
-    int srcx;
-    int srcy;
-    int maskx;
-    int masky;
-
-    //src mat with roi
-    cv::Mat mat_roi;
-    cv::Mat mask_roi;
-    //std::vector<cv::ocl::Info> oclinfo;
-    //ocl dst mat for testing
-    cv::ocl::oclMat gmat_whole;
-
-    //ocl mat with roi
-    cv::ocl::oclMat gmat;
-    cv::ocl::oclMat gmask;
-
-    virtual void SetUp()
-    {
-        type = GET_PARAM(0);
-
-        cv::RNG &rng = TS::ptr()->get_rng();
-        cv::Size size(MWIDTH, MHEIGHT);
-
-        mat = randomMat(rng, size, type, 5, 16, false);
-        mask = randomMat(rng, size, CV_8UC1, 0, 2,  false);
-
-        cv::threshold(mask, mask, 0.5, 255., CV_8UC1);
-        val = cv::Scalar(rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0));
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
-        ////if you want to use undefault device, set it here
-        ////setDevice(oclinfo[0]);
-        //setBinpath(CLBINPATH);
-    }
-
-    void Has_roi(int b)
-    {
-        //cv::RNG& rng = TS::ptr()->get_rng();
-        if(b)
-        {
-            //randomize ROI
-            roicols =  mat.cols - 1; //start
-            roirows = mat.rows - 1;
-            srcx   = 1;
-            srcy   = 1;
-            maskx   = 1;
-            masky   = 1;
-        }
-        else
-        {
-            roicols = mat.cols;
-            roirows = mat.rows;
-            srcx   = 0;
-            srcy   = 0;
-            maskx   = 0;
-            masky   = 0;
-        };
-
-        mat_roi = mat(Rect(srcx, srcy, roicols, roirows));
-        mask_roi = mask(Rect(maskx, masky, roicols, roirows));
-
-        //gmat_whole = mat;
-        //gmat = gmat_whole(Rect(srcx,srcy,roicols,roirows));
-
-        //gmask = mask_roi;
-    }
-};
-
-struct SetTo : SetToTestBase {};
-
-TEST_P(SetTo, Without_mask)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            mat_roi.setTo(val);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gmat_whole = mat;
-            gmat = gmat_whole(Rect(srcx, srcy, roicols, roirows));
-            t2 = (double)cvGetTickCount(); //kernel
-            gmat.setTo(val);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gmat_whole.download(cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gmat_whole = mat;
-        gmat = gmat_whole(Rect(srcx, srcy, roicols, roirows));
-
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        gmat.setTo(val);
-    };
-#endif
-}
-
-TEST_P(SetTo, With_mask)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-
-            t0 = (double)cvGetTickCount();//cpu start
-            mat_roi.setTo(val, mask_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gmat_whole = mat;
-            gmat = gmat_whole(Rect(srcx, srcy, roicols, roirows));
-
-            gmask = mask_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            gmat.setTo(val, gmask);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gmat_whole.download(cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gmat_whole = mat;
-        gmat = gmat_whole(Rect(srcx, srcy, roicols, roirows));
-
-        gmask = mask_roi;
-
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        gmat.setTo(val, gmask);
-    };
-#endif
-}
-PARAM_TEST_CASE(DataTransfer, MatType, bool)
-{
-    int type;
-    cv::Mat mat;
-    cv::ocl::oclMat gmat_whole;
-
-    virtual void SetUp()
-    {
-        type = GET_PARAM(0);
-        cv::RNG &rng = TS::ptr()->get_rng();
-        cv::Size size(MWIDTH, MHEIGHT);
-        mat = randomMat(rng, size, type, 5, 16, false);
-    }
-};
-TEST_P(DataTransfer, perf)
-{
-    double totaluploadtick = 0;
-    double totaldownloadtick = 0;
-    double totaltick = 0;
-    double t0 = 0;
-    double t1 = 0;
-    cv::Mat cpu_dst;
-    for(int j = 0; j < LOOP_TIMES + 1; j ++)
-    {
-        t0 = (double)cvGetTickCount();
-        gmat_whole.upload(mat);//upload
-        t0 = (double)cvGetTickCount() - t0;
-
-        t1 = (double)cvGetTickCount();
-        gmat_whole.download(cpu_dst);//download
-        t1 = (double)cvGetTickCount() - t1;
-
-        if(j == 0)
-            continue;
-        totaluploadtick = t0 + totaluploadtick;
-        totaldownloadtick = t1 + totaldownloadtick;
-    }
-    totaltick = totaluploadtick + totaldownloadtick;
-    cout << "average upload time is  " << totaluploadtick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    cout << "average download time is  " << totaldownloadtick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    cout << "average data transfer time is  " << totaltick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-}
-//**********test************
-
-INSTANTIATE_TEST_CASE_P(MatrixOperation, ConvertTo, Combine(
-                            Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
-                            Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4)));
-
-INSTANTIATE_TEST_CASE_P(MatrixOperation, CopyTo, Combine(
-                            Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
-                            Values(false))); // Values(false) is the reserved parameter
-
-INSTANTIATE_TEST_CASE_P(MatrixOperation, SetTo, Combine(
-                            Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
-                            Values(false))); // Values(false) is the reserved parameter
-INSTANTIATE_TEST_CASE_P(MatrixOperation, DataTransfer, Combine(
-                            Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4),
-                            Values(false))); // Values(false) is the reserved parameter
-#endif
+}
\ No newline at end of file
diff --git a/modules/ocl/perf/perf_norm.cpp b/modules/ocl/perf/perf_norm.cpp
new file mode 100644
index 000000000..8b7118a6e
--- /dev/null
+++ b/modules/ocl/perf/perf_norm.cpp
@@ -0,0 +1,84 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Fangfang Bai, fangfang@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#include "precomp.hpp"
+
+// norm benchmark: NORM_INF of the difference of two square CV_8UC1 matrices, host norm vs. ocl::norm.
+TEST(norm)
+{
+    Mat src, buf;
+    ocl::oclMat d_src, d_buf;
+
+    // The host side uses the two-matrix overload so that it measures the same operation as ocl::norm(d_src, d_buf, ...).
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
+    {
+        SUBTEST << size << 'x' << size << "; CV_8UC1; NORM_INF";
+
+        gen(src, size, size, CV_8UC1, Scalar::all(0), Scalar::all(1));
+        gen(buf, size, size, CV_8UC1, Scalar::all(0), Scalar::all(1));
+
+        norm(src, buf, NORM_INF); // untimed call before the CPU_ON/CPU_OFF section
+
+        CPU_ON; // NOTE(review): timing macros come from outside this file; presumably CPU_ON/CPU_OFF bracket the measured host call
+        norm(src, buf, NORM_INF);
+        CPU_OFF;
+
+        d_src.upload(src);
+        d_buf.upload(buf);
+
+        WARMUP_ON; // same call as the GPU_ON section, presumably run first to absorb one-time costs - confirm
+        ocl::norm(d_src, d_buf, NORM_INF);
+        WARMUP_OFF;
+
+        GPU_ON; // device call only; both operands are already on the device
+        ocl::norm(d_src, d_buf, NORM_INF);
+        GPU_OFF;
+
+        GPU_FULL_ON; // uploads of both operands + device call
+        d_src.upload(src);
+        d_buf.upload(buf);
+        ocl::norm(d_src, d_buf, NORM_INF);
+        GPU_FULL_OFF;
+    }
+}
\ No newline at end of file
diff --git a/modules/ocl/perf/perf_pyrdown.cpp b/modules/ocl/perf/perf_pyrdown.cpp
index 2cea4df4a..1d1d2dec1 100644
--- a/modules/ocl/perf/perf_pyrdown.cpp
+++ b/modules/ocl/perf/perf_pyrdown.cpp
@@ -1,4 +1,4 @@
-///////////////////////////////////////////////////////////////////////////////////////
+/*M///////////////////////////////////////////////////////////////////////////////////////
 //
 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
 //
@@ -15,7 +15,7 @@
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//    fangfang bai, fangfang@multicorewareinc.com
+//    Fangfang Bai, fangfang@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -30,7 +30,7 @@
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
-// This software is provided by the copyright holders and contributors "as is" and
+// This software is provided by the copyright holders and contributors as is and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
@@ -42,96 +42,46 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-
 #include "precomp.hpp"
-#include <iomanip>
 
-#ifdef HAVE_OPENCL
-
-using namespace cv;
-using namespace cv::ocl;
-using namespace cvtest;
-using namespace testing;
-using namespace std;
-
-PARAM_TEST_CASE(PyrDown, MatType, int)
+// pyrDown benchmark: host pyrDown vs. ocl::pyrDown on square CV_8UC1 and CV_8UC4 inputs.
+TEST(pyrDown)
 {
-    int type;
-    int channels;
-    //src mat
-    cv::Mat mat1;
-    cv::Mat dst;
+    Mat src, dst;
+    int all_type[] = {CV_8UC1, CV_8UC4}; // element types swept by the inner loop
+    std::string type_name[] = {"CV_8UC1", "CV_8UC4"}; // labels for all_type, index-aligned
 
-    //std::vector<cv::ocl::Info> oclinfo;
-    //ocl dst mat for testing
-
-    cv::ocl::oclMat gmat1;
-    cv::ocl::oclMat gdst;
-
-
-    virtual void SetUp()
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple) // NOTE(review): sweep constants are defined outside this file
     {
-        type = GET_PARAM(0);
-        channels = GET_PARAM(1);
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
-    }
-
-
-};
-
-#define VARNAME(A) string(#A);
-
-////////////////////////////////PyrDown/////////////////////////////////////////////////
-TEST_P(PyrDown, Mat)
-{
-    cv::Size size(MWIDTH, MHEIGHT);
-    cv::RNG &rng = TS::ptr()->get_rng();
-    mat1 = randomMat(rng, size, CV_MAKETYPE(type, channels), 5, 16, false);
-
-
-    cv::ocl::oclMat gdst;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-
-    double t1 = 0;
-    double t2 = 0;
-
-    for (int j = 0; j < LOOP_TIMES + 1; j ++)
-    {
-
-        t1 = (double)cvGetTickCount();//gpu start1
-
-        cv::ocl::oclMat gmat1(mat1);
-
-        t2 = (double)cvGetTickCount(); //kernel
-        cv::ocl::pyrDown(gmat1, gdst);
-        t2 = (double)cvGetTickCount() - t2;//kernel
-
-        cv::Mat cpu_dst;
-        gdst.download(cpu_dst);
-
-        t1 = (double)cvGetTickCount() - t1;//gpu end1
-
-        if (j == 0)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++) // one subtest per entry of all_type
         {
-            continue;
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ; // label: "<size>x<size>; <type>"
+
+            gen(src, size, size, all_type[j], 0, 256); // NOTE(review): gen() is defined elsewhere; presumably fills src with values in [0, 256) - confirm
+
+            pyrDown(src, dst); // untimed call before the CPU_ON/CPU_OFF section
+
+            CPU_ON; // NOTE(review): timing macros come from outside this file; presumably CPU_ON/CPU_OFF bracket the measured host call
+            pyrDown(src, dst);
+            CPU_OFF;
+
+            ocl::oclMat d_src(src); // device matrix constructed from the host input
+            ocl::oclMat d_dst; // left empty here; passed as the output argument of ocl::pyrDown
+
+            WARMUP_ON; // same call as the GPU_ON section, presumably run first to absorb one-time costs - confirm
+            ocl::pyrDown(d_src, d_dst);
+            WARMUP_OFF;
+
+            GPU_ON; // device call only; the input is already on the device
+            ocl::pyrDown(d_src, d_dst);
+             ; // empty statement, no effect
+            GPU_OFF;
+
+            GPU_FULL_ON; // upload + device call + download
+            d_src.upload(src);
+            ocl::pyrDown(d_src, d_dst);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-
-        totalgputick = t1 + totalgputick;
-
-        totalgputick_kernel = t2 + totalgputick_kernel;
-
     }
-
-    cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-
-}
-
-//********test****************
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, PyrDown, Combine(
-                            Values(CV_8U, CV_32F), Values(1, 4)));
-
-
-#endif // HAVE_OPENCL
+}
\ No newline at end of file
diff --git a/modules/ocl/perf/perf_pyrlk.cpp b/modules/ocl/perf/perf_pyrlk.cpp
new file mode 100644
index 000000000..f7fc22b9d
--- /dev/null
+++ b/modules/ocl/perf/perf_pyrlk.cpp
@@ -0,0 +1,143 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Fangfang Bai, fangfang@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#include "precomp.hpp"
+
+// PyrLKOpticalFlow benchmark: sparse optical flow on two image pairs, calcOpticalFlowPyrLK vs. ocl::PyrLKOpticalFlow::sparse.
+TEST(PyrLKOpticalFlow)
+{
+    std::string images1[] = {"rubberwhale1.png", "aloeL.jpg"}; // first frame of each pair
+    std::string images2[] = {"rubberwhale2.png", "aloeR.jpg"}; // second frame of each pair, index-aligned with images1
+
+    for (size_t i = 0; i < sizeof(images1) / sizeof(std::string); i++)
+    {
+        Mat frame0 = imread(abspath(images1[i]), i == 0 ? IMREAD_COLOR : IMREAD_GRAYSCALE); // pair 0 is read in color, pair 1 in grayscale
+
+        if (frame0.empty())
+        {
+            std::string errstr = "can't open " + images1[i];
+            throw runtime_error(errstr); // a missing input image aborts the benchmark
+        }
+
+        Mat frame1 = imread(abspath(images2[i]), i == 0 ? IMREAD_COLOR : IMREAD_GRAYSCALE); // same read mode as frame0
+
+        if (frame1.empty())
+        {
+            std::string errstr = "can't open " + images2[i];
+            throw runtime_error(errstr);
+        }
+
+        Mat gray_frame; // filled only for the color pair; used as the corner-detection input
+
+        if (i == 0)
+        {
+            cvtColor(frame0, gray_frame, COLOR_BGR2GRAY);
+        }
+
+        for (int points = Min_Size; points <= Max_Size; points *= Multiple) // the sweep constants are used here as the maxCorners argument
+        {
+            if (i == 0)
+                SUBTEST << frame0.cols << "x" << frame0.rows << "; color; " << points << " points";
+            else
+                SUBTEST << frame0.cols << "x" << frame0.rows << "; gray; " << points << " points";
+            Mat nextPts_cpu; // host destinations for the downloads in the GPU_FULL section
+            Mat status_cpu;
+
+            vector<Point2f> pts;
+            goodFeaturesToTrack(i == 0 ? gray_frame : frame0, pts, points, 0.01, 0.0); // corners tracked by both implementations
+
+            vector<Point2f> nextPts;
+            vector<unsigned char> status;
+
+            vector<float> err;
+
+            calcOpticalFlowPyrLK(frame0, frame1, pts, nextPts, status, err); // untimed call before the CPU_ON/CPU_OFF section
+
+            CPU_ON; // NOTE(review): timing macros come from outside this file; presumably CPU_ON/CPU_OFF bracket the measured host call
+            calcOpticalFlowPyrLK(frame0, frame1, pts, nextPts, status, err);
+            CPU_OFF;
+
+            ocl::PyrLKOpticalFlow d_pyrLK;
+
+            ocl::oclMat d_frame0(frame0); // device matrices constructed from the host frames
+            ocl::oclMat d_frame1(frame1);
+
+            ocl::oclMat d_pts;
+            Mat pts_mat(1, (int)pts.size(), CV_32FC2, (void *)&pts[0]); // 1xN header over pts' storage; &pts[0] requires pts to be non-empty
+            d_pts.upload(pts_mat);
+
+            ocl::oclMat d_nextPts;
+            ocl::oclMat d_status;
+            ocl::oclMat d_err;
+
+            WARMUP_ON; // same call as the GPU_ON section, presumably run first to absorb one-time costs - confirm
+            d_pyrLK.sparse(d_frame0, d_frame1, d_pts, d_nextPts, d_status, &d_err);
+            WARMUP_OFF;
+
+            GPU_ON; // device call only; the inputs are already on the device
+            d_pyrLK.sparse(d_frame0, d_frame1, d_pts, d_nextPts, d_status, &d_err);
+             ; // empty statement, no effect
+            GPU_OFF;
+
+            GPU_FULL_ON; // uploads + device call + downloads of the non-empty results
+            d_frame0.upload(frame0);
+            d_frame1.upload(frame1);
+            d_pts.upload(pts_mat);
+            d_pyrLK.sparse(d_frame0, d_frame1, d_pts, d_nextPts, d_status, &d_err);
+
+            if (!d_nextPts.empty())
+            {
+                d_nextPts.download(nextPts_cpu);
+            }
+
+            if (!d_status.empty())
+            {
+                d_status.download(status_cpu);
+            }
+
+            GPU_FULL_OFF;
+        }
+
+    }
+}
diff --git a/modules/ocl/perf/perf_pyrup.cpp b/modules/ocl/perf/perf_pyrup.cpp
index a023353ed..d3b3003a2 100644
--- a/modules/ocl/perf/perf_pyrup.cpp
+++ b/modules/ocl/perf/perf_pyrup.cpp
@@ -15,7 +15,7 @@
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//    fangfang bai fangfang@multicorewareinc.com
+//    Fangfang Bai, fangfang@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -30,7 +30,7 @@
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
-// This software is provided by the copyright holders and contributors "as is" and
+// This software is provided by the copyright holders and contributors as is and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
@@ -42,81 +42,46 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-
-#include "opencv2/core/core.hpp"
 #include "precomp.hpp"
-#include <iomanip>
-#ifdef HAVE_OPENCL
-using namespace cv;
-using namespace cv::ocl;
-using namespace cvtest;
-using namespace testing;
-using namespace std;
 
-
-PARAM_TEST_CASE(PyrUp, MatType, int)
+// pyrUp benchmark: host pyrUp vs. ocl::pyrUp on square CV_8UC1 and CV_8UC4 inputs.
+TEST(pyrUp)
 {
-    int type;
-    int channels;
-    //std::vector<cv::ocl::Info> oclinfo;
+    Mat src, dst;
+    int all_type[] = {CV_8UC1, CV_8UC4}; // element types swept by the inner loop
+    std::string type_name[] = {"CV_8UC1", "CV_8UC4"}; // labels for all_type, index-aligned
 
-    virtual void SetUp()
+    for (int size = 500; size <= 2000; size *= 2) // sizes 500, 1000, 2000; NOTE(review): literal range rather than Min_Size/Max_Size/Multiple - confirm this is intended
     {
-        type = GET_PARAM(0);
-        channels = GET_PARAM(1);
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
-    }
-};
-
-TEST_P(PyrUp, Performance)
-{
-    cv::Size size(MWIDTH, MHEIGHT);
-    cv::Mat src = randomMat(size, CV_MAKETYPE(type, channels));
-    cv::Mat dst_gold;
-    cv::ocl::oclMat dst;
-
-
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-
-    double t1 = 0;
-    double t2 = 0;
-
-    for (int j = 0; j < LOOP_TIMES + 1; j ++)
-    {
-        t1 = (double)cvGetTickCount();//gpu start1
-
-        cv::ocl::oclMat srcMat = cv::ocl::oclMat(src);//upload
-
-        t2 = (double)cvGetTickCount(); //kernel
-        cv::ocl::pyrUp(srcMat, dst);
-        t2 = (double)cvGetTickCount() - t2;//kernel
-
-        cv::Mat cpu_dst;
-        dst.download(cpu_dst); //download
-
-        t1 = (double)cvGetTickCount() - t1;//gpu end1
-
-        if (j == 0)
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++) // one subtest per entry of all_type
         {
-            continue;
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ; // label: "<size>x<size>; <type>"
+
+            gen(src, size, size, all_type[j], 0, 256); // NOTE(review): gen() is defined elsewhere; presumably fills src with values in [0, 256) - confirm
+
+            pyrUp(src, dst); // untimed call before the CPU_ON/CPU_OFF section
+
+            CPU_ON; // NOTE(review): timing macros come from outside this file; presumably CPU_ON/CPU_OFF bracket the measured host call
+            pyrUp(src, dst);
+            CPU_OFF;
+
+            ocl::oclMat d_src(src); // device matrix constructed from the host input
+            ocl::oclMat d_dst; // left empty here; passed as the output argument of ocl::pyrUp
+
+            WARMUP_ON; // same call as the GPU_ON section, presumably run first to absorb one-time costs - confirm
+            ocl::pyrUp(d_src, d_dst);
+            WARMUP_OFF;
+
+            GPU_ON; // device call only; the input is already on the device
+            ocl::pyrUp(d_src, d_dst);
+             ; // empty statement, no effect
+            GPU_OFF;
+
+            GPU_FULL_ON; // upload + device call + download
+            d_src.upload(src);
+            ocl::pyrUp(d_src, d_dst);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
         }
-
-        totalgputick = t1 + totalgputick;
-
-        totalgputick_kernel = t2 + totalgputick_kernel;
-
     }
-
-
-    cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-
-
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, PyrUp, Combine(
-                            Values(CV_8U, CV_32F), Values(1, 4)));
-
-#endif // HAVE_OPENCL
\ No newline at end of file
+}
\ No newline at end of file
diff --git a/modules/ocl/perf/perf_split_merge.cpp b/modules/ocl/perf/perf_split_merge.cpp
index 67a3d24ae..48ff1ff15 100644
--- a/modules/ocl/perf/perf_split_merge.cpp
+++ b/modules/ocl/perf/perf_split_merge.cpp
@@ -10,12 +10,12 @@
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//    Jia Haipeng, jiahaipeng95@gmail.com
+//    Fangfang Bai, fangfang@multicorewareinc.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -30,7 +30,7 @@
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
-// This software is provided by the copyright holders and contributors "as is" and
+// This software is provided by the copyright holders and contributors as is and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
@@ -42,446 +42,109 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-
 #include "precomp.hpp"
 
-#ifdef HAVE_OPENCL
-
-using namespace cvtest;
-using namespace testing;
-using namespace std;
-using namespace cv::ocl;
-PARAM_TEST_CASE(MergeTestBase, MatType, int)
+///////////// Merge////////////////////////
+TEST(Merge)
 {
-    int type;
-    int channels;
+    Mat dst;
+    ocl::oclMat d_dst;
 
-    //src mat
-    cv::Mat mat1;
-    cv::Mat mat2;
-    cv::Mat mat3;
-    cv::Mat mat4;
+    int channels = 4;
+    int all_type[] = {CV_8UC1, CV_32FC1};
+    std::string type_name[] = {"CV_8UC1", "CV_32FC1"};
 
-    //dst mat
-    cv::Mat dst;
-
-    // set up roi
-    int roicols;
-    int roirows;
-    int src1x;
-    int src1y;
-    int src2x;
-    int src2y;
-    int src3x;
-    int src3y;
-    int src4x;
-    int src4y;
-    int dstx;
-    int dsty;
-
-    //src mat with roi
-    cv::Mat mat1_roi;
-    cv::Mat mat2_roi;
-    cv::Mat mat3_roi;
-    cv::Mat mat4_roi;
-
-    //dst mat with roi
-    cv::Mat dst_roi;
-    //std::vector<cv::ocl::Info> oclinfo;
-    //ocl dst mat for testing
-    cv::ocl::oclMat gdst_whole;
-
-    //ocl mat with roi
-    cv::ocl::oclMat gmat1;
-    cv::ocl::oclMat gmat2;
-    cv::ocl::oclMat gmat3;
-    cv::ocl::oclMat gmat4;
-    cv::ocl::oclMat gdst;
-
-    virtual void SetUp()
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        type = GET_PARAM(0);
-        channels = GET_PARAM(1);
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
+        {
+            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
+            Size size1 = Size(size, size);
+            std::vector<Mat> src(channels);
 
-        cv::RNG &rng = TS::ptr()->get_rng();
-        cv::Size size(MWIDTH, MHEIGHT);
+            for (int i = 0; i < channels; ++i)
+            {
+                src[i] = Mat(size1, all_type[j], cv::Scalar::all(i));
+            }
+
+            merge(src, dst);
+
+            CPU_ON;
+            merge(src, dst);
+            CPU_OFF;
+
+            std::vector<ocl::oclMat> d_src(channels);
+
+            for (int i = 0; i < channels; ++i)
+            {
+                d_src[i] = ocl::oclMat(size1, all_type[j], cv::Scalar::all(i));
+            }
+
+            WARMUP_ON;
+            ocl::merge(d_src, d_dst);
+            WARMUP_OFF;
+
+            GPU_ON;
+            ocl::merge(d_src, d_dst);
+             ;
+            GPU_OFF;
+
+            GPU_FULL_ON;
+            // Rebuild the device inputs with this subtest's own type. This was hard-coded to CV_8U,
+            // so the CV_32FC1 subtest timed an 8-bit merge in its full-path measurement.
+            for (int i = 0; i < channels; ++i)
+            {
+                d_src[i] = ocl::oclMat(size1, all_type[j], cv::Scalar::all(i));
+            }
+            ocl::merge(d_src, d_dst);
+            d_dst.download(dst);
+            GPU_FULL_OFF;
+        }
 
-        mat1 = randomMat(rng, size, CV_MAKETYPE(type, 1), 5, 16, false);
-        mat2 = randomMat(rng, size, CV_MAKETYPE(type, 1), 5, 16, false);
-        mat3 = randomMat(rng, size, CV_MAKETYPE(type, 1), 5, 16, false);
-        mat4 = randomMat(rng, size, CV_MAKETYPE(type, 1), 5, 16, false);
-        dst  = randomMat(rng, size, CV_MAKETYPE(type, channels), 5, 16, false);
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
-        ////if you want to use undefault device, set it here
-        ////setDevice(oclinfo[0]);
-        //setBinpath(CLBINPATH);
     }
-    void Has_roi(int b)
-    {
-        //cv::RNG& rng = TS::ptr()->get_rng();
-        if(b)
-        {
-            //randomize ROI
-            roicols =  mat1.cols - 1; //start
-            roirows = mat1.rows - 1;
-            src1x   = 1;
-            src1y   = 1;
-            src2x   = 1;
-            src2y   = 1;
-            src3x   = 1;
-            src3y   = 1;
-            src4x   = 1;
-            src4y   = 1;
-            dstx    = 1;
-            dsty    = 1;
-
-        }
-        else
-        {
-            roicols = mat1.cols;
-            roirows = mat1.rows;
-            src1x   = 0;
-            src1y   = 0;
-            src2x   = 0;
-            src2y   = 0;
-            src3x   = 0;
-            src3y   = 0;
-            src4x   = 0;
-            src4y   = 0;
-            dstx    = 0;
-            dsty    = 0;
-        };
-
-        mat1_roi = mat1(Rect(src1x, src1y, roicols, roirows));
-        mat2_roi = mat2(Rect(src2x, src2y, roicols, roirows));
-        mat3_roi = mat3(Rect(src3x, src3y, roicols, roirows));
-        mat4_roi = mat4(Rect(src4x, src4y, roicols, roirows));
-
-
-        dst_roi = dst(Rect(dstx, dsty, roicols, roirows));
-    }
-
-};
-
-struct Merge : MergeTestBase {};
-
-TEST_P(Merge, Accuracy)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-            std::vector<cv::Mat> dev_src;
-            dev_src.push_back(mat1_roi);
-            dev_src.push_back(mat2_roi);
-            dev_src.push_back(mat3_roi);
-            dev_src.push_back(mat4_roi);
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::merge(dev_src, dst_roi);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1	]
-            gmat1 = mat1_roi;
-            gmat2 = mat2_roi;
-            gmat3 = mat3_roi;
-            gmat4 = mat4_roi;
-            gdst_whole = dst;
-            gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-            std::vector<cv::ocl::oclMat> dev_gsrc;
-            dev_gsrc.push_back(gmat1);
-            dev_gsrc.push_back(gmat2);
-            dev_gsrc.push_back(gmat3);
-            dev_gsrc.push_back(gmat4);
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::merge(dev_gsrc, gdst);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst;
-            gdst_whole.download (cpu_dst);//download
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-
-            if(j == 0)
-                continue;
-
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        gmat1 = mat1_roi;
-        gmat2 = mat2_roi;
-        gmat3 = mat3_roi;
-        gmat4 = mat4_roi;
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-        std::vector<cv::ocl::oclMat> dev_gsrc;
-        dev_gsrc.push_back(gmat1);
-        dev_gsrc.push_back(gmat2);
-        dev_gsrc.push_back(gmat3);
-        dev_gsrc.push_back(gmat4);
-
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::merge(dev_gsrc, gdst);
-    };
-#endif
 }
 
-
-PARAM_TEST_CASE(SplitTestBase, MatType, int)
+///////////// Split////////////////////////
+TEST(Split)
 {
-    int type;
-    int channels;
+    //int channels = 4;
+    int all_type[] = {CV_8UC1, CV_32FC1};
+    std::string type_name[] = {"CV_8UC1", "CV_32FC1"};
 
-    //src mat
-    cv::Mat mat;
-
-    //dstmat
-    cv::Mat dst1;
-    cv::Mat dst2;
-    cv::Mat dst3;
-    cv::Mat dst4;
-
-    // set up roi
-    int roicols;
-    int roirows;
-    int srcx;
-    int srcy;
-    int dst1x;
-    int dst1y;
-    int dst2x;
-    int dst2y;
-    int dst3x;
-    int dst3y;
-    int dst4x;
-    int dst4y;
-
-    //src mat with roi
-    cv::Mat mat_roi;
-
-    //dst mat with roi
-    cv::Mat dst1_roi;
-    cv::Mat dst2_roi;
-    cv::Mat dst3_roi;
-    cv::Mat dst4_roi;
-    //std::vector<cv::ocl::Info> oclinfo;
-    //ocl dst mat for testing
-    cv::ocl::oclMat gdst1_whole;
-    cv::ocl::oclMat gdst2_whole;
-    cv::ocl::oclMat gdst3_whole;
-    cv::ocl::oclMat gdst4_whole;
-
-    //ocl mat with roi
-    cv::ocl::oclMat gmat;
-    cv::ocl::oclMat gdst1;
-    cv::ocl::oclMat gdst2;
-    cv::ocl::oclMat gdst3;
-    cv::ocl::oclMat gdst4;
-
-    virtual void SetUp()
+    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
     {
-        type = GET_PARAM(0);
-        channels = GET_PARAM(1);
+        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
+        {
+            SUBTEST << size << 'x' << size << "; " << type_name[j];
+            Size size1 = Size(size, size);
 
-        cv::RNG &rng = TS::ptr()->get_rng();
-        cv::Size size(MWIDTH, MHEIGHT);
+            Mat src(size1, CV_MAKE_TYPE(all_type[j], 4), cv::Scalar(1, 2, 3, 4));
+
+            std::vector<cv::Mat> dst;
+
+            split(src, dst);
+
+            CPU_ON;
+            split(src, dst);
+            CPU_OFF;
+
+            ocl::oclMat d_src(size1, CV_MAKE_TYPE(all_type[j], 4), cv::Scalar(1, 2, 3, 4));
+            std::vector<cv::ocl::oclMat> d_dst;
+
+            WARMUP_ON;
+            ocl::split(d_src, d_dst);
+            WARMUP_OFF;
+
+            GPU_ON;
+            ocl::split(d_src, d_dst);
+            GPU_OFF;
+            // Full path = upload + kernel + download of every plane, as in the other tests' GPU_FULL sections.
+            GPU_FULL_ON;
+            d_src.upload(src);
+            ocl::split(d_src, d_dst);
+            for (size_t k = 0; k < d_dst.size() && k < dst.size(); ++k) d_dst[k].download(dst[k]);
+            GPU_FULL_OFF;
+        }
 
-        mat  = randomMat(rng, size, CV_MAKETYPE(type, channels), 5, 16, false);
-        dst1 = randomMat(rng, size, CV_MAKETYPE(type, 1), 5, 16, false);
-        dst2 = randomMat(rng, size, CV_MAKETYPE(type, 1), 5, 16, false);
-        dst3 = randomMat(rng, size, CV_MAKETYPE(type, 1), 5, 16, false);
-        dst4 = randomMat(rng, size, CV_MAKETYPE(type, 1), 5, 16, false);
-        //int devnums = getDevice(oclinfo);
-        //CV_Assert(devnums > 0);
-        ////if you want to use undefault device, set it here
-        ////setDevice(oclinfo[0]);
-        //setBinpath(CLBINPATH);
     }
-
-    void Has_roi(int b)
-    {
-        //cv::RNG& rng = TS::ptr()->get_rng();
-        if(b)
-        {
-            //randomize ROI
-            roicols =  mat.cols - 1; //start
-            roirows = mat.rows - 1;
-            srcx   = 1;
-            srcx   = 1;
-            dst1x    = 1;
-            dst1y    = 1;
-            dst2x    = 1;
-            dst2y    = 1;
-            dst3x    = 1;
-            dst3y    = 1;
-            dst4x    = 1;
-            dst4y    = 1;
-        }
-        else
-        {
-            roicols = mat.cols;
-            roirows = mat.rows;
-            srcx = 0;
-            srcy = 0;
-            dst1x = 0;
-            dst1y = 0;
-            dst2x    = 0;
-            dst2y    = 0;
-            dst3x    = 0;
-            dst3y    = 0;
-            dst4x    = 0;
-            dst4y    = 0;
-        };
-
-        mat_roi = mat(Rect(srcx, srcy, roicols, roirows));
-
-        dst1_roi = dst1(Rect(dst1x, dst1y, roicols, roirows));
-        dst2_roi = dst2(Rect(dst2x, dst2y, roicols, roirows));
-        dst3_roi = dst3(Rect(dst3x, dst3y, roicols, roirows));
-        dst4_roi = dst4(Rect(dst4x, dst4y, roicols, roirows));
-    }
-
-};
-
-struct Split : SplitTestBase {};
-
-TEST_P(Split, Accuracy)
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    double totalcputick = 0;
-    double totalgputick = 0;
-    double totalgputick_kernel = 0;
-    double t0 = 0;
-    double t1 = 0;
-    double t2 = 0;
-    for(int k = LOOPROISTART; k < LOOPROIEND; k++)
-    {
-        totalcputick = 0;
-        totalgputick = 0;
-        totalgputick_kernel = 0;
-        for(int j = 0; j < LOOP_TIMES + 1; j ++)
-        {
-            Has_roi(k);
-            cv::Mat         dev_dst[4]  = {dst1_roi, dst2_roi, dst3_roi, dst4_roi};
-            cv::ocl::oclMat dev_gdst[4] = {gdst1, gdst2, gdst3, gdst4};
-            t0 = (double)cvGetTickCount();//cpu start
-            cv::split(mat_roi, dev_dst);
-            t0 = (double)cvGetTickCount() - t0;//cpu end
-
-            t1 = (double)cvGetTickCount();//gpu start1
-            gdst1_whole = dst1;
-            gdst1 = gdst1_whole(Rect(dst1x, dst1y, roicols, roirows));
-
-            gdst2_whole = dst2;
-            gdst2 = gdst2_whole(Rect(dst2x, dst2y, roicols, roirows));
-
-            gdst3_whole = dst3;
-            gdst3 = gdst3_whole(Rect(dst3x, dst3y, roicols, roirows));
-
-            gdst4_whole = dst4;
-            gdst4 = gdst4_whole(Rect(dst4x, dst4y, roicols, roirows));
-
-            gmat = mat_roi;
-            t2 = (double)cvGetTickCount(); //kernel
-            cv::ocl::split(gmat, dev_gdst);
-            t2 = (double)cvGetTickCount() - t2;//kernel
-            cv::Mat cpu_dst1;
-            cv::Mat cpu_dst2;
-            cv::Mat cpu_dst3;
-            cv::Mat cpu_dst4;
-            gdst1_whole.download(cpu_dst1);
-            gdst2_whole.download(cpu_dst2);
-            gdst3_whole.download(cpu_dst3);
-            gdst4_whole.download(cpu_dst4);
-            t1 = (double)cvGetTickCount() - t1;//gpu end1
-            if(j == 0)
-                continue;
-            totalgputick = t1 + totalgputick;
-            totalcputick = t0 + totalcputick;
-            totalgputick_kernel = t2 + totalgputick_kernel;
-
-        }
-        if(k == 0)
-        {
-            cout << "no roi\n";
-        }
-        else
-        {
-            cout << "with roi\n";
-        };
-        cout << "average cpu runtime is  " << totalcputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime is  " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-        cout << "average gpu runtime without data transfer is  " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
-    }
-#else
-    for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
-    {
-        Has_roi(j);
-        //cv::Mat         dev_dst[4]  = {dst1_roi, dst2_roi, dst3_roi, dst4_roi};
-        cv::ocl::oclMat dev_gdst[4] = {gdst1, gdst2, gdst3, gdst4};
-        gdst1_whole = dst1;
-        gdst1 = gdst1_whole(Rect(dst1x, dst1y, roicols, roirows));
-
-        gdst2_whole = dst2;
-        gdst2 = gdst2_whole(Rect(dst2x, dst2y, roicols, roirows));
-
-        gdst3_whole = dst3;
-        gdst3 = gdst3_whole(Rect(dst3x, dst3y, roicols, roirows));
-
-        gdst4_whole = dst4;
-        gdst4 = gdst4_whole(Rect(dst4x, dst4y, roicols, roirows));
-        gmat = mat_roi;
-        if(j == 0)
-        {
-            cout << "no roi:";
-        }
-        else
-        {
-            cout << "\nwith roi:";
-        };
-        cv::ocl::split(gmat, dev_gdst);
-    };
-#endif
 }
-
-//*************test*****************
-INSTANTIATE_TEST_CASE_P(SplitMerge, Merge, Combine(
-                            Values(CV_8UC4, CV_32FC4), Values(1, 4)));
-
-INSTANTIATE_TEST_CASE_P(SplitMerge, Split , Combine(
-                            Values(CV_8U, CV_32S, CV_32F), Values(1, 4)));
-
-#endif // HAVE_OPENCL
diff --git a/modules/ocl/perf/precomp.cpp b/modules/ocl/perf/precomp.cpp
index 7d287004e..e35a07145 100644
--- a/modules/ocl/perf/precomp.cpp
+++ b/modules/ocl/perf/precomp.cpp
@@ -7,12 +7,13 @@
 //  copy or use the software.
 //
 //
-//                        Intel License Agreement
+//                           License Agreement
 //                For Open Source Computer Vision Library
 //
-// Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
-//
+
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
@@ -21,12 +22,12 @@
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
+//     and/or other materials provided with the distribution.
 //
-//   * The name of Intel Corporation may not be used to endorse or promote products
+//   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
-// This software is provided by the copyright holders and contributors "as is" and
+// This software is provided by the copyright holders and contributors as is and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
@@ -41,4 +42,321 @@
 
 #include "precomp.hpp"
 
+// This program tests most of the functions in the ocl module and generates a data matrix of x-factors in .csv files.
+// All images needed by these tests are in the samples/gpu folder.
+// For the haar template, haarcascade_frontalface_alt.xml should be in the working directory.
+// Drives the whole harness. In list mode it only prints the registered test names; otherwise it
+// runs the matching initializers, then the matching tests, and finally emits the summaries.
+void TestSystem::run()
+{
+    if (is_list_mode_)
+    {
+        for (vector<Runnable *>::iterator it = tests_.begin(); it != tests_.end(); ++it)
+        {
+            cout << (*it)->name() << endl;
+        }
+        return;
+    }
+
+    // Run test initializers whose name contains test_filter_ (an empty filter matches every name)
+    for (vector<Runnable *>::iterator it = inits_.begin(); it != inits_.end(); ++it)
+    {
+        if ((*it)->name().find(test_filter_, 0) != string::npos)
+        {
+            (*it)->run();
+        }
+    }
+
+    printHeading();
+    writeHeading();
+
+    // Run tests, applying the same substring filter as for the initializers
+    for (vector<Runnable *>::iterator it = tests_.begin(); it != tests_.end(); ++it)
+    {
+        try
+        {
+            if ((*it)->name().find(test_filter_, 0) != string::npos)
+            {
+                cout << endl << (*it)->name() << ":\n";
+
+                setCurrentTest((*it)->name());
+                //fprintf(record_,"%s\n",(*it)->name().c_str());
+                (*it)->run();
+                finishCurrentSubtest(); // report whatever subtest is still open when the test body returns
+            }
+        }
+        catch (const Exception &)
+        {
+            // Message is printed via callback (presumably cvErrorCallback - confirm where it is installed)
+            resetCurrentSubtest();
+        }
+        catch (const runtime_error &e)
+        {
+            printError(e.what());
+            resetCurrentSubtest();
+        }
+    }
+
+    printSummary();
+    writeSummary();
+}
+
+
+// Closes the current subtest: derives times and speedups, updates the tallies, reports one row, resets state.
+void TestSystem::finishCurrentSubtest()
+{
+    if (cur_subtest_is_empty_)
+    {
+        // Nothing was measured, so there are no statistics to print.
+        return;
+    }
+
+    double cpu_time = cpu_elapsed_ / getTickFrequency() * 1000.0;
+    double gpu_time = gpu_elapsed_ / getTickFrequency() * 1000.0;
+    double gpu_full_time = gpu_full_elapsed_ / getTickFrequency() * 1000.0;
+
+    // Speedups are ratios of the raw elapsed values; max(1.0, ...) keeps the divisor non-zero.
+    double speedup = static_cast<double>(cpu_elapsed_) / std::max(1.0, gpu_elapsed_);
+    speedup_total_ += speedup;
+    double fullspeedup = static_cast<double>(cpu_elapsed_) / std::max(1.0, gpu_full_elapsed_);
+    speedup_full_total_ += fullspeedup;
+
+    if (speedup > top_)
+    {
+        speedup_faster_count_++;
+    }
+    else if (speedup < bottom_)
+    {
+        speedup_slower_count_++;
+    }
+    else
+    {
+        speedup_equal_count_++;
+    }
+
+    if (fullspeedup > top_)
+    {
+        speedup_full_faster_count_++;
+    }
+    else if (fullspeedup < bottom_)
+    {
+        speedup_full_slower_count_++;
+    }
+    else
+    {
+        speedup_full_equal_count_++;
+    }
+
+    // Min, max and standard deviation of the GPU samples, in ms. front()/back() on an empty vector is undefined, hence the guard.
+    double gpu_min = 0, gpu_max = 0, deviation = 0;
+    if (!gpu_times_.empty())
+    {
+        std::sort(gpu_times_.begin(), gpu_times_.end());
+        gpu_min = gpu_times_.front() / getTickFrequency() * 1000.0;
+        gpu_max = gpu_times_.back() / getTickFrequency() * 1000.0;
+    }
+    if (gpu_times_.size() > 1)
+    {
+        double sum = 0;
+        for (size_t i = 0; i < gpu_times_.size(); i++)
+        {
+            int64 diff = gpu_times_[i] - static_cast<int64>(gpu_elapsed_); // deviation is taken around gpu_elapsed_
+            double diff_time = diff * 1000 / getTickFrequency();
+            sum += diff_time * diff_time;
+        }
+        deviation = std::sqrt(sum / gpu_times_.size());
+    }
+
+    printMetrics(cpu_time, gpu_time, gpu_full_time, speedup, fullspeedup);
+    writeMetrics(cpu_time, gpu_time, gpu_full_time, speedup, fullspeedup, gpu_min, gpu_max, deviation);
+    num_subtests_called_++;
+    resetCurrentSubtest();
+}
+
+
+double TestSystem::meanTime(const vector<int64> &samples)
+{
+    // Accumulate in double (the 0. seed fixes the type), then divide by the sample count.
+    return accumulate(samples.begin(), samples.end(), 0.) / samples.size();
+}
+
+
+// Prints the column header line of the console results table.
+void TestSystem::printHeading()
+{
+    cout << endl << setiosflags(ios_base::left);
+    cout << TAB << setw(10) << "CPU, ms" << setw(10) << "GPU, ms"
+         << setw(14) << "SPEEDUP" << setw(14) << "GPUTOTAL, ms" << setw(14) << "TOTALSPEEDUP"
+         << "DESCRIPTION\n";
+    // Clear the left-alignment flag again so it does not leak into later output.
+    cout << resetiosflags(ios_base::left);
+}
+
+// Opens the CSV record on first use and writes its column header row.
+void TestSystem::writeHeading()
+{
+    if (!record_)
+    {
+        recordname_ += "_OCL.csv";
+        record_ = fopen(recordname_.c_str(), "w");
+        if (!record_) return; // file could not be created: skip instead of passing NULL to fprintf
+    }
+    fprintf(record_, "NAME,DESCRIPTION,CPU (ms),GPU (ms),SPEEDUP,GPUTOTAL (ms),TOTALSPEEDUP,GPU Min (ms),GPU Max (ms), Standard deviation (ms)\n");
+    fflush(record_);
+}
+
+// Prints the averaged speedups and the exceeded/passed/failed counts and rates to stdout.
+void TestSystem::printSummary()
+{
+    // Shared divisor; clamped to 1 so nothing divides by zero when no subtest completed.
+    const int subtests = std::max(1, num_subtests_called_);
+
+    cout << setiosflags(ios_base::fixed);
+    cout << "\naverage GPU speedup: x"
+         << setprecision(3) << speedup_total_ / subtests
+         << endl;
+    cout << "\nGPU exceeded: "
+         << setprecision(3) << speedup_faster_count_
+         << "\nGPU passed: "
+         << setprecision(3) << speedup_equal_count_
+         << "\nGPU failed: "
+         << setprecision(3) << speedup_slower_count_
+         << endl;
+    cout << "\nGPU exceeded rate: "
+         << setprecision(3) << (float)speedup_faster_count_ / subtests * 100 << "%"
+         << "\nGPU passed rate: "
+         << setprecision(3) << (float)speedup_equal_count_ / subtests * 100 << "%"
+         << "\nGPU failed rate: "
+         << setprecision(3) << (float)speedup_slower_count_ / subtests * 100 << "%"
+         << endl;
+
+    cout << "\naverage GPUTOTAL speedup: x"
+         << setprecision(3) << speedup_full_total_ / subtests
+         << endl;
+    cout << "\nGPUTOTAL exceeded: "
+         << setprecision(3) << speedup_full_faster_count_
+         << "\nGPUTOTAL passed: "
+         << setprecision(3) << speedup_full_equal_count_
+         << "\nGPUTOTAL failed: "
+         << setprecision(3) << speedup_full_slower_count_
+         << endl;
+    cout << "\nGPUTOTAL exceeded rate: "
+         << setprecision(3) << (float)speedup_full_faster_count_ / subtests * 100 << "%"
+         << "\nGPUTOTAL passed rate: "
+         << setprecision(3) << (float)speedup_full_equal_count_ / subtests * 100 << "%"
+         << "\nGPUTOTAL failed rate: "
+         << setprecision(3) << (float)speedup_full_slower_count_ / subtests * 100 << "%"
+         << endl;
+
+    cout << resetiosflags(ios_base::fixed);
+}
+
+
+// Prints one console row for a subtest: CPU ms, GPU ms, speedup, GPU total ms, total speedup, description.
+void TestSystem::printMetrics(double cpu_time, double gpu_time, double gpu_full_time, double speedup, double fullspeedup)
+{
+    cout << TAB << setiosflags(ios_base::left);
+    // setprecision() is sticky, so times and speedups are formatted in separate streams; sharing one
+    // stream made the GPUTOTAL time inherit the speedups' 3-digit precision (e.g. 1.23e+03).
+    stringstream time_str, speedup_str;
+    speedup_str << setprecision(3);
+
+    time_str << cpu_time;
+    cout << setw(10) << time_str.str();
+    time_str.str("");
+    time_str << gpu_time;
+    cout << setw(10) << time_str.str();
+    speedup_str << "x" << speedup;
+    cout << setw(14) << speedup_str.str();
+
+    time_str.str("");
+    time_str << gpu_full_time;
+    cout << setw(14) << time_str.str();
+    speedup_str.str("");
+    speedup_str << "x" << fullspeedup;
+    cout << setw(14) << speedup_str.str();
+
+    cout << cur_subtest_description_.str();
+    cout << resetiosflags(ios_base::left) << endl;
+}
+
+// Appends one CSV row with the subtest's timings, speedups and GPU min/max/standard deviation.
+void TestSystem::writeMetrics(double cpu_time, double gpu_time, double gpu_full_time, double speedup, double fullspeedup, double gpu_min, double gpu_max, double std_dev)
+{
+    if (!record_)
+    {
+        recordname_ += ".csv";
+        record_ = fopen(recordname_.c_str(), "w");
+        if (!record_) return; // file could not be opened: skip instead of passing NULL to fprintf
+    }
+
+    fprintf(record_, "%s,%s,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f\n", itname_changed_ ? itname_.c_str() : "",
+            cur_subtest_description_.str().c_str(),
+            cpu_time, gpu_time, speedup, gpu_full_time, fullspeedup,
+            gpu_min, gpu_max, std_dev);
+
+    // The NAME column is filled only while itname_changed_ is set; clear it so later rows leave it empty.
+    itname_changed_ = false;
+
+    fflush(record_);
+}
+
+// Appends the averaged speedups and the exceeded/passed/failed counts and rates to the CSV record.
+void TestSystem::writeSummary()
+{
+    if (!record_)
+    {
+        recordname_ += ".csv";
+        record_ = fopen(recordname_.c_str(), "w");
+        if (!record_) return; // file could not be opened: skip instead of passing NULL to fprintf
+    }
+    fprintf(record_, "\nAverage GPU speedup: %.3f\n"
+            "exceeded: %d (%.3f%%)\n"
+            "passed: %d (%.3f%%)\n"
+            "failed: %d (%.3f%%)\n"
+            "\nAverage GPUTOTAL speedup: %.3f\n"
+            "exceeded: %d (%.3f%%)\n"
+            "passed: %d (%.3f%%)\n"
+            "failed: %d (%.3f%%)\n",
+            speedup_total_ / std::max(1, num_subtests_called_),
+            speedup_faster_count_, (float)speedup_faster_count_ / std::max(1, num_subtests_called_) * 100,
+            speedup_equal_count_, (float)speedup_equal_count_ / std::max(1, num_subtests_called_) * 100,
+            speedup_slower_count_, (float)speedup_slower_count_ / std::max(1, num_subtests_called_) * 100,
+            speedup_full_total_ / std::max(1, num_subtests_called_),
+            speedup_full_faster_count_, (float)speedup_full_faster_count_ / std::max(1, num_subtests_called_) * 100,
+            speedup_full_equal_count_, (float)speedup_full_equal_count_ / std::max(1, num_subtests_called_) * 100,
+            speedup_full_slower_count_, (float)speedup_full_slower_count_ / std::max(1, num_subtests_called_) * 100);
+    fflush(record_);
+}
+
+void TestSystem::printError(const std::string &msg)
+{
+    // Messages equal to "CL_INVALID_BUFFER_SIZE" are suppressed; everything else is echoed with the subtest description.
+    if (msg == "CL_INVALID_BUFFER_SIZE")
+        return;
+    cout << TAB << "[error: " << msg << "] " << cur_subtest_description_.str() << endl;
+}
+
+void gen(Mat &mat, int rows, int cols, int type, Scalar low, Scalar high)
+{   // Allocates mat as rows x cols of the given type and fills it with uniformly distributed random values between low and high.
+    mat.create(rows, cols, type);
+    RNG rng(0); // constructed with the same seed on every call, so the generated data is repeatable
+    rng.fill(mat, RNG::UNIFORM, low, high);
+}
+
+
+string abspath(const string &relpath)
+{   // Prefixes relpath with the harness's working directory by plain concatenation; no separator is inserted here.
+    return TestSystem::instance().workingDir() + relpath;
+}
+
+
+int CV_CDECL cvErrorCallback(int /*status*/, const char * /*func_name*/,
+                             const char *err_msg, const char * /*file_name*/,
+                             int /*line*/, void * /*userdata*/)
+{   // Forwards only the error message to TestSystem::printError; all other arguments are ignored.
+    TestSystem::instance().printError(err_msg); // NOTE(review): presumably installed via cv::redirectError - confirm
+    return 0; // always returns 0
+}
+
 
diff --git a/modules/ocl/perf/precomp.hpp b/modules/ocl/perf/precomp.hpp
index 34eea555f..819ac5925 100644
--- a/modules/ocl/perf/precomp.hpp
+++ b/modules/ocl/perf/precomp.hpp
@@ -7,12 +7,13 @@
 //  copy or use the software.
 //
 //
-//                        Intel License Agreement
+//                           License Agreement
 //                For Open Source Computer Vision Library
 //
-// Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
-//
+
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
@@ -21,12 +22,12 @@
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
+//     and/or other materials provided with the distribution.
 //
-//   * The name of Intel Corporation may not be used to endorse or promote products
+//   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
-// This software is provided by the copyright holders and contributors "as is" and
+// This software is provided by the copyright holders and contributors as is and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
@@ -39,43 +40,352 @@
 //
 //M*/
 
-#ifdef __GNUC__
-#  pragma GCC diagnostic ignored "-Wmissing-declarations"
-#  if defined __clang__ || defined __APPLE__
-#    pragma GCC diagnostic ignored "-Wmissing-prototypes"
-#    pragma GCC diagnostic ignored "-Wextra"
-#  endif
-#endif
-
-#ifndef __OPENCV_TEST_PRECOMP_HPP__
-#define __OPENCV_TEST_PRECOMP_HPP__
-
-#include <cmath>
-#include <cstdio>
+#include <iomanip>
+#include <stdexcept>
+#include <string>
 #include <iostream>
-#include <fstream>
-#include <sstream>
-#include <string>
-#include <limits>
-#include <algorithm>
-#include <iterator>
-#include <string>
-#include <cstdarg>
-#include "cvconfig.h"
+#include <cstdio>
+#include <vector>
+#include <numeric>
 #include "opencv2/core/core.hpp"
-#include "opencv2/highgui/highgui.hpp"
-//#include "opencv2/calib3d/calib3d.hpp"
 #include "opencv2/imgproc/imgproc.hpp"
+#include "opencv2/highgui/highgui.hpp"
 #include "opencv2/video/video.hpp"
-#include "opencv2/ts/ts.hpp"
-#include "opencv2/ts/ts_perf.hpp"
+#include "opencv2/objdetect/objdetect.hpp"
+#include "opencv2/features2d/features2d.hpp"
 #include "opencv2/ocl/ocl.hpp"
-//#include "opencv2/nonfree/nonfree.hpp"
 
-#include "utility.hpp"
-#include "interpolation.hpp"
-//#include "add_test_info.h"
-//#define  PERF_TEST_OCL 1
+#define Min_Size 1000  // NOTE(review): presumably the smallest matrix dimension swept by the perf tests - confirm in perf_*.cpp
+#define Max_Size 4000  // NOTE(review): presumably the largest matrix dimension of that sweep - confirm in perf_*.cpp
+#define Multiple 2  // NOTE(review): presumably the factor applied between Min_Size and Max_Size - confirm in perf_*.cpp
+#define TAB "    "  // four-space indentation string
 
-#endif
+using namespace std;
+using namespace cv;
 
+void gen(Mat &mat, int rows, int cols, int type, Scalar low, Scalar high);  // declared only; presumably fills a rows x cols matrix of `type` with values between low and high - TODO confirm against the definition
+string abspath(const string &relpath);  // declared only; presumably resolves relpath against the TestSystem working directory - TODO confirm
+int CV_CDECL cvErrorCallback(int, const char *, const char *, const char *, int, void *);  // error hook; the definition in precomp.cpp hands the message to TestSystem::printError and returns 0
+typedef struct
+{
+    short x;
+    short y;
+} COOR;  // pair of 16-bit coordinates; the return type of do_meanShift
+COOR do_meanShift(int x0, int y0, uchar *sptr, uchar *dptr, int sstep,
+                  cv::Size size, int sp, int sr, int maxIter, float eps, int *tab);  // declared only; presumably a CPU reference mean-shift step - TODO confirm
+void meanShiftProc_(const Mat &src_roi, Mat &dst_roi, Mat &dstCoor_roi,
+                    int sp, int sr, cv::TermCriteria crit);  // declared only; presumably the CPU reference that writes both filtered pixels and coordinates - TODO confirm
+
+class Runnable  // abstract, named unit of work; subclasses supply run()
+{
+public:
+    explicit Runnable(const std::string &runname): name_(runname) {}  // keeps its own copy of the label
+    virtual ~Runnable() {}  // virtual, so a derived object may be destroyed through a Runnable*
+
+    const std::string &name() const  // label passed to the constructor
+    {
+        return name_;
+    }
+
+    virtual void run() = 0;  // the work itself; must be overridden
+
+private:
+    std::string name_;
+};
+
+class TestSystem  // process-wide registry of init/test Runnables plus per-subtest CPU/GPU timing state
+{
+public:
+    static TestSystem &instance()  // the one shared object, created on first call
+    {
+        static TestSystem me;
+        return me;
+    }
+
+    void setWorkingDir(const std::string &val)
+    {
+        working_dir_ = val;
+    }
+    const std::string &workingDir() const
+    {
+        return working_dir_;
+    }
+
+    void setTestFilter(const std::string &val)
+    {
+        test_filter_ = val;
+    }
+    const std::string &testFilter() const
+    {
+        return test_filter_;
+    }
+
+    void setNumIters(int num_iters)  // iteration cap tested by stop()
+    {
+        num_iters_ = num_iters;
+    }
+    void setGPUWarmupIters(int num_iters)  // pass count tested by warmupStop()
+    {
+        gpu_warmup_iters_ = num_iters;
+    }
+    void setCPUIters(int num_iters)  // iteration cap tested by cpu_stop()
+    {
+        cpu_num_iters_ = num_iters;
+    }
+
+    void setTopThreshold(double top)
+    {
+        top_ = top;
+    }
+    void setBottomThreshold(double bottom)
+    {
+        bottom_ = bottom;
+    }
+
+    void addInit(Runnable *init)  // stores the raw pointer only; nothing in this class deletes it
+    {
+        inits_.push_back(init);
+    }
+    void addTest(Runnable *test)  // stores the raw pointer only; nothing in this class deletes it
+    {
+        tests_.push_back(test);
+    }
+    void run();
+
+    // It's public because OpenCV callback uses it
+    void printError(const std::string &msg);
+
+    std::stringstream &startNewSubtest()  // closes the previous subtest, then hands out the description stream
+    {
+        finishCurrentSubtest();
+        return cur_subtest_description_;
+    }
+
+    bool stop() const  // true once num_iters_ timed passes have been recorded
+    {
+        return cur_iter_idx_ >= num_iters_;
+    }
+
+    bool cpu_stop() const  // true once cpu_num_iters_ timed passes have been recorded
+    {
+        return cur_iter_idx_ >= cpu_num_iters_;
+    }
+
+    bool warmupStop()  // not const: every call consumes one warm-up pass
+    {
+        return cur_warmup_idx_++ >= gpu_warmup_iters_;
+    }
+
+    void warmupComplete()  // rewinds the warm-up counter for the next warm-up loop
+    {
+        cur_warmup_idx_ = 0;
+    }
+
+    void cpuOn()  // timestamp the start of one CPU pass
+    {
+        cpu_started_ = cv::getTickCount();
+    }
+    void cpuOff()  // record the ticks elapsed since cpuOn() and count the pass
+    {
+        int64 delta = cv::getTickCount() - cpu_started_;
+        cpu_times_.push_back(delta);
+        ++cur_iter_idx_;
+    }
+    void cpuComplete()  // fold meanTime() of the CPU samples into cpu_elapsed_; samples stay until resetCurrentSubtest()
+    {
+        cpu_elapsed_ += meanTime(cpu_times_);
+        cur_subtest_is_empty_ = false;
+        cur_iter_idx_ = 0;
+    }
+
+    void gpuOn()  // timestamp the start of one GPU pass
+    {
+        gpu_started_ = cv::getTickCount();
+    }
+    void gpuOff()  // record the ticks elapsed since gpuOn() and count the pass
+    {
+        int64 delta = cv::getTickCount() - gpu_started_;
+        gpu_times_.push_back(delta);
+        ++cur_iter_idx_;
+    }
+    void gpuComplete()  // fold meanTime() of the GPU samples into gpu_elapsed_
+    {
+        gpu_elapsed_ += meanTime(gpu_times_);
+        cur_subtest_is_empty_ = false;
+        cur_iter_idx_ = 0;
+    }
+
+    void gpufullOn()  // timestamp the start of one "GPU full" pass
+    {
+        gpu_full_started_ = cv::getTickCount();
+    }
+    void gpufullOff()  // record the ticks elapsed since gpufullOn() and count the pass
+    {
+        int64 delta = cv::getTickCount() - gpu_full_started_;
+        gpu_full_times_.push_back(delta);
+        ++cur_iter_idx_;
+    }
+    void gpufullComplete()  // fold meanTime() of the "GPU full" samples into gpu_full_elapsed_
+    {
+        gpu_full_elapsed_ += meanTime(gpu_full_times_);
+        cur_subtest_is_empty_ = false;
+        cur_iter_idx_ = 0;
+    }
+
+    bool isListMode() const
+    {
+        return is_list_mode_;
+    }
+    void setListMode(bool value)
+    {
+        is_list_mode_ = value;
+    }
+
+    void setRecordName(const std::string &name)
+    {
+        recordname_ = name;
+    }
+
+    void setCurrentTest(const std::string &name)  // also raises itname_changed_
+    {
+        itname_ = name;
+        itname_changed_ = true;
+    }
+
+private:
+    TestSystem():  // private: instance() is the only way in. Every scalar member now starts defined, in declaration order.
+        cur_subtest_is_empty_(true), cpu_started_(0), gpu_started_(0), gpu_full_started_(0),
+        cpu_elapsed_(0), gpu_elapsed_(0), gpu_full_elapsed_(0),
+        speedup_total_(0.0), speedup_full_total_(0.0), num_subtests_called_(0),
+        speedup_faster_count_(0), speedup_slower_count_(0), speedup_equal_count_(0),
+        speedup_full_faster_count_(0), speedup_full_slower_count_(0), speedup_full_equal_count_(0),
+        is_list_mode_(false), top_(0.0), bottom_(0.0), num_iters_(10), cpu_num_iters_(2),
+        gpu_warmup_iters_(1), cur_iter_idx_(0), cur_warmup_idx_(0),
+        record_(0), recordname_("performance"), itname_changed_(true)
+    {
+        cpu_times_.reserve(num_iters_);
+        gpu_times_.reserve(num_iters_);
+        gpu_full_times_.reserve(num_iters_);
+    }
+
+    void finishCurrentSubtest();
+    void resetCurrentSubtest()  // clears elapsed totals, description text, iteration index and all three sample vectors
+    {
+        cpu_elapsed_ = 0;
+        gpu_elapsed_ = 0;
+        gpu_full_elapsed_ = 0;
+        cur_subtest_description_.str("");
+        cur_subtest_is_empty_ = true;
+        cur_iter_idx_ = 0;
+        cpu_times_.clear();
+        gpu_times_.clear();
+        gpu_full_times_.clear();
+    }
+
+    double meanTime(const std::vector<int64> &samples);
+
+    void printHeading();
+    void printSummary();
+    void printMetrics(double cpu_time, double gpu_time = 0.0f, double gpu_full_time = 0.0f, double speedup = 0.0f, double fullspeedup = 0.0f);
+
+    void writeHeading();
+    void writeSummary();
+    void writeMetrics(double cpu_time, double gpu_time = 0.0f, double gpu_full_time = 0.0f,
+                      double speedup = 0.0f, double fullspeedup = 0.0f,
+                      double gpu_min = 0.0f, double gpu_max = 0.0f, double std_dev = 0.0f);
+
+    std::string working_dir_;
+    std::string test_filter_;
+
+    std::vector<Runnable *> inits_;  // filled by addInit()
+    std::vector<Runnable *> tests_;  // filled by addTest()
+
+    std::stringstream cur_subtest_description_;
+    bool cur_subtest_is_empty_;  // false as soon as any *Complete() ran for the current subtest
+
+    int64 cpu_started_;       // tick count captured by cpuOn()
+    int64 gpu_started_;       // tick count captured by gpuOn()
+    int64 gpu_full_started_;  // tick count captured by gpufullOn()
+    double cpu_elapsed_;
+    double gpu_elapsed_;
+    double gpu_full_elapsed_;
+
+    double speedup_total_;
+    double speedup_full_total_;  // starts at 0.0 like speedup_total_ (was left uninitialised)
+    int num_subtests_called_;
+
+    int speedup_faster_count_;
+    int speedup_slower_count_;
+    int speedup_equal_count_;
+
+    int speedup_full_faster_count_;
+    int speedup_full_slower_count_;
+    int speedup_full_equal_count_;
+
+    bool is_list_mode_;
+
+    double top_;     // 0.0 until setTopThreshold(); NOTE(review): its use is outside this header - confirm callers always set it
+    double bottom_;  // 0.0 until setBottomThreshold(); NOTE(review): its use is outside this header - confirm callers always set it
+
+    int num_iters_;         // iteration cap tested by stop(); 10 by default
+    int cpu_num_iters_;     // iteration cap tested by cpu_stop(); 2 by default, independent of num_iters_
+    int gpu_warmup_iters_;  // warm-up passes allowed by warmupStop(); 1 by default
+    int cur_iter_idx_;      // shared by the CPU, GPU and GPU-full loops; rewound by every *Complete()
+    int cur_warmup_idx_;    // warm-up passes consumed so far
+    std::vector<int64> cpu_times_;
+    std::vector<int64> gpu_times_;
+    std::vector<int64> gpu_full_times_;
+
+    FILE *record_;
+    std::string recordname_;
+    std::string itname_;
+    bool itname_changed_;
+};
+
+// GLOBAL_INIT(name): declares struct name##_init (a Runnable whose constructor calls addInit) plus one instance, and ends on the header of its run() - follow it with a { ... } body.
+#define GLOBAL_INIT(name) \
+struct name##_init: Runnable { \
+	name##_init(): Runnable(#name) { \
+	TestSystem::instance().addInit(this); \
+} \
+	void run(); \
+} name##_init_instance; \
+	void name##_init::run()
+
+// TEST(name): same shape as GLOBAL_INIT, but the instance registers through addTest; follow it with the { ... } body of run().
+#define TEST(name) \
+struct name##_test: Runnable { \
+	name##_test(): Runnable(#name) { \
+	TestSystem::instance().addTest(this); \
+} \
+	void run(); \
+} name##_test_instance; \
+	void name##_test::run()
+// SUBTEST: finishes the previous subtest and evaluates to the description stream of the new one.
+#define SUBTEST TestSystem::instance().startNewSubtest()
+// CPU_ON ... CPU_OFF: a pair. CPU_ON opens a while-loop (until cpu_stop()) that CPU_OFF closes; each pass is timed by cpuOn()/cpuOff().
+#define CPU_ON \
+	while (!TestSystem::instance().cpu_stop()) { \
+	TestSystem::instance().cpuOn()
+#define CPU_OFF \
+	TestSystem::instance().cpuOff(); \
+	} TestSystem::instance().cpuComplete()
+// GPU_ON ... GPU_OFF: a pair. Same loop shape, bounded by stop() and timed by gpuOn()/gpuOff().
+#define GPU_ON \
+	while (!TestSystem::instance().stop()) { \
+	TestSystem::instance().gpuOn()
+#define GPU_OFF \
+	TestSystem::instance().gpuOff(); \
+	} TestSystem::instance().gpuComplete()
+// GPU_FULL_ON ... GPU_FULL_OFF: a pair. Bounded by stop() and timed by gpufullOn()/gpufullOff().
+#define GPU_FULL_ON \
+	while (!TestSystem::instance().stop()) { \
+	TestSystem::instance().gpufullOn()
+#define GPU_FULL_OFF \
+	TestSystem::instance().gpufullOff(); \
+	} TestSystem::instance().gpufullComplete()
+// WARMUP_ON ... WARMUP_OFF: a pair. Untimed loop that runs until warmupStop() returns true, then resets the warm-up counter.
+#define WARMUP_ON \
+	while (!TestSystem::instance().warmupStop()) {
+#define WARMUP_OFF \
+	} TestSystem::instance().warmupComplete()
diff --git a/modules/ocl/perf/utility.cpp b/modules/ocl/perf/utility.cpp
deleted file mode 100644
index b7fbe4fa0..000000000
--- a/modules/ocl/perf/utility.cpp
+++ /dev/null
@@ -1,265 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                        Intel License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000, Intel Corporation, all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of Intel Corporation may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-#define VARNAME(A) #A
-using namespace std;
-using namespace cv;
-using namespace cv::gpu;
-using namespace cvtest;
-
-
-//std::string generateVarList(int first,...)
-//{
-//	vector<std::string> varname;
-//
-//	va_list argp;
-//	string s;
-//	stringstream ss;
-//	va_start(argp,first);
-//	int i=first;
-//	while(i!=-1)
-//	{
-//		ss<<i<<",";
-//		i=va_arg(argp,int);
-//	};
-//	s=ss.str();
-//	va_end(argp);
-//	return s;
-//};
-
-//std::string generateVarList(int& p1,int& p2)
-//{
-//	stringstream ss;
-//	ss<<VARNAME(p1)<<":"<<src1x<<","<<VARNAME(p2)<<":"<<src1y;
-//	return ss.str();
-//};
-
-int randomInt(int minVal, int maxVal)
-{
-    RNG &rng = TS::ptr()->get_rng();
-    return rng.uniform(minVal, maxVal);
-}
-
-double randomDouble(double minVal, double maxVal)
-{
-    RNG &rng = TS::ptr()->get_rng();
-    return rng.uniform(minVal, maxVal);
-}
-
-Size randomSize(int minVal, int maxVal)
-{
-    return cv::Size(randomInt(minVal, maxVal), randomInt(minVal, maxVal));
-}
-
-Scalar randomScalar(double minVal, double maxVal)
-{
-    return Scalar(randomDouble(minVal, maxVal), randomDouble(minVal, maxVal), randomDouble(minVal, maxVal), randomDouble(minVal, maxVal));
-}
-
-Mat randomMat(Size size, int type, double minVal, double maxVal)
-{
-    return randomMat(TS::ptr()->get_rng(), size, type, minVal, maxVal, false);
-}
-
-
-
-
-
-
-
-/*
-void showDiff(InputArray gold_, InputArray actual_, double eps)
-{
-    Mat gold;
-    if (gold_.kind() == _InputArray::MAT)
-        gold = gold_.getMat();
-    else
-        gold_.getGpuMat().download(gold);
-
-    Mat actual;
-    if (actual_.kind() == _InputArray::MAT)
-        actual = actual_.getMat();
-    else
-        actual_.getGpuMat().download(actual);
-
-    Mat diff;
-    absdiff(gold, actual, diff);
-    threshold(diff, diff, eps, 255.0, cv::THRESH_BINARY);
-
-    namedWindow("gold", WINDOW_NORMAL);
-    namedWindow("actual", WINDOW_NORMAL);
-    namedWindow("diff", WINDOW_NORMAL);
-
-    imshow("gold", gold);
-    imshow("actual", actual);
-    imshow("diff", diff);
-
-    waitKey();
-}
-*/
-
-/*
-bool supportFeature(const DeviceInfo& info, FeatureSet feature)
-{
-    return TargetArchs::builtWith(feature) && info.supports(feature);
-}
-
-const vector<DeviceInfo>& devices()
-{
-    static vector<DeviceInfo> devs;
-    static bool first = true;
-
-    if (first)
-    {
-        int deviceCount = getCudaEnabledDeviceCount();
-
-        devs.reserve(deviceCount);
-
-        for (int i = 0; i < deviceCount; ++i)
-        {
-            DeviceInfo info(i);
-            if (info.isCompatible())
-                devs.push_back(info);
-        }
-
-        first = false;
-    }
-
-    return devs;
-}
-
-vector<DeviceInfo> devices(FeatureSet feature)
-{
-    const vector<DeviceInfo>& d = devices();
-
-    vector<DeviceInfo> devs_filtered;
-
-    if (TargetArchs::builtWith(feature))
-    {
-        devs_filtered.reserve(d.size());
-
-        for (size_t i = 0, size = d.size(); i < size; ++i)
-        {
-            const DeviceInfo& info = d[i];
-
-            if (info.supports(feature))
-                devs_filtered.push_back(info);
-        }
-    }
-
-    return devs_filtered;
-}
-*/
-
-vector<MatType> types(int depth_start, int depth_end, int cn_start, int cn_end)
-{
-    vector<MatType> v;
-
-    v.reserve((depth_end - depth_start + 1) * (cn_end - cn_start + 1));
-
-    for (int depth = depth_start; depth <= depth_end; ++depth)
-    {
-        for (int cn = cn_start; cn <= cn_end; ++cn)
-        {
-            v.push_back(CV_MAKETYPE(depth, cn));
-        }
-    }
-
-    return v;
-}
-
-const vector<MatType> &all_types()
-{
-    static vector<MatType> v = types(CV_8U, CV_64F, 1, 4);
-
-    return v;
-}
-
-Mat readImage(const string &fileName, int flags)
-{
-    return imread(string(cvtest::TS::ptr()->get_data_path()) + fileName, flags);
-}
-
-Mat readImageType(const string &fname, int type)
-{
-    Mat src = readImage(fname, CV_MAT_CN(type) == 1 ? IMREAD_GRAYSCALE : IMREAD_COLOR);
-    if (CV_MAT_CN(type) == 4)
-    {
-        Mat temp;
-        cvtColor(src, temp, cv::COLOR_BGR2BGRA);
-        swap(src, temp);
-    }
-    src.convertTo(src, CV_MAT_DEPTH(type));
-    return src;
-}
-
-double checkNorm(const Mat &m)
-{
-    return norm(m, NORM_INF);
-}
-
-double checkNorm(const Mat &m1, const Mat &m2)
-{
-    return norm(m1, m2, NORM_INF);
-}
-
-double checkSimilarity(const Mat &m1, const Mat &m2)
-{
-    Mat diff;
-    matchTemplate(m1, m2, diff, CV_TM_CCORR_NORMED);
-    return std::abs(diff.at<float>(0, 0) - 1.f);
-}
-
-/*
-void cv::ocl::PrintTo(const DeviceInfo& info, ostream* os)
-{
-    (*os) << info.name();
-}
-*/
-
-void PrintTo(const Inverse &inverse, std::ostream *os)
-{
-    if (inverse)
-        (*os) << "inverse";
-    else
-        (*os) << "direct";
-}
diff --git a/modules/ocl/perf/utility.hpp b/modules/ocl/perf/utility.hpp
deleted file mode 100644
index 7d34b6731..000000000
--- a/modules/ocl/perf/utility.hpp
+++ /dev/null
@@ -1,182 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                        Intel License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000, Intel Corporation, all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of Intel Corporation may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifndef __OPENCV_TEST_UTILITY_HPP__
-#define __OPENCV_TEST_UTILITY_HPP__
-//#define PRINT_KERNEL_RUN_TIME
-#ifdef PRINT_KERNEL_RUN_TIME
-#define LOOP_TIMES 1
-#else
-#define LOOP_TIMES 1
-#endif
-#define MWIDTH 1920
-#define MHEIGHT 1080
-#define CLBINPATH ".\\"
-#define LOOPROISTART 0
-#define LOOPROIEND 1
-int randomInt(int minVal, int maxVal);
-double randomDouble(double minVal, double maxVal);
-
-//std::string generateVarList(int first,...);
-std::string generateVarList(int &p1, int &p2);
-cv::Size randomSize(int minVal, int maxVal);
-cv::Scalar randomScalar(double minVal, double maxVal);
-cv::Mat randomMat(cv::Size size, int type, double minVal = 0.0, double maxVal = 255.0);
-
-void showDiff(cv::InputArray gold, cv::InputArray actual, double eps);
-
-//! return true if device supports specified feature and gpu module was built with support the feature.
-//bool supportFeature(const cv::gpu::DeviceInfo& info, cv::gpu::FeatureSet feature);
-
-//! return all devices compatible with current gpu module build.
-//const std::vector<cv::ocl::DeviceInfo>& devices();
-//! return all devices compatible with current gpu module build which support specified feature.
-//std::vector<cv::ocl::DeviceInfo> devices(cv::gpu::FeatureSet feature);
-
-//! read image from testdata folder.
-cv::Mat readImage(const std::string &fileName, int flags = cv::IMREAD_COLOR);
-cv::Mat readImageType(const std::string &fname, int type);
-
-double checkNorm(const cv::Mat &m);
-double checkNorm(const cv::Mat &m1, const cv::Mat &m2);
-double checkSimilarity(const cv::Mat &m1, const cv::Mat &m2);
-
-#define EXPECT_MAT_NORM(mat, eps) \
-{ \
-    EXPECT_LE(checkNorm(cv::Mat(mat)), eps) \
-}
-
-/*#define EXPECT_MAT_NEAR(mat1, mat2, eps) \
-{ \
-   ASSERT_EQ(mat1.type(), mat2.type()); \
-   ASSERT_EQ(mat1.size(), mat2.size()); \
-   EXPECT_LE(checkNorm(cv::Mat(mat1), cv::Mat(mat2)), eps); \
-}*/
-
-#define EXPECT_MAT_NEAR(mat1, mat2, eps,s) \
-{ \
-    ASSERT_EQ(mat1.type(), mat2.type()); \
-    ASSERT_EQ(mat1.size(), mat2.size()); \
-    EXPECT_LE(checkNorm(cv::Mat(mat1), cv::Mat(mat2)), eps)<<s; \
-}
-
-#define EXPECT_MAT_SIMILAR(mat1, mat2, eps) \
-{ \
-    ASSERT_EQ(mat1.type(), mat2.type()); \
-    ASSERT_EQ(mat1.size(), mat2.size()); \
-    EXPECT_LE(checkSimilarity(cv::Mat(mat1), cv::Mat(mat2)), eps); \
-}
-
-namespace cv
-{
-    namespace ocl
-    {
-        // void PrintTo(const DeviceInfo& info, std::ostream* os);
-    }
-}
-
-using perf::MatDepth;
-using perf::MatType;
-
-//! return vector with types from specified range.
-std::vector<MatType> types(int depth_start, int depth_end, int cn_start, int cn_end);
-
-//! return vector with all types (depth: CV_8U-CV_64F, channels: 1-4).
-const std::vector<MatType> &all_types();
-
-class Inverse
-{
-public:
-    inline Inverse(bool val = false) : val_(val) {}
-
-    inline operator bool() const
-    {
-        return val_;
-    }
-
-private:
-    bool val_;
-};
-
-void PrintTo(const Inverse &useRoi, std::ostream *os);
-
-CV_ENUM(CmpCode, cv::CMP_EQ, cv::CMP_GT, cv::CMP_GE, cv::CMP_LT, cv::CMP_LE, cv::CMP_NE)
-
-CV_ENUM(NormCode, cv::NORM_INF, cv::NORM_L1, cv::NORM_L2, cv::NORM_TYPE_MASK, cv::NORM_RELATIVE, cv::NORM_MINMAX)
-
-enum {FLIP_BOTH = 0, FLIP_X = 1, FLIP_Y = -1};
-CV_ENUM(FlipCode, FLIP_BOTH, FLIP_X, FLIP_Y)
-
-CV_ENUM(ReduceOp, CV_REDUCE_SUM, CV_REDUCE_AVG, CV_REDUCE_MAX, CV_REDUCE_MIN)
-
-CV_FLAGS(GemmFlags, cv::GEMM_1_T, cv::GEMM_2_T, cv::GEMM_3_T);
-
-CV_ENUM(MorphOp, cv::MORPH_OPEN, cv::MORPH_CLOSE, cv::MORPH_GRADIENT, cv::MORPH_TOPHAT, cv::MORPH_BLACKHAT)
-
-CV_ENUM(ThreshOp, cv::THRESH_BINARY, cv::THRESH_BINARY_INV, cv::THRESH_TRUNC, cv::THRESH_TOZERO, cv::THRESH_TOZERO_INV)
-
-CV_ENUM(Interpolation, cv::INTER_NEAREST, cv::INTER_LINEAR, cv::INTER_CUBIC)
-
-CV_ENUM(Border, cv::BORDER_REFLECT101, cv::BORDER_REPLICATE, cv::BORDER_CONSTANT, cv::BORDER_REFLECT, cv::BORDER_WRAP)
-
-CV_FLAGS(WarpFlags, cv::INTER_NEAREST, cv::INTER_LINEAR, cv::INTER_CUBIC, cv::WARP_INVERSE_MAP)
-
-CV_ENUM(TemplateMethod, cv::TM_SQDIFF, cv::TM_SQDIFF_NORMED, cv::TM_CCORR, cv::TM_CCORR_NORMED, cv::TM_CCOEFF, cv::TM_CCOEFF_NORMED)
-
-CV_FLAGS(DftFlags, cv::DFT_INVERSE, cv::DFT_SCALE, cv::DFT_ROWS, cv::DFT_COMPLEX_OUTPUT, cv::DFT_REAL_OUTPUT)
-
-void  run_perf_test();
-
-#define PARAM_TEST_CASE(name, ...) struct name : testing::TestWithParam< std::tr1::tuple< __VA_ARGS__ > >
-
-#define GET_PARAM(k) std::tr1::get< k >(GetParam())
-
-#define ALL_DEVICES testing::ValuesIn(devices())
-#define DEVICES(feature) testing::ValuesIn(devices(feature))
-
-#define ALL_TYPES testing::ValuesIn(all_types())
-#define TYPES(depth_start, depth_end, cn_start, cn_end) testing::ValuesIn(types(depth_start, depth_end, cn_start, cn_end))
-
-#define DIFFERENT_SIZES testing::Values(cv::Size(128, 128), cv::Size(113, 113))
-
-#define DIRECT_INVERSE testing::Values(Inverse(false), Inverse(true))
-
-#endif // __OPENCV_TEST_UTILITY_HPP__
diff --git a/samples/ocl/performance.cpp b/samples/ocl/performance.cpp
deleted file mode 100644
index 695516f14..000000000
--- a/samples/ocl/performance.cpp
+++ /dev/null
@@ -1,4397 +0,0 @@
-#include <iomanip>
-#include <stdexcept>
-#include <string>
-#include <iostream>
-#include <cstdio>
-#include <vector>
-#include <numeric>
-#include "opencv2/core/core.hpp"
-#include "opencv2/imgproc/imgproc.hpp"
-#include "opencv2/highgui/highgui.hpp"
-#include "opencv2/calib3d/calib3d.hpp"
-#include "opencv2/video/video.hpp"
-#include "opencv2/nonfree/nonfree.hpp"
-#include "opencv2/objdetect/objdetect.hpp"
-#include "opencv2/features2d/features2d.hpp"
-#define USE_OPENCL
-#ifdef USE_OPENCL
-#include "opencv2/ocl/ocl.hpp"
-#include "opencv2/nonfree/ocl.hpp"
-#endif
-
-#define TAB "    "
-
-using namespace std;
-using namespace cv;
-
-// This program test most of the functions in ocl module and generate data metrix of x-factor in .csv files
-// All images needed in this test are in samples/gpu folder.
-// For haar template, haarcascade_frontalface_alt.xml shouold be in working directory
-
-void gen(Mat &mat, int rows, int cols, int type, Scalar low, Scalar high);
-string abspath(const string &relpath);
-int CV_CDECL cvErrorCallback(int, const char *, const char *, const char *, int, void *);
-typedef struct
-{
-    short x;
-    short y;
-} COOR;
-COOR do_meanShift(int x0, int y0, uchar *sptr, uchar *dptr, int sstep,
-                  cv::Size size, int sp, int sr, int maxIter, float eps, int *tab);
-void meanShiftProc_(const Mat &src_roi, Mat &dst_roi, Mat &dstCoor_roi,
-                    int sp, int sr, cv::TermCriteria crit);
-
-class Runnable
-{
-public:
-    explicit Runnable(const std::string &runname): name_(runname) {}
-    virtual ~Runnable() {}
-
-    const std::string &name() const
-    {
-        return name_;
-    }
-
-    virtual void run() = 0;
-
-private:
-    std::string name_;
-};
-
-class TestSystem
-{
-public:
-    static TestSystem &instance()
-    {
-        static TestSystem me;
-        return me;
-    }
-
-    void setWorkingDir(const std::string &val)
-    {
-        working_dir_ = val;
-    }
-    const std::string &workingDir() const
-    {
-        return working_dir_;
-    }
-
-    void setTestFilter(const std::string &val)
-    {
-        test_filter_ = val;
-    }
-    const std::string &testFilter() const
-    {
-        return test_filter_;
-    }
-
-    void setNumIters(int num_iters)
-    {
-        num_iters_ = num_iters;
-    }
-    void setGPUWarmupIters(int num_iters)
-    {
-        gpu_warmup_iters_ = num_iters;
-    }
-    void setCPUIters(int num_iters)
-    {
-        cpu_num_iters_ = num_iters;
-    }
-
-    void setTopThreshold(double top)
-    {
-        top_ = top;
-    }
-    void setBottomThreshold(double bottom)
-    {
-        bottom_ = bottom;
-    }
-
-    void addInit(Runnable *init)
-    {
-        inits_.push_back(init);
-    }
-    void addTest(Runnable *test)
-    {
-        tests_.push_back(test);
-    }
-    void run();
-
-    // It's public because OpenCV callback uses it
-    void printError(const std::string &msg);
-
-    std::stringstream &startNewSubtest()
-    {
-        finishCurrentSubtest();
-        return cur_subtest_description_;
-    }
-
-    bool stop() const
-    {
-        return cur_iter_idx_ >= num_iters_;
-    }
-
-    bool cpu_stop() const
-    {
-        return cur_iter_idx_ >= cpu_num_iters_;
-    }
-
-    bool warmupStop()
-    {
-        return cur_warmup_idx_++ >= gpu_warmup_iters_;
-    }
-
-    void warmupComplete()
-    {
-        cur_warmup_idx_ = 0;
-    }
-
-    void cpuOn()
-    {
-        cpu_started_ = cv::getTickCount();
-    }
-    void cpuOff()
-    {
-        int64 delta = cv::getTickCount() - cpu_started_;
-        cpu_times_.push_back(delta);
-        ++cur_iter_idx_;
-    }
-    void cpuComplete()
-    {
-        cpu_elapsed_ += meanTime(cpu_times_);
-        cur_subtest_is_empty_ = false;
-        cur_iter_idx_ = 0;
-    }
-
-    void gpuOn()
-    {
-        gpu_started_ = cv::getTickCount();
-    }
-    void gpuOff()
-    {
-        int64 delta = cv::getTickCount() - gpu_started_;
-        gpu_times_.push_back(delta);
-        ++cur_iter_idx_;
-    }
-    void gpuComplete()
-    {
-        gpu_elapsed_ += meanTime(gpu_times_);
-        cur_subtest_is_empty_ = false;
-        cur_iter_idx_ = 0;
-    }
-
-    void gpufullOn()
-    {
-        gpu_full_started_ = cv::getTickCount();
-    }
-    void gpufullOff()
-    {
-        int64 delta = cv::getTickCount() - gpu_full_started_;
-        gpu_full_times_.push_back(delta);
-        ++cur_iter_idx_;
-    }
-    void gpufullComplete()
-    {
-        gpu_full_elapsed_ += meanTime(gpu_full_times_);
-        cur_subtest_is_empty_ = false;
-        cur_iter_idx_ = 0;
-    }
-
-    bool isListMode() const
-    {
-        return is_list_mode_;
-    }
-    void setListMode(bool value)
-    {
-        is_list_mode_ = value;
-    }
-
-    void setRecordName(const std::string &name)
-    {
-        recordname_ = name;
-    }
-
-    void setCurrentTest(const std::string &name)
-    {
-        itname_ = name;
-        itname_changed_ = true;
-    }
-
-private:
-    TestSystem():
-        cur_subtest_is_empty_(true), cpu_elapsed_(0),
-        gpu_elapsed_(0), gpu_full_elapsed_(0), speedup_total_(0.0),
-        num_subtests_called_(0),
-        speedup_faster_count_(0), speedup_slower_count_(0), speedup_equal_count_(0),
-        speedup_full_faster_count_(0), speedup_full_slower_count_(0), speedup_full_equal_count_(0), is_list_mode_(false),
-        num_iters_(10), cpu_num_iters_(2),
-        gpu_warmup_iters_(1), cur_iter_idx_(0), cur_warmup_idx_(0),
-        record_(0), recordname_("performance"), itname_changed_(true)
-    {
-        cpu_times_.reserve(num_iters_);
-        gpu_times_.reserve(num_iters_);
-        gpu_full_times_.reserve(num_iters_);
-    }
-
-    void finishCurrentSubtest();
-    void resetCurrentSubtest()
-    {
-        cpu_elapsed_ = 0;
-        gpu_elapsed_ = 0;
-        gpu_full_elapsed_ = 0;
-        cur_subtest_description_.str("");
-        cur_subtest_is_empty_ = true;
-        cur_iter_idx_ = 0;
-        cpu_times_.clear();
-        gpu_times_.clear();
-        gpu_full_times_.clear();
-    }
-
-    double meanTime(const std::vector<int64> &samples);
-
-    void printHeading();
-    void printSummary();
-    void printMetrics(double cpu_time, double gpu_time, double gpu_full_time, double speedup, double fullspeedup);
-
-    void writeHeading();
-    void writeSummary();
-    void writeMetrics(double cpu_time, double gpu_time, double gpu_full_time,
-                      double speedup, double fullspeedup,
-                      double gpu_min, double gpu_max, double std_dev);
-
-    std::string working_dir_;
-    std::string test_filter_;
-
-    std::vector<Runnable *> inits_;
-    std::vector<Runnable *> tests_;
-
-    std::stringstream cur_subtest_description_;
-    bool cur_subtest_is_empty_;
-
-    int64 cpu_started_;
-    int64 gpu_started_;
-    int64 gpu_full_started_;
-    double cpu_elapsed_;
-    double gpu_elapsed_;
-    double gpu_full_elapsed_;
-
-    double speedup_total_;
-    double speedup_full_total_;
-    int num_subtests_called_;
-
-    int speedup_faster_count_;
-    int speedup_slower_count_;
-    int speedup_equal_count_;
-
-    int speedup_full_faster_count_;
-    int speedup_full_slower_count_;
-    int speedup_full_equal_count_;
-
-    bool is_list_mode_;
-
-    double top_;
-    double bottom_;
-
-    int num_iters_;
-    int cpu_num_iters_;		//there's no need to set cpu running same times with gpu
-    int gpu_warmup_iters_;	//gpu warm up times, default is 1
-    int cur_iter_idx_;
-    int cur_warmup_idx_;	//current gpu warm up times
-    std::vector<int64> cpu_times_;
-    std::vector<int64> gpu_times_;
-    std::vector<int64> gpu_full_times_;
-
-    FILE *record_;
-    std::string recordname_;
-    std::string itname_;
-    bool itname_changed_;
-};
-
-
-#define GLOBAL_INIT(name) \
-    struct name##_init: Runnable { \
-        name##_init(): Runnable(#name) { \
-            TestSystem::instance().addInit(this); \
-        } \
-        void run(); \
-    } name##_init_instance; \
-    void name##_init::run()
-
-
-#define TEST(name) \
-    struct name##_test: Runnable { \
-        name##_test(): Runnable(#name) { \
-            TestSystem::instance().addTest(this); \
-        } \
-        void run(); \
-    } name##_test_instance; \
-    void name##_test::run()
-
-#define SUBTEST TestSystem::instance().startNewSubtest()
-
-#define CPU_ON \
-    while (!TestSystem::instance().cpu_stop()) { \
-        TestSystem::instance().cpuOn()
-#define CPU_OFF \
-        TestSystem::instance().cpuOff(); \
-    } TestSystem::instance().cpuComplete()
-
-#define GPU_ON \
-    while (!TestSystem::instance().stop()) { \
-        TestSystem::instance().gpuOn()
-#define GPU_OFF \
-        TestSystem::instance().gpuOff(); \
-    } TestSystem::instance().gpuComplete()
-
-#define GPU_FULL_ON \
-    while (!TestSystem::instance().stop()) { \
-        TestSystem::instance().gpufullOn()
-#define GPU_FULL_OFF \
-        TestSystem::instance().gpufullOff(); \
-    } TestSystem::instance().gpufullComplete()
-
-#define WARMUP_ON \
-    while (!TestSystem::instance().warmupStop()) {
-#define WARMUP_OFF \
-    } TestSystem::instance().warmupComplete()
-
-void TestSystem::run()
-{
-    if (is_list_mode_)
-    {
-        for (vector<Runnable *>::iterator it = tests_.begin(); it != tests_.end(); ++it)
-        {
-            cout << (*it)->name() << endl;
-        }
-
-        return;
-    }
-
-    // Run test initializers
-    for (vector<Runnable *>::iterator it = inits_.begin(); it != inits_.end(); ++it)
-    {
-        if ((*it)->name().find(test_filter_, 0) != string::npos)
-        {
-            (*it)->run();
-        }
-    }
-
-    printHeading();
-    writeHeading();
-
-    // Run tests
-    for (vector<Runnable *>::iterator it = tests_.begin(); it != tests_.end(); ++it)
-    {
-        try
-        {
-            if ((*it)->name().find(test_filter_, 0) != string::npos)
-            {
-                cout << endl << (*it)->name() << ":\n";
-
-                setCurrentTest((*it)->name());
-                //fprintf(record_,"%s\n",(*it)->name().c_str());
-
-                (*it)->run();
-                finishCurrentSubtest();
-            }
-        }
-        catch (const Exception &)
-        {
-            // Message is printed via callback
-            resetCurrentSubtest();
-        }
-        catch (const runtime_error &e)
-        {
-            printError(e.what());
-            resetCurrentSubtest();
-        }
-    }
-
-#ifdef USE_OPENCL
-    printSummary();
-    writeSummary();
-#endif
-}
-
-
-void TestSystem::finishCurrentSubtest()
-{
-    if (cur_subtest_is_empty_)
-        // There is no need to print subtest statistics
-    {
-        return;
-    }
-
-    double cpu_time = cpu_elapsed_ / getTickFrequency() * 1000.0;
-    double gpu_time = gpu_elapsed_ / getTickFrequency() * 1000.0;
-    double gpu_full_time = gpu_full_elapsed_ / getTickFrequency() * 1000.0;
-
-    double speedup = static_cast<double>(cpu_elapsed_) / std::max(1.0, gpu_elapsed_);
-    speedup_total_ += speedup;
-
-    double fullspeedup = static_cast<double>(cpu_elapsed_) / std::max(1.0, gpu_full_elapsed_);
-    speedup_full_total_ += fullspeedup;
-
-    if (speedup > top_)
-    {
-        speedup_faster_count_++;
-    }
-    else if (speedup < bottom_)
-    {
-        speedup_slower_count_++;
-    }
-    else
-    {
-        speedup_equal_count_++;
-    }
-
-    if (fullspeedup > top_)
-    {
-        speedup_full_faster_count_++;
-    }
-    else if (fullspeedup < bottom_)
-    {
-        speedup_full_slower_count_++;
-    }
-    else
-    {
-        speedup_full_equal_count_++;
-    }
-
-    // compute min, max and
-    std::sort(gpu_times_.begin(), gpu_times_.end());
-    double gpu_min = gpu_times_.front() / getTickFrequency() * 1000.0;
-    double gpu_max = gpu_times_.back() / getTickFrequency() * 1000.0;
-    double deviation = 0;
-
-    if (gpu_times_.size() > 1)
-    {
-        double sum = 0;
-
-        for (size_t i = 0; i < gpu_times_.size(); i++)
-        {
-            int64 diff = gpu_times_[i] - static_cast<int64>(gpu_elapsed_);
-            double diff_time = diff * 1000 / getTickFrequency();
-            sum += diff_time * diff_time;
-        }
-
-        deviation = std::sqrt(sum / gpu_times_.size());
-    }
-
-    printMetrics(cpu_time, gpu_time, gpu_full_time, speedup, fullspeedup);
-    writeMetrics(cpu_time, gpu_time, gpu_full_time, speedup, fullspeedup, gpu_min, gpu_max, deviation);
-
-    num_subtests_called_++;
-    resetCurrentSubtest();
-}
-
-
-double TestSystem::meanTime(const vector<int64> &samples)
-{
-    double sum = accumulate(samples.begin(), samples.end(), 0.);
-    return sum / samples.size();
-}
-
-
-void TestSystem::printHeading()
-{
-    cout << endl;
-    cout << setiosflags(ios_base::left);
-#ifdef USE_OPENCL
-    cout << TAB << setw(10) << "CPU, ms" << setw(10) << "GPU, ms"
-         << setw(14) << "SPEEDUP" << setw(14) << "GPUTOTAL, ms" << setw(14) << "TOTALSPEEDUP"
-         << "DESCRIPTION\n";
-#else
-    cout << TAB << setw(10) << "CPU, ms\n";
-#endif
-    cout << resetiosflags(ios_base::left);
-}
-
-void TestSystem::writeHeading()
-{
-    if (!record_)
-    {
-#ifdef USE_OPENCL
-        recordname_ += "_OCL.csv";
-#else
-        recordname_ += "_CPU.csv";
-#endif
-        record_ = fopen(recordname_.c_str(), "w");
-    }
-
-#ifdef USE_OPENCL
-    fprintf(record_, "NAME,DESCRIPTION,CPU (ms),GPU (ms),SPEEDUP,GPUTOTAL (ms),TOTALSPEEDUP,GPU Min (ms),GPU Max (ms), Standard deviation (ms)\n");
-#else
-    fprintf(record_, "NAME,DESCRIPTION,CPU (ms)\n");
-#endif
-    fflush(record_);
-}
-
-void TestSystem::printSummary()
-{
-    cout << setiosflags(ios_base::fixed);
-    cout << "\naverage GPU speedup: x"
-         << setprecision(3) << speedup_total_ / std::max(1, num_subtests_called_)
-         << endl;
-    cout << "\nGPU exceeded: "
-         << setprecision(3) << speedup_faster_count_
-         << "\nGPU passed: "
-         << setprecision(3) << speedup_equal_count_
-         << "\nGPU failed: "
-         << setprecision(3) << speedup_slower_count_
-         << endl;
-    cout << "\nGPU exceeded rate: "
-         << setprecision(3) << (float)speedup_faster_count_ / std::max(1, num_subtests_called_) * 100
-         << "%"
-         << "\nGPU passed rate: "
-         << setprecision(3) << (float)speedup_equal_count_ / std::max(1, num_subtests_called_) * 100
-         << "%"
-         << "\nGPU failed rate: "
-         << setprecision(3) << (float)speedup_slower_count_ / std::max(1, num_subtests_called_) * 100
-         << "%"
-         << endl;
-    cout << "\naverage GPUTOTAL speedup: x"
-         << setprecision(3) << speedup_full_total_ / std::max(1, num_subtests_called_)
-         << endl;
-    cout << "\nGPUTOTAL exceeded: "
-         << setprecision(3) << speedup_full_faster_count_
-         << "\nGPUTOTAL passed: "
-         << setprecision(3) << speedup_full_equal_count_
-         << "\nGPUTOTAL failed: "
-         << setprecision(3) << speedup_full_slower_count_
-         << endl;
-    cout << "\nGPUTOTAL exceeded rate: "
-         << setprecision(3) << (float)speedup_full_faster_count_ / std::max(1, num_subtests_called_) * 100
-         << "%"
-         << "\nGPUTOTAL passed rate: "
-         << setprecision(3) << (float)speedup_full_equal_count_ / std::max(1, num_subtests_called_) * 100
-         << "%"
-         << "\nGPUTOTAL failed rate: "
-         << setprecision(3) << (float)speedup_full_slower_count_ / std::max(1, num_subtests_called_) * 100
-         << "%"
-         << endl;
-    cout << resetiosflags(ios_base::fixed);
-}
-
-
-void TestSystem::printMetrics(double cpu_time, double gpu_time, double gpu_full_time, double speedup, double fullspeedup)
-{
-    cout << TAB << setiosflags(ios_base::left);
-    stringstream stream;
-
-    stream << cpu_time;
-    cout << setw(10) << stream.str();
-#ifdef USE_OPENCL
-    stream.str("");
-    stream << gpu_time;
-    cout << setw(10) << stream.str();
-
-    stream.str("");
-    stream << "x" << setprecision(3) << speedup;
-    cout << setw(14) << stream.str();
-
-    stream.str("");
-    stream << gpu_full_time;
-    cout << setw(14) << stream.str();
-
-    stream.str("");
-    stream << "x" << setprecision(3) << fullspeedup;
-    cout << setw(14) << stream.str();
-#endif
-    cout << cur_subtest_description_.str();
-    cout << resetiosflags(ios_base::left) << endl;
-}
-
-void TestSystem::writeMetrics(double cpu_time, double gpu_time, double gpu_full_time, double speedup, double fullspeedup, double gpu_min, double gpu_max, double std_dev)
-{
-    if (!record_)
-    {
-        recordname_ += ".csv";
-        record_ = fopen(recordname_.c_str(), "w");
-    }
-
-#ifdef USE_OPENCL
-    fprintf(record_, "%s,%s,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f\n", itname_changed_ ? itname_.c_str() : "",
-            cur_subtest_description_.str().c_str(),
-            cpu_time, gpu_time, speedup, gpu_full_time, fullspeedup,
-            gpu_min, gpu_max, std_dev);
-#else
-    fprintf(record_, "%s,%s,%.3f\n",
-            itname_changed_ ? itname_.c_str() : "", cur_subtest_description_.str().c_str(), cpu_time);
-#endif
-
-    if (itname_changed_)
-    {
-        itname_changed_ = false;
-    }
-
-    fflush(record_);
-}
-
-void TestSystem::writeSummary()
-{
-    if (!record_)
-    {
-        recordname_ += ".csv";
-        record_ = fopen(recordname_.c_str(), "w");
-    }
-
-    fprintf(record_, "\nAverage GPU speedup: %.3f\n"
-            "exceeded: %d (%.3f%%)\n"
-            "passed: %d (%.3f%%)\n"
-            "failed: %d (%.3f%%)\n"
-            "\nAverage GPUTOTAL speedup: %.3f\n"
-            "exceeded: %d (%.3f%%)\n"
-            "passed: %d (%.3f%%)\n"
-            "failed: %d (%.3f%%)\n",
-            speedup_total_ / std::max(1, num_subtests_called_),
-            speedup_faster_count_, (float)speedup_faster_count_ / std::max(1, num_subtests_called_) * 100,
-            speedup_equal_count_, (float)speedup_equal_count_ / std::max(1, num_subtests_called_) * 100,
-            speedup_slower_count_, (float)speedup_slower_count_ / std::max(1, num_subtests_called_) * 100,
-            speedup_full_total_ / std::max(1, num_subtests_called_),
-            speedup_full_faster_count_, (float)speedup_full_faster_count_ / std::max(1, num_subtests_called_) * 100,
-            speedup_full_equal_count_, (float)speedup_full_equal_count_ / std::max(1, num_subtests_called_) * 100,
-            speedup_full_slower_count_, (float)speedup_full_slower_count_ / std::max(1, num_subtests_called_) * 100
-           );
-    fflush(record_);
-}
-
-void TestSystem::printError(const std::string &msg)
-{
-    cout << TAB << "[error: " << msg << "] " << cur_subtest_description_.str() << endl;
-}
-
-void gen(Mat &mat, int rows, int cols, int type, Scalar low, Scalar high)
-{
-    mat.create(rows, cols, type);
-    RNG rng(0);
-    rng.fill(mat, RNG::UNIFORM, low, high);
-}
-
-
-string abspath(const string &relpath)
-{
-    return TestSystem::instance().workingDir() + relpath;
-}
-
-
-int CV_CDECL cvErrorCallback(int /*status*/, const char * /*func_name*/,
-                             const char *err_msg, const char * /*file_name*/,
-                             int /*line*/, void * /*userdata*/)
-{
-    TestSystem::instance().printError(err_msg);
-    return 0;
-}
-
-/////////// matchTemplate ////////////////////////
-//void InitMatchTemplate()
-//{
-//	Mat src; gen(src, 500, 500, CV_32F, 0, 1);
-//	Mat templ; gen(templ, 500, 500, CV_32F, 0, 1);
-//#ifdef USE_OPENCL
-//	ocl::oclMat d_src(src), d_templ(templ), d_dst;
-//	ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR);
-//#endif
-//}
-TEST(matchTemplate)
-{
-    //InitMatchTemplate();
-
-    Mat src, templ, dst;
-    int templ_size = 5;
-
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        int all_type[] = {CV_32FC1, CV_32FC4};
-        std::string type_name[] = {"CV_32FC1", "CV_32FC4"};
-
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            for(templ_size = 5; templ_size <= 5; templ_size *= 5)
-            {
-                gen(src, size, size, all_type[j], 0, 1);
-
-                SUBTEST << src.cols << 'x' << src.rows << "; " << type_name[j] << "; templ " << templ_size << 'x' << templ_size << "; CCORR";
-
-                gen(templ, templ_size, templ_size, all_type[j], 0, 1);
-
-                matchTemplate(src, templ, dst, CV_TM_CCORR);
-
-                CPU_ON;
-                matchTemplate(src, templ, dst, CV_TM_CCORR);
-                CPU_OFF;
-
-#ifdef USE_OPENCL
-                ocl::oclMat d_src(src), d_templ, d_dst;
-
-                d_templ.upload(templ);
-
-                WARMUP_ON;
-                ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR);
-                WARMUP_OFF;
-
-                GPU_ON;
-                ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR);
-                GPU_OFF;
-
-                GPU_FULL_ON;
-                d_src.upload(src);
-                d_templ.upload(templ);
-                ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR);
-                d_dst.download(dst);
-                GPU_FULL_OFF;
-#endif
-            }
-        }
-
-        int all_type_8U[] = {CV_8UC1};
-        std::string type_name_8U[] = {"CV_8UC1"};
-
-        for (size_t j = 0; j < sizeof(all_type_8U) / sizeof(int); j++)
-        {
-            for(templ_size = 5; templ_size <= 5; templ_size *= 5)
-            {
-                SUBTEST << src.cols << 'x' << src.rows << "; " << type_name_8U[j] << "; templ " << templ_size << 'x' << templ_size << "; CCORR_NORMED";
-
-                gen(src, size, size, all_type_8U[j], 0, 255);
-
-                gen(templ, templ_size, templ_size, all_type_8U[j], 0, 255);
-
-                matchTemplate(src, templ, dst, CV_TM_CCORR_NORMED);
-
-                CPU_ON;
-                matchTemplate(src, templ, dst, CV_TM_CCORR_NORMED);
-                CPU_OFF;
-
-#ifdef USE_OPENCL
-                ocl::oclMat d_src(src);
-                ocl::oclMat d_templ(templ), d_dst;
-
-                WARMUP_ON;
-                ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR_NORMED);
-                WARMUP_OFF;
-
-                GPU_ON;
-                ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR_NORMED);
-                GPU_OFF;
-
-                GPU_FULL_ON;
-                d_src.upload(src);
-                d_templ.upload(templ);
-                ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR_NORMED);
-                d_dst.download(dst);
-                GPU_FULL_OFF;
-#endif
-            }
-        }
-    }
-}
-
-///////////// PyrLKOpticalFlow ////////////////////////
-TEST(PyrLKOpticalFlow)
-{
-    std::string images1[] = {"rubberwhale1.png", "aloeL.jpg"};
-    std::string images2[] = {"rubberwhale2.png", "aloeR.jpg"};
-
-    for (size_t i = 0; i < sizeof(images1) / sizeof(std::string); i++)
-    {
-        Mat frame0 = imread(abspath(images1[i]), i == 0 ? IMREAD_COLOR : IMREAD_GRAYSCALE);
-
-        if (frame0.empty())
-        {
-            std::string errstr = "can't open " + images1[i];
-            throw runtime_error(errstr);
-        }
-
-        Mat frame1 = imread(abspath(images2[i]), i == 0 ? IMREAD_COLOR : IMREAD_GRAYSCALE);
-
-        if (frame1.empty())
-        {
-            std::string errstr = "can't open " + images2[i];
-            throw runtime_error(errstr);
-        }
-
-        Mat gray_frame;
-
-        if (i == 0)
-        {
-            cvtColor(frame0, gray_frame, COLOR_BGR2GRAY);
-        }
-
-        for (int points = 1000; points <= 4000; points *= 2)
-        {
-            if (i == 0)
-                SUBTEST << frame0.cols << "x" << frame0.rows << "; color; " << points << " points";
-            else
-                SUBTEST << frame0.cols << "x" << frame0.rows << "; gray; " << points << " points";
-            Mat nextPts_cpu;
-            Mat status_cpu;
-
-            vector<Point2f> pts;
-            goodFeaturesToTrack(i == 0 ? gray_frame : frame0, pts, points, 0.01, 0.0);
-
-            vector<Point2f> nextPts;
-            vector<unsigned char> status;
-
-            vector<float> err;
-
-            calcOpticalFlowPyrLK(frame0, frame1, pts, nextPts, status, err);
-
-            CPU_ON;
-            calcOpticalFlowPyrLK(frame0, frame1, pts, nextPts, status, err);
-            CPU_OFF;
-
-#ifdef USE_OPENCL
-            ocl::PyrLKOpticalFlow d_pyrLK;
-
-            ocl::oclMat d_frame0(frame0);
-            ocl::oclMat d_frame1(frame1);
-
-            ocl::oclMat d_pts;
-            Mat pts_mat(1, (int)pts.size(), CV_32FC2, (void *)&pts[0]);
-            d_pts.upload(pts_mat);
-
-            ocl::oclMat d_nextPts;
-            ocl::oclMat d_status;
-            ocl::oclMat d_err;
-
-            WARMUP_ON;
-            d_pyrLK.sparse(d_frame0, d_frame1, d_pts, d_nextPts, d_status, &d_err);
-            WARMUP_OFF;
-
-            GPU_ON;
-            d_pyrLK.sparse(d_frame0, d_frame1, d_pts, d_nextPts, d_status, &d_err);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_frame0.upload(frame0);
-            d_frame1.upload(frame1);
-            d_pts.upload(pts_mat);
-            d_pyrLK.sparse(d_frame0, d_frame1, d_pts, d_nextPts, d_status, &d_err);
-
-            if (!d_nextPts.empty())
-            {
-                d_nextPts.download(nextPts_cpu);
-            }
-
-            if (!d_status.empty())
-            {
-                d_status.download(status_cpu);
-            }
-
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-
-
-///////////// pyrDown //////////////////////
-TEST(pyrDown)
-{
-    Mat src, dst;
-    int all_type[] = {CV_8UC1, CV_8UC4};
-    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
-
-            gen(src, size, size, all_type[j], 0, 256);
-
-            pyrDown(src, dst);
-
-            CPU_ON;
-            pyrDown(src, dst);
-            CPU_OFF;
-
-#ifdef USE_OPENCL
-            ocl::oclMat d_src(src);
-            ocl::oclMat d_dst;
-
-            WARMUP_ON;
-            ocl::pyrDown(d_src, d_dst);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::pyrDown(d_src, d_dst);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src.upload(src);
-            ocl::pyrDown(d_src, d_dst);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-    }
-}
-
-///////////// pyrUp ////////////////////////
-TEST(pyrUp)
-{
-    Mat src, dst;
-    int all_type[] = {CV_8UC1, CV_8UC4};
-    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
-
-    for (int size = 500; size <= 2000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
-
-            gen(src, size, size, all_type[j], 0, 256);
-
-            pyrUp(src, dst);
-
-            CPU_ON;
-            pyrUp(src, dst);
-            CPU_OFF;
-
-#ifdef USE_OPENCL
-            ocl::oclMat d_src(src);
-            ocl::oclMat d_dst;
-
-            WARMUP_ON;
-            ocl::pyrUp(d_src, d_dst);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::pyrUp(d_src, d_dst);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src.upload(src);
-            ocl::pyrUp(d_src, d_dst);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-    }
-}
-
-///////////// Canny ////////////////////////
-TEST(Canny)
-{
-    Mat img = imread(abspath("aloeL.jpg"), CV_LOAD_IMAGE_GRAYSCALE);
-
-    if (img.empty())
-    {
-        throw runtime_error("can't open aloeL.jpg");
-    }
-
-    SUBTEST << img.cols << 'x' << img.rows << "; aloeL.jpg" << "; edges" << "; CV_8UC1";
-
-    Mat edges(img.size(), CV_8UC1);
-
-    CPU_ON;
-    Canny(img, edges, 50.0, 100.0);
-    CPU_OFF;
-
-#ifdef USE_OPENCL
-    ocl::oclMat d_img(img);
-    ocl::oclMat d_edges;
-    ocl::CannyBuf d_buf;
-
-    WARMUP_ON;
-    ocl::Canny(d_img, d_buf, d_edges, 50.0, 100.0);
-    WARMUP_OFF;
-
-    GPU_ON;
-    ocl::Canny(d_img, d_buf, d_edges, 50.0, 100.0);
-    GPU_OFF;
-
-    GPU_FULL_ON;
-    d_img.upload(img);
-    ocl::Canny(d_img, d_buf, d_edges, 50.0, 100.0);
-    d_edges.download(edges);
-    GPU_FULL_OFF;
-#endif
-}
-
-///////////// Haar ////////////////////////
-#ifdef USE_OPENCL
-namespace cv
-{
-namespace ocl
-{
-
-struct getRect
-{
-    Rect operator()(const CvAvgComp &e) const
-    {
-        return e.rect;
-    }
-};
-
-class CascadeClassifier_GPU : public OclCascadeClassifier
-{
-public:
-    void detectMultiScale(oclMat &image,
-                          CV_OUT std::vector<cv::Rect>& faces,
-                          double scaleFactor = 1.1,
-                          int minNeighbors = 3, int flags = 0,
-                          Size minSize = Size(),
-                          Size maxSize = Size())
-    {
-        (void)maxSize;
-        MemStorage storage(cvCreateMemStorage(0));
-        //CvMat img=image;
-        CvSeq *objs = oclHaarDetectObjects(image, storage, scaleFactor, minNeighbors, flags, minSize);
-        vector<CvAvgComp> vecAvgComp;
-        Seq<CvAvgComp>(objs).copyTo(vecAvgComp);
-        faces.resize(vecAvgComp.size());
-        std::transform(vecAvgComp.begin(), vecAvgComp.end(), faces.begin(), getRect());
-    }
-
-};
-
-}
-}
-#endif
-TEST(Haar)
-{
-    Mat img = imread(abspath("basketball1.png"), CV_LOAD_IMAGE_GRAYSCALE);
-
-    if (img.empty())
-    {
-        throw runtime_error("can't open basketball1.png");
-    }
-
-    CascadeClassifier faceCascadeCPU;
-
-    if (!faceCascadeCPU.load(abspath("haarcascade_frontalface_alt.xml")))
-    {
-        throw runtime_error("can't load haarcascade_frontalface_alt.xml");
-    }
-
-    vector<Rect> faces;
-
-    SUBTEST << img.cols << "x" << img.rows << "; scale image";
-    CPU_ON;
-    faceCascadeCPU.detectMultiScale(img, faces,
-                                    1.1, 2, 0 | CV_HAAR_SCALE_IMAGE, Size(30, 30));
-    CPU_OFF;
-
-#ifdef USE_OPENCL
-    ocl::CascadeClassifier_GPU faceCascade;
-
-    if (!faceCascade.load(abspath("haarcascade_frontalface_alt.xml")))
-    {
-        throw runtime_error("can't load haarcascade_frontalface_alt.xml");
-    }
-
-    ocl::oclMat d_img(img);
-
-    faces.clear();
-
-    WARMUP_ON;
-    faceCascade.detectMultiScale(d_img, faces,
-                                 1.1, 2, 0 | CV_HAAR_SCALE_IMAGE, Size(30, 30));
-    WARMUP_OFF;
-
-    faces.clear();
-
-    GPU_ON;
-    faceCascade.detectMultiScale(d_img, faces,
-                                 1.1, 2, 0 | CV_HAAR_SCALE_IMAGE, Size(30, 30));
-    GPU_OFF;
-
-    GPU_FULL_ON;
-    d_img.upload(img);
-    faceCascade.detectMultiScale(d_img, faces,
-                                 1.1, 2, 0 | CV_HAAR_SCALE_IMAGE, Size(30, 30));
-    GPU_FULL_OFF;
-#endif
-}
-
-///////////// blend ////////////////////////
-template <typename T>
-void blendLinearGold(const cv::Mat &img1, const cv::Mat &img2, const cv::Mat &weights1, const cv::Mat &weights2, cv::Mat &result_gold)
-{
-    result_gold.create(img1.size(), img1.type());
-
-    int cn = img1.channels();
-
-    for (int y = 0; y < img1.rows; ++y)
-    {
-        const float *weights1_row = weights1.ptr<float>(y);
-        const float *weights2_row = weights2.ptr<float>(y);
-        const T *img1_row = img1.ptr<T>(y);
-        const T *img2_row = img2.ptr<T>(y);
-        T *result_gold_row = result_gold.ptr<T>(y);
-
-        for (int x = 0; x < img1.cols * cn; ++x)
-        {
-            float w1 = weights1_row[x / cn];
-            float w2 = weights2_row[x / cn];
-            result_gold_row[x] = static_cast<T>((img1_row[x] * w1 + img2_row[x] * w2) / (w1 + w2 + 1e-5f));
-        }
-    }
-}
-TEST(blend)
-{
-    Mat src1, src2, weights1, weights2, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src1, d_src2, d_weights1, d_weights2, d_dst;
-#endif
-    int all_type[] = {CV_8UC1, CV_8UC4};
-    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] << " and CV_32FC1";
-
-            gen(src1, size, size, all_type[j], 0, 256);
-            gen(src2, size, size, all_type[j], 0, 256);
-            gen(weights1, size, size, CV_32FC1, 0, 1);
-            gen(weights2, size, size, CV_32FC1, 0, 1);
-
-            blendLinearGold<uchar>(src1, src2, weights1, weights2, dst);
-
-            CPU_ON;
-            blendLinearGold<uchar>(src1, src2, weights1, weights2, dst);
-            CPU_OFF;
-
-#ifdef USE_OPENCL
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-            d_weights1.upload(weights1);
-            d_weights2.upload(weights2);
-
-            WARMUP_ON;
-            ocl::blendLinear(d_src1, d_src2, d_weights1, d_weights2, d_dst);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::blendLinear(d_src1, d_src2, d_weights1, d_weights2, d_dst);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-            d_weights1.upload(weights1);
-            d_weights2.upload(weights2);
-            ocl::blendLinear(d_src1, d_src2, d_weights1, d_weights2, d_dst);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-    }
-}
-///////////// columnSum////////////////////////
-TEST(columnSum)
-{
-    Mat src, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src, d_dst;
-#endif
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        SUBTEST << size << 'x' << size << "; CV_32FC1";
-
-        gen(src, size, size, CV_32FC1, 0, 256);
-
-        CPU_ON;
-        dst.create(src.size(), src.type());
-
-        for (int i = 1; i < src.rows; ++i)
-        {
-            for (int j = 0; j < src.cols; ++j)
-            {
-                dst.at<float>(i, j) = src.at<float>(i, j) += src.at<float>(i - 1, j);
-            }
-        }
-
-        CPU_OFF;
-
-#ifdef USE_OPENCL
-        d_src.upload(src);
-        WARMUP_ON;
-        ocl::columnSum(d_src, d_dst);
-        WARMUP_OFF;
-
-        GPU_ON;
-        ocl::columnSum(d_src, d_dst);
-        GPU_OFF;
-
-        GPU_FULL_ON;
-        d_src.upload(src);
-        ocl::columnSum(d_src, d_dst);
-        d_dst.download(dst);
-        GPU_FULL_OFF;
-#endif
-    }
-}
-
-///////////// HOG////////////////////////
-TEST(HOG)
-{
-    Mat src = imread(abspath("road.png"), cv::IMREAD_GRAYSCALE);
-
-    if (src.empty())
-    {
-        throw runtime_error("can't open road.png");
-    }
-
-
-    cv::HOGDescriptor hog;
-    hog.setSVMDetector(hog.getDefaultPeopleDetector());
-    std::vector<cv::Rect> found_locations;
-
-    SUBTEST << 768 << 'x' << 576 << "; road.png";
-
-    hog.detectMultiScale(src, found_locations);
-
-    CPU_ON;
-    hog.detectMultiScale(src, found_locations);
-    CPU_OFF;
-
-#ifdef USE_OPENCL
-    cv::ocl::HOGDescriptor ocl_hog;
-    ocl_hog.setSVMDetector(ocl_hog.getDefaultPeopleDetector());
-    ocl::oclMat d_src;
-    d_src.upload(src);
-
-    WARMUP_ON;
-    ocl_hog.detectMultiScale(d_src, found_locations);
-    WARMUP_OFF;
-
-    GPU_ON;
-    ocl_hog.detectMultiScale(d_src, found_locations);
-    GPU_OFF;
-
-    GPU_FULL_ON;
-    d_src.upload(src);
-    ocl_hog.detectMultiScale(d_src, found_locations);
-    GPU_FULL_OFF;
-#endif
-}
-
-///////////// SURF ////////////////////////
-
-TEST(SURF)
-{
-    Mat keypoints_cpu;
-    Mat descriptors_cpu;
-
-    Mat src = imread(abspath("aloeL.jpg"), CV_LOAD_IMAGE_GRAYSCALE);
-
-    if (src.empty())
-    {
-        throw runtime_error("can't open aloeL.jpg");
-    }
-
-    SUBTEST << src.cols << "x" << src.rows << "; aloeL.jpg";
-    SURF surf;
-    vector<KeyPoint> keypoints;
-    Mat descriptors;
-
-    surf(src, Mat(), keypoints, descriptors);
-
-    CPU_ON;
-    keypoints.clear();
-    surf(src, Mat(), keypoints, descriptors);
-    CPU_OFF;
-
-#ifdef USE_OPENCL
-    ocl::SURF_OCL d_surf;
-    ocl::oclMat d_src(src);
-    ocl::oclMat d_keypoints;
-    ocl::oclMat d_descriptors;
-
-    WARMUP_ON;
-    d_surf(d_src, ocl::oclMat(), d_keypoints, d_descriptors);
-    WARMUP_OFF;
-
-    GPU_ON;
-    d_surf(d_src, ocl::oclMat(), d_keypoints, d_descriptors);
-    GPU_OFF;
-
-    GPU_FULL_ON;
-    d_src.upload(src);
-    d_surf(d_src, ocl::oclMat(), d_keypoints, d_descriptors);
-
-    if (!d_keypoints.empty())
-    {
-        d_keypoints.download(keypoints_cpu);
-    }
-
-    if (!d_descriptors.empty())
-    {
-        d_descriptors.download(descriptors_cpu);
-    }
-
-    GPU_FULL_OFF;
-#endif
-}
-//////////////////// BruteForceMatch /////////////////
-TEST(BruteForceMatcher)
-{
-    Mat trainIdx_cpu;
-    Mat distance_cpu;
-    Mat allDist_cpu;
-    Mat nMatches_cpu;
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        // Init CPU matcher
-        int desc_len = 64;
-
-        BFMatcher matcher(NORM_L2);
-
-        Mat query;
-        gen(query, size, desc_len, CV_32F, 0, 1);
-
-        Mat train;
-        gen(train, size, desc_len, CV_32F, 0, 1);
-        // Output
-        vector< vector<DMatch> > matches(2);
-#ifdef USE_OPENCL
-        // Init GPU matcher
-        ocl::BruteForceMatcher_OCL_base d_matcher(ocl::BruteForceMatcher_OCL_base::L2Dist);
-
-        ocl::oclMat d_query(query);
-        ocl::oclMat d_train(train);
-
-        ocl::oclMat d_trainIdx, d_distance, d_allDist, d_nMatches;
-#endif
-        SUBTEST << size << "; match";
-
-        matcher.match(query, train, matches[0]);
-
-        CPU_ON;
-        matcher.match(query, train, matches[0]);
-        CPU_OFF;
-
-#ifdef USE_OPENCL
-        WARMUP_ON;
-        d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance);
-        WARMUP_OFF;
-
-        GPU_ON;
-        d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance);
-        GPU_OFF;
-
-        GPU_FULL_ON;
-        d_query.upload(query);
-        d_train.upload(train);
-        d_matcher.match(d_query, d_train, matches[0]);
-        GPU_FULL_OFF;
-#endif
-
-        SUBTEST << size << "; knnMatch";
-
-        matcher.knnMatch(query, train, matches, 2);
-
-        CPU_ON;
-        matcher.knnMatch(query, train, matches, 2);
-        CPU_OFF;
-
-#ifdef USE_OPENCL
-        WARMUP_ON;
-        d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, 2);
-        WARMUP_OFF;
-
-        GPU_ON;
-        d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, 2);
-        GPU_OFF;
-
-        GPU_FULL_ON;
-        d_query.upload(query);
-        d_train.upload(train);
-        d_matcher.knnMatch(d_query, d_train, matches, 2);
-        GPU_FULL_OFF;
-#endif
-        SUBTEST << size << "; radiusMatch";
-
-        float max_distance = 2.0f;
-
-        matcher.radiusMatch(query, train, matches, max_distance);
-
-        CPU_ON;
-        matcher.radiusMatch(query, train, matches, max_distance);
-        CPU_OFF;
-
-#ifdef USE_OPENCL
-        d_trainIdx.release();
-
-        WARMUP_ON;
-        d_matcher.radiusMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_nMatches, max_distance);
-        WARMUP_OFF;
-
-        GPU_ON;
-        d_matcher.radiusMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_nMatches, max_distance);
-        GPU_OFF;
-
-        GPU_FULL_ON;
-        d_query.upload(query);
-        d_train.upload(train);
-        d_matcher.radiusMatch(d_query, d_train, matches, max_distance);
-        GPU_FULL_OFF;
-#endif
-    }
-}
-///////////// Lut ////////////////////////
-TEST(lut)
-{
-    Mat src, lut, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src, d_lut, d_dst;
-#endif
-    int all_type[] = {CV_8UC1, CV_8UC3};
-    std::string type_name[] = {"CV_8UC1", "CV_8UC3"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j];
-
-            gen(src, size, size, all_type[j], 0, 256);
-            gen(lut, 1, 256, CV_8UC1, 0, 1);
-            gen(dst, size, size, all_type[j], 0, 256);
-
-            LUT(src, lut, dst);
-
-            CPU_ON;
-            LUT(src, lut, dst);
-            CPU_OFF;
-
-#ifdef USE_OPENCL
-            d_src.upload(src);
-            d_lut.upload(lut);
-
-            WARMUP_ON;
-            ocl::LUT(d_src, d_lut, d_dst);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::LUT(d_src, d_lut, d_dst);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src.upload(src);
-            d_lut.upload(lut);
-            ocl::LUT(d_src, d_lut, d_dst);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-///////////// Exp ////////////////////////
-TEST(Exp)
-{
-    Mat src, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src, d_dst;
-#endif
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        SUBTEST << size << 'x' << size << "; CV_32FC1";
-
-        gen(src, size, size, CV_32FC1, 0, 256);
-        gen(dst, size, size, CV_32FC1, 0, 256);
-
-        exp(src, dst);
-
-        CPU_ON;
-        exp(src, dst);
-        CPU_OFF;
-#ifdef USE_OPENCL
-        d_src.upload(src);
-
-        WARMUP_ON;
-        ocl::exp(d_src, d_dst);
-        WARMUP_OFF;
-
-        GPU_ON;
-        ocl::exp(d_src, d_dst);
-        GPU_OFF;
-
-        GPU_FULL_ON;
-        d_src.upload(src);
-        ocl::exp(d_src, d_dst);
-        d_dst.download(dst);
-        GPU_FULL_OFF;
-#endif
-    }
-}
-
-///////////// LOG ////////////////////////
-TEST(Log)
-{
-    Mat src, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src, d_dst;
-#endif
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        SUBTEST << size << 'x' << size << "; 32F";
-
-        gen(src, size, size, CV_32F, 1, 10);
-
-        log(src, dst);
-
-        CPU_ON;
-        log(src, dst);
-        CPU_OFF;
-#ifdef USE_OPENCL
-        d_src.upload(src);
-
-        WARMUP_ON;
-        ocl::log(d_src, d_dst);
-        WARMUP_OFF;
-
-        GPU_ON;
-        ocl::log(d_src, d_dst);
-        GPU_OFF;
-
-        GPU_FULL_ON;
-        d_src.upload(src);
-        ocl::log(d_src, d_dst);
-        d_dst.download(dst);
-        GPU_FULL_OFF;
-#endif
-    }
-}
-
-///////////// Add ////////////////////////
-
-TEST(Add)
-{
-    Mat src1, src2, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src1, d_src2, d_dst;
-#endif
-    int all_type[] = {CV_8UC1, CV_32FC1};
-    std::string type_name[] = {"CV_8UC1", "CV_32FC1"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j];
-
-            gen(src1, size, size, all_type[j], 0, 1);
-            gen(src2, size, size, all_type[j], 0, 1);
-
-            add(src1, src2, dst);
-
-            CPU_ON;
-            add(src1, src2, dst);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-
-            WARMUP_ON;
-            ocl::add(d_src1, d_src2, d_dst);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::add(d_src1, d_src2, d_dst);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-            ocl::add(d_src1, d_src2, d_dst);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-///////////// Mul ////////////////////////
-TEST(Mul)
-{
-    Mat src1, src2, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src1, d_src2, d_dst;
-#endif
-    int all_type[] = {CV_8UC1, CV_8UC4};
-    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
-
-            gen(src1, size, size, all_type[j], 0, 256);
-            gen(src2, size, size, all_type[j], 0, 256);
-            gen(dst, size, size, all_type[j], 0, 256);
-
-
-            multiply(src1, src2, dst);
-
-            CPU_ON;
-            multiply(src1, src2, dst);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-
-            WARMUP_ON;
-            ocl::multiply(d_src1, d_src2, d_dst);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::multiply(d_src1, d_src2, d_dst);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-            ocl::multiply(d_src1, d_src2, d_dst);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-
-///////////// Div ////////////////////////
-TEST(Div)
-{
-    Mat src1, src2, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src1, d_src2, d_dst;
-#endif
-    int all_type[] = {CV_8UC1, CV_8UC4};
-    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j];
-
-            gen(src1, size, size, all_type[j], 0, 256);
-            gen(src2, size, size, all_type[j], 0, 256);
-            gen(dst, size, size, all_type[j], 0, 256);
-
-
-            divide(src1, src2, dst);
-
-            CPU_ON;
-            divide(src1, src2, dst);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-
-            WARMUP_ON;
-            ocl::divide(d_src1, d_src2, d_dst);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::divide(d_src1, d_src2, d_dst);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-            ocl::divide(d_src1, d_src2, d_dst);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-
-///////////// Absdiff ////////////////////////
-TEST(Absdiff)
-{
-    Mat src1, src2, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src1, d_src2, d_dst;
-#endif
-    int all_type[] = {CV_8UC1, CV_8UC4};
-    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
-
-            gen(src1, size, size, all_type[j], 0, 256);
-            gen(src2, size, size, all_type[j], 0, 256);
-            gen(dst, size, size, all_type[j], 0, 256);
-
-
-            absdiff(src1, src2, dst);
-
-            CPU_ON;
-            absdiff(src1, src2, dst);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-
-            WARMUP_ON;
-            ocl::absdiff(d_src1, d_src2, d_dst);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::absdiff(d_src1, d_src2, d_dst);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-            ocl::absdiff(d_src1, d_src2, d_dst);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-
-///////////// CartToPolar ////////////////////////
-TEST(CartToPolar)
-{
-    Mat src1, src2, dst, dst1;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src1, d_src2, d_dst, d_dst1;
-#endif
-    int all_type[] = {CV_32FC1};
-    std::string type_name[] = {"CV_32FC1"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j];
-
-            gen(src1, size, size, all_type[j], 0, 256);
-            gen(src2, size, size, all_type[j], 0, 256);
-            gen(dst, size, size, all_type[j], 0, 256);
-            gen(dst1, size, size, all_type[j], 0, 256);
-
-
-            cartToPolar(src1, src2, dst, dst1, 1);
-
-            CPU_ON;
-            cartToPolar(src1, src2, dst, dst1, 1);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-
-            WARMUP_ON;
-            ocl::cartToPolar(d_src1, d_src2, d_dst, d_dst1, 1);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::cartToPolar(d_src1, d_src2, d_dst, d_dst1, 1);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-            ocl::cartToPolar(d_src1, d_src2, d_dst, d_dst1, 1);
-            d_dst.download(dst);
-            d_dst1.download(dst1);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-
-///////////// PolarToCart ////////////////////////
-TEST(PolarToCart)
-{
-    Mat src1, src2, dst, dst1;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src1, d_src2, d_dst, d_dst1;
-#endif
-    int all_type[] = {CV_32FC1};
-    std::string type_name[] = {"CV_32FC1"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
-
-            gen(src1, size, size, all_type[j], 0, 256);
-            gen(src2, size, size, all_type[j], 0, 256);
-            gen(dst, size, size, all_type[j], 0, 256);
-            gen(dst1, size, size, all_type[j], 0, 256);
-
-
-            polarToCart(src1, src2, dst, dst1, 1);
-
-            CPU_ON;
-            polarToCart(src1, src2, dst, dst1, 1);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-
-            WARMUP_ON;
-            ocl::polarToCart(d_src1, d_src2, d_dst, d_dst1, 1);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::polarToCart(d_src1, d_src2, d_dst, d_dst1, 1);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-            ocl::polarToCart(d_src1, d_src2, d_dst, d_dst1, 1);
-            d_dst.download(dst);
-            d_dst1.download(dst1);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-
-///////////// Magnitude ////////////////////////
-TEST(magnitude)
-{
-    Mat x, y, mag;
-#ifdef USE_OPENCL
-    ocl::oclMat d_x, d_y, d_mag;
-#endif
-    int all_type[] = {CV_32FC1};
-    std::string type_name[] = {"CV_32FC1"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j];
-
-            gen(x, size, size, all_type[j], 0, 1);
-            gen(y, size, size, all_type[j], 0, 1);
-
-            magnitude(x, y, mag);
-
-            CPU_ON;
-            magnitude(x, y, mag);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_x.upload(x);
-            d_y.upload(y);
-
-            WARMUP_ON;
-            ocl::magnitude(d_x, d_y, d_mag);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::magnitude(d_x, d_y, d_mag);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_x.upload(x);
-            d_y.upload(y);
-            ocl::magnitude(d_x, d_y, d_mag);
-            d_mag.download(mag);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-
-///////////// Transpose ////////////////////////
-TEST(Transpose)
-{
-    Mat src, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src, d_dst;
-#endif
-    int all_type[] = {CV_8UC1, CV_8UC4};
-    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j];
-
-            gen(src, size, size, all_type[j], 0, 256);
-            gen(dst, size, size, all_type[j], 0, 256);
-
-            transpose(src, dst);
-
-            CPU_ON;
-            transpose(src, dst);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src.upload(src);
-
-            WARMUP_ON;
-            ocl::transpose(d_src, d_dst);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::transpose(d_src, d_dst);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src.upload(src);
-            ocl::transpose(d_src, d_dst);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-
-///////////// Flip ////////////////////////
-TEST(Flip)
-{
-    Mat src, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src, d_dst;
-#endif
-    int all_type[] = {CV_8UC1, CV_8UC4};
-    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] << " ; FLIP_BOTH";
-
-            gen(src, size, size, all_type[j], 0, 256);
-            gen(dst, size, size, all_type[j], 0, 256);
-
-            flip(src, dst, 0);
-
-            CPU_ON;
-            flip(src, dst, 0);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src.upload(src);
-
-            WARMUP_ON;
-            ocl::flip(d_src, d_dst, 0);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::flip(d_src, d_dst, 0);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src.upload(src);
-            ocl::flip(d_src, d_dst, 0);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-
-///////////// minMax ////////////////////////
-TEST(minMax)
-{
-    Mat src;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src;
-#endif
-    double min_val, max_val;
-    Point min_loc, max_loc;
-    int all_type[] = {CV_8UC1, CV_32FC1};
-    std::string type_name[] = {"CV_8UC1", "CV_32FC1"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j];
-
-            gen(src, size, size, all_type[j], 0, 256);
-
-            CPU_ON;
-            minMaxLoc(src, &min_val, &max_val, &min_loc, &max_loc);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src.upload(src);
-
-            WARMUP_ON;
-            ocl::minMax(d_src, &min_val, &max_val);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::minMax(d_src, &min_val, &max_val);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src.upload(src);
-            ocl::minMax(d_src, &min_val, &max_val);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-
-///////////// minMaxLoc ////////////////////////
-TEST(minMaxLoc)
-{
-    Mat src;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src;
-#endif
-    double min_val, max_val;
-    Point min_loc, max_loc;
-    int all_type[] = {CV_8UC1, CV_32FC1};
-    std::string type_name[] = {"CV_8UC1", "CV_32FC1"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
-
-            gen(src, size, size, all_type[j], 0, 1);
-
-            CPU_ON;
-            minMaxLoc(src, &min_val, &max_val, &min_loc, &max_loc);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src.upload(src);
-
-            WARMUP_ON;
-            ocl::minMaxLoc(d_src, &min_val, &max_val, &min_loc, &max_loc);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::minMaxLoc(d_src, &min_val, &max_val, &min_loc, &max_loc);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src.upload(src);
-            ocl::minMaxLoc(d_src, &min_val, &max_val, &min_loc, &max_loc);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-///////////// Sum ////////////////////////
-TEST(Sum)
-{
-    Mat src;
-    Scalar cpures, gpures;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src;
-#endif
-    int all_type[] = {CV_8UC1, CV_32SC1};
-    std::string type_name[] = {"CV_8UC1", "CV_32SC1"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
-
-            gen(src, size, size, all_type[j], 0, 256);
-
-            cpures = sum(src);
-
-            CPU_ON;
-            cpures = sum(src);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src.upload(src);
-
-            WARMUP_ON;
-            gpures = ocl::sum(d_src);
-            WARMUP_OFF;
-
-            GPU_ON;
-            gpures = ocl::sum(d_src);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src.upload(src);
-            gpures = ocl::sum(d_src);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-///////////// countNonZero ////////////////////////
-TEST(countNonZero)
-{
-    Mat src;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src;
-#endif
-    int all_type[] = {CV_8UC1, CV_32FC1};
-    std::string type_name[] = {"CV_8UC1", "CV_32FC1"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
-
-            gen(src, size, size, all_type[j], 0, 256);
-
-            countNonZero(src);
-
-            CPU_ON;
-            countNonZero(src);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src.upload(src);
-
-            WARMUP_ON;
-            ocl::countNonZero(d_src);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::countNonZero(d_src);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src.upload(src);
-            ocl::countNonZero(d_src);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-///////////// Phase ////////////////////////
-TEST(Phase)
-{
-    Mat src1, src2, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src1, d_src2, d_dst;
-#endif
-    int all_type[] = {CV_32FC1};
-    std::string type_name[] = {"CV_32FC1"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
-
-            gen(src1, size, size, all_type[j], 0, 256);
-            gen(src2, size, size, all_type[j], 0, 256);
-            gen(dst, size, size, all_type[j], 0, 256);
-
-
-            phase(src1, src2, dst, 1);
-
-            CPU_ON;
-            phase(src1, src2, dst, 1);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-
-            WARMUP_ON;
-            ocl::phase(d_src1, d_src2, d_dst, 1);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::phase(d_src1, d_src2, d_dst, 1);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-            ocl::phase(d_src1, d_src2, d_dst, 1);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-
-///////////// bitwise_and////////////////////////
-TEST(bitwise_and)
-{
-    Mat src1, src2, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src1, d_src2, d_dst;
-#endif
-    int all_type[] = {CV_8UC1, CV_32SC1};
-    std::string type_name[] = {"CV_8UC1", "CV_32SC1"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
-
-            gen(src1, size, size, all_type[j], 0, 256);
-            gen(src2, size, size, all_type[j], 0, 256);
-            gen(dst, size, size, all_type[j], 0, 256);
-
-
-            bitwise_and(src1, src2, dst);
-
-            CPU_ON;
-            bitwise_and(src1, src2, dst);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-
-            WARMUP_ON;
-            ocl::bitwise_and(d_src1, d_src2, d_dst);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::bitwise_and(d_src1, d_src2, d_dst);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-            ocl::bitwise_and(d_src1, d_src2, d_dst);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-///////////// bitwise_or////////////////////////
-TEST(bitwise_or)
-{
-    Mat src1, src2, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src1, d_src2, d_dst;
-#endif
-    int all_type[] = {CV_8UC1, CV_32SC1};
-    std::string type_name[] = {"CV_8UC1", "CV_32SC1"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j];
-
-            gen(src1, size, size, all_type[j], 0, 256);
-            gen(src2, size, size, all_type[j], 0, 256);
-            gen(dst, size, size, all_type[j], 0, 256);
-
-
-            bitwise_or(src1, src2, dst);
-
-            CPU_ON;
-            bitwise_or(src1, src2, dst);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-
-            WARMUP_ON;
-            ocl::bitwise_or(d_src1, d_src2, d_dst);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::bitwise_or(d_src1, d_src2, d_dst);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-            ocl::bitwise_or(d_src1, d_src2, d_dst);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-///////////// bitwise_xor////////////////////////
-TEST(bitwise_xor)
-{
-    Mat src1, src2, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src1, d_src2, d_dst;
-#endif
-    int all_type[] = {CV_8UC1, CV_32SC1};
-    std::string type_name[] = {"CV_8UC1", "CV_32SC1"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j];
-
-            gen(src1, size, size, all_type[j], 0, 256);
-            gen(src2, size, size, all_type[j], 0, 256);
-            gen(dst, size, size, all_type[j], 0, 256);
-
-
-            bitwise_xor(src1, src2, dst);
-
-            CPU_ON;
-            bitwise_xor(src1, src2, dst);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-
-            WARMUP_ON;
-            ocl::bitwise_xor(d_src1, d_src2, d_dst);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::bitwise_xor(d_src1, d_src2, d_dst);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-            ocl::bitwise_xor(d_src1, d_src2, d_dst);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-///////////// bitwise_not////////////////////////
-TEST(bitwise_not)
-{
-    Mat src1, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src1, d_dst;
-#endif
-    int all_type[] = {CV_8UC1, CV_32SC1};
-    std::string type_name[] = {"CV_8UC1", "CV_32SC1"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
-
-            gen(src1, size, size, all_type[j], 0, 256);
-            gen(dst, size, size, all_type[j], 0, 256);
-
-
-            bitwise_not(src1, dst);
-
-            CPU_ON;
-            bitwise_not(src1, dst);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src1.upload(src1);
-
-            WARMUP_ON;
-            ocl::bitwise_not(d_src1, d_dst);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::bitwise_not(d_src1, d_dst);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src1.upload(src1);
-            ocl::bitwise_not(d_src1, d_dst);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-
-///////////// compare////////////////////////
-TEST(compare)
-{
-    Mat src1, src2, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src1, d_src2, d_dst;
-#endif
-    int CMP_EQ = 0;
-    int all_type[] = {CV_8UC1, CV_32FC1};
-    std::string type_name[] = {"CV_8UC1", "CV_32FC1"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
-
-            gen(src1, size, size, all_type[j], 0, 256);
-            gen(src2, size, size, all_type[j], 0, 256);
-            gen(dst, size, size, all_type[j], 0, 256);
-
-
-            compare(src1, src2, dst, CMP_EQ);
-
-            CPU_ON;
-            compare(src1, src2, dst, CMP_EQ);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-
-            WARMUP_ON;
-            ocl::compare(d_src1, d_src2, d_dst, CMP_EQ);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::compare(d_src1, d_src2, d_dst, CMP_EQ);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-            ocl::compare(d_src1, d_src2, d_dst, CMP_EQ);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-
-///////////// pow ////////////////////////
-TEST(pow)
-{
-    Mat src, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src, d_dst;
-#endif
-    int all_type[] = {CV_32FC1};
-    std::string type_name[] = {"CV_32FC1"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
-
-            gen(src, size, size, all_type[j], 0, 100);
-            gen(dst, size, size, all_type[j], 0, 100);
-
-            pow(src, -2.0, dst);
-
-            CPU_ON;
-            pow(src, -2.0, dst);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src.upload(src);
-            d_dst.upload(dst);
-
-            WARMUP_ON;
-            ocl::pow(d_src, -2.0, d_dst);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::pow(d_src, -2.0, d_dst);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src.upload(src);
-            ocl::pow(d_src, -2.0, d_dst);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-
-///////////// MagnitudeSqr////////////////////////
-TEST(MagnitudeSqr)
-{
-    Mat src1, src2, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src1, d_src2, d_dst;
-#endif
-    int all_type[] = {CV_32FC1};
-    std::string type_name[] = {"CV_32FC1"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t t = 0; t < sizeof(all_type) / sizeof(int); t++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[t];
-
-            gen(src1, size, size, all_type[t], 0, 256);
-            gen(src2, size, size, all_type[t], 0, 256);
-            gen(dst, size, size, all_type[t], 0, 256);
-
-
-            for (int i = 0; i < src1.rows; ++i)
-
-                for (int j = 0; j < src1.cols; ++j)
-                {
-                    float val1 = src1.at<float>(i, j);
-                    float val2 = src2.at<float>(i, j);
-
-                    ((float *)(dst.data))[i * dst.step / 4 + j] = val1 * val1 + val2 * val2;
-
-                }
-
-            CPU_ON;
-
-            for (int i = 0; i < src1.rows; ++i)
-                for (int j = 0; j < src1.cols; ++j)
-                {
-                    float val1 = src1.at<float>(i, j);
-                    float val2 = src2.at<float>(i, j);
-
-                    ((float *)(dst.data))[i * dst.step / 4 + j] = val1 * val1 + val2 * val2;
-
-                }
-
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-
-            WARMUP_ON;
-            ocl::magnitudeSqr(d_src1, d_src2, d_dst);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::magnitudeSqr(d_src1, d_src2, d_dst);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-            ocl::magnitudeSqr(d_src1, d_src2, d_dst);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-
-///////////// AddWeighted////////////////////////
-TEST(AddWeighted)
-{
-    Mat src1, src2, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src1, d_src2, d_dst;
-#endif
-    double alpha = 2.0, beta = 1.0, gama = 3.0;
-    int all_type[] = {CV_8UC1, CV_32FC1};
-    std::string type_name[] = {"CV_8UC1", "CV_32FC1"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
-
-            gen(src1, size, size, all_type[j], 0, 256);
-            gen(src2, size, size, all_type[j], 0, 256);
-            gen(dst, size, size, all_type[j], 0, 256);
-
-
-            addWeighted(src1, alpha, src2, beta, gama, dst);
-
-            CPU_ON;
-            addWeighted(src1, alpha, src2, beta, gama, dst);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-
-            WARMUP_ON;
-            ocl::addWeighted(d_src1, alpha, d_src2, beta, gama, d_dst);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::addWeighted(d_src1, alpha, d_src2, beta, gama, d_dst);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-            ocl::addWeighted(d_src1, alpha, d_src2, beta, gama, d_dst);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-///////////// Blur////////////////////////
-TEST(Blur)
-{
-    Mat src1, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src1, d_dst;
-#endif
-    Size ksize = Size(3, 3);
-    int bordertype = BORDER_CONSTANT;
-    int all_type[] = {CV_8UC1, CV_8UC4};
-    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
-
-            gen(src1, size, size, all_type[j], 0, 256);
-            gen(dst, size, size, all_type[j], 0, 256);
-
-
-            blur(src1, dst, ksize, Point(-1, -1), bordertype);
-
-            CPU_ON;
-            blur(src1, dst, ksize, Point(-1, -1), bordertype);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src1.upload(src1);
-
-            WARMUP_ON;
-            ocl::blur(d_src1, d_dst, ksize, Point(-1, -1), bordertype);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::blur(d_src1, d_dst, ksize, Point(-1, -1), bordertype);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src1.upload(src1);
-            ocl::blur(d_src1, d_dst, ksize, Point(-1, -1), bordertype);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-///////////// Laplacian////////////////////////
-TEST(Laplacian)
-{
-    // Runs Laplacian() and ocl::Laplacian() (ddepth -1, aperture 3, scale 1)
-    // on square images of side 1000, 2000 and 4000, for CV_8UC1 and CV_8UC4.
-    Mat hostSrc, hostDst;
-#ifdef USE_OPENCL
-    ocl::oclMat devSrc, devDst;
-#endif
-    const int aperture = 3;
-    const int types[] = {CV_8UC1, CV_8UC4};
-    const std::string typeNames[] = {"CV_8UC1", "CV_8UC4"};
-    const size_t typeCount = sizeof(types) / sizeof(types[0]);
-
-    for (int side = 1000; side <= 4000; side *= 2)
-    {
-        for (size_t t = 0; t < typeCount; ++t)
-        {
-            SUBTEST << side << 'x' << side << "; " << typeNames[t] ;
-
-            gen(hostSrc, side, side, types[t], 0, 256);
-            gen(hostDst, side, side, types[t], 0, 256);
-
-            // One call outside the CPU_ON/CPU_OFF bracket, then the bracketed call.
-            Laplacian(hostSrc, hostDst, -1, aperture, 1);
-
-            CPU_ON;
-            Laplacian(hostSrc, hostDst, -1, aperture, 1);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            devSrc.upload(hostSrc);
-
-            WARMUP_ON;
-            ocl::Laplacian(devSrc, devDst, -1, aperture, 1);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::Laplacian(devSrc, devDst, -1, aperture, 1);
-            GPU_OFF;
-
-            // The GPU_FULL bracket also encloses the upload and the download.
-            GPU_FULL_ON;
-            devSrc.upload(hostSrc);
-            ocl::Laplacian(devSrc, devDst, -1, aperture, 1);
-            devDst.download(hostDst);
-            GPU_FULL_OFF;
-#endif
-        }
-    }
-}
-
-///////////// Erode ////////////////////
-TEST(Erode)
-{
-    // Runs erode() and ocl::erode() with a 3x3 MORPH_RECT structuring element
-    // on square images of side 1000, 2000 and 4000, for four 8-bit/float types.
-    Mat hostSrc, hostDst, element;
-#ifdef USE_OPENCL
-    ocl::oclMat devSrc, devDst;
-#endif
-    const int types[] = {CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4};
-    const std::string typeNames[] = {"CV_8UC1", "CV_8UC4", "CV_32FC1", "CV_32FC4"};
-    const size_t typeCount = sizeof(types) / sizeof(types[0]);
-
-    for (int side = 1000; side <= 4000; side *= 2)
-    {
-        for (size_t t = 0; t < typeCount; ++t)
-        {
-            SUBTEST << side << 'x' << side << "; " << typeNames[t] ;
-
-            gen(hostSrc, side, side, types[t], Scalar::all(0), Scalar::all(256));
-            element = getStructuringElement(MORPH_RECT, Size(3, 3));
-
-            // One call outside the CPU_ON/CPU_OFF bracket, then the bracketed call.
-            erode(hostSrc, hostDst, element);
-
-            CPU_ON;
-            erode(hostSrc, hostDst, element);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            devSrc.upload(hostSrc);
-
-            WARMUP_ON;
-            ocl::erode(devSrc, devDst, element);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::erode(devSrc, devDst, element);
-            GPU_OFF;
-
-            // The GPU_FULL bracket also encloses the upload and the download.
-            GPU_FULL_ON;
-            devSrc.upload(hostSrc);
-            ocl::erode(devSrc, devDst, element);
-            devDst.download(hostDst);
-            GPU_FULL_OFF;
-#endif
-        }
-    }
-}
-
-///////////// Sobel ////////////////////////
-TEST(Sobel)
-{
-    // Runs Sobel() and ocl::Sobel() (ddepth -1, dx = 1, dy = 1) on square
-    // images of side 1000, 2000 and 4000, for CV_8UC1 and CV_8UC4.
-    Mat hostSrc, hostDst;
-#ifdef USE_OPENCL
-    ocl::oclMat devSrc, devDst;
-#endif
-    const int orderX = 1;
-    const int orderY = 1;
-    const int types[] = {CV_8UC1, CV_8UC4};
-    const std::string typeNames[] = {"CV_8UC1", "CV_8UC4"};
-    const size_t typeCount = sizeof(types) / sizeof(types[0]);
-
-    for (int side = 1000; side <= 4000; side *= 2)
-    {
-        for (size_t t = 0; t < typeCount; ++t)
-        {
-            SUBTEST << side << 'x' << side << "; " << typeNames[t] ;
-
-            gen(hostSrc, side, side, types[t], 0, 256);
-
-            // One call outside the CPU_ON/CPU_OFF bracket, then the bracketed call.
-            Sobel(hostSrc, hostDst, -1, orderX, orderY);
-
-            CPU_ON;
-            Sobel(hostSrc, hostDst, -1, orderX, orderY);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            devSrc.upload(hostSrc);
-
-            WARMUP_ON;
-            ocl::Sobel(devSrc, devDst, -1, orderX, orderY);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::Sobel(devSrc, devDst, -1, orderX, orderY);
-            GPU_OFF;
-
-            // The GPU_FULL bracket also encloses the upload and the download.
-            GPU_FULL_ON;
-            devSrc.upload(hostSrc);
-            ocl::Sobel(devSrc, devDst, -1, orderX, orderY);
-            devDst.download(hostDst);
-            GPU_FULL_OFF;
-#endif
-        }
-    }
-}
-///////////// Scharr ////////////////////////
-TEST(Scharr)
-{
-    // Runs Scharr() and ocl::Scharr() (ddepth -1, dx = 1, dy = 0) on square
-    // images of side 1000, 2000 and 4000, for CV_8UC1 and CV_8UC4.
-    Mat hostSrc, hostDst;
-#ifdef USE_OPENCL
-    ocl::oclMat devSrc, devDst;
-#endif
-    const int orderX = 1;
-    const int orderY = 0;
-    const int types[] = {CV_8UC1, CV_8UC4};
-    const std::string typeNames[] = {"CV_8UC1", "CV_8UC4"};
-    const size_t typeCount = sizeof(types) / sizeof(types[0]);
-
-    for (int side = 1000; side <= 4000; side *= 2)
-    {
-        for (size_t t = 0; t < typeCount; ++t)
-        {
-            SUBTEST << side << 'x' << side << "; " << typeNames[t] ;
-
-            gen(hostSrc, side, side, types[t], 0, 256);
-
-            // One call outside the CPU_ON/CPU_OFF bracket, then the bracketed call.
-            Scharr(hostSrc, hostDst, -1, orderX, orderY);
-
-            CPU_ON;
-            Scharr(hostSrc, hostDst, -1, orderX, orderY);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            devSrc.upload(hostSrc);
-
-            WARMUP_ON;
-            ocl::Scharr(devSrc, devDst, -1, orderX, orderY);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::Scharr(devSrc, devDst, -1, orderX, orderY);
-            GPU_OFF;
-
-            // The GPU_FULL bracket also encloses the upload and the download.
-            GPU_FULL_ON;
-            devSrc.upload(hostSrc);
-            ocl::Scharr(devSrc, devDst, -1, orderX, orderY);
-            devDst.download(hostDst);
-            GPU_FULL_OFF;
-#endif
-        }
-    }
-}
-
-///////////// GaussianBlur ////////////////////////
-TEST(GaussianBlur)
-{
-    // Runs GaussianBlur() and ocl::GaussianBlur() with a 9x9 kernel and
-    // sigma 0 on square images of side 1000, 2000 and 4000, for four types.
-    Mat src, dst;
-    int all_type[] = {CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4};
-    std::string type_name[] = {"CV_8UC1", "CV_8UC4", "CV_32FC1", "CV_32FC4"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
-
-            gen(src, size, size, all_type[j], 0, 256);
-
-            // One call outside the CPU_ON/CPU_OFF bracket, then the bracketed call.
-            GaussianBlur(src, dst, Size(9, 9), 0);
-
-            CPU_ON;
-            GaussianBlur(src, dst, Size(9, 9), 0);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            // Device buffers are rebuilt for every subtest; the destination is
-            // sized up front. (The unused scratch matrix d_buf was dropped.)
-            ocl::oclMat d_src(src);
-            ocl::oclMat d_dst(src.size(), src.type());
-
-            WARMUP_ON;
-            ocl::GaussianBlur(d_src, d_dst, Size(9, 9), 0);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::GaussianBlur(d_src, d_dst, Size(9, 9), 0);
-            GPU_OFF;
-
-            // The GPU_FULL bracket also encloses the upload and the download.
-            GPU_FULL_ON;
-            d_src.upload(src);
-            ocl::GaussianBlur(d_src, d_dst, Size(9, 9), 0);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-///////////// equalizeHist ////////////////////////
-TEST(equalizeHist)
-{
-    // Runs equalizeHist() and ocl::equalizeHist() on square CV_8UC1 images
-    // of side 1000, 2000 and 4000.
-    Mat src, dst;
-    int all_type[] = {CV_8UC1};
-    std::string type_name[] = {"CV_8UC1"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
-
-            gen(src, size, size, all_type[j], 0, 256);
-
-            // One call outside the CPU_ON/CPU_OFF bracket, then the bracketed call.
-            equalizeHist(src, dst);
-
-            CPU_ON;
-            equalizeHist(src, dst);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            // Device buffers are rebuilt for every subtest. (The unused
-            // matrices d_hist and d_buf were dropped.)
-            ocl::oclMat d_src(src);
-            ocl::oclMat d_dst;
-
-            WARMUP_ON;
-            ocl::equalizeHist(d_src, d_dst);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::equalizeHist(d_src, d_dst);
-            GPU_OFF;
-
-            // The GPU_FULL bracket also encloses the upload and the download.
-            GPU_FULL_ON;
-            d_src.upload(src);
-            ocl::equalizeHist(d_src, d_dst);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-/////////// CopyMakeBorder //////////////////////
-TEST(CopyMakeBorder)
-{
-    // Runs copyMakeBorder() and ocl::copyMakeBorder() with margins
-    // top 7, bottom 5, left 5, right 7, BORDER_CONSTANT and fill Scalar(1.0),
-    // on square images of side 1000, 2000 and 4000, for CV_8UC1 and CV_8UC4.
-    Mat hostSrc, hostDst;
-#ifdef USE_OPENCL
-    ocl::oclMat devDst;
-#endif
-    const int border = BORDER_CONSTANT;
-    const int types[] = {CV_8UC1, CV_8UC4};
-    const std::string typeNames[] = {"CV_8UC1", "CV_8UC4"};
-    const size_t typeCount = sizeof(types) / sizeof(types[0]);
-
-    for (int side = 1000; side <= 4000; side *= 2)
-    {
-        for (size_t t = 0; t < typeCount; ++t)
-        {
-            SUBTEST << side << 'x' << side << "; " << typeNames[t] ;
-
-            gen(hostSrc, side, side, types[t], 0, 256);
-
-            // One call outside the CPU_ON/CPU_OFF bracket, then the bracketed call.
-            copyMakeBorder(hostSrc, hostDst, 7, 5, 5, 7, border, cv::Scalar(1.0));
-
-            CPU_ON;
-            copyMakeBorder(hostSrc, hostDst, 7, 5, 5, 7, border, cv::Scalar(1.0));
-            CPU_OFF;
-#ifdef USE_OPENCL
-            // The device source is constructed from the host image each subtest.
-            ocl::oclMat devSrc(hostSrc);
-
-            WARMUP_ON;
-            ocl::copyMakeBorder(devSrc, devDst, 7, 5, 5, 7, border, cv::Scalar(1.0));
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::copyMakeBorder(devSrc, devDst, 7, 5, 5, 7, border, cv::Scalar(1.0));
-            GPU_OFF;
-
-            // The GPU_FULL bracket also encloses the upload and the download.
-            GPU_FULL_ON;
-            devSrc.upload(hostSrc);
-            ocl::copyMakeBorder(devSrc, devDst, 7, 5, 5, 7, border, cv::Scalar(1.0));
-            devDst.download(hostDst);
-            GPU_FULL_OFF;
-#endif
-        }
-    }
-}
-///////////// cornerMinEigenVal ////////////////////////
-TEST(cornerMinEigenVal)
-{
-    // Runs cornerMinEigenVal() and ocl::cornerMinEigenVal() with block size 7
-    // and BORDER_REFLECT on square images of side 1000, 2000 and 4000,
-    // for CV_8UC1 and CV_32FC1.
-    Mat hostSrc, hostDst;
-#ifdef USE_OPENCL
-    ocl::oclMat devDst;
-#endif
-    int block = 7;
-    // The aperture is drawn once per test via rand(): one of 1, 3, 5 or 7.
-    int aperture = 1 + 2 * (rand() % 4);
-    int border = BORDER_REFLECT;
-    const int types[] = {CV_8UC1, CV_32FC1};
-    const std::string typeNames[] = {"CV_8UC1", "CV_32FC1"};
-    const size_t typeCount = sizeof(types) / sizeof(types[0]);
-
-    for (int side = 1000; side <= 4000; side *= 2)
-    {
-        for (size_t t = 0; t < typeCount; ++t)
-        {
-            SUBTEST << side << 'x' << side << "; " << typeNames[t] ;
-
-            gen(hostSrc, side, side, types[t], 0, 256);
-
-            // One call outside the CPU_ON/CPU_OFF bracket, then the bracketed call.
-            cornerMinEigenVal(hostSrc, hostDst, block, aperture, border);
-
-            CPU_ON;
-            cornerMinEigenVal(hostSrc, hostDst, block, aperture, border);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            // The device source is constructed from the host image each subtest.
-            ocl::oclMat devSrc(hostSrc);
-
-            WARMUP_ON;
-            ocl::cornerMinEigenVal(devSrc, devDst, block, aperture, border);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::cornerMinEigenVal(devSrc, devDst, block, aperture, border);
-            GPU_OFF;
-
-            // The GPU_FULL bracket also encloses the upload and the download.
-            GPU_FULL_ON;
-            devSrc.upload(hostSrc);
-            ocl::cornerMinEigenVal(devSrc, devDst, block, aperture, border);
-            devDst.download(hostDst);
-            GPU_FULL_OFF;
-#endif
-        }
-    }
-}
-///////////// cornerHarris ////////////////////////
-TEST(cornerHarris)
-{
-    // Runs cornerHarris() and ocl::cornerHarris() (block size 5, aperture 7,
-    // k = 0.1, BORDER_REFLECT) on square images of side 1000, 2000 and 4000,
-    // for CV_8UC1 and CV_32FC1.
-    Mat hostSrc, hostDst;
-#ifdef USE_OPENCL
-    ocl::oclMat devSrc, devDst;
-#endif
-    const int types[] = {CV_8UC1, CV_32FC1};
-    const std::string typeNames[] = {"CV_8UC1", "CV_32FC1"};
-    const size_t typeCount = sizeof(types) / sizeof(types[0]);
-
-    for (int side = 1000; side <= 4000; side *= 2)
-    {
-        for (size_t t = 0; t < typeCount; ++t)
-        {
-            SUBTEST << side << 'x' << side << "; " << typeNames[t] << " ; BORDER_REFLECT";
-
-            gen(hostSrc, side, side, types[t], 0, 1);
-
-            // One call outside the CPU_ON/CPU_OFF bracket, then the bracketed call.
-            cornerHarris(hostSrc, hostDst, 5, 7, 0.1, BORDER_REFLECT);
-
-            CPU_ON;
-            cornerHarris(hostSrc, hostDst, 5, 7, 0.1, BORDER_REFLECT);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            devSrc.upload(hostSrc);
-
-            WARMUP_ON;
-            ocl::cornerHarris(devSrc, devDst, 5, 7, 0.1, BORDER_REFLECT);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::cornerHarris(devSrc, devDst, 5, 7, 0.1, BORDER_REFLECT);
-            GPU_OFF;
-
-            // The GPU_FULL bracket also encloses the upload and the download.
-            GPU_FULL_ON;
-            devSrc.upload(hostSrc);
-            ocl::cornerHarris(devSrc, devDst, 5, 7, 0.1, BORDER_REFLECT);
-            devDst.download(hostDst);
-            GPU_FULL_OFF;
-#endif
-        }
-    }
-}
-///////////// integral ////////////////////////
-TEST(integral)
-{
-    // Runs integral() and ocl::integral() on square CV_8UC1 images of side
-    // 1000, 2000 and 4000. (The unused device matrix d_buf was dropped.)
-    Mat src, sum;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src, d_sum;
-#endif
-    int all_type[] = {CV_8UC1};
-    std::string type_name[] = {"CV_8UC1"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j]  ;
-
-            gen(src, size, size, all_type[j], 0, 256);
-
-            // One call outside the CPU_ON/CPU_OFF bracket, then the bracketed call.
-            integral(src, sum);
-
-            CPU_ON;
-            integral(src, sum);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src.upload(src);
-
-            WARMUP_ON;
-            ocl::integral(d_src, d_sum);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::integral(d_src, d_sum);
-            GPU_OFF;
-
-            // The GPU_FULL bracket also encloses the upload and the download.
-            GPU_FULL_ON;
-            d_src.upload(src);
-            ocl::integral(d_src, d_sum);
-            d_sum.download(sum);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-///////////// WarpAffine ////////////////////////
-TEST(WarpAffine)
-{
-    // Runs warpAffine() and ocl::warpAffine() with INTER_NEAREST on square
-    // images of side 1000, 2000 and 4000, for CV_8UC1 and CV_8UC4.
-    Mat hostSrc, hostDst;
-#ifdef USE_OPENCL
-    ocl::oclMat devSrc, devDst;
-#endif
-    // 2x3 transform: rotation terms for the angle 3.14 / 6 plus a (100, -100) shift.
-    static const double coeffs[2][3] =
-    {
-        {cos(3.14 / 6), -sin(3.14 / 6), 100.0},
-        {sin(3.14 / 6), cos(3.14 / 6), -100.0}
-    };
-    // Header over the static table; no copy of the coefficients is made here.
-    Mat transform(2, 3, CV_64F, (void *)coeffs);
-    const int interp = INTER_NEAREST;
-
-    const int types[] = {CV_8UC1, CV_8UC4};
-    const std::string typeNames[] = {"CV_8UC1", "CV_8UC4"};
-    const size_t typeCount = sizeof(types) / sizeof(types[0]);
-
-    for (int side = 1000; side <= 4000; side *= 2)
-    {
-        for (size_t t = 0; t < typeCount; ++t)
-        {
-            SUBTEST << side << 'x' << side << "; " << typeNames[t] ;
-
-            gen(hostSrc, side, side, types[t], 0, 256);
-            gen(hostDst, side, side, types[t], 0, 256);
-            Size outSize = Size(side, side);
-
-            // One call outside the CPU_ON/CPU_OFF bracket, then the bracketed call.
-            warpAffine(hostSrc, hostDst, transform, outSize, interp);
-
-            CPU_ON;
-            warpAffine(hostSrc, hostDst, transform, outSize, interp);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            devSrc.upload(hostSrc);
-
-            WARMUP_ON;
-            ocl::warpAffine(devSrc, devDst, transform, outSize, interp);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::warpAffine(devSrc, devDst, transform, outSize, interp);
-            GPU_OFF;
-
-            // The GPU_FULL bracket also encloses the upload and the download.
-            GPU_FULL_ON;
-            devSrc.upload(hostSrc);
-            ocl::warpAffine(devSrc, devDst, transform, outSize, interp);
-            devDst.download(hostDst);
-            GPU_FULL_OFF;
-#endif
-        }
-    }
-}
-///////////// WarpPerspective ////////////////////////
-TEST(WarpPerspective)
-{
-    // Runs warpPerspective() and ocl::warpPerspective() with INTER_NEAREST on
-    // square images of side 1000, 2000 and 4000, for CV_8UC1 and CV_8UC4.
-    Mat hostSrc, hostDst;
-#ifdef USE_OPENCL
-    ocl::oclMat devSrc, devDst;
-#endif
-    // 3x3 transform: rotation terms for the angle 3.14 / 6, a (100, -100)
-    // shift, and a last row of (0, 0, 1).
-    static const double coeffs[3][3] =
-    {
-        {cos(3.14 / 6), -sin(3.14 / 6), 100.0},
-        {sin(3.14 / 6), cos(3.14 / 6), -100.0},
-        {0.0, 0.0, 1.0}
-    };
-    // Header over the static table; no copy of the coefficients is made here.
-    Mat transform(3, 3, CV_64F, (void *)coeffs);
-    const int interp = INTER_NEAREST;
-
-    const int types[] = {CV_8UC1, CV_8UC4};
-    const std::string typeNames[] = {"CV_8UC1", "CV_8UC4"};
-    const size_t typeCount = sizeof(types) / sizeof(types[0]);
-
-    for (int side = 1000; side <= 4000; side *= 2)
-    {
-        for (size_t t = 0; t < typeCount; ++t)
-        {
-            SUBTEST << side << 'x' << side << "; " << typeNames[t] ;
-
-            gen(hostSrc, side, side, types[t], 0, 256);
-            gen(hostDst, side, side, types[t], 0, 256);
-            Size outSize = Size(side, side);
-
-            // One call outside the CPU_ON/CPU_OFF bracket, then the bracketed call.
-            warpPerspective(hostSrc, hostDst, transform, outSize, interp);
-
-            CPU_ON;
-            warpPerspective(hostSrc, hostDst, transform, outSize, interp);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            devSrc.upload(hostSrc);
-
-            WARMUP_ON;
-            ocl::warpPerspective(devSrc, devDst, transform, outSize, interp);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::warpPerspective(devSrc, devDst, transform, outSize, interp);
-            GPU_OFF;
-
-            // The GPU_FULL bracket also encloses the upload and the download.
-            GPU_FULL_ON;
-            devSrc.upload(hostSrc);
-            ocl::warpPerspective(devSrc, devDst, transform, outSize, interp);
-            devDst.download(hostDst);
-            GPU_FULL_OFF;
-#endif
-        }
-    }
-}
-
-///////////// resize ////////////////////////
-TEST(resize)
-{
-    // Runs resize() and ocl::resize() on square images of side 1000, 2000 and
-    // 4000 for CV_8UC1 and CV_8UC4: first every "up" subtest (scale 2.0 on
-    // both axes), then every "down" subtest (scale 0.5 on both axes).
-    Mat hostSrc, hostDst;
-#ifdef USE_OPENCL
-    ocl::oclMat devSrc, devDst;
-#endif
-
-    const int types[] = {CV_8UC1, CV_8UC4};
-    const std::string typeNames[] = {"CV_8UC1", "CV_8UC4"};
-    const size_t typeCount = sizeof(types) / sizeof(types[0]);
-
-    const double factors[] = {2.0, 0.5};
-    const char *const suffixes[] = {" ; up", " ; down"};
-    const size_t passCount = sizeof(factors) / sizeof(factors[0]);
-
-    for (size_t pass = 0; pass < passCount; ++pass)
-    {
-        const double factor = factors[pass];
-
-        for (int side = 1000; side <= 4000; side *= 2)
-        {
-            for (size_t t = 0; t < typeCount; ++t)
-            {
-                SUBTEST << side << 'x' << side << "; " << typeNames[t] << suffixes[pass];
-
-                gen(hostSrc, side, side, types[t], 0, 256);
-
-                // One call outside the CPU_ON/CPU_OFF bracket, then the bracketed call.
-                resize(hostSrc, hostDst, Size(), factor, factor);
-
-                CPU_ON;
-                resize(hostSrc, hostDst, Size(), factor, factor);
-                CPU_OFF;
-#ifdef USE_OPENCL
-                devSrc.upload(hostSrc);
-
-                WARMUP_ON;
-                ocl::resize(devSrc, devDst, Size(), factor, factor);
-                WARMUP_OFF;
-
-                GPU_ON;
-                ocl::resize(devSrc, devDst, Size(), factor, factor);
-                GPU_OFF;
-
-                // The GPU_FULL bracket also encloses the upload and the download.
-                GPU_FULL_ON;
-                devSrc.upload(hostSrc);
-                ocl::resize(devSrc, devDst, Size(), factor, factor);
-                devDst.download(hostDst);
-                GPU_FULL_OFF;
-#endif
-            }
-        }
-    }
-}
-///////////// threshold////////////////////////
-TEST(threshold)
-{
-    // Runs threshold() and ocl::threshold() (thresh 50.0, maxval 0.0) on
-    // square images of side 1000, 2000 and 4000: first CV_8U with
-    // THRESH_BINARY, then CV_32FC1 with THRESH_TRUNC.
-    Mat hostSrc, hostDst;
-#ifdef USE_OPENCL
-    ocl::oclMat devSrc, devDst;
-#endif
-
-    const int types[] = {CV_8U, CV_32FC1};
-    const int modes[] = {THRESH_BINARY, THRESH_TRUNC};
-    const char *const labels[] = {"; 8UC1; THRESH_BINARY", "; 32FC1; THRESH_TRUNC [NPP]"};
-    const size_t passCount = sizeof(types) / sizeof(types[0]);
-
-    for (size_t pass = 0; pass < passCount; ++pass)
-    {
-        const int mode = modes[pass];
-
-        for (int side = 1000; side <= 4000; side *= 2)
-        {
-            SUBTEST << side << 'x' << side << labels[pass];
-
-            gen(hostSrc, side, side, types[pass], 0, 100);
-
-            // One call outside the CPU_ON/CPU_OFF bracket, then the bracketed call.
-            threshold(hostSrc, hostDst, 50.0, 0.0, mode);
-
-            CPU_ON;
-            threshold(hostSrc, hostDst, 50.0, 0.0, mode);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            devSrc.upload(hostSrc);
-
-            WARMUP_ON;
-            ocl::threshold(devSrc, devDst, 50.0, 0.0, mode);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::threshold(devSrc, devDst, 50.0, 0.0, mode);
-            GPU_OFF;
-
-            // The GPU_FULL bracket also encloses the upload and the download.
-            GPU_FULL_ON;
-            devSrc.upload(hostSrc);
-            ocl::threshold(devSrc, devDst, 50.0, 0.0, mode);
-            devDst.download(hostDst);
-            GPU_FULL_OFF;
-#endif
-        }
-    }
-}
-///////////// meanShiftFiltering////////////////////////
-TEST(meanShiftFiltering)
-{
-    // Runs pyrMeanShiftFiltering() on CV_8UC3 input and
-    // ocl::meanShiftFiltering() on CV_8UC4 input (sp = 10, sr = 10) for
-    // square images of side 1000, 2000 and 4000.
-    const int spatialRadius = 10;
-    const int colorRadius = 10;
-
-    Mat hostSrc, hostDst;
-#ifdef USE_OPENCL
-    ocl::oclMat devSrc, devDst;
-#endif
-
-    for (int side = 1000; side <= 4000; side *= 2)
-    {
-        SUBTEST << side << 'x' << side << "; 8UC3 vs 8UC4";
-
-        gen(hostSrc, side, side, CV_8UC3, Scalar::all(0), Scalar::all(256));
-
-        // One call outside the CPU_ON/CPU_OFF bracket, then the bracketed call.
-        pyrMeanShiftFiltering(hostSrc, hostDst, spatialRadius, colorRadius);
-
-        CPU_ON;
-        pyrMeanShiftFiltering(hostSrc, hostDst, spatialRadius, colorRadius);
-        CPU_OFF;
-#ifdef USE_OPENCL
-        // The OpenCL path gets a freshly generated 4-channel image.
-        gen(hostSrc, side, side, CV_8UC4, Scalar::all(0), Scalar::all(256));
-
-        devSrc.upload(hostSrc);
-
-        WARMUP_ON;
-        ocl::meanShiftFiltering(devSrc, devDst, spatialRadius, colorRadius);
-        WARMUP_OFF;
-
-        GPU_ON;
-        ocl::meanShiftFiltering(devSrc, devDst, spatialRadius, colorRadius);
-        GPU_OFF;
-
-        // The GPU_FULL bracket also encloses the upload and the download.
-        GPU_FULL_ON;
-        devSrc.upload(hostSrc);
-        ocl::meanShiftFiltering(devSrc, devDst, spatialRadius, colorRadius);
-        devDst.download(hostDst);
-        GPU_FULL_OFF;
-#endif
-    }
-}
-///////////// meanShiftProc////////////////////////
-COOR do_meanShift(int x0, int y0, uchar *sptr, uchar *dptr, int sstep, cv::Size size, int sp, int sr, int maxIter, float eps, int *tab)
-{ // Mean-shift iteration for the pixel at (x0, y0): writes the converged colour to dptr[0..3] and returns the final position.
-    // sptr points at the source pixel (4 bytes per pixel, row stride sstep); tab is indexed with (difference + 255).
-    int isr2 = sr * sr; // threshold for the summed tab[] values of the three colour channels
-    int c0, c1, c2, c3;
-    int iter;
-    uchar *ptr = NULL;
-    uchar *pstart = NULL;
-    int revx = 0, revy = 0; // displacement of the window centre produced by the previous iteration
-    c0 = sptr[0];
-    c1 = sptr[1];
-    c2 = sptr[2];
-    c3 = sptr[3]; // fourth channel is never updated; it is copied to dptr[3] at the end
-
-    // iterate meanshift procedure
-    for (iter = 0; iter < maxIter; iter++)
-    {
-        int count = 0;
-        int s0 = 0, s1 = 0, s2 = 0, sx = 0, sy = 0; // channel sums and coordinate sums of accepted pixels
-
-        //mean shift: process pixels in window (p-sigmaSp)x(p+sigmaSp)
-        int minx = x0 - sp;
-        int miny = y0 - sp;
-        int maxx = x0 + sp;
-        int maxy = y0 + sp;
-
-        //deal with the image boundary
-        if (minx < 0)
-        {
-            minx = 0;
-        }
-
-        if (miny < 0)
-        {
-            miny = 0;
-        }
-
-        if (maxx >= size.width)
-        {
-            maxx = size.width - 1;
-        }
-
-        if (maxy >= size.height)
-        {
-            maxy = size.height - 1;
-        }
-
-        if (iter == 0)
-        {
-            pstart = sptr;
-        }
-        else
-        {
-            pstart = pstart + revy * sstep + (revx << 2); //point to the new position
-        }
-
-        ptr = pstart;
-        ptr = ptr + (miny - y0) * sstep + ((minx - x0) << 2); //point to the start in the row
-
-        for (int y = miny; y <= maxy; y++, ptr += sstep - ((maxx - minx + 1) << 2)) // rewind by one window row, advance one image row
-        {
-            int rowCount = 0;
-            int x = minx;
-#if CV_ENABLE_UNROLLED
-            // Unrolled variant: four pixels (16 bytes) per step; the scalar loop below handles the remainder.
-            for (; x + 4 <= maxx; x += 4, ptr += 16)
-            {
-                int t0, t1, t2;
-                t0 = ptr[0], t1 = ptr[1], t2 = ptr[2];
-
-                if (tab[t0 - c0 + 255] + tab[t1 - c1 + 255] + tab[t2 - c2 + 255] <= isr2)
-                {
-                    s0 += t0;
-                    s1 += t1;
-                    s2 += t2;
-                    sx += x;
-                    rowCount++;
-                }
-
-                t0 = ptr[4], t1 = ptr[5], t2 = ptr[6];
-
-                if (tab[t0 - c0 + 255] + tab[t1 - c1 + 255] + tab[t2 - c2 + 255] <= isr2)
-                {
-                    s0 += t0;
-                    s1 += t1;
-                    s2 += t2;
-                    sx += x + 1;
-                    rowCount++;
-                }
-
-                t0 = ptr[8], t1 = ptr[9], t2 = ptr[10];
-
-                if (tab[t0 - c0 + 255] + tab[t1 - c1 + 255] + tab[t2 - c2 + 255] <= isr2)
-                {
-                    s0 += t0;
-                    s1 += t1;
-                    s2 += t2;
-                    sx += x + 2;
-                    rowCount++;
-                }
-
-                t0 = ptr[12], t1 = ptr[13], t2 = ptr[14];
-
-                if (tab[t0 - c0 + 255] + tab[t1 - c1 + 255] + tab[t2 - c2 + 255] <= isr2)
-                {
-                    s0 += t0;
-                    s1 += t1;
-                    s2 += t2;
-                    sx += x + 3;
-                    rowCount++;
-                }
-            }
-
-#endif
-
-            for (; x <= maxx; x++, ptr += 4)
-            {
-                int t0 = ptr[0], t1 = ptr[1], t2 = ptr[2];
-
-                if (tab[t0 - c0 + 255] + tab[t1 - c1 + 255] + tab[t2 - c2 + 255] <= isr2)
-                {
-                    s0 += t0;
-                    s1 += t1;
-                    s2 += t2;
-                    sx += x;
-                    rowCount++;
-                }
-            }
-
-            if (rowCount == 0)
-            {
-                continue;
-            }
-
-            count += rowCount;
-            sy += y * rowCount; // every accepted pixel of this row contributes the same y
-        }
-
-        if (count == 0) // no pixel accepted in the window: keep the current centre and colour
-        {
-            break;
-        }
-
-        int x1 = sx / count; // integer means of position and colour
-        int y1 = sy / count;
-        s0 = s0 / count;
-        s1 = s1 / count;
-        s2 = s2 / count;
-
-        bool stopFlag = (x0 == x1 && y0 == y1) || (abs(x1 - x0) + abs(y1 - y0) +
-                        tab[s0 - c0 + 255] + tab[s1 - c1 + 255] + tab[s2 - c2 + 255] <= eps);
-
-        //revise the pointer corresponding to the new (y0,x0)
-        revx = x1 - x0;
-        revy = y1 - y0;
-
-        x0 = x1;
-        y0 = y1;
-        c0 = s0;
-        c1 = s1;
-        c2 = s2;
-
-        if (stopFlag) // the state above is updated before leaving, so the last mean is what gets written out
-        {
-            break;
-        }
-    } //for iter
-
-    dptr[0] = (uchar)c0;
-    dptr[1] = (uchar)c1;
-    dptr[2] = (uchar)c2;
-    dptr[3] = (uchar)c3;
-
-    COOR coor;
-    coor.x = static_cast<short>(x0);
-    coor.y = static_cast<short>(y0);
-    return coor;
-}
-
-// Host-side mean-shift pass over a whole image.
-//
-// For every pixel of src_roi this calls do_meanShift(), which writes the
-// resulting colour into dst_roi, and stores the returned COOR into
-// dstCoor_roi.
-//
-// src_roi      non-empty 8-bit, 4-channel image (anything else raises CV_Error).
-// dst_roi      same width and height as src_roi; written through 4-byte pixels.
-// dstCoor_roi  same width and height as src_roi; written as two shorts per
-//              pixel, and its step must be a multiple of 4.
-// sp, sr       forwarded unchanged to do_meanShift().
-// crit         iteration count (default 5 when MAX_ITER is not set, clamped
-//              to [1, 100]) and epsilon (default 1 when EPS is not set,
-//              otherwise clamped to be non-negative).
-void meanShiftProc_(const Mat &src_roi, Mat &dst_roi, Mat &dstCoor_roi, int sp, int sr, cv::TermCriteria crit)
-{
-
-    if (src_roi.empty())
-    {
-        CV_Error(CV_StsBadArg, "The input image is empty");
-    }
-
-    if (src_roi.depth() != CV_8U || src_roi.channels() != 4)
-    {
-        CV_Error(CV_StsUnsupportedFormat, "Only 8-bit, 4-channel images are supported");
-    }
-
-    CV_Assert((src_roi.cols == dst_roi.cols) && (src_roi.rows == dst_roi.rows) &&
-              (src_roi.cols == dstCoor_roi.cols) && (src_roi.rows == dstCoor_roi.rows));
-    CV_Assert(!(dstCoor_roi.step & 0x3));
-
-    if (!(crit.type & cv::TermCriteria::MAX_ITER))
-    {
-        crit.maxCount = 5;
-    }
-
-    int maxIter = std::min(std::max(crit.maxCount, 1), 100);
-
-    // Previously the 1.f fallback was assigned and then unconditionally
-    // overwritten with crit.epsilon, so it never took effect. The two cases
-    // are now mutually exclusive.
-    float eps = 1.f;
-
-    if (crit.type & cv::TermCriteria::EPS)
-    {
-        eps = (float)std::max(crit.epsilon, 0.0);
-    }
-
-    // tab[d + 255] == d * d for every channel difference d in [-255, 255].
-    int tab[512];
-
-    for (int i = 0; i < 512; i++)
-    {
-        tab[i] = (i - 255) * (i - 255);
-    }
-
-    uchar *sptr = src_roi.data;
-    uchar *dptr = dst_roi.data;
-    short *dCoorptr = (short *)dstCoor_roi.data;
-    int sstep = (int)src_roi.step;
-    int dstep = (int)dst_roi.step;
-    int dCoorstep = (int)dstCoor_roi.step >> 1; // byte step converted to a count of shorts
-    cv::Size size = src_roi.size();
-
-    // The inner loop advances each pointer by one pixel; the outer increment
-    // rewinds by one row's worth of pixels and adds the row step.
-    for (int i = 0; i < size.height; i++, sptr += sstep - (size.width << 2),
-            dptr += dstep - (size.width << 2), dCoorptr += dCoorstep - (size.width << 1))
-    {
-        for (int j = 0; j < size.width; j++, sptr += 4, dptr += 4, dCoorptr += 2)
-        {
-            *((COOR *)dCoorptr) = do_meanShift(j, i, sptr, dptr, sstep, size, sp, sr, maxIter, eps, tab);
-        }
-    }
-
-}
-TEST(meanShiftProc)
-{
-    // Runs meanShiftProc_() and ocl::meanShiftProc() (sp = 5, sr = 6,
-    // criteria COUNT + EPS with 5 iterations and epsilon 1) on square
-    // CV_8UC4 images of side 1000, 2000 and 4000.
-    Mat hostSrc, hostDst, hostCoords;
-#ifdef USE_OPENCL
-    ocl::oclMat devSrc, devDst, devCoords;
-#endif
-    TermCriteria criteria(TermCriteria::COUNT + TermCriteria::EPS, 5, 1);
-
-    for (int side = 1000; side <= 4000; side *= 2)
-    {
-        SUBTEST << side << 'x' << side << "; 8UC4 and CV_16SC2 ";
-
-        gen(hostSrc, side, side, CV_8UC4, Scalar::all(0), Scalar::all(256));
-        gen(hostDst, side, side, CV_8UC4, Scalar::all(0), Scalar::all(256));
-        gen(hostCoords, side, side, CV_16SC2, Scalar::all(0), Scalar::all(256));
-
-        // One call outside the CPU_ON/CPU_OFF bracket, then the bracketed call.
-        meanShiftProc_(hostSrc, hostDst, hostCoords, 5, 6, criteria);
-
-        CPU_ON;
-        meanShiftProc_(hostSrc, hostDst, hostCoords, 5, 6, criteria);
-        CPU_OFF;
-#ifdef USE_OPENCL
-        devSrc.upload(hostSrc);
-
-        WARMUP_ON;
-        ocl::meanShiftProc(devSrc, devDst, devCoords, 5, 6, criteria);
-        WARMUP_OFF;
-
-        GPU_ON;
-        ocl::meanShiftProc(devSrc, devDst, devCoords, 5, 6, criteria);
-        GPU_OFF;
-
-        // The GPU_FULL bracket also encloses the upload and both downloads.
-        GPU_FULL_ON;
-        devSrc.upload(hostSrc);
-        ocl::meanShiftProc(devSrc, devDst, devCoords, 5, 6, criteria);
-        devDst.download(hostDst);
-        devCoords.download(hostCoords);
-        GPU_FULL_OFF;
-#endif
-    }
-}
-///////////// ConvertTo////////////////////////
-TEST(ConvertTo)
-{
-    // Runs Mat::convertTo() and oclMat::convertTo() to CV_32FC1 on square
-    // images of side 1000, 2000 and 4000, for CV_8UC1 and CV_8UC4 sources.
-    Mat hostSrc, hostDst;
-#ifdef USE_OPENCL
-    ocl::oclMat devSrc, devDst;
-#endif
-    const int types[] = {CV_8UC1, CV_8UC4};
-    const std::string typeNames[] = {"CV_8UC1", "CV_8UC4"};
-    const size_t typeCount = sizeof(types) / sizeof(types[0]);
-
-    for (int side = 1000; side <= 4000; side *= 2)
-    {
-        for (size_t t = 0; t < typeCount; ++t)
-        {
-            SUBTEST << side << 'x' << side << "; " << typeNames[t] << " to 32FC1";
-
-            gen(hostSrc, side, side, types[t], 0, 256);
-
-            // One call outside the CPU_ON/CPU_OFF bracket, then the bracketed call.
-            hostSrc.convertTo(hostDst, CV_32FC1);
-
-            CPU_ON;
-            hostSrc.convertTo(hostDst, CV_32FC1);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            devSrc.upload(hostSrc);
-
-            WARMUP_ON;
-            devSrc.convertTo(devDst, CV_32FC1);
-            WARMUP_OFF;
-
-            GPU_ON;
-            devSrc.convertTo(devDst, CV_32FC1);
-            GPU_OFF;
-
-            // The GPU_FULL bracket also encloses the upload and the download.
-            GPU_FULL_ON;
-            devSrc.upload(hostSrc);
-            devSrc.convertTo(devDst, CV_32FC1);
-            devDst.download(hostDst);
-            GPU_FULL_OFF;
-#endif
-        }
-    }
-}
-///////////// copyTo////////////////////////
-TEST(copyTo)
-{
-    // Runs Mat::copyTo() and oclMat::copyTo() on square images of side
-    // 1000, 2000 and 4000, for CV_8UC1 and CV_8UC4.
-    Mat hostSrc, hostDst;
-#ifdef USE_OPENCL
-    ocl::oclMat devSrc, devDst;
-#endif
-    const int types[] = {CV_8UC1, CV_8UC4};
-    const std::string typeNames[] = {"CV_8UC1", "CV_8UC4"};
-    const size_t typeCount = sizeof(types) / sizeof(types[0]);
-
-    for (int side = 1000; side <= 4000; side *= 2)
-    {
-        for (size_t t = 0; t < typeCount; ++t)
-        {
-            SUBTEST << side << 'x' << side << "; " << typeNames[t] ;
-
-            gen(hostSrc, side, side, types[t], 0, 256);
-
-            // One call outside the CPU_ON/CPU_OFF bracket, then the bracketed call.
-            hostSrc.copyTo(hostDst);
-
-            CPU_ON;
-            hostSrc.copyTo(hostDst);
-            CPU_OFF;
-
-#ifdef USE_OPENCL
-            devSrc.upload(hostSrc);
-
-            WARMUP_ON;
-            devSrc.copyTo(devDst);
-            WARMUP_OFF;
-
-            GPU_ON;
-            devSrc.copyTo(devDst);
-            GPU_OFF;
-
-            // The GPU_FULL bracket also encloses the upload and the download.
-            GPU_FULL_ON;
-            devSrc.upload(hostSrc);
-            devSrc.copyTo(devDst);
-            devDst.download(hostDst);
-            GPU_FULL_OFF;
-#endif
-        }
-    }
-}
-///////////// setTo////////////////////////
-TEST(setTo)
-{
-    // Runs Mat::setTo() and oclMat::setTo() with Scalar(1, 2, 3, 4) on square
-    // images of side 1000, 2000 and 4000, for CV_8UC1 and CV_8UC4.
-    // The operation is in place, so no destination matrices are needed.
-    // (The unused locals dst and d_dst were dropped.)
-    Mat src;
-    Scalar val(1, 2, 3, 4);
-#ifdef USE_OPENCL
-    ocl::oclMat d_src;
-#endif
-    int all_type[] = {CV_8UC1, CV_8UC4};
-    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
-
-            gen(src, size, size, all_type[j], 0, 256);
-
-            // One call outside the CPU_ON/CPU_OFF bracket, then the bracketed call.
-            src.setTo(val);
-
-            CPU_ON;
-            src.setTo(val);
-            CPU_OFF;
-#ifdef USE_OPENCL
-            d_src.upload(src);
-
-            WARMUP_ON;
-            d_src.setTo(val);
-            WARMUP_OFF;
-
-            GPU_ON;
-            d_src.setTo(val);
-            GPU_OFF;
-
-            // The GPU_FULL bracket encloses the upload; nothing is downloaded here.
-            GPU_FULL_ON;
-            d_src.upload(src);
-            d_src.setTo(val);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-///////////// Merge////////////////////////
-TEST(Merge)
-{
-    // Runs merge() and ocl::merge() on four single-channel planes, for square
-    // sizes of side 1000, 2000 and 4000 and plane types CV_8UC1 and CV_32FC1.
-    // Plane i is filled with the constant value i.
-    Mat dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_dst;
-#endif
-    int channels = 4;
-    int all_type[] = {CV_8UC1, CV_32FC1};
-    std::string type_name[] = {"CV_8UC1", "CV_32FC1"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
-            Size size1 = Size(size, size);
-            std::vector<Mat> src(channels);
-
-            for (int i = 0; i < channels; ++i)
-            {
-                src[i] = Mat(size1, all_type[j], cv::Scalar::all(i));
-            }
-
-            // One call outside the CPU_ON/CPU_OFF bracket, then the bracketed call.
-            merge(src, dst);
-
-            CPU_ON;
-            merge(src, dst);
-            CPU_OFF;
-
-#ifdef USE_OPENCL
-            std::vector<ocl::oclMat> d_src(channels);
-
-            for (int i = 0; i < channels; ++i)
-            {
-                d_src[i] = ocl::oclMat(size1, all_type[j], cv::Scalar::all(i));
-            }
-
-            WARMUP_ON;
-            ocl::merge(d_src, d_dst);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::merge(d_src, d_dst);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-
-            // Rebuild the planes with the subtest's own type. This section
-            // used to hard-code CV_8U, so the CV_32FC1 subtest measured an
-            // 8-bit merge here, unlike its CPU and GPU sections.
-            for (int i = 0; i < channels; ++i)
-            {
-                d_src[i] = ocl::oclMat(size1, all_type[j], cv::Scalar::all(i));
-            }
-
-            ocl::merge(d_src, d_dst);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-
-///////////// Split////////////////////////
-TEST(Split)
-{
-    //int channels = 4;
-    int all_type[] = {CV_8UC1, CV_32FC1};
-    std::string type_name[] = {"CV_8UC1", "CV_32FC1"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j];
-            Size size1 = Size(size, size);
-
-            Mat src(size1, CV_MAKE_TYPE(all_type[j], 4), cv::Scalar(1, 2, 3, 4));
-
-            std::vector<cv::Mat> dst;
-
-            split(src, dst);
-
-            CPU_ON;
-            split(src, dst);
-            CPU_OFF;
-
-#ifdef USE_OPENCL
-            ocl::oclMat d_src(size1, CV_MAKE_TYPE(all_type[j], 4), cv::Scalar(1, 2, 3, 4));
-            std::vector<cv::ocl::oclMat> d_dst;
-
-            WARMUP_ON;
-            ocl::split(d_src, d_dst);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::split(d_src, d_dst);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src.upload(src);
-            ocl::split(d_src, d_dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-
-
-///////////// norm////////////////////////
-TEST(norm)
-{
-    Mat src, buf;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src, d_buf;
-#endif
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        SUBTEST << size << 'x' << size << "; CV_8UC1; NORM_INF";
-
-        gen(src, size, size, CV_8UC1, Scalar::all(0), Scalar::all(1));
-        gen(buf, size, size, CV_8UC1, Scalar::all(0), Scalar::all(1));
-
-        norm(src, NORM_INF);
-
-        CPU_ON;
-        norm(src, NORM_INF);
-        CPU_OFF;
-
-#ifdef USE_OPENCL
-        d_src.upload(src);
-        d_buf.upload(buf);
-
-        WARMUP_ON;
-        ocl::norm(d_src, d_buf, NORM_INF);
-        WARMUP_OFF;
-
-        GPU_ON;
-        ocl::norm(d_src, d_buf, NORM_INF);
-        GPU_OFF;
-
-        GPU_FULL_ON;
-        d_src.upload(src);
-        ocl::norm(d_src, d_buf, NORM_INF);
-        GPU_FULL_OFF;
-#endif
-    }
-}
-///////////// remap////////////////////////
-TEST(remap)
-{
-    Mat src, dst, xmap, ymap;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src, d_dst, d_xmap, d_ymap;
-#endif
-    int all_type[] = {CV_8UC1, CV_8UC4};
-    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
-
-    int interpolation = INTER_LINEAR;
-    int borderMode = BORDER_CONSTANT;
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t t = 0; t < sizeof(all_type) / sizeof(int); t++)
-        {
-            SUBTEST << size << 'x' << size << "; src " << type_name[t] << "; map CV_32FC1";
-
-            gen(src, size, size, all_type[t], 0, 256);
-
-            xmap.create(size, size, CV_32FC1);
-            dst.create(size, size, CV_32FC1);
-            ymap.create(size, size, CV_32FC1);
-
-            for (int i = 0; i < size; ++i)
-            {
-                float *xmap_row = xmap.ptr<float>(i);
-                float *ymap_row = ymap.ptr<float>(i);
-
-                for (int j = 0; j < size; ++j)
-                {
-                    xmap_row[j] = (j - size * 0.5f) * 0.75f + size * 0.5f;
-                    ymap_row[j] = (i - size * 0.5f) * 0.75f + size * 0.5f;
-                }
-            }
-
-
-            remap(src, dst, xmap, ymap, interpolation, borderMode);
-
-            CPU_ON;
-            remap(src, dst, xmap, ymap, interpolation, borderMode);
-            CPU_OFF;
-
-#ifdef USE_OPENCL
-            d_src.upload(src);
-            d_dst.upload(dst);
-            d_xmap.upload(xmap);
-            d_ymap.upload(ymap);
-
-            WARMUP_ON;
-            ocl::remap(d_src, d_dst, d_xmap, d_ymap, interpolation, borderMode);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::remap(d_src, d_dst, d_xmap, d_ymap, interpolation, borderMode);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src.upload(src);
-            ocl::remap(d_src, d_dst, d_xmap, d_ymap, interpolation, borderMode);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-///////////// cvtColor////////////////////////
-TEST(cvtColor)
-{
-    Mat src, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src, d_dst;
-#endif
-    int all_type[] = {CV_8UC4};
-    std::string type_name[] = {"CV_8UC4"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            gen(src, size, size, all_type[j], 0, 256);
-            SUBTEST << size << "x" << size << "; " << type_name[j] << " ; CV_RGBA2GRAY";
-
-            cvtColor(src, dst, CV_RGBA2GRAY, 4);
-
-            CPU_ON;
-            cvtColor(src, dst, CV_RGBA2GRAY, 4);
-            CPU_OFF;
-
-#ifdef USE_OPENCL
-            d_src.upload(src);
-
-            WARMUP_ON;
-            ocl::cvtColor(d_src, d_dst, CV_RGBA2GRAY, 4);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::cvtColor(d_src, d_dst, CV_RGBA2GRAY, 4);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src.upload(src);
-            ocl::cvtColor(d_src, d_dst, CV_RGBA2GRAY, 4);
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-
-    }
-
-
-}
-///////////// filter2D////////////////////////
-TEST(filter2D)
-{
-    Mat src;
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        int all_type[] = {CV_8UC1, CV_8UC4};
-        std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
-
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            gen(src, size, size, all_type[j], 0, 256);
-
-            for (int ksize = 3; ksize <= 15; ksize = 2*ksize+1)
-            {
-                SUBTEST << "ksize = " << ksize << "; " << size << 'x' << size << "; " << type_name[j] ;
-
-                Mat kernel;
-                gen(kernel, ksize, ksize, CV_32FC1, 0.0, 1.0);
-
-                Mat dst;
-                cv::filter2D(src, dst, -1, kernel);
-
-                CPU_ON;
-                cv::filter2D(src, dst, -1, kernel);
-                CPU_OFF;
-#ifdef USE_OPENCL
-                ocl::oclMat d_src(src);
-                ocl::oclMat d_dst;
-
-                WARMUP_ON;
-                ocl::filter2D(d_src, d_dst, -1, kernel);
-                WARMUP_OFF;
-
-                GPU_ON;
-                ocl::filter2D(d_src, d_dst, -1, kernel);
-                GPU_OFF;
-
-                GPU_FULL_ON;
-                d_src.upload(src);
-                ocl::filter2D(d_src, d_dst, -1, kernel);
-                d_dst.download(dst);
-                GPU_FULL_OFF;
-#endif
-            }
-
-        }
-
-
-    }
-}
-
-
-///////////// dft ////////////////////////
-TEST(dft)
-{
-    Mat src, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src, d_dst;
-#endif
-
-    int all_type[] = {CV_32FC1, CV_32FC2};
-    std::string type_name[] = {"CV_32FC1", "CV_32FC2"};
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] << " ; complex-to-complex";
-
-            gen(src, size, size, all_type[j], Scalar::all(0), Scalar::all(1));
-
-            dft(src, dst);
-
-            CPU_ON;
-            dft(src, dst);
-            CPU_OFF;
-
-#ifdef USE_OPENCL
-            d_src.upload(src);
-
-            WARMUP_ON;
-            ocl::dft(d_src, d_dst, Size(size, size));
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::dft(d_src, d_dst, Size(size, size));
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src.upload(src);
-            ocl::dft(d_src, d_dst, Size(size, size));
-            d_dst.download(dst);
-            GPU_FULL_OFF;
-#endif
-        }
-
-    }
-}
-
-///////////// gemm ////////////////////////
-TEST(gemm)
-{
-    Mat src1, src2, src3, dst;
-#ifdef USE_OPENCL
-    ocl::oclMat d_src1, d_src2, d_src3, d_dst;
-#endif
-
-    for (int size = 1000; size <= 4000; size *= 2)
-    {
-        SUBTEST << size << 'x' << size;
-
-        gen(src1, size, size, CV_32FC1, Scalar::all(-10), Scalar::all(10));
-        gen(src2, size, size, CV_32FC1, Scalar::all(-10), Scalar::all(10));
-        gen(src3, size, size, CV_32FC1, Scalar::all(-10), Scalar::all(10));
-
-        gemm(src1, src2, 1.0, src3, 1.0, dst);
-
-        CPU_ON;
-        gemm(src1, src2, 1.0, src3, 1.0, dst);
-        CPU_OFF;
-
-#ifdef USE_OPENCL
-        d_src1.upload(src1);
-        d_src2.upload(src2);
-        d_src3.upload(src3);
-
-        WARMUP_ON;
-        ocl::gemm(d_src1, d_src2, 1.0, d_src3, 1.0, d_dst);
-        WARMUP_OFF;
-
-        GPU_ON;
-        ocl::gemm(d_src1, d_src2, 1.0, d_src3, 1.0, d_dst);
-        GPU_OFF;
-
-        GPU_FULL_ON;
-        d_src1.upload(src1);
-        d_src2.upload(src2);
-        d_src3.upload(src3);
-        ocl::gemm(d_src1, d_src2, 1.0, d_src3, 1.0, d_dst);
-        d_dst.download(dst);
-        GPU_FULL_OFF;
-#endif
-    }
-}
-
-int main(int argc, const char *argv[])
-{
-#ifdef USE_OPENCL
-    vector<ocl::Info> oclinfo;
-    int num_devices = getDevice(oclinfo);
-
-    if (num_devices < 1)
-    {
-        cerr << "no device found\n";
-        return -1;
-    }
-
-    int devidx = 0;
-
-    for (size_t i = 0; i < oclinfo.size(); i++)
-    {
-        for (size_t j = 0; j < oclinfo[i].DeviceName.size(); j++)
-        {
-            printf("device %d: %s\n", devidx++, oclinfo[i].DeviceName[j].c_str());
-        }
-    }
-
-#endif
-    redirectError(cvErrorCallback);
-
-    const char *keys =
-        "{ h | help    | false | print help message }"
-        "{ f | filter  |       | filter for test }"
-        "{ w | workdir |       | set working directory }"
-        "{ l | list    | false | show all tests }"
-        "{ d | device  | 0     | device id }"
-        "{ i | iters   | 10    | iteration count }"
-        "{ m | warmup  | 1     | gpu warm up iteration count}"
-        "{ t | xtop    | 1.1	  | xfactor top boundary}"
-        "{ b | xbottom | 0.9	  | xfactor bottom boundary}"
-        "{ v | verify  | false | only run gpu once to verify if problems occur}";
-
-    CommandLineParser cmd(argc, argv, keys);
-
-    if (cmd.get<bool>("help"))
-    {
-        cout << "Avaible options:" << endl;
-        cmd.printParams();
-        return 0;
-    }
-
-#ifdef USE_OPENCL
-    int device = cmd.get<int>("device");
-
-    if (device < 0 || device >= num_devices)
-    {
-        cerr << "Invalid device ID" << endl;
-        return -1;
-    }
-
-    if (cmd.get<bool>("verify"))
-    {
-        TestSystem::instance().setNumIters(1);
-        TestSystem::instance().setGPUWarmupIters(0);
-        TestSystem::instance().setCPUIters(0);
-    }
-
-    devidx = 0;
-
-    for (size_t i = 0; i < oclinfo.size(); i++)
-    {
-        for (size_t j = 0; j < oclinfo[i].DeviceName.size(); j++, devidx++)
-        {
-            if (device == devidx)
-            {
-                ocl::setDevice(oclinfo[i], (int)j);
-                TestSystem::instance().setRecordName(oclinfo[i].DeviceName[j]);
-                printf("\nuse %d: %s\n", devidx, oclinfo[i].DeviceName[j].c_str());
-                goto END_DEV;
-            }
-        }
-    }
-
-END_DEV:
-
-#endif
-    string filter = cmd.get<string>("filter");
-    string workdir = cmd.get<string>("workdir");
-    bool list = cmd.get<bool>("list");
-    int iters = cmd.get<int>("iters");
-    int wu_iters = cmd.get<int>("warmup");
-    double x_top = cmd.get<double>("xtop");
-    double x_bottom = cmd.get<double>("xbottom");
-
-    TestSystem::instance().setTopThreshold(x_top);
-    TestSystem::instance().setBottomThreshold(x_bottom);
-
-    if (!filter.empty())
-    {
-        TestSystem::instance().setTestFilter(filter);
-    }
-
-    if (!workdir.empty())
-    {
-        if (workdir[workdir.size() - 1] != '/' && workdir[workdir.size() - 1] != '\\')
-        {
-            workdir += '/';
-        }
-
-        TestSystem::instance().setWorkingDir(workdir);
-    }
-
-    if (list)
-    {
-        TestSystem::instance().setListMode(true);
-    }
-
-    TestSystem::instance().setNumIters(iters);
-    TestSystem::instance().setGPUWarmupIters(wu_iters);
-
-    TestSystem::instance().run();
-
-    return 0;
-}