Merge release 2.4.4

2013-03-04 19:29:40 +04:00
parent 52a45ed191 facab40745
commit 9e12b7c3c2
64 changed files with 11575 additions and 3169 deletions
--- a/modules/gpu/perf/perf_calib3d.cpp
+++ b/modules/gpu/perf/perf_calib3d.cpp
@@ -3,15 +3,14 @@
 using namespace std;
 using namespace testing;

-namespace {
-
 //////////////////////////////////////////////////////////////////////
 // StereoBM

 typedef std::tr1::tuple<string, string> pair_string;
 DEF_PARAM_TEST_1(ImagePair, pair_string);

-PERF_TEST_P(ImagePair, Calib3D_StereoBM, Values(pair_string("gpu/perf/aloe.png", "gpu/perf/aloeR.png")))
+PERF_TEST_P(ImagePair, Calib3D_StereoBM,
+            Values(pair_string("gpu/perf/aloe.png", "gpu/perf/aloeR.png")))
 {
    declare.time(5.0);

@@ -28,18 +27,13 @@ PERF_TEST_P(ImagePair, Calib3D_StereoBM, Values(pair_string("gpu/perf/aloe.png",
    {
        cv::gpu::StereoBM_GPU d_bm(preset, ndisp);

-        cv::gpu::GpuMat d_imgLeft(imgLeft);
-        cv::gpu::GpuMat d_imgRight(imgRight);
-        cv::gpu::GpuMat d_dst;
+        const cv::gpu::GpuMat d_imgLeft(imgLeft);
+        const cv::gpu::GpuMat d_imgRight(imgRight);
+        cv::gpu::GpuMat dst;

-        d_bm(d_imgLeft, d_imgRight, d_dst);
+        TEST_CYCLE() d_bm(d_imgLeft, d_imgRight, dst);

-        TEST_CYCLE()
-        {
-            d_bm(d_imgLeft, d_imgRight, d_dst);
-        }
-
-        GPU_SANITY_CHECK(d_dst);
+        GPU_SANITY_CHECK(dst);
    }
    else
    {
@@ -47,12 +41,7 @@ PERF_TEST_P(ImagePair, Calib3D_StereoBM, Values(pair_string("gpu/perf/aloe.png",

        cv::Mat dst;

-        bm(imgLeft, imgRight, dst);
-
-        TEST_CYCLE()
-        {
-            bm(imgLeft, imgRight, dst);
-        }
+        TEST_CYCLE() bm(imgLeft, imgRight, dst);

        CPU_SANITY_CHECK(dst);
    }
@@ -61,7 +50,8 @@ PERF_TEST_P(ImagePair, Calib3D_StereoBM, Values(pair_string("gpu/perf/aloe.png",
 //////////////////////////////////////////////////////////////////////
 // StereoBeliefPropagation

-PERF_TEST_P(ImagePair, Calib3D_StereoBeliefPropagation, Values(pair_string("gpu/stereobp/aloe-L.png", "gpu/stereobp/aloe-R.png")))
+PERF_TEST_P(ImagePair, Calib3D_StereoBeliefPropagation,
+            Values(pair_string("gpu/stereobp/aloe-L.png", "gpu/stereobp/aloe-R.png")))
 {
    declare.time(10.0);

@@ -77,29 +67,25 @@ PERF_TEST_P(ImagePair, Calib3D_StereoBeliefPropagation, Values(pair_string("gpu/
    {
        cv::gpu::StereoBeliefPropagation d_bp(ndisp);

-        cv::gpu::GpuMat d_imgLeft(imgLeft);
-        cv::gpu::GpuMat d_imgRight(imgRight);
-        cv::gpu::GpuMat d_dst;
+        const cv::gpu::GpuMat d_imgLeft(imgLeft);
+        const cv::gpu::GpuMat d_imgRight(imgRight);
+        cv::gpu::GpuMat dst;

-        d_bp(d_imgLeft, d_imgRight, d_dst);
+        TEST_CYCLE() d_bp(d_imgLeft, d_imgRight, dst);

-        TEST_CYCLE()
-        {
-            d_bp(d_imgLeft, d_imgRight, d_dst);
-        }
-
-        GPU_SANITY_CHECK(d_dst);
+        GPU_SANITY_CHECK(dst);
    }
    else
    {
-        FAIL() << "No such CPU implementation analogy.";
+        FAIL_NO_CPU();
    }
 }

 //////////////////////////////////////////////////////////////////////
 // StereoConstantSpaceBP

-PERF_TEST_P(ImagePair, Calib3D_StereoConstantSpaceBP, Values(pair_string("gpu/stereobm/aloe-L.png", "gpu/stereobm/aloe-R.png")))
+PERF_TEST_P(ImagePair, Calib3D_StereoConstantSpaceBP,
+            Values(pair_string("gpu/stereobm/aloe-L.png", "gpu/stereobm/aloe-R.png")))
 {
    declare.time(10.0);

@@ -115,29 +101,25 @@ PERF_TEST_P(ImagePair, Calib3D_StereoConstantSpaceBP, Values(pair_string("gpu/st
    {
        cv::gpu::StereoConstantSpaceBP d_csbp(ndisp);

-        cv::gpu::GpuMat d_imgLeft(imgLeft);
-        cv::gpu::GpuMat d_imgRight(imgRight);
-        cv::gpu::GpuMat d_dst;
+        const cv::gpu::GpuMat d_imgLeft(imgLeft);
+        const cv::gpu::GpuMat d_imgRight(imgRight);
+        cv::gpu::GpuMat dst;

-        d_csbp(d_imgLeft, d_imgRight, d_dst);
+        TEST_CYCLE() d_csbp(d_imgLeft, d_imgRight, dst);

-        TEST_CYCLE()
-        {
-            d_csbp(d_imgLeft, d_imgRight, d_dst);
-        }
-
-        GPU_SANITY_CHECK(d_dst);
+        GPU_SANITY_CHECK(dst);
    }
    else
    {
-        FAIL() << "No such CPU implementation analogy.";
+        FAIL_NO_CPU();
    }
 }

 //////////////////////////////////////////////////////////////////////
 // DisparityBilateralFilter

-PERF_TEST_P(ImagePair, Calib3D_DisparityBilateralFilter, Values(pair_string("gpu/stereobm/aloe-L.png", "gpu/stereobm/aloe-disp.png")))
+PERF_TEST_P(ImagePair, Calib3D_DisparityBilateralFilter,
+            Values(pair_string("gpu/stereobm/aloe-L.png", "gpu/stereobm/aloe-disp.png")))
 {
    const cv::Mat img = readImage(GET_PARAM(0), cv::IMREAD_GRAYSCALE);
    ASSERT_FALSE(img.empty());
@@ -151,22 +133,17 @@ PERF_TEST_P(ImagePair, Calib3D_DisparityBilateralFilter, Values(pair_string("gpu
    {
        cv::gpu::DisparityBilateralFilter d_filter(ndisp);

-        cv::gpu::GpuMat d_img(img);
-        cv::gpu::GpuMat d_disp(disp);
-        cv::gpu::GpuMat d_dst;
+        const cv::gpu::GpuMat d_img(img);
+        const cv::gpu::GpuMat d_disp(disp);
+        cv::gpu::GpuMat dst;

-        d_filter(d_disp, d_img, d_dst);
+        TEST_CYCLE() d_filter(d_disp, d_img, dst);

-        TEST_CYCLE()
-        {
-            d_filter(d_disp, d_img, d_dst);
-        }
-
-        GPU_SANITY_CHECK(d_dst);
+        GPU_SANITY_CHECK(dst);
    }
    else
    {
-        FAIL() << "No such CPU implementation analogy.";
+        FAIL_NO_CPU();
    }
 }

@@ -175,45 +152,42 @@ PERF_TEST_P(ImagePair, Calib3D_DisparityBilateralFilter, Values(pair_string("gpu

 DEF_PARAM_TEST_1(Count, int);

-PERF_TEST_P(Count, Calib3D_TransformPoints, Values(5000, 10000, 20000))
+PERF_TEST_P(Count, Calib3D_TransformPoints,
+            Values(5000, 10000, 20000))
 {
    const int count = GetParam();

    cv::Mat src(1, count, CV_32FC3);
-    fillRandom(src, -100, 100);
+    declare.in(src, WARMUP_RNG);

    const cv::Mat rvec = cv::Mat::ones(1, 3, CV_32FC1);
    const cv::Mat tvec = cv::Mat::ones(1, 3, CV_32FC1);

    if (PERF_RUN_GPU())
    {
-        cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat d_dst;
+        const cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat dst;

-        cv::gpu::transformPoints(d_src, rvec, tvec, d_dst);
+        TEST_CYCLE() cv::gpu::transformPoints(d_src, rvec, tvec, dst);

-        TEST_CYCLE()
-        {
-            cv::gpu::transformPoints(d_src, rvec, tvec, d_dst);
-        }
-
-        GPU_SANITY_CHECK(d_dst);
+        GPU_SANITY_CHECK(dst);
    }
    else
    {
-        FAIL() << "No such CPU implementation analogy.";
+        FAIL_NO_CPU();
    }
 }

 //////////////////////////////////////////////////////////////////////
 // ProjectPoints

-PERF_TEST_P(Count, Calib3D_ProjectPoints, Values(5000, 10000, 20000))
+PERF_TEST_P(Count, Calib3D_ProjectPoints,
+            Values(5000, 10000, 20000))
 {
    const int count = GetParam();

    cv::Mat src(1, count, CV_32FC3);
-    fillRandom(src, -100, 100);
+    declare.in(src, WARMUP_RNG);

    const cv::Mat rvec = cv::Mat::ones(1, 3, CV_32FC1);
    const cv::Mat tvec = cv::Mat::ones(1, 3, CV_32FC1);
@@ -221,28 +195,18 @@ PERF_TEST_P(Count, Calib3D_ProjectPoints, Values(5000, 10000, 20000))

    if (PERF_RUN_GPU())
    {
-        cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat d_dst;
+        const cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat dst;

-        cv::gpu::projectPoints(d_src, rvec, tvec, camera_mat, cv::Mat(), d_dst);
+        TEST_CYCLE() cv::gpu::projectPoints(d_src, rvec, tvec, camera_mat, cv::Mat(), dst);

-        TEST_CYCLE()
-        {
-            cv::gpu::projectPoints(d_src, rvec, tvec, camera_mat, cv::Mat(), d_dst);
-        }
-
-        GPU_SANITY_CHECK(d_dst);
+        GPU_SANITY_CHECK(dst);
    }
    else
    {
        cv::Mat dst;

-        cv::projectPoints(src, rvec, tvec, camera_mat, cv::noArray(), dst);
-
-        TEST_CYCLE()
-        {
-            cv::projectPoints(src, rvec, tvec, camera_mat, cv::noArray(), dst);
-        }
+        TEST_CYCLE() cv::projectPoints(src, rvec, tvec, camera_mat, cv::noArray(), dst);

        CPU_SANITY_CHECK(dst);
    }
@@ -251,17 +215,18 @@ PERF_TEST_P(Count, Calib3D_ProjectPoints, Values(5000, 10000, 20000))
 //////////////////////////////////////////////////////////////////////
 // SolvePnPRansac

-PERF_TEST_P(Count, Calib3D_SolvePnPRansac, Values(5000, 10000, 20000))
+PERF_TEST_P(Count, Calib3D_SolvePnPRansac,
+            Values(5000, 10000, 20000))
 {
    declare.time(10.0);

    const int count = GetParam();

    cv::Mat object(1, count, CV_32FC3);
-    fillRandom(object, -100, 100);
+    declare.in(object, WARMUP_RNG);

    cv::Mat camera_mat(3, 3, CV_32FC1);
-    fillRandom(camera_mat, 0.5, 1);
+    cv::randu(camera_mat, 0.5, 1);
    camera_mat.at<float>(0, 1) = 0.f;
    camera_mat.at<float>(1, 0) = 0.f;
    camera_mat.at<float>(2, 0) = 0.f;
@@ -269,79 +234,66 @@ PERF_TEST_P(Count, Calib3D_SolvePnPRansac, Values(5000, 10000, 20000))

    const cv::Mat dist_coef(1, 8, CV_32F, cv::Scalar::all(0));

-    std::vector<cv::Point2f> image_vec;
    cv::Mat rvec_gold(1, 3, CV_32FC1);
-    fillRandom(rvec_gold, 0, 1);
+    cv::randu(rvec_gold, 0, 1);
+
    cv::Mat tvec_gold(1, 3, CV_32FC1);
-    fillRandom(tvec_gold, 0, 1);
+    cv::randu(tvec_gold, 0, 1);
+
+    std::vector<cv::Point2f> image_vec;
    cv::projectPoints(object, rvec_gold, tvec_gold, camera_mat, dist_coef, image_vec);

-    cv::Mat image(1, count, CV_32FC2, &image_vec[0]);
+    const cv::Mat image(1, count, CV_32FC2, &image_vec[0]);

    cv::Mat rvec;
    cv::Mat tvec;

    if (PERF_RUN_GPU())
    {
-        cv::gpu::solvePnPRansac(object, image, camera_mat, dist_coef, rvec, tvec);
+        TEST_CYCLE() cv::gpu::solvePnPRansac(object, image, camera_mat, dist_coef, rvec, tvec);

-        TEST_CYCLE()
-        {
-            cv::gpu::solvePnPRansac(object, image, camera_mat, dist_coef, rvec, tvec);
-        }
+        GPU_SANITY_CHECK(rvec, 1e-3);
+        GPU_SANITY_CHECK(tvec, 1e-3);
    }
    else
    {
-        cv::solvePnPRansac(object, image, camera_mat, dist_coef, rvec, tvec);
+        TEST_CYCLE() cv::solvePnPRansac(object, image, camera_mat, dist_coef, rvec, tvec);

-        TEST_CYCLE()
-        {
-            cv::solvePnPRansac(object, image, camera_mat, dist_coef, rvec, tvec);
-        }
+        CPU_SANITY_CHECK(rvec, 1e-6);
+        CPU_SANITY_CHECK(tvec, 1e-6);
    }
-
-    CPU_SANITY_CHECK(rvec);
-    CPU_SANITY_CHECK(tvec);
 }

 //////////////////////////////////////////////////////////////////////
 // ReprojectImageTo3D

-PERF_TEST_P(Sz_Depth, Calib3D_ReprojectImageTo3D, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, CV_16S)))
+PERF_TEST_P(Sz_Depth, Calib3D_ReprojectImageTo3D,
+            Combine(GPU_TYPICAL_MAT_SIZES,
+                    Values(CV_8U, CV_16S)))
 {
    const cv::Size size = GET_PARAM(0);
    const int depth = GET_PARAM(1);

    cv::Mat src(size, depth);
-    fillRandom(src, 5.0, 30.0);
+    declare.in(src, WARMUP_RNG);

    cv::Mat Q(4, 4, CV_32FC1);
-    fillRandom(Q, 0.1, 1.0);
+    cv::randu(Q, 0.1, 1.0);

    if (PERF_RUN_GPU())
    {
-        cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat d_dst;
+        const cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat dst;

-        cv::gpu::reprojectImageTo3D(d_src, d_dst, Q);
+        TEST_CYCLE() cv::gpu::reprojectImageTo3D(d_src, dst, Q);

-        TEST_CYCLE()
-        {
-            cv::gpu::reprojectImageTo3D(d_src, d_dst, Q);
-        }
-
-        GPU_SANITY_CHECK(d_dst);
+        GPU_SANITY_CHECK(dst);
    }
    else
    {
        cv::Mat dst;

-        cv::reprojectImageTo3D(src, dst, Q);
-
-        TEST_CYCLE()
-        {
-            cv::reprojectImageTo3D(src, dst, Q);
-        }
+        TEST_CYCLE() cv::reprojectImageTo3D(src, dst, Q);

        CPU_SANITY_CHECK(dst);
    }
@@ -350,32 +302,27 @@ PERF_TEST_P(Sz_Depth, Calib3D_ReprojectImageTo3D, Combine(GPU_TYPICAL_MAT_SIZES,
 //////////////////////////////////////////////////////////////////////
 // DrawColorDisp

-PERF_TEST_P(Sz_Depth, Calib3D_DrawColorDisp, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, CV_16S)))
+PERF_TEST_P(Sz_Depth, Calib3D_DrawColorDisp,
+            Combine(GPU_TYPICAL_MAT_SIZES,
+                    Values(CV_8U, CV_16S)))
 {
    const cv::Size size = GET_PARAM(0);
    const int type = GET_PARAM(1);

    cv::Mat src(size, type);
-    fillRandom(src, 0, 255);
+    declare.in(src, WARMUP_RNG);

    if (PERF_RUN_GPU())
    {
-        cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat d_dst;
+        const cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat dst;

-        cv::gpu::drawColorDisp(d_src, d_dst, 255);
+        TEST_CYCLE() cv::gpu::drawColorDisp(d_src, dst, 255);

-        TEST_CYCLE()
-        {
-            cv::gpu::drawColorDisp(d_src, d_dst, 255);
-        }
-
-        GPU_SANITY_CHECK(d_dst);
+        GPU_SANITY_CHECK(dst);
    }
    else
    {
-        FAIL() << "No such CPU implementation analogy.";
+        FAIL_NO_CPU();
    }
 }
-
-} // namespace
--- a/modules/gpu/perf/perf_core.cpp
+++ b/modules/gpu/perf/perf_core.cpp
--- a/modules/gpu/perf/perf_denoising.cpp
+++ b/modules/gpu/perf/perf_denoising.cpp
@@ -3,8 +3,7 @@
 using namespace std;
 using namespace testing;

-#define GPU_DENOISING_IMAGE_SIZES testing::Values(perf::szVGA, perf::szXGA, perf::sz720p, perf::sz1080p)
-
+#define GPU_DENOISING_IMAGE_SIZES testing::Values(perf::szVGA, perf::sz720p)

 //////////////////////////////////////////////////////////////////////
 // BilateralFilter
@@ -12,96 +11,86 @@ using namespace testing;
 DEF_PARAM_TEST(Sz_Depth_Cn_KernelSz, cv::Size, MatDepth, MatCn, int);

 PERF_TEST_P(Sz_Depth_Cn_KernelSz, Denoising_BilateralFilter,
-            Combine(GPU_DENOISING_IMAGE_SIZES, Values(CV_8U, CV_32F), GPU_CHANNELS_1_3, Values(3, 5, 9)))
+            Combine(GPU_DENOISING_IMAGE_SIZES,
+                    Values(CV_8U, CV_32F),
+                    GPU_CHANNELS_1_3,
+                    Values(3, 5, 9)))
 {
    declare.time(60.0);

-    cv::Size size = GET_PARAM(0);
-    int depth = GET_PARAM(1);
-    int channels = GET_PARAM(2);
-    int kernel_size = GET_PARAM(3);
+    const cv::Size size = GET_PARAM(0);
+    const int depth = GET_PARAM(1);
+    const int channels = GET_PARAM(2);
+    const int kernel_size = GET_PARAM(3);

-    float sigma_color = 7;
-    float sigma_spatial = 5;
-    int borderMode = cv::BORDER_REFLECT101;
+    const float sigma_color = 7;
+    const float sigma_spatial = 5;
+    const int borderMode = cv::BORDER_REFLECT101;

-    int type = CV_MAKE_TYPE(depth, channels);
+    const int type = CV_MAKE_TYPE(depth, channels);

    cv::Mat src(size, type);
-    fillRandom(src);
+    declare.in(src, WARMUP_RNG);

-     if (PERF_RUN_GPU())
+    if (PERF_RUN_GPU())
    {
-        cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat d_dst;
+        const cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat dst;

-        cv::gpu::bilateralFilter(d_src, d_dst, kernel_size, sigma_color, sigma_spatial, borderMode);
+        TEST_CYCLE() cv::gpu::bilateralFilter(d_src, dst, kernel_size, sigma_color, sigma_spatial, borderMode);

-        TEST_CYCLE()
-        {
-            cv::gpu::bilateralFilter(d_src, d_dst, kernel_size, sigma_color, sigma_spatial, borderMode);
-        }
-
-        GPU_SANITY_CHECK(d_dst);
+        GPU_SANITY_CHECK(dst);
    }
    else
    {
        cv::Mat dst;

-        cv::bilateralFilter(src, dst, kernel_size, sigma_color, sigma_spatial, borderMode);
-
-        TEST_CYCLE()
-        {
-            cv::bilateralFilter(src, dst, kernel_size, sigma_color, sigma_spatial, borderMode);
-        }
+        TEST_CYCLE() cv::bilateralFilter(src, dst, kernel_size, sigma_color, sigma_spatial, borderMode);

        CPU_SANITY_CHECK(dst);
    }
 }

-
 //////////////////////////////////////////////////////////////////////
 // nonLocalMeans

 DEF_PARAM_TEST(Sz_Depth_Cn_WinSz_BlockSz, cv::Size, MatDepth, MatCn, int, int);

 PERF_TEST_P(Sz_Depth_Cn_WinSz_BlockSz, Denoising_NonLocalMeans,
-            Combine(GPU_DENOISING_IMAGE_SIZES, Values<MatDepth>(CV_8U), GPU_CHANNELS_1_3, Values(21), Values(5, 7)))
+            Combine(GPU_DENOISING_IMAGE_SIZES,
+                    Values<MatDepth>(CV_8U),
+                    GPU_CHANNELS_1_3,
+                    Values(21),
+                    Values(5)))
 {
    declare.time(60.0);

-    cv::Size size = GET_PARAM(0);
-    int depth = GET_PARAM(1);
-    int channels = GET_PARAM(2);
+    const cv::Size size = GET_PARAM(0);
+    const int depth = GET_PARAM(1);
+    const int channels = GET_PARAM(2);
+    const int search_widow_size = GET_PARAM(3);
+    const int block_size = GET_PARAM(4);

-    int search_widow_size = GET_PARAM(3);
-    int block_size = GET_PARAM(4);
+    const float h = 10;
+    const int borderMode = cv::BORDER_REFLECT101;

-    float h = 10;
-    int borderMode = cv::BORDER_REFLECT101;
-
-    int type = CV_MAKE_TYPE(depth, channels);
+    const int type = CV_MAKE_TYPE(depth, channels);

    cv::Mat src(size, type);
-    fillRandom(src);
+    declare.in(src, WARMUP_RNG);

    if (PERF_RUN_GPU())
    {
-        cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat d_dst;
+        const cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat dst;

-        cv::gpu::nonLocalMeans(d_src, d_dst, h, search_widow_size, block_size, borderMode);
+        TEST_CYCLE() cv::gpu::nonLocalMeans(d_src, dst, h, search_widow_size, block_size, borderMode);

-        TEST_CYCLE()
-        {
-            cv::gpu::nonLocalMeans(d_src, d_dst, h, search_widow_size, block_size, borderMode);
-        }
-
-        GPU_SANITY_CHECK(d_dst);
+        GPU_SANITY_CHECK(dst);
    }
    else
    {
-        FAIL() << "No such CPU implementation analogy";
+        FAIL_NO_CPU();
    }
 }

@@ -112,46 +101,41 @@ PERF_TEST_P(Sz_Depth_Cn_WinSz_BlockSz, Denoising_NonLocalMeans,
 DEF_PARAM_TEST(Sz_Depth_Cn_WinSz_BlockSz, cv::Size, MatDepth, MatCn, int, int);

 PERF_TEST_P(Sz_Depth_Cn_WinSz_BlockSz, Denoising_FastNonLocalMeans,
-            Combine(GPU_DENOISING_IMAGE_SIZES, Values<MatDepth>(CV_8U), GPU_CHANNELS_1_3, Values(21), Values(7)))
+            Combine(GPU_DENOISING_IMAGE_SIZES,
+                    Values<MatDepth>(CV_8U),
+                    GPU_CHANNELS_1_3,
+                    Values(21),
+                    Values(7)))
 {
-    declare.time(150.0);
+    declare.time(60.0);

-    cv::Size size = GET_PARAM(0);
-    int depth = GET_PARAM(1);
+    const cv::Size size = GET_PARAM(0);
+    const int depth = GET_PARAM(1);
+    const int search_widow_size = GET_PARAM(3);  // slot 2 of the 5-tuple is the channel count (MatCn), so the window size lives in slot 3
+    const int block_size = GET_PARAM(4);  // last tuple slot; previously read slot 3, which holds the search window size

-    int search_widow_size = GET_PARAM(2);
-    int block_size = GET_PARAM(3);
-
-    float h = 10;
-    int type = CV_MAKE_TYPE(depth, 1);
+    const float h = 10;
+    const int type = CV_MAKE_TYPE(depth, 1);  // always single-channel: the channel parameter (slot 2) is not used by this test

    cv::Mat src(size, type);
-    fillRandom(src);
+    declare.in(src, WARMUP_RNG);

    if (PERF_RUN_GPU())
    {
-        cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat d_dst;
        cv::gpu::FastNonLocalMeansDenoising fnlmd;

-        fnlmd.simpleMethod(d_src, d_dst, h, search_widow_size, block_size);
+        const cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat dst;

-        TEST_CYCLE()
-        {
-            fnlmd.simpleMethod(d_src, d_dst, h, search_widow_size, block_size);
-        }
+        TEST_CYCLE() fnlmd.simpleMethod(d_src, dst, h, search_widow_size, block_size);

-        GPU_SANITY_CHECK(d_dst);
+        GPU_SANITY_CHECK(dst);
    }
    else
    {
        cv::Mat dst;
-        cv::fastNlMeansDenoising(src, dst, h, block_size, search_widow_size);

-        TEST_CYCLE()
-        {
-            cv::fastNlMeansDenoising(src, dst, h, block_size, search_widow_size);
-        }
+        TEST_CYCLE() cv::fastNlMeansDenoising(src, dst, h, block_size, search_widow_size);

        CPU_SANITY_CHECK(dst);
    }
@@ -163,47 +147,41 @@ PERF_TEST_P(Sz_Depth_Cn_WinSz_BlockSz, Denoising_FastNonLocalMeans,
 DEF_PARAM_TEST(Sz_Depth_WinSz_BlockSz, cv::Size, MatDepth, int, int);

 PERF_TEST_P(Sz_Depth_WinSz_BlockSz, Denoising_FastNonLocalMeansColored,
-            Combine(GPU_DENOISING_IMAGE_SIZES, Values<MatDepth>(CV_8U), Values(21), Values(7)))
+            Combine(GPU_DENOISING_IMAGE_SIZES,
+                    Values<MatDepth>(CV_8U),
+                    Values(21),
+                    Values(7)))
 {
-    declare.time(350.0);
+    declare.time(60.0);

-    cv::Size size = GET_PARAM(0);
-    int depth = GET_PARAM(1);
+    const cv::Size size = GET_PARAM(0);
+    const int depth = GET_PARAM(1);
+    const int search_widow_size = GET_PARAM(2);
+    const int block_size = GET_PARAM(3);

-    int search_widow_size = GET_PARAM(2);
-    int block_size = GET_PARAM(3);
-
-    float h = 10;
-    int type = CV_MAKE_TYPE(depth, 3);
+    const float h = 10;
+    const int type = CV_MAKE_TYPE(depth, 3);

    cv::Mat src(size, type);
-    fillRandom(src);
+    declare.in(src, WARMUP_RNG);

    if (PERF_RUN_GPU())
    {
-        cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat d_dst;
        cv::gpu::FastNonLocalMeansDenoising fnlmd;

-        fnlmd.labMethod(d_src, d_dst, h, h, search_widow_size, block_size);
+        const cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat dst;

-        TEST_CYCLE()
-        {
-            fnlmd.labMethod(d_src, d_dst, h, h, search_widow_size, block_size);
-        }
+        TEST_CYCLE() fnlmd.labMethod(d_src, dst, h, h, search_widow_size, block_size);

-        GPU_SANITY_CHECK(d_dst);
+        GPU_SANITY_CHECK(dst);
    }
    else
    {
        cv::Mat dst;
-        cv::fastNlMeansDenoisingColored(src, dst, h, h, block_size, search_widow_size);

-        TEST_CYCLE()
-        {
-            cv::fastNlMeansDenoisingColored(src, dst, h, h, block_size, search_widow_size);
-        }
+        TEST_CYCLE() cv::fastNlMeansDenoisingColored(src, dst, h, h, block_size, search_widow_size);

        CPU_SANITY_CHECK(dst);
    }
-}
+}
--- a/modules/gpu/perf/perf_features2d.cpp
+++ b/modules/gpu/perf/perf_features2d.cpp
@@ -3,139 +3,194 @@
 using namespace std;
 using namespace testing;

-namespace {
+struct KeypointIdxCompare  // strict-weak ordering over indices into a keypoint vector: by pt.x, then pt.y, then response, then octave
+{
+    std::vector<cv::KeyPoint>* keypoints;  // non-owning; must outlive the comparator
+
+    explicit KeypointIdxCompare(std::vector<cv::KeyPoint>* _keypoints) : keypoints(_keypoints) {}
+
+    bool operator ()(size_t i1, size_t i2) const  // true when keypoint i1 orders before keypoint i2
+    {
+        const cv::KeyPoint& kp1 = (*keypoints)[i1];  // bind by reference: std::sort calls this many times, no need to copy
+        const cv::KeyPoint& kp2 = (*keypoints)[i2];
+        if (kp1.pt.x != kp2.pt.x)
+            return kp1.pt.x < kp2.pt.x;
+        if (kp1.pt.y != kp2.pt.y)
+            return kp1.pt.y < kp2.pt.y;
+        if (kp1.response != kp2.response)
+            return kp1.response < kp2.response;
+        return kp1.octave < kp2.octave;
+    }
+};
+
+static void sortKeyPoints(std::vector<cv::KeyPoint>& keypoints, cv::InputOutputArray _descriptors = cv::noArray())  // reorders keypoints in place into KeypointIdxCompare order; if descriptors are supplied, their rows are permuted the same way
+{
+    std::vector<size_t> indexies(keypoints.size());  // starts as the identity permutation 0..N-1
+    for (size_t i = 0; i < indexies.size(); ++i)
+        indexies[i] = i;
+
+    std::sort(indexies.begin(), indexies.end(), KeypointIdxCompare(&keypoints));  // sort indices, not the keypoints themselves, so descriptor rows can follow
+
+    std::vector<cv::KeyPoint> new_keypoints;
+    cv::Mat new_descriptors;
+
+    new_keypoints.resize(keypoints.size());
+
+    cv::Mat descriptors;
+    if (_descriptors.needed())  // descriptors are optional; skipped for the default cv::noArray()
+    {
+        descriptors = _descriptors.getMat();
+        new_descriptors.create(descriptors.size(), descriptors.type());  // same shape and type as the input
+    }
+
+    for (size_t i = 0; i < indexies.size(); ++i)
+    {
+        size_t new_idx = indexies[i];  // source position of the element that lands at position i
+        new_keypoints[i] = keypoints[new_idx];
+        if (!new_descriptors.empty())
+            descriptors.row((int) new_idx).copyTo(new_descriptors.row((int) i));  // assumes descriptor row k belongs to keypoint k -- TODO confirm at call sites
+    }
+
+    keypoints.swap(new_keypoints);
+    if (_descriptors.needed())
+        new_descriptors.copyTo(_descriptors);  // write the permuted rows back to the caller's array
+}

 //////////////////////////////////////////////////////////////////////
 // SURF

 DEF_PARAM_TEST_1(Image, string);

-PERF_TEST_P(Image, Features2D_SURF, Values<string>("gpu/perf/aloe.png"))
+PERF_TEST_P(Image, Features2D_SURF,
+            Values<string>("gpu/perf/aloe.png"))
 {
    declare.time(50.0);

-    cv::Mat img = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
+    const cv::Mat img = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
    ASSERT_FALSE(img.empty());

    if (PERF_RUN_GPU())
    {
        cv::gpu::SURF_GPU d_surf;

-        cv::gpu::GpuMat d_img(img);
+        const cv::gpu::GpuMat d_img(img);
        cv::gpu::GpuMat d_keypoints, d_descriptors;

-        d_surf(d_img, cv::gpu::GpuMat(), d_keypoints, d_descriptors);
+        TEST_CYCLE() d_surf(d_img, cv::gpu::GpuMat(), d_keypoints, d_descriptors);

-        TEST_CYCLE()
-        {
-            d_surf(d_img, cv::gpu::GpuMat(), d_keypoints, d_descriptors);
-        }
+        std::vector<cv::KeyPoint> gpu_keypoints;
+        d_surf.downloadKeypoints(d_keypoints, gpu_keypoints);

-        GPU_SANITY_CHECK(d_descriptors, 1e-4);
-        GPU_SANITY_CHECK_KEYPOINTS(SURF, d_keypoints);
+        cv::Mat gpu_descriptors(d_descriptors);
+
+        sortKeyPoints(gpu_keypoints, gpu_descriptors);
+
+        SANITY_CHECK_KEYPOINTS(gpu_keypoints);
+        SANITY_CHECK(gpu_descriptors, 1e-3);
    }
    else
    {
        cv::SURF surf;

-        std::vector<cv::KeyPoint> keypoints;
-        cv::Mat descriptors;
+        std::vector<cv::KeyPoint> cpu_keypoints;
+        cv::Mat cpu_descriptors;

-        surf(img, cv::noArray(), keypoints, descriptors);
+        TEST_CYCLE() surf(img, cv::noArray(), cpu_keypoints, cpu_descriptors);

-        TEST_CYCLE()
-        {
-            keypoints.clear();
-            surf(img, cv::noArray(), keypoints, descriptors);
-        }
-
-        SANITY_CHECK_KEYPOINTS(keypoints);
-        SANITY_CHECK(descriptors, 1e-4);
+        SANITY_CHECK_KEYPOINTS(cpu_keypoints);
+        SANITY_CHECK(cpu_descriptors);
    }
 }

 //////////////////////////////////////////////////////////////////////
 // FAST

-PERF_TEST_P(Image, Features2D_FAST, Values<string>("gpu/perf/aloe.png"))
+DEF_PARAM_TEST(Image_Threshold_NonMaxSupression, string, int, bool);
+
+PERF_TEST_P(Image_Threshold_NonMaxSupression, Features2D_FAST,
+            Combine(Values<string>("gpu/perf/aloe.png"),
+                    Values(20),
+                    Bool()))
 {
-    cv::Mat img = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
+    const cv::Mat img = readImage(GET_PARAM(0), cv::IMREAD_GRAYSCALE);
    ASSERT_FALSE(img.empty());

+    const int threshold = GET_PARAM(1);
+    const bool nonMaxSuppersion = GET_PARAM(2);
+
    if (PERF_RUN_GPU())
    {
-        cv::gpu::FAST_GPU d_fast(20);
+        cv::gpu::FAST_GPU d_fast(threshold, nonMaxSuppersion, 0.5);

-        cv::gpu::GpuMat d_img(img);
+        const cv::gpu::GpuMat d_img(img);
        cv::gpu::GpuMat d_keypoints;

-        d_fast(d_img, cv::gpu::GpuMat(), d_keypoints);
+        TEST_CYCLE() d_fast(d_img, cv::gpu::GpuMat(), d_keypoints);

-        TEST_CYCLE()
-        {
-            d_fast(d_img, cv::gpu::GpuMat(), d_keypoints);
-        }
+        std::vector<cv::KeyPoint> gpu_keypoints;
+        d_fast.downloadKeypoints(d_keypoints, gpu_keypoints);

-        GPU_SANITY_CHECK_RESPONSE(FAST, d_keypoints);
+        sortKeyPoints(gpu_keypoints);
+
+        SANITY_CHECK_KEYPOINTS(gpu_keypoints);
    }
    else
    {
-        std::vector<cv::KeyPoint> keypoints;
+        std::vector<cv::KeyPoint> cpu_keypoints;

-        cv::FAST(img, keypoints, 20);
+        TEST_CYCLE() cv::FAST(img, cpu_keypoints, threshold, nonMaxSuppersion);

-        TEST_CYCLE()
-        {
-            keypoints.clear();
-            cv::FAST(img, keypoints, 20);
-        }
-
-        SANITY_CHECK_KEYPOINTS(keypoints);
+        SANITY_CHECK_KEYPOINTS(cpu_keypoints);
    }
 }

 //////////////////////////////////////////////////////////////////////
 // ORB

-PERF_TEST_P(Image, Features2D_ORB, Values<string>("gpu/perf/aloe.png"))
+DEF_PARAM_TEST(Image_NFeatures, string, int);
+
+PERF_TEST_P(Image_NFeatures, Features2D_ORB,
+            Combine(Values<string>("gpu/perf/aloe.png"),
+                    Values(4000)))
 {
-    cv::Mat img = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
+    const cv::Mat img = readImage(GET_PARAM(0), cv::IMREAD_GRAYSCALE);
    ASSERT_FALSE(img.empty());

+    const int nFeatures = GET_PARAM(1);
+
    if (PERF_RUN_GPU())
    {
-        cv::gpu::ORB_GPU d_orb(4000);
+        cv::gpu::ORB_GPU d_orb(nFeatures);

-        cv::gpu::GpuMat d_img(img);
+        const cv::gpu::GpuMat d_img(img);
        cv::gpu::GpuMat d_keypoints, d_descriptors;

-        d_orb(d_img, cv::gpu::GpuMat(), d_keypoints, d_descriptors);
+        TEST_CYCLE() d_orb(d_img, cv::gpu::GpuMat(), d_keypoints, d_descriptors);

-        TEST_CYCLE()
-        {
-            d_orb(d_img, cv::gpu::GpuMat(), d_keypoints, d_descriptors);
-        }
+        std::vector<cv::KeyPoint> gpu_keypoints;
+        d_orb.downloadKeyPoints(d_keypoints, gpu_keypoints);

-        GPU_SANITY_CHECK_KEYPOINTS(ORB, d_keypoints);
-        GPU_SANITY_CHECK(d_descriptors);
+        cv::Mat gpu_descriptors(d_descriptors);
+
+        gpu_keypoints.resize(10);
+        gpu_descriptors = gpu_descriptors.rowRange(0, 10);
+
+        sortKeyPoints(gpu_keypoints, gpu_descriptors);
+
+        SANITY_CHECK_KEYPOINTS(gpu_keypoints);
+        SANITY_CHECK(gpu_descriptors);
    }
    else
    {
-        cv::ORB orb(4000);
+        cv::ORB orb(nFeatures);

-        std::vector<cv::KeyPoint> keypoints;
-        cv::Mat descriptors;
+        std::vector<cv::KeyPoint> cpu_keypoints;
+        cv::Mat cpu_descriptors;

-        orb(img, cv::noArray(), keypoints, descriptors);
+        TEST_CYCLE() orb(img, cv::noArray(), cpu_keypoints, cpu_descriptors);

-        TEST_CYCLE()
-        {
-            keypoints.clear();
-            orb(img, cv::noArray(), keypoints, descriptors);
-        }
-
-        SANITY_CHECK_KEYPOINTS(keypoints);
-        SANITY_CHECK(descriptors);
+        SANITY_CHECK_KEYPOINTS(cpu_keypoints);
+        SANITY_CHECK(cpu_descriptors);
    }
 }

@@ -144,166 +199,165 @@ PERF_TEST_P(Image, Features2D_ORB, Values<string>("gpu/perf/aloe.png"))

 DEF_PARAM_TEST(DescSize_Norm, int, NormType);

-PERF_TEST_P(DescSize_Norm, Features2D_BFMatch, Combine(Values(64, 128, 256), Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2), NormType(cv::NORM_HAMMING))))
+PERF_TEST_P(DescSize_Norm, Features2D_BFMatch,
+            Combine(Values(64, 128, 256),
+                    Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2), NormType(cv::NORM_HAMMING))))
 {
    declare.time(20.0);

-    int desc_size = GET_PARAM(0);
-    int normType = GET_PARAM(1);
+    const int desc_size = GET_PARAM(0);
+    const int normType = GET_PARAM(1);

-    int type = normType == cv::NORM_HAMMING ? CV_8U : CV_32F;
+    const int type = normType == cv::NORM_HAMMING ? CV_8U : CV_32F;

    cv::Mat query(3000, desc_size, type);
-    fillRandom(query);
+    declare.in(query, WARMUP_RNG);

    cv::Mat train(3000, desc_size, type);
-    fillRandom(train);
+    declare.in(train, WARMUP_RNG);

    if (PERF_RUN_GPU())
    {
        cv::gpu::BFMatcher_GPU d_matcher(normType);

-        cv::gpu::GpuMat d_query(query);
-        cv::gpu::GpuMat d_train(train);
+        const cv::gpu::GpuMat d_query(query);
+        const cv::gpu::GpuMat d_train(train);
        cv::gpu::GpuMat d_trainIdx, d_distance;

-        d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance);
+        TEST_CYCLE() d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance);

-        TEST_CYCLE()
-        {
-            d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance);
-        }
+        std::vector<cv::DMatch> gpu_matches;
+        d_matcher.matchDownload(d_trainIdx, d_distance, gpu_matches);

-        GPU_SANITY_CHECK(d_trainIdx);
-        GPU_SANITY_CHECK(d_distance);
+        SANITY_CHECK_MATCHES(gpu_matches);
    }
    else
    {
        cv::BFMatcher matcher(normType);

-        std::vector<cv::DMatch> matches;
+        std::vector<cv::DMatch> cpu_matches;

-        matcher.match(query, train, matches);
+        TEST_CYCLE() matcher.match(query, train, cpu_matches);

-        TEST_CYCLE()
-        {
-            matcher.match(query, train, matches);
-        }
-
-        SANITY_CHECK(matches);
+        SANITY_CHECK_MATCHES(cpu_matches);
    }
 }

 //////////////////////////////////////////////////////////////////////
 // BFKnnMatch

+static void toOneRowMatches(const std::vector< std::vector<cv::DMatch> >& src, std::vector<cv::DMatch>& dst)
+{
+    dst.clear();
+    for (size_t i = 0; i < src.size(); ++i)
+        for (size_t j = 0; j < src[i].size(); ++j)
+            dst.push_back(src[i][j]);
+}
+
 DEF_PARAM_TEST(DescSize_K_Norm, int, int, NormType);

-PERF_TEST_P(DescSize_K_Norm, Features2D_BFKnnMatch, Combine(
-    Values(64, 128, 256),
-    Values(2, 3),
-    Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2), NormType(cv::NORM_HAMMING))))
+PERF_TEST_P(DescSize_K_Norm, Features2D_BFKnnMatch,
+            Combine(Values(64, 128, 256),
+                    Values(2, 3),
+                    Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2))))
 {
    declare.time(30.0);

-    int desc_size = GET_PARAM(0);
-    int k = GET_PARAM(1);
-    int normType = GET_PARAM(2);
+    const int desc_size = GET_PARAM(0);
+    const int k = GET_PARAM(1);
+    const int normType = GET_PARAM(2);

-    int type = normType == cv::NORM_HAMMING ? CV_8U : CV_32F;
+    const int type = normType == cv::NORM_HAMMING ? CV_8U : CV_32F;

    cv::Mat query(3000, desc_size, type);
-    fillRandom(query);
+    declare.in(query, WARMUP_RNG);

    cv::Mat train(3000, desc_size, type);
-    fillRandom(train);
+    declare.in(train, WARMUP_RNG);

    if (PERF_RUN_GPU())
    {
        cv::gpu::BFMatcher_GPU d_matcher(normType);

-        cv::gpu::GpuMat d_query(query);
-        cv::gpu::GpuMat d_train(train);
+        const cv::gpu::GpuMat d_query(query);
+        const cv::gpu::GpuMat d_train(train);
        cv::gpu::GpuMat d_trainIdx, d_distance, d_allDist;

-        d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, k);
+        TEST_CYCLE() d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, k);

-        TEST_CYCLE()
-        {
-            d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, k);
-        }
+        std::vector< std::vector<cv::DMatch> > matchesTbl;
+        d_matcher.knnMatchDownload(d_trainIdx, d_distance, matchesTbl);

-        GPU_SANITY_CHECK(d_trainIdx);
-        GPU_SANITY_CHECK(d_distance);
+        std::vector<cv::DMatch> gpu_matches;
+        toOneRowMatches(matchesTbl, gpu_matches);
+
+        SANITY_CHECK_MATCHES(gpu_matches);
    }
    else
    {
        cv::BFMatcher matcher(normType);

-        std::vector< std::vector<cv::DMatch> > matches;
+        std::vector< std::vector<cv::DMatch> > matchesTbl;

-        matcher.knnMatch(query, train, matches, k);
+        TEST_CYCLE() matcher.knnMatch(query, train, matchesTbl, k);

-        TEST_CYCLE()
-        {
-            matcher.knnMatch(query, train, matches, k);
-        }
+        std::vector<cv::DMatch> cpu_matches;
+        toOneRowMatches(matchesTbl, cpu_matches);

-        SANITY_CHECK(matches);
+        SANITY_CHECK_MATCHES(cpu_matches);
    }
 }

 //////////////////////////////////////////////////////////////////////
 // BFRadiusMatch

-PERF_TEST_P(DescSize_Norm, Features2D_BFRadiusMatch, Combine(Values(64, 128, 256), Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2), NormType(cv::NORM_HAMMING))))
+PERF_TEST_P(DescSize_Norm, Features2D_BFRadiusMatch,
+            Combine(Values(64, 128, 256),
+                    Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2))))
 {
    declare.time(30.0);

-    int desc_size = GET_PARAM(0);
-    int normType = GET_PARAM(1);
+    const int desc_size = GET_PARAM(0);
+    const int normType = GET_PARAM(1);

-    int type = normType == cv::NORM_HAMMING ? CV_8U : CV_32F;
+    const int type = normType == cv::NORM_HAMMING ? CV_8U : CV_32F;
+    const float maxDistance = 10000;

    cv::Mat query(3000, desc_size, type);
-    fillRandom(query, 0.0, 1.0);
+    declare.in(query, WARMUP_RNG);

    cv::Mat train(3000, desc_size, type);
-    fillRandom(train, 0.0, 1.0);
+    declare.in(train, WARMUP_RNG);

    if (PERF_RUN_GPU())
    {
        cv::gpu::BFMatcher_GPU d_matcher(normType);

-        cv::gpu::GpuMat d_query(query);
-        cv::gpu::GpuMat d_train(train);
+        const cv::gpu::GpuMat d_query(query);
+        const cv::gpu::GpuMat d_train(train);
        cv::gpu::GpuMat d_trainIdx, d_nMatches, d_distance;

-        d_matcher.radiusMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_nMatches, 2.0);
+        TEST_CYCLE() d_matcher.radiusMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_nMatches, maxDistance);

-        TEST_CYCLE()
-        {
-            d_matcher.radiusMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_nMatches, 2.0);
-        }
+        std::vector< std::vector<cv::DMatch> > matchesTbl;
+        d_matcher.radiusMatchDownload(d_trainIdx, d_distance, d_nMatches, matchesTbl);

-        GPU_SANITY_CHECK(d_trainIdx);
-        GPU_SANITY_CHECK(d_distance);
+        std::vector<cv::DMatch> gpu_matches;
+        toOneRowMatches(matchesTbl, gpu_matches);
+
+        SANITY_CHECK_MATCHES(gpu_matches);
    }
    else
    {
        cv::BFMatcher matcher(normType);

-        std::vector< std::vector<cv::DMatch> > matches;
+        std::vector< std::vector<cv::DMatch> > matchesTbl;

-        matcher.radiusMatch(query, train, matches, 2.0);
+        TEST_CYCLE() matcher.radiusMatch(query, train, matchesTbl, maxDistance);

-        TEST_CYCLE()
-        {
-            matcher.radiusMatch(query, train, matches, 2.0);
-        }
+        std::vector<cv::DMatch> cpu_matches;
+        toOneRowMatches(matchesTbl, cpu_matches);

-        SANITY_CHECK(matches);
+        SANITY_CHECK_MATCHES(cpu_matches);
    }
 }
-
-} // namespace
--- a/modules/gpu/perf/perf_filters.cpp
+++ b/modules/gpu/perf/perf_filters.cpp
@@ -3,48 +3,39 @@
 using namespace std;
 using namespace testing;

-namespace {
-
 //////////////////////////////////////////////////////////////////////
 // Blur

 DEF_PARAM_TEST(Sz_Type_KernelSz, cv::Size, MatType, int);

-PERF_TEST_P(Sz_Type_KernelSz, Filters_Blur, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4), Values(3, 5, 7)))
+PERF_TEST_P(Sz_Type_KernelSz, Filters_Blur,
+            Combine(GPU_TYPICAL_MAT_SIZES,
+                    Values(CV_8UC1, CV_8UC4),
+                    Values(3, 5, 7)))
 {
    declare.time(20.0);

-    cv::Size size = GET_PARAM(0);
-    int type = GET_PARAM(1);
-    int ksize = GET_PARAM(2);
+    const cv::Size size = GET_PARAM(0);
+    const int type = GET_PARAM(1);
+    const int ksize = GET_PARAM(2);

    cv::Mat src(size, type);
-    fillRandom(src);
+    declare.in(src, WARMUP_RNG);

    if (PERF_RUN_GPU())
    {
-        cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat d_dst;
+        const cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat dst;

-        cv::gpu::blur(d_src, d_dst, cv::Size(ksize, ksize));
+        TEST_CYCLE() cv::gpu::blur(d_src, dst, cv::Size(ksize, ksize));

-        TEST_CYCLE()
-        {
-            cv::gpu::blur(d_src, d_dst, cv::Size(ksize, ksize));
-        }
-
-        GPU_SANITY_CHECK(d_dst);
+        GPU_SANITY_CHECK(dst);
    }
    else
    {
        cv::Mat dst;

-        cv::blur(src, dst, cv::Size(ksize, ksize));
-
-        TEST_CYCLE()
-        {
-            cv::blur(src, dst, cv::Size(ksize, ksize));
-        }
+        TEST_CYCLE() cv::blur(src, dst, cv::Size(ksize, ksize));

        CPU_SANITY_CHECK(dst);
    }
@@ -57,38 +48,28 @@ PERF_TEST_P(Sz_Type_KernelSz, Filters_Sobel, Combine(GPU_TYPICAL_MAT_SIZES, Valu
 {
    declare.time(20.0);

-    cv::Size size = GET_PARAM(0);
-    int type = GET_PARAM(1);
-    int ksize = GET_PARAM(2);
+    const cv::Size size = GET_PARAM(0);
+    const int type = GET_PARAM(1);
+    const int ksize = GET_PARAM(2);

    cv::Mat src(size, type);
-    fillRandom(src);
+    declare.in(src, WARMUP_RNG);

    if (PERF_RUN_GPU())
    {
-        cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat d_dst;
+        const cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat dst;
        cv::gpu::GpuMat d_buf;

-        cv::gpu::Sobel(d_src, d_dst, -1, 1, 1, d_buf, ksize);
+        TEST_CYCLE() cv::gpu::Sobel(d_src, dst, -1, 1, 1, d_buf, ksize);

-        TEST_CYCLE()
-        {
-            cv::gpu::Sobel(d_src, d_dst, -1, 1, 1, d_buf, ksize);
-        }
-
-        GPU_SANITY_CHECK(d_dst);
+        GPU_SANITY_CHECK(dst);
    }
    else
    {
        cv::Mat dst;

-        cv::Sobel(src, dst, -1, 1, 1, ksize);
-
-        TEST_CYCLE()
-        {
-            cv::Sobel(src, dst, -1, 1, 1, ksize);
-        }
+        TEST_CYCLE() cv::Sobel(src, dst, -1, 1, 1, ksize);

        CPU_SANITY_CHECK(dst);
    }
@@ -101,37 +82,27 @@ PERF_TEST_P(Sz_Type, Filters_Scharr, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U
 {
    declare.time(20.0);

-    cv::Size size = GET_PARAM(0);
-    int type = GET_PARAM(1);
+    const cv::Size size = GET_PARAM(0);
+    const int type = GET_PARAM(1);

    cv::Mat src(size, type);
-    fillRandom(src);
+    declare.in(src, WARMUP_RNG);

    if (PERF_RUN_GPU())
    {
-        cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat d_dst;
+        const cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat dst;
        cv::gpu::GpuMat d_buf;

-        cv::gpu::Scharr(d_src, d_dst, -1, 1, 0, d_buf);
+        TEST_CYCLE() cv::gpu::Scharr(d_src, dst, -1, 1, 0, d_buf);

-        TEST_CYCLE()
-        {
-            cv::gpu::Scharr(d_src, d_dst, -1, 1, 0, d_buf);
-        }
-
-        GPU_SANITY_CHECK(d_dst);
+        GPU_SANITY_CHECK(dst);
    }
    else
    {
        cv::Mat dst;

-        cv::Scharr(src, dst, -1, 1, 0);
-
-        TEST_CYCLE()
-        {
-            cv::Scharr(src, dst, -1, 1, 0);
-        }
+        TEST_CYCLE() cv::Scharr(src, dst, -1, 1, 0);

        CPU_SANITY_CHECK(dst);
    }
@@ -144,38 +115,28 @@ PERF_TEST_P(Sz_Type_KernelSz, Filters_GaussianBlur, Combine(GPU_TYPICAL_MAT_SIZE
 {
    declare.time(20.0);

-    cv::Size size = GET_PARAM(0);
-    int type = GET_PARAM(1);
-    int ksize = GET_PARAM(2);
+    const cv::Size size = GET_PARAM(0);
+    const int type = GET_PARAM(1);
+    const int ksize = GET_PARAM(2);

    cv::Mat src(size, type);
-    fillRandom(src);
+    declare.in(src, WARMUP_RNG);

    if (PERF_RUN_GPU())
    {
-        cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat d_dst;
+        const cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat dst;
        cv::gpu::GpuMat d_buf;

-        cv::gpu::GaussianBlur(d_src, d_dst, cv::Size(ksize, ksize), d_buf, 0.5);
+        TEST_CYCLE() cv::gpu::GaussianBlur(d_src, dst, cv::Size(ksize, ksize), d_buf, 0.5);

-        TEST_CYCLE()
-        {
-            cv::gpu::GaussianBlur(d_src, d_dst, cv::Size(ksize, ksize), d_buf, 0.5);
-        }
-
-        GPU_SANITY_CHECK(d_dst);
+        GPU_SANITY_CHECK(dst);
    }
    else
    {
        cv::Mat dst;

-        cv::GaussianBlur(src, dst, cv::Size(ksize, ksize), 0.5);
-
-        TEST_CYCLE()
-        {
-            cv::GaussianBlur(src, dst, cv::Size(ksize, ksize), 0.5);
-        }
+        TEST_CYCLE() cv::GaussianBlur(src, dst, cv::Size(ksize, ksize), 0.5);

        CPU_SANITY_CHECK(dst);
    }
@@ -188,37 +149,27 @@ PERF_TEST_P(Sz_Type_KernelSz, Filters_Laplacian, Combine(GPU_TYPICAL_MAT_SIZES,
 {
    declare.time(20.0);

-    cv::Size size = GET_PARAM(0);
-    int type = GET_PARAM(1);
-    int ksize = GET_PARAM(2);
+    const cv::Size size = GET_PARAM(0);
+    const int type = GET_PARAM(1);
+    const int ksize = GET_PARAM(2);

    cv::Mat src(size, type);
-    fillRandom(src);
+    declare.in(src, WARMUP_RNG);

    if (PERF_RUN_GPU())
    {
-        cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat d_dst;
+        const cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat dst;

-        cv::gpu::Laplacian(d_src, d_dst, -1, ksize);
+        TEST_CYCLE() cv::gpu::Laplacian(d_src, dst, -1, ksize);

-        TEST_CYCLE()
-        {
-            cv::gpu::Laplacian(d_src, d_dst, -1, ksize);
-        }
-
-        GPU_SANITY_CHECK(d_dst);
+        GPU_SANITY_CHECK(dst);
    }
    else
    {
        cv::Mat dst;

-        cv::Laplacian(src, dst, -1, ksize);
-
-        TEST_CYCLE()
-        {
-            cv::Laplacian(src, dst, -1, ksize);
-        }
+        TEST_CYCLE() cv::Laplacian(src, dst, -1, ksize);

        CPU_SANITY_CHECK(dst);
    }
@@ -231,39 +182,29 @@ PERF_TEST_P(Sz_Type, Filters_Erode, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC
 {
    declare.time(20.0);

-    cv::Size size = GET_PARAM(0);
-    int type = GET_PARAM(1);
+    const cv::Size size = GET_PARAM(0);
+    const int type = GET_PARAM(1);

    cv::Mat src(size, type);
-    fillRandom(src);
+    declare.in(src, WARMUP_RNG);

-    cv::Mat ker = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(3, 3));
+    const cv::Mat ker = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(3, 3));

    if (PERF_RUN_GPU())
    {
-        cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat d_dst;
+        const cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat dst;
        cv::gpu::GpuMat d_buf;

-        cv::gpu::erode(d_src, d_dst, ker, d_buf);
+        TEST_CYCLE() cv::gpu::erode(d_src, dst, ker, d_buf);

-        TEST_CYCLE()
-        {
-            cv::gpu::erode(d_src, d_dst, ker, d_buf);
-        }
-
-        GPU_SANITY_CHECK(d_dst);
+        GPU_SANITY_CHECK(dst);
    }
    else
    {
        cv::Mat dst;

-        cv::erode(src, dst, ker);
-
-        TEST_CYCLE()
-        {
-            cv::erode(src, dst, ker);
-        }
+        TEST_CYCLE() cv::erode(src, dst, ker);

        CPU_SANITY_CHECK(dst);
    }
@@ -276,39 +217,29 @@ PERF_TEST_P(Sz_Type, Filters_Dilate, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U
 {
    declare.time(20.0);

-    cv::Size size = GET_PARAM(0);
-    int type = GET_PARAM(1);
+    const cv::Size size = GET_PARAM(0);
+    const int type = GET_PARAM(1);

    cv::Mat src(size, type);
-    fillRandom(src);
+    declare.in(src, WARMUP_RNG);

-    cv::Mat ker = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(3, 3));
+    const cv::Mat ker = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(3, 3));

    if (PERF_RUN_GPU())
    {
-        cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat d_dst;
+        const cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat dst;
        cv::gpu::GpuMat d_buf;

-        cv::gpu::dilate(d_src, d_dst, ker, d_buf);
+        TEST_CYCLE() cv::gpu::dilate(d_src, dst, ker, d_buf);

-        TEST_CYCLE()
-        {
-            cv::gpu::dilate(d_src, d_dst, ker, d_buf);
-        }
-
-        GPU_SANITY_CHECK(d_dst);
+        GPU_SANITY_CHECK(dst);
    }
    else
    {
        cv::Mat dst;

-        cv::dilate(src, dst, ker);
-
-        TEST_CYCLE()
-        {
-            cv::dilate(src, dst, ker);
-        }
+        TEST_CYCLE() cv::dilate(src, dst, ker);

        CPU_SANITY_CHECK(dst);
    }
@@ -326,41 +257,31 @@ PERF_TEST_P(Sz_Type_Op, Filters_MorphologyEx, Combine(GPU_TYPICAL_MAT_SIZES, Val
 {
    declare.time(20.0);

-    cv::Size size = GET_PARAM(0);
-    int type = GET_PARAM(1);
-    int morphOp = GET_PARAM(2);
+    const cv::Size size = GET_PARAM(0);
+    const int type = GET_PARAM(1);
+    const int morphOp = GET_PARAM(2);

    cv::Mat src(size, type);
-    fillRandom(src);
+    declare.in(src, WARMUP_RNG);

-    cv::Mat ker = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(3, 3));
+    const cv::Mat ker = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(3, 3));

    if (PERF_RUN_GPU())
    {
-        cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat d_dst;
+        const cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat dst;
        cv::gpu::GpuMat d_buf1;
        cv::gpu::GpuMat d_buf2;

-        cv::gpu::morphologyEx(d_src, d_dst, morphOp, ker, d_buf1, d_buf2);
+        TEST_CYCLE() cv::gpu::morphologyEx(d_src, dst, morphOp, ker, d_buf1, d_buf2);

-        TEST_CYCLE()
-        {
-            cv::gpu::morphologyEx(d_src, d_dst, morphOp, ker, d_buf1, d_buf2);
-        }
-
-        GPU_SANITY_CHECK(d_dst);
+        GPU_SANITY_CHECK(dst);
    }
    else
    {
        cv::Mat dst;

-        cv::morphologyEx(src, dst, morphOp, ker);
-
-        TEST_CYCLE()
-        {
-            cv::morphologyEx(src, dst, morphOp, ker);
-        }
+        TEST_CYCLE() cv::morphologyEx(src, dst, morphOp, ker);

        CPU_SANITY_CHECK(dst);
    }
@@ -373,43 +294,31 @@ PERF_TEST_P(Sz_Type_KernelSz, Filters_Filter2D, Combine(GPU_TYPICAL_MAT_SIZES, V
 {
    declare.time(20.0);

-    cv::Size size = GET_PARAM(0);
-    int type = GET_PARAM(1);
-    int ksize = GET_PARAM(2);
+    const cv::Size size = GET_PARAM(0);
+    const int type = GET_PARAM(1);
+    const int ksize = GET_PARAM(2);

    cv::Mat src(size, type);
-    fillRandom(src);
+    declare.in(src, WARMUP_RNG);

    cv::Mat kernel(ksize, ksize, CV_32FC1);
-    fillRandom(kernel, 0.0, 1.0);
+    declare.in(kernel, WARMUP_RNG);

    if (PERF_RUN_GPU())
    {
-        cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat d_dst;
+        const cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat dst;

-        cv::gpu::filter2D(d_src, d_dst, -1, kernel);
+        TEST_CYCLE() cv::gpu::filter2D(d_src, dst, -1, kernel);

-        TEST_CYCLE()
-        {
-            cv::gpu::filter2D(d_src, d_dst, -1, kernel);
-        }
-
-        GPU_SANITY_CHECK(d_dst);
+        GPU_SANITY_CHECK(dst);
    }
    else
    {
        cv::Mat dst;

-        cv::filter2D(src, dst, -1, kernel);
-
-        TEST_CYCLE()
-        {
-            cv::filter2D(src, dst, -1, kernel);
-        }
+        TEST_CYCLE() cv::filter2D(src, dst, -1, kernel);

        CPU_SANITY_CHECK(dst);
    }
 }
-
-} // namespace
--- a/modules/gpu/perf/perf_imgproc.cpp
+++ b/modules/gpu/perf/perf_imgproc.cpp
--- a/modules/gpu/perf/perf_labeling.cpp
+++ b/modules/gpu/perf/perf_labeling.cpp
@@ -3,8 +3,6 @@
 using namespace std;
 using namespace testing;

-namespace {
-
 DEF_PARAM_TEST_1(Image, string);

 struct GreedyLabeling
@@ -100,28 +98,45 @@ struct GreedyLabeling
    dot* stack;
 };

-PERF_TEST_P(Image, Labeling_ConnectedComponents, Values<string>("gpu/labeling/aloe-disp.png"))
+PERF_TEST_P(Image, DISABLED_Labeling_ConnectivityMask,
+            Values<string>("gpu/labeling/aloe-disp.png"))
 {
    declare.time(1.0);

-    cv::Mat image = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
+    const cv::Mat image = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(image.empty());

    if (PERF_RUN_GPU())
    {
+        cv::gpu::GpuMat d_image(image);
        cv::gpu::GpuMat mask;
-        mask.create(image.rows, image.cols, CV_8UC1);
+
+        TEST_CYCLE() cv::gpu::connectivityMask(d_image, mask, cv::Scalar::all(0), cv::Scalar::all(2));
+
+        GPU_SANITY_CHECK(mask);
+    }
+    else
+    {
+        FAIL_NO_CPU();
+    }
+}
+
+PERF_TEST_P(Image, DISABLED_Labeling_ConnectedComponents,
+            Values<string>("gpu/labeling/aloe-disp.png"))
+{
+    declare.time(1.0);
+
+    const cv::Mat image = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(image.empty());
+
+    if (PERF_RUN_GPU())
+    {
+        cv::gpu::GpuMat d_mask;
+        cv::gpu::connectivityMask(cv::gpu::GpuMat(image), d_mask, cv::Scalar::all(0), cv::Scalar::all(2));

        cv::gpu::GpuMat components;
-        components.create(image.rows, image.cols, CV_32SC1);

-        cv::gpu::connectivityMask(cv::gpu::GpuMat(image), mask, cv::Scalar::all(0), cv::Scalar::all(2));
-
-        ASSERT_NO_THROW(cv::gpu::labelComponents(mask, components));
-
-        TEST_CYCLE()
-        {
-            cv::gpu::labelComponents(mask, components);
-        }
+        TEST_CYCLE() cv::gpu::labelComponents(d_mask, components);

        GPU_SANITY_CHECK(components);
    }
@@ -129,17 +144,9 @@ PERF_TEST_P(Image, Labeling_ConnectedComponents, Values<string>("gpu/labeling/al
    {
        GreedyLabeling host(image);

-        host(host._labels);
+        TEST_CYCLE() host(host._labels);

-        declare.time(1.0);
-
-        TEST_CYCLE()
-        {
-            host(host._labels);
-        }
-
-        CPU_SANITY_CHECK(host._labels);
+        cv::Mat components = host._labels;
+        CPU_SANITY_CHECK(components);
    }
 }
-
-} // namespace
--- a/modules/gpu/perf/perf_main.cpp
+++ b/modules/gpu/perf/perf_main.cpp
@@ -1,7 +1,5 @@
 #include "perf_precomp.hpp"

-namespace{
-
 static void printOsInfo()
 {
 #if defined _WIN32
@@ -69,6 +67,4 @@ static void printCudaInfo()
 #endif
 }

-}
-
-CV_PERF_TEST_MAIN(gpu, printCudaInfo())
+CV_PERF_TEST_MAIN(gpu, printCudaInfo())
--- a/modules/gpu/perf/perf_matop.cpp
+++ b/modules/gpu/perf/perf_matop.cpp
@@ -3,137 +3,112 @@
 using namespace std;
 using namespace testing;

-namespace {
-
 //////////////////////////////////////////////////////////////////////
 // SetTo

-PERF_TEST_P(Sz_Depth_Cn, MatOp_SetTo, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, CV_16U, CV_32F, CV_64F), GPU_CHANNELS_1_3_4))
+PERF_TEST_P(Sz_Depth_Cn, MatOp_SetTo,
+            Combine(GPU_TYPICAL_MAT_SIZES,
+                    Values(CV_8U, CV_16U, CV_32F, CV_64F),
+                    GPU_CHANNELS_1_3_4))
 {
-    cv::Size size = GET_PARAM(0);
-    int depth = GET_PARAM(1);
-    int channels = GET_PARAM(2);
+    const cv::Size size = GET_PARAM(0);
+    const int depth = GET_PARAM(1);
+    const int channels = GET_PARAM(2);

-    int type = CV_MAKE_TYPE(depth, channels);
+    const int type = CV_MAKE_TYPE(depth, channels);

-    cv::Scalar val(1, 2, 3, 4);
+    const cv::Scalar val(1, 2, 3, 4);

    if (PERF_RUN_GPU())
    {
-        cv::gpu::GpuMat d_src(size, type);
+        cv::gpu::GpuMat dst(size, type);

-        d_src.setTo(val);
+        TEST_CYCLE() dst.setTo(val);

-        TEST_CYCLE()
-        {
-            d_src.setTo(val);
-        }
-
-        GPU_SANITY_CHECK(d_src);
+        GPU_SANITY_CHECK(dst);
    }
    else
    {
-        cv::Mat src(size, type);
+        cv::Mat dst(size, type);

-        src.setTo(val);
+        TEST_CYCLE() dst.setTo(val);

-        TEST_CYCLE()
-        {
-            src.setTo(val);
-        }
-
-        CPU_SANITY_CHECK(src);
+        CPU_SANITY_CHECK(dst);
    }
 }

 //////////////////////////////////////////////////////////////////////
 // SetToMasked

-PERF_TEST_P(Sz_Depth_Cn, MatOp_SetToMasked, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, CV_16U, CV_32F, CV_64F), GPU_CHANNELS_1_3_4))
+PERF_TEST_P(Sz_Depth_Cn, MatOp_SetToMasked,
+            Combine(GPU_TYPICAL_MAT_SIZES,
+                    Values(CV_8U, CV_16U, CV_32F, CV_64F),
+                    GPU_CHANNELS_1_3_4))
 {
-    cv::Size size = GET_PARAM(0);
-    int depth = GET_PARAM(1);
-    int channels = GET_PARAM(2);
+    const cv::Size size = GET_PARAM(0);
+    const int depth = GET_PARAM(1);
+    const int channels = GET_PARAM(2);

-    int type = CV_MAKE_TYPE(depth, channels);
+    const int type = CV_MAKE_TYPE(depth, channels);

    cv::Mat src(size, type);
-    fillRandom(src);
-
    cv::Mat mask(size, CV_8UC1);
-    fillRandom(mask, 0, 2);
+    declare.in(src, mask, WARMUP_RNG);

-    cv::Scalar val(1, 2, 3, 4);
+    const cv::Scalar val(1, 2, 3, 4);

    if (PERF_RUN_GPU())
    {
-        cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat d_mask(mask);
+        cv::gpu::GpuMat dst(src);
+        const cv::gpu::GpuMat d_mask(mask);

-        d_src.setTo(val, d_mask);
+        TEST_CYCLE() dst.setTo(val, d_mask);

-        TEST_CYCLE()
-        {
-            d_src.setTo(val, d_mask);
-        }
-
-        GPU_SANITY_CHECK(d_src);
+        GPU_SANITY_CHECK(dst, 1e-10);
    }
    else
    {
-        src.setTo(val, mask);
+        cv::Mat dst = src;

-        TEST_CYCLE()
-        {
-            src.setTo(val, mask);
-        }
+        TEST_CYCLE() dst.setTo(val, mask);

-        CPU_SANITY_CHECK(src);
+        CPU_SANITY_CHECK(dst);
    }
 }

 //////////////////////////////////////////////////////////////////////
 // CopyToMasked

-PERF_TEST_P(Sz_Depth_Cn, MatOp_CopyToMasked, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, CV_16U, CV_32F, CV_64F), GPU_CHANNELS_1_3_4))
+PERF_TEST_P(Sz_Depth_Cn, MatOp_CopyToMasked,
+            Combine(GPU_TYPICAL_MAT_SIZES,
+                    Values(CV_8U, CV_16U, CV_32F, CV_64F),
+                    GPU_CHANNELS_1_3_4))
 {
-    cv::Size size = GET_PARAM(0);
-    int depth = GET_PARAM(1);
-    int channels = GET_PARAM(2);
+    const cv::Size size = GET_PARAM(0);
+    const int depth = GET_PARAM(1);
+    const int channels = GET_PARAM(2);

-    int type = CV_MAKE_TYPE(depth, channels);
+    const int type = CV_MAKE_TYPE(depth, channels);

    cv::Mat src(size, type);
-    fillRandom(src);
-
    cv::Mat mask(size, CV_8UC1);
-    fillRandom(mask, 0, 2);
+    declare.in(src, mask, WARMUP_RNG);

    if (PERF_RUN_GPU())
    {
-        cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat d_mask(mask);
-        cv::gpu::GpuMat d_dst;
+        const cv::gpu::GpuMat d_src(src);
+        const cv::gpu::GpuMat d_mask(mask);
+        cv::gpu::GpuMat dst(d_src.size(), d_src.type(), cv::Scalar::all(0));

-        d_src.copyTo(d_dst, d_mask);
+        TEST_CYCLE() d_src.copyTo(dst, d_mask);

-        TEST_CYCLE()
-        {
-            d_src.copyTo(d_dst, d_mask);
-        }
-
-        GPU_SANITY_CHECK(d_dst);
+        GPU_SANITY_CHECK(dst, 1e-10);
    }
    else
    {
-        cv::Mat dst;
+        cv::Mat dst(src.size(), src.type(), cv::Scalar::all(0));

-        src.copyTo(dst, mask);
-
-        TEST_CYCLE()
-        {
-            src.copyTo(dst, mask);
-        }
+        TEST_CYCLE() src.copyTo(dst, mask);

        CPU_SANITY_CHECK(dst);
    }
@@ -144,42 +119,36 @@ PERF_TEST_P(Sz_Depth_Cn, MatOp_CopyToMasked, Combine(GPU_TYPICAL_MAT_SIZES, Valu

 DEF_PARAM_TEST(Sz_2Depth, cv::Size, MatDepth, MatDepth);

-PERF_TEST_P(Sz_2Depth, MatOp_ConvertTo, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, CV_16U, CV_32F, CV_64F), Values(CV_8U, CV_16U, CV_32F, CV_64F)))
+PERF_TEST_P(Sz_2Depth, MatOp_ConvertTo,
+            Combine(GPU_TYPICAL_MAT_SIZES,
+                    Values(CV_8U, CV_16U, CV_32F, CV_64F),
+                    Values(CV_8U, CV_16U, CV_32F, CV_64F)))
 {
-    cv::Size size = GET_PARAM(0);
-    int depth1 = GET_PARAM(1);
-    int depth2 = GET_PARAM(2);
+    const cv::Size size = GET_PARAM(0);
+    const int depth1 = GET_PARAM(1);
+    const int depth2 = GET_PARAM(2);

    cv::Mat src(size, depth1);
-    fillRandom(src);
+    declare.in(src, WARMUP_RNG);
+
+    const double a = 0.5;
+    const double b = 1.0;

    if (PERF_RUN_GPU())
    {
-        cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat d_dst;
+        const cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat dst;

-        d_src.convertTo(d_dst, depth2, 0.5, 1.0);
+        TEST_CYCLE() d_src.convertTo(dst, depth2, a, b);

-        TEST_CYCLE()
-        {
-            d_src.convertTo(d_dst, depth2, 0.5, 1.0);
-        }
-
-        GPU_SANITY_CHECK(d_dst);
+        GPU_SANITY_CHECK(dst, 1e-10);
    }
    else
    {
        cv::Mat dst;

-        src.convertTo(dst, depth2, 0.5, 1.0);
-
-        TEST_CYCLE()
-        {
-            src.convertTo(dst, depth2, 0.5, 1.0);
-        }
+        TEST_CYCLE() src.convertTo(dst, depth2, a, b);

        CPU_SANITY_CHECK(dst);
    }
 }
-
-} // namespace
--- a/modules/gpu/perf/perf_objdetect.cpp
+++ b/modules/gpu/perf/perf_objdetect.cpp
@@ -3,90 +3,47 @@
 using namespace std;
 using namespace testing;

-namespace {
-
 ///////////////////////////////////////////////////////////////
 // HOG

 DEF_PARAM_TEST_1(Image, string);

-PERF_TEST_P(Image, ObjDetect_HOG, Values<string>("gpu/hog/road.png"))
+PERF_TEST_P(Image, ObjDetect_HOG,
+            Values<string>("gpu/hog/road.png",
+                           "gpu/caltech/image_00000009_0.png",
+                           "gpu/caltech/image_00000032_0.png",
+                           "gpu/caltech/image_00000165_0.png",
+                           "gpu/caltech/image_00000261_0.png",
+                           "gpu/caltech/image_00000469_0.png",
+                           "gpu/caltech/image_00000527_0.png",
+                           "gpu/caltech/image_00000574_0.png"))
 {
-    cv::Mat img = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
+    const cv::Mat img = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
    ASSERT_FALSE(img.empty());

-    std::vector<cv::Rect> found_locations;
-
    if (PERF_RUN_GPU())
    {
-        cv::gpu::GpuMat d_img(img);
+        const cv::gpu::GpuMat d_img(img);
+        std::vector<cv::Rect> gpu_found_locations;

        cv::gpu::HOGDescriptor d_hog;
        d_hog.setSVMDetector(cv::gpu::HOGDescriptor::getDefaultPeopleDetector());

-        d_hog.detectMultiScale(d_img, found_locations);
+        TEST_CYCLE() d_hog.detectMultiScale(d_img, gpu_found_locations);

-        TEST_CYCLE()
-        {
-            d_hog.detectMultiScale(d_img, found_locations);
-        }
+        SANITY_CHECK(gpu_found_locations);
    }
    else
    {
+        std::vector<cv::Rect> cpu_found_locations;
+
        cv::HOGDescriptor hog;
        hog.setSVMDetector(cv::gpu::HOGDescriptor::getDefaultPeopleDetector());

-        hog.detectMultiScale(img, found_locations);
+        TEST_CYCLE() hog.detectMultiScale(img, cpu_found_locations);

-        TEST_CYCLE()
-        {
-            hog.detectMultiScale(img, found_locations);
-        }
+        SANITY_CHECK(cpu_found_locations);
    }
-
-    SANITY_CHECK(found_locations);
-}
-
-//===========test for CalTech data =============//
-DEF_PARAM_TEST_1(HOG, string);
-
-PERF_TEST_P(HOG, CalTech, Values<string>("gpu/caltech/image_00000009_0.png", "gpu/caltech/image_00000032_0.png",
-    "gpu/caltech/image_00000165_0.png", "gpu/caltech/image_00000261_0.png", "gpu/caltech/image_00000469_0.png",
-    "gpu/caltech/image_00000527_0.png", "gpu/caltech/image_00000574_0.png"))
-{
-    cv::Mat img = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img.empty());
-
-    std::vector<cv::Rect> found_locations;
-
-    if (PERF_RUN_GPU())
-    {
-        cv::gpu::GpuMat d_img(img);
-
-        cv::gpu::HOGDescriptor d_hog;
-        d_hog.setSVMDetector(cv::gpu::HOGDescriptor::getDefaultPeopleDetector());
-
-        d_hog.detectMultiScale(d_img, found_locations);
-
-        TEST_CYCLE()
-        {
-            d_hog.detectMultiScale(d_img, found_locations);
-        }
-    }
-    else
-    {
-        cv::HOGDescriptor hog;
-        hog.setSVMDetector(cv::gpu::HOGDescriptor::getDefaultPeopleDetector());
-
-        hog.detectMultiScale(img, found_locations);
-
-        TEST_CYCLE()
-        {
-            hog.detectMultiScale(img, found_locations);
-        }
-    }
-
-    SANITY_CHECK(found_locations);
 }

 ///////////////////////////////////////////////////////////////
@@ -96,9 +53,9 @@ typedef pair<string, string> pair_string;
 DEF_PARAM_TEST_1(ImageAndCascade, pair_string);

 PERF_TEST_P(ImageAndCascade, ObjDetect_HaarClassifier,
-    Values<pair_string>(make_pair("gpu/haarcascade/group_1_640x480_VGA.pgm", "gpu/perf/haarcascade_frontalface_alt.xml")))
+            Values<pair_string>(make_pair("gpu/haarcascade/group_1_640x480_VGA.pgm", "gpu/perf/haarcascade_frontalface_alt.xml")))
 {
-    cv::Mat img = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
+    const cv::Mat img = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
    ASSERT_FALSE(img.empty());

    if (PERF_RUN_GPU())
@@ -106,33 +63,28 @@ PERF_TEST_P(ImageAndCascade, ObjDetect_HaarClassifier,
        cv::gpu::CascadeClassifier_GPU d_cascade;
        ASSERT_TRUE(d_cascade.load(perf::TestBase::getDataPath(GetParam().second)));

-        cv::gpu::GpuMat d_img(img);
-        cv::gpu::GpuMat d_objects_buffer;
+        const cv::gpu::GpuMat d_img(img);
+        cv::gpu::GpuMat objects_buffer;
+        int detections_num = 0;

-        d_cascade.detectMultiScale(d_img, d_objects_buffer);
+        TEST_CYCLE() detections_num = d_cascade.detectMultiScale(d_img, objects_buffer);

-        TEST_CYCLE()
-        {
-            d_cascade.detectMultiScale(d_img, d_objects_buffer);
-        }
-
-        GPU_SANITY_CHECK(d_objects_buffer);
+        std::vector<cv::Rect> gpu_rects(detections_num);
+        cv::Mat gpu_rects_mat(1, detections_num, cv::DataType<cv::Rect>::type, &gpu_rects[0]);
+        objects_buffer.colRange(0, detections_num).download(gpu_rects_mat);
+        cv::groupRectangles(gpu_rects, 3, 0.2);
+        SANITY_CHECK(gpu_rects);
    }
    else
    {
        cv::CascadeClassifier cascade;
        ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath("gpu/perf/haarcascade_frontalface_alt.xml")));

-        std::vector<cv::Rect> rects;
+        std::vector<cv::Rect> cpu_rects;

-        cascade.detectMultiScale(img, rects);
+        TEST_CYCLE() cascade.detectMultiScale(img, cpu_rects);

-        TEST_CYCLE()
-        {
-            cascade.detectMultiScale(img, rects);
-        }
-
-        CPU_SANITY_CHECK(rects);
+        SANITY_CHECK(cpu_rects);
    }
 }

@@ -140,9 +92,9 @@ PERF_TEST_P(ImageAndCascade, ObjDetect_HaarClassifier,
 // LBP cascade

 PERF_TEST_P(ImageAndCascade, ObjDetect_LBPClassifier,
-    Values<pair_string>(make_pair("gpu/haarcascade/group_1_640x480_VGA.pgm", "gpu/lbpcascade/lbpcascade_frontalface.xml")))
+            Values<pair_string>(make_pair("gpu/haarcascade/group_1_640x480_VGA.pgm", "gpu/lbpcascade/lbpcascade_frontalface.xml")))
 {
-    cv::Mat img = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
+    const cv::Mat img = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
    ASSERT_FALSE(img.empty());

    if (PERF_RUN_GPU())
@@ -150,34 +102,27 @@ PERF_TEST_P(ImageAndCascade, ObjDetect_LBPClassifier,
        cv::gpu::CascadeClassifier_GPU d_cascade;
        ASSERT_TRUE(d_cascade.load(perf::TestBase::getDataPath(GetParam().second)));

-        cv::gpu::GpuMat d_img(img);
-        cv::gpu::GpuMat d_gpu_rects;
+        const cv::gpu::GpuMat d_img(img);
+        cv::gpu::GpuMat objects_buffer;
+        int detections_num = 0;

-        d_cascade.detectMultiScale(d_img, d_gpu_rects);
+        TEST_CYCLE() detections_num = d_cascade.detectMultiScale(d_img, objects_buffer);

-        TEST_CYCLE()
-        {
-            d_cascade.detectMultiScale(d_img, d_gpu_rects);
-        }
-
-        GPU_SANITY_CHECK(d_gpu_rects);
+        std::vector<cv::Rect> gpu_rects(detections_num);
+        cv::Mat gpu_rects_mat(1, detections_num, cv::DataType<cv::Rect>::type, &gpu_rects[0]);
+        objects_buffer.colRange(0, detections_num).download(gpu_rects_mat);
+        cv::groupRectangles(gpu_rects, 3, 0.2);
+        SANITY_CHECK(gpu_rects);
    }
    else
    {
        cv::CascadeClassifier cascade;
        ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath("gpu/lbpcascade/lbpcascade_frontalface.xml")));

-        std::vector<cv::Rect> rects;
+        std::vector<cv::Rect> cpu_rects;

-        cascade.detectMultiScale(img, rects);
+        TEST_CYCLE() cascade.detectMultiScale(img, cpu_rects);

-        TEST_CYCLE()
-        {
-            cascade.detectMultiScale(img, rects);
-        }
-
-        CPU_SANITY_CHECK(rects);
+        SANITY_CHECK(cpu_rects);
    }
 }
-
-} // namespace
--- a/modules/gpu/perf/perf_video.cpp
+++ b/modules/gpu/perf/perf_video.cpp
--- a/modules/gpu/perf/utility.cpp
+++ b/modules/gpu/perf/utility.cpp
@@ -2,13 +2,6 @@

 using namespace std;
 using namespace cv;
-using namespace cv::gpu;
-
-void fillRandom(Mat& m, double a, double b)
-{
-    RNG rng(123456789);
-    rng.fill(m, RNG::UNIFORM, Scalar::all(a), Scalar::all(b));
-}

 Mat readImage(const string& fileName, int flags)
 {
@@ -188,4 +181,4 @@ void PrintTo(const CvtColorInfo& info, ostream* os)
    };

    *os << str[info.code];
-}
+}
--- a/modules/gpu/perf/utility.hpp
+++ b/modules/gpu/perf/utility.hpp
@@ -2,11 +2,9 @@
 #define __OPENCV_PERF_GPU_UTILITY_HPP__

 #include "opencv2/core/core.hpp"
-#include "opencv2/core/gpumat.hpp"
 #include "opencv2/imgproc/imgproc.hpp"
 #include "opencv2/ts/ts_perf.hpp"

-void fillRandom(cv::Mat& m, double a = 0.0, double b = 255.0);
 cv::Mat readImage(const std::string& fileName, int flags = cv::IMREAD_COLOR);

 using perf::MatType;
@@ -17,12 +15,13 @@ CV_ENUM(BorderMode, cv::BORDER_REFLECT101, cv::BORDER_REPLICATE, cv::BORDER_CONS

 CV_ENUM(Interpolation, cv::INTER_NEAREST, cv::INTER_LINEAR, cv::INTER_CUBIC, cv::INTER_AREA)
 #define ALL_INTERPOLATIONS testing::ValuesIn(Interpolation::all())
+
 CV_ENUM(NormType, cv::NORM_INF, cv::NORM_L1, cv::NORM_L2, cv::NORM_HAMMING, cv::NORM_MINMAX)

-const int Gray = 1, TwoChannel = 2, BGR = 3, BGRA = 4;
+enum { Gray = 1, TwoChannel = 2, BGR = 3, BGRA = 4 };
 CV_ENUM(MatCn, Gray, TwoChannel, BGR, BGRA)
-#define GPU_CHANNELS_1_3_4 testing::Values(Gray, BGR, BGRA)
-#define GPU_CHANNELS_1_3 testing::Values(Gray, BGR)
+#define GPU_CHANNELS_1_3_4 testing::Values(MatCn(Gray), MatCn(BGR), MatCn(BGRA))
+#define GPU_CHANNELS_1_3 testing::Values(MatCn(Gray), MatCn(BGR))

 struct CvtColorInfo
 {
@@ -30,7 +29,8 @@ struct CvtColorInfo
    int dcn;
    int code;

-    explicit CvtColorInfo(int scn_=0, int dcn_=0, int code_=0) : scn(scn_), dcn(dcn_), code(code_) {}
+    CvtColorInfo() : scn(0), dcn(0), code(0) {} // zero-init, matching the old all-defaults ctor
+    explicit CvtColorInfo(int scn_, int dcn_, int code_) : scn(scn_), dcn(dcn_), code(code_) {}
 };
 void PrintTo(const CvtColorInfo& info, std::ostream* os);

@@ -46,39 +46,18 @@ DEF_PARAM_TEST(Sz_Depth_Cn, cv::Size, MatDepth, MatCn);

 #define GPU_TYPICAL_MAT_SIZES testing::Values(perf::sz720p, perf::szSXGA, perf::sz1080p)

-#define GPU_SANITY_CHECK(dmat, ...) \
+#define FAIL_NO_CPU() FAIL() << "No such CPU implementation analogy"
+
+#define GPU_SANITY_CHECK(mat, ...) \
    do{ \
-        cv::Mat d##dmat(dmat); \
-        SANITY_CHECK(d##dmat, ## __VA_ARGS__); \
+        cv::Mat gpu_##mat(mat); \
+        SANITY_CHECK(gpu_##mat, ## __VA_ARGS__); \
    } while(0)

-#define CPU_SANITY_CHECK(cmat, ...) \
+#define CPU_SANITY_CHECK(mat, ...) \
    do{ \
-        SANITY_CHECK(cmat, ## __VA_ARGS__); \
+        cv::Mat cpu_##mat(mat); \
+        SANITY_CHECK(cpu_##mat, ## __VA_ARGS__); \
    } while(0)

-#define GPU_SANITY_CHECK_KEYPOINTS(alg, dmat, ...)                                          \
-    do{                                                                                     \
-        cv::Mat d##dmat(dmat);                                                              \
-        cv::Mat __pt_x      = d##dmat.row(cv::gpu::alg##_GPU::X_ROW);                       \
-        cv::Mat __pt_y      = d##dmat.row(cv::gpu::alg##_GPU::Y_ROW);                       \
-        cv::Mat __angle     = d##dmat.row(cv::gpu::alg##_GPU::ANGLE_ROW);                   \
-        cv::Mat __octave    = d##dmat.row(cv::gpu::alg##_GPU::OCTAVE_ROW);                               \
-        cv::Mat __size      = d##dmat.row(cv::gpu::alg##_GPU::SIZE_ROW);                                 \
-        ::perf::Regression::add(this, std::string(#dmat) + "-pt-x-row",     __pt_x,     ## __VA_ARGS__); \
-        ::perf::Regression::add(this, std::string(#dmat) + "-pt-y-row",     __pt_y,     ## __VA_ARGS__); \
-        ::perf::Regression::add(this, std::string(#dmat) + "-angle-row",    __angle,    ## __VA_ARGS__); \
-        ::perf::Regression::add(this, std::string(#dmat) + "octave-row",    __octave,   ## __VA_ARGS__); \
-        ::perf::Regression::add(this, std::string(#dmat) + "-pt-size-row",  __size,     ## __VA_ARGS__); \
-    } while(0)
-
-#define GPU_SANITY_CHECK_RESPONSE(alg, dmat, ...) \
-    do{                                                                                     \
-        cv::Mat d##dmat(dmat);                                                              \
-        cv::Mat __response  = d##dmat.row(cv::gpu::alg##_GPU::RESPONSE_ROW);                \
-        ::perf::Regression::add(this, std::string(#dmat) + "-response-row", __response, ## __VA_ARGS__); \
-    } while(0)
-
-#define FAIL_NO_CPU()   FAIL() << "No such CPU implementation analogy"
-
 #endif // __OPENCV_PERF_GPU_UTILITY_HPP__
--- a/modules/gpu/src/cuda/bgfg_mog.cu
+++ b/modules/gpu/src/cuda/bgfg_mog.cu
@@ -648,7 +648,7 @@ namespace cv { namespace gpu { namespace device
                    tWeight += gmm_weight(mode * frame.rows + y, x);
                    if (tWeight > c_TB)
                        break;
-                };
+                }
            }

            fgmask(y, x) = background ? 0 : isShadow ? c_shadowVal : 255;
@@ -761,4 +761,4 @@ namespace cv { namespace gpu { namespace device
 }}}


-#endif /* CUDA_DISABLER */
+#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/ccomponetns.cu
+++ b/modules/gpu/src/cuda/ccomponetns.cu
@@ -194,10 +194,10 @@ namespace cv { namespace gpu { namespace device
            if ( y > 0 && connected(intensity, image(y - 1, x)))
                c |= UP;

-            if ( x - 1 < image.cols && connected(intensity, image(y, x + 1)))
+            if ( x + 1 < image.cols && connected(intensity, image(y, x + 1)))
                c |= RIGHT;

-            if ( y - 1 < image.rows && connected(intensity, image(y + 1, x)))
+            if ( y + 1 < image.rows && connected(intensity, image(y + 1, x)))
                c |= DOWN;

            components(y, x) = c;
--- a/modules/gpu/src/cuda/element_operations.cu
+++ b/modules/gpu/src/cuda/element_operations.cu
@@ -2284,15 +2284,18 @@ namespace arithm

    template void bitScalarAnd<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
    template void bitScalarAnd<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
-    template void bitScalarAnd<uint>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
+    template void bitScalarAnd<int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
+    template void bitScalarAnd<unsigned int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);

    template void bitScalarOr<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
    template void bitScalarOr<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
-    template void bitScalarOr<uint>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
+    template void bitScalarOr<int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
+    template void bitScalarOr<unsigned int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);

    template void bitScalarXor<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
    template void bitScalarXor<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
-    template void bitScalarXor<uint>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
+    template void bitScalarXor<int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
+    template void bitScalarXor<unsigned int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
 }

 //////////////////////////////////////////////////////////////////////////
--- a/modules/gpu/src/element_operations.cpp
+++ b/modules/gpu/src/element_operations.cpp
@@ -2280,11 +2280,11 @@ namespace
 {
    typedef void (*bit_scalar_func_t)(PtrStepSzb src1, unsigned int src2, PtrStepSzb dst, cudaStream_t stream);

-    template <bit_scalar_func_t func> struct BitScalar
+    template <typename T, bit_scalar_func_t func> struct BitScalar
    {
        static void call(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream)
        {
-            func(src, static_cast<unsigned int>(sc.val[0]), dst, stream);
+            func(src, saturate_cast<T>(sc.val[0]), dst, stream);
        }
    };

@@ -2292,14 +2292,12 @@ namespace
    {
        static void call(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream)
        {
-            Scalar_<unsigned int> isc = sc;
-
            unsigned int packedVal = 0;

-            packedVal |= (isc.val[0] & 0xffff);
-            packedVal |= (isc.val[1] & 0xffff) << 8;
-            packedVal |= (isc.val[2] & 0xffff) << 16;
-            packedVal |= (isc.val[3] & 0xffff) << 24;
+            packedVal |= static_cast<unsigned int>(saturate_cast<unsigned char>(sc.val[0])); // already 0..255, so no mask is needed
+            packedVal |= static_cast<unsigned int>(saturate_cast<unsigned char>(sc.val[1])) << 8; // widen to unsigned before shifting
+            packedVal |= static_cast<unsigned int>(saturate_cast<unsigned char>(sc.val[2])) << 16;
+            packedVal |= static_cast<unsigned int>(saturate_cast<unsigned char>(sc.val[3])) << 24; // unsigned shift: bit 31 is set without signed overflow

            func(src, packedVal, dst, stream);
        }
@@ -2330,7 +2328,7 @@ namespace
            oSizeROI.width = src.cols;
            oSizeROI.height = src.rows;

-            const npp_t pConstants[] = {static_cast<npp_t>(sc.val[0]), static_cast<npp_t>(sc.val[1]), static_cast<npp_t>(sc.val[2]), static_cast<npp_t>(sc.val[3])};
+            const npp_t pConstants[] = {saturate_cast<npp_t>(sc.val[0]), saturate_cast<npp_t>(sc.val[1]), saturate_cast<npp_t>(sc.val[2]), saturate_cast<npp_t>(sc.val[3])};

            nppSafeCall( func(src.ptr<npp_t>(), static_cast<int>(src.step), pConstants, dst.ptr<npp_t>(), static_cast<int>(dst.step), oSizeROI) );

@@ -2350,7 +2348,7 @@ namespace
            oSizeROI.width = src.cols;
            oSizeROI.height = src.rows;

-            nppSafeCall( func(src.ptr<npp_t>(), static_cast<int>(src.step), static_cast<npp_t>(sc.val[0]), dst.ptr<npp_t>(), static_cast<int>(dst.step), oSizeROI) );
+            nppSafeCall( func(src.ptr<npp_t>(), static_cast<int>(src.step), saturate_cast<npp_t>(sc.val[0]), dst.ptr<npp_t>(), static_cast<int>(dst.step), oSizeROI) );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
@@ -2365,11 +2363,11 @@ void cv::gpu::bitwise_and(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stre
    typedef void (*func_t)(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream);
    static const func_t funcs[5][4] =
    {
-        {BitScalar< bitScalarAnd<unsigned char> >::call , 0, NppBitwiseC<CV_8U , 3, nppiAndC_8u_C3R >::call, BitScalar4< bitScalarAnd<unsigned int> >::call},
+        {BitScalar<unsigned char, bitScalarAnd<unsigned char> >::call  , 0, NppBitwiseC<CV_8U , 3, nppiAndC_8u_C3R >::call, BitScalar4< bitScalarAnd<unsigned int> >::call},
        {0,0,0,0},
-        {BitScalar< bitScalarAnd<unsigned short> >::call, 0, NppBitwiseC<CV_16U, 3, nppiAndC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiAndC_16u_C4R>::call},
+        {BitScalar<unsigned short, bitScalarAnd<unsigned short> >::call, 0, NppBitwiseC<CV_16U, 3, nppiAndC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiAndC_16u_C4R>::call},
        {0,0,0,0},
-        {BitScalar< bitScalarAnd<unsigned int> >::call  , 0, NppBitwiseC<CV_32S, 3, nppiAndC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiAndC_32s_C4R>::call}
+        {BitScalar<int, bitScalarAnd<int> >::call                      , 0, NppBitwiseC<CV_32S, 3, nppiAndC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiAndC_32s_C4R>::call}
    };

    const int depth = src.depth();
@@ -2390,11 +2388,11 @@ void cv::gpu::bitwise_or(const GpuMat& src, const Scalar& sc, GpuMat& dst, Strea
    typedef void (*func_t)(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream);
    static const func_t funcs[5][4] =
    {
-        {BitScalar< bitScalarOr<unsigned char> >::call , 0, NppBitwiseC<CV_8U , 3, nppiOrC_8u_C3R >::call, BitScalar4< bitScalarOr<unsigned int> >::call},
+        {BitScalar<unsigned char, bitScalarOr<unsigned char> >::call  , 0, NppBitwiseC<CV_8U , 3, nppiOrC_8u_C3R >::call, BitScalar4< bitScalarOr<unsigned int> >::call},
        {0,0,0,0},
-        {BitScalar< bitScalarOr<unsigned short> >::call, 0, NppBitwiseC<CV_16U, 3, nppiOrC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiOrC_16u_C4R>::call},
+        {BitScalar<unsigned short, bitScalarOr<unsigned short> >::call, 0, NppBitwiseC<CV_16U, 3, nppiOrC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiOrC_16u_C4R>::call},
        {0,0,0,0},
-        {BitScalar< bitScalarOr<unsigned int> >::call  , 0, NppBitwiseC<CV_32S, 3, nppiOrC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiOrC_32s_C4R>::call}
+        {BitScalar<int, bitScalarOr<int> >::call                      , 0, NppBitwiseC<CV_32S, 3, nppiOrC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiOrC_32s_C4R>::call}
    };

    const int depth = src.depth();
@@ -2415,11 +2413,11 @@ void cv::gpu::bitwise_xor(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stre
    typedef void (*func_t)(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream);
    static const func_t funcs[5][4] =
    {
-        {BitScalar< bitScalarXor<unsigned char> >::call , 0, NppBitwiseC<CV_8U , 3, nppiXorC_8u_C3R >::call, BitScalar4< bitScalarXor<unsigned int> >::call},
+        {BitScalar<unsigned char, bitScalarXor<unsigned char> >::call  , 0, NppBitwiseC<CV_8U , 3, nppiXorC_8u_C3R >::call, BitScalar4< bitScalarXor<unsigned int> >::call},
        {0,0,0,0},
-        {BitScalar< bitScalarXor<unsigned short> >::call, 0, NppBitwiseC<CV_16U, 3, nppiXorC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiXorC_16u_C4R>::call},
+        {BitScalar<unsigned short, bitScalarXor<unsigned short> >::call, 0, NppBitwiseC<CV_16U, 3, nppiXorC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiXorC_16u_C4R>::call},
        {0,0,0,0},
-        {BitScalar< bitScalarXor<unsigned int> >::call  , 0, NppBitwiseC<CV_32S, 3, nppiXorC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiXorC_32s_C4R>::call}
+        {BitScalar<int, bitScalarXor<int> >::call                      , 0, NppBitwiseC<CV_32S, 3, nppiXorC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiXorC_32s_C4R>::call}
    };

    const int depth = src.depth();
--- a/modules/gpu/src/graphcuts.cpp
+++ b/modules/gpu/src/graphcuts.cpp
@@ -104,12 +104,12 @@ void cv::gpu::connectivityMask(const GpuMat& image, GpuMat& mask, const cv::Scal

 void cv::gpu::labelComponents(const GpuMat& mask, GpuMat& components, int flags, Stream& s)
 {
-    if (!TargetArchs::builtWith(SHARED_ATOMICS) || !DeviceInfo().supports(SHARED_ATOMICS))
-        CV_Error(CV_StsNotImplemented, "The device doesn't support shared atomics and communicative synchronization!");
    CV_Assert(!mask.empty() && mask.type() == CV_8U);

-    if (mask.size() != components.size() || components.type() != CV_32SC1)
-        components.create(mask.size(), CV_32SC1);
+    if (!deviceSupports(SHARED_ATOMICS))
+        CV_Error(CV_StsNotImplemented, "The device doesn't support shared atomics and communicative synchronization!");
+
+    components.create(mask.size(), CV_32SC1);

    cudaStream_t stream = StreamAccessor::getStream(s);
    device::ccl::labelComponents(mask, components, flags, stream);
--- a/modules/gpu/src/imgproc.cpp
+++ b/modules/gpu/src/imgproc.cpp
@@ -517,6 +517,7 @@ void cv::gpu::rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, d
    CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC);

    dst.create(dsize, src.type());
+    dst.setTo(Scalar::all(0));

    funcs[src.depth()][src.channels() - 1](src, dst, dsize, angle, xShift, yShift, interpolation, StreamAccessor::getStream(stream));
 }
--- a/modules/gpu/src/mssegmentation.cpp
+++ b/modules/gpu/src/mssegmentation.cpp
@@ -380,6 +380,7 @@ void cv::gpu::meanShiftSegmentation(const GpuMat& src, Mat& dst, int sp, int sr,
            dstcol[0] = static_cast<uchar>(sumcol[0] / comps.size[parent]);
            dstcol[1] = static_cast<uchar>(sumcol[1] / comps.size[parent]);
            dstcol[2] = static_cast<uchar>(sumcol[2] / comps.size[parent]);
+            dstcol[3] = 255;
        }
    }
 }
--- a/modules/gpu/src/pyrlk.cpp
+++ b/modules/gpu/src/pyrlk.cpp
@@ -206,6 +206,8 @@ void cv::gpu::PyrLKOpticalFlow::dense(const GpuMat& prevImg, const GpuMat& nextI
    ensureSizeIsEnough(prevImg.size(), CV_32FC1, vPyr_[0]);
    ensureSizeIsEnough(prevImg.size(), CV_32FC1, uPyr_[1]);
    ensureSizeIsEnough(prevImg.size(), CV_32FC1, vPyr_[1]);
+    uPyr_[0].setTo(Scalar::all(0));
+    vPyr_[0].setTo(Scalar::all(0));
    uPyr_[1].setTo(Scalar::all(0));
    vPyr_[1].setTo(Scalar::all(0));

--- a/modules/gpu/src/warp.cpp
+++ b/modules/gpu/src/warp.cpp
@@ -232,10 +232,8 @@ void cv::gpu::warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsiz
    };

    bool useNpp = borderMode == BORDER_CONSTANT && ofs.x == 0 && ofs.y == 0 && useNppTab[src.depth()][src.channels() - 1][interpolation];
-    #ifdef linux
-        // NPP bug on float data
-        useNpp = useNpp && src.depth() != CV_32F;
-    #endif
+    // NPP bug on float data
+    useNpp = useNpp && src.depth() != CV_32F;

    if (useNpp)
    {
@@ -372,10 +370,8 @@ void cv::gpu::warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size
    };

    bool useNpp = borderMode == BORDER_CONSTANT && ofs.x == 0 && ofs.y == 0 && useNppTab[src.depth()][src.channels() - 1][interpolation];
-    #ifdef linux
-        // NPP bug on float data
-        useNpp = useNpp && src.depth() != CV_32F;
-    #endif
+    // NPP bug on float data
+    useNpp = useNpp && src.depth() != CV_32F;

    if (useNpp)
    {
--- a/modules/gpu/test/test_bgfg.cpp
+++ b/modules/gpu/test/test_bgfg.cpp
@@ -207,11 +207,17 @@ INSTANTIATE_TEST_CASE_P(GPU_Video, MOG, testing::Combine(
 //////////////////////////////////////////////////////
 // MOG2

-PARAM_TEST_CASE(MOG2, cv::gpu::DeviceInfo, std::string, UseGray, UseRoi)
+namespace
+{
+    IMPLEMENT_PARAM_CLASS(DetectShadow, bool)
+}
+
+PARAM_TEST_CASE(MOG2, cv::gpu::DeviceInfo, std::string, UseGray, DetectShadow, UseRoi)
 {
    cv::gpu::DeviceInfo devInfo;
    std::string inputFile;
    bool useGray;
+    bool detectShadow;
    bool useRoi;

    virtual void SetUp()
@@ -220,10 +226,9 @@ PARAM_TEST_CASE(MOG2, cv::gpu::DeviceInfo, std::string, UseGray, UseRoi)
        cv::gpu::setDevice(devInfo.deviceID());

        inputFile = std::string(cvtest::TS::ptr()->get_data_path()) + "video/" + GET_PARAM(1);
-
        useGray = GET_PARAM(2);
-
-        useRoi = GET_PARAM(3);
+        detectShadow = GET_PARAM(3);
+        useRoi = GET_PARAM(4);
    }
 };

@@ -237,9 +242,11 @@ GPU_TEST_P(MOG2, Update)
    ASSERT_FALSE(frame.empty());

    cv::gpu::MOG2_GPU mog2;
+    mog2.bShadowDetection = detectShadow;
    cv::gpu::GpuMat foreground = createMat(frame.size(), CV_8UC1, useRoi);

    cv::BackgroundSubtractorMOG2 mog2_gold;
+    mog2_gold.set("detectShadows", detectShadow);
    cv::Mat foreground_gold;

    for (int i = 0; i < 10; ++i)
@@ -258,11 +265,14 @@ GPU_TEST_P(MOG2, Update)

        mog2_gold(frame, foreground_gold);

-        double norm = cv::norm(foreground_gold, cv::Mat(foreground), cv::NORM_L1);
-
-        norm /= foreground_gold.size().area();
-
-        ASSERT_LE(norm, 0.09);
+        if (detectShadow)
+        {
+            ASSERT_MAT_SIMILAR(foreground_gold, foreground, 1e-2);
+        }
+        else
+        {
+            ASSERT_MAT_NEAR(foreground_gold, foreground, 0);
+        }
    }
 }

@@ -277,9 +287,11 @@ GPU_TEST_P(MOG2, getBackgroundImage)
    cv::Mat frame;

    cv::gpu::MOG2_GPU mog2;
+    mog2.bShadowDetection = detectShadow;
    cv::gpu::GpuMat foreground;

    cv::BackgroundSubtractorMOG2 mog2_gold;
+    mog2_gold.set("detectShadows", detectShadow);
    cv::Mat foreground_gold;

    for (int i = 0; i < 10; ++i)
@@ -305,6 +317,7 @@ INSTANTIATE_TEST_CASE_P(GPU_Video, MOG2, testing::Combine(
    ALL_DEVICES,
    testing::Values(std::string("768x576.avi")),
    testing::Values(UseGray(true), UseGray(false)),
+    testing::Values(DetectShadow(true), DetectShadow(false)),
    WHOLE_SUBMAT));

 //////////////////////////////////////////////////////
--- a/modules/gpu/test/test_core.cpp
+++ b/modules/gpu/test/test_core.cpp
@@ -1873,7 +1873,7 @@ PARAM_TEST_CASE(Bitwise_Scalar, cv::gpu::DeviceInfo, cv::Size, MatDepth, Channel
        cv::gpu::setDevice(devInfo.deviceID());

        src = randomMat(size, CV_MAKE_TYPE(depth, channels));
-        cv::Scalar_<int> ival = randomScalar(0.0, 255.0);
+        cv::Scalar_<int> ival = randomScalar(0.0, std::numeric_limits<int>::max());
        val = ival;
    }
 };
--- a/modules/gpu/test/test_warp_affine.cpp
+++ b/modules/gpu/test/test_warp_affine.cpp
@@ -252,6 +252,8 @@ PARAM_TEST_CASE(WarpAffineNPP, cv::gpu::DeviceInfo, MatType, Inverse, Interpolat
 GPU_TEST_P(WarpAffineNPP, Accuracy)
 {
    cv::Mat src = readImageType("stereobp/aloe-L.png", type);
+    ASSERT_FALSE(src.empty());
+
    cv::Mat M = createTransfomMatrix(src.size(), CV_PI / 4);
    int flags = interpolation;
    if (inverse)
--- a/modules/gpu/test/test_warp_perspective.cpp
+++ b/modules/gpu/test/test_warp_perspective.cpp
@@ -255,6 +255,8 @@ PARAM_TEST_CASE(WarpPerspectiveNPP, cv::gpu::DeviceInfo, MatType, Inverse, Inter
 GPU_TEST_P(WarpPerspectiveNPP, Accuracy)
 {
    cv::Mat src = readImageType("stereobp/aloe-L.png", type);
+    ASSERT_FALSE(src.empty());
+
    cv::Mat M = createTransfomMatrix(src.size(), CV_PI / 4);
    int flags = interpolation;
    if (inverse)
--- a/modules/highgui/CMakeLists.txt
+++ b/modules/highgui/CMakeLists.txt
@@ -275,7 +275,7 @@ if(WIN32 AND WITH_FFMPEG)
                       COMMAND ${CMAKE_COMMAND} -E copy "${ffmpeg_path}" "${EXECUTABLE_OUTPUT_PATH}/Release/${ffmpeg_bare_name_ver}"
                       COMMAND ${CMAKE_COMMAND} -E copy "${ffmpeg_path}" "${EXECUTABLE_OUTPUT_PATH}/Debug/${ffmpeg_bare_name_ver}"
                       COMMENT "Copying ${ffmpeg_path} to the output directory")
-  elseif(MSVC)
+  elseif(MSVC AND (CMAKE_GENERATOR MATCHES "Visual"))
    add_custom_command(TARGET ${the_module} POST_BUILD
                       COMMAND ${CMAKE_COMMAND} -E copy "${ffmpeg_path}" "${EXECUTABLE_OUTPUT_PATH}/${CMAKE_BUILD_TYPE}/${ffmpeg_bare_name_ver}"
                       COMMENT "Copying ${ffmpeg_path} to the output directory")
--- a/modules/java/CMakeLists.txt
+++ b/modules/java/CMakeLists.txt
@@ -217,6 +217,12 @@ endif(ANDROID AND ANDROID_EXECUTABLE)

 set(step3_depends ${step2_depends} ${step3_input_files} ${copied_files})

+if(ANDROID)
+  set(LIB_NAME_SUFIX "")
+else()
+  set(LIB_NAME_SUFIX "${OPENCV_VERSION_MAJOR}${OPENCV_VERSION_MINOR}${OPENCV_VERSION_PATCH}")
+endif()
+
 # step 4: build jar
 if(ANDROID)
  set(JAR_FILE "${OpenCV_BINARY_DIR}/bin/classes.jar")
@@ -241,7 +247,7 @@ if(ANDROID)
                      )
  endif()
 else(ANDROID)
-  set(JAR_NAME opencv-${OPENCV_VERSION}.jar)
+  set(JAR_NAME opencv-${LIB_NAME_SUFIX}.jar)
  set(JAR_FILE "${OpenCV_BINARY_DIR}/bin/${JAR_NAME}")
  configure_file("${CMAKE_CURRENT_SOURCE_DIR}/build.xml.in" "${OpenCV_BINARY_DIR}/build.xml" IMMEDIATE @ONLY)
  list(APPEND step3_depends "${OpenCV_BINARY_DIR}/build.xml")
@@ -294,8 +300,8 @@ endif()

 # Additional target properties
 set_target_properties(${the_module} PROPERTIES
-    OUTPUT_NAME "${the_module}${OPENCV_DLLVERSION}"
-    DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}"
+    OUTPUT_NAME "${the_module}${LIB_NAME_SUFIX}"
+    #DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}"
    ARCHIVE_OUTPUT_DIRECTORY ${LIBRARY_OUTPUT_PATH}
    RUNTIME_OUTPUT_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}
    INSTALL_NAME_DIR ${OPENCV_LIB_INSTALL_PATH}
--- a/modules/java/generator/gen_java.py
+++ b/modules/java/generator/gen_java.py
@@ -557,6 +557,15 @@ func_arg_fix = {
    }, # '', i.e. no class
 } # func_arg_fix

+
+def getLibVersion(version_hpp_path):
+    """Return (epoch, major, minor, revision) strings parsed from the CV_VERSION_* defines in version.hpp."""
+    with open(version_hpp_path, "rt") as version_hpp:
+        version_file = version_hpp.read()
+    def define_value(name):  # leading \W* (not a literal 'W') tolerates indentation before '#'
+        return re.search(r"^\W*#\W*define\W+CV_VERSION_" + name + r"\W+(\d+)\W*$", version_file, re.MULTILINE).group(1)
+    return tuple(define_value(part) for part in ("EPOCH", "MAJOR", "MINOR", "REVISION"))
+
 class ConstInfo(object):
    def __init__(self, cname, name, val, addedManually=False):
        self.cname = cname
@@ -719,13 +728,16 @@ $imports
 public class %(jc)s {
 """ % { 'm' : self.module, 'jc' : jname } )

-#        self.java_code[class_name]["jn_code"].write("""
-#    //
-#    // native stuff
-#    //
-#    static { System.loadLibrary("opencv_java"); }
-#""" )
-
+        if class_name == 'Core':
+            (epoch, major, minor, revision) = getLibVersion(
+                (os.path.dirname(__file__) or '.') + '/../../core/include/opencv2/core/version.hpp')
+            version_str    = '.'.join( (epoch, major, minor, revision) )
+            version_suffix =  ''.join( (epoch, major, minor) )
+            self.classes[class_name].imports.add("java.lang.String")
+            self.java_code[class_name]["j_code"].write("""
+    public static final String VERSION = "%(v)s", NATIVE_LIBRARY_NAME = "opencv_java%(vs)s";
+    public static final int VERSION_EPOCH = %(ep)s, VERSION_MAJOR = %(ma)s, VERSION_MINOR = %(mi)s, VERSION_REVISION = %(re)s;
+""" % { 'v' : version_str, 'vs' : version_suffix, 'ep' : epoch, 'ma' : major, 'mi' : minor, 're' : revision } )


    def add_class(self, decl):
--- a/modules/ocl/src/arithm.cpp
+++ b/modules/ocl/src/arithm.cpp
@@ -2122,12 +2122,16 @@ void cv::ocl::addWeighted(const oclMat &src1, double alpha, const oclMat &src2,
                              };

    int dst_step1 = dst.cols * dst.elemSize();
+    int src1_step = (int) src1.step;
+    int src2_step = (int) src2.step;
+    int dst_step  = (int) dst.step;
+    float alpha_f = alpha, beta_f = beta, gama_f = gama;
    std::vector<std::pair<size_t , const void *> > args;
    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src1.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.step ));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1_step ));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.offset));
    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src2.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2.step ));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2_step ));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2.offset));

    if(src1.clCxt -> impl -> double_support != 0)
@@ -2138,14 +2142,13 @@ void cv::ocl::addWeighted(const oclMat &src1, double alpha, const oclMat &src2,
    }
    else
    {
-        float alpha_f = alpha, beta_f = beta, gama_f = gama;
        args.push_back( std::make_pair( sizeof(cl_float), (void *)&alpha_f ));
        args.push_back( std::make_pair( sizeof(cl_float), (void *)&beta_f ));
        args.push_back( std::make_pair( sizeof(cl_float), (void *)&gama_f ));
    }

    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step ));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step ));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.offset));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.rows ));
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols ));
--- a/modules/ocl/src/blend.cpp
+++ b/modules/ocl/src/blend.cpp
@@ -73,7 +73,7 @@ void cv::ocl::blendLinear(const oclMat &img1, const oclMat &img2, const oclMat &
    size_t localSize[] = {256, 1, 1};

    std::vector< std::pair<size_t, const void *> > args;
-
+    result.create(img1.size(), CV_MAKE_TYPE(depth,img1.channels()));
    if(globalSize[0] != 0)
    {
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&result.data ));
--- a/modules/ocl/src/brute_force_matcher.cpp
+++ b/modules/ocl/src/brute_force_matcher.cpp
@@ -60,7 +60,7 @@ namespace cv
 }

 template < int BLOCK_SIZE, int MAX_DESC_LEN,  typename T/*, typename Mask*/ >
-void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat &mask,
+void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
                         const oclMat &trainIdx, const oclMat &distance, int distType)
 {
    cv::ocl::Context *ctx = query.clCxt;
@@ -75,7 +75,7 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat
    {
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&query.data ));
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&train.data ));
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mask.data ));
+        //args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mask.data ));
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&distance.data ));
        args.push_back( std::make_pair( smemSize, (void *)NULL));
@@ -101,7 +101,7 @@ void matchUnrolledCached(const oclMat /*query*/, const oclMat * /*trains*/, int
 }

 template < int BLOCK_SIZE,  typename T/*, typename Mask*/ >
-void match(const oclMat &query, const oclMat &train, const oclMat &mask,
+void match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
           const oclMat &trainIdx, const oclMat &distance, int distType)
 {
    cv::ocl::Context *ctx = query.clCxt;
@@ -115,7 +115,7 @@ void match(const oclMat &query, const oclMat &train, const oclMat &mask,
    {
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&query.data ));
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&train.data ));
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mask.data ));
+        //args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mask.data ));
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&distance.data ));
        args.push_back( std::make_pair( smemSize, (void *)NULL));
@@ -141,7 +141,7 @@ void match(const oclMat /*query*/, const oclMat * /*trains*/, int /*n*/, const o

 //radius_matchUnrolledCached
 template < int BLOCK_SIZE, int MAX_DESC_LEN,  typename T/*, typename Mask*/ >
-void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &mask,
+void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &/*mask*/,
                         const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches, int distType)
 {
    cv::ocl::Context *ctx = query.clCxt;
@@ -157,7 +157,7 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDist
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&query.data ));
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&train.data ));
        args.push_back( std::make_pair( sizeof(cl_float), (void *)&maxDistance ));
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mask.data ));
+        //args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mask.data ));
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&distance.data ));
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&nMatches.data ));
@@ -181,7 +181,7 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDist

 //radius_match
 template < int BLOCK_SIZE, typename T/*, typename Mask*/ >
-void radius_match(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &mask,
+void radius_match(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &/*mask*/,
                  const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches, int distType)
 {
    cv::ocl::Context *ctx = query.clCxt;
@@ -196,7 +196,7 @@ void radius_match(const oclMat &query, const oclMat &train, float maxDistance, c
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&query.data ));
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&train.data ));
        args.push_back( std::make_pair( sizeof(cl_float), (void *)&maxDistance ));
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mask.data ));
+        //args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mask.data ));
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&distance.data ));
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&nMatches.data ));
@@ -470,7 +470,7 @@ void matchDispatcher(const oclMat &query, const oclMat &train, int n, float maxD

 //knn match Dispatcher
 template < int BLOCK_SIZE, int MAX_DESC_LEN,  typename T/*, typename Mask*/ >
-void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat &mask,
+void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
                             const oclMat &trainIdx, const oclMat &distance, int distType)
 {
    cv::ocl::Context *ctx = query.clCxt;
@@ -485,7 +485,7 @@ void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const ocl
    {
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&query.data ));
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&train.data ));
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mask.data ));
+        //args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mask.data ));
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&distance.data ));
        args.push_back( std::make_pair( smemSize, (void *)NULL));
@@ -505,7 +505,7 @@ void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const ocl
 }

 template < int BLOCK_SIZE,  typename T/*, typename Mask*/ >
-void knn_match(const oclMat &query, const oclMat &train, const oclMat &mask,
+void knn_match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
               const oclMat &trainIdx, const oclMat &distance, int distType)
 {
    cv::ocl::Context *ctx = query.clCxt;
@@ -519,7 +519,7 @@ void knn_match(const oclMat &query, const oclMat &train, const oclMat &mask,
    {
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&query.data ));
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&train.data ));
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mask.data ));
+        //args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mask.data ));
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&distance.data ));
        args.push_back( std::make_pair( smemSize, (void *)NULL));
@@ -538,7 +538,7 @@ void knn_match(const oclMat &query, const oclMat &train, const oclMat &mask,
 }

 template < int BLOCK_SIZE, int MAX_DESC_LEN, typename T/*, typename Mask*/ >
-void calcDistanceUnrolled(const oclMat &query, const oclMat &train, const oclMat &mask, const oclMat &allDist, int distType)
+void calcDistanceUnrolled(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, const oclMat &allDist, int distType)
 {
    cv::ocl::Context *ctx = query.clCxt;
    size_t globalSize[] = {(query.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1};
@@ -552,7 +552,7 @@ void calcDistanceUnrolled(const oclMat &query, const oclMat &train, const oclMat
    {
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&query.data ));
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&train.data ));
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mask.data ));
+        //args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mask.data ));
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&allDist.data ));
        args.push_back( std::make_pair( smemSize, (void *)NULL));
        args.push_back( std::make_pair( sizeof(cl_int), (void *)&block_size ));
@@ -571,7 +571,7 @@ void calcDistanceUnrolled(const oclMat &query, const oclMat &train, const oclMat
 }

 template < int BLOCK_SIZE, typename T/*, typename Mask*/ >
-void calcDistance(const oclMat &query, const oclMat &train, const oclMat &mask, const oclMat &allDist, int distType)
+void calcDistance(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, const oclMat &allDist, int distType)
 {
    cv::ocl::Context *ctx = query.clCxt;
    size_t globalSize[] = {(query.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1};
@@ -584,7 +584,7 @@ void calcDistance(const oclMat &query, const oclMat &train, const oclMat &mask,
    {
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&query.data ));
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&train.data ));
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mask.data ));
+        //args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mask.data ));
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&allDist.data ));
        args.push_back( std::make_pair( smemSize, (void *)NULL));
        args.push_back( std::make_pair( sizeof(cl_int), (void *)&block_size ));
@@ -1005,6 +1005,7 @@ void cv::ocl::BruteForceMatcher_OCL_base::matchConvert(const Mat &trainIdx, cons

 void cv::ocl::BruteForceMatcher_OCL_base::match(const oclMat &query, const oclMat &train, std::vector<DMatch> &matches, const oclMat &mask)
 {
+    CV_Assert(mask.empty()); // mask is not supported at the moment; CV_Assert also fires in release builds
    oclMat trainIdx, distance;
    matchSingle(query, train, trainIdx, distance, mask);
    matchDownload(trainIdx, distance, matches);
@@ -1448,7 +1449,7 @@ void cv::ocl::BruteForceMatcher_OCL_base::knnMatch(const oclMat &query, std::vec

 // radiusMatchSingle
 void cv::ocl::BruteForceMatcher_OCL_base::radiusMatchSingle(const oclMat &query, const oclMat &train,
-        oclMat &trainIdx,	oclMat &distance, oclMat &nMatches, float maxDistance, const oclMat &mask)
+        oclMat &trainIdx,   oclMat &distance, oclMat &nMatches, float maxDistance, const oclMat &mask)
 {
    if (query.empty() || train.empty())
        return;
@@ -1694,4 +1695,4 @@ void cv::ocl::BruteForceMatcher_OCL_base::radiusMatch(const oclMat &query, std::
    oclMat trainIdx, imgIdx, distance, nMatches;
    radiusMatchCollection(query, trainIdx, imgIdx, distance, nMatches, maxDistance, masks);
    radiusMatchDownload(trainIdx, imgIdx, distance, nMatches, matches, compactResult);
-}
+}
--- a/modules/ocl/src/imgproc.cpp
+++ b/modules/ocl/src/imgproc.cpp
@@ -288,13 +288,14 @@ namespace cv
                args.push_back( std::make_pair(sizeof(cl_int), (void *)&map1.cols));
                args.push_back( std::make_pair(sizeof(cl_int), (void *)&map1.rows));
                args.push_back( std::make_pair(sizeof(cl_int), (void *)&cols));
-                if(src.clCxt -> impl -> double_support != 0)
+                float borderFloat[4] = {(float)borderValue[0], (float)borderValue[1], (float)borderValue[2], (float)borderValue[3]};
+
+                if(src.clCxt -> impl -> double_support != 0)
                {
                    args.push_back( std::make_pair(sizeof(cl_double4), (void *)&borderValue));
                }
                else
                {
-                    float borderFloat[4] = {(float)borderValue[0], (float)borderValue[1], (float)borderValue[2], (float)borderValue[3]};
                    args.push_back( std::make_pair(sizeof(cl_float4), (void *)&borderFloat));
                }
            }
--- a/modules/ocl/src/kernels/brute_force_match.cl
+++ b/modules/ocl/src/kernels/brute_force_match.cl
@@ -5,11 +5,13 @@ int bit1Count(float x)
 {
    int c = 0;
    int ix = (int)x;
+
    for (int i = 0 ; i < 32 ; i++)
    {
        c += ix & 0x1;
        ix >>= 1;
    }
+
    return (float)c;
 }
 /* 2dim launch, global size: dim0 is (query rows + block_size - 1) / block_size * block_size, dim1 is block_size
@@ -18,7 +20,7 @@ local size: dim0 is block_size, dim1 is block_size.
 __kernel void BruteForceMatch_UnrollMatch(
    __global float *query,
    __global float *train,
-    __global float *mask,
+    //__global float *mask,
    __global int *bestTrainIdx,
    __global float *bestDistance,
    __local float *sharebuffer,
@@ -30,7 +32,7 @@ __kernel void BruteForceMatch_UnrollMatch(
    int train_cols,
    int step,
    int distType
-    )
+)
 {
    const int lidx = get_local_id(0);
    const int lidy = get_local_id(1);
@@ -40,6 +42,7 @@ __kernel void BruteForceMatch_UnrollMatch(
    __local float *s_train = sharebuffer + block_size * max_desc_len;

    int queryIdx = groupidx * block_size + lidy;
+
    // load the query into local memory.
    for (int i = 0 ;  i <  max_desc_len / block_size; i ++)
    {
@@ -52,9 +55,11 @@ __kernel void BruteForceMatch_UnrollMatch(

    // loopUnrolledCached to find the best trainIdx and best distance.
    volatile int imgIdx = 0;
+
    for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
    {
        float result = 0;
+
        for (int i = 0 ; i < max_desc_len / block_size ; i++)
        {
            //load a block_size * block_size block into local train.
@@ -67,28 +72,34 @@ __kernel void BruteForceMatch_UnrollMatch(
            /* There are three distance types in the reducer: L1Dist sums abs(v1 - v2), L2Dist sums
            (v1 - v2) * (v1 - v2), and Hamming sums popc(v1 ^ v2), where popc counts the bits that are set to 1. */

-            switch(distType)
+            switch (distType)
            {
-            case 0:
-                for (int j = 0 ; j < block_size ; j++)
-                {
-                    result += fabs(s_query[lidy * max_desc_len + i * block_size + j] -  s_train[j * block_size + lidx]);
-                }
-                break;
-            case 1:
-                for (int j = 0 ; j < block_size ; j++)
-                {
-                    float qr = s_query[lidy * max_desc_len + i * block_size + j] -  s_train[j * block_size + lidx];
-                    result += qr * qr;
-                }
-                break;
-            case 2:
-                for (int j = 0 ; j < block_size ; j++)
-                {
-                    //result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
-                    result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
-                }
-                break;
+                case 0:
+
+                    for (int j = 0 ; j < block_size ; j++)
+                    {
+                        result += fabs(s_query[lidy * max_desc_len + i * block_size + j] -  s_train[j * block_size + lidx]);
+                    }
+
+                    break;
+                case 1:
+
+                    for (int j = 0 ; j < block_size ; j++)
+                    {
+                        float qr = s_query[lidy * max_desc_len + i * block_size + j] -  s_train[j * block_size + lidx];
+                        result += qr * qr;
+                    }
+
+                    break;
+                case 2:
+
+                    for (int j = 0 ; j < block_size ; j++)
+                    {
+                        //result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
+                        result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^(uint)s_train[j * block_size + lidx]);
+                    }
+
+                    break;
            }

            barrier(CLK_LOCAL_MEM_FENCE);
@@ -105,8 +116,8 @@ __kernel void BruteForceMatch_UnrollMatch(
    }

    barrier(CLK_LOCAL_MEM_FENCE);
-    __local float *s_distance = (__local float*)(sharebuffer);
-    __local int* s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);
+    __local float *s_distance = (__local float *)(sharebuffer);
+    __local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);

    //find BestMatch
    s_distance += lidy * block_size;
@@ -136,7 +147,7 @@ __kernel void BruteForceMatch_UnrollMatch(
 __kernel void BruteForceMatch_Match(
    __global float *query,
    __global float *train,
-    __global float *mask,
+    //__global float *mask,
    __global int *bestTrainIdx,
    __global float *bestDistance,
    __local float *sharebuffer,
@@ -147,7 +158,7 @@ __kernel void BruteForceMatch_Match(
    int train_cols,
    int step,
    int distType
-    )
+)
 {
    const int lidx = get_local_id(0);
    const int lidy = get_local_id(1);
@@ -166,6 +177,7 @@ __kernel void BruteForceMatch_Match(
    {
        //Dist dist;
        float result = 0;
+
        for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++)
        {
            const int loadx = lidx + i * block_size;
@@ -184,28 +196,34 @@ __kernel void BruteForceMatch_Match(
            /* There are three distance types in the reducer: L1Dist sums abs(v1 - v2), L2Dist sums
            (v1 - v2) * (v1 - v2), and Hamming sums popc(v1 ^ v2), where popc counts the bits that are set to 1. */

-            switch(distType)
+            switch (distType)
            {
-            case 0:
-                for (int j = 0 ; j < block_size ; j++)
-                {
-                    result += fabs(s_query[lidy * block_size + j] -  s_train[j * block_size + lidx]);
-                }
-                break;
-            case 1:
-                for (int j = 0 ; j < block_size ; j++)
-                {
-                    float qr = s_query[lidy * block_size + j] -  s_train[j * block_size + lidx];
-                    result += qr * qr;
-                }
-                break;
-            case 2:
-                for (int j = 0 ; j < block_size ; j++)
-                {
-                    //result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
-                    result += bit1Count((uint)s_query[lidy * block_size + j] ^ (uint)s_train[(uint)j * block_size + lidx]);
-                }
-                break;
+                case 0:
+
+                    for (int j = 0 ; j < block_size ; j++)
+                    {
+                        result += fabs(s_query[lidy * block_size + j] -  s_train[j * block_size + lidx]);
+                    }
+
+                    break;
+                case 1:
+
+                    for (int j = 0 ; j < block_size ; j++)
+                    {
+                        float qr = s_query[lidy * block_size + j] -  s_train[j * block_size + lidx];
+                        result += qr * qr;
+                    }
+
+                    break;
+                case 2:
+
+                    for (int j = 0 ; j < block_size ; j++)
+                    {
+                        //result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
+                        result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[(uint)j * block_size + lidx]);
+                    }
+
+                    break;
            }

            barrier(CLK_LOCAL_MEM_FENCE);
@@ -256,7 +274,7 @@ __kernel void BruteForceMatch_RadiusUnrollMatch(
    __global float *query,
    __global float *train,
    float maxDistance,
-    __global float *mask,
+    //__global float *mask,
    __global int *bestTrainIdx,
    __global float *bestDistance,
    __global int *nMatches,
@@ -271,7 +289,7 @@ __kernel void BruteForceMatch_RadiusUnrollMatch(
    int step,
    int ostep,
    int distType
-    )
+)
 {
    const int lidx = get_local_id(0);
    const int lidy = get_local_id(1);
@@ -285,6 +303,7 @@ __kernel void BruteForceMatch_RadiusUnrollMatch(
    __local float *s_train = sharebuffer + block_size * block_size;

    float result = 0;
+
    for (int i = 0 ; i < max_desc_len / block_size ; ++i)
    {
        //load a block_size * block_size block into local train.
@@ -299,27 +318,33 @@ __kernel void BruteForceMatch_RadiusUnrollMatch(
        /* There are three distance types in the reducer: L1Dist sums abs(v1 - v2), L2Dist sums
        (v1 - v2) * (v1 - v2), and Hamming sums popc(v1 ^ v2), where popc counts the bits that are set to 1. */

-        switch(distType)
+        switch (distType)
        {
-        case 0:
-            for (int j = 0 ; j < block_size ; ++j)
-            {
-                result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
-            }
-            break;
-        case 1:
-            for (int j = 0 ; j < block_size ; ++j)
-            {
-                float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
-                result += qr * qr;
-            }
-            break;
-        case 2:
-            for (int j = 0 ; j < block_size ; ++j)
-            {
-                result += bit1Count((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
-            }
-            break;
+            case 0:
+
+                for (int j = 0 ; j < block_size ; ++j)
+                {
+                    result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
+                }
+
+                break;
+            case 1:
+
+                for (int j = 0 ; j < block_size ; ++j)
+                {
+                    float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
+                    result += qr * qr;
+                }
+
+                break;
+            case 2:
+
+                for (int j = 0 ; j < block_size ; ++j)
+                {
+                    result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[j * block_size + lidx]);
+                }
+
+                break;
        }

        barrier(CLK_LOCAL_MEM_FENCE);
@@ -329,7 +354,7 @@ __kernel void BruteForceMatch_RadiusUnrollMatch(
    {
        unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/);

-        if(ind < bestTrainIdx_cols)
+        if (ind < bestTrainIdx_cols)
        {
            //bestImgIdx = imgIdx;
            bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
@@ -343,7 +368,7 @@ __kernel void BruteForceMatch_RadiusMatch(
    __global float *query,
    __global float *train,
    float maxDistance,
-    __global float *mask,
+    //__global float *mask,
    __global int *bestTrainIdx,
    __global float *bestDistance,
    __global int *nMatches,
@@ -357,7 +382,7 @@ __kernel void BruteForceMatch_RadiusMatch(
    int step,
    int ostep,
    int distType
-    )
+)
 {
    const int lidx = get_local_id(0);
    const int lidy = get_local_id(1);
@@ -371,6 +396,7 @@ __kernel void BruteForceMatch_RadiusMatch(
    __local float *s_train = sharebuffer + block_size * block_size;

    float result = 0;
+
    for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; ++i)
    {
        //load a block_size * block_size block into local train.
@@ -385,27 +411,33 @@ __kernel void BruteForceMatch_RadiusMatch(
        /* There are three distance types in the reducer: L1Dist sums abs(v1 - v2), L2Dist sums
        (v1 - v2) * (v1 - v2), and Hamming sums popc(v1 ^ v2), where popc counts the bits that are set to 1. */

-        switch(distType)
+        switch (distType)
        {
-        case 0:
-            for (int j = 0 ; j < block_size ; ++j)
-            {
-                result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
-            }
-            break;
-        case 1:
-            for (int j = 0 ; j < block_size ; ++j)
-            {
-                float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
-                result += qr * qr;
-            }
-            break;
-        case 2:
-            for (int j = 0 ; j < block_size ; ++j)
-            {
-                result += bit1Count((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
-            }
-            break;
+            case 0:
+
+                for (int j = 0 ; j < block_size ; ++j)
+                {
+                    result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
+                }
+
+                break;
+            case 1:
+
+                for (int j = 0 ; j < block_size ; ++j)
+                {
+                    float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
+                    result += qr * qr;
+                }
+
+                break;
+            case 2:
+
+                for (int j = 0 ; j < block_size ; ++j)
+                {
+                    result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[j * block_size + lidx]);
+                }
+
+                break;
        }

        barrier(CLK_LOCAL_MEM_FENCE);
@@ -415,7 +447,7 @@ __kernel void BruteForceMatch_RadiusMatch(
    {
        unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/);

-        if(ind < bestTrainIdx_cols)
+        if (ind < bestTrainIdx_cols)
        {
            //bestImgIdx = imgIdx;
            bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
@@ -428,7 +460,7 @@ __kernel void BruteForceMatch_RadiusMatch(
 __kernel void BruteForceMatch_knnUnrollMatch(
    __global float *query,
    __global float *train,
-    __global float *mask,
+    //__global float *mask,
    __global int2 *bestTrainIdx,
    __global float2 *bestDistance,
    __local float *sharebuffer,
@@ -440,7 +472,7 @@ __kernel void BruteForceMatch_knnUnrollMatch(
    int train_cols,
    int step,
    int distType
-    )
+)
 {
    const int lidx = get_local_id(0);
    const int lidy = get_local_id(1);
@@ -464,9 +496,11 @@ __kernel void BruteForceMatch_knnUnrollMatch(

    //loopUnrolledCached
    volatile int imgIdx = 0;
+
    for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
    {
        float result = 0;
+
        for (int i = 0 ; i < max_desc_len / block_size ; i++)
        {
            const int loadX = lidx + i * block_size;
@@ -480,28 +514,34 @@ __kernel void BruteForceMatch_knnUnrollMatch(
            /* There are three distance types in the reducer: L1Dist sums abs(v1 - v2), L2Dist sums
            (v1 - v2) * (v1 - v2), and Hamming sums popc(v1 ^ v2), where popc counts the bits that are set to 1. */

-            switch(distType)
+            switch (distType)
            {
-            case 0:
-                for (int j = 0 ; j < block_size ; j++)
-                {
-                    result += fabs(s_query[lidy * max_desc_len + i * block_size + j] -  s_train[j * block_size + lidx]);
-                }
-                break;
-            case 1:
-                for (int j = 0 ; j < block_size ; j++)
-                {
-                    float qr = s_query[lidy * max_desc_len + i * block_size + j] -  s_train[j * block_size + lidx];
-                    result += qr * qr;
-                }
-                break;
-            case 2:
-                for (int j = 0 ; j < block_size ; j++)
-                {
-                    //result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
-                    result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
-                }
-                break;
+                case 0:
+
+                    for (int j = 0 ; j < block_size ; j++)
+                    {
+                        result += fabs(s_query[lidy * max_desc_len + i * block_size + j] -  s_train[j * block_size + lidx]);
+                    }
+
+                    break;
+                case 1:
+
+                    for (int j = 0 ; j < block_size ; j++)
+                    {
+                        float qr = s_query[lidy * max_desc_len + i * block_size + j] -  s_train[j * block_size + lidx];
+                        result += qr * qr;
+                    }
+
+                    break;
+                case 2:
+
+                    for (int j = 0 ; j < block_size ; j++)
+                    {
+                        //result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
+                        result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^(uint)s_train[j * block_size + lidx]);
+                    }
+
+                    break;
            }

            barrier(CLK_LOCAL_MEM_FENCE);
@@ -549,6 +589,7 @@ __kernel void BruteForceMatch_knnUnrollMatch(
        for (int i = 0 ; i < block_size ; i++)
        {
            float val = s_distance[i];
+
            if (val < bestDistance1)
            {
                bestDistance2 = bestDistance1;
@@ -602,7 +643,7 @@ __kernel void BruteForceMatch_knnUnrollMatch(
 __kernel void BruteForceMatch_knnMatch(
    __global float *query,
    __global float *train,
-    __global float *mask,
+    //__global float *mask,
    __global int2 *bestTrainIdx,
    __global float2 *bestDistance,
    __local float *sharebuffer,
@@ -613,7 +654,7 @@ __kernel void BruteForceMatch_knnMatch(
    int train_cols,
    int step,
    int distType
-    )
+)
 {
    const int lidx = get_local_id(0);
    const int lidy = get_local_id(1);
@@ -632,7 +673,8 @@ __kernel void BruteForceMatch_knnMatch(
    for (int  t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
    {
        float result = 0.0f;
-        for (int i = 0 ; i < (query_cols + block_size -1) / block_size ; i++)
+
+        for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++)
        {
            const int loadx = lidx + i * block_size;
            //load query and train into local memory
@@ -650,28 +692,34 @@ __kernel void BruteForceMatch_knnMatch(
            /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
            sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/

-            switch(distType)
+            switch (distType)
            {
-            case 0:
-                for (int j = 0 ; j < block_size ; j++)
-                {
-                    result += fabs(s_query[lidy * block_size + j] -  s_train[j * block_size + lidx]);
-                }
-                break;
-            case 1:
-                for (int j = 0 ; j < block_size ; j++)
-                {
-                    float qr = s_query[lidy * block_size + j] -  s_train[j * block_size + lidx];
-                    result += qr * qr;
-                }
-                break;
-            case 2:
-                for (int j = 0 ; j < block_size ; j++)
-                {
-                    //result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
-                    result += bit1Count((uint)s_query[lidy * block_size + j] ^ (uint)s_train[(uint)j * block_size + lidx]);
-                }
-                break;
+                case 0:
+
+                    for (int j = 0 ; j < block_size ; j++)
+                    {
+                        result += fabs(s_query[lidy * block_size + j] -  s_train[j * block_size + lidx]);
+                    }
+
+                    break;
+                case 1:
+
+                    for (int j = 0 ; j < block_size ; j++)
+                    {
+                        float qr = s_query[lidy * block_size + j] -  s_train[j * block_size + lidx];
+                        result += qr * qr;
+                    }
+
+                    break;
+                case 2:
+
+                    for (int j = 0 ; j < block_size ; j++)
+                    {
+                        //result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
+                        result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[(uint)j * block_size + lidx]);
+                    }
+
+                    break;
            }

            barrier(CLK_LOCAL_MEM_FENCE);
@@ -719,6 +767,7 @@ __kernel void BruteForceMatch_knnMatch(
        for (int i = 0 ; i < block_size ; i++)
        {
            float val = s_distance[i];
+
            if (val < bestDistance1)
            {
                bestDistance2 = bestDistance1;
@@ -772,7 +821,7 @@ __kernel void BruteForceMatch_knnMatch(
 kernel void BruteForceMatch_calcDistanceUnrolled(
    __global float *query,
    __global float *train,
-    __global float *mask,
+    //__global float *mask,
    __global float *allDist,
    __local float *sharebuffer,
    int block_size,
@@ -790,7 +839,7 @@ kernel void BruteForceMatch_calcDistanceUnrolled(
 kernel void BruteForceMatch_calcDistance(
    __global float *query,
    __global float *train,
-    __global float *mask,
+    //__global float *mask,
    __global float *allDist,
    __local float *sharebuffer,
    int block_size,
@@ -808,9 +857,9 @@ kernel void BruteForceMatch_findBestMatch(
    __global float *allDist,
    __global int *bestTrainIdx,
    __global float *bestDistance,
-     int k,
-     int block_size
-    )
+    int k,
+    int block_size
+)
 {
    /* Todo */
 }
--- a/modules/ocl/src/kernels/nonfree_surf.cl
+++ b/modules/ocl/src/kernels/nonfree_surf.cl
@@ -78,7 +78,7 @@ uchar read_imgTex(IMAGE_INT8 img, sampler_t sam, float2 coord, int rows, int col

 // dynamically change the precision used for floating type

-#if defined (__ATI__) || defined (__NVIDIA__)
+#if defined DOUBLE_SUPPORT
 #define F double
 #else
 #define F float
@@ -299,7 +299,7 @@ __kernel
    __global const float * det,
    __global const float * trace,
    __global int4 * maxPosBuffer,
-    volatile __global unsigned int* maxCounter,
+    volatile __global int* maxCounter,
    int counter_offset,
    int det_step,     // the step of det in bytes
    int trace_step,   // the step of trace in bytes
@@ -408,7 +408,7 @@ __kernel

                if(condmax)
                {
-                    unsigned int ind = atomic_inc(maxCounter);
+                    int ind = atomic_inc(maxCounter);

                    if (ind < c_max_candidates)
                    {
@@ -427,7 +427,7 @@ __kernel
    __global float * det,
    __global float * trace,
    __global int4 * maxPosBuffer,
-    volatile __global unsigned int* maxCounter,
+    volatile __global  int* maxCounter,
    int counter_offset,
    int det_step,     // the step of det in bytes
    int trace_step,   // the step of trace in bytes
@@ -525,7 +525,7 @@ __kernel

            if(condmax)
            {
-                unsigned int ind = atomic_inc(maxCounter);
+                 int ind = atomic_inc(maxCounter);

                if (ind < c_max_candidates)
                {
@@ -585,7 +585,7 @@ __kernel
    __global const float * det,
    __global const int4 * maxPosBuffer,
    __global float * keypoints,
-    volatile __global unsigned int * featureCounter,
+    volatile __global  int * featureCounter,
    int det_step,
    int keypoints_step,
    int c_img_rows,
@@ -684,7 +684,7 @@ __kernel
                if ((c_img_rows + 1) >= grad_wav_size && (c_img_cols + 1) >= grad_wav_size)
                {
                    // Get a new feature index.
-                    unsigned int ind = atomic_inc(featureCounter);
+                     int ind = atomic_inc(featureCounter);

                    if (ind < c_max_features)
                    {
@@ -737,19 +737,19 @@ __constant float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448
 __constant float c_NX[2][5] = {{0, 0, 2, 4, -1}, {2, 0, 4, 4, 1}};
 __constant float c_NY[2][5] = {{0, 0, 4, 2, 1}, {0, 2, 4, 4, -1}};

-void reduce_32_sum(volatile __local  float * data, float partial_reduction, int tid)
+void reduce_32_sum(volatile __local  float * data, volatile float* partial_reduction, int tid) // sums into data[tid] and *partial_reduction
 {
-#define op(A, B) (A)+(B)
-    data[tid] = partial_reduction;
+#define op(A, B) (*A)+(B)
+    data[tid] = *partial_reduction; // store the caller's partial value in this work-item's slot
     barrier(CLK_LOCAL_MEM_FENCE);

     if (tid < 16)
     {
-        data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);
-        data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]);
-        data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]);
-        data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]);
-        data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]);
+        data[tid] = *partial_reduction = op(partial_reduction, data[tid + 16]); // NOTE(review): no barrier between these steps - presumably relies on the work-items running in lockstep; confirm
+        data[tid] = *partial_reduction = op(partial_reduction, data[tid + 8 ]); // each step adds the slot at a halved offset
+        data[tid] = *partial_reduction = op(partial_reduction, data[tid + 4 ]);
+        data[tid] = *partial_reduction = op(partial_reduction, data[tid + 2 ]);
+        data[tid] = *partial_reduction = op(partial_reduction, data[tid + 1 ]); // result is also written back through the pointer
     }
 #undef op
 }
@@ -831,7 +831,7 @@ __kernel
    {
        const int dir = (i * 4 + get_local_id(1)) * ORI_SEARCH_INC;

-        float sumx = 0.0f, sumy = 0.0f;
+        volatile float sumx = 0.0f, sumy = 0.0f;
        int d = abs(convert_int_rte(s_angle[get_local_id(0)]) - dir);
        if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
        {
@@ -856,8 +856,8 @@ __kernel
            sumx += s_X[get_local_id(0) + 96];
            sumy += s_Y[get_local_id(0) + 96];
        }
-        reduce_32_sum(s_sumx + get_local_id(1) * 32, sumx, get_local_id(0));
-        reduce_32_sum(s_sumy + get_local_id(1) * 32, sumy, get_local_id(0));
+        reduce_32_sum(s_sumx + get_local_id(1) * 32, &sumx, get_local_id(0));
+        reduce_32_sum(s_sumy + get_local_id(1) * 32, &sumy, get_local_id(0));

        const float temp_mod = sumx * sumx + sumy * sumy;
        if (temp_mod > best_mod)
@@ -892,14 +892,32 @@ __kernel
            kp_dir += 2.0f * CV_PI_F;
        kp_dir *= 180.0f / CV_PI_F;

-        kp_dir = 360.0f - kp_dir;
-        if (fabs(kp_dir - 360.f) < FLT_EPSILON)
-            kp_dir = 0.f;
+        //kp_dir = 360.0f - kp_dir;
+        //if (fabs(kp_dir - 360.f) < FLT_EPSILON)
+        //    kp_dir = 0.f;

        featureDir[get_group_id(0)] = kp_dir;
    }
 }

+
+__kernel
+    void icvSetUpright(          // writes 90.0f into the ANGLE_ROW entry of the first nFeatures keypoint columns
+    __global float * keypoints,  // keypoint matrix; rows are addressed as row_index * keypoints_step
+    int keypoints_step,          // row step in bytes (converted to float elements below)
+    int nFeatures                // number of keypoint columns to fill
+    )
+{
+    keypoints_step /= sizeof(*keypoints); // bytes -> float elements
+    __global float* featureDir  = keypoints + ANGLE_ROW * keypoints_step;
+    const int idx = get_global_id(0); // signed index so the comparison with nFeatures is not mixed-sign
+    if(idx < nFeatures) // valid columns are 0 .. nFeatures-1; a work-item with idx >= nFeatures must not write
+    {
+        featureDir[idx] = 90.0f;
+    }
+}
+
+
 #undef ORI_SEARCH_INC
 #undef ORI_WIN
 #undef ORI_SAMPLES
@@ -993,10 +1011,7 @@ void calc_dx_dy(
    const float centerX = featureX[get_group_id(0)];
    const float centerY = featureY[get_group_id(0)];
    const float size = featureSize[get_group_id(0)];
-    float descriptor_dir = 360.0f - featureDir[get_group_id(0)];
-    if (fabs(descriptor_dir - 360.f) < FLT_EPSILON)
-        descriptor_dir = 0.f;
-    descriptor_dir *= (float)(CV_PI_F / 180.0f);
+    float descriptor_dir = featureDir[get_group_id(0)] * (float)(CV_PI_F / 180.0f);

    /* The sampling intervals and wavelet sized for selecting an orientation
    and building the keypoint descriptor are defined relative to 's' */
@@ -1125,11 +1140,15 @@ __kernel
    {
        sdxabs[tid] = fabs(sdx[tid]); // |dx| array
        sdyabs[tid] = fabs(sdy[tid]); // |dy| array
-        //barrier(CLK_LOCAL_MEM_FENCE);
-
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 25)
+    {
        reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid);
-        //barrier(CLK_LOCAL_MEM_FENCE);
-
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 25)
+    {
        volatile __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 2);

        // write dx, dy, |dx|, |dy|
--- a/modules/ocl/src/kernels/objdetect_hog.cl
+++ b/modules/ocl/src/kernels/objdetect_hog.cl
@@ -140,6 +140,10 @@ float reduce_smem(volatile __local float* smem, int size)
    if (tid < 32)
    {
        if (size >= 64) smem[tid] = sum = sum + smem[tid + 32];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 16)
+    {
        if (size >= 32) smem[tid] = sum = sum + smem[tid + 16];
        if (size >= 16) smem[tid] = sum = sum + smem[tid + 8];
        if (size >= 8) smem[tid] = sum = sum + smem[tid + 4];
@@ -224,6 +228,11 @@ __kernel void classify_hists_kernel(const int cblock_hist_size, const int cdescr
    {
        volatile __local float* smem = products;
        smem[tid] = product = product + smem[tid + 32];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 16)
+    {
+        volatile __local float* smem = products;
        smem[tid] = product = product + smem[tid + 16];
        smem[tid] = product = product + smem[tid + 8];
        smem[tid] = product = product + smem[tid + 4];
--- a/modules/ocl/src/surf.cpp
+++ b/modules/ocl/src/surf.cpp
@@ -56,6 +56,21 @@ namespace cv
    {
        ///////////////////////////OpenCL kernel strings///////////////////////////
        extern const char *nonfree_surf;
+
+        static const char* const noImage2dOption = "-D DISABLE_IMAGE2D"; // file-local and immutable: no external symbol is exported from cv::ocl
+        // Runs a SURF kernel; when support_image2d() is false, noImage2dOption is passed as the extra trailing argument.
+        static void openCLExecuteKernelSURF(Context *clCxt , const char **source, std::string kernelName, size_t globalThreads[3],
+            size_t localThreads[3], std::vector< std::pair<size_t, const void *> > &args, int channels, int depth)
+        {
+            if(support_image2d())
+            {
+                openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth);
+            }
+            else
+            {
+                openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth, noImage2dOption);
+            }
+        }
    }
 }

@@ -79,10 +94,6 @@ static inline int calcSize(int octave, int layer)
    return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave;
 }

-namespace
-{
-    const char* noImage2dOption = "-D DISABLE_IMAGE2D";
-}

 class SURF_OCL_Invoker
 {
@@ -99,15 +110,16 @@ public:
    void icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat &trace, oclMat &maxPosBuffer, oclMat &maxCounter, int counterOffset,
                                  int octave, bool use_mask, int nLayers, int layer_rows, int layer_cols);

-    void icvInterpolateKeypoint_gpu(const oclMat &det, const oclMat &maxPosBuffer, unsigned int maxCounter,
+    void icvInterpolateKeypoint_gpu(const oclMat &det, const oclMat &maxPosBuffer, int maxCounter,
                                    oclMat &keypoints, oclMat &counters, int octave, int layer_rows, int maxFeatures);

    void icvCalcOrientation_gpu(const oclMat &keypoints, int nFeatures);

+    void icvSetUpright_gpu(const oclMat &keypoints, int nFeatures);
+
    void compute_descriptors_gpu(const oclMat &descriptors, const oclMat &keypoints, int nFeatures);
    // end of kernel callers declarations

-
    SURF_OCL_Invoker(SURF_OCL &surf, const oclMat &img, const oclMat &mask) :
        surf_(surf),
        img_cols(img.cols), img_rows(img.rows),
@@ -181,8 +193,8 @@ public:
            icvFindMaximaInLayer_gpu(surf_.det, surf_.trace, surf_.maxPosBuffer, counters, 1 + octave,
                                     octave, use_mask, surf_.nOctaveLayers, layer_rows, layer_cols);

-            unsigned int maxCounter = Mat(counters).at<unsigned int>(1 + octave);
-            maxCounter = std::min(maxCounter, static_cast<unsigned int>(maxCandidates));
+            int maxCounter = ((Mat)counters).at<int>(1 + octave);
+            maxCounter = std::min(maxCounter, static_cast<int>(maxCandidates));

            if (maxCounter > 0)
            {
@@ -190,15 +202,29 @@ public:
                                           keypoints, counters, octave, layer_rows, maxFeatures);
            }
        }
-        unsigned int featureCounter = Mat(counters).at<unsigned int>(0);
-        featureCounter = std::min(featureCounter, static_cast<unsigned int>(maxFeatures));
+        int featureCounter = Mat(counters).at<int>(0);
+        featureCounter = std::min(featureCounter, static_cast<int>(maxFeatures));

        keypoints.cols = featureCounter;

        if (surf_.upright)
-            keypoints.row(SURF_OCL::ANGLE_ROW).setTo(Scalar::all(90.0));
+        {
+            //keypoints.row(SURF_OCL::ANGLE_ROW).setTo(Scalar::all(90.0));
+            setUpright(keypoints);
+        }
        else
+        {
            findOrientation(keypoints);
+        }
+    }
+
+    void setUpright(oclMat &keypoints)
+    {
+        // Skip the kernel call entirely when the matrix has no columns.
+        if(keypoints.cols <= 0)
+            return;
+        // One feature per column: hand the column count to the icvSetUpright caller.
+        icvSetUpright_gpu(keypoints, keypoints.cols);
     }

    void findOrientation(oclMat &keypoints)
@@ -483,14 +509,7 @@ void SURF_OCL_Invoker::icvCalcLayerDetAndTrace_gpu(oclMat &det, oclMat &trace, i
        divUp(max_samples_i, localThreads[1]) *localThreads[1] *(nOctaveLayers + 2),
        1
    };
-    if(support_image2d())
-    {
-    openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
-}
-    else
-    {
-        openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
-    }
+    openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
 }

 void SURF_OCL_Invoker::icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat &trace, oclMat &maxPosBuffer, oclMat &maxCounter, int counterOffset,
@@ -536,17 +555,10 @@ void SURF_OCL_Invoker::icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat
                               1
                              };

-    if(support_image2d())
-    {
-    openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
-}
-    else
-    {
-        openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
-    }
+    openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
 }

-void SURF_OCL_Invoker::icvInterpolateKeypoint_gpu(const oclMat &det, const oclMat &maxPosBuffer, unsigned int maxCounter,
+void SURF_OCL_Invoker::icvInterpolateKeypoint_gpu(const oclMat &det, const oclMat &maxPosBuffer, int maxCounter,
        oclMat &keypoints, oclMat &counters, int octave, int layer_rows, int maxFeatures)
 {
    Context *clCxt = det.clCxt;
@@ -568,14 +580,7 @@ void SURF_OCL_Invoker::icvInterpolateKeypoint_gpu(const oclMat &det, const oclMa
    size_t localThreads[3]  = {3, 3, 3};
    size_t globalThreads[3] = {maxCounter *localThreads[0], localThreads[1], 1};

-    if(support_image2d())
-    {
-    openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
-}
-    else
-    {
-        openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
-    }
+    openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
 }

 void SURF_OCL_Invoker::icvCalcOrientation_gpu(const oclMat &keypoints, int nFeatures)
@@ -602,16 +607,27 @@ void SURF_OCL_Invoker::icvCalcOrientation_gpu(const oclMat &keypoints, int nFeat
    size_t localThreads[3]  = {32, 4, 1};
    size_t globalThreads[3] = {nFeatures *localThreads[0], localThreads[1], 1};

-    if(support_image2d())
-    {
-    openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
+    openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
 }
-    else
-    {
-        openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
-    }
+
+void SURF_OCL_Invoker::icvSetUpright_gpu(const oclMat &keypoints, int nFeatures)
+{
+    // Launches the "icvSetUpright" kernel on the keypoints matrix for nFeatures features.
+    if (nFeatures <= 0) return; // nothing to fill; also avoids requesting a zero or wrapped-around global size
+    Context *clCxt = counters.clCxt;
+    std::string kernelName = "icvSetUpright";
+    std::vector< std::pair<size_t, const void *> > args;
+    // Kernel arguments, in order: keypoints buffer, keypoints step, number of features.
+    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&keypoints.data));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&keypoints.step));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&nFeatures));
+    // Requested global size is nFeatures with a work-group size of 256; the cast avoids an int -> size_t narrowing in the brace initializer.
+    size_t localThreads[3]  = {256, 1, 1};
+    size_t globalThreads[3] = {static_cast<size_t>(nFeatures), 1, 1};
+    openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
 }

+
 void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const oclMat &keypoints, int nFeatures)
 {
    // compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D
@@ -647,14 +663,8 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const
        args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.rows));
        args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.cols));
        args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.step));
-        if(support_image2d())
-        {
-            openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
-        }
-        else
-        {
-            openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
-        }
+
+        openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);

        kernelName = "normalize_descriptors64";

@@ -667,14 +677,8 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const
        args.clear();
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&descriptors.data));
        args.push_back( std::make_pair( sizeof(cl_int), (void *)&descriptors.step));
-        if(support_image2d())
-        {
-        openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
-    }
-    else
-    {
-            openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
-        }
+
+        openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
    }
    else
    {
@@ -702,14 +706,8 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const
        args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.rows));
        args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.cols));
        args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.step));
-        if(support_image2d())
-        {
-            openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
-        }
-        else
-        {
-            openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
-        }
+
+        openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);

        kernelName = "normalize_descriptors128";

@@ -722,14 +720,8 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const
        args.clear();
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&descriptors.data));
        args.push_back( std::make_pair( sizeof(cl_int), (void *)&descriptors.step));
-        if(support_image2d())
-        {
-            openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
-        }
-        else
-        {
-            openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
-        }
+
+        openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
    }
 }