From 9b00c14fff6e9d9753d25c7c2578528bd2d1c54d Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Thu, 28 Feb 2013 16:19:16 +0400 Subject: [PATCH 1/8] moved documentation for CUDA version of softcascade --- modules/gpu/doc/object_detection.rst | 82 -------------------- modules/softcascade/doc/softcascade.rst | 3 +- modules/softcascade/doc/softcascade_cuda.rst | 62 +++++++++++++++ 3 files changed, 64 insertions(+), 83 deletions(-) create mode 100644 modules/softcascade/doc/softcascade_cuda.rst diff --git a/modules/gpu/doc/object_detection.rst b/modules/gpu/doc/object_detection.rst index 4afe8bb4a..fd0ac867e 100644 --- a/modules/gpu/doc/object_detection.rst +++ b/modules/gpu/doc/object_detection.rst @@ -199,88 +199,6 @@ Returns block descriptors computed for the whole image. The function is mainly used to learn the classifier. -Soft Cascade Classifier -========================== - -Soft Cascade Classifier for Object Detection ----------------------------------------------------------- - -Cascade detectors have been shown to operate extremely rapidly, with high accuracy, and have important applications in different spheres. The initial goal for this cascade implementation was the fast and accurate pedestrian detector but it also useful in general. Soft cascade is trained with AdaBoost. But instead of training sequence of stages, the soft cascade is trained as a one long stage of T weak classifiers. Soft cascade is formulated as follows: - -.. math:: - \texttt{H}(x) = \sum _{\texttt{t}=1..\texttt{T}} {\texttt{s}_t(x)} - -where :math:`\texttt{s}_t(x) = \alpha_t\texttt{h}_t(x)` are the set of thresholded weak classifiers selected during AdaBoost training scaled by the associated weights. Let - -.. math:: - \texttt{H}_t(x) = \sum _{\texttt{i}=1..\texttt{t}} {\texttt{s}_i(x)} - -be the partial sum of sample responses before :math:`t`-the weak classifier will be applied. The funtcion :math:`\texttt{H}_t(x)` of :math:`t` for sample :math:`x` named *sample trace*. -After each weak classifier evaluation, the sample trace at the point :math:`t` is compared with the rejection threshold :math:`r_t`. The sequence of :math:`r_t` named *rejection trace*. - -The sample has been rejected if it fall rejection threshold. So stageless cascade allows to reject not-object sample as soon as possible. Another meaning of the sample trace is a confidence with that sample recognized as desired object. At each :math:`t` that confidence depend on all previous weak classifier. This feature of soft cascade is resulted in more accurate detection. The original formulation of soft cascade can be found in [BJ05]_. - -gpu::SCascade ------------------------------------------------ -.. ocv:class:: gpu::SCascade : public Algorithm - -Implementation of soft (stageless) cascaded detector. :: - - class CV_EXPORTS SCascade : public Algorithm - { - struct CV_EXPORTS Detection - { - ushort x; - ushort y; - ushort w; - ushort h; - float confidence; - int kind; - - enum {PEDESTRIAN = 0}; - }; - - SCascade(const double minScale = 0.4, const double maxScale = 5., const int scales = 55, const int rejfactor = 1); - virtual ~SCascade(); - virtual bool load(const FileNode& fn); - virtual void detect(InputArray image, InputArray rois, OutputArray objects, Stream& stream = Stream::Null()) const; - virtual void genRoi(InputArray roi, OutputArray mask, Stream& stream = Stream::Null()) const; - }; - - -gpu::SCascade::~SCascade ---------------------------- -Destructor for SCascade. - -.. 
ocv:function:: gpu::SCascade::~SCascade() - - - -gpu::SCascade::load --------------------------- -Load cascade from FileNode. - -.. ocv:function:: bool gpu::SCascade::load(const FileNode& fn) - - :param fn: File node from which the soft cascade are read. - - - -gpu::SCascade::detect --------------------------- -Apply cascade to an input frame and return the vector of Decection objcts. - -.. ocv:function:: void gpu::SCascade::detect(InputArray image, InputArray rois, OutputArray objects, Stream& stream = Stream::Null()) const - - :param image: a frame on which detector will be applied. - - :param rois: a regions of interests mask generated by genRoi. Only the objects that fall into one of the regions will be returned. - - :param objects: an output array of Detections represented as GpuMat of detections (SCascade::Detection). The first element of the matrix is actually a count of detections. - - :param stream: a high-level CUDA stream abstraction used for asynchronous execution. - - gpu::CascadeClassifier_GPU -------------------------- .. ocv:class:: gpu::CascadeClassifier_GPU diff --git a/modules/softcascade/doc/softcascade.rst b/modules/softcascade/doc/softcascade.rst index 66c93d1fc..a8e041732 100644 --- a/modules/softcascade/doc/softcascade.rst +++ b/modules/softcascade/doc/softcascade.rst @@ -8,4 +8,5 @@ softcascade. Soft Cascade object detection and training. :maxdepth: 2 softcascade_detector - softcascade_training \ No newline at end of file + softcascade_training + softcascade_cuda \ No newline at end of file diff --git a/modules/softcascade/doc/softcascade_cuda.rst b/modules/softcascade/doc/softcascade_cuda.rst new file mode 100644 index 000000000..504774898 --- /dev/null +++ b/modules/softcascade/doc/softcascade_cuda.rst @@ -0,0 +1,62 @@ +CUDA version of Soft Cascade Classifier +======================================== + +softcascade::SCascade +----------------------------------------------- +.. ocv:class:: softcascade::SCascade : public Algorithm + +Implementation of soft (stageless) cascaded detector. :: + + class CV_EXPORTS SCascade : public Algorithm + { + struct CV_EXPORTS Detection + { + ushort x; + ushort y; + ushort w; + ushort h; + float confidence; + int kind; + + enum {PEDESTRIAN = 0}; + }; + + SCascade(const double minScale = 0.4, const double maxScale = 5., const int scales = 55, const int rejfactor = 1); + virtual ~SCascade(); + virtual bool load(const FileNode& fn); + virtual void detect(InputArray image, InputArray rois, OutputArray objects, Stream& stream = Stream::Null()) const; + virtual void genRoi(InputArray roi, OutputArray mask, Stream& stream = Stream::Null()) const; + }; + + +softcascade::SCascade::~SCascade +--------------------------- +Destructor for SCascade. + +.. ocv:function:: gpu::SCascade::~SCascade() + + + +softcascade::SCascade::load +-------------------------- +Load cascade from FileNode. + +.. ocv:function:: bool gpu::SCascade::load(const FileNode& fn) + + :param fn: File node from which the soft cascade are read. + + + +softcascade::SCascade::detect +-------------------------- +Apply cascade to an input frame and return the vector of Decection objcts. + +.. ocv:function:: void gpu::SCascade::detect(InputArray image, InputArray rois, OutputArray objects, Stream& stream = Stream::Null()) const + + :param image: a frame on which detector will be applied. + + :param rois: a regions of interests mask generated by genRoi. Only the objects that fall into one of the regions will be returned. 
+ + :param objects: an output array of Detections represented as GpuMat of detections (SCascade::Detection). The first element of the matrix is actually a count of detections. + + :param stream: a high-level CUDA stream abstraction used for asynchronous execution. From 5120322cea5a0f00e1120780287f8eb89994b9e9 Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Sun, 3 Mar 2013 11:11:42 +0400 Subject: [PATCH 2/8] move gpu version of soft cascade to dedicated module --- apps/sft/CMakeLists.txt | 2 +- modules/gpu/include/opencv2/gpu/gpu.hpp | 5 +- modules/gpu/src/gpu_init.cpp | 59 ------ modules/softcascade/CMakeLists.txt | 51 ++++- .../include/opencv2/softcascade.hpp | 90 ++++++++ .../perf/perf_cuda_softcascade.cpp} | 40 ++-- .../{gpu => softcascade}/src/cuda/icf-sc.cu | 13 +- .../src/cuda_invoker.hpp} | 29 ++- .../src/detector_cuda.cpp} | 195 +++++++++--------- modules/softcascade/src/softcascade_init.cpp | 7 +- .../test/test_cuda_softcascade.cpp} | 71 +++---- modules/softcascade/test/test_main.cpp | 2 +- modules/softcascade/test/utility.cpp | 109 ++++++++++ modules/softcascade/test/utility.hpp | 73 +++++++ samples/gpu/CMakeLists.txt | 2 +- samples/gpu/softcascade.cpp | 5 +- 16 files changed, 504 insertions(+), 249 deletions(-) delete mode 100644 modules/gpu/src/gpu_init.cpp rename modules/{gpu/perf/perf_softcascade.cpp => softcascade/perf/perf_cuda_softcascade.cpp} (82%) rename modules/{gpu => softcascade}/src/cuda/icf-sc.cu (98%) rename modules/{gpu/src/icf.hpp => softcascade/src/cuda_invoker.hpp} (81%) rename modules/{gpu/src/softcascade.cpp => softcascade/src/detector_cuda.cpp} (74%) rename modules/{gpu/test/test_softcascade.cpp => softcascade/test/test_cuda_softcascade.cpp} (80%) create mode 100644 modules/softcascade/test/utility.cpp create mode 100644 modules/softcascade/test/utility.hpp diff --git a/apps/sft/CMakeLists.txt b/apps/sft/CMakeLists.txt index 8b950225c..c7bd187a2 100644 --- a/apps/sft/CMakeLists.txt +++ b/apps/sft/CMakeLists.txt @@ -1,7 +1,7 @@ set(name sft) set(the_target opencv_${name}) -set(OPENCV_${the_target}_DEPS opencv_core opencv_softcascade opencv_highgui opencv_imgproc opencv_ml) +set(OPENCV_${the_target}_DEPS opencv_core opencv_softcascade opencv_highgui opencv_imgproc opencv_ml OPTIONAL opencv_gpu opencv_objdetect opencv_featurest2d) ocv_check_dependencies(${OPENCV_${the_target}_DEPS}) if(NOT OCV_DEPENDENCIES_FOUND) diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp index d82211bf3..ee42816db 100644 --- a/modules/gpu/include/opencv2/gpu/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu/gpu.hpp @@ -7,12 +7,11 @@ // copy or use the software. // // -// License Agreement +// License Agreement // For Open Source Computer Vision Library // // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2009, Willow Garage Inc., all rights reserved. -// Copyright (C) 2013, OpenCV Foundation, all rights reserved. // Third party copyrights are property of their respective owners. // // Redistribution and use in source and binary forms, with or without modification, @@ -23,7 +22,7 @@ // // * Redistribution's in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. +// and/or other GpuMaterials provided with the distribution. 
// // * The name of the copyright holders may not be used to endorse or promote products // derived from this software without specific prior written permission. diff --git a/modules/gpu/src/gpu_init.cpp b/modules/gpu/src/gpu_init.cpp deleted file mode 100644 index 8ed93651a..000000000 --- a/modules/gpu/src/gpu_init.cpp +++ /dev/null @@ -1,59 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. -// Copyright (C) 2008-2012, Willow Garage Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. 
-// -//M*/ - -#include "precomp.hpp" - -namespace cv { namespace gpu -{ - -CV_INIT_ALGORITHM(SCascade, "CascadeDetector.SCascade", - obj.info()->addParam(obj, "minScale", obj.minScale); - obj.info()->addParam(obj, "maxScale", obj.maxScale); - obj.info()->addParam(obj, "scales", obj.scales)); - -bool initModule_gpu(void) -{ - Ptr sc = createSCascade(); - return sc->info() != 0; -} - -} } \ No newline at end of file diff --git a/modules/softcascade/CMakeLists.txt b/modules/softcascade/CMakeLists.txt index fb48814cf..f19241a4a 100644 --- a/modules/softcascade/CMakeLists.txt +++ b/modules/softcascade/CMakeLists.txt @@ -1,3 +1,50 @@ +macro(ocv_glob_cuda_powered_module_sources) + file(GLOB_RECURSE lib_srcs "src/*.cpp") + file(GLOB_RECURSE lib_int_hdrs "src/*.hpp" "src/*.h") + file(GLOB lib_hdrs "include/opencv2/${name}/*.hpp" "include/opencv2/${name}/*.h") + file(GLOB lib_hdrs_detail "include/opencv2/${name}/detail/*.hpp" "include/opencv2/${name}/detail/*.h") + + file(GLOB_RECURSE lib_device_srcs "src/*.cu") + set(device_objs "") + set(lib_device_hdrs "") + + if (HAVE_CUDA AND lib_device_srcs) + ocv_include_directories(${CUDA_INCLUDE_DIRS}) + file(GLOB_RECURSE lib_device_hdrs "src/cuda/*.hpp") + + ocv_cuda_compile(device_objs ${lib_device_srcs}) + source_group("Src\\Cuda" FILES ${lib_device_srcs} ${lib_device_hdrs}) + if (lib_device_hdrs) + list(REMOVE_ITEM lib_int_hdrs ${lib_device_hdrs}) + endif() + endif() + + ocv_set_module_sources(${ARGN} HEADERS ${lib_hdrs} ${lib_hdrs_detail} + SOURCES ${lib_srcs} ${lib_int_hdrs} ${device_objs} ${lib_device_srcs} ${lib_device_hdrs}) + + source_group("Src" FILES ${lib_srcs} ${lib_int_hdrs}) + source_group("Include" FILES ${lib_hdrs}) + source_group("Include\\detail" FILES ${lib_hdrs_detail}) +endmacro() + set(the_description "Soft Cascade detection and training") -ocv_define_module(softcascade opencv_core opencv_imgproc opencv_ml) -ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4310) \ No newline at end of file +ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4310 -Wundef -Wmissing-declarations) +set(cuda_deps "") +set(cuda_include "") + +if (NAVE_CUDA) + set(cuda_deps ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY}) +endif() +ocv_add_module(softcascade opencv_core opencv_imgproc opencv_ml OPTIONAL opencv_gpu ${cuda_deps}) + +if(HAVE_CUDA) + ocv_module_include_directories(${CUDA_INCLUDE_DIRS}) + ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef) +endif() + +ocv_glob_cuda_powered_module_sources() +ocv_create_module() +ocv_add_precompiled_headers(${the_module}) + +ocv_add_accuracy_tests() +ocv_add_perf_tests() diff --git a/modules/softcascade/include/opencv2/softcascade.hpp b/modules/softcascade/include/opencv2/softcascade.hpp index e0dbdf450..e97ac4d78 100644 --- a/modules/softcascade/include/opencv2/softcascade.hpp +++ b/modules/softcascade/include/opencv2/softcascade.hpp @@ -212,6 +212,96 @@ public: CV_EXPORTS bool initModule_softcascade(void); +// ======================== GPU version for soft cascade ===================== // + +class CV_EXPORTS ChannelsProcessor +{ +public: + enum + { + GENERIC = 1 << 4, + SEPARABLE = 2 << 4 + }; + + // Appends specified number of HOG first-order features integrals into given vector. + // Param frame is an input 3-channel bgr image. + // Param channels is a GPU matrix of optionally shrinked channels + // Param stream is stream is a high-level CUDA stream abstraction used for asynchronous execution. 
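+    // A minimal usage sketch follows as a comment; the shrinkage value of 4 is an assumed,
+    // typical setting (in the detector it is taken from the cascade XML), and bgrMat is a
+    // hypothetical 8UC3 host image:
+    //     cv::Ptr<ChannelsProcessor> proc = ChannelsProcessor::create(4, 6);
+    //     cv::gpu::GpuMat frame(bgrMat), channels;
+    //     proc->apply(frame, channels);   // channels now holds the shrunk HOG+LUV planes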
+ virtual void apply(InputArray frame, OutputArray channels, cv::gpu::Stream& stream = cv::gpu::Stream::Null()) = 0; + + // Creates a specific preprocessor implementation. + // Param shrinkage is a resizing factor. Resize is applied before the computing integral sum + // Param bins is a number of HOG-like channels. + // Param flags is a channel computing extra flags. + static cv::Ptr create(const int shrinkage, const int bins, const int flags = GENERIC); + + virtual ~ChannelsProcessor(); + +protected: + ChannelsProcessor(); +}; + +// Implementation of soft (stage-less) cascaded detector. +class CV_EXPORTS SCascade : public cv::Algorithm +{ +public: + + // Representation of detectors result. + struct CV_EXPORTS Detection + { + ushort x; + ushort y; + ushort w; + ushort h; + float confidence; + int kind; + + enum {PEDESTRIAN = 0}; + }; + + enum { NO_REJECT = 1, DOLLAR = 2, /*PASCAL = 4,*/ DEFAULT = NO_REJECT, NMS_MASK = 0xF}; + + // An empty cascade will be created. + // Param minScale is a minimum scale relative to the original size of the image on which cascade will be applied. + // Param minScale is a maximum scale relative to the original size of the image on which cascade will be applied. + // Param scales is a number of scales from minScale to maxScale. + // Param flags is an extra tuning flags. + SCascade(const double minScale = 0.4, const double maxScale = 5., const int scales = 55, + const int flags = NO_REJECT || ChannelsProcessor::GENERIC); + + virtual ~SCascade(); + + cv::AlgorithmInfo* info() const; + + // Load cascade from FileNode. + // Param fn is a root node for cascade. Should be . + virtual bool load(const FileNode& fn); + + // Load cascade config. + virtual void read(const FileNode& fn); + + // Return the matrix of of detected objects. + // Param image is a frame on which detector will be applied. + // Param rois is a regions of interests mask generated by genRoi. + // Only the objects that fall into one of the regions will be returned. + // Param objects is an output array of Detections represented as GpuMat of detections (SCascade::Detection) + // The first element of the matrix is actually a count of detections. 
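+    // (The bundled sample reads the result back as, for example:
+    //      cv::Mat dt(objects);
+    //      int count = *dt.ptr<int>(0);
+    //      const Detection* dets = ((const Detection*)dt.data) + 1; )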
+ // Param stream is stream is a high-level CUDA stream abstraction used for asynchronous execution + virtual void detect(InputArray image, InputArray rois, OutputArray objects, cv::gpu::Stream& stream = cv::gpu::Stream::Null()) const; + +private: + + struct Fields; + Fields* fields; + + double minScale; + double maxScale; + int scales; + + int flags; +}; + + }} // namespace cv { namespace softcascade { #endif \ No newline at end of file diff --git a/modules/gpu/perf/perf_softcascade.cpp b/modules/softcascade/perf/perf_cuda_softcascade.cpp similarity index 82% rename from modules/gpu/perf/perf_softcascade.cpp rename to modules/softcascade/perf/perf_cuda_softcascade.cpp index 6cb3c6356..86b7c7dd0 100644 --- a/modules/gpu/perf/perf_softcascade.cpp +++ b/modules/softcascade/perf/perf_cuda_softcascade.cpp @@ -1,5 +1,7 @@ #include "perf_precomp.hpp" +using std::tr1::get; + #define SC_PERF_TEST_P(fixture, name, params) \ class fixture##_##name : public fixture {\ public:\ @@ -25,8 +27,8 @@ void fixture##_##name::__cpu() { FAIL() << "No such CPU implementation analogy"; namespace { struct DetectionLess { - bool operator()(const cv::gpu::SCascade::Detection& a, - const cv::gpu::SCascade::Detection& b) const + bool operator()(const cv::softcascade::SCascade::Detection& a, + const cv::softcascade::SCascade::Detection& b) const { if (a.x != b.x) return a.x < b.x; else if (a.y != b.y) return a.y < b.y; @@ -39,7 +41,7 @@ namespace { { cv::Mat detections(objects); - typedef cv::gpu::SCascade::Detection Detection; + typedef cv::softcascade::SCascade::Detection Detection; Detection* begin = (Detection*)(detections.ptr(0)); Detection* end = (Detection*)(detections.ptr(0) + detections.cols); std::sort(begin, end, DetectionLess()); @@ -60,18 +62,18 @@ SC_PERF_TEST_P(SCascadeTest, detect, RUN_GPU(SCascadeTest, detect) { - cv::Mat cpu = readImage (GET_PARAM(1)); + cv::Mat cpu = cv::imread(getDataPath(get<1>(GetParam())));; ASSERT_FALSE(cpu.empty()); cv::gpu::GpuMat colored(cpu); - cv::gpu::SCascade cascade; + cv::softcascade::SCascade cascade; - cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ); + cv::FileStorage fs(getDataPath(get<0>(GetParam())), cv::FileStorage::READ); ASSERT_TRUE(fs.isOpened()); ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode())); - cv::gpu::GpuMat objectBoxes(1, 10000 * sizeof(cv::gpu::SCascade::Detection), CV_8UC1), rois(colored.size(), CV_8UC1); + cv::gpu::GpuMat objectBoxes(1, 10000 * sizeof(cv::softcascade::SCascade::Detection), CV_8UC1), rois(colored.size(), CV_8UC1); rois.setTo(1); cascade.detect(colored, rois, objectBoxes); @@ -118,13 +120,13 @@ SC_PERF_TEST_P(SCascadeTestRoi, detectInRoi, RUN_GPU(SCascadeTestRoi, detectInRoi) { - cv::Mat cpu = readImage (GET_PARAM(1)); + cv::Mat cpu = cv::imread(getDataPath(get<1>(GetParam()))); ASSERT_FALSE(cpu.empty()); cv::gpu::GpuMat colored(cpu); - cv::gpu::SCascade cascade; + cv::softcascade::SCascade cascade; - cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ); + cv::FileStorage fs(getDataPath(get<0>(GetParam())), cv::FileStorage::READ); ASSERT_TRUE(fs.isOpened()); ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode())); @@ -132,7 +134,7 @@ RUN_GPU(SCascadeTestRoi, detectInRoi) cv::gpu::GpuMat objectBoxes(1, 16384 * 20, CV_8UC1), rois(colored.size(), CV_8UC1); rois.setTo(0); - int nroi = GET_PARAM(2); + int nroi = get<2>(GetParam()); cv::RNG rng; for (int i = 0; i < nroi; ++i) { @@ -163,13 +165,13 @@ SC_PERF_TEST_P(SCascadeTestRoi, detectEachRoi, RUN_GPU(SCascadeTestRoi, 
detectEachRoi) { - cv::Mat cpu = readImage (GET_PARAM(1)); + cv::Mat cpu = cv::imread(getDataPath(get<1>(GetParam()))); ASSERT_FALSE(cpu.empty()); cv::gpu::GpuMat colored(cpu); - cv::gpu::SCascade cascade; + cv::softcascade::SCascade cascade; - cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ); + cv::FileStorage fs(getDataPath(get<0>(GetParam())), cv::FileStorage::READ); ASSERT_TRUE(fs.isOpened()); ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode())); @@ -177,7 +179,7 @@ RUN_GPU(SCascadeTestRoi, detectEachRoi) cv::gpu::GpuMat objectBoxes(1, 16384 * 20, CV_8UC1), rois(colored.size(), CV_8UC1); rois.setTo(0); - int idx = GET_PARAM(2); + int idx = get<2>(GetParam()); cv::Rect r = getFromTable(idx); cv::gpu::GpuMat sub(rois, r); sub.setTo(1); @@ -202,18 +204,18 @@ SC_PERF_TEST_P(SCascadeTest, detectStream, RUN_GPU(SCascadeTest, detectStream) { - cv::Mat cpu = readImage (GET_PARAM(1)); + cv::Mat cpu = cv::imread(getDataPath(get<1>(GetParam()))); ASSERT_FALSE(cpu.empty()); cv::gpu::GpuMat colored(cpu); - cv::gpu::SCascade cascade; + cv::softcascade::SCascade cascade; - cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ); + cv::FileStorage fs(getDataPath(get<0>(GetParam())), cv::FileStorage::READ); ASSERT_TRUE(fs.isOpened()); ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode())); - cv::gpu::GpuMat objectBoxes(1, 10000 * sizeof(cv::gpu::SCascade::Detection), CV_8UC1), rois(colored.size(), CV_8UC1); + cv::gpu::GpuMat objectBoxes(1, 10000 * sizeof(cv::softcascade::SCascade::Detection), CV_8UC1), rois(colored.size(), CV_8UC1); rois.setTo(1); cv::gpu::Stream s; diff --git a/modules/gpu/src/cuda/icf-sc.cu b/modules/softcascade/src/cuda/icf-sc.cu similarity index 98% rename from modules/gpu/src/cuda/icf-sc.cu rename to modules/softcascade/src/cuda/icf-sc.cu index f6eb74422..d339ef0d3 100644 --- a/modules/gpu/src/cuda/icf-sc.cu +++ b/modules/softcascade/src/cuda/icf-sc.cu @@ -43,12 +43,11 @@ #include #include -#include +#include #include #include -namespace cv { namespace gpu { namespace device { -namespace icf { +namespace cv { namespace softcascade { namespace device { template __device__ __forceinline__ uchar shrink(const uchar* ptr, const int pitch, const int y, const int x) @@ -303,7 +302,7 @@ namespace icf { excluded = excluded || (suppessed == i); } - #if __CUDA_ARCH__ >= 120 + #if defined __CUDA_ARCH__ && (__CUDA_ARCH__ >= 120) if (__all(excluded)) break; #endif } @@ -348,7 +347,7 @@ namespace icf { template struct PrefixSum { - __device static void apply(float& impact) + __device_inline__ static void apply(float& impact) { #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300 #pragma unroll @@ -442,6 +441,7 @@ namespace icf { { x += area.x; y += area.y; + int a = tex2D(thogluv, x, y); int b = tex2D(thogluv, x + area.z, y); int c = tex2D(thogluv, x + area.z, y + area.w); @@ -454,7 +454,7 @@ namespace icf { template template -__device void CascadeInvoker::detect(Detection* objects, const uint ndetections, uint* ctr, const int downscales) const +__device_inline__ void CascadeInvoker::detect(Detection* objects, const uint ndetections, uint* ctr, const int downscales) const { const int y = blockIdx.y * blockDim.y + threadIdx.y; const int x = blockIdx.x; @@ -563,5 +563,4 @@ void CascadeInvoker::operator()(const PtrStepSzb& roi, const PtrStepSzi& template void CascadeInvoker::operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv, PtrStepSz objects, const int downscales, const cudaStream_t& stream) const; -} }}} diff --git 
a/modules/gpu/src/icf.hpp b/modules/softcascade/src/cuda_invoker.hpp similarity index 81% rename from modules/gpu/src/icf.hpp rename to modules/softcascade/src/cuda_invoker.hpp index e4e3f9416..958850f0c 100644 --- a/modules/gpu/src/icf.hpp +++ b/modules/softcascade/src/cuda_invoker.hpp @@ -22,7 +22,7 @@ // // * Redistribution's in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. +// and / or other materials provided with the distribution. // // * The name of the copyright holders may not be used to endorse or promote products // derived from this software without specific prior written permission. @@ -46,15 +46,16 @@ #include +using namespace cv::gpu::device; + #if defined __CUDACC__ -# define __device __device__ __forceinline__ +# define __device_inline__ __device__ __forceinline__ #else -# define __device +# define __device_inline__ #endif -namespace cv { namespace gpu { namespace device { -namespace icf { +namespace cv { namespace softcascade { namespace device { struct Octave { @@ -68,20 +69,19 @@ struct Octave : index(i), stages(s), shrinkage(sh), size(sz), scale(sc) {} }; -struct Level //is actually 24 bytes +struct Level { int octave; int step; float relScale; - float scaling[2]; // calculated according to Dollal paper + float scaling[2];// calculated according to Dollar paper - // for 640x480 we can not get overflow uchar2 workRect; uchar2 objSize; Level(int idx, const Octave& oct, const float scale, const int w, const int h); - __device Level(){} + __device_inline__ Level(){} }; struct Node @@ -106,7 +106,7 @@ struct Detection int kind; Detection(){} - __device Detection(int _x, int _y, uchar _w, uchar _h, float c) + __device_inline__ Detection(int _x, int _y, uchar _w, uchar _h, float c) : x(_x), y(_y), w(_w), h(_h), confidence(c), kind(0) {}; }; @@ -125,8 +125,8 @@ struct CascadeInvoker { CascadeInvoker(): levels(0), stages(0), nodes(0), leaves(0), scales(0) {} - CascadeInvoker(const PtrStepSzb& _levels, const PtrStepSzf& _stages, - const PtrStepSzb& _nodes, const PtrStepSzf& _leaves) + CascadeInvoker(const cv::gpu::PtrStepSzb& _levels, const cv::gpu::PtrStepSzf& _stages, + const cv::gpu::PtrStepSzb& _nodes, const cv::gpu::PtrStepSzf& _leaves) : levels((const Level*)_levels.ptr()), stages((const float*)_stages.ptr()), nodes((const Node*)_nodes.ptr()), leaves((const float*)_leaves.ptr()), @@ -141,14 +141,13 @@ struct CascadeInvoker int scales; - void operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv, PtrStepSz objects, + void operator()(const cv::gpu::PtrStepSzb& roi, const cv::gpu::PtrStepSzi& hogluv, cv::gpu::PtrStepSz objects, const int downscales, const cudaStream_t& stream = 0) const; template - __device void detect(Detection* objects, const unsigned int ndetections, unsigned int* ctr, const int downscales) const; + __device_inline__ void detect(Detection* objects, const unsigned int ndetections, unsigned int* ctr, const int downscales) const; }; -} }}} #endif \ No newline at end of file diff --git a/modules/gpu/src/softcascade.cpp b/modules/softcascade/src/detector_cuda.cpp similarity index 74% rename from modules/gpu/src/softcascade.cpp rename to modules/softcascade/src/detector_cuda.cpp index 5abcd6308..6c920332f 100644 --- a/modules/gpu/src/softcascade.cpp +++ b/modules/softcascade/src/detector_cuda.cpp @@ -41,17 +41,18 @@ //M*/ #include "precomp.hpp" +#include "opencv2/gpu/stream_accessor.hpp" #if !defined (HAVE_CUDA) 
-cv::gpu::SCascade::SCascade(const double, const double, const int, const int) { throw_nogpu(); } +cv::softcascade::SCascade::SCascade(const double, const double, const int, const int) { throw_nogpu(); } -cv::gpu::SCascade::~SCascade() { throw_nogpu(); } +cv::softcascade::SCascade::~SCascade() { throw_nogpu(); } -bool cv::gpu::SCascade::load(const FileNode&) { throw_nogpu(); return false;} +bool cv::softcascade::SCascade::load(const FileNode&) { throw_nogpu(); return false;} -void cv::gpu::SCascade::detect(InputArray, InputArray, OutputArray, Stream&) const { throw_nogpu(); } +void cv::softcascade::SCascade::detect(InputArray, InputArray, OutputArray, cv::gpu::Stream&) const { throw_nogpu(); } -void cv::gpu::SCascade::read(const FileNode& fn) { Algorithm::read(fn); } +void cv::softcascade::SCascade::read(const FileNode& fn) { Algorithm::read(fn); } cv::gpu::ChannelsProcessor::ChannelsProcessor() { throw_nogpu(); } cv::gpu::ChannelsProcessor::~ChannelsProcessor() { throw_nogpu(); } @@ -60,9 +61,9 @@ cv::Ptr cv::gpu::ChannelsProcessor::create(const int { throw_nogpu(); return cv::Ptr(0); } #else -# include "icf.hpp" +# include "cuda_invoker.hpp" -cv::gpu::device::icf::Level::Level(int idx, const Octave& oct, const float scale, const int w, const int h) +cv::softcascade::device::Level::Level(int idx, const Octave& oct, const float scale, const int w, const int h) : octave(idx), step(oct.stages), relScale(scale / oct.scale) { workRect.x = cvRound(w / (float)oct.shrinkage); @@ -81,23 +82,20 @@ cv::gpu::device::icf::Level::Level(int idx, const Octave& oct, const float scale } } -namespace cv { namespace gpu { namespace device { +namespace cv { namespace softcascade { namespace device { -namespace icf { void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle, const int fw, const int fh, const int bins, cudaStream_t stream); - void suppress(const PtrStepSzb& objects, PtrStepSzb overlaps, PtrStepSzi ndetections, - PtrStepSzb suppressed, cudaStream_t stream); + void suppress(const cv::gpu::PtrStepSzb& objects, cv::gpu::PtrStepSzb overlaps, cv::gpu::PtrStepSzi ndetections, + cv::gpu::PtrStepSzb suppressed, cudaStream_t stream); - void bgr2Luv(const PtrStepSzb& bgr, PtrStepSzb luv); - void gray2hog(const PtrStepSzb& gray, PtrStepSzb mag, const int bins); + void bgr2Luv(const cv::gpu::PtrStepSzb& bgr, cv::gpu::PtrStepSzb luv); + void gray2hog(const cv::gpu::PtrStepSzb& gray, cv::gpu::PtrStepSzb mag, const int bins); void shrink(const cv::gpu::PtrStepSzb& channels, cv::gpu::PtrStepSzb shrunk); -} - }}} -struct cv::gpu::SCascade::Fields +struct cv::softcascade::SCascade::Fields { static Fields* parseCascade(const FileNode &root, const float mins, const float maxs, const int totals, const int method) { @@ -138,11 +136,9 @@ struct cv::gpu::SCascade::Fields FileNode fn = root[SC_OCTAVES]; if (fn.empty()) return 0; - using namespace device::icf; - - std::vector voctaves; + std::vector voctaves; std::vector vstages; - std::vector vnodes; + std::vector vnodes; std::vector vleaves; FileNodeIterator it = fn.begin(), it_end = fn.end(); @@ -158,7 +154,7 @@ struct cv::gpu::SCascade::Fields size.x = cvRound(origWidth * scale); size.y = cvRound(origHeight * scale); - Octave octave(octIndex, nweaks, shrinkage, size, scale); + device::Octave octave(octIndex, nweaks, shrinkage, size, scale); CV_Assert(octave.stages > 0); voctaves.push_back(octave); @@ -227,7 +223,7 @@ struct cv::gpu::SCascade::Fields rect.w = saturate_cast(r.height); unsigned int channel = saturate_cast(feature_channels[featureIdx]); 
- vnodes.push_back(Node(rect, channel, th)); + vnodes.push_back(device::Node(rect, channel, th)); } intfns = octfn[SC_LEAF]; @@ -239,13 +235,13 @@ struct cv::gpu::SCascade::Fields } } - cv::Mat hoctaves(1, (int) (voctaves.size() * sizeof(Octave)), CV_8UC1, (uchar*)&(voctaves[0])); + cv::Mat hoctaves(1, (int) (voctaves.size() * sizeof(device::Octave)), CV_8UC1, (uchar*)&(voctaves[0])); CV_Assert(!hoctaves.empty()); cv::Mat hstages(cv::Mat(vstages).reshape(1,1)); CV_Assert(!hstages.empty()); - cv::Mat hnodes(1, (int) (vnodes.size() * sizeof(Node)), CV_8UC1, (uchar*)&(vnodes[0]) ); + cv::Mat hnodes(1, (int) (vnodes.size() * sizeof(device::Node)), CV_8UC1, (uchar*)&(vnodes[0]) ); CV_Assert(!hnodes.empty()); cv::Mat hleaves(cv::Mat(vleaves).reshape(1,1)); @@ -272,8 +268,7 @@ struct cv::gpu::SCascade::Fields int createLevels(const int fh, const int fw) { - using namespace device::icf; - std::vector vlevels; + std::vector vlevels; float logFactor = (::log(maxScale) - ::log(minScale)) / (totals -1); float scale = minScale; @@ -286,7 +281,7 @@ struct cv::gpu::SCascade::Fields float logScale = ::log(scale); int fit = fitOctave(voctaves, logScale); - Level level(fit, voctaves[fit], scale, width, height); + device::Level level(fit, voctaves[fit], scale, width, height); if (!width || !height) break; @@ -300,7 +295,7 @@ struct cv::gpu::SCascade::Fields scale = ::std::min(maxScale, ::expf(::log(scale) + logFactor)); } - cv::Mat hlevels = cv::Mat(1, (int) (vlevels.size() * sizeof(Level)), CV_8UC1, (uchar*)&(vlevels[0]) ); + cv::Mat hlevels = cv::Mat(1, (int) (vlevels.size() * sizeof(device::Level)), CV_8UC1, (uchar*)&(vlevels[0]) ); CV_Assert(!hlevels.empty()); levels.upload(hlevels); downscales = dcs; @@ -334,7 +329,7 @@ struct cv::gpu::SCascade::Fields preprocessor = ChannelsProcessor::create(shrinkage, 6, method); } - void detect(cv::gpu::GpuMat& objects, Stream& s) const + void detect(cv::gpu::GpuMat& objects, cv::gpu::Stream& s) const { if (s) s.enqueueMemSet(objects, 0); @@ -343,16 +338,16 @@ struct cv::gpu::SCascade::Fields cudaSafeCall( cudaGetLastError()); - device::icf::CascadeInvoker invoker - = device::icf::CascadeInvoker(levels, stages, nodes, leaves); + device::CascadeInvoker invoker + = device::CascadeInvoker(levels, stages, nodes, leaves); - cudaStream_t stream = StreamAccessor::getStream(s); + cudaStream_t stream = cv::gpu::StreamAccessor::getStream(s); invoker(mask, hogluv, objects, downscales, stream); } - void suppress(GpuMat& objects, Stream& s) + void suppress(cv::gpu::GpuMat& objects, cv::gpu::Stream& s) { - GpuMat ndetections = GpuMat(objects, cv::Rect(0, 0, sizeof(Detection), 1)); + cv::gpu::GpuMat ndetections = cv::gpu::GpuMat(objects, cv::Rect(0, 0, sizeof(Detection), 1)); ensureSizeIsEnough(objects.rows, objects.cols, CV_8UC1, overlaps); if (s) @@ -366,20 +361,20 @@ struct cv::gpu::SCascade::Fields suppressed.setTo(0); } - cudaStream_t stream = StreamAccessor::getStream(s); - device::icf::suppress(objects, overlaps, ndetections, suppressed, stream); + cudaStream_t stream = cv::gpu::StreamAccessor::getStream(s); + device::suppress(objects, overlaps, ndetections, suppressed, stream); } private: - typedef std::vector::const_iterator octIt_t; - static int fitOctave(const std::vector& octs, const float& logFactor) + typedef std::vector::const_iterator octIt_t; + static int fitOctave(const std::vector& octs, const float& logFactor) { float minAbsLog = FLT_MAX; int res = 0; for (int oct = 0; oct < (int)octs.size(); ++oct) { - const device::icf::Octave& octave =octs[oct]; + const 
device::Octave& octave =octs[oct]; float logOctave = ::log(octave.scale); float logAbsScale = ::fabs(logFactor - logOctave); @@ -410,37 +405,37 @@ public: // 160x120x10 - GpuMat shrunk; + cv::gpu::GpuMat shrunk; // temporal mat for integral - GpuMat integralBuffer; + cv::gpu::GpuMat integralBuffer; // 161x121x10 - GpuMat hogluv; + cv::gpu::GpuMat hogluv; // used for suppression - GpuMat suppressed; + cv::gpu::GpuMat suppressed; // used for area overlap computing during - GpuMat overlaps; + cv::gpu::GpuMat overlaps; // Cascade from xml - GpuMat octaves; - GpuMat stages; - GpuMat nodes; - GpuMat leaves; - GpuMat levels; + cv::gpu::GpuMat octaves; + cv::gpu::GpuMat stages; + cv::gpu::GpuMat nodes; + cv::gpu::GpuMat leaves; + cv::gpu::GpuMat levels; // For ROI - GpuMat mask; - GpuMat genRoiTmp; + cv::gpu::GpuMat mask; + cv::gpu::GpuMat genRoiTmp; -// GpuMat collected; +// cv::gpu::GpuMat collected; - std::vector voctaves; + std::vector voctaves; // DeviceInfo info; @@ -453,19 +448,19 @@ public: }; }; -cv::gpu::SCascade::SCascade(const double mins, const double maxs, const int sc, const int fl) +cv::softcascade::SCascade::SCascade(const double mins, const double maxs, const int sc, const int fl) : fields(0), minScale(mins), maxScale(maxs), scales(sc), flags(fl) {} -cv::gpu::SCascade::~SCascade() { delete fields; } +cv::softcascade::SCascade::~SCascade() { delete fields; } -bool cv::gpu::SCascade::load(const FileNode& fn) +bool cv::softcascade::SCascade::load(const FileNode& fn) { if (fields) delete fields; fields = Fields::parseCascade(fn, (float)minScale, (float)maxScale, scales, flags); return fields != 0; } -void cv::gpu::SCascade::detect(InputArray _image, InputArray _rois, OutputArray _objects, Stream& s) const +void cv::softcascade::SCascade::detect(InputArray _image, InputArray _rois, OutputArray _objects, cv::gpu::Stream& s) const { CV_Assert(fields); @@ -473,11 +468,11 @@ void cv::gpu::SCascade::detect(InputArray _image, InputArray _rois, OutputArray int type = _image.type(); CV_Assert(type == CV_8UC3 || type == CV_32SC1 || (!_rois.empty())); - const GpuMat image = _image.getGpuMat(); + const cv::gpu::GpuMat image = _image.getGpuMat(); if (_objects.empty()) _objects.create(1, 4096 * sizeof(Detection), CV_8UC1); - GpuMat rois = _rois.getGpuMat(), objects = _objects.getGpuMat(); + cv::gpu::GpuMat rois = _rois.getGpuMat(), objects = _objects.getGpuMat(); /// roi Fields& flds = *fields; @@ -510,13 +505,13 @@ void cv::gpu::SCascade::detect(InputArray _image, InputArray _rois, OutputArray if ( (flags && NMS_MASK) != NO_REJECT) { - GpuMat spr(objects, cv::Rect(0, 0, flds.suppressed.cols, flds.suppressed.rows)); + cv::gpu::GpuMat spr(objects, cv::Rect(0, 0, flds.suppressed.cols, flds.suppressed.rows)); flds.suppress(objects, s); flds.suppressed.copyTo(spr); } } -void cv::gpu::SCascade::read(const FileNode& fn) +void cv::softcascade::SCascade::read(const FileNode& fn) { Algorithm::read(fn); } @@ -528,7 +523,7 @@ using cv::OutputArray; using cv::gpu::Stream; using cv::gpu::GpuMat; -inline void setZero(cv::gpu::GpuMat& m, Stream& s) +inline void setZero(cv::gpu::GpuMat& m, cv::gpu::Stream& s) { if (s) s.enqueueMemSet(m, 0); @@ -536,17 +531,17 @@ inline void setZero(cv::gpu::GpuMat& m, Stream& s) m.setTo(0); } -struct GenricPreprocessor : public cv::gpu::ChannelsProcessor +struct GenricPreprocessor : public cv::softcascade::ChannelsProcessor { - GenricPreprocessor(const int s, const int b) : cv::gpu::ChannelsProcessor(), shrinkage(s), bins(b) {} + GenricPreprocessor(const int s, const int b) : 
cv::softcascade::ChannelsProcessor(), shrinkage(s), bins(b) {} virtual ~GenricPreprocessor() {} - virtual void apply(InputArray _frame, OutputArray _shrunk, Stream& s = Stream::Null()) + virtual void apply(InputArray _frame, OutputArray _shrunk, cv::gpu::Stream& s = cv::gpu::Stream::Null()) { - const GpuMat frame = _frame.getGpuMat(); + const cv::gpu::GpuMat frame = _frame.getGpuMat(); _shrunk.create(frame.rows * (4 + bins) / shrinkage, frame.cols / shrinkage, CV_8UC1); - GpuMat shrunk = _shrunk.getGpuMat(); + cv::gpu::GpuMat shrunk = _shrunk.getGpuMat(); channels.create(frame.rows * (4 + bins), frame.cols, CV_8UC1); setZero(channels, s); @@ -561,53 +556,53 @@ struct GenricPreprocessor : public cv::gpu::ChannelsProcessor private: - void createHogBins(Stream& s) + void createHogBins(cv::gpu::Stream& s) { static const int fw = gray.cols; static const int fh = gray.rows; fplane.create(fh * HOG_BINS, fw, CV_32FC1); - GpuMat dfdx(fplane, cv::Rect(0, 0, fw, fh)); - GpuMat dfdy(fplane, cv::Rect(0, fh, fw, fh)); + cv::gpu::GpuMat dfdx(fplane, cv::Rect(0, 0, fw, fh)); + cv::gpu::GpuMat dfdy(fplane, cv::Rect(0, fh, fw, fh)); cv::gpu::Sobel(gray, dfdx, CV_32F, 1, 0, sobelBuf, 3, 1, cv::BORDER_DEFAULT, -1, s); cv::gpu::Sobel(gray, dfdy, CV_32F, 0, 1, sobelBuf, 3, 1, cv::BORDER_DEFAULT, -1, s); - GpuMat mag(fplane, cv::Rect(0, 2 * fh, fw, fh)); - GpuMat ang(fplane, cv::Rect(0, 3 * fh, fw, fh)); + cv::gpu::GpuMat mag(fplane, cv::Rect(0, 2 * fh, fw, fh)); + cv::gpu::GpuMat ang(fplane, cv::Rect(0, 3 * fh, fw, fh)); cv::gpu::cartToPolar(dfdx, dfdy, mag, ang, true, s); // normalize magnitude to uchar interval and angles to 6 bins - GpuMat nmag(fplane, cv::Rect(0, 4 * fh, fw, fh)); - GpuMat nang(fplane, cv::Rect(0, 5 * fh, fw, fh)); + cv::gpu::GpuMat nmag(fplane, cv::Rect(0, 4 * fh, fw, fh)); + cv::gpu::GpuMat nang(fplane, cv::Rect(0, 5 * fh, fw, fh)); cv::gpu::multiply(mag, cv::Scalar::all(1.f / (8 *::log(2.0f))), nmag, 1, -1, s); cv::gpu::multiply(ang, cv::Scalar::all(1.f / 60.f), nang, 1, -1, s); //create uchar magnitude - GpuMat cmag(channels, cv::Rect(0, fh * HOG_BINS, fw, fh)); + cv::gpu::GpuMat cmag(channels, cv::Rect(0, fh * HOG_BINS, fw, fh)); if (s) s.enqueueConvert(nmag, cmag, CV_8UC1); else nmag.convertTo(cmag, CV_8UC1); cudaStream_t stream = cv::gpu::StreamAccessor::getStream(s); - cv::gpu::device::icf::fillBins(channels, nang, fw, fh, HOG_BINS, stream); + cv::softcascade::device::fillBins(channels, nang, fw, fh, HOG_BINS, stream); } - void createLuvBins(const cv::gpu::GpuMat& colored, Stream& s) + void createLuvBins(const cv::gpu::GpuMat& colored, cv::gpu::Stream& s) { static const int fw = colored.cols; static const int fh = colored.rows; cv::gpu::cvtColor(colored, luv, CV_BGR2Luv, s); - std::vector splited; + std::vector splited; for(int i = 0; i < LUV_BINS; ++i) { - splited.push_back(GpuMat(channels, cv::Rect(0, fh * (7 + i), fw, fh))); + splited.push_back(cv::gpu::GpuMat(channels, cv::Rect(0, fh * (7 + i), fw, fh))); } cv::gpu::split(luv, splited, s); @@ -618,62 +613,62 @@ private: const int shrinkage; const int bins; - GpuMat gray; - GpuMat luv; - GpuMat channels; + cv::gpu::GpuMat gray; + cv::gpu::GpuMat luv; + cv::gpu::GpuMat channels; // preallocated buffer for floating point operations - GpuMat fplane; - GpuMat sobelBuf; + cv::gpu::GpuMat fplane; + cv::gpu::GpuMat sobelBuf; }; -struct SeparablePreprocessor : public cv::gpu::ChannelsProcessor +struct SeparablePreprocessor : public cv::softcascade::ChannelsProcessor { - SeparablePreprocessor(const int s, const int b) : 
cv::gpu::ChannelsProcessor(), shrinkage(s), bins(b) {} + SeparablePreprocessor(const int s, const int b) : cv::softcascade::ChannelsProcessor(), shrinkage(s), bins(b) {} virtual ~SeparablePreprocessor() {} - virtual void apply(InputArray _frame, OutputArray _shrunk, Stream& s = Stream::Null()) + virtual void apply(InputArray _frame, OutputArray _shrunk, cv::gpu::Stream& s = cv::gpu::Stream::Null()) { - const GpuMat frame = _frame.getGpuMat(); + const cv::gpu::GpuMat frame = _frame.getGpuMat(); cv::gpu::GaussianBlur(frame, bgr, cv::Size(3, 3), -1.0); _shrunk.create(frame.rows * (4 + bins) / shrinkage, frame.cols / shrinkage, CV_8UC1); - GpuMat shrunk = _shrunk.getGpuMat(); + cv::gpu::GpuMat shrunk = _shrunk.getGpuMat(); channels.create(frame.rows * (4 + bins), frame.cols, CV_8UC1); setZero(channels, s); cv::gpu::cvtColor(bgr, gray, CV_BGR2GRAY); - cv::gpu::device::icf::gray2hog(gray, channels(cv::Rect(0, 0, bgr.cols, bgr.rows * (bins + 1))), bins); + cv::softcascade::device::gray2hog(gray, channels(cv::Rect(0, 0, bgr.cols, bgr.rows * (bins + 1))), bins); cv::gpu::GpuMat luv(channels, cv::Rect(0, bgr.rows * (bins + 1), bgr.cols, bgr.rows * 3)); - cv::gpu::device::icf::bgr2Luv(bgr, luv); - cv::gpu::device::icf::shrink(channels, shrunk); + cv::softcascade::device::bgr2Luv(bgr, luv); + cv::softcascade::device::shrink(channels, shrunk); } private: const int shrinkage; const int bins; - GpuMat bgr; - GpuMat gray; - GpuMat channels; + cv::gpu::GpuMat bgr; + cv::gpu::GpuMat gray; + cv::gpu::GpuMat channels; }; } -cv::Ptr cv::gpu::ChannelsProcessor::create(const int s, const int b, const int m) +cv::Ptr cv::softcascade::ChannelsProcessor::create(const int s, const int b, const int m) { CV_Assert((m && SEPARABLE) || (m && GENERIC)); if (m && GENERIC) - return cv::Ptr(new GenricPreprocessor(s, b)); + return cv::Ptr(new GenricPreprocessor(s, b)); - return cv::Ptr(new SeparablePreprocessor(s, b)); + return cv::Ptr(new SeparablePreprocessor(s, b)); } -cv::gpu::ChannelsProcessor::ChannelsProcessor() { } -cv::gpu::ChannelsProcessor::~ChannelsProcessor() { } +cv::softcascade::ChannelsProcessor::ChannelsProcessor() { } +cv::softcascade::ChannelsProcessor::~ChannelsProcessor() { } #endif diff --git a/modules/softcascade/src/softcascade_init.cpp b/modules/softcascade/src/softcascade_init.cpp index 48ad46ab2..902ad48a1 100644 --- a/modules/softcascade/src/softcascade_init.cpp +++ b/modules/softcascade/src/softcascade_init.cpp @@ -51,11 +51,16 @@ CV_INIT_ALGORITHM(Detector, "SoftCascade.Detector", obj.info()->addParam(obj, "scales", obj.scales); obj.info()->addParam(obj, "rejCriteria", obj.rejCriteria)); +CV_INIT_ALGORITHM(SCascade, "CascadeDetector.SCascade", + obj.info()->addParam(obj, "minScale", obj.minScale); + obj.info()->addParam(obj, "maxScale", obj.maxScale); + obj.info()->addParam(obj, "scales", obj.scales)); bool initModule_softcascade(void) { + Ptr sc = createSCascade(); Ptr sc1 = createDetector(); - return (sc1->info() != 0); + return (sc1->info() != 0) && (sc->info() != 0); } } } \ No newline at end of file diff --git a/modules/gpu/test/test_softcascade.cpp b/modules/softcascade/test/test_cuda_softcascade.cpp similarity index 80% rename from modules/gpu/test/test_softcascade.cpp rename to modules/softcascade/test/test_cuda_softcascade.cpp index c08dc06c8..f97a26ad3 100644 --- a/modules/gpu/test/test_softcascade.cpp +++ b/modules/softcascade/test/test_cuda_softcascade.cpp @@ -41,10 +41,9 @@ //M*/ #include "test_precomp.hpp" +#include "opencv2/core/gpumat.hpp" -#ifdef HAVE_CUDA - -using 
cv::gpu::GpuMat; +using std::tr1::get; // show detection results on input image with cv::imshow //#define SHOW_DETECTIONS @@ -59,7 +58,7 @@ using cv::gpu::GpuMat; static std::string path(std::string relative) { - return cvtest::TS::ptr()->get_data_path() + "../cv/cascadeandhog/" + relative; + return cvtest::TS::ptr()->get_data_path() + "cascadeandhog/" + relative; } TEST(SCascadeTest, readCascade) @@ -67,7 +66,7 @@ TEST(SCascadeTest, readCascade) std::string xml = path("cascades/inria_caltech-17.01.2013.xml"); cv::FileStorage fs(xml, cv::FileStorage::READ); - cv::gpu::SCascade cascade; + cv::softcascade::SCascade cascade; ASSERT_TRUE(fs.isOpened()); ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode())); @@ -75,7 +74,7 @@ TEST(SCascadeTest, readCascade) namespace { - typedef cv::gpu::SCascade::Detection Detection; + typedef cv::softcascade::SCascade::Detection Detection; cv::Rect getFromTable(int idx) { @@ -97,7 +96,6 @@ namespace return rois[idx]; } - void print(std::ostream &out, const Detection& d) { #if defined SHOW_DETECTIONS @@ -156,36 +154,36 @@ namespace #endif } -PARAM_TEST_CASE(SCascadeTestRoi, cv::gpu::DeviceInfo, std::string, std::string, int) +class SCascadeTestRoi : public ::testing::TestWithParam > { virtual void SetUp() { - cv::gpu::setDevice(GET_PARAM(0).deviceID()); + cv::gpu::setDevice(get<0>(GetParam()).deviceID()); } }; -GPU_TEST_P(SCascadeTestRoi, Detect) +TEST_P(SCascadeTestRoi, Detect) { - cv::Mat coloredCpu = cv::imread(path(GET_PARAM(2))); + cv::Mat coloredCpu = cv::imread(path(get<2>(GetParam()))); ASSERT_FALSE(coloredCpu.empty()); - cv::gpu::SCascade cascade; + cv::softcascade::SCascade cascade; - cv::FileStorage fs(path(GET_PARAM(1)), cv::FileStorage::READ); + cv::FileStorage fs(path(get<1>(GetParam())), cv::FileStorage::READ); ASSERT_TRUE(fs.isOpened()); ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode())); - GpuMat colored(coloredCpu), objectBoxes(1, 16384, CV_8UC1), rois(colored.size(), CV_8UC1); + cv::gpu::GpuMat colored(coloredCpu), objectBoxes(1, 16384, CV_8UC1), rois(colored.size(), CV_8UC1); rois.setTo(0); - int nroi = GET_PARAM(3); + int nroi = get<3>(GetParam()); cv::Mat result(coloredCpu); cv::RNG rng; for (int i = 0; i < nroi; ++i) { cv::Rect r = getFromTable(rng(10)); - GpuMat sub(rois, r); + cv::gpu::GpuMat sub(rois, r); sub.setTo(1); cv::rectangle(result, r, cv::Scalar(0, 0, 255, 255), 1); } @@ -194,7 +192,7 @@ GPU_TEST_P(SCascadeTestRoi, Detect) cascade.detect(colored, rois, objectBoxes); cv::Mat dt(objectBoxes); - typedef cv::gpu::SCascade::Detection Detection; + typedef cv::softcascade::SCascade::Detection Detection; Detection* dts = ((Detection*)dt.data) + 1; int* count = dt.ptr(0); @@ -211,15 +209,13 @@ GPU_TEST_P(SCascadeTestRoi, Detect) SHOW(result); } -INSTANTIATE_TEST_CASE_P(GPU_SoftCascade, SCascadeTestRoi, testing::Combine( - ALL_DEVICES, +INSTANTIATE_TEST_CASE_P(cuda_accelerated, SCascadeTestRoi, testing::Combine( + testing::ValuesIn(DeviceManager::instance().values()), testing::Values(std::string("cascades/inria_caltech-17.01.2013.xml"), std::string("cascades/sc_cvpr_2012_to_opencv_new_format.xml")), testing::Values(std::string("images/image_00000000_0.png")), testing::Range(0, 5))); -//////////////////////////////////////// - namespace { struct Fixture @@ -232,23 +228,24 @@ struct Fixture }; } -PARAM_TEST_CASE(SCascadeTestAll, cv::gpu::DeviceInfo, Fixture) +typedef std::tr1::tuple SCascadeTestAllFixture; +class SCascadeTestAll : public ::testing::TestWithParam { - +protected: std::string xml; int expected; virtual void SetUp() { - 
cv::gpu::setDevice(GET_PARAM(0).deviceID()); - xml = path(GET_PARAM(1).path); - expected = GET_PARAM(1).expected; + cv::gpu::setDevice(get<0>(GetParam()).deviceID()); + xml = path(get<1>(GetParam()).path); + expected = get<1>(GetParam()).expected; } }; -GPU_TEST_P(SCascadeTestAll, detect) +TEST_P(SCascadeTestAll, detect) { - cv::gpu::SCascade cascade; + cv::softcascade::SCascade cascade; cv::FileStorage fs(xml, cv::FileStorage::READ); ASSERT_TRUE(fs.isOpened()); @@ -258,12 +255,12 @@ GPU_TEST_P(SCascadeTestAll, detect) cv::Mat coloredCpu = cv::imread(path("images/image_00000000_0.png")); ASSERT_FALSE(coloredCpu.empty()); - GpuMat colored(coloredCpu), objectBoxes, rois(colored.size(), CV_8UC1); + cv::gpu::GpuMat colored(coloredCpu), objectBoxes, rois(colored.size(), CV_8UC1); rois.setTo(1); cascade.detect(colored, rois, objectBoxes); - typedef cv::gpu::SCascade::Detection Detection; + typedef cv::softcascade::SCascade::Detection Detection; cv::Mat dt(objectBoxes); @@ -283,9 +280,9 @@ GPU_TEST_P(SCascadeTestAll, detect) ASSERT_EQ(*count, expected); } -GPU_TEST_P(SCascadeTestAll, detectStream) +TEST_P(SCascadeTestAll, detectStream) { - cv::gpu::SCascade cascade; + cv::softcascade::SCascade cascade; cv::FileStorage fs(xml, cv::FileStorage::READ); ASSERT_TRUE(fs.isOpened()); @@ -295,7 +292,7 @@ GPU_TEST_P(SCascadeTestAll, detectStream) cv::Mat coloredCpu = cv::imread(path("images/image_00000000_0.png")); ASSERT_FALSE(coloredCpu.empty()); - GpuMat colored(coloredCpu), objectBoxes(1, 100000, CV_8UC1), rois(colored.size(), CV_8UC1); + cv::gpu::GpuMat colored(coloredCpu), objectBoxes(1, 100000, CV_8UC1), rois(colored.size(), CV_8UC1); rois.setTo(cv::Scalar::all(1)); cv::gpu::Stream s; @@ -304,14 +301,12 @@ GPU_TEST_P(SCascadeTestAll, detectStream) cascade.detect(colored, rois, objectBoxes, s); s.waitForCompletion(); - typedef cv::gpu::SCascade::Detection Detection; + typedef cv::softcascade::SCascade::Detection Detection; cv::Mat detections(objectBoxes); int a = *(detections.ptr(0)); ASSERT_EQ(a, expected); } -INSTANTIATE_TEST_CASE_P(GPU_SoftCascade, SCascadeTestAll, testing::Combine( ALL_DEVICES, +INSTANTIATE_TEST_CASE_P(cuda_accelerated, SCascadeTestAll, testing::Combine( ALL_DEVICES, testing::Values(Fixture("cascades/inria_caltech-17.01.2013.xml", 7), - Fixture("cascades/sc_cvpr_2012_to_opencv_new_format.xml", 1291)))); - -#endif + Fixture("cascades/sc_cvpr_2012_to_opencv_new_format.xml", 1291)))); \ No newline at end of file diff --git a/modules/softcascade/test/test_main.cpp b/modules/softcascade/test/test_main.cpp index d3999d483..ab0e8615b 100644 --- a/modules/softcascade/test/test_main.cpp +++ b/modules/softcascade/test/test_main.cpp @@ -42,4 +42,4 @@ #include "test_precomp.hpp" -CV_TEST_MAIN("cv") +CV_TEST_MAIN("cv") \ No newline at end of file diff --git a/modules/softcascade/test/utility.cpp b/modules/softcascade/test/utility.cpp new file mode 100644 index 000000000..cb3b1fbf4 --- /dev/null +++ b/modules/softcascade/test/utility.cpp @@ -0,0 +1,109 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// Intel License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000, Intel Corporation, all rights reserved. 
+// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of Intel Corporation may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "test_precomp.hpp" + +#ifdef HAVE_CUDA + + +using namespace std; +using namespace cv; +using namespace cv::gpu; +using namespace cvtest; +using namespace testing; +using namespace testing::internal; + +////////////////////////////////////////////////////////////////////// +// Gpu devices + +bool supportFeature(const DeviceInfo& info, FeatureSet feature) +{ + return TargetArchs::builtWith(feature) && info.supports(feature); +} + +DeviceManager& DeviceManager::instance() +{ + static DeviceManager obj; + return obj; +} + +void DeviceManager::load(int i) +{ + devices_.clear(); + devices_.reserve(1); + + std::ostringstream msg; + + if (i < 0 || i >= getCudaEnabledDeviceCount()) + { + msg << "Incorrect device number - " << i; + CV_Error(CV_StsBadArg, msg.str()); + } + + DeviceInfo info(i); + + if (!info.isCompatible()) + { + msg << "Device " << i << " [" << info.name() << "] is NOT compatible with current GPU module build"; + CV_Error(CV_StsBadArg, msg.str()); + } + + devices_.push_back(info); +} + +void DeviceManager::loadAll() +{ + int deviceCount = getCudaEnabledDeviceCount(); + + devices_.clear(); + devices_.reserve(deviceCount); + + for (int i = 0; i < deviceCount; ++i) + { + DeviceInfo info(i); + if (info.isCompatible()) + { + devices_.push_back(info); + } + } +} + +#endif // HAVE_CUDA diff --git a/modules/softcascade/test/utility.hpp b/modules/softcascade/test/utility.hpp new file mode 100644 index 000000000..e6b840c53 --- /dev/null +++ b/modules/softcascade/test/utility.hpp @@ -0,0 +1,73 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. 
+// +// +// Intel License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000, Intel Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of Intel Corporation may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef __OPENCV_SOFTCASCADE_TEST_UTILITY_HPP__ +#define __OPENCV_SOFTCASCADE_TEST_UTILITY_HPP__ + +#include "opencv2/core/core.hpp" +#include "opencv2/core/gpumat.hpp" +#include "opencv2/ts/ts.hpp" +#include "opencv2/ts/ts_perf.hpp" + +////////////////////////////////////////////////////////////////////// +// Gpu devices +//! return true if device supports specified feature and gpu module was built with support the feature. 
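// (Editorial aside, not part of the original patch: the helper declared just below
//  combines a build-time check with a runtime query, so callers can skip gracefully
//  on unsupported hardware, e.g.
//      if (!supportFeature(cv::gpu::DeviceInfo(), cv::gpu::WARP_SHUFFLE_FUNCTIONS)) return;
//  which is the feature the shuffle-based integral kernels introduced later rely on.)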
+bool supportFeature(const cv::gpu::DeviceInfo& info, cv::gpu::FeatureSet feature); + +class DeviceManager +{ +public: + static DeviceManager& instance(); + + void load(int i); + void loadAll(); + + const std::vector& values() const { return devices_; } + +private: + std::vector devices_; + DeviceManager() {loadAll();} +}; + +#define ALL_DEVICES testing::ValuesIn(DeviceManager::instance().values()) + + +#endif // __OPENCV_GPU_TEST_UTILITY_HPP__ diff --git a/samples/gpu/CMakeLists.txt b/samples/gpu/CMakeLists.txt index 6abb7e5af..6d20fc34d 100644 --- a/samples/gpu/CMakeLists.txt +++ b/samples/gpu/CMakeLists.txt @@ -1,7 +1,7 @@ SET(OPENCV_GPU_SAMPLES_REQUIRED_DEPS opencv_core opencv_flann opencv_imgproc opencv_highgui opencv_ml opencv_video opencv_objdetect opencv_features2d opencv_calib3d opencv_legacy opencv_contrib opencv_gpu - opencv_nonfree) + opencv_nonfree opencv_softcascade) ocv_check_dependencies(${OPENCV_GPU_SAMPLES_REQUIRED_DEPS}) diff --git a/samples/gpu/softcascade.cpp b/samples/gpu/softcascade.cpp index 5f1adaf6c..e3683583a 100644 --- a/samples/gpu/softcascade.cpp +++ b/samples/gpu/softcascade.cpp @@ -1,4 +1,5 @@ #include +#include #include #include @@ -46,7 +47,7 @@ int main(int argc, char** argv) float maxScale = parser.get("max_scale"); int scales = parser.get("total_scales"); - using cv::gpu::SCascade; + using cv::softcascade::SCascade; SCascade cascade(minScale, maxScale, scales); if (!cascade.load(fs.getFirstTopLevelNode())) @@ -79,7 +80,7 @@ int main(int argc, char** argv) cascade.detect(dframe, roi, objects); cv::Mat dt(objects); - typedef cv::gpu::SCascade::Detection Detection; + typedef cv::softcascade::SCascade::Detection Detection; Detection* dts = ((Detection*)dt.data) + 1; int* count = dt.ptr(0); From 6daf17f9740b3a6ca0f9144f07ed32e5199e24e3 Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Sun, 3 Mar 2013 13:01:17 +0400 Subject: [PATCH 3/8] remove softcascade host dependencies on gpu module --- modules/core/include/opencv2/core/gpumat.hpp | 135 ++++++++++++++++++ .../include/opencv2/core/stream_accessor.hpp | 64 +++++++++ modules/{gpu => core}/src/cudastream.cpp | 27 ++++ .../{gpu => core}/src/matrix_operations.cpp | 30 +++- .../include/opencv2/gpu/stream_accessor.hpp | 2 +- modules/softcascade/CMakeLists.txt | 4 +- modules/softcascade/src/cuda/icf-sc.cu | 2 + modules/softcascade/src/detector_cuda.cpp | 28 ++-- 8 files changed, 274 insertions(+), 18 deletions(-) create mode 100644 modules/core/include/opencv2/core/stream_accessor.hpp rename modules/{gpu => core}/src/cudastream.cpp (92%) rename modules/{gpu => core}/src/matrix_operations.cpp (89%) diff --git a/modules/core/include/opencv2/core/gpumat.hpp b/modules/core/include/opencv2/core/gpumat.hpp index be757a935..bb436ec55 100644 --- a/modules/core/include/opencv2/core/gpumat.hpp +++ b/modules/core/include/opencv2/core/gpumat.hpp @@ -50,6 +50,141 @@ namespace cv { namespace gpu { + //////////////////////////////// CudaMem //////////////////////////////// + // CudaMem is limited cv::Mat with page locked memory allocation. + // Page locked memory is only needed for async and faster coping to GPU. + // It is convertable to cv::Mat header without reference counting + // so you can use it with other opencv functions. + + // Page-locks the matrix m memory and maps it for the device(s) + CV_EXPORTS void registerPageLocked(Mat& m); + // Unmaps the memory of matrix m, and makes it pageable again. 
+ CV_EXPORTS void unregisterPageLocked(Mat& m); + + class CV_EXPORTS CudaMem + { + public: + enum { ALLOC_PAGE_LOCKED = 1, ALLOC_ZEROCOPY = 2, ALLOC_WRITE_COMBINED = 4 }; + + CudaMem(); + CudaMem(const CudaMem& m); + + CudaMem(int rows, int cols, int type, int _alloc_type = ALLOC_PAGE_LOCKED); + CudaMem(Size size, int type, int alloc_type = ALLOC_PAGE_LOCKED); + + + //! creates from cv::Mat with coping data + explicit CudaMem(const Mat& m, int alloc_type = ALLOC_PAGE_LOCKED); + + ~CudaMem(); + + CudaMem& operator = (const CudaMem& m); + + //! returns deep copy of the matrix, i.e. the data is copied + CudaMem clone() const; + + //! allocates new matrix data unless the matrix already has specified size and type. + void create(int rows, int cols, int type, int alloc_type = ALLOC_PAGE_LOCKED); + void create(Size size, int type, int alloc_type = ALLOC_PAGE_LOCKED); + + //! decrements reference counter and released memory if needed. + void release(); + + //! returns matrix header with disabled reference counting for CudaMem data. + Mat createMatHeader() const; + operator Mat() const; + + //! maps host memory into device address space and returns GpuMat header for it. Throws exception if not supported by hardware. + GpuMat createGpuMatHeader() const; + operator GpuMat() const; + + //returns if host memory can be mapperd to gpu address space; + static bool canMapHostMemory(); + + // Please see cv::Mat for descriptions + bool isContinuous() const; + size_t elemSize() const; + size_t elemSize1() const; + int type() const; + int depth() const; + int channels() const; + size_t step1() const; + Size size() const; + bool empty() const; + + + // Please see cv::Mat for descriptions + int flags; + int rows, cols; + size_t step; + + uchar* data; + int* refcount; + + uchar* datastart; + uchar* dataend; + + int alloc_type; + }; + + + //////////////////////////////// CudaStream //////////////////////////////// + // Encapculates Cuda Stream. Provides interface for async coping. + // Passed to each function that supports async kernel execution. + // Reference counting is enabled + + class CV_EXPORTS Stream + { + public: + Stream(); + ~Stream(); + + Stream(const Stream&); + Stream& operator =(const Stream&); + + bool queryIfComplete(); + void waitForCompletion(); + + //! downloads asynchronously + // Warning! cv::Mat must point to page locked memory (i.e. to CudaMem data or to its subMat) + void enqueueDownload(const GpuMat& src, CudaMem& dst); + void enqueueDownload(const GpuMat& src, Mat& dst); + + //! uploads asynchronously + // Warning! cv::Mat must point to page locked memory (i.e. to CudaMem data or to its ROI) + void enqueueUpload(const CudaMem& src, GpuMat& dst); + void enqueueUpload(const Mat& src, GpuMat& dst); + + //! copy asynchronously + void enqueueCopy(const GpuMat& src, GpuMat& dst); + + //! memory set asynchronously + void enqueueMemSet(GpuMat& src, Scalar val); + void enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask); + + //! converts matrix type, ex from float to uchar depending on type + void enqueueConvert(const GpuMat& src, GpuMat& dst, int dtype, double a = 1, double b = 0); + + //! 
adds a callback to be called on the host after all currently enqueued items in the stream have completed + typedef void (*StreamCallback)(Stream& stream, int status, void* userData); + void enqueueHostCallback(StreamCallback callback, void* userData); + + static Stream& Null(); + + operator bool() const; + + private: + struct Impl; + + explicit Stream(Impl* impl); + void create(); + void release(); + + Impl *impl; + + friend struct StreamAccessor; + }; + //////////////////////////////// Initialization & Info //////////////////////// //! This is the only function that do not throw exceptions if the library is compiled without Cuda. diff --git a/modules/core/include/opencv2/core/stream_accessor.hpp b/modules/core/include/opencv2/core/stream_accessor.hpp new file mode 100644 index 000000000..6a1a0bddd --- /dev/null +++ b/modules/core/include/opencv2/core/stream_accessor.hpp @@ -0,0 +1,64 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other GpuMaterials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef __OPENCV_GPU_STREAM_ACCESSOR_HPP__ +#define __OPENCV_GPU_STREAM_ACCESSOR_HPP__ + +#include "opencv2/gpu/gpu.hpp" +#include "cuda_runtime_api.h" + +namespace cv +{ + namespace gpu + { + // This is only header file that depends on Cuda. All other headers are independent. + // So if you use OpenCV binaries you do noot need to install Cuda Toolkit. + // But of you wanna use GPU by yourself, may get cuda stream instance using the class below. + // In this case you have to install Cuda Toolkit. 
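        // (Editorial aside, not part of the original patch: the comment above describes
        //  the intended workflow. A user who writes custom CUDA kernels can pull the raw
        //  cudaStream_t out of a cv::gpu::Stream and enqueue work on the same stream that
        //  OpenCV uses, for example with a hypothetical kernel and launch configuration:
        //
        //      cudaStream_t raw = cv::gpu::StreamAccessor::getStream(stream);
        //      myKernel<<<grid, block, 0, raw>>>(args);
        //
        //  Only this header includes cuda_runtime_api.h, which is what keeps the CUDA
        //  Toolkit dependency out of the other public headers.)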
+ struct StreamAccessor + { + CV_EXPORTS static cudaStream_t getStream(const Stream& stream); + }; + } +} + +#endif /* __OPENCV_GPU_STREAM_ACCESSOR_HPP__ */ \ No newline at end of file diff --git a/modules/gpu/src/cudastream.cpp b/modules/core/src/cudastream.cpp similarity index 92% rename from modules/gpu/src/cudastream.cpp rename to modules/core/src/cudastream.cpp index e302fd1fc..a10807cf2 100644 --- a/modules/gpu/src/cudastream.cpp +++ b/modules/core/src/cudastream.cpp @@ -72,6 +72,33 @@ void cv::gpu::Stream::release() { throw_nogpu(); } #include "opencv2/gpu/stream_accessor.hpp" +namespace +{ +#if defined(__GNUC__) + #define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, __func__) + #define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__, __func__) +#else /* defined(__CUDACC__) || defined(__MSVC__) */ + #define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__) + #define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__) +#endif + + inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "") + { + if (cudaSuccess != err) + cv::gpu::error(cudaGetErrorString(err), file, line, func); + } + + inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "") + { + if (err < 0) + { + std::ostringstream msg; + msg << "NPP API Call Error: " << err; + cv::gpu::error(msg.str().c_str(), file, line, func); + } + } +} + namespace cv { namespace gpu { void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream); diff --git a/modules/gpu/src/matrix_operations.cpp b/modules/core/src/matrix_operations.cpp similarity index 89% rename from modules/gpu/src/matrix_operations.cpp rename to modules/core/src/matrix_operations.cpp index b19524842..3b82df572 100644 --- a/modules/gpu/src/matrix_operations.cpp +++ b/modules/core/src/matrix_operations.cpp @@ -41,6 +41,7 @@ //M*/ #include "precomp.hpp" +#include "opencv2/core/gpumat.hpp" using namespace cv; using namespace cv::gpu; @@ -178,7 +179,7 @@ bool cv::gpu::CudaMem::empty() const return data == 0; } -#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER) +#if !defined (HAVE_CUDA) void cv::gpu::registerPageLocked(Mat&) { throw_nogpu(); } void cv::gpu::unregisterPageLocked(Mat&) { throw_nogpu(); } @@ -188,6 +189,33 @@ void cv::gpu::CudaMem::release() { throw_nogpu(); } GpuMat cv::gpu::CudaMem::createGpuMatHeader () const { throw_nogpu(); return GpuMat(); } #else /* !defined (HAVE_CUDA) */ +#include +namespace +{ +#if defined(__GNUC__) + #define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, __func__) + #define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__, __func__) +#else /* defined(__CUDACC__) || defined(__MSVC__) */ + #define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__) + #define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__) +#endif + + inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "") + { + if (cudaSuccess != err) + cv::gpu::error(cudaGetErrorString(err), file, line, func); + } + + inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "") + { + if (err < 0) + { + std::ostringstream msg; + msg << "NPP API Call Error: " << err; + cv::gpu::error(msg.str().c_str(), file, line, func); + } + } +} void cv::gpu::registerPageLocked(Mat& m) { diff --git a/modules/gpu/include/opencv2/gpu/stream_accessor.hpp b/modules/gpu/include/opencv2/gpu/stream_accessor.hpp index abafc613d..1797749b6 
100644 --- a/modules/gpu/include/opencv2/gpu/stream_accessor.hpp +++ b/modules/gpu/include/opencv2/gpu/stream_accessor.hpp @@ -56,7 +56,7 @@ namespace cv // In this case you have to install Cuda Toolkit. struct StreamAccessor { - CV_EXPORTS static cudaStream_t getStream(const Stream& stream); + CV_EXPORTS static cudaStream_t getStream(const cv::gpu::Stream& stream); }; } } diff --git a/modules/softcascade/CMakeLists.txt b/modules/softcascade/CMakeLists.txt index f19241a4a..ee6e89299 100644 --- a/modules/softcascade/CMakeLists.txt +++ b/modules/softcascade/CMakeLists.txt @@ -9,7 +9,7 @@ macro(ocv_glob_cuda_powered_module_sources) set(lib_device_hdrs "") if (HAVE_CUDA AND lib_device_srcs) - ocv_include_directories(${CUDA_INCLUDE_DIRS}) + ocv_include_directories(${CUDA_INCLUDE_DIRS} "${OpenCV_SOURCE_DIR}/modules/gpu/include") file(GLOB_RECURSE lib_device_hdrs "src/cuda/*.hpp") ocv_cuda_compile(device_objs ${lib_device_srcs}) @@ -35,7 +35,7 @@ set(cuda_include "") if (NAVE_CUDA) set(cuda_deps ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY}) endif() -ocv_add_module(softcascade opencv_core opencv_imgproc opencv_ml OPTIONAL opencv_gpu ${cuda_deps}) +ocv_add_module(softcascade opencv_core opencv_imgproc opencv_ml OPTIONAL ${cuda_deps}) if(HAVE_CUDA) ocv_module_include_directories(${CUDA_INCLUDE_DIRS}) diff --git a/modules/softcascade/src/cuda/icf-sc.cu b/modules/softcascade/src/cuda/icf-sc.cu index d339ef0d3..19b20db9c 100644 --- a/modules/softcascade/src/cuda/icf-sc.cu +++ b/modules/softcascade/src/cuda/icf-sc.cu @@ -49,6 +49,8 @@ namespace cv { namespace softcascade { namespace device { +typedef unsigned char uchar; + template __device__ __forceinline__ uchar shrink(const uchar* ptr, const int pitch, const int y, const int x) { diff --git a/modules/softcascade/src/detector_cuda.cpp b/modules/softcascade/src/detector_cuda.cpp index 6c920332f..a013a16fe 100644 --- a/modules/softcascade/src/detector_cuda.cpp +++ b/modules/softcascade/src/detector_cuda.cpp @@ -480,8 +480,8 @@ void cv::softcascade::SCascade::detect(InputArray _image, InputArray _rois, Outp flds.mask.create( rois.cols / shr, rois.rows / shr, rois.type()); - cv::gpu::resize(rois, flds.genRoiTmp, cv::Size(), 1.f / shr, 1.f / shr, CV_INTER_AREA, s); - cv::gpu::transpose(flds.genRoiTmp, flds.mask, s); + //cv::gpu::resize(rois, flds.genRoiTmp, cv::Size(), 1.f / shr, 1.f / shr, CV_INTER_AREA, s); + //cv::gpu::transpose(flds.genRoiTmp, flds.mask, s); if (type == CV_8UC3) { @@ -491,7 +491,7 @@ void cv::softcascade::SCascade::detect(InputArray _image, InputArray _rois, Outp flds.createLevels(image.rows, image.cols); flds.preprocessor->apply(image, flds.shrunk); - cv::gpu::integralBuffered(flds.shrunk, flds.hogluv, flds.integralBuffer, s); + //cv::gpu::integralBuffered(flds.shrunk, flds.hogluv, flds.integralBuffer, s); } else { @@ -546,12 +546,12 @@ struct GenricPreprocessor : public cv::softcascade::ChannelsProcessor channels.create(frame.rows * (4 + bins), frame.cols, CV_8UC1); setZero(channels, s); - cv::gpu::cvtColor(frame, gray, CV_BGR2GRAY, s); + //cv::gpu::cvtColor(frame, gray, CV_BGR2GRAY, s); createHogBins(s); createLuvBins(frame, s); - cv::gpu::resize(channels, shrunk, cv::Size(), 1.f / shrinkage, 1.f / shrinkage, CV_INTER_AREA, s); + //cv::gpu::resize(channels, shrunk, cv::Size(), 1.f / shrinkage, 1.f / shrinkage, CV_INTER_AREA, s); } private: @@ -566,20 +566,20 @@ private: cv::gpu::GpuMat dfdx(fplane, cv::Rect(0, 0, fw, fh)); cv::gpu::GpuMat dfdy(fplane, cv::Rect(0, fh, fw, fh)); - cv::gpu::Sobel(gray, dfdx, CV_32F, 1, 0, sobelBuf, 3, 1, 
cv::BORDER_DEFAULT, -1, s); - cv::gpu::Sobel(gray, dfdy, CV_32F, 0, 1, sobelBuf, 3, 1, cv::BORDER_DEFAULT, -1, s); + //cv::gpu::Sobel(gray, dfdx, CV_32F, 1, 0, sobelBuf, 3, 1, cv::BORDER_DEFAULT, -1, s); + //cv::gpu::Sobel(gray, dfdy, CV_32F, 0, 1, sobelBuf, 3, 1, cv::BORDER_DEFAULT, -1, s); cv::gpu::GpuMat mag(fplane, cv::Rect(0, 2 * fh, fw, fh)); cv::gpu::GpuMat ang(fplane, cv::Rect(0, 3 * fh, fw, fh)); - cv::gpu::cartToPolar(dfdx, dfdy, mag, ang, true, s); + //cv::gpu::cartToPolar(dfdx, dfdy, mag, ang, true, s); // normalize magnitude to uchar interval and angles to 6 bins cv::gpu::GpuMat nmag(fplane, cv::Rect(0, 4 * fh, fw, fh)); cv::gpu::GpuMat nang(fplane, cv::Rect(0, 5 * fh, fw, fh)); - cv::gpu::multiply(mag, cv::Scalar::all(1.f / (8 *::log(2.0f))), nmag, 1, -1, s); - cv::gpu::multiply(ang, cv::Scalar::all(1.f / 60.f), nang, 1, -1, s); + //cv::gpu::multiply(mag, cv::Scalar::all(1.f / (8 *::log(2.0f))), nmag, 1, -1, s); + //cv::gpu::multiply(ang, cv::Scalar::all(1.f / 60.f), nang, 1, -1, s); //create uchar magnitude cv::gpu::GpuMat cmag(channels, cv::Rect(0, fh * HOG_BINS, fw, fh)); @@ -597,7 +597,7 @@ private: static const int fw = colored.cols; static const int fh = colored.rows; - cv::gpu::cvtColor(colored, luv, CV_BGR2Luv, s); + //cv::gpu::cvtColor(colored, luv, CV_BGR2Luv, s); std::vector splited; for(int i = 0; i < LUV_BINS; ++i) @@ -605,7 +605,7 @@ private: splited.push_back(cv::gpu::GpuMat(channels, cv::Rect(0, fh * (7 + i), fw, fh))); } - cv::gpu::split(luv, splited, s); + //cv::gpu::split(luv, splited, s); } enum {HOG_BINS = 6, LUV_BINS = 3}; @@ -631,7 +631,7 @@ struct SeparablePreprocessor : public cv::softcascade::ChannelsProcessor virtual void apply(InputArray _frame, OutputArray _shrunk, cv::gpu::Stream& s = cv::gpu::Stream::Null()) { const cv::gpu::GpuMat frame = _frame.getGpuMat(); - cv::gpu::GaussianBlur(frame, bgr, cv::Size(3, 3), -1.0); + //cv::gpu::GaussianBlur(frame, bgr, cv::Size(3, 3), -1.0); _shrunk.create(frame.rows * (4 + bins) / shrinkage, frame.cols / shrinkage, CV_8UC1); cv::gpu::GpuMat shrunk = _shrunk.getGpuMat(); @@ -639,7 +639,7 @@ struct SeparablePreprocessor : public cv::softcascade::ChannelsProcessor channels.create(frame.rows * (4 + bins), frame.cols, CV_8UC1); setZero(channels, s); - cv::gpu::cvtColor(bgr, gray, CV_BGR2GRAY); + //cv::gpu::cvtColor(bgr, gray, CV_BGR2GRAY); cv::softcascade::device::gray2hog(gray, channels(cv::Rect(0, 0, bgr.cols, bgr.rows * (bins + 1))), bins); cv::gpu::GpuMat luv(channels, cv::Rect(0, bgr.rows * (bins + 1), bgr.cols, bgr.rows * 3)); From 3c8e66d58025956d0d2c7e205395dacd282f8945 Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Sun, 3 Mar 2013 16:27:49 +0400 Subject: [PATCH 4/8] softcascade: remove device dependency on gpu --- .../include/opencv2/core/stream_accessor.hpp | 2 +- modules/core/src/cudastream.cpp | 4 +- modules/core/src/matrix_operations.cpp | 13 ++++--- modules/softcascade/CMakeLists.txt | 4 +- modules/softcascade/src/cuda/icf-sc.cu | 39 +++++++++++++------ modules/softcascade/src/cuda_invoker.hpp | 8 ++-- modules/softcascade/src/detector_cuda.cpp | 24 +++++++++--- .../test/test_cuda_softcascade.cpp | 8 +++- modules/softcascade/test/utility.hpp | 9 +++-- 9 files changed, 77 insertions(+), 34 deletions(-) diff --git a/modules/core/include/opencv2/core/stream_accessor.hpp b/modules/core/include/opencv2/core/stream_accessor.hpp index 6a1a0bddd..30dcc6042 100644 --- a/modules/core/include/opencv2/core/stream_accessor.hpp +++ b/modules/core/include/opencv2/core/stream_accessor.hpp @@ -43,7 +43,7 @@ 
#ifndef __OPENCV_GPU_STREAM_ACCESSOR_HPP__ #define __OPENCV_GPU_STREAM_ACCESSOR_HPP__ -#include "opencv2/gpu/gpu.hpp" +#include "opencv2/core/gpumat.hpp" #include "cuda_runtime_api.h" namespace cv diff --git a/modules/core/src/cudastream.cpp b/modules/core/src/cudastream.cpp index a10807cf2..c22db8719 100644 --- a/modules/core/src/cudastream.cpp +++ b/modules/core/src/cudastream.cpp @@ -41,11 +41,13 @@ //M*/ #include "precomp.hpp" +#include "opencv2/core/gpumat.hpp" using namespace cv; using namespace cv::gpu; #if !defined (HAVE_CUDA) +#define throw_nogpu() CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support") cv::gpu::Stream::Stream() { throw_nogpu(); } cv::gpu::Stream::~Stream() {} @@ -70,7 +72,7 @@ void cv::gpu::Stream::release() { throw_nogpu(); } #else /* !defined (HAVE_CUDA) */ -#include "opencv2/gpu/stream_accessor.hpp" +#include "opencv2/core/stream_accessor.hpp" namespace { diff --git a/modules/core/src/matrix_operations.cpp b/modules/core/src/matrix_operations.cpp index 3b82df572..eace5181d 100644 --- a/modules/core/src/matrix_operations.cpp +++ b/modules/core/src/matrix_operations.cpp @@ -181,12 +181,13 @@ bool cv::gpu::CudaMem::empty() const #if !defined (HAVE_CUDA) -void cv::gpu::registerPageLocked(Mat&) { throw_nogpu(); } -void cv::gpu::unregisterPageLocked(Mat&) { throw_nogpu(); } -void cv::gpu::CudaMem::create(int /*_rows*/, int /*_cols*/, int /*_type*/, int /*type_alloc*/) { throw_nogpu(); } -bool cv::gpu::CudaMem::canMapHostMemory() { throw_nogpu(); return false; } -void cv::gpu::CudaMem::release() { throw_nogpu(); } -GpuMat cv::gpu::CudaMem::createGpuMatHeader () const { throw_nogpu(); return GpuMat(); } +void cv::gpu::registerPageLocked(Mat&) { CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support"); } +void cv::gpu::unregisterPageLocked(Mat&) { CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support"); } +void cv::gpu::CudaMem::create(int /*_rows*/, int /*_cols*/, int /*_type*/, int /*type_alloc*/) +{ CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support"); } +bool cv::gpu::CudaMem::canMapHostMemory() { CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support"); return false; } +void cv::gpu::CudaMem::release() { CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support"); } +GpuMat cv::gpu::CudaMem::createGpuMatHeader () const { CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support"); return GpuMat(); } #else /* !defined (HAVE_CUDA) */ #include diff --git a/modules/softcascade/CMakeLists.txt b/modules/softcascade/CMakeLists.txt index ee6e89299..0d0d6fecb 100644 --- a/modules/softcascade/CMakeLists.txt +++ b/modules/softcascade/CMakeLists.txt @@ -9,7 +9,7 @@ macro(ocv_glob_cuda_powered_module_sources) set(lib_device_hdrs "") if (HAVE_CUDA AND lib_device_srcs) - ocv_include_directories(${CUDA_INCLUDE_DIRS} "${OpenCV_SOURCE_DIR}/modules/gpu/include") + ocv_include_directories(${CUDA_INCLUDE_DIRS}) file(GLOB_RECURSE lib_device_hdrs "src/cuda/*.hpp") ocv_cuda_compile(device_objs ${lib_device_srcs}) @@ -40,6 +40,8 @@ ocv_add_module(softcascade opencv_core opencv_imgproc opencv_ml OPTIONAL ${cuda_ if(HAVE_CUDA) ocv_module_include_directories(${CUDA_INCLUDE_DIRS}) ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef) +else() + ocv_module_include_directories() endif() ocv_glob_cuda_powered_module_sources() diff --git a/modules/softcascade/src/cuda/icf-sc.cu b/modules/softcascade/src/cuda/icf-sc.cu index 19b20db9c..cb2f4c8cd 100644 --- 
a/modules/softcascade/src/cuda/icf-sc.cu +++ b/modules/softcascade/src/cuda/icf-sc.cu @@ -40,13 +40,28 @@ // //M*/ -#include -#include - #include #include #include +namespace +{ +#if defined(__GNUC__) + #define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, __func__) +#else /* defined(__CUDACC__) || defined(__MSVC__) */ + #define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__) +#endif + + inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "") + { + //if (cudaSuccess != err) cv::gpu::error(cudaGetErrorString(err), file, line, func); + } +} + +#ifndef CV_PI + #define CV_PI 3.1415926535897932384626433832795 +#endif + namespace cv { namespace softcascade { namespace device { typedef unsigned char uchar; @@ -126,7 +141,7 @@ typedef unsigned char uchar; luvg[luvgPitch * (y + 2 * 480) + x] = v; } - void bgr2Luv(const PtrStepSzb& bgr, PtrStepSzb luv) + void bgr2Luv(const cv::gpu::PtrStepSzb& bgr, cv::gpu::PtrStepSzb luv) { dim3 block(32, 8); dim3 grid(bgr.cols / 32, bgr.rows / 8); @@ -208,7 +223,7 @@ typedef unsigned char uchar; texture tgray; template - __global__ void gray2hog(PtrStepSzb mag) + __global__ void gray2hog(cv::gpu::PtrStepSzb mag) { const int x = blockIdx.x * blockDim.x + threadIdx.x; const int y = blockIdx.y * blockDim.y + threadIdx.y; @@ -223,7 +238,7 @@ typedef unsigned char uchar; mag( 480 * fast_angle_bin(dy, dx) + y, x) = cmag; } - void gray2hog(const PtrStepSzb& gray, PtrStepSzb mag, const int bins) + void gray2hog(const cv::gpu::PtrStepSzb& gray, cv::gpu::PtrStepSzb mag, const int bins) { dim3 block(32, 8); dim3 grid(gray.cols / 32, gray.rows / 8); @@ -326,8 +341,8 @@ typedef unsigned char uchar; } } - void suppress(const PtrStepSzb& objects, PtrStepSzb overlaps, PtrStepSzi ndetections, - PtrStepSzb suppressed, cudaStream_t stream) + void suppress(const cv::gpu::PtrStepSzb& objects, cv::gpu::PtrStepSzb overlaps, cv::gpu::PtrStepSzi ndetections, + cv::gpu::PtrStepSzb suppressed, cudaStream_t stream) { int block = 192; int grid = 1; @@ -529,8 +544,8 @@ __global__ void soft_cascade(const CascadeInvoker invoker, Detection* ob } template -void CascadeInvoker::operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv, - PtrStepSz objects, const int downscales, const cudaStream_t& stream) const +void CascadeInvoker::operator()(const cv::gpu::PtrStepSzb& roi, const cv::gpu::PtrStepSzi& hogluv, + cv::gpu::PtrStepSz objects, const int downscales, const cudaStream_t& stream) const { int fw = roi.rows; int fh = roi.cols; @@ -562,7 +577,7 @@ void CascadeInvoker::operator()(const PtrStepSzb& roi, const PtrStepSzi& } } -template void CascadeInvoker::operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv, - PtrStepSz objects, const int downscales, const cudaStream_t& stream) const; +template void CascadeInvoker::operator()(const cv::gpu::PtrStepSzb& roi, const cv::gpu::PtrStepSzi& hogluv, + cv::gpu::PtrStepSz objects, const int downscales, const cudaStream_t& stream) const; }}} diff --git a/modules/softcascade/src/cuda_invoker.hpp b/modules/softcascade/src/cuda_invoker.hpp index 958850f0c..dfce0ba09 100644 --- a/modules/softcascade/src/cuda_invoker.hpp +++ b/modules/softcascade/src/cuda_invoker.hpp @@ -44,9 +44,9 @@ #ifndef __OPENCV_ICF_HPP__ #define __OPENCV_ICF_HPP__ -#include - -using namespace cv::gpu::device; +// #include +#include "opencv2/core/cuda_devptrs.hpp" +#include "cuda_runtime_api.h" #if defined __CUDACC__ # define __device_inline__ __device__ __forceinline__ @@ -57,6 +57,8 @@ using namespace 
cv::gpu::device; namespace cv { namespace softcascade { namespace device { +typedef unsigned char uchar; + struct Octave { ushort index; diff --git a/modules/softcascade/src/detector_cuda.cpp b/modules/softcascade/src/detector_cuda.cpp index a013a16fe..07d453576 100644 --- a/modules/softcascade/src/detector_cuda.cpp +++ b/modules/softcascade/src/detector_cuda.cpp @@ -41,9 +41,9 @@ //M*/ #include "precomp.hpp" -#include "opencv2/gpu/stream_accessor.hpp" #if !defined (HAVE_CUDA) +#define throw_nogpu() CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support") cv::softcascade::SCascade::SCascade(const double, const double, const int, const int) { throw_nogpu(); } cv::softcascade::SCascade::~SCascade() { throw_nogpu(); } @@ -54,14 +54,28 @@ void cv::softcascade::SCascade::detect(InputArray, InputArray, OutputArray, cv:: void cv::softcascade::SCascade::read(const FileNode& fn) { Algorithm::read(fn); } -cv::gpu::ChannelsProcessor::ChannelsProcessor() { throw_nogpu(); } - cv::gpu::ChannelsProcessor::~ChannelsProcessor() { throw_nogpu(); } +cv::softcascade::ChannelsProcessor::ChannelsProcessor() { throw_nogpu(); } + cv::softcascade::ChannelsProcessor::~ChannelsProcessor() { throw_nogpu(); } -cv::Ptr cv::gpu::ChannelsProcessor::create(const int, const int, const int) -{ throw_nogpu(); return cv::Ptr(0); } +cv::Ptr cv::softcascade::ChannelsProcessor::create(const int, const int, const int) +{ throw_nogpu(); return cv::Ptr(0); } #else # include "cuda_invoker.hpp" +# include "opencv2/core/stream_accessor.hpp" +namespace +{ +#if defined(__GNUC__) + #define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, __func__) +#else /* defined(__CUDACC__) || defined(__MSVC__) */ + #define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__) +#endif + + inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "") + { + //if (cudaSuccess != err) cv::gpu::error(cudaGetErrorString(err), file, line, func); + } +} cv::softcascade::device::Level::Level(int idx, const Octave& oct, const float scale, const int w, const int h) : octave(idx), step(oct.stages), relScale(scale / oct.scale) diff --git a/modules/softcascade/test/test_cuda_softcascade.cpp b/modules/softcascade/test/test_cuda_softcascade.cpp index f97a26ad3..139c60856 100644 --- a/modules/softcascade/test/test_cuda_softcascade.cpp +++ b/modules/softcascade/test/test_cuda_softcascade.cpp @@ -43,6 +43,8 @@ #include "test_precomp.hpp" #include "opencv2/core/gpumat.hpp" + +#ifdef HAVE_CUDA using std::tr1::get; // show detection results on input image with cv::imshow @@ -210,7 +212,7 @@ TEST_P(SCascadeTestRoi, Detect) } INSTANTIATE_TEST_CASE_P(cuda_accelerated, SCascadeTestRoi, testing::Combine( - testing::ValuesIn(DeviceManager::instance().values()), + ALL_DEVICES, testing::Values(std::string("cascades/inria_caltech-17.01.2013.xml"), std::string("cascades/sc_cvpr_2012_to_opencv_new_format.xml")), testing::Values(std::string("images/image_00000000_0.png")), @@ -309,4 +311,6 @@ TEST_P(SCascadeTestAll, detectStream) INSTANTIATE_TEST_CASE_P(cuda_accelerated, SCascadeTestAll, testing::Combine( ALL_DEVICES, testing::Values(Fixture("cascades/inria_caltech-17.01.2013.xml", 7), - Fixture("cascades/sc_cvpr_2012_to_opencv_new_format.xml", 1291)))); \ No newline at end of file + Fixture("cascades/sc_cvpr_2012_to_opencv_new_format.xml", 1291)))); + +#endif \ No newline at end of file diff --git a/modules/softcascade/test/utility.hpp b/modules/softcascade/test/utility.hpp index e6b840c53..2018a156e 
100644 --- a/modules/softcascade/test/utility.hpp +++ b/modules/softcascade/test/utility.hpp @@ -52,6 +52,8 @@ //! return true if device supports specified feature and gpu module was built with support the feature. bool supportFeature(const cv::gpu::DeviceInfo& info, cv::gpu::FeatureSet feature); + +#if defined(HAVE_CUDA) class DeviceManager { public: @@ -66,8 +68,9 @@ private: std::vector devices_; DeviceManager() {loadAll();} }; - -#define ALL_DEVICES testing::ValuesIn(DeviceManager::instance().values()) - +# define ALL_DEVICES testing::ValuesIn(DeviceManager::instance().values()) +#else +# define ALL_DEVICES testing::ValuesIn(std::vector()) +#endif #endif // __OPENCV_GPU_TEST_UTILITY_HPP__ From 83e7d3dd671947cedebed0adda5e6a05021e0274 Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Wed, 13 Mar 2013 15:06:27 +0400 Subject: [PATCH 5/8] remove generic version of GPU channel computer. --- modules/softcascade/CMakeLists.txt | 53 +--------- .../include/opencv2/softcascade.hpp | 6 +- modules/softcascade/src/detector_cuda.cpp | 100 +----------------- 3 files changed, 7 insertions(+), 152 deletions(-) diff --git a/modules/softcascade/CMakeLists.txt b/modules/softcascade/CMakeLists.txt index 0d0d6fecb..87798d299 100644 --- a/modules/softcascade/CMakeLists.txt +++ b/modules/softcascade/CMakeLists.txt @@ -1,52 +1,3 @@ -macro(ocv_glob_cuda_powered_module_sources) - file(GLOB_RECURSE lib_srcs "src/*.cpp") - file(GLOB_RECURSE lib_int_hdrs "src/*.hpp" "src/*.h") - file(GLOB lib_hdrs "include/opencv2/${name}/*.hpp" "include/opencv2/${name}/*.h") - file(GLOB lib_hdrs_detail "include/opencv2/${name}/detail/*.hpp" "include/opencv2/${name}/detail/*.h") - - file(GLOB_RECURSE lib_device_srcs "src/*.cu") - set(device_objs "") - set(lib_device_hdrs "") - - if (HAVE_CUDA AND lib_device_srcs) - ocv_include_directories(${CUDA_INCLUDE_DIRS}) - file(GLOB_RECURSE lib_device_hdrs "src/cuda/*.hpp") - - ocv_cuda_compile(device_objs ${lib_device_srcs}) - source_group("Src\\Cuda" FILES ${lib_device_srcs} ${lib_device_hdrs}) - if (lib_device_hdrs) - list(REMOVE_ITEM lib_int_hdrs ${lib_device_hdrs}) - endif() - endif() - - ocv_set_module_sources(${ARGN} HEADERS ${lib_hdrs} ${lib_hdrs_detail} - SOURCES ${lib_srcs} ${lib_int_hdrs} ${device_objs} ${lib_device_srcs} ${lib_device_hdrs}) - - source_group("Src" FILES ${lib_srcs} ${lib_int_hdrs}) - source_group("Include" FILES ${lib_hdrs}) - source_group("Include\\detail" FILES ${lib_hdrs_detail}) -endmacro() - set(the_description "Soft Cascade detection and training") -ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4310 -Wundef -Wmissing-declarations) -set(cuda_deps "") -set(cuda_include "") - -if (NAVE_CUDA) - set(cuda_deps ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY}) -endif() -ocv_add_module(softcascade opencv_core opencv_imgproc opencv_ml OPTIONAL ${cuda_deps}) - -if(HAVE_CUDA) - ocv_module_include_directories(${CUDA_INCLUDE_DIRS}) - ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef) -else() - ocv_module_include_directories() -endif() - -ocv_glob_cuda_powered_module_sources() -ocv_create_module() -ocv_add_precompiled_headers(${the_module}) - -ocv_add_accuracy_tests() -ocv_add_perf_tests() +ocv_define_module(softcascade opencv_core opencv_imgproc opencv_ml) +ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4310 -Wundef) diff --git a/modules/softcascade/include/opencv2/softcascade.hpp b/modules/softcascade/include/opencv2/softcascade.hpp index e97ac4d78..7ce31a613 100644 --- a/modules/softcascade/include/opencv2/softcascade.hpp +++ b/modules/softcascade/include/opencv2/softcascade.hpp @@ -219,7 
+219,7 @@ class CV_EXPORTS ChannelsProcessor public: enum { - GENERIC = 1 << 4, + // GENERIC = 1 << 4, does not supported SEPARABLE = 2 << 4 }; @@ -233,7 +233,7 @@ public: // Param shrinkage is a resizing factor. Resize is applied before the computing integral sum // Param bins is a number of HOG-like channels. // Param flags is a channel computing extra flags. - static cv::Ptr create(const int shrinkage, const int bins, const int flags = GENERIC); + static cv::Ptr create(const int shrinkage, const int bins, const int flags = SEPARABLE); virtual ~ChannelsProcessor(); @@ -267,7 +267,7 @@ public: // Param scales is a number of scales from minScale to maxScale. // Param flags is an extra tuning flags. SCascade(const double minScale = 0.4, const double maxScale = 5., const int scales = 55, - const int flags = NO_REJECT || ChannelsProcessor::GENERIC); + const int flags = NO_REJECT | ChannelsProcessor::SEPARABLE); virtual ~SCascade(); diff --git a/modules/softcascade/src/detector_cuda.cpp b/modules/softcascade/src/detector_cuda.cpp index 07d453576..9a422189b 100644 --- a/modules/softcascade/src/detector_cuda.cpp +++ b/modules/softcascade/src/detector_cuda.cpp @@ -73,7 +73,7 @@ namespace inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "") { - //if (cudaSuccess != err) cv::gpu::error(cudaGetErrorString(err), file, line, func); + if (cudaSuccess != err) cv::gpu::error(cudaGetErrorString(err), file, line, func); } } @@ -545,98 +545,6 @@ inline void setZero(cv::gpu::GpuMat& m, cv::gpu::Stream& s) m.setTo(0); } -struct GenricPreprocessor : public cv::softcascade::ChannelsProcessor -{ - GenricPreprocessor(const int s, const int b) : cv::softcascade::ChannelsProcessor(), shrinkage(s), bins(b) {} - virtual ~GenricPreprocessor() {} - - virtual void apply(InputArray _frame, OutputArray _shrunk, cv::gpu::Stream& s = cv::gpu::Stream::Null()) - { - const cv::gpu::GpuMat frame = _frame.getGpuMat(); - - _shrunk.create(frame.rows * (4 + bins) / shrinkage, frame.cols / shrinkage, CV_8UC1); - cv::gpu::GpuMat shrunk = _shrunk.getGpuMat(); - - channels.create(frame.rows * (4 + bins), frame.cols, CV_8UC1); - setZero(channels, s); - - //cv::gpu::cvtColor(frame, gray, CV_BGR2GRAY, s); - createHogBins(s); - - createLuvBins(frame, s); - - //cv::gpu::resize(channels, shrunk, cv::Size(), 1.f / shrinkage, 1.f / shrinkage, CV_INTER_AREA, s); - } - -private: - - void createHogBins(cv::gpu::Stream& s) - { - static const int fw = gray.cols; - static const int fh = gray.rows; - - fplane.create(fh * HOG_BINS, fw, CV_32FC1); - - cv::gpu::GpuMat dfdx(fplane, cv::Rect(0, 0, fw, fh)); - cv::gpu::GpuMat dfdy(fplane, cv::Rect(0, fh, fw, fh)); - - //cv::gpu::Sobel(gray, dfdx, CV_32F, 1, 0, sobelBuf, 3, 1, cv::BORDER_DEFAULT, -1, s); - //cv::gpu::Sobel(gray, dfdy, CV_32F, 0, 1, sobelBuf, 3, 1, cv::BORDER_DEFAULT, -1, s); - - cv::gpu::GpuMat mag(fplane, cv::Rect(0, 2 * fh, fw, fh)); - cv::gpu::GpuMat ang(fplane, cv::Rect(0, 3 * fh, fw, fh)); - - //cv::gpu::cartToPolar(dfdx, dfdy, mag, ang, true, s); - - // normalize magnitude to uchar interval and angles to 6 bins - cv::gpu::GpuMat nmag(fplane, cv::Rect(0, 4 * fh, fw, fh)); - cv::gpu::GpuMat nang(fplane, cv::Rect(0, 5 * fh, fw, fh)); - - //cv::gpu::multiply(mag, cv::Scalar::all(1.f / (8 *::log(2.0f))), nmag, 1, -1, s); - //cv::gpu::multiply(ang, cv::Scalar::all(1.f / 60.f), nang, 1, -1, s); - - //create uchar magnitude - cv::gpu::GpuMat cmag(channels, cv::Rect(0, fh * HOG_BINS, fw, fh)); - if (s) - s.enqueueConvert(nmag, cmag, CV_8UC1); - 
else - nmag.convertTo(cmag, CV_8UC1); - - cudaStream_t stream = cv::gpu::StreamAccessor::getStream(s); - cv::softcascade::device::fillBins(channels, nang, fw, fh, HOG_BINS, stream); - } - - void createLuvBins(const cv::gpu::GpuMat& colored, cv::gpu::Stream& s) - { - static const int fw = colored.cols; - static const int fh = colored.rows; - - //cv::gpu::cvtColor(colored, luv, CV_BGR2Luv, s); - - std::vector splited; - for(int i = 0; i < LUV_BINS; ++i) - { - splited.push_back(cv::gpu::GpuMat(channels, cv::Rect(0, fh * (7 + i), fw, fh))); - } - - //cv::gpu::split(luv, splited, s); - } - - enum {HOG_BINS = 6, LUV_BINS = 3}; - - const int shrinkage; - const int bins; - - cv::gpu::GpuMat gray; - cv::gpu::GpuMat luv; - cv::gpu::GpuMat channels; - - // preallocated buffer for floating point operations - cv::gpu::GpuMat fplane; - cv::gpu::GpuMat sobelBuf; -}; - - struct SeparablePreprocessor : public cv::softcascade::ChannelsProcessor { SeparablePreprocessor(const int s, const int b) : cv::softcascade::ChannelsProcessor(), shrinkage(s), bins(b) {} @@ -674,11 +582,7 @@ private: cv::Ptr cv::softcascade::ChannelsProcessor::create(const int s, const int b, const int m) { - CV_Assert((m && SEPARABLE) || (m && GENERIC)); - - if (m && GENERIC) - return cv::Ptr(new GenricPreprocessor(s, b)); - + CV_Assert((m && SEPARABLE)); return cv::Ptr(new SeparablePreprocessor(s, b)); } From 6f11dc03b9ee6606746c3fc6392077270c78ab57 Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Wed, 13 Mar 2013 17:12:19 +0400 Subject: [PATCH 6/8] implement integral --- modules/softcascade/src/cuda/channels.cu | 505 ++++++++++++++++++++++ modules/softcascade/src/cuda/icf-sc.cu | 2 +- modules/softcascade/src/detector_cuda.cpp | 48 +- 3 files changed, 551 insertions(+), 4 deletions(-) create mode 100644 modules/softcascade/src/cuda/channels.cu diff --git a/modules/softcascade/src/cuda/channels.cu b/modules/softcascade/src/cuda/channels.cu new file mode 100644 index 000000000..7b153413d --- /dev/null +++ b/modules/softcascade/src/cuda/channels.cu @@ -0,0 +1,505 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2008-2012, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. 
+// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "opencv2/core/cuda_devptrs.hpp" + +#if defined(__GNUC__) + #define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, __func__) +#else /* defined(__CUDACC__) || defined(__MSVC__) */ + #define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__) +#endif + +static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "") +{ + // if (cudaSuccess != err) cv::gpu::error(cudaGetErrorString(err), file, line, func); +} + +__host__ __device__ __forceinline__ int divUp(int total, int grain) +{ + return (total + grain - 1) / grain; +} + +namespace cv { namespace softcascade { namespace device +{ + // Utility function to extract unsigned chars from an unsigned integer + __device__ uchar4 int_to_uchar4(unsigned int in) + { + uchar4 bytes; + bytes.x = (in & 0x000000ff) >> 0; + bytes.y = (in & 0x0000ff00) >> 8; + bytes.z = (in & 0x00ff0000) >> 16; + bytes.w = (in & 0xff000000) >> 24; + return bytes; + } + + __global__ void shfl_integral_horizontal(const cv::gpu::PtrStep img, cv::gpu::PtrStep integral) + { + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300) + __shared__ int sums[128]; + + const int id = threadIdx.x; + const int lane_id = id % warpSize; + const int warp_id = id / warpSize; + + const uint4 data = img(blockIdx.x, id); + + const uchar4 a = int_to_uchar4(data.x); + const uchar4 b = int_to_uchar4(data.y); + const uchar4 c = int_to_uchar4(data.z); + const uchar4 d = int_to_uchar4(data.w); + + int result[16]; + + result[0] = a.x; + result[1] = result[0] + a.y; + result[2] = result[1] + a.z; + result[3] = result[2] + a.w; + + result[4] = result[3] + b.x; + result[5] = result[4] + b.y; + result[6] = result[5] + b.z; + result[7] = result[6] + b.w; + + result[8] = result[7] + c.x; + result[9] = result[8] + c.y; + result[10] = result[9] + c.z; + result[11] = result[10] + c.w; + + result[12] = result[11] + d.x; + result[13] = result[12] + d.y; + result[14] = result[13] + d.z; + result[15] = result[14] + d.w; + + int sum = result[15]; + + // the prefix sum for each thread's 16 value is computed, + // now the final sums (result[15]) need to be shared + // with the other threads and add. To do this, + // the __shfl_up() instruction is used and a shuffle scan + // operation is performed to distribute the sums to the correct + // threads + #pragma unroll + for (int i = 1; i < 32; i *= 2) + { + const int n = __shfl_up(sum, i, 32); + + if (lane_id >= i) + { + #pragma unroll + for (int i = 0; i < 16; ++i) + result[i] += n; + + sum += n; + } + } + + // Now the final sum for the warp must be shared + // between warps. 
This is done by each warp + // having a thread store to shared memory, then + // having some other warp load the values and + // compute a prefix sum, again by using __shfl_up. + // The results are uniformly added back to the warps. + // last thread in the warp holding sum of the warp + // places that in shared + if (threadIdx.x % warpSize == warpSize - 1) + sums[warp_id] = result[15]; + + __syncthreads(); + + if (warp_id == 0) + { + int warp_sum = sums[lane_id]; + + #pragma unroll + for (int i = 1; i <= 32; i *= 2) + { + const int n = __shfl_up(warp_sum, i, 32); + + if (lane_id >= i) + warp_sum += n; + } + + sums[lane_id] = warp_sum; + } + + __syncthreads(); + + int blockSum = 0; + + // fold in unused warp + if (warp_id > 0) + { + blockSum = sums[warp_id - 1]; + + #pragma unroll + for (int i = 0; i < 16; ++i) + result[i] += blockSum; + } + + // assemble result + // Each thread has 16 values to write, which are + // now integer data (to avoid overflow). Instead of + // each thread writing consecutive uint4s, the + // approach shown here experiments using + // the shuffle command to reformat the data + // inside the registers so that each thread holds + // consecutive data to be written so larger contiguous + // segments can be assembled for writing. + + /* + For example data that needs to be written as + + GMEM[16] <- x0 x1 x2 x3 y0 y1 y2 y3 z0 z1 z2 z3 w0 w1 w2 w3 + but is stored in registers (r0..r3), in four threads (0..3) as: + + threadId 0 1 2 3 + r0 x0 y0 z0 w0 + r1 x1 y1 z1 w1 + r2 x2 y2 z2 w2 + r3 x3 y3 z3 w3 + + after apply __shfl_xor operations to move data between registers r1..r3: + + threadId 00 01 10 11 + x0 y0 z0 w0 + xor(01)->y1 x1 w1 z1 + xor(10)->z2 w2 x2 y2 + xor(11)->w3 z3 y3 x3 + + and now x0..x3, and z0..z3 can be written out in order by all threads. + + In the current code, each register above is actually representing + four integers to be written as uint4's to GMEM. 
+ */ + + result[4] = __shfl_xor(result[4] , 1, 32); + result[5] = __shfl_xor(result[5] , 1, 32); + result[6] = __shfl_xor(result[6] , 1, 32); + result[7] = __shfl_xor(result[7] , 1, 32); + + result[8] = __shfl_xor(result[8] , 2, 32); + result[9] = __shfl_xor(result[9] , 2, 32); + result[10] = __shfl_xor(result[10], 2, 32); + result[11] = __shfl_xor(result[11], 2, 32); + + result[12] = __shfl_xor(result[12], 3, 32); + result[13] = __shfl_xor(result[13], 3, 32); + result[14] = __shfl_xor(result[14], 3, 32); + result[15] = __shfl_xor(result[15], 3, 32); + + uint4* integral_row = integral.ptr(blockIdx.x); + uint4 output; + + /////// + + if (threadIdx.x % 4 == 0) + output = make_uint4(result[0], result[1], result[2], result[3]); + + if (threadIdx.x % 4 == 1) + output = make_uint4(result[4], result[5], result[6], result[7]); + + if (threadIdx.x % 4 == 2) + output = make_uint4(result[8], result[9], result[10], result[11]); + + if (threadIdx.x % 4 == 3) + output = make_uint4(result[12], result[13], result[14], result[15]); + + integral_row[threadIdx.x % 4 + (threadIdx.x / 4) * 16] = output; + + /////// + + if (threadIdx.x % 4 == 2) + output = make_uint4(result[0], result[1], result[2], result[3]); + + if (threadIdx.x % 4 == 3) + output = make_uint4(result[4], result[5], result[6], result[7]); + + if (threadIdx.x % 4 == 0) + output = make_uint4(result[8], result[9], result[10], result[11]); + + if (threadIdx.x % 4 == 1) + output = make_uint4(result[12], result[13], result[14], result[15]); + + integral_row[(threadIdx.x + 2) % 4 + (threadIdx.x / 4) * 16 + 8] = output; + + // continuning from the above example, + // this use of __shfl_xor() places the y0..y3 and w0..w3 data + // in order. + + #pragma unroll + for (int i = 0; i < 16; ++i) + result[i] = __shfl_xor(result[i], 1, 32); + + if (threadIdx.x % 4 == 0) + output = make_uint4(result[0], result[1], result[2], result[3]); + + if (threadIdx.x % 4 == 1) + output = make_uint4(result[4], result[5], result[6], result[7]); + + if (threadIdx.x % 4 == 2) + output = make_uint4(result[8], result[9], result[10], result[11]); + + if (threadIdx.x % 4 == 3) + output = make_uint4(result[12], result[13], result[14], result[15]); + + integral_row[threadIdx.x % 4 + (threadIdx.x / 4) * 16 + 4] = output; + + /////// + + if (threadIdx.x % 4 == 2) + output = make_uint4(result[0], result[1], result[2], result[3]); + + if (threadIdx.x % 4 == 3) + output = make_uint4(result[4], result[5], result[6], result[7]); + + if (threadIdx.x % 4 == 0) + output = make_uint4(result[8], result[9], result[10], result[11]); + + if (threadIdx.x % 4 == 1) + output = make_uint4(result[12], result[13], result[14], result[15]); + + integral_row[(threadIdx.x + 2) % 4 + (threadIdx.x / 4) * 16 + 12] = output; + #endif + } + + // This kernel computes columnwise prefix sums. When the data input is + // the row sums from above, this completes the integral image. + // The approach here is to have each block compute a local set of sums. + // First , the data covered by the block is loaded into shared memory, + // then instead of performing a sum in shared memory using __syncthreads + // between stages, the data is reformatted so that the necessary sums + // occur inside warps and the shuffle scan operation is used. + // The final set of sums from the block is then propgated, with the block + // computing "down" the image and adding the running sum to the local + // block sums. 
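    // Editorial aside (not part of the original patch): a minimal, self-contained sketch
    // of the warp-level inclusive scan that both kernels below build on. At each step a
    // lane reads the value held by the lane `offset` positions lower via __shfl_up and
    // accumulates it, so after log2(32) steps every lane holds the sum of lanes
    // 0..lane_id. It assumes a full 32-thread warp and, like the kernels below, only
    // does useful work on compute capability 3.x devices.
    __device__ __forceinline__ int warpInclusiveScan(int value)
    {
    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
        const int lane_id = threadIdx.x % 32;

        #pragma unroll
        for (int offset = 1; offset < 32; offset *= 2)
        {
            // lanes below `offset` have no lower neighbour; they keep their value
            const int n = __shfl_up(value, offset, 32);
            if (lane_id >= offset)
                value += n;
        }
    #endif
        return value;
    }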
+ __global__ void shfl_integral_vertical(cv::gpu::PtrStepSz integral) + { + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300) + __shared__ unsigned int sums[32][9]; + + const int tidx = blockIdx.x * blockDim.x + threadIdx.x; + const int lane_id = tidx % 8; + + if (tidx >= integral.cols) + return; + + sums[threadIdx.x][threadIdx.y] = 0; + __syncthreads(); + + unsigned int stepSum = 0; + + for (int y = threadIdx.y; y < integral.rows; y += blockDim.y) + { + unsigned int* p = integral.ptr(y) + tidx; + + unsigned int sum = *p; + + sums[threadIdx.x][threadIdx.y] = sum; + __syncthreads(); + + // place into SMEM + // shfl scan reduce the SMEM, reformating so the column + // sums are computed in a warp + // then read out properly + const int j = threadIdx.x % 8; + const int k = threadIdx.x / 8 + threadIdx.y * 4; + + int partial_sum = sums[k][j]; + + for (int i = 1; i <= 8; i *= 2) + { + int n = __shfl_up(partial_sum, i, 32); + + if (lane_id >= i) + partial_sum += n; + } + + sums[k][j] = partial_sum; + __syncthreads(); + + if (threadIdx.y > 0) + sum += sums[threadIdx.x][threadIdx.y - 1]; + + sum += stepSum; + stepSum += sums[threadIdx.x][blockDim.y - 1]; + + __syncthreads(); + + *p = sum; + } + #endif + } + + void shfl_integral(const cv::gpu::PtrStepSzb& img, cv::gpu::PtrStepSz integral, cudaStream_t stream) + { + { + // each thread handles 16 values, use 1 block/row + // save, becouse step is actually can't be less 512 bytes + int block = integral.cols / 16; + + // launch 1 block / row + const int grid = img.rows; + + cudaSafeCall( cudaFuncSetCacheConfig(shfl_integral_horizontal, cudaFuncCachePreferL1) ); + + shfl_integral_horizontal<<>>((const cv::gpu::PtrStepSz) img, (cv::gpu::PtrStepSz) integral); + cudaSafeCall( cudaGetLastError() ); + } + + { + const dim3 block(32, 8); + const dim3 grid(divUp(integral.cols, block.x), 1); + + shfl_integral_vertical<<>>(integral); + cudaSafeCall( cudaGetLastError() ); + } + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + + __global__ void shfl_integral_vertical(cv::gpu::PtrStepSz buffer, cv::gpu::PtrStepSz integral) + { + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300) + __shared__ unsigned int sums[32][9]; + + const int tidx = blockIdx.x * blockDim.x + threadIdx.x; + const int lane_id = tidx % 8; + + if (tidx >= integral.cols) + return; + + sums[threadIdx.x][threadIdx.y] = 0; + __syncthreads(); + + unsigned int stepSum = 0; + + for (int y = threadIdx.y; y < integral.rows; y += blockDim.y) + { + unsigned int* p = buffer.ptr(y) + tidx; + unsigned int* dst = integral.ptr(y + 1) + tidx + 1; + + unsigned int sum = *p; + + sums[threadIdx.x][threadIdx.y] = sum; + __syncthreads(); + + // place into SMEM + // shfl scan reduce the SMEM, reformating so the column + // sums are computed in a warp + // then read out properly + const int j = threadIdx.x % 8; + const int k = threadIdx.x / 8 + threadIdx.y * 4; + + int partial_sum = sums[k][j]; + + for (int i = 1; i <= 8; i *= 2) + { + int n = __shfl_up(partial_sum, i, 32); + + if (lane_id >= i) + partial_sum += n; + } + + sums[k][j] = partial_sum; + __syncthreads(); + + if (threadIdx.y > 0) + sum += sums[threadIdx.x][threadIdx.y - 1]; + + sum += stepSum; + stepSum += sums[threadIdx.x][blockDim.y - 1]; + + __syncthreads(); + + *dst = sum; + } + #endif + } + + // used for frame preprocessing before Soft Cascade evaluation: no synchronization needed + void shfl_integral_gpu_buffered(cv::gpu::PtrStepSzb img, cv::gpu::PtrStepSz buffer, cv::gpu::PtrStepSz integral, + int blockStep, cudaStream_t stream) + 
{ + { + const int block = blockStep; + const int grid = img.rows; + + cudaSafeCall( cudaFuncSetCacheConfig(shfl_integral_horizontal, cudaFuncCachePreferL1) ); + + shfl_integral_horizontal<<>>((cv::gpu::PtrStepSz) img, buffer); + cudaSafeCall( cudaGetLastError() ); + } + + { + const dim3 block(32, 8); + const dim3 grid(divUp(integral.cols, block.x), 1); + + shfl_integral_vertical<<>>((cv::gpu::PtrStepSz)buffer, integral); + cudaSafeCall( cudaGetLastError() ); + } + } + // 0 +#define CV_DESCALE(x, n) (((x) + (1 << ((n)-1))) >> (n)) + + enum + { + yuv_shift = 14, + xyz_shift = 12, + R2Y = 4899, + G2Y = 9617, + B2Y = 1868 + }; + + template static __device__ __forceinline__ unsigned char RGB2GrayConvert(uint src) + { + uint b = 0xffu & (src >> (bidx * 8)); + uint g = 0xffu & (src >> 8); + uint r = 0xffu & (src >> ((bidx ^ 2) * 8)); + return CV_DESCALE((uint)(b * B2Y + g * G2Y + r * R2Y), yuv_shift); + } + + void transform(const cv::gpu::PtrStepSz& bgr, cv::gpu::PtrStepSzb gray) + { + + } +}}} \ No newline at end of file diff --git a/modules/softcascade/src/cuda/icf-sc.cu b/modules/softcascade/src/cuda/icf-sc.cu index cb2f4c8cd..9020fa375 100644 --- a/modules/softcascade/src/cuda/icf-sc.cu +++ b/modules/softcascade/src/cuda/icf-sc.cu @@ -54,7 +54,7 @@ namespace inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "") { - //if (cudaSuccess != err) cv::gpu::error(cudaGetErrorString(err), file, line, func); + // if (cudaSuccess != err) cv::gpu::error(cudaGetErrorString(err), file, line, func); } } diff --git a/modules/softcascade/src/detector_cuda.cpp b/modules/softcascade/src/detector_cuda.cpp index 9a422189b..4652a2b2a 100644 --- a/modules/softcascade/src/detector_cuda.cpp +++ b/modules/softcascade/src/detector_cuda.cpp @@ -105,8 +105,11 @@ namespace cv { namespace softcascade { namespace device { cv::gpu::PtrStepSzb suppressed, cudaStream_t stream); void bgr2Luv(const cv::gpu::PtrStepSzb& bgr, cv::gpu::PtrStepSzb luv); + void transform(const cv::gpu::PtrStepSz& bgr, cv::gpu::PtrStepSzb gray); void gray2hog(const cv::gpu::PtrStepSzb& gray, cv::gpu::PtrStepSzb mag, const int bins); void shrink(const cv::gpu::PtrStepSzb& channels, cv::gpu::PtrStepSzb shrunk); + + void shfl_integral(const cv::gpu::PtrStepSzb& img, cv::gpu::PtrStepSz integral, cudaStream_t stream); }}} struct cv::softcascade::SCascade::Fields @@ -474,6 +477,45 @@ bool cv::softcascade::SCascade::load(const FileNode& fn) return fields != 0; } +namespace { + +void integral(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& sum, cv::gpu::GpuMat& buffer, cv::gpu::Stream& s) +{ + CV_Assert(src.type() == CV_8UC1); + + cudaStream_t stream = cv::gpu::StreamAccessor::getStream(s); + + cv::Size whole; + cv::Point offset; + + src.locateROI(whole, offset); + + if (cv::gpu::deviceSupports(cv::gpu::WARP_SHUFFLE_FUNCTIONS) && src.cols <= 2048 + && offset.x % 16 == 0 && ((src.cols + 63) / 64) * 64 <= (static_cast(src.step) - offset.x)) + { + ensureSizeIsEnough(((src.rows + 7) / 8) * 8, ((src.cols + 63) / 64) * 64, CV_32SC1, buffer); + + cv::softcascade::device::shfl_integral(src, buffer, stream); + + sum.create(src.rows + 1, src.cols + 1, CV_32SC1); + if (s) + s.enqueueMemSet(sum, cv::Scalar::all(0)); + else + sum.setTo(cv::Scalar::all(0)); + + cv::gpu::GpuMat inner = sum(cv::Rect(1, 1, src.cols, src.rows)); + cv::gpu::GpuMat res = buffer(cv::Rect(0, 0, src.cols, src.rows)); + + if (s) + s.enqueueCopy(res, inner); + else + res.copyTo(inner); + } + else {CV_Error(CV_GpuNotSupported, ": CC 3.x required.");} +} + 
+} + void cv::softcascade::SCascade::detect(InputArray _image, InputArray _rois, OutputArray _objects, cv::gpu::Stream& s) const { CV_Assert(fields); @@ -494,7 +536,7 @@ void cv::softcascade::SCascade::detect(InputArray _image, InputArray _rois, Outp flds.mask.create( rois.cols / shr, rois.rows / shr, rois.type()); - //cv::gpu::resize(rois, flds.genRoiTmp, cv::Size(), 1.f / shr, 1.f / shr, CV_INTER_AREA, s); + device::shrink(rois, flds.genRoiTmp); //cv::gpu::transpose(flds.genRoiTmp, flds.mask, s); if (type == CV_8UC3) @@ -505,7 +547,7 @@ void cv::softcascade::SCascade::detect(InputArray _image, InputArray _rois, Outp flds.createLevels(image.rows, image.cols); flds.preprocessor->apply(image, flds.shrunk); - //cv::gpu::integralBuffered(flds.shrunk, flds.hogluv, flds.integralBuffer, s); + integral(flds.shrunk, flds.hogluv, flds.integralBuffer, s); } else { @@ -561,7 +603,7 @@ struct SeparablePreprocessor : public cv::softcascade::ChannelsProcessor channels.create(frame.rows * (4 + bins), frame.cols, CV_8UC1); setZero(channels, s); - //cv::gpu::cvtColor(bgr, gray, CV_BGR2GRAY); + cv::softcascade::device::transform(bgr, gray); //cv::gpu::cvtColor(bgr, gray, CV_BGR2GRAY); cv::softcascade::device::gray2hog(gray, channels(cv::Rect(0, 0, bgr.cols, bgr.rows * (bins + 1))), bins); cv::gpu::GpuMat luv(channels, cv::Rect(0, bgr.rows * (bins + 1), bgr.cols, bgr.rows * 3)); From a47666414483f603bdd49b6e0a239a54e1c6bc3b Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Thu, 14 Mar 2013 13:49:48 +0400 Subject: [PATCH 7/8] fixed compilation with latest master changes --- modules/gpu/include/opencv2/gpu.hpp | 227 ------------------ modules/softcascade/CMakeLists.txt | 2 +- modules/softcascade/doc/softcascade_cuda.rst | 12 +- .../include/opencv2/softcascade.hpp | 1 + modules/softcascade/src/cuda/channels.cu | 29 ++- modules/softcascade/src/detector_cuda.cpp | 9 +- modules/softcascade/src/precomp.hpp | 1 + modules/softcascade/src/softcascade_init.cpp | 18 ++ modules/softcascade/test/test_precomp.hpp | 1 + modules/softcascade/test/utility.hpp | 5 +- samples/cpp/peopledetect.cpp | 8 +- samples/gpu/softcascade.cpp | 6 +- 12 files changed, 65 insertions(+), 254 deletions(-) diff --git a/modules/gpu/include/opencv2/gpu.hpp b/modules/gpu/include/opencv2/gpu.hpp index ebf764f63..21a03dc20 100644 --- a/modules/gpu/include/opencv2/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu.hpp @@ -55,142 +55,6 @@ #include "opencv2/features2d.hpp" namespace cv { namespace gpu { - -//////////////////////////////// CudaMem //////////////////////////////// -// CudaMem is limited cv::Mat with page locked memory allocation. -// Page locked memory is only needed for async and faster coping to GPU. -// It is convertable to cv::Mat header without reference counting -// so you can use it with other opencv functions. - -// Page-locks the matrix m memory and maps it for the device(s) -CV_EXPORTS void registerPageLocked(Mat& m); -// Unmaps the memory of matrix m, and makes it pageable again. -CV_EXPORTS void unregisterPageLocked(Mat& m); - -class CV_EXPORTS CudaMem -{ -public: - enum { ALLOC_PAGE_LOCKED = 1, ALLOC_ZEROCOPY = 2, ALLOC_WRITE_COMBINED = 4 }; - - CudaMem(); - CudaMem(const CudaMem& m); - - CudaMem(int rows, int cols, int type, int _alloc_type = ALLOC_PAGE_LOCKED); - CudaMem(Size size, int type, int alloc_type = ALLOC_PAGE_LOCKED); - - - //! creates from cv::Mat with coping data - explicit CudaMem(const Mat& m, int alloc_type = ALLOC_PAGE_LOCKED); - - ~CudaMem(); - - CudaMem& operator = (const CudaMem& m); - - //! 
returns deep copy of the matrix, i.e. the data is copied - CudaMem clone() const; - - //! allocates new matrix data unless the matrix already has specified size and type. - void create(int rows, int cols, int type, int alloc_type = ALLOC_PAGE_LOCKED); - void create(Size size, int type, int alloc_type = ALLOC_PAGE_LOCKED); - - //! decrements reference counter and released memory if needed. - void release(); - - //! returns matrix header with disabled reference counting for CudaMem data. - Mat createMatHeader() const; - operator Mat() const; - - //! maps host memory into device address space and returns GpuMat header for it. Throws exception if not supported by hardware. - GpuMat createGpuMatHeader() const; - operator GpuMat() const; - - //returns if host memory can be mapperd to gpu address space; - static bool canMapHostMemory(); - - // Please see cv::Mat for descriptions - bool isContinuous() const; - size_t elemSize() const; - size_t elemSize1() const; - int type() const; - int depth() const; - int channels() const; - size_t step1() const; - Size size() const; - bool empty() const; - - - // Please see cv::Mat for descriptions - int flags; - int rows, cols; - size_t step; - - uchar* data; - int* refcount; - - uchar* datastart; - uchar* dataend; - - int alloc_type; -}; - -//////////////////////////////// CudaStream //////////////////////////////// -// Encapculates Cuda Stream. Provides interface for async coping. -// Passed to each function that supports async kernel execution. -// Reference counting is enabled - -class CV_EXPORTS Stream -{ -public: - Stream(); - ~Stream(); - - Stream(const Stream&); - Stream& operator =(const Stream&); - - bool queryIfComplete(); - void waitForCompletion(); - - //! downloads asynchronously - // Warning! cv::Mat must point to page locked memory (i.e. to CudaMem data or to its subMat) - void enqueueDownload(const GpuMat& src, CudaMem& dst); - void enqueueDownload(const GpuMat& src, Mat& dst); - - //! uploads asynchronously - // Warning! cv::Mat must point to page locked memory (i.e. to CudaMem data or to its ROI) - void enqueueUpload(const CudaMem& src, GpuMat& dst); - void enqueueUpload(const Mat& src, GpuMat& dst); - - //! copy asynchronously - void enqueueCopy(const GpuMat& src, GpuMat& dst); - - //! memory set asynchronously - void enqueueMemSet(GpuMat& src, Scalar val); - void enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask); - - //! converts matrix type, ex from float to uchar depending on type - void enqueueConvert(const GpuMat& src, GpuMat& dst, int dtype, double a = 1, double b = 0); - - //! adds a callback to be called on the host after all currently enqueued items in the stream have completed - typedef void (*StreamCallback)(Stream& stream, int status, void* userData); - void enqueueHostCallback(StreamCallback callback, void* userData); - - static Stream& Null(); - - operator bool() const; - -private: - struct Impl; - - explicit Stream(Impl* impl); - void create(); - void release(); - - Impl *impl; - - friend struct StreamAccessor; -}; - - //////////////////////////////// Filter Engine //////////////////////////////// /*! @@ -1522,97 +1386,6 @@ private: friend class CascadeClassifier_GPU_LBP; }; -// ======================== GPU version for soft cascade ===================== // - -class CV_EXPORTS ChannelsProcessor -{ -public: - enum - { - GENERIC = 1 << 4, - SEPARABLE = 2 << 4 - }; - - // Appends specified number of HOG first-order features integrals into given vector. - // Param frame is an input 3-channel bgr image. 
- // Param channels is a GPU matrix of optionally shrinked channels - // Param stream is stream is a high-level CUDA stream abstraction used for asynchronous execution. - virtual void apply(InputArray frame, OutputArray channels, Stream& stream = Stream::Null()) = 0; - - // Creates a specific preprocessor implementation. - // Param shrinkage is a resizing factor. Resize is applied before the computing integral sum - // Param bins is a number of HOG-like channels. - // Param flags is a channel computing extra flags. - static cv::Ptr create(const int shrinkage, const int bins, const int flags = GENERIC); - - virtual ~ChannelsProcessor(); - -protected: - ChannelsProcessor(); -}; - -// Implementation of soft (stage-less) cascaded detector. -class CV_EXPORTS SCascade : public cv::Algorithm -{ -public: - - // Representation of detectors result. - struct CV_EXPORTS Detection - { - ushort x; - ushort y; - ushort w; - ushort h; - float confidence; - int kind; - - enum {PEDESTRIAN = 0}; - }; - - enum { NO_REJECT = 1, DOLLAR = 2, /*PASCAL = 4,*/ DEFAULT = NO_REJECT, NMS_MASK = 0xF}; - - // An empty cascade will be created. - // Param minScale is a minimum scale relative to the original size of the image on which cascade will be applied. - // Param minScale is a maximum scale relative to the original size of the image on which cascade will be applied. - // Param scales is a number of scales from minScale to maxScale. - // Param flags is an extra tuning flags. - SCascade(const double minScale = 0.4, const double maxScale = 5., const int scales = 55, - const int flags = NO_REJECT || ChannelsProcessor::GENERIC); - - virtual ~SCascade(); - - cv::AlgorithmInfo* info() const; - - // Load cascade from FileNode. - // Param fn is a root node for cascade. Should be . - virtual bool load(const FileNode& fn); - - // Load cascade config. - virtual void read(const FileNode& fn); - - // Return the matrix of of detected objects. - // Param image is a frame on which detector will be applied. - // Param rois is a regions of interests mask generated by genRoi. - // Only the objects that fall into one of the regions will be returned. - // Param objects is an output array of Detections represented as GpuMat of detections (SCascade::Detection) - // The first element of the matrix is actually a count of detections. 
- // Param stream is stream is a high-level CUDA stream abstraction used for asynchronous execution - virtual void detect(InputArray image, InputArray rois, OutputArray objects, Stream& stream = Stream::Null()) const; - -private: - - struct Fields; - Fields* fields; - - double minScale; - double maxScale; - int scales; - - int flags; -}; - -CV_EXPORTS bool initModule_gpu(void); - ////////////////////////////////// SURF ////////////////////////////////////////// class CV_EXPORTS SURF_GPU diff --git a/modules/softcascade/CMakeLists.txt b/modules/softcascade/CMakeLists.txt index 87798d299..d558e8d29 100644 --- a/modules/softcascade/CMakeLists.txt +++ b/modules/softcascade/CMakeLists.txt @@ -1,3 +1,3 @@ set(the_description "Soft Cascade detection and training") +ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4310 -Wundef -Wsign-promo -Wmissing-declarations -Wmissing-prototypes) ocv_define_module(softcascade opencv_core opencv_imgproc opencv_ml) -ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4310 -Wundef) diff --git a/modules/softcascade/doc/softcascade_cuda.rst b/modules/softcascade/doc/softcascade_cuda.rst index 504774898..92b3bf6be 100644 --- a/modules/softcascade/doc/softcascade_cuda.rst +++ b/modules/softcascade/doc/softcascade_cuda.rst @@ -30,28 +30,28 @@ Implementation of soft (stageless) cascaded detector. :: softcascade::SCascade::~SCascade ---------------------------- +--------------------------------- Destructor for SCascade. -.. ocv:function:: gpu::SCascade::~SCascade() +.. ocv:function:: softcascade::SCascade::~SCascade() softcascade::SCascade::load --------------------------- +---------------------------- Load cascade from FileNode. -.. ocv:function:: bool gpu::SCascade::load(const FileNode& fn) +.. ocv:function:: bool softcascade::SCascade::load(const FileNode& fn) :param fn: File node from which the soft cascade are read. softcascade::SCascade::detect --------------------------- +------------------------------ Apply cascade to an input frame and return the vector of Decection objcts. -.. ocv:function:: void gpu::SCascade::detect(InputArray image, InputArray rois, OutputArray objects, Stream& stream = Stream::Null()) const +.. ocv:function:: void softcascade::SCascade::detect(InputArray image, InputArray rois, OutputArray objects, cv::gpu::Stream& stream = cv::gpu::Stream::Null()) const :param image: a frame on which detector will be applied. 
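For readers of the updated ``softcascade_cuda.rst``, a minimal usage sketch of the documented ``load``/``detect`` interface may help. This is illustrative only and is not part of the patch; the cascade file name, the ROI mask setup and the size of the detections buffer are assumptions. ::

    // Hypothetical driver code for the CUDA soft cascade detector.
    cv::softcascade::SCascade cascade;
    cv::FileStorage fs("soft-cascade.xml", cv::FileStorage::READ);      // assumed file name
    CV_Assert(fs.isOpened() && cascade.load(fs.getFirstTopLevelNode()));

    cv::gpu::GpuMat frame(cv::imread("frame.png"));                     // CV_8UC3 input uploaded to the GPU
    cv::gpu::GpuMat rois(frame.size(), CV_8UC1);
    rois.setTo(cv::Scalar::all(1));                                     // search the whole frame

    cv::gpu::GpuMat objects(1, 65536, CV_8UC1);                         // detections buffer (size assumed)
    cascade.detect(frame, rois, objects);                               // the first element holds the detection count
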
diff --git a/modules/softcascade/include/opencv2/softcascade.hpp b/modules/softcascade/include/opencv2/softcascade.hpp
index 7ce31a613..396149c84 100644
--- a/modules/softcascade/include/opencv2/softcascade.hpp
+++ b/modules/softcascade/include/opencv2/softcascade.hpp
@@ -44,6 +44,7 @@
 #define __OPENCV_SOFTCASCADE_HPP__
 
 #include "opencv2/core.hpp"
+#include "opencv2/core/gpumat.hpp"
 
 namespace cv { namespace softcascade {
 
diff --git a/modules/softcascade/src/cuda/channels.cu b/modules/softcascade/src/cuda/channels.cu
index 7b153413d..692867148 100644
--- a/modules/softcascade/src/cuda/channels.cu
+++ b/modules/softcascade/src/cuda/channels.cu
@@ -42,6 +42,9 @@
 
 #include "opencv2/core/cuda_devptrs.hpp"
 
+namespace cv { namespace softcascade { namespace internal {
+void error(const char *error_string, const char *file, const int line, const char *func);
+}}}
 #if defined(__GNUC__)
     #define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, __func__)
 #else /* defined(__CUDACC__) || defined(__MSVC__) */
     #define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__)
@@ -50,7 +53,7 @@
 
 static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
 {
-    // if (cudaSuccess != err) cv::gpu::error(cudaGetErrorString(err), file, line, func);
+    if (cudaSuccess != err) cv::softcascade::internal::error(cudaGetErrorString(err), file, line, func);
 }
 
 __host__ __device__ __forceinline__ int divUp(int total, int grain)
@@ -490,16 +493,30 @@ namespace cv { namespace softcascade { namespace device
         B2Y = 1868
     };
 
-    template <int bidx> static __device__ __forceinline__ unsigned char RGB2GrayConvert(uint src)
+    template <int bidx> static __device__ __forceinline__ unsigned char RGB2GrayConvert(unsigned char b, unsigned char g, unsigned char r)
     {
-        uint b = 0xffu & (src >> (bidx * 8));
-        uint g = 0xffu & (src >> 8);
-        uint r = 0xffu & (src >> ((bidx ^ 2) * 8));
+        // uint b = 0xffu & (src >> (bidx * 8));
+        // uint g = 0xffu & (src >> 8);
+        // uint r = 0xffu & (src >> ((bidx ^ 2) * 8));
         return CV_DESCALE((uint)(b * B2Y + g * G2Y + r * R2Y), yuv_shift);
     }
 
+    __global__ void device_transform(const cv::gpu::PtrStepSz<uchar3> bgr, cv::gpu::PtrStepSzb gray)
+    {
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+
+        const uchar3 colored = (uchar3)(bgr.ptr(y))[x];
+
+        gray.ptr(y)[x] = RGB2GrayConvert<0>(colored.x, colored.y, colored.z);
+    }
+
+    ///////
     void transform(const cv::gpu::PtrStepSz<uchar3>& bgr, cv::gpu::PtrStepSzb gray)
     {
-
+        const dim3 block(32, 8);
+        const dim3 grid(divUp(bgr.cols, block.x), divUp(bgr.rows, block.y));
+        device_transform<<<grid, block>>>(bgr, gray);
+        cudaSafeCall(cudaDeviceSynchronize());
     }
 }}}
\ No newline at end of file
diff --git a/modules/softcascade/src/detector_cuda.cpp b/modules/softcascade/src/detector_cuda.cpp
index 4652a2b2a..bbadc9c54 100644
--- a/modules/softcascade/src/detector_cuda.cpp
+++ b/modules/softcascade/src/detector_cuda.cpp
@@ -536,7 +536,7 @@ void cv::softcascade::SCascade::detect(InputArray _image, InputArray _rois, Outp
 
     flds.mask.create( rois.cols / shr, rois.rows / shr, rois.type());
 
-    device::shrink(rois, flds.genRoiTmp);
+    device::shrink(rois, flds.mask);
     //cv::gpu::transpose(flds.genRoiTmp, flds.mask, s);
 
     if (type == CV_8UC3)
@@ -594,15 +594,16 @@ struct SeparablePreprocessor : public cv::softcascade::ChannelsProcessor
 
     virtual void apply(InputArray _frame, OutputArray _shrunk, cv::gpu::Stream& s = cv::gpu::Stream::Null())
     {
-        const cv::gpu::GpuMat frame = _frame.getGpuMat();
+        bgr = _frame.getGpuMat();
         //cv::gpu::GaussianBlur(frame, bgr, cv::Size(3, 3), -1.0);
 
-
_shrunk.create(frame.rows * (4 + bins) / shrinkage, frame.cols / shrinkage, CV_8UC1); + _shrunk.create(bgr.rows * (4 + bins) / shrinkage, bgr.cols / shrinkage, CV_8UC1); cv::gpu::GpuMat shrunk = _shrunk.getGpuMat(); - channels.create(frame.rows * (4 + bins), frame.cols, CV_8UC1); + channels.create(bgr.rows * (4 + bins), bgr.cols, CV_8UC1); setZero(channels, s); + gray.create(bgr.size(), CV_8UC1); cv::softcascade::device::transform(bgr, gray); //cv::gpu::cvtColor(bgr, gray, CV_BGR2GRAY); cv::softcascade::device::gray2hog(gray, channels(cv::Rect(0, 0, bgr.cols, bgr.rows * (bins + 1))), bins); diff --git a/modules/softcascade/src/precomp.hpp b/modules/softcascade/src/precomp.hpp index e72b77d40..2b6be2664 100644 --- a/modules/softcascade/src/precomp.hpp +++ b/modules/softcascade/src/precomp.hpp @@ -56,6 +56,7 @@ namespace cv { namespace softcascade { namespace internal { + namespace rnd { typedef cv::RNG_MT19937 engine; diff --git a/modules/softcascade/src/softcascade_init.cpp b/modules/softcascade/src/softcascade_init.cpp index 902ad48a1..9563ac629 100644 --- a/modules/softcascade/src/softcascade_init.cpp +++ b/modules/softcascade/src/softcascade_init.cpp @@ -63,4 +63,22 @@ bool initModule_softcascade(void) return (sc1->info() != 0) && (sc->info() != 0); } +namespace internal { +void error(const char *error_string, const char *file, const int line, const char *func) +{ + int code = CV_GpuApiCallError; + + if (std::uncaught_exception()) + { + const char* errorStr = cvErrorStr(code); + const char* function = func ? func : "unknown function"; + + std::cerr << "OpenCV Error: " << errorStr << "(" << error_string << ") in " << function << ", file " << file << ", line " << line; + std::cerr.flush(); + } + else + cv::error( cv::Exception(code, error_string, func, file, line) ); +} +} + } } \ No newline at end of file diff --git a/modules/softcascade/test/test_precomp.hpp b/modules/softcascade/test/test_precomp.hpp index 03d049b94..80bff6536 100644 --- a/modules/softcascade/test/test_precomp.hpp +++ b/modules/softcascade/test/test_precomp.hpp @@ -55,5 +55,6 @@ # include "opencv2/softcascade.hpp" # include "opencv2/imgproc.hpp" # include "opencv2/highgui.hpp" +# include "utility.hpp" #endif diff --git a/modules/softcascade/test/utility.hpp b/modules/softcascade/test/utility.hpp index 2018a156e..9849b525e 100644 --- a/modules/softcascade/test/utility.hpp +++ b/modules/softcascade/test/utility.hpp @@ -42,10 +42,9 @@ #ifndef __OPENCV_SOFTCASCADE_TEST_UTILITY_HPP__ #define __OPENCV_SOFTCASCADE_TEST_UTILITY_HPP__ -#include "opencv2/core/core.hpp" +#include "opencv2/core.hpp" #include "opencv2/core/gpumat.hpp" -#include "opencv2/ts/ts.hpp" -#include "opencv2/ts/ts_perf.hpp" +#include "opencv2/ts.hpp" ////////////////////////////////////////////////////////////////////// // Gpu devices diff --git a/samples/cpp/peopledetect.cpp b/samples/cpp/peopledetect.cpp index 893f8cb73..85d77b851 100644 --- a/samples/cpp/peopledetect.cpp +++ b/samples/cpp/peopledetect.cpp @@ -1,7 +1,7 @@ -#include "opencv2/imgproc/imgproc.hpp" -#include "opencv2/objdetect/objdetect.hpp" -#include "opencv2/highgui/highgui.hpp" -#include +#include "opencv2/imgproc.hpp" +#include "opencv2/objdetect.hpp" +#include "opencv2/highgui.hpp" +#include #include #include diff --git a/samples/gpu/softcascade.cpp b/samples/gpu/softcascade.cpp index e3683583a..9313a5ab0 100644 --- a/samples/gpu/softcascade.cpp +++ b/samples/gpu/softcascade.cpp @@ -1,6 +1,6 @@ -#include -#include -#include +#include +#include +#include #include int main(int argc, char** 
argv) From 157a98edf71caa5047e2e75990a5b730bb022bed Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Thu, 14 Mar 2013 23:06:17 +0400 Subject: [PATCH 8/8] refactored cuda error handling; remove optional dependancies in soft cascade training app --- apps/sft/CMakeLists.txt | 2 +- doc/check_docs2.py | 1 + .../include/opencv2/core/stream_accessor.hpp | 6 +- modules/core/src/cudastream.cpp | 27 -------- modules/core/src/gpumat.cpp | 8 --- modules/core/src/matrix_operations.cpp | 27 -------- modules/core/src/opengl_interop.cpp | 12 ---- modules/core/src/precomp.hpp | 19 ++++++ modules/gpu/include/opencv2/gpu/gpu.hpp | 5 +- .../include/opencv2/gpu/stream_accessor.hpp | 64 ------------------- modules/gpu/src/precomp.hpp | 2 +- modules/softcascade/src/cuda/icf-sc.cu | 12 ++-- samples/cpp/peopledetect.cpp | 6 +- 13 files changed, 37 insertions(+), 154 deletions(-) delete mode 100644 modules/gpu/include/opencv2/gpu/stream_accessor.hpp diff --git a/apps/sft/CMakeLists.txt b/apps/sft/CMakeLists.txt index c7bd187a2..8b950225c 100644 --- a/apps/sft/CMakeLists.txt +++ b/apps/sft/CMakeLists.txt @@ -1,7 +1,7 @@ set(name sft) set(the_target opencv_${name}) -set(OPENCV_${the_target}_DEPS opencv_core opencv_softcascade opencv_highgui opencv_imgproc opencv_ml OPTIONAL opencv_gpu opencv_objdetect opencv_featurest2d) +set(OPENCV_${the_target}_DEPS opencv_core opencv_softcascade opencv_highgui opencv_imgproc opencv_ml) ocv_check_dependencies(${OPENCV_${the_target}_DEPS}) if(NOT OCV_DEPENDENCIES_FOUND) diff --git a/doc/check_docs2.py b/doc/check_docs2.py index 72661431e..963c7ff1d 100755 --- a/doc/check_docs2.py +++ b/doc/check_docs2.py @@ -199,6 +199,7 @@ def process_module(module, path): if module == "gpu": hdrlist.append(os.path.join(path, "..", "core", "include", "opencv2", "core", "cuda_devptrs.hpp")) hdrlist.append(os.path.join(path, "..", "core", "include", "opencv2", "core", "gpumat.hpp")) + hdrlist.append(os.path.join(path, "..", "core", "include", "opencv2", "core", "stream_accessor.hpp")) decls = [] for hname in hdrlist: diff --git a/modules/core/include/opencv2/core/stream_accessor.hpp b/modules/core/include/opencv2/core/stream_accessor.hpp index 30dcc6042..9748bef80 100644 --- a/modules/core/include/opencv2/core/stream_accessor.hpp +++ b/modules/core/include/opencv2/core/stream_accessor.hpp @@ -40,8 +40,8 @@ // //M*/ -#ifndef __OPENCV_GPU_STREAM_ACCESSOR_HPP__ -#define __OPENCV_GPU_STREAM_ACCESSOR_HPP__ +#ifndef __OPENCV_CUDA_STREAM_ACCESSOR_HPP__ +#define __OPENCV_CUDA_STREAM_ACCESSOR_HPP__ #include "opencv2/core/gpumat.hpp" #include "cuda_runtime_api.h" @@ -61,4 +61,4 @@ namespace cv } } -#endif /* __OPENCV_GPU_STREAM_ACCESSOR_HPP__ */ \ No newline at end of file +#endif /* __OPENCV_CUDA_STREAM_ACCESSOR_HPP__ */ \ No newline at end of file diff --git a/modules/core/src/cudastream.cpp b/modules/core/src/cudastream.cpp index c22db8719..6244af9e5 100644 --- a/modules/core/src/cudastream.cpp +++ b/modules/core/src/cudastream.cpp @@ -74,33 +74,6 @@ void cv::gpu::Stream::release() { throw_nogpu(); } #include "opencv2/core/stream_accessor.hpp" -namespace -{ -#if defined(__GNUC__) - #define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, __func__) - #define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__, __func__) -#else /* defined(__CUDACC__) || defined(__MSVC__) */ - #define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__) - #define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__) -#endif - - inline void ___cudaSafeCall(cudaError_t err, const char *file, 
const int line, const char *func = "") - { - if (cudaSuccess != err) - cv::gpu::error(cudaGetErrorString(err), file, line, func); - } - - inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "") - { - if (err < 0) - { - std::ostringstream msg; - msg << "NPP API Call Error: " << err; - cv::gpu::error(msg.str().c_str(), file, line, func); - } - } -} - namespace cv { namespace gpu { void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream); diff --git a/modules/core/src/gpumat.cpp b/modules/core/src/gpumat.cpp index 2a9331f87..cc7442a66 100644 --- a/modules/core/src/gpumat.cpp +++ b/modules/core/src/gpumat.cpp @@ -72,19 +72,11 @@ using namespace cv::gpu; namespace { #if defined(__GNUC__) - #define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, __func__) #define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__, __func__) #else /* defined(__CUDACC__) || defined(__MSVC__) */ - #define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__) #define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__) #endif - inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "") - { - if (cudaSuccess != err) - cv::gpu::error(cudaGetErrorString(err), file, line, func); - } - inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "") { if (err < 0) diff --git a/modules/core/src/matrix_operations.cpp b/modules/core/src/matrix_operations.cpp index eace5181d..f16a72cb0 100644 --- a/modules/core/src/matrix_operations.cpp +++ b/modules/core/src/matrix_operations.cpp @@ -190,33 +190,6 @@ void cv::gpu::CudaMem::release() { CV_Error(CV_GpuNotSupported, "The library is GpuMat cv::gpu::CudaMem::createGpuMatHeader () const { CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support"); return GpuMat(); } #else /* !defined (HAVE_CUDA) */ -#include -namespace -{ -#if defined(__GNUC__) - #define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, __func__) - #define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__, __func__) -#else /* defined(__CUDACC__) || defined(__MSVC__) */ - #define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__) - #define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__) -#endif - - inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "") - { - if (cudaSuccess != err) - cv::gpu::error(cudaGetErrorString(err), file, line, func); - } - - inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "") - { - if (err < 0) - { - std::ostringstream msg; - msg << "NPP API Call Error: " << err; - cv::gpu::error(msg.str().c_str(), file, line, func); - } - } -} void cv::gpu::registerPageLocked(Mat& m) { diff --git a/modules/core/src/opengl_interop.cpp b/modules/core/src/opengl_interop.cpp index cf2cc47cf..fb4c53f62 100644 --- a/modules/core/src/opengl_interop.cpp +++ b/modules/core/src/opengl_interop.cpp @@ -67,18 +67,6 @@ namespace void throw_nocuda() { CV_Error(CV_GpuNotSupported, "The library is compiled without GPU support"); } #else void throw_nocuda() { CV_Error(CV_StsNotImplemented, "The called functionality is disabled for current build or platform"); } - - #if defined(__GNUC__) - #define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, __func__) - #else /* defined(__CUDACC__) || defined(__MSVC__) */ - #define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__) - #endif - - void 
___cudaSafeCall(cudaError_t err, const char* file, const int line, const char* func = "") - { - if (cudaSuccess != err) - cv::gpu::error(cudaGetErrorString(err), file, line, func); - } #endif #endif } diff --git a/modules/core/src/precomp.hpp b/modules/core/src/precomp.hpp index edfbda65e..30ce3d17b 100644 --- a/modules/core/src/precomp.hpp +++ b/modules/core/src/precomp.hpp @@ -66,6 +66,25 @@ #define GET_OPTIMIZED(func) (func) #endif +#ifdef HAVE_CUDA +# include +# include "opencv2/core/gpumat.hpp" + +# if defined(__GNUC__) +# define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, __func__) +# else +# define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__) +# endif + +static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "") +{ + if (cudaSuccess != err) cv::gpu::error(cudaGetErrorString(err), file, line, func); +} + +#else +# define cudaSafeCall(expr) +#endif + namespace cv { diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp index ee42816db..d82211bf3 100644 --- a/modules/gpu/include/opencv2/gpu/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu/gpu.hpp @@ -7,11 +7,12 @@ // copy or use the software. // // -// License Agreement +// License Agreement // For Open Source Computer Vision Library // // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2013, OpenCV Foundation, all rights reserved. // Third party copyrights are property of their respective owners. // // Redistribution and use in source and binary forms, with or without modification, @@ -22,7 +23,7 @@ // // * Redistribution's in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation -// and/or other GpuMaterials provided with the distribution. +// and/or other materials provided with the distribution. // // * The name of the copyright holders may not be used to endorse or promote products // derived from this software without specific prior written permission. diff --git a/modules/gpu/include/opencv2/gpu/stream_accessor.hpp b/modules/gpu/include/opencv2/gpu/stream_accessor.hpp deleted file mode 100644 index 1797749b6..000000000 --- a/modules/gpu/include/opencv2/gpu/stream_accessor.hpp +++ /dev/null @@ -1,64 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. -// Copyright (C) 2009, Willow Garage Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. 
-// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other GpuMaterials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#ifndef __OPENCV_GPU_STREAM_ACCESSOR_HPP__ -#define __OPENCV_GPU_STREAM_ACCESSOR_HPP__ - -#include "opencv2/gpu.hpp" -#include "cuda_runtime_api.h" - -namespace cv -{ - namespace gpu - { - // This is only header file that depends on Cuda. All other headers are independent. - // So if you use OpenCV binaries you do noot need to install Cuda Toolkit. - // But of you wanna use GPU by yourself, may get cuda stream instance using the class below. - // In this case you have to install Cuda Toolkit. - struct StreamAccessor - { - CV_EXPORTS static cudaStream_t getStream(const cv::gpu::Stream& stream); - }; - } -} - -#endif /* __OPENCV_GPU_STREAM_ACCESSOR_HPP__ */ \ No newline at end of file diff --git a/modules/gpu/src/precomp.hpp b/modules/gpu/src/precomp.hpp index 1aa442b99..caa46cfc8 100644 --- a/modules/gpu/src/precomp.hpp +++ b/modules/gpu/src/precomp.hpp @@ -106,7 +106,7 @@ #endif #include "internal_shared.hpp" - #include "opencv2/gpu/stream_accessor.hpp" + #include "opencv2/core/stream_accessor.hpp" #include "nvidia/core/NCV.hpp" #include "nvidia/NPP_staging/NPP_staging.hpp" diff --git a/modules/softcascade/src/cuda/icf-sc.cu b/modules/softcascade/src/cuda/icf-sc.cu index 9020fa375..8d65aad0e 100644 --- a/modules/softcascade/src/cuda/icf-sc.cu +++ b/modules/softcascade/src/cuda/icf-sc.cu @@ -44,18 +44,18 @@ #include #include -namespace -{ +namespace cv { namespace softcascade { namespace internal { +void error(const char *error_string, const char *file, const int line, const char *func); +}}} #if defined(__GNUC__) #define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, __func__) #else /* defined(__CUDACC__) || defined(__MSVC__) */ #define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__) #endif - inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "") - { - // if (cudaSuccess != err) cv::gpu::error(cudaGetErrorString(err), file, line, func); - } +static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "") +{ + if (cudaSuccess != err) cv::softcascade::internal::error(cudaGetErrorString(err), file, line, func); } #ifndef CV_PI diff --git a/samples/cpp/peopledetect.cpp b/samples/cpp/peopledetect.cpp index 85d77b851..c73735271 100644 --- a/samples/cpp/peopledetect.cpp +++ 
b/samples/cpp/peopledetect.cpp @@ -1,6 +1,6 @@ -#include "opencv2/imgproc.hpp" -#include "opencv2/objdetect.hpp" -#include "opencv2/highgui.hpp" +#include +#include +#include #include #include
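
Taken together, the error-handling refactoring in this last patch means that CUDA failures inside the softcascade module are now reported through cv::softcascade::internal::error() instead of cv::gpu::error(). A minimal sketch of the resulting call pattern in the module's .cu files follows; it is illustrative only, and my_kernel is a placeholder name. ::

    // Every CUDA runtime call is wrapped in cudaSafeCall, which forwards the
    // error string, file, line and function name to internal::error().
    cudaSafeCall( cudaFuncSetCacheConfig(my_kernel, cudaFuncCachePreferL1) );

    my_kernel<<<grid, block, 0, stream>>>(/* args */);
    cudaSafeCall( cudaGetLastError() );          // check the asynchronous launch

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() ); // synchronize only on the default stream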