Merge pull request #648 from cuda-geek:move-gpu-soft-cascade-to-softcascade-module

Andrey Kamaev 2013-03-15 10:35:03 +04:00 committed by OpenCV Buildbot
commit 1eb34e062c
31 changed files with 1317 additions and 686 deletions

View File

@ -199,6 +199,7 @@ def process_module(module, path):
if module == "gpu":
hdrlist.append(os.path.join(path, "..", "core", "include", "opencv2", "core", "cuda_devptrs.hpp"))
hdrlist.append(os.path.join(path, "..", "core", "include", "opencv2", "core", "gpumat.hpp"))
hdrlist.append(os.path.join(path, "..", "core", "include", "opencv2", "core", "stream_accessor.hpp"))
decls = []
for hname in hdrlist:

View File

@ -50,6 +50,141 @@
namespace cv { namespace gpu
{
//////////////////////////////// CudaMem ////////////////////////////////
// CudaMem is a limited cv::Mat with page-locked memory allocation.
// Page-locked memory is only needed for asynchronous and faster copying to the GPU.
// It is convertible to a cv::Mat header without reference counting,
// so you can use it with other OpenCV functions.
// Page-locks the matrix m memory and maps it for the device(s)
CV_EXPORTS void registerPageLocked(Mat& m);
// Unmaps the memory of matrix m, and makes it pageable again.
CV_EXPORTS void unregisterPageLocked(Mat& m);
class CV_EXPORTS CudaMem
{
public:
enum { ALLOC_PAGE_LOCKED = 1, ALLOC_ZEROCOPY = 2, ALLOC_WRITE_COMBINED = 4 };
CudaMem();
CudaMem(const CudaMem& m);
CudaMem(int rows, int cols, int type, int _alloc_type = ALLOC_PAGE_LOCKED);
CudaMem(Size size, int type, int alloc_type = ALLOC_PAGE_LOCKED);
//! creates from cv::Mat, copying the data
explicit CudaMem(const Mat& m, int alloc_type = ALLOC_PAGE_LOCKED);
~CudaMem();
CudaMem& operator = (const CudaMem& m);
//! returns deep copy of the matrix, i.e. the data is copied
CudaMem clone() const;
//! allocates new matrix data unless the matrix already has specified size and type.
void create(int rows, int cols, int type, int alloc_type = ALLOC_PAGE_LOCKED);
void create(Size size, int type, int alloc_type = ALLOC_PAGE_LOCKED);
//! decrements the reference counter and releases memory if needed.
void release();
//! returns matrix header with disabled reference counting for CudaMem data.
Mat createMatHeader() const;
operator Mat() const;
//! maps host memory into device address space and returns GpuMat header for it. Throws exception if not supported by hardware.
GpuMat createGpuMatHeader() const;
operator GpuMat() const;
//! returns true if host memory can be mapped to the GPU address space
static bool canMapHostMemory();
// Please see cv::Mat for descriptions
bool isContinuous() const;
size_t elemSize() const;
size_t elemSize1() const;
int type() const;
int depth() const;
int channels() const;
size_t step1() const;
Size size() const;
bool empty() const;
// Please see cv::Mat for descriptions
int flags;
int rows, cols;
size_t step;
uchar* data;
int* refcount;
uchar* datastart;
uchar* dataend;
int alloc_type;
};
//////////////////////////////// CudaStream ////////////////////////////////
// Encapsulates a CUDA Stream. Provides an interface for asynchronous copying.
// Passed to each function that supports async kernel execution.
// Reference counting is enabled
class CV_EXPORTS Stream
{
public:
Stream();
~Stream();
Stream(const Stream&);
Stream& operator =(const Stream&);
bool queryIfComplete();
void waitForCompletion();
//! downloads asynchronously
// Warning! cv::Mat must point to page locked memory (i.e. to CudaMem data or to its subMat)
void enqueueDownload(const GpuMat& src, CudaMem& dst);
void enqueueDownload(const GpuMat& src, Mat& dst);
//! uploads asynchronously
// Warning! cv::Mat must point to page locked memory (i.e. to CudaMem data or to its ROI)
void enqueueUpload(const CudaMem& src, GpuMat& dst);
void enqueueUpload(const Mat& src, GpuMat& dst);
//! copy asynchronously
void enqueueCopy(const GpuMat& src, GpuMat& dst);
//! memory set asynchronously
void enqueueMemSet(GpuMat& src, Scalar val);
void enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask);
//! converts matrix type, e.g. from float to uchar, depending on dtype
void enqueueConvert(const GpuMat& src, GpuMat& dst, int dtype, double a = 1, double b = 0);
//! adds a callback to be called on the host after all currently enqueued items in the stream have completed
typedef void (*StreamCallback)(Stream& stream, int status, void* userData);
void enqueueHostCallback(StreamCallback callback, void* userData);
static Stream& Null();
operator bool() const;
private:
struct Impl;
explicit Stream(Impl* impl);
void create();
void release();
Impl *impl;
friend struct StreamAccessor;
};
//////////////////////////////// Initialization & Info ////////////////////////
//! This is the only function that does not throw an exception if the library is compiled without CUDA.
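
A minimal usage sketch of the page-locked / asynchronous API declared above; the function and buffer names are illustrative, not taken from the patch:

#include "opencv2/core/gpumat.hpp"

void asyncRoundTrip(const cv::Mat& input)
{
    // Page-locked host buffers are what make the enqueue* calls below truly asynchronous.
    cv::gpu::CudaMem h_src(input, cv::gpu::CudaMem::ALLOC_PAGE_LOCKED); // copies input
    cv::gpu::CudaMem h_dst(input.rows, input.cols, input.type(), cv::gpu::CudaMem::ALLOC_PAGE_LOCKED);

    cv::gpu::GpuMat d_buf;
    cv::gpu::Stream stream;

    stream.enqueueUpload(h_src, d_buf);   // host -> device, returns immediately
    stream.enqueueDownload(d_buf, h_dst); // device -> host, also asynchronous

    stream.waitForCompletion();               // block until both copies have finished
    cv::Mat result = h_dst.createMatHeader(); // zero-copy view of the page-locked data
    (void)result;
}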

View File

@ -40,10 +40,10 @@
//
//M*/
#ifndef __OPENCV_GPU_STREAM_ACCESSOR_HPP__
#define __OPENCV_GPU_STREAM_ACCESSOR_HPP__
#ifndef __OPENCV_CUDA_STREAM_ACCESSOR_HPP__
#define __OPENCV_CUDA_STREAM_ACCESSOR_HPP__
#include "opencv2/gpu.hpp"
#include "opencv2/core/gpumat.hpp"
#include "cuda_runtime_api.h"
namespace cv
@ -61,4 +61,4 @@ namespace cv
}
}
#endif /* __OPENCV_GPU_STREAM_ACCESSOR_HPP__ */
#endif /* __OPENCV_CUDA_STREAM_ACCESSOR_HPP__ */

View File

@ -41,11 +41,13 @@
//M*/
#include "precomp.hpp"
#include "opencv2/core/gpumat.hpp"
using namespace cv;
using namespace cv::gpu;
#if !defined (HAVE_CUDA)
#define throw_nogpu() CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support")
cv::gpu::Stream::Stream() { throw_nogpu(); }
cv::gpu::Stream::~Stream() {}
@ -70,7 +72,7 @@ void cv::gpu::Stream::release() { throw_nogpu(); }
#else /* !defined (HAVE_CUDA) */
#include "opencv2/gpu/stream_accessor.hpp"
#include "opencv2/core/stream_accessor.hpp"
namespace cv { namespace gpu
{

View File

@ -72,19 +72,11 @@ using namespace cv::gpu;
namespace
{
#if defined(__GNUC__)
#define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, __func__)
#define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__, __func__)
#else /* defined(__CUDACC__) || defined(__MSVC__) */
#define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__)
#define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__)
#endif
inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
{
if (cudaSuccess != err)
cv::gpu::error(cudaGetErrorString(err), file, line, func);
}
inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "")
{
if (err < 0)

View File

@ -41,6 +41,7 @@
//M*/
#include "precomp.hpp"
#include "opencv2/core/gpumat.hpp"
using namespace cv;
using namespace cv::gpu;
@ -178,14 +179,15 @@ bool cv::gpu::CudaMem::empty() const
return data == 0;
}
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
#if !defined (HAVE_CUDA)
void cv::gpu::registerPageLocked(Mat&) { throw_nogpu(); }
void cv::gpu::unregisterPageLocked(Mat&) { throw_nogpu(); }
void cv::gpu::CudaMem::create(int /*_rows*/, int /*_cols*/, int /*_type*/, int /*type_alloc*/) { throw_nogpu(); }
bool cv::gpu::CudaMem::canMapHostMemory() { throw_nogpu(); return false; }
void cv::gpu::CudaMem::release() { throw_nogpu(); }
GpuMat cv::gpu::CudaMem::createGpuMatHeader () const { throw_nogpu(); return GpuMat(); }
void cv::gpu::registerPageLocked(Mat&) { CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support"); }
void cv::gpu::unregisterPageLocked(Mat&) { CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support"); }
void cv::gpu::CudaMem::create(int /*_rows*/, int /*_cols*/, int /*_type*/, int /*type_alloc*/)
{ CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support"); }
bool cv::gpu::CudaMem::canMapHostMemory() { CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support"); return false; }
void cv::gpu::CudaMem::release() { CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support"); }
GpuMat cv::gpu::CudaMem::createGpuMatHeader () const { CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support"); return GpuMat(); }
#else /* !defined (HAVE_CUDA) */

View File

@ -67,18 +67,6 @@ namespace
void throw_nocuda() { CV_Error(CV_GpuNotSupported, "The library is compiled without GPU support"); }
#else
void throw_nocuda() { CV_Error(CV_StsNotImplemented, "The called functionality is disabled for current build or platform"); }
#if defined(__GNUC__)
#define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, __func__)
#else /* defined(__CUDACC__) || defined(__MSVC__) */
#define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__)
#endif
void ___cudaSafeCall(cudaError_t err, const char* file, const int line, const char* func = "")
{
if (cudaSuccess != err)
cv::gpu::error(cudaGetErrorString(err), file, line, func);
}
#endif
#endif
}

View File

@ -66,6 +66,25 @@
#define GET_OPTIMIZED(func) (func)
#endif
#ifdef HAVE_CUDA
# include <cuda_runtime_api.h>
# include "opencv2/core/gpumat.hpp"
# if defined(__GNUC__)
# define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, __func__)
# else
# define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__)
# endif
static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
{
if (cudaSuccess != err) cv::gpu::error(cudaGetErrorString(err), file, line, func);
}
#else
# define cudaSafeCall(expr)
#endif
namespace cv
{

View File

@ -199,88 +199,6 @@ Returns block descriptors computed for the whole image.
The function is mainly used to learn the classifier.
Soft Cascade Classifier
==========================
Soft Cascade Classifier for Object Detection
----------------------------------------------------------
Cascade detectors have been shown to operate extremely rapidly and with high accuracy, and they have important applications in many domains. The initial goal of this cascade implementation was a fast and accurate pedestrian detector, but it is also useful in general. A soft cascade is trained with AdaBoost, but instead of training a sequence of stages, it is trained as one long stage of T weak classifiers. The soft cascade is formulated as follows:
.. math::
\texttt{H}(x) = \sum _{\texttt{t}=1..\texttt{T}} {\texttt{s}_t(x)}
where :math:`\texttt{s}_t(x) = \alpha_t\texttt{h}_t(x)` are the thresholded weak classifiers selected during AdaBoost training, scaled by their associated weights. Let
.. math::
\texttt{H}_t(x) = \sum _{\texttt{i}=1..\texttt{t}} {\texttt{s}_i(x)}
be the partial sum of sample responses before the :math:`t`-th weak classifier is applied. The function :math:`\texttt{H}_t(x)` of :math:`t` for a sample :math:`x` is called the *sample trace*.
After each weak classifier evaluation, the sample trace at the point :math:`t` is compared with the rejection threshold :math:`r_t`. The sequence of :math:`r_t` is called the *rejection trace*.
The sample is rejected as soon as its trace falls below the rejection threshold, so the stageless cascade can discard a non-object sample as early as possible. The sample trace can also be read as the confidence with which the sample is recognized as the desired object; at each :math:`t` that confidence depends on all previous weak classifiers. This property of the soft cascade results in more accurate detection. The original formulation of the soft cascade can be found in [BJ05]_.
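In other words, evaluation of a sample stops at the first :math:`t` for which

.. math::

    \texttt{H}_t(x) < \texttt{r}_t

and the sample is then rejected; a sample that survives all :math:`T` comparisons is accepted with confidence :math:`\texttt{H}_T(x)`.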
gpu::SCascade
-----------------------------------------------
.. ocv:class:: gpu::SCascade : public Algorithm
Implementation of soft (stageless) cascaded detector. ::
class CV_EXPORTS SCascade : public Algorithm
{
struct CV_EXPORTS Detection
{
ushort x;
ushort y;
ushort w;
ushort h;
float confidence;
int kind;
enum {PEDESTRIAN = 0};
};
SCascade(const double minScale = 0.4, const double maxScale = 5., const int scales = 55, const int rejfactor = 1);
virtual ~SCascade();
virtual bool load(const FileNode& fn);
virtual void detect(InputArray image, InputArray rois, OutputArray objects, Stream& stream = Stream::Null()) const;
virtual void genRoi(InputArray roi, OutputArray mask, Stream& stream = Stream::Null()) const;
};
gpu::SCascade::~SCascade
---------------------------
Destructor for SCascade.
.. ocv:function:: gpu::SCascade::~SCascade()
gpu::SCascade::load
--------------------------
Load cascade from FileNode.
.. ocv:function:: bool gpu::SCascade::load(const FileNode& fn)
:param fn: File node from which the soft cascade is read.
gpu::SCascade::detect
--------------------------
Apply the cascade to an input frame and return the vector of Detection objects.
.. ocv:function:: void gpu::SCascade::detect(InputArray image, InputArray rois, OutputArray objects, Stream& stream = Stream::Null()) const
:param image: a frame on which the detector will be applied.
:param rois: a regions-of-interest mask generated by genRoi. Only the objects that fall into one of the regions will be returned.
:param objects: an output array of detections represented as a GpuMat of SCascade::Detection records. The first element of the matrix is actually a count of detections.
:param stream: a high-level CUDA stream abstraction used for asynchronous execution.
gpu::CascadeClassifier_GPU
--------------------------
.. ocv:class:: gpu::CascadeClassifier_GPU

View File

@ -55,142 +55,6 @@
#include "opencv2/features2d.hpp"
namespace cv { namespace gpu {
//////////////////////////////// CudaMem ////////////////////////////////
// CudaMem is a limited cv::Mat with page-locked memory allocation.
// Page-locked memory is only needed for asynchronous and faster copying to the GPU.
// It is convertible to a cv::Mat header without reference counting,
// so you can use it with other OpenCV functions.
// Page-locks the matrix m memory and maps it for the device(s)
CV_EXPORTS void registerPageLocked(Mat& m);
// Unmaps the memory of matrix m, and makes it pageable again.
CV_EXPORTS void unregisterPageLocked(Mat& m);
class CV_EXPORTS CudaMem
{
public:
enum { ALLOC_PAGE_LOCKED = 1, ALLOC_ZEROCOPY = 2, ALLOC_WRITE_COMBINED = 4 };
CudaMem();
CudaMem(const CudaMem& m);
CudaMem(int rows, int cols, int type, int _alloc_type = ALLOC_PAGE_LOCKED);
CudaMem(Size size, int type, int alloc_type = ALLOC_PAGE_LOCKED);
//! creates from cv::Mat, copying the data
explicit CudaMem(const Mat& m, int alloc_type = ALLOC_PAGE_LOCKED);
~CudaMem();
CudaMem& operator = (const CudaMem& m);
//! returns deep copy of the matrix, i.e. the data is copied
CudaMem clone() const;
//! allocates new matrix data unless the matrix already has specified size and type.
void create(int rows, int cols, int type, int alloc_type = ALLOC_PAGE_LOCKED);
void create(Size size, int type, int alloc_type = ALLOC_PAGE_LOCKED);
//! decrements the reference counter and releases memory if needed.
void release();
//! returns matrix header with disabled reference counting for CudaMem data.
Mat createMatHeader() const;
operator Mat() const;
//! maps host memory into device address space and returns GpuMat header for it. Throws exception if not supported by hardware.
GpuMat createGpuMatHeader() const;
operator GpuMat() const;
//! returns true if host memory can be mapped to the GPU address space
static bool canMapHostMemory();
// Please see cv::Mat for descriptions
bool isContinuous() const;
size_t elemSize() const;
size_t elemSize1() const;
int type() const;
int depth() const;
int channels() const;
size_t step1() const;
Size size() const;
bool empty() const;
// Please see cv::Mat for descriptions
int flags;
int rows, cols;
size_t step;
uchar* data;
int* refcount;
uchar* datastart;
uchar* dataend;
int alloc_type;
};
//////////////////////////////// CudaStream ////////////////////////////////
// Encapsulates a CUDA Stream. Provides an interface for asynchronous copying.
// Passed to each function that supports async kernel execution.
// Reference counting is enabled
class CV_EXPORTS Stream
{
public:
Stream();
~Stream();
Stream(const Stream&);
Stream& operator =(const Stream&);
bool queryIfComplete();
void waitForCompletion();
//! downloads asynchronously
// Warning! cv::Mat must point to page locked memory (i.e. to CudaMem data or to its subMat)
void enqueueDownload(const GpuMat& src, CudaMem& dst);
void enqueueDownload(const GpuMat& src, Mat& dst);
//! uploads asynchronously
// Warning! cv::Mat must point to page locked memory (i.e. to CudaMem data or to its ROI)
void enqueueUpload(const CudaMem& src, GpuMat& dst);
void enqueueUpload(const Mat& src, GpuMat& dst);
//! copy asynchronously
void enqueueCopy(const GpuMat& src, GpuMat& dst);
//! memory set asynchronously
void enqueueMemSet(GpuMat& src, Scalar val);
void enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask);
//! converts matrix type, e.g. from float to uchar, depending on dtype
void enqueueConvert(const GpuMat& src, GpuMat& dst, int dtype, double a = 1, double b = 0);
//! adds a callback to be called on the host after all currently enqueued items in the stream have completed
typedef void (*StreamCallback)(Stream& stream, int status, void* userData);
void enqueueHostCallback(StreamCallback callback, void* userData);
static Stream& Null();
operator bool() const;
private:
struct Impl;
explicit Stream(Impl* impl);
void create();
void release();
Impl *impl;
friend struct StreamAccessor;
};
//////////////////////////////// Filter Engine ////////////////////////////////
/*!
@ -1522,97 +1386,6 @@ private:
friend class CascadeClassifier_GPU_LBP;
};
// ======================== GPU version for soft cascade ===================== //
class CV_EXPORTS ChannelsProcessor
{
public:
enum
{
GENERIC = 1 << 4,
SEPARABLE = 2 << 4
};
// Appends the specified number of HOG first-order feature integrals to the given vector.
// Param frame is an input 3-channel BGR image.
// Param channels is a GPU matrix of optionally shrunk channels.
// Param stream is a high-level CUDA stream abstraction used for asynchronous execution.
virtual void apply(InputArray frame, OutputArray channels, Stream& stream = Stream::Null()) = 0;
// Creates a specific preprocessor implementation.
// Param shrinkage is a resizing factor. The resize is applied before computing the integral sum.
// Param bins is a number of HOG-like channels.
// Param flags is a channel computing extra flags.
static cv::Ptr<ChannelsProcessor> create(const int shrinkage, const int bins, const int flags = GENERIC);
virtual ~ChannelsProcessor();
protected:
ChannelsProcessor();
};
// Implementation of soft (stage-less) cascaded detector.
class CV_EXPORTS SCascade : public cv::Algorithm
{
public:
// Representation of detectors result.
struct CV_EXPORTS Detection
{
ushort x;
ushort y;
ushort w;
ushort h;
float confidence;
int kind;
enum {PEDESTRIAN = 0};
};
enum { NO_REJECT = 1, DOLLAR = 2, /*PASCAL = 4,*/ DEFAULT = NO_REJECT, NMS_MASK = 0xF};
// An empty cascade will be created.
// Param minScale is a minimum scale relative to the original size of the image on which the cascade will be applied.
// Param maxScale is a maximum scale relative to the original size of the image on which the cascade will be applied.
// Param scales is a number of scales from minScale to maxScale.
// Param flags is a set of extra tuning flags.
SCascade(const double minScale = 0.4, const double maxScale = 5., const int scales = 55,
const int flags = NO_REJECT || ChannelsProcessor::GENERIC);
virtual ~SCascade();
cv::AlgorithmInfo* info() const;
// Load cascade from FileNode.
// Param fn is a root node for cascade. Should be <cascade>.
virtual bool load(const FileNode& fn);
// Load cascade config.
virtual void read(const FileNode& fn);
// Returns the matrix of detected objects.
// Param image is a frame on which the detector will be applied.
// Param rois is a regions-of-interest mask generated by genRoi.
// Only the objects that fall into one of the regions will be returned.
// Param objects is an output array of detections represented as a GpuMat of SCascade::Detection records.
// The first element of the matrix is actually a count of detections.
// Param stream is a high-level CUDA stream abstraction used for asynchronous execution.
virtual void detect(InputArray image, InputArray rois, OutputArray objects, Stream& stream = Stream::Null()) const;
private:
struct Fields;
Fields* fields;
double minScale;
double maxScale;
int scales;
int flags;
};
CV_EXPORTS bool initModule_gpu(void);
////////////////////////////////// SURF //////////////////////////////////////////
class CV_EXPORTS SURF_GPU

View File

@ -1,59 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2008-2012, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
namespace cv { namespace gpu
{
CV_INIT_ALGORITHM(SCascade, "CascadeDetector.SCascade",
obj.info()->addParam(obj, "minScale", obj.minScale);
obj.info()->addParam(obj, "maxScale", obj.maxScale);
obj.info()->addParam(obj, "scales", obj.scales));
bool initModule_gpu(void)
{
Ptr<Algorithm> sc = createSCascade();
return sc->info() != 0;
}
} }

View File

@ -106,7 +106,7 @@
#endif
#include "internal_shared.hpp"
#include "opencv2/gpu/stream_accessor.hpp"
#include "opencv2/core/stream_accessor.hpp"
#include "nvidia/core/NCV.hpp"
#include "nvidia/NPP_staging/NPP_staging.hpp"

View File

@ -1,3 +1,3 @@
set(the_description "Soft Cascade detection and training")
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4310 -Wundef -Wsign-promo -Wmissing-declarations -Wmissing-prototypes)
ocv_define_module(softcascade opencv_core opencv_imgproc opencv_ml)
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4310)

View File

@ -8,4 +8,5 @@ softcascade. Soft Cascade object detection and training.
:maxdepth: 2
softcascade_detector
softcascade_training
softcascade_cuda

View File

@ -0,0 +1,62 @@
CUDA version of Soft Cascade Classifier
========================================
softcascade::SCascade
-----------------------------------------------
.. ocv:class:: softcascade::SCascade : public Algorithm
Implementation of soft (stageless) cascaded detector. ::
class CV_EXPORTS SCascade : public Algorithm
{
struct CV_EXPORTS Detection
{
ushort x;
ushort y;
ushort w;
ushort h;
float confidence;
int kind;
enum {PEDESTRIAN = 0};
};
SCascade(const double minScale = 0.4, const double maxScale = 5., const int scales = 55, const int rejfactor = 1);
virtual ~SCascade();
virtual bool load(const FileNode& fn);
virtual void detect(InputArray image, InputArray rois, OutputArray objects, Stream& stream = Stream::Null()) const;
virtual void genRoi(InputArray roi, OutputArray mask, Stream& stream = Stream::Null()) const;
};
softcascade::SCascade::~SCascade
---------------------------------
Destructor for SCascade.
.. ocv:function:: softcascade::SCascade::~SCascade()
softcascade::SCascade::load
----------------------------
Load cascade from FileNode.
.. ocv:function:: bool softcascade::SCascade::load(const FileNode& fn)
:param fn: File node from which the soft cascade is read.
softcascade::SCascade::detect
------------------------------
Apply the cascade to an input frame and return the vector of Detection objects.
.. ocv:function:: void softcascade::SCascade::detect(InputArray image, InputArray rois, OutputArray objects, cv::gpu::Stream& stream = cv::gpu::Stream::Null()) const
:param image: a frame on which the detector will be applied.
:param rois: a regions-of-interest mask generated by genRoi. Only the objects that fall into one of the regions will be returned.
:param objects: an output array of detections represented as a GpuMat of SCascade::Detection records. The first element of the matrix is actually a count of detections.
:param stream: a high-level CUDA stream abstraction used for asynchronous execution.
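
A hedged usage sketch combining the calls documented above; the cascade path, frame, and detection-buffer size are illustrative and follow the conventions of the performance tests updated in this patch: ::

    cv::softcascade::SCascade cascade;

    cv::FileStorage fs("soft-cascade.xml", cv::FileStorage::READ); // illustrative path
    CV_Assert(fs.isOpened() && cascade.load(fs.getFirstTopLevelNode()));

    cv::gpu::GpuMat colored(cv::imread("frame.png"));              // illustrative frame
    cv::gpu::GpuMat rois(colored.size(), CV_8UC1);
    rois.setTo(1);                                                 // search the whole frame

    typedef cv::softcascade::SCascade::Detection Detection;
    cv::gpu::GpuMat objects(1, 10000 * sizeof(Detection), CV_8UC1);

    cascade.detect(colored, rois, objects);                        // null stream, blocking call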

View File

@ -44,6 +44,7 @@
#define __OPENCV_SOFTCASCADE_HPP__
#include "opencv2/core.hpp"
#include "opencv2/core/gpumat.hpp"
namespace cv { namespace softcascade {
@ -212,6 +213,96 @@ public:
CV_EXPORTS bool initModule_softcascade(void);
// ======================== GPU version for soft cascade ===================== //
class CV_EXPORTS ChannelsProcessor
{
public:
enum
{
// GENERIC = 1 << 4, is not supported
SEPARABLE = 2 << 4
};
// Appends the specified number of HOG first-order feature integrals to the given vector.
// Param frame is an input 3-channel BGR image.
// Param channels is a GPU matrix of optionally shrunk channels.
// Param stream is a high-level CUDA stream abstraction used for asynchronous execution.
virtual void apply(InputArray frame, OutputArray channels, cv::gpu::Stream& stream = cv::gpu::Stream::Null()) = 0;
// Creates a specific preprocessor implementation.
// Param shrinkage is a resizing factor. The resize is applied before computing the integral sum.
// Param bins is a number of HOG-like channels.
// Param flags is a channel computing extra flags.
static cv::Ptr<ChannelsProcessor> create(const int shrinkage, const int bins, const int flags = SEPARABLE);
virtual ~ChannelsProcessor();
protected:
ChannelsProcessor();
};
// Implementation of soft (stage-less) cascaded detector.
class CV_EXPORTS SCascade : public cv::Algorithm
{
public:
// Representation of the detector's result.
struct CV_EXPORTS Detection
{
ushort x;
ushort y;
ushort w;
ushort h;
float confidence;
int kind;
enum {PEDESTRIAN = 0};
};
enum { NO_REJECT = 1, DOLLAR = 2, /*PASCAL = 4,*/ DEFAULT = NO_REJECT, NMS_MASK = 0xF};
// An empty cascade will be created.
// Param minScale is a minimum scale relative to the original size of the image on which the cascade will be applied.
// Param maxScale is a maximum scale relative to the original size of the image on which the cascade will be applied.
// Param scales is a number of scales from minScale to maxScale.
// Param flags is a set of extra tuning flags.
SCascade(const double minScale = 0.4, const double maxScale = 5., const int scales = 55,
const int flags = NO_REJECT | ChannelsProcessor::SEPARABLE);
virtual ~SCascade();
cv::AlgorithmInfo* info() const;
// Load cascade from FileNode.
// Param fn is a root node for cascade. Should be <cascade>.
virtual bool load(const FileNode& fn);
// Load cascade config.
virtual void read(const FileNode& fn);
// Returns the matrix of detected objects.
// Param image is a frame on which the detector will be applied.
// Param rois is a regions-of-interest mask generated by genRoi.
// Only the objects that fall into one of the regions will be returned.
// Param objects is an output array of detections represented as a GpuMat of SCascade::Detection records.
// The first element of the matrix is actually a count of detections.
// Param stream is a high-level CUDA stream abstraction used for asynchronous execution.
virtual void detect(InputArray image, InputArray rois, OutputArray objects, cv::gpu::Stream& stream = cv::gpu::Stream::Null()) const;
private:
struct Fields;
Fields* fields;
double minScale;
double maxScale;
int scales;
int flags;
};
}} // namespace cv { namespace softcascade {
#endif
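
A brief sketch of how the two classes above compose; the shrinkage, bin count, and placeholder frame are illustrative values, not defaults from this header:

#include "opencv2/softcascade.hpp"

void buildDetector()
{
    using namespace cv::softcascade;

    // Separable channel preprocessor: shrink by 4, six HOG-like bins (illustrative values).
    cv::Ptr<ChannelsProcessor> proc = ChannelsProcessor::create(4, 6, ChannelsProcessor::SEPARABLE);

    cv::Mat bgr(480, 640, CV_8UC3);      // placeholder 3-channel BGR frame
    cv::gpu::GpuMat frame(bgr);
    cv::gpu::GpuMat channels;            // receives the optionally shrunk channels
    proc->apply(frame, channels);        // null stream: no explicit asynchrony

    // The detector pairs a rejection criterion with a preprocessor flag,
    // mirroring the constructor defaults declared above.
    SCascade cascade(0.4, 5.0, 55, SCascade::NO_REJECT | ChannelsProcessor::SEPARABLE);
}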

View File

@ -1,5 +1,7 @@
#include "perf_precomp.hpp"
using std::tr1::get;
#define SC_PERF_TEST_P(fixture, name, params) \
class fixture##_##name : public fixture {\
public:\
@ -25,8 +27,8 @@ void fixture##_##name::__cpu() { FAIL() << "No such CPU implementation analogy";
namespace {
struct DetectionLess
{
bool operator()(const cv::gpu::SCascade::Detection& a,
const cv::gpu::SCascade::Detection& b) const
bool operator()(const cv::softcascade::SCascade::Detection& a,
const cv::softcascade::SCascade::Detection& b) const
{
if (a.x != b.x) return a.x < b.x;
else if (a.y != b.y) return a.y < b.y;
@ -39,7 +41,7 @@ namespace {
{
cv::Mat detections(objects);
typedef cv::gpu::SCascade::Detection Detection;
typedef cv::softcascade::SCascade::Detection Detection;
Detection* begin = (Detection*)(detections.ptr<char>(0));
Detection* end = (Detection*)(detections.ptr<char>(0) + detections.cols);
std::sort(begin, end, DetectionLess());
@ -60,18 +62,18 @@ SC_PERF_TEST_P(SCascadeTest, detect,
RUN_GPU(SCascadeTest, detect)
{
cv::Mat cpu = readImage (GET_PARAM(1));
cv::Mat cpu = cv::imread(getDataPath(get<1>(GetParam())));
ASSERT_FALSE(cpu.empty());
cv::gpu::GpuMat colored(cpu);
cv::gpu::SCascade cascade;
cv::softcascade::SCascade cascade;
cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ);
cv::FileStorage fs(getDataPath(get<0>(GetParam())), cv::FileStorage::READ);
ASSERT_TRUE(fs.isOpened());
ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode()));
cv::gpu::GpuMat objectBoxes(1, 10000 * sizeof(cv::gpu::SCascade::Detection), CV_8UC1), rois(colored.size(), CV_8UC1);
cv::gpu::GpuMat objectBoxes(1, 10000 * sizeof(cv::softcascade::SCascade::Detection), CV_8UC1), rois(colored.size(), CV_8UC1);
rois.setTo(1);
cascade.detect(colored, rois, objectBoxes);
@ -118,13 +120,13 @@ SC_PERF_TEST_P(SCascadeTestRoi, detectInRoi,
RUN_GPU(SCascadeTestRoi, detectInRoi)
{
cv::Mat cpu = readImage (GET_PARAM(1));
cv::Mat cpu = cv::imread(getDataPath(get<1>(GetParam())));
ASSERT_FALSE(cpu.empty());
cv::gpu::GpuMat colored(cpu);
cv::gpu::SCascade cascade;
cv::softcascade::SCascade cascade;
cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ);
cv::FileStorage fs(getDataPath(get<0>(GetParam())), cv::FileStorage::READ);
ASSERT_TRUE(fs.isOpened());
ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode()));
@ -132,7 +134,7 @@ RUN_GPU(SCascadeTestRoi, detectInRoi)
cv::gpu::GpuMat objectBoxes(1, 16384 * 20, CV_8UC1), rois(colored.size(), CV_8UC1);
rois.setTo(0);
int nroi = GET_PARAM(2);
int nroi = get<2>(GetParam());
cv::RNG rng;
for (int i = 0; i < nroi; ++i)
{
@ -163,13 +165,13 @@ SC_PERF_TEST_P(SCascadeTestRoi, detectEachRoi,
RUN_GPU(SCascadeTestRoi, detectEachRoi)
{
cv::Mat cpu = readImage (GET_PARAM(1));
cv::Mat cpu = cv::imread(getDataPath(get<1>(GetParam())));
ASSERT_FALSE(cpu.empty());
cv::gpu::GpuMat colored(cpu);
cv::gpu::SCascade cascade;
cv::softcascade::SCascade cascade;
cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ);
cv::FileStorage fs(getDataPath(get<0>(GetParam())), cv::FileStorage::READ);
ASSERT_TRUE(fs.isOpened());
ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode()));
@ -177,7 +179,7 @@ RUN_GPU(SCascadeTestRoi, detectEachRoi)
cv::gpu::GpuMat objectBoxes(1, 16384 * 20, CV_8UC1), rois(colored.size(), CV_8UC1);
rois.setTo(0);
int idx = GET_PARAM(2);
int idx = get<2>(GetParam());
cv::Rect r = getFromTable(idx);
cv::gpu::GpuMat sub(rois, r);
sub.setTo(1);
@ -202,18 +204,18 @@ SC_PERF_TEST_P(SCascadeTest, detectStream,
RUN_GPU(SCascadeTest, detectStream)
{
cv::Mat cpu = readImage (GET_PARAM(1));
cv::Mat cpu = cv::imread(getDataPath(get<1>(GetParam())));
ASSERT_FALSE(cpu.empty());
cv::gpu::GpuMat colored(cpu);
cv::gpu::SCascade cascade;
cv::softcascade::SCascade cascade;
cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ);
cv::FileStorage fs(getDataPath(get<0>(GetParam())), cv::FileStorage::READ);
ASSERT_TRUE(fs.isOpened());
ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode()));
cv::gpu::GpuMat objectBoxes(1, 10000 * sizeof(cv::gpu::SCascade::Detection), CV_8UC1), rois(colored.size(), CV_8UC1);
cv::gpu::GpuMat objectBoxes(1, 10000 * sizeof(cv::softcascade::SCascade::Detection), CV_8UC1), rois(colored.size(), CV_8UC1);
rois.setTo(1);
cv::gpu::Stream s;

View File

@ -0,0 +1,522 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2008-2012, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "opencv2/core/cuda_devptrs.hpp"
namespace cv { namespace softcascade { namespace internal {
void error(const char *error_string, const char *file, const int line, const char *func);
}}}
#if defined(__GNUC__)
#define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, __func__)
#else /* defined(__CUDACC__) || defined(__MSVC__) */
#define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__)
#endif
static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
{
if (cudaSuccess != err) cv::softcascade::internal::error(cudaGetErrorString(err), file, line, func);
}
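// Integer ceiling division used for grid sizing throughout this file;
// e.g. divUp(479, 32) == 15 (illustrative worked example, not from the source).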
__host__ __device__ __forceinline__ int divUp(int total, int grain)
{
return (total + grain - 1) / grain;
}
namespace cv { namespace softcascade { namespace device
{
// Utility function to extract unsigned chars from an unsigned integer
__device__ uchar4 int_to_uchar4(unsigned int in)
{
uchar4 bytes;
bytes.x = (in & 0x000000ff) >> 0;
bytes.y = (in & 0x0000ff00) >> 8;
bytes.z = (in & 0x00ff0000) >> 16;
bytes.w = (in & 0xff000000) >> 24;
return bytes;
}
__global__ void shfl_integral_horizontal(const cv::gpu::PtrStep<uint4> img, cv::gpu::PtrStep<uint4> integral)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
__shared__ int sums[128];
const int id = threadIdx.x;
const int lane_id = id % warpSize;
const int warp_id = id / warpSize;
const uint4 data = img(blockIdx.x, id);
const uchar4 a = int_to_uchar4(data.x);
const uchar4 b = int_to_uchar4(data.y);
const uchar4 c = int_to_uchar4(data.z);
const uchar4 d = int_to_uchar4(data.w);
int result[16];
result[0] = a.x;
result[1] = result[0] + a.y;
result[2] = result[1] + a.z;
result[3] = result[2] + a.w;
result[4] = result[3] + b.x;
result[5] = result[4] + b.y;
result[6] = result[5] + b.z;
result[7] = result[6] + b.w;
result[8] = result[7] + c.x;
result[9] = result[8] + c.y;
result[10] = result[9] + c.z;
result[11] = result[10] + c.w;
result[12] = result[11] + d.x;
result[13] = result[12] + d.y;
result[14] = result[13] + d.z;
result[15] = result[14] + d.w;
int sum = result[15];
// the prefix sum for each thread's 16 values is computed,
// now the final sums (result[15]) need to be shared
// with the other threads and added. To do this,
// the __shfl_up() instruction is used and a shuffle scan
// operation is performed to distribute the sums to the correct
// threads
#pragma unroll
for (int i = 1; i < 32; i *= 2)
{
const int n = __shfl_up(sum, i, 32);
if (lane_id >= i)
{
#pragma unroll
for (int i = 0; i < 16; ++i)
result[i] += n;
sum += n;
}
}
// Now the final sum for the warp must be shared
// between warps. This is done by each warp
// having a thread store to shared memory, then
// having some other warp load the values and
// compute a prefix sum, again by using __shfl_up.
// The results are uniformly added back to the warps.
// the last thread in the warp, which holds the warp's sum,
// places it in shared memory
if (threadIdx.x % warpSize == warpSize - 1)
sums[warp_id] = result[15];
__syncthreads();
if (warp_id == 0)
{
int warp_sum = sums[lane_id];
#pragma unroll
for (int i = 1; i <= 32; i *= 2)
{
const int n = __shfl_up(warp_sum, i, 32);
if (lane_id >= i)
warp_sum += n;
}
sums[lane_id] = warp_sum;
}
__syncthreads();
int blockSum = 0;
// fold in unused warp
if (warp_id > 0)
{
blockSum = sums[warp_id - 1];
#pragma unroll
for (int i = 0; i < 16; ++i)
result[i] += blockSum;
}
// assemble result
// Each thread has 16 values to write, which are
// now integer data (to avoid overflow). Instead of
// each thread writing consecutive uint4s, the
// approach shown here experiments using
// the shuffle command to reformat the data
// inside the registers so that each thread holds
// consecutive data to be written so larger contiguous
// segments can be assembled for writing.
/*
For example data that needs to be written as
GMEM[16] <- x0 x1 x2 x3 y0 y1 y2 y3 z0 z1 z2 z3 w0 w1 w2 w3
but is stored in registers (r0..r3), in four threads (0..3) as:
threadId 0 1 2 3
r0 x0 y0 z0 w0
r1 x1 y1 z1 w1
r2 x2 y2 z2 w2
r3 x3 y3 z3 w3
after applying __shfl_xor operations to move data between registers r1..r3:
threadId 00 01 10 11
x0 y0 z0 w0
xor(01)->y1 x1 w1 z1
xor(10)->z2 w2 x2 y2
xor(11)->w3 z3 y3 x3
and now x0..x3, and z0..z3 can be written out in order by all threads.
In the current code, each register above is actually representing
four integers to be written as uint4's to GMEM.
*/
result[4] = __shfl_xor(result[4] , 1, 32);
result[5] = __shfl_xor(result[5] , 1, 32);
result[6] = __shfl_xor(result[6] , 1, 32);
result[7] = __shfl_xor(result[7] , 1, 32);
result[8] = __shfl_xor(result[8] , 2, 32);
result[9] = __shfl_xor(result[9] , 2, 32);
result[10] = __shfl_xor(result[10], 2, 32);
result[11] = __shfl_xor(result[11], 2, 32);
result[12] = __shfl_xor(result[12], 3, 32);
result[13] = __shfl_xor(result[13], 3, 32);
result[14] = __shfl_xor(result[14], 3, 32);
result[15] = __shfl_xor(result[15], 3, 32);
uint4* integral_row = integral.ptr(blockIdx.x);
uint4 output;
///////
if (threadIdx.x % 4 == 0)
output = make_uint4(result[0], result[1], result[2], result[3]);
if (threadIdx.x % 4 == 1)
output = make_uint4(result[4], result[5], result[6], result[7]);
if (threadIdx.x % 4 == 2)
output = make_uint4(result[8], result[9], result[10], result[11]);
if (threadIdx.x % 4 == 3)
output = make_uint4(result[12], result[13], result[14], result[15]);
integral_row[threadIdx.x % 4 + (threadIdx.x / 4) * 16] = output;
///////
if (threadIdx.x % 4 == 2)
output = make_uint4(result[0], result[1], result[2], result[3]);
if (threadIdx.x % 4 == 3)
output = make_uint4(result[4], result[5], result[6], result[7]);
if (threadIdx.x % 4 == 0)
output = make_uint4(result[8], result[9], result[10], result[11]);
if (threadIdx.x % 4 == 1)
output = make_uint4(result[12], result[13], result[14], result[15]);
integral_row[(threadIdx.x + 2) % 4 + (threadIdx.x / 4) * 16 + 8] = output;
// continuing from the above example,
// this use of __shfl_xor() places the y0..y3 and w0..w3 data
// in order.
#pragma unroll
for (int i = 0; i < 16; ++i)
result[i] = __shfl_xor(result[i], 1, 32);
if (threadIdx.x % 4 == 0)
output = make_uint4(result[0], result[1], result[2], result[3]);
if (threadIdx.x % 4 == 1)
output = make_uint4(result[4], result[5], result[6], result[7]);
if (threadIdx.x % 4 == 2)
output = make_uint4(result[8], result[9], result[10], result[11]);
if (threadIdx.x % 4 == 3)
output = make_uint4(result[12], result[13], result[14], result[15]);
integral_row[threadIdx.x % 4 + (threadIdx.x / 4) * 16 + 4] = output;
///////
if (threadIdx.x % 4 == 2)
output = make_uint4(result[0], result[1], result[2], result[3]);
if (threadIdx.x % 4 == 3)
output = make_uint4(result[4], result[5], result[6], result[7]);
if (threadIdx.x % 4 == 0)
output = make_uint4(result[8], result[9], result[10], result[11]);
if (threadIdx.x % 4 == 1)
output = make_uint4(result[12], result[13], result[14], result[15]);
integral_row[(threadIdx.x + 2) % 4 + (threadIdx.x / 4) * 16 + 12] = output;
#endif
}
// This kernel computes columnwise prefix sums. When the data input is
// the row sums from above, this completes the integral image.
// The approach here is to have each block compute a local set of sums.
// First, the data covered by the block is loaded into shared memory,
// then instead of performing a sum in shared memory using __syncthreads
// between stages, the data is reformatted so that the necessary sums
// occur inside warps and the shuffle scan operation is used.
// The final set of sums from the block is then propagated, with the block
// computing "down" the image and adding the running sum to the local
// block sums.
__global__ void shfl_integral_vertical(cv::gpu::PtrStepSz<unsigned int> integral)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
__shared__ unsigned int sums[32][9];
const int tidx = blockIdx.x * blockDim.x + threadIdx.x;
const int lane_id = tidx % 8;
if (tidx >= integral.cols)
return;
sums[threadIdx.x][threadIdx.y] = 0;
__syncthreads();
unsigned int stepSum = 0;
for (int y = threadIdx.y; y < integral.rows; y += blockDim.y)
{
unsigned int* p = integral.ptr(y) + tidx;
unsigned int sum = *p;
sums[threadIdx.x][threadIdx.y] = sum;
__syncthreads();
// place into SMEM
// a shuffle scan reduces the SMEM, reformatting it so the column
// sums are computed in a warp
// then read out properly
const int j = threadIdx.x % 8;
const int k = threadIdx.x / 8 + threadIdx.y * 4;
int partial_sum = sums[k][j];
for (int i = 1; i <= 8; i *= 2)
{
int n = __shfl_up(partial_sum, i, 32);
if (lane_id >= i)
partial_sum += n;
}
sums[k][j] = partial_sum;
__syncthreads();
if (threadIdx.y > 0)
sum += sums[threadIdx.x][threadIdx.y - 1];
sum += stepSum;
stepSum += sums[threadIdx.x][blockDim.y - 1];
__syncthreads();
*p = sum;
}
#endif
}
void shfl_integral(const cv::gpu::PtrStepSzb& img, cv::gpu::PtrStepSz<unsigned int> integral, cudaStream_t stream)
{
{
// each thread handles 16 values, use 1 block per row
// safe, because the matrix step actually cannot be less than 512 bytes
int block = integral.cols / 16;
// launch 1 block / row
const int grid = img.rows;
cudaSafeCall( cudaFuncSetCacheConfig(shfl_integral_horizontal, cudaFuncCachePreferL1) );
shfl_integral_horizontal<<<grid, block, 0, stream>>>((const cv::gpu::PtrStepSz<uint4>) img, (cv::gpu::PtrStepSz<uint4>) integral);
cudaSafeCall( cudaGetLastError() );
}
{
const dim3 block(32, 8);
const dim3 grid(divUp(integral.cols, block.x), 1);
shfl_integral_vertical<<<grid, block, 0, stream>>>(integral);
cudaSafeCall( cudaGetLastError() );
}
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void shfl_integral_vertical(cv::gpu::PtrStepSz<unsigned int> buffer, cv::gpu::PtrStepSz<unsigned int> integral)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
__shared__ unsigned int sums[32][9];
const int tidx = blockIdx.x * blockDim.x + threadIdx.x;
const int lane_id = tidx % 8;
if (tidx >= integral.cols)
return;
sums[threadIdx.x][threadIdx.y] = 0;
__syncthreads();
unsigned int stepSum = 0;
for (int y = threadIdx.y; y < integral.rows; y += blockDim.y)
{
unsigned int* p = buffer.ptr(y) + tidx;
unsigned int* dst = integral.ptr(y + 1) + tidx + 1;
unsigned int sum = *p;
sums[threadIdx.x][threadIdx.y] = sum;
__syncthreads();
// place into SMEM
// a shuffle scan reduces the SMEM, reformatting it so the column
// sums are computed in a warp
// then read out properly
const int j = threadIdx.x % 8;
const int k = threadIdx.x / 8 + threadIdx.y * 4;
int partial_sum = sums[k][j];
for (int i = 1; i <= 8; i *= 2)
{
int n = __shfl_up(partial_sum, i, 32);
if (lane_id >= i)
partial_sum += n;
}
sums[k][j] = partial_sum;
__syncthreads();
if (threadIdx.y > 0)
sum += sums[threadIdx.x][threadIdx.y - 1];
sum += stepSum;
stepSum += sums[threadIdx.x][blockDim.y - 1];
__syncthreads();
*dst = sum;
}
#endif
}
// used for frame preprocessing before Soft Cascade evaluation: no synchronization needed
void shfl_integral_gpu_buffered(cv::gpu::PtrStepSzb img, cv::gpu::PtrStepSz<uint4> buffer, cv::gpu::PtrStepSz<unsigned int> integral,
int blockStep, cudaStream_t stream)
{
{
const int block = blockStep;
const int grid = img.rows;
cudaSafeCall( cudaFuncSetCacheConfig(shfl_integral_horizontal, cudaFuncCachePreferL1) );
shfl_integral_horizontal<<<grid, block, 0, stream>>>((cv::gpu::PtrStepSz<uint4>) img, buffer);
cudaSafeCall( cudaGetLastError() );
}
{
const dim3 block(32, 8);
const dim3 grid(divUp(integral.cols, block.x), 1);
shfl_integral_vertical<<<grid, block, 0, stream>>>((cv::gpu::PtrStepSz<uint>)buffer, integral);
cudaSafeCall( cudaGetLastError() );
}
}
// 0
#define CV_DESCALE(x, n) (((x) + (1 << ((n)-1))) >> (n))
enum
{
yuv_shift = 14,
xyz_shift = 12,
R2Y = 4899,
G2Y = 9617,
B2Y = 1868
};
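// Note: B2Y + G2Y + R2Y = 1868 + 9617 + 4899 = 16384 = 2^14, so CV_DESCALE
// with yuv_shift (14) maps a weighted 8-bit BGR triple back to an 8-bit gray value.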
template <int bidx> static __device__ __forceinline__ unsigned char RGB2GrayConvert(unsigned char b, unsigned char g, unsigned char r)
{
// uint b = 0xffu & (src >> (bidx * 8));
// uint g = 0xffu & (src >> 8);
// uint r = 0xffu & (src >> ((bidx ^ 2) * 8));
return CV_DESCALE((uint)(b * B2Y + g * G2Y + r * R2Y), yuv_shift);
}
__global__ void device_transform(const cv::gpu::PtrStepSz<uchar3> bgr, cv::gpu::PtrStepSzb gray)
{
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const uchar3 colored = (uchar3)(bgr.ptr(y))[x];
gray.ptr(y)[x] = RGB2GrayConvert<0>(colored.x, colored.y, colored.z);
}
///////
void transform(const cv::gpu::PtrStepSz<uchar3>& bgr, cv::gpu::PtrStepSzb gray)
{
const dim3 block(32, 8);
const dim3 grid(divUp(bgr.cols, block.x), divUp(bgr.rows, block.y));
device_transform<<<grid, block>>>(bgr, gray);
cudaSafeCall(cudaDeviceSynchronize());
}
}}}

View File

@ -40,15 +40,31 @@
//
//M*/
#include <opencv2/gpu/device/common.hpp>
#include <opencv2/gpu/device/saturate_cast.hpp>
#include <icf.hpp>
#include <cuda_invoker.hpp>
#include <float.h>
#include <stdio.h>
namespace cv { namespace gpu { namespace device {
namespace icf {
namespace cv { namespace softcascade { namespace internal {
void error(const char *error_string, const char *file, const int line, const char *func);
}}}
#if defined(__GNUC__)
#define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, __func__)
#else /* defined(__CUDACC__) || defined(__MSVC__) */
#define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__)
#endif
static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
{
if (cudaSuccess != err) cv::softcascade::internal::error(cudaGetErrorString(err), file, line, func);
}
#ifndef CV_PI
#define CV_PI 3.1415926535897932384626433832795
#endif
namespace cv { namespace softcascade { namespace device {
typedef unsigned char uchar;
template <int FACTOR>
__device__ __forceinline__ uchar shrink(const uchar* ptr, const int pitch, const int y, const int x)
@ -125,7 +141,7 @@ namespace icf {
luvg[luvgPitch * (y + 2 * 480) + x] = v;
}
void bgr2Luv(const PtrStepSzb& bgr, PtrStepSzb luv)
void bgr2Luv(const cv::gpu::PtrStepSzb& bgr, cv::gpu::PtrStepSzb luv)
{
dim3 block(32, 8);
dim3 grid(bgr.cols / 32, bgr.rows / 8);
@ -207,7 +223,7 @@ namespace icf {
texture<uchar, cudaTextureType2D, cudaReadModeElementType> tgray;
template<bool isDefaultNum>
__global__ void gray2hog(PtrStepSzb mag)
__global__ void gray2hog(cv::gpu::PtrStepSzb mag)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -222,7 +238,7 @@ namespace icf {
mag( 480 * fast_angle_bin<isDefaultNum>(dy, dx) + y, x) = cmag;
}
void gray2hog(const PtrStepSzb& gray, PtrStepSzb mag, const int bins)
void gray2hog(const cv::gpu::PtrStepSzb& gray, cv::gpu::PtrStepSzb mag, const int bins)
{
dim3 block(32, 8);
dim3 grid(gray.cols / 32, gray.rows / 8);
@ -303,7 +319,7 @@ namespace icf {
excluded = excluded || (suppessed == i);
}
#if __CUDA_ARCH__ >= 120
#if defined __CUDA_ARCH__ && (__CUDA_ARCH__ >= 120)
if (__all(excluded)) break;
#endif
}
@ -325,8 +341,8 @@ namespace icf {
}
}
void suppress(const PtrStepSzb& objects, PtrStepSzb overlaps, PtrStepSzi ndetections,
PtrStepSzb suppressed, cudaStream_t stream)
void suppress(const cv::gpu::PtrStepSzb& objects, cv::gpu::PtrStepSzb overlaps, cv::gpu::PtrStepSzi ndetections,
cv::gpu::PtrStepSzb suppressed, cudaStream_t stream)
{
int block = 192;
int grid = 1;
@ -348,7 +364,7 @@ namespace icf {
template<typename Policy>
struct PrefixSum
{
__device static void apply(float& impact)
__device_inline__ static void apply(float& impact)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
#pragma unroll
@ -442,6 +458,7 @@ namespace icf {
{
x += area.x;
y += area.y;
int a = tex2D(thogluv, x, y);
int b = tex2D(thogluv, x + area.z, y);
int c = tex2D(thogluv, x + area.z, y + area.w);
@ -454,7 +471,7 @@ namespace icf {
template<typename Policy>
template<bool isUp>
__device void CascadeInvoker<Policy>::detect(Detection* objects, const uint ndetections, uint* ctr, const int downscales) const
__device_inline__ void CascadeInvoker<Policy>::detect(Detection* objects, const uint ndetections, uint* ctr, const int downscales) const
{
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const int x = blockIdx.x;
@ -527,8 +544,8 @@ __global__ void soft_cascade(const CascadeInvoker<Policy> invoker, Detection* ob
}
template<typename Policy>
void CascadeInvoker<Policy>::operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv,
PtrStepSz<uchar4> objects, const int downscales, const cudaStream_t& stream) const
void CascadeInvoker<Policy>::operator()(const cv::gpu::PtrStepSzb& roi, const cv::gpu::PtrStepSzi& hogluv,
cv::gpu::PtrStepSz<uchar4> objects, const int downscales, const cudaStream_t& stream) const
{
int fw = roi.rows;
int fh = roi.cols;
@ -560,8 +577,7 @@ void CascadeInvoker<Policy>::operator()(const PtrStepSzb& roi, const PtrStepSzi&
}
}
template void CascadeInvoker<GK107PolicyX4>::operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv,
PtrStepSz<uchar4> objects, const int downscales, const cudaStream_t& stream) const;
template void CascadeInvoker<GK107PolicyX4>::operator()(const cv::gpu::PtrStepSzb& roi, const cv::gpu::PtrStepSzi& hogluv,
cv::gpu::PtrStepSz<uchar4> objects, const int downscales, const cudaStream_t& stream) const;
}
}}}

View File

@ -22,7 +22,7 @@
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// and / or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
@ -44,17 +44,20 @@
#ifndef __OPENCV_ICF_HPP__
#define __OPENCV_ICF_HPP__
#include <opencv2/gpu/device/common.hpp>
// #include <opencv2/gpu/device/common.hpp>
#include "opencv2/core/cuda_devptrs.hpp"
#include "cuda_runtime_api.h"
#if defined __CUDACC__
# define __device __device__ __forceinline__
# define __device_inline__ __device__ __forceinline__
#else
# define __device
# define __device_inline__
#endif
namespace cv { namespace gpu { namespace device {
namespace icf {
namespace cv { namespace softcascade { namespace device {
typedef unsigned char uchar;
struct Octave
{
@ -68,20 +71,19 @@ struct Octave
: index(i), stages(s), shrinkage(sh), size(sz), scale(sc) {}
};
struct Level //is actually 24 bytes
struct Level
{
int octave;
int step;
float relScale;
float scaling[2]; // calculated according to Dollal paper
float scaling[2];// calculated according to Dollar paper
// for 640x480 we cannot get an overflow
uchar2 workRect;
uchar2 objSize;
Level(int idx, const Octave& oct, const float scale, const int w, const int h);
__device Level(){}
__device_inline__ Level(){}
};
struct Node
@ -106,7 +108,7 @@ struct Detection
int kind;
Detection(){}
__device Detection(int _x, int _y, uchar _w, uchar _h, float c)
__device_inline__ Detection(int _x, int _y, uchar _w, uchar _h, float c)
: x(_x), y(_y), w(_w), h(_h), confidence(c), kind(0) {};
};
@ -125,8 +127,8 @@ struct CascadeInvoker
{
CascadeInvoker(): levels(0), stages(0), nodes(0), leaves(0), scales(0) {}
CascadeInvoker(const PtrStepSzb& _levels, const PtrStepSzf& _stages,
const PtrStepSzb& _nodes, const PtrStepSzf& _leaves)
CascadeInvoker(const cv::gpu::PtrStepSzb& _levels, const cv::gpu::PtrStepSzf& _stages,
const cv::gpu::PtrStepSzb& _nodes, const cv::gpu::PtrStepSzf& _leaves)
: levels((const Level*)_levels.ptr()),
stages((const float*)_stages.ptr()),
nodes((const Node*)_nodes.ptr()), leaves((const float*)_leaves.ptr()),
@ -141,14 +143,13 @@ struct CascadeInvoker
int scales;
void operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv, PtrStepSz<uchar4> objects,
void operator()(const cv::gpu::PtrStepSzb& roi, const cv::gpu::PtrStepSzi& hogluv, cv::gpu::PtrStepSz<uchar4> objects,
const int downscales, const cudaStream_t& stream = 0) const;
template<bool isUp>
__device void detect(Detection* objects, const unsigned int ndetections, unsigned int* ctr, const int downscales) const;
__device_inline__ void detect(Detection* objects, const unsigned int ndetections, unsigned int* ctr, const int downscales) const;
};
}
}}}
#endif

View File

@ -43,26 +43,41 @@
#include "precomp.hpp"
#if !defined (HAVE_CUDA)
cv::gpu::SCascade::SCascade(const double, const double, const int, const int) { throw_nogpu(); }
#define throw_nogpu() CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support")
cv::softcascade::SCascade::SCascade(const double, const double, const int, const int) { throw_nogpu(); }
cv::gpu::SCascade::~SCascade() { throw_nogpu(); }
cv::softcascade::SCascade::~SCascade() { throw_nogpu(); }
bool cv::gpu::SCascade::load(const FileNode&) { throw_nogpu(); return false;}
bool cv::softcascade::SCascade::load(const FileNode&) { throw_nogpu(); return false;}
void cv::gpu::SCascade::detect(InputArray, InputArray, OutputArray, Stream&) const { throw_nogpu(); }
void cv::softcascade::SCascade::detect(InputArray, InputArray, OutputArray, cv::gpu::Stream&) const { throw_nogpu(); }
void cv::gpu::SCascade::read(const FileNode& fn) { Algorithm::read(fn); }
void cv::softcascade::SCascade::read(const FileNode& fn) { Algorithm::read(fn); }
cv::gpu::ChannelsProcessor::ChannelsProcessor() { throw_nogpu(); }
cv::gpu::ChannelsProcessor::~ChannelsProcessor() { throw_nogpu(); }
cv::softcascade::ChannelsProcessor::ChannelsProcessor() { throw_nogpu(); }
cv::softcascade::ChannelsProcessor::~ChannelsProcessor() { throw_nogpu(); }
cv::Ptr<cv::gpu::ChannelsProcessor> cv::gpu::ChannelsProcessor::create(const int, const int, const int)
{ throw_nogpu(); return cv::Ptr<cv::gpu::ChannelsProcessor>(0); }
cv::Ptr<cv::softcascade::ChannelsProcessor> cv::softcascade::ChannelsProcessor::create(const int, const int, const int)
{ throw_nogpu(); return cv::Ptr<cv::softcascade::ChannelsProcessor>(0); }
#else
# include "icf.hpp"
# include "cuda_invoker.hpp"
# include "opencv2/core/stream_accessor.hpp"
namespace
{
#if defined(__GNUC__)
#define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, __func__)
#else /* defined(__CUDACC__) || defined(__MSVC__) */
#define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__)
#endif
cv::gpu::device::icf::Level::Level(int idx, const Octave& oct, const float scale, const int w, const int h)
inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
{
if (cudaSuccess != err) cv::gpu::error(cudaGetErrorString(err), file, line, func);
}
}
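
Editor's note: the block above gives the module its own cudaSafeCall wrapper so it no longer depends on the gpu module's private one; any raw CUDA runtime call can be funneled through it. A hedged usage sketch, assumed to live in the same translation unit (the helper and buffer names are illustrative, not from this patch):

#include <cuda_runtime.h>

// Hypothetical helper: zero a device buffer asynchronously, routing any
// CUDA error through cv::gpu::error with file/line (and function) context.
void clearDeviceBuffer(void* devPtr, size_t bytes, cudaStream_t stream)
{
    cudaSafeCall( cudaMemsetAsync(devPtr, 0, bytes, stream) );
    cudaSafeCall( cudaStreamSynchronize(stream) );
}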
cv::softcascade::device::Level::Level(int idx, const Octave& oct, const float scale, const int w, const int h)
: octave(idx), step(oct.stages), relScale(scale / oct.scale)
{
workRect.x = cvRound(w / (float)oct.shrinkage);
@@ -81,23 +96,23 @@ cv::gpu::device::icf::Level::Level(int idx, const Octave& oct, const float scale
}
}
namespace cv { namespace gpu { namespace device {
namespace cv { namespace softcascade { namespace device {
namespace icf {
void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle,
const int fw, const int fh, const int bins, cudaStream_t stream);
void suppress(const PtrStepSzb& objects, PtrStepSzb overlaps, PtrStepSzi ndetections,
PtrStepSzb suppressed, cudaStream_t stream);
void suppress(const cv::gpu::PtrStepSzb& objects, cv::gpu::PtrStepSzb overlaps, cv::gpu::PtrStepSzi ndetections,
cv::gpu::PtrStepSzb suppressed, cudaStream_t stream);
void bgr2Luv(const PtrStepSzb& bgr, PtrStepSzb luv);
void gray2hog(const PtrStepSzb& gray, PtrStepSzb mag, const int bins);
void bgr2Luv(const cv::gpu::PtrStepSzb& bgr, cv::gpu::PtrStepSzb luv);
void transform(const cv::gpu::PtrStepSz<uchar3>& bgr, cv::gpu::PtrStepSzb gray);
void gray2hog(const cv::gpu::PtrStepSzb& gray, cv::gpu::PtrStepSzb mag, const int bins);
void shrink(const cv::gpu::PtrStepSzb& channels, cv::gpu::PtrStepSzb shrunk);
}
void shfl_integral(const cv::gpu::PtrStepSzb& img, cv::gpu::PtrStepSz<unsigned int> integral, cudaStream_t stream);
}}}
struct cv::gpu::SCascade::Fields
struct cv::softcascade::SCascade::Fields
{
static Fields* parseCascade(const FileNode &root, const float mins, const float maxs, const int totals, const int method)
{
@@ -138,11 +153,9 @@ struct cv::gpu::SCascade::Fields
FileNode fn = root[SC_OCTAVES];
if (fn.empty()) return 0;
using namespace device::icf;
std::vector<Octave> voctaves;
std::vector<device::Octave> voctaves;
std::vector<float> vstages;
std::vector<Node> vnodes;
std::vector<device::Node> vnodes;
std::vector<float> vleaves;
FileNodeIterator it = fn.begin(), it_end = fn.end();
@@ -158,7 +171,7 @@ struct cv::gpu::SCascade::Fields
size.x = cvRound(origWidth * scale);
size.y = cvRound(origHeight * scale);
Octave octave(octIndex, nweaks, shrinkage, size, scale);
device::Octave octave(octIndex, nweaks, shrinkage, size, scale);
CV_Assert(octave.stages > 0);
voctaves.push_back(octave);
@@ -227,7 +240,7 @@ struct cv::gpu::SCascade::Fields
rect.w = saturate_cast<uchar>(r.height);
unsigned int channel = saturate_cast<unsigned int>(feature_channels[featureIdx]);
vnodes.push_back(Node(rect, channel, th));
vnodes.push_back(device::Node(rect, channel, th));
}
intfns = octfn[SC_LEAF];
@@ -239,13 +252,13 @@ struct cv::gpu::SCascade::Fields
}
}
cv::Mat hoctaves(1, (int) (voctaves.size() * sizeof(Octave)), CV_8UC1, (uchar*)&(voctaves[0]));
cv::Mat hoctaves(1, (int) (voctaves.size() * sizeof(device::Octave)), CV_8UC1, (uchar*)&(voctaves[0]));
CV_Assert(!hoctaves.empty());
cv::Mat hstages(cv::Mat(vstages).reshape(1,1));
CV_Assert(!hstages.empty());
cv::Mat hnodes(1, (int) (vnodes.size() * sizeof(Node)), CV_8UC1, (uchar*)&(vnodes[0]) );
cv::Mat hnodes(1, (int) (vnodes.size() * sizeof(device::Node)), CV_8UC1, (uchar*)&(vnodes[0]) );
CV_Assert(!hnodes.empty());
cv::Mat hleaves(cv::Mat(vleaves).reshape(1,1));
@@ -272,8 +285,7 @@ struct cv::gpu::SCascade::Fields
int createLevels(const int fh, const int fw)
{
using namespace device::icf;
std::vector<Level> vlevels;
std::vector<device::Level> vlevels;
float logFactor = (::log(maxScale) - ::log(minScale)) / (totals -1);
float scale = minScale;
@@ -286,7 +298,7 @@ struct cv::gpu::SCascade::Fields
float logScale = ::log(scale);
int fit = fitOctave(voctaves, logScale);
Level level(fit, voctaves[fit], scale, width, height);
device::Level level(fit, voctaves[fit], scale, width, height);
if (!width || !height)
break;
@@ -300,7 +312,7 @@ struct cv::gpu::SCascade::Fields
scale = ::std::min(maxScale, ::expf(::log(scale) + logFactor));
}
cv::Mat hlevels = cv::Mat(1, (int) (vlevels.size() * sizeof(Level)), CV_8UC1, (uchar*)&(vlevels[0]) );
cv::Mat hlevels = cv::Mat(1, (int) (vlevels.size() * sizeof(device::Level)), CV_8UC1, (uchar*)&(vlevels[0]) );
CV_Assert(!hlevels.empty());
levels.upload(hlevels);
downscales = dcs;
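
Editor's note: createLevels above walks a geometric scale ladder: logFactor is the constant log-step between consecutive scales, so each next scale is exp(log(scale) + logFactor), clamped at maxScale. A minimal host-side sketch of the ladder (the minScale, maxScale, and totals values are assumptions chosen for illustration):

#include <algorithm>
#include <cmath>
#include <cstdio>

int main()
{
    const float minScale = 0.4f, maxScale = 5.0f; // assumed detector range
    const int totals = 55;                        // assumed number of scales

    const float logFactor = (std::log(maxScale) - std::log(minScale)) / (totals - 1);

    float scale = minScale;
    for (int sc = 0; sc < totals; ++sc)
    {
        std::printf("level %2d: scale %.4f\n", sc, scale);
        scale = std::min(maxScale, std::exp(std::log(scale) + logFactor));
    }
    return 0;
}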
@@ -334,7 +346,7 @@ struct cv::gpu::SCascade::Fields
preprocessor = ChannelsProcessor::create(shrinkage, 6, method);
}
void detect(cv::gpu::GpuMat& objects, Stream& s) const
void detect(cv::gpu::GpuMat& objects, cv::gpu::Stream& s) const
{
if (s)
s.enqueueMemSet(objects, 0);
@@ -343,16 +355,16 @@ struct cv::gpu::SCascade::Fields
cudaSafeCall( cudaGetLastError());
device::icf::CascadeInvoker<device::icf::GK107PolicyX4> invoker
= device::icf::CascadeInvoker<device::icf::GK107PolicyX4>(levels, stages, nodes, leaves);
device::CascadeInvoker<device::GK107PolicyX4> invoker
= device::CascadeInvoker<device::GK107PolicyX4>(levels, stages, nodes, leaves);
cudaStream_t stream = StreamAccessor::getStream(s);
cudaStream_t stream = cv::gpu::StreamAccessor::getStream(s);
invoker(mask, hogluv, objects, downscales, stream);
}
void suppress(GpuMat& objects, Stream& s)
void suppress(cv::gpu::GpuMat& objects, cv::gpu::Stream& s)
{
GpuMat ndetections = GpuMat(objects, cv::Rect(0, 0, sizeof(Detection), 1));
cv::gpu::GpuMat ndetections = cv::gpu::GpuMat(objects, cv::Rect(0, 0, sizeof(Detection), 1));
ensureSizeIsEnough(objects.rows, objects.cols, CV_8UC1, overlaps);
if (s)
@@ -366,20 +378,20 @@ struct cv::gpu::SCascade::Fields
suppressed.setTo(0);
}
cudaStream_t stream = StreamAccessor::getStream(s);
device::icf::suppress(objects, overlaps, ndetections, suppressed, stream);
cudaStream_t stream = cv::gpu::StreamAccessor::getStream(s);
device::suppress(objects, overlaps, ndetections, suppressed, stream);
}
private:
typedef std::vector<device::icf::Octave>::const_iterator octIt_t;
static int fitOctave(const std::vector<device::icf::Octave>& octs, const float& logFactor)
typedef std::vector<device::Octave>::const_iterator octIt_t;
static int fitOctave(const std::vector<device::Octave>& octs, const float& logFactor)
{
float minAbsLog = FLT_MAX;
int res = 0;
for (int oct = 0; oct < (int)octs.size(); ++oct)
{
const device::icf::Octave& octave =octs[oct];
const device::Octave& octave =octs[oct];
float logOctave = ::log(octave.scale);
float logAbsScale = ::fabs(logFactor - logOctave);
@@ -410,37 +422,37 @@ public:
// 160x120x10
GpuMat shrunk;
cv::gpu::GpuMat shrunk;
// temporal mat for integral
GpuMat integralBuffer;
cv::gpu::GpuMat integralBuffer;
// 161x121x10
GpuMat hogluv;
cv::gpu::GpuMat hogluv;
// used for suppression
GpuMat suppressed;
cv::gpu::GpuMat suppressed;
// used for area overlap computing during
GpuMat overlaps;
cv::gpu::GpuMat overlaps;
// Cascade from xml
GpuMat octaves;
GpuMat stages;
GpuMat nodes;
GpuMat leaves;
GpuMat levels;
cv::gpu::GpuMat octaves;
cv::gpu::GpuMat stages;
cv::gpu::GpuMat nodes;
cv::gpu::GpuMat leaves;
cv::gpu::GpuMat levels;
// For ROI
GpuMat mask;
GpuMat genRoiTmp;
cv::gpu::GpuMat mask;
cv::gpu::GpuMat genRoiTmp;
// GpuMat collected;
// cv::gpu::GpuMat collected;
std::vector<device::icf::Octave> voctaves;
std::vector<device::Octave> voctaves;
// DeviceInfo info;
@@ -453,19 +465,58 @@ public:
};
};
cv::gpu::SCascade::SCascade(const double mins, const double maxs, const int sc, const int fl)
cv::softcascade::SCascade::SCascade(const double mins, const double maxs, const int sc, const int fl)
: fields(0), minScale(mins), maxScale(maxs), scales(sc), flags(fl) {}
cv::gpu::SCascade::~SCascade() { delete fields; }
cv::softcascade::SCascade::~SCascade() { delete fields; }
bool cv::gpu::SCascade::load(const FileNode& fn)
bool cv::softcascade::SCascade::load(const FileNode& fn)
{
if (fields) delete fields;
fields = Fields::parseCascade(fn, (float)minScale, (float)maxScale, scales, flags);
return fields != 0;
}
void cv::gpu::SCascade::detect(InputArray _image, InputArray _rois, OutputArray _objects, Stream& s) const
namespace {
void integral(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& sum, cv::gpu::GpuMat& buffer, cv::gpu::Stream& s)
{
CV_Assert(src.type() == CV_8UC1);
cudaStream_t stream = cv::gpu::StreamAccessor::getStream(s);
cv::Size whole;
cv::Point offset;
src.locateROI(whole, offset);
if (cv::gpu::deviceSupports(cv::gpu::WARP_SHUFFLE_FUNCTIONS) && src.cols <= 2048
&& offset.x % 16 == 0 && ((src.cols + 63) / 64) * 64 <= (static_cast<int>(src.step) - offset.x))
{
ensureSizeIsEnough(((src.rows + 7) / 8) * 8, ((src.cols + 63) / 64) * 64, CV_32SC1, buffer);
cv::softcascade::device::shfl_integral(src, buffer, stream);
sum.create(src.rows + 1, src.cols + 1, CV_32SC1);
if (s)
s.enqueueMemSet(sum, cv::Scalar::all(0));
else
sum.setTo(cv::Scalar::all(0));
cv::gpu::GpuMat inner = sum(cv::Rect(1, 1, src.cols, src.rows));
cv::gpu::GpuMat res = buffer(cv::Rect(0, 0, src.cols, src.rows));
if (s)
s.enqueueCopy(res, inner);
else
res.copyTo(inner);
}
else {CV_Error(CV_GpuNotSupported, ": CC 3.x required.");}
}
}
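
Editor's note: the local integral() above takes the warp-shuffle path only under a set of preconditions: shuffle support on the device, at most 2048 columns, a 16-aligned ROI x-offset, and a row pitch wide enough for the 64-column padding. A hedged restatement of that predicate (the function and parameter names are illustrative; stepBytes is the GpuMat pitch in bytes of the CV_8UC1 source):

#include <opencv2/gpu.hpp>

bool canUseShflIntegral(int cols, int offsetX, int stepBytes)
{
    const bool hasShfl = cv::gpu::deviceSupports(cv::gpu::WARP_SHUFFLE_FUNCTIONS);
    const bool narrow  = cols <= 2048;
    const bool aligned = (offsetX % 16) == 0;
    const bool rowFits = ((cols + 63) / 64) * 64 <= (stepBytes - offsetX);
    return hasShfl && narrow && aligned && rowFits;
}

Anything failing this check falls through to the CV_Error branch above, so callers effectively need a compute capability 3.x device.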
void cv::softcascade::SCascade::detect(InputArray _image, InputArray _rois, OutputArray _objects, cv::gpu::Stream& s) const
{
CV_Assert(fields);
@@ -473,11 +524,11 @@ void cv::gpu::SCascade::detect(InputArray _image, InputArray _rois, OutputArray
int type = _image.type();
CV_Assert(type == CV_8UC3 || type == CV_32SC1 || (!_rois.empty()));
const GpuMat image = _image.getGpuMat();
const cv::gpu::GpuMat image = _image.getGpuMat();
if (_objects.empty()) _objects.create(1, 4096 * sizeof(Detection), CV_8UC1);
GpuMat rois = _rois.getGpuMat(), objects = _objects.getGpuMat();
cv::gpu::GpuMat rois = _rois.getGpuMat(), objects = _objects.getGpuMat();
/// roi
Fields& flds = *fields;
@@ -485,8 +536,8 @@ void cv::gpu::SCascade::detect(InputArray _image, InputArray _rois, OutputArray
flds.mask.create( rois.cols / shr, rois.rows / shr, rois.type());
cv::gpu::resize(rois, flds.genRoiTmp, cv::Size(), 1.f / shr, 1.f / shr, CV_INTER_AREA, s);
cv::gpu::transpose(flds.genRoiTmp, flds.mask, s);
device::shrink(rois, flds.mask);
//cv::gpu::transpose(flds.genRoiTmp, flds.mask, s);
if (type == CV_8UC3)
{
@@ -496,7 +547,7 @@ void cv::gpu::SCascade::detect(InputArray _image, InputArray _rois, OutputArray
flds.createLevels(image.rows, image.cols);
flds.preprocessor->apply(image, flds.shrunk);
cv::gpu::integralBuffered(flds.shrunk, flds.hogluv, flds.integralBuffer, s);
integral(flds.shrunk, flds.hogluv, flds.integralBuffer, s);
}
else
{
@@ -510,13 +561,13 @@ void cv::gpu::SCascade::detect(InputArray _image, InputArray _rois, OutputArray
if ( (flags && NMS_MASK) != NO_REJECT)
{
GpuMat spr(objects, cv::Rect(0, 0, flds.suppressed.cols, flds.suppressed.rows));
cv::gpu::GpuMat spr(objects, cv::Rect(0, 0, flds.suppressed.cols, flds.suppressed.rows));
flds.suppress(objects, s);
flds.suppressed.copyTo(spr);
}
}
void cv::gpu::SCascade::read(const FileNode& fn)
void cv::softcascade::SCascade::read(const FileNode& fn)
{
Algorithm::read(fn);
}
@@ -528,7 +579,7 @@ using cv::OutputArray;
using cv::gpu::Stream;
using cv::gpu::GpuMat;
inline void setZero(cv::gpu::GpuMat& m, Stream& s)
inline void setZero(cv::gpu::GpuMat& m, cv::gpu::Stream& s)
{
if (s)
s.enqueueMemSet(m, 0);
@@ -536,144 +587,49 @@ inline void setZero(cv::gpu::GpuMat& m, Stream& s)
m.setTo(0);
}
struct GenricPreprocessor : public cv::gpu::ChannelsProcessor
struct SeparablePreprocessor : public cv::softcascade::ChannelsProcessor
{
GenricPreprocessor(const int s, const int b) : cv::gpu::ChannelsProcessor(), shrinkage(s), bins(b) {}
virtual ~GenricPreprocessor() {}
virtual void apply(InputArray _frame, OutputArray _shrunk, Stream& s = Stream::Null())
{
const GpuMat frame = _frame.getGpuMat();
_shrunk.create(frame.rows * (4 + bins) / shrinkage, frame.cols / shrinkage, CV_8UC1);
GpuMat shrunk = _shrunk.getGpuMat();
channels.create(frame.rows * (4 + bins), frame.cols, CV_8UC1);
setZero(channels, s);
cv::gpu::cvtColor(frame, gray, CV_BGR2GRAY, s);
createHogBins(s);
createLuvBins(frame, s);
cv::gpu::resize(channels, shrunk, cv::Size(), 1.f / shrinkage, 1.f / shrinkage, CV_INTER_AREA, s);
}
private:
void createHogBins(Stream& s)
{
static const int fw = gray.cols;
static const int fh = gray.rows;
fplane.create(fh * HOG_BINS, fw, CV_32FC1);
GpuMat dfdx(fplane, cv::Rect(0, 0, fw, fh));
GpuMat dfdy(fplane, cv::Rect(0, fh, fw, fh));
cv::gpu::Sobel(gray, dfdx, CV_32F, 1, 0, sobelBuf, 3, 1, cv::BORDER_DEFAULT, -1, s);
cv::gpu::Sobel(gray, dfdy, CV_32F, 0, 1, sobelBuf, 3, 1, cv::BORDER_DEFAULT, -1, s);
GpuMat mag(fplane, cv::Rect(0, 2 * fh, fw, fh));
GpuMat ang(fplane, cv::Rect(0, 3 * fh, fw, fh));
cv::gpu::cartToPolar(dfdx, dfdy, mag, ang, true, s);
// normalize magnitude to uchar interval and angles to 6 bins
GpuMat nmag(fplane, cv::Rect(0, 4 * fh, fw, fh));
GpuMat nang(fplane, cv::Rect(0, 5 * fh, fw, fh));
cv::gpu::multiply(mag, cv::Scalar::all(1.f / (8 *::log(2.0f))), nmag, 1, -1, s);
cv::gpu::multiply(ang, cv::Scalar::all(1.f / 60.f), nang, 1, -1, s);
//create uchar magnitude
GpuMat cmag(channels, cv::Rect(0, fh * HOG_BINS, fw, fh));
if (s)
s.enqueueConvert(nmag, cmag, CV_8UC1);
else
nmag.convertTo(cmag, CV_8UC1);
cudaStream_t stream = cv::gpu::StreamAccessor::getStream(s);
cv::gpu::device::icf::fillBins(channels, nang, fw, fh, HOG_BINS, stream);
}
void createLuvBins(const cv::gpu::GpuMat& colored, Stream& s)
{
static const int fw = colored.cols;
static const int fh = colored.rows;
cv::gpu::cvtColor(colored, luv, CV_BGR2Luv, s);
std::vector<GpuMat> splited;
for(int i = 0; i < LUV_BINS; ++i)
{
splited.push_back(GpuMat(channels, cv::Rect(0, fh * (7 + i), fw, fh)));
}
cv::gpu::split(luv, splited, s);
}
enum {HOG_BINS = 6, LUV_BINS = 3};
const int shrinkage;
const int bins;
GpuMat gray;
GpuMat luv;
GpuMat channels;
// preallocated buffer for floating point operations
GpuMat fplane;
GpuMat sobelBuf;
};
struct SeparablePreprocessor : public cv::gpu::ChannelsProcessor
{
SeparablePreprocessor(const int s, const int b) : cv::gpu::ChannelsProcessor(), shrinkage(s), bins(b) {}
SeparablePreprocessor(const int s, const int b) : cv::softcascade::ChannelsProcessor(), shrinkage(s), bins(b) {}
virtual ~SeparablePreprocessor() {}
virtual void apply(InputArray _frame, OutputArray _shrunk, Stream& s = Stream::Null())
virtual void apply(InputArray _frame, OutputArray _shrunk, cv::gpu::Stream& s = cv::gpu::Stream::Null())
{
const GpuMat frame = _frame.getGpuMat();
cv::gpu::GaussianBlur(frame, bgr, cv::Size(3, 3), -1.0);
bgr = _frame.getGpuMat();
//cv::gpu::GaussianBlur(frame, bgr, cv::Size(3, 3), -1.0);
_shrunk.create(frame.rows * (4 + bins) / shrinkage, frame.cols / shrinkage, CV_8UC1);
GpuMat shrunk = _shrunk.getGpuMat();
_shrunk.create(bgr.rows * (4 + bins) / shrinkage, bgr.cols / shrinkage, CV_8UC1);
cv::gpu::GpuMat shrunk = _shrunk.getGpuMat();
channels.create(frame.rows * (4 + bins), frame.cols, CV_8UC1);
channels.create(bgr.rows * (4 + bins), bgr.cols, CV_8UC1);
setZero(channels, s);
cv::gpu::cvtColor(bgr, gray, CV_BGR2GRAY);
cv::gpu::device::icf::gray2hog(gray, channels(cv::Rect(0, 0, bgr.cols, bgr.rows * (bins + 1))), bins);
gray.create(bgr.size(), CV_8UC1);
cv::softcascade::device::transform(bgr, gray); //cv::gpu::cvtColor(bgr, gray, CV_BGR2GRAY);
cv::softcascade::device::gray2hog(gray, channels(cv::Rect(0, 0, bgr.cols, bgr.rows * (bins + 1))), bins);
cv::gpu::GpuMat luv(channels, cv::Rect(0, bgr.rows * (bins + 1), bgr.cols, bgr.rows * 3));
cv::gpu::device::icf::bgr2Luv(bgr, luv);
cv::gpu::device::icf::shrink(channels, shrunk);
cv::softcascade::device::bgr2Luv(bgr, luv);
cv::softcascade::device::shrink(channels, shrunk);
}
private:
const int shrinkage;
const int bins;
GpuMat bgr;
GpuMat gray;
GpuMat channels;
cv::gpu::GpuMat bgr;
cv::gpu::GpuMat gray;
cv::gpu::GpuMat channels;
};
}
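
Editor's note: SeparablePreprocessor stacks all per-pixel channels vertically in a single CV_8UC1 image: bins HOG orientation planes plus one gradient-magnitude plane (filled by gray2hog), followed by the three Luv planes (filled by bgr2Luv), for rows * (4 + bins) rows in total before shrinking. A hedged sketch of those row offsets (the exact plane order inside the gray2hog region is an assumption):

struct ChannelLayout
{
    int fh;   // frame height in pixels
    int bins; // HOG orientation bins (6 in this module)

    int hogStart()  const { return 0; }                // bins orientation planes + magnitude
    int luvStart()  const { return (bins + 1) * fh; }  // L, u, v planes follow
    int totalRows() const { return (4 + bins) * fh; }  // (bins + 1) + 3 planes
};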
cv::Ptr<cv::gpu::ChannelsProcessor> cv::gpu::ChannelsProcessor::create(const int s, const int b, const int m)
cv::Ptr<cv::softcascade::ChannelsProcessor> cv::softcascade::ChannelsProcessor::create(const int s, const int b, const int m)
{
CV_Assert((m && SEPARABLE) || (m && GENERIC));
if (m && GENERIC)
return cv::Ptr<cv::gpu::ChannelsProcessor>(new GenricPreprocessor(s, b));
return cv::Ptr<cv::gpu::ChannelsProcessor>(new SeparablePreprocessor(s, b));
CV_Assert((m && SEPARABLE));
return cv::Ptr<cv::softcascade::ChannelsProcessor>(new SeparablePreprocessor(s, b));
}
cv::gpu::ChannelsProcessor::ChannelsProcessor() { }
cv::gpu::ChannelsProcessor::~ChannelsProcessor() { }
cv::softcascade::ChannelsProcessor::ChannelsProcessor() { }
cv::softcascade::ChannelsProcessor::~ChannelsProcessor() { }
#endif
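
Editor's note: with the move into the softcascade module, channel computation is reachable through the public factory above. A hedged usage sketch (the image path is a placeholder, and the SEPARABLE constant is assumed to be a ChannelsProcessor enum member, as the create() implementation suggests; shrinkage 4 and 6 bins mirror what SCascade::Fields passes):

#include <opencv2/softcascade.hpp>
#include <opencv2/highgui.hpp>

int main()
{
    cv::Mat frame = cv::imread("frame.png"); // placeholder input
    cv::gpu::GpuMat dframe(frame), shrunk;

    cv::Ptr<cv::softcascade::ChannelsProcessor> proc =
        cv::softcascade::ChannelsProcessor::create(
            4, 6, cv::softcascade::ChannelsProcessor::SEPARABLE);

    proc->apply(dframe, shrunk); // shrunk: stacked channels ready for integration
    return 0;
}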

View File

@@ -56,6 +56,7 @@
namespace cv { namespace softcascade { namespace internal
{
namespace rnd {
typedef cv::RNG_MT19937 engine;

View File

@@ -51,11 +51,34 @@ CV_INIT_ALGORITHM(Detector, "SoftCascade.Detector",
obj.info()->addParam(obj, "scales", obj.scales);
obj.info()->addParam(obj, "rejCriteria", obj.rejCriteria));
CV_INIT_ALGORITHM(SCascade, "CascadeDetector.SCascade",
obj.info()->addParam(obj, "minScale", obj.minScale);
obj.info()->addParam(obj, "maxScale", obj.maxScale);
obj.info()->addParam(obj, "scales", obj.scales));
bool initModule_softcascade(void)
{
Ptr<Algorithm> sc = createSCascade();
Ptr<Algorithm> sc1 = createDetector();
return (sc1->info() != 0);
return (sc1->info() != 0) && (sc->info() != 0);
}
namespace internal {
void error(const char *error_string, const char *file, const int line, const char *func)
{
int code = CV_GpuApiCallError;
if (std::uncaught_exception())
{
const char* errorStr = cvErrorStr(code);
const char* function = func ? func : "unknown function";
std::cerr << "OpenCV Error: " << errorStr << "(" << error_string << ") in " << function << ", file " << file << ", line " << line;
std::cerr.flush();
}
else
cv::error( cv::Exception(code, error_string, func, file, line) );
}
}
} }

View File

@@ -41,10 +41,11 @@
//M*/
#include "test_precomp.hpp"
#include "opencv2/core/gpumat.hpp"
#ifdef HAVE_CUDA
using cv::gpu::GpuMat;
using std::tr1::get;
// show detection results on input image with cv::imshow
//#define SHOW_DETECTIONS
@@ -59,7 +60,7 @@ using cv::gpu::GpuMat;
static std::string path(std::string relative)
{
return cvtest::TS::ptr()->get_data_path() + "../cv/cascadeandhog/" + relative;
return cvtest::TS::ptr()->get_data_path() + "cascadeandhog/" + relative;
}
TEST(SCascadeTest, readCascade)
@@ -67,7 +68,7 @@ TEST(SCascadeTest, readCascade)
std::string xml = path("cascades/inria_caltech-17.01.2013.xml");
cv::FileStorage fs(xml, cv::FileStorage::READ);
cv::gpu::SCascade cascade;
cv::softcascade::SCascade cascade;
ASSERT_TRUE(fs.isOpened());
ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode()));
@@ -75,7 +76,7 @@ TEST(SCascadeTest, readCascade)
namespace
{
typedef cv::gpu::SCascade::Detection Detection;
typedef cv::softcascade::SCascade::Detection Detection;
cv::Rect getFromTable(int idx)
{
@@ -97,7 +98,6 @@ namespace
return rois[idx];
}
void print(std::ostream &out, const Detection& d)
{
#if defined SHOW_DETECTIONS
@@ -156,36 +156,36 @@ namespace
#endif
}
PARAM_TEST_CASE(SCascadeTestRoi, cv::gpu::DeviceInfo, std::string, std::string, int)
class SCascadeTestRoi : public ::testing::TestWithParam<std::tr1::tuple<cv::gpu::DeviceInfo, std::string, std::string, int> >
{
virtual void SetUp()
{
cv::gpu::setDevice(GET_PARAM(0).deviceID());
cv::gpu::setDevice(get<0>(GetParam()).deviceID());
}
};
GPU_TEST_P(SCascadeTestRoi, Detect)
TEST_P(SCascadeTestRoi, Detect)
{
cv::Mat coloredCpu = cv::imread(path(GET_PARAM(2)));
cv::Mat coloredCpu = cv::imread(path(get<2>(GetParam())));
ASSERT_FALSE(coloredCpu.empty());
cv::gpu::SCascade cascade;
cv::softcascade::SCascade cascade;
cv::FileStorage fs(path(GET_PARAM(1)), cv::FileStorage::READ);
cv::FileStorage fs(path(get<1>(GetParam())), cv::FileStorage::READ);
ASSERT_TRUE(fs.isOpened());
ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode()));
GpuMat colored(coloredCpu), objectBoxes(1, 16384, CV_8UC1), rois(colored.size(), CV_8UC1);
cv::gpu::GpuMat colored(coloredCpu), objectBoxes(1, 16384, CV_8UC1), rois(colored.size(), CV_8UC1);
rois.setTo(0);
int nroi = GET_PARAM(3);
int nroi = get<3>(GetParam());
cv::Mat result(coloredCpu);
cv::RNG rng;
for (int i = 0; i < nroi; ++i)
{
cv::Rect r = getFromTable(rng(10));
GpuMat sub(rois, r);
cv::gpu::GpuMat sub(rois, r);
sub.setTo(1);
cv::rectangle(result, r, cv::Scalar(0, 0, 255, 255), 1);
}
@@ -194,7 +194,7 @@ GPU_TEST_P(SCascadeTestRoi, Detect)
cascade.detect(colored, rois, objectBoxes);
cv::Mat dt(objectBoxes);
typedef cv::gpu::SCascade::Detection Detection;
typedef cv::softcascade::SCascade::Detection Detection;
Detection* dts = ((Detection*)dt.data) + 1;
int* count = dt.ptr<int>(0);
@@ -211,15 +211,13 @@ GPU_TEST_P(SCascadeTestRoi, Detect)
SHOW(result);
}
INSTANTIATE_TEST_CASE_P(GPU_SoftCascade, SCascadeTestRoi, testing::Combine(
INSTANTIATE_TEST_CASE_P(cuda_accelerated, SCascadeTestRoi, testing::Combine(
ALL_DEVICES,
testing::Values(std::string("cascades/inria_caltech-17.01.2013.xml"),
std::string("cascades/sc_cvpr_2012_to_opencv_new_format.xml")),
testing::Values(std::string("images/image_00000000_0.png")),
testing::Range(0, 5)));
////////////////////////////////////////
namespace {
struct Fixture
@@ -232,23 +230,24 @@ struct Fixture
};
}
PARAM_TEST_CASE(SCascadeTestAll, cv::gpu::DeviceInfo, Fixture)
typedef std::tr1::tuple<cv::gpu::DeviceInfo, Fixture> SCascadeTestAllFixture;
class SCascadeTestAll : public ::testing::TestWithParam<SCascadeTestAllFixture>
{
protected:
std::string xml;
int expected;
virtual void SetUp()
{
cv::gpu::setDevice(GET_PARAM(0).deviceID());
xml = path(GET_PARAM(1).path);
expected = GET_PARAM(1).expected;
cv::gpu::setDevice(get<0>(GetParam()).deviceID());
xml = path(get<1>(GetParam()).path);
expected = get<1>(GetParam()).expected;
}
};
GPU_TEST_P(SCascadeTestAll, detect)
TEST_P(SCascadeTestAll, detect)
{
cv::gpu::SCascade cascade;
cv::softcascade::SCascade cascade;
cv::FileStorage fs(xml, cv::FileStorage::READ);
ASSERT_TRUE(fs.isOpened());
@@ -258,12 +257,12 @@ GPU_TEST_P(SCascadeTestAll, detect)
cv::Mat coloredCpu = cv::imread(path("images/image_00000000_0.png"));
ASSERT_FALSE(coloredCpu.empty());
GpuMat colored(coloredCpu), objectBoxes, rois(colored.size(), CV_8UC1);
cv::gpu::GpuMat colored(coloredCpu), objectBoxes, rois(colored.size(), CV_8UC1);
rois.setTo(1);
cascade.detect(colored, rois, objectBoxes);
typedef cv::gpu::SCascade::Detection Detection;
typedef cv::softcascade::SCascade::Detection Detection;
cv::Mat dt(objectBoxes);
@@ -283,9 +282,9 @@ GPU_TEST_P(SCascadeTestAll, detect)
ASSERT_EQ(*count, expected);
}
GPU_TEST_P(SCascadeTestAll, detectStream)
TEST_P(SCascadeTestAll, detectStream)
{
cv::gpu::SCascade cascade;
cv::softcascade::SCascade cascade;
cv::FileStorage fs(xml, cv::FileStorage::READ);
ASSERT_TRUE(fs.isOpened());
@@ -295,7 +294,7 @@ GPU_TEST_P(SCascadeTestAll, detectStream)
cv::Mat coloredCpu = cv::imread(path("images/image_00000000_0.png"));
ASSERT_FALSE(coloredCpu.empty());
GpuMat colored(coloredCpu), objectBoxes(1, 100000, CV_8UC1), rois(colored.size(), CV_8UC1);
cv::gpu::GpuMat colored(coloredCpu), objectBoxes(1, 100000, CV_8UC1), rois(colored.size(), CV_8UC1);
rois.setTo(cv::Scalar::all(1));
cv::gpu::Stream s;
@@ -304,14 +303,14 @@ GPU_TEST_P(SCascadeTestAll, detectStream)
cascade.detect(colored, rois, objectBoxes, s);
s.waitForCompletion();
typedef cv::gpu::SCascade::Detection Detection;
typedef cv::softcascade::SCascade::Detection Detection;
cv::Mat detections(objectBoxes);
int a = *(detections.ptr<int>(0));
ASSERT_EQ(a, expected);
}
INSTANTIATE_TEST_CASE_P(GPU_SoftCascade, SCascadeTestAll, testing::Combine( ALL_DEVICES,
INSTANTIATE_TEST_CASE_P(cuda_accelerated, SCascadeTestAll, testing::Combine( ALL_DEVICES,
testing::Values(Fixture("cascades/inria_caltech-17.01.2013.xml", 7),
Fixture("cascades/sc_cvpr_2012_to_opencv_new_format.xml", 1291))));
#endif
#endif

View File

@@ -42,4 +42,4 @@
#include "test_precomp.hpp"
CV_TEST_MAIN("cv")
CV_TEST_MAIN("cv")

View File

@@ -55,5 +55,6 @@
# include "opencv2/softcascade.hpp"
# include "opencv2/imgproc.hpp"
# include "opencv2/highgui.hpp"
# include "utility.hpp"
#endif

View File

@@ -0,0 +1,109 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// Intel License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of Intel Corporation may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "test_precomp.hpp"
#ifdef HAVE_CUDA
using namespace std;
using namespace cv;
using namespace cv::gpu;
using namespace cvtest;
using namespace testing;
using namespace testing::internal;
//////////////////////////////////////////////////////////////////////
// Gpu devices
bool supportFeature(const DeviceInfo& info, FeatureSet feature)
{
return TargetArchs::builtWith(feature) && info.supports(feature);
}
DeviceManager& DeviceManager::instance()
{
static DeviceManager obj;
return obj;
}
void DeviceManager::load(int i)
{
devices_.clear();
devices_.reserve(1);
std::ostringstream msg;
if (i < 0 || i >= getCudaEnabledDeviceCount())
{
msg << "Incorrect device number - " << i;
CV_Error(CV_StsBadArg, msg.str());
}
DeviceInfo info(i);
if (!info.isCompatible())
{
msg << "Device " << i << " [" << info.name() << "] is NOT compatible with current GPU module build";
CV_Error(CV_StsBadArg, msg.str());
}
devices_.push_back(info);
}
void DeviceManager::loadAll()
{
int deviceCount = getCudaEnabledDeviceCount();
devices_.clear();
devices_.reserve(deviceCount);
for (int i = 0; i < deviceCount; ++i)
{
DeviceInfo info(i);
if (info.isCompatible())
{
devices_.push_back(info);
}
}
}
#endif // HAVE_CUDA

View File

@@ -0,0 +1,75 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// Intel License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of Intel Corporation may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_SOFTCASCADE_TEST_UTILITY_HPP__
#define __OPENCV_SOFTCASCADE_TEST_UTILITY_HPP__
#include "opencv2/core.hpp"
#include "opencv2/core/gpumat.hpp"
#include "opencv2/ts.hpp"
//////////////////////////////////////////////////////////////////////
// Gpu devices
//! return true if device supports specified feature and gpu module was built with support the feature.
bool supportFeature(const cv::gpu::DeviceInfo& info, cv::gpu::FeatureSet feature);
#if defined(HAVE_CUDA)
class DeviceManager
{
public:
static DeviceManager& instance();
void load(int i);
void loadAll();
const std::vector<cv::gpu::DeviceInfo>& values() const { return devices_; }
private:
std::vector<cv::gpu::DeviceInfo> devices_;
DeviceManager() {loadAll();}
};
# define ALL_DEVICES testing::ValuesIn(DeviceManager::instance().values())
#else
# define ALL_DEVICES testing::ValuesIn(std::vector<cv::gpu::DeviceInfo>())
#endif
#endif // __OPENCV_SOFTCASCADE_TEST_UTILITY_HPP__
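
Editor's note: DeviceManager is what feeds the ALL_DEVICES parameter generator above. A hedged sketch of consuming it directly outside googletest, assuming this utility header is included (the function name is illustrative):

#include <iostream>
#include <vector>

void listShuffleCapableDevices()
{
    const std::vector<cv::gpu::DeviceInfo>& devs = DeviceManager::instance().values();
    for (size_t i = 0; i < devs.size(); ++i)
    {
        // supportFeature also checks the binary was built for the device arch
        if (supportFeature(devs[i], cv::gpu::WARP_SHUFFLE_FUNCTIONS))
            std::cout << devs[i].name() << " supports warp shuffle" << std::endl;
    }
}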

View File

@@ -1,7 +1,7 @@
#include "opencv2/imgproc/imgproc.hpp"
#include "opencv2/objdetect/objdetect.hpp"
#include "opencv2/highgui/highgui.hpp"
#include <opencv2/softcascade/softcascade.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/objdetect.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/softcascade.hpp>
#include <iostream>
#include <vector>

View File

@@ -1,7 +1,7 @@
SET(OPENCV_GPU_SAMPLES_REQUIRED_DEPS opencv_core opencv_flann opencv_imgproc opencv_highgui
opencv_ml opencv_video opencv_objdetect opencv_features2d
opencv_calib3d opencv_legacy opencv_contrib opencv_gpu
opencv_nonfree)
opencv_nonfree opencv_softcascade)
ocv_check_dependencies(${OPENCV_GPU_SAMPLES_REQUIRED_DEPS})

View File

@@ -1,5 +1,6 @@
#include <opencv2/gpu/gpu.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/gpu.hpp>
#include <opencv2/softcascade.hpp>
#include <opencv2/highgui.hpp>
#include <iostream>
int main(int argc, char** argv)
@@ -46,7 +47,7 @@ int main(int argc, char** argv)
float maxScale = parser.get<float>("max_scale");
int scales = parser.get<int>("total_scales");
using cv::gpu::SCascade;
using cv::softcascade::SCascade;
SCascade cascade(minScale, maxScale, scales);
if (!cascade.load(fs.getFirstTopLevelNode()))
@@ -79,7 +80,7 @@ int main(int argc, char** argv)
cascade.detect(dframe, roi, objects);
cv::Mat dt(objects);
typedef cv::gpu::SCascade::Detection Detection;
typedef cv::softcascade::SCascade::Detection Detection;
Detection* dts = ((Detection*)dt.data) + 1;
int* count = dt.ptr<int>(0);
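
Editor's note: the objects buffer filled by detect() begins with an int hit counter, and the packed Detection records start one Detection-sized slot past it, which is why the sample offsets dt.data by one. A hedged continuation of the sample (frame is assumed to be the host-side cv::Mat of the current frame):

// Draw every reported detection; buffer layout matches the tests above.
for (int i = 0; i < *count; ++i)
{
    const Detection& d = dts[i];
    cv::rectangle(frame, cv::Rect(d.x, d.y, d.w, d.h),
                  cv::Scalar(0, 0, 255, 255), 1);
}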