Merge pull request #3600 from jet47:cuda-objdetect-module

2015-01-20 13:29:32 +00:00
parent 579ce93885 dccdadc38a
commit 95ecdc3af9
23 changed files with 2809 additions and 2397 deletions
--- a/modules/cuda/CMakeLists.txt
+++ b/modules/cuda/CMakeLists.txt
@@ -6,4 +6,4 @@ set(the_description "CUDA-accelerated Computer Vision")

 ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4100 /wd4324 /wd4512 /wd4515 -Wundef -Wmissing-declarations -Wshadow -Wunused-parameter)

-ocv_define_module(cuda opencv_calib3d opencv_objdetect opencv_cudaarithm opencv_cudawarping OPTIONAL opencv_cudalegacy)
+ocv_define_module(cuda opencv_calib3d opencv_cudaarithm opencv_cudawarping OPTIONAL opencv_cudalegacy)
--- a/modules/cuda/include/opencv2/cuda.hpp
+++ b/modules/cuda/include/opencv2/cuda.hpp
@@ -53,274 +53,11 @@
    @addtogroup cuda
    @{
        @defgroup cuda_calib3d Camera Calibration and 3D Reconstruction
-        @defgroup cuda_objdetect Object Detection
    @}
 */

 namespace cv { namespace cuda {

-//////////////// HOG (Histogram-of-Oriented-Gradients) Descriptor and Object Detector //////////////
-
-//! @addtogroup cuda_objdetect
-//! @{
-
-struct CV_EXPORTS HOGConfidence
-{
-   double scale;
-   std::vector<Point> locations;
-   std::vector<double> confidences;
-   std::vector<double> part_scores[4];
-};
-
-/** @brief The class implements Histogram of Oriented Gradients (@cite Dalal2005) object detector.
-
-Interfaces of all methods are kept similar to the CPU HOG descriptor and detector analogues as much
-as possible.
-
-@note
-   -   An example applying the HOG descriptor for people detection can be found at
-        opencv_source_code/samples/cpp/peopledetect.cpp
-    -   A CUDA example applying the HOG descriptor for people detection can be found at
-        opencv_source_code/samples/gpu/hog.cpp
-    -   (Python) An example applying the HOG descriptor for people detection can be found at
-        opencv_source_code/samples/python2/peopledetect.py
- */
-struct CV_EXPORTS HOGDescriptor
-{
-    enum { DEFAULT_WIN_SIGMA = -1 };
-    enum { DEFAULT_NLEVELS = 64 };
-    enum { DESCR_FORMAT_ROW_BY_ROW, DESCR_FORMAT_COL_BY_COL };
-
-    /** @brief Creates the HOG descriptor and detector.
-
-    @param win_size Detection window size. Align to block size and block stride.
-    @param block_size Block size in pixels. Align to cell size. Only (16,16) is supported for now.
-    @param block_stride Block stride. It must be a multiple of cell size.
-    @param cell_size Cell size. Only (8, 8) is supported for now.
-    @param nbins Number of bins. Only 9 bins per cell are supported for now.
-    @param win_sigma Gaussian smoothing window parameter.
-    @param threshold_L2hys L2-Hys normalization method shrinkage.
-    @param gamma_correction Flag to specify whether the gamma correction preprocessing is required or
-    not.
-    @param nlevels Maximum number of detection window increases.
-     */
-    HOGDescriptor(Size win_size=Size(64, 128), Size block_size=Size(16, 16),
-                  Size block_stride=Size(8, 8), Size cell_size=Size(8, 8),
-                  int nbins=9, double win_sigma=DEFAULT_WIN_SIGMA,
-                  double threshold_L2hys=0.2, bool gamma_correction=true,
-                  int nlevels=DEFAULT_NLEVELS);
-
-    /** @brief Returns the number of coefficients required for the classification.
-     */
-    size_t getDescriptorSize() const;
-    /** @brief Returns the block histogram size.
-    */
-    size_t getBlockHistogramSize() const;
-
-    /** @brief Sets coefficients for the linear SVM classifier.
-    */
-    void setSVMDetector(const std::vector<float>& detector);
-
-    /** @brief Returns coefficients of the classifier trained for people detection (for default window size).
-    */
-    static std::vector<float> getDefaultPeopleDetector();
-    /** @brief Returns coefficients of the classifier trained for people detection (for 48x96 windows).
-    */
-    static std::vector<float> getPeopleDetector48x96();
-    /** @brief Returns coefficients of the classifier trained for people detection (for 64x128 windows).
-    */
-    static std::vector<float> getPeopleDetector64x128();
-
-    /** @brief Performs object detection without a multi-scale window.
-
-    @param img Source image. CV_8UC1 and CV_8UC4 types are supported for now.
-    @param found_locations Left-top corner points of detected objects boundaries.
-    @param hit_threshold Threshold for the distance between features and SVM classifying plane.
-    Usually it is 0 and should be specfied in the detector coefficients (as the last free
-    coefficient). But if the free coefficient is omitted (which is allowed), you can specify it
-    manually here.
-    @param win_stride Window stride. It must be a multiple of block stride.
-    @param padding Mock parameter to keep the CPU interface compatibility. It must be (0,0).
-     */
-    void detect(const GpuMat& img, std::vector<Point>& found_locations,
-                double hit_threshold=0, Size win_stride=Size(),
-                Size padding=Size());
-
-    /** @brief Performs object detection with a multi-scale window.
-
-    @param img Source image. See cuda::HOGDescriptor::detect for type limitations.
-    @param found_locations Detected objects boundaries.
-    @param hit_threshold Threshold for the distance between features and SVM classifying plane. See
-    cuda::HOGDescriptor::detect for details.
-    @param win_stride Window stride. It must be a multiple of block stride.
-    @param padding Mock parameter to keep the CPU interface compatibility. It must be (0,0).
-    @param scale0 Coefficient of the detection window increase.
-    @param group_threshold Coefficient to regulate the similarity threshold. When detected, some
-    objects can be covered by many rectangles. 0 means not to perform grouping. See groupRectangles .
-     */
-    void detectMultiScale(const GpuMat& img, std::vector<Rect>& found_locations,
-                          double hit_threshold=0, Size win_stride=Size(),
-                          Size padding=Size(), double scale0=1.05,
-                          int group_threshold=2);
-
-    void computeConfidence(const GpuMat& img, std::vector<Point>& hits, double hit_threshold,
-                                                Size win_stride, Size padding, std::vector<Point>& locations, std::vector<double>& confidences);
-
-    void computeConfidenceMultiScale(const GpuMat& img, std::vector<Rect>& found_locations,
-                                                                    double hit_threshold, Size win_stride, Size padding,
-                                                                    std::vector<HOGConfidence> &conf_out, int group_threshold);
-
-    /** @brief Returns block descriptors computed for the whole image.
-
-    @param img Source image. See cuda::HOGDescriptor::detect for type limitations.
-    @param win_stride Window stride. It must be a multiple of block stride.
-    @param descriptors 2D array of descriptors.
-    @param descr_format Descriptor storage format:
-    -   **DESCR_FORMAT_ROW_BY_ROW** - Row-major order.
-    -   **DESCR_FORMAT_COL_BY_COL** - Column-major order.
-
-    The function is mainly used to learn the classifier.
-     */
-    void getDescriptors(const GpuMat& img, Size win_stride,
-                        GpuMat& descriptors,
-                        int descr_format=DESCR_FORMAT_COL_BY_COL);
-
-    Size win_size;
-    Size block_size;
-    Size block_stride;
-    Size cell_size;
-    int nbins;
-    double win_sigma;
-    double threshold_L2hys;
-    bool gamma_correction;
-    int nlevels;
-
-protected:
-    void computeBlockHistograms(const GpuMat& img);
-    void computeGradient(const GpuMat& img, GpuMat& grad, GpuMat& qangle);
-
-    double getWinSigma() const;
-    bool checkDetectorSize() const;
-
-    static int numPartsWithin(int size, int part_size, int stride);
-    static Size numPartsWithin(Size size, Size part_size, Size stride);
-
-    // Coefficients of the separating plane
-    float free_coef;
-    GpuMat detector;
-
-    // Results of the last classification step
-    GpuMat labels, labels_buf;
-    Mat labels_host;
-
-    // Results of the last histogram evaluation step
-    GpuMat block_hists, block_hists_buf;
-
-    // Gradients conputation results
-    GpuMat grad, qangle, grad_buf, qangle_buf;
-
-    // returns subbuffer with required size, reallocates buffer if nessesary.
-    static GpuMat getBuffer(const Size& sz, int type, GpuMat& buf);
-    static GpuMat getBuffer(int rows, int cols, int type, GpuMat& buf);
-
-    std::vector<GpuMat> image_scales;
-};
-
-//////////////////////////// CascadeClassifier ////////////////////////////
-
-/** @brief Cascade classifier class used for object detection. Supports HAAR and LBP cascades. :
-
-@note
-   -   A cascade classifier example can be found at
-        opencv_source_code/samples/gpu/cascadeclassifier.cpp
-    -   A Nvidea API specific cascade classifier example can be found at
-        opencv_source_code/samples/gpu/cascadeclassifier_nvidia_api.cpp
- */
-class CV_EXPORTS CascadeClassifier_CUDA
-{
-public:
-    CascadeClassifier_CUDA();
-    /** @brief Loads the classifier from a file. Cascade type is detected automatically by constructor parameter.
-
-    @param filename Name of the file from which the classifier is loaded. Only the old haar classifier
-    (trained by the haar training application) and NVIDIA's nvbin are supported for HAAR and only new
-    type of OpenCV XML cascade supported for LBP.
-     */
-    CascadeClassifier_CUDA(const String& filename);
-    ~CascadeClassifier_CUDA();
-
-    /** @brief Checks whether the classifier is loaded or not.
-    */
-    bool empty() const;
-    /** @brief Loads the classifier from a file. The previous content is destroyed.
-
-    @param filename Name of the file from which the classifier is loaded. Only the old haar classifier
-    (trained by the haar training application) and NVIDIA's nvbin are supported for HAAR and only new
-    type of OpenCV XML cascade supported for LBP.
-     */
-    bool load(const String& filename);
-    /** @brief Destroys the loaded classifier.
-    */
-    void release();
-
-    /** @overload */
-    int detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, double scaleFactor = 1.2, int minNeighbors = 4, Size minSize = Size());
-    /** @brief Detects objects of different sizes in the input image.
-
-    @param image Matrix of type CV_8U containing an image where objects should be detected.
-    @param objectsBuf Buffer to store detected objects (rectangles). If it is empty, it is allocated
-    with the default size. If not empty, the function searches not more than N objects, where
-    N = sizeof(objectsBufer's data)/sizeof(cv::Rect).
-    @param maxObjectSize Maximum possible object size. Objects larger than that are ignored. Used for
-    second signature and supported only for LBP cascades.
-    @param scaleFactor Parameter specifying how much the image size is reduced at each image scale.
-    @param minNeighbors Parameter specifying how many neighbors each candidate rectangle should have
-    to retain it.
-    @param minSize Minimum possible object size. Objects smaller than that are ignored.
-
-    The detected objects are returned as a list of rectangles.
-
-    The function returns the number of detected objects, so you can retrieve them as in the following
-    example:
-    @code
-        cuda::CascadeClassifier_CUDA cascade_gpu(...);
-
-        Mat image_cpu = imread(...)
-        GpuMat image_gpu(image_cpu);
-
-        GpuMat objbuf;
-        int detections_number = cascade_gpu.detectMultiScale( image_gpu,
-                  objbuf, 1.2, minNeighbors);
-
-        Mat obj_host;
-        // download only detected number of rectangles
-        objbuf.colRange(0, detections_number).download(obj_host);
-
-        Rect* faces = obj_host.ptr<Rect>();
-        for(int i = 0; i < detections_num; ++i)
-           cv::rectangle(image_cpu, faces[i], Scalar(255));
-
-        imshow("Faces", image_cpu);
-    @endcode
-    @sa CascadeClassifier::detectMultiScale
-     */
-    int detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, Size maxObjectSize, Size minSize = Size(), double scaleFactor = 1.1, int minNeighbors = 4);
-
-    bool findLargestObject;
-    bool visualizeInPlace;
-
-    Size getClassifierSize() const;
-
-private:
-    struct CascadeClassifierImpl;
-    CascadeClassifierImpl* impl;
-    struct HaarCascade;
-    struct LbpCascade;
-    friend class CascadeClassifier_CUDA_LBP;
-};
-
-//! @} cuda_objdetect
-
 //////////////////////////// Labeling ////////////////////////////

 //! @addtogroup cuda
--- a/modules/cuda/perf/perf_precomp.hpp
+++ b/modules/cuda/perf/perf_precomp.hpp
@@ -56,7 +56,6 @@

 #include "opencv2/cuda.hpp"
 #include "opencv2/calib3d.hpp"
-#include "opencv2/objdetect.hpp"

 #ifdef GTEST_CREATE_SHARED_LIBRARY
 #error no modules except ts should have GTEST_CREATE_SHARED_LIBRARY defined
--- a/modules/cuda/src/hog.cpp
+++ b/modules/cuda/src/hog.cpp
--- a/modules/cuda/src/precomp.hpp
+++ b/modules/cuda/src/precomp.hpp
@@ -47,7 +47,6 @@
 #include "opencv2/cudaarithm.hpp"
 #include "opencv2/cudawarping.hpp"
 #include "opencv2/calib3d.hpp"
-#include "opencv2/objdetect.hpp"

 #include "opencv2/core/private.cuda.hpp"
 #include "opencv2/core/utility.hpp"
--- a/modules/cuda/test/test_precomp.hpp
+++ b/modules/cuda/test/test_precomp.hpp
@@ -60,7 +60,6 @@
 #include "opencv2/core.hpp"
 #include "opencv2/core/opengl.hpp"
 #include "opencv2/calib3d.hpp"
-#include "opencv2/objdetect.hpp"

 #include "cvconfig.h"

--- a/modules/cudaobjdetect/CMakeLists.txt
+++ b/modules/cudaobjdetect/CMakeLists.txt
@@ -0,0 +1,9 @@
+if(IOS OR (NOT HAVE_CUDA AND NOT BUILD_CUDA_STUBS))
+  ocv_module_disable(cudaobjdetect)
+endif()
+
+set(the_description "CUDA-accelerated Object Detection")
+
+ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wmissing-declarations -Wshadow)
+
+ocv_define_module(cudaobjdetect opencv_objdetect opencv_cudaarithm opencv_cudawarping OPTIONAL opencv_cudalegacy)
--- a/modules/cudaobjdetect/include/opencv2/cudaobjdetect.hpp
+++ b/modules/cudaobjdetect/include/opencv2/cudaobjdetect.hpp
@@ -0,0 +1,288 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_CUDAOBJDETECT_HPP__
+#define __OPENCV_CUDAOBJDETECT_HPP__
+
+#ifndef __cplusplus
+#  error cudaobjdetect.hpp header must be compiled as C++
+#endif
+
+#include "opencv2/core/cuda.hpp"
+
+/**
+  @addtogroup cuda
+  @{
+      @defgroup cudaobjdetect Object Detection
+  @}
+ */
+
+namespace cv { namespace cuda {
+
+//! @addtogroup cudaobjdetect
+//! @{
+
+//
+// HOG (Histogram-of-Oriented-Gradients) Descriptor and Object Detector
+//
+
+/** @brief The class implements Histogram of Oriented Gradients (@cite Dalal2005) object detector.
+
+@note
+    -   An example applying the HOG descriptor for people detection can be found at
+        opencv_source_code/samples/cpp/peopledetect.cpp
+    -   A CUDA example applying the HOG descriptor for people detection can be found at
+        opencv_source_code/samples/gpu/hog.cpp
+    -   (Python) An example applying the HOG descriptor for people detection can be found at
+        opencv_source_code/samples/python2/peopledetect.py
+ */
+class CV_EXPORTS HOG : public Algorithm
+{
+public:
+    enum
+    {
+        DESCR_FORMAT_ROW_BY_ROW,
+        DESCR_FORMAT_COL_BY_COL
+    };
+
+    /** @brief Creates the HOG descriptor and detector.
+
+    @param win_size Detection window size. Align to block size and block stride.
+    @param block_size Block size in pixels. Align to cell size. Only (16,16) is supported for now.
+    @param block_stride Block stride. It must be a multiple of cell size.
+    @param cell_size Cell size. Only (8, 8) is supported for now.
+    @param nbins Number of bins. Only 9 bins per cell are supported for now.
+     */
+    static Ptr<HOG> create(Size win_size = Size(64, 128),
+                           Size block_size = Size(16, 16),
+                           Size block_stride = Size(8, 8),
+                           Size cell_size = Size(8, 8),
+                           int nbins = 9);
+
+    //! Gaussian smoothing window parameter.
+    virtual void setWinSigma(double win_sigma) = 0;
+    virtual double getWinSigma() const = 0;
+
+    //! L2-Hys normalization method shrinkage.
+    virtual void setL2HysThreshold(double threshold_L2hys) = 0;
+    virtual double getL2HysThreshold() const = 0;
+
+    //! Flag to specify whether the gamma correction preprocessing is required or not.
+    virtual void setGammaCorrection(bool gamma_correction) = 0;
+    virtual bool getGammaCorrection() const = 0;
+
+    //! Maximum number of detection window increases.
+    virtual void setNumLevels(int nlevels) = 0;
+    virtual int getNumLevels() const = 0;
+
+    //! Threshold for the distance between features and SVM classifying plane.
+    //! Usually it is 0 and should be specfied in the detector coefficients (as the last free
+    //! coefficient). But if the free coefficient is omitted (which is allowed), you can specify it
+    //! manually here.
+    virtual void setHitThreshold(double hit_threshold) = 0;
+    virtual double getHitThreshold() const = 0;
+
+    //! Window stride. It must be a multiple of block stride.
+    virtual void setWinStride(Size win_stride) = 0;
+    virtual Size getWinStride() const = 0;
+
+    //! Coefficient of the detection window increase.
+    virtual void setScaleFactor(double scale0) = 0;
+    virtual double getScaleFactor() const = 0;
+
+    //! Coefficient to regulate the similarity threshold. When detected, some
+    //! objects can be covered by many rectangles. 0 means not to perform grouping.
+    //! See groupRectangles.
+    virtual void setGroupThreshold(int group_threshold) = 0;
+    virtual int getGroupThreshold() const = 0;
+
+    //! Descriptor storage format:
+    //!   - **DESCR_FORMAT_ROW_BY_ROW** - Row-major order.
+    //!   - **DESCR_FORMAT_COL_BY_COL** - Column-major order.
+    virtual void setDescriptorFormat(int descr_format) = 0;
+    virtual int getDescriptorFormat() const = 0;
+
+    /** @brief Returns the number of coefficients required for the classification.
+     */
+    virtual size_t getDescriptorSize() const = 0;
+
+    /** @brief Returns the block histogram size.
+     */
+    virtual size_t getBlockHistogramSize() const = 0;
+
+    /** @brief Sets coefficients for the linear SVM classifier.
+     */
+    virtual void setSVMDetector(InputArray detector) = 0;
+
+    /** @brief Returns coefficients of the classifier trained for people detection.
+     */
+    virtual Mat getDefaultPeopleDetector() const = 0;
+
+    /** @brief Performs object detection without a multi-scale window.
+
+    @param img Source image. CV_8UC1 and CV_8UC4 types are supported for now.
+    @param found_locations Left-top corner points of detected objects boundaries.
+    @param confidences Optional output array for confidences.
+     */
+    virtual void detect(InputArray img,
+                        std::vector<Point>& found_locations,
+                        std::vector<double>* confidences = NULL) = 0;
+
+    /** @brief Performs object detection with a multi-scale window.
+
+    @param img Source image. See cuda::HOGDescriptor::detect for type limitations.
+    @param found_locations Detected objects boundaries.
+    @param confidences Optional output array for confidences.
+     */
+    virtual void detectMultiScale(InputArray img,
+                                  std::vector<Rect>& found_locations,
+                                  std::vector<double>* confidences = NULL) = 0;
+
+    /** @brief Returns block descriptors computed for the whole image.
+
+    @param img Source image. See cuda::HOGDescriptor::detect for type limitations.
+    @param descriptors 2D array of descriptors.
+    @param stream CUDA stream.
+     */
+    virtual void compute(InputArray img,
+                         OutputArray descriptors,
+                         Stream& stream = Stream::Null()) = 0;
+};
+
+//
+// CascadeClassifier
+//
+
+/** @brief Cascade classifier class used for object detection. Supports HAAR and LBP cascades. :
+
+@note
+   -   A cascade classifier example can be found at
+        opencv_source_code/samples/gpu/cascadeclassifier.cpp
+    -   A Nvidea API specific cascade classifier example can be found at
+        opencv_source_code/samples/gpu/cascadeclassifier_nvidia_api.cpp
+ */
+class CV_EXPORTS CascadeClassifier : public Algorithm
+{
+public:
+    /** @brief Loads the classifier from a file. Cascade type is detected automatically by constructor parameter.
+
+    @param filename Name of the file from which the classifier is loaded. Only the old haar classifier
+    (trained by the haar training application) and NVIDIA's nvbin are supported for HAAR and only new
+    type of OpenCV XML cascade supported for LBP.
+     */
+    static Ptr<CascadeClassifier> create(const String& filename);
+    /** @overload
+     */
+    static Ptr<CascadeClassifier> create(const FileStorage& file);
+
+    //! Maximum possible object size. Objects larger than that are ignored. Used for
+    //! second signature and supported only for LBP cascades.
+    virtual void setMaxObjectSize(Size maxObjectSize) = 0;
+    virtual Size getMaxObjectSize() const = 0;
+
+    //! Minimum possible object size. Objects smaller than that are ignored.
+    virtual void setMinObjectSize(Size minSize) = 0;
+    virtual Size getMinObjectSize() const = 0;
+
+    //! Parameter specifying how much the image size is reduced at each image scale.
+    virtual void setScaleFactor(double scaleFactor) = 0;
+    virtual double getScaleFactor() const = 0;
+
+    //! Parameter specifying how many neighbors each candidate rectangle should have
+    //! to retain it.
+    virtual void setMinNeighbors(int minNeighbors) = 0;
+    virtual int getMinNeighbors() const = 0;
+
+    virtual void setFindLargestObject(bool findLargestObject) = 0;
+    virtual bool getFindLargestObject() = 0;
+
+    virtual void setMaxNumObjects(int maxNumObjects) = 0;
+    virtual int getMaxNumObjects() const = 0;
+
+    virtual Size getClassifierSize() const = 0;
+
+    /** @brief Detects objects of different sizes in the input image.
+
+    @param image Matrix of type CV_8U containing an image where objects should be detected.
+    @param objects Buffer to store detected objects (rectangles).
+    @param stream CUDA stream.
+
+    To get final array of detected objects use CascadeClassifier::convert method.
+
+    @code
+        Ptr<cuda::CascadeClassifier> cascade_gpu = cuda::CascadeClassifier::create(...);
+
+        Mat image_cpu = imread(...)
+        GpuMat image_gpu(image_cpu);
+
+        GpuMat objbuf;
+        cascade_gpu->detectMultiScale(image_gpu, objbuf);
+
+        std::vector<Rect> faces;
+        cascade_gpu->convert(objbuf, faces);
+
+        for(int i = 0; i < detections_num; ++i)
+           cv::rectangle(image_cpu, faces[i], Scalar(255));
+
+        imshow("Faces", image_cpu);
+    @endcode
+
+    @sa CascadeClassifier::detectMultiScale
+     */
+    virtual void detectMultiScale(InputArray image,
+                                  OutputArray objects,
+                                  Stream& stream = Stream::Null()) = 0;
+
+    /** @brief Converts objects array from internal representation to standard vector.
+
+    @param gpu_objects Objects array in internal representation.
+    @param objects Resulting array.
+     */
+    virtual void convert(OutputArray gpu_objects,
+                         std::vector<Rect>& objects) = 0;
+};
+
+//! @}
+
+}} // namespace cv { namespace cuda {
+
+#endif /* __OPENCV_CUDAOBJDETECT_HPP__ */
--- a/modules/cudaobjdetect/perf/perf_main.cpp
+++ b/modules/cudaobjdetect/perf/perf_main.cpp
@@ -0,0 +1,47 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "perf_precomp.hpp"
+
+using namespace perf;
+
+CV_PERF_TEST_CUDA_MAIN(cudaobjdetect)
--- a/modules/cudaobjdetect/perf/perf_objdetect.cpp
+++ b/modules/cudaobjdetect/perf/perf_objdetect.cpp
@@ -71,10 +71,10 @@ PERF_TEST_P(Image, ObjDetect_HOG,
        const cv::cuda::GpuMat d_img(img);
        std::vector<cv::Rect> gpu_found_locations;

-        cv::cuda::HOGDescriptor d_hog;
-        d_hog.setSVMDetector(cv::cuda::HOGDescriptor::getDefaultPeopleDetector());
+        cv::Ptr<cv::cuda::HOG> d_hog = cv::cuda::HOG::create();
+        d_hog->setSVMDetector(d_hog->getDefaultPeopleDetector());

-        TEST_CYCLE() d_hog.detectMultiScale(d_img, gpu_found_locations);
+        TEST_CYCLE() d_hog->detectMultiScale(d_img, gpu_found_locations);

        SANITY_CHECK(gpu_found_locations);
    }
@@ -82,8 +82,10 @@ PERF_TEST_P(Image, ObjDetect_HOG,
    {
        std::vector<cv::Rect> cpu_found_locations;

+        cv::Ptr<cv::cuda::HOG> d_hog = cv::cuda::HOG::create();
+
        cv::HOGDescriptor hog;
-        hog.setSVMDetector(cv::cuda::HOGDescriptor::getDefaultPeopleDetector());
+        hog.setSVMDetector(d_hog->getDefaultPeopleDetector());

        TEST_CYCLE() hog.detectMultiScale(img, cpu_found_locations);

@@ -105,18 +107,17 @@ PERF_TEST_P(ImageAndCascade, ObjDetect_HaarClassifier,

    if (PERF_RUN_CUDA())
    {
-        cv::cuda::CascadeClassifier_CUDA d_cascade;
-        ASSERT_TRUE(d_cascade.load(perf::TestBase::getDataPath(GetParam().second)));
+        cv::Ptr<cv::cuda::CascadeClassifier> d_cascade =
+                cv::cuda::CascadeClassifier::create(perf::TestBase::getDataPath(GetParam().second));

        const cv::cuda::GpuMat d_img(img);
        cv::cuda::GpuMat objects_buffer;
-        int detections_num = 0;

-        TEST_CYCLE() detections_num = d_cascade.detectMultiScale(d_img, objects_buffer);
+        TEST_CYCLE() d_cascade->detectMultiScale(d_img, objects_buffer);
+
+        std::vector<cv::Rect> gpu_rects;
+        d_cascade->convert(objects_buffer, gpu_rects);

-        std::vector<cv::Rect> gpu_rects(detections_num);
-        cv::Mat gpu_rects_mat(1, detections_num, cv::DataType<cv::Rect>::type, &gpu_rects[0]);
-        objects_buffer.colRange(0, detections_num).download(gpu_rects_mat);
        cv::groupRectangles(gpu_rects, 3, 0.2);
        SANITY_CHECK(gpu_rects);
    }
@@ -144,18 +145,17 @@ PERF_TEST_P(ImageAndCascade, ObjDetect_LBPClassifier,

    if (PERF_RUN_CUDA())
    {
-        cv::cuda::CascadeClassifier_CUDA d_cascade;
-        ASSERT_TRUE(d_cascade.load(perf::TestBase::getDataPath(GetParam().second)));
+        cv::Ptr<cv::cuda::CascadeClassifier> d_cascade =
+                cv::cuda::CascadeClassifier::create(perf::TestBase::getDataPath(GetParam().second));

        const cv::cuda::GpuMat d_img(img);
        cv::cuda::GpuMat objects_buffer;
-        int detections_num = 0;

-        TEST_CYCLE() detections_num = d_cascade.detectMultiScale(d_img, objects_buffer);
+        TEST_CYCLE() d_cascade->detectMultiScale(d_img, objects_buffer);
+
+        std::vector<cv::Rect> gpu_rects;
+        d_cascade->convert(objects_buffer, gpu_rects);

-        std::vector<cv::Rect> gpu_rects(detections_num);
-        cv::Mat gpu_rects_mat(1, detections_num, cv::DataType<cv::Rect>::type, &gpu_rects[0]);
-        objects_buffer.colRange(0, detections_num).download(gpu_rects_mat);
        cv::groupRectangles(gpu_rects, 3, 0.2);
        SANITY_CHECK(gpu_rects);
    }
--- a/modules/cudaobjdetect/perf/perf_precomp.hpp
+++ b/modules/cudaobjdetect/perf/perf_precomp.hpp
@@ -0,0 +1,64 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifdef __GNUC__
+#  pragma GCC diagnostic ignored "-Wmissing-declarations"
+#  if defined __clang__ || defined __APPLE__
+#    pragma GCC diagnostic ignored "-Wmissing-prototypes"
+#    pragma GCC diagnostic ignored "-Wextra"
+#  endif
+#endif
+
+#ifndef __OPENCV_PERF_PRECOMP_HPP__
+#define __OPENCV_PERF_PRECOMP_HPP__
+
+#include "opencv2/ts.hpp"
+#include "opencv2/ts/cuda_perf.hpp"
+
+#include "opencv2/cudaobjdetect.hpp"
+#include "opencv2/objdetect.hpp"
+
+#ifdef GTEST_CREATE_SHARED_LIBRARY
+#error no modules except ts should have GTEST_CREATE_SHARED_LIBRARY defined
+#endif
+
+#endif
--- a/modules/cudaobjdetect/src/cascadeclassifier.cpp
+++ b/modules/cudaobjdetect/src/cascadeclassifier.cpp
@@ -48,160 +48,185 @@ using namespace cv::cuda;

 #if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)

-cv::cuda::CascadeClassifier_CUDA::CascadeClassifier_CUDA()               { throw_no_cuda(); }
-cv::cuda::CascadeClassifier_CUDA::CascadeClassifier_CUDA(const String&)  { throw_no_cuda(); }
-cv::cuda::CascadeClassifier_CUDA::~CascadeClassifier_CUDA()              { throw_no_cuda(); }
-bool cv::cuda::CascadeClassifier_CUDA::empty() const                    { throw_no_cuda(); return true; }
-bool cv::cuda::CascadeClassifier_CUDA::load(const String&)              { throw_no_cuda(); return true; }
-Size cv::cuda::CascadeClassifier_CUDA::getClassifierSize() const        { throw_no_cuda(); return Size();}
-void cv::cuda::CascadeClassifier_CUDA::release()                        { throw_no_cuda(); }
-int cv::cuda::CascadeClassifier_CUDA::detectMultiScale( const GpuMat&, GpuMat&, double, int, Size)       {throw_no_cuda(); return -1;}
-int cv::cuda::CascadeClassifier_CUDA::detectMultiScale( const GpuMat&, GpuMat&, Size, Size, double, int) {throw_no_cuda(); return -1;}
+Ptr<cuda::CascadeClassifier> cv::cuda::CascadeClassifier::create(const String&) { throw_no_cuda(); return Ptr<cuda::CascadeClassifier>(); }
+Ptr<cuda::CascadeClassifier> cv::cuda::CascadeClassifier::create(const FileStorage&) { throw_no_cuda(); return Ptr<cuda::CascadeClassifier>(); }

 #else

-struct cv::cuda::CascadeClassifier_CUDA::CascadeClassifierImpl
+//
+// CascadeClassifierBase
+//
+
+namespace
 {
-public:
-    CascadeClassifierImpl(){}
-    virtual ~CascadeClassifierImpl(){}
+    class CascadeClassifierBase : public cuda::CascadeClassifier
+    {
+    public:
+        CascadeClassifierBase();

-    virtual unsigned int process(const GpuMat& src, GpuMat& objects, float scaleStep, int minNeighbors,
-                      bool findLargestObject, bool visualizeInPlace, cv::Size ncvMinSize, cv::Size maxObjectSize) = 0;
+        virtual void setMaxObjectSize(Size maxObjectSize) { maxObjectSize_ = maxObjectSize; }
+        virtual Size getMaxObjectSize() const { return maxObjectSize_; }

-    virtual cv::Size getClassifierCvSize() const = 0;
-    virtual bool read(const String& classifierAsXml) = 0;
-};
+        virtual void setMinObjectSize(Size minSize) { minObjectSize_ = minSize; }
+        virtual Size getMinObjectSize() const { return minObjectSize_; }

-#ifndef HAVE_OPENCV_CUDALEGACY
+        virtual void setScaleFactor(double scaleFactor) { scaleFactor_ = scaleFactor; }
+        virtual double getScaleFactor() const { return scaleFactor_; }

-struct cv::cuda::CascadeClassifier_CUDA::HaarCascade : cv::cuda::CascadeClassifier_CUDA::CascadeClassifierImpl
+        virtual void setMinNeighbors(int minNeighbors) { minNeighbors_ = minNeighbors; }
+        virtual int getMinNeighbors() const { return minNeighbors_; }
+
+        virtual void setFindLargestObject(bool findLargestObject) { findLargestObject_ = findLargestObject; }
+        virtual bool getFindLargestObject() { return findLargestObject_; }
+
+        virtual void setMaxNumObjects(int maxNumObjects) { maxNumObjects_ = maxNumObjects; }
+        virtual int getMaxNumObjects() const { return maxNumObjects_; }
+
+    protected:
+        Size maxObjectSize_;
+        Size minObjectSize_;
+        double scaleFactor_;
+        int minNeighbors_;
+        bool findLargestObject_;
+        int maxNumObjects_;
+    };
+
+    CascadeClassifierBase::CascadeClassifierBase() :
+        maxObjectSize_(),
+        minObjectSize_(),
+        scaleFactor_(1.2),
+        minNeighbors_(4),
+        findLargestObject_(false),
+        maxNumObjects_(100)
+    {
+    }
+}
+
+//
+// HaarCascade
+//
+
+#ifdef HAVE_OPENCV_CUDALEGACY
+
+namespace
 {
-public:
-    HaarCascade()
+    class HaarCascade_Impl : public CascadeClassifierBase
    {
-        throw_no_cuda();
+    public:
+        explicit HaarCascade_Impl(const String& filename);
+
+        virtual Size getClassifierSize() const;
+
+        virtual void detectMultiScale(InputArray image,
+                                      OutputArray objects,
+                                      Stream& stream);
+
+        virtual void convert(OutputArray gpu_objects,
+                             std::vector<Rect>& objects);
+
+    private:
+        NCVStatus load(const String& classifierFile);
+        NCVStatus calculateMemReqsAndAllocate(const Size& frameSize);
+        NCVStatus process(const GpuMat& src, GpuMat& objects, cv::Size ncvMinSize, /*out*/ unsigned int& numDetections);
+
+        Size lastAllocatedFrameSize;
+
+        Ptr<NCVMemStackAllocator> gpuAllocator;
+        Ptr<NCVMemStackAllocator> cpuAllocator;
+
+        cudaDeviceProp devProp;
+        NCVStatus ncvStat;
+
+        Ptr<NCVMemNativeAllocator> gpuCascadeAllocator;
+        Ptr<NCVMemNativeAllocator> cpuCascadeAllocator;
+
+        Ptr<NCVVectorAlloc<HaarStage64> >           h_haarStages;
+        Ptr<NCVVectorAlloc<HaarClassifierNode128> > h_haarNodes;
+        Ptr<NCVVectorAlloc<HaarFeature64> >         h_haarFeatures;
+
+        HaarClassifierCascadeDescriptor haar;
+
+        Ptr<NCVVectorAlloc<HaarStage64> >           d_haarStages;
+        Ptr<NCVVectorAlloc<HaarClassifierNode128> > d_haarNodes;
+        Ptr<NCVVectorAlloc<HaarFeature64> >         d_haarFeatures;
+    };
+
+    static void NCVDebugOutputHandler(const String &msg)
+    {
+        CV_Error(Error::GpuApiCallError, msg.c_str());
    }

-    unsigned int process(const GpuMat&, GpuMat&, float, int, bool, bool, cv::Size, cv::Size)
-    {
-        throw_no_cuda();
-        return 0;
-    }
-
-    cv::Size getClassifierCvSize() const
-    {
-        throw_no_cuda();
-        return cv::Size();
-    }
-
-    bool read(const String&)
-    {
-        throw_no_cuda();
-        return false;
-    }
-};
-
-#else
-
-struct cv::cuda::CascadeClassifier_CUDA::HaarCascade : cv::cuda::CascadeClassifier_CUDA::CascadeClassifierImpl
-{
-public:
-    HaarCascade() : lastAllocatedFrameSize(-1, -1)
+    HaarCascade_Impl::HaarCascade_Impl(const String& filename) :
+        lastAllocatedFrameSize(-1, -1)
    {
        ncvSetDebugOutputHandler(NCVDebugOutputHandler);
-    }
-
-    bool read(const String& filename)
-    {
        ncvSafeCall( load(filename) );
-        return true;
    }

-    NCVStatus process(const GpuMat& src, GpuMat& objects, float scaleStep, int minNeighbors,
-                      bool findLargestObject, bool visualizeInPlace, cv::Size ncvMinSize,
-                      /*out*/unsigned int& numDetections)
+    Size HaarCascade_Impl::getClassifierSize() const
    {
-        calculateMemReqsAndAllocate(src.size());
-
-        NCVMemPtr src_beg;
-        src_beg.ptr = (void*)src.ptr<Ncv8u>();
-        src_beg.memtype = NCVMemoryTypeDevice;
-
-        NCVMemSegment src_seg;
-        src_seg.begin = src_beg;
-        src_seg.size  = src.step * src.rows;
-
-        NCVMatrixReuse<Ncv8u> d_src(src_seg, static_cast<int>(devProp.textureAlignment), src.cols, src.rows, static_cast<int>(src.step), true);
-        ncvAssertReturn(d_src.isMemReused(), NCV_ALLOCATOR_BAD_REUSE);
-
-        CV_Assert(objects.rows == 1);
-
-        NCVMemPtr objects_beg;
-        objects_beg.ptr = (void*)objects.ptr<NcvRect32u>();
-        objects_beg.memtype = NCVMemoryTypeDevice;
-
-        NCVMemSegment objects_seg;
-        objects_seg.begin = objects_beg;
-        objects_seg.size = objects.step * objects.rows;
-        NCVVectorReuse<NcvRect32u> d_rects(objects_seg, objects.cols);
-        ncvAssertReturn(d_rects.isMemReused(), NCV_ALLOCATOR_BAD_REUSE);
-
-        NcvSize32u roi;
-        roi.width = d_src.width();
-        roi.height = d_src.height();
-
-        NcvSize32u winMinSize(ncvMinSize.width, ncvMinSize.height);
-
-        Ncv32u flags = 0;
-        flags |= findLargestObject? NCVPipeObjDet_FindLargestObject : 0;
-        flags |= visualizeInPlace ? NCVPipeObjDet_VisualizeInPlace  : 0;
-
-        ncvStat = ncvDetectObjectsMultiScale_device(
-            d_src, roi, d_rects, numDetections, haar, *h_haarStages,
-            *d_haarStages, *d_haarNodes, *d_haarFeatures,
-            winMinSize,
-            minNeighbors,
-            scaleStep, 1,
-            flags,
-            *gpuAllocator, *cpuAllocator, devProp, 0);
-        ncvAssertReturnNcvStat(ncvStat);
-        ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);
-
-        return NCV_SUCCESS;
+        return Size(haar.ClassifierSize.width, haar.ClassifierSize.height);
    }

-    unsigned int process(const GpuMat& image, GpuMat& objectsBuf, float scaleFactor, int minNeighbors,
-                      bool findLargestObject, bool visualizeInPlace, cv::Size minSize, cv::Size /*maxObjectSize*/)
+    void HaarCascade_Impl::detectMultiScale(InputArray _image,
+                                            OutputArray _objects,
+                                            Stream& stream)
    {
-        CV_Assert( scaleFactor > 1 && image.depth() == CV_8U);
+        const GpuMat image = _image.getGpuMat();

-        const int defaultObjSearchNum = 100;
-        if (objectsBuf.empty())
+        CV_Assert( image.depth() == CV_8U);
+        CV_Assert( scaleFactor_ > 1 );
+        CV_Assert( !stream );
+
+        Size ncvMinSize = getClassifierSize();
+        if (ncvMinSize.width < minObjectSize_.width && ncvMinSize.height < minObjectSize_.height)
        {
-            objectsBuf.create(1, defaultObjSearchNum, DataType<Rect>::type);
+            ncvMinSize.width = minObjectSize_.width;
+            ncvMinSize.height = minObjectSize_.height;
        }

-        cv::Size ncvMinSize = this->getClassifierCvSize();
-
-        if (ncvMinSize.width < minSize.width && ncvMinSize.height < minSize.height)
-        {
-            ncvMinSize.width = minSize.width;
-            ncvMinSize.height = minSize.height;
-        }
+        BufferPool pool(stream);
+        GpuMat objectsBuf = pool.getBuffer(1, maxNumObjects_, DataType<Rect>::type);

        unsigned int numDetections;
-        ncvSafeCall(this->process(image, objectsBuf, (float)scaleFactor, minNeighbors, findLargestObject, visualizeInPlace, ncvMinSize, numDetections));
+        ncvSafeCall( process(image, objectsBuf, ncvMinSize, numDetections) );

-        return numDetections;
+        if (numDetections > 0)
+        {
+            objectsBuf.colRange(0, numDetections).copyTo(_objects);
+        }
+        else
+        {
+            _objects.release();
+        }
    }

-    cv::Size getClassifierCvSize() const { return cv::Size(haar.ClassifierSize.width, haar.ClassifierSize.height); }
+    void HaarCascade_Impl::convert(OutputArray _gpu_objects, std::vector<Rect>& objects)
+    {
+        if (_gpu_objects.empty())
+        {
+            objects.clear();
+            return;
+        }

-private:
-    static void NCVDebugOutputHandler(const String &msg) { CV_Error(cv::Error::GpuApiCallError, msg.c_str()); }
+        Mat gpu_objects;
+        if (_gpu_objects.kind() == _InputArray::CUDA_GPU_MAT)
+        {
+            _gpu_objects.getGpuMat().download(gpu_objects);
+        }
+        else
+        {
+            gpu_objects = _gpu_objects.getMat();
+        }

-    NCVStatus load(const String& classifierFile)
+        CV_Assert( gpu_objects.rows == 1 );
+        CV_Assert( gpu_objects.type() == DataType<Rect>::type );
+
+        Rect* ptr = gpu_objects.ptr<Rect>();
+        objects.assign(ptr, ptr + gpu_objects.cols);
+    }
+
+    NCVStatus HaarCascade_Impl::load(const String& classifierFile)
    {
        int devId = cv::cuda::getDevice();
        ncvAssertCUDAReturn(cudaGetDeviceProperties(&devProp, devId), NCV_CUDA_ERROR);
@@ -246,7 +271,7 @@ private:
        return NCV_SUCCESS;
    }

-    NCVStatus calculateMemReqsAndAllocate(const Size& frameSize)
+    NCVStatus HaarCascade_Impl::calculateMemReqsAndAllocate(const Size& frameSize)
    {
        if (lastAllocatedFrameSize == frameSize)
        {
@@ -289,59 +314,123 @@ private:
        return NCV_SUCCESS;
    }

-    cudaDeviceProp devProp;
-    NCVStatus ncvStat;
+    NCVStatus HaarCascade_Impl::process(const GpuMat& src, GpuMat& objects, cv::Size ncvMinSize, /*out*/ unsigned int& numDetections)
+    {
+        calculateMemReqsAndAllocate(src.size());

-    Ptr<NCVMemNativeAllocator> gpuCascadeAllocator;
-    Ptr<NCVMemNativeAllocator> cpuCascadeAllocator;
+        NCVMemPtr src_beg;
+        src_beg.ptr = (void*)src.ptr<Ncv8u>();
+        src_beg.memtype = NCVMemoryTypeDevice;

-    Ptr<NCVVectorAlloc<HaarStage64> >           h_haarStages;
-    Ptr<NCVVectorAlloc<HaarClassifierNode128> > h_haarNodes;
-    Ptr<NCVVectorAlloc<HaarFeature64> >         h_haarFeatures;
+        NCVMemSegment src_seg;
+        src_seg.begin = src_beg;
+        src_seg.size  = src.step * src.rows;

-    HaarClassifierCascadeDescriptor haar;
+        NCVMatrixReuse<Ncv8u> d_src(src_seg, static_cast<int>(devProp.textureAlignment), src.cols, src.rows, static_cast<int>(src.step), true);
+        ncvAssertReturn(d_src.isMemReused(), NCV_ALLOCATOR_BAD_REUSE);

-    Ptr<NCVVectorAlloc<HaarStage64> >           d_haarStages;
-    Ptr<NCVVectorAlloc<HaarClassifierNode128> > d_haarNodes;
-    Ptr<NCVVectorAlloc<HaarFeature64> >         d_haarFeatures;
+        CV_Assert(objects.rows == 1);

-    Size lastAllocatedFrameSize;
+        NCVMemPtr objects_beg;
+        objects_beg.ptr = (void*)objects.ptr<NcvRect32u>();
+        objects_beg.memtype = NCVMemoryTypeDevice;

-    Ptr<NCVMemStackAllocator> gpuAllocator;
-    Ptr<NCVMemStackAllocator> cpuAllocator;
+        NCVMemSegment objects_seg;
+        objects_seg.begin = objects_beg;
+        objects_seg.size = objects.step * objects.rows;
+        NCVVectorReuse<NcvRect32u> d_rects(objects_seg, objects.cols);
+        ncvAssertReturn(d_rects.isMemReused(), NCV_ALLOCATOR_BAD_REUSE);

-    virtual ~HaarCascade(){}
-};
+        NcvSize32u roi;
+        roi.width = d_src.width();
+        roi.height = d_src.height();
+
+        NcvSize32u winMinSize(ncvMinSize.width, ncvMinSize.height);
+
+        Ncv32u flags = 0;
+        flags |= findLargestObject_ ? NCVPipeObjDet_FindLargestObject : 0;
+
+        ncvStat = ncvDetectObjectsMultiScale_device(
+            d_src, roi, d_rects, numDetections, haar, *h_haarStages,
+            *d_haarStages, *d_haarNodes, *d_haarFeatures,
+            winMinSize,
+            minNeighbors_,
+            scaleFactor_, 1,
+            flags,
+            *gpuAllocator, *cpuAllocator, devProp, 0);
+        ncvAssertReturnNcvStat(ncvStat);
+        ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);
+
+        return NCV_SUCCESS;
+    }
+}

 #endif

-cv::Size operator -(const cv::Size& a, const cv::Size& b)
+//
+// LbpCascade
+//
+
+namespace cv { namespace cuda { namespace device
 {
+    namespace lbp
+    {
+        void classifyPyramid(int frameW,
+                             int frameH,
+                             int windowW,
+                             int windowH,
+                             float initalScale,
+                             float factor,
+                             int total,
+                             const PtrStepSzb& mstages,
+                             const int nstages,
+                             const PtrStepSzi& mnodes,
+                             const PtrStepSzf& mleaves,
+                             const PtrStepSzi& msubsets,
+                             const PtrStepSzb& mfeatures,
+                             const int subsetSize,
+                             PtrStepSz<int4> objects,
+                             unsigned int* classified,
+                             PtrStepSzi integral);
+
+        void connectedConmonents(PtrStepSz<int4> candidates,
+                                 int ncandidates,
+                                 PtrStepSz<int4> objects,
+                                 int groupThreshold,
+                                 float grouping_eps,
+                                 unsigned int* nclasses);
+    }
+}}}
+
+namespace
+{
+    cv::Size operator -(const cv::Size& a, const cv::Size& b)
+    {
        return cv::Size(a.width - b.width, a.height - b.height);
-}
+    }

-cv::Size operator +(const cv::Size& a, const int& i)
-{
+    cv::Size operator +(const cv::Size& a, const int& i)
+    {
        return cv::Size(a.width + i, a.height + i);
-}
+    }

-cv::Size operator *(const cv::Size& a, const float& f)
-{
+    cv::Size operator *(const cv::Size& a, const float& f)
+    {
        return cv::Size(cvRound(a.width * f), cvRound(a.height * f));
-}
+    }

-cv::Size operator /(const cv::Size& a, const float& f)
-{
+    cv::Size operator /(const cv::Size& a, const float& f)
+    {
        return cv::Size(cvRound(a.width / f), cvRound(a.height / f));
-}
+    }

-bool operator <=(const cv::Size& a, const cv::Size& b)
-{
+    bool operator <=(const cv::Size& a, const cv::Size& b)
+    {
        return a.width <= b.width && a.height <= b.width;
-}
+    }

-struct PyrLavel
-{
+    struct PyrLavel
+    {
        PyrLavel(int _order, float _scale, cv::Size frame, cv::Size window, cv::Size minObjectSize)
        {
            do
@@ -370,37 +459,27 @@ struct PyrLavel
        cv::Size sFrame;
        cv::Size workArea;
        cv::Size sWindow;
-};
+    };

-namespace cv { namespace cuda { namespace device
-{
-    namespace lbp
+    class LbpCascade_Impl : public CascadeClassifierBase
    {
-        void classifyPyramid(int frameW,
-                             int frameH,
-                             int windowW,
-                             int windowH,
-                             float initalScale,
-                             float factor,
-                             int total,
-                             const PtrStepSzb& mstages,
-                             const int nstages,
-                             const PtrStepSzi& mnodes,
-                             const PtrStepSzf& mleaves,
-                             const PtrStepSzi& msubsets,
-                             const PtrStepSzb& mfeatures,
-                             const int subsetSize,
-                             PtrStepSz<int4> objects,
-                             unsigned int* classified,
-                             PtrStepSzi integral);
+    public:
+        explicit LbpCascade_Impl(const FileStorage& file);

-        void connectedConmonents(PtrStepSz<int4>  candidates, int ncandidates, PtrStepSz<int4> objects,int groupThreshold, float grouping_eps, unsigned int* nclasses);
-    }
-}}}
+        virtual Size getClassifierSize() const { return NxM; }

-struct cv::cuda::CascadeClassifier_CUDA::LbpCascade : cv::cuda::CascadeClassifier_CUDA::CascadeClassifierImpl
-{
-public:
+        virtual void detectMultiScale(InputArray image,
+                                      OutputArray objects,
+                                      Stream& stream);
+
+        virtual void convert(OutputArray gpu_objects,
+                             std::vector<Rect>& objects);
+
+    private:
+        bool load(const FileNode &root);
+        void allocateBuffers(cv::Size frame);
+
+    private:
        struct Stage
        {
            int    first;
@@ -408,28 +487,60 @@ public:
            float  threshold;
        };

-    LbpCascade(){}
-    virtual ~LbpCascade(){}
+        enum stage { BOOST = 0 };
+        enum feature { LBP = 1, HAAR = 2 };

-    virtual unsigned int process(const GpuMat& image, GpuMat& objects, float scaleFactor, int groupThreshold, bool /*findLargestObject*/,
-        bool /*visualizeInPlace*/, cv::Size minObjectSize, cv::Size maxObjectSize)
+        static const stage stageType = BOOST;
+        static const feature featureType = LBP;
+
+        cv::Size NxM;
+        bool isStumps;
+        int ncategories;
+        int subsetSize;
+        int nodeStep;
+
+        // gpu representation of classifier
+        GpuMat stage_mat;
+        GpuMat trees_mat;
+        GpuMat nodes_mat;
+        GpuMat leaves_mat;
+        GpuMat subsets_mat;
+        GpuMat features_mat;
+
+        GpuMat integral;
+        GpuMat integralBuffer;
+        GpuMat resuzeBuffer;
+
+        GpuMat candidates;
+        static const int integralFactor = 4;
+    };
+
+    LbpCascade_Impl::LbpCascade_Impl(const FileStorage& file)
    {
-        CV_Assert(scaleFactor > 1 && image.depth() == CV_8U);
+        load(file.getFirstTopLevelNode());
+    }
+
+    void LbpCascade_Impl::detectMultiScale(InputArray _image,
+                                           OutputArray _objects,
+                                           Stream& stream)
+    {
+        const GpuMat image = _image.getGpuMat();
+
+        CV_Assert( image.depth() == CV_8U);
+        CV_Assert( scaleFactor_ > 1 );
+        CV_Assert( !stream );

-        // const int defaultObjSearchNum = 100;
        const float grouping_eps = 0.2f;

-        if( !objects.empty() && objects.depth() == CV_32S)
-            objects.reshape(4, 1);
-        else
-            objects.create(1 , image.cols >> 4, CV_32SC4);
+        BufferPool pool(stream);
+        GpuMat objects = pool.getBuffer(1, maxNumObjects_, DataType<Rect>::type);

        // used for debug
        // candidates.setTo(cv::Scalar::all(0));
        // objects.setTo(cv::Scalar::all(0));

-        if (maxObjectSize == cv::Size())
-            maxObjectSize = image.size();
+        if (maxObjectSize_ == cv::Size())
+            maxObjectSize_ = image.size();

        allocateBuffers(image.size());

@@ -437,9 +548,9 @@ public:
        GpuMat dclassified(1, 1, CV_32S);
        cudaSafeCall( cudaMemcpy(dclassified.ptr(), &classified, sizeof(int), cudaMemcpyHostToDevice) );

-        PyrLavel level(0, scaleFactor, image.size(), NxM, minObjectSize);
+        PyrLavel level(0, scaleFactor_, image.size(), NxM, minObjectSize_);

-        while (level.isFeasible(maxObjectSize))
+        while (level.isFeasible(maxObjectSize_))
        {
            int acc = level.sFrame.width + 1;
            float iniScale = level.scale;
@@ -449,7 +560,7 @@ public:

            int total = 0, prev  = 0;

-            while (acc <= integralFactor * (image.cols + 1) && level.isFeasible(maxObjectSize))
+            while (acc <= integralFactor * (image.cols + 1) && level.isFeasible(maxObjectSize_))
            {
                // create sutable matrix headers
                GpuMat src  = resuzeBuffer(cv::Rect(0, 0, level.sFrame.width, level.sFrame.height));
@@ -464,7 +575,7 @@ public:
                total += totalWidth * (level.workArea.height / step);

                // go to next pyramide level
-                level = level.next(scaleFactor, image.size(), NxM, minObjectSize);
+                level = level.next(scaleFactor_, image.size(), NxM, minObjectSize_);
                area = level.workArea;

                step = (1 + (level.scale <= 2.f));
@@ -472,60 +583,55 @@ public:
                acc += level.sFrame.width + 1;
            }

-            device::lbp::classifyPyramid(image.cols, image.rows, NxM.width - 1, NxM.height - 1, iniScale, scaleFactor, total, stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat,
+            device::lbp::classifyPyramid(image.cols, image.rows, NxM.width - 1, NxM.height - 1, iniScale, scaleFactor_, total, stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat,
                leaves_mat, subsets_mat, features_mat, subsetSize, candidates, dclassified.ptr<unsigned int>(), integral);
        }

-        if (groupThreshold <= 0  || objects.empty())
-            return 0;
+        if (minNeighbors_ <= 0  || objects.empty())
+            return;

        cudaSafeCall( cudaMemcpy(&classified, dclassified.ptr(), sizeof(int), cudaMemcpyDeviceToHost) );
-        device::lbp::connectedConmonents(candidates, classified, objects, groupThreshold, grouping_eps, dclassified.ptr<unsigned int>());
+        device::lbp::connectedConmonents(candidates, classified, objects, minNeighbors_, grouping_eps, dclassified.ptr<unsigned int>());

        cudaSafeCall( cudaMemcpy(&classified, dclassified.ptr(), sizeof(int), cudaMemcpyDeviceToHost) );
        cudaSafeCall( cudaDeviceSynchronize() );
-        return classified;
+
+        if (classified > 0)
+        {
+            objects.colRange(0, classified).copyTo(_objects);
+        }
+        else
+        {
+            _objects.release();
+        }
    }

-    virtual cv::Size getClassifierCvSize() const { return NxM; }
-
-    bool read(const String& classifierAsXml)
+    void LbpCascade_Impl::convert(OutputArray _gpu_objects, std::vector<Rect>& objects)
    {
-        FileStorage fs(classifierAsXml, FileStorage::READ);
-        return fs.isOpened() ? read(fs.getFirstTopLevelNode()) : false;
-    }
-
-private:
-
-    void allocateBuffers(cv::Size frame)
+        if (_gpu_objects.empty())
        {
-        if (frame == cv::Size())
+            objects.clear();
            return;
+        }

-        if (resuzeBuffer.empty() || frame.width > resuzeBuffer.cols || frame.height > resuzeBuffer.rows)
+        Mat gpu_objects;
+        if (_gpu_objects.kind() == _InputArray::CUDA_GPU_MAT)
        {
-            resuzeBuffer.create(frame, CV_8UC1);
-
-            integral.create(frame.height + 1, integralFactor * (frame.width + 1), CV_32SC1);
-
-#ifdef HAVE_OPENCV_CUDALEGACY
-            NcvSize32u roiSize;
-            roiSize.width = frame.width;
-            roiSize.height = frame.height;
-
-            cudaDeviceProp prop;
-            cudaSafeCall( cudaGetDeviceProperties(&prop, cv::cuda::getDevice()) );
-
-            Ncv32u bufSize;
-            ncvSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize, prop) );
-            integralBuffer.create(1, bufSize, CV_8UC1);
-#endif
-
-            candidates.create(1 , frame.width >> 1, CV_32SC4);
+            _gpu_objects.getGpuMat().download(gpu_objects);
        }
+        else
+        {
+            gpu_objects = _gpu_objects.getMat();
        }

-    bool read(const FileNode &root)
+        CV_Assert( gpu_objects.rows == 1 );
+        CV_Assert( gpu_objects.type() == DataType<Rect>::type );
+
+        Rect* ptr = gpu_objects.ptr<Rect>();
+        objects.assign(ptr, ptr + gpu_objects.cols);
+    }
+
+    bool LbpCascade_Impl::load(const FileNode &root)
    {
        const char *CUDA_CC_STAGE_TYPE       = "stageType";
        const char *CUDA_CC_FEATURE_TYPE     = "featureType";
@@ -666,92 +772,90 @@ private:
        return true;
    }

-    enum stage { BOOST = 0 };
-    enum feature { LBP = 1, HAAR = 2 };
-    static const stage stageType = BOOST;
-    static const feature featureType = LBP;
+    void LbpCascade_Impl::allocateBuffers(cv::Size frame)
+    {
+        if (frame == cv::Size())
+            return;

-    cv::Size NxM;
-    bool isStumps;
-    int ncategories;
-    int subsetSize;
-    int nodeStep;
+        if (resuzeBuffer.empty() || frame.width > resuzeBuffer.cols || frame.height > resuzeBuffer.rows)
+        {
+            resuzeBuffer.create(frame, CV_8UC1);

-    // gpu representation of classifier
-    GpuMat stage_mat;
-    GpuMat trees_mat;
-    GpuMat nodes_mat;
-    GpuMat leaves_mat;
-    GpuMat subsets_mat;
-    GpuMat features_mat;
+            integral.create(frame.height + 1, integralFactor * (frame.width + 1), CV_32SC1);

-    GpuMat integral;
-    GpuMat integralBuffer;
-    GpuMat resuzeBuffer;
+        #ifdef HAVE_OPENCV_CUDALEGACY
+            NcvSize32u roiSize;
+            roiSize.width = frame.width;
+            roiSize.height = frame.height;

-    GpuMat candidates;
-    static const int integralFactor = 4;
-};
+            cudaDeviceProp prop;
+            cudaSafeCall( cudaGetDeviceProperties(&prop, cv::cuda::getDevice()) );

-cv::cuda::CascadeClassifier_CUDA::CascadeClassifier_CUDA()
-: findLargestObject(false), visualizeInPlace(false), impl(0) {}
+            Ncv32u bufSize;
+            ncvSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize, prop) );
+            integralBuffer.create(1, bufSize, CV_8UC1);
+        #endif

-cv::cuda::CascadeClassifier_CUDA::CascadeClassifier_CUDA(const String& filename)
-: findLargestObject(false), visualizeInPlace(false), impl(0) { load(filename); }
+            candidates.create(1 , frame.width >> 1, CV_32SC4);
+        }
+    }

-cv::cuda::CascadeClassifier_CUDA::~CascadeClassifier_CUDA() { release(); }
-
-void cv::cuda::CascadeClassifier_CUDA::release() { if (impl) { delete impl; impl = 0; } }
-
-bool cv::cuda::CascadeClassifier_CUDA::empty() const { return impl == 0; }
-
-Size cv::cuda::CascadeClassifier_CUDA::getClassifierSize() const
-{
-    return this->empty() ? Size() : impl->getClassifierCvSize();
 }

-int cv::cuda::CascadeClassifier_CUDA::detectMultiScale( const GpuMat& image, GpuMat& objectsBuf, double scaleFactor, int minNeighbors, Size minSize)
-{
-    CV_Assert( !this->empty());
-    return impl->process(image, objectsBuf, (float)scaleFactor, minNeighbors, findLargestObject, visualizeInPlace, minSize, cv::Size());
-}
+//
+// create
+//

-int cv::cuda::CascadeClassifier_CUDA::detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, Size maxObjectSize, Size minSize, double scaleFactor, int minNeighbors)
+Ptr<cuda::CascadeClassifier> cv::cuda::CascadeClassifier::create(const String& filename)
 {
-    CV_Assert( !this->empty());
-    return impl->process(image, objectsBuf, (float)scaleFactor, minNeighbors, findLargestObject, visualizeInPlace, minSize, maxObjectSize);
-}
-
-bool cv::cuda::CascadeClassifier_CUDA::load(const String& filename)
-{
-    release();
-
    String fext = filename.substr(filename.find_last_of(".") + 1);
    fext = fext.toLowerCase();

    if (fext == "nvbin")
    {
-        impl = new HaarCascade();
-        return impl->read(filename);
+    #ifndef HAVE_OPENCV_CUDALEGACY
+        CV_Error(Error::StsUnsupportedFormat, "OpenCV CUDA objdetect was built without HaarCascade");
+        return Ptr<cuda::CascadeClassifier>();
+    #else
+        return makePtr<HaarCascade_Impl>(filename);
+    #endif
    }

    FileStorage fs(filename, FileStorage::READ);

    if (!fs.isOpened())
    {
-        impl = new HaarCascade();
-        return impl->read(filename);
+    #ifndef HAVE_OPENCV_CUDALEGACY
+        CV_Error(Error::StsUnsupportedFormat, "OpenCV CUDA objdetect was built without HaarCascade");
+        return Ptr<cuda::CascadeClassifier>();
+    #else
+        return makePtr<HaarCascade_Impl>(filename);
+    #endif
    }

    const char *CUDA_CC_LBP = "LBP";
    String featureTypeStr = (String)fs.getFirstTopLevelNode()["featureType"];
    if (featureTypeStr == CUDA_CC_LBP)
-        impl = new LbpCascade();
+    {
+        return makePtr<LbpCascade_Impl>(fs);
+    }
    else
-        impl = new HaarCascade();
+    {
+    #ifndef HAVE_OPENCV_CUDALEGACY
+        CV_Error(Error::StsUnsupportedFormat, "OpenCV CUDA objdetect was built without HaarCascade");
+        return Ptr<cuda::CascadeClassifier>();
+    #else
+        return makePtr<HaarCascade_Impl>(filename);
+    #endif
+    }

-    impl->read(filename);
-    return !this->empty();
+    CV_Error(Error::StsUnsupportedFormat, "Unsupported format for CUDA CascadeClassifier");
+    return Ptr<cuda::CascadeClassifier>();
+}
+
+Ptr<cuda::CascadeClassifier> cv::cuda::CascadeClassifier::create(const FileStorage& file)
+{
+    return makePtr<LbpCascade_Impl>(file);
 }

 #endif
--- a/modules/cudaobjdetect/src/cuda/hog.cu
+++ b/modules/cudaobjdetect/src/cuda/hog.cu
--- a/modules/cudaobjdetect/src/cuda/lbp.cu
+++ b/modules/cudaobjdetect/src/cuda/lbp.cu
--- a/modules/cudaobjdetect/src/cuda/lbp.hpp
+++ b/modules/cudaobjdetect/src/cuda/lbp.hpp
--- a/modules/cudaobjdetect/src/hog.cpp
+++ b/modules/cudaobjdetect/src/hog.cpp
--- a/modules/cudaobjdetect/src/precomp.hpp
+++ b/modules/cudaobjdetect/src/precomp.hpp
@@ -0,0 +1,62 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_PRECOMP_H__
+#define __OPENCV_PRECOMP_H__
+
+#include <limits>
+
+#include "opencv2/cudaobjdetect.hpp"
+#include "opencv2/cudaarithm.hpp"
+#include "opencv2/cudawarping.hpp"
+#include "opencv2/objdetect.hpp"
+
+#include "opencv2/core/private.cuda.hpp"
+#include "opencv2/core/utility.hpp"
+
+#include "opencv2/opencv_modules.hpp"
+
+#ifdef HAVE_OPENCV_CUDALEGACY
+#  include "opencv2/cudalegacy/private.hpp"
+#endif
+
+#endif /* __OPENCV_PRECOMP_H__ */
--- a/modules/cudaobjdetect/test/test_main.cpp
+++ b/modules/cudaobjdetect/test/test_main.cpp
@@ -0,0 +1,45 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "test_precomp.hpp"
+
+CV_CUDA_TEST_MAIN("gpu")
--- a/modules/cudaobjdetect/test/test_objdetect.cpp
+++ b/modules/cudaobjdetect/test/test_objdetect.cpp
@@ -48,9 +48,10 @@ using namespace cvtest;

 //#define DUMP

-struct HOG : testing::TestWithParam<cv::cuda::DeviceInfo>, cv::cuda::HOGDescriptor
+struct HOG : testing::TestWithParam<cv::cuda::DeviceInfo>
 {
    cv::cuda::DeviceInfo devInfo;
+    cv::Ptr<cv::cuda::HOG> hog;

 #ifdef DUMP
    std::ofstream f;
@@ -69,23 +70,13 @@ struct HOG : testing::TestWithParam<cv::cuda::DeviceInfo>, cv::cuda::HOGDescript
        devInfo = GetParam();

        cv::cuda::setDevice(devInfo.deviceID());
+
+        hog = cv::cuda::HOG::create();
    }

 #ifdef DUMP
-    void dump(const cv::Mat& blockHists, const std::vector<cv::Point>& locations)
+    void dump(const std::vector<cv::Point>& locations)
    {
-        f.write((char*)&blockHists.rows, sizeof(blockHists.rows));
-        f.write((char*)&blockHists.cols, sizeof(blockHists.cols));
-
-        for (int i = 0; i < blockHists.rows; ++i)
-        {
-            for (int j = 0; j < blockHists.cols; ++j)
-            {
-                float val = blockHists.at<float>(i, j);
-                f.write((char*)&val, sizeof(val));
-            }
-        }
-
        int nlocations = locations.size();
        f.write((char*)&nlocations, sizeof(nlocations));

@@ -93,21 +84,18 @@ struct HOG : testing::TestWithParam<cv::cuda::DeviceInfo>, cv::cuda::HOGDescript
            f.write((char*)&locations[i], sizeof(locations[i]));
    }
 #else
-    void compare(const cv::Mat& blockHists, const std::vector<cv::Point>& locations)
+    void compare(const std::vector<cv::Point>& locations)
    {
+        // skip block_hists check
        int rows, cols;
        f.read((char*)&rows, sizeof(rows));
        f.read((char*)&cols, sizeof(cols));
-        ASSERT_EQ(rows, blockHists.rows);
-        ASSERT_EQ(cols, blockHists.cols);
-
-        for (int i = 0; i < blockHists.rows; ++i)
+        for (int i = 0; i < rows; ++i)
        {
-            for (int j = 0; j < blockHists.cols; ++j)
+            for (int j = 0; j < cols; ++j)
            {
                float val;
                f.read((char*)&val, sizeof(val));
-                ASSERT_NEAR(val, blockHists.at<float>(i, j), 1e-3);
            }
        }

@@ -126,54 +114,41 @@ struct HOG : testing::TestWithParam<cv::cuda::DeviceInfo>, cv::cuda::HOGDescript

    void testDetect(const cv::Mat& img)
    {
-        gamma_correction = false;
-        setSVMDetector(cv::cuda::HOGDescriptor::getDefaultPeopleDetector());
+        hog->setGammaCorrection(false);
+        hog->setSVMDetector(hog->getDefaultPeopleDetector());

        std::vector<cv::Point> locations;

        // Test detect
-        detect(loadMat(img), locations, 0);
+        hog->detect(loadMat(img), locations);

 #ifdef DUMP
-        dump(cv::Mat(block_hists), locations);
+        dump(locations);
 #else
-        compare(cv::Mat(block_hists), locations);
+        compare(locations);
 #endif

        // Test detect on smaller image
        cv::Mat img2;
        cv::resize(img, img2, cv::Size(img.cols / 2, img.rows / 2));
-        detect(loadMat(img2), locations, 0);
+        hog->detect(loadMat(img2), locations);

 #ifdef DUMP
-        dump(cv::Mat(block_hists), locations);
+        dump(locations);
 #else
-        compare(cv::Mat(block_hists), locations);
+        compare(locations);
 #endif

        // Test detect on greater image
        cv::resize(img, img2, cv::Size(img.cols * 2, img.rows * 2));
-        detect(loadMat(img2), locations, 0);
+        hog->detect(loadMat(img2), locations);

 #ifdef DUMP
-        dump(cv::Mat(block_hists), locations);
+        dump(locations);
 #else
-        compare(cv::Mat(block_hists), locations);
+        compare(locations);
 #endif
    }
-
-    // Does not compare border value, as interpolation leads to delta
-    void compare_inner_parts(cv::Mat d1, cv::Mat d2)
-    {
-        for (int i = 1; i < blocks_per_win_y - 1; ++i)
-            for (int j = 1; j < blocks_per_win_x - 1; ++j)
-                for (int k = 0; k < block_hist_size; ++k)
-                {
-                    float a = d1.at<float>(0, (i * blocks_per_win_x + j) * block_hist_size);
-                    float b = d2.at<float>(0, (i * blocks_per_win_x + j) * block_hist_size);
-                    ASSERT_FLOAT_EQ(a, b);
-                }
-    }
 };

 // desabled while resize does not fixed
@@ -182,13 +157,8 @@ CUDA_TEST_P(HOG, DISABLED_Detect)
    cv::Mat img_rgb = readImage("hog/road.png");
    ASSERT_FALSE(img_rgb.empty());

-#ifdef DUMP
    f.open((std::string(cvtest::TS::ptr()->get_data_path()) + "hog/expected_output.bin").c_str(), std::ios_base::binary);
    ASSERT_TRUE(f.is_open());
-#else
-    f.open((std::string(cvtest::TS::ptr()->get_data_path()) + "hog/expected_output.bin").c_str(), std::ios_base::binary);
-    ASSERT_TRUE(f.is_open());
-#endif

    // Test on color image
    cv::Mat img;
@@ -198,8 +168,6 @@ CUDA_TEST_P(HOG, DISABLED_Detect)
    // Test on gray image
    cv::cvtColor(img_rgb, img, cv::COLOR_BGR2GRAY);
    testDetect(img);
-
-    f.close();
 }

 CUDA_TEST_P(HOG, GetDescriptors)
@@ -216,8 +184,14 @@ CUDA_TEST_P(HOG, GetDescriptors)

    // Convert train images into feature vectors (train table)
    cv::cuda::GpuMat descriptors, descriptors_by_cols;
-    getDescriptors(d_img, win_size, descriptors, DESCR_FORMAT_ROW_BY_ROW);
-    getDescriptors(d_img, win_size, descriptors_by_cols, DESCR_FORMAT_COL_BY_COL);
+
+    hog->setWinStride(Size(64, 128));
+
+    hog->setDescriptorFormat(cv::cuda::HOG::DESCR_FORMAT_ROW_BY_ROW);
+    hog->compute(d_img, descriptors);
+
+    hog->setDescriptorFormat(cv::cuda::HOG::DESCR_FORMAT_COL_BY_COL);
+    hog->compute(d_img, descriptors_by_cols);

    // Check size of the result train table
    wins_per_img_x = 3;
@@ -242,48 +216,6 @@ CUDA_TEST_P(HOG, GetDescriptors)
                    ASSERT_EQ(l[(y * blocks_per_win_x + x) * block_hist_size + k],
                              r[(x * blocks_per_win_y + y) * block_hist_size + k]);
    }
-
-    /* Now we want to extract the same feature vectors, but from single images. NOTE: results will
-    be defferent, due to border values interpolation. Using of many small images is slower, however we
-    wont't call getDescriptors and will use computeBlockHistograms instead of. computeBlockHistograms
-    works good, it can be checked in the gpu_hog sample */
-
-    img_rgb = readImage("hog/positive1.png");
-    ASSERT_TRUE(!img_rgb.empty());
-    cv::cvtColor(img_rgb, img, cv::COLOR_BGR2BGRA);
-    computeBlockHistograms(cv::cuda::GpuMat(img));
-    // Everything is fine with interpolation for left top subimage
-    ASSERT_EQ(0.0, cv::norm((cv::Mat)block_hists, (cv::Mat)descriptors.rowRange(0, 1)));
-
-    img_rgb = readImage("hog/positive2.png");
-    ASSERT_TRUE(!img_rgb.empty());
-    cv::cvtColor(img_rgb, img, cv::COLOR_BGR2BGRA);
-    computeBlockHistograms(cv::cuda::GpuMat(img));
-    compare_inner_parts(cv::Mat(block_hists), cv::Mat(descriptors.rowRange(1, 2)));
-
-    img_rgb = readImage("hog/negative1.png");
-    ASSERT_TRUE(!img_rgb.empty());
-    cv::cvtColor(img_rgb, img, cv::COLOR_BGR2BGRA);
-    computeBlockHistograms(cv::cuda::GpuMat(img));
-    compare_inner_parts(cv::Mat(block_hists), cv::Mat(descriptors.rowRange(2, 3)));
-
-    img_rgb = readImage("hog/negative2.png");
-    ASSERT_TRUE(!img_rgb.empty());
-    cv::cvtColor(img_rgb, img, cv::COLOR_BGR2BGRA);
-    computeBlockHistograms(cv::cuda::GpuMat(img));
-    compare_inner_parts(cv::Mat(block_hists), cv::Mat(descriptors.rowRange(3, 4)));
-
-    img_rgb = readImage("hog/positive3.png");
-    ASSERT_TRUE(!img_rgb.empty());
-    cv::cvtColor(img_rgb, img, cv::COLOR_BGR2BGRA);
-    computeBlockHistograms(cv::cuda::GpuMat(img));
-    compare_inner_parts(cv::Mat(block_hists), cv::Mat(descriptors.rowRange(4, 5)));
-
-    img_rgb = readImage("hog/negative3.png");
-    ASSERT_TRUE(!img_rgb.empty());
-    cv::cvtColor(img_rgb, img, cv::COLOR_BGR2BGRA);
-    computeBlockHistograms(cv::cuda::GpuMat(img));
-    compare_inner_parts(cv::Mat(block_hists), cv::Mat(descriptors.rowRange(5, 6)));
 }

 INSTANTIATE_TEST_CASE_P(CUDA_ObjDetect, HOG, ALL_DEVICES);
@@ -310,12 +242,12 @@ CUDA_TEST_P(CalTech, HOG)
    cv::cuda::GpuMat d_img(img);
    cv::Mat markedImage(img.clone());

-    cv::cuda::HOGDescriptor d_hog;
-    d_hog.setSVMDetector(cv::cuda::HOGDescriptor::getDefaultPeopleDetector());
-    d_hog.nlevels = d_hog.nlevels + 32;
+    cv::Ptr<cv::cuda::HOG> d_hog = cv::cuda::HOG::create();
+    d_hog->setSVMDetector(d_hog->getDefaultPeopleDetector());
+    d_hog->setNumLevels(d_hog->getNumLevels() + 32);

    std::vector<cv::Rect> found_locations;
-    d_hog.detectMultiScale(d_img, found_locations);
+    d_hog->detectMultiScale(d_img, found_locations);

 #if defined (LOG_CASCADE_STATISTIC)
    for (int i = 0; i < (int)found_locations.size(); i++)
@@ -326,7 +258,8 @@ CUDA_TEST_P(CalTech, HOG)
        cv::rectangle(markedImage, r , CV_RGB(255, 0, 0));
    }

-    cv::imshow("Res", markedImage); cv::waitKey();
+    cv::imshow("Res", markedImage);
+    cv::waitKey();
 #endif
 }

@@ -354,9 +287,15 @@ PARAM_TEST_CASE(LBP_Read_classifier, cv::cuda::DeviceInfo, int)

 CUDA_TEST_P(LBP_Read_classifier, Accuracy)
 {
-    cv::cuda::CascadeClassifier_CUDA classifier;
    std::string classifierXmlPath = std::string(cvtest::TS::ptr()->get_data_path()) + "lbpcascade/lbpcascade_frontalface.xml";
-    ASSERT_TRUE(classifier.load(classifierXmlPath));
+
+    cv::Ptr<cv::cuda::CascadeClassifier> d_cascade;
+
+    ASSERT_NO_THROW(
+        d_cascade = cv::cuda::CascadeClassifier::create(classifierXmlPath);
+    );
+
+    ASSERT_FALSE(d_cascade.empty());
 }

 INSTANTIATE_TEST_CASE_P(CUDA_ObjDetect, LBP_Read_classifier,
@@ -396,29 +335,28 @@ CUDA_TEST_P(LBP_classify, Accuracy)
    for (; it != rects.end(); ++it)
        cv::rectangle(markedImage, *it, cv::Scalar(255, 0, 0));

-    cv::cuda::CascadeClassifier_CUDA gpuClassifier;
-    ASSERT_TRUE(gpuClassifier.load(classifierXmlPath));
+    cv::Ptr<cv::cuda::CascadeClassifier> gpuClassifier =
+            cv::cuda::CascadeClassifier::create(classifierXmlPath);

-    cv::cuda::GpuMat gpu_rects;
    cv::cuda::GpuMat tested(grey);
-    int count = gpuClassifier.detectMultiScale(tested, gpu_rects);
+    cv::cuda::GpuMat gpu_rects_buf;
+    gpuClassifier->detectMultiScale(tested, gpu_rects_buf);
+
+    std::vector<cv::Rect> gpu_rects;
+    gpuClassifier->convert(gpu_rects_buf, gpu_rects);

 #if defined (LOG_CASCADE_STATISTIC)
-    cv::Mat downloaded(gpu_rects);
-    const cv::Rect* faces = downloaded.ptr<cv::Rect>();
-    for (int i = 0; i < count; i++)
+    for (size_t i = 0; i < gpu_rects.size(); i++)
    {
-        cv::Rect r = faces[i];
+        cv::Rect r = gpu_rects[i];

        std::cout << r.x << " " << r.y  << " " << r.width << " " << r.height << std::endl;
        cv::rectangle(markedImage, r , CV_RGB(255, 0, 0));
    }
-#endif

-#if defined (LOG_CASCADE_STATISTIC)
-    cv::imshow("Res", markedImage); cv::waitKey();
+    cv::imshow("Res", markedImage);
+    cv::waitKey();
 #endif
-    (void)count;
 }

 INSTANTIATE_TEST_CASE_P(CUDA_ObjDetect, LBP_classify,
--- a/modules/cudaobjdetect/test/test_precomp.hpp
+++ b/modules/cudaobjdetect/test/test_precomp.hpp
@@ -0,0 +1,64 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifdef __GNUC__
+#  pragma GCC diagnostic ignored "-Wmissing-declarations"
+#  if defined __clang__ || defined __APPLE__
+#    pragma GCC diagnostic ignored "-Wmissing-prototypes"
+#    pragma GCC diagnostic ignored "-Wextra"
+#  endif
+#endif
+
+#ifndef __OPENCV_TEST_PRECOMP_HPP__
+#define __OPENCV_TEST_PRECOMP_HPP__
+
+#include <fstream>
+
+#include "opencv2/ts.hpp"
+#include "opencv2/ts/cuda_test.hpp"
+
+#include "opencv2/cudaobjdetect.hpp"
+#include "opencv2/objdetect.hpp"
+
+#include "cvconfig.h"
+
+#endif
--- a/samples/gpu/CMakeLists.txt
+++ b/samples/gpu/CMakeLists.txt
@@ -3,7 +3,7 @@ SET(OPENCV_CUDA_SAMPLES_REQUIRED_DEPS opencv_core opencv_flann opencv_imgproc op
                                     opencv_calib3d opencv_cuda opencv_superres
                                     opencv_cudaarithm opencv_cudafilters opencv_cudawarping opencv_cudaimgproc
                                     opencv_cudafeatures2d opencv_cudaoptflow opencv_cudabgsegm
-                                     opencv_cudastereo opencv_cudalegacy)
+                                     opencv_cudastereo opencv_cudalegacy opencv_cudaobjdetect)

 ocv_check_dependencies(${OPENCV_CUDA_SAMPLES_REQUIRED_DEPS})

--- a/samples/gpu/cascadeclassifier.cpp
+++ b/samples/gpu/cascadeclassifier.cpp
@@ -9,7 +9,7 @@
 #include "opencv2/objdetect/objdetect.hpp"
 #include "opencv2/highgui/highgui.hpp"
 #include "opencv2/imgproc/imgproc.hpp"
-#include "opencv2/cuda.hpp"
+#include "opencv2/cudaobjdetect.hpp"
 #include "opencv2/cudaimgproc.hpp"
 #include "opencv2/cudawarping.hpp"

@@ -173,13 +173,9 @@ int main(int argc, const char *argv[])
        }
    }

-    CascadeClassifier_CUDA cascade_gpu;
-    if (!cascade_gpu.load(cascadeName))
-    {
-        return cerr << "ERROR: Could not load cascade classifier \"" << cascadeName << "\"" << endl, help(), -1;
-    }
+    Ptr<cuda::CascadeClassifier> cascade_gpu = cuda::CascadeClassifier::create(cascadeName);

-    CascadeClassifier cascade_cpu;
+    cv::CascadeClassifier cascade_cpu;
    if (!cascade_cpu.load(cascadeName))
    {
        return cerr << "ERROR: Could not load cascade classifier \"" << cascadeName << "\"" << endl, help(), -1;
@@ -206,8 +202,8 @@ int main(int argc, const char *argv[])

    namedWindow("result", 1);

-    Mat frame, frame_cpu, gray_cpu, resized_cpu, faces_downloaded, frameDisp;
-    vector<Rect> facesBuf_cpu;
+    Mat frame, frame_cpu, gray_cpu, resized_cpu, frameDisp;
+    vector<Rect> faces;

    GpuMat frame_gpu, gray_gpu, resized_gpu, facesBuf_gpu;

@@ -218,7 +214,6 @@ int main(int argc, const char *argv[])
    bool filterRects = true;
    bool helpScreen = false;

-    int detections_num;
    for (;;)
    {
        if (isInputCamera || isInputVideo)
@@ -241,40 +236,26 @@ int main(int argc, const char *argv[])

        if (useGPU)
        {
-            //cascade_gpu.visualizeInPlace = true;
-            cascade_gpu.findLargestObject = findLargestObject;
+            cascade_gpu->setFindLargestObject(findLargestObject);
+            cascade_gpu->setScaleFactor(1.2);
+            cascade_gpu->setMinNeighbors((filterRects || findLargestObject) ? 4 : 0);

-            detections_num = cascade_gpu.detectMultiScale(resized_gpu, facesBuf_gpu, 1.2,
-                                                          (filterRects || findLargestObject) ? 4 : 0);
-            facesBuf_gpu.colRange(0, detections_num).download(faces_downloaded);
+            cascade_gpu->detectMultiScale(resized_gpu, facesBuf_gpu);
+            cascade_gpu->convert(facesBuf_gpu, faces);
        }
        else
        {
-            Size minSize = cascade_gpu.getClassifierSize();
-            cascade_cpu.detectMultiScale(resized_cpu, facesBuf_cpu, 1.2,
+            Size minSize = cascade_gpu->getClassifierSize();
+            cascade_cpu.detectMultiScale(resized_cpu, faces, 1.2,
                                         (filterRects || findLargestObject) ? 4 : 0,
                                         (findLargestObject ? CASCADE_FIND_BIGGEST_OBJECT : 0)
                                            | CASCADE_SCALE_IMAGE,
                                         minSize);
-            detections_num = (int)facesBuf_cpu.size();
        }

-        if (!useGPU && detections_num)
+        for (size_t i = 0; i < faces.size(); ++i)
        {
-            for (int i = 0; i < detections_num; ++i)
-            {
-                rectangle(resized_cpu, facesBuf_cpu[i], Scalar(255));
-            }
-        }
-
-        if (useGPU)
-        {
-            resized_gpu.download(resized_cpu);
-
-             for (int i = 0; i < detections_num; ++i)
-             {
-                rectangle(resized_cpu, faces_downloaded.ptr<cv::Rect>()[i], Scalar(255));
-             }
+            rectangle(resized_cpu, faces[i], Scalar(255));
        }

        tm.stop();
@@ -283,16 +264,15 @@ int main(int argc, const char *argv[])

        //print detections to console
        cout << setfill(' ') << setprecision(2);
-        cout << setw(6) << fixed << fps << " FPS, " << detections_num << " det";
-        if ((filterRects || findLargestObject) && detections_num > 0)
+        cout << setw(6) << fixed << fps << " FPS, " << faces.size() << " det";
+        if ((filterRects || findLargestObject) && !faces.empty())
        {
-            Rect *faceRects = useGPU ? faces_downloaded.ptr<Rect>() : &facesBuf_cpu[0];
-            for (int i = 0; i < min(detections_num, 2); ++i)
+            for (size_t i = 0; i < faces.size(); ++i)
            {
-                cout << ", [" << setw(4) << faceRects[i].x
-                     << ", " << setw(4) << faceRects[i].y
-                     << ", " << setw(4) << faceRects[i].width
-                     << ", " << setw(4) << faceRects[i].height << "]";
+                cout << ", [" << setw(4) << faces[i].x
+                     << ", " << setw(4) << faces[i].y
+                     << ", " << setw(4) << faces[i].width
+                     << ", " << setw(4) << faces[i].height << "]";
            }
        }
        cout << endl;
--- a/samples/gpu/hog.cpp
+++ b/samples/gpu/hog.cpp
@@ -5,7 +5,7 @@
 #include <iomanip>
 #include <stdexcept>
 #include <opencv2/core/utility.hpp>
-#include "opencv2/cuda.hpp"
+#include "opencv2/cudaobjdetect.hpp"
 #include "opencv2/highgui.hpp"
 #include "opencv2/objdetect.hpp"
 #include "opencv2/imgproc.hpp"
@@ -252,19 +252,13 @@ void App::run()
    Size win_size(args.win_width, args.win_width * 2); //(64, 128) or (48, 96)
    Size win_stride(args.win_stride_width, args.win_stride_height);

-    // Create HOG descriptors and detectors here
-    vector<float> detector;
-    if (win_size == Size(64, 128))
-        detector = cv::cuda::HOGDescriptor::getPeopleDetector64x128();
-    else
-        detector = cv::cuda::HOGDescriptor::getPeopleDetector48x96();
+    cv::Ptr<cv::cuda::HOG> gpu_hog = cv::cuda::HOG::create(win_size);
+    cv::HOGDescriptor cpu_hog(win_size, Size(16, 16), Size(8, 8), Size(8, 8), 9);

-    cv::cuda::HOGDescriptor gpu_hog(win_size, Size(16, 16), Size(8, 8), Size(8, 8), 9,
-                                   cv::cuda::HOGDescriptor::DEFAULT_WIN_SIGMA, 0.2, gamma_corr,
-                                   cv::cuda::HOGDescriptor::DEFAULT_NLEVELS);
-    cv::HOGDescriptor cpu_hog(win_size, Size(16, 16), Size(8, 8), Size(8, 8), 9, 1, -1,
-                              HOGDescriptor::L2Hys, 0.2, gamma_corr, cv::HOGDescriptor::DEFAULT_NLEVELS);
-    gpu_hog.setSVMDetector(detector);
+    // Create HOG descriptors and detectors here
+    Mat detector = gpu_hog->getDefaultPeopleDetector();
+
+    gpu_hog->setSVMDetector(detector);
    cpu_hog.setSVMDetector(detector);

    while (running)
@@ -315,9 +309,6 @@ void App::run()
            else img = img_aux;
            img_to_show = img;

-            gpu_hog.nlevels = nlevels;
-            cpu_hog.nlevels = nlevels;
-
            vector<Rect> found;

            // Perform HOG classification
@@ -325,11 +316,19 @@ void App::run()
            if (use_gpu)
            {
                gpu_img.upload(img);
-                gpu_hog.detectMultiScale(gpu_img, found, hit_threshold, win_stride,
+                gpu_hog->setNumLevels(nlevels);
+                gpu_hog->setHitThreshold(hit_threshold);
+                gpu_hog->setWinStride(win_stride);
+                gpu_hog->setScaleFactor(scale);
+                gpu_hog->setGroupThreshold(gr_threshold);
+                gpu_hog->detectMultiScale(gpu_img, found);
+            }
+            else
+            {
+                cpu_hog.nlevels = nlevels;
+                cpu_hog.detectMultiScale(img, found, hit_threshold, win_stride,
                                          Size(0, 0), scale, gr_threshold);
            }
-            else cpu_hog.detectMultiScale(img, found, hit_threshold, win_stride,
-                                          Size(0, 0), scale, gr_threshold);
            hogWorkEnd();

            // Draw positive classified windows