refactor CUDA HOG algorithm:

use abstract interface with hidden implementation
2015-01-14 18:18:51 +03:00
parent 0af7597d36
commit 8257dc3c1e
5 changed files with 1697 additions and 1720 deletions
--- a/modules/cudaobjdetect/include/opencv2/cudaobjdetect.hpp
+++ b/modules/cudaobjdetect/include/opencv2/cudaobjdetect.hpp
@@ -65,19 +65,8 @@ namespace cv { namespace cuda {
 // HOG (Histogram-of-Oriented-Gradients) Descriptor and Object Detector
 //
 //! Per-pyramid-level record exchanged with HOGDescriptor::computeConfidenceMultiScale
 //! (one element of its conf_out vector per level).
 struct CV_EXPORTS HOGConfidence
 {
   //! Scale of this level; computeConfidenceMultiScale reads it from each conf_out entry
   //! to decide how far the input image is shrunk for that level.
   double scale;
   //! Window positions reported for this level (passed as the `locations` output of
   //! computeConfidence).
   std::vector<Point> locations;
   //! Scores reported for this level (passed as the `confidences` output of
   //! computeConfidence); presumably parallel to `locations` - TODO confirm.
   std::vector<double> confidences;
   //! NOTE(review): not referenced anywhere in the visible code; meaning of the four
   //! slots is unknown from here - confirm before relying on it.
   std::vector<double> part_scores[4];
 };
 /** @brief The class implements Histogram of Oriented Gradients (@cite Dalal2005) object detector.
 Interfaces of all methods are kept similar to the CPU HOG descriptor and detector analogues as much
 as possible.
@note
    -   An example applying the HOG descriptor for people detection can be found at
        opencv_source_code/samples/cpp/peopledetect.cpp
@@ -86,11 +75,14 @@ as possible.
    -   (Python) An example applying the HOG descriptor for people detection can be found at
        opencv_source_code/samples/python2/peopledetect.py
 */
-struct CV_EXPORTS HOGDescriptor
+class CV_EXPORTS HOG : public cv::Algorithm
 {
-    enum { DEFAULT_WIN_SIGMA = -1 };
+public:
-    enum { DEFAULT_NLEVELS = 64 };
+    enum
-    enum { DESCR_FORMAT_ROW_BY_ROW, DESCR_FORMAT_COL_BY_COL };
+    {
        DESCR_FORMAT_ROW_BY_ROW,
        DESCR_FORMAT_COL_BY_COL
    };
    /** @brief Creates the HOG descriptor and detector.
@@ -99,132 +91,105 @@ struct CV_EXPORTS HOGDescriptor
    @param block_stride Block stride. It must be a multiple of cell size.
    @param cell_size Cell size. Only (8, 8) is supported for now.
    @param nbins Number of bins. Only 9 bins per cell are supported for now.
    @param win_sigma Gaussian smoothing window parameter.
    @param threshold_L2hys L2-Hys normalization method shrinkage.
    @param gamma_correction Flag to specify whether the gamma correction preprocessing is required or
    not.
    @param nlevels Maximum number of detection window increases.
     */
-    HOGDescriptor(Size win_size=Size(64, 128), Size block_size=Size(16, 16),
+    static Ptr<HOG> create(Size win_size = Size(64, 128),
-                  Size block_stride=Size(8, 8), Size cell_size=Size(8, 8),
+                           Size block_size = Size(16, 16),
-                  int nbins=9, double win_sigma=DEFAULT_WIN_SIGMA,
+                           Size block_stride = Size(8, 8),
-                  double threshold_L2hys=0.2, bool gamma_correction=true,
+                           Size cell_size = Size(8, 8),
-                  int nlevels=DEFAULT_NLEVELS);
+                           int nbins = 9);
    //! Gaussian smoothing window parameter.
    virtual void setWinSigma(double win_sigma) = 0;
    virtual double getWinSigma() const = 0;
    //! L2-Hys normalization method shrinkage.
    virtual void setL2HysThreshold(double threshold_L2hys) = 0;
    virtual double getL2HysThreshold() const = 0;
    //! Flag to specify whether the gamma correction preprocessing is required or not.
    virtual void setGammaCorrection(bool gamma_correction) = 0;
    virtual bool getGammaCorrection() const = 0;
    //! Maximum number of detection window increases.
    virtual void setNumLevels(int nlevels) = 0;
    virtual int getNumLevels() const = 0;
    //! Threshold for the distance between features and SVM classifying plane.
    //! Usually it is 0 and should be specified in the detector coefficients (as the last free
    //! coefficient). But if the free coefficient is omitted (which is allowed), you can specify it
    //! manually here.
    virtual void setHitThreshold(double hit_threshold) = 0;
    virtual double getHitThreshold() const = 0;
    //! Window stride. It must be a multiple of block stride.
    virtual void setWinStride(Size win_stride) = 0;
    virtual Size getWinStride() const = 0;
    //! Coefficient of the detection window increase.
    virtual void setScaleFactor(double scale0) = 0;
    virtual double getScaleFactor() const = 0;
    //! Coefficient to regulate the similarity threshold. When detected, some
    //! objects can be covered by many rectangles. 0 means not to perform grouping.
    //! See groupRectangles.
    virtual void setGroupThreshold(int group_threshold) = 0;
    virtual int getGroupThreshold() const = 0;
    //! Descriptor storage format:
    //!   - **DESCR_FORMAT_ROW_BY_ROW** - Row-major order.
    //!   - **DESCR_FORMAT_COL_BY_COL** - Column-major order.
    virtual void setDescriptorFormat(int descr_format) = 0;
    virtual int getDescriptorFormat() const = 0;
    /** @brief Returns the number of coefficients required for the classification.
     */
-    size_t getDescriptorSize() const;
+    virtual size_t getDescriptorSize() const = 0;
    /** @brief Returns the block histogram size.
     */
-    size_t getBlockHistogramSize() const;
+    virtual size_t getBlockHistogramSize() const = 0;
    /** @brief Sets coefficients for the linear SVM classifier.
     */
-    void setSVMDetector(const std::vector<float>& detector);
+    virtual void setSVMDetector(InputArray detector) = 0;
-    /** @brief Returns coefficients of the classifier trained for people detection (for default window size).
+    /** @brief Returns coefficients of the classifier trained for people detection.
     */
-    static std::vector<float> getDefaultPeopleDetector();
+    virtual Mat getDefaultPeopleDetector() const = 0;
    /** @brief Returns coefficients of the classifier trained for people detection (for 48x96 windows).
    */
    static std::vector<float> getPeopleDetector48x96();
    /** @brief Returns coefficients of the classifier trained for people detection (for 64x128 windows).
    */
    static std::vector<float> getPeopleDetector64x128();
    /** @brief Performs object detection without a multi-scale window.
    @param img Source image. CV_8UC1 and CV_8UC4 types are supported for now.
    @param found_locations Left-top corner points of detected objects boundaries.
-    @param hit_threshold Threshold for the distance between features and SVM classifying plane.
+    @param confidences Optional output array for confidences.
    Usually it is 0 and should be specified in the detector coefficients (as the last free
    coefficient). But if the free coefficient is omitted (which is allowed), you can specify it
    manually here.
    @param win_stride Window stride. It must be a multiple of block stride.
    @param padding Mock parameter to keep the CPU interface compatibility. It must be (0,0).
     */
-    void detect(const GpuMat& img, std::vector<Point>& found_locations,
+    virtual void detect(InputArray img,
-                double hit_threshold=0, Size win_stride=Size(),
+                        std::vector<Point>& found_locations,
-                Size padding=Size());
+                        std::vector<double>* confidences = NULL) = 0;
    /** @brief Performs object detection with a multi-scale window.
    @param img Source image. See cuda::HOG::detect for type limitations.
    @param found_locations Detected objects boundaries.
    @param confidences Optional output array for confidences.
    @param hit_threshold Threshold for the distance between features and SVM classifying plane. See
    cuda::HOGDescriptor::detect for details.
    @param win_stride Window stride. It must be a multiple of block stride.
    @param padding Mock parameter to keep the CPU interface compatibility. It must be (0,0).
    @param scale0 Coefficient of the detection window increase.
    @param group_threshold Coefficient to regulate the similarity threshold. When detected, some
    objects can be covered by many rectangles. 0 means not to perform grouping. See groupRectangles .
     */
-    void detectMultiScale(const GpuMat& img, std::vector<Rect>& found_locations,
+    virtual void detectMultiScale(InputArray img,
-                          double hit_threshold=0, Size win_stride=Size(),
+                                  std::vector<Rect>& found_locations,
-                          Size padding=Size(), double scale0=1.05,
+                                  std::vector<double>* confidences = NULL) = 0;
                          int group_threshold=2);
    void computeConfidence(const GpuMat& img, std::vector<Point>& hits, double hit_threshold,
                                                Size win_stride, Size padding, std::vector<Point>& locations, std::vector<double>& confidences);
    void computeConfidenceMultiScale(const GpuMat& img, std::vector<Rect>& found_locations,
                                                                    double hit_threshold, Size win_stride, Size padding,
                                                                    std::vector<HOGConfidence> &conf_out, int group_threshold);
    /** @brief Returns block descriptors computed for the whole image.
    @param img Source image. See cuda::HOG::detect for type limitations.
    @param win_stride Window stride. It must be a multiple of block stride.
    @param descriptors 2D array of descriptors.
-    @param descr_format Descriptor storage format:
+    @param stream CUDA stream.
    -   **DESCR_FORMAT_ROW_BY_ROW** - Row-major order.
    -   **DESCR_FORMAT_COL_BY_COL** - Column-major order.
    The function is mainly used to learn the classifier.
     */
-    void getDescriptors(const GpuMat& img, Size win_stride,
+    virtual void compute(InputArray img,
-                        GpuMat& descriptors,
+                         OutputArray descriptors,
-                        int descr_format=DESCR_FORMAT_COL_BY_COL);
+                         Stream& stream = Stream::Null()) = 0;
    Size win_size;
    Size block_size;
    Size block_stride;
    Size cell_size;
    int nbins;
    double win_sigma;
    double threshold_L2hys;
    bool gamma_correction;
    int nlevels;
 protected:
    void computeBlockHistograms(const GpuMat& img);
    void computeGradient(const GpuMat& img, GpuMat& grad, GpuMat& qangle);
    double getWinSigma() const;
    bool checkDetectorSize() const;
    static int numPartsWithin(int size, int part_size, int stride);
    static Size numPartsWithin(Size size, Size part_size, Size stride);
    // Coefficients of the separating plane
    float free_coef;
    GpuMat detector;
    // Results of the last classification step
    GpuMat labels, labels_buf;
    Mat labels_host;
    // Results of the last histogram evaluation step
    GpuMat block_hists, block_hists_buf;
    // Gradient computation results
    GpuMat grad, qangle, grad_buf, qangle_buf;
    // returns subbuffer with required size, reallocates buffer if necessary.
    static GpuMat getBuffer(const Size& sz, int type, GpuMat& buf);
    static GpuMat getBuffer(int rows, int cols, int type, GpuMat& buf);
    std::vector<GpuMat> image_scales;
 };
 //
--- a/modules/cudaobjdetect/perf/perf_objdetect.cpp
+++ b/modules/cudaobjdetect/perf/perf_objdetect.cpp
@@ -71,10 +71,10 @@ PERF_TEST_P(Image, ObjDetect_HOG,
        const cv::cuda::GpuMat d_img(img);
        std::vector<cv::Rect> gpu_found_locations;
-        cv::cuda::HOGDescriptor d_hog;
+        cv::Ptr<cv::cuda::HOG> d_hog = cv::cuda::HOG::create();
-        d_hog.setSVMDetector(cv::cuda::HOGDescriptor::getDefaultPeopleDetector());
+        d_hog->setSVMDetector(d_hog->getDefaultPeopleDetector());
-        TEST_CYCLE() d_hog.detectMultiScale(d_img, gpu_found_locations);
+        TEST_CYCLE() d_hog->detectMultiScale(d_img, gpu_found_locations);
        SANITY_CHECK(gpu_found_locations);
    }
@@ -82,8 +82,10 @@ PERF_TEST_P(Image, ObjDetect_HOG,
    {
        std::vector<cv::Rect> cpu_found_locations;
        cv::Ptr<cv::cuda::HOG> d_hog = cv::cuda::HOG::create();
        cv::HOGDescriptor hog;
-        hog.setSVMDetector(cv::cuda::HOGDescriptor::getDefaultPeopleDetector());
+        hog.setSVMDetector(d_hog->getDefaultPeopleDetector());
        TEST_CYCLE() hog.detectMultiScale(img, cpu_found_locations);
--- a/modules/cudaobjdetect/src/hog.cpp
+++ b/modules/cudaobjdetect/src/hog.cpp
@@ -42,23 +42,12 @@
 #include "precomp.hpp"
 using namespace cv;
 using namespace cv::cuda;
 #if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
-cv::cuda::HOGDescriptor::HOGDescriptor(Size, Size, Size, Size, int, double, double, bool, int) { throw_no_cuda(); }
+Ptr<cuda::HOG> cv::cuda::HOG::create(Size, Size, Size, Size, int) { throw_no_cuda(); return Ptr<cuda::HOG>(); }
 size_t cv::cuda::HOGDescriptor::getDescriptorSize() const { throw_no_cuda(); return 0; }
 size_t cv::cuda::HOGDescriptor::getBlockHistogramSize() const { throw_no_cuda(); return 0; }
 double cv::cuda::HOGDescriptor::getWinSigma() const { throw_no_cuda(); return 0; }
 bool cv::cuda::HOGDescriptor::checkDetectorSize() const { throw_no_cuda(); return false; }
 void cv::cuda::HOGDescriptor::setSVMDetector(const std::vector<float>&) { throw_no_cuda(); }
 void cv::cuda::HOGDescriptor::detect(const GpuMat&, std::vector<Point>&, double, Size, Size) { throw_no_cuda(); }
 void cv::cuda::HOGDescriptor::detectMultiScale(const GpuMat&, std::vector<Rect>&, double, Size, Size, double, int) { throw_no_cuda(); }
 void cv::cuda::HOGDescriptor::computeBlockHistograms(const GpuMat&) { throw_no_cuda(); }
 void cv::cuda::HOGDescriptor::getDescriptors(const GpuMat&, Size, GpuMat&, int) { throw_no_cuda(); }
 std::vector<float> cv::cuda::HOGDescriptor::getDefaultPeopleDetector() { throw_no_cuda(); return std::vector<float>(); }
 std::vector<float> cv::cuda::HOGDescriptor::getPeopleDetector48x96() { throw_no_cuda(); return std::vector<float>(); }
 std::vector<float> cv::cuda::HOGDescriptor::getPeopleDetector64x128() { throw_no_cuda(); return std::vector<float>(); }
 void cv::cuda::HOGDescriptor::computeConfidence(const GpuMat&, std::vector<Point>&, double, Size, Size, std::vector<Point>&, std::vector<double>&) { throw_no_cuda(); }
 void cv::cuda::HOGDescriptor::computeConfidenceMultiScale(const GpuMat&, std::vector<Rect>&, double, Size, Size, std::vector<HOGConfidence>&, int) { throw_no_cuda(); }
 #else
@@ -102,244 +91,323 @@ namespace cv { namespace cuda { namespace device
    }
 }}}
-using namespace ::cv::cuda::device;
+using namespace cv::cuda::device;
-cv::cuda::HOGDescriptor::HOGDescriptor(Size win_size_, Size block_size_, Size block_stride_, Size cell_size_,
+namespace
-                                      int nbins_, double win_sigma_, double threshold_L2hys_, bool gamma_correction_, int nlevels_)
+{
-        : win_size(win_size_),
+    class HOG_Impl : public cv::cuda::HOG
-          block_size(block_size_),
+    {
-          block_stride(block_stride_),
+    public:
-          cell_size(cell_size_),
+        HOG_Impl(Size win_size,
-          nbins(nbins_),
+                 Size block_size,
-          win_sigma(win_sigma_),
+                 Size block_stride,
-          threshold_L2hys(threshold_L2hys_),
+                 Size cell_size,
-          gamma_correction(gamma_correction_),
+                 int nbins);
-          nlevels(nlevels_)
+
        // Accessors for the tunable detector parameters. Every inline setter below simply
        // stores its argument in the matching `*_` member; no validation happens here.
        // Initial values come from the HOG_Impl constructor's initializer list.

        // Gaussian window sigma (constructor default -1.0). getWinSigma() is defined out of
        // line: it returns the stored value when it is >= 0, otherwise
        // (block_size_.width + block_size_.height) / 8.0.
        virtual void setWinSigma(double win_sigma) { win_sigma_ = win_sigma; }
        virtual double getWinSigma() const;
        // L2-Hys threshold (constructor default 0.2).
        virtual void setL2HysThreshold(double threshold_L2hys) { threshold_L2hys_ = threshold_L2hys; }
        virtual double getL2HysThreshold() const { return threshold_L2hys_; }
        // Gamma-correction flag (constructor default true).
        virtual void setGammaCorrection(bool gamma_correction) { gamma_correction_ = gamma_correction; }
        virtual bool getGammaCorrection() const { return gamma_correction_; }
        // Upper bound on the number of scale levels built by detectMultiScale (constructor default 64).
        virtual void setNumLevels(int nlevels) { nlevels_ = nlevels; }
        virtual int getNumLevels() const { return nlevels_; }
        // Hit threshold forwarded to the classification kernels in detect() (constructor default 0.0).
        virtual void setHitThreshold(double hit_threshold) { hit_threshold_ = hit_threshold; }
        virtual double getHitThreshold() const { return hit_threshold_; }
        // Window stride (constructor default: block_stride). Not checked here; detect()
        // asserts that it is a multiple of the block stride.
        virtual void setWinStride(Size win_stride) { win_stride_ = win_stride; }
        virtual Size getWinStride() const { return win_stride_; }
        // Per-level scale multiplier used by detectMultiScale (constructor default 1.05);
        // a value <= 1 stops the level loop after the first level.
        virtual void setScaleFactor(double scale0) { scale0_ = scale0; }
        virtual double getScaleFactor() const { return scale0_; }
        // Rectangle grouping threshold (constructor default 2). detectMultiScale asserts it is 0
        // whenever a confidences output is requested.
        virtual void setGroupThreshold(int group_threshold) { group_threshold_ = group_threshold; }
        virtual int getGroupThreshold() const { return group_threshold_; }
        // Descriptor layout, one of DESCR_FORMAT_* (constructor default DESCR_FORMAT_COL_BY_COL).
        virtual void setDescriptorFormat(int descr_format) { descr_format_ = descr_format; }
        virtual int getDescriptorFormat() const { return descr_format_; }
        virtual size_t getDescriptorSize() const;
        virtual size_t getBlockHistogramSize() const;
        virtual void setSVMDetector(InputArray detector);
        virtual Mat getDefaultPeopleDetector() const;
        virtual void detect(InputArray img,
                            std::vector<Point>& found_locations,
                            std::vector<double>* confidences);
        virtual void detectMultiScale(InputArray img,
                                      std::vector<Rect>& found_locations,
                                      std::vector<double>* confidences);
        virtual void compute(InputArray img,
                             OutputArray descriptors,
                             Stream& stream);
    private:
        Size win_size_;
        Size block_size_;
        Size block_stride_;
        Size cell_size_;
        int nbins_;
        double win_sigma_;
        double threshold_L2hys_;
        bool gamma_correction_;
        int nlevels_;
        double hit_threshold_;
        Size win_stride_;
        double scale0_;
        int group_threshold_;
        int descr_format_;
    private:
        int getTotalHistSize(Size img_size) const;
        void computeBlockHistograms(const GpuMat& img, GpuMat& block_hists);
        void computeGradient(const GpuMat& img, GpuMat& grad, GpuMat& qangle);
        // Coefficients of the separating plane
        float free_coef_;
        GpuMat detector_;
    };
    HOG_Impl::HOG_Impl(Size win_size,
                       Size block_size,
                       Size block_stride,
                       Size cell_size,
                       int nbins) :
        win_size_(win_size),
        block_size_(block_size),
        block_stride_(block_stride),
        cell_size_(cell_size),
        nbins_(nbins),
        win_sigma_(-1.0),
        threshold_L2hys_(0.2),
        gamma_correction_(true),
        nlevels_(64),
        hit_threshold_(0.0),
        win_stride_(block_stride),
        scale0_(1.05),
        group_threshold_(2),
        descr_format_(DESCR_FORMAT_COL_BY_COL)
    {
        CV_Assert((win_size.width  - block_size.width ) % block_stride.width  == 0 &&
                  (win_size.height - block_size.height) % block_stride.height == 0);
-    CV_Assert(block_size.width % cell_size.width == 0 && block_size.height % cell_size.height == 0);
+        CV_Assert(block_size.width % cell_size.width == 0 &&
                  block_size.height % cell_size.height == 0);
        CV_Assert(block_stride == cell_size);
        CV_Assert(cell_size == Size(8, 8));
-    Size cells_per_block = Size(block_size.width / cell_size.width, block_size.height / cell_size.height);
+        Size cells_per_block(block_size.width / cell_size.width, block_size.height / cell_size.height);
        CV_Assert(cells_per_block == Size(2, 2));
    }
-size_t cv::cuda::HOGDescriptor::getDescriptorSize() const
+    static int numPartsWithin(int size, int part_size, int stride)
    {
-    return numPartsWithin(win_size, block_size, block_stride).area() * getBlockHistogramSize();
+        return (size - part_size + stride) / stride;
    }
-size_t cv::cuda::HOGDescriptor::getBlockHistogramSize() const
+    static Size numPartsWithin(Size size, Size part_size, Size stride)
    {
-    Size cells_per_block = Size(block_size.width / cell_size.width, block_size.height / cell_size.height);
+        return Size(numPartsWithin(size.width, part_size.width, stride.width),
-    return (size_t)(nbins * cells_per_block.area());
+                    numPartsWithin(size.height, part_size.height, stride.height));
    }
-double cv::cuda::HOGDescriptor::getWinSigma() const
+    size_t HOG_Impl::getDescriptorSize() const
    {
-    return win_sigma >= 0 ? win_sigma : (block_size.width + block_size.height) / 8.0;
+        return numPartsWithin(win_size_, block_size_, block_stride_).area() * getBlockHistogramSize();
    }
-bool cv::cuda::HOGDescriptor::checkDetectorSize() const
+    size_t HOG_Impl::getBlockHistogramSize() const
    {
-    size_t detector_size = detector.rows * detector.cols;
+        Size cells_per_block(block_size_.width / cell_size_.width, block_size_.height / cell_size_.height);
-    size_t descriptor_size = getDescriptorSize();
+        return nbins_ * cells_per_block.area();
    return detector_size == 0 || detector_size == descriptor_size || detector_size == descriptor_size + 1;
    }
-void cv::cuda::HOGDescriptor::setSVMDetector(const std::vector<float>& _detector)
+    double HOG_Impl::getWinSigma() const
    {
-    std::vector<float> detector_reordered(_detector.size());
+        return win_sigma_ >= 0 ? win_sigma_ : (block_size_.width + block_size_.height) / 8.0;
    }
    void HOG_Impl::setSVMDetector(InputArray _detector)
    {
        const int descriptor_size = static_cast<int>(getDescriptorSize());
        const Mat detector = _detector.getMat();
        CV_Assert( detector.type() == CV_32FC1 );
        CV_Assert( detector.rows == 1 );
        CV_Assert( detector.cols == descriptor_size || detector.cols == descriptor_size + 1 );
        std::vector<float> detector_reordered(detector.ptr<float>(), detector.ptr<float>() + detector.cols);
        size_t block_hist_size = getBlockHistogramSize();
-    cv::Size blocks_per_img = numPartsWithin(win_size, block_size, block_stride);
+        Size blocks_per_win = numPartsWithin(win_size_, block_size_, block_stride_);
-    for (int i = 0; i < blocks_per_img.height; ++i)
+        for (int i = 0; i < blocks_per_win.height; ++i)
        for (int j = 0; j < blocks_per_img.width; ++j)
        {
-            const float* src = &_detector[0] + (j * blocks_per_img.height + i) * block_hist_size;
+            for (int j = 0; j < blocks_per_win.width; ++j)
-            float* dst = &detector_reordered[0] + (i * blocks_per_img.width + j) * block_hist_size;
+            {
                const float* src = detector.ptr<float>() + (j * blocks_per_win.height + i) * block_hist_size;
                float* dst = &detector_reordered[0] + (i * blocks_per_win.width + j) * block_hist_size;
                for (size_t k = 0; k < block_hist_size; ++k)
                    dst[k] = src[k];
            }
    this->detector.upload(Mat(detector_reordered).reshape(1, 1));
    size_t descriptor_size = getDescriptorSize();
    free_coef = _detector.size() > descriptor_size ? _detector[descriptor_size] : 0;
    CV_Assert(checkDetectorSize());
        }
-cv::cuda::GpuMat cv::cuda::HOGDescriptor::getBuffer(const Size& sz, int type, GpuMat& buf)
+        detector_.upload(Mat(detector_reordered).reshape(1, 1));
        free_coef_ = detector.cols > descriptor_size ? detector.at<float>(0, descriptor_size) : 0;
    }
    static Mat getPeopleDetector64x128();
    static Mat getPeopleDetector48x96();
    Mat HOG_Impl::getDefaultPeopleDetector() const
    {
-    if (buf.empty() || buf.type() != type)
+        CV_Assert( win_size_ == Size(64, 128) || win_size_ == Size(48, 96) );
-        buf.create(sz, type);
+
        if (win_size_ == Size(64, 128))
            return getPeopleDetector64x128();
        else
-        if (buf.cols < sz.width || buf.rows < sz.height)
+            return getPeopleDetector48x96();
            buf.create(std::max(buf.rows, sz.height), std::max(buf.cols, sz.width), type);
    return buf(Rect(Point(0,0), sz));
    }
-cv::cuda::GpuMat cv::cuda::HOGDescriptor::getBuffer(int rows, int cols, int type, GpuMat& buf)
+    void HOG_Impl::detect(InputArray _img, std::vector<Point>& hits, std::vector<double>* confidences)
    {
-    return getBuffer(Size(cols, rows), type, buf);
+        const GpuMat img = _img.getGpuMat();
 }
 void cv::cuda::HOGDescriptor::computeGradient(const GpuMat& img, GpuMat& _grad, GpuMat& _qangle)
 {
        CV_Assert( img.type() == CV_8UC1 || img.type() == CV_8UC4 );
-
+        CV_Assert( win_stride_.width % block_stride_.width == 0 && win_stride_.height % block_stride_.height == 0 );
    //   grad.create(img.size(), CV_32FC2);
    _grad = getBuffer(img.size(), CV_32FC2, grad_buf);
    //   qangle.create(img.size(), CV_8UC2);
    _qangle = getBuffer(img.size(), CV_8UC2, qangle_buf);
    float angleScale = (float)(nbins / CV_PI);
    switch (img.type())
    {
        case CV_8UC1:
            hog::compute_gradients_8UC1(nbins, img.rows, img.cols, img, angleScale, _grad, _qangle, gamma_correction);
            break;
        case CV_8UC4:
            hog::compute_gradients_8UC4(nbins, img.rows, img.cols, img, angleScale, _grad, _qangle, gamma_correction);
            break;
    }
 }
 void cv::cuda::HOGDescriptor::computeBlockHistograms(const GpuMat& img)
 {
    cv::Size blocks_per_win = numPartsWithin(win_size, block_size, block_stride);
    hog::set_up_constants(nbins, block_stride.width, block_stride.height, blocks_per_win.width, blocks_per_win.height);
    computeGradient(img, grad, qangle);
    size_t block_hist_size = getBlockHistogramSize();
    Size blocks_per_img = numPartsWithin(img.size(), block_size, block_stride);
    //   block_hists.create(1, block_hist_size * blocks_per_img.area(), CV_32F);
    block_hists = getBuffer(1, static_cast<int>(block_hist_size * blocks_per_img.area()), CV_32F, block_hists_buf);
    hog::compute_hists(nbins, block_stride.width, block_stride.height, img.rows, img.cols,
                        grad, qangle, (float)getWinSigma(), block_hists.ptr<float>());
    hog::normalize_hists(nbins, block_stride.width, block_stride.height, img.rows, img.cols,
                         block_hists.ptr<float>(), (float)threshold_L2hys);
 }
 void cv::cuda::HOGDescriptor::getDescriptors(const GpuMat& img, Size win_stride, GpuMat& descriptors, int descr_format)
 {
    CV_Assert(win_stride.width % block_stride.width == 0 && win_stride.height % block_stride.height == 0);
    computeBlockHistograms(img);
    const size_t block_hist_size = getBlockHistogramSize();
    Size blocks_per_win = numPartsWithin(win_size, block_size, block_stride);
    Size wins_per_img   = numPartsWithin(img.size(), win_size, win_stride);
    descriptors.create(wins_per_img.area(), static_cast<int>(blocks_per_win.area() * block_hist_size), CV_32F);
    switch (descr_format)
    {
    case DESCR_FORMAT_ROW_BY_ROW:
        hog::extract_descrs_by_rows(win_size.height, win_size.width, block_stride.height, block_stride.width,
                                    win_stride.height, win_stride.width, img.rows, img.cols, block_hists.ptr<float>(), descriptors);
        break;
    case DESCR_FORMAT_COL_BY_COL:
        hog::extract_descrs_by_cols(win_size.height, win_size.width, block_stride.height, block_stride.width,
                                    win_stride.height, win_stride.width, img.rows, img.cols, block_hists.ptr<float>(), descriptors);
        break;
    default:
        CV_Error(cv::Error::StsBadArg, "Unknown descriptor format");
    }
 }
 void cv::cuda::HOGDescriptor::computeConfidence(const GpuMat& img, std::vector<Point>& hits, double hit_threshold,
                          Size win_stride, Size padding, std::vector<Point>& locations, std::vector<double>& confidences)
 {
  CV_Assert(padding == Size(0, 0));
        hits.clear();
-  if (detector.empty())
+        if (detector_.empty())
            return;
-  computeBlockHistograms(img);
+        BufferPool pool(Stream::Null());
-  if (win_stride == Size())
+        GpuMat block_hists = pool.getBuffer(1, getTotalHistSize(img.size()), CV_32FC1);
-    win_stride = block_stride;
+        computeBlockHistograms(img, block_hists);
  else
    CV_Assert(win_stride.width % block_stride.width == 0 &&
         win_stride.height % block_stride.height == 0);
-  Size wins_per_img = numPartsWithin(img.size(), win_size, win_stride);
+        Size wins_per_img = numPartsWithin(img.size(), win_size_, win_stride_);
  labels.create(1, wins_per_img.area(), CV_32F);
-  hog::compute_confidence_hists(win_size.height, win_size.width, block_stride.height, block_stride.width,
+        if (confidences == NULL)
-               win_stride.height, win_stride.width, img.rows, img.cols, block_hists.ptr<float>(),
+        {
-               detector.ptr<float>(), (float)free_coef, (float)hit_threshold, labels.ptr<float>());
+            GpuMat labels = pool.getBuffer(1, wins_per_img.area(), CV_8UC1);
            hog::classify_hists(win_size_.height, win_size_.width,
                                block_stride_.height, block_stride_.width,
                                win_stride_.height, win_stride_.width,
                                img.rows, img.cols,
                                block_hists.ptr<float>(),
                                detector_.ptr<float>(),
                                (float)free_coef_,
                                (float)hit_threshold_,
                                labels.ptr());
            Mat labels_host;
            labels.download(labels_host);
-  float* vec = labels_host.ptr<float>();
+            unsigned char* vec = labels_host.ptr();
  // does not support roi for now..
  locations.clear();
  confidences.clear();
            for (int i = 0; i < wins_per_img.area(); i++)
            {
                int y = i / wins_per_img.width;
                int x = i - wins_per_img.width * y;
-      if (vec[i] >= hit_threshold)
+                if (vec[i])
-   hits.push_back(Point(x * win_stride.width, y * win_stride.height));
+                    hits.push_back(Point(x * win_stride_.width, y * win_stride_.height));
      Point pt(win_stride.width * x, win_stride.height * y);
      locations.push_back(pt);
      confidences.push_back((double)vec[i]);
            }
        }
-
+        else
 void cv::cuda::HOGDescriptor::computeConfidenceMultiScale(const GpuMat& img, std::vector<Rect>& found_locations,
                            double hit_threshold, Size win_stride, Size padding,
                            std::vector<HOGConfidence> &conf_out, int group_threshold)
        {
            GpuMat labels = pool.getBuffer(1, wins_per_img.area(), CV_32FC1);
            hog::compute_confidence_hists(win_size_.height, win_size_.width,
                                          block_stride_.height, block_stride_.width,
                                          win_stride_.height, win_stride_.width,
                                          img.rows, img.cols,
                                          block_hists.ptr<float>(),
                                          detector_.ptr<float>(),
                                          (float)free_coef_,
                                          (float)hit_threshold_,
                                          labels.ptr<float>());
            Mat labels_host;
            labels.download(labels_host);
            float* vec = labels_host.ptr<float>();
            confidences->clear();
            for (int i = 0; i < wins_per_img.area(); i++)
            {
                int y = i / wins_per_img.width;
                int x = i - wins_per_img.width * y;
                if (vec[i] >= hit_threshold_)
                {
                    hits.push_back(Point(x * win_stride_.width, y * win_stride_.height));
                    confidences->push_back((double)vec[i]);
                }
            }
        }
    }
    void HOG_Impl::detectMultiScale(InputArray _img,
                                    std::vector<Rect>& found_locations,
                                    std::vector<double>* confidences)
    {
        const GpuMat img = _img.getGpuMat();
        CV_Assert( img.type() == CV_8UC1 || img.type() == CV_8UC4 );
        CV_Assert( confidences == NULL || group_threshold_ == 0 );
        std::vector<double> level_scale;
-    double scale = 1.;
+        double scale = 1.0;
        int levels = 0;
-
+        for (levels = 0; levels < nlevels_; levels++)
    for (levels = 0; levels < (int)conf_out.size(); levels++)
        {
        scale = conf_out[levels].scale;
            level_scale.push_back(scale);
-        if (cvRound(img.cols/scale) < win_size.width || cvRound(img.rows/scale) < win_size.height)
+
            if (cvRound(img.cols / scale) < win_size_.width ||
                cvRound(img.rows / scale) < win_size_.height ||
                scale0_ <= 1)
            {
                break;
            }
            scale *= scale0_;
        }
        levels = std::max(levels, 1);
        level_scale.resize(levels);
-    std::vector<Rect> all_candidates;
+        std::vector<Point> level_hits;
-    std::vector<Point> locations;
+        std::vector<double> level_confidences;
        BufferPool pool(Stream::Null());
        found_locations.clear();
        for (size_t i = 0; i < level_scale.size(); i++)
        {
            scale = level_scale[i];
        Size sz(cvRound(img.cols / scale), cvRound(img.rows / scale));
        GpuMat smaller_img;
            Size sz(cvRound(img.cols / scale), cvRound(img.rows / scale));
            GpuMat smaller_img;
            if (sz == img.size())
            {
                smaller_img = img;
            }
            else
            {
-            smaller_img.create(sz, img.type());
+                smaller_img = pool.getBuffer(sz, img.type());
                switch (img.type())
                {
                    case CV_8UC1: hog::resize_8UC1(img, smaller_img); break;
@@ -347,127 +415,137 @@ void cv::cuda::HOGDescriptor::computeConfidenceMultiScale(const GpuMat& img, std
                }
            }
-        computeConfidence(smaller_img, locations, hit_threshold, win_stride, padding, conf_out[i].locations, conf_out[i].confidences);
+            detect(smaller_img, level_hits,
                   confidences ? &level_confidences : NULL);
-        Size scaled_win_size(cvRound(win_size.width * scale), cvRound(win_size.height * scale));
+            Size scaled_win_size(cvRound(win_size_.width * scale),
-        for (size_t j = 0; j < locations.size(); j++)
+                                 cvRound(win_size_.height * scale));
            all_candidates.push_back(Rect(Point2d(locations[j]) * scale, scaled_win_size));
    }
-    found_locations.assign(all_candidates.begin(), all_candidates.end());
+            for (size_t j = 0; j < level_hits.size(); j++)
    groupRectangles(found_locations, group_threshold, 0.2/*magic number copied from CPU version*/);
 }
 void cv::cuda::HOGDescriptor::detect(const GpuMat& img, std::vector<Point>& hits, double hit_threshold, Size win_stride, Size padding)
            {
-    CV_Assert(img.type() == CV_8UC1 || img.type() == CV_8UC4);
+                found_locations.push_back(Rect(Point2d(level_hits[j]) * scale, scaled_win_size));
-    CV_Assert(padding == Size(0, 0));
+                if (confidences)
-
+                    confidences->push_back(level_confidences[j]);
    hits.clear();
    if (detector.empty())
        return;
    computeBlockHistograms(img);
    if (win_stride == Size())
        win_stride = block_stride;
    else
        CV_Assert(win_stride.width % block_stride.width == 0 && win_stride.height % block_stride.height == 0);
    Size wins_per_img = numPartsWithin(img.size(), win_size, win_stride);
    //   labels.create(1, wins_per_img.area(), CV_8U);
    labels = getBuffer(1, wins_per_img.area(), CV_8U, labels_buf);
    hog::classify_hists(win_size.height, win_size.width, block_stride.height, block_stride.width,
                        win_stride.height, win_stride.width, img.rows, img.cols, block_hists.ptr<float>(),
                        detector.ptr<float>(), (float)free_coef, (float)hit_threshold, labels.ptr());
    labels.download(labels_host);
    unsigned char* vec = labels_host.ptr();
    for (int i = 0; i < wins_per_img.area(); i++)
    {
        int y = i / wins_per_img.width;
        int x = i - wins_per_img.width * y;
        if (vec[i])
            hits.push_back(Point(x * win_stride.width, y * win_stride.height));
            }
        }
-
+        if (group_threshold_ > 0)
 void cv::cuda::HOGDescriptor::detectMultiScale(const GpuMat& img, std::vector<Rect>& found_locations, double hit_threshold,
                                              Size win_stride, Size padding, double scale0, int group_threshold)
        {
            groupRectangles(found_locations, group_threshold_, 0.2/*magic number copied from CPU version*/);
        }
    }
    // Computes HOG descriptors for the whole input image and stores them in _descriptors.
    //
    // _img         - input image, read through getGpuMat(); must be CV_8UC1 or CV_8UC4.
    // _descriptors - output; created as a CV_32FC1 matrix with wins_per_img.area() rows
    //                and blocks_per_win.area() * block_hist_size columns.
    // stream       - must evaluate to false (see the assertion below).
    //
    // The layout of each descriptor row is selected by descr_format_; any value other
    // than the two handled formats raises cv::Error::StsBadArg.
    //
    // NOTE(review): lines that start with '-' or '+' are unified-diff markers left in
    // this text. The '-' lines and the unmarked `double scale` / `int levels` lines look
    // like leftovers of the former detectMultiScale body - confirm against the real file.
    void HOG_Impl::compute(InputArray _img,
                           OutputArray _descriptors,
                           Stream& stream)
    {
        const GpuMat img = _img.getGpuMat();
        CV_Assert( img.type() == CV_8UC1 || img.type() == CV_8UC4 );
        // The window stride has to be a whole multiple of the block stride in both directions.
        CV_Assert( win_stride_.width % block_stride_.width == 0 && win_stride_.height % block_stride_.height == 0 );
        // NOTE(review): presumably this restricts the call to the default (null) stream - confirm.
        CV_Assert( !stream );
-    std::vector<double> level_scale;
+        BufferPool pool(stream);
    double scale = 1.;
    int levels = 0;
-    for (levels = 0; levels < nlevels; levels++)
+        GpuMat block_hists = pool.getBuffer(1, getTotalHistSize(img.size()), CV_32FC1);
        // Fill the temporary buffer with the block histograms of the whole image.
        computeBlockHistograms(img, block_hists);
        const size_t block_hist_size = getBlockHistogramSize();
        Size blocks_per_win = numPartsWithin(win_size_, block_size_, block_stride_);
        Size wins_per_img   = numPartsWithin(img.size(), win_size_, win_stride_);
        // Output shape: wins_per_img.area() rows, each holding all block histograms of one window.
        _descriptors.create(wins_per_img.area(), static_cast<int>(blocks_per_win.area() * block_hist_size), CV_32FC1);
        GpuMat descriptors = _descriptors.getGpuMat();
        // Both branches pass the same arguments; only the extraction routine differs.
        switch (descr_format_)
        {
-        level_scale.push_back(scale);
+        case DESCR_FORMAT_ROW_BY_ROW:
-        if (cvRound(img.cols/scale) < win_size.width ||
+            hog::extract_descrs_by_rows(win_size_.height, win_size_.width,
-            cvRound(img.rows/scale) < win_size.height || scale0 <= 1)
+                                        block_stride_.height, block_stride_.width,
                                        win_stride_.height, win_stride_.width,
                                        img.rows, img.cols,
                                        block_hists.ptr<float>(),
                                        descriptors);
            break;
-        scale *= scale0;
+        case DESCR_FORMAT_COL_BY_COL:
            hog::extract_descrs_by_cols(win_size_.height, win_size_.width,
                                        block_stride_.height, block_stride_.width,
                                        win_stride_.height, win_stride_.width,
                                        img.rows, img.cols,
                                        block_hists.ptr<float>(),
                                        descriptors);
            break;
        default:
            CV_Error(cv::Error::StsBadArg, "Unknown descriptor format");
        }
    }
    levels = std::max(levels, 1);
    level_scale.resize(levels);
    image_scales.resize(levels);
-    std::vector<Rect> all_candidates;
// Returns the number of values needed to hold the block histograms of an image of
// size img_size: getBlockHistogramSize() multiplied by the number of blocks that
// numPartsWithin() reports for block_size_ / block_stride_, cast to int.
//
// NOTE(review): only the '+' lines and the closing brace belong to this function; the
// '-' lines and the unmarked `locations` / `for` / `{` lines are diff residue of the
// removed detectMultiScale loop - confirm against the real file.
+    int HOG_Impl::getTotalHistSize(Size img_size) const
    std::vector<Point> locations;
    for (size_t i = 0; i < level_scale.size(); i++)
    {
-        scale = level_scale[i];
+        size_t block_hist_size = getBlockHistogramSize();
-        Size sz(cvRound(img.cols / scale), cvRound(img.rows / scale));
+        Size blocks_per_img = numPartsWithin(img_size, block_size_, block_stride_);
-        GpuMat smaller_img;
+        return static_cast<int>(block_hist_size * blocks_per_img.area());
    }
-        if (sz == img.size())
// Computes the block histograms of img and writes them to block_hists.
//
// img         - input image; forwarded unchanged to computeGradient().
// block_hists - output; requested as a 1 x getTotalHistSize(img.size()) CV_32FC1 matrix.
//
// Steps: set up the hog:: constants, compute gradients into temporary pooled buffers,
// accumulate histograms with hog::compute_hists, then pass them to hog::normalize_hists
// with threshold_L2hys_.
//
// NOTE(review): the '-' lines and the unmarked `smaller_img = img;` / `else` lines are
// diff residue of removed code, not part of this function - confirm against the real file.
+    void HOG_Impl::computeBlockHistograms(const GpuMat& img, GpuMat& block_hists)
            smaller_img = img;
        else
    {
-            image_scales[i].create(sz, img.type());
+        cv::Size blocks_per_win = numPartsWithin(win_size_, block_size_, block_stride_);
        hog::set_up_constants(nbins_, block_stride_.width, block_stride_.height, blocks_per_win.width, blocks_per_win.height);
        // Temporary gradient buffers are taken from a pool bound to the null stream.
        BufferPool pool(Stream::Null());
        GpuMat grad = pool.getBuffer(img.size(), CV_32FC2);
        GpuMat qangle = pool.getBuffer(img.size(), CV_8UC2);
        computeGradient(img, grad, qangle);
        // Request the result matrix: a single row of getTotalHistSize(img.size()) floats.
        block_hists.create(1, getTotalHistSize(img.size()), CV_32FC1);
        hog::compute_hists(nbins_,
                           block_stride_.width, block_stride_.height,
                           img.rows, img.cols,
                           grad, qangle,
                           (float)getWinSigma(),
                           block_hists.ptr<float>());
        // Operates in place on the histogram buffer filled above.
        hog::normalize_hists(nbins_,
                             block_stride_.width, block_stride_.height,
                             img.rows, img.cols,
                             block_hists.ptr<float>(),
                             (float)threshold_L2hys_);
    }
    // Computes per-pixel gradient data for img.
    //
    // img    - input image; only CV_8UC1 and CV_8UC4 are handled by the switch below.
    // grad   - output, created as CV_32FC2 with the size of img.
    // qangle - output, created as CV_8UC2 with the size of img.
    //
    // NOTE(review): the '-' lines inside the switch are diff residue of the removed
    // resize code; only the '+' and unmarked lines belong to this function - confirm.
    void HOG_Impl::computeGradient(const GpuMat& img, GpuMat& grad, GpuMat& qangle)
    {
        grad.create(img.size(), CV_32FC2);
        qangle.create(img.size(), CV_8UC2);
        // nbins_ / pi; presumably maps an angle in radians to a bin index - TODO confirm.
        float angleScale = (float)(nbins_ / CV_PI);
        // No default case: for any other image type nothing is written to grad / qangle.
        switch (img.type())
        {
-                case CV_8UC1: hog::resize_8UC1(img, image_scales[i]); break;
+            case CV_8UC1:
-                case CV_8UC4: hog::resize_8UC4(img, image_scales[i]); break;
+                hog::compute_gradients_8UC1(nbins_, img.rows, img.cols, img, angleScale, grad, qangle, gamma_correction_);
                break;
            case CV_8UC4:
                hog::compute_gradients_8UC4(nbins_, img.rows, img.cols, img, angleScale, grad, qangle, gamma_correction_);
                break;
        }
    }
            smaller_img = image_scales[i];
 }
-        detect(smaller_img, locations, hit_threshold, win_stride, padding);
// Factory for the HOG algorithm: forwards win_size, block_size, block_stride,
// cell_size and nbins to HOG_Impl and returns the result of makePtr<HOG_Impl>(...).
//
// NOTE(review): only the '+' lines plus the braces around the return statement belong
// to create(). The '-' lines and the unmarked found_locations / groupRectangles /
// numPartsWithin lines are diff residue of removed HOGDescriptor code - confirm
// against the real file.
+Ptr<cuda::HOG> cv::cuda::HOG::create(Size win_size,
-        Size scaled_win_size(cvRound(win_size.width * scale), cvRound(win_size.height * scale));
+                                     Size block_size,
-        for (size_t j = 0; j < locations.size(); j++)
+                                     Size block_stride,
-            all_candidates.push_back(Rect(Point2d(locations[j]) * scale, scaled_win_size));
+                                     Size cell_size,
-    }
+                                     int nbins)
    found_locations.assign(all_candidates.begin(), all_candidates.end());
    groupRectangles(found_locations, group_threshold, 0.2/*magic number copied from CPU version*/);
 }
 int cv::cuda::HOGDescriptor::numPartsWithin(int size, int part_size, int stride)
 {
-    return (size - part_size + stride) / stride;
+    return makePtr<HOG_Impl>(win_size, block_size, block_stride, cell_size, nbins);
 }
-cv::Size cv::cuda::HOGDescriptor::numPartsWithin(cv::Size size, cv::Size part_size, cv::Size stride)
+namespace
 {
-    return Size(numPartsWithin(size.width, part_size.width, stride.width), numPartsWithin(size.height, part_size.height, stride.height));
+    static Mat getPeopleDetector48x96()
 }
 std::vector<float> cv::cuda::HOGDescriptor::getDefaultPeopleDetector()
    {
-    return getPeopleDetector64x128();
+        static float detector[] = {
 }
 std::vector<float> cv::cuda::HOGDescriptor::getPeopleDetector48x96()
 {
    static const float detector[] = {
            0.294350f, -0.098796f, -0.129522f, 0.078753f, 0.387527f, 0.261529f,
            0.145939f, 0.061520f, 0.328699f, 0.227148f, -0.066467f, -0.086723f,
            0.047559f, 0.106714f, 0.037897f, 0.111461f, -0.024406f, 0.304769f,
@@ -799,15 +877,13 @@ std::vector<float> cv::cuda::HOGDescriptor::getPeopleDetector48x96()
            -0.119002f, 0.026722f, 0.034853f, -0.060934f, -0.025054f, -0.093026f,
            -0.035372f, -0.233209f, -0.049869f, -0.039151f, -0.022279f, -0.065380f,
            -9.063785f };
-    return std::vector<float>(detector, detector + sizeof(detector)/sizeof(detector[0]));
+
        return Mat(1, static_cast<int>(sizeof(detector)/sizeof(detector[0])), CV_32FC1, detector);
    }
-
+    Mat getPeopleDetector64x128()
 std::vector<float> cv::cuda::HOGDescriptor::getPeopleDetector64x128()
    {
-    static const float detector[] = {
+        static float detector[] = {
           0.05359386f, -0.14721455f, -0.05532170f, 0.05077307f,
           0.11547081f, -0.04268804f, 0.04635834f, -0.05468199f, 0.08232084f,
           0.10424068f, -0.02294518f, 0.01108519f, 0.01378693f, 0.11193510f,
@@ -1613,7 +1689,9 @@ std::vector<float> cv::cuda::HOGDescriptor::getPeopleDetector64x128()
           -0.01612278f, -1.46097376e-003f, 0.14013411f, -8.96181818e-003f,
           -0.03250246f, 3.38630192e-003f, 2.64779478e-003f, 0.03359732f,
           -0.02411991f, -0.04229729f, 0.10666174f, -6.66579151f };
-    return std::vector<float>(detector, detector + sizeof(detector)/sizeof(detector[0]));
+
        return Mat(1, static_cast<int>(sizeof(detector)/sizeof(detector[0])), CV_32FC1, detector);
    }
 }
 #endif
--- a/modules/cudaobjdetect/test/test_objdetect.cpp
+++ b/modules/cudaobjdetect/test/test_objdetect.cpp
@@ -48,9 +48,10 @@ using namespace cvtest;
 //#define DUMP
-struct HOG : testing::TestWithParam<cv::cuda::DeviceInfo>, cv::cuda::HOGDescriptor
+struct HOG : testing::TestWithParam<cv::cuda::DeviceInfo>
 {
    cv::cuda::DeviceInfo devInfo;
    cv::Ptr<cv::cuda::HOG> hog;
 #ifdef DUMP
    std::ofstream f;
@@ -69,23 +70,13 @@ struct HOG : testing::TestWithParam<cv::cuda::DeviceInfo>, cv::cuda::HOGDescript
        devInfo = GetParam();
        cv::cuda::setDevice(devInfo.deviceID());
        hog = cv::cuda::HOG::create();
    }
 #ifdef DUMP
-    void dump(const cv::Mat& blockHists, const std::vector<cv::Point>& locations)
+    void dump(const std::vector<cv::Point>& locations)
    {
        f.write((char*)&blockHists.rows, sizeof(blockHists.rows));
        f.write((char*)&blockHists.cols, sizeof(blockHists.cols));
        for (int i = 0; i < blockHists.rows; ++i)
        {
            for (int j = 0; j < blockHists.cols; ++j)
            {
                float val = blockHists.at<float>(i, j);
                f.write((char*)&val, sizeof(val));
            }
        }
        int nlocations = locations.size();
        f.write((char*)&nlocations, sizeof(nlocations));
@@ -93,21 +84,18 @@ struct HOG : testing::TestWithParam<cv::cuda::DeviceInfo>, cv::cuda::HOGDescript
            f.write((char*)&locations[i], sizeof(locations[i]));
    }
 #else
-    void compare(const cv::Mat& blockHists, const std::vector<cv::Point>& locations)
+    void compare(const std::vector<cv::Point>& locations)
    {
        // skip block_hists check
        int rows, cols;
        f.read((char*)&rows, sizeof(rows));
        f.read((char*)&cols, sizeof(cols));
-        ASSERT_EQ(rows, blockHists.rows);
+        for (int i = 0; i < rows; ++i)
        ASSERT_EQ(cols, blockHists.cols);
        for (int i = 0; i < blockHists.rows; ++i)
        {
-            for (int j = 0; j < blockHists.cols; ++j)
+            for (int j = 0; j < cols; ++j)
            {
                float val;
                f.read((char*)&val, sizeof(val));
                ASSERT_NEAR(val, blockHists.at<float>(i, j), 1e-3);
            }
        }
@@ -126,54 +114,41 @@ struct HOG : testing::TestWithParam<cv::cuda::DeviceInfo>, cv::cuda::HOGDescript
    // Runs single-scale detection three times - on img, on a half-size copy and on a
    // double-size copy - and after each run either dumps the found locations (when DUMP
    // is defined) or compares them with the stored data.
    //
    // NOTE(review): each '-' line is the old call and the following '+' line is its
    // replacement through the `hog` pointer; these are unified-diff markers left in the text.
    void testDetect(const cv::Mat& img)
    {
-        gamma_correction = false;
+        hog->setGammaCorrection(false);
-        setSVMDetector(cv::cuda::HOGDescriptor::getDefaultPeopleDetector());
+        hog->setSVMDetector(hog->getDefaultPeopleDetector());
        std::vector<cv::Point> locations;
        // Test detect on the image as given
-        detect(loadMat(img), locations, 0);
+        hog->detect(loadMat(img), locations);
 #ifdef DUMP
-        dump(cv::Mat(block_hists), locations);
+        dump(locations);
 #else
-        compare(cv::Mat(block_hists), locations);
+        compare(locations);
 #endif
        // Test detect on a smaller image (half width and height)
        cv::Mat img2;
        cv::resize(img, img2, cv::Size(img.cols / 2, img.rows / 2));
-        detect(loadMat(img2), locations, 0);
+        hog->detect(loadMat(img2), locations);
 #ifdef DUMP
-        dump(cv::Mat(block_hists), locations);
+        dump(locations);
 #else
-        compare(cv::Mat(block_hists), locations);
+        compare(locations);
 #endif
        // Test detect on a larger image (double width and height)
        cv::resize(img, img2, cv::Size(img.cols * 2, img.rows * 2));
-        detect(loadMat(img2), locations, 0);
+        hog->detect(loadMat(img2), locations);
 #ifdef DUMP
-        dump(cv::Mat(block_hists), locations);
+        dump(locations);
 #else
-        compare(cv::Mat(block_hists), locations);
+        compare(locations);
 #endif
    }
    // Compares two single-row descriptors bin by bin, restricted to the inner blocks of
    // the window: the outermost ring of blocks (first/last block row and column) is
    // skipped because interpolation makes border values differ.
    //
    // d1, d2 - descriptors read as float at row 0; blocks are addressed row-major as
    //          (i * blocks_per_win_x + j), each block holding block_hist_size values.
    void compare_inner_parts(cv::Mat d1, cv::Mat d2)
    {
        for (int i = 1; i < blocks_per_win_y - 1; ++i)
            for (int j = 1; j < blocks_per_win_x - 1; ++j)
                for (int k = 0; k < block_hist_size; ++k)
                {
                    // "+ k" selects the k-th bin of block (i, j); without it the loop
                    // re-checked the first bin of the block block_hist_size times.
                    float a = d1.at<float>(0, (i * blocks_per_win_x + j) * block_hist_size + k);
                    float b = d2.at<float>(0, (i * blocks_per_win_x + j) * block_hist_size + k);
                    ASSERT_FLOAT_EQ(a, b);
                }
    }
 };
 // disabled until resize is fixed
@@ -182,13 +157,8 @@ CUDA_TEST_P(HOG, DISABLED_Detect)
    cv::Mat img_rgb = readImage("hog/road.png");
    ASSERT_FALSE(img_rgb.empty());
 #ifdef DUMP
    f.open((std::string(cvtest::TS::ptr()->get_data_path()) + "hog/expected_output.bin").c_str(), std::ios_base::binary);
    ASSERT_TRUE(f.is_open());
 #else
    f.open((std::string(cvtest::TS::ptr()->get_data_path()) + "hog/expected_output.bin").c_str(), std::ios_base::binary);
    ASSERT_TRUE(f.is_open());
 #endif
    // Test on color image
    cv::Mat img;
@@ -198,8 +168,6 @@ CUDA_TEST_P(HOG, DISABLED_Detect)
    // Test on gray image
    cv::cvtColor(img_rgb, img, cv::COLOR_BGR2GRAY);
    testDetect(img);
    f.close();
 }
 CUDA_TEST_P(HOG, GetDescriptors)
@@ -216,8 +184,14 @@ CUDA_TEST_P(HOG, GetDescriptors)
    // Convert train images into feature vectors (train table)
    cv::cuda::GpuMat descriptors, descriptors_by_cols;
-    getDescriptors(d_img, win_size, descriptors, DESCR_FORMAT_ROW_BY_ROW);
+
-    getDescriptors(d_img, win_size, descriptors_by_cols, DESCR_FORMAT_COL_BY_COL);
+    hog->setWinStride(Size(64, 128));
    hog->setDescriptorFormat(cv::cuda::HOG::DESCR_FORMAT_ROW_BY_ROW);
    hog->compute(d_img, descriptors);
    hog->setDescriptorFormat(cv::cuda::HOG::DESCR_FORMAT_COL_BY_COL);
    hog->compute(d_img, descriptors_by_cols);
    // Check size of the result train table
    wins_per_img_x = 3;
@@ -242,48 +216,6 @@ CUDA_TEST_P(HOG, GetDescriptors)
                    ASSERT_EQ(l[(y * blocks_per_win_x + x) * block_hist_size + k],
                              r[(x * blocks_per_win_y + y) * block_hist_size + k]);
    }
    /* Now we want to extract the same feature vectors, but from single images. NOTE: results will
    be different, due to border value interpolation. Using many small images is slower; however, we
    won't call getDescriptors and will use computeBlockHistograms instead. computeBlockHistograms
    works well; this can be checked in the gpu_hog sample */
    img_rgb = readImage("hog/positive1.png");
    ASSERT_TRUE(!img_rgb.empty());
    cv::cvtColor(img_rgb, img, cv::COLOR_BGR2BGRA);
    computeBlockHistograms(cv::cuda::GpuMat(img));
    // Everything is fine with interpolation for left top subimage
    ASSERT_EQ(0.0, cv::norm((cv::Mat)block_hists, (cv::Mat)descriptors.rowRange(0, 1)));
    img_rgb = readImage("hog/positive2.png");
    ASSERT_TRUE(!img_rgb.empty());
    cv::cvtColor(img_rgb, img, cv::COLOR_BGR2BGRA);
    computeBlockHistograms(cv::cuda::GpuMat(img));
    compare_inner_parts(cv::Mat(block_hists), cv::Mat(descriptors.rowRange(1, 2)));
    img_rgb = readImage("hog/negative1.png");
    ASSERT_TRUE(!img_rgb.empty());
    cv::cvtColor(img_rgb, img, cv::COLOR_BGR2BGRA);
    computeBlockHistograms(cv::cuda::GpuMat(img));
    compare_inner_parts(cv::Mat(block_hists), cv::Mat(descriptors.rowRange(2, 3)));
    img_rgb = readImage("hog/negative2.png");
    ASSERT_TRUE(!img_rgb.empty());
    cv::cvtColor(img_rgb, img, cv::COLOR_BGR2BGRA);
    computeBlockHistograms(cv::cuda::GpuMat(img));
    compare_inner_parts(cv::Mat(block_hists), cv::Mat(descriptors.rowRange(3, 4)));
    img_rgb = readImage("hog/positive3.png");
    ASSERT_TRUE(!img_rgb.empty());
    cv::cvtColor(img_rgb, img, cv::COLOR_BGR2BGRA);
    computeBlockHistograms(cv::cuda::GpuMat(img));
    compare_inner_parts(cv::Mat(block_hists), cv::Mat(descriptors.rowRange(4, 5)));
    img_rgb = readImage("hog/negative3.png");
    ASSERT_TRUE(!img_rgb.empty());
    cv::cvtColor(img_rgb, img, cv::COLOR_BGR2BGRA);
    computeBlockHistograms(cv::cuda::GpuMat(img));
    compare_inner_parts(cv::Mat(block_hists), cv::Mat(descriptors.rowRange(5, 6)));
 }
 INSTANTIATE_TEST_CASE_P(CUDA_ObjDetect, HOG, ALL_DEVICES);
@@ -310,12 +242,12 @@ CUDA_TEST_P(CalTech, HOG)
    cv::cuda::GpuMat d_img(img);
    cv::Mat markedImage(img.clone());
-    cv::cuda::HOGDescriptor d_hog;
+    cv::Ptr<cv::cuda::HOG> d_hog = cv::cuda::HOG::create();
-    d_hog.setSVMDetector(cv::cuda::HOGDescriptor::getDefaultPeopleDetector());
+    d_hog->setSVMDetector(d_hog->getDefaultPeopleDetector());
-    d_hog.nlevels = d_hog.nlevels + 32;
+    d_hog->setNumLevels(d_hog->getNumLevels() + 32);
    std::vector<cv::Rect> found_locations;
-    d_hog.detectMultiScale(d_img, found_locations);
+    d_hog->detectMultiScale(d_img, found_locations);
 #if defined (LOG_CASCADE_STATISTIC)
    for (int i = 0; i < (int)found_locations.size(); i++)
@@ -326,7 +258,8 @@ CUDA_TEST_P(CalTech, HOG)
        cv::rectangle(markedImage, r , CV_RGB(255, 0, 0));
    }
-    cv::imshow("Res", markedImage); cv::waitKey();
+    cv::imshow("Res", markedImage);
    cv::waitKey();
 #endif
 }
--- a/samples/gpu/hog.cpp
+++ b/samples/gpu/hog.cpp
@@ -244,19 +244,13 @@ void App::run()
    Size win_size(args.win_width, args.win_width * 2); //(64, 128) or (48, 96)
    Size win_stride(args.win_stride_width, args.win_stride_height);
-    // Create HOG descriptors and detectors here
+    cv::Ptr<cv::cuda::HOG> gpu_hog = cv::cuda::HOG::create(win_size);
-    vector<float> detector;
+    cv::HOGDescriptor cpu_hog(win_size, Size(16, 16), Size(8, 8), Size(8, 8), 9);
    if (win_size == Size(64, 128))
        detector = cv::cuda::HOGDescriptor::getPeopleDetector64x128();
    else
        detector = cv::cuda::HOGDescriptor::getPeopleDetector48x96();
-    cv::cuda::HOGDescriptor gpu_hog(win_size, Size(16, 16), Size(8, 8), Size(8, 8), 9,
+    // Create HOG descriptors and detectors here
-                                   cv::cuda::HOGDescriptor::DEFAULT_WIN_SIGMA, 0.2, gamma_corr,
+    Mat detector = gpu_hog->getDefaultPeopleDetector();
-                                   cv::cuda::HOGDescriptor::DEFAULT_NLEVELS);
+
-    cv::HOGDescriptor cpu_hog(win_size, Size(16, 16), Size(8, 8), Size(8, 8), 9, 1, -1,
+    gpu_hog->setSVMDetector(detector);
                              HOGDescriptor::L2Hys, 0.2, gamma_corr, cv::HOGDescriptor::DEFAULT_NLEVELS);
    gpu_hog.setSVMDetector(detector);
    cpu_hog.setSVMDetector(detector);
    while (running)
@@ -307,9 +301,6 @@ void App::run()
            else img = img_aux;
            img_to_show = img;
            gpu_hog.nlevels = nlevels;
            cpu_hog.nlevels = nlevels;
            vector<Rect> found;
            // Perform HOG classification
@@ -317,11 +308,19 @@ void App::run()
            if (use_gpu)
            {
                gpu_img.upload(img);
-                gpu_hog.detectMultiScale(gpu_img, found, hit_threshold, win_stride,
+                gpu_hog->setNumLevels(nlevels);
                gpu_hog->setHitThreshold(hit_threshold);
                gpu_hog->setWinStride(win_stride);
                gpu_hog->setScaleFactor(scale);
                gpu_hog->setGroupThreshold(gr_threshold);
                gpu_hog->detectMultiScale(gpu_img, found);
            }
            else
            {
                cpu_hog.nlevels = nlevels;
                cpu_hog.detectMultiScale(img, found, hit_threshold, win_stride,
                                          Size(0, 0), scale, gr_threshold);
            }
            else cpu_hog.detectMultiScale(img, found, hit_threshold, win_stride,
                                          Size(0, 0), scale, gr_threshold);
            hogWorkEnd();
            // Draw positive classified windows