From d65b3e06170e906a68d5f370501facf5ffaacd30 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Wed, 5 Sep 2012 10:48:07 +0400
Subject: [PATCH 01/74] fix warning in CUDA samples

---
 modules/gpu/src/cuda/bf_knnmatch.cu          |  2 +-
 modules/gpu/src/nvidia/core/NCV.hpp          |  2 +-
 samples/gpu/cascadeclassifier_nvidia_api.cpp |  8 ++++----
 samples/gpu/opticalflow_nvidia_api.cpp       | 16 ++++++++--------
 4 files changed, 14 insertions(+), 14 deletions(-)
diff --git a/modules/gpu/src/cuda/bf_knnmatch.cu b/modules/gpu/src/cuda/bf_knnmatch.cu
index 6a778735b..b31f25ca8 100644
--- a/modules/gpu/src/cuda/bf_knnmatch.cu
+++ b/modules/gpu/src/cuda/bf_knnmatch.cu
@@ -1034,7 +1034,7 @@ namespace cv { namespace gpu { namespace device
                 cudaSafeCall( cudaDeviceSynchronize() );
         }
 
-        void findKnnMatchDispatcher(int k, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream)
+        void findKnnMatchDispatcher(int k, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int /*cc*/, cudaStream_t stream)
         {
             findKnnMatch<256>(k, static_cast<PtrStepSzi>(trainIdx), static_cast<PtrStepSzf>(distance), allDist, stream);
         }
diff --git a/modules/gpu/src/nvidia/core/NCV.hpp b/modules/gpu/src/nvidia/core/NCV.hpp
index ddac47c92..26b1d4ef1 100644
--- a/modules/gpu/src/nvidia/core/NCV.hpp
+++ b/modules/gpu/src/nvidia/core/NCV.hpp
@@ -288,7 +288,7 @@ NCV_EXPORTS void ncvSetDebugOutputHandler(NCVDebugOutputHandler* func);
     do \
     { \
         cudaError_t res = cudacall; \
-        ncvAssertPrintReturn(cudaSuccess==res, "cudaError_t=" << res, errCode); \
+        ncvAssertPrintReturn(cudaSuccess==res, "cudaError_t=" << (int)res, errCode); \
     } while (0)
 
 
diff --git a/samples/gpu/cascadeclassifier_nvidia_api.cpp b/samples/gpu/cascadeclassifier_nvidia_api.cpp
index da98643af..99c95ab97 100644
--- a/samples/gpu/cascadeclassifier_nvidia_api.cpp
+++ b/samples/gpu/cascadeclassifier_nvidia_api.cpp
@@ -30,7 +30,7 @@ const Size2i preferredVideoFrameSize(640, 480);
 const string wndTitle = "NVIDIA Computer Vision :: Haar Classifiers Cascade";
 
 
-void matPrint(Mat &img, int lineOffsY, Scalar fontColor, const string &ss)
+static void matPrint(Mat &img, int lineOffsY, Scalar fontColor, const string &ss)
 {
     int fontFace = FONT_HERSHEY_DUPLEX;
     double fontScale = 0.8;
@@ -45,7 +45,7 @@ void matPrint(Mat &img, int lineOffsY, Scalar fontColor, const string &ss)
 }
 
 
-void displayState(Mat &canvas, bool bHelp, bool bGpu, bool bLargestFace, bool bFilter, double fps)
+static void displayState(Mat &canvas, bool bHelp, bool bGpu, bool bLargestFace, bool bFilter, double fps)
 {
     Scalar fontColorRed = CV_RGB(255,0,0);
     Scalar fontColorNV  = CV_RGB(118,185,0);
@@ -74,7 +74,7 @@ void displayState(Mat &canvas, bool bHelp, bool bGpu, bool bLargestFace, bool bF
 }
 
 
-NCVStatus process(Mat *srcdst,
+static NCVStatus process(Mat *srcdst,
                   Ncv32u width, Ncv32u height,
                   NcvBool bFilterRects, NcvBool bLargestFace,
                   HaarClassifierCascadeDescriptor &haar,
@@ -281,7 +281,7 @@ int main(int argc, const char** argv)
     //==============================================================================
 
     namedWindow(wndTitle, 1);
-    Mat gray, frameDisp;
+    Mat frameDisp;
 
     do
     {
diff --git a/samples/gpu/opticalflow_nvidia_api.cpp b/samples/gpu/opticalflow_nvidia_api.cpp
index 8a149d740..05a37ef69 100644
--- a/samples/gpu/opticalflow_nvidia_api.cpp
+++ b/samples/gpu/opticalflow_nvidia_api.cpp
@@ -59,7 +59,7 @@ public:
 class RgbToR
 {
 public:
-    float operator ()(unsigned char b, unsigned char g, unsigned char r)
+    float operator ()(unsigned char /*b*/, unsigned char /*g*/, unsigned char r)
     {
         return static_cast<float>(r)/255.0f;
     }
@@ -69,7 +69,7 @@ public:
 class RgbToG
 {
 public:
-    float operator ()(unsigned char b, unsigned char g, unsigned char r)
+    float operator ()(unsigned char /*b*/, unsigned char g, unsigned char /*r*/)
     {
         return static_cast<float>(g)/255.0f;
     }
@@ -78,7 +78,7 @@ public:
 class RgbToB
 {
 public:
-    float operator ()(unsigned char b, unsigned char g, unsigned char r)
+    float operator ()(unsigned char b, unsigned char /*g*/, unsigned char /*r*/)
     {
         return static_cast<float>(b)/255.0f;
     }
@@ -135,7 +135,7 @@ NCVStatus CopyData(const IplImage *image, const NCVMatrixAlloc<Ncv32f> &dst)
     return NCV_SUCCESS;
 }
 
-NCVStatus LoadImages (const char *frame0Name,
+static NCVStatus LoadImages (const char *frame0Name,
                       const char *frame1Name,
                       int &width,
                       int &height,
@@ -186,7 +186,7 @@ inline T MapValue (T x, T a, T b, T c, T d)
     return c + (d - c) * (x - a) / (b - a);
 }
 
-NCVStatus ShowFlow (NCVMatrixAlloc<Ncv32f> &u, NCVMatrixAlloc<Ncv32f> &v, const char *name)
+static NCVStatus ShowFlow (NCVMatrixAlloc<Ncv32f> &u, NCVMatrixAlloc<Ncv32f> &v, const char *name)
 {
     IplImage *flowField;
 
@@ -246,7 +246,7 @@ NCVStatus ShowFlow (NCVMatrixAlloc<Ncv32f> &u, NCVMatrixAlloc<Ncv32f> &v, const
     return NCV_SUCCESS;
 }
 
-IplImage *CreateImage (NCVMatrixAlloc<Ncv32f> &h_r, NCVMatrixAlloc<Ncv32f> &h_g, NCVMatrixAlloc<Ncv32f> &h_b)
+static IplImage *CreateImage (NCVMatrixAlloc<Ncv32f> &h_r, NCVMatrixAlloc<Ncv32f> &h_g, NCVMatrixAlloc<Ncv32f> &h_b)
 {
     CvSize imageSize = cvSize (h_r.width (), h_r.height ());
     IplImage *image  = cvCreateImage (imageSize, IPL_DEPTH_8U, 4);
@@ -270,7 +270,7 @@ IplImage *CreateImage (NCVMatrixAlloc<Ncv32f> &h_r, NCVMatrixAlloc<Ncv32f> &h_g,
     return image;
 }
 
-void PrintHelp ()
+static void PrintHelp ()
 {
     std::cout << "Usage help:\n";
     std::cout << std::setiosflags(std::ios::left);
@@ -286,7 +286,7 @@ void PrintHelp ()
     std::cout << "\t" << std::setw(15) << PARAM_HELP << " - display this help message\n";
 }
 
-int ProcessCommandLine(int argc, char **argv,
+static int ProcessCommandLine(int argc, char **argv,
                        Ncv32f &timeStep,
                        char *&frame0Name,
                        char *&frame1Name,

From dd9c53497bb8c32d411d8b5e784b249ad9d13364 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Thu, 20 Sep 2012 03:28:45 +0400
Subject: [PATCH 02/74] GPU interface for soft cascade

---
 modules/gpu/include/opencv2/gpu/gpu.hpp |  43 ++++++++++
 modules/gpu/src/softcascade.cpp         | 100 ++++++++++++++++++++++++
 2 files changed, 143 insertions(+)
 create mode 100644 modules/gpu/src/softcascade.cpp

diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index ddb131788..4a2d88aa0 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -1532,6 +1532,49 @@ public:
     int detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, Size maxObjectSize, Size minSize = Size(), double scaleFactor = 1.1, int minNeighbors = 4);
 };
 
+// ======================== GPU version for soft cascade ===================== //
+
+class CV_EXPORTS SoftCascade
+{
+public:
+    //! An empty cascade will be created.
+    SoftCascade();
+
+    //! Cascade will be created from file for scales from minScale to maxScale.
+    //! Param filename is a path to xml-serialized cascade.
+    //! Param minScale is a minimum scale relative to the original size of the image on which cascade will be applyed.
+    //! Param minScale is a maximum scale relative to the original size of the image on which cascade will be applyed.
+    SoftCascade( const string& filename, const float minScale = 0.4f, const float maxScale = 5.f);
+
+    //! cascade will be loaded from file "filename". The previous cascade will be destroyed.
+    //! Param filename is a path to xml-serialized cascade.
+    //! Param minScale is a minimum scale relative to the original size of the image on which cascade will be applyed.
+    //! Param minScale is a maximum scale relative to the original size of the image on which cascade will be applyed.
+    bool load( const string& filename, const float minScale = 0.4f, const float maxScale = 5.f);
+
+    virtual ~SoftCascade();
+
+    //! return vector of bounding boxes. Each box contains one detected object
+    virtual void detectMultiScale(const GpuMat& image, const GpuMat& rois, GpuMat& objects,
+    int rejectfactor = 1, Stream stream = Stream::Null()); // ToDo store objects in GPU mem
+
+protected:
+    enum { BOOST = 0 };
+    enum
+    {
+        FRAME_WIDTH        = 640,
+        FRAME_HEIGHT       = 480,
+        TOTAL_SCALES       = 55,
+        CLASSIFIERS        = 5,
+        ORIG_OBJECT_WIDTH  = 64,
+        ORIG_OBJECT_HEIGHT = 128
+    };
+
+private:
+    struct Filds;
+    Filds* filds;
+};
+
 ////////////////////////////////// SURF //////////////////////////////////////////
 
 class CV_EXPORTS SURF_GPU
diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
new file mode 100644
index 000000000..509e3f501
--- /dev/null
+++ b/modules/gpu/src/softcascade.cpp
@@ -0,0 +1,100 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2008-2012, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include <precomp.hpp>
+
+#if !defined (HAVE_CUDA)
+
+cv::gpu::SoftCascade::SoftCascade() : filds(0) { throw_nogpu(); }
+
+cv::gpu::SoftCascade::SoftCascade( const string&, const float, const float) : filds(0) { throw_nogpu(); }
+
+cv::gpu::SoftCascade::~SoftCascade() { throw_nogpu(); }
+
+bool cv::gpu::SoftCascade::load( const string&, const float, const float) { throw_nogpu(); }
+
+void cv::gpu::SoftCascade::detectMultiScale(const GpuMat&, const GpuMat&, GpuMat&, const int, Stream) { throw_nogpu(); }
+
+#else
+
+struct cv::gpu::SoftCascade::Filds
+{
+    bool fill(const FileNode &root, const float mins, const float maxs){return true;}
+    void calcLevels(int frameW, int frameH, int scales) {}
+};
+
+cv::gpu::SoftCascade::SoftCascade() : filds(0) {}
+
+cv::gpu::SoftCascade::SoftCascade( const string& filename, const float minScale, const float maxScale) : filds(0)
+{
+    load(filename, minScale, maxScale);
+}
+
+cv::gpu::SoftCascade::~SoftCascade()
+{
+    delete filds;
+}
+
+bool cv::gpu::SoftCascade::load( const string& filename, const float minScale, const float maxScale)
+{
+    if (filds)
+        delete filds;
+    filds = 0;
+
+    cv::FileStorage fs(filename, FileStorage::READ);
+    if (!fs.isOpened()) return false;
+
+    filds = new Filds;
+    Filds& flds = *filds;
+    if (!flds.fill(fs.getFirstTopLevelNode(), minScale, maxScale)) return false;
+    flds.calcLevels(FRAME_WIDTH, FRAME_HEIGHT, TOTAL_SCALES);
+
+    return true;
+}
+
+void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& /*image*/, const GpuMat& /*rois*/,
+                                GpuMat& /*objects*/, const int /*rejectfactor*/, Stream /*stream*/)
+{
+    // empty
+}
+
+#endif

From 267d140bfeb43b1c47734e554c37c0b1db81787f Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Thu, 20 Sep 2012 16:22:10 +0400
Subject: [PATCH 03/74] soft cascade: gpu representation

---
 modules/gpu/include/opencv2/gpu/gpu.hpp |   9 +-
 modules/gpu/src/cuda/isf-sc.cu          |  43 +++++
 modules/gpu/src/icf.hpp                 | 118 ++++++++++++
 modules/gpu/src/softcascade.cpp         | 236 +++++++++++++++++++++++-
 modules/gpu/test/test_softcascade.cpp   |  73 ++++++++
 5 files changed, 473 insertions(+), 6 deletions(-)
 create mode 100644 modules/gpu/src/cuda/isf-sc.cu
 create mode 100644 modules/gpu/src/icf.hpp
 create mode 100644 modules/gpu/test/test_softcascade.cpp

diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index 4a2d88aa0..61f6006c5 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -1554,9 +1554,14 @@ public:
 
     virtual ~SoftCascade();
 
-    //! return vector of bounding boxes. Each box contains one detected object
+    //! detect specific objects on in the input frame for all scales computed flom minScale and maxscale values
+    //! Param image is input frame for detector. Cascade will be applied to it.
+    //! Param rois is a mask
+    //! Param objects 4-channel matrix thet contain detected rectangles
+    //! Param rejectfactor used for final object box computing
+    //! Param stream
     virtual void detectMultiScale(const GpuMat& image, const GpuMat& rois, GpuMat& objects,
-    int rejectfactor = 1, Stream stream = Stream::Null()); // ToDo store objects in GPU mem
+    int rejectfactor = 1, Stream stream = Stream::Null());
 
 protected:
     enum { BOOST = 0 };
diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu
new file mode 100644
index 000000000..f36f86f96
--- /dev/null
+++ b/modules/gpu/src/cuda/isf-sc.cu
@@ -0,0 +1,43 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2008-2012, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include <icf.hpp>
\ No newline at end of file
diff --git a/modules/gpu/src/icf.hpp b/modules/gpu/src/icf.hpp
new file mode 100644
index 000000000..110890232
--- /dev/null
+++ b/modules/gpu/src/icf.hpp
@@ -0,0 +1,118 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2008-2012, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_ICF_HPP__
+#define __OPENCV_ICF_HPP__
+
+#if defined __CUDACC__
+# define __hd__ __host__ __device__ __forceinline__
+#else
+# define __hd__
+#endif
+
+
+namespace icf {
+
+    struct Cascade
+    {
+
+    };
+
+    struct ChannelStorage
+    {
+
+    };
+
+    struct __align__(16) Octave
+    {
+        ushort index;
+        ushort stages;
+        ushort shrinkage;
+        ushort2 size;
+        float scale;
+
+        Octave(const ushort i, const ushort s, const ushort sh, const ushort2 sz, const float sc)
+        : index(i), stages(s), shrinkage(sh), size(sz), scale(sc) {}
+    };
+
+    struct __align__(8) Node
+    {
+        int feature;
+        float threshold;
+
+        Node(const int f, const float t) : feature(f), threshold(t) {}
+    };
+
+    struct __align__(8) Feature
+    {
+        int channel;
+        uchar4 rect;
+
+        Feature(const int c, const uchar4 r) : channel(c), rect(r) {}
+    };
+
+    struct __align__(8) Level //is actually 24 bytes
+    {
+        int octave;
+
+        // float origScale; //not actually used
+        float relScale;
+        float shrScale;   // used for marking detection
+        float scaling[2]; // calculated according to Dollal paper
+
+        // for 640x480 we can not get overflow
+        uchar2 workRect;
+        uchar2 objSize;
+
+        Level(int idx, const Octave& oct, const float scale, const int w, const int h)
+        :  octave(idx), relScale(scale / oct.scale), shrScale (relScale / (float)oct.shrinkage)
+        {
+            workRect.x = round(w / (float)oct.shrinkage);
+            workRect.y = round(h / (float)oct.shrinkage);
+
+            objSize.x  = round(oct.size.x * relScale);
+            objSize.y  = round(oct.size.y * relScale);
+        }
+    };
+}
+
+#endif
\ No newline at end of file
diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index 509e3f501..04b68539c 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -56,12 +56,242 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat&, const GpuMat&, GpuMat
 
 #else
 
+#include <icf.hpp>
+
 struct cv::gpu::SoftCascade::Filds
 {
-    bool fill(const FileNode &root, const float mins, const float maxs){return true;}
-    void calcLevels(int frameW, int frameH, int scales) {}
+    // scales range
+    float minScale;
+    float maxScale;
+
+    int origObjWidth;
+    int origObjHeight;
+
+    GpuMat octaves;
+    GpuMat stages;
+    GpuMat nodes;
+    GpuMat leaves;
+    GpuMat features;
+
+    std::vector<float> scales;
+
+    icf::Cascade cascade;
+
+    bool fill(const FileNode &root, const float mins, const float maxs);
+
+private:
+    void calcLevels(const std::vector<icf::Octave>& octs,
+                                                    int frameW, int frameH, int nscales);
+
+    typedef std::vector<icf::Octave>::const_iterator  octIt_t;
+    int fitOctave(const std::vector<icf::Octave>& octs, const float& logFactor)
+    {
+        float minAbsLog = FLT_MAX;
+        int res =  0;
+        for (int oct = 0; oct < (int)octs.size(); ++oct)
+        {
+            const icf::Octave& octave =octs[oct];
+            float logOctave = ::log(octave.scale);
+            float logAbsScale = ::fabs(logFactor - logOctave);
+
+            if(logAbsScale < minAbsLog)
+            {
+                res = oct;
+                minAbsLog = logAbsScale;
+            }
+        }
+        return res;
+    }
 };
 
+inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float mins, const float maxs)
+{
+    minScale = mins;
+    maxScale = maxs;
+
+    // cascade properties
+    static const char *const SC_STAGE_TYPE          = "stageType";
+    static const char *const SC_BOOST               = "BOOST";
+
+    static const char *const SC_FEATURE_TYPE        = "featureType";
+    static const char *const SC_ICF                 = "ICF";
+
+    static const char *const SC_ORIG_W              = "width";
+    static const char *const SC_ORIG_H              = "height";
+
+    static const char *const SC_OCTAVES             = "octaves";
+    static const char *const SC_STAGES              = "stages";
+    static const char *const SC_FEATURES            = "features";
+
+    static const char *const SC_WEEK                = "weakClassifiers";
+    static const char *const SC_INTERNAL            = "internalNodes";
+    static const char *const SC_LEAF                = "leafValues";
+
+    static const char *const SC_OCT_SCALE           = "scale";
+    static const char *const SC_OCT_STAGES          = "stageNum";
+    static const char *const SC_OCT_SHRINKAGE       = "shrinkingFactor";
+
+    static const char *const SC_STAGE_THRESHOLD     = "stageThreshold";
+
+    static const char * const SC_F_CHANNEL          = "channel";
+    static const char * const SC_F_RECT             = "rect";
+
+    // only Ada Boost supported
+    std::string stageTypeStr = (string)root[SC_STAGE_TYPE];
+    CV_Assert(stageTypeStr == SC_BOOST);
+
+    // only HOG-like integral channel features cupported
+    string featureTypeStr = (string)root[SC_FEATURE_TYPE];
+    CV_Assert(featureTypeStr == SC_ICF);
+
+    origObjWidth = (int)root[SC_ORIG_W];
+    CV_Assert(origObjWidth == SoftCascade::ORIG_OBJECT_WIDTH);
+
+    origObjHeight = (int)root[SC_ORIG_H];
+    CV_Assert(origObjHeight == SoftCascade::ORIG_OBJECT_HEIGHT);
+
+    FileNode fn = root[SC_OCTAVES];
+        if (fn.empty()) return false;
+
+    std::vector<icf::Octave>  voctaves;
+    std::vector<float>        vstages;
+    std::vector<icf::Node>    vnodes;
+    std::vector<float>        vleaves;
+    std::vector<icf::Feature> vfeatures;
+    scales.clear();
+
+    // std::vector<Level> levels;
+
+    FileNodeIterator it = fn.begin(), it_end = fn.end();
+    int feature_offset = 0;
+    ushort octIndex = 0;
+
+    for (; it != it_end; ++it)
+    {
+        FileNode fns = *it;
+        float scale = (float)fns[SC_OCT_SCALE];
+        scales.push_back(scale);
+        ushort nstages = saturate_cast<ushort>((int)fn[SC_OCT_STAGES]);
+        ushort2 size;
+        size.x = cvRound(SoftCascade::ORIG_OBJECT_WIDTH * scale);
+        size.y = cvRound(SoftCascade::ORIG_OBJECT_HEIGHT * scale);
+        ushort shrinkage = saturate_cast<ushort>((int)fn[SC_OCT_SHRINKAGE]);
+
+        icf::Octave octave(octIndex, nstages, shrinkage, size, scale);
+        CV_Assert(octave.stages > 0);
+        voctaves.push_back(octave);
+
+        FileNode ffs = fns[SC_FEATURES];
+        if (ffs.empty()) return false;
+
+        fns = fns[SC_STAGES];
+        if (fn.empty()) return false;
+
+        // for each stage (~ decision tree with H = 2)
+        FileNodeIterator st = fns.begin(), st_end = fns.end();
+        for (; st != st_end; ++st )
+        {
+            fns = *st;
+            vstages.push_back((float)fn[SC_STAGE_THRESHOLD]);
+
+            fns = fns[SC_WEEK];
+            FileNodeIterator ftr = fns.begin(), ft_end = fns.end();
+            for (; ftr != ft_end; ++ftr)
+            {
+                fns = (*ftr)[SC_INTERNAL];
+                FileNodeIterator inIt = fns.begin(), inIt_end = fns.end();
+                for (; inIt != inIt_end;)
+                {
+                    int feature = (int)(*(inIt +=2)++) + feature_offset;
+                    vnodes.push_back(icf::Node(feature, (float)(*(inIt++))));
+                }
+
+                fns = (*ftr)[SC_LEAF];
+                inIt = fns.begin(), inIt_end = fns.end();
+                for (; inIt != inIt_end; ++inIt)
+                    vleaves.push_back((float)(*inIt));
+            }
+        }
+
+        st = ffs.begin(), st_end = ffs.end();
+        for (; st != st_end; ++st )
+        {
+            cv::FileNode rn = (*st)[SC_F_RECT];
+            cv::FileNodeIterator r_it = rn.begin();
+            uchar4 rect;
+            rect.x = saturate_cast<uchar>((int)*(r_it++));
+            rect.y = saturate_cast<uchar>((int)*(r_it++));
+            rect.z = saturate_cast<uchar>((int)*(r_it++));
+            rect.w = saturate_cast<uchar>((int)*(r_it++));
+            vfeatures.push_back(icf::Feature((int)(*st)[SC_F_CHANNEL], rect));
+        }
+
+        feature_offset += octave.stages * 3;
+        ++octIndex;
+    }
+
+    // upload in gpu memory
+    octaves.upload(cv::Mat(1, voctaves.size() * sizeof(icf::Octave), CV_8UC1, (uchar*)&(voctaves[0]) ));
+    CV_Assert(!octaves.empty());
+
+    stages.upload(cv::Mat(vstages).reshape(1,1));
+    CV_Assert(!stages.empty());
+
+    nodes.upload(cv::Mat(1, vnodes.size() * sizeof(icf::Node), CV_8UC1, (uchar*)&(vnodes[0]) ));
+    CV_Assert(!nodes.empty());
+
+    leaves.upload(cv::Mat(vleaves).reshape(1,1));
+    CV_Assert(!leaves.empty());
+
+    features.upload(cv::Mat(1, vfeatures.size() * sizeof(icf::Feature), CV_8UC1, (uchar*)&(vfeatures[0]) ));
+    CV_Assert(!features.empty());
+
+    // compute levels
+    calcLevels(voctaves, (int)SoftCascade::FRAME_WIDTH, (int)SoftCascade::FRAME_HEIGHT, (int)SoftCascade::TOTAL_SCALES);
+
+    return true;
+}
+
+inline void cv::gpu::SoftCascade::Filds::calcLevels(const std::vector<icf::Octave>& octs,
+                                                    int frameW, int frameH, int nscales)
+{
+    CV_Assert(nscales > 1);
+
+    std::vector<icf::Level> levels;
+    float logFactor = (::log(maxScale) - ::log(minScale)) / (nscales -1);
+
+    float scale = minScale;
+    for (int sc = 0; sc < nscales; ++sc)
+    {
+        int width  = ::std::max(0.0f, frameW - (origObjWidth  * scale));
+        int height = ::std::max(0.0f, frameH - (origObjHeight * scale));
+
+        float logScale = ::log(scale);
+        int fit = fitOctave(octs, logScale);
+
+        icf::Level level(fit, octs[fit], scale, width, height);
+
+        if (!width || !height)
+            break;
+        else
+            levels.push_back(level);
+
+        if (::fabs(scale - maxScale) < FLT_EPSILON) break;
+        scale = ::std::min(maxScale, ::expf(::log(scale) + logFactor));
+
+        // std::cout << "level " << sc << " scale "
+        //           << levels[sc].origScale
+        //           << " octeve "
+        //           << levels[sc].octave->scale
+        //           << " "
+        //           << levels[sc].relScale
+        //           << " " << levels[sc].shrScale
+        //           << " [" << levels[sc].objSize.width
+        //           << " " << levels[sc].objSize.height << "] ["
+        // << levels[sc].workRect.width << " " << levels[sc].workRect.height << "]" << std::endl;
+    }
+}
+
 cv::gpu::SoftCascade::SoftCascade() : filds(0) {}
 
 cv::gpu::SoftCascade::SoftCascade( const string& filename, const float minScale, const float maxScale) : filds(0)
@@ -86,8 +316,6 @@ bool cv::gpu::SoftCascade::load( const string& filename, const float minScale, c
     filds = new Filds;
     Filds& flds = *filds;
     if (!flds.fill(fs.getFirstTopLevelNode(), minScale, maxScale)) return false;
-    flds.calcLevels(FRAME_WIDTH, FRAME_HEIGHT, TOTAL_SCALES);
-
     return true;
 }
 
diff --git a/modules/gpu/test/test_softcascade.cpp b/modules/gpu/test/test_softcascade.cpp
new file mode 100644
index 000000000..821a2b140
--- /dev/null
+++ b/modules/gpu/test/test_softcascade.cpp
@@ -0,0 +1,73 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2008-2012, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include <test_precomp.hpp>
+
+#ifdef HAVE_CUDA
+
+using cv::gpu::GpuMat;
+
+TEST(SoftCascade, readCascade)
+{
+    std::string xml = cvtest::TS::ptr()->get_data_path() + "cascadeandhog/icf-template.xml";
+    cv::gpu::SoftCascade cascade;
+    ASSERT_TRUE(cascade.load(xml));
+
+}
+
+TEST(SoftCascade, detect)
+{
+    std::string xml =  cvtest::TS::ptr()->get_data_path() + "cascadeandhog/sc_cvpr_2012_to_opencv.xml";
+    cv::gpu::SoftCascade cascade;
+    ASSERT_TRUE(cascade.load(xml));
+
+    cv::Mat coloredCpu = cv::imread(cvtest::TS::ptr()->get_data_path() + "cascadeandhog/bahnhof/image_00000000_0.png");
+    ASSERT_FALSE(coloredCpu.empty());
+    GpuMat colored(coloredCpu), objectBoxes, rois;
+
+    // ASSERT_NO_THROW(
+    // {
+        cascade.detectMultiScale(colored, rois, objectBoxes);
+    // });
+}
+
+#endif
\ No newline at end of file

From 2b7ce8b16031124d4ebd261fd5b70e5fbc5f6d5a Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Thu, 20 Sep 2012 16:44:38 +0400
Subject: [PATCH 04/74] precompute feature response for scaling factor

---
 modules/gpu/src/softcascade.cpp | 36 +++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index 04b68539c..18306e04d 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -252,6 +252,40 @@ inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float
     return true;
 }
 
+namespace {
+    struct CascadeIntrinsics
+    {
+        static const float lambda = 1.099f, a = 0.89f;
+
+        static float getFor(int channel, float scaling)
+        {
+            CV_Assert(channel < 10);
+
+            if (fabs(scaling - 1.f) < FLT_EPSILON)
+                return 1.f;
+
+            // according to R. Benenson, M. Mathias, R. Timofte and L. Van Gool's and Dallal's papers
+            static const float A[2][2] =
+            {   //channel <= 6, otherwise
+                {        0.89f, 1.f}, // down
+                {        1.00f, 1.f}  // up
+            };
+
+            static const float B[2][2] =
+            {   //channel <= 6,  otherwise
+                { 1.099f / log(2), 2.f}, // down
+                {             0.f, 2.f}  // up
+            };
+
+            float a = A[(int)(scaling >= 1)][(int)(channel > 6)];
+            float b = B[(int)(scaling >= 1)][(int)(channel > 6)];
+
+            printf("!!! scaling: %f %f %f -> %f\n", scaling, a, b, a * pow(scaling, b));
+            return a * pow(scaling, b);
+        }
+    };
+}
+
 inline void cv::gpu::SoftCascade::Filds::calcLevels(const std::vector<icf::Octave>& octs,
                                                     int frameW, int frameH, int nscales)
 {
@@ -270,6 +304,8 @@ inline void cv::gpu::SoftCascade::Filds::calcLevels(const std::vector<icf::Octav
         int fit = fitOctave(octs, logScale);
 
         icf::Level level(fit, octs[fit], scale, width, height);
+        level.scaling[0] = CascadeIntrinsics::getFor(0, level.relScale);
+        level.scaling[1] = CascadeIntrinsics::getFor(9, level.relScale);
 
         if (!width || !height)
             break;

From 1ab7af69956491e5965a0429093c2296b91b7428 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Thu, 20 Sep 2012 18:35:48 +0400
Subject: [PATCH 05/74] GPU soft cascade: buffers preallocation

---
 modules/gpu/include/opencv2/gpu/gpu.hpp | 12 -----
 modules/gpu/src/icf.hpp                 | 16 +++++-
 modules/gpu/src/softcascade.cpp         | 68 +++++++++++++++++++++----
 3 files changed, 72 insertions(+), 24 deletions(-)

diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index 61f6006c5..5008e1027 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -1563,18 +1563,6 @@ public:
     virtual void detectMultiScale(const GpuMat& image, const GpuMat& rois, GpuMat& objects,
     int rejectfactor = 1, Stream stream = Stream::Null());
 
-protected:
-    enum { BOOST = 0 };
-    enum
-    {
-        FRAME_WIDTH        = 640,
-        FRAME_HEIGHT       = 480,
-        TOTAL_SCALES       = 55,
-        CLASSIFIERS        = 5,
-        ORIG_OBJECT_WIDTH  = 64,
-        ORIG_OBJECT_HEIGHT = 128
-    };
-
 private:
     struct Filds;
     Filds* filds;
diff --git a/modules/gpu/src/icf.hpp b/modules/gpu/src/icf.hpp
index 110890232..49919a79c 100644
--- a/modules/gpu/src/icf.hpp
+++ b/modules/gpu/src/icf.hpp
@@ -40,6 +40,8 @@
 //
 //M*/
 
+#include <opencv2/gpu/device/common.hpp>
+
 #ifndef __OPENCV_ICF_HPP__
 #define __OPENCV_ICF_HPP__
 
@@ -54,12 +56,24 @@ namespace icf {
 
     struct Cascade
     {
+        Cascade() {}
+        Cascade(const cv::gpu::PtrStepSzb& octs, const cv::gpu::PtrStepSzf& sts, const cv::gpu::PtrStepSzb& nds,
+            const cv::gpu::PtrStepSzf& lvs, const cv::gpu::PtrStepSzb& fts, const cv::gpu::PtrStepSzb& lls)
+        : octaves(octs), stages(sts), nodes(nds), leaves(lvs), features(fts), levels(lls) {}
+
+        cv::gpu::PtrStepSzb octaves;
+        cv::gpu::PtrStepSzf stages;
+        cv::gpu::PtrStepSzb nodes;
+        cv::gpu::PtrStepSzf leaves;
+        cv::gpu::PtrStepSzb features;
+
+        cv::gpu::PtrStepSzb levels;
 
     };
 
     struct ChannelStorage
     {
-
+        ChannelStorage(const cv::gpu::PtrStepSzb& /*f*/, const int /*shrinkage*/) {}
     };
 
     struct __align__(16) Octave
diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index 18306e04d..8ef1da457 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -72,19 +72,41 @@ struct cv::gpu::SoftCascade::Filds
     GpuMat nodes;
     GpuMat leaves;
     GpuMat features;
+    GpuMat levels;
+
+    // preallocated buffer 640x480x10
+    GpuMat dmem;
+    // 160x120x10
+    GpuMat shrunk;
+    // 161x121x10
+    GpuMat hogluv;
 
     std::vector<float> scales;
 
     icf::Cascade cascade;
 
     bool fill(const FileNode &root, const float mins, const float maxs);
+    void detect(const icf::ChannelStorage& /*channels*/) const {}
+
+    enum { BOOST = 0 };
+    enum
+    {
+        FRAME_WIDTH        = 640,
+        FRAME_HEIGHT       = 480,
+        TOTAL_SCALES       = 55,
+        CLASSIFIERS        = 5,
+        ORIG_OBJECT_WIDTH  = 64,
+        ORIG_OBJECT_HEIGHT = 128,
+        HOG_BINS           = 6,
+        HOG_LUV_BINS       = 10
+    };
 
 private:
     void calcLevels(const std::vector<icf::Octave>& octs,
                                                     int frameW, int frameH, int nscales);
 
     typedef std::vector<icf::Octave>::const_iterator  octIt_t;
-    int fitOctave(const std::vector<icf::Octave>& octs, const float& logFactor)
+    int fitOctave(const std::vector<icf::Octave>& octs, const float& logFactor) const
     {
         float minAbsLog = FLT_MAX;
         int res =  0;
@@ -145,10 +167,10 @@ inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float
     CV_Assert(featureTypeStr == SC_ICF);
 
     origObjWidth = (int)root[SC_ORIG_W];
-    CV_Assert(origObjWidth == SoftCascade::ORIG_OBJECT_WIDTH);
+    CV_Assert(origObjWidth  == ORIG_OBJECT_WIDTH);
 
     origObjHeight = (int)root[SC_ORIG_H];
-    CV_Assert(origObjHeight == SoftCascade::ORIG_OBJECT_HEIGHT);
+    CV_Assert(origObjHeight == ORIG_OBJECT_HEIGHT);
 
     FileNode fn = root[SC_OCTAVES];
         if (fn.empty()) return false;
@@ -165,6 +187,7 @@ inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float
     FileNodeIterator it = fn.begin(), it_end = fn.end();
     int feature_offset = 0;
     ushort octIndex = 0;
+    ushort shrinkage = 1;
 
     for (; it != it_end; ++it)
     {
@@ -173,9 +196,9 @@ inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float
         scales.push_back(scale);
         ushort nstages = saturate_cast<ushort>((int)fn[SC_OCT_STAGES]);
         ushort2 size;
-        size.x = cvRound(SoftCascade::ORIG_OBJECT_WIDTH * scale);
-        size.y = cvRound(SoftCascade::ORIG_OBJECT_HEIGHT * scale);
-        ushort shrinkage = saturate_cast<ushort>((int)fn[SC_OCT_SHRINKAGE]);
+        size.x = cvRound(ORIG_OBJECT_WIDTH * scale);
+        size.y = cvRound(ORIG_OBJECT_HEIGHT * scale);
+        shrinkage = saturate_cast<ushort>((int)fn[SC_OCT_SHRINKAGE]);
 
         icf::Octave octave(octIndex, nstages, shrinkage, size, scale);
         CV_Assert(octave.stages > 0);
@@ -247,7 +270,16 @@ inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float
     CV_Assert(!features.empty());
 
     // compute levels
-    calcLevels(voctaves, (int)SoftCascade::FRAME_WIDTH, (int)SoftCascade::FRAME_HEIGHT, (int)SoftCascade::TOTAL_SCALES);
+    calcLevels(voctaves, FRAME_WIDTH, FRAME_HEIGHT, TOTAL_SCALES);
+    CV_Assert(!levels.empty());
+
+    // init Cascade
+    cascade = icf::Cascade(octaves, stages, nodes, leaves, features, levels);
+
+    // allocate buffers
+    dmem.create(FRAME_HEIGHT * HOG_LUV_BINS, FRAME_WIDTH, CV_8UC1);
+    shrunk.create(FRAME_HEIGHT / shrinkage * HOG_LUV_BINS, FRAME_WIDTH / shrinkage, CV_8UC1);
+    hogluv.create( (FRAME_HEIGHT / shrinkage * HOG_LUV_BINS) + 1, (FRAME_WIDTH / shrinkage) + 1, CV_16UC1);
 
     return true;
 }
@@ -291,7 +323,7 @@ inline void cv::gpu::SoftCascade::Filds::calcLevels(const std::vector<icf::Octav
 {
     CV_Assert(nscales > 1);
 
-    std::vector<icf::Level> levels;
+    std::vector<icf::Level> vlevels;
     float logFactor = (::log(maxScale) - ::log(minScale)) / (nscales -1);
 
     float scale = minScale;
@@ -310,11 +342,13 @@ inline void cv::gpu::SoftCascade::Filds::calcLevels(const std::vector<icf::Octav
         if (!width || !height)
             break;
         else
-            levels.push_back(level);
+            vlevels.push_back(level);
 
         if (::fabs(scale - maxScale) < FLT_EPSILON) break;
         scale = ::std::min(maxScale, ::expf(::log(scale) + logFactor));
 
+        levels.upload(cv::Mat(1, vlevels.size() * sizeof(icf::Level), CV_8UC1, (uchar*)&(vlevels[0]) ));
+
         // std::cout << "level " << sc << " scale "
         //           << levels[sc].origScale
         //           << " octeve "
@@ -355,10 +389,22 @@ bool cv::gpu::SoftCascade::load( const string& filename, const float minScale, c
     return true;
 }
 
-void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& /*image*/, const GpuMat& /*rois*/,
+void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& image, const GpuMat& /*rois*/,
                                 GpuMat& /*objects*/, const int /*rejectfactor*/, Stream /*stream*/)
 {
-    // empty
+    // only color images are supperted
+    CV_Assert(image.type() == CV_8UC3);
+
+    // only this window size allowed
+    CV_Assert(image.cols == 640 && image.rows == 480);
+
+
+    // ToDo: add shrincage in whole cascade.
+    const int shrincage = 4;
+    icf::ChannelStorage storage(image, shrincage);
+
+    const Filds& flds = *filds;
+    flds.detect(storage);
 }
 
 #endif

From 4aac1444ad21041b226199d6fb6dcc12d884f928 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Thu, 20 Sep 2012 18:51:41 +0400
Subject: [PATCH 06/74] integral channel storage are cached as a cascade's
 field

---
 modules/gpu/src/icf.hpp         | 156 +++++++++++++++++---------------
 modules/gpu/src/softcascade.cpp |  13 ++-
 2 files changed, 91 insertions(+), 78 deletions(-)

diff --git a/modules/gpu/src/icf.hpp b/modules/gpu/src/icf.hpp
index 49919a79c..8cc4395c3 100644
--- a/modules/gpu/src/icf.hpp
+++ b/modules/gpu/src/icf.hpp
@@ -54,79 +54,93 @@
 
 namespace icf {
 
-    struct Cascade
+using cv::gpu::PtrStepSzb;
+using cv::gpu::PtrStepSzf;
+
+struct Cascade
+{
+    Cascade() {}
+    Cascade(const cv::gpu::PtrStepSzb& octs, const cv::gpu::PtrStepSzf& sts, const cv::gpu::PtrStepSzb& nds,
+        const cv::gpu::PtrStepSzf& lvs, const cv::gpu::PtrStepSzb& fts, const cv::gpu::PtrStepSzb& lls)
+    : octaves(octs), stages(sts), nodes(nds), leaves(lvs), features(fts), levels(lls) {}
+
+    PtrStepSzb octaves;
+    PtrStepSzf stages;
+    PtrStepSzb nodes;
+    PtrStepSzf leaves;
+    PtrStepSzb features;
+
+    PtrStepSzb levels;
+
+};
+
+struct ChannelStorage
+{
+    ChannelStorage(){}
+    ChannelStorage(const cv::gpu::PtrStepSzb& buff, const cv::gpu::PtrStepSzb& shr,
+        const cv::gpu::PtrStepSzb& itg, const int s)
+    : dmem (buff), shrunk(shr), hogluv(itg), shrinkage(s) {}
+
+    void frame(const cv::gpu::PtrStepSzb& image) {}
+
+    PtrStepSzb dmem;
+    PtrStepSzb shrunk;
+    PtrStepSzb hogluv;
+
+    int shrinkage;
+};
+
+struct __align__(16) Octave
+{
+    ushort index;
+    ushort stages;
+    ushort shrinkage;
+    ushort2 size;
+    float scale;
+
+    Octave(const ushort i, const ushort s, const ushort sh, const ushort2 sz, const float sc)
+    : index(i), stages(s), shrinkage(sh), size(sz), scale(sc) {}
+};
+
+struct __align__(8) Node
+{
+    int feature;
+    float threshold;
+
+    Node(const int f, const float t) : feature(f), threshold(t) {}
+};
+
+struct __align__(8) Feature
+{
+    int channel;
+    uchar4 rect;
+
+    Feature(const int c, const uchar4 r) : channel(c), rect(r) {}
+};
+
+struct __align__(8) Level //is actually 24 bytes
+{
+    int octave;
+
+    // float origScale; //not actually used
+    float relScale;
+    float shrScale;   // used for marking detection
+    float scaling[2]; // calculated according to Dollal paper
+
+    // for 640x480 we can not get overflow
+    uchar2 workRect;
+    uchar2 objSize;
+
+    Level(int idx, const Octave& oct, const float scale, const int w, const int h)
+    :  octave(idx), relScale(scale / oct.scale), shrScale (relScale / (float)oct.shrinkage)
     {
-        Cascade() {}
-        Cascade(const cv::gpu::PtrStepSzb& octs, const cv::gpu::PtrStepSzf& sts, const cv::gpu::PtrStepSzb& nds,
-            const cv::gpu::PtrStepSzf& lvs, const cv::gpu::PtrStepSzb& fts, const cv::gpu::PtrStepSzb& lls)
-        : octaves(octs), stages(sts), nodes(nds), leaves(lvs), features(fts), levels(lls) {}
+        workRect.x = round(w / (float)oct.shrinkage);
+        workRect.y = round(h / (float)oct.shrinkage);
 
-        cv::gpu::PtrStepSzb octaves;
-        cv::gpu::PtrStepSzf stages;
-        cv::gpu::PtrStepSzb nodes;
-        cv::gpu::PtrStepSzf leaves;
-        cv::gpu::PtrStepSzb features;
-
-        cv::gpu::PtrStepSzb levels;
-
-    };
-
-    struct ChannelStorage
-    {
-        ChannelStorage(const cv::gpu::PtrStepSzb& /*f*/, const int /*shrinkage*/) {}
-    };
-
-    struct __align__(16) Octave
-    {
-        ushort index;
-        ushort stages;
-        ushort shrinkage;
-        ushort2 size;
-        float scale;
-
-        Octave(const ushort i, const ushort s, const ushort sh, const ushort2 sz, const float sc)
-        : index(i), stages(s), shrinkage(sh), size(sz), scale(sc) {}
-    };
-
-    struct __align__(8) Node
-    {
-        int feature;
-        float threshold;
-
-        Node(const int f, const float t) : feature(f), threshold(t) {}
-    };
-
-    struct __align__(8) Feature
-    {
-        int channel;
-        uchar4 rect;
-
-        Feature(const int c, const uchar4 r) : channel(c), rect(r) {}
-    };
-
-    struct __align__(8) Level //is actually 24 bytes
-    {
-        int octave;
-
-        // float origScale; //not actually used
-        float relScale;
-        float shrScale;   // used for marking detection
-        float scaling[2]; // calculated according to Dollal paper
-
-        // for 640x480 we can not get overflow
-        uchar2 workRect;
-        uchar2 objSize;
-
-        Level(int idx, const Octave& oct, const float scale, const int w, const int h)
-        :  octave(idx), relScale(scale / oct.scale), shrScale (relScale / (float)oct.shrinkage)
-        {
-            workRect.x = round(w / (float)oct.shrinkage);
-            workRect.y = round(h / (float)oct.shrinkage);
-
-            objSize.x  = round(oct.size.x * relScale);
-            objSize.y  = round(oct.size.y * relScale);
-        }
-    };
+        objSize.x  = round(oct.size.x * relScale);
+        objSize.y  = round(oct.size.y * relScale);
+    }
+};
 }
 
 #endif
\ No newline at end of file
diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index 8ef1da457..54f37cd17 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -84,9 +84,10 @@ struct cv::gpu::SoftCascade::Filds
     std::vector<float> scales;
 
     icf::Cascade cascade;
+    icf::ChannelStorage storage;
 
     bool fill(const FileNode &root, const float mins, const float maxs);
-    void detect(const icf::ChannelStorage& /*channels*/) const {}
+    void detect() const {}
 
     enum { BOOST = 0 };
     enum
@@ -281,6 +282,7 @@ inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float
     shrunk.create(FRAME_HEIGHT / shrinkage * HOG_LUV_BINS, FRAME_WIDTH / shrinkage, CV_8UC1);
     hogluv.create( (FRAME_HEIGHT / shrinkage * HOG_LUV_BINS) + 1, (FRAME_WIDTH / shrinkage) + 1, CV_16UC1);
 
+    storage = icf::ChannelStorage(dmem, shrunk, hogluv, shrinkage);
     return true;
 }
 
@@ -398,13 +400,10 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& image, const GpuMat& /
     // only this window size allowed
     CV_Assert(image.cols == 640 && image.rows == 480);
 
+    Filds& flds = *filds;
 
-    // ToDo: add shrincage in whole cascade.
-    const int shrincage = 4;
-    icf::ChannelStorage storage(image, shrincage);
-
-    const Filds& flds = *filds;
-    flds.detect(storage);
+    flds.storage.frame(image);
+    flds.detect();
 }
 
 #endif

From a3af5ede8059b29617e400bab631c122422b6316 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Thu, 20 Sep 2012 19:35:04 +0400
Subject: [PATCH 07/74] CUDA callers

---
 modules/gpu/src/cuda/isf-sc.cu  | 13 ++++++++++++-
 modules/gpu/src/icf.hpp         |  4 +++-
 modules/gpu/src/softcascade.cpp | 17 ++++++++++-------
 3 files changed, 25 insertions(+), 9 deletions(-)

diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu
index f36f86f96..a6418c1d3 100644
--- a/modules/gpu/src/cuda/isf-sc.cu
+++ b/modules/gpu/src/cuda/isf-sc.cu
@@ -40,4 +40,15 @@
 //
 //M*/
 
-#include <icf.hpp>
\ No newline at end of file
+#include <icf.hpp>
+
+void icf::Cascade::detect(const cv::gpu::PtrStepSzb& hogluv) const
+{
+    // detection kernel
+}
+
+void icf::ChannelStorage::frame(const cv::gpu::PtrStepSz<uchar4>& image)
+{
+    // color convertin kernel
+    // hog calculation kernel
+}
\ No newline at end of file
diff --git a/modules/gpu/src/icf.hpp b/modules/gpu/src/icf.hpp
index 8cc4395c3..7183fc0ac 100644
--- a/modules/gpu/src/icf.hpp
+++ b/modules/gpu/src/icf.hpp
@@ -64,6 +64,8 @@ struct Cascade
         const cv::gpu::PtrStepSzf& lvs, const cv::gpu::PtrStepSzb& fts, const cv::gpu::PtrStepSzb& lls)
     : octaves(octs), stages(sts), nodes(nds), leaves(lvs), features(fts), levels(lls) {}
 
+    void detect(const cv::gpu::PtrStepSzb& hogluv) const;
+
     PtrStepSzb octaves;
     PtrStepSzf stages;
     PtrStepSzb nodes;
@@ -81,7 +83,7 @@ struct ChannelStorage
         const cv::gpu::PtrStepSzb& itg, const int s)
     : dmem (buff), shrunk(shr), hogluv(itg), shrinkage(s) {}
 
-    void frame(const cv::gpu::PtrStepSzb& image) {}
+    void frame(const cv::gpu::PtrStepSz<uchar4>& image);
 
     PtrStepSzb dmem;
     PtrStepSzb shrunk;
diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index 54f37cd17..80473da95 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -74,7 +74,7 @@ struct cv::gpu::SoftCascade::Filds
     GpuMat features;
     GpuMat levels;
 
-    // preallocated buffer 640x480x10
+    // preallocated buffer 640x480x10 + 640x480
     GpuMat dmem;
     // 160x120x10
     GpuMat shrunk;
@@ -86,9 +86,6 @@ struct cv::gpu::SoftCascade::Filds
     icf::Cascade cascade;
     icf::ChannelStorage storage;
 
-    bool fill(const FileNode &root, const float mins, const float maxs);
-    void detect() const {}
-
     enum { BOOST = 0 };
     enum
     {
@@ -102,6 +99,12 @@ struct cv::gpu::SoftCascade::Filds
         HOG_LUV_BINS       = 10
     };
 
+    bool fill(const FileNode &root, const float mins, const float maxs);
+    void detect() const
+    {
+        cascade.detect(hogluv);
+    }
+
 private:
     void calcLevels(const std::vector<icf::Octave>& octs,
                                                     int frameW, int frameH, int nscales);
@@ -278,7 +281,7 @@ inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float
     cascade = icf::Cascade(octaves, stages, nodes, leaves, features, levels);
 
     // allocate buffers
-    dmem.create(FRAME_HEIGHT * HOG_LUV_BINS, FRAME_WIDTH, CV_8UC1);
+    dmem.create(FRAME_HEIGHT * (HOG_LUV_BINS + 1), FRAME_WIDTH, CV_8UC1);
     shrunk.create(FRAME_HEIGHT / shrinkage * HOG_LUV_BINS, FRAME_WIDTH / shrinkage, CV_8UC1);
     hogluv.create( (FRAME_HEIGHT / shrinkage * HOG_LUV_BINS) + 1, (FRAME_WIDTH / shrinkage) + 1, CV_16UC1);
 
@@ -395,7 +398,7 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& image, const GpuMat& /
                                 GpuMat& /*objects*/, const int /*rejectfactor*/, Stream /*stream*/)
 {
     // only color images are supperted
-    CV_Assert(image.type() == CV_8UC3);
+    CV_Assert(image.type() == CV_8UC4);
 
     // only this window size allowed
     CV_Assert(image.cols == 640 && image.rows == 480);
@@ -406,4 +409,4 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& image, const GpuMat& /
     flds.detect();
 }
 
-#endif
+#endif
\ No newline at end of file

From 14a0dd8c98ecff30ea17fe4ecd1a900df6f32f06 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Fri, 21 Sep 2012 11:44:26 +0400
Subject: [PATCH 08/74] fix typo in cascade loading

---
 modules/gpu/src/softcascade.cpp       | 33 +++++++++++++--------------
 modules/gpu/test/test_softcascade.cpp |  7 +++---
 2 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index 80473da95..fb36efddc 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -198,11 +198,11 @@ inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float
         FileNode fns = *it;
         float scale = (float)fns[SC_OCT_SCALE];
         scales.push_back(scale);
-        ushort nstages = saturate_cast<ushort>((int)fn[SC_OCT_STAGES]);
+        ushort nstages = saturate_cast<ushort>((int)fns[SC_OCT_STAGES]);
         ushort2 size;
         size.x = cvRound(ORIG_OBJECT_WIDTH * scale);
         size.y = cvRound(ORIG_OBJECT_HEIGHT * scale);
-        shrinkage = saturate_cast<ushort>((int)fn[SC_OCT_SHRINKAGE]);
+        shrinkage = saturate_cast<ushort>((int)fns[SC_OCT_SHRINKAGE]);
 
         icf::Octave octave(octIndex, nstages, shrinkage, size, scale);
         CV_Assert(octave.stages > 0);
@@ -219,7 +219,7 @@ inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float
         for (; st != st_end; ++st )
         {
             fns = *st;
-            vstages.push_back((float)fn[SC_STAGE_THRESHOLD]);
+            vstages.push_back((float)fns[SC_STAGE_THRESHOLD]);
 
             fns = fns[SC_WEEK];
             FileNodeIterator ftr = fns.begin(), ft_end = fns.end();
@@ -230,7 +230,8 @@ inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float
                 for (; inIt != inIt_end;)
                 {
                     int feature = (int)(*(inIt +=2)++) + feature_offset;
-                    vnodes.push_back(icf::Node(feature, (float)(*(inIt++))));
+                    float th = (float)(*(inIt++));
+                    vnodes.push_back(icf::Node(feature, th));
                 }
 
                 fns = (*ftr)[SC_LEAF];
@@ -277,7 +278,7 @@ inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float
     calcLevels(voctaves, FRAME_WIDTH, FRAME_HEIGHT, TOTAL_SCALES);
     CV_Assert(!levels.empty());
 
-    // init Cascade
+    //init Cascade
     cascade = icf::Cascade(octaves, stages, nodes, leaves, features, levels);
 
     // allocate buffers
@@ -317,7 +318,7 @@ namespace {
             float a = A[(int)(scaling >= 1)][(int)(channel > 6)];
             float b = B[(int)(scaling >= 1)][(int)(channel > 6)];
 
-            printf("!!! scaling: %f %f %f -> %f\n", scaling, a, b, a * pow(scaling, b));
+            // printf("!!! scaling: %f %f %f -> %f\n", scaling, a, b, a * pow(scaling, b));
             return a * pow(scaling, b);
         }
     };
@@ -352,19 +353,17 @@ inline void cv::gpu::SoftCascade::Filds::calcLevels(const std::vector<icf::Octav
         if (::fabs(scale - maxScale) < FLT_EPSILON) break;
         scale = ::std::min(maxScale, ::expf(::log(scale) + logFactor));
 
-        levels.upload(cv::Mat(1, vlevels.size() * sizeof(icf::Level), CV_8UC1, (uchar*)&(vlevels[0]) ));
-
-        // std::cout << "level " << sc << " scale "
-        //           << levels[sc].origScale
+        // std::cout << "level " << sc
         //           << " octeve "
-        //           << levels[sc].octave->scale
-        //           << " "
-        //           << levels[sc].relScale
-        //           << " " << levels[sc].shrScale
-        //           << " [" << levels[sc].objSize.width
-        //           << " " << levels[sc].objSize.height << "] ["
-        // << levels[sc].workRect.width << " " << levels[sc].workRect.height << "]" << std::endl;
+        //           << vlevels[sc].octave
+        //           << " relScale "
+        //           << vlevels[sc].relScale
+        //           << " " << vlevels[sc].shrScale
+        //           << " [" << (int)vlevels[sc].objSize.x
+        //           << " " <<  (int)vlevels[sc].objSize.y << "] ["
+        // <<  (int)vlevels[sc].workRect.x << " " <<  (int)vlevels[sc].workRect.y << "]" << std::endl;
     }
+    levels.upload(cv::Mat(1, vlevels.size() * sizeof(icf::Level), CV_8UC1, (uchar*)&(vlevels[0]) ));
 }
 
 cv::gpu::SoftCascade::SoftCascade() : filds(0) {}
diff --git a/modules/gpu/test/test_softcascade.cpp b/modules/gpu/test/test_softcascade.cpp
index 821a2b140..f94b0b726 100644
--- a/modules/gpu/test/test_softcascade.cpp
+++ b/modules/gpu/test/test_softcascade.cpp
@@ -48,7 +48,7 @@ using cv::gpu::GpuMat;
 
 TEST(SoftCascade, readCascade)
 {
-    std::string xml = cvtest::TS::ptr()->get_data_path() + "cascadeandhog/icf-template.xml";
+    std::string xml = cvtest::TS::ptr()->get_data_path() + "../cv/cascadeandhog/icf-template.xml";
     cv::gpu::SoftCascade cascade;
     ASSERT_TRUE(cascade.load(xml));
 
@@ -56,11 +56,12 @@ TEST(SoftCascade, readCascade)
 
 TEST(SoftCascade, detect)
 {
-    std::string xml =  cvtest::TS::ptr()->get_data_path() + "cascadeandhog/sc_cvpr_2012_to_opencv.xml";
+    std::string xml =  cvtest::TS::ptr()->get_data_path() + "../cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml";
     cv::gpu::SoftCascade cascade;
     ASSERT_TRUE(cascade.load(xml));
 
-    cv::Mat coloredCpu = cv::imread(cvtest::TS::ptr()->get_data_path() + "cascadeandhog/bahnhof/image_00000000_0.png");
+    cv::Mat coloredCpu = cv::imread(cvtest::TS::ptr()->get_data_path()
+        + "../cv/cascadeandhog/bahnhof/image_00000000_0.png");
     ASSERT_FALSE(coloredCpu.empty());
     GpuMat colored(coloredCpu), objectBoxes, rois;
 

From 1a52a322b50b7756464e8021f7225fa75f5a8dc2 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Fri, 21 Sep 2012 16:10:40 +0400
Subject: [PATCH 09/74] add performance test for GPU soft cascade

---
 modules/gpu/perf/perf_objdetect.cpp | 39 +++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/modules/gpu/perf/perf_objdetect.cpp b/modules/gpu/perf/perf_objdetect.cpp
index 6b864a3e5..cebcbdb63 100644
--- a/modules/gpu/perf/perf_objdetect.cpp
+++ b/modules/gpu/perf/perf_objdetect.cpp
@@ -89,6 +89,45 @@ PERF_TEST_P(HOG, CalTech, Values<string>("gpu/caltech/image_00000009_0.png", "gp
     SANITY_CHECK(found_locations);
 }
 
+typedef pair<string, string> pair_string;
+DEF_PARAM_TEST_1(SoftCascade, pair_string);
+
+PERF_TEST_P(SoftCascade, detect, Values<pair_string>(make_pair("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml",
+                                                              "cv/cascadeandhog/bahnhof/image_00000000_0.png")))
+{
+    if (runOnGpu)
+    {
+        cv::Mat cpu = readImage(GetParam().second);
+        ASSERT_FALSE(cpu.empty());
+        cv::gpu::GpuMat colored(cpu);
+
+        cv::gpu::SoftCascade cascade;
+        ASSERT_TRUE(cascade.load(GetParam().first));
+
+        cv::gpu::GpuMat rois, objectBoxes;
+        cascade.detectMultiScale(colored, rois, objectBoxes);
+
+        TEST_CYCLE()
+        {
+            cascade.detectMultiScale(colored, rois, objectBoxes);
+        }
+    } else
+    {
+        cv::Mat colored = readImage(GetParam().second);
+        ASSERT_FALSE(colored.empty());
+
+        cv::SoftCascade cascade;
+        ASSERT_TRUE(cascade.load(GetParam().first));
+
+        std::vector<cv::Rect> rois, objectBoxes;
+        cascade.detectMultiScale(colored, rois, objectBoxes);
+
+        TEST_CYCLE()
+        {
+            cascade.detectMultiScale(colored, rois, objectBoxes);
+        }
+    }
+}
 
 ///////////////////////////////////////////////////////////////
 // HaarClassifier

From 5d15e4ea58f8aa591d9be9d64ae4d22936fe0b88 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Fri, 21 Sep 2012 16:12:18 +0400
Subject: [PATCH 10/74] CUDA kernels interface

---
 modules/gpu/src/cuda/isf-sc.cu  | 85 +++++++++++++++++++++++++++++++--
 modules/gpu/src/icf.hpp         | 27 ++++++++---
 modules/gpu/src/softcascade.cpp | 14 +++---
 3 files changed, 110 insertions(+), 16 deletions(-)

diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu
index a6418c1d3..89a74eeac 100644
--- a/modules/gpu/src/cuda/isf-sc.cu
+++ b/modules/gpu/src/cuda/isf-sc.cu
@@ -42,13 +42,90 @@
 
 #include <icf.hpp>
 
-void icf::Cascade::detect(const cv::gpu::PtrStepSzb& hogluv) const
+namespace cv { namespace gpu {
+
+
+ namespace device {
+
+__global__ void rgb2grayluv(const uchar3* __restrict__ rgb, uchar* __restrict__ hog,
+                            const int rgbPitch, const int hogPitch)
 {
-    // detection kernel
 }
 
-void icf::ChannelStorage::frame(const cv::gpu::PtrStepSz<uchar4>& image)
+__global__ void gray2hog(const uchar* __restrict__ gray, uchar* __restrict__ hog,
+                         const int pitch)
+{
+}
+
+__global__ void decimate(const uchar* __restrict__ hogluv, uchar* __restrict__ shrank,
+                        const int inPitch, const int outPitch )
+{
+}
+
+__global__ void intRow(const uchar* __restrict__ hogluv, ushort* __restrict__ sum,
+                       const int inPitch, const int outPitch)
+{
+
+}
+
+__global__ void intCol(ushort* __restrict__ sum, const int pitch)
+{
+
+}
+
+
+__global__ void detect(const cv::gpu::icf::Cascade cascade, const uchar* __restrict__ hogluv, const int pitch)
+{
+    cascade.detectAt();
+}
+
+}
+
+void __device icf::Cascade::detectAt() const
+{
+
+}
+
+void icf::Cascade::detect(const cv::gpu::PtrStepSzb& hogluv, cudaStream_t stream) const
+{
+    // detection kernel
+
+}
+
+void icf::ChannelStorage::frame(const cv::gpu::PtrStepSz<uchar3>& rgb, cudaStream_t stream)
 {
     // color convertin kernel
+    dim3 block(32, 8);
+    dim3 grid(FRAME_WIDTH / 32, FRAME_HEIGHT / 8);
+
+    uchar * channels = (uchar*)dmem.ptr(FRAME_HEIGHT * HOG_BINS);
+    device::rgb2grayluv<<<grid, block, 0, stream>>>((uchar3*)rgb.ptr(), channels, rgb.step, dmem.step);
+    cudaSafeCall( cudaGetLastError());
+
     // hog calculation kernel
-}
\ No newline at end of file
+    channels = (uchar*)dmem.ptr(FRAME_HEIGHT * HOG_LUV_BINS);
+    device::gray2hog<<<grid, block, 0, stream>>>(channels, (uchar*)dmem.ptr(), dmem.step);
+    cudaSafeCall( cudaGetLastError() );
+
+    const int shrWidth  = FRAME_WIDTH / shrinkage;
+    const int shrHeight = FRAME_HEIGHT / shrinkage;
+
+    // decimate kernel
+    grid = dim3(shrWidth / 32, shrHeight / 8);
+    device::decimate<<<grid, block, 0, stream>>>((uchar*)dmem.ptr(), (uchar*)shrunk.ptr(), dmem.step, shrunk.step);
+    cudaSafeCall( cudaGetLastError() );
+
+    // integrate rows
+    block = dim3(shrWidth, 1);
+    grid = dim3(shrHeight * HOG_LUV_BINS, 1);
+    device::intRow<<<grid, block, 0, stream>>>((uchar*)shrunk.ptr(), (ushort*)hogluv.ptr(), shrunk.step, hogluv.step);
+    cudaSafeCall( cudaGetLastError() );
+
+    // integrate cols
+    block = dim3(128, 1);
+    grid = dim3(shrWidth * HOG_LUV_BINS, 1);
+    device::intCol<<<grid, block, 0, stream>>>((ushort*)hogluv.ptr(), hogluv.step);
+    cudaSafeCall( cudaGetLastError() );
+}
+
+}}
\ No newline at end of file
diff --git a/modules/gpu/src/icf.hpp b/modules/gpu/src/icf.hpp
index 7183fc0ac..8b075beba 100644
--- a/modules/gpu/src/icf.hpp
+++ b/modules/gpu/src/icf.hpp
@@ -46,17 +46,19 @@
 #define __OPENCV_ICF_HPP__
 
 #if defined __CUDACC__
-# define __hd__ __host__ __device__ __forceinline__
+# define __device __device__ __forceinline__
 #else
-# define __hd__
+# define __device
 #endif
 
 
-namespace icf {
+namespace cv { namespace gpu { namespace icf {
 
 using cv::gpu::PtrStepSzb;
 using cv::gpu::PtrStepSzf;
 
+typedef unsigned char uchar;
+
 struct Cascade
 {
     Cascade() {}
@@ -64,7 +66,8 @@ struct Cascade
         const cv::gpu::PtrStepSzf& lvs, const cv::gpu::PtrStepSzb& fts, const cv::gpu::PtrStepSzb& lls)
     : octaves(octs), stages(sts), nodes(nds), leaves(lvs), features(fts), levels(lls) {}
 
-    void detect(const cv::gpu::PtrStepSzb& hogluv) const;
+    void detect(const cv::gpu::PtrStepSzb& hogluv, cudaStream_t stream) const;
+    void __device detectAt() const;
 
     PtrStepSzb octaves;
     PtrStepSzf stages;
@@ -83,12 +86,24 @@ struct ChannelStorage
         const cv::gpu::PtrStepSzb& itg, const int s)
     : dmem (buff), shrunk(shr), hogluv(itg), shrinkage(s) {}
 
-    void frame(const cv::gpu::PtrStepSz<uchar4>& image);
+    void frame(const cv::gpu::PtrStepSz<uchar3>& rgb, cudaStream_t stream);
 
     PtrStepSzb dmem;
     PtrStepSzb shrunk;
     PtrStepSzb hogluv;
 
+    enum
+    {
+        FRAME_WIDTH        = 640,
+        FRAME_HEIGHT       = 480,
+        TOTAL_SCALES       = 55,
+        CLASSIFIERS        = 5,
+        ORIG_OBJECT_WIDTH  = 64,
+        ORIG_OBJECT_HEIGHT = 128,
+        HOG_BINS           = 6,
+        HOG_LUV_BINS       = 10
+    };
+
     int shrinkage;
 };
 
@@ -143,6 +158,6 @@ struct __align__(8) Level //is actually 24 bytes
         objSize.y  = round(oct.size.y * relScale);
     }
 };
-}
+}}}
 
 #endif
\ No newline at end of file
diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index fb36efddc..b2419c12c 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -100,9 +100,9 @@ struct cv::gpu::SoftCascade::Filds
     };
 
     bool fill(const FileNode &root, const float mins, const float maxs);
-    void detect() const
+    void detect(cudaStream_t stream) const
     {
-        cascade.detect(hogluv);
+        cascade.detect(hogluv, stream);
     }
 
 private:
@@ -394,18 +394,20 @@ bool cv::gpu::SoftCascade::load( const string& filename, const float minScale, c
 }
 
 void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& image, const GpuMat& /*rois*/,
-                                GpuMat& /*objects*/, const int /*rejectfactor*/, Stream /*stream*/)
+                                GpuMat& /*objects*/, const int /*rejectfactor*/, Stream s)
 {
     // only color images are supperted
-    CV_Assert(image.type() == CV_8UC4);
+    CV_Assert(image.type() == CV_8UC3);
 
     // only this window size allowed
     CV_Assert(image.cols == 640 && image.rows == 480);
 
     Filds& flds = *filds;
 
-    flds.storage.frame(image);
-    flds.detect();
+    cudaStream_t stream = StreamAccessor::getStream(s);
+
+    flds.storage.frame(image, stream);
+    flds.detect(stream);
 }
 
 #endif
\ No newline at end of file

From 0691dc554f046b880f61ad26526a798f01c12b34 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Fri, 21 Sep 2012 18:49:51 +0400
Subject: [PATCH 11/74] fix compilation

---
 modules/gpu/src/cuda/texture_binder.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/gpu/src/cuda/texture_binder.hpp b/modules/gpu/src/cuda/texture_binder.hpp
index 4f42b099d..391eb9a19 100644
--- a/modules/gpu/src/cuda/texture_binder.hpp
+++ b/modules/gpu/src/cuda/texture_binder.hpp
@@ -85,7 +85,7 @@ namespace cv
 
   namespace device
   {
-      using pcl::gpu::TextureBinder;
+      using cv::gpu::TextureBinder;
   }
 }
 

From 1cf7a46f3a01c88fdbefc785943f4b4182ab4079 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Fri, 21 Sep 2012 19:42:19 +0400
Subject: [PATCH 12/74] fix data paths in performance test

---
 modules/gpu/perf/perf_objdetect.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/gpu/perf/perf_objdetect.cpp b/modules/gpu/perf/perf_objdetect.cpp
index cebcbdb63..a3a6e9c6b 100644
--- a/modules/gpu/perf/perf_objdetect.cpp
+++ b/modules/gpu/perf/perf_objdetect.cpp
@@ -97,12 +97,12 @@ PERF_TEST_P(SoftCascade, detect, Values<pair_string>(make_pair("cv/cascadeandhog
 {
     if (runOnGpu)
     {
-        cv::Mat cpu = readImage(GetParam().second);
+        cv::Mat cpu = readImage (GetParam().second);
         ASSERT_FALSE(cpu.empty());
         cv::gpu::GpuMat colored(cpu);
 
         cv::gpu::SoftCascade cascade;
-        ASSERT_TRUE(cascade.load(GetParam().first));
+        ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath(GetParam().first)));
 
         cv::gpu::GpuMat rois, objectBoxes;
         cascade.detectMultiScale(colored, rois, objectBoxes);

From 08b4e780deb9ff6d7f1dc31031c317ea6b3bb6e6 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Fri, 21 Sep 2012 19:44:30 +0400
Subject: [PATCH 13/74] add shrinking kernel

---
 modules/gpu/src/cuda/isf-sc.cu | 132 +++++++++++++++++++++++++++++++--
 modules/gpu/src/icf.hpp        |   1 +
 2 files changed, 126 insertions(+), 7 deletions(-)

diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu
index 89a74eeac..5cde71070 100644
--- a/modules/gpu/src/cuda/isf-sc.cu
+++ b/modules/gpu/src/cuda/isf-sc.cu
@@ -41,25 +41,136 @@
 //M*/
 
 #include <icf.hpp>
+#include <opencv2/gpu/device/saturate_cast.hpp>
 
 namespace cv { namespace gpu {
 
 
  namespace device {
 
+enum {
+    HOG_BINS = 6,
+    HOG_LUV_BINS = 10,
+    WIDTH = 640,
+    HEIGHT = 480,
+    GREY_OFFSET = HEIGHT * HOG_LUV_BINS
+};
+
+/* Returns the nearest upper power of two, works only for
+the typical GPU thread count (pert block) values */
+int power_2up(unsigned int n)
+{
+    if (n < 1) return 1;
+    else if (n < 2) return 2;
+    else if (n < 4) return 4;
+    else if (n < 8) return 8;
+    else if (n < 16) return 16;
+    else if (n < 32) return 32;
+    else if (n < 64) return 64;
+    else if (n < 128) return 128;
+    else if (n < 256) return 256;
+    else if (n < 512) return 512;
+    else if (n < 1024) return 1024;
+    return -1; // Input is too big
+}
+
+
+__device__ __forceinline__ uchar grey(const uchar3 rgb)
+{
+    return saturate_cast<uchar>(rgb.x * 0.114f + rgb.y * 0.587f + rgb.z * 0.299f);
+}
+
+__device__ __forceinline__ void luv(const uchar3 rgb, uchar& l, uchar& u, uchar& v)
+{
+
+}
+
 __global__ void rgb2grayluv(const uchar3* __restrict__ rgb, uchar* __restrict__ hog,
                             const int rgbPitch, const int hogPitch)
 {
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;
+
+    const uchar3 color = rgb[rgbPitch * y + x];
+
+    uchar l, u, v;
+    luv(color, l, u, v);
+
+    hog[hogPitch *  y + x] = l;
+    hog[hogPitch * (y + HEIGHT) + x] = u;
+    hog[hogPitch * (y + 2 * HEIGHT) + x] = v;
+    hog[hogPitch * (y + 3 * HEIGHT) + x] = grey(color);
 }
 
-__global__ void gray2hog(const uchar* __restrict__ gray, uchar* __restrict__ hog,
-                         const int pitch)
+__device__ __forceinline__
+int qangle(const float &y, const float &x)
 {
+    int bin = 0;
+//     const float2 &bin_vector_zero = const_angle_bins_vectors[0];
+//     float max_dot_product = fabs(x*bin_vector_zero.x + y*bin_vector_zero.y);
+
+//     // let us hope this gets unrolled
+// #pragma unroll
+//     for(int i=1; i < num_angles_bin; i+=1)
+//     {
+//         const float2 &bin_vector_i = const_angle_bins_vectors[i];
+//         //const float2 bin_vector_i = const_angle_bins_vectors[i];
+//         //const float2 &bin_vector_i = angle_bins_vectors[i];
+//         const float dot_product = fabs(x*bin_vector_i.x + y*bin_vector_i.y);
+//         if(dot_product > max_dot_product)
+//         {
+//             max_dot_product = dot_product;
+//             index = i;
+//         }
+//     }
+
+    return bin;
 }
 
+// texture<uchar, 2, cudaReadModeElementType> tgray;
+__global__ void gray2hog(const uchar* __restrict__ gray, uchar* __restrict__ hog, const int pitch, const float norm)
+{
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;
+
+    // derivative
+    float dx = gray[y * pitch + x + 1];
+    dx -= gray[y * pitch + x - 1];
+
+    float dy = gray[(y + 1) * pitch + x];
+    dy -= gray[(y -1) * pitch + x - 1];
+
+    // mag and angle
+    const uchar mag =  saturate_cast<uchar>(sqrtf(dy * dy + dx * dx) * norm);
+    const int bin = qangle(dx, dy);
+
+}
+
+template <int FACTOR>
+__device__ __forceinline__ uchar shrink(const uchar* ptr, const int pitch, const int y, const int x)
+{
+    int out = 0;
+#pragma unroll
+    for(int dy = 0; dy < FACTOR; ++dy)
+#pragma unroll
+        for(int dx = 0; dx < FACTOR; ++dx)
+        {
+            out += ptr[dy * pitch + dx];
+        }
+
+    return saturate_cast<uchar>(out / FACTOR);
+}
+
+template<int FACTOR>
 __global__ void decimate(const uchar* __restrict__ hogluv, uchar* __restrict__ shrank,
                         const int inPitch, const int outPitch )
 {
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;
+
+    const uchar* ptr = hogluv + (FACTOR * y) * inPitch + (FACTOR * x);
+
+    shrank[ y * outPitch + x]= shrink<FACTOR>(ptr, inPitch, y, x);
 }
 
 __global__ void intRow(const uchar* __restrict__ hogluv, ushort* __restrict__ sum,
@@ -89,6 +200,11 @@ void __device icf::Cascade::detectAt() const
 void icf::Cascade::detect(const cv::gpu::PtrStepSzb& hogluv, cudaStream_t stream) const
 {
     // detection kernel
+    dim3 block(32, 8, 1);
+    dim3 grid(32 * ChannelStorage::FRAME_WIDTH / 32, ChannelStorage::FRAME_HEIGHT / 8, 64);
+    device::detect<<<grid, block, 0, stream>>>(*this, hogluv, hogluv.step / sizeof(ushort));
+    if (!stream)
+        cudaSafeCall( cudaDeviceSynchronize() );
 
 }
 
@@ -99,12 +215,13 @@ void icf::ChannelStorage::frame(const cv::gpu::PtrStepSz<uchar3>& rgb, cudaStrea
     dim3 grid(FRAME_WIDTH / 32, FRAME_HEIGHT / 8);
 
     uchar * channels = (uchar*)dmem.ptr(FRAME_HEIGHT * HOG_BINS);
-    device::rgb2grayluv<<<grid, block, 0, stream>>>((uchar3*)rgb.ptr(), channels, rgb.step, dmem.step);
+    device::rgb2grayluv<<<grid, block, 0, stream>>>((uchar3*)rgb.ptr(), channels,
+                                                    rgb.step / sizeof(uchar3), dmem.step);
     cudaSafeCall( cudaGetLastError());
 
     // hog calculation kernel
     channels = (uchar*)dmem.ptr(FRAME_HEIGHT * HOG_LUV_BINS);
-    device::gray2hog<<<grid, block, 0, stream>>>(channels, (uchar*)dmem.ptr(), dmem.step);
+    device::gray2hog<<<grid, block, 0, stream>>>(channels, (uchar*)dmem.ptr(), dmem.step, magnitudeScaling);
     cudaSafeCall( cudaGetLastError() );
 
     const int shrWidth  = FRAME_WIDTH / shrinkage;
@@ -112,19 +229,20 @@ void icf::ChannelStorage::frame(const cv::gpu::PtrStepSz<uchar3>& rgb, cudaStrea
 
     // decimate kernel
     grid = dim3(shrWidth / 32, shrHeight / 8);
-    device::decimate<<<grid, block, 0, stream>>>((uchar*)dmem.ptr(), (uchar*)shrunk.ptr(), dmem.step, shrunk.step);
+    device::decimate<4><<<grid, block, 0, stream>>>((uchar*)dmem.ptr(), (uchar*)shrunk.ptr(), dmem.step, shrunk.step);
     cudaSafeCall( cudaGetLastError() );
 
     // integrate rows
     block = dim3(shrWidth, 1);
     grid = dim3(shrHeight * HOG_LUV_BINS, 1);
-    device::intRow<<<grid, block, 0, stream>>>((uchar*)shrunk.ptr(), (ushort*)hogluv.ptr(), shrunk.step, hogluv.step);
+    device::intRow<<<grid, block, 0, stream>>>((uchar*)shrunk.ptr(), (ushort*)hogluv.ptr(),
+        shrunk.step, hogluv.step / sizeof(ushort));
     cudaSafeCall( cudaGetLastError() );
 
     // integrate cols
     block = dim3(128, 1);
     grid = dim3(shrWidth * HOG_LUV_BINS, 1);
-    device::intCol<<<grid, block, 0, stream>>>((ushort*)hogluv.ptr(), hogluv.step);
+    device::intCol<<<grid, block, 0, stream>>>((ushort*)hogluv.ptr(), hogluv.step / hogluv.step / sizeof(ushort));
     cudaSafeCall( cudaGetLastError() );
 }
 
diff --git a/modules/gpu/src/icf.hpp b/modules/gpu/src/icf.hpp
index 8b075beba..69d21fdd9 100644
--- a/modules/gpu/src/icf.hpp
+++ b/modules/gpu/src/icf.hpp
@@ -105,6 +105,7 @@ struct ChannelStorage
     };
 
     int shrinkage;
+    static const float magnitudeScaling = 1.f ;// / sqrt(2);
 };
 
 struct __align__(16) Octave

From 1bf85996b37d4d8995041050f6e0d03f2beca4bc Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Mon, 24 Sep 2012 17:59:44 +0400
Subject: [PATCH 14/74] add detections vector initialization in tests

---
 modules/gpu/perf/perf_objdetect.cpp   | 2 +-
 modules/gpu/test/test_softcascade.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/gpu/perf/perf_objdetect.cpp b/modules/gpu/perf/perf_objdetect.cpp
index a3a6e9c6b..48a355d6a 100644
--- a/modules/gpu/perf/perf_objdetect.cpp
+++ b/modules/gpu/perf/perf_objdetect.cpp
@@ -104,7 +104,7 @@ PERF_TEST_P(SoftCascade, detect, Values<pair_string>(make_pair("cv/cascadeandhog
         cv::gpu::SoftCascade cascade;
         ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath(GetParam().first)));
 
-        cv::gpu::GpuMat rois, objectBoxes;
+        cv::gpu::GpuMat rois, objectBoxes(1, 1000, CV_8UC1);
         cascade.detectMultiScale(colored, rois, objectBoxes);
 
         TEST_CYCLE()
diff --git a/modules/gpu/test/test_softcascade.cpp b/modules/gpu/test/test_softcascade.cpp
index f94b0b726..c7e3a1f77 100644
--- a/modules/gpu/test/test_softcascade.cpp
+++ b/modules/gpu/test/test_softcascade.cpp
@@ -63,7 +63,7 @@ TEST(SoftCascade, detect)
     cv::Mat coloredCpu = cv::imread(cvtest::TS::ptr()->get_data_path()
         + "../cv/cascadeandhog/bahnhof/image_00000000_0.png");
     ASSERT_FALSE(coloredCpu.empty());
-    GpuMat colored(coloredCpu), objectBoxes, rois;
+    GpuMat colored(coloredCpu), objectBoxes(1, 1000, CV_8UC1), rois;
 
     // ASSERT_NO_THROW(
     // {

From ba50d193412677954cff9a4a53c4ebe31e2e9661 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Mon, 24 Sep 2012 18:00:47 +0400
Subject: [PATCH 15/74] first version of soft cascade on GPU

---
 modules/gpu/src/cuda/isf-sc.cu  | 219 ++++++++++++++++++++++++++------
 modules/gpu/src/icf.hpp         |  78 ++++++------
 modules/gpu/src/softcascade.cpp |  98 ++++++++++++--
 3 files changed, 311 insertions(+), 84 deletions(-)

diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu
index 5cde71070..37c6e3023 100644
--- a/modules/gpu/src/cuda/isf-sc.cu
+++ b/modules/gpu/src/cuda/isf-sc.cu
@@ -42,11 +42,48 @@
 
 #include <icf.hpp>
 #include <opencv2/gpu/device/saturate_cast.hpp>
+#include <stdio.h>
+#include <float.h>
 
-namespace cv { namespace gpu {
+namespace cv { namespace gpu { namespace device {
 
+namespace icf {
 
- namespace device {
+    enum {
+        HOG_BINS = 6,
+        HOG_LUV_BINS = 10,
+        WIDTH = 640,
+        HEIGHT = 480,
+        GREY_OFFSET = HEIGHT * HOG_LUV_BINS
+    };
+
+    __global__ void magToHist(const uchar* __restrict__ mag,
+                              const float* __restrict__ angle, const int angPitch,
+                                    uchar* __restrict__ hog,   const int hogPitch)
+    {
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+
+        const int bin = (int)(angle[y * angPitch + x]);
+        const uchar val = mag[y * angPitch + x];
+
+        hog[((HEIGHT * bin) + y) * hogPitch + x] = val;
+    }
+
+    void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle)
+    {
+        const uchar* mag = (const uchar*)hogluv.ptr(HEIGHT * HOG_BINS);
+        uchar* hog = (uchar*)hogluv.ptr();
+        const float* angle = (const float*)nangle.ptr();
+
+        dim3 block(32, 8);
+        dim3 grid(WIDTH / 32, HEIGHT / 8);
+
+        magToHist<<<grid, block>>>(mag, angle, nangle.step / sizeof(float), hog, hogluv.step);
+        cudaSafeCall( cudaGetLastError() );
+        cudaSafeCall( cudaDeviceSynchronize() );
+    }
+}
 
 enum {
     HOG_BINS = 6,
@@ -185,65 +222,175 @@ __global__ void intCol(ushort* __restrict__ sum, const int pitch)
 }
 
 
-__global__ void detect(const cv::gpu::icf::Cascade cascade, const uchar* __restrict__ hogluv, const int pitch)
+__global__ void detect(const cv::gpu::icf::Cascade cascade, const uchar* __restrict__ hogluv, const int pitch,
+    PtrStepSz<uchar4> objects)
 {
-    cascade.detectAt();
+    cascade.detectAt(hogluv, pitch, objects);
 }
 
 }
 
-void __device icf::Cascade::detectAt() const
+float __device icf::Cascade::rescale(const icf::Level& level, uchar4& scaledRect,
+                                     const int channel, const float threshold) const
 {
+    float relScale = level.relScale;
+    float farea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y);
 
+    // rescale
+    scaledRect.x = __float2int_rn(relScale * scaledRect.x);
+    scaledRect.y = __float2int_rn(relScale * scaledRect.y);
+    scaledRect.z = __float2int_rn(relScale * scaledRect.z);
+    scaledRect.w = __float2int_rn(relScale * scaledRect.w);
+
+    float sarea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y);
+
+    float approx = 1.f;
+    if (fabs(farea - 0.f) > FLT_EPSILON && fabs(farea - 0.f) > FLT_EPSILON)
+    {
+        const float expected_new_area = farea * relScale * relScale;
+        approx = expected_new_area / sarea;
+    }
+
+    // compensation areas rounding
+    float rootThreshold = threshold / approx;
+    rootThreshold *= level.scaling[(int)(channel > 6)];
+
+    return rootThreshold;
 }
 
-void icf::Cascade::detect(const cv::gpu::PtrStepSzb& hogluv, cudaStream_t stream) const
+typedef unsigned char uchar;
+float __device get(const uchar* __restrict__ hogluv, const int pitch,
+                   const int x, const int y, int channel, uchar4 area)
+{
+    const uchar* curr = hogluv + ((channel * 121) + y) * pitch;
+
+    int a = curr[area.y * pitch + x + area.x];
+    int b = curr[area.y * pitch + x + area.z];
+    int c = curr[area.w * pitch + x + area.z];
+    int d = curr[area.w * pitch + x + area.x];
+
+    return (a - b + c - d);
+}
+
+
+void __device icf::Cascade::detectAt(const uchar* __restrict__ hogluv, const int pitch,
+                                    PtrStepSz<uchar4>& objects) const
+{
+    const icf::Level* lls = (const icf::Level*)levels.ptr();
+    Level level = lls[0];
+
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (x >= level.workRect.x || y >= level.workRect.y) return;
+
+    const Octave octave = ((const Octave*)octaves.ptr())[level.octave];
+    const int stBegin = octave.index * octave.stages, stEnd = stBegin + octave.stages;
+
+    float detectionScore = 0.f;
+
+    int st = stBegin;
+    for(; st < stEnd; ++st)
+    {
+        const float stage = stages(0, st);
+        {
+            const int nId = st * 3;
+
+            // work with root node
+            const Node node = ((const Node*)nodes.ptr())[nId];
+            const Feature feature = ((const Feature*)features.ptr())[node.feature];
+
+            uchar4 scaledRect = feature.rect;
+            float threshold = rescale(level, scaledRect, feature.channel, node.threshold);
+
+            float sum = get(hogluv,pitch, x, y, feature.channel, scaledRect);
+
+            int next = 1 + (int)(sum >= threshold);
+
+            // leaves
+            const Node leaf = ((const Node*)nodes.ptr())[nId + next];
+            const Feature fLeaf = ((const Feature*)features.ptr())[leaf.feature];
+
+            scaledRect = fLeaf.rect;
+            threshold = rescale(level, scaledRect, feature.channel, node.threshold);
+            sum = get(hogluv, pitch, x, y, fLeaf.channel, scaledRect);
+
+            const int lShift = (next - 1) * 2 + (int)(sum >= threshold);
+            float impact = leaves(0, (st * 4) + lShift);
+
+            detectionScore += impact;
+        }
+
+        if (detectionScore <= stage) break;
+    }
+
+    // if (!threadIdx.x && !threadIdx.y)// printf("%f %d\n", detectionScore, st);
+    //     printf("x %d y %d: %d\n", x, y, st);
+
+    if (st == stEnd)
+    {
+        // printf("  got %d\n", st);
+        uchar4 a;
+        a.x = level.workRect.x;
+        a.y = level.workRect.y;
+        objects(0, threadIdx.x) = a;
+    }
+}
+
+void icf::Cascade::detect(const cv::gpu::PtrStepSzb& hogluv, PtrStepSz<uchar4> objects,
+                          cudaStream_t stream) const
 {
     // detection kernel
     dim3 block(32, 8, 1);
-    dim3 grid(32 * ChannelStorage::FRAME_WIDTH / 32, ChannelStorage::FRAME_HEIGHT / 8, 64);
-    device::detect<<<grid, block, 0, stream>>>(*this, hogluv, hogluv.step / sizeof(ushort));
+    // dim3 grid(32 * ChannelStorage::FRAME_WIDTH / 32, ChannelStorage::FRAME_HEIGHT / 8, 1);
+    dim3 grid(ChannelStorage::FRAME_WIDTH / 32, ChannelStorage::FRAME_HEIGHT / 8, 1);
+    device::detect<<<grid, block, 0, stream>>>(*this, hogluv, hogluv.step / sizeof(ushort), objects);
+    cudaSafeCall( cudaGetLastError() );
     if (!stream)
         cudaSafeCall( cudaDeviceSynchronize() );
 
 }
 
+////////////////////////////////////////////////////
+
+
+
 void icf::ChannelStorage::frame(const cv::gpu::PtrStepSz<uchar3>& rgb, cudaStream_t stream)
 {
-    // color convertin kernel
-    dim3 block(32, 8);
-    dim3 grid(FRAME_WIDTH / 32, FRAME_HEIGHT / 8);
+//     // // color convertin kernel
+//     // dim3 block(32, 8);
+//     // dim3 grid(FRAME_WIDTH / 32, FRAME_HEIGHT / 8);
 
-    uchar * channels = (uchar*)dmem.ptr(FRAME_HEIGHT * HOG_BINS);
-    device::rgb2grayluv<<<grid, block, 0, stream>>>((uchar3*)rgb.ptr(), channels,
-                                                    rgb.step / sizeof(uchar3), dmem.step);
-    cudaSafeCall( cudaGetLastError());
+//     // uchar * channels = (uchar*)dmem.ptr(FRAME_HEIGHT * HOG_BINS);
+//     // device::rgb2grayluv<<<grid, block, 0, stream>>>((uchar3*)rgb.ptr(), channels,
+//     //                                                 rgb.step / sizeof(uchar3), dmem.step);
+//     // cudaSafeCall( cudaGetLastError());
 
-    // hog calculation kernel
-    channels = (uchar*)dmem.ptr(FRAME_HEIGHT * HOG_LUV_BINS);
-    device::gray2hog<<<grid, block, 0, stream>>>(channels, (uchar*)dmem.ptr(), dmem.step, magnitudeScaling);
-    cudaSafeCall( cudaGetLastError() );
+//     // // hog calculation kernel
+//     // channels = (uchar*)dmem.ptr(FRAME_HEIGHT * HOG_LUV_BINS);
+//     // device::gray2hog<<<grid, block, 0, stream>>>(channels, (uchar*)dmem.ptr(), dmem.step, magnitudeScaling);
+//     // cudaSafeCall( cudaGetLastError() );
 
-    const int shrWidth  = FRAME_WIDTH / shrinkage;
-    const int shrHeight = FRAME_HEIGHT / shrinkage;
+//     // const int shrWidth  = FRAME_WIDTH / shrinkage;
+//     // const int shrHeight = FRAME_HEIGHT / shrinkage;
 
-    // decimate kernel
-    grid = dim3(shrWidth / 32, shrHeight / 8);
-    device::decimate<4><<<grid, block, 0, stream>>>((uchar*)dmem.ptr(), (uchar*)shrunk.ptr(), dmem.step, shrunk.step);
-    cudaSafeCall( cudaGetLastError() );
+//     // // decimate kernel
+//     // grid = dim3(shrWidth / 32, shrHeight / 8);
+//     // device::decimate<4><<<grid, block, 0, stream>>>((uchar*)dmem.ptr(), (uchar*)shrunk.ptr(), dmem.step, shrunk.step);
+//     // cudaSafeCall( cudaGetLastError() );
 
-    // integrate rows
-    block = dim3(shrWidth, 1);
-    grid = dim3(shrHeight * HOG_LUV_BINS, 1);
-    device::intRow<<<grid, block, 0, stream>>>((uchar*)shrunk.ptr(), (ushort*)hogluv.ptr(),
-        shrunk.step, hogluv.step / sizeof(ushort));
-    cudaSafeCall( cudaGetLastError() );
+//     // // integrate rows
+//     // block = dim3(shrWidth, 1);
+//     // grid = dim3(shrHeight * HOG_LUV_BINS, 1);
+//     // device::intRow<<<grid, block, 0, stream>>>((uchar*)shrunk.ptr(), (ushort*)hogluv.ptr(),
+//     //     shrunk.step, hogluv.step / sizeof(ushort));
+//     // cudaSafeCall( cudaGetLastError() );
 
-    // integrate cols
-    block = dim3(128, 1);
-    grid = dim3(shrWidth * HOG_LUV_BINS, 1);
-    device::intCol<<<grid, block, 0, stream>>>((ushort*)hogluv.ptr(), hogluv.step / hogluv.step / sizeof(ushort));
-    cudaSafeCall( cudaGetLastError() );
+//     // // integrate cols
+//     // block = dim3(128, 1);
+//     // grid = dim3(shrWidth * HOG_LUV_BINS, 1);
+//     // device::intCol<<<grid, block, 0, stream>>>((ushort*)hogluv.ptr(), hogluv.step / hogluv.step / sizeof(ushort));
+//     // cudaSafeCall( cudaGetLastError() );
 }
 
 }}
\ No newline at end of file
diff --git a/modules/gpu/src/icf.hpp b/modules/gpu/src/icf.hpp
index 69d21fdd9..454dad881 100644
--- a/modules/gpu/src/icf.hpp
+++ b/modules/gpu/src/icf.hpp
@@ -59,6 +59,42 @@ using cv::gpu::PtrStepSzf;
 
 typedef unsigned char uchar;
 
+struct __align__(16) Octave
+{
+    ushort index;
+    ushort stages;
+    ushort shrinkage;
+    ushort2 size;
+    float scale;
+
+    Octave(const ushort i, const ushort s, const ushort sh, const ushort2 sz, const float sc)
+    : index(i), stages(s), shrinkage(sh), size(sz), scale(sc) {}
+};
+
+struct __align__(8) Level //is actually 24 bytes
+{
+    int octave;
+
+    // float origScale; //not actually used
+    float relScale;
+    float shrScale;   // used for marking detection
+    float scaling[2]; // calculated according to Dollal paper
+
+    // for 640x480 we can not get overflow
+    uchar2 workRect;
+    uchar2 objSize;
+
+    Level(int idx, const Octave& oct, const float scale, const int w, const int h)
+    :  octave(idx), relScale(scale / oct.scale), shrScale (relScale / (float)oct.shrinkage)
+    {
+        workRect.x = round(w / (float)oct.shrinkage);
+        workRect.y = round(h / (float)oct.shrinkage);
+
+        objSize.x  = round(oct.size.x * relScale);
+        objSize.y  = round(oct.size.y * relScale);
+    }
+};
+
 struct Cascade
 {
     Cascade() {}
@@ -66,8 +102,10 @@ struct Cascade
         const cv::gpu::PtrStepSzf& lvs, const cv::gpu::PtrStepSzb& fts, const cv::gpu::PtrStepSzb& lls)
     : octaves(octs), stages(sts), nodes(nds), leaves(lvs), features(fts), levels(lls) {}
 
-    void detect(const cv::gpu::PtrStepSzb& hogluv, cudaStream_t stream) const;
-    void __device detectAt() const;
+    void detect(const cv::gpu::PtrStepSzb& hogluv, cv::gpu::PtrStepSz<uchar4> objects, cudaStream_t stream) const;
+    void __device detectAt(const uchar* __restrict__ hogluv, const int pitch, PtrStepSz<uchar4>& objects) const;
+    float __device rescale(const icf::Level& level, uchar4& scaledRect,
+                           const int channel, const float threshold) const;
 
     PtrStepSzb octaves;
     PtrStepSzf stages;
@@ -108,18 +146,6 @@ struct ChannelStorage
     static const float magnitudeScaling = 1.f ;// / sqrt(2);
 };
 
-struct __align__(16) Octave
-{
-    ushort index;
-    ushort stages;
-    ushort shrinkage;
-    ushort2 size;
-    float scale;
-
-    Octave(const ushort i, const ushort s, const ushort sh, const ushort2 sz, const float sc)
-    : index(i), stages(s), shrinkage(sh), size(sz), scale(sc) {}
-};
-
 struct __align__(8) Node
 {
     int feature;
@@ -135,30 +161,6 @@ struct __align__(8) Feature
 
     Feature(const int c, const uchar4 r) : channel(c), rect(r) {}
 };
-
-struct __align__(8) Level //is actually 24 bytes
-{
-    int octave;
-
-    // float origScale; //not actually used
-    float relScale;
-    float shrScale;   // used for marking detection
-    float scaling[2]; // calculated according to Dollal paper
-
-    // for 640x480 we can not get overflow
-    uchar2 workRect;
-    uchar2 objSize;
-
-    Level(int idx, const Octave& oct, const float scale, const int w, const int h)
-    :  octave(idx), relScale(scale / oct.scale), shrScale (relScale / (float)oct.shrinkage)
-    {
-        workRect.x = round(w / (float)oct.shrinkage);
-        workRect.y = round(h / (float)oct.shrinkage);
-
-        objSize.x  = round(oct.size.x * relScale);
-        objSize.y  = round(oct.size.y * relScale);
-    }
-};
 }}}
 
 #endif
\ No newline at end of file
diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index b2419c12c..abcae73dc 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -41,6 +41,7 @@
 //M*/
 
 #include <precomp.hpp>
+#include "opencv2/highgui/highgui.hpp"
 
 #if !defined (HAVE_CUDA)
 
@@ -58,6 +59,12 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat&, const GpuMat&, GpuMat
 
 #include <icf.hpp>
 
+namespace cv { namespace gpu { namespace device {
+namespace icf {
+    void fillBins(cv::gpu::PtrStepSzb hogluv,const cv::gpu::PtrStepSzf& nangle);
+}
+}}}
+
 struct cv::gpu::SoftCascade::Filds
 {
     // scales range
@@ -81,6 +88,16 @@ struct cv::gpu::SoftCascade::Filds
     // 161x121x10
     GpuMat hogluv;
 
+    // will be removed in final version
+    // temporial mat for cvtColor
+    GpuMat luv;
+
+    // temporial mat for integrall
+    GpuMat integralBuffer;
+
+    // temp matrix for sobel and cartToPolar
+    GpuMat dfdx, dfdy, angle, mag, nmag, nangle;
+
     std::vector<float> scales;
 
     icf::Cascade cascade;
@@ -100,9 +117,9 @@ struct cv::gpu::SoftCascade::Filds
     };
 
     bool fill(const FileNode &root, const float mins, const float maxs);
-    void detect(cudaStream_t stream) const
+    void detect(cv::gpu::GpuMat objects, cudaStream_t stream) const
     {
-        cascade.detect(hogluv, stream);
+        cascade.detect(hogluv, objects, stream);
     }
 
 private:
@@ -284,7 +301,18 @@ inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float
     // allocate buffers
     dmem.create(FRAME_HEIGHT * (HOG_LUV_BINS + 1), FRAME_WIDTH, CV_8UC1);
     shrunk.create(FRAME_HEIGHT / shrinkage * HOG_LUV_BINS, FRAME_WIDTH / shrinkage, CV_8UC1);
-    hogluv.create( (FRAME_HEIGHT / shrinkage * HOG_LUV_BINS) + 1, (FRAME_WIDTH / shrinkage) + 1, CV_16UC1);
+    // hogluv.create( (FRAME_HEIGHT / shrinkage + 1) * HOG_LUV_BINS, (FRAME_WIDTH / shrinkage + 1), CV_16UC1);
+    hogluv.create( (FRAME_HEIGHT / shrinkage + 1) * HOG_LUV_BINS, (FRAME_WIDTH / shrinkage + 1), CV_32SC1);
+    luv.create(FRAME_HEIGHT, FRAME_WIDTH, CV_8UC3);
+    integralBuffer.create(shrunk.rows + 1 * HOG_LUV_BINS, shrunk.cols + 1, CV_32SC1);
+
+    dfdx.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1);
+    dfdy.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1);
+    angle.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1);
+    mag.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1);
+
+    nmag.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1);
+    nangle.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1);
 
     storage = icf::ChannelStorage(dmem, shrunk, hogluv, shrinkage);
     return true;
@@ -393,21 +421,71 @@ bool cv::gpu::SoftCascade::load( const string& filename, const float minScale, c
     return true;
 }
 
-void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& image, const GpuMat& /*rois*/,
-                                GpuMat& /*objects*/, const int /*rejectfactor*/, Stream s)
+void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat& /*rois*/,
+                                GpuMat& objects, const int /*rejectfactor*/, Stream s)
 {
     // only color images are supperted
-    CV_Assert(image.type() == CV_8UC3);
+    CV_Assert(colored.type() == CV_8UC3);
 
-    // only this window size allowed
-    CV_Assert(image.cols == 640 && image.rows == 480);
+    // // only this window size allowed
+    CV_Assert(colored.cols == 640 && colored.rows == 480);
 
     Filds& flds = *filds;
+    GpuMat& dmem = flds.dmem;
+    cudaMemset(dmem.data, 0, dmem.step * dmem.rows);
+    GpuMat& shrunk = flds.shrunk;
+    int w = shrunk.cols;
+    int h = colored.rows / flds.storage.shrinkage;
 
     cudaStream_t stream = StreamAccessor::getStream(s);
 
-    flds.storage.frame(image, stream);
-    flds.detect(stream);
+    std::vector<GpuMat> splited;
+    for(int i = 0; i < 3; ++i)
+    {
+        splited.push_back(GpuMat(dmem, cv::Rect(0, colored.rows * (7 + i), colored.cols, colored.rows)));
+    }
+
+    GpuMat gray(dmem, cv::Rect(0, colored.rows * 10, colored.cols, colored.rows) );
+
+    cv::gpu::cvtColor(colored, gray, CV_RGB2GRAY);
+
+    //create hog
+    cv::gpu::Sobel(gray, flds.dfdx, CV_32F, 1, 0, 3, 0.25);
+    cv::gpu::Sobel(gray, flds.dfdy, CV_32F, 0, 1, 3, 0.25);
+
+    cv::gpu::cartToPolar(flds.dfdx, flds.dfdy, flds.mag, flds.angle, true);
+
+    cv::gpu::multiply(flds.mag, cv::Scalar::all(1.0 / ::log(2)), flds.nmag);
+    cv::gpu::multiply(flds.angle, cv::Scalar::all(1.0 / 60.0), flds.nangle);
+
+    GpuMat magCannel(dmem, cv::Rect(0, colored.rows * 6, colored.cols, colored.rows));
+    flds.nmag.convertTo(magCannel, CV_8UC1);
+    device::icf::fillBins(dmem, flds.nangle);
+
+    // create luv
+    cv::gpu::cvtColor(colored, flds.luv, CV_BGR2Luv);
+    cv::gpu::split(flds.luv, splited);
+
+    GpuMat plane(dmem, cv::Rect(0, 0, colored.cols, colored.rows * Filds::HOG_LUV_BINS));
+    cv::gpu::resize(plane, flds.shrunk, cv::Size(), 0.25, 0.25, CV_INTER_AREA);
+    // cv::Mat cpu(plane);
+    // cv::imshow("channels", cpu);
+    // cv::waitKey(0);
+
+    // fer debug purpose
+    // cudaMemset(flds.hogluv.data, 0, flds.hogluv.step * flds.hogluv.rows);
+
+    for(int i = 0; i < Filds::HOG_LUV_BINS; ++i)
+    {
+        GpuMat channel(shrunk, cv::Rect(0, h  * i, w, h ));
+        GpuMat sum(flds.hogluv, cv::Rect(0, (h + 1) * i, w + 1, h + 1));
+        cv::gpu::integralBuffered(channel, sum, flds.integralBuffer);
+    }
+
+    // detection
+    flds.detect(objects, stream);
+
+    // flds.storage.frame(colored, stream);
 }
 
 #endif
\ No newline at end of file

From e606a0d6519eadffba18e6060ace334f408c4411 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Mon, 24 Sep 2012 18:05:08 +0400
Subject: [PATCH 16/74] remove dead code

---
 modules/gpu/src/cuda/isf-sc.cu | 179 ---------------------------------
 modules/gpu/src/icf.hpp        |   2 +-
 2 files changed, 1 insertion(+), 180 deletions(-)

diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu
index 37c6e3023..b5eb5ad17 100644
--- a/modules/gpu/src/cuda/isf-sc.cu
+++ b/modules/gpu/src/cuda/isf-sc.cu
@@ -85,143 +85,6 @@ namespace icf {
     }
 }
 
-enum {
-    HOG_BINS = 6,
-    HOG_LUV_BINS = 10,
-    WIDTH = 640,
-    HEIGHT = 480,
-    GREY_OFFSET = HEIGHT * HOG_LUV_BINS
-};
-
-/* Returns the nearest upper power of two, works only for
-the typical GPU thread count (pert block) values */
-int power_2up(unsigned int n)
-{
-    if (n < 1) return 1;
-    else if (n < 2) return 2;
-    else if (n < 4) return 4;
-    else if (n < 8) return 8;
-    else if (n < 16) return 16;
-    else if (n < 32) return 32;
-    else if (n < 64) return 64;
-    else if (n < 128) return 128;
-    else if (n < 256) return 256;
-    else if (n < 512) return 512;
-    else if (n < 1024) return 1024;
-    return -1; // Input is too big
-}
-
-
-__device__ __forceinline__ uchar grey(const uchar3 rgb)
-{
-    return saturate_cast<uchar>(rgb.x * 0.114f + rgb.y * 0.587f + rgb.z * 0.299f);
-}
-
-__device__ __forceinline__ void luv(const uchar3 rgb, uchar& l, uchar& u, uchar& v)
-{
-
-}
-
-__global__ void rgb2grayluv(const uchar3* __restrict__ rgb, uchar* __restrict__ hog,
-                            const int rgbPitch, const int hogPitch)
-{
-    const int y = blockIdx.y * blockDim.y + threadIdx.y;
-    const int x = blockIdx.x * blockDim.x + threadIdx.x;
-
-    const uchar3 color = rgb[rgbPitch * y + x];
-
-    uchar l, u, v;
-    luv(color, l, u, v);
-
-    hog[hogPitch *  y + x] = l;
-    hog[hogPitch * (y + HEIGHT) + x] = u;
-    hog[hogPitch * (y + 2 * HEIGHT) + x] = v;
-    hog[hogPitch * (y + 3 * HEIGHT) + x] = grey(color);
-}
-
-__device__ __forceinline__
-int qangle(const float &y, const float &x)
-{
-    int bin = 0;
-//     const float2 &bin_vector_zero = const_angle_bins_vectors[0];
-//     float max_dot_product = fabs(x*bin_vector_zero.x + y*bin_vector_zero.y);
-
-//     // let us hope this gets unrolled
-// #pragma unroll
-//     for(int i=1; i < num_angles_bin; i+=1)
-//     {
-//         const float2 &bin_vector_i = const_angle_bins_vectors[i];
-//         //const float2 bin_vector_i = const_angle_bins_vectors[i];
-//         //const float2 &bin_vector_i = angle_bins_vectors[i];
-//         const float dot_product = fabs(x*bin_vector_i.x + y*bin_vector_i.y);
-//         if(dot_product > max_dot_product)
-//         {
-//             max_dot_product = dot_product;
-//             index = i;
-//         }
-//     }
-
-    return bin;
-}
-
-// texture<uchar, 2, cudaReadModeElementType> tgray;
-__global__ void gray2hog(const uchar* __restrict__ gray, uchar* __restrict__ hog, const int pitch, const float norm)
-{
-    const int y = blockIdx.y * blockDim.y + threadIdx.y;
-    const int x = blockIdx.x * blockDim.x + threadIdx.x;
-
-    // derivative
-    float dx = gray[y * pitch + x + 1];
-    dx -= gray[y * pitch + x - 1];
-
-    float dy = gray[(y + 1) * pitch + x];
-    dy -= gray[(y -1) * pitch + x - 1];
-
-    // mag and angle
-    const uchar mag =  saturate_cast<uchar>(sqrtf(dy * dy + dx * dx) * norm);
-    const int bin = qangle(dx, dy);
-
-}
-
-template <int FACTOR>
-__device__ __forceinline__ uchar shrink(const uchar* ptr, const int pitch, const int y, const int x)
-{
-    int out = 0;
-#pragma unroll
-    for(int dy = 0; dy < FACTOR; ++dy)
-#pragma unroll
-        for(int dx = 0; dx < FACTOR; ++dx)
-        {
-            out += ptr[dy * pitch + dx];
-        }
-
-    return saturate_cast<uchar>(out / FACTOR);
-}
-
-template<int FACTOR>
-__global__ void decimate(const uchar* __restrict__ hogluv, uchar* __restrict__ shrank,
-                        const int inPitch, const int outPitch )
-{
-    const int y = blockIdx.y * blockDim.y + threadIdx.y;
-    const int x = blockIdx.x * blockDim.x + threadIdx.x;
-
-    const uchar* ptr = hogluv + (FACTOR * y) * inPitch + (FACTOR * x);
-
-    shrank[ y * outPitch + x]= shrink<FACTOR>(ptr, inPitch, y, x);
-}
-
-__global__ void intRow(const uchar* __restrict__ hogluv, ushort* __restrict__ sum,
-                       const int inPitch, const int outPitch)
-{
-
-}
-
-__global__ void intCol(ushort* __restrict__ sum, const int pitch)
-{
-
-}
-
-
 __global__ void detect(const cv::gpu::icf::Cascade cascade, const uchar* __restrict__ hogluv, const int pitch,
     PtrStepSz<uchar4> objects)
 {
@@ -351,46 +214,4 @@ void icf::Cascade::detect(const cv::gpu::PtrStepSzb& hogluv, PtrStepSz<uchar4> o
 
 }
 
-////////////////////////////////////////////////////
-
-
-
-void icf::ChannelStorage::frame(const cv::gpu::PtrStepSz<uchar3>& rgb, cudaStream_t stream)
-{
-//     // // color convertin kernel
-//     // dim3 block(32, 8);
-//     // dim3 grid(FRAME_WIDTH / 32, FRAME_HEIGHT / 8);
-
-//     // uchar * channels = (uchar*)dmem.ptr(FRAME_HEIGHT * HOG_BINS);
-//     // device::rgb2grayluv<<<grid, block, 0, stream>>>((uchar3*)rgb.ptr(), channels,
-//     //                                                 rgb.step / sizeof(uchar3), dmem.step);
-//     // cudaSafeCall( cudaGetLastError());
-
-//     // // hog calculation kernel
-//     // channels = (uchar*)dmem.ptr(FRAME_HEIGHT * HOG_LUV_BINS);
-//     // device::gray2hog<<<grid, block, 0, stream>>>(channels, (uchar*)dmem.ptr(), dmem.step, magnitudeScaling);
-//     // cudaSafeCall( cudaGetLastError() );
-
-//     // const int shrWidth  = FRAME_WIDTH / shrinkage;
-//     // const int shrHeight = FRAME_HEIGHT / shrinkage;
-
-//     // // decimate kernel
-//     // grid = dim3(shrWidth / 32, shrHeight / 8);
-//     // device::decimate<4><<<grid, block, 0, stream>>>((uchar*)dmem.ptr(), (uchar*)shrunk.ptr(), dmem.step, shrunk.step);
-//     // cudaSafeCall( cudaGetLastError() );
-
-//     // // integrate rows
-//     // block = dim3(shrWidth, 1);
-//     // grid = dim3(shrHeight * HOG_LUV_BINS, 1);
-//     // device::intRow<<<grid, block, 0, stream>>>((uchar*)shrunk.ptr(), (ushort*)hogluv.ptr(),
-//     //     shrunk.step, hogluv.step / sizeof(ushort));
-//     // cudaSafeCall( cudaGetLastError() );
-
-//     // // integrate cols
-//     // block = dim3(128, 1);
-//     // grid = dim3(shrWidth * HOG_LUV_BINS, 1);
-//     // device::intCol<<<grid, block, 0, stream>>>((ushort*)hogluv.ptr(), hogluv.step / hogluv.step / sizeof(ushort));
-//     // cudaSafeCall( cudaGetLastError() );
-}
-
 }}
\ No newline at end of file
diff --git a/modules/gpu/src/icf.hpp b/modules/gpu/src/icf.hpp
index 454dad881..a8ce8d483 100644
--- a/modules/gpu/src/icf.hpp
+++ b/modules/gpu/src/icf.hpp
@@ -124,7 +124,7 @@ struct ChannelStorage
         const cv::gpu::PtrStepSzb& itg, const int s)
     : dmem (buff), shrunk(shr), hogluv(itg), shrinkage(s) {}
 
-    void frame(const cv::gpu::PtrStepSz<uchar3>& rgb, cudaStream_t stream);
+    void frame(const cv::gpu::PtrStepSz<uchar3>& rgb, cudaStream_t stream){}
 
     PtrStepSzb dmem;
     PtrStepSzb shrunk;

From dca27b4622c711f52797c126c0b2aba72f421497 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Tue, 25 Sep 2012 11:32:03 +0400
Subject: [PATCH 17/74] fix cast bug; add logging

---
 modules/gpu/src/cuda/isf-sc.cu  | 87 +++++++++++++++++++++++++++------
 modules/gpu/src/icf.hpp         |  4 +-
 modules/gpu/src/softcascade.cpp | 34 +++++++++++--
 3 files changed, 103 insertions(+), 22 deletions(-)

diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu
index b5eb5ad17..6572c54fc 100644
--- a/modules/gpu/src/cuda/isf-sc.cu
+++ b/modules/gpu/src/cuda/isf-sc.cu
@@ -45,6 +45,8 @@
 #include <stdio.h>
 #include <float.h>
 
+//#define LOG_CUDA_CASCADE
+
 namespace cv { namespace gpu { namespace device {
 
 namespace icf {
@@ -85,7 +87,7 @@ namespace icf {
     }
 }
 
-__global__ void detect(const cv::gpu::icf::Cascade cascade, const uchar* __restrict__ hogluv, const int pitch,
+__global__ void detect(const cv::gpu::icf::Cascade cascade, const int* __restrict__ hogluv, const int pitch,
     PtrStepSz<uchar4> objects)
 {
     cascade.detectAt(hogluv, pitch, objects);
@@ -96,6 +98,11 @@ __global__ void detect(const cv::gpu::icf::Cascade cascade, const uchar* __restr
 float __device icf::Cascade::rescale(const icf::Level& level, uchar4& scaledRect,
                                      const int channel, const float threshold) const
 {
+#if defined LOG_CUDA_CASCADE
+    printf("feature %d box %d %d %d %d\n", channel, scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w);
+    printf("rescale: %f [%f %f]\n",level.relScale, level.scaling[0], level.scaling[1]);
+#endif
+
     float relScale = level.relScale;
     float farea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y);
 
@@ -107,6 +114,7 @@ float __device icf::Cascade::rescale(const icf::Level& level, uchar4& scaledRect
 
     float sarea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y);
 
+
     float approx = 1.f;
     if (fabs(farea - 0.f) > FLT_EPSILON && fabs(farea - 0.f) > FLT_EPSILON)
     {
@@ -114,40 +122,72 @@ float __device icf::Cascade::rescale(const icf::Level& level, uchar4& scaledRect
         approx = expected_new_area / sarea;
     }
 
+#if defined LOG_CUDA_CASCADE
+    printf("new rect: %d box %d %d %d %d  rel areas %f %f\n", channel,
+        scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w, farea * relScale * relScale, sarea);
+#endif
+
     // compensation areas rounding
     float rootThreshold = threshold / approx;
+    // printf("    approx %f\n", rootThreshold);
     rootThreshold *= level.scaling[(int)(channel > 6)];
 
+#if defined LOG_CUDA_CASCADE
+    printf("approximation %f %f -> %f %f\n", approx, threshold, rootThreshold, level.scaling[(int)(channel > 6)]);
+#endif
+
     return rootThreshold;
 }
 
 typedef unsigned char uchar;
-float __device get(const uchar* __restrict__ hogluv, const int pitch,
+float __device get(const int* __restrict__ hogluv, const int pitch,
                    const int x, const int y, int channel, uchar4 area)
 {
-    const uchar* curr = hogluv + ((channel * 121) + y) * pitch;
+#if defined LOG_CUDA_CASCADE
+    printf("feature box %d %d %d %d ", area.x, area.y, area.z, area.w);
+    printf("get for channel %d\n", channel);
+    printf("extract feature for: [%d %d] [%d %d] [%d %d] [%d %d]\n",
+        x + area.x, y + area.y,  x + area.z, y + area.y,  x + area.z,y + area.w,
+        x + area.x, y + area.w);
+    printf("at point %d %d with offset %d\n", x, y, 0);
+#endif
+
+    const int* curr = hogluv + ((channel * 121) + y) * pitch;
 
     int a = curr[area.y * pitch + x + area.x];
     int b = curr[area.y * pitch + x + area.z];
     int c = curr[area.w * pitch + x + area.z];
     int d = curr[area.w * pitch + x + area.x];
 
+#if defined LOG_CUDA_CASCADE
+    printf("    retruved integral values: %d %d %d %d\n", a, b, c, d);
+#endif
+
     return (a - b + c - d);
 }
 
 
-void __device icf::Cascade::detectAt(const uchar* __restrict__ hogluv, const int pitch,
+void __device icf::Cascade::detectAt(const int* __restrict__ hogluv, const int pitch,
                                     PtrStepSz<uchar4>& objects) const
 {
     const icf::Level* lls = (const icf::Level*)levels.ptr();
-    Level level = lls[0];
 
     const int y = blockIdx.y * blockDim.y + threadIdx.y;
     const int x = blockIdx.x * blockDim.x + threadIdx.x;
+    // if (x > 0 || y > 0) return;
 
+    Level level = lls[0];
     if (x >= level.workRect.x || y >= level.workRect.y) return;
 
+#if defined LOG_CUDA_CASCADE
+    printf("level: %d (%f %f) [%f %f] (%d %d) (%d %d)\n", level.octave, level.relScale, level.shrScale,
+        level.scaling[0], level.scaling[1], level.workRect.x, level.workRect.y, level.objSize.x, level.objSize.y);
+#endif
+
     const Octave octave = ((const Octave*)octaves.ptr())[level.octave];
+    // printf("Octave: %d %d %d (%d %d) %f\n", octave.index, octave.stages,
+    //     octave.shrinkage, octave.size.x, octave.size.y, octave.scale);
+
     const int stBegin = octave.index * octave.stages, stEnd = stBegin + octave.stages;
 
     float detectionScore = 0.f;
@@ -156,11 +196,17 @@ void __device icf::Cascade::detectAt(const uchar* __restrict__ hogluv, const int
     for(; st < stEnd; ++st)
     {
         const float stage = stages(0, st);
+#if defined LOG_CUDA_CASCADE
+        printf("Stage: %f\n", stage);
+#endif
         {
             const int nId = st * 3;
 
             // work with root node
             const Node node = ((const Node*)nodes.ptr())[nId];
+#if defined LOG_CUDA_CASCADE
+            printf("Node: %d %f\n", node.feature, node.threshold);
+#endif
             const Feature feature = ((const Feature*)features.ptr())[node.feature];
 
             uchar4 scaledRect = feature.rect;
@@ -168,31 +214,46 @@ void __device icf::Cascade::detectAt(const uchar* __restrict__ hogluv, const int
 
             float sum = get(hogluv,pitch, x, y, feature.channel, scaledRect);
 
+#if defined LOG_CUDA_CASCADE
+            printf("root feature %d %f\n",feature.channel, sum);
+#endif
             int next = 1 + (int)(sum >= threshold);
 
+#if defined LOG_CUDA_CASCADE
+            printf("go: %d (%f >= %f)\n\n" ,next, sum, threshold);
+#endif
             // leaves
             const Node leaf = ((const Node*)nodes.ptr())[nId + next];
             const Feature fLeaf = ((const Feature*)features.ptr())[leaf.feature];
 
             scaledRect = fLeaf.rect;
-            threshold = rescale(level, scaledRect, feature.channel, node.threshold);
+            threshold = rescale(level, scaledRect, fLeaf.channel, leaf.threshold);
             sum = get(hogluv, pitch, x, y, fLeaf.channel, scaledRect);
 
             const int lShift = (next - 1) * 2 + (int)(sum >= threshold);
             float impact = leaves(0, (st * 4) + lShift);
 
             detectionScore += impact;
+
+#if defined LOG_CUDA_CASCADE
+            printf("decided: %d (%f >= %f) %d %f\n\n" ,next, sum, threshold, lShift, impact);
+            printf("extracted stage:\n");
+            printf("ct %f\n", stage);
+            printf("computed score %f\n\n", detectionScore);
+            printf("\n\n");
+#endif
+
         }
 
         if (detectionScore <= stage) break;
     }
 
-    // if (!threadIdx.x && !threadIdx.y)// printf("%f %d\n", detectionScore, st);
-    //     printf("x %d y %d: %d\n", x, y, st);
+#if defined LOG_CUDA_CASCADE
+    // printf("x %d y %d: %d\n", x, y, st - stBegin);
+#endif
 
     if (st == stEnd)
     {
-        // printf("  got %d\n", st);
         uchar4 a;
         a.x = level.workRect.x;
         a.y = level.workRect.y;
@@ -200,18 +261,14 @@ void __device icf::Cascade::detectAt(const uchar* __restrict__ hogluv, const int
     }
 }
 
-void icf::Cascade::detect(const cv::gpu::PtrStepSzb& hogluv, PtrStepSz<uchar4> objects,
-                          cudaStream_t stream) const
+void icf::Cascade::detect(const cv::gpu::PtrStepSzi& hogluv, PtrStepSz<uchar4> objects, cudaStream_t stream) const
 {
-    // detection kernel
     dim3 block(32, 8, 1);
-    // dim3 grid(32 * ChannelStorage::FRAME_WIDTH / 32, ChannelStorage::FRAME_HEIGHT / 8, 1);
     dim3 grid(ChannelStorage::FRAME_WIDTH / 32, ChannelStorage::FRAME_HEIGHT / 8, 1);
-    device::detect<<<grid, block, 0, stream>>>(*this, hogluv, hogluv.step / sizeof(ushort), objects);
+    device::detect<<<grid, block, 0, stream>>>(*this, hogluv, hogluv.step / sizeof(int), objects);
     cudaSafeCall( cudaGetLastError() );
     if (!stream)
         cudaSafeCall( cudaDeviceSynchronize() );
-
 }
 
 }}
\ No newline at end of file
diff --git a/modules/gpu/src/icf.hpp b/modules/gpu/src/icf.hpp
index a8ce8d483..7d4b65980 100644
--- a/modules/gpu/src/icf.hpp
+++ b/modules/gpu/src/icf.hpp
@@ -102,8 +102,8 @@ struct Cascade
         const cv::gpu::PtrStepSzf& lvs, const cv::gpu::PtrStepSzb& fts, const cv::gpu::PtrStepSzb& lls)
     : octaves(octs), stages(sts), nodes(nds), leaves(lvs), features(fts), levels(lls) {}
 
-    void detect(const cv::gpu::PtrStepSzb& hogluv, cv::gpu::PtrStepSz<uchar4> objects, cudaStream_t stream) const;
-    void __device detectAt(const uchar* __restrict__ hogluv, const int pitch, PtrStepSz<uchar4>& objects) const;
+    void detect(const cv::gpu::PtrStepSzi& hogluv, cv::gpu::PtrStepSz<uchar4> objects, cudaStream_t stream) const;
+    void __device detectAt(const int* __restrict__ hogluv, const int pitch, PtrStepSz<uchar4>& objects) const;
     float __device rescale(const icf::Level& level, uchar4& scaledRect,
                            const int channel, const float threshold) const;
 
diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index abcae73dc..7e1a5abb9 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -381,6 +381,9 @@ inline void cv::gpu::SoftCascade::Filds::calcLevels(const std::vector<icf::Octav
         if (::fabs(scale - maxScale) < FLT_EPSILON) break;
         scale = ::std::min(maxScale, ::expf(::log(scale) + logFactor));
 
+        // printf("level: %d (%f %f) [%f %f] (%d %d) (%d %d)\n", level.octave, level.relScale, level.shrScale,
+        //     level.scaling[0], level.scaling[1], level.workRect.x, level.workRect.y, level.objSize.x, level.objSize.y);
+
         // std::cout << "level " << sc
         //           << " octeve "
         //           << vlevels[sc].octave
@@ -421,6 +424,15 @@ bool cv::gpu::SoftCascade::load( const string& filename, const float minScale, c
     return true;
 }
 
+namespace {
+    char *itoa(long i, char* s, int /*dummy_radix*/)
+    {
+        sprintf(s, "%ld", i);
+        return s;
+    }
+}
+
+#define USE_REFERENCE_VALUES
 void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat& /*rois*/,
                                 GpuMat& objects, const int /*rejectfactor*/, Stream s)
 {
@@ -431,14 +443,26 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat&
     CV_Assert(colored.cols == 640 && colored.rows == 480);
 
     Filds& flds = *filds;
+
+#if defined USE_REFERENCE_VALUES
+    cudaMemset(flds.hogluv.data, 0, flds.hogluv.step * flds.hogluv.rows);
+    cv::FileStorage imgs("/home/kellan/testInts.xml", cv::FileStorage::READ);
+    char buff[33];
+
+    for(int i = 0; i < Filds::HOG_LUV_BINS; ++i)
+    {
+        cv::Mat channel;
+        imgs[std::string("channel") + itoa(i, buff, 10)] >> channel;
+        GpuMat gchannel(flds.hogluv, cv::Rect(0, 121 * i, 161, 121));
+        gchannel.upload(channel);
+    }
+#else
     GpuMat& dmem = flds.dmem;
     cudaMemset(dmem.data, 0, dmem.step * dmem.rows);
     GpuMat& shrunk = flds.shrunk;
     int w = shrunk.cols;
     int h = colored.rows / flds.storage.shrinkage;
 
-    cudaStream_t stream = StreamAccessor::getStream(s);
-
     std::vector<GpuMat> splited;
     for(int i = 0; i < 3; ++i)
     {
@@ -468,9 +492,6 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat&
 
     GpuMat plane(dmem, cv::Rect(0, 0, colored.cols, colored.rows * Filds::HOG_LUV_BINS));
     cv::gpu::resize(plane, flds.shrunk, cv::Size(), 0.25, 0.25, CV_INTER_AREA);
-    // cv::Mat cpu(plane);
-    // cv::imshow("channels", cpu);
-    // cv::waitKey(0);
 
     // fer debug purpose
     // cudaMemset(flds.hogluv.data, 0, flds.hogluv.step * flds.hogluv.rows);
@@ -482,6 +503,9 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat&
         cv::gpu::integralBuffered(channel, sum, flds.integralBuffer);
     }
 
+#endif
+
+    cudaStream_t stream = StreamAccessor::getStream(s);
     // detection
     flds.detect(objects, stream);
 

From 4881205baee2807aeb9fe5e70551290474e80671 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Tue, 25 Sep 2012 22:43:43 +0400
Subject: [PATCH 18/74] refactor logs

---
 modules/gpu/src/cuda/isf-sc.cu | 82 ++++++++++++++--------------------
 1 file changed, 34 insertions(+), 48 deletions(-)

diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu
index 6572c54fc..ccc1ddf30 100644
--- a/modules/gpu/src/cuda/isf-sc.cu
+++ b/modules/gpu/src/cuda/isf-sc.cu
@@ -47,6 +47,13 @@
 
 //#define LOG_CUDA_CASCADE
 
+#if defined LOG_CUDA_CASCADE
+# define dprintf(format, ...) \
+            do { printf(format, __VA_ARGS__); } while (0)
+#else
+# define dprintf(format, ...)
+#endif
+
 namespace cv { namespace gpu { namespace device {
 
 namespace icf {
@@ -98,10 +105,8 @@ __global__ void detect(const cv::gpu::icf::Cascade cascade, const int* __restric
 float __device icf::Cascade::rescale(const icf::Level& level, uchar4& scaledRect,
                                      const int channel, const float threshold) const
 {
-#if defined LOG_CUDA_CASCADE
-    printf("feature %d box %d %d %d %d\n", channel, scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w);
-    printf("rescale: %f [%f %f]\n",level.relScale, level.scaling[0], level.scaling[1]);
-#endif
+    dprintf("feature %d box %d %d %d %d\n", channel, scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w);
+    dprintf("rescale: %f [%f %f]\n",level.relScale, level.scaling[0], level.scaling[1]);
 
     float relScale = level.relScale;
     float farea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y);
@@ -122,19 +127,15 @@ float __device icf::Cascade::rescale(const icf::Level& level, uchar4& scaledRect
         approx = expected_new_area / sarea;
     }
 
-#if defined LOG_CUDA_CASCADE
-    printf("new rect: %d box %d %d %d %d  rel areas %f %f\n", channel,
+    dprintf("new rect: %d box %d %d %d %d  rel areas %f %f\n", channel,
         scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w, farea * relScale * relScale, sarea);
-#endif
 
     // compensation areas rounding
     float rootThreshold = threshold / approx;
     // printf("    approx %f\n", rootThreshold);
     rootThreshold *= level.scaling[(int)(channel > 6)];
 
-#if defined LOG_CUDA_CASCADE
-    printf("approximation %f %f -> %f %f\n", approx, threshold, rootThreshold, level.scaling[(int)(channel > 6)]);
-#endif
+    dprintf("approximation %f %f -> %f %f\n", approx, threshold, rootThreshold, level.scaling[(int)(channel > 6)]);
 
     return rootThreshold;
 }
@@ -143,14 +144,12 @@ typedef unsigned char uchar;
 float __device get(const int* __restrict__ hogluv, const int pitch,
                    const int x, const int y, int channel, uchar4 area)
 {
-#if defined LOG_CUDA_CASCADE
-    printf("feature box %d %d %d %d ", area.x, area.y, area.z, area.w);
-    printf("get for channel %d\n", channel);
-    printf("extract feature for: [%d %d] [%d %d] [%d %d] [%d %d]\n",
+    dprintf("feature box %d %d %d %d ", area.x, area.y, area.z, area.w);
+    dprintf("get for channel %d\n", channel);
+    dprintf("extract feature for: [%d %d] [%d %d] [%d %d] [%d %d]\n",
         x + area.x, y + area.y,  x + area.z, y + area.y,  x + area.z,y + area.w,
         x + area.x, y + area.w);
-    printf("at point %d %d with offset %d\n", x, y, 0);
-#endif
+    dprintf("at point %d %d with offset %d\n", x, y, 0);
 
     const int* curr = hogluv + ((channel * 121) + y) * pitch;
 
@@ -159,9 +158,7 @@ float __device get(const int* __restrict__ hogluv, const int pitch,
     int c = curr[area.w * pitch + x + area.z];
     int d = curr[area.w * pitch + x + area.x];
 
-#if defined LOG_CUDA_CASCADE
-    printf("    retruved integral values: %d %d %d %d\n", a, b, c, d);
-#endif
+    dprintf("    retruved integral values: %d %d %d %d\n", a, b, c, d);
 
     return (a - b + c - d);
 }
@@ -176,13 +173,11 @@ void __device icf::Cascade::detectAt(const int* __restrict__ hogluv, const int p
     const int x = blockIdx.x * blockDim.x + threadIdx.x;
     // if (x > 0 || y > 0) return;
 
-    Level level = lls[0];
+    Level level = lls[blockIdx.z];
     if (x >= level.workRect.x || y >= level.workRect.y) return;
 
-#if defined LOG_CUDA_CASCADE
-    printf("level: %d (%f %f) [%f %f] (%d %d) (%d %d)\n", level.octave, level.relScale, level.shrScale,
+    dprintf("level: %d (%f %f) [%f %f] (%d %d) (%d %d)\n", level.octave, level.relScale, level.shrScale,
         level.scaling[0], level.scaling[1], level.workRect.x, level.workRect.y, level.objSize.x, level.objSize.y);
-#endif
 
     const Octave octave = ((const Octave*)octaves.ptr())[level.octave];
     // printf("Octave: %d %d %d (%d %d) %f\n", octave.index, octave.stages,
@@ -196,17 +191,15 @@ void __device icf::Cascade::detectAt(const int* __restrict__ hogluv, const int p
     for(; st < stEnd; ++st)
     {
         const float stage = stages(0, st);
-#if defined LOG_CUDA_CASCADE
-        printf("Stage: %f\n", stage);
-#endif
+        dprintf("Stage: %f\n", stage);
         {
             const int nId = st * 3;
 
             // work with root node
             const Node node = ((const Node*)nodes.ptr())[nId];
-#if defined LOG_CUDA_CASCADE
-            printf("Node: %d %f\n", node.feature, node.threshold);
-#endif
+
+            dprintf("Node: %d %f\n", node.feature, node.threshold);
+
             const Feature feature = ((const Feature*)features.ptr())[node.feature];
 
             uchar4 scaledRect = feature.rect;
@@ -214,14 +207,12 @@ void __device icf::Cascade::detectAt(const int* __restrict__ hogluv, const int p
 
             float sum = get(hogluv,pitch, x, y, feature.channel, scaledRect);
 
-#if defined LOG_CUDA_CASCADE
-            printf("root feature %d %f\n",feature.channel, sum);
-#endif
+            dprintf("root feature %d %f\n",feature.channel, sum);
+
             int next = 1 + (int)(sum >= threshold);
 
-#if defined LOG_CUDA_CASCADE
-            printf("go: %d (%f >= %f)\n\n" ,next, sum, threshold);
-#endif
+            dprintf("go: %d (%f >= %f)\n\n" ,next, sum, threshold);
+
             // leaves
             const Node leaf = ((const Node*)nodes.ptr())[nId + next];
             const Feature fLeaf = ((const Feature*)features.ptr())[leaf.feature];
@@ -235,22 +226,17 @@ void __device icf::Cascade::detectAt(const int* __restrict__ hogluv, const int p
 
             detectionScore += impact;
 
-#if defined LOG_CUDA_CASCADE
-            printf("decided: %d (%f >= %f) %d %f\n\n" ,next, sum, threshold, lShift, impact);
-            printf("extracted stage:\n");
-            printf("ct %f\n", stage);
-            printf("computed score %f\n\n", detectionScore);
-            printf("\n\n");
-#endif
-
+            dprintf("decided: %d (%f >= %f) %d %f\n\n" ,next, sum, threshold, lShift, impact);
+            dprintf("extracted stage:\n");
+            dprintf("ct %f\n", stage);
+            dprintf("computed score %f\n\n", detectionScore);
+            dprintf("\n\n");
         }
 
-        if (detectionScore <= stage) break;
+        if (detectionScore <= stage || st - stBegin == 100) break;
     }
 
-#if defined LOG_CUDA_CASCADE
-    // printf("x %d y %d: %d\n", x, y, st - stBegin);
-#endif
+    dprintf("x %d y %d: %d\n", x, y, st - stBegin);
 
     if (st == stEnd)
     {
@@ -264,7 +250,7 @@ void __device icf::Cascade::detectAt(const int* __restrict__ hogluv, const int p
 void icf::Cascade::detect(const cv::gpu::PtrStepSzi& hogluv, PtrStepSz<uchar4> objects, cudaStream_t stream) const
 {
     dim3 block(32, 8, 1);
-    dim3 grid(ChannelStorage::FRAME_WIDTH / 32, ChannelStorage::FRAME_HEIGHT / 8, 1);
+    dim3 grid(ChannelStorage::FRAME_WIDTH / 32, ChannelStorage::FRAME_HEIGHT / 8, 47);
     device::detect<<<grid, block, 0, stream>>>(*this, hogluv, hogluv.step / sizeof(int), objects);
     cudaSafeCall( cudaGetLastError() );
     if (!stream)

From 1917366528d0703b7646de10931a01490bb32b4f Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Wed, 26 Sep 2012 11:18:09 +0400
Subject: [PATCH 19/74] empty cascade

---
 modules/gpu/src/cuda/isf-sc.cu  | 326 +++++++--------
 modules/gpu/src/icf.hpp         | 192 ++++-----
 modules/gpu/src/softcascade.cpp | 678 ++++++++++++++++----------------
 3 files changed, 596 insertions(+), 600 deletions(-)

diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu
index ccc1ddf30..33b2222c7 100644
--- a/modules/gpu/src/cuda/isf-sc.cu
+++ b/modules/gpu/src/cuda/isf-sc.cu
@@ -40,221 +40,221 @@
 //
 //M*/
 
-#include <icf.hpp>
-#include <opencv2/gpu/device/saturate_cast.hpp>
-#include <stdio.h>
-#include <float.h>
+// #include <icf.hpp>
+// #include <opencv2/gpu/device/saturate_cast.hpp>
+// #include <stdio.h>
+// #include <float.h>
 
-//#define LOG_CUDA_CASCADE
+// //#define LOG_CUDA_CASCADE
 
-#if defined LOG_CUDA_CASCADE
-# define dprintf(format, ...) \
-            do { printf(format, __VA_ARGS__); } while (0)
-#else
-# define dprintf(format, ...)
-#endif
+// #if defined LOG_CUDA_CASCADE
+// # define dprintf(format, ...) \
+//             do { printf(format, __VA_ARGS__); } while (0)
+// #else
+// # define dprintf(format, ...)
+// #endif
 
-namespace cv { namespace gpu { namespace device {
+// namespace cv { namespace gpu { namespace device {
 
-namespace icf {
+// namespace icf {
 
-    enum {
-        HOG_BINS = 6,
-        HOG_LUV_BINS = 10,
-        WIDTH = 640,
-        HEIGHT = 480,
-        GREY_OFFSET = HEIGHT * HOG_LUV_BINS
-    };
+//     enum {
+//         HOG_BINS = 6,
+//         HOG_LUV_BINS = 10,
+//         WIDTH = 640,
+//         HEIGHT = 480,
+//         GREY_OFFSET = HEIGHT * HOG_LUV_BINS
+//     };
 
-    __global__ void magToHist(const uchar* __restrict__ mag,
-                              const float* __restrict__ angle, const int angPitch,
-                                    uchar* __restrict__ hog,   const int hogPitch)
-    {
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+//     __global__ void magToHist(const uchar* __restrict__ mag,
+//                               const float* __restrict__ angle, const int angPitch,
+//                                     uchar* __restrict__ hog,   const int hogPitch)
+//     {
+//         const int y = blockIdx.y * blockDim.y + threadIdx.y;
+//         const int x = blockIdx.x * blockDim.x + threadIdx.x;
 
-        const int bin = (int)(angle[y * angPitch + x]);
-        const uchar val = mag[y * angPitch + x];
+//         const int bin = (int)(angle[y * angPitch + x]);
+//         const uchar val = mag[y * angPitch + x];
 
-        hog[((HEIGHT * bin) + y) * hogPitch + x] = val;
-    }
+//         hog[((HEIGHT * bin) + y) * hogPitch + x] = val;
+//     }
 
-    void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle)
-    {
-        const uchar* mag = (const uchar*)hogluv.ptr(HEIGHT * HOG_BINS);
-        uchar* hog = (uchar*)hogluv.ptr();
-        const float* angle = (const float*)nangle.ptr();
+//     void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle)
+//     {
+//         const uchar* mag = (const uchar*)hogluv.ptr(HEIGHT * HOG_BINS);
+//         uchar* hog = (uchar*)hogluv.ptr();
+//         const float* angle = (const float*)nangle.ptr();
 
-        dim3 block(32, 8);
-        dim3 grid(WIDTH / 32, HEIGHT / 8);
+//         dim3 block(32, 8);
+//         dim3 grid(WIDTH / 32, HEIGHT / 8);
 
-        magToHist<<<grid, block>>>(mag, angle, nangle.step / sizeof(float), hog, hogluv.step);
-        cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall( cudaDeviceSynchronize() );
-    }
-}
+//         magToHist<<<grid, block>>>(mag, angle, nangle.step / sizeof(float), hog, hogluv.step);
+//         cudaSafeCall( cudaGetLastError() );
+//         cudaSafeCall( cudaDeviceSynchronize() );
+//     }
+// }
 
-__global__ void detect(const cv::gpu::icf::Cascade cascade, const int* __restrict__ hogluv, const int pitch,
-    PtrStepSz<uchar4> objects)
-{
-    cascade.detectAt(hogluv, pitch, objects);
-}
+// __global__ void detect(const cv::gpu::icf::Cascade cascade, const int* __restrict__ hogluv, const int pitch,
+//     PtrStepSz<uchar4> objects)
+// {
+//     cascade.detectAt(hogluv, pitch, objects);
+// }
 
-}
+// }
 
-float __device icf::Cascade::rescale(const icf::Level& level, uchar4& scaledRect,
-                                     const int channel, const float threshold) const
-{
-    dprintf("feature %d box %d %d %d %d\n", channel, scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w);
-    dprintf("rescale: %f [%f %f]\n",level.relScale, level.scaling[0], level.scaling[1]);
+// float __device icf::Cascade::rescale(const icf::Level& level, uchar4& scaledRect,
+//                                      const int channel, const float threshold) const
+// {
+//     dprintf("feature %d box %d %d %d %d\n", channel, scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w);
+//     dprintf("rescale: %f [%f %f]\n",level.relScale, level.scaling[0], level.scaling[1]);
 
-    float relScale = level.relScale;
-    float farea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y);
+//     float relScale = level.relScale;
+//     float farea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y);
 
-    // rescale
-    scaledRect.x = __float2int_rn(relScale * scaledRect.x);
-    scaledRect.y = __float2int_rn(relScale * scaledRect.y);
-    scaledRect.z = __float2int_rn(relScale * scaledRect.z);
-    scaledRect.w = __float2int_rn(relScale * scaledRect.w);
+//     // rescale
+//     scaledRect.x = __float2int_rn(relScale * scaledRect.x);
+//     scaledRect.y = __float2int_rn(relScale * scaledRect.y);
+//     scaledRect.z = __float2int_rn(relScale * scaledRect.z);
+//     scaledRect.w = __float2int_rn(relScale * scaledRect.w);
 
-    float sarea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y);
+//     float sarea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y);
 
 
-    float approx = 1.f;
-    if (fabs(farea - 0.f) > FLT_EPSILON && fabs(farea - 0.f) > FLT_EPSILON)
-    {
-        const float expected_new_area = farea * relScale * relScale;
-        approx = expected_new_area / sarea;
-    }
+//     float approx = 1.f;
+//     if (fabs(farea - 0.f) > FLT_EPSILON && fabs(farea - 0.f) > FLT_EPSILON)
+//     {
+//         const float expected_new_area = farea * relScale * relScale;
+//         approx = expected_new_area / sarea;
+//     }
 
-    dprintf("new rect: %d box %d %d %d %d  rel areas %f %f\n", channel,
-        scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w, farea * relScale * relScale, sarea);
+//     dprintf("new rect: %d box %d %d %d %d  rel areas %f %f\n", channel,
+//         scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w, farea * relScale * relScale, sarea);
 
-    // compensation areas rounding
-    float rootThreshold = threshold / approx;
-    // printf("    approx %f\n", rootThreshold);
-    rootThreshold *= level.scaling[(int)(channel > 6)];
+//     // compensation areas rounding
+//     float rootThreshold = threshold / approx;
+//     // printf("    approx %f\n", rootThreshold);
+//     rootThreshold *= level.scaling[(int)(channel > 6)];
 
-    dprintf("approximation %f %f -> %f %f\n", approx, threshold, rootThreshold, level.scaling[(int)(channel > 6)]);
+//     dprintf("approximation %f %f -> %f %f\n", approx, threshold, rootThreshold, level.scaling[(int)(channel > 6)]);
 
-    return rootThreshold;
-}
+//     return rootThreshold;
+// }
 
-typedef unsigned char uchar;
-float __device get(const int* __restrict__ hogluv, const int pitch,
-                   const int x, const int y, int channel, uchar4 area)
-{
-    dprintf("feature box %d %d %d %d ", area.x, area.y, area.z, area.w);
-    dprintf("get for channel %d\n", channel);
-    dprintf("extract feature for: [%d %d] [%d %d] [%d %d] [%d %d]\n",
-        x + area.x, y + area.y,  x + area.z, y + area.y,  x + area.z,y + area.w,
-        x + area.x, y + area.w);
-    dprintf("at point %d %d with offset %d\n", x, y, 0);
+// typedef unsigned char uchar;
+// float __device get(const int* __restrict__ hogluv, const int pitch,
+//                    const int x, const int y, int channel, uchar4 area)
+// {
+//     dprintf("feature box %d %d %d %d ", area.x, area.y, area.z, area.w);
+//     dprintf("get for channel %d\n", channel);
+//     dprintf("extract feature for: [%d %d] [%d %d] [%d %d] [%d %d]\n",
+//         x + area.x, y + area.y,  x + area.z, y + area.y,  x + area.z,y + area.w,
+//         x + area.x, y + area.w);
+//     dprintf("at point %d %d with offset %d\n", x, y, 0);
 
-    const int* curr = hogluv + ((channel * 121) + y) * pitch;
+//     const int* curr = hogluv + ((channel * 121) + y) * pitch;
 
-    int a = curr[area.y * pitch + x + area.x];
-    int b = curr[area.y * pitch + x + area.z];
-    int c = curr[area.w * pitch + x + area.z];
-    int d = curr[area.w * pitch + x + area.x];
+//     int a = curr[area.y * pitch + x + area.x];
+//     int b = curr[area.y * pitch + x + area.z];
+//     int c = curr[area.w * pitch + x + area.z];
+//     int d = curr[area.w * pitch + x + area.x];
 
-    dprintf("    retruved integral values: %d %d %d %d\n", a, b, c, d);
+//     dprintf("    retruved integral values: %d %d %d %d\n", a, b, c, d);
 
-    return (a - b + c - d);
-}
+//     return (a - b + c - d);
+// }
 
 
-void __device icf::Cascade::detectAt(const int* __restrict__ hogluv, const int pitch,
-                                    PtrStepSz<uchar4>& objects) const
-{
-    const icf::Level* lls = (const icf::Level*)levels.ptr();
+// void __device icf::Cascade::detectAt(const int* __restrict__ hogluv, const int pitch,
+//                                     PtrStepSz<uchar4>& objects) const
+// {
+//     const icf::Level* lls = (const icf::Level*)levels.ptr();
 
-    const int y = blockIdx.y * blockDim.y + threadIdx.y;
-    const int x = blockIdx.x * blockDim.x + threadIdx.x;
-    // if (x > 0 || y > 0) return;
+//     const int y = blockIdx.y * blockDim.y + threadIdx.y;
+//     const int x = blockIdx.x * blockDim.x + threadIdx.x;
+//     // if (x > 0 || y > 0) return;
 
-    Level level = lls[blockIdx.z];
-    if (x >= level.workRect.x || y >= level.workRect.y) return;
+//     Level level = lls[blockIdx.z];
+//     if (x >= level.workRect.x || y >= level.workRect.y) return;
 
-    dprintf("level: %d (%f %f) [%f %f] (%d %d) (%d %d)\n", level.octave, level.relScale, level.shrScale,
-        level.scaling[0], level.scaling[1], level.workRect.x, level.workRect.y, level.objSize.x, level.objSize.y);
+//     dprintf("level: %d (%f %f) [%f %f] (%d %d) (%d %d)\n", level.octave, level.relScale, level.shrScale,
+//         level.scaling[0], level.scaling[1], level.workRect.x, level.workRect.y, level.objSize.x, level.objSize.y);
 
-    const Octave octave = ((const Octave*)octaves.ptr())[level.octave];
-    // printf("Octave: %d %d %d (%d %d) %f\n", octave.index, octave.stages,
-    //     octave.shrinkage, octave.size.x, octave.size.y, octave.scale);
+//     const Octave octave = ((const Octave*)octaves.ptr())[level.octave];
+//     // printf("Octave: %d %d %d (%d %d) %f\n", octave.index, octave.stages,
+//     //     octave.shrinkage, octave.size.x, octave.size.y, octave.scale);
 
-    const int stBegin = octave.index * octave.stages, stEnd = stBegin + octave.stages;
+//     const int stBegin = octave.index * octave.stages, stEnd = stBegin + octave.stages;
 
-    float detectionScore = 0.f;
+//     float detectionScore = 0.f;
 
-    int st = stBegin;
-    for(; st < stEnd; ++st)
-    {
-        const float stage = stages(0, st);
-        dprintf("Stage: %f\n", stage);
-        {
-            const int nId = st * 3;
+//     int st = stBegin;
+//     for(; st < stEnd; ++st)
+//     {
+//         const float stage = stages(0, st);
+//         dprintf("Stage: %f\n", stage);
+//         {
+//             const int nId = st * 3;
 
-            // work with root node
-            const Node node = ((const Node*)nodes.ptr())[nId];
+//             // work with root node
+//             const Node node = ((const Node*)nodes.ptr())[nId];
 
-            dprintf("Node: %d %f\n", node.feature, node.threshold);
+//             dprintf("Node: %d %f\n", node.feature, node.threshold);
 
-            const Feature feature = ((const Feature*)features.ptr())[node.feature];
+//             const Feature feature = ((const Feature*)features.ptr())[node.feature];
 
-            uchar4 scaledRect = feature.rect;
-            float threshold = rescale(level, scaledRect, feature.channel, node.threshold);
+//             uchar4 scaledRect = feature.rect;
+//             float threshold = rescale(level, scaledRect, feature.channel, node.threshold);
 
-            float sum = get(hogluv,pitch, x, y, feature.channel, scaledRect);
+//             float sum = get(hogluv,pitch, x, y, feature.channel, scaledRect);
 
-            dprintf("root feature %d %f\n",feature.channel, sum);
+//             dprintf("root feature %d %f\n",feature.channel, sum);
 
-            int next = 1 + (int)(sum >= threshold);
+//             int next = 1 + (int)(sum >= threshold);
 
-            dprintf("go: %d (%f >= %f)\n\n" ,next, sum, threshold);
+//             dprintf("go: %d (%f >= %f)\n\n" ,next, sum, threshold);
 
-            // leaves
-            const Node leaf = ((const Node*)nodes.ptr())[nId + next];
-            const Feature fLeaf = ((const Feature*)features.ptr())[leaf.feature];
+//             // leaves
+//             const Node leaf = ((const Node*)nodes.ptr())[nId + next];
+//             const Feature fLeaf = ((const Feature*)features.ptr())[leaf.feature];
 
-            scaledRect = fLeaf.rect;
-            threshold = rescale(level, scaledRect, fLeaf.channel, leaf.threshold);
-            sum = get(hogluv, pitch, x, y, fLeaf.channel, scaledRect);
+//             scaledRect = fLeaf.rect;
+//             threshold = rescale(level, scaledRect, fLeaf.channel, leaf.threshold);
+//             sum = get(hogluv, pitch, x, y, fLeaf.channel, scaledRect);
 
-            const int lShift = (next - 1) * 2 + (int)(sum >= threshold);
-            float impact = leaves(0, (st * 4) + lShift);
+//             const int lShift = (next - 1) * 2 + (int)(sum >= threshold);
+//             float impact = leaves(0, (st * 4) + lShift);
 
-            detectionScore += impact;
+//             detectionScore += impact;
 
-            dprintf("decided: %d (%f >= %f) %d %f\n\n" ,next, sum, threshold, lShift, impact);
-            dprintf("extracted stage:\n");
-            dprintf("ct %f\n", stage);
-            dprintf("computed score %f\n\n", detectionScore);
-            dprintf("\n\n");
-        }
+//             dprintf("decided: %d (%f >= %f) %d %f\n\n" ,next, sum, threshold, lShift, impact);
+//             dprintf("extracted stage:\n");
+//             dprintf("ct %f\n", stage);
+//             dprintf("computed score %f\n\n", detectionScore);
+//             dprintf("\n\n");
+//         }
 
-        if (detectionScore <= stage || st - stBegin == 100) break;
-    }
+//         if (detectionScore <= stage || st - stBegin == 100) break;
+//     }
 
-    dprintf("x %d y %d: %d\n", x, y, st - stBegin);
+//     dprintf("x %d y %d: %d\n", x, y, st - stBegin);
 
-    if (st == stEnd)
-    {
-        uchar4 a;
-        a.x = level.workRect.x;
-        a.y = level.workRect.y;
-        objects(0, threadIdx.x) = a;
-    }
-}
+//     if (st == stEnd)
+//     {
+//         uchar4 a;
+//         a.x = level.workRect.x;
+//         a.y = level.workRect.y;
+//         objects(0, threadIdx.x) = a;
+//     }
+// }
 
-void icf::Cascade::detect(const cv::gpu::PtrStepSzi& hogluv, PtrStepSz<uchar4> objects, cudaStream_t stream) const
-{
-    dim3 block(32, 8, 1);
-    dim3 grid(ChannelStorage::FRAME_WIDTH / 32, ChannelStorage::FRAME_HEIGHT / 8, 47);
-    device::detect<<<grid, block, 0, stream>>>(*this, hogluv, hogluv.step / sizeof(int), objects);
-    cudaSafeCall( cudaGetLastError() );
-    if (!stream)
-        cudaSafeCall( cudaDeviceSynchronize() );
-}
+// void icf::Cascade::detect(const cv::gpu::PtrStepSzi& hogluv, PtrStepSz<uchar4> objects, cudaStream_t stream) const
+// {
+//     dim3 block(32, 8, 1);
+//     dim3 grid(ChannelStorage::FRAME_WIDTH / 32, ChannelStorage::FRAME_HEIGHT / 8, 47);
+//     device::detect<<<grid, block, 0, stream>>>(*this, hogluv, hogluv.step / sizeof(int), objects);
+//     cudaSafeCall( cudaGetLastError() );
+//     if (!stream)
+//         cudaSafeCall( cudaDeviceSynchronize() );
+// }
 
-}}
\ No newline at end of file
+// }}
\ No newline at end of file
diff --git a/modules/gpu/src/icf.hpp b/modules/gpu/src/icf.hpp
index 7d4b65980..cf1348007 100644
--- a/modules/gpu/src/icf.hpp
+++ b/modules/gpu/src/icf.hpp
@@ -40,127 +40,127 @@
 //
 //M*/
 
-#include <opencv2/gpu/device/common.hpp>
+// #include <opencv2/gpu/device/common.hpp>
 
-#ifndef __OPENCV_ICF_HPP__
-#define __OPENCV_ICF_HPP__
+// #ifndef __OPENCV_ICF_HPP__
+// #define __OPENCV_ICF_HPP__
 
-#if defined __CUDACC__
-# define __device __device__ __forceinline__
-#else
-# define __device
-#endif
+// #if defined __CUDACC__
+// # define __device __device__ __forceinline__
+// #else
+// # define __device
+// #endif
 
 
-namespace cv { namespace gpu { namespace icf {
+// namespace cv { namespace gpu { namespace icf {
 
-using cv::gpu::PtrStepSzb;
-using cv::gpu::PtrStepSzf;
+// using cv::gpu::PtrStepSzb;
+// using cv::gpu::PtrStepSzf;
 
-typedef unsigned char uchar;
+// typedef unsigned char uchar;
 
-struct __align__(16) Octave
-{
-    ushort index;
-    ushort stages;
-    ushort shrinkage;
-    ushort2 size;
-    float scale;
+// struct __align__(16) Octave
+// {
+//     ushort index;
+//     ushort stages;
+//     ushort shrinkage;
+//     ushort2 size;
+//     float scale;
 
-    Octave(const ushort i, const ushort s, const ushort sh, const ushort2 sz, const float sc)
-    : index(i), stages(s), shrinkage(sh), size(sz), scale(sc) {}
-};
+//     Octave(const ushort i, const ushort s, const ushort sh, const ushort2 sz, const float sc)
+//     : index(i), stages(s), shrinkage(sh), size(sz), scale(sc) {}
+// };
 
-struct __align__(8) Level //is actually 24 bytes
-{
-    int octave;
+// struct __align__(8) Level //is actually 24 bytes
+// {
+//     int octave;
 
-    // float origScale; //not actually used
-    float relScale;
-    float shrScale;   // used for marking detection
-    float scaling[2]; // calculated according to Dollal paper
+//     // float origScale; //not actually used
+//     float relScale;
+//     float shrScale;   // used for marking detection
+//     float scaling[2]; // calculated according to Dollal paper
 
-    // for 640x480 we can not get overflow
-    uchar2 workRect;
-    uchar2 objSize;
+//     // for 640x480 we can not get overflow
+//     uchar2 workRect;
+//     uchar2 objSize;
 
-    Level(int idx, const Octave& oct, const float scale, const int w, const int h)
-    :  octave(idx), relScale(scale / oct.scale), shrScale (relScale / (float)oct.shrinkage)
-    {
-        workRect.x = round(w / (float)oct.shrinkage);
-        workRect.y = round(h / (float)oct.shrinkage);
+//     Level(int idx, const Octave& oct, const float scale, const int w, const int h)
+//     :  octave(idx), relScale(scale / oct.scale), shrScale (relScale / (float)oct.shrinkage)
+//     {
+//         workRect.x = round(w / (float)oct.shrinkage);
+//         workRect.y = round(h / (float)oct.shrinkage);
 
-        objSize.x  = round(oct.size.x * relScale);
-        objSize.y  = round(oct.size.y * relScale);
-    }
-};
+//         objSize.x  = round(oct.size.x * relScale);
+//         objSize.y  = round(oct.size.y * relScale);
+//     }
+// };
 
-struct Cascade
-{
-    Cascade() {}
-    Cascade(const cv::gpu::PtrStepSzb& octs, const cv::gpu::PtrStepSzf& sts, const cv::gpu::PtrStepSzb& nds,
-        const cv::gpu::PtrStepSzf& lvs, const cv::gpu::PtrStepSzb& fts, const cv::gpu::PtrStepSzb& lls)
-    : octaves(octs), stages(sts), nodes(nds), leaves(lvs), features(fts), levels(lls) {}
+// struct Cascade
+// {
+//     Cascade() {}
+//     Cascade(const cv::gpu::PtrStepSzb& octs, const cv::gpu::PtrStepSzf& sts, const cv::gpu::PtrStepSzb& nds,
+//         const cv::gpu::PtrStepSzf& lvs, const cv::gpu::PtrStepSzb& fts, const cv::gpu::PtrStepSzb& lls)
+//     : octaves(octs), stages(sts), nodes(nds), leaves(lvs), features(fts), levels(lls) {}
 
-    void detect(const cv::gpu::PtrStepSzi& hogluv, cv::gpu::PtrStepSz<uchar4> objects, cudaStream_t stream) const;
-    void __device detectAt(const int* __restrict__ hogluv, const int pitch, PtrStepSz<uchar4>& objects) const;
-    float __device rescale(const icf::Level& level, uchar4& scaledRect,
-                           const int channel, const float threshold) const;
+//     void detect(const cv::gpu::PtrStepSzi& hogluv, cv::gpu::PtrStepSz<uchar4> objects, cudaStream_t stream) const;
+//     void __device detectAt(const int* __restrict__ hogluv, const int pitch, PtrStepSz<uchar4>& objects) const;
+//     float __device rescale(const icf::Level& level, uchar4& scaledRect,
+//                            const int channel, const float threshold) const;
 
-    PtrStepSzb octaves;
-    PtrStepSzf stages;
-    PtrStepSzb nodes;
-    PtrStepSzf leaves;
-    PtrStepSzb features;
+//     PtrStepSzb octaves;
+//     PtrStepSzf stages;
+//     PtrStepSzb nodes;
+//     PtrStepSzf leaves;
+//     PtrStepSzb features;
 
-    PtrStepSzb levels;
+//     PtrStepSzb levels;
 
-};
+// };
 
-struct ChannelStorage
-{
-    ChannelStorage(){}
-    ChannelStorage(const cv::gpu::PtrStepSzb& buff, const cv::gpu::PtrStepSzb& shr,
-        const cv::gpu::PtrStepSzb& itg, const int s)
-    : dmem (buff), shrunk(shr), hogluv(itg), shrinkage(s) {}
+// struct ChannelStorage
+// {
+//     ChannelStorage(){}
+//     ChannelStorage(const cv::gpu::PtrStepSzb& buff, const cv::gpu::PtrStepSzb& shr,
+//         const cv::gpu::PtrStepSzb& itg, const int s)
+//     : dmem (buff), shrunk(shr), hogluv(itg), shrinkage(s) {}
 
-    void frame(const cv::gpu::PtrStepSz<uchar3>& rgb, cudaStream_t stream){}
+//     void frame(const cv::gpu::PtrStepSz<uchar3>& rgb, cudaStream_t stream){}
 
-    PtrStepSzb dmem;
-    PtrStepSzb shrunk;
-    PtrStepSzb hogluv;
+//     PtrStepSzb dmem;
+//     PtrStepSzb shrunk;
+//     PtrStepSzb hogluv;
 
-    enum
-    {
-        FRAME_WIDTH        = 640,
-        FRAME_HEIGHT       = 480,
-        TOTAL_SCALES       = 55,
-        CLASSIFIERS        = 5,
-        ORIG_OBJECT_WIDTH  = 64,
-        ORIG_OBJECT_HEIGHT = 128,
-        HOG_BINS           = 6,
-        HOG_LUV_BINS       = 10
-    };
+//     enum
+//     {
+//         FRAME_WIDTH        = 640,
+//         FRAME_HEIGHT       = 480,
+//         TOTAL_SCALES       = 55,
+//         CLASSIFIERS        = 5,
+//         ORIG_OBJECT_WIDTH  = 64,
+//         ORIG_OBJECT_HEIGHT = 128,
+//         HOG_BINS           = 6,
+//         HOG_LUV_BINS       = 10
+//     };
 
-    int shrinkage;
-    static const float magnitudeScaling = 1.f ;// / sqrt(2);
-};
+//     int shrinkage;
+//     static const float magnitudeScaling = 1.f ;// / sqrt(2);
+// };
 
-struct __align__(8) Node
-{
-    int feature;
-    float threshold;
+// struct __align__(8) Node
+// {
+//     int feature;
+//     float threshold;
 
-    Node(const int f, const float t) : feature(f), threshold(t) {}
-};
+//     Node(const int f, const float t) : feature(f), threshold(t) {}
+// };
 
-struct __align__(8) Feature
-{
-    int channel;
-    uchar4 rect;
+// struct __align__(8) Feature
+// {
+//     int channel;
+//     uchar4 rect;
 
-    Feature(const int c, const uchar4 r) : channel(c), rect(r) {}
-};
-}}}
+//     Feature(const int c, const uchar4 r) : channel(c), rect(r) {}
+// };
+// }}}
 
-#endif
\ No newline at end of file
+// #endif
\ No newline at end of file
diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index 7e1a5abb9..c4334ca1d 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -41,361 +41,365 @@
 //M*/
 
 #include <precomp.hpp>
-#include "opencv2/highgui/highgui.hpp"
+#include <opencv2/highgui/highgui.hpp>
 
 #if !defined (HAVE_CUDA)
 
 cv::gpu::SoftCascade::SoftCascade() : filds(0) { throw_nogpu(); }
-
 cv::gpu::SoftCascade::SoftCascade( const string&, const float, const float) : filds(0) { throw_nogpu(); }
-
 cv::gpu::SoftCascade::~SoftCascade() { throw_nogpu(); }
-
-bool cv::gpu::SoftCascade::load( const string&, const float, const float) { throw_nogpu(); }
-
+bool cv::gpu::SoftCascade::load( const string&, const float, const float) { throw_nogpu(); return false; }
 void cv::gpu::SoftCascade::detectMultiScale(const GpuMat&, const GpuMat&, GpuMat&, const int, Stream) { throw_nogpu(); }
 
 #else
 
-#include <icf.hpp>
+// #include <icf.hpp>
 
-namespace cv { namespace gpu { namespace device {
-namespace icf {
-    void fillBins(cv::gpu::PtrStepSzb hogluv,const cv::gpu::PtrStepSzf& nangle);
-}
-}}}
+// namespace cv { namespace gpu { namespace device {
+// namespace icf {
+//     void fillBins(cv::gpu::PtrStepSzb hogluv,const cv::gpu::PtrStepSzf& nangle);
+// }
+// }}}
+
+// namespace {
+//     char *itoa(long i, char* s, int /*dummy_radix*/)
+//     {
+//         sprintf(s, "%ld", i);
+//         return s;
+//     }
+// }
 
 struct cv::gpu::SoftCascade::Filds
 {
-    // scales range
-    float minScale;
-    float maxScale;
+//     // scales range
+//     float minScale;
+//     float maxScale;
 
-    int origObjWidth;
-    int origObjHeight;
+//     int origObjWidth;
+//     int origObjHeight;
 
-    GpuMat octaves;
-    GpuMat stages;
-    GpuMat nodes;
-    GpuMat leaves;
-    GpuMat features;
-    GpuMat levels;
+//     GpuMat octaves;
+//     GpuMat stages;
+//     GpuMat nodes;
+//     GpuMat leaves;
+//     GpuMat features;
+//     GpuMat levels;
 
-    // preallocated buffer 640x480x10 + 640x480
-    GpuMat dmem;
-    // 160x120x10
-    GpuMat shrunk;
-    // 161x121x10
-    GpuMat hogluv;
+//     // preallocated buffer 640x480x10 + 640x480
+//     GpuMat dmem;
+//     // 160x120x10
+//     GpuMat shrunk;
+//     // 161x121x10
+//     GpuMat hogluv;
 
-    // will be removed in final version
-    // temporial mat for cvtColor
-    GpuMat luv;
+//     // will be removed in final version
+//     // temporial mat for cvtColor
+//     GpuMat luv;
 
-    // temporial mat for integrall
-    GpuMat integralBuffer;
+//     // temporial mat for integrall
+//     GpuMat integralBuffer;
 
-    // temp matrix for sobel and cartToPolar
-    GpuMat dfdx, dfdy, angle, mag, nmag, nangle;
+//     // temp matrix for sobel and cartToPolar
+//     GpuMat dfdx, dfdy, angle, mag, nmag, nangle;
 
-    std::vector<float> scales;
+//     std::vector<float> scales;
 
-    icf::Cascade cascade;
-    icf::ChannelStorage storage;
+//     icf::Cascade cascade;
+//     icf::ChannelStorage storage;
 
-    enum { BOOST = 0 };
-    enum
-    {
-        FRAME_WIDTH        = 640,
-        FRAME_HEIGHT       = 480,
-        TOTAL_SCALES       = 55,
-        CLASSIFIERS        = 5,
-        ORIG_OBJECT_WIDTH  = 64,
-        ORIG_OBJECT_HEIGHT = 128,
-        HOG_BINS           = 6,
-        HOG_LUV_BINS       = 10
-    };
+//     enum { BOOST = 0 };
+//     enum
+//     {
+//         FRAME_WIDTH        = 640,
+//         FRAME_HEIGHT       = 480,
+//         TOTAL_SCALES       = 55,
+//         CLASSIFIERS        = 5,
+//         ORIG_OBJECT_WIDTH  = 64,
+//         ORIG_OBJECT_HEIGHT = 128,
+//         HOG_BINS           = 6,
+//         HOG_LUV_BINS       = 10
+//     };
 
-    bool fill(const FileNode &root, const float mins, const float maxs);
-    void detect(cv::gpu::GpuMat objects, cudaStream_t stream) const
-    {
-        cascade.detect(hogluv, objects, stream);
-    }
+//     bool fill(const FileNode &root, const float mins, const float maxs);
+//     void detect(cv::gpu::GpuMat objects, cudaStream_t stream) const
+//     {
+//         cascade.detect(hogluv, objects, stream);
+//     }
 
-private:
-    void calcLevels(const std::vector<icf::Octave>& octs,
-                                                    int frameW, int frameH, int nscales);
+// private:
+//     void calcLevels(const std::vector<icf::Octave>& octs,
+//                                                     int frameW, int frameH, int nscales);
 
-    typedef std::vector<icf::Octave>::const_iterator  octIt_t;
-    int fitOctave(const std::vector<icf::Octave>& octs, const float& logFactor) const
-    {
-        float minAbsLog = FLT_MAX;
-        int res =  0;
-        for (int oct = 0; oct < (int)octs.size(); ++oct)
-        {
-            const icf::Octave& octave =octs[oct];
-            float logOctave = ::log(octave.scale);
-            float logAbsScale = ::fabs(logFactor - logOctave);
+//     typedef std::vector<icf::Octave>::const_iterator  octIt_t;
+//     int fitOctave(const std::vector<icf::Octave>& octs, const float& logFactor) const
+//     {
+//         float minAbsLog = FLT_MAX;
+//         int res =  0;
+//         for (int oct = 0; oct < (int)octs.size(); ++oct)
+//         {
+//             const icf::Octave& octave =octs[oct];
+//             float logOctave = ::log(octave.scale);
+//             float logAbsScale = ::fabs(logFactor - logOctave);
 
-            if(logAbsScale < minAbsLog)
-            {
-                res = oct;
-                minAbsLog = logAbsScale;
-            }
-        }
-        return res;
-    }
+//             if(logAbsScale < minAbsLog)
+//             {
+//                 res = oct;
+//                 minAbsLog = logAbsScale;
+//             }
+//         }
+//         return res;
+//     }
 };
 
-inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float mins, const float maxs)
-{
-    minScale = mins;
-    maxScale = maxs;
+// inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float mins, const float maxs)
+// {
+//     minScale = mins;
+//     maxScale = maxs;
 
-    // cascade properties
-    static const char *const SC_STAGE_TYPE          = "stageType";
-    static const char *const SC_BOOST               = "BOOST";
+//     // cascade properties
+//     static const char *const SC_STAGE_TYPE          = "stageType";
+//     static const char *const SC_BOOST               = "BOOST";
 
-    static const char *const SC_FEATURE_TYPE        = "featureType";
-    static const char *const SC_ICF                 = "ICF";
+//     static const char *const SC_FEATURE_TYPE        = "featureType";
+//     static const char *const SC_ICF                 = "ICF";
 
-    static const char *const SC_ORIG_W              = "width";
-    static const char *const SC_ORIG_H              = "height";
+//     static const char *const SC_ORIG_W              = "width";
+//     static const char *const SC_ORIG_H              = "height";
 
-    static const char *const SC_OCTAVES             = "octaves";
-    static const char *const SC_STAGES              = "stages";
-    static const char *const SC_FEATURES            = "features";
+//     static const char *const SC_OCTAVES             = "octaves";
+//     static const char *const SC_STAGES              = "stages";
+//     static const char *const SC_FEATURES            = "features";
 
-    static const char *const SC_WEEK                = "weakClassifiers";
-    static const char *const SC_INTERNAL            = "internalNodes";
-    static const char *const SC_LEAF                = "leafValues";
+//     static const char *const SC_WEEK                = "weakClassifiers";
+//     static const char *const SC_INTERNAL            = "internalNodes";
+//     static const char *const SC_LEAF                = "leafValues";
 
-    static const char *const SC_OCT_SCALE           = "scale";
-    static const char *const SC_OCT_STAGES          = "stageNum";
-    static const char *const SC_OCT_SHRINKAGE       = "shrinkingFactor";
+//     static const char *const SC_OCT_SCALE           = "scale";
+//     static const char *const SC_OCT_STAGES          = "stageNum";
+//     static const char *const SC_OCT_SHRINKAGE       = "shrinkingFactor";
 
-    static const char *const SC_STAGE_THRESHOLD     = "stageThreshold";
+//     static const char *const SC_STAGE_THRESHOLD     = "stageThreshold";
 
-    static const char * const SC_F_CHANNEL          = "channel";
-    static const char * const SC_F_RECT             = "rect";
+//     static const char * const SC_F_CHANNEL          = "channel";
+//     static const char * const SC_F_RECT             = "rect";
 
-    // only Ada Boost supported
-    std::string stageTypeStr = (string)root[SC_STAGE_TYPE];
-    CV_Assert(stageTypeStr == SC_BOOST);
+//     // only Ada Boost supported
+//     std::string stageTypeStr = (string)root[SC_STAGE_TYPE];
+//     CV_Assert(stageTypeStr == SC_BOOST);
 
-    // only HOG-like integral channel features cupported
-    string featureTypeStr = (string)root[SC_FEATURE_TYPE];
-    CV_Assert(featureTypeStr == SC_ICF);
+//     // only HOG-like integral channel features cupported
+//     string featureTypeStr = (string)root[SC_FEATURE_TYPE];
+//     CV_Assert(featureTypeStr == SC_ICF);
 
-    origObjWidth = (int)root[SC_ORIG_W];
-    CV_Assert(origObjWidth  == ORIG_OBJECT_WIDTH);
+//     origObjWidth = (int)root[SC_ORIG_W];
+//     CV_Assert(origObjWidth  == ORIG_OBJECT_WIDTH);
 
-    origObjHeight = (int)root[SC_ORIG_H];
-    CV_Assert(origObjHeight == ORIG_OBJECT_HEIGHT);
+//     origObjHeight = (int)root[SC_ORIG_H];
+//     CV_Assert(origObjHeight == ORIG_OBJECT_HEIGHT);
 
-    FileNode fn = root[SC_OCTAVES];
-        if (fn.empty()) return false;
+//     FileNode fn = root[SC_OCTAVES];
+//         if (fn.empty()) return false;
 
-    std::vector<icf::Octave>  voctaves;
-    std::vector<float>        vstages;
-    std::vector<icf::Node>    vnodes;
-    std::vector<float>        vleaves;
-    std::vector<icf::Feature> vfeatures;
-    scales.clear();
+//     std::vector<icf::Octave>  voctaves;
+//     std::vector<float>        vstages;
+//     std::vector<icf::Node>    vnodes;
+//     std::vector<float>        vleaves;
+//     std::vector<icf::Feature> vfeatures;
+//     scales.clear();
 
-    // std::vector<Level> levels;
+//     // std::vector<Level> levels;
 
-    FileNodeIterator it = fn.begin(), it_end = fn.end();
-    int feature_offset = 0;
-    ushort octIndex = 0;
-    ushort shrinkage = 1;
+//     FileNodeIterator it = fn.begin(), it_end = fn.end();
+//     int feature_offset = 0;
+//     ushort octIndex = 0;
+//     ushort shrinkage = 1;
 
-    for (; it != it_end; ++it)
-    {
-        FileNode fns = *it;
-        float scale = (float)fns[SC_OCT_SCALE];
-        scales.push_back(scale);
-        ushort nstages = saturate_cast<ushort>((int)fns[SC_OCT_STAGES]);
-        ushort2 size;
-        size.x = cvRound(ORIG_OBJECT_WIDTH * scale);
-        size.y = cvRound(ORIG_OBJECT_HEIGHT * scale);
-        shrinkage = saturate_cast<ushort>((int)fns[SC_OCT_SHRINKAGE]);
+//     for (; it != it_end; ++it)
+//     {
+//         FileNode fns = *it;
+//         float scale = (float)fns[SC_OCT_SCALE];
+//         scales.push_back(scale);
+//         ushort nstages = saturate_cast<ushort>((int)fns[SC_OCT_STAGES]);
+//         ushort2 size;
+//         size.x = cvRound(ORIG_OBJECT_WIDTH * scale);
+//         size.y = cvRound(ORIG_OBJECT_HEIGHT * scale);
+//         shrinkage = saturate_cast<ushort>((int)fns[SC_OCT_SHRINKAGE]);
 
-        icf::Octave octave(octIndex, nstages, shrinkage, size, scale);
-        CV_Assert(octave.stages > 0);
-        voctaves.push_back(octave);
+//         icf::Octave octave(octIndex, nstages, shrinkage, size, scale);
+//         CV_Assert(octave.stages > 0);
+//         voctaves.push_back(octave);
 
-        FileNode ffs = fns[SC_FEATURES];
-        if (ffs.empty()) return false;
+//         FileNode ffs = fns[SC_FEATURES];
+//         if (ffs.empty()) return false;
 
-        fns = fns[SC_STAGES];
-        if (fn.empty()) return false;
+//         fns = fns[SC_STAGES];
+//         if (fn.empty()) return false;
 
-        // for each stage (~ decision tree with H = 2)
-        FileNodeIterator st = fns.begin(), st_end = fns.end();
-        for (; st != st_end; ++st )
-        {
-            fns = *st;
-            vstages.push_back((float)fns[SC_STAGE_THRESHOLD]);
+//         // for each stage (~ decision tree with H = 2)
+//         FileNodeIterator st = fns.begin(), st_end = fns.end();
+//         for (; st != st_end; ++st )
+//         {
+//             fns = *st;
+//             vstages.push_back((float)fns[SC_STAGE_THRESHOLD]);
 
-            fns = fns[SC_WEEK];
-            FileNodeIterator ftr = fns.begin(), ft_end = fns.end();
-            for (; ftr != ft_end; ++ftr)
-            {
-                fns = (*ftr)[SC_INTERNAL];
-                FileNodeIterator inIt = fns.begin(), inIt_end = fns.end();
-                for (; inIt != inIt_end;)
-                {
-                    int feature = (int)(*(inIt +=2)++) + feature_offset;
-                    float th = (float)(*(inIt++));
-                    vnodes.push_back(icf::Node(feature, th));
-                }
+//             fns = fns[SC_WEEK];
+//             FileNodeIterator ftr = fns.begin(), ft_end = fns.end();
+//             for (; ftr != ft_end; ++ftr)
+//             {
+//                 fns = (*ftr)[SC_INTERNAL];
+//                 FileNodeIterator inIt = fns.begin(), inIt_end = fns.end();
+//                 for (; inIt != inIt_end;)
+//                 {
+//                     int feature = (int)(*(inIt +=2)++) + feature_offset;
+//                     float th = (float)(*(inIt++));
+//                     vnodes.push_back(icf::Node(feature, th));
+//                 }
 
-                fns = (*ftr)[SC_LEAF];
-                inIt = fns.begin(), inIt_end = fns.end();
-                for (; inIt != inIt_end; ++inIt)
-                    vleaves.push_back((float)(*inIt));
-            }
-        }
+//                 fns = (*ftr)[SC_LEAF];
+//                 inIt = fns.begin(), inIt_end = fns.end();
+//                 for (; inIt != inIt_end; ++inIt)
+//                     vleaves.push_back((float)(*inIt));
+//             }
+//         }
 
-        st = ffs.begin(), st_end = ffs.end();
-        for (; st != st_end; ++st )
-        {
-            cv::FileNode rn = (*st)[SC_F_RECT];
-            cv::FileNodeIterator r_it = rn.begin();
-            uchar4 rect;
-            rect.x = saturate_cast<uchar>((int)*(r_it++));
-            rect.y = saturate_cast<uchar>((int)*(r_it++));
-            rect.z = saturate_cast<uchar>((int)*(r_it++));
-            rect.w = saturate_cast<uchar>((int)*(r_it++));
-            vfeatures.push_back(icf::Feature((int)(*st)[SC_F_CHANNEL], rect));
-        }
+//         st = ffs.begin(), st_end = ffs.end();
+//         for (; st != st_end; ++st )
+//         {
+//             cv::FileNode rn = (*st)[SC_F_RECT];
+//             cv::FileNodeIterator r_it = rn.begin();
+//             uchar4 rect;
+//             rect.x = saturate_cast<uchar>((int)*(r_it++));
+//             rect.y = saturate_cast<uchar>((int)*(r_it++));
+//             rect.z = saturate_cast<uchar>((int)*(r_it++));
+//             rect.w = saturate_cast<uchar>((int)*(r_it++));
+//             vfeatures.push_back(icf::Feature((int)(*st)[SC_F_CHANNEL], rect));
+//         }
 
-        feature_offset += octave.stages * 3;
-        ++octIndex;
-    }
+//         feature_offset += octave.stages * 3;
+//         ++octIndex;
+//     }
 
-    // upload in gpu memory
-    octaves.upload(cv::Mat(1, voctaves.size() * sizeof(icf::Octave), CV_8UC1, (uchar*)&(voctaves[0]) ));
-    CV_Assert(!octaves.empty());
+//     // upload in gpu memory
+//     octaves.upload(cv::Mat(1, voctaves.size() * sizeof(icf::Octave), CV_8UC1, (uchar*)&(voctaves[0]) ));
+//     CV_Assert(!octaves.empty());
 
-    stages.upload(cv::Mat(vstages).reshape(1,1));
-    CV_Assert(!stages.empty());
+//     stages.upload(cv::Mat(vstages).reshape(1,1));
+//     CV_Assert(!stages.empty());
 
-    nodes.upload(cv::Mat(1, vnodes.size() * sizeof(icf::Node), CV_8UC1, (uchar*)&(vnodes[0]) ));
-    CV_Assert(!nodes.empty());
+//     nodes.upload(cv::Mat(1, vnodes.size() * sizeof(icf::Node), CV_8UC1, (uchar*)&(vnodes[0]) ));
+//     CV_Assert(!nodes.empty());
 
-    leaves.upload(cv::Mat(vleaves).reshape(1,1));
-    CV_Assert(!leaves.empty());
+//     leaves.upload(cv::Mat(vleaves).reshape(1,1));
+//     CV_Assert(!leaves.empty());
 
-    features.upload(cv::Mat(1, vfeatures.size() * sizeof(icf::Feature), CV_8UC1, (uchar*)&(vfeatures[0]) ));
-    CV_Assert(!features.empty());
+//     features.upload(cv::Mat(1, vfeatures.size() * sizeof(icf::Feature), CV_8UC1, (uchar*)&(vfeatures[0]) ));
+//     CV_Assert(!features.empty());
 
-    // compute levels
-    calcLevels(voctaves, FRAME_WIDTH, FRAME_HEIGHT, TOTAL_SCALES);
-    CV_Assert(!levels.empty());
+//     // compute levels
+//     calcLevels(voctaves, FRAME_WIDTH, FRAME_HEIGHT, TOTAL_SCALES);
+//     CV_Assert(!levels.empty());
 
-    //init Cascade
-    cascade = icf::Cascade(octaves, stages, nodes, leaves, features, levels);
+//     //init Cascade
+//     cascade = icf::Cascade(octaves, stages, nodes, leaves, features, levels);
 
-    // allocate buffers
-    dmem.create(FRAME_HEIGHT * (HOG_LUV_BINS + 1), FRAME_WIDTH, CV_8UC1);
-    shrunk.create(FRAME_HEIGHT / shrinkage * HOG_LUV_BINS, FRAME_WIDTH / shrinkage, CV_8UC1);
-    // hogluv.create( (FRAME_HEIGHT / shrinkage + 1) * HOG_LUV_BINS, (FRAME_WIDTH / shrinkage + 1), CV_16UC1);
-    hogluv.create( (FRAME_HEIGHT / shrinkage + 1) * HOG_LUV_BINS, (FRAME_WIDTH / shrinkage + 1), CV_32SC1);
-    luv.create(FRAME_HEIGHT, FRAME_WIDTH, CV_8UC3);
-    integralBuffer.create(shrunk.rows + 1 * HOG_LUV_BINS, shrunk.cols + 1, CV_32SC1);
+//     // allocate buffers
+//     dmem.create(FRAME_HEIGHT * (HOG_LUV_BINS + 1), FRAME_WIDTH, CV_8UC1);
+//     shrunk.create(FRAME_HEIGHT / shrinkage * HOG_LUV_BINS, FRAME_WIDTH / shrinkage, CV_8UC1);
+//     // hogluv.create( (FRAME_HEIGHT / shrinkage + 1) * HOG_LUV_BINS, (FRAME_WIDTH / shrinkage + 1), CV_16UC1);
+//     hogluv.create( (FRAME_HEIGHT / shrinkage + 1) * HOG_LUV_BINS, (FRAME_WIDTH / shrinkage + 1), CV_32SC1);
+//     luv.create(FRAME_HEIGHT, FRAME_WIDTH, CV_8UC3);
+//     integralBuffer.create(shrunk.rows + 1 * HOG_LUV_BINS, shrunk.cols + 1, CV_32SC1);
 
-    dfdx.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1);
-    dfdy.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1);
-    angle.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1);
-    mag.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1);
+//     dfdx.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1);
+//     dfdy.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1);
+//     angle.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1);
+//     mag.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1);
 
-    nmag.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1);
-    nangle.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1);
+//     nmag.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1);
+//     nangle.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1);
 
-    storage = icf::ChannelStorage(dmem, shrunk, hogluv, shrinkage);
-    return true;
-}
+//     storage = icf::ChannelStorage(dmem, shrunk, hogluv, shrinkage);
+//     return true;
+// }
 
-namespace {
-    struct CascadeIntrinsics
-    {
-        static const float lambda = 1.099f, a = 0.89f;
+// namespace {
+//     struct CascadeIntrinsics
+//     {
+//         static const float lambda = 1.099f, a = 0.89f;
 
-        static float getFor(int channel, float scaling)
-        {
-            CV_Assert(channel < 10);
+//         static float getFor(int channel, float scaling)
+//         {
+//             CV_Assert(channel < 10);
 
-            if (fabs(scaling - 1.f) < FLT_EPSILON)
-                return 1.f;
+//             if (fabs(scaling - 1.f) < FLT_EPSILON)
+//                 return 1.f;
 
-            // according to R. Benenson, M. Mathias, R. Timofte and L. Van Gool's and Dallal's papers
-            static const float A[2][2] =
-            {   //channel <= 6, otherwise
-                {        0.89f, 1.f}, // down
-                {        1.00f, 1.f}  // up
-            };
+//             // according to R. Benenson, M. Mathias, R. Timofte and L. Van Gool's and Dallal's papers
+//             static const float A[2][2] =
+//             {   //channel <= 6, otherwise
+//                 {        0.89f, 1.f}, // down
+//                 {        1.00f, 1.f}  // up
+//             };
 
-            static const float B[2][2] =
-            {   //channel <= 6,  otherwise
-                { 1.099f / log(2), 2.f}, // down
-                {             0.f, 2.f}  // up
-            };
+//             static const float B[2][2] =
+//             {   //channel <= 6,  otherwise
+//                 { 1.099f / log(2), 2.f}, // down
+//                 {             0.f, 2.f}  // up
+//             };
 
-            float a = A[(int)(scaling >= 1)][(int)(channel > 6)];
-            float b = B[(int)(scaling >= 1)][(int)(channel > 6)];
+//             float a = A[(int)(scaling >= 1)][(int)(channel > 6)];
+//             float b = B[(int)(scaling >= 1)][(int)(channel > 6)];
 
-            // printf("!!! scaling: %f %f %f -> %f\n", scaling, a, b, a * pow(scaling, b));
-            return a * pow(scaling, b);
-        }
-    };
-}
+//             // printf("!!! scaling: %f %f %f -> %f\n", scaling, a, b, a * pow(scaling, b));
+//             return a * pow(scaling, b);
+//         }
+//     };
+// }
 
-inline void cv::gpu::SoftCascade::Filds::calcLevels(const std::vector<icf::Octave>& octs,
-                                                    int frameW, int frameH, int nscales)
-{
-    CV_Assert(nscales > 1);
+// inline void cv::gpu::SoftCascade::Filds::calcLevels(const std::vector<icf::Octave>& octs,
+//                                                     int frameW, int frameH, int nscales)
+// {
+//     CV_Assert(nscales > 1);
 
-    std::vector<icf::Level> vlevels;
-    float logFactor = (::log(maxScale) - ::log(minScale)) / (nscales -1);
+//     std::vector<icf::Level> vlevels;
+//     float logFactor = (::log(maxScale) - ::log(minScale)) / (nscales -1);
 
-    float scale = minScale;
-    for (int sc = 0; sc < nscales; ++sc)
-    {
-        int width  = ::std::max(0.0f, frameW - (origObjWidth  * scale));
-        int height = ::std::max(0.0f, frameH - (origObjHeight * scale));
+//     float scale = minScale;
+//     for (int sc = 0; sc < nscales; ++sc)
+//     {
+//         int width  = ::std::max(0.0f, frameW - (origObjWidth  * scale));
+//         int height = ::std::max(0.0f, frameH - (origObjHeight * scale));
 
-        float logScale = ::log(scale);
-        int fit = fitOctave(octs, logScale);
+//         float logScale = ::log(scale);
+//         int fit = fitOctave(octs, logScale);
 
-        icf::Level level(fit, octs[fit], scale, width, height);
-        level.scaling[0] = CascadeIntrinsics::getFor(0, level.relScale);
-        level.scaling[1] = CascadeIntrinsics::getFor(9, level.relScale);
+//         icf::Level level(fit, octs[fit], scale, width, height);
+//         level.scaling[0] = CascadeIntrinsics::getFor(0, level.relScale);
+//         level.scaling[1] = CascadeIntrinsics::getFor(9, level.relScale);
 
-        if (!width || !height)
-            break;
-        else
-            vlevels.push_back(level);
+//         if (!width || !height)
+//             break;
+//         else
+//             vlevels.push_back(level);
 
-        if (::fabs(scale - maxScale) < FLT_EPSILON) break;
-        scale = ::std::min(maxScale, ::expf(::log(scale) + logFactor));
+//         if (::fabs(scale - maxScale) < FLT_EPSILON) break;
+//         scale = ::std::min(maxScale, ::expf(::log(scale) + logFactor));
 
-        // printf("level: %d (%f %f) [%f %f] (%d %d) (%d %d)\n", level.octave, level.relScale, level.shrScale,
-        //     level.scaling[0], level.scaling[1], level.workRect.x, level.workRect.y, level.objSize.x, level.objSize.y);
+//         // printf("level: %d (%f %f) [%f %f] (%d %d) (%d %d)\n", level.octave, level.relScale, level.shrScale,
+//         //     level.scaling[0], level.scaling[1], level.workRect.x, level.workRect.y, level.objSize.x, level.objSize.y);
 
-        // std::cout << "level " << sc
-        //           << " octeve "
-        //           << vlevels[sc].octave
-        //           << " relScale "
-        //           << vlevels[sc].relScale
-        //           << " " << vlevels[sc].shrScale
-        //           << " [" << (int)vlevels[sc].objSize.x
-        //           << " " <<  (int)vlevels[sc].objSize.y << "] ["
-        // <<  (int)vlevels[sc].workRect.x << " " <<  (int)vlevels[sc].workRect.y << "]" << std::endl;
-    }
-    levels.upload(cv::Mat(1, vlevels.size() * sizeof(icf::Level), CV_8UC1, (uchar*)&(vlevels[0]) ));
-}
+//         // std::cout << "level " << sc
+//         //           << " octeve "
+//         //           << vlevels[sc].octave
+//         //           << " relScale "
+//         //           << vlevels[sc].relScale
+//         //           << " " << vlevels[sc].shrScale
+//         //           << " [" << (int)vlevels[sc].objSize.x
+//         //           << " " <<  (int)vlevels[sc].objSize.y << "] ["
+//         // <<  (int)vlevels[sc].workRect.x << " " <<  (int)vlevels[sc].workRect.y << "]" << std::endl;
+//     }
+//     levels.upload(cv::Mat(1, vlevels.size() * sizeof(icf::Level), CV_8UC1, (uchar*)&(vlevels[0]) ));
+// }
 
 cv::gpu::SoftCascade::SoftCascade() : filds(0) {}
 
@@ -419,97 +423,89 @@ bool cv::gpu::SoftCascade::load( const string& filename, const float minScale, c
     if (!fs.isOpened()) return false;
 
     filds = new Filds;
-    Filds& flds = *filds;
-    if (!flds.fill(fs.getFirstTopLevelNode(), minScale, maxScale)) return false;
+//     Filds& flds = *filds;
+//     if (!flds.fill(fs.getFirstTopLevelNode(), minScale, maxScale)) return false;
     return true;
 }
 
-namespace {
-    char *itoa(long i, char* s, int /*dummy_radix*/)
-    {
-        sprintf(s, "%ld", i);
-        return s;
-    }
-}
-
-#define USE_REFERENCE_VALUES
+// #define USE_REFERENCE_VALUES
 void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat& /*rois*/,
                                 GpuMat& objects, const int /*rejectfactor*/, Stream s)
 {
-    // only color images are supperted
-    CV_Assert(colored.type() == CV_8UC3);
+//     // only color images are supperted
+//     CV_Assert(colored.type() == CV_8UC3);
 
-    // // only this window size allowed
-    CV_Assert(colored.cols == 640 && colored.rows == 480);
+//     // // only this window size allowed
+//     CV_Assert(colored.cols == 640 && colored.rows == 480);
 
-    Filds& flds = *filds;
+//     Filds& flds = *filds;
 
-#if defined USE_REFERENCE_VALUES
-    cudaMemset(flds.hogluv.data, 0, flds.hogluv.step * flds.hogluv.rows);
-    cv::FileStorage imgs("/home/kellan/testInts.xml", cv::FileStorage::READ);
-    char buff[33];
+// #if defined USE_REFERENCE_VALUES
+//     cudaMemset(flds.hogluv.data, 0, flds.hogluv.step * flds.hogluv.rows);
+//     cv::FileStorage imgs("/home/kellan/testInts.xml", cv::FileStorage::READ);
+//     char buff[33];
 
-    for(int i = 0; i < Filds::HOG_LUV_BINS; ++i)
-    {
-        cv::Mat channel;
-        imgs[std::string("channel") + itoa(i, buff, 10)] >> channel;
-        GpuMat gchannel(flds.hogluv, cv::Rect(0, 121 * i, 161, 121));
-        gchannel.upload(channel);
-    }
-#else
-    GpuMat& dmem = flds.dmem;
-    cudaMemset(dmem.data, 0, dmem.step * dmem.rows);
-    GpuMat& shrunk = flds.shrunk;
-    int w = shrunk.cols;
-    int h = colored.rows / flds.storage.shrinkage;
+//     for(int i = 0; i < Filds::HOG_LUV_BINS; ++i)
+//     {
+//         cv::Mat channel;
+//         imgs[std::string("channel") + itoa(i, buff, 10)] >> channel;
+//         GpuMat gchannel(flds.hogluv, cv::Rect(0, 121 * i, 161, 121));
+//         gchannel.upload(channel);
+//     }
+// #else
+//     GpuMat& dmem = flds.dmem;
+//     cudaMemset(dmem.data, 0, dmem.step * dmem.rows);
+//     GpuMat& shrunk = flds.shrunk;
+//     int w = shrunk.cols;
+//     int h = colored.rows / flds.storage.shrinkage;
 
-    std::vector<GpuMat> splited;
-    for(int i = 0; i < 3; ++i)
-    {
-        splited.push_back(GpuMat(dmem, cv::Rect(0, colored.rows * (7 + i), colored.cols, colored.rows)));
-    }
+//     std::vector<GpuMat> splited;
+//     for(int i = 0; i < 3; ++i)
+//     {
+//         splited.push_back(GpuMat(dmem, cv::Rect(0, colored.rows * (7 + i), colored.cols, colored.rows)));
+//     }
 
-    GpuMat gray(dmem, cv::Rect(0, colored.rows * 10, colored.cols, colored.rows) );
+//     GpuMat gray(dmem, cv::Rect(0, colored.rows * 10, colored.cols, colored.rows) );
 
-    cv::gpu::cvtColor(colored, gray, CV_RGB2GRAY);
+//     cv::gpu::cvtColor(colored, gray, CV_RGB2GRAY);
 
-    //create hog
-    cv::gpu::Sobel(gray, flds.dfdx, CV_32F, 1, 0, 3, 0.25);
-    cv::gpu::Sobel(gray, flds.dfdy, CV_32F, 0, 1, 3, 0.25);
+//     //create hog
+//     cv::gpu::Sobel(gray, flds.dfdx, CV_32F, 1, 0, 3, 0.25);
+//     cv::gpu::Sobel(gray, flds.dfdy, CV_32F, 0, 1, 3, 0.25);
 
-    cv::gpu::cartToPolar(flds.dfdx, flds.dfdy, flds.mag, flds.angle, true);
+//     cv::gpu::cartToPolar(flds.dfdx, flds.dfdy, flds.mag, flds.angle, true);
 
-    cv::gpu::multiply(flds.mag, cv::Scalar::all(1.0 / ::log(2)), flds.nmag);
-    cv::gpu::multiply(flds.angle, cv::Scalar::all(1.0 / 60.0), flds.nangle);
+//     cv::gpu::multiply(flds.mag, cv::Scalar::all(1.0 / ::log(2)), flds.nmag);
+//     cv::gpu::multiply(flds.angle, cv::Scalar::all(1.0 / 60.0), flds.nangle);
 
-    GpuMat magCannel(dmem, cv::Rect(0, colored.rows * 6, colored.cols, colored.rows));
-    flds.nmag.convertTo(magCannel, CV_8UC1);
-    device::icf::fillBins(dmem, flds.nangle);
+//     GpuMat magCannel(dmem, cv::Rect(0, colored.rows * 6, colored.cols, colored.rows));
+//     flds.nmag.convertTo(magCannel, CV_8UC1);
+//     device::icf::fillBins(dmem, flds.nangle);
 
-    // create luv
-    cv::gpu::cvtColor(colored, flds.luv, CV_BGR2Luv);
-    cv::gpu::split(flds.luv, splited);
+//     // create luv
+//     cv::gpu::cvtColor(colored, flds.luv, CV_BGR2Luv);
+//     cv::gpu::split(flds.luv, splited);
 
-    GpuMat plane(dmem, cv::Rect(0, 0, colored.cols, colored.rows * Filds::HOG_LUV_BINS));
-    cv::gpu::resize(plane, flds.shrunk, cv::Size(), 0.25, 0.25, CV_INTER_AREA);
+//     GpuMat plane(dmem, cv::Rect(0, 0, colored.cols, colored.rows * Filds::HOG_LUV_BINS));
+//     cv::gpu::resize(plane, flds.shrunk, cv::Size(), 0.25, 0.25, CV_INTER_AREA);
 
-    // fer debug purpose
-    // cudaMemset(flds.hogluv.data, 0, flds.hogluv.step * flds.hogluv.rows);
+//     // fer debug purpose
+//     // cudaMemset(flds.hogluv.data, 0, flds.hogluv.step * flds.hogluv.rows);
 
-    for(int i = 0; i < Filds::HOG_LUV_BINS; ++i)
-    {
-        GpuMat channel(shrunk, cv::Rect(0, h  * i, w, h ));
-        GpuMat sum(flds.hogluv, cv::Rect(0, (h + 1) * i, w + 1, h + 1));
-        cv::gpu::integralBuffered(channel, sum, flds.integralBuffer);
-    }
+//     for(int i = 0; i < Filds::HOG_LUV_BINS; ++i)
+//     {
+//         GpuMat channel(shrunk, cv::Rect(0, h  * i, w, h ));
+//         GpuMat sum(flds.hogluv, cv::Rect(0, (h + 1) * i, w + 1, h + 1));
+//         cv::gpu::integralBuffered(channel, sum, flds.integralBuffer);
+    // }
 
-#endif
+// #endif
 
-    cudaStream_t stream = StreamAccessor::getStream(s);
-    // detection
-    flds.detect(objects, stream);
+//     cudaStream_t stream = StreamAccessor::getStream(s);
+//     // detection
+//     flds.detect(objects, stream);
 
-    // flds.storage.frame(colored, stream);
+//     // flds.storage.frame(colored, stream);
 }
 
 #endif
\ No newline at end of file

From 4d9c7c1012be346f64fb50d50fd52d6ecb16b07e Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Wed, 26 Sep 2012 13:34:21 +0400
Subject: [PATCH 20/74] preprocessing ~1.981 ms

---
 modules/gpu/src/cuda/isf-sc.cu  |  54 +++++-----
 modules/gpu/src/softcascade.cpp | 175 +++++++++++++++++++-------------
 2 files changed, 133 insertions(+), 96 deletions(-)

diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu
index 33b2222c7..e4831e2e6 100644
--- a/modules/gpu/src/cuda/isf-sc.cu
+++ b/modules/gpu/src/cuda/isf-sc.cu
@@ -40,6 +40,7 @@
 //
 //M*/
 
+#include <opencv2/gpu/device/common.hpp>
 // #include <icf.hpp>
 // #include <opencv2/gpu/device/saturate_cast.hpp>
 // #include <stdio.h>
@@ -54,9 +55,8 @@
 // # define dprintf(format, ...)
 // #endif
 
-// namespace cv { namespace gpu { namespace device {
-
-// namespace icf {
+namespace cv { namespace gpu { namespace device {
+namespace icf {
 
 //     enum {
 //         HOG_BINS = 6,
@@ -66,33 +66,35 @@
 //         GREY_OFFSET = HEIGHT * HOG_LUV_BINS
 //     };
 
-//     __global__ void magToHist(const uchar* __restrict__ mag,
-//                               const float* __restrict__ angle, const int angPitch,
-//                                     uchar* __restrict__ hog,   const int hogPitch)
-//     {
-//         const int y = blockIdx.y * blockDim.y + threadIdx.y;
-//         const int x = blockIdx.x * blockDim.x + threadIdx.x;
+    // ToDo: use textures or ancached load instruction.
+    __global__ void magToHist(const uchar* __restrict__ mag,
+                              const float* __restrict__ angle, const int angPitch,
+                                    uchar* __restrict__ hog,   const int hogPitch, const int fh)
+    {
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
 
-//         const int bin = (int)(angle[y * angPitch + x]);
-//         const uchar val = mag[y * angPitch + x];
+        const int bin = (int)(angle[y * angPitch + x]);
+        const uchar val = mag[y * hogPitch + x];
+        hog[((fh * bin) + y) * hogPitch + x] = val;
+    }
 
-//         hog[((HEIGHT * bin) + y) * hogPitch + x] = val;
-//     }
+    void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle,
+                  const int fw,  const int fh, const int bins)
+    {
+        const uchar* mag = (const uchar*)hogluv.ptr(fh * bins);
+        uchar* hog = (uchar*)hogluv.ptr();
+        const float* angle = (const float*)nangle.ptr();
 
-//     void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle)
-//     {
-//         const uchar* mag = (const uchar*)hogluv.ptr(HEIGHT * HOG_BINS);
-//         uchar* hog = (uchar*)hogluv.ptr();
-//         const float* angle = (const float*)nangle.ptr();
+        dim3 block(32, 8);
+        dim3 grid(fw / 32, fh / 8);
 
-//         dim3 block(32, 8);
-//         dim3 grid(WIDTH / 32, HEIGHT / 8);
-
-//         magToHist<<<grid, block>>>(mag, angle, nangle.step / sizeof(float), hog, hogluv.step);
-//         cudaSafeCall( cudaGetLastError() );
-//         cudaSafeCall( cudaDeviceSynchronize() );
-//     }
-// }
+        magToHist<<<grid, block>>>(mag, angle, nangle.step / sizeof(float), hog, hogluv.step, fh);
+        cudaSafeCall( cudaGetLastError() );
+        cudaSafeCall( cudaDeviceSynchronize() );
+    }
+}
+}}}
 
 // __global__ void detect(const cv::gpu::icf::Cascade cascade, const int* __restrict__ hogluv, const int pitch,
 //     PtrStepSz<uchar4> objects)
diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index c4334ca1d..f336fd2d9 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -49,17 +49,18 @@ cv::gpu::SoftCascade::SoftCascade() : filds(0) { throw_nogpu(); }
 cv::gpu::SoftCascade::SoftCascade( const string&, const float, const float) : filds(0) { throw_nogpu(); }
 cv::gpu::SoftCascade::~SoftCascade() { throw_nogpu(); }
 bool cv::gpu::SoftCascade::load( const string&, const float, const float) { throw_nogpu(); return false; }
-void cv::gpu::SoftCascade::detectMultiScale(const GpuMat&, const GpuMat&, GpuMat&, const int, Stream) { throw_nogpu(); }
+void cv::gpu::SoftCascade::detectMultiScale(const GpuMat&, const GpuMat&, GpuMat&, const int, Stream) { throw_nogpu();}
 
 #else
 
 // #include <icf.hpp>
 
-// namespace cv { namespace gpu { namespace device {
-// namespace icf {
-//     void fillBins(cv::gpu::PtrStepSzb hogluv,const cv::gpu::PtrStepSzf& nangle);
-// }
-// }}}
+namespace cv { namespace gpu { namespace device {
+namespace icf {
+    void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle,
+        const int fw, const int fh, const int bins);
+}
+}}}
 
 // namespace {
 //     char *itoa(long i, char* s, int /*dummy_radix*/)
@@ -71,6 +72,16 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat&, const GpuMat&, GpuMat
 
 struct cv::gpu::SoftCascade::Filds
 {
+
+    Filds()
+    {
+        plane.create(FRAME_HEIGHT * (HOG_LUV_BINS + 1), FRAME_WIDTH, CV_8UC1);
+        fplane.create(FRAME_HEIGHT * 6, FRAME_WIDTH, CV_32FC1);
+        luv.create(FRAME_HEIGHT, FRAME_WIDTH, CV_8UC3);
+        shrunk.create(FRAME_HEIGHT / 4 * HOG_LUV_BINS, FRAME_WIDTH / 4, CV_8UC1);
+        integralBuffer.create(shrunk.rows + 1 * HOG_LUV_BINS, shrunk.cols + 1, CV_32SC1);
+        hogluv.create((FRAME_HEIGHT / 4 + 1) * HOG_LUV_BINS, FRAME_WIDTH / 4 + 1, CV_32SC1);
+    }
 //     // scales range
 //     float minScale;
 //     float maxScale;
@@ -85,19 +96,26 @@ struct cv::gpu::SoftCascade::Filds
 //     GpuMat features;
 //     GpuMat levels;
 
-//     // preallocated buffer 640x480x10 + 640x480
-//     GpuMat dmem;
-//     // 160x120x10
-//     GpuMat shrunk;
-//     // 161x121x10
-//     GpuMat hogluv;
+    // preallocated buffer 640x480x10 for hogluv + 640x480 got gray
+    GpuMat plane;
+
+    // preallocated buffer for floating point operations
+    GpuMat fplane;
+
+    // temporial mat for cvtColor
+    GpuMat luv;
+
+    // 160x120x10
+    GpuMat shrunk;
+
+    // temporial mat for integrall
+    GpuMat integralBuffer;
+
+    // 161x121x10
+    GpuMat hogluv;
 
 //     // will be removed in final version
-//     // temporial mat for cvtColor
-//     GpuMat luv;
 
-//     // temporial mat for integrall
-//     GpuMat integralBuffer;
 
 //     // temp matrix for sobel and cartToPolar
 //     GpuMat dfdx, dfdy, angle, mag, nmag, nangle;
@@ -108,17 +126,18 @@ struct cv::gpu::SoftCascade::Filds
 //     icf::ChannelStorage storage;
 
 //     enum { BOOST = 0 };
-//     enum
-//     {
-//         FRAME_WIDTH        = 640,
-//         FRAME_HEIGHT       = 480,
+    enum
+    {
+        FRAME_WIDTH        = 640,
+        FRAME_HEIGHT       = 480,
 //         TOTAL_SCALES       = 55,
 //         CLASSIFIERS        = 5,
 //         ORIG_OBJECT_WIDTH  = 64,
 //         ORIG_OBJECT_HEIGHT = 128,
-//         HOG_BINS           = 6,
-//         HOG_LUV_BINS       = 10
-//     };
+        HOG_BINS           = 6,
+        LUV_BINS           = 3,
+        HOG_LUV_BINS       = 10
+    };
 
 //     bool fill(const FileNode &root, const float mins, const float maxs);
 //     void detect(cv::gpu::GpuMat objects, cudaStream_t stream) const
@@ -386,7 +405,8 @@ struct cv::gpu::SoftCascade::Filds
 //         scale = ::std::min(maxScale, ::expf(::log(scale) + logFactor));
 
 //         // printf("level: %d (%f %f) [%f %f] (%d %d) (%d %d)\n", level.octave, level.relScale, level.shrScale,
-//         //     level.scaling[0], level.scaling[1], level.workRect.x, level.workRect.y, level.objSize.x, level.objSize.y);
+//         //     level.scaling[0], level.scaling[1], level.workRect.x, level.workRect.y, level.objSize.x,
+//level.objSize.y);
 
 //         // std::cout << "level " << sc
 //         //           << " octeve "
@@ -423,8 +443,8 @@ bool cv::gpu::SoftCascade::load( const string& filename, const float minScale, c
     if (!fs.isOpened()) return false;
 
     filds = new Filds;
-//     Filds& flds = *filds;
-//     if (!flds.fill(fs.getFirstTopLevelNode(), minScale, maxScale)) return false;
+    Filds& flds = *filds;
+    // if (!flds.fill(fs.getFirstTopLevelNode(), minScale, maxScale)) return false;
     return true;
 }
 
@@ -432,15 +452,15 @@ bool cv::gpu::SoftCascade::load( const string& filename, const float minScale, c
 void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat& /*rois*/,
                                 GpuMat& objects, const int /*rejectfactor*/, Stream s)
 {
-//     // only color images are supperted
-//     CV_Assert(colored.type() == CV_8UC3);
+    // only color images are supperted
+    CV_Assert(colored.type() == CV_8UC3);
 
-//     // // only this window size allowed
-//     CV_Assert(colored.cols == 640 && colored.rows == 480);
+    // only this window size allowed
+    CV_Assert(colored.cols == Filds::FRAME_WIDTH && colored.rows == Filds::FRAME_HEIGHT);
 
-//     Filds& flds = *filds;
+    Filds& flds = *filds;
 
-// #if defined USE_REFERENCE_VALUES
+#if defined USE_REFERENCE_VALUES
 //     cudaMemset(flds.hogluv.data, 0, flds.hogluv.step * flds.hogluv.rows);
 //     cv::FileStorage imgs("/home/kellan/testInts.xml", cv::FileStorage::READ);
 //     char buff[33];
@@ -452,57 +472,72 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat&
 //         GpuMat gchannel(flds.hogluv, cv::Rect(0, 121 * i, 161, 121));
 //         gchannel.upload(channel);
 //     }
-// #else
-//     GpuMat& dmem = flds.dmem;
-//     cudaMemset(dmem.data, 0, dmem.step * dmem.rows);
-//     GpuMat& shrunk = flds.shrunk;
-//     int w = shrunk.cols;
-//     int h = colored.rows / flds.storage.shrinkage;
+#else
+    GpuMat& plane = flds.plane;
+    GpuMat& shrunk = flds.shrunk;
+    cudaMemset(plane.data, 0, plane.step * plane.rows);
 
-//     std::vector<GpuMat> splited;
-//     for(int i = 0; i < 3; ++i)
-//     {
-//         splited.push_back(GpuMat(dmem, cv::Rect(0, colored.rows * (7 + i), colored.cols, colored.rows)));
-//     }
+    int fw = Filds::FRAME_WIDTH;
+    int fh = Filds::FRAME_HEIGHT;
 
-//     GpuMat gray(dmem, cv::Rect(0, colored.rows * 10, colored.cols, colored.rows) );
+    GpuMat gray(plane, cv::Rect(0, fh * Filds::HOG_LUV_BINS, fw, fh));
 
-//     cv::gpu::cvtColor(colored, gray, CV_RGB2GRAY);
+    //cv::gpu::cvtColor(colored, gray, CV_RGB2GRAY);
+    cv::gpu::cvtColor(colored, gray, CV_BGR2GRAY);
 
-//     //create hog
-//     cv::gpu::Sobel(gray, flds.dfdx, CV_32F, 1, 0, 3, 0.25);
-//     cv::gpu::Sobel(gray, flds.dfdy, CV_32F, 0, 1, 3, 0.25);
+    //create hog
+    GpuMat dfdx(flds.fplane, cv::Rect(0,  0, fw, fh));
+    GpuMat dfdy(flds.fplane, cv::Rect(0, fh, fw, fh));
 
-//     cv::gpu::cartToPolar(flds.dfdx, flds.dfdy, flds.mag, flds.angle, true);
+    cv::gpu::Sobel(gray, dfdx, CV_32F, 1, 0, 3, 0.125f);
+    cv::gpu::Sobel(gray, dfdy, CV_32F, 0, 1, 3, 0.125f);
 
-//     cv::gpu::multiply(flds.mag, cv::Scalar::all(1.0 / ::log(2)), flds.nmag);
-//     cv::gpu::multiply(flds.angle, cv::Scalar::all(1.0 / 60.0), flds.nangle);
+    GpuMat mag(flds.fplane, cv::Rect(0, 2 * fh, fw, fh));
+    GpuMat ang(flds.fplane, cv::Rect(0, 3 * fh, fw, fh));
 
-//     GpuMat magCannel(dmem, cv::Rect(0, colored.rows * 6, colored.cols, colored.rows));
-//     flds.nmag.convertTo(magCannel, CV_8UC1);
-//     device::icf::fillBins(dmem, flds.nangle);
+    cv::gpu::cartToPolar(dfdx, dfdy, mag, ang, true);
 
-//     // create luv
-//     cv::gpu::cvtColor(colored, flds.luv, CV_BGR2Luv);
-//     cv::gpu::split(flds.luv, splited);
+    // normolize magnitude to uchar interval and angles to 6 bins
 
-//     GpuMat plane(dmem, cv::Rect(0, 0, colored.cols, colored.rows * Filds::HOG_LUV_BINS));
-//     cv::gpu::resize(plane, flds.shrunk, cv::Size(), 0.25, 0.25, CV_INTER_AREA);
+    GpuMat nmag(flds.fplane, cv::Rect(0, 4 * fh, fw, fh));
+    GpuMat nang(flds.fplane, cv::Rect(0, 5 * fh, fw, fh));
 
-//     // fer debug purpose
-//     // cudaMemset(flds.hogluv.data, 0, flds.hogluv.step * flds.hogluv.rows);
+    cv::gpu::multiply(mag, cv::Scalar::all(1.f / ::log(2)), nmag);
+    cv::gpu::multiply(ang, cv::Scalar::all(1.f / 60.f),     nang);
 
-//     for(int i = 0; i < Filds::HOG_LUV_BINS; ++i)
-//     {
-//         GpuMat channel(shrunk, cv::Rect(0, h  * i, w, h ));
-//         GpuMat sum(flds.hogluv, cv::Rect(0, (h + 1) * i, w + 1, h + 1));
-//         cv::gpu::integralBuffered(channel, sum, flds.integralBuffer);
-    // }
+    //create uchar magnitude
+    GpuMat cmag(plane, cv::Rect(0, fh * Filds::HOG_BINS, fw, fh));
+    nmag.convertTo(cmag, CV_8UC1);
 
-// #endif
+    // create luv
+    cv::gpu::cvtColor(colored, flds.luv, CV_BGR2Luv);
 
-//     cudaStream_t stream = StreamAccessor::getStream(s);
-//     // detection
+    std::vector<GpuMat> splited;
+    for(int i = 0; i < Filds::LUV_BINS; ++i)
+    {
+        splited.push_back(GpuMat(plane, cv::Rect(0, fh * (7 + i), fw, fh)));
+    }
+
+    cv::gpu::split(flds.luv, splited);
+
+    device::icf::fillBins(plane, nang, fw, fh, Filds::HOG_BINS);
+
+    GpuMat hogluv(plane, cv::Rect(0, 0, fw, fh * Filds::HOG_LUV_BINS));
+    cv::gpu::resize(hogluv, flds.shrunk, cv::Size(), 0.25, 0.25, CV_INTER_AREA);
+
+    fw /= 4;
+    fh /= 4;
+    for(int i = 0; i < Filds::HOG_LUV_BINS; ++i)
+    {
+        GpuMat channel(shrunk, cv::Rect(0, fh  * i, fw, fh ));
+        GpuMat sum(flds.hogluv, cv::Rect(0, (fh + 1) * i, fw + 1, fh + 1));
+        cv::gpu::integralBuffered(channel, sum, flds.integralBuffer);
+    }
+
+#endif
+
+    cudaStream_t stream = StreamAccessor::getStream(s);
+    // detection
 //     flds.detect(objects, stream);
 
 //     // flds.storage.frame(colored, stream);

From b83d4add2ea096b3481a838f9e26a038d10a93d5 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Wed, 26 Sep 2012 17:15:17 +0400
Subject: [PATCH 21/74] memory optimization

---
 modules/gpu/src/cuda/isf-sc.cu  |  56 +++-
 modules/gpu/src/icf.hpp         | 106 ++++---
 modules/gpu/src/softcascade.cpp | 489 +++++++++++++++-----------------
 3 files changed, 339 insertions(+), 312 deletions(-)

diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu
index e4831e2e6..714bdfa44 100644
--- a/modules/gpu/src/cuda/isf-sc.cu
+++ b/modules/gpu/src/cuda/isf-sc.cu
@@ -41,9 +41,9 @@
 //M*/
 
 #include <opencv2/gpu/device/common.hpp>
-// #include <icf.hpp>
+#include <icf.hpp>
 // #include <opencv2/gpu/device/saturate_cast.hpp>
-// #include <stdio.h>
+#include <stdio.h>
 // #include <float.h>
 
 // //#define LOG_CUDA_CASCADE
@@ -93,6 +93,58 @@ namespace icf {
         cudaSafeCall( cudaGetLastError() );
         cudaSafeCall( cudaDeviceSynchronize() );
     }
+
+    texture<float2,  cudaTextureType1D, cudaReadModeElementType> tnode;
+    __global__ void test_kernel(const Level* levels, const Octave* octaves, const float* stages,
+        const Node* nodes,
+        PtrStepSz<uchar4> objects)
+    {
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        Level level = levels[blockIdx.z];
+        if(x >= level.workRect.x || y >= level.workRect.y) return;
+
+        Octave octave = octaves[level.octave];
+        int st = octave.index * octave.stages;
+        const int stEnd = st + 1000;//octave.stages;
+
+        float confidence = 0.f;
+
+#pragma unroll 8
+        for(; st < stEnd; ++st)
+        {
+            const int nId = st * 3;
+            const Node node = nodes[nId];
+
+            const float stage = stages[st];
+            confidence += node.rect.x * stage;
+        }
+
+        uchar4 val;
+        val.x = (int)confidence;
+        if (x == y) objects(0, threadIdx.x) = val;
+
+    }
+
+    void detect(const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages,
+        const PtrStepSzb& nodes, const PtrStepSzb& features,
+        PtrStepSz<uchar4> objects)
+    {
+        int fw = 160;
+        int fh = 120;
+        dim3 block(32, 8);
+        dim3 grid(fw / 32, fh / 8, 47);
+        const Level* l = (const Level*)levels.ptr();
+        const Octave* oct = ((const Octave*)octaves.ptr());
+        const float* st = (const float*)stages.ptr();
+        const Node* nd = (const Node*)nodes.ptr();
+        // cudaSafeCall( cudaBindTexture(0, tnode, nodes.data, rgb.cols / size) );
+
+        test_kernel<<<grid, block>>>(l, oct, st, nd, objects);
+
+        cudaSafeCall( cudaGetLastError());
+        cudaSafeCall( cudaDeviceSynchronize());
+    }
 }
 }}}
 
diff --git a/modules/gpu/src/icf.hpp b/modules/gpu/src/icf.hpp
index cf1348007..51ea2c068 100644
--- a/modules/gpu/src/icf.hpp
+++ b/modules/gpu/src/icf.hpp
@@ -1,4 +1,4 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
+//M///////////////////////////////////////////////////////////////////////////////////////
 //
 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
 //
@@ -38,12 +38,12 @@
 // or tort (including negligence or otherwise) arising in any way out of
 // the use of this software, even if advised of the possibility of such damage.
 //
-//M*/
+//M
 
-// #include <opencv2/gpu/device/common.hpp>
+#include <opencv2/gpu/device/common.hpp>
 
-// #ifndef __OPENCV_ICF_HPP__
-// #define __OPENCV_ICF_HPP__
+#ifndef __OPENCV_ICF_HPP__
+#define __OPENCV_ICF_HPP__
 
 // #if defined __CUDACC__
 // # define __device __device__ __forceinline__
@@ -52,49 +52,62 @@
 // #endif
 
 
-// namespace cv { namespace gpu { namespace icf {
+namespace cv { namespace gpu { namespace device {
+namespace icf {
 
-// using cv::gpu::PtrStepSzb;
-// using cv::gpu::PtrStepSzf;
+struct __align__(16) Octave
+{
+    ushort index;
+    ushort stages;
+    ushort shrinkage;
+    ushort2 size;
+    float scale;
 
-// typedef unsigned char uchar;
+    Octave(const ushort i, const ushort s, const ushort sh, const ushort2 sz, const float sc)
+    : index(i), stages(s), shrinkage(sh), size(sz), scale(sc) {}
+};
 
-// struct __align__(16) Octave
-// {
-//     ushort index;
-//     ushort stages;
-//     ushort shrinkage;
-//     ushort2 size;
-//     float scale;
+struct __align__(8) Level //is actually 24 bytes
+{
+    int octave;
 
-//     Octave(const ushort i, const ushort s, const ushort sh, const ushort2 sz, const float sc)
-//     : index(i), stages(s), shrinkage(sh), size(sz), scale(sc) {}
-// };
+    float relScale;
+    float shrScale;   // used for marking detection
+    float scaling[2]; // calculated according to Dollal paper
 
-// struct __align__(8) Level //is actually 24 bytes
-// {
-//     int octave;
+    // for 640x480 we can not get overflow
+    uchar2 workRect;
+    uchar2 objSize;
 
-//     // float origScale; //not actually used
-//     float relScale;
-//     float shrScale;   // used for marking detection
-//     float scaling[2]; // calculated according to Dollal paper
+    Level(int idx, const Octave& oct, const float scale, const int w, const int h)
+    :  octave(idx), relScale(scale / oct.scale), shrScale (relScale / (float)oct.shrinkage)
+    {
+        workRect.x = round(w / (float)oct.shrinkage);
+        workRect.y = round(h / (float)oct.shrinkage);
 
-//     // for 640x480 we can not get overflow
-//     uchar2 workRect;
-//     uchar2 objSize;
+        objSize.x  = round(oct.size.x * relScale);
+        objSize.y  = round(oct.size.y * relScale);
+    }
+};
 
-//     Level(int idx, const Octave& oct, const float scale, const int w, const int h)
-//     :  octave(idx), relScale(scale / oct.scale), shrScale (relScale / (float)oct.shrinkage)
-//     {
-//         workRect.x = round(w / (float)oct.shrinkage);
-//         workRect.y = round(h / (float)oct.shrinkage);
+struct __align__(8) Node
+{
+    // int feature;
+    uchar4 rect;
+    float threshold;
 
-//         objSize.x  = round(oct.size.x * relScale);
-//         objSize.y  = round(oct.size.y * relScale);
-//     }
-// };
+    Node(const uchar4 c, const int t) : rect(c), threshold(t) {}
+};
 
+struct __align__(8) Feature
+{
+    int channel;
+    uchar4 rect;
+
+    Feature(const int c, const uchar4 r) : channel(c), rect(r) {}
+};
+}
+}}}
 // struct Cascade
 // {
 //     Cascade() {}
@@ -146,21 +159,6 @@
 //     static const float magnitudeScaling = 1.f ;// / sqrt(2);
 // };
 
-// struct __align__(8) Node
-// {
-//     int feature;
-//     float threshold;
-
-//     Node(const int f, const float t) : feature(f), threshold(t) {}
-// };
-
-// struct __align__(8) Feature
-// {
-//     int channel;
-//     uchar4 rect;
-
-//     Feature(const int c, const uchar4 r) : channel(c), rect(r) {}
-// };
 // }}}
 
-// #endif
\ No newline at end of file
+#endif
\ No newline at end of file
diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index f336fd2d9..8d75176ab 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -53,12 +53,15 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat&, const GpuMat&, GpuMat
 
 #else
 
-// #include <icf.hpp>
+#include <icf.hpp>
 
 namespace cv { namespace gpu { namespace device {
 namespace icf {
     void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle,
         const int fw, const int fh, const int bins);
+    void detect(const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages,
+        const PtrStepSzb& nodes, const PtrStepSzb& features,
+        PtrStepSz<uchar4> objects);
 }
 }}}
 
@@ -82,19 +85,20 @@ struct cv::gpu::SoftCascade::Filds
         integralBuffer.create(shrunk.rows + 1 * HOG_LUV_BINS, shrunk.cols + 1, CV_32SC1);
         hogluv.create((FRAME_HEIGHT / 4 + 1) * HOG_LUV_BINS, FRAME_WIDTH / 4 + 1, CV_32SC1);
     }
-//     // scales range
-//     float minScale;
-//     float maxScale;
 
-//     int origObjWidth;
-//     int origObjHeight;
+    // scales range
+    float minScale;
+    float maxScale;
 
-//     GpuMat octaves;
-//     GpuMat stages;
-//     GpuMat nodes;
-//     GpuMat leaves;
-//     GpuMat features;
-//     GpuMat levels;
+    int origObjWidth;
+    int origObjHeight;
+
+    GpuMat octaves;
+    GpuMat stages;
+    GpuMat nodes;
+    GpuMat leaves;
+    GpuMat features;
+    GpuMat levels;
 
     // preallocated buffer 640x480x10 for hogluv + 640x480 got gray
     GpuMat plane;
@@ -114,312 +118,285 @@ struct cv::gpu::SoftCascade::Filds
     // 161x121x10
     GpuMat hogluv;
 
-//     // will be removed in final version
+    std::vector<float> scales;
 
-
-//     // temp matrix for sobel and cartToPolar
-//     GpuMat dfdx, dfdy, angle, mag, nmag, nangle;
-
-//     std::vector<float> scales;
-
-//     icf::Cascade cascade;
-//     icf::ChannelStorage storage;
-
-//     enum { BOOST = 0 };
+    enum { BOOST = 0 };
     enum
     {
         FRAME_WIDTH        = 640,
         FRAME_HEIGHT       = 480,
-//         TOTAL_SCALES       = 55,
+        TOTAL_SCALES       = 55,
 //         CLASSIFIERS        = 5,
-//         ORIG_OBJECT_WIDTH  = 64,
-//         ORIG_OBJECT_HEIGHT = 128,
+        ORIG_OBJECT_WIDTH  = 64,
+        ORIG_OBJECT_HEIGHT = 128,
         HOG_BINS           = 6,
         LUV_BINS           = 3,
         HOG_LUV_BINS       = 10
     };
 
-//     bool fill(const FileNode &root, const float mins, const float maxs);
-//     void detect(cv::gpu::GpuMat objects, cudaStream_t stream) const
-//     {
-//         cascade.detect(hogluv, objects, stream);
-//     }
+    bool fill(const FileNode &root, const float mins, const float maxs);
+    void detect(cv::gpu::GpuMat objects, cudaStream_t stream) const
+    {
+        device::icf::detect(levels, octaves, stages, nodes, features, objects);
+    }
 
-// private:
-//     void calcLevels(const std::vector<icf::Octave>& octs,
-//                                                     int frameW, int frameH, int nscales);
+private:
+    void calcLevels(const std::vector<device::icf::Octave>& octs,
+                                                    int frameW, int frameH, int nscales);
 
-//     typedef std::vector<icf::Octave>::const_iterator  octIt_t;
-//     int fitOctave(const std::vector<icf::Octave>& octs, const float& logFactor) const
-//     {
-//         float minAbsLog = FLT_MAX;
-//         int res =  0;
-//         for (int oct = 0; oct < (int)octs.size(); ++oct)
-//         {
-//             const icf::Octave& octave =octs[oct];
-//             float logOctave = ::log(octave.scale);
-//             float logAbsScale = ::fabs(logFactor - logOctave);
+    typedef std::vector<device::icf::Octave>::const_iterator  octIt_t;
+    int fitOctave(const std::vector<device::icf::Octave>& octs, const float& logFactor) const
+    {
+        float minAbsLog = FLT_MAX;
+        int res =  0;
+        for (int oct = 0; oct < (int)octs.size(); ++oct)
+        {
+            const device::icf::Octave& octave =octs[oct];
+            float logOctave = ::log(octave.scale);
+            float logAbsScale = ::fabs(logFactor - logOctave);
 
-//             if(logAbsScale < minAbsLog)
-//             {
-//                 res = oct;
-//                 minAbsLog = logAbsScale;
-//             }
-//         }
-//         return res;
-//     }
+            if(logAbsScale < minAbsLog)
+            {
+                res = oct;
+                minAbsLog = logAbsScale;
+            }
+        }
+        return res;
+    }
 };
 
-// inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float mins, const float maxs)
-// {
-//     minScale = mins;
-//     maxScale = maxs;
+inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float mins, const float maxs)
+{
+    using namespace device::icf;
+    minScale = mins;
+    maxScale = maxs;
 
-//     // cascade properties
-//     static const char *const SC_STAGE_TYPE          = "stageType";
-//     static const char *const SC_BOOST               = "BOOST";
+    // cascade properties
+    static const char *const SC_STAGE_TYPE          = "stageType";
+    static const char *const SC_BOOST               = "BOOST";
 
-//     static const char *const SC_FEATURE_TYPE        = "featureType";
-//     static const char *const SC_ICF                 = "ICF";
+    static const char *const SC_FEATURE_TYPE        = "featureType";
+    static const char *const SC_ICF                 = "ICF";
 
-//     static const char *const SC_ORIG_W              = "width";
-//     static const char *const SC_ORIG_H              = "height";
+    static const char *const SC_ORIG_W              = "width";
+    static const char *const SC_ORIG_H              = "height";
 
-//     static const char *const SC_OCTAVES             = "octaves";
-//     static const char *const SC_STAGES              = "stages";
-//     static const char *const SC_FEATURES            = "features";
+    static const char *const SC_OCTAVES             = "octaves";
+    static const char *const SC_STAGES              = "stages";
+    static const char *const SC_FEATURES            = "features";
 
-//     static const char *const SC_WEEK                = "weakClassifiers";
-//     static const char *const SC_INTERNAL            = "internalNodes";
-//     static const char *const SC_LEAF                = "leafValues";
+    static const char *const SC_WEEK                = "weakClassifiers";
+    static const char *const SC_INTERNAL            = "internalNodes";
+    static const char *const SC_LEAF                = "leafValues";
 
-//     static const char *const SC_OCT_SCALE           = "scale";
-//     static const char *const SC_OCT_STAGES          = "stageNum";
-//     static const char *const SC_OCT_SHRINKAGE       = "shrinkingFactor";
+    static const char *const SC_OCT_SCALE           = "scale";
+    static const char *const SC_OCT_STAGES          = "stageNum";
+    static const char *const SC_OCT_SHRINKAGE       = "shrinkingFactor";
 
-//     static const char *const SC_STAGE_THRESHOLD     = "stageThreshold";
+    static const char *const SC_STAGE_THRESHOLD     = "stageThreshold";
 
-//     static const char * const SC_F_CHANNEL          = "channel";
-//     static const char * const SC_F_RECT             = "rect";
+    static const char * const SC_F_CHANNEL          = "channel";
+    static const char * const SC_F_RECT             = "rect";
 
-//     // only Ada Boost supported
-//     std::string stageTypeStr = (string)root[SC_STAGE_TYPE];
-//     CV_Assert(stageTypeStr == SC_BOOST);
+    // only Ada Boost supported
+    std::string stageTypeStr = (string)root[SC_STAGE_TYPE];
+    CV_Assert(stageTypeStr == SC_BOOST);
 
-//     // only HOG-like integral channel features cupported
-//     string featureTypeStr = (string)root[SC_FEATURE_TYPE];
-//     CV_Assert(featureTypeStr == SC_ICF);
+    // only HOG-like integral channel features cupported
+    string featureTypeStr = (string)root[SC_FEATURE_TYPE];
+    CV_Assert(featureTypeStr == SC_ICF);
 
-//     origObjWidth = (int)root[SC_ORIG_W];
-//     CV_Assert(origObjWidth  == ORIG_OBJECT_WIDTH);
+    origObjWidth = (int)root[SC_ORIG_W];
+    CV_Assert(origObjWidth  == ORIG_OBJECT_WIDTH);
 
-//     origObjHeight = (int)root[SC_ORIG_H];
-//     CV_Assert(origObjHeight == ORIG_OBJECT_HEIGHT);
+    origObjHeight = (int)root[SC_ORIG_H];
+    CV_Assert(origObjHeight == ORIG_OBJECT_HEIGHT);
 
-//     FileNode fn = root[SC_OCTAVES];
-//         if (fn.empty()) return false;
+    FileNode fn = root[SC_OCTAVES];
+        if (fn.empty()) return false;
 
-//     std::vector<icf::Octave>  voctaves;
-//     std::vector<float>        vstages;
-//     std::vector<icf::Node>    vnodes;
-//     std::vector<float>        vleaves;
-//     std::vector<icf::Feature> vfeatures;
-//     scales.clear();
+    std::vector<Octave>  voctaves;
+    std::vector<float>        vstages;
+    std::vector<Node>    vnodes;
+    std::vector<float>        vleaves;
+    std::vector<Feature> vfeatures;
+    scales.clear();
 
-//     // std::vector<Level> levels;
+    FileNodeIterator it = fn.begin(), it_end = fn.end();
+    int feature_offset = 0;
+    ushort octIndex = 0;
+    ushort shrinkage = 1;
 
-//     FileNodeIterator it = fn.begin(), it_end = fn.end();
-//     int feature_offset = 0;
-//     ushort octIndex = 0;
-//     ushort shrinkage = 1;
+    for (; it != it_end; ++it)
+    {
+        FileNode fns = *it;
+        float scale = (float)fns[SC_OCT_SCALE];
+        scales.push_back(scale);
+        ushort nstages = saturate_cast<ushort>((int)fns[SC_OCT_STAGES]);
+        ushort2 size;
+        size.x = cvRound(ORIG_OBJECT_WIDTH * scale);
+        size.y = cvRound(ORIG_OBJECT_HEIGHT * scale);
+        shrinkage = saturate_cast<ushort>((int)fns[SC_OCT_SHRINKAGE]);
 
-//     for (; it != it_end; ++it)
-//     {
-//         FileNode fns = *it;
-//         float scale = (float)fns[SC_OCT_SCALE];
-//         scales.push_back(scale);
-//         ushort nstages = saturate_cast<ushort>((int)fns[SC_OCT_STAGES]);
-//         ushort2 size;
-//         size.x = cvRound(ORIG_OBJECT_WIDTH * scale);
-//         size.y = cvRound(ORIG_OBJECT_HEIGHT * scale);
-//         shrinkage = saturate_cast<ushort>((int)fns[SC_OCT_SHRINKAGE]);
+        Octave octave(octIndex, nstages, shrinkage, size, scale);
+        CV_Assert(octave.stages > 0);
+        voctaves.push_back(octave);
 
-//         icf::Octave octave(octIndex, nstages, shrinkage, size, scale);
-//         CV_Assert(octave.stages > 0);
-//         voctaves.push_back(octave);
+        FileNode ffs = fns[SC_FEATURES];
+        if (ffs.empty()) return false;
 
-//         FileNode ffs = fns[SC_FEATURES];
-//         if (ffs.empty()) return false;
+        fns = fns[SC_STAGES];
+        if (fn.empty()) return false;
 
-//         fns = fns[SC_STAGES];
-//         if (fn.empty()) return false;
+        // for each stage (~ decision tree with H = 2)
+        FileNodeIterator st = fns.begin(), st_end = fns.end();
+        for (; st != st_end; ++st )
+        {
+            fns = *st;
+            vstages.push_back((float)fns[SC_STAGE_THRESHOLD]);
 
-//         // for each stage (~ decision tree with H = 2)
-//         FileNodeIterator st = fns.begin(), st_end = fns.end();
-//         for (; st != st_end; ++st )
-//         {
-//             fns = *st;
-//             vstages.push_back((float)fns[SC_STAGE_THRESHOLD]);
+            fns = fns[SC_WEEK];
+            FileNodeIterator ftr = fns.begin(), ft_end = fns.end();
+            for (; ftr != ft_end; ++ftr)
+            {
+                fns = (*ftr)[SC_INTERNAL];
+                FileNodeIterator inIt = fns.begin(), inIt_end = fns.end();
+                for (; inIt != inIt_end;)
+                {
+                    int feature = (int)(*(inIt +=2)++) + feature_offset;
+                    float th = (float)(*(inIt++));
+                    uchar4 rect;
+                    vnodes.push_back(Node(rect, th));
+                }
 
-//             fns = fns[SC_WEEK];
-//             FileNodeIterator ftr = fns.begin(), ft_end = fns.end();
-//             for (; ftr != ft_end; ++ftr)
-//             {
-//                 fns = (*ftr)[SC_INTERNAL];
-//                 FileNodeIterator inIt = fns.begin(), inIt_end = fns.end();
-//                 for (; inIt != inIt_end;)
-//                 {
-//                     int feature = (int)(*(inIt +=2)++) + feature_offset;
-//                     float th = (float)(*(inIt++));
-//                     vnodes.push_back(icf::Node(feature, th));
-//                 }
+                fns = (*ftr)[SC_LEAF];
+                inIt = fns.begin(), inIt_end = fns.end();
+                for (; inIt != inIt_end; ++inIt)
+                    vleaves.push_back((float)(*inIt));
+            }
+        }
 
-//                 fns = (*ftr)[SC_LEAF];
-//                 inIt = fns.begin(), inIt_end = fns.end();
-//                 for (; inIt != inIt_end; ++inIt)
-//                     vleaves.push_back((float)(*inIt));
-//             }
-//         }
+        st = ffs.begin(), st_end = ffs.end();
+        for (; st != st_end; ++st )
+        {
+            cv::FileNode rn = (*st)[SC_F_RECT];
+            cv::FileNodeIterator r_it = rn.begin();
+            uchar4 rect;
+            rect.x = saturate_cast<uchar>((int)*(r_it++));
+            rect.y = saturate_cast<uchar>((int)*(r_it++));
+            rect.z = saturate_cast<uchar>((int)*(r_it++));
+            rect.w = saturate_cast<uchar>((int)*(r_it++));
+            vfeatures.push_back(Feature((int)(*st)[SC_F_CHANNEL], rect));
+        }
 
-//         st = ffs.begin(), st_end = ffs.end();
-//         for (; st != st_end; ++st )
-//         {
-//             cv::FileNode rn = (*st)[SC_F_RECT];
-//             cv::FileNodeIterator r_it = rn.begin();
-//             uchar4 rect;
-//             rect.x = saturate_cast<uchar>((int)*(r_it++));
-//             rect.y = saturate_cast<uchar>((int)*(r_it++));
-//             rect.z = saturate_cast<uchar>((int)*(r_it++));
-//             rect.w = saturate_cast<uchar>((int)*(r_it++));
-//             vfeatures.push_back(icf::Feature((int)(*st)[SC_F_CHANNEL], rect));
-//         }
+        feature_offset += octave.stages * 3;
+        ++octIndex;
+    }
 
-//         feature_offset += octave.stages * 3;
-//         ++octIndex;
-//     }
+    // upload in gpu memory
+    octaves.upload(cv::Mat(1, voctaves.size() * sizeof(Octave), CV_8UC1, (uchar*)&(voctaves[0]) ));
+    CV_Assert(!octaves.empty());
 
-//     // upload in gpu memory
-//     octaves.upload(cv::Mat(1, voctaves.size() * sizeof(icf::Octave), CV_8UC1, (uchar*)&(voctaves[0]) ));
-//     CV_Assert(!octaves.empty());
+    stages.upload(cv::Mat(vstages).reshape(1,1));
+    CV_Assert(!stages.empty());
 
-//     stages.upload(cv::Mat(vstages).reshape(1,1));
-//     CV_Assert(!stages.empty());
+    nodes.upload(cv::Mat(1, vnodes.size() * sizeof(Node), CV_8UC1, (uchar*)&(vnodes[0]) ));
+    CV_Assert(!nodes.empty());
 
-//     nodes.upload(cv::Mat(1, vnodes.size() * sizeof(icf::Node), CV_8UC1, (uchar*)&(vnodes[0]) ));
-//     CV_Assert(!nodes.empty());
+    leaves.upload(cv::Mat(vleaves).reshape(1,1));
+    CV_Assert(!leaves.empty());
 
-//     leaves.upload(cv::Mat(vleaves).reshape(1,1));
-//     CV_Assert(!leaves.empty());
+    features.upload(cv::Mat(1, vfeatures.size() * sizeof(Feature), CV_8UC1, (uchar*)&(vfeatures[0]) ));
+    CV_Assert(!features.empty());
 
-//     features.upload(cv::Mat(1, vfeatures.size() * sizeof(icf::Feature), CV_8UC1, (uchar*)&(vfeatures[0]) ));
-//     CV_Assert(!features.empty());
+    // compute levels
+    calcLevels(voctaves, FRAME_WIDTH, FRAME_HEIGHT, TOTAL_SCALES);
+    CV_Assert(!levels.empty());
 
-//     // compute levels
-//     calcLevels(voctaves, FRAME_WIDTH, FRAME_HEIGHT, TOTAL_SCALES);
-//     CV_Assert(!levels.empty());
+    return true;
+}
 
-//     //init Cascade
-//     cascade = icf::Cascade(octaves, stages, nodes, leaves, features, levels);
+namespace {
+    struct CascadeIntrinsics
+    {
+        static const float lambda = 1.099f, a = 0.89f;
 
-//     // allocate buffers
-//     dmem.create(FRAME_HEIGHT * (HOG_LUV_BINS + 1), FRAME_WIDTH, CV_8UC1);
-//     shrunk.create(FRAME_HEIGHT / shrinkage * HOG_LUV_BINS, FRAME_WIDTH / shrinkage, CV_8UC1);
-//     // hogluv.create( (FRAME_HEIGHT / shrinkage + 1) * HOG_LUV_BINS, (FRAME_WIDTH / shrinkage + 1), CV_16UC1);
-//     hogluv.create( (FRAME_HEIGHT / shrinkage + 1) * HOG_LUV_BINS, (FRAME_WIDTH / shrinkage + 1), CV_32SC1);
-//     luv.create(FRAME_HEIGHT, FRAME_WIDTH, CV_8UC3);
-//     integralBuffer.create(shrunk.rows + 1 * HOG_LUV_BINS, shrunk.cols + 1, CV_32SC1);
+        static float getFor(int channel, float scaling)
+        {
+            CV_Assert(channel < 10);
 
-//     dfdx.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1);
-//     dfdy.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1);
-//     angle.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1);
-//     mag.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1);
+            if (fabs(scaling - 1.f) < FLT_EPSILON)
+                return 1.f;
 
-//     nmag.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1);
-//     nangle.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1);
+            // according to R. Benenson, M. Mathias, R. Timofte and L. Van Gool's and Dallal's papers
+            static const float A[2][2] =
+            {   //channel <= 6, otherwise
+                {        0.89f, 1.f}, // down
+                {        1.00f, 1.f}  // up
+            };
 
-//     storage = icf::ChannelStorage(dmem, shrunk, hogluv, shrinkage);
-//     return true;
-// }
+            static const float B[2][2] =
+            {   //channel <= 6,  otherwise
+                { 1.099f / log(2), 2.f}, // down
+                {             0.f, 2.f}  // up
+            };
 
-// namespace {
-//     struct CascadeIntrinsics
-//     {
-//         static const float lambda = 1.099f, a = 0.89f;
+            float a = A[(int)(scaling >= 1)][(int)(channel > 6)];
+            float b = B[(int)(scaling >= 1)][(int)(channel > 6)];
 
-//         static float getFor(int channel, float scaling)
-//         {
-//             CV_Assert(channel < 10);
+            // printf("!!! scaling: %f %f %f -> %f\n", scaling, a, b, a * pow(scaling, b));
+            return a * pow(scaling, b);
+        }
+    };
+}
 
-//             if (fabs(scaling - 1.f) < FLT_EPSILON)
-//                 return 1.f;
+inline void cv::gpu::SoftCascade::Filds::calcLevels(const std::vector<device::icf::Octave>& octs,
+                                                    int frameW, int frameH, int nscales)
+{
+    CV_Assert(nscales > 1);
+    using device::icf::Level;
 
-//             // according to R. Benenson, M. Mathias, R. Timofte and L. Van Gool's and Dallal's papers
-//             static const float A[2][2] =
-//             {   //channel <= 6, otherwise
-//                 {        0.89f, 1.f}, // down
-//                 {        1.00f, 1.f}  // up
-//             };
+    std::vector<Level> vlevels;
+    float logFactor = (::log(maxScale) - ::log(minScale)) / (nscales -1);
 
-//             static const float B[2][2] =
-//             {   //channel <= 6,  otherwise
-//                 { 1.099f / log(2), 2.f}, // down
-//                 {             0.f, 2.f}  // up
-//             };
+    float scale = minScale;
+    for (int sc = 0; sc < nscales; ++sc)
+    {
+        int width  = ::std::max(0.0f, frameW - (origObjWidth  * scale));
+        int height = ::std::max(0.0f, frameH - (origObjHeight * scale));
 
-//             float a = A[(int)(scaling >= 1)][(int)(channel > 6)];
-//             float b = B[(int)(scaling >= 1)][(int)(channel > 6)];
+        float logScale = ::log(scale);
+        int fit = fitOctave(octs, logScale);
 
-//             // printf("!!! scaling: %f %f %f -> %f\n", scaling, a, b, a * pow(scaling, b));
-//             return a * pow(scaling, b);
-//         }
-//     };
-// }
+        Level level(fit, octs[fit], scale, width, height);
+        level.scaling[0] = CascadeIntrinsics::getFor(0, level.relScale);
+        level.scaling[1] = CascadeIntrinsics::getFor(9, level.relScale);
 
-// inline void cv::gpu::SoftCascade::Filds::calcLevels(const std::vector<icf::Octave>& octs,
-//                                                     int frameW, int frameH, int nscales)
-// {
-//     CV_Assert(nscales > 1);
+        if (!width || !height)
+            break;
+        else
+            vlevels.push_back(level);
 
-//     std::vector<icf::Level> vlevels;
-//     float logFactor = (::log(maxScale) - ::log(minScale)) / (nscales -1);
+        if (::fabs(scale - maxScale) < FLT_EPSILON) break;
+        scale = ::std::min(maxScale, ::expf(::log(scale) + logFactor));
 
-//     float scale = minScale;
-//     for (int sc = 0; sc < nscales; ++sc)
-//     {
-//         int width  = ::std::max(0.0f, frameW - (origObjWidth  * scale));
-//         int height = ::std::max(0.0f, frameH - (origObjHeight * scale));
+        // printf("level: %d (%f %f) [%f %f] (%d %d) (%d %d)\n", level.octave, level.relScale, level.shrScale,
+        //     level.scaling[0], level.scaling[1], level.workRect.x, level.workRect.y, level.objSize.x,
+        //level.objSize.y);
 
-//         float logScale = ::log(scale);
-//         int fit = fitOctave(octs, logScale);
+        std::cout << "level " << sc
+                  << " octeve "
+                  << vlevels[sc].octave
+                  << " relScale "
+                  << vlevels[sc].relScale
+                  << " " << vlevels[sc].shrScale
+                  << " [" << (int)vlevels[sc].objSize.x
+                  << " " <<  (int)vlevels[sc].objSize.y << "] ["
+        <<  (int)vlevels[sc].workRect.x << " " <<  (int)vlevels[sc].workRect.y << "]" << std::endl;
+    }
 
-//         icf::Level level(fit, octs[fit], scale, width, height);
-//         level.scaling[0] = CascadeIntrinsics::getFor(0, level.relScale);
-//         level.scaling[1] = CascadeIntrinsics::getFor(9, level.relScale);
-
-//         if (!width || !height)
-//             break;
-//         else
-//             vlevels.push_back(level);
-
-//         if (::fabs(scale - maxScale) < FLT_EPSILON) break;
-//         scale = ::std::min(maxScale, ::expf(::log(scale) + logFactor));
-
-//         // printf("level: %d (%f %f) [%f %f] (%d %d) (%d %d)\n", level.octave, level.relScale, level.shrScale,
-//         //     level.scaling[0], level.scaling[1], level.workRect.x, level.workRect.y, level.objSize.x,
-//level.objSize.y);
-
-//         // std::cout << "level " << sc
-//         //           << " octeve "
-//         //           << vlevels[sc].octave
-//         //           << " relScale "
-//         //           << vlevels[sc].relScale
-//         //           << " " << vlevels[sc].shrScale
-//         //           << " [" << (int)vlevels[sc].objSize.x
-//         //           << " " <<  (int)vlevels[sc].objSize.y << "] ["
-//         // <<  (int)vlevels[sc].workRect.x << " " <<  (int)vlevels[sc].workRect.y << "]" << std::endl;
-//     }
-//     levels.upload(cv::Mat(1, vlevels.size() * sizeof(icf::Level), CV_8UC1, (uchar*)&(vlevels[0]) ));
-// }
+    levels.upload(cv::Mat(1, vlevels.size() * sizeof(Level), CV_8UC1, (uchar*)&(vlevels[0]) ));
+}
 
 cv::gpu::SoftCascade::SoftCascade() : filds(0) {}
 
@@ -444,7 +421,7 @@ bool cv::gpu::SoftCascade::load( const string& filename, const float minScale, c
 
     filds = new Filds;
     Filds& flds = *filds;
-    // if (!flds.fill(fs.getFirstTopLevelNode(), minScale, maxScale)) return false;
+    if (!flds.fill(fs.getFirstTopLevelNode(), minScale, maxScale)) return false;
     return true;
 }
 
@@ -538,7 +515,7 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat&
 
     cudaStream_t stream = StreamAccessor::getStream(s);
     // detection
-//     flds.detect(objects, stream);
+    flds.detect(objects, stream);
 
 //     // flds.storage.frame(colored, stream);
 }

From 8108bd30febf17e81f8329ccb65f695dd335a471 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Thu, 27 Sep 2012 12:44:06 +0400
Subject: [PATCH 22/74] optimize memory usage

---
 modules/gpu/src/cuda/isf-sc.cu  | 306 +++++++++++++-------------------
 modules/gpu/src/icf.hpp         |  29 +--
 modules/gpu/src/softcascade.cpp |  86 +++++----
 3 files changed, 180 insertions(+), 241 deletions(-)

diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu
index 714bdfa44..c8dff34bd 100644
--- a/modules/gpu/src/cuda/isf-sc.cu
+++ b/modules/gpu/src/cuda/isf-sc.cu
@@ -42,18 +42,17 @@
 
 #include <opencv2/gpu/device/common.hpp>
 #include <icf.hpp>
-// #include <opencv2/gpu/device/saturate_cast.hpp>
 #include <stdio.h>
-// #include <float.h>
+#include <float.h>
 
-// //#define LOG_CUDA_CASCADE
+// #define LOG_CUDA_CASCADE
 
-// #if defined LOG_CUDA_CASCADE
-// # define dprintf(format, ...) \
-//             do { printf(format, __VA_ARGS__); } while (0)
-// #else
-// # define dprintf(format, ...)
-// #endif
+#if defined LOG_CUDA_CASCADE
+# define dprintf(format, ...) \
+            do { printf(format, __VA_ARGS__); } while (0)
+#else
+# define dprintf(format, ...)
+#endif
 
 namespace cv { namespace gpu { namespace device {
 namespace icf {
@@ -94,32 +93,128 @@ namespace icf {
         cudaSafeCall( cudaDeviceSynchronize() );
     }
 
-    texture<float2,  cudaTextureType1D, cudaReadModeElementType> tnode;
+    texture<int,  cudaTextureType2D, cudaReadModeElementType> thogluv;
+    // ToDo: do it in load time
+    // __device__ __forceinline__ float rescale(const Level& level, uchar4& scaledRect, const Node& node)
+    // {
+    //     scaledRect = node.rect;
+    //     return (float)(node.threshold & 0x0FFFFFFFU);
+    // }
+
+    __device__ __forceinline__ float rescale(const Level& level, uchar4& scaledRect, const Node& node)
+    {
+        float relScale = level.relScale;
+        float farea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y);
+
+        dprintf("feature %d box %d %d %d %d\n", (node.threshold >> 28), scaledRect.x, scaledRect.y,
+            scaledRect.z, scaledRect.w);
+        dprintf("rescale: %f [%f %f] selected %f\n",level.relScale, level.scaling[0], level.scaling[1],
+            level.scaling[(node.threshold >> 28) > 6]);
+
+        // rescale
+        scaledRect.x = __float2int_rn(relScale * scaledRect.x);
+        scaledRect.y = __float2int_rn(relScale * scaledRect.y);
+        scaledRect.z = __float2int_rn(relScale * scaledRect.z);
+        scaledRect.w = __float2int_rn(relScale * scaledRect.w);
+
+        float sarea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y);
+
+        float approx = 1.f;
+        // if (fabs(farea - 0.f) > FLT_EPSILON && fabs(farea - 0.f) > FLT_EPSILON)
+        {
+            const float expected_new_area = farea * relScale * relScale;
+            approx =  sarea / expected_new_area;
+        }
+
+        dprintf("new rect: %d box %d %d %d %d  rel areas %f %f\n", (node.threshold >> 28),
+        scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w, farea * relScale * relScale, sarea);
+
+
+        float rootThreshold = (node.threshold & 0x0FFFFFFFU) * approx;
+        rootThreshold *= level.scaling[(node.threshold >> 28) > 6];
+
+        dprintf("approximation %f %d -> %f %f\n", approx, (node.threshold & 0x0FFFFFFFU), rootThreshold,
+            level.scaling[(node.threshold >> 28) > 6]);
+
+        return rootThreshold;
+    }
+
+    __device__ __forceinline__ int get(const int x, int y, int channel, uchar4 area)
+    {
+
+        dprintf("feature box %d %d %d %d ", area.x, area.y, area.z, area.w);
+        dprintf("get for channel %d\n", channel);
+        dprintf("extract feature for: [%d %d] [%d %d] [%d %d] [%d %d]\n",
+            x + area.x, y + area.y,  x + area.z, y + area.y,  x + area.z,y + area.w,
+            x + area.x, y + area.w);
+        dprintf("at point %d %d with offset %d\n", x, y, 0);
+
+        int offset = channel * 121;
+        y += offset;
+
+        int a = tex2D(thogluv, x + area.x, y + area.y);
+        int b = tex2D(thogluv, x + area.z, y + area.y);
+        int c = tex2D(thogluv, x + area.z, y + area.w);
+        int d = tex2D(thogluv, x + area.x, y + area.w);
+
+        dprintf("    retruved integral values: %d %d %d %d\n", a, b, c, d);
+
+        return (a - b + c - d);
+    }
+
     __global__ void test_kernel(const Level* levels, const Octave* octaves, const float* stages,
-        const Node* nodes,
-        PtrStepSz<uchar4> objects)
+        const Node* nodes, const float* leaves, PtrStepSz<uchar4> objects)
     {
         const int y = blockIdx.y * blockDim.y + threadIdx.y;
         const int x = blockIdx.x * blockDim.x + threadIdx.x;
         Level level = levels[blockIdx.z];
+
+        // if (x > 0 || y > 0 || blockIdx.z > 0) return;
         if(x >= level.workRect.x || y >= level.workRect.y) return;
 
         Octave octave = octaves[level.octave];
+
         int st = octave.index * octave.stages;
         const int stEnd = st + 1000;//octave.stages;
 
         float confidence = 0.f;
 
-#pragma unroll 8
+// #pragma unroll 8
         for(; st < stEnd; ++st)
         {
+            dprintf("\n\nstage: %d\n", st);
             const int nId = st * 3;
-            const Node node = nodes[nId];
+            Node node = nodes[nId];
 
-            const float stage = stages[st];
-            confidence += node.rect.x * stage;
+            dprintf("Node: [%d %d %d %d] %d %d\n", node.rect.x, node.rect.y, node.rect.z, node.rect.w,
+                node.threshold >> 28, node.threshold & 0x0FFFFFFFU);
+
+            float threshold = rescale(level, node.rect, node);
+            int sum = get(x, y, (node.threshold >> 28), node.rect);
+
+            dprintf("Node: [%d %d %d %d] %f\n", node.rect.x, node.rect.y, node.rect.z,
+                node.rect.w, threshold);
+
+            int next = 1 + (int)(sum >= threshold);
+            dprintf("go: %d (%d >= %f)\n\n" ,next, sum, threshold);
+
+            node = nodes[nId + next];
+            threshold = rescale(level, node.rect, node);
+            sum = get(x, y, (node.threshold >> 28), node.rect);
+
+            const int lShift = (next - 1) * 2 + (int)(sum >= threshold);
+            float impact = leaves[st * 4 + lShift];
+            confidence += impact;
+
+            if (confidence <= stages[st]) st = stEnd + 1;
+            dprintf("decided: %d (%d >= %f) %d %f\n\n" ,next, sum, threshold, lShift, impact);
+            dprintf("extracted stage: %f\n", stages[st]);
+            dprintf("computed  score: %f\n\n", confidence);
         }
 
+        // if (st == stEnd)
+        //     printf("%d %d %d\n", x, y, st);
+
         uchar4 val;
         val.x = (int)confidence;
         if (x == y) objects(0, threadIdx.x) = val;
@@ -127,188 +222,27 @@ namespace icf {
     }
 
     void detect(const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages,
-        const PtrStepSzb& nodes, const PtrStepSzb& features,
-        PtrStepSz<uchar4> objects)
+        const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv, PtrStepSz<uchar4> objects)
     {
         int fw = 160;
         int fh = 120;
+
         dim3 block(32, 8);
         dim3 grid(fw / 32, fh / 8, 47);
+
         const Level* l = (const Level*)levels.ptr();
         const Octave* oct = ((const Octave*)octaves.ptr());
         const float* st = (const float*)stages.ptr();
         const Node* nd = (const Node*)nodes.ptr();
-        // cudaSafeCall( cudaBindTexture(0, tnode, nodes.data, rgb.cols / size) );
+        const float* lf = (const float*)leaves.ptr();
 
-        test_kernel<<<grid, block>>>(l, oct, st, nd, objects);
+        cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>();
+        cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step));
+
+        test_kernel<<<grid, block>>>(l, oct, st, nd, lf, objects);
 
         cudaSafeCall( cudaGetLastError());
         cudaSafeCall( cudaDeviceSynchronize());
     }
 }
-}}}
-
-// __global__ void detect(const cv::gpu::icf::Cascade cascade, const int* __restrict__ hogluv, const int pitch,
-//     PtrStepSz<uchar4> objects)
-// {
-//     cascade.detectAt(hogluv, pitch, objects);
-// }
-
-// }
-
-// float __device icf::Cascade::rescale(const icf::Level& level, uchar4& scaledRect,
-//                                      const int channel, const float threshold) const
-// {
-//     dprintf("feature %d box %d %d %d %d\n", channel, scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w);
-//     dprintf("rescale: %f [%f %f]\n",level.relScale, level.scaling[0], level.scaling[1]);
-
-//     float relScale = level.relScale;
-//     float farea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y);
-
-//     // rescale
-//     scaledRect.x = __float2int_rn(relScale * scaledRect.x);
-//     scaledRect.y = __float2int_rn(relScale * scaledRect.y);
-//     scaledRect.z = __float2int_rn(relScale * scaledRect.z);
-//     scaledRect.w = __float2int_rn(relScale * scaledRect.w);
-
-//     float sarea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y);
-
-
-//     float approx = 1.f;
-//     if (fabs(farea - 0.f) > FLT_EPSILON && fabs(farea - 0.f) > FLT_EPSILON)
-//     {
-//         const float expected_new_area = farea * relScale * relScale;
-//         approx = expected_new_area / sarea;
-//     }
-
-//     dprintf("new rect: %d box %d %d %d %d  rel areas %f %f\n", channel,
-//         scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w, farea * relScale * relScale, sarea);
-
-//     // compensation areas rounding
-//     float rootThreshold = threshold / approx;
-//     // printf("    approx %f\n", rootThreshold);
-//     rootThreshold *= level.scaling[(int)(channel > 6)];
-
-//     dprintf("approximation %f %f -> %f %f\n", approx, threshold, rootThreshold, level.scaling[(int)(channel > 6)]);
-
-//     return rootThreshold;
-// }
-
-// typedef unsigned char uchar;
-// float __device get(const int* __restrict__ hogluv, const int pitch,
-//                    const int x, const int y, int channel, uchar4 area)
-// {
-//     dprintf("feature box %d %d %d %d ", area.x, area.y, area.z, area.w);
-//     dprintf("get for channel %d\n", channel);
-//     dprintf("extract feature for: [%d %d] [%d %d] [%d %d] [%d %d]\n",
-//         x + area.x, y + area.y,  x + area.z, y + area.y,  x + area.z,y + area.w,
-//         x + area.x, y + area.w);
-//     dprintf("at point %d %d with offset %d\n", x, y, 0);
-
-//     const int* curr = hogluv + ((channel * 121) + y) * pitch;
-
-//     int a = curr[area.y * pitch + x + area.x];
-//     int b = curr[area.y * pitch + x + area.z];
-//     int c = curr[area.w * pitch + x + area.z];
-//     int d = curr[area.w * pitch + x + area.x];
-
-//     dprintf("    retruved integral values: %d %d %d %d\n", a, b, c, d);
-
-//     return (a - b + c - d);
-// }
-
-
-// void __device icf::Cascade::detectAt(const int* __restrict__ hogluv, const int pitch,
-//                                     PtrStepSz<uchar4>& objects) const
-// {
-//     const icf::Level* lls = (const icf::Level*)levels.ptr();
-
-//     const int y = blockIdx.y * blockDim.y + threadIdx.y;
-//     const int x = blockIdx.x * blockDim.x + threadIdx.x;
-//     // if (x > 0 || y > 0) return;
-
-//     Level level = lls[blockIdx.z];
-//     if (x >= level.workRect.x || y >= level.workRect.y) return;
-
-//     dprintf("level: %d (%f %f) [%f %f] (%d %d) (%d %d)\n", level.octave, level.relScale, level.shrScale,
-//         level.scaling[0], level.scaling[1], level.workRect.x, level.workRect.y, level.objSize.x, level.objSize.y);
-
-//     const Octave octave = ((const Octave*)octaves.ptr())[level.octave];
-//     // printf("Octave: %d %d %d (%d %d) %f\n", octave.index, octave.stages,
-//     //     octave.shrinkage, octave.size.x, octave.size.y, octave.scale);
-
-//     const int stBegin = octave.index * octave.stages, stEnd = stBegin + octave.stages;
-
-//     float detectionScore = 0.f;
-
-//     int st = stBegin;
-//     for(; st < stEnd; ++st)
-//     {
-//         const float stage = stages(0, st);
-//         dprintf("Stage: %f\n", stage);
-//         {
-//             const int nId = st * 3;
-
-//             // work with root node
-//             const Node node = ((const Node*)nodes.ptr())[nId];
-
-//             dprintf("Node: %d %f\n", node.feature, node.threshold);
-
-//             const Feature feature = ((const Feature*)features.ptr())[node.feature];
-
-//             uchar4 scaledRect = feature.rect;
-//             float threshold = rescale(level, scaledRect, feature.channel, node.threshold);
-
-//             float sum = get(hogluv,pitch, x, y, feature.channel, scaledRect);
-
-//             dprintf("root feature %d %f\n",feature.channel, sum);
-
-//             int next = 1 + (int)(sum >= threshold);
-
-//             dprintf("go: %d (%f >= %f)\n\n" ,next, sum, threshold);
-
-//             // leaves
-//             const Node leaf = ((const Node*)nodes.ptr())[nId + next];
-//             const Feature fLeaf = ((const Feature*)features.ptr())[leaf.feature];
-
-//             scaledRect = fLeaf.rect;
-//             threshold = rescale(level, scaledRect, fLeaf.channel, leaf.threshold);
-//             sum = get(hogluv, pitch, x, y, fLeaf.channel, scaledRect);
-
-//             const int lShift = (next - 1) * 2 + (int)(sum >= threshold);
-//             float impact = leaves(0, (st * 4) + lShift);
-
-//             detectionScore += impact;
-
-//             dprintf("decided: %d (%f >= %f) %d %f\n\n" ,next, sum, threshold, lShift, impact);
-//             dprintf("extracted stage:\n");
-//             dprintf("ct %f\n", stage);
-//             dprintf("computed score %f\n\n", detectionScore);
-//             dprintf("\n\n");
-//         }
-
-//         if (detectionScore <= stage || st - stBegin == 100) break;
-//     }
-
-//     dprintf("x %d y %d: %d\n", x, y, st - stBegin);
-
-//     if (st == stEnd)
-//     {
-//         uchar4 a;
-//         a.x = level.workRect.x;
-//         a.y = level.workRect.y;
-//         objects(0, threadIdx.x) = a;
-//     }
-// }
-
-// void icf::Cascade::detect(const cv::gpu::PtrStepSzi& hogluv, PtrStepSz<uchar4> objects, cudaStream_t stream) const
-// {
-//     dim3 block(32, 8, 1);
-//     dim3 grid(ChannelStorage::FRAME_WIDTH / 32, ChannelStorage::FRAME_HEIGHT / 8, 47);
-//     device::detect<<<grid, block, 0, stream>>>(*this, hogluv, hogluv.step / sizeof(int), objects);
-//     cudaSafeCall( cudaGetLastError() );
-//     if (!stream)
-//         cudaSafeCall( cudaDeviceSynchronize() );
-// }
-
-// }}
\ No newline at end of file
+}}}
\ No newline at end of file
diff --git a/modules/gpu/src/icf.hpp b/modules/gpu/src/icf.hpp
index 51ea2c068..ecd1886d3 100644
--- a/modules/gpu/src/icf.hpp
+++ b/modules/gpu/src/icf.hpp
@@ -40,11 +40,13 @@
 //
 //M
 
-#include <opencv2/gpu/device/common.hpp>
 
 #ifndef __OPENCV_ICF_HPP__
 #define __OPENCV_ICF_HPP__
 
+#include <opencv2/gpu/device/common.hpp>
+#include <stdio.h>
+
 // #if defined __CUDACC__
 // # define __device __device__ __forceinline__
 // #else
@@ -92,20 +94,27 @@ struct __align__(8) Level //is actually 24 bytes
 
 struct __align__(8) Node
 {
-    // int feature;
     uchar4 rect;
-    float threshold;
+    // ushort channel;
+    uint threshold;
 
-    Node(const uchar4 c, const int t) : rect(c), threshold(t) {}
+    enum { THRESHOLD_MASK = 0x0FFFFFFF };
+
+    Node(const uchar4 r, const uint ch, const uint t) : rect(r), threshold(t + (ch << 28))
+    {
+        // printf("%d\n", t);
+        // printf("[%d %d %d %d] %d, %d\n",rect.x, rect.y, rect.z, rect.w, (int)(threshold >> 28),
+        //     (int)(0x0FFFFFFF & threshold));
+    }
 };
 
-struct __align__(8) Feature
-{
-    int channel;
-    uchar4 rect;
+// struct __align__(8) Feature
+// {
+//     int channel;
+//     uchar4 rect;
 
-    Feature(const int c, const uchar4 r) : channel(c), rect(r) {}
-};
+//     Feature(const int c, const uchar4 r) : channel(c), rect(r) {}
+// };
 }
 }}}
 // struct Cascade
diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index 8d75176ab..ffbf380c6 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -60,19 +60,10 @@ namespace icf {
     void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle,
         const int fw, const int fh, const int bins);
     void detect(const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages,
-        const PtrStepSzb& nodes, const PtrStepSzb& features,
-        PtrStepSz<uchar4> objects);
+        const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv, PtrStepSz<uchar4> objects);
 }
 }}}
 
-// namespace {
-//     char *itoa(long i, char* s, int /*dummy_radix*/)
-//     {
-//         sprintf(s, "%ld", i);
-//         return s;
-//     }
-// }
-
 struct cv::gpu::SoftCascade::Filds
 {
 
@@ -97,7 +88,6 @@ struct cv::gpu::SoftCascade::Filds
     GpuMat stages;
     GpuMat nodes;
     GpuMat leaves;
-    GpuMat features;
     GpuMat levels;
 
     // preallocated buffer 640x480x10 for hogluv + 640x480 got gray
@@ -137,7 +127,7 @@ struct cv::gpu::SoftCascade::Filds
     bool fill(const FileNode &root, const float mins, const float maxs);
     void detect(cv::gpu::GpuMat objects, cudaStream_t stream) const
     {
-        device::icf::detect(levels, octaves, stages, nodes, features, objects);
+        device::icf::detect(levels, octaves, stages, nodes, leaves, hogluv, objects);
     }
 
 private:
@@ -216,10 +206,9 @@ inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float
         if (fn.empty()) return false;
 
     std::vector<Octave>  voctaves;
-    std::vector<float>        vstages;
+    std::vector<float>   vstages;
     std::vector<Node>    vnodes;
-    std::vector<float>        vleaves;
-    std::vector<Feature> vfeatures;
+    std::vector<float>   vleaves;
     scales.clear();
 
     FileNodeIterator it = fn.begin(), it_end = fn.end();
@@ -245,6 +234,8 @@ inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float
         FileNode ffs = fns[SC_FEATURES];
         if (ffs.empty()) return false;
 
+        FileNodeIterator ftrs = ffs.begin();
+
         fns = fns[SC_STAGES];
         if (fn.empty()) return false;
 
@@ -263,10 +254,21 @@ inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float
                 FileNodeIterator inIt = fns.begin(), inIt_end = fns.end();
                 for (; inIt != inIt_end;)
                 {
-                    int feature = (int)(*(inIt +=2)++) + feature_offset;
-                    float th = (float)(*(inIt++));
+                    // int feature = (int)(*(inIt +=2)) + feature_offset;
+                    inIt +=3;
+                    // extract feature, Todo:check it
+                    uint th = saturate_cast<uint>((float)(*(inIt++)));
+                    cv::FileNode ftn = (*ftrs)[SC_F_RECT];
+                    cv::FileNodeIterator r_it = ftn.begin();
                     uchar4 rect;
-                    vnodes.push_back(Node(rect, th));
+                    rect.x = saturate_cast<uchar>((int)*(r_it++));
+                    rect.y = saturate_cast<uchar>((int)*(r_it++));
+                    rect.z = saturate_cast<uchar>((int)*(r_it++));
+                    rect.w = saturate_cast<uchar>((int)*(r_it++));
+
+                    uint channel = saturate_cast<uint>((int)(*ftrs)[SC_F_CHANNEL]);
+                    vnodes.push_back(Node(rect, channel, th));
+                    ++ftrs;
                 }
 
                 fns = (*ftr)[SC_LEAF];
@@ -276,19 +278,6 @@ inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float
             }
         }
 
-        st = ffs.begin(), st_end = ffs.end();
-        for (; st != st_end; ++st )
-        {
-            cv::FileNode rn = (*st)[SC_F_RECT];
-            cv::FileNodeIterator r_it = rn.begin();
-            uchar4 rect;
-            rect.x = saturate_cast<uchar>((int)*(r_it++));
-            rect.y = saturate_cast<uchar>((int)*(r_it++));
-            rect.z = saturate_cast<uchar>((int)*(r_it++));
-            rect.w = saturate_cast<uchar>((int)*(r_it++));
-            vfeatures.push_back(Feature((int)(*st)[SC_F_CHANNEL], rect));
-        }
-
         feature_offset += octave.stages * 3;
         ++octIndex;
     }
@@ -306,9 +295,6 @@ inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float
     leaves.upload(cv::Mat(vleaves).reshape(1,1));
     CV_Assert(!leaves.empty());
 
-    features.upload(cv::Mat(1, vfeatures.size() * sizeof(Feature), CV_8UC1, (uchar*)&(vfeatures[0]) ));
-    CV_Assert(!features.empty());
-
     // compute levels
     calcLevels(voctaves, FRAME_WIDTH, FRAME_HEIGHT, TOTAL_SCALES);
     CV_Assert(!levels.empty());
@@ -425,7 +411,14 @@ bool cv::gpu::SoftCascade::load( const string& filename, const float minScale, c
     return true;
 }
 
-// #define USE_REFERENCE_VALUES
+#define USE_REFERENCE_VALUES
+namespace {
+    char *itoa(long i, char* s, int /*dummy_radix*/)
+    {
+        sprintf(s, "%ld", i);
+        return s;
+    }
+}
 void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat& /*rois*/,
                                 GpuMat& objects, const int /*rejectfactor*/, Stream s)
 {
@@ -438,17 +431,20 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat&
     Filds& flds = *filds;
 
 #if defined USE_REFERENCE_VALUES
-//     cudaMemset(flds.hogluv.data, 0, flds.hogluv.step * flds.hogluv.rows);
-//     cv::FileStorage imgs("/home/kellan/testInts.xml", cv::FileStorage::READ);
-//     char buff[33];
+    cudaMemset(flds.hogluv.data, 0, flds.hogluv.step * flds.hogluv.rows);
 
-//     for(int i = 0; i < Filds::HOG_LUV_BINS; ++i)
-//     {
-//         cv::Mat channel;
-//         imgs[std::string("channel") + itoa(i, buff, 10)] >> channel;
-//         GpuMat gchannel(flds.hogluv, cv::Rect(0, 121 * i, 161, 121));
-//         gchannel.upload(channel);
-//     }
+    cv::FileStorage imgs("/home/kellan/testInts.xml", cv::FileStorage::READ);
+    char buff[33];
+
+    for(int i = 0; i < Filds::HOG_LUV_BINS; ++i)
+    {
+        cv::Mat channel;
+        imgs[std::string("channel") + itoa(i, buff, 10)] >> channel;
+
+        // std::cout << "channel " << i << std::endl << channel << std::endl;
+        GpuMat gchannel(flds.hogluv, cv::Rect(0, 121 * i, 161, 121));
+        gchannel.upload(channel);
+    }
 #else
     GpuMat& plane = flds.plane;
     GpuMat& shrunk = flds.shrunk;

From 72b499df006a39185b529861f40e335611ebccc8 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Thu, 27 Sep 2012 14:40:13 +0400
Subject: [PATCH 23/74] add detection storing

---
 modules/gpu/perf/perf_objdetect.cpp |  4 +-
 modules/gpu/src/cuda/isf-sc.cu      | 59 ++++++++++-------------------
 modules/gpu/src/softcascade.cpp     | 14 ++++---
 3 files changed, 30 insertions(+), 47 deletions(-)

diff --git a/modules/gpu/perf/perf_objdetect.cpp b/modules/gpu/perf/perf_objdetect.cpp
index 48a355d6a..e272d6535 100644
--- a/modules/gpu/perf/perf_objdetect.cpp
+++ b/modules/gpu/perf/perf_objdetect.cpp
@@ -104,7 +104,7 @@ PERF_TEST_P(SoftCascade, detect, Values<pair_string>(make_pair("cv/cascadeandhog
         cv::gpu::SoftCascade cascade;
         ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath(GetParam().first)));
 
-        cv::gpu::GpuMat rois, objectBoxes(1, 1000, CV_8UC1);
+        cv::gpu::GpuMat rois, objectBoxes(1, 1000, CV_8UC4);
         cascade.detectMultiScale(colored, rois, objectBoxes);
 
         TEST_CYCLE()
@@ -117,7 +117,7 @@ PERF_TEST_P(SoftCascade, detect, Values<pair_string>(make_pair("cv/cascadeandhog
         ASSERT_FALSE(colored.empty());
 
         cv::SoftCascade cascade;
-        ASSERT_TRUE(cascade.load(GetParam().first));
+        ASSERT_TRUE(cascade.load(getDataPath(GetParam().first)));
 
         std::vector<cv::Rect> rois, objectBoxes;
         cascade.detectMultiScale(colored, rois, objectBoxes);
diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu
index c8dff34bd..4bf410fc5 100644
--- a/modules/gpu/src/cuda/isf-sc.cu
+++ b/modules/gpu/src/cuda/isf-sc.cu
@@ -57,14 +57,6 @@
 namespace cv { namespace gpu { namespace device {
 namespace icf {
 
-//     enum {
-//         HOG_BINS = 6,
-//         HOG_LUV_BINS = 10,
-//         WIDTH = 640,
-//         HEIGHT = 480,
-//         GREY_OFFSET = HEIGHT * HOG_LUV_BINS
-//     };
-
     // ToDo: use textures or ancached load instruction.
     __global__ void magToHist(const uchar* __restrict__ mag,
                               const float* __restrict__ angle, const int angPitch,
@@ -94,13 +86,6 @@ namespace icf {
     }
 
     texture<int,  cudaTextureType2D, cudaReadModeElementType> thogluv;
-    // ToDo: do it in load time
-    // __device__ __forceinline__ float rescale(const Level& level, uchar4& scaledRect, const Node& node)
-    // {
-    //     scaledRect = node.rect;
-    //     return (float)(node.threshold & 0x0FFFFFFFU);
-    // }
-
     __device__ __forceinline__ float rescale(const Level& level, uchar4& scaledRect, const Node& node)
     {
         float relScale = level.relScale;
@@ -119,17 +104,12 @@ namespace icf {
 
         float sarea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y);
 
-        float approx = 1.f;
-        // if (fabs(farea - 0.f) > FLT_EPSILON && fabs(farea - 0.f) > FLT_EPSILON)
-        {
-            const float expected_new_area = farea * relScale * relScale;
-            approx =  sarea / expected_new_area;
-        }
+        const float expected_new_area = farea * relScale * relScale;
+        float approx =  sarea / expected_new_area;
 
         dprintf("new rect: %d box %d %d %d %d  rel areas %f %f\n", (node.threshold >> 28),
         scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w, farea * relScale * relScale, sarea);
 
-
         float rootThreshold = (node.threshold & 0x0FFFFFFFU) * approx;
         rootThreshold *= level.scaling[(node.threshold >> 28) > 6];
 
@@ -139,7 +119,7 @@ namespace icf {
         return rootThreshold;
     }
 
-    __device__ __forceinline__ int get(const int x, int y, int channel, uchar4 area)
+    __device__ __forceinline__ int get(const int x, int y, uchar4 area)
     {
 
         dprintf("feature box %d %d %d %d ", area.x, area.y, area.z, area.w);
@@ -149,9 +129,6 @@ namespace icf {
             x + area.x, y + area.w);
         dprintf("at point %d %d with offset %d\n", x, y, 0);
 
-        int offset = channel * 121;
-        y += offset;
-
         int a = tex2D(thogluv, x + area.x, y + area.y);
         int b = tex2D(thogluv, x + area.z, y + area.y);
         int c = tex2D(thogluv, x + area.z, y + area.w);
@@ -163,7 +140,7 @@ namespace icf {
     }
 
     __global__ void test_kernel(const Level* levels, const Octave* octaves, const float* stages,
-        const Node* nodes, const float* leaves, PtrStepSz<uchar4> objects)
+        const Node* nodes, const float* leaves, PtrStepSz<uchar4> objects, uint* ctr)
     {
         const int y = blockIdx.y * blockDim.y + threadIdx.y;
         const int x = blockIdx.x * blockDim.x + threadIdx.x;
@@ -179,7 +156,7 @@ namespace icf {
 
         float confidence = 0.f;
 
-// #pragma unroll 8
+// #pragma unroll 2
         for(; st < stEnd; ++st)
         {
             dprintf("\n\nstage: %d\n", st);
@@ -190,7 +167,7 @@ namespace icf {
                 node.threshold >> 28, node.threshold & 0x0FFFFFFFU);
 
             float threshold = rescale(level, node.rect, node);
-            int sum = get(x, y, (node.threshold >> 28), node.rect);
+            int sum = get(x, y + (node.threshold >> 28) * 121, node.rect);
 
             dprintf("Node: [%d %d %d %d] %f\n", node.rect.x, node.rect.y, node.rect.z,
                 node.rect.w, threshold);
@@ -200,29 +177,30 @@ namespace icf {
 
             node = nodes[nId + next];
             threshold = rescale(level, node.rect, node);
-            sum = get(x, y, (node.threshold >> 28), node.rect);
+            sum = get(x, y + (node.threshold >> 28) * 121, node.rect);
 
             const int lShift = (next - 1) * 2 + (int)(sum >= threshold);
             float impact = leaves[st * 4 + lShift];
             confidence += impact;
 
-            if (confidence <= stages[st]) st = stEnd + 1;
+            if (confidence <= stages[st]) st = stEnd + 10;
             dprintf("decided: %d (%d >= %f) %d %f\n\n" ,next, sum, threshold, lShift, impact);
             dprintf("extracted stage: %f\n", stages[st]);
             dprintf("computed  score: %f\n\n", confidence);
         }
 
-        // if (st == stEnd)
-        //     printf("%d %d %d\n", x, y, st);
-
-        uchar4 val;
-        val.x = (int)confidence;
-        if (x == y) objects(0, threadIdx.x) = val;
-
+        if(st == stEnd)
+        {
+            int idx = atomicInc(ctr, objects.cols);
+            uchar4 val;
+            val.x = x * 4;
+            objects(0, idx) = val;
+        }
     }
 
     void detect(const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages,
-        const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv, PtrStepSz<uchar4> objects)
+                const PtrStepSzb& nodes,  const PtrStepSzf& leaves,  const PtrStepSzi& hogluv,
+                PtrStepSz<uchar4> objects, PtrStepSzi counter)
     {
         int fw = 160;
         int fh = 120;
@@ -235,11 +213,12 @@ namespace icf {
         const float* st = (const float*)stages.ptr();
         const Node* nd = (const Node*)nodes.ptr();
         const float* lf = (const float*)leaves.ptr();
+        uint* ctr = (uint*)counter.ptr();
 
         cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>();
         cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step));
 
-        test_kernel<<<grid, block>>>(l, oct, st, nd, lf, objects);
+        test_kernel<<<grid, block>>>(l, oct, st, nd, lf, objects, ctr);
 
         cudaSafeCall( cudaGetLastError());
         cudaSafeCall( cudaDeviceSynchronize());
diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index ffbf380c6..320fbb343 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -60,7 +60,8 @@ namespace icf {
     void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle,
         const int fw, const int fh, const int bins);
     void detect(const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages,
-        const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv, PtrStepSz<uchar4> objects);
+        const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv, PtrStepSz<uchar4> objects,
+        PtrStepSzi counter);
 }
 }}}
 
@@ -75,6 +76,7 @@ struct cv::gpu::SoftCascade::Filds
         shrunk.create(FRAME_HEIGHT / 4 * HOG_LUV_BINS, FRAME_WIDTH / 4, CV_8UC1);
         integralBuffer.create(shrunk.rows + 1 * HOG_LUV_BINS, shrunk.cols + 1, CV_32SC1);
         hogluv.create((FRAME_HEIGHT / 4 + 1) * HOG_LUV_BINS, FRAME_WIDTH / 4 + 1, CV_32SC1);
+        detCounter.create(1,1, CV_32SC1);
     }
 
     // scales range
@@ -90,6 +92,8 @@ struct cv::gpu::SoftCascade::Filds
     GpuMat leaves;
     GpuMat levels;
 
+    GpuMat detCounter;
+
     // preallocated buffer 640x480x10 for hogluv + 640x480 got gray
     GpuMat plane;
 
@@ -127,7 +131,8 @@ struct cv::gpu::SoftCascade::Filds
     bool fill(const FileNode &root, const float mins, const float maxs);
     void detect(cv::gpu::GpuMat objects, cudaStream_t stream) const
     {
-        device::icf::detect(levels, octaves, stages, nodes, leaves, hogluv, objects);
+        cudaMemset(detCounter.data, 0, detCounter.step * detCounter.rows * sizeof(int));
+        device::icf::detect(levels, octaves, stages, nodes, leaves, hogluv, objects , detCounter);
     }
 
 private:
@@ -506,14 +511,13 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat&
         GpuMat sum(flds.hogluv, cv::Rect(0, (fh + 1) * i, fw + 1, fh + 1));
         cv::gpu::integralBuffered(channel, sum, flds.integralBuffer);
     }
-
 #endif
 
     cudaStream_t stream = StreamAccessor::getStream(s);
-    // detection
     flds.detect(objects, stream);
 
-//     // flds.storage.frame(colored, stream);
+        //     cv::Mat out(flds.detCounter);
+        // std::cout << out << std::endl;
 }
 
 #endif
\ No newline at end of file

From c0359ed5c5987a3e58151cec20fd866fab3776ca Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Thu, 27 Sep 2012 16:50:47 +0400
Subject: [PATCH 24/74] fix test: enough size for detection matrix

---
 modules/gpu/perf/perf_objdetect.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/gpu/perf/perf_objdetect.cpp b/modules/gpu/perf/perf_objdetect.cpp
index e272d6535..8531372b0 100644
--- a/modules/gpu/perf/perf_objdetect.cpp
+++ b/modules/gpu/perf/perf_objdetect.cpp
@@ -104,7 +104,7 @@ PERF_TEST_P(SoftCascade, detect, Values<pair_string>(make_pair("cv/cascadeandhog
         cv::gpu::SoftCascade cascade;
         ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath(GetParam().first)));
 
-        cv::gpu::GpuMat rois, objectBoxes(1, 1000, CV_8UC4);
+        cv::gpu::GpuMat rois, objectBoxes(1, 16384, CV_8UC1);
         cascade.detectMultiScale(colored, rois, objectBoxes);
 
         TEST_CYCLE()

From 0314e0e5d74e0e6c979505448c5eb103c2181989 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Thu, 27 Sep 2012 16:54:37 +0400
Subject: [PATCH 25/74] add kind in detection representation

---
 modules/gpu/src/cuda/isf-sc.cu | 17 +++----
 modules/gpu/src/icf.hpp        | 81 ++++++++--------------------------
 2 files changed, 27 insertions(+), 71 deletions(-)

diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu
index 4bf410fc5..adfc9edcb 100644
--- a/modules/gpu/src/cuda/isf-sc.cu
+++ b/modules/gpu/src/cuda/isf-sc.cu
@@ -123,7 +123,6 @@ namespace icf {
     {
 
         dprintf("feature box %d %d %d %d ", area.x, area.y, area.z, area.w);
-        dprintf("get for channel %d\n", channel);
         dprintf("extract feature for: [%d %d] [%d %d] [%d %d] [%d %d]\n",
             x + area.x, y + area.y,  x + area.z, y + area.y,  x + area.z,y + area.w,
             x + area.x, y + area.w);
@@ -140,13 +139,13 @@ namespace icf {
     }
 
     __global__ void test_kernel(const Level* levels, const Octave* octaves, const float* stages,
-        const Node* nodes, const float* leaves, PtrStepSz<uchar4> objects, uint* ctr)
+        const Node* nodes, const float* leaves, Detection* objects, const uint ndetections, uint* ctr)
     {
         const int y = blockIdx.y * blockDim.y + threadIdx.y;
         const int x = blockIdx.x * blockDim.x + threadIdx.x;
         Level level = levels[blockIdx.z];
 
-        // if (x > 0 || y > 0 || blockIdx.z > 0) return;
+        // if (blockIdx.z != 31) return;
         if(x >= level.workRect.x || y >= level.workRect.y) return;
 
         Octave octave = octaves[level.octave];
@@ -191,10 +190,10 @@ namespace icf {
 
         if(st == stEnd)
         {
-            int idx = atomicInc(ctr, objects.cols);
-            uchar4 val;
-            val.x = x * 4;
-            objects(0, idx) = val;
+            int idx = atomicInc(ctr, ndetections);
+            // store detection
+            objects[idx] = Detection(__float2int_rn(x * octave.shrinkage),
+                __float2int_rn(y * octave.shrinkage), level.objSize.x, level.objSize.y, confidence);
         }
     }
 
@@ -214,11 +213,13 @@ namespace icf {
         const Node* nd = (const Node*)nodes.ptr();
         const float* lf = (const float*)leaves.ptr();
         uint* ctr = (uint*)counter.ptr();
+        Detection* det = (Detection*)objects.ptr();
+        uint max_det = objects.cols / sizeof(Detection);
 
         cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>();
         cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step));
 
-        test_kernel<<<grid, block>>>(l, oct, st, nd, lf, objects, ctr);
+        test_kernel<<<grid, block>>>(l, oct, st, nd, lf, det, max_det, ctr);
 
         cudaSafeCall( cudaGetLastError());
         cudaSafeCall( cudaDeviceSynchronize());
diff --git a/modules/gpu/src/icf.hpp b/modules/gpu/src/icf.hpp
index ecd1886d3..35658892f 100644
--- a/modules/gpu/src/icf.hpp
+++ b/modules/gpu/src/icf.hpp
@@ -47,11 +47,11 @@
 #include <opencv2/gpu/device/common.hpp>
 #include <stdio.h>
 
-// #if defined __CUDACC__
-// # define __device __device__ __forceinline__
-// #else
-// # define __device
-// #endif
+#if defined __CUDACC__
+# define __device __device__ __forceinline__
+#else
+# define __device
+#endif
 
 
 namespace cv { namespace gpu { namespace device {
@@ -108,66 +108,21 @@ struct __align__(8) Node
     }
 };
 
-// struct __align__(8) Feature
-// {
-//     int channel;
-//     uchar4 rect;
+struct __align__(16) Detection
+{
+    ushort x;
+    ushort y;
+    ushort w;
+    ushort h;
+    float confidence;
+    int kind;
+
+    Detection(){}
+    __device Detection(int _x, int _y, uchar _w, uchar _h, float c)
+    : x(_x), y(_y), w(_w), h(_h), confidence(c), kind(0) {};
+};
 
-//     Feature(const int c, const uchar4 r) : channel(c), rect(r) {}
-// };
 }
 }}}
-// struct Cascade
-// {
-//     Cascade() {}
-//     Cascade(const cv::gpu::PtrStepSzb& octs, const cv::gpu::PtrStepSzf& sts, const cv::gpu::PtrStepSzb& nds,
-//         const cv::gpu::PtrStepSzf& lvs, const cv::gpu::PtrStepSzb& fts, const cv::gpu::PtrStepSzb& lls)
-//     : octaves(octs), stages(sts), nodes(nds), leaves(lvs), features(fts), levels(lls) {}
-
-//     void detect(const cv::gpu::PtrStepSzi& hogluv, cv::gpu::PtrStepSz<uchar4> objects, cudaStream_t stream) const;
-//     void __device detectAt(const int* __restrict__ hogluv, const int pitch, PtrStepSz<uchar4>& objects) const;
-//     float __device rescale(const icf::Level& level, uchar4& scaledRect,
-//                            const int channel, const float threshold) const;
-
-//     PtrStepSzb octaves;
-//     PtrStepSzf stages;
-//     PtrStepSzb nodes;
-//     PtrStepSzf leaves;
-//     PtrStepSzb features;
-
-//     PtrStepSzb levels;
-
-// };
-
-// struct ChannelStorage
-// {
-//     ChannelStorage(){}
-//     ChannelStorage(const cv::gpu::PtrStepSzb& buff, const cv::gpu::PtrStepSzb& shr,
-//         const cv::gpu::PtrStepSzb& itg, const int s)
-//     : dmem (buff), shrunk(shr), hogluv(itg), shrinkage(s) {}
-
-//     void frame(const cv::gpu::PtrStepSz<uchar3>& rgb, cudaStream_t stream){}
-
-//     PtrStepSzb dmem;
-//     PtrStepSzb shrunk;
-//     PtrStepSzb hogluv;
-
-//     enum
-//     {
-//         FRAME_WIDTH        = 640,
-//         FRAME_HEIGHT       = 480,
-//         TOTAL_SCALES       = 55,
-//         CLASSIFIERS        = 5,
-//         ORIG_OBJECT_WIDTH  = 64,
-//         ORIG_OBJECT_HEIGHT = 128,
-//         HOG_BINS           = 6,
-//         HOG_LUV_BINS       = 10
-//     };
-
-//     int shrinkage;
-//     static const float magnitudeScaling = 1.f ;// / sqrt(2);
-// };
-
-// }}}
 
 #endif
\ No newline at end of file

From 612a258506aec30531e3db9dd1157253a6a4bc23 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Fri, 28 Sep 2012 19:10:29 +0400
Subject: [PATCH 26/74] kepler specific version

---
 modules/gpu/src/cuda/isf-sc.cu | 130 ++++++++++++++++++++++++---------
 1 file changed, 97 insertions(+), 33 deletions(-)

diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu
index adfc9edcb..f3c92cc6a 100644
--- a/modules/gpu/src/cuda/isf-sc.cu
+++ b/modules/gpu/src/cuda/isf-sc.cu
@@ -91,9 +91,9 @@ namespace icf {
         float relScale = level.relScale;
         float farea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y);
 
-        dprintf("feature %d box %d %d %d %d\n", (node.threshold >> 28), scaledRect.x, scaledRect.y,
+        dprintf("%d: feature %d box %d %d %d %d\n",threadIdx.x, (node.threshold >> 28), scaledRect.x, scaledRect.y,
             scaledRect.z, scaledRect.w);
-        dprintf("rescale: %f [%f %f] selected %f\n",level.relScale, level.scaling[0], level.scaling[1],
+        dprintf("%d: rescale: %f [%f %f] selected %f\n",threadIdx.x, level.relScale, level.scaling[0], level.scaling[1],
             level.scaling[(node.threshold >> 28) > 6]);
 
         // rescale
@@ -107,13 +107,13 @@ namespace icf {
         const float expected_new_area = farea * relScale * relScale;
         float approx =  sarea / expected_new_area;
 
-        dprintf("new rect: %d box %d %d %d %d  rel areas %f %f\n", (node.threshold >> 28),
+        dprintf("%d: new rect: %d box %d %d %d %d  rel areas %f %f\n",threadIdx.x, (node.threshold >> 28),
         scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w, farea * relScale * relScale, sarea);
 
         float rootThreshold = (node.threshold & 0x0FFFFFFFU) * approx;
         rootThreshold *= level.scaling[(node.threshold >> 28) > 6];
 
-        dprintf("approximation %f %d -> %f %f\n", approx, (node.threshold & 0x0FFFFFFFU), rootThreshold,
+        dprintf("%d: approximation %f %d -> %f %f\n",threadIdx.x, approx, (node.threshold & 0x0FFFFFFFU), rootThreshold,
             level.scaling[(node.threshold >> 28) > 6]);
 
         return rootThreshold;
@@ -122,73 +122,137 @@ namespace icf {
     __device__ __forceinline__ int get(const int x, int y, uchar4 area)
     {
 
-        dprintf("feature box %d %d %d %d ", area.x, area.y, area.z, area.w);
-        dprintf("extract feature for: [%d %d] [%d %d] [%d %d] [%d %d]\n",
+        dprintf("%d: feature box %d %d %d %d\n",threadIdx.x, area.x, area.y, area.z, area.w);
+        dprintf("%d: extract feature for: [%d %d] [%d %d] [%d %d] [%d %d]\n",threadIdx.x,
             x + area.x, y + area.y,  x + area.z, y + area.y,  x + area.z,y + area.w,
             x + area.x, y + area.w);
-        dprintf("at point %d %d with offset %d\n", x, y, 0);
+        dprintf("%d: at point %d %d with offset %d\n", x, y, 0);
 
         int a = tex2D(thogluv, x + area.x, y + area.y);
         int b = tex2D(thogluv, x + area.z, y + area.y);
         int c = tex2D(thogluv, x + area.z, y + area.w);
         int d = tex2D(thogluv, x + area.x, y + area.w);
 
-        dprintf("    retruved integral values: %d %d %d %d\n", a, b, c, d);
+        dprintf("%d   retruved integral values: %d %d %d %d\n",threadIdx.x, a, b, c, d);
 
         return (a - b + c - d);
     }
 
-    __global__ void test_kernel(const Level* levels, const Octave* octaves, const float* stages,
+//     __global__ void test_kernel(const Level* levels, const Octave* octaves, const float* stages,
+//         const Node* nodes, const float* leaves, Detection* objects, const uint ndetections, uint* ctr)
+//     {
+//         const int y = blockIdx.y * blockDim.y + threadIdx.y;
+//         const int x = blockIdx.x * blockDim.x + threadIdx.x;
+//         Level level = levels[blockIdx.z];
+
+//         // if (blockIdx.z != 31) return;
+//         if(x >= level.workRect.x || y >= level.workRect.y) return;
+
+//         Octave octave = octaves[level.octave];
+
+//         int st = octave.index * octave.stages;
+//         const int stEnd = st + 1000;//octave.stages;
+
+//         float confidence = 0.f;
+
+// // #pragma unroll 2
+//         for(; st < stEnd; ++st)
+//         {
+//             dprintf("\n\nstage: %d\n", st);
+//             const int nId = st * 3;
+//             Node node = nodes[nId];
+
+//             dprintf("Node: [%d %d %d %d] %d %d\n", node.rect.x, node.rect.y, node.rect.z, node.rect.w,
+//                 node.threshold >> 28, node.threshold & 0x0FFFFFFFU);
+
+//             float threshold = rescale(level, node.rect, node);
+//             int sum = get(x, y + (node.threshold >> 28) * 121, node.rect);
+
+//             dprintf("Node: [%d %d %d %d] %f\n", node.rect.x, node.rect.y, node.rect.z,
+//                 node.rect.w, threshold);
+
+//             int next = 1 + (int)(sum >= threshold);
+//             dprintf("go: %d (%d >= %f)\n\n" ,next, sum, threshold);
+
+//             node = nodes[nId + next];
+//             threshold = rescale(level, node.rect, node);
+//             sum = get(x, y + (node.threshold >> 28) * 121, node.rect);
+
+//             const int lShift = (next - 1) * 2 + (int)(sum >= threshold);
+//             float impact = leaves[st * 4 + lShift];
+//             confidence += impact;
+
+//             if (confidence <= stages[st]) st = stEnd + 10;
+//             dprintf("decided: %d (%d >= %f) %d %f\n\n" ,next, sum, threshold, lShift, impact);
+//             dprintf("extracted stage: %f\n", stages[st]);
+//             dprintf("computed  score: %f\n\n", confidence);
+//         }
+
+//         if(st == stEnd)
+//         {
+//             int idx = atomicInc(ctr, ndetections);
+//             // store detection
+//             objects[idx] = Detection(__float2int_rn(x * octave.shrinkage),
+//                 __float2int_rn(y * octave.shrinkage), level.objSize.x, level.objSize.y, confidence);
+//         }
+//     }
+
+    __global__ void test_kernel_warp(const Level* levels, const Octave* octaves, const float* stages,
         const Node* nodes, const float* leaves, Detection* objects, const uint ndetections, uint* ctr)
     {
         const int y = blockIdx.y * blockDim.y + threadIdx.y;
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int x = blockIdx.x;
+
         Level level = levels[blockIdx.z];
 
-        // if (blockIdx.z != 31) return;
         if(x >= level.workRect.x || y >= level.workRect.y) return;
 
         Octave octave = octaves[level.octave];
-
         int st = octave.index * octave.stages;
-        const int stEnd = st + 1000;//octave.stages;
+        const int stEnd = st + 1024;
 
         float confidence = 0.f;
 
-// #pragma unroll 2
-        for(; st < stEnd; ++st)
+        for(; st < stEnd; st += 32)
         {
-            dprintf("\n\nstage: %d\n", st);
-            const int nId = st * 3;
-            Node node = nodes[nId];
 
-            dprintf("Node: [%d %d %d %d] %d %d\n", node.rect.x, node.rect.y, node.rect.z, node.rect.w,
-                node.threshold >> 28, node.threshold & 0x0FFFFFFFU);
+            const int nId = (st + threadIdx.x) * 3;
+            dprintf("\n\n%d: stage: %d %d\n",threadIdx.x, st, nId);
+            Node node = nodes[nId];
 
             float threshold = rescale(level, node.rect, node);
             int sum = get(x, y + (node.threshold >> 28) * 121, node.rect);
 
-            dprintf("Node: [%d %d %d %d] %f\n", node.rect.x, node.rect.y, node.rect.z,
-                node.rect.w, threshold);
-
             int next = 1 + (int)(sum >= threshold);
-            dprintf("go: %d (%d >= %f)\n\n" ,next, sum, threshold);
+            dprintf("%d: go: %d (%d >= %f)\n\n" ,threadIdx.x, next, sum, threshold);
 
             node = nodes[nId + next];
             threshold = rescale(level, node.rect, node);
             sum = get(x, y + (node.threshold >> 28) * 121, node.rect);
 
             const int lShift = (next - 1) * 2 + (int)(sum >= threshold);
-            float impact = leaves[st * 4 + lShift];
-            confidence += impact;
+            float impact = leaves[(st + threadIdx.x) * 4 + lShift];
 
-            if (confidence <= stages[st]) st = stEnd + 10;
-            dprintf("decided: %d (%d >= %f) %d %f\n\n" ,next, sum, threshold, lShift, impact);
-            dprintf("extracted stage: %f\n", stages[st]);
-            dprintf("computed  score: %f\n\n", confidence);
+            dprintf("%d: decided: %d (%d >= %f) %d %f\n\n" ,threadIdx.x, next, sum, threshold, lShift, impact);
+            dprintf("%d: extracted stage: %f\n",threadIdx.x, stages[(st + threadIdx.x)]);
+            dprintf("%d: computed  score: %f\n",threadIdx.x, impact);
+
+            // scan on shuffl functions
+            for (int i = 1; i < 32; i *= 2)
+            {
+                const float n = __shfl_up(impact, i, 32);
+
+                if (threadIdx.x >= i)
+                    impact += n;
+            }
+
+            dprintf("%d: impact scaned %f\n" ,threadIdx.x, impact);
+
+            confidence += impact;
+            if(__any((confidence <= stages[(st + threadIdx.x)]))) break;
         }
 
-        if(st == stEnd)
+        if(st == stEnd && !threadIdx.x)
         {
             int idx = atomicInc(ctr, ndetections);
             // store detection
@@ -205,7 +269,7 @@ namespace icf {
         int fh = 120;
 
         dim3 block(32, 8);
-        dim3 grid(fw / 32, fh / 8, 47);
+        dim3 grid(fw, fh / 8, 47);
 
         const Level* l = (const Level*)levels.ptr();
         const Octave* oct = ((const Octave*)octaves.ptr());
@@ -219,7 +283,7 @@ namespace icf {
         cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>();
         cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step));
 
-        test_kernel<<<grid, block>>>(l, oct, st, nd, lf, det, max_det, ctr);
+        test_kernel_warp<<<grid, block>>>(l, oct, st, nd, lf, det, max_det, ctr);
 
         cudaSafeCall( cudaGetLastError());
         cudaSafeCall( cudaDeviceSynchronize());

From b52fea7fae6ec38096dcd57458e0ee6be87da996 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Mon, 1 Oct 2012 13:48:16 +0400
Subject: [PATCH 27/74] update soft cascade interface:  - add class Detection
 in interface,  - split sync- and async- versions,  - add support for
 detecting at the specific scale.

---
 modules/gpu/include/opencv2/gpu/gpu.hpp | 26 ++++++++++++-
 modules/gpu/src/cuda/isf-sc.cu          | 45 ++++++++++++++++++----
 modules/gpu/src/softcascade.cpp         | 50 ++++++++++++++++++++-----
 3 files changed, 103 insertions(+), 18 deletions(-)

diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index 5008e1027..f171ad904 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -1537,6 +1537,18 @@ public:
 class CV_EXPORTS SoftCascade
 {
 public:
+
+    struct CV_EXPORTS Detection
+    {
+        ushort x;
+        ushort y;
+        ushort w;
+        ushort h;
+        float confidence;
+        int kind;
+
+        enum {PEDESTRIAN = 0};
+    };
     //! An empty cascade will be created.
     SoftCascade();
 
@@ -1559,9 +1571,19 @@ public:
     //! Param rois is a mask
     //! Param objects 4-channel matrix thet contain detected rectangles
     //! Param rejectfactor used for final object box computing
-    //! Param stream
     virtual void detectMultiScale(const GpuMat& image, const GpuMat& rois, GpuMat& objects,
-    int rejectfactor = 1, Stream stream = Stream::Null());
+    int rejectfactor = 1, int specificScale = -1);
+
+    //! detect specific objects on in the input frame for all scales computed flom minScale and maxscale values.
+    //! asynchronous version.
+    //! Param image is input frame for detector. Cascade will be applied to it.
+    //! Param rois is a mask
+    //! Param objects 4-channel matrix thet contain detected rectangles
+    //! Param rejectfactor used for final object box computing
+    //! Param ndet retrieves number of detections
+    //! Param stream wrapper for CUDA stream
+    virtual void detectMultiScale(const GpuMat& image, const GpuMat& rois, GpuMat& objects,
+    int rejectfactor, GpuMat& ndet, Stream stream);
 
 private:
     struct Filds;
diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu
index f3c92cc6a..3d9a1e10f 100644
--- a/modules/gpu/src/cuda/isf-sc.cu
+++ b/modules/gpu/src/cuda/isf-sc.cu
@@ -105,7 +105,7 @@ namespace icf {
         float sarea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y);
 
         const float expected_new_area = farea * relScale * relScale;
-        float approx =  sarea / expected_new_area;
+        float approx = __fdividef(sarea, expected_new_area);
 
         dprintf("%d: new rect: %d box %d %d %d %d  rel areas %f %f\n",threadIdx.x, (node.threshold >> 28),
         scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w, farea * relScale * relScale, sarea);
@@ -198,12 +198,13 @@ namespace icf {
 //     }
 
     __global__ void test_kernel_warp(const Level* levels, const Octave* octaves, const float* stages,
-        const Node* nodes, const float* leaves, Detection* objects, const uint ndetections, uint* ctr)
+        const Node* nodes, const float* leaves, Detection* objects, const uint ndetections, uint* ctr,
+        const int downscales)
     {
         const int y = blockIdx.y * blockDim.y + threadIdx.y;
         const int x = blockIdx.x;
 
-        Level level = levels[blockIdx.z];
+        Level level = levels[downscales + blockIdx.z];
 
         if(x >= level.workRect.x || y >= level.workRect.y) return;
 
@@ -236,7 +237,7 @@ namespace icf {
             dprintf("%d: decided: %d (%d >= %f) %d %f\n\n" ,threadIdx.x, next, sum, threshold, lShift, impact);
             dprintf("%d: extracted stage: %f\n",threadIdx.x, stages[(st + threadIdx.x)]);
             dprintf("%d: computed  score: %f\n",threadIdx.x, impact);
-
+#pragma unroll
             // scan on shuffl functions
             for (int i = 1; i < 32; i *= 2)
             {
@@ -263,13 +264,13 @@ namespace icf {
 
     void detect(const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages,
                 const PtrStepSzb& nodes,  const PtrStepSzf& leaves,  const PtrStepSzi& hogluv,
-                PtrStepSz<uchar4> objects, PtrStepSzi counter)
+                PtrStepSz<uchar4> objects, PtrStepSzi counter, const int downscales)
     {
         int fw = 160;
         int fh = 120;
 
         dim3 block(32, 8);
-        dim3 grid(fw, fh / 8, 47);
+        dim3 grid(fw, fh / 8, downscales);
 
         const Level* l = (const Level*)levels.ptr();
         const Octave* oct = ((const Octave*)octaves.ptr());
@@ -283,8 +284,38 @@ namespace icf {
         cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>();
         cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step));
 
-        test_kernel_warp<<<grid, block>>>(l, oct, st, nd, lf, det, max_det, ctr);
+        test_kernel_warp<<<grid, block>>>(l, oct, st, nd, lf, det, max_det, ctr, 0);
+        cudaSafeCall( cudaGetLastError());
 
+        grid = dim3(fw, fh / 8, 47 - downscales);
+        test_kernel_warp<<<grid, block>>>(l, oct, st, nd, lf, det, max_det, ctr, downscales);
+        cudaSafeCall( cudaGetLastError());
+        cudaSafeCall( cudaDeviceSynchronize());
+    }
+
+    void detectAtScale(const int scale, const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages,
+        const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv, PtrStepSz<uchar4> objects,
+        PtrStepSzi counter)
+    {
+        int fw = 160;
+        int fh = 120;
+
+        dim3 block(32, 8);
+        dim3 grid(fw, fh / 8, 1);
+
+        const Level* l = (const Level*)levels.ptr();
+        const Octave* oct = ((const Octave*)octaves.ptr());
+        const float* st = (const float*)stages.ptr();
+        const Node* nd = (const Node*)nodes.ptr();
+        const float* lf = (const float*)leaves.ptr();
+        uint* ctr = (uint*)counter.ptr();
+        Detection* det = (Detection*)objects.ptr();
+        uint max_det = objects.cols / sizeof(Detection);
+
+        cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>();
+        cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step));
+
+        test_kernel_warp<<<grid, block>>>(l, oct, st, nd, lf, det, max_det, ctr, scale);
         cudaSafeCall( cudaGetLastError());
         cudaSafeCall( cudaDeviceSynchronize());
     }
diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index 320fbb343..fd94909cf 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -49,7 +49,11 @@ cv::gpu::SoftCascade::SoftCascade() : filds(0) { throw_nogpu(); }
 cv::gpu::SoftCascade::SoftCascade( const string&, const float, const float) : filds(0) { throw_nogpu(); }
 cv::gpu::SoftCascade::~SoftCascade() { throw_nogpu(); }
 bool cv::gpu::SoftCascade::load( const string&, const float, const float) { throw_nogpu(); return false; }
-void cv::gpu::SoftCascade::detectMultiScale(const GpuMat&, const GpuMat&, GpuMat&, const int, Stream) { throw_nogpu();}
+void cv::gpu::SoftCascade::detectMultiScale(const GpuMat&, const GpuMat&, GpuMat&, const int, int) { throw_nogpu();}
+void cv::gpu::SoftCascade::detectMultiScale(const GpuMat&, const GpuMat&, GpuMat&, int, GpuMat&, Stream)
+{
+    throw_nogpu();
+}
 
 #else
 
@@ -60,6 +64,9 @@ namespace icf {
     void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle,
         const int fw, const int fh, const int bins);
     void detect(const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages,
+        const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv, PtrStepSz<uchar4> objects,
+        PtrStepSzi counter, const int downscales);
+    void detectAtScale(const int scale, const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages,
         const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv, PtrStepSz<uchar4> objects,
         PtrStepSzi counter);
 }
@@ -86,6 +93,8 @@ struct cv::gpu::SoftCascade::Filds
     int origObjWidth;
     int origObjHeight;
 
+    int downscales;
+
     GpuMat octaves;
     GpuMat stages;
     GpuMat nodes;
@@ -120,7 +129,6 @@ struct cv::gpu::SoftCascade::Filds
         FRAME_WIDTH        = 640,
         FRAME_HEIGHT       = 480,
         TOTAL_SCALES       = 55,
-//         CLASSIFIERS        = 5,
         ORIG_OBJECT_WIDTH  = 64,
         ORIG_OBJECT_HEIGHT = 128,
         HOG_BINS           = 6,
@@ -132,7 +140,14 @@ struct cv::gpu::SoftCascade::Filds
     void detect(cv::gpu::GpuMat objects, cudaStream_t stream) const
     {
         cudaMemset(detCounter.data, 0, detCounter.step * detCounter.rows * sizeof(int));
-        device::icf::detect(levels, octaves, stages, nodes, leaves, hogluv, objects , detCounter);
+        device::icf::detect(levels, octaves, stages, nodes, leaves, hogluv, objects , detCounter, downscales);
+    }
+
+    void detectAtScale(int scale, cv::gpu::GpuMat objects, cudaStream_t stream) const
+    {
+        cudaMemset(detCounter.data, 0, detCounter.step * detCounter.rows * sizeof(int));
+        device::icf::detectAtScale(scale, levels, octaves, stages, nodes, leaves, hogluv, objects,
+            detCounter);
     }
 
 private:
@@ -160,7 +175,7 @@ private:
     }
 };
 
-inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float mins, const float maxs)
+bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float mins, const float maxs)
 {
     using namespace device::icf;
     minScale = mins;
@@ -351,6 +366,7 @@ inline void cv::gpu::SoftCascade::Filds::calcLevels(const std::vector<device::ic
     float logFactor = (::log(maxScale) - ::log(minScale)) / (nscales -1);
 
     float scale = minScale;
+    downscales = 0;
     for (int sc = 0; sc < nscales; ++sc)
     {
         int width  = ::std::max(0.0f, frameW - (origObjWidth  * scale));
@@ -366,7 +382,10 @@ inline void cv::gpu::SoftCascade::Filds::calcLevels(const std::vector<device::ic
         if (!width || !height)
             break;
         else
+        {
             vlevels.push_back(level);
+            if (octs[fit].scale < 1) ++downscales;
+        }
 
         if (::fabs(scale - maxScale) < FLT_EPSILON) break;
         scale = ::std::min(maxScale, ::expf(::log(scale) + logFactor));
@@ -424,8 +443,11 @@ namespace {
         return s;
     }
 }
+
+//================================== synchronous version ============================================================//
+
 void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat& /*rois*/,
-                                GpuMat& objects, const int /*rejectfactor*/, Stream s)
+                                GpuMat& objects, const int /*rejectfactor*/, int specificScale)
 {
     // only color images are supperted
     CV_Assert(colored.type() == CV_8UC3);
@@ -513,11 +535,21 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat&
     }
 #endif
 
-    cudaStream_t stream = StreamAccessor::getStream(s);
-    flds.detect(objects, stream);
+    if (specificScale == -1)
+        flds.detect(objects, 0);
+    else
+        flds.detectAtScale(specificScale, objects, 0);
 
-        //     cv::Mat out(flds.detCounter);
-        // std::cout << out << std::endl;
+    cv::Mat out(flds.detCounter);
+    int ndetections = *(out.data);
+
+    objects = GpuMat(objects, cv::Rect(0, 0, ndetections * sizeof(Detection), 1));
 }
 
+void cv::gpu::SoftCascade::detectMultiScale(const GpuMat&, const GpuMat&, GpuMat&, int, GpuMat&, Stream)
+{
+    // cudaStream_t stream = StreamAccessor::getStream(s);
+}
+
+
 #endif
\ No newline at end of file

From 7db1323f81fbc9b3de83c6127a36275470060e6b Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Mon, 1 Oct 2012 13:49:21 +0400
Subject: [PATCH 28/74] add test that stores detections on the specific scale

---
 modules/gpu/test/test_softcascade.cpp | 59 +++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)

diff --git a/modules/gpu/test/test_softcascade.cpp b/modules/gpu/test/test_softcascade.cpp
index c7e3a1f77..0da07298d 100644
--- a/modules/gpu/test/test_softcascade.cpp
+++ b/modules/gpu/test/test_softcascade.cpp
@@ -71,4 +71,63 @@ TEST(SoftCascade, detect)
     // });
 }
 
+class SCSpecific : public ::testing::TestWithParam<std::tr1::tuple<std::string, int> > {
+};
+
+namespace {
+std::string itoa(long i)
+{
+    static char s[65];
+    sprintf(s, "%ld", i);
+    return std::string(s);
+}
+}
+
+TEST_P(SCSpecific, detect)
+{
+    std::string xml =  cvtest::TS::ptr()->get_data_path() + "../cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml";
+    cv::gpu::SoftCascade cascade;
+    ASSERT_TRUE(cascade.load(xml));
+
+    std::string path = GET_PARAM(0);
+    cv::Mat coloredCpu = cv::imread(cvtest::TS::ptr()->get_data_path() + path);
+
+    ASSERT_FALSE(coloredCpu.empty());
+    GpuMat colored(coloredCpu), objectBoxes(1, 1000, CV_8UC1), rois;
+
+    int level = GET_PARAM(1);
+    cascade.detectMultiScale(colored, rois, objectBoxes, 1, level);
+
+    cv::Mat dt(objectBoxes);
+    typedef cv::gpu::SoftCascade::Detection detection_t;
+
+    detection_t* dts = (detection_t*)dt.data;
+    cv::Mat result(coloredCpu);
+
+
+    std::cout << "Total detections " << (dt.cols / sizeof(detection_t)) << std::endl;
+    for(int i = 0; i  < (int)(dt.cols / sizeof(detection_t)); ++i)
+    {
+        detection_t d = dts[i];
+        std::cout << "detection: [" << std::setw(4) << d.x << " " << std::setw(4) << d.y
+                  << "] [" << std::setw(4) << d.w << " " << std::setw(4) << d.h << "] "
+                  << std::setw(12)  << d.confidence << std::endl;
+
+        cv::rectangle(result, cv::Rect(d.x, d.y, d.w, d.h), cv::Scalar(255, 0, 0, 255), 1);
+    }
+
+    std::cout << "Result stored in " << "/home/kellan/gpu_res_1_oct_" + itoa(level) << "_"
+    + itoa((dt.cols / sizeof(detection_t))) + ".png" << std::endl;
+    cv::imwrite("/home/kellan/gpu_res_1_oct_" + itoa(level) + "_" + itoa((dt.cols / sizeof(detection_t))) + ".png",
+        result);
+    cv::imshow("res", result);
+    cv::waitKey(0);
+}
+
+INSTANTIATE_TEST_CASE_P(inLevel, SCSpecific,
+    testing::Combine(
+        testing::Values(std::string("../cv/cascadeandhog/bahnhof/image_00000000_0.png")),
+        testing::Range(0, 47)
+        ));
+
 #endif
\ No newline at end of file

From 56c7ef06e7a5e40e65c6a4d14775aa8defbc2dfb Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Mon, 1 Oct 2012 14:08:48 +0400
Subject: [PATCH 29/74] integrate Kepler version

---
 modules/gpu/src/cuda/isf-sc.cu | 120 +++++++++++++++++----------------
 1 file changed, 61 insertions(+), 59 deletions(-)

diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu
index 3d9a1e10f..cc4502494 100644
--- a/modules/gpu/src/cuda/isf-sc.cu
+++ b/modules/gpu/src/cuda/isf-sc.cu
@@ -138,65 +138,7 @@ namespace icf {
         return (a - b + c - d);
     }
 
-//     __global__ void test_kernel(const Level* levels, const Octave* octaves, const float* stages,
-//         const Node* nodes, const float* leaves, Detection* objects, const uint ndetections, uint* ctr)
-//     {
-//         const int y = blockIdx.y * blockDim.y + threadIdx.y;
-//         const int x = blockIdx.x * blockDim.x + threadIdx.x;
-//         Level level = levels[blockIdx.z];
-
-//         // if (blockIdx.z != 31) return;
-//         if(x >= level.workRect.x || y >= level.workRect.y) return;
-
-//         Octave octave = octaves[level.octave];
-
-//         int st = octave.index * octave.stages;
-//         const int stEnd = st + 1000;//octave.stages;
-
-//         float confidence = 0.f;
-
-// // #pragma unroll 2
-//         for(; st < stEnd; ++st)
-//         {
-//             dprintf("\n\nstage: %d\n", st);
-//             const int nId = st * 3;
-//             Node node = nodes[nId];
-
-//             dprintf("Node: [%d %d %d %d] %d %d\n", node.rect.x, node.rect.y, node.rect.z, node.rect.w,
-//                 node.threshold >> 28, node.threshold & 0x0FFFFFFFU);
-
-//             float threshold = rescale(level, node.rect, node);
-//             int sum = get(x, y + (node.threshold >> 28) * 121, node.rect);
-
-//             dprintf("Node: [%d %d %d %d] %f\n", node.rect.x, node.rect.y, node.rect.z,
-//                 node.rect.w, threshold);
-
-//             int next = 1 + (int)(sum >= threshold);
-//             dprintf("go: %d (%d >= %f)\n\n" ,next, sum, threshold);
-
-//             node = nodes[nId + next];
-//             threshold = rescale(level, node.rect, node);
-//             sum = get(x, y + (node.threshold >> 28) * 121, node.rect);
-
-//             const int lShift = (next - 1) * 2 + (int)(sum >= threshold);
-//             float impact = leaves[st * 4 + lShift];
-//             confidence += impact;
-
-//             if (confidence <= stages[st]) st = stEnd + 10;
-//             dprintf("decided: %d (%d >= %f) %d %f\n\n" ,next, sum, threshold, lShift, impact);
-//             dprintf("extracted stage: %f\n", stages[st]);
-//             dprintf("computed  score: %f\n\n", confidence);
-//         }
-
-//         if(st == stEnd)
-//         {
-//             int idx = atomicInc(ctr, ndetections);
-//             // store detection
-//             objects[idx] = Detection(__float2int_rn(x * octave.shrinkage),
-//                 __float2int_rn(y * octave.shrinkage), level.objSize.x, level.objSize.y, confidence);
-//         }
-//     }
-
+#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
     __global__ void test_kernel_warp(const Level* levels, const Octave* octaves, const float* stages,
         const Node* nodes, const float* leaves, Detection* objects, const uint ndetections, uint* ctr,
         const int downscales)
@@ -261,6 +203,66 @@ namespace icf {
                 __float2int_rn(y * octave.shrinkage), level.objSize.x, level.objSize.y, confidence);
         }
     }
+#else
+    __global__ void test_kernel_warp(const Level* levels, const Octave* octaves, const float* stages,
+        const Node* nodes, const float* leaves, Detection* objects, const uint ndetections, uint* ctr,
+        const int downscales)
+    {
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        Level level = levels[blockIdx.z];
+
+        // if (blockIdx.z != 31) return;
+        if(x >= level.workRect.x || y >= level.workRect.y) return;
+
+        Octave octave = octaves[level.octave];
+
+        int st = octave.index * octave.stages;
+        const int stEnd = st + 1000;//octave.stages;
+
+        float confidence = 0.f;
+
+        for(; st < stEnd; ++st)
+        {
+            dprintf("\n\nstage: %d\n", st);
+            const int nId = st * 3;
+            Node node = nodes[nId];
+
+            dprintf("Node: [%d %d %d %d] %d %d\n", node.rect.x, node.rect.y, node.rect.z, node.rect.w,
+                node.threshold >> 28, node.threshold & 0x0FFFFFFFU);
+
+            float threshold = rescale(level, node.rect, node);
+            int sum = get(x, y + (node.threshold >> 28) * 121, node.rect);
+
+            dprintf("Node: [%d %d %d %d] %f\n", node.rect.x, node.rect.y, node.rect.z,
+                node.rect.w, threshold);
+
+            int next = 1 + (int)(sum >= threshold);
+            dprintf("go: %d (%d >= %f)\n\n" ,next, sum, threshold);
+
+            node = nodes[nId + next];
+            threshold = rescale(level, node.rect, node);
+            sum = get(x, y + (node.threshold >> 28) * 121, node.rect);
+
+            const int lShift = (next - 1) * 2 + (int)(sum >= threshold);
+            float impact = leaves[st * 4 + lShift];
+            confidence += impact;
+
+            if (confidence <= stages[st]) st = stEnd + 10;
+            dprintf("decided: %d (%d >= %f) %d %f\n\n" ,next, sum, threshold, lShift, impact);
+            dprintf("extracted stage: %f\n", stages[st]);
+            dprintf("computed  score: %f\n\n", confidence);
+        }
+
+        if(st == stEnd)
+        {
+            int idx = atomicInc(ctr, ndetections);
+            // store detection
+            objects[idx] = Detection(__float2int_rn(x * octave.shrinkage),
+                __float2int_rn(y * octave.shrinkage), level.objSize.x, level.objSize.y, confidence);
+        }
+    }
+#endif
 
     void detect(const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages,
                 const PtrStepSzb& nodes,  const PtrStepSzf& leaves,  const PtrStepSzi& hogluv,

From 672cf1f44576bb82f1202c0c15c947b47469304b Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Mon, 1 Oct 2012 14:50:28 +0400
Subject: [PATCH 30/74] implement different behaviour for up- and down-scaling

---
 modules/gpu/src/cuda/isf-sc.cu        | 98 +++++++++++++++++++++++----
 modules/gpu/src/softcascade.cpp       | 13 +++-
 modules/gpu/test/test_softcascade.cpp |  2 +-
 3 files changed, 95 insertions(+), 18 deletions(-)

diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu
index cc4502494..c9a92e379 100644
--- a/modules/gpu/src/cuda/isf-sc.cu
+++ b/modules/gpu/src/cuda/isf-sc.cu
@@ -86,8 +86,11 @@ namespace icf {
     }
 
     texture<int,  cudaTextureType2D, cudaReadModeElementType> thogluv;
-    __device__ __forceinline__ float rescale(const Level& level, uchar4& scaledRect, const Node& node)
+
+    template<bool isUp>
+    __device__ __forceinline__ float rescale(const Level& level, Node& node)
     {
+        uchar4& scaledRect = node.rect;
         float relScale = level.relScale;
         float farea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y);
 
@@ -119,7 +122,44 @@ namespace icf {
         return rootThreshold;
     }
 
-    __device__ __forceinline__ int get(const int x, int y, uchar4 area)
+    template<>
+    __device__ __forceinline__ float rescale<true>(const Level& level, Node& node)
+    {
+        uchar4& scaledRect = node.rect;
+        float relScale = level.relScale;
+        float farea = scaledRect.z * scaledRect.w;
+
+        dprintf("%d: feature %d box %d %d %d %d\n",threadIdx.x, (node.threshold >> 28), scaledRect.x, scaledRect.y,
+            scaledRect.z, scaledRect.w);
+        dprintf("%d: rescale: %f [%f %f] selected %f\n",threadIdx.x, level.relScale, level.scaling[0], level.scaling[1],
+            level.scaling[(node.threshold >> 28) > 6]);
+
+        // rescale
+        scaledRect.x = __float2int_rn(relScale * scaledRect.x);
+        scaledRect.y = __float2int_rn(relScale * scaledRect.y);
+        scaledRect.z = __float2int_rn(relScale * scaledRect.z);
+        scaledRect.w = __float2int_rn(relScale * scaledRect.w);
+
+        float sarea = scaledRect.z * scaledRect.w;
+
+        const float expected_new_area = farea * relScale * relScale;
+        float approx = __fdividef(sarea, expected_new_area);
+
+        dprintf("%d: new rect: %d box %d %d %d %d  rel areas %f %f\n",threadIdx.x, (node.threshold >> 28),
+        scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w, farea * relScale * relScale, sarea);
+
+        float rootThreshold = (node.threshold & 0x0FFFFFFFU) * approx;
+
+        rootThreshold *= level.scaling[(node.threshold >> 28) > 6];
+
+        dprintf("%d: approximation %f %d -> %f %f\n",threadIdx.x, approx, (node.threshold & 0x0FFFFFFFU), rootThreshold,
+            level.scaling[(node.threshold >> 28) > 6]);
+
+        return rootThreshold;
+    }
+
+    template<bool isUp>
+    __device__ __forceinline__ int get(int x, int y, uchar4 area)
     {
 
         dprintf("%d: feature box %d %d %d %d\n",threadIdx.x, area.x, area.y, area.z, area.w);
@@ -138,7 +178,30 @@ namespace icf {
         return (a - b + c - d);
     }
 
+    template<>
+    __device__ __forceinline__ int get<true>(int x, int y, uchar4 area)
+    {
+
+        dprintf("%d: feature box %d %d %d %d\n",threadIdx.x, area.x, area.y, area.z, area.w);
+        dprintf("%d: extract feature for: [%d %d] [%d %d] [%d %d] [%d %d]\n",threadIdx.x,
+            x + area.x, y + area.y,  x + area.z, y + area.y,  x + area.z,y + area.w,
+            x + area.x, y + area.w);
+        dprintf("%d: at point %d %d with offset %d\n", x, y, 0);
+
+        x += area.x;
+        y += area.y;
+        int a = tex2D(thogluv, x, y);
+        int b = tex2D(thogluv, x + area.z, y);
+        int c = tex2D(thogluv, x + area.z, y + area.w);
+        int d = tex2D(thogluv, x, y + area.w);
+
+        dprintf("%d   retruved integral values: %d %d %d %d\n",threadIdx.x, a, b, c, d);
+
+        return (a - b + c - d);
+    }
+
 #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
+    template<bool isUp>
     __global__ void test_kernel_warp(const Level* levels, const Octave* octaves, const float* stages,
         const Node* nodes, const float* leaves, Detection* objects, const uint ndetections, uint* ctr,
         const int downscales)
@@ -163,15 +226,15 @@ namespace icf {
             dprintf("\n\n%d: stage: %d %d\n",threadIdx.x, st, nId);
             Node node = nodes[nId];
 
-            float threshold = rescale(level, node.rect, node);
-            int sum = get(x, y + (node.threshold >> 28) * 121, node.rect);
+            float threshold = rescale<isUp>(level, node);
+            int sum = get<isUp>(x, y + (node.threshold >> 28) * 121, node.rect);
 
             int next = 1 + (int)(sum >= threshold);
             dprintf("%d: go: %d (%d >= %f)\n\n" ,threadIdx.x, next, sum, threshold);
 
             node = nodes[nId + next];
-            threshold = rescale(level, node.rect, node);
-            sum = get(x, y + (node.threshold >> 28) * 121, node.rect);
+            threshold = rescale<isUp>(level, node);
+            sum = get<isUp>(x, y + (node.threshold >> 28) * 121, node.rect);
 
             const int lShift = (next - 1) * 2 + (int)(sum >= threshold);
             float impact = leaves[(st + threadIdx.x) * 4 + lShift];
@@ -192,7 +255,7 @@ namespace icf {
             dprintf("%d: impact scaned %f\n" ,threadIdx.x, impact);
 
             confidence += impact;
-            if(__any((confidence <= stages[(st + threadIdx.x)]))) break;
+            if(__any((confidence <= stages[(st + threadIdx.x)]))) st += stEnd;
         }
 
         if(st == stEnd && !threadIdx.x)
@@ -204,6 +267,7 @@ namespace icf {
         }
     }
 #else
+    template<bool isUp>
     __global__ void test_kernel_warp(const Level* levels, const Octave* octaves, const float* stages,
         const Node* nodes, const float* leaves, Detection* objects, const uint ndetections, uint* ctr,
         const int downscales)
@@ -231,8 +295,8 @@ namespace icf {
             dprintf("Node: [%d %d %d %d] %d %d\n", node.rect.x, node.rect.y, node.rect.z, node.rect.w,
                 node.threshold >> 28, node.threshold & 0x0FFFFFFFU);
 
-            float threshold = rescale(level, node.rect, node);
-            int sum = get(x, y + (node.threshold >> 28) * 121, node.rect);
+            float threshold = rescale<isUp>(level, node);
+            int sum = get<isUp>(x, y + (node.threshold >> 28) * 121, node.rect);
 
             dprintf("Node: [%d %d %d %d] %f\n", node.rect.x, node.rect.y, node.rect.z,
                 node.rect.w, threshold);
@@ -241,8 +305,8 @@ namespace icf {
             dprintf("go: %d (%d >= %f)\n\n" ,next, sum, threshold);
 
             node = nodes[nId + next];
-            threshold = rescale(level, node.rect, node);
-            sum = get(x, y + (node.threshold >> 28) * 121, node.rect);
+            threshold = rescale<isUp>(level, node);
+            sum = get<isUp>(x, y + (node.threshold >> 28) * 121, node.rect);
 
             const int lShift = (next - 1) * 2 + (int)(sum >= threshold);
             float impact = leaves[st * 4 + lShift];
@@ -286,18 +350,18 @@ namespace icf {
         cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>();
         cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step));
 
-        test_kernel_warp<<<grid, block>>>(l, oct, st, nd, lf, det, max_det, ctr, 0);
+        test_kernel_warp<false><<<grid, block>>>(l, oct, st, nd, lf, det, max_det, ctr, 0);
         cudaSafeCall( cudaGetLastError());
 
         grid = dim3(fw, fh / 8, 47 - downscales);
-        test_kernel_warp<<<grid, block>>>(l, oct, st, nd, lf, det, max_det, ctr, downscales);
+        test_kernel_warp<true><<<grid, block>>>(l, oct, st, nd, lf, det, max_det, ctr, downscales);
         cudaSafeCall( cudaGetLastError());
         cudaSafeCall( cudaDeviceSynchronize());
     }
 
     void detectAtScale(const int scale, const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages,
         const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv, PtrStepSz<uchar4> objects,
-        PtrStepSzi counter)
+        PtrStepSzi counter, const int downscales)
     {
         int fw = 160;
         int fh = 120;
@@ -317,7 +381,11 @@ namespace icf {
         cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>();
         cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step));
 
-        test_kernel_warp<<<grid, block>>>(l, oct, st, nd, lf, det, max_det, ctr, scale);
+        if (scale >= downscales)
+            test_kernel_warp<true><<<grid, block>>>(l, oct, st, nd, lf, det, max_det, ctr, scale);
+        else
+            test_kernel_warp<false><<<grid, block>>>(l, oct, st, nd, lf, det, max_det, ctr, scale);
+
         cudaSafeCall( cudaGetLastError());
         cudaSafeCall( cudaDeviceSynchronize());
     }
diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index fd94909cf..8868aa5b1 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -68,7 +68,7 @@ namespace icf {
         PtrStepSzi counter, const int downscales);
     void detectAtScale(const int scale, const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages,
         const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv, PtrStepSz<uchar4> objects,
-        PtrStepSzi counter);
+        PtrStepSzi counter, const int downscales);
 }
 }}}
 
@@ -147,7 +147,7 @@ struct cv::gpu::SoftCascade::Filds
     {
         cudaMemset(detCounter.data, 0, detCounter.step * detCounter.rows * sizeof(int));
         device::icf::detectAtScale(scale, levels, octaves, stages, nodes, leaves, hogluv, objects,
-            detCounter);
+            detCounter, downscales);
     }
 
 private:
@@ -240,6 +240,9 @@ bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float mins, c
     {
         FileNode fns = *it;
         float scale = (float)fns[SC_OCT_SCALE];
+
+        bool isUPOctave = scale >= 1;
+
         scales.push_back(scale);
         ushort nstages = saturate_cast<ushort>((int)fns[SC_OCT_STAGES]);
         ushort2 size;
@@ -286,6 +289,12 @@ bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float mins, c
                     rect.z = saturate_cast<uchar>((int)*(r_it++));
                     rect.w = saturate_cast<uchar>((int)*(r_it++));
 
+                    if (isUPOctave)
+                    {
+                        rect.z -= rect.x;
+                        rect.w -= rect.y;
+                    }
+
                     uint channel = saturate_cast<uint>((int)(*ftrs)[SC_F_CHANNEL]);
                     vnodes.push_back(Node(rect, channel, th));
                     ++ftrs;
diff --git a/modules/gpu/test/test_softcascade.cpp b/modules/gpu/test/test_softcascade.cpp
index 0da07298d..4d1a4b7a6 100644
--- a/modules/gpu/test/test_softcascade.cpp
+++ b/modules/gpu/test/test_softcascade.cpp
@@ -63,7 +63,7 @@ TEST(SoftCascade, detect)
     cv::Mat coloredCpu = cv::imread(cvtest::TS::ptr()->get_data_path()
         + "../cv/cascadeandhog/bahnhof/image_00000000_0.png");
     ASSERT_FALSE(coloredCpu.empty());
-    GpuMat colored(coloredCpu), objectBoxes(1, 1000, CV_8UC1), rois;
+    GpuMat colored(coloredCpu), objectBoxes(1, 100000, CV_8UC1), rois;
 
     // ASSERT_NO_THROW(
     // {

From 64d6e6a48d4580964a8ce8b589a94e973804c3b9 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Tue, 2 Oct 2012 16:45:40 +0400
Subject: [PATCH 31/74] add getROISize

---
 modules/gpu/include/opencv2/gpu/gpu.hpp |  6 ++++--
 modules/gpu/src/softcascade.cpp         | 21 ++++++++++++++++-----
 2 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index f171ad904..9b59c6004 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -1572,7 +1572,7 @@ public:
     //! Param objects 4-channel matrix thet contain detected rectangles
     //! Param rejectfactor used for final object box computing
     virtual void detectMultiScale(const GpuMat& image, const GpuMat& rois, GpuMat& objects,
-    int rejectfactor = 1, int specificScale = -1);
+    int rejectfactor = 1, int specificScale = -1) const;
 
     //! detect specific objects on in the input frame for all scales computed flom minScale and maxscale values.
     //! asynchronous version.
@@ -1583,7 +1583,9 @@ public:
     //! Param ndet retrieves number of detections
     //! Param stream wrapper for CUDA stream
     virtual void detectMultiScale(const GpuMat& image, const GpuMat& rois, GpuMat& objects,
-    int rejectfactor, GpuMat& ndet, Stream stream);
+    int rejectfactor, GpuMat& ndet, Stream stream) const;
+
+    cv::Size getRoiSize() const;
 
 private:
     struct Filds;
diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index 8868aa5b1..af836695a 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -49,12 +49,18 @@ cv::gpu::SoftCascade::SoftCascade() : filds(0) { throw_nogpu(); }
 cv::gpu::SoftCascade::SoftCascade( const string&, const float, const float) : filds(0) { throw_nogpu(); }
 cv::gpu::SoftCascade::~SoftCascade() { throw_nogpu(); }
 bool cv::gpu::SoftCascade::load( const string&, const float, const float) { throw_nogpu(); return false; }
-void cv::gpu::SoftCascade::detectMultiScale(const GpuMat&, const GpuMat&, GpuMat&, const int, int) { throw_nogpu();}
-void cv::gpu::SoftCascade::detectMultiScale(const GpuMat&, const GpuMat&, GpuMat&, int, GpuMat&, Stream)
+void cv::gpu::SoftCascade::detectMultiScale(const GpuMat&, const GpuMat&, GpuMat&, const int, int) const
 {
     throw_nogpu();
 }
 
+void cv::gpu::SoftCascade::detectMultiScale(const GpuMat&, const GpuMat&, GpuMat&, int, GpuMat&, Stream) const
+{
+    throw_nogpu();
+}
+
+cv::Size cv::gpu::SoftCascade::getRoiSize() const { throw_nogpu(); return cv::Size();}
+
 #else
 
 #include <icf.hpp>
@@ -455,8 +461,8 @@ namespace {
 
 //================================== synchronous version ============================================================//
 
-void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat& /*rois*/,
-                                GpuMat& objects, const int /*rejectfactor*/, int specificScale)
+void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat& rois,
+                                GpuMat& objects, const int /*rejectfactor*/, int specificScale) const
 {
     // only color images are supperted
     CV_Assert(colored.type() == CV_8UC3);
@@ -555,10 +561,15 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat&
     objects = GpuMat(objects, cv::Rect(0, 0, ndetections * sizeof(Detection), 1));
 }
 
-void cv::gpu::SoftCascade::detectMultiScale(const GpuMat&, const GpuMat&, GpuMat&, int, GpuMat&, Stream)
+void cv::gpu::SoftCascade::detectMultiScale(const GpuMat&, const GpuMat&, GpuMat&, int, GpuMat&, Stream) const
 {
     // cudaStream_t stream = StreamAccessor::getStream(s);
 }
 
+cv::Size cv::gpu::SoftCascade::getRoiSize() const
+{
+    return cv::Size(Filds::FRAME_WIDTH / 4, Filds::FRAME_HEIGHT / 4);
+}
+
 
 #endif
\ No newline at end of file

From eb91593c08daca478663d7d0805f0ce664fd5d9b Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Tue, 2 Oct 2012 17:25:26 +0400
Subject: [PATCH 32/74] add roi support

---
 modules/gpu/perf/perf_objdetect.cpp   |  6 +++-
 modules/gpu/src/cuda/isf-sc.cu        | 21 ++++++++++---
 modules/gpu/src/softcascade.cpp       | 44 +++++++++++++++++++--------
 modules/gpu/test/test_softcascade.cpp | 16 ++++++----
 4 files changed, 64 insertions(+), 23 deletions(-)

diff --git a/modules/gpu/perf/perf_objdetect.cpp b/modules/gpu/perf/perf_objdetect.cpp
index 8531372b0..a86337112 100644
--- a/modules/gpu/perf/perf_objdetect.cpp
+++ b/modules/gpu/perf/perf_objdetect.cpp
@@ -104,7 +104,11 @@ PERF_TEST_P(SoftCascade, detect, Values<pair_string>(make_pair("cv/cascadeandhog
         cv::gpu::SoftCascade cascade;
         ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath(GetParam().first)));
 
-        cv::gpu::GpuMat rois, objectBoxes(1, 16384, CV_8UC1);
+        cv::gpu::GpuMat objectBoxes(1, 16384, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1);
+
+        rois.setTo(0);
+        cv::gpu::GpuMat sub(rois, cv::Rect(rois.cols / 4, rois.rows / 4,rois.cols / 2, rois.rows / 2));
+        sub.setTo(cv::Scalar::all(1));
         cascade.detectMultiScale(colored, rois, objectBoxes);
 
         TEST_CYCLE()
diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu
index c9a92e379..4bde7f7ea 100644
--- a/modules/gpu/src/cuda/isf-sc.cu
+++ b/modules/gpu/src/cuda/isf-sc.cu
@@ -86,6 +86,7 @@ namespace icf {
     }
 
     texture<int,  cudaTextureType2D, cudaReadModeElementType> thogluv;
+    texture<char,  cudaTextureType2D, cudaReadModeElementType> troi;
 
     template<bool isUp>
     __device__ __forceinline__ float rescale(const Level& level, Node& node)
@@ -213,6 +214,8 @@ namespace icf {
 
         if(x >= level.workRect.x || y >= level.workRect.y) return;
 
+        if (!tex2D(troi, x, y)) return;
+
         Octave octave = octaves[level.octave];
         int st = octave.index * octave.stages;
         const int stEnd = st + 1024;
@@ -279,6 +282,10 @@ namespace icf {
         // if (blockIdx.z != 31) return;
         if(x >= level.workRect.x || y >= level.workRect.y) return;
 
+        int roi = tex2D(troi, x, y);
+        printf("%d\n", roi);
+        if (!roi) return;
+
         Octave octave = octaves[level.octave];
 
         int st = octave.index * octave.stages;
@@ -328,7 +335,7 @@ namespace icf {
     }
 #endif
 
-    void detect(const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages,
+    void detect(const PtrStepSzb& roi, const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages,
                 const PtrStepSzb& nodes,  const PtrStepSzf& leaves,  const PtrStepSzi& hogluv,
                 PtrStepSz<uchar4> objects, PtrStepSzi counter, const int downscales)
     {
@@ -350,6 +357,9 @@ namespace icf {
         cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>();
         cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step));
 
+        cudaChannelFormatDesc desc_roi = cudaCreateChannelDesc<char>();
+        cudaSafeCall( cudaBindTexture2D(0, troi, roi.data, desc_roi, roi.cols, roi.rows, roi.step));
+
         test_kernel_warp<false><<<grid, block>>>(l, oct, st, nd, lf, det, max_det, ctr, 0);
         cudaSafeCall( cudaGetLastError());
 
@@ -359,9 +369,9 @@ namespace icf {
         cudaSafeCall( cudaDeviceSynchronize());
     }
 
-    void detectAtScale(const int scale, const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages,
-        const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv, PtrStepSz<uchar4> objects,
-        PtrStepSzi counter, const int downscales)
+    void detectAtScale(const int scale, const PtrStepSzb& roi, const PtrStepSzb& levels, const PtrStepSzb& octaves,
+        const PtrStepSzf& stages, const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv,
+        PtrStepSz<uchar4> objects, PtrStepSzi counter, const int downscales)
     {
         int fw = 160;
         int fh = 120;
@@ -381,6 +391,9 @@ namespace icf {
         cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>();
         cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step));
 
+        cudaChannelFormatDesc desc_roi = cudaCreateChannelDesc<char>();
+        cudaSafeCall( cudaBindTexture2D(0, troi, roi.data, desc_roi, roi.cols, roi.rows, roi.step));
+
         if (scale >= downscales)
             test_kernel_warp<true><<<grid, block>>>(l, oct, st, nd, lf, det, max_det, ctr, scale);
         else
diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index af836695a..9ea365c5e 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -69,12 +69,29 @@ namespace cv { namespace gpu { namespace device {
 namespace icf {
     void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle,
         const int fw, const int fh, const int bins);
-    void detect(const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages,
-        const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv, PtrStepSz<uchar4> objects,
-        PtrStepSzi counter, const int downscales);
-    void detectAtScale(const int scale, const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages,
-        const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv, PtrStepSz<uchar4> objects,
-        PtrStepSzi counter, const int downscales);
+
+    void detect(const PtrStepSzb& rois,
+                const PtrStepSzb& levels,
+                const PtrStepSzb& octaves,
+                const PtrStepSzf& stages,
+                const PtrStepSzb& nodes,
+                const PtrStepSzf& leaves,
+                const PtrStepSzi& hogluv,
+                PtrStepSz<uchar4> objects,
+                PtrStepSzi counter,
+                const int downscales);
+
+    void detectAtScale(const int scale,
+                       const PtrStepSzb& rois,
+                       const PtrStepSzb& levels,
+                       const PtrStepSzb& octaves,
+                       const PtrStepSzf& stages,
+                       const PtrStepSzb& nodes,
+                       const PtrStepSzf& leaves,
+                       const PtrStepSzi& hogluv,
+                       PtrStepSz<uchar4> objects,
+                       PtrStepSzi counter,
+                       const int downscales);
 }
 }}}
 
@@ -143,16 +160,16 @@ struct cv::gpu::SoftCascade::Filds
     };
 
     bool fill(const FileNode &root, const float mins, const float maxs);
-    void detect(cv::gpu::GpuMat objects, cudaStream_t stream) const
+    void detect(cv::gpu::GpuMat roi, cv::gpu::GpuMat objects, cudaStream_t stream) const
     {
         cudaMemset(detCounter.data, 0, detCounter.step * detCounter.rows * sizeof(int));
-        device::icf::detect(levels, octaves, stages, nodes, leaves, hogluv, objects , detCounter, downscales);
+        device::icf::detect(roi, levels, octaves, stages, nodes, leaves, hogluv, objects , detCounter, downscales);
     }
 
-    void detectAtScale(int scale, cv::gpu::GpuMat objects, cudaStream_t stream) const
+    void detectAtScale(int scale, cv::gpu::GpuMat roi, cv::gpu::GpuMat objects, cudaStream_t stream) const
     {
         cudaMemset(detCounter.data, 0, detCounter.step * detCounter.rows * sizeof(int));
-        device::icf::detectAtScale(scale, levels, octaves, stages, nodes, leaves, hogluv, objects,
+        device::icf::detectAtScale(scale, roi, levels, octaves, stages, nodes, leaves, hogluv, objects,
             detCounter, downscales);
     }
 
@@ -467,6 +484,9 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat&
     // only color images are supperted
     CV_Assert(colored.type() == CV_8UC3);
 
+    // we guess user knows about shrincage
+    CV_Assert((rois.size() == getRoiSize()) && (rois.type() == CV_8UC1));
+
     // only this window size allowed
     CV_Assert(colored.cols == Filds::FRAME_WIDTH && colored.rows == Filds::FRAME_HEIGHT);
 
@@ -551,9 +571,9 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat&
 #endif
 
     if (specificScale == -1)
-        flds.detect(objects, 0);
+        flds.detect(rois,objects, 0);
     else
-        flds.detectAtScale(specificScale, objects, 0);
+        flds.detectAtScale(specificScale, rois, objects, 0);
 
     cv::Mat out(flds.detCounter);
     int ndetections = *(out.data);
diff --git a/modules/gpu/test/test_softcascade.cpp b/modules/gpu/test/test_softcascade.cpp
index 4d1a4b7a6..84602915f 100644
--- a/modules/gpu/test/test_softcascade.cpp
+++ b/modules/gpu/test/test_softcascade.cpp
@@ -63,12 +63,13 @@ TEST(SoftCascade, detect)
     cv::Mat coloredCpu = cv::imread(cvtest::TS::ptr()->get_data_path()
         + "../cv/cascadeandhog/bahnhof/image_00000000_0.png");
     ASSERT_FALSE(coloredCpu.empty());
-    GpuMat colored(coloredCpu), objectBoxes(1, 100000, CV_8UC1), rois;
 
-    // ASSERT_NO_THROW(
-    // {
-        cascade.detectMultiScale(colored, rois, objectBoxes);
-    // });
+    GpuMat colored(coloredCpu), objectBoxes(1, 100000, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1);
+    rois.setTo(0);
+    GpuMat sub(rois, cv::Rect(rois.cols / 4, rois.rows / 4,rois.cols / 2, rois.rows / 2));
+    sub.setTo(cv::Scalar::all(1));
+
+    cascade.detectMultiScale(colored, rois, objectBoxes);
 }
 
 class SCSpecific : public ::testing::TestWithParam<std::tr1::tuple<std::string, int> > {
@@ -93,7 +94,10 @@ TEST_P(SCSpecific, detect)
     cv::Mat coloredCpu = cv::imread(cvtest::TS::ptr()->get_data_path() + path);
 
     ASSERT_FALSE(coloredCpu.empty());
-    GpuMat colored(coloredCpu), objectBoxes(1, 1000, CV_8UC1), rois;
+    GpuMat colored(coloredCpu), objectBoxes(1, 1000, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1);
+    rois.setTo(0);
+    GpuMat sub(rois, cv::Rect(rois.cols / 4, rois.rows / 4,rois.cols / 2, rois.rows / 2));
+    sub.setTo(cv::Scalar::all(1));
 
     int level = GET_PARAM(1);
     cascade.detectMultiScale(colored, rois, objectBoxes, 1, level);

From dd595376ba861ce13f5015abc8297bd92bf5938c Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Wed, 3 Oct 2012 14:26:26 +0400
Subject: [PATCH 33/74] Add performance test for  detection in ROI; refactor
 soft cascade performance tests

---
 modules/gpu/test/test_softcascade.cpp | 248 +++++++++++++++++++-------
 1 file changed, 181 insertions(+), 67 deletions(-)

diff --git a/modules/gpu/test/test_softcascade.cpp b/modules/gpu/test/test_softcascade.cpp
index 84602915f..0b266f827 100644
--- a/modules/gpu/test/test_softcascade.cpp
+++ b/modules/gpu/test/test_softcascade.cpp
@@ -41,20 +41,197 @@
 //M*/
 
 #include <test_precomp.hpp>
+#include <time.h>
 
 #ifdef HAVE_CUDA
-
 using cv::gpu::GpuMat;
 
-TEST(SoftCascade, readCascade)
+// show detection results on input image with cv::imshow
+//#define SHOW_DETECTIONS
+
+#if defined SHOW_DETECTIONS
+# define SHOW(res)           \
+    cv::imshow(#res, result);\
+    cv::waitKey(0);
+#else
+# define SHOW(res)
+#endif
+
+#define GPU_TEST_P(fixture, name, params)                         \
+    class fixture##_##name : public fixture {                     \
+     public:                                                      \
+      fixture##_##name() {}                                       \
+     protected:                                                   \
+      virtual void body();                                        \
+    };                                                            \
+    TEST_P(fixture##_##name, name /*none*/){ body();}             \
+    INSTANTIATE_TEST_CASE_P(/*none*/, fixture##_##name, params);  \
+    void fixture##_##name::body()
+
+
+typedef std::tr1::tuple<std::string, std::string, int> roi_fixture_t;
+
+struct SoftCascadeTest : public ::testing::TestWithParam<roi_fixture_t>
+{
+    typedef cv::gpu::SoftCascade::Detection detection_t;
+    static cv::Rect getFromTable(int idx)
+    {
+        static const cv::Rect rois[] =
+        {
+            cv::Rect( 65,  20,  35, 80),
+            cv::Rect( 95,  35,  45, 40),
+            cv::Rect( 45,  35,  45, 40),
+            cv::Rect( 25,  27,  50, 45),
+            cv::Rect(100,  50,  45, 40),
+
+            cv::Rect( 60,  30,  45, 40),
+            cv::Rect( 40,  55,  50, 40),
+            cv::Rect( 48,  37,  72, 80),
+            cv::Rect( 48,  32,  85, 58),
+            cv::Rect( 48,   0,  32, 27)
+        };
+
+        return rois[idx];
+    }
+
+    static std::string itoa(long i)
+    {
+        static char s[65];
+        sprintf(s, "%ld", i);
+        return std::string(s);
+    }
+
+    static std::string getImageName(int level)
+    {
+        time_t rawtime;
+        struct tm * timeinfo;
+        char buffer [80];
+
+        time ( &rawtime );
+        timeinfo = localtime ( &rawtime );
+
+        strftime (buffer,80,"%Y-%m-%d--%H-%M-%S",timeinfo);
+        return "gpu_rec_level_" + itoa(level)+ "_" + std::string(buffer) + ".png";
+    }
+
+    static void print(std::ostream &out, const detection_t& d)
+    {
+        out << "\x1b[32m[ detection]\x1b[0m ("
+            << std::setw(4)  << d.x
+            << " "
+            << std::setw(4)  << d.y
+            << ") ("
+            << std::setw(4)  << d.w
+            << " "
+            << std::setw(4)  << d.h
+            << ") "
+            << std::setw(12) << d.confidence
+            <<  std::endl;
+    }
+
+    static void printTotal(std::ostream &out, int detbytes)
+    {
+        out << "\x1b[32m[          ]\x1b[0m Total detections " << (detbytes / sizeof(detection_t)) << std::endl;
+    }
+
+    static void writeResult(const cv::Mat& result, const int level)
+    {
+        std::string path = cv::tempfile(getImageName(level).c_str());
+        cv::imwrite(path, result);
+        std::cout << "\x1b[32m" << "[          ]" << std::endl << "[ stored in]"<< "\x1b[0m" << path << std::endl;
+    }
+};
+
+GPU_TEST_P(SoftCascadeTest, detectInROI,
+    testing::Combine(
+        testing::Values(std::string("../cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")),
+        testing::Values(std::string("../cv/cascadeandhog/bahnhof/image_00000000_0.png")),
+        testing::Range(0, 5)))
+{
+    cv::Mat coloredCpu = cv::imread(cvtest::TS::ptr()->get_data_path() + GET_PARAM(1));
+    ASSERT_FALSE(coloredCpu.empty());
+
+    cv::gpu::SoftCascade cascade;
+    ASSERT_TRUE(cascade.load(cvtest::TS::ptr()->get_data_path() + GET_PARAM(0)));
+
+    GpuMat colored(coloredCpu), objectBoxes(1, 16384, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1);
+    rois.setTo(0);
+
+    int nroi = GET_PARAM(2);
+    cv::RNG rng;
+    for (int i = 0; i < nroi; ++i)
+    {
+        cv::Rect r = getFromTable(rng(10));
+        GpuMat sub(rois, r);
+        sub.setTo(1);
+    }
+
+    cascade.detectMultiScale(colored, rois, objectBoxes);
+
+    ///
+    cv::Mat dt(objectBoxes);
+    typedef cv::gpu::SoftCascade::Detection detection_t;
+
+    detection_t* dts = (detection_t*)dt.data;
+    cv::Mat result(coloredCpu);
+
+    printTotal(std::cout, dt.cols);
+    for (int i = 0; i  < (int)(dt.cols / sizeof(detection_t)); ++i)
+    {
+        detection_t d = dts[i];
+        print(std::cout, d);
+        cv::rectangle(result, cv::Rect(d.x, d.y, d.w, d.h), cv::Scalar(255, 0, 0, 255), 1);
+    }
+
+    SHOW(result);
+}
+
+GPU_TEST_P(SoftCascadeTest, detectInLevel,
+        testing::Combine(
+        testing::Values(std::string("../cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")),
+        testing::Values(std::string("../cv/cascadeandhog/bahnhof/image_00000000_0.png")),
+        testing::Range(0, 47)
+        ))
+{
+    std::string xml =  cvtest::TS::ptr()->get_data_path() + GET_PARAM(0);
+    cv::gpu::SoftCascade cascade;
+    ASSERT_TRUE(cascade.load(xml));
+
+    cv::Mat coloredCpu = cv::imread(cvtest::TS::ptr()->get_data_path() + GET_PARAM(1));
+    ASSERT_FALSE(coloredCpu.empty());
+
+    typedef cv::gpu::SoftCascade::Detection detection_t;
+    GpuMat colored(coloredCpu), objectBoxes(1, 100 * sizeof(detection_t), CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1);
+    rois.setTo(1);
+
+    int level = GET_PARAM(2);
+    cascade.detectMultiScale(colored, rois, objectBoxes, 1, level);
+
+    cv::Mat dt(objectBoxes);
+
+    detection_t* dts = (detection_t*)dt.data;
+    cv::Mat result(coloredCpu);
+
+    printTotal(std::cout, dt.cols);
+    for (int i = 0; i  < (int)(dt.cols / sizeof(detection_t)); ++i)
+    {
+        detection_t d = dts[i];
+        print(std::cout, d);
+        cv::rectangle(result, cv::Rect(d.x, d.y, d.w, d.h), cv::Scalar(255, 0, 0, 255), 1);
+    }
+
+    writeResult(result, level);
+    SHOW(result);
+}
+
+TEST(SoftCascadeTest, readCascade)
 {
     std::string xml = cvtest::TS::ptr()->get_data_path() + "../cv/cascadeandhog/icf-template.xml";
     cv::gpu::SoftCascade cascade;
     ASSERT_TRUE(cascade.load(xml));
-
 }
 
-TEST(SoftCascade, detect)
+TEST(SoftCascadeTest, detect)
 {
     std::string xml =  cvtest::TS::ptr()->get_data_path() + "../cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml";
     cv::gpu::SoftCascade cascade;
@@ -71,67 +248,4 @@ TEST(SoftCascade, detect)
 
     cascade.detectMultiScale(colored, rois, objectBoxes);
 }
-
-class SCSpecific : public ::testing::TestWithParam<std::tr1::tuple<std::string, int> > {
-};
-
-namespace {
-std::string itoa(long i)
-{
-    static char s[65];
-    sprintf(s, "%ld", i);
-    return std::string(s);
-}
-}
-
-TEST_P(SCSpecific, detect)
-{
-    std::string xml =  cvtest::TS::ptr()->get_data_path() + "../cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml";
-    cv::gpu::SoftCascade cascade;
-    ASSERT_TRUE(cascade.load(xml));
-
-    std::string path = GET_PARAM(0);
-    cv::Mat coloredCpu = cv::imread(cvtest::TS::ptr()->get_data_path() + path);
-
-    ASSERT_FALSE(coloredCpu.empty());
-    GpuMat colored(coloredCpu), objectBoxes(1, 1000, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1);
-    rois.setTo(0);
-    GpuMat sub(rois, cv::Rect(rois.cols / 4, rois.rows / 4,rois.cols / 2, rois.rows / 2));
-    sub.setTo(cv::Scalar::all(1));
-
-    int level = GET_PARAM(1);
-    cascade.detectMultiScale(colored, rois, objectBoxes, 1, level);
-
-    cv::Mat dt(objectBoxes);
-    typedef cv::gpu::SoftCascade::Detection detection_t;
-
-    detection_t* dts = (detection_t*)dt.data;
-    cv::Mat result(coloredCpu);
-
-
-    std::cout << "Total detections " << (dt.cols / sizeof(detection_t)) << std::endl;
-    for(int i = 0; i  < (int)(dt.cols / sizeof(detection_t)); ++i)
-    {
-        detection_t d = dts[i];
-        std::cout << "detection: [" << std::setw(4) << d.x << " " << std::setw(4) << d.y
-                  << "] [" << std::setw(4) << d.w << " " << std::setw(4) << d.h << "] "
-                  << std::setw(12)  << d.confidence << std::endl;
-
-        cv::rectangle(result, cv::Rect(d.x, d.y, d.w, d.h), cv::Scalar(255, 0, 0, 255), 1);
-    }
-
-    std::cout << "Result stored in " << "/home/kellan/gpu_res_1_oct_" + itoa(level) << "_"
-    + itoa((dt.cols / sizeof(detection_t))) + ".png" << std::endl;
-    cv::imwrite("/home/kellan/gpu_res_1_oct_" + itoa(level) + "_" + itoa((dt.cols / sizeof(detection_t))) + ".png",
-        result);
-    cv::imshow("res", result);
-    cv::waitKey(0);
-}
-
-INSTANTIATE_TEST_CASE_P(inLevel, SCSpecific,
-    testing::Combine(
-        testing::Values(std::string("../cv/cascadeandhog/bahnhof/image_00000000_0.png")),
-        testing::Range(0, 47)
-        ));
-
 #endif
\ No newline at end of file

From 838842cc96e1f09cbb97c189a9653c46657ac107 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Wed, 3 Oct 2012 16:36:00 +0400
Subject: [PATCH 34/74] Add performance detection test in ROI; refactored soft
 cascade performance tests

---
 modules/gpu/perf/perf_objdetect.cpp | 187 +++++++++++++++++++++++++++-
 1 file changed, 185 insertions(+), 2 deletions(-)

diff --git a/modules/gpu/perf/perf_objdetect.cpp b/modules/gpu/perf/perf_objdetect.cpp
index a86337112..2224194bb 100644
--- a/modules/gpu/perf/perf_objdetect.cpp
+++ b/modules/gpu/perf/perf_objdetect.cpp
@@ -89,11 +89,90 @@ PERF_TEST_P(HOG, CalTech, Values<string>("gpu/caltech/image_00000009_0.png", "gp
     SANITY_CHECK(found_locations);
 }
 
+//================================================= ICF SoftCascade =================================================//
+
 typedef pair<string, string> pair_string;
 DEF_PARAM_TEST_1(SoftCascade, pair_string);
 
-PERF_TEST_P(SoftCascade, detect, Values<pair_string>(make_pair("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml",
-                                                              "cv/cascadeandhog/bahnhof/image_00000000_0.png")))
+
+// struct SoftCascadeTest : public perf::TestBaseWithParam<roi_fixture_t>
+// {
+//     typedef cv::gpu::SoftCascade::Detection detection_t;
+//     static cv::Rect getFromTable(int idx)
+//     {
+//         static const cv::Rect rois[] =
+//         {
+//             cv::Rect( 65,  20,  35, 80),
+//             cv::Rect( 95,  35,  45, 40),
+//             cv::Rect( 45,  35,  45, 40),
+//             cv::Rect( 25,  27,  50, 45),
+//             cv::Rect(100,  50,  45, 40),
+
+//             cv::Rect( 60,  30,  45, 40),
+//             cv::Rect( 40,  55,  50, 40),
+//             cv::Rect( 48,  37,  72, 80),
+//             cv::Rect( 48,  32,  85, 58),
+//             cv::Rect( 48,   0,  32, 27)
+//         };
+
+//         return rois[idx];
+//     }
+
+//     static std::string itoa(long i)
+//     {
+//         static char s[65];
+//         sprintf(s, "%ld", i);
+//         return std::string(s);
+//     }
+
+//     static std::string getImageName(int level)
+//     {
+//         time_t rawtime;
+//         struct tm * timeinfo;
+//         char buffer [80];
+
+//         time ( &rawtime );
+//         timeinfo = localtime ( &rawtime );
+
+//         strftime (buffer,80,"%Y-%m-%d--%H-%M-%S",timeinfo);
+//         return "gpu_rec_level_" + itoa(level)+ "_" + std::string(buffer) + ".png";
+//     }
+
+//     static void print(std::ostream &out, const detection_t& d)
+//     {
+//         out << "\x1b[32m[ detection]\x1b[0m ("
+//             << std::setw(4)  << d.x
+//             << " "
+//             << std::setw(4)  << d.y
+//             << ") ("
+//             << std::setw(4)  << d.w
+//             << " "
+//             << std::setw(4)  << d.h
+//             << ") "
+//             << std::setw(12) << d.confidence
+//             <<  std::endl;
+//     }
+
+//     static void printTotal(std::ostream &out, int detbytes)
+//     {
+//         out << "\x1b[32m[          ]\x1b[0m Total detections " << (detbytes / sizeof(detection_t)) << std::endl;
+//     }
+
+//     static void writeResult(const cv::Mat& result, const int level)
+//     {
+//         std::string path = cv::tempfile(getImageName(level).c_str());
+//         cv::imwrite(path, result);
+//         std::cout << "\x1b[32m" << "[          ]" << std::endl << "[ stored in]"<< "\x1b[0m" << path << std::endl;
+//     }
+// };
+
+typedef std::tr1::tuple<std::string, std::string> fixture_t;
+typedef perf::TestBaseWithParam<fixture_t> SoftCascadeTest;
+
+PERF_TEST_P(SoftCascadeTest, detect,
+    testing::Combine(
+        testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")),
+        testing::Values(std::string("cv/cascadeandhog/bahnhof/image_00000000_0.png"))))
 {
     if (runOnGpu)
     {
@@ -133,6 +212,110 @@ PERF_TEST_P(SoftCascade, detect, Values<pair_string>(make_pair("cv/cascadeandhog
     }
 }
 
+static cv::Rect getFromTable(int idx)
+{
+    static const cv::Rect rois[] =
+    {
+        cv::Rect( 65,  20,  35, 80),
+        cv::Rect( 95,  35,  45, 40),
+        cv::Rect( 45,  35,  45, 40),
+        cv::Rect( 25,  27,  50, 45),
+        cv::Rect(100,  50,  45, 40),
+
+        cv::Rect( 60,  30,  45, 40),
+        cv::Rect( 40,  55,  50, 40),
+        cv::Rect( 48,  37,  72, 80),
+        cv::Rect( 48,  32,  85, 58),
+        cv::Rect( 48,   0,  32, 27)
+    };
+
+    return rois[idx];
+}
+
+typedef std::tr1::tuple<std::string, std::string, int> roi_fixture_t;
+typedef perf::TestBaseWithParam<roi_fixture_t> SoftCascadeTestRoi;
+
+PERF_TEST_P(SoftCascadeTestRoi, detectInRoi,
+    testing::Combine(
+        testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")),
+        testing::Values(std::string("cv/cascadeandhog/bahnhof/image_00000000_0.png")),
+        testing::Range(0, 5)))
+{
+    if (runOnGpu)
+    {
+        cv::Mat cpu = readImage (GET_PARAM(1));
+        ASSERT_FALSE(cpu.empty());
+        cv::gpu::GpuMat colored(cpu);
+
+        cv::gpu::SoftCascade cascade;
+        ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath(GET_PARAM(0))));
+
+        cv::gpu::GpuMat objectBoxes(1, 16384 * 20, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1);
+        rois.setTo(0);
+
+        int nroi = GET_PARAM(2);
+        cv::RNG rng;
+        for (int i = 0; i < nroi; ++i)
+        {
+            cv::Rect r = getFromTable(rng(10));
+            cv::gpu::GpuMat sub(rois, r);
+            sub.setTo(1);
+        }
+
+        cv::gpu::GpuMat curr = objectBoxes;
+        cascade.detectMultiScale(colored, rois, curr);
+
+        TEST_CYCLE()
+        {
+            curr = objectBoxes;
+            cascade.detectMultiScale(colored, rois, curr);
+        }
+    }
+    else
+    {
+        FAIL();
+    }
+}
+
+PERF_TEST_P(SoftCascadeTestRoi, detectEachRoi,
+    testing::Combine(
+        testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")),
+        testing::Values(std::string("cv/cascadeandhog/bahnhof/image_00000000_0.png")),
+        testing::Range(0, 10)))
+{
+    if (runOnGpu)
+    {
+        cv::Mat cpu = readImage (GET_PARAM(1));
+        ASSERT_FALSE(cpu.empty());
+        cv::gpu::GpuMat colored(cpu);
+
+        cv::gpu::SoftCascade cascade;
+        ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath(GET_PARAM(0))));
+
+        cv::gpu::GpuMat objectBoxes(1, 16384 * 20, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1);
+        rois.setTo(0);
+
+        int idx = GET_PARAM(2);
+        cv::Rect r = getFromTable(idx);
+        cv::gpu::GpuMat sub(rois, r);
+        sub.setTo(1);
+
+        cv::gpu::GpuMat curr = objectBoxes;
+        cascade.detectMultiScale(colored, rois, curr);
+
+        TEST_CYCLE()
+        {
+            curr = objectBoxes;
+            cascade.detectMultiScale(colored, rois, curr);
+        }
+    }
+    else
+    {
+        FAIL();
+    }
+}
+
+
 ///////////////////////////////////////////////////////////////
 // HaarClassifier
 

From fdef0adf95052f79a600f77cfb54e41383fa2e58 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Wed, 3 Oct 2012 16:39:37 +0400
Subject: [PATCH 35/74] Corrects objects matrix in case of the absence of
 objects

---
 modules/gpu/src/softcascade.cpp | 27 +++++++++++++--------------
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index 9ea365c5e..8b73ae639 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -422,19 +422,15 @@ inline void cv::gpu::SoftCascade::Filds::calcLevels(const std::vector<device::ic
         if (::fabs(scale - maxScale) < FLT_EPSILON) break;
         scale = ::std::min(maxScale, ::expf(::log(scale) + logFactor));
 
-        // printf("level: %d (%f %f) [%f %f] (%d %d) (%d %d)\n", level.octave, level.relScale, level.shrScale,
-        //     level.scaling[0], level.scaling[1], level.workRect.x, level.workRect.y, level.objSize.x,
-        //level.objSize.y);
-
-        std::cout << "level " << sc
-                  << " octeve "
-                  << vlevels[sc].octave
-                  << " relScale "
-                  << vlevels[sc].relScale
-                  << " " << vlevels[sc].shrScale
-                  << " [" << (int)vlevels[sc].objSize.x
-                  << " " <<  (int)vlevels[sc].objSize.y << "] ["
-        <<  (int)vlevels[sc].workRect.x << " " <<  (int)vlevels[sc].workRect.y << "]" << std::endl;
+        // std::cout << "level " << sc
+        //           << " octeve "
+        //           << vlevels[sc].octave
+        //           << " relScale "
+        //           << vlevels[sc].relScale
+        //           << " " << vlevels[sc].shrScale
+        //           << " [" << (int)vlevels[sc].objSize.x
+        //           << " " <<  (int)vlevels[sc].objSize.y << "] ["
+        // <<  (int)vlevels[sc].workRect.x << " " <<  (int)vlevels[sc].workRect.y << "]" << std::endl;
     }
 
     levels.upload(cv::Mat(1, vlevels.size() * sizeof(Level), CV_8UC1, (uchar*)&(vlevels[0]) ));
@@ -578,7 +574,10 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat&
     cv::Mat out(flds.detCounter);
     int ndetections = *(out.data);
 
-    objects = GpuMat(objects, cv::Rect(0, 0, ndetections * sizeof(Detection), 1));
+    if (! ndetections)
+        objects = GpuMat();
+    else
+        objects = GpuMat(objects, cv::Rect(0, 0, ndetections * sizeof(Detection), 1));
 }
 
 void cv::gpu::SoftCascade::detectMultiScale(const GpuMat&, const GpuMat&, GpuMat&, int, GpuMat&, Stream) const

From 30bce16ad6a79e1b2e1f9f78e524a9faddb63b1f Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Mon, 8 Oct 2012 15:37:28 +0400
Subject: [PATCH 36/74] optimize roi loads

only one thread load roi for all block
---
 modules/gpu/perf/perf_objdetect.cpp   | 42 ++++++++++--------
 modules/gpu/src/cuda/isf-sc.cu        | 62 +++++++++------------------
 modules/gpu/src/softcascade.cpp       |  2 +-
 modules/gpu/test/test_softcascade.cpp | 22 +++++++---
 4 files changed, 62 insertions(+), 66 deletions(-)

diff --git a/modules/gpu/perf/perf_objdetect.cpp b/modules/gpu/perf/perf_objdetect.cpp
index 2224194bb..e6efcc2d6 100644
--- a/modules/gpu/perf/perf_objdetect.cpp
+++ b/modules/gpu/perf/perf_objdetect.cpp
@@ -176,33 +176,35 @@ PERF_TEST_P(SoftCascadeTest, detect,
 {
     if (runOnGpu)
     {
-        cv::Mat cpu = readImage (GetParam().second);
+        cv::Mat cpu = readImage (GET_PARAM(1));
         ASSERT_FALSE(cpu.empty());
         cv::gpu::GpuMat colored(cpu);
 
         cv::gpu::SoftCascade cascade;
-        ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath(GetParam().first)));
+        ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath(GET_PARAM(0))));
 
-        cv::gpu::GpuMat objectBoxes(1, 16384, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1);
-
-        rois.setTo(0);
-        cv::gpu::GpuMat sub(rois, cv::Rect(rois.cols / 4, rois.rows / 4,rois.cols / 2, rois.rows / 2));
-        sub.setTo(cv::Scalar::all(1));
-        cascade.detectMultiScale(colored, rois, objectBoxes);
+        cv::gpu::GpuMat objectBoxes(1, 16384, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1), trois;
+        rois.setTo(1);
+        cv::gpu::transpose(rois, trois);
+        cascade.detectMultiScale(colored, trois, objectBoxes);
 
         TEST_CYCLE()
         {
-            cascade.detectMultiScale(colored, rois, objectBoxes);
+            cascade.detectMultiScale(colored, trois, objectBoxes);
         }
-    } else
+    }
+    else
     {
-        cv::Mat colored = readImage(GetParam().second);
+        cv::Mat colored = readImage(GET_PARAM(1));
         ASSERT_FALSE(colored.empty());
 
         cv::SoftCascade cascade;
-        ASSERT_TRUE(cascade.load(getDataPath(GetParam().first)));
+        ASSERT_TRUE(cascade.load(getDataPath(GET_PARAM(0))));
 
-        std::vector<cv::Rect> rois, objectBoxes;
+        std::vector<cv::Rect> rois;
+
+        typedef cv::SoftCascade::Detection Detection;
+        std::vector<Detection>objectBoxes;
         cascade.detectMultiScale(colored, rois, objectBoxes);
 
         TEST_CYCLE()
@@ -262,13 +264,16 @@ PERF_TEST_P(SoftCascadeTestRoi, detectInRoi,
             sub.setTo(1);
         }
 
+        cv::gpu::GpuMat trois;
+        cv::gpu::transpose(rois, trois);
+
         cv::gpu::GpuMat curr = objectBoxes;
-        cascade.detectMultiScale(colored, rois, curr);
+        cascade.detectMultiScale(colored, trois, curr);
 
         TEST_CYCLE()
         {
             curr = objectBoxes;
-            cascade.detectMultiScale(colored, rois, curr);
+            cascade.detectMultiScale(colored, trois, curr);
         }
     }
     else
@@ -301,7 +306,10 @@ PERF_TEST_P(SoftCascadeTestRoi, detectEachRoi,
         sub.setTo(1);
 
         cv::gpu::GpuMat curr = objectBoxes;
-        cascade.detectMultiScale(colored, rois, curr);
+        cv::gpu::GpuMat trois;
+        cv::gpu::transpose(rois, trois);
+
+        cascade.detectMultiScale(colored, trois, curr);
 
         TEST_CYCLE()
         {
@@ -372,7 +380,7 @@ PERF_TEST_P(ImageAndCascade, ObjDetect_LBPClassifier,
     cv::Mat img = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
     ASSERT_FALSE(img.empty());
 
-    if (PERF_RUN_GPU())
+    if (runOnGpu)
     {
         cv::gpu::CascadeClassifier_GPU d_cascade;
         ASSERT_TRUE(d_cascade.load(perf::TestBase::getDataPath(GetParam().second)));
diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu
index 4bde7f7ea..8df6907df 100644
--- a/modules/gpu/src/cuda/isf-sc.cu
+++ b/modules/gpu/src/cuda/isf-sc.cu
@@ -86,7 +86,6 @@ namespace icf {
     }
 
     texture<int,  cudaTextureType2D, cudaReadModeElementType> thogluv;
-    texture<char,  cudaTextureType2D, cudaReadModeElementType> troi;
 
     template<bool isUp>
     __device__ __forceinline__ float rescale(const Level& level, Node& node)
@@ -130,11 +129,6 @@ namespace icf {
         float relScale = level.relScale;
         float farea = scaledRect.z * scaledRect.w;
 
-        dprintf("%d: feature %d box %d %d %d %d\n",threadIdx.x, (node.threshold >> 28), scaledRect.x, scaledRect.y,
-            scaledRect.z, scaledRect.w);
-        dprintf("%d: rescale: %f [%f %f] selected %f\n",threadIdx.x, level.relScale, level.scaling[0], level.scaling[1],
-            level.scaling[(node.threshold >> 28) > 6]);
-
         // rescale
         scaledRect.x = __float2int_rn(relScale * scaledRect.x);
         scaledRect.y = __float2int_rn(relScale * scaledRect.y);
@@ -146,15 +140,7 @@ namespace icf {
         const float expected_new_area = farea * relScale * relScale;
         float approx = __fdividef(sarea, expected_new_area);
 
-        dprintf("%d: new rect: %d box %d %d %d %d  rel areas %f %f\n",threadIdx.x, (node.threshold >> 28),
-        scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w, farea * relScale * relScale, sarea);
-
-        float rootThreshold = (node.threshold & 0x0FFFFFFFU) * approx;
-
-        rootThreshold *= level.scaling[(node.threshold >> 28) > 6];
-
-        dprintf("%d: approximation %f %d -> %f %f\n",threadIdx.x, approx, (node.threshold & 0x0FFFFFFFU), rootThreshold,
-            level.scaling[(node.threshold >> 28) > 6]);
+        float rootThreshold = (node.threshold & 0x0FFFFFFFU) * approx * level.scaling[(node.threshold >> 28) > 6];
 
         return rootThreshold;
     }
@@ -162,33 +148,17 @@ namespace icf {
     template<bool isUp>
     __device__ __forceinline__ int get(int x, int y, uchar4 area)
     {
-
-        dprintf("%d: feature box %d %d %d %d\n",threadIdx.x, area.x, area.y, area.z, area.w);
-        dprintf("%d: extract feature for: [%d %d] [%d %d] [%d %d] [%d %d]\n",threadIdx.x,
-            x + area.x, y + area.y,  x + area.z, y + area.y,  x + area.z,y + area.w,
-            x + area.x, y + area.w);
-        dprintf("%d: at point %d %d with offset %d\n", x, y, 0);
-
         int a = tex2D(thogluv, x + area.x, y + area.y);
         int b = tex2D(thogluv, x + area.z, y + area.y);
         int c = tex2D(thogluv, x + area.z, y + area.w);
         int d = tex2D(thogluv, x + area.x, y + area.w);
 
-        dprintf("%d   retruved integral values: %d %d %d %d\n",threadIdx.x, a, b, c, d);
-
         return (a - b + c - d);
     }
 
     template<>
     __device__ __forceinline__ int get<true>(int x, int y, uchar4 area)
     {
-
-        dprintf("%d: feature box %d %d %d %d\n",threadIdx.x, area.x, area.y, area.z, area.w);
-        dprintf("%d: extract feature for: [%d %d] [%d %d] [%d %d] [%d %d]\n",threadIdx.x,
-            x + area.x, y + area.y,  x + area.z, y + area.y,  x + area.z,y + area.w,
-            x + area.x, y + area.w);
-        dprintf("%d: at point %d %d with offset %d\n", x, y, 0);
-
         x += area.x;
         y += area.y;
         int a = tex2D(thogluv, x, y);
@@ -196,11 +166,10 @@ namespace icf {
         int c = tex2D(thogluv, x + area.z, y + area.w);
         int d = tex2D(thogluv, x, y + area.w);
 
-        dprintf("%d   retruved integral values: %d %d %d %d\n",threadIdx.x, a, b, c, d);
-
         return (a - b + c - d);
     }
 
+    texture<float2,  cudaTextureType2D, cudaReadModeElementType> troi;
 #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
     template<bool isUp>
     __global__ void test_kernel_warp(const Level* levels, const Octave* octaves, const float* stages,
@@ -210,12 +179,21 @@ namespace icf {
         const int y = blockIdx.y * blockDim.y + threadIdx.y;
         const int x = blockIdx.x;
 
+        __shared__ volatile char roiCache[8];
+
+        if (!threadIdx.y && !threadIdx.x)
+        {
+            ((float2*)roiCache)[threadIdx.x] = tex2D(troi, blockIdx.y, x);
+        }
+
+        __syncthreads();
+
+        if (!roiCache[threadIdx.y]) return;
+
         Level level = levels[downscales + blockIdx.z];
 
         if(x >= level.workRect.x || y >= level.workRect.y) return;
 
-        if (!tex2D(troi, x, y)) return;
-
         Octave octave = octaves[level.octave];
         int st = octave.index * octave.stages;
         const int stEnd = st + 1024;
@@ -282,9 +260,9 @@ namespace icf {
         // if (blockIdx.z != 31) return;
         if(x >= level.workRect.x || y >= level.workRect.y) return;
 
-        int roi = tex2D(troi, x, y);
-        printf("%d\n", roi);
-        if (!roi) return;
+        // int roi = tex2D(troi, x, y);
+        // printf("%d\n", roi);
+        // if (!roi) return;
 
         Octave octave = octaves[level.octave];
 
@@ -357,8 +335,8 @@ namespace icf {
         cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>();
         cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step));
 
-        cudaChannelFormatDesc desc_roi = cudaCreateChannelDesc<char>();
-        cudaSafeCall( cudaBindTexture2D(0, troi, roi.data, desc_roi, roi.cols, roi.rows, roi.step));
+        cudaChannelFormatDesc desc_roi = cudaCreateChannelDesc<float2>();
+        cudaSafeCall( cudaBindTexture2D(0, troi, roi.data, desc_roi, roi.cols / 8, roi.rows, roi.step));
 
         test_kernel_warp<false><<<grid, block>>>(l, oct, st, nd, lf, det, max_det, ctr, 0);
         cudaSafeCall( cudaGetLastError());
@@ -391,8 +369,8 @@ namespace icf {
         cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>();
         cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step));
 
-        cudaChannelFormatDesc desc_roi = cudaCreateChannelDesc<char>();
-        cudaSafeCall( cudaBindTexture2D(0, troi, roi.data, desc_roi, roi.cols, roi.rows, roi.step));
+        cudaChannelFormatDesc desc_roi = cudaCreateChannelDesc<float2>();
+        cudaSafeCall( cudaBindTexture2D(0, troi, roi.data, desc_roi, roi.cols / 8, roi.rows, roi.step));
 
         if (scale >= downscales)
             test_kernel_warp<true><<<grid, block>>>(l, oct, st, nd, lf, det, max_det, ctr, scale);
diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index 8b73ae639..e7fcfff27 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -481,7 +481,7 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat&
     CV_Assert(colored.type() == CV_8UC3);
 
     // we guess user knows about shrincage
-    CV_Assert((rois.size() == getRoiSize()) && (rois.type() == CV_8UC1));
+    CV_Assert((rois.size().width == getRoiSize().height) && (rois.type() == CV_8UC1));
 
     // only this window size allowed
     CV_Assert(colored.cols == Filds::FRAME_WIDTH && colored.rows == Filds::FRAME_HEIGHT);
diff --git a/modules/gpu/test/test_softcascade.cpp b/modules/gpu/test/test_softcascade.cpp
index 0b266f827..04fa9b181 100644
--- a/modules/gpu/test/test_softcascade.cpp
+++ b/modules/gpu/test/test_softcascade.cpp
@@ -47,7 +47,7 @@
 using cv::gpu::GpuMat;
 
 // show detection results on input image with cv::imshow
-//#define SHOW_DETECTIONS
+#define SHOW_DETECTIONS
 
 #if defined SHOW_DETECTIONS
 # define SHOW(res)           \
@@ -154,26 +154,30 @@ GPU_TEST_P(SoftCascadeTest, detectInROI,
     cv::gpu::SoftCascade cascade;
     ASSERT_TRUE(cascade.load(cvtest::TS::ptr()->get_data_path() + GET_PARAM(0)));
 
-    GpuMat colored(coloredCpu), objectBoxes(1, 16384, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1);
+    GpuMat colored(coloredCpu), objectBoxes(1, 16384, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1), trois;
     rois.setTo(0);
 
     int nroi = GET_PARAM(2);
+    cv::Mat result(coloredCpu);
     cv::RNG rng;
     for (int i = 0; i < nroi; ++i)
     {
         cv::Rect r = getFromTable(rng(10));
         GpuMat sub(rois, r);
         sub.setTo(1);
+        r.x *= 4; r.y *= 4; r.width *= 4; r.height *= 4;
+        cv::rectangle(result, r, cv::Scalar(0, 0, 255, 255), 1);
     }
 
-    cascade.detectMultiScale(colored, rois, objectBoxes);
+    cv::gpu::transpose(rois, trois);
+
+    cascade.detectMultiScale(colored, trois, objectBoxes);
 
     ///
     cv::Mat dt(objectBoxes);
     typedef cv::gpu::SoftCascade::Detection detection_t;
 
     detection_t* dts = (detection_t*)dt.data;
-    cv::Mat result(coloredCpu);
 
     printTotal(std::cout, dt.cols);
     for (int i = 0; i  < (int)(dt.cols / sizeof(detection_t)); ++i)
@@ -204,8 +208,11 @@ GPU_TEST_P(SoftCascadeTest, detectInLevel,
     GpuMat colored(coloredCpu), objectBoxes(1, 100 * sizeof(detection_t), CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1);
     rois.setTo(1);
 
+    cv::gpu::GpuMat trois;
+    cv::gpu::transpose(rois, trois);
+
     int level = GET_PARAM(2);
-    cascade.detectMultiScale(colored, rois, objectBoxes, 1, level);
+    cascade.detectMultiScale(colored, trois, objectBoxes, 1, level);
 
     cv::Mat dt(objectBoxes);
 
@@ -246,6 +253,9 @@ TEST(SoftCascadeTest, detect)
     GpuMat sub(rois, cv::Rect(rois.cols / 4, rois.rows / 4,rois.cols / 2, rois.rows / 2));
     sub.setTo(cv::Scalar::all(1));
 
-    cascade.detectMultiScale(colored, rois, objectBoxes);
+    cv::gpu::GpuMat trois;
+    cv::gpu::transpose(rois, trois);
+
+    cascade.detectMultiScale(colored, trois, objectBoxes);
 }
 #endif
\ No newline at end of file

From 1b9bccb856c7ff2f80c99b6b085db264cffcf79d Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Mon, 8 Oct 2012 20:20:57 +0400
Subject: [PATCH 37/74] move Level into shared memory

---
 modules/gpu/src/cuda/isf-sc.cu | 24 +++++++-----------------
 modules/gpu/src/icf.hpp        |  2 ++
 2 files changed, 9 insertions(+), 17 deletions(-)

diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu
index 8df6907df..f755f8549 100644
--- a/modules/gpu/src/cuda/isf-sc.cu
+++ b/modules/gpu/src/cuda/isf-sc.cu
@@ -94,11 +94,6 @@ namespace icf {
         float relScale = level.relScale;
         float farea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y);
 
-        dprintf("%d: feature %d box %d %d %d %d\n",threadIdx.x, (node.threshold >> 28), scaledRect.x, scaledRect.y,
-            scaledRect.z, scaledRect.w);
-        dprintf("%d: rescale: %f [%f %f] selected %f\n",threadIdx.x, level.relScale, level.scaling[0], level.scaling[1],
-            level.scaling[(node.threshold >> 28) > 6]);
-
         // rescale
         scaledRect.x = __float2int_rn(relScale * scaledRect.x);
         scaledRect.y = __float2int_rn(relScale * scaledRect.y);
@@ -110,14 +105,7 @@ namespace icf {
         const float expected_new_area = farea * relScale * relScale;
         float approx = __fdividef(sarea, expected_new_area);
 
-        dprintf("%d: new rect: %d box %d %d %d %d  rel areas %f %f\n",threadIdx.x, (node.threshold >> 28),
-        scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w, farea * relScale * relScale, sarea);
-
-        float rootThreshold = (node.threshold & 0x0FFFFFFFU) * approx;
-        rootThreshold *= level.scaling[(node.threshold >> 28) > 6];
-
-        dprintf("%d: approximation %f %d -> %f %f\n",threadIdx.x, approx, (node.threshold & 0x0FFFFFFFU), rootThreshold,
-            level.scaling[(node.threshold >> 28) > 6]);
+        float rootThreshold = (node.threshold & 0x0FFFFFFFU) * approx * level.scaling[(node.threshold >> 28) > 6];
 
         return rootThreshold;
     }
@@ -179,18 +167,20 @@ namespace icf {
         const int y = blockIdx.y * blockDim.y + threadIdx.y;
         const int x = blockIdx.x;
 
-        __shared__ volatile char roiCache[8];
+        // load Lavel
+        __shared__ Level level;
 
+        // check POI
+        __shared__ volatile char roiCache[8];
         if (!threadIdx.y && !threadIdx.x)
-        {
             ((float2*)roiCache)[threadIdx.x] = tex2D(troi, blockIdx.y, x);
-        }
 
         __syncthreads();
 
         if (!roiCache[threadIdx.y]) return;
 
-        Level level = levels[downscales + blockIdx.z];
+        if (!threadIdx.x)
+            level = levels[downscales + blockIdx.z];
 
         if(x >= level.workRect.x || y >= level.workRect.y) return;
 
diff --git a/modules/gpu/src/icf.hpp b/modules/gpu/src/icf.hpp
index 35658892f..a103341fb 100644
--- a/modules/gpu/src/icf.hpp
+++ b/modules/gpu/src/icf.hpp
@@ -90,6 +90,8 @@ struct __align__(8) Level //is actually 24 bytes
         objSize.x  = round(oct.size.x * relScale);
         objSize.y  = round(oct.size.y * relScale);
     }
+
+    __device Level(){}
 };
 
 struct __align__(8) Node

From 0424e2c8d2da9aefa9b8b6898216e7361e3141c3 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Thu, 11 Oct 2012 14:19:24 +0400
Subject: [PATCH 38/74] remove debug code

---
 modules/gpu/src/softcascade.cpp | 28 ----------------------------
 1 file changed, 28 deletions(-)

diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index e7fcfff27..b834e0319 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -463,15 +463,6 @@ bool cv::gpu::SoftCascade::load( const string& filename, const float minScale, c
     return true;
 }
 
-#define USE_REFERENCE_VALUES
-namespace {
-    char *itoa(long i, char* s, int /*dummy_radix*/)
-    {
-        sprintf(s, "%ld", i);
-        return s;
-    }
-}
-
 //================================== synchronous version ============================================================//
 
 void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat& rois,
@@ -488,22 +479,6 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat&
 
     Filds& flds = *filds;
 
-#if defined USE_REFERENCE_VALUES
-    cudaMemset(flds.hogluv.data, 0, flds.hogluv.step * flds.hogluv.rows);
-
-    cv::FileStorage imgs("/home/kellan/testInts.xml", cv::FileStorage::READ);
-    char buff[33];
-
-    for(int i = 0; i < Filds::HOG_LUV_BINS; ++i)
-    {
-        cv::Mat channel;
-        imgs[std::string("channel") + itoa(i, buff, 10)] >> channel;
-
-        // std::cout << "channel " << i << std::endl << channel << std::endl;
-        GpuMat gchannel(flds.hogluv, cv::Rect(0, 121 * i, 161, 121));
-        gchannel.upload(channel);
-    }
-#else
     GpuMat& plane = flds.plane;
     GpuMat& shrunk = flds.shrunk;
     cudaMemset(plane.data, 0, plane.step * plane.rows);
@@ -512,8 +487,6 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat&
     int fh = Filds::FRAME_HEIGHT;
 
     GpuMat gray(plane, cv::Rect(0, fh * Filds::HOG_LUV_BINS, fw, fh));
-
-    //cv::gpu::cvtColor(colored, gray, CV_RGB2GRAY);
     cv::gpu::cvtColor(colored, gray, CV_BGR2GRAY);
 
     //create hog
@@ -564,7 +537,6 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat&
         GpuMat sum(flds.hogluv, cv::Rect(0, (fh + 1) * i, fw + 1, fh + 1));
         cv::gpu::integralBuffered(channel, sum, flds.integralBuffer);
     }
-#endif
 
     if (specificScale == -1)
         flds.detect(rois,objects, 0);

From ef431f70b640acb969a90b44ed5ffef5a2774719 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Thu, 11 Oct 2012 16:26:32 +0400
Subject: [PATCH 39/74] fix buggy threshold zeroing if feature has zero area

---
 modules/gpu/src/cuda/isf-sc.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu
index f755f8549..f74673c05 100644
--- a/modules/gpu/src/cuda/isf-sc.cu
+++ b/modules/gpu/src/cuda/isf-sc.cu
@@ -103,7 +103,7 @@ namespace icf {
         float sarea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y);
 
         const float expected_new_area = farea * relScale * relScale;
-        float approx = __fdividef(sarea, expected_new_area);
+        float approx = (sarea == 0)? 1: __fdividef(sarea, expected_new_area);
 
         float rootThreshold = (node.threshold & 0x0FFFFFFFU) * approx * level.scaling[(node.threshold >> 28) > 6];
 
@@ -226,7 +226,7 @@ namespace icf {
             dprintf("%d: impact scaned %f\n" ,threadIdx.x, impact);
 
             confidence += impact;
-            if(__any((confidence <= stages[(st + threadIdx.x)]))) st += stEnd;
+            if(__any((confidence <= stages[(st + threadIdx.x)]))) st += 2048;
         }
 
         if(st == stEnd && !threadIdx.x)

From 312a58fcec5d8478cf95352e990ab24601c0108a Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Thu, 11 Oct 2012 16:27:23 +0400
Subject: [PATCH 40/74] fix performance test

---
 modules/gpu/perf/perf_objdetect.cpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/modules/gpu/perf/perf_objdetect.cpp b/modules/gpu/perf/perf_objdetect.cpp
index e6efcc2d6..ced8ee17d 100644
--- a/modules/gpu/perf/perf_objdetect.cpp
+++ b/modules/gpu/perf/perf_objdetect.cpp
@@ -186,11 +186,14 @@ PERF_TEST_P(SoftCascadeTest, detect,
         cv::gpu::GpuMat objectBoxes(1, 16384, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1), trois;
         rois.setTo(1);
         cv::gpu::transpose(rois, trois);
-        cascade.detectMultiScale(colored, trois, objectBoxes);
+
+        cv::gpu::GpuMat curr = objectBoxes;
+        cascade.detectMultiScale(colored, trois, curr);
 
         TEST_CYCLE()
         {
-            cascade.detectMultiScale(colored, trois, objectBoxes);
+            curr = objectBoxes;
+            cascade.detectMultiScale(colored, trois, curr);
         }
     }
     else

From fa62e2b72f5d1d84b1d056d0e42b80486e210f71 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Thu, 11 Oct 2012 16:47:49 +0400
Subject: [PATCH 41/74] move preprocessing into separate function

---
 modules/gpu/src/softcascade.cpp | 134 +++++++++++++++++---------------
 1 file changed, 72 insertions(+), 62 deletions(-)

diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index b834e0319..8fa82867f 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -93,6 +93,10 @@ namespace icf {
                        PtrStepSzi counter,
                        const int downscales);
 }
+namespace imgproc
+{
+    void meanShiftFiltering_gpu(const PtrStepSzb& src, PtrStepSzb dst, int sp, int sr, int maxIter, float eps, cudaStream_t stream);
+}
 }}}
 
 struct cv::gpu::SoftCascade::Filds
@@ -104,8 +108,8 @@ struct cv::gpu::SoftCascade::Filds
         fplane.create(FRAME_HEIGHT * 6, FRAME_WIDTH, CV_32FC1);
         luv.create(FRAME_HEIGHT, FRAME_WIDTH, CV_8UC3);
         shrunk.create(FRAME_HEIGHT / 4 * HOG_LUV_BINS, FRAME_WIDTH / 4, CV_8UC1);
-        integralBuffer.create(shrunk.rows + 1 * HOG_LUV_BINS, shrunk.cols + 1, CV_32SC1);
-        hogluv.create((FRAME_HEIGHT / 4 + 1) * HOG_LUV_BINS, FRAME_WIDTH / 4 + 1, CV_32SC1);
+        integralBuffer.create(1 , (shrunk.rows + 1) * HOG_LUV_BINS * (shrunk.cols + 1), CV_32SC1);
+        hogluv.create((FRAME_HEIGHT / 4 + 1) * HOG_LUV_BINS, FRAME_WIDTH / 4 + 64, CV_32SC1);
         detCounter.create(1,1, CV_32SC1);
     }
 
@@ -146,6 +150,8 @@ struct cv::gpu::SoftCascade::Filds
 
     std::vector<float> scales;
 
+    static const int shrinkage = 4;
+
     enum { BOOST = 0 };
     enum
     {
@@ -160,19 +166,80 @@ struct cv::gpu::SoftCascade::Filds
     };
 
     bool fill(const FileNode &root, const float mins, const float maxs);
-    void detect(cv::gpu::GpuMat roi, cv::gpu::GpuMat objects, cudaStream_t stream) const
+    void detect(const cv::gpu::GpuMat& roi, cv::gpu::GpuMat& objects, cudaStream_t stream) const
     {
         cudaMemset(detCounter.data, 0, detCounter.step * detCounter.rows * sizeof(int));
         device::icf::detect(roi, levels, octaves, stages, nodes, leaves, hogluv, objects , detCounter, downscales);
     }
 
-    void detectAtScale(int scale, cv::gpu::GpuMat roi, cv::gpu::GpuMat objects, cudaStream_t stream) const
+    void detectAtScale(int scale, const cv::gpu::GpuMat& roi, cv::gpu::GpuMat& objects, cudaStream_t stream) const
     {
         cudaMemset(detCounter.data, 0, detCounter.step * detCounter.rows * sizeof(int));
         device::icf::detectAtScale(scale, roi, levels, octaves, stages, nodes, leaves, hogluv, objects,
             detCounter, downscales);
     }
 
+    void preprocess(const cv::gpu::GpuMat& colored)
+    {
+        cudaMemset(plane.data, 0, plane.step * plane.rows);
+
+        int fw = Filds::FRAME_WIDTH;
+        int fh = Filds::FRAME_HEIGHT;
+
+        GpuMat gray(plane, cv::Rect(0, fh * Filds::HOG_LUV_BINS, fw, fh));
+        cv::gpu::cvtColor(colored, gray, CV_BGR2GRAY);
+
+        //create hog
+        GpuMat dfdx(fplane, cv::Rect(0,  0, fw, fh));
+        GpuMat dfdy(fplane, cv::Rect(0, fh, fw, fh));
+
+        cv::gpu::Sobel(gray, dfdx, CV_32F, 1, 0, 3, 0.125f);
+        cv::gpu::Sobel(gray, dfdy, CV_32F, 0, 1, 3, 0.125f);
+
+        GpuMat mag(fplane, cv::Rect(0, 2 * fh, fw, fh));
+        GpuMat ang(fplane, cv::Rect(0, 3 * fh, fw, fh));
+
+        cv::gpu::cartToPolar(dfdx, dfdy, mag, ang, true);
+
+        // normolize magnitude to uchar interval and angles to 6 bins
+
+        GpuMat nmag(fplane, cv::Rect(0, 4 * fh, fw, fh));
+        GpuMat nang(fplane, cv::Rect(0, 5 * fh, fw, fh));
+
+        cv::gpu::multiply(mag, cv::Scalar::all(1.f / ::log(2)), nmag);
+        cv::gpu::multiply(ang, cv::Scalar::all(1.f / 60.f),     nang);
+
+        //create uchar magnitude
+        GpuMat cmag(plane, cv::Rect(0, fh * Filds::HOG_BINS, fw, fh));
+        nmag.convertTo(cmag, CV_8UC1);
+
+        // create luv
+        cv::gpu::cvtColor(colored, luv, CV_BGR2Luv);
+
+        std::vector<GpuMat> splited;
+        for(int i = 0; i < Filds::LUV_BINS; ++i)
+        {
+            splited.push_back(GpuMat(plane, cv::Rect(0, fh * (7 + i), fw, fh)));
+        }
+
+        cv::gpu::split(luv, splited);
+
+        device::icf::fillBins(plane, nang, fw, fh, Filds::HOG_BINS);
+
+        GpuMat channels(plane, cv::Rect(0, 0, fw, fh * Filds::HOG_LUV_BINS));
+        cv::gpu::resize(channels, shrunk, cv::Size(), 0.25, 0.25, CV_INTER_AREA);
+
+        fw /= shrinkage;
+        fh /= shrinkage;
+
+        for(int i = 0; i < Filds::HOG_LUV_BINS; ++i)
+        {
+            GpuMat channel(shrunk, cv::Rect(0, fh  * i, fw, fh ));
+            GpuMat sum(hogluv, cv::Rect(0, (fh + 1) * i, fw + 1, fh + 1));
+            cv::gpu::integralBuffered(channel, sum, integralBuffer);
+        }
+    }
+
 private:
     void calcLevels(const std::vector<device::icf::Octave>& octs,
                                                     int frameW, int frameH, int nscales);
@@ -479,64 +546,7 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat&
 
     Filds& flds = *filds;
 
-    GpuMat& plane = flds.plane;
-    GpuMat& shrunk = flds.shrunk;
-    cudaMemset(plane.data, 0, plane.step * plane.rows);
-
-    int fw = Filds::FRAME_WIDTH;
-    int fh = Filds::FRAME_HEIGHT;
-
-    GpuMat gray(plane, cv::Rect(0, fh * Filds::HOG_LUV_BINS, fw, fh));
-    cv::gpu::cvtColor(colored, gray, CV_BGR2GRAY);
-
-    //create hog
-    GpuMat dfdx(flds.fplane, cv::Rect(0,  0, fw, fh));
-    GpuMat dfdy(flds.fplane, cv::Rect(0, fh, fw, fh));
-
-    cv::gpu::Sobel(gray, dfdx, CV_32F, 1, 0, 3, 0.125f);
-    cv::gpu::Sobel(gray, dfdy, CV_32F, 0, 1, 3, 0.125f);
-
-    GpuMat mag(flds.fplane, cv::Rect(0, 2 * fh, fw, fh));
-    GpuMat ang(flds.fplane, cv::Rect(0, 3 * fh, fw, fh));
-
-    cv::gpu::cartToPolar(dfdx, dfdy, mag, ang, true);
-
-    // normolize magnitude to uchar interval and angles to 6 bins
-
-    GpuMat nmag(flds.fplane, cv::Rect(0, 4 * fh, fw, fh));
-    GpuMat nang(flds.fplane, cv::Rect(0, 5 * fh, fw, fh));
-
-    cv::gpu::multiply(mag, cv::Scalar::all(1.f / ::log(2)), nmag);
-    cv::gpu::multiply(ang, cv::Scalar::all(1.f / 60.f),     nang);
-
-    //create uchar magnitude
-    GpuMat cmag(plane, cv::Rect(0, fh * Filds::HOG_BINS, fw, fh));
-    nmag.convertTo(cmag, CV_8UC1);
-
-    // create luv
-    cv::gpu::cvtColor(colored, flds.luv, CV_BGR2Luv);
-
-    std::vector<GpuMat> splited;
-    for(int i = 0; i < Filds::LUV_BINS; ++i)
-    {
-        splited.push_back(GpuMat(plane, cv::Rect(0, fh * (7 + i), fw, fh)));
-    }
-
-    cv::gpu::split(flds.luv, splited);
-
-    device::icf::fillBins(plane, nang, fw, fh, Filds::HOG_BINS);
-
-    GpuMat hogluv(plane, cv::Rect(0, 0, fw, fh * Filds::HOG_LUV_BINS));
-    cv::gpu::resize(hogluv, flds.shrunk, cv::Size(), 0.25, 0.25, CV_INTER_AREA);
-
-    fw /= 4;
-    fh /= 4;
-    for(int i = 0; i < Filds::HOG_LUV_BINS; ++i)
-    {
-        GpuMat channel(shrunk, cv::Rect(0, fh  * i, fw, fh ));
-        GpuMat sum(flds.hogluv, cv::Rect(0, (fh + 1) * i, fw + 1, fh + 1));
-        cv::gpu::integralBuffered(channel, sum, flds.integralBuffer);
-    }
+    flds.preprocess(colored);
 
     if (specificScale == -1)
         flds.detect(rois,objects, 0);

From 916ba4c0ea1493c6883e45d990a85421f975a60b Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Thu, 11 Oct 2012 17:05:23 +0400
Subject: [PATCH 42/74] refactor preprocessing

---
 modules/gpu/src/softcascade.cpp | 122 +++++++++++++++++++-------------
 1 file changed, 71 insertions(+), 51 deletions(-)

diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index 8fa82867f..c93949f1c 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -183,61 +183,16 @@ struct cv::gpu::SoftCascade::Filds
     {
         cudaMemset(plane.data, 0, plane.step * plane.rows);
 
-        int fw = Filds::FRAME_WIDTH;
-        int fh = Filds::FRAME_HEIGHT;
+        static const int fw = Filds::FRAME_WIDTH;
+        static const int fh = Filds::FRAME_HEIGHT;
 
         GpuMat gray(plane, cv::Rect(0, fh * Filds::HOG_LUV_BINS, fw, fh));
         cv::gpu::cvtColor(colored, gray, CV_BGR2GRAY);
+        createHogBins(gray);
 
-        //create hog
-        GpuMat dfdx(fplane, cv::Rect(0,  0, fw, fh));
-        GpuMat dfdy(fplane, cv::Rect(0, fh, fw, fh));
+        createLuvBins(colored);
 
-        cv::gpu::Sobel(gray, dfdx, CV_32F, 1, 0, 3, 0.125f);
-        cv::gpu::Sobel(gray, dfdy, CV_32F, 0, 1, 3, 0.125f);
-
-        GpuMat mag(fplane, cv::Rect(0, 2 * fh, fw, fh));
-        GpuMat ang(fplane, cv::Rect(0, 3 * fh, fw, fh));
-
-        cv::gpu::cartToPolar(dfdx, dfdy, mag, ang, true);
-
-        // normolize magnitude to uchar interval and angles to 6 bins
-
-        GpuMat nmag(fplane, cv::Rect(0, 4 * fh, fw, fh));
-        GpuMat nang(fplane, cv::Rect(0, 5 * fh, fw, fh));
-
-        cv::gpu::multiply(mag, cv::Scalar::all(1.f / ::log(2)), nmag);
-        cv::gpu::multiply(ang, cv::Scalar::all(1.f / 60.f),     nang);
-
-        //create uchar magnitude
-        GpuMat cmag(plane, cv::Rect(0, fh * Filds::HOG_BINS, fw, fh));
-        nmag.convertTo(cmag, CV_8UC1);
-
-        // create luv
-        cv::gpu::cvtColor(colored, luv, CV_BGR2Luv);
-
-        std::vector<GpuMat> splited;
-        for(int i = 0; i < Filds::LUV_BINS; ++i)
-        {
-            splited.push_back(GpuMat(plane, cv::Rect(0, fh * (7 + i), fw, fh)));
-        }
-
-        cv::gpu::split(luv, splited);
-
-        device::icf::fillBins(plane, nang, fw, fh, Filds::HOG_BINS);
-
-        GpuMat channels(plane, cv::Rect(0, 0, fw, fh * Filds::HOG_LUV_BINS));
-        cv::gpu::resize(channels, shrunk, cv::Size(), 0.25, 0.25, CV_INTER_AREA);
-
-        fw /= shrinkage;
-        fh /= shrinkage;
-
-        for(int i = 0; i < Filds::HOG_LUV_BINS; ++i)
-        {
-            GpuMat channel(shrunk, cv::Rect(0, fh  * i, fw, fh ));
-            GpuMat sum(hogluv, cv::Rect(0, (fh + 1) * i, fw + 1, fh + 1));
-            cv::gpu::integralBuffered(channel, sum, integralBuffer);
-        }
+        integrate();
     }
 
 private:
@@ -263,6 +218,72 @@ private:
         }
         return res;
     }
+
+    void createHogBins(const cv::gpu::GpuMat& gray)
+    {
+        static const int fw = Filds::FRAME_WIDTH;
+        static const int fh = Filds::FRAME_HEIGHT;
+
+        GpuMat dfdx(fplane, cv::Rect(0,  0, fw, fh));
+        GpuMat dfdy(fplane, cv::Rect(0, fh, fw, fh));
+
+        cv::gpu::Sobel(gray, dfdx, CV_32F, 1, 0, 3, 0.125f);
+        cv::gpu::Sobel(gray, dfdy, CV_32F, 0, 1, 3, 0.125f);
+
+        GpuMat mag(fplane, cv::Rect(0, 2 * fh, fw, fh));
+        GpuMat ang(fplane, cv::Rect(0, 3 * fh, fw, fh));
+
+        cv::gpu::cartToPolar(dfdx, dfdy, mag, ang, true);
+
+        // normolize magnitude to uchar interval and angles to 6 bins
+
+        GpuMat nmag(fplane, cv::Rect(0, 4 * fh, fw, fh));
+        GpuMat nang(fplane, cv::Rect(0, 5 * fh, fw, fh));
+
+        cv::gpu::multiply(mag, cv::Scalar::all(1.f / ::log(2)), nmag);
+        cv::gpu::multiply(ang, cv::Scalar::all(1.f / 60.f),     nang);
+
+        //create uchar magnitude
+        GpuMat cmag(plane, cv::Rect(0, fh * Filds::HOG_BINS, fw, fh));
+        nmag.convertTo(cmag, CV_8UC1);
+
+        device::icf::fillBins(plane, nang, fw, fh, Filds::HOG_BINS);
+    }
+
+    void createLuvBins(const cv::gpu::GpuMat& colored)
+    {
+        static const int fw = Filds::FRAME_WIDTH;
+        static const int fh = Filds::FRAME_HEIGHT;
+
+        cv::gpu::cvtColor(colored, luv, CV_BGR2Luv);
+
+        std::vector<GpuMat> splited;
+        for(int i = 0; i < Filds::LUV_BINS; ++i)
+        {
+            splited.push_back(GpuMat(plane, cv::Rect(0, fh * (7 + i), fw, fh)));
+        }
+
+        cv::gpu::split(luv, splited);
+    }
+
+    void integrate()
+    {
+        int fw = Filds::FRAME_WIDTH;
+        int fh = Filds::FRAME_HEIGHT;
+
+        GpuMat channels(plane, cv::Rect(0, 0, fw, fh * Filds::HOG_LUV_BINS));
+        cv::gpu::resize(channels, shrunk, cv::Size(), 0.25, 0.25, CV_INTER_AREA);
+
+        fw /= shrinkage;
+        fh /= shrinkage;
+
+        for(int i = 0; i < Filds::HOG_LUV_BINS; ++i)
+        {
+            GpuMat channel(shrunk, cv::Rect(0, fh  * i, fw, fh ));
+            GpuMat sum(hogluv, cv::Rect(0, (fh + 1) * i, fw + 1, fh + 1));
+            cv::gpu::integralBuffered(channel, sum, integralBuffer);
+        }
+    }
 };
 
 bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float mins, const float maxs)
@@ -572,5 +593,4 @@ cv::Size cv::gpu::SoftCascade::getRoiSize() const
     return cv::Size(Filds::FRAME_WIDTH / 4, Filds::FRAME_HEIGHT / 4);
 }
 
-
 #endif
\ No newline at end of file

From 0898c3c651ef6196a2287d61ef9d6644b5653743 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Thu, 11 Oct 2012 18:24:48 +0400
Subject: [PATCH 43/74] kernel policy

---
 modules/gpu/src/cuda/isf-sc.cu  | 62 +++++++++------------------------
 modules/gpu/src/icf.hpp         | 27 ++++++++++++++
 modules/gpu/src/softcascade.cpp | 44 +++++------------------
 3 files changed, 52 insertions(+), 81 deletions(-)

diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu
index f74673c05..74e47ba19 100644
--- a/modules/gpu/src/cuda/isf-sc.cu
+++ b/modules/gpu/src/cuda/isf-sc.cu
@@ -303,21 +303,16 @@ namespace icf {
     }
 #endif
 
-    void detect(const PtrStepSzb& roi, const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages,
-                const PtrStepSzb& nodes,  const PtrStepSzf& leaves,  const PtrStepSzi& hogluv,
-                PtrStepSz<uchar4> objects, PtrStepSzi counter, const int downscales)
+    template<>
+    void CascadeInvoker<CascadePolicy>::operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv,
+        PtrStepSz<uchar4> objects, PtrStepSzi counter, const int downscales, const int scale) const
     {
         int fw = 160;
         int fh = 120;
 
         dim3 block(32, 8);
-        dim3 grid(fw, fh / 8, downscales);
+        dim3 grid(fw, fh / 8, (scale == -1) ? downscales : 1);
 
-        const Level* l = (const Level*)levels.ptr();
-        const Octave* oct = ((const Octave*)octaves.ptr());
-        const float* st = (const float*)stages.ptr();
-        const Node* nd = (const Node*)nodes.ptr();
-        const float* lf = (const float*)leaves.ptr();
         uint* ctr = (uint*)counter.ptr();
         Detection* det = (Detection*)objects.ptr();
         uint max_det = objects.cols / sizeof(Detection);
@@ -328,44 +323,21 @@ namespace icf {
         cudaChannelFormatDesc desc_roi = cudaCreateChannelDesc<float2>();
         cudaSafeCall( cudaBindTexture2D(0, troi, roi.data, desc_roi, roi.cols / 8, roi.rows, roi.step));
 
-        test_kernel_warp<false><<<grid, block>>>(l, oct, st, nd, lf, det, max_det, ctr, 0);
-        cudaSafeCall( cudaGetLastError());
+        if (scale == -1)
+        {
+            test_kernel_warp<false><<<grid, block>>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, 0);
+            cudaSafeCall( cudaGetLastError());
 
-        grid = dim3(fw, fh / 8, 47 - downscales);
-        test_kernel_warp<true><<<grid, block>>>(l, oct, st, nd, lf, det, max_det, ctr, downscales);
-        cudaSafeCall( cudaGetLastError());
-        cudaSafeCall( cudaDeviceSynchronize());
-    }
-
-    void detectAtScale(const int scale, const PtrStepSzb& roi, const PtrStepSzb& levels, const PtrStepSzb& octaves,
-        const PtrStepSzf& stages, const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv,
-        PtrStepSz<uchar4> objects, PtrStepSzi counter, const int downscales)
-    {
-        int fw = 160;
-        int fh = 120;
-
-        dim3 block(32, 8);
-        dim3 grid(fw, fh / 8, 1);
-
-        const Level* l = (const Level*)levels.ptr();
-        const Octave* oct = ((const Octave*)octaves.ptr());
-        const float* st = (const float*)stages.ptr();
-        const Node* nd = (const Node*)nodes.ptr();
-        const float* lf = (const float*)leaves.ptr();
-        uint* ctr = (uint*)counter.ptr();
-        Detection* det = (Detection*)objects.ptr();
-        uint max_det = objects.cols / sizeof(Detection);
-
-        cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>();
-        cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step));
-
-        cudaChannelFormatDesc desc_roi = cudaCreateChannelDesc<float2>();
-        cudaSafeCall( cudaBindTexture2D(0, troi, roi.data, desc_roi, roi.cols / 8, roi.rows, roi.step));
-
-        if (scale >= downscales)
-            test_kernel_warp<true><<<grid, block>>>(l, oct, st, nd, lf, det, max_det, ctr, scale);
+            grid = dim3(fw, fh / 8, 47 - downscales);
+            test_kernel_warp<true><<<grid, block>>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, downscales);
+        }
         else
-            test_kernel_warp<false><<<grid, block>>>(l, oct, st, nd, lf, det, max_det, ctr, scale);
+        {
+            if (scale >= downscales)
+                test_kernel_warp<true><<<grid, block>>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, scale);
+            else
+                test_kernel_warp<false><<<grid, block>>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, scale);
+        }
 
         cudaSafeCall( cudaGetLastError());
         cudaSafeCall( cudaDeviceSynchronize());
diff --git a/modules/gpu/src/icf.hpp b/modules/gpu/src/icf.hpp
index a103341fb..06c81149e 100644
--- a/modules/gpu/src/icf.hpp
+++ b/modules/gpu/src/icf.hpp
@@ -124,6 +124,33 @@ struct __align__(16) Detection
     : x(_x), y(_y), w(_w), h(_h), confidence(c), kind(0) {};
 };
 
+struct CascadePolicy
+{
+    enum {STA_X = 32, STA_Y = 8};
+};
+
+template<typename Policy>
+struct CascadeInvoker
+{
+    CascadeInvoker(): levels(0), octaves(0), stages(0), nodes(0), leaves(0) {}
+    CascadeInvoker(const PtrStepSzb& _levels, const PtrStepSzb& _octaves, const PtrStepSzf& _stages,
+                   const PtrStepSzb& _nodes,  const PtrStepSzf& _leaves)
+    : levels((const Level*)_levels.ptr()), octaves((const Octave*)_octaves.ptr()), stages((const float*)_stages.ptr()),
+       nodes((const Node*)_nodes.ptr()), leaves((const float*)_leaves.ptr())
+    {}
+
+    const Level*  levels;
+    const Octave* octaves;
+
+    const float*  stages;
+
+    const Node*   nodes;
+    const float*  leaves;
+
+    void operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv, PtrStepSz<uchar4> objects,
+        PtrStepSzi counter, const int downscales, const int csale = -1) const;
+};
+
 }
 }}}
 
diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index c93949f1c..f25c5a34d 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -69,29 +69,6 @@ namespace cv { namespace gpu { namespace device {
 namespace icf {
     void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle,
         const int fw, const int fh, const int bins);
-
-    void detect(const PtrStepSzb& rois,
-                const PtrStepSzb& levels,
-                const PtrStepSzb& octaves,
-                const PtrStepSzf& stages,
-                const PtrStepSzb& nodes,
-                const PtrStepSzf& leaves,
-                const PtrStepSzi& hogluv,
-                PtrStepSz<uchar4> objects,
-                PtrStepSzi counter,
-                const int downscales);
-
-    void detectAtScale(const int scale,
-                       const PtrStepSzb& rois,
-                       const PtrStepSzb& levels,
-                       const PtrStepSzb& octaves,
-                       const PtrStepSzf& stages,
-                       const PtrStepSzb& nodes,
-                       const PtrStepSzf& leaves,
-                       const PtrStepSzi& hogluv,
-                       PtrStepSz<uchar4> objects,
-                       PtrStepSzi counter,
-                       const int downscales);
 }
 namespace imgproc
 {
@@ -150,6 +127,8 @@ struct cv::gpu::SoftCascade::Filds
 
     std::vector<float> scales;
 
+    device::icf::CascadeInvoker<device::icf::CascadePolicy> invoker;
+
     static const int shrinkage = 4;
 
     enum { BOOST = 0 };
@@ -166,17 +145,11 @@ struct cv::gpu::SoftCascade::Filds
     };
 
     bool fill(const FileNode &root, const float mins, const float maxs);
-    void detect(const cv::gpu::GpuMat& roi, cv::gpu::GpuMat& objects, cudaStream_t stream) const
+    void detect(int scale, const cv::gpu::GpuMat& roi, cv::gpu::GpuMat& objects, cudaStream_t stream) const
     {
         cudaMemset(detCounter.data, 0, detCounter.step * detCounter.rows * sizeof(int));
-        device::icf::detect(roi, levels, octaves, stages, nodes, leaves, hogluv, objects , detCounter, downscales);
-    }
-
-    void detectAtScale(int scale, const cv::gpu::GpuMat& roi, cv::gpu::GpuMat& objects, cudaStream_t stream) const
-    {
-        cudaMemset(detCounter.data, 0, detCounter.step * detCounter.rows * sizeof(int));
-        device::icf::detectAtScale(scale, roi, levels, octaves, stages, nodes, leaves, hogluv, objects,
-            detCounter, downscales);
+        // device::icf::CascadeInvoker<device::icf::CascadePolicy> invoker(levels, octaves, stages, nodes, leaves);
+        invoker(roi, hogluv, objects, detCounter, downscales, scale);
     }
 
     void preprocess(const cv::gpu::GpuMat& colored)
@@ -439,6 +412,8 @@ bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float mins, c
     calcLevels(voctaves, FRAME_WIDTH, FRAME_HEIGHT, TOTAL_SCALES);
     CV_Assert(!levels.empty());
 
+    invoker = device::icf::CascadeInvoker<device::icf::CascadePolicy>(levels, octaves, stages, nodes, leaves);
+
     return true;
 }
 
@@ -569,10 +544,7 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat&
 
     flds.preprocess(colored);
 
-    if (specificScale == -1)
-        flds.detect(rois,objects, 0);
-    else
-        flds.detectAtScale(specificScale, rois, objects, 0);
+    flds.detect(specificScale, rois, objects, 0);
 
     cv::Mat out(flds.detCounter);
     int ndetections = *(out.data);

From f196e9fda44d71d0be5081dcb0c3d618cd8f06a7 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Thu, 11 Oct 2012 19:11:39 +0400
Subject: [PATCH 44/74] add factory method for Fields structure

---
 modules/gpu/src/softcascade.cpp | 578 ++++++++++++++++----------------
 1 file changed, 281 insertions(+), 297 deletions(-)

diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index f25c5a34d..fc7114b5f 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -78,77 +78,255 @@ namespace imgproc
 
 struct cv::gpu::SoftCascade::Filds
 {
+    struct CascadeIntrinsics
+    {
+        static const float lambda = 1.099f, a = 0.89f;
 
-    Filds()
+        static float getFor(int channel, float scaling)
+        {
+            CV_Assert(channel < 10);
+
+            if (fabs(scaling - 1.f) < FLT_EPSILON)
+                return 1.f;
+
+            // according to R. Benenson, M. Mathias, R. Timofte and L. Van Gool's and Dallal's papers
+            static const float A[2][2] =
+            {   //channel <= 6, otherwise
+                {        0.89f, 1.f}, // down
+                {        1.00f, 1.f}  // up
+            };
+
+            static const float B[2][2] =
+            {   //channel <= 6,  otherwise
+                { 1.099f / ::log(2), 2.f}, // down
+                {             0.f, 2.f}  // up
+            };
+
+            float a = A[(int)(scaling >= 1)][(int)(channel > 6)];
+            float b = B[(int)(scaling >= 1)][(int)(channel > 6)];
+
+            // printf("!!! scaling: %f %f %f -> %f\n", scaling, a, b, a * pow(scaling, b));
+            return a * ::pow(scaling, b);
+        }
+    };
+
+    static Filds* parseCascade(const FileNode &root, const float mins, const float maxs)
+    {
+        static const char *const SC_STAGE_TYPE          = "stageType";
+        static const char *const SC_BOOST               = "BOOST";
+
+        static const char *const SC_FEATURE_TYPE        = "featureType";
+        static const char *const SC_ICF                 = "ICF";
+
+        // only Ada Boost supported
+        std::string stageTypeStr = (string)root[SC_STAGE_TYPE];
+        CV_Assert(stageTypeStr == SC_BOOST);
+
+        // only HOG-like integral channel features cupported
+        string featureTypeStr = (string)root[SC_FEATURE_TYPE];
+        CV_Assert(featureTypeStr == SC_ICF);
+
+        static const char *const SC_ORIG_W              = "width";
+        static const char *const SC_ORIG_H              = "height";
+
+        int origWidth = (int)root[SC_ORIG_W];
+        CV_Assert(origWidth  == ORIG_OBJECT_WIDTH);
+
+        int origHeight = (int)root[SC_ORIG_H];
+        CV_Assert(origHeight == ORIG_OBJECT_HEIGHT);
+
+        static const char *const SC_OCTAVES             = "octaves";
+        static const char *const SC_STAGES              = "stages";
+        static const char *const SC_FEATURES            = "features";
+
+        static const char *const SC_WEEK                = "weakClassifiers";
+        static const char *const SC_INTERNAL            = "internalNodes";
+        static const char *const SC_LEAF                = "leafValues";
+
+        static const char *const SC_OCT_SCALE           = "scale";
+        static const char *const SC_OCT_STAGES          = "stageNum";
+        static const char *const SC_OCT_SHRINKAGE       = "shrinkingFactor";
+
+        static const char *const SC_STAGE_THRESHOLD     = "stageThreshold";
+
+        static const char * const SC_F_CHANNEL          = "channel";
+        static const char * const SC_F_RECT             = "rect";
+
+
+        FileNode fn = root[SC_OCTAVES];
+            if (fn.empty()) return false;
+
+        using namespace device::icf;
+
+        std::vector<Octave>  voctaves;
+        std::vector<float>   vstages;
+        std::vector<Node>    vnodes;
+        std::vector<float>   vleaves;
+
+        FileNodeIterator it = fn.begin(), it_end = fn.end();
+        int feature_offset = 0;
+        ushort octIndex = 0;
+        ushort shrinkage = 1;
+
+        for (; it != it_end; ++it)
+        {
+            FileNode fns = *it;
+            float scale = (float)fns[SC_OCT_SCALE];
+
+            bool isUPOctave = scale >= 1;
+
+            ushort nstages = saturate_cast<ushort>((int)fns[SC_OCT_STAGES]);
+            ushort2 size;
+            size.x = cvRound(ORIG_OBJECT_WIDTH * scale);
+            size.y = cvRound(ORIG_OBJECT_HEIGHT * scale);
+            shrinkage = saturate_cast<ushort>((int)fns[SC_OCT_SHRINKAGE]);
+
+            Octave octave(octIndex, nstages, shrinkage, size, scale);
+            CV_Assert(octave.stages > 0);
+            voctaves.push_back(octave);
+
+            FileNode ffs = fns[SC_FEATURES];
+            if (ffs.empty()) return false;
+
+            FileNodeIterator ftrs = ffs.begin();
+
+            fns = fns[SC_STAGES];
+            if (fn.empty()) return false;
+
+            // for each stage (~ decision tree with H = 2)
+            FileNodeIterator st = fns.begin(), st_end = fns.end();
+            for (; st != st_end; ++st )
+            {
+                fns = *st;
+                vstages.push_back((float)fns[SC_STAGE_THRESHOLD]);
+
+                fns = fns[SC_WEEK];
+                FileNodeIterator ftr = fns.begin(), ft_end = fns.end();
+                for (; ftr != ft_end; ++ftr)
+                {
+                    fns = (*ftr)[SC_INTERNAL];
+                    FileNodeIterator inIt = fns.begin(), inIt_end = fns.end();
+                    for (; inIt != inIt_end;)
+                    {
+                        // int feature = (int)(*(inIt +=2)) + feature_offset;
+                        inIt +=3;
+                        // extract feature, Todo:check it
+                        uint th = saturate_cast<uint>((float)(*(inIt++)));
+                        cv::FileNode ftn = (*ftrs)[SC_F_RECT];
+                        cv::FileNodeIterator r_it = ftn.begin();
+                        uchar4 rect;
+                        rect.x = saturate_cast<uchar>((int)*(r_it++));
+                        rect.y = saturate_cast<uchar>((int)*(r_it++));
+                        rect.z = saturate_cast<uchar>((int)*(r_it++));
+                        rect.w = saturate_cast<uchar>((int)*(r_it++));
+
+                        if (isUPOctave)
+                        {
+                            rect.z -= rect.x;
+                            rect.w -= rect.y;
+                        }
+
+                        uint channel = saturate_cast<uint>((int)(*ftrs)[SC_F_CHANNEL]);
+                        vnodes.push_back(Node(rect, channel, th));
+                        ++ftrs;
+                    }
+
+                    fns = (*ftr)[SC_LEAF];
+                    inIt = fns.begin(), inIt_end = fns.end();
+                    for (; inIt != inIt_end; ++inIt)
+                        vleaves.push_back((float)(*inIt));
+                }
+            }
+
+            feature_offset += octave.stages * 3;
+            ++octIndex;
+        }
+
+        cv::Mat hoctaves(1, voctaves.size() * sizeof(Octave), CV_8UC1, (uchar*)&(voctaves[0]));
+        CV_Assert(!hoctaves.empty());
+
+        cv::Mat hstages(cv::Mat(vstages).reshape(1,1));
+        CV_Assert(!hstages.empty());
+
+        cv::Mat hnodes(1, vnodes.size() * sizeof(Node), CV_8UC1, (uchar*)&(vnodes[0]) );
+        CV_Assert(!hnodes.empty());
+
+        cv::Mat hleaves(cv::Mat(vleaves).reshape(1,1));
+        CV_Assert(!hleaves.empty());
+
+        std::vector<Level> vlevels;
+        float logFactor = (::log(maxs) - ::log(mins)) / (TOTAL_SCALES -1);
+
+        float scale = mins;
+        int downscales = 0;
+        for (int sc = 0; sc < TOTAL_SCALES; ++sc)
+        {
+            int width  = ::std::max(0.0f, FRAME_WIDTH - (origWidth  * scale));
+            int height = ::std::max(0.0f, FRAME_HEIGHT - (origHeight * scale));
+
+            float logScale = ::log(scale);
+            int fit = fitOctave(voctaves, logScale);
+
+            Level level(fit, voctaves[fit], scale, width, height);
+            level.scaling[0] = CascadeIntrinsics::getFor(0, level.relScale);
+            level.scaling[1] = CascadeIntrinsics::getFor(9, level.relScale);
+
+            if (!width || !height)
+                break;
+            else
+            {
+                vlevels.push_back(level);
+                if (voctaves[fit].scale < 1) ++downscales;
+            }
+
+            if (::fabs(scale - maxs) < FLT_EPSILON) break;
+            scale = ::std::min(maxs, ::expf(::log(scale) + logFactor));
+
+            // std::cout << "level " << sc
+            //           << " octeve "
+            //           << vlevels[sc].octave
+            //           << " relScale "
+            //           << vlevels[sc].relScale
+            //           << " " << vlevels[sc].shrScale
+            //           << " [" << (int)vlevels[sc].objSize.x
+            //           << " " <<  (int)vlevels[sc].objSize.y << "] ["
+            // <<  (int)vlevels[sc].workRect.x << " " <<  (int)vlevels[sc].workRect.y << "]" << std::endl;
+        }
+
+        cv::Mat hlevels(1, vlevels.size() * sizeof(Level), CV_8UC1, (uchar*)&(vlevels[0]) );
+        CV_Assert(!hlevels.empty());
+
+        Filds* filds = new Filds(mins, maxs, origWidth, origHeight, shrinkage, downscales,
+            hoctaves, hstages, hnodes, hleaves, hlevels);
+
+        return filds;
+    }
+
+    Filds( const float mins, const float maxs, const int ow, const int oh, const int shr, const int ds,
+        cv::Mat hoctaves, cv::Mat hstages, cv::Mat hnodes, cv::Mat hleaves, cv::Mat hlevels)
+    : minScale(mins), maxScale(maxs), origObjWidth(ow), origObjHeight(oh), shrinkage(shr), downscales(ds)
     {
         plane.create(FRAME_HEIGHT * (HOG_LUV_BINS + 1), FRAME_WIDTH, CV_8UC1);
         fplane.create(FRAME_HEIGHT * 6, FRAME_WIDTH, CV_32FC1);
         luv.create(FRAME_HEIGHT, FRAME_WIDTH, CV_8UC3);
-        shrunk.create(FRAME_HEIGHT / 4 * HOG_LUV_BINS, FRAME_WIDTH / 4, CV_8UC1);
+        shrunk.create(FRAME_HEIGHT / shr * HOG_LUV_BINS, FRAME_WIDTH / shr, CV_8UC1);
         integralBuffer.create(1 , (shrunk.rows + 1) * HOG_LUV_BINS * (shrunk.cols + 1), CV_32SC1);
-        hogluv.create((FRAME_HEIGHT / 4 + 1) * HOG_LUV_BINS, FRAME_WIDTH / 4 + 64, CV_32SC1);
+        hogluv.create((FRAME_HEIGHT / shr + 1) * HOG_LUV_BINS, FRAME_WIDTH / shr + 64, CV_32SC1);
         detCounter.create(1,1, CV_32SC1);
+
+        octaves.upload(hoctaves);
+        stages.upload(hstages);
+        nodes.upload(hnodes);
+        leaves.upload(hleaves);
+        levels.upload(hlevels);
+
+        invoker = device::icf::CascadeInvoker<device::icf::CascadePolicy>(levels, octaves, stages, nodes, leaves);
+
     }
 
-    // scales range
-    float minScale;
-    float maxScale;
-
-    int origObjWidth;
-    int origObjHeight;
-
-    int downscales;
-
-    GpuMat octaves;
-    GpuMat stages;
-    GpuMat nodes;
-    GpuMat leaves;
-    GpuMat levels;
-
-    GpuMat detCounter;
-
-    // preallocated buffer 640x480x10 for hogluv + 640x480 got gray
-    GpuMat plane;
-
-    // preallocated buffer for floating point operations
-    GpuMat fplane;
-
-    // temporial mat for cvtColor
-    GpuMat luv;
-
-    // 160x120x10
-    GpuMat shrunk;
-
-    // temporial mat for integrall
-    GpuMat integralBuffer;
-
-    // 161x121x10
-    GpuMat hogluv;
-
-    std::vector<float> scales;
-
-    device::icf::CascadeInvoker<device::icf::CascadePolicy> invoker;
-
-    static const int shrinkage = 4;
-
-    enum { BOOST = 0 };
-    enum
-    {
-        FRAME_WIDTH        = 640,
-        FRAME_HEIGHT       = 480,
-        TOTAL_SCALES       = 55,
-        ORIG_OBJECT_WIDTH  = 64,
-        ORIG_OBJECT_HEIGHT = 128,
-        HOG_BINS           = 6,
-        LUV_BINS           = 3,
-        HOG_LUV_BINS       = 10
-    };
-
-    bool fill(const FileNode &root, const float mins, const float maxs);
     void detect(int scale, const cv::gpu::GpuMat& roi, cv::gpu::GpuMat& objects, cudaStream_t stream) const
     {
         cudaMemset(detCounter.data, 0, detCounter.step * detCounter.rows * sizeof(int));
-        // device::icf::CascadeInvoker<device::icf::CascadePolicy> invoker(levels, octaves, stages, nodes, leaves);
         invoker(roi, hogluv, objects, detCounter, downscales, scale);
     }
 
@@ -169,11 +347,9 @@ struct cv::gpu::SoftCascade::Filds
     }
 
 private:
-    void calcLevels(const std::vector<device::icf::Octave>& octs,
-                                                    int frameW, int frameH, int nscales);
 
     typedef std::vector<device::icf::Octave>::const_iterator  octIt_t;
-    int fitOctave(const std::vector<device::icf::Octave>& octs, const float& logFactor) const
+    static int fitOctave(const std::vector<device::icf::Octave>& octs, const float& logFactor)
     {
         float minAbsLog = FLT_MAX;
         int res =  0;
@@ -257,247 +433,61 @@ private:
             cv::gpu::integralBuffered(channel, sum, integralBuffer);
         }
     }
-};
 
-bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float mins, const float maxs)
-{
-    using namespace device::icf;
-    minScale = mins;
-    maxScale = maxs;
+public:
 
-    // cascade properties
-    static const char *const SC_STAGE_TYPE          = "stageType";
-    static const char *const SC_BOOST               = "BOOST";
+    // scales range
+    float minScale;
+    float maxScale;
 
-    static const char *const SC_FEATURE_TYPE        = "featureType";
-    static const char *const SC_ICF                 = "ICF";
+    int origObjWidth;
+    int origObjHeight;
 
-    static const char *const SC_ORIG_W              = "width";
-    static const char *const SC_ORIG_H              = "height";
+    const int shrinkage;
+    int downscales;
 
-    static const char *const SC_OCTAVES             = "octaves";
-    static const char *const SC_STAGES              = "stages";
-    static const char *const SC_FEATURES            = "features";
+    // preallocated buffer 640x480x10 for hogluv + 640x480 got gray
+    GpuMat plane;
 
-    static const char *const SC_WEEK                = "weakClassifiers";
-    static const char *const SC_INTERNAL            = "internalNodes";
-    static const char *const SC_LEAF                = "leafValues";
+    // preallocated buffer for floating point operations
+    GpuMat fplane;
 
-    static const char *const SC_OCT_SCALE           = "scale";
-    static const char *const SC_OCT_STAGES          = "stageNum";
-    static const char *const SC_OCT_SHRINKAGE       = "shrinkingFactor";
+    // temporial mat for cvtColor
+    GpuMat luv;
 
-    static const char *const SC_STAGE_THRESHOLD     = "stageThreshold";
+    // 160x120x10
+    GpuMat shrunk;
 
-    static const char * const SC_F_CHANNEL          = "channel";
-    static const char * const SC_F_RECT             = "rect";
+    // temporial mat for integrall
+    GpuMat integralBuffer;
 
-    // only Ada Boost supported
-    std::string stageTypeStr = (string)root[SC_STAGE_TYPE];
-    CV_Assert(stageTypeStr == SC_BOOST);
+    // 161x121x10
+    GpuMat hogluv;
 
-    // only HOG-like integral channel features cupported
-    string featureTypeStr = (string)root[SC_FEATURE_TYPE];
-    CV_Assert(featureTypeStr == SC_ICF);
+    GpuMat detCounter;
 
-    origObjWidth = (int)root[SC_ORIG_W];
-    CV_Assert(origObjWidth  == ORIG_OBJECT_WIDTH);
+    // Cascade from xml
+    GpuMat octaves;
+    GpuMat stages;
+    GpuMat nodes;
+    GpuMat leaves;
+    GpuMat levels;
 
-    origObjHeight = (int)root[SC_ORIG_H];
-    CV_Assert(origObjHeight == ORIG_OBJECT_HEIGHT);
+    device::icf::CascadeInvoker<device::icf::CascadePolicy> invoker;
 
-    FileNode fn = root[SC_OCTAVES];
-        if (fn.empty()) return false;
-
-    std::vector<Octave>  voctaves;
-    std::vector<float>   vstages;
-    std::vector<Node>    vnodes;
-    std::vector<float>   vleaves;
-    scales.clear();
-
-    FileNodeIterator it = fn.begin(), it_end = fn.end();
-    int feature_offset = 0;
-    ushort octIndex = 0;
-    ushort shrinkage = 1;
-
-    for (; it != it_end; ++it)
+    enum { BOOST = 0 };
+    enum
     {
-        FileNode fns = *it;
-        float scale = (float)fns[SC_OCT_SCALE];
-
-        bool isUPOctave = scale >= 1;
-
-        scales.push_back(scale);
-        ushort nstages = saturate_cast<ushort>((int)fns[SC_OCT_STAGES]);
-        ushort2 size;
-        size.x = cvRound(ORIG_OBJECT_WIDTH * scale);
-        size.y = cvRound(ORIG_OBJECT_HEIGHT * scale);
-        shrinkage = saturate_cast<ushort>((int)fns[SC_OCT_SHRINKAGE]);
-
-        Octave octave(octIndex, nstages, shrinkage, size, scale);
-        CV_Assert(octave.stages > 0);
-        voctaves.push_back(octave);
-
-        FileNode ffs = fns[SC_FEATURES];
-        if (ffs.empty()) return false;
-
-        FileNodeIterator ftrs = ffs.begin();
-
-        fns = fns[SC_STAGES];
-        if (fn.empty()) return false;
-
-        // for each stage (~ decision tree with H = 2)
-        FileNodeIterator st = fns.begin(), st_end = fns.end();
-        for (; st != st_end; ++st )
-        {
-            fns = *st;
-            vstages.push_back((float)fns[SC_STAGE_THRESHOLD]);
-
-            fns = fns[SC_WEEK];
-            FileNodeIterator ftr = fns.begin(), ft_end = fns.end();
-            for (; ftr != ft_end; ++ftr)
-            {
-                fns = (*ftr)[SC_INTERNAL];
-                FileNodeIterator inIt = fns.begin(), inIt_end = fns.end();
-                for (; inIt != inIt_end;)
-                {
-                    // int feature = (int)(*(inIt +=2)) + feature_offset;
-                    inIt +=3;
-                    // extract feature, Todo:check it
-                    uint th = saturate_cast<uint>((float)(*(inIt++)));
-                    cv::FileNode ftn = (*ftrs)[SC_F_RECT];
-                    cv::FileNodeIterator r_it = ftn.begin();
-                    uchar4 rect;
-                    rect.x = saturate_cast<uchar>((int)*(r_it++));
-                    rect.y = saturate_cast<uchar>((int)*(r_it++));
-                    rect.z = saturate_cast<uchar>((int)*(r_it++));
-                    rect.w = saturate_cast<uchar>((int)*(r_it++));
-
-                    if (isUPOctave)
-                    {
-                        rect.z -= rect.x;
-                        rect.w -= rect.y;
-                    }
-
-                    uint channel = saturate_cast<uint>((int)(*ftrs)[SC_F_CHANNEL]);
-                    vnodes.push_back(Node(rect, channel, th));
-                    ++ftrs;
-                }
-
-                fns = (*ftr)[SC_LEAF];
-                inIt = fns.begin(), inIt_end = fns.end();
-                for (; inIt != inIt_end; ++inIt)
-                    vleaves.push_back((float)(*inIt));
-            }
-        }
-
-        feature_offset += octave.stages * 3;
-        ++octIndex;
-    }
-
-    // upload in gpu memory
-    octaves.upload(cv::Mat(1, voctaves.size() * sizeof(Octave), CV_8UC1, (uchar*)&(voctaves[0]) ));
-    CV_Assert(!octaves.empty());
-
-    stages.upload(cv::Mat(vstages).reshape(1,1));
-    CV_Assert(!stages.empty());
-
-    nodes.upload(cv::Mat(1, vnodes.size() * sizeof(Node), CV_8UC1, (uchar*)&(vnodes[0]) ));
-    CV_Assert(!nodes.empty());
-
-    leaves.upload(cv::Mat(vleaves).reshape(1,1));
-    CV_Assert(!leaves.empty());
-
-    // compute levels
-    calcLevels(voctaves, FRAME_WIDTH, FRAME_HEIGHT, TOTAL_SCALES);
-    CV_Assert(!levels.empty());
-
-    invoker = device::icf::CascadeInvoker<device::icf::CascadePolicy>(levels, octaves, stages, nodes, leaves);
-
-    return true;
-}
-
-namespace {
-    struct CascadeIntrinsics
-    {
-        static const float lambda = 1.099f, a = 0.89f;
-
-        static float getFor(int channel, float scaling)
-        {
-            CV_Assert(channel < 10);
-
-            if (fabs(scaling - 1.f) < FLT_EPSILON)
-                return 1.f;
-
-            // according to R. Benenson, M. Mathias, R. Timofte and L. Van Gool's and Dallal's papers
-            static const float A[2][2] =
-            {   //channel <= 6, otherwise
-                {        0.89f, 1.f}, // down
-                {        1.00f, 1.f}  // up
-            };
-
-            static const float B[2][2] =
-            {   //channel <= 6,  otherwise
-                { 1.099f / log(2), 2.f}, // down
-                {             0.f, 2.f}  // up
-            };
-
-            float a = A[(int)(scaling >= 1)][(int)(channel > 6)];
-            float b = B[(int)(scaling >= 1)][(int)(channel > 6)];
-
-            // printf("!!! scaling: %f %f %f -> %f\n", scaling, a, b, a * pow(scaling, b));
-            return a * pow(scaling, b);
-        }
+        FRAME_WIDTH        = 640,
+        FRAME_HEIGHT       = 480,
+        TOTAL_SCALES       = 55,
+        ORIG_OBJECT_WIDTH  = 64,
+        ORIG_OBJECT_HEIGHT = 128,
+        HOG_BINS           = 6,
+        LUV_BINS           = 3,
+        HOG_LUV_BINS       = 10
     };
-}
-
-inline void cv::gpu::SoftCascade::Filds::calcLevels(const std::vector<device::icf::Octave>& octs,
-                                                    int frameW, int frameH, int nscales)
-{
-    CV_Assert(nscales > 1);
-    using device::icf::Level;
-
-    std::vector<Level> vlevels;
-    float logFactor = (::log(maxScale) - ::log(minScale)) / (nscales -1);
-
-    float scale = minScale;
-    downscales = 0;
-    for (int sc = 0; sc < nscales; ++sc)
-    {
-        int width  = ::std::max(0.0f, frameW - (origObjWidth  * scale));
-        int height = ::std::max(0.0f, frameH - (origObjHeight * scale));
-
-        float logScale = ::log(scale);
-        int fit = fitOctave(octs, logScale);
-
-        Level level(fit, octs[fit], scale, width, height);
-        level.scaling[0] = CascadeIntrinsics::getFor(0, level.relScale);
-        level.scaling[1] = CascadeIntrinsics::getFor(9, level.relScale);
-
-        if (!width || !height)
-            break;
-        else
-        {
-            vlevels.push_back(level);
-            if (octs[fit].scale < 1) ++downscales;
-        }
-
-        if (::fabs(scale - maxScale) < FLT_EPSILON) break;
-        scale = ::std::min(maxScale, ::expf(::log(scale) + logFactor));
-
-        // std::cout << "level " << sc
-        //           << " octeve "
-        //           << vlevels[sc].octave
-        //           << " relScale "
-        //           << vlevels[sc].relScale
-        //           << " " << vlevels[sc].shrScale
-        //           << " [" << (int)vlevels[sc].objSize.x
-        //           << " " <<  (int)vlevels[sc].objSize.y << "] ["
-        // <<  (int)vlevels[sc].workRect.x << " " <<  (int)vlevels[sc].workRect.y << "]" << std::endl;
-    }
-
-    levels.upload(cv::Mat(1, vlevels.size() * sizeof(Level), CV_8UC1, (uchar*)&(vlevels[0]) ));
-}
+};
 
 cv::gpu::SoftCascade::SoftCascade() : filds(0) {}
 
@@ -513,21 +503,15 @@ cv::gpu::SoftCascade::~SoftCascade()
 
 bool cv::gpu::SoftCascade::load( const string& filename, const float minScale, const float maxScale)
 {
-    if (filds)
-        delete filds;
-    filds = 0;
+    if (filds) delete filds;
 
     cv::FileStorage fs(filename, FileStorage::READ);
     if (!fs.isOpened()) return false;
 
-    filds = new Filds;
-    Filds& flds = *filds;
-    if (!flds.fill(fs.getFirstTopLevelNode(), minScale, maxScale)) return false;
-    return true;
+    filds = Filds::parseCascade(fs.getFirstTopLevelNode(), minScale, maxScale);
+    return filds != 0;
 }
 
-//================================== synchronous version ============================================================//
-
 void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat& rois,
                                 GpuMat& objects, const int /*rejectfactor*/, int specificScale) const
 {
@@ -562,7 +546,7 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat&, const GpuMat&, GpuMat
 
 cv::Size cv::gpu::SoftCascade::getRoiSize() const
 {
-    return cv::Size(Filds::FRAME_WIDTH / 4, Filds::FRAME_HEIGHT / 4);
+    return cv::Size(Filds::FRAME_WIDTH / (*filds).shrinkage, Filds::FRAME_HEIGHT / (*filds).shrinkage);
 }
 
 #endif
\ No newline at end of file

From 2bcb8dbd83a67e27868fba2101bc443d9f892f5c Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Thu, 11 Oct 2012 21:56:36 +0400
Subject: [PATCH 45/74] refactor performance tests

---
 modules/gpu/perf/perf_softcascade.cpp | 182 ++++++++++++++++++++++++++
 1 file changed, 182 insertions(+)
 create mode 100644 modules/gpu/perf/perf_softcascade.cpp

diff --git a/modules/gpu/perf/perf_softcascade.cpp b/modules/gpu/perf/perf_softcascade.cpp
new file mode 100644
index 000000000..783089fcb
--- /dev/null
+++ b/modules/gpu/perf/perf_softcascade.cpp
@@ -0,0 +1,182 @@
+#include "perf_precomp.hpp"
+
+#define GPU_PERF_TEST_P(fixture, name, params)  \
+    class fixture##_##name : public fixture {\
+     public:\
+      fixture##_##name() {}\
+     protected:\
+             virtual void __cpu();\
+        virtual void __gpu();\
+      virtual void PerfTestBody();\
+    };\
+    TEST_P(fixture##_##name, name /*perf*/){ RunPerfTestBody(); if (runOnGpu) __gpu(); else __cpu();}\
+    INSTANTIATE_TEST_CASE_P(/*none*/, fixture##_##name, params);\
+    void fixture##_##name::PerfTestBody()
+
+#define RUN_CPU(fixture, name)\
+    void fixture##_##name::__cpu()
+
+#define RUN_GPU(fixture, name)\
+    void fixture##_##name::__gpu()
+
+#define FAIL_NO_CPU(fixture, name)\
+void fixture##_##name::__cpu() { FAIL() << "No such CPU implementation analogy";}
+
+
+typedef std::tr1::tuple<std::string, std::string> fixture_t;
+typedef perf::TestBaseWithParam<fixture_t> SoftCascadeTest;
+
+GPU_PERF_TEST_P(SoftCascadeTest, detect,
+    testing::Combine(
+        testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")),
+        testing::Values(std::string("cv/cascadeandhog/bahnhof/image_00000000_0.png"))))
+{ }
+
+RUN_GPU(SoftCascadeTest, detect)
+{
+    cv::Mat cpu = readImage (GET_PARAM(1));
+    ASSERT_FALSE(cpu.empty());
+    cv::gpu::GpuMat colored(cpu);
+
+    cv::gpu::SoftCascade cascade;
+    ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath(GET_PARAM(0))));
+
+    cv::gpu::GpuMat objectBoxes(1, 16384, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1), trois;
+    rois.setTo(1);
+    cv::gpu::transpose(rois, trois);
+
+    cv::gpu::GpuMat curr = objectBoxes;
+    cascade.detectMultiScale(colored, trois, curr);
+
+    TEST_CYCLE()
+    {
+        curr = objectBoxes;
+        cascade.detectMultiScale(colored, trois, curr);
+    }
+}
+
+RUN_CPU(SoftCascadeTest, detect)
+{
+    cv::Mat colored = readImage(GET_PARAM(1));
+    ASSERT_FALSE(colored.empty());
+
+    cv::SoftCascade cascade;
+    ASSERT_TRUE(cascade.load(getDataPath(GET_PARAM(0))));
+
+    std::vector<cv::Rect> rois;
+
+    typedef cv::SoftCascade::Detection Detection;
+    std::vector<Detection>objectBoxes;
+    cascade.detectMultiScale(colored, rois, objectBoxes);
+
+    TEST_CYCLE()
+    {
+        cascade.detectMultiScale(colored, rois, objectBoxes);
+    }
+}
+
+static cv::Rect getFromTable(int idx)
+{
+    static const cv::Rect rois[] =
+    {
+        cv::Rect( 65,  20,  35, 80),
+        cv::Rect( 95,  35,  45, 40),
+        cv::Rect( 45,  35,  45, 40),
+        cv::Rect( 25,  27,  50, 45),
+        cv::Rect(100,  50,  45, 40),
+
+        cv::Rect( 60,  30,  45, 40),
+        cv::Rect( 40,  55,  50, 40),
+        cv::Rect( 48,  37,  72, 80),
+        cv::Rect( 48,  32,  85, 58),
+        cv::Rect( 48,   0,  32, 27)
+    };
+
+    return rois[idx];
+}
+
+typedef std::tr1::tuple<std::string, std::string, int> roi_fixture_t;
+typedef perf::TestBaseWithParam<roi_fixture_t> SoftCascadeTestRoi;
+
+GPU_PERF_TEST_P(SoftCascadeTestRoi, detectInRoi,
+    testing::Combine(
+        testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")),
+        testing::Values(std::string("cv/cascadeandhog/bahnhof/image_00000000_0.png")),
+        testing::Range(0, 5)))
+{}
+
+RUN_GPU(SoftCascadeTestRoi, detectInRoi)
+{
+    cv::Mat cpu = readImage (GET_PARAM(1));
+    ASSERT_FALSE(cpu.empty());
+    cv::gpu::GpuMat colored(cpu);
+
+    cv::gpu::SoftCascade cascade;
+    ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath(GET_PARAM(0))));
+
+    cv::gpu::GpuMat objectBoxes(1, 16384 * 20, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1);
+    rois.setTo(0);
+
+    int nroi = GET_PARAM(2);
+    cv::RNG rng;
+    for (int i = 0; i < nroi; ++i)
+    {
+        cv::Rect r = getFromTable(rng(10));
+        cv::gpu::GpuMat sub(rois, r);
+        sub.setTo(1);
+    }
+
+    cv::gpu::GpuMat trois;
+    cv::gpu::transpose(rois, trois);
+
+    cv::gpu::GpuMat curr = objectBoxes;
+    cascade.detectMultiScale(colored, trois, curr);
+
+    TEST_CYCLE()
+    {
+        curr = objectBoxes;
+        cascade.detectMultiScale(colored, trois, curr);
+    }
+}
+
+FAIL_NO_CPU(SoftCascadeTestRoi, detectInRoi)
+
+
+GPU_PERF_TEST_P(SoftCascadeTestRoi, detectEachRoi,
+    testing::Combine(
+        testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")),
+        testing::Values(std::string("cv/cascadeandhog/bahnhof/image_00000000_0.png")),
+        testing::Range(0, 10)))
+{}
+
+RUN_GPU(SoftCascadeTestRoi, detectEachRoi)
+{
+    cv::Mat cpu = readImage (GET_PARAM(1));
+    ASSERT_FALSE(cpu.empty());
+    cv::gpu::GpuMat colored(cpu);
+
+    cv::gpu::SoftCascade cascade;
+    ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath(GET_PARAM(0))));
+
+    cv::gpu::GpuMat objectBoxes(1, 16384 * 20, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1);
+    rois.setTo(0);
+
+    int idx = GET_PARAM(2);
+    cv::Rect r = getFromTable(idx);
+    cv::gpu::GpuMat sub(rois, r);
+    sub.setTo(1);
+
+    cv::gpu::GpuMat curr = objectBoxes;
+    cv::gpu::GpuMat trois;
+    cv::gpu::transpose(rois, trois);
+
+    cascade.detectMultiScale(colored, trois, curr);
+
+    TEST_CYCLE()
+    {
+        curr = objectBoxes;
+        cascade.detectMultiScale(colored, trois, curr);
+    }
+}
+
+FAIL_NO_CPU(SoftCascadeTestRoi, detectEachRoi)
\ No newline at end of file

From 022a8b9698c0c488c03fdfb83b47bf1675b6712a Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Mon, 15 Oct 2012 15:27:33 +0400
Subject: [PATCH 46/74] fix rounding bug in Level creation

---
 modules/gpu/src/icf.hpp         | 11 +----------
 modules/gpu/src/imgproc.cpp     |  2 +-
 modules/gpu/src/softcascade.cpp | 12 +++++++++++-
 3 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/modules/gpu/src/icf.hpp b/modules/gpu/src/icf.hpp
index 06c81149e..d829012c8 100644
--- a/modules/gpu/src/icf.hpp
+++ b/modules/gpu/src/icf.hpp
@@ -81,16 +81,7 @@ struct __align__(8) Level //is actually 24 bytes
     uchar2 workRect;
     uchar2 objSize;
 
-    Level(int idx, const Octave& oct, const float scale, const int w, const int h)
-    :  octave(idx), relScale(scale / oct.scale), shrScale (relScale / (float)oct.shrinkage)
-    {
-        workRect.x = round(w / (float)oct.shrinkage);
-        workRect.y = round(h / (float)oct.shrinkage);
-
-        objSize.x  = round(oct.size.x * relScale);
-        objSize.y  = round(oct.size.y * relScale);
-    }
-
+    Level(int idx, const Octave& oct, const float scale, const int w, const int h);
     __device Level(){}
 };
 
diff --git a/modules/gpu/src/imgproc.cpp b/modules/gpu/src/imgproc.cpp
index 0bf9c81c2..df02213b1 100644
--- a/modules/gpu/src/imgproc.cpp
+++ b/modules/gpu/src/imgproc.cpp
@@ -553,7 +553,7 @@ void cv::gpu::integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, S
 
     src.locateROI(whole, offset);
 
-    if (info.supports(WARP_SHUFFLE_FUNCTIONS) && src.cols <= 2048)
+    if (false && info.supports(WARP_SHUFFLE_FUNCTIONS) && src.cols <= 2048)
     {
         GpuMat srcAlligned;
 
diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index fc7114b5f..e5d8cb9fb 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -65,6 +65,16 @@ cv::Size cv::gpu::SoftCascade::getRoiSize() const { throw_nogpu(); return cv::Si
 
 #include <icf.hpp>
 
+cv::gpu::device::icf::Level::Level(int idx, const Octave& oct, const float scale, const int w, const int h)
+:  octave(idx), relScale(scale / oct.scale), shrScale (relScale / (float)oct.shrinkage)
+{
+    workRect.x = round(w / (float)oct.shrinkage);
+    workRect.y = round(h / (float)oct.shrinkage);
+
+    objSize.x  = cv::saturate_cast<uchar>(oct.size.x * relScale);
+    objSize.y  = cv::saturate_cast<uchar>(oct.size.y * relScale);
+}
+
 namespace cv { namespace gpu { namespace device {
 namespace icf {
     void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle,
@@ -72,7 +82,7 @@ namespace icf {
 }
 namespace imgproc
 {
-    void meanShiftFiltering_gpu(const PtrStepSzb& src, PtrStepSzb dst, int sp, int sr, int maxIter, float eps, cudaStream_t stream);
+    void shfl_integral_gpu(PtrStepSzb img, PtrStepSz<unsigned int> integral, cudaStream_t stream);
 }
 }}}
 

From ca81628a9a3fe8e82f5ba3348a3036e5b6475e45 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Mon, 15 Oct 2012 17:38:21 +0400
Subject: [PATCH 47/74] fix retrieval of detections count

---
 modules/gpu/src/cuda/isf-sc.cu        | 2 +-
 modules/gpu/src/softcascade.cpp       | 2 +-
 modules/gpu/test/test_softcascade.cpp | 4 ++++
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu
index 74e47ba19..7aef41abc 100644
--- a/modules/gpu/src/cuda/isf-sc.cu
+++ b/modules/gpu/src/cuda/isf-sc.cu
@@ -313,7 +313,7 @@ namespace icf {
         dim3 block(32, 8);
         dim3 grid(fw, fh / 8, (scale == -1) ? downscales : 1);
 
-        uint* ctr = (uint*)counter.ptr();
+        uint* ctr = (uint*)(counter.ptr(0));
         Detection* det = (Detection*)objects.ptr();
         uint max_det = objects.cols / sizeof(Detection);
 
diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index e5d8cb9fb..560c25196 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -541,7 +541,7 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat&
     flds.detect(specificScale, rois, objects, 0);
 
     cv::Mat out(flds.detCounter);
-    int ndetections = *(out.data);
+    int ndetections = *(out.ptr<int>(0));
 
     if (! ndetections)
         objects = GpuMat();
diff --git a/modules/gpu/test/test_softcascade.cpp b/modules/gpu/test/test_softcascade.cpp
index 04fa9b181..04c38557c 100644
--- a/modules/gpu/test/test_softcascade.cpp
+++ b/modules/gpu/test/test_softcascade.cpp
@@ -257,5 +257,9 @@ TEST(SoftCascadeTest, detect)
     cv::gpu::transpose(rois, trois);
 
     cascade.detectMultiScale(colored, trois, objectBoxes);
+
+    typedef cv::gpu::SoftCascade::Detection Detection;
+    cv::Mat detections(objectBoxes);
+    ASSERT_EQ(detections.cols / sizeof(Detection) ,3670U);
 }
 #endif
\ No newline at end of file

From fa55d51b6ae616b90a036de7a231335f1fce475f Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Mon, 15 Oct 2012 18:13:58 +0400
Subject: [PATCH 48/74] add sanity check to performance tests for soft cascade

---
 modules/gpu/perf/perf_softcascade.cpp | 59 ++++++++++++++++++++++++---
 1 file changed, 53 insertions(+), 6 deletions(-)

diff --git a/modules/gpu/perf/perf_softcascade.cpp b/modules/gpu/perf/perf_softcascade.cpp
index 783089fcb..d379d7fe5 100644
--- a/modules/gpu/perf/perf_softcascade.cpp
+++ b/modules/gpu/perf/perf_softcascade.cpp
@@ -5,7 +5,7 @@
      public:\
       fixture##_##name() {}\
      protected:\
-             virtual void __cpu();\
+        virtual void __cpu();\
         virtual void __gpu();\
       virtual void PerfTestBody();\
     };\
@@ -22,6 +22,44 @@
 #define FAIL_NO_CPU(fixture, name)\
 void fixture##_##name::__cpu() { FAIL() << "No such CPU implementation analogy";}
 
+namespace {
+    struct DetectionLess
+    {
+        bool operator()(const cv::gpu::SoftCascade::Detection& a,
+            const cv::gpu::SoftCascade::Detection& b) const
+        {
+            if (a.x != b.x) return a.x < b.x;
+            else if (a.y != b.y) return a.y < b.y;
+            else if (a.w != b.w) return a.w < b.w;
+            else return a.h < b.h;
+        }
+
+        bool operator()(const cv::SoftCascade::Detection& a,
+            const cv::SoftCascade::Detection& b) const
+        {
+            const cv::Rect& ra = a.rect;
+            const cv::Rect& rb = b.rect;
+
+            if (ra.x != rb.x) return ra.x < rb.x;
+            else if (ra.y != rb.y) return ra.y < rb.y;
+            else if (ra.width != rb.width) return ra.width < rb.width;
+            else return ra.height < rb.height;
+        }
+    };
+
+    cv::Mat sortDetections(cv::gpu::GpuMat& objects)
+    {
+        cv::Mat detections(objects);
+
+        typedef cv::gpu::SoftCascade::Detection Detection;
+        Detection* begin = (Detection*)(detections.ptr<char>(0));
+        Detection* end = (Detection*)(detections.ptr<char>(0) + detections.cols);
+        std::sort(begin, end, DetectionLess());
+
+        return detections;
+    }
+}
+
 
 typedef std::tr1::tuple<std::string, std::string> fixture_t;
 typedef perf::TestBaseWithParam<fixture_t> SoftCascadeTest;
@@ -41,7 +79,7 @@ RUN_GPU(SoftCascadeTest, detect)
     cv::gpu::SoftCascade cascade;
     ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath(GET_PARAM(0))));
 
-    cv::gpu::GpuMat objectBoxes(1, 16384, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1), trois;
+    cv::gpu::GpuMat objectBoxes(1, 10000 * sizeof(cv::gpu::SoftCascade::Detection), CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1), trois;
     rois.setTo(1);
     cv::gpu::transpose(rois, trois);
 
@@ -53,6 +91,8 @@ RUN_GPU(SoftCascadeTest, detect)
         curr = objectBoxes;
         cascade.detectMultiScale(colored, trois, curr);
     }
+
+    SANITY_CHECK(sortDetections(curr));
 }
 
 RUN_CPU(SoftCascadeTest, detect)
@@ -66,13 +106,16 @@ RUN_CPU(SoftCascadeTest, detect)
     std::vector<cv::Rect> rois;
 
     typedef cv::SoftCascade::Detection Detection;
-    std::vector<Detection>objectBoxes;
-    cascade.detectMultiScale(colored, rois, objectBoxes);
+    std::vector<Detection>objects;
+    cascade.detectMultiScale(colored, rois, objects);
 
     TEST_CYCLE()
     {
-        cascade.detectMultiScale(colored, rois, objectBoxes);
+        cascade.detectMultiScale(colored, rois, objects);
     }
+
+    std::sort(objects.begin(), objects.end(), DetectionLess());
+    SANITY_CHECK(objects);
 }
 
 static cv::Rect getFromTable(int idx)
@@ -137,6 +180,8 @@ RUN_GPU(SoftCascadeTestRoi, detectInRoi)
         curr = objectBoxes;
         cascade.detectMultiScale(colored, trois, curr);
     }
+
+    SANITY_CHECK(sortDetections(curr));
 }
 
 FAIL_NO_CPU(SoftCascadeTestRoi, detectInRoi)
@@ -177,6 +222,8 @@ RUN_GPU(SoftCascadeTestRoi, detectEachRoi)
         curr = objectBoxes;
         cascade.detectMultiScale(colored, trois, curr);
     }
+
+    SANITY_CHECK(sortDetections(curr));
 }
 
-FAIL_NO_CPU(SoftCascadeTestRoi, detectEachRoi)
\ No newline at end of file
+FAIL_NO_CPU(SoftCascadeTestRoi, detectEachRoi)

From 2bd35c4358400657b0b6b1a47905152410e01628 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Mon, 15 Oct 2012 21:55:57 +0400
Subject: [PATCH 49/74] add support for precomputed integrals

---
 modules/gpu/perf/perf_softcascade.cpp | 56 +++++++++++++++++++++++++--
 modules/gpu/src/softcascade.cpp       | 15 +++++--
 modules/gpu/test/test_softcascade.cpp | 33 ++++++++++++++++
 3 files changed, 96 insertions(+), 8 deletions(-)

diff --git a/modules/gpu/perf/perf_softcascade.cpp b/modules/gpu/perf/perf_softcascade.cpp
index d379d7fe5..582561c7c 100644
--- a/modules/gpu/perf/perf_softcascade.cpp
+++ b/modules/gpu/perf/perf_softcascade.cpp
@@ -9,7 +9,7 @@
         virtual void __gpu();\
       virtual void PerfTestBody();\
     };\
-    TEST_P(fixture##_##name, name /*perf*/){ RunPerfTestBody(); if (runOnGpu) __gpu(); else __cpu();}\
+    TEST_P(fixture##_##name, name /*perf*/){ RunPerfTestBody(); if (PERF_RUN_GPU()) __gpu(); else __cpu();}\
     INSTANTIATE_TEST_CASE_P(/*none*/, fixture##_##name, params);\
     void fixture##_##name::PerfTestBody()
 
@@ -19,7 +19,7 @@
 #define RUN_GPU(fixture, name)\
     void fixture##_##name::__gpu()
 
-#define FAIL_NO_CPU(fixture, name)\
+#define NO_CPU(fixture, name)\
 void fixture##_##name::__cpu() { FAIL() << "No such CPU implementation analogy";}
 
 namespace {
@@ -184,7 +184,7 @@ RUN_GPU(SoftCascadeTestRoi, detectInRoi)
     SANITY_CHECK(sortDetections(curr));
 }
 
-FAIL_NO_CPU(SoftCascadeTestRoi, detectInRoi)
+NO_CPU(SoftCascadeTestRoi, detectInRoi)
 
 
 GPU_PERF_TEST_P(SoftCascadeTestRoi, detectEachRoi,
@@ -226,4 +226,52 @@ RUN_GPU(SoftCascadeTestRoi, detectEachRoi)
     SANITY_CHECK(sortDetections(curr));
 }
 
-FAIL_NO_CPU(SoftCascadeTestRoi, detectEachRoi)
+NO_CPU(SoftCascadeTestRoi, detectEachRoi)
+
+GPU_PERF_TEST_P(SoftCascadeTest, detectOnIntegral,
+    testing::Combine(
+        testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")),
+        testing::Values(std::string("cv/cascadeandhog/integrals.xml"))))
+{ }
+
+    static std::string itoa(long i)
+    {
+        static char s[65];
+        sprintf(s, "%ld", i);
+        return std::string(s);
+    }
+
+RUN_GPU(SoftCascadeTest, detectOnIntegral)
+{
+    cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(1)), cv::FileStorage::READ);
+    ASSERT_TRUE(fs.isOpened());
+
+    cv::gpu::GpuMat hogluv(121 * 10, 161, CV_32SC1);
+    for (int i = 0; i < 10; ++i)
+    {
+        cv::Mat channel;
+        fs[std::string("channel") + itoa(i)] >> channel;
+        cv::gpu::GpuMat gchannel(hogluv, cv::Rect(0, 121 * i, 161, 121));
+        gchannel.upload(channel);
+    }
+
+    cv::gpu::SoftCascade cascade;
+    ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath(GET_PARAM(0))));
+
+    cv::gpu::GpuMat objectBoxes(1, 10000 * sizeof(cv::gpu::SoftCascade::Detection), CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1), trois;
+    rois.setTo(1);
+    cv::gpu::transpose(rois, trois);
+
+    cv::gpu::GpuMat curr = objectBoxes;
+    cascade.detectMultiScale(hogluv, trois, curr);
+
+    TEST_CYCLE()
+    {
+        curr = objectBoxes;
+        cascade.detectMultiScale(hogluv, trois, curr);
+    }
+
+    SANITY_CHECK(sortDetections(curr));
+}
+
+NO_CPU(SoftCascadeTest, detectOnIntegral)
\ No newline at end of file
diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index 560c25196..d9519e873 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -526,17 +526,24 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat&
                                 GpuMat& objects, const int /*rejectfactor*/, int specificScale) const
 {
     // only color images are supperted
-    CV_Assert(colored.type() == CV_8UC3);
+    CV_Assert(colored.type() == CV_8UC3 || colored.type() == CV_32SC1);
 
     // we guess user knows about shrincage
     CV_Assert((rois.size().width == getRoiSize().height) && (rois.type() == CV_8UC1));
 
-    // only this window size allowed
-    CV_Assert(colored.cols == Filds::FRAME_WIDTH && colored.rows == Filds::FRAME_HEIGHT);
 
     Filds& flds = *filds;
 
-    flds.preprocess(colored);
+    if (colored.type() == CV_8UC3)
+    {
+        // only this window size allowed
+        CV_Assert(colored.cols == Filds::FRAME_WIDTH && colored.rows == Filds::FRAME_HEIGHT);
+        flds.preprocess(colored);
+    }
+    else
+    {
+        colored.copyTo(flds.hogluv);
+    }
 
     flds.detect(specificScale, rois, objects, 0);
 
diff --git a/modules/gpu/test/test_softcascade.cpp b/modules/gpu/test/test_softcascade.cpp
index 04c38557c..bf880297b 100644
--- a/modules/gpu/test/test_softcascade.cpp
+++ b/modules/gpu/test/test_softcascade.cpp
@@ -262,4 +262,37 @@ TEST(SoftCascadeTest, detect)
     cv::Mat detections(objectBoxes);
     ASSERT_EQ(detections.cols / sizeof(Detection) ,3670U);
 }
+
+TEST(SoftCascadeTest, detectOnIntegral)
+{
+    std::string xml =  cvtest::TS::ptr()->get_data_path() + "../cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml";
+    cv::gpu::SoftCascade cascade;
+    ASSERT_TRUE(cascade.load(xml));
+
+    std::string intPath = cvtest::TS::ptr()->get_data_path() + "../cv/cascadeandhog/integrals.xml";
+    cv::FileStorage fs(intPath, cv::FileStorage::READ);
+    ASSERT_TRUE(fs.isOpened());
+
+    GpuMat hogluv(121 * 10, 161, CV_32SC1);
+    for (int i = 0; i < 10; ++i)
+    {
+        cv::Mat channel;
+        fs[std::string("channel") + SoftCascadeTest::itoa(i)] >> channel;
+        GpuMat gchannel(hogluv, cv::Rect(0, 121 * i, 161, 121));
+        gchannel.upload(channel);
+    }
+
+    GpuMat objectBoxes(1, 100000, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1);
+    rois.setTo(1);
+
+    cv::gpu::GpuMat trois;
+    cv::gpu::transpose(rois, trois);
+
+    cascade.detectMultiScale(hogluv, trois, objectBoxes);
+
+    typedef cv::gpu::SoftCascade::Detection Detection;
+    cv::Mat detections(objectBoxes);
+
+    ASSERT_EQ(detections.cols / sizeof(Detection) ,2042U);
+}
 #endif
\ No newline at end of file

From fba62c9251053410adc71ead3d9e594567426242 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Sat, 3 Nov 2012 18:03:31 +0400
Subject: [PATCH 50/74] fix compilation problem after rebase

---
 modules/gpu/perf/perf_objdetect.cpp   | 242 +-------------------------
 modules/gpu/perf/perf_softcascade.cpp |  56 +++---
 2 files changed, 31 insertions(+), 267 deletions(-)

diff --git a/modules/gpu/perf/perf_objdetect.cpp b/modules/gpu/perf/perf_objdetect.cpp
index ced8ee17d..6d040ac02 100644
--- a/modules/gpu/perf/perf_objdetect.cpp
+++ b/modules/gpu/perf/perf_objdetect.cpp
@@ -89,244 +89,6 @@ PERF_TEST_P(HOG, CalTech, Values<string>("gpu/caltech/image_00000009_0.png", "gp
     SANITY_CHECK(found_locations);
 }
 
-//================================================= ICF SoftCascade =================================================//
-
-typedef pair<string, string> pair_string;
-DEF_PARAM_TEST_1(SoftCascade, pair_string);
-
-
-// struct SoftCascadeTest : public perf::TestBaseWithParam<roi_fixture_t>
-// {
-//     typedef cv::gpu::SoftCascade::Detection detection_t;
-//     static cv::Rect getFromTable(int idx)
-//     {
-//         static const cv::Rect rois[] =
-//         {
-//             cv::Rect( 65,  20,  35, 80),
-//             cv::Rect( 95,  35,  45, 40),
-//             cv::Rect( 45,  35,  45, 40),
-//             cv::Rect( 25,  27,  50, 45),
-//             cv::Rect(100,  50,  45, 40),
-
-//             cv::Rect( 60,  30,  45, 40),
-//             cv::Rect( 40,  55,  50, 40),
-//             cv::Rect( 48,  37,  72, 80),
-//             cv::Rect( 48,  32,  85, 58),
-//             cv::Rect( 48,   0,  32, 27)
-//         };
-
-//         return rois[idx];
-//     }
-
-//     static std::string itoa(long i)
-//     {
-//         static char s[65];
-//         sprintf(s, "%ld", i);
-//         return std::string(s);
-//     }
-
-//     static std::string getImageName(int level)
-//     {
-//         time_t rawtime;
-//         struct tm * timeinfo;
-//         char buffer [80];
-
-//         time ( &rawtime );
-//         timeinfo = localtime ( &rawtime );
-
-//         strftime (buffer,80,"%Y-%m-%d--%H-%M-%S",timeinfo);
-//         return "gpu_rec_level_" + itoa(level)+ "_" + std::string(buffer) + ".png";
-//     }
-
-//     static void print(std::ostream &out, const detection_t& d)
-//     {
-//         out << "\x1b[32m[ detection]\x1b[0m ("
-//             << std::setw(4)  << d.x
-//             << " "
-//             << std::setw(4)  << d.y
-//             << ") ("
-//             << std::setw(4)  << d.w
-//             << " "
-//             << std::setw(4)  << d.h
-//             << ") "
-//             << std::setw(12) << d.confidence
-//             <<  std::endl;
-//     }
-
-//     static void printTotal(std::ostream &out, int detbytes)
-//     {
-//         out << "\x1b[32m[          ]\x1b[0m Total detections " << (detbytes / sizeof(detection_t)) << std::endl;
-//     }
-
-//     static void writeResult(const cv::Mat& result, const int level)
-//     {
-//         std::string path = cv::tempfile(getImageName(level).c_str());
-//         cv::imwrite(path, result);
-//         std::cout << "\x1b[32m" << "[          ]" << std::endl << "[ stored in]"<< "\x1b[0m" << path << std::endl;
-//     }
-// };
-
-typedef std::tr1::tuple<std::string, std::string> fixture_t;
-typedef perf::TestBaseWithParam<fixture_t> SoftCascadeTest;
-
-PERF_TEST_P(SoftCascadeTest, detect,
-    testing::Combine(
-        testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")),
-        testing::Values(std::string("cv/cascadeandhog/bahnhof/image_00000000_0.png"))))
-{
-    if (runOnGpu)
-    {
-        cv::Mat cpu = readImage (GET_PARAM(1));
-        ASSERT_FALSE(cpu.empty());
-        cv::gpu::GpuMat colored(cpu);
-
-        cv::gpu::SoftCascade cascade;
-        ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath(GET_PARAM(0))));
-
-        cv::gpu::GpuMat objectBoxes(1, 16384, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1), trois;
-        rois.setTo(1);
-        cv::gpu::transpose(rois, trois);
-
-        cv::gpu::GpuMat curr = objectBoxes;
-        cascade.detectMultiScale(colored, trois, curr);
-
-        TEST_CYCLE()
-        {
-            curr = objectBoxes;
-            cascade.detectMultiScale(colored, trois, curr);
-        }
-    }
-    else
-    {
-        cv::Mat colored = readImage(GET_PARAM(1));
-        ASSERT_FALSE(colored.empty());
-
-        cv::SoftCascade cascade;
-        ASSERT_TRUE(cascade.load(getDataPath(GET_PARAM(0))));
-
-        std::vector<cv::Rect> rois;
-
-        typedef cv::SoftCascade::Detection Detection;
-        std::vector<Detection>objectBoxes;
-        cascade.detectMultiScale(colored, rois, objectBoxes);
-
-        TEST_CYCLE()
-        {
-            cascade.detectMultiScale(colored, rois, objectBoxes);
-        }
-    }
-}
-
-static cv::Rect getFromTable(int idx)
-{
-    static const cv::Rect rois[] =
-    {
-        cv::Rect( 65,  20,  35, 80),
-        cv::Rect( 95,  35,  45, 40),
-        cv::Rect( 45,  35,  45, 40),
-        cv::Rect( 25,  27,  50, 45),
-        cv::Rect(100,  50,  45, 40),
-
-        cv::Rect( 60,  30,  45, 40),
-        cv::Rect( 40,  55,  50, 40),
-        cv::Rect( 48,  37,  72, 80),
-        cv::Rect( 48,  32,  85, 58),
-        cv::Rect( 48,   0,  32, 27)
-    };
-
-    return rois[idx];
-}
-
-typedef std::tr1::tuple<std::string, std::string, int> roi_fixture_t;
-typedef perf::TestBaseWithParam<roi_fixture_t> SoftCascadeTestRoi;
-
-PERF_TEST_P(SoftCascadeTestRoi, detectInRoi,
-    testing::Combine(
-        testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")),
-        testing::Values(std::string("cv/cascadeandhog/bahnhof/image_00000000_0.png")),
-        testing::Range(0, 5)))
-{
-    if (runOnGpu)
-    {
-        cv::Mat cpu = readImage (GET_PARAM(1));
-        ASSERT_FALSE(cpu.empty());
-        cv::gpu::GpuMat colored(cpu);
-
-        cv::gpu::SoftCascade cascade;
-        ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath(GET_PARAM(0))));
-
-        cv::gpu::GpuMat objectBoxes(1, 16384 * 20, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1);
-        rois.setTo(0);
-
-        int nroi = GET_PARAM(2);
-        cv::RNG rng;
-        for (int i = 0; i < nroi; ++i)
-        {
-            cv::Rect r = getFromTable(rng(10));
-            cv::gpu::GpuMat sub(rois, r);
-            sub.setTo(1);
-        }
-
-        cv::gpu::GpuMat trois;
-        cv::gpu::transpose(rois, trois);
-
-        cv::gpu::GpuMat curr = objectBoxes;
-        cascade.detectMultiScale(colored, trois, curr);
-
-        TEST_CYCLE()
-        {
-            curr = objectBoxes;
-            cascade.detectMultiScale(colored, trois, curr);
-        }
-    }
-    else
-    {
-        FAIL();
-    }
-}
-
-PERF_TEST_P(SoftCascadeTestRoi, detectEachRoi,
-    testing::Combine(
-        testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")),
-        testing::Values(std::string("cv/cascadeandhog/bahnhof/image_00000000_0.png")),
-        testing::Range(0, 10)))
-{
-    if (runOnGpu)
-    {
-        cv::Mat cpu = readImage (GET_PARAM(1));
-        ASSERT_FALSE(cpu.empty());
-        cv::gpu::GpuMat colored(cpu);
-
-        cv::gpu::SoftCascade cascade;
-        ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath(GET_PARAM(0))));
-
-        cv::gpu::GpuMat objectBoxes(1, 16384 * 20, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1);
-        rois.setTo(0);
-
-        int idx = GET_PARAM(2);
-        cv::Rect r = getFromTable(idx);
-        cv::gpu::GpuMat sub(rois, r);
-        sub.setTo(1);
-
-        cv::gpu::GpuMat curr = objectBoxes;
-        cv::gpu::GpuMat trois;
-        cv::gpu::transpose(rois, trois);
-
-        cascade.detectMultiScale(colored, trois, curr);
-
-        TEST_CYCLE()
-        {
-            curr = objectBoxes;
-            cascade.detectMultiScale(colored, rois, curr);
-        }
-    }
-    else
-    {
-        FAIL();
-    }
-}
-
-
 ///////////////////////////////////////////////////////////////
 // HaarClassifier
 
@@ -383,7 +145,7 @@ PERF_TEST_P(ImageAndCascade, ObjDetect_LBPClassifier,
     cv::Mat img = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
     ASSERT_FALSE(img.empty());
 
-    if (runOnGpu)
+    if (PERF_RUN_GPU())
     {
         cv::gpu::CascadeClassifier_GPU d_cascade;
         ASSERT_TRUE(d_cascade.load(perf::TestBase::getDataPath(GetParam().second)));
@@ -418,4 +180,4 @@ PERF_TEST_P(ImageAndCascade, ObjDetect_LBPClassifier,
     }
 }
 
-} // namespace
+} // namespace
\ No newline at end of file
diff --git a/modules/gpu/perf/perf_softcascade.cpp b/modules/gpu/perf/perf_softcascade.cpp
index 582561c7c..9b53b2e84 100644
--- a/modules/gpu/perf/perf_softcascade.cpp
+++ b/modules/gpu/perf/perf_softcascade.cpp
@@ -34,17 +34,17 @@ namespace {
             else return a.h < b.h;
         }
 
-        bool operator()(const cv::SoftCascade::Detection& a,
-            const cv::SoftCascade::Detection& b) const
-        {
-            const cv::Rect& ra = a.rect;
-            const cv::Rect& rb = b.rect;
+        // bool operator()(const cv::SoftCascade::Detection& a,
+        //     const cv::SoftCascade::Detection& b) const
+        // {
+        //     const cv::Rect& ra = a.rect;
+        //     const cv::Rect& rb = b.rect;
 
-            if (ra.x != rb.x) return ra.x < rb.x;
-            else if (ra.y != rb.y) return ra.y < rb.y;
-            else if (ra.width != rb.width) return ra.width < rb.width;
-            else return ra.height < rb.height;
-        }
+        //     if (ra.x != rb.x) return ra.x < rb.x;
+        //     else if (ra.y != rb.y) return ra.y < rb.y;
+        //     else if (ra.width != rb.width) return ra.width < rb.width;
+        //     else return ra.height < rb.height;
+        // }
     };
 
     cv::Mat sortDetections(cv::gpu::GpuMat& objects)
@@ -95,28 +95,30 @@ RUN_GPU(SoftCascadeTest, detect)
     SANITY_CHECK(sortDetections(curr));
 }
 
-RUN_CPU(SoftCascadeTest, detect)
-{
-    cv::Mat colored = readImage(GET_PARAM(1));
-    ASSERT_FALSE(colored.empty());
+NO_CPU(SoftCascadeTest, detect)
 
-    cv::SoftCascade cascade;
-    ASSERT_TRUE(cascade.load(getDataPath(GET_PARAM(0))));
+// RUN_CPU(SoftCascadeTest, detect)
+// {
+//     cv::Mat colored = readImage(GET_PARAM(1));
+//     ASSERT_FALSE(colored.empty());
 
-    std::vector<cv::Rect> rois;
+//     cv::SoftCascade cascade;
+//     ASSERT_TRUE(cascade.load(getDataPath(GET_PARAM(0))));
 
-    typedef cv::SoftCascade::Detection Detection;
-    std::vector<Detection>objects;
-    cascade.detectMultiScale(colored, rois, objects);
+//     std::vector<cv::Rect> rois;
 
-    TEST_CYCLE()
-    {
-        cascade.detectMultiScale(colored, rois, objects);
-    }
+//     typedef cv::SoftCascade::Detection Detection;
+//     std::vector<Detection>objects;
+//     cascade.detectMultiScale(colored, rois, objects);
 
-    std::sort(objects.begin(), objects.end(), DetectionLess());
-    SANITY_CHECK(objects);
-}
+//     TEST_CYCLE()
+//     {
+//         cascade.detectMultiScale(colored, rois, objects);
+//     }
+
+//     std::sort(objects.begin(), objects.end(), DetectionLess());
+//     SANITY_CHECK(objects);
+// }
 
 static cv::Rect getFromTable(int idx)
 {

From ac5cd4827908952bf29a40879c1b4490c6d3127d Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Sun, 4 Nov 2012 03:29:20 +0400
Subject: [PATCH 51/74] add DeviceInfo parameter to the soft cascade tests

---
 modules/gpu/test/test_softcascade.cpp | 67 ++++++++++++++++-----------
 1 file changed, 39 insertions(+), 28 deletions(-)

diff --git a/modules/gpu/test/test_softcascade.cpp b/modules/gpu/test/test_softcascade.cpp
index bf880297b..fb936be88 100644
--- a/modules/gpu/test/test_softcascade.cpp
+++ b/modules/gpu/test/test_softcascade.cpp
@@ -68,12 +68,10 @@ using cv::gpu::GpuMat;
     INSTANTIATE_TEST_CASE_P(/*none*/, fixture##_##name, params);  \
     void fixture##_##name::body()
 
+namespace {
 
-typedef std::tr1::tuple<std::string, std::string, int> roi_fixture_t;
+    typedef cv::gpu::SoftCascade::Detection Detection;
 
-struct SoftCascadeTest : public ::testing::TestWithParam<roi_fixture_t>
-{
-    typedef cv::gpu::SoftCascade::Detection detection_t;
     static cv::Rect getFromTable(int idx)
     {
         static const cv::Rect rois[] =
@@ -114,7 +112,7 @@ struct SoftCascadeTest : public ::testing::TestWithParam<roi_fixture_t>
         return "gpu_rec_level_" + itoa(level)+ "_" + std::string(buffer) + ".png";
     }
 
-    static void print(std::ostream &out, const detection_t& d)
+    static void print(std::ostream &out, const Detection& d)
     {
         out << "\x1b[32m[ detection]\x1b[0m ("
             << std::setw(4)  << d.x
@@ -131,7 +129,7 @@ struct SoftCascadeTest : public ::testing::TestWithParam<roi_fixture_t>
 
     static void printTotal(std::ostream &out, int detbytes)
     {
-        out << "\x1b[32m[          ]\x1b[0m Total detections " << (detbytes / sizeof(detection_t)) << std::endl;
+        out << "\x1b[32m[          ]\x1b[0m Total detections " << (detbytes / sizeof(Detection)) << std::endl;
     }
 
     static void writeResult(const cv::Mat& result, const int level)
@@ -140,24 +138,27 @@ struct SoftCascadeTest : public ::testing::TestWithParam<roi_fixture_t>
         cv::imwrite(path, result);
         std::cout << "\x1b[32m" << "[          ]" << std::endl << "[ stored in]"<< "\x1b[0m" << path << std::endl;
     }
-};
+}
 
-GPU_TEST_P(SoftCascadeTest, detectInROI,
+typedef ::testing::TestWithParam<std::tr1::tuple<cv::gpu::DeviceInfo, std::string, std::string, int> > SoftCascadeTestRoi;
+GPU_TEST_P(SoftCascadeTestRoi, detect,
     testing::Combine(
+        ALL_DEVICES,
         testing::Values(std::string("../cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")),
         testing::Values(std::string("../cv/cascadeandhog/bahnhof/image_00000000_0.png")),
         testing::Range(0, 5)))
 {
-    cv::Mat coloredCpu = cv::imread(cvtest::TS::ptr()->get_data_path() + GET_PARAM(1));
+    cv::gpu::setDevice(GET_PARAM(0).deviceID());
+    cv::Mat coloredCpu = cv::imread(cvtest::TS::ptr()->get_data_path() + GET_PARAM(2));
     ASSERT_FALSE(coloredCpu.empty());
 
     cv::gpu::SoftCascade cascade;
-    ASSERT_TRUE(cascade.load(cvtest::TS::ptr()->get_data_path() + GET_PARAM(0)));
+    ASSERT_TRUE(cascade.load(cvtest::TS::ptr()->get_data_path() + GET_PARAM(1)));
 
     GpuMat colored(coloredCpu), objectBoxes(1, 16384, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1), trois;
     rois.setTo(0);
 
-    int nroi = GET_PARAM(2);
+    int nroi = GET_PARAM(3);
     cv::Mat result(coloredCpu);
     cv::RNG rng;
     for (int i = 0; i < nroi; ++i)
@@ -173,16 +174,15 @@ GPU_TEST_P(SoftCascadeTest, detectInROI,
 
     cascade.detectMultiScale(colored, trois, objectBoxes);
 
-    ///
     cv::Mat dt(objectBoxes);
-    typedef cv::gpu::SoftCascade::Detection detection_t;
+    typedef cv::gpu::SoftCascade::Detection Detection;
 
-    detection_t* dts = (detection_t*)dt.data;
+    Detection* dts = (Detection*)dt.data;
 
     printTotal(std::cout, dt.cols);
-    for (int i = 0; i  < (int)(dt.cols / sizeof(detection_t)); ++i)
+    for (int i = 0; i  < (int)(dt.cols / sizeof(Detection)); ++i)
     {
-        detection_t d = dts[i];
+        Detection d = dts[i];
         print(std::cout, d);
         cv::rectangle(result, cv::Rect(d.x, d.y, d.w, d.h), cv::Scalar(255, 0, 0, 255), 1);
     }
@@ -190,39 +190,43 @@ GPU_TEST_P(SoftCascadeTest, detectInROI,
     SHOW(result);
 }
 
-GPU_TEST_P(SoftCascadeTest, detectInLevel,
+typedef ::testing::TestWithParam<std::tr1::tuple<cv::gpu::DeviceInfo, std::string, std::string, int> > SoftCascadeTestLevel;
+GPU_TEST_P(SoftCascadeTestLevel, detect,
         testing::Combine(
+        ALL_DEVICES,
         testing::Values(std::string("../cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")),
         testing::Values(std::string("../cv/cascadeandhog/bahnhof/image_00000000_0.png")),
         testing::Range(0, 47)
         ))
 {
-    std::string xml =  cvtest::TS::ptr()->get_data_path() + GET_PARAM(0);
+    cv::gpu::setDevice(GET_PARAM(0).deviceID());
+
+    std::string xml =  cvtest::TS::ptr()->get_data_path() + GET_PARAM(1);
     cv::gpu::SoftCascade cascade;
     ASSERT_TRUE(cascade.load(xml));
 
-    cv::Mat coloredCpu = cv::imread(cvtest::TS::ptr()->get_data_path() + GET_PARAM(1));
+    cv::Mat coloredCpu = cv::imread(cvtest::TS::ptr()->get_data_path() + GET_PARAM(2));
     ASSERT_FALSE(coloredCpu.empty());
 
-    typedef cv::gpu::SoftCascade::Detection detection_t;
-    GpuMat colored(coloredCpu), objectBoxes(1, 100 * sizeof(detection_t), CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1);
+    typedef cv::gpu::SoftCascade::Detection Detection;
+    GpuMat colored(coloredCpu), objectBoxes(1, 100 * sizeof(Detection), CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1);
     rois.setTo(1);
 
     cv::gpu::GpuMat trois;
     cv::gpu::transpose(rois, trois);
 
-    int level = GET_PARAM(2);
+    int level = GET_PARAM(3);
     cascade.detectMultiScale(colored, trois, objectBoxes, 1, level);
 
     cv::Mat dt(objectBoxes);
 
-    detection_t* dts = (detection_t*)dt.data;
+    Detection* dts = (Detection*)dt.data;
     cv::Mat result(coloredCpu);
 
     printTotal(std::cout, dt.cols);
-    for (int i = 0; i  < (int)(dt.cols / sizeof(detection_t)); ++i)
+    for (int i = 0; i  < (int)(dt.cols / sizeof(Detection)); ++i)
     {
-        detection_t d = dts[i];
+        Detection d = dts[i];
         print(std::cout, d);
         cv::rectangle(result, cv::Rect(d.x, d.y, d.w, d.h), cv::Scalar(255, 0, 0, 255), 1);
     }
@@ -238,8 +242,12 @@ TEST(SoftCascadeTest, readCascade)
     ASSERT_TRUE(cascade.load(xml));
 }
 
-TEST(SoftCascadeTest, detect)
+typedef ::testing::TestWithParam<cv::gpu::DeviceInfo > SoftCascadeTestAll;
+GPU_TEST_P(SoftCascadeTestAll, detect,
+        ALL_DEVICES
+        )
 {
+    cv::gpu::setDevice(GetParam().deviceID());
     std::string xml =  cvtest::TS::ptr()->get_data_path() + "../cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml";
     cv::gpu::SoftCascade cascade;
     ASSERT_TRUE(cascade.load(xml));
@@ -263,8 +271,11 @@ TEST(SoftCascadeTest, detect)
     ASSERT_EQ(detections.cols / sizeof(Detection) ,3670U);
 }
 
-TEST(SoftCascadeTest, detectOnIntegral)
+GPU_TEST_P(SoftCascadeTestAll, detectOnIntegral,
+        ALL_DEVICES
+        )
 {
+    cv::gpu::setDevice(GetParam().deviceID());
     std::string xml =  cvtest::TS::ptr()->get_data_path() + "../cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml";
     cv::gpu::SoftCascade cascade;
     ASSERT_TRUE(cascade.load(xml));
@@ -277,7 +288,7 @@ TEST(SoftCascadeTest, detectOnIntegral)
     for (int i = 0; i < 10; ++i)
     {
         cv::Mat channel;
-        fs[std::string("channel") + SoftCascadeTest::itoa(i)] >> channel;
+        fs[std::string("channel") + itoa(i)] >> channel;
         GpuMat gchannel(hogluv, cv::Rect(0, 121 * i, 161, 121));
         gchannel.upload(channel);
     }

From df392cc830df738cdb8f0f2a2398fad684752f3e Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Mon, 5 Nov 2012 13:52:24 +0400
Subject: [PATCH 52/74] use fast integral for soft cascade

---
 modules/gpu/src/cuda/integral_image.cu | 83 ++++++++++++++++++++++++++
 modules/gpu/src/cuda/isf-sc.cu         |  4 +-
 modules/gpu/src/softcascade.cpp        | 31 +++++-----
 modules/gpu/test/test_softcascade.cpp  |  1 +
 4 files changed, 102 insertions(+), 17 deletions(-)

diff --git a/modules/gpu/src/cuda/integral_image.cu b/modules/gpu/src/cuda/integral_image.cu
index 558f9085d..5bd35bdc7 100644
--- a/modules/gpu/src/cuda/integral_image.cu
+++ b/modules/gpu/src/cuda/integral_image.cu
@@ -383,6 +383,89 @@ namespace cv { namespace gpu { namespace device
             if (stream == 0)
                 cudaSafeCall( cudaDeviceSynchronize() );
         }
+
+        __global__ void shfl_integral_vertical(PtrStepSz<unsigned int> buffer, PtrStepSz<unsigned int> integral)
+        {
+        #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
+            __shared__ unsigned int sums[32][9];
+
+            const int tidx = blockIdx.x * blockDim.x + threadIdx.x;
+            const int lane_id = tidx % 8;
+
+            if (tidx >= integral.cols)
+                return;
+
+            sums[threadIdx.x][threadIdx.y] = 0;
+            __syncthreads();
+
+            unsigned int stepSum = 0;
+
+            for (int y = threadIdx.y; y < integral.rows; y += blockDim.y)
+            {
+                unsigned int* p = buffer.ptr(y) + tidx;
+                unsigned int* dst = integral.ptr(y + 1) + tidx + 1;
+
+                unsigned int sum = *p;
+
+                sums[threadIdx.x][threadIdx.y] = sum;
+                __syncthreads();
+
+                // place into SMEM
+                // shfl scan reduce the SMEM, reformating so the column
+                // sums are computed in a warp
+                // then read out properly
+                const int j = threadIdx.x % 8;
+                const int k = threadIdx.x / 8 + threadIdx.y * 4;
+
+                int partial_sum = sums[k][j];
+
+                for (int i = 1; i <= 8; i *= 2)
+                {
+                    int n = __shfl_up(partial_sum, i, 32);
+
+                    if (lane_id >= i)
+                        partial_sum += n;
+                }
+
+                sums[k][j] = partial_sum;
+                __syncthreads();
+
+                if (threadIdx.y > 0)
+                    sum += sums[threadIdx.x][threadIdx.y - 1];
+
+                sum += stepSum;
+                stepSum += sums[threadIdx.x][blockDim.y - 1];
+
+                __syncthreads();
+
+                *dst = sum;
+            }
+        #endif
+        }
+
+        // used for frame preprocessing before Soft Cascade evaluation: no synchronization needed
+        // ToDo: partial dy
+        void shfl_integral_gpu_buffered(PtrStepSzb img, PtrStepSz<uint4> buffer, PtrStepSz<unsigned int> integral,
+            int blockStep, cudaStream_t stream)
+        {
+            {
+                const int block = blockStep;
+                const int grid = img.rows;
+
+                cudaSafeCall( cudaFuncSetCacheConfig(shfl_integral_horizontal, cudaFuncCachePreferL1) );
+
+                shfl_integral_horizontal<<<grid, block, 0, stream>>>((PtrStepSz<uint4>) img, buffer);
+                cudaSafeCall( cudaGetLastError() );
+            }
+
+            {
+                const dim3 block(32, 8);
+                const dim3 grid(divUp(integral.cols, block.x), 1);
+
+                shfl_integral_vertical<<<grid, block, 0, stream>>>((PtrStepSz<uint>)buffer, integral);
+                cudaSafeCall( cudaGetLastError() );
+            }
+        }
     }
 }}}
 
diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu
index 7aef41abc..3391bb1a0 100644
--- a/modules/gpu/src/cuda/isf-sc.cu
+++ b/modules/gpu/src/cuda/isf-sc.cu
@@ -198,14 +198,14 @@ namespace icf {
             Node node = nodes[nId];
 
             float threshold = rescale<isUp>(level, node);
-            int sum = get<isUp>(x, y + (node.threshold >> 28) * 121, node.rect);
+            int sum = get<isUp>(x, y + (node.threshold >> 28) * 120, node.rect);
 
             int next = 1 + (int)(sum >= threshold);
             dprintf("%d: go: %d (%d >= %f)\n\n" ,threadIdx.x, next, sum, threshold);
 
             node = nodes[nId + next];
             threshold = rescale<isUp>(level, node);
-            sum = get<isUp>(x, y + (node.threshold >> 28) * 121, node.rect);
+            sum = get<isUp>(x, y + (node.threshold >> 28) * 120, node.rect);
 
             const int lShift = (next - 1) * 2 + (int)(sum >= threshold);
             float impact = leaves[(st + threadIdx.x) * 4 + lShift];
diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index d9519e873..2d43a5440 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -76,14 +76,20 @@ cv::gpu::device::icf::Level::Level(int idx, const Octave& oct, const float scale
 }
 
 namespace cv { namespace gpu { namespace device {
+
 namespace icf {
     void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle,
         const int fw, const int fh, const int bins);
 }
-namespace imgproc
-{
-    void shfl_integral_gpu(PtrStepSzb img, PtrStepSz<unsigned int> integral, cudaStream_t stream);
+
+namespace imgproc {
+    void shfl_integral_gpu_buffered(PtrStepSzb, PtrStepSz<uint4>, PtrStepSz<unsigned int>, int, cudaStream_t);
+
+    template <typename T>
+    void resize_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy,
+                    PtrStepSzb dst, int interpolation, cudaStream_t stream);
 }
+
 }}}
 
 struct cv::gpu::SoftCascade::Filds
@@ -319,9 +325,13 @@ struct cv::gpu::SoftCascade::Filds
         plane.create(FRAME_HEIGHT * (HOG_LUV_BINS + 1), FRAME_WIDTH, CV_8UC1);
         fplane.create(FRAME_HEIGHT * 6, FRAME_WIDTH, CV_32FC1);
         luv.create(FRAME_HEIGHT, FRAME_WIDTH, CV_8UC3);
+
         shrunk.create(FRAME_HEIGHT / shr * HOG_LUV_BINS, FRAME_WIDTH / shr, CV_8UC1);
-        integralBuffer.create(1 , (shrunk.rows + 1) * HOG_LUV_BINS * (shrunk.cols + 1), CV_32SC1);
-        hogluv.create((FRAME_HEIGHT / shr + 1) * HOG_LUV_BINS, FRAME_WIDTH / shr + 64, CV_32SC1);
+        integralBuffer.create(shrunk.rows, shrunk.cols, CV_32SC1);
+
+        hogluv.create((FRAME_HEIGHT / shr) * HOG_LUV_BINS + 1, FRAME_WIDTH / shr + 1, CV_32SC1);
+        hogluv.setTo(cv::Scalar::all(0));
+
         detCounter.create(1,1, CV_32SC1);
 
         octaves.upload(hoctaves);
@@ -432,16 +442,7 @@ private:
 
         GpuMat channels(plane, cv::Rect(0, 0, fw, fh * Filds::HOG_LUV_BINS));
         cv::gpu::resize(channels, shrunk, cv::Size(), 0.25, 0.25, CV_INTER_AREA);
-
-        fw /= shrinkage;
-        fh /= shrinkage;
-
-        for(int i = 0; i < Filds::HOG_LUV_BINS; ++i)
-        {
-            GpuMat channel(shrunk, cv::Rect(0, fh  * i, fw, fh ));
-            GpuMat sum(hogluv, cv::Rect(0, (fh + 1) * i, fw + 1, fh + 1));
-            cv::gpu::integralBuffered(channel, sum, integralBuffer);
-        }
+        device::imgproc::shfl_integral_gpu_buffered(shrunk, integralBuffer, hogluv, 12, 0);
     }
 
 public:
diff --git a/modules/gpu/test/test_softcascade.cpp b/modules/gpu/test/test_softcascade.cpp
index fb936be88..1146b062b 100644
--- a/modules/gpu/test/test_softcascade.cpp
+++ b/modules/gpu/test/test_softcascade.cpp
@@ -271,6 +271,7 @@ GPU_TEST_P(SoftCascadeTestAll, detect,
     ASSERT_EQ(detections.cols / sizeof(Detection) ,3670U);
 }
 
+//ToDo: fix me
 GPU_TEST_P(SoftCascadeTestAll, detectOnIntegral,
         ALL_DEVICES
         )

From 9b251f81309aad65344e3a345c87e132ff37e8d5 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Mon, 5 Nov 2012 14:21:46 +0400
Subject: [PATCH 53/74] remove Sobel normalization

---
 modules/gpu/src/softcascade.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index 2d43a5440..1e0c271b9 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -396,8 +396,8 @@ private:
         GpuMat dfdx(fplane, cv::Rect(0,  0, fw, fh));
         GpuMat dfdy(fplane, cv::Rect(0, fh, fw, fh));
 
-        cv::gpu::Sobel(gray, dfdx, CV_32F, 1, 0, 3, 0.125f);
-        cv::gpu::Sobel(gray, dfdy, CV_32F, 0, 1, 3, 0.125f);
+        cv::gpu::Sobel(gray, dfdx, CV_32F, 1, 0);
+        cv::gpu::Sobel(gray, dfdy, CV_32F, 0, 1);
 
         GpuMat mag(fplane, cv::Rect(0, 2 * fh, fw, fh));
         GpuMat ang(fplane, cv::Rect(0, 3 * fh, fw, fh));
@@ -409,7 +409,7 @@ private:
         GpuMat nmag(fplane, cv::Rect(0, 4 * fh, fw, fh));
         GpuMat nang(fplane, cv::Rect(0, 5 * fh, fw, fh));
 
-        cv::gpu::multiply(mag, cv::Scalar::all(1.f / ::log(2)), nmag);
+        cv::gpu::multiply(mag, cv::Scalar::all(1.f / (8 *::log(2))), nmag);
         cv::gpu::multiply(ang, cv::Scalar::all(1.f / 60.f),     nang);
 
         //create uchar magnitude

From 27eb2e27db92c47d70c1e9d89187c2a28df93dc7 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Mon, 5 Nov 2012 18:52:35 +0400
Subject: [PATCH 54/74] enable fast  integral for Kepler

---
 modules/gpu/src/imgproc.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/gpu/src/imgproc.cpp b/modules/gpu/src/imgproc.cpp
index df02213b1..0bf9c81c2 100644
--- a/modules/gpu/src/imgproc.cpp
+++ b/modules/gpu/src/imgproc.cpp
@@ -553,7 +553,7 @@ void cv::gpu::integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, S
 
     src.locateROI(whole, offset);
 
-    if (false && info.supports(WARP_SHUFFLE_FUNCTIONS) && src.cols <= 2048)
+    if (info.supports(WARP_SHUFFLE_FUNCTIONS) && src.cols <= 2048)
     {
         GpuMat srcAlligned;
 

From e6eb1b99e19b5e8e99e1d8aa872363335a6a0a92 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Sat, 10 Nov 2012 00:48:37 +0400
Subject: [PATCH 55/74] fix negative confidence bug

---
 modules/gpu/src/cuda/isf-sc.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu
index 3391bb1a0..ee9a9f674 100644
--- a/modules/gpu/src/cuda/isf-sc.cu
+++ b/modules/gpu/src/cuda/isf-sc.cu
@@ -229,7 +229,7 @@ namespace icf {
             if(__any((confidence <= stages[(st + threadIdx.x)]))) st += 2048;
         }
 
-        if(st == stEnd && !threadIdx.x)
+        if(!threadIdx.x && st == stEnd &&  ((confidence - FLT_EPSILON) >= 0))
         {
             int idx = atomicInc(ctr, ndetections);
             // store detection

From 40600fa5048ddf43bf8fc7602694ad723234c87d Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Sat, 10 Nov 2012 00:49:51 +0400
Subject: [PATCH 56/74] GPU version becomes algorithm

---
 modules/gpu/include/opencv2/gpu/gpu.hpp |  67 +++++-----
 modules/gpu/perf/perf_softcascade.cpp   | 146 +++++++++++----------
 modules/gpu/src/gpu_init.cpp            |  60 +++++++++
 modules/gpu/src/softcascade.cpp         | 161 +++++++++++++-----------
 modules/gpu/test/test_softcascade.cpp   | 147 +++++++++++++---------
 5 files changed, 346 insertions(+), 235 deletions(-)
 create mode 100644 modules/gpu/src/gpu_init.cpp

diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index 9b59c6004..4fc6179d8 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -1534,10 +1534,12 @@ public:
 
 // ======================== GPU version for soft cascade ===================== //
 
-class CV_EXPORTS SoftCascade
+// Implementation of soft (stageless) cascaded detector.
+class CV_EXPORTS SCascade : public Algorithm
 {
 public:
 
+    // Representation of detectors result.
     struct CV_EXPORTS Detection
     {
         ushort x;
@@ -1549,47 +1551,44 @@ public:
 
         enum {PEDESTRIAN = 0};
     };
-    //! An empty cascade will be created.
-    SoftCascade();
 
-    //! Cascade will be created from file for scales from minScale to maxScale.
-    //! Param filename is a path to xml-serialized cascade.
-    //! Param minScale is a minimum scale relative to the original size of the image on which cascade will be applyed.
-    //! Param minScale is a maximum scale relative to the original size of the image on which cascade will be applyed.
-    SoftCascade( const string& filename, const float minScale = 0.4f, const float maxScale = 5.f);
+    // An empty cascade will be created.
+    // Param minScale is a minimum scale relative to the original size of the image on which cascade will be applyed.
+    // Param minScale is a maximum scale relative to the original size of the image on which cascade will be applyed.
+    // Param scales is a number of scales from minScale to maxScale.
+    // Param rejfactor is used for NMS.
+    SCascade(const double minScale = 0.4, const double maxScale = 5., const int scales = 55, const int rejfactor = 1);
 
-    //! cascade will be loaded from file "filename". The previous cascade will be destroyed.
-    //! Param filename is a path to xml-serialized cascade.
-    //! Param minScale is a minimum scale relative to the original size of the image on which cascade will be applyed.
-    //! Param minScale is a maximum scale relative to the original size of the image on which cascade will be applyed.
-    bool load( const string& filename, const float minScale = 0.4f, const float maxScale = 5.f);
+    virtual ~SCascade();
 
-    virtual ~SoftCascade();
+    cv::AlgorithmInfo* info() const;
 
-    //! detect specific objects on in the input frame for all scales computed flom minScale and maxscale values
-    //! Param image is input frame for detector. Cascade will be applied to it.
-    //! Param rois is a mask
-    //! Param objects 4-channel matrix thet contain detected rectangles
-    //! Param rejectfactor used for final object box computing
-    virtual void detectMultiScale(const GpuMat& image, const GpuMat& rois, GpuMat& objects,
-    int rejectfactor = 1, int specificScale = -1) const;
+    // Load cascade from FileNode.
+    // Param fn is a root node for cascade. Should be <cascade>.
+    virtual bool load(const FileNode& fn);
 
-    //! detect specific objects on in the input frame for all scales computed flom minScale and maxscale values.
-    //! asynchronous version.
-    //! Param image is input frame for detector. Cascade will be applied to it.
-    //! Param rois is a mask
-    //! Param objects 4-channel matrix thet contain detected rectangles
-    //! Param rejectfactor used for final object box computing
-    //! Param ndet retrieves number of detections
-    //! Param stream wrapper for CUDA stream
-    virtual void detectMultiScale(const GpuMat& image, const GpuMat& rois, GpuMat& objects,
-    int rejectfactor, GpuMat& ndet, Stream stream) const;
+    // Load cascade config.
+    virtual void read(const FileNode& fn);
 
-    cv::Size getRoiSize() const;
+    // Return the vector of Decection objcts.
+    // Param image is a frame on which detector will be applied.
+    // Param rois is a vector of regions of interest. Only the objects that fall into one of the regions will be returned.
+    // Param objects is an output array of Detections
+    virtual void detect(InputArray image, InputArray rois, OutputArray objects, Stream& stream = Stream::Null()) const;
+    virtual void detect(InputArray image, InputArray rois, OutputArray objects, const int level, Stream& stream = Stream::Null()) const;
+
+    void genRoi(InputArray roi, OutputArray mask) const;
 
 private:
-    struct Filds;
-    Filds* filds;
+
+    struct Fields;
+    Fields* fields;
+
+    double minScale;
+    double maxScale;
+
+    int scales;
+    int rejfactor;
 };
 
 ////////////////////////////////// SURF //////////////////////////////////////////
diff --git a/modules/gpu/perf/perf_softcascade.cpp b/modules/gpu/perf/perf_softcascade.cpp
index 9b53b2e84..1e62af8eb 100644
--- a/modules/gpu/perf/perf_softcascade.cpp
+++ b/modules/gpu/perf/perf_softcascade.cpp
@@ -25,8 +25,8 @@ void fixture##_##name::__cpu() { FAIL() << "No such CPU implementation analogy";
 namespace {
     struct DetectionLess
     {
-        bool operator()(const cv::gpu::SoftCascade::Detection& a,
-            const cv::gpu::SoftCascade::Detection& b) const
+        bool operator()(const cv::gpu::SCascade::Detection& a,
+            const cv::gpu::SCascade::Detection& b) const
         {
             if (a.x != b.x) return a.x < b.x;
             else if (a.y != b.y) return a.y < b.y;
@@ -51,7 +51,7 @@ namespace {
     {
         cv::Mat detections(objects);
 
-        typedef cv::gpu::SoftCascade::Detection Detection;
+        typedef cv::gpu::SCascade::Detection Detection;
         Detection* begin = (Detection*)(detections.ptr<char>(0));
         Detection* end = (Detection*)(detections.ptr<char>(0) + detections.cols);
         std::sort(begin, end, DetectionLess());
@@ -62,52 +62,54 @@ namespace {
 
 
 typedef std::tr1::tuple<std::string, std::string> fixture_t;
-typedef perf::TestBaseWithParam<fixture_t> SoftCascadeTest;
+typedef perf::TestBaseWithParam<fixture_t> SCascadeTest;
 
-GPU_PERF_TEST_P(SoftCascadeTest, detect,
+GPU_PERF_TEST_P(SCascadeTest, detect,
     testing::Combine(
         testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")),
         testing::Values(std::string("cv/cascadeandhog/bahnhof/image_00000000_0.png"))))
 { }
 
-RUN_GPU(SoftCascadeTest, detect)
+RUN_GPU(SCascadeTest, detect)
 {
     cv::Mat cpu = readImage (GET_PARAM(1));
     ASSERT_FALSE(cpu.empty());
     cv::gpu::GpuMat colored(cpu);
 
-    cv::gpu::SoftCascade cascade;
-    ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath(GET_PARAM(0))));
+    cv::gpu::SCascade cascade;
 
-    cv::gpu::GpuMat objectBoxes(1, 10000 * sizeof(cv::gpu::SoftCascade::Detection), CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1), trois;
+    cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ);
+    ASSERT_TRUE(fs.isOpened());
+
+    ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode()));
+
+    cv::gpu::GpuMat objectBoxes(1, 10000 * sizeof(cv::gpu::SCascade::Detection), CV_8UC1), rois(colored.size(), CV_8UC1), trois;
     rois.setTo(1);
-    cv::gpu::transpose(rois, trois);
+    cascade.genRoi(rois, trois);
 
-    cv::gpu::GpuMat curr = objectBoxes;
-    cascade.detectMultiScale(colored, trois, curr);
+    cascade.detect(colored, trois, objectBoxes);
 
     TEST_CYCLE()
     {
-        curr = objectBoxes;
-        cascade.detectMultiScale(colored, trois, curr);
+        cascade.detect(colored, trois, objectBoxes);
     }
 
-    SANITY_CHECK(sortDetections(curr));
+    SANITY_CHECK(sortDetections(objectBoxes));
 }
 
-NO_CPU(SoftCascadeTest, detect)
+NO_CPU(SCascadeTest, detect)
 
-// RUN_CPU(SoftCascadeTest, detect)
+// RUN_CPU(SCascadeTest, detect)
 // {
 //     cv::Mat colored = readImage(GET_PARAM(1));
 //     ASSERT_FALSE(colored.empty());
 
-//     cv::SoftCascade cascade;
+//     cv::SCascade cascade;
 //     ASSERT_TRUE(cascade.load(getDataPath(GET_PARAM(0))));
 
 //     std::vector<cv::Rect> rois;
 
-//     typedef cv::SoftCascade::Detection Detection;
+//     typedef cv::SCascade::Detection Detection;
 //     std::vector<Detection>objects;
 //     cascade.detectMultiScale(colored, rois, objects);
 
@@ -124,42 +126,46 @@ static cv::Rect getFromTable(int idx)
 {
     static const cv::Rect rois[] =
     {
-        cv::Rect( 65,  20,  35, 80),
-        cv::Rect( 95,  35,  45, 40),
-        cv::Rect( 45,  35,  45, 40),
-        cv::Rect( 25,  27,  50, 45),
-        cv::Rect(100,  50,  45, 40),
+        cv::Rect( 65 * 4,  20 * 4,  35 * 4, 80 * 4),
+        cv::Rect( 95 * 4,  35 * 4,  45 * 4, 40 * 4),
+        cv::Rect( 45 * 4,  35 * 4,  45 * 4, 40 * 4),
+        cv::Rect( 25 * 4,  27 * 4,  50 * 4, 45 * 4),
+        cv::Rect(100 * 4,  50 * 4,  45 * 4, 40 * 4),
 
-        cv::Rect( 60,  30,  45, 40),
-        cv::Rect( 40,  55,  50, 40),
-        cv::Rect( 48,  37,  72, 80),
-        cv::Rect( 48,  32,  85, 58),
-        cv::Rect( 48,   0,  32, 27)
+        cv::Rect( 60 * 4,  30 * 4,  45 * 4, 40 * 4),
+        cv::Rect( 40 * 4,  55 * 4,  50 * 4, 40 * 4),
+        cv::Rect( 48 * 4,  37 * 4,  72 * 4, 80 * 4),
+        cv::Rect( 48 * 4,  32 * 4,  85 * 4, 58 * 4),
+        cv::Rect( 48 * 4,   0 * 4,  32 * 4, 27 * 4)
     };
 
     return rois[idx];
 }
 
 typedef std::tr1::tuple<std::string, std::string, int> roi_fixture_t;
-typedef perf::TestBaseWithParam<roi_fixture_t> SoftCascadeTestRoi;
+typedef perf::TestBaseWithParam<roi_fixture_t> SCascadeTestRoi;
 
-GPU_PERF_TEST_P(SoftCascadeTestRoi, detectInRoi,
+GPU_PERF_TEST_P(SCascadeTestRoi, detectInRoi,
     testing::Combine(
         testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")),
         testing::Values(std::string("cv/cascadeandhog/bahnhof/image_00000000_0.png")),
         testing::Range(0, 5)))
 {}
 
-RUN_GPU(SoftCascadeTestRoi, detectInRoi)
+RUN_GPU(SCascadeTestRoi, detectInRoi)
 {
     cv::Mat cpu = readImage (GET_PARAM(1));
     ASSERT_FALSE(cpu.empty());
     cv::gpu::GpuMat colored(cpu);
 
-    cv::gpu::SoftCascade cascade;
-    ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath(GET_PARAM(0))));
+    cv::gpu::SCascade cascade;
 
-    cv::gpu::GpuMat objectBoxes(1, 16384 * 20, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1);
+    cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ);
+    ASSERT_TRUE(fs.isOpened());
+
+    ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode()));
+
+    cv::gpu::GpuMat objectBoxes(1, 16384 * 20, CV_8UC1), rois(colored.size(), CV_8UC1);
     rois.setTo(0);
 
     int nroi = GET_PARAM(2);
@@ -172,40 +178,42 @@ RUN_GPU(SoftCascadeTestRoi, detectInRoi)
     }
 
     cv::gpu::GpuMat trois;
-    cv::gpu::transpose(rois, trois);
+    cascade.genRoi(rois, trois);
 
-    cv::gpu::GpuMat curr = objectBoxes;
-    cascade.detectMultiScale(colored, trois, curr);
+    cascade.detect(colored, trois, objectBoxes);
 
     TEST_CYCLE()
     {
-        curr = objectBoxes;
-        cascade.detectMultiScale(colored, trois, curr);
+        cascade.detect(colored, trois, objectBoxes);
     }
 
-    SANITY_CHECK(sortDetections(curr));
+    SANITY_CHECK(sortDetections(objectBoxes));
 }
 
-NO_CPU(SoftCascadeTestRoi, detectInRoi)
+NO_CPU(SCascadeTestRoi, detectInRoi)
 
 
-GPU_PERF_TEST_P(SoftCascadeTestRoi, detectEachRoi,
+GPU_PERF_TEST_P(SCascadeTestRoi, detectEachRoi,
     testing::Combine(
         testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")),
         testing::Values(std::string("cv/cascadeandhog/bahnhof/image_00000000_0.png")),
         testing::Range(0, 10)))
 {}
 
-RUN_GPU(SoftCascadeTestRoi, detectEachRoi)
+RUN_GPU(SCascadeTestRoi, detectEachRoi)
 {
     cv::Mat cpu = readImage (GET_PARAM(1));
     ASSERT_FALSE(cpu.empty());
     cv::gpu::GpuMat colored(cpu);
 
-    cv::gpu::SoftCascade cascade;
-    ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath(GET_PARAM(0))));
+    cv::gpu::SCascade cascade;
 
-    cv::gpu::GpuMat objectBoxes(1, 16384 * 20, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1);
+    cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ);
+    ASSERT_TRUE(fs.isOpened());
+
+    ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode()));
+
+    cv::gpu::GpuMat objectBoxes(1, 16384 * 20, CV_8UC1), rois(colored.size(), CV_8UC1);
     rois.setTo(0);
 
     int idx = GET_PARAM(2);
@@ -213,24 +221,22 @@ RUN_GPU(SoftCascadeTestRoi, detectEachRoi)
     cv::gpu::GpuMat sub(rois, r);
     sub.setTo(1);
 
-    cv::gpu::GpuMat curr = objectBoxes;
     cv::gpu::GpuMat trois;
-    cv::gpu::transpose(rois, trois);
+    cascade.genRoi(rois, trois);
 
-    cascade.detectMultiScale(colored, trois, curr);
+    cascade.detect(colored, trois, objectBoxes);
 
     TEST_CYCLE()
     {
-        curr = objectBoxes;
-        cascade.detectMultiScale(colored, trois, curr);
+        cascade.detect(colored, trois, objectBoxes);
     }
 
-    SANITY_CHECK(sortDetections(curr));
+    SANITY_CHECK(sortDetections(objectBoxes));
 }
 
-NO_CPU(SoftCascadeTestRoi, detectEachRoi)
+NO_CPU(SCascadeTestRoi, detectEachRoi)
 
-GPU_PERF_TEST_P(SoftCascadeTest, detectOnIntegral,
+GPU_PERF_TEST_P(SCascadeTest, detectOnIntegral,
     testing::Combine(
         testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")),
         testing::Values(std::string("cv/cascadeandhog/integrals.xml"))))
@@ -243,37 +249,39 @@ GPU_PERF_TEST_P(SoftCascadeTest, detectOnIntegral,
         return std::string(s);
     }
 
-RUN_GPU(SoftCascadeTest, detectOnIntegral)
+RUN_GPU(SCascadeTest, detectOnIntegral)
 {
-    cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(1)), cv::FileStorage::READ);
-    ASSERT_TRUE(fs.isOpened());
+    cv::FileStorage fsi(perf::TestBase::getDataPath(GET_PARAM(1)), cv::FileStorage::READ);
+    ASSERT_TRUE(fsi.isOpened());
 
     cv::gpu::GpuMat hogluv(121 * 10, 161, CV_32SC1);
     for (int i = 0; i < 10; ++i)
     {
         cv::Mat channel;
-        fs[std::string("channel") + itoa(i)] >> channel;
+        fsi[std::string("channel") + itoa(i)] >> channel;
         cv::gpu::GpuMat gchannel(hogluv, cv::Rect(0, 121 * i, 161, 121));
         gchannel.upload(channel);
     }
 
-    cv::gpu::SoftCascade cascade;
-    ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath(GET_PARAM(0))));
+    cv::gpu::SCascade cascade;
 
-    cv::gpu::GpuMat objectBoxes(1, 10000 * sizeof(cv::gpu::SoftCascade::Detection), CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1), trois;
+    cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ);
+    ASSERT_TRUE(fs.isOpened());
+
+    ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode()));
+
+    cv::gpu::GpuMat objectBoxes(1, 10000 * sizeof(cv::gpu::SCascade::Detection), CV_8UC1), rois(cv::Size(640, 480), CV_8UC1), trois;
     rois.setTo(1);
-    cv::gpu::transpose(rois, trois);
+    cascade.genRoi(rois, trois);
 
-    cv::gpu::GpuMat curr = objectBoxes;
-    cascade.detectMultiScale(hogluv, trois, curr);
+    cascade.detect(hogluv, trois, objectBoxes);
 
     TEST_CYCLE()
     {
-        curr = objectBoxes;
-        cascade.detectMultiScale(hogluv, trois, curr);
+        cascade.detect(hogluv, trois, objectBoxes);
     }
 
-    SANITY_CHECK(sortDetections(curr));
+    SANITY_CHECK(sortDetections(objectBoxes));
 }
 
-NO_CPU(SoftCascadeTest, detectOnIntegral)
\ No newline at end of file
+NO_CPU(SCascadeTest, detectOnIntegral)
\ No newline at end of file
diff --git a/modules/gpu/src/gpu_init.cpp b/modules/gpu/src/gpu_init.cpp
new file mode 100644
index 000000000..f25bc2ceb
--- /dev/null
+++ b/modules/gpu/src/gpu_init.cpp
@@ -0,0 +1,60 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+// By downloading, copying, installing or using the software you agree to this license.
+// If you do not agree to this license, do not download, install,
+// copy or use the software.
+//
+//
+// License Agreement
+// For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2008-2012, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// * Redistribution's of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// * Redistribution's in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// * The name of the copyright holders may not be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include <precomp.hpp>
+
+namespace cv { namespace gpu
+{
+
+CV_INIT_ALGORITHM(SCascade, "CascadeDetector.SCascade",
+                  obj.info()->addParam(obj, "minScale",  obj.minScale);
+                  obj.info()->addParam(obj, "maxScale",  obj.maxScale);
+                  obj.info()->addParam(obj, "scales",    obj.scales);
+                  obj.info()->addParam(obj, "rejfactor", obj.rejfactor));
+
+bool initModule_gpu(void)
+{
+    Ptr<Algorithm> sc = createSCascade();
+    return sc->info() != 0;
+}
+
+} }
\ No newline at end of file
diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index 1e0c271b9..02481ed37 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -45,21 +45,18 @@
 
 #if !defined (HAVE_CUDA)
 
-cv::gpu::SoftCascade::SoftCascade() : filds(0) { throw_nogpu(); }
-cv::gpu::SoftCascade::SoftCascade( const string&, const float, const float) : filds(0) { throw_nogpu(); }
-cv::gpu::SoftCascade::~SoftCascade() { throw_nogpu(); }
-bool cv::gpu::SoftCascade::load( const string&, const float, const float) { throw_nogpu(); return false; }
-void cv::gpu::SoftCascade::detectMultiScale(const GpuMat&, const GpuMat&, GpuMat&, const int, int) const
-{
-    throw_nogpu();
-}
+cv::gpu::SCascade::SCascade(const double, const double, const int, const int) { throw_nogpu(); }
 
-void cv::gpu::SoftCascade::detectMultiScale(const GpuMat&, const GpuMat&, GpuMat&, int, GpuMat&, Stream) const
-{
-    throw_nogpu();
-}
+cv::gpu::SCascade::~SCascade() { throw_nogpu(); }
 
-cv::Size cv::gpu::SoftCascade::getRoiSize() const { throw_nogpu(); return cv::Size();}
+bool cv::gpu::SCascade::load(const FileNode&) { throw_nogpu(); return false;}
+
+void cv::gpu::SCascade::detect(InputArray, InputArray, OutputArray, Stream&) const { throw_nogpu(); }
+void cv::gpu::SCascade::detect(InputArray, InputArray, OutputArray, const int, Stream&) const { throw_nogpu(); }
+
+void cv::gpu::SCascade::genRoi(InputArray, OutputArray) const { throw_nogpu(); }
+
+void cv::gpu::SCascade::read(const FileNode& fn) { Algorithm::read(fn); }
 
 #else
 
@@ -92,7 +89,7 @@ namespace imgproc {
 
 }}}
 
-struct cv::gpu::SoftCascade::Filds
+struct cv::gpu::SCascade::Fields
 {
     struct CascadeIntrinsics
     {
@@ -126,7 +123,7 @@ struct cv::gpu::SoftCascade::Filds
         }
     };
 
-    static Filds* parseCascade(const FileNode &root, const float mins, const float maxs)
+    static Fields* parseCascade(const FileNode &root, const float mins, const float maxs)
     {
         static const char *const SC_STAGE_TYPE          = "stageType";
         static const char *const SC_BOOST               = "BOOST";
@@ -312,13 +309,13 @@ struct cv::gpu::SoftCascade::Filds
         cv::Mat hlevels(1, vlevels.size() * sizeof(Level), CV_8UC1, (uchar*)&(vlevels[0]) );
         CV_Assert(!hlevels.empty());
 
-        Filds* filds = new Filds(mins, maxs, origWidth, origHeight, shrinkage, downscales,
+        Fields* fields = new Fields(mins, maxs, origWidth, origHeight, shrinkage, downscales,
             hoctaves, hstages, hnodes, hleaves, hlevels);
 
-        return filds;
+        return fields;
     }
 
-    Filds( const float mins, const float maxs, const int ow, const int oh, const int shr, const int ds,
+    Fields( const float mins, const float maxs, const int ow, const int oh, const int shr, const int ds,
         cv::Mat hoctaves, cv::Mat hstages, cv::Mat hnodes, cv::Mat hleaves, cv::Mat hlevels)
     : minScale(mins), maxScale(maxs), origObjWidth(ow), origObjHeight(oh), shrinkage(shr), downscales(ds)
     {
@@ -332,7 +329,7 @@ struct cv::gpu::SoftCascade::Filds
         hogluv.create((FRAME_HEIGHT / shr) * HOG_LUV_BINS + 1, FRAME_WIDTH / shr + 1, CV_32SC1);
         hogluv.setTo(cv::Scalar::all(0));
 
-        detCounter.create(1,1, CV_32SC1);
+        detCounter.create(sizeof(Detection) / sizeof(int),1, CV_32SC1);
 
         octaves.upload(hoctaves);
         stages.upload(hstages);
@@ -344,20 +341,21 @@ struct cv::gpu::SoftCascade::Filds
 
     }
 
-    void detect(int scale, const cv::gpu::GpuMat& roi, cv::gpu::GpuMat& objects, cudaStream_t stream) const
+    void detect(int scale, const cv::gpu::GpuMat& roi, const cv::gpu::GpuMat& count, cv::gpu::GpuMat& objects, cudaStream_t stream) const
     {
-        cudaMemset(detCounter.data, 0, detCounter.step * detCounter.rows * sizeof(int));
-        invoker(roi, hogluv, objects, detCounter, downscales, scale);
+        cudaMemset(count.data, 0, sizeof(Detection));
+        cudaSafeCall( cudaGetLastError());
+        invoker(roi, hogluv, objects, count, downscales, scale);
     }
 
     void preprocess(const cv::gpu::GpuMat& colored)
     {
         cudaMemset(plane.data, 0, plane.step * plane.rows);
 
-        static const int fw = Filds::FRAME_WIDTH;
-        static const int fh = Filds::FRAME_HEIGHT;
+        static const int fw = Fields::FRAME_WIDTH;
+        static const int fh = Fields::FRAME_HEIGHT;
 
-        GpuMat gray(plane, cv::Rect(0, fh * Filds::HOG_LUV_BINS, fw, fh));
+        GpuMat gray(plane, cv::Rect(0, fh * Fields::HOG_LUV_BINS, fw, fh));
         cv::gpu::cvtColor(colored, gray, CV_BGR2GRAY);
         createHogBins(gray);
 
@@ -390,8 +388,8 @@ private:
 
     void createHogBins(const cv::gpu::GpuMat& gray)
     {
-        static const int fw = Filds::FRAME_WIDTH;
-        static const int fh = Filds::FRAME_HEIGHT;
+        static const int fw = Fields::FRAME_WIDTH;
+        static const int fh = Fields::FRAME_HEIGHT;
 
         GpuMat dfdx(fplane, cv::Rect(0,  0, fw, fh));
         GpuMat dfdy(fplane, cv::Rect(0, fh, fw, fh));
@@ -413,21 +411,21 @@ private:
         cv::gpu::multiply(ang, cv::Scalar::all(1.f / 60.f),     nang);
 
         //create uchar magnitude
-        GpuMat cmag(plane, cv::Rect(0, fh * Filds::HOG_BINS, fw, fh));
+        GpuMat cmag(plane, cv::Rect(0, fh * Fields::HOG_BINS, fw, fh));
         nmag.convertTo(cmag, CV_8UC1);
 
-        device::icf::fillBins(plane, nang, fw, fh, Filds::HOG_BINS);
+        device::icf::fillBins(plane, nang, fw, fh, Fields::HOG_BINS);
     }
 
     void createLuvBins(const cv::gpu::GpuMat& colored)
     {
-        static const int fw = Filds::FRAME_WIDTH;
-        static const int fh = Filds::FRAME_HEIGHT;
+        static const int fw = Fields::FRAME_WIDTH;
+        static const int fh = Fields::FRAME_HEIGHT;
 
         cv::gpu::cvtColor(colored, luv, CV_BGR2Luv);
 
         std::vector<GpuMat> splited;
-        for(int i = 0; i < Filds::LUV_BINS; ++i)
+        for(int i = 0; i < Fields::LUV_BINS; ++i)
         {
             splited.push_back(GpuMat(plane, cv::Rect(0, fh * (7 + i), fw, fh)));
         }
@@ -437,10 +435,10 @@ private:
 
     void integrate()
     {
-        int fw = Filds::FRAME_WIDTH;
-        int fh = Filds::FRAME_HEIGHT;
+        int fw = Fields::FRAME_WIDTH;
+        int fh = Fields::FRAME_HEIGHT;
 
-        GpuMat channels(plane, cv::Rect(0, 0, fw, fh * Filds::HOG_LUV_BINS));
+        GpuMat channels(plane, cv::Rect(0, 0, fw, fh * Fields::HOG_LUV_BINS));
         cv::gpu::resize(channels, shrunk, cv::Size(), 0.25, 0.25, CV_INTER_AREA);
         device::imgproc::shfl_integral_gpu_buffered(shrunk, integralBuffer, hogluv, 12, 0);
     }
@@ -500,45 +498,33 @@ public:
     };
 };
 
-cv::gpu::SoftCascade::SoftCascade() : filds(0) {}
+cv::gpu::SCascade::SCascade(const double mins, const double maxs, const int sc, const int rjf)
+: fields(0),  minScale(mins), maxScale(maxs), scales(sc), rejfactor(rjf) {}
 
-cv::gpu::SoftCascade::SoftCascade( const string& filename, const float minScale, const float maxScale) : filds(0)
+cv::gpu::SCascade::~SCascade() { delete fields; }
+
+bool cv::gpu::SCascade::load(const FileNode& fn)
 {
-    load(filename, minScale, maxScale);
+    if (fields) delete fields;
+    fields = Fields::parseCascade(fn, minScale, maxScale);
+    return fields != 0;
 }
 
-cv::gpu::SoftCascade::~SoftCascade()
-{
-    delete filds;
-}
-
-bool cv::gpu::SoftCascade::load( const string& filename, const float minScale, const float maxScale)
-{
-    if (filds) delete filds;
-
-    cv::FileStorage fs(filename, FileStorage::READ);
-    if (!fs.isOpened()) return false;
-
-    filds = Filds::parseCascade(fs.getFirstTopLevelNode(), minScale, maxScale);
-    return filds != 0;
-}
-
-void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat& rois,
-                                GpuMat& objects, const int /*rejectfactor*/, int specificScale) const
+void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _objects, Stream& s) const
 {
+    const GpuMat colored = image.getGpuMat();
     // only color images are supperted
     CV_Assert(colored.type() == CV_8UC3 || colored.type() == CV_32SC1);
 
     // we guess user knows about shrincage
-    CV_Assert((rois.size().width == getRoiSize().height) && (rois.type() == CV_8UC1));
+    // CV_Assert((rois.size().width == getRoiSize().height) && (rois.type() == CV_8UC1));
 
-
-    Filds& flds = *filds;
+    Fields& flds = *fields;
 
     if (colored.type() == CV_8UC3)
     {
         // only this window size allowed
-        CV_Assert(colored.cols == Filds::FRAME_WIDTH && colored.rows == Filds::FRAME_HEIGHT);
+        CV_Assert(colored.cols == Fields::FRAME_WIDTH && colored.rows == Fields::FRAME_HEIGHT);
         flds.preprocess(colored);
     }
     else
@@ -546,25 +532,60 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat&
         colored.copyTo(flds.hogluv);
     }
 
-    flds.detect(specificScale, rois, objects, 0);
+    GpuMat rois = _rois.getGpuMat(), objects = _objects.getGpuMat();
 
-    cv::Mat out(flds.detCounter);
-    int ndetections = *(out.ptr<int>(0));
+    GpuMat tmp = GpuMat(objects, cv::Rect(0, 0, sizeof(Detection), 1));
+    objects = GpuMat(objects, cv::Rect( sizeof(Detection), 0, objects.cols -  sizeof(Detection), 1));
+    cudaStream_t stream = StreamAccessor::getStream(s);
 
-    if (! ndetections)
-        objects = GpuMat();
+    flds.detect(-1, rois, tmp, objects, stream);
+}
+
+void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _objects, const int level, Stream& s) const
+{
+    const GpuMat colored = image.getGpuMat();
+    // only color images are supperted
+    CV_Assert(colored.type() == CV_8UC3 || colored.type() == CV_32SC1);
+
+    // we guess user knows about shrincage
+    // CV_Assert((rois.size().width == getRoiSize().height) && (rois.type() == CV_8UC1));
+
+    Fields& flds = *fields;
+
+    if (colored.type() == CV_8UC3)
+    {
+        // only this window size allowed
+        CV_Assert(colored.cols == Fields::FRAME_WIDTH && colored.rows == Fields::FRAME_HEIGHT);
+        flds.preprocess(colored);
+    }
     else
-        objects = GpuMat(objects, cv::Rect(0, 0, ndetections * sizeof(Detection), 1));
+    {
+        colored.copyTo(flds.hogluv);
+    }
+
+    GpuMat rois = _rois.getGpuMat(), objects = _objects.getGpuMat();
+
+    GpuMat tmp = GpuMat(objects, cv::Rect(0, 0, sizeof(Detection), 1));
+    objects = GpuMat(objects, cv::Rect( sizeof(Detection), 0, objects.cols -  sizeof(Detection), 1));
+    cudaStream_t stream = StreamAccessor::getStream(s);
+
+    flds.detect(level, rois, tmp, objects, stream);
 }
 
-void cv::gpu::SoftCascade::detectMultiScale(const GpuMat&, const GpuMat&, GpuMat&, int, GpuMat&, Stream) const
+void cv::gpu::SCascade::genRoi(InputArray _roi, OutputArray _mask) const
 {
-    // cudaStream_t stream = StreamAccessor::getStream(s);
+    const GpuMat roi = _roi.getGpuMat();
+    _mask.create( roi.cols / 4, roi.rows / 4, roi.type() );
+    GpuMat mask = _mask.getGpuMat();
+    cv::gpu::GpuMat tmp;
+
+    cv::gpu::resize(roi, tmp, cv::Size(), 0.25, 0.25, CV_INTER_AREA);
+    cv::gpu::transpose(tmp, mask);
 }
 
-cv::Size cv::gpu::SoftCascade::getRoiSize() const
+void cv::gpu::SCascade::read(const FileNode& fn)
 {
-    return cv::Size(Filds::FRAME_WIDTH / (*filds).shrinkage, Filds::FRAME_HEIGHT / (*filds).shrinkage);
+    Algorithm::read(fn);
 }
 
 #endif
\ No newline at end of file
diff --git a/modules/gpu/test/test_softcascade.cpp b/modules/gpu/test/test_softcascade.cpp
index 1146b062b..f26c44f0e 100644
--- a/modules/gpu/test/test_softcascade.cpp
+++ b/modules/gpu/test/test_softcascade.cpp
@@ -70,23 +70,23 @@ using cv::gpu::GpuMat;
 
 namespace {
 
-    typedef cv::gpu::SoftCascade::Detection Detection;
+    typedef cv::gpu::SCascade::Detection Detection;
 
     static cv::Rect getFromTable(int idx)
     {
         static const cv::Rect rois[] =
         {
-            cv::Rect( 65,  20,  35, 80),
-            cv::Rect( 95,  35,  45, 40),
-            cv::Rect( 45,  35,  45, 40),
-            cv::Rect( 25,  27,  50, 45),
-            cv::Rect(100,  50,  45, 40),
+            cv::Rect( 65 * 4,  20 * 4,  35 * 4, 80 * 4),
+            cv::Rect( 95 * 4,  35 * 4,  45 * 4, 40 * 4),
+            cv::Rect( 45 * 4,  35 * 4,  45 * 4, 40 * 4),
+            cv::Rect( 25 * 4,  27 * 4,  50 * 4, 45 * 4),
+            cv::Rect(100 * 4,  50 * 4,  45 * 4, 40 * 4),
 
-            cv::Rect( 60,  30,  45, 40),
-            cv::Rect( 40,  55,  50, 40),
-            cv::Rect( 48,  37,  72, 80),
-            cv::Rect( 48,  32,  85, 58),
-            cv::Rect( 48,   0,  32, 27)
+            cv::Rect( 60 * 4,  30 * 4,  45 * 4, 40 * 4),
+            cv::Rect( 40 * 4,  55 * 4,  50 * 4, 40 * 4),
+            cv::Rect( 48 * 4,  37 * 4,  72 * 4, 80 * 4),
+            cv::Rect( 48 * 4,  32 * 4,  85 * 4, 58 * 4),
+            cv::Rect( 48 * 4,   0 * 4,  32 * 4, 27 * 4)
         };
 
         return rois[idx];
@@ -140,11 +140,11 @@ namespace {
     }
 }
 
-typedef ::testing::TestWithParam<std::tr1::tuple<cv::gpu::DeviceInfo, std::string, std::string, int> > SoftCascadeTestRoi;
-GPU_TEST_P(SoftCascadeTestRoi, detect,
+typedef ::testing::TestWithParam<std::tr1::tuple<cv::gpu::DeviceInfo, std::string, std::string, int> > SCascadeTestRoi;
+GPU_TEST_P(SCascadeTestRoi, detect,
     testing::Combine(
         ALL_DEVICES,
-        testing::Values(std::string("../cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")),
+        testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")),
         testing::Values(std::string("../cv/cascadeandhog/bahnhof/image_00000000_0.png")),
         testing::Range(0, 5)))
 {
@@ -152,10 +152,14 @@ GPU_TEST_P(SoftCascadeTestRoi, detect,
     cv::Mat coloredCpu = cv::imread(cvtest::TS::ptr()->get_data_path() + GET_PARAM(2));
     ASSERT_FALSE(coloredCpu.empty());
 
-    cv::gpu::SoftCascade cascade;
-    ASSERT_TRUE(cascade.load(cvtest::TS::ptr()->get_data_path() + GET_PARAM(1)));
+    cv::gpu::SCascade cascade;
 
-    GpuMat colored(coloredCpu), objectBoxes(1, 16384, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1), trois;
+    cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(1)), cv::FileStorage::READ);
+    ASSERT_TRUE(fs.isOpened());
+
+    ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode()));
+
+    GpuMat colored(coloredCpu), objectBoxes(1, 16384, CV_8UC1), rois(colored.size(), CV_8UC1), trois;
     rois.setTo(0);
 
     int nroi = GET_PARAM(3);
@@ -166,21 +170,21 @@ GPU_TEST_P(SoftCascadeTestRoi, detect,
         cv::Rect r = getFromTable(rng(10));
         GpuMat sub(rois, r);
         sub.setTo(1);
-        r.x *= 4; r.y *= 4; r.width *= 4; r.height *= 4;
         cv::rectangle(result, r, cv::Scalar(0, 0, 255, 255), 1);
     }
 
-    cv::gpu::transpose(rois, trois);
-
-    cascade.detectMultiScale(colored, trois, objectBoxes);
+    cascade.genRoi(rois, trois);
+    cascade.detect(colored, trois, objectBoxes);
 
     cv::Mat dt(objectBoxes);
-    typedef cv::gpu::SoftCascade::Detection Detection;
+    typedef cv::gpu::SCascade::Detection Detection;
 
-    Detection* dts = (Detection*)dt.data;
+    Detection* dts = ((Detection*)dt.data) + 1;
+    int* count = dt.ptr<int>(0);
 
-    printTotal(std::cout, dt.cols);
-    for (int i = 0; i  < (int)(dt.cols / sizeof(Detection)); ++i)
+    printTotal(std::cout, *count);
+
+    for (int i = 0; i  < *count; ++i)
     {
         Detection d = dts[i];
         print(std::cout, d);
@@ -188,43 +192,49 @@ GPU_TEST_P(SoftCascadeTestRoi, detect,
     }
 
     SHOW(result);
+
 }
 
-typedef ::testing::TestWithParam<std::tr1::tuple<cv::gpu::DeviceInfo, std::string, std::string, int> > SoftCascadeTestLevel;
-GPU_TEST_P(SoftCascadeTestLevel, detect,
+typedef ::testing::TestWithParam<std::tr1::tuple<cv::gpu::DeviceInfo, std::string, std::string, int> > SCascadeTestLevel;
+GPU_TEST_P(SCascadeTestLevel, detect,
         testing::Combine(
         ALL_DEVICES,
-        testing::Values(std::string("../cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")),
+        testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")),
         testing::Values(std::string("../cv/cascadeandhog/bahnhof/image_00000000_0.png")),
         testing::Range(0, 47)
         ))
 {
     cv::gpu::setDevice(GET_PARAM(0).deviceID());
 
-    std::string xml =  cvtest::TS::ptr()->get_data_path() + GET_PARAM(1);
-    cv::gpu::SoftCascade cascade;
-    ASSERT_TRUE(cascade.load(xml));
+    cv::gpu::SCascade cascade;
+
+    cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(1)), cv::FileStorage::READ);
+    ASSERT_TRUE(fs.isOpened());
+
+    ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode()));
 
     cv::Mat coloredCpu = cv::imread(cvtest::TS::ptr()->get_data_path() + GET_PARAM(2));
     ASSERT_FALSE(coloredCpu.empty());
 
-    typedef cv::gpu::SoftCascade::Detection Detection;
-    GpuMat colored(coloredCpu), objectBoxes(1, 100 * sizeof(Detection), CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1);
+    typedef cv::gpu::SCascade::Detection Detection;
+    GpuMat colored(coloredCpu), objectBoxes(1, 100 * sizeof(Detection), CV_8UC1), rois(colored.size(), CV_8UC1);
     rois.setTo(1);
 
     cv::gpu::GpuMat trois;
-    cv::gpu::transpose(rois, trois);
+    cascade.genRoi(rois, trois);
 
     int level = GET_PARAM(3);
-    cascade.detectMultiScale(colored, trois, objectBoxes, 1, level);
+    cascade.detect(colored, trois, objectBoxes, level);
 
     cv::Mat dt(objectBoxes);
 
-    Detection* dts = (Detection*)dt.data;
+    Detection* dts = ((Detection*)dt.data) + 1;
+    int* count = dt.ptr<int>(0);
+
     cv::Mat result(coloredCpu);
 
-    printTotal(std::cout, dt.cols);
-    for (int i = 0; i  < (int)(dt.cols / sizeof(Detection)); ++i)
+    printTotal(std::cout, *count);
+    for (int i = 0; i  < *count; ++i)
     {
         Detection d = dts[i];
         print(std::cout, d);
@@ -235,76 +245,89 @@ GPU_TEST_P(SoftCascadeTestLevel, detect,
     SHOW(result);
 }
 
-TEST(SoftCascadeTest, readCascade)
+TEST(SCascadeTest, readCascade)
 {
     std::string xml = cvtest::TS::ptr()->get_data_path() + "../cv/cascadeandhog/icf-template.xml";
-    cv::gpu::SoftCascade cascade;
-    ASSERT_TRUE(cascade.load(xml));
+    cv::gpu::SCascade cascade;
+
+    cv::FileStorage fs(xml, cv::FileStorage::READ);
+    ASSERT_TRUE(fs.isOpened());
+
+    ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode()));
 }
 
-typedef ::testing::TestWithParam<cv::gpu::DeviceInfo > SoftCascadeTestAll;
-GPU_TEST_P(SoftCascadeTestAll, detect,
+typedef ::testing::TestWithParam<cv::gpu::DeviceInfo > SCascadeTestAll;
+GPU_TEST_P(SCascadeTestAll, detect,
         ALL_DEVICES
         )
 {
     cv::gpu::setDevice(GetParam().deviceID());
     std::string xml =  cvtest::TS::ptr()->get_data_path() + "../cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml";
-    cv::gpu::SoftCascade cascade;
-    ASSERT_TRUE(cascade.load(xml));
+    cv::gpu::SCascade cascade;
+
+    cv::FileStorage fs(xml, cv::FileStorage::READ);
+    ASSERT_TRUE(fs.isOpened());
+
+    ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode()));
 
     cv::Mat coloredCpu = cv::imread(cvtest::TS::ptr()->get_data_path()
         + "../cv/cascadeandhog/bahnhof/image_00000000_0.png");
     ASSERT_FALSE(coloredCpu.empty());
 
-    GpuMat colored(coloredCpu), objectBoxes(1, 100000, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1);
+    GpuMat colored(coloredCpu), objectBoxes(1, 100000, CV_8UC1), rois(colored.size(), CV_8UC1);
     rois.setTo(0);
     GpuMat sub(rois, cv::Rect(rois.cols / 4, rois.rows / 4,rois.cols / 2, rois.rows / 2));
     sub.setTo(cv::Scalar::all(1));
 
     cv::gpu::GpuMat trois;
-    cv::gpu::transpose(rois, trois);
+    cascade.genRoi(rois, trois);
 
-    cascade.detectMultiScale(colored, trois, objectBoxes);
+    cascade.detect(colored, trois, objectBoxes);
 
-    typedef cv::gpu::SoftCascade::Detection Detection;
+    typedef cv::gpu::SCascade::Detection Detection;
     cv::Mat detections(objectBoxes);
-    ASSERT_EQ(detections.cols / sizeof(Detection) ,3670U);
+    int a = *(detections.ptr<int>(0));
+    ASSERT_EQ(a ,2460);
 }
 
-//ToDo: fix me
-GPU_TEST_P(SoftCascadeTestAll, detectOnIntegral,
+GPU_TEST_P(SCascadeTestAll, detectOnIntegral,
         ALL_DEVICES
         )
 {
     cv::gpu::setDevice(GetParam().deviceID());
     std::string xml =  cvtest::TS::ptr()->get_data_path() + "../cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml";
-    cv::gpu::SoftCascade cascade;
-    ASSERT_TRUE(cascade.load(xml));
+    cv::gpu::SCascade cascade;
+
+    cv::FileStorage fs(xml, cv::FileStorage::READ);
+    ASSERT_TRUE(fs.isOpened());
+
+    ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode()));
 
     std::string intPath = cvtest::TS::ptr()->get_data_path() + "../cv/cascadeandhog/integrals.xml";
-    cv::FileStorage fs(intPath, cv::FileStorage::READ);
-    ASSERT_TRUE(fs.isOpened());
+    cv::FileStorage fsi(intPath, cv::FileStorage::READ);
+    ASSERT_TRUE(fsi.isOpened());
 
     GpuMat hogluv(121 * 10, 161, CV_32SC1);
     for (int i = 0; i < 10; ++i)
     {
         cv::Mat channel;
-        fs[std::string("channel") + itoa(i)] >> channel;
+        fsi[std::string("channel") + itoa(i)] >> channel;
         GpuMat gchannel(hogluv, cv::Rect(0, 121 * i, 161, 121));
         gchannel.upload(channel);
     }
 
-    GpuMat objectBoxes(1, 100000, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1);
+    GpuMat objectBoxes(1, 100000, CV_8UC1), rois(cv::Size(640, 480), CV_8UC1);
     rois.setTo(1);
 
     cv::gpu::GpuMat trois;
-    cv::gpu::transpose(rois, trois);
+    cascade.genRoi(rois, trois);
 
-    cascade.detectMultiScale(hogluv, trois, objectBoxes);
+    cascade.detect(hogluv, trois, objectBoxes);
 
-    typedef cv::gpu::SoftCascade::Detection Detection;
+    typedef cv::gpu::SCascade::Detection Detection;
     cv::Mat detections(objectBoxes);
+    int a = *(detections.ptr<int>(0));
 
-    ASSERT_EQ(detections.cols / sizeof(Detection) ,2042U);
+    ASSERT_EQ( a ,1024);
 }
 #endif
\ No newline at end of file

From 0cbf9eb22a264493da9ebf2e1101af60534cc12c Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Sat, 10 Nov 2012 03:59:09 +0400
Subject: [PATCH 57/74] add support for CUDA streams

---
 modules/gpu/include/opencv2/gpu/gpu.hpp |  2 +-
 modules/gpu/perf/perf_softcascade.cpp   | 42 ++++++++++++++-
 modules/gpu/src/cuda/integral_image.cu  |  1 -
 modules/gpu/src/cuda/isf-sc.cu          | 28 ++++++----
 modules/gpu/src/icf.hpp                 |  2 +-
 modules/gpu/src/softcascade.cpp         | 72 ++++++++++++++-----------
 modules/gpu/test/test_softcascade.cpp   | 39 ++++++++++++++
 7 files changed, 140 insertions(+), 46 deletions(-)

diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index 4fc6179d8..8f327f227 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -1577,7 +1577,7 @@ public:
     virtual void detect(InputArray image, InputArray rois, OutputArray objects, Stream& stream = Stream::Null()) const;
     virtual void detect(InputArray image, InputArray rois, OutputArray objects, const int level, Stream& stream = Stream::Null()) const;
 
-    void genRoi(InputArray roi, OutputArray mask) const;
+    void genRoi(InputArray roi, OutputArray mask, Stream& stream = Stream::Null()) const;
 
 private:
 
diff --git a/modules/gpu/perf/perf_softcascade.cpp b/modules/gpu/perf/perf_softcascade.cpp
index 1e62af8eb..3e82cc5bb 100644
--- a/modules/gpu/perf/perf_softcascade.cpp
+++ b/modules/gpu/perf/perf_softcascade.cpp
@@ -284,4 +284,44 @@ RUN_GPU(SCascadeTest, detectOnIntegral)
     SANITY_CHECK(sortDetections(objectBoxes));
 }
 
-NO_CPU(SCascadeTest, detectOnIntegral)
\ No newline at end of file
+NO_CPU(SCascadeTest, detectOnIntegral)
+
+GPU_PERF_TEST_P(SCascadeTest, detectStream,
+    testing::Combine(
+        testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")),
+        testing::Values(std::string("cv/cascadeandhog/bahnhof/image_00000000_0.png"))))
+{ }
+
+RUN_GPU(SCascadeTest, detectStream)
+{
+    cv::Mat cpu = readImage (GET_PARAM(1));
+    ASSERT_FALSE(cpu.empty());
+    cv::gpu::GpuMat colored(cpu);
+
+    cv::gpu::SCascade cascade;
+
+    cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ);
+    ASSERT_TRUE(fs.isOpened());
+
+    ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode()));
+
+    cv::gpu::GpuMat objectBoxes(1, 10000 * sizeof(cv::gpu::SCascade::Detection), CV_8UC1), rois(colored.size(), CV_8UC1), trois;
+    rois.setTo(1);
+
+    cv::gpu::Stream s;
+
+    cascade.genRoi(rois, trois, s);
+
+    cascade.detect(colored, trois, objectBoxes, s);
+
+    TEST_CYCLE()
+    {
+        cascade.detect(colored, trois, objectBoxes, s);
+    }
+
+    cudaDeviceSynchronize();
+
+    SANITY_CHECK(sortDetections(objectBoxes));
+}
+
+NO_CPU(SCascadeTest, detectStream)
\ No newline at end of file
diff --git a/modules/gpu/src/cuda/integral_image.cu b/modules/gpu/src/cuda/integral_image.cu
index 5bd35bdc7..200960b43 100644
--- a/modules/gpu/src/cuda/integral_image.cu
+++ b/modules/gpu/src/cuda/integral_image.cu
@@ -444,7 +444,6 @@ namespace cv { namespace gpu { namespace device
         }
 
         // used for frame preprocessing before Soft Cascade evaluation: no synchronization needed
-        // ToDo: partial dy
         void shfl_integral_gpu_buffered(PtrStepSzb img, PtrStepSz<uint4> buffer, PtrStepSz<unsigned int> integral,
             int blockStep, cudaStream_t stream)
         {
diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu
index ee9a9f674..0de2d8e37 100644
--- a/modules/gpu/src/cuda/isf-sc.cu
+++ b/modules/gpu/src/cuda/isf-sc.cu
@@ -71,7 +71,7 @@ namespace icf {
     }
 
     void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle,
-                  const int fw,  const int fh, const int bins)
+                  const int fw,  const int fh, const int bins, cudaStream_t stream )
     {
         const uchar* mag = (const uchar*)hogluv.ptr(fh * bins);
         uchar* hog = (uchar*)hogluv.ptr();
@@ -80,9 +80,12 @@ namespace icf {
         dim3 block(32, 8);
         dim3 grid(fw / 32, fh / 8);
 
-        magToHist<<<grid, block>>>(mag, angle, nangle.step / sizeof(float), hog, hogluv.step, fh);
-        cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall( cudaDeviceSynchronize() );
+        magToHist<<<grid, block, 0, stream>>>(mag, angle, nangle.step / sizeof(float), hog, hogluv.step, fh);
+        if (!stream)
+        {
+            cudaSafeCall( cudaGetLastError() );
+            cudaSafeCall( cudaDeviceSynchronize() );
+        }
     }
 
     texture<int,  cudaTextureType2D, cudaReadModeElementType> thogluv;
@@ -305,7 +308,7 @@ namespace icf {
 
     template<>
     void CascadeInvoker<CascadePolicy>::operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv,
-        PtrStepSz<uchar4> objects, PtrStepSzi counter, const int downscales, const int scale) const
+        PtrStepSz<uchar4> objects, PtrStepSzi counter, const int downscales, const int scale, const cudaStream_t& stream) const
     {
         int fw = 160;
         int fh = 120;
@@ -325,22 +328,25 @@ namespace icf {
 
         if (scale == -1)
         {
-            test_kernel_warp<false><<<grid, block>>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, 0);
+            test_kernel_warp<false><<<grid, block, 0, stream>>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, 0);
             cudaSafeCall( cudaGetLastError());
 
             grid = dim3(fw, fh / 8, 47 - downscales);
-            test_kernel_warp<true><<<grid, block>>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, downscales);
+            test_kernel_warp<true><<<grid, block, 0, stream>>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, downscales);
         }
         else
         {
             if (scale >= downscales)
-                test_kernel_warp<true><<<grid, block>>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, scale);
+                test_kernel_warp<true><<<grid, block, 0, stream>>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, scale);
             else
-                test_kernel_warp<false><<<grid, block>>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, scale);
+                test_kernel_warp<false><<<grid, block, 0, stream>>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, scale);
         }
 
-        cudaSafeCall( cudaGetLastError());
-        cudaSafeCall( cudaDeviceSynchronize());
+        if (!stream)
+        {
+            cudaSafeCall( cudaGetLastError());
+            cudaSafeCall( cudaDeviceSynchronize());
+        }
     }
 }
 }}}
\ No newline at end of file
diff --git a/modules/gpu/src/icf.hpp b/modules/gpu/src/icf.hpp
index d829012c8..60df55882 100644
--- a/modules/gpu/src/icf.hpp
+++ b/modules/gpu/src/icf.hpp
@@ -139,7 +139,7 @@ struct CascadeInvoker
     const float*  leaves;
 
     void operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv, PtrStepSz<uchar4> objects,
-        PtrStepSzi counter, const int downscales, const int csale = -1) const;
+        PtrStepSzi counter, const int downscales, const int csale = -1, const cudaStream_t& stream = 0) const;
 };
 
 }
diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index 02481ed37..fdde2618e 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -54,7 +54,7 @@ bool cv::gpu::SCascade::load(const FileNode&) { throw_nogpu(); return false;}
 void cv::gpu::SCascade::detect(InputArray, InputArray, OutputArray, Stream&) const { throw_nogpu(); }
 void cv::gpu::SCascade::detect(InputArray, InputArray, OutputArray, const int, Stream&) const { throw_nogpu(); }
 
-void cv::gpu::SCascade::genRoi(InputArray, OutputArray) const { throw_nogpu(); }
+void cv::gpu::SCascade::genRoi(InputArray, OutputArray, Stream&) const { throw_nogpu(); }
 
 void cv::gpu::SCascade::read(const FileNode& fn) { Algorithm::read(fn); }
 
@@ -76,7 +76,7 @@ namespace cv { namespace gpu { namespace device {
 
 namespace icf {
     void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle,
-        const int fw, const int fh, const int bins);
+        const int fw, const int fh, const int bins, cudaStream_t stream);
 }
 
 namespace imgproc {
@@ -341,27 +341,30 @@ struct cv::gpu::SCascade::Fields
 
     }
 
-    void detect(int scale, const cv::gpu::GpuMat& roi, const cv::gpu::GpuMat& count, cv::gpu::GpuMat& objects, cudaStream_t stream) const
+    void detect(int scale, const cv::gpu::GpuMat& roi, const cv::gpu::GpuMat& count, cv::gpu::GpuMat& objects, const cudaStream_t& stream) const
     {
         cudaMemset(count.data, 0, sizeof(Detection));
         cudaSafeCall( cudaGetLastError());
-        invoker(roi, hogluv, objects, count, downscales, scale);
+        invoker(roi, hogluv, objects, count, downscales, scale, stream);
     }
 
-    void preprocess(const cv::gpu::GpuMat& colored)
+    void preprocess(const cv::gpu::GpuMat& colored, Stream& s)
     {
-        cudaMemset(plane.data, 0, plane.step * plane.rows);
+        if (s)
+            s.enqueueMemSet(plane, 0);
+        else
+            cudaMemset(plane.data, 0, plane.step * plane.rows);
 
         static const int fw = Fields::FRAME_WIDTH;
         static const int fh = Fields::FRAME_HEIGHT;
 
         GpuMat gray(plane, cv::Rect(0, fh * Fields::HOG_LUV_BINS, fw, fh));
-        cv::gpu::cvtColor(colored, gray, CV_BGR2GRAY);
-        createHogBins(gray);
+        cv::gpu::cvtColor(colored, gray, CV_BGR2GRAY, s);
+        createHogBins(gray ,s);
 
-        createLuvBins(colored);
+        createLuvBins(colored, s);
 
-        integrate();
+        integrate(s);
     }
 
 private:
@@ -386,7 +389,7 @@ private:
         return res;
     }
 
-    void createHogBins(const cv::gpu::GpuMat& gray)
+    void createHogBins(const cv::gpu::GpuMat& gray, Stream& s)
     {
         static const int fw = Fields::FRAME_WIDTH;
         static const int fh = Fields::FRAME_HEIGHT;
@@ -394,35 +397,38 @@ private:
         GpuMat dfdx(fplane, cv::Rect(0,  0, fw, fh));
         GpuMat dfdy(fplane, cv::Rect(0, fh, fw, fh));
 
-        cv::gpu::Sobel(gray, dfdx, CV_32F, 1, 0);
-        cv::gpu::Sobel(gray, dfdy, CV_32F, 0, 1);
+        cv::gpu::Sobel(gray, dfdx, CV_32F, 1, 0, sobelBuf, 3, 1, BORDER_DEFAULT, -1, s);
+        cv::gpu::Sobel(gray, dfdy, CV_32F, 0, 1, sobelBuf, 3, 1, BORDER_DEFAULT, -1, s);
 
         GpuMat mag(fplane, cv::Rect(0, 2 * fh, fw, fh));
         GpuMat ang(fplane, cv::Rect(0, 3 * fh, fw, fh));
 
-        cv::gpu::cartToPolar(dfdx, dfdy, mag, ang, true);
+        cv::gpu::cartToPolar(dfdx, dfdy, mag, ang, true, s);
 
         // normolize magnitude to uchar interval and angles to 6 bins
-
         GpuMat nmag(fplane, cv::Rect(0, 4 * fh, fw, fh));
         GpuMat nang(fplane, cv::Rect(0, 5 * fh, fw, fh));
 
-        cv::gpu::multiply(mag, cv::Scalar::all(1.f / (8 *::log(2))), nmag);
-        cv::gpu::multiply(ang, cv::Scalar::all(1.f / 60.f),     nang);
+        cv::gpu::multiply(mag, cv::Scalar::all(1.f / (8 *::log(2))), nmag, 1, -1, s);
+        cv::gpu::multiply(ang, cv::Scalar::all(1.f / 60.f),     nang, 1, -1, s);
 
         //create uchar magnitude
         GpuMat cmag(plane, cv::Rect(0, fh * Fields::HOG_BINS, fw, fh));
-        nmag.convertTo(cmag, CV_8UC1);
+        if (s)
+            s.enqueueConvert(nmag, cmag, CV_8UC1);
+        else
+            nmag.convertTo(cmag, CV_8UC1);
 
-        device::icf::fillBins(plane, nang, fw, fh, Fields::HOG_BINS);
+        cudaStream_t stream = StreamAccessor::getStream(s);
+        device::icf::fillBins(plane, nang, fw, fh, Fields::HOG_BINS, stream);
     }
 
-    void createLuvBins(const cv::gpu::GpuMat& colored)
+    void createLuvBins(const cv::gpu::GpuMat& colored, Stream& s)
     {
         static const int fw = Fields::FRAME_WIDTH;
         static const int fh = Fields::FRAME_HEIGHT;
 
-        cv::gpu::cvtColor(colored, luv, CV_BGR2Luv);
+        cv::gpu::cvtColor(colored, luv, CV_BGR2Luv, s);
 
         std::vector<GpuMat> splited;
         for(int i = 0; i < Fields::LUV_BINS; ++i)
@@ -430,17 +436,18 @@ private:
             splited.push_back(GpuMat(plane, cv::Rect(0, fh * (7 + i), fw, fh)));
         }
 
-        cv::gpu::split(luv, splited);
+        cv::gpu::split(luv, splited, s);
     }
 
-    void integrate()
+    void integrate( Stream& s)
     {
         int fw = Fields::FRAME_WIDTH;
         int fh = Fields::FRAME_HEIGHT;
 
         GpuMat channels(plane, cv::Rect(0, 0, fw, fh * Fields::HOG_LUV_BINS));
-        cv::gpu::resize(channels, shrunk, cv::Size(), 0.25, 0.25, CV_INTER_AREA);
-        device::imgproc::shfl_integral_gpu_buffered(shrunk, integralBuffer, hogluv, 12, 0);
+        cv::gpu::resize(channels, shrunk, cv::Size(), 0.25, 0.25, CV_INTER_AREA, s);
+        cudaStream_t stream = StreamAccessor::getStream(s);
+        device::imgproc::shfl_integral_gpu_buffered(shrunk, integralBuffer, hogluv, 12, stream);
     }
 
 public:
@@ -482,6 +489,8 @@ public:
     GpuMat leaves;
     GpuMat levels;
 
+    GpuMat sobelBuf;
+
     device::icf::CascadeInvoker<device::icf::CascadePolicy> invoker;
 
     enum { BOOST = 0 };
@@ -516,6 +525,8 @@ void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _
     // only color images are supperted
     CV_Assert(colored.type() == CV_8UC3 || colored.type() == CV_32SC1);
 
+    GpuMat rois = _rois.getGpuMat(), objects = _objects.getGpuMat();
+
     // we guess user knows about shrincage
     // CV_Assert((rois.size().width == getRoiSize().height) && (rois.type() == CV_8UC1));
 
@@ -525,14 +536,13 @@ void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _
     {
         // only this window size allowed
         CV_Assert(colored.cols == Fields::FRAME_WIDTH && colored.rows == Fields::FRAME_HEIGHT);
-        flds.preprocess(colored);
+        flds.preprocess(colored, s);
     }
     else
     {
         colored.copyTo(flds.hogluv);
     }
 
-    GpuMat rois = _rois.getGpuMat(), objects = _objects.getGpuMat();
 
     GpuMat tmp = GpuMat(objects, cv::Rect(0, 0, sizeof(Detection), 1));
     objects = GpuMat(objects, cv::Rect( sizeof(Detection), 0, objects.cols -  sizeof(Detection), 1));
@@ -556,7 +566,7 @@ void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _
     {
         // only this window size allowed
         CV_Assert(colored.cols == Fields::FRAME_WIDTH && colored.rows == Fields::FRAME_HEIGHT);
-        flds.preprocess(colored);
+        flds.preprocess(colored, s);
     }
     else
     {
@@ -572,15 +582,15 @@ void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _
     flds.detect(level, rois, tmp, objects, stream);
 }
 
-void cv::gpu::SCascade::genRoi(InputArray _roi, OutputArray _mask) const
+void cv::gpu::SCascade::genRoi(InputArray _roi, OutputArray _mask, Stream& stream) const
 {
     const GpuMat roi = _roi.getGpuMat();
     _mask.create( roi.cols / 4, roi.rows / 4, roi.type() );
     GpuMat mask = _mask.getGpuMat();
     cv::gpu::GpuMat tmp;
 
-    cv::gpu::resize(roi, tmp, cv::Size(), 0.25, 0.25, CV_INTER_AREA);
-    cv::gpu::transpose(tmp, mask);
+    cv::gpu::resize(roi, tmp, cv::Size(), 0.25, 0.25, CV_INTER_AREA, stream);
+    cv::gpu::transpose(tmp, mask, stream);
 }
 
 void cv::gpu::SCascade::read(const FileNode& fn)
diff --git a/modules/gpu/test/test_softcascade.cpp b/modules/gpu/test/test_softcascade.cpp
index f26c44f0e..cfae940c7 100644
--- a/modules/gpu/test/test_softcascade.cpp
+++ b/modules/gpu/test/test_softcascade.cpp
@@ -330,4 +330,43 @@ GPU_TEST_P(SCascadeTestAll, detectOnIntegral,
 
     ASSERT_EQ( a ,1024);
 }
+
+GPU_TEST_P(SCascadeTestAll, detectStream,
+        ALL_DEVICES
+        )
+{
+    cv::gpu::setDevice(GetParam().deviceID());
+    std::string xml =  cvtest::TS::ptr()->get_data_path() + "../cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml";
+    cv::gpu::SCascade cascade;
+
+    cv::FileStorage fs(xml, cv::FileStorage::READ);
+    ASSERT_TRUE(fs.isOpened());
+
+    ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode()));
+
+    cv::Mat coloredCpu = cv::imread(cvtest::TS::ptr()->get_data_path()
+        + "../cv/cascadeandhog/bahnhof/image_00000000_0.png");
+    ASSERT_FALSE(coloredCpu.empty());
+
+    GpuMat colored(coloredCpu), objectBoxes(1, 100000, CV_8UC1), rois(colored.size(), CV_8UC1);
+    rois.setTo(0);
+    GpuMat sub(rois, cv::Rect(rois.cols / 4, rois.rows / 4,rois.cols / 2, rois.rows / 2));
+    sub.setTo(cv::Scalar::all(1));
+
+    cv::gpu::Stream s;
+
+    cv::gpu::GpuMat trois;
+    cascade.genRoi(rois, trois, s);
+
+    cascade.detect(colored, trois, objectBoxes, s);
+
+    cudaDeviceSynchronize();
+
+    typedef cv::gpu::SCascade::Detection Detection;
+    cv::Mat detections(objectBoxes);
+    int a = *(detections.ptr<int>(0));
+    ASSERT_EQ(a ,2460);
+}
+
+
 #endif
\ No newline at end of file

From 916967cac5070d231195713f5a1057bd546c1d4d Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Sat, 10 Nov 2012 04:26:38 +0400
Subject: [PATCH 58/74] add comments to class declaration

---
 modules/gpu/include/opencv2/gpu/gpu.hpp | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index 8f327f227..9a43760f9 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -1570,13 +1570,22 @@ public:
     // Load cascade config.
     virtual void read(const FileNode& fn);
 
-    // Return the vector of Decection objcts.
+    // Return the matrix of of detectioned objects.
     // Param image is a frame on which detector will be applied.
-    // Param rois is a vector of regions of interest. Only the objects that fall into one of the regions will be returned.
-    // Param objects is an output array of Detections
+    // Param rois is a regions of interests mask generated by genRoi.
+    //    Only the objects that fall into one of the regions will be returned.
+    // Param objects is an output array of Detections represented as GpuMat of detections (SCascade::Detection)
+    //    The first element of the matrix is  actually a count of detections.
+    // Param stream is stream is a high-level CUDA stream abstraction used for asynchronous execution
+    // Param level used for execution cascade on specific scales pyramid level.
     virtual void detect(InputArray image, InputArray rois, OutputArray objects, Stream& stream = Stream::Null()) const;
     virtual void detect(InputArray image, InputArray rois, OutputArray objects, const int level, Stream& stream = Stream::Null()) const;
 
+    // Convert ROI matrix into the suitable for detect method.
+    // Param roi is an input matrix of the same size as the image.
+    //    There non zero value mean that detector should be executed in this point.
+    // Param mask is an output mask
+    // Param stream is stream is a high-level CUDA stream abstraction used for asynchronous execution
     void genRoi(InputArray roi, OutputArray mask, Stream& stream = Stream::Null()) const;
 
 private:

From d3ac282487793bc2a609ac1a58973dfa4e714864 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Sat, 10 Nov 2012 05:48:06 +0400
Subject: [PATCH 59/74] GPU soft cascade documentation

---
 modules/gpu/doc/object_detection.rst | 119 +++++++++++++++++++++++++++
 1 file changed, 119 insertions(+)

diff --git a/modules/gpu/doc/object_detection.rst b/modules/gpu/doc/object_detection.rst
index 133660236..64348717c 100644
--- a/modules/gpu/doc/object_detection.rst
+++ b/modules/gpu/doc/object_detection.rst
@@ -199,6 +199,125 @@ Returns block descriptors computed for the whole image.
 The function is mainly used to learn the classifier.
 
 
+Soft Cascade Classifier
+======================
+
+Soft Cascade Classifier for Object Detection
+----------------------------------------------------------
+
+Cascade detectors have been shown to operate extremely rapidly, with high accuracy, and have important applications in different spheres. The initial goal for this cascade implementation was the fast and accurate pedestrian detector but it also useful in general. Soft cascade is trained with AdaBoost. But instead of training sequence of stages, the soft cascade is trained as a one long stage of T weak classifiers. Soft cascade is formulated as follows:
+
+.. math::
+    \texttt{H}(x) = \sum _{\texttt{t}=1..\texttt{T}} {\texttt{s}_t(x)}
+
+where :math:`\texttt{s}_t(x) = \alpha_t\texttt{h}_t(x)` are the set of thresholded weak classifiers selected during AdaBoost training scaled by the associated weights. Let
+
+.. math::
+    \texttt{H}_t(x) = \sum _{\texttt{i}=1..\texttt{t}} {\texttt{s}_i(x)}
+
+be the partial sum of sample responses before :math:`t`-the weak classifier will be applied. The funtcion :math:`\texttt{H}_t(x)` of :math:`t` for sample :math:`x` named *sample trace*.
+After each weak classifier evaluation, the sample trace at the point :math:`t` is compared with the rejection threshold :math:`r_t`. The sequence of :math:`r_t` named *rejection trace*.
+
+The sample has been rejected if it fall rejection threshold. So stageless cascade allows to reject not-object sample as soon as possible. Another meaning of the sample trace is a confidence with that sample recognized as desired object. At each :math:`t` that confidence depend on all previous weak classifier. This feature of soft cascade is resulted in more accurate detection. The original formulation of soft cascade can be found in [BJ05]_.
+
+.. [BJ05] Lubomir Bourdev and Jonathan Brandt. tRobust Object Detection Via Soft Cascade. IEEE CVPR, 2005.
+.. [BMTG12] Rodrigo Benenson, Markus Mathias, Radu Timofte and Luc Van Gool. Pedestrian detection at 100 frames per second. IEEE CVPR, 2012.
+
+
+SCascade
+----------------
+.. ocv:class:: SCascade
+
+Implementation of soft (stageless) cascaded detector. ::
+
+    class CV_EXPORTS SCascade : public Algorithm
+    {
+        struct CV_EXPORTS Detection
+        {
+              ushort x;
+              ushort y;
+              ushort w;
+              ushort h;
+              float confidence;
+              int kind;
+
+              enum {PEDESTRIAN = 0};
+        };
+
+        SCascade(const double minScale = 0.4, const double maxScale = 5., const int scales = 55, const int rejfactor = 1);
+        virtual ~SCascade();
+        virtual bool load(const FileNode& fn);
+        virtual void detect(InputArray image, InputArray rois, OutputArray objects, Stream& stream = Stream::Null()) const;
+        virtual void detect(InputArray image, InputArray rois, OutputArray objects, const int level, Stream& stream = Stream::Null()) const;
+        void genRoi(InputArray roi, OutputArray mask, Stream& stream = Stream::Null()) const;
+    };
+
+
+SCascade::SCascade
+--------------------------
+An empty cascade will be created.
+
+.. ocv:function:: bool SCascade::SCascade(const float minScale = 0.4f, const float maxScale = 5.f, const int scales = 55, const int rejfactor = 1)
+
+    :param minScale: a minimum scale relative to the original size of the image on which cascade will be applyed.
+
+    :param maxScale: a maximum scale relative to the original size of the image on which cascade will be applyed.
+
+    :param scales: a number of scales from minScale to maxScale.
+
+    :param rejfactor: used for non maximum suppression.
+
+
+
+SCascade::~SCascade
+---------------------------
+Destructor for SCascade.
+
+.. ocv:function:: SCascade::~SCascade()
+
+
+
+SCascade::load
+--------------------------
+Load cascade from FileNode.
+
+.. ocv:function:: bool SCascade::load(const FileNode& fn)
+
+    :param fn: File node from which the soft cascade are read.
+
+
+
+SCascade::detect
+--------------------------
+Apply cascade to an input frame and return the vector of Decection objcts.
+
+.. ocv:function:: void detect(InputArray image, InputArray rois, OutputArray objects, Stream& stream = Stream::Null()) const
+.. ocv:function:: void detect(InputArray image, InputArray rois, OutputArray objects, const int level, Stream& stream = Stream::Null()) const
+
+    :param image: a frame on which detector will be applied.
+
+    :param rois: a regions of interests mask generated by genRoi. Only the objects that fall into one of the regions will be returned.
+
+    :param objects: an output array of Detections represented as GpuMat of detections (SCascade::Detection). The first element of the matrix is  actually a count of detections.
+
+    :param stream: a high-level CUDA stream abstraction used for asynchronous execution.
+
+    :param level: used for execution cascade on specific scales pyramid level.
+
+
+SCascade::genRoi
+--------------------------
+Convert ROI matrix into the suitable for detect method.
+
+.. ocv:function:: void genRoi(InputArray roi, OutputArray mask, Stream& stream = Stream::Null()) const
+
+    :param rois: an input matrix of the same size as the image. There non zero value mean that detector should be executed in this point.
+
+    :param mask: an output mask
+
+    :param stream: a high-level CUDA stream abstraction used for asynchronous execution.
+
+
 
 gpu::CascadeClassifier_GPU
 --------------------------

From 580d8173e5acc12cc0a5a6997fe8af73efffe619 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Mon, 12 Nov 2012 11:54:40 +0400
Subject: [PATCH 60/74] refactor computing of scaling factor

---
 modules/gpu/src/softcascade.cpp | 53 ++++++---------------------------
 1 file changed, 9 insertions(+), 44 deletions(-)

diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index fdde2618e..a69be9239 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -70,6 +70,15 @@ cv::gpu::device::icf::Level::Level(int idx, const Octave& oct, const float scale
 
     objSize.x  = cv::saturate_cast<uchar>(oct.size.x * relScale);
     objSize.y  = cv::saturate_cast<uchar>(oct.size.y * relScale);
+
+    // according to R. Benenson, M. Mathias, R. Timofte and L. Van Gool's and Dallal's papers
+    if (fabs(relScale - 1.f) < FLT_EPSILON)
+        scaling[0] = scaling[1] = 1.f;
+    else
+    {
+        scaling[0] = (relScale < 1.f) ? 0.89f * ::pow(relScale, 1.099f / ::log(2)) : 1.f;
+        scaling[1] = relScale * relScale;
+    }
 }
 
 namespace cv { namespace gpu { namespace device {
@@ -91,38 +100,6 @@ namespace imgproc {
 
 struct cv::gpu::SCascade::Fields
 {
-    struct CascadeIntrinsics
-    {
-        static const float lambda = 1.099f, a = 0.89f;
-
-        static float getFor(int channel, float scaling)
-        {
-            CV_Assert(channel < 10);
-
-            if (fabs(scaling - 1.f) < FLT_EPSILON)
-                return 1.f;
-
-            // according to R. Benenson, M. Mathias, R. Timofte and L. Van Gool's and Dallal's papers
-            static const float A[2][2] =
-            {   //channel <= 6, otherwise
-                {        0.89f, 1.f}, // down
-                {        1.00f, 1.f}  // up
-            };
-
-            static const float B[2][2] =
-            {   //channel <= 6,  otherwise
-                { 1.099f / ::log(2), 2.f}, // down
-                {             0.f, 2.f}  // up
-            };
-
-            float a = A[(int)(scaling >= 1)][(int)(channel > 6)];
-            float b = B[(int)(scaling >= 1)][(int)(channel > 6)];
-
-            // printf("!!! scaling: %f %f %f -> %f\n", scaling, a, b, a * pow(scaling, b));
-            return a * ::pow(scaling, b);
-        }
-    };
-
     static Fields* parseCascade(const FileNode &root, const float mins, const float maxs)
     {
         static const char *const SC_STAGE_TYPE          = "stageType";
@@ -281,8 +258,6 @@ struct cv::gpu::SCascade::Fields
             int fit = fitOctave(voctaves, logScale);
 
             Level level(fit, voctaves[fit], scale, width, height);
-            level.scaling[0] = CascadeIntrinsics::getFor(0, level.relScale);
-            level.scaling[1] = CascadeIntrinsics::getFor(9, level.relScale);
 
             if (!width || !height)
                 break;
@@ -294,16 +269,6 @@ struct cv::gpu::SCascade::Fields
 
             if (::fabs(scale - maxs) < FLT_EPSILON) break;
             scale = ::std::min(maxs, ::expf(::log(scale) + logFactor));
-
-            // std::cout << "level " << sc
-            //           << " octeve "
-            //           << vlevels[sc].octave
-            //           << " relScale "
-            //           << vlevels[sc].relScale
-            //           << " " << vlevels[sc].shrScale
-            //           << " [" << (int)vlevels[sc].objSize.x
-            //           << " " <<  (int)vlevels[sc].objSize.y << "] ["
-            // <<  (int)vlevels[sc].workRect.x << " " <<  (int)vlevels[sc].workRect.y << "]" << std::endl;
         }
 
         cv::Mat hlevels(1, vlevels.size() * sizeof(Level), CV_8UC1, (uchar*)&(vlevels[0]) );

From aa92be34d603a154b6e734d3a52a43507ab150e2 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Mon, 12 Nov 2012 16:37:18 +0400
Subject: [PATCH 61/74] GK107 Policy

---
 modules/gpu/src/cuda/isf-sc.cu        | 284 ++++++++++----------------
 modules/gpu/src/icf.hpp               |  26 ++-
 modules/gpu/src/softcascade.cpp       |  41 ++--
 modules/gpu/test/test_softcascade.cpp |  10 +-
 4 files changed, 149 insertions(+), 212 deletions(-)

diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu
index 0de2d8e37..ac4b8f0e8 100644
--- a/modules/gpu/src/cuda/isf-sc.cu
+++ b/modules/gpu/src/cuda/isf-sc.cu
@@ -161,192 +161,128 @@ namespace icf {
     }
 
     texture<float2,  cudaTextureType2D, cudaReadModeElementType> troi;
-#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
-    template<bool isUp>
-    __global__ void test_kernel_warp(const Level* levels, const Octave* octaves, const float* stages,
-        const Node* nodes, const float* leaves, Detection* objects, const uint ndetections, uint* ctr,
-        const int downscales)
+
+template<typename Policy>
+template<bool isUp>
+__device void CascadeInvoker<Policy>::detect(Detection* objects, const uint ndetections, uint* ctr, const int downscales) const
+{
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;
+    const int x = blockIdx.x;
+
+    // load Lavel
+    __shared__ Level level;
+
+    // check POI
+    __shared__ volatile char roiCache[Policy::STA_Y];
+
+    if (!threadIdx.y && !threadIdx.x)
+        ((float2*)roiCache)[threadIdx.x] = tex2D(troi, blockIdx.y, x);
+
+    __syncthreads();
+
+    if (!roiCache[threadIdx.y]) return;
+
+    if (!threadIdx.x)
+        level = levels[downscales + blockIdx.z];
+
+    if(x >= level.workRect.x || y >= level.workRect.y) return;
+
+    int st = level.octave * level.step;
+    const int stEnd = st + level.step;
+
+    float confidence = 0.f;
+    for(; st < stEnd; st += Policy::WARP)
     {
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;
-        const int x = blockIdx.x;
+        const int nId = (st + threadIdx.x) * 3;
 
-        // load Lavel
-        __shared__ Level level;
+        Node node = nodes[nId];
 
-        // check POI
-        __shared__ volatile char roiCache[8];
-        if (!threadIdx.y && !threadIdx.x)
-            ((float2*)roiCache)[threadIdx.x] = tex2D(troi, blockIdx.y, x);
+        float threshold = rescale<isUp>(level, node);
+        int sum = get<isUp>(x, y + (node.threshold >> 28) * 120, node.rect);
 
-        __syncthreads();
+        int next = 1 + (int)(sum >= threshold);
 
-        if (!roiCache[threadIdx.y]) return;
+        node = nodes[nId + next];
+        threshold = rescale<isUp>(level, node);
+        sum = get<isUp>(x, y + (node.threshold >> 28) * 120, node.rect);
 
-        if (!threadIdx.x)
-            level = levels[downscales + blockIdx.z];
+        const int lShift = (next - 1) * 2 + (int)(sum >= threshold);
+        float impact = leaves[(st + threadIdx.x) * 4 + lShift];
 
-        if(x >= level.workRect.x || y >= level.workRect.y) return;
-
-        Octave octave = octaves[level.octave];
-        int st = octave.index * octave.stages;
-        const int stEnd = st + 1024;
-
-        float confidence = 0.f;
-
-        for(; st < stEnd; st += 32)
-        {
-
-            const int nId = (st + threadIdx.x) * 3;
-            dprintf("\n\n%d: stage: %d %d\n",threadIdx.x, st, nId);
-            Node node = nodes[nId];
-
-            float threshold = rescale<isUp>(level, node);
-            int sum = get<isUp>(x, y + (node.threshold >> 28) * 120, node.rect);
-
-            int next = 1 + (int)(sum >= threshold);
-            dprintf("%d: go: %d (%d >= %f)\n\n" ,threadIdx.x, next, sum, threshold);
-
-            node = nodes[nId + next];
-            threshold = rescale<isUp>(level, node);
-            sum = get<isUp>(x, y + (node.threshold >> 28) * 120, node.rect);
-
-            const int lShift = (next - 1) * 2 + (int)(sum >= threshold);
-            float impact = leaves[(st + threadIdx.x) * 4 + lShift];
-
-            dprintf("%d: decided: %d (%d >= %f) %d %f\n\n" ,threadIdx.x, next, sum, threshold, lShift, impact);
-            dprintf("%d: extracted stage: %f\n",threadIdx.x, stages[(st + threadIdx.x)]);
-            dprintf("%d: computed  score: %f\n",threadIdx.x, impact);
 #pragma unroll
-            // scan on shuffl functions
-            for (int i = 1; i < 32; i *= 2)
-            {
-                const float n = __shfl_up(impact, i, 32);
-
-                if (threadIdx.x >= i)
-                    impact += n;
-            }
-
-            dprintf("%d: impact scaned %f\n" ,threadIdx.x, impact);
-
-            confidence += impact;
-            if(__any((confidence <= stages[(st + threadIdx.x)]))) st += 2048;
-        }
-
-        if(!threadIdx.x && st == stEnd &&  ((confidence - FLT_EPSILON) >= 0))
+        // scan on shuffl functions
+        for (int i = 1; i < Policy::WARP; i *= 2)
         {
-            int idx = atomicInc(ctr, ndetections);
-            // store detection
-            objects[idx] = Detection(__float2int_rn(x * octave.shrinkage),
-                __float2int_rn(y * octave.shrinkage), level.objSize.x, level.objSize.y, confidence);
+            const float n = __shfl_up(impact, i, Policy::WARP);
+
+            if (threadIdx.x >= i)
+                impact += n;
         }
+
+        confidence += impact;
+        if(__any((confidence <= stages[(st + threadIdx.x)]))) st += 2048;
     }
-#else
-    template<bool isUp>
-    __global__ void test_kernel_warp(const Level* levels, const Octave* octaves, const float* stages,
-        const Node* nodes, const float* leaves, Detection* objects, const uint ndetections, uint* ctr,
-        const int downscales)
+
+    if(!threadIdx.x && st == stEnd &&  ((confidence - FLT_EPSILON) >= 0))
     {
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;
-        Level level = levels[blockIdx.z];
-
-        // if (blockIdx.z != 31) return;
-        if(x >= level.workRect.x || y >= level.workRect.y) return;
-
-        // int roi = tex2D(troi, x, y);
-        // printf("%d\n", roi);
-        // if (!roi) return;
-
-        Octave octave = octaves[level.octave];
-
-        int st = octave.index * octave.stages;
-        const int stEnd = st + 1000;//octave.stages;
-
-        float confidence = 0.f;
-
-        for(; st < stEnd; ++st)
-        {
-            dprintf("\n\nstage: %d\n", st);
-            const int nId = st * 3;
-            Node node = nodes[nId];
-
-            dprintf("Node: [%d %d %d %d] %d %d\n", node.rect.x, node.rect.y, node.rect.z, node.rect.w,
-                node.threshold >> 28, node.threshold & 0x0FFFFFFFU);
-
-            float threshold = rescale<isUp>(level, node);
-            int sum = get<isUp>(x, y + (node.threshold >> 28) * 121, node.rect);
-
-            dprintf("Node: [%d %d %d %d] %f\n", node.rect.x, node.rect.y, node.rect.z,
-                node.rect.w, threshold);
-
-            int next = 1 + (int)(sum >= threshold);
-            dprintf("go: %d (%d >= %f)\n\n" ,next, sum, threshold);
-
-            node = nodes[nId + next];
-            threshold = rescale<isUp>(level, node);
-            sum = get<isUp>(x, y + (node.threshold >> 28) * 121, node.rect);
-
-            const int lShift = (next - 1) * 2 + (int)(sum >= threshold);
-            float impact = leaves[st * 4 + lShift];
-            confidence += impact;
-
-            if (confidence <= stages[st]) st = stEnd + 10;
-            dprintf("decided: %d (%d >= %f) %d %f\n\n" ,next, sum, threshold, lShift, impact);
-            dprintf("extracted stage: %f\n", stages[st]);
-            dprintf("computed  score: %f\n\n", confidence);
-        }
-
-        if(st == stEnd)
-        {
-            int idx = atomicInc(ctr, ndetections);
-            // store detection
-            objects[idx] = Detection(__float2int_rn(x * octave.shrinkage),
-                __float2int_rn(y * octave.shrinkage), level.objSize.x, level.objSize.y, confidence);
-        }
-    }
-#endif
-
-    template<>
-    void CascadeInvoker<CascadePolicy>::operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv,
-        PtrStepSz<uchar4> objects, PtrStepSzi counter, const int downscales, const int scale, const cudaStream_t& stream) const
-    {
-        int fw = 160;
-        int fh = 120;
-
-        dim3 block(32, 8);
-        dim3 grid(fw, fh / 8, (scale == -1) ? downscales : 1);
-
-        uint* ctr = (uint*)(counter.ptr(0));
-        Detection* det = (Detection*)objects.ptr();
-        uint max_det = objects.cols / sizeof(Detection);
-
-        cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>();
-        cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step));
-
-        cudaChannelFormatDesc desc_roi = cudaCreateChannelDesc<float2>();
-        cudaSafeCall( cudaBindTexture2D(0, troi, roi.data, desc_roi, roi.cols / 8, roi.rows, roi.step));
-
-        if (scale == -1)
-        {
-            test_kernel_warp<false><<<grid, block, 0, stream>>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, 0);
-            cudaSafeCall( cudaGetLastError());
-
-            grid = dim3(fw, fh / 8, 47 - downscales);
-            test_kernel_warp<true><<<grid, block, 0, stream>>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, downscales);
-        }
-        else
-        {
-            if (scale >= downscales)
-                test_kernel_warp<true><<<grid, block, 0, stream>>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, scale);
-            else
-                test_kernel_warp<false><<<grid, block, 0, stream>>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, scale);
-        }
-
-        if (!stream)
-        {
-            cudaSafeCall( cudaGetLastError());
-            cudaSafeCall( cudaDeviceSynchronize());
-        }
+        int idx = atomicInc(ctr, ndetections);
+        objects[idx] = Detection(__float2int_rn(x * Policy::SHRINKAGE),
+            __float2int_rn(y * Policy::SHRINKAGE), level.objSize.x, level.objSize.y, confidence);
     }
 }
+
+template<typename Policy, bool isUp>
+__global__ void soft_cascade(const CascadeInvoker<Policy> invoker, Detection* objects, const uint n, uint* ctr, const int downs)
+{
+    invoker.template detect<isUp>(objects, n, ctr, downs);
+}
+
+template<typename Policy>
+void CascadeInvoker<Policy>::operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv,
+    PtrStepSz<uchar4> objects, PtrStepSzi counter, const int downscales, const int scale, const cudaStream_t& stream) const
+{
+    int fw = 160;
+    int fh = 120;
+
+    dim3 grid(fw, fh / Policy::STA_Y, (scale == -1) ? downscales : 1);
+
+    uint* ctr = (uint*)(counter.ptr(0));
+    Detection* det = (Detection*)objects.ptr();
+    uint max_det = objects.cols / sizeof(Detection);
+
+    cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>();
+    cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step));
+
+    cudaChannelFormatDesc desc_roi = cudaCreateChannelDesc<float2>();
+    cudaSafeCall( cudaBindTexture2D(0, troi, roi.data, desc_roi, roi.cols / 8, roi.rows, roi.step));
+
+    const CascadeInvoker<Policy> inv = *this;
+
+    if (scale == -1)
+    {
+        soft_cascade<Policy, false><<<grid, Policy::block(), 0, stream>>>(inv, det, max_det, ctr, 0);
+        cudaSafeCall( cudaGetLastError());
+
+        grid = dim3(fw, fh / Policy::STA_Y, scales - downscales);
+        soft_cascade<Policy, true><<<grid, Policy::block(), 0, stream>>>(inv, det, max_det, ctr, downscales);
+    }
+    else
+    {
+        if (scale >= downscales)
+            soft_cascade<Policy, true><<<grid, Policy::block(), 0, stream>>>(inv, det, max_det, ctr, scale);
+        else
+            soft_cascade<Policy, false><<<grid, Policy::block(), 0, stream>>>(inv, det, max_det, ctr, scale);
+    }
+
+    if (!stream)
+    {
+        cudaSafeCall( cudaGetLastError());
+        cudaSafeCall( cudaDeviceSynchronize());
+    }
+}
+
+template void CascadeInvoker<GK107PolicyX4>::operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv,
+    PtrStepSz<uchar4> objects, PtrStepSzi counter, const int downscales, const int scale, const cudaStream_t& stream) const;
+
+}
 }}}
\ No newline at end of file
diff --git a/modules/gpu/src/icf.hpp b/modules/gpu/src/icf.hpp
index 60df55882..8eb080e23 100644
--- a/modules/gpu/src/icf.hpp
+++ b/modules/gpu/src/icf.hpp
@@ -72,9 +72,9 @@ struct __align__(16) Octave
 struct __align__(8) Level //is actually 24 bytes
 {
     int octave;
+    int step;
 
     float relScale;
-    float shrScale;   // used for marking detection
     float scaling[2]; // calculated according to Dollal paper
 
     // for 640x480 we can not get overflow
@@ -115,31 +115,41 @@ struct __align__(16) Detection
     : x(_x), y(_y), w(_w), h(_h), confidence(c), kind(0) {};
 };
 
-struct CascadePolicy
+struct GK107PolicyX4
 {
-    enum {STA_X = 32, STA_Y = 8};
+    enum {WARP = 32, STA_X = WARP, STA_Y = 8, SHRINKAGE = 4};
+    static const dim3 block()
+    {
+        return dim3(GK107PolicyX4::STA_X, GK107PolicyX4::STA_Y);
+    }
 };
 
 template<typename Policy>
 struct CascadeInvoker
 {
-    CascadeInvoker(): levels(0), octaves(0), stages(0), nodes(0), leaves(0) {}
+    CascadeInvoker(): levels(0), stages(0), nodes(0), leaves(0), scales(0) {}
+
     CascadeInvoker(const PtrStepSzb& _levels, const PtrStepSzb& _octaves, const PtrStepSzf& _stages,
                    const PtrStepSzb& _nodes,  const PtrStepSzf& _leaves)
-    : levels((const Level*)_levels.ptr()), octaves((const Octave*)_octaves.ptr()), stages((const float*)_stages.ptr()),
-       nodes((const Node*)_nodes.ptr()), leaves((const float*)_leaves.ptr())
+    : levels((const Level*)_levels.ptr()),
+      stages((const float*)_stages.ptr()),
+      nodes((const Node*)_nodes.ptr()), leaves((const float*)_leaves.ptr()),
+      scales(_levels.cols / sizeof(Level))
     {}
 
     const Level*  levels;
-    const Octave* octaves;
-
     const float*  stages;
 
     const Node*   nodes;
     const float*  leaves;
 
+    int scales;
+
     void operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv, PtrStepSz<uchar4> objects,
         PtrStepSzi counter, const int downscales, const int csale = -1, const cudaStream_t& stream = 0) const;
+
+    template<bool isUp>
+    __device void detect(Detection* objects, const uint ndetections, uint* ctr, const int downscales) const;
 };
 
 }
diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index a69be9239..6133bd1cb 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -63,7 +63,7 @@ void cv::gpu::SCascade::read(const FileNode& fn) { Algorithm::read(fn); }
 #include <icf.hpp>
 
 cv::gpu::device::icf::Level::Level(int idx, const Octave& oct, const float scale, const int w, const int h)
-:  octave(idx), relScale(scale / oct.scale), shrScale (relScale / (float)oct.shrinkage)
+:  octave(idx), step(oct.stages), relScale(scale / oct.scale)
 {
     workRect.x = round(w / (float)oct.shrinkage);
     workRect.y = round(h / (float)oct.shrinkage);
@@ -100,7 +100,7 @@ namespace imgproc {
 
 struct cv::gpu::SCascade::Fields
 {
-    static Fields* parseCascade(const FileNode &root, const float mins, const float maxs)
+    static Fields* parseCascade(const FileNode &root, const float mins, const float maxs, const int totals)
     {
         static const char *const SC_STAGE_TYPE          = "stageType";
         static const char *const SC_BOOST               = "BOOST";
@@ -119,11 +119,8 @@ struct cv::gpu::SCascade::Fields
         static const char *const SC_ORIG_W              = "width";
         static const char *const SC_ORIG_H              = "height";
 
-        int origWidth = (int)root[SC_ORIG_W];
-        CV_Assert(origWidth  == ORIG_OBJECT_WIDTH);
-
+        int origWidth  = (int)root[SC_ORIG_W];
         int origHeight = (int)root[SC_ORIG_H];
-        CV_Assert(origHeight == ORIG_OBJECT_HEIGHT);
 
         static const char *const SC_OCTAVES             = "octaves";
         static const char *const SC_STAGES              = "stages";
@@ -142,7 +139,6 @@ struct cv::gpu::SCascade::Fields
         static const char * const SC_F_CHANNEL          = "channel";
         static const char * const SC_F_RECT             = "rect";
 
-
         FileNode fn = root[SC_OCTAVES];
             if (fn.empty()) return false;
 
@@ -167,8 +163,8 @@ struct cv::gpu::SCascade::Fields
 
             ushort nstages = saturate_cast<ushort>((int)fns[SC_OCT_STAGES]);
             ushort2 size;
-            size.x = cvRound(ORIG_OBJECT_WIDTH * scale);
-            size.y = cvRound(ORIG_OBJECT_HEIGHT * scale);
+            size.x = cvRound(origWidth * scale);
+            size.y = cvRound(origHeight * scale);
             shrinkage = saturate_cast<ushort>((int)fns[SC_OCT_SHRINKAGE]);
 
             Octave octave(octIndex, nstages, shrinkage, size, scale);
@@ -245,11 +241,11 @@ struct cv::gpu::SCascade::Fields
         CV_Assert(!hleaves.empty());
 
         std::vector<Level> vlevels;
-        float logFactor = (::log(maxs) - ::log(mins)) / (TOTAL_SCALES -1);
+        float logFactor = (::log(maxs) - ::log(mins)) / (totals -1);
 
         float scale = mins;
         int downscales = 0;
-        for (int sc = 0; sc < TOTAL_SCALES; ++sc)
+        for (int sc = 0; sc < totals; ++sc)
         {
             int width  = ::std::max(0.0f, FRAME_WIDTH - (origWidth  * scale));
             int height = ::std::max(0.0f, FRAME_HEIGHT - (origHeight * scale));
@@ -302,7 +298,7 @@ struct cv::gpu::SCascade::Fields
         leaves.upload(hleaves);
         levels.upload(hlevels);
 
-        invoker = device::icf::CascadeInvoker<device::icf::CascadePolicy>(levels, octaves, stages, nodes, leaves);
+        invoker = device::icf::CascadeInvoker<device::icf::GK107PolicyX4>(levels, octaves, stages, nodes, leaves);
 
     }
 
@@ -456,16 +452,13 @@ public:
 
     GpuMat sobelBuf;
 
-    device::icf::CascadeInvoker<device::icf::CascadePolicy> invoker;
+    device::icf::CascadeInvoker<device::icf::GK107PolicyX4> invoker;
 
     enum { BOOST = 0 };
     enum
     {
         FRAME_WIDTH        = 640,
         FRAME_HEIGHT       = 480,
-        TOTAL_SCALES       = 55,
-        ORIG_OBJECT_WIDTH  = 64,
-        ORIG_OBJECT_HEIGHT = 128,
         HOG_BINS           = 6,
         LUV_BINS           = 3,
         HOG_LUV_BINS       = 10
@@ -480,21 +473,19 @@ cv::gpu::SCascade::~SCascade() { delete fields; }
 bool cv::gpu::SCascade::load(const FileNode& fn)
 {
     if (fields) delete fields;
-    fields = Fields::parseCascade(fn, minScale, maxScale);
+    fields = Fields::parseCascade(fn, minScale, maxScale, scales);
     return fields != 0;
 }
 
 void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _objects, Stream& s) const
 {
+    CV_Assert(fields);
+
     const GpuMat colored = image.getGpuMat();
     // only color images are supperted
     CV_Assert(colored.type() == CV_8UC3 || colored.type() == CV_32SC1);
 
     GpuMat rois = _rois.getGpuMat(), objects = _objects.getGpuMat();
-
-    // we guess user knows about shrincage
-    // CV_Assert((rois.size().width == getRoiSize().height) && (rois.type() == CV_8UC1));
-
     Fields& flds = *fields;
 
     if (colored.type() == CV_8UC3)
@@ -518,15 +509,13 @@ void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _
 
 void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _objects, const int level, Stream& s) const
 {
+    CV_Assert(fields);
+
     const GpuMat colored = image.getGpuMat();
     // only color images are supperted
     CV_Assert(colored.type() == CV_8UC3 || colored.type() == CV_32SC1);
 
-    // we guess user knows about shrincage
-    // CV_Assert((rois.size().width == getRoiSize().height) && (rois.type() == CV_8UC1));
-
     Fields& flds = *fields;
-
     if (colored.type() == CV_8UC3)
     {
         // only this window size allowed
@@ -549,6 +538,8 @@ void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _
 
 void cv::gpu::SCascade::genRoi(InputArray _roi, OutputArray _mask, Stream& stream) const
 {
+    CV_Assert(fields);
+
     const GpuMat roi = _roi.getGpuMat();
     _mask.create( roi.cols / 4, roi.rows / 4, roi.type() );
     GpuMat mask = _mask.getGpuMat();
diff --git a/modules/gpu/test/test_softcascade.cpp b/modules/gpu/test/test_softcascade.cpp
index cfae940c7..7034b33b9 100644
--- a/modules/gpu/test/test_softcascade.cpp
+++ b/modules/gpu/test/test_softcascade.cpp
@@ -172,7 +172,7 @@ GPU_TEST_P(SCascadeTestRoi, detect,
         sub.setTo(1);
         cv::rectangle(result, r, cv::Scalar(0, 0, 255, 255), 1);
     }
-
+    objectBoxes.setTo(0);
     cascade.genRoi(rois, trois);
     cascade.detect(colored, trois, objectBoxes);
 
@@ -222,7 +222,7 @@ GPU_TEST_P(SCascadeTestLevel, detect,
 
     cv::gpu::GpuMat trois;
     cascade.genRoi(rois, trois);
-
+    objectBoxes.setTo(0);
     int level = GET_PARAM(3);
     cascade.detect(colored, trois, objectBoxes, level);
 
@@ -281,7 +281,7 @@ GPU_TEST_P(SCascadeTestAll, detect,
 
     cv::gpu::GpuMat trois;
     cascade.genRoi(rois, trois);
-
+    objectBoxes.setTo(0);
     cascade.detect(colored, trois, objectBoxes);
 
     typedef cv::gpu::SCascade::Detection Detection;
@@ -321,7 +321,7 @@ GPU_TEST_P(SCascadeTestAll, detectOnIntegral,
 
     cv::gpu::GpuMat trois;
     cascade.genRoi(rois, trois);
-
+    objectBoxes.setTo(0);
     cascade.detect(hogluv, trois, objectBoxes);
 
     typedef cv::gpu::SCascade::Detection Detection;
@@ -357,7 +357,7 @@ GPU_TEST_P(SCascadeTestAll, detectStream,
 
     cv::gpu::GpuMat trois;
     cascade.genRoi(rois, trois, s);
-
+    objectBoxes.setTo(0);
     cascade.detect(colored, trois, objectBoxes, s);
 
     cudaDeviceSynchronize();

From 08910e81af95dd2004930845e5f206c2b9368aac Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Wed, 14 Nov 2012 12:40:44 +0400
Subject: [PATCH 62/74] integrate pre-Kepler architectures

---
 modules/gpu/src/cuda/isf-sc.cu  | 15 +++++++++++++++
 modules/gpu/src/softcascade.cpp | 16 +++++++++++-----
 2 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu
index ac4b8f0e8..b6c87e17b 100644
--- a/modules/gpu/src/cuda/isf-sc.cu
+++ b/modules/gpu/src/cuda/isf-sc.cu
@@ -209,6 +209,7 @@ __device void CascadeInvoker<Policy>::detect(Detection* objects, const uint ndet
         const int lShift = (next - 1) * 2 + (int)(sum >= threshold);
         float impact = leaves[(st + threadIdx.x) * 4 + lShift];
 
+#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
 #pragma unroll
         // scan on shuffl functions
         for (int i = 1; i < Policy::WARP; i *= 2)
@@ -218,7 +219,21 @@ __device void CascadeInvoker<Policy>::detect(Detection* objects, const uint ndet
             if (threadIdx.x >= i)
                 impact += n;
         }
+#else
+        __shared__ volatile float ptr[Policy::STA_X * Policy::STA_Y];
 
+        const int idx = threadIdx.y * Policy::STA_X + threadIdx.x;
+
+        ptr[idx] = impact;
+
+        if ( threadIdx.x >=  1) ptr [idx ] = (ptr [idx -  1] + ptr [idx]);
+        if ( threadIdx.x >=  2) ptr [idx ] = (ptr [idx -  2] + ptr [idx]);
+        if ( threadIdx.x >=  4) ptr [idx ] = (ptr [idx -  4] + ptr [idx]);
+        if ( threadIdx.x >=  8) ptr [idx ] = (ptr [idx -  8] + ptr [idx]);
+        if ( threadIdx.x >= 16) ptr [idx ] = (ptr [idx - 16] + ptr [idx]);
+
+        impact = ptr[idx];
+#endif
         confidence += impact;
         if(__any((confidence <= stages[(st + threadIdx.x)]))) st += 2048;
     }
diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index 6133bd1cb..c5bcbedb5 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -298,14 +298,14 @@ struct cv::gpu::SCascade::Fields
         leaves.upload(hleaves);
         levels.upload(hlevels);
 
-        invoker = device::icf::CascadeInvoker<device::icf::GK107PolicyX4>(levels, octaves, stages, nodes, leaves);
-
     }
 
     void detect(int scale, const cv::gpu::GpuMat& roi, const cv::gpu::GpuMat& count, cv::gpu::GpuMat& objects, const cudaStream_t& stream) const
     {
         cudaMemset(count.data, 0, sizeof(Detection));
         cudaSafeCall( cudaGetLastError());
+        device::icf::CascadeInvoker<device::icf::GK107PolicyX4> invoker
+        = device::icf::CascadeInvoker<device::icf::GK107PolicyX4>(levels, octaves, stages, nodes, leaves);
         invoker(roi, hogluv, objects, count, downscales, scale, stream);
     }
 
@@ -407,8 +407,14 @@ private:
 
         GpuMat channels(plane, cv::Rect(0, 0, fw, fh * Fields::HOG_LUV_BINS));
         cv::gpu::resize(channels, shrunk, cv::Size(), 0.25, 0.25, CV_INTER_AREA, s);
-        cudaStream_t stream = StreamAccessor::getStream(s);
-        device::imgproc::shfl_integral_gpu_buffered(shrunk, integralBuffer, hogluv, 12, stream);
+
+        if (info.majorVersion() < 3)
+            cv::gpu::integralBuffered(shrunk, hogluv, integralBuffer, s);
+        else
+        {
+            cudaStream_t stream = StreamAccessor::getStream(s);
+            device::imgproc::shfl_integral_gpu_buffered(shrunk, integralBuffer, hogluv, 12, stream);
+        }
     }
 
 public:
@@ -452,7 +458,7 @@ public:
 
     GpuMat sobelBuf;
 
-    device::icf::CascadeInvoker<device::icf::GK107PolicyX4> invoker;
+    DeviceInfo info;
 
     enum { BOOST = 0 };
     enum

From 72e2b8b370c10024adb05a46230fbac782fbeee4 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Wed, 14 Nov 2012 14:00:02 +0400
Subject: [PATCH 63/74] remove size constraints of input frame

---
 modules/gpu/src/cuda/isf-sc.cu  |  4 +-
 modules/gpu/src/softcascade.cpp | 97 ++++++++++++++++++---------------
 2 files changed, 56 insertions(+), 45 deletions(-)

diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu
index b6c87e17b..7f7a10e92 100644
--- a/modules/gpu/src/cuda/isf-sc.cu
+++ b/modules/gpu/src/cuda/isf-sc.cu
@@ -256,8 +256,8 @@ template<typename Policy>
 void CascadeInvoker<Policy>::operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv,
     PtrStepSz<uchar4> objects, PtrStepSzi counter, const int downscales, const int scale, const cudaStream_t& stream) const
 {
-    int fw = 160;
-    int fh = 120;
+    int fw = roi.rows;
+    int fh = roi.cols;
 
     dim3 grid(fw, fh / Policy::STA_Y, (scale == -1) ? downscales : 1);
 
diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index c5bcbedb5..ad6e00027 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -240,15 +240,26 @@ struct cv::gpu::SCascade::Fields
         cv::Mat hleaves(cv::Mat(vleaves).reshape(1,1));
         CV_Assert(!hleaves.empty());
 
-        std::vector<Level> vlevels;
-        float logFactor = (::log(maxs) - ::log(mins)) / (totals -1);
+        Fields* fields = new Fields(mins, maxs, totals, origWidth, origHeight, shrinkage, 0,
+            hoctaves, hstages, hnodes, hleaves);
+        fields->voctaves = voctaves;
+        fields->createLevels(FRAME_HEIGHT, FRAME_WIDTH);
 
-        float scale = mins;
-        int downscales = 0;
+        return fields;
+    }
+
+    int createLevels(const int fh, const int fw)
+    {
+        using namespace device::icf;
+        std::vector<Level> vlevels;
+        float logFactor = (::log(maxScale) - ::log(minScale)) / (totals -1);
+
+        float scale = minScale;
+        int dcs = 0;
         for (int sc = 0; sc < totals; ++sc)
         {
-            int width  = ::std::max(0.0f, FRAME_WIDTH - (origWidth  * scale));
-            int height = ::std::max(0.0f, FRAME_HEIGHT - (origHeight * scale));
+            int width  = ::std::max(0.0f, fw - (origObjWidth  * scale));
+            int height = ::std::max(0.0f, fh - (origObjHeight * scale));
 
             float logScale = ::log(scale);
             int fit = fitOctave(voctaves, logScale);
@@ -260,44 +271,44 @@ struct cv::gpu::SCascade::Fields
             else
             {
                 vlevels.push_back(level);
-                if (voctaves[fit].scale < 1) ++downscales;
+                if (voctaves[fit].scale < 1) ++dcs;
             }
 
-            if (::fabs(scale - maxs) < FLT_EPSILON) break;
-            scale = ::std::min(maxs, ::expf(::log(scale) + logFactor));
+            if (::fabs(scale - maxScale) < FLT_EPSILON) break;
+            scale = ::std::min(maxScale, ::expf(::log(scale) + logFactor));
         }
 
-        cv::Mat hlevels(1, vlevels.size() * sizeof(Level), CV_8UC1, (uchar*)&(vlevels[0]) );
+        cv::Mat hlevels = cv::Mat(1, vlevels.size() * sizeof(Level), CV_8UC1, (uchar*)&(vlevels[0]) );
         CV_Assert(!hlevels.empty());
-
-        Fields* fields = new Fields(mins, maxs, origWidth, origHeight, shrinkage, downscales,
-            hoctaves, hstages, hnodes, hleaves, hlevels);
-
-        return fields;
+        levels.upload(hlevels);
+        downscales = dcs;
+        return dcs;
     }
 
-    Fields( const float mins, const float maxs, const int ow, const int oh, const int shr, const int ds,
-        cv::Mat hoctaves, cv::Mat hstages, cv::Mat hnodes, cv::Mat hleaves, cv::Mat hlevels)
-    : minScale(mins), maxScale(maxs), origObjWidth(ow), origObjHeight(oh), shrinkage(shr), downscales(ds)
+    bool update(int fh, int fw, int shr)
     {
-        plane.create(FRAME_HEIGHT * (HOG_LUV_BINS + 1), FRAME_WIDTH, CV_8UC1);
-        fplane.create(FRAME_HEIGHT * 6, FRAME_WIDTH, CV_32FC1);
-        luv.create(FRAME_HEIGHT, FRAME_WIDTH, CV_8UC3);
+        if (fh == luv.rows && fh == luv.cols) return false;
+        plane.create(fh * (HOG_LUV_BINS + 1), fw, CV_8UC1);
+        fplane.create(fh * HOG_BINS, fw, CV_32FC1);
+        luv.create(fh, fw, CV_8UC3);
 
-        shrunk.create(FRAME_HEIGHT / shr * HOG_LUV_BINS, FRAME_WIDTH / shr, CV_8UC1);
+        shrunk.create(fh / shr * HOG_LUV_BINS, fw / shr, CV_8UC1);
         integralBuffer.create(shrunk.rows, shrunk.cols, CV_32SC1);
 
-        hogluv.create((FRAME_HEIGHT / shr) * HOG_LUV_BINS + 1, FRAME_WIDTH / shr + 1, CV_32SC1);
+        hogluv.create((fh / shr) * HOG_LUV_BINS + 1, fw / shr + 1, CV_32SC1);
         hogluv.setTo(cv::Scalar::all(0));
+        return true;
+    }
 
-        detCounter.create(sizeof(Detection) / sizeof(int),1, CV_32SC1);
-
+    Fields( const float mins, const float maxs, const int tts, const int ow, const int oh, const int shr, const int ds,
+        cv::Mat hoctaves, cv::Mat hstages, cv::Mat hnodes, cv::Mat hleaves)
+    : minScale(mins), maxScale(maxs), totals(tts), origObjWidth(ow), origObjHeight(oh), shrinkage(shr), downscales(ds)
+    {
+        update(FRAME_HEIGHT, FRAME_WIDTH, shr);
         octaves.upload(hoctaves);
         stages.upload(hstages);
         nodes.upload(hnodes);
         leaves.upload(hleaves);
-        levels.upload(hlevels);
-
     }
 
     void detect(int scale, const cv::gpu::GpuMat& roi, const cv::gpu::GpuMat& count, cv::gpu::GpuMat& objects, const cudaStream_t& stream) const
@@ -316,8 +327,8 @@ struct cv::gpu::SCascade::Fields
         else
             cudaMemset(plane.data, 0, plane.step * plane.rows);
 
-        static const int fw = Fields::FRAME_WIDTH;
-        static const int fh = Fields::FRAME_HEIGHT;
+        const int fw = colored.cols;
+        const int fh = colored.rows;
 
         GpuMat gray(plane, cv::Rect(0, fh * Fields::HOG_LUV_BINS, fw, fh));
         cv::gpu::cvtColor(colored, gray, CV_BGR2GRAY, s);
@@ -325,7 +336,7 @@ struct cv::gpu::SCascade::Fields
 
         createLuvBins(colored, s);
 
-        integrate(s);
+        integrate(fh, fw, s);
     }
 
 private:
@@ -352,8 +363,8 @@ private:
 
     void createHogBins(const cv::gpu::GpuMat& gray, Stream& s)
     {
-        static const int fw = Fields::FRAME_WIDTH;
-        static const int fh = Fields::FRAME_HEIGHT;
+        static const int fw = gray.cols;
+        static const int fh = gray.rows;
 
         GpuMat dfdx(fplane, cv::Rect(0,  0, fw, fh));
         GpuMat dfdy(fplane, cv::Rect(0, fh, fw, fh));
@@ -386,8 +397,8 @@ private:
 
     void createLuvBins(const cv::gpu::GpuMat& colored, Stream& s)
     {
-        static const int fw = Fields::FRAME_WIDTH;
-        static const int fh = Fields::FRAME_HEIGHT;
+        static const int fw = colored.cols;
+        static const int fh = colored.rows;
 
         cv::gpu::cvtColor(colored, luv, CV_BGR2Luv, s);
 
@@ -400,11 +411,8 @@ private:
         cv::gpu::split(luv, splited, s);
     }
 
-    void integrate( Stream& s)
+    void integrate(const int fh, const int fw, Stream& s)
     {
-        int fw = Fields::FRAME_WIDTH;
-        int fh = Fields::FRAME_HEIGHT;
-
         GpuMat channels(plane, cv::Rect(0, 0, fw, fh * Fields::HOG_LUV_BINS));
         cv::gpu::resize(channels, shrunk, cv::Size(), 0.25, 0.25, CV_INTER_AREA, s);
 
@@ -423,6 +431,8 @@ public:
     float minScale;
     float maxScale;
 
+    int totals;
+
     int origObjWidth;
     int origObjHeight;
 
@@ -447,8 +457,6 @@ public:
     // 161x121x10
     GpuMat hogluv;
 
-    GpuMat detCounter;
-
     // Cascade from xml
     GpuMat octaves;
     GpuMat stages;
@@ -458,6 +466,8 @@ public:
 
     GpuMat sobelBuf;
 
+    std::vector<device::icf::Octave> voctaves;
+
     DeviceInfo info;
 
     enum { BOOST = 0 };
@@ -488,6 +498,7 @@ void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _
     CV_Assert(fields);
 
     const GpuMat colored = image.getGpuMat();
+
     // only color images are supperted
     CV_Assert(colored.type() == CV_8UC3 || colored.type() == CV_32SC1);
 
@@ -496,8 +507,8 @@ void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _
 
     if (colored.type() == CV_8UC3)
     {
-        // only this window size allowed
-        CV_Assert(colored.cols == Fields::FRAME_WIDTH && colored.rows == Fields::FRAME_HEIGHT);
+        if (!flds.update(colored.rows, colored.cols, flds.shrinkage))
+            flds.createLevels(colored.rows, colored.cols);
         flds.preprocess(colored, s);
     }
     else
@@ -525,7 +536,7 @@ void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _
     if (colored.type() == CV_8UC3)
     {
         // only this window size allowed
-        CV_Assert(colored.cols == Fields::FRAME_WIDTH && colored.rows == Fields::FRAME_HEIGHT);
+        // CV_Assert(colored.cols == Fields::FRAME_WIDTH && colored.rows == Fields::FRAME_HEIGHT);
         flds.preprocess(colored, s);
     }
     else

From 8acfbde68e5b9c4bd6640a21c384c3fff8fd28aa Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Wed, 14 Nov 2012 14:21:22 +0400
Subject: [PATCH 64/74] remove debug detect at scale method

---
 modules/gpu/doc/object_detection.rst    |  4 --
 modules/gpu/include/opencv2/gpu/gpu.hpp |  2 -
 modules/gpu/src/cuda/isf-sc.cu          | 37 +++---------
 modules/gpu/src/icf.hpp                 |  5 +-
 modules/gpu/src/softcascade.cpp         | 35 +-----------
 modules/gpu/test/test_softcascade.cpp   | 76 ++++++++++++-------------
 6 files changed, 53 insertions(+), 106 deletions(-)

diff --git a/modules/gpu/doc/object_detection.rst b/modules/gpu/doc/object_detection.rst
index 64348717c..c503d93fe 100644
--- a/modules/gpu/doc/object_detection.rst
+++ b/modules/gpu/doc/object_detection.rst
@@ -248,7 +248,6 @@ Implementation of soft (stageless) cascaded detector. ::
         virtual ~SCascade();
         virtual bool load(const FileNode& fn);
         virtual void detect(InputArray image, InputArray rois, OutputArray objects, Stream& stream = Stream::Null()) const;
-        virtual void detect(InputArray image, InputArray rois, OutputArray objects, const int level, Stream& stream = Stream::Null()) const;
         void genRoi(InputArray roi, OutputArray mask, Stream& stream = Stream::Null()) const;
     };
 
@@ -292,7 +291,6 @@ SCascade::detect
 Apply cascade to an input frame and return the vector of Decection objcts.
 
 .. ocv:function:: void detect(InputArray image, InputArray rois, OutputArray objects, Stream& stream = Stream::Null()) const
-.. ocv:function:: void detect(InputArray image, InputArray rois, OutputArray objects, const int level, Stream& stream = Stream::Null()) const
 
     :param image: a frame on which detector will be applied.
 
@@ -302,8 +300,6 @@ Apply cascade to an input frame and return the vector of Decection objcts.
 
     :param stream: a high-level CUDA stream abstraction used for asynchronous execution.
 
-    :param level: used for execution cascade on specific scales pyramid level.
-
 
 SCascade::genRoi
 --------------------------
diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index 9a43760f9..db228a69b 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -1577,9 +1577,7 @@ public:
     // Param objects is an output array of Detections represented as GpuMat of detections (SCascade::Detection)
     //    The first element of the matrix is  actually a count of detections.
     // Param stream is stream is a high-level CUDA stream abstraction used for asynchronous execution
-    // Param level used for execution cascade on specific scales pyramid level.
     virtual void detect(InputArray image, InputArray rois, OutputArray objects, Stream& stream = Stream::Null()) const;
-    virtual void detect(InputArray image, InputArray rois, OutputArray objects, const int level, Stream& stream = Stream::Null()) const;
 
     // Convert ROI matrix into the suitable for detect method.
     // Param roi is an input matrix of the same size as the image.
diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu
index 7f7a10e92..3d3536683 100644
--- a/modules/gpu/src/cuda/isf-sc.cu
+++ b/modules/gpu/src/cuda/isf-sc.cu
@@ -45,15 +45,6 @@
 #include <stdio.h>
 #include <float.h>
 
-// #define LOG_CUDA_CASCADE
-
-#if defined LOG_CUDA_CASCADE
-# define dprintf(format, ...) \
-            do { printf(format, __VA_ARGS__); } while (0)
-#else
-# define dprintf(format, ...)
-#endif
-
 namespace cv { namespace gpu { namespace device {
 namespace icf {
 
@@ -254,12 +245,12 @@ __global__ void soft_cascade(const CascadeInvoker<Policy> invoker, Detection* ob
 
 template<typename Policy>
 void CascadeInvoker<Policy>::operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv,
-    PtrStepSz<uchar4> objects, PtrStepSzi counter, const int downscales, const int scale, const cudaStream_t& stream) const
+    PtrStepSz<uchar4> objects, PtrStepSzi counter, const int downscales, const cudaStream_t& stream) const
 {
     int fw = roi.rows;
     int fh = roi.cols;
 
-    dim3 grid(fw, fh / Policy::STA_Y, (scale == -1) ? downscales : 1);
+    dim3 grid(fw, fh / Policy::STA_Y, downscales);
 
     uint* ctr = (uint*)(counter.ptr(0));
     Detection* det = (Detection*)objects.ptr();
@@ -268,26 +259,16 @@ void CascadeInvoker<Policy>::operator()(const PtrStepSzb& roi, const PtrStepSzi&
     cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>();
     cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step));
 
-    cudaChannelFormatDesc desc_roi = cudaCreateChannelDesc<float2>();
-    cudaSafeCall( cudaBindTexture2D(0, troi, roi.data, desc_roi, roi.cols / 8, roi.rows, roi.step));
+    cudaChannelFormatDesc desc_roi = cudaCreateChannelDesc<typename Policy::roi_type>();
+    cudaSafeCall( cudaBindTexture2D(0, troi, roi.data, desc_roi, roi.cols / Policy::STA_Y, roi.rows, roi.step));
 
     const CascadeInvoker<Policy> inv = *this;
 
-    if (scale == -1)
-    {
-        soft_cascade<Policy, false><<<grid, Policy::block(), 0, stream>>>(inv, det, max_det, ctr, 0);
-        cudaSafeCall( cudaGetLastError());
+    soft_cascade<Policy, false><<<grid, Policy::block(), 0, stream>>>(inv, det, max_det, ctr, 0);
+    cudaSafeCall( cudaGetLastError());
 
-        grid = dim3(fw, fh / Policy::STA_Y, scales - downscales);
-        soft_cascade<Policy, true><<<grid, Policy::block(), 0, stream>>>(inv, det, max_det, ctr, downscales);
-    }
-    else
-    {
-        if (scale >= downscales)
-            soft_cascade<Policy, true><<<grid, Policy::block(), 0, stream>>>(inv, det, max_det, ctr, scale);
-        else
-            soft_cascade<Policy, false><<<grid, Policy::block(), 0, stream>>>(inv, det, max_det, ctr, scale);
-    }
+    grid = dim3(fw, fh / Policy::STA_Y, scales - downscales);
+    soft_cascade<Policy, true><<<grid, Policy::block(), 0, stream>>>(inv, det, max_det, ctr, downscales);
 
     if (!stream)
     {
@@ -297,7 +278,7 @@ void CascadeInvoker<Policy>::operator()(const PtrStepSzb& roi, const PtrStepSzi&
 }
 
 template void CascadeInvoker<GK107PolicyX4>::operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv,
-    PtrStepSz<uchar4> objects, PtrStepSzi counter, const int downscales, const int scale, const cudaStream_t& stream) const;
+    PtrStepSz<uchar4> objects, PtrStepSzi counter, const int downscales, const cudaStream_t& stream) const;
 
 }
 }}}
\ No newline at end of file
diff --git a/modules/gpu/src/icf.hpp b/modules/gpu/src/icf.hpp
index 8eb080e23..2bbbb64d2 100644
--- a/modules/gpu/src/icf.hpp
+++ b/modules/gpu/src/icf.hpp
@@ -118,9 +118,10 @@ struct __align__(16) Detection
 struct GK107PolicyX4
 {
     enum {WARP = 32, STA_X = WARP, STA_Y = 8, SHRINKAGE = 4};
+    typedef float2 roi_type;
     static const dim3 block()
     {
-        return dim3(GK107PolicyX4::STA_X, GK107PolicyX4::STA_Y);
+        return dim3(STA_X, STA_Y);
     }
 };
 
@@ -146,7 +147,7 @@ struct CascadeInvoker
     int scales;
 
     void operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv, PtrStepSz<uchar4> objects,
-        PtrStepSzi counter, const int downscales, const int csale = -1, const cudaStream_t& stream = 0) const;
+        PtrStepSzi counter, const int downscales, const cudaStream_t& stream = 0) const;
 
     template<bool isUp>
     __device void detect(Detection* objects, const uint ndetections, uint* ctr, const int downscales) const;
diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index ad6e00027..5da3abf53 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -311,13 +311,13 @@ struct cv::gpu::SCascade::Fields
         leaves.upload(hleaves);
     }
 
-    void detect(int scale, const cv::gpu::GpuMat& roi, const cv::gpu::GpuMat& count, cv::gpu::GpuMat& objects, const cudaStream_t& stream) const
+    void detect(const cv::gpu::GpuMat& roi, const cv::gpu::GpuMat& count, cv::gpu::GpuMat& objects, const cudaStream_t& stream) const
     {
         cudaMemset(count.data, 0, sizeof(Detection));
         cudaSafeCall( cudaGetLastError());
         device::icf::CascadeInvoker<device::icf::GK107PolicyX4> invoker
         = device::icf::CascadeInvoker<device::icf::GK107PolicyX4>(levels, octaves, stages, nodes, leaves);
-        invoker(roi, hogluv, objects, count, downscales, scale, stream);
+        invoker(roi, hogluv, objects, count, downscales, stream);
     }
 
     void preprocess(const cv::gpu::GpuMat& colored, Stream& s)
@@ -521,36 +521,7 @@ void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _
     objects = GpuMat(objects, cv::Rect( sizeof(Detection), 0, objects.cols -  sizeof(Detection), 1));
     cudaStream_t stream = StreamAccessor::getStream(s);
 
-    flds.detect(-1, rois, tmp, objects, stream);
-}
-
-void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _objects, const int level, Stream& s) const
-{
-    CV_Assert(fields);
-
-    const GpuMat colored = image.getGpuMat();
-    // only color images are supperted
-    CV_Assert(colored.type() == CV_8UC3 || colored.type() == CV_32SC1);
-
-    Fields& flds = *fields;
-    if (colored.type() == CV_8UC3)
-    {
-        // only this window size allowed
-        // CV_Assert(colored.cols == Fields::FRAME_WIDTH && colored.rows == Fields::FRAME_HEIGHT);
-        flds.preprocess(colored, s);
-    }
-    else
-    {
-        colored.copyTo(flds.hogluv);
-    }
-
-    GpuMat rois = _rois.getGpuMat(), objects = _objects.getGpuMat();
-
-    GpuMat tmp = GpuMat(objects, cv::Rect(0, 0, sizeof(Detection), 1));
-    objects = GpuMat(objects, cv::Rect( sizeof(Detection), 0, objects.cols -  sizeof(Detection), 1));
-    cudaStream_t stream = StreamAccessor::getStream(s);
-
-    flds.detect(level, rois, tmp, objects, stream);
+    flds.detect(rois, tmp, objects, stream);
 }
 
 void cv::gpu::SCascade::genRoi(InputArray _roi, OutputArray _mask, Stream& stream) const
diff --git a/modules/gpu/test/test_softcascade.cpp b/modules/gpu/test/test_softcascade.cpp
index 7034b33b9..e36c28904 100644
--- a/modules/gpu/test/test_softcascade.cpp
+++ b/modules/gpu/test/test_softcascade.cpp
@@ -195,55 +195,55 @@ GPU_TEST_P(SCascadeTestRoi, detect,
 
 }
 
-typedef ::testing::TestWithParam<std::tr1::tuple<cv::gpu::DeviceInfo, std::string, std::string, int> > SCascadeTestLevel;
-GPU_TEST_P(SCascadeTestLevel, detect,
-        testing::Combine(
-        ALL_DEVICES,
-        testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")),
-        testing::Values(std::string("../cv/cascadeandhog/bahnhof/image_00000000_0.png")),
-        testing::Range(0, 47)
-        ))
-{
-    cv::gpu::setDevice(GET_PARAM(0).deviceID());
+// typedef ::testing::TestWithParam<std::tr1::tuple<cv::gpu::DeviceInfo, std::string, std::string, int> > SCascadeTestLevel;
+// GPU_TEST_P(SCascadeTestLevel, detect,
+//         testing::Combine(
+//         ALL_DEVICES,
+//         testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")),
+//         testing::Values(std::string("../cv/cascadeandhog/bahnhof/image_00000000_0.png")),
+//         testing::Range(0, 47)
+//         ))
+// {
+//     cv::gpu::setDevice(GET_PARAM(0).deviceID());
 
-    cv::gpu::SCascade cascade;
+//     cv::gpu::SCascade cascade;
 
-    cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(1)), cv::FileStorage::READ);
-    ASSERT_TRUE(fs.isOpened());
+//     cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(1)), cv::FileStorage::READ);
+//     ASSERT_TRUE(fs.isOpened());
 
-    ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode()));
+//     ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode()));
 
-    cv::Mat coloredCpu = cv::imread(cvtest::TS::ptr()->get_data_path() + GET_PARAM(2));
-    ASSERT_FALSE(coloredCpu.empty());
+//     cv::Mat coloredCpu = cv::imread(cvtest::TS::ptr()->get_data_path() + GET_PARAM(2));
+//     ASSERT_FALSE(coloredCpu.empty());
 
-    typedef cv::gpu::SCascade::Detection Detection;
-    GpuMat colored(coloredCpu), objectBoxes(1, 100 * sizeof(Detection), CV_8UC1), rois(colored.size(), CV_8UC1);
-    rois.setTo(1);
+//     typedef cv::gpu::SCascade::Detection Detection;
+//     GpuMat colored(coloredCpu), objectBoxes(1, 100 * sizeof(Detection), CV_8UC1), rois(colored.size(), CV_8UC1);
+//     rois.setTo(1);
 
-    cv::gpu::GpuMat trois;
-    cascade.genRoi(rois, trois);
-    objectBoxes.setTo(0);
-    int level = GET_PARAM(3);
-    cascade.detect(colored, trois, objectBoxes, level);
+//     cv::gpu::GpuMat trois;
+//     cascade.genRoi(rois, trois);
+//     objectBoxes.setTo(0);
+//     int level = GET_PARAM(3);
+//     cascade.detect(colored, trois, objectBoxes, level);
 
-    cv::Mat dt(objectBoxes);
+//     cv::Mat dt(objectBoxes);
 
-    Detection* dts = ((Detection*)dt.data) + 1;
-    int* count = dt.ptr<int>(0);
+//     Detection* dts = ((Detection*)dt.data) + 1;
+//     int* count = dt.ptr<int>(0);
 
-    cv::Mat result(coloredCpu);
+//     cv::Mat result(coloredCpu);
 
-    printTotal(std::cout, *count);
-    for (int i = 0; i  < *count; ++i)
-    {
-        Detection d = dts[i];
-        print(std::cout, d);
-        cv::rectangle(result, cv::Rect(d.x, d.y, d.w, d.h), cv::Scalar(255, 0, 0, 255), 1);
-    }
+//     printTotal(std::cout, *count);
+//     for (int i = 0; i  < *count; ++i)
+//     {
+//         Detection d = dts[i];
+//         print(std::cout, d);
+//         cv::rectangle(result, cv::Rect(d.x, d.y, d.w, d.h), cv::Scalar(255, 0, 0, 255), 1);
+//     }
 
-    writeResult(result, level);
-    SHOW(result);
-}
+//     writeResult(result, level);
+//     SHOW(result);
+// }
 
 TEST(SCascadeTest, readCascade)
 {

From a30bbda3bddfa55c1e6261e1d872c262f69deb43 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Wed, 14 Nov 2012 14:36:59 +0400
Subject: [PATCH 65/74] remove hardcoded values

---
 modules/gpu/src/cuda/isf-sc.cu  |  5 +++--
 modules/gpu/src/icf.hpp         |  2 +-
 modules/gpu/src/softcascade.cpp | 11 ++++++-----
 3 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu
index 3d3536683..a4496bf67 100644
--- a/modules/gpu/src/cuda/isf-sc.cu
+++ b/modules/gpu/src/cuda/isf-sc.cu
@@ -181,6 +181,7 @@ __device void CascadeInvoker<Policy>::detect(Detection* objects, const uint ndet
     int st = level.octave * level.step;
     const int stEnd = st + level.step;
 
+    const int hogluvStep = gridDim.y * Policy::STA_Y;
     float confidence = 0.f;
     for(; st < stEnd; st += Policy::WARP)
     {
@@ -189,13 +190,13 @@ __device void CascadeInvoker<Policy>::detect(Detection* objects, const uint ndet
         Node node = nodes[nId];
 
         float threshold = rescale<isUp>(level, node);
-        int sum = get<isUp>(x, y + (node.threshold >> 28) * 120, node.rect);
+        int sum = get<isUp>(x, y + (node.threshold >> 28) * hogluvStep, node.rect);
 
         int next = 1 + (int)(sum >= threshold);
 
         node = nodes[nId + next];
         threshold = rescale<isUp>(level, node);
-        sum = get<isUp>(x, y + (node.threshold >> 28) * 120, node.rect);
+        sum = get<isUp>(x, y + (node.threshold >> 28) * hogluvStep, node.rect);
 
         const int lShift = (next - 1) * 2 + (int)(sum >= threshold);
         float impact = leaves[(st + threadIdx.x) * 4 + lShift];
diff --git a/modules/gpu/src/icf.hpp b/modules/gpu/src/icf.hpp
index 2bbbb64d2..454ac30da 100644
--- a/modules/gpu/src/icf.hpp
+++ b/modules/gpu/src/icf.hpp
@@ -130,7 +130,7 @@ struct CascadeInvoker
 {
     CascadeInvoker(): levels(0), stages(0), nodes(0), leaves(0), scales(0) {}
 
-    CascadeInvoker(const PtrStepSzb& _levels, const PtrStepSzb& _octaves, const PtrStepSzf& _stages,
+    CascadeInvoker(const PtrStepSzb& _levels, const PtrStepSzf& _stages,
                    const PtrStepSzb& _nodes,  const PtrStepSzf& _leaves)
     : levels((const Level*)_levels.ptr()),
       stages((const float*)_stages.ptr()),
diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index 5da3abf53..038654225 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -316,7 +316,7 @@ struct cv::gpu::SCascade::Fields
         cudaMemset(count.data, 0, sizeof(Detection));
         cudaSafeCall( cudaGetLastError());
         device::icf::CascadeInvoker<device::icf::GK107PolicyX4> invoker
-        = device::icf::CascadeInvoker<device::icf::GK107PolicyX4>(levels, octaves, stages, nodes, leaves);
+        = device::icf::CascadeInvoker<device::icf::GK107PolicyX4>(levels, stages, nodes, leaves);
         invoker(roi, hogluv, objects, count, downscales, stream);
     }
 
@@ -414,7 +414,7 @@ private:
     void integrate(const int fh, const int fw, Stream& s)
     {
         GpuMat channels(plane, cv::Rect(0, 0, fw, fh * Fields::HOG_LUV_BINS));
-        cv::gpu::resize(channels, shrunk, cv::Size(), 0.25, 0.25, CV_INTER_AREA, s);
+        cv::gpu::resize(channels, shrunk, cv::Size(), 1.f / shrinkage, 1.f / shrinkage, CV_INTER_AREA, s);
 
         if (info.majorVersion() < 3)
             cv::gpu::integralBuffered(shrunk, hogluv, integralBuffer, s);
@@ -518,7 +518,7 @@ void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _
 
 
     GpuMat tmp = GpuMat(objects, cv::Rect(0, 0, sizeof(Detection), 1));
-    objects = GpuMat(objects, cv::Rect( sizeof(Detection), 0, objects.cols -  sizeof(Detection), 1));
+    objects = GpuMat(objects, cv::Rect( sizeof(Detection), 0, objects.cols - sizeof(Detection), 1));
     cudaStream_t stream = StreamAccessor::getStream(s);
 
     flds.detect(rois, tmp, objects, stream);
@@ -527,13 +527,14 @@ void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _
 void cv::gpu::SCascade::genRoi(InputArray _roi, OutputArray _mask, Stream& stream) const
 {
     CV_Assert(fields);
+    int shr = (*fields).shrinkage;
 
     const GpuMat roi = _roi.getGpuMat();
-    _mask.create( roi.cols / 4, roi.rows / 4, roi.type() );
+    _mask.create( roi.cols / shr, roi.rows / shr, roi.type() );
     GpuMat mask = _mask.getGpuMat();
     cv::gpu::GpuMat tmp;
 
-    cv::gpu::resize(roi, tmp, cv::Size(), 0.25, 0.25, CV_INTER_AREA, stream);
+    cv::gpu::resize(roi, tmp, cv::Size(), 1.f / shr, 1.f / shr, CV_INTER_AREA, stream);
     cv::gpu::transpose(tmp, mask, stream);
 }
 

From 781c04324eab9537dc3ddb0b01f75975990b8e14 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Wed, 14 Nov 2012 14:47:00 +0400
Subject: [PATCH 66/74] refactor: PrefixSum

---
 modules/gpu/src/cuda/isf-sc.cu | 60 ++++++++++++++++++++--------------
 1 file changed, 35 insertions(+), 25 deletions(-)

diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu
index a4496bf67..27d60e637 100644
--- a/modules/gpu/src/cuda/isf-sc.cu
+++ b/modules/gpu/src/cuda/isf-sc.cu
@@ -79,6 +79,39 @@ namespace icf {
         }
     }
 
+    template<typename Policy>
+    struct PrefixSum
+    {
+    __device static void apply(float& impact)
+        {
+    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
+    #pragma unroll
+            // scan on shuffl functions
+            for (int i = 1; i < Policy::WARP; i *= 2)
+            {
+                const float n = __shfl_up(impact, i, Policy::WARP);
+
+                if (threadIdx.x >= i)
+                    impact += n;
+            }
+    #else
+            __shared__ volatile float ptr[Policy::STA_X * Policy::STA_Y];
+
+            const int idx = threadIdx.y * Policy::STA_X + threadIdx.x;
+
+            ptr[idx] = impact;
+
+            if ( threadIdx.x >=  1) ptr [idx ] = (ptr [idx -  1] + ptr [idx]);
+            if ( threadIdx.x >=  2) ptr [idx ] = (ptr [idx -  2] + ptr [idx]);
+            if ( threadIdx.x >=  4) ptr [idx ] = (ptr [idx -  4] + ptr [idx]);
+            if ( threadIdx.x >=  8) ptr [idx ] = (ptr [idx -  8] + ptr [idx]);
+            if ( threadIdx.x >= 16) ptr [idx ] = (ptr [idx - 16] + ptr [idx]);
+
+            impact = ptr[idx];
+    #endif
+        }
+    };
+
     texture<int,  cudaTextureType2D, cudaReadModeElementType> thogluv;
 
     template<bool isUp>
@@ -201,32 +234,9 @@ __device void CascadeInvoker<Policy>::detect(Detection* objects, const uint ndet
         const int lShift = (next - 1) * 2 + (int)(sum >= threshold);
         float impact = leaves[(st + threadIdx.x) * 4 + lShift];
 
-#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
-#pragma unroll
-        // scan on shuffl functions
-        for (int i = 1; i < Policy::WARP; i *= 2)
-        {
-            const float n = __shfl_up(impact, i, Policy::WARP);
-
-            if (threadIdx.x >= i)
-                impact += n;
-        }
-#else
-        __shared__ volatile float ptr[Policy::STA_X * Policy::STA_Y];
-
-        const int idx = threadIdx.y * Policy::STA_X + threadIdx.x;
-
-        ptr[idx] = impact;
-
-        if ( threadIdx.x >=  1) ptr [idx ] = (ptr [idx -  1] + ptr [idx]);
-        if ( threadIdx.x >=  2) ptr [idx ] = (ptr [idx -  2] + ptr [idx]);
-        if ( threadIdx.x >=  4) ptr [idx ] = (ptr [idx -  4] + ptr [idx]);
-        if ( threadIdx.x >=  8) ptr [idx ] = (ptr [idx -  8] + ptr [idx]);
-        if ( threadIdx.x >= 16) ptr [idx ] = (ptr [idx - 16] + ptr [idx]);
-
-        impact = ptr[idx];
-#endif
+        PrefixSum<Policy>::apply(impact);
         confidence += impact;
+
         if(__any((confidence <= stages[(st + threadIdx.x)]))) st += 2048;
     }
 

From ee4f003e725cea21690d361f03c2596304d28720 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Wed, 14 Nov 2012 14:49:38 +0400
Subject: [PATCH 67/74] fixed typo

---
 modules/gpu/src/cuda/{isf-sc.cu => icf-sc.cu} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename modules/gpu/src/cuda/{isf-sc.cu => icf-sc.cu} (100%)

diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/icf-sc.cu
similarity index 100%
rename from modules/gpu/src/cuda/isf-sc.cu
rename to modules/gpu/src/cuda/icf-sc.cu

From c3e4a52fbe44ae5bf2f754e922a2e932e3d20a28 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Wed, 14 Nov 2012 20:11:18 +0400
Subject: [PATCH 68/74] soft cascade sample

---
 modules/gpu/src/softcascade.cpp |   4 +-
 samples/gpu/softcascade.cpp     | 106 ++++++++++++++++++++++++++++++++
 2 files changed, 109 insertions(+), 1 deletion(-)
 create mode 100644 samples/gpu/softcascade.cpp

diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index 038654225..7f5221f37 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -287,7 +287,8 @@ struct cv::gpu::SCascade::Fields
 
     bool update(int fh, int fw, int shr)
     {
-        if (fh == luv.rows && fh == luv.cols) return false;
+        if ((fh == luv.rows) && (fw == luv.cols)) return false;
+
         plane.create(fh * (HOG_LUV_BINS + 1), fw, CV_8UC1);
         fplane.create(fh * HOG_BINS, fw, CV_32FC1);
         luv.create(fh, fw, CV_8UC3);
@@ -297,6 +298,7 @@ struct cv::gpu::SCascade::Fields
 
         hogluv.create((fh / shr) * HOG_LUV_BINS + 1, fw / shr + 1, CV_32SC1);
         hogluv.setTo(cv::Scalar::all(0));
+
         return true;
     }
 
diff --git a/samples/gpu/softcascade.cpp b/samples/gpu/softcascade.cpp
new file mode 100644
index 000000000..3c08fdb1c
--- /dev/null
+++ b/samples/gpu/softcascade.cpp
@@ -0,0 +1,106 @@
+#include <opencv2/gpu/gpu.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <iostream>
+
+int main(int argc, char** argv)
+{
+    const std::string keys =
+        "{help h usage ?    |     | print this message }"
+        "{cascade c         |     | path to configuration xml }"
+        "{frames f          |     | path to configuration xml }"
+        "{min_scale         |0.4f | path to configuration xml }"
+        "{max_scale         |5.0f | path to configuration xml }"
+        "{total_scales      |55   | path to configuration xml }"
+        "{device d          |0    | path to configuration xml }"
+    ;
+
+    cv::CommandLineParser parser(argc, argv, keys);
+    parser.about("Soft cascade training application.");
+
+    if (parser.has("help"))
+    {
+        parser.printMessage();
+        return 0;
+    }
+
+    if (!parser.check())
+    {
+        parser.printErrors();
+        return 1;
+    }
+
+    cv::gpu::setDevice(parser.get<int>("device"));
+
+    std::string cascadePath = parser.get<std::string>("cascade");
+
+    cv::FileStorage fs(cascadePath, cv::FileStorage::READ);
+    if(!fs.isOpened())
+    {
+        std::cout << "Soft Cascade file " << cascadePath << " can't be opened." << std::endl << std::flush;
+        return 1;
+    }
+
+    std::cout << "Read cascade from file " << cascadePath << std::endl;
+
+    float minScale =  parser.get<float>("min_scale");
+    float maxScale =  parser.get<float>("max_scale");
+    int scales     =  parser.get<int>("total_scales");
+
+    using cv::gpu::SCascade;
+    SCascade cascade(minScale, maxScale, scales);
+
+    if (!cascade.load(fs.getFirstTopLevelNode()))
+    {
+        std::cout << "Soft Cascade can't be parsed." << std::endl << std::flush;
+        return 1;
+    }
+
+    std::string frames = parser.get<std::string>("frames");
+    cv::VideoCapture capture(frames);
+    if(!capture.isOpened())
+    {
+        std::cout << "Frame source " << frames << " can't be opened." << std::endl << std::flush;
+        return 1;
+    }
+
+    cv::gpu::GpuMat objects(1, sizeof(SCascade::Detection) * 10000, CV_8UC1);
+    cv::gpu::printShortCudaDeviceInfo(parser.get<int>("device"));
+    for (;;)
+    {
+        cv::Mat frame;
+        if (!capture.read(frame))
+        {
+            std::cout << "Nothing to read. " << std::endl << std::flush;
+            return 0;
+        }
+
+        cv::gpu::GpuMat dframe(frame), roi(frame.rows, frame.cols, CV_8UC1), trois;
+        roi.setTo(cv::Scalar::all(1));
+        cascade.genRoi(roi, trois);
+        cascade.detect(dframe, trois, objects);
+
+        cv::Mat dt(objects);
+        typedef cv::gpu::SCascade::Detection Detection;
+
+        Detection* dts = ((Detection*)dt.data) + 1;
+        int* count = dt.ptr<int>(0);
+
+        std::cout << *count << std::endl;
+
+        cv::Mat result;
+        frame.copyTo(result);
+
+
+        for (int i = 0; i < *count; ++i)
+        {
+            Detection d = dts[i];
+            cv::rectangle(result, cv::Rect(d.x, d.y, d.w, d.h), cv::Scalar(255, 0, 0, 255), 1);
+        }
+
+        std::cout << "working..." << std::endl;
+        cv::imshow("Soft Cascade demo", result);
+        cv::waitKey(10);
+    }
+
+    return 0;
+}
\ No newline at end of file

From 0865227049b465bc61dc104627dbbf50611f3436 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Wed, 14 Nov 2012 20:28:08 +0400
Subject: [PATCH 69/74] check if scaling values changed

---
 modules/gpu/src/softcascade.cpp | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index 7f5221f37..bf543150b 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -248,6 +248,17 @@ struct cv::gpu::SCascade::Fields
         return fields;
     }
 
+    bool check(float mins,float  maxs, int scales)
+    {
+        bool updated = (minScale == mins) || (maxScale == maxs) || (totals = scales);
+
+        minScale = mins;
+        maxScale = maxScale;
+        totals   = scales;
+
+        return updated;
+    }
+
     int createLevels(const int fh, const int fw)
     {
         using namespace device::icf;
@@ -509,7 +520,7 @@ void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _
 
     if (colored.type() == CV_8UC3)
     {
-        if (!flds.update(colored.rows, colored.cols, flds.shrinkage))
+        if (!flds.update(colored.rows, colored.cols, flds.shrinkage) || flds.check(minScale, maxScale, scales))
             flds.createLevels(colored.rows, colored.cols);
         flds.preprocess(colored, s);
     }

From a9f10e5cadeb64bf956e5bef624a494bf65142ce Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Wed, 14 Nov 2012 20:34:17 +0400
Subject: [PATCH 70/74] fixed compile without cuda

---
 modules/gpu/src/softcascade.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index bf543150b..d5a8e8481 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -52,7 +52,6 @@ cv::gpu::SCascade::~SCascade() { throw_nogpu(); }
 bool cv::gpu::SCascade::load(const FileNode&) { throw_nogpu(); return false;}
 
 void cv::gpu::SCascade::detect(InputArray, InputArray, OutputArray, Stream&) const { throw_nogpu(); }
-void cv::gpu::SCascade::detect(InputArray, InputArray, OutputArray, const int, Stream&) const { throw_nogpu(); }
 
 void cv::gpu::SCascade::genRoi(InputArray, OutputArray, Stream&) const { throw_nogpu(); }
 

From d2e88e1d4d0e7cf803187e97588a0a4a4338444f Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Mon, 26 Nov 2012 15:26:11 +0400
Subject: [PATCH 71/74] nms: part 1

---
 modules/gpu/include/opencv2/gpu/gpu.hpp |  6 ++-
 modules/gpu/src/cuda/icf-sc.cu          | 67 ++++++++++++++++++++++++-
 modules/gpu/src/gpu_init.cpp            |  8 +--
 modules/gpu/src/softcascade.cpp         | 22 +++++++-
 4 files changed, 95 insertions(+), 8 deletions(-)

diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index db228a69b..8362282b0 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -1552,12 +1552,14 @@ public:
         enum {PEDESTRIAN = 0};
     };
 
+    enum { NO_REJECT = 1, DOLLAR = 2, /*PASCAL = 4,*/ DEFAULT = NO_REJECT};
+
     // An empty cascade will be created.
     // Param minScale is a minimum scale relative to the original size of the image on which cascade will be applyed.
     // Param minScale is a maximum scale relative to the original size of the image on which cascade will be applyed.
     // Param scales is a number of scales from minScale to maxScale.
     // Param rejfactor is used for NMS.
-    SCascade(const double minScale = 0.4, const double maxScale = 5., const int scales = 55, const int rejfactor = 1);
+    SCascade(const double minScale = 0.4, const double maxScale = 5., const int scales = 55, const int rejCriteria = 1);
 
     virtual ~SCascade();
 
@@ -1595,7 +1597,7 @@ private:
     double maxScale;
 
     int scales;
-    int rejfactor;
+    int rejCriteria;
 };
 
 ////////////////////////////////// SURF //////////////////////////////////////////
diff --git a/modules/gpu/src/cuda/icf-sc.cu b/modules/gpu/src/cuda/icf-sc.cu
index 27d60e637..5334441d8 100644
--- a/modules/gpu/src/cuda/icf-sc.cu
+++ b/modules/gpu/src/cuda/icf-sc.cu
@@ -41,9 +41,10 @@
 //M*/
 
 #include <opencv2/gpu/device/common.hpp>
+
 #include <icf.hpp>
-#include <stdio.h>
 #include <float.h>
+#include <stdio.h>
 
 namespace cv { namespace gpu { namespace device {
 namespace icf {
@@ -79,6 +80,70 @@ namespace icf {
         }
     }
 
+    __device__ __forceinline__ float overlapArea(const Detection &a, const Detection &b)
+    {
+        int w = ::min(a.x + a.w, b.x + b.w) - ::max(a.x, b.x);
+        int h = ::min(a.y + a.h, b.y + b.h) - ::max(a.y, b.y);
+
+        return (w < 0 || h < 0)? 0.f : (float)(w * h);
+    }
+
+    __global__ void overlap(const uint* n, const Detection* detections, uchar* overlaps)
+    {
+        const int idx = threadIdx.x;
+        const int total = *n;
+
+        for (int i = idx; i < total; i += 192)
+        {
+            const Detection& a = detections[i];
+            bool excluded = false;
+
+            for (int j = i + 1; j < total; ++j)
+            {
+                const Detection& b = detections[j];
+                float ovl = overlapArea(a, b) / ::min(a.w * a.h, b.w * b.h);
+
+                if (ovl > 0.65f)
+                {
+                    int suppessed = (a.confidence > b.confidence)? j : i;
+                    overlaps[suppessed] = 1;
+                    excluded = excluded || (suppessed == i);
+                }
+
+                if (__all(excluded)) break;
+            }
+        }
+    }
+
+    __global__ void collect(const uint* n, const Detection* detections, uchar* overlaps)
+    {
+        const int idx = threadIdx.x;
+        const int total = *n;
+
+        for (int i = idx; i < total; i += 192)
+        {
+            if (!overlaps[i])
+            {
+                const Detection& det = detections[i];
+                // printf("%d: %d %d %d %d %f\n", i, det.x, det.y, det.w, det.h, det.confidence );
+            }
+        }
+    }
+
+    void suppress(const PtrStepSzb& objects, PtrStepSzb overlaps, PtrStepSzi ndetections)
+    {
+        int block = 192;
+        int grid = 1;
+
+        overlap<<<grid, block>>>((uint*)ndetections.ptr(0), (Detection*)objects.ptr(0), (uchar*)overlaps.ptr(0));
+        collect<<<grid, block>>>((uint*)ndetections.ptr(0), (Detection*)objects.ptr(0), (uchar*)overlaps.ptr(0));
+        // if (!stream)
+        {
+            cudaSafeCall( cudaGetLastError());
+            cudaSafeCall( cudaDeviceSynchronize());
+        }
+    }
+
     template<typename Policy>
     struct PrefixSum
     {
diff --git a/modules/gpu/src/gpu_init.cpp b/modules/gpu/src/gpu_init.cpp
index f25bc2ceb..773a8b64e 100644
--- a/modules/gpu/src/gpu_init.cpp
+++ b/modules/gpu/src/gpu_init.cpp
@@ -46,10 +46,10 @@ namespace cv { namespace gpu
 {
 
 CV_INIT_ALGORITHM(SCascade, "CascadeDetector.SCascade",
-                  obj.info()->addParam(obj, "minScale",  obj.minScale);
-                  obj.info()->addParam(obj, "maxScale",  obj.maxScale);
-                  obj.info()->addParam(obj, "scales",    obj.scales);
-                  obj.info()->addParam(obj, "rejfactor", obj.rejfactor));
+                  obj.info()->addParam(obj, "minScale",    obj.minScale);
+                  obj.info()->addParam(obj, "maxScale",    obj.maxScale);
+                  obj.info()->addParam(obj, "scales",      obj.scales);
+                  obj.info()->addParam(obj, "rejCriteria", obj.rejCriteria));
 
 bool initModule_gpu(void)
 {
diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index d5a8e8481..35bd72e55 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -85,6 +85,8 @@ namespace cv { namespace gpu { namespace device {
 namespace icf {
     void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle,
         const int fw, const int fh, const int bins, cudaStream_t stream);
+
+    void suppress(const PtrStepSzb& objects, PtrStepSzb overlaps, PtrStepSzi ndetections);
 }
 
 namespace imgproc {
@@ -309,6 +311,8 @@ struct cv::gpu::SCascade::Fields
         hogluv.create((fh / shr) * HOG_LUV_BINS + 1, fw / shr + 1, CV_32SC1);
         hogluv.setTo(cv::Scalar::all(0));
 
+        overlaps.create(1, 5000, CV_8UC1);
+
         return true;
     }
 
@@ -437,7 +441,15 @@ private:
         }
     }
 
+#include <iostream>
 public:
+    void suppress(GpuMat& ndetections, GpuMat& objects)
+    {
+        ensureSizeIsEnough(objects.rows, objects.cols, CV_8UC1, overlaps);
+        overlaps.setTo(0);
+        device::icf::suppress(objects, overlaps, ndetections);
+        // std::cout << cv::Mat(overlaps) << std::endl;
+    }
 
     // scales range
     float minScale;
@@ -469,6 +481,9 @@ public:
     // 161x121x10
     GpuMat hogluv;
 
+    // used for area overlap computing during
+    GpuMat overlaps;
+
     // Cascade from xml
     GpuMat octaves;
     GpuMat stages;
@@ -478,6 +493,8 @@ public:
 
     GpuMat sobelBuf;
 
+    GpuMat collected;
+
     std::vector<device::icf::Octave> voctaves;
 
     DeviceInfo info;
@@ -494,7 +511,7 @@ public:
 };
 
 cv::gpu::SCascade::SCascade(const double mins, const double maxs, const int sc, const int rjf)
-: fields(0),  minScale(mins), maxScale(maxs), scales(sc), rejfactor(rjf) {}
+: fields(0),  minScale(mins), maxScale(maxs), scales(sc), rejCriteria(rjf) {}
 
 cv::gpu::SCascade::~SCascade() { delete fields; }
 
@@ -534,6 +551,9 @@ void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _
     cudaStream_t stream = StreamAccessor::getStream(s);
 
     flds.detect(rois, tmp, objects, stream);
+
+    // if (rejCriteria != NO_REJECT)
+    flds.suppress(tmp, objects);
 }
 
 void cv::gpu::SCascade::genRoi(InputArray _roi, OutputArray _mask, Stream& stream) const

From 60c0e41ba5a74ed87170c992a043bac9ddaaca2d Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Mon, 26 Nov 2012 17:22:47 +0400
Subject: [PATCH 72/74] integrate NMS (Dollar's criteria)

---
 modules/gpu/src/cuda/icf-sc.cu  | 29 +++++++++++++++++++----------
 modules/gpu/src/softcascade.cpp | 19 ++++++++++++++-----
 2 files changed, 33 insertions(+), 15 deletions(-)

diff --git a/modules/gpu/src/cuda/icf-sc.cu b/modules/gpu/src/cuda/icf-sc.cu
index 5334441d8..12b511fb0 100644
--- a/modules/gpu/src/cuda/icf-sc.cu
+++ b/modules/gpu/src/cuda/icf-sc.cu
@@ -88,19 +88,23 @@ namespace icf {
         return (w < 0 || h < 0)? 0.f : (float)(w * h);
     }
 
-    __global__ void overlap(const uint* n, const Detection* detections, uchar* overlaps)
+    texture<uint4,  cudaTextureType2D, cudaReadModeElementType> tdetections;
+
+    __global__ void overlap(const uint* n, uchar* overlaps)
     {
         const int idx = threadIdx.x;
         const int total = *n;
 
-        for (int i = idx; i < total; i += 192)
+        for (int i = idx + 1; i < total; i += 192)
         {
-            const Detection& a = detections[i];
+            const uint4 _a = tex2D(tdetections, i, 0);
+            const Detection& a = *((Detection*)(&_a));
             bool excluded = false;
 
             for (int j = i + 1; j < total; ++j)
             {
-                const Detection& b = detections[j];
+                const uint4 _b = tex2D(tdetections, j, 0);
+                const Detection& b = *((Detection*)(&_b));
                 float ovl = overlapArea(a, b) / ::min(a.w * a.h, b.w * b.h);
 
                 if (ovl > 0.65f)
@@ -115,7 +119,7 @@ namespace icf {
         }
     }
 
-    __global__ void collect(const uint* n, const Detection* detections, uchar* overlaps)
+    __global__ void collect(const uint* n, uchar* overlaps, uint* ctr, uint4* suppressed)
     {
         const int idx = threadIdx.x;
         const int total = *n;
@@ -124,19 +128,24 @@ namespace icf {
         {
             if (!overlaps[i])
             {
-                const Detection& det = detections[i];
-                // printf("%d: %d %d %d %d %f\n", i, det.x, det.y, det.w, det.h, det.confidence );
+                int oidx = atomicInc(ctr, 50);
+                suppressed[oidx] = tex2D(tdetections, i + 1, 0);
             }
         }
     }
 
-    void suppress(const PtrStepSzb& objects, PtrStepSzb overlaps, PtrStepSzi ndetections)
+    void suppress(const PtrStepSzb& objects, PtrStepSzb overlaps, PtrStepSzi ndetections, PtrStepSzb suppressed)
     {
         int block = 192;
         int grid = 1;
 
-        overlap<<<grid, block>>>((uint*)ndetections.ptr(0), (Detection*)objects.ptr(0), (uchar*)overlaps.ptr(0));
-        collect<<<grid, block>>>((uint*)ndetections.ptr(0), (Detection*)objects.ptr(0), (uchar*)overlaps.ptr(0));
+        cudaChannelFormatDesc desc = cudaCreateChannelDesc<uint4>();
+        size_t offset;
+        cudaSafeCall( cudaBindTexture2D(&offset, tdetections, objects.data, desc, objects.cols / sizeof(uint4), objects.rows, objects.step));
+
+        overlap<<<grid, block>>>((uint*)ndetections.ptr(0), (uchar*)overlaps.ptr(0));
+        collect<<<grid, block>>>((uint*)ndetections.ptr(0), (uchar*)overlaps.ptr(0), (uint*)suppressed.ptr(0), ((uint4*)suppressed.ptr(0)) + 1);
+
         // if (!stream)
         {
             cudaSafeCall( cudaGetLastError());
diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index 35bd72e55..5324e2e56 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -86,7 +86,7 @@ namespace icf {
     void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle,
         const int fw, const int fh, const int bins, cudaStream_t stream);
 
-    void suppress(const PtrStepSzb& objects, PtrStepSzb overlaps, PtrStepSzi ndetections);
+    void suppress(const PtrStepSzb& objects, PtrStepSzb overlaps, PtrStepSzi ndetections, PtrStepSzb suppressed);
 }
 
 namespace imgproc {
@@ -312,6 +312,7 @@ struct cv::gpu::SCascade::Fields
         hogluv.setTo(cv::Scalar::all(0));
 
         overlaps.create(1, 5000, CV_8UC1);
+        suppressed.create(1, sizeof(Detection) * 51, CV_8UC1);
 
         return true;
     }
@@ -447,7 +448,9 @@ public:
     {
         ensureSizeIsEnough(objects.rows, objects.cols, CV_8UC1, overlaps);
         overlaps.setTo(0);
-        device::icf::suppress(objects, overlaps, ndetections);
+        suppressed.setTo(0);
+
+        device::icf::suppress(objects, overlaps, ndetections, suppressed);
         // std::cout << cv::Mat(overlaps) << std::endl;
     }
 
@@ -484,6 +487,9 @@ public:
     // used for area overlap computing during
     GpuMat overlaps;
 
+    // used for suppression
+    GpuMat suppressed;
+
     // Cascade from xml
     GpuMat octaves;
     GpuMat stages;
@@ -525,7 +531,6 @@ bool cv::gpu::SCascade::load(const FileNode& fn)
 void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _objects, Stream& s) const
 {
     CV_Assert(fields);
-
     const GpuMat colored = image.getGpuMat();
 
     // only color images are supperted
@@ -545,6 +550,7 @@ void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _
         colored.copyTo(flds.hogluv);
     }
 
+    GpuMat spr(objects, cv::Rect(0, 0, flds.suppressed.cols, flds.suppressed.rows));
 
     GpuMat tmp = GpuMat(objects, cv::Rect(0, 0, sizeof(Detection), 1));
     objects = GpuMat(objects, cv::Rect( sizeof(Detection), 0, objects.cols - sizeof(Detection), 1));
@@ -552,8 +558,11 @@ void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _
 
     flds.detect(rois, tmp, objects, stream);
 
-    // if (rejCriteria != NO_REJECT)
-    flds.suppress(tmp, objects);
+    if (rejCriteria != NO_REJECT)
+    {
+        flds.suppress(tmp, objects);
+        flds.suppressed.copyTo(spr);
+    }
 }
 
 void cv::gpu::SCascade::genRoi(InputArray _roi, OutputArray _mask, Stream& stream) const

From bd3179bda8035f3f349678aad2d1ae0510c52870 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Mon, 26 Nov 2012 17:53:25 +0400
Subject: [PATCH 73/74] fix CUDA support for streams for NMS; refactor tests

---
 modules/gpu/src/cuda/icf-sc.cu        | 13 +++---
 modules/gpu/src/icf.hpp               |  2 +-
 modules/gpu/src/softcascade.cpp       | 62 +++++++++++++++++----------
 modules/gpu/test/test_softcascade.cpp | 52 +++++++++++++---------
 4 files changed, 78 insertions(+), 51 deletions(-)

diff --git a/modules/gpu/src/cuda/icf-sc.cu b/modules/gpu/src/cuda/icf-sc.cu
index 12b511fb0..e32379950 100644
--- a/modules/gpu/src/cuda/icf-sc.cu
+++ b/modules/gpu/src/cuda/icf-sc.cu
@@ -134,7 +134,8 @@ namespace icf {
         }
     }
 
-    void suppress(const PtrStepSzb& objects, PtrStepSzb overlaps, PtrStepSzi ndetections, PtrStepSzb suppressed)
+    void suppress(const PtrStepSzb& objects, PtrStepSzb overlaps, PtrStepSzi ndetections,
+        PtrStepSzb suppressed, cudaStream_t stream)
     {
         int block = 192;
         int grid = 1;
@@ -146,7 +147,7 @@ namespace icf {
         overlap<<<grid, block>>>((uint*)ndetections.ptr(0), (uchar*)overlaps.ptr(0));
         collect<<<grid, block>>>((uint*)ndetections.ptr(0), (uchar*)overlaps.ptr(0), (uint*)suppressed.ptr(0), ((uint4*)suppressed.ptr(0)) + 1);
 
-        // if (!stream)
+        if (!stream)
         {
             cudaSafeCall( cudaGetLastError());
             cudaSafeCall( cudaDeviceSynchronize());
@@ -330,15 +331,15 @@ __global__ void soft_cascade(const CascadeInvoker<Policy> invoker, Detection* ob
 
 template<typename Policy>
 void CascadeInvoker<Policy>::operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv,
-    PtrStepSz<uchar4> objects, PtrStepSzi counter, const int downscales, const cudaStream_t& stream) const
+    PtrStepSz<uchar4> objects, const int downscales, const cudaStream_t& stream) const
 {
     int fw = roi.rows;
     int fh = roi.cols;
 
     dim3 grid(fw, fh / Policy::STA_Y, downscales);
 
-    uint* ctr = (uint*)(counter.ptr(0));
-    Detection* det = (Detection*)objects.ptr();
+    uint* ctr = (uint*)(objects.ptr(0));
+    Detection* det = ((Detection*)objects.ptr(0)) + 1;
     uint max_det = objects.cols / sizeof(Detection);
 
     cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>();
@@ -363,7 +364,7 @@ void CascadeInvoker<Policy>::operator()(const PtrStepSzb& roi, const PtrStepSzi&
 }
 
 template void CascadeInvoker<GK107PolicyX4>::operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv,
-    PtrStepSz<uchar4> objects, PtrStepSzi counter, const int downscales, const cudaStream_t& stream) const;
+    PtrStepSz<uchar4> objects, const int downscales, const cudaStream_t& stream) const;
 
 }
 }}}
\ No newline at end of file
diff --git a/modules/gpu/src/icf.hpp b/modules/gpu/src/icf.hpp
index 454ac30da..06f99f2b1 100644
--- a/modules/gpu/src/icf.hpp
+++ b/modules/gpu/src/icf.hpp
@@ -147,7 +147,7 @@ struct CascadeInvoker
     int scales;
 
     void operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv, PtrStepSz<uchar4> objects,
-        PtrStepSzi counter, const int downscales, const cudaStream_t& stream = 0) const;
+        const int downscales, const cudaStream_t& stream = 0) const;
 
     template<bool isUp>
     __device void detect(Detection* objects, const uint ndetections, uint* ctr, const int downscales) const;
diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index 5324e2e56..37e7e3f4e 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -86,7 +86,8 @@ namespace icf {
     void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle,
         const int fw, const int fh, const int bins, cudaStream_t stream);
 
-    void suppress(const PtrStepSzb& objects, PtrStepSzb overlaps, PtrStepSzi ndetections, PtrStepSzb suppressed);
+    void suppress(const PtrStepSzb& objects, PtrStepSzb overlaps, PtrStepSzi ndetections,
+        PtrStepSzb suppressed, cudaStream_t stream);
 }
 
 namespace imgproc {
@@ -328,13 +329,20 @@ struct cv::gpu::SCascade::Fields
         leaves.upload(hleaves);
     }
 
-    void detect(const cv::gpu::GpuMat& roi, const cv::gpu::GpuMat& count, cv::gpu::GpuMat& objects, const cudaStream_t& stream) const
+    void detect(const cv::gpu::GpuMat& roi, cv::gpu::GpuMat& objects, Stream& s) const
     {
-        cudaMemset(count.data, 0, sizeof(Detection));
+        if (s)
+            s.enqueueMemSet(objects, 0);
+        else
+            cudaMemset(objects.data, 0, sizeof(Detection));
+
         cudaSafeCall( cudaGetLastError());
+
         device::icf::CascadeInvoker<device::icf::GK107PolicyX4> invoker
         = device::icf::CascadeInvoker<device::icf::GK107PolicyX4>(levels, stages, nodes, leaves);
-        invoker(roi, hogluv, objects, count, downscales, stream);
+
+        cudaStream_t stream = StreamAccessor::getStream(s);
+        invoker(roi, hogluv, objects, downscales, stream);
     }
 
     void preprocess(const cv::gpu::GpuMat& colored, Stream& s)
@@ -356,6 +364,26 @@ struct cv::gpu::SCascade::Fields
         integrate(fh, fw, s);
     }
 
+    void suppress(GpuMat& objects, Stream& s)
+    {
+        GpuMat ndetections = GpuMat(objects, cv::Rect(0, 0, sizeof(Detection), 1));
+        ensureSizeIsEnough(objects.rows, objects.cols, CV_8UC1, overlaps);
+
+        if (s)
+        {
+            s.enqueueMemSet(overlaps, 0);
+            s.enqueueMemSet(suppressed, 0);
+        }
+        else
+        {
+            overlaps.setTo(0);
+            suppressed.setTo(0);
+        }
+
+        cudaStream_t stream = StreamAccessor::getStream(s);
+        device::icf::suppress(objects, overlaps, ndetections, suppressed, stream);
+    }
+
 private:
 
     typedef std::vector<device::icf::Octave>::const_iterator  octIt_t;
@@ -442,17 +470,7 @@ private:
         }
     }
 
-#include <iostream>
 public:
-    void suppress(GpuMat& ndetections, GpuMat& objects)
-    {
-        ensureSizeIsEnough(objects.rows, objects.cols, CV_8UC1, overlaps);
-        overlaps.setTo(0);
-        suppressed.setTo(0);
-
-        device::icf::suppress(objects, overlaps, ndetections, suppressed);
-        // std::cout << cv::Mat(overlaps) << std::endl;
-    }
 
     // scales range
     float minScale;
@@ -547,20 +565,18 @@ void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _
     }
     else
     {
-        colored.copyTo(flds.hogluv);
+        if (s)
+            s.enqueueCopy(colored, flds.hogluv);
+        else
+            colored.copyTo(flds.hogluv);
     }
 
-    GpuMat spr(objects, cv::Rect(0, 0, flds.suppressed.cols, flds.suppressed.rows));
-
-    GpuMat tmp = GpuMat(objects, cv::Rect(0, 0, sizeof(Detection), 1));
-    objects = GpuMat(objects, cv::Rect( sizeof(Detection), 0, objects.cols - sizeof(Detection), 1));
-    cudaStream_t stream = StreamAccessor::getStream(s);
-
-    flds.detect(rois, tmp, objects, stream);
+    flds.detect(rois, objects, s);
 
     if (rejCriteria != NO_REJECT)
     {
-        flds.suppress(tmp, objects);
+        GpuMat spr(objects, cv::Rect(0, 0, flds.suppressed.cols, flds.suppressed.rows));
+        flds.suppress(objects, s);
         flds.suppressed.copyTo(spr);
     }
 }
diff --git a/modules/gpu/test/test_softcascade.cpp b/modules/gpu/test/test_softcascade.cpp
index e36c28904..da97d4112 100644
--- a/modules/gpu/test/test_softcascade.cpp
+++ b/modules/gpu/test/test_softcascade.cpp
@@ -47,7 +47,7 @@
 using cv::gpu::GpuMat;
 
 // show detection results on input image with cv::imshow
-#define SHOW_DETECTIONS
+// #define SHOW_DETECTIONS
 
 #if defined SHOW_DETECTIONS
 # define SHOW(res)           \
@@ -99,6 +99,35 @@ namespace {
         return std::string(s);
     }
 
+    static void print(std::ostream &out, const Detection& d)
+    {
+#if defined SHOW_DETECTIONS
+        out << "\x1b[32m[ detection]\x1b[0m ("
+            << std::setw(4)  << d.x
+            << " "
+            << std::setw(4)  << d.y
+            << ") ("
+            << std::setw(4)  << d.w
+            << " "
+            << std::setw(4)  << d.h
+            << ") "
+            << std::setw(12) << d.confidence
+            <<  std::endl;
+#else
+        (void)out; (void)d;
+#endif
+    }
+
+    static void printTotal(std::ostream &out, int detbytes)
+    {
+#if defined SHOW_DETECTIONS
+        out << "\x1b[32m[          ]\x1b[0m Total detections " << (detbytes / sizeof(Detection)) << std::endl;
+#else
+        (void)out; (void)detbytes;
+#endif
+    }
+
+#if defined SHOW_DETECTIONS
     static std::string getImageName(int level)
     {
         time_t rawtime;
@@ -112,32 +141,13 @@ namespace {
         return "gpu_rec_level_" + itoa(level)+ "_" + std::string(buffer) + ".png";
     }
 
-    static void print(std::ostream &out, const Detection& d)
-    {
-        out << "\x1b[32m[ detection]\x1b[0m ("
-            << std::setw(4)  << d.x
-            << " "
-            << std::setw(4)  << d.y
-            << ") ("
-            << std::setw(4)  << d.w
-            << " "
-            << std::setw(4)  << d.h
-            << ") "
-            << std::setw(12) << d.confidence
-            <<  std::endl;
-    }
-
-    static void printTotal(std::ostream &out, int detbytes)
-    {
-        out << "\x1b[32m[          ]\x1b[0m Total detections " << (detbytes / sizeof(Detection)) << std::endl;
-    }
-
     static void writeResult(const cv::Mat& result, const int level)
     {
         std::string path = cv::tempfile(getImageName(level).c_str());
         cv::imwrite(path, result);
         std::cout << "\x1b[32m" << "[          ]" << std::endl << "[ stored in]"<< "\x1b[0m" << path << std::endl;
     }
+#endif
 }
 
 typedef ::testing::TestWithParam<std::tr1::tuple<cv::gpu::DeviceInfo, std::string, std::string, int> > SCascadeTestRoi;

From 05cd88ae4233824a31072e1ed692ba80604ff51f Mon Sep 17 00:00:00 2001
From: "marina.kolpakova" <marina.kolpakova@itseez.com>
Date: Mon, 26 Nov 2012 18:50:08 +0400
Subject: [PATCH 74/74] clean code; fix problems in documentation

---
 modules/gpu/doc/object_detection.rst    |  6 +--
 modules/gpu/include/opencv2/gpu/gpu.hpp |  4 +-
 modules/gpu/perf/perf_softcascade.cpp   | 35 -----------------
 modules/gpu/src/cuda/icf-sc.cu          |  2 +-
 modules/gpu/src/icf.hpp                 |  8 +---
 modules/gpu/src/softcascade.cpp         |  2 +-
 modules/gpu/test/test_softcascade.cpp   | 50 -------------------------
 7 files changed, 9 insertions(+), 98 deletions(-)

diff --git a/modules/gpu/doc/object_detection.rst b/modules/gpu/doc/object_detection.rst
index c503d93fe..ce18892a1 100644
--- a/modules/gpu/doc/object_detection.rst
+++ b/modules/gpu/doc/object_detection.rst
@@ -200,7 +200,7 @@ The function is mainly used to learn the classifier.
 
 
 Soft Cascade Classifier
-======================
+==========================
 
 Soft Cascade Classifier for Object Detection
 ----------------------------------------------------------
@@ -226,7 +226,7 @@ The sample has been rejected if it fall rejection threshold. So stageless cascad
 
 SCascade
 ----------------
-.. ocv:class:: SCascade
+.. ocv:class:: SCascade : public Algorithm
 
 Implementation of soft (stageless) cascaded detector. ::
 
@@ -248,7 +248,7 @@ Implementation of soft (stageless) cascaded detector. ::
         virtual ~SCascade();
         virtual bool load(const FileNode& fn);
         virtual void detect(InputArray image, InputArray rois, OutputArray objects, Stream& stream = Stream::Null()) const;
-        void genRoi(InputArray roi, OutputArray mask, Stream& stream = Stream::Null()) const;
+        virtual void genRoi(InputArray roi, OutputArray mask, Stream& stream = Stream::Null()) const;
     };
 
 
diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index 8362282b0..f0a9e99a0 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -1586,7 +1586,7 @@ public:
     //    There non zero value mean that detector should be executed in this point.
     // Param mask is an output mask
     // Param stream is stream is a high-level CUDA stream abstraction used for asynchronous execution
-    void genRoi(InputArray roi, OutputArray mask, Stream& stream = Stream::Null()) const;
+    virtual void genRoi(InputArray roi, OutputArray mask, Stream& stream = Stream::Null()) const;
 
 private:
 
@@ -1600,6 +1600,8 @@ private:
     int rejCriteria;
 };
 
+CV_EXPORTS bool initModule_gpu(void);
+
 ////////////////////////////////// SURF //////////////////////////////////////////
 
 class CV_EXPORTS SURF_GPU
diff --git a/modules/gpu/perf/perf_softcascade.cpp b/modules/gpu/perf/perf_softcascade.cpp
index 3e82cc5bb..ae816bcf5 100644
--- a/modules/gpu/perf/perf_softcascade.cpp
+++ b/modules/gpu/perf/perf_softcascade.cpp
@@ -33,18 +33,6 @@ namespace {
             else if (a.w != b.w) return a.w < b.w;
             else return a.h < b.h;
         }
-
-        // bool operator()(const cv::SoftCascade::Detection& a,
-        //     const cv::SoftCascade::Detection& b) const
-        // {
-        //     const cv::Rect& ra = a.rect;
-        //     const cv::Rect& rb = b.rect;
-
-        //     if (ra.x != rb.x) return ra.x < rb.x;
-        //     else if (ra.y != rb.y) return ra.y < rb.y;
-        //     else if (ra.width != rb.width) return ra.width < rb.width;
-        //     else return ra.height < rb.height;
-        // }
     };
 
     cv::Mat sortDetections(cv::gpu::GpuMat& objects)
@@ -99,29 +87,6 @@ RUN_GPU(SCascadeTest, detect)
 
 NO_CPU(SCascadeTest, detect)
 
-// RUN_CPU(SCascadeTest, detect)
-// {
-//     cv::Mat colored = readImage(GET_PARAM(1));
-//     ASSERT_FALSE(colored.empty());
-
-//     cv::SCascade cascade;
-//     ASSERT_TRUE(cascade.load(getDataPath(GET_PARAM(0))));
-
-//     std::vector<cv::Rect> rois;
-
-//     typedef cv::SCascade::Detection Detection;
-//     std::vector<Detection>objects;
-//     cascade.detectMultiScale(colored, rois, objects);
-
-//     TEST_CYCLE()
-//     {
-//         cascade.detectMultiScale(colored, rois, objects);
-//     }
-
-//     std::sort(objects.begin(), objects.end(), DetectionLess());
-//     SANITY_CHECK(objects);
-// }
-
 static cv::Rect getFromTable(int idx)
 {
     static const cv::Rect rois[] =
diff --git a/modules/gpu/src/cuda/icf-sc.cu b/modules/gpu/src/cuda/icf-sc.cu
index e32379950..5fca87b4d 100644
--- a/modules/gpu/src/cuda/icf-sc.cu
+++ b/modules/gpu/src/cuda/icf-sc.cu
@@ -49,7 +49,7 @@
 namespace cv { namespace gpu { namespace device {
 namespace icf {
 
-    // ToDo: use textures or ancached load instruction.
+    // ToDo: use textures or uncached load instruction.
     __global__ void magToHist(const uchar* __restrict__ mag,
                               const float* __restrict__ angle, const int angPitch,
                                     uchar* __restrict__ hog,   const int hogPitch, const int fh)
diff --git a/modules/gpu/src/icf.hpp b/modules/gpu/src/icf.hpp
index 06f99f2b1..295255f57 100644
--- a/modules/gpu/src/icf.hpp
+++ b/modules/gpu/src/icf.hpp
@@ -45,7 +45,6 @@
 #define __OPENCV_ICF_HPP__
 
 #include <opencv2/gpu/device/common.hpp>
-#include <stdio.h>
 
 #if defined __CUDACC__
 # define __device __device__ __forceinline__
@@ -93,12 +92,7 @@ struct __align__(8) Node
 
     enum { THRESHOLD_MASK = 0x0FFFFFFF };
 
-    Node(const uchar4 r, const uint ch, const uint t) : rect(r), threshold(t + (ch << 28))
-    {
-        // printf("%d\n", t);
-        // printf("[%d %d %d %d] %d, %d\n",rect.x, rect.y, rect.z, rect.w, (int)(threshold >> 28),
-        //     (int)(0x0FFFFFFF & threshold));
-    }
+    Node(const uchar4 r, const uint ch, const uint t) : rect(r), threshold(t + (ch << 28)) {}
 };
 
 struct __align__(16) Detection
diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index 37e7e3f4e..2208369e6 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -142,7 +142,7 @@ struct cv::gpu::SCascade::Fields
         static const char * const SC_F_RECT             = "rect";
 
         FileNode fn = root[SC_OCTAVES];
-            if (fn.empty()) return false;
+        if (fn.empty()) return false;
 
         using namespace device::icf;
 
diff --git a/modules/gpu/test/test_softcascade.cpp b/modules/gpu/test/test_softcascade.cpp
index da97d4112..40bf5a0d6 100644
--- a/modules/gpu/test/test_softcascade.cpp
+++ b/modules/gpu/test/test_softcascade.cpp
@@ -205,56 +205,6 @@ GPU_TEST_P(SCascadeTestRoi, detect,
 
 }
 
-// typedef ::testing::TestWithParam<std::tr1::tuple<cv::gpu::DeviceInfo, std::string, std::string, int> > SCascadeTestLevel;
-// GPU_TEST_P(SCascadeTestLevel, detect,
-//         testing::Combine(
-//         ALL_DEVICES,
-//         testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")),
-//         testing::Values(std::string("../cv/cascadeandhog/bahnhof/image_00000000_0.png")),
-//         testing::Range(0, 47)
-//         ))
-// {
-//     cv::gpu::setDevice(GET_PARAM(0).deviceID());
-
-//     cv::gpu::SCascade cascade;
-
-//     cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(1)), cv::FileStorage::READ);
-//     ASSERT_TRUE(fs.isOpened());
-
-//     ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode()));
-
-//     cv::Mat coloredCpu = cv::imread(cvtest::TS::ptr()->get_data_path() + GET_PARAM(2));
-//     ASSERT_FALSE(coloredCpu.empty());
-
-//     typedef cv::gpu::SCascade::Detection Detection;
-//     GpuMat colored(coloredCpu), objectBoxes(1, 100 * sizeof(Detection), CV_8UC1), rois(colored.size(), CV_8UC1);
-//     rois.setTo(1);
-
-//     cv::gpu::GpuMat trois;
-//     cascade.genRoi(rois, trois);
-//     objectBoxes.setTo(0);
-//     int level = GET_PARAM(3);
-//     cascade.detect(colored, trois, objectBoxes, level);
-
-//     cv::Mat dt(objectBoxes);
-
-//     Detection* dts = ((Detection*)dt.data) + 1;
-//     int* count = dt.ptr<int>(0);
-
-//     cv::Mat result(coloredCpu);
-
-//     printTotal(std::cout, *count);
-//     for (int i = 0; i  < *count; ++i)
-//     {
-//         Detection d = dts[i];
-//         print(std::cout, d);
-//         cv::rectangle(result, cv::Rect(d.x, d.y, d.w, d.h), cv::Scalar(255, 0, 0, 255), 1);
-//     }
-
-//     writeResult(result, level);
-//     SHOW(result);
-// }
-
 TEST(SCascadeTest, readCascade)
 {
     std::string xml = cvtest::TS::ptr()->get_data_path() + "../cv/cascadeandhog/icf-template.xml";