LBP classifier was refactored; added a parameter for the maximum size of a detected object

2012-07-02 08:08:11 +00:00 · 2012-07-02 08:08:11 +00:00 · a9f2f522e7
commit a9f2f522e7
parent e6f7e4d83e
4 changed files with 113 additions and 86 deletions
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@ -1435,7 +1435,8 @@ public:
    bool load(const std::string& filename);
    void release();
-    int detectMultiScale(const GpuMat& image, GpuMat& scaledImageBuffer, GpuMat& objectsBuf, double scaleFactor = 1.1, int minNeighbors = 4/*, Size minSize = Size()*/);
+    int detectMultiScale(const GpuMat& image, GpuMat& scaledImageBuffer, GpuMat& objectsBuf, double scaleFactor = 1.1, int minNeighbors = 4,
    cv::Size maxObjectSize = cv::Size()/*, Size minSize = Size()*/);
    void preallocateIntegralBuffer(cv::Size desired);
    bool findLargestObject;
--- a/modules/gpu/src/cascadeclassifier.cpp
+++ b/modules/gpu/src/cascadeclassifier.cpp
@ -48,20 +48,6 @@ using namespace cv;
 using namespace cv::gpu;
 using namespace std;
 // Plain record describing one stage of a boosted cascade.
 // `ntrees` and `threshold` carry the per-stage classifier count and the
 // stage threshold; `first` is presumably the index of the stage's first
 // weak classifier (TODO confirm against the loader, which is not shown here).
 struct Stage
 {
    int    first;
    int    ntrees;
    float  threshold;
 };
 // Plain record for one decision-tree node: the index of the feature it tests
 // plus two integer links named `left` and `right` (presumably child or leaf
 // indices — TODO confirm; no user of this struct is visible in this view).
 struct DTreeNode
 {
    int   featureIdx;
    int   left;
    int   right;
 };
 #if !defined (HAVE_CUDA)
 // ============ old fashioned haar cascade ==============================================//
 cv::gpu::CascadeClassifier_GPU::CascadeClassifier_GPU()               { throw_nogpu(); }
@ -128,6 +114,13 @@ bool cv::gpu::CascadeClassifier_GPU_LBP::load(const string& classifierAsXml)
 #define GPU_CC_FEATURES             "features"
 #define GPU_CC_RECT                 "rect"
 // Host-side record for one cascade stage used by the LBP classifier.
 // detectMultiScale derives the stage count as stage_mat.cols / sizeof(Stage),
 // so the size of this struct is part of the contract with the stage buffer.
 // NOTE(review): the device code casts the same buffer to lbp::Stage — the two
 // layouts presumably have to stay identical; confirm when changing either.
 struct Stage
 {
    int    first;
    int    ntrees;
    float  threshold;
 };
 // currently only stump based boost classifiers are supported
 bool CascadeClassifier_GPU_LBP::read(const FileNode &root)
 {
@ -279,12 +272,26 @@ namespace cv { namespace gpu { namespace device
 {
    // Forward declarations of the LBP device-code entry points that are called
    // from this translation unit (the definition appears in the cuda/lbp.cu part
    // of this diff).
    namespace lbp
    {
        // classifyStump: runs the stump-based cascade over one scaled image.
        //   mstages/nstages          - raw stage buffer and the number of stages in it
        //   mnodes, mleaves,
        //   msubsets, mfeatures      - classifier tables handed through to the kernel
        //   integral                 - integral image of the scaled input
        //   workWidth, workHeight    - size of the area of window origins to scan
        //   clWidth, clHeight        - classifier window size reported for detections
        //   scale, step, subsetSize  - forwarded unchanged to the kernel
        //   objects                  - output buffer of int4 rectangles
-        void cascadeClassify(const DevMem2Db stages, const DevMem2Di trees, const DevMem2Db nodes, const DevMem2Df leaves, const DevMem2Di subsets, const DevMem2Db features,
+        void classifyStump(const DevMem2Db mstages,
-            const DevMem2Di integral, int workWidth, int workHeight, int clWidth, int clHeight, float scale, int step, int subsetSize, DevMem2D_<int4> objects, int minNeighbors = 4, cudaStream_t stream = 0);
+                           const int nstages,
                           const DevMem2Di mnodes,
                           const DevMem2Df mleaves,
                           const DevMem2Di msubsets,
                           const DevMem2Db mfeatures,
                           const DevMem2Di integral,
                           const int workWidth,
                           const int workHeight,
                           const int clWidth,
                           const int clHeight,
                           float scale,
                           int step,
                           int subsetSize,
                           DevMem2D_<int4> objects);
    }
 }}}
-int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, GpuMat& scaledImageBuffer, GpuMat& objects, double scaleFactor, int minNeighbors /*, Size minSize=Size()*/)
+int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, GpuMat& scaledImageBuffer, GpuMat& objects,
                                                        double scaleFactor, int minNeighbors, cv::Size maxObjectSize /*, Size minSize=Size()*/)
 {
    CV_Assert( scaleFactor > 1 && image.depth() == CV_8U );
    CV_Assert(!empty());
@ -299,28 +306,35 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
    // temp solution
    objects.create(image.rows, image.cols, CV_32SC4);
-    scaledImageBuffer.create(image.size(), image.type());
+    if (maxObjectSize == cv::Size())
        maxObjectSize = image.size();
    scaledImageBuffer.create(image.rows + 1, image.cols + 1, CV_8U);
    // TODO: specify max objects size
    for( double factor = 1; ; factor *= scaleFactor )
    {
        cv::Size windowSize(cvRound(NxM.width * factor), cvRound(NxM.height * factor));
        cv::Size scaledImageSize(cvRound( image.cols / factor ), cvRound( image.rows / factor ));
        cv::Size processingRectSize( scaledImageSize.width - NxM.width + 1, scaledImageSize.height - NxM.height + 1 );
        // nothing to do
        if (processingRectSize.width <= 0 || processingRectSize.height <= 0 )
            break;
-        // TODO: min max object sizes cheching
+
-        cv::gpu::resize(image, scaledImageBuffer, scaledImageSize, 0, 0, INTER_NEAREST);
+        if( windowSize.width > maxObjectSize.width || windowSize.height > maxObjectSize.height )
-        //prepare image for evaluation
+            break;
        // if( windowSize.width < minObjectSize.width || windowSize.height < minObjectSize.height )
        //     continue;
        cv::gpu::resize(image, scaledImageBuffer, scaledImageSize, 0, 0, CV_INTER_LINEAR);
        integral.create(cv::Size(scaledImageSize.width + 1, scaledImageSize.height + 1), CV_32SC1);
        cv::gpu::integral(scaledImageBuffer, integral);
        int step = (factor <= 2.) + 1;
-        cv::gpu::device::lbp::cascadeClassify(stage_mat, trees_mat, nodes_mat, leaves_mat, subsets_mat, features_mat,
+        cv::gpu::device::lbp::classifyStump(stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat, leaves_mat, subsets_mat, features_mat,
-         integral, processingRectSize.width, processingRectSize.height, windowSize.width, windowSize.height, scaleFactor, step, subsetSize, objects, minNeighbors);
+        integral, processingRectSize.width, processingRectSize.height, windowSize.width, windowSize.height, scaleFactor, step, subsetSize, objects);
    }
    // TODO: reject levels
--- a/modules/gpu/src/cuda/lbp.cu
+++ b/modules/gpu/src/cuda/lbp.cu
@ -46,54 +46,69 @@ namespace cv { namespace gpu { namespace device
 {
    namespace lbp
    {
-        __global__ void lbp_classify(const DevMem2D_< ::cv::gpu::device::Stage> stages, const DevMem2Di trees, const DevMem2D_< ::cv::gpu::device::ClNode> nodes,
+        __global__ void lbp_classify_stump(Stage* stages, int nstages, ClNode* nodes, const float* leaves, const int* subsets, const uchar4* features,
-            const DevMem2Df leaves, const DevMem2Di subsets,
+            const DevMem2Di integral, int workWidth, int workHeight, int clWidth, int clHeight, float scale, int step, int subsetSize, DevMem2D_<int4> objects)
            const DevMem2D_<uchar4> features, const DevMem2Di integral, float step, int subsetSize, DevMem2D_<int4> objects, float scale, int clWidth, int clHeight)
        {
            // Kernel: each thread evaluates the whole cascade for one window origin.
            // NOTE(review): the added lines derive the origin from `scale` and take
            // y from threadIdx.x / x from blockIdx.x, whereas the removed lines used
            // `step` with x from threadIdx.x / y from blockIdx.x. The host launcher
            // sizes the grid from `step` (blocks from workHeight, threads from
            // workWidth) — confirm which mapping and which multiplier is intended.
-            unsigned int x = threadIdx.x * step;
+            int y = threadIdx.x * scale;
-            unsigned int y = blockIdx.x  * step;
+            int x = blockIdx.x * scale;
            int nodeOfs = 0, leafOfs = 0;
            ::cv::gpu::device::Feature evaluator;
-            for (int s = 0; s < stages.cols; s++ )
+            int i = 0;
            {
                ::cv::gpu::device::Stage stage = stages(0, s);
                int sum = 0;
                for (int w = 0; w < stage.ntrees; w++)
                {
                    ::cv::gpu::device::ClNode node = nodes(0, nodeOfs);
                    uchar4 feature = features(0, node.featureIdx);
-                    uchar c = evaluator(y, x, feature, integral);
+            int current_node = 0;
-                    const int subsetIdx = (nodeOfs * subsetSize);
+            int current_leave = 0;
-                    int idx = subsetIdx + ((c >> 5) & ( 1 << (c & 31)) ? leafOfs : leafOfs + 1);
+
-                    sum += leaves(0, subsets(0, idx) );
+            LBP evaluator;
-                    nodeOfs++;
+            for (int s = 0; s < nstages; s++ )
-                    leafOfs += 2;
+            {
                // Sum the leaf values selected by every stump of this stage.
                float sum = 0;
                Stage stage = stages[s];
                for (int t = 0; t < stage.ntrees; t++)
                {
                    ClNode node = nodes[current_node];
                    uchar4 feature = features[node.featureIdx];
                    int c = evaluator(y, x, feature, integral);
                    // Each node owns `subsetSize` ints of bitmask: c >> 5 picks the
                    // 32-bit word and c & 31 the bit inside it. Bit set -> first
                    // leaf of the pair, bit clear -> second leaf.
                    const int* subsetIdx = subsets + (current_node * subsetSize);
                    int idx =  (subsetIdx[c >> 5] & ( 1 << (c & 31))) ? current_leave : current_leave + 1;
                    sum += leaves[idx];
                    // One node and two leaves are consumed per stump.
                    current_node += 1;
                    current_leave += 2;
                }
                // Remember the last stage evaluated; the window is rejected as soon
                // as a stage sum falls below that stage's threshold.
                i = s;
                if (sum < stage.threshold)
                    return;
            }
            // Window passed every stage: build its rectangle.
            int4 rect;
            rect.x = roundf(x * scale);
            rect.y = roundf(y * scale);
-            rect.z = roundf(clWidth * scale);
+            rect.z = roundf(clWidth);
-            rect.w = roundf(clHeight * scale);
+            rect.w = roundf(clHeight);
-            objects(blockIdx.x, threadIdx.x) = rect;
+
            // NOTE(review): the store into `objects` is on a removed line, so in the
            // added code the rectangle is only reported through this debug printf,
            // gated by a hard-coded stage index (19) — confirm this is temporary.
            if(i >= 19)
                printf( "GPU detected [%d, %d] - [%d, %d]\n", rect.x, rect.y, rect.z, rect.w);
        }
-        void cascadeClassify(const DevMem2Db bstages, const DevMem2Di trees, const DevMem2Db bnodes, const DevMem2Df leaves, const DevMem2Di subsets, const DevMem2Db bfeatures,
+        void classifyStump(const DevMem2Db mstages, const int nstages, const DevMem2Di mnodes, const DevMem2Df mleaves, const DevMem2Di msubsets, const DevMem2Db mfeatures,
-            const DevMem2Di integral, int workWidth, int workHeight, int clWidth, int clHeight, float scale, int step, int subsetSize, DevMem2D_<int4> objects, int minNeighbors, cudaStream_t stream)
+                           const DevMem2Di integral, const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize,
                           DevMem2D_<int4> objects)
        {
            // Host-side launcher for the lbp_classify_stump kernel.
            // NOTE(review): the printf calls in this function are host debug output.
            printf("CascadeClassify");
            // Launch geometry: ceil(workHeight / step) blocks and
            // ceil(workWidth / step) threads per block.
            // NOTE(review): `threads` grows with workWidth and is not clamped here;
            // confirm it cannot exceed the device's threads-per-block limit.
            int blocks  = ceilf(workHeight / (float)step);
            int threads = ceilf(workWidth / (float)step);
-            DevMem2D_< ::cv::gpu::device::Stage> stages = DevMem2D_< ::cv::gpu::device::Stage>(bstages);
+            printf("blocks %d, threads %d\n", blocks, threads);
            // NOTE(review): the next two declarations use the old parameter names
            // (bfeatures, bnodes) and clash with the pointers declared below; they
            // look like residue of the removed code in this diff view — verify.
            DevMem2D_<uchar4> features = (DevMem2D_<uchar4>)bfeatures;
            DevMem2D_< ::cv::gpu::device::ClNode> nodes = DevMem2D_< ::cv::gpu::device::ClNode>(bnodes);
-            lbp_classify<<<blocks, threads>>>(stages, trees, nodes, leaves, subsets, features, integral, step, subsetSize, objects, scale, clWidth, clHeight);
            // Reinterpret the matrix buffers as typed pointers for the kernel.
+            Stage* stages = (Stage*)(mstages.ptr());
            ClNode* nodes = (ClNode*)(mnodes.ptr());
            const float* leaves = mleaves.ptr();
            const int* subsets = msubsets.ptr();
            const uchar4* features = (uchar4*)(mfeatures.ptr());
            // Launched on the default stream; no stream argument is passed.
            lbp_classify_stump<<<blocks, threads>>>(stages, nstages, nodes, leaves, subsets, features, integral,
                workWidth, workHeight, clWidth, clHeight, scale, step, subsetSize, objects);
        }
    }
 }}}
--- a/modules/gpu/src/opencv2/gpu/device/lbp.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/lbp.hpp
@ -44,62 +44,58 @@
 #define __OPENCV_GPU_DEVICE_LBP_HPP_
 #include "internal_shared.hpp"
 // #include "opencv2/gpu/device/border_interpolate.hpp"
 // #include "opencv2/gpu/device/vec_traits.hpp"
 // #include "opencv2/gpu/device/vec_math.hpp"
 // #include "opencv2/gpu/device/saturate_cast.hpp"
 // #include "opencv2/gpu/device/filters.hpp"
 // #define CALC_SUM_(p0, p1, p2, p3, offset) \
 //     ((p0)[offset] - (p1)[offset] - (p2)[offset] + (p3)[offset])
 // __device__ __forceinline__ int sum(p0, p1, p2, p3, offset)
 // {
 // }
 namespace cv { namespace gpu { namespace device {
 namespace lbp{
    // Device-side record for one cascade stage.
    // The kernel in cuda/lbp.cu reads `ntrees` (how many weak classifiers to sum
    // for the stage) and `threshold` (the window is rejected when the stage sum
    // is below it); `first` is not read by the kernel code visible in this diff.
    struct Stage
    {
        int    first;
        int    ntrees;
        float  threshold;
        // Value constructor; every argument defaults to zero.
        __device__ __forceinline__ Stage(int f = 0, int n = 0, float t = 0.f) : first(f), ntrees(n), threshold(t) {}
        // Member-wise copy constructor usable from device code.
        __device__ __forceinline__ Stage(const Stage& other) : first(other.first), ntrees(other.ntrees), threshold(other.threshold) {}
    };
    // Device-side classifier node. The kernel uses `featureIdx` to index the
    // feature table; `left` and `right` are copied along but are not read by the
    // kernel code visible in this diff.
    struct ClNode
    {
        int   featureIdx;
        int   left;
        int   right;
        // NOTE(review): in this diff view the value constructor sits on a removed
        // line and the added line repeats the `featureIdx` declaration — this
        // looks like diff residue; verify against the actual header.
-        __device__ __forceinline__  ClNode(int f = 0, int l = 0, int r = 0) : featureIdx(f), left(l), right(r) {}
+        int   featureIdx;
        // Member-wise copy constructor usable from device code.
        __device__ __forceinline__  ClNode(const ClNode& other) : featureIdx(other.featureIdx), left(other.left), right(other.right) {}
    };
-    struct Feature
+    struct LBP
    {
        // Stateless functor that turns one feature rectangle, evaluated at a
        // window origin on an integral image, into an integer code built from
        // comparisons of neighbouring cell sums against the central cell sum.
-        __device__ __forceinline__ Feature(const Feature& other) {(void)other;}
+        __device__ __forceinline__ LBP(const LBP& other) {(void)other;}
-        __device__ __forceinline__ Feature() {}
+        __device__ __forceinline__ LBP() {}
        //feature as uchar x, y - left top, z,w - right bottom
        // NOTE(review): the arithmetic below samples columns at feature.x,
        // feature.x + z, feature.x + 2z and feature.x + 3z (rows likewise with w),
        // i.e. it treats z / w as the width / height of one cell rather than a
        // right-bottom corner — confirm and update the comment above.
        // y, x: window origin; the result is the comparison code as an int.
-        __device__ __forceinline__ uchar operator() (unsigned int y, unsigned int x, uchar4 feature, const DevMem2Di integral) const
+        __device__ __forceinline__ int operator() (unsigned int y, unsigned int x, uchar4 feature, const DevMem2Di integral) const
        {
            // Twice the original z / w, captured before z and w are shifted below.
            int x_off = 2 * feature.z;
            int y_off = 2 * feature.w;
            // printf("feature: %d %d %d %d\n", (int)feature.x, (int)feature.y, (int)feature.z, (int)feature.w);
            // From here on feature.z / feature.w hold x + z and y + w.
            feature.z += feature.x;
            feature.w += feature.y;
            // load feature key points
            int anchors[16];
            /*
            P0-----P1-----P2-----P3
            |      |      |       |
            P4-----P5-----P6-----P7
            |      |      |       |
            P8-----P9-----P10----P11
            |      |      |       |
            P12----P13----P14----P15
            */
            // anchors[k] is the integral-image sample at grid point Pk; only
            // P0..P9 are assigned in the lines visible in this hunk.
            anchors[0]  = integral(y + feature.y, x + feature.x);
            anchors[1]  = integral(y + feature.y, x + feature.z);
-            anchors[2]  = integral(y + feature.y, x + x_off + feature.x);
+            anchors[2]  = integral(y + feature.y, x + feature.x + x_off);
-            anchors[3]  = integral(y + feature.y, x + x_off + feature.z);
+            anchors[3]  = integral(y + feature.y, x + feature.z + x_off);
            anchors[4]  = integral(y + feature.w, x + feature.x);
            anchors[5]  = integral(y + feature.w, x + feature.z);
-            anchors[6]  = integral(y + feature.w, x + x_off + feature.x);
+            anchors[6]  = integral(y + feature.w, x + feature.x + x_off);
-            anchors[7]  = integral(y + feature.w, x + x_off + feature.z);
+            anchors[7]  = integral(y + feature.w, x + feature.z + x_off);
            anchors[8]  = integral(y + y_off + feature.y, x + feature.x);
            anchors[9]  = integral(y + y_off + feature.y, x + feature.z);
@ -114,7 +110,7 @@ namespace cv { namespace gpu { namespace device {
            // calculate feature
            // `sum` is the central cell (corners P5, P6, P9, P10). Each neighbouring
            // cell whose sum is >= `sum` contributes one bit of the response; the
            // visible lines set bits 128, 64, 32, 16, 4, 2 and 1 (the line for the
            // remaining bit is presumably in the part elided by the hunk header).
            int sum = anchors[5] - anchors[6] - anchors[9] + anchors[10];
-            uchar response = (( (anchors[ 0] - anchors[ 1] - anchors[ 4] + anchors[ 5]) >= sum )? 128 : 0)
+            int response =   (( (anchors[ 0] - anchors[ 1] - anchors[ 4] + anchors[ 5]) >= sum )? 128 : 0)
                            |(( (anchors[ 1] - anchors[ 2] - anchors[ 5] + anchors[ 6]) >= sum )? 64  : 0)
                            |(( (anchors[ 2] - anchors[ 3] - anchors[ 6] + anchors[ 7]) >= sum )? 32  : 0)
                            |(( (anchors[ 6] - anchors[ 7] - anchors[10] + anchors[11]) >= sum )? 16  : 0)
@ -122,11 +118,12 @@ namespace cv { namespace gpu { namespace device {
                            |(( (anchors[ 9] - anchors[10] - anchors[13] + anchors[14]) >= sum )? 4   : 0)
                            |(( (anchors[ 8] - anchors[ 9] - anchors[12] + anchors[13]) >= sum )? 2   : 0)
                            |(( (anchors[ 4] - anchors[ 5] - anchors[ 8] + anchors[ 9]) >= sum )? 1   : 0);
            return response;
        }
    };
 } // lbp
 } } }// namespaces
 #endif