added lbp cascade test, fixed race condition problems

Marina Kolpakova
2012-07-04 12:11:16 +00:00
parent 248f39e13e
commit e496345d33
4 changed files with 73 additions and 23 deletions


@@ -290,7 +290,7 @@ namespace cv { namespace gpu { namespace device
                              DevMem2D_<int4> objects,
                              unsigned int* classified);
-        int connectedConmonents(DevMem2D_<int4> candidates, int groupThreshold, float grouping_eps, unsigned int* nclasses);
+        int connectedConmonents(DevMem2D_<int4> candidates, DevMem2D_<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses);
     }
 }}}
@@ -308,6 +308,7 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
     else
         objects.create(1 , defaultObjSearchNum, CV_32SC4);
+    GpuMat candidates(1 , defaultObjSearchNum, CV_32SC4);
     if (maxObjectSize == cv::Size())
         maxObjectSize = image.size();
@@ -317,6 +318,7 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
     unsigned int* dclassified;
     cudaMalloc(&dclassified, sizeof(int));
     cudaMemcpy(dclassified, classified, sizeof(int), cudaMemcpyHostToDevice);
+    int step;
     for( double factor = 1; ; factor *= scaleFactor )
     {
@@ -334,25 +336,22 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
         //     continue;
         cv::gpu::resize(image, scaledImageBuffer, scaledImageSize, 0, 0, CV_INTER_LINEAR);
         integral.create(cv::Size(scaledImageSize.width + 1, scaledImageSize.height + 1), CV_32SC1);
         cv::gpu::integral(scaledImageBuffer, integral);
-        int step = (factor <= 2.) + 1;
+        step = (factor <= 2.) + 1;
         cv::gpu::device::lbp::classifyStump(stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat, leaves_mat, subsets_mat, features_mat,
-            integral, processingRectSize.width, processingRectSize.height, windowSize.width, windowSize.height, scaleFactor, step, subsetSize, objects, dclassified);
+            integral, processingRectSize.width, processingRectSize.height, windowSize.width, windowSize.height, factor, step, subsetSize, candidates, dclassified);
     }
     cudaMemcpy(classified, dclassified, sizeof(int), cudaMemcpyDeviceToHost);
-    GpuMat candidates(1, *classified, objects.type(), objects.ptr());
-    // std::cout << *classified << " Results: " << cv::Mat(candidates) << std::endl;
     if (groupThreshold <= 0 || objects.empty())
         return 0;
-    cv::gpu::device::lbp::connectedConmonents(candidates, groupThreshold, grouping_eps, dclassified);
+    cv::gpu::device::lbp::connectedConmonents(candidates, objects, groupThreshold, grouping_eps, dclassified);
     cudaMemcpy(classified, dclassified, sizeof(int), cudaMemcpyDeviceToHost);
     cudaSafeCall( cudaDeviceSynchronize() );
-    return *classified;
+    step = *classified;
+    delete[] classified;
+    cudaFree(dclassified);
+    return step;
 }
 // ============ old fashioned haar cascade ==============================================//
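Note: the race this commit fixes appears to come from detection and grouping sharing one buffer. Previously classifyStump wrote hits into `objects`, the temporary `candidates` GpuMat aliased `objects.ptr()`, and `disjoin` read and rewrote that same memory while keeping its counter in shared memory. The new code gives detection its own `candidates` buffer, keeps the running count in the device-side `dclassified` counter, and compacts grouped results into the separate `objects` output. A minimal, self-contained CUDA sketch of that pattern; the kernel name `emit_filtered` and the size filter are illustrative, not from the commit:

    #include <cuda_runtime.h>
    #include <cstdio>

    // Each thread that passes the predicate claims a unique slot in `out`
    // via atomicInc, so no two threads ever write to the same element.
    __global__ void emit_filtered(const int4* in, int4* out, unsigned int n, unsigned int* count)
    {
        unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i >= n) return;
        int4 r = in[i];
        if (r.z > 0 && r.w > 0)               // keep non-degenerate rectangles (illustrative filter)
            out[atomicInc(count, n)] = r;     // atomicInc wraps at n, matching the kernels in this commit
    }

    int main()
    {
        const unsigned int n = 256;
        int4* d_in; int4* d_out; unsigned int* d_count;
        cudaMalloc(&d_in, n * sizeof(int4));
        cudaMalloc(&d_out, n * sizeof(int4));
        cudaMalloc(&d_count, sizeof(unsigned int));
        cudaMemset(d_in, 0, n * sizeof(int4));
        cudaMemset(d_count, 0, sizeof(unsigned int));

        emit_filtered<<<1, n>>>(d_in, d_out, n, d_count);

        // Copy the count back to the host, as detectMultiScale does with dclassified.
        unsigned int h_count = 0;
        cudaMemcpy(&h_count, d_count, sizeof(unsigned int), cudaMemcpyDeviceToHost);
        cudaDeviceSynchronize();
        printf("%u rectangles kept\n", h_count);

        cudaFree(d_in); cudaFree(d_out); cudaFree(d_count);
        return 0;
    }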


@@ -51,8 +51,8 @@ namespace cv { namespace gpu { namespace device
         __global__ void lbp_classify_stump(Stage* stages, int nstages, ClNode* nodes, const float* leaves, const int* subsets, const uchar4* features,
             const DevMem2Di integral, int workWidth, int workHeight, int clWidth, int clHeight, float scale, int step, int subsetSize, DevMem2D_<int4> objects, unsigned int* n)
         {
-            int y = threadIdx.x * scale;
-            int x = blockIdx.x * scale;
+            int x = threadIdx.x * step;
+            int y = blockIdx.x * step;
             int current_node = 0;
             int current_leave = 0;
@@ -92,7 +92,7 @@ namespace cv { namespace gpu { namespace device
         }
         template<typename Pr>
-        __global__ void disjoin(int4* candidates, unsigned int n, int groupThreshold, float grouping_eps, unsigned int* nclasses)
+        __global__ void disjoin(int4* candidates, int4* objects, unsigned int n, int groupThreshold, float grouping_eps, unsigned int* nclasses)
         {
             using cv::gpu::device::VecTraits;
             unsigned int tid = threadIdx.x;
@@ -119,7 +119,7 @@ namespace cv { namespace gpu { namespace device
             __syncthreads();
             atomicInc((unsigned int*)labels + cls, n);
-            labels[n - 1] = 0;
+            *nclasses = 0;
             int active = labels[tid];
             if (active)
@@ -152,11 +152,9 @@ namespace cv { namespace gpu { namespace device
                         (n2 > max(3, n1) || n1 < 3) )
                         break;
                 }
                 if( j == n)
                 {
-                    // printf("founded gpu %d %d %d %d \n", r1[0], r1[1], r1[2], r1[3]);
-                    candidates[atomicInc((unsigned int*)labels + n - 1, n)] = VecTraits<int4>::make(r1[0], r1[1], r1[2], r1[3]);
+                    objects[atomicInc(nclasses, n)] = VecTraits<int4>::make(r1[0], r1[1], r1[2], r1[3]);
                 }
             }
         }
@@ -179,11 +177,11 @@ namespace cv { namespace gpu { namespace device
                 workWidth, workHeight, clWidth, clHeight, scale, step, subsetSize, objects, classified);
         }
-        int connectedConmonents(DevMem2D_<int4> candidates, int groupThreshold, float grouping_eps, unsigned int* nclasses)
+        int connectedConmonents(DevMem2D_<int4> candidates, DevMem2D_<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses)
         {
             int threads = candidates.cols;
             int smem_amount = threads * sizeof(int) + threads * sizeof(int4);
-            disjoin<InSameComponint><<<1, threads, smem_amount>>>((int4*)candidates.ptr(), candidates.cols, groupThreshold, grouping_eps, nclasses);
+            disjoin<InSameComponint><<<1, threads, smem_amount>>>((int4*)candidates.ptr(), (int4*)objects.ptr(), candidates.cols, groupThreshold, grouping_eps, nclasses);
             return 0;
         }
     }
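Note: the disjoin launch runs as a single block with one thread per candidate, and the `smem_amount` expression sizes dynamic shared memory for one int label plus one int4 rectangle per thread. A sketch of how that allocation is typically carved up; the kernel name `smem_layout_demo` is hypothetical and the merging logic is elided:

    #include <cuda_runtime.h>

    // Shared-memory carving: declare the int4 rectangles first so they keep
    // 16-byte alignment, then take the int labels from the tail.
    __global__ void smem_layout_demo(const int4* candidates, unsigned int n)
    {
        extern __shared__ int4 rects[];      // n rectangles
        int* labels = (int*)(rects + n);     // n labels
        unsigned int tid = threadIdx.x;
        rects[tid] = candidates[tid];
        labels[tid] = tid;                   // each candidate starts as its own class
        __syncthreads();
        // ... pairwise similarity tests and label merging would follow here ...
    }

    // Launch mirrors connectedConmonents: one block, one thread per candidate,
    // dynamic shared memory sized as threads * (sizeof(int) + sizeof(int4)).
    void launch_demo(const int4* d_candidates, int threads)
    {
        int smem_amount = threads * sizeof(int) + threads * sizeof(int4);
        smem_layout_demo<<<1, threads, smem_amount>>>(d_candidates, threads);
    }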


@@ -65,12 +65,12 @@ namespace lbp{
     struct InSameComponint
     {
     public:
-        __device__ __forceinline__ InSameComponint(float _eps) : eps(_eps * 0.5) {}
+        __device__ __forceinline__ InSameComponint(float _eps) : eps(_eps) {}
         __device__ __forceinline__ InSameComponint(const InSameComponint& other) : eps(other.eps) {}
         __device__ __forceinline__ bool operator()(const int4& r1, const int4& r2) const
         {
-            double delta = eps * (min(r1.z, r2.z) + min(r1.w, r2.w));
+            float delta = eps * (min(r1.z, r2.z) + min(r1.w, r2.w)) * 0.5;
             return abs(r1.x - r2.x) <= delta && abs(r1.y - r2.y) <= delta
                 && abs(r1.x + r1.z - r2.x - r2.z) <= delta && abs(r1.y + r1.w - r2.y - r2.w) <= delta;
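Note: this predicate is the GPU counterpart of the similarity test used when grouping rectangles: two detections fall into the same component when all four edges agree within delta = eps * 0.5 * (min width + min height). The hunk moves the 0.5 from the constructor into the comparison and drops the double-precision intermediate. A host-side rendition for sanity-checking values; the harness and the sample rectangles are illustrative, assuming int4 carries x, y, width (z), height (w):

    #include <algorithm>
    #include <cstdio>
    #include <cstdlib>

    struct Rect { int x, y, z, w; }; // x, y, width (z), height (w), matching the int4 usage

    // Host rendition of InSameComponint::operator():
    // edges must agree within delta = eps * 0.5 * (min(width) + min(height)).
    static bool inSameComponent(const Rect& r1, const Rect& r2, float eps)
    {
        float delta = eps * (std::min(r1.z, r2.z) + std::min(r1.w, r2.w)) * 0.5f;
        return std::abs(r1.x - r2.x) <= delta && std::abs(r1.y - r2.y) <= delta
            && std::abs(r1.x + r1.z - r2.x - r2.z) <= delta
            && std::abs(r1.y + r1.w - r2.y - r2.w) <= delta;
    }

    int main()
    {
        Rect a = {100, 100, 50, 50};
        Rect b = {104, 102, 52, 48};
        // delta = 0.2 * (50 + 48) * 0.5 = 9.8, so the edge differences (4, 2, 6, 0) all pass.
        printf("%s\n", inSameComponent(a, b, 0.2f) ? "same component" : "different");
        return 0;
    }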