From ed1b293d34a0b5d0a31d127eecb3713ebca71a72 Mon Sep 17 00:00:00 2001
From: Marina Kolpakova
Date: Wed, 11 Jul 2012 12:22:22 +0000
Subject: [PATCH] refactored GPU LBP cascade. Added support for big images.
 Fixed bug in connected components function

---
 modules/gpu/src/cascadeclassifier.cpp      |  80 ++++++-
 modules/gpu/src/cuda/lbp.cu                | 229 ++++++++++++---
 modules/gpu/src/opencv2/gpu/device/lbp.hpp |   3 +-
 3 files changed, 201 insertions(+), 111 deletions(-)

diff --git a/modules/gpu/src/cascadeclassifier.cpp b/modules/gpu/src/cascadeclassifier.cpp
index 5422dcf99..09c106703 100644
--- a/modules/gpu/src/cascadeclassifier.cpp
+++ b/modules/gpu/src/cascadeclassifier.cpp
@@ -315,7 +315,24 @@ namespace cv { namespace gpu { namespace device
                        DevMem2D_<int4> objects,
                        unsigned int* classified);
 
-    int connectedConmonents(DevMem2D_<int4> candidates, DevMem2D_<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses);
+    void classifyStumpFixed(const DevMem2Db& mstages,
+                            const int nstages,
+                            const DevMem2Di& mnodes,
+                            const DevMem2Df& mleaves,
+                            const DevMem2Di& msubsets,
+                            const DevMem2Db& mfeatures,
+                            const int workWidth,
+                            const int workHeight,
+                            const int clWidth,
+                            const int clHeight,
+                            float scale,
+                            int step,
+                            int subsetSize,
+                            DevMem2D_<int4> objects,
+                            unsigned int* classified,
+                            const int maxX);
+
+    int connectedConmonents(DevMem2D_<int4> candidates, int ncandidates, DevMem2D_<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses);
 
     void bindIntegral(DevMem2Di integral);
     void unbindIntegral();
 }
@@ -337,8 +354,8 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
     GpuMat candidates(1 , image.cols >> 1, CV_32SC4);
     // GpuMat candidates(1 , defaultObjSearchNum, CV_32SC4); // used for debug
-    // candidates.setTo(cv::Scalar::all(0));
-    // objects.setTo(cv::Scalar::all(0));
+    candidates.setTo(cv::Scalar::all(0));
+    objects.setTo(cv::Scalar::all(0));
 
     if (maxObjectSize == cv::Size())
         maxObjectSize = image.size();
@@ -349,16 +366,50 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
     unsigned int* dclassified;
     cudaMalloc(&dclassified, sizeof(int));
     cudaMemcpy(dclassified, classified, sizeof(int), cudaMemcpyHostToDevice);
-    int step;
+    int step = 2;
 
     cv::gpu::device::lbp::bindIntegral(integral);
 
-    for( double factor = 1; ; factor *= scaleFactor )
-    {
-        // if (factor > 2.0) break;
-        cv::Size windowSize(cvRound(NxM.width * factor), cvRound(NxM.height * factor));
-        cv::Size scaledImageSize(cvRound( image.cols / factor ), cvRound( image.rows / factor ));
-        cv::Size processingRectSize( scaledImageSize.width - NxM.width + 1, scaledImageSize.height - NxM.height + 1 );
+    cv::Size scaledImageSize(image.cols, image.rows);
+    cv::Size processingRectSize( scaledImageSize.width - NxM.width + 1, scaledImageSize.height - NxM.height + 1 );
+    cv::Size windowSize(NxM.width, NxM.height);
+    double factor = 1;
+
+    for (; processingRectSize.width / step >= 256;)
+    {
+        // std::cout << "IN FIXED: factor " << factor << " size " << processingRectSize.width << " " << processingRectSize.height << std::endl;
+        // if (factor > 2.0) break;
+        if (processingRectSize.width <= 0 || processingRectSize.height <= 0 )
+            break;
+
+        if( windowSize.width > maxObjectSize.width || windowSize.height > maxObjectSize.height )
+            break;
+
+        // if( windowSize.width < minObjectSize.width || windowSize.height < minObjectSize.height )
+        //     continue;
+
+        GpuMat scaledImg(resuzeBuffer, cv::Rect(0, 0, scaledImageSize.width, scaledImageSize.height));
+        GpuMat scaledIntegral(integral, cv::Rect(0, 0, scaledImageSize.width + 1, scaledImageSize.height + 1));
+        GpuMat currBuff = integralBuffer;
+
+        cv::gpu::resize(image, scaledImg, scaledImageSize, 0, 0, CV_INTER_LINEAR);
+        cv::gpu::integralBuffered(scaledImg, scaledIntegral, currBuff);
+
+        step = (factor <= 2.) + 1;
+
+        cv::gpu::device::lbp::classifyStumpFixed(stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat, leaves_mat, subsets_mat, features_mat,
+            processingRectSize.width, processingRectSize.height, windowSize.width, windowSize.height, factor, step, subsetSize, candidates, dclassified, processingRectSize.width);
+
+        factor *= scaleFactor;
+        windowSize = cv::Size(cvRound(NxM.width * factor), cvRound(NxM.height * factor));
+        scaledImageSize = cv::Size(cvRound( image.cols / factor ), cvRound( image.rows / factor ));
+        processingRectSize = cv::Size(scaledImageSize.width - NxM.width + 1, scaledImageSize.height - NxM.height + 1 );
+    }
+
+    for (; /*processingRectSize.width / step >= 128*/;)
+    {
+        // std::cout << "In FLOATING: factor " << factor << " size " << processingRectSize.width << " " << processingRectSize.height << std::endl;
+        // if (factor > 2.0) break;
         if (processingRectSize.width <= 0 || processingRectSize.height <= 0 )
             break;
 
@@ -379,12 +430,19 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
         cv::gpu::device::lbp::classifyStump(stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat, leaves_mat, subsets_mat, features_mat,
             processingRectSize.width, processingRectSize.height, windowSize.width, windowSize.height, factor, step, subsetSize, candidates, dclassified);
+
+        factor *= scaleFactor;
+        windowSize = cv::Size(cvRound(NxM.width * factor), cvRound(NxM.height * factor));
+        scaledImageSize = cv::Size(cvRound( image.cols / factor ), cvRound( image.rows / factor ));
+        processingRectSize = cv::Size(scaledImageSize.width - NxM.width + 1, scaledImageSize.height - NxM.height + 1 );
     }
 
     cv::gpu::device::lbp::unbindIntegral();
     if (groupThreshold <= 0 || objects.empty())
         return 0;
-    cv::gpu::device::lbp::connectedConmonents(candidates, objects, groupThreshold, grouping_eps, dclassified);
+    cudaMemcpy(classified, dclassified, sizeof(int), cudaMemcpyDeviceToHost);
+    // std::cout << "!!! CLASSIFIED " << *classified << std::endl;
+    cv::gpu::device::lbp::connectedConmonents(candidates, *classified, objects, groupThreshold, grouping_eps, dclassified);
     cudaMemcpy(classified, dclassified, sizeof(int), cudaMemcpyDeviceToHost);
     cudaSafeCall( cudaDeviceSynchronize() );
     step = *classified;
diff --git a/modules/gpu/src/cuda/lbp.cu b/modules/gpu/src/cuda/lbp.cu
index ba2e29448..eab41b568 100644
--- a/modules/gpu/src/cuda/lbp.cu
+++ b/modules/gpu/src/cuda/lbp.cu
@@ -53,28 +53,27 @@ namespace cv { namespace gpu { namespace device
     struct LBP
     {
-        __device__ __forceinline__ LBP(const LBP& other) {(void)other;}
-        __device__ __forceinline__ LBP() {}
+        __host__ __device__ __forceinline__ LBP(const LBP& other) {(void)other;}
+        __host__ __device__ __forceinline__ LBP() {}
 
-        //feature as uchar x, y - left top, z,w - right bottom
-        __device__ __forceinline__ int operator() (int ty, int tx, int fh, int featurez, int& shift) const
+        __device__ __forceinline__ int operator() (int ty, int tx, int fh, int fw, int& shift) const
         {
             int anchors[9];
 
             anchors[0] = tex2D(tintegral, tx, ty);
-            anchors[1] = tex2D(tintegral, tx + featurez, ty);
+            anchors[1] = tex2D(tintegral, tx + fw, ty);
             anchors[0] -= anchors[1];
-            anchors[2] = tex2D(tintegral, tx + featurez * 2, ty);
+            anchors[2] = tex2D(tintegral, tx + fw * 2, ty);
             anchors[1] -= anchors[2];
-            anchors[2] -= tex2D(tintegral, tx + featurez * 3, ty);
+            anchors[2] -= tex2D(tintegral, tx + fw * 3, ty);
 
             ty += fh;
             anchors[3] = tex2D(tintegral, tx, ty);
-            anchors[4] = tex2D(tintegral, tx + featurez, ty);
+            anchors[4] = tex2D(tintegral, tx + fw, ty);
             anchors[3] -= anchors[4];
-            anchors[5] = tex2D(tintegral, tx + featurez * 2, ty);
+            anchors[5] = tex2D(tintegral, tx + fw * 2, ty);
             anchors[4] -= anchors[5];
-            anchors[5] -= tex2D(tintegral, tx + featurez * 3, ty);
+            anchors[5] -= tex2D(tintegral, tx + fw * 3, ty);
 
             anchors[0] -= anchors[3];
             anchors[1] -= anchors[4];
@@ -83,11 +82,11 @@ namespace cv { namespace gpu { namespace device
             ty += fh;
             anchors[6] = tex2D(tintegral, tx, ty);
-            anchors[7] = tex2D(tintegral, tx + featurez, ty);
+            anchors[7] = tex2D(tintegral, tx + fw, ty);
             anchors[6] -= anchors[7];
-            anchors[8] = tex2D(tintegral, tx + featurez * 2, ty);
+            anchors[8] = tex2D(tintegral, tx + fw * 2, ty);
             anchors[7] -= anchors[8];
-            anchors[8] -= tex2D(tintegral, tx + featurez * 3, ty);
+            anchors[8] -= tex2D(tintegral, tx + fw * 3, ty);
 
             anchors[3] -= anchors[6];
             anchors[4] -= anchors[7];
@@ -109,11 +108,11 @@ namespace cv { namespace gpu { namespace device
             ty += fh;
             anchors[0] = tex2D(tintegral, tx, ty);
-            anchors[1] = tex2D(tintegral, tx + featurez, ty);
+            anchors[1] = tex2D(tintegral, tx + fw, ty);
             anchors[0] -= anchors[1];
-            anchors[2] = tex2D(tintegral, tx + featurez * 2, ty);
+            anchors[2] = tex2D(tintegral, tx + fw * 2, ty);
             anchors[1] -= anchors[2];
-            anchors[2] -= tex2D(tintegral, tx + featurez * 3, ty);
+            anchors[2] -= tex2D(tintegral, tx + fw * 3, ty);
 
             anchors[6] -= anchors[0];
             anchors[7] -= anchors[1];
@@ -142,54 +141,90 @@ namespace cv { namespace gpu { namespace device
         cudaSafeCall( cudaUnbindTexture(&tintegral));
     }
 
-    __global__ void lbp_classify_stump(const Stage* stages, const int nstages, const ClNode* nodes, const float* leaves, const int* subsets, const uchar4* features,
-        /* const int* integral,const int istep, const int workWidth,const int workHeight,*/ const int clWidth, const int clHeight, const float scale, const int step,
-        const int subsetSize, DevMem2D_<int4> objects, unsigned int* n)
+    struct Classifier
     {
-        int x = threadIdx.x * step;
-        int y = blockIdx.x * step;
+        __host__ __device__ __forceinline__ Classifier(const Stage* _stages, const ClNode* _nodes, const float* _leaves, const int* _subsets, const uchar4* _features,
+            const int _nstages, const int _clWidth, const int _clHeight, const float _scale, const int _step, const int _subsetSize)
+        : stages(_stages), nodes(_nodes), leaves(_leaves), subsets(_subsets), features(_features), nstages(_nstages), clWidth(_clWidth), clHeight(_clHeight),
+          scale(_scale), step(_step), subsetSize(_subsetSize){}
 
-        int current_node = 0;
-        int current_leave = 0;
-
-        LBP evaluator;
-        for (int s = 0; s < nstages; s++ )
+        __device__ __forceinline__ void operator() (int y, int x, DevMem2D_<int4> objects, const unsigned int maxN, unsigned int* n) const
         {
-            float sum = 0;
-            Stage stage = stages[s];
-            for (int t = 0; t < stage.ntrees; t++)
-            {
-                ClNode node = nodes[current_node];
+            int current_node = 0;
+            int current_leave = 0;
 
-                uchar4 feature = features[node.featureIdx];
-                int shift;
-                int c = evaluator(y + feature.y, x + feature.x, feature.w, feature.z, shift);
-                int idx = (subsets[ current_node * subsetSize + c] & ( 1 << shift)) ? current_leave : current_leave + 1;
-                sum += leaves[idx];
-                current_node += 1;
-                current_leave += 2;
+            for (int s = 0; s < nstages; ++s)
+            {
+                float sum = 0;
+                Stage stage = stages[s];
+                for (int t = 0; t < stage.ntrees; t++)
+                {
+                    ClNode node = nodes[current_node];
+                    uchar4 feature = features[node.featureIdx];
+
+                    int shift;
+                    int c = evaluator(y + feature.y, x + feature.x, feature.w, feature.z, shift);
+                    int idx = (subsets[ current_node * subsetSize + c] & ( 1 << shift)) ? current_leave : current_leave + 1;
+                    sum += leaves[idx];
+
+                    current_node += 1;
+                    current_leave += 2;
+                }
+
+                if (sum < stage.threshold)
+                    return;
             }
-            if (sum < stage.threshold)
-                return;
+
+            int4 rect;
+            rect.x = roundf(x * scale);
+            rect.y = roundf(y * scale);
+            rect.z = clWidth;
+            rect.w = clHeight;
+
+#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
+            int res = __atomicInc(n, maxN);
+#else
+            int res = atomicInc(n, maxN);
+#endif
+            objects(0, res) = rect;
         }
 
-        int4 rect;
-        rect.x = roundf(x * scale);
-        rect.y = roundf(y * scale);
-        rect.z = clWidth;
-        rect.w = clHeight;
-#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
-        int res = __atomicInc(n, 100U);
-#else
-        int res = atomicInc(n, 100U);
-#endif
-        objects(0, res) = rect;
+        const Stage* stages;
+        const ClNode* nodes;
+        const float* leaves;
+        const int* subsets;
+        const uchar4* features;
+
+        const int nstages;
+        const int clWidth;
+        const int clHeight;
+        const float scale;
+        const int step;
+        const int subsetSize;
+        const LBP evaluator;
+    };
+
+    __global__ void lbp_classify_stump(const Classifier classifier, DevMem2D_<int4> objects, const unsigned int maxN, unsigned int* n)
+    {
+        int x = threadIdx.x * classifier.step;
+        int y = blockIdx.x * classifier.step;
+
+        classifier(y, x, objects, maxN, n);
+    }
+
+    __global__ void lbp_classify_stump(const Classifier classifier, DevMem2D_<int4> objects, const unsigned int maxN, unsigned int* n, int lines, int maxX)
+    {
+        int x = threadIdx.x * lines * classifier.step;
+        if (x >= maxX) return;
+
+        int y = blockIdx.x * classifier.step / lines;
+
+        classifier(y, x, objects, maxN, n);
     }
 
     template __global__ void disjoin(int4* candidates, int4* objects, unsigned int n, int groupThreshold, float grouping_eps, unsigned int* nclasses)
     {
-        using cv::gpu::device::VecTraits;
         unsigned int tid = threadIdx.x;
         extern __shared__ int sbuff[];
@@ -207,23 +242,26 @@ namespace cv { namespace gpu { namespace device
         int cls = labels[tid];
 #if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
-        __atomicAdd((int*)(rrects + cls * 4 + 0), candidates[tid].x);
-        __atomicAdd((int*)(rrects + cls * 4 + 1), candidates[tid].y);
-        __atomicAdd((int*)(rrects + cls * 4 + 2), candidates[tid].z);
-        __atomicAdd((int*)(rrects + cls * 4 + 3), candidates[tid].w);
+        __atomicAdd((rrects + cls * 4 + 0), candidates[tid].x);
+        __atomicAdd((rrects + cls * 4 + 1), candidates[tid].y);
+        __atomicAdd((rrects + cls * 4 + 2), candidates[tid].z);
+        __atomicAdd((rrects + cls * 4 + 3), candidates[tid].w);
 #else
-        atomicAdd((int*)(rrects + cls * 4 + 0), candidates[tid].x);
-        atomicAdd((int*)(rrects + cls * 4 + 1), candidates[tid].y);
-        atomicAdd((int*)(rrects + cls * 4 + 2), candidates[tid].z);
-        atomicAdd((int*)(rrects + cls * 4 + 3), candidates[tid].w);
+        atomicAdd((rrects + cls * 4 + 0), candidates[tid].x);
+        atomicAdd((rrects + cls * 4 + 1), candidates[tid].y);
+        atomicAdd((rrects + cls * 4 + 2), candidates[tid].z);
+        atomicAdd((rrects + cls * 4 + 3), candidates[tid].w);
 #endif
+        __syncthreads();
         labels[tid] = 0;
+        __syncthreads();
 #if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
         __atomicInc((unsigned int*)labels + cls, n);
 #else
         atomicInc((unsigned int*)labels + cls, n);
 #endif
+        __syncthreads();
         *nclasses = 0;
 
         int active = labels[tid];
@@ -235,61 +273,54 @@ namespace cv { namespace gpu { namespace device
             r1[1] = saturate_cast<int>(r1[1] * s);
             r1[2] = saturate_cast<int>(r1[2] * s);
             r1[3] = saturate_cast<int>(r1[3] * s);
+        }
+        __syncthreads();
-            int n1 = active;
-            __syncthreads();
-            unsigned int j = 0;
-            if( active > groupThreshold )
-            {
-                for (j = 0; j < n; j++)
-                {
-                    int n2 = labels[j];
-                    if(!n2 || j == tid || n2 <= groupThreshold )
-                        continue;
-
-                    int* r2 = rrects + j * 4;
-
-                    int dx = saturate_cast<int>( r2[2] * grouping_eps );
-                    int dy = saturate_cast<int>( r2[3] * grouping_eps );
-
-                    if( tid != j && r1[0] >= r2[0] - dx && r1[1] >= r2[1] - dy &&
-                        r1[0] + r1[2] <= r2[0] + r2[2] + dx && r1[1] + r1[3] <= r2[1] + r2[3] + dy &&
-                        (n2 > max(3, n1) || n1 < 3) )
-                        break;
-                }
-                if( j == n)
-                {
+        if (active && active >= groupThreshold)
+        {
+            int* r1 = rrects + tid * 4;
+            int4 r_out;
+            r_out.x = r1[0];
+            r_out.y = r1[1];
+            r_out.z = r1[2];
+            r_out.w = r1[3];
 #if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
-                    objects[__atomicInc(nclasses, n)] = VecTraits<int4>::make(r1[0], r1[1], r1[2], r1[3]);
+            objects[__atomicInc(nclasses, n)] = r_out;
 #else
-                    objects[atomicInc(nclasses, n)] = VecTraits<int4>::make(r1[0], r1[1], r1[2], r1[3]);
+            int aidx = atomicInc(nclasses, n);
+            objects[aidx] = r_out;
 #endif
-                }
-            }
         }
     }
 
     void classifyStump(const DevMem2Db& mstages, const int nstages, const DevMem2Di& mnodes, const DevMem2Df& mleaves, const DevMem2Di& msubsets, const DevMem2Db& mfeatures,
-        /*const DevMem2Di& integral,*/ const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize,
-        DevMem2D_<int4> objects, unsigned int* classified)
+        const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize, DevMem2D_<int4> objects, unsigned int* classified)
     {
         int blocks = ceilf(workHeight / (float)step);
        int threads = ceilf(workWidth / (float)step);
 
-        Stage* stages = (Stage*)(mstages.ptr());
-        ClNode* nodes = (ClNode*)(mnodes.ptr());
-        const float* leaves = mleaves.ptr();
-        const int* subsets = msubsets.ptr();
-        const uchar4* features = (uchar4*)(mfeatures.ptr());
-        lbp_classify_stump<<<blocks, threads>>>(stages, nstages, nodes, leaves, subsets, features, /*integ, istep,
-            workWidth, workHeight,*/ clWidth, clHeight, scale, step, subsetSize, objects, classified);
+        Classifier clr((Stage*)(mstages.ptr()), (ClNode*)(mnodes.ptr()), mleaves.ptr(), msubsets.ptr(), (uchar4*)(mfeatures.ptr()), nstages, clWidth, clHeight, scale, step, subsetSize);
+        lbp_classify_stump<<<blocks, threads>>>(clr, objects, objects.cols, classified);
     }
 
-    int connectedConmonents(DevMem2D_<int4> candidates, DevMem2D_<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses)
+    void classifyStumpFixed(const DevMem2Db& mstages, const int nstages, const DevMem2Di& mnodes, const DevMem2Df& mleaves, const DevMem2Di& msubsets, const DevMem2Db& mfeatures,
+        const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize, DevMem2D_<int4> objects, unsigned int* classified,
+        int maxX)
     {
-        int threads = candidates.cols;
+        const int THREADS_BLOCK = 256;
+        int blocks = ceilf(workHeight / (float)step);
+        int threads = ceilf(workWidth / (float)step);
+
+        Classifier clr((Stage*)(mstages.ptr()), (ClNode*)(mnodes.ptr()), mleaves.ptr(), msubsets.ptr(), (uchar4*)(mfeatures.ptr()), nstages, clWidth, clHeight, scale, step, subsetSize);
+        int lines = divUp(threads, THREADS_BLOCK);
+        lbp_classify_stump<<>>(clr, objects, objects.cols, classified, lines, maxX);
+    }
+
+    int connectedConmonents(DevMem2D_<int4> candidates, int ncandidates, DevMem2D_<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses)
+    {
+        int threads = ncandidates;
         int smem_amount = threads * sizeof(int) + threads * sizeof(int4);
-        disjoin<<<1, threads, smem_amount>>>((int4*)candidates.ptr(), (int4*)objects.ptr(), candidates.cols, groupThreshold, grouping_eps, nclasses);
+        disjoin<<<1, threads, smem_amount>>>((int4*)candidates.ptr(), (int4*)objects.ptr(), ncandidates, groupThreshold, grouping_eps, nclasses);
        return 0;
     }
 }
diff --git a/modules/gpu/src/opencv2/gpu/device/lbp.hpp b/modules/gpu/src/opencv2/gpu/device/lbp.hpp
index 69867c993..b3cf6dc27 100644
--- a/modules/gpu/src/opencv2/gpu/device/lbp.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/lbp.hpp
@@ -61,6 +61,7 @@ __device__ __forceinline__ T __atomicInc(T* address, T val)
         count = tag | (count + 1);
         *address = count;
     } while (*address != count);
+    return (count & TAG_MASK) - 1;
 }
@@ -85,6 +86,7 @@ __device__ __forceinline__ T __atomicMin(T* address, T val)
     {
         *address = count;
     } while (*address > count);
+    return count;
 }
@@ -151,7 +153,6 @@ __device__ __forceinline__ T __atomicMin(T* address, T val)
         }
     }
     __syncthreads();
-    // printf("tid %d label %d\n", tid, labels[tid]);
 }
 } // lbp
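
For reference, the reworked detectMultiScale walks a single geometric scale schedule but dispatches each level to one of two kernels: while processingRectSize.width / step is still at least 256 it takes the new classifyStumpFixed path (presumably the point of the "big images" support, since the old launch used one thread per detection column and would run into the per-block thread limit on wide images), and the remaining, smaller levels fall through to the original classifyStump launch. A plain C++ sketch of that schedule with made-up image and window sizes (not code from the patch):

    #include <cmath>
    #include <cstdio>

    int main()
    {
        const int imgW = 1920, imgH = 1080;   // hypothetical "big" input
        const int nxmW = 24,   nxmH = 24;     // hypothetical trained window (NxM)
        const double scaleFactor = 1.2;

        double factor = 1.0;
        int scaledW = imgW, scaledH = imgH;

        while (true)
        {
            const int procW = scaledW - nxmW + 1;   // processingRectSize
            const int procH = scaledH - nxmH + 1;
            if (procW <= 0 || procH <= 0)
                break;

            const int step = (factor <= 2.0) ? 2 : 1;       // mirrors step = (factor <= 2.) + 1
            const bool fixedPath = (procW / step >= 256);   // mirrors the first loop's condition
            std::printf("factor %5.2f  window %3dx%-3d  rect %4dx%-4d  step %d  -> %s\n",
                        factor,
                        (int)std::lround(nxmW * factor), (int)std::lround(nxmH * factor),
                        procW, procH, step,
                        fixedPath ? "classifyStumpFixed" : "classifyStump");

            factor *= scaleFactor;
            scaledW = (int)std::lround(imgW / factor);
            scaledH = (int)std::lround(imgH / factor);
        }
        return 0;
    }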
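
The connected-components fix is mirrored on the host side in cascadeclassifier.cpp: the classified-window count is now copied back from the device and passed to connectedConmonents as ncandidates, so disjoin runs with one thread per real candidate instead of one per column of the candidate buffer. A simplified host-side sketch of that pattern (the names group_candidates and d_count are hypothetical, and the disjoin launch is left as a comment because its exact form lives in lbp.cu):

    #include <cuda_runtime.h>
    #include <vector_types.h>

    void group_candidates(int4* d_candidates, int4* d_objects, unsigned int* d_count,
                          int groupThreshold, float grouping_eps)
    {
        // 1. Bring the device-side detection counter back to the host.
        unsigned int h_count = 0;
        cudaMemcpy(&h_count, d_count, sizeof(unsigned int), cudaMemcpyDeviceToHost);
        if (h_count == 0)
            return;

        // 2. Size the single-block grouping kernel by the real candidate count:
        //    shared memory holds one label and one accumulated rectangle per candidate.
        size_t smem = h_count * sizeof(int) + h_count * sizeof(int4);
        // disjoin<<<1, h_count, smem>>>(d_candidates, d_objects, h_count,
        //                               groupThreshold, grouping_eps, d_count);
        (void)smem;
    }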
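
A note on the lbp.hpp change: the software-emulated __atomicInc used on pre-SM-1.2 devices previously fell off the end of a non-void function, while every caller (including the new Classifier::operator() and disjoin) relies on the returned pre-increment count to claim a unique output slot; the added return restores the contract of the hardware intrinsic. A standalone CUDA sketch of that contract (claim_slots is a made-up example kernel, not code from the patch):

    #include <cuda_runtime.h>

    // atomicInc(addr, limit) returns the OLD value and wraps to 0 once the old
    // value reaches `limit`, so every thread that gets here claims its own slot.
    __global__ void claim_slots(unsigned int* counter, int4* out, unsigned int limit)
    {
        int4 rect = make_int4(threadIdx.x, blockIdx.x, 0, 0); // stand-in detection rectangle
        unsigned int slot = atomicInc(counter, limit);        // old value = this thread's slot
        out[slot] = rect;                                     // no two threads write the same entry
    }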