Refactored GPU LBP cascade. Added support for big images. Fixed a bug in the connected components function.
parent 469ec7c522
commit ed1b293d34
@@ -315,7 +315,24 @@ namespace cv { namespace gpu { namespace device
                                DevMem2D_<int4> objects,
                                unsigned int* classified);

-        int connectedConmonents(DevMem2D_<int4> candidates, DevMem2D_<int4> objects,int groupThreshold, float grouping_eps, unsigned int* nclasses);
+        void classifyStumpFixed(const DevMem2Db& mstages,
+                                const int nstages,
+                                const DevMem2Di& mnodes,
+                                const DevMem2Df& mleaves,
+                                const DevMem2Di& msubsets,
+                                const DevMem2Db& mfeatures,
+                                const int workWidth,
+                                const int workHeight,
+                                const int clWidth,
+                                const int clHeight,
+                                float scale,
+                                int step,
+                                int subsetSize,
+                                DevMem2D_<int4> objects,
+                                unsigned int* classified,
+                                const int maxX);
+
+        int connectedConmonents(DevMem2D_<int4> candidates, int ncandidates, DevMem2D_<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses);

        void bindIntegral(DevMem2Di integral);
        void unbindIntegral();
}
@@ -337,8 +354,8 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
    GpuMat candidates(1 , image.cols >> 1, CV_32SC4);
    // GpuMat candidates(1 , defaultObjSearchNum, CV_32SC4);
    // used for debug
-    // candidates.setTo(cv::Scalar::all(0));
-    // objects.setTo(cv::Scalar::all(0));
+    candidates.setTo(cv::Scalar::all(0));
+    objects.setTo(cv::Scalar::all(0));
    if (maxObjectSize == cv::Size())
        maxObjectSize = image.size();

@@ -349,16 +366,50 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
    unsigned int* dclassified;
    cudaMalloc(&dclassified, sizeof(int));
    cudaMemcpy(dclassified, classified, sizeof(int), cudaMemcpyHostToDevice);
-    int step;
+    int step = 2;
    cv::gpu::device::lbp::bindIntegral(integral);

-    for( double factor = 1; ; factor *= scaleFactor )
-    {
-        // if (factor > 2.0) break;
-        cv::Size windowSize(cvRound(NxM.width * factor), cvRound(NxM.height * factor));
-        cv::Size scaledImageSize(cvRound( image.cols / factor ), cvRound( image.rows / factor ));
-        cv::Size processingRectSize( scaledImageSize.width - NxM.width + 1, scaledImageSize.height - NxM.height + 1 );
-
+    cv::Size scaledImageSize(image.cols, image.rows);
+    cv::Size processingRectSize( scaledImageSize.width - NxM.width + 1, scaledImageSize.height - NxM.height + 1 );
+    cv::Size windowSize(NxM.width, NxM.height);
+
+    double factor = 1;
+
+    for (; processingRectSize.width / step >= 256;)
+    {
+        // std::cout << "IN FIXED: factor " << factor << " size " << processingRectSize.width << " " << processingRectSize.height << std::endl;
+        // if (factor > 2.0) break;
+        if (processingRectSize.width <= 0 || processingRectSize.height <= 0 )
+            break;
+
+        if( windowSize.width > maxObjectSize.width || windowSize.height > maxObjectSize.height )
+            break;
+
+        // if( windowSize.width < minObjectSize.width || windowSize.height < minObjectSize.height )
+        // continue;
+
+        GpuMat scaledImg(resuzeBuffer, cv::Rect(0, 0, scaledImageSize.width, scaledImageSize.height));
+        GpuMat scaledIntegral(integral, cv::Rect(0, 0, scaledImageSize.width + 1, scaledImageSize.height + 1));
+        GpuMat currBuff = integralBuffer;
+
+        cv::gpu::resize(image, scaledImg, scaledImageSize, 0, 0, CV_INTER_LINEAR);
+        cv::gpu::integralBuffered(scaledImg, scaledIntegral, currBuff);
+
+        step = (factor <= 2.) + 1;
+
+        cv::gpu::device::lbp::classifyStumpFixed(stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat, leaves_mat, subsets_mat, features_mat,
+            processingRectSize.width, processingRectSize.height, windowSize.width, windowSize.height, factor, step, subsetSize, candidates, dclassified, processingRectSize.width);
+
+        factor *= scaleFactor;
+        windowSize = cv::Size(cvRound(NxM.width * factor), cvRound(NxM.height * factor));
+        scaledImageSize = cv::Size(cvRound( image.cols / factor ), cvRound( image.rows / factor ));
+        processingRectSize = cv::Size(scaledImageSize.width - NxM.width + 1, scaledImageSize.height - NxM.height + 1 );
+    }
+
+    for (; /*processingRectSize.width / step >= 128*/;)
+    {
+        // std::cout << "In FLOATING: factor " << factor << " size " << processingRectSize.width << " " << processingRectSize.height << std::endl;
+        // if (factor > 2.0) break;
        if (processingRectSize.width <= 0 || processingRectSize.height <= 0 )
            break;

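The hunk above splits the original single scale loop into two passes: a "fixed" pass that handles pyramid levels whose processing rectangle is still wide (processingRectSize.width / step >= 256) via the new classifyStumpFixed path, and a "floating" pass for the remaining, smaller levels. The following standalone C++ sketch is not part of the patch; it only reproduces the per-level bookkeeping so the hand-off point between the two passes can be inspected. The image size, window size (NxM) and scaleFactor below are assumed values for illustration.

// Sketch only: per-level schedule of the two detection passes.
#include <cmath>
#include <cstdio>

int main()
{
    const int imgW = 3840, imgH = 2160;   // assumed "big" input image
    const int nxmW = 24,  nxmH = 24;      // assumed classifier window (NxM)
    const double scaleFactor = 1.2;

    int step = 2;
    for (double factor = 1.0; ; factor *= scaleFactor)
    {
        int scaledW = (int)std::lround(imgW / factor);
        int scaledH = (int)std::lround(imgH / factor);
        int procW = scaledW - nxmW + 1;
        int procH = scaledH - nxmH + 1;
        if (procW <= 0 || procH <= 0)
            break;

        step = (factor <= 2.0) + 1;       // same rule as step = (factor <= 2.) + 1 above
        const char* pass = (procW / step >= 256) ? "fixed" : "floating";
        std::printf("factor %5.2f  windows %5d x %4d  step %d  -> %s pass\n",
                    factor, procW, procH, step, pass);
    }
    return 0;
}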
@@ -379,12 +430,19 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp

        cv::gpu::device::lbp::classifyStump(stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat, leaves_mat, subsets_mat, features_mat,
            processingRectSize.width, processingRectSize.height, windowSize.width, windowSize.height, factor, step, subsetSize, candidates, dclassified);

+        factor *= scaleFactor;
+        windowSize = cv::Size(cvRound(NxM.width * factor), cvRound(NxM.height * factor));
+        scaledImageSize = cv::Size(cvRound( image.cols / factor ), cvRound( image.rows / factor ));
+        processingRectSize = cv::Size(scaledImageSize.width - NxM.width + 1, scaledImageSize.height - NxM.height + 1 );
    }

    cv::gpu::device::lbp::unbindIntegral();
    if (groupThreshold <= 0 || objects.empty())
        return 0;
-    cv::gpu::device::lbp::connectedConmonents(candidates, objects, groupThreshold, grouping_eps, dclassified);
+    cudaMemcpy(classified, dclassified, sizeof(int), cudaMemcpyDeviceToHost);
+    // std::cout << "!!! CLASSIFIED " << *classified << std::endl;
+    cv::gpu::device::lbp::connectedConmonents(candidates, *classified, objects, groupThreshold, grouping_eps, dclassified);
    cudaMemcpy(classified, dclassified, sizeof(int), cudaMemcpyDeviceToHost);
    cudaSafeCall( cudaDeviceSynchronize() );
    step = *classified;
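The host code above now reads the device-side detection counter back before grouping, so connectedConmonents can be launched with exactly that many candidates. The following CUDA sketch is not part of the patch; count_hits and process_hits are hypothetical stand-ins for the detection and grouping kernels, shown only to isolate the readback-then-launch pattern.

// Sketch only: size a second launch from a counter filled by the first.
#include <cuda_runtime.h>
#include <cstdio>

__global__ void count_hits(unsigned int* counter)
{
    atomicAdd(counter, 1u);                       // each "detection" bumps the counter
}

__global__ void process_hits(unsigned int nhits)
{
    (void)nhits;                                  // one thread per detected candidate
}

int main()
{
    unsigned int* dcount = 0;
    cudaMalloc(&dcount, sizeof(unsigned int));
    cudaMemset(dcount, 0, sizeof(unsigned int));

    count_hits<<<4, 64>>>(dcount);                // detection pass fills the counter

    unsigned int hcount = 0;
    cudaMemcpy(&hcount, dcount, sizeof(unsigned int), cudaMemcpyDeviceToHost);

    if (hcount > 0)
        process_hits<<<1, hcount>>>(hcount);      // grouping pass sized by the count

    cudaDeviceSynchronize();
    std::printf("candidates detected: %u\n", hcount);
    cudaFree(dcount);
    return 0;
}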
@@ -53,28 +53,27 @@ namespace cv { namespace gpu { namespace device

        struct LBP
        {
-            __device__ __forceinline__ LBP(const LBP& other) {(void)other;}
-            __device__ __forceinline__ LBP() {}
+            __host__ __device__ __forceinline__ LBP(const LBP& other) {(void)other;}
+            __host__ __device__ __forceinline__ LBP() {}

-            //feature as uchar x, y - left top, z,w - right bottom
-            __device__ __forceinline__ int operator() (int ty, int tx, int fh, int featurez, int& shift) const
+            __device__ __forceinline__ int operator() (int ty, int tx, int fh, int fw, int& shift) const
            {
                int anchors[9];

                anchors[0] = tex2D(tintegral, tx, ty);
-                anchors[1] = tex2D(tintegral, tx + featurez, ty);
+                anchors[1] = tex2D(tintegral, tx + fw, ty);
                anchors[0] -= anchors[1];
-                anchors[2] = tex2D(tintegral, tx + featurez * 2, ty);
+                anchors[2] = tex2D(tintegral, tx + fw * 2, ty);
                anchors[1] -= anchors[2];
-                anchors[2] -= tex2D(tintegral, tx + featurez * 3, ty);
+                anchors[2] -= tex2D(tintegral, tx + fw * 3, ty);

                ty += fh;
                anchors[3] = tex2D(tintegral, tx, ty);
-                anchors[4] = tex2D(tintegral, tx + featurez, ty);
+                anchors[4] = tex2D(tintegral, tx + fw, ty);
                anchors[3] -= anchors[4];
-                anchors[5] = tex2D(tintegral, tx + featurez * 2, ty);
+                anchors[5] = tex2D(tintegral, tx + fw * 2, ty);
                anchors[4] -= anchors[5];
-                anchors[5] -= tex2D(tintegral, tx + featurez * 3, ty);
+                anchors[5] -= tex2D(tintegral, tx + fw * 3, ty);

                anchors[0] -= anchors[3];
                anchors[1] -= anchors[4];
@@ -83,11 +82,11 @@ namespace cv { namespace gpu { namespace device

                ty += fh;
                anchors[6] = tex2D(tintegral, tx, ty);
-                anchors[7] = tex2D(tintegral, tx + featurez, ty);
+                anchors[7] = tex2D(tintegral, tx + fw, ty);
                anchors[6] -= anchors[7];
-                anchors[8] = tex2D(tintegral, tx + featurez * 2, ty);
+                anchors[8] = tex2D(tintegral, tx + fw * 2, ty);
                anchors[7] -= anchors[8];
-                anchors[8] -= tex2D(tintegral, tx + featurez * 3, ty);
+                anchors[8] -= tex2D(tintegral, tx + fw * 3, ty);

                anchors[3] -= anchors[6];
                anchors[4] -= anchors[7];
@@ -109,11 +108,11 @@ namespace cv { namespace gpu { namespace device

                ty += fh;
                anchors[0] = tex2D(tintegral, tx, ty);
-                anchors[1] = tex2D(tintegral, tx + featurez, ty);
+                anchors[1] = tex2D(tintegral, tx + fw, ty);
                anchors[0] -= anchors[1];
-                anchors[2] = tex2D(tintegral, tx + featurez * 2, ty);
+                anchors[2] = tex2D(tintegral, tx + fw * 2, ty);
                anchors[1] -= anchors[2];
-                anchors[2] -= tex2D(tintegral, tx + featurez * 3, ty);
+                anchors[2] -= tex2D(tintegral, tx + fw * 3, ty);

                anchors[6] -= anchors[0];
                anchors[7] -= anchors[1];
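The three hunks above only rename the cell-width parameter from featurez to fw; the arithmetic they touch is the standard integral-image trick, where every LBP cell sum comes from four corner lookups of the bound integral texture. The following CPU-side C++ sketch is not part of the patch; it demonstrates that identity on an assumed 8x8 constant image.

// Sketch only: one LBP cell sum from four integral-image reads.
#include <cstdio>

int main()
{
    const int W = 8, H = 8;
    int img[H][W], integ[H + 1][W + 1] = {};

    for (int y = 0; y < H; ++y)
        for (int x = 0; x < W; ++x)
            img[y][x] = 1;                     // constant image: each cell sum equals its area

    for (int y = 1; y <= H; ++y)               // inclusive 2-D prefix sums
        for (int x = 1; x <= W; ++x)
            integ[y][x] = img[y - 1][x - 1] + integ[y - 1][x] + integ[y][x - 1] - integ[y - 1][x - 1];

    // Sum of the fw x fh cell whose top-left corner is (tx, ty):
    int tx = 2, ty = 2, fw = 2, fh = 2;
    int cell = integ[ty + fh][tx + fw] - integ[ty][tx + fw] - integ[ty + fh][tx] + integ[ty][tx];
    std::printf("cell sum = %d (expected %d)\n", cell, fw * fh);
    return 0;
}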
@@ -142,54 +141,90 @@ namespace cv { namespace gpu { namespace device
            cudaSafeCall( cudaUnbindTexture(&tintegral));
        }

-        __global__ void lbp_classify_stump(const Stage* stages, const int nstages, const ClNode* nodes, const float* leaves, const int* subsets, const uchar4* features,
-            /* const int* integral,const int istep, const int workWidth,const int workHeight,*/ const int clWidth, const int clHeight, const float scale, const int step,
-            const int subsetSize, DevMem2D_<int4> objects, unsigned int* n)
-        {
-            int x = threadIdx.x * step;
-            int y = blockIdx.x * step;
-
-            int current_node = 0;
-            int current_leave = 0;
-
-            LBP evaluator;
-            for (int s = 0; s < nstages; s++ )
-            {
-                float sum = 0;
-                Stage stage = stages[s];
-                for (int t = 0; t < stage.ntrees; t++)
-                {
-                    ClNode node = nodes[current_node];
-
-                    uchar4 feature = features[node.featureIdx];
-                    int shift;
-                    int c = evaluator(y + feature.y, x + feature.x, feature.w, feature.z, shift);
-                    int idx = (subsets[ current_node * subsetSize + c] & ( 1 << shift)) ? current_leave : current_leave + 1;
-                    sum += leaves[idx];
-                    current_node += 1;
-                    current_leave += 2;
-                }
-                if (sum < stage.threshold)
-                    return;
-            }
-
-            int4 rect;
-            rect.x = roundf(x * scale);
-            rect.y = roundf(y * scale);
-            rect.z = clWidth;
-            rect.w = clHeight;
-#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
-            int res = __atomicInc(n, 100U);
-#else
-            int res = atomicInc(n, 100U);
-#endif
-            objects(0, res) = rect;
+        struct Classifier
+        {
+            __host__ __device__ __forceinline__ Classifier(const Stage* _stages, const ClNode* _nodes, const float* _leaves, const int* _subsets, const uchar4* _features,
+                const int _nstages, const int _clWidth, const int _clHeight, const float _scale, const int _step, const int _subsetSize)
+            : stages(_stages), nodes(_nodes), leaves(_leaves), subsets(_subsets), features(_features), nstages(_nstages), clWidth(_clWidth), clHeight(_clHeight),
+              scale(_scale), step(_step), subsetSize(_subsetSize){}
+
+            __device__ __forceinline__ void operator() (int y, int x, DevMem2D_<int4> objects, const unsigned int maxN, unsigned int* n) const
+            {
+                int current_node = 0;
+                int current_leave = 0;
+
+                for (int s = 0; s < nstages; ++s)
+                {
+                    float sum = 0;
+                    Stage stage = stages[s];
+                    for (int t = 0; t < stage.ntrees; t++)
+                    {
+                        ClNode node = nodes[current_node];
+                        uchar4 feature = features[node.featureIdx];
+
+                        int shift;
+                        int c = evaluator(y + feature.y, x + feature.x, feature.w, feature.z, shift);
+                        int idx = (subsets[ current_node * subsetSize + c] & ( 1 << shift)) ? current_leave : current_leave + 1;
+                        sum += leaves[idx];
+
+                        current_node += 1;
+                        current_leave += 2;
+                    }
+
+                    if (sum < stage.threshold)
+                        return;
+                }
+
+                int4 rect;
+                rect.x = roundf(x * scale);
+                rect.y = roundf(y * scale);
+                rect.z = clWidth;
+                rect.w = clHeight;
+
+#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
+                int res = __atomicInc(n, maxN);
+#else
+                int res = atomicInc(n, maxN);
+#endif
+                objects(0, res) = rect;
+            }
+
+            const Stage* stages;
+            const ClNode* nodes;
+            const float* leaves;
+            const int* subsets;
+            const uchar4* features;
+
+            const int nstages;
+            const int clWidth;
+            const int clHeight;
+            const float scale;
+            const int step;
+            const int subsetSize;
+            const LBP evaluator;
+        };
+
+        __global__ void lbp_classify_stump(const Classifier classifier, DevMem2D_<int4> objects, const unsigned int maxN, unsigned int* n)
+        {
+            int x = threadIdx.x * classifier.step;
+            int y = blockIdx.x * classifier.step;
+
+            classifier(y, x, objects, maxN, n);
+        }
+
+        __global__ void lbp_classify_stump(const Classifier classifier, DevMem2D_<int4> objects, const unsigned int maxN, unsigned int* n, int lines, int maxX)
+        {
+            int x = threadIdx.x * lines * classifier.step;
+            if (x >= maxX) return;
+
+            int y = blockIdx.x * classifier.step / lines;
+
+            classifier(y, x, objects, maxN, n);
        }

        template<typename Pr>
        __global__ void disjoin(int4* candidates, int4* objects, unsigned int n, int groupThreshold, float grouping_eps, unsigned int* nclasses)
        {
-            using cv::gpu::device::VecTraits;
            unsigned int tid = threadIdx.x;
            extern __shared__ int sbuff[];

@@ -207,23 +242,26 @@ namespace cv { namespace gpu { namespace device

            int cls = labels[tid];
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
-            __atomicAdd((int*)(rrects + cls * 4 + 0), candidates[tid].x);
-            __atomicAdd((int*)(rrects + cls * 4 + 1), candidates[tid].y);
-            __atomicAdd((int*)(rrects + cls * 4 + 2), candidates[tid].z);
-            __atomicAdd((int*)(rrects + cls * 4 + 3), candidates[tid].w);
+            __atomicAdd((rrects + cls * 4 + 0), candidates[tid].x);
+            __atomicAdd((rrects + cls * 4 + 1), candidates[tid].y);
+            __atomicAdd((rrects + cls * 4 + 2), candidates[tid].z);
+            __atomicAdd((rrects + cls * 4 + 3), candidates[tid].w);
#else
-            atomicAdd((int*)(rrects + cls * 4 + 0), candidates[tid].x);
-            atomicAdd((int*)(rrects + cls * 4 + 1), candidates[tid].y);
-            atomicAdd((int*)(rrects + cls * 4 + 2), candidates[tid].z);
-            atomicAdd((int*)(rrects + cls * 4 + 3), candidates[tid].w);
+            atomicAdd((rrects + cls * 4 + 0), candidates[tid].x);
+            atomicAdd((rrects + cls * 4 + 1), candidates[tid].y);
+            atomicAdd((rrects + cls * 4 + 2), candidates[tid].z);
+            atomicAdd((rrects + cls * 4 + 3), candidates[tid].w);
#endif
+            __syncthreads();
            labels[tid] = 0;

            __syncthreads();
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
            __atomicInc((unsigned int*)labels + cls, n);
#else
            atomicInc((unsigned int*)labels + cls, n);
#endif
+            __syncthreads();
            *nclasses = 0;

            int active = labels[tid];
|
|||||||
r1[1] = saturate_cast<int>(r1[1] * s);
|
r1[1] = saturate_cast<int>(r1[1] * s);
|
||||||
r1[2] = saturate_cast<int>(r1[2] * s);
|
r1[2] = saturate_cast<int>(r1[2] * s);
|
||||||
r1[3] = saturate_cast<int>(r1[3] * s);
|
r1[3] = saturate_cast<int>(r1[3] * s);
|
||||||
|
}
|
||||||
|
__syncthreads();
|
||||||
|
|
||||||
int n1 = active;
|
if (active && active >= groupThreshold)
|
||||||
__syncthreads();
|
{
|
||||||
unsigned int j = 0;
|
int* r1 = rrects + tid * 4;
|
||||||
if( active > groupThreshold )
|
int4 r_out;
|
||||||
{
|
r_out.x = r1[0];
|
||||||
for (j = 0; j < n; j++)
|
r_out.y = r1[1];
|
||||||
{
|
r_out.z = r1[2];
|
||||||
int n2 = labels[j];
|
r_out.w = r1[3];
|
||||||
if(!n2 || j == tid || n2 <= groupThreshold )
|
|
||||||
continue;
|
|
||||||
|
|
||||||
int* r2 = rrects + j * 4;
|
|
||||||
|
|
||||||
int dx = saturate_cast<int>( r2[2] * grouping_eps );
|
|
||||||
int dy = saturate_cast<int>( r2[3] * grouping_eps );
|
|
||||||
|
|
||||||
if( tid != j && r1[0] >= r2[0] - dx && r1[1] >= r2[1] - dy &&
|
|
||||||
r1[0] + r1[2] <= r2[0] + r2[2] + dx && r1[1] + r1[3] <= r2[1] + r2[3] + dy &&
|
|
||||||
(n2 > max(3, n1) || n1 < 3) )
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
if( j == n)
|
|
||||||
{
|
|
||||||
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
|
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
|
||||||
objects[__atomicInc(nclasses, n)] = VecTraits<int4>::make(r1[0], r1[1], r1[2], r1[3]);
|
objects[__atomicInc(nclasses, n)] = r_out;
|
||||||
#else
|
#else
|
||||||
objects[atomicInc(nclasses, n)] = VecTraits<int4>::make(r1[0], r1[1], r1[2], r1[3]);
|
int aidx = atomicInc(nclasses, n);
|
||||||
|
objects[aidx] = r_out;
|
||||||
#endif
|
#endif
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void classifyStump(const DevMem2Db& mstages, const int nstages, const DevMem2Di& mnodes, const DevMem2Df& mleaves, const DevMem2Di& msubsets, const DevMem2Db& mfeatures,
|
void classifyStump(const DevMem2Db& mstages, const int nstages, const DevMem2Di& mnodes, const DevMem2Df& mleaves, const DevMem2Di& msubsets, const DevMem2Db& mfeatures,
|
||||||
/*const DevMem2Di& integral,*/ const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize,
|
const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize, DevMem2D_<int4> objects, unsigned int* classified)
|
||||||
DevMem2D_<int4> objects, unsigned int* classified)
|
|
||||||
{
|
{
|
||||||
int blocks = ceilf(workHeight / (float)step);
|
int blocks = ceilf(workHeight / (float)step);
|
||||||
int threads = ceilf(workWidth / (float)step);
|
int threads = ceilf(workWidth / (float)step);
|
||||||
|
|
||||||
Stage* stages = (Stage*)(mstages.ptr());
|
Classifier clr((Stage*)(mstages.ptr()), (ClNode*)(mnodes.ptr()), mleaves.ptr(), msubsets.ptr(), (uchar4*)(mfeatures.ptr()), nstages, clWidth, clHeight, scale, step, subsetSize);
|
||||||
ClNode* nodes = (ClNode*)(mnodes.ptr());
|
lbp_classify_stump<<<blocks, threads>>>(clr, objects, objects.cols, classified);
|
||||||
const float* leaves = mleaves.ptr();
|
|
||||||
const int* subsets = msubsets.ptr();
|
|
||||||
const uchar4* features = (uchar4*)(mfeatures.ptr());
|
|
||||||
lbp_classify_stump<<<blocks, threads>>>(stages, nstages, nodes, leaves, subsets, features, /*integ, istep,
|
|
||||||
workWidth, workHeight,*/ clWidth, clHeight, scale, step, subsetSize, objects, classified);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int connectedConmonents(DevMem2D_<int4> candidates, DevMem2D_<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses)
|
void classifyStumpFixed(const DevMem2Db& mstages, const int nstages, const DevMem2Di& mnodes, const DevMem2Df& mleaves, const DevMem2Di& msubsets, const DevMem2Db& mfeatures,
|
||||||
|
const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize, DevMem2D_<int4> objects, unsigned int* classified,
|
||||||
|
int maxX)
|
||||||
{
|
{
|
||||||
int threads = candidates.cols;
|
const int THREADS_BLOCK = 256;
|
||||||
|
int blocks = ceilf(workHeight / (float)step);
|
||||||
|
int threads = ceilf(workWidth / (float)step);
|
||||||
|
|
||||||
|
Classifier clr((Stage*)(mstages.ptr()), (ClNode*)(mnodes.ptr()), mleaves.ptr(), msubsets.ptr(), (uchar4*)(mfeatures.ptr()), nstages, clWidth, clHeight, scale, step, subsetSize);
|
||||||
|
int lines = divUp(threads, THREADS_BLOCK);
|
||||||
|
lbp_classify_stump<<<blocks * lines, THREADS_BLOCK>>>(clr, objects, objects.cols, classified, lines, maxX);
|
||||||
|
}
|
||||||
|
|
||||||
|
int connectedConmonents(DevMem2D_<int4> candidates, int ncandidates, DevMem2D_<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses)
|
||||||
|
{
|
||||||
|
int threads = ncandidates;
|
||||||
int smem_amount = threads * sizeof(int) + threads * sizeof(int4);
|
int smem_amount = threads * sizeof(int) + threads * sizeof(int4);
|
||||||
disjoin<InSameComponint><<<1, threads, smem_amount>>>((int4*)candidates.ptr(), (int4*)objects.ptr(), candidates.cols, groupThreshold, grouping_eps, nclasses);
|
disjoin<InSameComponint><<<1, threads, smem_amount>>>((int4*)candidates.ptr(), (int4*)objects.ptr(), ncandidates, groupThreshold, grouping_eps, nclasses);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
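classifyStumpFixed above is the big-image path: one block per image row with ceil(workWidth / step) threads stops being a valid launch once the scaled image is wide enough to exceed the per-block thread limit, so each row is split into `lines` chunks of THREADS_BLOCK threads and the kernel discards threads past maxX. The following C++ sketch is not part of the patch; it only replays the launch-configuration arithmetic with assumed sizes (divUp mirrors the helper used above).

// Sketch only: grid/block arithmetic behind the blocks * lines launch.
#include <cstdio>

static int divUp(int a, int b) { return (a + b - 1) / b; }

int main()
{
    const int THREADS_BLOCK = 256;
    const int workWidth = 3000, workHeight = 1600, step = 1;   // assumed sizes

    int blocks  = divUp(workHeight, step);          // rows of detection windows
    int threads = divUp(workWidth, step);           // windows per row
    int lines   = divUp(threads, THREADS_BLOCK);    // blocks needed per row
    int maxX    = workWidth;                        // guard for the last partial block

    std::printf("one-block-per-row launch: <<<%d, %d>>>  (exceeds the per-block thread limit here)\n",
                blocks, threads);
    std::printf("big-image launch:         <<<%d, %d>>>  lines=%d  maxX=%d\n",
                blocks * lines, THREADS_BLOCK, lines, maxX);
    return 0;
}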
@@ -61,6 +61,7 @@ __device__ __forceinline__ T __atomicInc(T* address, T val)
        count = tag | (count + 1);
        *address = count;
    } while (*address != count);
+
    return (count & TAG_MASK) - 1;
}

@@ -85,6 +86,7 @@ __device__ __forceinline__ T __atomicMin(T* address, T val)
    {
        *address = count;
    } while (*address > count);
+
    return count;
}

@@ -151,7 +153,6 @@ __device__ __forceinline__ T __atomicMin(T* address, T val)
            }
        }
        __syncthreads();
-        // printf("tid %d label %d\n", tid, labels[tid]);
    }
} // lbp

|
Loading…
x
Reference in New Issue
Block a user