LBP: multiscale approach; refactored atomics usage
This commit is contained in:
parent 5dc7752d54
commit e63ab8dec5
@@ -1464,6 +1464,7 @@ private:
     GpuMat resuzeBuffer;

     GpuMat candidates;
+    static const int integralFactor = 4;
 };

 ////////////////////////////////// SURF //////////////////////////////////////////

@@ -67,7 +67,7 @@ cv::gpu::CascadeClassifier_GPU_LBP::~CascadeClassifier_GPU_LBP()
 bool cv::gpu::CascadeClassifier_GPU_LBP::empty() const { throw_nogpu(); return true; }
 bool cv::gpu::CascadeClassifier_GPU_LBP::load(const string&) { throw_nogpu(); return true; }
 Size cv::gpu::CascadeClassifier_GPU_LBP::getClassifierSize() const { throw_nogpu(); return Size(); }
 void cv::gpu::CascadeClassifier_GPU_LBP::allocateBuffers(cv::Size /*frame*/) { throw_nogpu();}

 int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const cv::gpu::GpuMat& /*image*/, cv::gpu::GpuMat& /*objectsBuf*/,
     double /*scaleFactor*/, int /*minNeighbors*/, cv::Size /*maxObjectSize*/){ throw_nogpu(); return 0;}
@@ -86,7 +86,7 @@ void cv::gpu::CascadeClassifier_GPU_LBP::allocateBuffers(cv::Size frame)
 {
     resuzeBuffer.create(frame, CV_8UC1);

-    integral.create(frame.height + 1, frame.width + 1, CV_32SC1);
+    integral.create(frame.height + 1, integralFactor * (frame.width + 1), CV_32SC1);
     NcvSize32u roiSize;
     roiSize.width = frame.width;
     roiSize.height = frame.height;
@@ -284,14 +284,83 @@ namespace cv { namespace gpu { namespace device
                                  DevMem2D_<int4> objects,
                                  unsigned int* classified);

+        void classifyPyramid(int frameW,
+                             int frameH,
+                             int windowW,
+                             int windowH,
+                             float initalScale,
+                             float factor,
+                             int total,
+                             const DevMem2Db& mstages,
+                             const int nstages,
+                             const DevMem2Di& mnodes,
+                             const DevMem2Df& mleaves,
+                             const DevMem2Di& msubsets,
+                             const DevMem2Db& mfeatures,
+                             const int subsetSize,
+                             DevMem2D_<int4> objects,
+                             unsigned int* classified,
+                             DevMem2Di integral);

         void connectedConmonents(DevMem2D_<int4> candidates, int ncandidates, DevMem2D_<int4> objects,int groupThreshold, float grouping_eps, unsigned int* nclasses);
         void bindIntegral(DevMem2Di integral);
         void unbindIntegral();
     }
 }}}

-int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, GpuMat& objects,
-                                                         double scaleFactor, int groupThreshold, cv::Size maxObjectSize /*, Size minSize=Size()*/)
+cv::Size operator -(const cv::Size& a, const cv::Size& b)
+{
+    return cv::Size(a.width - b.width, a.height - b.height);
+}
+
+cv::Size operator +(const cv::Size& a, const int& i)
+{
+    return cv::Size(a.width + i, a.height + i);
+}
+
+cv::Size operator *(const cv::Size& a, const float& f)
+{
+    return cv::Size(cvRound(a.width * f), cvRound(a.height * f));
+}
+
+cv::Size operator /(const cv::Size& a, const float& f)
+{
+    return cv::Size(cvRound(a.width / f), cvRound(a.height / f));
+}
+
+bool operator <=(const cv::Size& a, const cv::Size& b)
+{
+    return a.width <= b.width && a.height <= b.width;
+}
+
+struct PyrLavel
+{
+    PyrLavel(int _order, float _scale, cv::Size frame, cv::Size window) : order(_order)
+    {
+        scale = pow(_scale, order);
+        sFrame = frame / scale;
+        workArea = sFrame - window + 1;
+        sWindow = window * scale;
+    }
+
+    bool isFeasible(cv::Size maxObj)
+    {
+        return workArea.width > 0 && workArea.height > 0 && sWindow <= maxObj;
+    }
+
+    PyrLavel next(float factor, cv::Size frame, cv::Size window)
+    {
+        return PyrLavel(order + 1, factor, frame, window);
+    }
+
+    int order;
+    float scale;
+    cv::Size sFrame;
+    cv::Size workArea;
+    cv::Size sWindow;
+};
+
+int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, GpuMat& objects, double scaleFactor, int groupThreshold, cv::Size maxObjectSize)
 {
     CV_Assert(!empty() && scaleFactor > 1 && image.depth() == CV_8U);

@@ -306,6 +375,7 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
     // used for debug
     // candidates.setTo(cv::Scalar::all(0));
     // objects.setTo(cv::Scalar::all(0));

     if (maxObjectSize == cv::Size())
         maxObjectSize = image.size();

@@ -315,52 +385,54 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
     GpuMat dclassified(1, 1, CV_32S);
     cudaSafeCall( cudaMemcpy(dclassified.ptr(), &classified, sizeof(int), cudaMemcpyHostToDevice) );

-    // cv::gpu::device::lbp::bindIntegral(integral);
+    PyrLavel level(0, 1.0f, image.size(), NxM);

-    Size scaledImageSize(image.cols, image.rows);
-    Size processingRectSize( scaledImageSize.width - NxM.width + 1, scaledImageSize.height - NxM.height + 1 );
-    Size windowSize(NxM.width, NxM.height);
-
-    float factor = 1;
-
-    for (;;)
+    while (level.isFeasible(maxObjectSize))
     {
-        if (processingRectSize.width <= 0 || processingRectSize.height <= 0 )
-            break;
+        int acc = level.sFrame.width + 1;
+        float iniScale = level.scale;
+        cv::Size area = level.workArea;
+        float step = (float)(1 + (level.scale <= 2.f));

-        if( windowSize.width > maxObjectSize.width || windowSize.height > maxObjectSize.height )
-            break;
+        int total = 0, prev = 0;

-        // if( windowSize.width < minObjectSize.width || windowSize.height < minObjectSize.height )
-        //     continue;
+        while (acc <= integralFactor * (image.cols + 1) && level.isFeasible(maxObjectSize))
+        {
+            // create sutable matrix headers
+            GpuMat src = resuzeBuffer(cv::Rect(0, 0, level.sFrame.width, level.sFrame.height));
+            GpuMat sint = integral(cv::Rect(prev, 0, level.sFrame.width + 1, level.sFrame.height + 1));
+            GpuMat buff = integralBuffer;

-        GpuMat scaledImg = resuzeBuffer(cv::Rect(0, 0, scaledImageSize.width, scaledImageSize.height));
-        GpuMat scaledIntegral = integral(cv::Rect(0, 0, scaledImageSize.width + 1, scaledImageSize.height + 1));
-        GpuMat currBuff = integralBuffer;
+            // generate integral for scale
+            gpu::resize(image, src, level.sFrame, 0, 0, CV_INTER_LINEAR);
+            gpu::integralBuffered(src, sint, buff);

-        gpu::resize(image, scaledImg, scaledImageSize, 0, 0, CV_INTER_LINEAR);
-        gpu::integralBuffered(scaledImg, scaledIntegral, currBuff);
+            total += cvCeil(area.width / step) * cvCeil(area.height / step);
+            // std::cout << "Total for scale: " << total << " this step contribution " << cvCeil(area.width / step) * cvCeil(area.height / step) << " previous width shift " << prev << " acc " << acc << " scales: " << cvCeil(area.width / step) << std::endl;

-        int step = factor <= 2.f ? 2 : 1;
+            // increment pyr lavel
+            level = level.next(scaleFactor, image.size(), NxM);
+            area = level.workArea;

-        device::lbp::classifyStumpFixed(integral, integral.step1(), stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat, leaves_mat, subsets_mat, features_mat,
-            processingRectSize.width, processingRectSize.height, windowSize.width, windowSize.height, factor, step, subsetSize, candidates, dclassified.ptr<unsigned int>());
+            step = (float)(1 + (level.scale <= 2.f));
+            prev = acc;
+            acc += level.sFrame.width + 1;
+        }

-        factor *= scaleFactor;
-        windowSize = cv::Size(cvRound(NxM.width * factor), cvRound(NxM.height * factor));
-        scaledImageSize = cv::Size(cvRound( image.cols / factor ), cvRound( image.rows / factor ));
-        processingRectSize = cv::Size(scaledImageSize.width - NxM.width + 1, scaledImageSize.height - NxM.height + 1 );
+        device::lbp::classifyPyramid(image.cols, image.rows, NxM.width, NxM.height, iniScale, scaleFactor, total, stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat,
+            leaves_mat, subsets_mat, features_mat, subsetSize, candidates, dclassified.ptr<unsigned int>(), integral);
     }

-    // cv::gpu::device::lbp::unbindIntegral();
     if (groupThreshold <= 0 || objects.empty())
         return 0;

     cudaSafeCall( cudaMemcpy(&classified, dclassified.ptr(), sizeof(int), cudaMemcpyDeviceToHost) );
     device::lbp::connectedConmonents(candidates, classified, objects, groupThreshold, grouping_eps, dclassified.ptr<unsigned int>());

+    // candidates.copyTo(objects);
     cudaSafeCall( cudaMemcpy(&classified, dclassified.ptr(), sizeof(int), cudaMemcpyDeviceToHost) );
     cudaSafeCall( cudaDeviceSynchronize() );
+    // std::cout << classified << " !!!!!!!!!!" << std::endl;

     return classified;
 }
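For context on the hunk above: the rewritten detectMultiScale packs one integral image per pyramid level side by side into a single buffer whose width is integralFactor * (frame.width + 1), and launches one classifyPyramid pass per packed batch. The following is a minimal host-side C++ sketch of that packing arithmetic only; the frame size, window size and all variable names are hypothetical illustrations, not taken from the commit.

// Sketch only: mirrors the prev/acc bookkeeping of the new detectMultiScale loop.
#include <cmath>
#include <cstdio>

int main()
{
    const int   frameW = 640, frameH = 480;   // hypothetical input frame
    const int   winW = 24, winH = 24;         // hypothetical classifier window (NxM)
    const float scaleFactor = 1.2f;
    const int   integralFactor = 4;           // constant added to the header in this commit

    float scale = 1.0f;                       // like PyrLavel::scale for level 'order'
    int used = 0;                             // columns of the packed integral consumed so far ('acc')

    for (int order = 0; ; ++order, scale *= scaleFactor)
    {
        int sW = (int)std::lround(frameW / scale);        // scaled frame (PyrLavel::sFrame)
        int sH = (int)std::lround(frameH / scale);
        int workW = sW - winW + 1;                        // sliding-window work area
        int workH = sH - winH + 1;
        if (workW <= 0 || workH <= 0)
            break;                                        // level is no longer feasible

        if (used + sW + 1 > integralFactor * (frameW + 1))
            break;                                        // batch full: the real code launches classifyPyramid here and keeps going

        float step = 1.f + (scale <= 2.f);                // 2-pixel stride at fine scales, 1 at coarse
        int windows = (int)std::ceil(workW / step) * (int)std::ceil(workH / step);

        printf("level %d: scale %.2f, %dx%d, x-offset %d, %d windows\n",
               order, scale, sW, sH, used, windows);

        used += sW + 1;                                   // next level's integral starts right after this one
    }
    return 0;
}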
@@ -216,10 +216,10 @@ namespace cv { namespace gpu { namespace device

         struct Classifier
         {
             __host__ __device__ __forceinline__ Classifier(const int* _integral, int _pitch, const Stage* _stages, const ClNode* _nodes, const float* _leaves,
                 const int* _subsets, const uchar4* _features, int _nstages, int _clWidth, int _clHeight, float _scale, int _step, int _subsetSize)
             : integral(_integral), pitch(_pitch), stages(_stages), nodes(_nodes), leaves(_leaves), subsets(_subsets), features(_features), nstages(_nstages),
               clWidth(_clWidth), clHeight(_clHeight), scale(_scale), step(_step), subsetSize(_subsetSize){}

             __device__ __forceinline__ void operator() (int y, int x, DevMem2D_<int4> objects, const unsigned int maxN, unsigned int* n) const
             {
@@ -255,11 +255,7 @@ namespace cv { namespace gpu { namespace device
                 rect.z = clWidth;
                 rect.w = clHeight;

-#if (__CUDA_ARCH__ < 120)
-                int res = __atomicInc(n, maxN);
-#else
-                int res = atomicInc(n, maxN);
-#endif
+                int res = Emulation::smem::atomicInc(n, maxN);
                 objects(0, res) = rect;
             }

@@ -317,26 +313,17 @@ namespace cv { namespace gpu { namespace device
             __syncthreads();

             int cls = labels[tid];
-#if (__CUDA_ARCH__ < 120)
-            __atomicAdd((rrects + cls * 4 + 0), candidates[tid].x);
-            __atomicAdd((rrects + cls * 4 + 1), candidates[tid].y);
-            __atomicAdd((rrects + cls * 4 + 2), candidates[tid].z);
-            __atomicAdd((rrects + cls * 4 + 3), candidates[tid].w);
-#else
-            atomicAdd((rrects + cls * 4 + 0), candidates[tid].x);
-            atomicAdd((rrects + cls * 4 + 1), candidates[tid].y);
-            atomicAdd((rrects + cls * 4 + 2), candidates[tid].z);
-            atomicAdd((rrects + cls * 4 + 3), candidates[tid].w);
-#endif
+            Emulation::smem::atomicAdd((rrects + cls * 4 + 0), candidates[tid].x);
+            Emulation::smem::atomicAdd((rrects + cls * 4 + 1), candidates[tid].y);
+            Emulation::smem::atomicAdd((rrects + cls * 4 + 2), candidates[tid].z);
+            Emulation::smem::atomicAdd((rrects + cls * 4 + 3), candidates[tid].w);
             __syncthreads();
             labels[tid] = 0;

             __syncthreads();
-#if (__CUDA_ARCH__ < 120)
-            __atomicInc((unsigned int*)labels + cls, n);
-#else
-            atomicInc((unsigned int*)labels + cls, n);
-#endif
+            Emulation::smem::atomicInc((unsigned int*)labels + cls, n);
             __syncthreads();
             *nclasses = 0;

@@ -354,30 +341,26 @@ namespace cv { namespace gpu { namespace device

             if (active && active >= groupThreshold)
             {
                 int* r1 = rrects + tid * 4;
                 int4 r_out = make_int4(r1[0], r1[1], r1[2], r1[3]);

-#if (__CUDA_ARCH__ < 120)
-                objects[__atomicInc(nclasses, n)] = r_out;
-#else
-                int aidx = atomicInc(nclasses, n);
+                int aidx = Emulation::smem::atomicInc(nclasses, n);
                 objects[aidx] = r_out;
-#endif
             }
         }

         void classifyStumpFixed(const DevMem2Di& integral, const int pitch, const DevMem2Db& mstages, const int nstages, const DevMem2Di& mnodes, const DevMem2Df& mleaves, const DevMem2Di& msubsets, const DevMem2Db& mfeatures,
             const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize, DevMem2D_<int4> objects, unsigned int* classified)
         {
             Classifier clr(integral, pitch, (Stage*)mstages.ptr(), (ClNode*)mnodes.ptr(), mleaves.ptr(), msubsets,
                 (uchar4*)mfeatures.ptr(), nstages, clWidth, clHeight, scale, step, subsetSize);

             int total = ceilf(workHeight / (float)step) * ceilf(workWidth / (float)step);

             int block = 256;
             int grid = divUp(total, block);
             lbp_classify_stump<<<grid, block>>>(clr, objects, objects.cols, classified, workWidth >> 1);
             cudaSafeCall( cudaGetLastError() );
         }

         void connectedConmonents(DevMem2D_<int4> candidates, int ncandidates, DevMem2D_<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses)
@@ -385,7 +368,124 @@ namespace cv { namespace gpu { namespace device
             int block = ncandidates;
             int smem = block * ( sizeof(int) + sizeof(int4) );
             disjoin<InSameComponint><<<1, block, smem>>>(candidates, objects, ncandidates, groupThreshold, grouping_eps, nclasses);
             cudaSafeCall( cudaGetLastError() );
+        }
+
+        struct Cascade
+        {
+            __host__ __device__ __forceinline__ Cascade(const Stage* _stages, int _nstages, const ClNode* _nodes, const float* _leaves,
+                const int* _subsets, const uchar4* _features, int _subsetSize)
+
+            : stages(_stages), nstages(_nstages), nodes(_nodes), leaves(_leaves), subsets(_subsets), features(_features), subsetSize(_subsetSize){}
+
+            __device__ __forceinline__ bool operator() (int y, int x, int* integral, const int pitch/*, DevMem2D_<int4> objects, const unsigned int maxN, unsigned int* n*/) const
+            {
+                int current_node = 0;
+                int current_leave = 0;
+
+                for (int s = 0; s < nstages; ++s)
+                {
+                    float sum = 0;
+                    Stage stage = stages[s];
+                    for (int t = 0; t < stage.ntrees; t++)
+                    {
+                        ClNode node = nodes[current_node];
+                        uchar4 feature = features[node.featureIdx];
+
+                        int shift;
+                        int c = evaluator(integral, (y + feature.y) * pitch + x + feature.x, feature.w * pitch, feature.z, shift);
+                        int idx = (subsets[ current_node * subsetSize + c] & ( 1 << shift)) ? current_leave : current_leave + 1;
+                        sum += leaves[idx];
+
+                        current_node += 1;
+                        current_leave += 2;
+                    }
+
+                    if (sum < stage.threshold)
+                        return false;
+                }
+
+                return true;
+            }
+
+            const Stage* stages;
+            const int nstages;
+
+            const ClNode* nodes;
+            const float* leaves;
+            const int* subsets;
+            const uchar4* features;
+
+            const int subsetSize;
+            const LBP evaluator;
+        };
+
+        // stepShift, scale, width_k, sum_prev => y = sum_prev + tid_k / width_k, x = tid_k - tid_k / width_k
+        __global__ void lbp_cascade(const Cascade cascade, int frameW, int frameH, int windowW, int windowH, float scale, const float factor,
+            const int workAmount, int* integral, const int pitch, DevMem2D_<int4> objects, unsigned int* classified)
+        {
+            int ftid = blockIdx.x * blockDim.x + threadIdx.x;
+            if (ftid >= workAmount ) return;
+
+            int sum = 0;
+            // float scale = 1.0f;
+            float stepShift = (scale <= 2.f) ? 2.0 : 1.0;
+            int w = ceilf( ( __float2int_rn(frameW / scale) - windowW + 1) / stepShift);
+            int h = ceilf( ( __float2int_rn(frameH / scale) - windowH + 1) / stepShift);
+
+            // if (!ftid)
+            //     printf("!!!!: %d %d", w, h);
+
+            int framTid = ftid;
+            int i = 0;
+
+            while (1)
+            {
+                if (framTid < (w - 1) * (h - 1)) break;
+                i++;
+                sum += __float2int_rn(frameW / scale) + 1;
+                framTid -= w * h;
+                scale *= factor;
+                stepShift = (scale <= 2.f) ? 2.0 : 1.0;
+                int w = ceilf( ( __float2int_rn(frameW / scale) - windowW + 1) / stepShift);
+                int h = ceilf( ( __float2int_rn(frameH / scale) - windowH + 1) / stepShift);
+            }
+
+            int y = (framTid / w);
+            int x = (framTid - y * w) * stepShift;
+            y *= stepShift;
+            x += sum;
+
+            // if (i == 2)
+            //     printf("!!!!!!!!!!!!!! %f %d %d %d\n", windowW * scale, sum, y, x);
+
+            if (cascade(y, x, integral, pitch))
+            {
+                int4 rect;
+                rect.x = roundf( (x - sum) * scale);
+                rect.y = roundf(y * scale);
+                rect.z = roundf(windowW * scale);
+                rect.w = roundf(windowH * scale);
+
+                if (rect.x > frameW || rect.y > frameH) return;
+                // printf("OUTLAUER %d %d %d %d %d %d %d %d %d %f %f\n", x, y, ftid, framTid, rect.x, rect.y, sum, w, h, stepShift, scale);
+
+                // printf("passed: %d %d ---- %d %d %d %d %d\n", y, x, rect.x, rect.y, rect.z, rect.w, sum);
+
+                int res = Emulation::smem::atomicInc(classified, (unsigned int)objects.cols);
+                objects(0, res) = rect;
+
+            }
+        }
+
+        void classifyPyramid(int frameW, int frameH, int windowW, int windowH, float initialScale, float factor, int workAmount,
+            const DevMem2Db& mstages, const int nstages, const DevMem2Di& mnodes, const DevMem2Df& mleaves, const DevMem2Di& msubsets, const DevMem2Db& mfeatures,
+            const int subsetSize, DevMem2D_<int4> objects, unsigned int* classified, DevMem2Di integral)
+        {
+            const int block = 256;
+            int grid = divUp(workAmount, block);
+            Cascade cascade((Stage*)mstages.ptr(), nstages, (ClNode*)mnodes.ptr(), mleaves.ptr(), msubsets.ptr(), (uchar4*)mfeatures.ptr(), subsetSize);
+            lbp_cascade<<<grid, block>>>(cascade, frameW, frameH, windowW, windowH, initialScale, factor, workAmount, integral.ptr(), integral.step / sizeof(int), objects, classified);
         }
     }
 }}}
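To make the indexing in the new lbp_cascade kernel above easier to follow: each thread gets one flat id across every sliding window of every pyramid level, and recovers its level (scale), its (x, y) inside that level, and the x-offset ('sum') of that level's slice of the packed integral image. The host-side C++ sketch below walks the same arithmetic with a slightly simplified per-level bound (the kernel's own check differs); the frame, window and sample thread id are made-up values.

// Sketch only: decode a flat work-item id into (scale, x, y), in the spirit of lbp_cascade.
#include <cmath>
#include <cstdio>

int main()
{
    const int   frameW = 640, frameH = 480;   // hypothetical frame
    const int   winW = 24, winH = 24;         // hypothetical window
    const float factor = 1.2f;

    int   ftid = 100000;                      // hypothetical flat thread id
    float scale = 1.0f;
    float stepShift = (scale <= 2.f) ? 2.f : 1.f;
    int   w = (int)std::ceil(((int)std::lround(frameW / scale) - winW + 1) / stepShift);
    int   h = (int)std::ceil(((int)std::lround(frameH / scale) - winH + 1) / stepShift);
    int   sum = 0;                            // x-offset of the current level in the packed integral

    while (ftid >= w * h)                     // this id belongs to a later (coarser) level
    {
        ftid -= w * h;
        sum  += (int)std::lround(frameW / scale) + 1;     // skip this level's integral columns
        scale *= factor;
        stepShift = (scale <= 2.f) ? 2.f : 1.f;
        w = (int)std::ceil(((int)std::lround(frameW / scale) - winW + 1) / stepShift);
        h = (int)std::ceil(((int)std::lround(frameH / scale) - winH + 1) / stepShift);
    }

    int y = (int)((ftid / w) * stepShift);                // window position in the scaled frame
    int x = (int)((ftid % w) * stepShift) + sum;          // shifted into this level's integral slice

    printf("scale %.2f, integral coords (%d, %d), frame rect at (%d, %d) size %dx%d\n",
           scale, x, y, (int)std::lround((x - sum) * scale), (int)std::lround(y * scale),
           (int)std::lround(winW * scale), (int)std::lround(winH * scale));
    return 0;
}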
@@ -44,18 +44,19 @@
 #define OPENCV_GPU_EMULATION_HPP_

 #include "warp_reduce.hpp"
+#include <stdio.h>

 namespace cv { namespace gpu { namespace device
 {
     struct Emulation
     {
         template<int CTA_SIZE>
         static __forceinline__ __device__ int Ballot(int predicate)
         {
-#if (__CUDA_ARCH__ >= 200)
+#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
             return __ballot(predicate);
 #else
-            __shared__ volatile int cta_buffer[CTA_SIZE]
+            __shared__ volatile int cta_buffer[CTA_SIZE];

             int tid = threadIdx.x;
             cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0;
@@ -63,41 +64,62 @@ namespace cv { namespace gpu { namespace device
 #endif
         }

         struct smem
         {
             enum { TAG_MASK = (1U << ( (sizeof(unsigned int) << 3) - 5U)) - 1U };

             template<typename T>
             static __device__ __forceinline__ T atomicInc(T* address, T val)
             {
-#if (__CUDA_ARCH__ < 120)
-#else
-#endif
-            }
-
-            template<typename T>
-            static __device__ __forceinline__ void atomicAdd(T* address, T val)
-            {
-#if (__CUDA_ARCH__ < 120)
-#else
-#endif
-            }
-
-            template<typename T>
-            __device__ __forceinline__ T __atomicMin(T* address, T val)
-            {
-#if (__CUDA_ARCH__ < 120)
-#else
-#endif
-            }
-        };
+#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
+                T count;
+                unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
+                do
+                {
+                    count = *address & TAG_MASK;
+                    count = tag | (count + 1);
+                    *address = count;
+                } while (*address != count);
+
+                return (count & TAG_MASK) - 1;
+#else
+                return ::atomicInc(address, val);
+#endif
+            }
+
+            template<typename T>
+            static __device__ __forceinline__ void atomicAdd(T* address, T val)
+            {
+#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
+                T count;
+                unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
+                do
+                {
+                    count = *address & TAG_MASK;
+                    count = tag | (count + val);
+                    *address = count;
+                } while (*address != count);
+#else
+                ::atomicAdd(address, val);
+#endif
+            }
+
+            template<typename T>
+            static __device__ __forceinline__ T atomicMin(T* address, T val)
+            {
+#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
+                T count = min(*address, val);
+                do
+                {
+                    *address = count;
+                } while (*address > count);
+
+                return count;
+#else
+                return ::atomicMin(address, val);
+#endif
+            }
+        };
     };
 }}} // namespace cv { namespace gpu { namespace device
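The pre-SM-1.2 branch of Emulation::smem above emulates shared-memory atomics with a tag trick: every contending thread stamps the top 5 bits of the word with its lane id, and only the thread whose stamped write survives the race leaves the retry loop. The small host-only C++ program below merely illustrates the bit layout for a single, uncontended update; the lane id and values are made up.

// Sketch only: the TAG_MASK layout used by the emulated atomics
// (low 27 bits = counter value, top 5 bits = id of the thread whose write won).
#include <cstdio>

int main()
{
    const unsigned int TAG_SHIFT = (sizeof(unsigned int) << 3) - 5U;   // 32 - 5 = 27
    const unsigned int TAG_MASK  = (1U << TAG_SHIFT) - 1U;             // low 27 bits

    unsigned int word = 0;                     // stands in for the shared-memory cell
    unsigned int tid  = 19;                    // hypothetical lane id, fits in 5 bits

    unsigned int count = word & TAG_MASK;      // strip the previous winner's tag
    word = (tid << TAG_SHIFT) | (count + 1);   // stamp our tag onto the incremented value

    printf("stored word  = 0x%08x\n", word);
    printf("counter      = %u\n", word & TAG_MASK);    // 1
    printf("winning lane = %u\n", word >> TAG_SHIFT);  // 19
    return 0;
}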
@@ -44,52 +44,11 @@
 #define __OPENCV_GPU_DEVICE_LBP_HPP_

 #include "internal_shared.hpp"
+#include <opencv2/gpu/device/emulation.hpp>

 namespace cv { namespace gpu { namespace device {

-namespace lbp{
+namespace lbp {

-#define TAG_MASK ( (1U << ( (sizeof(unsigned int) << 3) - 5U)) - 1U )
-
-template<typename T>
-__device__ __forceinline__ T __atomicInc(T* address, T val)
-{
-    T count;
-    unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
-    do
-    {
-        count = *address & TAG_MASK;
-        count = tag | (count + 1);
-        *address = count;
-    } while (*address != count);
-
-    return (count & TAG_MASK) - 1;
-}
-
-template<typename T>
-__device__ __forceinline__ void __atomicAdd(T* address, T val)
-{
-    T count;
-    unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
-    do
-    {
-        count = *address & TAG_MASK;
-        count = tag | (count + val);
-        *address = count;
-    } while (*address != count);
-}
-
-template<typename T>
-__device__ __forceinline__ T __atomicMin(T* address, T val)
-{
-    T count = min(*address, val);
-    do
-    {
-        *address = count;
-    } while (*address > count);
-
-    return count;
-}
-
 struct Stage
 {
@@ -127,27 +86,25 @@ namespace lbp{
         unsigned tid = threadIdx.x;
         labels[tid] = tid;
         __syncthreads();

         for (unsigned int id = 0; id < n; id++)
         {
             if (tid != id && predicate(vec[tid], vec[id]))
             {
                 int p = labels[tid];
                 int q = labels[id];
-                if (p != q)
-                {
-                    int m = min(p, q);
-#if (__CUDA_ARCH__ < 120)
-                    __atomicMin(labels + id, m);
-#else
-                    atomicMin(labels + id, m);
-#endif
-                }
+                if (p < q)
+                {
+                    Emulation::smem::atomicMin(labels + id, p);
+                }
+                else if (p > q)
+                {
+                    Emulation::smem::atomicMin(labels + tid, q);
+                }
             }
         }
         __syncthreads();
     }

 } // lbp

 } } }// namespaces
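Finally, the disjoin change above replaces the #if'd atomicMin pair with two directed Emulation::smem::atomicMin calls that pull both candidates' labels toward the smaller one. The host-side C++ sketch below shows the same label-merging idea sequentially, with a made-up similarity relation standing in for the rectangle predicate; the GPU code instead runs one thread per candidate and relies on atomicMin for the concurrent updates.

// Sketch only: repeatedly take the minimum label of any similar neighbour until stable;
// each group of mutually-similar candidates converges to the smallest index it contains.
#include <algorithm>
#include <cstdio>

int main()
{
    const int n = 5;
    bool similar[n][n] = {};                    // hypothetical adjacency: {0,1,2} and {3,4}
    similar[0][1] = similar[1][0] = true;
    similar[1][2] = similar[2][1] = true;
    similar[3][4] = similar[4][3] = true;

    int labels[n];
    for (int i = 0; i < n; ++i) labels[i] = i;

    bool changed = true;
    while (changed)
    {
        changed = false;
        for (int i = 0; i < n; ++i)
            for (int j = 0; j < n; ++j)
                if (i != j && similar[i][j] && labels[i] != labels[j])
                {
                    int m = std::min(labels[i], labels[j]);
                    labels[i] = labels[j] = m;
                    changed = true;
                }
    }

    for (int i = 0; i < n; ++i)
        printf("candidate %d -> group %d\n", i, labels[i]);
    return 0;
}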