added delobal memory version
This commit is contained in:
parent
422f650b20
commit
965109228d
@ -298,23 +298,25 @@ namespace cv { namespace gpu { namespace device
|
|||||||
{
|
{
|
||||||
namespace lbp
|
namespace lbp
|
||||||
{
|
{
|
||||||
void classifyStump(const DevMem2Db& mstages,
|
// void classifyStump(const DevMem2Db& mstages,
|
||||||
const int nstages,
|
// const int nstages,
|
||||||
const DevMem2Di& mnodes,
|
// const DevMem2Di& mnodes,
|
||||||
const DevMem2Df& mleaves,
|
// const DevMem2Df& mleaves,
|
||||||
const DevMem2Di& msubsets,
|
// const DevMem2Di& msubsets,
|
||||||
const DevMem2Db& mfeatures,
|
// const DevMem2Db& mfeatures,
|
||||||
const int workWidth,
|
// const int workWidth,
|
||||||
const int workHeight,
|
// const int workHeight,
|
||||||
const int clWidth,
|
// const int clWidth,
|
||||||
const int clHeight,
|
// const int clHeight,
|
||||||
float scale,
|
// float scale,
|
||||||
int step,
|
// int step,
|
||||||
int subsetSize,
|
// int subsetSize,
|
||||||
DevMem2D_<int4> objects,
|
// DevMem2D_<int4> objects,
|
||||||
unsigned int* classified);
|
// unsigned int* classified);
|
||||||
|
|
||||||
void classifyStumpFixed(const DevMem2Db& mstages,
|
void classifyStumpFixed(const DevMem2Di& integral,
|
||||||
|
const int integralPitch,
|
||||||
|
const DevMem2Db& mstages,
|
||||||
const int nstages,
|
const int nstages,
|
||||||
const DevMem2Di& mnodes,
|
const DevMem2Di& mnodes,
|
||||||
const DevMem2Df& mleaves,
|
const DevMem2Df& mleaves,
|
||||||
@ -365,7 +367,7 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
|
|||||||
cudaMalloc(&dclassified, sizeof(int));
|
cudaMalloc(&dclassified, sizeof(int));
|
||||||
cudaMemcpy(dclassified, classified, sizeof(int), cudaMemcpyHostToDevice);
|
cudaMemcpy(dclassified, classified, sizeof(int), cudaMemcpyHostToDevice);
|
||||||
int step = 2;
|
int step = 2;
|
||||||
cv::gpu::device::lbp::bindIntegral(integral);
|
// cv::gpu::device::lbp::bindIntegral(integral);
|
||||||
|
|
||||||
cv::Size scaledImageSize(image.cols, image.rows);
|
cv::Size scaledImageSize(image.cols, image.rows);
|
||||||
cv::Size processingRectSize( scaledImageSize.width - NxM.width + 1, scaledImageSize.height - NxM.height + 1 );
|
cv::Size processingRectSize( scaledImageSize.width - NxM.width + 1, scaledImageSize.height - NxM.height + 1 );
|
||||||
@ -393,7 +395,7 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
|
|||||||
|
|
||||||
step = (factor <= 2.) + 1;
|
step = (factor <= 2.) + 1;
|
||||||
|
|
||||||
cv::gpu::device::lbp::classifyStumpFixed(stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat, leaves_mat, subsets_mat, features_mat,
|
cv::gpu::device::lbp::classifyStumpFixed(integral, integral.step1(), stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat, leaves_mat, subsets_mat, features_mat,
|
||||||
processingRectSize.width, processingRectSize.height, windowSize.width, windowSize.height, factor, step, subsetSize, candidates, dclassified);
|
processingRectSize.width, processingRectSize.height, windowSize.width, windowSize.height, factor, step, subsetSize, candidates, dclassified);
|
||||||
|
|
||||||
factor *= scaleFactor;
|
factor *= scaleFactor;
|
||||||
@ -402,7 +404,7 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
|
|||||||
processingRectSize = cv::Size(scaledImageSize.width - NxM.width + 1, scaledImageSize.height - NxM.height + 1 );
|
processingRectSize = cv::Size(scaledImageSize.width - NxM.width + 1, scaledImageSize.height - NxM.height + 1 );
|
||||||
}
|
}
|
||||||
|
|
||||||
cv::gpu::device::lbp::unbindIntegral();
|
// cv::gpu::device::lbp::unbindIntegral();
|
||||||
if (groupThreshold <= 0 || objects.empty())
|
if (groupThreshold <= 0 || objects.empty())
|
||||||
return 0;
|
return 0;
|
||||||
cudaMemcpy(classified, dclassified, sizeof(int), cudaMemcpyDeviceToHost);
|
cudaMemcpy(classified, dclassified, sizeof(int), cudaMemcpyDeviceToHost);
|
||||||
|
@ -56,6 +56,80 @@ namespace cv { namespace gpu { namespace device
|
|||||||
__host__ __device__ __forceinline__ LBP(const LBP& other) {(void)other;}
|
__host__ __device__ __forceinline__ LBP(const LBP& other) {(void)other;}
|
||||||
__host__ __device__ __forceinline__ LBP() {}
|
__host__ __device__ __forceinline__ LBP() {}
|
||||||
|
|
||||||
|
// for integral matrix stored in the global memory
|
||||||
|
__device__ __forceinline__ int operator() (const int* integral, const int pitch, int ty, int tx, int fh, int fw, int& shift) const
|
||||||
|
{
|
||||||
|
int anchors[9];
|
||||||
|
|
||||||
|
anchors[0] = integral[ty * pitch + tx];
|
||||||
|
anchors[1] = integral[ty * pitch + tx + fw];
|
||||||
|
anchors[0] -= anchors[1];
|
||||||
|
anchors[2] = integral[ty * pitch + tx + fw * 2];
|
||||||
|
anchors[1] -= anchors[2];
|
||||||
|
anchors[2] -= integral[ty * pitch + tx + fw * 3];
|
||||||
|
|
||||||
|
ty += fh;
|
||||||
|
anchors[3] = integral[ty * pitch + tx];
|
||||||
|
anchors[4] = integral[ty * pitch + tx + fw];
|
||||||
|
anchors[3] -= anchors[4];
|
||||||
|
anchors[5] = integral[ty * pitch + tx + fw * 2];
|
||||||
|
anchors[4] -= anchors[5];
|
||||||
|
anchors[5] -= integral[ty * pitch + tx + fw * 3];
|
||||||
|
|
||||||
|
anchors[0] -= anchors[3];
|
||||||
|
anchors[1] -= anchors[4];
|
||||||
|
anchors[2] -= anchors[5];
|
||||||
|
// 0 - 2 contains s0 - s2
|
||||||
|
|
||||||
|
ty += fh;
|
||||||
|
anchors[6] = integral[ty * pitch + tx];
|
||||||
|
anchors[7] = integral[ty * pitch + tx + fw];
|
||||||
|
anchors[6] -= anchors[7];
|
||||||
|
anchors[8] = integral[ty * pitch + tx + fw * 2];
|
||||||
|
anchors[7] -= anchors[8];
|
||||||
|
anchors[8] -= integral[ty * pitch + tx + fw * 3];
|
||||||
|
|
||||||
|
anchors[3] -= anchors[6];
|
||||||
|
anchors[4] -= anchors[7];
|
||||||
|
anchors[5] -= anchors[8];
|
||||||
|
// 3 - 5 contains s3 - s5
|
||||||
|
|
||||||
|
anchors[0] -= anchors[4];
|
||||||
|
anchors[1] -= anchors[4];
|
||||||
|
anchors[2] -= anchors[4];
|
||||||
|
anchors[3] -= anchors[4];
|
||||||
|
anchors[5] -= anchors[4];
|
||||||
|
|
||||||
|
int response = (~(anchors[0] >> 31)) & 4;
|
||||||
|
response |= (~(anchors[1] >> 31)) & 2;;
|
||||||
|
response |= (~(anchors[2] >> 31)) & 1;
|
||||||
|
|
||||||
|
shift = (~(anchors[5] >> 31)) & 16;
|
||||||
|
shift |= (~(anchors[3] >> 31)) & 1;
|
||||||
|
|
||||||
|
ty += fh;
|
||||||
|
anchors[0] = integral[ty * pitch + tx];
|
||||||
|
anchors[1] = integral[ty * pitch + tx + fw];
|
||||||
|
anchors[0] -= anchors[1];
|
||||||
|
anchors[2] = integral[ty * pitch + tx + fw * 2];
|
||||||
|
anchors[1] -= anchors[2];
|
||||||
|
anchors[2] -= integral[ty * pitch + tx + fw * 3];
|
||||||
|
|
||||||
|
anchors[6] -= anchors[0];
|
||||||
|
anchors[7] -= anchors[1];
|
||||||
|
anchors[8] -= anchors[2];
|
||||||
|
// 0 -2 contains s6 - s8
|
||||||
|
|
||||||
|
anchors[6] -= anchors[4];
|
||||||
|
anchors[7] -= anchors[4];
|
||||||
|
anchors[8] -= anchors[4];
|
||||||
|
|
||||||
|
shift |= (~(anchors[6] >> 31)) & 2;
|
||||||
|
shift |= (~(anchors[7] >> 31)) & 4;
|
||||||
|
shift |= (~(anchors[8] >> 31)) & 8;
|
||||||
|
return response;
|
||||||
|
}
|
||||||
|
// for texture fetchrd integral matrix
|
||||||
__device__ __forceinline__ int operator() (int ty, int tx, int fh, int fw, int& shift) const
|
__device__ __forceinline__ int operator() (int ty, int tx, int fh, int fw, int& shift) const
|
||||||
{
|
{
|
||||||
int anchors[9];
|
int anchors[9];
|
||||||
@ -143,9 +217,9 @@ namespace cv { namespace gpu { namespace device
|
|||||||
|
|
||||||
struct Classifier
|
struct Classifier
|
||||||
{
|
{
|
||||||
__host__ __device__ __forceinline__ Classifier(const Stage* _stages, const ClNode* _nodes, const float* _leaves, const int* _subsets, const uchar4* _features,
|
__host__ __device__ __forceinline__ Classifier(const int* _integral, const int _pitch, const Stage* _stages, const ClNode* _nodes, const float* _leaves, const int* _subsets, const uchar4* _features,
|
||||||
const int _nstages, const int _clWidth, const int _clHeight, const float _scale, const int _step, const int _subsetSize)
|
const int _nstages, const int _clWidth, const int _clHeight, const float _scale, const int _step, const int _subsetSize)
|
||||||
: stages(_stages), nodes(_nodes), leaves(_leaves), subsets(_subsets), features(_features), nstages(_nstages), clWidth(_clWidth), clHeight(_clHeight),
|
: integral(_integral), pitch(_pitch), stages(_stages), nodes(_nodes), leaves(_leaves), subsets(_subsets), features(_features), nstages(_nstages), clWidth(_clWidth), clHeight(_clHeight),
|
||||||
scale(_scale), step(_step), subsetSize(_subsetSize){}
|
scale(_scale), step(_step), subsetSize(_subsetSize){}
|
||||||
|
|
||||||
__device__ __forceinline__ void operator() (int y, int x, DevMem2D_<int4> objects, const unsigned int maxN, unsigned int* n) const
|
__device__ __forceinline__ void operator() (int y, int x, DevMem2D_<int4> objects, const unsigned int maxN, unsigned int* n) const
|
||||||
@ -163,7 +237,8 @@ namespace cv { namespace gpu { namespace device
|
|||||||
uchar4 feature = features[node.featureIdx];
|
uchar4 feature = features[node.featureIdx];
|
||||||
|
|
||||||
int shift;
|
int shift;
|
||||||
int c = evaluator(y + feature.y, x + feature.x, feature.w, feature.z, shift);
|
// int c = evaluator(y + feature.y, x + feature.x, feature.w, feature.z, shift);
|
||||||
|
int c = evaluator(integral, pitch, (y + feature.y), x + feature.x, feature.w, feature.z, shift);
|
||||||
int idx = (subsets[ current_node * subsetSize + c] & ( 1 << shift)) ? current_leave : current_leave + 1;
|
int idx = (subsets[ current_node * subsetSize + c] & ( 1 << shift)) ? current_leave : current_leave + 1;
|
||||||
sum += leaves[idx];
|
sum += leaves[idx];
|
||||||
|
|
||||||
@ -189,6 +264,9 @@ namespace cv { namespace gpu { namespace device
|
|||||||
objects(0, res) = rect;
|
objects(0, res) = rect;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const int* integral;
|
||||||
|
const int pitch;
|
||||||
|
|
||||||
const Stage* stages;
|
const Stage* stages;
|
||||||
const ClNode* nodes;
|
const ClNode* nodes;
|
||||||
const float* leaves;
|
const float* leaves;
|
||||||
@ -292,24 +370,24 @@ namespace cv { namespace gpu { namespace device
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void classifyStump(const DevMem2Db& mstages, const int nstages, const DevMem2Di& mnodes, const DevMem2Df& mleaves, const DevMem2Di& msubsets, const DevMem2Db& mfeatures,
|
// void classifyStump(const DevMem2Db& mstages, const int nstages, const DevMem2Di& mnodes, const DevMem2Df& mleaves, const DevMem2Di& msubsets, const DevMem2Db& mfeatures,
|
||||||
const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize, DevMem2D_<int4> objects, unsigned int* classified)
|
// const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize, DevMem2D_<int4> objects, unsigned int* classified)
|
||||||
{
|
// {
|
||||||
int blocks = ceilf(workHeight / (float)step);
|
// int blocks = ceilf(workHeight / (float)step);
|
||||||
int threads = ceilf(workWidth / (float)step);
|
// int threads = ceilf(workWidth / (float)step);
|
||||||
|
|
||||||
Classifier clr((Stage*)(mstages.ptr()), (ClNode*)(mnodes.ptr()), mleaves.ptr(), msubsets.ptr(), (uchar4*)(mfeatures.ptr()), nstages, clWidth, clHeight, scale, step, subsetSize);
|
// Classifier clr((Stage*)(mstages.ptr()), (ClNode*)(mnodes.ptr()), mleaves.ptr(), msubsets.ptr(), (uchar4*)(mfeatures.ptr()), nstages, clWidth, clHeight, scale, step, subsetSize);
|
||||||
lbp_classify_stump<<<blocks, threads>>>(clr, objects, objects.cols, classified);
|
// lbp_classify_stump<<<blocks, threads>>>(clr, objects, objects.cols, classified);
|
||||||
}
|
// }
|
||||||
|
|
||||||
void classifyStumpFixed(const DevMem2Db& mstages, const int nstages, const DevMem2Di& mnodes, const DevMem2Df& mleaves, const DevMem2Di& msubsets, const DevMem2Db& mfeatures,
|
void classifyStumpFixed(const DevMem2Di& integral, const int pitch, const DevMem2Db& mstages, const int nstages, const DevMem2Di& mnodes, const DevMem2Df& mleaves, const DevMem2Di& msubsets, const DevMem2Db& mfeatures,
|
||||||
const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize, DevMem2D_<int4> objects, unsigned int* classified)
|
const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize, DevMem2D_<int4> objects, unsigned int* classified)
|
||||||
{
|
{
|
||||||
const int THREADS_BLOCK = 256;
|
const int THREADS_BLOCK = 256;
|
||||||
int work_amount = ceilf(workHeight / (float)step) * ceilf(workWidth / (float)step);
|
int work_amount = ceilf(workHeight / (float)step) * ceilf(workWidth / (float)step);
|
||||||
int blocks = divUp(work_amount, THREADS_BLOCK);
|
int blocks = divUp(work_amount, THREADS_BLOCK);
|
||||||
|
|
||||||
Classifier clr((Stage*)(mstages.ptr()), (ClNode*)(mnodes.ptr()), mleaves.ptr(), msubsets.ptr(), (uchar4*)(mfeatures.ptr()), nstages, clWidth, clHeight, scale, step, subsetSize);
|
Classifier clr(integral.ptr(), pitch, (Stage*)(mstages.ptr()), (ClNode*)(mnodes.ptr()), mleaves.ptr(), msubsets.ptr(), (uchar4*)(mfeatures.ptr()), nstages, clWidth, clHeight, scale, step, subsetSize);
|
||||||
lbp_classify_stump<<<blocks, THREADS_BLOCK>>>(clr, objects, objects.cols, classified, workWidth >> 1);
|
lbp_classify_stump<<<blocks, THREADS_BLOCK>>>(clr, objects, objects.cols, classified, workWidth >> 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user