Add OclCascadeClassifierBuf interface
This commit is contained in:
parent
abe2ea59ed
commit
69a0b5dde5
@ -802,6 +802,44 @@ namespace cv
|
||||
int minNeighbors, int flags, CvSize minSize = cvSize(0, 0), CvSize maxSize = cvSize(0, 0));
|
||||
};
|
||||
|
||||
class CV_EXPORTS OclCascadeClassifierBuf : public cv::CascadeClassifier
|
||||
{
|
||||
public:
|
||||
OclCascadeClassifierBuf() :
|
||||
m_flags(0), initialized(false), m_scaleFactor(0), buffers(NULL) {}
|
||||
|
||||
~OclCascadeClassifierBuf() {}
|
||||
|
||||
void detectMultiScale(oclMat &image, CV_OUT std::vector<cv::Rect>& faces,
|
||||
double scaleFactor = 1.1, int minNeighbors = 3, int flags = 0,
|
||||
Size minSize = Size(), Size maxSize = Size());
|
||||
void release();
|
||||
|
||||
private:
|
||||
void Init(const int rows, const int cols, double scaleFactor, int flags,
|
||||
const int outputsz, const size_t localThreads[],
|
||||
CvSize minSize, CvSize maxSize);
|
||||
void CreateBaseBufs(const int datasize, const int totalclassifier, const int flags, const int outputsz);
|
||||
void CreateFactorRelatedBufs(const int rows, const int cols, const int flags,
|
||||
const double scaleFactor, const size_t localThreads[],
|
||||
CvSize minSize, CvSize maxSize);
|
||||
void GenResult(CV_OUT std::vector<cv::Rect>& faces, const std::vector<cv::Rect> &rectList, const std::vector<int> &rweights);
|
||||
|
||||
int m_rows;
|
||||
int m_cols;
|
||||
int m_flags;
|
||||
int m_loopcount;
|
||||
int m_nodenum;
|
||||
bool findBiggestObject;
|
||||
bool initialized;
|
||||
double m_scaleFactor;
|
||||
Size m_minSize;
|
||||
Size m_maxSize;
|
||||
vector<CvSize> sizev;
|
||||
vector<float> scalev;
|
||||
oclMat gimg1, gsum, gsqsum;
|
||||
void * buffers;
|
||||
};
|
||||
|
||||
|
||||
/////////////////////////////// Pyramid /////////////////////////////////////
|
||||
|
@ -20,6 +20,7 @@
|
||||
// Jia Haipeng, jiahaipeng95@gmail.com
|
||||
// Wu Xinglong, wxl370@126.com
|
||||
// Wang Yao, bitwangyaoyao@gmail.com
|
||||
// Sen Liu, swjtuls1987@126.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
@ -842,15 +843,13 @@ static void gpuSetHaarClassifierCascade( CvHaarClassifierCascade *_cascade
|
||||
} /* j */
|
||||
}
|
||||
}
|
||||
|
||||
CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemStorage *storage, double scaleFactor,
|
||||
int minNeighbors, int flags, CvSize minSize, CvSize maxSize)
|
||||
{
|
||||
CvHaarClassifierCascade *cascade = oldCascade;
|
||||
|
||||
//double alltime = (double)cvGetTickCount();
|
||||
//double t = (double)cvGetTickCount();
|
||||
const double GROUP_EPS = 0.2;
|
||||
oclMat gtemp, gsum1, gtilted1, gsqsum1, gnormImg, gsumcanny;
|
||||
CvSeq *result_seq = 0;
|
||||
cv::Ptr<CvMemStorage> temp_storage;
|
||||
|
||||
@ -861,7 +860,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
|
||||
int datasize=0;
|
||||
int totalclassifier=0;
|
||||
|
||||
//void *out;
|
||||
GpuHidHaarClassifierCascade *gcascade;
|
||||
GpuHidHaarStageClassifier *stage;
|
||||
GpuHidHaarClassifier *classifier;
|
||||
@ -870,11 +868,8 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
|
||||
int *candidate;
|
||||
cl_int status;
|
||||
|
||||
// bool doCannyPruning = (flags & CV_HAAR_DO_CANNY_PRUNING) != 0;
|
||||
bool findBiggestObject = (flags & CV_HAAR_FIND_BIGGEST_OBJECT) != 0;
|
||||
// bool roughSearch = (flags & CV_HAAR_DO_ROUGH_SEARCH) != 0;
|
||||
|
||||
//double t = 0;
|
||||
if( maxSize.height == 0 || maxSize.width == 0 )
|
||||
{
|
||||
maxSize.height = gimg.rows;
|
||||
@ -896,27 +891,20 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
|
||||
if( findBiggestObject )
|
||||
flags &= ~CV_HAAR_SCALE_IMAGE;
|
||||
|
||||
//gtemp = oclMat( gimg.rows, gimg.cols, CV_8UC1);
|
||||
//gsum1 = oclMat( gimg.rows + 1, gimg.cols + 1, CV_32SC1 );
|
||||
//gsqsum1 = oclMat( gimg.rows + 1, gimg.cols + 1, CV_32FC1 );
|
||||
|
||||
if( !cascade->hid_cascade )
|
||||
/*out = (void *)*/gpuCreateHidHaarClassifierCascade(cascade, &datasize, &totalclassifier);
|
||||
if( cascade->hid_cascade->has_tilted_features )
|
||||
gtilted1 = oclMat( gimg.rows + 1, gimg.cols + 1, CV_32SC1 );
|
||||
gpuCreateHidHaarClassifierCascade(cascade, &datasize, &totalclassifier);
|
||||
|
||||
result_seq = cvCreateSeq( 0, sizeof(CvSeq), sizeof(CvAvgComp), storage );
|
||||
|
||||
if( CV_MAT_CN(gimg.type()) > 1 )
|
||||
{
|
||||
oclMat gtemp;
|
||||
cvtColor( gimg, gtemp, CV_BGR2GRAY );
|
||||
gimg = gtemp;
|
||||
}
|
||||
|
||||
if( findBiggestObject )
|
||||
flags &= ~(CV_HAAR_SCALE_IMAGE | CV_HAAR_DO_CANNY_PRUNING);
|
||||
//t = (double)cvGetTickCount() - t;
|
||||
//printf( "before if time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) );
|
||||
|
||||
if( gimg.cols < minSize.width || gimg.rows < minSize.height )
|
||||
CV_Error(CV_StsError, "Image too small");
|
||||
@ -924,12 +912,9 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
|
||||
if( (flags & CV_HAAR_SCALE_IMAGE) )
|
||||
{
|
||||
CvSize winSize0 = cascade->orig_window_size;
|
||||
//float scalefactor = 1.1f;
|
||||
//float factor = 1.f;
|
||||
int totalheight = 0;
|
||||
int indexy = 0;
|
||||
CvSize sz;
|
||||
//t = (double)cvGetTickCount();
|
||||
vector<CvSize> sizev;
|
||||
vector<float> scalev;
|
||||
for(factor = 1.f;; factor *= scaleFactor)
|
||||
@ -950,20 +935,15 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
|
||||
sizev.push_back(sz);
|
||||
scalev.push_back(factor);
|
||||
}
|
||||
//int flag = 0;
|
||||
|
||||
oclMat gimg1(gimg.rows, gimg.cols, CV_8UC1);
|
||||
oclMat gsum(totalheight + 4, gimg.cols + 1, CV_32SC1);
|
||||
oclMat gsqsum(totalheight + 4, gimg.cols + 1, CV_32FC1);
|
||||
|
||||
//cl_mem cascadebuffer;
|
||||
cl_mem stagebuffer;
|
||||
//cl_mem classifierbuffer;
|
||||
cl_mem nodebuffer;
|
||||
cl_mem candidatebuffer;
|
||||
cl_mem scaleinfobuffer;
|
||||
//cl_kernel kernel;
|
||||
//kernel = openCLGetKernelFromSource(gimg.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascade");
|
||||
cv::Rect roi, roi2;
|
||||
cv::Mat imgroi, imgroisq;
|
||||
cv::ocl::oclMat resizeroi, gimgroi, gimgroisq;
|
||||
@ -971,18 +951,13 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
|
||||
|
||||
size_t blocksize = 8;
|
||||
size_t localThreads[3] = { blocksize, blocksize , 1 };
|
||||
size_t globalThreads[3] = { grp_per_CU *((gsum.clCxt)->computeUnits()) *localThreads[0],
|
||||
size_t globalThreads[3] = { grp_per_CU * gsum.clCxt->computeUnits() *localThreads[0],
|
||||
localThreads[1], 1
|
||||
};
|
||||
int outputsz = 256 * globalThreads[0] / localThreads[0];
|
||||
int loopcount = sizev.size();
|
||||
detect_piramid_info *scaleinfo = (detect_piramid_info *)malloc(sizeof(detect_piramid_info) * loopcount);
|
||||
|
||||
//t = (double)cvGetTickCount() - t;
|
||||
// printf( "pre time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) );
|
||||
//int *it =scaleinfo;
|
||||
// t = (double)cvGetTickCount();
|
||||
|
||||
for( int i = 0; i < loopcount; i++ )
|
||||
{
|
||||
sz = sizev[i];
|
||||
@ -992,7 +967,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
|
||||
resizeroi = gimg1(roi2);
|
||||
gimgroi = gsum(roi);
|
||||
gimgroisq = gsqsum(roi);
|
||||
//scaleinfo[i].rows = gimgroi.rows;
|
||||
int width = gimgroi.cols - 1 - cascade->orig_window_size.width;
|
||||
int height = gimgroi.rows - 1 - cascade->orig_window_size.height;
|
||||
scaleinfo[i].width_height = (width << 16) | height;
|
||||
@ -1000,76 +974,40 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
|
||||
|
||||
int grpnumperline = (width + localThreads[0] - 1) / localThreads[0];
|
||||
int totalgrp = ((height + localThreads[1] - 1) / localThreads[1]) * grpnumperline;
|
||||
//outputsz +=width*height;
|
||||
|
||||
scaleinfo[i].grpnumperline_totalgrp = (grpnumperline << 16) | totalgrp;
|
||||
scaleinfo[i].imgoff = gimgroi.offset >> 2;
|
||||
scaleinfo[i].factor = factor;
|
||||
//printf("rows = %d,ystep = %d,width = %d,height = %d,grpnumperline = %d,totalgrp = %d,imgoff = %d,factor = %f\n",
|
||||
// scaleinfo[i].rows,scaleinfo[i].ystep,scaleinfo[i].width,scaleinfo[i].height,scaleinfo[i].grpnumperline,
|
||||
// scaleinfo[i].totalgrp,scaleinfo[i].imgoff,scaleinfo[i].factor);
|
||||
cv::ocl::resize(gimg, resizeroi, Size(sz.width - 1, sz.height - 1), 0, 0, INTER_LINEAR);
|
||||
//cv::imwrite("D:\\1.jpg",gimg1);
|
||||
cv::ocl::integral(resizeroi, gimgroi, gimgroisq);
|
||||
//cv::ocl::oclMat chk(sz.height,sz.width,CV_32SC1),chksq(sz.height,sz.width,CV_32FC1);
|
||||
//cv::ocl::integral(gimg1, chk, chksq);
|
||||
//double r = cv::norm(chk,gimgroi,NORM_INF);
|
||||
//if(r > std::numeric_limits<double>::epsilon())
|
||||
//{
|
||||
// printf("failed");
|
||||
//}
|
||||
indexy += sz.height;
|
||||
}
|
||||
//int ystep = factor > 2 ? 1 : 2;
|
||||
// t = (double)cvGetTickCount() - t;
|
||||
//printf( "resize integral time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) );
|
||||
//t = (double)cvGetTickCount();
|
||||
|
||||
gcascade = (GpuHidHaarClassifierCascade *)cascade->hid_cascade;
|
||||
stage = (GpuHidHaarStageClassifier *)(gcascade + 1);
|
||||
classifier = (GpuHidHaarClassifier *)(stage + gcascade->count);
|
||||
node = (GpuHidHaarTreeNode *)(classifier->node);
|
||||
|
||||
//int m,n;
|
||||
//m = (gsum.cols - 1 - cascade->orig_window_size.width + ystep - 1)/ystep;
|
||||
//n = (gsum.rows - 1 - cascade->orig_window_size.height + ystep - 1)/ystep;
|
||||
//int counter = m*n;
|
||||
|
||||
int nodenum = (datasize - sizeof(GpuHidHaarClassifierCascade) -
|
||||
sizeof(GpuHidHaarStageClassifier) * gcascade->count - sizeof(GpuHidHaarClassifier) * totalclassifier) / sizeof(GpuHidHaarTreeNode);
|
||||
//if(flag == 0){
|
||||
candidate = (int *)malloc(4 * sizeof(int) * outputsz);
|
||||
//memset((char*)candidate,0,4*sizeof(int)*outputsz);
|
||||
gpuSetImagesForHaarClassifierCascade( cascade,/* &sum1, &sqsum1, _tilted,*/ 1., gsum.step / 4 );
|
||||
|
||||
//cascadebuffer = clCreateBuffer(gsum.clCxt->clContext,CL_MEM_READ_ONLY,sizeof(GpuHidHaarClassifierCascade),NULL,&status);
|
||||
//openCLVerifyCall(status);
|
||||
//openCLSafeCall(clEnqueueWriteBuffer(gsum.clCxt->clCmdQueue,cascadebuffer,1,0,sizeof(GpuHidHaarClassifierCascade),gcascade,0,NULL,NULL));
|
||||
candidate = (int *)malloc(4 * sizeof(int) * outputsz);
|
||||
|
||||
gpuSetImagesForHaarClassifierCascade( cascade, 1., gsum.step / 4 );
|
||||
|
||||
stagebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(GpuHidHaarStageClassifier) * gcascade->count);
|
||||
//openCLVerifyCall(status);
|
||||
openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), stagebuffer, 1, 0, sizeof(GpuHidHaarStageClassifier)*gcascade->count, stage, 0, NULL, NULL));
|
||||
|
||||
//classifierbuffer = clCreateBuffer(gsum.clCxt->clContext,CL_MEM_READ_ONLY,sizeof(GpuHidHaarClassifier)*totalclassifier,NULL,&status);
|
||||
//status = clEnqueueWriteBuffer(gsum.clCxt->clCmdQueue,classifierbuffer,1,0,sizeof(GpuHidHaarClassifier)*totalclassifier,classifier,0,NULL,NULL);
|
||||
cl_command_queue qu = (cl_command_queue)gsum.clCxt->oclCommandQueue();
|
||||
openCLSafeCall(clEnqueueWriteBuffer(qu, stagebuffer, 1, 0, sizeof(GpuHidHaarStageClassifier)*gcascade->count, stage, 0, NULL, NULL));
|
||||
|
||||
nodebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, nodenum * sizeof(GpuHidHaarTreeNode));
|
||||
//openCLVerifyCall(status);
|
||||
openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), nodebuffer, 1, 0,
|
||||
nodenum * sizeof(GpuHidHaarTreeNode),
|
||||
|
||||
openCLSafeCall(clEnqueueWriteBuffer(qu, nodebuffer, 1, 0, nodenum * sizeof(GpuHidHaarTreeNode),
|
||||
node, 0, NULL, NULL));
|
||||
candidatebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_WRITE_ONLY, 4 * sizeof(int) * outputsz);
|
||||
//openCLVerifyCall(status);
|
||||
|
||||
scaleinfobuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(detect_piramid_info) * loopcount);
|
||||
//openCLVerifyCall(status);
|
||||
openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), scaleinfobuffer, 1, 0, sizeof(detect_piramid_info)*loopcount, scaleinfo, 0, NULL, NULL));
|
||||
//flag = 1;
|
||||
//}
|
||||
openCLSafeCall(clEnqueueWriteBuffer(qu, scaleinfobuffer, 1, 0, sizeof(detect_piramid_info)*loopcount, scaleinfo, 0, NULL, NULL));
|
||||
|
||||
//t = (double)cvGetTickCount() - t;
|
||||
//printf( "update time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) );
|
||||
|
||||
//size_t globalThreads[3] = { counter+blocksize*blocksize-counter%(blocksize*blocksize),1,1};
|
||||
//t = (double)cvGetTickCount();
|
||||
int startstage = 0;
|
||||
int endstage = gcascade->count;
|
||||
int startnode = 0;
|
||||
@ -1087,11 +1025,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
|
||||
pq.s[3] = gcascade->pq3;
|
||||
float correction = gcascade->inv_window_area;
|
||||
|
||||
//int grpnumperline = ((m + localThreads[0] - 1) / localThreads[0]);
|
||||
//int totalgrp = ((n + localThreads[1] - 1) / localThreads[1])*grpnumperline;
|
||||
// openCLVerifyKernel(gsum.clCxt, kernel, &blocksize, globalThreads, localThreads);
|
||||
//openCLSafeCall(clSetKernelArg(kernel,argcount++,sizeof(cl_mem),(void*)&cascadebuffer));
|
||||
|
||||
vector<pair<size_t, const void *> > args;
|
||||
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&stagebuffer ));
|
||||
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&scaleinfobuffer ));
|
||||
@ -1111,28 +1044,20 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
|
||||
args.push_back ( make_pair(sizeof(cl_float) , (void *)&correction ));
|
||||
|
||||
openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascade", globalThreads, localThreads, args, -1, -1);
|
||||
//t = (double)cvGetTickCount() - t;
|
||||
//printf( "detection time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) );
|
||||
//t = (double)cvGetTickCount();
|
||||
//openCLSafeCall(clEnqueueReadBuffer(gsum.clCxt->impl->clCmdQueue, candidatebuffer, 1, 0, 4 * sizeof(int)*outputsz, candidate, 0, NULL, NULL));
|
||||
|
||||
openCLReadBuffer( gsum.clCxt, candidatebuffer, candidate, 4 * sizeof(int)*outputsz );
|
||||
|
||||
for(int i = 0; i < outputsz; i++)
|
||||
if(candidate[4 * i + 2] != 0)
|
||||
allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1], candidate[4 * i + 2], candidate[4 * i + 3]));
|
||||
// t = (double)cvGetTickCount() - t;
|
||||
//printf( "post time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) );
|
||||
//t = (double)cvGetTickCount();
|
||||
allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1],
|
||||
candidate[4 * i + 2], candidate[4 * i + 3]));
|
||||
|
||||
free(scaleinfo);
|
||||
free(candidate);
|
||||
//openCLSafeCall(clReleaseMemObject(cascadebuffer));
|
||||
openCLSafeCall(clReleaseMemObject(stagebuffer));
|
||||
openCLSafeCall(clReleaseMemObject(scaleinfobuffer));
|
||||
openCLSafeCall(clReleaseMemObject(nodebuffer));
|
||||
openCLSafeCall(clReleaseMemObject(candidatebuffer));
|
||||
// openCLSafeCall(clReleaseKernel(kernel));
|
||||
//t = (double)cvGetTickCount() - t;
|
||||
//printf( "release time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) );
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -1150,7 +1075,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
|
||||
classifier = (GpuHidHaarClassifier *)(stage + gcascade->count);
|
||||
node = (GpuHidHaarTreeNode *)(classifier->node);
|
||||
cl_mem stagebuffer;
|
||||
//cl_mem classifierbuffer;
|
||||
cl_mem nodebuffer;
|
||||
cl_mem candidatebuffer;
|
||||
cl_mem scaleinfobuffer;
|
||||
@ -1187,24 +1111,20 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
|
||||
size_t blocksize = 8;
|
||||
size_t localThreads[3] = { blocksize, blocksize , 1 };
|
||||
size_t globalThreads[3] = { grp_per_CU *gsum.clCxt->computeUnits() *localThreads[0],
|
||||
localThreads[1], 1
|
||||
};
|
||||
localThreads[1], 1 };
|
||||
int outputsz = 256 * globalThreads[0] / localThreads[0];
|
||||
int nodenum = (datasize - sizeof(GpuHidHaarClassifierCascade) -
|
||||
sizeof(GpuHidHaarStageClassifier) * gcascade->count - sizeof(GpuHidHaarClassifier) * totalclassifier) / sizeof(GpuHidHaarTreeNode);
|
||||
nodebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY,
|
||||
nodenum * sizeof(GpuHidHaarTreeNode));
|
||||
//openCLVerifyCall(status);
|
||||
openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), nodebuffer, 1, 0,
|
||||
cl_command_queue qu = (cl_command_queue)gsum.clCxt->oclCommandQueue();
|
||||
openCLSafeCall(clEnqueueWriteBuffer(qu, nodebuffer, 1, 0,
|
||||
nodenum * sizeof(GpuHidHaarTreeNode),
|
||||
node, 0, NULL, NULL));
|
||||
cl_mem newnodebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_WRITE,
|
||||
loopcount * nodenum * sizeof(GpuHidHaarTreeNode));
|
||||
int startstage = 0;
|
||||
int endstage = gcascade->count;
|
||||
//cl_kernel kernel;
|
||||
//kernel = openCLGetKernelFromSource(gsum.clCxt, &haarobjectdetect_scaled2, "gpuRunHaarClassifierCascade_scaled2");
|
||||
//cl_kernel kernel2 = openCLGetKernelFromSource(gimg.clCxt, &haarobjectdetect_scaled2, "gpuscaleclassifier");
|
||||
for(int i = 0; i < loopcount; i++)
|
||||
{
|
||||
sz = sizev[i];
|
||||
@ -1223,7 +1143,7 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
|
||||
int height = (gsum.rows - 1 - sz.height + ystep - 1) / ystep;
|
||||
int grpnumperline = (width + localThreads[0] - 1) / localThreads[0];
|
||||
int totalgrp = ((height + localThreads[1] - 1) / localThreads[1]) * grpnumperline;
|
||||
//outputsz +=width*height;
|
||||
|
||||
scaleinfo[i].width_height = (width << 16) | height;
|
||||
scaleinfo[i].grpnumperline_totalgrp = (grpnumperline << 16) | totalgrp;
|
||||
scaleinfo[i].imgoff = 0;
|
||||
@ -1241,28 +1161,20 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
|
||||
size_t globalThreads2[3] = {nodenum, 1, 1};
|
||||
|
||||
openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuscaleclassifier", globalThreads2, NULL/*localThreads2*/, args1, -1, -1);
|
||||
|
||||
//clEnqueueNDRangeKernel(gsum.clCxt->impl->clCmdQueue, kernel2, 1, NULL, globalThreads2, 0, 0, NULL, NULL);
|
||||
//clFinish(gsum.clCxt->impl->clCmdQueue);
|
||||
}
|
||||
//clReleaseKernel(kernel2);
|
||||
|
||||
int step = gsum.step / 4;
|
||||
int startnode = 0;
|
||||
int splitstage = 3;
|
||||
int splitnode = stage[0].count + stage[1].count + stage[2].count;
|
||||
stagebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(GpuHidHaarStageClassifier) * gcascade->count);
|
||||
//openCLVerifyCall(status);
|
||||
openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), stagebuffer, 1, 0, sizeof(GpuHidHaarStageClassifier)*gcascade->count, stage, 0, NULL, NULL));
|
||||
openCLSafeCall(clEnqueueWriteBuffer(qu, stagebuffer, 1, 0, sizeof(GpuHidHaarStageClassifier)*gcascade->count, stage, 0, NULL, NULL));
|
||||
candidatebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, 4 * sizeof(int) * outputsz);
|
||||
//openCLVerifyCall(status);
|
||||
scaleinfobuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(detect_piramid_info) * loopcount);
|
||||
//openCLVerifyCall(status);
|
||||
openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), scaleinfobuffer, 1, 0, sizeof(detect_piramid_info)*loopcount, scaleinfo, 0, NULL, NULL));
|
||||
openCLSafeCall(clEnqueueWriteBuffer(qu, scaleinfobuffer, 1, 0, sizeof(detect_piramid_info)*loopcount, scaleinfo, 0, NULL, NULL));
|
||||
pbuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(cl_int4) * loopcount);
|
||||
openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), pbuffer, 1, 0, sizeof(cl_int4)*loopcount, p, 0, NULL, NULL));
|
||||
openCLSafeCall(clEnqueueWriteBuffer(qu, pbuffer, 1, 0, sizeof(cl_int4)*loopcount, p, 0, NULL, NULL));
|
||||
correctionbuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(cl_float) * loopcount);
|
||||
openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), correctionbuffer, 1, 0, sizeof(cl_float)*loopcount, correction, 0, NULL, NULL));
|
||||
//int argcount = 0;
|
||||
openCLSafeCall(clEnqueueWriteBuffer(qu, correctionbuffer, 1, 0, sizeof(cl_float)*loopcount, correction, 0, NULL, NULL));
|
||||
|
||||
vector<pair<size_t, const void *> > args;
|
||||
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&stagebuffer ));
|
||||
@ -1271,22 +1183,21 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
|
||||
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&gsum.data ));
|
||||
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&gsqsum.data ));
|
||||
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&candidatebuffer ));
|
||||
args.push_back ( make_pair(sizeof(cl_int) , (void *)&gsum.rows ));
|
||||
args.push_back ( make_pair(sizeof(cl_int) , (void *)&gsum.cols ));
|
||||
args.push_back ( make_pair(sizeof(cl_int) , (void *)&step ));
|
||||
args.push_back ( make_pair(sizeof(cl_int) , (void *)&loopcount ));
|
||||
args.push_back ( make_pair(sizeof(cl_int) , (void *)&startstage ));
|
||||
args.push_back ( make_pair(sizeof(cl_int) , (void *)&splitstage ));
|
||||
args.push_back ( make_pair(sizeof(cl_int) , (void *)&endstage ));
|
||||
args.push_back ( make_pair(sizeof(cl_int) , (void *)&startnode ));
|
||||
args.push_back ( make_pair(sizeof(cl_int) , (void *)&splitnode ));
|
||||
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&pbuffer ));
|
||||
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&correctionbuffer ));
|
||||
args.push_back ( make_pair(sizeof(cl_int) , (void *)&nodenum ));
|
||||
|
||||
|
||||
openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuRunHaarClassifierCascade_scaled2", globalThreads, localThreads, args, -1, -1);
|
||||
|
||||
//openCLSafeCall(clEnqueueReadBuffer(gsum.clCxt->clCmdQueue,candidatebuffer,1,0,4*sizeof(int)*outputsz,candidate,0,NULL,NULL));
|
||||
candidate = (int *)clEnqueueMapBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), candidatebuffer, 1, CL_MAP_READ, 0, 4 * sizeof(int), 0, 0, 0, &status);
|
||||
candidate = (int *)clEnqueueMapBuffer(qu, candidatebuffer, 1, CL_MAP_READ, 0, 4 * sizeof(int) * outputsz, 0, 0, 0, &status);
|
||||
|
||||
for(int i = 0; i < outputsz; i++)
|
||||
{
|
||||
@ -1297,7 +1208,7 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
|
||||
free(scaleinfo);
|
||||
free(p);
|
||||
free(correction);
|
||||
clEnqueueUnmapMemObject((cl_command_queue)gsum.clCxt->oclCommandQueue(), candidatebuffer, candidate, 0, 0, 0);
|
||||
clEnqueueUnmapMemObject(qu, candidatebuffer, candidate, 0, 0, 0);
|
||||
openCLSafeCall(clReleaseMemObject(stagebuffer));
|
||||
openCLSafeCall(clReleaseMemObject(scaleinfobuffer));
|
||||
openCLSafeCall(clReleaseMemObject(nodebuffer));
|
||||
@ -1306,21 +1217,17 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
|
||||
openCLSafeCall(clReleaseMemObject(pbuffer));
|
||||
openCLSafeCall(clReleaseMemObject(correctionbuffer));
|
||||
}
|
||||
//t = (double)cvGetTickCount() ;
|
||||
|
||||
cvFree(&cascade->hid_cascade);
|
||||
// printf("%d\n",globalcounter);
|
||||
rectList.resize(allCandidates.size());
|
||||
if(!allCandidates.empty())
|
||||
std::copy(allCandidates.begin(), allCandidates.end(), rectList.begin());
|
||||
|
||||
//cout << "count = " << rectList.size()<< endl;
|
||||
|
||||
if( minNeighbors != 0 || findBiggestObject )
|
||||
groupRectangles(rectList, rweights, std::max(minNeighbors, 1), GROUP_EPS);
|
||||
else
|
||||
rweights.resize(rectList.size(), 0);
|
||||
|
||||
|
||||
if( findBiggestObject && rectList.size() )
|
||||
{
|
||||
CvAvgComp result_comp = {{0, 0, 0, 0}, 0};
|
||||
@ -1346,13 +1253,565 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
|
||||
cvSeqPush( result_seq, &c );
|
||||
}
|
||||
}
|
||||
//t = (double)cvGetTickCount() - t;
|
||||
//printf( "get face time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) );
|
||||
//alltime = (double)cvGetTickCount() - alltime;
|
||||
//printf( "all time = %g ms\n", alltime/((double)cvGetTickFrequency()*1000.) );
|
||||
|
||||
return result_seq;
|
||||
}
|
||||
|
||||
struct OclBuffers
|
||||
{
|
||||
cl_mem stagebuffer;
|
||||
cl_mem nodebuffer;
|
||||
cl_mem candidatebuffer;
|
||||
cl_mem scaleinfobuffer;
|
||||
cl_mem pbuffer;
|
||||
cl_mem correctionbuffer;
|
||||
cl_mem newnodebuffer;
|
||||
};
|
||||
|
||||
struct getRect
|
||||
{
|
||||
Rect operator()(const CvAvgComp &e) const
|
||||
{
|
||||
return e.rect;
|
||||
}
|
||||
};
|
||||
|
||||
void cv::ocl::OclCascadeClassifierBuf::detectMultiScale(oclMat &gimg, CV_OUT std::vector<cv::Rect>& faces,
|
||||
double scaleFactor, int minNeighbors, int flags,
|
||||
Size minSize, Size maxSize)
|
||||
{
|
||||
int blocksize = 8;
|
||||
int grp_per_CU = 12;
|
||||
size_t localThreads[3] = { blocksize, blocksize, 1 };
|
||||
size_t globalThreads[3] = { grp_per_CU * Context::getContext()->computeUnits() * localThreads[0],
|
||||
localThreads[1],
|
||||
1 };
|
||||
int outputsz = 256 * globalThreads[0] / localThreads[0];
|
||||
|
||||
Init(gimg.rows, gimg.cols, scaleFactor, flags, outputsz, localThreads, minSize, maxSize);
|
||||
|
||||
const double GROUP_EPS = 0.2;
|
||||
|
||||
cv::ConcurrentRectVector allCandidates;
|
||||
std::vector<cv::Rect> rectList;
|
||||
std::vector<int> rweights;
|
||||
|
||||
CvHaarClassifierCascade *cascade = oldCascade;
|
||||
GpuHidHaarClassifierCascade *gcascade;
|
||||
GpuHidHaarStageClassifier *stage;
|
||||
GpuHidHaarClassifier *classifier;
|
||||
GpuHidHaarTreeNode *node;
|
||||
|
||||
if( CV_MAT_DEPTH(gimg.type()) != CV_8U )
|
||||
CV_Error( CV_StsUnsupportedFormat, "Only 8-bit images are supported" );
|
||||
|
||||
if( CV_MAT_CN(gimg.type()) > 1 )
|
||||
{
|
||||
oclMat gtemp;
|
||||
cvtColor( gimg, gtemp, CV_BGR2GRAY );
|
||||
gimg = gtemp;
|
||||
}
|
||||
|
||||
int *candidate;
|
||||
|
||||
if( (flags & CV_HAAR_SCALE_IMAGE) )
|
||||
{
|
||||
int indexy = 0;
|
||||
CvSize sz;
|
||||
|
||||
cv::Rect roi, roi2;
|
||||
cv::Mat imgroi, imgroisq;
|
||||
cv::ocl::oclMat resizeroi, gimgroi, gimgroisq;
|
||||
|
||||
for( int i = 0; i < m_loopcount; i++ )
|
||||
{
|
||||
sz = sizev[i];
|
||||
roi = Rect(0, indexy, sz.width, sz.height);
|
||||
roi2 = Rect(0, 0, sz.width - 1, sz.height - 1);
|
||||
resizeroi = gimg1(roi2);
|
||||
gimgroi = gsum(roi);
|
||||
gimgroisq = gsqsum(roi);
|
||||
|
||||
cv::ocl::resize(gimg, resizeroi, Size(sz.width - 1, sz.height - 1), 0, 0, INTER_LINEAR);
|
||||
cv::ocl::integral(resizeroi, gimgroi, gimgroisq);
|
||||
indexy += sz.height;
|
||||
}
|
||||
|
||||
gcascade = (GpuHidHaarClassifierCascade *)(cascade->hid_cascade);
|
||||
stage = (GpuHidHaarStageClassifier *)(gcascade + 1);
|
||||
classifier = (GpuHidHaarClassifier *)(stage + gcascade->count);
|
||||
node = (GpuHidHaarTreeNode *)(classifier->node);
|
||||
|
||||
gpuSetImagesForHaarClassifierCascade( cascade, 1., gsum.step / 4 );
|
||||
|
||||
cl_command_queue qu = (cl_command_queue)gsum.clCxt->oclCommandQueue();
|
||||
openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->stagebuffer, 1, 0,
|
||||
sizeof(GpuHidHaarStageClassifier) * gcascade->count,
|
||||
stage, 0, NULL, NULL));
|
||||
|
||||
openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->nodebuffer, 1, 0,
|
||||
m_nodenum * sizeof(GpuHidHaarTreeNode),
|
||||
node, 0, NULL, NULL));
|
||||
|
||||
int startstage = 0;
|
||||
int endstage = gcascade->count;
|
||||
int startnode = 0;
|
||||
int pixelstep = gsum.step / 4;
|
||||
int splitstage = 3;
|
||||
int splitnode = stage[0].count + stage[1].count + stage[2].count;
|
||||
cl_int4 p, pq;
|
||||
p.s[0] = gcascade->p0;
|
||||
p.s[1] = gcascade->p1;
|
||||
p.s[2] = gcascade->p2;
|
||||
p.s[3] = gcascade->p3;
|
||||
pq.s[0] = gcascade->pq0;
|
||||
pq.s[1] = gcascade->pq1;
|
||||
pq.s[2] = gcascade->pq2;
|
||||
pq.s[3] = gcascade->pq3;
|
||||
float correction = gcascade->inv_window_area;
|
||||
|
||||
vector<pair<size_t, const void *> > args;
|
||||
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->stagebuffer ));
|
||||
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->scaleinfobuffer ));
|
||||
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->nodebuffer ));
|
||||
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&gsum.data ));
|
||||
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&gsqsum.data ));
|
||||
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->candidatebuffer ));
|
||||
args.push_back ( make_pair(sizeof(cl_int) , (void *)&pixelstep ));
|
||||
args.push_back ( make_pair(sizeof(cl_int) , (void *)&m_loopcount ));
|
||||
args.push_back ( make_pair(sizeof(cl_int) , (void *)&startstage ));
|
||||
args.push_back ( make_pair(sizeof(cl_int) , (void *)&splitstage ));
|
||||
args.push_back ( make_pair(sizeof(cl_int) , (void *)&endstage ));
|
||||
args.push_back ( make_pair(sizeof(cl_int) , (void *)&startnode ));
|
||||
args.push_back ( make_pair(sizeof(cl_int) , (void *)&splitnode ));
|
||||
args.push_back ( make_pair(sizeof(cl_int4) , (void *)&p ));
|
||||
args.push_back ( make_pair(sizeof(cl_int4) , (void *)&pq ));
|
||||
args.push_back ( make_pair(sizeof(cl_float) , (void *)&correction ));
|
||||
|
||||
openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascade", globalThreads, localThreads, args, -1, -1);
|
||||
|
||||
candidate = (int *)malloc(4 * sizeof(int) * outputsz);
|
||||
memset(candidate, 0, 4 * sizeof(int) * outputsz);
|
||||
openCLReadBuffer( gsum.clCxt, ((OclBuffers *)buffers)->candidatebuffer, candidate, 4 * sizeof(int)*outputsz );
|
||||
|
||||
for(int i = 0; i < outputsz; i++)
|
||||
if(candidate[4 * i + 2] != 0)
|
||||
allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1],
|
||||
candidate[4 * i + 2], candidate[4 * i + 3]));
|
||||
|
||||
free((void *)candidate);
|
||||
candidate = NULL;
|
||||
}
|
||||
else
|
||||
{
|
||||
cv::ocl::integral(gimg, gsum, gsqsum);
|
||||
|
||||
gpuSetHaarClassifierCascade(cascade);
|
||||
|
||||
gcascade = (GpuHidHaarClassifierCascade *)cascade->hid_cascade;
|
||||
stage = (GpuHidHaarStageClassifier *)(gcascade + 1);
|
||||
classifier = (GpuHidHaarClassifier *)(stage + gcascade->count);
|
||||
node = (GpuHidHaarTreeNode *)(classifier->node);
|
||||
|
||||
cl_command_queue qu = (cl_command_queue)gsum.clCxt->oclCommandQueue();
|
||||
openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->nodebuffer, 1, 0,
|
||||
m_nodenum * sizeof(GpuHidHaarTreeNode),
|
||||
node, 0, NULL, NULL));
|
||||
|
||||
cl_int4 *p = (cl_int4 *)malloc(sizeof(cl_int4) * m_loopcount);
|
||||
float *correction = (float *)malloc(sizeof(float) * m_loopcount);
|
||||
int startstage = 0;
|
||||
int endstage = gcascade->count;
|
||||
double factor;
|
||||
for(int i = 0; i < m_loopcount; i++)
|
||||
{
|
||||
factor = scalev[i];
|
||||
int equRect_x = (int)(factor * gcascade->p0 + 0.5);
|
||||
int equRect_y = (int)(factor * gcascade->p1 + 0.5);
|
||||
int equRect_w = (int)(factor * gcascade->p3 + 0.5);
|
||||
int equRect_h = (int)(factor * gcascade->p2 + 0.5);
|
||||
p[i].s[0] = equRect_x;
|
||||
p[i].s[1] = equRect_y;
|
||||
p[i].s[2] = equRect_x + equRect_w;
|
||||
p[i].s[3] = equRect_y + equRect_h;
|
||||
correction[i] = 1. / (equRect_w * equRect_h);
|
||||
int startnodenum = m_nodenum * i;
|
||||
float factor2 = (float)factor;
|
||||
|
||||
vector<pair<size_t, const void *> > args1;
|
||||
args1.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->nodebuffer ));
|
||||
args1.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->newnodebuffer ));
|
||||
args1.push_back ( make_pair(sizeof(cl_float) , (void *)&factor2 ));
|
||||
args1.push_back ( make_pair(sizeof(cl_float) , (void *)&correction[i] ));
|
||||
args1.push_back ( make_pair(sizeof(cl_int) , (void *)&startnodenum ));
|
||||
|
||||
size_t globalThreads2[3] = {m_nodenum, 1, 1};
|
||||
|
||||
openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuscaleclassifier", globalThreads2, NULL/*localThreads2*/, args1, -1, -1);
|
||||
}
|
||||
|
||||
int step = gsum.step / 4;
|
||||
int startnode = 0;
|
||||
int splitstage = 3;
|
||||
openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->stagebuffer, 1, 0, sizeof(GpuHidHaarStageClassifier)*gcascade->count, stage, 0, NULL, NULL));
|
||||
openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->pbuffer, 1, 0, sizeof(cl_int4)*m_loopcount, p, 0, NULL, NULL));
|
||||
openCLSafeCall(clEnqueueWriteBuffer(qu, ((OclBuffers *)buffers)->correctionbuffer, 1, 0, sizeof(cl_float)*m_loopcount, correction, 0, NULL, NULL));
|
||||
|
||||
vector<pair<size_t, const void *> > args;
|
||||
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->stagebuffer ));
|
||||
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->scaleinfobuffer ));
|
||||
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->newnodebuffer ));
|
||||
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&gsum.data ));
|
||||
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&gsqsum.data ));
|
||||
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->candidatebuffer ));
|
||||
args.push_back ( make_pair(sizeof(cl_int) , (void *)&gsum.rows ));
|
||||
args.push_back ( make_pair(sizeof(cl_int) , (void *)&gsum.cols ));
|
||||
args.push_back ( make_pair(sizeof(cl_int) , (void *)&step ));
|
||||
args.push_back ( make_pair(sizeof(cl_int) , (void *)&m_loopcount ));
|
||||
args.push_back ( make_pair(sizeof(cl_int) , (void *)&startstage ));
|
||||
args.push_back ( make_pair(sizeof(cl_int) , (void *)&splitstage ));
|
||||
args.push_back ( make_pair(sizeof(cl_int) , (void *)&endstage ));
|
||||
args.push_back ( make_pair(sizeof(cl_int) , (void *)&startnode ));
|
||||
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->pbuffer ));
|
||||
args.push_back ( make_pair(sizeof(cl_mem) , (void *)&((OclBuffers *)buffers)->correctionbuffer ));
|
||||
args.push_back ( make_pair(sizeof(cl_int) , (void *)&m_nodenum ));
|
||||
|
||||
openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuRunHaarClassifierCascade_scaled2", globalThreads, localThreads, args, -1, -1);
|
||||
|
||||
candidate = (int *)clEnqueueMapBuffer(qu, ((OclBuffers *)buffers)->candidatebuffer, 1, CL_MAP_READ, 0, 4 * sizeof(int) * outputsz, 0, 0, 0, NULL);
|
||||
|
||||
for(int i = 0; i < outputsz; i++)
|
||||
{
|
||||
if(candidate[4 * i + 2] != 0)
|
||||
allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1],
|
||||
candidate[4 * i + 2], candidate[4 * i + 3]));
|
||||
}
|
||||
|
||||
free(p);
|
||||
free(correction);
|
||||
clEnqueueUnmapMemObject(qu, ((OclBuffers *)buffers)->candidatebuffer, candidate, 0, 0, 0);
|
||||
}
|
||||
|
||||
rectList.resize(allCandidates.size());
|
||||
if(!allCandidates.empty())
|
||||
std::copy(allCandidates.begin(), allCandidates.end(), rectList.begin());
|
||||
|
||||
if( minNeighbors != 0 || findBiggestObject )
|
||||
groupRectangles(rectList, rweights, std::max(minNeighbors, 1), GROUP_EPS);
|
||||
else
|
||||
rweights.resize(rectList.size(), 0);
|
||||
|
||||
GenResult(faces, rectList, rweights);
|
||||
}
|
||||
|
||||
void cv::ocl::OclCascadeClassifierBuf::Init(const int rows, const int cols,
|
||||
double scaleFactor, int flags,
|
||||
const int outputsz, const size_t localThreads[],
|
||||
CvSize minSize, CvSize maxSize)
|
||||
{
|
||||
CvHaarClassifierCascade *cascade = oldCascade;
|
||||
|
||||
if( !CV_IS_HAAR_CLASSIFIER(cascade) )
|
||||
CV_Error( !cascade ? CV_StsNullPtr : CV_StsBadArg, "Invalid classifier cascade" );
|
||||
|
||||
if( scaleFactor <= 1 )
|
||||
CV_Error( CV_StsOutOfRange, "scale factor must be > 1" );
|
||||
|
||||
if( cols < minSize.width || rows < minSize.height )
|
||||
CV_Error(CV_StsError, "Image too small");
|
||||
|
||||
int datasize=0;
|
||||
int totalclassifier=0;
|
||||
|
||||
if( !cascade->hid_cascade )
|
||||
gpuCreateHidHaarClassifierCascade(cascade, &datasize, &totalclassifier);
|
||||
|
||||
if( maxSize.height == 0 || maxSize.width == 0 )
|
||||
{
|
||||
maxSize.height = rows;
|
||||
maxSize.width = cols;
|
||||
}
|
||||
|
||||
findBiggestObject = (flags & CV_HAAR_FIND_BIGGEST_OBJECT) != 0;
|
||||
if( findBiggestObject )
|
||||
flags &= ~(CV_HAAR_SCALE_IMAGE | CV_HAAR_DO_CANNY_PRUNING);
|
||||
|
||||
CreateBaseBufs(datasize, totalclassifier, flags, outputsz);
|
||||
CreateFactorRelatedBufs(rows, cols, flags, scaleFactor, localThreads, minSize, maxSize);
|
||||
|
||||
m_scaleFactor = scaleFactor;
|
||||
m_rows = rows;
|
||||
m_cols = cols;
|
||||
m_flags = flags;
|
||||
m_minSize = minSize;
|
||||
m_maxSize = maxSize;
|
||||
|
||||
initialized = true;
|
||||
}
|
||||
|
||||
void cv::ocl::OclCascadeClassifierBuf::CreateBaseBufs(const int datasize, const int totalclassifier,
|
||||
const int flags, const int outputsz)
|
||||
{
|
||||
if (!initialized)
|
||||
{
|
||||
buffers = malloc(sizeof(OclBuffers));
|
||||
|
||||
size_t tempSize =
|
||||
sizeof(GpuHidHaarStageClassifier) * ((GpuHidHaarClassifierCascade *)oldCascade->hid_cascade)->count;
|
||||
m_nodenum = (datasize - sizeof(GpuHidHaarClassifierCascade) - tempSize - sizeof(GpuHidHaarClassifier) * totalclassifier)
|
||||
/ sizeof(GpuHidHaarTreeNode);
|
||||
|
||||
((OclBuffers *)buffers)->stagebuffer = openCLCreateBuffer(cv::ocl::Context::getContext(), CL_MEM_READ_ONLY, tempSize);
|
||||
((OclBuffers *)buffers)->nodebuffer = openCLCreateBuffer(cv::ocl::Context::getContext(), CL_MEM_READ_ONLY, m_nodenum * sizeof(GpuHidHaarTreeNode));
|
||||
}
|
||||
|
||||
if (initialized
|
||||
&& ((m_flags & CV_HAAR_SCALE_IMAGE) ^ (flags & CV_HAAR_SCALE_IMAGE)))
|
||||
{
|
||||
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->candidatebuffer));
|
||||
}
|
||||
|
||||
if (flags & CV_HAAR_SCALE_IMAGE)
|
||||
{
|
||||
((OclBuffers *)buffers)->candidatebuffer = openCLCreateBuffer(cv::ocl::Context::getContext(),
|
||||
CL_MEM_WRITE_ONLY,
|
||||
4 * sizeof(int) * outputsz);
|
||||
}
|
||||
else
|
||||
{
|
||||
((OclBuffers *)buffers)->candidatebuffer = openCLCreateBuffer(cv::ocl::Context::getContext(),
|
||||
CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR,
|
||||
4 * sizeof(int) * outputsz);
|
||||
}
|
||||
}
|
||||
|
||||
void cv::ocl::OclCascadeClassifierBuf::CreateFactorRelatedBufs(
|
||||
const int rows, const int cols, const int flags,
|
||||
const double scaleFactor, const size_t localThreads[],
|
||||
CvSize minSize, CvSize maxSize)
|
||||
{
|
||||
if (initialized)
|
||||
{
|
||||
if ((m_flags & CV_HAAR_SCALE_IMAGE) && !(flags & CV_HAAR_SCALE_IMAGE))
|
||||
{
|
||||
gimg1.release();
|
||||
gsum.release();
|
||||
gsqsum.release();
|
||||
}
|
||||
else if (!(m_flags & CV_HAAR_SCALE_IMAGE) && (flags & CV_HAAR_SCALE_IMAGE))
|
||||
{
|
||||
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->newnodebuffer));
|
||||
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->correctionbuffer));
|
||||
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->pbuffer));
|
||||
}
|
||||
else if ((m_flags & CV_HAAR_SCALE_IMAGE) && (flags & CV_HAAR_SCALE_IMAGE))
|
||||
{
|
||||
if (fabs(m_scaleFactor - scaleFactor) < 1e-6
|
||||
&& (rows == m_rows && cols == m_cols)
|
||||
&& (minSize.width == m_minSize.width)
|
||||
&& (minSize.height == m_minSize.height)
|
||||
&& (maxSize.width == m_maxSize.width)
|
||||
&& (maxSize.height == m_maxSize.height))
|
||||
{
|
||||
return;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (fabs(m_scaleFactor - scaleFactor) < 1e-6
|
||||
&& (rows == m_rows && cols == m_cols)
|
||||
&& (minSize.width == m_minSize.width)
|
||||
&& (minSize.height == m_minSize.height)
|
||||
&& (maxSize.width == m_maxSize.width)
|
||||
&& (maxSize.height == m_maxSize.height))
|
||||
{
|
||||
return;
|
||||
}
|
||||
else
|
||||
{
|
||||
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->newnodebuffer));
|
||||
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->correctionbuffer));
|
||||
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->pbuffer));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int loopcount;
|
||||
int indexy = 0;
|
||||
int totalheight = 0;
|
||||
double factor;
|
||||
Rect roi;
|
||||
CvSize sz;
|
||||
CvSize winSize0 = oldCascade->orig_window_size;
|
||||
detect_piramid_info *scaleinfo;
|
||||
if (flags & CV_HAAR_SCALE_IMAGE)
|
||||
{
|
||||
for(factor = 1.f;; factor *= scaleFactor)
|
||||
{
|
||||
CvSize winSize = { cvRound(winSize0.width * factor), cvRound(winSize0.height * factor) };
|
||||
sz.width = cvRound( cols / factor ) + 1;
|
||||
sz.height = cvRound( rows / factor ) + 1;
|
||||
CvSize sz1 = { sz.width - winSize0.width - 1, sz.height - winSize0.height - 1 };
|
||||
|
||||
if( sz1.width <= 0 || sz1.height <= 0 )
|
||||
break;
|
||||
if( winSize.width > maxSize.width || winSize.height > maxSize.height )
|
||||
break;
|
||||
if( winSize.width < minSize.width || winSize.height < minSize.height )
|
||||
continue;
|
||||
|
||||
totalheight += sz.height;
|
||||
sizev.push_back(sz);
|
||||
scalev.push_back(static_cast<float>(factor));
|
||||
}
|
||||
|
||||
loopcount = sizev.size();
|
||||
gimg1.create(rows, cols, CV_8UC1);
|
||||
gsum.create(totalheight + 4, cols + 1, CV_32SC1);
|
||||
gsqsum.create(totalheight + 4, cols + 1, CV_32FC1);
|
||||
|
||||
scaleinfo = (detect_piramid_info *)malloc(sizeof(detect_piramid_info) * loopcount);
|
||||
for( int i = 0; i < loopcount; i++ )
|
||||
{
|
||||
sz = sizev[i];
|
||||
roi = Rect(0, indexy, sz.width, sz.height);
|
||||
int width = sz.width - 1 - oldCascade->orig_window_size.width;
|
||||
int height = sz.height - 1 - oldCascade->orig_window_size.height;
|
||||
int grpnumperline = (width + localThreads[0] - 1) / localThreads[0];
|
||||
int totalgrp = ((height + localThreads[1] - 1) / localThreads[1]) * grpnumperline;
|
||||
|
||||
((detect_piramid_info *)scaleinfo)[i].width_height = (width << 16) | height;
|
||||
((detect_piramid_info *)scaleinfo)[i].grpnumperline_totalgrp = (grpnumperline << 16) | totalgrp;
|
||||
((detect_piramid_info *)scaleinfo)[i].imgoff = gsum(roi).offset >> 2;
|
||||
((detect_piramid_info *)scaleinfo)[i].factor = scalev[i];
|
||||
|
||||
indexy += sz.height;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for(factor = 1;
|
||||
cvRound(factor * winSize0.width) < cols - 10 && cvRound(factor * winSize0.height) < rows - 10;
|
||||
factor *= scaleFactor)
|
||||
{
|
||||
CvSize winSize = { cvRound( winSize0.width * factor ), cvRound( winSize0.height * factor ) };
|
||||
if( winSize.width < minSize.width || winSize.height < minSize.height )
|
||||
{
|
||||
continue;
|
||||
}
|
||||
sizev.push_back(winSize);
|
||||
scalev.push_back(factor);
|
||||
}
|
||||
|
||||
loopcount = scalev.size();
|
||||
if(loopcount == 0)
|
||||
{
|
||||
loopcount = 1;
|
||||
sizev.push_back(minSize);
|
||||
scalev.push_back( min(cvRound(minSize.width / winSize0.width), cvRound(minSize.height / winSize0.height)) );
|
||||
}
|
||||
|
||||
((OclBuffers *)buffers)->pbuffer = openCLCreateBuffer(cv::ocl::Context::getContext(), CL_MEM_READ_ONLY,
|
||||
sizeof(cl_int4) * loopcount);
|
||||
((OclBuffers *)buffers)->correctionbuffer = openCLCreateBuffer(cv::ocl::Context::getContext(), CL_MEM_READ_ONLY,
|
||||
sizeof(cl_float) * loopcount);
|
||||
((OclBuffers *)buffers)->newnodebuffer = openCLCreateBuffer(cv::ocl::Context::getContext(), CL_MEM_READ_WRITE,
|
||||
loopcount * m_nodenum * sizeof(GpuHidHaarTreeNode));
|
||||
|
||||
scaleinfo = (detect_piramid_info *)malloc(sizeof(detect_piramid_info) * loopcount);
|
||||
for( int i = 0; i < loopcount; i++ )
|
||||
{
|
||||
sz = sizev[i];
|
||||
factor = scalev[i];
|
||||
int ystep = cvRound(std::max(2., factor));
|
||||
int width = (cols - 1 - sz.width + ystep - 1) / ystep;
|
||||
int height = (rows - 1 - sz.height + ystep - 1) / ystep;
|
||||
int grpnumperline = (width + localThreads[0] - 1) / localThreads[0];
|
||||
int totalgrp = ((height + localThreads[1] - 1) / localThreads[1]) * grpnumperline;
|
||||
|
||||
((detect_piramid_info *)scaleinfo)[i].width_height = (width << 16) | height;
|
||||
((detect_piramid_info *)scaleinfo)[i].grpnumperline_totalgrp = (grpnumperline << 16) | totalgrp;
|
||||
((detect_piramid_info *)scaleinfo)[i].imgoff = 0;
|
||||
((detect_piramid_info *)scaleinfo)[i].factor = factor;
|
||||
}
|
||||
}
|
||||
|
||||
if (loopcount != m_loopcount)
|
||||
{
|
||||
if (initialized)
|
||||
{
|
||||
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->scaleinfobuffer));
|
||||
}
|
||||
((OclBuffers *)buffers)->scaleinfobuffer = openCLCreateBuffer(cv::ocl::Context::getContext(), CL_MEM_READ_ONLY, sizeof(detect_piramid_info) * loopcount);
|
||||
}
|
||||
|
||||
openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)cv::ocl::Context::getContext()->oclCommandQueue(), ((OclBuffers *)buffers)->scaleinfobuffer, 1, 0,
|
||||
sizeof(detect_piramid_info)*loopcount,
|
||||
scaleinfo, 0, NULL, NULL));
|
||||
free(scaleinfo);
|
||||
|
||||
m_loopcount = loopcount;
|
||||
}
|
||||
|
||||
void cv::ocl::OclCascadeClassifierBuf::GenResult(CV_OUT std::vector<cv::Rect>& faces,
|
||||
const std::vector<cv::Rect> &rectList,
|
||||
const std::vector<int> &rweights)
|
||||
{
|
||||
CvSeq *result_seq = cvCreateSeq( 0, sizeof(CvSeq), sizeof(CvAvgComp), cvCreateMemStorage(0) );
|
||||
|
||||
if( findBiggestObject && rectList.size() )
|
||||
{
|
||||
CvAvgComp result_comp = {{0, 0, 0, 0}, 0};
|
||||
|
||||
for( size_t i = 0; i < rectList.size(); i++ )
|
||||
{
|
||||
cv::Rect r = rectList[i];
|
||||
if( r.area() > cv::Rect(result_comp.rect).area() )
|
||||
{
|
||||
result_comp.rect = r;
|
||||
result_comp.neighbors = rweights[i];
|
||||
}
|
||||
}
|
||||
cvSeqPush( result_seq, &result_comp );
|
||||
}
|
||||
else
|
||||
{
|
||||
for( size_t i = 0; i < rectList.size(); i++ )
|
||||
{
|
||||
CvAvgComp c;
|
||||
c.rect = rectList[i];
|
||||
c.neighbors = rweights[i];
|
||||
cvSeqPush( result_seq, &c );
|
||||
}
|
||||
}
|
||||
|
||||
vector<CvAvgComp> vecAvgComp;
|
||||
Seq<CvAvgComp>(result_seq).copyTo(vecAvgComp);
|
||||
faces.resize(vecAvgComp.size());
|
||||
std::transform(vecAvgComp.begin(), vecAvgComp.end(), faces.begin(), getRect());
|
||||
}
|
||||
|
||||
void cv::ocl::OclCascadeClassifierBuf::release()
|
||||
{
|
||||
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->stagebuffer));
|
||||
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->scaleinfobuffer));
|
||||
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->nodebuffer));
|
||||
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->candidatebuffer));
|
||||
|
||||
if( (m_flags & CV_HAAR_SCALE_IMAGE) )
|
||||
{
|
||||
cvFree(&oldCascade->hid_cascade);
|
||||
}
|
||||
else
|
||||
{
|
||||
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->newnodebuffer));
|
||||
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->correctionbuffer));
|
||||
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->pbuffer));
|
||||
}
|
||||
|
||||
free(buffers);
|
||||
buffers = NULL;
|
||||
}
|
||||
|
||||
#ifndef _MAX_PATH
|
||||
#define _MAX_PATH 1024
|
||||
|
@ -112,7 +112,7 @@ typedef struct __attribute__((aligned (64))) GpuHidHaarClassifierCascade
|
||||
} GpuHidHaarClassifierCascade;
|
||||
|
||||
|
||||
__kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCascade(//constant GpuHidHaarClassifierCascade * cascade,
|
||||
__kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCascade(
|
||||
global GpuHidHaarStageClassifier * stagecascadeptr,
|
||||
global int4 * info,
|
||||
global GpuHidHaarTreeNode * nodeptr,
|
||||
@ -128,12 +128,7 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
|
||||
const int splitnode,
|
||||
const int4 p,
|
||||
const int4 pq,
|
||||
const float correction
|
||||
//const int width,
|
||||
//const int height,
|
||||
//const int grpnumperline,
|
||||
//const int totalgrp
|
||||
)
|
||||
const float correction)
|
||||
{
|
||||
int grpszx = get_local_size(0);
|
||||
int grpszy = get_local_size(1);
|
||||
@ -145,13 +140,8 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
|
||||
int lcl_sz = mul24(grpszx,grpszy);
|
||||
int lcl_id = mad24(lclidy,grpszx,lclidx);
|
||||
|
||||
//assume lcl_sz == 256 or 128 or 64
|
||||
//int lcl_sz_shift = (lcl_sz == 256) ? 8 : 7;
|
||||
//lcl_sz_shift = (lcl_sz == 64) ? 6 : lcl_sz_shift;
|
||||
__local int lclshare[1024];
|
||||
|
||||
#define OFF 0
|
||||
__local int* lcldata = lclshare + OFF;//for save win data
|
||||
__local int* lcldata = lclshare;//for save win data
|
||||
__local int* glboutindex = lcldata + 28*28;//for save global out index
|
||||
__local int* lclcount = glboutindex + 1;//for save the numuber of temp pass pixel
|
||||
__local int* lcloutindex = lclcount + 1;//for save info of temp pass pixel
|
||||
@ -181,7 +171,6 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
|
||||
int totalgrp = scaleinfo1.y & 0xffff;
|
||||
int imgoff = scaleinfo1.z;
|
||||
float factor = as_float(scaleinfo1.w);
|
||||
//int ystep =1;// factor > 2.0 ? 1 : 2;
|
||||
|
||||
__global const int * sum = sum1 + imgoff;
|
||||
__global const float * sqsum = sqsum1 + imgoff;
|
||||
@ -191,8 +180,6 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
|
||||
int grpidx = grploop - mul24(grpidy, grpnumperline);
|
||||
int x = mad24(grpidx,grpszx,lclidx);
|
||||
int y = mad24(grpidy,grpszy,lclidy);
|
||||
//candidate_result.x = convert_int_rtn(x*factor);
|
||||
//candidate_result.y = convert_int_rtn(y*factor);
|
||||
int grpoffx = x-lclidx;
|
||||
int grpoffy = y-lclidy;
|
||||
|
||||
@ -211,14 +198,7 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
|
||||
int4 data = *(__global int4*)&sum[glb_off];
|
||||
int lcl_off = mad24(lcl_y, readwidth, lcl_x<<2);
|
||||
|
||||
#if OFF
|
||||
lcldata[lcl_off] = data.x;
|
||||
lcldata[lcl_off+1] = data.y;
|
||||
lcldata[lcl_off+2] = data.z;
|
||||
lcldata[lcl_off+3] = data.w;
|
||||
#else
|
||||
vstore4(data, 0, &lcldata[lcl_off]);
|
||||
#endif
|
||||
}
|
||||
|
||||
lcloutindex[lcl_id] = 0;
|
||||
@ -231,184 +211,170 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
|
||||
int lcl_off = mad24(lclidy,readwidth,lclidx);
|
||||
int4 cascadeinfo1, cascadeinfo2;
|
||||
cascadeinfo1 = p;
|
||||
cascadeinfo2 = pq;// + mad24(y, pixelstep, x);
|
||||
cascadeinfo2 = pq;
|
||||
|
||||
cascadeinfo1.x +=lcl_off;
|
||||
cascadeinfo1.z +=lcl_off;
|
||||
mean = (lcldata[mad24(cascadeinfo1.y,readwidth,cascadeinfo1.x)] - lcldata[mad24(cascadeinfo1.y,readwidth,cascadeinfo1.z)] -
|
||||
lcldata[mad24(cascadeinfo1.w,readwidth,cascadeinfo1.x)] + lcldata[mad24(cascadeinfo1.w,readwidth,cascadeinfo1.z)])
|
||||
*correction;
|
||||
|
||||
//if((x < width) && (y < height))
|
||||
int p_offset = mad24(y, pixelstep, x);
|
||||
|
||||
cascadeinfo2.x +=p_offset;
|
||||
cascadeinfo2.z +=p_offset;
|
||||
variance_norm_factor =sqsum[mad24(cascadeinfo2.y, pixelstep, cascadeinfo2.x)] - sqsum[mad24(cascadeinfo2.y, pixelstep, cascadeinfo2.z)] -
|
||||
sqsum[mad24(cascadeinfo2.w, pixelstep, cascadeinfo2.x)] + sqsum[mad24(cascadeinfo2.w, pixelstep, cascadeinfo2.z)];
|
||||
|
||||
variance_norm_factor = variance_norm_factor * correction - mean * mean;
|
||||
variance_norm_factor = variance_norm_factor >=0.f ? sqrt(variance_norm_factor) : 1.f;
|
||||
|
||||
for(int stageloop = start_stage; (stageloop < split_stage) && result; stageloop++ )
|
||||
{
|
||||
cascadeinfo1.x +=lcl_off;
|
||||
cascadeinfo1.z +=lcl_off;
|
||||
mean = (lcldata[mad24(cascadeinfo1.y,readwidth,cascadeinfo1.x)] - lcldata[mad24(cascadeinfo1.y,readwidth,cascadeinfo1.z)] -
|
||||
lcldata[mad24(cascadeinfo1.w,readwidth,cascadeinfo1.x)] + lcldata[mad24(cascadeinfo1.w,readwidth,cascadeinfo1.z)])
|
||||
*correction;
|
||||
float stage_sum = 0.f;
|
||||
int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop);
|
||||
float stagethreshold = as_float(stageinfo.y);
|
||||
for(int nodeloop = 0; nodeloop < stageinfo.x; nodeloop++ )
|
||||
{
|
||||
__global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter);
|
||||
|
||||
int p_offset = mad24(y, pixelstep, x);
|
||||
int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
|
||||
int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
|
||||
int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
|
||||
float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
|
||||
float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0]));
|
||||
float nodethreshold = w.w * variance_norm_factor;
|
||||
|
||||
cascadeinfo2.x +=p_offset;
|
||||
cascadeinfo2.z +=p_offset;
|
||||
variance_norm_factor =sqsum[mad24(cascadeinfo2.y, pixelstep, cascadeinfo2.x)] - sqsum[mad24(cascadeinfo2.y, pixelstep, cascadeinfo2.z)] -
|
||||
sqsum[mad24(cascadeinfo2.w, pixelstep, cascadeinfo2.x)] + sqsum[mad24(cascadeinfo2.w, pixelstep, cascadeinfo2.z)];
|
||||
info1.x +=lcl_off;
|
||||
info1.z +=lcl_off;
|
||||
info2.x +=lcl_off;
|
||||
info2.z +=lcl_off;
|
||||
|
||||
variance_norm_factor = variance_norm_factor * correction - mean * mean;
|
||||
variance_norm_factor = variance_norm_factor >=0.f ? sqrt(variance_norm_factor) : 1.f;
|
||||
//if( cascade->is_stump_based )
|
||||
//{
|
||||
for(int stageloop = start_stage; (stageloop < split_stage) && result; stageloop++ )
|
||||
float classsum = (lcldata[mad24(info1.y,readwidth,info1.x)] - lcldata[mad24(info1.y,readwidth,info1.z)] -
|
||||
lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x;
|
||||
|
||||
classsum += (lcldata[mad24(info2.y,readwidth,info2.x)] - lcldata[mad24(info2.y,readwidth,info2.z)] -
|
||||
lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y;
|
||||
|
||||
info3.x +=lcl_off;
|
||||
info3.z +=lcl_off;
|
||||
classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] -
|
||||
lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z;
|
||||
|
||||
stage_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
|
||||
nodecounter++;
|
||||
}
|
||||
|
||||
result = (stage_sum >= stagethreshold);
|
||||
}
|
||||
|
||||
if(result && (x < width) && (y < height))
|
||||
{
|
||||
int queueindex = atomic_inc(lclcount);
|
||||
lcloutindex[queueindex<<1] = (lclidy << 16) | lclidx;
|
||||
lcloutindex[(queueindex<<1)+1] = as_int(variance_norm_factor);
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
int queuecount = lclcount[0];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
nodecounter = splitnode;
|
||||
for(int stageloop = split_stage; stageloop< end_stage && queuecount>0; stageloop++)
|
||||
{
|
||||
lclcount[0]=0;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop);
|
||||
float stagethreshold = as_float(stageinfo.y);
|
||||
|
||||
int perfscale = queuecount > 4 ? 3 : 2;
|
||||
int queuecount_loop = (queuecount + (1<<perfscale)-1) >> perfscale;
|
||||
int lcl_compute_win = lcl_sz >> perfscale;
|
||||
int lcl_compute_win_id = (lcl_id >>(6-perfscale));
|
||||
int lcl_loops = (stageinfo.x + lcl_compute_win -1) >> (6-perfscale);
|
||||
int lcl_compute_id = lcl_id - (lcl_compute_win_id << (6-perfscale));
|
||||
for(int queueloop=0; queueloop<queuecount_loop; queueloop++)
|
||||
{
|
||||
float stage_sum = 0.f;
|
||||
int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop);
|
||||
float stagethreshold = as_float(stageinfo.y);
|
||||
for(int nodeloop = 0; nodeloop < stageinfo.x; nodeloop++ )
|
||||
int temp_coord = lcloutindex[lcl_compute_win_id<<1];
|
||||
float variance_norm_factor = as_float(lcloutindex[(lcl_compute_win_id<<1)+1]);
|
||||
int queue_pixel = mad24(((temp_coord & (int)0xffff0000)>>16),readwidth,temp_coord & 0xffff);
|
||||
|
||||
if(lcl_compute_win_id < queuecount)
|
||||
{
|
||||
__global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter);
|
||||
|
||||
int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
|
||||
int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
|
||||
int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
|
||||
float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
|
||||
float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0]));
|
||||
float nodethreshold = w.w * variance_norm_factor;
|
||||
int tempnodecounter = lcl_compute_id;
|
||||
float part_sum = 0.f;
|
||||
for(int lcl_loop=0; lcl_loop<lcl_loops && tempnodecounter<stageinfo.x; lcl_loop++)
|
||||
{
|
||||
__global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter + tempnodecounter);
|
||||
|
||||
info1.x +=lcl_off;
|
||||
info1.z +=lcl_off;
|
||||
info2.x +=lcl_off;
|
||||
info2.z +=lcl_off;
|
||||
int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
|
||||
int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
|
||||
int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
|
||||
float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
|
||||
float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0]));
|
||||
float nodethreshold = w.w * variance_norm_factor;
|
||||
|
||||
float classsum = (lcldata[mad24(info1.y,readwidth,info1.x)] - lcldata[mad24(info1.y,readwidth,info1.z)] -
|
||||
lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x;
|
||||
info1.x +=queue_pixel;
|
||||
info1.z +=queue_pixel;
|
||||
info2.x +=queue_pixel;
|
||||
info2.z +=queue_pixel;
|
||||
|
||||
float classsum = (lcldata[mad24(info1.y,readwidth,info1.x)] - lcldata[mad24(info1.y,readwidth,info1.z)] -
|
||||
lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x;
|
||||
|
||||
|
||||
classsum += (lcldata[mad24(info2.y,readwidth,info2.x)] - lcldata[mad24(info2.y,readwidth,info2.z)] -
|
||||
lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y;
|
||||
classsum += (lcldata[mad24(info2.y,readwidth,info2.x)] - lcldata[mad24(info2.y,readwidth,info2.z)] -
|
||||
lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y;
|
||||
|
||||
info3.x +=queue_pixel;
|
||||
info3.z +=queue_pixel;
|
||||
classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] -
|
||||
lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z;
|
||||
|
||||
//if((info3.z - info3.x) && (!stageinfo.z))
|
||||
//{
|
||||
info3.x +=lcl_off;
|
||||
info3.z +=lcl_off;
|
||||
classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] -
|
||||
lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z;
|
||||
//}
|
||||
stage_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
|
||||
nodecounter++;
|
||||
part_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
|
||||
tempnodecounter +=lcl_compute_win;
|
||||
}//end for(int lcl_loop=0;lcl_loop<lcl_loops;lcl_loop++)
|
||||
partialsum[lcl_id]=part_sum;
|
||||
}
|
||||
|
||||
result = (stage_sum >= stagethreshold);
|
||||
}
|
||||
|
||||
if(result && (x < width) && (y < height))
|
||||
{
|
||||
int queueindex = atomic_inc(lclcount);
|
||||
lcloutindex[queueindex<<1] = (lclidy << 16) | lclidx;
|
||||
lcloutindex[(queueindex<<1)+1] = as_int(variance_norm_factor);
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
int queuecount = lclcount[0];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
nodecounter = splitnode;
|
||||
for(int stageloop = split_stage; stageloop< end_stage && queuecount>0; stageloop++)
|
||||
{
|
||||
//barrier(CLK_LOCAL_MEM_FENCE);
|
||||
//if(lcl_id == 0)
|
||||
lclcount[0]=0;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop);
|
||||
float stagethreshold = as_float(stageinfo.y);
|
||||
|
||||
int perfscale = queuecount > 4 ? 3 : 2;
|
||||
int queuecount_loop = (queuecount + (1<<perfscale)-1) >> perfscale;
|
||||
int lcl_compute_win = lcl_sz >> perfscale;
|
||||
int lcl_compute_win_id = (lcl_id >>(6-perfscale));
|
||||
int lcl_loops = (stageinfo.x + lcl_compute_win -1) >> (6-perfscale);
|
||||
int lcl_compute_id = lcl_id - (lcl_compute_win_id << (6-perfscale));
|
||||
for(int queueloop=0; queueloop<queuecount_loop/* && lcl_compute_win_id < queuecount*/; queueloop++)
|
||||
if(lcl_compute_win_id < queuecount)
|
||||
{
|
||||
float stage_sum = 0.f;
|
||||
int temp_coord = lcloutindex[lcl_compute_win_id<<1];
|
||||
float variance_norm_factor = as_float(lcloutindex[(lcl_compute_win_id<<1)+1]);
|
||||
int queue_pixel = mad24(((temp_coord & (int)0xffff0000)>>16),readwidth,temp_coord & 0xffff);
|
||||
|
||||
//barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(lcl_compute_win_id < queuecount)
|
||||
for(int i=0; i<lcl_compute_win && (lcl_compute_id==0); i++)
|
||||
{
|
||||
|
||||
int tempnodecounter = lcl_compute_id;
|
||||
float part_sum = 0.f;
|
||||
for(int lcl_loop=0; lcl_loop<lcl_loops && tempnodecounter<stageinfo.x; lcl_loop++)
|
||||
{
|
||||
__global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter + tempnodecounter);
|
||||
|
||||
int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
|
||||
int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
|
||||
int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
|
||||
float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
|
||||
float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0]));
|
||||
float nodethreshold = w.w * variance_norm_factor;
|
||||
|
||||
info1.x +=queue_pixel;
|
||||
info1.z +=queue_pixel;
|
||||
info2.x +=queue_pixel;
|
||||
info2.z +=queue_pixel;
|
||||
|
||||
float classsum = (lcldata[mad24(info1.y,readwidth,info1.x)] - lcldata[mad24(info1.y,readwidth,info1.z)] -
|
||||
lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x;
|
||||
|
||||
|
||||
classsum += (lcldata[mad24(info2.y,readwidth,info2.x)] - lcldata[mad24(info2.y,readwidth,info2.z)] -
|
||||
lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y;
|
||||
//if((info3.z - info3.x) && (!stageinfo.z))
|
||||
//{
|
||||
info3.x +=queue_pixel;
|
||||
info3.z +=queue_pixel;
|
||||
classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] -
|
||||
lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z;
|
||||
//}
|
||||
part_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
|
||||
tempnodecounter +=lcl_compute_win;
|
||||
}//end for(int lcl_loop=0;lcl_loop<lcl_loops;lcl_loop++)
|
||||
partialsum[lcl_id]=part_sum;
|
||||
stage_sum += partialsum[lcl_id+i];
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(lcl_compute_win_id < queuecount)
|
||||
if(stage_sum >= stagethreshold && (lcl_compute_id==0))
|
||||
{
|
||||
for(int i=0; i<lcl_compute_win && (lcl_compute_id==0); i++)
|
||||
{
|
||||
stage_sum += partialsum[lcl_id+i];
|
||||
}
|
||||
if(stage_sum >= stagethreshold && (lcl_compute_id==0))
|
||||
{
|
||||
int queueindex = atomic_inc(lclcount);
|
||||
lcloutindex[queueindex<<1] = temp_coord;
|
||||
lcloutindex[(queueindex<<1)+1] = as_int(variance_norm_factor);
|
||||
}
|
||||
lcl_compute_win_id +=(1<<perfscale);
|
||||
int queueindex = atomic_inc(lclcount);
|
||||
lcloutindex[queueindex<<1] = temp_coord;
|
||||
lcloutindex[(queueindex<<1)+1] = as_int(variance_norm_factor);
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}//end for(int queueloop=0;queueloop<queuecount_loop;queueloop++)
|
||||
//barrier(CLK_LOCAL_MEM_FENCE);
|
||||
queuecount = lclcount[0];
|
||||
lcl_compute_win_id +=(1<<perfscale);
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
nodecounter += stageinfo.x;
|
||||
}//end for(int stageloop = splitstage; stageloop< endstage && queuecount>0;stageloop++)
|
||||
//barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(lcl_id<queuecount)
|
||||
{
|
||||
int temp = lcloutindex[lcl_id<<1];
|
||||
int x = mad24(grpidx,grpszx,temp & 0xffff);
|
||||
int y = mad24(grpidy,grpszy,((temp & (int)0xffff0000) >> 16));
|
||||
temp = glboutindex[0];
|
||||
int4 candidate_result;
|
||||
candidate_result.zw = (int2)convert_int_rtn(factor*20.f);
|
||||
candidate_result.x = convert_int_rtn(x*factor);
|
||||
candidate_result.y = convert_int_rtn(y*factor);
|
||||
atomic_inc(glboutindex);
|
||||
candidate[outputoff+temp+lcl_id] = candidate_result;
|
||||
}
|
||||
}//end for(int queueloop=0;queueloop<queuecount_loop;queueloop++)
|
||||
|
||||
queuecount = lclcount[0];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}//end if((x < width) && (y < height))
|
||||
nodecounter += stageinfo.x;
|
||||
}//end for(int stageloop = splitstage; stageloop< endstage && queuecount>0;stageloop++)
|
||||
|
||||
if(lcl_id<queuecount)
|
||||
{
|
||||
int temp = lcloutindex[lcl_id<<1];
|
||||
int x = mad24(grpidx,grpszx,temp & 0xffff);
|
||||
int y = mad24(grpidy,grpszy,((temp & (int)0xffff0000) >> 16));
|
||||
temp = glboutindex[0];
|
||||
int4 candidate_result;
|
||||
candidate_result.zw = (int2)convert_int_rtn(factor*20.f);
|
||||
candidate_result.x = convert_int_rtn(x*factor);
|
||||
candidate_result.y = convert_int_rtn(y*factor);
|
||||
atomic_inc(glboutindex);
|
||||
candidate[outputoff+temp+lcl_id] = candidate_result;
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}//end for(int grploop=grpidx;grploop<totalgrp;grploop+=grpnumx)
|
||||
//outputoff +=mul24(width,height);
|
||||
}//end for(int scalei = 0; scalei <loopcount; scalei++)
|
||||
}
|
||||
|
||||
|
@ -16,6 +16,7 @@
|
||||
//
|
||||
// @Authors
|
||||
// Wu Xinglong, wxl370@126.com
|
||||
// Sen Liu, swjtuls1987@126.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
@ -52,11 +53,11 @@ typedef struct __attribute__((aligned(128))) GpuHidHaarFeature
|
||||
{
|
||||
struct __attribute__((aligned(32)))
|
||||
{
|
||||
int p0 __attribute__((aligned(4)));
|
||||
int p1 __attribute__((aligned(4)));
|
||||
int p2 __attribute__((aligned(4)));
|
||||
int p3 __attribute__((aligned(4)));
|
||||
float weight __attribute__((aligned(4)));
|
||||
int p0 __attribute__((aligned(4)));
|
||||
int p1 __attribute__((aligned(4)));
|
||||
int p2 __attribute__((aligned(4)));
|
||||
int p3 __attribute__((aligned(4)));
|
||||
float weight __attribute__((aligned(4)));
|
||||
}
|
||||
rect[CV_HAAR_FEATURE_MAX] __attribute__((aligned(32)));
|
||||
}
|
||||
@ -113,173 +114,168 @@ __kernel void gpuRunHaarClassifierCascade_scaled2(
|
||||
global const int *restrict sum,
|
||||
global const float *restrict sqsum,
|
||||
global int4 *candidate,
|
||||
const int rows,
|
||||
const int cols,
|
||||
const int step,
|
||||
const int loopcount,
|
||||
const int start_stage,
|
||||
const int split_stage,
|
||||
const int end_stage,
|
||||
const int startnode,
|
||||
const int splitnode,
|
||||
global int4 *p,
|
||||
//const int4 * pq,
|
||||
global float *correction,
|
||||
const int nodecount)
|
||||
{
|
||||
int grpszx = get_local_size(0);
|
||||
int grpszy = get_local_size(1);
|
||||
int grpnumx = get_num_groups(0);
|
||||
int grpidx = get_group_id(0);
|
||||
int lclidx = get_local_id(0);
|
||||
int lclidy = get_local_id(1);
|
||||
int lcl_sz = mul24(grpszx, grpszy);
|
||||
int lcl_id = mad24(lclidy, grpszx, lclidx);
|
||||
__local int lclshare[1024];
|
||||
__local int *glboutindex = lclshare + 0;
|
||||
__local int *lclcount = glboutindex + 1;
|
||||
__local int *lcloutindex = lclcount + 1;
|
||||
__local float *partialsum = (__local float *)(lcloutindex + (lcl_sz << 1));
|
||||
glboutindex[0] = 0;
|
||||
int outputoff = mul24(grpidx, 256);
|
||||
candidate[outputoff + (lcl_id << 2)] = (int4)0;
|
||||
candidate[outputoff + (lcl_id << 2) + 1] = (int4)0;
|
||||
candidate[outputoff + (lcl_id << 2) + 2] = (int4)0;
|
||||
candidate[outputoff + (lcl_id << 2) + 3] = (int4)0;
|
||||
int grpszx = get_local_size(0);
|
||||
int grpszy = get_local_size(1);
|
||||
int grpnumx = get_num_groups(0);
|
||||
int grpidx = get_group_id(0);
|
||||
int lclidx = get_local_id(0);
|
||||
int lclidy = get_local_id(1);
|
||||
int lcl_sz = mul24(grpszx, grpszy);
|
||||
int lcl_id = mad24(lclidy, grpszx, lclidx);
|
||||
__local int glboutindex[1];
|
||||
__local int lclcount[1];
|
||||
__local int lcloutindex[64];
|
||||
glboutindex[0] = 0;
|
||||
int outputoff = mul24(grpidx, 256);
|
||||
candidate[outputoff + (lcl_id << 2)] = (int4)0;
|
||||
candidate[outputoff + (lcl_id << 2) + 1] = (int4)0;
|
||||
candidate[outputoff + (lcl_id << 2) + 2] = (int4)0;
|
||||
candidate[outputoff + (lcl_id << 2) + 3] = (int4)0;
|
||||
int max_idx = rows * cols - 1;
|
||||
for (int scalei = 0; scalei < loopcount; scalei++)
|
||||
{
|
||||
int4 scaleinfo1;
|
||||
scaleinfo1 = info[scalei];
|
||||
int width = (scaleinfo1.x & 0xffff0000) >> 16;
|
||||
int height = scaleinfo1.x & 0xffff;
|
||||
int grpnumperline = (scaleinfo1.y & 0xffff0000) >> 16;
|
||||
int totalgrp = scaleinfo1.y & 0xffff;
|
||||
float factor = as_float(scaleinfo1.w);
|
||||
float correction_t = correction[scalei];
|
||||
int ystep = (int)(max(2.0f, factor) + 0.5f);
|
||||
|
||||
for (int scalei = 0; scalei < loopcount; scalei++)
|
||||
for (int grploop = get_group_id(0); grploop < totalgrp; grploop += grpnumx)
|
||||
{
|
||||
int4 scaleinfo1;
|
||||
scaleinfo1 = info[scalei];
|
||||
int width = (scaleinfo1.x & 0xffff0000) >> 16;
|
||||
int height = scaleinfo1.x & 0xffff;
|
||||
int grpnumperline = (scaleinfo1.y & 0xffff0000) >> 16;
|
||||
int totalgrp = scaleinfo1.y & 0xffff;
|
||||
float factor = as_float(scaleinfo1.w);
|
||||
float correction_t = correction[scalei];
|
||||
int ystep = (int)(max(2.0f, factor) + 0.5f);
|
||||
int4 cascadeinfo = p[scalei];
|
||||
int grpidy = grploop / grpnumperline;
|
||||
int grpidx = grploop - mul24(grpidy, grpnumperline);
|
||||
int ix = mad24(grpidx, grpszx, lclidx);
|
||||
int iy = mad24(grpidy, grpszy, lclidy);
|
||||
int x = ix * ystep;
|
||||
int y = iy * ystep;
|
||||
lcloutindex[lcl_id] = 0;
|
||||
lclcount[0] = 0;
|
||||
int nodecounter;
|
||||
float mean, variance_norm_factor;
|
||||
//if((ix < width) && (iy < height))
|
||||
{
|
||||
const int p_offset = mad24(y, step, x);
|
||||
cascadeinfo.x += p_offset;
|
||||
cascadeinfo.z += p_offset;
|
||||
mean = (sum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.x), 0, max_idx)] - sum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.z), 0, max_idx)] -
|
||||
sum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.x), 0, max_idx)] + sum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.z), 0, max_idx)])
|
||||
* correction_t;
|
||||
variance_norm_factor = sqsum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.x), 0, max_idx)] - sqsum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.z), 0, max_idx)] -
|
||||
sqsum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.x), 0, max_idx)] + sqsum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.z), 0, max_idx)];
|
||||
variance_norm_factor = variance_norm_factor * correction_t - mean * mean;
|
||||
variance_norm_factor = variance_norm_factor >= 0.f ? sqrt(variance_norm_factor) : 1.f;
|
||||
bool result = true;
|
||||
nodecounter = startnode + nodecount * scalei;
|
||||
|
||||
for (int grploop = get_group_id(0); grploop < totalgrp; grploop += grpnumx)
|
||||
for (int stageloop = start_stage; (stageloop < end_stage) && result; stageloop++)
|
||||
{
|
||||
int4 cascadeinfo = p[scalei];
|
||||
int grpidy = grploop / grpnumperline;
|
||||
int grpidx = grploop - mul24(grpidy, grpnumperline);
|
||||
int ix = mad24(grpidx, grpszx, lclidx);
|
||||
int iy = mad24(grpidy, grpszy, lclidy);
|
||||
int x = ix * ystep;
|
||||
int y = iy * ystep;
|
||||
lcloutindex[lcl_id] = 0;
|
||||
lclcount[0] = 0;
|
||||
int result = 1, nodecounter;
|
||||
float mean, variance_norm_factor;
|
||||
//if((ix < width) && (iy < height))
|
||||
{
|
||||
const int p_offset = mad24(y, step, x);
|
||||
cascadeinfo.x += p_offset;
|
||||
cascadeinfo.z += p_offset;
|
||||
mean = (sum[mad24(cascadeinfo.y, step, cascadeinfo.x)] - sum[mad24(cascadeinfo.y, step, cascadeinfo.z)] -
|
||||
sum[mad24(cascadeinfo.w, step, cascadeinfo.x)] + sum[mad24(cascadeinfo.w, step, cascadeinfo.z)])
|
||||
* correction_t;
|
||||
variance_norm_factor = sqsum[mad24(cascadeinfo.y, step, cascadeinfo.x)] - sqsum[mad24(cascadeinfo.y, step, cascadeinfo.z)] -
|
||||
sqsum[mad24(cascadeinfo.w, step, cascadeinfo.x)] + sqsum[mad24(cascadeinfo.w, step, cascadeinfo.z)];
|
||||
variance_norm_factor = variance_norm_factor * correction_t - mean * mean;
|
||||
variance_norm_factor = variance_norm_factor >= 0.f ? sqrt(variance_norm_factor) : 1.f;
|
||||
result = 1;
|
||||
nodecounter = startnode + nodecount * scalei;
|
||||
|
||||
for (int stageloop = start_stage; stageloop < end_stage && result; stageloop++)
|
||||
{
|
||||
float stage_sum = 0.f;
|
||||
int4 stageinfo = *(global int4 *)(stagecascadeptr + stageloop);
|
||||
float stagethreshold = as_float(stageinfo.y);
|
||||
|
||||
for (int nodeloop = 0; nodeloop < stageinfo.x; nodeloop++)
|
||||
{
|
||||
__global GpuHidHaarTreeNode *currentnodeptr = (nodeptr + nodecounter);
|
||||
int4 info1 = *(__global int4 *)(&(currentnodeptr->p[0][0]));
|
||||
int4 info2 = *(__global int4 *)(&(currentnodeptr->p[1][0]));
|
||||
int4 info3 = *(__global int4 *)(&(currentnodeptr->p[2][0]));
|
||||
float4 w = *(__global float4 *)(&(currentnodeptr->weight[0]));
|
||||
float2 alpha2 = *(__global float2 *)(&(currentnodeptr->alpha[0]));
|
||||
float nodethreshold = w.w * variance_norm_factor;
|
||||
info1.x += p_offset;
|
||||
info1.z += p_offset;
|
||||
info2.x += p_offset;
|
||||
info2.z += p_offset;
|
||||
float classsum = (sum[mad24(info1.y, step, info1.x)] - sum[mad24(info1.y, step, info1.z)] -
|
||||
sum[mad24(info1.w, step, info1.x)] + sum[mad24(info1.w, step, info1.z)]) * w.x;
|
||||
classsum += (sum[mad24(info2.y, step, info2.x)] - sum[mad24(info2.y, step, info2.z)] -
|
||||
sum[mad24(info2.w, step, info2.x)] + sum[mad24(info2.w, step, info2.z)]) * w.y;
|
||||
info3.x += p_offset;
|
||||
info3.z += p_offset;
|
||||
classsum += (sum[mad24(info3.y, step, info3.x)] - sum[mad24(info3.y, step, info3.z)] -
|
||||
sum[mad24(info3.w, step, info3.x)] + sum[mad24(info3.w, step, info3.z)]) * w.z;
|
||||
stage_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
|
||||
nodecounter++;
|
||||
}
|
||||
|
||||
result = (stage_sum >= stagethreshold);
|
||||
}
|
||||
|
||||
if (result && (ix < width) && (iy < height))
|
||||
{
|
||||
int queueindex = atomic_inc(lclcount);
|
||||
lcloutindex[queueindex << 1] = (y << 16) | x;
|
||||
lcloutindex[(queueindex << 1) + 1] = as_int(variance_norm_factor);
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
int queuecount = lclcount[0];
|
||||
nodecounter = splitnode + nodecount * scalei;
|
||||
|
||||
if (lcl_id < queuecount)
|
||||
{
|
||||
int temp = lcloutindex[lcl_id << 1];
|
||||
int x = temp & 0xffff;
|
||||
int y = (temp & (int)0xffff0000) >> 16;
|
||||
temp = glboutindex[0];
|
||||
int4 candidate_result;
|
||||
candidate_result.zw = (int2)convert_int_rtn(factor * 20.f);
|
||||
candidate_result.x = x;
|
||||
candidate_result.y = y;
|
||||
atomic_inc(glboutindex);
|
||||
candidate[outputoff + temp + lcl_id] = candidate_result;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
float stage_sum = 0.f;
|
||||
int stagecount = stagecascadeptr[stageloop].count;
|
||||
for (int nodeloop = 0; nodeloop < stagecount; nodeloop++)
|
||||
{
|
||||
__global GpuHidHaarTreeNode *currentnodeptr = (nodeptr + nodecounter);
|
||||
int4 info1 = *(__global int4 *)(&(currentnodeptr->p[0][0]));
|
||||
int4 info2 = *(__global int4 *)(&(currentnodeptr->p[1][0]));
|
||||
int4 info3 = *(__global int4 *)(&(currentnodeptr->p[2][0]));
|
||||
float4 w = *(__global float4 *)(&(currentnodeptr->weight[0]));
|
||||
float2 alpha2 = *(__global float2 *)(&(currentnodeptr->alpha[0]));
|
||||
float nodethreshold = w.w * variance_norm_factor;
|
||||
info1.x += p_offset;
|
||||
info1.z += p_offset;
|
||||
info2.x += p_offset;
|
||||
info2.z += p_offset;
|
||||
float classsum = (sum[clamp(mad24(info1.y, step, info1.x), 0, max_idx)] - sum[clamp(mad24(info1.y, step, info1.z), 0, max_idx)] -
|
||||
sum[clamp(mad24(info1.w, step, info1.x), 0, max_idx)] + sum[clamp(mad24(info1.w, step, info1.z), 0, max_idx)]) * w.x;
|
||||
classsum += (sum[clamp(mad24(info2.y, step, info2.x), 0, max_idx)] - sum[clamp(mad24(info2.y, step, info2.z), 0, max_idx)] -
|
||||
sum[clamp(mad24(info2.w, step, info2.x), 0, max_idx)] + sum[clamp(mad24(info2.w, step, info2.z), 0, max_idx)]) * w.y;
|
||||
info3.x += p_offset;
|
||||
info3.z += p_offset;
|
||||
classsum += (sum[clamp(mad24(info3.y, step, info3.x), 0, max_idx)] - sum[clamp(mad24(info3.y, step, info3.z), 0, max_idx)] -
|
||||
sum[clamp(mad24(info3.w, step, info3.x), 0, max_idx)] + sum[clamp(mad24(info3.w, step, info3.z), 0, max_idx)]) * w.z;
|
||||
stage_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
|
||||
nodecounter++;
|
||||
}
|
||||
result = (bool)(stage_sum >= stagecascadeptr[stageloop].threshold);
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (result && (ix < width) && (iy < height))
|
||||
{
|
||||
int queueindex = atomic_inc(lclcount);
|
||||
lcloutindex[queueindex] = (y << 16) | x;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
int queuecount = lclcount[0];
|
||||
|
||||
if (lcl_id < queuecount)
|
||||
{
|
||||
int temp = lcloutindex[lcl_id];
|
||||
int x = temp & 0xffff;
|
||||
int y = (temp & (int)0xffff0000) >> 16;
|
||||
temp = atomic_inc(glboutindex);
|
||||
int4 candidate_result;
|
||||
candidate_result.zw = (int2)convert_int_rtn(factor * 20.f);
|
||||
candidate_result.x = x;
|
||||
candidate_result.y = y;
|
||||
candidate[outputoff + temp + lcl_id] = candidate_result;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
__kernel void gpuscaleclassifier(global GpuHidHaarTreeNode *orinode, global GpuHidHaarTreeNode *newnode, float scale, float weight_scale, int nodenum)
|
||||
{
|
||||
int counter = get_global_id(0);
|
||||
int tr_x[3], tr_y[3], tr_h[3], tr_w[3], i = 0;
|
||||
GpuHidHaarTreeNode t1 = *(orinode + counter);
|
||||
int counter = get_global_id(0);
|
||||
int tr_x[3], tr_y[3], tr_h[3], tr_w[3], i = 0;
|
||||
GpuHidHaarTreeNode t1 = *(orinode + counter);
|
||||
#pragma unroll
|
||||
|
||||
for (i = 0; i < 3; i++)
|
||||
{
|
||||
tr_x[i] = (int)(t1.p[i][0] * scale + 0.5f);
|
||||
tr_y[i] = (int)(t1.p[i][1] * scale + 0.5f);
|
||||
tr_w[i] = (int)(t1.p[i][2] * scale + 0.5f);
|
||||
tr_h[i] = (int)(t1.p[i][3] * scale + 0.5f);
|
||||
}
|
||||
for (i = 0; i < 3; i++)
|
||||
{
|
||||
tr_x[i] = (int)(t1.p[i][0] * scale + 0.5f);
|
||||
tr_y[i] = (int)(t1.p[i][1] * scale + 0.5f);
|
||||
tr_w[i] = (int)(t1.p[i][2] * scale + 0.5f);
|
||||
tr_h[i] = (int)(t1.p[i][3] * scale + 0.5f);
|
||||
}
|
||||
|
||||
t1.weight[0] = t1.p[2][0] ? -(t1.weight[1] * tr_h[1] * tr_w[1] + t1.weight[2] * tr_h[2] * tr_w[2]) / (tr_h[0] * tr_w[0]) : -t1.weight[1] * tr_h[1] * tr_w[1] / (tr_h[0] * tr_w[0]);
|
||||
counter += nodenum;
|
||||
t1.weight[0] = t1.p[2][0] ? -(t1.weight[1] * tr_h[1] * tr_w[1] + t1.weight[2] * tr_h[2] * tr_w[2]) / (tr_h[0] * tr_w[0]) : -t1.weight[1] * tr_h[1] * tr_w[1] / (tr_h[0] * tr_w[0]);
|
||||
counter += nodenum;
|
||||
#pragma unroll
|
||||
|
||||
for (i = 0; i < 3; i++)
|
||||
{
|
||||
newnode[counter].p[i][0] = tr_x[i];
|
||||
newnode[counter].p[i][1] = tr_y[i];
|
||||
newnode[counter].p[i][2] = tr_x[i] + tr_w[i];
|
||||
newnode[counter].p[i][3] = tr_y[i] + tr_h[i];
|
||||
newnode[counter].weight[i] = t1.weight[i] * weight_scale;
|
||||
}
|
||||
for (i = 0; i < 3; i++)
|
||||
{
|
||||
newnode[counter].p[i][0] = tr_x[i];
|
||||
newnode[counter].p[i][1] = tr_y[i];
|
||||
newnode[counter].p[i][2] = tr_x[i] + tr_w[i];
|
||||
newnode[counter].p[i][3] = tr_y[i] + tr_h[i];
|
||||
newnode[counter].weight[i] = t1.weight[i] * weight_scale;
|
||||
}
|
||||
|
||||
newnode[counter].left = t1.left;
|
||||
newnode[counter].right = t1.right;
|
||||
newnode[counter].threshold = t1.threshold;
|
||||
newnode[counter].alpha[0] = t1.alpha[0];
|
||||
newnode[counter].alpha[1] = t1.alpha[1];
|
||||
newnode[counter].left = t1.left;
|
||||
newnode[counter].right = t1.right;
|
||||
newnode[counter].threshold = t1.threshold;
|
||||
newnode[counter].alpha[0] = t1.alpha[0];
|
||||
newnode[counter].alpha[1] = t1.alpha[1];
|
||||
}
|
||||
|
||||
|
@ -16,6 +16,7 @@
|
||||
//
|
||||
// @Authors
|
||||
// Jia Haipeng, jiahaipeng95@gmail.com
|
||||
// Sen Liu, swjutls1987@126.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
@ -61,40 +62,31 @@ struct getRect
|
||||
}
|
||||
};
|
||||
|
||||
PARAM_TEST_CASE(HaarTestBase, int, int)
|
||||
PARAM_TEST_CASE(Haar, double, int)
|
||||
{
|
||||
//std::vector<cv::ocl::Info> oclinfo;
|
||||
cv::ocl::OclCascadeClassifier cascade, nestedCascade;
|
||||
cv::ocl::OclCascadeClassifierBuf cascadebuf;
|
||||
cv::CascadeClassifier cpucascade, cpunestedCascade;
|
||||
// Mat img;
|
||||
|
||||
double scale;
|
||||
int index;
|
||||
int flags;
|
||||
|
||||
virtual void SetUp()
|
||||
{
|
||||
scale = 1.0;
|
||||
index = 0;
|
||||
scale = GET_PARAM(0);
|
||||
flags = GET_PARAM(1);
|
||||
string cascadeName = workdir + "../../data/haarcascades/haarcascade_frontalface_alt.xml";
|
||||
|
||||
if( (!cascade.load( cascadeName )) || (!cpucascade.load(cascadeName)))
|
||||
if( (!cascade.load( cascadeName )) || (!cpucascade.load(cascadeName)) || (!cascadebuf.load( cascadeName )))
|
||||
{
|
||||
cout << "ERROR: Could not load classifier cascade" << endl;
|
||||
return;
|
||||
}
|
||||
//int devnums = getDevice(oclinfo);
|
||||
//CV_Assert(devnums>0);
|
||||
////if you want to use undefault device, set it here
|
||||
////setDevice(oclinfo[0]);
|
||||
//cv::ocl::setBinpath("E:\\");
|
||||
}
|
||||
};
|
||||
|
||||
////////////////////////////////faceDetect/////////////////////////////////////////////////
|
||||
|
||||
struct Haar : HaarTestBase {};
|
||||
|
||||
TEST_F(Haar, FaceDetect)
|
||||
TEST_P(Haar, FaceDetect)
|
||||
{
|
||||
string imgName = workdir + "lena.jpg";
|
||||
Mat img = imread( imgName, 1 );
|
||||
@ -105,59 +97,65 @@ TEST_F(Haar, FaceDetect)
|
||||
return ;
|
||||
}
|
||||
|
||||
//int i = 0;
|
||||
//double t = 0;
|
||||
vector<Rect> faces, oclfaces;
|
||||
|
||||
// const static Scalar colors[] = { CV_RGB(0, 0, 255),
|
||||
// CV_RGB(0, 128, 255),
|
||||
// CV_RGB(0, 255, 255),
|
||||
// CV_RGB(0, 255, 0),
|
||||
// CV_RGB(255, 128, 0),
|
||||
// CV_RGB(255, 255, 0),
|
||||
// CV_RGB(255, 0, 0),
|
||||
// CV_RGB(255, 0, 255)
|
||||
// } ;
|
||||
|
||||
Mat gray, smallImg(cvRound (img.rows / scale), cvRound(img.cols / scale), CV_8UC1 );
|
||||
MemStorage storage(cvCreateMemStorage(0));
|
||||
cvtColor( img, gray, CV_BGR2GRAY );
|
||||
resize( gray, smallImg, smallImg.size(), 0, 0, INTER_LINEAR );
|
||||
equalizeHist( smallImg, smallImg );
|
||||
|
||||
|
||||
cv::ocl::oclMat image;
|
||||
CvSeq *_objects;
|
||||
image.upload(smallImg);
|
||||
_objects = cascade.oclHaarDetectObjects( image, storage, 1.1,
|
||||
3, 0
|
||||
| CV_HAAR_SCALE_IMAGE
|
||||
, Size(30, 30), Size(0, 0) );
|
||||
3, flags, Size(30, 30), Size(0, 0) );
|
||||
vector<CvAvgComp> vecAvgComp;
|
||||
Seq<CvAvgComp>(_objects).copyTo(vecAvgComp);
|
||||
oclfaces.resize(vecAvgComp.size());
|
||||
std::transform(vecAvgComp.begin(), vecAvgComp.end(), oclfaces.begin(), getRect());
|
||||
|
||||
cpucascade.detectMultiScale( smallImg, faces, 1.1,
|
||||
3, 0
|
||||
| CV_HAAR_SCALE_IMAGE
|
||||
, Size(30, 30), Size(0, 0) );
|
||||
cpucascade.detectMultiScale( smallImg, faces, 1.1, 3,
|
||||
flags,
|
||||
Size(30, 30), Size(0, 0) );
|
||||
EXPECT_EQ(faces.size(), oclfaces.size());
|
||||
/* for( vector<Rect>::const_iterator r = faces.begin(); r != faces.end(); r++, i++ )
|
||||
{
|
||||
Mat smallImgROI;
|
||||
Point center;
|
||||
Scalar color = colors[i%8];
|
||||
int radius;
|
||||
center.x = cvRound((r->x + r->width*0.5)*scale);
|
||||
center.y = cvRound((r->y + r->height*0.5)*scale);
|
||||
radius = cvRound((r->width + r->height)*0.25*scale);
|
||||
circle( img, center, radius, color, 3, 8, 0 );
|
||||
} */
|
||||
//namedWindow("result");
|
||||
//imshow("result",img);
|
||||
//waitKey(0);
|
||||
//destroyAllWindows();
|
||||
|
||||
}
|
||||
|
||||
TEST_P(Haar, FaceDetectUseBuf)
|
||||
{
|
||||
string imgName = workdir + "lena.jpg";
|
||||
Mat img = imread( imgName, 1 );
|
||||
|
||||
if(img.empty())
|
||||
{
|
||||
std::cout << "Couldn't read " << imgName << std::endl;
|
||||
return ;
|
||||
}
|
||||
|
||||
vector<Rect> faces, oclfaces;
|
||||
|
||||
Mat gray, smallImg(cvRound (img.rows / scale), cvRound(img.cols / scale), CV_8UC1 );
|
||||
MemStorage storage(cvCreateMemStorage(0));
|
||||
cvtColor( img, gray, CV_BGR2GRAY );
|
||||
resize( gray, smallImg, smallImg.size(), 0, 0, INTER_LINEAR );
|
||||
equalizeHist( smallImg, smallImg );
|
||||
|
||||
cv::ocl::oclMat image;
|
||||
image.upload(smallImg);
|
||||
|
||||
cascadebuf.detectMultiScale( image, oclfaces, 1.1, 3,
|
||||
flags,
|
||||
Size(30, 30), Size(0, 0) );
|
||||
cascadebuf.release();
|
||||
|
||||
cpucascade.detectMultiScale( smallImg, faces, 1.1, 3,
|
||||
flags,
|
||||
Size(30, 30), Size(0, 0) );
|
||||
EXPECT_EQ(faces.size(), oclfaces.size());
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(FaceDetect, Haar,
|
||||
Combine(Values(1.0),
|
||||
Values(CV_HAAR_SCALE_IMAGE, 0)));
|
||||
|
||||
#endif // HAVE_OPENCL
|
||||
|
Loading…
Reference in New Issue
Block a user