Merge pull request #2007 from krodyush:pullreq/2.4-opt-131202-haar
This commit is contained in:
commit
b674cd8571
@ -866,16 +866,17 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
|
|||||||
|
|
||||||
if(gcascade->is_stump_based && gsum.clCxt->supportsFeature(FEATURE_CL_INTEL_DEVICE))
|
if(gcascade->is_stump_based && gsum.clCxt->supportsFeature(FEATURE_CL_INTEL_DEVICE))
|
||||||
{
|
{
|
||||||
//setup local group size
|
//setup local group size for "pixel step" = 1
|
||||||
localThreads[0] = 8;
|
localThreads[0] = 16;
|
||||||
localThreads[1] = 16;
|
localThreads[1] = 32;
|
||||||
localThreads[2] = 1;
|
localThreads[2] = 1;
|
||||||
|
|
||||||
//init maximal number of workgroups
|
//calc maximal number of workgroups
|
||||||
int WGNumX = 1+(sizev[0].width /(localThreads[0]));
|
int WGNumX = 1+(sizev[0].width /(localThreads[0]));
|
||||||
int WGNumY = 1+(sizev[0].height/(localThreads[1]));
|
int WGNumY = 1+(sizev[0].height/(localThreads[1]));
|
||||||
int WGNumZ = loopcount;
|
int WGNumZ = loopcount;
|
||||||
int WGNum = 0; //accurate number of non -empty workgroups
|
int WGNumTotal = 0; //accurate number of non-empty workgroups
|
||||||
|
int WGNumSampled = 0; //accurate number of workgroups processed only 1/4 part of all pixels. it is made for large images with scale <= 2
|
||||||
oclMat oclWGInfo(1,sizeof(cl_int4) * WGNumX*WGNumY*WGNumZ,CV_8U);
|
oclMat oclWGInfo(1,sizeof(cl_int4) * WGNumX*WGNumY*WGNumZ,CV_8U);
|
||||||
{
|
{
|
||||||
cl_int4* pWGInfo = (cl_int4*)clEnqueueMapBuffer(getClCommandQueue(oclWGInfo.clCxt),(cl_mem)oclWGInfo.datastart,true,CL_MAP_WRITE, 0, oclWGInfo.step, 0,0,0,&status);
|
cl_int4* pWGInfo = (cl_int4*)clEnqueueMapBuffer(getClCommandQueue(oclWGInfo.clCxt),(cl_mem)oclWGInfo.datastart,true,CL_MAP_WRITE, 0, oclWGInfo.step, 0,0,0,&status);
|
||||||
@ -895,12 +896,16 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
|
|||||||
if(gx>=(Width-cascade->orig_window_size.width))
|
if(gx>=(Width-cascade->orig_window_size.width))
|
||||||
continue; // no data to process
|
continue; // no data to process
|
||||||
|
|
||||||
|
if(scaleinfo[z].factor<=2)
|
||||||
|
{
|
||||||
|
WGNumSampled++;
|
||||||
|
}
|
||||||
// save no-empty workgroup info into array
|
// save no-empty workgroup info into array
|
||||||
pWGInfo[WGNum].s[0] = scaleinfo[z].width_height;
|
pWGInfo[WGNumTotal].s[0] = scaleinfo[z].width_height;
|
||||||
pWGInfo[WGNum].s[1] = (gx << 16) | gy;
|
pWGInfo[WGNumTotal].s[1] = (gx << 16) | gy;
|
||||||
pWGInfo[WGNum].s[2] = scaleinfo[z].imgoff;
|
pWGInfo[WGNumTotal].s[2] = scaleinfo[z].imgoff;
|
||||||
memcpy(&(pWGInfo[WGNum].s[3]),&(scaleinfo[z].factor),sizeof(float));
|
memcpy(&(pWGInfo[WGNumTotal].s[3]),&(scaleinfo[z].factor),sizeof(float));
|
||||||
WGNum++;
|
WGNumTotal++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -908,13 +913,8 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
|
|||||||
pWGInfo = NULL;
|
pWGInfo = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
// setup global sizes to have linear array of workgroups with WGNum size
|
|
||||||
globalThreads[0] = localThreads[0]*WGNum;
|
|
||||||
globalThreads[1] = localThreads[1];
|
|
||||||
globalThreads[2] = 1;
|
|
||||||
|
|
||||||
#define NODE_SIZE 12
|
#define NODE_SIZE 12
|
||||||
// pack node info to have less memory loads
|
// pack node info to have less memory loads on the device side
|
||||||
oclMat oclNodesPK(1,sizeof(cl_int) * NODE_SIZE * nodenum,CV_8U);
|
oclMat oclNodesPK(1,sizeof(cl_int) * NODE_SIZE * nodenum,CV_8U);
|
||||||
{
|
{
|
||||||
cl_int status;
|
cl_int status;
|
||||||
@ -963,8 +963,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
|
|||||||
options += format(" -D WND_SIZE_X=%d",cascade->orig_window_size.width);
|
options += format(" -D WND_SIZE_X=%d",cascade->orig_window_size.width);
|
||||||
options += format(" -D WND_SIZE_Y=%d",cascade->orig_window_size.height);
|
options += format(" -D WND_SIZE_Y=%d",cascade->orig_window_size.height);
|
||||||
options += format(" -D STUMP_BASED=%d",gcascade->is_stump_based);
|
options += format(" -D STUMP_BASED=%d",gcascade->is_stump_based);
|
||||||
options += format(" -D LSx=%d",localThreads[0]);
|
|
||||||
options += format(" -D LSy=%d",localThreads[1]);
|
|
||||||
options += format(" -D SPLITNODE=%d",splitnode);
|
options += format(" -D SPLITNODE=%d",splitnode);
|
||||||
options += format(" -D SPLITSTAGE=%d",splitstage);
|
options += format(" -D SPLITSTAGE=%d",splitstage);
|
||||||
options += format(" -D OUTPUTSZ=%d",outputsz);
|
options += format(" -D OUTPUTSZ=%d",outputsz);
|
||||||
@ -972,8 +970,39 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
|
|||||||
// init candiate global count by 0
|
// init candiate global count by 0
|
||||||
int pattern = 0;
|
int pattern = 0;
|
||||||
openCLSafeCall(clEnqueueWriteBuffer(qu, candidatebuffer, 1, 0, 1 * sizeof(pattern),&pattern, 0, NULL, NULL));
|
openCLSafeCall(clEnqueueWriteBuffer(qu, candidatebuffer, 1, 0, 1 * sizeof(pattern),&pattern, 0, NULL, NULL));
|
||||||
// execute face detector
|
|
||||||
openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascadePacked", globalThreads, localThreads, args, -1, -1, options.c_str());
|
if(WGNumTotal>WGNumSampled)
|
||||||
|
{// small images and each pixel is processed
|
||||||
|
// setup global sizes to have linear array of workgroups with WGNum size
|
||||||
|
int pixelstep = 1;
|
||||||
|
size_t LS[3]={localThreads[0]/pixelstep,localThreads[1]/pixelstep,1};
|
||||||
|
globalThreads[0] = LS[0]*(WGNumTotal-WGNumSampled);
|
||||||
|
globalThreads[1] = LS[1];
|
||||||
|
globalThreads[2] = 1;
|
||||||
|
string options1 = options;
|
||||||
|
options1 += format(" -D PIXEL_STEP=%d",pixelstep);
|
||||||
|
options1 += format(" -D WGSTART=%d",WGNumSampled);
|
||||||
|
options1 += format(" -D LSx=%d",LS[0]);
|
||||||
|
options1 += format(" -D LSy=%d",LS[1]);
|
||||||
|
// execute face detector
|
||||||
|
openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascadePacked", globalThreads, LS, args, -1, -1, options1.c_str());
|
||||||
|
}
|
||||||
|
if(WGNumSampled>0)
|
||||||
|
{// large images each 4th pixel is processed
|
||||||
|
// setup global sizes to have linear array of workgroups with WGNum size
|
||||||
|
int pixelstep = 2;
|
||||||
|
size_t LS[3]={localThreads[0]/pixelstep,localThreads[1]/pixelstep,1};
|
||||||
|
globalThreads[0] = LS[0]*WGNumSampled;
|
||||||
|
globalThreads[1] = LS[1];
|
||||||
|
globalThreads[2] = 1;
|
||||||
|
string options2 = options;
|
||||||
|
options2 += format(" -D PIXEL_STEP=%d",pixelstep);
|
||||||
|
options2 += format(" -D WGSTART=%d",0);
|
||||||
|
options2 += format(" -D LSx=%d",LS[0]);
|
||||||
|
options2 += format(" -D LSy=%d",LS[1]);
|
||||||
|
// execute face detector
|
||||||
|
openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascadePacked", globalThreads, LS, args, -1, -1, options2.c_str());
|
||||||
|
}
|
||||||
//read candidate buffer back and put it into host list
|
//read candidate buffer back and put it into host list
|
||||||
openCLReadBuffer( gsum.clCxt, candidatebuffer, candidate, 4 * sizeof(int)*outputsz );
|
openCLReadBuffer( gsum.clCxt, candidatebuffer, candidate, 4 * sizeof(int)*outputsz );
|
||||||
assert(candidate[0]<outputsz);
|
assert(candidate[0]<outputsz);
|
||||||
|
@ -126,13 +126,11 @@ __kernel void gpuRunHaarClassifierCascadePacked(
|
|||||||
)
|
)
|
||||||
|
|
||||||
{
|
{
|
||||||
// this version used information provided for each workgroup
|
|
||||||
// no empty WG
|
|
||||||
int gid = (int)get_group_id(0);
|
int gid = (int)get_group_id(0);
|
||||||
int lid_x = (int)get_local_id(0);
|
int lid_x = (int)get_local_id(0);
|
||||||
int lid_y = (int)get_local_id(1);
|
int lid_y = (int)get_local_id(1);
|
||||||
int lid = lid_y*LSx+lid_x;
|
int lid = lid_y*LSx+lid_x;
|
||||||
int4 WGInfo = pWGInfo[gid];
|
int4 WGInfo = pWGInfo[WGSTART+gid];
|
||||||
int GroupX = (WGInfo.y >> 16)&0xFFFF;
|
int GroupX = (WGInfo.y >> 16)&0xFFFF;
|
||||||
int GroupY = (WGInfo.y >> 0 )& 0xFFFF;
|
int GroupY = (WGInfo.y >> 0 )& 0xFFFF;
|
||||||
int Width = (WGInfo.x >> 16)&0xFFFF;
|
int Width = (WGInfo.x >> 16)&0xFFFF;
|
||||||
@ -140,8 +138,8 @@ __kernel void gpuRunHaarClassifierCascadePacked(
|
|||||||
int ImgOffset = WGInfo.z;
|
int ImgOffset = WGInfo.z;
|
||||||
float ScaleFactor = as_float(WGInfo.w);
|
float ScaleFactor = as_float(WGInfo.w);
|
||||||
|
|
||||||
#define DATA_SIZE_X (LSx+WND_SIZE_X)
|
#define DATA_SIZE_X (PIXEL_STEP*LSx+WND_SIZE_X)
|
||||||
#define DATA_SIZE_Y (LSy+WND_SIZE_Y)
|
#define DATA_SIZE_Y (PIXEL_STEP*LSy+WND_SIZE_Y)
|
||||||
#define DATA_SIZE (DATA_SIZE_X*DATA_SIZE_Y)
|
#define DATA_SIZE (DATA_SIZE_X*DATA_SIZE_Y)
|
||||||
|
|
||||||
local int SumL[DATA_SIZE];
|
local int SumL[DATA_SIZE];
|
||||||
@ -165,9 +163,11 @@ __kernel void gpuRunHaarClassifierCascadePacked(
|
|||||||
int4 info1 = p;
|
int4 info1 = p;
|
||||||
int4 info2 = pq;
|
int4 info2 = pq;
|
||||||
|
|
||||||
{
|
// calc processed ROI coordinate in local mem
|
||||||
int xl = lid_x;
|
int xl = lid_x*PIXEL_STEP;
|
||||||
int yl = lid_y;
|
int yl = lid_y*PIXEL_STEP;
|
||||||
|
|
||||||
|
{// calc variance_norm_factor for all stages
|
||||||
int OffsetLocal = yl * DATA_SIZE_X + xl;
|
int OffsetLocal = yl * DATA_SIZE_X + xl;
|
||||||
int OffsetGlobal = (GroupY+yl)* pixelstep + (GroupX+xl);
|
int OffsetGlobal = (GroupY+yl)* pixelstep + (GroupX+xl);
|
||||||
|
|
||||||
@ -194,13 +194,13 @@ __kernel void gpuRunHaarClassifierCascadePacked(
|
|||||||
|
|
||||||
int result = (1.0f>0.0f);
|
int result = (1.0f>0.0f);
|
||||||
for(int stageloop = start_stage; (stageloop < end_stage) && result; stageloop++ )
|
for(int stageloop = start_stage; (stageloop < end_stage) && result; stageloop++ )
|
||||||
{// iterate until candidate is exist
|
{// iterate until candidate is valid
|
||||||
float stage_sum = 0.0f;
|
float stage_sum = 0.0f;
|
||||||
__global GpuHidHaarStageClassifier* stageinfo = (__global GpuHidHaarStageClassifier*)
|
__global GpuHidHaarStageClassifier* stageinfo = (__global GpuHidHaarStageClassifier*)
|
||||||
((__global uchar*)stagecascadeptr+stageloop*sizeof(GpuHidHaarStageClassifier));
|
((__global uchar*)stagecascadeptr+stageloop*sizeof(GpuHidHaarStageClassifier));
|
||||||
|
int lcl_off = (yl*DATA_SIZE_X)+(xl);
|
||||||
int stagecount = stageinfo->count;
|
int stagecount = stageinfo->count;
|
||||||
float stagethreshold = stageinfo->threshold;
|
float stagethreshold = stageinfo->threshold;
|
||||||
int lcl_off = (lid_y*DATA_SIZE_X)+(lid_x);
|
|
||||||
for(int nodeloop = 0; nodeloop < stagecount; nodecounter++,nodeloop++ )
|
for(int nodeloop = 0; nodeloop < stagecount; nodecounter++,nodeloop++ )
|
||||||
{
|
{
|
||||||
// simple macro to extract shorts from int
|
// simple macro to extract shorts from int
|
||||||
@ -212,7 +212,7 @@ __kernel void gpuRunHaarClassifierCascadePacked(
|
|||||||
int4 n1 = pN[1];
|
int4 n1 = pN[1];
|
||||||
int4 n2 = pN[2];
|
int4 n2 = pN[2];
|
||||||
float nodethreshold = as_float(n2.y) * variance_norm_factor;
|
float nodethreshold = as_float(n2.y) * variance_norm_factor;
|
||||||
// calc sum of intensity pixels according to node information
|
// calc sum of intensity pixels according to classifier node information
|
||||||
float classsum =
|
float classsum =
|
||||||
(SumL[M0(n0.x)+lcl_off] - SumL[M1(n0.x)+lcl_off] - SumL[M0(n0.y)+lcl_off] + SumL[M1(n0.y)+lcl_off]) * as_float(n1.z) +
|
(SumL[M0(n0.x)+lcl_off] - SumL[M1(n0.x)+lcl_off] - SumL[M0(n0.y)+lcl_off] + SumL[M1(n0.y)+lcl_off]) * as_float(n1.z) +
|
||||||
(SumL[M0(n0.z)+lcl_off] - SumL[M1(n0.z)+lcl_off] - SumL[M0(n0.w)+lcl_off] + SumL[M1(n0.w)+lcl_off]) * as_float(n1.w) +
|
(SumL[M0(n0.z)+lcl_off] - SumL[M1(n0.z)+lcl_off] - SumL[M0(n0.w)+lcl_off] + SumL[M1(n0.w)+lcl_off]) * as_float(n1.w) +
|
||||||
@ -228,8 +228,8 @@ __kernel void gpuRunHaarClassifierCascadePacked(
|
|||||||
int index = 1+atomic_inc((volatile global int*)candidate); //get index to write global data with face info
|
int index = 1+atomic_inc((volatile global int*)candidate); //get index to write global data with face info
|
||||||
if(index<OUTPUTSZ)
|
if(index<OUTPUTSZ)
|
||||||
{
|
{
|
||||||
int x = GroupX+lid_x;
|
int x = GroupX+xl;
|
||||||
int y = GroupY+lid_y;
|
int y = GroupY+yl;
|
||||||
int4 candidate_result;
|
int4 candidate_result;
|
||||||
candidate_result.x = convert_int_rtn(x*ScaleFactor);
|
candidate_result.x = convert_int_rtn(x*ScaleFactor);
|
||||||
candidate_result.y = convert_int_rtn(y*ScaleFactor);
|
candidate_result.y = convert_int_rtn(y*ScaleFactor);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user