Merge pull request #1731 from perping:2.4_haar

This commit is contained in:
Andrey Pavlenko 2013-11-01 16:46:17 +04:00 committed by OpenCV Buildbot
commit 2767be9a5e
3 changed files with 70 additions and 25 deletions

View File

@ -1059,11 +1059,11 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
{
sz = sizev[i];
factor = scalev[i];
int ystep = cvRound(std::max(2., factor));
int equRect_x = (int)(factor * gcascade->p0 + 0.5);
int equRect_y = (int)(factor * gcascade->p1 + 0.5);
int equRect_w = (int)(factor * gcascade->p3 + 0.5);
int equRect_h = (int)(factor * gcascade->p2 + 0.5);
double ystep = std::max(2., factor);
int equRect_x = cvRound(factor * gcascade->p0);
int equRect_y = cvRound(factor * gcascade->p1);
int equRect_w = cvRound(factor * gcascade->p3);
int equRect_h = cvRound(factor * gcascade->p2);
p[i].s[0] = equRect_x;
p[i].s[1] = equRect_y;
p[i].s[2] = equRect_x + equRect_w;
@ -1676,9 +1676,9 @@ void cv::ocl::OclCascadeClassifierBuf::CreateFactorRelatedBufs(
{
sz = sizev[i];
factor = scalev[i];
int ystep = cvRound(std::max(2., factor));
int width = (cols - 1 - sz.width + ystep - 1) / ystep;
int height = (rows - 1 - sz.height + ystep - 1) / ystep;
double ystep = cv::max(2.,factor);
int width = cvRound((cols - 1 - sz.width + ystep - 1) / ystep);
int height = cvRound((rows - 1 - sz.height + ystep - 1) / ystep);
int grpnumperline = (width + localThreads[0] - 1) / localThreads[0];
int totalgrp = ((height + localThreads[1] - 1) / localThreads[1]) * grpnumperline;

View File

@ -11,6 +11,7 @@
// Jia Haipeng, jiahaipeng95@gmail.com
// Nathan, liujun@multicorewareinc.com
// Peng Xiao, pengxiao@outlook.com
// Erping Pang, erping@multicorewareinc.com
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
@ -320,7 +321,7 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
int glb_x = grpoffx + (lcl_x<<2);
int glb_y = grpoffy + lcl_y;
int glb_off = mad24(min(glb_y, height - 1),pixelstep,glb_x);
int glb_off = mad24(min(glb_y, height + WINDOWSIZE - 1),pixelstep,glb_x);
int4 data = *(__global int4*)&sum[glb_off];
int lcl_off = mad24(lcl_y, readwidth, lcl_x<<2);
@ -420,12 +421,23 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
result = (stage_sum >= stagethreshold);
}
if(result && (x < width) && (y < height))
if(factor < 2)
{
int queueindex = atomic_inc(lclcount);
lcloutindex[queueindex<<1] = (lclidy << 16) | lclidx;
lcloutindex[(queueindex<<1)+1] = as_int(variance_norm_factor);
if(result && lclidx %2 ==0 && lclidy %2 ==0 )
{
int queueindex = atomic_inc(lclcount);
lcloutindex[queueindex<<1] = (lclidy << 16) | lclidx;
lcloutindex[(queueindex<<1)+1] = as_int((float)variance_norm_factor);
}
}
else
{
if(result)
{
int queueindex = atomic_inc(lclcount);
lcloutindex[queueindex<<1] = (lclidy << 16) | lclidx;
lcloutindex[(queueindex<<1)+1] = as_int((float)variance_norm_factor);
}
}
barrier(CLK_LOCAL_MEM_FENCE);
int queuecount = lclcount[0];
@ -548,11 +560,27 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
int y = mad24(grpidy,grpszy,((temp & (int)0xffff0000) >> 16));
temp = glboutindex[0];
int4 candidate_result;
candidate_result.zw = (int2)convert_int_rtn(factor*20.f);
candidate_result.x = convert_int_rtn(x*factor);
candidate_result.y = convert_int_rtn(y*factor);
candidate_result.zw = (int2)convert_int_rte(factor*20.f);
candidate_result.x = convert_int_rte(x*factor);
candidate_result.y = convert_int_rte(y*factor);
atomic_inc(glboutindex);
candidate[outputoff+temp+lcl_id] = candidate_result;
int i = outputoff+temp+lcl_id;
if(candidate[i].z == 0)
{
candidate[i] = candidate_result;
}
else
{
for(i=i+1;;i++)
{
if(candidate[i].z == 0)
{
candidate[i] = candidate_result;
break;
}
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}//end for(int grploop=grpidx;grploop<totalgrp;grploop+=grpnumx)

View File

@ -18,6 +18,7 @@
// Wu Xinglong, wxl370@126.com
// Sen Liu, swjtuls1987@126.com
// Peng Xiao, pengxiao@outlook.com
// Erping Pang, erping@multicorewareinc.com
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
@ -141,7 +142,7 @@ __kernel void gpuRunHaarClassifierCascade_scaled2(
int totalgrp = scaleinfo1.y & 0xffff;
float factor = as_float(scaleinfo1.w);
float correction_t = correction[scalei];
int ystep = (int)(max(2.0f, factor) + 0.5f);
float ystep = max(2.0f, factor);
for (int grploop = get_group_id(0); grploop < totalgrp; grploop += grpnumx)
{
@ -150,8 +151,8 @@ __kernel void gpuRunHaarClassifierCascade_scaled2(
int grpidx = grploop - mul24(grpidy, grpnumperline);
int ix = mad24(grpidx, grpszx, lclidx);
int iy = mad24(grpidy, grpszy, lclidy);
int x = ix * ystep;
int y = iy * ystep;
int x = round(ix * ystep);
int y = round(iy * ystep);
lcloutindex[lcl_id] = 0;
lclcount[0] = 0;
int nodecounter;
@ -242,7 +243,7 @@ __kernel void gpuRunHaarClassifierCascade_scaled2(
barrier(CLK_LOCAL_MEM_FENCE);
if (result && (ix < width) && (iy < height))
if (result)
{
int queueindex = atomic_inc(lclcount);
lcloutindex[queueindex] = (y << 16) | x;
@ -257,10 +258,26 @@ __kernel void gpuRunHaarClassifierCascade_scaled2(
int y = (temp & (int)0xffff0000) >> 16;
temp = atomic_inc(glboutindex);
int4 candidate_result;
candidate_result.zw = (int2)convert_int_rtn(factor * 20.f);
candidate_result.zw = (int2)convert_int_rte(factor * 20.f);
candidate_result.x = x;
candidate_result.y = y;
candidate[outputoff + temp + lcl_id] = candidate_result;
int i = outputoff+temp+lcl_id;
if(candidate[i].z == 0)
{
candidate[i] = candidate_result;
}
else
{
for(i=i+1;;i++)
{
if(candidate[i].z == 0)
{
candidate[i] = candidate_result;
break;
}
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
@ -283,7 +300,7 @@ __kernel void gpuscaleclassifier(global GpuHidHaarTreeNode *orinode, global GpuH
tr_h[i] = (int)(t1.p[i][3] * scale + 0.5f);
}
t1.weight[0] = t1.p[2][0] ? -(t1.weight[1] * tr_h[1] * tr_w[1] + t1.weight[2] * tr_h[2] * tr_w[2]) / (tr_h[0] * tr_w[0]) : -t1.weight[1] * tr_h[1] * tr_w[1] / (tr_h[0] * tr_w[0]);
t1.weight[0] = -(t1.weight[1] * tr_h[1] * tr_w[1] + t1.weight[2] * tr_h[2] * tr_w[2]) / (tr_h[0] * tr_w[0]);
counter += nodenum;
#pragma unroll