Use preprocessor for constant values in OpenCL kernel instead of
the parameter variable. It could improve the performance of OCL_Cascade_Image_MinSize_CascadeClassifier.CascadeClassifier/*. Especially, OCL_Cascade_Image_MinSize_CascadeClassifier.CascadeClassifier/15 OCL_Cascade_Image_MinSize_CascadeClassifier.CascadeClassifier/16 could be improved about 2% in Intel platform. Signed-off-by: Yan Wang <yan.wang@linux.intel.com>
This commit is contained in:
parent
009aec5164
commit
efa84d8225
@ -1060,6 +1060,7 @@ bool CascadeClassifierImpl::ocl_detectMultiScaleNoGrouping( const std::vector<fl
|
|||||||
}
|
}
|
||||||
|
|
||||||
int nstages = (int)data.stages.size();
|
int nstages = (int)data.stages.size();
|
||||||
|
int splitstage_ocl = 1;
|
||||||
|
|
||||||
if( featureType == FeatureEvaluator::HAAR )
|
if( featureType == FeatureEvaluator::HAAR )
|
||||||
{
|
{
|
||||||
@ -1071,11 +1072,11 @@ bool CascadeClassifierImpl::ocl_detectMultiScaleNoGrouping( const std::vector<fl
|
|||||||
{
|
{
|
||||||
String opts;
|
String opts;
|
||||||
if (lbufSize.area())
|
if (lbufSize.area())
|
||||||
opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D SUM_BUF_SIZE=%d -D SUM_BUF_STEP=%d -D NODE_COUNT=%d",
|
opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D SUM_BUF_SIZE=%d -D SUM_BUF_STEP=%d -D NODE_COUNT=%d -D SPLIT_STAGE=%d -D N_STAGES=%d -D MAX_FACES=%d",
|
||||||
localsz.width, localsz.height, lbufSize.area(), lbufSize.width, data.maxNodesPerTree);
|
localsz.width, localsz.height, lbufSize.area(), lbufSize.width, data.maxNodesPerTree, splitstage_ocl, nstages, MAX_FACES);
|
||||||
else
|
else
|
||||||
opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D NODE_COUNT=%d",
|
opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D NODE_COUNT=%d -D SPLIT_STAGE=%d -D N_STAGES=%d -D MAX_FACES=%d",
|
||||||
localsz.width, localsz.height, data.maxNodesPerTree);
|
localsz.width, localsz.height, data.maxNodesPerTree, splitstage_ocl, nstages, MAX_FACES);
|
||||||
haarKernel.create("runHaarClassifier", ocl::objdetect::cascadedetect_oclsrc, opts);
|
haarKernel.create("runHaarClassifier", ocl::objdetect::cascadedetect_oclsrc, opts);
|
||||||
if( haarKernel.empty() )
|
if( haarKernel.empty() )
|
||||||
return false;
|
return false;
|
||||||
@ -1083,7 +1084,6 @@ bool CascadeClassifierImpl::ocl_detectMultiScaleNoGrouping( const std::vector<fl
|
|||||||
|
|
||||||
Rect normrect = haar->getNormRect();
|
Rect normrect = haar->getNormRect();
|
||||||
int sqofs = haar->getSquaresOffset();
|
int sqofs = haar->getSquaresOffset();
|
||||||
int splitstage_ocl = 1;
|
|
||||||
|
|
||||||
haarKernel.args((int)scales.size(),
|
haarKernel.args((int)scales.size(),
|
||||||
ocl::KernelArg::PtrReadOnly(bufs[0]), // scaleData
|
ocl::KernelArg::PtrReadOnly(bufs[0]), // scaleData
|
||||||
@ -1091,13 +1091,12 @@ bool CascadeClassifierImpl::ocl_detectMultiScaleNoGrouping( const std::vector<fl
|
|||||||
ocl::KernelArg::PtrReadOnly(bufs[2]), // optfeatures
|
ocl::KernelArg::PtrReadOnly(bufs[2]), // optfeatures
|
||||||
|
|
||||||
// cascade classifier
|
// cascade classifier
|
||||||
splitstage_ocl, nstages,
|
|
||||||
ocl::KernelArg::PtrReadOnly(ustages),
|
ocl::KernelArg::PtrReadOnly(ustages),
|
||||||
ocl::KernelArg::PtrReadOnly(unodes),
|
ocl::KernelArg::PtrReadOnly(unodes),
|
||||||
ocl::KernelArg::PtrReadOnly(uleaves),
|
ocl::KernelArg::PtrReadOnly(uleaves),
|
||||||
|
|
||||||
ocl::KernelArg::PtrWriteOnly(ufacepos), // positions
|
ocl::KernelArg::PtrWriteOnly(ufacepos), // positions
|
||||||
normrect, sqofs, data.origWinSize, (int)MAX_FACES);
|
normrect, sqofs, data.origWinSize);
|
||||||
ok = haarKernel.run(2, globalsize, localsize, true);
|
ok = haarKernel.run(2, globalsize, localsize, true);
|
||||||
}
|
}
|
||||||
else if( featureType == FeatureEvaluator::LBP )
|
else if( featureType == FeatureEvaluator::LBP )
|
||||||
@ -1113,16 +1112,16 @@ bool CascadeClassifierImpl::ocl_detectMultiScaleNoGrouping( const std::vector<fl
|
|||||||
{
|
{
|
||||||
String opts;
|
String opts;
|
||||||
if (lbufSize.area())
|
if (lbufSize.area())
|
||||||
opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D SUM_BUF_SIZE=%d -D SUM_BUF_STEP=%d",
|
opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D SUM_BUF_SIZE=%d -D SUM_BUF_STEP=%d -D SPLIT_STAGE=%d -D N_STAGES=%d -D MAX_FACES=%d",
|
||||||
localsz.width, localsz.height, lbufSize.area(), lbufSize.width);
|
localsz.width, localsz.height, lbufSize.area(), lbufSize.width, splitstage_ocl, nstages, MAX_FACES);
|
||||||
else
|
else
|
||||||
opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d", localsz.width, localsz.height);
|
opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D SPLIT_STAGE=%d -D N_STAGES=%d -D MAX_FACES=%d",
|
||||||
|
localsz.width, localsz.height, splitstage_ocl, nstages, MAX_FACES);
|
||||||
lbpKernel.create("runLBPClassifierStumpSimple", ocl::objdetect::cascadedetect_oclsrc, opts);
|
lbpKernel.create("runLBPClassifierStumpSimple", ocl::objdetect::cascadedetect_oclsrc, opts);
|
||||||
if( lbpKernel.empty() )
|
if( lbpKernel.empty() )
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
int splitstage_ocl = 1;
|
|
||||||
int subsetSize = (data.ncategories + 31)/32;
|
int subsetSize = (data.ncategories + 31)/32;
|
||||||
lbpKernel.args((int)scales.size(),
|
lbpKernel.args((int)scales.size(),
|
||||||
ocl::KernelArg::PtrReadOnly(bufs[0]), // scaleData
|
ocl::KernelArg::PtrReadOnly(bufs[0]), // scaleData
|
||||||
@ -1130,14 +1129,13 @@ bool CascadeClassifierImpl::ocl_detectMultiScaleNoGrouping( const std::vector<fl
|
|||||||
ocl::KernelArg::PtrReadOnly(bufs[2]), // optfeatures
|
ocl::KernelArg::PtrReadOnly(bufs[2]), // optfeatures
|
||||||
|
|
||||||
// cascade classifier
|
// cascade classifier
|
||||||
splitstage_ocl, nstages,
|
|
||||||
ocl::KernelArg::PtrReadOnly(ustages),
|
ocl::KernelArg::PtrReadOnly(ustages),
|
||||||
ocl::KernelArg::PtrReadOnly(unodes),
|
ocl::KernelArg::PtrReadOnly(unodes),
|
||||||
ocl::KernelArg::PtrReadOnly(usubsets),
|
ocl::KernelArg::PtrReadOnly(usubsets),
|
||||||
subsetSize,
|
subsetSize,
|
||||||
|
|
||||||
ocl::KernelArg::PtrWriteOnly(ufacepos), // positions
|
ocl::KernelArg::PtrWriteOnly(ufacepos), // positions
|
||||||
data.origWinSize, (int)MAX_FACES);
|
data.origWinSize);
|
||||||
|
|
||||||
ok = lbpKernel.run(2, globalsize, localsize, true);
|
ok = lbpKernel.run(2, globalsize, localsize, true);
|
||||||
}
|
}
|
||||||
|
@ -70,14 +70,12 @@ void runHaarClassifier(
|
|||||||
__global const int* sum,
|
__global const int* sum,
|
||||||
int _sumstep, int sumoffset,
|
int _sumstep, int sumoffset,
|
||||||
__global const OptHaarFeature* optfeatures,
|
__global const OptHaarFeature* optfeatures,
|
||||||
|
|
||||||
int splitstage, int nstages,
|
|
||||||
__global const Stage* stages,
|
__global const Stage* stages,
|
||||||
__global const Node* nodes,
|
__global const Node* nodes,
|
||||||
__global const float* leaves0,
|
__global const float* leaves0,
|
||||||
|
|
||||||
volatile __global int* facepos,
|
volatile __global int* facepos,
|
||||||
int4 normrect, int sqofs, int2 windowsize, int maxFaces)
|
int4 normrect, int sqofs, int2 windowsize)
|
||||||
{
|
{
|
||||||
int lx = get_local_id(0);
|
int lx = get_local_id(0);
|
||||||
int ly = get_local_id(1);
|
int ly = get_local_id(1);
|
||||||
@ -165,7 +163,7 @@ void runHaarClassifier(
|
|||||||
float nf = (float)normarea * sqrt(max(sqval - sval * sval, 0.f));
|
float nf = (float)normarea * sqrt(max(sqval - sval * sval, 0.f));
|
||||||
nf = nf > 0 ? nf : 1.f;
|
nf = nf > 0 ? nf : 1.f;
|
||||||
|
|
||||||
for( stageIdx = 0; stageIdx < splitstage; stageIdx++ )
|
for( stageIdx = 0; stageIdx < SPLIT_STAGE; stageIdx++ )
|
||||||
{
|
{
|
||||||
int ntrees = stages[stageIdx].ntrees;
|
int ntrees = stages[stageIdx].ntrees;
|
||||||
float s = 0.f;
|
float s = 0.f;
|
||||||
@ -221,7 +219,7 @@ void runHaarClassifier(
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if( stageIdx == splitstage && (ystep == 1 || ((ix | iy) & 1) == 0) )
|
if( stageIdx == SPLIT_STAGE && (ystep == 1 || ((ix | iy) & 1) == 0) )
|
||||||
{
|
{
|
||||||
int count = atomic_inc(lcount);
|
int count = atomic_inc(lcount);
|
||||||
lbuf[count] = (int)(ix | (iy << 8));
|
lbuf[count] = (int)(ix | (iy << 8));
|
||||||
@ -229,7 +227,7 @@ void runHaarClassifier(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for( stageIdx = splitstage; stageIdx < nstages; stageIdx++ )
|
for( stageIdx = SPLIT_STAGE; stageIdx < N_STAGES; stageIdx++ )
|
||||||
{
|
{
|
||||||
int nrects = lcount[0];
|
int nrects = lcount[0];
|
||||||
|
|
||||||
@ -335,13 +333,13 @@ void runHaarClassifier(
|
|||||||
}
|
}
|
||||||
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
if( stageIdx == nstages )
|
if( stageIdx == N_STAGES )
|
||||||
{
|
{
|
||||||
int nrects = lcount[0];
|
int nrects = lcount[0];
|
||||||
if( lidx < nrects )
|
if( lidx < nrects )
|
||||||
{
|
{
|
||||||
int nfaces = atomic_inc(facepos);
|
int nfaces = atomic_inc(facepos);
|
||||||
if( nfaces < maxFaces )
|
if( nfaces < MAX_FACES )
|
||||||
{
|
{
|
||||||
volatile __global int* face = facepos + 1 + nfaces*3;
|
volatile __global int* face = facepos + 1 + nfaces*3;
|
||||||
int val = lbuf[lidx];
|
int val = lbuf[lidx];
|
||||||
@ -364,15 +362,13 @@ __kernel void runLBPClassifierStumpSimple(
|
|||||||
__global const int* sum,
|
__global const int* sum,
|
||||||
int _sumstep, int sumoffset,
|
int _sumstep, int sumoffset,
|
||||||
__global const OptLBPFeature* optfeatures,
|
__global const OptLBPFeature* optfeatures,
|
||||||
|
|
||||||
int splitstage, int nstages,
|
|
||||||
__global const Stage* stages,
|
__global const Stage* stages,
|
||||||
__global const Stump* stumps,
|
__global const Stump* stumps,
|
||||||
__global const int* bitsets,
|
__global const int* bitsets,
|
||||||
int bitsetSize,
|
int bitsetSize,
|
||||||
|
|
||||||
volatile __global int* facepos,
|
volatile __global int* facepos,
|
||||||
int2 windowsize, int maxFaces)
|
int2 windowsize)
|
||||||
{
|
{
|
||||||
int lx = get_local_id(0);
|
int lx = get_local_id(0);
|
||||||
int ly = get_local_id(1);
|
int ly = get_local_id(1);
|
||||||
@ -381,7 +377,6 @@ __kernel void runLBPClassifierStumpSimple(
|
|||||||
int groupIdx = get_group_id(1)*get_num_groups(0) + get_group_id(0);
|
int groupIdx = get_group_id(1)*get_num_groups(0) + get_group_id(0);
|
||||||
int ngroups = get_num_groups(0)*get_num_groups(1);
|
int ngroups = get_num_groups(0)*get_num_groups(1);
|
||||||
int scaleIdx, tileIdx, stageIdx;
|
int scaleIdx, tileIdx, stageIdx;
|
||||||
int startStage = 0, endStage = nstages;
|
|
||||||
int sumstep = (int)(_sumstep/sizeof(int));
|
int sumstep = (int)(_sumstep/sizeof(int));
|
||||||
|
|
||||||
for( scaleIdx = nscales-1; scaleIdx >= 0; scaleIdx-- )
|
for( scaleIdx = nscales-1; scaleIdx >= 0; scaleIdx-- )
|
||||||
@ -404,7 +399,7 @@ __kernel void runLBPClassifierStumpSimple(
|
|||||||
__global const Stump* stump = stumps;
|
__global const Stump* stump = stumps;
|
||||||
__global const int* bitset = bitsets;
|
__global const int* bitset = bitsets;
|
||||||
|
|
||||||
for( stageIdx = 0; stageIdx < endStage; stageIdx++ )
|
for( stageIdx = 0; stageIdx < N_STAGES; stageIdx++ )
|
||||||
{
|
{
|
||||||
int i, ntrees = stages[stageIdx].ntrees;
|
int i, ntrees = stages[stageIdx].ntrees;
|
||||||
float s = 0.f;
|
float s = 0.f;
|
||||||
@ -433,10 +428,10 @@ __kernel void runLBPClassifierStumpSimple(
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if( stageIdx == nstages )
|
if( stageIdx == N_STAGES )
|
||||||
{
|
{
|
||||||
int nfaces = atomic_inc(facepos);
|
int nfaces = atomic_inc(facepos);
|
||||||
if( nfaces < maxFaces )
|
if( nfaces < MAX_FACES )
|
||||||
{
|
{
|
||||||
volatile __global int* face = facepos + 1 + nfaces*3;
|
volatile __global int* face = facepos + 1 + nfaces*3;
|
||||||
face[0] = scaleIdx;
|
face[0] = scaleIdx;
|
||||||
@ -455,15 +450,13 @@ void runLBPClassifierStump(
|
|||||||
__global const int* sum,
|
__global const int* sum,
|
||||||
int _sumstep, int sumoffset,
|
int _sumstep, int sumoffset,
|
||||||
__global const OptLBPFeature* optfeatures,
|
__global const OptLBPFeature* optfeatures,
|
||||||
|
|
||||||
int splitstage, int nstages,
|
|
||||||
__global const Stage* stages,
|
__global const Stage* stages,
|
||||||
__global const Stump* stumps,
|
__global const Stump* stumps,
|
||||||
__global const int* bitsets,
|
__global const int* bitsets,
|
||||||
int bitsetSize,
|
int bitsetSize,
|
||||||
|
|
||||||
volatile __global int* facepos,
|
volatile __global int* facepos,
|
||||||
int2 windowsize, int maxFaces)
|
int2 windowsize)
|
||||||
{
|
{
|
||||||
int lx = get_local_id(0);
|
int lx = get_local_id(0);
|
||||||
int ly = get_local_id(1);
|
int ly = get_local_id(1);
|
||||||
@ -525,7 +518,7 @@ void runLBPClassifierStump(
|
|||||||
__global const int* p = psum0 + mad24(iy, sumstep, ix);
|
__global const int* p = psum0 + mad24(iy, sumstep, ix);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
for( stageIdx = 0; stageIdx < splitstage; stageIdx++ )
|
for( stageIdx = 0; stageIdx < SPLIT_STAGE; stageIdx++ )
|
||||||
{
|
{
|
||||||
int ntrees = stages[stageIdx].ntrees;
|
int ntrees = stages[stageIdx].ntrees;
|
||||||
float s = 0.f;
|
float s = 0.f;
|
||||||
@ -554,14 +547,14 @@ void runLBPClassifierStump(
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if( stageIdx == splitstage && (ystep == 1 || ((ix | iy) & 1) == 0) )
|
if( stageIdx == SPLIT_STAGE && (ystep == 1 || ((ix | iy) & 1) == 0) )
|
||||||
{
|
{
|
||||||
int count = atomic_inc(lcount);
|
int count = atomic_inc(lcount);
|
||||||
lbuf[count] = (int)(ix | (iy << 8));
|
lbuf[count] = (int)(ix | (iy << 8));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for( stageIdx = splitstage; stageIdx < nstages; stageIdx++ )
|
for( stageIdx = SPLIT_STAGE; stageIdx < N_STAGES; stageIdx++ )
|
||||||
{
|
{
|
||||||
int nrects = lcount[0];
|
int nrects = lcount[0];
|
||||||
|
|
||||||
@ -639,13 +632,13 @@ void runLBPClassifierStump(
|
|||||||
}
|
}
|
||||||
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
if( stageIdx == nstages )
|
if( stageIdx == N_STAGES )
|
||||||
{
|
{
|
||||||
int nrects = lcount[0];
|
int nrects = lcount[0];
|
||||||
if( lidx < nrects )
|
if( lidx < nrects )
|
||||||
{
|
{
|
||||||
int nfaces = atomic_inc(facepos);
|
int nfaces = atomic_inc(facepos);
|
||||||
if( nfaces < maxFaces )
|
if( nfaces < MAX_FACES )
|
||||||
{
|
{
|
||||||
volatile __global int* face = facepos + 1 + nfaces*3;
|
volatile __global int* face = facepos + 1 + nfaces*3;
|
||||||
int val = lbuf[lidx];
|
int val = lbuf[lidx];
|
||||||
|
Loading…
x
Reference in New Issue
Block a user