Use preprocessor definitions for constant values in the OpenCL kernels
instead of passing them as kernel parameter variables.

This can improve the performance of the
OCL_Cascade_Image_MinSize_CascadeClassifier.CascadeClassifier/* tests.
In particular,
OCL_Cascade_Image_MinSize_CascadeClassifier.CascadeClassifier/15 and
OCL_Cascade_Image_MinSize_CascadeClassifier.CascadeClassifier/16
improve by about 2% on Intel platforms.

Signed-off-by: Yan Wang <yan.wang@linux.intel.com>
This commit is contained in:
Yan Wang
2014-12-16 16:21:05 +08:00
parent 009aec5164
commit efa84d8225
2 changed files with 27 additions and 36 deletions

View File

@@ -1060,6 +1060,7 @@ bool CascadeClassifierImpl::ocl_detectMultiScaleNoGrouping( const std::vector<fl
}
int nstages = (int)data.stages.size();
int splitstage_ocl = 1;
if( featureType == FeatureEvaluator::HAAR )
{
@@ -1071,11 +1072,11 @@ bool CascadeClassifierImpl::ocl_detectMultiScaleNoGrouping( const std::vector<fl
{
String opts;
if (lbufSize.area())
opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D SUM_BUF_SIZE=%d -D SUM_BUF_STEP=%d -D NODE_COUNT=%d",
localsz.width, localsz.height, lbufSize.area(), lbufSize.width, data.maxNodesPerTree);
opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D SUM_BUF_SIZE=%d -D SUM_BUF_STEP=%d -D NODE_COUNT=%d -D SPLIT_STAGE=%d -D N_STAGES=%d -D MAX_FACES=%d",
localsz.width, localsz.height, lbufSize.area(), lbufSize.width, data.maxNodesPerTree, splitstage_ocl, nstages, MAX_FACES);
else
opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D NODE_COUNT=%d",
localsz.width, localsz.height, data.maxNodesPerTree);
opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D NODE_COUNT=%d -D SPLIT_STAGE=%d -D N_STAGES=%d -D MAX_FACES=%d",
localsz.width, localsz.height, data.maxNodesPerTree, splitstage_ocl, nstages, MAX_FACES);
haarKernel.create("runHaarClassifier", ocl::objdetect::cascadedetect_oclsrc, opts);
if( haarKernel.empty() )
return false;
@@ -1083,7 +1084,6 @@ bool CascadeClassifierImpl::ocl_detectMultiScaleNoGrouping( const std::vector<fl
Rect normrect = haar->getNormRect();
int sqofs = haar->getSquaresOffset();
int splitstage_ocl = 1;
haarKernel.args((int)scales.size(),
ocl::KernelArg::PtrReadOnly(bufs[0]), // scaleData
@@ -1091,13 +1091,12 @@ bool CascadeClassifierImpl::ocl_detectMultiScaleNoGrouping( const std::vector<fl
ocl::KernelArg::PtrReadOnly(bufs[2]), // optfeatures
// cascade classifier
splitstage_ocl, nstages,
ocl::KernelArg::PtrReadOnly(ustages),
ocl::KernelArg::PtrReadOnly(unodes),
ocl::KernelArg::PtrReadOnly(uleaves),
ocl::KernelArg::PtrWriteOnly(ufacepos), // positions
normrect, sqofs, data.origWinSize, (int)MAX_FACES);
normrect, sqofs, data.origWinSize);
ok = haarKernel.run(2, globalsize, localsize, true);
}
else if( featureType == FeatureEvaluator::LBP )
@@ -1113,16 +1112,16 @@ bool CascadeClassifierImpl::ocl_detectMultiScaleNoGrouping( const std::vector<fl
{
String opts;
if (lbufSize.area())
opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D SUM_BUF_SIZE=%d -D SUM_BUF_STEP=%d",
localsz.width, localsz.height, lbufSize.area(), lbufSize.width);
opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D SUM_BUF_SIZE=%d -D SUM_BUF_STEP=%d -D SPLIT_STAGE=%d -D N_STAGES=%d -D MAX_FACES=%d",
localsz.width, localsz.height, lbufSize.area(), lbufSize.width, splitstage_ocl, nstages, MAX_FACES);
else
opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d", localsz.width, localsz.height);
opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D SPLIT_STAGE=%d -D N_STAGES=%d -D MAX_FACES=%d",
localsz.width, localsz.height, splitstage_ocl, nstages, MAX_FACES);
lbpKernel.create("runLBPClassifierStumpSimple", ocl::objdetect::cascadedetect_oclsrc, opts);
if( lbpKernel.empty() )
return false;
}
int splitstage_ocl = 1;
int subsetSize = (data.ncategories + 31)/32;
lbpKernel.args((int)scales.size(),
ocl::KernelArg::PtrReadOnly(bufs[0]), // scaleData
@@ -1130,14 +1129,13 @@ bool CascadeClassifierImpl::ocl_detectMultiScaleNoGrouping( const std::vector<fl
ocl::KernelArg::PtrReadOnly(bufs[2]), // optfeatures
// cascade classifier
splitstage_ocl, nstages,
ocl::KernelArg::PtrReadOnly(ustages),
ocl::KernelArg::PtrReadOnly(unodes),
ocl::KernelArg::PtrReadOnly(usubsets),
subsetSize,
ocl::KernelArg::PtrWriteOnly(ufacepos), // positions
data.origWinSize, (int)MAX_FACES);
data.origWinSize);
ok = lbpKernel.run(2, globalsize, localsize, true);
}