Use preprocessor macros for constant values in the OpenCL kernels instead of kernel parameter variables.

This can improve the performance of OCL_Cascade_Image_MinSize_CascadeClassifier.CascadeClassifier/*. In particular, OCL_Cascade_Image_MinSize_CascadeClassifier.CascadeClassifier/15 and OCL_Cascade_Image_MinSize_CascadeClassifier.CascadeClassifier/16 improve by about 2% on Intel platforms.

Signed-off-by: Yan Wang <yan.wang@linux.intel.com>
2014-12-16 16:21:05 +08:00
parent 009aec5164
commit efa84d8225
2 changed files with 27 additions and 36 deletions
--- a/modules/objdetect/src/cascadedetect.cpp
+++ b/modules/objdetect/src/cascadedetect.cpp
@@ -1060,6 +1060,7 @@ bool CascadeClassifierImpl::ocl_detectMultiScaleNoGrouping( const std::vector<fl
    }
    int nstages = (int)data.stages.size();
+    int splitstage_ocl = 1;
    if( featureType == FeatureEvaluator::HAAR )
    {
@@ -1071,11 +1072,11 @@ bool CascadeClassifierImpl::ocl_detectMultiScaleNoGrouping( const std::vector<fl
        {
            String opts;
            if (lbufSize.area())
-                opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D SUM_BUF_SIZE=%d -D SUM_BUF_STEP=%d -D NODE_COUNT=%d",
+                opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D SUM_BUF_SIZE=%d -D SUM_BUF_STEP=%d -D NODE_COUNT=%d -D SPLIT_STAGE=%d -D N_STAGES=%d -D MAX_FACES=%d",
-                              localsz.width, localsz.height, lbufSize.area(), lbufSize.width, data.maxNodesPerTree);
+                              localsz.width, localsz.height, lbufSize.area(), lbufSize.width, data.maxNodesPerTree, splitstage_ocl, nstages, MAX_FACES);
            else
-                opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D NODE_COUNT=%d",
+                opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D NODE_COUNT=%d -D SPLIT_STAGE=%d -D N_STAGES=%d -D MAX_FACES=%d",
-                              localsz.width, localsz.height, data.maxNodesPerTree);
+                              localsz.width, localsz.height, data.maxNodesPerTree, splitstage_ocl, nstages, MAX_FACES);
            haarKernel.create("runHaarClassifier", ocl::objdetect::cascadedetect_oclsrc, opts);
            if( haarKernel.empty() )
                return false;
@@ -1083,7 +1084,6 @@ bool CascadeClassifierImpl::ocl_detectMultiScaleNoGrouping( const std::vector<fl
        Rect normrect = haar->getNormRect();
        int sqofs = haar->getSquaresOffset();
-        int splitstage_ocl = 1;
        haarKernel.args((int)scales.size(),
                        ocl::KernelArg::PtrReadOnly(bufs[0]), // scaleData
@@ -1091,13 +1091,12 @@ bool CascadeClassifierImpl::ocl_detectMultiScaleNoGrouping( const std::vector<fl
                        ocl::KernelArg::PtrReadOnly(bufs[2]), // optfeatures
                        // cascade classifier
-                        splitstage_ocl, nstages,
                        ocl::KernelArg::PtrReadOnly(ustages),
                        ocl::KernelArg::PtrReadOnly(unodes),
                        ocl::KernelArg::PtrReadOnly(uleaves),
                        ocl::KernelArg::PtrWriteOnly(ufacepos), // positions
-                        normrect, sqofs, data.origWinSize, (int)MAX_FACES);
+                        normrect, sqofs, data.origWinSize);
        ok = haarKernel.run(2, globalsize, localsize, true);
    }
    else if( featureType == FeatureEvaluator::LBP )
@@ -1113,16 +1112,16 @@ bool CascadeClassifierImpl::ocl_detectMultiScaleNoGrouping( const std::vector<fl
        {
            String opts;
            if (lbufSize.area())
-                opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D SUM_BUF_SIZE=%d -D SUM_BUF_STEP=%d",
+                opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D SUM_BUF_SIZE=%d -D SUM_BUF_STEP=%d -D SPLIT_STAGE=%d -D N_STAGES=%d -D MAX_FACES=%d",
-                              localsz.width, localsz.height, lbufSize.area(), lbufSize.width);
+                              localsz.width, localsz.height, lbufSize.area(), lbufSize.width, splitstage_ocl, nstages, MAX_FACES);
            else
-                opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d", localsz.width, localsz.height);
+                opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D SPLIT_STAGE=%d -D N_STAGES=%d -D MAX_FACES=%d",
+                              localsz.width, localsz.height, splitstage_ocl, nstages, MAX_FACES);
            lbpKernel.create("runLBPClassifierStumpSimple", ocl::objdetect::cascadedetect_oclsrc, opts);
            if( lbpKernel.empty() )
                return false;
        }
-        int splitstage_ocl = 1;
        int subsetSize = (data.ncategories + 31)/32;
        lbpKernel.args((int)scales.size(),
                       ocl::KernelArg::PtrReadOnly(bufs[0]), // scaleData
@@ -1130,14 +1129,13 @@ bool CascadeClassifierImpl::ocl_detectMultiScaleNoGrouping( const std::vector<fl
                       ocl::KernelArg::PtrReadOnly(bufs[2]), // optfeatures
                       // cascade classifier
-                       splitstage_ocl, nstages,
                       ocl::KernelArg::PtrReadOnly(ustages),
                       ocl::KernelArg::PtrReadOnly(unodes),
                       ocl::KernelArg::PtrReadOnly(usubsets),
                       subsetSize,
                       ocl::KernelArg::PtrWriteOnly(ufacepos), // positions
-                       data.origWinSize, (int)MAX_FACES);
+                       data.origWinSize);
        ok = lbpKernel.run(2, globalsize, localsize, true);
    }
--- a/modules/objdetect/src/opencl/cascadedetect.cl
+++ b/modules/objdetect/src/opencl/cascadedetect.cl
@@ -70,14 +70,12 @@ void runHaarClassifier(
    __global const int* sum,
    int _sumstep, int sumoffset,
    __global const OptHaarFeature* optfeatures,
-    int splitstage, int nstages,
    __global const Stage* stages,
    __global const Node* nodes,
    __global const float* leaves0,
    volatile __global int* facepos,
-    int4 normrect, int sqofs, int2 windowsize, int maxFaces)
+    int4 normrect, int sqofs, int2 windowsize)
 {
    int lx = get_local_id(0);
    int ly = get_local_id(1);
@@ -165,7 +163,7 @@ void runHaarClassifier(
                float nf = (float)normarea * sqrt(max(sqval - sval * sval, 0.f));
                nf = nf > 0 ? nf : 1.f;
-                for( stageIdx = 0; stageIdx < splitstage; stageIdx++ )
+                for( stageIdx = 0; stageIdx < SPLIT_STAGE; stageIdx++ )
                {
                    int ntrees = stages[stageIdx].ntrees;
                    float s = 0.f;
@@ -221,7 +219,7 @@ void runHaarClassifier(
                        break;
                }
-                if( stageIdx == splitstage && (ystep == 1 || ((ix | iy) & 1) == 0) )
+                if( stageIdx == SPLIT_STAGE && (ystep == 1 || ((ix | iy) & 1) == 0) )
                {
                    int count = atomic_inc(lcount);
                    lbuf[count] = (int)(ix | (iy << 8));
@@ -229,7 +227,7 @@ void runHaarClassifier(
                }
            }
-            for( stageIdx = splitstage; stageIdx < nstages; stageIdx++ )
+            for( stageIdx = SPLIT_STAGE; stageIdx < N_STAGES; stageIdx++ )
            {
                int nrects = lcount[0];
@@ -335,13 +333,13 @@ void runHaarClassifier(
            }
            barrier(CLK_LOCAL_MEM_FENCE);
-            if( stageIdx == nstages )
+            if( stageIdx == N_STAGES )
            {
                int nrects = lcount[0];
                if( lidx < nrects )
                {
                    int nfaces = atomic_inc(facepos);
-                    if( nfaces < maxFaces )
+                    if( nfaces < MAX_FACES )
                    {
                        volatile __global int* face = facepos + 1 + nfaces*3;
                        int val = lbuf[lidx];
@@ -364,15 +362,13 @@ __kernel void runLBPClassifierStumpSimple(
    __global const int* sum,
    int _sumstep, int sumoffset,
    __global const OptLBPFeature* optfeatures,
-    int splitstage, int nstages,
    __global const Stage* stages,
    __global const Stump* stumps,
    __global const int* bitsets,
    int bitsetSize,
    volatile __global int* facepos,
-    int2 windowsize, int maxFaces)
+    int2 windowsize)
 {
    int lx = get_local_id(0);
    int ly = get_local_id(1);
@@ -381,7 +377,6 @@ __kernel void runLBPClassifierStumpSimple(
    int groupIdx = get_group_id(1)*get_num_groups(0) + get_group_id(0);
    int ngroups = get_num_groups(0)*get_num_groups(1);
    int scaleIdx, tileIdx, stageIdx;
-    int startStage = 0, endStage = nstages;
    int sumstep = (int)(_sumstep/sizeof(int));
    for( scaleIdx = nscales-1; scaleIdx >= 0; scaleIdx-- )
@@ -404,7 +399,7 @@ __kernel void runLBPClassifierStumpSimple(
                __global const Stump* stump = stumps;
                __global const int* bitset = bitsets;
-                for( stageIdx = 0; stageIdx < endStage; stageIdx++ )
+                for( stageIdx = 0; stageIdx < N_STAGES; stageIdx++ )
                {
                    int i, ntrees = stages[stageIdx].ntrees;
                    float s = 0.f;
@@ -433,10 +428,10 @@ __kernel void runLBPClassifierStumpSimple(
                        break;
                }
-                if( stageIdx == nstages )
+                if( stageIdx == N_STAGES )
                {
                    int nfaces = atomic_inc(facepos);
-                    if( nfaces < maxFaces )
+                    if( nfaces < MAX_FACES )
                    {
                        volatile __global int* face = facepos + 1 + nfaces*3;
                        face[0] = scaleIdx;
@@ -455,15 +450,13 @@ void runLBPClassifierStump(
    __global const int* sum,
    int _sumstep, int sumoffset,
    __global const OptLBPFeature* optfeatures,
-    int splitstage, int nstages,
    __global const Stage* stages,
    __global const Stump* stumps,
    __global const int* bitsets,
    int bitsetSize,
    volatile __global int* facepos,
-    int2 windowsize, int maxFaces)
+    int2 windowsize)
 {
    int lx = get_local_id(0);
    int ly = get_local_id(1);
@@ -525,7 +518,7 @@ void runLBPClassifierStump(
                __global const int* p = psum0 + mad24(iy, sumstep, ix);
                #endif
-                for( stageIdx = 0; stageIdx < splitstage; stageIdx++ )
+                for( stageIdx = 0; stageIdx < SPLIT_STAGE; stageIdx++ )
                {
                    int ntrees = stages[stageIdx].ntrees;
                    float s = 0.f;
@@ -554,14 +547,14 @@ void runLBPClassifierStump(
                        break;
                }
-                if( stageIdx == splitstage && (ystep == 1 || ((ix | iy) & 1) == 0) )
+                if( stageIdx == SPLIT_STAGE && (ystep == 1 || ((ix | iy) & 1) == 0) )
                {
                    int count = atomic_inc(lcount);
                    lbuf[count] = (int)(ix | (iy << 8));
                }
            }
-            for( stageIdx = splitstage; stageIdx < nstages; stageIdx++ )
+            for( stageIdx = SPLIT_STAGE; stageIdx < N_STAGES; stageIdx++ )
            {
                int nrects = lcount[0];
@@ -639,13 +632,13 @@ void runLBPClassifierStump(
            }
            barrier(CLK_LOCAL_MEM_FENCE);
-            if( stageIdx == nstages )
+            if( stageIdx == N_STAGES )
            {
                int nrects = lcount[0];
                if( lidx < nrects )
                {
                    int nfaces = atomic_inc(facepos);
-                    if( nfaces < maxFaces )
+                    if( nfaces < MAX_FACES )
                    {
                        volatile __global int* face = facepos + 1 + nfaces*3;
                        int val = lbuf[lidx];