Merge pull request #2007 from krodyush:pullreq/2.4-opt-131202-haar

2013-12-19 16:46:58 +04:00 · 2013-12-19 16:46:58 +04:00 · b674cd8571
commit b674cd8571
parent 8c91a1af71 917b883cf0
2 changed files with 62 additions and 33 deletions
--- a/modules/ocl/src/haar.cpp
+++ b/modules/ocl/src/haar.cpp
@ -866,16 +866,17 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
        if(gcascade->is_stump_based && gsum.clCxt->supportsFeature(FEATURE_CL_INTEL_DEVICE))
        {
-            //setup local group size
+            //setup local group size for "pixel step" = 1
-            localThreads[0] = 8;
+            localThreads[0] = 16;
-            localThreads[1] = 16;
+            localThreads[1] = 32;
            localThreads[2] = 1;
-            //init maximal number of workgroups
+            //calc maximal number of workgroups
            int WGNumX = 1+(sizev[0].width /(localThreads[0]));
            int WGNumY = 1+(sizev[0].height/(localThreads[1]));
            int WGNumZ = loopcount;
-            int WGNum = 0; //accurate number of non -empty workgroups
+            int WGNumTotal = 0; //accurate number of non-empty workgroups
            int WGNumSampled = 0; //accurate number of workgroups processed only 1/4 part of all pixels. it is made for large images with scale <= 2
            oclMat      oclWGInfo(1,sizeof(cl_int4) * WGNumX*WGNumY*WGNumZ,CV_8U);
            {
                cl_int4*    pWGInfo = (cl_int4*)clEnqueueMapBuffer(getClCommandQueue(oclWGInfo.clCxt),(cl_mem)oclWGInfo.datastart,true,CL_MAP_WRITE, 0, oclWGInfo.step, 0,0,0,&status);
@ -895,12 +896,16 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
                            if(gx>=(Width-cascade->orig_window_size.width))
                                continue; // no data to process
                            if(scaleinfo[z].factor<=2)
                            {
                                WGNumSampled++;
                            }
                            // save no-empty workgroup info into array
-                            pWGInfo[WGNum].s[0] = scaleinfo[z].width_height;
+                            pWGInfo[WGNumTotal].s[0] = scaleinfo[z].width_height;
-                            pWGInfo[WGNum].s[1] = (gx << 16) | gy;
+                            pWGInfo[WGNumTotal].s[1] = (gx << 16) | gy;
-                            pWGInfo[WGNum].s[2] = scaleinfo[z].imgoff;
+                            pWGInfo[WGNumTotal].s[2] = scaleinfo[z].imgoff;
-                            memcpy(&(pWGInfo[WGNum].s[3]),&(scaleinfo[z].factor),sizeof(float));
+                            memcpy(&(pWGInfo[WGNumTotal].s[3]),&(scaleinfo[z].factor),sizeof(float));
-                            WGNum++;
+                            WGNumTotal++;
                        }
                    }
                }
@ -908,13 +913,8 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
                pWGInfo = NULL;
            }
            // setup global sizes to have linear array of workgroups with WGNum size
            globalThreads[0] = localThreads[0]*WGNum;
            globalThreads[1] = localThreads[1];
            globalThreads[2] = 1;
 #define NODE_SIZE 12
-            // pack node info to have less memory loads
+            // pack node info to have less memory loads on the device side
            oclMat  oclNodesPK(1,sizeof(cl_int) * NODE_SIZE * nodenum,CV_8U);
            {
                cl_int  status;
@ -963,8 +963,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
            options += format(" -D WND_SIZE_X=%d",cascade->orig_window_size.width);
            options += format(" -D WND_SIZE_Y=%d",cascade->orig_window_size.height);
            options += format(" -D STUMP_BASED=%d",gcascade->is_stump_based);
            options += format(" -D LSx=%d",localThreads[0]);
            options += format(" -D LSy=%d",localThreads[1]);
            options += format(" -D SPLITNODE=%d",splitnode);
            options += format(" -D SPLITSTAGE=%d",splitstage);
            options += format(" -D OUTPUTSZ=%d",outputsz);
@ -972,8 +970,39 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
            // init candiate global count by 0
            int pattern = 0;
            openCLSafeCall(clEnqueueWriteBuffer(qu, candidatebuffer, 1, 0, 1 * sizeof(pattern),&pattern, 0, NULL, NULL));
-            // execute face detector
+
-            openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascadePacked", globalThreads, localThreads, args, -1, -1, options.c_str());
+            if(WGNumTotal>WGNumSampled)
            {// small images and each pixel is processed
                // setup global sizes to have linear array of workgroups with WGNum size
                int     pixelstep = 1;
                size_t  LS[3]={localThreads[0]/pixelstep,localThreads[1]/pixelstep,1};
                globalThreads[0] = LS[0]*(WGNumTotal-WGNumSampled);
                globalThreads[1] = LS[1];
                globalThreads[2] = 1;
                string options1 = options;
                options1 += format(" -D PIXEL_STEP=%d",pixelstep);
                options1 += format(" -D WGSTART=%d",WGNumSampled);
                options1 += format(" -D LSx=%d",LS[0]);
                options1 += format(" -D LSy=%d",LS[1]);
                // execute face detector
                openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascadePacked", globalThreads, LS, args, -1, -1, options1.c_str());
            }
            if(WGNumSampled>0)
            {// large images each 4th pixel is processed
                // setup global sizes to have linear array of workgroups with WGNum size
                int     pixelstep = 2;
                size_t  LS[3]={localThreads[0]/pixelstep,localThreads[1]/pixelstep,1};
                globalThreads[0] = LS[0]*WGNumSampled;
                globalThreads[1] = LS[1];
                globalThreads[2] = 1;
                string options2 = options;
                options2 += format(" -D PIXEL_STEP=%d",pixelstep);
                options2 += format(" -D WGSTART=%d",0);
                options2 += format(" -D LSx=%d",LS[0]);
                options2 += format(" -D LSy=%d",LS[1]);
                // execute face detector
                openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascadePacked", globalThreads, LS, args, -1, -1, options2.c_str());
            }
            //read candidate buffer back and put it into host list
            openCLReadBuffer( gsum.clCxt, candidatebuffer, candidate, 4 * sizeof(int)*outputsz );
            assert(candidate[0]<outputsz);
--- a/modules/ocl/src/opencl/haarobjectdetect.cl
+++ b/modules/ocl/src/opencl/haarobjectdetect.cl
@ -126,13 +126,11 @@ __kernel void gpuRunHaarClassifierCascadePacked(
    )
 {
 // this version used information provided for each workgroup
 // no empty WG
    int     gid = (int)get_group_id(0);
    int     lid_x = (int)get_local_id(0);
    int     lid_y = (int)get_local_id(1);
    int     lid = lid_y*LSx+lid_x;
-    int4    WGInfo = pWGInfo[gid];
+    int4    WGInfo = pWGInfo[WGSTART+gid];
    int     GroupX = (WGInfo.y >> 16)&0xFFFF;
    int     GroupY = (WGInfo.y >> 0 )& 0xFFFF;
    int     Width  = (WGInfo.x >> 16)&0xFFFF;
@ -140,8 +138,8 @@ __kernel void gpuRunHaarClassifierCascadePacked(
    int     ImgOffset = WGInfo.z;
    float   ScaleFactor = as_float(WGInfo.w);
-#define DATA_SIZE_X (LSx+WND_SIZE_X)
+#define DATA_SIZE_X (PIXEL_STEP*LSx+WND_SIZE_X)
-#define DATA_SIZE_Y (LSy+WND_SIZE_Y)
+#define DATA_SIZE_Y (PIXEL_STEP*LSy+WND_SIZE_Y)
 #define DATA_SIZE (DATA_SIZE_X*DATA_SIZE_Y)
    local int SumL[DATA_SIZE];
@ -165,9 +163,11 @@ __kernel void gpuRunHaarClassifierCascadePacked(
    int4    info1 = p;
    int4    info2 = pq;
-    {
+    // calc processed ROI coordinate in local mem
-        int     xl = lid_x;
+    int     xl = lid_x*PIXEL_STEP;
-        int     yl = lid_y;
+    int     yl = lid_y*PIXEL_STEP;
    {// calc variance_norm_factor for all stages
        int     OffsetLocal =          yl * DATA_SIZE_X +         xl;
        int     OffsetGlobal = (GroupY+yl)* pixelstep   + (GroupX+xl);
@ -194,13 +194,13 @@ __kernel void gpuRunHaarClassifierCascadePacked(
    int result = (1.0f>0.0f);
    for(int stageloop = start_stage; (stageloop < end_stage) && result; stageloop++ )
-    {// iterate until candidate is exist
+    {// iterate until candidate is valid
        float   stage_sum = 0.0f;
        __global GpuHidHaarStageClassifier* stageinfo = (__global GpuHidHaarStageClassifier*)
            ((__global uchar*)stagecascadeptr+stageloop*sizeof(GpuHidHaarStageClassifier));
        int     lcl_off = (yl*DATA_SIZE_X)+(xl);
        int stagecount = stageinfo->count;
        float stagethreshold = stageinfo->threshold;
        int     lcl_off = (lid_y*DATA_SIZE_X)+(lid_x);
        for(int nodeloop = 0; nodeloop < stagecount; nodecounter++,nodeloop++ )
        {
        // simple macro to extract shorts from int
@ -212,7 +212,7 @@ __kernel void gpuRunHaarClassifierCascadePacked(
            int4    n1 = pN[1];
            int4    n2 = pN[2];
            float   nodethreshold  = as_float(n2.y) * variance_norm_factor;
-            // calc sum of intensity pixels according to node information
+            // calc sum of intensity pixels according to classifier node information
            float classsum =
                (SumL[M0(n0.x)+lcl_off] - SumL[M1(n0.x)+lcl_off] - SumL[M0(n0.y)+lcl_off] + SumL[M1(n0.y)+lcl_off]) * as_float(n1.z) +
                (SumL[M0(n0.z)+lcl_off] - SumL[M1(n0.z)+lcl_off] - SumL[M0(n0.w)+lcl_off] + SumL[M1(n0.w)+lcl_off]) * as_float(n1.w) +
@ -228,8 +228,8 @@ __kernel void gpuRunHaarClassifierCascadePacked(
        int index = 1+atomic_inc((volatile global int*)candidate); //get index to write global data with face info
        if(index<OUTPUTSZ)
        {
-            int     x = GroupX+lid_x;
+            int     x = GroupX+xl;
-            int     y = GroupY+lid_y;
+            int     y = GroupY+yl;
            int4 candidate_result;
            candidate_result.x = convert_int_rtn(x*ScaleFactor);
            candidate_result.y = convert_int_rtn(y*ScaleFactor);