Started adding OpenCL acceleration of LBP-based object detectors

2013-12-20 18:39:35 +04:00
parent b4bd5bab6d
commit 1540910542
3 changed files with 138 additions and 109 deletions
--- a/modules/objdetect/src/opencl/cascadedetect.cl
+++ b/modules/objdetect/src/opencl/cascadedetect.cl
@@ -1,19 +1,22 @@
 ///////////////////////////// OpenCL kernels for face detection //////////////////////////////
 ////////////////////////////// see the opencv/doc/license.txt ///////////////////////////////

-typedef struct __attribute__((aligned(4))) OptFeature
+typedef struct __attribute__((aligned(4))) OptHaarFeature
 {
    int4 ofs[3] __attribute__((aligned (4)));
    float4 weight __attribute__((aligned (4)));
 }
-OptFeature;
+OptHaarFeature;
+
+typedef struct __attribute__((aligned(4))) OptLBPFeature
+{
+    int16 ofs __attribute__((aligned (4)));
+}
+OptLBPFeature;

 typedef struct __attribute__((aligned(4))) Stump
 {
-    int featureIdx __attribute__((aligned (4)));
-    float threshold __attribute__((aligned (4))); // for ordered features only
-    float left __attribute__((aligned (4)));
-    float right __attribute__((aligned (4)));
+    float4 st __attribute__((aligned (4)));
 }
 Stump;

@@ -30,7 +33,7 @@ __kernel void runHaarClassifierStump(
    int sumstep, int sumoffset,
    __global const int* sqsum,
    int sqsumstep, int sqsumoffset,
-    __global const OptFeature* optfeatures,
+    __global const OptHaarFeature* optfeatures,

    int nstages,
    __global const Stage* stages,
@@ -47,11 +50,8 @@ __kernel void runHaarClassifierStump(

    if( ix < imgsize.x && iy < imgsize.y )
    {
-        int ntrees;
-        int stageIdx, i;
-        float s = 0.f;
+        int stageIdx;
        __global const Stump* stump = stumps;
-        __global const OptFeature* f;

        __global const int* psum = sum + mad24(iy, sumstep, ix);
        __global const int* pnsum = psum + mad24(normrect.y, sumstep, normrect.x);
@@ -61,20 +61,19 @@ __kernel void runHaarClassifierStump(
                      pnsum[mad24(normrect.w, sumstep, normrect.z)])*invarea;
        float sqval = (sqsum[mad24(iy + normrect.y, sqsumstep, ix + normrect.x)])*invarea;
        float nf = (float)normarea * sqrt(max(sqval - sval * sval, 0.f));
-        float4 weight, vsval;
-        int4 ofs, ofs0, ofs1, ofs2;
        nf = nf > 0 ? nf : 1.f;

        for( stageIdx = 0; stageIdx < nstages; stageIdx++ )
        {
-            ntrees = stages[stageIdx].ntrees;
-            s = 0.f;
+            int i, ntrees = stages[stageIdx].ntrees;
+            float s = 0.f;
            for( i = 0; i < ntrees; i++, stump++ )
            {
-                f = optfeatures + stump->featureIdx;
-                weight = f->weight;
+                float4 st = stump->st;
+                __global const OptHaarFeature* f = optfeatures + as_int(st.x);
+                float4 weight = f->weight;

-                ofs = f->ofs[0];
+                int4 ofs = f->ofs[0];
                sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;
                ofs = f->ofs[1];
                sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.y;
@@ -84,7 +83,7 @@ __kernel void runHaarClassifierStump(
                    sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.z;
                }

-                s += (sval < stump->threshold*nf) ? stump->left : stump->right;
+                s += (sval < st.y*nf) ? st.z : st.w;
            }

            if( s < stages[stageIdx].threshold )
@@ -110,9 +109,7 @@ __kernel void runHaarClassifierStump(
 __kernel void runLBPClassifierStump(
    __global const int* sum,
    int sumstep, int sumoffset,
-    __global const int* sqsum,
-    int sqsumstep, int sqsumoffset,
-    __global const OptFeature* optfeatures,
+    __global const OptLBPFeature* optfeatures,

    int nstages,
    __global const Stage* stages,
@@ -124,50 +121,45 @@ __kernel void runLBPClassifierStump(
    int2 imgsize, int xyscale, float factor,
    int4 normrect, int2 windowsize, int maxFaces)
 {
-    int ix = get_global_id(0)*xyscale*VECTOR_SIZE;
+    int ix = get_global_id(0)*xyscale;
    int iy = get_global_id(1)*xyscale;
    sumstep /= sizeof(int);
    sqsumstep /= sizeof(int);
-
+    
    if( ix < imgsize.x && iy < imgsize.y )
    {
-        int ntrees;
-        int stageIdx, i;
-        float s = 0.f;
+        int stageIdx;
        __global const Stump* stump = stumps;
-        __global const int* bitset = bitsets;
-        __global const OptFeature* f;
-
-        __global const int* psum = sum + mad24(iy, sumstep, ix);
-        __global const int* pnsum = psum + mad24(normrect.y, sumstep, normrect.x);
-        int normarea = normrect.z * normrect.w;
-        float invarea = 1.f/normarea;
-        float sval = (pnsum[0] - pnsum[normrect.z] - pnsum[mul24(normrect.w, sumstep)] +
-        pnsum[mad24(normrect.w, sumstep, normrect.z)])*invarea;
-        float sqval = (sqsum[mad24(iy + normrect.y, sqsumstep, ix + normrect.x)])*invarea;
-        float nf = (float)normarea * sqrt(max(sqval - sval * sval, 0.f));
-        float4 weight;
-        int4 ofs;
-        nf = nf > 0 ? nf : 1.f;
-
+        
        for( stageIdx = 0; stageIdx < nstages; stageIdx++ )
        {
-            ntrees = stages[stageIdx].ntrees;
-            s = 0.f;
-            for( i = 0; i < ntrees; i++, stump++, bitset += bitsetSize )
+            int i, ntrees = stages[stageIdx].ntrees;
+            float s = 0.f;
+            for( i = 0; i < ntrees; i++, stump++ )
            {
-                f = optfeatures + stump->featureIdx;
-
-                weight = f->weight;
-
-                // compute LBP feature to val
-                s += (bitset[val >> 5] & (1 << (val & 31))) ? stump->left : stump->right;
+                float4 st = stump->st;
+                __global const OptLBPFeature* f = optfeatures + as_int(st.x);
+                int16 ofs = f->ofs;
+                
+                
+                
+                int4 ofs = f->ofs[0];
+                sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;
+                ofs = f->ofs[1];
+                sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.y;
+                if( weight.z > 0 )
+                {
+                    ofs = f->ofs[2];
+                    sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.z;
+                }
+                
+                s += (sval < st.y*nf) ? st.z : st.w;
            }
-
+            
            if( s < stages[stageIdx].threshold )
            break;
        }
-
+        
        if( stageIdx == nstages )
        {
            int nfaces = atomic_inc(facepos);