some attempts to tune the performance

2013-12-19 14:09:44 +04:00
parent 02fb3f0a77
commit b7553d4e2e
26 changed files with 187 additions and 150 deletions
--- a/modules/objdetect/src/opencl/haarobjectdetect.cl
+++ b/modules/objdetect/src/opencl/haarobjectdetect.cl
@@ -1,43 +1,5 @@
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Niko Li, newlife20080214@gmail.com
-//    Wang Weiyan, wangweiyanster@gmail.com
-//    Jia Haipeng, jiahaipeng95@gmail.com
-//    Nathan, liujun@multicorewareinc.com
-//    Peng Xiao, pengxiao@outlook.com
-//    Erping Pang, erping@multicorewareinc.com
-//    Vadim Pisarevsky, vadim.pisarevsky@itseez.com
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//
+///////////////////////////// OpenCL kernels for face detection //////////////////////////////
+////////////////////////////// see the opencv/doc/license.txt ///////////////////////////////

 typedef struct __attribute__((aligned(4))) OptFeature
 {
@@ -46,20 +8,14 @@ typedef struct __attribute__((aligned(4))) OptFeature
 }
 OptFeature;

-typedef struct __attribute__((aligned(4))) DTreeNode
+typedef struct __attribute__((aligned(4))) Stump
 {
    int featureIdx __attribute__((aligned (4)));
    float threshold __attribute__((aligned (4))); // for ordered features only
-    int left __attribute__((aligned (4)));
-    int right __attribute__((aligned (4)));
+    float left __attribute__((aligned (4)));
+    float right __attribute__((aligned (4)));
 }
-DTreeNode;
-
-typedef struct __attribute__((aligned (4))) DTree
-{
-    int nodeCount __attribute__((aligned (4)));
-}
-DTree;
+Stump;

 typedef struct __attribute__((aligned (4))) Stage
 {
@@ -78,25 +34,23 @@ __kernel void runHaarClassifierStump(

    int nstages,
    __global const Stage* stages,
-    __global const DTree* trees,
-    __global const DTreeNode* nodes,
-    __global const float* leaves,
+    __global const Stump* stumps,

    volatile __global int* facepos,
    int2 imgsize, int xyscale, float factor,
-    int4 normrect, int2 windowsize)
+    int4 normrect, int2 windowsize, int maxFaces)
 {
-    int ix = get_global_id(0)*xyscale;
+    int ix = get_global_id(0)*xyscale*VECTOR_SIZE;
    int iy = get_global_id(1)*xyscale;
    sumstep /= sizeof(int);
    sqsumstep /= sizeof(int);
    
    if( ix < imgsize.x && iy < imgsize.y )
    {
-        int ntrees, nodeOfs = 0, leafOfs = 0;
+        int ntrees;
        int stageIdx, i;
        float s = 0.f;
-        __global const DTreeNode* node;
+        __global const Stump* stump = stumps;
        __global const OptFeature* f;
        
        __global const int* psum = sum + mad24(iy, sumstep, ix);
@@ -107,19 +61,17 @@ __kernel void runHaarClassifierStump(
                      pnsum[mad24(normrect.w, sumstep, normrect.z)])*invarea;
        float sqval = (sqsum[mad24(iy + normrect.y, sqsumstep, ix + normrect.x)])*invarea;
        float nf = (float)normarea * sqrt(max(sqval - sval * sval, 0.f));
-        float4 weight;
-        int4 ofs;
+        float4 weight, vsval;
+        int4 ofs, ofs0, ofs1, ofs2;
        nf = nf > 0 ? nf : 1.f;
        
        for( stageIdx = 0; stageIdx < nstages; stageIdx++ )
        {
            ntrees = stages[stageIdx].ntrees;
            s = 0.f;
-            for( i = 0; i < ntrees; i++, nodeOfs++, leafOfs += 2 )
+            for( i = 0; i < ntrees; i++, stump++ )
            {
-                node = nodes + nodeOfs;
-                f = optfeatures + node->featureIdx;
-                
+                f = optfeatures + stump->featureIdx;
                weight = f->weight;
                
                ofs = f->ofs[0];
@@ -131,7 +83,8 @@ __kernel void runHaarClassifierStump(
                    ofs = f->ofs[2];
                    sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.z;
                }
-                s += leaves[ sval < node->threshold*nf ? leafOfs : leafOfs + 1 ];
+                
+                s += (sval < stump->threshold*nf) ? stump->left : stump->right;
            }
            
            if( s < stages[stageIdx].threshold )
@@ -142,7 +95,7 @@ __kernel void runHaarClassifierStump(
        {
            int nfaces = atomic_inc(facepos);
            //printf("detected face #d!!!!\n", nfaces);
-            if( nfaces < MAX_FACES )
+            if( nfaces < maxFaces )
            {
                volatile __global int* face = facepos + 1 + nfaces*4;
                face[0] = convert_int_rte(ix*factor);
@@ -153,3 +106,82 @@ __kernel void runHaarClassifierStump(
        }
    }
 }
+
+#if 0
+__kernel void runLBPClassifierStump(
+    __global const int* sum,
+    int sumstep, int sumoffset,
+    __global const int* sqsum,
+    int sqsumstep, int sqsumoffset,
+    __global const OptFeature* optfeatures,
+
+    int nstages,
+    __global const Stage* stages,
+    __global const Stump* stumps,
+    __global const int* bitsets,
+    int bitsetSize,
+
+    volatile __global int* facepos,
+    int2 imgsize, int xyscale, float factor,
+    int4 normrect, int2 windowsize, int maxFaces)
+{
+    int ix = get_global_id(0)*xyscale*VECTOR_SIZE;
+    int iy = get_global_id(1)*xyscale;
+    sumstep /= sizeof(int);
+    sqsumstep /= sizeof(int);
+    
+    if( ix < imgsize.x && iy < imgsize.y )
+    {
+        int ntrees;
+        int stageIdx, i;
+        float s = 0.f;
+        __global const Stump* stump = stumps;
+        __global const int* bitset = bitsets;
+        __global const OptFeature* f;
+        
+        __global const int* psum = sum + mad24(iy, sumstep, ix);
+        __global const int* pnsum = psum + mad24(normrect.y, sumstep, normrect.x);
+        int normarea = normrect.z * normrect.w;
+        float invarea = 1.f/normarea;
+        float sval = (pnsum[0] - pnsum[normrect.z] - pnsum[mul24(normrect.w, sumstep)] +
+        pnsum[mad24(normrect.w, sumstep, normrect.z)])*invarea;
+        float sqval = (sqsum[mad24(iy + normrect.y, sqsumstep, ix + normrect.x)])*invarea;
+        float nf = (float)normarea * sqrt(max(sqval - sval * sval, 0.f));
+        float4 weight;
+        int4 ofs;
+        nf = nf > 0 ? nf : 1.f;
+        
+        for( stageIdx = 0; stageIdx < nstages; stageIdx++ )
+        {
+            ntrees = stages[stageIdx].ntrees;
+            s = 0.f;
+            for( i = 0; i < ntrees; i++, stump++, bitset += bitsetSize )
+            {
+                f = optfeatures + stump->featureIdx;
+                
+                weight = f->weight;
+                
+                // compute LBP feature to val
+                s += (bitset[val >> 5] & (1 << (val & 31))) ? stump->left : stump->right;
+            }
+            
+            if( s < stages[stageIdx].threshold )
+            break;
+        }
+        
+        if( stageIdx == nstages )
+        {
+            int nfaces = atomic_inc(facepos);
+            if( nfaces < maxFaces )
+            {
+                volatile __global int* face = facepos + 1 + nfaces*4;
+                face[0] = convert_int_rte(ix*factor);
+                face[1] = convert_int_rte(iy*factor);
+                face[2] = convert_int_rte(windowsize.x*factor);
+                face[3] = convert_int_rte(windowsize.y*factor);
+            }
+        }
+    }
+}
+#endif
+