LBP: switched to texture implementation

2012-07-10 11:58:15 +00:00
parent b0606b0557
commit 86d785622b
6 changed files with 143 additions and 131 deletions
--- a/modules/gpu/src/cuda/lbp.cu
+++ b/modules/gpu/src/cuda/lbp.cu
@@ -48,8 +48,102 @@ namespace cv { namespace gpu { namespace device
 {
    namespace lbp
    {
+
+        texture<int, cudaTextureType2D, cudaReadModeElementType> tintegral(false, cudaFilterModePoint, cudaAddressModeClamp); // 2D int texture reference read by LBP::operator(): unnormalized coordinates, point sampling, clamped addressing; bound/unbound by bindIntegral()/unbindIntegral()
+
+        struct LBP // Stateless device functor: evaluates one 3x3-cell LBP feature from the tintegral texture.
+        {
+            __device__ __forceinline__ LBP(const LBP& other) {(void)other;} // nothing to copy: the functor has no data members
+            __device__ __forceinline__ LBP() {}
+
+            // (ty, tx): top-left sample position; fh / featurez: row / column spacing between samples (one cell each); shift: output, lower 5 bits of the code; returns the upper 3 bits (0..7)
+            __device__ __forceinline__ int operator() (int ty, int tx, int fh, int featurez, int& shift) const
+            {
+                int anchors[9]; // scratch: first per-row horizontal differences, then the nine cell sums s0..s8 (row-major)
+
+                anchors[0]  = tex2D(tintegral, tx, ty); // sample row 0: four texels -> three horizontal differences
+                anchors[1]  = tex2D(tintegral, tx + featurez, ty);
+                anchors[0] -= anchors[1];
+                anchors[2]  = tex2D(tintegral, tx + featurez * 2, ty);
+                anchors[1] -= anchors[2];
+                anchors[2] -= tex2D(tintegral, tx + featurez * 3, ty);
+
+                ty += fh; // sample row 1
+                anchors[3]  = tex2D(tintegral, tx, ty);
+                anchors[4]  = tex2D(tintegral, tx + featurez, ty);
+                anchors[3] -= anchors[4];
+                anchors[5]  = tex2D(tintegral, tx + featurez * 2, ty);
+                anchors[4] -= anchors[5];
+                anchors[5] -= tex2D(tintegral, tx + featurez * 3, ty);
+
+                anchors[0] -= anchors[3]; // row 0 differences minus row 1 differences = sums of the top three cells
+                anchors[1] -= anchors[4];
+                anchors[2] -= anchors[5];
+                // anchors[0..2] now hold s0..s2
+
+                ty += fh; // sample row 2
+                anchors[6]  = tex2D(tintegral, tx, ty);
+                anchors[7]  = tex2D(tintegral, tx + featurez, ty);
+                anchors[6] -= anchors[7];
+                anchors[8]  = tex2D(tintegral, tx + featurez * 2, ty);
+                anchors[7] -= anchors[8];
+                anchors[8] -= tex2D(tintegral, tx + featurez * 3, ty);
+
+                anchors[3] -= anchors[6];
+                anchors[4] -= anchors[7];
+                anchors[5] -= anchors[8];
+                // anchors[3..5] now hold s3..s5
+
+                anchors[0] -= anchors[4]; // from here each outer slot holds (cell sum - centre cell s4)
+                anchors[1] -= anchors[4];
+                anchors[2] -= anchors[4];
+                anchors[3] -= anchors[4];
+                anchors[5] -= anchors[4];
+
+                int response = (~(anchors[0] >> 31)) & 4; // bit is set when the difference is >= 0 (sign bit clear; relies on arithmetic >> of int): s0 -> 4
+                response |= (~(anchors[1] >> 31)) & 2; // s1 -> 2
+                response |= (~(anchors[2] >> 31)) & 1; // s2 -> 1
+
+                shift = (~(anchors[5] >> 31)) & 16; // s5 -> 16
+                shift |= (~(anchors[3] >> 31)) & 1; // s3 -> 1
+
+                ty += fh; // sample row 3; anchors[0..2] are reused as scratch for its differences
+                anchors[0]  = tex2D(tintegral, tx, ty);
+                anchors[1]  = tex2D(tintegral, tx + featurez, ty);
+                anchors[0] -= anchors[1];
+                anchors[2]  = tex2D(tintegral, tx + featurez * 2, ty);
+                anchors[1] -= anchors[2];
+                anchors[2] -= tex2D(tintegral, tx + featurez * 3, ty);
+
+                anchors[6] -= anchors[0];
+                anchors[7] -= anchors[1];
+                anchors[8] -= anchors[2];
+                // anchors[6..8] now hold s6..s8
+
+                anchors[6] -= anchors[4]; // anchors[4] is still the untouched centre sum s4
+                anchors[7] -= anchors[4];
+                anchors[8] -= anchors[4];
+
+                shift |= (~(anchors[6] >> 31)) & 2; // s6 -> 2
+                shift |= (~(anchors[7] >> 31)) & 4; // s7 -> 4
+                shift |= (~(anchors[8] >> 31)) & 8; // s8 -> 8
+                return response;
+            }
+        };
+
+        void bindIntegral(DevMem2Di integral) // Binds `integral` to the tintegral 2D texture reference; pair with unbindIntegral().
+        {
+            cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>(); // one int channel, matching texture<int, ...>
+            cudaSafeCall( cudaBindTexture2D(0, &tintegral, integral.ptr(), &desc, (size_t)integral.cols, (size_t)integral.rows, (size_t)integral.step)); // null offset pointer; cols/rows/step go in as width/height/pitch (step presumably in bytes - confirm against DevMem2D)
+        }
+
+        void unbindIntegral() // Releases the binding on the tintegral texture reference set up by bindIntegral().
+        {
+             cudaSafeCall( cudaUnbindTexture(&tintegral)); // any error code is routed through cudaSafeCall
+        }
+
        __global__ void lbp_classify_stump(const Stage* stages, const int nstages, const ClNode* nodes, const float* leaves, const int* subsets, const uchar4* features,
-            const int* integral, const int istep, const int workWidth,const int workHeight, const int clWidth, const int clHeight, const float scale, const int step,
+           /* const int* integral,const int istep,  const int workWidth,const int workHeight,*/ const int clWidth, const int clHeight, const float scale, const int step,
            const int subsetSize, DevMem2D_<int4> objects, unsigned int* n)
        {
            int x = threadIdx.x * step;
@@ -63,21 +157,18 @@ namespace cv { namespace gpu { namespace device
            {
                float sum = 0;
                Stage stage = stages[s];
-
                for (int t = 0; t < stage.ntrees; t++)
                {
                    ClNode node = nodes[current_node];

                    uchar4 feature = features[node.featureIdx];
-                    int c = evaluator( (y + feature.y) * istep + x + feature.x , feature.w * istep, feature.z, integral, istep);
-                    const int* subsetIdx = subsets + (current_node * subsetSize);
-
-                    int idx =  (subsetIdx[c >> 5] & ( 1 << (c & 31))) ? current_leave : current_leave + 1;
+                    int shift;
+                    int c = evaluator(y + feature.y, x + feature.x, feature.w, feature.z, shift);
+                    int idx =  (subsets[ current_node * subsetSize + c] & ( 1 << shift)) ? current_leave : current_leave + 1;
                    sum += leaves[idx];
                    current_node += 1;
                    current_leave += 2;
                }
-
                if (sum < stage.threshold)
                    return;
            }
@@ -85,8 +176,8 @@ namespace cv { namespace gpu { namespace device
            int4 rect;
            rect.x = roundf(x * scale);
            rect.y = roundf(y * scale);
-            rect.z = roundf(clWidth);
-            rect.w = roundf(clHeight);
+            rect.z = clWidth;
+            rect.w = clHeight;
 #if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
            int res = __atomicInc(n, 100U);
 #else
@@ -178,8 +269,8 @@ namespace cv { namespace gpu { namespace device
            }
        }

-        void classifyStump(const DevMem2Db mstages, const int nstages, const DevMem2Di mnodes, const DevMem2Df mleaves, const DevMem2Di msubsets, const DevMem2Db mfeatures,
-                           const DevMem2Di integral, const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize,
+        void classifyStump(const DevMem2Db& mstages, const int nstages, const DevMem2Di& mnodes, const DevMem2Df& mleaves, const DevMem2Di& msubsets, const DevMem2Db& mfeatures,
+                           /*const DevMem2Di& integral,*/ const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize,
                           DevMem2D_<int4> objects, unsigned int* classified)
        {
            int blocks  = ceilf(workHeight / (float)step);
@@ -190,11 +281,8 @@ namespace cv { namespace gpu { namespace device
            const float* leaves = mleaves.ptr();
            const int* subsets = msubsets.ptr();
            const uchar4* features = (uchar4*)(mfeatures.ptr());
-            const int* integ = integral.ptr();
-            int istep = integral.step / sizeof(int);
-
-            lbp_classify_stump<<<blocks, threads>>>(stages, nstages, nodes, leaves, subsets, features, integ, istep,
-                workWidth, workHeight, clWidth, clHeight, scale, step, subsetSize, objects, classified);
+            lbp_classify_stump<<<blocks, threads>>>(stages, nstages, nodes, leaves, subsets, features, /*integ, istep,
+                workWidth, workHeight,*/ clWidth, clHeight, scale, step, subsetSize, objects, classified);
        }

        int connectedConmonents(DevMem2D_<int4> candidates, DevMem2D_<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses)