LBP: multiscale approach; refactored atomics usage
This commit is contained in:
parent 5dc7752d54
commit e63ab8dec5
@@ -1464,6 +1464,7 @@ private:
     GpuMat resuzeBuffer;

     GpuMat candidates;
+    static const int integralFactor = 4;
 };

 ////////////////////////////////// SURF //////////////////////////////////////////

@@ -67,7 +67,7 @@ cv::gpu::CascadeClassifier_GPU_LBP::~CascadeClassifier_GPU_LBP()
 bool cv::gpu::CascadeClassifier_GPU_LBP::empty() const { throw_nogpu(); return true; }
 bool cv::gpu::CascadeClassifier_GPU_LBP::load(const string&) { throw_nogpu(); return true; }
 Size cv::gpu::CascadeClassifier_GPU_LBP::getClassifierSize() const { throw_nogpu(); return Size(); }
 void cv::gpu::CascadeClassifier_GPU_LBP::allocateBuffers(cv::Size /*frame*/) { throw_nogpu();}

 int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const cv::gpu::GpuMat& /*image*/, cv::gpu::GpuMat& /*objectsBuf*/,
     double /*scaleFactor*/, int /*minNeighbors*/, cv::Size /*maxObjectSize*/){ throw_nogpu(); return 0;}
@@ -86,7 +86,7 @@ void cv::gpu::CascadeClassifier_GPU_LBP::allocateBuffers(cv::Size frame)
 {
     resuzeBuffer.create(frame, CV_8UC1);

-    integral.create(frame.height + 1, frame.width + 1, CV_32SC1);
+    integral.create(frame.height + 1, integralFactor * (frame.width + 1), CV_32SC1);
     NcvSize32u roiSize;
     roiSize.width = frame.width;
     roiSize.height = frame.height;
@@ -284,14 +284,83 @@ namespace cv { namespace gpu { namespace device
                                  DevMem2D_<int4> objects,
                                  unsigned int* classified);

+        void classifyPyramid(int frameW,
+                             int frameH,
+                             int windowW,
+                             int windowH,
+                             float initalScale,
+                             float factor,
+                             int total,
+                             const DevMem2Db& mstages,
+                             const int nstages,
+                             const DevMem2Di& mnodes,
+                             const DevMem2Df& mleaves,
+                             const DevMem2Di& msubsets,
+                             const DevMem2Db& mfeatures,
+                             const int subsetSize,
+                             DevMem2D_<int4> objects,
+                             unsigned int* classified,
+                             DevMem2Di integral);

         void connectedConmonents(DevMem2D_<int4> candidates, int ncandidates, DevMem2D_<int4> objects,int groupThreshold, float grouping_eps, unsigned int* nclasses);
         void bindIntegral(DevMem2Di integral);
         void unbindIntegral();
     }
 }}}

-int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, GpuMat& objects,
-                                                         double scaleFactor, int groupThreshold, cv::Size maxObjectSize /*, Size minSize=Size()*/)
+cv::Size operator -(const cv::Size& a, const cv::Size& b)
+{
+    return cv::Size(a.width - b.width, a.height - b.height);
+}
+
+cv::Size operator +(const cv::Size& a, const int& i)
+{
+    return cv::Size(a.width + i, a.height + i);
+}
+
+cv::Size operator *(const cv::Size& a, const float& f)
+{
+    return cv::Size(cvRound(a.width * f), cvRound(a.height * f));
+}
+
+cv::Size operator /(const cv::Size& a, const float& f)
+{
+    return cv::Size(cvRound(a.width / f), cvRound(a.height / f));
+}
+
+bool operator <=(const cv::Size& a, const cv::Size& b)
+{
+    return a.width <= b.width && a.height <= b.width;
+}
+
+struct PyrLavel
+{
+    PyrLavel(int _order, float _scale, cv::Size frame, cv::Size window) : order(_order)
+    {
+        scale = pow(_scale, order);
+        sFrame = frame / scale;
+        workArea = sFrame - window + 1;
+        sWindow = window * scale;
+    }
+
+    bool isFeasible(cv::Size maxObj)
+    {
+        return workArea.width > 0 && workArea.height > 0 && sWindow <= maxObj;
+    }
+
+    PyrLavel next(float factor, cv::Size frame, cv::Size window)
+    {
+        return PyrLavel(order + 1, factor, frame, window);
+    }
+
+    int order;
+    float scale;
+    cv::Size sFrame;
+    cv::Size workArea;
+    cv::Size sWindow;
+};
+
+int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, GpuMat& objects, double scaleFactor, int groupThreshold, cv::Size maxObjectSize)
 {
     CV_Assert(!empty() && scaleFactor > 1 && image.depth() == CV_8U);

@@ -306,6 +375,7 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
     // used for debug
     // candidates.setTo(cv::Scalar::all(0));
     // objects.setTo(cv::Scalar::all(0));

     if (maxObjectSize == cv::Size())
         maxObjectSize = image.size();

@@ -315,52 +385,54 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
     GpuMat dclassified(1, 1, CV_32S);
     cudaSafeCall( cudaMemcpy(dclassified.ptr(), &classified, sizeof(int), cudaMemcpyHostToDevice) );

-    // cv::gpu::device::lbp::bindIntegral(integral);
+    PyrLavel level(0, 1.0f, image.size(), NxM);

-    Size scaledImageSize(image.cols, image.rows);
-    Size processingRectSize( scaledImageSize.width - NxM.width + 1, scaledImageSize.height - NxM.height + 1 );
-    Size windowSize(NxM.width, NxM.height);
-
-    float factor = 1;
-
-    for (;;)
+    while (level.isFeasible(maxObjectSize))
     {
-        if (processingRectSize.width <= 0 || processingRectSize.height <= 0 )
-            break;
+        int acc = level.sFrame.width + 1;
+        float iniScale = level.scale;
+        cv::Size area = level.workArea;
+        float step = (float)(1 + (level.scale <= 2.f));

-        if( windowSize.width > maxObjectSize.width || windowSize.height > maxObjectSize.height )
-            break;
+        int total = 0, prev = 0;

-        // if( windowSize.width < minObjectSize.width || windowSize.height < minObjectSize.height )
-        //     continue;
+        while (acc <= integralFactor * (image.cols + 1) && level.isFeasible(maxObjectSize))
+        {
+            // create sutable matrix headers
+            GpuMat src = resuzeBuffer(cv::Rect(0, 0, level.sFrame.width, level.sFrame.height));
+            GpuMat sint = integral(cv::Rect(prev, 0, level.sFrame.width + 1, level.sFrame.height + 1));
+            GpuMat buff = integralBuffer;

-        GpuMat scaledImg = resuzeBuffer(cv::Rect(0, 0, scaledImageSize.width, scaledImageSize.height));
-        GpuMat scaledIntegral = integral(cv::Rect(0, 0, scaledImageSize.width + 1, scaledImageSize.height + 1));
-        GpuMat currBuff = integralBuffer;
+            // generate integral for scale
+            gpu::resize(image, src, level.sFrame, 0, 0, CV_INTER_LINEAR);
+            gpu::integralBuffered(src, sint, buff);

-        gpu::resize(image, scaledImg, scaledImageSize, 0, 0, CV_INTER_LINEAR);
-        gpu::integralBuffered(scaledImg, scaledIntegral, currBuff);
+            total += cvCeil(area.width / step) * cvCeil(area.height / step);
+            // std::cout << "Total for scale: " << total << " this step contribution " << cvCeil(area.width / step) * cvCeil(area.height / step) << " previous width shift " << prev << " acc " << acc << " scales: " << cvCeil(area.width / step) << std::endl;

-        int step = factor <= 2.f ? 2 : 1;
+            // increment pyr lavel
+            level = level.next(scaleFactor, image.size(), NxM);
+            area = level.workArea;

-        device::lbp::classifyStumpFixed(integral, integral.step1(), stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat, leaves_mat, subsets_mat, features_mat,
-            processingRectSize.width, processingRectSize.height, windowSize.width, windowSize.height, factor, step, subsetSize, candidates, dclassified.ptr<unsigned int>());
+            step = (float)(1 + (level.scale <= 2.f));
+            prev = acc;
+            acc += level.sFrame.width + 1;
+        }

-        factor *= scaleFactor;
-        windowSize = cv::Size(cvRound(NxM.width * factor), cvRound(NxM.height * factor));
-        scaledImageSize = cv::Size(cvRound( image.cols / factor ), cvRound( image.rows / factor ));
-        processingRectSize = cv::Size(scaledImageSize.width - NxM.width + 1, scaledImageSize.height - NxM.height + 1 );
+        device::lbp::classifyPyramid(image.cols, image.rows, NxM.width, NxM.height, iniScale, scaleFactor, total, stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat,
+            leaves_mat, subsets_mat, features_mat, subsetSize, candidates, dclassified.ptr<unsigned int>(), integral);
     }

-    // cv::gpu::device::lbp::unbindIntegral();
     if (groupThreshold <= 0 || objects.empty())
         return 0;

     cudaSafeCall( cudaMemcpy(&classified, dclassified.ptr(), sizeof(int), cudaMemcpyDeviceToHost) );
     device::lbp::connectedConmonents(candidates, classified, objects, groupThreshold, grouping_eps, dclassified.ptr<unsigned int>());

+    // candidates.copyTo(objects);
     cudaSafeCall( cudaMemcpy(&classified, dclassified.ptr(), sizeof(int), cudaMemcpyDeviceToHost) );
     cudaSafeCall( cudaDeviceSynchronize() );
+    // std::cout << classified << " !!!!!!!!!!" << std::endl;

     return classified;
 }
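For context on the hunk above: the rewritten detectMultiScale packs one integral image per pyramid level side by side into a single buffer whose width is integralFactor * (frame.width + 1), and launches one classifyPyramid pass per packed batch. The following is a minimal host-side C++ sketch of that packing arithmetic only; the frame size, window size and all variable names are hypothetical illustrations, not taken from the commit.

// Sketch only: mirrors the prev/acc bookkeeping of the new detectMultiScale loop.
#include <cmath>
#include <cstdio>

int main()
{
    const int   frameW = 640, frameH = 480;   // hypothetical input frame
    const int   winW = 24, winH = 24;         // hypothetical classifier window (NxM)
    const float scaleFactor = 1.2f;
    const int   integralFactor = 4;           // constant added to the header in this commit

    float scale = 1.0f;                       // like PyrLavel::scale for level 'order'
    int used = 0;                             // columns of the packed integral consumed so far ('acc')

    for (int order = 0; ; ++order, scale *= scaleFactor)
    {
        int sW = (int)std::lround(frameW / scale);        // scaled frame (PyrLavel::sFrame)
        int sH = (int)std::lround(frameH / scale);
        int workW = sW - winW + 1;                        // sliding-window work area
        int workH = sH - winH + 1;
        if (workW <= 0 || workH <= 0)
            break;                                        // level is no longer feasible

        if (used + sW + 1 > integralFactor * (frameW + 1))
            break;                                        // batch full: the real code launches classifyPyramid here and keeps going

        float step = 1.f + (scale <= 2.f);                // 2-pixel stride at fine scales, 1 at coarse
        int windows = (int)std::ceil(workW / step) * (int)std::ceil(workH / step);

        printf("level %d: scale %.2f, %dx%d, x-offset %d, %d windows\n",
               order, scale, sW, sH, used, windows);

        used += sW + 1;                                   // next level's integral starts right after this one
    }
    return 0;
}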
@@ -216,10 +216,10 @@ namespace cv { namespace gpu { namespace device

         struct Classifier
         {
             __host__ __device__ __forceinline__ Classifier(const int* _integral, int _pitch, const Stage* _stages, const ClNode* _nodes, const float* _leaves,
                 const int* _subsets, const uchar4* _features, int _nstages, int _clWidth, int _clHeight, float _scale, int _step, int _subsetSize)
             : integral(_integral), pitch(_pitch), stages(_stages), nodes(_nodes), leaves(_leaves), subsets(_subsets), features(_features), nstages(_nstages),
               clWidth(_clWidth), clHeight(_clHeight), scale(_scale), step(_step), subsetSize(_subsetSize){}

             __device__ __forceinline__ void operator() (int y, int x, DevMem2D_<int4> objects, const unsigned int maxN, unsigned int* n) const
             {
@@ -255,11 +255,7 @@ namespace cv { namespace gpu { namespace device
                 rect.z = clWidth;
                 rect.w = clHeight;

-#if (__CUDA_ARCH__ < 120)
-                int res = __atomicInc(n, maxN);
-#else
-                int res = atomicInc(n, maxN);
-#endif
+                int res = Emulation::smem::atomicInc(n, maxN);
                 objects(0, res) = rect;
             }

@@ -317,26 +313,17 @@ namespace cv { namespace gpu { namespace device
             __syncthreads();

             int cls = labels[tid];
-#if (__CUDA_ARCH__ < 120)
-            __atomicAdd((rrects + cls * 4 + 0), candidates[tid].x);
-            __atomicAdd((rrects + cls * 4 + 1), candidates[tid].y);
-            __atomicAdd((rrects + cls * 4 + 2), candidates[tid].z);
-            __atomicAdd((rrects + cls * 4 + 3), candidates[tid].w);
-#else
-            atomicAdd((rrects + cls * 4 + 0), candidates[tid].x);
-            atomicAdd((rrects + cls * 4 + 1), candidates[tid].y);
-            atomicAdd((rrects + cls * 4 + 2), candidates[tid].z);
-            atomicAdd((rrects + cls * 4 + 3), candidates[tid].w);
-#endif
+            Emulation::smem::atomicAdd((rrects + cls * 4 + 0), candidates[tid].x);
+            Emulation::smem::atomicAdd((rrects + cls * 4 + 1), candidates[tid].y);
+            Emulation::smem::atomicAdd((rrects + cls * 4 + 2), candidates[tid].z);
+            Emulation::smem::atomicAdd((rrects + cls * 4 + 3), candidates[tid].w);
             __syncthreads();
             labels[tid] = 0;

             __syncthreads();
-#if (__CUDA_ARCH__ < 120)
-            __atomicInc((unsigned int*)labels + cls, n);
-#else
-            atomicInc((unsigned int*)labels + cls, n);
-#endif
+            Emulation::smem::atomicInc((unsigned int*)labels + cls, n);
             __syncthreads();
             *nclasses = 0;

@@ -354,30 +341,26 @@ namespace cv { namespace gpu { namespace device

             if (active && active >= groupThreshold)
             {
                 int* r1 = rrects + tid * 4;
                 int4 r_out = make_int4(r1[0], r1[1], r1[2], r1[3]);

-#if (__CUDA_ARCH__ < 120)
-                objects[__atomicInc(nclasses, n)] = r_out;
-#else
-                int aidx = atomicInc(nclasses, n);
+                int aidx = Emulation::smem::atomicInc(nclasses, n);
                 objects[aidx] = r_out;
-#endif
             }
         }

         void classifyStumpFixed(const DevMem2Di& integral, const int pitch, const DevMem2Db& mstages, const int nstages, const DevMem2Di& mnodes, const DevMem2Df& mleaves, const DevMem2Di& msubsets, const DevMem2Db& mfeatures,
             const int workWidth, const int workHeight, const int clWidth, const int clHeight, float scale, int step, int subsetSize, DevMem2D_<int4> objects, unsigned int* classified)
         {
             Classifier clr(integral, pitch, (Stage*)mstages.ptr(), (ClNode*)mnodes.ptr(), mleaves.ptr(), msubsets,
                 (uchar4*)mfeatures.ptr(), nstages, clWidth, clHeight, scale, step, subsetSize);

             int total = ceilf(workHeight / (float)step) * ceilf(workWidth / (float)step);

             int block = 256;
             int grid = divUp(total, block);
             lbp_classify_stump<<<grid, block>>>(clr, objects, objects.cols, classified, workWidth >> 1);
             cudaSafeCall( cudaGetLastError() );
         }

         void connectedConmonents(DevMem2D_<int4> candidates, int ncandidates, DevMem2D_<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses)
@@ -385,7 +368,124 @@ namespace cv { namespace gpu { namespace device
             int block = ncandidates;
             int smem = block * ( sizeof(int) + sizeof(int4) );
             disjoin<InSameComponint><<<1, block, smem>>>(candidates, objects, ncandidates, groupThreshold, grouping_eps, nclasses);
             cudaSafeCall( cudaGetLastError() );
+        }
+
+        struct Cascade
+        {
+            __host__ __device__ __forceinline__ Cascade(const Stage* _stages, int _nstages, const ClNode* _nodes, const float* _leaves,
+                const int* _subsets, const uchar4* _features, int _subsetSize)
+
+            : stages(_stages), nstages(_nstages), nodes(_nodes), leaves(_leaves), subsets(_subsets), features(_features), subsetSize(_subsetSize){}
+
+            __device__ __forceinline__ bool operator() (int y, int x, int* integral, const int pitch/*, DevMem2D_<int4> objects, const unsigned int maxN, unsigned int* n*/) const
+            {
+                int current_node = 0;
+                int current_leave = 0;
+
+                for (int s = 0; s < nstages; ++s)
+                {
+                    float sum = 0;
+                    Stage stage = stages[s];
+                    for (int t = 0; t < stage.ntrees; t++)
+                    {
+                        ClNode node = nodes[current_node];
+                        uchar4 feature = features[node.featureIdx];
+
+                        int shift;
+                        int c = evaluator(integral, (y + feature.y) * pitch + x + feature.x, feature.w * pitch, feature.z, shift);
+                        int idx = (subsets[ current_node * subsetSize + c] & ( 1 << shift)) ? current_leave : current_leave + 1;
+                        sum += leaves[idx];
+
+                        current_node += 1;
+                        current_leave += 2;
+                    }
+
+                    if (sum < stage.threshold)
+                        return false;
+                }
+
+                return true;
+            }
+
+            const Stage* stages;
+            const int nstages;
+
+            const ClNode* nodes;
+            const float* leaves;
+            const int* subsets;
+            const uchar4* features;
+
+            const int subsetSize;
+            const LBP evaluator;
+        };
+
+        // stepShift, scale, width_k, sum_prev => y = sum_prev + tid_k / width_k, x = tid_k - tid_k / width_k
+        __global__ void lbp_cascade(const Cascade cascade, int frameW, int frameH, int windowW, int windowH, float scale, const float factor,
+            const int workAmount, int* integral, const int pitch, DevMem2D_<int4> objects, unsigned int* classified)
+        {
+            int ftid = blockIdx.x * blockDim.x + threadIdx.x;
+            if (ftid >= workAmount ) return;
+
+            int sum = 0;
+            // float scale = 1.0f;
+            float stepShift = (scale <= 2.f) ? 2.0 : 1.0;
+            int w = ceilf( ( __float2int_rn(frameW / scale) - windowW + 1) / stepShift);
+            int h = ceilf( ( __float2int_rn(frameH / scale) - windowH + 1) / stepShift);
+
+            // if (!ftid)
+            //     printf("!!!!: %d %d", w, h);
+
+            int framTid = ftid;
+            int i = 0;
+
+            while (1)
+            {
+                if (framTid < (w - 1) * (h - 1)) break;
+                i++;
+                sum += __float2int_rn(frameW / scale) + 1;
+                framTid -= w * h;
+                scale *= factor;
+                stepShift = (scale <= 2.f) ? 2.0 : 1.0;
+                int w = ceilf( ( __float2int_rn(frameW / scale) - windowW + 1) / stepShift);
+                int h = ceilf( ( __float2int_rn(frameH / scale) - windowH + 1) / stepShift);
+            }
+
+            int y = (framTid / w);
+            int x = (framTid - y * w) * stepShift;
+            y *= stepShift;
+            x += sum;
+
+            // if (i == 2)
+            //     printf("!!!!!!!!!!!!!! %f %d %d %d\n", windowW * scale, sum, y, x);
+
+            if (cascade(y, x, integral, pitch))
+            {
+                int4 rect;
+                rect.x = roundf( (x - sum) * scale);
+                rect.y = roundf(y * scale);
+                rect.z = roundf(windowW * scale);
+                rect.w = roundf(windowH * scale);
+
+                if (rect.x > frameW || rect.y > frameH) return;
+                // printf("OUTLAUER %d %d %d %d %d %d %d %d %d %f %f\n", x, y, ftid, framTid, rect.x, rect.y, sum, w, h, stepShift, scale);
+
+                // printf("passed: %d %d ---- %d %d %d %d %d\n", y, x, rect.x, rect.y, rect.z, rect.w, sum);
+
+                int res = Emulation::smem::atomicInc(classified, (unsigned int)objects.cols);
+                objects(0, res) = rect;
+
+            }
+        }
+
+        void classifyPyramid(int frameW, int frameH, int windowW, int windowH, float initialScale, float factor, int workAmount,
+            const DevMem2Db& mstages, const int nstages, const DevMem2Di& mnodes, const DevMem2Df& mleaves, const DevMem2Di& msubsets, const DevMem2Db& mfeatures,
+            const int subsetSize, DevMem2D_<int4> objects, unsigned int* classified, DevMem2Di integral)
+        {
+            const int block = 256;
+            int grid = divUp(workAmount, block);
+            Cascade cascade((Stage*)mstages.ptr(), nstages, (ClNode*)mnodes.ptr(), mleaves.ptr(), msubsets.ptr(), (uchar4*)mfeatures.ptr(), subsetSize);
+            lbp_cascade<<<grid, block>>>(cascade, frameW, frameH, windowW, windowH, initialScale, factor, workAmount, integral.ptr(), integral.step / sizeof(int), objects, classified);
         }
     }
 }}}
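To make the indexing in the new lbp_cascade kernel above easier to follow: each thread gets one flat id across every sliding window of every pyramid level, and recovers its level (scale), its (x, y) inside that level, and the x-offset ('sum') of that level's slice of the packed integral image. The host-side C++ sketch below walks the same arithmetic with a slightly simplified per-level bound (the kernel's own check differs); the frame, window and sample thread id are made-up values.

// Sketch only: decode a flat work-item id into (scale, x, y), in the spirit of lbp_cascade.
#include <cmath>
#include <cstdio>

int main()
{
    const int   frameW = 640, frameH = 480;   // hypothetical frame
    const int   winW = 24, winH = 24;         // hypothetical window
    const float factor = 1.2f;

    int   ftid = 100000;                      // hypothetical flat thread id
    float scale = 1.0f;
    float stepShift = (scale <= 2.f) ? 2.f : 1.f;
    int   w = (int)std::ceil(((int)std::lround(frameW / scale) - winW + 1) / stepShift);
    int   h = (int)std::ceil(((int)std::lround(frameH / scale) - winH + 1) / stepShift);
    int   sum = 0;                            // x-offset of the current level in the packed integral

    while (ftid >= w * h)                     // this id belongs to a later (coarser) level
    {
        ftid -= w * h;
        sum  += (int)std::lround(frameW / scale) + 1;     // skip this level's integral columns
        scale *= factor;
        stepShift = (scale <= 2.f) ? 2.f : 1.f;
        w = (int)std::ceil(((int)std::lround(frameW / scale) - winW + 1) / stepShift);
        h = (int)std::ceil(((int)std::lround(frameH / scale) - winH + 1) / stepShift);
    }

    int y = (int)((ftid / w) * stepShift);                // window position in the scaled frame
    int x = (int)((ftid % w) * stepShift) + sum;          // shifted into this level's integral slice

    printf("scale %.2f, integral coords (%d, %d), frame rect at (%d, %d) size %dx%d\n",
           scale, x, y, (int)std::lround((x - sum) * scale), (int)std::lround(y * scale),
           (int)std::lround(winW * scale), (int)std::lround(winH * scale));
    return 0;
}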
@@ -44,18 +44,19 @@
 #define OPENCV_GPU_EMULATION_HPP_

 #include "warp_reduce.hpp"
+#include <stdio.h>

 namespace cv { namespace gpu { namespace device
 {
     struct Emulation
     {
         template<int CTA_SIZE>
         static __forceinline__ __device__ int Ballot(int predicate)
         {
-#if (__CUDA_ARCH__ >= 200)
+#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
             return __ballot(predicate);
 #else
-            __shared__ volatile int cta_buffer[CTA_SIZE]
+            __shared__ volatile int cta_buffer[CTA_SIZE];

             int tid = threadIdx.x;
             cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0;
@@ -63,41 +64,62 @@ namespace cv { namespace gpu { namespace device
 #endif
         }

         struct smem
         {
             enum { TAG_MASK = (1U << ( (sizeof(unsigned int) << 3) - 5U)) - 1U };

             template<typename T>
             static __device__ __forceinline__ T atomicInc(T* address, T val)
             {
-#if (__CUDA_ARCH__ < 120)
-#else
-#endif
-            }
-
-            template<typename T>
-            static __device__ __forceinline__ void atomicAdd(T* address, T val)
-            {
-#if (__CUDA_ARCH__ < 120)
-#else
-#endif
-            }
-
-            template<typename T>
-            __device__ __forceinline__ T __atomicMin(T* address, T val)
-            {
-#if (__CUDA_ARCH__ < 120)
-#else
-#endif
-            }
-        };
+#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
+                T count;
+                unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
+                do
+                {
+                    count = *address & TAG_MASK;
+                    count = tag | (count + 1);
+                    *address = count;
+                } while (*address != count);
+
+                return (count & TAG_MASK) - 1;
+#else
+                return ::atomicInc(address, val);
+#endif
+            }
+
+            template<typename T>
+            static __device__ __forceinline__ void atomicAdd(T* address, T val)
+            {
+#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
+                T count;
+                unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
+                do
+                {
+                    count = *address & TAG_MASK;
+                    count = tag | (count + val);
+                    *address = count;
+                } while (*address != count);
+#else
+                ::atomicAdd(address, val);
+#endif
+            }
+
+            template<typename T>
+            static __device__ __forceinline__ T atomicMin(T* address, T val)
+            {
+#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
+                T count = min(*address, val);
+                do
+                {
+                    *address = count;
+                } while (*address > count);
+
+                return count;
+#else
+                return ::atomicMin(address, val);
+#endif
+            }
+        };
     };
 }}} // namespace cv { namespace gpu { namespace device
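The pre-SM-1.2 branch of Emulation::smem above emulates shared-memory atomics with a tag trick: every contending thread stamps the top 5 bits of the word with its lane id, and only the thread whose stamped write survives the race leaves the retry loop. The small host-only C++ program below merely illustrates the bit layout for a single, uncontended update; the lane id and values are made up.

// Sketch only: the TAG_MASK layout used by the emulated atomics
// (low 27 bits = counter value, top 5 bits = id of the thread whose write won).
#include <cstdio>

int main()
{
    const unsigned int TAG_SHIFT = (sizeof(unsigned int) << 3) - 5U;   // 32 - 5 = 27
    const unsigned int TAG_MASK  = (1U << TAG_SHIFT) - 1U;             // low 27 bits

    unsigned int word = 0;                     // stands in for the shared-memory cell
    unsigned int tid  = 19;                    // hypothetical lane id, fits in 5 bits

    unsigned int count = word & TAG_MASK;      // strip the previous winner's tag
    word = (tid << TAG_SHIFT) | (count + 1);   // stamp our tag onto the incremented value

    printf("stored word  = 0x%08x\n", word);
    printf("counter      = %u\n", word & TAG_MASK);    // 1
    printf("winning lane = %u\n", word >> TAG_SHIFT);  // 19
    return 0;
}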
@@ -44,52 +44,11 @@
 #define __OPENCV_GPU_DEVICE_LBP_HPP_

 #include "internal_shared.hpp"
+#include <opencv2/gpu/device/emulation.hpp>

 namespace cv { namespace gpu { namespace device {

-namespace lbp{
+namespace lbp {

-#define TAG_MASK ( (1U << ( (sizeof(unsigned int) << 3) - 5U)) - 1U )
-
-template<typename T>
-__device__ __forceinline__ T __atomicInc(T* address, T val)
-{
-    T count;
-    unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
-    do
-    {
-        count = *address & TAG_MASK;
-        count = tag | (count + 1);
-        *address = count;
-    } while (*address != count);
-
-    return (count & TAG_MASK) - 1;
-}
-
-template<typename T>
-__device__ __forceinline__ void __atomicAdd(T* address, T val)
-{
-    T count;
-    unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
-    do
-    {
-        count = *address & TAG_MASK;
-        count = tag | (count + val);
-        *address = count;
-    } while (*address != count);
-}
-
-template<typename T>
-__device__ __forceinline__ T __atomicMin(T* address, T val)
-{
-    T count = min(*address, val);
-    do
-    {
-        *address = count;
-    } while (*address > count);
-
-    return count;
-}
-
 struct Stage
 {
@@ -127,27 +86,25 @@ namespace lbp{
         unsigned tid = threadIdx.x;
         labels[tid] = tid;
         __syncthreads();

         for (unsigned int id = 0; id < n; id++)
         {
             if (tid != id && predicate(vec[tid], vec[id]))
             {
                 int p = labels[tid];
                 int q = labels[id];
-                if (p != q)
-                {
-                    int m = min(p, q);
-#if (__CUDA_ARCH__ < 120)
-                    __atomicMin(labels + id, m);
-#else
-                    atomicMin(labels + id, m);
-#endif
-                }
+                if (p < q)
+                {
+                    Emulation::smem::atomicMin(labels + id, p);
+                }
+                else if (p > q)
+                {
+                    Emulation::smem::atomicMin(labels + tid, q);
+                }
             }
         }
         __syncthreads();
     }

 } // lbp

 } } }// namespaces
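Finally, the disjoin change above replaces the #if'd atomicMin pair with two directed Emulation::smem::atomicMin calls that pull both candidates' labels toward the smaller one. The host-side C++ sketch below shows the same label-merging idea sequentially, with a made-up similarity relation standing in for the rectangle predicate; the GPU code instead runs one thread per candidate and relies on atomicMin for the concurrent updates.

// Sketch only: repeatedly take the minimum label of any similar neighbour until stable;
// each group of mutually-similar candidates converges to the smallest index it contains.
#include <algorithm>
#include <cstdio>

int main()
{
    const int n = 5;
    bool similar[n][n] = {};                    // hypothetical adjacency: {0,1,2} and {3,4}
    similar[0][1] = similar[1][0] = true;
    similar[1][2] = similar[2][1] = true;
    similar[3][4] = similar[4][3] = true;

    int labels[n];
    for (int i = 0; i < n; ++i) labels[i] = i;

    bool changed = true;
    while (changed)
    {
        changed = false;
        for (int i = 0; i < n; ++i)
            for (int j = 0; j < n; ++j)
                if (i != j && similar[i][j] && labels[i] != labels[j])
                {
                    int m = std::min(labels[i], labels[j]);
                    labels[i] = labels[j] = m;
                    changed = true;
                }
    }

    for (int i = 0; i < n; ++i)
        printf("candidate %d -> group %d\n", i, labels[i]);
    return 0;
}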