Canny

2012-11-12 13:02:17 +04:00 · 2012-11-12 13:02:17 +04:00 · 28716d7f30
commit 28716d7f30
parent e299595667
4 changed files with 450 additions and 477 deletions
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@ -792,31 +792,23 @@ private:
    GpuMat lab, l, ab;
 };
 struct CV_EXPORTS CannyBuf
 {
    void create(const Size& image_size, int apperture_size = 3);
    void release();
-struct CV_EXPORTS CannyBuf;
+    GpuMat dx, dy;
    GpuMat mag;
    GpuMat map;
    GpuMat st1, st2;
    Ptr<FilterEngine_GPU> filterDX, filterDY;
 };
 CV_EXPORTS void Canny(const GpuMat& image, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);
 CV_EXPORTS void Canny(const GpuMat& image, CannyBuf& buf, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);
 CV_EXPORTS void Canny(const GpuMat& dx, const GpuMat& dy, GpuMat& edges, double low_thresh, double high_thresh, bool L2gradient = false);
 CV_EXPORTS void Canny(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& edges, double low_thresh, double high_thresh, bool L2gradient = false);
 struct CV_EXPORTS CannyBuf
 {
    CannyBuf() {}
    explicit CannyBuf(const Size& image_size, int apperture_size = 3) {create(image_size, apperture_size);}
    CannyBuf(const GpuMat& dx_, const GpuMat& dy_);
    void create(const Size& image_size, int apperture_size = 3);
    void release();
    GpuMat dx, dy;
    GpuMat dx_buf, dy_buf;
    GpuMat edgeBuf;
    GpuMat trackBuf1, trackBuf2;
    Ptr<FilterEngine_GPU> filterDX, filterDY;
 };
 class CV_EXPORTS ImagePyramid
 {
 public:
--- a/modules/gpu/src/cuda/canny.cu
+++ b/modules/gpu/src/cuda/canny.cu
@ -43,459 +43,463 @@
 #if !defined CUDA_DISABLER
 #include <utility>
-#include <algorithm>
+#include "opencv2/gpu/device/common.hpp"
-#include "internal_shared.hpp"
+#include "opencv2/gpu/device/emulation.hpp"
 #include "opencv2/gpu/device/transform.hpp"
 #include "opencv2/gpu/device/functional.hpp"
 #include "opencv2/gpu/device/utility.hpp"
 using namespace cv::gpu;
 using namespace cv::gpu::device;
 namespace
 {
    struct L1 : binary_function<int, int, float>
    {
        __device__ __forceinline__ float operator ()(int x, int y) const
        {
            return ::abs(x) + ::abs(y);
        }
        __device__ __forceinline__ L1() {}
        __device__ __forceinline__ L1(const L1&) {}
    };
    struct L2 : binary_function<int, int, float>
    {
        __device__ __forceinline__ float operator ()(int x, int y) const
        {
            return ::sqrtf(x * x + y * y);
        }
        __device__ __forceinline__ L2() {}
        __device__ __forceinline__ L2(const L2&) {}
    };
 }
 namespace cv { namespace gpu { namespace device
 {
-    namespace canny
+    template <> struct TransformFunctorTraits<L1> : DefaultTransformFunctorTraits<L1>
    {
-        __global__ void calcSobelRowPass(const PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
+        enum { smart_shift = 4 };
    };
    template <> struct TransformFunctorTraits<L2> : DefaultTransformFunctorTraits<L2>
    {
        enum { smart_shift = 4 };
    };
 }}}
 namespace
 {
    texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_src(false, cudaFilterModePoint, cudaAddressModeClamp);
    struct SrcTex
    {
        const int xoff;
        const int yoff;
        __host__ SrcTex(int _xoff, int _yoff) : xoff(_xoff), yoff(_yoff) {}
        __device__ __forceinline__ int operator ()(int y, int x) const
        {
-            __shared__ int smem[16][18];
+            return tex2D(tex_src, x + xoff, y + yoff);
        }
    };
-            const int j = blockIdx.x * blockDim.x + threadIdx.x;
+    template <class Norm> __global__
-            const int i = blockIdx.y * blockDim.y + threadIdx.y;
+    void calcMagnitude(const SrcTex src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm)
    {
        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;
-            if (i < rows)
+        if (y >= mag.rows || x >= mag.cols)
-            {
+            return;
                smem[threadIdx.y][threadIdx.x + 1] = src.ptr(i)[j];
                if (threadIdx.x == 0)
                {
                    smem[threadIdx.y][0] = src.ptr(i)[::max(j - 1, 0)];
                    smem[threadIdx.y][17] = src.ptr(i)[::min(j + 16, cols - 1)];
                }
                __syncthreads();
-                if (j < cols)
+        int dxVal = (src(y - 1, x + 1) + 2 * src(y, x + 1) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y, x - 1) + src(y + 1, x - 1));
-                {
+        int dyVal = (src(y + 1, x - 1) + 2 * src(y + 1, x) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y - 1, x) + src(y - 1, x + 1));
-                    dx_buf.ptr(i)[j] = -smem[threadIdx.y][threadIdx.x] + smem[threadIdx.y][threadIdx.x + 2];
+
-                    dy_buf.ptr(i)[j] = smem[threadIdx.y][threadIdx.x] + 2 * smem[threadIdx.y][threadIdx.x + 1] + smem[threadIdx.y][threadIdx.x + 2];
+        dx(y, x) = dxVal;
-                }
+        dy(y, x) = dyVal;
-            }
+
        mag(y, x) = norm(dxVal, dyVal);
    }
 }
 namespace canny
 {
    void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad)
    {
        const dim3 block(16, 16);
        const dim3 grid(divUp(mag.cols, block.x), divUp(mag.rows, block.y));
        bindTexture(&tex_src, srcWhole);
        SrcTex src(xoff, yoff);
        if (L2Grad)
        {
            L2 norm;
            ::calcMagnitude<<<grid, block>>>(src, dx, dy, mag, norm);
        }
        else
        {
            L1 norm;
            ::calcMagnitude<<<grid, block>>>(src, dx, dy, mag, norm);
        }
-        void calcSobelRowPass_gpu(PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
+        cudaSafeCall( cudaGetLastError() );
        cudaSafeCall(cudaThreadSynchronize());
    }
    void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad)
    {
        if (L2Grad)
        {
-            dim3 block(16, 16, 1);
+            L2 norm;
-            dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
+            transform(dx, dy, mag, norm, WithOutMask(), 0);
            calcSobelRowPass<<<grid, block>>>(src, dx_buf, dy_buf, rows, cols);
            cudaSafeCall( cudaGetLastError() );
            cudaSafeCall( cudaDeviceSynchronize() );
        }
-
+        else
        struct L1
        {
-            static __device__ __forceinline__ float calc(int x, int y)
+            L1 norm;
-            {
+            transform(dx, dy, mag, norm, WithOutMask(), 0);
                return ::abs(x) + ::abs(y);
            }
        };
        struct L2
        {
            static __device__ __forceinline__ float calc(int x, int y)
            {
                return ::sqrtf(x * x + y * y);
            }
        };
        template <typename Norm> __global__ void calcMagnitude(const PtrStepi dx_buf, const PtrStepi dy_buf,
            PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
        {
            __shared__ int sdx[18][16];
            __shared__ int sdy[18][16];
            const int j = blockIdx.x * blockDim.x + threadIdx.x;
            const int i = blockIdx.y * blockDim.y + threadIdx.y;
            if (j < cols)
            {
                sdx[threadIdx.y + 1][threadIdx.x] = dx_buf.ptr(i)[j];
                sdy[threadIdx.y + 1][threadIdx.x] = dy_buf.ptr(i)[j];
                if (threadIdx.y == 0)
                {
                    sdx[0][threadIdx.x] = dx_buf.ptr(::max(i - 1, 0))[j];
                    sdx[17][threadIdx.x] = dx_buf.ptr(::min(i + 16, rows - 1))[j];
                    sdy[0][threadIdx.x] = dy_buf.ptr(::max(i - 1, 0))[j];
                    sdy[17][threadIdx.x] = dy_buf.ptr(::min(i + 16, rows - 1))[j];
                }
                __syncthreads();
                if (i < rows)
                {
                    int x = sdx[threadIdx.y][threadIdx.x] + 2 * sdx[threadIdx.y + 1][threadIdx.x] + sdx[threadIdx.y + 2][threadIdx.x];
                    int y = -sdy[threadIdx.y][threadIdx.x] + sdy[threadIdx.y + 2][threadIdx.x];
                    dx.ptr(i)[j] = x;
                    dy.ptr(i)[j] = y;
                    mag.ptr(i + 1)[j + 1] = Norm::calc(x, y);
                }
            }
        }
    }
 }
-        void calcMagnitude_gpu(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)
+//////////////////////////////////////////////////////////////////////////////////////////
 namespace
 {
    texture<float, cudaTextureType2D, cudaReadModeElementType> tex_mag(false, cudaFilterModePoint, cudaAddressModeClamp);
    __global__ void calcMap(const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh)
    {
        const int CANNY_SHIFT = 15;
        const int TG22 = (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5);
        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;
        if (x >= dx.cols || y >= dx.rows)
            return;
        int dxVal = dx(y, x);
        int dyVal = dy(y, x);
        const int s = (dxVal ^ dyVal) < 0 ? -1 : 1;
        const float m = tex2D(tex_mag, x, y);
        dxVal = ::abs(dxVal);
        dyVal = ::abs(dyVal);
        // 0 - the pixel can not belong to an edge
        // 1 - the pixel might belong to an edge
        // 2 - the pixel does belong to an edge
        int edge_type = 0;
        if (m > low_thresh)
        {
-            dim3 block(16, 16, 1);
+            const int tg22x = dxVal * TG22;
-            dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
+            const int tg67x = tg22x + ((dxVal + dxVal) << CANNY_SHIFT);
-            if (L2Grad)
+            dyVal <<= CANNY_SHIFT;
-                calcMagnitude<L2><<<grid, block>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols);
+
            if (dyVal < tg22x)
            {
                if (m > tex2D(tex_mag, x - 1, y) && m >= tex2D(tex_mag, x + 1, y))
                    edge_type = 1 + (int)(m > high_thresh);
            }
            else if(dyVal > tg67x)
            {
                if (m > tex2D(tex_mag, x, y - 1) && m >= tex2D(tex_mag, x, y + 1))
                    edge_type = 1 + (int)(m > high_thresh);
            }
            else
-                calcMagnitude<L1><<<grid, block>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols);
+            {
-
+                if (m > tex2D(tex_mag, x - s, y - 1) && m >= tex2D(tex_mag, x + s, y + 1))
-            cudaSafeCall( cudaGetLastError() );
+                    edge_type = 1 + (int)(m > high_thresh);
-
+            }
            cudaSafeCall(cudaThreadSynchronize());
        }
-        template <typename Norm> __global__ void calcMagnitude(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
+        map(y, x) = edge_type;
-        {
+    }
-            const int j = blockIdx.x * blockDim.x + threadIdx.x;
+}
            const int i = blockIdx.y * blockDim.y + threadIdx.y;
-            if (i < rows && j < cols)
+namespace canny
-                mag.ptr(i + 1)[j + 1] = Norm::calc(dx.ptr(i)[j], dy.ptr(i)[j]);
+{
    void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh)
    {
        const dim3 block(16, 16);
        const dim3 grid(divUp(dx.cols, block.x), divUp(dx.rows, block.y));
        bindTexture(&tex_mag, mag);
        ::calcMap<<<grid, block>>>(dx, dy, map, low_thresh, high_thresh);
        cudaSafeCall( cudaGetLastError() );
        cudaSafeCall( cudaDeviceSynchronize() );
    }
 }
 //////////////////////////////////////////////////////////////////////////////////////////
 namespace
 {
    __device__ int counter = 0;
    __global__ void edgesHysteresisLocal(PtrStepSzi map, ushort2* st)
    {
        __shared__ volatile int smem[18][18];
        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;
        smem[threadIdx.y + 1][threadIdx.x + 1] = x < map.cols && y < map.rows ? map(y, x) : 0;
        if (threadIdx.y == 0)
            smem[0][threadIdx.x + 1] = y > 0 ? map(y - 1, x) : 0;
        if (threadIdx.y == blockDim.y - 1)
            smem[blockDim.y + 1][threadIdx.x + 1] = y + 1 < map.rows ? map(y + 1, x) : 0;
        if (threadIdx.x == 0)
            smem[threadIdx.y + 1][0] = x > 0 ? map(y, x - 1) : 0;
        if (threadIdx.x == blockDim.x - 1)
            smem[threadIdx.y + 1][blockDim.x + 1] = x + 1 < map.cols ? map(y, x + 1) : 0;
        if (threadIdx.x == 0 && threadIdx.y == 0)
            smem[0][0] = y > 0 && x > 0 ? map(y - 1, x - 1) : 0;
        if (threadIdx.x == blockDim.x - 1 && threadIdx.y == 0)
            smem[0][blockDim.x + 1] = y > 0 && x + 1 < map.cols ? map(y - 1, x + 1) : 0;
        if (threadIdx.x == 0 && threadIdx.y == blockDim.y - 1)
            smem[blockDim.y + 1][0] = y + 1 < map.rows && x > 0 ? map(y + 1, x - 1) : 0;
        if (threadIdx.x == blockDim.x - 1 && threadIdx.y == blockDim.y - 1)
            smem[blockDim.y + 1][blockDim.x + 1] = y + 1 < map.rows && x + 1 < map.cols ? map(y + 1, x + 1) : 0;
        __syncthreads();
        if (x >= map.cols || y >= map.rows)
            return;
        int n;
        #pragma unroll
        for (int k = 0; k < 16; ++k)
        {
            n = 0;
            if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1)
            {
                n += smem[threadIdx.y    ][threadIdx.x    ] == 2;
                n += smem[threadIdx.y    ][threadIdx.x + 1] == 2;
                n += smem[threadIdx.y    ][threadIdx.x + 2] == 2;
                n += smem[threadIdx.y + 1][threadIdx.x    ] == 2;
                n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2;
                n += smem[threadIdx.y + 2][threadIdx.x    ] == 2;
                n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2;
                n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2;
            }
            if (n > 0)
                smem[threadIdx.y + 1][threadIdx.x + 1] = 2;
        }
-        void calcMagnitude_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)
+        const int e = smem[threadIdx.y + 1][threadIdx.x + 1];
        map(y, x) = e;
        n = 0;
        if (e == 2)
        {
-            dim3 block(16, 16, 1);
+            n += smem[threadIdx.y    ][threadIdx.x    ] == 1;
-            dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
+            n += smem[threadIdx.y    ][threadIdx.x + 1] == 1;
            n += smem[threadIdx.y    ][threadIdx.x + 2] == 1;
-            if (L2Grad)
+            n += smem[threadIdx.y + 1][threadIdx.x    ] == 1;
-                calcMagnitude<L2><<<grid, block>>>(dx, dy, mag, rows, cols);
+            n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1;
            else
                calcMagnitude<L1><<<grid, block>>>(dx, dy, mag, rows, cols);
-            cudaSafeCall( cudaGetLastError() );
+            n += smem[threadIdx.y + 2][threadIdx.x    ] == 1;
-
+            n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1;
-            cudaSafeCall( cudaDeviceSynchronize() );
+            n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1;
        }
-        //////////////////////////////////////////////////////////////////////////////////////////
+        if (n > 0)
        #define CANNY_SHIFT 15
        #define TG22        (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5)
        __global__ void calcMap(const PtrStepi dx, const PtrStepi dy, const PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)
        {
-            __shared__ float smem[18][18];
+            const int ind =  ::atomicAdd(&counter, 1);
            st[ind] = make_ushort2(x, y);
        }
    }
 }
-            const int j = blockIdx.x * 16 + threadIdx.x;
+namespace canny
-            const int i = blockIdx.y * 16 + threadIdx.y;
+{
    void edgesHysteresisLocal(PtrStepSzi map, ushort2* st1)
    {
        void* counter_ptr;
        cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );
-            const int tid = threadIdx.y * 16 + threadIdx.x;
+        cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
            const int lx = tid % 18;
            const int ly = tid / 18;
-            if (ly < 14)
+        const dim3 block(16, 16);
-                smem[ly][lx] = mag.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx];
+        const dim3 grid(divUp(map.cols, block.x), divUp(map.rows, block.y));
-            if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols)
+        ::edgesHysteresisLocal<<<grid, block>>>(map, st1);
-                smem[ly + 14][lx] = mag.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx];
+        cudaSafeCall( cudaGetLastError() );
        cudaSafeCall( cudaDeviceSynchronize() );
    }
 }
 //////////////////////////////////////////////////////////////////////////////////////////
 namespace
 {
    __constant__ int c_dx[8] = {-1,  0,  1, -1, 1, -1, 0, 1};
    __constant__ int c_dy[8] = {-1, -1, -1,  0, 0,  1, 1, 1};
    __global__ void edgesHysteresisGlobal(PtrStepSzi map, ushort2* st1, ushort2* st2, const int count)
    {
        const int stack_size = 512;
        __shared__ int s_counter;
        __shared__ int s_ind;
        __shared__ ushort2 s_st[stack_size];
        if (threadIdx.x == 0)
            s_counter = 0;
        __syncthreads();
        int ind = blockIdx.y * gridDim.x + blockIdx.x;
        if (ind >= count)
            return;
        ushort2 pos = st1[ind];
        if (threadIdx.x < 8)
        {
            pos.x += c_dx[threadIdx.x];
            pos.y += c_dy[threadIdx.x];
            if (pos.x > 0 && pos.x <= map.cols && pos.y > 0 && pos.y <= map.rows && map(pos.y, pos.x) == 1)
            {
                map(pos.y, pos.x) = 2;
                ind = Emulation::smem::atomicAdd(&s_counter, 1);
                s_st[ind] = pos;
            }
        }
        __syncthreads();
        while (s_counter > 0 && s_counter <= stack_size - blockDim.x)
        {
            const int subTaskIdx = threadIdx.x >> 3;
            const int portion = ::min(s_counter, blockDim.x >> 3);
            if (subTaskIdx < portion)
                pos = s_st[s_counter - 1 - subTaskIdx];
            __syncthreads();
            if (i < rows && j < cols)
            {
                int x = dx.ptr(i)[j];
                int y = dy.ptr(i)[j];
                const int s = (x ^ y) < 0 ? -1 : 1;
                const float m = smem[threadIdx.y + 1][threadIdx.x + 1];
                x = ::abs(x);
                y = ::abs(y);
                // 0 - the pixel can not belong to an edge
                // 1 - the pixel might belong to an edge
                // 2 - the pixel does belong to an edge
                int edge_type = 0;
                if (m > low_thresh)
                {
                    const int tg22x = x * TG22;
                    const int tg67x = tg22x + ((x + x) << CANNY_SHIFT);
                    y <<= CANNY_SHIFT;
                    if (y < tg22x)
                    {
                        if (m > smem[threadIdx.y + 1][threadIdx.x] && m >= smem[threadIdx.y + 1][threadIdx.x + 2])
                            edge_type = 1 + (int)(m > high_thresh);
                    }
                    else if( y > tg67x )
                    {
                        if (m > smem[threadIdx.y][threadIdx.x + 1] && m >= smem[threadIdx.y + 2][threadIdx.x + 1])
                            edge_type = 1 + (int)(m > high_thresh);
                    }
                    else
                    {
                        if (m > smem[threadIdx.y][threadIdx.x + 1 - s] && m > smem[threadIdx.y + 2][threadIdx.x + 1 + s])
                            edge_type = 1 + (int)(m > high_thresh);
                    }
                }
                map.ptr(i + 1)[j + 1] = edge_type;
            }
        }
        #undef CANNY_SHIFT
        #undef TG22
        void calcMap_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)
        {
            dim3 block(16, 16, 1);
            dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
            calcMap<<<grid, block>>>(dx, dy, mag, map, rows, cols, low_thresh, high_thresh);
            cudaSafeCall( cudaGetLastError() );
            cudaSafeCall( cudaDeviceSynchronize() );
        }
        //////////////////////////////////////////////////////////////////////////////////////////
        __device__ unsigned int counter = 0;
        __global__ void edgesHysteresisLocal(PtrStepi map, ushort2* st, int rows, int cols)
        {
            #if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 120)
            __shared__ int smem[18][18];
            const int j = blockIdx.x * 16 + threadIdx.x;
            const int i = blockIdx.y * 16 + threadIdx.y;
            const int tid = threadIdx.y * 16 + threadIdx.x;
            const int lx = tid % 18;
            const int ly = tid / 18;
            if (ly < 14)
                smem[ly][lx] = map.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx];
            if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols)
                smem[ly + 14][lx] = map.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx];
            __syncthreads();
            if (i < rows && j < cols)
            {
                int n;
                #pragma unroll
                for (int k = 0; k < 16; ++k)
                {
                    n = 0;
                    if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1)
                    {
                        n += smem[threadIdx.y    ][threadIdx.x    ] == 2;
                        n += smem[threadIdx.y    ][threadIdx.x + 1] == 2;
                        n += smem[threadIdx.y    ][threadIdx.x + 2] == 2;
                        n += smem[threadIdx.y + 1][threadIdx.x    ] == 2;
                        n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2;
                        n += smem[threadIdx.y + 2][threadIdx.x    ] == 2;
                        n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2;
                        n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2;
                    }
                    if (n > 0)
                        smem[threadIdx.y + 1][threadIdx.x + 1] = 2;
                }
                const int e = smem[threadIdx.y + 1][threadIdx.x + 1];
                map.ptr(i + 1)[j + 1] = e;
                n = 0;
                if (e == 2)
                {
                    n += smem[threadIdx.y    ][threadIdx.x    ] == 1;
                    n += smem[threadIdx.y    ][threadIdx.x + 1] == 1;
                    n += smem[threadIdx.y    ][threadIdx.x + 2] == 1;
                    n += smem[threadIdx.y + 1][threadIdx.x    ] == 1;
                    n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1;
                    n += smem[threadIdx.y + 2][threadIdx.x    ] == 1;
                    n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1;
                    n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1;
                }
                if (n > 0)
                {
                    const unsigned int ind = atomicInc(&counter, (unsigned int)(-1));
                    st[ind] = make_ushort2(j + 1, i + 1);
                }
            }
            #endif
        }
        void edgesHysteresisLocal_gpu(PtrStepi map, ushort2* st1, int rows, int cols)
        {
            void* counter_ptr;
            cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );
            cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) );
            dim3 block(16, 16, 1);
            dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
            edgesHysteresisLocal<<<grid, block>>>(map, st1, rows, cols);
            cudaSafeCall( cudaGetLastError() );
            cudaSafeCall( cudaDeviceSynchronize() );
        }
        __constant__ int c_dx[8] = {-1,  0,  1, -1, 1, -1, 0, 1};
        __constant__ int c_dy[8] = {-1, -1, -1,  0, 0,  1, 1, 1};
        __global__ void edgesHysteresisGlobal(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols, int count)
        {
            #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 120
            const int stack_size = 512;
            __shared__ unsigned int s_counter;
            __shared__ unsigned int s_ind;
            __shared__ ushort2 s_st[stack_size];
            if (threadIdx.x == 0)
-                s_counter = 0;
+                s_counter -= portion;
            __syncthreads();
-            int ind = blockIdx.y * gridDim.x + blockIdx.x;
+            if (subTaskIdx < portion)
            if (ind < count)
            {
-                ushort2 pos = st1[ind];
+                pos.x += c_dx[threadIdx.x & 7];
                pos.y += c_dy[threadIdx.x & 7];
-                if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
+                if (pos.x > 0 && pos.x <= map.cols && pos.y > 0 && pos.y <= map.rows && map(pos.y, pos.x) == 1)
                {
-                    if (threadIdx.x < 8)
+                    map(pos.y, pos.x) = 2;
                    {
                        pos.x += c_dx[threadIdx.x];
                        pos.y += c_dy[threadIdx.x];
-                        if (map.ptr(pos.y)[pos.x] == 1)
+                    ind = Emulation::smem::atomicAdd(&s_counter, 1);
                        {
                            map.ptr(pos.y)[pos.x] = 2;
-                            ind = atomicInc(&s_counter, (unsigned int)(-1));
+                    s_st[ind] = pos;
                            s_st[ind] = pos;
                        }
                    }
                    __syncthreads();
                    while (s_counter > 0 && s_counter <= stack_size - blockDim.x)
                    {
                        const int subTaskIdx = threadIdx.x >> 3;
                        const int portion = ::min(s_counter, blockDim.x >> 3);
                        pos.x = pos.y = 0;
                        if (subTaskIdx < portion)
                            pos = s_st[s_counter - 1 - subTaskIdx];
                        __syncthreads();
                        if (threadIdx.x == 0)
                            s_counter -= portion;
                        __syncthreads();
                        if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
                        {
                            pos.x += c_dx[threadIdx.x & 7];
                            pos.y += c_dy[threadIdx.x & 7];
                            if (map.ptr(pos.y)[pos.x] == 1)
                            {
                                map.ptr(pos.y)[pos.x] = 2;
                                ind = atomicInc(&s_counter, (unsigned int)(-1));
                                s_st[ind] = pos;
                            }
                        }
                        __syncthreads();
                    }
                    if (s_counter > 0)
                    {
                        if (threadIdx.x == 0)
                        {
                            ind = atomicAdd(&counter, s_counter);
                            s_ind = ind - s_counter;
                        }
                        __syncthreads();
                        ind = s_ind;
                        for (int i = threadIdx.x; i < s_counter; i += blockDim.x)
                        {
                            st2[ind + i] = s_st[i];
                        }
                    }
                }
            }
-            #endif
+            __syncthreads();
        }
-        void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols)
+        if (s_counter > 0)
        {
-            void* counter_ptr;
+            if (threadIdx.x == 0)
            cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );
            unsigned int count;
            cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
            while (count > 0)
            {
-                cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) );
+                ind = ::atomicAdd(&counter, s_counter);
-
+                s_ind = ind - s_counter;
                dim3 block(128, 1, 1);
                dim3 grid(std::min(count, 65535u), divUp(count, 65535), 1);
                edgesHysteresisGlobal<<<grid, block>>>(map, st1, st2, rows, cols, count);
                cudaSafeCall( cudaGetLastError() );
                cudaSafeCall( cudaDeviceSynchronize() );
                cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
                std::swap(st1, st2);
            }
            __syncthreads();
            ind = s_ind;
            for (int i = threadIdx.x; i < s_counter; i += blockDim.x)
                st2[ind + i] = s_st[i];
        }
    }
 }
-        __global__ void getEdges(PtrStepi map, PtrStepb dst, int rows, int cols)
+namespace canny
 {
    void edgesHysteresisGlobal(PtrStepSzi map, ushort2* st1, ushort2* st2)
    {
        void* counter_ptr;
        cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, ::counter) );
        int count;
        cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
        while (count > 0)
        {
-            const int j = blockIdx.x * 16 + threadIdx.x;
+            cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
            const int i = blockIdx.y * 16 + threadIdx.y;
-            if (i < rows && j < cols)
+            const dim3 block(128);
-                dst.ptr(i)[j] = (uchar)(-(map.ptr(i + 1)[j + 1] >> 1));
+            const dim3 grid(::min(count, 65535u), divUp(count, 65535), 1);
        }
-        void getEdges_gpu(PtrStepi map, PtrStepb dst, int rows, int cols)
+            ::edgesHysteresisGlobal<<<grid, block>>>(map, st1, st2, count);
        {
            dim3 block(16, 16, 1);
            dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
            getEdges<<<grid, block>>>(map, dst, rows, cols);
            cudaSafeCall( cudaGetLastError() );
            cudaSafeCall( cudaDeviceSynchronize() );
            cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
            std::swap(st1, st2);
        }
-    } // namespace canny
+    }
-}}} // namespace cv { namespace gpu { namespace device
+}
 //////////////////////////////////////////////////////////////////////////////////////////
-#endif /* CUDA_DISABLER */
+namespace
 {
    struct GetEdges : unary_function<int, uchar>
    {
        __device__ __forceinline__ uchar operator ()(int e) const
        {
            return (uchar)(-(e >> 1));
        }
        __device__ __forceinline__ GetEdges() {}
        __device__ __forceinline__ GetEdges(const GetEdges&) {}
    };
 }
 namespace cv { namespace gpu { namespace device
 {
    template <> struct TransformFunctorTraits<GetEdges> : DefaultTransformFunctorTraits<GetEdges>
    {
        enum { smart_shift = 4 };
    };
 }}}
 namespace canny
 {
    void getEdges(PtrStepSzi map, PtrStepSzb dst)
    {
        transform(map, dst, GetEdges(), WithOutMask(), 0);
    }
 }
 #endif /* CUDA_DISABLER */
--- a/modules/gpu/src/imgproc.cpp
+++ b/modules/gpu/src/imgproc.cpp
@ -91,7 +91,6 @@ void cv::gpu::Canny(const GpuMat&, GpuMat&, double, double, int, bool) { throw_n
 void cv::gpu::Canny(const GpuMat&, CannyBuf&, GpuMat&, double, double, int, bool) { throw_nogpu(); }
 void cv::gpu::Canny(const GpuMat&, const GpuMat&, GpuMat&, double, double, bool) { throw_nogpu(); }
 void cv::gpu::Canny(const GpuMat&, const GpuMat&, CannyBuf&, GpuMat&, double, double, bool) { throw_nogpu(); }
 cv::gpu::CannyBuf::CannyBuf(const GpuMat&, const GpuMat&) { throw_nogpu(); }
 void cv::gpu::CannyBuf::create(const Size&, int) { throw_nogpu(); }
 void cv::gpu::CannyBuf::release() { throw_nogpu(); }
@ -1466,92 +1465,76 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result,
 //////////////////////////////////////////////////////////////////////////////
 // Canny
 cv::gpu::CannyBuf::CannyBuf(const GpuMat& dx_, const GpuMat& dy_) : dx(dx_), dy(dy_)
 {
    CV_Assert(dx_.type() == CV_32SC1 && dy_.type() == CV_32SC1 && dx_.size() == dy_.size());
    create(dx_.size(), -1);
 }
 void cv::gpu::CannyBuf::create(const Size& image_size, int apperture_size)
 {
-    ensureSizeIsEnough(image_size, CV_32SC1, dx);
+    if (apperture_size > 0)
-    ensureSizeIsEnough(image_size, CV_32SC1, dy);
+    {
        ensureSizeIsEnough(image_size, CV_32SC1, dx);
        ensureSizeIsEnough(image_size, CV_32SC1, dy);
-    if (apperture_size == 3)
+        if (apperture_size != 3)
-    {
+        {
        ensureSizeIsEnough(image_size, CV_32SC1, dx_buf);
        ensureSizeIsEnough(image_size, CV_32SC1, dy_buf);
    }
    else if(apperture_size > 0)
    {
        if (!filterDX)
            filterDX = createDerivFilter_GPU(CV_8UC1, CV_32S, 1, 0, apperture_size, BORDER_REPLICATE);
        if (!filterDY)
            filterDY = createDerivFilter_GPU(CV_8UC1, CV_32S, 0, 1, apperture_size, BORDER_REPLICATE);
        }
    }
-    ensureSizeIsEnough(image_size.height + 2, image_size.width + 2, CV_32FC1, edgeBuf);
+    ensureSizeIsEnough(image_size, CV_32FC1, mag);
    ensureSizeIsEnough(image_size, CV_32SC1, map);
-    ensureSizeIsEnough(1, image_size.width * image_size.height, CV_16UC2, trackBuf1);
+    ensureSizeIsEnough(1, image_size.area(), CV_16UC2, st1);
-    ensureSizeIsEnough(1, image_size.width * image_size.height, CV_16UC2, trackBuf2);
+    ensureSizeIsEnough(1, image_size.area(), CV_16UC2, st2);
 }
 void cv::gpu::CannyBuf::release()
 {
    dx.release();
    dy.release();
-    dx_buf.release();
+    mag.release();
-    dy_buf.release();
+    map.release();
-    edgeBuf.release();
+    st1.release();
-    trackBuf1.release();
+    st2.release();
    trackBuf2.release();
 }
-namespace cv { namespace gpu { namespace device
+namespace canny
 {
-    namespace canny
+    void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad);
-    {
+    void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad);
        void calcSobelRowPass_gpu(PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols);
-        void calcMagnitude_gpu(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad);
+    void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh);
        void calcMagnitude_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad);
-        void calcMap_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh);
+    void edgesHysteresisLocal(PtrStepSzi map, ushort2* st1);
-        void edgesHysteresisLocal_gpu(PtrStepi map, ushort2* st1, int rows, int cols);
+    void edgesHysteresisGlobal(PtrStepSzi map, ushort2* st1, ushort2* st2);
-        void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols);
+    void getEdges(PtrStepSzi map, PtrStepSzb dst);
-
+}
        void getEdges_gpu(PtrStepi map, PtrStepb dst, int rows, int cols);
    }
 }}}
 namespace
 {
-    void CannyCaller(CannyBuf& buf, GpuMat& dst, float low_thresh, float high_thresh)
+    void CannyCaller(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& dst, float low_thresh, float high_thresh)
    {
-        using namespace ::cv::gpu::device::canny;
+        using namespace canny;
-        calcMap_gpu(buf.dx, buf.dy, buf.edgeBuf, buf.edgeBuf, dst.rows, dst.cols, low_thresh, high_thresh);
+        calcMap(dx, dy, buf.mag, buf.map, low_thresh, high_thresh);
-        edgesHysteresisLocal_gpu(buf.edgeBuf, buf.trackBuf1.ptr<ushort2>(), dst.rows, dst.cols);
+        edgesHysteresisLocal(buf.map, buf.st1.ptr<ushort2>());
-        edgesHysteresisGlobal_gpu(buf.edgeBuf, buf.trackBuf1.ptr<ushort2>(), buf.trackBuf2.ptr<ushort2>(), dst.rows, dst.cols);
+        edgesHysteresisGlobal(buf.map, buf.st1.ptr<ushort2>(), buf.st2.ptr<ushort2>());
-        getEdges_gpu(buf.edgeBuf, dst, dst.rows, dst.cols);
+        getEdges(buf.map, dst);
    }
 }
 void cv::gpu::Canny(const GpuMat& src, GpuMat& dst, double low_thresh, double high_thresh, int apperture_size, bool L2gradient)
 {
-    CannyBuf buf(src.size(), apperture_size);
+    CannyBuf buf;
    Canny(src, buf, dst, low_thresh, high_thresh, apperture_size, L2gradient);
 }
 void cv::gpu::Canny(const GpuMat& src, CannyBuf& buf, GpuMat& dst, double low_thresh, double high_thresh, int apperture_size, bool L2gradient)
 {
-    using namespace ::cv::gpu::device::canny;
+    using namespace canny;
    CV_Assert(src.type() == CV_8UC1);
@ -1562,37 +1545,37 @@ void cv::gpu::Canny(const GpuMat& src, CannyBuf& buf, GpuMat& dst, double low_th
        std::swap( low_thresh, high_thresh);
    dst.create(src.size(), CV_8U);
    dst.setTo(Scalar::all(0));
    buf.create(src.size(), apperture_size);
    buf.edgeBuf.setTo(Scalar::all(0));
    if (apperture_size == 3)
    {
-        calcSobelRowPass_gpu(src, buf.dx_buf, buf.dy_buf, src.rows, src.cols);
+        Size wholeSize;
        Point ofs;
        src.locateROI(wholeSize, ofs);
        GpuMat srcWhole(wholeSize, src.type(), src.datastart, src.step);
-        calcMagnitude_gpu(buf.dx_buf, buf.dy_buf, buf.dx, buf.dy, buf.edgeBuf, src.rows, src.cols, L2gradient);
+        calcMagnitude(srcWhole, ofs.x, ofs.y, buf.dx, buf.dy, buf.mag, L2gradient);
    }
    else
    {
        buf.filterDX->apply(src, buf.dx, Rect(0, 0, src.cols, src.rows));
        buf.filterDY->apply(src, buf.dy, Rect(0, 0, src.cols, src.rows));
-        calcMagnitude_gpu(buf.dx, buf.dy, buf.edgeBuf, src.rows, src.cols, L2gradient);
+        calcMagnitude(buf.dx, buf.dy, buf.mag, L2gradient);
    }
-    CannyCaller(buf, dst, static_cast<float>(low_thresh), static_cast<float>(high_thresh));
+    CannyCaller(buf.dx, buf.dy, buf, dst, static_cast<float>(low_thresh), static_cast<float>(high_thresh));
 }
 void cv::gpu::Canny(const GpuMat& dx, const GpuMat& dy, GpuMat& dst, double low_thresh, double high_thresh, bool L2gradient)
 {
-    CannyBuf buf(dx, dy);
+    CannyBuf buf;
    Canny(dx, dy, buf, dst, low_thresh, high_thresh, L2gradient);
 }
 void cv::gpu::Canny(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& dst, double low_thresh, double high_thresh, bool L2gradient)
 {
-    using namespace ::cv::gpu::device::canny;
+    using namespace canny;
    CV_Assert(TargetArchs::builtWith(SHARED_ATOMICS) && DeviceInfo().supports(SHARED_ATOMICS));
    CV_Assert(dx.type() == CV_32SC1 && dy.type() == CV_32SC1 && dx.size() == dy.size());
@ -1601,17 +1584,11 @@ void cv::gpu::Canny(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& d
        std::swap( low_thresh, high_thresh);
    dst.create(dx.size(), CV_8U);
    dst.setTo(Scalar::all(0));
    buf.dx = dx; buf.dy = dy;
    buf.create(dx.size(), -1);
    buf.edgeBuf.setTo(Scalar::all(0));
-    calcMagnitude_gpu(dx, dy, buf.edgeBuf, dx.rows, dx.cols, L2gradient);
+    calcMagnitude(dx, dy, buf.mag, L2gradient);
-    CannyCaller(buf, dst, static_cast<float>(low_thresh), static_cast<float>(high_thresh));
+    CannyCaller(dx, dy, buf, dst, static_cast<float>(low_thresh), static_cast<float>(high_thresh));
 }
 #endif /* !defined (HAVE_CUDA) */
--- a/modules/gpu/test/test_imgproc.cpp
+++ b/modules/gpu/test/test_imgproc.cpp
@ -313,7 +313,7 @@ TEST_P(Canny, Accuracy)
        cv::Mat edges_gold;
        cv::Canny(img, edges_gold, low_thresh, high_thresh, apperture_size, useL2gradient);
-        EXPECT_MAT_SIMILAR(edges_gold, edges, 1e-2);
+        EXPECT_MAT_SIMILAR(edges_gold, edges, 2e-2);
    }
 }