From 767ac9aa108b29c09867959503f939295e78267f Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov
Date: Mon, 8 Aug 2011 08:53:55 +0000
Subject: [PATCH] added gpu::Canny function

---
 modules/gpu/include/opencv2/gpu/gpu.hpp |  27 +-
 modules/gpu/src/cuda/canny.cu           | 489 ++++++++++++++++++++++++
 modules/gpu/src/imgproc_gpu.cpp         | 152 +++++++-
 modules/gpu/test/test_imgproc.cpp       |  67 ++++
 samples/gpu/performance/tests.cpp       |  22 ++
 5 files changed, 754 insertions(+), 3 deletions(-)
 create mode 100644 modules/gpu/src/cuda/canny.cu

diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index 9a64893d3..c7330666d 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -976,7 +976,32 @@ namespace cv
         //! performs linear blending of two images
         //! to avoid accuracy errors sum of weigths shouldn't be very close to zero
         CV_EXPORTS void blendLinear(const GpuMat& img1, const GpuMat& img2, const GpuMat& weights1, const GpuMat& weights2,
-            GpuMat& result, Stream& stream = Stream::Null());
+            GpuMat& result, Stream& stream = Stream::Null());
+
+
+        struct CV_EXPORTS CannyBuf;
+
+        CV_EXPORTS void Canny(const GpuMat& image, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);
+        CV_EXPORTS void Canny(const GpuMat& image, CannyBuf& buf, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);
+        CV_EXPORTS void Canny(const GpuMat& dx, const GpuMat& dy, GpuMat& edges, double low_thresh, double high_thresh, bool L2gradient = false);
+        CV_EXPORTS void Canny(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& edges, double low_thresh, double high_thresh, bool L2gradient = false);
+
+        struct CV_EXPORTS CannyBuf
+        {
+            CannyBuf() {}
+            explicit CannyBuf(const Size& image_size, int apperture_size = 3) {create(image_size, apperture_size);}
+            CannyBuf(const GpuMat& dx_, const GpuMat& dy_);
+
+            void create(const Size& image_size, int apperture_size = 3);
+
+            void release();
+
+            GpuMat dx, dy;
+            GpuMat dx_buf, dy_buf;
+            GpuMat edgeBuf;
+            GpuMat trackBuf1, trackBuf2;
+            Ptr<FilterEngine_GPU> filterDX, filterDY;
+        };
 
         ////////////////////////////// Matrix reductions //////////////////////////////
 
diff --git a/modules/gpu/src/cuda/canny.cu b/modules/gpu/src/cuda/canny.cu
new file mode 100644
index 000000000..5a7a868fa
--- /dev/null
+++ b/modules/gpu/src/cuda/canny.cu
@@ -0,0 +1,489 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "internal_shared.hpp"
+#include "opencv2/gpu/device/utility.hpp"
+
+using namespace cv::gpu;
+using namespace cv::gpu::device;
+
+namespace cv { namespace gpu { namespace canny
+{
+    __global__ void calcSobelRowPass(PtrStep src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
+    {
+        __shared__ int smem[16][18];
+
+        const int j = blockIdx.x * blockDim.x + threadIdx.x;
+        const int i = blockIdx.y * blockDim.y + threadIdx.y;
+
+        if (i < rows)
+        {
+            smem[threadIdx.y][threadIdx.x + 1] = src.ptr(i)[j];
+            if (threadIdx.x == 0)
+            {
+                smem[threadIdx.y][0] = src.ptr(i)[max(j - 1, 0)];
+                smem[threadIdx.y][17] = src.ptr(i)[min(j + 16, cols - 1)];
+            }
+            __syncthreads();
+
+            if (j < cols)
+            {
+                dx_buf.ptr(i)[j] = -smem[threadIdx.y][threadIdx.x] + smem[threadIdx.y][threadIdx.x + 2];
+                dy_buf.ptr(i)[j] = smem[threadIdx.y][threadIdx.x] + 2 * smem[threadIdx.y][threadIdx.x + 1] + smem[threadIdx.y][threadIdx.x + 2];
+            }
+        }
+    }
+
+    void calcSobelRowPass_gpu(PtrStep src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
+    {
+        dim3 block(16, 16, 1);
+        dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
+
+        calcSobelRowPass<<<grid, block>>>(src, dx_buf, dy_buf, rows, cols);
+        cudaSafeCall( cudaGetLastError() );
+
+        cudaSafeCall(cudaThreadSynchronize());
+    }
+
+    struct L1
+    {
+        static __device__ __forceinline__ float calc(int x, int y)
+        {
+            return abs(x) + abs(y);
+        }
+    };
+    struct L2
+    {
+        static __device__ __forceinline__ float calc(int x, int y)
+        {
+            return sqrtf(x * x + y * y);
+        }
+    };
+
+    template <typename Norm> __global__ void calcMagnitude(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
+    {
+        __shared__ int sdx[18][16];
+        __shared__ int sdy[18][16];
+
+        const int j = blockIdx.x * blockDim.x + threadIdx.x;
+        const int i = blockIdx.y * blockDim.y + threadIdx.y;
+
+        if (j < cols)
+        {
+            sdx[threadIdx.y + 1][threadIdx.x] = dx_buf.ptr(i)[j];
+            sdy[threadIdx.y + 1][threadIdx.x] = dy_buf.ptr(i)[j];
+            if (threadIdx.y == 0)
+            {
+                sdx[0][threadIdx.x] = dx_buf.ptr(max(i - 1, 0))[j];
+                sdx[17][threadIdx.x] = dx_buf.ptr(min(i + 16, rows - 1))[j];
+
+                sdy[0][threadIdx.x] = dy_buf.ptr(max(i - 1, 0))[j];
+                sdy[17][threadIdx.x] = dy_buf.ptr(min(i + 16, rows - 1))[j];
+            }
+            __syncthreads();
+
+            if (i < rows)
+            {
+                int x = sdx[threadIdx.y][threadIdx.x] + 2 * sdx[threadIdx.y + 1][threadIdx.x] + sdx[threadIdx.y + 2][threadIdx.x];
+                int y = -sdy[threadIdx.y][threadIdx.x] + sdy[threadIdx.y + 2][threadIdx.x];
+
+                dx.ptr(i)[j] = x;
+                dy.ptr(i)[j] = y;
+
+                mag.ptr(i + 1)[j + 1] = Norm::calc(x, y);
+            }
+        }
+    }
+
+    void calcMagnitude_gpu(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)
+    {
+        dim3 block(16, 16, 1);
+        dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
+
+        if (L2Grad)
+            calcMagnitude<L2><<<grid, block>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols);
+        else
+            calcMagnitude<L1><<<grid, block>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols);
+
+        cudaSafeCall( cudaGetLastError() );
+
+        cudaSafeCall(cudaThreadSynchronize());
+    }
+
+    template <typename Norm> __global__ void calcMagnitude(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
+    {
+        const int j = blockIdx.x * blockDim.x + threadIdx.x;
+        const int i = blockIdx.y * blockDim.y + threadIdx.y;
+
+        if (i < rows && j < cols)
+            mag.ptr(i + 1)[j + 1] = Norm::calc(dx.ptr(i)[j], dy.ptr(i)[j]);
+    }
+
+    void calcMagnitude_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)
+    {
+        dim3 block(16, 16, 1);
+        dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
+
+        if (L2Grad)
+            calcMagnitude<L2><<<grid, block>>>(dx, dy, mag, rows, cols);
+        else
+            calcMagnitude<L1><<<grid, block>>>(dx, dy, mag, rows, cols);
+
+        cudaSafeCall( cudaGetLastError() );
+
+        cudaSafeCall(cudaThreadSynchronize());
+    }
+
+//////////////////////////////////////////////////////////////////////////////////////////
+
+#define CANNY_SHIFT 15
+#define TG22 (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5)
+
+    __global__ void calcMap(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)
+    {
+        __shared__ float smem[18][18];
+
+        const int j = blockIdx.x * 16 + threadIdx.x;
+        const int i = blockIdx.y * 16 + threadIdx.y;
+
+        const int tid = threadIdx.y * 16 + threadIdx.x;
+        const int lx = tid % 18;
+        const int ly = tid / 18;
+
+        if (ly < 14)
+            smem[ly][lx] = mag.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx];
+
+        if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols)
+            smem[ly + 14][lx] = mag.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx];
+
+        __syncthreads();
+
+        if (i < rows && j < cols)
+        {
+            int x = dx.ptr(i)[j];
+            int y = dy.ptr(i)[j];
+            const int s = (x ^ y) < 0 ? -1 : 1;
+            const float m = smem[threadIdx.y + 1][threadIdx.x + 1];
+
+            x = abs(x);
+            y = abs(y);
+
+            // 0 - the pixel can not belong to an edge
+            // 1 - the pixel might belong to an edge
+            // 2 - the pixel does belong to an edge
+            int edge_type = 0;
+
+            if (m > low_thresh)
+            {
+                const int tg22x = x * TG22;
+                const int tg67x = tg22x + ((x + x) << CANNY_SHIFT);
+
+                y <<= CANNY_SHIFT;
+
+                if (y < tg22x)
+                {
+                    if (m > smem[threadIdx.y + 1][threadIdx.x] && m >= smem[threadIdx.y + 1][threadIdx.x + 2])
+                        edge_type = 1 + (int)(m > high_thresh);
+                }
+                else if( y > tg67x )
+                {
+                    if (m > smem[threadIdx.y][threadIdx.x + 1] && m >= smem[threadIdx.y + 2][threadIdx.x + 1])
+                        edge_type = 1 + (int)(m > high_thresh);
+                }
+                else
+                {
+                    if (m > smem[threadIdx.y][threadIdx.x + 1 - s] && m > smem[threadIdx.y + 2][threadIdx.x + 1 + s])
+                        edge_type = 1 + (int)(m > high_thresh);
+                }
+            }
+
+            map.ptr(i + 1)[j + 1] = edge_type;
+        }
+    }
+
+#undef CANNY_SHIFT
+#undef TG22
+
+    void calcMap_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)
+    {
+        dim3 block(16, 16, 1);
+        dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
+
+        calcMap<<<grid, block>>>(dx, dy, mag, map, rows, cols, low_thresh, high_thresh);
+        cudaSafeCall( cudaGetLastError() );
+
+        cudaSafeCall(cudaThreadSynchronize());
+    }
+
+//////////////////////////////////////////////////////////////////////////////////////////
+
+    __device__ unsigned int counter = 0;
+
+    __global__ void edgesHysteresisLocal(PtrStepi map, ushort2* st, int rows, int cols)
+    {
+        #if __CUDA_ARCH__ >= 120
+
+        __shared__ int smem[18][18];
+
+        const int j = blockIdx.x * 16 + threadIdx.x;
+        const int i = blockIdx.y * 16 + threadIdx.y;
+
+        const int tid = threadIdx.y * 16 + threadIdx.x;
+        const int lx = tid % 18;
+        const int ly = tid / 18;
+
+        if (ly < 14)
+            smem[ly][lx] = map.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx];
+
+        if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols)
+            smem[ly + 14][lx] = map.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx];
+
+        __syncthreads();
+
+        if (i < rows && j < cols)
+        {
+            int n;
+
+            #pragma unroll
+            for (int k = 0; k < 16; ++k)
+            {
+                n = 0;
+
+                if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1)
+                {
+                    n += smem[threadIdx.y    ][threadIdx.x    ] == 2;
+                    n += smem[threadIdx.y    ][threadIdx.x + 1] == 2;
+                    n += smem[threadIdx.y    ][threadIdx.x + 2] == 2;
+
+                    n += smem[threadIdx.y + 1][threadIdx.x    ] == 2;
+                    n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2;
+
+                    n += smem[threadIdx.y + 2][threadIdx.x    ] == 2;
+                    n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2;
+                    n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2;
+                }
+
+                if (n > 0)
+                    smem[threadIdx.y + 1][threadIdx.x + 1] = 2;
+            }
+
+            const int e = smem[threadIdx.y + 1][threadIdx.x + 1];
+
+            map.ptr(i + 1)[j + 1] = e;
+
+            n = 0;
+
+            if (e == 2)
+            {
+                n += smem[threadIdx.y    ][threadIdx.x    ] == 1;
+                n += smem[threadIdx.y    ][threadIdx.x + 1] == 1;
+                n += smem[threadIdx.y    ][threadIdx.x + 2] == 1;
+
+                n += smem[threadIdx.y + 1][threadIdx.x    ] == 1;
+                n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1;
+
+                n += smem[threadIdx.y + 2][threadIdx.x    ] == 1;
+                n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1;
+                n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1;
+            }
+
+            if (n > 0)
+            {
+                const unsigned int ind = atomicInc(&counter, (unsigned int)(-1));
+                st[ind] = make_ushort2(j + 1, i + 1);
+            }
+        }
+
+        #endif
+    }
+
+    void edgesHysteresisLocal_gpu(PtrStepi map, ushort2* st1, int rows, int cols)
+    {
+        dim3 block(16, 16, 1);
+        dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
+
+        edgesHysteresisLocal<<<grid, block>>>(map, st1, rows, cols);
+        cudaSafeCall( cudaGetLastError() );
+
+        cudaSafeCall(cudaThreadSynchronize());
+    }
+
+    __constant__ int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1};
+    __constant__ int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1};
+
+    __global__ void edgesHysteresisGlobal(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols, int count)
+    {
+        #if __CUDA_ARCH__ >= 120
+
+        const int stack_size = 512;
+
+        __shared__ unsigned int s_counter;
+        __shared__ unsigned int s_ind;
+        __shared__ ushort2 s_st[stack_size];
+
+        if (threadIdx.x == 0)
+            s_counter = 0;
+        __syncthreads();
+
+        int ind = blockIdx.y * gridDim.x + blockIdx.x;
+
+        if (ind < count)
+        {
+            ushort2 pos = st1[ind];
+
+            if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
+            {
+                if (threadIdx.x < 8)
+                {
+                    pos.x += c_dx[threadIdx.x];
+                    pos.y += c_dy[threadIdx.x];
+
+                    if (map.ptr(pos.y)[pos.x] == 1)
+                    {
+                        map.ptr(pos.y)[pos.x] = 2;
+
+                        ind = atomicInc(&s_counter, (unsigned int)(-1));
+
+                        s_st[ind] = pos;
+                    }
+                }
+                __syncthreads();
+
+                while (s_counter > 0 && s_counter <= stack_size - blockDim.x)
+                {
+                    const int subTaskIdx = threadIdx.x >> 3;
+                    const int portion = min(s_counter, blockDim.x >> 3);
+
+                    pos.x = pos.y = 0;
+
+                    if (subTaskIdx < portion)
+                        pos = s_st[s_counter - 1 - subTaskIdx];
+                    __syncthreads();
+
+                    if (threadIdx.x == 0)
+                        s_counter -= portion;
+                    __syncthreads();
+
+                    if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
+                    {
+                        pos.x += c_dx[threadIdx.x & 7];
+                        pos.y += c_dy[threadIdx.x & 7];
+
+                        if (map.ptr(pos.y)[pos.x] == 1)
+                        {
+                            map.ptr(pos.y)[pos.x] = 2;
+
+                            ind = atomicInc(&s_counter, (unsigned int)(-1));
+
+                            s_st[ind] = pos;
+                        }
+                    }
+                    __syncthreads();
+                }
+
+                if (s_counter > 0)
+                {
+                    if (threadIdx.x == 0)
+                    {
+                        ind = atomicAdd(&counter, s_counter);
+                        s_ind = ind - s_counter;
+                    }
+                    __syncthreads();
+
+                    ind = s_ind;
+
+                    for (int i = threadIdx.x; i < s_counter; i += blockDim.x)
+                    {
+                        st2[ind + i] = s_st[i];
+                    }
+                }
+            }
+        }
+
+        #endif
+    }
+
+    void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols)
+    {
+        void* counter_ptr;
+        cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, "cv::gpu::canny::counter") );
+
+        unsigned int count;
+        cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
+
+        while (count > 0)
+        {
+            cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) );
+
+            dim3 block(128, 1, 1);
+            dim3 grid(min(count, 65535u), divUp(count, 65535), 1);
+            edgesHysteresisGlobal<<<grid, block>>>(map, st1, st2, rows, cols, count);
+            cudaSafeCall( cudaGetLastError() );
+
+            cudaSafeCall(cudaThreadSynchronize());
+
+            cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
+
+            swap(st1, st2);
+        }
+    }
+
+    __global__ void getEdges(PtrStepi map, PtrStep dst, int rows, int cols)
+    {
+        const int j = blockIdx.x * 16 + threadIdx.x;
+        const int i = blockIdx.y * 16 + threadIdx.y;
+
+        if (i < rows && j < cols)
+            dst.ptr(i)[j] = (uchar)(-(map.ptr(i + 1)[j + 1] >> 1));
+    }
+
+    void getEdges_gpu(PtrStepi map, PtrStep dst, int rows, int cols)
+    {
+        dim3 block(16, 16, 1);
+        dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
+
+        getEdges<<<grid, block>>>(map, dst, rows, cols);
+        cudaSafeCall( cudaGetLastError() );
+
+        cudaSafeCall(cudaThreadSynchronize());
+    }
+}}}
diff --git a/modules/gpu/src/imgproc_gpu.cpp b/modules/gpu/src/imgproc_gpu.cpp
index 504c6c8c8..479ad8e80 100644
--- a/modules/gpu/src/imgproc_gpu.cpp
+++ b/modules/gpu/src/imgproc_gpu.cpp
@@ -92,8 +92,13 @@ void cv::gpu::pyrDown(const GpuMat&, GpuMat&, PyrDownBuf&, Stream&) { throw_nogp
 void cv::gpu::pyrUp(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
 void cv::gpu::PyrUpBuf::create(Size, int) { throw_nogpu(); }
 void cv::gpu::pyrUp(const GpuMat&, GpuMat&, PyrUpBuf&, Stream&) { throw_nogpu(); }
-
-
+void cv::gpu::Canny(const GpuMat&, GpuMat&, double, double, int, bool) { throw_nogpu(); }
+void cv::gpu::Canny(const GpuMat&, CannyBuf&, GpuMat&, double, double, int, bool) { throw_nogpu(); }
+void cv::gpu::Canny(const GpuMat&, const GpuMat&, GpuMat&, double, double, bool) { throw_nogpu(); }
+void cv::gpu::Canny(const GpuMat&, const GpuMat&, CannyBuf&, GpuMat&, double, double, bool) { throw_nogpu(); }
+cv::gpu::CannyBuf::CannyBuf(const GpuMat&, const GpuMat&) { throw_nogpu(); }
+void cv::gpu::CannyBuf::create(const Size&, int) { throw_nogpu(); }
+void cv::gpu::CannyBuf::release() { throw_nogpu(); }
 
 #else /* !defined (HAVE_CUDA) */
 
@@ -1627,6 +1632,149 @@ void cv::gpu::pyrUp(const GpuMat& src, GpuMat& dst, PyrUpBuf& buf, Stream& strea
     buf.filter->apply(buf.buf, dst, Rect(0, 0, buf.buf.cols, buf.buf.rows), stream);
 }
 
+
+//////////////////////////////////////////////////////////////////////////////
+// Canny
+
+cv::gpu::CannyBuf::CannyBuf(const GpuMat& dx_, const GpuMat& dy_) : dx(dx_), dy(dy_)
+{
+    CV_Assert(dx_.type() == CV_32SC1 && dy_.type() == CV_32SC1 && dx_.size() == dy_.size());
+
+    create(dx_.size(), -1);
+}
+
+void cv::gpu::CannyBuf::create(const Size& image_size, int apperture_size)
+{
+    ensureSizeIsEnough(image_size, CV_32SC1, dx);
+    ensureSizeIsEnough(image_size, CV_32SC1, dy);
+
+    if (apperture_size == 3)
+    {
+        ensureSizeIsEnough(image_size, CV_32SC1, dx_buf);
+        ensureSizeIsEnough(image_size, CV_32SC1, dy_buf);
+    }
+    else if(apperture_size > 0)
+    {
+        if (!filterDX)
+            filterDX = createDerivFilter_GPU(CV_8UC1, CV_32S, 1, 0, apperture_size, BORDER_REPLICATE);
+        if (!filterDY)
+            filterDY = createDerivFilter_GPU(CV_8UC1, CV_32S, 0, 1, apperture_size, BORDER_REPLICATE);
+    }
+
+    ensureSizeIsEnough(image_size.height + 2, image_size.width + 2, CV_32FC1, edgeBuf);
+
+    ensureSizeIsEnough(1, image_size.width * image_size.height, CV_16UC2, trackBuf1);
+    ensureSizeIsEnough(1, image_size.width * image_size.height, CV_16UC2, trackBuf2);
+}
+
+void cv::gpu::CannyBuf::release()
+{
+    dx.release();
+    dy.release();
+    dx_buf.release();
+    dy_buf.release();
+    edgeBuf.release();
+    trackBuf1.release();
+    trackBuf2.release();
+}
+
+namespace cv { namespace gpu { namespace canny
+{
+    void calcSobelRowPass_gpu(PtrStep src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols);
+
+    void calcMagnitude_gpu(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad);
+    void calcMagnitude_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad);
+
+    void calcMap_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh);
+
+    void edgesHysteresisLocal_gpu(PtrStepi map, ushort2* st1, int rows, int cols);
+
+    void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols);
+
+    void getEdges_gpu(PtrStepi map, PtrStep dst, int rows, int cols);
+}}}
+
+namespace
+{
+    void CannyCaller(CannyBuf& buf, GpuMat& dst, float low_thresh, float high_thresh)
+    {
+        using namespace cv::gpu::canny;
+
+        calcMap_gpu(buf.dx, buf.dy, buf.edgeBuf, buf.edgeBuf, dst.rows, dst.cols, low_thresh, high_thresh);
+
+        edgesHysteresisLocal_gpu(buf.edgeBuf, buf.trackBuf1.ptr<ushort2>(), dst.rows, dst.cols);
+
+        edgesHysteresisGlobal_gpu(buf.edgeBuf, buf.trackBuf1.ptr<ushort2>(), buf.trackBuf2.ptr<ushort2>(), dst.rows, dst.cols);
+
+        getEdges_gpu(buf.edgeBuf, dst, dst.rows, dst.cols);
+    }
+}
+
+void cv::gpu::Canny(const GpuMat& src, GpuMat& dst, double low_thresh, double high_thresh, int apperture_size, bool L2gradient)
+{
+    CannyBuf buf(src.size(), apperture_size);
+    Canny(src, buf, dst, low_thresh, high_thresh, apperture_size, L2gradient);
+}
+
+void cv::gpu::Canny(const GpuMat& src, CannyBuf& buf, GpuMat& dst, double low_thresh, double high_thresh, int apperture_size, bool L2gradient)
+{
+    using namespace cv::gpu::canny;
+
+    CV_Assert(src.type() == CV_8UC1);
+
+    if( low_thresh > high_thresh )
+        std::swap( low_thresh, high_thresh);
+
+    dst.create(src.size(), CV_8U);
+    dst.setTo(Scalar::all(0));
+
+    buf.create(src.size(), apperture_size);
+    buf.edgeBuf.setTo(Scalar::all(0));
+
+    if (apperture_size == 3)
+    {
+        calcSobelRowPass_gpu(src, buf.dx_buf, buf.dy_buf, src.rows, src.cols);
+
+        calcMagnitude_gpu(buf.dx_buf, buf.dy_buf, buf.dx, buf.dy, buf.edgeBuf, src.rows, src.cols, L2gradient);
+    }
+    else
+    {
+        buf.filterDX->apply(src, buf.dx, Rect(0, 0, src.cols, src.rows));
+        buf.filterDY->apply(src, buf.dy, Rect(0, 0, src.cols, src.rows));
+
+        calcMagnitude_gpu(buf.dx, buf.dy, buf.edgeBuf, src.rows, src.cols, L2gradient);
+    }
+
+    CannyCaller(buf, dst, static_cast<float>(low_thresh), static_cast<float>(high_thresh));
+}
+
+void cv::gpu::Canny(const GpuMat& dx, const GpuMat& dy, GpuMat& dst, double low_thresh, double high_thresh, bool L2gradient)
+{
+    CannyBuf buf(dx, dy);
+    Canny(dx, dy, buf, dst, low_thresh, high_thresh, L2gradient);
+}
+
+void cv::gpu::Canny(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& dst, double low_thresh, double high_thresh, bool L2gradient)
+{
+    using namespace cv::gpu::canny;
+
+    CV_Assert(dx.type() == CV_32SC1 && dy.type() == CV_32SC1 && dx.size() == dy.size());
+
+    if( low_thresh > high_thresh )
+        std::swap( low_thresh, high_thresh);
+
+    dst.create(dx.size(), CV_8U);
+    dst.setTo(Scalar::all(0));
+
+    buf.dx = dx; buf.dy = dy;
+    buf.create(dx.size(), -1);
+    buf.edgeBuf.setTo(Scalar::all(0));
+
+    calcMagnitude_gpu(dx, dy, buf.edgeBuf, dx.rows, dx.cols, L2gradient);
+
+    CannyCaller(buf, dst, static_cast<float>(low_thresh), static_cast<float>(high_thresh));
+}
+
 #endif /* !defined (HAVE_CUDA) */
diff --git a/modules/gpu/test/test_imgproc.cpp b/modules/gpu/test/test_imgproc.cpp
index 3839aa64b..407d3be94 100644
--- a/modules/gpu/test/test_imgproc.cpp
+++ b/modules/gpu/test/test_imgproc.cpp
@@ -2217,4 +2217,71 @@ TEST_P(PyrUp, Accuracy)
 
 INSTANTIATE_TEST_CASE_P(ImgProc, PyrUp, testing::ValuesIn(devices()));
 
+////////////////////////////////////////////////////////
+// Canny
+
+struct Canny : testing::TestWithParam< std::tr1::tuple<cv::gpu::DeviceInfo, int, bool> >
+{
+    static cv::Mat img;
+
+    static void SetUpTestCase()
+    {
+        img = readImage("stereobm/aloe-L.png", CV_LOAD_IMAGE_GRAYSCALE);
+    }
+
+    static void TearDownTestCase()
+    {
+        img.release();
+    }
+
+    cv::gpu::DeviceInfo devInfo;
+    int apperture_size;
+    bool L2gradient;
+
+    double low_thresh;
+    double high_thresh;
+
+    cv::Mat edges_gold;
+
+    virtual void SetUp()
+    {
+        devInfo = std::tr1::get<0>(GetParam());
+        apperture_size = std::tr1::get<1>(GetParam());
+        L2gradient = std::tr1::get<2>(GetParam());
+
+        cv::gpu::setDevice(devInfo.deviceID());
+
+        low_thresh = 50.0;
+        high_thresh = 100.0;
+
+        cv::Canny(img, edges_gold, low_thresh, high_thresh, apperture_size, L2gradient);
+    }
+};
+
+cv::Mat Canny::img;
+
+TEST_P(Canny, Accuracy)
+{
+    PRINT_PARAM(devInfo);
+    PRINT_PARAM(apperture_size);
+    PRINT_PARAM(L2gradient);
+
+    cv::Mat edges;
+
+    ASSERT_NO_THROW(
+        cv::gpu::GpuMat d_edges;
+
+        cv::gpu::Canny(cv::gpu::GpuMat(img), d_edges, low_thresh, high_thresh, apperture_size, L2gradient);
+
+        d_edges.download(edges);
+    );
+
+    EXPECT_MAT_SIMILAR(edges_gold, edges, 1.0);
+}
+
+INSTANTIATE_TEST_CASE_P(ImgProc, Canny, testing::Combine(
+                        testing::ValuesIn(devices()),
+                        testing::Values(3, 5),
+                        testing::Values(false, true)));
+
 #endif // HAVE_CUDA
diff --git a/samples/gpu/performance/tests.cpp b/samples/gpu/performance/tests.cpp
index a101191ca..ef2e2b7a6 100644
--- a/samples/gpu/performance/tests.cpp
+++ b/samples/gpu/performance/tests.cpp
@@ -937,3 +937,25 @@ TEST(equalizeHist)
         GPU_OFF;
     }
 }
+
+
+TEST(Canny)
+{
+    Mat img = imread(abspath("aloeL.jpg"), CV_LOAD_IMAGE_GRAYSCALE);
+
+    if (img.empty()) throw runtime_error("can't open aloeL.jpg");
+
+    Mat edges(img.size(), CV_8UC1);
+
+    CPU_ON;
+    Canny(img, edges, 50.0, 100.0);
+    CPU_OFF;
+
+    gpu::GpuMat d_img(img);
+    gpu::GpuMat d_edges(img.size(), CV_8UC1);
+    gpu::CannyBuf d_buf(img.size());
+
+    GPU_ON;
+    gpu::Canny(d_img, d_buf, d_edges, 50.0, 100.0);
+    GPU_OFF;
+}
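
For reference, below is a minimal usage sketch of the API this patch adds. It is not part of the patch itself; it assumes an OpenCV 2.x build with the CUDA-enabled gpu module, and "input.png" / "edges.png" are placeholder file names.

// Usage sketch (not part of the patch). Assumes OpenCV 2.x with HAVE_CUDA;
// the input image name is a placeholder for any 8-bit grayscale image.
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/gpu/gpu.hpp>

int main()
{
    cv::Mat img = cv::imread("input.png", CV_LOAD_IMAGE_GRAYSCALE);
    CV_Assert(!img.empty());

    cv::gpu::GpuMat d_img(img), d_edges;

    // One-shot call: intermediate buffers are allocated inside the function.
    cv::gpu::Canny(d_img, d_edges, 50.0, 100.0, 3, false);

    // Reusing a CannyBuf avoids reallocating the intermediates when many
    // images of the same size are processed (as in samples/gpu/performance).
    cv::gpu::CannyBuf buf(d_img.size(), 3);
    for (int i = 0; i < 100; ++i)
        cv::gpu::Canny(d_img, buf, d_edges, 50.0, 100.0, 3, false);

    cv::Mat edges;
    d_edges.download(edges);
    cv::imwrite("edges.png", edges);
    return 0;
}

The dx/dy overloads follow the same pattern, but take precomputed CV_32SC1 derivative maps instead of the source image.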