added gpu::Canny function

2011-08-08 08:53:55 +00:00
parent 0a2c7803b6
commit 767ac9aa10
5 changed files with 754 additions and 3 deletions
--- a/modules/gpu/src/cuda/canny.cu
+++ b/modules/gpu/src/cuda/canny.cu
@@ -0,0 +1,489 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "internal_shared.hpp"
+#include "opencv2/gpu/device/utility.hpp"
+
+using namespace cv::gpu;
+using namespace cv::gpu::device;
+
+namespace cv { namespace gpu { namespace canny
+{
+    __global__ void calcSobelRowPass(PtrStep src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
+    {
+        __shared__ int smem[16][18];
+
+        const int j = blockIdx.x * blockDim.x + threadIdx.x;
+        const int i = blockIdx.y * blockDim.y + threadIdx.y;
+
+        if (i < rows)
+        {
+            smem[threadIdx.y][threadIdx.x + 1] = src.ptr(i)[j];
+            if (threadIdx.x == 0)
+            {
+                smem[threadIdx.y][0] = src.ptr(i)[max(j - 1, 0)];
+                smem[threadIdx.y][17] = src.ptr(i)[min(j + 16, cols - 1)];
+            }
+            __syncthreads();
+
+            if (j < cols)
+            {
+                dx_buf.ptr(i)[j] = -smem[threadIdx.y][threadIdx.x] + smem[threadIdx.y][threadIdx.x + 2];
+                dy_buf.ptr(i)[j] = smem[threadIdx.y][threadIdx.x] + 2 * smem[threadIdx.y][threadIdx.x + 1] + smem[threadIdx.y][threadIdx.x + 2];
+            }
+        }
+    }
+
+    void calcSobelRowPass_gpu(PtrStep src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
+    {
+        dim3 block(16, 16, 1);
+        dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
+
+        calcSobelRowPass<<<grid, block>>>(src, dx_buf, dy_buf, rows, cols);
+        cudaSafeCall( cudaGetLastError() );
+
+        cudaSafeCall(cudaThreadSynchronize());
+    }
+
+    struct L1
+    {
+        static __device__ __forceinline__ float calc(int x, int y)
+        {
+            return abs(x) + abs(y);
+        }
+    };
+    struct L2
+    {
+        static __device__ __forceinline__ float calc(int x, int y)
+        {
+            return sqrtf(x * x + y * y);
+        }
+    };
+
+    template <typename Norm> __global__ void calcMagnitude(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
+    {
+        __shared__ int sdx[18][16];
+        __shared__ int sdy[18][16];
+
+        const int j = blockIdx.x * blockDim.x + threadIdx.x;
+        const int i = blockIdx.y * blockDim.y + threadIdx.y;
+
+        if (j < cols)
+        {
+            sdx[threadIdx.y + 1][threadIdx.x] = dx_buf.ptr(i)[j];
+            sdy[threadIdx.y + 1][threadIdx.x] = dy_buf.ptr(i)[j];
+            if (threadIdx.y == 0)
+            {
+                sdx[0][threadIdx.x] = dx_buf.ptr(max(i - 1, 0))[j];
+                sdx[17][threadIdx.x] = dx_buf.ptr(min(i + 16, rows - 1))[j];
+
+                sdy[0][threadIdx.x] = dy_buf.ptr(max(i - 1, 0))[j];
+                sdy[17][threadIdx.x] = dy_buf.ptr(min(i + 16, rows - 1))[j];
+            }
+            __syncthreads();
+
+            if (i < rows)
+            {
+                int x = sdx[threadIdx.y][threadIdx.x] + 2 * sdx[threadIdx.y + 1][threadIdx.x] + sdx[threadIdx.y + 2][threadIdx.x];
+                int y = -sdy[threadIdx.y][threadIdx.x] + sdy[threadIdx.y + 2][threadIdx.x];
+
+                dx.ptr(i)[j] = x;
+                dy.ptr(i)[j] = y;
+
+                mag.ptr(i + 1)[j + 1] = Norm::calc(x, y);
+            }
+        }
+    }
+
+    void calcMagnitude_gpu(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)
+    {
+        dim3 block(16, 16, 1);
+        dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
+
+        if (L2Grad)
+            calcMagnitude<L2><<<grid, block>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols);
+        else
+            calcMagnitude<L1><<<grid, block>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols);
+
+        cudaSafeCall( cudaGetLastError() );
+
+        cudaSafeCall(cudaThreadSynchronize());
+    }
+
+    template <typename Norm> __global__ void calcMagnitude(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
+    {
+        const int j = blockIdx.x * blockDim.x + threadIdx.x;
+        const int i = blockIdx.y * blockDim.y + threadIdx.y;
+
+        if (i < rows && j < cols)
+            mag.ptr(i + 1)[j + 1] = Norm::calc(dx.ptr(i)[j], dy.ptr(i)[j]);
+    }
+
+    void calcMagnitude_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)
+    {
+        dim3 block(16, 16, 1);
+        dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
+
+        if (L2Grad)
+            calcMagnitude<L2><<<grid, block>>>(dx, dy, mag, rows, cols);
+        else
+            calcMagnitude<L1><<<grid, block>>>(dx, dy, mag, rows, cols);
+
+        cudaSafeCall( cudaGetLastError() );
+
+        cudaSafeCall(cudaThreadSynchronize());
+    }
+
+//////////////////////////////////////////////////////////////////////////////////////////
+        
+#define CANNY_SHIFT 15
+#define TG22        (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5)
+
+    __global__ void calcMap(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)
+    {
+        __shared__ float smem[18][18];
+
+        const int j = blockIdx.x * 16 + threadIdx.x;
+        const int i = blockIdx.y * 16 + threadIdx.y;
+
+        const int tid = threadIdx.y * 16 + threadIdx.x;
+        const int lx = tid % 18;
+        const int ly = tid / 18;
+
+        if (ly < 14)
+            smem[ly][lx] = mag.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx];
+
+        if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols)
+            smem[ly + 14][lx] = mag.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx];
+
+        __syncthreads();
+
+        if (i < rows && j < cols)
+        {
+            int x = dx.ptr(i)[j];
+            int y = dy.ptr(i)[j];
+            const int s = (x ^ y) < 0 ? -1 : 1;
+            const float m = smem[threadIdx.y + 1][threadIdx.x + 1];
+
+            x = abs(x);
+            y = abs(y);
+
+            // 0 - the pixel can not belong to an edge
+            // 1 - the pixel might belong to an edge
+            // 2 - the pixel does belong to an edge
+            int edge_type = 0;
+
+            if (m > low_thresh)
+            {
+                const int tg22x = x * TG22;
+                const int tg67x = tg22x + ((x + x) << CANNY_SHIFT);
+
+                y <<= CANNY_SHIFT;
+
+                if (y < tg22x)
+                {
+                    if (m > smem[threadIdx.y + 1][threadIdx.x] && m >= smem[threadIdx.y + 1][threadIdx.x + 2])
+                        edge_type = 1 + (int)(m > high_thresh);
+                }
+                else if( y > tg67x )
+                {
+                    if (m > smem[threadIdx.y][threadIdx.x + 1] && m >= smem[threadIdx.y + 2][threadIdx.x + 1])
+                        edge_type = 1 + (int)(m > high_thresh);
+                }
+                else
+                {
+                    if (m > smem[threadIdx.y][threadIdx.x + 1 - s] && m > smem[threadIdx.y + 2][threadIdx.x + 1 + s])
+                        edge_type = 1 + (int)(m > high_thresh);
+                }
+            }
+            
+            map.ptr(i + 1)[j + 1] = edge_type;
+        }
+    }
+
+#undef CANNY_SHIFT
+#undef TG22
+
+    void calcMap_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)
+    {
+        dim3 block(16, 16, 1);
+        dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
+
+        calcMap<<<grid, block>>>(dx, dy, mag, map, rows, cols, low_thresh, high_thresh);
+        cudaSafeCall( cudaGetLastError() );
+
+        cudaSafeCall(cudaThreadSynchronize());
+    }
+
+//////////////////////////////////////////////////////////////////////////////////////////
+
+    __device__ unsigned int counter = 0;
+
+    __global__ void edgesHysteresisLocal(PtrStepi map, ushort2* st, int rows, int cols)
+    {
+        #if __CUDA_ARCH__ >= 120
+
+        __shared__ int smem[18][18];
+
+        const int j = blockIdx.x * 16 + threadIdx.x;
+        const int i = blockIdx.y * 16 + threadIdx.y;
+
+        const int tid = threadIdx.y * 16 + threadIdx.x;
+        const int lx = tid % 18;
+        const int ly = tid / 18; 
+
+        if (ly < 14)
+            smem[ly][lx] = map.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx];
+
+        if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols)
+            smem[ly + 14][lx] = map.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx];
+
+        __syncthreads();
+
+        if (i < rows && j < cols)
+        {
+            int n;
+
+            #pragma unroll
+            for (int k = 0; k < 16; ++k)
+            {
+                n = 0;
+
+                if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1)
+                {
+                    n += smem[threadIdx.y    ][threadIdx.x    ] == 2;
+                    n += smem[threadIdx.y    ][threadIdx.x + 1] == 2;
+                    n += smem[threadIdx.y    ][threadIdx.x + 2] == 2;
+                    
+                    n += smem[threadIdx.y + 1][threadIdx.x    ] == 2;
+                    n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2;
+                    
+                    n += smem[threadIdx.y + 2][threadIdx.x    ] == 2;
+                    n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2;
+                    n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2;
+                }
+
+                if (n > 0)
+                    smem[threadIdx.y + 1][threadIdx.x + 1] = 2;
+            }
+
+            const int e = smem[threadIdx.y + 1][threadIdx.x + 1];
+
+            map.ptr(i + 1)[j + 1] = e;
+
+            n = 0;
+
+            if (e == 2)
+            {
+                n += smem[threadIdx.y    ][threadIdx.x    ] == 1;
+                n += smem[threadIdx.y    ][threadIdx.x + 1] == 1;
+                n += smem[threadIdx.y    ][threadIdx.x + 2] == 1;
+                
+                n += smem[threadIdx.y + 1][threadIdx.x    ] == 1;
+                n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1;
+                
+                n += smem[threadIdx.y + 2][threadIdx.x    ] == 1;
+                n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1;
+                n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1;
+            }
+
+            if (n > 0)
+            {
+                const unsigned int ind = atomicInc(&counter, (unsigned int)(-1));
+                st[ind] = make_ushort2(j + 1, i + 1);
+            }
+        }
+
+        #endif
+    }
+
+    void edgesHysteresisLocal_gpu(PtrStepi map, ushort2* st1, int rows, int cols)
+    {
+        dim3 block(16, 16, 1);
+        dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
+
+        edgesHysteresisLocal<<<grid, block>>>(map, st1, rows, cols);
+        cudaSafeCall( cudaGetLastError() );
+
+        cudaSafeCall(cudaThreadSynchronize());
+    }
+
+    __constant__ int c_dx[8] = {-1,  0,  1, -1, 1, -1, 0, 1};
+    __constant__ int c_dy[8] = {-1, -1, -1,  0, 0,  1, 1, 1};
+
+    __global__ void edgesHysteresisGlobal(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols, int count)
+    {
+        #if __CUDA_ARCH__ >= 120
+
+        const int stack_size = 512;
+        
+        __shared__ unsigned int s_counter;
+        __shared__ unsigned int s_ind;
+        __shared__ ushort2 s_st[stack_size];
+
+        if (threadIdx.x == 0)
+            s_counter = 0;
+        __syncthreads();
+
+        int ind = blockIdx.y * gridDim.x + blockIdx.x;
+
+        if (ind < count)
+        {
+            ushort2 pos = st1[ind];
+
+            if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
+            {
+                if (threadIdx.x < 8)
+                {
+                    pos.x += c_dx[threadIdx.x];
+                    pos.y += c_dy[threadIdx.x];
+
+                    if (map.ptr(pos.y)[pos.x] == 1)
+                    {
+                        map.ptr(pos.y)[pos.x] = 2;
+
+                        ind = atomicInc(&s_counter, (unsigned int)(-1));
+
+                        s_st[ind] = pos;
+                    }
+                }
+                __syncthreads();
+
+                while (s_counter > 0 && s_counter <= stack_size - blockDim.x)
+                {
+                    const int subTaskIdx = threadIdx.x >> 3;
+                    const int portion = min(s_counter, blockDim.x >> 3);
+
+                    pos.x = pos.y = 0;
+
+                    if (subTaskIdx < portion)
+                        pos = s_st[s_counter - 1 - subTaskIdx];
+                    __syncthreads();
+                        
+                    if (threadIdx.x == 0)
+                        s_counter -= portion;
+                    __syncthreads();
+                     
+                    if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
+                    {
+                        pos.x += c_dx[threadIdx.x & 7];
+                        pos.y += c_dy[threadIdx.x & 7];
+
+                        if (map.ptr(pos.y)[pos.x] == 1)
+                        {
+                            map.ptr(pos.y)[pos.x] = 2;
+
+                            ind = atomicInc(&s_counter, (unsigned int)(-1));
+
+                            s_st[ind] = pos;
+                        }
+                    }
+                    __syncthreads();
+                }
+
+                if (s_counter > 0)
+                {
+                    if (threadIdx.x == 0)
+                    {
+                        ind = atomicAdd(&counter, s_counter);
+                        s_ind = ind - s_counter;
+                    }
+                    __syncthreads();
+
+                    ind = s_ind;
+
+                    for (int i = threadIdx.x; i < s_counter; i += blockDim.x)
+                    {
+                        st2[ind + i] = s_st[i];
+                    }
+                }
+            }
+        }
+
+        #endif
+    }
+
+    void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols)
+    {
+        void* counter_ptr;
+        cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, "cv::gpu::canny::counter") );
+        
+        unsigned int count;
+        cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
+
+        while (count > 0)
+        {
+            cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) );
+
+            dim3 block(128, 1, 1);
+            dim3 grid(min(count, 65535u), divUp(count, 65535), 1);
+            edgesHysteresisGlobal<<<grid, block>>>(map, st1, st2, rows, cols, count);
+            cudaSafeCall( cudaGetLastError() );
+
+            cudaSafeCall(cudaThreadSynchronize());
+
+            cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
+
+            swap(st1, st2);
+        }
+    }
+
+    __global__ void getEdges(PtrStepi map, PtrStep dst, int rows, int cols)
+    {
+        const int j = blockIdx.x * 16 + threadIdx.x;
+        const int i = blockIdx.y * 16 + threadIdx.y;
+
+        if (i < rows && j < cols)
+            dst.ptr(i)[j] = (uchar)(-(map.ptr(i + 1)[j + 1] >> 1));
+    }
+
+    void getEdges_gpu(PtrStepi map, PtrStep dst, int rows, int cols)
+    {
+        dim3 block(16, 16, 1);
+        dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
+
+        getEdges<<<grid, block>>>(map, dst, rows, cols);
+        cudaSafeCall( cudaGetLastError() );
+
+        cudaSafeCall(cudaThreadSynchronize());
+    }
+}}}