added gpu::Canny function
This commit is contained in:
489
modules/gpu/src/cuda/canny.cu
Normal file
489
modules/gpu/src/cuda/canny.cu
Normal file
@@ -0,0 +1,489 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#include "internal_shared.hpp"
|
||||
#include "opencv2/gpu/device/utility.hpp"
|
||||
|
||||
using namespace cv::gpu;
|
||||
using namespace cv::gpu::device;
|
||||
|
||||
namespace cv { namespace gpu { namespace canny
|
||||
{
|
||||
__global__ void calcSobelRowPass(PtrStep src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
|
||||
{
|
||||
__shared__ int smem[16][18];
|
||||
|
||||
const int j = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int i = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
if (i < rows)
|
||||
{
|
||||
smem[threadIdx.y][threadIdx.x + 1] = src.ptr(i)[j];
|
||||
if (threadIdx.x == 0)
|
||||
{
|
||||
smem[threadIdx.y][0] = src.ptr(i)[max(j - 1, 0)];
|
||||
smem[threadIdx.y][17] = src.ptr(i)[min(j + 16, cols - 1)];
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if (j < cols)
|
||||
{
|
||||
dx_buf.ptr(i)[j] = -smem[threadIdx.y][threadIdx.x] + smem[threadIdx.y][threadIdx.x + 2];
|
||||
dy_buf.ptr(i)[j] = smem[threadIdx.y][threadIdx.x] + 2 * smem[threadIdx.y][threadIdx.x + 1] + smem[threadIdx.y][threadIdx.x + 2];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void calcSobelRowPass_gpu(PtrStep src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
|
||||
{
|
||||
dim3 block(16, 16, 1);
|
||||
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
|
||||
|
||||
calcSobelRowPass<<<grid, block>>>(src, dx_buf, dy_buf, rows, cols);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall(cudaThreadSynchronize());
|
||||
}
|
||||
|
||||
struct L1
|
||||
{
|
||||
static __device__ __forceinline__ float calc(int x, int y)
|
||||
{
|
||||
return abs(x) + abs(y);
|
||||
}
|
||||
};
|
||||
struct L2
|
||||
{
|
||||
static __device__ __forceinline__ float calc(int x, int y)
|
||||
{
|
||||
return sqrtf(x * x + y * y);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Norm> __global__ void calcMagnitude(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
|
||||
{
|
||||
__shared__ int sdx[18][16];
|
||||
__shared__ int sdy[18][16];
|
||||
|
||||
const int j = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int i = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
if (j < cols)
|
||||
{
|
||||
sdx[threadIdx.y + 1][threadIdx.x] = dx_buf.ptr(i)[j];
|
||||
sdy[threadIdx.y + 1][threadIdx.x] = dy_buf.ptr(i)[j];
|
||||
if (threadIdx.y == 0)
|
||||
{
|
||||
sdx[0][threadIdx.x] = dx_buf.ptr(max(i - 1, 0))[j];
|
||||
sdx[17][threadIdx.x] = dx_buf.ptr(min(i + 16, rows - 1))[j];
|
||||
|
||||
sdy[0][threadIdx.x] = dy_buf.ptr(max(i - 1, 0))[j];
|
||||
sdy[17][threadIdx.x] = dy_buf.ptr(min(i + 16, rows - 1))[j];
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if (i < rows)
|
||||
{
|
||||
int x = sdx[threadIdx.y][threadIdx.x] + 2 * sdx[threadIdx.y + 1][threadIdx.x] + sdx[threadIdx.y + 2][threadIdx.x];
|
||||
int y = -sdy[threadIdx.y][threadIdx.x] + sdy[threadIdx.y + 2][threadIdx.x];
|
||||
|
||||
dx.ptr(i)[j] = x;
|
||||
dy.ptr(i)[j] = y;
|
||||
|
||||
mag.ptr(i + 1)[j + 1] = Norm::calc(x, y);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void calcMagnitude_gpu(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)
|
||||
{
|
||||
dim3 block(16, 16, 1);
|
||||
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
|
||||
|
||||
if (L2Grad)
|
||||
calcMagnitude<L2><<<grid, block>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols);
|
||||
else
|
||||
calcMagnitude<L1><<<grid, block>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols);
|
||||
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall(cudaThreadSynchronize());
|
||||
}
|
||||
|
||||
template <typename Norm> __global__ void calcMagnitude(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
|
||||
{
|
||||
const int j = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int i = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
if (i < rows && j < cols)
|
||||
mag.ptr(i + 1)[j + 1] = Norm::calc(dx.ptr(i)[j], dy.ptr(i)[j]);
|
||||
}
|
||||
|
||||
void calcMagnitude_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)
|
||||
{
|
||||
dim3 block(16, 16, 1);
|
||||
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
|
||||
|
||||
if (L2Grad)
|
||||
calcMagnitude<L2><<<grid, block>>>(dx, dy, mag, rows, cols);
|
||||
else
|
||||
calcMagnitude<L1><<<grid, block>>>(dx, dy, mag, rows, cols);
|
||||
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall(cudaThreadSynchronize());
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#define CANNY_SHIFT 15
|
||||
#define TG22 (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5)
|
||||
|
||||
__global__ void calcMap(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)
|
||||
{
|
||||
__shared__ float smem[18][18];
|
||||
|
||||
const int j = blockIdx.x * 16 + threadIdx.x;
|
||||
const int i = blockIdx.y * 16 + threadIdx.y;
|
||||
|
||||
const int tid = threadIdx.y * 16 + threadIdx.x;
|
||||
const int lx = tid % 18;
|
||||
const int ly = tid / 18;
|
||||
|
||||
if (ly < 14)
|
||||
smem[ly][lx] = mag.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx];
|
||||
|
||||
if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols)
|
||||
smem[ly + 14][lx] = mag.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx];
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (i < rows && j < cols)
|
||||
{
|
||||
int x = dx.ptr(i)[j];
|
||||
int y = dy.ptr(i)[j];
|
||||
const int s = (x ^ y) < 0 ? -1 : 1;
|
||||
const float m = smem[threadIdx.y + 1][threadIdx.x + 1];
|
||||
|
||||
x = abs(x);
|
||||
y = abs(y);
|
||||
|
||||
// 0 - the pixel can not belong to an edge
|
||||
// 1 - the pixel might belong to an edge
|
||||
// 2 - the pixel does belong to an edge
|
||||
int edge_type = 0;
|
||||
|
||||
if (m > low_thresh)
|
||||
{
|
||||
const int tg22x = x * TG22;
|
||||
const int tg67x = tg22x + ((x + x) << CANNY_SHIFT);
|
||||
|
||||
y <<= CANNY_SHIFT;
|
||||
|
||||
if (y < tg22x)
|
||||
{
|
||||
if (m > smem[threadIdx.y + 1][threadIdx.x] && m >= smem[threadIdx.y + 1][threadIdx.x + 2])
|
||||
edge_type = 1 + (int)(m > high_thresh);
|
||||
}
|
||||
else if( y > tg67x )
|
||||
{
|
||||
if (m > smem[threadIdx.y][threadIdx.x + 1] && m >= smem[threadIdx.y + 2][threadIdx.x + 1])
|
||||
edge_type = 1 + (int)(m > high_thresh);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (m > smem[threadIdx.y][threadIdx.x + 1 - s] && m > smem[threadIdx.y + 2][threadIdx.x + 1 + s])
|
||||
edge_type = 1 + (int)(m > high_thresh);
|
||||
}
|
||||
}
|
||||
|
||||
map.ptr(i + 1)[j + 1] = edge_type;
|
||||
}
|
||||
}
|
||||
|
||||
#undef CANNY_SHIFT
|
||||
#undef TG22
|
||||
|
||||
void calcMap_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)
|
||||
{
|
||||
dim3 block(16, 16, 1);
|
||||
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
|
||||
|
||||
calcMap<<<grid, block>>>(dx, dy, mag, map, rows, cols, low_thresh, high_thresh);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall(cudaThreadSynchronize());
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
__device__ unsigned int counter = 0;
|
||||
|
||||
__global__ void edgesHysteresisLocal(PtrStepi map, ushort2* st, int rows, int cols)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 120
|
||||
|
||||
__shared__ int smem[18][18];
|
||||
|
||||
const int j = blockIdx.x * 16 + threadIdx.x;
|
||||
const int i = blockIdx.y * 16 + threadIdx.y;
|
||||
|
||||
const int tid = threadIdx.y * 16 + threadIdx.x;
|
||||
const int lx = tid % 18;
|
||||
const int ly = tid / 18;
|
||||
|
||||
if (ly < 14)
|
||||
smem[ly][lx] = map.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx];
|
||||
|
||||
if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols)
|
||||
smem[ly + 14][lx] = map.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx];
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (i < rows && j < cols)
|
||||
{
|
||||
int n;
|
||||
|
||||
#pragma unroll
|
||||
for (int k = 0; k < 16; ++k)
|
||||
{
|
||||
n = 0;
|
||||
|
||||
if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1)
|
||||
{
|
||||
n += smem[threadIdx.y ][threadIdx.x ] == 2;
|
||||
n += smem[threadIdx.y ][threadIdx.x + 1] == 2;
|
||||
n += smem[threadIdx.y ][threadIdx.x + 2] == 2;
|
||||
|
||||
n += smem[threadIdx.y + 1][threadIdx.x ] == 2;
|
||||
n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2;
|
||||
|
||||
n += smem[threadIdx.y + 2][threadIdx.x ] == 2;
|
||||
n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2;
|
||||
n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2;
|
||||
}
|
||||
|
||||
if (n > 0)
|
||||
smem[threadIdx.y + 1][threadIdx.x + 1] = 2;
|
||||
}
|
||||
|
||||
const int e = smem[threadIdx.y + 1][threadIdx.x + 1];
|
||||
|
||||
map.ptr(i + 1)[j + 1] = e;
|
||||
|
||||
n = 0;
|
||||
|
||||
if (e == 2)
|
||||
{
|
||||
n += smem[threadIdx.y ][threadIdx.x ] == 1;
|
||||
n += smem[threadIdx.y ][threadIdx.x + 1] == 1;
|
||||
n += smem[threadIdx.y ][threadIdx.x + 2] == 1;
|
||||
|
||||
n += smem[threadIdx.y + 1][threadIdx.x ] == 1;
|
||||
n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1;
|
||||
|
||||
n += smem[threadIdx.y + 2][threadIdx.x ] == 1;
|
||||
n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1;
|
||||
n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1;
|
||||
}
|
||||
|
||||
if (n > 0)
|
||||
{
|
||||
const unsigned int ind = atomicInc(&counter, (unsigned int)(-1));
|
||||
st[ind] = make_ushort2(j + 1, i + 1);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
void edgesHysteresisLocal_gpu(PtrStepi map, ushort2* st1, int rows, int cols)
|
||||
{
|
||||
dim3 block(16, 16, 1);
|
||||
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
|
||||
|
||||
edgesHysteresisLocal<<<grid, block>>>(map, st1, rows, cols);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall(cudaThreadSynchronize());
|
||||
}
|
||||
|
||||
__constant__ int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1};
|
||||
__constant__ int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1};
|
||||
|
||||
__global__ void edgesHysteresisGlobal(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols, int count)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 120
|
||||
|
||||
const int stack_size = 512;
|
||||
|
||||
__shared__ unsigned int s_counter;
|
||||
__shared__ unsigned int s_ind;
|
||||
__shared__ ushort2 s_st[stack_size];
|
||||
|
||||
if (threadIdx.x == 0)
|
||||
s_counter = 0;
|
||||
__syncthreads();
|
||||
|
||||
int ind = blockIdx.y * gridDim.x + blockIdx.x;
|
||||
|
||||
if (ind < count)
|
||||
{
|
||||
ushort2 pos = st1[ind];
|
||||
|
||||
if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
|
||||
{
|
||||
if (threadIdx.x < 8)
|
||||
{
|
||||
pos.x += c_dx[threadIdx.x];
|
||||
pos.y += c_dy[threadIdx.x];
|
||||
|
||||
if (map.ptr(pos.y)[pos.x] == 1)
|
||||
{
|
||||
map.ptr(pos.y)[pos.x] = 2;
|
||||
|
||||
ind = atomicInc(&s_counter, (unsigned int)(-1));
|
||||
|
||||
s_st[ind] = pos;
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
while (s_counter > 0 && s_counter <= stack_size - blockDim.x)
|
||||
{
|
||||
const int subTaskIdx = threadIdx.x >> 3;
|
||||
const int portion = min(s_counter, blockDim.x >> 3);
|
||||
|
||||
pos.x = pos.y = 0;
|
||||
|
||||
if (subTaskIdx < portion)
|
||||
pos = s_st[s_counter - 1 - subTaskIdx];
|
||||
__syncthreads();
|
||||
|
||||
if (threadIdx.x == 0)
|
||||
s_counter -= portion;
|
||||
__syncthreads();
|
||||
|
||||
if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
|
||||
{
|
||||
pos.x += c_dx[threadIdx.x & 7];
|
||||
pos.y += c_dy[threadIdx.x & 7];
|
||||
|
||||
if (map.ptr(pos.y)[pos.x] == 1)
|
||||
{
|
||||
map.ptr(pos.y)[pos.x] = 2;
|
||||
|
||||
ind = atomicInc(&s_counter, (unsigned int)(-1));
|
||||
|
||||
s_st[ind] = pos;
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if (s_counter > 0)
|
||||
{
|
||||
if (threadIdx.x == 0)
|
||||
{
|
||||
ind = atomicAdd(&counter, s_counter);
|
||||
s_ind = ind - s_counter;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
ind = s_ind;
|
||||
|
||||
for (int i = threadIdx.x; i < s_counter; i += blockDim.x)
|
||||
{
|
||||
st2[ind + i] = s_st[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols)
|
||||
{
|
||||
void* counter_ptr;
|
||||
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, "cv::gpu::canny::counter") );
|
||||
|
||||
unsigned int count;
|
||||
cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
|
||||
|
||||
while (count > 0)
|
||||
{
|
||||
cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) );
|
||||
|
||||
dim3 block(128, 1, 1);
|
||||
dim3 grid(min(count, 65535u), divUp(count, 65535), 1);
|
||||
edgesHysteresisGlobal<<<grid, block>>>(map, st1, st2, rows, cols, count);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall(cudaThreadSynchronize());
|
||||
|
||||
cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
|
||||
|
||||
swap(st1, st2);
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void getEdges(PtrStepi map, PtrStep dst, int rows, int cols)
|
||||
{
|
||||
const int j = blockIdx.x * 16 + threadIdx.x;
|
||||
const int i = blockIdx.y * 16 + threadIdx.y;
|
||||
|
||||
if (i < rows && j < cols)
|
||||
dst.ptr(i)[j] = (uchar)(-(map.ptr(i + 1)[j + 1] >> 1));
|
||||
}
|
||||
|
||||
void getEdges_gpu(PtrStepi map, PtrStep dst, int rows, int cols)
|
||||
{
|
||||
dim3 block(16, 16, 1);
|
||||
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
|
||||
|
||||
getEdges<<<grid, block>>>(map, dst, rows, cols);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall(cudaThreadSynchronize());
|
||||
}
|
||||
}}}
|
||||
Reference in New Issue
Block a user