fixed several bugs in CUDA Canny implementation:

* out of border access in edgesHysteresisLocalKernel
* incorrect usage of atomicAdd
This commit is contained in:
Vladislav Vinogradov 2014-02-07 16:04:29 +04:00
parent bfc27271e2
commit 5dbdadb769
2 changed files with 37 additions and 26 deletions

View File

@ -239,30 +239,35 @@ namespace canny
{
__device__ int counter = 0;
__global__ void edgesHysteresisLocalKernel(PtrStepSzi map, ushort2* st)
__device__ __forceinline__ bool checkIdx(int y, int x, int rows, int cols)
{
return (y >= 0) && (y < rows) && (x >= 0) && (x < cols);
}
__global__ void edgesHysteresisLocalKernel(PtrStepSzi map, short2* st)
{
__shared__ volatile int smem[18][18];
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
smem[threadIdx.y + 1][threadIdx.x + 1] = x < map.cols && y < map.rows ? map(y, x) : 0;
smem[threadIdx.y + 1][threadIdx.x + 1] = checkIdx(y, x, map.rows, map.cols) ? map(y, x) : 0;
if (threadIdx.y == 0)
smem[0][threadIdx.x + 1] = y > 0 ? map(y - 1, x) : 0;
smem[0][threadIdx.x + 1] = checkIdx(y - 1, x, map.rows, map.cols) ? map(y - 1, x) : 0;
if (threadIdx.y == blockDim.y - 1)
smem[blockDim.y + 1][threadIdx.x + 1] = y + 1 < map.rows ? map(y + 1, x) : 0;
smem[blockDim.y + 1][threadIdx.x + 1] = checkIdx(y + 1, x, map.rows, map.cols) ? map(y + 1, x) : 0;
if (threadIdx.x == 0)
smem[threadIdx.y + 1][0] = x > 0 ? map(y, x - 1) : 0;
smem[threadIdx.y + 1][0] = checkIdx(y, x - 1, map.rows, map.cols) ? map(y, x - 1) : 0;
if (threadIdx.x == blockDim.x - 1)
smem[threadIdx.y + 1][blockDim.x + 1] = x + 1 < map.cols ? map(y, x + 1) : 0;
smem[threadIdx.y + 1][blockDim.x + 1] = checkIdx(y, x + 1, map.rows, map.cols) ? map(y, x + 1) : 0;
if (threadIdx.x == 0 && threadIdx.y == 0)
smem[0][0] = y > 0 && x > 0 ? map(y - 1, x - 1) : 0;
smem[0][0] = checkIdx(y - 1, x - 1, map.rows, map.cols) ? map(y - 1, x - 1) : 0;
if (threadIdx.x == blockDim.x - 1 && threadIdx.y == 0)
smem[0][blockDim.x + 1] = y > 0 && x + 1 < map.cols ? map(y - 1, x + 1) : 0;
smem[0][blockDim.x + 1] = checkIdx(y - 1, x + 1, map.rows, map.cols) ? map(y - 1, x + 1) : 0;
if (threadIdx.x == 0 && threadIdx.y == blockDim.y - 1)
smem[blockDim.y + 1][0] = y + 1 < map.rows && x > 0 ? map(y + 1, x - 1) : 0;
smem[blockDim.y + 1][0] = checkIdx(y + 1, x - 1, map.rows, map.cols) ? map(y + 1, x - 1) : 0;
if (threadIdx.x == blockDim.x - 1 && threadIdx.y == blockDim.y - 1)
smem[blockDim.y + 1][blockDim.x + 1] = y + 1 < map.rows && x + 1 < map.cols ? map(y + 1, x + 1) : 0;
smem[blockDim.y + 1][blockDim.x + 1] = checkIdx(y + 1, x + 1, map.rows, map.cols) ? map(y + 1, x + 1) : 0;
__syncthreads();
@ -317,11 +322,11 @@ namespace canny
if (n > 0)
{
const int ind = ::atomicAdd(&counter, 1);
st[ind] = make_ushort2(x, y);
st[ind] = make_short2(x, y);
}
}
void edgesHysteresisLocal(PtrStepSzi map, ushort2* st1)
void edgesHysteresisLocal(PtrStepSzi map, short2* st1)
{
void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );
@ -345,13 +350,13 @@ namespace canny
__constant__ int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1};
__constant__ int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1};
__global__ void edgesHysteresisGlobalKernel(PtrStepSzi map, ushort2* st1, ushort2* st2, const int count)
__global__ void edgesHysteresisGlobalKernel(PtrStepSzi map, short2* st1, short2* st2, const int count)
{
const int stack_size = 512;
__shared__ int s_counter;
__shared__ int s_ind;
__shared__ ushort2 s_st[stack_size];
__shared__ short2 s_st[stack_size];
if (threadIdx.x == 0)
s_counter = 0;
@ -363,14 +368,14 @@ namespace canny
if (ind >= count)
return;
ushort2 pos = st1[ind];
short2 pos = st1[ind];
if (threadIdx.x < 8)
{
pos.x += c_dx[threadIdx.x];
pos.y += c_dy[threadIdx.x];
if (pos.x > 0 && pos.x < map.cols && pos.y > 0 && pos.y < map.rows && map(pos.y, pos.x) == 1)
if (pos.x > 0 && pos.x < map.cols - 1 && pos.y > 0 && pos.y < map.rows - 1 && map(pos.y, pos.x) == 1)
{
map(pos.y, pos.x) = 2;
@ -402,7 +407,7 @@ namespace canny
pos.x += c_dx[threadIdx.x & 7];
pos.y += c_dy[threadIdx.x & 7];
if (pos.x > 0 && pos.x < map.cols && pos.y > 0 && pos.y < map.rows && map(pos.y, pos.x) == 1)
if (pos.x > 0 && pos.x < map.cols - 1 && pos.y > 0 && pos.y < map.rows - 1 && map(pos.y, pos.x) == 1)
{
map(pos.y, pos.x) = 2;
@ -419,8 +424,10 @@ namespace canny
{
if (threadIdx.x == 0)
{
ind = ::atomicAdd(&counter, s_counter);
s_ind = ind - s_counter;
s_ind = ::atomicAdd(&counter, s_counter);
if (s_ind + s_counter > map.cols * map.rows)
s_counter = 0;
}
__syncthreads();
@ -432,7 +439,7 @@ namespace canny
}
}
void edgesHysteresisGlobal(PtrStepSzi map, ushort2* st1, ushort2* st2)
void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2)
{
void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, canny::counter) );
@ -454,6 +461,8 @@ namespace canny
cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
count = min(count, map.cols * map.rows);
std::swap(st1, st2);
}
}

View File

@ -1491,6 +1491,8 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result,
void cv::gpu::CannyBuf::create(const Size& image_size, int apperture_size)
{
CV_Assert(image_size.width < std::numeric_limits<short>::max() && image_size.height < std::numeric_limits<short>::max());
if (apperture_size > 0)
{
ensureSizeIsEnough(image_size, CV_32SC1, dx);
@ -1506,8 +1508,8 @@ void cv::gpu::CannyBuf::create(const Size& image_size, int apperture_size)
ensureSizeIsEnough(image_size, CV_32FC1, mag);
ensureSizeIsEnough(image_size, CV_32SC1, map);
ensureSizeIsEnough(1, image_size.area(), CV_16UC2, st1);
ensureSizeIsEnough(1, image_size.area(), CV_16UC2, st2);
ensureSizeIsEnough(1, image_size.area(), CV_16SC2, st1);
ensureSizeIsEnough(1, image_size.area(), CV_16SC2, st2);
}
void cv::gpu::CannyBuf::release()
@ -1527,9 +1529,9 @@ namespace canny
void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh);
void edgesHysteresisLocal(PtrStepSzi map, ushort2* st1);
void edgesHysteresisLocal(PtrStepSzi map, short2* st1);
void edgesHysteresisGlobal(PtrStepSzi map, ushort2* st1, ushort2* st2);
void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2);
void getEdges(PtrStepSzi map, PtrStepSzb dst);
}
@ -1543,9 +1545,9 @@ namespace
buf.map.setTo(Scalar::all(0));
calcMap(dx, dy, buf.mag, buf.map, low_thresh, high_thresh);
edgesHysteresisLocal(buf.map, buf.st1.ptr<ushort2>());
edgesHysteresisLocal(buf.map, buf.st1.ptr<short2>());
edgesHysteresisGlobal(buf.map, buf.st1.ptr<ushort2>(), buf.st2.ptr<ushort2>());
edgesHysteresisGlobal(buf.map, buf.st1.ptr<short2>(), buf.st2.ptr<short2>());
getEdges(buf.map, dst);
}