From 5170f0b5da76163d5734f77a166844b0fb14921f Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Fri, 7 Feb 2014 16:04:29 +0400 Subject: [PATCH] fixed several bugs in CUDA Canny implementation: * out of border access in edgesHysteresisLocalKernel * incorrect usage of atomicAdd(cherry picked from commit 5dbdadb769e97f47b64655b5b3144787c57e2740) --- modules/gpu/src/cuda/canny.cu | 49 +++++++++++++++++++++-------------- modules/gpu/src/imgproc.cpp | 14 +++++----- 2 files changed, 37 insertions(+), 26 deletions(-) diff --git a/modules/gpu/src/cuda/canny.cu b/modules/gpu/src/cuda/canny.cu index aab922f22..2ab260ec2 100644 --- a/modules/gpu/src/cuda/canny.cu +++ b/modules/gpu/src/cuda/canny.cu @@ -239,30 +239,35 @@ namespace canny { __device__ int counter = 0; - __global__ void edgesHysteresisLocalKernel(PtrStepSzi map, ushort2* st) + __device__ __forceinline__ bool checkIdx(int y, int x, int rows, int cols) + { + return (y >= 0) && (y < rows) && (x >= 0) && (x < cols); + } + + __global__ void edgesHysteresisLocalKernel(PtrStepSzi map, short2* st) { __shared__ volatile int smem[18][18]; const int x = blockIdx.x * blockDim.x + threadIdx.x; const int y = blockIdx.y * blockDim.y + threadIdx.y; - smem[threadIdx.y + 1][threadIdx.x + 1] = x < map.cols && y < map.rows ? map(y, x) : 0; + smem[threadIdx.y + 1][threadIdx.x + 1] = checkIdx(y, x, map.rows, map.cols) ? map(y, x) : 0; if (threadIdx.y == 0) - smem[0][threadIdx.x + 1] = y > 0 ? map(y - 1, x) : 0; + smem[0][threadIdx.x + 1] = checkIdx(y - 1, x, map.rows, map.cols) ? map(y - 1, x) : 0; if (threadIdx.y == blockDim.y - 1) - smem[blockDim.y + 1][threadIdx.x + 1] = y + 1 < map.rows ? map(y + 1, x) : 0; + smem[blockDim.y + 1][threadIdx.x + 1] = checkIdx(y + 1, x, map.rows, map.cols) ? map(y + 1, x) : 0; if (threadIdx.x == 0) - smem[threadIdx.y + 1][0] = x > 0 ? map(y, x - 1) : 0; + smem[threadIdx.y + 1][0] = checkIdx(y, x - 1, map.rows, map.cols) ? map(y, x - 1) : 0; if (threadIdx.x == blockDim.x - 1) - smem[threadIdx.y + 1][blockDim.x + 1] = x + 1 < map.cols ? map(y, x + 1) : 0; + smem[threadIdx.y + 1][blockDim.x + 1] = checkIdx(y, x + 1, map.rows, map.cols) ? map(y, x + 1) : 0; if (threadIdx.x == 0 && threadIdx.y == 0) - smem[0][0] = y > 0 && x > 0 ? map(y - 1, x - 1) : 0; + smem[0][0] = checkIdx(y - 1, x - 1, map.rows, map.cols) ? map(y - 1, x - 1) : 0; if (threadIdx.x == blockDim.x - 1 && threadIdx.y == 0) - smem[0][blockDim.x + 1] = y > 0 && x + 1 < map.cols ? map(y - 1, x + 1) : 0; + smem[0][blockDim.x + 1] = checkIdx(y - 1, x + 1, map.rows, map.cols) ? map(y - 1, x + 1) : 0; if (threadIdx.x == 0 && threadIdx.y == blockDim.y - 1) - smem[blockDim.y + 1][0] = y + 1 < map.rows && x > 0 ? map(y + 1, x - 1) : 0; + smem[blockDim.y + 1][0] = checkIdx(y + 1, x - 1, map.rows, map.cols) ? map(y + 1, x - 1) : 0; if (threadIdx.x == blockDim.x - 1 && threadIdx.y == blockDim.y - 1) - smem[blockDim.y + 1][blockDim.x + 1] = y + 1 < map.rows && x + 1 < map.cols ? map(y + 1, x + 1) : 0; + smem[blockDim.y + 1][blockDim.x + 1] = checkIdx(y + 1, x + 1, map.rows, map.cols) ? map(y + 1, x + 1) : 0; __syncthreads(); @@ -317,11 +322,11 @@ namespace canny if (n > 0) { const int ind = ::atomicAdd(&counter, 1); - st[ind] = make_ushort2(x, y); + st[ind] = make_short2(x, y); } } - void edgesHysteresisLocal(PtrStepSzi map, ushort2* st1) + void edgesHysteresisLocal(PtrStepSzi map, short2* st1) { void* counter_ptr; cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) ); @@ -345,13 +350,13 @@ namespace canny __constant__ int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1}; __constant__ int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1}; - __global__ void edgesHysteresisGlobalKernel(PtrStepSzi map, ushort2* st1, ushort2* st2, const int count) + __global__ void edgesHysteresisGlobalKernel(PtrStepSzi map, short2* st1, short2* st2, const int count) { const int stack_size = 512; __shared__ int s_counter; __shared__ int s_ind; - __shared__ ushort2 s_st[stack_size]; + __shared__ short2 s_st[stack_size]; if (threadIdx.x == 0) s_counter = 0; @@ -363,14 +368,14 @@ namespace canny if (ind >= count) return; - ushort2 pos = st1[ind]; + short2 pos = st1[ind]; if (threadIdx.x < 8) { pos.x += c_dx[threadIdx.x]; pos.y += c_dy[threadIdx.x]; - if (pos.x > 0 && pos.x < map.cols && pos.y > 0 && pos.y < map.rows && map(pos.y, pos.x) == 1) + if (pos.x > 0 && pos.x < map.cols - 1 && pos.y > 0 && pos.y < map.rows - 1 && map(pos.y, pos.x) == 1) { map(pos.y, pos.x) = 2; @@ -402,7 +407,7 @@ namespace canny pos.x += c_dx[threadIdx.x & 7]; pos.y += c_dy[threadIdx.x & 7]; - if (pos.x > 0 && pos.x < map.cols && pos.y > 0 && pos.y < map.rows && map(pos.y, pos.x) == 1) + if (pos.x > 0 && pos.x < map.cols - 1 && pos.y > 0 && pos.y < map.rows - 1 && map(pos.y, pos.x) == 1) { map(pos.y, pos.x) = 2; @@ -419,8 +424,10 @@ namespace canny { if (threadIdx.x == 0) { - ind = ::atomicAdd(&counter, s_counter); - s_ind = ind - s_counter; + s_ind = ::atomicAdd(&counter, s_counter); + + if (s_ind + s_counter > map.cols * map.rows) + s_counter = 0; } __syncthreads(); @@ -432,7 +439,7 @@ namespace canny } } - void edgesHysteresisGlobal(PtrStepSzi map, ushort2* st1, ushort2* st2) + void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2) { void* counter_ptr; cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, canny::counter) ); @@ -454,6 +461,8 @@ namespace canny cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) ); + count = min(count, map.cols * map.rows); + std::swap(st1, st2); } } diff --git a/modules/gpu/src/imgproc.cpp b/modules/gpu/src/imgproc.cpp index 97adb685f..66f838f77 100644 --- a/modules/gpu/src/imgproc.cpp +++ b/modules/gpu/src/imgproc.cpp @@ -1491,6 +1491,8 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, void cv::gpu::CannyBuf::create(const Size& image_size, int apperture_size) { + CV_Assert(image_size.width < std::numeric_limits::max() && image_size.height < std::numeric_limits::max()); + if (apperture_size > 0) { ensureSizeIsEnough(image_size, CV_32SC1, dx); @@ -1506,8 +1508,8 @@ void cv::gpu::CannyBuf::create(const Size& image_size, int apperture_size) ensureSizeIsEnough(image_size, CV_32FC1, mag); ensureSizeIsEnough(image_size, CV_32SC1, map); - ensureSizeIsEnough(1, image_size.area(), CV_16UC2, st1); - ensureSizeIsEnough(1, image_size.area(), CV_16UC2, st2); + ensureSizeIsEnough(1, image_size.area(), CV_16SC2, st1); + ensureSizeIsEnough(1, image_size.area(), CV_16SC2, st2); } void cv::gpu::CannyBuf::release() @@ -1527,9 +1529,9 @@ namespace canny void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh); - void edgesHysteresisLocal(PtrStepSzi map, ushort2* st1); + void edgesHysteresisLocal(PtrStepSzi map, short2* st1); - void edgesHysteresisGlobal(PtrStepSzi map, ushort2* st1, ushort2* st2); + void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2); void getEdges(PtrStepSzi map, PtrStepSzb dst); } @@ -1543,9 +1545,9 @@ namespace buf.map.setTo(Scalar::all(0)); calcMap(dx, dy, buf.mag, buf.map, low_thresh, high_thresh); - edgesHysteresisLocal(buf.map, buf.st1.ptr()); + edgesHysteresisLocal(buf.map, buf.st1.ptr()); - edgesHysteresisGlobal(buf.map, buf.st1.ptr(), buf.st2.ptr()); + edgesHysteresisGlobal(buf.map, buf.st1.ptr(), buf.st2.ptr()); getEdges(buf.map, dst); }