diff --git a/modules/ocl/include/opencv2/ocl/ocl.hpp b/modules/ocl/include/opencv2/ocl/ocl.hpp index 4f4289967..5ccab64cb 100644 --- a/modules/ocl/include/opencv2/ocl/ocl.hpp +++ b/modules/ocl/include/opencv2/ocl/ocl.hpp @@ -987,12 +987,12 @@ namespace cv struct CV_EXPORTS CannyBuf { - CannyBuf() : counter(NULL) {} + CannyBuf() : counter(1, 1, CV_32S) { } ~CannyBuf() { release(); } - explicit CannyBuf(const Size &image_size, int apperture_size = 3) : counter(NULL) + explicit CannyBuf(const Size &image_size, int apperture_size = 3) : counter(1, 1, CV_32S) { create(image_size, apperture_size); } @@ -1004,7 +1004,7 @@ namespace cv oclMat dx_buf, dy_buf; oclMat edgeBuf; oclMat trackBuf1, trackBuf2; - void *counter; + oclMat counter; Ptr filterDX, filterDY; }; diff --git a/modules/ocl/src/canny.cpp b/modules/ocl/src/canny.cpp index c41d802e5..e0d788bc0 100644 --- a/modules/ocl/src/canny.cpp +++ b/modules/ocl/src/canny.cpp @@ -49,7 +49,7 @@ using namespace cv; using namespace cv::ocl; -cv::ocl::CannyBuf::CannyBuf(const oclMat &dx_, const oclMat &dy_) : dx(dx_), dy(dy_), counter(NULL) +cv::ocl::CannyBuf::CannyBuf(const oclMat &dx_, const oclMat &dy_) : dx(dx_), dy(dy_), counter(1, 1, CV_32SC1) { CV_Assert(dx_.type() == CV_32SC1 && dy_.type() == CV_32SC1 && dx_.size() == dy_.size()); @@ -80,17 +80,8 @@ void cv::ocl::CannyBuf::create(const Size &image_size, int apperture_size) } ensureSizeIsEnough(2 * (image_size.height + 2), image_size.width + 2, CV_32FC1, edgeBuf); - ensureSizeIsEnough(1, image_size.width * image_size.height, CV_16UC2, trackBuf1); - ensureSizeIsEnough(1, image_size.width * image_size.height, CV_16UC2, trackBuf2); - - int counter_i [1] = { 0 }; - int err = 0; - if(counter) - { - openCLFree(counter); - } - counter = clCreateBuffer( *((cl_context*)getClContextPtr()), CL_MEM_COPY_HOST_PTR, sizeof(int), counter_i, &err ); - openCLSafeCall(err); + ensureSizeIsEnough(1, image_size.area(), CV_16UC2, trackBuf1); + ensureSizeIsEnough(1, image_size.area(), CV_16UC2, trackBuf2); } void cv::ocl::CannyBuf::release() @@ -102,7 +93,6 @@ void cv::ocl::CannyBuf::release() edgeBuf.release(); trackBuf1.release(); trackBuf2.release(); - openCLFree(counter); } namespace cv @@ -118,9 +108,9 @@ namespace cv void calcMap_gpu(oclMat &dx, oclMat &dy, oclMat &mag, oclMat &map, int rows, int cols, float low_thresh, float high_thresh); - void edgesHysteresisLocal_gpu(oclMat &map, oclMat &st1, void *counter, int rows, int cols); + void edgesHysteresisLocal_gpu(oclMat &map, oclMat &st1, oclMat& counter, int rows, int cols); - void edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, void *counter, int rows, int cols); + void edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, oclMat& counter, int rows, int cols); void getEdges_gpu(oclMat &map, oclMat &dst, int rows, int cols); } @@ -322,54 +312,61 @@ void canny::calcMap_gpu(oclMat &dx, oclMat &dy, oclMat &mag, oclMat &map, int ro openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1); } -void canny::edgesHysteresisLocal_gpu(oclMat &map, oclMat &st1, void *counter, int rows, int cols) +void canny::edgesHysteresisLocal_gpu(oclMat &map, oclMat &st1, oclMat& counter, int rows, int cols) { Context *clCxt = map.clCxt; - string kernelName = "edgesHysteresisLocal"; vector< pair > args; + Mat counterMat(counter.rows, counter.cols, counter.type()); + counterMat.at(0, 0) = 0; + counter.upload(counterMat); + args.push_back( make_pair( sizeof(cl_mem), (void *)&map.data)); args.push_back( make_pair( sizeof(cl_mem), (void *)&st1.data)); - args.push_back( make_pair( sizeof(cl_mem), (void *)&counter)); + args.push_back( make_pair( sizeof(cl_mem), (void *)&counter.data)); args.push_back( make_pair( sizeof(cl_int), (void *)&rows)); args.push_back( make_pair( sizeof(cl_int), (void *)&cols)); - args.push_back( make_pair( sizeof(cl_int), (void *)&map.step)); - args.push_back( make_pair( sizeof(cl_int), (void *)&map.offset)); + cl_int stepBytes = map.step; + args.push_back( make_pair( sizeof(cl_int), (void *)&stepBytes)); + cl_int offsetBytes = map.offset; + args.push_back( make_pair( sizeof(cl_int), (void *)&offsetBytes)); size_t globalThreads[3] = {cols, rows, 1}; size_t localThreads[3] = {16, 16, 1}; - openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1); + openCLExecuteKernel(clCxt, &imgproc_canny, "edgesHysteresisLocal", globalThreads, localThreads, args, -1, -1); } -void canny::edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, void *counter, int rows, int cols) +void canny::edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, oclMat& counter, int rows, int cols) { - unsigned int count; - openCLSafeCall(clEnqueueReadBuffer(*(cl_command_queue*)getClCommandQueuePtr(), (cl_mem)counter, 1, 0, sizeof(float), &count, 0, NULL, NULL)); Context *clCxt = map.clCxt; - string kernelName = "edgesHysteresisGlobal"; vector< pair > args; size_t localThreads[3] = {128, 1, 1}; - int count_i[1] = {0}; - while(count > 0) + while(1 > 0) { - openCLSafeCall(clEnqueueWriteBuffer(*(cl_command_queue*)getClCommandQueuePtr(), (cl_mem)counter, 1, 0, sizeof(int), &count_i, 0, NULL, NULL)); + Mat counterMat; counter.download(counterMat); + int count = counterMat.at(0, 0); + CV_Assert(count >= 0); + if (count == 0) + break; + + counterMat.at(0, 0) = 0; + counter.upload(counterMat); args.clear(); - size_t globalThreads[3] = {std::min(count, 65535u) * 128, divUp(count, 65535), 1}; + size_t globalThreads[3] = {std::min((unsigned)count, 65535u) * 128, divUp(count, 65535), 1}; args.push_back( make_pair( sizeof(cl_mem), (void *)&map.data)); args.push_back( make_pair( sizeof(cl_mem), (void *)&st1.data)); args.push_back( make_pair( sizeof(cl_mem), (void *)&st2.data)); - args.push_back( make_pair( sizeof(cl_mem), (void *)&counter)); + args.push_back( make_pair( sizeof(cl_mem), (void *)&counter.data)); args.push_back( make_pair( sizeof(cl_int), (void *)&rows)); args.push_back( make_pair( sizeof(cl_int), (void *)&cols)); args.push_back( make_pair( sizeof(cl_int), (void *)&count)); args.push_back( make_pair( sizeof(cl_int), (void *)&map.step)); args.push_back( make_pair( sizeof(cl_int), (void *)&map.offset)); - openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1); - openCLSafeCall(clEnqueueReadBuffer(*(cl_command_queue*)getClCommandQueuePtr(), (cl_mem)counter, 1, 0, sizeof(int), &count, 0, NULL, NULL)); + openCLExecuteKernel(clCxt, &imgproc_canny, "edgesHysteresisGlobal", globalThreads, localThreads, args, -1, -1); std::swap(st1, st2); } } diff --git a/modules/ocl/src/opencl/imgproc_canny.cl b/modules/ocl/src/opencl/imgproc_canny.cl index ca670b6db..884480658 100644 --- a/modules/ocl/src/opencl/imgproc_canny.cl +++ b/modules/ocl/src/opencl/imgproc_canny.cl @@ -374,6 +374,14 @@ calcMap #undef CANNY_SHIFT #undef TG22 +struct PtrStepSz { + __global int *ptr; + int step; + int rows, cols; +}; +inline int get(struct PtrStepSz data, int y, int x) { return *((__global int *)((__global char*)data.ptr + data.step * y + sizeof(int) * x)); } +inline void set(struct PtrStepSz data, int y, int x, int value) { *((__global int *)((__global char*)data.ptr + data.step * y + sizeof(int) * x)) = value; } + ////////////////////////////////////////////////////////////////////////////////////////// // do Hysteresis for pixel whose edge type is 1 // @@ -390,7 +398,7 @@ void __attribute__((reqd_work_group_size(16,16,1))) edgesHysteresisLocal ( - __global int * map, + __global int * map_ptr, __global ushort2 * st, __global unsigned int * counter, int rows, @@ -399,10 +407,11 @@ edgesHysteresisLocal int map_offset ) { +#if 0 map_step /= sizeof(*map); map_offset /= sizeof(*map); - map += map_offset; + const __global int* map = map_ptr + map_offset; __local int smem[18][18]; @@ -482,6 +491,92 @@ edgesHysteresisLocal st[ind] = (ushort2)(gidx + 1, gidy + 1); } } +#else + struct PtrStepSz map = {((__global int *)((__global char*)map_ptr + map_offset)), map_step, rows, cols}; + + __local int smem[18][18]; + + int2 blockIdx = (int2)(get_group_id(0), get_group_id(1)); + int2 blockDim = (int2)(get_local_size(0), get_local_size(1)); + int2 threadIdx = (int2)(get_local_id(0), get_local_id(1)); + + const int x = blockIdx.x * blockDim.x + threadIdx.x; + const int y = blockIdx.y * blockDim.y + threadIdx.y; + + smem[threadIdx.y + 1][threadIdx.x + 1] = x < map.cols && y < map.rows ? get(map, y, x) : 0; + if (threadIdx.y == 0) + smem[0][threadIdx.x + 1] = y > 0 ? get(map, y - 1, x) : 0; + if (threadIdx.y == blockDim.y - 1) + smem[blockDim.y + 1][threadIdx.x + 1] = y + 1 < map.rows ? get(map, y + 1, x) : 0; + if (threadIdx.x == 0) + smem[threadIdx.y + 1][0] = x > 0 ? get(map, y, x - 1) : 0; + if (threadIdx.x == blockDim.x - 1) + smem[threadIdx.y + 1][blockDim.x + 1] = x + 1 < map.cols ? get(map, y, x + 1) : 0; + if (threadIdx.x == 0 && threadIdx.y == 0) + smem[0][0] = y > 0 && x > 0 ? get(map, y - 1, x - 1) : 0; + if (threadIdx.x == blockDim.x - 1 && threadIdx.y == 0) + smem[0][blockDim.x + 1] = y > 0 && x + 1 < map.cols ? get(map, y - 1, x + 1) : 0; + if (threadIdx.x == 0 && threadIdx.y == blockDim.y - 1) + smem[blockDim.y + 1][0] = y + 1 < map.rows && x > 0 ? get(map, y + 1, x - 1) : 0; + if (threadIdx.x == blockDim.x - 1 && threadIdx.y == blockDim.y - 1) + smem[blockDim.y + 1][blockDim.x + 1] = y + 1 < map.rows && x + 1 < map.cols ? get(map, y + 1, x + 1) : 0; + + barrier(CLK_LOCAL_MEM_FENCE); + + if (x >= map.cols || y >= map.rows) + return; + + int n; + + #pragma unroll + for (int k = 0; k < 16; ++k) + { + n = 0; + + if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1) + { + n += smem[threadIdx.y ][threadIdx.x ] == 2; + n += smem[threadIdx.y ][threadIdx.x + 1] == 2; + n += smem[threadIdx.y ][threadIdx.x + 2] == 2; + + n += smem[threadIdx.y + 1][threadIdx.x ] == 2; + n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2; + + n += smem[threadIdx.y + 2][threadIdx.x ] == 2; + n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2; + n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2; + } + + if (n > 0) + smem[threadIdx.y + 1][threadIdx.x + 1] = 2; + } + + const int e = smem[threadIdx.y + 1][threadIdx.x + 1]; + + set(map, y, x, e); + + n = 0; + + if (e == 2) + { + n += smem[threadIdx.y ][threadIdx.x ] == 1; + n += smem[threadIdx.y ][threadIdx.x + 1] == 1; + n += smem[threadIdx.y ][threadIdx.x + 2] == 1; + + n += smem[threadIdx.y + 1][threadIdx.x ] == 1; + n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1; + + n += smem[threadIdx.y + 2][threadIdx.x ] == 1; + n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1; + n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1; + } + + if (n > 0) + { + const int ind = atomic_inc(counter); + st[ind] = (ushort2)(x, y); + } +#endif } __constant int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1};