ocl: Canny: port CUDA-based implementation of edgesHysteresisLocal

This commit is contained in:
Alexander Alekhin 2013-10-28 00:01:56 +04:00
parent 9e527fc9f4
commit eb4f50ca59
2 changed files with 110 additions and 12 deletions

View File

@ -80,8 +80,8 @@ void cv::ocl::CannyBuf::create(const Size &image_size, int apperture_size)
}
ensureSizeIsEnough(2 * (image_size.height + 2), image_size.width + 2, CV_32FC1, edgeBuf);
ensureSizeIsEnough(1, image_size.width * image_size.height, CV_16UC2, trackBuf1);
ensureSizeIsEnough(1, image_size.width * image_size.height, CV_16UC2, trackBuf2);
ensureSizeIsEnough(1, image_size.area(), CV_16UC2, trackBuf1);
ensureSizeIsEnough(1, image_size.area(), CV_16UC2, trackBuf2);
}
void cv::ocl::CannyBuf::release()
@ -315,33 +315,37 @@ void canny::calcMap_gpu(oclMat &dx, oclMat &dy, oclMat &mag, oclMat &map, int ro
void canny::edgesHysteresisLocal_gpu(oclMat &map, oclMat &st1, oclMat& counter, int rows, int cols)
{
Context *clCxt = map.clCxt;
string kernelName = "edgesHysteresisLocal";
vector< pair<size_t, const void *> > args;
Mat counterMat(counter.rows, counter.cols, counter.type());
counterMat.at<int>(0, 0) = 0;
counter.upload(counterMat);
args.push_back( make_pair( sizeof(cl_mem), (void *)&map.data));
args.push_back( make_pair( sizeof(cl_mem), (void *)&st1.data));
args.push_back( make_pair( sizeof(cl_mem), (void *)&counter.data));
args.push_back( make_pair( sizeof(cl_int), (void *)&rows));
args.push_back( make_pair( sizeof(cl_int), (void *)&cols));
args.push_back( make_pair( sizeof(cl_int), (void *)&map.step));
args.push_back( make_pair( sizeof(cl_int), (void *)&map.offset));
cl_int stepBytes = map.step;
args.push_back( make_pair( sizeof(cl_int), (void *)&stepBytes));
cl_int offsetBytes = map.offset;
args.push_back( make_pair( sizeof(cl_int), (void *)&offsetBytes));
size_t globalThreads[3] = {cols, rows, 1};
size_t localThreads[3] = {16, 16, 1};
openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
openCLExecuteKernel(clCxt, &imgproc_canny, "edgesHysteresisLocal", globalThreads, localThreads, args, -1, -1);
}
void canny::edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, oclMat& counter, int rows, int cols)
{
Mat counterMat; counter.download(counterMat);
Context *clCxt = map.clCxt;
string kernelName = "edgesHysteresisGlobal";
vector< pair<size_t, const void *> > args;
size_t localThreads[3] = {128, 1, 1};
while(1 > 0)
{
Mat counterMat; counter.download(counterMat);
int count = counterMat.at<int>(0, 0);
CV_Assert(count >= 0);
if (count == 0)
@ -362,8 +366,7 @@ void canny::edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, ocl
args.push_back( make_pair( sizeof(cl_int), (void *)&map.step));
args.push_back( make_pair( sizeof(cl_int), (void *)&map.offset));
openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
counter.download(counterMat);
openCLExecuteKernel(clCxt, &imgproc_canny, "edgesHysteresisGlobal", globalThreads, localThreads, args, -1, -1);
std::swap(st1, st2);
}
}

View File

@ -374,6 +374,14 @@ calcMap
#undef CANNY_SHIFT
#undef TG22
struct PtrStepSz {
__global int *ptr;
int step;
int rows, cols;
};
inline int get(struct PtrStepSz data, int y, int x) { return *((__global int *)((__global char*)data.ptr + data.step * y + sizeof(int) * x)); }
inline void set(struct PtrStepSz data, int y, int x, int value) { *((__global int *)((__global char*)data.ptr + data.step * y + sizeof(int) * x)) = value; }
//////////////////////////////////////////////////////////////////////////////////////////
// do Hysteresis for pixel whose edge type is 1
//
@ -390,7 +398,7 @@ void
__attribute__((reqd_work_group_size(16,16,1)))
edgesHysteresisLocal
(
__global int * map,
__global int * map_ptr,
__global ushort2 * st,
__global unsigned int * counter,
int rows,
@ -399,10 +407,11 @@ edgesHysteresisLocal
int map_offset
)
{
#if 0
map_step /= sizeof(*map);
map_offset /= sizeof(*map);
map += map_offset;
const __global int* map = map_ptr + map_offset;
__local int smem[18][18];
@ -482,6 +491,92 @@ edgesHysteresisLocal
st[ind] = (ushort2)(gidx + 1, gidy + 1);
}
}
#else
struct PtrStepSz map = {((__global int *)((__global char*)map_ptr + map_offset)), map_step, rows, cols};
__local int smem[18][18];
int2 blockIdx = (int2)(get_group_id(0), get_group_id(1));
int2 blockDim = (int2)(get_local_size(0), get_local_size(1));
int2 threadIdx = (int2)(get_local_id(0), get_local_id(1));
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
smem[threadIdx.y + 1][threadIdx.x + 1] = x < map.cols && y < map.rows ? get(map, y, x) : 0;
if (threadIdx.y == 0)
smem[0][threadIdx.x + 1] = y > 0 ? get(map, y - 1, x) : 0;
if (threadIdx.y == blockDim.y - 1)
smem[blockDim.y + 1][threadIdx.x + 1] = y + 1 < map.rows ? get(map, y + 1, x) : 0;
if (threadIdx.x == 0)
smem[threadIdx.y + 1][0] = x > 0 ? get(map, y, x - 1) : 0;
if (threadIdx.x == blockDim.x - 1)
smem[threadIdx.y + 1][blockDim.x + 1] = x + 1 < map.cols ? get(map, y, x + 1) : 0;
if (threadIdx.x == 0 && threadIdx.y == 0)
smem[0][0] = y > 0 && x > 0 ? get(map, y - 1, x - 1) : 0;
if (threadIdx.x == blockDim.x - 1 && threadIdx.y == 0)
smem[0][blockDim.x + 1] = y > 0 && x + 1 < map.cols ? get(map, y - 1, x + 1) : 0;
if (threadIdx.x == 0 && threadIdx.y == blockDim.y - 1)
smem[blockDim.y + 1][0] = y + 1 < map.rows && x > 0 ? get(map, y + 1, x - 1) : 0;
if (threadIdx.x == blockDim.x - 1 && threadIdx.y == blockDim.y - 1)
smem[blockDim.y + 1][blockDim.x + 1] = y + 1 < map.rows && x + 1 < map.cols ? get(map, y + 1, x + 1) : 0;
barrier(CLK_LOCAL_MEM_FENCE);
if (x >= map.cols || y >= map.rows)
return;
int n;
#pragma unroll
for (int k = 0; k < 16; ++k)
{
n = 0;
if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1)
{
n += smem[threadIdx.y ][threadIdx.x ] == 2;
n += smem[threadIdx.y ][threadIdx.x + 1] == 2;
n += smem[threadIdx.y ][threadIdx.x + 2] == 2;
n += smem[threadIdx.y + 1][threadIdx.x ] == 2;
n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2;
n += smem[threadIdx.y + 2][threadIdx.x ] == 2;
n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2;
n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2;
}
if (n > 0)
smem[threadIdx.y + 1][threadIdx.x + 1] = 2;
}
const int e = smem[threadIdx.y + 1][threadIdx.x + 1];
set(map, y, x, e);
n = 0;
if (e == 2)
{
n += smem[threadIdx.y ][threadIdx.x ] == 1;
n += smem[threadIdx.y ][threadIdx.x + 1] == 1;
n += smem[threadIdx.y ][threadIdx.x + 2] == 1;
n += smem[threadIdx.y + 1][threadIdx.x ] == 1;
n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1;
n += smem[threadIdx.y + 2][threadIdx.x ] == 1;
n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1;
n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1;
}
if (n > 0)
{
const int ind = atomic_inc(counter);
st[ind] = (ushort2)(x, y);
}
#endif
}
__constant int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1};