diff --git a/modules/imgproc/src/hough.cpp b/modules/imgproc/src/hough.cpp index 29ca551fb..386c7e948 100644 --- a/modules/imgproc/src/hough.cpp +++ b/modules/imgproc/src/hough.cpp @@ -652,7 +652,70 @@ HoughLinesProbabilistic( Mat& image, } } } +static bool ocl_makePointsList(InputArray _src, OutputArray _pointsList, InputOutputArray _counters) +{ + UMat src = _src.getUMat(); + _pointsList.create(1, (int) src.total(), CV_32SC1); + UMat pointsList = _pointsList.getUMat(); + UMat counters = _counters.getUMat(); + ocl::Device dev = ocl::Device::getDefault(); + const int pixelsPerWI = 16; + int workgroup_size = min((int) dev.maxWorkGroupSize(), (src.cols + pixelsPerWI - 1)/pixelsPerWI); + ocl::Kernel pointListKernel("make_point_list", ocl::imgproc::hough_lines_oclsrc, + format("-D MAKE_POINTS_LIST -D GROUP_SIZE=%d -D LOCAL_SIZE=%d", workgroup_size, src.cols)); + if (pointListKernel.empty()) + return false; + + pointListKernel.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnlyNoSize(pointsList), + ocl::KernelArg::PtrWriteOnly(counters)); + + size_t localThreads[2] = { workgroup_size, 1 }; + size_t globalThreads[2] = { workgroup_size, src.rows }; + + return pointListKernel.run(2, globalThreads, localThreads, false); +} + +static bool ocl_fillAccum(InputArray _pointsList, OutputArray _accum, int total_points, double rho, double theta, int numrho, int numangle) +{ + UMat pointsList = _pointsList.getUMat(); + _accum.create(numangle + 2, numrho + 2, CV_32SC1); + UMat accum = _accum.getUMat(); + ocl::Device dev = ocl::Device::getDefault(); + + float irho = (float) (1 / rho); + int workgroup_size = min((int) dev.maxWorkGroupSize(), total_points); + + ocl::Kernel fillAccumKernel; + size_t localThreads[2]; + size_t globalThreads[2]; + + int local_memory_needed = (numrho + 2)*sizeof(int); + if (local_memory_needed > dev.localMemSize()) + { + accum.setTo(Scalar::all(0)); + fillAccumKernel.create("fill_accum_global", ocl::imgproc::hough_lines_oclsrc, + format("-D FILL_ACCUM_GLOBAL")); + if (fillAccumKernel.empty()) + return false; + globalThreads[0] = workgroup_size; globalThreads[1] = numangle; + fillAccumKernel.args(ocl::KernelArg::ReadOnlyNoSize(pointsList), ocl::KernelArg::WriteOnly(accum), + total_points, irho, (float) theta, numrho, numangle); + return fillAccumKernel.run(2, globalThreads, NULL, false); + } + else + { + fillAccumKernel.create("fill_accum_local", ocl::imgproc::hough_lines_oclsrc, + format("-D FILL_ACCUM_LOCAL -D LOCAL_SIZE=%d -D BUFFER_SIZE=%d", workgroup_size, numrho + 2)); + if (fillAccumKernel.empty()) + return false; + localThreads[0] = workgroup_size; localThreads[1] = 1; + globalThreads[0] = workgroup_size; globalThreads[1] = numangle+2; + fillAccumKernel.args(ocl::KernelArg::ReadOnlyNoSize(pointsList), ocl::KernelArg::WriteOnly(accum), + total_points, irho, (float) theta, numrho, numangle); + return fillAccumKernel.run(2, globalThreads, localThreads, false); + } +} static bool ocl_HoughLines(InputArray _src, OutputArray _lines, double rho, double theta, int threshold, double min_theta, double max_theta) @@ -667,28 +730,13 @@ static bool ocl_HoughLines(InputArray _src, OutputArray _lines, double rho, doub } UMat src = _src.getUMat(); - - float irho = (float) (1 / rho); int numangle = cvRound((max_theta - min_theta) / theta); int numrho = cvRound(((src.cols + src.rows) * 2 + 1) / rho); - ocl::Device dev = ocl::Device::getDefault(); - - // make list of nonzero points - const int pixelsPerWI = 8; - int workgroup_size = min((int) dev.maxWorkGroupSize(), (src.cols + pixelsPerWI - 1)/pixelsPerWI); - ocl::Kernel pointListKernel("make_point_list", ocl::imgproc::hough_lines_oclsrc, - format("-D MAKE_POINTS_LIST -D GROUP_SIZE=%d -D LOCAL_SIZE=%d", workgroup_size, src.cols)); - if (pointListKernel.empty()) - return false; - - UMat pointsList(1, (int) src.total(), CV_32SC1); + + UMat pointsList; UMat counters(1, 2, CV_32SC1, Scalar::all(0)); - pointListKernel.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnlyNoSize(pointsList), - ocl::KernelArg::PtrWriteOnly(counters)); - size_t localThreads[2] = { workgroup_size, 1 }; - size_t globalThreads[2] = { workgroup_size, src.rows }; - if (!pointListKernel.run(2, globalThreads, localThreads, false)) + if (!ocl_makePointsList(src, pointsList, counters)) return false; int total_points = counters.getMat(ACCESS_READ).at(0, 0); @@ -698,34 +746,8 @@ static bool ocl_HoughLines(InputArray _src, OutputArray _lines, double rho, doub return true; } - // convert src image to hough space - UMat accum(numangle + 2, numrho + 2, CV_32SC1); - workgroup_size = min((int) dev.maxWorkGroupSize(), total_points); - ocl::Kernel fillAccumKernel; - size_t* fillAccumLT = NULL; - int local_memory_needed = (numrho + 2)*sizeof(int); - if (local_memory_needed > dev.localMemSize()) - { - accum.setTo(Scalar::all(0)); - fillAccumKernel.create("fill_accum_global", ocl::imgproc::hough_lines_oclsrc, - format("-D FILL_ACCUM_GLOBAL")); - globalThreads[0] = workgroup_size; globalThreads[1] = numangle; - } - else - { - fillAccumKernel.create("fill_accum_local", ocl::imgproc::hough_lines_oclsrc, - format("-D FILL_ACCUM_LOCAL -D LOCAL_SIZE=%d -D BUFFER_SIZE=%d", workgroup_size, numrho + 2)); - localThreads[0] = workgroup_size; localThreads[1] = 1; - globalThreads[0] = workgroup_size; globalThreads[1] = numangle+2; - fillAccumLT = localThreads; - } - if (fillAccumKernel.empty()) - return false; - - fillAccumKernel.args(ocl::KernelArg::ReadOnlyNoSize(pointsList), ocl::KernelArg::WriteOnly(accum), - total_points, irho, (float) theta, numrho, numangle); - - if (!fillAccumKernel.run(2, globalThreads, fillAccumLT, false)) + UMat accum; + if (!ocl_fillAccum(pointsList, accum, total_points, rho, theta, numrho, numangle)) return false; const int pixPerWI = 8; @@ -741,7 +763,7 @@ static bool ocl_HoughLines(InputArray _src, OutputArray _lines, double rho, doub getLinesKernel.args(ocl::KernelArg::ReadOnly(accum), ocl::KernelArg::WriteOnlyNoSize(lines), ocl::KernelArg::PtrWriteOnly(counters), linesMax, threshold, (float) rho, (float) theta); - globalThreads[0] = (numrho + pixPerWI - 1)/pixPerWI; globalThreads[1] = numangle; + size_t globalThreads[2] = { (numrho + pixPerWI - 1)/pixPerWI, numangle }; if (!getLinesKernel.run(2, globalThreads, NULL, false)) return false; @@ -753,13 +775,23 @@ static bool ocl_HoughLines(InputArray _src, OutputArray _lines, double rho, doub return true; } +static bool ocl_HoughLinesP(InputArray _src, OutputArray _lines, double rho, double theta, int threshold, + double minLineLength, double maxGap) +{ + CV_Assert(_src.type() == CV_8UC1); + + UMat src = _src.getUMat(); + + return false; +} + } void cv::HoughLines( InputArray _image, OutputArray _lines, double rho, double theta, int threshold, double srn, double stn, double min_theta, double max_theta ) { - CV_OCL_RUN(srn == 0 && stn == 0 && _lines.isUMat(), + CV_OCL_RUN(srn == 0 && stn == 0 && _image.isUMat() && _lines.isUMat(), ocl_HoughLines(_image, _lines, rho, theta, threshold, min_theta, max_theta)); Mat image = _image.getMat(); @@ -778,6 +810,8 @@ void cv::HoughLinesP(InputArray _image, OutputArray _lines, double rho, double theta, int threshold, double minLineLength, double maxGap ) { + CV_OCL_RUN(_image.isUMat() && _lines.isUMat(), ocl_HoughLinesP(_image, _lines, rho, theta, threshold, minLineLength, maxGap)); + Mat image = _image.getMat(); std::vector lines; HoughLinesProbabilistic(image, (float)rho, (float)theta, threshold, cvRound(minLineLength), cvRound(maxGap), lines, INT_MAX); diff --git a/modules/imgproc/src/opencl/hough_lines.cl b/modules/imgproc/src/opencl/hough_lines.cl index 25ebe4708..d402d3741 100644 --- a/modules/imgproc/src/opencl/hough_lines.cl +++ b/modules/imgproc/src/opencl/hough_lines.cl @@ -107,14 +107,13 @@ __kernel void fill_accum_local(__global const uchar * list_ptr, int list_step, i __global const int * list = (__global const int*)(list_ptr + list_offset); const int shift = (numrho - 1) / 2; - for (int i = count_idx; i < total_points; i += LOCAL_SIZE) { const int point = list[i]; const int x = (point & 0xFFFF); - const int y = (point >> 16) & 0xFFFF; + const int y = point >> 16; - int r = convert_int_rte(x * cosVal + y * sinVal) + shift; + int r = convert_int_rte(mad(x, cosVal, y * sinVal)) + shift; atomic_inc(l_accum + r + 1); }