From 3703722a7215e2ed371aeb758e4ea9a5354488f9 Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Mon, 13 Aug 2012 17:44:23 +0400 Subject: [PATCH 01/12] first naive version --- modules/gpu/include/opencv2/gpu/gpu.hpp | 6 + modules/gpu/src/cuda/hough.cu | 156 ++++++++++++++++++++++++ modules/gpu/src/hough.cpp | 105 ++++++++++++++++ 3 files changed, 267 insertions(+) create mode 100644 modules/gpu/src/cuda/hough.cu create mode 100644 modules/gpu/src/hough.cpp diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp index ca9ad8988..cb2e68872 100644 --- a/modules/gpu/include/opencv2/gpu/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu/gpu.hpp @@ -820,6 +820,12 @@ private: int nLayers_; }; +CV_EXPORTS void HoughLines(const GpuMat& src, GpuMat& lines, float rho, float theta, int threshold, bool doSort = false, int maxLines = 4096); +CV_EXPORTS void HoughLines(const GpuMat& src, GpuMat& lines, GpuMat& accum, float rho, float theta, int threshold, bool doSort = false, int maxLines = 4096); +CV_EXPORTS void HoughLinesTransform(const GpuMat& src, GpuMat& accum, float rho, float theta); +CV_EXPORTS void HoughLinesGet(const GpuMat& accum, GpuMat& lines, float rho, float theta, int threshold, bool doSort = false, int maxLines = 4096); +CV_EXPORTS void HoughLinesDownload(const GpuMat& d_lines, OutputArray h_lines, OutputArray h_voices = noArray()); + ////////////////////////////// Matrix reductions ////////////////////////////// //! computes mean value and standard deviation of all or selected array elements diff --git a/modules/gpu/src/cuda/hough.cu b/modules/gpu/src/cuda/hough.cu new file mode 100644 index 000000000..0a439f45e --- /dev/null +++ b/modules/gpu/src/cuda/hough.cu @@ -0,0 +1,156 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or bpied warranties, including, but not limited to, the bpied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include +#include "opencv2/gpu/device/common.hpp" + +namespace cv { namespace gpu { namespace device +{ + namespace hough + { + __global__ void linesAccum(const DevMem2Db src, PtrStep_ accum, const float theta, const int numangle, const int numrho, const float irho) + { + const int x = blockIdx.x * blockDim.x + threadIdx.x; + const int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= src.cols || y >= src.rows) + return; + + if (src(y, x)) + { + float ang = 0.0f; + for(int n = 0; n < numangle; ++n, ang += theta) + { + float sin_ang; + float cos_ang; + sincosf(ang, &sin_ang, &cos_ang); + + const float tabSin = sin_ang * irho; + const float tabCos = cos_ang * irho; + + int r = __float2int_rn(x * tabCos + y * tabSin); + r += (numrho - 1) / 2; + + atomicInc(accum.ptr(n + 1) + r + 1, (unsigned int)-1); + } + } + } + + void linesAccum_gpu(DevMem2Db src, PtrStep_ accum, float theta, int numangle, int numrho, float irho) + { + const dim3 block(32, 8); + const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y)); + + linesAccum<<>>(src, accum, theta, numangle, numrho, irho); + cudaSafeCall( cudaGetLastError() ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } + + __device__ unsigned int g_counter; + + __global__ void linesGetResult(const DevMem2D_ accum, float2* out, int* voices, const int maxSize, const float threshold, const float theta, const float rho, const int numrho) + { + __shared__ uint smem[8][32]; + + int r = blockIdx.x * (blockDim.x - 2) + threadIdx.x; + int n = blockIdx.y * (blockDim.y - 2) + threadIdx.y; + + if (r >= accum.cols || n >= accum.rows) + return; + + smem[threadIdx.y][threadIdx.x] = accum(n, r); + __syncthreads(); + + r -= 1; + n -= 1; + + if (threadIdx.x == 0 || threadIdx.x == blockDim.x - 1 || threadIdx.y == 0 || threadIdx.y == blockDim.y - 1 || r >= accum.cols - 2 || n >= accum.rows - 2) + return; + + if (smem[threadIdx.y][threadIdx.x] > threshold && + smem[threadIdx.y][threadIdx.x] > smem[threadIdx.y - 1][threadIdx.x] && + smem[threadIdx.y][threadIdx.x] >= smem[threadIdx.y + 1][threadIdx.x] && + smem[threadIdx.y][threadIdx.x] > smem[threadIdx.y][threadIdx.x - 1] && + smem[threadIdx.y][threadIdx.x] >= smem[threadIdx.y][threadIdx.x + 1]) + { + float radius = (r - (numrho - 1) * 0.5f) * rho; + float angle = n * theta; + + const unsigned int ind = atomicInc(&g_counter, (unsigned int)(-1)); + if (ind < maxSize) + { + out[ind] = make_float2(radius, angle); + voices[ind] = smem[threadIdx.y][threadIdx.x]; + } + } + } + + int linesGetResult_gpu(DevMem2D_ accum, float2* out, int* voices, int maxSize, float threshold, float theta, float rho, bool doSort) + { + void* counter_ptr; + cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) ); + + cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) ); + + const dim3 block(32, 8); + const dim3 grid(divUp(accum.cols, block.x - 2), divUp(accum.rows, block.y - 2)); + + linesGetResult<<>>(accum, out, voices, maxSize, threshold, theta, rho, accum.cols - 2); + cudaSafeCall( cudaGetLastError() ); + + cudaSafeCall( cudaDeviceSynchronize() ); + + uint total_count; + cudaSafeCall( cudaMemcpy(&total_count, counter_ptr, sizeof(uint), cudaMemcpyDeviceToHost) ); + + if (doSort) + { + thrust::device_ptr out_ptr(out); + thrust::device_ptr voices_ptr(voices); + thrust::sort_by_key(voices_ptr, voices_ptr + total_count, out_ptr, thrust::greater()); + } + + return total_count; + } + } +}}} diff --git a/modules/gpu/src/hough.cpp b/modules/gpu/src/hough.cpp new file mode 100644 index 000000000..888c02724 --- /dev/null +++ b/modules/gpu/src/hough.cpp @@ -0,0 +1,105 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "precomp.hpp" + +namespace cv { namespace gpu { namespace device +{ + namespace hough + { + void linesAccum_gpu(DevMem2Db src, PtrStep_ accum, float theta, int numangle, int numrho, float irho); + int linesGetResult_gpu(DevMem2D_ accum, float2* out, int* voices, int maxSize, float threshold, float theta, float rho, bool doSort); + } +}}} + +void cv::gpu::HoughLinesTransform(const GpuMat& src, GpuMat& accum, float rho, float theta) +{ + using namespace cv::gpu::device; + + CV_Assert(src.type() == CV_8UC1); + + const int numangle = cvRound(CV_PI / theta); + const int numrho = cvRound(((src.cols + src.rows) * 2 + 1) / rho); + const float irho = 1.0f / rho; + + accum.create(numangle + 2, numrho + 2, CV_32SC1); + accum.setTo(cv::Scalar::all(0)); + + hough::linesAccum_gpu(src, accum, theta, numangle, numrho, irho); +} + +void cv::gpu::HoughLinesGet(const GpuMat& accum, GpuMat& lines, float rho, float theta, int threshold, bool doSort, int maxLines) +{ + using namespace cv::gpu::device; + + CV_Assert(accum.type() == CV_32SC1); + + lines.create(2, maxLines, CV_32FC2); + lines.cols = hough::linesGetResult_gpu(accum, lines.ptr(0), lines.ptr(1), maxLines, threshold, theta, rho, doSort); +} + +void cv::gpu::HoughLines(const GpuMat& src, GpuMat& lines, float rho, float theta, int threshold, bool doSort, int maxLines) +{ + cv::gpu::GpuMat accum; + HoughLines(src, lines, accum, rho, theta, threshold, doSort, maxLines); +} + +void cv::gpu::HoughLines(const GpuMat& src, GpuMat& lines, GpuMat& accum, float rho, float theta, int threshold, bool doSort, int maxLines) +{ + HoughLinesTransform(src, accum, rho, theta); + HoughLinesGet(accum, lines, rho, theta, threshold, doSort, maxLines); +} + +void cv::gpu::HoughLinesDownload(const GpuMat& d_lines, OutputArray h_lines_, OutputArray h_voices_) +{ + h_lines_.create(1, d_lines.cols, CV_32FC2); + cv::Mat h_lines = h_lines_.getMat(); + d_lines.row(0).download(h_lines); + + if (h_voices_.needed()) + { + h_voices_.create(1, d_lines.cols, CV_32SC1); + cv::Mat h_voices = h_voices_.getMat(); + cv::gpu::GpuMat d_voices(1, d_lines.cols, CV_32SC1, const_cast(d_lines.ptr(1))); + d_voices.download(h_voices); + } +} From 1eefc699274545540d6cf8b232e9088d292bca18 Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Tue, 14 Aug 2012 11:36:32 +0400 Subject: [PATCH 02/12] added accuracy and performance tests --- modules/gpu/perf/perf_imgproc.cpp | 41 ++++++++++++++++++ modules/gpu/perf_cpu/perf_imgproc.cpp | 37 ++++++++++++++++ modules/gpu/src/hough.cpp | 17 +++++++- modules/gpu/test/test_imgproc.cpp | 62 +++++++++++++++++++++++++++ 4 files changed, 156 insertions(+), 1 deletion(-) diff --git a/modules/gpu/perf/perf_imgproc.cpp b/modules/gpu/perf/perf_imgproc.cpp index b5c986d22..77f607452 100644 --- a/modules/gpu/perf/perf_imgproc.cpp +++ b/modules/gpu/perf/perf_imgproc.cpp @@ -1331,4 +1331,45 @@ INSTANTIATE_TEST_CASE_P(ImgProc, ImagePyramid_getLayer, testing::Combine( MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)))); +////////////////////////////////////////////////////////////////////// +// HoughLines + +GPU_PERF_TEST(HoughLines, cv::gpu::DeviceInfo, std::string) +{ + const cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + const std::string fileName = GET_PARAM(1); + + const float rho = 1.0f; + const float theta = CV_PI / 180.0f; + const int threshold = 300; + + cv::Mat img_base = readImage(fileName, cv::IMREAD_GRAYSCALE); + ASSERT_FALSE(img_base.empty()); + + cv::Mat img; + cv::resize(img_base, img, cv::Size(1920, 1080)); + + cv::Mat edges; + cv::Canny(img, edges, 50, 200); + + cv::gpu::GpuMat d_edges(edges); + cv::gpu::GpuMat d_lines; + cv::gpu::GpuMat d_accum; + cv::gpu::HoughLines(d_edges, d_lines, d_accum, rho, theta, threshold); + + TEST_CYCLE() + { + cv::gpu::HoughLines(d_edges, d_lines, d_accum, rho, theta, threshold); + } +} + +INSTANTIATE_TEST_CASE_P(ImgProc, HoughLines, testing::Combine( + ALL_DEVICES, + testing::Values(std::string("cv/shared/pic1.png"), + std::string("cv/shared/pic3.png"), + std::string("cv/shared/pic4.png"), + std::string("cv/shared/pic5.png"), + std::string("cv/shared/pic6.png")))); + #endif diff --git a/modules/gpu/perf_cpu/perf_imgproc.cpp b/modules/gpu/perf_cpu/perf_imgproc.cpp index b6686b7ed..bc7764683 100644 --- a/modules/gpu/perf_cpu/perf_imgproc.cpp +++ b/modules/gpu/perf_cpu/perf_imgproc.cpp @@ -727,4 +727,41 @@ INSTANTIATE_TEST_CASE_P(ImgProc, CvtColor, testing::Combine( CvtColorInfo(1, 3, cv::COLOR_BayerGR2BGR), CvtColorInfo(4, 4, cv::COLOR_RGBA2mRGBA)))); +////////////////////////////////////////////////////////////////////// +// HoughLines + +GPU_PERF_TEST(HoughLines, cv::gpu::DeviceInfo, std::string) +{ + const std::string fileName = GET_PARAM(1); + + const float rho = 1.0f; + const float theta = CV_PI / 180.0f; + const int threshold = 300; + + cv::Mat img_base = readImage(fileName, cv::IMREAD_GRAYSCALE); + ASSERT_FALSE(img_base.empty()); + + cv::Mat img; + cv::resize(img_base, img, cv::Size(1920, 1080)); + + cv::Mat edges; + cv::Canny(img, edges, 50, 200); + + std::vector lines; + cv::HoughLines(edges, lines, rho, theta, threshold); + + TEST_CYCLE() + { + cv::HoughLines(edges, lines, rho, theta, threshold); + } +} + +INSTANTIATE_TEST_CASE_P(ImgProc, HoughLines, testing::Combine( + ALL_DEVICES, + testing::Values(std::string("cv/shared/pic1.png"), + std::string("cv/shared/pic3.png"), + std::string("cv/shared/pic4.png"), + std::string("cv/shared/pic5.png"), + std::string("cv/shared/pic6.png")))); + #endif diff --git a/modules/gpu/src/hough.cpp b/modules/gpu/src/hough.cpp index 888c02724..721e6802a 100644 --- a/modules/gpu/src/hough.cpp +++ b/modules/gpu/src/hough.cpp @@ -74,7 +74,12 @@ void cv::gpu::HoughLinesGet(const GpuMat& accum, GpuMat& lines, float rho, float CV_Assert(accum.type() == CV_32SC1); lines.create(2, maxLines, CV_32FC2); - lines.cols = hough::linesGetResult_gpu(accum, lines.ptr(0), lines.ptr(1), maxLines, threshold, theta, rho, doSort); + int count = hough::linesGetResult_gpu(accum, lines.ptr(0), lines.ptr(1), maxLines, threshold, theta, rho, doSort); + + if (count > 0) + lines.cols = std::min(count, maxLines); + else + lines.release(); } void cv::gpu::HoughLines(const GpuMat& src, GpuMat& lines, float rho, float theta, int threshold, bool doSort, int maxLines) @@ -91,6 +96,16 @@ void cv::gpu::HoughLines(const GpuMat& src, GpuMat& lines, GpuMat& accum, float void cv::gpu::HoughLinesDownload(const GpuMat& d_lines, OutputArray h_lines_, OutputArray h_voices_) { + if (d_lines.empty()) + { + h_lines_.release(); + if (h_voices_.needed()) + h_voices_.release(); + return; + } + + CV_Assert(d_lines.rows == 2 && d_lines.type() == CV_32FC2); + h_lines_.create(1, d_lines.cols, CV_32FC2); cv::Mat h_lines = h_lines_.getMat(); d_lines.row(0).download(h_lines); diff --git a/modules/gpu/test/test_imgproc.cpp b/modules/gpu/test/test_imgproc.cpp index 388badf45..b0e587e4a 100644 --- a/modules/gpu/test/test_imgproc.cpp +++ b/modules/gpu/test/test_imgproc.cpp @@ -1124,4 +1124,66 @@ INSTANTIATE_TEST_CASE_P(GPU_ImgProc, CornerMinEigen, testing::Combine( testing::Values(BlockSize(3), BlockSize(5), BlockSize(7)), testing::Values(ApertureSize(0), ApertureSize(3), ApertureSize(5), ApertureSize(7)))); +/////////////////////////////////////////////////////////////////////////////////////////////////////// +// HoughLines + +PARAM_TEST_CASE(HoughLines, cv::gpu::DeviceInfo, std::string) +{ +}; + +void drawLines(cv::Mat& dst, const std::vector& lines) +{ + for (size_t i = 0; i < lines.size(); ++i) + { + float rho = lines[i][0], theta = lines[i][1]; + cv::Point pt1, pt2; + double a = std::cos(theta), b = std::sin(theta); + double x0 = a*rho, y0 = b*rho; + pt1.x = cvRound(x0 + 1000*(-b)); + pt1.y = cvRound(y0 + 1000*(a)); + pt2.x = cvRound(x0 - 1000*(-b)); + pt2.y = cvRound(y0 - 1000*(a)); + cv::line(dst, pt1, pt2, cv::Scalar::all(255)); + } +} + +TEST_P(HoughLines, Accuracy) +{ + const cv::gpu::DeviceInfo devInfo = GET_PARAM(0); + cv::gpu::setDevice(devInfo.deviceID()); + const std::string fileName = GET_PARAM(1); + + const float rho = 1.0f; + const float theta = CV_PI / 180.0f; + const int threshold = 300; + + cv::Mat img = readImage(fileName, cv::IMREAD_GRAYSCALE); + ASSERT_FALSE(img.empty()); + + cv::Mat edges; + cv::Canny(img, edges, 50, 200); + + cv::gpu::GpuMat d_lines; + cv::gpu::HoughLines(loadMat(edges), d_lines, rho, theta, threshold); + std::vector lines; + cv::gpu::HoughLinesDownload(d_lines, lines); + cv::Mat dst(img.size(), CV_8UC1, cv::Scalar::all(0)); + drawLines(dst, lines); + + std::vector lines_gold; + cv::HoughLines(edges, lines_gold, rho, theta, threshold); + cv::Mat dst_gold(img.size(), CV_8UC1, cv::Scalar::all(0)); + drawLines(dst_gold, lines_gold); + + ASSERT_MAT_NEAR(dst_gold, dst, 0.0); +} + +INSTANTIATE_TEST_CASE_P(GPU_ImgProc, HoughLines, testing::Combine( + ALL_DEVICES, + testing::Values(std::string("../cv/shared/pic1.png"), + std::string("../cv/shared/pic3.png"), + std::string("../cv/shared/pic4.png"), + std::string("../cv/shared/pic5.png"), + std::string("../cv/shared/pic6.png")))); + } // namespace From 9201db32ebe87468d5a0548507c376db39ba35a4 Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Tue, 14 Aug 2012 12:33:47 +0400 Subject: [PATCH 03/12] fixed overflow bugs, updated perf tests --- modules/gpu/perf/perf_imgproc.cpp | 37 +++++++++++++++------------ modules/gpu/perf_cpu/perf_imgproc.cpp | 34 +++++++++++++----------- modules/gpu/src/cuda/hough.cu | 10 +++++--- modules/gpu/src/hough.cpp | 15 +++++------ 4 files changed, 53 insertions(+), 43 deletions(-) diff --git a/modules/gpu/perf/perf_imgproc.cpp b/modules/gpu/perf/perf_imgproc.cpp index 77f607452..3baca6c9d 100644 --- a/modules/gpu/perf/perf_imgproc.cpp +++ b/modules/gpu/perf/perf_imgproc.cpp @@ -1334,42 +1334,47 @@ INSTANTIATE_TEST_CASE_P(ImgProc, ImagePyramid_getLayer, testing::Combine( ////////////////////////////////////////////////////////////////////// // HoughLines -GPU_PERF_TEST(HoughLines, cv::gpu::DeviceInfo, std::string) +IMPLEMENT_PARAM_CLASS(DoSort, bool) + +GPU_PERF_TEST(HoughLines, cv::gpu::DeviceInfo, cv::Size, DoSort) { + declare.time(30.0); + const cv::gpu::DeviceInfo devInfo = GET_PARAM(0); cv::gpu::setDevice(devInfo.deviceID()); - const std::string fileName = GET_PARAM(1); + const cv::Size size = GET_PARAM(1); + const bool doSort = GET_PARAM(2); const float rho = 1.0f; const float theta = CV_PI / 180.0f; const int threshold = 300; - cv::Mat img_base = readImage(fileName, cv::IMREAD_GRAYSCALE); - ASSERT_FALSE(img_base.empty()); + cv::RNG rng(123456789); - cv::Mat img; - cv::resize(img_base, img, cv::Size(1920, 1080)); + cv::Mat src(size, CV_8UC1, cv::Scalar::all(0)); - cv::Mat edges; - cv::Canny(img, edges, 50, 200); + const int numLines = rng.uniform(500, 2000); + for (int i = 0; i < numLines; ++i) + { + cv::Point p1(rng.uniform(0, src.cols), rng.uniform(0, src.rows)); + cv::Point p2(rng.uniform(0, src.cols), rng.uniform(0, src.rows)); + cv::line(src, p1, p2, cv::Scalar::all(255), 2); + } - cv::gpu::GpuMat d_edges(edges); + cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_lines; cv::gpu::GpuMat d_accum; - cv::gpu::HoughLines(d_edges, d_lines, d_accum, rho, theta, threshold); + cv::gpu::HoughLines(d_src, d_lines, d_accum, rho, theta, threshold, doSort); TEST_CYCLE() { - cv::gpu::HoughLines(d_edges, d_lines, d_accum, rho, theta, threshold); + cv::gpu::HoughLines(d_src, d_lines, d_accum, rho, theta, threshold, doSort); } } INSTANTIATE_TEST_CASE_P(ImgProc, HoughLines, testing::Combine( ALL_DEVICES, - testing::Values(std::string("cv/shared/pic1.png"), - std::string("cv/shared/pic3.png"), - std::string("cv/shared/pic4.png"), - std::string("cv/shared/pic5.png"), - std::string("cv/shared/pic6.png")))); + GPU_TYPICAL_MAT_SIZES, + testing::Values(DoSort(false), DoSort(true)))); #endif diff --git a/modules/gpu/perf_cpu/perf_imgproc.cpp b/modules/gpu/perf_cpu/perf_imgproc.cpp index bc7764683..1b3c0951c 100644 --- a/modules/gpu/perf_cpu/perf_imgproc.cpp +++ b/modules/gpu/perf_cpu/perf_imgproc.cpp @@ -730,38 +730,42 @@ INSTANTIATE_TEST_CASE_P(ImgProc, CvtColor, testing::Combine( ////////////////////////////////////////////////////////////////////// // HoughLines -GPU_PERF_TEST(HoughLines, cv::gpu::DeviceInfo, std::string) +IMPLEMENT_PARAM_CLASS(DoSort, bool) + +GPU_PERF_TEST(HoughLines, cv::gpu::DeviceInfo, cv::Size, DoSort) { - const std::string fileName = GET_PARAM(1); + declare.time(30.0); + + const cv::Size size = GET_PARAM(1); const float rho = 1.0f; const float theta = CV_PI / 180.0f; const int threshold = 300; - cv::Mat img_base = readImage(fileName, cv::IMREAD_GRAYSCALE); - ASSERT_FALSE(img_base.empty()); + cv::RNG rng(123456789); - cv::Mat img; - cv::resize(img_base, img, cv::Size(1920, 1080)); + cv::Mat src(size, CV_8UC1, cv::Scalar::all(0)); - cv::Mat edges; - cv::Canny(img, edges, 50, 200); + const int numLines = rng.uniform(500, 2000); + for (int i = 0; i < numLines; ++i) + { + cv::Point p1(rng.uniform(0, src.cols), rng.uniform(0, src.rows)); + cv::Point p2(rng.uniform(0, src.cols), rng.uniform(0, src.rows)); + cv::line(src, p1, p2, cv::Scalar::all(255), 2); + } std::vector lines; - cv::HoughLines(edges, lines, rho, theta, threshold); + cv::HoughLines(src, lines, rho, theta, threshold); TEST_CYCLE() { - cv::HoughLines(edges, lines, rho, theta, threshold); + cv::HoughLines(src, lines, rho, theta, threshold); } } INSTANTIATE_TEST_CASE_P(ImgProc, HoughLines, testing::Combine( ALL_DEVICES, - testing::Values(std::string("cv/shared/pic1.png"), - std::string("cv/shared/pic3.png"), - std::string("cv/shared/pic4.png"), - std::string("cv/shared/pic5.png"), - std::string("cv/shared/pic6.png")))); + GPU_TYPICAL_MAT_SIZES, + testing::Values(DoSort(false), DoSort(true)))); #endif diff --git a/modules/gpu/src/cuda/hough.cu b/modules/gpu/src/cuda/hough.cu index 0a439f45e..9f5cd6534 100644 --- a/modules/gpu/src/cuda/hough.cu +++ b/modules/gpu/src/cuda/hough.cu @@ -75,12 +75,12 @@ namespace cv { namespace gpu { namespace device } } - void linesAccum_gpu(DevMem2Db src, PtrStep_ accum, float theta, int numangle, int numrho, float irho) + void linesAccum_gpu(DevMem2Db src, DevMem2D_ accum, float rho, float theta) { const dim3 block(32, 8); const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y)); - linesAccum<<>>(src, accum, theta, numangle, numrho, irho); + linesAccum<<>>(src, accum, theta, accum.rows - 2, accum.cols - 2, 1.0f / rho); cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaDeviceSynchronize() ); @@ -125,7 +125,7 @@ namespace cv { namespace gpu { namespace device } } - int linesGetResult_gpu(DevMem2D_ accum, float2* out, int* voices, int maxSize, float threshold, float theta, float rho, bool doSort) + unsigned int linesGetResult_gpu(DevMem2D_ accum, float2* out, int* voices, unsigned int maxSize, float rho, float theta, float threshold, bool doSort) { void* counter_ptr; cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) ); @@ -143,7 +143,9 @@ namespace cv { namespace gpu { namespace device uint total_count; cudaSafeCall( cudaMemcpy(&total_count, counter_ptr, sizeof(uint), cudaMemcpyDeviceToHost) ); - if (doSort) + total_count = ::min(total_count, maxSize); + + if (doSort && total_count > 0) { thrust::device_ptr out_ptr(out); thrust::device_ptr voices_ptr(voices); diff --git a/modules/gpu/src/hough.cpp b/modules/gpu/src/hough.cpp index 721e6802a..b577ca507 100644 --- a/modules/gpu/src/hough.cpp +++ b/modules/gpu/src/hough.cpp @@ -46,8 +46,8 @@ namespace cv { namespace gpu { namespace device { namespace hough { - void linesAccum_gpu(DevMem2Db src, PtrStep_ accum, float theta, int numangle, int numrho, float irho); - int linesGetResult_gpu(DevMem2D_ accum, float2* out, int* voices, int maxSize, float threshold, float theta, float rho, bool doSort); + void linesAccum_gpu(DevMem2Db src, DevMem2D_ accum, float rho, float theta); + unsigned int linesGetResult_gpu(DevMem2D_ accum, float2* out, int* voices, unsigned int maxSize, float rho, float theta, float threshold, bool doSort); } }}} @@ -59,12 +59,11 @@ void cv::gpu::HoughLinesTransform(const GpuMat& src, GpuMat& accum, float rho, f const int numangle = cvRound(CV_PI / theta); const int numrho = cvRound(((src.cols + src.rows) * 2 + 1) / rho); - const float irho = 1.0f / rho; - accum.create(numangle + 2, numrho + 2, CV_32SC1); + ensureSizeIsEnough(numangle + 2, numrho + 2, CV_32SC1, accum); accum.setTo(cv::Scalar::all(0)); - hough::linesAccum_gpu(src, accum, theta, numangle, numrho, irho); + hough::linesAccum_gpu(src, accum, rho, theta); } void cv::gpu::HoughLinesGet(const GpuMat& accum, GpuMat& lines, float rho, float theta, int threshold, bool doSort, int maxLines) @@ -73,11 +72,11 @@ void cv::gpu::HoughLinesGet(const GpuMat& accum, GpuMat& lines, float rho, float CV_Assert(accum.type() == CV_32SC1); - lines.create(2, maxLines, CV_32FC2); - int count = hough::linesGetResult_gpu(accum, lines.ptr(0), lines.ptr(1), maxLines, threshold, theta, rho, doSort); + ensureSizeIsEnough(2, maxLines, CV_32FC2, lines); + unsigned int count = hough::linesGetResult_gpu(accum, lines.ptr(0), lines.ptr(1), maxLines, rho, theta, threshold, doSort); if (count > 0) - lines.cols = std::min(count, maxLines); + lines.cols = count; else lines.release(); } From c3fa7974e6113f87876759f32c223fb88afb0ccb Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Tue, 14 Aug 2012 17:00:57 +0400 Subject: [PATCH 04/12] new optimized version --- modules/gpu/include/opencv2/gpu/gpu.hpp | 4 +- modules/gpu/perf/perf_imgproc.cpp | 5 +- modules/gpu/src/cuda/hough.cu | 171 ++++++++++++++++++++---- modules/gpu/src/hough.cpp | 23 ++-- modules/gpu/test/test_imgproc.cpp | 3 +- 5 files changed, 163 insertions(+), 43 deletions(-) diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp index cb2e68872..170c4d5ff 100644 --- a/modules/gpu/include/opencv2/gpu/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu/gpu.hpp @@ -821,8 +821,8 @@ private: }; CV_EXPORTS void HoughLines(const GpuMat& src, GpuMat& lines, float rho, float theta, int threshold, bool doSort = false, int maxLines = 4096); -CV_EXPORTS void HoughLines(const GpuMat& src, GpuMat& lines, GpuMat& accum, float rho, float theta, int threshold, bool doSort = false, int maxLines = 4096); -CV_EXPORTS void HoughLinesTransform(const GpuMat& src, GpuMat& accum, float rho, float theta); +CV_EXPORTS void HoughLines(const GpuMat& src, GpuMat& lines, GpuMat& accum, GpuMat& buf, float rho, float theta, int threshold, bool doSort = false, int maxLines = 4096); +CV_EXPORTS void HoughLinesTransform(const GpuMat& src, GpuMat& accum, GpuMat& buf, float rho, float theta); CV_EXPORTS void HoughLinesGet(const GpuMat& accum, GpuMat& lines, float rho, float theta, int threshold, bool doSort = false, int maxLines = 4096); CV_EXPORTS void HoughLinesDownload(const GpuMat& d_lines, OutputArray h_lines, OutputArray h_voices = noArray()); diff --git a/modules/gpu/perf/perf_imgproc.cpp b/modules/gpu/perf/perf_imgproc.cpp index 3baca6c9d..0dbcd34c6 100644 --- a/modules/gpu/perf/perf_imgproc.cpp +++ b/modules/gpu/perf/perf_imgproc.cpp @@ -1364,11 +1364,12 @@ GPU_PERF_TEST(HoughLines, cv::gpu::DeviceInfo, cv::Size, DoSort) cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_lines; cv::gpu::GpuMat d_accum; - cv::gpu::HoughLines(d_src, d_lines, d_accum, rho, theta, threshold, doSort); + cv::gpu::GpuMat d_buf; + cv::gpu::HoughLines(d_src, d_lines, d_accum, d_buf, rho, theta, threshold, doSort); TEST_CYCLE() { - cv::gpu::HoughLines(d_src, d_lines, d_accum, rho, theta, threshold, doSort); + cv::gpu::HoughLines(d_src, d_lines, d_accum, d_buf, rho, theta, threshold, doSort); } } diff --git a/modules/gpu/src/cuda/hough.cu b/modules/gpu/src/cuda/hough.cu index 9f5cd6534..8c9c075f6 100644 --- a/modules/gpu/src/cuda/hough.cu +++ b/modules/gpu/src/cuda/hough.cu @@ -42,55 +42,167 @@ #include #include "opencv2/gpu/device/common.hpp" +#include "opencv2/gpu/device/emulation.hpp" namespace cv { namespace gpu { namespace device { namespace hough { - __global__ void linesAccum(const DevMem2Db src, PtrStep_ accum, const float theta, const int numangle, const int numrho, const float irho) - { - const int x = blockIdx.x * blockDim.x + threadIdx.x; - const int y = blockIdx.y * blockDim.y + threadIdx.y; + __device__ unsigned int g_counter; - if (x >= src.cols || y >= src.rows) + const int PIXELS_PER_THREAD = 16; + + __global__ void buildPointList(const DevMem2Db src, unsigned int* list) + { + const int x = blockIdx.x * 32 * PIXELS_PER_THREAD + threadIdx.x; + const int y = blockIdx.y * 4 + threadIdx.y; + + if (y >= src.rows) return; - if (src(y, x)) + volatile int qindex = -1; + __shared__ volatile int s_qindex[4]; + __shared__ volatile int s_qstart[4]; + s_qindex[threadIdx.y] = -1; + + __shared__ volatile unsigned int s_queue[4][32 * PIXELS_PER_THREAD]; + + // fill the queue + for (int i = 0; i < PIXELS_PER_THREAD; ++i) { - float ang = 0.0f; - for(int n = 0; n < numangle; ++n, ang += theta) + const int xx = i * blockDim.x + x; + + if (xx >= src.cols) + break; + + if (src(y, xx)) { - float sin_ang; - float cos_ang; - sincosf(ang, &sin_ang, &cos_ang); + const unsigned int queue_val = (y << 16) | xx; - const float tabSin = sin_ang * irho; - const float tabCos = cos_ang * irho; - - int r = __float2int_rn(x * tabCos + y * tabSin); - r += (numrho - 1) / 2; - - atomicInc(accum.ptr(n + 1) + r + 1, (unsigned int)-1); + do { + qindex++; + s_qindex[threadIdx.y] = qindex; + s_queue[threadIdx.y][qindex] = queue_val; + } while (s_queue[threadIdx.y][qindex] != queue_val); } + + // reload index from smem (last thread to write to smem will have updated it) + qindex = s_qindex[threadIdx.y]; + } + + __syncthreads(); + + // let one thread reserve the space required in the global list + if (threadIdx.x == 0 && threadIdx.y == 0) + { + // find how many items are stored in each list + int total_index = 0; + #pragma unroll + for (int i = 0; i < 4; ++i) + { + s_qstart[i] = total_index; + total_index += (s_qindex[i] + 1u); + } + + //calculate the offset in the global list + const unsigned int global_offset = atomicAdd(&g_counter, total_index); + #pragma unroll + for (int i = 0; i < 4; ++i) + s_qstart[i] += global_offset; + } + + __syncthreads(); + + // copy local queues to global queue + for(int i = 0; i <= qindex; i += 32) + { + if(i + threadIdx.x > qindex) + break; + + unsigned int qvalue = s_queue[threadIdx.y][i + threadIdx.x]; + list[s_qstart[threadIdx.y] + i + threadIdx.x] = qvalue; } } - void linesAccum_gpu(DevMem2Db src, DevMem2D_ accum, float rho, float theta) + unsigned int buildPointList_gpu(DevMem2Db src, unsigned int* list) { - const dim3 block(32, 8); - const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y)); + void* counter_ptr; + cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) ); - linesAccum<<>>(src, accum, theta, accum.rows - 2, accum.cols - 2, 1.0f / rho); + cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) ); + + const dim3 block(32, 4); + const dim3 grid(divUp(src.cols, block.x * PIXELS_PER_THREAD), divUp(src.rows, block.y)); + + cudaSafeCall( cudaFuncSetCacheConfig(buildPointList, cudaFuncCachePreferShared) ); + + buildPointList<<>>(src, list); + cudaSafeCall( cudaGetLastError() ); + + cudaSafeCall( cudaDeviceSynchronize() ); + + unsigned int total_count; + cudaSafeCall( cudaMemcpy(&total_count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) ); + + return total_count; + } + + __global__ void linesAccum(const unsigned int* list, const unsigned int count, PtrStep_ accum, + const float irho, const float theta, const int numrho) + { + extern __shared__ unsigned int smem[]; + + for (int i = threadIdx.x; i < numrho; i += blockDim.x) + smem[i] = 0; + __syncthreads(); + + const int n = blockIdx.x; + const float ang = n * theta; + + float sin_ang; + float cos_ang; + sincosf(ang, &sin_ang, &cos_ang); + + const float tabSin = sin_ang * irho; + const float tabCos = cos_ang * irho; + + for (int i = threadIdx.x; i < count; i += blockDim.x) + { + // read one element from global memory + const unsigned int qvalue = list[i]; + const unsigned int x = (qvalue & 0x0000FFFF); + const unsigned int y = (qvalue >> 16) & 0x0000FFFF; + + int r = __float2int_rn(x * tabCos + y * tabSin); + r += (numrho - 1) / 2; + + Emulation::smem::atomicInc(&smem[r], (unsigned int)(-1)); + } + __syncthreads(); + + for (int i = threadIdx.x; i < numrho; i += blockDim.x) + accum(n + 1, i + 1) = smem[i]; + } + + void linesAccum_gpu(const unsigned int* list, unsigned int count, DevMem2D_ accum, float rho, float theta) + { + const dim3 block(1024); + const dim3 grid(accum.rows - 2); + + cudaSafeCall( cudaFuncSetCacheConfig(linesAccum, cudaFuncCachePreferShared) ); + + size_t smem_size = (accum.cols - 2) * sizeof(unsigned int); + + linesAccum<<>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2); cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaDeviceSynchronize() ); } - __device__ unsigned int g_counter; - - __global__ void linesGetResult(const DevMem2D_ accum, float2* out, int* voices, const int maxSize, const float threshold, const float theta, const float rho, const int numrho) + __global__ void linesGetResult(const DevMem2D_ accum, float2* out, int* voices, const int maxSize, + const float threshold, const float theta, const float rho, const int numrho) { - __shared__ uint smem[8][32]; + __shared__ unsigned int smem[8][32]; int r = blockIdx.x * (blockDim.x - 2) + threadIdx.x; int n = blockIdx.y * (blockDim.y - 2) + threadIdx.y; @@ -125,7 +237,8 @@ namespace cv { namespace gpu { namespace device } } - unsigned int linesGetResult_gpu(DevMem2D_ accum, float2* out, int* voices, unsigned int maxSize, float rho, float theta, float threshold, bool doSort) + unsigned int linesGetResult_gpu(DevMem2D_ accum, float2* out, int* voices, unsigned int maxSize, + float rho, float theta, float threshold, bool doSort) { void* counter_ptr; cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) ); @@ -140,8 +253,8 @@ namespace cv { namespace gpu { namespace device cudaSafeCall( cudaDeviceSynchronize() ); - uint total_count; - cudaSafeCall( cudaMemcpy(&total_count, counter_ptr, sizeof(uint), cudaMemcpyDeviceToHost) ); + unsigned int total_count; + cudaSafeCall( cudaMemcpy(&total_count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) ); total_count = ::min(total_count, maxSize); diff --git a/modules/gpu/src/hough.cpp b/modules/gpu/src/hough.cpp index b577ca507..f4d4399d2 100644 --- a/modules/gpu/src/hough.cpp +++ b/modules/gpu/src/hough.cpp @@ -46,16 +46,23 @@ namespace cv { namespace gpu { namespace device { namespace hough { - void linesAccum_gpu(DevMem2Db src, DevMem2D_ accum, float rho, float theta); + unsigned int buildPointList_gpu(DevMem2Db src, unsigned int* list); + void linesAccum_gpu(const unsigned int* list, unsigned int count, DevMem2D_ accum, float rho, float theta); unsigned int linesGetResult_gpu(DevMem2D_ accum, float2* out, int* voices, unsigned int maxSize, float rho, float theta, float threshold, bool doSort); } }}} -void cv::gpu::HoughLinesTransform(const GpuMat& src, GpuMat& accum, float rho, float theta) +void cv::gpu::HoughLinesTransform(const GpuMat& src, GpuMat& accum, GpuMat& buf, float rho, float theta) { - using namespace cv::gpu::device; + using namespace cv::gpu::device::hough; CV_Assert(src.type() == CV_8UC1); + CV_Assert(src.cols < std::numeric_limits::max()); + CV_Assert(src.rows < std::numeric_limits::max()); + + ensureSizeIsEnough(1, src.size().area(), CV_32SC1, buf); + + unsigned int count = buildPointList_gpu(src, buf.ptr()); const int numangle = cvRound(CV_PI / theta); const int numrho = cvRound(((src.cols + src.rows) * 2 + 1) / rho); @@ -63,7 +70,7 @@ void cv::gpu::HoughLinesTransform(const GpuMat& src, GpuMat& accum, float rho, f ensureSizeIsEnough(numangle + 2, numrho + 2, CV_32SC1, accum); accum.setTo(cv::Scalar::all(0)); - hough::linesAccum_gpu(src, accum, rho, theta); + linesAccum_gpu(buf.ptr(), count, accum, rho, theta); } void cv::gpu::HoughLinesGet(const GpuMat& accum, GpuMat& lines, float rho, float theta, int threshold, bool doSort, int maxLines) @@ -83,13 +90,13 @@ void cv::gpu::HoughLinesGet(const GpuMat& accum, GpuMat& lines, float rho, float void cv::gpu::HoughLines(const GpuMat& src, GpuMat& lines, float rho, float theta, int threshold, bool doSort, int maxLines) { - cv::gpu::GpuMat accum; - HoughLines(src, lines, accum, rho, theta, threshold, doSort, maxLines); + cv::gpu::GpuMat accum, buf; + HoughLines(src, lines, accum, buf, rho, theta, threshold, doSort, maxLines); } -void cv::gpu::HoughLines(const GpuMat& src, GpuMat& lines, GpuMat& accum, float rho, float theta, int threshold, bool doSort, int maxLines) +void cv::gpu::HoughLines(const GpuMat& src, GpuMat& lines, GpuMat& accum, GpuMat& buf, float rho, float theta, int threshold, bool doSort, int maxLines) { - HoughLinesTransform(src, accum, rho, theta); + HoughLinesTransform(src, accum, buf, rho, theta); HoughLinesGet(accum, lines, rho, theta, threshold, doSort, maxLines); } diff --git a/modules/gpu/test/test_imgproc.cpp b/modules/gpu/test/test_imgproc.cpp index b0e587e4a..4d67de59d 100644 --- a/modules/gpu/test/test_imgproc.cpp +++ b/modules/gpu/test/test_imgproc.cpp @@ -1155,7 +1155,7 @@ TEST_P(HoughLines, Accuracy) const float rho = 1.0f; const float theta = CV_PI / 180.0f; - const int threshold = 300; + const int threshold = 50; cv::Mat img = readImage(fileName, cv::IMREAD_GRAYSCALE); ASSERT_FALSE(img.empty()); @@ -1182,7 +1182,6 @@ INSTANTIATE_TEST_CASE_P(GPU_ImgProc, HoughLines, testing::Combine( ALL_DEVICES, testing::Values(std::string("../cv/shared/pic1.png"), std::string("../cv/shared/pic3.png"), - std::string("../cv/shared/pic4.png"), std::string("../cv/shared/pic5.png"), std::string("../cv/shared/pic6.png")))); From 34b9bd61a232f064dcadbce5fe0a69997c88e035 Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Tue, 14 Aug 2012 17:42:15 +0400 Subject: [PATCH 05/12] fixed failure --- modules/gpu/src/cuda/hough.cu | 62 ++++++++++++++--------------------- modules/gpu/src/hough.cpp | 21 ++++++++++-- 2 files changed, 43 insertions(+), 40 deletions(-) diff --git a/modules/gpu/src/cuda/hough.cu b/modules/gpu/src/cuda/hough.cu index 8c9c075f6..34450cd87 100644 --- a/modules/gpu/src/cuda/hough.cu +++ b/modules/gpu/src/cuda/hough.cu @@ -54,40 +54,30 @@ namespace cv { namespace gpu { namespace device __global__ void buildPointList(const DevMem2Db src, unsigned int* list) { - const int x = blockIdx.x * 32 * PIXELS_PER_THREAD + threadIdx.x; - const int y = blockIdx.y * 4 + threadIdx.y; + __shared__ unsigned int s_queues[4][32 * PIXELS_PER_THREAD]; + __shared__ unsigned int s_qsize[4]; + __shared__ unsigned int s_start[4]; + + const int x = blockIdx.x * blockDim.x * PIXELS_PER_THREAD + threadIdx.x; + const int y = blockIdx.y * blockDim.y + threadIdx.y; if (y >= src.rows) return; - volatile int qindex = -1; - __shared__ volatile int s_qindex[4]; - __shared__ volatile int s_qstart[4]; - s_qindex[threadIdx.y] = -1; + if (threadIdx.x == 0) + s_qsize[threadIdx.y] = 0; - __shared__ volatile unsigned int s_queue[4][32 * PIXELS_PER_THREAD]; + __syncthreads(); // fill the queue - for (int i = 0; i < PIXELS_PER_THREAD; ++i) + for (int i = 0, xx = x; i < PIXELS_PER_THREAD && xx < src.cols; ++i, xx += blockDim.x) { - const int xx = i * blockDim.x + x; - - if (xx >= src.cols) - break; - if (src(y, xx)) { - const unsigned int queue_val = (y << 16) | xx; - - do { - qindex++; - s_qindex[threadIdx.y] = qindex; - s_queue[threadIdx.y][qindex] = queue_val; - } while (s_queue[threadIdx.y][qindex] != queue_val); + const unsigned int val = (y << 16) | xx; + int qidx = Emulation::smem::atomicInc(&s_qsize[threadIdx.y], (unsigned int)(-1)); + s_queues[threadIdx.y][qidx] = val; } - - // reload index from smem (last thread to write to smem will have updated it) - qindex = s_qindex[threadIdx.y]; } __syncthreads(); @@ -96,31 +86,27 @@ namespace cv { namespace gpu { namespace device if (threadIdx.x == 0 && threadIdx.y == 0) { // find how many items are stored in each list - int total_index = 0; - #pragma unroll - for (int i = 0; i < 4; ++i) + unsigned int total_size = 0; + for (int i = 0; i < blockDim.y; ++i) { - s_qstart[i] = total_index; - total_index += (s_qindex[i] + 1u); + s_start[i] = total_size; + total_size += s_qsize[i]; } //calculate the offset in the global list - const unsigned int global_offset = atomicAdd(&g_counter, total_index); - #pragma unroll - for (int i = 0; i < 4; ++i) - s_qstart[i] += global_offset; + const unsigned int global_offset = atomicAdd(&g_counter, total_size); + for (int i = 0; i < blockDim.y; ++i) + s_start[i] += global_offset; } __syncthreads(); // copy local queues to global queue - for(int i = 0; i <= qindex; i += 32) + const unsigned int qsize = s_qsize[threadIdx.y]; + for(int i = threadIdx.x; i < qsize; i += blockDim.x) { - if(i + threadIdx.x > qindex) - break; - - unsigned int qvalue = s_queue[threadIdx.y][i + threadIdx.x]; - list[s_qstart[threadIdx.y] + i + threadIdx.x] = qvalue; + unsigned int val = s_queues[threadIdx.y][i]; + list[s_start[threadIdx.y] + i] = val; } } diff --git a/modules/gpu/src/hough.cpp b/modules/gpu/src/hough.cpp index f4d4399d2..e78637c69 100644 --- a/modules/gpu/src/hough.cpp +++ b/modules/gpu/src/hough.cpp @@ -61,8 +61,24 @@ void cv::gpu::HoughLinesTransform(const GpuMat& src, GpuMat& accum, GpuMat& buf, CV_Assert(src.rows < std::numeric_limits::max()); ensureSizeIsEnough(1, src.size().area(), CV_32SC1, buf); - unsigned int count = buildPointList_gpu(src, buf.ptr()); +// unsigned int count = 0; +// { +// cv::Mat h_src(src); +// cv::Mat h_buf(1, src.size().area(), CV_32SC1); +// for (int y = 0; y < h_src.rows; ++y) +// { +// for (int x = 0; x < h_src.cols; ++x) +// { +// if (h_src.at(y, x)) +// { +// const unsigned int val = (y << 16) | x; +// h_buf.ptr()[count++] = val; +// } +// } +// } +// buf.upload(h_buf); +// } const int numangle = cvRound(CV_PI / theta); const int numrho = cvRound(((src.cols + src.rows) * 2 + 1) / rho); @@ -70,7 +86,8 @@ void cv::gpu::HoughLinesTransform(const GpuMat& src, GpuMat& accum, GpuMat& buf, ensureSizeIsEnough(numangle + 2, numrho + 2, CV_32SC1, accum); accum.setTo(cv::Scalar::all(0)); - linesAccum_gpu(buf.ptr(), count, accum, rho, theta); + if (count > 0) + linesAccum_gpu(buf.ptr(), count, accum, rho, theta); } void cv::gpu::HoughLinesGet(const GpuMat& accum, GpuMat& lines, float rho, float theta, int threshold, bool doSort, int maxLines) From 7ae94c571c9b3837c6faa973eacf183c4e9fea2f Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Tue, 14 Aug 2012 17:54:42 +0400 Subject: [PATCH 06/12] fixed build without cuda --- modules/gpu/src/hough.cpp | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/modules/gpu/src/hough.cpp b/modules/gpu/src/hough.cpp index e78637c69..94dbe8be3 100644 --- a/modules/gpu/src/hough.cpp +++ b/modules/gpu/src/hough.cpp @@ -42,6 +42,16 @@ #include "precomp.hpp" +#if !defined (HAVE_CUDA) + +void cv::gpu::HoughLinesTransform(const GpuMat&, GpuMat&, GpuMat&, float, float) { throw_nogpu(); } +void cv::gpu::HoughLinesGet(const GpuMat&, GpuMat&, float, float, int, bool, int) { throw_nogpu(); } +void cv::gpu::HoughLines(const GpuMat&, GpuMat&, float, float, int, bool, int) { throw_nogpu(); } +void cv::gpu::HoughLines(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, float, float, int, bool, int) { throw_nogpu(); } +void cv::gpu::HoughLinesDownload(const GpuMat&, OutputArray, OutputArray) { throw_nogpu(); } + +#else /* !defined (HAVE_CUDA) */ + namespace cv { namespace gpu { namespace device { namespace hough @@ -62,23 +72,6 @@ void cv::gpu::HoughLinesTransform(const GpuMat& src, GpuMat& accum, GpuMat& buf, ensureSizeIsEnough(1, src.size().area(), CV_32SC1, buf); unsigned int count = buildPointList_gpu(src, buf.ptr()); -// unsigned int count = 0; -// { -// cv::Mat h_src(src); -// cv::Mat h_buf(1, src.size().area(), CV_32SC1); -// for (int y = 0; y < h_src.rows; ++y) -// { -// for (int x = 0; x < h_src.cols; ++x) -// { -// if (h_src.at(y, x)) -// { -// const unsigned int val = (y << 16) | x; -// h_buf.ptr()[count++] = val; -// } -// } -// } -// buf.upload(h_buf); -// } const int numangle = cvRound(CV_PI / theta); const int numrho = cvRound(((src.cols + src.rows) * 2 + 1) / rho); @@ -141,3 +134,5 @@ void cv::gpu::HoughLinesDownload(const GpuMat& d_lines, OutputArray h_lines_, Ou d_voices.download(h_voices); } } + +#endif /* !defined (HAVE_CUDA) */ From 7928cec6704919fbc1280cec2d24e2ac3342b07e Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Wed, 15 Aug 2012 13:18:35 +0400 Subject: [PATCH 07/12] added linesAccumGlobal kernel --- modules/core/include/opencv2/core/gpumat.hpp | 3 + modules/core/src/gpumat.cpp | 2 +- modules/gpu/src/cuda/hough.cu | 116 ++++++++++++------ modules/gpu/src/hough.cpp | 18 ++- .../gpu/src/opencv2/gpu/device/emulation.hpp | 8 +- 5 files changed, 98 insertions(+), 49 deletions(-) diff --git a/modules/core/include/opencv2/core/gpumat.hpp b/modules/core/include/opencv2/core/gpumat.hpp index ef86c5a20..989335925 100644 --- a/modules/core/include/opencv2/core/gpumat.hpp +++ b/modules/core/include/opencv2/core/gpumat.hpp @@ -112,6 +112,8 @@ namespace cv { namespace gpu int multiProcessorCount() const { return multi_processor_count_; } + size_t sharedMemPerBlock() const { return sharedMemPerBlock_; } + size_t freeMemory() const; size_t totalMemory() const; @@ -133,6 +135,7 @@ namespace cv { namespace gpu int multi_processor_count_; int majorVersion_; int minorVersion_; + size_t sharedMemPerBlock_; }; CV_EXPORTS void printCudaDeviceInfo(int device); diff --git a/modules/core/src/gpumat.cpp b/modules/core/src/gpumat.cpp index 25a3e7699..c901bf492 100644 --- a/modules/core/src/gpumat.cpp +++ b/modules/core/src/gpumat.cpp @@ -42,7 +42,6 @@ #include "precomp.hpp" #include "opencv2/core/gpumat.hpp" - #include #ifdef HAVE_CUDA @@ -301,6 +300,7 @@ void cv::gpu::DeviceInfo::query() multi_processor_count_ = prop.multiProcessorCount; majorVersion_ = prop.major; minorVersion_ = prop.minor; + sharedMemPerBlock_ = prop.sharedMemPerBlock; } void cv::gpu::DeviceInfo::queryMemory(size_t& free_memory, size_t& total_memory) const diff --git a/modules/gpu/src/cuda/hough.cu b/modules/gpu/src/cuda/hough.cu index 34450cd87..d5f7d216c 100644 --- a/modules/gpu/src/cuda/hough.cu +++ b/modules/gpu/src/cuda/hough.cu @@ -48,15 +48,18 @@ namespace cv { namespace gpu { namespace device { namespace hough { - __device__ unsigned int g_counter; + __device__ int g_counter; + + //////////////////////////////////////////////////////////////////////// + // buildPointList const int PIXELS_PER_THREAD = 16; __global__ void buildPointList(const DevMem2Db src, unsigned int* list) { - __shared__ unsigned int s_queues[4][32 * PIXELS_PER_THREAD]; - __shared__ unsigned int s_qsize[4]; - __shared__ unsigned int s_start[4]; + __shared__ int s_queues[4][32 * PIXELS_PER_THREAD]; + __shared__ int s_qsize[4]; + __shared__ int s_start[4]; const int x = blockIdx.x * blockDim.x * PIXELS_PER_THREAD + threadIdx.x; const int y = blockIdx.y * blockDim.y + threadIdx.y; @@ -75,7 +78,7 @@ namespace cv { namespace gpu { namespace device if (src(y, xx)) { const unsigned int val = (y << 16) | xx; - int qidx = Emulation::smem::atomicInc(&s_qsize[threadIdx.y], (unsigned int)(-1)); + const int qidx = Emulation::smem::atomicAdd(&s_qsize[threadIdx.y], 1); s_queues[threadIdx.y][qidx] = val; } } @@ -86,15 +89,15 @@ namespace cv { namespace gpu { namespace device if (threadIdx.x == 0 && threadIdx.y == 0) { // find how many items are stored in each list - unsigned int total_size = 0; + int total_size = 0; for (int i = 0; i < blockDim.y; ++i) { s_start[i] = total_size; total_size += s_qsize[i]; } - //calculate the offset in the global list - const unsigned int global_offset = atomicAdd(&g_counter, total_size); + // calculate the offset in the global list + const int global_offset = atomicAdd(&g_counter, total_size); for (int i = 0; i < blockDim.y; ++i) s_start[i] += global_offset; } @@ -102,20 +105,20 @@ namespace cv { namespace gpu { namespace device __syncthreads(); // copy local queues to global queue - const unsigned int qsize = s_qsize[threadIdx.y]; + const int qsize = s_qsize[threadIdx.y]; for(int i = threadIdx.x; i < qsize; i += blockDim.x) { - unsigned int val = s_queues[threadIdx.y][i]; + const unsigned int val = s_queues[threadIdx.y][i]; list[s_start[threadIdx.y] + i] = val; } } - unsigned int buildPointList_gpu(DevMem2Db src, unsigned int* list) + int buildPointList_gpu(DevMem2Db src, unsigned int* list) { void* counter_ptr; cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) ); - cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) ); + cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) ); const dim3 block(32, 4); const dim3 grid(divUp(src.cols, block.x * PIXELS_PER_THREAD), divUp(src.rows, block.y)); @@ -127,19 +130,48 @@ namespace cv { namespace gpu { namespace device cudaSafeCall( cudaDeviceSynchronize() ); - unsigned int total_count; - cudaSafeCall( cudaMemcpy(&total_count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) ); + int total_count; + cudaSafeCall( cudaMemcpy(&total_count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) ); return total_count; } - __global__ void linesAccum(const unsigned int* list, const unsigned int count, PtrStep_ accum, - const float irho, const float theta, const int numrho) - { - extern __shared__ unsigned int smem[]; + //////////////////////////////////////////////////////////////////////// + // linesAccum - for (int i = threadIdx.x; i < numrho; i += blockDim.x) + __global__ void linesAccumGlobal(const unsigned int* list, const int count, PtrStepi accum, const float irho, const float theta, const int numrho) + { + const int n = blockIdx.x; + const float ang = n * theta; + + float sin_ang; + float cos_ang; + sincosf(ang, &sin_ang, &cos_ang); + + const float tabSin = sin_ang * irho; + const float tabCos = cos_ang * irho; + + for (int i = threadIdx.x; i < count; i += blockDim.x) + { + const unsigned int qvalue = list[i]; + + const int x = (qvalue & 0x0000FFFF); + const int y = (qvalue >> 16) & 0x0000FFFF; + + int r = __float2int_rn(x * tabCos + y * tabSin); + r += (numrho - 1) / 2; + + ::atomicAdd(accum.ptr(n + 1) + r + 1, 1); + } + } + + __global__ void linesAccumShared(const unsigned int* list, const int count, PtrStepi accum, const float irho, const float theta, const int numrho) + { + extern __shared__ int smem[]; + + for (int i = threadIdx.x; i < numrho + 1; i += blockDim.x) smem[i] = 0; + __syncthreads(); const int n = blockIdx.x; @@ -154,41 +186,48 @@ namespace cv { namespace gpu { namespace device for (int i = threadIdx.x; i < count; i += blockDim.x) { - // read one element from global memory const unsigned int qvalue = list[i]; - const unsigned int x = (qvalue & 0x0000FFFF); - const unsigned int y = (qvalue >> 16) & 0x0000FFFF; + + const int x = (qvalue & 0x0000FFFF); + const int y = (qvalue >> 16) & 0x0000FFFF; int r = __float2int_rn(x * tabCos + y * tabSin); r += (numrho - 1) / 2; - Emulation::smem::atomicInc(&smem[r], (unsigned int)(-1)); + Emulation::smem::atomicAdd(&smem[r + 1], 1); } + __syncthreads(); for (int i = threadIdx.x; i < numrho; i += blockDim.x) - accum(n + 1, i + 1) = smem[i]; + accum(n + 1, i) = smem[i]; } - void linesAccum_gpu(const unsigned int* list, unsigned int count, DevMem2D_ accum, float rho, float theta) + void linesAccum_gpu(const unsigned int* list, int count, DevMem2Di accum, float rho, float theta, size_t sharedMemPerBlock) { const dim3 block(1024); const dim3 grid(accum.rows - 2); - cudaSafeCall( cudaFuncSetCacheConfig(linesAccum, cudaFuncCachePreferShared) ); + cudaSafeCall( cudaFuncSetCacheConfig(linesAccumShared, cudaFuncCachePreferShared) ); - size_t smem_size = (accum.cols - 2) * sizeof(unsigned int); + size_t smemSize = (accum.cols - 2) * sizeof(int); + + if (smemSize < sharedMemPerBlock - 1000) + linesAccumShared<<>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2); + else + linesAccumGlobal<<>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2); - linesAccum<<>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2); cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaDeviceSynchronize() ); } - __global__ void linesGetResult(const DevMem2D_ accum, float2* out, int* voices, const int maxSize, - const float threshold, const float theta, const float rho, const int numrho) + //////////////////////////////////////////////////////////////////////// + // linesGetResult + + __global__ void linesGetResult(const DevMem2Di accum, float2* out, int* voices, const int maxSize, const float threshold, const float theta, const float rho, const int numrho) { - __shared__ unsigned int smem[8][32]; + __shared__ int smem[8][32]; int r = blockIdx.x * (blockDim.x - 2) + threadIdx.x; int n = blockIdx.y * (blockDim.y - 2) + threadIdx.y; @@ -211,10 +250,10 @@ namespace cv { namespace gpu { namespace device smem[threadIdx.y][threadIdx.x] > smem[threadIdx.y][threadIdx.x - 1] && smem[threadIdx.y][threadIdx.x] >= smem[threadIdx.y][threadIdx.x + 1]) { - float radius = (r - (numrho - 1) * 0.5f) * rho; - float angle = n * theta; + const float radius = (r - (numrho - 1) * 0.5f) * rho; + const float angle = n * theta; - const unsigned int ind = atomicInc(&g_counter, (unsigned int)(-1)); + const int ind = ::atomicAdd(&g_counter, 1); if (ind < maxSize) { out[ind] = make_float2(radius, angle); @@ -223,13 +262,12 @@ namespace cv { namespace gpu { namespace device } } - unsigned int linesGetResult_gpu(DevMem2D_ accum, float2* out, int* voices, unsigned int maxSize, - float rho, float theta, float threshold, bool doSort) + int linesGetResult_gpu(DevMem2Di accum, float2* out, int* voices, int maxSize, float rho, float theta, float threshold, bool doSort) { void* counter_ptr; cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) ); - cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) ); + cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) ); const dim3 block(32, 8); const dim3 grid(divUp(accum.cols, block.x - 2), divUp(accum.rows, block.y - 2)); @@ -239,8 +277,8 @@ namespace cv { namespace gpu { namespace device cudaSafeCall( cudaDeviceSynchronize() ); - unsigned int total_count; - cudaSafeCall( cudaMemcpy(&total_count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) ); + int total_count; + cudaSafeCall( cudaMemcpy(&total_count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) ); total_count = ::min(total_count, maxSize); diff --git a/modules/gpu/src/hough.cpp b/modules/gpu/src/hough.cpp index 94dbe8be3..38e9c0166 100644 --- a/modules/gpu/src/hough.cpp +++ b/modules/gpu/src/hough.cpp @@ -56,9 +56,9 @@ namespace cv { namespace gpu { namespace device { namespace hough { - unsigned int buildPointList_gpu(DevMem2Db src, unsigned int* list); - void linesAccum_gpu(const unsigned int* list, unsigned int count, DevMem2D_ accum, float rho, float theta); - unsigned int linesGetResult_gpu(DevMem2D_ accum, float2* out, int* voices, unsigned int maxSize, float rho, float theta, float threshold, bool doSort); + int buildPointList_gpu(DevMem2Db src, unsigned int* list); + void linesAccum_gpu(const unsigned int* list, int count, DevMem2Di accum, float rho, float theta, size_t sharedMemPerBlock); + int linesGetResult_gpu(DevMem2Di accum, float2* out, int* voices, int maxSize, float rho, float theta, float threshold, bool doSort); } }}} @@ -71,16 +71,21 @@ void cv::gpu::HoughLinesTransform(const GpuMat& src, GpuMat& accum, GpuMat& buf, CV_Assert(src.rows < std::numeric_limits::max()); ensureSizeIsEnough(1, src.size().area(), CV_32SC1, buf); - unsigned int count = buildPointList_gpu(src, buf.ptr()); + + const int count = buildPointList_gpu(src, buf.ptr()); const int numangle = cvRound(CV_PI / theta); const int numrho = cvRound(((src.cols + src.rows) * 2 + 1) / rho); + CV_Assert(numangle > 0 && numrho > 0); + ensureSizeIsEnough(numangle + 2, numrho + 2, CV_32SC1, accum); accum.setTo(cv::Scalar::all(0)); + cv::gpu::DeviceInfo devInfo; + if (count > 0) - linesAccum_gpu(buf.ptr(), count, accum, rho, theta); + linesAccum_gpu(buf.ptr(), count, accum, rho, theta, devInfo.sharedMemPerBlock()); } void cv::gpu::HoughLinesGet(const GpuMat& accum, GpuMat& lines, float rho, float theta, int threshold, bool doSort, int maxLines) @@ -90,7 +95,8 @@ void cv::gpu::HoughLinesGet(const GpuMat& accum, GpuMat& lines, float rho, float CV_Assert(accum.type() == CV_32SC1); ensureSizeIsEnough(2, maxLines, CV_32FC2, lines); - unsigned int count = hough::linesGetResult_gpu(accum, lines.ptr(0), lines.ptr(1), maxLines, rho, theta, threshold, doSort); + + int count = hough::linesGetResult_gpu(accum, lines.ptr(0), lines.ptr(1), maxLines, rho, theta, threshold, doSort); if (count > 0) lines.cols = count; diff --git a/modules/gpu/src/opencv2/gpu/device/emulation.hpp b/modules/gpu/src/opencv2/gpu/device/emulation.hpp index e116c50ad..1a6f5794c 100644 --- a/modules/gpu/src/opencv2/gpu/device/emulation.hpp +++ b/modules/gpu/src/opencv2/gpu/device/emulation.hpp @@ -99,7 +99,7 @@ namespace cv { namespace gpu { namespace device } template - static __device__ __forceinline__ void atomicAdd(T* address, T val) + static __device__ __forceinline__ T atomicAdd(T* address, T val) { #if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120) T count; @@ -110,8 +110,10 @@ namespace cv { namespace gpu { namespace device count = tag | (count + val); *address = count; } while (*address != count); + + return (count & TAG_MASK) - val; #else - ::atomicAdd(address, val); + return ::atomicAdd(address, val); #endif } @@ -134,4 +136,4 @@ namespace cv { namespace gpu { namespace device }; }}} // namespace cv { namespace gpu { namespace device -#endif /* OPENCV_GPU_EMULATION_HPP_ */ \ No newline at end of file +#endif /* OPENCV_GPU_EMULATION_HPP_ */ From 456890e9e4915745e33d84b17fac6223c44185ed Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Wed, 15 Aug 2012 13:48:28 +0400 Subject: [PATCH 08/12] fixed warning C4512 --- modules/video/src/bgfg_gmg.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/video/src/bgfg_gmg.cpp b/modules/video/src/bgfg_gmg.cpp index 448f386bf..f2e52b593 100644 --- a/modules/video/src/bgfg_gmg.cpp +++ b/modules/video/src/bgfg_gmg.cpp @@ -197,7 +197,7 @@ namespace void operator() (const cv::Range& range) const; private: - const cv::Mat frame_; + cv::Mat frame_; mutable cv::Mat_ fgmask_; From 36fe62d7a5129525d49d7356f2d6e969e74dac51 Mon Sep 17 00:00:00 2001 From: Andrey Kamaev Date: Wed, 15 Aug 2012 15:02:20 +0400 Subject: [PATCH 09/12] #2245 Fix BFMatcher::radiusMatchImpl witch Hamming distance (thanks to Jukka Holappa) --- modules/features2d/src/matchers.cpp | 62 ++++++++++++++--------------- 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/modules/features2d/src/matchers.cpp b/modules/features2d/src/matchers.cpp index d2f71d003..091feaaa8 100755 --- a/modules/features2d/src/matchers.cpp +++ b/modules/features2d/src/matchers.cpp @@ -270,7 +270,7 @@ void DescriptorMatcher::knnMatch( const Mat& queryDescriptors, vector 0 ); - + checkMasks( masks, queryDescriptors.rows ); train(); @@ -285,7 +285,7 @@ void DescriptorMatcher::radiusMatch( const Mat& queryDescriptors, vector std::numeric_limits::epsilon() ); - + checkMasks( masks, queryDescriptors.rows ); train(); @@ -315,9 +315,9 @@ bool DescriptorMatcher::isMaskedOut( const vector& masks, int queryIdx ) return !masks.empty() && outCount == masks.size() ; } - + /////////////////////////////////////////////////////////////////////////////////////////////////////// - + BFMatcher::BFMatcher( int _normType, bool _crossCheck ) { normType = _normType; @@ -342,24 +342,24 @@ void BFMatcher::knnMatchImpl( const Mat& queryDescriptors, vector { const int IMGIDX_SHIFT = 18; const int IMGIDX_ONE = (1 << IMGIDX_SHIFT); - + if( queryDescriptors.empty() || trainDescCollection.empty() ) { matches.clear(); return; } CV_Assert( queryDescriptors.type() == trainDescCollection[0].type() ); - + matches.reserve(queryDescriptors.rows); - + Mat dist, nidx; - + int iIdx, imgCount = (int)trainDescCollection.size(), update = 0; int dtype = normType == NORM_HAMMING || normType == NORM_HAMMING2 || (normType == NORM_L1 && queryDescriptors.type() == CV_8U) ? CV_32S : CV_32F; - + CV_Assert( (int64)imgCount*IMGIDX_ONE < INT_MAX ); - + for( iIdx = 0; iIdx < imgCount; iIdx++ ) { CV_Assert( trainDescCollection[iIdx].rows < IMGIDX_ONE ); @@ -367,23 +367,23 @@ void BFMatcher::knnMatchImpl( const Mat& queryDescriptors, vector normType, knn, masks.empty() ? Mat() : masks[iIdx], update, crossCheck); update += IMGIDX_ONE; } - + if( dtype == CV_32S ) { Mat temp; dist.convertTo(temp, CV_32F); dist = temp; } - + for( int qIdx = 0; qIdx < queryDescriptors.rows; qIdx++ ) { const float* distptr = dist.ptr(qIdx); const int* nidxptr = nidx.ptr(qIdx); - + matches.push_back( vector() ); vector& mq = matches.back(); mq.reserve(knn); - + for( int k = 0; k < nidx.cols; k++ ) { if( nidxptr[k] < 0 ) @@ -391,13 +391,13 @@ void BFMatcher::knnMatchImpl( const Mat& queryDescriptors, vector mq.push_back( DMatch(qIdx, nidxptr[k] & (IMGIDX_ONE - 1), nidxptr[k] >> IMGIDX_SHIFT, distptr[k]) ); } - + if( mq.empty() && compactResult ) matches.pop_back(); } } - + void BFMatcher::radiusMatchImpl( const Mat& queryDescriptors, vector >& matches, float maxDistance, const vector& masks, bool compactResult ) { @@ -407,14 +407,14 @@ void BFMatcher::radiusMatchImpl( const Mat& queryDescriptors, vector(qIdx); - + const float* distptr = distf.ptr(qIdx); + vector& mq = matches[qIdx]; - for( int k = 0; k < dist.cols; k++ ) + for( int k = 0; k < distf.cols; k++ ) { if( distptr[k] <= maxDistance ) mq.push_back( DMatch(qIdx, k, iIdx, distptr[k]) ); } } } - + int qIdx0 = 0; for( int qIdx = 0; qIdx < queryDescriptors.rows; qIdx++ ) { if( matches[qIdx].empty() && compactResult ) continue; - + if( qIdx0 < qIdx ) std::swap(matches[qIdx], matches[qIdx0]); - + std::sort( matches[qIdx0].begin(), matches[qIdx0].end() ); qIdx0++; } } - + /////////////////////////////////////////////////////////////////////////////////////////////////////// - + /* * Factory function for DescriptorMatcher creating */ @@ -1025,7 +1025,7 @@ void GenericDescriptorMatcher::knnMatch( const Mat& queryImage, vector KeyPointsFilter::runByImageBorder( queryKeypoints, queryImage.size(), 0 ); KeyPointsFilter::runByKeypointSize( queryKeypoints, std::numeric_limits::epsilon() ); - + train(); knnMatchImpl( queryImage, queryKeypoints, matches, knn, masks, compactResult ); } @@ -1041,7 +1041,7 @@ void GenericDescriptorMatcher::radiusMatch( const Mat& queryImage, vector::epsilon() ); - + train(); radiusMatchImpl( queryImage, queryKeypoints, matches, maxDistance, masks, compactResult ); } @@ -1065,7 +1065,7 @@ Ptr GenericDescriptorMatcher::create( const string& ge { Ptr descriptorMatcher = Algorithm::create("DescriptorMatcher." + genericDescritptorMatcherType); - + if( !paramsFilename.empty() && !descriptorMatcher.empty() ) { FileStorage fs = FileStorage( paramsFilename, FileStorage::READ ); From db79022b852a3cd4e02b9381b5bb0a75664e1de1 Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Wed, 15 Aug 2012 14:31:27 +0400 Subject: [PATCH 10/12] fixed shared memory size for linesAccumShared --- modules/gpu/src/cuda/hough.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/gpu/src/cuda/hough.cu b/modules/gpu/src/cuda/hough.cu index d5f7d216c..7b33e59b7 100644 --- a/modules/gpu/src/cuda/hough.cu +++ b/modules/gpu/src/cuda/hough.cu @@ -210,7 +210,7 @@ namespace cv { namespace gpu { namespace device cudaSafeCall( cudaFuncSetCacheConfig(linesAccumShared, cudaFuncCachePreferShared) ); - size_t smemSize = (accum.cols - 2) * sizeof(int); + size_t smemSize = (accum.cols - 1) * sizeof(int); if (smemSize < sharedMemPerBlock - 1000) linesAccumShared<<>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2); From 97731c152d403e6b3b0c39951c678499b032ad88 Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Wed, 15 Aug 2012 15:05:18 +0400 Subject: [PATCH 11/12] fixed typo --- modules/gpu/include/opencv2/gpu/gpu.hpp | 2 +- modules/gpu/src/cuda/hough.cu | 12 ++++++------ modules/gpu/src/hough.cpp | 18 +++++++++--------- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp index 170c4d5ff..a7f0ab32d 100644 --- a/modules/gpu/include/opencv2/gpu/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu/gpu.hpp @@ -824,7 +824,7 @@ CV_EXPORTS void HoughLines(const GpuMat& src, GpuMat& lines, float rho, float th CV_EXPORTS void HoughLines(const GpuMat& src, GpuMat& lines, GpuMat& accum, GpuMat& buf, float rho, float theta, int threshold, bool doSort = false, int maxLines = 4096); CV_EXPORTS void HoughLinesTransform(const GpuMat& src, GpuMat& accum, GpuMat& buf, float rho, float theta); CV_EXPORTS void HoughLinesGet(const GpuMat& accum, GpuMat& lines, float rho, float theta, int threshold, bool doSort = false, int maxLines = 4096); -CV_EXPORTS void HoughLinesDownload(const GpuMat& d_lines, OutputArray h_lines, OutputArray h_voices = noArray()); +CV_EXPORTS void HoughLinesDownload(const GpuMat& d_lines, OutputArray h_lines, OutputArray h_votes = noArray()); ////////////////////////////// Matrix reductions ////////////////////////////// diff --git a/modules/gpu/src/cuda/hough.cu b/modules/gpu/src/cuda/hough.cu index 7b33e59b7..388223e31 100644 --- a/modules/gpu/src/cuda/hough.cu +++ b/modules/gpu/src/cuda/hough.cu @@ -225,7 +225,7 @@ namespace cv { namespace gpu { namespace device //////////////////////////////////////////////////////////////////////// // linesGetResult - __global__ void linesGetResult(const DevMem2Di accum, float2* out, int* voices, const int maxSize, const float threshold, const float theta, const float rho, const int numrho) + __global__ void linesGetResult(const DevMem2Di accum, float2* out, int* votes, const int maxSize, const float threshold, const float theta, const float rho, const int numrho) { __shared__ int smem[8][32]; @@ -257,12 +257,12 @@ namespace cv { namespace gpu { namespace device if (ind < maxSize) { out[ind] = make_float2(radius, angle); - voices[ind] = smem[threadIdx.y][threadIdx.x]; + votes[ind] = smem[threadIdx.y][threadIdx.x]; } } } - int linesGetResult_gpu(DevMem2Di accum, float2* out, int* voices, int maxSize, float rho, float theta, float threshold, bool doSort) + int linesGetResult_gpu(DevMem2Di accum, float2* out, int* votes, int maxSize, float rho, float theta, float threshold, bool doSort) { void* counter_ptr; cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) ); @@ -272,7 +272,7 @@ namespace cv { namespace gpu { namespace device const dim3 block(32, 8); const dim3 grid(divUp(accum.cols, block.x - 2), divUp(accum.rows, block.y - 2)); - linesGetResult<<>>(accum, out, voices, maxSize, threshold, theta, rho, accum.cols - 2); + linesGetResult<<>>(accum, out, votes, maxSize, threshold, theta, rho, accum.cols - 2); cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaDeviceSynchronize() ); @@ -285,8 +285,8 @@ namespace cv { namespace gpu { namespace device if (doSort && total_count > 0) { thrust::device_ptr out_ptr(out); - thrust::device_ptr voices_ptr(voices); - thrust::sort_by_key(voices_ptr, voices_ptr + total_count, out_ptr, thrust::greater()); + thrust::device_ptr votes_ptr(votes); + thrust::sort_by_key(votes_ptr, votes_ptr + total_count, out_ptr, thrust::greater()); } return total_count; diff --git a/modules/gpu/src/hough.cpp b/modules/gpu/src/hough.cpp index 38e9c0166..71d8ac07f 100644 --- a/modules/gpu/src/hough.cpp +++ b/modules/gpu/src/hough.cpp @@ -58,7 +58,7 @@ namespace cv { namespace gpu { namespace device { int buildPointList_gpu(DevMem2Db src, unsigned int* list); void linesAccum_gpu(const unsigned int* list, int count, DevMem2Di accum, float rho, float theta, size_t sharedMemPerBlock); - int linesGetResult_gpu(DevMem2Di accum, float2* out, int* voices, int maxSize, float rho, float theta, float threshold, bool doSort); + int linesGetResult_gpu(DevMem2Di accum, float2* out, int* votes, int maxSize, float rho, float theta, float threshold, bool doSort); } }}} @@ -116,13 +116,13 @@ void cv::gpu::HoughLines(const GpuMat& src, GpuMat& lines, GpuMat& accum, GpuMat HoughLinesGet(accum, lines, rho, theta, threshold, doSort, maxLines); } -void cv::gpu::HoughLinesDownload(const GpuMat& d_lines, OutputArray h_lines_, OutputArray h_voices_) +void cv::gpu::HoughLinesDownload(const GpuMat& d_lines, OutputArray h_lines_, OutputArray h_votes_) { if (d_lines.empty()) { h_lines_.release(); - if (h_voices_.needed()) - h_voices_.release(); + if (h_votes_.needed()) + h_votes_.release(); return; } @@ -132,12 +132,12 @@ void cv::gpu::HoughLinesDownload(const GpuMat& d_lines, OutputArray h_lines_, Ou cv::Mat h_lines = h_lines_.getMat(); d_lines.row(0).download(h_lines); - if (h_voices_.needed()) + if (h_votes_.needed()) { - h_voices_.create(1, d_lines.cols, CV_32SC1); - cv::Mat h_voices = h_voices_.getMat(); - cv::gpu::GpuMat d_voices(1, d_lines.cols, CV_32SC1, const_cast(d_lines.ptr(1))); - d_voices.download(h_voices); + h_votes_.create(1, d_lines.cols, CV_32SC1); + cv::Mat h_votes = h_votes_.getMat(); + cv::gpu::GpuMat d_votes(1, d_lines.cols, CV_32SC1, const_cast(d_lines.ptr(1))); + d_votes.download(h_votes); } } From a9fba14898129003d3cad2853ab838e800b47df2 Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Wed, 15 Aug 2012 15:06:09 +0400 Subject: [PATCH 12/12] added docs for gpu::HoughLines --- modules/gpu/doc/image_processing.rst | 91 ++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) diff --git a/modules/gpu/doc/image_processing.rst b/modules/gpu/doc/image_processing.rst index 1ad421c90..eb2561ec4 100644 --- a/modules/gpu/doc/image_processing.rst +++ b/modules/gpu/doc/image_processing.rst @@ -885,3 +885,94 @@ Finds edges in an image using the [Canny86]_ algorithm. .. seealso:: :ocv:func:`Canny` + + +gpu::HoughLines +--------------- +Finds lines in a binary image using the classical Hough transform. + +.. ocv:function:: void gpu::HoughLines(const GpuMat& src, GpuMat& lines, float rho, float theta, int threshold, bool doSort = false, int maxLines = 4096) + +.. ocv:function:: void gpu::HoughLines(const GpuMat& src, GpuMat& lines, GpuMat& accum, GpuMat& buf, float rho, float theta, int threshold, bool doSort = false, int maxLines = 4096) + + :param src: 8-bit, single-channel binary source image. + + :param lines: Output vector of lines. Each line is represented by a two-element vector :math:`(\rho, \theta)` . :math:`\rho` is the distance from the coordinate origin :math:`(0,0)` (top-left corner of the image). :math:`\theta` is the line rotation angle in radians ( :math:`0 \sim \textrm{vertical line}, \pi/2 \sim \textrm{horizontal line}` ). + + :param rho: Distance resolution of the accumulator in pixels. + + :param theta: Angle resolution of the accumulator in radians. + + :param threshold: Accumulator threshold parameter. Only those lines are returned that get enough votes ( :math:`>\texttt{threshold}` ). + + :param doSort: Performs lines sort by votes. + + :param maxLines: Maximum number of output lines. + + :param accum: Optional buffer for accumulator to avoid extra memory allocations (for many calls with the same sizes). + + :param buf: Optional buffer to avoid extra memory allocations (for many calls with the same sizes). + +.. seealso:: :ocv:func:`HoughLines` + + + +gpu::HoughLinesTransform +------------------------ +Performs classical Hough transform for line detection. + +.. ocv:function:: void gpu::HoughLinesTransform(const GpuMat& src, GpuMat& accum, GpuMat& buf, float rho, float theta) + + :param src: 8-bit, single-channel binary source image. + + :param accum: Output accumulator array. + + :param buf: Buffer to avoid extra memory allocations (for many calls with the same sizes). + + :param rho: Distance resolution of the accumulator in pixels. + + :param theta: Angle resolution of the accumulator in radians. + + :param threshold: Accumulator threshold parameter. Only those lines are returned that get enough votes ( :math:`>\texttt{threshold}` ). + +.. seealso:: :ocv:func:`gpu::HoughLines` + + + +gpu::HoughLinesGet +------------------ +Finds lines in Hough space. + +.. ocv:function:: void gpu::HoughLinesGet(const GpuMat& accum, GpuMat& lines, float rho, float theta, int threshold, bool doSort = false, int maxLines = 4096) + + :param accum: Accumulator array. + + :param lines: Output vector of lines. Each line is represented by a two-element vector :math:`(\rho, \theta)` . :math:`\rho` is the distance from the coordinate origin :math:`(0,0)` (top-left corner of the image). :math:`\theta` is the line rotation angle in radians ( :math:`0 \sim \textrm{vertical line}, \pi/2 \sim \textrm{horizontal line}` ). + + :param rho: Distance resolution of the accumulator in pixels. + + :param theta: Angle resolution of the accumulator in radians. + + :param threshold: Accumulator threshold parameter. Only those lines are returned that get enough votes ( :math:`>\texttt{threshold}` ). + + :param doSort: Performs lines sort by votes. + + :param maxLines: Maximum number of output lines. + +.. seealso:: :ocv:func:`gpu::HoughLines` + + + +gpu::HoughLinesDownload +----------------------- +Downloads results from :ocv:func:`gpu::HoughLines` to host memory. + +.. ocv:function:: void gpu::HoughLinesDownload(const GpuMat& d_lines, OutputArray h_lines, OutputArray h_votes = noArray()) + + :param d_lines: Result of :ocv:func:`gpu::HoughLines` . + + :param h_lines: Output host array. + + :param h_votes: Optional output array for line's votes. + +.. seealso:: :ocv:func:`gpu::HoughLines`