From 220d937d9a27951a3d66e2c8daaa1399b12d83fc Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Tue, 30 Dec 2014 15:36:58 +0300 Subject: [PATCH 1/2] removed buffered versions of histogram functions used BufferPool mechanism instead --- .../include/opencv2/core/private.cuda.hpp | 6 ++ .../include/opencv2/cudaimgproc.hpp | 50 ++-------------- modules/cudaimgproc/perf/perf_histogram.cpp | 9 +-- modules/cudaimgproc/src/histogram.cpp | 60 +++++++++---------- samples/gpu/performance/tests.cpp | 5 +- 5 files changed, 46 insertions(+), 84 deletions(-) diff --git a/modules/core/include/opencv2/core/private.cuda.hpp b/modules/core/include/opencv2/core/private.cuda.hpp index a97388bd0..9fff4ee28 100644 --- a/modules/core/include/opencv2/core/private.cuda.hpp +++ b/modules/core/include/opencv2/core/private.cuda.hpp @@ -130,6 +130,12 @@ namespace cv { namespace cuda class NppStreamHandler { public: + inline explicit NppStreamHandler(Stream& newStream) + { + oldStream = nppGetStream(); + nppSetStream(StreamAccessor::getStream(newStream)); + } + inline explicit NppStreamHandler(cudaStream_t newStream) { oldStream = nppGetStream(); diff --git a/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp b/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp index 1ec288fa9..7aa74aa38 100644 --- a/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp +++ b/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp @@ -205,19 +205,11 @@ CV_EXPORTS void calcHist(InputArray src, OutputArray hist, Stream& stream = Stre @param src Source image with CV_8UC1 type. @param dst Destination image. -@param buf Optional buffer to avoid extra memory allocations (for many calls with the same sizes). @param stream Stream for the asynchronous version. @sa equalizeHist */ -CV_EXPORTS void equalizeHist(InputArray src, OutputArray dst, InputOutputArray buf, Stream& stream = Stream::Null()); - -/** @overload */ -static inline void equalizeHist(InputArray src, OutputArray dst, Stream& stream = Stream::Null()) -{ - GpuMat buf; - cuda::equalizeHist(src, dst, buf, stream); -} +CV_EXPORTS void equalizeHist(InputArray src, OutputArray dst, Stream& stream = Stream::Null()); /** @brief Base class for Contrast Limited Adaptive Histogram Equalization. : */ @@ -259,27 +251,11 @@ a four-channel image, all channels are processed separately. @param histSize Size of the histogram. @param lowerLevel Lower boundary of lowest-level bin. @param upperLevel Upper boundary of highest-level bin. -@param buf Optional buffer to avoid extra memory allocations (for many calls with the same sizes). @param stream Stream for the asynchronous version. */ -CV_EXPORTS void histEven(InputArray src, OutputArray hist, InputOutputArray buf, int histSize, int lowerLevel, int upperLevel, Stream& stream = Stream::Null()); - +CV_EXPORTS void histEven(InputArray src, OutputArray hist, int histSize, int lowerLevel, int upperLevel, Stream& stream = Stream::Null()); /** @overload */ -static inline void histEven(InputArray src, OutputArray hist, int histSize, int lowerLevel, int upperLevel, Stream& stream = Stream::Null()) -{ - GpuMat buf; - cuda::histEven(src, hist, buf, histSize, lowerLevel, upperLevel, stream); -} - -/** @overload */ -CV_EXPORTS void histEven(InputArray src, GpuMat hist[4], InputOutputArray buf, int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream = Stream::Null()); - -/** @overload */ -static inline void histEven(InputArray src, GpuMat hist[4], int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream = Stream::Null()) -{ - GpuMat buf; - cuda::histEven(src, hist, buf, histSize, lowerLevel, upperLevel, stream); -} +CV_EXPORTS void histEven(InputArray src, GpuMat hist[4], int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream = Stream::Null()); /** @brief Calculates a histogram with bins determined by the levels array. @@ -287,27 +263,11 @@ static inline void histEven(InputArray src, GpuMat hist[4], int histSize[4], int For a four-channel image, all channels are processed separately. @param hist Destination histogram with one row, (levels.cols-1) columns, and the CV_32SC1 type. @param levels Number of levels in the histogram. -@param buf Optional buffer to avoid extra memory allocations (for many calls with the same sizes). @param stream Stream for the asynchronous version. */ -CV_EXPORTS void histRange(InputArray src, OutputArray hist, InputArray levels, InputOutputArray buf, Stream& stream = Stream::Null()); - +CV_EXPORTS void histRange(InputArray src, OutputArray hist, InputArray levels, Stream& stream = Stream::Null()); /** @overload */ -static inline void histRange(InputArray src, OutputArray hist, InputArray levels, Stream& stream = Stream::Null()) -{ - GpuMat buf; - cuda::histRange(src, hist, levels, buf, stream); -} - -/** @overload */ -CV_EXPORTS void histRange(InputArray src, GpuMat hist[4], const GpuMat levels[4], InputOutputArray buf, Stream& stream = Stream::Null()); - -/** @overload */ -static inline void histRange(InputArray src, GpuMat hist[4], const GpuMat levels[4], Stream& stream = Stream::Null()) -{ - GpuMat buf; - cuda::histRange(src, hist, levels, buf, stream); -} +CV_EXPORTS void histRange(InputArray src, GpuMat hist[4], const GpuMat levels[4], Stream& stream = Stream::Null()); //! @} cudaimgproc_hist diff --git a/modules/cudaimgproc/perf/perf_histogram.cpp b/modules/cudaimgproc/perf/perf_histogram.cpp index 0e020394a..c638ce0ce 100644 --- a/modules/cudaimgproc/perf/perf_histogram.cpp +++ b/modules/cudaimgproc/perf/perf_histogram.cpp @@ -63,9 +63,8 @@ PERF_TEST_P(Sz_Depth, HistEvenC1, { const cv::cuda::GpuMat d_src(src); cv::cuda::GpuMat dst; - cv::cuda::GpuMat d_buf; - TEST_CYCLE() cv::cuda::histEven(d_src, dst, d_buf, 30, 0, 180); + TEST_CYCLE() cv::cuda::histEven(d_src, dst, 30, 0, 180); CUDA_SANITY_CHECK(dst); } @@ -106,9 +105,8 @@ PERF_TEST_P(Sz_Depth, HistEvenC4, { const cv::cuda::GpuMat d_src(src); cv::cuda::GpuMat d_hist[4]; - cv::cuda::GpuMat d_buf; - TEST_CYCLE() cv::cuda::histEven(d_src, d_hist, d_buf, histSize, lowerLevel, upperLevel); + TEST_CYCLE() cv::cuda::histEven(d_src, d_hist, histSize, lowerLevel, upperLevel); cv::Mat cpu_hist0, cpu_hist1, cpu_hist2, cpu_hist3; d_hist[0].download(cpu_hist0); @@ -167,9 +165,8 @@ PERF_TEST_P(Sz, EqualizeHist, { const cv::cuda::GpuMat d_src(src); cv::cuda::GpuMat dst; - cv::cuda::GpuMat d_buf; - TEST_CYCLE() cv::cuda::equalizeHist(d_src, dst, d_buf); + TEST_CYCLE() cv::cuda::equalizeHist(d_src, dst); CUDA_SANITY_CHECK(dst); } diff --git a/modules/cudaimgproc/src/histogram.cpp b/modules/cudaimgproc/src/histogram.cpp index d63e57de3..a965242f8 100644 --- a/modules/cudaimgproc/src/histogram.cpp +++ b/modules/cudaimgproc/src/histogram.cpp @@ -49,7 +49,7 @@ using namespace cv::cuda; void cv::cuda::calcHist(InputArray, OutputArray, Stream&) { throw_no_cuda(); } -void cv::cuda::equalizeHist(InputArray, OutputArray, InputOutputArray, Stream&) { throw_no_cuda(); } +void cv::cuda::equalizeHist(InputArray, OutputArray, Stream&) { throw_no_cuda(); } cv::Ptr cv::cuda::createCLAHE(double, cv::Size) { throw_no_cuda(); return cv::Ptr(); } @@ -93,7 +93,7 @@ namespace hist void equalizeHist(PtrStepSzb src, PtrStepSzb dst, const int* lut, cudaStream_t stream); } -void cv::cuda::equalizeHist(InputArray _src, OutputArray _dst, InputOutputArray _buf, Stream& _stream) +void cv::cuda::equalizeHist(InputArray _src, OutputArray _dst, Stream& _stream) { GpuMat src = _src.getGpuMat(); @@ -107,8 +107,8 @@ void cv::cuda::equalizeHist(InputArray _src, OutputArray _dst, InputOutputArray size_t bufSize = intBufSize + 2 * 256 * sizeof(int); - ensureSizeIsEnough(1, static_cast(bufSize), CV_8UC1, _buf); - GpuMat buf = _buf.getGpuMat(); + BufferPool pool(_stream); + GpuMat buf = pool.getBuffer(1, static_cast(bufSize), CV_8UC1); GpuMat hist(1, 256, CV_32SC1, buf.data); GpuMat lut(1, 256, CV_32SC1, buf.data + 256 * sizeof(int)); @@ -288,7 +288,7 @@ namespace { typedef typename NppHistogramEvenFuncC1::src_t src_t; - static void hist(const GpuMat& src, OutputArray _hist, InputOutputArray _buf, int histSize, int lowerLevel, int upperLevel, cudaStream_t stream) + static void hist(const GpuMat& src, OutputArray _hist, int histSize, int lowerLevel, int upperLevel, Stream& stream) { const int levels = histSize + 1; @@ -302,15 +302,15 @@ namespace int buf_size; get_buf_size(sz, levels, &buf_size); - ensureSizeIsEnough(1, buf_size, CV_8UC1, _buf); - GpuMat buf = _buf.getGpuMat(); + BufferPool pool(stream); + GpuMat buf = pool.getBuffer(1, buf_size, CV_8UC1); NppStreamHandler h(stream); nppSafeCall( func(src.ptr(), static_cast(src.step), sz, hist.ptr(), levels, lowerLevel, upperLevel, buf.ptr()) ); - if (stream == 0) + if (!stream) cudaSafeCall( cudaDeviceSynchronize() ); } }; @@ -319,7 +319,7 @@ namespace { typedef typename NppHistogramEvenFuncC4::src_t src_t; - static void hist(const GpuMat& src, GpuMat hist[4],InputOutputArray _buf, int histSize[4], int lowerLevel[4], int upperLevel[4], cudaStream_t stream) + static void hist(const GpuMat& src, GpuMat hist[4], int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream) { int levels[] = {histSize[0] + 1, histSize[1] + 1, histSize[2] + 1, histSize[3] + 1}; hist[0].create(1, histSize[0], CV_32S); @@ -336,14 +336,14 @@ namespace int buf_size; get_buf_size(sz, levels, &buf_size); - ensureSizeIsEnough(1, buf_size, CV_8U, _buf); - GpuMat buf = _buf.getGpuMat(); + BufferPool pool(stream); + GpuMat buf = pool.getBuffer(1, buf_size, CV_8UC1); NppStreamHandler h(stream); nppSafeCall( func(src.ptr(), static_cast(src.step), sz, pHist, levels, lowerLevel, upperLevel, buf.ptr()) ); - if (stream == 0) + if (!stream) cudaSafeCall( cudaDeviceSynchronize() ); } }; @@ -392,7 +392,7 @@ namespace typedef typename NppHistogramRangeFuncC1::level_t level_t; enum {LEVEL_TYPE_CODE=NppHistogramRangeFuncC1::LEVEL_TYPE_CODE}; - static void hist(const GpuMat& src, OutputArray _hist, const GpuMat& levels, InputOutputArray _buf, cudaStream_t stream) + static void hist(const GpuMat& src, OutputArray _hist, const GpuMat& levels, Stream& stream) { CV_Assert( levels.type() == LEVEL_TYPE_CODE && levels.rows == 1 ); @@ -406,8 +406,8 @@ namespace int buf_size; get_buf_size(sz, levels.cols, &buf_size); - ensureSizeIsEnough(1, buf_size, CV_8U, _buf); - GpuMat buf = _buf.getGpuMat(); + BufferPool pool(stream); + GpuMat buf = pool.getBuffer(1, buf_size, CV_8UC1); NppStreamHandler h(stream); @@ -424,7 +424,7 @@ namespace typedef typename NppHistogramRangeFuncC1::level_t level_t; enum {LEVEL_TYPE_CODE=NppHistogramRangeFuncC1::LEVEL_TYPE_CODE}; - static void hist(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4],InputOutputArray _buf, cudaStream_t stream) + static void hist(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4], Stream& stream) { CV_Assert( levels[0].type() == LEVEL_TYPE_CODE && levels[0].rows == 1 ); CV_Assert( levels[1].type() == LEVEL_TYPE_CODE && levels[1].rows == 1 ); @@ -447,8 +447,8 @@ namespace int buf_size; get_buf_size(sz, nLevels, &buf_size); - ensureSizeIsEnough(1, buf_size, CV_8U, _buf); - GpuMat buf = _buf.getGpuMat(); + BufferPool pool(stream); + GpuMat buf = pool.getBuffer(1, buf_size, CV_8UC1); NppStreamHandler h(stream); @@ -493,9 +493,9 @@ namespace } } -void cv::cuda::histEven(InputArray _src, OutputArray hist, InputOutputArray buf, int histSize, int lowerLevel, int upperLevel, Stream& stream) +void cv::cuda::histEven(InputArray _src, OutputArray hist, int histSize, int lowerLevel, int upperLevel, Stream& stream) { - typedef void (*hist_t)(const GpuMat& src, OutputArray hist, InputOutputArray buf, int levels, int lowerLevel, int upperLevel, cudaStream_t stream); + typedef void (*hist_t)(const GpuMat& src, OutputArray hist, int levels, int lowerLevel, int upperLevel, Stream& stream); static const hist_t hist_callers[] = { NppHistogramEvenC1::hist, @@ -514,12 +514,12 @@ void cv::cuda::histEven(InputArray _src, OutputArray hist, InputOutputArray buf, CV_Assert( src.type() == CV_8UC1 || src.type() == CV_16UC1 || src.type() == CV_16SC1 ); - hist_callers[src.depth()](src, hist, buf, histSize, lowerLevel, upperLevel, StreamAccessor::getStream(stream)); + hist_callers[src.depth()](src, hist, histSize, lowerLevel, upperLevel, stream); } -void cv::cuda::histEven(InputArray _src, GpuMat hist[4], InputOutputArray buf, int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream) +void cv::cuda::histEven(InputArray _src, GpuMat hist[4], int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream) { - typedef void (*hist_t)(const GpuMat& src, GpuMat hist[4], InputOutputArray buf, int levels[4], int lowerLevel[4], int upperLevel[4], cudaStream_t stream); + typedef void (*hist_t)(const GpuMat& src, GpuMat hist[4], int levels[4], int lowerLevel[4], int upperLevel[4], Stream& stream); static const hist_t hist_callers[] = { NppHistogramEvenC4::hist, @@ -532,12 +532,12 @@ void cv::cuda::histEven(InputArray _src, GpuMat hist[4], InputOutputArray buf, i CV_Assert( src.type() == CV_8UC4 || src.type() == CV_16UC4 || src.type() == CV_16SC4 ); - hist_callers[src.depth()](src, hist, buf, histSize, lowerLevel, upperLevel, StreamAccessor::getStream(stream)); + hist_callers[src.depth()](src, hist, histSize, lowerLevel, upperLevel, stream); } -void cv::cuda::histRange(InputArray _src, OutputArray hist, InputArray _levels, InputOutputArray buf, Stream& stream) +void cv::cuda::histRange(InputArray _src, OutputArray hist, InputArray _levels, Stream& stream) { - typedef void (*hist_t)(const GpuMat& src, OutputArray hist, const GpuMat& levels, InputOutputArray buf, cudaStream_t stream); + typedef void (*hist_t)(const GpuMat& src, OutputArray hist, const GpuMat& levels, Stream& stream); static const hist_t hist_callers[] = { NppHistogramRangeC1::hist, @@ -553,12 +553,12 @@ void cv::cuda::histRange(InputArray _src, OutputArray hist, InputArray _levels, CV_Assert( src.type() == CV_8UC1 || src.type() == CV_16UC1 || src.type() == CV_16SC1 || src.type() == CV_32FC1 ); - hist_callers[src.depth()](src, hist, levels, buf, StreamAccessor::getStream(stream)); + hist_callers[src.depth()](src, hist, levels, stream); } -void cv::cuda::histRange(InputArray _src, GpuMat hist[4], const GpuMat levels[4], InputOutputArray buf, Stream& stream) +void cv::cuda::histRange(InputArray _src, GpuMat hist[4], const GpuMat levels[4], Stream& stream) { - typedef void (*hist_t)(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4], InputOutputArray buf, cudaStream_t stream); + typedef void (*hist_t)(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4], Stream& stream); static const hist_t hist_callers[] = { NppHistogramRangeC4::hist, @@ -573,7 +573,7 @@ void cv::cuda::histRange(InputArray _src, GpuMat hist[4], const GpuMat levels[4] CV_Assert( src.type() == CV_8UC4 || src.type() == CV_16UC4 || src.type() == CV_16SC4 || src.type() == CV_32FC4 ); - hist_callers[src.depth()](src, hist, levels, buf, StreamAccessor::getStream(stream)); + hist_callers[src.depth()](src, hist, levels, stream); } #endif /* !defined (HAVE_CUDA) */ diff --git a/samples/gpu/performance/tests.cpp b/samples/gpu/performance/tests.cpp index af3f874e1..2e7faa334 100644 --- a/samples/gpu/performance/tests.cpp +++ b/samples/gpu/performance/tests.cpp @@ -1053,12 +1053,11 @@ TEST(equalizeHist) cuda::GpuMat d_src(src); cuda::GpuMat d_dst; - cuda::GpuMat d_buf; - cuda::equalizeHist(d_src, d_dst, d_buf); + cuda::equalizeHist(d_src, d_dst); CUDA_ON; - cuda::equalizeHist(d_src, d_dst, d_buf); + cuda::equalizeHist(d_src, d_dst); CUDA_OFF; } } From f50a0612254be1116140886ba733fa4861737038 Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Tue, 30 Dec 2014 15:37:14 +0300 Subject: [PATCH 2/2] added stream parameter to all cudaimgproc routines --- .../include/opencv2/cudaimgproc.hpp | 36 ++++++++----- modules/cudaimgproc/src/canny.cpp | 50 +++++++++---------- modules/cudaimgproc/src/cuda/canny.cu | 50 +++++++++++-------- modules/cudaimgproc/src/gftt.cpp | 7 ++- modules/cudaimgproc/src/histogram.cpp | 6 +-- modules/cudaimgproc/src/hough_circles.cpp | 7 ++- modules/cudaimgproc/src/hough_lines.cpp | 21 +++++--- modules/cudaimgproc/src/hough_segments.cpp | 7 ++- modules/cudaimgproc/src/mssegmentation.cpp | 9 ++-- 9 files changed, 115 insertions(+), 78 deletions(-) diff --git a/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp b/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp index 7aa74aa38..52bfcef7a 100644 --- a/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp +++ b/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp @@ -240,8 +240,9 @@ CV_EXPORTS Ptr createCLAHE(double clipLimit = 40.0, Size tileGridSi @param nLevels Number of computed levels. nLevels must be at least 2. @param lowerLevel Lower boundary value of the lowest level. @param upperLevel Upper boundary value of the greatest level. +@param stream Stream for the asynchronous version. */ -CV_EXPORTS void evenLevels(OutputArray levels, int nLevels, int lowerLevel, int upperLevel); +CV_EXPORTS void evenLevels(OutputArray levels, int nLevels, int lowerLevel, int upperLevel, Stream& stream = Stream::Null()); /** @brief Calculates a histogram with evenly distributed bins. @@ -281,15 +282,17 @@ public: /** @brief Finds edges in an image using the @cite Canny86 algorithm. @param image Single-channel 8-bit input image. - @param edges Output edge map. It has the same size and type as image . + @param edges Output edge map. It has the same size and type as image. + @param stream Stream for the asynchronous version. */ - virtual void detect(InputArray image, OutputArray edges) = 0; + virtual void detect(InputArray image, OutputArray edges, Stream& stream = Stream::Null()) = 0; /** @overload @param dx First derivative of image in the vertical direction. Support only CV_32S type. @param dy First derivative of image in the horizontal direction. Support only CV_32S type. - @param edges Output edge map. It has the same size and type as image . + @param edges Output edge map. It has the same size and type as image. + @param stream Stream for the asynchronous version. */ - virtual void detect(InputArray dx, InputArray dy, OutputArray edges) = 0; + virtual void detect(InputArray dx, InputArray dy, OutputArray edges, Stream& stream = Stream::Null()) = 0; virtual void setLowThreshold(double low_thresh) = 0; virtual double getLowThreshold() const = 0; @@ -336,18 +339,20 @@ public: \f$(\rho, \theta)\f$ . \f$\rho\f$ is the distance from the coordinate origin \f$(0,0)\f$ (top-left corner of the image). \f$\theta\f$ is the line rotation angle in radians ( \f$0 \sim \textrm{vertical line}, \pi/2 \sim \textrm{horizontal line}\f$ ). + @param stream Stream for the asynchronous version. @sa HoughLines */ - virtual void detect(InputArray src, OutputArray lines) = 0; + virtual void detect(InputArray src, OutputArray lines, Stream& stream = Stream::Null()) = 0; /** @brief Downloads results from cuda::HoughLinesDetector::detect to host memory. @param d_lines Result of cuda::HoughLinesDetector::detect . @param h_lines Output host array. @param h_votes Optional output array for line's votes. + @param stream Stream for the asynchronous version. */ - virtual void downloadResults(InputArray d_lines, OutputArray h_lines, OutputArray h_votes = noArray()) = 0; + virtual void downloadResults(InputArray d_lines, OutputArray h_lines, OutputArray h_votes = noArray(), Stream& stream = Stream::Null()) = 0; virtual void setRho(float rho) = 0; virtual float getRho() const = 0; @@ -391,10 +396,11 @@ public: @param lines Output vector of lines. Each line is represented by a 4-element vector \f$(x_1, y_1, x_2, y_2)\f$ , where \f$(x_1,y_1)\f$ and \f$(x_2, y_2)\f$ are the ending points of each detected line segment. + @param stream Stream for the asynchronous version. @sa HoughLinesP */ - virtual void detect(InputArray src, OutputArray lines) = 0; + virtual void detect(InputArray src, OutputArray lines, Stream& stream = Stream::Null()) = 0; virtual void setRho(float rho) = 0; virtual float getRho() const = 0; @@ -435,10 +441,11 @@ public: @param src 8-bit, single-channel grayscale input image. @param circles Output vector of found circles. Each vector is encoded as a 3-element floating-point vector \f$(x, y, radius)\f$ . + @param stream Stream for the asynchronous version. @sa HoughCircles */ - virtual void detect(InputArray src, OutputArray circles) = 0; + virtual void detect(InputArray src, OutputArray circles, Stream& stream = Stream::Null()) = 0; virtual void setDp(float dp) = 0; virtual float getDp() const = 0; @@ -553,8 +560,9 @@ public: positions). @param mask Optional region of interest. If the image is not empty (it needs to have the type CV_8UC1 and the same size as image ), it specifies the region in which the corners are detected. + @param stream Stream for the asynchronous version. */ - virtual void detect(InputArray image, OutputArray corners, InputArray mask = noArray()) = 0; + virtual void detect(InputArray image, OutputArray corners, InputArray mask = noArray(), Stream& stream = Stream::Null()) = 0; }; /** @brief Creates implementation for cuda::CornersDetector . @@ -590,7 +598,7 @@ as src . @param sp Spatial window radius. @param sr Color window radius. @param criteria Termination criteria. See TermCriteria. -@param stream +@param stream Stream for the asynchronous version. It maps each point of the source image into another point. As a result, you have a new color and new position of each point. @@ -610,7 +618,7 @@ src size. The type is CV_16SC2 . @param sp Spatial window radius. @param sr Color window radius. @param criteria Termination criteria. See TermCriteria. -@param stream +@param stream Stream for the asynchronous version. @sa cuda::meanShiftFiltering */ @@ -626,9 +634,11 @@ CV_EXPORTS void meanShiftProc(InputArray src, OutputArray dstr, OutputArray dsts @param sr Color window radius. @param minsize Minimum segment size. Smaller segments are merged. @param criteria Termination criteria. See TermCriteria. +@param stream Stream for the asynchronous version. */ CV_EXPORTS void meanShiftSegmentation(InputArray src, OutputArray dst, int sp, int sr, int minsize, - TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1)); + TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1), + Stream& stream = Stream::Null()); /////////////////////////// Match Template //////////////////////////// diff --git a/modules/cudaimgproc/src/canny.cpp b/modules/cudaimgproc/src/canny.cpp index eed4a284e..1e52bd295 100644 --- a/modules/cudaimgproc/src/canny.cpp +++ b/modules/cudaimgproc/src/canny.cpp @@ -53,16 +53,16 @@ Ptr cv::cuda::createCannyEdgeDetector(double, double, int, bo namespace canny { - void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad); - void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad); + void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad, cudaStream_t stream); + void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad, cudaStream_t stream); - void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh); + void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh, cudaStream_t stream); - void edgesHysteresisLocal(PtrStepSzi map, short2* st1); + void edgesHysteresisLocal(PtrStepSzi map, short2* st1, cudaStream_t stream); - void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2); + void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2, cudaStream_t stream); - void getEdges(PtrStepSzi map, PtrStepSzb dst); + void getEdges(PtrStepSzi map, PtrStepSzb dst, cudaStream_t stream); } namespace @@ -76,8 +76,8 @@ namespace old_apperture_size_ = -1; } - void detect(InputArray image, OutputArray edges); - void detect(InputArray dx, InputArray dy, OutputArray edges); + void detect(InputArray image, OutputArray edges, Stream& stream); + void detect(InputArray dx, InputArray dy, OutputArray edges, Stream& stream); void setLowThreshold(double low_thresh) { low_thresh_ = low_thresh; } double getLowThreshold() const { return low_thresh_; } @@ -111,7 +111,7 @@ namespace private: void createBuf(Size image_size); - void CannyCaller(GpuMat& edges); + void CannyCaller(GpuMat& edges, Stream& stream); double low_thresh_; double high_thresh_; @@ -128,7 +128,7 @@ namespace int old_apperture_size_; }; - void CannyImpl::detect(InputArray _image, OutputArray _edges) + void CannyImpl::detect(InputArray _image, OutputArray _edges, Stream& stream) { GpuMat image = _image.getGpuMat(); @@ -150,24 +150,24 @@ namespace image.locateROI(wholeSize, ofs); GpuMat srcWhole(wholeSize, image.type(), image.datastart, image.step); - canny::calcMagnitude(srcWhole, ofs.x, ofs.y, dx_, dy_, mag_, L2gradient_); + canny::calcMagnitude(srcWhole, ofs.x, ofs.y, dx_, dy_, mag_, L2gradient_, StreamAccessor::getStream(stream)); } else { #ifndef HAVE_OPENCV_CUDAFILTERS throw_no_cuda(); #else - filterDX_->apply(image, dx_); - filterDY_->apply(image, dy_); + filterDX_->apply(image, dx_, stream); + filterDY_->apply(image, dy_, stream); - canny::calcMagnitude(dx_, dy_, mag_, L2gradient_); + canny::calcMagnitude(dx_, dy_, mag_, L2gradient_, StreamAccessor::getStream(stream)); #endif } - CannyCaller(edges); + CannyCaller(edges, stream); } - void CannyImpl::detect(InputArray _dx, InputArray _dy, OutputArray _edges) + void CannyImpl::detect(InputArray _dx, InputArray _dy, OutputArray _edges, Stream& stream) { GpuMat dx = _dx.getGpuMat(); GpuMat dy = _dy.getGpuMat(); @@ -176,8 +176,8 @@ namespace CV_Assert( dy.type() == dx.type() && dy.size() == dx.size() ); CV_Assert( deviceSupports(SHARED_ATOMICS) ); - dx.copyTo(dx_); - dy.copyTo(dy_); + dx.copyTo(dx_, stream); + dy.copyTo(dy_, stream); if (low_thresh_ > high_thresh_) std::swap(low_thresh_, high_thresh_); @@ -187,9 +187,9 @@ namespace _edges.create(dx.size(), CV_8UC1); GpuMat edges = _edges.getGpuMat(); - canny::calcMagnitude(dx_, dy_, mag_, L2gradient_); + canny::calcMagnitude(dx_, dy_, mag_, L2gradient_, StreamAccessor::getStream(stream)); - CannyCaller(edges); + CannyCaller(edges, stream); } void CannyImpl::createBuf(Size image_size) @@ -215,16 +215,16 @@ namespace ensureSizeIsEnough(1, image_size.area(), CV_16SC2, st2_); } - void CannyImpl::CannyCaller(GpuMat& edges) + void CannyImpl::CannyCaller(GpuMat& edges, Stream& stream) { map_.setTo(Scalar::all(0)); - canny::calcMap(dx_, dy_, mag_, map_, static_cast(low_thresh_), static_cast(high_thresh_)); + canny::calcMap(dx_, dy_, mag_, map_, static_cast(low_thresh_), static_cast(high_thresh_), StreamAccessor::getStream(stream)); - canny::edgesHysteresisLocal(map_, st1_.ptr()); + canny::edgesHysteresisLocal(map_, st1_.ptr(), StreamAccessor::getStream(stream)); - canny::edgesHysteresisGlobal(map_, st1_.ptr(), st2_.ptr()); + canny::edgesHysteresisGlobal(map_, st1_.ptr(), st2_.ptr(), StreamAccessor::getStream(stream)); - canny::getEdges(map_, edges); + canny::getEdges(map_, edges, StreamAccessor::getStream(stream)); } } diff --git a/modules/cudaimgproc/src/cuda/canny.cu b/modules/cudaimgproc/src/cuda/canny.cu index 3d770e179..e0ba51569 100644 --- a/modules/cudaimgproc/src/cuda/canny.cu +++ b/modules/cudaimgproc/src/cuda/canny.cu @@ -120,7 +120,7 @@ namespace canny mag(y, x) = norm(dxVal, dyVal); } - void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad) + void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad, cudaStream_t stream) { const dim3 block(16, 16); const dim3 grid(divUp(mag.cols, block.x), divUp(mag.rows, block.y)); @@ -131,30 +131,31 @@ namespace canny if (L2Grad) { L2 norm; - calcMagnitudeKernel<<>>(src, dx, dy, mag, norm); + calcMagnitudeKernel<<>>(src, dx, dy, mag, norm); } else { L1 norm; - calcMagnitudeKernel<<>>(src, dx, dy, mag, norm); + calcMagnitudeKernel<<>>(src, dx, dy, mag, norm); } cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaThreadSynchronize()); + if (stream == NULL) + cudaSafeCall( cudaDeviceSynchronize() ); } - void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad) + void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad, cudaStream_t stream) { if (L2Grad) { L2 norm; - transform(dx, dy, mag, norm, WithOutMask(), 0); + transform(dx, dy, mag, norm, WithOutMask(), stream); } else { L1 norm; - transform(dx, dy, mag, norm, WithOutMask(), 0); + transform(dx, dy, mag, norm, WithOutMask(), stream); } } } @@ -217,17 +218,18 @@ namespace canny map(y, x) = edge_type; } - void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh) + void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh, cudaStream_t stream) { const dim3 block(16, 16); const dim3 grid(divUp(dx.cols, block.x), divUp(dx.rows, block.y)); bindTexture(&tex_mag, mag); - calcMapKernel<<>>(dx, dy, map, low_thresh, high_thresh); + calcMapKernel<<>>(dx, dy, map, low_thresh, high_thresh); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); + if (stream == NULL) + cudaSafeCall( cudaDeviceSynchronize() ); } } @@ -328,20 +330,21 @@ namespace canny } } - void edgesHysteresisLocal(PtrStepSzi map, short2* st1) + void edgesHysteresisLocal(PtrStepSzi map, short2* st1, cudaStream_t stream) { void* counter_ptr; cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) ); - cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) ); + cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(int), stream) ); const dim3 block(16, 16); const dim3 grid(divUp(map.cols, block.x), divUp(map.rows, block.y)); - edgesHysteresisLocalKernel<<>>(map, st1); + edgesHysteresisLocalKernel<<>>(map, st1); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); + if (stream == NULL) + cudaSafeCall( cudaDeviceSynchronize() ); } } @@ -441,27 +444,30 @@ namespace canny } } - void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2) + void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2, cudaStream_t stream) { void* counter_ptr; cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, canny::counter) ); int count; - cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) ); + cudaSafeCall( cudaMemcpyAsync(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost, stream) ); + cudaSafeCall( cudaStreamSynchronize(stream) ); while (count > 0) { - cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) ); + cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(int), stream) ); const dim3 block(128); const dim3 grid(::min(count, 65535u), divUp(count, 65535), 1); - edgesHysteresisGlobalKernel<<>>(map, st1, st2, count); + edgesHysteresisGlobalKernel<<>>(map, st1, st2, count); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); + if (stream == NULL) + cudaSafeCall( cudaDeviceSynchronize() ); - cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) ); + cudaSafeCall( cudaMemcpyAsync(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost, stream) ); + cudaSafeCall( cudaStreamSynchronize(stream) ); count = min(count, map.cols * map.rows); @@ -499,9 +505,9 @@ namespace cv { namespace cuda { namespace device namespace canny { - void getEdges(PtrStepSzi map, PtrStepSzb dst) + void getEdges(PtrStepSzi map, PtrStepSzb dst, cudaStream_t stream) { - transform(map, dst, GetEdges(), WithOutMask(), 0); + transform(map, dst, GetEdges(), WithOutMask(), stream); } } diff --git a/modules/cudaimgproc/src/gftt.cpp b/modules/cudaimgproc/src/gftt.cpp index 243665083..162ee469c 100644 --- a/modules/cudaimgproc/src/gftt.cpp +++ b/modules/cudaimgproc/src/gftt.cpp @@ -68,7 +68,7 @@ namespace GoodFeaturesToTrackDetector(int srcType, int maxCorners, double qualityLevel, double minDistance, int blockSize, bool useHarrisDetector, double harrisK); - void detect(InputArray image, OutputArray corners, InputArray mask = noArray()); + void detect(InputArray image, OutputArray corners, InputArray mask, Stream& stream); private: int maxCorners_; @@ -96,8 +96,11 @@ namespace cuda::createMinEigenValCorner(srcType, blockSize, 3); } - void GoodFeaturesToTrackDetector::detect(InputArray _image, OutputArray _corners, InputArray _mask) + void GoodFeaturesToTrackDetector::detect(InputArray _image, OutputArray _corners, InputArray _mask, Stream& stream) { + // TODO : implement async version + (void) stream; + using namespace cv::cuda::device::gfft; GpuMat image = _image.getGpuMat(); diff --git a/modules/cudaimgproc/src/histogram.cpp b/modules/cudaimgproc/src/histogram.cpp index a965242f8..e942e9eb8 100644 --- a/modules/cudaimgproc/src/histogram.cpp +++ b/modules/cudaimgproc/src/histogram.cpp @@ -53,7 +53,7 @@ void cv::cuda::equalizeHist(InputArray, OutputArray, Stream&) { throw_no_cuda(); cv::Ptr cv::cuda::createCLAHE(double, cv::Size) { throw_no_cuda(); return cv::Ptr(); } -void cv::cuda::evenLevels(OutputArray, int, int, int) { throw_no_cuda(); } +void cv::cuda::evenLevels(OutputArray, int, int, int, Stream&) { throw_no_cuda(); } void cv::cuda::histEven(InputArray, OutputArray, InputOutputArray, int, int, int, Stream&) { throw_no_cuda(); } void cv::cuda::histEven(InputArray, GpuMat*, InputOutputArray, int*, int*, int*, Stream&) { throw_no_cuda(); } @@ -460,7 +460,7 @@ namespace }; } -void cv::cuda::evenLevels(OutputArray _levels, int nLevels, int lowerLevel, int upperLevel) +void cv::cuda::evenLevels(OutputArray _levels, int nLevels, int lowerLevel, int upperLevel, Stream& stream) { const int kind = _levels.kind(); @@ -475,7 +475,7 @@ void cv::cuda::evenLevels(OutputArray _levels, int nLevels, int lowerLevel, int nppSafeCall( nppiEvenLevelsHost_32s(host_levels.ptr(), nLevels, lowerLevel, upperLevel) ); if (kind == _InputArray::CUDA_GPU_MAT) - _levels.getGpuMatRef().upload(host_levels); + _levels.getGpuMatRef().upload(host_levels, stream); } namespace hist diff --git a/modules/cudaimgproc/src/hough_circles.cpp b/modules/cudaimgproc/src/hough_circles.cpp index 3f9b9334c..6bdaf16a2 100644 --- a/modules/cudaimgproc/src/hough_circles.cpp +++ b/modules/cudaimgproc/src/hough_circles.cpp @@ -74,7 +74,7 @@ namespace public: HoughCirclesDetectorImpl(float dp, float minDist, int cannyThreshold, int votesThreshold, int minRadius, int maxRadius, int maxCircles); - void detect(InputArray src, OutputArray circles); + void detect(InputArray src, OutputArray circles, Stream& stream); void setDp(float dp) { dp_ = dp; } float getDp() const { return dp_; } @@ -154,8 +154,11 @@ namespace filterDy_ = cuda::createSobelFilter(CV_8UC1, CV_32S, 0, 1); } - void HoughCirclesDetectorImpl::detect(InputArray _src, OutputArray circles) + void HoughCirclesDetectorImpl::detect(InputArray _src, OutputArray circles, Stream& stream) { + // TODO : implement async version + (void) stream; + using namespace cv::cuda::device::hough; using namespace cv::cuda::device::hough_circles; diff --git a/modules/cudaimgproc/src/hough_lines.cpp b/modules/cudaimgproc/src/hough_lines.cpp index b9f159a9c..7b9c08294 100644 --- a/modules/cudaimgproc/src/hough_lines.cpp +++ b/modules/cudaimgproc/src/hough_lines.cpp @@ -75,8 +75,8 @@ namespace { } - void detect(InputArray src, OutputArray lines); - void downloadResults(InputArray d_lines, OutputArray h_lines, OutputArray h_votes = noArray()); + void detect(InputArray src, OutputArray lines, Stream& stream); + void downloadResults(InputArray d_lines, OutputArray h_lines, OutputArray h_votes, Stream& stream); void setRho(float rho) { rho_ = rho; } float getRho() const { return rho_; } @@ -125,8 +125,11 @@ namespace GpuMat result_; }; - void HoughLinesDetectorImpl::detect(InputArray _src, OutputArray lines) + void HoughLinesDetectorImpl::detect(InputArray _src, OutputArray lines, Stream& stream) { + // TODO : implement async version + (void) stream; + using namespace cv::cuda::device::hough; using namespace cv::cuda::device::hough_lines; @@ -170,7 +173,7 @@ namespace result_.copyTo(lines); } - void HoughLinesDetectorImpl::downloadResults(InputArray _d_lines, OutputArray h_lines, OutputArray h_votes) + void HoughLinesDetectorImpl::downloadResults(InputArray _d_lines, OutputArray h_lines, OutputArray h_votes, Stream& stream) { GpuMat d_lines = _d_lines.getGpuMat(); @@ -184,12 +187,18 @@ namespace CV_Assert( d_lines.rows == 2 && d_lines.type() == CV_32FC2 ); - d_lines.row(0).download(h_lines); + if (stream) + d_lines.row(0).download(h_lines, stream); + else + d_lines.row(0).download(h_lines); if (h_votes.needed()) { GpuMat d_votes(1, d_lines.cols, CV_32SC1, d_lines.ptr(1)); - d_votes.download(h_votes); + if (stream) + d_votes.download(h_votes, stream); + else + d_votes.download(h_votes); } } } diff --git a/modules/cudaimgproc/src/hough_segments.cpp b/modules/cudaimgproc/src/hough_segments.cpp index 2434f6d26..e3e34ec3d 100644 --- a/modules/cudaimgproc/src/hough_segments.cpp +++ b/modules/cudaimgproc/src/hough_segments.cpp @@ -79,7 +79,7 @@ namespace { } - void detect(InputArray src, OutputArray lines); + void detect(InputArray src, OutputArray lines, Stream& stream); void setRho(float rho) { rho_ = rho; } float getRho() const { return rho_; } @@ -128,8 +128,11 @@ namespace GpuMat result_; }; - void HoughSegmentDetectorImpl::detect(InputArray _src, OutputArray lines) + void HoughSegmentDetectorImpl::detect(InputArray _src, OutputArray lines, Stream& stream) { + // TODO : implement async version + (void) stream; + using namespace cv::cuda::device::hough; using namespace cv::cuda::device::hough_lines; using namespace cv::cuda::device::hough_segments; diff --git a/modules/cudaimgproc/src/mssegmentation.cpp b/modules/cudaimgproc/src/mssegmentation.cpp index ad5819800..54926f377 100644 --- a/modules/cudaimgproc/src/mssegmentation.cpp +++ b/modules/cudaimgproc/src/mssegmentation.cpp @@ -43,7 +43,7 @@ #if !defined HAVE_CUDA || defined(CUDA_DISABLER) -void cv::cuda::meanShiftSegmentation(InputArray, OutputArray, int, int, int, TermCriteria) { throw_no_cuda(); } +void cv::cuda::meanShiftSegmentation(InputArray, OutputArray, int, int, int, TermCriteria, Stream&) { throw_no_cuda(); } #else @@ -222,7 +222,7 @@ inline int dist2(const cv::Vec2s& lhs, const cv::Vec2s& rhs) } // anonymous namespace -void cv::cuda::meanShiftSegmentation(InputArray _src, OutputArray _dst, int sp, int sr, int minsize, TermCriteria criteria) +void cv::cuda::meanShiftSegmentation(InputArray _src, OutputArray _dst, int sp, int sr, int minsize, TermCriteria criteria, Stream& stream) { GpuMat src = _src.getGpuMat(); @@ -235,7 +235,10 @@ void cv::cuda::meanShiftSegmentation(InputArray _src, OutputArray _dst, int sp, // Perform mean shift procedure and obtain region and spatial maps GpuMat d_rmap, d_spmap; - cuda::meanShiftProc(src, d_rmap, d_spmap, sp, sr, criteria); + cuda::meanShiftProc(src, d_rmap, d_spmap, sp, sr, criteria, stream); + + stream.waitForCompletion(); + Mat rmap(d_rmap); Mat spmap(d_spmap);