Merge pull request #3566 from jet47:cuda-imgproc-refactoring

2015-01-13 11:21:51 +00:00
parent 60cedd7a68 f50a061225
commit 2ecca8f59c
12 changed files with 161 additions and 162 deletions
--- a/modules/core/include/opencv2/core/private.cuda.hpp
+++ b/modules/core/include/opencv2/core/private.cuda.hpp
@@ -130,6 +130,12 @@ namespace cv { namespace cuda
    class NppStreamHandler
    {
    public:
+        inline explicit NppStreamHandler(Stream& newStream)
+        {
+            oldStream = nppGetStream();
+            nppSetStream(StreamAccessor::getStream(newStream));
+        }
+
        inline explicit NppStreamHandler(cudaStream_t newStream)
        {
            oldStream = nppGetStream();
--- a/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp
+++ b/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp
@@ -205,19 +205,11 @@ CV_EXPORTS void calcHist(InputArray src, OutputArray hist, Stream& stream = Stre

@param src Source image with CV_8UC1 type.
@param dst Destination image.
-@param buf Optional buffer to avoid extra memory allocations (for many calls with the same sizes).
@param stream Stream for the asynchronous version.

@sa equalizeHist
 */
-CV_EXPORTS void equalizeHist(InputArray src, OutputArray dst, InputOutputArray buf, Stream& stream = Stream::Null());
-
-/** @overload */
-static inline void equalizeHist(InputArray src, OutputArray dst, Stream& stream = Stream::Null())
-{
-    GpuMat buf;
-    cuda::equalizeHist(src, dst, buf, stream);
-}
+CV_EXPORTS void equalizeHist(InputArray src, OutputArray dst, Stream& stream = Stream::Null());

 /** @brief Base class for Contrast Limited Adaptive Histogram Equalization. :
 */
@@ -248,8 +240,9 @@ CV_EXPORTS Ptr<cuda::CLAHE> createCLAHE(double clipLimit = 40.0, Size tileGridSi
@param nLevels Number of computed levels. nLevels must be at least 2.
@param lowerLevel Lower boundary value of the lowest level.
@param upperLevel Upper boundary value of the greatest level.
+@param stream Stream for the asynchronous version.
 */
-CV_EXPORTS void evenLevels(OutputArray levels, int nLevels, int lowerLevel, int upperLevel);
+CV_EXPORTS void evenLevels(OutputArray levels, int nLevels, int lowerLevel, int upperLevel, Stream& stream = Stream::Null());

 /** @brief Calculates a histogram with evenly distributed bins.

@@ -259,27 +252,11 @@ a four-channel image, all channels are processed separately.
@param histSize Size of the histogram.
@param lowerLevel Lower boundary of lowest-level bin.
@param upperLevel Upper boundary of highest-level bin.
-@param buf Optional buffer to avoid extra memory allocations (for many calls with the same sizes).
@param stream Stream for the asynchronous version.
 */
-CV_EXPORTS void histEven(InputArray src, OutputArray hist, InputOutputArray buf, int histSize, int lowerLevel, int upperLevel, Stream& stream = Stream::Null());
-
+CV_EXPORTS void histEven(InputArray src, OutputArray hist, int histSize, int lowerLevel, int upperLevel, Stream& stream = Stream::Null());
 /** @overload */
-static inline void histEven(InputArray src, OutputArray hist, int histSize, int lowerLevel, int upperLevel, Stream& stream = Stream::Null())
-{
-    GpuMat buf;
-    cuda::histEven(src, hist, buf, histSize, lowerLevel, upperLevel, stream);
-}
-
-/** @overload */
-CV_EXPORTS void histEven(InputArray src, GpuMat hist[4], InputOutputArray buf, int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream = Stream::Null());
-
-/** @overload */
-static inline void histEven(InputArray src, GpuMat hist[4], int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream = Stream::Null())
-{
-    GpuMat buf;
-    cuda::histEven(src, hist, buf, histSize, lowerLevel, upperLevel, stream);
-}
+CV_EXPORTS void histEven(InputArray src, GpuMat hist[4], int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream = Stream::Null());

 /** @brief Calculates a histogram with bins determined by the levels array.

@@ -287,27 +264,11 @@ static inline void histEven(InputArray src, GpuMat hist[4], int histSize[4], int
 For a four-channel image, all channels are processed separately.
@param hist Destination histogram with one row, (levels.cols-1) columns, and the CV_32SC1 type.
@param levels Number of levels in the histogram.
-@param buf Optional buffer to avoid extra memory allocations (for many calls with the same sizes).
@param stream Stream for the asynchronous version.
 */
-CV_EXPORTS void histRange(InputArray src, OutputArray hist, InputArray levels, InputOutputArray buf, Stream& stream = Stream::Null());
-
+CV_EXPORTS void histRange(InputArray src, OutputArray hist, InputArray levels, Stream& stream = Stream::Null());
 /** @overload */
-static inline void histRange(InputArray src, OutputArray hist, InputArray levels, Stream& stream = Stream::Null())
-{
-    GpuMat buf;
-    cuda::histRange(src, hist, levels, buf, stream);
-}
-
-/** @overload */
-CV_EXPORTS void histRange(InputArray src, GpuMat hist[4], const GpuMat levels[4], InputOutputArray buf, Stream& stream = Stream::Null());
-
-/** @overload */
-static inline void histRange(InputArray src, GpuMat hist[4], const GpuMat levels[4], Stream& stream = Stream::Null())
-{
-    GpuMat buf;
-    cuda::histRange(src, hist, levels, buf, stream);
-}
+CV_EXPORTS void histRange(InputArray src, GpuMat hist[4], const GpuMat levels[4], Stream& stream = Stream::Null());

 //! @} cudaimgproc_hist

@@ -321,15 +282,17 @@ public:
    /** @brief Finds edges in an image using the @cite Canny86 algorithm.

    @param image Single-channel 8-bit input image.
-    @param edges Output edge map. It has the same size and type as image .
+    @param edges Output edge map. It has the same size and type as image.
+    @param stream Stream for the asynchronous version.
     */
-    virtual void detect(InputArray image, OutputArray edges) = 0;
+    virtual void detect(InputArray image, OutputArray edges, Stream& stream = Stream::Null()) = 0;
    /** @overload
    @param dx First derivative of image in the vertical direction. Support only CV_32S type.
    @param dy First derivative of image in the horizontal direction. Support only CV_32S type.
-    @param edges Output edge map. It has the same size and type as image .
+    @param edges Output edge map. It has the same size and type as image.
+    @param stream Stream for the asynchronous version.
    */
-    virtual void detect(InputArray dx, InputArray dy, OutputArray edges) = 0;
+    virtual void detect(InputArray dx, InputArray dy, OutputArray edges, Stream& stream = Stream::Null()) = 0;

    virtual void setLowThreshold(double low_thresh) = 0;
    virtual double getLowThreshold() const = 0;
@@ -376,18 +339,20 @@ public:
    \f$(\rho, \theta)\f$ . \f$\rho\f$ is the distance from the coordinate origin \f$(0,0)\f$ (top-left corner of
    the image). \f$\theta\f$ is the line rotation angle in radians (
    \f$0 \sim \textrm{vertical line}, \pi/2 \sim \textrm{horizontal line}\f$ ).
+    @param stream Stream for the asynchronous version.

    @sa HoughLines
     */
-    virtual void detect(InputArray src, OutputArray lines) = 0;
+    virtual void detect(InputArray src, OutputArray lines, Stream& stream = Stream::Null()) = 0;

    /** @brief Downloads results from cuda::HoughLinesDetector::detect to host memory.

    @param d_lines Result of cuda::HoughLinesDetector::detect .
    @param h_lines Output host array.
    @param h_votes Optional output array for line's votes.
+    @param stream Stream for the asynchronous version.
     */
-    virtual void downloadResults(InputArray d_lines, OutputArray h_lines, OutputArray h_votes = noArray()) = 0;
+    virtual void downloadResults(InputArray d_lines, OutputArray h_lines, OutputArray h_votes = noArray(), Stream& stream = Stream::Null()) = 0;

    virtual void setRho(float rho) = 0;
    virtual float getRho() const = 0;
@@ -431,10 +396,11 @@ public:
    @param lines Output vector of lines. Each line is represented by a 4-element vector
    \f$(x_1, y_1, x_2, y_2)\f$ , where \f$(x_1,y_1)\f$ and \f$(x_2, y_2)\f$ are the ending points of each detected
    line segment.
+    @param stream Stream for the asynchronous version.

    @sa HoughLinesP
     */
-    virtual void detect(InputArray src, OutputArray lines) = 0;
+    virtual void detect(InputArray src, OutputArray lines, Stream& stream = Stream::Null()) = 0;

    virtual void setRho(float rho) = 0;
    virtual float getRho() const = 0;
@@ -475,10 +441,11 @@ public:
    @param src 8-bit, single-channel grayscale input image.
    @param circles Output vector of found circles. Each vector is encoded as a 3-element
    floating-point vector \f$(x, y, radius)\f$ .
+    @param stream Stream for the asynchronous version.

    @sa HoughCircles
     */
-    virtual void detect(InputArray src, OutputArray circles) = 0;
+    virtual void detect(InputArray src, OutputArray circles, Stream& stream = Stream::Null()) = 0;

    virtual void setDp(float dp) = 0;
    virtual float getDp() const = 0;
@@ -593,8 +560,9 @@ public:
    positions).
    @param mask Optional region of interest. If the image is not empty (it needs to have the type
    CV_8UC1 and the same size as image ), it specifies the region in which the corners are detected.
+    @param stream Stream for the asynchronous version.
     */
-    virtual void detect(InputArray image, OutputArray corners, InputArray mask = noArray()) = 0;
+    virtual void detect(InputArray image, OutputArray corners, InputArray mask = noArray(), Stream& stream = Stream::Null()) = 0;
 };

 /** @brief Creates implementation for cuda::CornersDetector .
@@ -630,7 +598,7 @@ as src .
@param sp Spatial window radius.
@param sr Color window radius.
@param criteria Termination criteria. See TermCriteria.
-@param stream
+@param stream Stream for the asynchronous version.

 It maps each point of the source image into another point. As a result, you have a new color and new
 position of each point.
@@ -650,7 +618,7 @@ src size. The type is CV_16SC2 .
@param sp Spatial window radius.
@param sr Color window radius.
@param criteria Termination criteria. See TermCriteria.
-@param stream
+@param stream Stream for the asynchronous version.

@sa cuda::meanShiftFiltering
 */
@@ -666,9 +634,11 @@ CV_EXPORTS void meanShiftProc(InputArray src, OutputArray dstr, OutputArray dsts
@param sr Color window radius.
@param minsize Minimum segment size. Smaller segments are merged.
@param criteria Termination criteria. See TermCriteria.
+@param stream Stream for the asynchronous version.
 */
 CV_EXPORTS void meanShiftSegmentation(InputArray src, OutputArray dst, int sp, int sr, int minsize,
-                                      TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1));
+                                      TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1),
+                                      Stream& stream = Stream::Null());

 /////////////////////////// Match Template ////////////////////////////

--- a/modules/cudaimgproc/perf/perf_histogram.cpp
+++ b/modules/cudaimgproc/perf/perf_histogram.cpp
@@ -63,9 +63,8 @@ PERF_TEST_P(Sz_Depth, HistEvenC1,
    {
        const cv::cuda::GpuMat d_src(src);
        cv::cuda::GpuMat dst;
-        cv::cuda::GpuMat d_buf;

-        TEST_CYCLE() cv::cuda::histEven(d_src, dst, d_buf, 30, 0, 180);
+        TEST_CYCLE() cv::cuda::histEven(d_src, dst, 30, 0, 180);

        CUDA_SANITY_CHECK(dst);
    }
@@ -106,9 +105,8 @@ PERF_TEST_P(Sz_Depth, HistEvenC4,
    {
        const cv::cuda::GpuMat d_src(src);
        cv::cuda::GpuMat d_hist[4];
-        cv::cuda::GpuMat d_buf;

-        TEST_CYCLE() cv::cuda::histEven(d_src, d_hist, d_buf, histSize, lowerLevel, upperLevel);
+        TEST_CYCLE() cv::cuda::histEven(d_src, d_hist, histSize, lowerLevel, upperLevel);

        cv::Mat cpu_hist0, cpu_hist1, cpu_hist2, cpu_hist3;
        d_hist[0].download(cpu_hist0);
@@ -167,9 +165,8 @@ PERF_TEST_P(Sz, EqualizeHist,
    {
        const cv::cuda::GpuMat d_src(src);
        cv::cuda::GpuMat dst;
-        cv::cuda::GpuMat d_buf;

-        TEST_CYCLE() cv::cuda::equalizeHist(d_src, dst, d_buf);
+        TEST_CYCLE() cv::cuda::equalizeHist(d_src, dst);

        CUDA_SANITY_CHECK(dst);
    }
--- a/modules/cudaimgproc/src/canny.cpp
+++ b/modules/cudaimgproc/src/canny.cpp
@@ -53,16 +53,16 @@ Ptr<CannyEdgeDetector> cv::cuda::createCannyEdgeDetector(double, double, int, bo

 namespace canny
 {
-    void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad);
-    void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad);
+    void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad, cudaStream_t stream);
+    void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad, cudaStream_t stream);

-    void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh);
+    void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh, cudaStream_t stream);

-    void edgesHysteresisLocal(PtrStepSzi map, short2* st1);
+    void edgesHysteresisLocal(PtrStepSzi map, short2* st1, cudaStream_t stream);

-    void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2);
+    void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2, cudaStream_t stream);

-    void getEdges(PtrStepSzi map, PtrStepSzb dst);
+    void getEdges(PtrStepSzi map, PtrStepSzb dst, cudaStream_t stream);
 }

 namespace
@@ -76,8 +76,8 @@ namespace
            old_apperture_size_ = -1;
        }

-        void detect(InputArray image, OutputArray edges);
-        void detect(InputArray dx, InputArray dy, OutputArray edges);
+        void detect(InputArray image, OutputArray edges, Stream& stream);
+        void detect(InputArray dx, InputArray dy, OutputArray edges, Stream& stream);

        void setLowThreshold(double low_thresh) { low_thresh_ = low_thresh; }
        double getLowThreshold() const { return low_thresh_; }
@@ -111,7 +111,7 @@ namespace

    private:
        void createBuf(Size image_size);
-        void CannyCaller(GpuMat& edges);
+        void CannyCaller(GpuMat& edges, Stream& stream);

        double low_thresh_;
        double high_thresh_;
@@ -128,7 +128,7 @@ namespace
        int old_apperture_size_;
    };

-    void CannyImpl::detect(InputArray _image, OutputArray _edges)
+    void CannyImpl::detect(InputArray _image, OutputArray _edges, Stream& stream)
    {
        GpuMat image = _image.getGpuMat();

@@ -150,24 +150,24 @@ namespace
            image.locateROI(wholeSize, ofs);
            GpuMat srcWhole(wholeSize, image.type(), image.datastart, image.step);

-            canny::calcMagnitude(srcWhole, ofs.x, ofs.y, dx_, dy_, mag_, L2gradient_);
+            canny::calcMagnitude(srcWhole, ofs.x, ofs.y, dx_, dy_, mag_, L2gradient_, StreamAccessor::getStream(stream));
        }
        else
        {
 #ifndef HAVE_OPENCV_CUDAFILTERS
            throw_no_cuda();
 #else
-            filterDX_->apply(image, dx_);
-            filterDY_->apply(image, dy_);
+            filterDX_->apply(image, dx_, stream);
+            filterDY_->apply(image, dy_, stream);

-            canny::calcMagnitude(dx_, dy_, mag_, L2gradient_);
+            canny::calcMagnitude(dx_, dy_, mag_, L2gradient_, StreamAccessor::getStream(stream));
 #endif
        }

-        CannyCaller(edges);
+        CannyCaller(edges, stream);
    }

-    void CannyImpl::detect(InputArray _dx, InputArray _dy, OutputArray _edges)
+    void CannyImpl::detect(InputArray _dx, InputArray _dy, OutputArray _edges, Stream& stream)
    {
        GpuMat dx = _dx.getGpuMat();
        GpuMat dy = _dy.getGpuMat();
@@ -176,8 +176,8 @@ namespace
        CV_Assert( dy.type() == dx.type() && dy.size() == dx.size() );
        CV_Assert( deviceSupports(SHARED_ATOMICS) );

-        dx.copyTo(dx_);
-        dy.copyTo(dy_);
+        dx.copyTo(dx_, stream);
+        dy.copyTo(dy_, stream);

        if (low_thresh_ > high_thresh_)
            std::swap(low_thresh_, high_thresh_);
@@ -187,9 +187,9 @@ namespace
        _edges.create(dx.size(), CV_8UC1);
        GpuMat edges = _edges.getGpuMat();

-        canny::calcMagnitude(dx_, dy_, mag_, L2gradient_);
+        canny::calcMagnitude(dx_, dy_, mag_, L2gradient_, StreamAccessor::getStream(stream));

-        CannyCaller(edges);
+        CannyCaller(edges, stream);
    }

    void CannyImpl::createBuf(Size image_size)
@@ -215,16 +215,16 @@ namespace
        ensureSizeIsEnough(1, image_size.area(), CV_16SC2, st2_);
    }

-    void CannyImpl::CannyCaller(GpuMat& edges)
+    void CannyImpl::CannyCaller(GpuMat& edges, Stream& stream)
    {
        map_.setTo(Scalar::all(0));
-        canny::calcMap(dx_, dy_, mag_, map_, static_cast<float>(low_thresh_), static_cast<float>(high_thresh_));
+        canny::calcMap(dx_, dy_, mag_, map_, static_cast<float>(low_thresh_), static_cast<float>(high_thresh_), StreamAccessor::getStream(stream));

-        canny::edgesHysteresisLocal(map_, st1_.ptr<short2>());
+        canny::edgesHysteresisLocal(map_, st1_.ptr<short2>(), StreamAccessor::getStream(stream));

-        canny::edgesHysteresisGlobal(map_, st1_.ptr<short2>(), st2_.ptr<short2>());
+        canny::edgesHysteresisGlobal(map_, st1_.ptr<short2>(), st2_.ptr<short2>(), StreamAccessor::getStream(stream));

-        canny::getEdges(map_, edges);
+        canny::getEdges(map_, edges, StreamAccessor::getStream(stream));
    }
 }

--- a/modules/cudaimgproc/src/cuda/canny.cu
+++ b/modules/cudaimgproc/src/cuda/canny.cu
@@ -120,7 +120,7 @@ namespace canny
        mag(y, x) = norm(dxVal, dyVal);
    }

-    void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad)
+    void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad, cudaStream_t stream)
    {
        const dim3 block(16, 16);
        const dim3 grid(divUp(mag.cols, block.x), divUp(mag.rows, block.y));
@@ -131,30 +131,31 @@ namespace canny
        if (L2Grad)
        {
            L2 norm;
-            calcMagnitudeKernel<<<grid, block>>>(src, dx, dy, mag, norm);
+            calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
        }
        else
        {
            L1 norm;
-            calcMagnitudeKernel<<<grid, block>>>(src, dx, dy, mag, norm);
+            calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
        }

        cudaSafeCall( cudaGetLastError() );

-        cudaSafeCall(cudaThreadSynchronize());
+        if (stream == NULL)
+            cudaSafeCall( cudaDeviceSynchronize() );
    }

-    void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad)
+    void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad, cudaStream_t stream)
    {
        if (L2Grad)
        {
            L2 norm;
-            transform(dx, dy, mag, norm, WithOutMask(), 0);
+            transform(dx, dy, mag, norm, WithOutMask(), stream);
        }
        else
        {
            L1 norm;
-            transform(dx, dy, mag, norm, WithOutMask(), 0);
+            transform(dx, dy, mag, norm, WithOutMask(), stream);
        }
    }
 }
@@ -217,17 +218,18 @@ namespace canny
        map(y, x) = edge_type;
    }

-    void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh)
+    void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh, cudaStream_t stream)
    {
        const dim3 block(16, 16);
        const dim3 grid(divUp(dx.cols, block.x), divUp(dx.rows, block.y));

        bindTexture(&tex_mag, mag);

-        calcMapKernel<<<grid, block>>>(dx, dy, map, low_thresh, high_thresh);
+        calcMapKernel<<<grid, block, 0, stream>>>(dx, dy, map, low_thresh, high_thresh);
        cudaSafeCall( cudaGetLastError() );

-        cudaSafeCall( cudaDeviceSynchronize() );
+        if (stream == NULL)
+            cudaSafeCall( cudaDeviceSynchronize() );
    }
 }

@@ -328,20 +330,21 @@ namespace canny
        }
    }

-    void edgesHysteresisLocal(PtrStepSzi map, short2* st1)
+    void edgesHysteresisLocal(PtrStepSzi map, short2* st1, cudaStream_t stream)
    {
        void* counter_ptr;
        cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );

-        cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
+        cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(int), stream) );

        const dim3 block(16, 16);
        const dim3 grid(divUp(map.cols, block.x), divUp(map.rows, block.y));

-        edgesHysteresisLocalKernel<<<grid, block>>>(map, st1);
+        edgesHysteresisLocalKernel<<<grid, block, 0, stream>>>(map, st1);
        cudaSafeCall( cudaGetLastError() );

-        cudaSafeCall( cudaDeviceSynchronize() );
+        if (stream == NULL)
+            cudaSafeCall( cudaDeviceSynchronize() );
    }
 }

@@ -441,27 +444,30 @@ namespace canny
        }
    }

-    void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2)
+    void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2, cudaStream_t stream)
    {
        void* counter_ptr;
        cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, canny::counter) );

        int count;
-        cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
+        cudaSafeCall( cudaMemcpyAsync(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost, stream) );
+        cudaSafeCall( cudaStreamSynchronize(stream) );

        while (count > 0)
        {
-            cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
+            cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(int), stream) );

            const dim3 block(128);
            const dim3 grid(::min(count, 65535u), divUp(count, 65535), 1);

-            edgesHysteresisGlobalKernel<<<grid, block>>>(map, st1, st2, count);
+            edgesHysteresisGlobalKernel<<<grid, block, 0, stream>>>(map, st1, st2, count);
            cudaSafeCall( cudaGetLastError() );

-            cudaSafeCall( cudaDeviceSynchronize() );
+            if (stream == NULL)
+                cudaSafeCall( cudaDeviceSynchronize() );

-            cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
+            cudaSafeCall( cudaMemcpyAsync(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost, stream) );
+            cudaSafeCall( cudaStreamSynchronize(stream) );

            count = min(count, map.cols * map.rows);

@@ -499,9 +505,9 @@ namespace cv { namespace cuda { namespace device

 namespace canny
 {
-    void getEdges(PtrStepSzi map, PtrStepSzb dst)
+    void getEdges(PtrStepSzi map, PtrStepSzb dst, cudaStream_t stream)
    {
-        transform(map, dst, GetEdges(), WithOutMask(), 0);
+        transform(map, dst, GetEdges(), WithOutMask(), stream);
    }
 }

--- a/modules/cudaimgproc/src/gftt.cpp
+++ b/modules/cudaimgproc/src/gftt.cpp
@@ -68,7 +68,7 @@ namespace
        GoodFeaturesToTrackDetector(int srcType, int maxCorners, double qualityLevel, double minDistance,
                                    int blockSize, bool useHarrisDetector, double harrisK);

-        void detect(InputArray image, OutputArray corners, InputArray mask = noArray());
+        void detect(InputArray image, OutputArray corners, InputArray mask, Stream& stream);

    private:
        int maxCorners_;
@@ -96,8 +96,11 @@ namespace
                    cuda::createMinEigenValCorner(srcType, blockSize, 3);
    }

-    void GoodFeaturesToTrackDetector::detect(InputArray _image, OutputArray _corners, InputArray _mask)
+    void GoodFeaturesToTrackDetector::detect(InputArray _image, OutputArray _corners, InputArray _mask, Stream& stream)
    {
+        // TODO : implement async version
+        (void) stream;
+
        using namespace cv::cuda::device::gfft;

        GpuMat image = _image.getGpuMat();
--- a/modules/cudaimgproc/src/histogram.cpp
+++ b/modules/cudaimgproc/src/histogram.cpp
@@ -49,11 +49,11 @@ using namespace cv::cuda;

 void cv::cuda::calcHist(InputArray, OutputArray, Stream&) { throw_no_cuda(); }

-void cv::cuda::equalizeHist(InputArray, OutputArray, InputOutputArray, Stream&) { throw_no_cuda(); }
+void cv::cuda::equalizeHist(InputArray, OutputArray, Stream&) { throw_no_cuda(); }

 cv::Ptr<cv::cuda::CLAHE> cv::cuda::createCLAHE(double, cv::Size) { throw_no_cuda(); return cv::Ptr<cv::cuda::CLAHE>(); }

-void cv::cuda::evenLevels(OutputArray, int, int, int) { throw_no_cuda(); }
+void cv::cuda::evenLevels(OutputArray, int, int, int, Stream&) { throw_no_cuda(); }

 void cv::cuda::histEven(InputArray, OutputArray, InputOutputArray, int, int, int, Stream&) { throw_no_cuda(); }
 void cv::cuda::histEven(InputArray, GpuMat*, InputOutputArray, int*, int*, int*, Stream&) { throw_no_cuda(); }
@@ -93,7 +93,7 @@ namespace hist
    void equalizeHist(PtrStepSzb src, PtrStepSzb dst, const int* lut, cudaStream_t stream);
 }

-void cv::cuda::equalizeHist(InputArray _src, OutputArray _dst, InputOutputArray _buf, Stream& _stream)
+void cv::cuda::equalizeHist(InputArray _src, OutputArray _dst, Stream& _stream)
 {
    GpuMat src = _src.getGpuMat();

@@ -107,8 +107,8 @@ void cv::cuda::equalizeHist(InputArray _src, OutputArray _dst, InputOutputArray

    size_t bufSize = intBufSize + 2 * 256 * sizeof(int);

-    ensureSizeIsEnough(1, static_cast<int>(bufSize), CV_8UC1, _buf);
-    GpuMat buf = _buf.getGpuMat();
+    BufferPool pool(_stream);
+    GpuMat buf = pool.getBuffer(1, static_cast<int>(bufSize), CV_8UC1);

    GpuMat hist(1, 256, CV_32SC1, buf.data);
    GpuMat lut(1, 256, CV_32SC1, buf.data + 256 * sizeof(int));
@@ -288,7 +288,7 @@ namespace
    {
        typedef typename NppHistogramEvenFuncC1<SDEPTH>::src_t src_t;

-        static void hist(const GpuMat& src, OutputArray _hist, InputOutputArray _buf, int histSize, int lowerLevel, int upperLevel, cudaStream_t stream)
+        static void hist(const GpuMat& src, OutputArray _hist, int histSize, int lowerLevel, int upperLevel, Stream& stream)
        {
            const int levels = histSize + 1;

@@ -302,15 +302,15 @@ namespace
            int buf_size;
            get_buf_size(sz, levels, &buf_size);

-            ensureSizeIsEnough(1, buf_size, CV_8UC1, _buf);
-            GpuMat buf = _buf.getGpuMat();
+            BufferPool pool(stream);
+            GpuMat buf = pool.getBuffer(1, buf_size, CV_8UC1);

            NppStreamHandler h(stream);

            nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), sz, hist.ptr<Npp32s>(), levels,
                lowerLevel, upperLevel, buf.ptr<Npp8u>()) );

-            if (stream == 0)
+            if (!stream)
                cudaSafeCall( cudaDeviceSynchronize() );
        }
    };
@@ -319,7 +319,7 @@ namespace
    {
        typedef typename NppHistogramEvenFuncC4<SDEPTH>::src_t src_t;

-        static void hist(const GpuMat& src, GpuMat hist[4],InputOutputArray _buf, int histSize[4], int lowerLevel[4], int upperLevel[4], cudaStream_t stream)
+        static void hist(const GpuMat& src, GpuMat hist[4], int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream)
        {
            int levels[] = {histSize[0] + 1, histSize[1] + 1, histSize[2] + 1, histSize[3] + 1};
            hist[0].create(1, histSize[0], CV_32S);
@@ -336,14 +336,14 @@ namespace
            int buf_size;
            get_buf_size(sz, levels, &buf_size);

-            ensureSizeIsEnough(1, buf_size, CV_8U, _buf);
-            GpuMat buf = _buf.getGpuMat();
+            BufferPool pool(stream);
+            GpuMat buf = pool.getBuffer(1, buf_size, CV_8UC1);

            NppStreamHandler h(stream);

            nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), sz, pHist, levels, lowerLevel, upperLevel, buf.ptr<Npp8u>()) );

-            if (stream == 0)
+            if (!stream)
                cudaSafeCall( cudaDeviceSynchronize() );
        }
    };
@@ -392,7 +392,7 @@ namespace
        typedef typename NppHistogramRangeFuncC1<SDEPTH>::level_t level_t;
        enum {LEVEL_TYPE_CODE=NppHistogramRangeFuncC1<SDEPTH>::LEVEL_TYPE_CODE};

-        static void hist(const GpuMat& src, OutputArray _hist, const GpuMat& levels, InputOutputArray _buf, cudaStream_t stream)
+        static void hist(const GpuMat& src, OutputArray _hist, const GpuMat& levels, Stream& stream)
        {
            CV_Assert( levels.type() == LEVEL_TYPE_CODE && levels.rows == 1 );

@@ -406,8 +406,8 @@ namespace
            int buf_size;
            get_buf_size(sz, levels.cols, &buf_size);

-            ensureSizeIsEnough(1, buf_size, CV_8U, _buf);
-            GpuMat buf = _buf.getGpuMat();
+            BufferPool pool(stream);
+            GpuMat buf = pool.getBuffer(1, buf_size, CV_8UC1);

            NppStreamHandler h(stream);

@@ -424,7 +424,7 @@ namespace
        typedef typename NppHistogramRangeFuncC1<SDEPTH>::level_t level_t;
        enum {LEVEL_TYPE_CODE=NppHistogramRangeFuncC1<SDEPTH>::LEVEL_TYPE_CODE};

-        static void hist(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4],InputOutputArray _buf, cudaStream_t stream)
+        static void hist(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4], Stream& stream)
        {
            CV_Assert( levels[0].type() == LEVEL_TYPE_CODE && levels[0].rows == 1 );
            CV_Assert( levels[1].type() == LEVEL_TYPE_CODE && levels[1].rows == 1 );
@@ -447,8 +447,8 @@ namespace
            int buf_size;
            get_buf_size(sz, nLevels, &buf_size);

-            ensureSizeIsEnough(1, buf_size, CV_8U, _buf);
-            GpuMat buf = _buf.getGpuMat();
+            BufferPool pool(stream);
+            GpuMat buf = pool.getBuffer(1, buf_size, CV_8UC1);

            NppStreamHandler h(stream);

@@ -460,7 +460,7 @@ namespace
    };
 }

-void cv::cuda::evenLevels(OutputArray _levels, int nLevels, int lowerLevel, int upperLevel)
+void cv::cuda::evenLevels(OutputArray _levels, int nLevels, int lowerLevel, int upperLevel, Stream& stream)
 {
    const int kind = _levels.kind();

@@ -475,7 +475,7 @@ void cv::cuda::evenLevels(OutputArray _levels, int nLevels, int lowerLevel, int
    nppSafeCall( nppiEvenLevelsHost_32s(host_levels.ptr<Npp32s>(), nLevels, lowerLevel, upperLevel) );

    if (kind == _InputArray::CUDA_GPU_MAT)
-        _levels.getGpuMatRef().upload(host_levels);
+        _levels.getGpuMatRef().upload(host_levels, stream);
 }

 namespace hist
@@ -493,9 +493,9 @@ namespace
    }
 }

-void cv::cuda::histEven(InputArray _src, OutputArray hist, InputOutputArray buf, int histSize, int lowerLevel, int upperLevel, Stream& stream)
+void cv::cuda::histEven(InputArray _src, OutputArray hist, int histSize, int lowerLevel, int upperLevel, Stream& stream)
 {
-    typedef void (*hist_t)(const GpuMat& src, OutputArray hist, InputOutputArray buf, int levels, int lowerLevel, int upperLevel, cudaStream_t stream);
+    typedef void (*hist_t)(const GpuMat& src, OutputArray hist, int levels, int lowerLevel, int upperLevel, Stream& stream);
    static const hist_t hist_callers[] =
    {
        NppHistogramEvenC1<CV_8U , nppiHistogramEven_8u_C1R , nppiHistogramEvenGetBufferSize_8u_C1R >::hist,
@@ -514,12 +514,12 @@ void cv::cuda::histEven(InputArray _src, OutputArray hist, InputOutputArray buf,

    CV_Assert( src.type() == CV_8UC1 || src.type() == CV_16UC1 || src.type() == CV_16SC1 );

-    hist_callers[src.depth()](src, hist, buf, histSize, lowerLevel, upperLevel, StreamAccessor::getStream(stream));
+    hist_callers[src.depth()](src, hist, histSize, lowerLevel, upperLevel, stream);
 }

-void cv::cuda::histEven(InputArray _src, GpuMat hist[4], InputOutputArray buf, int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream)
+void cv::cuda::histEven(InputArray _src, GpuMat hist[4], int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream)
 {
-    typedef void (*hist_t)(const GpuMat& src, GpuMat hist[4], InputOutputArray buf, int levels[4], int lowerLevel[4], int upperLevel[4], cudaStream_t stream);
+    typedef void (*hist_t)(const GpuMat& src, GpuMat hist[4], int levels[4], int lowerLevel[4], int upperLevel[4], Stream& stream);
    static const hist_t hist_callers[] =
    {
        NppHistogramEvenC4<CV_8U , nppiHistogramEven_8u_C4R , nppiHistogramEvenGetBufferSize_8u_C4R >::hist,
@@ -532,12 +532,12 @@ void cv::cuda::histEven(InputArray _src, GpuMat hist[4], InputOutputArray buf, i

    CV_Assert( src.type() == CV_8UC4 || src.type() == CV_16UC4 || src.type() == CV_16SC4 );

-    hist_callers[src.depth()](src, hist, buf, histSize, lowerLevel, upperLevel, StreamAccessor::getStream(stream));
+    hist_callers[src.depth()](src, hist, histSize, lowerLevel, upperLevel, stream);
 }

-void cv::cuda::histRange(InputArray _src, OutputArray hist, InputArray _levels, InputOutputArray buf, Stream& stream)
+void cv::cuda::histRange(InputArray _src, OutputArray hist, InputArray _levels, Stream& stream)
 {
-    typedef void (*hist_t)(const GpuMat& src, OutputArray hist, const GpuMat& levels, InputOutputArray buf, cudaStream_t stream);
+    typedef void (*hist_t)(const GpuMat& src, OutputArray hist, const GpuMat& levels, Stream& stream);
    static const hist_t hist_callers[] =
    {
        NppHistogramRangeC1<CV_8U , nppiHistogramRange_8u_C1R , nppiHistogramRangeGetBufferSize_8u_C1R >::hist,
@@ -553,12 +553,12 @@ void cv::cuda::histRange(InputArray _src, OutputArray hist, InputArray _levels,

    CV_Assert( src.type() == CV_8UC1 || src.type() == CV_16UC1 || src.type() == CV_16SC1 || src.type() == CV_32FC1 );

-    hist_callers[src.depth()](src, hist, levels, buf, StreamAccessor::getStream(stream));
+    hist_callers[src.depth()](src, hist, levels, stream);
 }

-void cv::cuda::histRange(InputArray _src, GpuMat hist[4], const GpuMat levels[4], InputOutputArray buf, Stream& stream)
+void cv::cuda::histRange(InputArray _src, GpuMat hist[4], const GpuMat levels[4], Stream& stream)
 {
-    typedef void (*hist_t)(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4], InputOutputArray buf, cudaStream_t stream);
+    typedef void (*hist_t)(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4], Stream& stream);
    static const hist_t hist_callers[] =
    {
        NppHistogramRangeC4<CV_8U , nppiHistogramRange_8u_C4R , nppiHistogramRangeGetBufferSize_8u_C4R >::hist,
@@ -573,7 +573,7 @@ void cv::cuda::histRange(InputArray _src, GpuMat hist[4], const GpuMat levels[4]

    CV_Assert( src.type() == CV_8UC4 || src.type() == CV_16UC4 || src.type() == CV_16SC4 || src.type() == CV_32FC4 );

-    hist_callers[src.depth()](src, hist, levels, buf, StreamAccessor::getStream(stream));
+    hist_callers[src.depth()](src, hist, levels, stream);
 }

 #endif /* !defined (HAVE_CUDA) */
--- a/modules/cudaimgproc/src/hough_circles.cpp
+++ b/modules/cudaimgproc/src/hough_circles.cpp
@@ -74,7 +74,7 @@ namespace
    public:
        HoughCirclesDetectorImpl(float dp, float minDist, int cannyThreshold, int votesThreshold, int minRadius, int maxRadius, int maxCircles);

-        void detect(InputArray src, OutputArray circles);
+        void detect(InputArray src, OutputArray circles, Stream& stream);

        void setDp(float dp) { dp_ = dp; }
        float getDp() const { return dp_; }
@@ -154,8 +154,11 @@ namespace
        filterDy_ = cuda::createSobelFilter(CV_8UC1, CV_32S, 0, 1);
    }

-    void HoughCirclesDetectorImpl::detect(InputArray _src, OutputArray circles)
+    void HoughCirclesDetectorImpl::detect(InputArray _src, OutputArray circles, Stream& stream)
    {
+        // TODO : implement async version
+        (void) stream;
+
        using namespace cv::cuda::device::hough;
        using namespace cv::cuda::device::hough_circles;

--- a/modules/cudaimgproc/src/hough_lines.cpp
+++ b/modules/cudaimgproc/src/hough_lines.cpp
@@ -75,8 +75,8 @@ namespace
        {
        }

-        void detect(InputArray src, OutputArray lines);
-        void downloadResults(InputArray d_lines, OutputArray h_lines, OutputArray h_votes = noArray());
+        void detect(InputArray src, OutputArray lines, Stream& stream);
+        void downloadResults(InputArray d_lines, OutputArray h_lines, OutputArray h_votes, Stream& stream);

        void setRho(float rho) { rho_ = rho; }
        float getRho() const { return rho_; }
@@ -125,8 +125,11 @@ namespace
        GpuMat result_;
    };

-    void HoughLinesDetectorImpl::detect(InputArray _src, OutputArray lines)
+    void HoughLinesDetectorImpl::detect(InputArray _src, OutputArray lines, Stream& stream)
    {
+        // TODO : implement async version
+        (void) stream;
+
        using namespace cv::cuda::device::hough;
        using namespace cv::cuda::device::hough_lines;

@@ -170,7 +173,7 @@ namespace
        result_.copyTo(lines);
    }

-    void HoughLinesDetectorImpl::downloadResults(InputArray _d_lines, OutputArray h_lines, OutputArray h_votes)
+    void HoughLinesDetectorImpl::downloadResults(InputArray _d_lines, OutputArray h_lines, OutputArray h_votes, Stream& stream)
    {
        GpuMat d_lines = _d_lines.getGpuMat();

@@ -184,12 +187,18 @@ namespace

        CV_Assert( d_lines.rows == 2 && d_lines.type() == CV_32FC2 );

-        d_lines.row(0).download(h_lines);
+        if (stream)
+            d_lines.row(0).download(h_lines, stream);
+        else
+            d_lines.row(0).download(h_lines);

        if (h_votes.needed())
        {
            GpuMat d_votes(1, d_lines.cols, CV_32SC1, d_lines.ptr<int>(1));
-            d_votes.download(h_votes);
+            if (stream)
+                d_votes.download(h_votes, stream);
+            else
+                d_votes.download(h_votes);
        }
    }
 }
--- a/modules/cudaimgproc/src/hough_segments.cpp
+++ b/modules/cudaimgproc/src/hough_segments.cpp
@@ -79,7 +79,7 @@ namespace
        {
        }

-        void detect(InputArray src, OutputArray lines);
+        void detect(InputArray src, OutputArray lines, Stream& stream);

        void setRho(float rho) { rho_ = rho; }
        float getRho() const { return rho_; }
@@ -128,8 +128,11 @@ namespace
        GpuMat result_;
    };

-    void HoughSegmentDetectorImpl::detect(InputArray _src, OutputArray lines)
+    void HoughSegmentDetectorImpl::detect(InputArray _src, OutputArray lines, Stream& stream)
    {
+        // TODO : implement async version
+        (void) stream;
+
        using namespace cv::cuda::device::hough;
        using namespace cv::cuda::device::hough_lines;
        using namespace cv::cuda::device::hough_segments;
--- a/modules/cudaimgproc/src/mssegmentation.cpp
+++ b/modules/cudaimgproc/src/mssegmentation.cpp
@@ -43,7 +43,7 @@

 #if !defined HAVE_CUDA || defined(CUDA_DISABLER)

-void cv::cuda::meanShiftSegmentation(InputArray, OutputArray, int, int, int, TermCriteria) { throw_no_cuda(); }
+void cv::cuda::meanShiftSegmentation(InputArray, OutputArray, int, int, int, TermCriteria, Stream&) { throw_no_cuda(); }

 #else

@@ -222,7 +222,7 @@ inline int dist2(const cv::Vec2s& lhs, const cv::Vec2s& rhs)
 } // anonymous namespace


-void cv::cuda::meanShiftSegmentation(InputArray _src, OutputArray _dst, int sp, int sr, int minsize, TermCriteria criteria)
+void cv::cuda::meanShiftSegmentation(InputArray _src, OutputArray _dst, int sp, int sr, int minsize, TermCriteria criteria, Stream& stream)
 {
    GpuMat src = _src.getGpuMat();

@@ -235,7 +235,10 @@ void cv::cuda::meanShiftSegmentation(InputArray _src, OutputArray _dst, int sp,

    // Perform mean shift procedure and obtain region and spatial maps
    GpuMat d_rmap, d_spmap;
-    cuda::meanShiftProc(src, d_rmap, d_spmap, sp, sr, criteria);
+    cuda::meanShiftProc(src, d_rmap, d_spmap, sp, sr, criteria, stream);
+
+    stream.waitForCompletion();
+
    Mat rmap(d_rmap);
    Mat spmap(d_spmap);

--- a/samples/gpu/performance/tests.cpp
+++ b/samples/gpu/performance/tests.cpp
@@ -1053,12 +1053,11 @@ TEST(equalizeHist)

        cuda::GpuMat d_src(src);
        cuda::GpuMat d_dst;
-        cuda::GpuMat d_buf;

-        cuda::equalizeHist(d_src, d_dst, d_buf);
+        cuda::equalizeHist(d_src, d_dst);

        CUDA_ON;
-        cuda::equalizeHist(d_src, d_dst, d_buf);
+        cuda::equalizeHist(d_src, d_dst);
        CUDA_OFF;
    }
 }