From d1fc3e6b5aacdf87435c478938b90c44e323464c Mon Sep 17 00:00:00 2001
From: Anatoly Baksheev <no@email>
Date: Tue, 10 Aug 2010 09:44:50 +0000
Subject: [PATCH] cv::gpu::CudaStream -> cv::gpu::Stream some refactoring added
 gpu module to compilation

---
 OpenCVConfig.cmake.in                         |  2 +-
 modules/CMakeLists.txt                        |  2 +-
 modules/gpu/include/opencv2/gpu/gpu.hpp       | 36 +++++-----
 .../include/opencv2/gpu/stream_accessor.hpp   |  2 +-
 modules/gpu/src/beliefpropagation_gpu.cpp     |  4 +-
 modules/gpu/src/cuda/imgproc.cu               | 67 ++++++++++--------
 modules/gpu/src/cudastream.cpp                | 70 +++++++++----------
 modules/gpu/src/matrix_operations.cpp         |  4 +-
 modules/gpu/src/stereobm_gpu.cpp              |  4 +-
 tests/gpu/src/operator_async_call.cpp         |  2 +-
 10 files changed, 101 insertions(+), 92 deletions(-)

diff --git a/OpenCVConfig.cmake.in b/OpenCVConfig.cmake.in
index 812711fd9..e6ed61bf3 100644
--- a/OpenCVConfig.cmake.in
+++ b/OpenCVConfig.cmake.in
@@ -43,7 +43,7 @@ SET(OpenCV_LIB_DIR "@CMAKE_LIB_DIRS_CONFIGCMAKE@")
 # ====================================================================
 # Link libraries: e.g.   opencv_core220.so, opencv_imgproc220d.lib, etc...
 # ====================================================================
-set(OPENCV_LIB_COMPONENTS opencv_core opencv_imgproc opencv_features2d opencv_calib3d opencv_objdetect opencv_video opencv_highgui opencv_ml opencv_legacy opencv_contrib)
+set(OPENCV_LIB_COMPONENTS opencv_core opencv_imgproc opencv_features2d opencv_gpu opencv_calib3d opencv_objdetect opencv_video opencv_highgui opencv_ml opencv_legacy opencv_contrib)
 SET(OpenCV_LIBS "")
 foreach(__CVLIB ${OPENCV_LIB_COMPONENTS})
 	# CMake>=2.6 supports the notation "debug XXd optimized XX"
diff --git a/modules/CMakeLists.txt b/modules/CMakeLists.txt
index 0a6df840d..a9e3eb932 100644
--- a/modules/CMakeLists.txt
+++ b/modules/CMakeLists.txt
@@ -24,4 +24,4 @@ add_subdirectory(haartraining)
 add_subdirectory(traincascade)
 
 
-#add_subdirectory(gpu)
+add_subdirectory(gpu)
diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index 41daf0766..f389f8fd5 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -67,7 +67,7 @@ namespace cv
         CV_EXPORTS void getGpuMemInfo(size_t *free, size_t* total);
 
         //////////////////////////////// GpuMat ////////////////////////////////
-        class CudaStream;
+        class Stream;
         class MatPL;
 
         //! Smart pointer for GPU memory with reference counting. Its interface is mostly similar with cv::Mat.
@@ -111,12 +111,12 @@ namespace cv
 
             //! pefroms blocking upload data to GpuMat. .
             void upload(const cv::Mat& m);
-            void upload(const MatPL& m, CudaStream& stream);
+            void upload(const MatPL& m, Stream& stream);
 
             //! Downloads data from device to host memory. Blocking calls.
             operator Mat() const;
             void download(cv::Mat& m) const;
-            void download(MatPL& m, CudaStream& stream) const;
+            void download(MatPL& m, Stream& stream) const;
 
             //! returns a new GpuMatrix header for the specified row
             GpuMat row(int y) const;
@@ -291,14 +291,14 @@ namespace cv
         // Passed to each function that supports async kernel execution.
         // Reference counting is enabled
 
-        class CV_EXPORTS CudaStream
+        class CV_EXPORTS Stream
         {
         public:
-            CudaStream();
-            ~CudaStream();
+            Stream();
+            ~Stream();
 
-            CudaStream(const CudaStream&);
-            CudaStream& operator=(const CudaStream&);
+            Stream(const Stream&);
+            Stream& operator=(const Stream&);
 
             bool queryIfComplete();
             void waitForCompletion();
@@ -355,7 +355,7 @@ namespace cv
             void operator() ( const GpuMat& left, const GpuMat& right, GpuMat& disparity);
 
             //! Acync version
-            void operator() ( const GpuMat& left, const GpuMat& right, GpuMat& disparity, const CudaStream & stream);
+            void operator() ( const GpuMat& left, const GpuMat& right, GpuMat& disparity, const Stream & stream);
 
             //! Some heuristics that tries to estmate
             // if current GPU will be faster then CPU in this algorithm.
@@ -390,18 +390,18 @@ namespace cv
             enum { DEFAULT_LEVELS = 5  };
 
             //! the default constructor
-            explicit StereoBeliefPropagation_GPU(int ndisp_  = DEFAULT_NDISP, 
-                                                 int iters_  = DEFAULT_ITERS, 
-                                                 int levels_ = DEFAULT_LEVELS,
-                                                 int msg_type_ = MSG_TYPE_AUTO,
+            explicit StereoBeliefPropagation_GPU(int ndisp  = DEFAULT_NDISP, 
+                                                 int iters  = DEFAULT_ITERS, 
+                                                 int levels = DEFAULT_LEVELS,
+                                                 int msg_type = MSG_TYPE_AUTO,
                                                  float msg_scale = 1.0f);
             //! the full constructor taking the number of disparities, number of BP iterations on each level,
             //! number of levels, truncation of data cost, data weight, 
             //! truncation of discontinuity cost and discontinuity single jump
-            StereoBeliefPropagation_GPU(int ndisp_, int iters_, int levels_, 
-                                        float max_data_term_, float data_weight_,
-                                        float max_disc_term_, float disc_single_jump_,
-                                        int msg_type_ = MSG_TYPE_AUTO,
+            StereoBeliefPropagation_GPU(int ndisp, int iters, int levels, 
+                                        float max_data_term, float data_weight,
+                                        float max_disc_term, float disc_single_jump,
+                                        int msg_type = MSG_TYPE_AUTO,
                                         float msg_scale = 1.0f);
 
             //! the stereo correspondence operator. Finds the disparity for the specified rectified stereo pair,
@@ -409,7 +409,7 @@ namespace cv
             void operator()(const GpuMat& left, const GpuMat& right, GpuMat& disparity);
             
             //! Acync version
-            void operator()(const GpuMat& left, const GpuMat& right, GpuMat& disparity, const CudaStream& stream);
+            void operator()(const GpuMat& left, const GpuMat& right, GpuMat& disparity, const Stream& stream);
 
             //! Some heuristics that tries to estmate
             //! if current GPU will be faster then CPU in this algorithm.
diff --git a/modules/gpu/include/opencv2/gpu/stream_accessor.hpp b/modules/gpu/include/opencv2/gpu/stream_accessor.hpp
index 389b7cd55..cd92ca0f7 100644
--- a/modules/gpu/include/opencv2/gpu/stream_accessor.hpp
+++ b/modules/gpu/include/opencv2/gpu/stream_accessor.hpp
@@ -56,7 +56,7 @@ namespace cv
         // In this case you have to install Cuda Toolkit.
         struct StreamAccessor
         {
-            CV_EXPORTS static cudaStream_t getStream(const CudaStream& stream);
+            CV_EXPORTS static cudaStream_t getStream(const Stream& stream);
         };
     }
 }
diff --git a/modules/gpu/src/beliefpropagation_gpu.cpp b/modules/gpu/src/beliefpropagation_gpu.cpp
index 3ebf246c3..6d1a20250 100644
--- a/modules/gpu/src/beliefpropagation_gpu.cpp
+++ b/modules/gpu/src/beliefpropagation_gpu.cpp
@@ -52,7 +52,7 @@ cv::gpu::StereoBeliefPropagation_GPU::StereoBeliefPropagation_GPU(int, int, int,
 cv::gpu::StereoBeliefPropagation_GPU::StereoBeliefPropagation_GPU(int, int, int, float, float, float, float, int, float) { throw_nogpu(); }
 
 void cv::gpu::StereoBeliefPropagation_GPU::operator()(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); }
-void cv::gpu::StereoBeliefPropagation_GPU::operator()(const GpuMat&, const GpuMat&, GpuMat&, const CudaStream&) { throw_nogpu(); }
+void cv::gpu::StereoBeliefPropagation_GPU::operator()(const GpuMat&, const GpuMat&, GpuMat&, const Stream&) { throw_nogpu(); }
 
 bool cv::gpu::StereoBeliefPropagation_GPU::checkIfGpuCallReasonable() { throw_nogpu(); return false; }
 
@@ -282,7 +282,7 @@ void cv::gpu::StereoBeliefPropagation_GPU::operator()(const GpuMat& left, const
     ::stereo_bp_gpu_operator(ndisp, iters, levels, max_data_term, data_weight, max_disc_term, disc_single_jump, msg_type, msg_scale, u, d, l, r, u2, d2, l2, r2, datas, out, left, right, disp, 0);
 }
 
-void cv::gpu::StereoBeliefPropagation_GPU::operator()(const GpuMat& left, const GpuMat& right, GpuMat& disp, const CudaStream& stream)
+void cv::gpu::StereoBeliefPropagation_GPU::operator()(const GpuMat& left, const GpuMat& right, GpuMat& disp, const Stream& stream)
 {
     ::stereo_bp_gpu_operator(ndisp, iters, levels, max_data_term, data_weight, max_disc_term, disc_single_jump, msg_type, msg_scale, u, d, l, r, u2, d2, l2, r2, datas, out, left, right, disp, StreamAccessor::getStream(stream));
 }
diff --git a/modules/gpu/src/cuda/imgproc.cu b/modules/gpu/src/cuda/imgproc.cu
index b475a22b6..a3ec30208 100644
--- a/modules/gpu/src/cuda/imgproc.cu
+++ b/modules/gpu/src/cuda/imgproc.cu
@@ -44,9 +44,11 @@
 
 using namespace cv::gpu;
 
+
+/////////////////////////////////// Remap ///////////////////////////////////////////////
 namespace imgproc
 {
-    texture<unsigned char, 2, cudaReadModeNormalizedFloat> tex1;
+    texture<unsigned char, 2, cudaReadModeNormalizedFloat> tex_remap;
 
     __global__ void kernel_remap(const float *mapx, const float *mapy, size_t map_step, unsigned char* out, size_t out_step, int width, int height)
     {    
@@ -58,12 +60,40 @@ namespace imgproc
 
             float xcoo = mapx[idx];
             float ycoo = mapy[idx];
-            
-            out[y * out_step + x] = (unsigned char)(255.f * tex2D(tex1, xcoo, ycoo));            
+
+            out[y * out_step + x] = (unsigned char)(255.f * tex2D(tex_remap, xcoo, ycoo));            
         }
     }
 
-    texture< uchar4, 2, cudaReadModeElementType > tex_meanshift;
+}
+
+namespace cv { namespace gpu { namespace impl 
+{
+    extern "C" void remap_gpu(const DevMem2D& src, const DevMem2D_<float>& xmap, const DevMem2D_<float>& ymap, DevMem2D dst)
+    {
+        dim3 block(16, 16, 1);
+        dim3 grid(1, 1, 1);
+        grid.x = divUp(dst.cols, block.x);
+        grid.y = divUp(dst.rows, block.y);
+
+        imgproc::tex_remap.filterMode = cudaFilterModeLinear;	    
+        imgproc::tex_remap.addressMode[0] = imgproc::tex_remap.addressMode[1] = cudaAddressModeWrap;
+        cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
+        cudaSafeCall( cudaBindTexture2D(0, imgproc::tex_remap, src.ptr, desc, dst.cols, dst.rows, src.step) );
+
+        imgproc::kernel_remap<<<grid, block>>>(xmap.ptr, ymap.ptr, xmap.step, dst.ptr, dst.step, dst.cols, dst.rows);
+
+        cudaSafeCall( cudaThreadSynchronize() );  
+        cudaSafeCall( cudaUnbindTexture(imgproc::tex_remap) );
+    }
+}}}
+
+
+/////////////////////////////////// MeanShiftfiltering ///////////////////////////////////////////////
+
+namespace imgproc
+{
+    texture<uchar4, 2> tex_meanshift;
 
     extern "C" __global__ void meanshift_kernel( unsigned char* out, int out_step, int cols, int rows, int sp, int sr, int maxIter, float eps )
     {
@@ -72,9 +102,8 @@ namespace imgproc
 
         if( x0 < cols && y0 < rows )
         {
-
             int isr2 = sr*sr;
-            uchar4 c = tex2D( tex_meanshift, x0, y0 );
+            uchar4 c = tex2D(tex_meanshift, x0, y0 );
             // iterate meanshift procedure
             for( int iter = 0; iter < maxIter; iter++ )
             {
@@ -137,26 +166,6 @@ namespace imgproc
 
 namespace cv { namespace gpu { namespace impl 
 {
-    using namespace imgproc;
-
-    extern "C" void remap_gpu(const DevMem2D& src, const DevMem2D_<float>& xmap, const DevMem2D_<float>& ymap, DevMem2D dst)
-    {
-        dim3 block(16, 16, 1);
-        dim3 grid(1, 1, 1);
-        grid.x = divUp(dst.cols, block.x);
-        grid.y = divUp(dst.rows, block.y);
-
-        tex1.filterMode = cudaFilterModeLinear;	    
-        tex1.addressMode[0] = tex1.addressMode[1] = cudaAddressModeWrap;
-        cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
-        cudaSafeCall( cudaBindTexture2D(0, tex1, src.ptr, desc, dst.cols, dst.rows, src.step) );
-
-        kernel_remap<<<grid, block>>>(xmap.ptr, ymap.ptr, xmap.step, dst.ptr, dst.step, dst.cols, dst.rows);
-
-        cudaSafeCall( cudaThreadSynchronize() );  
-        cudaSafeCall( cudaUnbindTexture(tex1) );
-    }
-
     extern "C" void meanShiftFiltering_gpu(const DevMem2D& src, DevMem2D dst, float sp, float sr, int maxIter, float eps)
     {                        
         dim3  grid(1, 1, 1);
@@ -165,11 +174,11 @@ namespace cv { namespace gpu { namespace impl
         grid.y = divUp(src.rows, threads.y);
 
         cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();
-        cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.ptr, desc, src.cols, src.rows, src.step ) );
+        cudaSafeCall( cudaBindTexture2D( 0, imgproc::tex_meanshift, src.ptr, desc, src.cols, src.rows, src.step ) );
 
-        meanshift_kernel<<< grid, threads >>>( dst.ptr, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps );
+        imgproc::meanshift_kernel<<< grid, threads >>>( dst.ptr, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps );
         cudaSafeCall( cudaThreadSynchronize() );
-        cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );        
+        cudaSafeCall( cudaUnbindTexture( imgproc::tex_meanshift ) );        
     }
 }}}
 
diff --git a/modules/gpu/src/cudastream.cpp b/modules/gpu/src/cudastream.cpp
index 560319764..0f9647324 100644
--- a/modules/gpu/src/cudastream.cpp
+++ b/modules/gpu/src/cudastream.cpp
@@ -48,28 +48,28 @@ using namespace cv::gpu;
 
 #if !defined (HAVE_CUDA)
 
-void cv::gpu::CudaStream::create() { throw_nogpu(); }
-void cv::gpu::CudaStream::release() { throw_nogpu(); }
-cv::gpu::CudaStream::CudaStream() : impl(0) { throw_nogpu(); }
-cv::gpu::CudaStream::~CudaStream() { throw_nogpu(); }
-cv::gpu::CudaStream::CudaStream(const CudaStream& /*stream*/) { throw_nogpu(); }
-CudaStream& cv::gpu::CudaStream::operator=(const CudaStream& /*stream*/) { throw_nogpu(); return *this; }
-bool cv::gpu::CudaStream::queryIfComplete() { throw_nogpu(); return true; }
-void cv::gpu::CudaStream::waitForCompletion() { throw_nogpu(); }
-void cv::gpu::CudaStream::enqueueDownload(const GpuMat& /*src*/, Mat& /*dst*/) { throw_nogpu(); }
-void cv::gpu::CudaStream::enqueueDownload(const GpuMat& /*src*/, MatPL& /*dst*/) { throw_nogpu(); }
-void cv::gpu::CudaStream::enqueueUpload(const MatPL& /*src*/, GpuMat& /*dst*/) { throw_nogpu(); }
-void cv::gpu::CudaStream::enqueueUpload(const Mat& /*src*/, GpuMat& /*dst*/) { throw_nogpu(); }
-void cv::gpu::CudaStream::enqueueCopy(const GpuMat& /*src*/, GpuMat& /*dst*/) { throw_nogpu(); }
-void cv::gpu::CudaStream::enqueueMemSet(const GpuMat& /*src*/, Scalar /*val*/) { throw_nogpu(); }
-void cv::gpu::CudaStream::enqueueMemSet(const GpuMat& /*src*/, Scalar /*val*/, const GpuMat& /*mask*/) { throw_nogpu(); }
-void cv::gpu::CudaStream::enqueueConvert(const GpuMat& /*src*/, GpuMat& /*dst*/, int /*type*/, double /*a*/, double /*b*/) { throw_nogpu(); }
+void cv::gpu::Stream::create() { throw_nogpu(); }
+void cv::gpu::Stream::release() { throw_nogpu(); }
+cv::gpu::Stream::Stream() : impl(0) { throw_nogpu(); }
+cv::gpu::Stream::~Stream() { throw_nogpu(); }
+cv::gpu::Stream::Stream(const Stream& /*stream*/) { throw_nogpu(); }
+Stream& cv::gpu::Stream::operator=(const Stream& /*stream*/) { throw_nogpu(); return *this; }
+bool cv::gpu::Stream::queryIfComplete() { throw_nogpu(); return true; }
+void cv::gpu::Stream::waitForCompletion() { throw_nogpu(); }
+void cv::gpu::Stream::enqueueDownload(const GpuMat& /*src*/, Mat& /*dst*/) { throw_nogpu(); }
+void cv::gpu::Stream::enqueueDownload(const GpuMat& /*src*/, MatPL& /*dst*/) { throw_nogpu(); }
+void cv::gpu::Stream::enqueueUpload(const MatPL& /*src*/, GpuMat& /*dst*/) { throw_nogpu(); }
+void cv::gpu::Stream::enqueueUpload(const Mat& /*src*/, GpuMat& /*dst*/) { throw_nogpu(); }
+void cv::gpu::Stream::enqueueCopy(const GpuMat& /*src*/, GpuMat& /*dst*/) { throw_nogpu(); }
+void cv::gpu::Stream::enqueueMemSet(const GpuMat& /*src*/, Scalar /*val*/) { throw_nogpu(); }
+void cv::gpu::Stream::enqueueMemSet(const GpuMat& /*src*/, Scalar /*val*/, const GpuMat& /*mask*/) { throw_nogpu(); }
+void cv::gpu::Stream::enqueueConvert(const GpuMat& /*src*/, GpuMat& /*dst*/, int /*type*/, double /*a*/, double /*b*/) { throw_nogpu(); }
 
 #else /* !defined (HAVE_CUDA) */
 
 #include "opencv2/gpu/stream_accessor.hpp"
 
-struct CudaStream::Impl
+struct Stream::Impl
 {
     cudaStream_t stream;
     int ref_counter;
@@ -85,9 +85,9 @@ namespace
     };
 }
 
-CV_EXPORTS cudaStream_t cv::gpu::StreamAccessor::getStream(const CudaStream& stream) { return stream.impl->stream; };
+CV_EXPORTS cudaStream_t cv::gpu::StreamAccessor::getStream(const Stream& stream) { return stream.impl->stream; };
 
-void cv::gpu::CudaStream::create()
+void cv::gpu::Stream::create()
 {
     if (impl)
         release();
@@ -95,13 +95,13 @@ void cv::gpu::CudaStream::create()
     cudaStream_t stream;
     cudaSafeCall( cudaStreamCreate( &stream ) );
 
-    impl = (CudaStream::Impl*)fastMalloc(sizeof(CudaStream::Impl));
+    impl = (Stream::Impl*)fastMalloc(sizeof(Stream::Impl));
 
     impl->stream = stream;
     impl->ref_counter = 1;
 }
 
-void cv::gpu::CudaStream::release()
+void cv::gpu::Stream::release()
 {
     if( impl && CV_XADD(&impl->ref_counter, -1) == 1 )
     {
@@ -110,15 +110,15 @@ void cv::gpu::CudaStream::release()
     }
 }
 
-cv::gpu::CudaStream::CudaStream() : impl(0) { create(); }
-cv::gpu::CudaStream::~CudaStream() { release(); }
+cv::gpu::Stream::Stream() : impl(0) { create(); }
+cv::gpu::Stream::~Stream() { release(); }
 
-cv::gpu::CudaStream::CudaStream(const CudaStream& stream) : impl(stream.impl)
+cv::gpu::Stream::Stream(const Stream& stream) : impl(stream.impl)
 {
     if( impl )
         CV_XADD(&impl->ref_counter, 1);
 }
-CudaStream& cv::gpu::CudaStream::operator=(const CudaStream& stream)
+Stream& cv::gpu::Stream::operator=(const Stream& stream)
 {
     if( this != &stream )
     {
@@ -131,7 +131,7 @@ CudaStream& cv::gpu::CudaStream::operator=(const CudaStream& stream)
     return *this;
 }
 
-bool cv::gpu::CudaStream::queryIfComplete()
+bool cv::gpu::Stream::queryIfComplete()
 {
     cudaError_t err = cudaStreamQuery( impl->stream );
 
@@ -142,31 +142,31 @@ bool cv::gpu::CudaStream::queryIfComplete()
     return false;
 }
 
-void cv::gpu::CudaStream::waitForCompletion() { cudaSafeCall( cudaStreamSynchronize( impl->stream ) ); }
+void cv::gpu::Stream::waitForCompletion() { cudaSafeCall( cudaStreamSynchronize( impl->stream ) ); }
 
-void cv::gpu::CudaStream::enqueueDownload(const GpuMat& src, Mat& dst)
+void cv::gpu::Stream::enqueueDownload(const GpuMat& src, Mat& dst)
 {
     // if not -> allocation will be done, but after that dst will not point to page locked memory
     CV_Assert(src.cols == dst.cols && src.rows == dst.rows && src.type() == dst.type() )
     devcopy(src, dst, impl->stream, cudaMemcpyDeviceToHost);
 }
-void cv::gpu::CudaStream::enqueueDownload(const GpuMat& src, MatPL& dst) { devcopy(src, dst, impl->stream, cudaMemcpyDeviceToHost); }
+void cv::gpu::Stream::enqueueDownload(const GpuMat& src, MatPL& dst) { devcopy(src, dst, impl->stream, cudaMemcpyDeviceToHost); }
 
-void cv::gpu::CudaStream::enqueueUpload(const MatPL& src, GpuMat& dst){ devcopy(src, dst, impl->stream,   cudaMemcpyHostToDevice); }
-void cv::gpu::CudaStream::enqueueUpload(const Mat& src, GpuMat& dst)  { devcopy(src, dst, impl->stream,   cudaMemcpyHostToDevice); }
-void cv::gpu::CudaStream::enqueueCopy(const GpuMat& src, GpuMat& dst) { devcopy(src, dst, impl->stream, cudaMemcpyDeviceToDevice); }
+void cv::gpu::Stream::enqueueUpload(const MatPL& src, GpuMat& dst){ devcopy(src, dst, impl->stream,   cudaMemcpyHostToDevice); }
+void cv::gpu::Stream::enqueueUpload(const Mat& src, GpuMat& dst)  { devcopy(src, dst, impl->stream,   cudaMemcpyHostToDevice); }
+void cv::gpu::Stream::enqueueCopy(const GpuMat& src, GpuMat& dst) { devcopy(src, dst, impl->stream, cudaMemcpyDeviceToDevice); }
 
-void cv::gpu::CudaStream::enqueueMemSet(const GpuMat& src, Scalar val)
+void cv::gpu::Stream::enqueueMemSet(const GpuMat& src, Scalar val)
 {
     impl::set_to_without_mask(src, src.depth(), val.val, src.channels(), impl->stream);
 }
 
-void cv::gpu::CudaStream::enqueueMemSet(const GpuMat& src, Scalar val, const GpuMat& mask)
+void cv::gpu::Stream::enqueueMemSet(const GpuMat& src, Scalar val, const GpuMat& mask)
 {
     impl::set_to_with_mask(src, src.depth(), val.val, mask, src.channels(), impl->stream);
 }
 
-void cv::gpu::CudaStream::enqueueConvert(const GpuMat& src, GpuMat& dst, int rtype, double alpha, double beta)
+void cv::gpu::Stream::enqueueConvert(const GpuMat& src, GpuMat& dst, int rtype, double alpha, double beta)
 {
     bool noScale = fabs(alpha-1) < std::numeric_limits<double>::epsilon() && fabs(beta) < std::numeric_limits<double>::epsilon();
 
diff --git a/modules/gpu/src/matrix_operations.cpp b/modules/gpu/src/matrix_operations.cpp
index 88d2e2145..cdab363d8 100644
--- a/modules/gpu/src/matrix_operations.cpp
+++ b/modules/gpu/src/matrix_operations.cpp
@@ -82,7 +82,7 @@ void cv::gpu::GpuMat::upload(const Mat& m)
     cudaSafeCall( cudaMemcpy2D(data, step, m.data, m.step, cols * elemSize(), rows, cudaMemcpyHostToDevice) );
 }
 
-void cv::gpu::GpuMat::upload(const MatPL& m, CudaStream& stream)
+void cv::gpu::GpuMat::upload(const MatPL& m, Stream& stream)
 {
     CV_DbgAssert(!m.empty());
     stream.enqueueUpload(m, *this);
@@ -95,7 +95,7 @@ void cv::gpu::GpuMat::download(cv::Mat& m) const
     cudaSafeCall( cudaMemcpy2D(m.data, m.step, data, step, cols * elemSize(), rows, cudaMemcpyDeviceToHost) );
 }
 
-void cv::gpu::GpuMat::download(MatPL& m, CudaStream& stream) const
+void cv::gpu::GpuMat::download(MatPL& m, Stream& stream) const
 {
     CV_DbgAssert(!m.empty());
     stream.enqueueDownload(*this, m);
diff --git a/modules/gpu/src/stereobm_gpu.cpp b/modules/gpu/src/stereobm_gpu.cpp
index 91d2516e9..2c1f56b6d 100644
--- a/modules/gpu/src/stereobm_gpu.cpp
+++ b/modules/gpu/src/stereobm_gpu.cpp
@@ -52,7 +52,7 @@ cv::gpu::StereoBM_GPU::StereoBM_GPU(int, int, int) { throw_nogpu(); }
 
 bool cv::gpu::StereoBM_GPU::checkIfGpuCallReasonable() { throw_nogpu(); return false; }
 void cv::gpu::StereoBM_GPU::operator() ( const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); }
-void cv::gpu::StereoBM_GPU::operator() ( const GpuMat&, const GpuMat&, GpuMat&, const CudaStream&) { throw_nogpu(); }
+void cv::gpu::StereoBM_GPU::operator() ( const GpuMat&, const GpuMat&, GpuMat&, const Stream&) { throw_nogpu(); }
 
 #else /* !defined (HAVE_CUDA) */
 
@@ -134,7 +134,7 @@ void cv::gpu::StereoBM_GPU::operator() ( const GpuMat& left, const GpuMat& right
     ::stereo_bm_gpu_operator(minSSD, leBuf, riBuf, preset, ndisp, winSize, avergeTexThreshold, left, right, disparity, 0);
 }
 
-void cv::gpu::StereoBM_GPU::operator() ( const GpuMat& left, const GpuMat& right, GpuMat& disparity, const CudaStream& stream)
+void cv::gpu::StereoBM_GPU::operator() ( const GpuMat& left, const GpuMat& right, GpuMat& disparity, const Stream& stream)
 {
     ::stereo_bm_gpu_operator(minSSD, leBuf, riBuf, preset, ndisp, winSize, avergeTexThreshold, left, right, disparity, StreamAccessor::getStream(stream));
 }
diff --git a/tests/gpu/src/operator_async_call.cpp b/tests/gpu/src/operator_async_call.cpp
index 17068815d..024622619 100644
--- a/tests/gpu/src/operator_async_call.cpp
+++ b/tests/gpu/src/operator_async_call.cpp
@@ -58,7 +58,7 @@ bool CV_GpuMatASyncCall::compare_matrix(cv::Mat & cpumat, gpu::GpuMat & gpumat)
 
     //int64 time = getTickCount();
 
-    CudaStream stream;
+    Stream stream;
     stream.enqueueCopy(gmat0, gmat1);
     stream.enqueueCopy(gmat0, gmat2);
     stream.enqueueCopy(gmat0, gmat3);