Merge pull request #974 from jet47:gpu-core-refactoring

2013-06-11 12:05:56 +04:00 · 2013-06-11 12:05:56 +04:00 · 81c6b46fc6
commit 81c6b46fc6
parent e16af9bdc9 4f0d72bf43
96 changed files with 5335 additions and 4101 deletions
--- a/doc/check_docs2.py
+++ b/doc/check_docs2.py
@ -201,9 +201,9 @@ def process_module(module, path):
            hdrlist.append(os.path.join(root, filename))

    if module == "gpu":
-        hdrlist.append(os.path.join(path, "..", "core", "include", "opencv2", "core", "cuda_devptrs.hpp"))
-        hdrlist.append(os.path.join(path, "..", "core", "include", "opencv2", "core", "gpumat.hpp"))
-        hdrlist.append(os.path.join(path, "..", "core", "include", "opencv2", "core", "stream_accessor.hpp"))
+        hdrlist.append(os.path.join(path, "..", "core", "include", "opencv2", "core", "gpu_types.hpp"))
+        hdrlist.append(os.path.join(path, "..", "core", "include", "opencv2", "core", "gpu.hpp"))
+        hdrlist.append(os.path.join(path, "..", "core", "include", "opencv2", "core", "gpu_stream_accessor.hpp"))

    decls = []
    for hname in hdrlist:
--- a/modules/core/include/opencv2/core/base.hpp
+++ b/modules/core/include/opencv2/core/base.hpp
@ -493,6 +493,9 @@ namespace ogl
 namespace gpu
 {
    class CV_EXPORTS GpuMat;
+    class CV_EXPORTS CudaMem; // forward declaration; the class is defined in opencv2/core/gpu.hpp
+    class CV_EXPORTS Stream;  // forward declaration; the class is defined in opencv2/core/gpu.hpp
+    class CV_EXPORTS Event;   // forward declaration; the class is defined in opencv2/core/gpu.hpp
 }

 } // cv
--- a/modules/core/include/opencv2/core/cuda/common.hpp
+++ b/modules/core/include/opencv2/core/cuda/common.hpp
@ -44,7 +44,7 @@
 #define __OPENCV_GPU_COMMON_HPP__

 #include <cuda_runtime.h>
-#include "opencv2/core/cuda_devptrs.hpp"
+#include "opencv2/core/gpu_types.hpp"
 #include "opencv2/core/cvdef.h"
 #include "opencv2/core/base.hpp"

--- a/modules/core/include/opencv2/core/gpu.hpp
+++ b/modules/core/include/opencv2/core/gpu.hpp
@ -0,0 +1,691 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_CORE_GPU_HPP__
+#define __OPENCV_CORE_GPU_HPP__
+
+#ifndef __cplusplus
+#  error gpu.hpp header must be compiled as C++
+#endif
+
+#include "opencv2/core.hpp"
+#include "opencv2/core/gpu_types.hpp"
+
+namespace cv { namespace gpu {
+
+//////////////////////////////// GpuMat ///////////////////////////////
+
+// GpuMat: smart pointer for GPU memory with reference counting.
+// Its interface is mostly similar to cv::Mat; the data members below are public, as in cv::Mat.
+
+class CV_EXPORTS GpuMat
+{
+public:
+    //! default constructor: creates an empty header (all fields zero)
+    GpuMat();
+
+    //! constructs GpuMat of the specified size and type (nothing is allocated unless both dimensions are positive)
+    GpuMat(int rows, int cols, int type);
+    GpuMat(Size size, int type);
+
+    //! constructs GpuMat and fills it with the specified value s (create() followed by setTo(s))
+    GpuMat(int rows, int cols, int type, Scalar s);
+    GpuMat(Size size, int type, Scalar s);
+
+    //! copy constructor: shares the data with m and increments the reference counter if there is one
+    GpuMat(const GpuMat& m);
+
+    //! constructor for GpuMat headers pointing to user-allocated data
+    GpuMat(int rows, int cols, int type, void* data, size_t step = Mat::AUTO_STEP);
+    GpuMat(Size size, int type, void* data, size_t step = Mat::AUTO_STEP);
+
+    //! creates a GpuMat header for a part of the bigger matrix
+    GpuMat(const GpuMat& m, Range rowRange, Range colRange);
+    GpuMat(const GpuMat& m, Rect roi);
+
+    //! builds GpuMat from host memory by calling upload(arr) (Blocking call)
+    explicit GpuMat(InputArray arr);
+
+    //! destructor - calls release()
+    ~GpuMat();
+
+    //! assignment operator: copies m into a temporary and swaps with it; self-assignment is a no-op
+    GpuMat& operator =(const GpuMat& m);
+
+    //! allocates new GpuMat data unless the GpuMat already has specified size and type
+    void create(int rows, int cols, int type);
+    void create(Size size, int type);
+
+    //! decreases reference counter, deallocates the data when reference counter reaches 0
+    void release();
+
+    //! swaps all header fields with another GpuMat (no data is copied)
+    void swap(GpuMat& mat);
+
+    //! performs data upload to GpuMat (Blocking call)
+    void upload(InputArray arr);
+
+    //! performs data upload to GpuMat (Non-Blocking call)
+    void upload(InputArray arr, Stream& stream);
+
+    //! performs data download from device to host memory (Blocking call)
+    void download(OutputArray dst) const;
+
+    //! performs data download from device to host memory (Non-Blocking call)
+    void download(OutputArray dst, Stream& stream) const;
+
+    //! returns deep copy of the GpuMat, i.e. the data is copied (implemented via copyTo)
+    GpuMat clone() const;
+
+    //! copies the GpuMat content to device memory (Blocking call)
+    void copyTo(OutputArray dst) const;
+
+    //! copies the GpuMat content to device memory (Non-Blocking call)
+    void copyTo(OutputArray dst, Stream& stream) const;
+
+    //! copies those GpuMat elements to dst that are marked with non-zero mask elements (Blocking call; forwards to the stream overload with Stream::Null())
+    void copyTo(OutputArray dst, InputArray mask) const;
+
+    //! copies those GpuMat elements to dst that are marked with non-zero mask elements (Non-Blocking call)
+    void copyTo(OutputArray dst, InputArray mask, Stream& stream) const;
+
+    //! sets the GpuMat elements to s; this overload takes no mask (Blocking call; forwards to the stream overload with Stream::Null())
+    GpuMat& setTo(Scalar s);
+
+    //! sets the GpuMat elements to s; this overload takes no mask (Non-Blocking call)
+    GpuMat& setTo(Scalar s, Stream& stream);
+
+    //! sets some of the GpuMat elements to s, according to the mask (Blocking call; forwards to the stream overload with Stream::Null())
+    GpuMat& setTo(Scalar s, InputArray mask);
+
+    //! sets some of the GpuMat elements to s, according to the mask (Non-Blocking call)
+    GpuMat& setTo(Scalar s, InputArray mask, Stream& stream);
+
+    //! converts GpuMat to another datatype (Blocking call; forwards to the stream overload with Stream::Null())
+    void convertTo(OutputArray dst, int rtype) const;
+
+    //! converts GpuMat to another datatype (Non-Blocking call)
+    void convertTo(OutputArray dst, int rtype, Stream& stream) const;
+
+    //! converts GpuMat to another datatype with scaling (Blocking call; forwards to the stream overload with Stream::Null())
+    void convertTo(OutputArray dst, int rtype, double alpha, double beta = 0.0) const;
+
+    //! converts GpuMat to another datatype with scaling, using beta = 0.0 (Non-Blocking call)
+    void convertTo(OutputArray dst, int rtype, double alpha, Stream& stream) const;
+
+    //! converts GpuMat to another datatype with scaling (Non-Blocking call)
+    void convertTo(OutputArray dst, int rtype, double alpha, double beta, Stream& stream) const;
+
+    void assignTo(GpuMat& m, int type=-1) const; // type < 0: m = *this (shared data); otherwise convertTo(m, type)
+
+    //! returns pointer to y-th row (data + step * y); y is range-checked only by CV_DbgAssert
+    uchar* ptr(int y = 0);
+    const uchar* ptr(int y = 0) const;
+
+    //! template version of the above method (casts the row pointer to _Tp*)
+    template<typename _Tp> _Tp* ptr(int y = 0);
+    template<typename _Tp> const _Tp* ptr(int y = 0) const;
+
+    template <typename _Tp> operator PtrStepSz<_Tp>() const; // wraps rows, cols, data (cast to _Tp*) and step
+    template <typename _Tp> operator PtrStep<_Tp>() const;   // wraps data (cast to _Tp*) and step only
+
+    //! returns a new GpuMat header for the specified row
+    GpuMat row(int y) const;
+
+    //! returns a new GpuMat header for the specified column
+    GpuMat col(int x) const;
+
+    //! ... for the specified row span
+    GpuMat rowRange(int startrow, int endrow) const;
+    GpuMat rowRange(Range r) const;
+
+    //! ... for the specified column span
+    GpuMat colRange(int startcol, int endcol) const;
+    GpuMat colRange(Range r) const;
+
+    //! extracts a rectangular sub-GpuMat (this is a generalized form of row, rowRange etc.)
+    GpuMat operator ()(Range rowRange, Range colRange) const;
+    GpuMat operator ()(Rect roi) const;
+
+    //! creates alternative GpuMat header for the same data, with different
+    //! number of channels and/or different number of rows
+    GpuMat reshape(int cn, int rows = 0) const;
+
+    //! locates GpuMat header within a parent GpuMat
+    void locateROI(Size& wholeSize, Point& ofs) const;
+
+    //! moves/resizes the current GpuMat ROI inside the parent GpuMat
+    GpuMat& adjustROI(int dtop, int dbottom, int dleft, int dright);
+
+    //! returns true iff the GpuMat data is continuous, i.e. Mat::CONTINUOUS_FLAG is set in flags
+    //! (i.e. when there are no gaps between successive rows)
+    bool isContinuous() const;
+
+    //! returns element size in bytes
+    size_t elemSize() const;
+
+    //! returns the size of element channel in bytes
+    size_t elemSize1() const;
+
+    //! returns element type (CV_MAT_TYPE of flags)
+    int type() const;
+
+    //! returns element depth (CV_MAT_DEPTH of flags)
+    int depth() const;
+
+    //! returns number of channels
+    int channels() const;
+
+    //! returns step/elemSize1()
+    size_t step1() const;
+
+    //! returns GpuMat size : width == number of columns, height == number of rows
+    Size size() const;
+
+    //! returns true if GpuMat data is NULL
+    bool empty() const;
+
+    /*! includes several bit-fields:
+    - the magic signature
+    - continuity flag
+    - depth
+    - number of channels
+    */
+    int flags;
+
+    //! the number of rows and columns
+    int rows, cols;
+
+    //! a distance between successive rows in bytes; includes the gap if any
+    size_t step;
+
+    //! pointer to the data
+    uchar* data;
+
+    //! pointer to the reference counter;
+    //! when GpuMat points to user-allocated data, the pointer is NULL
+    int* refcount;
+
+    //! helper fields used in locateROI and adjustROI
+    uchar* datastart;
+    uchar* dataend;
+};
+
+//! creates continuous matrix of the given size and type in arr
+CV_EXPORTS void createContinuous(int rows, int cols, int type, OutputArray arr);
+
+//! ensures that the size of the given matrix is not less than (rows, cols)
+//! and that the matrix type matches the specified one too
+CV_EXPORTS void ensureSizeIsEnough(int rows, int cols, int type, OutputArray arr);
+
+CV_EXPORTS GpuMat allocMatFromBuf(int rows, int cols, int type, GpuMat& mat); // NOTE(review): undocumented here; presumably returns a rows x cols header backed by mat - confirm against the implementation
+
+//////////////////////////////// CudaMem ////////////////////////////////
+
+// CudaMem is a limited cv::Mat with page-locked memory allocation.
+// Page-locked memory is only needed for async and faster copying to GPU.
+// It is convertible to a cv::Mat header without reference counting
+// so you can use it with other OpenCV functions.
+
+class CV_EXPORTS CudaMem
+{
+public:
+    enum AllocType { PAGE_LOCKED = 1, SHARED = 2, WRITE_COMBINED = 4 };
+
+    explicit CudaMem(AllocType alloc_type = PAGE_LOCKED); // creates an empty header (all fields zero) that remembers alloc_type
+
+    CudaMem(const CudaMem& m);
+
+    CudaMem(int rows, int cols, int type, AllocType alloc_type = PAGE_LOCKED);
+    CudaMem(Size size, int type, AllocType alloc_type = PAGE_LOCKED);
+
+    //! creates from host memory, copying the data
+    explicit CudaMem(InputArray arr, AllocType alloc_type = PAGE_LOCKED);
+
+    ~CudaMem();
+
+    CudaMem& operator =(const CudaMem& m);
+
+    //! swaps with other smart pointer
+    void swap(CudaMem& b);
+
+    //! returns deep copy of the matrix, i.e. the data is copied
+    CudaMem clone() const;
+
+    //! allocates new matrix data unless the matrix already has specified size and type.
+    void create(int rows, int cols, int type);
+    void create(Size size, int type);
+
+    //! creates alternative CudaMem header for the same data, with different
+    //! number of channels and/or different number of rows
+    CudaMem reshape(int cn, int rows = 0) const;
+
+    //! decrements reference counter and releases memory if needed.
+    void release();
+
+    //! returns matrix header with disabled reference counting for CudaMem data.
+    Mat createMatHeader() const;
+
+    //! maps host memory into device address space and returns GpuMat header for it. Throws exception if not supported by hardware.
+    GpuMat createGpuMatHeader() const;
+
+    // Please see cv::Mat for descriptions
+    bool isContinuous() const;
+    size_t elemSize() const;
+    size_t elemSize1() const;
+    int type() const;
+    int depth() const;
+    int channels() const;
+    size_t step1() const;
+    Size size() const;
+    bool empty() const;
+
+    // Please see cv::Mat for descriptions
+    int flags;
+    int rows, cols;
+    size_t step;
+
+    uchar* data;
+    int* refcount;
+
+    uchar* datastart;
+    uchar* dataend;
+
+    AllocType alloc_type; // allocation kind chosen at construction (one of the AllocType values)
+};
+
+//! page-locks the memory of matrix m and maps it for the device(s)
+CV_EXPORTS void registerPageLocked(Mat& m);
+
+//! unmaps the memory of matrix m and makes it pageable again (counterpart of registerPageLocked)
+CV_EXPORTS void unregisterPageLocked(Mat& m);
+
+///////////////////////////////// Stream //////////////////////////////////
+
+// Encapsulates a CUDA stream. Provides an interface for asynchronous copying.
+// Passed to each function that supports async kernel execution.
+// Reference counting is enabled.
+
+class CV_EXPORTS Stream
+{
+    typedef void (Stream::*bool_type)() const; // safe-bool idiom: conversion target used instead of plain bool
+    void this_type_does_not_support_comparisons() const {} // member whose address has type bool_type
+
+public:
+    typedef void (*StreamCallback)(int status, void* userData);
+
+    //! creates a new asynchronous stream
+    Stream();
+
+    //! queries an asynchronous stream for completion status
+    bool queryIfComplete() const;
+
+    //! waits for stream tasks to complete
+    void waitForCompletion();
+
+    //! makes a compute stream wait on an event
+    void waitEvent(const Event& event);
+
+    //! adds a callback to be called on the host after all currently enqueued items in the stream have completed
+    void enqueueHostCallback(StreamCallback callback, void* userData);
+
+    //! returns the Stream object for the default CUDA stream
+    static Stream& Null();
+
+    //! returns true if stream object is not default (!= 0)
+    operator bool_type() const;
+
+    // obsolete methods (each takes the same kind of arguments as a stream-taking GpuMat method declared earlier in this header)
+
+    void enqueueDownload(const GpuMat& src, OutputArray dst);
+
+    void enqueueUpload(InputArray src, GpuMat& dst);
+
+    void enqueueCopy(const GpuMat& src, OutputArray dst);
+
+    void enqueueMemSet(GpuMat& src, Scalar val);
+    void enqueueMemSet(GpuMat& src, Scalar val, InputArray mask);
+
+    void enqueueConvert(const GpuMat& src, OutputArray dst, int dtype, double alpha = 1.0, double beta = 0.0);
+
+    class Impl; // only declared here; the definition is not part of this header
+
+private:
+    Ptr<Impl> impl_;
+    Stream(const Ptr<Impl>& impl);
+
+    friend struct StreamAccessor; // gives StreamAccessor access to the private members above
+};
+
+class CV_EXPORTS Event
+{
+public:
+    enum CreateFlags
+    {
+        DEFAULT        = 0x00,  /**< Default event flag */
+        BLOCKING_SYNC  = 0x01,  /**< Event uses blocking synchronization */
+        DISABLE_TIMING = 0x02,  /**< Event will not record timing data */
+        INTERPROCESS   = 0x04   /**< Event is suitable for interprocess use. DisableTiming must be set */
+    };
+
+    explicit Event(CreateFlags flags = DEFAULT); // flags is one CreateFlags value; DEFAULT when omitted
+
+    //! records an event in the given stream (Stream::Null() when omitted)
+    void record(Stream& stream = Stream::Null());
+
+    //! queries an event's status
+    bool queryIfComplete() const;
+
+    //! waits for an event to complete
+    void waitForCompletion();
+
+    //! computes the elapsed time between events (NOTE(review): unit is not stated here; presumably milliseconds - confirm)
+    static float elapsedTime(const Event& start, const Event& end);
+
+    class Impl; // only declared here; the definition is not part of this header
+
+private:
+    Ptr<Impl> impl_;
+
+    friend struct EventAccessor; // gives EventAccessor access to impl_
+};
+
+//////////////////////////////// Initialization & Info ////////////////////////
+
+//! this is the only function that does not throw exceptions if the library is compiled without CUDA
+CV_EXPORTS int getCudaEnabledDeviceCount();
+
+//! sets the device to be used for GPU executions for the calling host thread
+CV_EXPORTS void setDevice(int device);
+
+//! returns which device is currently being used for the calling host thread
+CV_EXPORTS int getDevice();
+
+//! explicitly destroys and cleans up all resources associated with the current device in the current process;
+//! any subsequent API call to this device will reinitialize the device
+CV_EXPORTS void resetDevice();
+
+enum FeatureSet
+{
+    FEATURE_SET_COMPUTE_10 = 10, // values appear to encode compute capability as major*10 + minor
+    FEATURE_SET_COMPUTE_11 = 11,
+    FEATURE_SET_COMPUTE_12 = 12,
+    FEATURE_SET_COMPUTE_13 = 13,
+    FEATURE_SET_COMPUTE_20 = 20,
+    FEATURE_SET_COMPUTE_21 = 21,
+    FEATURE_SET_COMPUTE_30 = 30,
+    FEATURE_SET_COMPUTE_35 = 35,
+
+    GLOBAL_ATOMICS = FEATURE_SET_COMPUTE_11, // named features are aliases of the FEATURE_SET_COMPUTE_* value on their right
+    SHARED_ATOMICS = FEATURE_SET_COMPUTE_12,
+    NATIVE_DOUBLE = FEATURE_SET_COMPUTE_13,
+    WARP_SHUFFLE_FUNCTIONS = FEATURE_SET_COMPUTE_30,
+    DYNAMIC_PARALLELISM = FEATURE_SET_COMPUTE_35
+};
+
+//! checks whether the current device supports the given feature (see also DeviceInfo::supports for a specific device)
+CV_EXPORTS bool deviceSupports(FeatureSet feature_set);
+
+//! information about which GPU architectures this OpenCV GPU module was compiled for; all members are static
+class CV_EXPORTS TargetArchs
+{
+public:
+    static bool builtWith(FeatureSet feature_set);
+
+    static bool has(int major, int minor); // NOTE(review): presumably true when either PTX or binary code exists for major.minor - confirm
+    static bool hasPtx(int major, int minor);
+    static bool hasBin(int major, int minor);
+
+    static bool hasEqualOrLessPtx(int major, int minor);
+    static bool hasEqualOrGreater(int major, int minor);
+    static bool hasEqualOrGreaterPtx(int major, int minor);
+    static bool hasEqualOrGreaterBin(int major, int minor);
+};
+
+//! information about the given GPU; the object itself stores only the device id
+class CV_EXPORTS DeviceInfo
+{
+public:
+    //! creates DeviceInfo object for the current GPU
+    DeviceInfo();
+
+    //! creates DeviceInfo object for the given GPU (not explicit, so an int converts implicitly)
+    DeviceInfo(int device_id);
+
+    //! device number.
+    int deviceID() const;
+
+    //! ASCII string identifying device
+    const char* name() const;
+
+    //! global memory available on device in bytes
+    size_t totalGlobalMem() const;
+
+    //! shared memory available per block in bytes
+    size_t sharedMemPerBlock() const;
+
+    //! 32-bit registers available per block
+    int regsPerBlock() const;
+
+    //! warp size in threads
+    int warpSize() const;
+
+    //! maximum pitch in bytes allowed by memory copies
+    size_t memPitch() const;
+
+    //! maximum number of threads per block
+    int maxThreadsPerBlock() const;
+
+    //! maximum size of each dimension of a block
+    Vec3i maxThreadsDim() const;
+
+    //! maximum size of each dimension of a grid
+    Vec3i maxGridSize() const;
+
+    //! clock frequency in kilohertz
+    int clockRate() const;
+
+    //! constant memory available on device in bytes
+    size_t totalConstMem() const;
+
+    //! major compute capability
+    int major() const;
+
+    //! minor compute capability
+    int minor() const;
+
+    //! alignment requirement for textures
+    size_t textureAlignment() const;
+
+    //! pitch alignment requirement for texture references bound to pitched memory
+    size_t texturePitchAlignment() const;
+
+    //! number of multiprocessors on device
+    int multiProcessorCount() const;
+
+    //! specifies whether there is a run time limit on kernels
+    bool kernelExecTimeoutEnabled() const;
+
+    //! device is integrated as opposed to discrete
+    bool integrated() const;
+
+    //! device can map host memory with cudaHostAlloc/cudaHostGetDevicePointer
+    bool canMapHostMemory() const;
+
+    enum ComputeMode
+    {
+        ComputeModeDefault,         /**< default compute mode (Multiple threads can use ::cudaSetDevice() with this device) */
+        ComputeModeExclusive,       /**< compute-exclusive-thread mode (Only one thread in one process will be able to use ::cudaSetDevice() with this device) */
+        ComputeModeProhibited,      /**< compute-prohibited mode (No threads can use ::cudaSetDevice() with this device) */
+        ComputeModeExclusiveProcess /**< compute-exclusive-process mode (Many threads in one process will be able to use ::cudaSetDevice() with this device) */
+    };
+
+    //! compute mode
+    ComputeMode computeMode() const;
+
+    //! maximum 1D texture size
+    int maxTexture1D() const;
+
+    //! maximum 1D mipmapped texture size
+    int maxTexture1DMipmap() const;
+
+    //! maximum size for 1D textures bound to linear memory
+    int maxTexture1DLinear() const;
+
+    //! maximum 2D texture dimensions
+    Vec2i maxTexture2D() const;
+
+    //! maximum 2D mipmapped texture dimensions
+    Vec2i maxTexture2DMipmap() const;
+
+    //! maximum dimensions (width, height, pitch) for 2D textures bound to pitched memory
+    Vec3i maxTexture2DLinear() const;
+
+    //! maximum 2D texture dimensions if texture gather operations have to be performed
+    Vec2i maxTexture2DGather() const;
+
+    //! maximum 3D texture dimensions
+    Vec3i maxTexture3D() const;
+
+    //! maximum Cubemap texture dimensions
+    int maxTextureCubemap() const;
+
+    //! maximum 1D layered texture dimensions
+    Vec2i maxTexture1DLayered() const;
+
+    //! maximum 2D layered texture dimensions
+    Vec3i maxTexture2DLayered() const;
+
+    //! maximum Cubemap layered texture dimensions
+    Vec2i maxTextureCubemapLayered() const;
+
+    //! maximum 1D surface size
+    int maxSurface1D() const;
+
+    //! maximum 2D surface dimensions
+    Vec2i maxSurface2D() const;
+
+    //! maximum 3D surface dimensions
+    Vec3i maxSurface3D() const;
+
+    //! maximum 1D layered surface dimensions
+    Vec2i maxSurface1DLayered() const;
+
+    //! maximum 2D layered surface dimensions
+    Vec3i maxSurface2DLayered() const;
+
+    //! maximum Cubemap surface dimensions
+    int maxSurfaceCubemap() const;
+
+    //! maximum Cubemap layered surface dimensions
+    Vec2i maxSurfaceCubemapLayered() const;
+
+    //! alignment requirements for surfaces
+    size_t surfaceAlignment() const;
+
+    //! device can possibly execute multiple kernels concurrently
+    bool concurrentKernels() const;
+
+    //! device has ECC support enabled
+    bool ECCEnabled() const;
+
+    //! PCI bus ID of the device
+    int pciBusID() const;
+
+    //! PCI device ID of the device
+    int pciDeviceID() const;
+
+    //! PCI domain ID of the device
+    int pciDomainID() const;
+
+    //! true if device is a Tesla device using TCC driver, false otherwise
+    bool tccDriver() const;
+
+    //! number of asynchronous engines
+    int asyncEngineCount() const;
+
+    //! device shares a unified address space with the host
+    bool unifiedAddressing() const;
+
+    //! peak memory clock frequency in kilohertz
+    int memoryClockRate() const;
+
+    //! global memory bus width in bits
+    int memoryBusWidth() const;
+
+    //! size of L2 cache in bytes
+    int l2CacheSize() const;
+
+    //! maximum resident threads per multiprocessor
+    int maxThreadsPerMultiProcessor() const;
+
+    //! gets free and total device memory; note the output order: totalMemory first, freeMemory second
+    void queryMemory(size_t& totalMemory, size_t& freeMemory) const;
+    size_t freeMemory() const;
+    size_t totalMemory() const;
+
+    //! checks whether device supports the given feature
+    bool supports(FeatureSet feature_set) const;
+
+    //! checks whether the GPU module can be run on the given device
+    bool isCompatible() const;
+
+private:
+    int device_id_; // the only data member of this class
+};
+
+CV_EXPORTS void printCudaDeviceInfo(int device);      // NOTE(review): undocumented here; presumably prints the properties of the given device - confirm
+CV_EXPORTS void printShortCudaDeviceInfo(int device); // NOTE(review): presumably a condensed form of the above - confirm
+
+}} // namespace cv { namespace gpu {
+
+namespace cv {
+
+template <> CV_EXPORTS void Ptr<cv::gpu::Stream::Impl>::delete_obj(); // explicit specialization declared for Stream::Impl, a type that is only declared in this header
+template <> CV_EXPORTS void Ptr<cv::gpu::Event::Impl>::delete_obj();  // same for Event::Impl; the definitions are not visible in this header
+
+}
+
+#include "opencv2/core/gpu.inl.hpp"
+
+#endif /* __OPENCV_CORE_GPU_HPP__ */
--- a/modules/core/include/opencv2/core/gpu.inl.hpp
+++ b/modules/core/include/opencv2/core/gpu.inl.hpp
@ -0,0 +1,641 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_CORE_GPUINL_HPP__
+#define __OPENCV_CORE_GPUINL_HPP__
+
+#include "opencv2/core/gpu.hpp"
+
+namespace cv { namespace gpu {
+
+//////////////////////////////// GpuMat ///////////////////////////////
+
+// Default constructor: builds an empty header with every field zeroed
+// (no data pointer, no reference counter).
+inline GpuMat::GpuMat()
+    : flags(0),
+      rows(0),
+      cols(0),
+      step(0),
+      data(0),
+      refcount(0),
+      datastart(0),
+      dataend(0)
+{
+}
+
+// Sized constructor: starts from an empty header and calls create() only
+// when both requested dimensions are positive.
+inline GpuMat::GpuMat(int rows_, int cols_, int type_)
+    : flags(0), rows(0), cols(0), step(0),
+      data(0), refcount(0), datastart(0), dataend(0)
+{
+    const bool hasArea = (rows_ > 0) && (cols_ > 0);
+    if (!hasArea)
+        return;
+
+    create(rows_, cols_, type_);
+}
+
+// Sized constructor taking a Size: starts from an empty header and calls
+// create() only when both height and width are positive.
+inline GpuMat::GpuMat(Size size_, int type_)
+    : flags(0), rows(0), cols(0), step(0),
+      data(0), refcount(0), datastart(0), dataend(0)
+{
+    const bool hasArea = (size_.height > 0) && (size_.width > 0);
+    if (!hasArea)
+        return;
+
+    create(size_.height, size_.width, type_);
+}
+
+// Sized-and-filled constructor: when both dimensions are positive, allocates
+// with create() and then fills the matrix with s_ via setTo(); otherwise the
+// header stays empty.
+inline GpuMat::GpuMat(int rows_, int cols_, int type_, Scalar s_)
+    : flags(0), rows(0), cols(0), step(0),
+      data(0), refcount(0), datastart(0), dataend(0)
+{
+    if (rows_ <= 0 || cols_ <= 0)
+        return;
+
+    create(rows_, cols_, type_);
+    setTo(s_);
+}
+
+// Sized-and-filled constructor taking a Size: when both height and width are
+// positive, allocates with create() and then fills the matrix with s_ via
+// setTo(); otherwise the header stays empty.
+inline GpuMat::GpuMat(Size size_, int type_, Scalar s_)
+    : flags(0), rows(0), cols(0), step(0),
+      data(0), refcount(0), datastart(0), dataend(0)
+{
+    if (size_.height <= 0 || size_.width <= 0)
+        return;
+
+    create(size_.height, size_.width, type_);
+    setTo(s_);
+}
+
+// Copy constructor: copies every header field from m (so the data is shared)
+// and, when a reference counter exists, increments it.
+inline GpuMat::GpuMat(const GpuMat& m)
+    : flags(m.flags),
+      rows(m.rows),
+      cols(m.cols),
+      step(m.step),
+      data(m.data),
+      refcount(m.refcount),
+      datastart(m.datastart),
+      dataend(m.dataend)
+{
+    if (refcount != 0)
+    {
+        CV_XADD(refcount, 1);
+    }
+}
+
+// Constructs from an InputArray: starts from an empty header and then
+// calls upload(arr).
+inline GpuMat::GpuMat(InputArray arr)
+    : flags(0), rows(0), cols(0), step(0),
+      data(0), refcount(0), datastart(0), dataend(0)
+{
+    this->upload(arr);
+}
+
+// Destructor: delegates all cleanup to release().
+inline GpuMat::~GpuMat()
+{
+    this->release();
+}
+
+// Assignment: copies m into a temporary and swaps it with *this, so the
+// previous contents are released when the temporary is destroyed.
+// Assigning an object to itself does nothing.
+inline GpuMat& GpuMat::operator =(const GpuMat& m)
+{
+    if (this == &m)
+        return *this;
+
+    GpuMat copyOfSource(m);
+    swap(copyOfSource);
+    return *this;
+}
+
+// Size overload of create(): forwards to create(rows, cols, type) with
+// rows = height and cols = width.
+inline void GpuMat::create(Size size_, int type_)
+{
+    const int height = size_.height;
+    const int width = size_.width;
+    create(height, width, type_);
+}
+
+// Exchanges every header field with b; only the headers change hands.
+inline void GpuMat::swap(GpuMat& b)
+{
+    // geometry and type information
+    std::swap(flags, b.flags);
+    std::swap(rows,  b.rows);
+    std::swap(cols,  b.cols);
+    std::swap(step,  b.step);
+
+    // pointers: data, its bounds, and the reference counter
+    std::swap(data,      b.data);
+    std::swap(refcount,  b.refcount);
+    std::swap(datastart, b.datastart);
+    std::swap(dataend,   b.dataend);
+}
+
+// Returns a new GpuMat filled by copyTo() from this one.
+inline GpuMat GpuMat::clone() const
+{
+    GpuMat duplicate;
+    this->copyTo(duplicate);
+    return duplicate;
+}
+
+// Masked copy without an explicit stream: forwards to the stream overload
+// using Stream::Null().
+inline void GpuMat::copyTo(OutputArray dst, InputArray mask) const
+{
+    Stream& defaultStream = Stream::Null();
+    copyTo(dst, mask, defaultStream);
+}
+
+// setTo without an explicit stream: forwards to the stream overload using
+// Stream::Null() and returns its result.
+inline GpuMat& GpuMat::setTo(Scalar s)
+{
+    Stream& defaultStream = Stream::Null();
+    return setTo(s, defaultStream);
+}
+
+// Masked setTo without an explicit stream: forwards to the stream overload
+// using Stream::Null() and returns its result.
+inline GpuMat& GpuMat::setTo(Scalar s, InputArray mask)
+{
+    Stream& defaultStream = Stream::Null();
+    return setTo(s, mask, defaultStream);
+}
+
+// Type conversion without an explicit stream: forwards to the stream overload
+// using Stream::Null().
+inline void GpuMat::convertTo(OutputArray dst, int rtype) const
+{
+    Stream& defaultStream = Stream::Null();
+    convertTo(dst, rtype, defaultStream);
+}
+
+// Scaled conversion without an explicit stream: forwards alpha and beta to
+// the stream overload using Stream::Null().
+inline void GpuMat::convertTo(OutputArray dst, int rtype, double alpha, double beta) const
+{
+    Stream& defaultStream = Stream::Null();
+    convertTo(dst, rtype, alpha, beta, defaultStream);
+}
+
+// Scaled conversion on a stream with no offset: forwards to the
+// five-argument overload with beta fixed at 0.0.
+inline void GpuMat::convertTo(OutputArray dst, int rtype, double alpha, Stream& stream) const
+{
+    const double noOffset = 0.0;
+    convertTo(dst, rtype, alpha, noOffset, stream);
+}
+
+// Assigns this matrix to m. A non-negative _type requests a conversion to
+// that type via convertTo(); a negative _type performs a plain assignment.
+inline void GpuMat::assignTo(GpuMat& m, int _type) const
+{
+    if (_type >= 0)
+    {
+        convertTo(m, _type);
+        return;
+    }
+
+    m = *this;
+}
+
+// Returns the address of row y (data + step * y). The row index is checked
+// only through CV_DbgAssert.
+inline uchar* GpuMat::ptr(int y)
+{
+    CV_DbgAssert( (unsigned)y < (unsigned)rows );
+    uchar* const rowStart = data + step * y;
+    return rowStart;
+}
+
+// Const overload: returns the address of row y (data + step * y). The row
+// index is checked only through CV_DbgAssert.
+inline const uchar* GpuMat::ptr(int y) const
+{
+    CV_DbgAssert( (unsigned)y < (unsigned)rows );
+    const uchar* const rowStart = data + step * y;
+    return rowStart;
+}
+
+// Typed row accessor: obtains the byte pointer for row y and casts it to _Tp*.
+template<typename _Tp>
+inline _Tp* GpuMat::ptr(int y)
+{
+    uchar* const rawRow = ptr(y);
+    return (_Tp*)rawRow;
+}
+
+// Typed const row accessor: obtains the byte pointer for row y and casts it
+// to const _Tp*.
+template<typename _Tp>
+inline const _Tp* GpuMat::ptr(int y) const
+{
+    const uchar* const rawRow = ptr(y);
+    return (const _Tp*)rawRow;
+}
+
+// Conversion to PtrStepSz<T>: packs rows, cols, the data pointer (cast to T*)
+// and the row step.
+template <class T>
+inline GpuMat::operator PtrStepSz<T>() const
+{
+    T* const typedData = (T*)data;
+    return PtrStepSz<T>(rows, cols, typedData, step);
+}
+
+// Conversion to PtrStep<T>: packs the data pointer (cast to T*) and the row
+// step; no size information is carried.
+template <class T>
+inline GpuMat::operator PtrStep<T>() const
+{
+    T* const typedData = (T*)data;
+    return PtrStep<T>(typedData, step);
+}
+
+// Header for the single row y: row range [y, y+1) across all columns.
+inline GpuMat GpuMat::row(int y) const
+{
+    const Range singleRow(y, y + 1);
+    return GpuMat(*this, singleRow, Range::all());
+}
+
+// Header for the single column x: all rows, column range [x, x+1).
+inline GpuMat GpuMat::col(int x) const
+{
+    const Range singleCol(x, x + 1);
+    return GpuMat(*this, Range::all(), singleCol);
+}
+
+// Header for the rows given by Range(startrow, endrow), across all columns.
+inline GpuMat GpuMat::rowRange(int startrow, int endrow) const
+{
+    const Range rowSpan(startrow, endrow);
+    return GpuMat(*this, rowSpan, Range::all());
+}
+
+// Header for the rows in r, across all columns.
+inline GpuMat GpuMat::rowRange(Range r) const
+{
+    GpuMat rowsView(*this, r, Range::all());
+    return rowsView;
+}
+
+// Header for the columns given by Range(startcol, endcol), across all rows.
+inline GpuMat GpuMat::colRange(int startcol, int endcol) const
+{
+    const Range colSpan(startcol, endcol);
+    return GpuMat(*this, Range::all(), colSpan);
+}
+
+// Header for the columns in r, across all rows.
+inline GpuMat GpuMat::colRange(Range r) const
+{
+    GpuMat colsView(*this, Range::all(), r);
+    return colsView;
+}
+
+//! extracts a sub-matrix header described by a row range and a column range
+inline GpuMat GpuMat::operator ()(Range rowRange_, Range colRange_) const
+{
+    const GpuMat& parent = *this;
+    return GpuMat(parent, rowRange_, colRange_);
+}
+
+//! extracts a sub-matrix header described by the rectangle roi
+inline GpuMat GpuMat::operator ()(Rect roi) const
+{
+    const GpuMat& parent = *this;
+    return GpuMat(parent, roi);
+}
+
+//! returns true when Mat::CONTINUOUS_FLAG is set in flags
+inline bool GpuMat::isContinuous() const
+{
+    const int continuityBit = flags & Mat::CONTINUOUS_FLAG;
+    return continuityBit != 0;
+}
+
+//! returns CV_ELEM_SIZE(flags), the element size
+inline size_t GpuMat::elemSize() const
+{
+    const size_t elemBytes = CV_ELEM_SIZE(flags);
+    return elemBytes;
+}
+
+//! returns CV_ELEM_SIZE1(flags), the size of one element channel
+inline size_t GpuMat::elemSize1() const
+{
+    const size_t channelBytes = CV_ELEM_SIZE1(flags);
+    return channelBytes;
+}
+
+//! returns CV_MAT_TYPE(flags), the element type
+inline int GpuMat::type() const
+{
+    const int matType = CV_MAT_TYPE(flags);
+    return matType;
+}
+
+//! returns CV_MAT_DEPTH(flags), the element depth
+inline int GpuMat::depth() const
+{
+    const int matDepth = CV_MAT_DEPTH(flags);
+    return matDepth;
+}
+
+//! returns CV_MAT_CN(flags), the number of channels
+inline int GpuMat::channels() const
+{
+    const int channelCount = CV_MAT_CN(flags);
+    return channelCount;
+}
+
+//! returns step / elemSize1()
+inline size_t GpuMat::step1() const
+{
+    const size_t channelBytes = elemSize1();
+    return step / channelBytes;
+}
+
+//! returns the matrix size: width == cols, height == rows
+inline Size GpuMat::size() const
+{
+    const int width = cols;
+    const int height = rows;
+    return Size(width, height);
+}
+
+//! returns true if the data pointer is NULL
+inline bool GpuMat::empty() const
+{
+    return !data;
+}
+
+//! convenience overload: runs createContinuous(rows, cols, type, arr) on a fresh GpuMat and returns it
+static inline GpuMat createContinuous(int rows, int cols, int type)
+{
+    GpuMat result;
+    createContinuous(rows, cols, type, result);
+    return result;
+}
+
+//! Size-based overload: forwards size.height as rows and size.width as cols
+static inline void createContinuous(Size size, int type, OutputArray arr)
+{
+    const int rows = size.height;
+    const int cols = size.width;
+    createContinuous(rows, cols, type, arr);
+}
+
+//! convenience overload: runs createContinuous(size, type, arr) on a fresh GpuMat and returns it
+static inline GpuMat createContinuous(Size size, int type)
+{
+    GpuMat result;
+    createContinuous(size, type, result);
+    return result;
+}
+
+//! Size-based overload: forwards size.height as rows and size.width as cols
+static inline void ensureSizeIsEnough(Size size, int type, OutputArray arr)
+{
+    const int rows = size.height;
+    const int cols = size.width;
+    ensureSizeIsEnough(rows, cols, type, arr);
+}
+
+//! free-function swap for GpuMat; delegates to the member GpuMat::swap
+static inline void swap(GpuMat& a, GpuMat& b)
+{
+    GpuMat& lhs = a;
+    lhs.swap(b);
+}
+
+//////////////////////////////// CudaMem ////////////////////////////////
+
+//! constructs an empty CudaMem (all fields zeroed) that remembers the requested allocation type
+inline CudaMem::CudaMem(AllocType alloc_type_)
+    : flags(0),
+      rows(0),
+      cols(0),
+      step(0),
+      data(0),
+      refcount(0),
+      datastart(0),
+      dataend(0),
+      alloc_type(alloc_type_)
+{
+    // nothing to allocate for an empty header
+}
+
+//! copy constructor: copies every header field from m and, when a reference
+//! counter is present, increments it by one
+inline CudaMem::CudaMem(const CudaMem& m)
+    : flags(m.flags),
+      rows(m.rows),
+      cols(m.cols),
+      step(m.step),
+      data(m.data),
+      refcount(m.refcount),
+      datastart(m.datastart),
+      dataend(m.dataend),
+      alloc_type(m.alloc_type)
+{
+    if (refcount != 0)
+    {
+        CV_XADD(refcount, 1);
+    }
+}
+
+//! constructs a CudaMem of the given size and type; create() is called only
+//! when both dimensions are positive, otherwise the object stays empty
+inline CudaMem::CudaMem(int rows_, int cols_, int type_, AllocType alloc_type_)
+    : flags(0),
+      rows(0),
+      cols(0),
+      step(0),
+      data(0),
+      refcount(0),
+      datastart(0),
+      dataend(0),
+      alloc_type(alloc_type_)
+{
+    if (rows_ <= 0 || cols_ <= 0)
+        return;
+    create(rows_, cols_, type_);
+}
+
+//! constructs a CudaMem from a Size and type; create() is called only when
+//! both height and width are positive, otherwise the object stays empty
+inline CudaMem::CudaMem(Size size_, int type_, AllocType alloc_type_)
+    : flags(0),
+      rows(0),
+      cols(0),
+      step(0),
+      data(0),
+      refcount(0),
+      datastart(0),
+      dataend(0),
+      alloc_type(alloc_type_)
+{
+    if (size_.height <= 0 || size_.width <= 0)
+        return;
+    create(size_.height, size_.width, type_);
+}
+
+//! constructs a CudaMem by copying the Mat obtained from arr into this object
+inline CudaMem::CudaMem(InputArray arr, AllocType alloc_type_)
+    : flags(0),
+      rows(0),
+      cols(0),
+      step(0),
+      data(0),
+      refcount(0),
+      datastart(0),
+      dataend(0),
+      alloc_type(alloc_type_)
+{
+    const Mat hostMat = arr.getMat();
+    hostMat.copyTo(*this);
+}
+
+//! destructor - calls release()
+inline CudaMem::~CudaMem()
+{
+    this->release();
+}
+
+//! assignment via copy-and-swap; self-assignment is a no-op
+inline CudaMem& CudaMem::operator =(const CudaMem& m)
+{
+    if (this == &m)
+        return *this;
+    CudaMem copyOfSource(m);
+    swap(copyOfSource);
+    return *this;
+}
+
+//! exchanges every header field of this object with the corresponding field of b
+inline void CudaMem::swap(CudaMem& b)
+{
+    // type/layout description
+    std::swap(flags, b.flags);
+    std::swap(alloc_type, b.alloc_type);
+    // geometry
+    std::swap(rows, b.rows);
+    std::swap(cols, b.cols);
+    std::swap(step, b.step);
+    // storage pointers and reference counter
+    std::swap(data, b.data);
+    std::swap(refcount, b.refcount);
+    std::swap(datastart, b.datastart);
+    std::swap(dataend, b.dataend);
+}
+
+//! returns a new CudaMem with the same size, type and alloc_type, filled by
+//! copying from this object's Mat header
+inline CudaMem CudaMem::clone() const
+{
+    CudaMem duplicate(size(), type(), alloc_type);
+    const Mat sourceHeader = createMatHeader();
+    sourceHeader.copyTo(duplicate);
+    return duplicate;
+}
+
+//! Size-based overload: forwards size_.height as rows and size_.width as cols
+inline void CudaMem::create(Size size_, int type_)
+{
+    const int height = size_.height;
+    const int width = size_.width;
+    create(height, width, type_);
+}
+
+//! returns a Mat header built from this object's size, type, data pointer and step
+inline Mat CudaMem::createMatHeader() const
+{
+    const Size dims = size();
+    const int matType = type();
+    return Mat(dims, matType, data, step);
+}
+
+//! returns true when Mat::CONTINUOUS_FLAG is set in flags
+inline bool CudaMem::isContinuous() const
+{
+    const int continuityBit = flags & Mat::CONTINUOUS_FLAG;
+    return continuityBit != 0;
+}
+
+//! returns CV_ELEM_SIZE(flags), the element size
+inline size_t CudaMem::elemSize() const
+{
+    const size_t elemBytes = CV_ELEM_SIZE(flags);
+    return elemBytes;
+}
+
+//! returns CV_ELEM_SIZE1(flags), the size of one element channel
+inline size_t CudaMem::elemSize1() const
+{
+    const size_t channelBytes = CV_ELEM_SIZE1(flags);
+    return channelBytes;
+}
+
+//! returns CV_MAT_TYPE(flags), the element type
+inline int CudaMem::type() const
+{
+    const int matType = CV_MAT_TYPE(flags);
+    return matType;
+}
+
+//! returns CV_MAT_DEPTH(flags), the element depth
+inline int CudaMem::depth() const
+{
+    const int matDepth = CV_MAT_DEPTH(flags);
+    return matDepth;
+}
+
+//! returns CV_MAT_CN(flags), the number of channels
+inline int CudaMem::channels() const
+{
+    const int channelCount = CV_MAT_CN(flags);
+    return channelCount;
+}
+
+//! returns step / elemSize1()
+inline size_t CudaMem::step1() const
+{
+    const size_t channelBytes = elemSize1();
+    return step / channelBytes;
+}
+
+//! returns the matrix size: width == cols, height == rows
+inline Size CudaMem::size() const
+{
+    const int width = cols;
+    const int height = rows;
+    return Size(width, height);
+}
+
+//! returns true if the data pointer is NULL
+inline bool CudaMem::empty() const
+{
+    return !data;
+}
+
+//! free-function swap for CudaMem; delegates to the member CudaMem::swap
+static inline void swap(CudaMem& a, CudaMem& b)
+{
+    CudaMem& lhs = a;
+    lhs.swap(b);
+}
+
+//////////////////////////////// Stream ///////////////////////////////
+
+//! forwards to GpuMat::download(dst, stream) with this stream
+inline void Stream::enqueueDownload(const GpuMat& src, OutputArray dst)
+{
+    Stream& self = *this;
+    src.download(dst, self);
+}
+
+//! forwards to GpuMat::upload(src, stream) with this stream
+inline void Stream::enqueueUpload(InputArray src, GpuMat& dst)
+{
+    Stream& self = *this;
+    dst.upload(src, self);
+}
+
+//! forwards to GpuMat::copyTo(dst, stream) with this stream
+inline void Stream::enqueueCopy(const GpuMat& src, OutputArray dst)
+{
+    Stream& self = *this;
+    src.copyTo(dst, self);
+}
+
+//! forwards to GpuMat::setTo(val, stream) with this stream
+inline void Stream::enqueueMemSet(GpuMat& src, Scalar val)
+{
+    Stream& self = *this;
+    src.setTo(val, self);
+}
+
+//! forwards to GpuMat::setTo(val, mask, stream) with this stream
+inline void Stream::enqueueMemSet(GpuMat& src, Scalar val, InputArray mask)
+{
+    Stream& self = *this;
+    src.setTo(val, mask, self);
+}
+
+//! forwards to GpuMat::convertTo(dst, dtype, alpha, beta, stream) with this stream
+inline void Stream::enqueueConvert(const GpuMat& src, OutputArray dst, int dtype, double alpha, double beta)
+{
+    Stream& self = *this;
+    src.convertTo(dst, dtype, alpha, beta, self);
+}
+
+//! constructs a Stream that stores the given implementation pointer in impl_
+inline Stream::Stream(const Ptr<Impl>& impl)
+    : impl_(impl)
+{
+    // no further setup
+}
+
+//////////////////////////////// Initialization & Info ////////////////////////
+
+//! true when hasPtx(major, minor) or hasBin(major, minor) reports true
+inline bool TargetArchs::has(int major, int minor)
+{
+    if (hasPtx(major, minor))
+        return true;
+    return hasBin(major, minor);
+}
+
+//! true when hasEqualOrGreaterPtx or hasEqualOrGreaterBin reports true for (major, minor)
+inline bool TargetArchs::hasEqualOrGreater(int major, int minor)
+{
+    if (hasEqualOrGreaterPtx(major, minor))
+        return true;
+    return hasEqualOrGreaterBin(major, minor);
+}
+
+//! creates a DeviceInfo whose id is the value returned by getDevice()
+inline DeviceInfo::DeviceInfo()
+{
+    const int currentDevice = getDevice();
+    device_id_ = currentDevice;
+}
+
+//! creates a DeviceInfo for the given device id; the id must lie in
+//! [0, getCudaEnabledDeviceCount()), otherwise CV_Assert fails
+inline DeviceInfo::DeviceInfo(int device_id)
+{
+    CV_Assert( device_id >= 0 && device_id < getCudaEnabledDeviceCount() );
+    this->device_id_ = device_id;
+}
+
+//! returns the stored device id
+inline int DeviceInfo::deviceID() const
+{
+    return this->device_id_;
+}
+
+//! returns the free-memory value reported by queryMemory(); the total is discarded
+inline size_t DeviceInfo::freeMemory() const
+{
+    size_t totalBytes;
+    size_t freeBytes;
+    queryMemory(totalBytes, freeBytes);
+    return freeBytes;
+}
+
+//! returns the total-memory value reported by queryMemory(); the free amount is discarded
+inline size_t DeviceInfo::totalMemory() const
+{
+    size_t totalBytes;
+    size_t freeBytes;
+    queryMemory(totalBytes, freeBytes);
+    return totalBytes;
+}
+
+//! compares major() * 10 + minor() against feature_set; true when it is not less
+inline bool DeviceInfo::supports(FeatureSet feature_set) const
+{
+    const int tens = major() * 10;
+    const int computeVersion = tens + minor();
+    return computeVersion >= feature_set;
+}
+
+}} // namespace cv { namespace gpu {
+
+//////////////////////////////// Mat ////////////////////////////////
+
+namespace cv {
+
+//! constructs a Mat from a GpuMat: starts from an empty header, then fills
+//! it through GpuMat::download
+inline Mat::Mat(const gpu::GpuMat& m)
+    : flags(0),
+      dims(0),
+      rows(0),
+      cols(0),
+      data(0),
+      refcount(0),
+      datastart(0),
+      dataend(0),
+      datalimit(0),
+      allocator(0),
+      size(&rows)
+{
+    Mat& self = *this;
+    m.download(self);
+}
+
+}
+
+#endif // __OPENCV_CORE_GPUINL_HPP__
--- a/modules/core/include/opencv2/core/gpu_stream_accessor.hpp
+++ b/modules/core/include/opencv2/core/gpu_stream_accessor.hpp
@ -40,28 +40,38 @@
 //
 //M*/

-#ifndef __OPENCV_CUDA_STREAM_ACCESSOR_HPP__
-#define __OPENCV_CUDA_STREAM_ACCESSOR_HPP__
+#ifndef __OPENCV_CORE_GPU_STREAM_ACCESSOR_HPP__
+#define __OPENCV_CORE_GPU_STREAM_ACCESSOR_HPP__

-#include <cuda_runtime.h>
-#include "opencv2/core/cvdef.h"
+#ifndef __cplusplus
+#  error gpu_stream_accessor.hpp header must be compiled as C++
+#endif

 // This is the only header file that depends on CUDA. All other headers are independent.
 // So if you use OpenCV binaries, you do not need to install the CUDA Toolkit.
 // But if you want to use the GPU yourself, you may get the CUDA stream instance using the class below.
 // In this case you have to install the CUDA Toolkit.

+#include <cuda_runtime.h>
+#include "opencv2/core/cvdef.h"
+
 namespace cv
 {
    namespace gpu
    {
        class Stream;
+        class Event;

         // Accessor that exposes a CUDA runtime stream handle for a cv::gpu::Stream.
         // getStream() takes a Stream and returns a cudaStream_t.
         struct StreamAccessor
         {
             CV_EXPORTS static cudaStream_t getStream(const Stream& stream);
         };
+
+        // Accessor that exposes a CUDA runtime event handle for a cv::gpu::Event.
+        // getEvent() takes an Event and returns a cudaEvent_t.
+        struct EventAccessor
+        {
+            CV_EXPORTS static cudaEvent_t getEvent(const Event& event);
+        };
    }
 }

-#endif /* __OPENCV_CUDA_STREAM_ACCESSOR_HPP__ */
+#endif /* __OPENCV_CORE_GPU_STREAM_ACCESSOR_HPP__ */
--- a/modules/core/include/opencv2/core/cuda_devptrs.hpp
+++ b/modules/core/include/opencv2/core/cuda_devptrs.hpp
@ -40,10 +40,12 @@
 //
 //M*/

-#ifndef __OPENCV_CORE_DEVPTRS_HPP__
-#define __OPENCV_CORE_DEVPTRS_HPP__
+#ifndef __OPENCV_CORE_GPU_TYPES_HPP__
+#define __OPENCV_CORE_GPU_TYPES_HPP__

-#ifdef __cplusplus
+#ifndef __cplusplus
+#  error gpu_types.hpp header must be compiled as C++
+#endif

 #ifdef __CUDACC__
    #define __CV_GPU_HOST_DEVICE__ __host__ __device__ __forceinline__
@ -58,7 +60,7 @@ namespace cv
        // Simple lightweight structures that encapsulates information about an image on device.
        // It is intended to pass to nvcc-compiled code. GpuMat depends on headers that nvcc can't compile

-        template<typename T> struct DevPtr
+        template <typename T> struct DevPtr
        {
            typedef T elem_type;
            typedef int index_type;
@ -75,7 +77,7 @@ namespace cv
            __CV_GPU_HOST_DEVICE__ operator const T*() const { return data; }
        };

-        template<typename T> struct PtrSz : public DevPtr<T>
+        template <typename T> struct PtrSz : public DevPtr<T>
        {
            __CV_GPU_HOST_DEVICE__ PtrSz() : size(0) {}
            __CV_GPU_HOST_DEVICE__ PtrSz(T* data_, size_t size_) : DevPtr<T>(data_), size(size_) {}
@ -83,12 +85,12 @@ namespace cv
            size_t size;
        };

-        template<typename T> struct PtrStep : public DevPtr<T>
+        template <typename T> struct PtrStep : public DevPtr<T>
        {
            __CV_GPU_HOST_DEVICE__ PtrStep() : step(0) {}
            __CV_GPU_HOST_DEVICE__ PtrStep(T* data_, size_t step_) : DevPtr<T>(data_), step(step_) {}

-            /** \brief stride between two consecutive rows in bytes. Step is stored always and everywhere in bytes!!! */
+            //! stride between two consecutive rows in bytes. Step is stored always and everywhere in bytes!!!
            size_t step;

            __CV_GPU_HOST_DEVICE__       T* ptr(int y = 0)       { return (      T*)( (      char*)DevPtr<T>::data + y * step); }
@ -118,36 +120,7 @@ namespace cv
        typedef PtrStep<unsigned char> PtrStepb;
        typedef PtrStep<float> PtrStepf;
        typedef PtrStep<int> PtrStepi;
-
-
-#if defined __GNUC__
-    #define __CV_GPU_DEPR_BEFORE__
-    #define __CV_GPU_DEPR_AFTER__ __attribute__ ((deprecated))
-#elif defined(__MSVC__) //|| defined(__CUDACC__)
-    #pragma deprecated(DevMem2D_)
-    #define __CV_GPU_DEPR_BEFORE__ __declspec(deprecated)
-    #define __CV_GPU_DEPR_AFTER__
-#else
-    #define __CV_GPU_DEPR_BEFORE__
-    #define __CV_GPU_DEPR_AFTER__
-#endif
-
-        template <typename T> struct __CV_GPU_DEPR_BEFORE__ DevMem2D_ : public PtrStepSz<T>
-        {
-            DevMem2D_() {}
-            DevMem2D_(int rows_, int cols_, T* data_, size_t step_) : PtrStepSz<T>(rows_, cols_, data_, step_) {}
-
-            template <typename U>
-            explicit __CV_GPU_DEPR_BEFORE__ DevMem2D_(const DevMem2D_<U>& d) : PtrStepSz<T>(d.rows, d.cols, (T*)d.data, d.step) {}
-        } __CV_GPU_DEPR_AFTER__ ;
-
-        typedef DevMem2D_<unsigned char> DevMem2Db;
-        typedef DevMem2Db DevMem2D;
-        typedef DevMem2D_<float> DevMem2Df;
-        typedef DevMem2D_<int> DevMem2Di;
    }
 }

-#endif // __cplusplus
-
-#endif /* __OPENCV_CORE_DEVPTRS_HPP__ */
+#endif /* __OPENCV_CORE_GPU_TYPES_HPP__ */
--- a/modules/core/include/opencv2/core/gpumat.hpp
+++ b/modules/core/include/opencv2/core/gpumat.hpp
@ -1,722 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                          License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-
-#ifndef __OPENCV_GPUMAT_HPP__
-#define __OPENCV_GPUMAT_HPP__
-
-#include "opencv2/core.hpp"
-#include "opencv2/core/cuda_devptrs.hpp"
-
-namespace cv { namespace gpu
-{
-//////////////////////////////// CudaMem ////////////////////////////////
-// CudaMem is limited cv::Mat with page locked memory allocation.
-// Page locked memory is only needed for async and faster coping to GPU.
-// It is convertable to cv::Mat header without reference counting
-// so you can use it with other opencv functions.
-
-// Page-locks the matrix m memory and maps it for the device(s)
-CV_EXPORTS void registerPageLocked(Mat& m);
-
-// Unmaps the memory of matrix m, and makes it pageable again.
-CV_EXPORTS void unregisterPageLocked(Mat& m);
-
-class CV_EXPORTS CudaMem
-{
-public:
-    enum  { ALLOC_PAGE_LOCKED = 1, ALLOC_ZEROCOPY = 2, ALLOC_WRITE_COMBINED = 4 };
-
-    CudaMem();
-    CudaMem(const CudaMem& m);
-
-    CudaMem(int rows, int cols, int type, int _alloc_type = ALLOC_PAGE_LOCKED);
-    CudaMem(Size size, int type, int alloc_type = ALLOC_PAGE_LOCKED);
-
-
-    //! creates from cv::Mat with coping data
-    explicit CudaMem(const Mat& m, int alloc_type = ALLOC_PAGE_LOCKED);
-
-    ~CudaMem();
-
-    CudaMem& operator = (const CudaMem& m);
-
-    //! returns deep copy of the matrix, i.e. the data is copied
-    CudaMem clone() const;
-
-    //! allocates new matrix data unless the matrix already has specified size and type.
-    void create(int rows, int cols, int type, int alloc_type = ALLOC_PAGE_LOCKED);
-    void create(Size size, int type, int alloc_type = ALLOC_PAGE_LOCKED);
-
-    //! decrements reference counter and released memory if needed.
-    void release();
-
-    //! returns matrix header with disabled reference counting for CudaMem data.
-    Mat createMatHeader() const;
-    operator Mat() const;
-
-    //! maps host memory into device address space and returns GpuMat header for it. Throws exception if not supported by hardware.
-    GpuMat createGpuMatHeader() const;
-    operator GpuMat() const;
-
-    //returns if host memory can be mapperd to gpu address space;
-    static bool canMapHostMemory();
-
-    // Please see cv::Mat for descriptions
-    bool isContinuous() const;
-    size_t elemSize() const;
-    size_t elemSize1() const;
-    int type() const;
-    int depth() const;
-    int channels() const;
-    size_t step1() const;
-    Size size() const;
-    bool empty() const;
-
-
-    // Please see cv::Mat for descriptions
-    int flags;
-    int rows, cols;
-    size_t step;
-
-    uchar* data;
-    int* refcount;
-
-    uchar* datastart;
-    uchar* dataend;
-
-    int alloc_type;
-};
-
-
-//////////////////////////////// CudaStream ////////////////////////////////
-// Encapculates Cuda Stream. Provides interface for async coping.
-// Passed to each function that supports async kernel execution.
-// Reference counting is enabled
-
-class CV_EXPORTS Stream
-{
-public:
-    Stream();
-    ~Stream();
-
-    Stream(const Stream&);
-    Stream& operator =(const Stream&);
-
-    bool queryIfComplete();
-    void waitForCompletion();
-
-    //! downloads asynchronously
-    // Warning! cv::Mat must point to page locked memory (i.e. to CudaMem data or to its subMat)
-    void enqueueDownload(const GpuMat& src, CudaMem& dst);
-    void enqueueDownload(const GpuMat& src, Mat& dst);
-
-    //! uploads asynchronously
-    // Warning! cv::Mat must point to page locked memory (i.e. to CudaMem data or to its ROI)
-    void enqueueUpload(const CudaMem& src, GpuMat& dst);
-    void enqueueUpload(const Mat& src, GpuMat& dst);
-
-    //! copy asynchronously
-    void enqueueCopy(const GpuMat& src, GpuMat& dst);
-
-    //! memory set asynchronously
-    void enqueueMemSet(GpuMat& src, Scalar val);
-    void enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask);
-
-    //! converts matrix type, ex from float to uchar depending on type
-    void enqueueConvert(const GpuMat& src, GpuMat& dst, int dtype, double a = 1, double b = 0);
-
-    //! adds a callback to be called on the host after all currently enqueued items in the stream have completed
-    typedef void (*StreamCallback)(Stream& stream, int status, void* userData);
-    void enqueueHostCallback(StreamCallback callback, void* userData);
-
-    static Stream& Null();
-
-    operator bool() const;
-
-private:
-    struct Impl;
-
-    explicit Stream(Impl* impl);
-    void create();
-    void release();
-
-    Impl *impl;
-
-    friend struct StreamAccessor;
-};
-
-//////////////////////////////// Initialization & Info ////////////////////////
-
-//! This is the only function that do not throw exceptions if the library is compiled without Cuda.
-CV_EXPORTS int getCudaEnabledDeviceCount();
-
-//! Functions below throw cv::Expception if the library is compiled without Cuda.
-
-CV_EXPORTS void setDevice(int device);
-
-CV_EXPORTS int getDevice();
-
-//! Explicitly destroys and cleans up all resources associated with the current device in the current process.
-//! Any subsequent API call to this device will reinitialize the device.
-CV_EXPORTS void resetDevice();
-
-enum FeatureSet
-{
-    FEATURE_SET_COMPUTE_10 = 10,
-    FEATURE_SET_COMPUTE_11 = 11,
-    FEATURE_SET_COMPUTE_12 = 12,
-    FEATURE_SET_COMPUTE_13 = 13,
-    FEATURE_SET_COMPUTE_20 = 20,
-    FEATURE_SET_COMPUTE_21 = 21,
-    FEATURE_SET_COMPUTE_30 = 30,
-    FEATURE_SET_COMPUTE_35 = 35,
-
-    GLOBAL_ATOMICS = FEATURE_SET_COMPUTE_11,
-    SHARED_ATOMICS = FEATURE_SET_COMPUTE_12,
-    NATIVE_DOUBLE = FEATURE_SET_COMPUTE_13,
-    WARP_SHUFFLE_FUNCTIONS = FEATURE_SET_COMPUTE_30,
-    DYNAMIC_PARALLELISM = FEATURE_SET_COMPUTE_35
-};
-
-// Checks whether current device supports the given feature
-CV_EXPORTS bool deviceSupports(FeatureSet feature_set);
-
-// Gives information about what GPU archs this OpenCV GPU module was
-// compiled for
-class CV_EXPORTS TargetArchs
-{
-public:
-    static bool builtWith(FeatureSet feature_set);
-    static bool has(int major, int minor);
-    static bool hasPtx(int major, int minor);
-    static bool hasBin(int major, int minor);
-    static bool hasEqualOrLessPtx(int major, int minor);
-    static bool hasEqualOrGreater(int major, int minor);
-    static bool hasEqualOrGreaterPtx(int major, int minor);
-    static bool hasEqualOrGreaterBin(int major, int minor);
-private:
-    TargetArchs();
-};
-
-// Gives information about the given GPU
-class CV_EXPORTS DeviceInfo
-{
-public:
-    // Creates DeviceInfo object for the current GPU
-    DeviceInfo() : device_id_(getDevice()) { query(); }
-
-    // Creates DeviceInfo object for the given GPU
-    DeviceInfo(int device_id) : device_id_(device_id) { query(); }
-
-    String name() const { return name_; }
-
-    // Return compute capability versions
-    int majorVersion() const { return majorVersion_; }
-    int minorVersion() const { return minorVersion_; }
-
-    int multiProcessorCount() const { return multi_processor_count_; }
-
-    size_t sharedMemPerBlock() const;
-
-    void queryMemory(size_t& totalMemory, size_t& freeMemory) const;
-    size_t freeMemory() const;
-    size_t totalMemory() const;
-
-    // Checks whether device supports the given feature
-    bool supports(FeatureSet feature_set) const;
-
-    // Checks whether the GPU module can be run on the given device
-    bool isCompatible() const;
-
-    int deviceID() const { return device_id_; }
-
-private:
-    void query();
-
-    int device_id_;
-
-    String name_;
-    int multi_processor_count_;
-    int majorVersion_;
-    int minorVersion_;
-};
-
-CV_EXPORTS void printCudaDeviceInfo(int device);
-
-CV_EXPORTS void printShortCudaDeviceInfo(int device);
-
-//////////////////////////////// GpuMat ///////////////////////////////
-
-//! Smart pointer for GPU memory with reference counting. Its interface is mostly similar with cv::Mat.
-class CV_EXPORTS GpuMat
-{
-public:
-    //! default constructor
-    GpuMat();
-
-    //! constructs GpuMatrix of the specified size and type (_type is CV_8UC1, CV_64FC3, CV_32SC(12) etc.)
-    GpuMat(int rows, int cols, int type);
-    GpuMat(Size size, int type);
-
-    //! constucts GpuMatrix and fills it with the specified value _s.
-    GpuMat(int rows, int cols, int type, Scalar s);
-    GpuMat(Size size, int type, Scalar s);
-
-    //! copy constructor
-    GpuMat(const GpuMat& m);
-
-    //! constructor for GpuMatrix headers pointing to user-allocated data
-    GpuMat(int rows, int cols, int type, void* data, size_t step = Mat::AUTO_STEP);
-    GpuMat(Size size, int type, void* data, size_t step = Mat::AUTO_STEP);
-
-    //! creates a matrix header for a part of the bigger matrix
-    GpuMat(const GpuMat& m, Range rowRange, Range colRange);
-    GpuMat(const GpuMat& m, Rect roi);
-
-    //! builds GpuMat from Mat. Perfom blocking upload to device.
-    explicit GpuMat(const Mat& m);
-
-    //! destructor - calls release()
-    ~GpuMat();
-
-    //! assignment operators
-    GpuMat& operator = (const GpuMat& m);
-
-    //! pefroms blocking upload data to GpuMat.
-    void upload(const Mat& m);
-
-    //! downloads data from device to host memory. Blocking calls.
-    void download(Mat& m) const;
-
-    //! returns a new GpuMatrix header for the specified row
-    GpuMat row(int y) const;
-    //! returns a new GpuMatrix header for the specified column
-    GpuMat col(int x) const;
-    //! ... for the specified row span
-    GpuMat rowRange(int startrow, int endrow) const;
-    GpuMat rowRange(Range r) const;
-    //! ... for the specified column span
-    GpuMat colRange(int startcol, int endcol) const;
-    GpuMat colRange(Range r) const;
-
-    //! returns deep copy of the GpuMatrix, i.e. the data is copied
-    GpuMat clone() const;
-    //! copies the GpuMatrix content to "m".
-    // It calls m.create(this->size(), this->type()).
-    void copyTo(GpuMat& m) const;
-    //! copies those GpuMatrix elements to "m" that are marked with non-zero mask elements.
-    void copyTo(GpuMat& m, const GpuMat& mask) const;
-    //! converts GpuMatrix to another datatype with optional scalng. See cvConvertScale.
-    void convertTo(GpuMat& m, int rtype, double alpha = 1, double beta = 0) const;
-
-    void assignTo(GpuMat& m, int type=-1) const;
-
-    //! sets every GpuMatrix element to s
-    GpuMat& operator = (Scalar s);
-    //! sets some of the GpuMatrix elements to s, according to the mask
-    GpuMat& setTo(Scalar s, const GpuMat& mask = GpuMat());
-    //! creates alternative GpuMatrix header for the same data, with different
-    // number of channels and/or different number of rows. see cvReshape.
-    GpuMat reshape(int cn, int rows = 0) const;
-
-    //! allocates new GpuMatrix data unless the GpuMatrix already has specified size and type.
-    // previous data is unreferenced if needed.
-    void create(int rows, int cols, int type);
-    void create(Size size, int type);
-    //! decreases reference counter;
-    // deallocate the data when reference counter reaches 0.
-    void release();
-
-    //! swaps with other smart pointer
-    void swap(GpuMat& mat);
-
-    //! locates GpuMatrix header within a parent GpuMatrix. See below
-    void locateROI(Size& wholeSize, Point& ofs) const;
-    //! moves/resizes the current GpuMatrix ROI inside the parent GpuMatrix.
-    GpuMat& adjustROI(int dtop, int dbottom, int dleft, int dright);
-    //! extracts a rectangular sub-GpuMatrix
-    // (this is a generalized form of row, rowRange etc.)
-    GpuMat operator()(Range rowRange, Range colRange) const;
-    GpuMat operator()(Rect roi) const;
-
-    //! returns true iff the GpuMatrix data is continuous
-    // (i.e. when there are no gaps between successive rows).
-    // similar to CV_IS_GpuMat_CONT(cvGpuMat->type)
-    bool isContinuous() const;
-    //! returns element size in bytes,
-    // similar to CV_ELEM_SIZE(cvMat->type)
-    size_t elemSize() const;
-    //! returns the size of element channel in bytes.
-    size_t elemSize1() const;
-    //! returns element type, similar to CV_MAT_TYPE(cvMat->type)
-    int type() const;
-    //! returns element type, similar to CV_MAT_DEPTH(cvMat->type)
-    int depth() const;
-    //! returns element type, similar to CV_MAT_CN(cvMat->type)
-    int channels() const;
-    //! returns step/elemSize1()
-    size_t step1() const;
-    //! returns GpuMatrix size:
-    // width == number of columns, height == number of rows
-    Size size() const;
-    //! returns true if GpuMatrix data is NULL
-    bool empty() const;
-
-    //! returns pointer to y-th row
-    uchar* ptr(int y = 0);
-    const uchar* ptr(int y = 0) const;
-
-    //! template version of the above method
-    template<typename _Tp> _Tp* ptr(int y = 0);
-    template<typename _Tp> const _Tp* ptr(int y = 0) const;
-
-    template <typename _Tp> operator PtrStepSz<_Tp>() const;
-    template <typename _Tp> operator PtrStep<_Tp>() const;
-
-    // Deprecated function
-    __CV_GPU_DEPR_BEFORE__ template <typename _Tp> operator DevMem2D_<_Tp>() const __CV_GPU_DEPR_AFTER__;
-    #undef __CV_GPU_DEPR_BEFORE__
-    #undef __CV_GPU_DEPR_AFTER__
-
-    /*! includes several bit-fields:
-    - the magic signature
-    - continuity flag
-    - depth
-    - number of channels
-    */
-    int flags;
-
-    //! the number of rows and columns
-    int rows, cols;
-
-    //! a distance between successive rows in bytes; includes the gap if any
-    size_t step;
-
-    //! pointer to the data
-    uchar* data;
-
-    //! pointer to the reference counter;
-    // when GpuMatrix points to user-allocated data, the pointer is NULL
-    int* refcount;
-
-    //! helper fields used in locateROI and adjustROI
-    uchar* datastart;
-    uchar* dataend;
-};
-
-//! Creates continuous GPU matrix
-CV_EXPORTS void createContinuous(int rows, int cols, int type, GpuMat& m);
-
-//! Ensures that size of the given matrix is not less than (rows, cols) size
-//! and matrix type is match specified one too
-CV_EXPORTS void ensureSizeIsEnough(int rows, int cols, int type, GpuMat& m);
-
-CV_EXPORTS GpuMat allocMatFromBuf(int rows, int cols, int type, GpuMat &mat);
-
-////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////
-
-inline
-GpuMat::GpuMat()
-    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
-{}
-
-inline
-GpuMat::GpuMat(int rows_, int cols_, int type_)
-    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
-{
-    if (rows_ > 0 && cols_ > 0)
-        create(rows_, cols_, type_);
-}
-
-inline
-GpuMat::GpuMat(Size size_, int type_)
-    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
-{
-    if (size_.height > 0 && size_.width > 0)
-        create(size_.height, size_.width, type_);
-}
-
-inline
-GpuMat::GpuMat(int rows_, int cols_, int type_, Scalar s_)
-    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
-{
-    if (rows_ > 0 && cols_ > 0)
-    {
-        create(rows_, cols_, type_);
-        setTo(s_);
-    }
-}
-
-inline
-GpuMat::GpuMat(Size size_, int type_, Scalar s_)
-    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
-{
-    if (size_.height > 0 && size_.width > 0)
-    {
-        create(size_.height, size_.width, type_);
-        setTo(s_);
-    }
-}
-
-inline
-GpuMat::~GpuMat()
-{
-    release();
-}
-
-inline
-GpuMat GpuMat::clone() const
-{
-    GpuMat m;
-    copyTo(m);
-    return m;
-}
-
-inline
-void GpuMat::assignTo(GpuMat& m, int _type) const
-{
-    if (_type < 0)
-        m = *this;
-    else
-        convertTo(m, _type);
-}
-
-inline
-size_t GpuMat::step1() const
-{
-    return step / elemSize1();
-}
-
-inline
-bool GpuMat::empty() const
-{
-    return data == 0;
-}
-
-template<typename _Tp> inline
-_Tp* GpuMat::ptr(int y)
-{
-    return (_Tp*)ptr(y);
-}
-
-template<typename _Tp> inline
-const _Tp* GpuMat::ptr(int y) const
-{
-    return (const _Tp*)ptr(y);
-}
-
-inline
-GpuMat GpuMat::row(int y) const
-{
-    return GpuMat(*this, Range(y, y+1), Range::all());
-}
-
-inline
-GpuMat GpuMat::col(int x) const
-{
-    return GpuMat(*this, Range::all(), Range(x, x+1));
-}
-
-inline
-GpuMat GpuMat::rowRange(int startrow, int endrow) const
-{
-    return GpuMat(*this, Range(startrow, endrow), Range::all());
-}
-
-inline
-GpuMat GpuMat::rowRange(Range r) const
-{
-    return GpuMat(*this, r, Range::all());
-}
-
-inline
-GpuMat GpuMat::colRange(int startcol, int endcol) const
-{
-    return GpuMat(*this, Range::all(), Range(startcol, endcol));
-}
-
-inline
-GpuMat GpuMat::colRange(Range r) const
-{
-    return GpuMat(*this, Range::all(), r);
-}
-
-inline
-void GpuMat::create(Size size_, int type_)
-{
-    create(size_.height, size_.width, type_);
-}
-
-inline
-GpuMat GpuMat::operator()(Range _rowRange, Range _colRange) const
-{
-    return GpuMat(*this, _rowRange, _colRange);
-}
-
-inline
-GpuMat GpuMat::operator()(Rect roi) const
-{
-    return GpuMat(*this, roi);
-}
-
-inline
-bool GpuMat::isContinuous() const
-{
-    return (flags & Mat::CONTINUOUS_FLAG) != 0;
-}
-
-inline
-size_t GpuMat::elemSize() const
-{
-    return CV_ELEM_SIZE(flags);
-}
-
-inline
-size_t GpuMat::elemSize1() const
-{
-    return CV_ELEM_SIZE1(flags);
-}
-
-inline
-int GpuMat::type() const
-{
-    return CV_MAT_TYPE(flags);
-}
-
-inline
-int GpuMat::depth() const
-{
-    return CV_MAT_DEPTH(flags);
-}
-
-inline
-int GpuMat::channels() const
-{
-    return CV_MAT_CN(flags);
-}
-
-inline
-Size GpuMat::size() const
-{
-    return Size(cols, rows);
-}
-
-inline
-uchar* GpuMat::ptr(int y)
-{
-    CV_DbgAssert((unsigned)y < (unsigned)rows);
-    return data + step * y;
-}
-
-inline
-const uchar* GpuMat::ptr(int y) const
-{
-    CV_DbgAssert((unsigned)y < (unsigned)rows);
-    return data + step * y;
-}
-
-inline
-GpuMat& GpuMat::operator = (Scalar s)
-{
-    setTo(s);
-    return *this;
-}
-
-template <class T> inline
-GpuMat::operator PtrStepSz<T>() const
-{
-    return PtrStepSz<T>(rows, cols, (T*)data, step);
-}
-
-template <class T> inline
-GpuMat::operator PtrStep<T>() const
-{
-    return PtrStep<T>((T*)data, step);
-}
-
-template <class T> inline
-GpuMat::operator DevMem2D_<T>() const
-{
-    return DevMem2D_<T>(rows, cols, (T*)data, step);
-}
-
-static inline
-void swap(GpuMat& a, GpuMat& b)
-{
-    a.swap(b);
-}
-
-static inline
-GpuMat createContinuous(int rows, int cols, int type)
-{
-    GpuMat m;
-    createContinuous(rows, cols, type, m);
-    return m;
-}
-
-static inline
-void createContinuous(Size size, int type, GpuMat& m)
-{
-    createContinuous(size.height, size.width, type, m);
-}
-
-static inline
-GpuMat createContinuous(Size size, int type)
-{
-    GpuMat m;
-    createContinuous(size, type, m);
-    return m;
-}
-
-static inline
-void ensureSizeIsEnough(Size size, int type, GpuMat& m)
-{
-    ensureSizeIsEnough(size.height, size.width, type, m);
-}
-
-}} // cv::gpu
-
-#endif // __OPENCV_GPUMAT_HPP__
--- a/modules/core/include/opencv2/core/mat.hpp
+++ b/modules/core/include/opencv2/core/mat.hpp
@ -77,7 +77,7 @@ public:
        STD_VECTOR_MAT    = 5 << KIND_SHIFT,
        EXPR              = 6 << KIND_SHIFT,
        OPENGL_BUFFER     = 7 << KIND_SHIFT,
-        OPENGL_TEXTURE    = 8 << KIND_SHIFT,
+        CUDA_MEM          = 8 << KIND_SHIFT,
        GPU_MAT           = 9 << KIND_SHIFT
    };

@ -94,13 +94,12 @@ public:
    _InputArray(const double& val);
    _InputArray(const gpu::GpuMat& d_mat);
    _InputArray(const ogl::Buffer& buf);
-    _InputArray(const ogl::Texture2D& tex);
+    _InputArray(const gpu::CudaMem& cuda_mem);

    virtual Mat getMat(int i=-1) const;
    virtual void getMatVector(std::vector<Mat>& mv) const;
    virtual gpu::GpuMat getGpuMat() const;
    virtual ogl::Buffer getOGlBuffer() const;
-    virtual ogl::Texture2D getOGlTexture2D() const;

    virtual int kind() const;
    virtual Size size(int i=-1) const;
@ -143,7 +142,7 @@ public:
    _OutputArray(std::vector<Mat>& vec);
    _OutputArray(gpu::GpuMat& d_mat);
    _OutputArray(ogl::Buffer& buf);
-    _OutputArray(ogl::Texture2D& tex);
+    _OutputArray(gpu::CudaMem& cuda_mem);
    template<typename _Tp> _OutputArray(std::vector<_Tp>& vec);
    template<typename _Tp> _OutputArray(std::vector<std::vector<_Tp> >& vec);
    template<typename _Tp> _OutputArray(std::vector<Mat_<_Tp> >& vec);
@ -155,7 +154,7 @@ public:
    _OutputArray(const std::vector<Mat>& vec);
    _OutputArray(const gpu::GpuMat& d_mat);
    _OutputArray(const ogl::Buffer& buf);
-    _OutputArray(const ogl::Texture2D& tex);
+    _OutputArray(const gpu::CudaMem& cuda_mem);
    template<typename _Tp> _OutputArray(const std::vector<_Tp>& vec);
    template<typename _Tp> _OutputArray(const std::vector<std::vector<_Tp> >& vec);
    template<typename _Tp> _OutputArray(const std::vector<Mat_<_Tp> >& vec);
@ -169,7 +168,7 @@ public:
    virtual Mat& getMatRef(int i=-1) const;
    virtual gpu::GpuMat& getGpuMatRef() const;
    virtual ogl::Buffer& getOGlBufferRef() const;
-    virtual ogl::Texture2D& getOGlTexture2DRef() const;
+    virtual gpu::CudaMem& getCudaMemRef() const;
    virtual void create(Size sz, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const;
    virtual void create(int rows, int cols, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const;
    virtual void create(int dims, const int* size, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const;
--- a/modules/core/include/opencv2/core/opengl.hpp
+++ b/modules/core/include/opencv2/core/opengl.hpp
@ -40,8 +40,12 @@
 //
 //M*/

-#ifndef __OPENCV_OPENGL_INTEROP_HPP__
-#define __OPENCV_OPENGL_INTEROP_HPP__
+#ifndef __OPENCV_CORE_OPENGL_HPP__
+#define __OPENCV_CORE_OPENGL_HPP__
+
+#ifndef __cplusplus
+#  error opengl.hpp header must be compiled as C++
+#endif

 #include "opencv2/core.hpp"

@ -84,7 +88,7 @@ public:

    //! create buffer
    void create(int arows, int acols, int atype, Target target = ARRAY_BUFFER, bool autoRelease = false);
-    void create(Size asize, int atype, Target target = ARRAY_BUFFER, bool autoRelease = false) { create(asize.height, asize.width, atype, target, autoRelease); }
+    void create(Size asize, int atype, Target target = ARRAY_BUFFER, bool autoRelease = false);

    //! release memory and delete buffer object
    void release();
@ -92,11 +96,15 @@ public:
    //! set auto release mode (if true, release will be called in object's destructor)
    void setAutoRelease(bool flag);

-    //! copy from host/device memory
+    //! copy from host/device memory (blocking)
    void copyFrom(InputArray arr, Target target = ARRAY_BUFFER, bool autoRelease = false);
+    //! copy from device memory (non blocking)
+    void copyFrom(InputArray arr, gpu::Stream& stream, Target target = ARRAY_BUFFER, bool autoRelease = false);

-    //! copy to host/device memory
-    void copyTo(OutputArray arr, Target target = ARRAY_BUFFER, bool autoRelease = false) const;
+    //! copy to host/device memory (blocking)
+    void copyTo(OutputArray arr) const;
+    //! copy to device memory (non blocking)
+    void copyTo(OutputArray arr, gpu::Stream& stream) const;

    //! create copy of current buffer
    Buffer clone(Target target = ARRAY_BUFFER, bool autoRelease = false) const;
@ -111,21 +119,26 @@ public:
    Mat mapHost(Access access);
    void unmapHost();

-    //! map to device memory
+    //! map to device memory (blocking)
    gpu::GpuMat mapDevice();
    void unmapDevice();

-    int rows() const { return rows_; }
-    int cols() const { return cols_; }
-    Size size() const { return Size(cols_, rows_); }
-    bool empty() const { return rows_ == 0 || cols_ == 0; }
+    //! map to device memory (non blocking)
+    gpu::GpuMat mapDevice(gpu::Stream& stream);
+    void unmapDevice(gpu::Stream& stream);

-    int type() const { return type_; }
-    int depth() const { return CV_MAT_DEPTH(type_); }
-    int channels() const { return CV_MAT_CN(type_); }
-    int elemSize() const { return CV_ELEM_SIZE(type_); }
-    int elemSize1() const { return CV_ELEM_SIZE1(type_); }
+    int rows() const;
+    int cols() const;
+    Size size() const;
+    bool empty() const;

+    int type() const;
+    int depth() const;
+    int channels() const;
+    int elemSize() const;
+    int elemSize1() const;
+
+    //! get OpenGL object id
    unsigned int bufId() const;

    class Impl;
@ -165,7 +178,7 @@ public:

    //! create texture
    void create(int arows, int acols, Format aformat, bool autoRelease = false);
-    void create(Size asize, Format aformat, bool autoRelease = false) { create(asize.height, asize.width, aformat, autoRelease); }
+    void create(Size asize, Format aformat, bool autoRelease = false);

    //! release memory and delete texture object
    void release();
@ -182,13 +195,14 @@ public:
    //! bind texture to current active texture unit for GL_TEXTURE_2D target
    void bind() const;

-    int rows() const { return rows_; }
-    int cols() const { return cols_; }
-    Size size() const { return Size(cols_, rows_); }
-    bool empty() const { return rows_ == 0 || cols_ == 0; }
+    int rows() const;
+    int cols() const;
+    Size size() const;
+    bool empty() const;

-    Format format() const { return format_; }
+    Format format() const;

+    //! get OpenGL object id
    unsigned int texId() const;

    class Impl;
@ -224,8 +238,8 @@ public:

    void bind() const;

-    int size() const { return size_; }
-    bool empty() const { return size_ == 0; }
+    int size() const;
+    bool empty() const;

 private:
    int size_;
@ -260,14 +274,14 @@ enum {
 CV_EXPORTS void render(const Arrays& arr, int mode = POINTS, Scalar color = Scalar::all(255));
 CV_EXPORTS void render(const Arrays& arr, InputArray indices, int mode = POINTS, Scalar color = Scalar::all(255));

-}} // namespace cv::gl
+}} // namespace cv::ogl

 namespace cv { namespace gpu {

 //! set a CUDA device to use OpenGL interoperability
 CV_EXPORTS void setGlDevice(int device = 0);

-}} // cv::gpu
+}}

 namespace cv {

@ -276,4 +290,149 @@ template <> CV_EXPORTS void Ptr<cv::ogl::Texture2D::Impl>::delete_obj();

 }

-#endif // __OPENCV_OPENGL_INTEROP_HPP__
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+
+inline
+cv::ogl::Buffer::Buffer(int arows, int acols, int atype, Target target, bool autoRelease) : rows_(0), cols_(0), type_(0)
+{
+    create(arows, acols, atype, target, autoRelease);
+}
+
+inline
+cv::ogl::Buffer::Buffer(Size asize, int atype, Target target, bool autoRelease) : rows_(0), cols_(0), type_(0)
+{
+    create(asize, atype, target, autoRelease);
+}
+
+inline
+void cv::ogl::Buffer::create(Size asize, int atype, Target target, bool autoRelease)
+{
+    create(asize.height, asize.width, atype, target, autoRelease);
+}
+
+inline
+int cv::ogl::Buffer::rows() const
+{
+    return rows_;
+}
+
+inline
+int cv::ogl::Buffer::cols() const
+{
+    return cols_;
+}
+
+inline
+cv::Size cv::ogl::Buffer::size() const
+{
+    return Size(cols_, rows_);
+}
+
+inline
+bool cv::ogl::Buffer::empty() const
+{
+    return rows_ == 0 || cols_ == 0;
+}
+
+inline
+int cv::ogl::Buffer::type() const
+{
+    return type_;
+}
+
+inline
+int cv::ogl::Buffer::depth() const
+{
+    return CV_MAT_DEPTH(type_);
+}
+
+inline
+int cv::ogl::Buffer::channels() const
+{
+    return CV_MAT_CN(type_);
+}
+
+inline
+int cv::ogl::Buffer::elemSize() const
+{
+    return CV_ELEM_SIZE(type_);
+}
+
+inline
+int cv::ogl::Buffer::elemSize1() const
+{
+    return CV_ELEM_SIZE1(type_);
+}
+
+///////
+
+inline
+cv::ogl::Texture2D::Texture2D(int arows, int acols, Format aformat, bool autoRelease) : rows_(0), cols_(0), format_(NONE)
+{
+    create(arows, acols, aformat, autoRelease);
+}
+
+inline
+cv::ogl::Texture2D::Texture2D(Size asize, Format aformat, bool autoRelease) : rows_(0), cols_(0), format_(NONE)
+{
+    create(asize, aformat, autoRelease);
+}
+
+inline
+void cv::ogl::Texture2D::create(Size asize, Format aformat, bool autoRelease)
+{
+    create(asize.height, asize.width, aformat, autoRelease);
+}
+
+inline
+int cv::ogl::Texture2D::rows() const
+{
+    return rows_;
+}
+
+inline
+int cv::ogl::Texture2D::cols() const
+{
+    return cols_;
+}
+
+inline
+cv::Size cv::ogl::Texture2D::size() const
+{
+    return Size(cols_, rows_);
+}
+
+inline
+bool cv::ogl::Texture2D::empty() const
+{
+    return rows_ == 0 || cols_ == 0;
+}
+
+inline
+cv::ogl::Texture2D::Format cv::ogl::Texture2D::format() const
+{
+    return format_;
+}
+
+///////
+
+inline
+cv::ogl::Arrays::Arrays() : size_(0)
+{
+}
+
+inline
+int cv::ogl::Arrays::size() const
+{
+    return size_;
+}
+
+inline
+bool cv::ogl::Arrays::empty() const
+{
+    return size_ == 0;
+}
+
+#endif /* __OPENCV_CORE_OPENGL_HPP__ */
--- a/modules/core/include/opencv2/core/private.gpu.hpp
+++ b/modules/core/include/opencv2/core/private.gpu.hpp
@ -41,8 +41,8 @@
 //
 //M*/

-#ifndef __OPENCV_CORE_GPU_PRIVATE_HPP__
-#define __OPENCV_CORE_GPU_PRIVATE_HPP__
+#ifndef __OPENCV_CORE_PRIVATE_GPU_HPP__
+#define __OPENCV_CORE_PRIVATE_GPU_HPP__

 #ifndef __OPENCV_BUILD
 #  error this is a private header which should not be used from outside of the OpenCV library
@ -53,11 +53,13 @@
 #include "opencv2/core/cvdef.h"
 #include "opencv2/core/base.hpp"

+#include "opencv2/core/gpu.hpp"
+
 #ifdef HAVE_CUDA
 #  include <cuda.h>
 #  include <cuda_runtime.h>
 #  include <npp.h>
-#  include "opencv2/core/stream_accessor.hpp"
+#  include "opencv2/core/gpu_stream_accessor.hpp"
 #  include "opencv2/core/cuda/common.hpp"

 #  define NPP_VERSION (NPP_VERSION_MAJOR * 1000 + NPP_VERSION_MINOR * 100 + NPP_VERSION_BUILD)
--- a/modules/core/src/cuda/matrix_operations.cu
+++ b/modules/core/src/cuda/matrix_operations.cu
@ -44,188 +44,113 @@
 #include "opencv2/core/cuda/transform.hpp"
 #include "opencv2/core/cuda/functional.hpp"
 #include "opencv2/core/cuda/type_traits.hpp"
+#include "opencv2/core/cuda/vec_traits.hpp"
+
+#include "matrix_operations.hpp"

 namespace cv { namespace gpu { namespace cudev
 {
-    void writeScalar(const uchar*);
-    void writeScalar(const schar*);
-    void writeScalar(const ushort*);
-    void writeScalar(const short int*);
-    void writeScalar(const int*);
-    void writeScalar(const float*);
-    void writeScalar(const double*);
-    void copyToWithMask_gpu(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream);
-    void convert_gpu(PtrStepSzb, int, PtrStepSzb, int, double, double, cudaStream_t);
-}}}
-
-namespace cv { namespace gpu { namespace cudev
-{
-    template <typename T> struct shift_and_sizeof;
-    template <> struct shift_and_sizeof<signed char> { enum { shift = 0 }; };
-    template <> struct shift_and_sizeof<unsigned char> { enum { shift = 0 }; };
-    template <> struct shift_and_sizeof<short> { enum { shift = 1 }; };
-    template <> struct shift_and_sizeof<unsigned short> { enum { shift = 1 }; };
-    template <> struct shift_and_sizeof<int> { enum { shift = 2 }; };
-    template <> struct shift_and_sizeof<float> { enum { shift = 2 }; };
-    template <> struct shift_and_sizeof<double> { enum { shift = 3 }; };
-
-    ///////////////////////////////////////////////////////////////////////////
-    ////////////////////////////////// CopyTo /////////////////////////////////
    ///////////////////////////////////////////////////////////////////////////
+    // copyWithMask

-    template <typename T> void copyToWithMask(PtrStepSzb src, PtrStepSzb dst, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream)
+    template <typename T>
+    void copyWithMask(PtrStepSzb src, PtrStepSzb dst, int cn, PtrStepSzb mask, bool multiChannelMask, cudaStream_t stream)
    {
-        if (colorMask)
-            cv::gpu::cudev::transform((PtrStepSz<T>)src, (PtrStepSz<T>)dst, identity<T>(), SingleMask(mask), stream);
+        if (multiChannelMask)
+            cv::gpu::cudev::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, identity<T>(), SingleMask(mask), stream);
        else
-            cv::gpu::cudev::transform((PtrStepSz<T>)src, (PtrStepSz<T>)dst, identity<T>(), SingleMaskChannels(mask, cn), stream);
+            cv::gpu::cudev::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, identity<T>(), SingleMaskChannels(mask, cn), stream);
    }

-    void copyToWithMask_gpu(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream)
+    void copyWithMask(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool multiChannelMask, cudaStream_t stream)
    {
-        typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream);
+        typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, int cn, PtrStepSzb mask, bool multiChannelMask, cudaStream_t stream);

-        static func_t tab[] =
+        static const func_t tab[] =
        {
            0,
-            copyToWithMask<unsigned char>,
-            copyToWithMask<unsigned short>,
+            copyWithMask<uchar>,
+            copyWithMask<ushort>,
            0,
-            copyToWithMask<int>,
+            copyWithMask<int>,
            0,
            0,
            0,
-            copyToWithMask<double>
+            copyWithMask<double>
        };

-        tab[elemSize1](src, dst, cn, mask, colorMask, stream);
+        const func_t func = tab[elemSize1];
+        CV_DbgAssert( func != 0 );
+
+        func(src, dst, cn, mask, multiChannelMask, stream);
    }

    ///////////////////////////////////////////////////////////////////////////
-    ////////////////////////////////// SetTo //////////////////////////////////
-    ///////////////////////////////////////////////////////////////////////////
+    // set

-    __constant__ uchar scalar_8u[4];
-    __constant__ schar scalar_8s[4];
-    __constant__ ushort scalar_16u[4];
-    __constant__ short scalar_16s[4];
-    __constant__ int scalar_32s[4];
-    __constant__ float scalar_32f[4];
-    __constant__ double scalar_64f[4];
+    template<typename T, class Mask>
+    __global__ void set(PtrStepSz<T> mat, const Mask mask, const int channels, const typename TypeVec<T, 4>::vec_type value)
+    {
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;

-    template <typename T> __device__ __forceinline__ T readScalar(int i);
-    template <> __device__ __forceinline__ uchar readScalar<uchar>(int i) {return scalar_8u[i];}
-    template <> __device__ __forceinline__ schar readScalar<schar>(int i) {return scalar_8s[i];}
-    template <> __device__ __forceinline__ ushort readScalar<ushort>(int i) {return scalar_16u[i];}
-    template <> __device__ __forceinline__ short readScalar<short>(int i) {return scalar_16s[i];}
-    template <> __device__ __forceinline__ int readScalar<int>(int i) {return scalar_32s[i];}
-    template <> __device__ __forceinline__ float readScalar<float>(int i) {return scalar_32f[i];}
-    template <> __device__ __forceinline__ double readScalar<double>(int i) {return scalar_64f[i];}
+        if (x >= mat.cols * channels || y >= mat.rows)
+            return;

-    void writeScalar(const uchar* vals)
-    {
-        cudaSafeCall( cudaMemcpyToSymbol(scalar_8u, vals, sizeof(uchar) * 4) );
-    }
-    void writeScalar(const schar* vals)
-    {
-        cudaSafeCall( cudaMemcpyToSymbol(scalar_8s, vals, sizeof(schar) * 4) );
-    }
-    void writeScalar(const ushort* vals)
-    {
-        cudaSafeCall( cudaMemcpyToSymbol(scalar_16u, vals, sizeof(ushort) * 4) );
-    }
-    void writeScalar(const short* vals)
-    {
-        cudaSafeCall( cudaMemcpyToSymbol(scalar_16s, vals, sizeof(short) * 4) );
-    }
-    void writeScalar(const int* vals)
-    {
-        cudaSafeCall( cudaMemcpyToSymbol(scalar_32s, vals, sizeof(int) * 4) );
-    }
-    void writeScalar(const float* vals)
-    {
-        cudaSafeCall( cudaMemcpyToSymbol(scalar_32f, vals, sizeof(float) * 4) );
-    }
-    void writeScalar(const double* vals)
-    {
-        cudaSafeCall( cudaMemcpyToSymbol(scalar_64f, vals, sizeof(double) * 4) );
+        const T scalar[4] = {value.x, value.y, value.z, value.w};
+
+        if (mask(y, x / channels))
+            mat(y, x) = scalar[x % channels];
    }

-    template<typename T>
-    __global__ void set_to_without_mask(T* mat, int cols, int rows, size_t step, int channels)
-    {
-        size_t x = blockIdx.x * blockDim.x + threadIdx.x;
-        size_t y = blockIdx.y * blockDim.y + threadIdx.y;
-
-        if ((x < cols * channels ) && (y < rows))
-        {
-            size_t idx = y * ( step >> shift_and_sizeof<T>::shift ) + x;
-            mat[idx] = readScalar<T>(x % channels);
-        }
-    }
-
-    template<typename T>
-    __global__ void set_to_with_mask(T* mat, const uchar* mask, int cols, int rows, size_t step, int channels, size_t step_mask)
-    {
-        size_t x = blockIdx.x * blockDim.x + threadIdx.x;
-        size_t y = blockIdx.y * blockDim.y + threadIdx.y;
-
-        if ((x < cols * channels ) && (y < rows))
-            if (mask[y * step_mask + x / channels] != 0)
-            {
-                size_t idx = y * ( step >> shift_and_sizeof<T>::shift ) + x;
-                mat[idx] = readScalar<T>(x % channels);
-            }
-    }
    template <typename T>
-    void set_to_gpu(PtrStepSzb mat, const T* scalar, PtrStepSzb mask, int channels, cudaStream_t stream)
+    void set(PtrStepSz<T> mat, const T* scalar, int channels, cudaStream_t stream)
    {
-        writeScalar(scalar);
+        typedef typename TypeVec<T, 4>::vec_type scalar_t;

-        dim3 threadsPerBlock(32, 8, 1);
-        dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);
+        dim3 block(32, 8);
+        dim3 grid(divUp(mat.cols * channels, block.x), divUp(mat.rows, block.y));

-        set_to_with_mask<T><<<numBlocks, threadsPerBlock, 0, stream>>>((T*)mat.data, (uchar*)mask.data, mat.cols, mat.rows, mat.step, channels, mask.step);
+        set<T><<<grid, block, 0, stream>>>(mat, WithOutMask(), channels, VecTraits<scalar_t>::make(scalar));
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall ( cudaDeviceSynchronize() );
    }

-    template void set_to_gpu<uchar >(PtrStepSzb mat, const uchar*  scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
-    template void set_to_gpu<schar >(PtrStepSzb mat, const schar*  scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
-    template void set_to_gpu<ushort>(PtrStepSzb mat, const ushort* scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
-    template void set_to_gpu<short >(PtrStepSzb mat, const short*  scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
-    template void set_to_gpu<int   >(PtrStepSzb mat, const int*    scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
-    template void set_to_gpu<float >(PtrStepSzb mat, const float*  scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
-    template void set_to_gpu<double>(PtrStepSzb mat, const double* scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
+    template void set<uchar >(PtrStepSz<uchar > mat, const uchar*  scalar, int channels, cudaStream_t stream);
+    template void set<schar >(PtrStepSz<schar > mat, const schar*  scalar, int channels, cudaStream_t stream);
+    template void set<ushort>(PtrStepSz<ushort> mat, const ushort* scalar, int channels, cudaStream_t stream);
+    template void set<short >(PtrStepSz<short > mat, const short*  scalar, int channels, cudaStream_t stream);
+    template void set<int   >(PtrStepSz<int   > mat, const int*    scalar, int channels, cudaStream_t stream);
+    template void set<float >(PtrStepSz<float > mat, const float*  scalar, int channels, cudaStream_t stream);
+    template void set<double>(PtrStepSz<double> mat, const double* scalar, int channels, cudaStream_t stream);

    template <typename T>
-    void set_to_gpu(PtrStepSzb mat, const T* scalar, int channels, cudaStream_t stream)
+    void set(PtrStepSz<T> mat, const T* scalar, PtrStepSzb mask, int channels, cudaStream_t stream)
    {
-        writeScalar(scalar);
+        typedef typename TypeVec<T, 4>::vec_type scalar_t;

-        dim3 threadsPerBlock(32, 8, 1);
-        dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);
+        dim3 block(32, 8);
+        dim3 grid(divUp(mat.cols * channels, block.x), divUp(mat.rows, block.y));

-        set_to_without_mask<T><<<numBlocks, threadsPerBlock, 0, stream>>>((T*)mat.data, mat.cols, mat.rows, mat.step, channels);
+        set<T><<<grid, block, 0, stream>>>(mat, SingleMask(mask), channels, VecTraits<scalar_t>::make(scalar));
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall ( cudaDeviceSynchronize() );
    }

-    template void set_to_gpu<uchar >(PtrStepSzb mat, const uchar*  scalar, int channels, cudaStream_t stream);
-    template void set_to_gpu<schar >(PtrStepSzb mat, const schar*  scalar, int channels, cudaStream_t stream);
-    template void set_to_gpu<ushort>(PtrStepSzb mat, const ushort* scalar, int channels, cudaStream_t stream);
-    template void set_to_gpu<short >(PtrStepSzb mat, const short*  scalar, int channels, cudaStream_t stream);
-    template void set_to_gpu<int   >(PtrStepSzb mat, const int*    scalar, int channels, cudaStream_t stream);
-    template void set_to_gpu<float >(PtrStepSzb mat, const float*  scalar, int channels, cudaStream_t stream);
-    template void set_to_gpu<double>(PtrStepSzb mat, const double* scalar, int channels, cudaStream_t stream);
+    template void set<uchar >(PtrStepSz<uchar > mat, const uchar*  scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
+    template void set<schar >(PtrStepSz<schar > mat, const schar*  scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
+    template void set<ushort>(PtrStepSz<ushort> mat, const ushort* scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
+    template void set<short >(PtrStepSz<short > mat, const short*  scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
+    template void set<int   >(PtrStepSz<int   > mat, const int*    scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
+    template void set<float >(PtrStepSz<float > mat, const float*  scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
+    template void set<double>(PtrStepSz<double> mat, const double* scalar, PtrStepSzb mask, int channels, cudaStream_t stream);

    ///////////////////////////////////////////////////////////////////////////
-    //////////////////////////////// ConvertTo ////////////////////////////////
-    ///////////////////////////////////////////////////////////////////////////
+    // convert

    template <typename T, typename D, typename S> struct Convertor : unary_function<T, D>
    {
@ -290,18 +215,11 @@ namespace cv { namespace gpu { namespace cudev
    template<typename T, typename D, typename S>
    void cvt_(PtrStepSzb src, PtrStepSzb dst, double alpha, double beta, cudaStream_t stream)
    {
-        cudaSafeCall( cudaSetDoubleForDevice(&alpha) );
-        cudaSafeCall( cudaSetDoubleForDevice(&beta) );
        Convertor<T, D, S> op(static_cast<S>(alpha), static_cast<S>(beta));
        cv::gpu::cudev::transform((PtrStepSz<T>)src, (PtrStepSz<D>)dst, op, WithOutMask(), stream);
    }

-#if defined  __clang__
-# pragma clang diagnostic push
-# pragma clang diagnostic ignored "-Wmissing-declarations"
-#endif
-
-    void convert_gpu(PtrStepSzb src, int sdepth, PtrStepSzb dst, int ddepth, double alpha, double beta, cudaStream_t stream)
+    void convert(PtrStepSzb src, int sdepth, PtrStepSzb dst, int ddepth, double alpha, double beta, cudaStream_t stream)
    {
        typedef void (*caller_t)(PtrStepSzb src, PtrStepSzb dst, double alpha, double beta, cudaStream_t stream);

@ -372,11 +290,7 @@ namespace cv { namespace gpu { namespace cudev
            }
        };

-        caller_t func = tab[sdepth][ddepth];
+        const caller_t func = tab[sdepth][ddepth];
        func(src, dst, alpha, beta, stream);
    }
-
-#if defined __clang__
-# pragma clang diagnostic pop
-#endif
 }}} // namespace cv { namespace gpu { namespace cudev
--- a/modules/core/src/cuda/matrix_operations.hpp
+++ b/modules/core/src/cuda/matrix_operations.hpp
@ -0,0 +1,57 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "opencv2/core/cuda/common.hpp"
+
+namespace cv { namespace gpu { namespace cudev
+{
+    void copyWithMask(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool multiChannelMask, cudaStream_t stream);
+
+    template <typename T>
+    void set(PtrStepSz<T> mat, const T* scalar, int channels, cudaStream_t stream);
+
+    template <typename T>
+    void set(PtrStepSz<T> mat, const T* scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
+
+    void convert(PtrStepSzb src, int sdepth, PtrStepSzb dst, int ddepth, double alpha, double beta, cudaStream_t stream);
+}}}
--- a/modules/core/src/cudastream.cpp
+++ b/modules/core/src/cudastream.cpp
@ -1,348 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-
-using namespace cv;
-using namespace cv::gpu;
-
-#if !defined (HAVE_CUDA)
-
-cv::gpu::Stream::Stream() { throw_no_cuda(); }
-cv::gpu::Stream::~Stream() {}
-cv::gpu::Stream::Stream(const Stream&) { throw_no_cuda(); }
-Stream& cv::gpu::Stream::operator=(const Stream&) { throw_no_cuda(); return *this; }
-bool cv::gpu::Stream::queryIfComplete() { throw_no_cuda(); return false; }
-void cv::gpu::Stream::waitForCompletion() { throw_no_cuda(); }
-void cv::gpu::Stream::enqueueDownload(const GpuMat&, Mat&) { throw_no_cuda(); }
-void cv::gpu::Stream::enqueueDownload(const GpuMat&, CudaMem&) { throw_no_cuda(); }
-void cv::gpu::Stream::enqueueUpload(const CudaMem&, GpuMat&) { throw_no_cuda(); }
-void cv::gpu::Stream::enqueueUpload(const Mat&, GpuMat&) { throw_no_cuda(); }
-void cv::gpu::Stream::enqueueCopy(const GpuMat&, GpuMat&) { throw_no_cuda(); }
-void cv::gpu::Stream::enqueueMemSet(GpuMat&, Scalar) { throw_no_cuda(); }
-void cv::gpu::Stream::enqueueMemSet(GpuMat&, Scalar, const GpuMat&) { throw_no_cuda(); }
-void cv::gpu::Stream::enqueueConvert(const GpuMat&, GpuMat&, int, double, double) { throw_no_cuda(); }
-void cv::gpu::Stream::enqueueHostCallback(StreamCallback, void*) { throw_no_cuda(); }
-Stream& cv::gpu::Stream::Null() { throw_no_cuda(); static Stream s; return s; }
-cv::gpu::Stream::operator bool() const { throw_no_cuda(); return false; }
-cv::gpu::Stream::Stream(Impl*) { throw_no_cuda(); }
-void cv::gpu::Stream::create() { throw_no_cuda(); }
-void cv::gpu::Stream::release() { throw_no_cuda(); }
-
-#else /* !defined (HAVE_CUDA) */
-
-namespace cv { namespace gpu
-{
-    void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream);
-    void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream);
-    void setTo(GpuMat& src, Scalar s, cudaStream_t stream);
-    void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream);
-}}
-
-struct Stream::Impl
-{
-    static cudaStream_t getStream(const Impl* impl)
-    {
-        return impl ? impl->stream : 0;
-    }
-
-    cudaStream_t stream;
-    int ref_counter;
-};
-
-cudaStream_t cv::gpu::StreamAccessor::getStream(const Stream& stream)
-{
-    return Stream::Impl::getStream(stream.impl);
-}
-
-cv::gpu::Stream::Stream() : impl(0)
-{
-    create();
-}
-
-cv::gpu::Stream::~Stream()
-{
-    release();
-}
-
-cv::gpu::Stream::Stream(const Stream& stream) : impl(stream.impl)
-{
-    if (impl)
-        CV_XADD(&impl->ref_counter, 1);
-}
-
-Stream& cv::gpu::Stream::operator =(const Stream& stream)
-{
-    if (this != &stream)
-    {
-        release();
-        impl = stream.impl;
-        if (impl)
-            CV_XADD(&impl->ref_counter, 1);
-    }
-
-    return *this;
-}
-
-bool cv::gpu::Stream::queryIfComplete()
-{
-    cudaStream_t stream = Impl::getStream(impl);
-    cudaError_t err = cudaStreamQuery(stream);
-
-    if (err == cudaErrorNotReady || err == cudaSuccess)
-        return err == cudaSuccess;
-
-    cudaSafeCall(err);
-    return false;
-}
-
-void cv::gpu::Stream::waitForCompletion()
-{
-    cudaStream_t stream = Impl::getStream(impl);
-    cudaSafeCall( cudaStreamSynchronize(stream) );
-}
-
-void cv::gpu::Stream::enqueueDownload(const GpuMat& src, Mat& dst)
-{
-    // if not -> allocation will be done, but after that dst will not point to page locked memory
-    CV_Assert( src.size() == dst.size() && src.type() == dst.type() );
-
-    cudaStream_t stream = Impl::getStream(impl);
-    size_t bwidth = src.cols * src.elemSize();
-    cudaSafeCall( cudaMemcpy2DAsync(dst.data, dst.step, src.data, src.step, bwidth, src.rows, cudaMemcpyDeviceToHost, stream) );
-}
-
-void cv::gpu::Stream::enqueueDownload(const GpuMat& src, CudaMem& dst)
-{
-    dst.create(src.size(), src.type(), CudaMem::ALLOC_PAGE_LOCKED);
-
-    cudaStream_t stream = Impl::getStream(impl);
-    size_t bwidth = src.cols * src.elemSize();
-    cudaSafeCall( cudaMemcpy2DAsync(dst.data, dst.step, src.data, src.step, bwidth, src.rows, cudaMemcpyDeviceToHost, stream) );
-}
-
-void cv::gpu::Stream::enqueueUpload(const CudaMem& src, GpuMat& dst)
-{
-    dst.create(src.size(), src.type());
-
-    cudaStream_t stream = Impl::getStream(impl);
-    size_t bwidth = src.cols * src.elemSize();
-    cudaSafeCall( cudaMemcpy2DAsync(dst.data, dst.step, src.data, src.step, bwidth, src.rows, cudaMemcpyHostToDevice, stream) );
-}
-
-void cv::gpu::Stream::enqueueUpload(const Mat& src, GpuMat& dst)
-{
-    dst.create(src.size(), src.type());
-
-    cudaStream_t stream = Impl::getStream(impl);
-    size_t bwidth = src.cols * src.elemSize();
-    cudaSafeCall( cudaMemcpy2DAsync(dst.data, dst.step, src.data, src.step, bwidth, src.rows, cudaMemcpyHostToDevice, stream) );
-}
-
-void cv::gpu::Stream::enqueueCopy(const GpuMat& src, GpuMat& dst)
-{
-    dst.create(src.size(), src.type());
-
-    cudaStream_t stream = Impl::getStream(impl);
-    size_t bwidth = src.cols * src.elemSize();
-    cudaSafeCall( cudaMemcpy2DAsync(dst.data, dst.step, src.data, src.step, bwidth, src.rows, cudaMemcpyDeviceToDevice, stream) );
-}
-
-void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val)
-{
-    const int sdepth = src.depth();
-
-    if (sdepth == CV_64F)
-    {
-        if (!deviceSupports(NATIVE_DOUBLE))
-            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
-    }
-
-    cudaStream_t stream = Impl::getStream(impl);
-
-    if (val[0] == 0.0 && val[1] == 0.0 && val[2] == 0.0 && val[3] == 0.0)
-    {
-        cudaSafeCall( cudaMemset2DAsync(src.data, src.step, 0, src.cols * src.elemSize(), src.rows, stream) );
-        return;
-    }
-
-    if (sdepth == CV_8U)
-    {
-        int cn = src.channels();
-
-        if (cn == 1 || (cn == 2 && val[0] == val[1]) || (cn == 3 && val[0] == val[1] && val[0] == val[2]) || (cn == 4 && val[0] == val[1] && val[0] == val[2] && val[0] == val[3]))
-        {
-            int ival = saturate_cast<uchar>(val[0]);
-            cudaSafeCall( cudaMemset2DAsync(src.data, src.step, ival, src.cols * src.elemSize(), src.rows, stream) );
-            return;
-        }
-    }
-
-    setTo(src, val, stream);
-}
-
-void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask)
-{
-    const int sdepth = src.depth();
-
-    if (sdepth == CV_64F)
-    {
-        if (!deviceSupports(NATIVE_DOUBLE))
-            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
-    }
-
-    CV_Assert(mask.type() == CV_8UC1);
-
-    cudaStream_t stream = Impl::getStream(impl);
-
-    setTo(src, val, mask, stream);
-}
-
-void cv::gpu::Stream::enqueueConvert(const GpuMat& src, GpuMat& dst, int dtype, double alpha, double beta)
-{
-    if (dtype < 0)
-        dtype = src.type();
-    else
-        dtype = CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src.channels());
-
-    const int sdepth = src.depth();
-    const int ddepth = CV_MAT_DEPTH(dtype);
-
-    if (sdepth == CV_64F || ddepth == CV_64F)
-    {
-        if (!deviceSupports(NATIVE_DOUBLE))
-            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
-    }
-
-    bool noScale = fabs(alpha - 1) < std::numeric_limits<double>::epsilon()
-                && fabs(beta) < std::numeric_limits<double>::epsilon();
-
-    if (sdepth == ddepth && noScale)
-    {
-        enqueueCopy(src, dst);
-        return;
-    }
-
-    dst.create(src.size(), dtype);
-
-    cudaStream_t stream = Impl::getStream(impl);
-    convertTo(src, dst, alpha, beta, stream);
-}
-
-#if CUDART_VERSION >= 5000
-
-namespace
-{
-    struct CallbackData
-    {
-        cv::gpu::Stream::StreamCallback callback;
-        void* userData;
-        Stream stream;
-    };
-
-    void CUDART_CB cudaStreamCallback(cudaStream_t, cudaError_t status, void* userData)
-    {
-        CallbackData* data = reinterpret_cast<CallbackData*>(userData);
-        data->callback(data->stream, static_cast<int>(status), data->userData);
-        delete data;
-    }
-}
-
-#endif
-
-void cv::gpu::Stream::enqueueHostCallback(StreamCallback callback, void* userData)
-{
-#if CUDART_VERSION >= 5000
-    CallbackData* data = new CallbackData;
-    data->callback = callback;
-    data->userData = userData;
-    data->stream = *this;
-
-    cudaStream_t stream = Impl::getStream(impl);
-
-    cudaSafeCall( cudaStreamAddCallback(stream, cudaStreamCallback, data, 0) );
-#else
-    (void) callback;
-    (void) userData;
-    CV_Error(CV_StsNotImplemented, "This function requires CUDA 5.0");
-#endif
-}
-
-cv::gpu::Stream& cv::gpu::Stream::Null()
-{
-    static Stream s((Impl*) 0);
-    return s;
-}
-
-cv::gpu::Stream::operator bool() const
-{
-    return impl && impl->stream;
-}
-
-cv::gpu::Stream::Stream(Impl* impl_) : impl(impl_)
-{
-}
-
-void cv::gpu::Stream::create()
-{
-    if (impl)
-        release();
-
-    cudaStream_t stream;
-    cudaSafeCall( cudaStreamCreate( &stream ) );
-
-    impl = (Stream::Impl*) fastMalloc(sizeof(Stream::Impl));
-
-    impl->stream = stream;
-    impl->ref_counter = 1;
-}
-
-void cv::gpu::Stream::release()
-{
-    if (impl && CV_XADD(&impl->ref_counter, -1) == 1)
-    {
-        cudaSafeCall( cudaStreamDestroy(impl->stream) );
-        cv::fastFree(impl);
-    }
-}
-
-#endif /* !defined (HAVE_CUDA) */
--- a/modules/core/src/gpu_cuda_mem.cpp
+++ b/modules/core/src/gpu_cuda_mem.cpp
@ -0,0 +1,215 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+using namespace cv;
+using namespace cv::gpu;
+
+namespace
+{
+    // Rounds `what` up to the nearest multiple of `alignment`.
+    // The mask arithmetic only gives a multiple when `alignment` is a power of two.
+    size_t alignUpStep(size_t what, size_t alignment)
+    {
+        const size_t lowBits = alignment - 1;
+        return (what + lowBits) & ~lowBits;
+    }
+}
+
+// (Re)allocates the host buffer so that it holds rows_ x cols_ elements of type_.
+// An existing buffer with the same geometry and type is kept as is; otherwise the
+// old buffer is released first. Without CUDA support this always throws.
+void cv::gpu::CudaMem::create(int rows_, int cols_, int type_)
+{
+#ifndef HAVE_CUDA
+    (void) rows_;
+    (void) cols_;
+    (void) type_;
+    throw_no_cuda();
+#else
+    // SHARED allocations are only accepted when the device reports that it can
+    // map host memory.
+    if (alloc_type == SHARED)
+    {
+        DeviceInfo devInfo;
+        CV_Assert( devInfo.canMapHostMemory() );
+    }
+
+    // Keep only the type bits of the requested type.
+    type_ &= Mat::TYPE_MASK;
+
+    // Nothing to do when the current allocation already matches the request.
+    if (rows == rows_ && cols == cols_ && type() == type_ && data)
+        return;
+
+    if (data)
+        release();
+
+    CV_DbgAssert( rows_ >= 0 && cols_ >= 0 );
+
+    // A request with a zero dimension leaves the object without a buffer.
+    if (rows_ > 0 && cols_ > 0)
+    {
+        // The header fields are written before the allocation below is attempted.
+        flags = Mat::MAGIC_VAL + Mat::CONTINUOUS_FLAG + type_;
+        rows = rows_;
+        cols = cols_;
+        step = elemSize() * cols;
+
+        // For SHARED memory the row pitch is rounded up to the device's texture alignment.
+        // NOTE(review): CONTINUOUS_FLAG stays set even if this rounding pads the rows - confirm that is intended.
+        if (alloc_type == SHARED)
+        {
+            DeviceInfo devInfo;
+            step = alignUpStep(step, devInfo.textureAlignment());
+        }
+
+        // Compute the byte count in 64 bits and make sure it survives the
+        // conversion to size_t.
+        int64 _nettosize = (int64)step*rows;
+        size_t nettosize = (size_t)_nettosize;
+
+        if (_nettosize != (int64)nettosize)
+            CV_Error(cv::Error::StsNoMem, "Too big buffer is allocated");
+
+        // The allocation size is rounded up to a multiple of sizeof(*refcount).
+        size_t datasize = alignSize(nettosize, (int)sizeof(*refcount));
+
+        void* ptr = 0;
+
+        // Pick the cudaHostAlloc flag that corresponds to the allocation type.
+        switch (alloc_type)
+        {
+        case PAGE_LOCKED:    cudaSafeCall( cudaHostAlloc(&ptr, datasize, cudaHostAllocDefault) ); break;
+        case SHARED:         cudaSafeCall( cudaHostAlloc(&ptr, datasize, cudaHostAllocMapped) );  break;
+        case WRITE_COMBINED: cudaSafeCall( cudaHostAlloc(&ptr, datasize, cudaHostAllocWriteCombined) ); break;
+        default:             CV_Error(cv::Error::StsBadFlag, "Invalid alloc type");
+        }
+
+        // dataend marks the end of the payload (nettosize), not of the padded allocation.
+        datastart = data =  (uchar*)ptr;
+        dataend = data + nettosize;
+
+        // The reference counter lives in its own fastMalloc block and starts at one.
+        refcount = (int*)cv::fastMalloc(sizeof(*refcount));
+        *refcount = 1;
+    }
+#endif
+}
+
+// Builds a copy of this header with a different channel count and/or row count.
+//   new_cn   - requested number of channels; 0 keeps the current one.
+//   new_rows - requested number of rows; 0 keeps the current one unless the
+//              new channel count does not divide a row evenly.
+// Raises a CV_Error when the requested shape is incompatible with the data.
+CudaMem cv::gpu::CudaMem::reshape(int new_cn, int new_rows) const
+{
+    const int old_cn = channels();
+    if (new_cn == 0)
+        new_cn = old_cn;
+
+    CudaMem result = *this;
+
+    // Length of one row counted in single-channel elements.
+    int row_elems = cols * old_cn;
+
+    // When the new channel count does not fit a row and the caller did not
+    // choose a row count, derive one from the total element count.
+    const bool cn_fits_row = new_cn <= row_elems && row_elems % new_cn == 0;
+    if (!cn_fits_row && new_rows == 0)
+        new_rows = rows * row_elems / new_cn;
+
+    const bool rows_change = new_rows != 0 && new_rows != rows;
+    if (rows_change)
+    {
+        const int elem_count = row_elems * rows;
+
+        if (!isContinuous())
+            CV_Error(cv::Error::BadStep, "The matrix is not continuous, thus its number of rows can not be changed");
+
+        // The unsigned comparison also rejects a negative row count.
+        if (static_cast<unsigned>(new_rows) > static_cast<unsigned>(elem_count))
+            CV_Error(cv::Error::StsOutOfRange, "Bad new number of rows");
+
+        row_elems = elem_count / new_rows;
+
+        if (row_elems * new_rows != elem_count)
+            CV_Error(cv::Error::StsBadArg, "The total number of matrix elements is not divisible by the new number of rows");
+
+        result.rows = new_rows;
+        result.step = row_elems * elemSize1();
+    }
+
+    const int new_cols = row_elems / new_cn;
+
+    if (new_cols * new_cn != row_elems)
+        CV_Error(cv::Error::BadNumChannels, "The total width is not divisible by the new number of channels");
+
+    result.cols = new_cols;
+
+    // Replace the channel bits of the flags, leaving every other bit untouched.
+    result.flags = (result.flags & ~CV_MAT_CN_MASK) | ((new_cn - 1) << CV_CN_SHIFT);
+
+    return result;
+}
+
+// Drops this object's reference to the host buffer and clears its pointers
+// and geometry. Compiles to an empty function when CUDA is unavailable.
+void cv::gpu::CudaMem::release()
+{
+#ifdef HAVE_CUDA
+    // Only the holder that takes the counter from one to zero frees the
+    // buffer and the counter itself.
+    const bool lastOwner = refcount && CV_XADD(refcount, -1) == 1;
+    if (lastOwner)
+    {
+        cudaFreeHost(datastart);
+        fastFree(refcount);
+    }
+
+    refcount = 0;
+
+    dataend = 0;
+    datastart = 0;
+    data = 0;
+
+    cols = 0;
+    rows = 0;
+    step = 0;
+#endif
+}
+
+// Returns a GpuMat header that addresses this buffer through the device
+// pointer CUDA associates with it. Asserts that the allocation type is SHARED.
+// Throws when built without CUDA.
+GpuMat cv::gpu::CudaMem::createGpuMatHeader() const
+{
+#ifdef HAVE_CUDA
+    CV_Assert( alloc_type == SHARED );
+
+    void* devicePtr = 0;
+    cudaSafeCall( cudaHostGetDevicePointer(&devicePtr, data, 0) );
+
+    // Geometry, type and pitch are taken over unchanged from the host header.
+    return GpuMat(rows, cols, type(), devicePtr, step);
+#else
+    throw_no_cuda();
+    return GpuMat();
+#endif
+}
+
+// Registers the memory of a continuous Mat with CUDA via cudaHostRegister.
+// Throws when built without CUDA.
+void cv::gpu::registerPageLocked(Mat& m)
+{
+#ifdef HAVE_CUDA
+    // The registered range is computed as step * rows, so the rows must be
+    // stored back to back.
+    CV_Assert( m.isContinuous() );
+
+    const size_t byteCount = m.step * m.rows;
+    cudaSafeCall( cudaHostRegister(m.data, byteCount, cudaHostRegisterPortable) );
+#else
+    (void) m;
+    throw_no_cuda();
+#endif
+}
+
+// Undoes registerPageLocked() for the memory of `m` via cudaHostUnregister.
+void cv::gpu::unregisterPageLocked(Mat& m)
+{
+#ifdef HAVE_CUDA
+    cudaSafeCall( cudaHostUnregister(m.data) );
+#else
+    // Without CUDA this is a silent no-op; it does not throw.
+    (void) m;
+#endif
+}
--- a/modules/core/src/gpu_info.cpp
+++ b/modules/core/src/gpu_info.cpp
--- a/modules/core/src/gpu_mat.cpp
+++ b/modules/core/src/gpu_mat.cpp
--- a/modules/core/src/gpu_stream.cpp
+++ b/modules/core/src/gpu_stream.cpp
@ -0,0 +1,308 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+using namespace cv;
+using namespace cv::gpu;
+
+////////////////////////////////////////////////////////////////
+// Stream
+
+#ifdef HAVE_CUDA
+
+// Holder for the cudaStream_t behind a cv::gpu::Stream.
+class cv::gpu::Stream::Impl
+{
+public:
+    cudaStream_t stream;
+
+    // Creates a new CUDA stream.
+    Impl() : stream(0)
+    {
+        cudaSafeCall( cudaStreamCreate(&stream) );
+    }
+
+    // Adopts an existing handle; the destructor destroys it unless it is null.
+    Impl(cudaStream_t stream_) : stream(stream_)
+    {
+    }
+
+    // Destroys a non-null handle. The status returned by CUDA is not checked.
+    ~Impl()
+    {
+        if (stream)
+            cudaStreamDestroy(stream);
+    }
+};
+
+// Gives access to the raw CUDA handle stored inside a Stream.
+cudaStream_t cv::gpu::StreamAccessor::getStream(const Stream& stream)
+{
+    return stream.impl_->stream;
+}
+
+#else
+
+// Stand-in used when CUDA is unavailable: constructing it always throws.
+class cv::gpu::Stream::Impl
+{
+public:
+    Impl(void* ptr = 0)
+    {
+        (void) ptr;
+        throw_no_cuda();
+    }
+};
+
+#endif
+
+// Creates a Stream backed by a newly created implementation object.
+// Throws when built without CUDA.
+cv::gpu::Stream::Stream()
+{
+#ifdef HAVE_CUDA
+    impl_ = new Impl;
+#else
+    throw_no_cuda();
+#endif
+}
+
+// Queries the stream without blocking: true when cudaStreamQuery reports
+// success, false when it reports "not ready". Any other status is passed to
+// cudaSafeCall. Throws when built without CUDA.
+bool cv::gpu::Stream::queryIfComplete() const
+{
+#ifdef HAVE_CUDA
+    const cudaError_t status = cudaStreamQuery(impl_->stream);
+
+    if (status == cudaSuccess)
+        return true;
+
+    if (status == cudaErrorNotReady)
+        return false;
+
+    // A genuine error: hand it to the common error reporting path.
+    cudaSafeCall(status);
+    return false;
+#else
+    throw_no_cuda();
+    return false;
+#endif
+}
+
+// Synchronizes on this stream via cudaStreamSynchronize.
+// Throws when built without CUDA.
+void cv::gpu::Stream::waitForCompletion()
+{
+#ifdef HAVE_CUDA
+    cudaStream_t handle = impl_->stream;
+    cudaSafeCall( cudaStreamSynchronize(handle) );
+#else
+    throw_no_cuda();
+#endif
+}
+
+// Makes this stream wait on `event` via cudaStreamWaitEvent (flags = 0).
+// Throws when built without CUDA.
+void cv::gpu::Stream::waitEvent(const Event& event)
+{
+#ifdef HAVE_CUDA
+    cudaEvent_t rawEvent = EventAccessor::getEvent(event);
+    cudaSafeCall( cudaStreamWaitEvent(impl_->stream, rawEvent, 0) );
+#else
+    (void) event;
+    throw_no_cuda();
+#endif
+}
+
+#if defined(HAVE_CUDA) && (CUDART_VERSION >= 5000)
+
+namespace
+{
+    // Pairs a user callback with its user data so both can travel through the
+    // single void* that the CUDA callback receives.
+    struct CallbackData
+    {
+        Stream::StreamCallback callback;
+        void* userData;
+
+        CallbackData(Stream::StreamCallback callback_, void* userData_)
+            : callback(callback_), userData(userData_)
+        {
+        }
+    };
+
+    // Trampoline handed to CUDA: forwards the status to the user callback and
+    // then deletes the CallbackData it was given.
+    void CUDART_CB cudaStreamCallback(cudaStream_t, cudaError_t status, void* userData)
+    {
+        CallbackData* const payload = static_cast<CallbackData*>(userData);
+        payload->callback(static_cast<int>(status), payload->userData);
+        delete payload;
+    }
+}
+
+#endif
+
+// Schedules `callback` to be invoked with `userData` through
+// cudaStreamAddCallback on this stream.
+//   callback - function to invoke; it receives the CUDA status as an int.
+//   userData - opaque pointer passed back to the callback unchanged.
+// Throws when built without CUDA, when the CUDA runtime is older than 5.0,
+// or when CUDA refuses to register the callback.
+void cv::gpu::Stream::enqueueHostCallback(StreamCallback callback, void* userData)
+{
+#ifndef HAVE_CUDA
+    (void) callback;
+    (void) userData;
+    throw_no_cuda();
+#else
+    #if CUDART_VERSION < 5000
+        (void) callback;
+        (void) userData;
+        CV_Error(cv::Error::StsNotImplemented, "This function requires CUDA 5.0");
+    #else
+        // Ownership of `data` passes to cudaStreamCallback, which deletes it
+        // after running the user callback.
+        CallbackData* data = new CallbackData(callback, userData);
+
+        const cudaError_t err = cudaStreamAddCallback(impl_->stream, cudaStreamCallback, data, 0);
+
+        // If registration failed the trampoline will never run, so the payload
+        // must be freed here before the error is reported.
+        if (err != cudaSuccess)
+            delete data;
+
+        cudaSafeCall( err );
+    #endif
+#endif
+}
+
+// Returns the shared Stream object that wraps the null (0) CUDA stream handle.
+Stream& cv::gpu::Stream::Null()
+{
+    // Function-local static: built on the first call and reused afterwards.
+    // In builds without CUDA the Impl constructor throws.
+    static Stream nullStream(new Impl(0));
+    return nullStream;
+}
+
+// Boolean-like conversion: yields a non-null value only when the wrapped
+// CUDA stream handle is not 0. Always yields 0 without CUDA.
+cv::gpu::Stream::operator bool_type() const
+{
+#ifdef HAVE_CUDA
+    if (impl_->stream == 0)
+        return 0;
+
+    return &Stream::this_type_does_not_support_comparisons;
+#else
+    return 0;
+#endif
+}
+
+// Deleter specialization for the Stream implementation object held by cv::Ptr.
+template <> void cv::Ptr<Stream::Impl>::delete_obj()
+{
+    // delete on a null pointer does nothing, so no explicit guard is needed.
+    delete obj;
+}
+
+////////////////////////////////////////////////////////////////
+// Event
+
+#ifdef HAVE_CUDA
+
+// Holder for the cudaEvent_t behind a cv::gpu::Event.
+class cv::gpu::Event::Impl
+{
+public:
+    cudaEvent_t event;
+
+    // Creates the CUDA event, passing `flags` to cudaEventCreateWithFlags.
+    Impl(unsigned int flags) : event(0)
+    {
+        cudaSafeCall( cudaEventCreateWithFlags(&event, flags) );
+    }
+
+    // Destroys a non-null event. The status returned by CUDA is not checked.
+    ~Impl()
+    {
+        if (event)
+            cudaEventDestroy(event);
+    }
+};
+
+// Gives access to the raw CUDA handle stored inside an Event.
+cudaEvent_t cv::gpu::EventAccessor::getEvent(const Event& event)
+{
+    return event.impl_->event;
+}
+
+#else
+
+// Stand-in used when CUDA is unavailable: constructing it always throws.
+class cv::gpu::Event::Impl
+{
+public:
+    Impl(unsigned int)
+    {
+        throw_no_cuda();
+    }
+};
+
+#endif
+
+// Creates an Event whose implementation object is constructed from `flags`.
+// Throws when built without CUDA.
+cv::gpu::Event::Event(CreateFlags flags)
+{
+#ifdef HAVE_CUDA
+    impl_ = new Impl(flags);
+#else
+    (void) flags;
+    throw_no_cuda();
+#endif
+}
+
+// Records this event on `stream` via cudaEventRecord.
+// Throws when built without CUDA.
+void cv::gpu::Event::record(Stream& stream)
+{
+#ifdef HAVE_CUDA
+    cudaStream_t rawStream = StreamAccessor::getStream(stream);
+    cudaSafeCall( cudaEventRecord(impl_->event, rawStream) );
+#else
+    (void) stream;
+    throw_no_cuda();
+#endif
+}
+
+// Queries the event without blocking: true when cudaEventQuery reports
+// success, false when it reports "not ready". Any other status is passed to
+// cudaSafeCall. Throws when built without CUDA.
+bool cv::gpu::Event::queryIfComplete() const
+{
+#ifdef HAVE_CUDA
+    const cudaError_t status = cudaEventQuery(impl_->event);
+
+    if (status == cudaSuccess)
+        return true;
+
+    if (status == cudaErrorNotReady)
+        return false;
+
+    // A genuine error: hand it to the common error reporting path.
+    cudaSafeCall(status);
+    return false;
+#else
+    throw_no_cuda();
+    return false;
+#endif
+}
+
+// Synchronizes on this event via cudaEventSynchronize.
+// Throws when built without CUDA.
+void cv::gpu::Event::waitForCompletion()
+{
+#ifdef HAVE_CUDA
+    cudaEvent_t handle = impl_->event;
+    cudaSafeCall( cudaEventSynchronize(handle) );
+#else
+    throw_no_cuda();
+#endif
+}
+
+// Returns the value cudaEventElapsedTime reports for the pair (start, end).
+// Throws when built without CUDA.
+float cv::gpu::Event::elapsedTime(const Event& start, const Event& end)
+{
+#ifdef HAVE_CUDA
+    float milliseconds;
+    cudaSafeCall( cudaEventElapsedTime(&milliseconds, start.impl_->event, end.impl_->event) );
+    return milliseconds;
+#else
+    (void) start;
+    (void) end;
+    throw_no_cuda();
+    return 0.0f;
+#endif
+}
+
+// Deleter specialization for the Event implementation object held by cv::Ptr.
+template <> void cv::Ptr<Event::Impl>::delete_obj()
+{
+    // delete on a null pointer does nothing, so no explicit guard is needed.
+    delete obj;
+}
--- a/modules/core/src/gpumat.cpp
+++ b/modules/core/src/gpumat.cpp
--- a/modules/core/src/matrix.cpp
+++ b/modules/core/src/matrix.cpp
@ -41,8 +41,6 @@
 //M*/

 #include "precomp.hpp"
-#include "opencv2/core/gpumat.hpp"
-#include "opencv2/core/opengl.hpp"

 /****************************************************************************************\
 *                           [scaled] Identity matrix initialization                      *
@ -941,14 +939,15 @@ void scalarToRawData(const Scalar& s, void* _buf, int type, int unroll_to)
 \*************************************************************************************************/

 _InputArray::_InputArray() : flags(0), obj(0) {}
-_InputArray::~_InputArray() {}
 _InputArray::_InputArray(const Mat& m) : flags(MAT), obj((void*)&m) {}
 _InputArray::_InputArray(const std::vector<Mat>& vec) : flags(STD_VECTOR_MAT), obj((void*)&vec) {}
 _InputArray::_InputArray(const double& val) : flags(FIXED_TYPE + FIXED_SIZE + MATX + CV_64F), obj((void*)&val), sz(Size(1,1)) {}
 _InputArray::_InputArray(const MatExpr& expr) : flags(FIXED_TYPE + FIXED_SIZE + EXPR), obj((void*)&expr) {}
 _InputArray::_InputArray(const gpu::GpuMat& d_mat) : flags(GPU_MAT), obj((void*)&d_mat) {}
 _InputArray::_InputArray(const ogl::Buffer& buf) : flags(OPENGL_BUFFER), obj((void*)&buf) {}
-_InputArray::_InputArray(const ogl::Texture2D& tex) : flags(OPENGL_TEXTURE), obj((void*)&tex) {}
+_InputArray::_InputArray(const gpu::CudaMem& cuda_mem) : flags(CUDA_MEM), obj((void*)&cuda_mem) {}
+
+_InputArray::~_InputArray() {}

 Mat _InputArray::getMat(int i) const
 {
@ -996,14 +995,37 @@ Mat _InputArray::getMat(int i) const
        return !v.empty() ? Mat(size(i), t, (void*)&v[0]) : Mat();
    }

-    CV_Assert( k == STD_VECTOR_MAT );
-    //if( k == STD_VECTOR_MAT )
+    if( k == STD_VECTOR_MAT )
    {
        const std::vector<Mat>& v = *(const std::vector<Mat>*)obj;
        CV_Assert( 0 <= i && i < (int)v.size() );

        return v[i];
    }
+
+    if( k == OPENGL_BUFFER )
+    {
+        CV_Assert( i < 0 );
+        CV_Error(cv::Error::StsNotImplemented, "You should explicitly call mapHost/unmapHost methods for ogl::Buffer object");
+        return Mat();
+    }
+
+    if( k == GPU_MAT )
+    {
+        CV_Assert( i < 0 );
+        CV_Error(cv::Error::StsNotImplemented, "You should explicitly call download method for gpu::GpuMat object");
+        return Mat();
+    }
+
+    CV_Assert( k == CUDA_MEM );
+    //if( k == CUDA_MEM )
+    {
+        CV_Assert( i < 0 );
+
+        const gpu::CudaMem* cuda_mem = (const gpu::CudaMem*)obj;
+
+        return cuda_mem->createMatHeader();
+    }
 }


@ -1092,10 +1114,29 @@ gpu::GpuMat _InputArray::getGpuMat() const
 {
    int k = kind();

-    CV_Assert(k == GPU_MAT);
+    if (k == GPU_MAT)
+    {
+        const gpu::GpuMat* d_mat = (const gpu::GpuMat*)obj;
+        return *d_mat;
+    }

-    const gpu::GpuMat* d_mat = (const gpu::GpuMat*)obj;
-    return *d_mat;
+    if (k == CUDA_MEM)
+    {
+        const gpu::CudaMem* cuda_mem = (const gpu::CudaMem*)obj;
+        return cuda_mem->createGpuMatHeader();
+    }
+
+    if (k == OPENGL_BUFFER)
+    {
+        CV_Error(cv::Error::StsNotImplemented, "You should explicitly call mapDevice/unmapDevice methods for ogl::Buffer object");
+        return gpu::GpuMat();
+    }
+
+    if (k == NONE)
+        return gpu::GpuMat();
+
+    CV_Error(cv::Error::StsNotImplemented, "getGpuMat is available only for gpu::GpuMat and gpu::CudaMem");
+    return gpu::GpuMat();
 }

 ogl::Buffer _InputArray::getOGlBuffer() const
@ -1108,16 +1149,6 @@ ogl::Buffer _InputArray::getOGlBuffer() const
    return *gl_buf;
 }

-ogl::Texture2D _InputArray::getOGlTexture2D() const
-{
-    int k = kind();
-
-    CV_Assert(k == OPENGL_TEXTURE);
-
-    const ogl::Texture2D* gl_tex = (const ogl::Texture2D*)obj;
-    return *gl_tex;
-}
-
 int _InputArray::kind() const
 {
    return flags & KIND_MASK;
@ -1186,20 +1217,20 @@ Size _InputArray::size(int i) const
        return buf->size();
    }

-    if( k == OPENGL_TEXTURE )
-    {
-        CV_Assert( i < 0 );
-        const ogl::Texture2D* tex = (const ogl::Texture2D*)obj;
-        return tex->size();
-    }
-
-    CV_Assert( k == GPU_MAT );
-    //if( k == GPU_MAT )
+    if( k == GPU_MAT )
    {
        CV_Assert( i < 0 );
        const gpu::GpuMat* d_mat = (const gpu::GpuMat*)obj;
        return d_mat->size();
    }
+
+    CV_Assert( k == CUDA_MEM );
+    //if( k == CUDA_MEM )
+    {
+        CV_Assert( i < 0 );
+        const gpu::CudaMem* cuda_mem = (const gpu::CudaMem*)obj;
+        return cuda_mem->size();
+    }
 }

 size_t _InputArray::total(int i) const
@ -1252,9 +1283,12 @@ int _InputArray::type(int i) const
    if( k == OPENGL_BUFFER )
        return ((const ogl::Buffer*)obj)->type();

-    CV_Assert( k == GPU_MAT );
-    //if( k == GPU_MAT )
+    if( k == GPU_MAT )
        return ((const gpu::GpuMat*)obj)->type();
+
+    CV_Assert( k == CUDA_MEM );
+    //if( k == CUDA_MEM )
+        return ((const gpu::CudaMem*)obj)->type();
 }

 int _InputArray::depth(int i) const
@ -1304,29 +1338,29 @@ bool _InputArray::empty() const
    if( k == OPENGL_BUFFER )
        return ((const ogl::Buffer*)obj)->empty();

-    if( k == OPENGL_TEXTURE )
-        return ((const ogl::Texture2D*)obj)->empty();
-
-    CV_Assert( k == GPU_MAT );
-    //if( k == GPU_MAT )
+    if( k == GPU_MAT )
        return ((const gpu::GpuMat*)obj)->empty();
+
+    CV_Assert( k == CUDA_MEM );
+    //if( k == CUDA_MEM )
+        return ((const gpu::CudaMem*)obj)->empty();
 }


 _OutputArray::_OutputArray() {}
-_OutputArray::~_OutputArray() {}
 _OutputArray::_OutputArray(Mat& m) : _InputArray(m) {}
 _OutputArray::_OutputArray(std::vector<Mat>& vec) : _InputArray(vec) {}
 _OutputArray::_OutputArray(gpu::GpuMat& d_mat) : _InputArray(d_mat) {}
 _OutputArray::_OutputArray(ogl::Buffer& buf) : _InputArray(buf) {}
-_OutputArray::_OutputArray(ogl::Texture2D& tex) : _InputArray(tex) {}
+_OutputArray::_OutputArray(gpu::CudaMem& cuda_mem) : _InputArray(cuda_mem) {}

 _OutputArray::_OutputArray(const Mat& m) : _InputArray(m) {flags |= FIXED_SIZE|FIXED_TYPE;}
 _OutputArray::_OutputArray(const std::vector<Mat>& vec) : _InputArray(vec) {flags |= FIXED_SIZE;}
 _OutputArray::_OutputArray(const gpu::GpuMat& d_mat) : _InputArray(d_mat) {flags |= FIXED_SIZE|FIXED_TYPE;}
 _OutputArray::_OutputArray(const ogl::Buffer& buf) : _InputArray(buf) {flags |= FIXED_SIZE|FIXED_TYPE;}
-_OutputArray::_OutputArray(const ogl::Texture2D& tex) : _InputArray(tex) {flags |= FIXED_SIZE|FIXED_TYPE;}
+_OutputArray::_OutputArray(const gpu::CudaMem& cuda_mem) : _InputArray(cuda_mem) {flags |= FIXED_SIZE|FIXED_TYPE;}

+_OutputArray::~_OutputArray() {}

 bool _OutputArray::fixedSize() const
 {
@ -1362,6 +1396,13 @@ void _OutputArray::create(Size _sz, int mtype, int i, bool allowTransposed, int
        ((ogl::Buffer*)obj)->create(_sz, mtype);
        return;
    }
+    if( k == CUDA_MEM && i < 0 && !allowTransposed && fixedDepthMask == 0 )
+    {
+        CV_Assert(!fixedSize() || ((gpu::CudaMem*)obj)->size() == _sz);
+        CV_Assert(!fixedType() || ((gpu::CudaMem*)obj)->type() == mtype);
+        ((gpu::CudaMem*)obj)->create(_sz, mtype);
+        return;
+    }
    int sizes[] = {_sz.height, _sz.width};
    create(2, sizes, mtype, i, allowTransposed, fixedDepthMask);
 }
@ -1390,6 +1431,13 @@ void _OutputArray::create(int rows, int cols, int mtype, int i, bool allowTransp
        ((ogl::Buffer*)obj)->create(rows, cols, mtype);
        return;
    }
+    if( k == CUDA_MEM && i < 0 && !allowTransposed && fixedDepthMask == 0 )
+    {
+        CV_Assert(!fixedSize() || ((gpu::CudaMem*)obj)->size() == Size(cols, rows));
+        CV_Assert(!fixedType() || ((gpu::CudaMem*)obj)->type() == mtype);
+        ((gpu::CudaMem*)obj)->create(rows, cols, mtype);
+        return;
+    }
    int sizes[] = {rows, cols};
    create(2, sizes, mtype, i, allowTransposed, fixedDepthMask);
 }
@ -1609,15 +1657,15 @@ void _OutputArray::release() const
        return;
    }

-    if( k == OPENGL_BUFFER )
+    if( k == CUDA_MEM )
    {
-        ((ogl::Buffer*)obj)->release();
+        ((gpu::CudaMem*)obj)->release();
        return;
    }

-    if( k == OPENGL_TEXTURE )
+    if( k == OPENGL_BUFFER )
    {
-        ((ogl::Texture2D*)obj)->release();
+        ((ogl::Buffer*)obj)->release();
        return;
    }

@ -1693,11 +1741,11 @@ ogl::Buffer& _OutputArray::getOGlBufferRef() const
    return *(ogl::Buffer*)obj;
 }

-ogl::Texture2D& _OutputArray::getOGlTexture2DRef() const
+gpu::CudaMem& _OutputArray::getCudaMemRef() const
 {
    int k = kind();
-    CV_Assert( k == OPENGL_TEXTURE );
-    return *(ogl::Texture2D*)obj;
+    CV_Assert( k == CUDA_MEM );
+    return *(gpu::CudaMem*)obj;
 }

 static _OutputArray _none;
--- a/modules/core/src/matrix_operations.cpp
+++ b/modules/core/src/matrix_operations.cpp
@ -1,294 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-
-using namespace cv;
-using namespace cv::gpu;
-
-cv::gpu::CudaMem::CudaMem()
-    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(0)
-{
-}
-
-cv::gpu::CudaMem::CudaMem(int _rows, int _cols, int _type, int _alloc_type)
-    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(0)
-{
-    if( _rows > 0 && _cols > 0 )
-        create( _rows, _cols, _type, _alloc_type);
-}
-
-cv::gpu::CudaMem::CudaMem(Size _size, int _type, int _alloc_type)
-    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(0)
-{
-    if( _size.height > 0 && _size.width > 0 )
-        create( _size.height, _size.width, _type, _alloc_type);
-}
-
-cv::gpu::CudaMem::CudaMem(const CudaMem& m)
-    : flags(m.flags), rows(m.rows), cols(m.cols), step(m.step), data(m.data), refcount(m.refcount), datastart(m.datastart), dataend(m.dataend), alloc_type(m.alloc_type)
-{
-    if( refcount )
-        CV_XADD(refcount, 1);
-}
-
-cv::gpu::CudaMem::CudaMem(const Mat& m, int _alloc_type)
-    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(0)
-{
-    if( m.rows > 0 && m.cols > 0 )
-        create( m.size(), m.type(), _alloc_type);
-
-    Mat tmp = createMatHeader();
-    m.copyTo(tmp);
-}
-
-cv::gpu::CudaMem::~CudaMem()
-{
-    release();
-}
-
-CudaMem& cv::gpu::CudaMem::operator = (const CudaMem& m)
-{
-    if( this != &m )
-    {
-        if( m.refcount )
-            CV_XADD(m.refcount, 1);
-        release();
-        flags = m.flags;
-        rows = m.rows; cols = m.cols;
-        step = m.step; data = m.data;
-        datastart = m.datastart;
-        dataend = m.dataend;
-        refcount = m.refcount;
-        alloc_type = m.alloc_type;
-    }
-    return *this;
-}
-
-CudaMem cv::gpu::CudaMem::clone() const
-{
-    CudaMem m(size(), type(), alloc_type);
-    Mat to = m;
-    Mat from = *this;
-    from.copyTo(to);
-    return m;
-}
-
-void cv::gpu::CudaMem::create(Size _size, int _type, int _alloc_type)
-{
-    create(_size.height, _size.width, _type, _alloc_type);
-}
-
-Mat cv::gpu::CudaMem::createMatHeader() const
-{
-    return Mat(size(), type(), data, step);
-}
-
-cv::gpu::CudaMem::operator Mat() const
-{
-    return createMatHeader();
-}
-
-cv::gpu::CudaMem::operator GpuMat() const
-{
-    return createGpuMatHeader();
-}
-
-bool cv::gpu::CudaMem::isContinuous() const
-{
-    return (flags & Mat::CONTINUOUS_FLAG) != 0;
-}
-
-size_t cv::gpu::CudaMem::elemSize() const
-{
-    return CV_ELEM_SIZE(flags);
-}
-
-size_t cv::gpu::CudaMem::elemSize1() const
-{
-    return CV_ELEM_SIZE1(flags);
-}
-
-int cv::gpu::CudaMem::type() const
-{
-    return CV_MAT_TYPE(flags);
-}
-
-int cv::gpu::CudaMem::depth() const
-{
-    return CV_MAT_DEPTH(flags);
-}
-
-int cv::gpu::CudaMem::channels() const
-{
-    return CV_MAT_CN(flags);
-}
-
-size_t cv::gpu::CudaMem::step1() const
-{
-    return step/elemSize1();
-}
-
-Size cv::gpu::CudaMem::size() const
-{
-    return Size(cols, rows);
-}
-
-bool cv::gpu::CudaMem::empty() const
-{
-    return data == 0;
-}
-
-#if !defined (HAVE_CUDA)
-
-void cv::gpu::registerPageLocked(Mat&) { throw_no_cuda(); }
-void cv::gpu::unregisterPageLocked(Mat&) { throw_no_cuda(); }
-void cv::gpu::CudaMem::create(int, int, int, int) { throw_no_cuda(); }
-bool cv::gpu::CudaMem::canMapHostMemory() { throw_no_cuda(); return false; }
-void cv::gpu::CudaMem::release() { throw_no_cuda(); }
-GpuMat cv::gpu::CudaMem::createGpuMatHeader () const { throw_no_cuda(); return GpuMat(); }
-
-#else /* !defined (HAVE_CUDA) */
-
-void cv::gpu::registerPageLocked(Mat& m)
-{
-    cudaSafeCall( cudaHostRegister(m.ptr(), m.step * m.rows, cudaHostRegisterPortable) );
-}
-
-void cv::gpu::unregisterPageLocked(Mat& m)
-{
-    cudaSafeCall( cudaHostUnregister(m.ptr()) );
-}
-
-bool cv::gpu::CudaMem::canMapHostMemory()
-{
-    cudaDeviceProp prop;
-    cudaSafeCall( cudaGetDeviceProperties(&prop, getDevice()) );
-    return (prop.canMapHostMemory != 0) ? true : false;
-}
-
-namespace
-{
-    size_t alignUpStep(size_t what, size_t alignment)
-    {
-        size_t alignMask = alignment-1;
-        size_t inverseAlignMask = ~alignMask;
-        size_t res = (what + alignMask) & inverseAlignMask;
-        return res;
-    }
-}
-
-void cv::gpu::CudaMem::create(int _rows, int _cols, int _type, int _alloc_type)
-{
-    if (_alloc_type == ALLOC_ZEROCOPY && !canMapHostMemory())
-        CV_Error(cv::Error::GpuApiCallError, "ZeroCopy is not supported by current device");
-
-    _type &= Mat::TYPE_MASK;
-    if( rows == _rows && cols == _cols && type() == _type && data )
-        return;
-    if( data )
-        release();
-    CV_DbgAssert( _rows >= 0 && _cols >= 0 );
-    if( _rows > 0 && _cols > 0 )
-    {
-        flags = Mat::MAGIC_VAL + Mat::CONTINUOUS_FLAG + _type;
-        rows = _rows;
-        cols = _cols;
-        step = elemSize()*cols;
-        if (_alloc_type == ALLOC_ZEROCOPY)
-        {
-            cudaDeviceProp prop;
-            cudaSafeCall( cudaGetDeviceProperties(&prop, getDevice()) );
-            step = alignUpStep(step, prop.textureAlignment);
-        }
-        int64 _nettosize = (int64)step*rows;
-        size_t nettosize = (size_t)_nettosize;
-        if( _nettosize != (int64)nettosize )
-            CV_Error(CV_StsNoMem, "Too big buffer is allocated");
-        size_t datasize = alignSize(nettosize, (int)sizeof(*refcount));
-
-        //datastart = data = (uchar*)fastMalloc(datasize + sizeof(*refcount));
-        alloc_type = _alloc_type;
-        void *ptr = 0;
-
-        switch (alloc_type)
-        {
-        case ALLOC_PAGE_LOCKED:    cudaSafeCall( cudaHostAlloc( &ptr, datasize, cudaHostAllocDefault) ); break;
-        case ALLOC_ZEROCOPY:       cudaSafeCall( cudaHostAlloc( &ptr, datasize, cudaHostAllocMapped) );  break;
-        case ALLOC_WRITE_COMBINED: cudaSafeCall( cudaHostAlloc( &ptr, datasize, cudaHostAllocWriteCombined) ); break;
-        default:                   CV_Error(cv::Error::StsBadFlag, "Invalid alloc type");
-        }
-
-        datastart = data =  (uchar*)ptr;
-        dataend = data + nettosize;
-
-        refcount = (int*)cv::fastMalloc(sizeof(*refcount));
-        *refcount = 1;
-    }
-}
-
-GpuMat cv::gpu::CudaMem::createGpuMatHeader () const
-{
-    CV_Assert( alloc_type == ALLOC_ZEROCOPY );
-
-    GpuMat res;
-
-    void *pdev;
-    cudaSafeCall( cudaHostGetDevicePointer( &pdev, data, 0 ) );
-    res = GpuMat(rows, cols, type(), pdev, step);
-
-    return res;
-}
-
-void cv::gpu::CudaMem::release()
-{
-    if( refcount && CV_XADD(refcount, -1) == 1 )
-    {
-        cudaSafeCall( cudaFreeHost(datastart ) );
-        fastFree(refcount);
-    }
-    data = datastart = dataend = 0;
-    step = rows = cols = 0;
-    refcount = 0;
-}
-
-#endif /* !defined (HAVE_CUDA) */
--- a/modules/core/src/opengl_interop.cpp
+++ b/modules/core/src/opengl_interop.cpp
@ -55,62 +55,61 @@ using namespace cv::gpu;
 namespace
 {
    #ifndef HAVE_OPENGL
-        void throw_no_ogl() { CV_Error(CV_OpenGlNotSupported, "The library is compiled without OpenGL support"); }
+        inline void throw_no_ogl() { CV_Error(cv::Error::OpenGlNotSupported, "The library is compiled without OpenGL support"); }
    #else
-        void throw_no_ogl() { CV_Error(CV_OpenGlApiCallError, "OpenGL context doesn't exist"); }
+        inline void throw_no_ogl() { CV_Error(cv::Error::OpenGlApiCallError, "OpenGL context doesn't exist"); }
    #endif

-bool checkError(const char* file, const int line, const char* func = 0)
-{
-#ifndef HAVE_OPENGL
-    (void) file;
-    (void) line;
-    (void) func;
-    return true;
-#else
-    GLenum err = gl::GetError();
-
-    if (err != gl::NO_ERROR_)
+    bool checkError(const char* file, const int line, const char* func = 0)
    {
-        const char* msg;
+    #ifndef HAVE_OPENGL
+        (void) file;
+        (void) line;
+        (void) func;
+        return true;
+    #else
+        GLenum err = gl::GetError();

-        switch (err)
+        if (err != gl::NO_ERROR_)
        {
-        case gl::INVALID_ENUM:
-            msg = "An unacceptable value is specified for an enumerated argument";
-            break;
+            const char* msg;

-        case gl::INVALID_VALUE:
-            msg = "A numeric argument is out of range";
-            break;
+            switch (err)
+            {
+            case gl::INVALID_ENUM:
+                msg = "An unacceptable value is specified for an enumerated argument";
+                break;

-        case gl::INVALID_OPERATION:
-            msg = "The specified operation is not allowed in the current state";
-            break;
+            case gl::INVALID_VALUE:
+                msg = "A numeric argument is out of range";
+                break;

-        case gl::OUT_OF_MEMORY:
-            msg = "There is not enough memory left to execute the command";
-            break;
+            case gl::INVALID_OPERATION:
+                msg = "The specified operation is not allowed in the current state";
+                break;

-        default:
-            msg = "Unknown error";
-        };
+            case gl::OUT_OF_MEMORY:
+                msg = "There is not enough memory left to execute the command";
+                break;

-        cvError(CV_OpenGlApiCallError, func, msg, file, line);
+            default:
+                msg = "Unknown error";
+            };

-        return false;
+            cvError(CV_OpenGlApiCallError, func, msg, file, line);
+
+            return false;
+        }
+
+        return true;
+    #endif
    }

-    return true;
-#endif
-}
-
-#if defined(__GNUC__)
-    #define CV_CheckGlError() CV_DbgAssert( (checkError(__FILE__, __LINE__, __func__)) )
-#else
-    #define CV_CheckGlError() CV_DbgAssert( (checkError(__FILE__, __LINE__)) )
-#endif
-
+    #if defined(__GNUC__)
+        #define CV_CheckGlError() CV_DbgAssert( (checkError(__FILE__, __LINE__, __func__)) )
+    #else
+        #define CV_CheckGlError() CV_DbgAssert( (checkError(__FILE__, __LINE__)) )
+    #endif
 } // namespace

 #ifdef HAVE_OPENGL
@ -129,7 +128,7 @@ void cv::gpu::setGlDevice(int device)
    (void) device;
    throw_no_ogl();
 #else
-    #if !defined(HAVE_CUDA) || defined(CUDA_DISABLER)
+    #ifndef HAVE_CUDA
        (void) device;
        throw_no_cuda();
    #else
@ -141,7 +140,7 @@ void cv::gpu::setGlDevice(int device)
 ////////////////////////////////////////////////////////////////////////
 // CudaResource

-#if defined(HAVE_OPENGL) && defined(HAVE_CUDA) && !defined(CUDA_DISABLER)
+#if defined(HAVE_OPENGL) && defined(HAVE_CUDA)

 namespace
 {
@ -353,12 +352,13 @@ const Ptr<cv::ogl::Buffer::Impl>& cv::ogl::Buffer::Impl::empty()
    return p;
 }

-cv::ogl::Buffer::Impl::Impl() : bufId_(0), autoRelease_(true)
+cv::ogl::Buffer::Impl::Impl() : bufId_(0), autoRelease_(false)
 {
 }

 cv::ogl::Buffer::Impl::Impl(GLuint abufId, bool autoRelease) : bufId_(abufId), autoRelease_(autoRelease)
 {
+    CV_Assert( gl::IsBuffer(abufId) == gl::TRUE_ );
 }

 cv::ogl::Buffer::Impl::Impl(GLsizeiptr size, const GLvoid* data, GLenum target, bool autoRelease) : bufId_(0), autoRelease_(autoRelease)
@ -437,29 +437,31 @@ void cv::ogl::Buffer::Impl::unmapHost()
 }

 #ifdef HAVE_CUDA
-    void cv::ogl::Buffer::Impl::copyFrom(const void* src, size_t spitch, size_t width, size_t height, cudaStream_t stream)
-    {
-        cudaResource_.registerBuffer(bufId_);
-        cudaResource_.copyFrom(src, spitch, width, height, stream);
-    }

-    void cv::ogl::Buffer::Impl::copyTo(void* dst, size_t dpitch, size_t width, size_t height, cudaStream_t stream) const
-    {
-        cudaResource_.registerBuffer(bufId_);
-        cudaResource_.copyTo(dst, dpitch, width, height, stream);
-    }
+void cv::ogl::Buffer::Impl::copyFrom(const void* src, size_t spitch, size_t width, size_t height, cudaStream_t stream)
+{
+    cudaResource_.registerBuffer(bufId_);
+    cudaResource_.copyFrom(src, spitch, width, height, stream);
+}

-    void* cv::ogl::Buffer::Impl::mapDevice(cudaStream_t stream)
-    {
-        cudaResource_.registerBuffer(bufId_);
-        return cudaResource_.map(stream);
-    }
+void cv::ogl::Buffer::Impl::copyTo(void* dst, size_t dpitch, size_t width, size_t height, cudaStream_t stream) const
+{
+    cudaResource_.registerBuffer(bufId_);
+    cudaResource_.copyTo(dst, dpitch, width, height, stream);
+}

-    void cv::ogl::Buffer::Impl::unmapDevice(cudaStream_t stream)
-    {
-        cudaResource_.unmap(stream);
-    }
-#endif
+void* cv::ogl::Buffer::Impl::mapDevice(cudaStream_t stream)
+{
+    cudaResource_.registerBuffer(bufId_);
+    return cudaResource_.map(stream);
+}
+
+void cv::ogl::Buffer::Impl::unmapDevice(cudaStream_t stream)
+{
+    cudaResource_.unmap(stream);
+}
+
+#endif // HAVE_CUDA

 #endif // HAVE_OPENGL

@ -505,16 +507,6 @@ cv::ogl::Buffer::Buffer(Size asize, int atype, unsigned int abufId, bool autoRel
 #endif
 }

-cv::ogl::Buffer::Buffer(int arows, int acols, int atype, Target target, bool autoRelease) : rows_(0), cols_(0), type_(0)
-{
-    create(arows, acols, atype, target, autoRelease);
-}
-
-cv::ogl::Buffer::Buffer(Size asize, int atype, Target target, bool autoRelease) : rows_(0), cols_(0), type_(0)
-{
-    create(asize, atype, target, autoRelease);
-}
-
 cv::ogl::Buffer::Buffer(InputArray arr, Target target, bool autoRelease) : rows_(0), cols_(0), type_(0)
 {
 #ifndef HAVE_OPENGL
@ -528,22 +520,9 @@ cv::ogl::Buffer::Buffer(InputArray arr, Target target, bool autoRelease) : rows_
    switch (kind)
    {
    case _InputArray::OPENGL_BUFFER:
-        {
-            copyFrom(arr, target, autoRelease);
-            break;
-        }
-
-    case _InputArray::OPENGL_TEXTURE:
-        {
-            copyFrom(arr, target, autoRelease);
-            break;
-        }
-
    case _InputArray::GPU_MAT:
-        {
-            copyFrom(arr, target, autoRelease);
-            break;
-        }
+        copyFrom(arr, target, autoRelease);
+        break;

    default:
        {
@ -613,14 +592,6 @@ void cv::ogl::Buffer::copyFrom(InputArray arr, Target target, bool autoRelease)
 #else
    const int kind = arr.kind();

-    if (kind == _InputArray::OPENGL_TEXTURE)
-    {
-        ogl::Texture2D tex = arr.getOGlTexture2D();
-        tex.copyTo(*this);
-        setAutoRelease(autoRelease);
-        return;
-    }
-
    const Size asize = arr.size();
    const int atype = arr.type();
    create(asize, atype, target, autoRelease);
@ -636,7 +607,7 @@ void cv::ogl::Buffer::copyFrom(InputArray arr, Target target, bool autoRelease)

    case _InputArray::GPU_MAT:
        {
-            #if !defined HAVE_CUDA || defined(CUDA_DISABLER)
+            #ifndef HAVE_CUDA
                throw_no_cuda();
            #else
                GpuMat dmat = arr.getGpuMat();
@ -656,13 +627,36 @@ void cv::ogl::Buffer::copyFrom(InputArray arr, Target target, bool autoRelease)
 #endif
 }

-void cv::ogl::Buffer::copyTo(OutputArray arr, Target target, bool autoRelease) const
+void cv::ogl::Buffer::copyFrom(InputArray arr, gpu::Stream& stream, Target target, bool autoRelease)
 {
 #ifndef HAVE_OPENGL
    (void) arr;
+    (void) stream;
    (void) target;
    (void) autoRelease;
    throw_no_ogl();
+#else
+    #ifndef HAVE_CUDA
+        (void) arr;
+        (void) stream;
+        (void) target;
+        (void) autoRelease;
+        throw_no_cuda();
+    #else
+        GpuMat dmat = arr.getGpuMat();
+
+        create(dmat.size(), dmat.type(), target, autoRelease);
+
+        impl_->copyFrom(dmat.data, dmat.step, dmat.cols * dmat.elemSize(), dmat.rows, gpu::StreamAccessor::getStream(stream));
+    #endif
+#endif
+}
+
+void cv::ogl::Buffer::copyTo(OutputArray arr) const
+{
+#ifndef HAVE_OPENGL
+    (void) arr;
+    throw_no_ogl();
 #else
    const int kind = arr.kind();

@ -670,19 +664,13 @@ void cv::ogl::Buffer::copyTo(OutputArray arr, Target target, bool autoRelease) c
    {
    case _InputArray::OPENGL_BUFFER:
        {
-            arr.getOGlBufferRef().copyFrom(*this, target, autoRelease);
-            break;
-        }
-
-    case _InputArray::OPENGL_TEXTURE:
-        {
-            arr.getOGlTexture2DRef().copyFrom(*this, autoRelease);
+            arr.getOGlBufferRef().copyFrom(*this);
            break;
        }

    case _InputArray::GPU_MAT:
        {
-            #if !defined HAVE_CUDA || defined(CUDA_DISABLER)
+            #ifndef HAVE_CUDA
                throw_no_cuda();
            #else
                GpuMat& dmat = arr.getGpuMatRef();
@ -704,6 +692,25 @@ void cv::ogl::Buffer::copyTo(OutputArray arr, Target target, bool autoRelease) c
 #endif
 }

+void cv::ogl::Buffer::copyTo(OutputArray arr, gpu::Stream& stream) const
+{
+#ifndef HAVE_OPENGL
+    (void) arr;
+    (void) stream;
+    throw_no_ogl();
+#else
+    #ifndef HAVE_CUDA
+        (void) arr;
+        (void) stream;
+        throw_no_cuda();
+    #else
+        arr.create(rows_, cols_, type_);
+        GpuMat dmat = arr.getGpuMat();
+        impl_->copyTo(dmat.data, dmat.step, dmat.cols * dmat.elemSize(), dmat.rows, gpu::StreamAccessor::getStream(stream));
+    #endif
+#endif
+}
+
 cv::ogl::Buffer cv::ogl::Buffer::clone(Target target, bool autoRelease) const
 {
 #ifndef HAVE_OPENGL
@ -765,7 +772,7 @@ GpuMat cv::ogl::Buffer::mapDevice()
    throw_no_ogl();
    return GpuMat();
 #else
-    #if !defined HAVE_CUDA || defined(CUDA_DISABLER)
+    #ifndef HAVE_CUDA
        throw_no_cuda();
        return GpuMat();
    #else
@ -779,7 +786,7 @@ void cv::ogl::Buffer::unmapDevice()
 #ifndef HAVE_OPENGL
    throw_no_ogl();
 #else
-    #if !defined HAVE_CUDA || defined(CUDA_DISABLER)
+    #ifndef HAVE_CUDA
        throw_no_cuda();
    #else
        impl_->unmapDevice();
@ -787,6 +794,38 @@ void cv::ogl::Buffer::unmapDevice()
 #endif
 }

+gpu::GpuMat cv::ogl::Buffer::mapDevice(gpu::Stream& stream)
+{
+#ifndef HAVE_OPENGL
+    (void) stream;
+    throw_no_ogl();
+    return GpuMat();
+#else
+    #ifndef HAVE_CUDA
+        (void) stream;
+        throw_no_cuda();
+        return GpuMat();
+    #else
+        return GpuMat(rows_, cols_, type_, impl_->mapDevice(gpu::StreamAccessor::getStream(stream)));
+    #endif
+#endif
+}
+
+void cv::ogl::Buffer::unmapDevice(gpu::Stream& stream)
+{
+#ifndef HAVE_OPENGL
+    (void) stream;
+    throw_no_ogl();
+#else
+    #ifndef HAVE_CUDA
+        (void) stream;
+        throw_no_cuda();
+    #else
+        impl_->unmapDevice(gpu::StreamAccessor::getStream(stream));
+    #endif
+#endif
+}
+
 unsigned int cv::ogl::Buffer::bufId() const
 {
 #ifndef HAVE_OPENGL
@ -844,12 +883,13 @@ const Ptr<cv::ogl::Texture2D::Impl> cv::ogl::Texture2D::Impl::empty()
    return p;
 }

-cv::ogl::Texture2D::Impl::Impl() : texId_(0), autoRelease_(true)
+cv::ogl::Texture2D::Impl::Impl() : texId_(0), autoRelease_(false)
 {
 }

 cv::ogl::Texture2D::Impl::Impl(GLuint atexId, bool autoRelease) : texId_(atexId), autoRelease_(autoRelease)
 {
+    CV_Assert( gl::IsTexture(atexId) == gl::TRUE_ );
 }

 cv::ogl::Texture2D::Impl::Impl(GLint internalFormat, GLsizei width, GLsizei height, GLenum format, GLenum type, const GLvoid* pixels, bool autoRelease) : texId_(0), autoRelease_(autoRelease)
@ -955,16 +995,6 @@ cv::ogl::Texture2D::Texture2D(Size asize, Format aformat, unsigned int atexId, b
 #endif
 }

-cv::ogl::Texture2D::Texture2D(int arows, int acols, Format aformat, bool autoRelease) : rows_(0), cols_(0), format_(NONE)
-{
-    create(arows, acols, aformat, autoRelease);
-}
-
-cv::ogl::Texture2D::Texture2D(Size asize, Format aformat, bool autoRelease) : rows_(0), cols_(0), format_(NONE)
-{
-    create(asize, aformat, autoRelease);
-}
-
 cv::ogl::Texture2D::Texture2D(InputArray arr, bool autoRelease) : rows_(0), cols_(0), format_(NONE)
 {
 #ifndef HAVE_OPENGL
@ -1005,7 +1035,7 @@ cv::ogl::Texture2D::Texture2D(InputArray arr, bool autoRelease) : rows_(0), cols

    case _InputArray::GPU_MAT:
        {
-            #if !defined HAVE_CUDA || defined(CUDA_DISABLER)
+            #ifndef HAVE_CUDA
                throw_no_cuda();
            #else
                GpuMat dmat = arr.getGpuMat();
@ -1118,7 +1148,7 @@ void cv::ogl::Texture2D::copyFrom(InputArray arr, bool autoRelease)

    case _InputArray::GPU_MAT:
        {
-            #if !defined HAVE_CUDA || defined(CUDA_DISABLER)
+            #ifndef HAVE_CUDA
                throw_no_cuda();
            #else
                GpuMat dmat = arr.getGpuMat();
@ -1169,7 +1199,7 @@ void cv::ogl::Texture2D::copyTo(OutputArray arr, int ddepth, bool autoRelease) c

    case _InputArray::GPU_MAT:
        {
-            #if !defined HAVE_CUDA || defined(CUDA_DISABLER)
+            #ifndef HAVE_CUDA
                throw_no_cuda();
            #else
                ogl::Buffer buf(rows_, cols_, CV_MAKE_TYPE(ddepth, cn), ogl::Buffer::PIXEL_PACK_BUFFER);
@ -1221,10 +1251,6 @@ template <> void cv::Ptr<cv::ogl::Texture2D::Impl>::delete_obj()
 ////////////////////////////////////////////////////////////////////////
 // ogl::Arrays

-cv::ogl::Arrays::Arrays() : size_(0)
-{
-}
-
 void cv::ogl::Arrays::setVertexArray(InputArray vertex)
 {
    const int cn = vertex.channels();
--- a/modules/core/src/precomp.hpp
+++ b/modules/core/src/precomp.hpp
@ -45,11 +45,11 @@

 #include "opencv2/core/utility.hpp"
 #include "opencv2/core/core_c.h"
-#include "opencv2/core/gpumat.hpp"
+#include "opencv2/core/gpu.hpp"
 #include "opencv2/core/opengl.hpp"

 #include "opencv2/core/private.hpp"
-#include "opencv2/core/gpu_private.hpp"
+#include "opencv2/core/private.gpu.hpp"

 #include <assert.h>
 #include <ctype.h>
--- a/modules/gpu/doc/data_structures.rst
+++ b/modules/gpu/doc/data_structures.rst
@ -6,32 +6,22 @@ Data Structures


 gpu::PtrStepSz
---------------
+--------------
 .. ocv:class:: gpu::PtrStepSz

 Lightweight class encapsulating pitched memory on a GPU and passed to nvcc-compiled code (CUDA kernels). Typically, it is used internally by OpenCV and by users who write device code. You can call its members from both host and device code. ::

-    template <typename T> struct PtrStepSz
+    template <typename T> struct PtrStepSz : public PtrStep<T>
    {
-        int cols;
-        int rows;
-        T* data;
-        size_t step;
-
-        PtrStepSz() : cols(0), rows(0), data(0), step(0){};
-        PtrStepSz(int rows, int cols, T *data, size_t step);
+        __CV_GPU_HOST_DEVICE__ PtrStepSz() : cols(0), rows(0) {}
+        __CV_GPU_HOST_DEVICE__ PtrStepSz(int rows_, int cols_, T* data_, size_t step_)
+            : PtrStep<T>(data_, step_), cols(cols_), rows(rows_) {}

        template <typename U>
-        explicit PtrStepSz(const PtrStepSz<U>& d);
+        explicit PtrStepSz(const PtrStepSz<U>& d) : PtrStep<T>((T*)d.data, d.step), cols(d.cols), rows(d.rows){}

-        typedef T elem_type;
-        enum { elem_size = sizeof(elem_type) };
-
-        __CV_GPU_HOST_DEVICE__ size_t elemSize() const;
-
-        /* returns pointer to the beginning of the given image row */
-        __CV_GPU_HOST_DEVICE__ T* ptr(int y = 0);
-        __CV_GPU_HOST_DEVICE__ const T* ptr(int y = 0) const;
+        int cols;
+        int rows;
    };

    typedef PtrStepSz<unsigned char> PtrStepSzb;
@ -41,32 +31,32 @@ Lightweight class encapsulating pitched memory on a GPU and passed to nvcc-compi


 gpu::PtrStep
--------------
+------------
 .. ocv:class:: gpu::PtrStep

 Structure similar to :ocv:class:`gpu::PtrStepSz` but containing only a pointer and row step. Width and height fields are excluded due to performance reasons. The structure is intended for internal use or for users who write device code. ::

-    template<typename T> struct PtrStep
+    template <typename T> struct PtrStep : public DevPtr<T>
    {
-            T* data;
-            size_t step;
+        __CV_GPU_HOST_DEVICE__ PtrStep() : step(0) {}
+        __CV_GPU_HOST_DEVICE__ PtrStep(T* data_, size_t step_) : DevPtr<T>(data_), step(step_) {}

-            PtrStep();
-            PtrStep(const PtrStepSz<T>& mem);
+        //! stride between two consecutive rows in bytes. Step is stored always and everywhere in bytes!!!
+        size_t step;

-            typedef T elem_type;
-            enum { elem_size = sizeof(elem_type) };
+        __CV_GPU_HOST_DEVICE__       T* ptr(int y = 0)       { return (      T*)( (      char*)DevPtr<T>::data + y * step); }
+        __CV_GPU_HOST_DEVICE__ const T* ptr(int y = 0) const { return (const T*)( (const char*)DevPtr<T>::data + y * step); }

-            __CV_GPU_HOST_DEVICE__ size_t elemSize() const;
-            __CV_GPU_HOST_DEVICE__ T* ptr(int y = 0);
-            __CV_GPU_HOST_DEVICE__ const T* ptr(int y = 0) const;
+        __CV_GPU_HOST_DEVICE__       T& operator ()(int y, int x)       { return ptr(y)[x]; }
+        __CV_GPU_HOST_DEVICE__ const T& operator ()(int y, int x) const { return ptr(y)[x]; }
    };

-    typedef PtrStep<unsigned char> PtrStep;
+    typedef PtrStep<unsigned char> PtrStepb;
    typedef PtrStep<float> PtrStepf;
    typedef PtrStep<int> PtrStepi;


+
 gpu::GpuMat
 -----------
 .. ocv:class:: gpu::GpuMat
@ -89,28 +79,31 @@ Beware that the latter limitation may lead to overloaded matrix operators that c
            //! default constructor
            GpuMat();

+            //! constructs GpuMat of the specified size and type
            GpuMat(int rows, int cols, int type);
            GpuMat(Size size, int type);

            .....

-            //! builds GpuMat from Mat. Blocks uploading to device.
-            explicit GpuMat (const Mat& m);
+            //! builds GpuMat from host memory (Blocking call)
+            explicit GpuMat(InputArray arr);

            //! returns lightweight PtrStepSz structure for passing
            //to nvcc-compiled code. Contains size, data ptr and step.
            template <class T> operator PtrStepSz<T>() const;
            template <class T> operator PtrStep<T>() const;

-            //! blocks uploading data to GpuMat.
-            void upload(const cv::Mat& m);
-            void upload(const CudaMem& m, Stream& stream);
+            //! performs data upload to GpuMat (Blocking call)
+            void upload(InputArray arr);

-            //! downloads data from device to host memory. Blocking calls.
-            void download(cv::Mat& m) const;
+            //! performs data upload to GpuMat (Non-Blocking call)
+            void upload(InputArray arr, Stream& stream);

-            //! download async
-            void download(CudaMem& m, Stream& stream) const;
+            //! performs data download from device to host memory (Blocking call)
+            void download(OutputArray dst) const;
+
+            //! performs data download from device to host memory (Non-Blocking call)
+            void download(OutputArray dst, Stream& stream) const;
    };


@ -121,16 +114,10 @@ Beware that the latter limitation may lead to overloaded matrix operators that c


 gpu::createContinuous
-------------------------
-Creates a continuous matrix in the GPU memory.
+---------------------
+Creates a continuous matrix.

-.. ocv:function:: void gpu::createContinuous(int rows, int cols, int type, GpuMat& m)
-
-.. ocv:function:: GpuMat gpu::createContinuous(int rows, int cols, int type)
-
-.. ocv:function:: void gpu::createContinuous(Size size, int type, GpuMat& m)
-
-.. ocv:function:: GpuMat gpu::createContinuous(Size size, int type)
+.. ocv:function:: void gpu::createContinuous(int rows, int cols, int type, OutputArray arr)

    :param rows: Row count.

@ -138,36 +125,92 @@ Creates a continuous matrix in the GPU memory.

    :param type: Type of the matrix.

-    :param m: Destination matrix. This parameter changes only if it has a proper type and area ( :math:`\texttt{rows} \times \texttt{cols}` ).
+    :param arr: Destination matrix. This parameter changes only if it has a proper type and area ( :math:`\texttt{rows} \times \texttt{cols}` ).

 Matrix is called continuous if its elements are stored continuously, that is, without gaps at the end of each row.



 gpu::ensureSizeIsEnough
---------------------------
+-----------------------
 Ensures that the size of a matrix is big enough and the matrix has a proper type.

-.. ocv:function:: void gpu::ensureSizeIsEnough(int rows, int cols, int type, GpuMat& m)
-
-.. ocv:function:: void gpu::ensureSizeIsEnough(Size size, int type, GpuMat& m)
+.. ocv:function:: void gpu::ensureSizeIsEnough(int rows, int cols, int type, OutputArray arr)

    :param rows: Minimum desired number of rows.

    :param cols: Minimum desired number of columns.

-    :param size: Rows and columns passed as a structure.
-
    :param type: Desired matrix type.

-    :param m: Destination matrix.
+    :param arr: Destination matrix.

 The function does not reallocate memory if the matrix has proper attributes already.



+gpu::CudaMem
+------------
+.. ocv:class:: gpu::CudaMem
+
+Class with reference counting wrapping special memory type allocation functions from CUDA. Its interface is also :ocv:func:`Mat`-like but with additional memory type parameters.
+
+* **PAGE_LOCKED** sets a page locked memory type used commonly for fast and asynchronous uploading/downloading data from/to GPU.
+* **SHARED** specifies a zero copy memory allocation that enables mapping the host memory to GPU address space, if supported.
+* **WRITE_COMBINED**  sets the write combined buffer that is not cached by CPU. Such buffers are used to supply GPU with data when GPU only reads it. The advantage is a better CPU cache utilization.
+
+.. note:: Allocation size of such memory types is usually limited. For more details, see *CUDA 2.2 Pinned Memory APIs* document or *CUDA C Programming Guide*.
+
+::
+
+    class CV_EXPORTS CudaMem
+    {
+    public:
+        enum AllocType { PAGE_LOCKED = 1, SHARED = 2, WRITE_COMBINED = 4 };
+
+        explicit CudaMem(AllocType alloc_type = PAGE_LOCKED);
+
+        CudaMem(int rows, int cols, int type, AllocType alloc_type = PAGE_LOCKED);
+        CudaMem(Size size, int type, AllocType alloc_type = PAGE_LOCKED);
+
+        //! creates from host memory with copying data
+        explicit CudaMem(InputArray arr, AllocType alloc_type = PAGE_LOCKED);
+
+        ......
+
+        //! returns matrix header with disabled reference counting for CudaMem data.
+        Mat createMatHeader() const;
+
+        //! maps host memory into device address space and returns GpuMat header for it. Throws exception if not supported by hardware.
+        GpuMat createGpuMatHeader() const;
+
+        ......
+
+        AllocType alloc_type;
+    };
+
+
+
+gpu::CudaMem::createMatHeader
+-----------------------------
+Creates a header without reference counting to :ocv:class:`gpu::CudaMem` data.
+
+.. ocv:function:: Mat gpu::CudaMem::createMatHeader() const
+
+
+
+gpu::CudaMem::createGpuMatHeader
+--------------------------------
+Maps CPU memory to GPU address space and creates the :ocv:class:`gpu::GpuMat` header without reference counting for it.
+
+.. ocv:function:: GpuMat gpu::CudaMem::createGpuMatHeader() const
+
+This can be done only if memory was allocated with the ``SHARED`` flag and if it is supported by the hardware. Laptops often share video and CPU memory, so address spaces can be mapped, which eliminates an extra copy.
+
+
+
 gpu::registerPageLocked
-------------------------------
+-----------------------
 Page-locks the memory of matrix and maps it for the device(s).

 .. ocv:function:: void gpu::registerPageLocked(Mat& m)
@ -177,7 +220,7 @@ Page-locks the memory of matrix and maps it for the device(s).


 gpu::unregisterPageLocked
-------------------------------
+-------------------------
 Unmaps the memory of matrix and makes it pageable again.

 .. ocv:function:: void gpu::unregisterPageLocked(Mat& m)
@ -186,83 +229,11 @@ Unmaps the memory of matrix and makes it pageable again.



-gpu::CudaMem
------------
-.. ocv:class:: gpu::CudaMem
-
-Class with reference counting wrapping special memory type allocation functions from CUDA. Its interface is also
-:ocv:func:`Mat`-like but with additional memory type parameters.
-
-* **ALLOC_PAGE_LOCKED** sets a page locked memory type used commonly for fast and asynchronous uploading/downloading data from/to GPU.
-* **ALLOC_ZEROCOPY** specifies a zero copy memory allocation that enables mapping the host memory to GPU address space, if supported.
-* **ALLOC_WRITE_COMBINED**  sets the write combined buffer that is not cached by CPU. Such buffers are used to supply GPU with data when GPU only reads it. The advantage is a better CPU cache utilization.
-
-.. note:: Allocation size of such memory types is usually limited. For more details, see *CUDA 2.2 Pinned Memory APIs* document or *CUDA C Programming Guide*.
-
-::
-
-    class CV_EXPORTS CudaMem
-    {
-    public:
-            enum  { ALLOC_PAGE_LOCKED = 1, ALLOC_ZEROCOPY = 2,
-                     ALLOC_WRITE_COMBINED = 4 };
-
-            CudaMem(Size size, int type, int alloc_type = ALLOC_PAGE_LOCKED);
-
-            //! creates from cv::Mat with coping data
-            explicit CudaMem(const Mat& m, int alloc_type = ALLOC_PAGE_LOCKED);
-
-             ......
-
-            void create(Size size, int type, int alloc_type = ALLOC_PAGE_LOCKED);
-
-            //! returns matrix header with disabled ref. counting for CudaMem data.
-            Mat createMatHeader() const;
-            operator Mat() const;
-
-            //! maps host memory into device address space
-            GpuMat createGpuMatHeader() const;
-            operator GpuMat() const;
-
-            //if host memory can be mapped to gpu address space;
-            static bool canMapHostMemory();
-
-            int alloc_type;
-    };
-
-
-
-gpu::CudaMem::createMatHeader
---------------------------------
-Creates a header without reference counting to :ocv:class:`gpu::CudaMem` data.
-
-.. ocv:function:: Mat gpu::CudaMem::createMatHeader() const
-
-
-
-gpu::CudaMem::createGpuMatHeader
------------------------------------
-Maps CPU memory to GPU address space and creates the :ocv:class:`gpu::GpuMat` header without reference counting for it.
-
-.. ocv:function:: GpuMat gpu::CudaMem::createGpuMatHeader() const
-
-This can be done only if memory was allocated with the ``ALLOC_ZEROCOPY`` flag and if it is supported by the hardware. Laptops often share video and CPU memory, so address spaces can be mapped, which eliminates an extra copy.
-
-
-
-gpu::CudaMem::canMapHostMemory
----------------------------------
-Returns ``true`` if the current hardware supports address space mapping and ``ALLOC_ZEROCOPY`` memory allocation.
-
-.. ocv:function:: static bool gpu::CudaMem::canMapHostMemory()
-
-
-
 gpu::Stream
 -----------
 .. ocv:class:: gpu::Stream

-This class encapsulates a queue of asynchronous calls. Some functions have overloads with the additional ``gpu::Stream`` parameter. The overloads do initialization work (allocate output buffers, upload constants, and so on), start the GPU kernel, and return before results are ready. You can check whether all operations are complete via :ocv:func:`gpu::Stream::queryIfComplete`. You can asynchronously upload/download data from/to page-locked buffers, using the :ocv:class:`gpu::CudaMem` or :ocv:class:`Mat` header that points to a region of :ocv:class:`gpu::CudaMem`.
+This class encapsulates a queue of asynchronous calls.

 .. note:: Currently, you may face problems if an operation is enqueued twice with different data. Some functions use the constant GPU memory, and next call may update the memory before the previous one has been finished. But calling different operations asynchronously is safe because each operation has its own constant buffer. Memory copy/upload/download/set operations to the buffers you hold are also safe.

@ -272,30 +243,24 @@ This class encapsulates a queue of asynchronous calls. Some functions have overl
    {
    public:
        Stream();
-        ~Stream();

-        Stream(const Stream&);
-        Stream& operator=(const Stream&);
+        //! queries an asynchronous stream for completion status
+        bool queryIfComplete() const;

-        bool queryIfComplete();
+        //! waits for stream tasks to complete
        void waitForCompletion();

-        void enqueueDownload(const GpuMat& src, CudaMem& dst);
-        void enqueueDownload(const GpuMat& src, Mat& dst);
+        //! makes a compute stream wait on an event
+        void waitEvent(const Event& event);

-        void enqueueUpload(const CudaMem& src, GpuMat& dst);
-        void enqueueUpload(const Mat& src, GpuMat& dst);
-
-        void enqueueCopy(const GpuMat& src, GpuMat& dst);
-
-        void enqueueMemSet(const GpuMat& src, Scalar val);
-        void enqueueMemSet(const GpuMat& src, Scalar val, const GpuMat& mask);
-
-        void enqueueConvert(const GpuMat& src, GpuMat& dst, int type,
-                            double a = 1, double b = 0);
-
-        typedef void (*StreamCallback)(Stream& stream, int status, void* userData);
+        //! adds a callback to be called on the host after all currently enqueued items in the stream have completed
        void enqueueHostCallback(StreamCallback callback, void* userData);
+
+        //! returns the Stream object for the default CUDA stream
+        static Stream& Null();
+
+        //! returns true if stream object is not default (!= 0)
+        operator bool_type() const;
    };


@ -316,53 +281,11 @@ Blocks the current CPU thread until all operations in the stream are complete.



-gpu::Stream::enqueueDownload
----------------------------
-Copies data from device to host.
+gpu::Stream::waitEvent
+----------------------
+Makes a compute stream wait on an event.

-.. ocv:function:: void gpu::Stream::enqueueDownload(const GpuMat& src, CudaMem& dst)
-
-.. ocv:function:: void gpu::Stream::enqueueDownload(const GpuMat& src, Mat& dst)
-
-.. note:: ``cv::Mat`` must point to page locked memory (i.e. to ``CudaMem`` data or to its subMat) or must be registered with :ocv:func:`gpu::registerPageLocked` .
-
-
-
-gpu::Stream::enqueueUpload
--------------------------
-Copies data from host to device.
-
-.. ocv:function:: void gpu::Stream::enqueueUpload(const CudaMem& src, GpuMat& dst)
-
-.. ocv:function:: void gpu::Stream::enqueueUpload(const Mat& src, GpuMat& dst)
-
-.. note:: ``cv::Mat`` must point to page locked memory (i.e. to ``CudaMem`` data or to its subMat) or must be registered with :ocv:func:`gpu::registerPageLocked` .
-
-
-
-gpu::Stream::enqueueCopy
------------------------
-Copies data from device to device.
-
-.. ocv:function:: void gpu::Stream::enqueueCopy(const GpuMat& src, GpuMat& dst)
-
-
-
-gpu::Stream::enqueueMemSet
--------------------------
-Initializes or sets device memory to a value.
-
-.. ocv:function:: void gpu::Stream::enqueueMemSet( GpuMat& src, Scalar val )
-
-.. ocv:function:: void gpu::Stream::enqueueMemSet( GpuMat& src, Scalar val, const GpuMat& mask )
-
-
-
-gpu::Stream::enqueueConvert
---------------------------
-Converts matrix type, ex from float to uchar depending on type.
-
-.. ocv:function:: void gpu::Stream::enqueueConvert( const GpuMat& src, GpuMat& dst, int dtype, double a=1, double b=0 )
+.. ocv:function:: void gpu::Stream::waitEvent(const Event& event)



--- a/modules/gpu/doc/initalization_and_information.rst
+++ b/modules/gpu/doc/initalization_and_information.rst
@ -107,23 +107,186 @@ Class providing functionality for querying the specified GPU properties. ::
    class CV_EXPORTS DeviceInfo
    {
    public:
+        //! creates DeviceInfo object for the current GPU
        DeviceInfo();
+
+        //! creates DeviceInfo object for the given GPU
        DeviceInfo(int device_id);

-        String name() const;
+        //! ASCII string identifying device
+        const char* name() const;

-        int majorVersion() const;
-        int minorVersion() const;
+        //! global memory available on device in bytes
+        size_t totalGlobalMem() const;

+        //! shared memory available per block in bytes
+        size_t sharedMemPerBlock() const;
+
+        //! 32-bit registers available per block
+        int regsPerBlock() const;
+
+        //! warp size in threads
+        int warpSize() const;
+
+        //! maximum pitch in bytes allowed by memory copies
+        size_t memPitch() const;
+
+        //! maximum number of threads per block
+        int maxThreadsPerBlock() const;
+
+        //! maximum size of each dimension of a block
+        Vec3i maxThreadsDim() const;
+
+        //! maximum size of each dimension of a grid
+        Vec3i maxGridSize() const;
+
+        //! clock frequency in kilohertz
+        int clockRate() const;
+
+        //! constant memory available on device in bytes
+        size_t totalConstMem() const;
+
+        //! major compute capability
+        int major() const;
+
+        //! minor compute capability
+        int minor() const;
+
+        //! alignment requirement for textures
+        size_t textureAlignment() const;
+
+        //! pitch alignment requirement for texture references bound to pitched memory
+        size_t texturePitchAlignment() const;
+
+        //! number of multiprocessors on device
        int multiProcessorCount() const;

+        //! specifies whether there is a run time limit on kernels
+        bool kernelExecTimeoutEnabled() const;
+
+        //! device is integrated as opposed to discrete
+        bool integrated() const;
+
+        //! device can map host memory with cudaHostAlloc/cudaHostGetDevicePointer
+        bool canMapHostMemory() const;
+
+        enum ComputeMode
+        {
+            ComputeModeDefault,         /**< default compute mode (Multiple threads can use ::cudaSetDevice() with this device) */
+            ComputeModeExclusive,       /**< compute-exclusive-thread mode (Only one thread in one process will be able to use ::cudaSetDevice() with this device) */
+            ComputeModeProhibited,      /**< compute-prohibited mode (No threads can use ::cudaSetDevice() with this device) */
+            ComputeModeExclusiveProcess /**< compute-exclusive-process mode (Many threads in one process will be able to use ::cudaSetDevice() with this device) */
+        };
+
+        //! compute mode
+        ComputeMode computeMode() const;
+
+        //! maximum 1D texture size
+        int maxTexture1D() const;
+
+        //! maximum 1D mipmapped texture size
+        int maxTexture1DMipmap() const;
+
+        //! maximum size for 1D textures bound to linear memory
+        int maxTexture1DLinear() const;
+
+        //! maximum 2D texture dimensions
+        Vec2i maxTexture2D() const;
+
+        //! maximum 2D mipmapped texture dimensions
+        Vec2i maxTexture2DMipmap() const;
+
+        //! maximum dimensions (width, height, pitch) for 2D textures bound to pitched memory
+        Vec3i maxTexture2DLinear() const;
+
+        //! maximum 2D texture dimensions if texture gather operations have to be performed
+        Vec2i maxTexture2DGather() const;
+
+        //! maximum 3D texture dimensions
+        Vec3i maxTexture3D() const;
+
+        //! maximum Cubemap texture dimensions
+        int maxTextureCubemap() const;
+
+        //! maximum 1D layered texture dimensions
+        Vec2i maxTexture1DLayered() const;
+
+        //! maximum 2D layered texture dimensions
+        Vec3i maxTexture2DLayered() const;
+
+        //! maximum Cubemap layered texture dimensions
+        Vec2i maxTextureCubemapLayered() const;
+
+        //! maximum 1D surface size
+        int maxSurface1D() const;
+
+        //! maximum 2D surface dimensions
+        Vec2i maxSurface2D() const;
+
+        //! maximum 3D surface dimensions
+        Vec3i maxSurface3D() const;
+
+        //! maximum 1D layered surface dimensions
+        Vec2i maxSurface1DLayered() const;
+
+        //! maximum 2D layered surface dimensions
+        Vec3i maxSurface2DLayered() const;
+
+        //! maximum Cubemap surface dimensions
+        int maxSurfaceCubemap() const;
+
+        //! maximum Cubemap layered surface dimensions
+        Vec2i maxSurfaceCubemapLayered() const;
+
+        //! alignment requirements for surfaces
+        size_t surfaceAlignment() const;
+
+        //! device can possibly execute multiple kernels concurrently
+        bool concurrentKernels() const;
+
+        //! device has ECC support enabled
+        bool ECCEnabled() const;
+
+        //! PCI bus ID of the device
+        int pciBusID() const;
+
+        //! PCI device ID of the device
+        int pciDeviceID() const;
+
+        //! PCI domain ID of the device
+        int pciDomainID() const;
+
+        //! true if device is a Tesla device using TCC driver, false otherwise
+        bool tccDriver() const;
+
+        //! number of asynchronous engines
+        int asyncEngineCount() const;
+
+        //! device shares a unified address space with the host
+        bool unifiedAddressing() const;
+
+        //! peak memory clock frequency in kilohertz
+        int memoryClockRate() const;
+
+        //! global memory bus width in bits
+        int memoryBusWidth() const;
+
+        //! size of L2 cache in bytes
+        int l2CacheSize() const;
+
+        //! maximum resident threads per multiprocessor
+        int maxThreadsPerMultiProcessor() const;
+
+        //! gets free and total device memory
+        void queryMemory(size_t& totalMemory, size_t& freeMemory) const;
        size_t freeMemory() const;
        size_t totalMemory() const;

-        bool supports(FeatureSet feature) const;
-        bool isCompatible() const;
+        //! checks whether device supports the given feature
+        bool supports(FeatureSet feature_set) const;

-        int deviceID() const;
+        //! checks whether the GPU module can be run on the given device
+        bool isCompatible() const;
    };


@ -146,31 +309,23 @@ gpu::DeviceInfo::name
 ---------------------
 Returns the device name.

-.. ocv:function:: String gpu::DeviceInfo::name() const
+.. ocv:function:: const char* gpu::DeviceInfo::name() const



-gpu::DeviceInfo::majorVersion
-----------------------------
+gpu::DeviceInfo::major
+----------------------
 Returns the major compute capability version.

-.. ocv:function:: int gpu::DeviceInfo::majorVersion()
+.. ocv:function:: int gpu::DeviceInfo::major() const



-gpu::DeviceInfo::minorVersion
-----------------------------
+gpu::DeviceInfo::minor
+----------------------
 Returns the minor compute capability version.

-.. ocv:function:: int gpu::DeviceInfo::minorVersion()
-
-
-
-gpu::DeviceInfo::multiProcessorCount
------------------------------------
-Returns the number of streaming multiprocessors.
-
-.. ocv:function:: int gpu::DeviceInfo::multiProcessorCount()
+.. ocv:function:: int gpu::DeviceInfo::minor() const



@ -194,7 +349,7 @@ gpu::DeviceInfo::supports
 -------------------------
 Provides information on GPU feature support.

-.. ocv:function:: bool gpu::DeviceInfo::supports( FeatureSet feature_set ) const
+.. ocv:function:: bool gpu::DeviceInfo::supports(FeatureSet feature_set) const

    :param feature_set: Features to be checked. See :ocv:enum:`gpu::FeatureSet`.

--- a/modules/gpu/include/opencv2/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu.hpp
@ -47,7 +47,7 @@
 #  error gpu.hpp header must be compiled as C++
 #endif

-#include "opencv2/core/gpumat.hpp"
+#include "opencv2/core/gpu.hpp"

 #if !defined(__OPENCV_BUILD) && !defined(OPENCV_GPU_SKIP_INCLUDE)
    #include "opencv2/opencv_modules.hpp"
--- a/modules/gpu/src/precomp.hpp
+++ b/modules/gpu/src/precomp.hpp
@ -49,7 +49,7 @@
 #include "opencv2/calib3d.hpp"
 #include "opencv2/objdetect.hpp"

-#include "opencv2/core/gpu_private.hpp"
+#include "opencv2/core/private.gpu.hpp"

 #include "opencv2/opencv_modules.hpp"

--- a/modules/gpu/test/test_opengl.cpp
+++ b/modules/gpu/test/test_opengl.cpp
@ -126,25 +126,6 @@ GPU_TEST_P(Buffer, ConstructorFromBuffer)
    EXPECT_EQ(buf_gold.type(), buf.type());
 }

-GPU_TEST_P(Buffer, ConstructorFromTexture2D)
-{
-    const int depth = CV_MAT_DEPTH(type);
-    const int cn = CV_MAT_CN(type);
-
-    if (depth != CV_32F || cn == 2)
-        return;
-
-    cv::Mat gold = randomMat(size, type, 0, 1.0);
-    cv::ogl::Texture2D tex_gold(gold, true);
-
-    cv::ogl::Buffer buf(tex_gold, cv::ogl::Buffer::PIXEL_PACK_BUFFER, true);
-
-    cv::Mat bufData;
-    buf.copyTo(bufData);
-
-    EXPECT_MAT_NEAR(gold, bufData, 1e-2);
-}
-
 GPU_TEST_P(Buffer, Create)
 {
    cv::ogl::Buffer buf;
@ -198,26 +179,6 @@ GPU_TEST_P(Buffer, CopyFromBuffer)
    EXPECT_MAT_NEAR(gold, bufData, 0);
 }

-GPU_TEST_P(Buffer, CopyFromTexture2D)
-{
-    const int depth = CV_MAT_DEPTH(type);
-    const int cn = CV_MAT_CN(type);
-
-    if (depth != CV_32F || cn == 2)
-        return;
-
-    cv::Mat gold = randomMat(size, type, 0, 1.0);
-    cv::ogl::Texture2D tex_gold(gold, true);
-
-    cv::ogl::Buffer buf;
-    buf.copyFrom(tex_gold, cv::ogl::Buffer::ARRAY_BUFFER, true);
-
-    cv::Mat bufData;
-    buf.copyTo(bufData);
-
-    EXPECT_MAT_NEAR(gold, bufData, 1e-2);
-}
-
 GPU_TEST_P(Buffer, CopyToGpuMat)
 {
    cv::Mat gold = randomMat(size, type);
@ -237,7 +198,8 @@ GPU_TEST_P(Buffer, CopyToBuffer)
    cv::ogl::Buffer buf(gold, cv::ogl::Buffer::ARRAY_BUFFER, true);

    cv::ogl::Buffer dst;
-    buf.copyTo(dst, cv::ogl::Buffer::ARRAY_BUFFER, true);
+    buf.copyTo(dst);
+    dst.setAutoRelease(true);

    EXPECT_NE(buf.bufId(), dst.bufId());

@ -247,27 +209,6 @@ GPU_TEST_P(Buffer, CopyToBuffer)
    EXPECT_MAT_NEAR(gold, bufData, 0);
 }

-GPU_TEST_P(Buffer, CopyToTexture2D)
-{
-    const int depth = CV_MAT_DEPTH(type);
-    const int cn = CV_MAT_CN(type);
-
-    if (depth != CV_32F || cn == 2)
-        return;
-
-    cv::Mat gold = randomMat(size, type, 0, 1.0);
-
-    cv::ogl::Buffer buf(gold, cv::ogl::Buffer::PIXEL_PACK_BUFFER, true);
-
-    cv::ogl::Texture2D tex;
-    buf.copyTo(tex, cv::ogl::Buffer::PIXEL_PACK_BUFFER, true);
-
-    cv::Mat texData;
-    tex.copyTo(texData);
-
-    EXPECT_MAT_NEAR(gold, texData, 1e-2);
-}
-
 GPU_TEST_P(Buffer, Clone)
 {
    cv::Mat gold = randomMat(size, type);
--- a/modules/gpuarithm/include/opencv2/gpuarithm.hpp
+++ b/modules/gpuarithm/include/opencv2/gpuarithm.hpp
@ -47,7 +47,7 @@
 #  error gpuarithm.hpp header must be compiled as C++
 #endif

-#include "opencv2/core/gpumat.hpp"
+#include "opencv2/core/gpu.hpp"

 namespace cv { namespace gpu {

--- a/modules/gpuarithm/src/arithm.cpp
+++ b/modules/gpuarithm/src/arithm.cpp
@ -217,10 +217,7 @@ void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const G
    {
        if (src3.empty())
        {
-            if (stream)
-                stream.enqueueMemSet(dst, Scalar::all(0));
-            else
-                dst.setTo(Scalar::all(0));
+            dst.setTo(Scalar::all(0), stream);
        }
        else
        {
@ -230,10 +227,7 @@ void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const G
            }
            else
            {
-                if (stream)
-                    stream.enqueueCopy(src3, dst);
-                else
-                    src3.copyTo(dst);
+                src3.copyTo(dst, stream);
            }
        }
    }
@ -336,18 +330,13 @@ void cv::gpu::integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, S
        cv::gpu::cudev::imgproc::shfl_integral_gpu(src, buffer, stream);

        sum.create(src.rows + 1, src.cols + 1, CV_32SC1);
-        if (s)
-            s.enqueueMemSet(sum, Scalar::all(0));
-        else
-            sum.setTo(Scalar::all(0));
+
+        sum.setTo(Scalar::all(0), s);

        GpuMat inner = sum(Rect(1, 1, src.cols, src.rows));
        GpuMat res = buffer(Rect(0, 0, src.cols, src.rows));

-        if (s)
-            s.enqueueCopy(res, inner);
-        else
-            res.copyTo(inner);
+        res.copyTo(inner, s);
    }
    else
    {
@ -720,10 +709,7 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result,
            GpuMat result_block(result_roi_size, result_data.type(),
                                result_data.ptr(), result_data.step);

-            if (stream)
-                stream.enqueueCopy(result_block, result_roi);
-            else
-                result_block.copyTo(result_roi);
+            result_block.copyTo(result_roi, stream);
        }
    }

--- a/modules/gpuarithm/src/precomp.hpp
+++ b/modules/gpuarithm/src/precomp.hpp
@ -50,7 +50,7 @@
 #include "opencv2/gpuarithm.hpp"
 #include "opencv2/core/utility.hpp"

-#include "opencv2/core/gpu_private.hpp"
+#include "opencv2/core/private.gpu.hpp"

 #include "opencv2/opencv_modules.hpp"

--- a/modules/gpubgsegm/include/opencv2/gpubgsegm.hpp
+++ b/modules/gpubgsegm/include/opencv2/gpubgsegm.hpp
@ -49,7 +49,7 @@

 #include <memory>

-#include "opencv2/core/gpumat.hpp"
+#include "opencv2/core/gpu.hpp"
 #include "opencv2/gpufilters.hpp"

 namespace cv { namespace gpu {
--- a/modules/gpubgsegm/src/cuda/fgd.hpp
+++ b/modules/gpubgsegm/src/cuda/fgd.hpp
@ -43,7 +43,7 @@
 #ifndef __FGD_BGFG_COMMON_HPP__
 #define __FGD_BGFG_COMMON_HPP__

-#include "opencv2/core/cuda_devptrs.hpp"
+#include "opencv2/core/gpu_types.hpp"

 namespace bgfg
 {
--- a/modules/gpubgsegm/src/gmg.cpp
+++ b/modules/gpubgsegm/src/gmg.cpp
@ -134,10 +134,7 @@ void cv::gpu::GMG_GPU::operator ()(const cv::gpu::GpuMat& frame, cv::gpu::GpuMat
        initialize(frame.size(), 0.0f, frame.depth() == CV_8U ? 255.0f : frame.depth() == CV_16U ? std::numeric_limits<ushort>::max() : 1.0f);

    fgmask.create(frameSize_, CV_8UC1);
-    if (stream)
-        stream.enqueueMemSet(fgmask, cv::Scalar::all(0));
-    else
-        fgmask.setTo(cv::Scalar::all(0));
+    fgmask.setTo(cv::Scalar::all(0), stream);

    funcs[frame.depth()][frame.channels() - 1](frame, fgmask, colors_, weights_, nfeatures_, frameNum_, learningRate, updateBackgroundModel, cv::gpu::StreamAccessor::getStream(stream));

--- a/modules/gpubgsegm/src/precomp.hpp
+++ b/modules/gpubgsegm/src/precomp.hpp
@ -50,6 +50,6 @@
 #include "opencv2/gpufilters.hpp"
 #include "opencv2/gpuimgproc.hpp"

-#include "opencv2/core/gpu_private.hpp"
+#include "opencv2/core/private.gpu.hpp"

 #endif /* __OPENCV_PRECOMP_H__ */
--- a/modules/gpucodec/include/opencv2/gpucodec.hpp
+++ b/modules/gpucodec/include/opencv2/gpucodec.hpp
@ -49,7 +49,7 @@

 #include <iosfwd>

-#include "opencv2/core/gpumat.hpp"
+#include "opencv2/core/gpu.hpp"

 namespace cv { namespace gpu {

--- a/modules/gpucodec/src/cuvid_video_source.h
+++ b/modules/gpucodec/src/cuvid_video_source.h
@ -43,7 +43,7 @@
 #ifndef __CUVUD_VIDEO_SOURCE_H__
 #define __CUVUD_VIDEO_SOURCE_H__

-#include "opencv2/core/gpu_private.hpp"
+#include "opencv2/core/private.gpu.hpp"
 #include "opencv2/gpucodec.hpp"
 #include "thread.h"

--- a/modules/gpucodec/src/frame_queue.h
+++ b/modules/gpucodec/src/frame_queue.h
@ -44,7 +44,7 @@
 #define __FRAME_QUEUE_H__

 #include "opencv2/core/utility.hpp"
-#include "opencv2/core/gpu_private.hpp"
+#include "opencv2/core/private.gpu.hpp"

 #include <nvcuvid.h>

--- a/modules/gpucodec/src/precomp.hpp
+++ b/modules/gpucodec/src/precomp.hpp
@ -52,7 +52,7 @@

 #include "opencv2/gpucodec.hpp"

-#include "opencv2/core/gpu_private.hpp"
+#include "opencv2/core/private.gpu.hpp"

 #ifdef HAVE_NVCUVID
    #include <nvcuvid.h>
--- a/modules/gpucodec/src/video_decoder.h
+++ b/modules/gpucodec/src/video_decoder.h
@ -43,7 +43,7 @@
 #ifndef __VIDEO_DECODER_H__
 #define __VIDEO_DECODER_H__

-#include "opencv2/core/gpu_private.hpp"
+#include "opencv2/core/private.gpu.hpp"
 #include "opencv2/gpucodec.hpp"

 #include <nvcuvid.h>
--- a/modules/gpucodec/src/video_parser.h
+++ b/modules/gpucodec/src/video_parser.h
@ -43,7 +43,7 @@
 #ifndef __VIDEO_PARSER_H__
 #define __VIDEO_PARSER_H__

-#include "opencv2/core/gpu_private.hpp"
+#include "opencv2/core/private.gpu.hpp"
 #include "opencv2/gpucodec.hpp"
 #include "frame_queue.h"
 #include "video_decoder.h"
--- a/modules/gpufeatures2d/include/opencv2/gpufeatures2d.hpp
+++ b/modules/gpufeatures2d/include/opencv2/gpufeatures2d.hpp
@ -47,7 +47,7 @@
 #  error gpufeatures2d.hpp header must be compiled as C++
 #endif

-#include "opencv2/core/gpumat.hpp"
+#include "opencv2/core/gpu.hpp"
 #include "opencv2/gpufilters.hpp"

 namespace cv { namespace gpu {
--- a/modules/gpufeatures2d/src/brute_force_matcher.cpp
+++ b/modules/gpufeatures2d/src/brute_force_matcher.cpp
@ -497,10 +497,7 @@ void cv::gpu::BFMatcher_GPU::knnMatchSingle(const GpuMat& query, const GpuMat& t
        ensureSizeIsEnough(nQuery, nTrain, CV_32FC1, allDist);
    }

-    if (stream)
-        stream.enqueueMemSet(trainIdx, Scalar::all(-1));
-    else
-        trainIdx.setTo(Scalar::all(-1));
+    trainIdx.setTo(Scalar::all(-1), stream);

    caller_t func = callers[query.depth()];
    CV_Assert(func != 0);
@ -616,10 +613,7 @@ void cv::gpu::BFMatcher_GPU::knnMatch2Collection(const GpuMat& query, const GpuM
    ensureSizeIsEnough(1, nQuery, CV_32SC2, imgIdx);
    ensureSizeIsEnough(1, nQuery, CV_32FC2, distance);

-    if (stream)
-        stream.enqueueMemSet(trainIdx, Scalar::all(-1));
-    else
-        trainIdx.setTo(Scalar::all(-1));
+    trainIdx.setTo(Scalar::all(-1), stream);

    caller_t func = callers[query.depth()];
    CV_Assert(func != 0);
@ -803,10 +797,7 @@ void cv::gpu::BFMatcher_GPU::radiusMatchSingle(const GpuMat& query, const GpuMat
        ensureSizeIsEnough(nQuery, std::max((nTrain / 100), 10), CV_32FC1, distance);
    }

-    if (stream)
-        stream.enqueueMemSet(nMatches, Scalar::all(0));
-    else
-        nMatches.setTo(Scalar::all(0));
+    nMatches.setTo(Scalar::all(0), stream);

    caller_t func = callers[query.depth()];
    CV_Assert(func != 0);
@ -931,10 +922,7 @@ void cv::gpu::BFMatcher_GPU::radiusMatchCollection(const GpuMat& query, GpuMat&
        ensureSizeIsEnough(nQuery, std::max((nQuery / 100), 10), CV_32FC1, distance);
    }

-    if (stream)
-        stream.enqueueMemSet(nMatches, Scalar::all(0));
-    else
-        nMatches.setTo(Scalar::all(0));
+    nMatches.setTo(Scalar::all(0), stream);

    caller_t func = callers[query.depth()];
    CV_Assert(func != 0);
--- a/modules/gpufeatures2d/src/precomp.hpp
+++ b/modules/gpufeatures2d/src/precomp.hpp
@ -52,6 +52,6 @@
 #include "opencv2/gpuwarping.hpp"
 #include "opencv2/features2d.hpp"

-#include "opencv2/core/gpu_private.hpp"
+#include "opencv2/core/private.gpu.hpp"

 #endif /* __OPENCV_PRECOMP_H__ */
--- a/modules/gpufilters/include/opencv2/gpufilters.hpp
+++ b/modules/gpufilters/include/opencv2/gpufilters.hpp
@ -47,7 +47,7 @@
 #  error gpufilters.hpp header must be compiled as C++
 #endif

-#include "opencv2/core/gpumat.hpp"
+#include "opencv2/core/gpu.hpp"
 #include "opencv2/core/base.hpp"

 namespace cv { namespace gpu {
--- a/modules/gpufilters/src/filtering.cpp
+++ b/modules/gpufilters/src/filtering.cpp
@ -157,10 +157,7 @@ namespace

            if (roi.size() != src_size)
            {
-                if (stream)
-                    stream.enqueueMemSet(dst, Scalar::all(0));
-                else
-                    dst.setTo(Scalar::all(0));
+                dst.setTo(Scalar::all(0), stream);
            }

            normalizeROI(roi, filter2D->ksize, filter2D->anchor, src_size);
@ -221,10 +218,7 @@ namespace

            if (roi.size() != src_size)
            {
-                if (stream)
-                    stream.enqueueMemSet(dst, Scalar::all(0));
-                else
-                    dst.setTo(Scalar::all(0));
+                dst.setTo(Scalar::all(0), stream);
            }

            ensureSizeIsEnough(src_size, bufType, *pbuf);
@ -487,10 +481,7 @@ namespace

            if (roi.size() != src_size)
            {
-                if (stream)
-                    stream.enqueueMemSet(dst, Scalar::all(0));
-                else
-                    dst.setTo(Scalar::all(0));
+                dst.setTo(Scalar::all(0), stream);
            }

            normalizeROI(roi, filter2D->ksize, filter2D->anchor, src_size);
@ -557,10 +548,7 @@ namespace

        if (iterations == 0 || _kernel.rows * _kernel.cols == 1)
        {
-            if (stream)
-                stream.enqueueCopy(src, dst);
-            else
-                src.copyTo(dst);
+            src.copyTo(dst, stream);
            return;
        }

@ -890,7 +878,7 @@ namespace
        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
        {
            DeviceInfo devInfo;
-            int cc = devInfo.majorVersion() * 10 + devInfo.minorVersion();
+            int cc = devInfo.major() * 10 + devInfo.minor();
            func(src, dst, kernel.ptr<float>(), ksize, anchor, brd_type, cc, StreamAccessor::getStream(s));
        }

@ -989,7 +977,7 @@ namespace
        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
        {
            DeviceInfo devInfo;
-            int cc = devInfo.majorVersion() * 10 + devInfo.minorVersion();
+            int cc = devInfo.major() * 10 + devInfo.minor();
            if (ksize > 16 && cc < 20)
                CV_Error(cv::Error::StsNotImplemented, "column linear filter doesn't implemented for kernel size > 16 for device with compute capabilities less than 2.0");

--- a/modules/gpufilters/src/precomp.hpp
+++ b/modules/gpufilters/src/precomp.hpp
@ -48,7 +48,7 @@
 #include "opencv2/gpufilters.hpp"
 #include "opencv2/imgproc.hpp"

-#include "opencv2/core/gpu_private.hpp"
+#include "opencv2/core/private.gpu.hpp"

 #include "opencv2/opencv_modules.hpp"

--- a/modules/gpuimgproc/include/opencv2/gpuimgproc.hpp
+++ b/modules/gpuimgproc/include/opencv2/gpuimgproc.hpp
@ -47,7 +47,7 @@
 #  error gpuimgproc.hpp header must be compiled as C++
 #endif

-#include "opencv2/core/gpumat.hpp"
+#include "opencv2/core/gpu.hpp"
 #include "opencv2/core/base.hpp"
 #include "opencv2/imgproc.hpp"
 #include "opencv2/gpufilters.hpp"
--- a/modules/gpuimgproc/src/match_template.cpp
+++ b/modules/gpuimgproc/src/match_template.cpp
@ -196,16 +196,9 @@ namespace
            return;
        }

-        if (stream)
-        {
-            stream.enqueueConvert(image, buf.imagef, CV_32F);
-            stream.enqueueConvert(templ, buf.templf, CV_32F);
-        }
-        else
-        {
-            image.convertTo(buf.imagef, CV_32F);
-            templ.convertTo(buf.templf, CV_32F);
-        }
+        image.convertTo(buf.imagef, CV_32F, stream);
+        templ.convertTo(buf.templf, CV_32F, stream);
+
        matchTemplate_CCORR_32F(buf.imagef, buf.templf, result, buf, stream);
    }

@ -317,16 +310,8 @@ namespace
    void matchTemplate_CCOFF_NORMED_8U(
            const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
    {
-        if (stream)
-        {
-            stream.enqueueConvert(image, buf.imagef, CV_32F);
-            stream.enqueueConvert(templ, buf.templf, CV_32F);
-        }
-        else
-        {
-            image.convertTo(buf.imagef, CV_32F);
-            templ.convertTo(buf.templf, CV_32F);
-        }
+        image.convertTo(buf.imagef, CV_32F, stream);
+        templ.convertTo(buf.templf, CV_32F, stream);

        matchTemplate_CCORR_32F(buf.imagef, buf.templf, result, buf, stream);

--- a/modules/gpuimgproc/src/precomp.hpp
+++ b/modules/gpuimgproc/src/precomp.hpp
@ -47,7 +47,7 @@
 #include "opencv2/gpufilters.hpp"

 #include "opencv2/core/private.hpp"
-#include "opencv2/core/gpu_private.hpp"
+#include "opencv2/core/private.gpu.hpp"

 #include "opencv2/opencv_modules.hpp"

--- a/modules/gpulegacy/include/opencv2/gpulegacy/private.hpp
+++ b/modules/gpulegacy/include/opencv2/gpulegacy/private.hpp
@ -48,7 +48,7 @@
 #  error this is a private header which should not be used from outside of the OpenCV library
 #endif

-#include "opencv2/core/gpu_private.hpp"
+#include "opencv2/core/private.gpu.hpp"

 #ifndef HAVE_CUDA
 #  error gpulegacy module requires CUDA
--- a/modules/gpulegacy/src/precomp.hpp
+++ b/modules/gpulegacy/src/precomp.hpp
@ -56,7 +56,7 @@
 #  include "opencv2/objdetect.hpp"
 #endif

-#include "opencv2/core/gpu_private.hpp"
+#include "opencv2/core/private.gpu.hpp"
 #include "opencv2/gpulegacy/private.hpp"

 #endif /* __OPENCV_PRECOMP_H__ */
--- a/modules/gpulegacy/test/test_precomp.hpp
+++ b/modules/gpulegacy/test/test_precomp.hpp
@ -68,11 +68,11 @@
 #include "opencv2/ts.hpp"
 #include "opencv2/ts/gpu_test.hpp"

-#include "opencv2/core/gpumat.hpp"
+#include "opencv2/core/gpu.hpp"
 #include "opencv2/gpulegacy.hpp"
 #include "opencv2/highgui.hpp"

-#include "opencv2/core/gpu_private.hpp"
+#include "opencv2/core/private.gpu.hpp"

 #include "NCVTest.hpp"
 #include "NCVAutoTestLister.hpp"
--- a/modules/gpuoptflow/include/opencv2/gpuoptflow.hpp
+++ b/modules/gpuoptflow/include/opencv2/gpuoptflow.hpp
@ -47,7 +47,7 @@
 #  error gpuoptflow.hpp header must be compiled as C++
 #endif

-#include "opencv2/core/gpumat.hpp"
+#include "opencv2/core/gpu.hpp"

 namespace cv { namespace gpu {

--- a/modules/gpuoptflow/src/farneback.cpp
+++ b/modules/gpuoptflow/src/farneback.cpp
@ -235,8 +235,8 @@ void cv::gpu::FarnebackOpticalFlow::operator ()(
            break;
    }

-    streams[0].enqueueConvert(frame0, frames_[0], CV_32F);
-    streams[1].enqueueConvert(frame1, frames_[1], CV_32F);
+    frame0.convertTo(frames_[0], CV_32F, streams[0]);
+    frame1.convertTo(frames_[1], CV_32F, streams[1]);

    if (fastPyramids)
    {
@ -293,21 +293,21 @@ void cv::gpu::FarnebackOpticalFlow::operator ()(
            {
                gpu::resize(flowx0, curFlowX, Size(width, height), 0, 0, INTER_LINEAR, streams[0]);
                gpu::resize(flowy0, curFlowY, Size(width, height), 0, 0, INTER_LINEAR, streams[1]);
-                streams[0].enqueueConvert(curFlowX, curFlowX, curFlowX.depth(), scale);
-                streams[1].enqueueConvert(curFlowY, curFlowY, curFlowY.depth(), scale);
+                curFlowX.convertTo(curFlowX, curFlowX.depth(), scale, streams[0]);
+                curFlowY.convertTo(curFlowY, curFlowY.depth(), scale, streams[1]);
            }
            else
            {
-                streams[0].enqueueMemSet(curFlowX, 0);
-                streams[1].enqueueMemSet(curFlowY, 0);
+                curFlowX.setTo(0, streams[0]);
+                curFlowY.setTo(0, streams[1]);
            }
        }
        else
        {
            gpu::resize(prevFlowX, curFlowX, Size(width, height), 0, 0, INTER_LINEAR, streams[0]);
            gpu::resize(prevFlowY, curFlowY, Size(width, height), 0, 0, INTER_LINEAR, streams[1]);
-            streams[0].enqueueConvert(curFlowX, curFlowX, curFlowX.depth(), 1./pyrScale);
-            streams[1].enqueueConvert(curFlowY, curFlowY, curFlowY.depth(), 1./pyrScale);
+            curFlowX.convertTo(curFlowX, curFlowX.depth(), 1./pyrScale, streams[0]);
+            curFlowY.convertTo(curFlowY, curFlowY.depth(), 1./pyrScale, streams[1]);
        }

        GpuMat M = allocMatFromBuf(5*height, width, CV_32F, M_);
@ -343,7 +343,7 @@ void cv::gpu::FarnebackOpticalFlow::operator ()(
            {
                cudev::optflow_farneback::gaussianBlurGpu(
                        frames_[i], smoothSize/2, blurredFrame[i], BORDER_REFLECT101, S(streams[i]));
-                gpu::resize(blurredFrame[i], pyrLevel[i], Size(width, height), INTER_LINEAR, streams[i]);
+                gpu::resize(blurredFrame[i], pyrLevel[i], Size(width, height), 0.0, 0.0, INTER_LINEAR, streams[i]);
                cudev::optflow_farneback::polynomialExpansionGpu(pyrLevel[i], polyN, R[i], S(streams[i]));
            }
        }
--- a/modules/gpuoptflow/src/precomp.hpp
+++ b/modules/gpuoptflow/src/precomp.hpp
@ -51,7 +51,7 @@
 #include "opencv2/gpuimgproc.hpp"
 #include "opencv2/video.hpp"

-#include "opencv2/core/gpu_private.hpp"
+#include "opencv2/core/private.gpu.hpp"

 #include "opencv2/opencv_modules.hpp"

--- a/modules/gpuoptflow/test/test_optflow.cpp
+++ b/modules/gpuoptflow/test/test_optflow.cpp
@ -80,7 +80,7 @@ GPU_TEST_P(BroxOpticalFlow, Regression)
    brox(loadMat(frame0), loadMat(frame1), u, v);

    std::string fname(cvtest::TS::ptr()->get_data_path());
-    if (devInfo.majorVersion() >= 2)
+    if (devInfo.major() >= 2)
        fname += "opticalflow/brox_optical_flow_cc20.bin";
    else
        fname += "opticalflow/brox_optical_flow.bin";
--- a/modules/gpustereo/include/opencv2/gpustereo.hpp
+++ b/modules/gpustereo/include/opencv2/gpustereo.hpp
@ -47,7 +47,7 @@
 #  error gpustereo.hpp header must be compiled as C++
 #endif

-#include "opencv2/core/gpumat.hpp"
+#include "opencv2/core/gpu.hpp"

 namespace cv { namespace gpu {

--- a/modules/gpustereo/src/disparity_bilateral_filter.cpp
+++ b/modules/gpustereo/src/disparity_bilateral_filter.cpp
@ -113,10 +113,7 @@ namespace

        if (&dst != &disp)
        {
-            if (stream)
-                stream.enqueueCopy(disp, dst);
-            else
-                disp.copyTo(dst);
+            disp.copyTo(dst, stream);
        }

        disp_bilateral_filter<T>(dst, img, img.channels(), iters, StreamAccessor::getStream(stream));
--- a/modules/gpustereo/src/precomp.hpp
+++ b/modules/gpustereo/src/precomp.hpp
@ -47,6 +47,6 @@

 #include "opencv2/gpustereo.hpp"

-#include "opencv2/core/gpu_private.hpp"
+#include "opencv2/core/private.gpu.hpp"

 #endif /* __OPENCV_PRECOMP_H__ */
--- a/modules/gpustereo/src/stereobm.cpp
+++ b/modules/gpustereo/src/stereobm.cpp
@ -91,7 +91,7 @@ bool cv::gpu::StereoBM_GPU::checkIfGpuCallReasonable()

    DeviceInfo device_info;

-    if (device_info.majorVersion() > 1 || device_info.multiProcessorCount() > 16)
+    if (device_info.major() > 1 || device_info.multiProcessorCount() > 16)
        return true;

    return false;
--- a/modules/gpustereo/src/stereobp.cpp
+++ b/modules/gpustereo/src/stereobp.cpp
@ -194,20 +194,10 @@ namespace
            if (rthis.levels & 1)
            {
                //can clear less area
-                if (stream)
-                {
-                    stream.enqueueMemSet(u, zero);
-                    stream.enqueueMemSet(d, zero);
-                    stream.enqueueMemSet(l, zero);
-                    stream.enqueueMemSet(r, zero);
-                }
-                else
-                {
-                    u.setTo(zero);
-                    d.setTo(zero);
-                    l.setTo(zero);
-                    r.setTo(zero);
-                }
+                u.setTo(zero, stream);
+                d.setTo(zero, stream);
+                l.setTo(zero, stream);
+                r.setTo(zero, stream);
            }

            if (rthis.levels > 1)
@ -222,20 +212,10 @@ namespace

                if ((rthis.levels & 1) == 0)
                {
-                    if (stream)
-                    {
-                        stream.enqueueMemSet(u2, zero);
-                        stream.enqueueMemSet(d2, zero);
-                        stream.enqueueMemSet(l2, zero);
-                        stream.enqueueMemSet(r2, zero);
-                    }
-                    else
-                    {
-                        u2.setTo(zero);
-                        d2.setTo(zero);
-                        l2.setTo(zero);
-                        r2.setTo(zero);
-                    }
+                    u2.setTo(zero, stream);
+                    d2.setTo(zero, stream);
+                    l2.setTo(zero, stream);
+                    r2.setTo(zero, stream);
                }
            }

@ -313,20 +293,12 @@ namespace

            out = ((disp.type() == CV_16S) ? disp : (out.create(rows, cols, CV_16S), out));

-            if (stream)
-                stream.enqueueMemSet(out, zero);
-            else
-                out.setTo(zero);
+            out.setTo(zero, stream);

            output_callers[funcIdx](u, d, l, r, datas.front(), out, cudaStream);

            if (disp.type() != CV_16S)
-            {
-                if (stream)
-                    stream.enqueueConvert(out, disp, disp.type());
-                else
-                    out.convertTo(disp, disp.type());
-            }
+                out.convertTo(disp, disp.type(), stream);
        }

        StereoBeliefPropagation& rthis;
--- a/modules/gpustereo/src/stereocsbp.cpp
+++ b/modules/gpustereo/src/stereocsbp.cpp
@ -213,36 +213,18 @@ static void csbp_operator(StereoConstantSpaceBP& rthis, GpuMat& mbuf, GpuMat& te

    load_constants(rthis.ndisp, rthis.max_data_term, rthis.data_weight, rthis.max_disc_term, rthis.disc_single_jump, rthis.min_disp_th, left, right, temp);

-    if (stream)
-    {
-        stream.enqueueMemSet(l[0], zero);
-        stream.enqueueMemSet(d[0], zero);
-        stream.enqueueMemSet(r[0], zero);
-        stream.enqueueMemSet(u[0], zero);
+    l[0].setTo(zero, stream);
+    d[0].setTo(zero, stream);
+    r[0].setTo(zero, stream);
+    u[0].setTo(zero, stream);

-        stream.enqueueMemSet(l[1], zero);
-        stream.enqueueMemSet(d[1], zero);
-        stream.enqueueMemSet(r[1], zero);
-        stream.enqueueMemSet(u[1], zero);
+    l[1].setTo(zero, stream);
+    d[1].setTo(zero, stream);
+    r[1].setTo(zero, stream);
+    u[1].setTo(zero, stream);

-        stream.enqueueMemSet(data_cost, zero);
-        stream.enqueueMemSet(data_cost_selected, zero);
-    }
-    else
-    {
-        l[0].setTo(zero);
-        d[0].setTo(zero);
-        r[0].setTo(zero);
-        u[0].setTo(zero);
-
-        l[1].setTo(zero);
-        d[1].setTo(zero);
-        r[1].setTo(zero);
-        u[1].setTo(zero);
-
-        data_cost.setTo(zero);
-        data_cost_selected.setTo(zero);
-    }
+    data_cost.setTo(zero, stream);
+    data_cost_selected.setTo(zero, stream);

    int cur_idx = 0;

@ -279,20 +261,14 @@ static void csbp_operator(StereoConstantSpaceBP& rthis, GpuMat& mbuf, GpuMat& te

    out = ((disp.type() == CV_16S) ? disp : (out.create(rows, cols, CV_16S), out));

-    if (stream)
-        stream.enqueueMemSet(out, zero);
-    else
-        out.setTo(zero);
+    out.setTo(zero, stream);

    compute_disp(u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(),
                 data_cost_selected.ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(), elem_step, out, nr_plane_pyr[0], cudaStream);

    if (disp.type() != CV_16S)
    {
-        if (stream)
-            stream.enqueueConvert(out, disp, disp.type());
-        else
-            out.convertTo(disp, disp.type());
+        out.convertTo(disp, disp.type(), stream);
    }
 }

--- a/modules/gpuwarping/include/opencv2/gpuwarping.hpp
+++ b/modules/gpuwarping/include/opencv2/gpuwarping.hpp
@ -47,7 +47,7 @@
 #  error gpuwarping.hpp header must be compiled as C++
 #endif

-#include "opencv2/core/gpumat.hpp"
+#include "opencv2/core/gpu.hpp"
 #include "opencv2/imgproc.hpp"

 namespace cv { namespace gpu {
--- a/modules/gpuwarping/src/precomp.hpp
+++ b/modules/gpuwarping/src/precomp.hpp
@ -45,7 +45,7 @@

 #include "opencv2/gpuwarping.hpp"

-#include "opencv2/core/gpu_private.hpp"
+#include "opencv2/core/private.gpu.hpp"

 #include "opencv2/opencv_modules.hpp"

--- a/modules/gpuwarping/src/pyramids.cpp
+++ b/modules/gpuwarping/src/pyramids.cpp
@ -184,10 +184,7 @@ void cv::gpu::ImagePyramid::getLayer(GpuMat& outImg, Size outRoi, Stream& stream

    if (outRoi.width == layer0_.cols && outRoi.height == layer0_.rows)
    {
-        if (stream)
-            stream.enqueueCopy(layer0_, outImg);
-        else
-            layer0_.copyTo(outImg);
+        layer0_.copyTo(outImg, stream);
    }

    float lastScale = 1.0f;
@ -202,10 +199,7 @@ void cv::gpu::ImagePyramid::getLayer(GpuMat& outImg, Size outRoi, Stream& stream

        if (outRoi.width == curLayer.cols && outRoi.height == curLayer.rows)
        {
-            if (stream)
-                stream.enqueueCopy(curLayer, outImg);
-            else
-                curLayer.copyTo(outImg);
+            curLayer.copyTo(outImg, stream);
        }

        if (outRoi.width >= curLayer.cols && outRoi.height >= curLayer.rows)
--- a/modules/gpuwarping/src/resize.cpp
+++ b/modules/gpuwarping/src/resize.cpp
@ -77,10 +77,7 @@ void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, doub

    if (dsize == src.size())
    {
-        if (s)
-            s.enqueueCopy(src, dst);
-        else
-            src.copyTo(dst);
+        src.copyTo(dst, s);
        return;
    }

--- a/modules/highgui/include/opencv2/highgui.hpp
+++ b/modules/highgui/include/opencv2/highgui.hpp
@ -148,6 +148,8 @@ CV_EXPORTS_W void setTrackbarPos(const String& trackbarname, const String& winna


 // OpenGL support
+CV_EXPORTS void imshow(const String& winname, const ogl::Texture2D& tex);
+
 CV_EXPORTS void setOpenGlDrawCallback(const String& winname, OpenGlDrawCallback onOpenGlDraw, void* userdata = 0);

 CV_EXPORTS void setOpenGlContext(const String& winname);
--- a/modules/highgui/src/window.cpp
+++ b/modules/highgui/src/window.cpp
@ -281,39 +281,64 @@ void cv::imshow( const String& winname, InputArray _img )

        setOpenGlContext(winname);

-        if (_img.kind() == _InputArray::OPENGL_TEXTURE)
+        cv::ogl::Texture2D& tex = ownWndTexs[winname];
+
+        if (_img.kind() == _InputArray::GPU_MAT)
        {
-            cv::ogl::Texture2D& tex = wndTexs[winname];
-
-            tex = _img.getOGlTexture2D();
+            cv::ogl::Buffer& buf = ownWndBufs[winname];
+            buf.copyFrom(_img);
+            buf.setAutoRelease(false);

+            tex.copyFrom(buf);
            tex.setAutoRelease(false);
-
-            setOpenGlDrawCallback(winname, glDrawTextureCallback, &tex);
        }
        else
        {
-            cv::ogl::Texture2D& tex = ownWndTexs[winname];
-
-            if (_img.kind() == _InputArray::GPU_MAT)
-            {
-                cv::ogl::Buffer& buf = ownWndBufs[winname];
-                buf.copyFrom(_img);
-                buf.setAutoRelease(false);
-
-                tex.copyFrom(buf);
-                tex.setAutoRelease(false);
-            }
-            else
-            {
-                tex.copyFrom(_img);
-            }
-
-            tex.setAutoRelease(false);
-
-            setOpenGlDrawCallback(winname, glDrawTextureCallback, &tex);
+            tex.copyFrom(_img);
        }

+        tex.setAutoRelease(false);
+
+        setOpenGlDrawCallback(winname, glDrawTextureCallback, &tex);
+
+        updateWindow(winname);
+    }
+#endif
+}
+
+void cv::imshow(const String& winname, const ogl::Texture2D& _tex)
+{
+#ifndef HAVE_OPENGL
+    (void) winname;
+    (void) _tex;
+    CV_Error(cv::Error::OpenGlNotSupported, "The library is compiled without OpenGL support");
+#else
+    const double useGl = getWindowProperty(winname, WND_PROP_OPENGL);
+
+    if (useGl <= 0)
+    {
+        CV_Error(cv::Error::OpenGlNotSupported, "The window was created without OpenGL context");
+    }
+    else
+    {
+        const double autoSize = getWindowProperty(winname, WND_PROP_AUTOSIZE);
+
+        if (autoSize > 0)
+        {
+            Size size = _tex.size();
+            resizeWindow(winname, size.width, size.height);
+        }
+
+        setOpenGlContext(winname);
+
+        cv::ogl::Texture2D& tex = wndTexs[winname];
+
+        tex = _tex;
+
+        tex.setAutoRelease(false);
+
+        setOpenGlDrawCallback(winname, glDrawTextureCallback, &tex);
+
        updateWindow(winname);
    }
 #endif
--- a/modules/nonfree/include/opencv2/nonfree/gpu.hpp
+++ b/modules/nonfree/include/opencv2/nonfree/gpu.hpp
@ -43,7 +43,7 @@
 #ifndef __OPENCV_NONFREE_GPU_HPP__
 #define __OPENCV_NONFREE_GPU_HPP__

-#include "opencv2/core/gpumat.hpp"
+#include "opencv2/core/gpu.hpp"

 namespace cv { namespace gpu {

--- a/modules/nonfree/src/precomp.hpp
+++ b/modules/nonfree/src/precomp.hpp
@ -50,7 +50,7 @@
 #include "opencv2/core/private.hpp"

 #include "opencv2/nonfree/gpu.hpp"
-#include "opencv2/core/gpu_private.hpp"
+#include "opencv2/core/private.gpu.hpp"

 #include "opencv2/opencv_modules.hpp"

--- a/modules/photo/include/opencv2/photo/gpu.hpp
+++ b/modules/photo/include/opencv2/photo/gpu.hpp
@ -43,7 +43,7 @@
 #ifndef __OPENCV_PHOTO_GPU_HPP__
 #define __OPENCV_PHOTO_GPU_HPP__

-#include "opencv2/core/gpumat.hpp"
+#include "opencv2/core/gpu.hpp"

 namespace cv { namespace gpu {

--- a/modules/photo/src/denoising_gpu.cpp
+++ b/modules/photo/src/denoising_gpu.cpp
@ -43,7 +43,7 @@
 #include "precomp.hpp"

 #include "opencv2/photo/gpu.hpp"
-#include "opencv2/core/gpu_private.hpp"
+#include "opencv2/core/private.gpu.hpp"

 #include "opencv2/opencv_modules.hpp"

--- a/modules/softcascade/include/opencv2/softcascade.hpp
+++ b/modules/softcascade/include/opencv2/softcascade.hpp
@ -43,9 +43,10 @@
 #ifndef __OPENCV_SOFTCASCADE_HPP__
 #define __OPENCV_SOFTCASCADE_HPP__

+#include <iosfwd>
+
 #include "opencv2/core.hpp"
-#include "opencv2/core/gpumat.hpp"
-#include <ostream>
+#include "opencv2/core/gpu.hpp"

 namespace cv { namespace softcascade {

@ -296,4 +297,4 @@ private:

 }} // namespace cv { namespace softcascade {

-#endif
+#endif
--- a/modules/softcascade/src/cuda/channels.cu
+++ b/modules/softcascade/src/cuda/channels.cu
@ -40,7 +40,7 @@
 //
 //M*/

-#include "opencv2/core/cuda_devptrs.hpp"
+#include "opencv2/core/gpu_types.hpp"
 #include "opencv2/core/cuda/common.hpp"

 namespace cv { namespace softcascade { namespace cudev
--- a/modules/softcascade/src/cuda_invoker.hpp
+++ b/modules/softcascade/src/cuda_invoker.hpp
@ -44,7 +44,7 @@
 #ifndef __OPENCV_ICF_HPP__
 #define __OPENCV_ICF_HPP__

-#include "opencv2/core/cuda_devptrs.hpp"
+#include "opencv2/core/gpu_types.hpp"
 #include "cuda_runtime_api.h"

 #if defined __CUDACC__
--- a/modules/softcascade/src/detector_cuda.cpp
+++ b/modules/softcascade/src/detector_cuda.cpp
@ -335,10 +335,7 @@ struct cv::softcascade::SCascade::Fields

    void detect(cv::gpu::GpuMat& objects, cv::gpu::Stream& s) const
    {
-        if (s)
-            s.enqueueMemSet(objects, 0);
-        else
-            cudaMemset(objects.data, 0, sizeof(Detection));
+        objects.setTo(Scalar::all(0), s);

        cudaSafeCall( cudaGetLastError());

@ -354,16 +351,8 @@ struct cv::softcascade::SCascade::Fields
        cv::gpu::GpuMat ndetections = cv::gpu::GpuMat(objects, cv::Rect(0, 0, sizeof(Detection), 1));
        ensureSizeIsEnough(objects.rows, objects.cols, CV_8UC1, overlaps);

-        if (s)
-        {
-            s.enqueueMemSet(overlaps, 0);
-            s.enqueueMemSet(suppressed, 0);
-        }
-        else
-        {
-            overlaps.setTo(0);
-            suppressed.setTo(0);
-        }
+        overlaps.setTo(0, s);
+        suppressed.setTo(0, s);

        cudaStream_t stream = cv::gpu::StreamAccessor::getStream(s);
        cudev::suppress(objects, overlaps, ndetections, suppressed, stream);
@ -488,18 +477,12 @@ void integral(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& sum, cv::gpu::GpuMat&
        cv::softcascade::cudev::shfl_integral(src, buffer, stream);

        sum.create(src.rows + 1, src.cols + 1, CV_32SC1);
-        if (s)
-            s.enqueueMemSet(sum, cv::Scalar::all(0));
-        else
-            sum.setTo(cv::Scalar::all(0));
+        sum.setTo(cv::Scalar::all(0), s);

        cv::gpu::GpuMat inner = sum(cv::Rect(1, 1, src.cols, src.rows));
        cv::gpu::GpuMat res = buffer(cv::Rect(0, 0, src.cols, src.rows));

-        if (s)
-            s.enqueueCopy(res, inner);
-        else
-            res.copyTo(inner);
+        res.copyTo(inner, s);
    }
    else {CV_Error(cv::Error::GpuNotSupported, ": CC 3.x required.");}
 }
@ -541,10 +524,7 @@ void cv::softcascade::SCascade::detect(InputArray _image, InputArray _rois, Outp
    }
    else
    {
-        if (s)
-            s.enqueueCopy(image, flds.hogluv);
-        else
-            image.copyTo(flds.hogluv);
+        image.copyTo(flds.hogluv, s);
    }

    flds.detect(objects, s);
@ -571,10 +551,7 @@ using cv::gpu::GpuMat;

 inline void setZero(cv::gpu::GpuMat& m, cv::gpu::Stream& s)
 {
-    if (s)
-        s.enqueueMemSet(m, 0);
-    else
-        m.setTo(0);
+    m.setTo(0, s);
 }

 struct SeparablePreprocessor : public cv::softcascade::ChannelsProcessor
--- a/modules/softcascade/src/precomp.hpp
+++ b/modules/softcascade/src/precomp.hpp
@ -43,11 +43,13 @@
 #ifndef __OPENCV_PRECOMP_H__
 #define __OPENCV_PRECOMP_H__

+#include <iostream>
+
 #include "opencv2/softcascade.hpp"
 #include "opencv2/imgproc.hpp"

 #include "opencv2/core/private.hpp"
-#include "opencv2/core/gpu_private.hpp"
+#include "opencv2/core/private.gpu.hpp"

 namespace cv { namespace softcascade { namespace internal
 {
--- a/modules/softcascade/test/test_cuda_softcascade.cpp
+++ b/modules/softcascade/test/test_cuda_softcascade.cpp
@ -41,7 +41,7 @@
 //M*/

 #include "test_precomp.hpp"
-#include "opencv2/core/gpumat.hpp"
+#include "opencv2/core/gpu.hpp"


 #ifdef HAVE_CUDA
--- a/modules/softcascade/test/utility.hpp
+++ b/modules/softcascade/test/utility.hpp
@ -43,7 +43,7 @@
 #define __OPENCV_SOFTCASCADE_TEST_UTILITY_HPP__

 #include "opencv2/core.hpp"
-#include "opencv2/core/gpumat.hpp"
+#include "opencv2/core/gpu.hpp"
 #include "opencv2/ts.hpp"

 //////////////////////////////////////////////////////////////////////
--- a/modules/stitching/include/opencv2/stitching/detail/warpers.hpp
+++ b/modules/stitching/include/opencv2/stitching/detail/warpers.hpp
@ -44,7 +44,7 @@
 #define __OPENCV_STITCHING_WARPERS_HPP__

 #include "opencv2/core.hpp"
-#include "opencv2/core/gpumat.hpp"
+#include "opencv2/core/gpu.hpp"
 #include "opencv2/imgproc.hpp"
 #include "opencv2/opencv_modules.hpp"

--- a/modules/superres/perf/perf_precomp.hpp
+++ b/modules/superres/perf/perf_precomp.hpp
@ -52,7 +52,7 @@
 #define __OPENCV_PERF_PRECOMP_HPP__

 #include "opencv2/core.hpp"
-#include "opencv2/core/gpumat.hpp"
+#include "opencv2/core/gpu.hpp"
 #include "opencv2/ts.hpp"
 #include "opencv2/ts/gpu_perf.hpp"
 #include "opencv2/superres.hpp"
--- a/modules/superres/src/input_array_utility.cpp
+++ b/modules/superres/src/input_array_utility.cpp
@ -57,10 +57,6 @@ Mat cv::superres::arrGetMat(InputArray arr, Mat& buf)
        arr.getOGlBuffer().copyTo(buf);
        return buf;

-    case _InputArray::OPENGL_TEXTURE:
-        arr.getOGlTexture2D().copyTo(buf);
-        return buf;
-
    default:
        return arr.getMat();
    }
@ -77,10 +73,6 @@ GpuMat cv::superres::arrGetGpuMat(InputArray arr, GpuMat& buf)
        arr.getOGlBuffer().copyTo(buf);
        return buf;

-    case _InputArray::OPENGL_TEXTURE:
-        arr.getOGlTexture2D().copyTo(buf);
-        return buf;
-
    default:
        buf.upload(arr.getMat());
        return buf;
@ -97,10 +89,6 @@ namespace
    {
        dst.getOGlBufferRef().copyFrom(src);
    }
-    void arr2tex(InputArray src, OutputArray dst)
-    {
-        dst.getOGlTexture2D().copyFrom(src);
-    }
    void mat2gpu(InputArray src, OutputArray dst)
    {
        dst.getGpuMatRef().upload(src.getMat());
@ -109,10 +97,6 @@ namespace
    {
        src.getOGlBuffer().copyTo(dst);
    }
-    void tex2arr(InputArray src, OutputArray dst)
-    {
-        src.getOGlTexture2D().copyTo(dst);
-    }
    void gpu2mat(InputArray src, OutputArray dst)
    {
        GpuMat d = src.getGpuMat();
@ -132,15 +116,15 @@ void cv::superres::arrCopy(InputArray src, OutputArray dst)
    static const func_t funcs[10][10] =
    {
        {0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
-        {0, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, arr2buf, arr2tex, mat2gpu},
-        {0, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, arr2buf, arr2tex, mat2gpu},
-        {0, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, arr2buf, arr2tex, mat2gpu},
-        {0, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, arr2buf, arr2tex, mat2gpu},
-        {0, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, arr2buf, arr2tex, mat2gpu},
-        {0, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, arr2buf, arr2tex, mat2gpu},
-        {0, buf2arr, buf2arr, buf2arr, buf2arr, buf2arr, buf2arr, buf2arr, buf2arr, buf2arr},
-        {0, tex2arr, tex2arr, tex2arr, tex2arr, tex2arr, tex2arr, tex2arr, tex2arr, tex2arr},
-        {0, gpu2mat, gpu2mat, gpu2mat, gpu2mat, gpu2mat, gpu2mat, arr2buf, arr2tex, gpu2gpu}
+        {0, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, arr2buf, 0 /*arr2tex*/, mat2gpu},
+        {0, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, arr2buf, 0 /*arr2tex*/, mat2gpu},
+        {0, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, arr2buf, 0 /*arr2tex*/, mat2gpu},
+        {0, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, arr2buf, 0 /*arr2tex*/, mat2gpu},
+        {0, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, arr2buf, 0 /*arr2tex*/, mat2gpu},
+        {0, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, arr2buf, 0 /*arr2tex*/, mat2gpu},
+        {0, buf2arr, buf2arr, buf2arr, buf2arr, buf2arr, buf2arr, buf2arr, 0 /*buf2arr*/, buf2arr},
+        {0, 0 /*tex2arr*/, 0 /*tex2arr*/, 0 /*tex2arr*/, 0 /*tex2arr*/, 0 /*tex2arr*/, 0 /*tex2arr*/, 0 /*tex2arr*/, 0 /*tex2arr*/, 0 /*tex2arr*/},
+        {0, gpu2mat, gpu2mat, gpu2mat, gpu2mat, gpu2mat, gpu2mat, arr2buf, 0 /*arr2tex*/, gpu2gpu}
    };

    const int src_kind = src.kind() >> _InputArray::KIND_SHIFT;
--- a/modules/superres/src/input_array_utility.hpp
+++ b/modules/superres/src/input_array_utility.hpp
@ -44,7 +44,7 @@
 #define __OPENCV_SUPERRES_INPUT_ARRAY_UTILITY_HPP__

 #include "opencv2/core.hpp"
-#include "opencv2/core/gpumat.hpp"
+#include "opencv2/core/gpu.hpp"

 namespace cv
 {
--- a/modules/superres/src/precomp.hpp
+++ b/modules/superres/src/precomp.hpp
@ -48,14 +48,14 @@

 #include "opencv2/opencv_modules.hpp"
 #include "opencv2/core.hpp"
-#include "opencv2/core/gpumat.hpp"
+#include "opencv2/core/gpu.hpp"
 #include "opencv2/core/opengl.hpp"
 #include "opencv2/core/utility.hpp"
 #include "opencv2/imgproc.hpp"
 #include "opencv2/video/tracking.hpp"
 #include "opencv2/core/private.hpp"

-#include "opencv2/core/gpu_private.hpp"
+#include "opencv2/core/private.gpu.hpp"

 #ifdef HAVE_OPENCV_GPUARITHM
 #  include "opencv2/gpuarithm.hpp"
--- a/modules/ts/include/opencv2/ts/gpu_test.hpp
+++ b/modules/ts/include/opencv2/ts/gpu_test.hpp
@ -45,7 +45,7 @@

 #include <stdexcept>
 #include "opencv2/core.hpp"
-#include "opencv2/core/gpumat.hpp"
+#include "opencv2/core/gpu.hpp"
 #include "opencv2/highgui.hpp"
 #include "opencv2/imgproc.hpp"
 #include "opencv2/ts.hpp"
--- a/modules/ts/src/gpu_perf.cpp
+++ b/modules/ts/src/gpu_perf.cpp
@ -42,7 +42,7 @@

 #include "precomp.hpp"
 #include "opencv2/ts/gpu_perf.hpp"
-#include "opencv2/core/gpumat.hpp"
+#include "opencv2/core/gpu.hpp"

 #ifdef HAVE_CUDA
    #include <cuda_runtime.h>
@ -287,8 +287,8 @@ namespace perf
            cv::gpu::DeviceInfo info(i);

            printf("[----------]\n"), fflush(stdout);
-            printf("[ DEVICE   ] \t# %d %s.\n", i, info.name().c_str()), fflush(stdout);
-            printf("[          ] \tCompute capability: %d.%d\n", (int)info.majorVersion(), (int)info.minorVersion()), fflush(stdout);
+            printf("[ DEVICE   ] \t# %d %s.\n", i, info.name()), fflush(stdout);
+            printf("[          ] \tCompute capability: %d.%d\n", (int)info.major(), (int)info.minor()), fflush(stdout);
            printf("[          ] \tMulti Processor Count:  %d\n", info.multiProcessorCount()), fflush(stdout);
            printf("[          ] \tTotal memory: %d Mb\n", static_cast<int>(static_cast<int>(info.totalMemory() / 1024.0) / 1024.0)), fflush(stdout);
            printf("[          ] \tFree  memory: %d Mb\n", static_cast<int>(static_cast<int>(info.freeMemory()  / 1024.0) / 1024.0)), fflush(stdout);
--- a/modules/ts/src/ts_perf.cpp
+++ b/modules/ts/src/ts_perf.cpp
@ -1,7 +1,7 @@
 #include "precomp.hpp"

 #ifdef HAVE_CUDA
-#include "opencv2/core/gpumat.hpp"
+#include "opencv2/core/gpu.hpp"
 #endif

 #ifdef ANDROID
@ -72,10 +72,6 @@ static void setCurrentThreadAffinityMask(int mask)
 }
 #endif

-#ifdef HAVE_CUDA
-# include <opencv2/core/gpumat.hpp>
-#endif
-
 namespace {

 class PerfEnvironment: public ::testing::Environment
@ -686,13 +682,13 @@ void TestBase::Init(int argc, const char* const argv[])
        cv::gpu::DeviceInfo info(param_cuda_device);
        if (!info.isCompatible())
        {
-            printf("[----------]\n[ FAILURE  ] \tDevice %s is NOT compatible with current GPU module build.\n[----------]\n", info.name().c_str()), fflush(stdout);
+            printf("[----------]\n[ FAILURE  ] \tDevice %s is NOT compatible with current GPU module build.\n[----------]\n", info.name()), fflush(stdout);
            exit(-1);
        }

        cv::gpu::setDevice(param_cuda_device);

-        printf("[----------]\n[ GPU INFO ] \tRun test suite on %s GPU.\n[----------]\n", info.name().c_str()), fflush(stdout);
+        printf("[----------]\n[ GPU INFO ] \tRun test suite on %s GPU.\n[----------]\n", info.name()), fflush(stdout);
    }
 #endif

--- a/modules/videostab/include/opencv2/videostab/wobble_suppression.hpp
+++ b/modules/videostab/include/opencv2/videostab/wobble_suppression.hpp
@ -45,7 +45,7 @@

 #include <vector>
 #include "opencv2/core.hpp"
-#include "opencv2/core/gpumat.hpp"
+#include "opencv2/core/gpu.hpp"
 #include "opencv2/videostab/global_motion.hpp"
 #include "opencv2/videostab/log.hpp"

--- a/samples/cpp/tutorial_code/gpu/gpu-basics-similarity/gpu-basics-similarity.cpp
+++ b/samples/cpp/tutorial_code/gpu/gpu-basics-similarity/gpu-basics-similarity.cpp
@ -368,8 +368,8 @@ Scalar getMSSIM_GPU_optimized( const Mat& i1, const Mat& i2, BufferMSSIM& b)

    gpu::Stream stream;

-    stream.enqueueConvert(b.gI1, b.t1, CV_32F);
-    stream.enqueueConvert(b.gI2, b.t2, CV_32F);
+    b.gI1.convertTo(b.t1, CV_32F, stream);
+    b.gI2.convertTo(b.t2, CV_32F, stream);

    gpu::split(b.t1, b.vI1, stream);
    gpu::split(b.t2, b.vI2, stream);
@ -379,16 +379,16 @@ Scalar getMSSIM_GPU_optimized( const Mat& i1, const Mat& i2, BufferMSSIM& b)

    for( int i = 0; i < b.gI1.channels(); ++i )
    {
-        gpu::multiply(b.vI2[i], b.vI2[i], b.I2_2, stream);        // I2^2
-        gpu::multiply(b.vI1[i], b.vI1[i], b.I1_2, stream);        // I1^2
-        gpu::multiply(b.vI1[i], b.vI2[i], b.I1_I2, stream);       // I1 * I2
+        gpu::multiply(b.vI2[i], b.vI2[i], b.I2_2, 1, -1, stream);        // I2^2
+        gpu::multiply(b.vI1[i], b.vI1[i], b.I1_2, 1, -1, stream);        // I1^2
+        gpu::multiply(b.vI1[i], b.vI2[i], b.I1_I2, 1, -1, stream);       // I1 * I2

        gpu::GaussianBlur(b.vI1[i], b.mu1, Size(11, 11), buf, 1.5, 0, BORDER_DEFAULT, -1, stream);
        gpu::GaussianBlur(b.vI2[i], b.mu2, Size(11, 11), buf, 1.5, 0, BORDER_DEFAULT, -1, stream);

-        gpu::multiply(b.mu1, b.mu1, b.mu1_2, stream);
-        gpu::multiply(b.mu2, b.mu2, b.mu2_2, stream);
-        gpu::multiply(b.mu1, b.mu2, b.mu1_mu2, stream);
+        gpu::multiply(b.mu1, b.mu1, b.mu1_2, 1, -1, stream);
+        gpu::multiply(b.mu2, b.mu2, b.mu2_2, 1, -1, stream);
+        gpu::multiply(b.mu1, b.mu2, b.mu1_mu2, 1, -1, stream);

        gpu::GaussianBlur(b.I1_2, b.sigma1_2, Size(11, 11), buf, 1.5, 0, BORDER_DEFAULT, -1, stream);
        gpu::subtract(b.sigma1_2, b.mu1_2, b.sigma1_2, gpu::GpuMat(), -1, stream);
--- a/samples/gpu/driver_api_multi.cpp
+++ b/samples/gpu/driver_api_multi.cpp
@ -82,8 +82,8 @@ int main()
        if (!dev_info.isCompatible())
        {
            std::cout << "GPU module isn't built for GPU #" << i << " ("
-                 << dev_info.name() << ", CC " << dev_info.majorVersion()
-                 << dev_info.minorVersion() << "\n";
+                 << dev_info.name() << ", CC " << dev_info.major()
+                 << dev_info.minor() << "\n";
            return -1;
        }
    }
--- a/samples/gpu/driver_api_stereo_multi.cpp
+++ b/samples/gpu/driver_api_stereo_multi.cpp
@ -112,8 +112,8 @@ int main(int argc, char** argv)
        if (!dev_info.isCompatible())
        {
            std::cout << "GPU module isn't built for GPU #" << i << " ("
-                 << dev_info.name() << ", CC " << dev_info.majorVersion()
-                 << dev_info.minorVersion() << "\n";
+                 << dev_info.name() << ", CC " << dev_info.major()
+                 << dev_info.minor() << "\n";
            return -1;
        }
    }
--- a/samples/gpu/multi.cpp
+++ b/samples/gpu/multi.cpp
@ -62,8 +62,8 @@ int main()
        if (!dev_info.isCompatible())
        {
            std::cout << "GPU module isn't built for GPU #" << i << " ("
-                 << dev_info.name() << ", CC " << dev_info.majorVersion()
-                 << dev_info.minorVersion() << "\n";
+                 << dev_info.name() << ", CC " << dev_info.major()
+                 << dev_info.minor() << "\n";
            return -1;
        }
    }
--- a/samples/gpu/opengl.cpp
+++ b/samples/gpu/opengl.cpp
@ -25,7 +25,7 @@ int main()

 #include "opencv2/core/core.hpp"
 #include "opencv2/core/opengl.hpp"
-#include "opencv2/core/gpumat.hpp"
+#include "opencv2/core/gpu.hpp"
 #include "opencv2/highgui/highgui.hpp"

 using namespace std;
--- a/samples/gpu/performance/performance.cpp
+++ b/samples/gpu/performance/performance.cpp
@ -191,7 +191,7 @@ int main(int argc, const char* argv[])
    DeviceInfo dev_info(device);
    if (!dev_info.isCompatible())
    {
-        cerr << "GPU module isn't built for GPU #" << device << " " << dev_info.name() << ", CC " << dev_info.majorVersion() << '.' << dev_info.minorVersion() << endl;
+        cerr << "GPU module isn't built for GPU #" << device << " " << dev_info.name() << ", CC " << dev_info.major() << '.' << dev_info.minor() << endl;
        return -1;
    }
    setDevice(device);
--- a/samples/gpu/stereo_multi.cpp
+++ b/samples/gpu/stereo_multi.cpp
@ -81,8 +81,8 @@ int main(int argc, char** argv)
        if (!dev_info.isCompatible())
        {
            std::cout << "GPU module isn't built for GPU #" << i << " ("
-                 << dev_info.name() << ", CC " << dev_info.majorVersion()
-                 << dev_info.minorVersion() << "\n";
+                 << dev_info.name() << ", CC " << dev_info.major()
+                 << dev_info.minor() << "\n";
            return -1;
        }
    }