renamed gpu module -> cuda

This commit is contained in:
Vladislav Vinogradov
2013-07-24 11:41:44 +04:00
parent 29386f1449
commit ae94256edc
58 changed files with 45 additions and 131 deletions

View File

@@ -0,0 +1,9 @@
if(ANDROID OR IOS)
ocv_module_disable(cuda)
endif()
set(the_description "CUDA-accelerated Computer Vision")
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4100 /wd4324 /wd4512 /wd4515 -Wundef -Wmissing-declarations -Wshadow -Wunused-parameter)
ocv_define_module(cuda opencv_calib3d opencv_objdetect opencv_cudaarithm opencv_cudawarping OPTIONAL opencv_cudalegacy)

View File

@@ -0,0 +1,36 @@
Camera Calibration and 3D Reconstruction
========================================
.. highlight:: cpp
gpu::solvePnPRansac
-------------------
Finds the object pose from 3D-2D point correspondences.
.. ocv:function:: void gpu::solvePnPRansac(const Mat& object, const Mat& image, const Mat& camera_mat, const Mat& dist_coef, Mat& rvec, Mat& tvec, bool use_extrinsic_guess=false, int num_iters=100, float max_dist=8.0, int min_inlier_count=100, vector<int>* inliers=NULL)
:param object: Single-row matrix of object points.
:param image: Single-row matrix of image points.
:param camera_mat: 3x3 matrix of intrinsic camera parameters.
:param dist_coef: Distortion coefficients. See :ocv:func:`undistortPoints` for details.
:param rvec: Output 3D rotation vector.
:param tvec: Output 3D translation vector.
:param use_extrinsic_guess: Flag to indicate that the function must use ``rvec`` and ``tvec`` as an initial transformation guess. It is not supported for now.
:param num_iters: Maximum number of RANSAC iterations.
:param max_dist: Euclidean distance threshold to detect whether point is inlier or not.
:param min_inlier_count: Minimum number of inliers at which the function stops. It is not supported for now.
:param inliers: Output vector of inlier indices.
.. seealso:: :ocv:func:`solvePnPRansac`
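A minimal usage sketch (illustrative only; it assumes the ``cv::cuda`` namespace introduced by this rename and already-filled point matrices): ::

    void estimatePose(const cv::Mat& object,      // 1xN CV_32FC3 object points
                      const cv::Mat& image,       // 1xN CV_32FC2 image points
                      const cv::Mat& camera_mat,  // 3x3 intrinsic matrix
                      const cv::Mat& dist_coef)   // distortion coefficients
    {
        cv::Mat rvec, tvec;
        std::vector<int> inliers;
        // illustrative values: 200 RANSAC iterations, 4-pixel inlier threshold
        cv::cuda::solvePnPRansac(object, image, camera_mat, dist_coef,
                                 rvec, tvec, false, 200, 4.0f, 100, &inliers);
    }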

12 modules/cuda/doc/cuda.rst Normal file
View File

@@ -0,0 +1,12 @@
**************************************
cuda. CUDA-accelerated Computer Vision
**************************************
.. toctree::
:maxdepth: 1
introduction
initalization_and_information
data_structures
object_detection
calib3d

View File

@@ -0,0 +1,311 @@
Data Structures
===============
.. highlight:: cpp
gpu::PtrStepSz
--------------
.. ocv:class:: gpu::PtrStepSz
Lightweight class encapsulating pitched memory on a GPU and passed to nvcc-compiled code (CUDA kernels). Typically, it is used internally by OpenCV and by users who write device code. You can call its members from both host and device code. ::
template <typename T> struct PtrStepSz : public PtrStep<T>
{
__CV_GPU_HOST_DEVICE__ PtrStepSz() : cols(0), rows(0) {}
__CV_GPU_HOST_DEVICE__ PtrStepSz(int rows_, int cols_, T* data_, size_t step_)
: PtrStep<T>(data_, step_), cols(cols_), rows(rows_) {}
template <typename U>
explicit PtrStepSz(const PtrStepSz<U>& d) : PtrStep<T>((T*)d.data, d.step), cols(d.cols), rows(d.rows){}
int cols;
int rows;
};
typedef PtrStepSz<unsigned char> PtrStepSzb;
typedef PtrStepSz<float> PtrStepSzf;
typedef PtrStepSz<int> PtrStepSzi;
gpu::PtrStep
------------
.. ocv:class:: gpu::PtrStep
Structure similar to :ocv:class:`gpu::PtrStepSz` but containing only a pointer and row step. Width and height fields are excluded due to performance reasons. The structure is intended for internal use or for users who write device code. ::
template <typename T> struct PtrStep : public DevPtr<T>
{
__CV_GPU_HOST_DEVICE__ PtrStep() : step(0) {}
__CV_GPU_HOST_DEVICE__ PtrStep(T* data_, size_t step_) : DevPtr<T>(data_), step(step_) {}
//! stride between two consecutive rows in bytes. Step is stored always and everywhere in bytes!!!
size_t step;
__CV_GPU_HOST_DEVICE__ T* ptr(int y = 0) { return ( T*)( ( char*)DevPtr<T>::data + y * step); }
__CV_GPU_HOST_DEVICE__ const T* ptr(int y = 0) const { return (const T*)( (const char*)DevPtr<T>::data + y * step); }
__CV_GPU_HOST_DEVICE__ T& operator ()(int y, int x) { return ptr(y)[x]; }
__CV_GPU_HOST_DEVICE__ const T& operator ()(int y, int x) const { return ptr(y)[x]; }
};
typedef PtrStep<unsigned char> PtrStepb;
typedef PtrStep<float> PtrStepf;
typedef PtrStep<int> PtrStepi;
gpu::GpuMat
-----------
.. ocv:class:: gpu::GpuMat
Base storage class for GPU memory with reference counting. Its interface matches the :ocv:class:`Mat` interface with the following limitations:
* no arbitrary dimensions support (only 2D)
* no functions that return references to their data (because references to GPU memory are not valid for CPU code)
* no expression templates technique support
Beware that the latter limitation may lead to overloaded matrix operators that cause memory allocations. The ``GpuMat`` class is convertible to :ocv:class:`gpu::PtrStepSz` and :ocv:class:`gpu::PtrStep` so it can be passed directly to the kernel.
.. note:: In contrast with :ocv:class:`Mat`, in most cases ``GpuMat::isContinuous() == false`` . This means that rows are aligned to a size depending on the hardware. Single-row ``GpuMat`` is always a continuous matrix.
::
class CV_EXPORTS GpuMat
{
public:
//! default constructor
GpuMat();
//! constructs GpuMat of the specified size and type
GpuMat(int rows, int cols, int type);
GpuMat(Size size, int type);
.....
//! builds GpuMat from host memory (Blocking call)
explicit GpuMat(InputArray arr);
//! returns lightweight PtrStepSz structure for passing
//to nvcc-compiled code. Contains size, data ptr and step.
template <class T> operator PtrStepSz<T>() const;
template <class T> operator PtrStep<T>() const;
//! performs data upload to GpuMat (Blocking call)
void upload(InputArray arr);
//! performs data upload to GpuMat (Non-Blocking call)
void upload(InputArray arr, Stream& stream);
//! performs data download from device to host memory (Blocking call)
void download(OutputArray dst) const;
//! performs data download from device to host memory (Non-Blocking call)
void download(OutputArray dst, Stream& stream) const;
};
.. note:: You are not recommended to leave static or global ``GpuMat`` variables allocated, that is, to rely on their destructors. The destruction order of such variables and of the CUDA context is undefined. The GPU memory release function returns an error if the CUDA context has been destroyed beforehand.
.. seealso:: :ocv:class:`Mat`
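A minimal round-trip sketch (illustrative; the input file name and the ``cv::cuda`` namespace are assumptions): ::

    cv::Mat host = cv::imread("lena.png", cv::IMREAD_GRAYSCALE);  // hypothetical input
    cv::cuda::GpuMat device;
    device.upload(host);        // blocking host -> device copy
    // ... run any GPU routine that accepts a GpuMat here ...
    cv::Mat result;
    device.download(result);    // blocking device -> host copy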
gpu::createContinuous
---------------------
Creates a continuous matrix.
.. ocv:function:: void gpu::createContinuous(int rows, int cols, int type, OutputArray arr)
:param rows: Row count.
:param cols: Column count.
:param type: Type of the matrix.
:param arr: Destination matrix. This parameter changes only if it has a proper type and area ( :math:`\texttt{rows} \times \texttt{cols}` ).
Matrix is called continuous if its elements are stored continuously, that is, without gaps at the end of each row.
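For example, a continuous destination can be requested explicitly when downstream code assumes no row padding (illustrative sketch): ::

    cv::cuda::GpuMat buf;
    cv::cuda::createContinuous(480, 640, CV_8UC1, buf);
    CV_Assert(buf.isContinuous());  // rows are stored without gaps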
gpu::ensureSizeIsEnough
-----------------------
Ensures that the size of a matrix is big enough and the matrix has a proper type.
.. ocv:function:: void gpu::ensureSizeIsEnough(int rows, int cols, int type, OutputArray arr)
:param rows: Minimum desired number of rows.
:param cols: Minimum desired number of columns.
:param type: Desired matrix type.
:param arr: Destination matrix.
The function does not reallocate memory if the matrix has proper attributes already.
gpu::CudaMem
------------
.. ocv:class:: gpu::CudaMem
Class with reference counting wrapping special memory type allocation functions from CUDA. Its interface is also :ocv:func:`Mat`-like but with additional memory type parameters.
* **PAGE_LOCKED** sets a page locked memory type used commonly for fast and asynchronous uploading/downloading data from/to GPU.
* **SHARED** specifies a zero copy memory allocation that enables mapping the host memory to GPU address space, if supported.
* **WRITE_COMBINED** sets the write combined buffer that is not cached by CPU. Such buffers are used to supply GPU with data when GPU only reads it. The advantage is a better CPU cache utilization.
.. note:: Allocation size of such memory types is usually limited. For more details, see *CUDA 2.2 Pinned Memory APIs* document or *CUDA C Programming Guide*.
::
class CV_EXPORTS CudaMem
{
public:
enum AllocType { PAGE_LOCKED = 1, SHARED = 2, WRITE_COMBINED = 4 };
explicit CudaMem(AllocType alloc_type = PAGE_LOCKED);
CudaMem(int rows, int cols, int type, AllocType alloc_type = PAGE_LOCKED);
CudaMem(Size size, int type, AllocType alloc_type = PAGE_LOCKED);
//! creates from host memory, copying the data
explicit CudaMem(InputArray arr, AllocType alloc_type = PAGE_LOCKED);
......
//! returns matrix header with disabled reference counting for CudaMem data.
Mat createMatHeader() const;
//! maps host memory into device address space and returns GpuMat header for it. Throws exception if not supported by hardware.
GpuMat createGpuMatHeader() const;
......
AllocType alloc_type;
};
gpu::CudaMem::createMatHeader
-----------------------------
Creates a header without reference counting to :ocv:class:`gpu::CudaMem` data.
.. ocv:function:: Mat gpu::CudaMem::createMatHeader() const
gpu::CudaMem::createGpuMatHeader
--------------------------------
Maps CPU memory to GPU address space and creates the :ocv:class:`gpu::GpuMat` header without reference counting for it.
.. ocv:function:: GpuMat gpu::CudaMem::createGpuMatHeader() const
This can be done only if memory was allocated with the ``SHARED`` flag and if it is supported by the hardware. Laptops often share video and CPU memory, so address spaces can be mapped, which eliminates an extra copy.
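A hedged sketch of this zero-copy pattern (valid only when the ``SHARED`` allocation succeeds and the device supports host memory mapping): ::

    cv::cuda::CudaMem shared(480, 640, CV_8UC1, cv::cuda::CudaMem::SHARED);
    cv::Mat host = shared.createMatHeader();               // CPU view of the buffer
    cv::cuda::GpuMat device = shared.createGpuMatHeader(); // GPU view, no copy
    // writes made through 'host' are visible to kernels reading 'device'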
gpu::registerPageLocked
-----------------------
Page-locks the memory of matrix and maps it for the device(s).
.. ocv:function:: void gpu::registerPageLocked(Mat& m)
:param m: Input matrix.
gpu::unregisterPageLocked
-------------------------
Unmaps the memory of matrix and makes it pageable again.
.. ocv:function:: void gpu::unregisterPageLocked(Mat& m)
:param m: Input matrix.
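A short sketch: pin an existing host matrix before repeated asynchronous transfers, then unpin it (illustrative): ::

    cv::Mat frame(480, 640, CV_8UC1);
    cv::cuda::registerPageLocked(frame);    // page-lock the existing host memory
    // ... repeated asynchronous uploads/downloads of 'frame' go here ...
    cv::cuda::unregisterPageLocked(frame);  // make the memory pageable again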
gpu::Stream
-----------
.. ocv:class:: gpu::Stream
This class encapsulates a queue of asynchronous calls.
.. note:: Currently, you may face problems if an operation is enqueued twice with different data. Some functions use constant GPU memory, and the next call may update that memory before the previous call has finished. However, calling different operations asynchronously is safe because each operation has its own constant buffer. Memory copy/upload/download/set operations to the buffers you hold are also safe.
::
class CV_EXPORTS Stream
{
public:
Stream();
//! queries an asynchronous stream for completion status
bool queryIfComplete() const;
//! waits for stream tasks to complete
void waitForCompletion();
//! makes a compute stream wait on an event
void waitEvent(const Event& event);
//! adds a callback to be called on the host after all currently enqueued items in the stream have completed
void enqueueHostCallback(StreamCallback callback, void* userData);
//! return Stream object for default CUDA stream
static Stream& Null();
//! returns true if stream object is not default (!= 0)
operator bool_type() const;
};
gpu::Stream::queryIfComplete
----------------------------
Returns ``true`` if the current stream queue is finished. Otherwise, it returns false.
.. ocv:function:: bool gpu::Stream::queryIfComplete()
gpu::Stream::waitForCompletion
------------------------------
Blocks the current CPU thread until all operations in the stream are complete.
.. ocv:function:: void gpu::Stream::waitForCompletion()
gpu::Stream::waitEvent
----------------------
Makes a compute stream wait on an event.
.. ocv:function:: void gpu::Stream::waitEvent(const Event& event)
gpu::Stream::enqueueHostCallback
--------------------------------
Adds a callback to be called on the host after all currently enqueued items in the stream have completed.
.. ocv:function:: void gpu::Stream::enqueueHostCallback(StreamCallback callback, void* userData)
.. note:: Callbacks must not make any CUDA API calls. Callbacks must not perform any synchronization that may depend on outstanding device work or other callbacks that are not mandated to run earlier. Callbacks without a mandated order (in independent streams) execute in undefined order and may be serialized.
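A hedged sketch of the asynchronous pattern: enqueue transfers on a non-default stream, overlap them with CPU work, and synchronize before reading the result (the host buffer is pinned so the copies can actually be asynchronous): ::

    cv::Mat h_src(480, 640, CV_8UC1), h_dst;
    cv::cuda::registerPageLocked(h_src);   // pinned memory enables truly asynchronous copies
    cv::cuda::GpuMat d_src;
    cv::cuda::Stream stream;
    d_src.upload(h_src, stream);           // enqueued, returns immediately
    // ... enqueue any GPU routine that accepts a Stream here ...
    d_src.download(h_dst, stream);         // enqueued download of the (processed) data
    // other CPU work may run here while the GPU queue drains
    stream.waitForCompletion();            // h_dst is valid only after this call
    cv::cuda::unregisterPageLocked(h_src);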
gpu::StreamAccessor
-------------------
.. ocv:struct:: gpu::StreamAccessor
Class that enables getting ``cudaStream_t`` from :ocv:class:`gpu::Stream` and is declared in ``stream_accessor.hpp`` because it is the only public header that depends on the CUDA Runtime API. Including it brings a dependency to your code. ::
struct StreamAccessor
{
CV_EXPORTS static cudaStream_t getStream(const Stream& stream);
};
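A sketch of interoperating with raw CUDA code (the exact include path of ``stream_accessor.hpp`` depends on the OpenCV layout and is an assumption here): ::

    #include "stream_accessor.hpp"  // brings in the CUDA Runtime API dependency

    cv::cuda::Stream stream;
    cudaStream_t raw = cv::cuda::StreamAccessor::getStream(stream);
    // 'raw' can now be passed to user kernels or to cudaMemcpyAsync directly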

View File

@@ -0,0 +1,374 @@
Initialization and Information
==============================
.. highlight:: cpp
gpu::getCudaEnabledDeviceCount
------------------------------
Returns the number of installed CUDA-enabled devices.
.. ocv:function:: int gpu::getCudaEnabledDeviceCount()
Use this function before any other GPU functions calls. If OpenCV is compiled without GPU support, this function returns 0.
gpu::setDevice
--------------
Sets a device and initializes it for the current thread.
.. ocv:function:: void gpu::setDevice(int device)
:param device: System index of a GPU device starting with 0.
If the call of this function is omitted, a default device is initialized at the first GPU usage.
gpu::getDevice
--------------
Returns the current device index set by :ocv:func:`gpu::setDevice` or initialized by default.
.. ocv:function:: int gpu::getDevice()
gpu::resetDevice
----------------
Explicitly destroys and cleans up all resources associated with the current device in the current process.
.. ocv:function:: void gpu::resetDevice()
Any subsequent API call to this device will reinitialize the device.
gpu::FeatureSet
---------------
Enumeration providing GPU computing features.
.. ocv:enum:: gpu::FeatureSet
.. ocv:emember:: FEATURE_SET_COMPUTE_10
.. ocv:emember:: FEATURE_SET_COMPUTE_11
.. ocv:emember:: FEATURE_SET_COMPUTE_12
.. ocv:emember:: FEATURE_SET_COMPUTE_13
.. ocv:emember:: FEATURE_SET_COMPUTE_20
.. ocv:emember:: FEATURE_SET_COMPUTE_21
.. ocv:emember:: GLOBAL_ATOMICS
.. ocv:emember:: SHARED_ATOMICS
.. ocv:emember:: NATIVE_DOUBLE
gpu::TargetArchs
----------------
.. ocv:class:: gpu::TargetArchs
Class providing a set of static methods to check what NVIDIA* card architecture the GPU module was built for.
The following method checks whether the module was built with the support of the given feature:
.. ocv:function:: static bool gpu::TargetArchs::builtWith( FeatureSet feature_set )
:param feature_set: Features to be checked. See :ocv:enum:`gpu::FeatureSet`.
There is a set of methods to check whether the module contains intermediate (PTX) or binary GPU code for the given architecture(s):
.. ocv:function:: static bool gpu::TargetArchs::has(int major, int minor)
.. ocv:function:: static bool gpu::TargetArchs::hasPtx(int major, int minor)
.. ocv:function:: static bool gpu::TargetArchs::hasBin(int major, int minor)
.. ocv:function:: static bool gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor)
.. ocv:function:: static bool gpu::TargetArchs::hasEqualOrGreater(int major, int minor)
.. ocv:function:: static bool gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor)
.. ocv:function:: static bool gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor)
:param major: Major compute capability version.
:param minor: Minor compute capability version.
According to the CUDA C Programming Guide Version 3.2: "PTX code produced for some specific compute capability can always be compiled to binary code of greater or equal compute capability".
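In the spirit of that rule, a device can be considered runnable either when a matching binary was built or when PTX for an equal or lower compute capability is available (illustrative sketch using the ``cv::cuda`` namespace): ::

    cv::cuda::DeviceInfo info;  // current device
    int major = info.majorVersion(), minor = info.minorVersion();
    bool runnable = cv::cuda::TargetArchs::hasBin(major, minor) ||
                    cv::cuda::TargetArchs::hasEqualOrLessPtx(major, minor);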
gpu::DeviceInfo
---------------
.. ocv:class:: gpu::DeviceInfo
Class providing functionality for querying the specified GPU properties. ::
class CV_EXPORTS DeviceInfo
{
public:
//! creates DeviceInfo object for the current GPU
DeviceInfo();
//! creates DeviceInfo object for the given GPU
DeviceInfo(int device_id);
//! ASCII string identifying device
const char* name() const;
//! global memory available on device in bytes
size_t totalGlobalMem() const;
//! shared memory available per block in bytes
size_t sharedMemPerBlock() const;
//! 32-bit registers available per block
int regsPerBlock() const;
//! warp size in threads
int warpSize() const;
//! maximum pitch in bytes allowed by memory copies
size_t memPitch() const;
//! maximum number of threads per block
int maxThreadsPerBlock() const;
//! maximum size of each dimension of a block
Vec3i maxThreadsDim() const;
//! maximum size of each dimension of a grid
Vec3i maxGridSize() const;
//! clock frequency in kilohertz
int clockRate() const;
//! constant memory available on device in bytes
size_t totalConstMem() const;
//! major compute capability
int majorVersion() const;
//! minor compute capability
int minorVersion() const;
//! alignment requirement for textures
size_t textureAlignment() const;
//! pitch alignment requirement for texture references bound to pitched memory
size_t texturePitchAlignment() const;
//! number of multiprocessors on device
int multiProcessorCount() const;
//! specifies whether there is a run-time limit on kernels
bool kernelExecTimeoutEnabled() const;
//! device is integrated as opposed to discrete
bool integrated() const;
//! device can map host memory with cudaHostAlloc/cudaHostGetDevicePointer
bool canMapHostMemory() const;
enum ComputeMode
{
ComputeModeDefault, /**< default compute mode (Multiple threads can use ::cudaSetDevice() with this device) */
ComputeModeExclusive, /**< compute-exclusive-thread mode (Only one thread in one process will be able to use ::cudaSetDevice() with this device) */
ComputeModeProhibited, /**< compute-prohibited mode (No threads can use ::cudaSetDevice() with this device) */
ComputeModeExclusiveProcess /**< compute-exclusive-process mode (Many threads in one process will be able to use ::cudaSetDevice() with this device) */
};
//! compute mode
ComputeMode computeMode() const;
//! maximum 1D texture size
int maxTexture1D() const;
//! maximum 1D mipmapped texture size
int maxTexture1DMipmap() const;
//! maximum size for 1D textures bound to linear memory
int maxTexture1DLinear() const;
//! maximum 2D texture dimensions
Vec2i maxTexture2D() const;
//! maximum 2D mipmapped texture dimensions
Vec2i maxTexture2DMipmap() const;
//! maximum dimensions (width, height, pitch) for 2D textures bound to pitched memory
Vec3i maxTexture2DLinear() const;
//! maximum 2D texture dimensions if texture gather operations have to be performed
Vec2i maxTexture2DGather() const;
//! maximum 3D texture dimensions
Vec3i maxTexture3D() const;
//! maximum Cubemap texture dimensions
int maxTextureCubemap() const;
//! maximum 1D layered texture dimensions
Vec2i maxTexture1DLayered() const;
//! maximum 2D layered texture dimensions
Vec3i maxTexture2DLayered() const;
//! maximum Cubemap layered texture dimensions
Vec2i maxTextureCubemapLayered() const;
//! maximum 1D surface size
int maxSurface1D() const;
//! maximum 2D surface dimensions
Vec2i maxSurface2D() const;
//! maximum 3D surface dimensions
Vec3i maxSurface3D() const;
//! maximum 1D layered surface dimensions
Vec2i maxSurface1DLayered() const;
//! maximum 2D layered surface dimensions
Vec3i maxSurface2DLayered() const;
//! maximum Cubemap surface dimensions
int maxSurfaceCubemap() const;
//! maximum Cubemap layered surface dimensions
Vec2i maxSurfaceCubemapLayered() const;
//! alignment requirements for surfaces
size_t surfaceAlignment() const;
//! device can possibly execute multiple kernels concurrently
bool concurrentKernels() const;
//! device has ECC support enabled
bool ECCEnabled() const;
//! PCI bus ID of the device
int pciBusID() const;
//! PCI device ID of the device
int pciDeviceID() const;
//! PCI domain ID of the device
int pciDomainID() const;
//! true if device is a Tesla device using TCC driver, false otherwise
bool tccDriver() const;
//! number of asynchronous engines
int asyncEngineCount() const;
//! device shares a unified address space with the host
bool unifiedAddressing() const;
//! peak memory clock frequency in kilohertz
int memoryClockRate() const;
//! global memory bus width in bits
int memoryBusWidth() const;
//! size of L2 cache in bytes
int l2CacheSize() const;
//! maximum resident threads per multiprocessor
int maxThreadsPerMultiProcessor() const;
//! gets free and total device memory
void queryMemory(size_t& totalMemory, size_t& freeMemory) const;
size_t freeMemory() const;
size_t totalMemory() const;
//! checks whether device supports the given feature
bool supports(FeatureSet feature_set) const;
//! checks whether the GPU module can be run on the given device
bool isCompatible() const;
};
gpu::DeviceInfo::DeviceInfo
---------------------------
The constructors.
.. ocv:function:: gpu::DeviceInfo::DeviceInfo()
.. ocv:function:: gpu::DeviceInfo::DeviceInfo(int device_id)
:param device_id: System index of the GPU device starting with 0.
Constructs the ``DeviceInfo`` object for the specified device. If the ``device_id`` parameter is omitted, it constructs an object for the current device.
gpu::DeviceInfo::name
---------------------
Returns the device name.
.. ocv:function:: const char* gpu::DeviceInfo::name() const
gpu::DeviceInfo::majorVersion
-----------------------------
Returns the major compute capability version.
.. ocv:function:: int gpu::DeviceInfo::majorVersion()
gpu::DeviceInfo::minorVersion
-----------------------------
Returns the minor compute capability version.
.. ocv:function:: int gpu::DeviceInfo::minorVersion()
gpu::DeviceInfo::freeMemory
---------------------------
Returns the amount of free memory in bytes.
.. ocv:function:: size_t gpu::DeviceInfo::freeMemory()
gpu::DeviceInfo::totalMemory
----------------------------
Returns the amount of total memory in bytes.
.. ocv:function:: size_t gpu::DeviceInfo::totalMemory()
gpu::DeviceInfo::supports
-------------------------
Provides information on GPU feature support.
.. ocv:function:: bool gpu::DeviceInfo::supports(FeatureSet feature_set) const
:param feature_set: Features to be checked. See :ocv:enum:`gpu::FeatureSet`.
This function returns ``true`` if the device has the specified GPU feature. Otherwise, it returns ``false`` .
gpu::DeviceInfo::isCompatible
-----------------------------
Checks the GPU module and device compatibility.
.. ocv:function:: bool gpu::DeviceInfo::isCompatible()
This function returns ``true`` if the GPU module can be run on the specified device. Otherwise, it returns ``false`` .
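A sketch that enumerates all CUDA-enabled devices and prints the ones the module can run on (illustrative; requires ``<iostream>``): ::

    int count = cv::cuda::getCudaEnabledDeviceCount();
    for (int id = 0; id < count; ++id)
    {
        cv::cuda::DeviceInfo info(id);
        if (info.isCompatible())
            std::cout << id << ": " << info.name() << " (CC "
                      << info.majorVersion() << "." << info.minorVersion() << ")" << std::endl;
    }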
gpu::DeviceInfo::deviceID
-------------------------
Returns system index of the GPU device starting with 0.
.. ocv:function:: int gpu::DeviceInfo::deviceID()

View File

@@ -0,0 +1,62 @@
GPU Module Introduction
=======================
.. highlight:: cpp
General Information
-------------------
The OpenCV GPU module is a set of classes and functions to utilize GPU computational capabilities. It is implemented using NVIDIA* CUDA* Runtime API and supports only NVIDIA GPUs. The OpenCV GPU module includes utility functions, low-level vision primitives, and high-level algorithms. The utility functions and low-level primitives provide a powerful infrastructure for developing fast vision algorithms taking advantage of GPU whereas the high-level functionality includes some state-of-the-art algorithms (such as stereo correspondence, face and people detectors, and others) ready to be used by the application developers.
The GPU module is designed as a host-level API. This means that if you have pre-compiled OpenCV GPU binaries, you are not required to have the CUDA Toolkit installed or write any extra code to make use of the GPU.
The OpenCV GPU module is designed for ease of use and does not require any knowledge of CUDA. However, such knowledge will certainly be useful to handle non-trivial cases or achieve the highest performance. It is helpful to understand the cost of various operations, what the GPU does, what the preferred data formats are, and so on. The GPU module is an effective instrument for quick implementation of GPU-accelerated computer vision algorithms. However, if your algorithm involves many simple operations, then, for the best possible performance, you may still need to write your own kernels to avoid extra write and read operations on the intermediate results.
To enable CUDA support, configure OpenCV using ``CMake`` with ``WITH_CUDA=ON`` . When the flag is set and if CUDA is installed, the full-featured OpenCV GPU module is built. Otherwise, the module is still built but at runtime all functions from the module throw
:ocv:class:`Exception` with ``CV_GpuNotSupported`` error code, except for
:ocv:func:`gpu::getCudaEnabledDeviceCount()`. The latter function returns zero GPU count in this case. Building OpenCV without CUDA support does not perform device code compilation, so it does not require the CUDA Toolkit installed. Therefore, using the
:ocv:func:`gpu::getCudaEnabledDeviceCount()` function, you can implement a high-level algorithm that will detect GPU presence at runtime and choose an appropriate implementation (CPU or GPU) accordingly.
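The dispatch pattern described above might look like the following sketch, where ``processOnGpu`` and ``processOnCpu`` are hypothetical application routines and the ``cv::cuda`` namespace follows this rename: ::

    void processOnGpu(const cv::Mat& frame);  // hypothetical GPU implementation
    void processOnCpu(const cv::Mat& frame);  // hypothetical CPU fallback

    void process(const cv::Mat& frame)
    {
        if (cv::cuda::getCudaEnabledDeviceCount() > 0)
            processOnGpu(frame);
        else
            processOnCpu(frame);
    }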
Compilation for Different NVIDIA* Platforms
-------------------------------------------
NVIDIA* compiler enables generating binary code (cubin and fatbin) and intermediate code (PTX). Binary code often implies a specific GPU architecture and generation, so the compatibility with other GPUs is not guaranteed. PTX is targeted for a virtual platform that is defined entirely by the set of capabilities or features. Depending on the selected virtual platform, some of the instructions are emulated or disabled, even if the real hardware supports all the features.
At the first call, the PTX code is compiled to binary code for the particular GPU using a JIT compiler. When the target GPU has a compute capability (CC) lower than the PTX code, JIT fails.
By default, the OpenCV GPU module includes:
*
Binaries for compute capabilities 1.3 and 2.0 (controlled by ``CUDA_ARCH_BIN`` in ``CMake``)
*
PTX code for compute capabilities 1.1 and 1.3 (controlled by ``CUDA_ARCH_PTX`` in ``CMake``)
This means that for devices with CC 1.3 and 2.0 binary images are ready to run. For all newer platforms, the PTX code for 1.3 is JIT'ed to a binary image. For devices with CC 1.1 and 1.2, the PTX for 1.1 is JIT'ed. For devices with CC 1.0, no code is available and the functions throw
:ocv:class:`Exception`. For platforms where JIT compilation is performed first, the run is slow.
On a GPU with CC 1.0, you can still compile the GPU module and most of the functions will run flawlessly. To achieve this, add "1.0" to the list of binaries, for example, ``CUDA_ARCH_BIN="1.0 1.3 2.0"`` . The functions that cannot be run on CC 1.0 GPUs throw an exception.
You can always determine at runtime whether the OpenCV GPU-built binaries (or PTX code) are compatible with your GPU. The function
:ocv:func:`gpu::DeviceInfo::isCompatible` returns the compatibility status (true/false).
Utilizing Multiple GPUs
-----------------------
In the current version, each of the OpenCV GPU algorithms can use only a single GPU. So, to utilize multiple GPUs, you have to manually distribute the work between GPUs.
Switching the active device can be done using the :ocv:func:`gpu::setDevice()` function. For more details, please read the CUDA C Programming Guide.
While developing algorithms for multiple GPUs, note the data passing overhead. For primitive functions and small images, it can be significant, which may eliminate all the advantages of having multiple GPUs. But for high-level algorithms, consider using multi-GPU acceleration. For example, the Stereo Block Matching algorithm has been successfully parallelized using the following algorithm:
1. Split each image of the stereo pair into two horizontal overlapping stripes.
2. Process each pair of stripes (from the left and right images) on a separate Fermi* GPU.
3. Merge the results into a single disparity map.
With this algorithm, a dual GPU gave a 180% performance increase compared to a single Fermi GPU. For a source code example, see
http://code.opencv.org/projects/opencv/repository/revisions/master/entry/samples/gpu/.
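A hedged sketch of the manual work distribution mentioned above, with one host thread per GPU; the stereo-matching call itself is a placeholder: ::

    void processStripe(int deviceId, const cv::Mat& leftStripe, const cv::Mat& rightStripe)
    {
        cv::cuda::setDevice(deviceId);  // bind this host thread to one GPU
        cv::cuda::GpuMat d_left(leftStripe), d_right(rightStripe);
        // ... run the stereo matcher on this stripe and download its disparity ...
    }
    // Launch processStripe(0, ...) and processStripe(1, ...) from two host threads,
    // then merge the resulting disparity stripes into a single map.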

View File

@@ -0,0 +1,325 @@
Object Detection
================
.. highlight:: cpp
gpu::HOGDescriptor
------------------
.. ocv:struct:: gpu::HOGDescriptor
The class implements Histogram of Oriented Gradients ([Dalal2005]_) object detector. ::
struct CV_EXPORTS HOGDescriptor
{
enum { DEFAULT_WIN_SIGMA = -1 };
enum { DEFAULT_NLEVELS = 64 };
enum { DESCR_FORMAT_ROW_BY_ROW, DESCR_FORMAT_COL_BY_COL };
HOGDescriptor(Size win_size=Size(64, 128), Size block_size=Size(16, 16),
Size block_stride=Size(8, 8), Size cell_size=Size(8, 8),
int nbins=9, double win_sigma=DEFAULT_WIN_SIGMA,
double threshold_L2hys=0.2, bool gamma_correction=true,
int nlevels=DEFAULT_NLEVELS);
size_t getDescriptorSize() const;
size_t getBlockHistogramSize() const;
void setSVMDetector(const vector<float>& detector);
static vector<float> getDefaultPeopleDetector();
static vector<float> getPeopleDetector48x96();
static vector<float> getPeopleDetector64x128();
void detect(const GpuMat& img, vector<Point>& found_locations,
double hit_threshold=0, Size win_stride=Size(),
Size padding=Size());
void detectMultiScale(const GpuMat& img, vector<Rect>& found_locations,
double hit_threshold=0, Size win_stride=Size(),
Size padding=Size(), double scale0=1.05,
int group_threshold=2);
void getDescriptors(const GpuMat& img, Size win_stride,
GpuMat& descriptors,
int descr_format=DESCR_FORMAT_COL_BY_COL);
Size win_size;
Size block_size;
Size block_stride;
Size cell_size;
int nbins;
double win_sigma;
double threshold_L2hys;
bool gamma_correction;
int nlevels;
private:
// Hidden
}
The interfaces of all methods are kept as similar as possible to their CPU ``HOG`` descriptor and detector analogues.
.. note::
* An example applying the HOG descriptor for people detection can be found at opencv_source_code/samples/cpp/peopledetect.cpp
* A GPU example applying the HOG descriptor for people detection can be found at opencv_source_code/samples/gpu/hog.cpp
* (Python) An example applying the HOG descriptor for people detection can be found at opencv_source_code/samples/python2/peopledetect.py
gpu::HOGDescriptor::HOGDescriptor
-------------------------------------
Creates the ``HOG`` descriptor and detector.
.. ocv:function:: gpu::HOGDescriptor::HOGDescriptor(Size win_size=Size(64, 128), Size block_size=Size(16, 16), Size block_stride=Size(8, 8), Size cell_size=Size(8, 8), int nbins=9, double win_sigma=DEFAULT_WIN_SIGMA, double threshold_L2hys=0.2, bool gamma_correction=true, int nlevels=DEFAULT_NLEVELS)
:param win_size: Detection window size. Align to block size and block stride.
:param block_size: Block size in pixels. Align to cell size. Only (16,16) is supported for now.
:param block_stride: Block stride. It must be a multiple of cell size.
:param cell_size: Cell size. Only (8, 8) is supported for now.
:param nbins: Number of bins. Only 9 bins per cell are supported for now.
:param win_sigma: Gaussian smoothing window parameter.
:param threshold_L2hys: L2-Hys normalization method shrinkage.
:param gamma_correction: Flag to specify whether the gamma correction preprocessing is required or not.
:param nlevels: Maximum number of detection window increases.
gpu::HOGDescriptor::getDescriptorSize
-----------------------------------------
Returns the number of coefficients required for the classification.
.. ocv:function:: size_t gpu::HOGDescriptor::getDescriptorSize() const
gpu::HOGDescriptor::getBlockHistogramSize
---------------------------------------------
Returns the block histogram size.
.. ocv:function:: size_t gpu::HOGDescriptor::getBlockHistogramSize() const
gpu::HOGDescriptor::setSVMDetector
--------------------------------------
Sets coefficients for the linear SVM classifier.
.. ocv:function:: void gpu::HOGDescriptor::setSVMDetector(const vector<float>& detector)
gpu::HOGDescriptor::getDefaultPeopleDetector
------------------------------------------------
Returns coefficients of the classifier trained for people detection (for default window size).
.. ocv:function:: static vector<float> gpu::HOGDescriptor::getDefaultPeopleDetector()
gpu::HOGDescriptor::getPeopleDetector48x96
----------------------------------------------
Returns coefficients of the classifier trained for people detection (for 48x96 windows).
.. ocv:function:: static vector<float> gpu::HOGDescriptor::getPeopleDetector48x96()
gpu::HOGDescriptor::getPeopleDetector64x128
-----------------------------------------------
Returns coefficients of the classifier trained for people detection (for 64x128 windows).
.. ocv:function:: static vector<float> gpu::HOGDescriptor::getPeopleDetector64x128()
gpu::HOGDescriptor::detect
------------------------------
Performs object detection without a multi-scale window.
.. ocv:function:: void gpu::HOGDescriptor::detect(const GpuMat& img, vector<Point>& found_locations, double hit_threshold=0, Size win_stride=Size(), Size padding=Size())
:param img: Source image. ``CV_8UC1`` and ``CV_8UC4`` types are supported for now.
:param found_locations: Left-top corner points of detected objects boundaries.
:param hit_threshold: Threshold for the distance between features and SVM classifying plane. Usually it is 0 and should be specified in the detector coefficients (as the last free coefficient). But if the free coefficient is omitted (which is allowed), you can specify it manually here.
:param win_stride: Window stride. It must be a multiple of block stride.
:param padding: Mock parameter to keep the CPU interface compatibility. It must be (0,0).
gpu::HOGDescriptor::detectMultiScale
----------------------------------------
Performs object detection with a multi-scale window.
.. ocv:function:: void gpu::HOGDescriptor::detectMultiScale(const GpuMat& img, vector<Rect>& found_locations, double hit_threshold=0, Size win_stride=Size(), Size padding=Size(), double scale0=1.05, int group_threshold=2)
:param img: Source image. See :ocv:func:`gpu::HOGDescriptor::detect` for type limitations.
:param found_locations: Detected objects boundaries.
:param hit_threshold: Threshold for the distance between features and SVM classifying plane. See :ocv:func:`gpu::HOGDescriptor::detect` for details.
:param win_stride: Window stride. It must be a multiple of block stride.
:param padding: Mock parameter to keep the CPU interface compatibility. It must be (0,0).
:param scale0: Coefficient of the detection window increase.
:param group_threshold: Coefficient to regulate the similarity threshold. When detected, some objects can be covered by many rectangles. 0 means not to perform grouping. See :ocv:func:`groupRectangles` .
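A minimal people-detection sketch built from the calls documented above (illustrative; the input file name and the ``cv::cuda`` namespace are assumptions): ::

    cv::Mat host = cv::imread("pedestrians.png", cv::IMREAD_GRAYSCALE);  // CV_8UC1 input
    cv::cuda::HOGDescriptor hog;
    hog.setSVMDetector(cv::cuda::HOGDescriptor::getDefaultPeopleDetector());
    cv::cuda::GpuMat d_img(host);
    std::vector<cv::Rect> people;
    hog.detectMultiScale(d_img, people);  // default stride, scale and grouping
    for (size_t i = 0; i < people.size(); ++i)
        cv::rectangle(host, people[i], cv::Scalar(255));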
gpu::HOGDescriptor::getDescriptors
--------------------------------------
Returns block descriptors computed for the whole image.
.. ocv:function:: void gpu::HOGDescriptor::getDescriptors(const GpuMat& img, Size win_stride, GpuMat& descriptors, int descr_format=DESCR_FORMAT_COL_BY_COL)
:param img: Source image. See :ocv:func:`gpu::HOGDescriptor::detect` for type limitations.
:param win_stride: Window stride. It must be a multiple of block stride.
:param descriptors: 2D array of descriptors.
:param descr_format: Descriptor storage format:
* **DESCR_FORMAT_ROW_BY_ROW** - Row-major order.
* **DESCR_FORMAT_COL_BY_COL** - Column-major order.
The function is mainly used to learn the classifier.
gpu::CascadeClassifier_GPU
--------------------------
.. ocv:class:: gpu::CascadeClassifier_GPU
Cascade classifier class used for object detection. Supports HAAR and LBP cascades. ::
class CV_EXPORTS CascadeClassifier_GPU
{
public:
CascadeClassifier_GPU();
CascadeClassifier_GPU(const String& filename);
~CascadeClassifier_GPU();
bool empty() const;
bool load(const String& filename);
void release();
/* Returns number of detected objects */
int detectMultiScale( const GpuMat& image, GpuMat& objectsBuf, double scaleFactor=1.2, int minNeighbors=4, Size minSize=Size());
int detectMultiScale( const GpuMat& image, GpuMat& objectsBuf, Size maxObjectSize, Size minSize = Size(), double scaleFactor = 1.1, int minNeighbors = 4);
/* Finds only the largest object. Special mode if training is required.*/
bool findLargestObject;
/* Draws rectangles in input image */
bool visualizeInPlace;
Size getClassifierSize() const;
};
.. note::
* A cascade classifier example can be found at opencv_source_code/samples/gpu/cascadeclassifier.cpp
* An NVIDIA API-specific cascade classifier example can be found at opencv_source_code/samples/gpu/cascadeclassifier_nvidia_api.cpp
gpu::CascadeClassifier_GPU::CascadeClassifier_GPU
-----------------------------------------------------
Loads the classifier from a file. Cascade type is detected automatically by constructor parameter.
.. ocv:function:: gpu::CascadeClassifier_GPU::CascadeClassifier_GPU(const String& filename)
:param filename: Name of the file from which the classifier is loaded. Only the old ``haar`` classifier (trained by the ``haar`` training application) and NVIDIA's ``nvbin`` are supported for HAAR and only new type of OpenCV XML cascade supported for LBP.
gpu::CascadeClassifier_GPU::empty
-------------------------------------
Checks whether the classifier is loaded or not.
.. ocv:function:: bool gpu::CascadeClassifier_GPU::empty() const
gpu::CascadeClassifier_GPU::load
------------------------------------
Loads the classifier from a file. The previous content is destroyed.
.. ocv:function:: bool gpu::CascadeClassifier_GPU::load(const String& filename)
:param filename: Name of the file from which the classifier is loaded. Only the old ``haar`` classifier (trained by the ``haar`` training application) and NVIDIA's ``nvbin`` are supported for HAAR and only new type of OpenCV XML cascade supported for LBP.
gpu::CascadeClassifier_GPU::release
---------------------------------------
Destroys the loaded classifier.
.. ocv:function:: void gpu::CascadeClassifier_GPU::release()
gpu::CascadeClassifier_GPU::detectMultiScale
------------------------------------------------
Detects objects of different sizes in the input image.
.. ocv:function:: int gpu::CascadeClassifier_GPU::detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, double scaleFactor=1.2, int minNeighbors=4, Size minSize=Size())
.. ocv:function:: int gpu::CascadeClassifier_GPU::detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, Size maxObjectSize, Size minSize = Size(), double scaleFactor = 1.1, int minNeighbors = 4)
:param image: Matrix of type ``CV_8U`` containing an image where objects should be detected.
:param objectsBuf: Buffer to store detected objects (rectangles). If it is empty, it is allocated with the default size. If not empty, the function searches not more than N objects, where ``N = sizeof(objectsBuf's data)/sizeof(cv::Rect)``.
:param maxObjectSize: Maximum possible object size. Objects larger than that are ignored. Used for second signature and supported only for LBP cascades.
:param scaleFactor: Parameter specifying how much the image size is reduced at each image scale.
:param minNeighbors: Parameter specifying how many neighbors each candidate rectangle should have to retain it.
:param minSize: Minimum possible object size. Objects smaller than that are ignored.
The detected objects are returned as a list of rectangles.
The function returns the number of detected objects, so you can retrieve them as in the following example: ::
gpu::CascadeClassifier_GPU cascade_gpu(...);
Mat image_cpu = imread(...);
GpuMat image_gpu(image_cpu);
GpuMat objbuf;
int detections_number = cascade_gpu.detectMultiScale( image_gpu,
objbuf, 1.2, minNeighbors);
Mat obj_host;
// download only detected number of rectangles
objbuf.colRange(0, detections_number).download(obj_host);
Rect* faces = obj_host.ptr<Rect>();
for(int i = 0; i < detections_number; ++i)
cv::rectangle(image_cpu, faces[i], Scalar(255));
imshow("Faces", image_cpu);
.. seealso:: :ocv:func:`CascadeClassifier::detectMultiScale`
.. [Dalal2005] Navneet Dalal and Bill Triggs. *Histogram of oriented gradients for human detection*. 2005.

View File

@@ -0,0 +1,220 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_CUDA_HPP__
#define __OPENCV_CUDA_HPP__
#ifndef __cplusplus
# error cuda.hpp header must be compiled as C++
#endif
#include "opencv2/core/cuda.hpp"
namespace cv { namespace cuda {
//////////////// HOG (Histogram-of-Oriented-Gradients) Descriptor and Object Detector //////////////
struct CV_EXPORTS HOGConfidence
{
double scale;
std::vector<Point> locations;
std::vector<double> confidences;
std::vector<double> part_scores[4];
};
struct CV_EXPORTS HOGDescriptor
{
enum { DEFAULT_WIN_SIGMA = -1 };
enum { DEFAULT_NLEVELS = 64 };
enum { DESCR_FORMAT_ROW_BY_ROW, DESCR_FORMAT_COL_BY_COL };
HOGDescriptor(Size win_size=Size(64, 128), Size block_size=Size(16, 16),
Size block_stride=Size(8, 8), Size cell_size=Size(8, 8),
int nbins=9, double win_sigma=DEFAULT_WIN_SIGMA,
double threshold_L2hys=0.2, bool gamma_correction=true,
int nlevels=DEFAULT_NLEVELS);
size_t getDescriptorSize() const;
size_t getBlockHistogramSize() const;
void setSVMDetector(const std::vector<float>& detector);
static std::vector<float> getDefaultPeopleDetector();
static std::vector<float> getPeopleDetector48x96();
static std::vector<float> getPeopleDetector64x128();
void detect(const GpuMat& img, std::vector<Point>& found_locations,
double hit_threshold=0, Size win_stride=Size(),
Size padding=Size());
void detectMultiScale(const GpuMat& img, std::vector<Rect>& found_locations,
double hit_threshold=0, Size win_stride=Size(),
Size padding=Size(), double scale0=1.05,
int group_threshold=2);
void computeConfidence(const GpuMat& img, std::vector<Point>& hits, double hit_threshold,
Size win_stride, Size padding, std::vector<Point>& locations, std::vector<double>& confidences);
void computeConfidenceMultiScale(const GpuMat& img, std::vector<Rect>& found_locations,
double hit_threshold, Size win_stride, Size padding,
std::vector<HOGConfidence> &conf_out, int group_threshold);
void getDescriptors(const GpuMat& img, Size win_stride,
GpuMat& descriptors,
int descr_format=DESCR_FORMAT_COL_BY_COL);
Size win_size;
Size block_size;
Size block_stride;
Size cell_size;
int nbins;
double win_sigma;
double threshold_L2hys;
bool gamma_correction;
int nlevels;
protected:
void computeBlockHistograms(const GpuMat& img);
void computeGradient(const GpuMat& img, GpuMat& grad, GpuMat& qangle);
double getWinSigma() const;
bool checkDetectorSize() const;
static int numPartsWithin(int size, int part_size, int stride);
static Size numPartsWithin(Size size, Size part_size, Size stride);
// Coefficients of the separating plane
float free_coef;
GpuMat detector;
// Results of the last classification step
GpuMat labels, labels_buf;
Mat labels_host;
// Results of the last histogram evaluation step
GpuMat block_hists, block_hists_buf;
// Gradient computation results
GpuMat grad, qangle, grad_buf, qangle_buf;
// returns a sub-buffer with the required size; reallocates the buffer if necessary.
static GpuMat getBuffer(const Size& sz, int type, GpuMat& buf);
static GpuMat getBuffer(int rows, int cols, int type, GpuMat& buf);
std::vector<GpuMat> image_scales;
};
//////////////////////////// CascadeClassifier ////////////////////////////
// The cascade classifier class for object detection: supports the old haar and new lbp xml formats, and nvbin for haar cascades only.
class CV_EXPORTS CascadeClassifier_GPU
{
public:
CascadeClassifier_GPU();
CascadeClassifier_GPU(const String& filename);
~CascadeClassifier_GPU();
bool empty() const;
bool load(const String& filename);
void release();
/* returns number of detected objects */
int detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, double scaleFactor = 1.2, int minNeighbors = 4, Size minSize = Size());
int detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, Size maxObjectSize, Size minSize = Size(), double scaleFactor = 1.1, int minNeighbors = 4);
bool findLargestObject;
bool visualizeInPlace;
Size getClassifierSize() const;
private:
struct CascadeClassifierImpl;
CascadeClassifierImpl* impl;
struct HaarCascade;
struct LbpCascade;
friend class CascadeClassifier_GPU_LBP;
};
//////////////////////////// Labeling ////////////////////////////
//!performs labeling via graph cuts of a 2D regular 4-connected graph.
CV_EXPORTS void graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& bottom, GpuMat& labels,
GpuMat& buf, Stream& stream = Stream::Null());
//!performs labeling via graph cuts of a 2D regular 8-connected graph.
CV_EXPORTS void graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& topLeft, GpuMat& topRight,
GpuMat& bottom, GpuMat& bottomLeft, GpuMat& bottomRight,
GpuMat& labels,
GpuMat& buf, Stream& stream = Stream::Null());
//! computes the mask for generalized flood fill components labeling.
CV_EXPORTS void connectivityMask(const GpuMat& image, GpuMat& mask, const cv::Scalar& lo, const cv::Scalar& hi, Stream& stream = Stream::Null());
//! performs connected components labeling.
CV_EXPORTS void labelComponents(const GpuMat& mask, GpuMat& components, int flags = 0, Stream& stream = Stream::Null());
//////////////////////////// Calib3d ////////////////////////////
CV_EXPORTS void transformPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec,
GpuMat& dst, Stream& stream = Stream::Null());
CV_EXPORTS void projectPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec,
const Mat& camera_mat, const Mat& dist_coef, GpuMat& dst,
Stream& stream = Stream::Null());
CV_EXPORTS void solvePnPRansac(const Mat& object, const Mat& image, const Mat& camera_mat,
const Mat& dist_coef, Mat& rvec, Mat& tvec, bool use_extrinsic_guess=false,
int num_iters=100, float max_dist=8.0, int min_inlier_count=100,
std::vector<int>* inliers=NULL);
//////////////////////////// VStab ////////////////////////////
//! removes points (CV_32FC2, single row matrix) with zero mask value
CV_EXPORTS void compactPoints(GpuMat &points0, GpuMat &points1, const GpuMat &mask);
CV_EXPORTS void calcWobbleSuppressionMaps(
int left, int idx, int right, Size size, const Mat &ml, const Mat &mr,
GpuMat &mapx, GpuMat &mapy);
}} // namespace cv { namespace cuda {
#endif /* __OPENCV_CUDA_HPP__ */

View File

@@ -0,0 +1,135 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "perf_precomp.hpp"
using namespace std;
using namespace testing;
using namespace perf;
DEF_PARAM_TEST_1(Count, int);
//////////////////////////////////////////////////////////////////////
// ProjectPoints
PERF_TEST_P(Count, Calib3D_ProjectPoints,
Values(5000, 10000, 20000))
{
const int count = GetParam();
cv::Mat src(1, count, CV_32FC3);
declare.in(src, WARMUP_RNG);
const cv::Mat rvec = cv::Mat::ones(1, 3, CV_32FC1);
const cv::Mat tvec = cv::Mat::ones(1, 3, CV_32FC1);
const cv::Mat camera_mat = cv::Mat::ones(3, 3, CV_32FC1);
if (PERF_RUN_GPU())
{
const cv::cuda::GpuMat d_src(src);
cv::cuda::GpuMat dst;
TEST_CYCLE() cv::cuda::projectPoints(d_src, rvec, tvec, camera_mat, cv::Mat(), dst);
GPU_SANITY_CHECK(dst);
}
else
{
cv::Mat dst;
TEST_CYCLE() cv::projectPoints(src, rvec, tvec, camera_mat, cv::noArray(), dst);
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// SolvePnPRansac
PERF_TEST_P(Count, Calib3D_SolvePnPRansac,
Values(5000, 10000, 20000))
{
declare.time(10.0);
const int count = GetParam();
cv::Mat object(1, count, CV_32FC3);
declare.in(object, WARMUP_RNG);
cv::Mat camera_mat(3, 3, CV_32FC1);
cv::randu(camera_mat, 0.5, 1);
camera_mat.at<float>(0, 1) = 0.f;
camera_mat.at<float>(1, 0) = 0.f;
camera_mat.at<float>(2, 0) = 0.f;
camera_mat.at<float>(2, 1) = 0.f;
const cv::Mat dist_coef(1, 8, CV_32F, cv::Scalar::all(0));
cv::Mat rvec_gold(1, 3, CV_32FC1);
cv::randu(rvec_gold, 0, 1);
cv::Mat tvec_gold(1, 3, CV_32FC1);
cv::randu(tvec_gold, 0, 1);
std::vector<cv::Point2f> image_vec;
cv::projectPoints(object, rvec_gold, tvec_gold, camera_mat, dist_coef, image_vec);
const cv::Mat image(1, count, CV_32FC2, &image_vec[0]);
cv::Mat rvec;
cv::Mat tvec;
if (PERF_RUN_GPU())
{
TEST_CYCLE() cv::cuda::solvePnPRansac(object, image, camera_mat, dist_coef, rvec, tvec);
GPU_SANITY_CHECK(rvec, 1e-3);
GPU_SANITY_CHECK(tvec, 1e-3);
}
else
{
TEST_CYCLE() cv::solvePnPRansac(object, image, camera_mat, dist_coef, rvec, tvec);
CPU_SANITY_CHECK(rvec, 1e-6);
CPU_SANITY_CHECK(tvec, 1e-6);
}
}

View File

@@ -0,0 +1,195 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "perf_precomp.hpp"
using namespace std;
using namespace testing;
using namespace perf;
DEF_PARAM_TEST_1(Image, string);
struct GreedyLabeling
{
struct dot
{
int x;
int y;
static dot make(int i, int j)
{
dot d; d.x = i; d.y = j;
return d;
}
};
struct InInterval
{
InInterval(const int& _lo, const int& _hi) : lo(-_lo), hi(_hi) {}
const int lo, hi;
bool operator() (const unsigned char a, const unsigned char b) const
{
int d = a - b;
return lo <= d && d <= hi;
}
private:
InInterval& operator=(const InInterval&);
};
GreedyLabeling(cv::Mat img)
: image(img), _labels(image.size(), CV_32SC1, cv::Scalar::all(-1)) {stack = new dot[image.cols * image.rows];}
~GreedyLabeling(){delete[] stack;}
void operator() (cv::Mat labels) const
{
labels.setTo(cv::Scalar::all(-1));
InInterval inInt(0, 2);
int cc = -1;
int* dist_labels = (int*)labels.data;
int pitch = static_cast<int>(labels.step1());
unsigned char* source = (unsigned char*)image.data;
int width = image.cols;
int height = image.rows;
for (int j = 0; j < image.rows; ++j)
for (int i = 0; i < image.cols; ++i)
{
if (dist_labels[j * pitch + i] != -1) continue;
dot* top = stack;
dot p = dot::make(i, j);
cc++;
dist_labels[j * pitch + i] = cc;
while (top >= stack)
{
int* dl = &dist_labels[p.y * pitch + p.x];
unsigned char* sp = &source[p.y * image.step1() + p.x];
dl[0] = cc;
//right
if( p.x < (width - 1) && dl[ +1] == -1 && inInt(sp[0], sp[+1]))
*top++ = dot::make(p.x + 1, p.y);
//left
if( p.x > 0 && dl[-1] == -1 && inInt(sp[0], sp[-1]))
*top++ = dot::make(p.x - 1, p.y);
//bottom
if( p.y < (height - 1) && dl[+pitch] == -1 && inInt(sp[0], sp[+image.step1()]))
*top++ = dot::make(p.x, p.y + 1);
//top
if( p.y > 0 && dl[-pitch] == -1 && inInt(sp[0], sp[-static_cast<int>(image.step1())]))
*top++ = dot::make(p.x, p.y - 1);
p = *--top;
}
}
}
cv::Mat image;
cv::Mat _labels;
dot* stack;
};
PERF_TEST_P(Image, DISABLED_Labeling_ConnectivityMask,
Values<string>("gpu/labeling/aloe-disp.png"))
{
declare.time(1.0);
const cv::Mat image = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(image.empty());
if (PERF_RUN_GPU())
{
cv::cuda::GpuMat d_image(image);
cv::cuda::GpuMat mask;
TEST_CYCLE() cv::cuda::connectivityMask(d_image, mask, cv::Scalar::all(0), cv::Scalar::all(2));
GPU_SANITY_CHECK(mask);
}
else
{
FAIL_NO_CPU();
}
}
PERF_TEST_P(Image, DISABLED_Labeling_ConnectedComponents,
Values<string>("gpu/labeling/aloe-disp.png"))
{
declare.time(1.0);
const cv::Mat image = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(image.empty());
if (PERF_RUN_GPU())
{
cv::cuda::GpuMat d_mask;
cv::cuda::connectivityMask(cv::cuda::GpuMat(image), d_mask, cv::Scalar::all(0), cv::Scalar::all(2));
cv::cuda::GpuMat components;
TEST_CYCLE() cv::cuda::labelComponents(d_mask, components);
GPU_SANITY_CHECK(components);
}
else
{
GreedyLabeling host(image);
TEST_CYCLE() host(host._labels);
cv::Mat components = host._labels;
CPU_SANITY_CHECK(components);
}
}

View File

@@ -0,0 +1,47 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "perf_precomp.hpp"
using namespace perf;
CV_PERF_TEST_CUDA_MAIN(gpu)

View File

@@ -0,0 +1,197 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "perf_precomp.hpp"
using namespace std;
using namespace testing;
using namespace perf;
//////////////////////////////////////////////////////////////////////
// SetTo
PERF_TEST_P(Sz_Depth_Cn, MatOp_SetTo,
Combine(GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16U, CV_32F, CV_64F),
GPU_CHANNELS_1_3_4))
{
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);
const int channels = GET_PARAM(2);
const int type = CV_MAKE_TYPE(depth, channels);
const cv::Scalar val(1, 2, 3, 4);
if (PERF_RUN_GPU())
{
cv::cuda::GpuMat dst(size, type);
TEST_CYCLE() dst.setTo(val);
GPU_SANITY_CHECK(dst);
}
else
{
cv::Mat dst(size, type);
TEST_CYCLE() dst.setTo(val);
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// SetToMasked
PERF_TEST_P(Sz_Depth_Cn, MatOp_SetToMasked,
Combine(GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16U, CV_32F, CV_64F),
GPU_CHANNELS_1_3_4))
{
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);
const int channels = GET_PARAM(2);
const int type = CV_MAKE_TYPE(depth, channels);
cv::Mat src(size, type);
cv::Mat mask(size, CV_8UC1);
declare.in(src, mask, WARMUP_RNG);
const cv::Scalar val(1, 2, 3, 4);
if (PERF_RUN_GPU())
{
cv::cuda::GpuMat dst(src);
const cv::cuda::GpuMat d_mask(mask);
TEST_CYCLE() dst.setTo(val, d_mask);
GPU_SANITY_CHECK(dst, 1e-10);
}
else
{
cv::Mat dst = src;
TEST_CYCLE() dst.setTo(val, mask);
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// CopyToMasked
PERF_TEST_P(Sz_Depth_Cn, MatOp_CopyToMasked,
Combine(GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16U, CV_32F, CV_64F),
GPU_CHANNELS_1_3_4))
{
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);
const int channels = GET_PARAM(2);
const int type = CV_MAKE_TYPE(depth, channels);
cv::Mat src(size, type);
cv::Mat mask(size, CV_8UC1);
declare.in(src, mask, WARMUP_RNG);
if (PERF_RUN_GPU())
{
const cv::cuda::GpuMat d_src(src);
const cv::cuda::GpuMat d_mask(mask);
cv::cuda::GpuMat dst(d_src.size(), d_src.type(), cv::Scalar::all(0));
TEST_CYCLE() d_src.copyTo(dst, d_mask);
GPU_SANITY_CHECK(dst, 1e-10);
}
else
{
cv::Mat dst(src.size(), src.type(), cv::Scalar::all(0));
TEST_CYCLE() src.copyTo(dst, mask);
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// ConvertTo
DEF_PARAM_TEST(Sz_2Depth, cv::Size, MatDepth, MatDepth);
PERF_TEST_P(Sz_2Depth, MatOp_ConvertTo,
Combine(GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16U, CV_32F, CV_64F),
Values(CV_8U, CV_16U, CV_32F, CV_64F)))
{
const cv::Size size = GET_PARAM(0);
const int depth1 = GET_PARAM(1);
const int depth2 = GET_PARAM(2);
cv::Mat src(size, depth1);
declare.in(src, WARMUP_RNG);
const double a = 0.5;
const double b = 1.0;
if (PERF_RUN_GPU())
{
const cv::cuda::GpuMat d_src(src);
cv::cuda::GpuMat dst;
TEST_CYCLE() d_src.convertTo(dst, depth2, a, b);
GPU_SANITY_CHECK(dst, 1e-10);
}
else
{
cv::Mat dst;
TEST_CYCLE() src.convertTo(dst, depth2, a, b);
CPU_SANITY_CHECK(dst);
}
}

View File

@@ -0,0 +1,173 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "perf_precomp.hpp"
using namespace std;
using namespace testing;
using namespace perf;
///////////////////////////////////////////////////////////////
// HOG
DEF_PARAM_TEST_1(Image, string);
PERF_TEST_P(Image, ObjDetect_HOG,
Values<string>("gpu/hog/road.png",
"gpu/caltech/image_00000009_0.png",
"gpu/caltech/image_00000032_0.png",
"gpu/caltech/image_00000165_0.png",
"gpu/caltech/image_00000261_0.png",
"gpu/caltech/image_00000469_0.png",
"gpu/caltech/image_00000527_0.png",
"gpu/caltech/image_00000574_0.png"))
{
declare.time(300.0);
const cv::Mat img = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
if (PERF_RUN_GPU())
{
const cv::cuda::GpuMat d_img(img);
std::vector<cv::Rect> gpu_found_locations;
cv::cuda::HOGDescriptor d_hog;
d_hog.setSVMDetector(cv::cuda::HOGDescriptor::getDefaultPeopleDetector());
TEST_CYCLE() d_hog.detectMultiScale(d_img, gpu_found_locations);
SANITY_CHECK(gpu_found_locations);
}
else
{
std::vector<cv::Rect> cpu_found_locations;
cv::HOGDescriptor hog;
hog.setSVMDetector(cv::cuda::HOGDescriptor::getDefaultPeopleDetector());
TEST_CYCLE() hog.detectMultiScale(img, cpu_found_locations);
SANITY_CHECK(cpu_found_locations);
}
}
///////////////////////////////////////////////////////////////
// HaarClassifier
typedef pair<string, string> pair_string;
DEF_PARAM_TEST_1(ImageAndCascade, pair_string);
PERF_TEST_P(ImageAndCascade, ObjDetect_HaarClassifier,
Values<pair_string>(make_pair("gpu/haarcascade/group_1_640x480_VGA.pgm", "gpu/perf/haarcascade_frontalface_alt.xml")))
{
const cv::Mat img = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
if (PERF_RUN_GPU())
{
cv::cuda::CascadeClassifier_GPU d_cascade;
ASSERT_TRUE(d_cascade.load(perf::TestBase::getDataPath(GetParam().second)));
const cv::cuda::GpuMat d_img(img);
cv::cuda::GpuMat objects_buffer;
int detections_num = 0;
TEST_CYCLE() detections_num = d_cascade.detectMultiScale(d_img, objects_buffer);
std::vector<cv::Rect> gpu_rects(detections_num);
cv::Mat gpu_rects_mat(1, detections_num, cv::DataType<cv::Rect>::type, &gpu_rects[0]);
objects_buffer.colRange(0, detections_num).download(gpu_rects_mat);
cv::groupRectangles(gpu_rects, 3, 0.2);
SANITY_CHECK(gpu_rects);
}
else
{
cv::CascadeClassifier cascade;
ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath("gpu/perf/haarcascade_frontalface_alt.xml")));
std::vector<cv::Rect> cpu_rects;
TEST_CYCLE() cascade.detectMultiScale(img, cpu_rects);
SANITY_CHECK(cpu_rects);
}
}
///////////////////////////////////////////////////////////////
// LBP cascade
PERF_TEST_P(ImageAndCascade, ObjDetect_LBPClassifier,
Values<pair_string>(make_pair("gpu/haarcascade/group_1_640x480_VGA.pgm", "gpu/lbpcascade/lbpcascade_frontalface.xml")))
{
const cv::Mat img = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
if (PERF_RUN_GPU())
{
cv::cuda::CascadeClassifier_GPU d_cascade;
ASSERT_TRUE(d_cascade.load(perf::TestBase::getDataPath(GetParam().second)));
const cv::cuda::GpuMat d_img(img);
cv::cuda::GpuMat objects_buffer;
int detections_num = 0;
TEST_CYCLE() detections_num = d_cascade.detectMultiScale(d_img, objects_buffer);
std::vector<cv::Rect> gpu_rects(detections_num);
cv::Mat gpu_rects_mat(1, detections_num, cv::DataType<cv::Rect>::type, &gpu_rects[0]);
objects_buffer.colRange(0, detections_num).download(gpu_rects_mat);
cv::groupRectangles(gpu_rects, 3, 0.2);
SANITY_CHECK(gpu_rects);
}
else
{
cv::CascadeClassifier cascade;
ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath("gpu/lbpcascade/lbpcascade_frontalface.xml")));
std::vector<cv::Rect> cpu_rects;
TEST_CYCLE() cascade.detectMultiScale(img, cpu_rects);
SANITY_CHECK(cpu_rects);
}
}

View File

@@ -0,0 +1,65 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifdef __GNUC__
# pragma GCC diagnostic ignored "-Wmissing-declarations"
# if defined __clang__ || defined __APPLE__
# pragma GCC diagnostic ignored "-Wmissing-prototypes"
# pragma GCC diagnostic ignored "-Wextra"
# endif
#endif
#ifndef __OPENCV_PERF_PRECOMP_HPP__
#define __OPENCV_PERF_PRECOMP_HPP__
#include "opencv2/ts.hpp"
#include "opencv2/ts/gpu_perf.hpp"
#include "opencv2/cuda.hpp"
#include "opencv2/calib3d.hpp"
#include "opencv2/objdetect.hpp"
#ifdef GTEST_CREATE_SHARED_LIBRARY
#error no modules except ts should have GTEST_CREATE_SHARED_LIBRARY defined
#endif
#endif

View File

@@ -0,0 +1,292 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::cuda;
#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
void cv::cuda::transformPoints(const GpuMat&, const Mat&, const Mat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::cuda::projectPoints(const GpuMat&, const Mat&, const Mat&, const Mat&, const Mat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::cuda::solvePnPRansac(const Mat&, const Mat&, const Mat&, const Mat&, Mat&, Mat&, bool, int, float, int, std::vector<int>*) { throw_no_cuda(); }
#else
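// Forward declarations of the device-side entry points implemented in the
// accompanying CUDA source file of this module.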
namespace cv { namespace cuda { namespace device
{
namespace transform_points
{
void call(const PtrStepSz<float3> src, const float* rot, const float* transl, PtrStepSz<float3> dst, cudaStream_t stream);
}
namespace project_points
{
void call(const PtrStepSz<float3> src, const float* rot, const float* transl, const float* proj, PtrStepSz<float2> dst, cudaStream_t stream);
}
namespace solve_pnp_ransac
{
int maxNumIters();
void computeHypothesisScores(
const int num_hypotheses, const int num_points, const float* rot_matrices,
const float3* transl_vectors, const float3* object, const float2* image,
const float dist_threshold, int* hypothesis_scores);
}
}}}
using namespace ::cv::cuda::device;
namespace
{
void transformPointsCaller(const GpuMat& src, const Mat& rvec, const Mat& tvec, GpuMat& dst, cudaStream_t stream)
{
CV_Assert(src.rows == 1 && src.cols > 0 && src.type() == CV_32FC3);
CV_Assert(rvec.size() == Size(3, 1) && rvec.type() == CV_32F);
CV_Assert(tvec.size() == Size(3, 1) && tvec.type() == CV_32F);
// Convert rotation vector into matrix
Mat rot;
Rodrigues(rvec, rot);
dst.create(src.size(), src.type());
transform_points::call(src, rot.ptr<float>(), tvec.ptr<float>(), dst, stream);
}
}
void cv::cuda::transformPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec, GpuMat& dst, Stream& stream)
{
transformPointsCaller(src, rvec, tvec, dst, StreamAccessor::getStream(stream));
}
namespace
{
void projectPointsCaller(const GpuMat& src, const Mat& rvec, const Mat& tvec, const Mat& camera_mat, const Mat& dist_coef, GpuMat& dst, cudaStream_t stream)
{
CV_Assert(src.rows == 1 && src.cols > 0 && src.type() == CV_32FC3);
CV_Assert(rvec.size() == Size(3, 1) && rvec.type() == CV_32F);
CV_Assert(tvec.size() == Size(3, 1) && tvec.type() == CV_32F);
CV_Assert(camera_mat.size() == Size(3, 3) && camera_mat.type() == CV_32F);
CV_Assert(dist_coef.empty()); // Undistortion isn't supported
// Convert rotation vector into matrix
Mat rot;
Rodrigues(rvec, rot);
dst.create(src.size(), CV_32FC2);
project_points::call(src, rot.ptr<float>(), tvec.ptr<float>(), camera_mat.ptr<float>(), dst,stream);
}
}
void cv::cuda::projectPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec, const Mat& camera_mat, const Mat& dist_coef, GpuMat& dst, Stream& stream)
{
projectPointsCaller(src, rvec, tvec, camera_mat, dist_coef, dst, StreamAccessor::getStream(stream));
}
namespace
{
// Selects subset_size distinct random points from the [0, num_points - 1] range
void selectRandom(int subset_size, int num_points, std::vector<int>& subset)
{
subset.resize(subset_size);
for (int i = 0; i < subset_size; ++i)
{
bool was;
do
{
subset[i] = rand() % num_points;
was = false;
for (int j = 0; j < i; ++j)
if (subset[j] == subset[i])
{
was = true;
break;
}
} while (was);
}
}
// Computes a rotation/translation pair for a small random subset of the input data
class TransformHypothesesGenerator : public ParallelLoopBody
{
public:
TransformHypothesesGenerator(const Mat& object_, const Mat& image_, const Mat& dist_coef_,
const Mat& camera_mat_, int num_points_, int subset_size_,
Mat rot_matrices_, Mat transl_vectors_)
: object(&object_), image(&image_), dist_coef(&dist_coef_), camera_mat(&camera_mat_),
num_points(num_points_), subset_size(subset_size_), rot_matrices(rot_matrices_),
transl_vectors(transl_vectors_) {}
void operator()(const Range& range) const
{
// Input data for generation of the current hypothesis
std::vector<int> subset_indices(subset_size);
Mat_<Point3f> object_subset(1, subset_size);
Mat_<Point2f> image_subset(1, subset_size);
// Current hypothesis data
Mat rot_vec(1, 3, CV_64F);
Mat rot_mat(3, 3, CV_64F);
Mat transl_vec(1, 3, CV_64F);
for (int iter = range.start; iter < range.end; ++iter)
{
selectRandom(subset_size, num_points, subset_indices);
for (int i = 0; i < subset_size; ++i)
{
object_subset(0, i) = object->at<Point3f>(subset_indices[i]);
image_subset(0, i) = image->at<Point2f>(subset_indices[i]);
}
solvePnP(object_subset, image_subset, *camera_mat, *dist_coef, rot_vec, transl_vec);
// Remember translation vector
Mat transl_vec_ = transl_vectors.colRange(iter * 3, (iter + 1) * 3);
transl_vec = transl_vec.reshape(0, 1);
transl_vec.convertTo(transl_vec_, CV_32F);
// Remember rotation matrix
Rodrigues(rot_vec, rot_mat);
Mat rot_mat_ = rot_matrices.colRange(iter * 9, (iter + 1) * 9).reshape(0, 3);
rot_mat.convertTo(rot_mat_, CV_32F);
}
}
const Mat* object;
const Mat* image;
const Mat* dist_coef;
const Mat* camera_mat;
int num_points;
int subset_size;
// Hypotheses storage (global)
Mat rot_matrices;
Mat transl_vectors;
};
}
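// GPU-accelerated RANSAC for PnP: image points are normalized on the CPU, pose
// hypotheses are generated in parallel from random 4-point subsets via solvePnP,
// inlier counts for all hypotheses are computed on the GPU, and the best-scoring
// hypothesis is converted back to rvec/tvec together with its inlier index list.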
void cv::cuda::solvePnPRansac(const Mat& object, const Mat& image, const Mat& camera_mat,
const Mat& dist_coef, Mat& rvec, Mat& tvec, bool use_extrinsic_guess,
int num_iters, float max_dist, int min_inlier_count,
std::vector<int>* inliers)
{
(void)min_inlier_count;
CV_Assert(object.rows == 1 && object.cols > 0 && object.type() == CV_32FC3);
CV_Assert(image.rows == 1 && image.cols > 0 && image.type() == CV_32FC2);
CV_Assert(object.cols == image.cols);
CV_Assert(camera_mat.size() == Size(3, 3) && camera_mat.type() == CV_32F);
CV_Assert(!use_extrinsic_guess); // We don't support initial guess for now
CV_Assert(num_iters <= solve_pnp_ransac::maxNumIters());
const int subset_size = 4;
const int num_points = object.cols;
CV_Assert(num_points >= subset_size);
// Unapply distortion and intrinsic camera transformations
Mat eye_camera_mat = Mat::eye(3, 3, CV_32F);
Mat empty_dist_coef;
Mat image_normalized;
undistortPoints(image, image_normalized, camera_mat, dist_coef, Mat(), eye_camera_mat);
// Hypotheses storage (global)
Mat rot_matrices(1, num_iters * 9, CV_32F);
Mat transl_vectors(1, num_iters * 3, CV_32F);
// Generate set of hypotheses using small subsets of the input data
TransformHypothesesGenerator body(object, image_normalized, empty_dist_coef, eye_camera_mat,
num_points, subset_size, rot_matrices, transl_vectors);
parallel_for_(Range(0, num_iters), body);
// Compute scores (i.e. number of inliers) for each hypothesis
GpuMat d_object(object);
GpuMat d_image_normalized(image_normalized);
GpuMat d_hypothesis_scores(1, num_iters, CV_32S);
solve_pnp_ransac::computeHypothesisScores(
num_iters, num_points, rot_matrices.ptr<float>(), transl_vectors.ptr<float3>(),
d_object.ptr<float3>(), d_image_normalized.ptr<float2>(), max_dist * max_dist,
d_hypothesis_scores.ptr<int>());
// Find the best hypothesis index
Point best_idx;
double best_score;
cuda::minMaxLoc(d_hypothesis_scores, NULL, &best_score, NULL, &best_idx);
int num_inliers = static_cast<int>(best_score);
// Extract the best hypothesis data
Mat rot_mat = rot_matrices.colRange(best_idx.x * 9, (best_idx.x + 1) * 9).reshape(0, 3);
Rodrigues(rot_mat, rvec);
rvec = rvec.reshape(0, 1);
tvec = transl_vectors.colRange(best_idx.x * 3, (best_idx.x + 1) * 3).clone();
tvec = tvec.reshape(0, 1);
// Build vector of inlier indices
if (inliers != NULL)
{
inliers->clear();
inliers->reserve(num_inliers);
Point3f p, p_transf;
Point2f p_proj;
const float* rot = rot_mat.ptr<float>();
const float* transl = tvec.ptr<float>();
for (int i = 0; i < num_points; ++i)
{
p = object.at<Point3f>(0, i);
p_transf.x = rot[0] * p.x + rot[1] * p.y + rot[2] * p.z + transl[0];
p_transf.y = rot[3] * p.x + rot[4] * p.y + rot[5] * p.z + transl[1];
p_transf.z = rot[6] * p.x + rot[7] * p.y + rot[8] * p.z + transl[2];
p_proj.x = p_transf.x / p_transf.z;
p_proj.y = p_transf.y / p_transf.z;
if (norm(p_proj - image_normalized.at<Point2f>(0, i)) < max_dist)
inliers->push_back(i);
}
}
}
#endif

View File

@@ -0,0 +1,758 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
#include "opencv2/objdetect/objdetect_c.h"
using namespace cv;
using namespace cv::cuda;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
cv::cuda::CascadeClassifier_GPU::CascadeClassifier_GPU() { throw_no_cuda(); }
cv::cuda::CascadeClassifier_GPU::CascadeClassifier_GPU(const String&) { throw_no_cuda(); }
cv::cuda::CascadeClassifier_GPU::~CascadeClassifier_GPU() { throw_no_cuda(); }
bool cv::cuda::CascadeClassifier_GPU::empty() const { throw_no_cuda(); return true; }
bool cv::cuda::CascadeClassifier_GPU::load(const String&) { throw_no_cuda(); return true; }
Size cv::cuda::CascadeClassifier_GPU::getClassifierSize() const { throw_no_cuda(); return Size();}
void cv::cuda::CascadeClassifier_GPU::release() { throw_no_cuda(); }
int cv::cuda::CascadeClassifier_GPU::detectMultiScale( const GpuMat&, GpuMat&, double, int, Size) {throw_no_cuda(); return -1;}
int cv::cuda::CascadeClassifier_GPU::detectMultiScale( const GpuMat&, GpuMat&, Size, Size, double, int) {throw_no_cuda(); return -1;}
#else
struct cv::cuda::CascadeClassifier_GPU::CascadeClassifierImpl
{
public:
CascadeClassifierImpl(){}
virtual ~CascadeClassifierImpl(){}
virtual unsigned int process(const GpuMat& src, GpuMat& objects, float scaleStep, int minNeighbors,
bool findLargestObject, bool visualizeInPlace, cv::Size ncvMinSize, cv::Size maxObjectSize) = 0;
virtual cv::Size getClassifierCvSize() const = 0;
virtual bool read(const String& classifierAsXml) = 0;
};
#ifndef HAVE_OPENCV_CUDALEGACY
struct cv::cuda::CascadeClassifier_GPU::HaarCascade : cv::cuda::CascadeClassifier_GPU::CascadeClassifierImpl
{
public:
HaarCascade()
{
throw_no_cuda();
}
unsigned int process(const GpuMat&, GpuMat&, float, int, bool, bool, cv::Size, cv::Size)
{
throw_no_cuda();
return 0;
}
cv::Size getClassifierCvSize() const
{
throw_no_cuda();
return cv::Size();
}
bool read(const String&)
{
throw_no_cuda();
return false;
}
};
#else
struct cv::cuda::CascadeClassifier_GPU::HaarCascade : cv::cuda::CascadeClassifier_GPU::CascadeClassifierImpl
{
public:
HaarCascade() : lastAllocatedFrameSize(-1, -1)
{
ncvSetDebugOutputHandler(NCVDebugOutputHandler);
}
bool read(const String& filename)
{
ncvSafeCall( load(filename) );
return true;
}
NCVStatus process(const GpuMat& src, GpuMat& objects, float scaleStep, int minNeighbors,
bool findLargestObject, bool visualizeInPlace, cv::Size ncvMinSize,
/*out*/unsigned int& numDetections)
{
calculateMemReqsAndAllocate(src.size());
NCVMemPtr src_beg;
src_beg.ptr = (void*)src.ptr<Ncv8u>();
src_beg.memtype = NCVMemoryTypeDevice;
NCVMemSegment src_seg;
src_seg.begin = src_beg;
src_seg.size = src.step * src.rows;
NCVMatrixReuse<Ncv8u> d_src(src_seg, static_cast<int>(devProp.textureAlignment), src.cols, src.rows, static_cast<int>(src.step), true);
ncvAssertReturn(d_src.isMemReused(), NCV_ALLOCATOR_BAD_REUSE);
CV_Assert(objects.rows == 1);
NCVMemPtr objects_beg;
objects_beg.ptr = (void*)objects.ptr<NcvRect32u>();
objects_beg.memtype = NCVMemoryTypeDevice;
NCVMemSegment objects_seg;
objects_seg.begin = objects_beg;
objects_seg.size = objects.step * objects.rows;
NCVVectorReuse<NcvRect32u> d_rects(objects_seg, objects.cols);
ncvAssertReturn(d_rects.isMemReused(), NCV_ALLOCATOR_BAD_REUSE);
NcvSize32u roi;
roi.width = d_src.width();
roi.height = d_src.height();
NcvSize32u winMinSize(ncvMinSize.width, ncvMinSize.height);
Ncv32u flags = 0;
flags |= findLargestObject? NCVPipeObjDet_FindLargestObject : 0;
flags |= visualizeInPlace ? NCVPipeObjDet_VisualizeInPlace : 0;
ncvStat = ncvDetectObjectsMultiScale_device(
d_src, roi, d_rects, numDetections, haar, *h_haarStages,
*d_haarStages, *d_haarNodes, *d_haarFeatures,
winMinSize,
minNeighbors,
scaleStep, 1,
flags,
*gpuAllocator, *cpuAllocator, devProp, 0);
ncvAssertReturnNcvStat(ncvStat);
ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);
return NCV_SUCCESS;
}
unsigned int process(const GpuMat& image, GpuMat& objectsBuf, float scaleFactor, int minNeighbors,
bool findLargestObject, bool visualizeInPlace, cv::Size minSize, cv::Size /*maxObjectSize*/)
{
CV_Assert( scaleFactor > 1 && image.depth() == CV_8U);
const int defaultObjSearchNum = 100;
if (objectsBuf.empty())
{
objectsBuf.create(1, defaultObjSearchNum, DataType<Rect>::type);
}
cv::Size ncvMinSize = this->getClassifierCvSize();
if (ncvMinSize.width < minSize.width && ncvMinSize.height < minSize.height)
{
ncvMinSize.width = minSize.width;
ncvMinSize.height = minSize.height;
}
unsigned int numDetections;
ncvSafeCall(this->process(image, objectsBuf, (float)scaleFactor, minNeighbors, findLargestObject, visualizeInPlace, ncvMinSize, numDetections));
return numDetections;
}
cv::Size getClassifierCvSize() const { return cv::Size(haar.ClassifierSize.width, haar.ClassifierSize.height); }
private:
static void NCVDebugOutputHandler(const String &msg) { CV_Error(cv::Error::GpuApiCallError, msg.c_str()); }
NCVStatus load(const String& classifierFile)
{
int devId = cv::cuda::getDevice();
ncvAssertCUDAReturn(cudaGetDeviceProperties(&devProp, devId), NCV_CUDA_ERROR);
// Load the classifier from file (assuming its size is about 1 mb) using a simple allocator
gpuCascadeAllocator = new NCVMemNativeAllocator(NCVMemoryTypeDevice, static_cast<int>(devProp.textureAlignment));
cpuCascadeAllocator = new NCVMemNativeAllocator(NCVMemoryTypeHostPinned, static_cast<int>(devProp.textureAlignment));
ncvAssertPrintReturn(gpuCascadeAllocator->isInitialized(), "Error creating cascade GPU allocator", NCV_CUDA_ERROR);
ncvAssertPrintReturn(cpuCascadeAllocator->isInitialized(), "Error creating cascade CPU allocator", NCV_CUDA_ERROR);
Ncv32u haarNumStages, haarNumNodes, haarNumFeatures;
ncvStat = ncvHaarGetClassifierSize(classifierFile, haarNumStages, haarNumNodes, haarNumFeatures);
ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error reading classifier size (check the file)", NCV_FILE_ERROR);
h_haarStages = new NCVVectorAlloc<HaarStage64>(*cpuCascadeAllocator, haarNumStages);
h_haarNodes = new NCVVectorAlloc<HaarClassifierNode128>(*cpuCascadeAllocator, haarNumNodes);
h_haarFeatures = new NCVVectorAlloc<HaarFeature64>(*cpuCascadeAllocator, haarNumFeatures);
ncvAssertPrintReturn(h_haarStages->isMemAllocated(), "Error in cascade CPU allocator", NCV_CUDA_ERROR);
ncvAssertPrintReturn(h_haarNodes->isMemAllocated(), "Error in cascade CPU allocator", NCV_CUDA_ERROR);
ncvAssertPrintReturn(h_haarFeatures->isMemAllocated(), "Error in cascade CPU allocator", NCV_CUDA_ERROR);
ncvStat = ncvHaarLoadFromFile_host(classifierFile, haar, *h_haarStages, *h_haarNodes, *h_haarFeatures);
ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error loading classifier", NCV_FILE_ERROR);
d_haarStages = new NCVVectorAlloc<HaarStage64>(*gpuCascadeAllocator, haarNumStages);
d_haarNodes = new NCVVectorAlloc<HaarClassifierNode128>(*gpuCascadeAllocator, haarNumNodes);
d_haarFeatures = new NCVVectorAlloc<HaarFeature64>(*gpuCascadeAllocator, haarNumFeatures);
ncvAssertPrintReturn(d_haarStages->isMemAllocated(), "Error in cascade GPU allocator", NCV_CUDA_ERROR);
ncvAssertPrintReturn(d_haarNodes->isMemAllocated(), "Error in cascade GPU allocator", NCV_CUDA_ERROR);
ncvAssertPrintReturn(d_haarFeatures->isMemAllocated(), "Error in cascade GPU allocator", NCV_CUDA_ERROR);
ncvStat = h_haarStages->copySolid(*d_haarStages, 0);
ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error copying cascade to GPU", NCV_CUDA_ERROR);
ncvStat = h_haarNodes->copySolid(*d_haarNodes, 0);
ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error copying cascade to GPU", NCV_CUDA_ERROR);
ncvStat = h_haarFeatures->copySolid(*d_haarFeatures, 0);
ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error copying cascade to GPU", NCV_CUDA_ERROR);
return NCV_SUCCESS;
}
NCVStatus calculateMemReqsAndAllocate(const Size& frameSize)
{
if (lastAllocatedFrameSize == frameSize)
{
return NCV_SUCCESS;
}
// Calculate memory requirements and create real allocators
NCVMemStackAllocator gpuCounter(static_cast<int>(devProp.textureAlignment));
NCVMemStackAllocator cpuCounter(static_cast<int>(devProp.textureAlignment));
ncvAssertPrintReturn(gpuCounter.isInitialized(), "Error creating GPU memory counter", NCV_CUDA_ERROR);
ncvAssertPrintReturn(cpuCounter.isInitialized(), "Error creating CPU memory counter", NCV_CUDA_ERROR);
NCVMatrixAlloc<Ncv8u> d_src(gpuCounter, frameSize.width, frameSize.height);
NCVMatrixAlloc<Ncv8u> h_src(cpuCounter, frameSize.width, frameSize.height);
ncvAssertReturn(d_src.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
ncvAssertReturn(h_src.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
NCVVectorAlloc<NcvRect32u> d_rects(gpuCounter, 100);
ncvAssertReturn(d_rects.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
NcvSize32u roi;
roi.width = d_src.width();
roi.height = d_src.height();
Ncv32u numDetections;
ncvStat = ncvDetectObjectsMultiScale_device(d_src, roi, d_rects, numDetections, haar, *h_haarStages,
*d_haarStages, *d_haarNodes, *d_haarFeatures, haar.ClassifierSize, 4, 1.2f, 1, 0, gpuCounter, cpuCounter, devProp, 0);
ncvAssertReturnNcvStat(ncvStat);
ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);
gpuAllocator = new NCVMemStackAllocator(NCVMemoryTypeDevice, gpuCounter.maxSize(), static_cast<int>(devProp.textureAlignment));
cpuAllocator = new NCVMemStackAllocator(NCVMemoryTypeHostPinned, cpuCounter.maxSize(), static_cast<int>(devProp.textureAlignment));
ncvAssertPrintReturn(gpuAllocator->isInitialized(), "Error creating GPU memory allocator", NCV_CUDA_ERROR);
ncvAssertPrintReturn(cpuAllocator->isInitialized(), "Error creating CPU memory allocator", NCV_CUDA_ERROR);
lastAllocatedFrameSize = frameSize;
return NCV_SUCCESS;
}
cudaDeviceProp devProp;
NCVStatus ncvStat;
Ptr<NCVMemNativeAllocator> gpuCascadeAllocator;
Ptr<NCVMemNativeAllocator> cpuCascadeAllocator;
Ptr<NCVVectorAlloc<HaarStage64> > h_haarStages;
Ptr<NCVVectorAlloc<HaarClassifierNode128> > h_haarNodes;
Ptr<NCVVectorAlloc<HaarFeature64> > h_haarFeatures;
HaarClassifierCascadeDescriptor haar;
Ptr<NCVVectorAlloc<HaarStage64> > d_haarStages;
Ptr<NCVVectorAlloc<HaarClassifierNode128> > d_haarNodes;
Ptr<NCVVectorAlloc<HaarFeature64> > d_haarFeatures;
Size lastAllocatedFrameSize;
Ptr<NCVMemStackAllocator> gpuAllocator;
Ptr<NCVMemStackAllocator> cpuAllocator;
virtual ~HaarCascade(){}
};
#endif
cv::Size operator -(const cv::Size& a, const cv::Size& b)
{
return cv::Size(a.width - b.width, a.height - b.height);
}
cv::Size operator +(const cv::Size& a, const int& i)
{
return cv::Size(a.width + i, a.height + i);
}
cv::Size operator *(const cv::Size& a, const float& f)
{
return cv::Size(cvRound(a.width * f), cvRound(a.height * f));
}
cv::Size operator /(const cv::Size& a, const float& f)
{
return cv::Size(cvRound(a.width / f), cvRound(a.height / f));
}
bool operator <=(const cv::Size& a, const cv::Size& b)
{
return a.width <= b.width && a.height <= b.height;
}
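// One level of the detection scale pyramid: holds the scaled frame size, the
// scaled search window and the remaining work area for that scale.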
struct PyrLavel
{
PyrLavel(int _order, float _scale, cv::Size frame, cv::Size window, cv::Size minObjectSize)
{
do
{
order = _order;
scale = pow(_scale, order);
sFrame = frame / scale;
workArea = sFrame - window + 1;
sWindow = window * scale;
_order++;
} while (sWindow <= minObjectSize);
}
bool isFeasible(cv::Size maxObj)
{
return workArea.width > 0 && workArea.height > 0 && sWindow <= maxObj;
}
PyrLavel next(float factor, cv::Size frame, cv::Size window, cv::Size minObjectSize)
{
return PyrLavel(order + 1, factor, frame, window, minObjectSize);
}
int order;
float scale;
cv::Size sFrame;
cv::Size workArea;
cv::Size sWindow;
};
namespace cv { namespace cuda { namespace device
{
namespace lbp
{
void classifyPyramid(int frameW,
int frameH,
int windowW,
int windowH,
float initalScale,
float factor,
int total,
const PtrStepSzb& mstages,
const int nstages,
const PtrStepSzi& mnodes,
const PtrStepSzf& mleaves,
const PtrStepSzi& msubsets,
const PtrStepSzb& mfeatures,
const int subsetSize,
PtrStepSz<int4> objects,
unsigned int* classified,
PtrStepSzi integral);
void connectedConmonents(PtrStepSz<int4> candidates, int ncandidates, PtrStepSz<int4> objects,int groupThreshold, float grouping_eps, unsigned int* nclasses);
}
}}}
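// LBP cascade implementation: packs several pyramid levels side by side into one
// wide integral image, classifies all windows on the GPU, then groups overlapping
// candidates into the final detections.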
struct cv::cuda::CascadeClassifier_GPU::LbpCascade : cv::cuda::CascadeClassifier_GPU::CascadeClassifierImpl
{
public:
struct Stage
{
int first;
int ntrees;
float threshold;
};
LbpCascade(){}
virtual ~LbpCascade(){}
virtual unsigned int process(const GpuMat& image, GpuMat& objects, float scaleFactor, int groupThreshold, bool /*findLargestObject*/,
bool /*visualizeInPlace*/, cv::Size minObjectSize, cv::Size maxObjectSize)
{
CV_Assert(scaleFactor > 1 && image.depth() == CV_8U);
// const int defaultObjSearchNum = 100;
const float grouping_eps = 0.2f;
if( !objects.empty() && objects.depth() == CV_32S)
objects.reshape(4, 1);
else
objects.create(1 , image.cols >> 4, CV_32SC4);
// used for debug
// candidates.setTo(cv::Scalar::all(0));
// objects.setTo(cv::Scalar::all(0));
if (maxObjectSize == cv::Size())
maxObjectSize = image.size();
allocateBuffers(image.size());
unsigned int classified = 0;
GpuMat dclassified(1, 1, CV_32S);
cudaSafeCall( cudaMemcpy(dclassified.ptr(), &classified, sizeof(int), cudaMemcpyHostToDevice) );
PyrLavel level(0, scaleFactor, image.size(), NxM, minObjectSize);
while (level.isFeasible(maxObjectSize))
{
int acc = level.sFrame.width + 1;
float iniScale = level.scale;
cv::Size area = level.workArea;
int step = 1 + (level.scale <= 2.f);
int total = 0, prev = 0;
while (acc <= integralFactor * (image.cols + 1) && level.isFeasible(maxObjectSize))
{
// create suitable matrix headers
GpuMat src = resuzeBuffer(cv::Rect(0, 0, level.sFrame.width, level.sFrame.height));
GpuMat sint = integral(cv::Rect(prev, 0, level.sFrame.width + 1, level.sFrame.height + 1));
GpuMat buff = integralBuffer;
// generate integral for scale
cuda::resize(image, src, level.sFrame, 0, 0, cv::INTER_LINEAR);
cuda::integral(src, sint, buff);
// calculate job
int totalWidth = level.workArea.width / step;
total += totalWidth * (level.workArea.height / step);
// go to the next pyramid level
level = level.next(scaleFactor, image.size(), NxM, minObjectSize);
area = level.workArea;
step = (1 + (level.scale <= 2.f));
prev = acc;
acc += level.sFrame.width + 1;
}
device::lbp::classifyPyramid(image.cols, image.rows, NxM.width - 1, NxM.height - 1, iniScale, scaleFactor, total, stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat,
leaves_mat, subsets_mat, features_mat, subsetSize, candidates, dclassified.ptr<unsigned int>(), integral);
}
if (groupThreshold <= 0 || objects.empty())
return 0;
cudaSafeCall( cudaMemcpy(&classified, dclassified.ptr(), sizeof(int), cudaMemcpyDeviceToHost) );
device::lbp::connectedConmonents(candidates, classified, objects, groupThreshold, grouping_eps, dclassified.ptr<unsigned int>());
cudaSafeCall( cudaMemcpy(&classified, dclassified.ptr(), sizeof(int), cudaMemcpyDeviceToHost) );
cudaSafeCall( cudaDeviceSynchronize() );
return classified;
}
virtual cv::Size getClassifierCvSize() const { return NxM; }
bool read(const String& classifierAsXml)
{
FileStorage fs(classifierAsXml, FileStorage::READ);
return fs.isOpened() ? read(fs.getFirstTopLevelNode()) : false;
}
private:
void allocateBuffers(cv::Size frame)
{
if (frame == cv::Size())
return;
if (resuzeBuffer.empty() || frame.width > resuzeBuffer.cols || frame.height > resuzeBuffer.rows)
{
resuzeBuffer.create(frame, CV_8UC1);
integral.create(frame.height + 1, integralFactor * (frame.width + 1), CV_32SC1);
#ifdef HAVE_OPENCV_CUDALEGACY
NcvSize32u roiSize;
roiSize.width = frame.width;
roiSize.height = frame.height;
cudaDeviceProp prop;
cudaSafeCall( cudaGetDeviceProperties(&prop, cv::cuda::getDevice()) );
Ncv32u bufSize;
ncvSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize, prop) );
integralBuffer.create(1, bufSize, CV_8UC1);
#endif
candidates.create(1 , frame.width >> 1, CV_32SC4);
}
}
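// Parses an OpenCV LBP cascade (stumps only) from XML and uploads the stage,
// node, leaf, subset and feature tables to the GPU.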
bool read(const FileNode &root)
{
const char *GPU_CC_STAGE_TYPE = "stageType";
const char *GPU_CC_FEATURE_TYPE = "featureType";
const char *GPU_CC_BOOST = "BOOST";
const char *GPU_CC_LBP = "LBP";
const char *GPU_CC_MAX_CAT_COUNT = "maxCatCount";
const char *GPU_CC_HEIGHT = "height";
const char *GPU_CC_WIDTH = "width";
const char *GPU_CC_STAGE_PARAMS = "stageParams";
const char *GPU_CC_MAX_DEPTH = "maxDepth";
const char *GPU_CC_FEATURE_PARAMS = "featureParams";
const char *GPU_CC_STAGES = "stages";
const char *GPU_CC_STAGE_THRESHOLD = "stageThreshold";
const float GPU_THRESHOLD_EPS = 1e-5f;
const char *GPU_CC_WEAK_CLASSIFIERS = "weakClassifiers";
const char *GPU_CC_INTERNAL_NODES = "internalNodes";
const char *GPU_CC_LEAF_VALUES = "leafValues";
const char *GPU_CC_FEATURES = "features";
const char *GPU_CC_RECT = "rect";
String stageTypeStr = (String)root[GPU_CC_STAGE_TYPE];
CV_Assert(stageTypeStr == GPU_CC_BOOST);
String featureTypeStr = (String)root[GPU_CC_FEATURE_TYPE];
CV_Assert(featureTypeStr == GPU_CC_LBP);
NxM.width = (int)root[GPU_CC_WIDTH];
NxM.height = (int)root[GPU_CC_HEIGHT];
CV_Assert( NxM.height > 0 && NxM.width > 0 );
isStumps = ((int)(root[GPU_CC_STAGE_PARAMS][GPU_CC_MAX_DEPTH]) == 1) ? true : false;
CV_Assert(isStumps);
FileNode fn = root[GPU_CC_FEATURE_PARAMS];
if (fn.empty())
return false;
ncategories = fn[GPU_CC_MAX_CAT_COUNT];
subsetSize = (ncategories + 31) / 32;
nodeStep = 3 + ( ncategories > 0 ? subsetSize : 1 );
fn = root[GPU_CC_STAGES];
if (fn.empty())
return false;
std::vector<Stage> stages;
stages.reserve(fn.size());
std::vector<int> cl_trees;
std::vector<int> cl_nodes;
std::vector<float> cl_leaves;
std::vector<int> subsets;
FileNodeIterator it = fn.begin(), it_end = fn.end();
for (size_t si = 0; it != it_end; si++, ++it )
{
FileNode fns = *it;
Stage st;
st.threshold = (float)fns[GPU_CC_STAGE_THRESHOLD] - GPU_THRESHOLD_EPS;
fns = fns[GPU_CC_WEAK_CLASSIFIERS];
if (fns.empty())
return false;
st.ntrees = (int)fns.size();
st.first = (int)cl_trees.size();
stages.push_back(st);// (int, int, float)
cl_trees.reserve(stages[si].first + stages[si].ntrees);
// weak trees
FileNodeIterator it1 = fns.begin(), it1_end = fns.end();
for ( ; it1 != it1_end; ++it1 )
{
FileNode fnw = *it1;
FileNode internalNodes = fnw[GPU_CC_INTERNAL_NODES];
FileNode leafValues = fnw[GPU_CC_LEAF_VALUES];
if ( internalNodes.empty() || leafValues.empty() )
return false;
int nodeCount = (int)internalNodes.size()/nodeStep;
cl_trees.push_back(nodeCount);
cl_nodes.reserve((cl_nodes.size() + nodeCount) * 3);
cl_leaves.reserve(cl_leaves.size() + leafValues.size());
if( subsetSize > 0 )
subsets.reserve(subsets.size() + nodeCount * subsetSize);
// nodes
FileNodeIterator iIt = internalNodes.begin(), iEnd = internalNodes.end();
for( ; iIt != iEnd; )
{
cl_nodes.push_back((int)*(iIt++));
cl_nodes.push_back((int)*(iIt++));
cl_nodes.push_back((int)*(iIt++));
if( subsetSize > 0 )
for( int j = 0; j < subsetSize; j++, ++iIt )
subsets.push_back((int)*iIt);
}
// leaves
iIt = leafValues.begin(), iEnd = leafValues.end();
for( ; iIt != iEnd; ++iIt )
cl_leaves.push_back((float)*iIt);
}
}
fn = root[GPU_CC_FEATURES];
if( fn.empty() )
return false;
std::vector<uchar> features;
features.reserve(fn.size() * 4);
FileNodeIterator f_it = fn.begin(), f_end = fn.end();
for (; f_it != f_end; ++f_it)
{
FileNode rect = (*f_it)[GPU_CC_RECT];
FileNodeIterator r_it = rect.begin();
features.push_back(saturate_cast<uchar>((int)*(r_it++)));
features.push_back(saturate_cast<uchar>((int)*(r_it++)));
features.push_back(saturate_cast<uchar>((int)*(r_it++)));
features.push_back(saturate_cast<uchar>((int)*(r_it++)));
}
// copy data structures on gpu
stage_mat.upload(cv::Mat(1, (int) (stages.size() * sizeof(Stage)), CV_8UC1, (uchar*)&(stages[0]) ));
trees_mat.upload(cv::Mat(cl_trees).reshape(1,1));
nodes_mat.upload(cv::Mat(cl_nodes).reshape(1,1));
leaves_mat.upload(cv::Mat(cl_leaves).reshape(1,1));
subsets_mat.upload(cv::Mat(subsets).reshape(1,1));
features_mat.upload(cv::Mat(features).reshape(4,1));
return true;
}
enum stage { BOOST = 0 };
enum feature { LBP = 1, HAAR = 2 };
static const stage stageType = BOOST;
static const feature featureType = LBP;
cv::Size NxM;
bool isStumps;
int ncategories;
int subsetSize;
int nodeStep;
// gpu representation of classifier
GpuMat stage_mat;
GpuMat trees_mat;
GpuMat nodes_mat;
GpuMat leaves_mat;
GpuMat subsets_mat;
GpuMat features_mat;
GpuMat integral;
GpuMat integralBuffer;
GpuMat resuzeBuffer;
GpuMat candidates;
static const int integralFactor = 4;
};
cv::cuda::CascadeClassifier_GPU::CascadeClassifier_GPU()
: findLargestObject(false), visualizeInPlace(false), impl(0) {}
cv::cuda::CascadeClassifier_GPU::CascadeClassifier_GPU(const String& filename)
: findLargestObject(false), visualizeInPlace(false), impl(0) { load(filename); }
cv::cuda::CascadeClassifier_GPU::~CascadeClassifier_GPU() { release(); }
void cv::cuda::CascadeClassifier_GPU::release() { if (impl) { delete impl; impl = 0; } }
bool cv::cuda::CascadeClassifier_GPU::empty() const { return impl == 0; }
Size cv::cuda::CascadeClassifier_GPU::getClassifierSize() const
{
return this->empty() ? Size() : impl->getClassifierCvSize();
}
int cv::cuda::CascadeClassifier_GPU::detectMultiScale( const GpuMat& image, GpuMat& objectsBuf, double scaleFactor, int minNeighbors, Size minSize)
{
CV_Assert( !this->empty());
return impl->process(image, objectsBuf, (float)scaleFactor, minNeighbors, findLargestObject, visualizeInPlace, minSize, cv::Size());
}
int cv::cuda::CascadeClassifier_GPU::detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, Size maxObjectSize, Size minSize, double scaleFactor, int minNeighbors)
{
CV_Assert( !this->empty());
return impl->process(image, objectsBuf, (float)scaleFactor, minNeighbors, findLargestObject, visualizeInPlace, minSize, maxObjectSize);
}
bool cv::cuda::CascadeClassifier_GPU::load(const String& filename)
{
release();
String fext = filename.substr(filename.find_last_of(".") + 1);
fext = fext.toLowerCase();
if (fext == "nvbin")
{
impl = new HaarCascade();
return impl->read(filename);
}
FileStorage fs(filename, FileStorage::READ);
if (!fs.isOpened())
{
impl = new HaarCascade();
return impl->read(filename);
}
const char *GPU_CC_LBP = "LBP";
String featureTypeStr = (String)fs.getFirstTopLevelNode()["featureType"];
if (featureTypeStr == GPU_CC_LBP)
impl = new LbpCascade();
else
impl = new HaarCascade();
impl->read(filename);
return !this->empty();
}
#endif

View File

@@ -0,0 +1,193 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/reduce.hpp"
namespace cv { namespace cuda { namespace device
{
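// Upper bound on RANSAC iterations: rotation matrices and translation vectors for
// all hypotheses live in constant memory, so the limit is fixed at compile time.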
#define SOLVE_PNP_RANSAC_MAX_NUM_ITERS 200
namespace transform_points
{
__constant__ float3 crot0;
__constant__ float3 crot1;
__constant__ float3 crot2;
__constant__ float3 ctransl;
struct TransformOp : unary_function<float3, float3>
{
__device__ __forceinline__ float3 operator()(const float3& p) const
{
return make_float3(
crot0.x * p.x + crot0.y * p.y + crot0.z * p.z + ctransl.x,
crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y,
crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z);
}
__host__ __device__ __forceinline__ TransformOp() {}
__host__ __device__ __forceinline__ TransformOp(const TransformOp&) {}
};
void call(const PtrStepSz<float3> src, const float* rot,
const float* transl, PtrStepSz<float3> dst,
cudaStream_t stream)
{
cudaSafeCall(cudaMemcpyToSymbol(crot0, rot, sizeof(float) * 3));
cudaSafeCall(cudaMemcpyToSymbol(crot1, rot + 3, sizeof(float) * 3));
cudaSafeCall(cudaMemcpyToSymbol(crot2, rot + 6, sizeof(float) * 3));
cudaSafeCall(cudaMemcpyToSymbol(ctransl, transl, sizeof(float) * 3));
cv::cuda::device::transform(src, dst, TransformOp(), WithOutMask(), stream);
}
} // namespace transform_points
namespace project_points
{
__constant__ float3 crot0;
__constant__ float3 crot1;
__constant__ float3 crot2;
__constant__ float3 ctransl;
__constant__ float3 cproj0;
__constant__ float3 cproj1;
struct ProjectOp : unary_function<float3, float3>
{
__device__ __forceinline__ float2 operator()(const float3& p) const
{
// Rotate and translate in 3D
float3 t = make_float3(
crot0.x * p.x + crot0.y * p.y + crot0.z * p.z + ctransl.x,
crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y,
crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z);
// Project on 2D plane
return make_float2(
(cproj0.x * t.x + cproj0.y * t.y) / t.z + cproj0.z,
(cproj1.x * t.x + cproj1.y * t.y) / t.z + cproj1.z);
}
__host__ __device__ __forceinline__ ProjectOp() {}
__host__ __device__ __forceinline__ ProjectOp(const ProjectOp&) {}
};
void call(const PtrStepSz<float3> src, const float* rot,
const float* transl, const float* proj, PtrStepSz<float2> dst,
cudaStream_t stream)
{
cudaSafeCall(cudaMemcpyToSymbol(crot0, rot, sizeof(float) * 3));
cudaSafeCall(cudaMemcpyToSymbol(crot1, rot + 3, sizeof(float) * 3));
cudaSafeCall(cudaMemcpyToSymbol(crot2, rot + 6, sizeof(float) * 3));
cudaSafeCall(cudaMemcpyToSymbol(ctransl, transl, sizeof(float) * 3));
cudaSafeCall(cudaMemcpyToSymbol(cproj0, proj, sizeof(float) * 3));
cudaSafeCall(cudaMemcpyToSymbol(cproj1, proj + 3, sizeof(float) * 3));
cv::cuda::device::transform(src, dst, ProjectOp(), WithOutMask(), stream);
}
} // namespace project_points
namespace solve_pnp_ransac
{
__constant__ float3 crot_matrices[SOLVE_PNP_RANSAC_MAX_NUM_ITERS * 3];
__constant__ float3 ctransl_vectors[SOLVE_PNP_RANSAC_MAX_NUM_ITERS];
int maxNumIters()
{
return SOLVE_PNP_RANSAC_MAX_NUM_ITERS;
}
__device__ __forceinline__ float sqr(float x)
{
return x * x;
}
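// One CUDA block scores one pose hypothesis: each thread reprojects a strided
// subset of the object points using the hypothesis stored in constant memory,
// counts points whose squared reprojection error is below the threshold, and a
// block-wide reduction accumulates the per-hypothesis inlier count.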
template <int BLOCK_SIZE>
__global__ void computeHypothesisScoresKernel(
const int num_points, const float3* object, const float2* image,
const float dist_threshold, int* g_num_inliers)
{
const float3* const &rot_mat = crot_matrices + blockIdx.x * 3;
const float3 &transl_vec = ctransl_vectors[blockIdx.x];
int num_inliers = 0;
for (int i = threadIdx.x; i < num_points; i += blockDim.x)
{
float3 p = object[i];
p = make_float3(
rot_mat[0].x * p.x + rot_mat[0].y * p.y + rot_mat[0].z * p.z + transl_vec.x,
rot_mat[1].x * p.x + rot_mat[1].y * p.y + rot_mat[1].z * p.z + transl_vec.y,
rot_mat[2].x * p.x + rot_mat[2].y * p.y + rot_mat[2].z * p.z + transl_vec.z);
p.x /= p.z;
p.y /= p.z;
float2 image_p = image[i];
if (sqr(p.x - image_p.x) + sqr(p.y - image_p.y) < dist_threshold)
++num_inliers;
}
__shared__ int s_num_inliers[BLOCK_SIZE];
reduce<BLOCK_SIZE>(s_num_inliers, num_inliers, threadIdx.x, plus<int>());
if (threadIdx.x == 0)
g_num_inliers[blockIdx.x] = num_inliers;
}
void computeHypothesisScores(
const int num_hypotheses, const int num_points, const float* rot_matrices,
const float3* transl_vectors, const float3* object, const float2* image,
const float dist_threshold, int* hypothesis_scores)
{
cudaSafeCall(cudaMemcpyToSymbol(crot_matrices, rot_matrices, num_hypotheses * 3 * sizeof(float3)));
cudaSafeCall(cudaMemcpyToSymbol(ctransl_vectors, transl_vectors, num_hypotheses * sizeof(float3)));
dim3 threads(256);
dim3 grid(num_hypotheses);
computeHypothesisScoresKernel<256><<<grid, threads>>>(
num_points, object, image, dist_threshold, hypothesis_scores);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
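// Hedged host-side sketch: each thread block above scores one pose hypothesis, so a
// RANSAC caller would copy hypothesis_scores back and keep the hypothesis with the
// most inliers. bestHypothesis() is an illustrative helper, not a function of this
// module.
static int bestHypothesis(const int* hypothesis_scores, int num_hypotheses)
{
    int best = 0;
    for (int i = 1; i < num_hypotheses; ++i)
        if (hypothesis_scores[i] > hypothesis_scores[best])
            best = i;
    return best;
}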
} // namespace solve_pnp_ransac
}}} // namespace cv { namespace cuda { namespace device
#endif /* CUDA_DISABLER */

View File

@@ -0,0 +1,534 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include <opencv2/core/cuda/common.hpp>
#include <opencv2/core/cuda/vec_traits.hpp>
#include <opencv2/core/cuda/vec_math.hpp>
#include <opencv2/core/cuda/emulation.hpp>
#include <iostream>
#include <stdio.h>
namespace cv { namespace cuda { namespace device
{
namespace ccl
{
enum
{
WARP_SIZE = 32,
WARP_LOG = 5,
CTA_SIZE_X = 32,
CTA_SIZE_Y = 8,
STA_SIZE_MERGE_Y = 4,
STA_SIZE_MERGE_X = 32,
TPB_X = 1,
TPB_Y = 4,
TILE_COLS = CTA_SIZE_X * TPB_X,
TILE_ROWS = CTA_SIZE_Y * TPB_Y
};
template<typename T> struct IntervalsTraits
{
typedef T elem_type;
};
template<> struct IntervalsTraits<unsigned char>
{
typedef int dist_type;
enum {ch = 1};
};
template<> struct IntervalsTraits<uchar3>
{
typedef int3 dist_type;
enum {ch = 3};
};
template<> struct IntervalsTraits<uchar4>
{
typedef int4 dist_type;
enum {ch = 4};
};
template<> struct IntervalsTraits<unsigned short>
{
typedef int dist_type;
enum {ch = 1};
};
template<> struct IntervalsTraits<ushort3>
{
typedef int3 dist_type;
enum {ch = 3};
};
template<> struct IntervalsTraits<ushort4>
{
typedef int4 dist_type;
enum {ch = 4};
};
template<> struct IntervalsTraits<float>
{
typedef float dist_type;
enum {ch = 1};
};
template<> struct IntervalsTraits<int>
{
typedef int dist_type;
enum {ch = 1};
};
typedef unsigned char component;
enum Edges { UP = 1, DOWN = 2, LEFT = 4, RIGHT = 8, EMPTY = 0xF0 };
template<typename T, int CH> struct InInterval {};
template<typename T> struct InInterval<T, 1>
{
typedef typename VecTraits<T>::elem_type E;
__host__ __device__ __forceinline__ InInterval(const float4& _lo, const float4& _hi) : lo((E)(-_lo.x)), hi((E)_hi.x) {};
T lo, hi;
template<typename I> __device__ __forceinline__ bool operator() (const I& a, const I& b) const
{
I d = a - b;
return lo <= d && d <= hi;
}
};
template<typename T> struct InInterval<T, 3>
{
typedef typename VecTraits<T>::elem_type E;
__host__ __device__ __forceinline__ InInterval(const float4& _lo, const float4& _hi)
: lo (VecTraits<T>::make((E)(-_lo.x), (E)(-_lo.y), (E)(-_lo.z))), hi (VecTraits<T>::make((E)_hi.x, (E)_hi.y, (E)_hi.z)){};
T lo, hi;
template<typename I> __device__ __forceinline__ bool operator() (const I& a, const I& b) const
{
I d = saturate_cast<I>(a - b);
return lo.x <= d.x && d.x <= hi.x &&
lo.y <= d.y && d.y <= hi.y &&
lo.z <= d.z && d.z <= hi.z;
}
};
template<typename T> struct InInterval<T, 4>
{
typedef typename VecTraits<T>::elem_type E;
__host__ __device__ __forceinline__ InInterval(const float4& _lo, const float4& _hi)
: lo (VecTraits<T>::make((E)(-_lo.x), (E)(-_lo.y), (E)(-_lo.z), (E)(-_lo.w))), hi (VecTraits<T>::make((E)_hi.x, (E)_hi.y, (E)_hi.z, (E)_hi.w)){};
T lo, hi;
template<typename I> __device__ __forceinline__ bool operator() (const I& a, const I& b) const
{
I d = saturate_cast<I>(a - b);
return lo.x <= d.x && d.x <= hi.x &&
lo.y <= d.y && d.y <= hi.y &&
lo.z <= d.z && d.z <= hi.z &&
lo.w <= d.w && d.w <= hi.w;
}
};
template<typename T, typename F>
__global__ void computeConnectivity(const PtrStepSz<T> image, PtrStepSzb components, F connected)
{
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
if (x >= image.cols || y >= image.rows) return;
T intensity = image(y, x);
component c = 0;
if ( x > 0 && connected(intensity, image(y, x - 1)))
c |= LEFT;
if ( y > 0 && connected(intensity, image(y - 1, x)))
c |= UP;
if ( x + 1 < image.cols && connected(intensity, image(y, x + 1)))
c |= RIGHT;
if ( y + 1 < image.rows && connected(intensity, image(y + 1, x)))
c |= DOWN;
components(y, x) = c;
}
template< typename T>
void computeEdges(const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream)
{
dim3 block(CTA_SIZE_X, CTA_SIZE_Y);
dim3 grid(divUp(image.cols, block.x), divUp(image.rows, block.y));
typedef InInterval<typename IntervalsTraits<T>::dist_type, IntervalsTraits<T>::ch> Int_t;
Int_t inInt(lo, hi);
computeConnectivity<T, Int_t><<<grid, block, 0, stream>>>(static_cast<const PtrStepSz<T> >(image), edges, inInt);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void computeEdges<uchar> (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
template void computeEdges<uchar3> (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
template void computeEdges<uchar4> (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
template void computeEdges<ushort> (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
template void computeEdges<ushort3>(const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
template void computeEdges<ushort4>(const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
template void computeEdges<int> (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
template void computeEdges<float> (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
__global__ void lableTiles(const PtrStepSzb edges, PtrStepSzi comps)
{
int x = threadIdx.x + blockIdx.x * TILE_COLS;
int y = threadIdx.y + blockIdx.y * TILE_ROWS;
if (x >= edges.cols || y >= edges.rows) return;
// TPB_X is currently 1, so only the y bound needs to be checked here
int bounds = ((y + TPB_Y) < edges.rows);
__shared__ int labelsTile[TILE_ROWS][TILE_COLS];
__shared__ int edgesTile[TILE_ROWS][TILE_COLS];
int new_labels[TPB_Y][TPB_X];
int old_labels[TPB_Y][TPB_X];
#pragma unroll
for (int i = 0; i < TPB_Y; ++i)
#pragma unroll
for (int j = 0; j < TPB_X; ++j)
{
int yloc = threadIdx.y + CTA_SIZE_Y * i;
int xloc = threadIdx.x + CTA_SIZE_X * j;
component c = edges(bounds * (y + CTA_SIZE_Y * i), x + CTA_SIZE_X * j);
if (!xloc) c &= ~LEFT;
if (!yloc) c &= ~UP;
if (xloc == TILE_COLS -1) c &= ~RIGHT;
if (yloc == TILE_ROWS -1) c &= ~DOWN;
new_labels[i][j] = yloc * TILE_COLS + xloc;
edgesTile[yloc][xloc] = c;
}
for (int k = 0; ;++k)
{
//1. backup
#pragma unroll
for (int i = 0; i < TPB_Y; ++i)
#pragma unroll
for (int j = 0; j < TPB_X; ++j)
{
int yloc = threadIdx.y + CTA_SIZE_Y * i;
int xloc = threadIdx.x + CTA_SIZE_X * j;
old_labels[i][j] = new_labels[i][j];
labelsTile[yloc][xloc] = new_labels[i][j];
}
__syncthreads();
//2. compare local arrays
#pragma unroll
for (int i = 0; i < TPB_Y; ++i)
#pragma unroll
for (int j = 0; j < TPB_X; ++j)
{
int yloc = threadIdx.y + CTA_SIZE_Y * i;
int xloc = threadIdx.x + CTA_SIZE_X * j;
component c = edgesTile[yloc][xloc];
int label = new_labels[i][j];
if (c & UP)
label = ::min(label, labelsTile[yloc - 1][xloc]);
if (c & DOWN)
label = ::min(label, labelsTile[yloc + 1][xloc]);
if (c & LEFT)
label = ::min(label, labelsTile[yloc][xloc - 1]);
if (c & RIGHT)
label = ::min(label, labelsTile[yloc][xloc + 1]);
new_labels[i][j] = label;
}
__syncthreads();
//3. determine: Is any value changed?
int changed = 0;
#pragma unroll
for (int i = 0; i < TPB_Y; ++i)
#pragma unroll
for (int j = 0; j < TPB_X; ++j)
{
if (new_labels[i][j] < old_labels[i][j])
{
changed = 1;
Emulation::smem::atomicMin(&labelsTile[0][0] + old_labels[i][j], new_labels[i][j]);
}
}
changed = Emulation::syncthreadsOr(changed);
if (!changed)
break;
//4. Compact paths
const int *labels = &labelsTile[0][0];
#pragma unroll
for (int i = 0; i < TPB_Y; ++i)
#pragma unroll
for (int j = 0; j < TPB_X; ++j)
{
int label = new_labels[i][j];
while( labels[label] < label ) label = labels[label];
new_labels[i][j] = label;
}
__syncthreads();
}
#pragma unroll
for (int i = 0; i < TPB_Y; ++i)
#pragma unroll
for (int j = 0; j < TPB_X; ++j)
{
int label = new_labels[i][j];
int yloc = label / TILE_COLS;
int xloc = label - yloc * TILE_COLS;
xloc += blockIdx.x * TILE_COLS;
yloc += blockIdx.y * TILE_ROWS;
label = yloc * edges.cols + xloc;
// do it for x too.
if (y + CTA_SIZE_Y * i < comps.rows) comps(y + CTA_SIZE_Y * i, x + CTA_SIZE_X * j) = label;
}
}
__device__ __forceinline__ int root(const PtrStepSzi& comps, int label)
{
while(1)
{
int y = label / comps.cols;
int x = label - y * comps.cols;
int parent = comps(y, x);
if (label == parent) break;
label = parent;
}
return label;
}
__device__ __forceinline__ void isConnected(PtrStepSzi& comps, int l1, int l2, bool& changed)
{
int r1 = root(comps, l1);
int r2 = root(comps, l2);
if (r1 == r2) return;
int mi = ::min(r1, r2);
int ma = ::max(r1, r2);
int y = ma / comps.cols;
int x = ma - y * comps.cols;
atomicMin(&comps.ptr(y)[x], mi);
changed = true;
}
__global__ void crossMerge(const int tilesNumY, const int tilesNumX, int tileSizeY, int tileSizeX,
const PtrStepSzb edges, PtrStepSzi comps, const int yIncomplete, int xIncomplete)
{
int tid = threadIdx.y * blockDim.x + threadIdx.x;
int stride = blockDim.y * blockDim.x;
int ybegin = blockIdx.y * (tilesNumY * tileSizeY);
int yend = ybegin + tilesNumY * tileSizeY;
if (blockIdx.y == gridDim.y - 1)
{
yend -= yIncomplete * tileSizeY;
yend -= tileSizeY;
tileSizeY = (edges.rows % tileSizeY);
yend += tileSizeY;
}
int xbegin = blockIdx.x * tilesNumX * tileSizeX;
int xend = xbegin + tilesNumX * tileSizeX;
if (blockIdx.x == gridDim.x - 1)
{
if (xIncomplete) yend = ybegin;
xend -= xIncomplete * tileSizeX;
xend -= tileSizeX;
tileSizeX = (edges.cols % tileSizeX);
xend += tileSizeX;
}
if (blockIdx.y == (gridDim.y - 1) && yIncomplete)
{
xend = xbegin;
}
int tasksV = (tilesNumX - 1) * (yend - ybegin);
int tasksH = (tilesNumY - 1) * (xend - xbegin);
int total = tasksH + tasksV;
bool changed;
do
{
changed = false;
for (int taskIdx = tid; taskIdx < total; taskIdx += stride)
{
if (taskIdx < tasksH)
{
int indexH = taskIdx;
int row = indexH / (xend - xbegin);
int col = indexH - row * (xend - xbegin);
int y = ybegin + (row + 1) * tileSizeY;
int x = xbegin + col;
component e = edges( x, y);
if (e & UP)
{
int lc = comps(y,x);
int lu = comps(y - 1, x);
isConnected(comps, lc, lu, changed);
}
}
else
{
int indexV = taskIdx - tasksH;
int col = indexV / (yend - ybegin);
int row = indexV - col * (yend - ybegin);
int x = xbegin + (col + 1) * tileSizeX;
int y = ybegin + row;
component e = edges(x, y);
if (e & LEFT)
{
int lc = comps(y, x);
int ll = comps(y, x - 1);
isConnected(comps, lc, ll, changed);
}
}
}
} while (Emulation::syncthreadsOr(changed));
}
__global__ void flatten(const PtrStepSzb edges, PtrStepSzi comps)
{
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
if( x < comps.cols && y < comps.rows)
comps(y, x) = root(comps, comps(y, x));
}
enum {CC_NO_COMPACT = 0, CC_COMPACT_LABELS = 1};
void labelComponents(const PtrStepSzb& edges, PtrStepSzi comps, int flags, cudaStream_t stream)
{
(void) flags;
dim3 block(CTA_SIZE_X, CTA_SIZE_Y);
dim3 grid(divUp(edges.cols, TILE_COLS), divUp(edges.rows, TILE_ROWS));
lableTiles<<<grid, block, 0, stream>>>(edges, comps);
cudaSafeCall( cudaGetLastError() );
int tileSizeX = TILE_COLS, tileSizeY = TILE_ROWS;
while (grid.x > 1 || grid.y > 1)
{
dim3 mergeGrid((int)ceilf(grid.x / 2.f), (int)ceilf(grid.y / 2.f));
dim3 mergeBlock(STA_SIZE_MERGE_X, STA_SIZE_MERGE_Y);
// debug log
// std::cout << "merging: " << grid.y << " x " << grid.x << " ---> " << mergeGrid.y << " x " << mergeGrid.x << " for tiles: " << tileSizeY << " x " << tileSizeX << std::endl;
crossMerge<<<mergeGrid, mergeBlock, 0, stream>>>(2, 2, tileSizeY, tileSizeX, edges, comps, (int)ceilf(grid.y / 2.f) - grid.y / 2, (int)ceilf(grid.x / 2.f) - grid.x / 2);
tileSizeX <<= 1;
tileSizeY <<= 1;
grid = mergeGrid;
cudaSafeCall( cudaGetLastError() );
}
grid.x = divUp(edges.cols, block.x);
grid.y = divUp(edges.rows, block.y);
flatten<<<grid, block, 0, stream>>>(edges, comps);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
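// Hedged single-threaded reference (not part of this commit) of the fixed point the
// kernels above converge to: every pixel starts as its own row-major label and labels
// are repeatedly replaced by the minimum label of any connected neighbour, which is
// what lableTiles / crossMerge / flatten compute tile by tile on the GPU.
static void labelComponentsRef(const unsigned char* edges, int rows, int cols, int* comps)
{
    for (int i = 0; i < rows * cols; ++i) comps[i] = i;
    bool changed = true;
    while (changed)
    {
        changed = false;
        for (int y = 0; y < rows; ++y)
            for (int x = 0; x < cols; ++x)
            {
                int idx = y * cols + x;
                int label = comps[idx];
                unsigned char e = edges[idx];
                if ((e & LEFT)  && comps[idx - 1]    < label) label = comps[idx - 1];
                if ((e & UP)    && comps[idx - cols] < label) label = comps[idx - cols];
                if ((e & RIGHT) && comps[idx + 1]    < label) label = comps[idx + 1];
                if ((e & DOWN)  && comps[idx + cols] < label) label = comps[idx + cols];
                if (label < comps[idx]) { comps[idx] = label; changed = true; }
            }
    }
}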
}
} } }
#endif /* CUDA_DISABLER */

View File

@@ -0,0 +1,117 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include <thrust/device_ptr.h>
#include <thrust/remove.h>
#include <thrust/functional.h>
#include "opencv2/core/cuda/common.hpp"
namespace cv { namespace cuda { namespace device { namespace globmotion {
__constant__ float cml[9];
__constant__ float cmr[9];
int compactPoints(int N, float *points0, float *points1, const uchar *mask)
{
thrust::device_ptr<float2> dpoints0((float2*)points0);
thrust::device_ptr<float2> dpoints1((float2*)points1);
thrust::device_ptr<const uchar> dmask(mask);
return (int)(thrust::remove_if(thrust::make_zip_iterator(thrust::make_tuple(dpoints0, dpoints1)),
thrust::make_zip_iterator(thrust::make_tuple(dpoints0 + N, dpoints1 + N)),
dmask, thrust::not1(thrust::identity<uchar>()))
- thrust::make_zip_iterator(thrust::make_tuple(dpoints0, dpoints1)));
}
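// Hedged CPU reference of what the thrust::remove_if call above does: keep the i-th
// (x, y) pair from both point arrays only where mask[i] != 0 and return the number of
// surviving pairs. Points are interleaved x, y floats, matching the float2
// reinterpretation above. compactPointsRef() is illustrative only.
static int compactPointsRef(int N, float* points0, float* points1, const uchar* mask)
{
    int kept = 0;
    for (int i = 0; i < N; ++i)
    {
        if (!mask[i]) continue;
        points0[2 * kept]     = points0[2 * i];
        points0[2 * kept + 1] = points0[2 * i + 1];
        points1[2 * kept]     = points1[2 * i];
        points1[2 * kept + 1] = points1[2 * i + 1];
        ++kept;
    }
    return kept;
}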
__global__ void calcWobbleSuppressionMapsKernel(
const int left, const int idx, const int right, const int width, const int height,
PtrStepf mapx, PtrStepf mapy)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < width && y < height)
{
float xl = cml[0]*x + cml[1]*y + cml[2];
float yl = cml[3]*x + cml[4]*y + cml[5];
float izl = 1.f / (cml[6]*x + cml[7]*y + cml[8]);
xl *= izl;
yl *= izl;
float xr = cmr[0]*x + cmr[1]*y + cmr[2];
float yr = cmr[3]*x + cmr[4]*y + cmr[5];
float izr = 1.f / (cmr[6]*x + cmr[7]*y + cmr[8]);
xr *= izr;
yr *= izr;
float wl = idx - left;
float wr = right - idx;
mapx(y,x) = (wr * xl + wl * xr) / (wl + wr);
mapy(y,x) = (wr * yl + wl * yr) / (wl + wr);
}
}
void calcWobbleSuppressionMaps(
int left, int idx, int right, int width, int height,
const float *ml, const float *mr, PtrStepSzf mapx, PtrStepSzf mapy)
{
cudaSafeCall(cudaMemcpyToSymbol(cml, ml, 9*sizeof(float)));
cudaSafeCall(cudaMemcpyToSymbol(cmr, mr, 9*sizeof(float)));
dim3 threads(32, 8);
dim3 grid(divUp(width, threads.x), divUp(height, threads.y));
calcWobbleSuppressionMapsKernel<<<grid, threads>>>(
left, idx, right, width, height, mapx, mapy);
cudaSafeCall(cudaGetLastError());
cudaSafeCall(cudaDeviceSynchronize());
}
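// Hedged scalar sketch of the per-pixel math in the kernel above: each pixel is warped
// by the "left" and "right" row-major 3x3 homographies (cml, cmr), and the two results
// are blended so the output moves linearly from the left warp at idx == left to the
// right warp at idx == right. warpPoint() is illustrative only.
static void warpPoint(const float m[9], float x, float y, float& wx, float& wy)
{
    float iz = 1.f / (m[6] * x + m[7] * y + m[8]);
    wx = (m[0] * x + m[1] * y + m[2]) * iz;
    wy = (m[3] * x + m[4] * y + m[5]) * iz;
}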
}}}}
#endif /* CUDA_DISABLER */

View File

@@ -0,0 +1,814 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/warp_shuffle.hpp"
namespace cv { namespace cuda { namespace device
{
// Other values are not supported
#define CELL_WIDTH 8
#define CELL_HEIGHT 8
#define CELLS_PER_BLOCK_X 2
#define CELLS_PER_BLOCK_Y 2
namespace hog
{
__constant__ int cnbins;
__constant__ int cblock_stride_x;
__constant__ int cblock_stride_y;
__constant__ int cnblocks_win_x;
__constant__ int cnblocks_win_y;
__constant__ int cblock_hist_size;
__constant__ int cblock_hist_size_2up;
__constant__ int cdescr_size;
__constant__ int cdescr_width;
/* Returns the nearest upper power of two; works only for
the typical GPU thread counts (per block) */
int power_2up(unsigned int n)
{
if (n < 1) return 1;
else if (n < 2) return 2;
else if (n < 4) return 4;
else if (n < 8) return 8;
else if (n < 16) return 16;
else if (n < 32) return 32;
else if (n < 64) return 64;
else if (n < 128) return 128;
else if (n < 256) return 256;
else if (n < 512) return 512;
else if (n < 1024) return 1024;
return -1; // Input is too big
}
void set_up_constants(int nbins, int block_stride_x, int block_stride_y,
int nblocks_win_x, int nblocks_win_y)
{
cudaSafeCall( cudaMemcpyToSymbol(cnbins, &nbins, sizeof(nbins)) );
cudaSafeCall( cudaMemcpyToSymbol(cblock_stride_x, &block_stride_x, sizeof(block_stride_x)) );
cudaSafeCall( cudaMemcpyToSymbol(cblock_stride_y, &block_stride_y, sizeof(block_stride_y)) );
cudaSafeCall( cudaMemcpyToSymbol(cnblocks_win_x, &nblocks_win_x, sizeof(nblocks_win_x)) );
cudaSafeCall( cudaMemcpyToSymbol(cnblocks_win_y, &nblocks_win_y, sizeof(nblocks_win_y)) );
int block_hist_size = nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y;
cudaSafeCall( cudaMemcpyToSymbol(cblock_hist_size, &block_hist_size, sizeof(block_hist_size)) );
int block_hist_size_2up = power_2up(block_hist_size);
cudaSafeCall( cudaMemcpyToSymbol(cblock_hist_size_2up, &block_hist_size_2up, sizeof(block_hist_size_2up)) );
int descr_width = nblocks_win_x * block_hist_size;
cudaSafeCall( cudaMemcpyToSymbol(cdescr_width, &descr_width, sizeof(descr_width)) );
int descr_size = descr_width * nblocks_win_y;
cudaSafeCall( cudaMemcpyToSymbol(cdescr_size, &descr_size, sizeof(descr_size)) );
}
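// Hedged sizing example for the constants set above, assuming the default HOG people
// detector geometry (64x128 window, 8x8 cells, 2x2 cells per block, 8-pixel block
// stride, 9 bins); those defaults come from the host-side HOG code, not this file.
static int hogDescriptorSizeExample()
{
    const int nbins = 9, cell = 8, cells_per_block = 2, block_stride = 8;
    const int win_w = 64, win_h = 128;
    int block_hist_size = nbins * cells_per_block * cells_per_block;          // 36
    int nblocks_win_x = (win_w - cells_per_block * cell) / block_stride + 1;  // 7
    int nblocks_win_y = (win_h - cells_per_block * cell) / block_stride + 1;  // 15
    return block_hist_size * nblocks_win_x * nblocks_win_y;                   // 3780
}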
//----------------------------------------------------------------------------
// Histogram computation
template <int nblocks> // Number of histogram blocks processed by a single GPU thread block
__global__ void compute_hists_kernel_many_blocks(const int img_block_width, const PtrStepf grad,
const PtrStepb qangle, float scale, float* block_hists)
{
const int block_x = threadIdx.z;
const int cell_x = threadIdx.x / 16;
const int cell_y = threadIdx.y;
const int cell_thread_x = threadIdx.x & 0xF;
if (blockIdx.x * blockDim.z + block_x >= img_block_width)
return;
extern __shared__ float smem[];
float* hists = smem;
float* final_hist = smem + cnbins * 48 * nblocks;
const int offset_x = (blockIdx.x * blockDim.z + block_x) * cblock_stride_x +
4 * cell_x + cell_thread_x;
const int offset_y = blockIdx.y * cblock_stride_y + 4 * cell_y;
const float* grad_ptr = grad.ptr(offset_y) + offset_x * 2;
const unsigned char* qangle_ptr = qangle.ptr(offset_y) + offset_x * 2;
// 12 means that 12 pixels affect a block's cell (in one row)
if (cell_thread_x < 12)
{
float* hist = hists + 12 * (cell_y * blockDim.z * CELLS_PER_BLOCK_Y +
cell_x + block_x * CELLS_PER_BLOCK_X) +
cell_thread_x;
for (int bin_id = 0; bin_id < cnbins; ++bin_id)
hist[bin_id * 48 * nblocks] = 0.f;
const int dist_x = -4 + (int)cell_thread_x - 4 * cell_x;
const int dist_y_begin = -4 - 4 * (int)threadIdx.y;
for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y)
{
float2 vote = *(const float2*)grad_ptr;
uchar2 bin = *(const uchar2*)qangle_ptr;
grad_ptr += grad.step/sizeof(float);
qangle_ptr += qangle.step;
int dist_center_y = dist_y - 4 * (1 - 2 * cell_y);
int dist_center_x = dist_x - 4 * (1 - 2 * cell_x);
float gaussian = ::expf(-(dist_center_y * dist_center_y +
dist_center_x * dist_center_x) * scale);
float interp_weight = (8.f - ::fabs(dist_y + 0.5f)) *
(8.f - ::fabs(dist_x + 0.5f)) / 64.f;
hist[bin.x * 48 * nblocks] += gaussian * interp_weight * vote.x;
hist[bin.y * 48 * nblocks] += gaussian * interp_weight * vote.y;
}
volatile float* hist_ = hist;
for (int bin_id = 0; bin_id < cnbins; ++bin_id, hist_ += 48 * nblocks)
{
if (cell_thread_x < 6) hist_[0] += hist_[6];
if (cell_thread_x < 3) hist_[0] += hist_[3];
if (cell_thread_x == 0)
final_hist[((cell_x + block_x * 2) * 2 + cell_y) * cnbins + bin_id]
= hist_[0] + hist_[1] + hist_[2];
}
}
__syncthreads();
float* block_hist = block_hists + (blockIdx.y * img_block_width +
blockIdx.x * blockDim.z + block_x) *
cblock_hist_size;
int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 16 + cell_thread_x;
if (tid < cblock_hist_size)
block_hist[tid] = final_hist[block_x * cblock_hist_size + tid];
}
void compute_hists(int nbins, int block_stride_x, int block_stride_y,
int height, int width, const PtrStepSzf& grad,
const PtrStepSzb& qangle, float sigma, float* block_hists)
{
const int nblocks = 1;
int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) /
block_stride_x;
int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y) /
block_stride_y;
dim3 grid(divUp(img_block_width, nblocks), img_block_height);
dim3 threads(32, 2, nblocks);
cudaSafeCall(cudaFuncSetCacheConfig(compute_hists_kernel_many_blocks<nblocks>,
cudaFuncCachePreferL1));
// Precompute gaussian spatial window parameter
float scale = 1.f / (2.f * sigma * sigma);
int hists_size = (nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12 * nblocks) * sizeof(float);
int final_hists_size = (nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * nblocks) * sizeof(float);
int smem = hists_size + final_hists_size;
compute_hists_kernel_many_blocks<nblocks><<<grid, threads, smem>>>(
img_block_width, grad, qangle, scale, block_hists);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
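// Hedged shared-memory sizing example for the launch above with the common nbins = 9
// and nblocks = 1: hists take 9 * 2 * 2 * 12 * sizeof(float) = 1728 bytes and the
// per-block final histogram 9 * 2 * 2 * sizeof(float) = 144 bytes, so the kernel
// requests 1872 bytes of dynamic shared memory per thread block.
static int computeHistsSmemBytes(int nbins, int nblocks)
{
    int hists_size       = nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12 * nblocks * (int)sizeof(float);
    int final_hists_size = nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * nblocks * (int)sizeof(float);
    return hists_size + final_hists_size;   // 1872 for nbins = 9, nblocks = 1
}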
//-------------------------------------------------------------
// Normalization of histograms via L2Hys_norm
//
template<int size>
__device__ float reduce_smem(float* smem, float val)
{
unsigned int tid = threadIdx.x;
float sum = val;
reduce<size>(smem, sum, tid, plus<float>());
if (size == 32)
{
#if __CUDA_ARCH__ >= 300
return shfl(sum, 0);
#else
return smem[0];
#endif
}
else
{
#if __CUDA_ARCH__ >= 300
if (threadIdx.x == 0)
smem[0] = sum;
#endif
__syncthreads();
return smem[0];
}
}
template <int nthreads, // Number of threads which process one block histogram
int nblocks> // Number of block histograms processed by one GPU thread block
__global__ void normalize_hists_kernel_many_blocks(const int block_hist_size,
const int img_block_width,
float* block_hists, float threshold)
{
if (blockIdx.x * blockDim.z + threadIdx.z >= img_block_width)
return;
float* hist = block_hists + (blockIdx.y * img_block_width +
blockIdx.x * blockDim.z + threadIdx.z) *
block_hist_size + threadIdx.x;
__shared__ float sh_squares[nthreads * nblocks];
float* squares = sh_squares + threadIdx.z * nthreads;
float elem = 0.f;
if (threadIdx.x < block_hist_size)
elem = hist[0];
float sum = reduce_smem<nthreads>(squares, elem * elem);
float scale = 1.0f / (::sqrtf(sum) + 0.1f * block_hist_size);
elem = ::min(elem * scale, threshold);
sum = reduce_smem<nthreads>(squares, elem * elem);
scale = 1.0f / (::sqrtf(sum) + 1e-3f);
if (threadIdx.x < block_hist_size)
hist[0] = elem * scale;
}
void normalize_hists(int nbins, int block_stride_x, int block_stride_y,
int height, int width, float* block_hists, float threshold)
{
const int nblocks = 1;
int block_hist_size = nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y;
int nthreads = power_2up(block_hist_size);
dim3 threads(nthreads, 1, nblocks);
int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y) / block_stride_y;
dim3 grid(divUp(img_block_width, nblocks), img_block_height);
if (nthreads == 32)
normalize_hists_kernel_many_blocks<32, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
else if (nthreads == 64)
normalize_hists_kernel_many_blocks<64, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
else if (nthreads == 128)
normalize_hists_kernel_many_blocks<128, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
else if (nthreads == 256)
normalize_hists_kernel_many_blocks<256, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
else if (nthreads == 512)
normalize_hists_kernel_many_blocks<512, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
else
CV_Error(cv::Error::StsBadArg, "normalize_hists: histogram's size is too big, try to decrease number of bins");
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
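// Hedged CPU reference of the L2-Hys scheme the kernel above implements with two
// block-wide reductions: L2-normalize with the 0.1 * block_hist_size regularizer,
// clip every bin at `threshold`, then L2-normalize again with a small epsilon.
// l2hysRef() is illustrative only and relies on the host math functions (sqrtf)
// already pulled in by the common headers, which is an assumption.
static void l2hysRef(float* hist, int block_hist_size, float threshold)
{
    float sum = 0.f;
    for (int i = 0; i < block_hist_size; ++i) sum += hist[i] * hist[i];
    float scale = 1.f / (::sqrtf(sum) + 0.1f * block_hist_size);

    sum = 0.f;
    for (int i = 0; i < block_hist_size; ++i)
    {
        hist[i] *= scale;
        if (hist[i] > threshold) hist[i] = threshold;   // the "Hys" clipping step
        sum += hist[i] * hist[i];
    }
    scale = 1.f / (::sqrtf(sum) + 1e-3f);
    for (int i = 0; i < block_hist_size; ++i) hist[i] *= scale;
}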
//---------------------------------------------------------------------
// Linear SVM based classification
//
// Return confidence values, not just positive locations
template <int nthreads, // Number of threads per one histogram block
int nblocks> // Number of histogram blocks processed by a single GPU thread block
__global__ void compute_confidence_hists_kernel_many_blocks(const int img_win_width, const int img_block_width,
const int win_block_stride_x, const int win_block_stride_y,
const float* block_hists, const float* coefs,
float free_coef, float threshold, float* confidences)
{
const int win_x = threadIdx.z;
if (blockIdx.x * blockDim.z + win_x >= img_win_width)
return;
const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width +
blockIdx.x * win_block_stride_x * blockDim.z + win_x) *
cblock_hist_size;
float product = 0.f;
for (int i = threadIdx.x; i < cdescr_size; i += nthreads)
{
int offset_y = i / cdescr_width;
int offset_x = i - offset_y * cdescr_width;
product += coefs[i] * hist[offset_y * img_block_width * cblock_hist_size + offset_x];
}
__shared__ float products[nthreads * nblocks];
const int tid = threadIdx.z * nthreads + threadIdx.x;
reduce<nthreads>(products, product, tid, plus<float>());
if (threadIdx.x == 0)
confidences[blockIdx.y * img_win_width + blockIdx.x * blockDim.z + win_x] = product + free_coef;
}
void compute_confidence_hists(int win_height, int win_width, int block_stride_y, int block_stride_x,
int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
float* coefs, float free_coef, float threshold, float *confidences)
{
const int nthreads = 256;
const int nblocks = 1;
int win_block_stride_x = win_stride_x / block_stride_x;
int win_block_stride_y = win_stride_y / block_stride_y;
int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
dim3 threads(nthreads, 1, nblocks);
dim3 grid(divUp(img_win_width, nblocks), img_win_height);
cudaSafeCall(cudaFuncSetCacheConfig(compute_confidence_hists_kernel_many_blocks<nthreads, nblocks>,
cudaFuncCachePreferL1));
int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) /
block_stride_x;
compute_confidence_hists_kernel_many_blocks<nthreads, nblocks><<<grid, threads>>>(
img_win_width, img_block_width, win_block_stride_x, win_block_stride_y,
block_hists, coefs, free_coef, threshold, confidences);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
template <int nthreads, // Number of threads per one histogram block
int nblocks> // Number of histogram blocks processed by a single GPU thread block
__global__ void classify_hists_kernel_many_blocks(const int img_win_width, const int img_block_width,
const int win_block_stride_x, const int win_block_stride_y,
const float* block_hists, const float* coefs,
float free_coef, float threshold, unsigned char* labels)
{
const int win_x = threadIdx.z;
if (blockIdx.x * blockDim.z + win_x >= img_win_width)
return;
const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width +
blockIdx.x * win_block_stride_x * blockDim.z + win_x) *
cblock_hist_size;
float product = 0.f;
for (int i = threadIdx.x; i < cdescr_size; i += nthreads)
{
int offset_y = i / cdescr_width;
int offset_x = i - offset_y * cdescr_width;
product += coefs[i] * hist[offset_y * img_block_width * cblock_hist_size + offset_x];
}
__shared__ float products[nthreads * nblocks];
const int tid = threadIdx.z * nthreads + threadIdx.x;
reduce<nthreads>(products, product, tid, plus<float>());
if (threadIdx.x == 0)
labels[blockIdx.y * img_win_width + blockIdx.x * blockDim.z + win_x] = (product + free_coef >= threshold);
}
void classify_hists(int win_height, int win_width, int block_stride_y, int block_stride_x,
int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
float* coefs, float free_coef, float threshold, unsigned char* labels)
{
const int nthreads = 256;
const int nblocks = 1;
int win_block_stride_x = win_stride_x / block_stride_x;
int win_block_stride_y = win_stride_y / block_stride_y;
int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
dim3 threads(nthreads, 1, nblocks);
dim3 grid(divUp(img_win_width, nblocks), img_win_height);
cudaSafeCall(cudaFuncSetCacheConfig(classify_hists_kernel_many_blocks<nthreads, nblocks>, cudaFuncCachePreferL1));
int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
classify_hists_kernel_many_blocks<nthreads, nblocks><<<grid, threads>>>(
img_win_width, img_block_width, win_block_stride_x, win_block_stride_y,
block_hists, coefs, free_coef, threshold, labels);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
//----------------------------------------------------------------------------
// Extract descriptors
template <int nthreads>
__global__ void extract_descrs_by_rows_kernel(const int img_block_width, const int win_block_stride_x, const int win_block_stride_y,
const float* block_hists, PtrStepf descriptors)
{
// Get left top corner of the window in src
const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width +
blockIdx.x * win_block_stride_x) * cblock_hist_size;
// Get left top corner of the window in dst
float* descriptor = descriptors.ptr(blockIdx.y * gridDim.x + blockIdx.x);
// Copy elements from src to dst
for (int i = threadIdx.x; i < cdescr_size; i += nthreads)
{
int offset_y = i / cdescr_width;
int offset_x = i - offset_y * cdescr_width;
descriptor[i] = hist[offset_y * img_block_width * cblock_hist_size + offset_x];
}
}
void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x, int win_stride_y, int win_stride_x,
int height, int width, float* block_hists, PtrStepSzf descriptors)
{
const int nthreads = 256;
int win_block_stride_x = win_stride_x / block_stride_x;
int win_block_stride_y = win_stride_y / block_stride_y;
int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
dim3 threads(nthreads, 1);
dim3 grid(img_win_width, img_win_height);
int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
extract_descrs_by_rows_kernel<nthreads><<<grid, threads>>>(
img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
template <int nthreads>
__global__ void extract_descrs_by_cols_kernel(const int img_block_width, const int win_block_stride_x,
const int win_block_stride_y, const float* block_hists,
PtrStepf descriptors)
{
// Get left top corner of the window in src
const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width +
blockIdx.x * win_block_stride_x) * cblock_hist_size;
// Get left top corner of the window in dst
float* descriptor = descriptors.ptr(blockIdx.y * gridDim.x + blockIdx.x);
// Copy elements from src to dst
for (int i = threadIdx.x; i < cdescr_size; i += nthreads)
{
int block_idx = i / cblock_hist_size;
int idx_in_block = i - block_idx * cblock_hist_size;
int y = block_idx / cnblocks_win_x;
int x = block_idx - y * cnblocks_win_x;
descriptor[(x * cnblocks_win_y + y) * cblock_hist_size + idx_in_block]
= hist[(y * img_block_width + x) * cblock_hist_size + idx_in_block];
}
}
void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x,
int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
PtrStepSzf descriptors)
{
const int nthreads = 256;
int win_block_stride_x = win_stride_x / block_stride_x;
int win_block_stride_y = win_stride_y / block_stride_y;
int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
dim3 threads(nthreads, 1);
dim3 grid(img_win_width, img_win_height);
int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
extract_descrs_by_cols_kernel<nthreads><<<grid, threads>>>(
img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
//----------------------------------------------------------------------------
// Gradients computation
template <int nthreads, int correct_gamma>
__global__ void compute_gradients_8UC4_kernel(int height, int width, const PtrStepb img,
float angle_scale, PtrStepf grad, PtrStepb qangle)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const uchar4* row = (const uchar4*)img.ptr(blockIdx.y);
__shared__ float sh_row[(nthreads + 2) * 3];
uchar4 val;
if (x < width)
val = row[x];
else
val = row[width - 2];
sh_row[threadIdx.x + 1] = val.x;
sh_row[threadIdx.x + 1 + (nthreads + 2)] = val.y;
sh_row[threadIdx.x + 1 + 2 * (nthreads + 2)] = val.z;
if (threadIdx.x == 0)
{
val = row[::max(x - 1, 1)];
sh_row[0] = val.x;
sh_row[(nthreads + 2)] = val.y;
sh_row[2 * (nthreads + 2)] = val.z;
}
if (threadIdx.x == blockDim.x - 1)
{
val = row[::min(x + 1, width - 2)];
sh_row[blockDim.x + 1] = val.x;
sh_row[blockDim.x + 1 + (nthreads + 2)] = val.y;
sh_row[blockDim.x + 1 + 2 * (nthreads + 2)] = val.z;
}
__syncthreads();
if (x < width)
{
float3 a, b;
b.x = sh_row[threadIdx.x + 2];
b.y = sh_row[threadIdx.x + 2 + (nthreads + 2)];
b.z = sh_row[threadIdx.x + 2 + 2 * (nthreads + 2)];
a.x = sh_row[threadIdx.x];
a.y = sh_row[threadIdx.x + (nthreads + 2)];
a.z = sh_row[threadIdx.x + 2 * (nthreads + 2)];
float3 dx;
if (correct_gamma)
dx = make_float3(::sqrtf(b.x) - ::sqrtf(a.x), ::sqrtf(b.y) - ::sqrtf(a.y), ::sqrtf(b.z) - ::sqrtf(a.z));
else
dx = make_float3(b.x - a.x, b.y - a.y, b.z - a.z);
float3 dy = make_float3(0.f, 0.f, 0.f);
if (blockIdx.y > 0 && blockIdx.y < height - 1)
{
val = ((const uchar4*)img.ptr(blockIdx.y - 1))[x];
a = make_float3(val.x, val.y, val.z);
val = ((const uchar4*)img.ptr(blockIdx.y + 1))[x];
b = make_float3(val.x, val.y, val.z);
if (correct_gamma)
dy = make_float3(::sqrtf(b.x) - ::sqrtf(a.x), ::sqrtf(b.y) - ::sqrtf(a.y), ::sqrtf(b.z) - ::sqrtf(a.z));
else
dy = make_float3(b.x - a.x, b.y - a.y, b.z - a.z);
}
float best_dx = dx.x;
float best_dy = dy.x;
float mag0 = dx.x * dx.x + dy.x * dy.x;
float mag1 = dx.y * dx.y + dy.y * dy.y;
if (mag0 < mag1)
{
best_dx = dx.y;
best_dy = dy.y;
mag0 = mag1;
}
mag1 = dx.z * dx.z + dy.z * dy.z;
if (mag0 < mag1)
{
best_dx = dx.z;
best_dy = dy.z;
mag0 = mag1;
}
mag0 = ::sqrtf(mag0);
float ang = (::atan2f(best_dy, best_dx) + CV_PI_F) * angle_scale - 0.5f;
int hidx = (int)::floorf(ang);
ang -= hidx;
hidx = (hidx + cnbins) % cnbins;
((uchar2*)qangle.ptr(blockIdx.y))[x] = make_uchar2(hidx, (hidx + 1) % cnbins);
((float2*)grad.ptr(blockIdx.y))[x] = make_float2(mag0 * (1.f - ang), mag0 * ang);
}
}
void compute_gradients_8UC4(int nbins, int height, int width, const PtrStepSzb& img,
float angle_scale, PtrStepSzf grad, PtrStepSzb qangle, bool correct_gamma)
{
(void)nbins;
const int nthreads = 256;
dim3 bdim(nthreads, 1);
dim3 gdim(divUp(width, bdim.x), divUp(height, bdim.y));
if (correct_gamma)
compute_gradients_8UC4_kernel<nthreads, 1><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);
else
compute_gradients_8UC4_kernel<nthreads, 0><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
template <int nthreads, int correct_gamma>
__global__ void compute_gradients_8UC1_kernel(int height, int width, const PtrStepb img,
float angle_scale, PtrStepf grad, PtrStepb qangle)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const unsigned char* row = (const unsigned char*)img.ptr(blockIdx.y);
__shared__ float sh_row[nthreads + 2];
if (x < width)
sh_row[threadIdx.x + 1] = row[x];
else
sh_row[threadIdx.x + 1] = row[width - 2];
if (threadIdx.x == 0)
sh_row[0] = row[::max(x - 1, 1)];
if (threadIdx.x == blockDim.x - 1)
sh_row[blockDim.x + 1] = row[::min(x + 1, width - 2)];
__syncthreads();
if (x < width)
{
float dx;
if (correct_gamma)
dx = ::sqrtf(sh_row[threadIdx.x + 2]) - ::sqrtf(sh_row[threadIdx.x]);
else
dx = sh_row[threadIdx.x + 2] - sh_row[threadIdx.x];
float dy = 0.f;
if (blockIdx.y > 0 && blockIdx.y < height - 1)
{
float a = ((const unsigned char*)img.ptr(blockIdx.y + 1))[x];
float b = ((const unsigned char*)img.ptr(blockIdx.y - 1))[x];
if (correct_gamma)
dy = ::sqrtf(a) - ::sqrtf(b);
else
dy = a - b;
}
float mag = ::sqrtf(dx * dx + dy * dy);
float ang = (::atan2f(dy, dx) + CV_PI_F) * angle_scale - 0.5f;
int hidx = (int)::floorf(ang);
ang -= hidx;
hidx = (hidx + cnbins) % cnbins;
((uchar2*)qangle.ptr(blockIdx.y))[x] = make_uchar2(hidx, (hidx + 1) % cnbins);
((float2*) grad.ptr(blockIdx.y))[x] = make_float2(mag * (1.f - ang), mag * ang);
}
}
void compute_gradients_8UC1(int nbins, int height, int width, const PtrStepSzb& img,
float angle_scale, PtrStepSzf grad, PtrStepSzb qangle, bool correct_gamma)
{
(void)nbins;
const int nthreads = 256;
dim3 bdim(nthreads, 1);
dim3 gdim(divUp(width, bdim.x), divUp(height, bdim.y));
if (correct_gamma)
compute_gradients_8UC1_kernel<nthreads, 1><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);
else
compute_gradients_8UC1_kernel<nthreads, 0><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
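// Hedged scalar sketch of the soft binning used by both gradient kernels above: the
// angle is shifted into [0, 2*pi), scaled by angle_scale (the caller is assumed to
// pass roughly nbins / (2*pi); that value lives in the host-side HOG code, not here),
// and the magnitude is split linearly between the two nearest bins.
static void hogVoteRef(float dx, float dy, float angle_scale, int nbins,
                       int& bin0, int& bin1, float& vote0, float& vote1)
{
    float mag = ::sqrtf(dx * dx + dy * dy);
    float ang = (::atan2f(dy, dx) + CV_PI_F) * angle_scale - 0.5f;
    int hidx = (int)::floorf(ang);
    ang -= hidx;
    bin0 = (hidx + nbins) % nbins;
    bin1 = (bin0 + 1) % nbins;
    vote0 = mag * (1.f - ang);
    vote1 = mag * ang;
}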
//-------------------------------------------------------------------
// Resize
texture<uchar4, 2, cudaReadModeNormalizedFloat> resize8UC4_tex;
texture<uchar, 2, cudaReadModeNormalizedFloat> resize8UC1_tex;
__global__ void resize_for_hog_kernel(float sx, float sy, PtrStepSz<uchar> dst, int colOfs)
{
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
dst.ptr(y)[x] = tex2D(resize8UC1_tex, x * sx + colOfs, y * sy) * 255;
}
__global__ void resize_for_hog_kernel(float sx, float sy, PtrStepSz<uchar4> dst, int colOfs)
{
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
{
float4 val = tex2D(resize8UC4_tex, x * sx + colOfs, y * sy);
dst.ptr(y)[x] = make_uchar4(val.x * 255, val.y * 255, val.z * 255, val.w * 255);
}
}
template<class T, class TEX>
static void resize_for_hog(const PtrStepSzb& src, PtrStepSzb dst, TEX& tex)
{
tex.filterMode = cudaFilterModeLinear;
size_t texOfs = 0;
int colOfs = 0;
cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
cudaSafeCall( cudaBindTexture2D(&texOfs, tex, src.data, desc, src.cols, src.rows, src.step) );
if (texOfs != 0)
{
colOfs = static_cast<int>( texOfs/sizeof(T) );
cudaSafeCall( cudaUnbindTexture(tex) );
cudaSafeCall( cudaBindTexture2D(&texOfs, tex, src.data, desc, src.cols, src.rows, src.step) );
}
dim3 threads(32, 8);
dim3 grid(divUp(dst.cols, threads.x), divUp(dst.rows, threads.y));
float sx = static_cast<float>(src.cols) / dst.cols;
float sy = static_cast<float>(src.rows) / dst.rows;
resize_for_hog_kernel<<<grid, threads>>>(sx, sy, (PtrStepSz<T>)dst, colOfs);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
cudaSafeCall( cudaUnbindTexture(tex) );
}
void resize_8UC1(const PtrStepSzb& src, PtrStepSzb dst) { resize_for_hog<uchar> (src, dst, resize8UC1_tex); }
void resize_8UC4(const PtrStepSzb& src, PtrStepSzb dst) { resize_for_hog<uchar4>(src, dst, resize8UC4_tex); }
} // namespace hog
}}} // namespace cv { namespace cuda { namespace device
#endif /* CUDA_DISABLER */

View File

@@ -0,0 +1,303 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "lbp.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
namespace cv { namespace cuda { namespace device
{
namespace lbp
{
struct LBP
{
__host__ __device__ __forceinline__ LBP() {}
__device__ __forceinline__ int operator() (const int* integral, int ty, int fh, int fw, int& shift) const
{
int anchors[9];
anchors[0] = integral[ty];
anchors[1] = integral[ty + fw];
anchors[0] -= anchors[1];
anchors[2] = integral[ty + fw * 2];
anchors[1] -= anchors[2];
anchors[2] -= integral[ty + fw * 3];
ty += fh;
anchors[3] = integral[ty];
anchors[4] = integral[ty + fw];
anchors[3] -= anchors[4];
anchors[5] = integral[ty + fw * 2];
anchors[4] -= anchors[5];
anchors[5] -= integral[ty + fw * 3];
anchors[0] -= anchors[3];
anchors[1] -= anchors[4];
anchors[2] -= anchors[5];
// 0 - 2 contains s0 - s2
ty += fh;
anchors[6] = integral[ty];
anchors[7] = integral[ty + fw];
anchors[6] -= anchors[7];
anchors[8] = integral[ty + fw * 2];
anchors[7] -= anchors[8];
anchors[8] -= integral[ty + fw * 3];
anchors[3] -= anchors[6];
anchors[4] -= anchors[7];
anchors[5] -= anchors[8];
// 3 - 5 contains s3 - s5
anchors[0] -= anchors[4];
anchors[1] -= anchors[4];
anchors[2] -= anchors[4];
anchors[3] -= anchors[4];
anchors[5] -= anchors[4];
int response = (~(anchors[0] >> 31)) & 4;
response |= (~(anchors[1] >> 31)) & 2;
response |= (~(anchors[2] >> 31)) & 1;
shift = (~(anchors[5] >> 31)) & 16;
shift |= (~(anchors[3] >> 31)) & 1;
ty += fh;
anchors[0] = integral[ty];
anchors[1] = integral[ty + fw];
anchors[0] -= anchors[1];
anchors[2] = integral[ty + fw * 2];
anchors[1] -= anchors[2];
anchors[2] -= integral[ty + fw * 3];
anchors[6] -= anchors[0];
anchors[7] -= anchors[1];
anchors[8] -= anchors[2];
// 6 - 8 contains s6 - s8
anchors[6] -= anchors[4];
anchors[7] -= anchors[4];
anchors[8] -= anchors[4];
shift |= (~(anchors[6] >> 31)) & 2;
shift |= (~(anchors[7] >> 31)) & 4;
shift |= (~(anchors[8] >> 31)) & 8;
return response;
}
};
template<typename Pr>
__global__ void disjoin(int4* candidates, int4* objects, unsigned int n, int groupThreshold, float grouping_eps, unsigned int* nclasses)
{
unsigned int tid = threadIdx.x;
extern __shared__ int sbuff[];
int* labels = sbuff;
int* rrects = sbuff + n;
Pr predicate(grouping_eps);
partition(candidates, n, labels, predicate);
rrects[tid * 4 + 0] = 0;
rrects[tid * 4 + 1] = 0;
rrects[tid * 4 + 2] = 0;
rrects[tid * 4 + 3] = 0;
__syncthreads();
int cls = labels[tid];
Emulation::smem::atomicAdd((rrects + cls * 4 + 0), candidates[tid].x);
Emulation::smem::atomicAdd((rrects + cls * 4 + 1), candidates[tid].y);
Emulation::smem::atomicAdd((rrects + cls * 4 + 2), candidates[tid].z);
Emulation::smem::atomicAdd((rrects + cls * 4 + 3), candidates[tid].w);
__syncthreads();
labels[tid] = 0;
__syncthreads();
Emulation::smem::atomicInc((unsigned int*)labels + cls, n);
__syncthreads();
*nclasses = 0;
int active = labels[tid];
if (active)
{
int* r1 = rrects + tid * 4;
float s = 1.f / active;
r1[0] = saturate_cast<int>(r1[0] * s);
r1[1] = saturate_cast<int>(r1[1] * s);
r1[2] = saturate_cast<int>(r1[2] * s);
r1[3] = saturate_cast<int>(r1[3] * s);
}
__syncthreads();
if (active && active >= groupThreshold)
{
int* r1 = rrects + tid * 4;
int4 r_out = make_int4(r1[0], r1[1], r1[2], r1[3]);
int aidx = Emulation::smem::atomicInc(nclasses, n);
objects[aidx] = r_out;
}
}
void connectedConmonents(PtrStepSz<int4> candidates, int ncandidates, PtrStepSz<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses)
{
if (!ncandidates) return;
int block = ncandidates;
int smem = block * ( sizeof(int) + sizeof(int4) );
disjoin<InSameComponint><<<1, block, smem>>>(candidates, objects, ncandidates, groupThreshold, grouping_eps, nclasses);
cudaSafeCall( cudaGetLastError() );
}
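// Hedged sizing note for the single-block launch above: one thread per candidate and
// smem = ncandidates * (sizeof(int) + sizeof(int4)) = 20 bytes per candidate, so the
// candidate count is bounded both by shared memory (about 2400 for a 48 KB limit,
// a device-dependent assumption) and by the 1024-thread limit of a CUDA block.
// maxGroupingCandidates() is illustrative only.
static int maxGroupingCandidates(int smemBytesPerBlock)
{
    int perCandidate = (int)(sizeof(int) + sizeof(int4));   // labels entry + rrects entry
    int bySmem = smemBytesPerBlock / perCandidate;
    return bySmem < 1024 ? bySmem : 1024;
}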
struct Cascade
{
__host__ __device__ __forceinline__ Cascade(const Stage* _stages, int _nstages, const ClNode* _nodes, const float* _leaves,
const int* _subsets, const uchar4* _features, int _subsetSize)
: stages(_stages), nstages(_nstages), nodes(_nodes), leaves(_leaves), subsets(_subsets), features(_features), subsetSize(_subsetSize){}
__device__ __forceinline__ bool operator() (int y, int x, int* integral, const int pitch) const
{
int current_node = 0;
int current_leave = 0;
for (int s = 0; s < nstages; ++s)
{
float sum = 0;
Stage stage = stages[s];
for (int t = 0; t < stage.ntrees; t++)
{
ClNode node = nodes[current_node];
uchar4 feature = features[node.featureIdx];
int shift;
int c = evaluator(integral, (y + feature.y) * pitch + x + feature.x, feature.w * pitch, feature.z, shift);
int idx = (subsets[ current_node * subsetSize + c] & ( 1 << shift)) ? current_leave : current_leave + 1;
sum += leaves[idx];
current_node += 1;
current_leave += 2;
}
if (sum < stage.threshold)
return false;
}
return true;
}
const Stage* stages;
const int nstages;
const ClNode* nodes;
const float* leaves;
const int* subsets;
const uchar4* features;
const int subsetSize;
const LBP evaluator;
};
// stepShift, scale, width_k, sum_prev => y = sum_prev + tid_k / width_k, x = tid_k - tid_k / width_k
__global__ void lbp_cascade(const Cascade cascade, int frameW, int frameH, int windowW, int windowH, float scale, const float factor,
const int total, int* integral, const int pitch, PtrStepSz<int4> objects, unsigned int* classified)
{
int ftid = blockIdx.x * blockDim.x + threadIdx.x;
if (ftid >= total) return;
int step = (scale <= 2.f);
int windowsForLine = (__float2int_rn( __fdividef(frameW, scale)) - windowW) >> step;
int stotal = windowsForLine * ( (__float2int_rn( __fdividef(frameH, scale)) - windowH) >> step);
int wshift = 0;
int scaleTid = ftid;
while (scaleTid >= stotal)
{
scaleTid -= stotal;
wshift += __float2int_rn(__fdividef(frameW, scale)) + 1;
scale *= factor;
step = (scale <= 2.f);
windowsForLine = ( ((__float2int_rn(__fdividef(frameW, scale)) - windowW) >> step));
stotal = windowsForLine * ( (__float2int_rn(__fdividef(frameH, scale)) - windowH) >> step);
}
int y = __fdividef(scaleTid, windowsForLine);
int x = scaleTid - y * windowsForLine;
x <<= step;
y <<= step;
if (cascade(y, x + wshift, integral, pitch))
{
if(x >= __float2int_rn(__fdividef(frameW, scale)) - windowW) return;
int4 rect;
rect.x = __float2int_rn(x * scale);
rect.y = __float2int_rn(y * scale);
rect.z = __float2int_rn(windowW * scale);
rect.w = __float2int_rn(windowH * scale);
int res = atomicInc(classified, (unsigned int)objects.cols);
objects(0, res) = rect;
}
}
void classifyPyramid(int frameW, int frameH, int windowW, int windowH, float initialScale, float factor, int workAmount,
const PtrStepSzb& mstages, const int nstages, const PtrStepSzi& mnodes, const PtrStepSzf& mleaves, const PtrStepSzi& msubsets, const PtrStepSzb& mfeatures,
const int subsetSize, PtrStepSz<int4> objects, unsigned int* classified, PtrStepSzi integral)
{
const int block = 128;
int grid = divUp(workAmount, block);
cudaFuncSetCacheConfig(lbp_cascade, cudaFuncCachePreferL1);
Cascade cascade((Stage*)mstages.ptr(), nstages, (ClNode*)mnodes.ptr(), mleaves.ptr(), msubsets.ptr(), (uchar4*)mfeatures.ptr(), subsetSize);
lbp_cascade<<<grid, block>>>(cascade, frameW, frameH, windowW, windowH, initialScale, factor, workAmount, integral.ptr(), (int)integral.step / sizeof(int), objects, classified);
}
}
}}}
#endif /* CUDA_DISABLER */

View File

@@ -0,0 +1,112 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_DEVICE_LBP_HPP_
#define __OPENCV_GPU_DEVICE_LBP_HPP_
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/emulation.hpp"
namespace cv { namespace cuda { namespace device {
namespace lbp {
struct Stage
{
int first;
int ntrees;
float threshold;
};
struct ClNode
{
int left;
int right;
int featureIdx;
};
struct InSameComponint
{
public:
__device__ __forceinline__ InSameComponint(float _eps) : eps(_eps) {}
__device__ __forceinline__ InSameComponint(const InSameComponint& other) : eps(other.eps) {}
__device__ __forceinline__ bool operator()(const int4& r1, const int4& r2) const
{
float delta = eps * (::min(r1.z, r2.z) + ::min(r1.w, r2.w)) * 0.5f;
return ::abs(r1.x - r2.x) <= delta && ::abs(r1.y - r2.y) <= delta
&& ::abs(r1.x + r1.z - r2.x - r2.z) <= delta && ::abs(r1.y + r1.w - r2.y - r2.w) <= delta;
}
float eps;
};
template<typename Pr>
__device__ __forceinline__ void partition(int4* vec, unsigned int n, int* labels, Pr predicate)
{
unsigned tid = threadIdx.x;
labels[tid] = tid;
__syncthreads();
for (unsigned int id = 0; id < n; id++)
{
if (tid != id && predicate(vec[tid], vec[id]))
{
int p = labels[tid];
int q = labels[id];
if (p < q)
{
Emulation::smem::atomicMin(labels + id, p);
}
else if (p > q)
{
Emulation::smem::atomicMin(labels + tid, q);
}
}
}
__syncthreads();
}
} // lbp
} } }// namespaces
#endif
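The partition() helper above groups candidate detections in shared memory by driving every pair accepted by the predicate towards the smaller of the two labels. A rough single-threaded analogue, useful only for reasoning about the result, might look like the following; the function name is an illustrative assumption.

#include <algorithm>
#include <vector>

// Hypothetical CPU analogue of lbp::partition: every element starts in its own
// group and any pair accepted by the predicate is merged by propagating the
// smaller label, iterating until the labelling stops changing.
template <typename T, typename Pr>
std::vector<int> partitionCpu(const std::vector<T>& vec, Pr predicate)
{
    std::vector<int> labels(vec.size());
    for (size_t i = 0; i < vec.size(); ++i)
        labels[i] = (int)i;

    bool changed = true;
    while (changed)
    {
        changed = false;
        for (size_t i = 0; i < vec.size(); ++i)
            for (size_t j = 0; j < vec.size(); ++j)
                if (i != j && predicate(vec[i], vec[j]))
                {
                    int m = std::min(labels[i], labels[j]);
                    if (labels[i] != m || labels[j] != m)
                    {
                        labels[i] = labels[j] = m;
                        changed = true;
                    }
                }
    }
    return labels;
}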

View File

@@ -0,0 +1,96 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::cuda;
#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
void cv::cuda::compactPoints(GpuMat&, GpuMat&, const GpuMat&) { throw_no_cuda(); }
void cv::cuda::calcWobbleSuppressionMaps(
int, int, int, Size, const Mat&, const Mat&, GpuMat&, GpuMat&) { throw_no_cuda(); }
#else
namespace cv { namespace cuda { namespace device { namespace globmotion {
int compactPoints(int N, float *points0, float *points1, const uchar *mask);
void calcWobbleSuppressionMaps(
int left, int idx, int right, int width, int height,
const float *ml, const float *mr, PtrStepSzf mapx, PtrStepSzf mapy);
}}}}
void cv::cuda::compactPoints(GpuMat &points0, GpuMat &points1, const GpuMat &mask)
{
CV_Assert(points0.rows == 1 && points1.rows == 1 && mask.rows == 1);
CV_Assert(points0.type() == CV_32FC2 && points1.type() == CV_32FC2 && mask.type() == CV_8U);
CV_Assert(points0.cols == mask.cols && points1.cols == mask.cols);
int npoints = points0.cols;
int remaining = cv::cuda::device::globmotion::compactPoints(
npoints, (float*)points0.data, (float*)points1.data, mask.data);
points0 = points0.colRange(0, remaining);
points1 = points1.colRange(0, remaining);
}
void cv::cuda::calcWobbleSuppressionMaps(
int left, int idx, int right, Size size, const Mat &ml, const Mat &mr,
GpuMat &mapx, GpuMat &mapy)
{
CV_Assert(ml.size() == Size(3, 3) && ml.type() == CV_32F && ml.isContinuous());
CV_Assert(mr.size() == Size(3, 3) && mr.type() == CV_32F && mr.isContinuous());
mapx.create(size, CV_32F);
mapy.create(size, CV_32F);
cv::cuda::device::globmotion::calcWobbleSuppressionMaps(
left, idx, right, size.width, size.height,
ml.ptr<float>(), mr.ptr<float>(), mapx, mapy);
}
#endif
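compactPoints() above filters two synchronised single-row point matrices, keeping only the columns whose mask byte is non-zero; calcWobbleSuppressionMaps() fills two CV_32F remap tables from a pair of 3x3 motion matrices. A hedged usage sketch for the first wrapper follows; the helper name and the download step are illustrative assumptions, not part of the module.

#include "opencv2/cuda.hpp"

// Illustrative only: keep just the correspondences whose mask byte is non-zero.
// points0/points1 are 1xN CV_32FC2 and mask is 1xN CV_8U, as asserted above.
void dropOutliers(cv::cuda::GpuMat& points0, cv::cuda::GpuMat& points1,
                  const cv::cuda::GpuMat& mask, cv::Mat& kept0, cv::Mat& kept1)
{
    cv::cuda::compactPoints(points0, points1, mask); // headers now cover only the inliers
    points0.download(kept0);
    points1.download(kept1);
}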

View File

@@ -0,0 +1,282 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
void cv::cuda::graphcut(GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::cuda::graphcut(GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::cuda::connectivityMask(const GpuMat&, GpuMat&, const cv::Scalar&, const cv::Scalar&, Stream&) { throw_no_cuda(); }
void cv::cuda::labelComponents(const GpuMat&, GpuMat&, int, Stream&) { throw_no_cuda(); }
#else /* !defined (HAVE_CUDA) */
namespace cv { namespace cuda { namespace device
{
namespace ccl
{
void labelComponents(const PtrStepSzb& edges, PtrStepSzi comps, int flags, cudaStream_t stream);
template<typename T>
void computeEdges(const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
}
}}}
static float4 scalarToCudaType(const cv::Scalar& in)
{
return make_float4((float)in[0], (float)in[1], (float)in[2], (float)in[3]);
}
void cv::cuda::connectivityMask(const GpuMat& image, GpuMat& mask, const cv::Scalar& lo, const cv::Scalar& hi, Stream& s)
{
CV_Assert(!image.empty());
int ch = image.channels();
CV_Assert(ch <= 4);
int depth = image.depth();
typedef void (*func_t)(const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
static const func_t supportLookup[8][4] =
{ // 1, 2, 3, 4
{ device::ccl::computeEdges<uchar>, 0, device::ccl::computeEdges<uchar3>, device::ccl::computeEdges<uchar4> },// CV_8U
{ 0, 0, 0, 0 },// CV_8S
{ device::ccl::computeEdges<ushort>, 0, device::ccl::computeEdges<ushort3>, device::ccl::computeEdges<ushort4> },// CV_16U
{ 0, 0, 0, 0 },// CV_16S
{ device::ccl::computeEdges<int>, 0, 0, 0 },// CV_32S
{ device::ccl::computeEdges<float>, 0, 0, 0 },// CV_32F
{ 0, 0, 0, 0 },// CV_64F
{ 0, 0, 0, 0 } // CV_USRTYPE1
};
func_t f = supportLookup[depth][ch - 1];
CV_Assert(f);
if (image.size() != mask.size() || mask.type() != CV_8UC1)
mask.create(image.size(), CV_8UC1);
cudaStream_t stream = StreamAccessor::getStream(s);
float4 culo = scalarToCudaType(lo), cuhi = scalarToCudaType(hi);
f(image, mask, culo, cuhi, stream);
}
void cv::cuda::labelComponents(const GpuMat& mask, GpuMat& components, int flags, Stream& s)
{
CV_Assert(!mask.empty() && mask.type() == CV_8U);
if (!deviceSupports(SHARED_ATOMICS))
CV_Error(cv::Error::StsNotImplemented, "The device doesn't support shared atomics and communicative synchronization!");
components.create(mask.size(), CV_32SC1);
cudaStream_t stream = StreamAccessor::getStream(s);
device::ccl::labelComponents(mask, components, flags, stream);
}
namespace
{
typedef NppStatus (*init_func_t)(NppiSize oSize, NppiGraphcutState** ppState, Npp8u* pDeviceMem);
class NppiGraphcutStateHandler
{
public:
NppiGraphcutStateHandler(NppiSize sznpp, Npp8u* pDeviceMem, const init_func_t func)
{
nppSafeCall( func(sznpp, &pState, pDeviceMem) );
}
~NppiGraphcutStateHandler()
{
nppSafeCall( nppiGraphcutFree(pState) );
}
operator NppiGraphcutState*()
{
return pState;
}
private:
NppiGraphcutState* pState;
};
}
void cv::cuda::graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& bottom, GpuMat& labels, GpuMat& buf, Stream& s)
{
#if (CUDA_VERSION < 5000)
CV_Assert(terminals.type() == CV_32S);
#else
CV_Assert(terminals.type() == CV_32S || terminals.type() == CV_32F);
#endif
Size src_size = terminals.size();
CV_Assert(leftTransp.size() == Size(src_size.height, src_size.width));
CV_Assert(leftTransp.type() == terminals.type());
CV_Assert(rightTransp.size() == Size(src_size.height, src_size.width));
CV_Assert(rightTransp.type() == terminals.type());
CV_Assert(top.size() == src_size);
CV_Assert(top.type() == terminals.type());
CV_Assert(bottom.size() == src_size);
CV_Assert(bottom.type() == terminals.type());
labels.create(src_size, CV_8U);
NppiSize sznpp;
sznpp.width = src_size.width;
sznpp.height = src_size.height;
int bufsz;
nppSafeCall( nppiGraphcutGetSize(sznpp, &bufsz) );
ensureSizeIsEnough(1, bufsz, CV_8U, buf);
cudaStream_t stream = StreamAccessor::getStream(s);
NppStreamHandler h(stream);
NppiGraphcutStateHandler state(sznpp, buf.ptr<Npp8u>(), nppiGraphcutInitAlloc);
#if (CUDA_VERSION < 5000)
nppSafeCall( nppiGraphcut_32s8u(terminals.ptr<Npp32s>(), leftTransp.ptr<Npp32s>(), rightTransp.ptr<Npp32s>(), top.ptr<Npp32s>(), bottom.ptr<Npp32s>(),
static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), state) );
#else
if (terminals.type() == CV_32S)
{
nppSafeCall( nppiGraphcut_32s8u(terminals.ptr<Npp32s>(), leftTransp.ptr<Npp32s>(), rightTransp.ptr<Npp32s>(), top.ptr<Npp32s>(), bottom.ptr<Npp32s>(),
static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), state) );
}
else
{
nppSafeCall( nppiGraphcut_32f8u(terminals.ptr<Npp32f>(), leftTransp.ptr<Npp32f>(), rightTransp.ptr<Npp32f>(), top.ptr<Npp32f>(), bottom.ptr<Npp32f>(),
static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), state) );
}
#endif
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void cv::cuda::graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& topLeft, GpuMat& topRight,
GpuMat& bottom, GpuMat& bottomLeft, GpuMat& bottomRight, GpuMat& labels, GpuMat& buf, Stream& s)
{
#if (CUDA_VERSION < 5000)
CV_Assert(terminals.type() == CV_32S);
#else
CV_Assert(terminals.type() == CV_32S || terminals.type() == CV_32F);
#endif
Size src_size = terminals.size();
CV_Assert(leftTransp.size() == Size(src_size.height, src_size.width));
CV_Assert(leftTransp.type() == terminals.type());
CV_Assert(rightTransp.size() == Size(src_size.height, src_size.width));
CV_Assert(rightTransp.type() == terminals.type());
CV_Assert(top.size() == src_size);
CV_Assert(top.type() == terminals.type());
CV_Assert(topLeft.size() == src_size);
CV_Assert(topLeft.type() == terminals.type());
CV_Assert(topRight.size() == src_size);
CV_Assert(topRight.type() == terminals.type());
CV_Assert(bottom.size() == src_size);
CV_Assert(bottom.type() == terminals.type());
CV_Assert(bottomLeft.size() == src_size);
CV_Assert(bottomLeft.type() == terminals.type());
CV_Assert(bottomRight.size() == src_size);
CV_Assert(bottomRight.type() == terminals.type());
labels.create(src_size, CV_8U);
NppiSize sznpp;
sznpp.width = src_size.width;
sznpp.height = src_size.height;
int bufsz;
nppSafeCall( nppiGraphcut8GetSize(sznpp, &bufsz) );
ensureSizeIsEnough(1, bufsz, CV_8U, buf);
cudaStream_t stream = StreamAccessor::getStream(s);
NppStreamHandler h(stream);
NppiGraphcutStateHandler state(sznpp, buf.ptr<Npp8u>(), nppiGraphcut8InitAlloc);
#if (CUDA_VERSION < 5000)
nppSafeCall( nppiGraphcut8_32s8u(terminals.ptr<Npp32s>(), leftTransp.ptr<Npp32s>(), rightTransp.ptr<Npp32s>(),
top.ptr<Npp32s>(), topLeft.ptr<Npp32s>(), topRight.ptr<Npp32s>(),
bottom.ptr<Npp32s>(), bottomLeft.ptr<Npp32s>(), bottomRight.ptr<Npp32s>(),
static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), state) );
#else
if (terminals.type() == CV_32S)
{
nppSafeCall( nppiGraphcut8_32s8u(terminals.ptr<Npp32s>(), leftTransp.ptr<Npp32s>(), rightTransp.ptr<Npp32s>(),
top.ptr<Npp32s>(), topLeft.ptr<Npp32s>(), topRight.ptr<Npp32s>(),
bottom.ptr<Npp32s>(), bottomLeft.ptr<Npp32s>(), bottomRight.ptr<Npp32s>(),
static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), state) );
}
else
{
nppSafeCall( nppiGraphcut8_32f8u(terminals.ptr<Npp32f>(), leftTransp.ptr<Npp32f>(), rightTransp.ptr<Npp32f>(),
top.ptr<Npp32f>(), topLeft.ptr<Npp32f>(), topRight.ptr<Npp32f>(),
bottom.ptr<Npp32f>(), bottomLeft.ptr<Npp32f>(), bottomRight.ptr<Npp32f>(),
static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), state) );
}
#endif
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
#endif /* !defined (HAVE_CUDA) */
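The 4-connected graphcut() wrapper above takes per-pixel terminal capacities plus edge-capacity matrices, with the left/right capacities stored transposed, and writes a CV_8U label mask. A minimal calling sketch under those assumptions follows; the wrapper name is illustrative, and the capacity matrices are assumed to be prepared elsewhere.

#include "opencv2/cuda.hpp"

// Illustrative only: run the NPP-backed 4-neighbourhood graph cut.
// terminals/top/bottom have the image size and type CV_32S (or CV_32F with CUDA 5.0+);
// leftTransp/rightTransp hold the transposed horizontal capacities, as asserted above.
void runGraphCut(cv::cuda::GpuMat& terminals,
                 cv::cuda::GpuMat& leftTransp, cv::cuda::GpuMat& rightTransp,
                 cv::cuda::GpuMat& top, cv::cuda::GpuMat& bottom,
                 cv::cuda::GpuMat& labels)
{
    cv::cuda::GpuMat buf; // scratch buffer, grown internally via ensureSizeIsEnough
    cv::cuda::graphcut(terminals, leftTransp, rightTransp, top, bottom,
                       labels, buf, cv::cuda::Stream::Null());
    // labels now holds a CV_8U segmentation mask of the same size as terminals
}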

modules/cuda/src/hog.cpp (1619 lines, new file)

File diff suppressed because it is too large

View File

@@ -0,0 +1,43 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"

View File

@@ -0,0 +1,60 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_PRECOMP_H__
#define __OPENCV_PRECOMP_H__
#include "opencv2/cuda.hpp"
#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudawarping.hpp"
#include "opencv2/calib3d.hpp"
#include "opencv2/objdetect.hpp"
#include "opencv2/core/private.cuda.hpp"
#include "opencv2/opencv_modules.hpp"
#ifdef HAVE_OPENCV_CUDALEGACY
# include "opencv2/cudalegacy/private.hpp"
#endif
#endif /* __OPENCV_PRECOMP_H__ */

View File

@@ -0,0 +1,190 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "test_precomp.hpp"
#ifdef HAVE_CUDA
using namespace cvtest;
///////////////////////////////////////////////////////////////////////////////////////////////////////
// transformPoints
struct TransformPoints : testing::TestWithParam<cv::cuda::DeviceInfo>
{
cv::cuda::DeviceInfo devInfo;
virtual void SetUp()
{
devInfo = GetParam();
cv::cuda::setDevice(devInfo.deviceID());
}
};
GPU_TEST_P(TransformPoints, Accuracy)
{
cv::Mat src = randomMat(cv::Size(1000, 1), CV_32FC3, 0, 10);
cv::Mat rvec = randomMat(cv::Size(3, 1), CV_32F, 0, 1);
cv::Mat tvec = randomMat(cv::Size(3, 1), CV_32F, 0, 1);
cv::cuda::GpuMat dst;
cv::cuda::transformPoints(loadMat(src), rvec, tvec, dst);
ASSERT_EQ(src.size(), dst.size());
ASSERT_EQ(src.type(), dst.type());
cv::Mat h_dst(dst);
cv::Mat rot;
cv::Rodrigues(rvec, rot);
for (int i = 0; i < h_dst.cols; ++i)
{
cv::Point3f res = h_dst.at<cv::Point3f>(0, i);
cv::Point3f p = src.at<cv::Point3f>(0, i);
cv::Point3f res_gold(
rot.at<float>(0, 0) * p.x + rot.at<float>(0, 1) * p.y + rot.at<float>(0, 2) * p.z + tvec.at<float>(0, 0),
rot.at<float>(1, 0) * p.x + rot.at<float>(1, 1) * p.y + rot.at<float>(1, 2) * p.z + tvec.at<float>(0, 1),
rot.at<float>(2, 0) * p.x + rot.at<float>(2, 1) * p.y + rot.at<float>(2, 2) * p.z + tvec.at<float>(0, 2));
ASSERT_POINT3_NEAR(res_gold, res, 1e-5);
}
}
INSTANTIATE_TEST_CASE_P(GPU_Calib3D, TransformPoints, ALL_DEVICES);
///////////////////////////////////////////////////////////////////////////////////////////////////////
// ProjectPoints
struct ProjectPoints : testing::TestWithParam<cv::cuda::DeviceInfo>
{
cv::cuda::DeviceInfo devInfo;
virtual void SetUp()
{
devInfo = GetParam();
cv::cuda::setDevice(devInfo.deviceID());
}
};
GPU_TEST_P(ProjectPoints, Accuracy)
{
cv::Mat src = randomMat(cv::Size(1000, 1), CV_32FC3, 0, 10);
cv::Mat rvec = randomMat(cv::Size(3, 1), CV_32F, 0, 1);
cv::Mat tvec = randomMat(cv::Size(3, 1), CV_32F, 0, 1);
cv::Mat camera_mat = randomMat(cv::Size(3, 3), CV_32F, 0.5, 1);
camera_mat.at<float>(0, 1) = 0.f;
camera_mat.at<float>(1, 0) = 0.f;
camera_mat.at<float>(2, 0) = 0.f;
camera_mat.at<float>(2, 1) = 0.f;
cv::cuda::GpuMat dst;
cv::cuda::projectPoints(loadMat(src), rvec, tvec, camera_mat, cv::Mat(), dst);
ASSERT_EQ(1, dst.rows);
ASSERT_EQ(MatType(CV_32FC2), MatType(dst.type()));
std::vector<cv::Point2f> dst_gold;
cv::projectPoints(src, rvec, tvec, camera_mat, cv::Mat(1, 8, CV_32F, cv::Scalar::all(0)), dst_gold);
ASSERT_EQ(dst_gold.size(), static_cast<size_t>(dst.cols));
cv::Mat h_dst(dst);
for (size_t i = 0; i < dst_gold.size(); ++i)
{
cv::Point2f res = h_dst.at<cv::Point2f>(0, (int)i);
cv::Point2f res_gold = dst_gold[i];
ASSERT_LE(cv::norm(res_gold - res) / cv::norm(res_gold), 1e-3f);
}
}
INSTANTIATE_TEST_CASE_P(GPU_Calib3D, ProjectPoints, ALL_DEVICES);
///////////////////////////////////////////////////////////////////////////////////////////////////////
// SolvePnPRansac
struct SolvePnPRansac : testing::TestWithParam<cv::cuda::DeviceInfo>
{
cv::cuda::DeviceInfo devInfo;
virtual void SetUp()
{
devInfo = GetParam();
cv::cuda::setDevice(devInfo.deviceID());
}
};
GPU_TEST_P(SolvePnPRansac, Accuracy)
{
cv::Mat object = randomMat(cv::Size(5000, 1), CV_32FC3, 0, 100);
cv::Mat camera_mat = randomMat(cv::Size(3, 3), CV_32F, 0.5, 1);
camera_mat.at<float>(0, 1) = 0.f;
camera_mat.at<float>(1, 0) = 0.f;
camera_mat.at<float>(2, 0) = 0.f;
camera_mat.at<float>(2, 1) = 0.f;
std::vector<cv::Point2f> image_vec;
cv::Mat rvec_gold;
cv::Mat tvec_gold;
rvec_gold = randomMat(cv::Size(3, 1), CV_32F, 0, 1);
tvec_gold = randomMat(cv::Size(3, 1), CV_32F, 0, 1);
cv::projectPoints(object, rvec_gold, tvec_gold, camera_mat, cv::Mat(1, 8, CV_32F, cv::Scalar::all(0)), image_vec);
cv::Mat rvec, tvec;
std::vector<int> inliers;
cv::cuda::solvePnPRansac(object, cv::Mat(1, (int)image_vec.size(), CV_32FC2, &image_vec[0]),
camera_mat, cv::Mat(1, 8, CV_32F, cv::Scalar::all(0)),
rvec, tvec, false, 200, 2.f, 100, &inliers);
ASSERT_LE(cv::norm(rvec - rvec_gold), 1e-3);
ASSERT_LE(cv::norm(tvec - tvec_gold), 1e-3);
}
INSTANTIATE_TEST_CASE_P(GPU_Calib3D, SolvePnPRansac, ALL_DEVICES);
#endif // HAVE_CUDA

View File

@@ -0,0 +1,90 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "test_precomp.hpp"
#ifdef HAVE_CUDA
using namespace std;
using namespace cv;
struct CompactPoints : testing::TestWithParam<cuda::DeviceInfo>
{
virtual void SetUp() { cuda::setDevice(GetParam().deviceID()); }
};
GPU_TEST_P(CompactPoints, CanCompactizeSmallInput)
{
Mat src0(1, 3, CV_32FC2);
src0.at<Point2f>(0,0) = Point2f(0,0);
src0.at<Point2f>(0,1) = Point2f(0,1);
src0.at<Point2f>(0,2) = Point2f(0,2);
Mat src1(1, 3, CV_32FC2);
src1.at<Point2f>(0,0) = Point2f(1,0);
src1.at<Point2f>(0,1) = Point2f(1,1);
src1.at<Point2f>(0,2) = Point2f(1,2);
Mat mask(1, 3, CV_8U);
mask.at<uchar>(0,0) = 1;
mask.at<uchar>(0,1) = 0;
mask.at<uchar>(0,2) = 1;
cuda::GpuMat dsrc0(src0), dsrc1(src1), dmask(mask);
cuda::compactPoints(dsrc0, dsrc1, dmask);
dsrc0.download(src0);
dsrc1.download(src1);
ASSERT_EQ(2, src0.cols);
ASSERT_EQ(2, src1.cols);
ASSERT_TRUE(src0.at<Point2f>(0,0) == Point2f(0,0));
ASSERT_TRUE(src0.at<Point2f>(0,1) == Point2f(0,2));
ASSERT_TRUE(src1.at<Point2f>(0,0) == Point2f(1,0));
ASSERT_TRUE(src1.at<Point2f>(0,1) == Point2f(1,2));
}
INSTANTIATE_TEST_CASE_P(GPU_GlobalMotion, CompactPoints, ALL_DEVICES);
#endif // HAVE_CUDA

View File

@@ -0,0 +1,361 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "test_precomp.hpp"
#ifdef HAVE_CUDA
using namespace cvtest;
////////////////////////////////////////////////////////////////////////////////
// SetTo
PARAM_TEST_CASE(SetTo, cv::cuda::DeviceInfo, cv::Size, MatType, UseRoi)
{
cv::cuda::DeviceInfo devInfo;
cv::Size size;
int type;
bool useRoi;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
size = GET_PARAM(1);
type = GET_PARAM(2);
useRoi = GET_PARAM(3);
cv::cuda::setDevice(devInfo.deviceID());
}
};
GPU_TEST_P(SetTo, Zero)
{
cv::Scalar zero = cv::Scalar::all(0);
cv::cuda::GpuMat mat = createMat(size, type, useRoi);
mat.setTo(zero);
EXPECT_MAT_NEAR(cv::Mat::zeros(size, type), mat, 0.0);
}
GPU_TEST_P(SetTo, SameVal)
{
cv::Scalar val = cv::Scalar::all(randomDouble(0.0, 255.0));
if (CV_MAT_DEPTH(type) == CV_64F && !supportFeature(devInfo, cv::cuda::NATIVE_DOUBLE))
{
try
{
cv::cuda::GpuMat mat = createMat(size, type, useRoi);
mat.setTo(val);
}
catch (const cv::Exception& e)
{
ASSERT_EQ(cv::Error::StsUnsupportedFormat, e.code);
}
}
else
{
cv::cuda::GpuMat mat = createMat(size, type, useRoi);
mat.setTo(val);
EXPECT_MAT_NEAR(cv::Mat(size, type, val), mat, 0.0);
}
}
GPU_TEST_P(SetTo, DifferentVal)
{
cv::Scalar val = randomScalar(0.0, 255.0);
if (CV_MAT_DEPTH(type) == CV_64F && !supportFeature(devInfo, cv::cuda::NATIVE_DOUBLE))
{
try
{
cv::cuda::GpuMat mat = createMat(size, type, useRoi);
mat.setTo(val);
}
catch (const cv::Exception& e)
{
ASSERT_EQ(cv::Error::StsUnsupportedFormat, e.code);
}
}
else
{
cv::cuda::GpuMat mat = createMat(size, type, useRoi);
mat.setTo(val);
EXPECT_MAT_NEAR(cv::Mat(size, type, val), mat, 0.0);
}
}
GPU_TEST_P(SetTo, Masked)
{
cv::Scalar val = randomScalar(0.0, 255.0);
cv::Mat mat_gold = randomMat(size, type);
cv::Mat mask = randomMat(size, CV_8UC1, 0.0, 2.0);
if (CV_MAT_DEPTH(type) == CV_64F && !supportFeature(devInfo, cv::cuda::NATIVE_DOUBLE))
{
try
{
cv::cuda::GpuMat mat = createMat(size, type, useRoi);
mat.setTo(val, loadMat(mask));
}
catch (const cv::Exception& e)
{
ASSERT_EQ(cv::Error::StsUnsupportedFormat, e.code);
}
}
else
{
cv::cuda::GpuMat mat = loadMat(mat_gold, useRoi);
mat.setTo(val, loadMat(mask, useRoi));
mat_gold.setTo(val, mask);
EXPECT_MAT_NEAR(mat_gold, mat, 0.0);
}
}
INSTANTIATE_TEST_CASE_P(GPU_GpuMat, SetTo, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
ALL_TYPES,
WHOLE_SUBMAT));
////////////////////////////////////////////////////////////////////////////////
// CopyTo
PARAM_TEST_CASE(CopyTo, cv::cuda::DeviceInfo, cv::Size, MatType, UseRoi)
{
cv::cuda::DeviceInfo devInfo;
cv::Size size;
int type;
bool useRoi;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
size = GET_PARAM(1);
type = GET_PARAM(2);
useRoi = GET_PARAM(3);
cv::cuda::setDevice(devInfo.deviceID());
}
};
GPU_TEST_P(CopyTo, WithOutMask)
{
cv::Mat src = randomMat(size, type);
cv::cuda::GpuMat d_src = loadMat(src, useRoi);
cv::cuda::GpuMat dst = createMat(size, type, useRoi);
d_src.copyTo(dst);
EXPECT_MAT_NEAR(src, dst, 0.0);
}
GPU_TEST_P(CopyTo, Masked)
{
cv::Mat src = randomMat(size, type);
cv::Mat mask = randomMat(size, CV_8UC1, 0.0, 2.0);
if (CV_MAT_DEPTH(type) == CV_64F && !supportFeature(devInfo, cv::cuda::NATIVE_DOUBLE))
{
try
{
cv::cuda::GpuMat d_src = loadMat(src);
cv::cuda::GpuMat dst;
d_src.copyTo(dst, loadMat(mask, useRoi));
}
catch (const cv::Exception& e)
{
ASSERT_EQ(cv::Error::StsUnsupportedFormat, e.code);
}
}
else
{
cv::cuda::GpuMat d_src = loadMat(src, useRoi);
cv::cuda::GpuMat dst = loadMat(cv::Mat::zeros(size, type), useRoi);
d_src.copyTo(dst, loadMat(mask, useRoi));
cv::Mat dst_gold = cv::Mat::zeros(size, type);
src.copyTo(dst_gold, mask);
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
}
}
INSTANTIATE_TEST_CASE_P(GPU_GpuMat, CopyTo, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
ALL_TYPES,
WHOLE_SUBMAT));
////////////////////////////////////////////////////////////////////////////////
// ConvertTo
PARAM_TEST_CASE(ConvertTo, cv::cuda::DeviceInfo, cv::Size, MatDepth, MatDepth, UseRoi)
{
cv::cuda::DeviceInfo devInfo;
cv::Size size;
int depth1;
int depth2;
bool useRoi;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
size = GET_PARAM(1);
depth1 = GET_PARAM(2);
depth2 = GET_PARAM(3);
useRoi = GET_PARAM(4);
cv::cuda::setDevice(devInfo.deviceID());
}
};
GPU_TEST_P(ConvertTo, WithOutScaling)
{
cv::Mat src = randomMat(size, depth1);
if ((depth1 == CV_64F || depth2 == CV_64F) && !supportFeature(devInfo, cv::cuda::NATIVE_DOUBLE))
{
try
{
cv::cuda::GpuMat d_src = loadMat(src);
cv::cuda::GpuMat dst;
d_src.convertTo(dst, depth2);
}
catch (const cv::Exception& e)
{
ASSERT_EQ(cv::Error::StsUnsupportedFormat, e.code);
}
}
else
{
cv::cuda::GpuMat d_src = loadMat(src, useRoi);
cv::cuda::GpuMat dst = createMat(size, depth2, useRoi);
d_src.convertTo(dst, depth2);
cv::Mat dst_gold;
src.convertTo(dst_gold, depth2);
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
}
}
GPU_TEST_P(ConvertTo, WithScaling)
{
cv::Mat src = randomMat(size, depth1);
double a = randomDouble(0.0, 1.0);
double b = randomDouble(-10.0, 10.0);
if ((depth1 == CV_64F || depth2 == CV_64F) && !supportFeature(devInfo, cv::cuda::NATIVE_DOUBLE))
{
try
{
cv::cuda::GpuMat d_src = loadMat(src);
cv::cuda::GpuMat dst;
d_src.convertTo(dst, depth2, a, b);
}
catch (const cv::Exception& e)
{
ASSERT_EQ(cv::Error::StsUnsupportedFormat, e.code);
}
}
else
{
cv::cuda::GpuMat d_src = loadMat(src, useRoi);
cv::cuda::GpuMat dst = createMat(size, depth2, useRoi);
d_src.convertTo(dst, depth2, a, b);
cv::Mat dst_gold;
src.convertTo(dst_gold, depth2, a, b);
EXPECT_MAT_NEAR(dst_gold, dst, depth2 < CV_32F ? 1.0 : 1e-4);
}
}
INSTANTIATE_TEST_CASE_P(GPU_GpuMat, ConvertTo, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
ALL_DEPTH,
ALL_DEPTH,
WHOLE_SUBMAT));
////////////////////////////////////////////////////////////////////////////////
// ensureSizeIsEnough
struct EnsureSizeIsEnough : testing::TestWithParam<cv::cuda::DeviceInfo>
{
virtual void SetUp()
{
cv::cuda::DeviceInfo devInfo = GetParam();
cv::cuda::setDevice(devInfo.deviceID());
}
};
GPU_TEST_P(EnsureSizeIsEnough, BufferReuse)
{
cv::cuda::GpuMat buffer(100, 100, CV_8U);
cv::cuda::GpuMat old = buffer;
// don't reallocate memory
cv::cuda::ensureSizeIsEnough(10, 20, CV_8U, buffer);
EXPECT_EQ(10, buffer.rows);
EXPECT_EQ(20, buffer.cols);
EXPECT_EQ(CV_8UC1, buffer.type());
EXPECT_EQ(reinterpret_cast<intptr_t>(old.data), reinterpret_cast<intptr_t>(buffer.data));
// don't reallocate memory
cv::cuda::ensureSizeIsEnough(20, 30, CV_8U, buffer);
EXPECT_EQ(20, buffer.rows);
EXPECT_EQ(30, buffer.cols);
EXPECT_EQ(CV_8UC1, buffer.type());
EXPECT_EQ(reinterpret_cast<intptr_t>(old.data), reinterpret_cast<intptr_t>(buffer.data));
}
INSTANTIATE_TEST_CASE_P(GPU_GpuMat, EnsureSizeIsEnough, ALL_DEVICES);
#endif // HAVE_CUDA
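The BufferReuse test above relies on ensureSizeIsEnough() keeping the existing allocation whenever it is already large enough. A hedged sketch of the usual reuse pattern in application code follows; the wrapper function is an illustrative assumption.

#include <vector>
#include "opencv2/cuda.hpp"

// Illustrative only: reuse a single GpuMat allocation for frames of varying size.
void processFrames(const std::vector<cv::cuda::GpuMat>& frames, cv::cuda::GpuMat& scratch)
{
    for (size_t i = 0; i < frames.size(); ++i)
    {
        // No reallocation happens as long as the old buffer is already big enough.
        cv::cuda::ensureSizeIsEnough(frames[i].rows, frames[i].cols, frames[i].type(), scratch);
        frames[i].copyTo(scratch);
        // ... further CUDA processing on 'scratch' ...
    }
}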

View File

@@ -0,0 +1,197 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "test_precomp.hpp"
#ifdef HAVE_CUDA
namespace
{
struct GreedyLabeling
{
struct dot
{
int x;
int y;
static dot make(int i, int j)
{
dot d; d.x = i; d.y = j;
return d;
}
};
struct InInterval
{
InInterval(const int& _lo, const int& _hi) : lo(-_lo), hi(_hi) {};
const int lo, hi;
bool operator() (const unsigned char a, const unsigned char b) const
{
int d = a - b;
return lo <= d && d <= hi;
}
};
GreedyLabeling(cv::Mat img)
: image(img), _labels(image.size(), CV_32SC1, cv::Scalar::all(-1)) {}
void operator() (cv::Mat labels) const
{
InInterval inInt(0, 2);
dot* stack = new dot[image.cols * image.rows];
int cc = -1;
int* dist_labels = (int*)labels.data;
int pitch = (int) labels.step1();
unsigned char* source = (unsigned char*)image.data;
int width = image.cols;
int height = image.rows;
int step1 = (int)image.step1();
for (int j = 0; j < image.rows; ++j)
for (int i = 0; i < image.cols; ++i)
{
if (dist_labels[j * pitch + i] != -1) continue;
dot* top = stack;
dot p = dot::make(i, j);
cc++;
dist_labels[j * pitch + i] = cc;
while (top >= stack)
{
int* dl = &dist_labels[p.y * pitch + p.x];
unsigned char* sp = &source[p.y * step1 + p.x];
dl[0] = cc;
//right
if( p.x < (width - 1) && dl[ +1] == -1 && inInt(sp[0], sp[+1]))
*top++ = dot::make(p.x + 1, p.y);
//left
if( p.x > 0 && dl[-1] == -1 && inInt(sp[0], sp[-1]))
*top++ = dot::make(p.x - 1, p.y);
//bottom
if( p.y < (height - 1) && dl[+pitch] == -1 && inInt(sp[0], sp[+step1]))
*top++ = dot::make(p.x, p.y + 1);
//top
if( p.y > 0 && dl[-pitch] == -1 && inInt(sp[0], sp[-step1]))
*top++ = dot::make(p.x, p.y - 1);
p = *--top;
}
}
delete[] stack;
}
void checkCorrectness(cv::Mat gpu)
{
cv::Mat diff = gpu - _labels;
int outliers = 0;
for (int j = 0; j < image.rows; ++j)
for (int i = 0; i < image.cols - 1; ++i)
{
if ( (_labels.at<int>(j,i) == gpu.at<int>(j,i + 1)) && (diff.at<int>(j, i) != diff.at<int>(j,i + 1)))
{
outliers++;
}
}
ASSERT_TRUE(outliers < gpu.cols + gpu.rows);
}
cv::Mat image;
cv::Mat _labels;
};
}
struct Labeling : testing::TestWithParam<cv::cuda::DeviceInfo>
{
cv::cuda::DeviceInfo devInfo;
virtual void SetUp()
{
devInfo = GetParam();
cv::cuda::setDevice(devInfo.deviceID());
}
cv::Mat load_image()
{
return cv::imread(std::string( cvtest::TS::ptr()->get_data_path() ) + "labeling/label.png");
}
};
GPU_TEST_P(Labeling, DISABLED_ConnectedComponents)
{
cv::Mat image;
cvtColor(load_image(), image, cv::COLOR_BGR2GRAY);
cv::threshold(image, image, 150, 255, cv::THRESH_BINARY);
ASSERT_TRUE(image.type() == CV_8UC1);
GreedyLabeling host(image);
host(host._labels);
cv::cuda::GpuMat mask;
mask.create(image.rows, image.cols, CV_8UC1);
cv::cuda::GpuMat components;
components.create(image.rows, image.cols, CV_32SC1);
cv::cuda::connectivityMask(cv::cuda::GpuMat(image), mask, cv::Scalar::all(0), cv::Scalar::all(2));
cv::cuda::labelComponents(mask, components);
host.checkCorrectness(cv::Mat(components));
}
INSTANTIATE_TEST_CASE_P(GPU_ConnectedComponents, Labeling, ALL_DEVICES);
#endif // HAVE_CUDA

View File

@@ -0,0 +1,45 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "test_precomp.hpp"
CV_GPU_TEST_MAIN("gpu")

View File

@@ -0,0 +1,427 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "test_precomp.hpp"
#ifdef HAVE_CUDA
using namespace cvtest;
//#define DUMP
struct HOG : testing::TestWithParam<cv::cuda::DeviceInfo>, cv::cuda::HOGDescriptor
{
cv::cuda::DeviceInfo devInfo;
#ifdef DUMP
std::ofstream f;
#else
std::ifstream f;
#endif
int wins_per_img_x;
int wins_per_img_y;
int blocks_per_win_x;
int blocks_per_win_y;
int block_hist_size;
virtual void SetUp()
{
devInfo = GetParam();
cv::cuda::setDevice(devInfo.deviceID());
}
#ifdef DUMP
void dump(const cv::Mat& blockHists, const std::vector<cv::Point>& locations)
{
f.write((char*)&blockHists.rows, sizeof(blockHists.rows));
f.write((char*)&blockHists.cols, sizeof(blockHists.cols));
for (int i = 0; i < blockHists.rows; ++i)
{
for (int j = 0; j < blockHists.cols; ++j)
{
float val = blockHists.at<float>(i, j);
f.write((char*)&val, sizeof(val));
}
}
int nlocations = locations.size();
f.write((char*)&nlocations, sizeof(nlocations));
for (int i = 0; i < locations.size(); ++i)
f.write((char*)&locations[i], sizeof(locations[i]));
}
#else
void compare(const cv::Mat& blockHists, const std::vector<cv::Point>& locations)
{
int rows, cols;
f.read((char*)&rows, sizeof(rows));
f.read((char*)&cols, sizeof(cols));
ASSERT_EQ(rows, blockHists.rows);
ASSERT_EQ(cols, blockHists.cols);
for (int i = 0; i < blockHists.rows; ++i)
{
for (int j = 0; j < blockHists.cols; ++j)
{
float val;
f.read((char*)&val, sizeof(val));
ASSERT_NEAR(val, blockHists.at<float>(i, j), 1e-3);
}
}
int nlocations;
f.read((char*)&nlocations, sizeof(nlocations));
ASSERT_EQ(nlocations, static_cast<int>(locations.size()));
for (int i = 0; i < nlocations; ++i)
{
cv::Point location;
f.read((char*)&location, sizeof(location));
ASSERT_EQ(location, locations[i]);
}
}
#endif
void testDetect(const cv::Mat& img)
{
gamma_correction = false;
setSVMDetector(cv::cuda::HOGDescriptor::getDefaultPeopleDetector());
std::vector<cv::Point> locations;
// Test detect
detect(loadMat(img), locations, 0);
#ifdef DUMP
dump(cv::Mat(block_hists), locations);
#else
compare(cv::Mat(block_hists), locations);
#endif
// Test detect on a smaller image
cv::Mat img2;
cv::resize(img, img2, cv::Size(img.cols / 2, img.rows / 2));
detect(loadMat(img2), locations, 0);
#ifdef DUMP
dump(cv::Mat(block_hists), locations);
#else
compare(cv::Mat(block_hists), locations);
#endif
// Test detect on a larger image
cv::resize(img, img2, cv::Size(img.cols * 2, img.rows * 2));
detect(loadMat(img2), locations, 0);
#ifdef DUMP
dump(cv::Mat(block_hists), locations);
#else
compare(cv::Mat(block_hists), locations);
#endif
}
// Does not compare border values, as interpolation introduces a delta
void compare_inner_parts(cv::Mat d1, cv::Mat d2)
{
for (int i = 1; i < blocks_per_win_y - 1; ++i)
for (int j = 1; j < blocks_per_win_x - 1; ++j)
for (int k = 0; k < block_hist_size; ++k)
{
float a = d1.at<float>(0, (i * blocks_per_win_x + j) * block_hist_size);
float b = d2.at<float>(0, (i * blocks_per_win_x + j) * block_hist_size);
ASSERT_FLOAT_EQ(a, b);
}
}
};
// disabled until resize is fixed
GPU_TEST_P(HOG, Detect)
{
cv::Mat img_rgb = readImage("hog/road.png");
ASSERT_FALSE(img_rgb.empty());
#ifdef DUMP
f.open((std::string(cvtest::TS::ptr()->get_data_path()) + "hog/expected_output.bin").c_str(), std::ios_base::binary);
ASSERT_TRUE(f.is_open());
#else
f.open((std::string(cvtest::TS::ptr()->get_data_path()) + "hog/expected_output.bin").c_str(), std::ios_base::binary);
ASSERT_TRUE(f.is_open());
#endif
// Test on color image
cv::Mat img;
cv::cvtColor(img_rgb, img, cv::COLOR_BGR2BGRA);
testDetect(img);
// Test on gray image
cv::cvtColor(img_rgb, img, cv::COLOR_BGR2GRAY);
testDetect(img);
f.close();
}
GPU_TEST_P(HOG, GetDescriptors)
{
// Load image (e.g. train data, composed from windows)
cv::Mat img_rgb = readImage("hog/train_data.png");
ASSERT_FALSE(img_rgb.empty());
// Convert to C4
cv::Mat img;
cv::cvtColor(img_rgb, img, cv::COLOR_BGR2BGRA);
cv::cuda::GpuMat d_img(img);
// Convert train images into feature vectors (train table)
cv::cuda::GpuMat descriptors, descriptors_by_cols;
getDescriptors(d_img, win_size, descriptors, DESCR_FORMAT_ROW_BY_ROW);
getDescriptors(d_img, win_size, descriptors_by_cols, DESCR_FORMAT_COL_BY_COL);
// Check size of the result train table
wins_per_img_x = 3;
wins_per_img_y = 2;
blocks_per_win_x = 7;
blocks_per_win_y = 15;
block_hist_size = 36;
cv::Size descr_size_expected = cv::Size(blocks_per_win_x * blocks_per_win_y * block_hist_size,
wins_per_img_x * wins_per_img_y);
ASSERT_EQ(descr_size_expected, descriptors.size());
// Check both formats of output descriptors are handled correctly
cv::Mat dr(descriptors);
cv::Mat dc(descriptors_by_cols);
for (int i = 0; i < wins_per_img_x * wins_per_img_y; ++i)
{
const float* l = dr.rowRange(i, i + 1).ptr<float>();
const float* r = dc.rowRange(i, i + 1).ptr<float>();
for (int y = 0; y < blocks_per_win_y; ++y)
for (int x = 0; x < blocks_per_win_x; ++x)
for (int k = 0; k < block_hist_size; ++k)
ASSERT_EQ(l[(y * blocks_per_win_x + x) * block_hist_size + k],
r[(x * blocks_per_win_y + y) * block_hist_size + k]);
}
/* Now we want to extract the same feature vectors, but from single images. NOTE: the results will
be different, due to border value interpolation. Using many small images is slower; however, we
won't call getDescriptors and will use computeBlockHistograms instead. computeBlockHistograms
works well, as can be checked in the gpu_hog sample */
img_rgb = readImage("hog/positive1.png");
ASSERT_TRUE(!img_rgb.empty());
cv::cvtColor(img_rgb, img, cv::COLOR_BGR2BGRA);
computeBlockHistograms(cv::cuda::GpuMat(img));
// Everything is fine with interpolation for the top-left subimage
ASSERT_EQ(0.0, cv::norm((cv::Mat)block_hists, (cv::Mat)descriptors.rowRange(0, 1)));
img_rgb = readImage("hog/positive2.png");
ASSERT_TRUE(!img_rgb.empty());
cv::cvtColor(img_rgb, img, cv::COLOR_BGR2BGRA);
computeBlockHistograms(cv::cuda::GpuMat(img));
compare_inner_parts(cv::Mat(block_hists), cv::Mat(descriptors.rowRange(1, 2)));
img_rgb = readImage("hog/negative1.png");
ASSERT_TRUE(!img_rgb.empty());
cv::cvtColor(img_rgb, img, cv::COLOR_BGR2BGRA);
computeBlockHistograms(cv::cuda::GpuMat(img));
compare_inner_parts(cv::Mat(block_hists), cv::Mat(descriptors.rowRange(2, 3)));
img_rgb = readImage("hog/negative2.png");
ASSERT_TRUE(!img_rgb.empty());
cv::cvtColor(img_rgb, img, cv::COLOR_BGR2BGRA);
computeBlockHistograms(cv::cuda::GpuMat(img));
compare_inner_parts(cv::Mat(block_hists), cv::Mat(descriptors.rowRange(3, 4)));
img_rgb = readImage("hog/positive3.png");
ASSERT_TRUE(!img_rgb.empty());
cv::cvtColor(img_rgb, img, cv::COLOR_BGR2BGRA);
computeBlockHistograms(cv::cuda::GpuMat(img));
compare_inner_parts(cv::Mat(block_hists), cv::Mat(descriptors.rowRange(4, 5)));
img_rgb = readImage("hog/negative3.png");
ASSERT_TRUE(!img_rgb.empty());
cv::cvtColor(img_rgb, img, cv::COLOR_BGR2BGRA);
computeBlockHistograms(cv::cuda::GpuMat(img));
compare_inner_parts(cv::Mat(block_hists), cv::Mat(descriptors.rowRange(5, 6)));
}
INSTANTIATE_TEST_CASE_P(GPU_ObjDetect, HOG, ALL_DEVICES);
//============== caltech hog tests =====================//
struct CalTech : public ::testing::TestWithParam<std::tr1::tuple<cv::cuda::DeviceInfo, std::string> >
{
cv::cuda::DeviceInfo devInfo;
cv::Mat img;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
cv::cuda::setDevice(devInfo.deviceID());
img = readImage(GET_PARAM(1), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
}
};
GPU_TEST_P(CalTech, HOG)
{
cv::cuda::GpuMat d_img(img);
cv::Mat markedImage(img.clone());
cv::cuda::HOGDescriptor d_hog;
d_hog.setSVMDetector(cv::cuda::HOGDescriptor::getDefaultPeopleDetector());
d_hog.nlevels = d_hog.nlevels + 32;
std::vector<cv::Rect> found_locations;
d_hog.detectMultiScale(d_img, found_locations);
#if defined (LOG_CASCADE_STATISTIC)
for (int i = 0; i < (int)found_locations.size(); i++)
{
cv::Rect r = found_locations[i];
std::cout << r.x << " " << r.y << " " << r.width << " " << r.height << std::endl;
cv::rectangle(markedImage, r , CV_RGB(255, 0, 0));
}
cv::imshow("Res", markedImage); cv::waitKey();
#endif
}
INSTANTIATE_TEST_CASE_P(detect, CalTech, testing::Combine(ALL_DEVICES,
::testing::Values<std::string>("caltech/image_00000009_0.png", "caltech/image_00000032_0.png",
"caltech/image_00000165_0.png", "caltech/image_00000261_0.png", "caltech/image_00000469_0.png",
"caltech/image_00000527_0.png", "caltech/image_00000574_0.png")));
//////////////////////////////////////////////////////////////////////////////////////////
/// LBP classifier
PARAM_TEST_CASE(LBP_Read_classifier, cv::cuda::DeviceInfo, int)
{
cv::cuda::DeviceInfo devInfo;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
cv::cuda::setDevice(devInfo.deviceID());
}
};
GPU_TEST_P(LBP_Read_classifier, Accuracy)
{
cv::cuda::CascadeClassifier_GPU classifier;
std::string classifierXmlPath = std::string(cvtest::TS::ptr()->get_data_path()) + "lbpcascade/lbpcascade_frontalface.xml";
ASSERT_TRUE(classifier.load(classifierXmlPath));
}
INSTANTIATE_TEST_CASE_P(GPU_ObjDetect, LBP_Read_classifier,
testing::Combine(ALL_DEVICES, testing::Values<int>(0)));
PARAM_TEST_CASE(LBP_classify, cv::cuda::DeviceInfo, int)
{
cv::cuda::DeviceInfo devInfo;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
cv::cuda::setDevice(devInfo.deviceID());
}
};
GPU_TEST_P(LBP_classify, Accuracy)
{
std::string classifierXmlPath = std::string(cvtest::TS::ptr()->get_data_path()) + "lbpcascade/lbpcascade_frontalface.xml";
std::string imagePath = std::string(cvtest::TS::ptr()->get_data_path()) + "lbpcascade/er.png";
cv::CascadeClassifier cpuClassifier(classifierXmlPath);
ASSERT_FALSE(cpuClassifier.empty());
cv::Mat image = cv::imread(imagePath);
ASSERT_FALSE(image.empty());
image = image.colRange(0, image.cols/2);
cv::Mat grey;
cvtColor(image, grey, cv::COLOR_BGR2GRAY);
std::vector<cv::Rect> rects;
cpuClassifier.detectMultiScale(grey, rects);
cv::Mat markedImage = image.clone();
std::vector<cv::Rect>::iterator it = rects.begin();
for (; it != rects.end(); ++it)
cv::rectangle(markedImage, *it, cv::Scalar(255, 0, 0));
cv::cuda::CascadeClassifier_GPU gpuClassifier;
ASSERT_TRUE(gpuClassifier.load(classifierXmlPath));
cv::cuda::GpuMat gpu_rects;
cv::cuda::GpuMat tested(grey);
int count = gpuClassifier.detectMultiScale(tested, gpu_rects);
#if defined (LOG_CASCADE_STATISTIC)
cv::Mat downloaded(gpu_rects);
const cv::Rect* faces = downloaded.ptr<cv::Rect>();
for (int i = 0; i < count; i++)
{
cv::Rect r = faces[i];
std::cout << r.x << " " << r.y << " " << r.width << " " << r.height << std::endl;
cv::rectangle(markedImage, r , CV_RGB(255, 0, 0));
}
cv::imshow("Res", markedImage); cv::waitKey();
#endif
(void)count;
}
INSTANTIATE_TEST_CASE_P(GPU_ObjDetect, LBP_classify,
testing::Combine(ALL_DEVICES, testing::Values<int>(0)));
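// --- Illustrative sketch (not part of the original commit) --------------------------------
// Standalone outline of the GPU cascade usage checked above: load an LBP cascade, run
// detection on a GpuMat, and download the packed rectangles. The file paths are hypothetical.
static void lbp_gpu_detect_sketch()
{
    cv::cuda::CascadeClassifier_GPU classifier;
    if (!classifier.load("lbpcascade_frontalface.xml")) // hypothetical path
        return;

    cv::Mat grey = cv::imread("face.png", cv::IMREAD_GRAYSCALE); // hypothetical input
    if (grey.empty())
        return;

    cv::cuda::GpuMat d_grey(grey), d_rects;
    int count = classifier.detectMultiScale(d_grey, d_rects);

    // detectMultiScale packs the detections into a single-row GpuMat;
    // download it and read the first `count` entries as cv::Rect.
    cv::Mat rects(d_rects);
    const cv::Rect* faces = rects.ptr<cv::Rect>();
    for (int i = 0; i < count; ++i)
        cv::rectangle(grey, faces[i], cv::Scalar::all(255));
}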
#endif // HAVE_CUDA

View File

@@ -0,0 +1,452 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "test_precomp.hpp"
#if defined(HAVE_CUDA) && defined(HAVE_OPENGL)
using namespace cvtest;
/////////////////////////////////////////////
// Buffer
PARAM_TEST_CASE(Buffer, cv::Size, MatType)
{
static void SetUpTestCase()
{
cv::namedWindow("test", cv::WINDOW_OPENGL);
}
static void TearDownTestCase()
{
cv::destroyAllWindows();
}
cv::Size size;
int type;
virtual void SetUp()
{
size = GET_PARAM(0);
type = GET_PARAM(1);
}
};
GPU_TEST_P(Buffer, Constructor1)
{
cv::ogl::Buffer buf(size.height, size.width, type, cv::ogl::Buffer::ARRAY_BUFFER, true);
EXPECT_EQ(size.height, buf.rows());
EXPECT_EQ(size.width, buf.cols());
EXPECT_EQ(type, buf.type());
}
GPU_TEST_P(Buffer, Constructor2)
{
cv::ogl::Buffer buf(size, type, cv::ogl::Buffer::ARRAY_BUFFER, true);
EXPECT_EQ(size.height, buf.rows());
EXPECT_EQ(size.width, buf.cols());
EXPECT_EQ(type, buf.type());
}
GPU_TEST_P(Buffer, ConstructorFromMat)
{
cv::Mat gold = randomMat(size, type);
cv::ogl::Buffer buf(gold, cv::ogl::Buffer::ARRAY_BUFFER, true);
cv::Mat bufData;
buf.copyTo(bufData);
EXPECT_MAT_NEAR(gold, bufData, 0);
}
GPU_TEST_P(Buffer, ConstructorFromGpuMat)
{
cv::Mat gold = randomMat(size, type);
cv::cuda::GpuMat d_gold(gold);
cv::ogl::Buffer buf(d_gold, cv::ogl::Buffer::ARRAY_BUFFER);
cv::Mat bufData;
buf.copyTo(bufData);
EXPECT_MAT_NEAR(gold, bufData, 0);
}
GPU_TEST_P(Buffer, ConstructorFromBuffer)
{
cv::ogl::Buffer buf_gold(size, type, cv::ogl::Buffer::ARRAY_BUFFER, true);
cv::ogl::Buffer buf(buf_gold);
EXPECT_EQ(buf_gold.bufId(), buf.bufId());
EXPECT_EQ(buf_gold.rows(), buf.rows());
EXPECT_EQ(buf_gold.cols(), buf.cols());
EXPECT_EQ(buf_gold.type(), buf.type());
}
GPU_TEST_P(Buffer, Create)
{
cv::ogl::Buffer buf;
buf.create(size.height, size.width, type, cv::ogl::Buffer::ARRAY_BUFFER, true);
EXPECT_EQ(size.height, buf.rows());
EXPECT_EQ(size.width, buf.cols());
EXPECT_EQ(type, buf.type());
}
GPU_TEST_P(Buffer, CopyFromMat)
{
cv::Mat gold = randomMat(size, type);
cv::ogl::Buffer buf;
buf.copyFrom(gold, cv::ogl::Buffer::ARRAY_BUFFER, true);
cv::Mat bufData;
buf.copyTo(bufData);
EXPECT_MAT_NEAR(gold, bufData, 0);
}
GPU_TEST_P(Buffer, CopyFromGpuMat)
{
cv::Mat gold = randomMat(size, type);
cv::cuda::GpuMat d_gold(gold);
cv::ogl::Buffer buf;
buf.copyFrom(d_gold, cv::ogl::Buffer::ARRAY_BUFFER, true);
cv::Mat bufData;
buf.copyTo(bufData);
EXPECT_MAT_NEAR(gold, bufData, 0);
}
GPU_TEST_P(Buffer, CopyFromBuffer)
{
cv::Mat gold = randomMat(size, type);
cv::ogl::Buffer buf_gold(gold, cv::ogl::Buffer::ARRAY_BUFFER, true);
cv::ogl::Buffer buf;
buf.copyFrom(buf_gold, cv::ogl::Buffer::ARRAY_BUFFER, true);
EXPECT_NE(buf_gold.bufId(), buf.bufId());
cv::Mat bufData;
buf.copyTo(bufData);
EXPECT_MAT_NEAR(gold, bufData, 0);
}
GPU_TEST_P(Buffer, CopyToGpuMat)
{
cv::Mat gold = randomMat(size, type);
cv::ogl::Buffer buf(gold, cv::ogl::Buffer::ARRAY_BUFFER, true);
cv::cuda::GpuMat dst;
buf.copyTo(dst);
EXPECT_MAT_NEAR(gold, dst, 0);
}
GPU_TEST_P(Buffer, CopyToBuffer)
{
cv::Mat gold = randomMat(size, type);
cv::ogl::Buffer buf(gold, cv::ogl::Buffer::ARRAY_BUFFER, true);
cv::ogl::Buffer dst;
buf.copyTo(dst);
dst.setAutoRelease(true);
EXPECT_NE(buf.bufId(), dst.bufId());
cv::Mat bufData;
dst.copyTo(bufData);
EXPECT_MAT_NEAR(gold, bufData, 0);
}
GPU_TEST_P(Buffer, Clone)
{
cv::Mat gold = randomMat(size, type);
cv::ogl::Buffer buf(gold, cv::ogl::Buffer::ARRAY_BUFFER, true);
cv::ogl::Buffer dst = buf.clone(cv::ogl::Buffer::ARRAY_BUFFER, true);
EXPECT_NE(buf.bufId(), dst.bufId());
cv::Mat bufData;
dst.copyTo(bufData);
EXPECT_MAT_NEAR(gold, bufData, 0);
}
GPU_TEST_P(Buffer, MapHostRead)
{
cv::Mat gold = randomMat(size, type);
cv::ogl::Buffer buf(gold, cv::ogl::Buffer::ARRAY_BUFFER, true);
cv::Mat dst = buf.mapHost(cv::ogl::Buffer::READ_ONLY);
EXPECT_MAT_NEAR(gold, dst, 0);
buf.unmapHost();
}
GPU_TEST_P(Buffer, MapHostWrite)
{
cv::Mat gold = randomMat(size, type);
cv::ogl::Buffer buf(size, type, cv::ogl::Buffer::ARRAY_BUFFER, true);
cv::Mat dst = buf.mapHost(cv::ogl::Buffer::WRITE_ONLY);
gold.copyTo(dst);
buf.unmapHost();
dst.release();
cv::Mat bufData;
buf.copyTo(bufData);
EXPECT_MAT_NEAR(gold, bufData, 0);
}
GPU_TEST_P(Buffer, MapDevice)
{
cv::Mat gold = randomMat(size, type);
cv::ogl::Buffer buf(gold, cv::ogl::Buffer::ARRAY_BUFFER, true);
cv::cuda::GpuMat dst = buf.mapDevice();
EXPECT_MAT_NEAR(gold, dst, 0);
buf.unmapDevice();
}
INSTANTIATE_TEST_CASE_P(OpenGL, Buffer, testing::Combine(DIFFERENT_SIZES, ALL_TYPES));
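// --- Illustrative sketch (not part of the original commit) --------------------------------
// The interop pattern the Buffer tests above exercise, in one place: upload a Mat into an
// OpenGL buffer object, map it either on the host or as a GpuMat for CUDA-side processing,
// and unmap before OpenGL touches it again.
static void ogl_buffer_interop_sketch(const cv::Mat& src)
{
    cv::ogl::Buffer buf(src, cv::ogl::Buffer::ARRAY_BUFFER, true);

    // Host mapping: host_view aliases the buffer memory until unmapHost() is called.
    cv::Mat host_view = buf.mapHost(cv::ogl::Buffer::WRITE_ONLY);
    host_view.setTo(cv::Scalar::all(0));
    buf.unmapHost();

    // Device mapping: the GpuMat aliases the buffer until unmapDevice() is called.
    cv::cuda::GpuMat d_view = buf.mapDevice();
    d_view.setTo(cv::Scalar::all(1));
    buf.unmapDevice();
}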
/////////////////////////////////////////////
// Texture2D
PARAM_TEST_CASE(Texture2D, cv::Size, MatType)
{
static void SetUpTestCase()
{
cv::namedWindow("test", cv::WINDOW_OPENGL);
}
static void TearDownTestCase()
{
cv::destroyAllWindows();
}
cv::Size size;
int type;
int depth;
int cn;
cv::ogl::Texture2D::Format format;
virtual void SetUp()
{
size = GET_PARAM(0);
type = GET_PARAM(1);
depth = CV_MAT_DEPTH(type);
cn = CV_MAT_CN(type);
format = cn == 1 ? cv::ogl::Texture2D::DEPTH_COMPONENT :
         cn == 3 ? cv::ogl::Texture2D::RGB :
         cn == 4 ? cv::ogl::Texture2D::RGBA : cv::ogl::Texture2D::NONE;
}
};
GPU_TEST_P(Texture2D, Constructor1)
{
cv::ogl::Texture2D tex(size.height, size.width, format, true);
EXPECT_EQ(size.height, tex.rows());
EXPECT_EQ(size.width, tex.cols());
EXPECT_EQ(format, tex.format());
}
GPU_TEST_P(Texture2D, Constructor2)
{
cv::ogl::Texture2D tex(size, format, true);
EXPECT_EQ(size.height, tex.rows());
EXPECT_EQ(size.width, tex.cols());
EXPECT_EQ(format, tex.format());
}
GPU_TEST_P(Texture2D, ConstructorFromMat)
{
cv::Mat gold = randomMat(size, type, 0, depth == CV_8U ? 255 : 1);
cv::ogl::Texture2D tex(gold, true);
cv::Mat texData;
tex.copyTo(texData, depth);
EXPECT_MAT_NEAR(gold, texData, 1e-2);
}
GPU_TEST_P(Texture2D, ConstructorFromGpuMat)
{
cv::Mat gold = randomMat(size, type, 0, depth == CV_8U ? 255 : 1);
cv::cuda::GpuMat d_gold(gold);
cv::ogl::Texture2D tex(d_gold, true);
cv::Mat texData;
tex.copyTo(texData, depth);
EXPECT_MAT_NEAR(gold, texData, 1e-2);
}
GPU_TEST_P(Texture2D, ConstructorFromBuffer)
{
cv::Mat gold = randomMat(size, type, 0, depth == CV_8U ? 255 : 1);
cv::ogl::Buffer buf_gold(gold, cv::ogl::Buffer::PIXEL_UNPACK_BUFFER, true);
cv::ogl::Texture2D tex(buf_gold, true);
cv::Mat texData;
tex.copyTo(texData, depth);
EXPECT_MAT_NEAR(gold, texData, 1e-2);
}
GPU_TEST_P(Texture2D, ConstructorFromTexture2D)
{
cv::ogl::Texture2D tex_gold(size, format, true);
cv::ogl::Texture2D tex(tex_gold);
EXPECT_EQ(tex_gold.texId(), tex.texId());
EXPECT_EQ(tex_gold.rows(), tex.rows());
EXPECT_EQ(tex_gold.cols(), tex.cols());
EXPECT_EQ(tex_gold.format(), tex.format());
}
GPU_TEST_P(Texture2D, Create)
{
cv::ogl::Texture2D tex;
tex.create(size.height, size.width, format, true);
EXPECT_EQ(size.height, tex.rows());
EXPECT_EQ(size.width, tex.cols());
EXPECT_EQ(format, tex.format());
}
GPU_TEST_P(Texture2D, CopyFromMat)
{
cv::Mat gold = randomMat(size, type, 0, depth == CV_8U ? 255 : 1);
cv::ogl::Texture2D tex;
tex.copyFrom(gold, true);
cv::Mat texData;
tex.copyTo(texData, depth);
EXPECT_MAT_NEAR(gold, texData, 1e-2);
}
GPU_TEST_P(Texture2D, CopyFromGpuMat)
{
cv::Mat gold = randomMat(size, type, 0, depth == CV_8U ? 255 : 1);
cv::cuda::GpuMat d_gold(gold);
cv::ogl::Texture2D tex;
tex.copyFrom(d_gold, true);
cv::Mat texData;
tex.copyTo(texData, depth);
EXPECT_MAT_NEAR(gold, texData, 1e-2);
}
GPU_TEST_P(Texture2D, CopyFromBuffer)
{
cv::Mat gold = randomMat(size, type, 0, depth == CV_8U ? 255 : 1);
cv::ogl::Buffer buf_gold(gold, cv::ogl::Buffer::PIXEL_UNPACK_BUFFER, true);
cv::ogl::Texture2D tex;
tex.copyFrom(buf_gold, true);
cv::Mat texData;
tex.copyTo(texData, depth);
EXPECT_MAT_NEAR(gold, texData, 1e-2);
}
GPU_TEST_P(Texture2D, CopyToGpuMat)
{
cv::Mat gold = randomMat(size, type, 0, depth == CV_8U ? 255 : 1);
cv::ogl::Texture2D tex(gold, true);
cv::cuda::GpuMat dst;
tex.copyTo(dst, depth);
EXPECT_MAT_NEAR(gold, dst, 1e-2);
}
GPU_TEST_P(Texture2D, CopyToBuffer)
{
cv::Mat gold = randomMat(size, type, 0, depth == CV_8U ? 255 : 1);
cv::ogl::Texture2D tex(gold, true);
cv::ogl::Buffer dst;
tex.copyTo(dst, depth, true);
cv::Mat bufData;
dst.copyTo(bufData);
EXPECT_MAT_NEAR(gold, bufData, 1e-2);
}
INSTANTIATE_TEST_CASE_P(OpenGL, Texture2D, testing::Combine(DIFFERENT_SIZES, testing::Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4)));
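// --- Illustrative sketch (not part of the original commit) --------------------------------
// The upload paths the Texture2D tests above rely on: create a texture directly from host
// memory, or stage the pixels in a PIXEL_UNPACK buffer first. The readback mirrors what the
// tests do for verification.
static void ogl_texture_upload_sketch(const cv::Mat& rgba8) // assumed to be CV_8UC4
{
    // Direct upload from host memory.
    cv::ogl::Texture2D tex(rgba8, true);

    // Staged upload through a pixel-unpack buffer object.
    cv::ogl::Buffer pbo(rgba8, cv::ogl::Buffer::PIXEL_UNPACK_BUFFER, true);
    cv::ogl::Texture2D tex_from_pbo(pbo, true);

    // Read the texture back to host memory, as the tests above do.
    cv::Mat readback;
    tex_from_pbo.copyTo(readback, rgba8.depth());
}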
#endif

View File

@@ -0,0 +1,67 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifdef __GNUC__
# pragma GCC diagnostic ignored "-Wmissing-declarations"
# if defined __clang__ || defined __APPLE__
# pragma GCC diagnostic ignored "-Wmissing-prototypes"
# pragma GCC diagnostic ignored "-Wextra"
# endif
#endif
#ifndef __OPENCV_TEST_PRECOMP_HPP__
#define __OPENCV_TEST_PRECOMP_HPP__
#include <fstream>
#include "opencv2/ts.hpp"
#include "opencv2/ts/gpu_test.hpp"
#include "opencv2/cuda.hpp"
#include "opencv2/core.hpp"
#include "opencv2/core/opengl.hpp"
#include "opencv2/calib3d.hpp"
#include "opencv2/objdetect.hpp"
#include "cvconfig.h"
#endif

View File

@@ -0,0 +1,135 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "test_precomp.hpp"
#ifdef HAVE_CUDA
#include <cuda_runtime.h>
#if CUDART_VERSION >= 5000
using namespace cvtest;
struct Async : testing::TestWithParam<cv::cuda::DeviceInfo>
{
cv::cuda::CudaMem src;
cv::cuda::GpuMat d_src;
cv::cuda::CudaMem dst;
cv::cuda::GpuMat d_dst;
virtual void SetUp()
{
cv::cuda::DeviceInfo devInfo = GetParam();
cv::cuda::setDevice(devInfo.deviceID());
src = cv::cuda::CudaMem(cv::cuda::CudaMem::PAGE_LOCKED);
cv::Mat m = randomMat(cv::Size(128, 128), CV_8UC1);
m.copyTo(src);
}
};
void checkMemSet(int status, void* userData)
{
ASSERT_EQ(cudaSuccess, status);
Async* test = reinterpret_cast<Async*>(userData);
cv::cuda::CudaMem src = test->src;
cv::cuda::CudaMem dst = test->dst;
cv::Mat dst_gold = cv::Mat::zeros(src.size(), src.type());
ASSERT_MAT_NEAR(dst_gold, dst, 0);
}
GPU_TEST_P(Async, MemSet)
{
cv::cuda::Stream stream;
d_dst.upload(src);
d_dst.setTo(cv::Scalar::all(0), stream);
d_dst.download(dst, stream);
Async* test = this;
stream.enqueueHostCallback(checkMemSet, test);
stream.waitForCompletion();
}
void checkConvert(int status, void* userData)
{
ASSERT_EQ(cudaSuccess, status);
Async* test = reinterpret_cast<Async*>(userData);
cv::cuda::CudaMem src = test->src;
cv::cuda::CudaMem dst = test->dst;
cv::Mat dst_gold;
src.createMatHeader().convertTo(dst_gold, CV_32S);
ASSERT_MAT_NEAR(dst_gold, dst, 0);
}
GPU_TEST_P(Async, Convert)
{
cv::cuda::Stream stream;
d_src.upload(src, stream);
d_src.convertTo(d_dst, CV_32S, stream);
d_dst.download(dst, stream);
Async* test = this;
stream.enqueueHostCallback(checkConvert, test);
stream.waitForCompletion();
}
INSTANTIATE_TEST_CASE_P(GPU_Stream, Async, ALL_DEVICES);
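// --- Illustrative sketch (not part of the original commit) --------------------------------
// The asynchronous pattern the tests above verify, in one place: queue work on a Stream,
// register a host callback that fires once everything queued so far has completed, then
// block until the stream drains.
static void async_done_callback(int status, void* /*userData*/)
{
    CV_Assert(status == cudaSuccess); // the callback receives the CUDA error code
}

static void stream_callback_sketch(cv::cuda::CudaMem src)
{
    cv::cuda::Stream stream;
    cv::cuda::GpuMat d_src, d_dst;
    cv::cuda::CudaMem dst;

    d_src.upload(src, stream);              // asynchronous host-to-device copy
    d_src.convertTo(d_dst, CV_32S, stream); // asynchronous conversion
    d_dst.download(dst, stream);            // asynchronous device-to-host copy

    stream.enqueueHostCallback(async_done_callback, 0);
    stream.waitForCompletion();
}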
#endif // CUDART_VERSION >= 5000
#endif // HAVE_CUDA