Doxygen documentation: cuda

This commit is contained in:
Maksim Shabunin 2014-11-20 16:42:06 +03:00
parent 472c210687
commit ceb6e8bd94
80 changed files with 2917 additions and 398 deletions

View File

@ -159,12 +159,18 @@ if(BUILD_DOCS AND HAVE_DOXYGEN)
set(reflist) # modules reference
foreach(m ${candidates})
set(reflist "${reflist} \n- @subpage ${m}")
set(all_headers ${all_headers} "${OPENCV_MODULE_opencv_${m}_HEADERS}")
set(header_dir "${OPENCV_MODULE_opencv_${m}_LOCATION}/include")
if(EXISTS ${header_dir})
set(all_headers ${all_headers} ${header_dir})
endif()
set(docs_dir "${OPENCV_MODULE_opencv_${m}_LOCATION}/doc")
if(EXISTS ${docs_dir})
set(all_images ${all_images} ${docs_dir})
set(all_headers ${all_headers} ${docs_dir})
endif()
endforeach()
# additional config

View File

@ -99,7 +99,7 @@ FILE_PATTERNS =
RECURSIVE = YES
EXCLUDE =
EXCLUDE_SYMLINKS = NO
EXCLUDE_PATTERNS = *.inl.hpp *.impl.hpp *_detail.hpp */cudev/**/detail/*.hpp
EXCLUDE_SYMBOLS = cv::DataType<*> int
EXAMPLE_PATH = @CMAKE_DOXYGEN_EXAMPLE_PATH@
EXAMPLE_PATTERNS = *

View File

@ -52,9 +52,11 @@
#include "opencv2/core/cuda_types.hpp"
/**
@addtogroup cuda
@{
@defgroup cuda_init Initialization and Information
@defgroup cuda_struct Data Structures
@defgroup cuda_calib3d Camera Calibration and 3D Reconstruction
@}
*/
@ -65,8 +67,28 @@ namespace cv { namespace cuda {
//////////////////////////////// GpuMat ///////////////////////////////
/** @brief Base storage class for GPU memory with reference counting.
Its interface matches the Mat interface with the following limitations:
- no arbitrary dimensions support (only 2D)
- no functions that return references to their data (because references on GPU are not valid for
CPU)
- no expression templates technique support
Beware that the latter limitation may lead to overloaded matrix operators that cause memory
allocations. The GpuMat class is convertible to cuda::PtrStepSz and cuda::PtrStep so it can be
passed directly to the kernel.
@note In contrast with Mat, in most cases GpuMat::isContinuous() == false . This means that rows are
aligned to a size depending on the hardware. Single-row GpuMat is always a continuous matrix.
@note You are not recommended to leave static or global GpuMat variables allocated, that is, to rely
on their destructors. The destruction order of such variables and the CUDA context is undefined. The
GPU memory release function returns an error if the CUDA context has been destroyed beforehand.
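A minimal upload/download sketch (assuming OpenCV is built with CUDA support and a device is
available; the image file name is hypothetical):
@code
cv::Mat h_src = cv::imread("input.png", cv::IMREAD_GRAYSCALE);
cv::cuda::GpuMat d_src;
d_src.upload(h_src);   // host -> device copy
cv::cuda::GpuMat d_dst;
d_src.copyTo(d_dst);   // device -> device copy
cv::Mat h_dst;
d_dst.download(h_dst); // device -> host copy
@endcode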
@sa Mat
*/
class CV_EXPORTS GpuMat
{
public:
@ -277,11 +299,28 @@ public:
Allocator* allocator;
};
/** @brief Creates a continuous matrix.
@param rows Row count.
@param cols Column count.
@param type Type of the matrix.
@param arr Destination matrix. This parameter changes only if it has a proper type and area (
\f$\texttt{rows} \times \texttt{cols}\f$ ).
Matrix is called continuous if its elements are stored continuously, that is, without gaps at the
end of each row.
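For example, a minimal sketch:
@code
cv::cuda::GpuMat buf;
cv::cuda::createContinuous(480, 640, CV_8UC1, buf);
// buf.isContinuous() == true: rows are stored without gaps
@endcode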
*/
CV_EXPORTS void createContinuous(int rows, int cols, int type, OutputArray arr);
/** @brief Ensures that the size of a matrix is big enough and the matrix has a proper type.
@param rows Minimum desired number of rows.
@param cols Minimum desired number of columns.
@param type Desired matrix type.
@param arr Destination matrix.
The function does not reallocate memory if the matrix has proper attributes already.
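A sketch of the intended buffer-reuse pattern:
@code
cv::cuda::GpuMat buf;
cv::cuda::ensureSizeIsEnough(100, 100, CV_32FC1, buf); // allocates the buffer
cv::cuda::ensureSizeIsEnough(50, 50, CV_32FC1, buf);   // no reallocation, buf is already big enough
@endcode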
*/
CV_EXPORTS void ensureSizeIsEnough(int rows, int cols, int type, OutputArray arr);
CV_EXPORTS GpuMat allocMatFromBuf(int rows, int cols, int type, GpuMat& mat);
@ -292,10 +331,21 @@ CV_EXPORTS void setBufferPoolConfig(int deviceId, size_t stackSize, int stackCou
//////////////////////////////// CudaMem ////////////////////////////////
/** @brief Class with reference counting wrapping special memory type allocation functions from CUDA.
Its interface is also Mat-like but with additional memory type parameters.
- **PAGE\_LOCKED** sets a page locked memory type used commonly for fast and asynchronous
uploading/downloading data from/to GPU.
- **SHARED** specifies a zero copy memory allocation that enables mapping the host memory to GPU
address space, if supported.
- **WRITE\_COMBINED** sets the write combined buffer that is not cached by CPU. Such buffers are
used to supply GPU with data when GPU only reads it. The advantage is a better CPU cache
utilization.
@note Allocation size of such memory types is usually limited. For more details, see *CUDA 2.2
Pinned Memory APIs* document or *CUDA C Programming Guide*.
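A sketch of allocating page-locked memory, assuming a Mat-like (rows, cols, type, alloc_type)
constructor:
@code
cv::cuda::CudaMem page_locked(480, 640, CV_8UC1, cv::cuda::CudaMem::PAGE_LOCKED);
cv::Mat header = page_locked.createMatHeader(); // CPU-side view, no data copy
@endcode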
*/
class CV_EXPORTS CudaMem
{
public:
@ -335,7 +385,13 @@ public:
//! returns matrix header with disabled reference counting for CudaMem data.
Mat createMatHeader() const;
/** @brief Maps CPU memory to GPU address space and creates the cuda::GpuMat header without reference counting
for it.
This can be done only if memory was allocated with the SHARED flag and if it is supported by the
hardware. Laptops often share video and CPU memory, so address spaces can be mapped, which
eliminates an extra copy.
*/
GpuMat createGpuMatHeader() const;
// Please see cv::Mat for descriptions
@ -363,17 +419,28 @@ public:
AllocType alloc_type;
};
/** @brief Page-locks the memory of a matrix and maps it for the device(s).
@param m Input matrix.
*/
CV_EXPORTS void registerPageLocked(Mat& m);
/** @brief Unmaps the memory of a matrix and makes it pageable again.
@param m Input matrix.
*/
CV_EXPORTS void unregisterPageLocked(Mat& m);
///////////////////////////////// Stream //////////////////////////////////
/** @brief This class encapsulates a queue of asynchronous calls.
@note Currently, you may face problems if an operation is enqueued twice with different data. Some
functions use the constant GPU memory, and next call may update the memory before the previous one
has been finished. But calling different operations asynchronously is safe because each operation
has its own constant buffer. Memory copy/upload/download/set operations to the buffers you hold are
also safe.
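A sketch of asynchronous processing (h_src and h_dst are assumed to be page-locked host matrices):
@code
cv::cuda::Stream stream;
cv::cuda::GpuMat d_src, d_dst;
d_src.upload(h_src, stream);            // asynchronous host -> device copy
d_src.convertTo(d_dst, CV_32F, stream); // asynchronous conversion on the GPU
d_dst.download(h_dst, stream);          // asynchronous device -> host copy
stream.waitForCompletion();             // block until all enqueued work is done
@endcode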
*/
class CV_EXPORTS Stream
{
typedef void (Stream::*bool_type)() const;
@ -385,16 +452,26 @@ public:
//! creates a new asynchronous stream
Stream();
/** @brief Returns true if the current stream queue is finished. Otherwise, it returns false.
*/
bool queryIfComplete() const;
/** @brief Blocks the current CPU thread until all operations in the stream are complete.
*/
void waitForCompletion();
/** @brief Makes a compute stream wait on an event.
*/
void waitEvent(const Event& event);
/** @brief Adds a callback to be called on the host after all currently enqueued items in the stream have
completed.
@note Callbacks must not make any CUDA API calls. Callbacks must not perform any synchronization
that may depend on outstanding device work or other callbacks that are not mandated to run earlier.
Callbacks without a mandated order (in independent streams) execute in undefined order and may be
serialized.
*/
void enqueueHostCallback(StreamCallback callback, void* userData);
//! return Stream object for default CUDA stream
@ -446,21 +523,41 @@ private:
friend struct EventAccessor;
};
//! @} cuda_struct
//////////////////////////////// Initialization & Info ////////////////////////
//! @addtogroup cuda_init
//! @{
/** @brief Returns the number of installed CUDA-enabled devices.
Use this function before any other CUDA functions calls. If OpenCV is compiled without CUDA support,
this function returns 0.
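For example, a CPU/GPU dispatch sketch:
@code
bool useGpu = cv::cuda::getCudaEnabledDeviceCount() > 0;
if (useGpu)
    cv::cuda::setDevice(0); // use the first CUDA device
// otherwise fall back to a CPU implementation
@endcode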
*/
CV_EXPORTS int getCudaEnabledDeviceCount();
/** @brief Sets a device and initializes it for the current thread.
@param device System index of a CUDA device starting with 0.
If the call of this function is omitted, a default device is initialized at the first CUDA usage.
*/
CV_EXPORTS void setDevice(int device);
/** @brief Returns the current device index set by cuda::setDevice or initialized by default.
*/
CV_EXPORTS int getDevice();
/** @brief Explicitly destroys and cleans up all resources associated with the current device in the current
process.
Any subsequent API call to this device will reinitialize the device.
*/
CV_EXPORTS void resetDevice();
/** @brief Enumeration providing CUDA computing features.
*/
enum FeatureSet
{
FEATURE_SET_COMPUTE_10 = 10,
@ -482,12 +579,27 @@ enum FeatureSet
//! checks whether current device supports the given feature
CV_EXPORTS bool deviceSupports(FeatureSet feature_set);
/** @brief Class providing a set of static methods to check what NVIDIA\* card architecture the CUDA module was
built for.
According to the CUDA C Programming Guide Version 3.2: "PTX code produced for some specific compute
capability can always be compiled to binary code of greater or equal compute capability".
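A sketch checking whether the current device can run code from this module, using methods of this
class:
@code
cv::cuda::DeviceInfo info;
bool hasBinary = cv::cuda::TargetArchs::hasBin(info.majorVersion(), info.minorVersion());
bool hasPtxForJit = cv::cuda::TargetArchs::hasEqualOrLessPtx(info.majorVersion(), info.minorVersion());
@endcode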
*/
class CV_EXPORTS TargetArchs
{
public:
/** @brief The following method checks whether the module was built with the support of the given feature:
@param feature\_set Features to be checked. See cuda::FeatureSet.
*/
static bool builtWith(FeatureSet feature_set);
/** @brief There is a set of methods to check whether the module contains intermediate (PTX) or binary CUDA
code for the given architecture(s):
@param major Major compute capability version.
@param minor Minor compute capability version.
*/
static bool has(int major, int minor);
static bool hasPtx(int major, int minor);
static bool hasBin(int major, int minor);
@ -498,17 +610,25 @@ public:
static bool hasEqualOrGreaterBin(int major, int minor);
};
/** @brief Class providing functionality for querying the specified GPU properties.
*/
class CV_EXPORTS DeviceInfo
{
public:
//! creates DeviceInfo object for the current GPU
DeviceInfo();
/** @brief The constructors.
@param device\_id System index of the CUDA device starting with 0.
Constructs the DeviceInfo object for the specified device. If the device\_id parameter is omitted,
it constructs an object for the current device.
*/
DeviceInfo(int device_id);
/** @brief Returns system index of the CUDA device starting with 0.
*/
int deviceID() const;
//! ASCII string identifying device
@ -680,10 +800,19 @@ public:
size_t freeMemory() const;
size_t totalMemory() const;
/** @brief Provides information on CUDA feature support.
@param feature\_set Features to be checked. See cuda::FeatureSet.
This function returns true if the device has the specified CUDA feature. Otherwise, it returns false.
*/
bool supports(FeatureSet feature_set) const;
/** @brief Checks the CUDA module and device compatibility.
This function returns true if the CUDA module can be run on the specified device. Otherwise, it
returns false.
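For example:
@code
cv::cuda::DeviceInfo info(cv::cuda::getDevice());
if (info.isCompatible())
{
    // safe to run the CUDA-accelerated code path on this device
}
@endcode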
*/
bool isCompatible() const;
private:
@ -693,7 +822,7 @@ private:
CV_EXPORTS void printCudaDeviceInfo(int device);
CV_EXPORTS void printShortCudaDeviceInfo(int device);
//! @} cuda_init
}} // namespace cv { namespace cuda {

View File

@ -66,6 +66,11 @@ namespace cv
class Stream;
class Event;
/** @brief Class that enables getting cudaStream\_t from cuda::Stream.
This class is declared in a separate header because it is the only public header that depends on the
CUDA Runtime API. Including it brings a dependency to your code.
*/
struct StreamAccessor
{
CV_EXPORTS static cudaStream_t getStream(const Stream& stream);

View File

@ -89,6 +89,11 @@ namespace cv
size_t size;
};
/** @brief Structure similar to cuda::PtrStepSz but containing only a pointer and row step.
Width and height fields are excluded for performance reasons. The structure is intended
for internal use or for users who write device code.
*/
template <typename T> struct PtrStep : public DevPtr<T>
{
__CV_CUDA_HOST_DEVICE__ PtrStep() : step(0) {}
@ -104,6 +109,12 @@ namespace cv
__CV_CUDA_HOST_DEVICE__ const T& operator ()(int y, int x) const { return ptr(y)[x]; }
};
/** @brief Lightweight class encapsulating pitched memory on a GPU and passed to nvcc-compiled code (CUDA
kernels).
Typically, it is used internally by OpenCV and by users who write device code. You can call
its members from both host and device code.
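For example, a hypothetical kernel taking a single-channel image:
@code
__global__ void fillKernel(cv::cuda::PtrStepSz<uchar> img)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < img.cols && y < img.rows)
        img(y, x) = 255; // operator()(y, x) resolves the pitched address
}
@endcode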
*/
template <typename T> struct PtrStepSz : public PtrStep<T>
{
__CV_CUDA_HOST_DEVICE__ PtrStepSz() : cols(0), rows(0) {}

View File

@ -0,0 +1,85 @@
CUDA Module Introduction {#cuda_intro}
========================
General Information
-------------------
The OpenCV CUDA module is a set of classes and functions to utilize CUDA computational capabilities.
It is implemented using NVIDIA\* CUDA\* Runtime API and supports only NVIDIA GPUs. The OpenCV CUDA
module includes utility functions, low-level vision primitives, and high-level algorithms. The
utility functions and low-level primitives provide a powerful infrastructure for developing fast
vision algorithms taking advantage of CUDA whereas the high-level functionality includes some
state-of-the-art algorithms (such as stereo correspondence, face and people detectors, and others)
ready to be used by the application developers.
The CUDA module is designed as a host-level API. This means that if you have pre-compiled OpenCV
CUDA binaries, you are not required to have the CUDA Toolkit installed or write any extra code to
make use of CUDA.
The OpenCV CUDA module is designed for ease of use and does not require any knowledge of CUDA.
Though such knowledge will certainly be useful to handle non-trivial cases or achieve the highest
performance. It is helpful to understand the cost of various operations, what the GPU does, what the
preferred data formats are, and so on. The CUDA module is an effective instrument for quick
implementation of CUDA-accelerated computer vision algorithms. However, if your algorithm involves
many simple operations, then, for the best possible performance, you may still need to write your
own kernels to avoid extra write and read operations on the intermediate results.
To enable CUDA support, configure OpenCV using CMake with WITH\_CUDA=ON . When the flag is set and
if CUDA is installed, the full-featured OpenCV CUDA module is built. Otherwise, the module is still
built but at runtime all functions from the module throw Exception with CV\_GpuNotSupported error
code, except for cuda::getCudaEnabledDeviceCount(). The latter function returns zero GPU count in
this case. Building OpenCV without CUDA support does not perform device code compilation, so it does
not require the CUDA Toolkit installed. Therefore, using the cuda::getCudaEnabledDeviceCount()
function, you can implement a high-level algorithm that will detect GPU presence at runtime and
choose an appropriate implementation (CPU or GPU) accordingly.
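For instance, a dispatch sketch along these lines (the wrapper function here is hypothetical):

```cpp
#include "opencv2/core/cuda.hpp"

void processFrame(const cv::Mat& frame)
{
    if (cv::cuda::getCudaEnabledDeviceCount() > 0)
    {
        cv::cuda::GpuMat d_frame(frame); // upload and take the GPU path
        // ... run the CUDA-accelerated pipeline ...
    }
    else
    {
        // ... run the CPU fallback ...
    }
}
```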
Compilation for Different NVIDIA\* Platforms
--------------------------------------------
NVIDIA\* compiler enables generating binary code (cubin and fatbin) and intermediate code (PTX).
Binary code often implies a specific GPU architecture and generation, so the compatibility with
other GPUs is not guaranteed. PTX is targeted for a virtual platform that is defined entirely by the
set of capabilities or features. Depending on the selected virtual platform, some of the
instructions are emulated or disabled, even if the real hardware supports all the features.
At the first call, the PTX code is compiled to binary code for the particular GPU using a JIT
compiler. When the target GPU has a compute capability (CC) lower than the PTX code, JIT fails. By
default, the OpenCV CUDA module includes:
- Binaries for compute capabilities 1.3 and 2.0 (controlled by CUDA\_ARCH\_BIN in CMake)
- PTX code for compute capabilities 1.1 and 1.3 (controlled by CUDA\_ARCH\_PTX in CMake)
This means that for devices with CC 1.3 and 2.0 binary images are ready to run. For all newer
platforms, the PTX code for 1.3 is JIT'ed to a binary image. For devices with CC 1.1 and 1.2, the
PTX for 1.1 is JIT'ed. For devices with CC 1.0, no code is available and the functions throw
Exception. For platforms where JIT compilation is performed first, the run is slow.
On a GPU with CC 1.0, you can still compile the CUDA module and most of the functions will run
flawlessly. To achieve this, add "1.0" to the list of binaries, for example,
CUDA\_ARCH\_BIN="1.0 1.3 2.0" . The functions that cannot be run on CC 1.0 GPUs throw an exception.
You can always determine at runtime whether the OpenCV GPU-built binaries (or PTX code) are
compatible with your GPU. The function cuda::DeviceInfo::isCompatible returns the compatibility
status (true/false).
Utilizing Multiple GPUs
-----------------------
In the current version, each of the OpenCV CUDA algorithms can use only a single GPU. So, to utilize
multiple GPUs, you have to manually distribute the work between GPUs. Switching the active device
can be done with the cuda::setDevice() function. For more details, see the CUDA C Programming Guide.
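For example, a minimal sketch (each host worker thread binds itself to its own GPU first):

```cpp
// executed once per worker thread; gpu_id is 0 in one thread and 1 in the other
cv::cuda::setDevice(gpu_id);
// ... then process this thread's portion of the work on the selected device ...
```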
While developing algorithms for multiple GPUs, note the data passing overhead. For primitive functions
and small images, it can be significant, which may eliminate all the advantages of having multiple
GPUs. But for high-level algorithms, consider using multi-GPU acceleration. For example, the Stereo
Block Matching algorithm has been successfully parallelized using the following algorithm:
1. Split each image of the stereo pair into two horizontal overlapping stripes.
2. Process each pair of stripes (from the left and right images) on a separate Fermi\* GPU.
3. Merge the results into a single disparity map.
With this algorithm, a dual GPU gave a 180% performance increase compared to a single Fermi GPU.
For a source code example, see <https://github.com/Itseez/opencv/tree/master/samples/gpu/>.

View File

@ -49,10 +49,22 @@
#include "opencv2/core/cuda.hpp"
/**
@defgroup cuda CUDA-accelerated Computer Vision
@ref cuda_intro "Introduction page"
@{
@defgroup cuda_objdetect Object Detection
@}
*/
namespace cv { namespace cuda {
//////////////// HOG (Histogram-of-Oriented-Gradients) Descriptor and Object Detector //////////////
//! @addtogroup cuda_objdetect
//! @{
struct CV_EXPORTS HOGConfidence
{
double scale;
@ -61,31 +73,92 @@ struct CV_EXPORTS HOGConfidence
std::vector<double> part_scores[4];
};
/** @brief The class implements Histogram of Oriented Gradients (@cite Dalal2005) object detector.
Interfaces of all methods are kept similar to the CPU HOG descriptor and detector analogues as much
as possible.
@note
- An example applying the HOG descriptor for people detection can be found at
opencv\_source\_code/samples/cpp/peopledetect.cpp
- A CUDA example applying the HOG descriptor for people detection can be found at
opencv\_source\_code/samples/gpu/hog.cpp
- (Python) An example applying the HOG descriptor for people detection can be found at
opencv\_source\_code/samples/python2/peopledetect.py
*/
struct CV_EXPORTS HOGDescriptor
{
enum { DEFAULT_WIN_SIGMA = -1 };
enum { DEFAULT_NLEVELS = 64 };
enum { DESCR_FORMAT_ROW_BY_ROW, DESCR_FORMAT_COL_BY_COL };
/** @brief Creates the HOG descriptor and detector.
@param win\_size Detection window size. Align to block size and block stride.
@param block\_size Block size in pixels. Align to cell size. Only (16,16) is supported for now.
@param block\_stride Block stride. It must be a multiple of cell size.
@param cell\_size Cell size. Only (8, 8) is supported for now.
@param nbins Number of bins. Only 9 bins per cell are supported for now.
@param win\_sigma Gaussian smoothing window parameter.
@param threshold\_L2hys L2-Hys normalization method shrinkage.
@param gamma\_correction Flag to specify whether the gamma correction preprocessing is required or
not.
@param nlevels Maximum number of detection window increases.
*/
HOGDescriptor(Size win_size=Size(64, 128), Size block_size=Size(16, 16),
Size block_stride=Size(8, 8), Size cell_size=Size(8, 8),
int nbins=9, double win_sigma=DEFAULT_WIN_SIGMA,
double threshold_L2hys=0.2, bool gamma_correction=true,
int nlevels=DEFAULT_NLEVELS);
/** @brief Returns the number of coefficients required for the classification.
*/
size_t getDescriptorSize() const;
/** @brief Returns the block histogram size.
*/
size_t getBlockHistogramSize() const;
/** @brief Sets coefficients for the linear SVM classifier.
*/
void setSVMDetector(const std::vector<float>& detector);
/** @brief Returns coefficients of the classifier trained for people detection (for default window size).
*/
static std::vector<float> getDefaultPeopleDetector();
/** @brief Returns coefficients of the classifier trained for people detection (for 48x96 windows).
*/
static std::vector<float> getPeopleDetector48x96();
/** @brief Returns coefficients of the classifier trained for people detection (for 64x128 windows).
*/
static std::vector<float> getPeopleDetector64x128();
/** @brief Performs object detection without a multi-scale window.
@param img Source image. CV\_8UC1 and CV\_8UC4 types are supported for now.
@param found\_locations Left-top corner points of detected objects boundaries.
@param hit\_threshold Threshold for the distance between features and SVM classifying plane.
Usually it is 0 and should be specified in the detector coefficients (as the last free
coefficient). But if the free coefficient is omitted (which is allowed), you can specify it
manually here.
@param win\_stride Window stride. It must be a multiple of block stride.
@param padding Mock parameter to keep the CPU interface compatibility. It must be (0,0).
*/
void detect(const GpuMat& img, std::vector<Point>& found_locations,
double hit_threshold=0, Size win_stride=Size(),
Size padding=Size());
/** @brief Performs object detection with a multi-scale window.
@param img Source image. See cuda::HOGDescriptor::detect for type limitations.
@param found\_locations Detected objects boundaries.
@param hit\_threshold Threshold for the distance between features and SVM classifying plane. See
cuda::HOGDescriptor::detect for details.
@param win\_stride Window stride. It must be a multiple of block stride.
@param padding Mock parameter to keep the CPU interface compatibility. It must be (0,0).
@param scale0 Coefficient of the detection window increase.
@param group\_threshold Coefficient to regulate the similarity threshold. When detected, some
objects can be covered by many rectangles. 0 means not to perform grouping. See groupRectangles.
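A sketch of typical use (host_img is an assumed CV\_8UC1 host image):
@code
cv::cuda::HOGDescriptor hog;
hog.setSVMDetector(cv::cuda::HOGDescriptor::getDefaultPeopleDetector());
cv::cuda::GpuMat d_img(host_img);
std::vector<cv::Rect> found;
hog.detectMultiScale(d_img, found);
@endcode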
*/
void detectMultiScale(const GpuMat& img, std::vector<Rect>& found_locations,
double hit_threshold=0, Size win_stride=Size(),
Size padding=Size(), double scale0=1.05,
@ -98,6 +171,17 @@ struct CV_EXPORTS HOGDescriptor
double hit_threshold, Size win_stride, Size padding,
std::vector<HOGConfidence> &conf_out, int group_threshold);
/** @brief Returns block descriptors computed for the whole image.
@param img Source image. See cuda::HOGDescriptor::detect for type limitations.
@param win\_stride Window stride. It must be a multiple of block stride.
@param descriptors 2D array of descriptors.
@param descr\_format Descriptor storage format:
- **DESCR\_FORMAT\_ROW\_BY\_ROW** - Row-major order.
- **DESCR\_FORMAT\_COL\_BY\_COL** - Column-major order.
The function is mainly used to learn the classifier.
*/
void getDescriptors(const GpuMat& img, Size win_stride,
GpuMat& descriptors,
int descr_format=DESCR_FORMAT_COL_BY_COL);
@ -145,20 +229,82 @@ protected:
//////////////////////////// CascadeClassifier ////////////////////////////
/** @brief Cascade classifier class used for object detection. Supports HAAR and LBP cascades.
@note
- A cascade classifier example can be found at
opencv\_source\_code/samples/gpu/cascadeclassifier.cpp
- An NVIDIA API-specific cascade classifier example can be found at
opencv\_source\_code/samples/gpu/cascadeclassifier\_nvidia\_api.cpp
*/
class CV_EXPORTS CascadeClassifier_CUDA
{
public:
CascadeClassifier_CUDA();
/** @brief Loads the classifier from a file. Cascade type is detected automatically by constructor parameter.
@param filename Name of the file from which the classifier is loaded. Only the old haar classifier
(trained by the haar training application) and NVIDIA's nvbin are supported for HAAR, and only the
new type of OpenCV XML cascade is supported for LBP.
*/
CascadeClassifier_CUDA(const String& filename);
~CascadeClassifier_CUDA();
/** @brief Checks whether the classifier is loaded or not.
*/
bool empty() const;
/** @brief Loads the classifier from a file. The previous content is destroyed.
@param filename Name of the file from which the classifier is loaded. Only the old haar classifier
(trained by the haar training application) and NVIDIA's nvbin are supported for HAAR, and only the
new type of OpenCV XML cascade is supported for LBP.
*/
bool load(const String& filename);
/** @brief Destroys the loaded classifier.
*/
void release();
/** @overload */
int detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, double scaleFactor = 1.2, int minNeighbors = 4, Size minSize = Size());
/** @brief Detects objects of different sizes in the input image.
@param image Matrix of type CV\_8U containing an image where objects should be detected.
@param objectsBuf Buffer to store detected objects (rectangles). If it is empty, it is allocated
with the default size. If not empty, the function searches not more than N objects, where
N = sizeof(objectsBuf's data)/sizeof(cv::Rect).
@param maxObjectSize Maximum possible object size. Objects larger than that are ignored. Used for
second signature and supported only for LBP cascades.
@param scaleFactor Parameter specifying how much the image size is reduced at each image scale.
@param minNeighbors Parameter specifying how many neighbors each candidate rectangle should have
to retain it.
@param minSize Minimum possible object size. Objects smaller than that are ignored.
The detected objects are returned as a list of rectangles.
The function returns the number of detected objects, so you can retrieve them as in the following
example:
@code
cuda::CascadeClassifier_CUDA cascade_gpu(...);
Mat image_cpu = imread(...);
GpuMat image_gpu(image_cpu);
GpuMat objbuf;
int detections_number = cascade_gpu.detectMultiScale( image_gpu,
objbuf, 1.2, minNeighbors);
Mat obj_host;
// download only detected number of rectangles
objbuf.colRange(0, detections_number).download(obj_host);
Rect* faces = obj_host.ptr<Rect>();
for(int i = 0; i < detections_number; ++i)
cv::rectangle(image_cpu, faces[i], Scalar(255));
imshow("Faces", image_cpu);
@endcode
@sa CascadeClassifier::detectMultiScale
*/
int detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, Size maxObjectSize, Size minSize = Size(), double scaleFactor = 1.1, int minNeighbors = 4);
bool findLargestObject;
@ -174,8 +320,13 @@ private:
friend class CascadeClassifier_CUDA_LBP;
};
//! @} cuda_objdetect
//////////////////////////// Labeling ////////////////////////////
//! @addtogroup cuda
//! @{
//! performs labeling via graph cuts of a 2D regular 4-connected graph.
CV_EXPORTS void graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& bottom, GpuMat& labels,
GpuMat& buf, Stream& stream = Stream::Null());
@ -192,8 +343,13 @@ CV_EXPORTS void connectivityMask(const GpuMat& image, GpuMat& mask, const cv::Sc
//! performs connected components labeling.
CV_EXPORTS void labelComponents(const GpuMat& mask, GpuMat& components, int flags = 0, Stream& stream = Stream::Null());
//! @}
//////////////////////////// Calib3d ////////////////////////////
//! @addtogroup cuda_calib3d
//! @{
CV_EXPORTS void transformPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec,
GpuMat& dst, Stream& stream = Stream::Null());
@ -201,13 +357,34 @@ CV_EXPORTS void projectPoints(const GpuMat& src, const Mat& rvec, const Mat& tve
const Mat& camera_mat, const Mat& dist_coef, GpuMat& dst,
Stream& stream = Stream::Null());
/** @brief Finds the object pose from 3D-2D point correspondences.
@param object Single-row matrix of object points.
@param image Single-row matrix of image points.
@param camera\_mat 3x3 matrix of intrinsic camera parameters.
@param dist\_coef Distortion coefficients. See undistortPoints for details.
@param rvec Output 3D rotation vector.
@param tvec Output 3D translation vector.
@param use\_extrinsic\_guess Flag to indicate that the function must use rvec and tvec as an
initial transformation guess. It is not supported for now.
@param num\_iters Maximum number of RANSAC iterations.
@param max\_dist Euclidean distance threshold to detect whether a point is an inlier or not.
@param min\_inlier\_count Flag to indicate that the function must stop if a greater or equal number
of inliers is achieved. It is not supported for now.
@param inliers Output vector of inlier indices.
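A minimal sketch using the parameters described above (the input matrices are assumed to be filled):
@code
cv::Mat rvec, tvec;
cv::cuda::solvePnPRansac(object, image, camera_mat, dist_coef, rvec, tvec);
@endcode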
*/
CV_EXPORTS void solvePnPRansac(const Mat& object, const Mat& image, const Mat& camera_mat,
const Mat& dist_coef, Mat& rvec, Mat& tvec, bool use_extrinsic_guess=false,
int num_iters=100, float max_dist=8.0, int min_inlier_count=100,
std::vector<int>* inliers=NULL);
//! @}
//////////////////////////// VStab ////////////////////////////
//! @addtogroup cuda
//! @{
//! removes points (CV_32FC2, single row matrix) with zero mask value
CV_EXPORTS void compactPoints(GpuMat &points0, GpuMat &points1, const GpuMat &mask);
@ -215,6 +392,8 @@ CV_EXPORTS void calcWobbleSuppressionMaps(
int left, int idx, int right, Size size, const Mat &ml, const Mat &mr,
GpuMat &mapx, GpuMat &mapy);
//! @}
}} // namespace cv { namespace cuda {
#endif /* __OPENCV_CUDA_HPP__ */

File diff suppressed because it is too large Load Diff

View File

@ -50,11 +50,33 @@
#include "opencv2/core/cuda.hpp"
#include "opencv2/video/background_segm.hpp"
/**
@addtogroup cuda
@{
@defgroup cudabgsegm Background Segmentation
@}
*/
namespace cv { namespace cuda {
//! @addtogroup cudabgsegm
//! @{
////////////////////////////////////////////////////
// MOG
/** @brief Gaussian Mixture-based Background/Foreground Segmentation Algorithm.
The class discriminates between foreground and background pixels by building and maintaining a model
of the background. Any pixel which does not fit this model is then deemed to be foreground. The
class implements the algorithm described in @cite MOG2001.
@sa BackgroundSubtractorMOG
@note
- An example on Gaussian mixture-based background/foreground segmentation can be found at
opencv\_source\_code/samples/gpu/bgfg\_segm.cpp
*/
class CV_EXPORTS BackgroundSubtractorMOG : public cv::BackgroundSubtractor
{
public:
@ -78,6 +100,14 @@ public:
virtual void setNoiseSigma(double noiseSigma) = 0;
};
/** @brief Creates mixture-of-gaussian background subtractor
@param history Length of the history.
@param nmixtures Number of Gaussian mixtures.
@param backgroundRatio Background ratio.
@param noiseSigma Noise strength (standard deviation of the brightness of each color channel). 0
means some automatic value.
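A usage sketch (frame is an assumed host image):
@code
cv::Ptr<cv::cuda::BackgroundSubtractorMOG> mog = cv::cuda::createBackgroundSubtractorMOG();
cv::cuda::GpuMat d_frame(frame), d_fgmask;
mog->apply(d_frame, d_fgmask); // foreground mask for the current frame
@endcode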
*/
CV_EXPORTS Ptr<cuda::BackgroundSubtractorMOG>
createBackgroundSubtractorMOG(int history = 200, int nmixtures = 5,
double backgroundRatio = 0.7, double noiseSigma = 0);
@ -85,6 +115,14 @@ CV_EXPORTS Ptr<cuda::BackgroundSubtractorMOG>
////////////////////////////////////////////////////
// MOG2
/** @brief Gaussian Mixture-based Background/Foreground Segmentation Algorithm.
The class discriminates between foreground and background pixels by building and maintaining a model
of the background. Any pixel which does not fit this model is then deemed to be foreground. The
class implements the algorithm described in @cite MOG2004.
@sa BackgroundSubtractorMOG2
*/
class CV_EXPORTS BackgroundSubtractorMOG2 : public cv::BackgroundSubtractorMOG2
{
public:
@ -96,6 +134,15 @@ public:
virtual void getBackgroundImage(OutputArray backgroundImage, Stream& stream) const = 0;
};
/** @brief Creates MOG2 Background Subtractor
@param history Length of the history.
@param varThreshold Threshold on the squared Mahalanobis distance between the pixel and the model
to decide whether a pixel is well described by the background model. This parameter does not
affect the background update.
@param detectShadows If true, the algorithm will detect shadows and mark them. It decreases the
speed a bit, so if you do not need this feature, set the parameter to false.
*/
CV_EXPORTS Ptr<cuda::BackgroundSubtractorMOG2>
createBackgroundSubtractorMOG2(int history = 500, double varThreshold = 16,
bool detectShadows = true);
@ -103,6 +150,12 @@ CV_EXPORTS Ptr<cuda::BackgroundSubtractorMOG2>
////////////////////////////////////////////////////
// GMG
/** @brief Background/Foreground Segmentation Algorithm.
The class discriminates between foreground and background pixels by building and maintaining a model
of the background. Any pixel which does not fit this model is then deemed to be foreground. The
class implements the algorithm described in @cite GMG2012.
*/
class CV_EXPORTS BackgroundSubtractorGMG : public cv::BackgroundSubtractor
{
public:
@ -140,54 +193,71 @@ public:
virtual void setMaxVal(double val) = 0;
};
/** @brief Creates GMG Background Subtractor
@param initializationFrames Number of frames of video to use to initialize histograms.
@param decisionThreshold Value above which pixel is determined to be FG.
*/
CV_EXPORTS Ptr<cuda::BackgroundSubtractorGMG>
createBackgroundSubtractorGMG(int initializationFrames = 120, double decisionThreshold = 0.8);
////////////////////////////////////////////////////
// FGD
/** @brief The class discriminates between foreground and background pixels by building and maintaining a model
of the background.
Any pixel which does not fit this model is then deemed to be foreground. The class implements
the algorithm described in @cite FGD2003.
@sa BackgroundSubtractor
*/
class CV_EXPORTS BackgroundSubtractorFGD : public cv::BackgroundSubtractor
{
public:
/** @brief Returns the output foreground regions calculated by findContours.
@param foreground\_regions Output array (CPU memory).
*/
virtual void getForegroundRegions(OutputArrayOfArrays foreground_regions) = 0;
};
struct CV_EXPORTS FGDParams
{
int Lc; //!< Quantized levels per 'color' component. Power of two, typically 32, 64 or 128.
int N1c; //!< Number of color vectors used to model normal background color variation at a given pixel.
int N2c; //!< Number of color vectors retained at given pixel. Must be > N1c, typically ~ 5/3 of N1c.
//!< Used to allow the first N1c vectors to adapt over time to changing background.
int Lcc; //!< Quantized levels per 'color co-occurrence' component. Power of two, typically 16, 32 or 64.
int N1cc; //!< Number of color co-occurrence vectors used to model normal background color variation at a given pixel.
int N2cc; //!< Number of color co-occurrence vectors retained at given pixel. Must be > N1cc, typically ~ 5/3 of N1cc.
//!< Used to allow the first N1cc vectors to adapt over time to changing background.
bool is_obj_without_holes; //!< If TRUE we ignore holes within foreground blobs. Defaults to TRUE.
int perform_morphing; //!< Number of erode-dilate-erode foreground-blob cleanup iterations.
//!< These erase one-pixel junk blobs and merge almost-touching blobs. Default value is 1.
float alpha1; //!< How quickly we forget old background pixel values seen. Typically set to 0.1.
float alpha2; //!< "Controls speed of feature learning". Depends on T. Typical value circa 0.005.
float alpha3; //!< Alternate to alpha2, used (e.g.) for quicker initial convergence. Typical value 0.1.
float delta; //!< Affects color and color co-occurrence quantization, typically set to 2.
float T; //!< A percentage value which determines when new features can be recognized as new background. (Typically 0.9).
float minArea; //!< Discard foreground blobs whose bounding box is smaller than this threshold.
//! default Params
FGDParams();
};
/** @brief Creates FGD Background Subtractor
@param params Algorithm's parameters. See @cite FGD2003 for explanation.
*/
CV_EXPORTS Ptr<cuda::BackgroundSubtractorFGD>
createBackgroundSubtractorFGD(const FGDParams& params = FGDParams());
//! @}
}} // namespace cv { namespace cuda {
#endif /* __OPENCV_CUDABGSEGM_HPP__ */

View File

@ -50,8 +50,18 @@
#include "opencv2/core/cuda.hpp"
/**
@addtogroup cuda
@{
@defgroup cudacodec Video Encoding/Decoding
@}
*/
namespace cv { namespace cudacodec {
//! @addtogroup cudacodec
//! @{
////////////////////////////////// Video Encoding //////////////////////////////////
// Works only under Windows.
@ -68,35 +78,53 @@ enum SurfaceFormat
SF_GRAY = SF_BGR
};
/** @brief Different parameters for CUDA video encoder.
*/
struct CV_EXPORTS EncoderParams
{
int P_Interval; //!< NVVE_P_INTERVAL,
int IDR_Period; //!< NVVE_IDR_PERIOD,
int DynamicGOP; //!< NVVE_DYNAMIC_GOP,
int RCType; //!< NVVE_RC_TYPE,
int AvgBitrate; //!< NVVE_AVG_BITRATE,
int PeakBitrate; //!< NVVE_PEAK_BITRATE,
int QP_Level_Intra; //!< NVVE_QP_LEVEL_INTRA,
int QP_Level_InterP; //!< NVVE_QP_LEVEL_INTER_P,
int QP_Level_InterB; //!< NVVE_QP_LEVEL_INTER_B,
int DeblockMode; //!< NVVE_DEBLOCK_MODE,
int ProfileLevel; //!< NVVE_PROFILE_LEVEL,
int ForceIntra; //!< NVVE_FORCE_INTRA,
int ForceIDR; //!< NVVE_FORCE_IDR,
int ClearStat; //!< NVVE_CLEAR_STAT,
int DIMode; //!< NVVE_SET_DEINTERLACE,
int Presets; //!< NVVE_PRESETS,
int DisableCabac; //!< NVVE_DISABLE_CABAC,
int NaluFramingType; //!< NVVE_CONFIGURE_NALU_FRAMING_TYPE
int DisableSPSPPS; //!< NVVE_DISABLE_SPS_PPS
EncoderParams();
/** @brief Constructors.
@param configFile Config file name.
Creates default parameters or reads parameters from config file.
*/
explicit EncoderParams(const String& configFile);
/** @brief Reads parameters from config file.
@param configFile Config file name.
*/
void load(const String& configFile);
/** @brief Saves parameters to config file.
@param configFile Config file name.
*/
void save(const String& configFile) const;
};
/** @brief Callbacks for CUDA video encoder.
*/
class CV_EXPORTS EncoderCallBack
{
public:
@ -109,41 +137,109 @@ public:
virtual ~EncoderCallBack() {}
/** @brief Callback function to signal the start of bitstream that is to be encoded.
Callback must allocate a buffer for the CUDA encoder and return a pointer to it and its size.
*/
virtual uchar* acquireBitStream(int* bufferSize) = 0;
/** @brief Callback function to signal that the encoded bitstream is ready to be written to file.
*/
virtual void releaseBitStream(unsigned char* data, int size) = 0;
/** @brief Callback function to signal that the encoding operation on the frame has started.
@param frameNumber
@param picType Specify frame type (I-Frame, P-Frame or B-Frame).
*/
virtual void onBeginFrame(int frameNumber, PicType picType) = 0;
/** @brief Callback function signals that the encoding operation on the frame has finished.
@param frameNumber
@param picType Specify frame type (I-Frame, P-Frame or B-Frame).
*/
virtual void onEndFrame(int frameNumber, PicType picType) = 0;
};
/** @brief Video writer interface.
The implementation uses H264 video codec.
@note Currently only Windows platform is supported.
@note
- An example on how to use the videoWriter class can be found at
opencv\_source\_code/samples/gpu/video\_writer.cpp
*/
class CV_EXPORTS VideoWriter
{
public:
virtual ~VideoWriter() {}
/** @brief Writes the next video frame.
@param frame The written frame.
@param lastFrame Indicates that it is end of stream. The parameter can be ignored.
The method writes the specified image to the video file. The image must have the same size and the same
surface format as has been specified when opening the video writer.
*/
virtual void write(InputArray frame, bool lastFrame = false) = 0;
virtual EncoderParams getEncoderParams() const = 0;
};
/** @brief Creates video writer.
@param fileName Name of the output video file. Only AVI file format is supported.
@param frameSize Size of the input video frames.
@param fps Framerate of the created video stream.
@param format Surface format of input frames ( SF\_UYVY , SF\_YUY2 , SF\_YV12 , SF\_NV12 ,
SF\_IYUV , SF\_BGR or SF\_GRAY). BGR or gray frames will be converted to YV12 format before
encoding, frames with other formats will be used as is.
The constructors initialize the video writer. FFMPEG is used to write videos. Users can implement
their own multiplexing with cudacodec::EncoderCallBack.
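A sketch writing GPU frames to an AVI file (Windows only; the file name is hypothetical):
@code
cv::Ptr<cv::cudacodec::VideoWriter> writer =
    cv::cudacodec::createVideoWriter("output.avi", cv::Size(1920, 1080), 25.0);
writer->write(d_frame); // d_frame is an assumed GpuMat in a supported surface format
@endcode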
*/
CV_EXPORTS Ptr<VideoWriter> createVideoWriter(const String& fileName, Size frameSize, double fps, SurfaceFormat format = SF_BGR);
/** @overload
@param fileName Name of the output video file. Only AVI file format is supported.
@param frameSize Size of the input video frames.
@param fps Framerate of the created video stream.
@param params Encoder parameters. See cudacodec::EncoderParams.
@param format Surface format of input frames ( SF\_UYVY , SF\_YUY2 , SF\_YV12 , SF\_NV12 ,
SF\_IYUV , SF\_BGR or SF\_GRAY). BGR or gray frames will be converted to YV12 format before
encoding, frames with other formats will be used as is.
*/
CV_EXPORTS Ptr<VideoWriter> createVideoWriter(const String& fileName, Size frameSize, double fps, const EncoderParams& params, SurfaceFormat format = SF_BGR);
/** @overload
@param encoderCallback Callbacks for video encoder. See cudacodec::EncoderCallBack. Use it if you
want to work with raw video stream.
@param frameSize Size of the input video frames.
@param fps Framerate of the created video stream.
@param format Surface format of input frames ( SF\_UYVY , SF\_YUY2 , SF\_YV12 , SF\_NV12 ,
SF\_IYUV , SF\_BGR or SF\_GRAY). BGR or gray frames will be converted to YV12 format before
encoding, frames with other formats will be used as is.
*/
CV_EXPORTS Ptr<VideoWriter> createVideoWriter(const Ptr<EncoderCallBack>& encoderCallback, Size frameSize, double fps, SurfaceFormat format = SF_BGR);
/** @overload
@param encoderCallback Callbacks for video encoder. See cudacodec::EncoderCallBack. Use it if you
want to work with raw video stream.
@param frameSize Size of the input video frames.
@param fps Framerate of the created video stream.
@param params Encoder parameters. See cudacodec::EncoderParams.
@param format Surface format of input frames ( SF\_UYVY , SF\_YUY2 , SF\_YV12 , SF\_NV12 ,
SF\_IYUV , SF\_BGR or SF\_GRAY). BGR or gray frames will be converted to YV12 format before
encoding, frames with other formats will be used as is.
*/
CV_EXPORTS Ptr<VideoWriter> createVideoWriter(const Ptr<EncoderCallBack>& encoderCallback, Size frameSize, double fps, const EncoderParams& params, SurfaceFormat format = SF_BGR);
////////////////////////////////// Video Decoding //////////////////////////////////////////
/** @brief Video codecs supported by cudacodec::VideoReader .
*/
enum Codec
{
MPEG1 = 0,
@ -155,13 +251,15 @@ enum Codec
H264_SVC,
H264_MVC,
Uncompressed_YUV420 = (('I'<<24)|('Y'<<16)|('U'<<8)|('V')), //!< Y,U,V (4:2:0)
Uncompressed_YV12 = (('Y'<<24)|('V'<<16)|('1'<<8)|('2')), //!< Y,V,U (4:2:0)
Uncompressed_NV12 = (('N'<<24)|('V'<<16)|('1'<<8)|('2')), //!< Y,UV (4:2:0)
Uncompressed_YUYV = (('Y'<<24)|('U'<<16)|('Y'<<8)|('V')), //!< YUYV/YUY2 (4:2:2)
Uncompressed_UYVY = (('U'<<24)|('Y'<<16)|('V'<<8)|('Y')) //!< UYVY (4:2:2)
};
/** @brief Chroma formats supported by cudacodec::VideoReader .
*/
enum ChromaFormat
{
Monochrome = 0,
@ -170,6 +268,8 @@ enum ChromaFormat
YUV444
};
/** @brief Struct providing information about video file format.
*/
struct FormatInfo
{
Codec codec;
@ -178,29 +278,65 @@ struct FormatInfo
int height;
};
/** @brief Video reader interface.
@note
- An example on how to use the videoReader class can be found at
opencv\_source\_code/samples/gpu/video\_reader.cpp
*/
class CV_EXPORTS VideoReader
{
public:
virtual ~VideoReader() {}
/** @brief Grabs, decodes and returns the next video frame.
If no frame has been grabbed (there are no more frames in the video file), the method returns false.
The method throws an Exception if an error occurs.
*/
virtual bool nextFrame(OutputArray frame) = 0;
/** @brief Returns information about video file format.
*/
virtual FormatInfo format() const = 0;
};
/** @brief Interface for video demultiplexing.
Users can implement their own demultiplexing by implementing this interface.
*/
class CV_EXPORTS RawVideoSource
{
public:
virtual ~RawVideoSource() {}
/** @brief Returns next packet with RAW video frame.
@param data Pointer to frame data.
@param size Size in bytes of current frame.
@param endOfFile Indicates that it is end of stream.
*/
virtual bool getNextPacket(unsigned char** data, int* size, bool* endOfFile) = 0;
/** @brief Returns information about video file format.
*/
virtual FormatInfo format() const = 0;
};
/** @brief Creates video reader.
@param filename Name of the input video file.
FFMPEG is used to read videos. Users can implement their own demultiplexing with cudacodec::RawVideoSource.
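A reading-loop sketch (the file name is hypothetical):
@code
cv::Ptr<cv::cudacodec::VideoReader> reader = cv::cudacodec::createVideoReader("video.avi");
cv::cuda::GpuMat d_frame;
while (reader->nextFrame(d_frame))
{
    // ... process d_frame on the GPU ...
}
@endcode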
*/
CV_EXPORTS Ptr<VideoReader> createVideoReader(const String& filename);
/** @overload
@param source RAW video source implemented by user.
*/
CV_EXPORTS Ptr<VideoReader> createVideoReader(const Ptr<RawVideoSource>& source);
//! @}
}} // namespace cv { namespace cudacodec {
#endif /* __OPENCV_CUDACODEC_HPP__ */

View File

@ -50,150 +50,175 @@
#include "opencv2/core/cuda.hpp"
#include "opencv2/cudafilters.hpp"
/**
@addtogroup cuda
@{
@defgroup cudafeatures2d Feature Detection and Description
@}
*/
namespace cv { namespace cuda {
//! @addtogroup cudafeatures2d
//! @{
/** @brief Brute-force descriptor matcher.
For each descriptor in the first set, this matcher finds the closest descriptor in the second set
by trying each one. This descriptor matcher supports masking permissible matches between descriptor
sets.
The class BFMatcher\_CUDA has an interface similar to the class DescriptorMatcher. It has two groups
of match methods: for matching descriptors of one image with another image or with an image set.
Also, all functions have an alternative to save results either to the GPU memory or to the CPU
memory.
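A matching sketch (d_query and d_train are assumed GpuMat descriptor matrices):
@code
cv::cuda::BFMatcher_CUDA matcher(cv::NORM_L2);
std::vector<cv::DMatch> matches;
matcher.match(d_query, d_train, matches); // one best match per query descriptor
@endcode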
@sa DescriptorMatcher, BFMatcher
*/
class CV_EXPORTS BFMatcher_CUDA
{
public:
explicit BFMatcher_CUDA(int norm = cv::NORM_L2);
//! Add descriptors to train descriptor collection
void add(const std::vector<GpuMat>& descCollection);
//! Get train descriptors collection
const std::vector<GpuMat>& getTrainDescriptors() const;
//! Clear train descriptors collection
void clear();
//! Return true if there are no train descriptors in collection
bool empty() const;
//! Return true if the matcher supports mask in match methods
bool isMaskSupported() const;
//! Find one best match for each query descriptor
void matchSingle(const GpuMat& query, const GpuMat& train,
GpuMat& trainIdx, GpuMat& distance,
const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null());
//! Download trainIdx and distance and convert it to CPU vector with DMatch
static void matchDownload(const GpuMat& trainIdx, const GpuMat& distance, std::vector<DMatch>& matches);
//! Convert trainIdx and distance to vector with DMatch
static void matchConvert(const Mat& trainIdx, const Mat& distance, std::vector<DMatch>& matches);
//! Find one best match for each query descriptor
void match(const GpuMat& query, const GpuMat& train, std::vector<DMatch>& matches, const GpuMat& mask = GpuMat());
//! Make gpu collection of trains and masks in suitable format for matchCollection function
void makeGpuCollection(GpuMat& trainCollection, GpuMat& maskCollection, const std::vector<GpuMat>& masks = std::vector<GpuMat>());
//! Find one best match from train collection for each query descriptor
void matchCollection(const GpuMat& query, const GpuMat& trainCollection,
GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance,
const GpuMat& masks = GpuMat(), Stream& stream = Stream::Null());
//! Download trainIdx, imgIdx and distance and convert it to vector with DMatch
static void matchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, std::vector<DMatch>& matches);
//! Convert trainIdx, imgIdx and distance to vector with DMatch
static void matchConvert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance, std::vector<DMatch>& matches);
//! Find one best match from train collection for each query descriptor.
void match(const GpuMat& query, std::vector<DMatch>& matches, const std::vector<GpuMat>& masks = std::vector<GpuMat>());
//! Find k best matches for each query descriptor (in increasing order of distances)
void knnMatchSingle(const GpuMat& query, const GpuMat& train,
GpuMat& trainIdx, GpuMat& distance, GpuMat& allDist, int k,
const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null());
//! Download trainIdx and distance and convert it to vector with DMatch
//! compactResult is used when mask is not empty. If compactResult is false matches
//! vector will have the same size as queryDescriptors rows. If compactResult is true
//! matches vector will not contain matches for fully masked out query descriptors.
static void knnMatchDownload(const GpuMat& trainIdx, const GpuMat& distance,
std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
//! Convert trainIdx and distance to vector with DMatch
static void knnMatchConvert(const Mat& trainIdx, const Mat& distance,
std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
//! Find k best matches for each query descriptor (in increasing order of distances).
//! compactResult is used when mask is not empty. If compactResult is false matches
//! vector will have the same size as queryDescriptors rows. If compactResult is true
//! matches vector will not contain matches for fully masked out query descriptors.
void knnMatch(const GpuMat& query, const GpuMat& train,
std::vector< std::vector<DMatch> >& matches, int k, const GpuMat& mask = GpuMat(),
bool compactResult = false);
//! Find k best matches from train collection for each query descriptor (in increasing order of distances)
void knnMatch2Collection(const GpuMat& query, const GpuMat& trainCollection,
GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance,
const GpuMat& maskCollection = GpuMat(), Stream& stream = Stream::Null());
//! Download trainIdx and distance and convert it to vector with DMatch
//! compactResult is used when mask is not empty. If compactResult is false matches
//! vector will have the same size as queryDescriptors rows. If compactResult is true
//! matches vector will not contain matches for fully masked out query descriptors.
//! @see BFMatcher_CUDA::knnMatchDownload
static void knnMatch2Download(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance,
std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
// Convert trainIdx and distance to vector with DMatch
//! Convert trainIdx and distance to vector with DMatch
//! @see BFMatcher_CUDA::knnMatchConvert
static void knnMatch2Convert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance,
std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
// Find k best matches for each query descriptor (in increasing order of distances).
// compactResult is used when mask is not empty. If compactResult is false matches
// vector will have the same size as queryDescriptors rows. If compactResult is true
// matches vector will not contain matches for fully masked out query descriptors.
//! Find k best matches for each query descriptor (in increasing order of distances).
//! compactResult is used when mask is not empty. If compactResult is false matches
//! vector will have the same size as queryDescriptors rows. If compactResult is true
//! matches vector will not contain matches for fully masked out query descriptors.
void knnMatch(const GpuMat& query, std::vector< std::vector<DMatch> >& matches, int k,
const std::vector<GpuMat>& masks = std::vector<GpuMat>(), bool compactResult = false);
// Find best matches for each query descriptor which have distance less than maxDistance.
// nMatches.at<int>(0, queryIdx) will contain matches count for queryIdx.
// carefully nMatches can be greater than trainIdx.cols - it means that matcher didn't find all matches,
// because it didn't have enough memory.
// If trainIdx is empty, then trainIdx and distance will be created with size nQuery x max((nTrain / 100), 10),
// otherwize user can pass own allocated trainIdx and distance with size nQuery x nMaxMatches
// Matches doesn't sorted.
//! Find best matches for each query descriptor which have distance less than maxDistance.
//! nMatches.at<int>(0, queryIdx) will contain matches count for queryIdx.
//! Beware: nMatches can be greater than trainIdx.cols - this means that the matcher didn't find all matches
//! because it didn't have enough memory.
//! If trainIdx is empty, then trainIdx and distance will be created with size nQuery x max((nTrain / 100), 10),
//! otherwise the user can pass their own allocated trainIdx and distance with size nQuery x nMaxMatches.
//! Matches are not sorted.
void radiusMatchSingle(const GpuMat& query, const GpuMat& train,
GpuMat& trainIdx, GpuMat& distance, GpuMat& nMatches, float maxDistance,
const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null());
// Download trainIdx, nMatches and distance and convert it to vector with DMatch.
// matches will be sorted in increasing order of distances.
// compactResult is used when mask is not empty. If compactResult is false matches
// vector will have the same size as queryDescriptors rows. If compactResult is true
// matches vector will not contain matches for fully masked out query descriptors.
//! Download trainIdx, nMatches and distance and convert it to vector with DMatch.
//! matches will be sorted in increasing order of distances.
//! compactResult is used when mask is not empty. If compactResult is false matches
//! vector will have the same size as queryDescriptors rows. If compactResult is true
//! matches vector will not contain matches for fully masked out query descriptors.
static void radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& distance, const GpuMat& nMatches,
std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
// Convert trainIdx, nMatches and distance to vector with DMatch.
//! Convert trainIdx, nMatches and distance to vector with DMatch.
static void radiusMatchConvert(const Mat& trainIdx, const Mat& distance, const Mat& nMatches,
std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
// Find best matches for each query descriptor which have distance less than maxDistance
// in increasing order of distances).
//! Find best matches for each query descriptor which have distance less than maxDistance
//! in increasing order of distances).
void radiusMatch(const GpuMat& query, const GpuMat& train,
std::vector< std::vector<DMatch> >& matches, float maxDistance,
const GpuMat& mask = GpuMat(), bool compactResult = false);
// Find best matches for each query descriptor which have distance less than maxDistance.
// If trainIdx is empty, then trainIdx and distance will be created with size nQuery x max((nQuery / 100), 10),
// otherwize user can pass own allocated trainIdx and distance with size nQuery x nMaxMatches
// Matches doesn't sorted.
//! Find best matches for each query descriptor which have distance less than maxDistance.
//! If trainIdx is empty, then trainIdx and distance will be created with size nQuery x max((nQuery / 100), 10),
//! otherwise the user can pass their own allocated trainIdx and distance with size nQuery x nMaxMatches.
//! Matches are not sorted.
void radiusMatchCollection(const GpuMat& query, GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, GpuMat& nMatches, float maxDistance,
const std::vector<GpuMat>& masks = std::vector<GpuMat>(), Stream& stream = Stream::Null());
// Download trainIdx, imgIdx, nMatches and distance and convert it to vector with DMatch.
// matches will be sorted in increasing order of distances.
// compactResult is used when mask is not empty. If compactResult is false matches
// vector will have the same size as queryDescriptors rows. If compactResult is true
// matches vector will not contain matches for fully masked out query descriptors.
//! Download trainIdx, imgIdx, nMatches and distance and convert it to vector with DMatch.
//! matches will be sorted in increasing order of distances.
//! compactResult is used when mask is not empty. If compactResult is false matches
//! vector will have the same size as queryDescriptors rows. If compactResult is true
//! matches vector will not contain matches for fully masked out query descriptors.
static void radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, const GpuMat& nMatches,
std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
// Convert trainIdx, nMatches and distance to vector with DMatch.
//! Convert trainIdx, nMatches and distance to vector with DMatch.
static void radiusMatchConvert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance, const Mat& nMatches,
std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
// Find best matches from train collection for each query descriptor which have distance less than
// maxDistance (in increasing order of distances).
//! Find best matches from train collection for each query descriptor which have distance less than
//! maxDistance (in increasing order of distances).
void radiusMatch(const GpuMat& query, std::vector< std::vector<DMatch> >& matches, float maxDistance,
const std::vector<GpuMat>& masks = std::vector<GpuMat>(), bool compactResult = false);
@ -203,6 +228,8 @@ private:
std::vector<GpuMat> trainDescCollection;
};
/** @brief Class used for corner detection using the FAST algorithm.
*/
class CV_EXPORTS FAST_CUDA
{
public:
@ -213,23 +240,45 @@ public:
ROWS_COUNT
};
// all features have same size
//! all features have the same size
static const int FEATURE_SIZE = 7;
/** @brief Constructor.
@param threshold Threshold on difference between intensity of the central pixel and pixels on a
circle around this pixel.
@param nonmaxSuppression If it is true, non-maximum suppression is applied to detected corners
(keypoints).
@param keypointsRatio Inner buffer size for keypoints store is determined as (keypointsRatio \*
image\_width \* image\_height).
*/
explicit FAST_CUDA(int threshold, bool nonmaxSuppression = true, double keypointsRatio = 0.05);
//! finds the keypoints using FAST detector
//! supports only CV_8UC1 images
/** @brief Finds the keypoints using FAST detector.
@param image Image where keypoints (corners) are detected. Only 8-bit grayscale images are
supported.
@param mask Optional input mask that marks the regions where we should detect features.
@param keypoints The output vector of keypoints. Can be stored both in CPU and GPU memory. For GPU
memory:
- keypoints.ptr\<Vec2s\>(LOCATION\_ROW)[i] will contain location of i'th point
- keypoints.ptr\<float\>(RESPONSE\_ROW)[i] will contain response of i'th point (if non-maximum
suppression is applied)
*/
void operator ()(const GpuMat& image, const GpuMat& mask, GpuMat& keypoints);
/** @overload */
void operator ()(const GpuMat& image, const GpuMat& mask, std::vector<KeyPoint>& keypoints);
//! download keypoints from device to host memory
/** @brief Download keypoints from GPU to CPU memory.
*/
static void downloadKeypoints(const GpuMat& d_keypoints, std::vector<KeyPoint>& keypoints);
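A minimal detection sketch, assuming h_img is a CV_8UC1 cv::Mat prepared by the caller:

    cv::cuda::GpuMat d_img;
    d_img.upload(h_img);
    cv::cuda::FAST_CUDA fast(20);                        // threshold = 20, nonmax suppression on
    std::vector<cv::KeyPoint> keypoints;
    fast(d_img, cv::cuda::GpuMat(), keypoints);          // empty mask: detect everywhere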
//! convert keypoints to KeyPoint vector
/** @brief Converts keypoints from CUDA representation to vector of KeyPoint.
*/
static void convertKeypoints(const Mat& h_keypoints, std::vector<KeyPoint>& keypoints);
//! release temporary buffer's memory
/** @brief Releases inner buffer memory.
*/
void release();
bool nonmaxSuppression;
@ -239,13 +288,22 @@ public:
//! max keypoints = keypointsRatio * img.size().area()
double keypointsRatio;
//! find keypoints and compute it's response if nonmaxSuppression is true
//! return count of detected keypoints
/** @brief Finds keypoints and computes their response if nonmaxSuppression is true.
@param image Image where keypoints (corners) are detected. Only 8-bit grayscale images are
supported.
@param mask Optional input mask that marks the regions where we should detect features.
The function returns count of detected keypoints.
*/
int calcKeyPointsLocation(const GpuMat& image, const GpuMat& mask);
//! get final array of keypoints
//! performs nonmax suppression if needed
//! return final count of keypoints
/** @brief Gets final array of keypoints.
@param keypoints The output vector of keypoints.
The function performs non-max suppression if needed and returns final count of keypoints.
*/
int getKeyPoints(GpuMat& keypoints);
private:
@ -257,6 +315,8 @@ private:
GpuMat d_keypoints_;
};
/** @brief Class for extracting ORB features and descriptors from an image.
*/
class CV_EXPORTS ORB_CUDA
{
public:
@ -276,28 +336,51 @@ public:
DEFAULT_FAST_THRESHOLD = 20
};
//! Constructor
/** @brief Constructor.
@param nFeatures The number of desired features.
@param scaleFactor Coefficient by which we divide the dimensions from one scale pyramid level to
the next.
@param nLevels The number of levels in the scale pyramid.
@param edgeThreshold How far from the boundary the points should be.
@param firstLevel The level at which the image is given. If 1, that means we will also look at the
image scaleFactor times bigger.
@param WTA_K The number of points that produce each element of the oriented BRIEF descriptor.
@param scoreType Type of the score used to rank features. The default value 0 corresponds to the
Harris score.
@param patchSize Size of the patch used by the oriented BRIEF descriptor.
*/
explicit ORB_CUDA(int nFeatures = 500, float scaleFactor = 1.2f, int nLevels = 8, int edgeThreshold = 31,
int firstLevel = 0, int WTA_K = 2, int scoreType = 0, int patchSize = 31);
//! Compute the ORB features on an image
//! image - the image to compute the features (supports only CV_8UC1 images)
//! mask - the mask to apply
//! keypoints - the resulting keypoints
/** @overload */
void operator()(const GpuMat& image, const GpuMat& mask, std::vector<KeyPoint>& keypoints);
/** @overload */
void operator()(const GpuMat& image, const GpuMat& mask, GpuMat& keypoints);
//! Compute the ORB features and descriptors on an image
//! image - the image to compute the features (supports only CV_8UC1 images)
//! mask - the mask to apply
//! keypoints - the resulting keypoints
//! descriptors - descriptors array
/** @brief Detects keypoints and computes descriptors for them.
@param image Input 8-bit grayscale image.
@param mask Optional input mask that marks the regions where we should detect features.
@param keypoints The input/output vector of keypoints. Can be stored both in CPU and GPU memory.
For GPU memory:
- keypoints.ptr\<float\>(X\_ROW)[i] contains x coordinate of the i'th feature.
- keypoints.ptr\<float\>(Y\_ROW)[i] contains y coordinate of the i'th feature.
- keypoints.ptr\<float\>(RESPONSE\_ROW)[i] contains the response of the i'th feature.
- keypoints.ptr\<float\>(ANGLE\_ROW)[i] contains orientation of the i'th feature.
- keypoints.ptr\<float\>(OCTAVE\_ROW)[i] contains the octave of the i'th feature.
- keypoints.ptr\<float\>(SIZE\_ROW)[i] contains the size of the i'th feature.
@param descriptors Computed descriptors. If blurForDescriptor is true, the image will be blurred
before descriptors calculation.
*/
void operator()(const GpuMat& image, const GpuMat& mask, std::vector<KeyPoint>& keypoints, GpuMat& descriptors);
/** @overload */
void operator()(const GpuMat& image, const GpuMat& mask, GpuMat& keypoints, GpuMat& descriptors);
//! download keypoints from device to host memory
/** @brief Download keypoints from GPU to CPU memory.
*/
static void downloadKeyPoints(const GpuMat& d_keypoints, std::vector<KeyPoint>& keypoints);
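A minimal sketch of detection plus description on the device, assuming d_img is a CV_8UC1 GpuMat:

    cv::cuda::ORB_CUDA orb(500);                         // up to 500 features
    cv::cuda::GpuMat d_kpts, d_desc;
    orb(d_img, cv::cuda::GpuMat(), d_kpts, d_desc);      // keypoints and descriptors stay on the GPU
    std::vector<cv::KeyPoint> kpts;
    cv::cuda::ORB_CUDA::downloadKeyPoints(d_kpts, kpts);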
//! convert keypoints to KeyPoint vector
/** @brief Converts keypoints from CUDA representation to vector of KeyPoint.
*/
static void convertKeyPoints(const Mat& d_keypoints, std::vector<KeyPoint>& keypoints);
//! returns the descriptor size in bytes
@ -309,7 +392,8 @@ public:
fastDetector_.nonmaxSuppression = nonmaxSuppression;
}
//! release temporary buffer's memory
/** @brief Releases inner buffer memory.
*/
void release();
//! if true, image will be blurred before descriptors calculation
@ -335,10 +419,10 @@ private:
int scoreType_;
int patchSize_;
// The number of desired features per scale
//! The number of desired features per scale
std::vector<size_t> n_features_per_level_;
// Points to compute BRIEF descriptors from
//! Points to compute BRIEF descriptors from
GpuMat pattern_;
std::vector<GpuMat> imagePyr_;
@ -356,6 +440,8 @@ private:
GpuMat d_keypoints_;
};
//! @}
}} // namespace cv { namespace cuda {
#endif /* __OPENCV_CUDAFEATURES2D_HPP__ */

View File

@ -50,65 +50,189 @@
#include "opencv2/core/cuda.hpp"
#include "opencv2/imgproc.hpp"
/**
@addtogroup cuda
@{
@defgroup cudafilters Image Filtering
Functions and classes described in this section are used to perform various linear or non-linear
filtering operations on 2D images.
@note
- An example containing all basic morphology operators like erode and dilate can be found at
opencv\_source\_code/samples/gpu/morphology.cpp
@}
*/
namespace cv { namespace cuda {
//! @addtogroup cudafilters
//! @{
/** @brief Common interface for all CUDA filters.
*/
class CV_EXPORTS Filter : public Algorithm
{
public:
/** @brief Applies the specified filter to the image.
@param src Input image.
@param dst Output image.
@param stream Stream for the asynchronous version.
*/
virtual void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null()) = 0;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
// Box Filter
//! creates a normalized 2D box filter
//! supports CV_8UC1, CV_8UC4 types
/** @brief Creates a normalized 2D box filter.
@param srcType Input image type. Only CV\_8UC1 and CV\_8UC4 are supported for now.
@param dstType Output image type. Only the same type as src is supported for now.
@param ksize Kernel size.
@param anchor Anchor point. The default value Point(-1, -1) means that the anchor is at the kernel
center.
@param borderMode Pixel extrapolation method. For details, see borderInterpolate .
@param borderVal Default border value.
@sa boxFilter
*/
CV_EXPORTS Ptr<Filter> createBoxFilter(int srcType, int dstType, Size ksize, Point anchor = Point(-1,-1),
int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
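A minimal sketch, assuming d_src is a CV_8UC1 GpuMat:

    cv::Ptr<cv::cuda::Filter> box =
        cv::cuda::createBoxFilter(CV_8UC1, CV_8UC1, cv::Size(5, 5));
    cv::cuda::GpuMat d_dst;
    box->apply(d_src, d_dst);                            // 5x5 normalized box blur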
////////////////////////////////////////////////////////////////////////////////////////////////////
// Linear Filter
//! Creates a non-separable linear 2D filter
//! supports 1 and 4 channel CV_8U, CV_16U and CV_32F input
/** @brief Creates a non-separable linear 2D filter.
@param srcType Input image type. Supports one and four channel CV\_8U , CV\_16U and CV\_32F images.
@param dstType Output image type. Only the same type as src is supported for now.
@param kernel 2D array of filter coefficients.
@param anchor Anchor point. The default value Point(-1, -1) means that the anchor is at the kernel
center.
@param borderMode Pixel extrapolation method. For details, see borderInterpolate .
@param borderVal Default border value.
@sa filter2D
*/
CV_EXPORTS Ptr<Filter> createLinearFilter(int srcType, int dstType, InputArray kernel, Point anchor = Point(-1,-1),
int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
////////////////////////////////////////////////////////////////////////////////////////////////////
// Laplacian Filter
//! creates a Laplacian operator
//! supports only ksize = 1 and ksize = 3
/** @brief Creates a Laplacian operator.
@param srcType Input image type. Supports one and four channel CV\_8U , CV\_16U and CV\_32F images.
@param dstType Output image type. Only the same type as src is supported for now.
@param ksize Aperture size used to compute the second-derivative filters (see getDerivKernels). It
must be positive and odd. Only ksize = 1 and ksize = 3 are supported.
@param scale Optional scale factor for the computed Laplacian values. By default, no scaling is
applied (see getDerivKernels ).
@param borderMode Pixel extrapolation method. For details, see borderInterpolate .
@param borderVal Default border value.
@sa Laplacian
*/
CV_EXPORTS Ptr<Filter> createLaplacianFilter(int srcType, int dstType, int ksize = 1, double scale = 1,
int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
////////////////////////////////////////////////////////////////////////////////////////////////////
// Separable Linear Filter
//! creates a separable linear filter
/** @brief Creates a separable linear filter.
@param srcType Source array type.
@param dstType Destination array type.
@param rowKernel Horizontal filter coefficients. Supports kernels with size \<= 32 .
@param columnKernel Vertical filter coefficients. Supports kernels with size \<= 32 .
@param anchor Anchor position within the kernel. Negative values mean that anchor is positioned at
the aperture center.
@param rowBorderMode Pixel extrapolation method in the vertical direction. For details, see
borderInterpolate.
@param columnBorderMode Pixel extrapolation method in the horizontal direction.
@sa sepFilter2D
*/
CV_EXPORTS Ptr<Filter> createSeparableLinearFilter(int srcType, int dstType, InputArray rowKernel, InputArray columnKernel,
Point anchor = Point(-1,-1), int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1);
////////////////////////////////////////////////////////////////////////////////////////////////////
// Deriv Filter
//! creates a generalized Deriv operator
/** @brief Creates a generalized Deriv operator.
@param srcType Source image type.
@param dstType Destination array type.
@param dx Derivative order with respect to x.
@param dy Derivative order with respect to y.
@param ksize Aperture size. See getDerivKernels for details.
@param normalize Flag indicating whether to normalize (scale down) the filter coefficients or not.
See getDerivKernels for details.
@param scale Optional scale factor for the computed derivative values. By default, no scaling is
applied. For details, see getDerivKernels .
@param rowBorderMode Pixel extrapolation method in the vertical direction. For details, see
borderInterpolate.
@param columnBorderMode Pixel extrapolation method in the horizontal direction.
*/
CV_EXPORTS Ptr<Filter> createDerivFilter(int srcType, int dstType, int dx, int dy,
int ksize, bool normalize = false, double scale = 1,
int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1);
//! creates a Sobel operator
/** @brief Creates a Sobel operator.
@param srcType Source image type.
@param dstType Destination array type.
@param dx Derivative order with respect to x.
@param dy Derivative order with respect to y.
@param ksize Size of the extended Sobel kernel. Possible values are 1, 3, 5 or 7.
@param scale Optional scale factor for the computed derivative values. By default, no scaling is
applied. For details, see getDerivKernels .
@param rowBorderMode Pixel extrapolation method in the vertical direction. For details, see
borderInterpolate.
@param columnBorderMode Pixel extrapolation method in the horizontal direction.
@sa Sobel
*/
CV_EXPORTS Ptr<Filter> createSobelFilter(int srcType, int dstType, int dx, int dy, int ksize = 3,
double scale = 1, int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1);
//! creates a vertical or horizontal Scharr operator
/** @brief Creates a vertical or horizontal Scharr operator.
@param srcType Source image type.
@param dstType Destination array type.
@param dx Order of the derivative in x.
@param dy Order of the derivative in y.
@param scale Optional scale factor for the computed derivative values. By default, no scaling is
applied. See getDerivKernels for details.
@param rowBorderMode Pixel extrapolation method in the vertical direction. For details, see
borderInterpolate.
@param columnBorderMode Pixel extrapolation method in the horizontal direction.
@sa Scharr
*/
CV_EXPORTS Ptr<Filter> createScharrFilter(int srcType, int dstType, int dx, int dy,
double scale = 1, int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1);
////////////////////////////////////////////////////////////////////////////////////////////////////
// Gaussian Filter
//! creates a Gaussian filter
/** @brief Creates a Gaussian filter.
@param srcType Source image type.
@param dstType Destination array type.
@param ksize Aperture size. See getGaussianKernel for details.
@param sigma1 Gaussian sigma in the horizontal direction. See getGaussianKernel for details.
@param sigma2 Gaussian sigma in the vertical direction. If 0, then
\f$\texttt{sigma2}\leftarrow\texttt{sigma1}\f$ .
@param rowBorderMode Pixel extrapolation method in the vertical direction. For details, see
borderInterpolate.
@param columnBorderMode Pixel extrapolation method in the horizontal direction.
@sa GaussianBlur
*/
CV_EXPORTS Ptr<Filter> createGaussianFilter(int srcType, int dstType, Size ksize,
double sigma1, double sigma2 = 0,
int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1);
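A minimal sketch, assuming d_src is a CV_8UC1 GpuMat:

    cv::Ptr<cv::cuda::Filter> gauss =
        cv::cuda::createGaussianFilter(CV_8UC1, CV_8UC1, cv::Size(5, 5), 1.5);
    cv::cuda::GpuMat d_dst;
    gauss->apply(d_src, d_dst);                          // sigma2 defaults to sigma1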
@ -116,19 +240,49 @@ CV_EXPORTS Ptr<Filter> createGaussianFilter(int srcType, int dstType, Size ksize
////////////////////////////////////////////////////////////////////////////////////////////////////
// Morphology Filter
//! creates a 2D morphological filter
//! supports CV_8UC1 and CV_8UC4 types
/** @brief Creates a 2D morphological filter.
@param op Type of morphological operation. The following types are possible:
- **MORPH\_ERODE** erode
- **MORPH\_DILATE** dilate
- **MORPH\_OPEN** opening
- **MORPH\_CLOSE** closing
- **MORPH\_GRADIENT** morphological gradient
- **MORPH\_TOPHAT** "top hat"
- **MORPH\_BLACKHAT** "black hat"
@param srcType Input/output image type. Only CV\_8UC1 and CV\_8UC4 are supported.
@param kernel 2D 8-bit structuring element for the morphological operation.
@param anchor Anchor position within the structuring element. Negative values mean that the anchor
is at the center.
@param iterations Number of times erosion and dilation are applied.
@sa morphologyEx
*/
CV_EXPORTS Ptr<Filter> createMorphologyFilter(int op, int srcType, InputArray kernel, Point anchor = Point(-1, -1), int iterations = 1);
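A minimal erosion sketch, assuming d_src is a CV_8UC1 GpuMat (getStructuringElement comes from imgproc, which this header already includes):

    cv::Mat se = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(3, 3));
    cv::Ptr<cv::cuda::Filter> erode =
        cv::cuda::createMorphologyFilter(cv::MORPH_ERODE, CV_8UC1, se);
    cv::cuda::GpuMat d_dst;
    erode->apply(d_src, d_dst);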
////////////////////////////////////////////////////////////////////////////////////////////////////
// Image Rank Filter
//! result pixel value is the maximum of pixel values under the rectangular mask region
/** @brief Creates the maximum filter.
@param srcType Input/output image type. Only CV\_8UC1 and CV\_8UC4 are supported.
@param ksize Kernel size.
@param anchor Anchor point. The default value (-1) means that the anchor is at the kernel center.
@param borderMode Pixel extrapolation method. For details, see borderInterpolate .
@param borderVal Default border value.
*/
CV_EXPORTS Ptr<Filter> createBoxMaxFilter(int srcType, Size ksize,
Point anchor = Point(-1, -1),
int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
//! result pixel value is the maximum of pixel values under the rectangular mask region
/** @brief Creates the minimum filter.
@param srcType Input/output image type. Only CV\_8UC1 and CV\_8UC4 are supported.
@param ksize Kernel size.
@param anchor Anchor point. The default value (-1) means that the anchor is at the kernel center.
@param borderMode Pixel extrapolation method. For details, see borderInterpolate .
@param borderVal Default border value.
*/
CV_EXPORTS Ptr<Filter> createBoxMinFilter(int srcType, Size ksize,
Point anchor = Point(-1, -1),
int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
@ -136,14 +290,30 @@ CV_EXPORTS Ptr<Filter> createBoxMinFilter(int srcType, Size ksize,
////////////////////////////////////////////////////////////////////////////////////////////////////
// 1D Sum Filter
//! creates a horizontal 1D box filter
//! supports only CV_8UC1 source type and CV_32FC1 sum type
/** @brief Creates a horizontal 1D box filter.
@param srcType Input image type. Only CV\_8UC1 type is supported for now.
@param dstType Output image type. Only CV\_32FC1 type is supported for now.
@param ksize Kernel size.
@param anchor Anchor point. The default value (-1) means that the anchor is at the kernel center.
@param borderMode Pixel extrapolation method. For details, see borderInterpolate .
@param borderVal Default border value.
*/
CV_EXPORTS Ptr<Filter> createRowSumFilter(int srcType, int dstType, int ksize, int anchor = -1, int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
//! creates a vertical 1D box filter
//! supports only CV_8UC1 sum type and CV_32FC1 dst type
/** @brief Creates a vertical 1D box filter.
@param srcType Input image type. Only CV\_8UC1 type is supported for now.
@param dstType Output image type. Only CV\_32FC1 type is supported for now.
@param ksize Kernel size.
@param anchor Anchor point. The default value (-1) means that the anchor is at the kernel center.
@param borderMode Pixel extrapolation method. For details, see borderInterpolate .
@param borderVal Default border value.
*/
CV_EXPORTS Ptr<Filter> createColumnSumFilter(int srcType, int dstType, int ksize, int anchor = -1, int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
//! @}
}} // namespace cv { namespace cuda {
#endif /* __OPENCV_CUDAFILTERS_HPP__ */

View File

@ -50,16 +50,48 @@
#include "opencv2/core/cuda.hpp"
#include "opencv2/imgproc.hpp"
/**
@addtogroup cuda
@{
@defgroup cudaimgproc Image Processing
@{
@defgroup cudaimgproc_color Color space processing
@defgroup cudaimgproc_hist Histogram Calculation
@defgroup cudaimgproc_hough Hough Transform
@defgroup cudaimgproc_feature Feature Detection
@}
@}
*/
namespace cv { namespace cuda {
//! @addtogroup cudaimgproc
//! @{
/////////////////////////// Color Processing ///////////////////////////
//! converts image from one color space to another
//! @addtogroup cudaimgproc_color
//! @{
/** @brief Converts an image from one color space to another.
@param src Source image with CV\_8U , CV\_16U , or CV\_32F depth and 1, 3, or 4 channels.
@param dst Destination image.
@param code Color space conversion code. For details, see cvtColor .
@param dcn Number of channels in the destination image. If the parameter is 0, the number of the
channels is derived automatically from src and the code .
@param stream Stream for the asynchronous version.
3-channel color spaces (like HSV, XYZ, and so on) can be stored in a 4-channel image for better
performance.
@sa cvtColor
*/
CV_EXPORTS void cvtColor(InputArray src, OutputArray dst, int code, int dcn = 0, Stream& stream = Stream::Null());
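A minimal sketch, assuming d_bgr is a CV_8UC3 GpuMat:

    cv::cuda::GpuMat d_gray;
    cv::cuda::cvtColor(d_bgr, d_gray, cv::COLOR_BGR2GRAY);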
enum
{
// Bayer Demosaicing (Malvar, He, and Cutler)
//! Bayer Demosaicing (Malvar, He, and Cutler)
COLOR_BayerBG2BGR_MHT = 256,
COLOR_BayerGB2BGR_MHT = 257,
COLOR_BayerRG2BGR_MHT = 258,
@ -75,105 +107,228 @@ enum
COLOR_BayerRG2GRAY_MHT = 262,
COLOR_BayerGR2GRAY_MHT = 263
};
/** @brief Converts an image from Bayer pattern to RGB or grayscale.
@param src Source image (8-bit or 16-bit single channel).
@param dst Destination image.
@param code Color space conversion code (see the description below).
@param dcn Number of channels in the destination image. If the parameter is 0, the number of the
channels is derived automatically from src and the code .
@param stream Stream for the asynchronous version.
The function can do the following transformations:
- Demosaicing using bilinear interpolation
> - COLOR\_BayerBG2GRAY , COLOR\_BayerGB2GRAY , COLOR\_BayerRG2GRAY , COLOR\_BayerGR2GRAY
> - COLOR\_BayerBG2BGR , COLOR\_BayerGB2BGR , COLOR\_BayerRG2BGR , COLOR\_BayerGR2BGR
- Demosaicing using Malvar-He-Cutler algorithm (@cite MHT2011)
> - COLOR\_BayerBG2GRAY\_MHT , COLOR\_BayerGB2GRAY\_MHT , COLOR\_BayerRG2GRAY\_MHT ,
> COLOR\_BayerGR2GRAY\_MHT
> - COLOR\_BayerBG2BGR\_MHT , COLOR\_BayerGB2BGR\_MHT , COLOR\_BayerRG2BGR\_MHT ,
> COLOR\_BayerGR2BGR\_MHT
@sa cvtColor
*/
CV_EXPORTS void demosaicing(InputArray src, OutputArray dst, int code, int dcn = -1, Stream& stream = Stream::Null());
//! swap channels
//! dstOrder - Integer array describing how channel values are permutated. The n-th entry
//! of the array contains the number of the channel that is stored in the n-th channel of
//! the output image. E.g. Given an RGBA image, aDstOrder = [3,2,1,0] converts this to ABGR
//! channel order.
/** @brief Exchanges the color channels of an image in-place.
@param image Source image. Supports only CV\_8UC4 type.
@param dstOrder Integer array describing how channel values are permutated. The n-th entry of the
array contains the number of the channel that is stored in the n-th channel of the output image.
E.g. Given an RGBA image, aDstOrder = [3,2,1,0] converts this to ABGR channel order.
@param stream Stream for the asynchronous version.
The function supports arbitrary permutations of the original channels, including replication.
*/
CV_EXPORTS void swapChannels(InputOutputArray image, const int dstOrder[4], Stream& stream = Stream::Null());
//! Routines for correcting image color gamma
/** @brief Performs gamma correction on an image.
@param src Source image (3- or 4-channel 8 bit).
@param dst Destination image.
@param forward true for forward gamma correction or false for inverse gamma correction.
@param stream Stream for the asynchronous version.
*/
CV_EXPORTS void gammaCorrection(InputArray src, OutputArray dst, bool forward = true, Stream& stream = Stream::Null());
enum { ALPHA_OVER, ALPHA_IN, ALPHA_OUT, ALPHA_ATOP, ALPHA_XOR, ALPHA_PLUS, ALPHA_OVER_PREMUL, ALPHA_IN_PREMUL, ALPHA_OUT_PREMUL,
ALPHA_ATOP_PREMUL, ALPHA_XOR_PREMUL, ALPHA_PLUS_PREMUL, ALPHA_PREMUL};
//! Composite two images using alpha opacity values contained in each image
//! Supports CV_8UC4, CV_16UC4, CV_32SC4 and CV_32FC4 types
/** @brief Composites two images using alpha opacity values contained in each image.
@param img1 First image. Supports CV\_8UC4 , CV\_16UC4 , CV\_32SC4 and CV\_32FC4 types.
@param img2 Second image. Must have the same size and the same type as img1 .
@param dst Destination image.
@param alpha\_op Flag specifying the alpha-blending operation:
- **ALPHA\_OVER**
- **ALPHA\_IN**
- **ALPHA\_OUT**
- **ALPHA\_ATOP**
- **ALPHA\_XOR**
- **ALPHA\_PLUS**
- **ALPHA\_OVER\_PREMUL**
- **ALPHA\_IN\_PREMUL**
- **ALPHA\_OUT\_PREMUL**
- **ALPHA\_ATOP\_PREMUL**
- **ALPHA\_XOR\_PREMUL**
- **ALPHA\_PLUS\_PREMUL**
- **ALPHA\_PREMUL**
@param stream Stream for the asynchronous version.
@note
- An example demonstrating the use of alphaComp can be found at
opencv\_source\_code/samples/gpu/alpha\_comp.cpp
*/
CV_EXPORTS void alphaComp(InputArray img1, InputArray img2, OutputArray dst, int alpha_op, Stream& stream = Stream::Null());
//! @} cudaimgproc_color
////////////////////////////// Histogram ///////////////////////////////
//! Calculates histogram for 8u one channel image
//! Output hist will have one row, 256 cols and CV32SC1 type.
//! @addtogroup cudaimgproc_hist
//! @{
/** @brief Calculates histogram for one channel 8-bit image.
@param src Source image with CV\_8UC1 type.
@param hist Destination histogram with one row, 256 columns, and the CV\_32SC1 type.
@param stream Stream for the asynchronous version.
*/
CV_EXPORTS void calcHist(InputArray src, OutputArray hist, Stream& stream = Stream::Null());
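A minimal sketch, assuming d_gray is a CV_8UC1 GpuMat:

    cv::cuda::GpuMat d_hist;
    cv::cuda::calcHist(d_gray, d_hist);                  // 1 x 256, CV_32SC1
    cv::Mat h_hist;
    d_hist.download(h_hist);                             // bring the counts back to the host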
//! normalizes the grayscale image brightness and contrast by normalizing its histogram
/** @brief Equalizes the histogram of a grayscale image.
@param src Source image with CV\_8UC1 type.
@param dst Destination image.
@param buf Optional buffer to avoid extra memory allocations (for many calls with the same sizes).
@param stream Stream for the asynchronous version.
@sa equalizeHist
*/
CV_EXPORTS void equalizeHist(InputArray src, OutputArray dst, InputOutputArray buf, Stream& stream = Stream::Null());
/** @overload */
static inline void equalizeHist(InputArray src, OutputArray dst, Stream& stream = Stream::Null())
{
GpuMat buf;
cuda::equalizeHist(src, dst, buf, stream);
}
/** @brief Base class for Contrast Limited Adaptive Histogram Equalization.
*/
class CV_EXPORTS CLAHE : public cv::CLAHE
{
public:
using cv::CLAHE::apply;
/** @brief Equalizes the histogram of a grayscale image using Contrast Limited Adaptive Histogram Equalization.
@param src Source image with CV\_8UC1 type.
@param dst Destination image.
@param stream Stream for the asynchronous version.
*/
virtual void apply(InputArray src, OutputArray dst, Stream& stream) = 0;
};
/** @brief Creates implementation for cuda::CLAHE .
@param clipLimit Threshold for contrast limiting.
@param tileGridSize Size of grid for histogram equalization. Input image will be divided into
equally sized rectangular tiles. tileGridSize defines the number of tiles in row and column.
*/
CV_EXPORTS Ptr<cuda::CLAHE> createCLAHE(double clipLimit = 40.0, Size tileGridSize = Size(8, 8));
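A minimal sketch, assuming d_gray is a CV_8UC1 GpuMat:

    cv::Ptr<cv::cuda::CLAHE> clahe = cv::cuda::createCLAHE(4.0, cv::Size(8, 8));
    cv::cuda::GpuMat d_dst;
    clahe->apply(d_gray, d_dst);                         // synchronous overload inherited from cv::CLAHE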
//! Compute levels with even distribution. levels will have 1 row and nLevels cols and CV_32SC1 type.
/** @brief Computes levels with even distribution.
@param levels Destination array. levels has 1 row, nLevels columns, and the CV\_32SC1 type.
@param nLevels Number of computed levels. nLevels must be at least 2.
@param lowerLevel Lower boundary value of the lowest level.
@param upperLevel Upper boundary value of the greatest level.
*/
CV_EXPORTS void evenLevels(OutputArray levels, int nLevels, int lowerLevel, int upperLevel);
//! Calculates histogram with evenly distributed bins for signle channel source.
//! Supports CV_8UC1, CV_16UC1 and CV_16SC1 source types.
//! Output hist will have one row and histSize cols and CV_32SC1 type.
/** @brief Calculates a histogram with evenly distributed bins.
@param src Source image. CV\_8U, CV\_16U, or CV\_16S depth and 1 or 4 channels are supported. For
a four-channel image, all channels are processed separately.
@param hist Destination histogram with one row, histSize columns, and the CV\_32S type.
@param histSize Size of the histogram.
@param lowerLevel Lower boundary of lowest-level bin.
@param upperLevel Upper boundary of highest-level bin.
@param buf Optional buffer to avoid extra memory allocations (for many calls with the same sizes).
@param stream Stream for the asynchronous version.
*/
CV_EXPORTS void histEven(InputArray src, OutputArray hist, InputOutputArray buf, int histSize, int lowerLevel, int upperLevel, Stream& stream = Stream::Null());
/** @overload */
static inline void histEven(InputArray src, OutputArray hist, int histSize, int lowerLevel, int upperLevel, Stream& stream = Stream::Null())
{
GpuMat buf;
cuda::histEven(src, hist, buf, histSize, lowerLevel, upperLevel, stream);
}
//! Calculates histogram with evenly distributed bins for four-channel source.
//! All channels of source are processed separately.
//! Supports CV_8UC4, CV_16UC4 and CV_16SC4 source types.
//! Output hist[i] will have one row and histSize[i] cols and CV_32SC1 type.
/** @overload */
CV_EXPORTS void histEven(InputArray src, GpuMat hist[4], InputOutputArray buf, int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream = Stream::Null());
/** @overload */
static inline void histEven(InputArray src, GpuMat hist[4], int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream = Stream::Null())
{
GpuMat buf;
cuda::histEven(src, hist, buf, histSize, lowerLevel, upperLevel, stream);
}
//! Calculates histogram with bins determined by levels array.
//! levels must have one row and CV_32SC1 type if source has integer type or CV_32FC1 otherwise.
//! Supports CV_8UC1, CV_16UC1, CV_16SC1 and CV_32FC1 source types.
//! Output hist will have one row and (levels.cols-1) cols and CV_32SC1 type.
/** @brief Calculates a histogram with bins determined by the levels array.
@param src Source image. CV\_8U , CV\_16U , or CV\_16S depth and 1 or 4 channels are supported.
For a four-channel image, all channels are processed separately.
@param hist Destination histogram with one row, (levels.cols-1) columns, and the CV\_32SC1 type.
@param levels Array of level boundaries. Must have one row and the CV\_32SC1 type if the source
has an integer type, or the CV\_32FC1 type otherwise.
@param buf Optional buffer to avoid extra memory allocations (for many calls with the same sizes).
@param stream Stream for the asynchronous version.
*/
CV_EXPORTS void histRange(InputArray src, OutputArray hist, InputArray levels, InputOutputArray buf, Stream& stream = Stream::Null());
/** @overload */
static inline void histRange(InputArray src, OutputArray hist, InputArray levels, Stream& stream = Stream::Null())
{
GpuMat buf;
cuda::histRange(src, hist, levels, buf, stream);
}
//! Calculates histogram with bins determined by levels array.
//! All levels must have one row and CV_32SC1 type if source has integer type or CV_32FC1 otherwise.
//! All channels of source are processed separately.
//! Supports CV_8UC4, CV_16UC4, CV_16SC4 and CV_32FC4 source types.
//! Output hist[i] will have one row and (levels[i].cols-1) cols and CV_32SC1 type.
/** @overload */
CV_EXPORTS void histRange(InputArray src, GpuMat hist[4], const GpuMat levels[4], InputOutputArray buf, Stream& stream = Stream::Null());
/** @overload */
static inline void histRange(InputArray src, GpuMat hist[4], const GpuMat levels[4], Stream& stream = Stream::Null())
{
GpuMat buf;
cuda::histRange(src, hist, levels, buf, stream);
}
//! @} cudaimgproc_hist
//////////////////////////////// Canny ////////////////////////////////
/** @brief Base class for Canny Edge Detector.
*/
class CV_EXPORTS CannyEdgeDetector : public Algorithm
{
public:
/** @brief Finds edges in an image using the @cite Canny86 algorithm.
@param image Single-channel 8-bit input image.
@param edges Output edge map. It has the same size and type as image .
*/
virtual void detect(InputArray image, OutputArray edges) = 0;
/** @overload
@param dx First derivative of image in the horizontal direction. Supports only CV\_32S type.
@param dy First derivative of image in the vertical direction. Supports only CV\_32S type.
@param edges Output edge map. It has the same size and type as image .
*/
virtual void detect(InputArray dx, InputArray dy, OutputArray edges) = 0;
virtual void setLowThreshold(double low_thresh) = 0;
@ -189,6 +344,16 @@ public:
virtual bool getL2Gradient() const = 0;
};
/** @brief Creates implementation for cuda::CannyEdgeDetector .
@param low\_thresh First threshold for the hysteresis procedure.
@param high\_thresh Second threshold for the hysteresis procedure.
@param apperture\_size Aperture size for the Sobel operator.
@param L2gradient Flag indicating whether a more accurate \f$L_2\f$ norm
\f$=\sqrt{(dI/dx)^2 + (dI/dy)^2}\f$ should be used to compute the image gradient magnitude (
L2gradient=true ), or a faster default \f$L_1\f$ norm \f$=|dI/dx|+|dI/dy|\f$ is enough ( L2gradient=false
).
*/
CV_EXPORTS Ptr<CannyEdgeDetector> createCannyEdgeDetector(double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);
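A minimal sketch, assuming d_gray is a single-channel 8-bit GpuMat:

    cv::Ptr<cv::cuda::CannyEdgeDetector> canny =
        cv::cuda::createCannyEdgeDetector(50.0, 100.0);
    cv::cuda::GpuMat d_edges;
    canny->detect(d_gray, d_edges);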
/////////////////////////// Hough Transform ////////////////////////////
@ -196,10 +361,32 @@ CV_EXPORTS Ptr<CannyEdgeDetector> createCannyEdgeDetector(double low_thresh, dou
//////////////////////////////////////
// HoughLines
//! @addtogroup cudaimgproc_hough
//! @{
/** @brief Base class for lines detector algorithm.
*/
class CV_EXPORTS HoughLinesDetector : public Algorithm
{
public:
/** @brief Finds lines in a binary image using the classical Hough transform.
@param src 8-bit, single-channel binary source image.
@param lines Output vector of lines. Each line is represented by a two-element vector
\f$(\rho, \theta)\f$ . \f$\rho\f$ is the distance from the coordinate origin \f$(0,0)\f$ (top-left corner of
the image). \f$\theta\f$ is the line rotation angle in radians (
\f$0 \sim \textrm{vertical line}, \pi/2 \sim \textrm{horizontal line}\f$ ).
@sa HoughLines
*/
virtual void detect(InputArray src, OutputArray lines) = 0;
/** @brief Downloads results from cuda::HoughLinesDetector::detect to host memory.
@param d\_lines Result of cuda::HoughLinesDetector::detect .
@param h\_lines Output host array.
@param h\_votes Optional output array for lines' votes.
*/
virtual void downloadResults(InputArray d_lines, OutputArray h_lines, OutputArray h_votes = noArray()) = 0;
virtual void setRho(float rho) = 0;
@ -218,16 +405,35 @@ public:
virtual int getMaxLines() const = 0;
};
/** @brief Creates implementation for cuda::HoughLinesDetector .
@param rho Distance resolution of the accumulator in pixels.
@param theta Angle resolution of the accumulator in radians.
@param threshold Accumulator threshold parameter. Only those lines are returned that get enough
votes ( \f$>\texttt{threshold}\f$ ).
@param doSort If true, the detected lines are sorted by their votes.
@param maxLines Maximum number of output lines.
*/
CV_EXPORTS Ptr<HoughLinesDetector> createHoughLinesDetector(float rho, float theta, int threshold, bool doSort = false, int maxLines = 4096);
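A minimal sketch, assuming d_edges is a binary edge map (for example, the Canny output from above):

    cv::Ptr<cv::cuda::HoughLinesDetector> hough =
        cv::cuda::createHoughLinesDetector(1.0f, (float)(CV_PI / 180.0), 100);
    cv::cuda::GpuMat d_lines;
    hough->detect(d_edges, d_lines);
    cv::Mat h_lines;
    hough->downloadResults(d_lines, h_lines);            // each entry is a (rho, theta) pair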
//////////////////////////////////////
// HoughLinesP
//! finds line segments in the black-n-white image using probabilistic Hough transform
/** @brief Base class for line segments detector algorithm.
*/
class CV_EXPORTS HoughSegmentDetector : public Algorithm
{
public:
/** @brief Finds line segments in a binary image using the probabilistic Hough transform.
@param src 8-bit, single-channel binary source image.
@param lines Output vector of lines. Each line is represented by a 4-element vector
\f$(x_1, y_1, x_2, y_2)\f$ , where \f$(x_1,y_1)\f$ and \f$(x_2, y_2)\f$ are the ending points of each detected
line segment.
@sa HoughLinesP
*/
virtual void detect(InputArray src, OutputArray lines) = 0;
virtual void setRho(float rho) = 0;
@ -246,14 +452,32 @@ public:
virtual int getMaxLines() const = 0;
};
/** @brief Creates implementation for cuda::HoughSegmentDetector .
@param rho Distance resolution of the accumulator in pixels.
@param theta Angle resolution of the accumulator in radians.
@param minLineLength Minimum line length. Line segments shorter than that are rejected.
@param maxLineGap Maximum allowed gap between points on the same line to link them.
@param maxLines Maximum number of output lines.
*/
CV_EXPORTS Ptr<HoughSegmentDetector> createHoughSegmentDetector(float rho, float theta, int minLineLength, int maxLineGap, int maxLines = 4096);
//////////////////////////////////////
// HoughCircles
/** @brief Base class for circles detector algorithm.
*/
class CV_EXPORTS HoughCirclesDetector : public Algorithm
{
public:
/** @brief Finds circles in a grayscale image using the Hough transform.
@param src 8-bit, single-channel grayscale input image.
@param circles Output vector of found circles. Each vector is encoded as a 3-element
floating-point vector \f$(x, y, radius)\f$ .
@sa HoughCircles
*/
virtual void detect(InputArray src, OutputArray circles) = 0;
virtual void setDp(float dp) = 0;
@ -278,85 +502,257 @@ public:
virtual int getMaxCircles() const = 0;
};
/** @brief Creates implementation for cuda::HoughCirclesDetector .
@param dp Inverse ratio of the accumulator resolution to the image resolution. For example, if
dp=1 , the accumulator has the same resolution as the input image. If dp=2 , the accumulator has
half the width and height.
@param minDist Minimum distance between the centers of the detected circles. If the parameter is
too small, multiple neighbor circles may be falsely detected in addition to a true one. If it is
too large, some circles may be missed.
@param cannyThreshold The higher threshold of the two passed to the Canny edge detector (the lower
one is half of it).
@param votesThreshold The accumulator threshold for the circle centers at the detection stage. The
smaller it is, the more false circles may be detected.
@param minRadius Minimum circle radius.
@param maxRadius Maximum circle radius.
@param maxCircles Maximum number of output circles.
*/
CV_EXPORTS Ptr<HoughCirclesDetector> createHoughCirclesDetector(float dp, float minDist, int cannyThreshold, int votesThreshold, int minRadius, int maxRadius, int maxCircles = 4096);
//////////////////////////////////////
// GeneralizedHough
//! Ballard, D.H. (1981). Generalizing the Hough transform to detect arbitrary shapes. Pattern Recognition 13 (2): 111-122.
//! Detects position only without traslation and rotation
/** @brief Creates implementation for generalized Hough transform from @cite Ballard1981 .
*/
CV_EXPORTS Ptr<GeneralizedHoughBallard> createGeneralizedHoughBallard();
//! Guil, N., González-Linares, J.M. and Zapata, E.L. (1999). Bidimensional shape detection using an invariant approach. Pattern Recognition 32 (6): 1025-1038.
//! Detects position, traslation and rotation
/** @brief Creates implementation for generalized Hough transform from @cite Guil1999 .
*/
CV_EXPORTS Ptr<GeneralizedHoughGuil> createGeneralizedHoughGuil();
//! @} cudaimgproc_hough
////////////////////////// Corners Detection ///////////////////////////
//! @addtogroup cudaimgproc_feature
//! @{
/** @brief Base class for Cornerness Criteria computation.
*/
class CV_EXPORTS CornernessCriteria : public Algorithm
{
public:
/** @brief Computes the cornerness criteria at each image pixel.
@param src Source image.
@param dst Destination image containing cornerness values. It will have the same size as src and
CV\_32FC1 type.
@param stream Stream for the asynchronous version.
*/
virtual void compute(InputArray src, OutputArray dst, Stream& stream = Stream::Null()) = 0;
};
//! computes Harris cornerness criteria at each image pixel
/** @brief Creates implementation for Harris cornerness criteria.
@param srcType Input source type. Only CV\_8UC1 and CV\_32FC1 are supported for now.
@param blockSize Neighborhood size.
@param ksize Aperture parameter for the Sobel operator.
@param k Harris detector free parameter.
@param borderType Pixel extrapolation method. Only BORDER\_REFLECT101 and BORDER\_REPLICATE are
supported for now.
@sa cornerHarris
*/
CV_EXPORTS Ptr<CornernessCriteria> createHarrisCorner(int srcType, int blockSize, int ksize, double k, int borderType = BORDER_REFLECT101);
//! computes minimum eigen value of 2x2 derivative covariation matrix at each pixel - the cornerness criteria
/** @brief Creates implementation for the minimum eigen value of a 2x2 derivative covariation matrix (the
cornerness criteria).
@param srcType Input source type. Only CV\_8UC1 and CV\_32FC1 are supported for now.
@param blockSize Neighborhood size.
@param ksize Aperture parameter for the Sobel operator.
@param borderType Pixel extrapolation method. Only BORDER\_REFLECT101 and BORDER\_REPLICATE are
supported for now.
@sa cornerMinEigenVal
*/
CV_EXPORTS Ptr<CornernessCriteria> createMinEigenValCorner(int srcType, int blockSize, int ksize, int borderType = BORDER_REFLECT101);
////////////////////////// Corners Detection ///////////////////////////
/** @brief Base class for Corners Detector.
*/
class CV_EXPORTS CornersDetector : public Algorithm
{
public:
//! return 1 rows matrix with CV_32FC2 type
/** @brief Determines strong corners on an image.
@param image Input 8-bit or floating-point 32-bit, single-channel image.
@param corners Output vector of detected corners (1-row matrix with CV\_32FC2 type with corners
positions).
@param mask Optional region of interest. If the mask is not empty (it needs to have the type
CV\_8UC1 and the same size as image ), it specifies the region in which the corners are detected.
*/
virtual void detect(InputArray image, OutputArray corners, InputArray mask = noArray()) = 0;
};
/** @brief Creates implementation for cuda::CornersDetector .
@param srcType Input source type. Only CV\_8UC1 and CV\_32FC1 are supported for now.
@param maxCorners Maximum number of corners to return. If there are more corners than maxCorners,
the strongest of them are returned.
@param qualityLevel Parameter characterizing the minimal accepted quality of image corners. The
parameter value is multiplied by the best corner quality measure, which is the minimal eigenvalue
(see cornerMinEigenVal ) or the Harris function response (see cornerHarris ). The corners with the
quality measure less than the product are rejected. For example, if the best corner has the
quality measure = 1500, and the qualityLevel=0.01 , then all the corners with the quality measure
less than 15 are rejected.
@param minDistance Minimum possible Euclidean distance between the returned corners.
@param blockSize Size of an average block for computing a derivative covariation matrix over each
pixel neighborhood. See cornerEigenValsAndVecs .
@param useHarrisDetector Parameter indicating whether to use a Harris detector (see cornerHarris)
or cornerMinEigenVal.
@param harrisK Free parameter of the Harris detector.
*/
CV_EXPORTS Ptr<CornersDetector> createGoodFeaturesToTrackDetector(int srcType, int maxCorners = 1000, double qualityLevel = 0.01, double minDistance = 0.0,
int blockSize = 3, bool useHarrisDetector = false, double harrisK = 0.04);
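A minimal sketch, assuming d_gray is a CV_8UC1 GpuMat; the numeric arguments are illustrative:

    cv::Ptr<cv::cuda::CornersDetector> gftt =
        cv::cuda::createGoodFeaturesToTrackDetector(CV_8UC1, 500, 0.01, 10.0);
    cv::cuda::GpuMat d_corners;                          // 1-row matrix, CV_32FC2
    gftt->detect(d_gray, d_corners);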
//! @} cudaimgproc_feature
///////////////////////////// Mean Shift //////////////////////////////
//! Does mean shift filtering on GPU.
/** @brief Performs mean-shift filtering for each point of the source image.
@param src Source image. Only CV\_8UC4 images are supported for now.
@param dst Destination image containing the color of mapped points. It has the same size and type
as src .
@param sp Spatial window radius.
@param sr Color window radius.
@param criteria Termination criteria. See TermCriteria.
@param stream Stream for the asynchronous version.
It maps each point of the source image into another point. As a result, you have a new color and new
position of each point.
*/
CV_EXPORTS void meanShiftFiltering(InputArray src, OutputArray dst, int sp, int sr,
TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1),
Stream& stream = Stream::Null());
//! Does mean shift procedure on GPU.
/** @brief Performs a mean-shift procedure and stores information about processed points (their colors and
positions) in two images.
@param src Source image. Only CV\_8UC4 images are supported for now.
@param dstr Destination image containing the color of mapped points. The size and type are the same
as src .
@param dstsp Destination image containing the position of mapped points. The size is the same as
src size. The type is CV\_16SC2 .
@param sp Spatial window radius.
@param sr Color window radius.
@param criteria Termination criteria. See TermCriteria.
@param stream Stream for the asynchronous version.
@sa cuda::meanShiftFiltering
*/
CV_EXPORTS void meanShiftProc(InputArray src, OutputArray dstr, OutputArray dstsp, int sp, int sr,
TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1),
Stream& stream = Stream::Null());
//! Does mean shift segmentation with elimination of small regions.
/** @brief Performs a mean-shift segmentation of the source image and eliminates small segments.
@param src Source image. Only CV\_8UC4 images are supported for now.
@param dst Segmented image with the same size and type as src (host memory).
@param sp Spatial window radius.
@param sr Color window radius.
@param minsize Minimum segment size. Smaller segments are merged.
@param criteria Termination criteria. See TermCriteria.
*/
CV_EXPORTS void meanShiftSegmentation(InputArray src, OutputArray dst, int sp, int sr, int minsize,
TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1));
/////////////////////////// Match Template ////////////////////////////
//! computes the proximity map for the raster template and the image where the template is searched for
/** @brief Base class for Template Matching.
*/
class CV_EXPORTS TemplateMatching : public Algorithm
{
public:
/** @brief Computes a proximity map for a raster template and an image where the template is searched for.
@param image Source image.
@param templ Template image with the same size and type as image .
@param result Map containing comparison results ( CV\_32FC1 ). If image is *W x H* and templ is *w
x h*, then result must be *W-w+1 x H-h+1*.
@param stream Stream for the asynchronous version.
*/
virtual void match(InputArray image, InputArray templ, OutputArray result, Stream& stream = Stream::Null()) = 0;
};
/** @brief Creates implementation for cuda::TemplateMatching .
@param srcType Input source type. CV\_32F and CV\_8U depth images (1..4 channels) are supported
for now.
@param method Specifies the way to compare the template with the image.
@param user\_block\_size You can use field user\_block\_size to set a specific block size. If you
leave its default value Size(0,0), then automatic estimation of block size will be used (which is
optimized for speed). By varying user\_block\_size you can reduce memory requirements at the cost
of speed.
The following methods are supported for the CV\_8U depth images for now:
- CV\_TM\_SQDIFF
- CV\_TM\_SQDIFF\_NORMED
- CV\_TM\_CCORR
- CV\_TM\_CCORR\_NORMED
- CV\_TM\_CCOEFF
- CV\_TM\_CCOEFF\_NORMED
The following methods are supported for the CV\_32F images for now:
- CV\_TM\_SQDIFF
- CV\_TM\_CCORR
@sa matchTemplate
*/
CV_EXPORTS Ptr<TemplateMatching> createTemplateMatching(int srcType, int method, Size user_block_size = Size());
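A minimal sketch, assuming d_img and d_templ are CV_8UC1 GpuMats (cv::TM_CCORR_NORMED carries the same value as the CV\_TM\_CCORR\_NORMED constant listed above):

    cv::Ptr<cv::cuda::TemplateMatching> tm =
        cv::cuda::createTemplateMatching(CV_8U, cv::TM_CCORR_NORMED);
    cv::cuda::GpuMat d_result;
    tm->match(d_img, d_templ, d_result);                 // W-w+1 x H-h+1, CV_32FC1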
////////////////////////// Bilateral Filter ///////////////////////////
//! Performa bilateral filtering of passsed image
/** @brief Performs bilateral filtering of the passed image.
@param src Source image. Supports only (channels != 2 && depth() != CV\_8S && depth() != CV\_32S
&& depth() != CV\_64F).
@param dst Destination image.
@param kernel\_size Kernel window size.
@param sigma\_color Filter sigma in the color space.
@param sigma\_spatial Filter sigma in the coordinate space.
@param borderMode Border type. See borderInterpolate for details. BORDER\_REFLECT101 ,
BORDER\_REPLICATE , BORDER\_CONSTANT , BORDER\_REFLECT and BORDER\_WRAP are supported for now.
@param stream Stream for the asynchronous version.
@sa bilateralFilter
*/
CV_EXPORTS void bilateralFilter(InputArray src, OutputArray dst, int kernel_size, float sigma_color, float sigma_spatial,
int borderMode = BORDER_DEFAULT, Stream& stream = Stream::Null());
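A minimal sketch, assuming d_src is a CV_8UC3 GpuMat; the sigma values are illustrative:

    cv::cuda::GpuMat d_smooth;
    cv::cuda::bilateralFilter(d_src, d_smooth, 9, 30.0f, 7.0f);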
///////////////////////////// Blending ////////////////////////////////
//! performs linear blending of two images
//! to avoid accuracy errors sum of weigths shouldn't be very close to zero
/** @brief Performs linear blending of two images.
@param img1 First image. Supports only CV\_8U and CV\_32F depth.
@param img2 Second image. Must have the same size and the same type as img1 .
@param weights1 Weights for first image. Must have the same size as img1 . Supports only CV\_32F
type.
@param weights2 Weights for second image. Must have the same size as img2 . Supports only CV\_32F
type.
@param result Destination image.
@param stream Stream for the asynchronous version.
*/
CV_EXPORTS void blendLinear(InputArray img1, InputArray img2, InputArray weights1, InputArray weights2,
OutputArray result, Stream& stream = Stream::Null());
//! @}
}} // namespace cv { namespace cuda {
#endif /* __OPENCV_CUDAIMGPROC_HPP__ */

View File

@ -49,4 +49,11 @@
#include "opencv2/cudalegacy/NCVHaarObjectDetection.hpp"
#include "opencv2/cudalegacy/NCVBroxOpticalFlow.hpp"
/**
@addtogroup cuda
@{
@defgroup cudalegacy Legacy support
@}
*/
#endif /* __OPENCV_CUDALEGACY_HPP__ */

View File

@ -60,6 +60,8 @@
//
//==============================================================================
//! @addtogroup cudalegacy
//! @{
/**
* Compile-time assert namespace
@ -1023,6 +1025,6 @@ CV_EXPORTS NCVStatus ncvDrawRects_32u_device(Ncv32u *d_dst, Ncv32u dstStride, Nc
NCVMatrixAlloc<type> name(alloc, width, height); \
ncvAssertReturn(name.isMemAllocated(), err);
//! @}
#endif // _ncv_hpp_

View File

@ -62,6 +62,9 @@
#include "opencv2/cudalegacy/NCV.hpp"
//! @addtogroup cudalegacy
//! @{
/// \brief Model and solver parameters
struct NCVBroxOpticalFlowDescriptor
{
@ -89,6 +92,7 @@ struct NCVBroxOpticalFlowDescriptor
/// \param [in] frame1 frame to track
/// \param [out] u flow horizontal component (along \b x axis)
/// \param [out] v flow vertical component (along \b y axis)
/// \param stream CUDA stream for asynchronous execution
/// \return computation status
/////////////////////////////////////////////////////////////////////////////////////////
@ -101,4 +105,6 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
NCVMatrix<Ncv32f> &v,
cudaStream_t stream);
//! @}
#endif

View File

@ -61,6 +61,8 @@
#include "opencv2/cudalegacy/NCV.hpp"
//! @addtogroup cudalegacy
//! @{
//==============================================================================
//
@ -456,6 +458,6 @@ CV_EXPORTS NCVStatus ncvHaarStoreNVBIN_host(const cv::String &filename,
NCVVector<HaarClassifierNode128> &h_HaarNodes,
NCVVector<HaarFeature64> &h_HaarFeatures);
//! @}
#endif // _ncvhaarobjectdetection_hpp_

View File

@ -48,6 +48,8 @@
#include "opencv2/cudalegacy/NCV.hpp"
#include "opencv2/core/cuda/common.hpp"
//! @cond IGNORED
namespace cv { namespace cuda { namespace device
{
namespace pyramid
@ -106,4 +108,6 @@ private:
#endif //_WIN32
//! @endcond
#endif //_ncvpyramid_hpp_

View File

@ -45,19 +45,14 @@
#include "opencv2/cudalegacy/NCV.hpp"
/**
* \file NPP_staging.hpp
* NPP Staging Library
*/
//! @addtogroup cudalegacy
//! @{
/** \defgroup core_npp NPPST Core
* Basic functions for CUDA streams management.
* @{
*/
/**
* Gets an active CUDA stream used by NPPST
* NOT THREAD SAFE
@ -168,6 +163,7 @@ NCVStatus nppiStInterpolateFrames(const NppStInterpolationState *pState);
* \param nSrcStep [IN] Source image line step
* \param pDst [OUT] Destination image pointer (CUDA device memory)
* \param dstSize [OUT] Destination image size
 * \param nDstStep [IN] Destination image line step
* \param oROI [IN] Region of interest in the source image
* \param borderType [IN] Type of border
* \param pKernel [IN] Pointer to row kernel values (CUDA device memory)
@ -201,6 +197,7 @@ NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc,
* \param nSrcStep [IN] Source image line step
* \param pDst [OUT] Destination image pointer (CUDA device memory)
* \param dstSize [OUT] Destination image size
 * \param nDstStep [IN] Destination image line step
* \param oROI [IN] Region of interest in the source image
* \param borderType [IN] Type of border
* \param pKernel [IN] Pointer to column kernel values (CUDA device memory)
@ -228,7 +225,7 @@ NCVStatus nppiStFilterColumnBorder_32f_C1R(const Ncv32f *pSrc,
/** Size of buffer required for vector image warping.
*
* \param srcSize [IN] Source image size
* \param nStep [IN] Source image line step
* \param nSrcStep [IN] Source image line step
* \param hpSize [OUT] Where to store computed size (host memory)
*
* \return NCV status code
@ -285,6 +282,7 @@ NCVStatus nppiStVectorWarp_PSF1x1_32f_C1(const Ncv32f *pSrc,
* \param pU [IN] Pointer to horizontal displacement field (CUDA device memory)
* \param pV [IN] Pointer to vertical displacement field (CUDA device memory)
* \param nVFStep [IN] Displacement field line step
 * \param pBuffer [IN] Pointer to the pre-allocated temporary buffer (CUDA device memory)
* \param timeScale [IN] Value by which displacement field will be scaled for warping
* \param pDst [OUT] Destination image pointer (CUDA device memory)
*
@ -903,5 +901,6 @@ NCVStatus nppsStCompact_32f_host(Ncv32f *h_src, Ncv32u srcLen,
/*@}*/
//! @}
#endif // _npp_staging_hpp_

View File

@ -56,6 +56,8 @@
#include "opencv2/cudalegacy.hpp"
//! @cond IGNORED
namespace cv { namespace cuda
{
class NppStStreamHandler
@ -89,4 +91,6 @@ namespace cv { namespace cuda
#define ncvSafeCall(expr) cv::cuda::checkNcvError(expr, __FILE__, __LINE__, CV_Func)
//! @endcond
#endif // __OPENCV_CORE_CUDALEGACY_PRIVATE_HPP__

View File

@ -49,8 +49,21 @@
#include "opencv2/core/cuda.hpp"
/**
@addtogroup cuda
@{
@defgroup cudaoptflow Optical Flow
@}
*/
namespace cv { namespace cuda {
//! @addtogroup cudaoptflow
//! @{
/** @brief Class computing the optical flow for two images using the Brox et al. optical flow
algorithm (@cite Brox2004).
*/
class CV_EXPORTS BroxOpticalFlow
{
public:
@ -88,16 +101,58 @@ public:
GpuMat buf;
};
/** @brief Class used for calculating an optical flow.
The class can calculate an optical flow for a sparse feature set or dense optical flow using the
iterative Lucas-Kanade method with pyramids.
@sa calcOpticalFlowPyrLK
@note
- An example of the Lucas Kanade optical flow algorithm can be found at
opencv\_source\_code/samples/gpu/pyrlk\_optical\_flow.cpp
*/
class CV_EXPORTS PyrLKOpticalFlow
{
public:
PyrLKOpticalFlow();
/** @brief Calculate an optical flow for a sparse feature set.
@param prevImg First 8-bit input image (supports both grayscale and color images).
@param nextImg Second input image of the same size and the same type as prevImg .
@param prevPts Vector of 2D points for which the flow needs to be found. It must be a one-row
matrix of CV\_32FC2 type.
@param nextPts Output vector of 2D points (with single-precision floating-point coordinates)
containing the calculated new positions of input features in the second image. When useInitialFlow
is true, the vector must have the same size as in the input.
@param status Output status vector (CV\_8UC1 type). Each element of the vector is set to 1 if the
flow for the corresponding features has been found. Otherwise, it is set to 0.
@param err Output vector (CV\_32FC1 type) that contains the difference between patches around the
original and moved points, or the minimum eigenvalues if getMinEigenVals is set. It can be NULL if
not needed.
@sa calcOpticalFlowPyrLK
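A minimal sketch (assuming d\_prev and d\_next are 8-bit GpuMat frames and d\_prevPts is a
one-row CV\_32FC2 matrix of points to track):
@code{.cpp}
cv::cuda::PyrLKOpticalFlow pyrLK;
cv::cuda::GpuMat d_nextPts, d_status;
pyrLK.sparse(d_prev, d_next, d_prevPts, d_nextPts, d_status);
@endcode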
*/
void sparse(const GpuMat& prevImg, const GpuMat& nextImg, const GpuMat& prevPts, GpuMat& nextPts,
GpuMat& status, GpuMat* err = 0);
/** @brief Calculate dense optical flow.
@param prevImg First 8-bit grayscale input image.
@param nextImg Second input image of the same size and the same type as prevImg .
@param u Horizontal component of the optical flow of the same size as input images, 32-bit
floating-point, single-channel
@param v Vertical component of the optical flow of the same size as input images, 32-bit
floating-point, single-channel
@param err Output vector (CV\_32FC1 type) that contains the difference between patches around the
original and moved points, or the minimum eigenvalues if getMinEigenVals is set. It can be NULL if
not needed.
*/
void dense(const GpuMat& prevImg, const GpuMat& nextImg, GpuMat& u, GpuMat& v, GpuMat* err = 0);
/** @brief Releases the memory of inner buffers.
*/
void releaseMemory();
Size winSize;
@ -115,6 +170,8 @@ private:
GpuMat vPyr_[2];
};
/** @brief Class computing a dense optical flow using Gunnar Farneback's algorithm.
*/
class CV_EXPORTS FarnebackOpticalFlow
{
public:
@ -139,8 +196,20 @@ public:
double polySigma;
int flags;
/** @brief Computes a dense optical flow using Gunnar Farneback's algorithm.
@param frame0 First 8-bit gray-scale input image
@param frame1 Second 8-bit gray-scale input image
@param flowx Flow horizontal component
@param flowy Flow vertical component
@param s Stream for the asynchronous version.
@sa calcOpticalFlowFarneback
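A minimal sketch (assuming d\_frame0 and d\_frame1 are 8-bit grayscale GpuMat frames):
@code{.cpp}
cv::cuda::FarnebackOpticalFlow farneback;
cv::cuda::GpuMat d_flowx, d_flowy;
farneback(d_frame0, d_frame1, d_flowx, d_flowy);
@endcode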
*/
void operator ()(const GpuMat &frame0, const GpuMat &frame1, GpuMat &flowx, GpuMat &flowy, Stream &s = Stream::Null());
/** @brief Releases unused auxiliary memory buffers.
*/
void releaseMemory()
{
frames_[0].release();
@ -295,20 +364,22 @@ private:
GpuMat extended_I1;
};
//! Interpolate frames (images) using provided optical flow (displacement field).
//! frame0 - frame 0 (32-bit floating point images, single channel)
//! frame1 - frame 1 (the same type and size)
//! fu - forward horizontal displacement
//! fv - forward vertical displacement
//! bu - backward horizontal displacement
//! bv - backward vertical displacement
//! pos - new frame position
//! newFrame - new frame
//! buf - temporary buffer, will have width x 6*height size, CV_32FC1 type and contain 6 GpuMat;
//! occlusion masks 0, occlusion masks 1,
//! interpolated forward flow 0, interpolated forward flow 1,
//! interpolated backward flow 0, interpolated backward flow 1
//!
/** @brief Interpolates frames (images) using provided optical flow (displacement field).
@param frame0 First frame (32-bit floating point images, single channel).
@param frame1 Second frame. Must have the same type and size as frame0 .
@param fu Forward horizontal displacement.
@param fv Forward vertical displacement.
@param bu Backward horizontal displacement.
@param bv Backward vertical displacement.
@param pos New frame position.
@param newFrame Output image.
@param buf Temporary buffer, will have width x 6\*height size, CV\_32FC1 type and contain 6
GpuMat: occlusion masks for first frame, occlusion masks for second, interpolated forward
horizontal flow, interpolated forward vertical flow, interpolated backward horizontal flow,
interpolated backward vertical flow.
@param stream Stream for the asynchronous version.
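A minimal sketch (assuming the forward and backward flow fields were computed beforehand, e.g.
with BroxOpticalFlow; pos = 0.5 synthesizes the middle frame):
@code{.cpp}
cv::cuda::GpuMat d_newFrame, d_buf;
cv::cuda::interpolateFrames(d_frame0, d_frame1, d_fu, d_fv, d_bu, d_bv, 0.5f, d_newFrame, d_buf);
@endcode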
*/
CV_EXPORTS void interpolateFrames(const GpuMat& frame0, const GpuMat& frame1,
const GpuMat& fu, const GpuMat& fv,
const GpuMat& bu, const GpuMat& bv,
@ -317,6 +388,8 @@ CV_EXPORTS void interpolateFrames(const GpuMat& frame0, const GpuMat& frame1,
CV_EXPORTS void createOpticalFlowNeedleMap(const GpuMat& u, const GpuMat& v, GpuMat& vertex, GpuMat& colors);
//! @}
}} // namespace cv { namespace cuda {
#endif /* __OPENCV_CUDAOPTFLOW_HPP__ */

View File

@ -50,11 +50,25 @@
#include "opencv2/core/cuda.hpp"
#include "opencv2/calib3d.hpp"
/**
@addtogroup cuda
@{
@defgroup cudastereo Stereo Correspondence
@}
*/
namespace cv { namespace cuda {
//! @addtogroup cudastereo
//! @{
/////////////////////////////////////////
// StereoBM
/** @brief Class computing stereo correspondence (disparity map) using the block matching algorithm.
@sa StereoBM
*/
class CV_EXPORTS StereoBM : public cv::StereoBM
{
public:
@ -63,20 +77,70 @@ public:
virtual void compute(InputArray left, InputArray right, OutputArray disparity, Stream& stream) = 0;
};
/** @brief Creates StereoBM object.
@param numDisparities the disparity search range. For each pixel, the algorithm will find the best
disparity from 0 (default minimum disparity) to numDisparities. The search range can then be
shifted by changing the minimum disparity.
@param blockSize the linear size of the blocks compared by the algorithm. The size should be odd
(as the block is centered at the current pixel). A larger block size implies a smoother, though
less accurate disparity map. A smaller block size gives a more detailed disparity map, but there
is a higher chance for the algorithm to find a wrong correspondence.
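A minimal sketch of computing a disparity map (assuming d\_left and d\_right hold a rectified
8-bit grayscale stereo pair on the GPU):
@code{.cpp}
cv::Ptr<cv::cuda::StereoBM> bm = cv::cuda::createStereoBM(128, 19);
cv::cuda::GpuMat d_disparity;
bm->compute(d_left, d_right, d_disparity);
@endcode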
*/
CV_EXPORTS Ptr<cuda::StereoBM> createStereoBM(int numDisparities = 64, int blockSize = 19);
/////////////////////////////////////////
// StereoBeliefPropagation
//! "Efficient Belief Propagation for Early Vision" P.Felzenszwalb
/** @brief Class computing stereo correspondence using the belief propagation algorithm.
The class implements the algorithm described in @cite Felzenszwalb2006 . It can compute its own
data cost (using a truncated linear model) or use a user-provided data cost.
@note
StereoBeliefPropagation requires a lot of memory for message storage:
\f[width \_ step \cdot height \cdot ndisp \cdot 4 \cdot (1 + 0.25)\f]
and for data cost storage:
\f[width\_step \cdot height \cdot ndisp \cdot (1 + 0.25 + 0.0625 + \dotsm + \frac{1}{4^{levels}})\f]
width\_step is the number of bytes in a line including padding.
StereoBeliefPropagation uses a truncated linear model for the data cost and discontinuity terms:
\f[DataCost = data \_ weight \cdot \min ( \lvert Img_{Left}(x,y)-Img_{Right}(x-d,y) \rvert , max \_ data \_ term)\f]
\f[DiscTerm = \min (disc \_ single \_ jump \cdot \lvert f_1-f_2 \rvert , max \_ disc \_ term)\f]
For more details, see @cite Felzenszwalb2006.
By default, StereoBeliefPropagation uses floating-point arithmetic and the CV\_32FC1 type for
messages. But it can also use fixed-point arithmetic and the CV\_16SC1 message type for better
performance. To avoid an overflow in this case, the parameters must satisfy the following
requirement:
\f[10 \cdot 2^{levels-1} \cdot max \_ data \_ term < SHRT \_ MAX\f]
@sa StereoMatcher
*/
class CV_EXPORTS StereoBeliefPropagation : public cv::StereoMatcher
{
public:
using cv::StereoMatcher::compute;
/** @overload */
virtual void compute(InputArray left, InputArray right, OutputArray disparity, Stream& stream) = 0;
//! version for user specified data term
/** @brief Enables the stereo correspondence operator that finds the disparity for the specified data cost.
@param data User-specified data cost, a matrix of msg\_type type and
Size(\<image columns\>\*ndisp, \<image rows\>) size.
@param disparity Output disparity map. If disparity is empty, the output type is CV\_16SC1 .
Otherwise, the type is retained.
@param stream Stream for the asynchronous version.
*/
virtual void compute(InputArray data, OutputArray disparity, Stream& stream = Stream::Null()) = 0;
//! number of BP iterations on each level
@ -107,18 +171,48 @@ public:
virtual int getMsgType() const = 0;
virtual void setMsgType(int msg_type) = 0;
/** @brief Uses a heuristic method to compute the recommended parameters ( ndisp, iters and levels ) for the
specified image size ( width and height ).
*/
static void estimateRecommendedParams(int width, int height, int& ndisp, int& iters, int& levels);
};
/** @brief Creates StereoBeliefPropagation object.
@param ndisp Number of disparities.
@param iters Number of BP iterations on each level.
@param levels Number of levels.
@param msg\_type Type for messages. CV\_16SC1 and CV\_32FC1 types are supported.
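A minimal sketch (assuming d\_left and d\_right hold a rectified stereo pair on the GPU; the
parameters are estimated from the image size):
@code{.cpp}
int ndisp, iters, levels;
cv::cuda::StereoBeliefPropagation::estimateRecommendedParams(d_left.cols, d_left.rows, ndisp, iters, levels);
cv::Ptr<cv::cuda::StereoBeliefPropagation> bp =
    cv::cuda::createStereoBeliefPropagation(ndisp, iters, levels, CV_16S);
cv::cuda::GpuMat d_disparity;
bp->compute(d_left, d_right, d_disparity);
@endcode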
*/
CV_EXPORTS Ptr<cuda::StereoBeliefPropagation>
createStereoBeliefPropagation(int ndisp = 64, int iters = 5, int levels = 5, int msg_type = CV_32F);
/////////////////////////////////////////
// StereoConstantSpaceBP
//! "A Constant-Space Belief Propagation Algorithm for Stereo Matching"
//! Qingxiong Yang, Liang Wang, Narendra Ahuja
//! http://vision.ai.uiuc.edu/~qyang6/
/** @brief Class computing stereo correspondence using the constant space belief propagation algorithm.
The class implements the algorithm described in @cite Yang2010. StereoConstantSpaceBP supports both local
minimum and global minimum data cost initialization algorithms. For more details, see the paper
mentioned above. By default, a local algorithm is used. To enable a global algorithm, set
use\_local\_init\_data\_cost to false .
StereoConstantSpaceBP uses a truncated linear model for the data cost and discontinuity terms:
\f[DataCost = data \_ weight \cdot \min ( \lvert I_2-I_1 \rvert , max \_ data \_ term)\f]
\f[DiscTerm = \min (disc \_ single \_ jump \cdot \lvert f_1-f_2 \rvert , max \_ disc \_ term)\f]
For more details, see @cite Yang2010.
By default, StereoConstantSpaceBP uses floating-point arithmetic and the CV\_32FC1 type for
messages. But it can also use fixed-point arithmetic and the CV\_16SC1 message type for better
performance. To avoid an overflow in this case, the parameters must satisfy the following
requirement:
\f[10 \cdot 2^{levels-1} \cdot max \_ data \_ term < SHRT \_ MAX\f]
*/
class CV_EXPORTS StereoConstantSpaceBP : public cuda::StereoBeliefPropagation
{
public:
@ -129,23 +223,40 @@ public:
virtual bool getUseLocalInitDataCost() const = 0;
virtual void setUseLocalInitDataCost(bool use_local_init_data_cost) = 0;
/** @brief Uses a heuristic method to compute parameters (ndisp, iters, levels and nr\_plane) for the
specified image size (width and height).
*/
static void estimateRecommendedParams(int width, int height, int& ndisp, int& iters, int& levels, int& nr_plane);
};
/** @brief Creates StereoConstantSpaceBP object.
@param ndisp Number of disparities.
@param iters Number of BP iterations on each level.
@param levels Number of levels.
@param nr\_plane Number of disparity levels on the first level.
@param msg\_type Type for messages. CV\_16SC1 and CV\_32FC1 types are supported.
*/
CV_EXPORTS Ptr<cuda::StereoConstantSpaceBP>
createStereoConstantSpaceBP(int ndisp = 128, int iters = 8, int levels = 4, int nr_plane = 4, int msg_type = CV_32F);
/////////////////////////////////////////
// DisparityBilateralFilter
//! Disparity map refinement using joint bilateral filtering given a single color image.
//! Qingxiong Yang, Liang Wang, Narendra Ahuja
//! http://vision.ai.uiuc.edu/~qyang6/
/** @brief Class refining a disparity map using joint bilateral filtering.
The class implements the @cite Yang2010 algorithm.
*/
class CV_EXPORTS DisparityBilateralFilter : public cv::Algorithm
{
public:
//! the disparity map refinement operator. Refine disparity map using joint bilateral filtering given a single color image.
//! disparity must have CV_8U or CV_16S type, image must have CV_8UC1 or CV_8UC3 type.
/** @brief Refines a disparity map using joint bilateral filtering.
@param disparity Input disparity map. CV\_8UC1 and CV\_16SC1 types are supported.
@param image Input image. CV\_8UC1 and CV\_8UC3 types are supported.
@param dst Destination disparity map. It has the same size and type as disparity .
@param stream Stream for the asynchronous version.
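A minimal sketch (assuming d\_disp holds a disparity map and d\_img the corresponding color image
on the GPU):
@code{.cpp}
cv::Ptr<cv::cuda::DisparityBilateralFilter> dbf =
    cv::cuda::createDisparityBilateralFilter(64 /*ndisp*/);
cv::cuda::GpuMat d_refined;
dbf->apply(d_disp, d_img, d_refined);
@endcode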
*/
virtual void apply(InputArray disparity, InputArray image, OutputArray dst, Stream& stream = Stream::Null()) = 0;
virtual int getNumDisparities() const = 0;
@ -170,24 +281,48 @@ public:
virtual void setSigmaRange(double sigma_range) = 0;
};
/** @brief Creates DisparityBilateralFilter object.
@param ndisp Number of disparities.
@param radius Filter radius.
@param iters Number of iterations.
*/
CV_EXPORTS Ptr<cuda::DisparityBilateralFilter>
createDisparityBilateralFilter(int ndisp = 64, int radius = 3, int iters = 1);
/////////////////////////////////////////
// Utility
//! Reprojects disparity image to 3D space.
//! Supports CV_8U and CV_16S types of input disparity.
//! The output is a 3- or 4-channel floating-point matrix.
//! Each element of this matrix will contain the 3D coordinates of the point (x,y,z,1), computed from the disparity map.
//! Q is the 4x4 perspective transformation matrix that can be obtained with cvStereoRectify.
/** @brief Reprojects a disparity image to 3D space.
@param disp Input disparity image. CV\_8U and CV\_16S types are supported.
@param xyzw Output 3- or 4-channel floating-point image of the same size as disp . Each element
xyzw(x,y) contains the 3D coordinates (x,y,z) or (x,y,z,1) of the point (x,y) , computed from the
disparity map.
@param Q \f$4 \times 4\f$ perspective transformation matrix that can be obtained via stereoRectify .
@param dst\_cn The number of channels for output image. Can be 3 or 4.
@param stream Stream for the asynchronous version.
@sa reprojectImageTo3D
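A minimal sketch (assuming d\_disp holds a disparity map on the GPU and Q was obtained from
stereoRectify):
@code{.cpp}
cv::cuda::GpuMat d_xyzw;
cv::cuda::reprojectImageTo3D(d_disp, d_xyzw, Q, 4);
@endcode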
*/
CV_EXPORTS void reprojectImageTo3D(InputArray disp, OutputArray xyzw, InputArray Q, int dst_cn = 4, Stream& stream = Stream::Null());
//! Does coloring of disparity image: [0..ndisp) -> [0..240, 1, 1] in HSV.
//! Supported types of input disparity: CV_8U, CV_16S.
//! Output disparity has CV_8UC4 type in BGRA format (alpha = 255).
/** @brief Colors a disparity image.
@param src\_disp Source disparity image. CV\_8UC1 and CV\_16SC1 types are supported.
@param dst\_disp Output disparity image. It has the same size as src\_disp . The type is CV\_8UC4
in BGRA format (alpha = 255).
@param ndisp Number of disparities.
@param stream Stream for the asynchronous version.
This function draws a colored disparity map by converting disparity values from [0..ndisp) interval
first to HSV color space (where different disparity values correspond to different hues) and then
converting the pixels to RGB for visualization.
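A minimal sketch (assuming d\_disp was computed with ndisp = 128):
@code{.cpp}
cv::cuda::GpuMat d_colorDisp;
cv::cuda::drawColorDisp(d_disp, d_colorDisp, 128); // CV_8UC4 BGRA output
@endcode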
*/
CV_EXPORTS void drawColorDisp(InputArray src_disp, OutputArray dst_disp, int ndisp, Stream& stream = Stream::Null());
//! @}
}} // namespace cv { namespace cuda {
#endif /* __OPENCV_CUDASTEREO_HPP__ */

View File

@ -50,54 +50,178 @@
#include "opencv2/core/cuda.hpp"
#include "opencv2/imgproc.hpp"
/**
@addtogroup cuda
@{
@defgroup cudawarping Image Warping
@}
*/
namespace cv { namespace cuda {
//! DST[x,y] = SRC[xmap[x,y],ymap[x,y]]
//! supports only CV_32FC1 map type
//! @addtogroup cudawarping
//! @{
/** @brief Applies a generic geometrical transformation to an image.
@param src Source image.
@param dst Destination image with the same size as xmap and the same type as src .
@param xmap X values. Only CV\_32FC1 type is supported.
@param ymap Y values. Only CV\_32FC1 type is supported.
@param interpolation Interpolation method (see resize ). INTER\_NEAREST , INTER\_LINEAR and
INTER\_CUBIC are supported for now.
@param borderMode Pixel extrapolation method (see borderInterpolate ). BORDER\_REFLECT101 ,
BORDER\_REPLICATE , BORDER\_CONSTANT , BORDER\_REFLECT and BORDER\_WRAP are supported for now.
@param borderValue Value used in case of a constant border. By default, it is 0.
@param stream Stream for the asynchronous version.
The function transforms the source image using the specified map:
\f[\texttt{dst} (x,y) = \texttt{src} (xmap(x,y), ymap(x,y))\f]
Values of pixels with non-integer coordinates are computed using bilinear interpolation.
@sa remap
*/
CV_EXPORTS void remap(InputArray src, OutputArray dst, InputArray xmap, InputArray ymap,
int interpolation, int borderMode = BORDER_CONSTANT, Scalar borderValue = Scalar(),
Stream& stream = Stream::Null());
//! resizes the image
//! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC, INTER_AREA
/** @brief Resizes an image.
@param src Source image.
@param dst Destination image with the same type as src . The size is dsize (when it is non-zero)
or the size is computed from src.size() , fx , and fy .
@param dsize Destination image size. If it is zero, it is computed as:
\f[\texttt{dsize = Size(round(fx*src.cols), round(fy*src.rows))}\f]
Either dsize or both fx and fy must be non-zero.
@param fx Scale factor along the horizontal axis. If it is zero, it is computed as:
\f[\texttt{(double)dsize.width/src.cols}\f]
@param fy Scale factor along the vertical axis. If it is zero, it is computed as:
\f[\texttt{(double)dsize.height/src.rows}\f]
@param interpolation Interpolation method. INTER\_NEAREST , INTER\_LINEAR and INTER\_CUBIC are
supported for now.
@param stream Stream for the asynchronous version.
@sa resize
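A minimal sketch (halving an image in each dimension, assuming d\_src already holds the image on
the GPU):
@code{.cpp}
cv::cuda::GpuMat d_small;
cv::cuda::resize(d_src, d_small, cv::Size(), 0.5, 0.5, cv::INTER_LINEAR);
@endcode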
*/
CV_EXPORTS void resize(InputArray src, OutputArray dst, Size dsize, double fx=0, double fy=0, int interpolation = INTER_LINEAR, Stream& stream = Stream::Null());
//! warps the image using affine transformation
//! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC
/** @brief Applies an affine transformation to an image.
@param src Source image. CV\_8U , CV\_16U , CV\_32S , or CV\_32F depth and 1, 3, or 4 channels are
supported.
@param dst Destination image with the same type as src . The size is dsize .
@param M *2x3* transformation matrix.
@param dsize Size of the destination image.
@param flags Combination of interpolation methods (see resize) and the optional flag
WARP\_INVERSE\_MAP specifying that M is an inverse transformation ( dst=\>src ). Only
INTER\_NEAREST , INTER\_LINEAR , and INTER\_CUBIC interpolation methods are supported.
@param borderMode Pixel extrapolation method (see borderInterpolate ).
@param borderValue Value used in case of a constant border. By default, it is 0.
@param stream Stream for the asynchronous version.
@sa warpAffine
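A minimal sketch (rotating 30 degrees about the image center; the transformation matrix is built
on the host with cv::getRotationMatrix2D):
@code{.cpp}
cv::Mat M = cv::getRotationMatrix2D(cv::Point2f(d_src.cols / 2.f, d_src.rows / 2.f), 30.0, 1.0);
cv::cuda::GpuMat d_dst;
cv::cuda::warpAffine(d_src, d_dst, M, d_src.size());
@endcode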
*/
CV_EXPORTS void warpAffine(InputArray src, OutputArray dst, InputArray M, Size dsize, int flags = INTER_LINEAR,
int borderMode = BORDER_CONSTANT, Scalar borderValue = Scalar(), Stream& stream = Stream::Null());
/** @brief Builds transformation maps for affine transformation.
@param M *2x3* transformation matrix.
@param inverse Flag specifying that M is an inverse transformation ( dst=\>src ).
@param dsize Size of the destination image.
@param xmap X values with CV\_32FC1 type.
@param ymap Y values with CV\_32FC1 type.
@param stream Stream for the asynchronous version.
@sa cuda::warpAffine , cuda::remap
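A sketch of the intended pattern (build the maps once, then remap any number of frames; M and
dsize are assumed to be prepared by the caller):
@code{.cpp}
cv::cuda::GpuMat d_xmap, d_ymap;
cv::cuda::buildWarpAffineMaps(M, false, dsize, d_xmap, d_ymap);
cv::cuda::remap(d_src, d_dst, d_xmap, d_ymap, cv::INTER_LINEAR);
@endcode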
*/
CV_EXPORTS void buildWarpAffineMaps(InputArray M, bool inverse, Size dsize, OutputArray xmap, OutputArray ymap, Stream& stream = Stream::Null());
//! warps the image using perspective transformation
//! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC
/** @brief Applies a perspective transformation to an image.
@param src Source image. CV\_8U , CV\_16U , CV\_32S , or CV\_32F depth and 1, 3, or 4 channels are
supported.
@param dst Destination image with the same type as src . The size is dsize .
@param M *3x3* transformation matrix.
@param dsize Size of the destination image.
@param flags Combination of interpolation methods (see resize ) and the optional flag
WARP\_INVERSE\_MAP specifying that M is the inverse transformation ( dst =\> src ). Only
INTER\_NEAREST , INTER\_LINEAR , and INTER\_CUBIC interpolation methods are supported.
@param borderMode Pixel extrapolation method (see borderInterpolate ).
@param borderValue Value used in case of a constant border. By default, it is 0.
@param stream Stream for the asynchronous version.
@sa warpPerspective
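A minimal sketch (assuming M is a 3x3 host matrix, e.g. obtained from
cv::getPerspectiveTransform):
@code{.cpp}
cv::cuda::GpuMat d_dst;
cv::cuda::warpPerspective(d_src, d_dst, M, d_src.size());
@endcode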
*/
CV_EXPORTS void warpPerspective(InputArray src, OutputArray dst, InputArray M, Size dsize, int flags = INTER_LINEAR,
int borderMode = BORDER_CONSTANT, Scalar borderValue = Scalar(), Stream& stream = Stream::Null());
/** @brief Builds transformation maps for perspective transformation.
@param M *3x3* transformation matrix.
@param inverse Flag specifying that M is an inverse transformation ( dst=\>src ).
@param dsize Size of the destination image.
@param xmap X values with CV\_32FC1 type.
@param ymap Y values with CV\_32FC1 type.
@param stream Stream for the asynchronous version.
@sa cuda::warpPerspective , cuda::remap
*/
CV_EXPORTS void buildWarpPerspectiveMaps(InputArray M, bool inverse, Size dsize, OutputArray xmap, OutputArray ymap, Stream& stream = Stream::Null());
//! builds plane warping maps
/** @brief Builds plane warping maps.
*/
CV_EXPORTS void buildWarpPlaneMaps(Size src_size, Rect dst_roi, InputArray K, InputArray R, InputArray T, float scale,
OutputArray map_x, OutputArray map_y, Stream& stream = Stream::Null());
//! builds cylindrical warping maps
/** @brief Builds cylindrical warping maps.
*/
CV_EXPORTS void buildWarpCylindricalMaps(Size src_size, Rect dst_roi, InputArray K, InputArray R, float scale,
OutputArray map_x, OutputArray map_y, Stream& stream = Stream::Null());
//! builds spherical warping maps
/** @brief Builds spherical warping maps.
*/
CV_EXPORTS void buildWarpSphericalMaps(Size src_size, Rect dst_roi, InputArray K, InputArray R, float scale,
OutputArray map_x, OutputArray map_y, Stream& stream = Stream::Null());
//! rotates an image around the origin (0,0) and then shifts it
//! supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC
//! supports 1, 3 or 4 channels images with CV_8U, CV_16U or CV_32F depth
/** @brief Rotates an image around the origin (0,0) and then shifts it.
@param src Source image. Supports 1, 3 or 4 channels images with CV\_8U , CV\_16U or CV\_32F
depth.
@param dst Destination image with the same type as src . The size is dsize .
@param dsize Size of the destination image.
@param angle Angle of rotation in degrees.
@param xShift Shift along the horizontal axis.
@param yShift Shift along the vertical axis.
@param interpolation Interpolation method. Only INTER\_NEAREST , INTER\_LINEAR , and INTER\_CUBIC
are supported.
@param stream Stream for the asynchronous version.
@sa cuda::warpAffine
*/
CV_EXPORTS void rotate(InputArray src, OutputArray dst, Size dsize, double angle, double xShift = 0, double yShift = 0,
int interpolation = INTER_LINEAR, Stream& stream = Stream::Null());
//! smoothes the source image and downsamples it
/** @brief Smoothes an image and downsamples it.
@param src Source image.
@param dst Destination image. Will have Size((src.cols+1)/2, (src.rows+1)/2) size and the same
type as src .
@param stream Stream for the asynchronous version.
@sa pyrDown
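A minimal sketch (assuming d\_src already holds an image on the GPU):
@code{.cpp}
cv::cuda::GpuMat d_half;
cv::cuda::pyrDown(d_src, d_half); // (src.cols+1)/2 x (src.rows+1)/2
@endcode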
*/
CV_EXPORTS void pyrDown(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
//! upsamples the source image and then smoothes it
/** @brief Upsamples an image and then smoothes it.
@param src Source image.
@param dst Destination image. Will have Size(src.cols\*2, src.rows\*2) size and the same type as
src .
@param stream Stream for the asynchronous version.
*/
CV_EXPORTS void pyrUp(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
class CV_EXPORTS ImagePyramid : public Algorithm
@ -108,6 +232,8 @@ public:
CV_EXPORTS Ptr<ImagePyramid> createImagePyramid(InputArray img, int nLayers = -1, Stream& stream = Stream::Null());
//! @}
}} // namespace cv { namespace cuda {
#endif /* __OPENCV_CUDAWARPING_HPP__ */

View File

@ -109,4 +109,11 @@
#include "cudev/expr/unary_op.hpp"
#include "cudev/expr/warping.hpp"
/**
@addtogroup cuda
@{
@defgroup cudev Device layer
@}
*/
#endif

View File

@ -50,6 +50,9 @@
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
struct Block
{
__device__ __forceinline__ static uint blockId()
@ -122,6 +125,9 @@ __device__ __forceinline__ static void blockTransfrom(InIt1 beg1, InIt1 end1, In
for(; t1 < end1; t1 += STRIDE, t2 += STRIDE, o += STRIDE)
*o = op(*t1, *t2);
}
//! @}
}}
#endif

View File

@ -50,6 +50,9 @@
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
template <class T> struct DynamicSharedMem
{
__device__ __forceinline__ operator T*()
@ -81,6 +84,8 @@ template <> struct DynamicSharedMem<double>
}
};
//! @}
}}
#endif

View File

@ -54,6 +54,9 @@
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
// blockReduce
template <int N, typename T, class Op>
@ -123,6 +126,8 @@ __device__ __forceinline__ void blockReduceKeyVal(const tuple<KP0, KP1, KP2, KP3
>(skeys, key, svals, val, tid, cmp);
}
//! @}
}}
#endif

View File

@ -51,6 +51,9 @@
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
template <int THREADS_NUM, typename T>
__device__ T blockScanInclusive(T data, volatile T* smem, uint tid)
{
@ -96,6 +99,8 @@ __device__ __forceinline__ T blockScanExclusive(T data, volatile T* smem, uint t
return blockScanInclusive<THREADS_NUM>(data, smem, tid) - data;
}
//! @}
}}
#endif

View File

@ -53,6 +53,9 @@
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
// NormL1
template <typename T> struct NormL1
@ -179,6 +182,8 @@ struct NormHamming
}
};
//! @}
}}
#endif

View File

@ -52,6 +52,9 @@
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
using namespace cv::cuda;
// CV_CUDEV_ARCH
@ -84,6 +87,8 @@ __host__ __device__ __forceinline__ int divUp(int total, int grain)
#define CV_PI_F ((float)CV_PI)
#define CV_LOG2_F ((float)CV_LOG2)
//! @}
}}
#endif

View File

@ -55,6 +55,9 @@
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
#define CV_CUDEV_EXPR_BINARY_FUNC(name) \
template <class SrcPtr1, class SrcPtr2> \
__host__ Expr<BinaryTransformPtrSz<typename PtrTraits<SrcPtr1>::ptr_type, typename PtrTraits<SrcPtr2>::ptr_type, name ## _func<typename LargerType<typename PtrTraits<SrcPtr1>::value_type, typename PtrTraits<SrcPtr2>::value_type>::type> > > \
@ -70,6 +73,8 @@ CV_CUDEV_EXPR_BINARY_FUNC(absdiff)
#undef CV_CUDEV_EXPR_BINARY_FUNC
//! @}
}}
#endif

View File

@ -58,6 +58,9 @@
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
// Binary Operations
#define CV_CUDEV_EXPR_BINOP_INST(op, functor) \
@ -230,6 +233,8 @@ CV_CUDEV_EXPR_BINOP_INST(>>, bit_rshift)
#undef CV_CUDEV_EXPR_BINOP_INST
//! @}
}}
#endif

View File

@ -54,6 +54,9 @@
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
#define CV_CUDEV_EXPR_CVTCOLOR_INST(name) \
template <class SrcPtr> \
__host__ Expr<UnaryTransformPtrSz<typename PtrTraits<SrcPtr>::ptr_type, name ## _func<typename VecTraits<typename PtrTraits<SrcPtr>::value_type>::elem_type> > > \
@ -277,6 +280,8 @@ CV_CUDEV_EXPR_CVTCOLOR_INST(Luv4_to_LBGRA)
#undef CV_CUDEV_EXPR_CVTCOLOR_INST
//! @}
}}
#endif

View File

@ -53,6 +53,9 @@
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
// derivX
template <class SrcPtr>
@ -116,6 +119,8 @@ laplacian_(const SrcPtr& src)
return makeExpr(laplacianPtr<ksize>(src));
}
//! @}
}}
#endif

View File

@ -51,6 +51,9 @@
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
template <class Body> struct Expr
{
Body body;
@ -87,6 +90,8 @@ template <class Body> struct PtrTraits< Expr<Body> >
}
};
//! @}
}}
#endif

View File

@ -56,6 +56,9 @@
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
// min/max
template <class SrcPtr1, class SrcPtr2>
@ -127,6 +130,8 @@ lut_(const SrcPtr& src, const TablePtr& tbl)
return makeExpr(lutPtr(src, tbl));
}
//! @}
}}
#endif

View File

@ -56,6 +56,9 @@
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
// sum
template <class SrcPtr> struct SumExprBody
@ -254,6 +257,8 @@ integral_(const SrcPtr& src)
return makeExpr(body);
}
//! @}
}}
#endif

View File

@ -54,6 +54,9 @@
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
#define CV_CUDEV_EXPR_UNARY_FUNC(name) \
template <class SrcPtr> \
__host__ Expr<UnaryTransformPtrSz<typename PtrTraits<SrcPtr>::ptr_type, name ## _func<typename PtrTraits<SrcPtr>::value_type> > > \
@ -93,6 +96,8 @@ pow_(const SrcPtr& src, float power)
return makeExpr(transformPtr(src, bind2nd(pow_func<typename PtrTraits<SrcPtr>::value_type>(), power)));
}
//! @}
}}
#endif

View File

@ -57,6 +57,9 @@
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
#define CV_CUDEV_EXPR_UNOP_INST(op, functor) \
template <typename T> \
__host__ Expr<UnaryTransformPtrSz<typename PtrTraits<GpuMat_<T> >::ptr_type, functor<T> > > \
@ -89,6 +92,8 @@ CV_CUDEV_EXPR_UNOP_INST(~, bit_not)
#undef CV_CUDEV_EXPR_UNOP_INST
//! @}
}}
#endif

View File

@ -57,6 +57,9 @@
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
// resize
template <class SrcPtr>
@ -166,6 +169,8 @@ transpose_(const SrcPtr& src)
return makeExpr(body);
}
//! @}
}}
#endif

View File

@ -51,6 +51,9 @@
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
// Various 3/4-channel to 3/4-channel RGB transformations
#define CV_CUDEV_RGB2RGB_INST(name, scn, dcn, bidx) \
@ -469,6 +472,8 @@ CV_CUDEV_RGB5x52GRAY_INST(BGR565_to_GRAY, 6)
#undef CV_CUDEV_RGB5x52GRAY_INST
//! @}
}}
#endif

View File

@ -54,6 +54,9 @@
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
// Function Objects
template <typename _Arg, typename _Result> struct unary_function
@ -873,6 +876,8 @@ template <typename F> struct IsBinaryFunction
enum { value = (sizeof(check(makeF())) == sizeof(Yes)) };
};
//! @}
}}
#endif

View File

@ -51,6 +51,9 @@
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
template <class Op, int n> struct UnaryTupleAdapter
{
typedef typename Op::result_type result_type;
@ -93,6 +96,8 @@ __host__ __device__ BinaryTupleAdapter<Op, n0, n1> binaryTupleAdapter(const Op&
return a;
}
//! @}
}}
#endif

View File

@ -57,6 +57,9 @@
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
template <class Policy, class SrcPtr, typename DstType, class MaskPtr>
__host__ void gridCopy_(const SrcPtr& src, GpuMat_<DstType>& dst, const MaskPtr& mask, Stream& stream = Stream::Null())
{
@ -447,6 +450,8 @@ __host__ void gridCopy_(const SrcPtrTuple& src, const tuple< GlobPtrSz<D0>, Glob
gridCopy_<DefaultCopyPolicy>(src, dst, stream);
}
//! @}
}}
#endif

View File

@ -54,6 +54,9 @@
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
template <int BIN_COUNT, class Policy, class SrcPtr, typename ResType, class MaskPtr>
__host__ void gridHistogram_(const SrcPtr& src, GpuMat_<ResType>& dst, const MaskPtr& mask, Stream& stream = Stream::Null())
{
@ -114,6 +117,8 @@ __host__ void gridHistogram(const SrcPtr& src, GpuMat_<ResType>& dst, Stream& st
gridHistogram_<BIN_COUNT, DefaultHistogramPolicy>(src, dst, stream);
}
//! @}
}}
#endif

View File

@ -53,6 +53,9 @@
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
template <class SrcPtr, typename DstType>
__host__ void gridIntegral(const SrcPtr& src, GpuMat_<DstType>& dst, Stream& stream = Stream::Null())
{
@ -64,6 +67,8 @@ __host__ void gridIntegral(const SrcPtr& src, GpuMat_<DstType>& dst, Stream& str
integral_detail::integral(shrinkPtr(src), shrinkPtr(dst), rows, cols, StreamAccessor::getStream(stream));
}
//! @}
}}
#endif

View File

@ -55,6 +55,9 @@
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
template <class Brd, class SrcPtr, typename DstType>
__host__ void gridPyrDown_(const SrcPtr& src, GpuMat_<DstType>& dst, Stream& stream = Stream::Null())
{
@ -83,6 +86,8 @@ __host__ void gridPyrUp(const SrcPtr& src, GpuMat_<DstType>& dst, Stream& stream
pyramids_detail::pyrUp(shrinkPtr(src), shrinkPtr(dst), rows, cols, dst.rows, dst.cols, StreamAccessor::getStream(stream));
}
//! @}
}}
#endif

View File

@ -57,6 +57,9 @@
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
template <class Policy, class SrcPtr, typename ResType, class MaskPtr>
__host__ void gridCalcSum_(const SrcPtr& src, GpuMat_<ResType>& dst, const MaskPtr& mask, Stream& stream = Stream::Null())
{
@ -370,6 +373,8 @@ __host__ void gridCountNonZero(const SrcPtr& src, GpuMat_<ResType>& dst, Stream&
gridCountNonZero_<DefaultGlobReducePolicy>(src, dst, stream);
}
//! @}
}}
#endif

View File

@ -59,6 +59,9 @@
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
template <typename T> struct Sum : plus<T>
{
typedef T work_type;
@ -225,6 +228,8 @@ __host__ void gridReduceToColumn(const SrcPtr& src, GpuMat_<ResType>& dst, Strea
gridReduceToColumn_<Reductor, DefaultReduceToVecPolicy>(src, dst, stream);
}
//! @}
}}
#endif

View File

@ -57,6 +57,9 @@
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
template <class Policy, class SrcPtrTuple, typename DstType, class MaskPtr>
__host__ void gridMerge_(const SrcPtrTuple& src, GpuMat_<DstType>& dst, const MaskPtr& mask, Stream& stream = Stream::Null())
{
@ -579,6 +582,8 @@ __host__ void gridSplit(const SrcPtr& src, GlobPtrSz<DstType> (&dst)[COUNT], Str
gridSplit_<DefaultSplitMergePolicy>(src, dst, stream);
}
//! @}
}}
#endif

View File

@ -57,6 +57,9 @@
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
template <class Policy, class SrcPtr, typename DstType, class UnOp, class MaskPtr>
__host__ void gridTransformUnary_(const SrcPtr& src, GpuMat_<DstType>& dst, const UnOp& op, const MaskPtr& mask, Stream& stream = Stream::Null())
{
@ -536,6 +539,8 @@ __host__ void gridTransformTuple(const SrcPtr& src, const tuple< GlobPtrSz<D0>,
gridTransformTuple_<DefaultTransformPolicy>(src, dst, op, stream);
}
//! @}
}}
#endif

View File

@ -54,6 +54,9 @@
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
template <class Policy, class SrcPtr, typename DstType>
__host__ void gridTranspose_(const SrcPtr& src, GpuMat_<DstType>& dst, Stream& stream = Stream::Null())
{
@ -98,6 +101,8 @@ __host__ void gridTranspose(const SrcPtr& src, const GlobPtrSz<DstType>& dst, St
gridTranspose_<DefaultTransposePolicy>(src, dst, stream);
}
//! @}
}}
#endif

View File

@ -51,6 +51,9 @@
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
template <typename T> struct ConstantPtr
{
typedef T value_type;
@ -88,6 +91,8 @@ template <typename T> struct PtrTraits< ConstantPtrSz<T> > : PtrTraitsBase< Cons
{
};
//! @}
}}
#endif

View File

@ -53,6 +53,9 @@
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
// derivX
template <class SrcPtr> struct DerivXPtr
@ -388,6 +391,8 @@ template <int ksize, class SrcPtr> struct PtrTraits< LaplacianPtrSz<ksize, SrcPt
{
};
//! @}
}}
#endif

View File

@ -52,6 +52,9 @@
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
// BrdConstant
template <class SrcPtr> struct BrdConstant
@ -214,6 +217,8 @@ __host__ BrdBase<BrdWrap, typename PtrTraits<SrcPtr>::ptr_type> brdWrap(const Sr
return b;
}
//! @}
}}
#endif

View File

@ -51,6 +51,9 @@
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
template <typename T> struct GlobPtr
{
typedef T value_type;
@ -106,6 +109,8 @@ template <typename T> struct PtrTraits< GlobPtrSz<T> > : PtrTraitsBase<GlobPtrSz
{
};
//! @}
}}
#endif

View File

@ -53,6 +53,9 @@
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
template <typename T>
class GpuMat_ : public GpuMat
{
@ -154,6 +157,8 @@ template <typename T> struct PtrTraits< GpuMat_<T> > : PtrTraitsBase<GpuMat_<T>,
{
};
//! @}
}}
#include "detail/gpumat.hpp"

View File

@ -55,6 +55,9 @@
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
// Nearest
template <class SrcPtr> struct NearestInterPtr
@ -380,6 +383,8 @@ template <class SrcPtr> struct PtrTraits< CommonAreaInterPtrSz<SrcPtr> > : PtrTr
{
};
//! @}
}}
#endif

View File

@ -54,6 +54,9 @@
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
template <class SrcPtr, class TablePtr> struct LutPtr
{
typedef typename PtrTraits<TablePtr>::value_type value_type;
@ -95,6 +98,8 @@ template <class SrcPtr, class TablePtr> struct PtrTraits< LutPtrSz<SrcPtr, Table
{
};
//! @}
}}
#endif

View File

@ -51,6 +51,9 @@
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
struct WithOutMask
{
typedef bool value_type;
@ -98,6 +101,8 @@ template <class MaskPtr> struct PtrTraits< SingleMaskChannelsSz<MaskPtr> > : Ptr
{
};
//! @}
}}
#endif

View File

@ -54,6 +54,9 @@
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
template <class SrcPtr, class MapPtr> struct RemapPtr1
{
typedef typename PtrTraits<SrcPtr>::value_type value_type;
@ -149,6 +152,8 @@ template <class SrcPtr, class MapXPtr, class MapYPtr> struct PtrTraits< RemapPtr
{
};
//! @}
}}
#endif

View File

@ -54,6 +54,9 @@
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
template <class SrcPtr> struct ResizePtr
{
typedef typename PtrTraits<SrcPtr>::value_type value_type;
@ -98,6 +101,8 @@ template <class SrcPtr> struct PtrTraits< ResizePtrSz<SrcPtr> > : PtrTraitsBase<
{
};
//! @}
}}
#endif

View File

@ -92,6 +92,9 @@ namespace
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
#if CUDART_VERSION >= 5050
template <typename T> struct TexturePtr
@ -248,6 +251,8 @@ template <typename T> struct PtrTraits< Texture<T> > : PtrTraitsBase<Texture<T>,
#endif
//! @}
}}
#endif

View File

@ -50,6 +50,9 @@
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
template <class Ptr2DSz, class Ptr2D> struct PtrTraitsBase
{
typedef Ptr2DSz ptr_sz_type;
@ -96,6 +99,8 @@ __host__ int getCols(const Ptr2DSz& ptr)
return PtrTraits<Ptr2DSz>::getCols(ptr);
}
//! @}
}}
#endif

View File

@ -53,6 +53,9 @@
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
// UnaryTransformPtr
template <class SrcPtr, class Op> struct UnaryTransformPtr
@ -146,6 +149,8 @@ template <class Src1Ptr, class Src2Ptr, class Op> struct PtrTraits< BinaryTransf
{
};
//! @}
}}
#endif

View File

@ -53,6 +53,9 @@
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
// affine
struct AffineMapPtr
@ -147,6 +150,8 @@ warpPerspectivePtr(const SrcPtr& src, Size dstSize, const GpuMat_<float>& warpMa
return remapPtr(src, perspectiveMap(dstSize, warpMat));
}
//! @}
}}
#endif

View File

@ -52,6 +52,9 @@
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
template <class PtrTuple> struct ZipPtr;
template <class Ptr0, class Ptr1> struct ZipPtr< tuple<Ptr0, Ptr1> > : tuple<Ptr0, Ptr1>
@ -168,6 +171,8 @@ template <class PtrTuple> struct PtrTraits< ZipPtrSz<PtrTuple> > : PtrTraitsBase
{
};
//! @}
}}
#endif

View File

@ -50,6 +50,9 @@
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
// atomicAdd
__device__ __forceinline__ int atomicAdd(int* address, int val)
@ -192,6 +195,8 @@ __device__ static double atomicMax(double* address, double val)
#endif
}
//! @}
}}
#endif

View File

@ -52,6 +52,9 @@
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
template <class T> struct numeric_limits;
template <> struct numeric_limits<bool>
@ -119,6 +122,8 @@ template <> struct numeric_limits<double>
static const bool is_signed = true;
};
//! @}
}}
#endif

View File

@ -50,6 +50,9 @@
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
template <typename T> __device__ __forceinline__ T saturate_cast(uchar v) { return T(v); }
template <typename T> __device__ __forceinline__ T saturate_cast(schar v) { return T(v); }
template <typename T> __device__ __forceinline__ T saturate_cast(ushort v) { return T(v); }
@ -267,6 +270,8 @@ template <> __device__ __forceinline__ uint saturate_cast<uint>(double v)
#endif
}
//! @}
}}
#endif

View File

@ -128,6 +128,9 @@
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
// 2
__device__ __forceinline__ uint vadd2(uint a, uint b)
@ -908,6 +911,8 @@ __device__ __forceinline__ uint vmin4(uint a, uint b)
return r;
}
//! @}
}}
#endif

View File

@ -51,6 +51,9 @@
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
using tuple_detail::tuple;
using tuple_detail::tuple_size;
using tuple_detail::get;
@ -75,6 +78,8 @@ template <class Tuple, template <typename T> class CvtOp> struct ConvertTuple
typedef typename tuple_detail::ConvertTuple<Tuple, tuple_size<Tuple>::value, CvtOp>::type type;
};
//! @}
}}
#endif

View File

@ -52,6 +52,9 @@
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
// NullType
struct NullType {};
@ -164,6 +167,8 @@ template <typename A, typename B> struct LargerType
>::type type;
};
//! @}
}}
#endif

View File

@ -51,6 +51,9 @@
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
// saturate_cast
namespace vec_math_detail
@ -931,6 +934,8 @@ CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2, double, double, double)
#undef CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC
//! @}
}}
#endif

View File

@ -50,6 +50,9 @@
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
// MakeVec
template<typename T, int CN> struct MakeVec;
@ -177,6 +180,8 @@ template<> struct VecTraits<char4>
__host__ __device__ __forceinline__ static char4 make(const schar* v) {return make_char4(v[0], v[1], v[2], v[3]);}
};
//! @}
}}
// DataType

View File

@ -53,6 +53,9 @@
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
// warpReduce
template <typename T, class Op>
@ -201,6 +204,8 @@ smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7, T8* t
return make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6, (volatile T7*) t7, (volatile T8*) t8, (volatile T9*) t9);
}
//! @}
}}
#endif

View File

@ -52,6 +52,9 @@
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
template <typename T>
__device__ T warpScanInclusive(T data, volatile T* smem, uint tid)
{
@ -94,6 +97,8 @@ __device__ __forceinline__ T warpScanExclusive(T data, volatile T* smem, uint ti
return warpScanInclusive(data, smem, tid) - data;
}
//! @}
}}
#endif

View File

@ -51,6 +51,9 @@
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
#if CV_CUDEV_ARCH >= 300
// shfl
@ -419,6 +422,8 @@ CV_CUDEV_SHFL_XOR_VEC_INST(double)
#endif // CV_CUDEV_ARCH >= 300
//! @}
}}
#endif

View File

@ -50,6 +50,9 @@
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
enum
{
LOG_WARP_SIZE = 5,
@ -117,6 +120,8 @@ __device__ __forceinline__ void warpYota(OutIt beg, OutIt end, T value)
*t = value;
}
//! @}
}}
#endif

View File

@ -187,6 +187,8 @@ namespace cv
} /* namespace viz */
} /* namespace cv */
//! @cond IGNORED
//////////////////////////////////////////////////////////////////////////////////////////////////////
/// cv::viz::Color
@ -237,4 +239,6 @@ inline cv::viz::Color cv::viz::Color::amethyst() { return Color(204, 102,
inline cv::viz::Color cv::viz::Color::not_set() { return Color(-1, -1, -1); }
//! @endcond
#endif