From ceb6e8bd940cd4f28f3cf167c41acd676a1223e4 Mon Sep 17 00:00:00 2001
From: Maksim Shabunin <maksim.shabunin@itseez.com>
Date: Thu, 20 Nov 2014 16:42:06 +0300
Subject: [PATCH] Doxygen documentation: cuda

---
 doc/CMakeLists.txt                            |   8 +-
 doc/Doxyfile.in                               |   2 +-
 modules/core/include/opencv2/core/cuda.hpp    | 199 ++++-
 .../opencv2/core/cuda_stream_accessor.hpp     |   5 +
 .../core/include/opencv2/core/cuda_types.hpp  |  11 +
 modules/cuda/doc/introduction.markdown        |  85 ++
 modules/cuda/include/opencv2/cuda.hpp         | 183 +++-
 .../cudaarithm/include/opencv2/cudaarithm.hpp | 793 +++++++++++++++---
 .../cudabgsegm/include/opencv2/cudabgsegm.hpp | 114 ++-
 .../cudacodec/include/opencv2/cudacodec.hpp   | 200 ++++-
 .../include/opencv2/cudafeatures2d.hpp        | 260 ++++--
 .../include/opencv2/cudafilters.hpp           | 208 ++++-
 .../include/opencv2/cudaimgproc.hpp           | 486 ++++++++++-
 .../cudalegacy/include/opencv2/cudalegacy.hpp |   7 +
 .../include/opencv2/cudalegacy/NCV.hpp        |   4 +-
 .../opencv2/cudalegacy/NCVBroxOpticalFlow.hpp |   6 +
 .../cudalegacy/NCVHaarObjectDetection.hpp     |   4 +-
 .../include/opencv2/cudalegacy/NCVPyramid.hpp |   4 +
 .../opencv2/cudalegacy/NPP_staging.hpp        |  15 +-
 .../include/opencv2/cudalegacy/private.hpp    |   4 +
 .../include/opencv2/cudaoptflow.hpp           | 101 ++-
 .../cudastereo/include/opencv2/cudastereo.hpp | 171 +++-
 .../include/opencv2/cudawarping.hpp           | 158 +++-
 modules/cudev/include/opencv2/cudev.hpp       |   7 +
 .../include/opencv2/cudev/block/block.hpp     |   6 +
 .../opencv2/cudev/block/dynamic_smem.hpp      |   5 +
 .../include/opencv2/cudev/block/reduce.hpp    |   5 +
 .../include/opencv2/cudev/block/scan.hpp      |   5 +
 .../opencv2/cudev/block/vec_distance.hpp      |   5 +
 .../cudev/include/opencv2/cudev/common.hpp    |   5 +
 .../opencv2/cudev/expr/binary_func.hpp        |   5 +
 .../include/opencv2/cudev/expr/binary_op.hpp  |   5 +
 .../include/opencv2/cudev/expr/color.hpp      |   5 +
 .../include/opencv2/cudev/expr/deriv.hpp      |   5 +
 .../cudev/include/opencv2/cudev/expr/expr.hpp |   5 +
 .../opencv2/cudev/expr/per_element_func.hpp   |   5 +
 .../include/opencv2/cudev/expr/reduction.hpp  |   5 +
 .../include/opencv2/cudev/expr/unary_func.hpp |   5 +
 .../include/opencv2/cudev/expr/unary_op.hpp   |   5 +
 .../include/opencv2/cudev/expr/warping.hpp    |   5 +
 .../opencv2/cudev/functional/color_cvt.hpp    |   5 +
 .../opencv2/cudev/functional/functional.hpp   |   5 +
 .../cudev/functional/tuple_adapter.hpp        |   5 +
 .../cudev/include/opencv2/cudev/grid/copy.hpp |   5 +
 .../include/opencv2/cudev/grid/histogram.hpp  |   5 +
 .../include/opencv2/cudev/grid/integral.hpp   |   5 +
 .../include/opencv2/cudev/grid/pyramids.hpp   |   5 +
 .../include/opencv2/cudev/grid/reduce.hpp     |   5 +
 .../opencv2/cudev/grid/reduce_to_vec.hpp      |   5 +
 .../opencv2/cudev/grid/split_merge.hpp        |   5 +
 .../include/opencv2/cudev/grid/transform.hpp  |   5 +
 .../include/opencv2/cudev/grid/transpose.hpp  |   5 +
 .../include/opencv2/cudev/ptr2d/constant.hpp  |   5 +
 .../include/opencv2/cudev/ptr2d/deriv.hpp     |   5 +
 .../opencv2/cudev/ptr2d/extrapolation.hpp     |   5 +
 .../include/opencv2/cudev/ptr2d/glob.hpp      |   5 +
 .../include/opencv2/cudev/ptr2d/gpumat.hpp    |   5 +
 .../opencv2/cudev/ptr2d/interpolation.hpp     |   5 +
 .../cudev/include/opencv2/cudev/ptr2d/lut.hpp |   5 +
 .../include/opencv2/cudev/ptr2d/mask.hpp      |   5 +
 .../include/opencv2/cudev/ptr2d/remap.hpp     |   5 +
 .../include/opencv2/cudev/ptr2d/resize.hpp    |   5 +
 .../include/opencv2/cudev/ptr2d/texture.hpp   |   5 +
 .../include/opencv2/cudev/ptr2d/traits.hpp    |   5 +
 .../include/opencv2/cudev/ptr2d/transform.hpp |   5 +
 .../include/opencv2/cudev/ptr2d/warping.hpp   |   5 +
 .../cudev/include/opencv2/cudev/ptr2d/zip.hpp |   5 +
 .../include/opencv2/cudev/util/atomic.hpp     |   5 +
 .../include/opencv2/cudev/util/limits.hpp     |   5 +
 .../opencv2/cudev/util/saturate_cast.hpp      |   5 +
 .../opencv2/cudev/util/simd_functions.hpp     |   5 +
 .../include/opencv2/cudev/util/tuple.hpp      |   5 +
 .../opencv2/cudev/util/type_traits.hpp        |   5 +
 .../include/opencv2/cudev/util/vec_math.hpp   |   5 +
 .../include/opencv2/cudev/util/vec_traits.hpp |   5 +
 .../include/opencv2/cudev/warp/reduce.hpp     |   5 +
 .../cudev/include/opencv2/cudev/warp/scan.hpp |   5 +
 .../include/opencv2/cudev/warp/shuffle.hpp    |   5 +
 .../cudev/include/opencv2/cudev/warp/warp.hpp |   5 +
 modules/viz/include/opencv2/viz/types.hpp     |   4 +
 80 files changed, 2917 insertions(+), 398 deletions(-)
 create mode 100644 modules/cuda/doc/introduction.markdown

diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt
index 81e18973f..2ef7b5fc4 100644
--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
@@ -159,12 +159,18 @@ if(BUILD_DOCS AND HAVE_DOXYGEN)
   set(reflist) # modules reference
   foreach(m ${candidates})
     set(reflist "${reflist} \n- @subpage ${m}")
-    set(all_headers ${all_headers} "${OPENCV_MODULE_opencv_${m}_HEADERS}")
+
+    set(header_dir "${OPENCV_MODULE_opencv_${m}_LOCATION}/include")
+    if(EXISTS ${header_dir})
+        set(all_headers ${all_headers} ${header_dir})
+    endif()
+
     set(docs_dir "${OPENCV_MODULE_opencv_${m}_LOCATION}/doc")
     if(EXISTS ${docs_dir})
       set(all_images ${all_images} ${docs_dir})
       set(all_headers ${all_headers} ${docs_dir})
     endif()
+
   endforeach()
 
   # additional config
diff --git a/doc/Doxyfile.in b/doc/Doxyfile.in
index 508b6512a..b27593764 100644
--- a/doc/Doxyfile.in
+++ b/doc/Doxyfile.in
@@ -99,7 +99,7 @@ FILE_PATTERNS          =
 RECURSIVE              = YES
 EXCLUDE                =
 EXCLUDE_SYMLINKS       = NO
-EXCLUDE_PATTERNS       =
+EXCLUDE_PATTERNS       = *.inl.hpp *.impl.hpp *_detail.hpp */cudev/**/detail/*.hpp
 EXCLUDE_SYMBOLS        = cv::DataType<*> int
 EXAMPLE_PATH           = @CMAKE_DOXYGEN_EXAMPLE_PATH@
 EXAMPLE_PATTERNS       = *
diff --git a/modules/core/include/opencv2/core/cuda.hpp b/modules/core/include/opencv2/core/cuda.hpp
index 612b5dbd1..218ebd86f 100644
--- a/modules/core/include/opencv2/core/cuda.hpp
+++ b/modules/core/include/opencv2/core/cuda.hpp
@@ -52,10 +52,12 @@
 #include "opencv2/core/cuda_types.hpp"
 
 /**
-@defgroup cuda CUDA-accelerated Computer Vision
-@{
-    @defgroup cuda_struct Data structures
-@}
+  @addtogroup cuda
+  @{
+    @defgroup cuda_init Initialization and Information
+    @defgroup cuda_struct Data Structures
+    @defgroup cuda_calib3d Camera Calibration and 3D Reconstruction
+  @}
  */
 
 namespace cv { namespace cuda {
@@ -65,8 +67,28 @@ namespace cv { namespace cuda {
 
 //////////////////////////////// GpuMat ///////////////////////////////
 
-//! Smart pointer for GPU memory with reference counting.
-//! Its interface is mostly similar with cv::Mat.
+/** @brief Base storage class for GPU memory with reference counting.
+
+Its interface matches the Mat interface with the following limitations:
+
+-   no arbitrary dimensions support (only 2D)
+-   no functions that return references to their data (because references on GPU are not valid for
+    CPU)
+-   no expression templates technique support
+
+Beware that the latter limitation may lead to overloaded matrix operators that cause memory
+allocations. The GpuMat class is convertible to cuda::PtrStepSz and cuda::PtrStep so it can be
+passed directly to the kernel.
+
+@note In contrast with Mat, in most cases GpuMat::isContinuous() == false. This means that rows are
+aligned to a size depending on the hardware. Single-row GpuMat is always a continuous matrix.
+
+@note It is not recommended to leave static or global GpuMat variables allocated, that is, to rely
+on their destructors. The destruction order of such variables and the CUDA context is undefined. The
+GPU memory release function returns an error if the CUDA context has already been destroyed.
+
+@sa Mat
+ */
 class CV_EXPORTS GpuMat
 {
 public:
@@ -277,11 +299,28 @@ public:
     Allocator* allocator;
 };
 
-//! creates continuous matrix
+/** @brief Creates a continuous matrix.
+
+@param rows Row count.
+@param cols Column count.
+@param type Type of the matrix.
+@param arr Destination matrix. This parameter changes only if it has a proper type and area (
+\f$\texttt{rows} \times \texttt{cols}\f$ ).
+
+Matrix is called continuous if its elements are stored continuously, that is, without gaps at the
+end of each row.
+ */
 CV_EXPORTS void createContinuous(int rows, int cols, int type, OutputArray arr);
 
-//! ensures that size of the given matrix is not less than (rows, cols) size
-//! and matrix type is match specified one too
+/** @brief Ensures that the size of a matrix is big enough and the matrix has a proper type.
+
+@param rows Minimum desired number of rows.
+@param cols Minimum desired number of columns.
+@param type Desired matrix type.
+@param arr Destination matrix.
+
+The function does not reallocate memory if the matrix has proper attributes already.
+ */
 CV_EXPORTS void ensureSizeIsEnough(int rows, int cols, int type, OutputArray arr);
 
 CV_EXPORTS GpuMat allocMatFromBuf(int rows, int cols, int type, GpuMat& mat);
@@ -292,10 +331,21 @@ CV_EXPORTS void setBufferPoolConfig(int deviceId, size_t stackSize, int stackCou
 
 //////////////////////////////// CudaMem ////////////////////////////////
 
-//! CudaMem is limited cv::Mat with page locked memory allocation.
-//! Page locked memory is only needed for async and faster coping to GPU.
-//! It is convertable to cv::Mat header without reference counting
-//! so you can use it with other opencv functions.
+/** @brief Class with reference counting wrapping special memory type allocation functions from CUDA.
+
+Its interface is also Mat-like but with additional memory type parameters.
+
+-   **PAGE\_LOCKED** sets a page locked memory type used commonly for fast and asynchronous
+    uploading/downloading data from/to GPU.
+-   **SHARED** specifies a zero copy memory allocation that enables mapping the host memory to GPU
+    address space, if supported.
+-   **WRITE\_COMBINED** sets the write combined buffer that is not cached by CPU. Such buffers are
+    used to supply GPU with data when GPU only reads it. The advantage is a better CPU cache
+    utilization.
+
+@note Allocation size of such memory types is usually limited. For more details, see *CUDA 2.2
+Pinned Memory APIs* document or *CUDA C Programming Guide*.
+ */
 class CV_EXPORTS CudaMem
 {
 public:
@@ -335,7 +385,13 @@ public:
     //! returns matrix header with disabled reference counting for CudaMem data.
     Mat createMatHeader() const;
 
-    //! maps host memory into device address space and returns GpuMat header for it. Throws exception if not supported by hardware.
+    /** @brief Maps CPU memory to GPU address space and creates the cuda::GpuMat header without reference counting
+    for it.
+
+    This can be done only if memory was allocated with the SHARED flag and if it is supported by the
+    hardware. Laptops often share video and CPU memory, so address spaces can be mapped, which
+    eliminates an extra copy.
+     */
     GpuMat createGpuMatHeader() const;
 
     // Please see cv::Mat for descriptions
@@ -363,17 +419,28 @@ public:
     AllocType alloc_type;
 };
 
-//! page-locks the matrix m memory and maps it for the device(s)
+/** @brief Page-locks the memory of matrix and maps it for the device(s).
+
+@param m Input matrix.
+ */
 CV_EXPORTS void registerPageLocked(Mat& m);
 
-//! unmaps the memory of matrix m, and makes it pageable again
+/** @brief Unmaps the memory of matrix and makes it pageable again.
+
+@param m Input matrix.
+ */
 CV_EXPORTS void unregisterPageLocked(Mat& m);
 
 ///////////////////////////////// Stream //////////////////////////////////
 
-//! Encapculates Cuda Stream. Provides interface for async coping.
-//! Passed to each function that supports async kernel execution.
-//! Reference counting is enabled.
+/** @brief This class encapsulates a queue of asynchronous calls.
+
+@note Currently, you may face problems if an operation is enqueued twice with different data. Some
+functions use the constant GPU memory, and next call may update the memory before the previous one
+has been finished. But calling different operations asynchronously is safe because each operation
+has its own constant buffer. Memory copy/upload/download/set operations to the buffers you hold are
+also safe.
+ */
 class CV_EXPORTS Stream
 {
     typedef void (Stream::*bool_type)() const;
@@ -385,16 +452,26 @@ public:
     //! creates a new asynchronous stream
     Stream();
 
-    //! queries an asynchronous stream for completion status
+    /** @brief Returns true if the current stream queue is finished. Otherwise, it returns false.
+    */
     bool queryIfComplete() const;
 
-    //! waits for stream tasks to complete
+    /** @brief Blocks the current CPU thread until all operations in the stream are complete.
+    */
     void waitForCompletion();
 
-    //! makes a compute stream wait on an event
+    /** @brief Makes a compute stream wait on an event.
+    */
     void waitEvent(const Event& event);
 
-    //! adds a callback to be called on the host after all currently enqueued items in the stream have completed
+    /** @brief Adds a callback to be called on the host after all currently enqueued items in the stream have
+    completed.
+
+    @note Callbacks must not make any CUDA API calls. Callbacks must not perform any synchronization
+    that may depend on outstanding device work or other callbacks that are not mandated to run earlier.
+    Callbacks without a mandated order (in independent streams) execute in undefined order and may be
+    serialized.
+     */
     void enqueueHostCallback(StreamCallback callback, void* userData);
 
     //! return Stream object for default CUDA stream
@@ -446,21 +523,41 @@ private:
     friend struct EventAccessor;
 };
 
+//! @} cuda_struct
+
 //////////////////////////////// Initialization & Info ////////////////////////
 
-//! this is the only function that do not throw exceptions if the library is compiled without CUDA
+//! @addtogroup cuda_init
+//! @{
+
+/** @brief Returns the number of installed CUDA-enabled devices.
+
+Use this function before any other CUDA function calls. If OpenCV is compiled without CUDA support,
+this function returns 0.
+ */
 CV_EXPORTS int getCudaEnabledDeviceCount();
 
-//! set device to be used for GPU executions for the calling host thread
+/** @brief Sets a device and initializes it for the current thread.
+
+@param device System index of a CUDA device starting with 0.
+
+If the call of this function is omitted, a default device is initialized at the first CUDA usage.
+ */
 CV_EXPORTS void setDevice(int device);
 
-//! returns which device is currently being used for the calling host thread
+/** @brief Returns the current device index set by cuda::setDevice or initialized by default.
+ */
 CV_EXPORTS int getDevice();
 
-//! explicitly destroys and cleans up all resources associated with the current device in the current process
-//! any subsequent API call to this device will reinitialize the device
+/** @brief Explicitly destroys and cleans up all resources associated with the current device in the current
+process.
+
+Any subsequent API call to this device will reinitialize the device.
+ */
 CV_EXPORTS void resetDevice();
 
+/** @brief Enumeration providing CUDA computing features.
+ */
 enum FeatureSet
 {
     FEATURE_SET_COMPUTE_10 = 10,
@@ -482,12 +579,27 @@ enum FeatureSet
 //! checks whether current device supports the given feature
 CV_EXPORTS bool deviceSupports(FeatureSet feature_set);
 
-//! information about what GPU archs this OpenCV CUDA module was compiled for
+/** @brief Class providing a set of static methods to check what NVIDIA\* card architecture the CUDA module was
+built for.
+
+According to the CUDA C Programming Guide Version 3.2: "PTX code produced for some specific compute
+capability can always be compiled to binary code of greater or equal compute capability".
+ */
 class CV_EXPORTS TargetArchs
 {
 public:
+    /** @brief Checks whether the module was built with the support of the given feature.
+
+    @param feature\_set Features to be checked. See cuda::FeatureSet.
+     */
     static bool builtWith(FeatureSet feature_set);
 
+    /** @brief There is a set of methods to check whether the module contains intermediate (PTX) or binary CUDA
+    code for the given architecture(s):
+
+    @param major Major compute capability version.
+    @param minor Minor compute capability version.
+     */
     static bool has(int major, int minor);
     static bool hasPtx(int major, int minor);
     static bool hasBin(int major, int minor);
@@ -498,17 +610,25 @@ public:
     static bool hasEqualOrGreaterBin(int major, int minor);
 };
 
-//! information about the given GPU.
+/** @brief Class providing functionality for querying the specified GPU properties.
+ */
 class CV_EXPORTS DeviceInfo
 {
 public:
     //! creates DeviceInfo object for the current GPU
     DeviceInfo();
 
-    //! creates DeviceInfo object for the given GPU
+    /** @brief The constructors.
+
+    @param device\_id System index of the CUDA device starting with 0.
+
+    Constructs the DeviceInfo object for the specified device. If device\_id parameter is omitted, it
+    constructs an object for the current device.
+     */
     DeviceInfo(int device_id);
 
-    //! device number.
+    /** @brief Returns system index of the CUDA device starting with 0.
+    */
     int deviceID() const;
 
     //! ASCII string identifying device
@@ -680,10 +800,19 @@ public:
     size_t freeMemory() const;
     size_t totalMemory() const;
 
-    //! checks whether device supports the given feature
+    /** @brief Provides information on CUDA feature support.
+
+    @param feature\_set Features to be checked. See cuda::FeatureSet.
+
+    This function returns true if the device has the specified CUDA feature. Otherwise, it returns false.
+     */
     bool supports(FeatureSet feature_set) const;
 
-    //! checks whether the CUDA module can be run on the given device
+    /** @brief Checks the CUDA module and device compatibility.
+
+    This function returns true if the CUDA module can be run on the specified device. Otherwise, it
+    returns false.
+     */
     bool isCompatible() const;
 
 private:
@@ -693,7 +822,7 @@ private:
 CV_EXPORTS void printCudaDeviceInfo(int device);
 CV_EXPORTS void printShortCudaDeviceInfo(int device);
 
-//! @}
+//! @} cuda_init
 
 }} // namespace cv { namespace cuda {
 
diff --git a/modules/core/include/opencv2/core/cuda_stream_accessor.hpp b/modules/core/include/opencv2/core/cuda_stream_accessor.hpp
index 4eb4ba61a..38275d2fa 100644
--- a/modules/core/include/opencv2/core/cuda_stream_accessor.hpp
+++ b/modules/core/include/opencv2/core/cuda_stream_accessor.hpp
@@ -66,6 +66,11 @@ namespace cv
         class Stream;
         class Event;
 
+        /** @brief Class that enables getting cudaStream\_t from cuda::Stream
+
+        It is declared in a separate header (cuda_stream_accessor.hpp) because this is the only public
+        header that depends on the CUDA Runtime API. Including it brings a dependency to your code.
+         */
         struct StreamAccessor
         {
             CV_EXPORTS static cudaStream_t getStream(const Stream& stream);
diff --git a/modules/core/include/opencv2/core/cuda_types.hpp b/modules/core/include/opencv2/core/cuda_types.hpp
index ec67ae08b..490086fb0 100644
--- a/modules/core/include/opencv2/core/cuda_types.hpp
+++ b/modules/core/include/opencv2/core/cuda_types.hpp
@@ -89,6 +89,11 @@ namespace cv
             size_t size;
         };
 
+        /** @brief Structure similar to cuda::PtrStepSz but containing only a pointer and row step.
+
+        Width and height fields are excluded due to performance reasons. The structure is intended
+        for internal use or for users who write device code.
+         */
         template <typename T> struct PtrStep : public DevPtr<T>
         {
             __CV_CUDA_HOST_DEVICE__ PtrStep() : step(0) {}
@@ -104,6 +109,12 @@ namespace cv
             __CV_CUDA_HOST_DEVICE__ const T& operator ()(int y, int x) const { return ptr(y)[x]; }
         };
 
+        /** @brief Lightweight class encapsulating pitched memory on a GPU and passed to nvcc-compiled code (CUDA
+        kernels).
+
+        Typically, it is used internally by OpenCV and by users who write device code. You can call
+        its members from both host and device code.
+         */
         template <typename T> struct PtrStepSz : public PtrStep<T>
         {
             __CV_CUDA_HOST_DEVICE__ PtrStepSz() : cols(0), rows(0) {}
diff --git a/modules/cuda/doc/introduction.markdown b/modules/cuda/doc/introduction.markdown
new file mode 100644
index 000000000..ebe8c21af
--- /dev/null
+++ b/modules/cuda/doc/introduction.markdown
@@ -0,0 +1,85 @@
+CUDA Module Introduction {#cuda_intro}
+========================
+
+General Information
+-------------------
+
+The OpenCV CUDA module is a set of classes and functions to utilize CUDA computational capabilities.
+It is implemented using NVIDIA\* CUDA\* Runtime API and supports only NVIDIA GPUs. The OpenCV CUDA
+module includes utility functions, low-level vision primitives, and high-level algorithms. The
+utility functions and low-level primitives provide a powerful infrastructure for developing fast
+vision algorithms taking advantage of CUDA whereas the high-level functionality includes some
+state-of-the-art algorithms (such as stereo correspondence, face and people detectors, and others)
+ready to be used by the application developers.
+
+The CUDA module is designed as a host-level API. This means that if you have pre-compiled OpenCV
+CUDA binaries, you are not required to have the CUDA Toolkit installed or write any extra code to
+make use of CUDA.
+
+The OpenCV CUDA module is designed for ease of use and does not require any knowledge of CUDA.
+However, such knowledge will certainly be useful to handle non-trivial cases or achieve the highest
+performance. It is helpful to understand the cost of various operations, what the GPU does, what the
+preferred data formats are, and so on. The CUDA module is an effective instrument for quick
+implementation of CUDA-accelerated computer vision algorithms. However, if your algorithm involves
+many simple operations, then, for the best possible performance, you may still need to write your
+own kernels to avoid extra write and read operations on the intermediate results.
+
+To enable CUDA support, configure OpenCV using CMake with WITH\_CUDA=ON . When the flag is set and
+if CUDA is installed, the full-featured OpenCV CUDA module is built. Otherwise, the module is still
+built but at runtime all functions from the module throw Exception with CV\_GpuNotSupported error
+code, except for cuda::getCudaEnabledDeviceCount(). The latter function returns zero GPU count in
+this case. Building OpenCV without CUDA support does not perform device code compilation, so it does
+not require the CUDA Toolkit installed. Therefore, using the cuda::getCudaEnabledDeviceCount()
+function, you can implement a high-level algorithm that will detect GPU presence at runtime and
+choose an appropriate implementation (CPU or GPU) accordingly.
+
+Compilation for Different NVIDIA\* Platforms
+--------------------------------------------
+
+NVIDIA\* compiler enables generating binary code (cubin and fatbin) and intermediate code (PTX).
+Binary code often implies a specific GPU architecture and generation, so the compatibility with
+other GPUs is not guaranteed. PTX is targeted for a virtual platform that is defined entirely by the
+set of capabilities or features. Depending on the selected virtual platform, some of the
+instructions are emulated or disabled, even if the real hardware supports all the features.
+
+At the first call, the PTX code is compiled to binary code for the particular GPU using a JIT
+compiler. When the target GPU has a compute capability (CC) lower than the PTX code, JIT fails. By
+default, the OpenCV CUDA module includes:
+
+-   Binaries for compute capabilities 1.3 and 2.0 (controlled by CUDA\_ARCH\_BIN in
+    CMake)
+
+-   PTX code for compute capabilities 1.1 and 1.3 (controlled by CUDA\_ARCH\_PTX in
+    CMake)
+
+This means that for devices with CC 1.3 and 2.0 binary images are ready to run. For all newer
+platforms, the PTX code for 1.3 is JIT'ed to a binary image. For devices with CC 1.1 and 1.2, the
+PTX for 1.1 is JIT'ed. For devices with CC 1.0, no code is available and the functions throw
+Exception. For platforms where JIT compilation is performed first, the run is slow.
+
+On a GPU with CC 1.0, you can still compile the CUDA module and most of the functions will run
+flawlessly. To achieve this, add "1.0" to the list of binaries, for example,
+CUDA\_ARCH\_BIN="1.0 1.3 2.0" . The functions that cannot be run on CC 1.0 GPUs throw an exception.
+
+You can always determine at runtime whether the OpenCV GPU-built binaries (or PTX code) are
+compatible with your GPU. The function cuda::DeviceInfo::isCompatible returns the compatibility
+status (true/false).
+
+Utilizing Multiple GPUs
+-----------------------
+
+In the current version, each of the OpenCV CUDA algorithms can use only a single GPU. So, to utilize
+multiple GPUs, you have to manually distribute the work between GPUs. Switching the active device can be
+done using the cuda::setDevice() function. For more details, please read the CUDA C Programming Guide.
+
+While developing algorithms for multiple GPUs, note a data passing overhead. For primitive functions
+and small images, it can be significant, which may eliminate all the advantages of having multiple
+GPUs. But for high-level algorithms, consider using multi-GPU acceleration. For example, the Stereo
+Block Matching algorithm has been successfully parallelized using the following algorithm:
+
+1.  Split each image of the stereo pair into two horizontal overlapping stripes.
+2.  Process each pair of stripes (from the left and right images) on a separate Fermi\* GPU.
+3.  Merge the results into a single disparity map.
+
+With this algorithm, a dual GPU gave a 180% performance increase compared to a single Fermi GPU.
+For a source code example, see <https://github.com/Itseez/opencv/tree/master/samples/gpu/>.
diff --git a/modules/cuda/include/opencv2/cuda.hpp b/modules/cuda/include/opencv2/cuda.hpp
index a42bfb7d8..1937987db 100644
--- a/modules/cuda/include/opencv2/cuda.hpp
+++ b/modules/cuda/include/opencv2/cuda.hpp
@@ -49,10 +49,22 @@
 
 #include "opencv2/core/cuda.hpp"
 
+/**
+@defgroup cuda CUDA-accelerated Computer Vision
+    @ref cuda_intro "Introduction page"
+    @{
+        @defgroup cuda_objdetect Object Detection
+    @}
+
+ */
+
 namespace cv { namespace cuda {
 
 //////////////// HOG (Histogram-of-Oriented-Gradients) Descriptor and Object Detector //////////////
 
+//! @addtogroup cuda_objdetect
+//! @{
+
 struct CV_EXPORTS HOGConfidence
 {
    double scale;
@@ -61,31 +73,92 @@ struct CV_EXPORTS HOGConfidence
    std::vector<double> part_scores[4];
 };
 
+/** @brief The class implements Histogram of Oriented Gradients (@cite Dalal2005) object detector.
+
+Interfaces of all methods are kept similar to the CPU HOG descriptor and detector analogues as much
+as possible.
+
+@note
+    -   An example applying the HOG descriptor for people detection can be found at
+        opencv\_source\_code/samples/cpp/peopledetect.cpp
+    -   A CUDA example applying the HOG descriptor for people detection can be found at
+        opencv\_source\_code/samples/gpu/hog.cpp
+    -   (Python) An example applying the HOG descriptor for people detection can be found at
+        opencv\_source\_code/samples/python2/peopledetect.py
+ */
 struct CV_EXPORTS HOGDescriptor
 {
     enum { DEFAULT_WIN_SIGMA = -1 };
     enum { DEFAULT_NLEVELS = 64 };
     enum { DESCR_FORMAT_ROW_BY_ROW, DESCR_FORMAT_COL_BY_COL };
 
+    /** @brief Creates the HOG descriptor and detector.
+
+    @param win\_size Detection window size. Align to block size and block stride.
+    @param block\_size Block size in pixels. Align to cell size. Only (16,16) is supported for now.
+    @param block\_stride Block stride. It must be a multiple of cell size.
+    @param cell\_size Cell size. Only (8, 8) is supported for now.
+    @param nbins Number of bins. Only 9 bins per cell are supported for now.
+    @param win\_sigma Gaussian smoothing window parameter.
+    @param threshold\_L2hys L2-Hys normalization method shrinkage.
+    @param gamma\_correction Flag to specify whether the gamma correction preprocessing is required or
+    not.
+    @param nlevels Maximum number of detection window increases.
+     */
     HOGDescriptor(Size win_size=Size(64, 128), Size block_size=Size(16, 16),
                   Size block_stride=Size(8, 8), Size cell_size=Size(8, 8),
                   int nbins=9, double win_sigma=DEFAULT_WIN_SIGMA,
                   double threshold_L2hys=0.2, bool gamma_correction=true,
                   int nlevels=DEFAULT_NLEVELS);
 
+    /** @brief Returns the number of coefficients required for the classification.
+     */
     size_t getDescriptorSize() const;
+    /** @brief Returns the block histogram size.
+    */
     size_t getBlockHistogramSize() const;
 
+    /** @brief Sets coefficients for the linear SVM classifier.
+    */
     void setSVMDetector(const std::vector<float>& detector);
 
+    /** @brief Returns coefficients of the classifier trained for people detection (for default window size).
+    */
     static std::vector<float> getDefaultPeopleDetector();
+    /** @brief Returns coefficients of the classifier trained for people detection (for 48x96 windows).
+    */
     static std::vector<float> getPeopleDetector48x96();
+    /** @brief Returns coefficients of the classifier trained for people detection (for 64x128 windows).
+    */
     static std::vector<float> getPeopleDetector64x128();
 
+    /** @brief Performs object detection without a multi-scale window.
+
+    @param img Source image. CV\_8UC1 and CV\_8UC4 types are supported for now.
+    @param found\_locations Left-top corner points of detected objects boundaries.
+    @param hit\_threshold Threshold for the distance between features and SVM classifying plane.
+    Usually it is 0 and should be specified in the detector coefficients (as the last free
+    coefficient). But if the free coefficient is omitted (which is allowed), you can specify it
+    manually here.
+    @param win\_stride Window stride. It must be a multiple of block stride.
+    @param padding Mock parameter to keep the CPU interface compatibility. It must be (0,0).
+     */
     void detect(const GpuMat& img, std::vector<Point>& found_locations,
                 double hit_threshold=0, Size win_stride=Size(),
                 Size padding=Size());
 
+    /** @brief Performs object detection with a multi-scale window.
+
+    @param img Source image. See cuda::HOGDescriptor::detect for type limitations.
+    @param found\_locations Detected objects boundaries.
+    @param hit\_threshold Threshold for the distance between features and SVM classifying plane. See
+    cuda::HOGDescriptor::detect for details.
+    @param win\_stride Window stride. It must be a multiple of block stride.
+    @param padding Mock parameter to keep the CPU interface compatibility. It must be (0,0).
+    @param scale0 Coefficient of the detection window increase.
+    @param group\_threshold Coefficient to regulate the similarity threshold. When detected, some
+    objects can be covered by many rectangles. 0 means not to perform grouping. See groupRectangles .
+     */
     void detectMultiScale(const GpuMat& img, std::vector<Rect>& found_locations,
                           double hit_threshold=0, Size win_stride=Size(),
                           Size padding=Size(), double scale0=1.05,
@@ -98,6 +171,17 @@ struct CV_EXPORTS HOGDescriptor
                                                                     double hit_threshold, Size win_stride, Size padding,
                                                                     std::vector<HOGConfidence> &conf_out, int group_threshold);
 
+    /** @brief Returns block descriptors computed for the whole image.
+
+    @param img Source image. See cuda::HOGDescriptor::detect for type limitations.
+    @param win\_stride Window stride. It must be a multiple of block stride.
+    @param descriptors 2D array of descriptors.
+    @param descr\_format Descriptor storage format:
+    -   **DESCR\_FORMAT\_ROW\_BY\_ROW** - Row-major order.
+    -   **DESCR\_FORMAT\_COL\_BY\_COL** - Column-major order.
+
+    The function is mainly used to learn the classifier.
+     */
     void getDescriptors(const GpuMat& img, Size win_stride,
                         GpuMat& descriptors,
                         int descr_format=DESCR_FORMAT_COL_BY_COL);
@@ -145,20 +229,82 @@ protected:
 
 //////////////////////////// CascadeClassifier ////////////////////////////
 
-// The cascade classifier class for object detection: supports old haar and new lbp xlm formats and nvbin for haar cascades olny.
+/** @brief Cascade classifier class used for object detection. Supports HAAR and LBP cascades.
+
+@note
+    -   A cascade classifier example can be found at
+        opencv\_source\_code/samples/gpu/cascadeclassifier.cpp
+    -   An NVIDIA API specific cascade classifier example can be found at
+        opencv\_source\_code/samples/gpu/cascadeclassifier\_nvidia\_api.cpp
+ */
 class CV_EXPORTS CascadeClassifier_CUDA
 {
 public:
     CascadeClassifier_CUDA();
+    /** @brief Loads the classifier from a file. Cascade type is detected automatically by constructor parameter.
+
+    @param filename Name of the file from which the classifier is loaded. Only the old haar classifier
+    (trained by the haar training application) and NVIDIA's nvbin are supported for HAAR and only new
+    type of OpenCV XML cascade is supported for LBP.
+     */
     CascadeClassifier_CUDA(const String& filename);
     ~CascadeClassifier_CUDA();
 
+    /** @brief Checks whether the classifier is loaded or not.
+    */
     bool empty() const;
+    /** @brief Loads the classifier from a file. The previous content is destroyed.
+
+    @param filename Name of the file from which the classifier is loaded. Only the old haar classifier
+    (trained by the haar training application) and NVIDIA's nvbin are supported for HAAR and only new
+    type of OpenCV XML cascade is supported for LBP.
+     */
     bool load(const String& filename);
+    /** @brief Destroys the loaded classifier.
+    */
     void release();
 
-    /* returns number of detected objects */
+    /** @overload */
     int detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, double scaleFactor = 1.2, int minNeighbors = 4, Size minSize = Size());
+    /** @brief Detects objects of different sizes in the input image.
+
+    @param image Matrix of type CV\_8U containing an image where objects should be detected.
+    @param objectsBuf Buffer to store detected objects (rectangles). If it is empty, it is allocated
+    with the default size. If not empty, the function searches not more than N objects, where
+    N = sizeof(objectsBuf's data)/sizeof(cv::Rect).
+    @param maxObjectSize Maximum possible object size. Objects larger than that are ignored. Used for
+    second signature and supported only for LBP cascades.
+    @param scaleFactor Parameter specifying how much the image size is reduced at each image scale.
+    @param minNeighbors Parameter specifying how many neighbors each candidate rectangle should have
+    to retain it.
+    @param minSize Minimum possible object size. Objects smaller than that are ignored.
+
+    The detected objects are returned as a list of rectangles.
+
+    The function returns the number of detected objects, so you can retrieve them as in the following
+    example:
+    @code
+        cuda::CascadeClassifier_CUDA cascade_gpu(...);
+
+        Mat image_cpu = imread(...);
+        GpuMat image_gpu(image_cpu);
+
+        GpuMat objbuf;
+        int detections_number = cascade_gpu.detectMultiScale( image_gpu,
+                  objbuf, 1.2, minNeighbors);
+
+        Mat obj_host;
+        // download only detected number of rectangles
+        objbuf.colRange(0, detections_number).download(obj_host);
+
+        Rect* faces = obj_host.ptr<Rect>();
+        for(int i = 0; i < detections_number; ++i)
+           cv::rectangle(image_cpu, faces[i], Scalar(255));
+
+        imshow("Faces", image_cpu);
+    @endcode
+    @sa CascadeClassifier::detectMultiScale
+     */
     int detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, Size maxObjectSize, Size minSize = Size(), double scaleFactor = 1.1, int minNeighbors = 4);
 
     bool findLargestObject;
@@ -174,8 +320,13 @@ private:
     friend class CascadeClassifier_CUDA_LBP;
 };
 
+//! @} cuda_objdetect
+
 //////////////////////////// Labeling ////////////////////////////
 
+//! @addtogroup cuda
+//! @{
+
 //!performs labeling via graph cuts of a 2D regular 4-connected graph.
 CV_EXPORTS void graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& bottom, GpuMat& labels,
                          GpuMat& buf, Stream& stream = Stream::Null());
@@ -192,8 +343,13 @@ CV_EXPORTS void connectivityMask(const GpuMat& image, GpuMat& mask, const cv::Sc
 //! performs connected componnents labeling.
 CV_EXPORTS void labelComponents(const GpuMat& mask, GpuMat& components, int flags = 0, Stream& stream = Stream::Null());
 
+//! @}
+
 //////////////////////////// Calib3d ////////////////////////////
 
+//! @addtogroup cuda_calib3d
+//! @{
+
 CV_EXPORTS void transformPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec,
                                 GpuMat& dst, Stream& stream = Stream::Null());
 
@@ -201,13 +357,34 @@ CV_EXPORTS void projectPoints(const GpuMat& src, const Mat& rvec, const Mat& tve
                               const Mat& camera_mat, const Mat& dist_coef, GpuMat& dst,
                               Stream& stream = Stream::Null());
 
+/** @brief Finds the object pose from 3D-2D point correspondences.
+
+@param object Single-row matrix of object points.
+@param image Single-row matrix of image points.
+@param camera\_mat 3x3 matrix of intrinsic camera parameters.
+@param dist\_coef Distortion coefficients. See undistortPoints for details.
+@param rvec Output 3D rotation vector.
+@param tvec Output 3D translation vector.
+@param use\_extrinsic\_guess Flag to indicate that the function must use rvec and tvec as an
+initial transformation guess. It is not supported for now.
+@param num\_iters Maximum number of RANSAC iterations.
+@param max\_dist Euclidean distance threshold to detect whether point is inlier or not.
+@param min\_inlier\_count Number of inliers at which the function must stop: iteration ends once
+this many (or more) inliers are found. It is not supported for now.
+@param inliers Output vector of inlier indices.
+ */
 CV_EXPORTS void solvePnPRansac(const Mat& object, const Mat& image, const Mat& camera_mat,
                                const Mat& dist_coef, Mat& rvec, Mat& tvec, bool use_extrinsic_guess=false,
                                int num_iters=100, float max_dist=8.0, int min_inlier_count=100,
                                std::vector<int>* inliers=NULL);
 
+//! @}
+
 //////////////////////////// VStab ////////////////////////////
 
+//! @addtogroup cuda
+//! @{
+
 //! removes points (CV_32FC2, single row matrix) with zero mask value
 CV_EXPORTS void compactPoints(GpuMat &points0, GpuMat &points1, const GpuMat &mask);
 
@@ -215,6 +392,8 @@ CV_EXPORTS void calcWobbleSuppressionMaps(
         int left, int idx, int right, Size size, const Mat &ml, const Mat &mr,
         GpuMat &mapx, GpuMat &mapy);
 
+//! @}
+
 }} // namespace cv { namespace cuda {
 
 #endif /* __OPENCV_CUDA_HPP__ */
diff --git a/modules/cudaarithm/include/opencv2/cudaarithm.hpp b/modules/cudaarithm/include/opencv2/cudaarithm.hpp
index e493fd759..1ed1b3fe7 100644
--- a/modules/cudaarithm/include/opencv2/cudaarithm.hpp
+++ b/modules/cudaarithm/include/opencv2/cudaarithm.hpp
@@ -49,18 +49,85 @@
 
 #include "opencv2/core/cuda.hpp"
 
+/**
+  @addtogroup cuda
+  @{
+    @defgroup cudaarithm Operations on Matrices
+    @{
+        @defgroup cudaarithm_core Core Operations on Matrices
+        @defgroup cudaarithm_elem Per-element Operations
+        @defgroup cudaarithm_reduce Matrix Reductions
+        @defgroup cudaarithm_arithm Arithm Operations on Matrices
+    @}
+  @}
+ */
+
 namespace cv { namespace cuda {
 
-//! adds one matrix to another (dst = src1 + src2)
+//! @addtogroup cudaarithm
+//! @{
+
+//! @addtogroup cudaarithm_elem
+//! @{
+
+/** @brief Computes a matrix-matrix or matrix-scalar sum.
+
+@param src1 First source matrix or scalar.
+@param src2 Second source matrix or scalar. Matrix should have the same size and type as src1 .
+@param dst Destination matrix that has the same size and number of channels as the input array(s).
+The depth is defined by dtype or src1 depth.
+@param mask Optional operation mask, 8-bit single channel array, that specifies elements of the
+destination array to be changed.
+@param dtype Optional depth of the output array.
+@param stream Stream for the asynchronous version.
+
+@sa add
+ */
 CV_EXPORTS void add(InputArray src1, InputArray src2, OutputArray dst, InputArray mask = noArray(), int dtype = -1, Stream& stream = Stream::Null());
 
-//! subtracts one matrix from another (dst = src1 - src2)
+/** @brief Computes a matrix-matrix or matrix-scalar difference.
+
+@param src1 First source matrix or scalar.
+@param src2 Second source matrix or scalar. Matrix should have the same size and type as src1 .
+@param dst Destination matrix that has the same size and number of channels as the input array(s).
+The depth is defined by dtype or src1 depth.
+@param mask Optional operation mask, 8-bit single channel array, that specifies elements of the
+destination array to be changed.
+@param dtype Optional depth of the output array.
+@param stream Stream for the asynchronous version.
+
+@sa subtract
+ */
 CV_EXPORTS void subtract(InputArray src1, InputArray src2, OutputArray dst, InputArray mask = noArray(), int dtype = -1, Stream& stream = Stream::Null());
 
-//! computes element-wise weighted product of the two arrays (dst = scale * src1 * src2)
+/** @brief Computes a matrix-matrix or matrix-scalar per-element product.
+
+@param src1 First source matrix or scalar.
+@param src2 Second source matrix or scalar.
+@param dst Destination matrix that has the same size and number of channels as the input array(s).
+The depth is defined by dtype or src1 depth.
+@param scale Optional scale factor.
+@param dtype Optional depth of the output array.
+@param stream Stream for the asynchronous version.
+
+@sa multiply
+ */
 CV_EXPORTS void multiply(InputArray src1, InputArray src2, OutputArray dst, double scale = 1, int dtype = -1, Stream& stream = Stream::Null());
 
-//! computes element-wise weighted quotient of the two arrays (dst = scale * (src1 / src2))
+/** @brief Computes a matrix-matrix or matrix-scalar division.
+
+@param src1 First source matrix or a scalar.
+@param src2 Second source matrix or scalar.
+@param dst Destination matrix that has the same size and number of channels as the input array(s).
+The depth is defined by dtype or src1 depth.
+@param scale Optional scale factor.
+@param dtype Optional depth of the output array.
+@param stream Stream for the asynchronous version.
+
+This function, in contrast to divide, uses a round-down rounding mode.
+
+@sa divide
+ */
 CV_EXPORTS void divide(InputArray src1, InputArray src2, OutputArray dst, double scale = 1, int dtype = -1, Stream& stream = Stream::Null());
 
 //! computes element-wise weighted reciprocal of an array (dst = scale/src2)
@@ -69,59 +136,199 @@ static inline void divide(double src1, InputArray src2, OutputArray dst, int dty
     divide(src1, src2, dst, 1.0, dtype, stream);
 }
 
-//! computes element-wise absolute difference of two arrays (dst = abs(src1 - src2))
+/** @brief Computes per-element absolute difference of two matrices (or of a matrix and scalar).
+
+@param src1 First source matrix or scalar.
+@param src2 Second source matrix or scalar.
+@param dst Destination matrix that has the same size and type as the input array(s).
+@param stream Stream for the asynchronous version.
+
+@sa absdiff
+ */
 CV_EXPORTS void absdiff(InputArray src1, InputArray src2, OutputArray dst, Stream& stream = Stream::Null());
 
-//! computes absolute value of each matrix element
+/** @brief Computes an absolute value of each matrix element.
+
+@param src Source matrix.
+@param dst Destination matrix with the same size and type as src .
+@param stream Stream for the asynchronous version.
+
+@sa abs
+ */
 CV_EXPORTS void abs(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
 
-//! computes square of each pixel in an image
+/** @brief Computes a square value of each matrix element.
+
+@param src Source matrix.
+@param dst Destination matrix with the same size and type as src .
+@param stream Stream for the asynchronous version.
+ */
 CV_EXPORTS void sqr(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
 
-//! computes square root of each pixel in an image
+/** @brief Computes a square root of each matrix element.
+
+@param src Source matrix.
+@param dst Destination matrix with the same size and type as src .
+@param stream Stream for the asynchronous version.
+
+@sa sqrt
+ */
 CV_EXPORTS void sqrt(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
 
-//! computes exponent of each matrix element
+/** @brief Computes an exponent of each matrix element.
+
+@param src Source matrix.
+@param dst Destination matrix with the same size and type as src .
+@param stream Stream for the asynchronous version.
+
+@sa exp
+ */
 CV_EXPORTS void exp(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
 
-//! computes natural logarithm of absolute value of each matrix element
+/** @brief Computes a natural logarithm of absolute value of each matrix element.
+
+@param src Source matrix.
+@param dst Destination matrix with the same size and type as src .
+@param stream Stream for the asynchronous version.
+
+@sa log
+ */
 CV_EXPORTS void log(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
 
-//! computes power of each matrix element:
-//!    (dst(i,j) = pow(     src(i,j) , power), if src.type() is integer
-//!    (dst(i,j) = pow(fabs(src(i,j)), power), otherwise
+/** @brief Raises every matrix element to a power.
+
+@param src Source matrix.
+@param power Exponent of power.
+@param dst Destination matrix with the same size and type as src .
+@param stream Stream for the asynchronous version.
+
+The function pow raises every element of the input matrix to power :
+
+\f[\texttt{dst} (I) =  \fork{\texttt{src}(I)^{power}}{if \texttt{power} is integer}{|\texttt{src}(I)|^{power}}{otherwise}\f]
+
+@sa pow
+ */
 CV_EXPORTS void pow(InputArray src, double power, OutputArray dst, Stream& stream = Stream::Null());
 
-//! compares elements of two arrays (dst = src1 <cmpop> src2)
+/** @brief Compares elements of two matrices (or of a matrix and scalar).
+
+@param src1 First source matrix or scalar.
+@param src2 Second source matrix or scalar.
+@param dst Destination matrix that has the same size and type as the input array(s).
+@param cmpop Flag specifying the relation between the elements to be checked:
+-   **CMP\_EQ:** a(.) == b(.)
+-   **CMP\_GT:** a(.) \> b(.)
+-   **CMP\_GE:** a(.) \>= b(.)
+-   **CMP\_LT:** a(.) \< b(.)
+-   **CMP\_LE:** a(.) \<= b(.)
+-   **CMP\_NE:** a(.) != b(.)
+@param stream Stream for the asynchronous version.
+
+@sa compare
+ */
 CV_EXPORTS void compare(InputArray src1, InputArray src2, OutputArray dst, int cmpop, Stream& stream = Stream::Null());
 
-//! performs per-elements bit-wise inversion
+/** @brief Performs a per-element bitwise inversion.
+
+@param src Source matrix.
+@param dst Destination matrix with the same size and type as src .
+@param mask Optional operation mask. 8-bit single channel image.
+@param stream Stream for the asynchronous version.
+ */
 CV_EXPORTS void bitwise_not(InputArray src, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null());
 
-//! calculates per-element bit-wise disjunction of two arrays
+/** @brief Performs a per-element bitwise disjunction of two matrices (or of matrix and scalar).
+
+@param src1 First source matrix or scalar.
+@param src2 Second source matrix or scalar.
+@param dst Destination matrix that has the same size and type as the input array(s).
+@param mask Optional operation mask. 8-bit single channel image.
+@param stream Stream for the asynchronous version.
+ */
 CV_EXPORTS void bitwise_or(InputArray src1, InputArray src2, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null());
 
-//! calculates per-element bit-wise conjunction of two arrays
+/** @brief Performs a per-element bitwise conjunction of two matrices (or of matrix and scalar).
+
+@param src1 First source matrix or scalar.
+@param src2 Second source matrix or scalar.
+@param dst Destination matrix that has the same size and type as the input array(s).
+@param mask Optional operation mask. 8-bit single channel image.
+@param stream Stream for the asynchronous version.
+ */
 CV_EXPORTS void bitwise_and(InputArray src1, InputArray src2, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null());
 
-//! calculates per-element bit-wise "exclusive or" operation
+/** @brief Performs a per-element bitwise exclusive or operation of two matrices (or of matrix and scalar).
+
+@param src1 First source matrix or scalar.
+@param src2 Second source matrix or scalar.
+@param dst Destination matrix that has the same size and type as the input array(s).
+@param mask Optional operation mask. 8-bit single channel image.
+@param stream Stream for the asynchronous version.
+ */
 CV_EXPORTS void bitwise_xor(InputArray src1, InputArray src2, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null());
 
-//! pixel by pixel right shift of an image by a constant value
-//! supports 1, 3 and 4 channels images with integers elements
+/** @brief Performs pixel by pixel right shift of an image by a constant value.
+
+@param src Source matrix. Supports 1, 3 and 4 channels images with integer elements.
+@param val Constant values, one per channel.
+@param dst Destination matrix with the same size and type as src .
+@param stream Stream for the asynchronous version.
+ */
 CV_EXPORTS void rshift(InputArray src, Scalar_<int> val, OutputArray dst, Stream& stream = Stream::Null());
 
-//! pixel by pixel left shift of an image by a constant value
-//! supports 1, 3 and 4 channels images with CV_8U, CV_16U or CV_32S depth
+/** @brief Performs pixel by pixel left shift of an image by a constant value.
+
+@param src Source matrix. Supports 1, 3 and 4 channels images with CV\_8U , CV\_16U or CV\_32S
+depth.
+@param val Constant values, one per channel.
+@param dst Destination matrix with the same size and type as src .
+@param stream Stream for the asynchronous version.
+ */
 CV_EXPORTS void lshift(InputArray src, Scalar_<int> val, OutputArray dst, Stream& stream = Stream::Null());
 
-//! computes per-element minimum of two arrays (dst = min(src1, src2))
+/** @brief Computes the per-element minimum of two matrices (or a matrix and a scalar).
+
+@param src1 First source matrix or scalar.
+@param src2 Second source matrix or scalar.
+@param dst Destination matrix that has the same size and type as the input array(s).
+@param stream Stream for the asynchronous version.
+
+@sa min
+ */
 CV_EXPORTS void min(InputArray src1, InputArray src2, OutputArray dst, Stream& stream = Stream::Null());
 
-//! computes per-element maximum of two arrays (dst = max(src1, src2))
+/** @brief Computes the per-element maximum of two matrices (or a matrix and a scalar).
+
+@param src1 First source matrix or scalar.
+@param src2 Second source matrix or scalar.
+@param dst Destination matrix that has the same size and type as the input array(s).
+@param stream Stream for the asynchronous version.
+
+@sa max
+ */
 CV_EXPORTS void max(InputArray src1, InputArray src2, OutputArray dst, Stream& stream = Stream::Null());
 
-//! computes the weighted sum of two arrays (dst = alpha*src1 + beta*src2 + gamma)
+/** @brief Computes the weighted sum of two arrays.
+
+@param src1 First source array.
+@param alpha Weight for the first array elements.
+@param src2 Second source array of the same size and channel number as src1 .
+@param beta Weight for the second array elements.
+@param dst Destination array that has the same size and number of channels as the input arrays.
+@param gamma Scalar added to each sum.
+@param dtype Optional depth of the destination array. When both input arrays have the same depth,
+dtype can be set to -1, which will be equivalent to src1.depth().
+@param stream Stream for the asynchronous version.
+
+The function addWeighted calculates the weighted sum of two arrays as follows:
+
+\f[\texttt{dst} (I)= \texttt{saturate} ( \texttt{src1} (I)* \texttt{alpha} +  \texttt{src2} (I)* \texttt{beta} +  \texttt{gamma} )\f]
+
+where I is a multi-dimensional index of array elements. In case of multi-channel arrays, each
+channel is processed independently.
+
+@sa addWeighted
+ */
 CV_EXPORTS void addWeighted(InputArray src1, double alpha, InputArray src2, double beta, double gamma, OutputArray dst,
                             int dtype = -1, Stream& stream = Stream::Null());
 
@@ -131,142 +338,352 @@ static inline void scaleAdd(InputArray src1, double alpha, InputArray src2, Outp
     addWeighted(src1, alpha, src2, 1.0, 0.0, dst, -1, stream);
 }
 
-//! applies fixed threshold to the image
+/** @brief Applies a fixed-level threshold to each array element.
+
+@param src Source array (single-channel).
+@param dst Destination array with the same size and type as src .
+@param thresh Threshold value.
+@param maxval Maximum value to use with THRESH\_BINARY and THRESH\_BINARY\_INV threshold types.
+@param type Threshold type. For details, see threshold . The THRESH\_OTSU and THRESH\_TRIANGLE
+threshold types are not supported.
+@param stream Stream for the asynchronous version.
+
+@sa threshold
+ */
 CV_EXPORTS double threshold(InputArray src, OutputArray dst, double thresh, double maxval, int type, Stream& stream = Stream::Null());
 
-//! computes magnitude of complex (x(i).re, x(i).im) vector
-//! supports only CV_32FC2 type
+/** @brief Computes magnitudes of complex matrix elements.
+
+@param xy Source complex matrix in the interleaved format ( CV\_32FC2 ).
+@param magnitude Destination matrix of float magnitudes ( CV\_32FC1 ).
+@param stream Stream for the asynchronous version.
+
+@sa magnitude
+ */
 CV_EXPORTS void magnitude(InputArray xy, OutputArray magnitude, Stream& stream = Stream::Null());
 
-//! computes squared magnitude of complex (x(i).re, x(i).im) vector
-//! supports only CV_32FC2 type
+/** @brief Computes squared magnitudes of complex matrix elements.
+
+@param xy Source complex matrix in the interleaved format ( CV\_32FC2 ).
+@param magnitude Destination matrix of float magnitude squares ( CV\_32FC1 ).
+@param stream Stream for the asynchronous version.
+ */
 CV_EXPORTS void magnitudeSqr(InputArray xy, OutputArray magnitude, Stream& stream = Stream::Null());
 
-//! computes magnitude of each (x(i), y(i)) vector
-//! supports only floating-point source
+/** @overload
+ computes magnitude of each (x(i), y(i)) vector
+ supports only floating-point source
+@param x Source matrix containing real components ( CV\_32FC1 ).
+@param y Source matrix containing imaginary components ( CV\_32FC1 ).
+@param magnitude Destination matrix of float magnitudes ( CV\_32FC1 ).
+@param stream Stream for the asynchronous version.
+ */
 CV_EXPORTS void magnitude(InputArray x, InputArray y, OutputArray magnitude, Stream& stream = Stream::Null());
 
-//! computes squared magnitude of each (x(i), y(i)) vector
-//! supports only floating-point source
+/** @overload
+ computes squared magnitude of each (x(i), y(i)) vector
+ supports only floating-point source
+@param x Source matrix containing real components ( CV\_32FC1 ).
+@param y Source matrix containing imaginary components ( CV\_32FC1 ).
+@param magnitude Destination matrix of float magnitude squares ( CV\_32FC1 ).
+@param stream Stream for the asynchronous version.
+*/
 CV_EXPORTS void magnitudeSqr(InputArray x, InputArray y, OutputArray magnitude, Stream& stream = Stream::Null());
 
-//! computes angle of each (x(i), y(i)) vector
-//! supports only floating-point source
+/** @brief Computes polar angles of complex matrix elements.
+
+@param x Source matrix containing real components ( CV\_32FC1 ).
+@param y Source matrix containing imaginary components ( CV\_32FC1 ).
+@param angle Destination matrix of angles ( CV\_32FC1 ).
+@param angleInDegrees Flag for angles that must be evaluated in degrees.
+@param stream Stream for the asynchronous version.
+
+@sa phase
+ */
 CV_EXPORTS void phase(InputArray x, InputArray y, OutputArray angle, bool angleInDegrees = false, Stream& stream = Stream::Null());
 
-//! converts Cartesian coordinates to polar
-//! supports only floating-point source
+/** @brief Converts Cartesian coordinates into polar.
+
+@param x Source matrix containing real components ( CV\_32FC1 ).
+@param y Source matrix containing imaginary components ( CV\_32FC1 ).
+@param magnitude Destination matrix of float magnitudes ( CV\_32FC1 ).
+@param angle Destination matrix of angles ( CV\_32FC1 ).
+@param angleInDegrees Flag for angles that must be evaluated in degrees.
+@param stream Stream for the asynchronous version.
+
+@sa cartToPolar
+ */
 CV_EXPORTS void cartToPolar(InputArray x, InputArray y, OutputArray magnitude, OutputArray angle, bool angleInDegrees = false, Stream& stream = Stream::Null());
 
-//! converts polar coordinates to Cartesian
-//! supports only floating-point source
+/** @brief Converts polar coordinates into Cartesian.
+
+@param magnitude Source matrix containing magnitudes ( CV\_32FC1 ).
+@param angle Source matrix containing angles ( CV\_32FC1 ).
+@param x Destination matrix of real components ( CV\_32FC1 ).
+@param y Destination matrix of imaginary components ( CV\_32FC1 ).
+@param angleInDegrees Flag that indicates angles in degrees.
+@param stream Stream for the asynchronous version.
+ */
 CV_EXPORTS void polarToCart(InputArray magnitude, InputArray angle, OutputArray x, OutputArray y, bool angleInDegrees = false, Stream& stream = Stream::Null());
 
-//! makes multi-channel array out of several single-channel arrays
+//! @} cudaarithm_elem
+
+//! @addtogroup cudaarithm_core
+//! @{
+
+/** @brief Makes a multi-channel matrix out of several single-channel matrices.
+
+@param src Array/vector of source matrices.
+@param n Number of source matrices.
+@param dst Destination matrix.
+@param stream Stream for the asynchronous version.
+
+@sa merge
+ */
 CV_EXPORTS void merge(const GpuMat* src, size_t n, OutputArray dst, Stream& stream = Stream::Null());
+/** @overload */
 CV_EXPORTS void merge(const std::vector<GpuMat>& src, OutputArray dst, Stream& stream = Stream::Null());
 
-//! copies each plane of a multi-channel array to a dedicated array
+/** @brief Copies each plane of a multi-channel matrix into an array.
+
+@param src Source matrix.
+@param dst Destination array/vector of single-channel matrices.
+@param stream Stream for the asynchronous version.
+
+@sa split
+ */
 CV_EXPORTS void split(InputArray src, GpuMat* dst, Stream& stream = Stream::Null());
+/** @overload */
 CV_EXPORTS void split(InputArray src, std::vector<GpuMat>& dst, Stream& stream = Stream::Null());
 
-//! transposes the matrix
-//! supports matrix with element size = 1, 4 and 8 bytes (CV_8UC1, CV_8UC4, CV_16UC2, CV_32FC1, etc)
+/** @brief Transposes a matrix.
+
+@param src1 Source matrix. 1-, 4-, 8-byte element sizes are supported for now.
+@param dst Destination matrix.
+@param stream Stream for the asynchronous version.
+
+@sa transpose
+ */
 CV_EXPORTS void transpose(InputArray src1, OutputArray dst, Stream& stream = Stream::Null());
 
-//! reverses the order of the rows, columns or both in a matrix
-//! supports 1, 3 and 4 channels images with CV_8U, CV_16U, CV_32S or CV_32F depth
+/** @brief Flips a 2D matrix around vertical, horizontal, or both axes.
+
+@param src Source matrix. Supports 1, 3 and 4 channels images with CV\_8U, CV\_16U, CV\_32S or
+CV\_32F depth.
+@param dst Destination matrix.
+@param flipCode Flip mode for the source:
+-   0 Flips around x-axis.
+-   \> 0 Flips around y-axis.
+-   \< 0 Flips around both axes.
+@param stream Stream for the asynchronous version.
+
+@sa flip
+ */
 CV_EXPORTS void flip(InputArray src, OutputArray dst, int flipCode, Stream& stream = Stream::Null());
 
-//! transforms 8-bit unsigned integers using lookup table: dst(i)=lut(src(i))
-//! destination array will have the depth type as lut and the same channels number as source
-//! supports CV_8UC1, CV_8UC3 types
+/** @brief Base class for transform using lookup table.
+ */
 class CV_EXPORTS LookUpTable : public Algorithm
 {
 public:
+    /** @brief Transforms the source matrix into the destination matrix using the given look-up table:
+    dst(I) = lut(src(I)) .
+
+    @param src Source matrix. CV\_8UC1 and CV\_8UC3 matrices are supported for now.
+    @param dst Destination matrix.
+    @param stream Stream for the asynchronous version.
+     */
     virtual void transform(InputArray src, OutputArray dst, Stream& stream = Stream::Null()) = 0;
 };
 
+/** @brief Creates implementation for cuda::LookUpTable .
+
+@param lut Look-up table of 256 elements. It is a continuous CV\_8U matrix.
+ */
 CV_EXPORTS Ptr<LookUpTable> createLookUpTable(InputArray lut);
 
-//! copies 2D array to a larger destination array and pads borders with user-specifiable constant
+/** @brief Forms a border around an image.
+
+@param src Source image. CV\_8UC1 , CV\_8UC4 , CV\_32SC1 , and CV\_32FC1 types are supported.
+@param dst Destination image with the same type as src. The size is
+Size(src.cols+left+right, src.rows+top+bottom) .
+@param top
+@param bottom
+@param left
+@param right Number of pixels in each direction from the source image rectangle to extrapolate.
+For example: top=1, bottom=1, left=1, right=1 mean that 1 pixel-wide border needs to be built.
+@param borderType Border type. See borderInterpolate for details. BORDER\_REFLECT101 ,
+BORDER\_REPLICATE , BORDER\_CONSTANT , BORDER\_REFLECT and BORDER\_WRAP are supported for now.
+@param value Border value.
+@param stream Stream for the asynchronous version.
+ */
 CV_EXPORTS void copyMakeBorder(InputArray src, OutputArray dst, int top, int bottom, int left, int right, int borderType,
                                Scalar value = Scalar(), Stream& stream = Stream::Null());
 
-//! computes norm of array
-//! supports NORM_INF, NORM_L1, NORM_L2
-//! supports all matrices except 64F
+//! @} cudaarithm_core
+
+//! @addtogroup cudaarithm_reduce
+//! @{
+
+/** @brief Returns the norm of a matrix (or difference of two matrices).
+
+@param src1 Source matrix. Any matrices except 64F are supported.
+@param normType Norm type. NORM\_L1 , NORM\_L2 , and NORM\_INF are supported for now.
+@param mask optional operation mask; it must have the same size as src1 and CV\_8UC1 type.
+@param buf Optional buffer to avoid extra memory allocations. It is resized automatically.
+
+@sa norm
+ */
 CV_EXPORTS double norm(InputArray src1, int normType, InputArray mask, GpuMat& buf);
+/** @overload
+uses new buffer, no mask
+*/
 static inline double norm(InputArray src, int normType)
 {
     GpuMat buf;
     return norm(src, normType, GpuMat(), buf);
 }
+/** @overload
+no mask
+*/
 static inline double norm(InputArray src, int normType, GpuMat& buf)
 {
     return norm(src, normType, GpuMat(), buf);
 }
 
-//! computes norm of the difference between two arrays
-//! supports NORM_INF, NORM_L1, NORM_L2
-//! supports only CV_8UC1 type
+/** @brief Returns the norm of the difference of two matrices.
+
+@param src1 Source matrix. Any matrices except 64F are supported.
+@param src2 Second source matrix (if any) with the same size and type as src1.
+@param normType Norm type. NORM\_L1 , NORM\_L2 , and NORM\_INF are supported for now.
+@param buf Optional buffer to avoid extra memory allocations. It is resized automatically.
+
+@sa norm
+ */
 CV_EXPORTS double norm(InputArray src1, InputArray src2, GpuMat& buf, int normType=NORM_L2);
+/** @overload
+uses new buffer
+*/
 static inline double norm(InputArray src1, InputArray src2, int normType=NORM_L2)
 {
     GpuMat buf;
     return norm(src1, src2, buf, normType);
 }
 
-//! computes sum of array elements
-//! supports only single channel images
+/** @brief Returns the sum of matrix elements.
+
+@param src Source image of any depth except for CV\_64F .
+@param mask optional operation mask; it must have the same size as src and CV\_8UC1 type.
+@param buf Optional buffer to avoid extra memory allocations. It is resized automatically.
+
+@sa sum
+ */
 CV_EXPORTS Scalar sum(InputArray src, InputArray mask, GpuMat& buf);
+/** @overload
+uses new buffer, no mask
+*/
 static inline Scalar sum(InputArray src)
 {
     GpuMat buf;
     return sum(src, GpuMat(), buf);
 }
+/** @overload
+no mask
+*/
 static inline Scalar sum(InputArray src, GpuMat& buf)
 {
     return sum(src, GpuMat(), buf);
 }
 
-//! computes sum of array elements absolute values
-//! supports only single channel images
+/** @brief Returns the sum of absolute values for matrix elements.
+
+@param src Source image of any depth except for CV\_64F .
+@param mask optional operation mask; it must have the same size as src and CV\_8UC1 type.
+@param buf Optional buffer to avoid extra memory allocations. It is resized automatically.
+ */
 CV_EXPORTS Scalar absSum(InputArray src, InputArray mask, GpuMat& buf);
+/** @overload
+uses new buffer, no mask
+*/
 static inline Scalar absSum(InputArray src)
 {
     GpuMat buf;
     return absSum(src, GpuMat(), buf);
 }
+/** @overload
+no mask
+*/
 static inline Scalar absSum(InputArray src, GpuMat& buf)
 {
     return absSum(src, GpuMat(), buf);
 }
 
-//! computes squared sum of array elements
-//! supports only single channel images
+/** @brief Returns the squared sum of matrix elements.
+
+@param src Source image of any depth except for CV\_64F .
+@param mask optional operation mask; it must have the same size as src and CV\_8UC1 type.
+@param buf Optional buffer to avoid extra memory allocations. It is resized automatically.
+ */
 CV_EXPORTS Scalar sqrSum(InputArray src, InputArray mask, GpuMat& buf);
+/** @overload
+uses new buffer, no mask
+*/
 static inline Scalar sqrSum(InputArray src)
 {
     GpuMat buf;
     return sqrSum(src, GpuMat(), buf);
 }
+/** @overload
+no mask
+*/
 static inline Scalar sqrSum(InputArray src, GpuMat& buf)
 {
     return sqrSum(src, GpuMat(), buf);
 }
 
-//! finds global minimum and maximum array elements and returns their values
+/** @brief Finds global minimum and maximum matrix elements and returns their values.
+
+@param src Single-channel source image.
+@param minVal Pointer to the returned minimum value. Use NULL if not required.
+@param maxVal Pointer to the returned maximum value. Use NULL if not required.
+@param mask Optional mask to select a sub-matrix.
+@param buf Optional buffer to avoid extra memory allocations. It is resized automatically.
+
+The function does not work with CV\_64F images on GPUs with the compute capability \< 1.3.
+
+@sa minMaxLoc
+ */
 CV_EXPORTS void minMax(InputArray src, double* minVal, double* maxVal, InputArray mask, GpuMat& buf);
+/** @overload
+uses new buffer
+*/
 static inline void minMax(InputArray src, double* minVal, double* maxVal=0, InputArray mask=noArray())
 {
     GpuMat buf;
     minMax(src, minVal, maxVal, mask, buf);
 }
 
-//! finds global minimum and maximum array elements and returns their values with locations
+/** @brief Finds global minimum and maximum matrix elements and returns their values with locations.
+
+@param src Single-channel source image.
+@param minVal Pointer to the returned minimum value. Use NULL if not required.
+@param maxVal Pointer to the returned maximum value. Use NULL if not required.
+@param minLoc Pointer to the returned minimum location. Use NULL if not required.
+@param maxLoc Pointer to the returned maximum location. Use NULL if not required.
+@param mask Optional mask to select a sub-matrix.
+@param valbuf Optional values buffer to avoid extra memory allocations. It is resized
+automatically.
+@param locbuf Optional locations buffer to avoid extra memory allocations. It is resized automatically.
+
+The function does not work with CV\_64F images on GPUs with the compute capability \< 1.3.
+
+@sa minMaxLoc
+ */
 CV_EXPORTS void minMaxLoc(InputArray src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc,
                           InputArray mask, GpuMat& valbuf, GpuMat& locbuf);
+/** @overload
+uses new buffer
+*/
 static inline void minMaxLoc(InputArray src, double* minVal, double* maxVal=0, Point* minLoc=0, Point* maxLoc=0,
                              InputArray mask=noArray())
 {
@@ -274,34 +691,104 @@ static inline void minMaxLoc(InputArray src, double* minVal, double* maxVal=0, P
     minMaxLoc(src, minVal, maxVal, minLoc, maxLoc, mask, valBuf, locBuf);
 }
 
-//! counts non-zero array elements
+/** @brief Counts non-zero matrix elements.
+
+@param src Single-channel source image.
+@param buf Optional buffer to avoid extra memory allocations. It is resized automatically.
+
+The function does not work with CV\_64F images on GPUs with the compute capability \< 1.3.
+
+@sa countNonZero
+ */
 CV_EXPORTS int countNonZero(InputArray src, GpuMat& buf);
+/** @overload
+uses new buffer
+*/
 static inline int countNonZero(const GpuMat& src)
 {
     GpuMat buf;
     return countNonZero(src, buf);
 }
 
-//! reduces a matrix to a vector
+/** @brief Reduces a matrix to a vector.
+
+@param mtx Source 2D matrix.
+@param vec Destination vector. Its size and type is defined by dim and dtype parameters.
+@param dim Dimension index along which the matrix is reduced. 0 means that the matrix is reduced
+to a single row. 1 means that the matrix is reduced to a single column.
+@param reduceOp Reduction operation that could be one of the following:
+-   **CV\_REDUCE\_SUM** The output is the sum of all rows/columns of the matrix.
+-   **CV\_REDUCE\_AVG** The output is the mean vector of all rows/columns of the matrix.
+-   **CV\_REDUCE\_MAX** The output is the maximum (column/row-wise) of all rows/columns of the
+matrix.
+-   **CV\_REDUCE\_MIN** The output is the minimum (column/row-wise) of all rows/columns of the
+matrix.
+@param dtype When it is negative, the destination vector will have the same type as the source
+matrix. Otherwise, its type will be CV\_MAKE\_TYPE(CV\_MAT\_DEPTH(dtype), mtx.channels()) .
+@param stream Stream for the asynchronous version.
+
+The function reduce reduces the matrix to a vector by treating the matrix rows/columns as a set of
+1D vectors and performing the specified operation on the vectors until a single row/column is
+obtained. For example, the function can be used to compute horizontal and vertical projections of a
+raster image. In case of CV\_REDUCE\_SUM and CV\_REDUCE\_AVG , the output may have a larger element
+bit-depth to preserve accuracy. And multi-channel arrays are also supported in these two reduction
+modes.
+
+@sa reduce
+ */
 CV_EXPORTS void reduce(InputArray mtx, OutputArray vec, int dim, int reduceOp, int dtype = -1, Stream& stream = Stream::Null());
 
-//! computes mean value and standard deviation of all or selected array elements
-//! supports only CV_8UC1 type
+/** @brief Computes a mean value and a standard deviation of matrix elements.
+
+@param mtx Source matrix. CV\_8UC1 matrices are supported for now.
+@param mean Mean value.
+@param stddev Standard deviation value.
+@param buf Optional buffer to avoid extra memory allocations. It is resized automatically.
+
+@sa meanStdDev
+ */
 CV_EXPORTS void meanStdDev(InputArray mtx, Scalar& mean, Scalar& stddev, GpuMat& buf);
+/** @overload
+uses new buffer
+*/
 static inline void meanStdDev(InputArray src, Scalar& mean, Scalar& stddev)
 {
     GpuMat buf;
     meanStdDev(src, mean, stddev, buf);
 }
 
-//! computes the standard deviation of integral images
-//! supports only CV_32SC1 source type and CV_32FC1 sqr type
-//! output will have CV_32FC1 type
+/** @brief Computes a standard deviation of integral images.
+
+@param src Source image. Only the CV\_32SC1 type is supported.
+@param sqr Squared source image. Only the CV\_32FC1 type is supported.
+@param dst Destination image of the CV\_32FC1 type with the same size as src .
+@param rect Rectangular window.
+@param stream Stream for the asynchronous version.
+ */
 CV_EXPORTS void rectStdDev(InputArray src, InputArray sqr, OutputArray dst, Rect rect, Stream& stream = Stream::Null());
 
-//! scales and shifts array elements so that either the specified norm (alpha) or the minimum (alpha) and maximum (beta) array values get the specified values
+/** @brief Normalizes the norm or value range of an array.
+
+@param src Input array.
+@param dst Output array of the same size as src .
+@param alpha Norm value to normalize to or the lower range boundary in case of the range
+normalization.
+@param beta Upper range boundary in case of the range normalization; it is not used for the norm
+normalization.
+@param norm_type Normalization type ( NORM\_MINMAX , NORM\_L2 , NORM\_L1 or NORM\_INF ).
+@param dtype When negative, the output array has the same type as src; otherwise, it has the same
+number of channels as src and the depth =CV\_MAT\_DEPTH(dtype).
+@param mask Optional operation mask.
+@param norm_buf Optional buffer to avoid extra memory allocations. It is resized automatically.
+@param cvt_buf Optional buffer to avoid extra memory allocations. It is resized automatically.
+
+@sa normalize
+ */
 CV_EXPORTS void normalize(InputArray src, OutputArray dst, double alpha, double beta,
                           int norm_type, int dtype, InputArray mask, GpuMat& norm_buf, GpuMat& cvt_buf);
+/** @overload
+uses new buffers
+*/
 static inline void normalize(InputArray src, OutputArray dst, double alpha = 1, double beta = 0,
                              int norm_type = NORM_L2, int dtype = -1, InputArray mask = noArray())
 {
@@ -310,65 +797,179 @@ static inline void normalize(InputArray src, OutputArray dst, double alpha = 1,
     normalize(src, dst, alpha, beta, norm_type, dtype, mask, norm_buf, cvt_buf);
 }
 
-//! computes the integral image
-//! sum will have CV_32S type, but will contain unsigned int values
-//! supports only CV_8UC1 source type
+/** @brief Computes an integral image.
+
+@param src Source image. Only CV\_8UC1 images are supported for now.
+@param sum Integral image containing 32-bit unsigned integer values packed into CV\_32SC1 .
+@param buffer Optional buffer to avoid extra memory allocations. It is resized automatically.
+@param stream Stream for the asynchronous version.
+
+@sa integral
+ */
 CV_EXPORTS void integral(InputArray src, OutputArray sum, GpuMat& buffer, Stream& stream = Stream::Null());
 static inline void integralBuffered(InputArray src, OutputArray sum, GpuMat& buffer, Stream& stream = Stream::Null())
 {
     integral(src, sum, buffer, stream);
 }
+/** @overload
+uses new buffer
+*/
 static inline void integral(InputArray src, OutputArray sum, Stream& stream = Stream::Null())
 {
     GpuMat buffer;
     integral(src, sum, buffer, stream);
 }
 
-//! computes squared integral image
-//! result matrix will have 64F type, but will contain 64U values
-//! supports source images of 8UC1 type only
+/** @brief Computes a squared integral image.
+
+@param src Source image. Only CV\_8UC1 images are supported for now.
+@param sqsum Squared integral image containing 64-bit unsigned integer values packed into
+CV\_64FC1 .
+@param buf Optional buffer to avoid extra memory allocations. It is resized automatically.
+@param stream Stream for the asynchronous version.
+ */
 CV_EXPORTS void sqrIntegral(InputArray src, OutputArray sqsum, GpuMat& buf, Stream& stream = Stream::Null());
+/** @overload
+uses new buffer
+*/
 static inline void sqrIntegral(InputArray src, OutputArray sqsum, Stream& stream = Stream::Null())
 {
     GpuMat buffer;
     sqrIntegral(src, sqsum, buffer, stream);
 }
 
+//! @} cudaarithm_reduce
+
+//! @addtogroup cudaarithm_arithm
+//! @{
+
+/** @brief Performs generalized matrix multiplication.
+
+@param src1 First multiplied input matrix that should have CV\_32FC1 , CV\_64FC1 , CV\_32FC2 , or
+CV\_64FC2 type.
+@param src2 Second multiplied input matrix of the same type as src1 .
+@param alpha Weight of the matrix product.
+@param src3 Third optional delta matrix added to the matrix product. It should have the same type
+as src1 and src2 .
+@param beta Weight of src3 .
+@param dst Destination matrix. It has the proper size and the same type as input matrices.
+@param flags Operation flags:
+-   **GEMM\_1\_T** transpose src1
+-   **GEMM\_2\_T** transpose src2
+-   **GEMM\_3\_T** transpose src3
+@param stream Stream for the asynchronous version.
+
+The function performs generalized matrix multiplication similar to the gemm functions in BLAS level
+3. For example, gemm(src1, src2, alpha, src3, beta, dst, GEMM\_1\_T + GEMM\_3\_T) corresponds to
+
+\f[\texttt{dst} =  \texttt{alpha} \cdot \texttt{src1} ^T  \cdot \texttt{src2} +  \texttt{beta} \cdot \texttt{src3} ^T\f]
+
+@note Transposition operation doesn't support CV\_64FC2 input type.
+
+@sa gemm
+ */
 CV_EXPORTS void gemm(InputArray src1, InputArray src2, double alpha,
                      InputArray src3, double beta, OutputArray dst, int flags = 0, Stream& stream = Stream::Null());
 
-//! performs per-element multiplication of two full (not packed) Fourier spectrums
-//! supports 32FC2 matrices only (interleaved format)
+/** @brief Performs a per-element multiplication of two Fourier spectrums.
+
+@param src1 First spectrum.
+@param src2 Second spectrum with the same size and type as src1 .
+@param dst Destination spectrum.
+@param flags Mock parameter used for CPU/CUDA interfaces similarity.
+@param conjB Optional flag to specify if the second spectrum needs to be conjugated before the
+multiplication.
+@param stream Stream for the asynchronous version.
+
+Only full (not packed) CV\_32FC2 complex spectrums in the interleaved format are supported for now.
+
+@sa mulSpectrums
+ */
 CV_EXPORTS void mulSpectrums(InputArray src1, InputArray src2, OutputArray dst, int flags, bool conjB=false, Stream& stream = Stream::Null());
 
-//! performs per-element multiplication of two full (not packed) Fourier spectrums
-//! supports 32FC2 matrices only (interleaved format)
+/** @brief Performs a per-element multiplication of two Fourier spectrums and scales the result.
+
+@param src1 First spectrum.
+@param src2 Second spectrum with the same size and type as src1 .
+@param dst Destination spectrum.
+@param flags Mock parameter used for CPU/CUDA interfaces similarity.
+@param scale Scale constant.
+@param conjB Optional flag to specify if the second spectrum needs to be conjugated before the
+multiplication.
+@param stream Stream for the asynchronous version.
+
+Only full (not packed) CV\_32FC2 complex spectrums in the interleaved format are supported for now.
+
+@sa mulSpectrums
+ */
 CV_EXPORTS void mulAndScaleSpectrums(InputArray src1, InputArray src2, OutputArray dst, int flags, float scale, bool conjB=false, Stream& stream = Stream::Null());
 
-//! Performs a forward or inverse discrete Fourier transform (1D or 2D) of floating point matrix.
-//! Param dft_size is the size of DFT transform.
-//!
-//! If the source matrix is not continous, then additional copy will be done,
-//! so to avoid copying ensure the source matrix is continous one. If you want to use
-//! preallocated output ensure it is continuous too, otherwise it will be reallocated.
-//!
-//! Being implemented via CUFFT real-to-complex transform result contains only non-redundant values
-//! in CUFFT's format. Result as full complex matrix for such kind of transform cannot be retrieved.
-//!
-//! For complex-to-real transform it is assumed that the source matrix is packed in CUFFT's format.
+/** @brief Performs a forward or inverse discrete Fourier transform (1D or 2D) of the floating point matrix.
+
+@param src Source matrix (real or complex).
+@param dst Destination matrix (real or complex).
+@param dft_size Size of a discrete Fourier transform.
+@param flags Optional flags:
+-   **DFT\_ROWS** transforms each individual row of the source matrix.
+-   **DFT\_SCALE** scales the result: divide it by the number of elements in the transform
+(obtained from dft\_size ).
+-   **DFT\_INVERSE** inverts DFT. Use for complex-complex cases (real-complex and complex-real
+cases are always forward and inverse, respectively).
+-   **DFT\_REAL\_OUTPUT** specifies the output as real. The source matrix is the result of
+real-complex transform, so the destination matrix must be real.
+@param stream Stream for the asynchronous version.
+
+Use to handle real matrices ( CV\_32FC1 ) and complex matrices in the interleaved format ( CV\_32FC2 ).
+
+The source matrix should be continuous, otherwise reallocation and data copying is performed. The
+function chooses an operation mode depending on the flags, size, and channel count of the source
+matrix:
+
+-   If the source matrix is complex and the output is not specified as real, the destination
+matrix is complex and has the dft\_size size and CV\_32FC2 type. The destination matrix
+contains a full result of the DFT (forward or inverse).
+-   If the source matrix is complex and the output is specified as real, the function assumes that
+its input is the result of the forward transform (see the next item). The destination matrix
+has the dft\_size size and CV\_32FC1 type. It contains the result of the inverse DFT.
+-   If the source matrix is real (its type is CV\_32FC1 ), forward DFT is performed. The result of
+the DFT is packed into complex ( CV\_32FC2 ) matrix. So, the width of the destination matrix
+is dft\_size.width / 2 + 1 . But if the source is a single column, the height is reduced
+instead of the width.
+
+@sa dft
+ */
 CV_EXPORTS void dft(InputArray src, OutputArray dst, Size dft_size, int flags=0, Stream& stream = Stream::Null());
 
-//! computes convolution (or cross-correlation) of two images using discrete Fourier transform
-//! supports source images of 32FC1 type only
-//! result matrix will have 32FC1 type
+/** @brief Base class for convolution (or cross-correlation) operator.
+ */
 class CV_EXPORTS Convolution : public Algorithm
 {
 public:
+    /** @brief Computes a convolution (or cross-correlation) of two images.
+
+    @param image Source image. Only CV\_32FC1 images are supported for now.
+    @param templ Template image. The size is not greater than the image size. The type is the same as
+    image .
+    @param result Result image. If image is *W x H* and templ is *w x h*, then result must be *W-w+1 x
+    H-h+1*.
+    @param ccorr Flags to evaluate cross-correlation instead of convolution.
+    @param stream Stream for the asynchronous version.
+     */
     virtual void convolve(InputArray image, InputArray templ, OutputArray result, bool ccorr = false, Stream& stream = Stream::Null()) = 0;
 };
 
+/** @brief Creates implementation for cuda::Convolution .
+
+@param user_block_size Block size. If you leave default value Size(0,0) then automatic
+estimation of block size will be used (which is optimized for speed). By varying user\_block\_size
+you can reduce memory requirements at the cost of speed.
+ */
 CV_EXPORTS Ptr<Convolution> createConvolution(Size user_block_size = Size());
 
+//! @} cudaarithm_arithm
+
+//! @} cudaarithm
+
 }} // namespace cv { namespace cuda {
 
 #endif /* __OPENCV_CUDAARITHM_HPP__ */
diff --git a/modules/cudabgsegm/include/opencv2/cudabgsegm.hpp b/modules/cudabgsegm/include/opencv2/cudabgsegm.hpp
index a08ed64b1..b2970f2a1 100644
--- a/modules/cudabgsegm/include/opencv2/cudabgsegm.hpp
+++ b/modules/cudabgsegm/include/opencv2/cudabgsegm.hpp
@@ -50,11 +50,33 @@
 #include "opencv2/core/cuda.hpp"
 #include "opencv2/video/background_segm.hpp"
 
+/**
+  @addtogroup cuda
+  @{
+    @defgroup cudabgsegm Background Segmentation
+  @}
+ */
+
 namespace cv { namespace cuda {
 
+//! @addtogroup cudabgsegm
+//! @{
+
 ////////////////////////////////////////////////////
 // MOG
 
+/** @brief Gaussian Mixture-based Background/Foreground Segmentation Algorithm.
+
+The class discriminates between foreground and background pixels by building and maintaining a model
+of the background. Any pixel which does not fit this model is then deemed to be foreground. The
+class implements algorithm described in @cite MOG2001.
+
+@sa BackgroundSubtractorMOG
+
+@note
+   -   An example on Gaussian mixture based background/foreground segmentation can be found at
+        opencv\_source\_code/samples/gpu/bgfg\_segm.cpp
+ */
 class CV_EXPORTS BackgroundSubtractorMOG : public cv::BackgroundSubtractor
 {
 public:
@@ -78,6 +100,14 @@ public:
     virtual void setNoiseSigma(double noiseSigma) = 0;
 };
 
+/** @brief Creates mixture-of-gaussian background subtractor
+
+@param history Length of the history.
+@param nmixtures Number of Gaussian mixtures.
+@param backgroundRatio Background ratio.
+@param noiseSigma Noise strength (standard deviation of the brightness or each color channel). 0
+means some automatic value.
+ */
 CV_EXPORTS Ptr<cuda::BackgroundSubtractorMOG>
     createBackgroundSubtractorMOG(int history = 200, int nmixtures = 5,
                                   double backgroundRatio = 0.7, double noiseSigma = 0);
@@ -85,6 +115,14 @@ CV_EXPORTS Ptr<cuda::BackgroundSubtractorMOG>
 ////////////////////////////////////////////////////
 // MOG2
 
+/** @brief Gaussian Mixture-based Background/Foreground Segmentation Algorithm.
+
+The class discriminates between foreground and background pixels by building and maintaining a model
+of the background. Any pixel which does not fit this model is then deemed to be foreground. The
+class implements algorithm described in @cite MOG2004.
+
+@sa BackgroundSubtractorMOG2
+ */
 class CV_EXPORTS BackgroundSubtractorMOG2 : public cv::BackgroundSubtractorMOG2
 {
 public:
@@ -96,6 +134,15 @@ public:
     virtual void getBackgroundImage(OutputArray backgroundImage, Stream& stream) const = 0;
 };
 
+/** @brief Creates MOG2 Background Subtractor
+
+@param history Length of the history.
+@param varThreshold Threshold on the squared Mahalanobis distance between the pixel and the model
+to decide whether a pixel is well described by the background model. This parameter does not
+affect the background update.
+@param detectShadows If true, the algorithm will detect shadows and mark them. It decreases the
+speed a bit, so if you do not need this feature, set the parameter to false.
+ */
 CV_EXPORTS Ptr<cuda::BackgroundSubtractorMOG2>
     createBackgroundSubtractorMOG2(int history = 500, double varThreshold = 16,
                                    bool detectShadows = true);
@@ -103,6 +150,12 @@ CV_EXPORTS Ptr<cuda::BackgroundSubtractorMOG2>
 ////////////////////////////////////////////////////
 // GMG
 
+/** @brief Background/Foreground Segmentation Algorithm.
+
+The class discriminates between foreground and background pixels by building and maintaining a model
+of the background. Any pixel which does not fit this model is then deemed to be foreground. The
+class implements algorithm described in @cite GMG2012.
+ */
 class CV_EXPORTS BackgroundSubtractorGMG : public cv::BackgroundSubtractor
 {
 public:
@@ -140,54 +193,71 @@ public:
     virtual void setMaxVal(double val) = 0;
 };
 
+/** @brief Creates GMG Background Subtractor
+
+@param initializationFrames Number of frames of video to use to initialize histograms.
+@param decisionThreshold Value above which pixel is determined to be FG.
+ */
 CV_EXPORTS Ptr<cuda::BackgroundSubtractorGMG>
     createBackgroundSubtractorGMG(int initializationFrames = 120, double decisionThreshold = 0.8);
 
 ////////////////////////////////////////////////////
 // FGD
 
-/**
- * Foreground Object Detection from Videos Containing Complex Background.
- * Liyuan Li, Weimin Huang, Irene Y.H. Gu, and Qi Tian.
- * ACM MM2003 9p
+/** @brief The class discriminates between foreground and background pixels by building and maintaining a model
+of the background.
+
+Any pixel which does not fit this model is then deemed to be foreground. The class implements
+algorithm described in @cite FGD2003.
+@sa BackgroundSubtractor
  */
 class CV_EXPORTS BackgroundSubtractorFGD : public cv::BackgroundSubtractor
 {
 public:
+    /** @brief Returns the output foreground regions calculated by findContours.
+
+    @param foreground_regions Output array (CPU memory).
+     */
     virtual void getForegroundRegions(OutputArrayOfArrays foreground_regions) = 0;
 };
 
 struct CV_EXPORTS FGDParams
 {
-    int Lc;  // Quantized levels per 'color' component. Power of two, typically 32, 64 or 128.
-    int N1c; // Number of color vectors used to model normal background color variation at a given pixel.
-    int N2c; // Number of color vectors retained at given pixel.  Must be > N1c, typically ~ 5/3 of N1c.
-    // Used to allow the first N1c vectors to adapt over time to changing background.
+    int Lc;  //!< Quantized levels per 'color' component. Power of two, typically 32, 64 or 128.
+    int N1c; //!< Number of color vectors used to model normal background color variation at a given pixel.
+    int N2c; //!< Number of color vectors retained at given pixel.  Must be > N1c, typically ~ 5/3 of N1c.
+    //!< Used to allow the first N1c vectors to adapt over time to changing background.
 
-    int Lcc;  // Quantized levels per 'color co-occurrence' component.  Power of two, typically 16, 32 or 64.
-    int N1cc; // Number of color co-occurrence vectors used to model normal background color variation at a given pixel.
-    int N2cc; // Number of color co-occurrence vectors retained at given pixel.  Must be > N1cc, typically ~ 5/3 of N1cc.
-    // Used to allow the first N1cc vectors to adapt over time to changing background.
+    int Lcc;  //!< Quantized levels per 'color co-occurrence' component.  Power of two, typically 16, 32 or 64.
+    int N1cc; //!< Number of color co-occurrence vectors used to model normal background color variation at a given pixel.
+    int N2cc; //!< Number of color co-occurrence vectors retained at given pixel.  Must be > N1cc, typically ~ 5/3 of N1cc.
+    //!< Used to allow the first N1cc vectors to adapt over time to changing background.
 
-    bool is_obj_without_holes; // If TRUE we ignore holes within foreground blobs. Defaults to TRUE.
-    int perform_morphing;     // Number of erode-dilate-erode foreground-blob cleanup iterations.
-    // These erase one-pixel junk blobs and merge almost-touching blobs. Default value is 1.
+    bool is_obj_without_holes; //!< If TRUE we ignore holes within foreground blobs. Defaults to TRUE.
+    int perform_morphing;     //!< Number of erode-dilate-erode foreground-blob cleanup iterations.
+    //!< These erase one-pixel junk blobs and merge almost-touching blobs. Default value is 1.
 
-    float alpha1; // How quickly we forget old background pixel values seen. Typically set to 0.1.
-    float alpha2; // "Controls speed of feature learning". Depends on T. Typical value circa 0.005.
-    float alpha3; // Alternate to alpha2, used (e.g.) for quicker initial convergence. Typical value 0.1.
+    float alpha1; //!< How quickly we forget old background pixel values seen. Typically set to 0.1.
+    float alpha2; //!< "Controls speed of feature learning". Depends on T. Typical value circa 0.005.
+    float alpha3; //!< Alternate to alpha2, used (e.g.) for quicker initial convergence. Typical value 0.1.
 
-    float delta;   // Affects color and color co-occurrence quantization, typically set to 2.
-    float T;       // A percentage value which determines when new features can be recognized as new background. (Typically 0.9).
-    float minArea; // Discard foreground blobs whose bounding box is smaller than this threshold.
+    float delta;   //!< Affects color and color co-occurrence quantization, typically set to 2.
+    float T;       //!< A percentage value which determines when new features can be recognized as new background. (Typically 0.9).
+    float minArea; //!< Discard foreground blobs whose bounding box is smaller than this threshold.
 
-    // default Params
+    //! default Params
     FGDParams();
 };
 
+/** @brief Creates FGD Background Subtractor
+
+@param params Algorithm's parameters. See @cite FGD2003 for explanation.
+ */
 CV_EXPORTS Ptr<cuda::BackgroundSubtractorFGD>
     createBackgroundSubtractorFGD(const FGDParams& params = FGDParams());
 
+//! @}
+
 }} // namespace cv { namespace cuda {
 
 #endif /* __OPENCV_CUDABGSEGM_HPP__ */
diff --git a/modules/cudacodec/include/opencv2/cudacodec.hpp b/modules/cudacodec/include/opencv2/cudacodec.hpp
index 747c044ee..850a181d1 100644
--- a/modules/cudacodec/include/opencv2/cudacodec.hpp
+++ b/modules/cudacodec/include/opencv2/cudacodec.hpp
@@ -50,8 +50,18 @@
 
 #include "opencv2/core/cuda.hpp"
 
+/**
+  @addtogroup cuda
+  @{
+    @defgroup cudacodec Video Encoding/Decoding
+  @}
+ */
+
 namespace cv { namespace cudacodec {
 
+//! @addtogroup cudacodec
+//! @{
+
 ////////////////////////////////// Video Encoding //////////////////////////////////
 
 // Works only under Windows.
@@ -68,35 +78,53 @@ enum SurfaceFormat
     SF_GRAY = SF_BGR
 };
 
+/** @brief Different parameters for CUDA video encoder.
+ */
 struct CV_EXPORTS EncoderParams
 {
-    int P_Interval;      // NVVE_P_INTERVAL,
-    int IDR_Period;      // NVVE_IDR_PERIOD,
-    int DynamicGOP;      // NVVE_DYNAMIC_GOP,
-    int RCType;          // NVVE_RC_TYPE,
-    int AvgBitrate;      // NVVE_AVG_BITRATE,
-    int PeakBitrate;     // NVVE_PEAK_BITRATE,
-    int QP_Level_Intra;  // NVVE_QP_LEVEL_INTRA,
-    int QP_Level_InterP; // NVVE_QP_LEVEL_INTER_P,
-    int QP_Level_InterB; // NVVE_QP_LEVEL_INTER_B,
-    int DeblockMode;     // NVVE_DEBLOCK_MODE,
-    int ProfileLevel;    // NVVE_PROFILE_LEVEL,
-    int ForceIntra;      // NVVE_FORCE_INTRA,
-    int ForceIDR;        // NVVE_FORCE_IDR,
-    int ClearStat;       // NVVE_CLEAR_STAT,
-    int DIMode;          // NVVE_SET_DEINTERLACE,
-    int Presets;         // NVVE_PRESETS,
-    int DisableCabac;    // NVVE_DISABLE_CABAC,
-    int NaluFramingType; // NVVE_CONFIGURE_NALU_FRAMING_TYPE
-    int DisableSPSPPS;   // NVVE_DISABLE_SPS_PPS
+    int P_Interval;      //!< NVVE_P_INTERVAL,
+    int IDR_Period;      //!< NVVE_IDR_PERIOD,
+    int DynamicGOP;      //!< NVVE_DYNAMIC_GOP,
+    int RCType;          //!< NVVE_RC_TYPE,
+    int AvgBitrate;      //!< NVVE_AVG_BITRATE,
+    int PeakBitrate;     //!< NVVE_PEAK_BITRATE,
+    int QP_Level_Intra;  //!< NVVE_QP_LEVEL_INTRA,
+    int QP_Level_InterP; //!< NVVE_QP_LEVEL_INTER_P,
+    int QP_Level_InterB; //!< NVVE_QP_LEVEL_INTER_B,
+    int DeblockMode;     //!< NVVE_DEBLOCK_MODE,
+    int ProfileLevel;    //!< NVVE_PROFILE_LEVEL,
+    int ForceIntra;      //!< NVVE_FORCE_INTRA,
+    int ForceIDR;        //!< NVVE_FORCE_IDR,
+    int ClearStat;       //!< NVVE_CLEAR_STAT,
+    int DIMode;          //!< NVVE_SET_DEINTERLACE,
+    int Presets;         //!< NVVE_PRESETS,
+    int DisableCabac;    //!< NVVE_DISABLE_CABAC,
+    int NaluFramingType; //!< NVVE_CONFIGURE_NALU_FRAMING_TYPE
+    int DisableSPSPPS;   //!< NVVE_DISABLE_SPS_PPS
 
     EncoderParams();
+    /** @brief Constructors.
+
+    @param configFile Config file name.
+
+    Creates default parameters or reads parameters from config file.
+     */
     explicit EncoderParams(const String& configFile);
 
+    /** @brief Reads parameters from config file.
+
+    @param configFile Config file name.
+     */
     void load(const String& configFile);
+    /** @brief Saves parameters to config file.
+
+    @param configFile Config file name.
+     */
     void save(const String& configFile) const;
 };
 
+/** @brief Callbacks for CUDA video encoder.
+ */
 class CV_EXPORTS EncoderCallBack
 {
 public:
@@ -109,41 +137,109 @@ public:
 
     virtual ~EncoderCallBack() {}
 
-    //! callback function to signal the start of bitstream that is to be encoded
-    //! callback must allocate host buffer for CUDA encoder and return pointer to it and it's size
+    /** @brief Callback function to signal the start of bitstream that is to be encoded.
+
+    Callback must allocate a buffer for the CUDA encoder and return a pointer to it and its size.
+     */
     virtual uchar* acquireBitStream(int* bufferSize) = 0;
 
-    //! callback function to signal that the encoded bitstream is ready to be written to file
+    /** @brief Callback function to signal that the encoded bitstream is ready to be written to file.
+    */
     virtual void releaseBitStream(unsigned char* data, int size) = 0;
 
-    //! callback function to signal that the encoding operation on the frame has started
+    /** @brief Callback function to signal that the encoding operation on the frame has started.
+
+    @param frameNumber
+    @param picType Specify frame type (I-Frame, P-Frame or B-Frame).
+     */
     virtual void onBeginFrame(int frameNumber, PicType picType) = 0;
 
-    //! callback function signals that the encoding operation on the frame has finished
+    /** @brief Callback function signals that the encoding operation on the frame has finished.
+
+    @param frameNumber
+    @param picType Specify frame type (I-Frame, P-Frame or B-Frame).
+     */
     virtual void onEndFrame(int frameNumber, PicType picType) = 0;
 };
 
+/** @brief Video writer interface.
+
+The implementation uses H264 video codec.
+
+@note Currently only Windows platform is supported.
+
+@note
+   -   An example on how to use the VideoWriter class can be found at
+        opencv\_source\_code/samples/gpu/video\_writer.cpp
+ */
 class CV_EXPORTS VideoWriter
 {
 public:
     virtual ~VideoWriter() {}
 
-    //! writes the next frame from GPU memory
+    /** @brief Writes the next video frame.
+
+    @param frame The written frame.
+    @param lastFrame Indicates that it is end of stream. The parameter can be ignored.
+
+    The method writes the specified image to the video file. The image must have the same size and the same
+    surface format as has been specified when opening the video writer.
+     */
     virtual void write(InputArray frame, bool lastFrame = false) = 0;
 
     virtual EncoderParams getEncoderParams() const = 0;
 };
 
-//! create VideoWriter for specified output file (only AVI file format is supported)
+/** @brief Creates video writer.
+
+@param fileName Name of the output video file. Only AVI file format is supported.
+@param frameSize Size of the input video frames.
+@param fps Framerate of the created video stream.
+@param format Surface format of input frames ( SF\_UYVY , SF\_YUY2 , SF\_YV12 , SF\_NV12 ,
+SF\_IYUV , SF\_BGR or SF\_GRAY). BGR or gray frames will be converted to YV12 format before
+encoding, frames with other formats will be used as is.
+
+The function initializes the video writer. FFMPEG is used to write videos. Users can implement their own
+multiplexing with cudacodec::EncoderCallBack .
+ */
 CV_EXPORTS Ptr<VideoWriter> createVideoWriter(const String& fileName, Size frameSize, double fps, SurfaceFormat format = SF_BGR);
+/** @overload
+@param fileName Name of the output video file. Only AVI file format is supported.
+@param frameSize Size of the input video frames.
+@param fps Framerate of the created video stream.
+@param params Encoder parameters. See cudacodec::EncoderParams .
+@param format Surface format of input frames ( SF\_UYVY , SF\_YUY2 , SF\_YV12 , SF\_NV12 ,
+SF\_IYUV , SF\_BGR or SF\_GRAY). BGR or gray frames will be converted to YV12 format before
+encoding, frames with other formats will be used as is.
+*/
 CV_EXPORTS Ptr<VideoWriter> createVideoWriter(const String& fileName, Size frameSize, double fps, const EncoderParams& params, SurfaceFormat format = SF_BGR);
 
-//! create VideoWriter for user-defined callbacks
+/** @overload
+@param encoderCallback Callbacks for video encoder. See cudacodec::EncoderCallBack . Use it if you
+want to work with raw video stream.
+@param frameSize Size of the input video frames.
+@param fps Framerate of the created video stream.
+@param format Surface format of input frames ( SF\_UYVY , SF\_YUY2 , SF\_YV12 , SF\_NV12 ,
+SF\_IYUV , SF\_BGR or SF\_GRAY). BGR or gray frames will be converted to YV12 format before
+encoding, frames with other formats will be used as is.
+*/
 CV_EXPORTS Ptr<VideoWriter> createVideoWriter(const Ptr<EncoderCallBack>& encoderCallback, Size frameSize, double fps, SurfaceFormat format = SF_BGR);
+/** @overload
+@param encoderCallback Callbacks for video encoder. See cudacodec::EncoderCallBack . Use it if you
+want to work with raw video stream.
+@param frameSize Size of the input video frames.
+@param fps Framerate of the created video stream.
+@param params Encoder parameters. See cudacodec::EncoderParams .
+@param format Surface format of input frames ( SF\_UYVY , SF\_YUY2 , SF\_YV12 , SF\_NV12 ,
+SF\_IYUV , SF\_BGR or SF\_GRAY). BGR or gray frames will be converted to YV12 format before
+encoding, frames with other formats will be used as is.
+*/
 CV_EXPORTS Ptr<VideoWriter> createVideoWriter(const Ptr<EncoderCallBack>& encoderCallback, Size frameSize, double fps, const EncoderParams& params, SurfaceFormat format = SF_BGR);
 
 ////////////////////////////////// Video Decoding //////////////////////////////////////////
 
+/** @brief Video codecs supported by cudacodec::VideoReader .
+ */
 enum Codec
 {
     MPEG1 = 0,
@@ -155,13 +251,15 @@ enum Codec
     H264_SVC,
     H264_MVC,
 
-    Uncompressed_YUV420 = (('I'<<24)|('Y'<<16)|('U'<<8)|('V')),   // Y,U,V (4:2:0)
-    Uncompressed_YV12   = (('Y'<<24)|('V'<<16)|('1'<<8)|('2')),   // Y,V,U (4:2:0)
-    Uncompressed_NV12   = (('N'<<24)|('V'<<16)|('1'<<8)|('2')),   // Y,UV  (4:2:0)
-    Uncompressed_YUYV   = (('Y'<<24)|('U'<<16)|('Y'<<8)|('V')),   // YUYV/YUY2 (4:2:2)
-    Uncompressed_UYVY   = (('U'<<24)|('Y'<<16)|('V'<<8)|('Y'))    // UYVY (4:2:2)
+    Uncompressed_YUV420 = (('I'<<24)|('Y'<<16)|('U'<<8)|('V')),   //!< Y,U,V (4:2:0)
+    Uncompressed_YV12   = (('Y'<<24)|('V'<<16)|('1'<<8)|('2')),   //!< Y,V,U (4:2:0)
+    Uncompressed_NV12   = (('N'<<24)|('V'<<16)|('1'<<8)|('2')),   //!< Y,UV  (4:2:0)
+    Uncompressed_YUYV   = (('Y'<<24)|('U'<<16)|('Y'<<8)|('V')),   //!< YUYV/YUY2 (4:2:2)
+    Uncompressed_UYVY   = (('U'<<24)|('Y'<<16)|('V'<<8)|('Y'))    //!< UYVY (4:2:2)
 };
 
+/** @brief Chroma formats supported by cudacodec::VideoReader .
+ */
 enum ChromaFormat
 {
     Monochrome = 0,
@@ -170,6 +268,8 @@ enum ChromaFormat
     YUV444
 };
 
+/** @brief Struct providing information about video file format.
+ */
 struct FormatInfo
 {
     Codec codec;
@@ -178,29 +278,65 @@ struct FormatInfo
     int height;
 };
 
+/** @brief Video reader interface.
+
+@note
+   -   An example on how to use the VideoReader class can be found at
+        opencv\_source\_code/samples/gpu/video\_reader.cpp
+ */
 class CV_EXPORTS VideoReader
 {
 public:
     virtual ~VideoReader() {}
 
+    /** @brief Grabs, decodes and returns the next video frame.
+
+    If no frame has been grabbed (there are no more frames in the video file), the method returns false.
+    The method throws an Exception if an error occurs.
+     */
     virtual bool nextFrame(OutputArray frame) = 0;
 
+    /** @brief Returns information about video file format.
+    */
     virtual FormatInfo format() const = 0;
 };
 
+/** @brief Interface for video demultiplexing.
+
+User can implement own demultiplexing by implementing this interface.
+ */
 class CV_EXPORTS RawVideoSource
 {
 public:
     virtual ~RawVideoSource() {}
 
+    /** @brief Returns next packet with RAW video frame.
+
+    @param data Pointer to frame data.
+    @param size Size in bytes of current frame.
+    @param endOfFile Indicates that it is end of stream.
+     */
     virtual bool getNextPacket(unsigned char** data, int* size, bool* endOfFile) = 0;
 
+    /** @brief Returns information about video file format.
+    */
     virtual FormatInfo format() const = 0;
 };
 
+/** @brief Creates video reader.
+
+@param filename Name of the input video file.
+
+FFMPEG is used to read videos. User can implement own demultiplexing with cudacodec::RawVideoSource .
+ */
 CV_EXPORTS Ptr<VideoReader> createVideoReader(const String& filename);
+/** @overload
+@param source RAW video source implemented by user.
+*/
 CV_EXPORTS Ptr<VideoReader> createVideoReader(const Ptr<RawVideoSource>& source);
 
+//! @}
+
 }} // namespace cv { namespace cudacodec {
 
 #endif /* __OPENCV_CUDACODEC_HPP__ */
diff --git a/modules/cudafeatures2d/include/opencv2/cudafeatures2d.hpp b/modules/cudafeatures2d/include/opencv2/cudafeatures2d.hpp
index a89580e0e..5c7160709 100644
--- a/modules/cudafeatures2d/include/opencv2/cudafeatures2d.hpp
+++ b/modules/cudafeatures2d/include/opencv2/cudafeatures2d.hpp
@@ -50,150 +50,175 @@
 #include "opencv2/core/cuda.hpp"
 #include "opencv2/cudafilters.hpp"
 
+/**
+  @addtogroup cuda
+  @{
+    @defgroup cudafeatures2d Feature Detection and Description
+  @}
+ */
+
 namespace cv { namespace cuda {
 
+//! @addtogroup cudafeatures2d
+//! @{
+
+/** @brief Brute-force descriptor matcher.
+
+For each descriptor in the first set, this matcher finds the closest descriptor in the second set
+by trying each one. This descriptor matcher supports masking permissible matches between descriptor
+sets.
+
+The class BFMatcher\_CUDA has an interface similar to the class DescriptorMatcher. It has two groups
+of match methods: for matching descriptors of one image with another image or with an image set.
+Also, all functions have an alternative to save results either to the GPU memory or to the CPU
+memory.
+
+@sa DescriptorMatcher, BFMatcher
+ */
 class CV_EXPORTS BFMatcher_CUDA
 {
 public:
     explicit BFMatcher_CUDA(int norm = cv::NORM_L2);
 
-    // Add descriptors to train descriptor collection
+    //! Add descriptors to train descriptor collection
     void add(const std::vector<GpuMat>& descCollection);
 
-    // Get train descriptors collection
+    //! Get train descriptors collection
     const std::vector<GpuMat>& getTrainDescriptors() const;
 
-    // Clear train descriptors collection
+    //! Clear train descriptors collection
     void clear();
 
-    // Return true if there are not train descriptors in collection
+    //! Return true if there are no train descriptors in the collection
     bool empty() const;
 
-    // Return true if the matcher supports mask in match methods
+    //! Return true if the matcher supports mask in match methods
     bool isMaskSupported() const;
 
-    // Find one best match for each query descriptor
+    //! Find one best match for each query descriptor
     void matchSingle(const GpuMat& query, const GpuMat& train,
         GpuMat& trainIdx, GpuMat& distance,
         const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null());
 
-    // Download trainIdx and distance and convert it to CPU vector with DMatch
+    //! Download trainIdx and distance and convert it to CPU vector with DMatch
     static void matchDownload(const GpuMat& trainIdx, const GpuMat& distance, std::vector<DMatch>& matches);
-    // Convert trainIdx and distance to vector with DMatch
+    //! Convert trainIdx and distance to vector with DMatch
     static void matchConvert(const Mat& trainIdx, const Mat& distance, std::vector<DMatch>& matches);
 
-    // Find one best match for each query descriptor
+    //! Find one best match for each query descriptor
     void match(const GpuMat& query, const GpuMat& train, std::vector<DMatch>& matches, const GpuMat& mask = GpuMat());
 
-    // Make gpu collection of trains and masks in suitable format for matchCollection function
+    //! Make gpu collection of trains and masks in suitable format for matchCollection function
     void makeGpuCollection(GpuMat& trainCollection, GpuMat& maskCollection, const std::vector<GpuMat>& masks = std::vector<GpuMat>());
 
-    // Find one best match from train collection for each query descriptor
+    //! Find one best match from train collection for each query descriptor
     void matchCollection(const GpuMat& query, const GpuMat& trainCollection,
         GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance,
         const GpuMat& masks = GpuMat(), Stream& stream = Stream::Null());
 
-    // Download trainIdx, imgIdx and distance and convert it to vector with DMatch
+    //! Download trainIdx, imgIdx and distance and convert it to vector with DMatch
     static void matchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, std::vector<DMatch>& matches);
-    // Convert trainIdx, imgIdx and distance to vector with DMatch
+    //! Convert trainIdx, imgIdx and distance to vector with DMatch
     static void matchConvert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance, std::vector<DMatch>& matches);
 
-    // Find one best match from train collection for each query descriptor.
+    //! Find one best match from train collection for each query descriptor.
     void match(const GpuMat& query, std::vector<DMatch>& matches, const std::vector<GpuMat>& masks = std::vector<GpuMat>());
 
-    // Find k best matches for each query descriptor (in increasing order of distances)
+    //! Find k best matches for each query descriptor (in increasing order of distances)
     void knnMatchSingle(const GpuMat& query, const GpuMat& train,
         GpuMat& trainIdx, GpuMat& distance, GpuMat& allDist, int k,
         const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null());
 
-    // Download trainIdx and distance and convert it to vector with DMatch
-    // compactResult is used when mask is not empty. If compactResult is false matches
-    // vector will have the same size as queryDescriptors rows. If compactResult is true
-    // matches vector will not contain matches for fully masked out query descriptors.
+    //! Download trainIdx and distance and convert it to vector with DMatch
+    //! compactResult is used when mask is not empty. If compactResult is false matches
+    //! vector will have the same size as queryDescriptors rows. If compactResult is true
+    //! matches vector will not contain matches for fully masked out query descriptors.
     static void knnMatchDownload(const GpuMat& trainIdx, const GpuMat& distance,
         std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
-    // Convert trainIdx and distance to vector with DMatch
+    //! Convert trainIdx and distance to vector with DMatch
     static void knnMatchConvert(const Mat& trainIdx, const Mat& distance,
         std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
 
-    // Find k best matches for each query descriptor (in increasing order of distances).
-    // compactResult is used when mask is not empty. If compactResult is false matches
-    // vector will have the same size as queryDescriptors rows. If compactResult is true
-    // matches vector will not contain matches for fully masked out query descriptors.
+    //! Find k best matches for each query descriptor (in increasing order of distances).
+    //! compactResult is used when mask is not empty. If compactResult is false matches
+    //! vector will have the same size as queryDescriptors rows. If compactResult is true
+    //! matches vector will not contain matches for fully masked out query descriptors.
     void knnMatch(const GpuMat& query, const GpuMat& train,
         std::vector< std::vector<DMatch> >& matches, int k, const GpuMat& mask = GpuMat(),
         bool compactResult = false);
 
-    // Find k best matches from train collection for each query descriptor (in increasing order of distances)
+    //! Find k best matches from train collection for each query descriptor (in increasing order of distances)
     void knnMatch2Collection(const GpuMat& query, const GpuMat& trainCollection,
         GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance,
         const GpuMat& maskCollection = GpuMat(), Stream& stream = Stream::Null());
 
-    // Download trainIdx and distance and convert it to vector with DMatch
-    // compactResult is used when mask is not empty. If compactResult is false matches
-    // vector will have the same size as queryDescriptors rows. If compactResult is true
-    // matches vector will not contain matches for fully masked out query descriptors.
+    //! Download trainIdx and distance and convert it to vector with DMatch
+    //! compactResult is used when mask is not empty. If compactResult is false matches
+    //! vector will have the same size as queryDescriptors rows. If compactResult is true
+    //! matches vector will not contain matches for fully masked out query descriptors.
+    //! @see BFMatcher_CUDA::knnMatchDownload
     static void knnMatch2Download(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance,
         std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
-    // Convert trainIdx and distance to vector with DMatch
+    //! Convert trainIdx and distance to vector with DMatch
+    //! @see BFMatcher_CUDA::knnMatchConvert
     static void knnMatch2Convert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance,
         std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
 
-    // Find k best matches  for each query descriptor (in increasing order of distances).
-    // compactResult is used when mask is not empty. If compactResult is false matches
-    // vector will have the same size as queryDescriptors rows. If compactResult is true
-    // matches vector will not contain matches for fully masked out query descriptors.
+    //! Find k best matches  for each query descriptor (in increasing order of distances).
+    //! compactResult is used when mask is not empty. If compactResult is false matches
+    //! vector will have the same size as queryDescriptors rows. If compactResult is true
+    //! matches vector will not contain matches for fully masked out query descriptors.
     void knnMatch(const GpuMat& query, std::vector< std::vector<DMatch> >& matches, int k,
         const std::vector<GpuMat>& masks = std::vector<GpuMat>(), bool compactResult = false);
 
-    // Find best matches for each query descriptor which have distance less than maxDistance.
-    // nMatches.at<int>(0, queryIdx) will contain matches count for queryIdx.
-    // carefully nMatches can be greater than trainIdx.cols - it means that matcher didn't find all matches,
-    // because it didn't have enough memory.
-    // If trainIdx is empty, then trainIdx and distance will be created with size nQuery x max((nTrain / 100), 10),
-    // otherwize user can pass own allocated trainIdx and distance with size nQuery x nMaxMatches
-    // Matches doesn't sorted.
+    //! Find best matches for each query descriptor which have distance less than maxDistance.
+    //! nMatches.at<int>(0, queryIdx) will contain matches count for queryIdx.
+    //! Note that nMatches can be greater than trainIdx.cols - it means that matcher didn't find all matches,
+    //! because it didn't have enough memory.
+    //! If trainIdx is empty, then trainIdx and distance will be created with size nQuery x max((nTrain / 100), 10),
+    //! otherwise user can pass own allocated trainIdx and distance with size nQuery x nMaxMatches.
+    //! Matches are not sorted.
     void radiusMatchSingle(const GpuMat& query, const GpuMat& train,
         GpuMat& trainIdx, GpuMat& distance, GpuMat& nMatches, float maxDistance,
         const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null());
 
-    // Download trainIdx, nMatches and distance and convert it to vector with DMatch.
-    // matches will be sorted in increasing order of distances.
-    // compactResult is used when mask is not empty. If compactResult is false matches
-    // vector will have the same size as queryDescriptors rows. If compactResult is true
-    // matches vector will not contain matches for fully masked out query descriptors.
+    //! Download trainIdx, nMatches and distance and convert it to vector with DMatch.
+    //! matches will be sorted in increasing order of distances.
+    //! compactResult is used when mask is not empty. If compactResult is false matches
+    //! vector will have the same size as queryDescriptors rows. If compactResult is true
+    //! matches vector will not contain matches for fully masked out query descriptors.
     static void radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& distance, const GpuMat& nMatches,
         std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
-    // Convert trainIdx, nMatches and distance to vector with DMatch.
+    //! Convert trainIdx, nMatches and distance to vector with DMatch.
     static void radiusMatchConvert(const Mat& trainIdx, const Mat& distance, const Mat& nMatches,
         std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
 
-    // Find best matches for each query descriptor which have distance less than maxDistance
-    // in increasing order of distances).
+    //! Find best matches for each query descriptor which have distance less than maxDistance
+    //! (in increasing order of distances).
     void radiusMatch(const GpuMat& query, const GpuMat& train,
         std::vector< std::vector<DMatch> >& matches, float maxDistance,
         const GpuMat& mask = GpuMat(), bool compactResult = false);
 
-    // Find best matches for each query descriptor which have distance less than maxDistance.
-    // If trainIdx is empty, then trainIdx and distance will be created with size nQuery x max((nQuery / 100), 10),
-    // otherwize user can pass own allocated trainIdx and distance with size nQuery x nMaxMatches
-    // Matches doesn't sorted.
+    //! Find best matches for each query descriptor which have distance less than maxDistance.
+    //! If trainIdx is empty, then trainIdx and distance will be created with size nQuery x max((nQuery / 100), 10),
+    //! otherwise user can pass own allocated trainIdx and distance with size nQuery x nMaxMatches.
+    //! Matches are not sorted.
     void radiusMatchCollection(const GpuMat& query, GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, GpuMat& nMatches, float maxDistance,
         const std::vector<GpuMat>& masks = std::vector<GpuMat>(), Stream& stream = Stream::Null());
 
-    // Download trainIdx, imgIdx, nMatches and distance and convert it to vector with DMatch.
-    // matches will be sorted in increasing order of distances.
-    // compactResult is used when mask is not empty. If compactResult is false matches
-    // vector will have the same size as queryDescriptors rows. If compactResult is true
-    // matches vector will not contain matches for fully masked out query descriptors.
+    //! Download trainIdx, imgIdx, nMatches and distance and convert it to vector with DMatch.
+    //! matches will be sorted in increasing order of distances.
+    //! compactResult is used when mask is not empty. If compactResult is false matches
+    //! vector will have the same size as queryDescriptors rows. If compactResult is true
+    //! matches vector will not contain matches for fully masked out query descriptors.
     static void radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, const GpuMat& nMatches,
         std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
-    // Convert trainIdx, nMatches and distance to vector with DMatch.
+    //! Convert trainIdx, nMatches and distance to vector with DMatch.
     static void radiusMatchConvert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance, const Mat& nMatches,
         std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
 
-    // Find best matches from train collection for each query descriptor which have distance less than
-    // maxDistance (in increasing order of distances).
+    //! Find best matches from train collection for each query descriptor which have distance less than
+    //! maxDistance (in increasing order of distances).
     void radiusMatch(const GpuMat& query, std::vector< std::vector<DMatch> >& matches, float maxDistance,
         const std::vector<GpuMat>& masks = std::vector<GpuMat>(), bool compactResult = false);
 
@@ -203,6 +228,8 @@ private:
     std::vector<GpuMat> trainDescCollection;
 };
 
+/** @brief Class used for corner detection using the FAST algorithm.
+ */
 class CV_EXPORTS FAST_CUDA
 {
 public:
@@ -213,23 +240,45 @@ public:
         ROWS_COUNT
     };
 
-    // all features have same size
+    //! all features have same size
     static const int FEATURE_SIZE = 7;
 
+    /** @brief Constructor.
+
+    @param threshold Threshold on difference between intensity of the central pixel and pixels on a
+    circle around this pixel.
+    @param nonmaxSuppression If it is true, non-maximum suppression is applied to detected corners
+    (keypoints).
+    @param keypointsRatio Inner buffer size for keypoints store is determined as (keypointsRatio \*
+    image\_width \* image\_height).
+     */
     explicit FAST_CUDA(int threshold, bool nonmaxSuppression = true, double keypointsRatio = 0.05);
 
-    //! finds the keypoints using FAST detector
-    //! supports only CV_8UC1 images
+    /** @brief Finds the keypoints using FAST detector.
+
+    @param image Image where keypoints (corners) are detected. Only 8-bit grayscale images are
+    supported.
+    @param mask Optional input mask that marks the regions where we should detect features.
+    @param keypoints The output vector of keypoints. Can be stored both in CPU and GPU memory. For GPU
+    memory:
+    -   keypoints.ptr\<Vec2s\>(LOCATION\_ROW)[i] will contain location of i'th point
+    -   keypoints.ptr\<float\>(RESPONSE\_ROW)[i] will contain response of i'th point (if non-maximum
+    suppression is applied)
+     */
     void operator ()(const GpuMat& image, const GpuMat& mask, GpuMat& keypoints);
+    /** @overload */
     void operator ()(const GpuMat& image, const GpuMat& mask, std::vector<KeyPoint>& keypoints);
 
-    //! download keypoints from device to host memory
+    /** @brief Download keypoints from GPU to CPU memory.
+    */
     static void downloadKeypoints(const GpuMat& d_keypoints, std::vector<KeyPoint>& keypoints);
 
-    //! convert keypoints to KeyPoint vector
+    /** @brief Converts keypoints from CUDA representation to vector of KeyPoint.
+    */
     static void convertKeypoints(const Mat& h_keypoints, std::vector<KeyPoint>& keypoints);
 
-    //! release temporary buffer's memory
+    /** @brief Releases inner buffer memory.
+    */
     void release();
 
     bool nonmaxSuppression;
@@ -239,13 +288,22 @@ public:
     //! max keypoints = keypointsRatio * img.size().area()
     double keypointsRatio;
 
-    //! find keypoints and compute it's response if nonmaxSuppression is true
-    //! return count of detected keypoints
+    /** @brief Finds keypoints and computes their responses if nonmaxSuppression is true.
+
+    @param image Image where keypoints (corners) are detected. Only 8-bit grayscale images are
+    supported.
+    @param mask Optional input mask that marks the regions where we should detect features.
+
+    The function returns count of detected keypoints.
+     */
     int calcKeyPointsLocation(const GpuMat& image, const GpuMat& mask);
 
-    //! get final array of keypoints
-    //! performs nonmax suppression if needed
-    //! return final count of keypoints
+    /** @brief Gets final array of keypoints.
+
+    @param keypoints The output vector of keypoints.
+
+    The function performs non-max suppression if needed and returns final count of keypoints.
+     */
     int getKeyPoints(GpuMat& keypoints);
 
 private:
@@ -257,6 +315,8 @@ private:
     GpuMat d_keypoints_;
 };
 
+/** @brief Class for extracting ORB features and descriptors from an image.
+ */
 class CV_EXPORTS ORB_CUDA
 {
 public:
@@ -276,28 +336,51 @@ public:
         DEFAULT_FAST_THRESHOLD = 20
     };
 
-    //! Constructor
+    /** @brief Constructor.
+
+    @param nFeatures The number of desired features.
+    @param scaleFactor Coefficient by which we divide the dimensions from one scale pyramid level to
+    the next.
+    @param nLevels The number of levels in the scale pyramid.
+    @param edgeThreshold How far from the boundary the points should be.
+    @param firstLevel The level at which the image is given. If 1, that means we will also look at the
+    image scaleFactor times bigger.
+    @param WTA_K
+    @param scoreType
+    @param patchSize
+     */
     explicit ORB_CUDA(int nFeatures = 500, float scaleFactor = 1.2f, int nLevels = 8, int edgeThreshold = 31,
                      int firstLevel = 0, int WTA_K = 2, int scoreType = 0, int patchSize = 31);
 
-    //! Compute the ORB features on an image
-    //! image - the image to compute the features (supports only CV_8UC1 images)
-    //! mask - the mask to apply
-    //! keypoints - the resulting keypoints
+    /** @overload */
     void operator()(const GpuMat& image, const GpuMat& mask, std::vector<KeyPoint>& keypoints);
+    /** @overload */
     void operator()(const GpuMat& image, const GpuMat& mask, GpuMat& keypoints);
 
-    //! Compute the ORB features and descriptors on an image
-    //! image - the image to compute the features (supports only CV_8UC1 images)
-    //! mask - the mask to apply
-    //! keypoints - the resulting keypoints
-    //! descriptors - descriptors array
+    /** @brief Detects keypoints and computes descriptors for them.
+
+    @param image Input 8-bit grayscale image.
+    @param mask Optional input mask that marks the regions where we should detect features.
+    @param keypoints The input/output vector of keypoints. Can be stored both in CPU and GPU memory.
+    For GPU memory:
+    -   keypoints.ptr\<float\>(X\_ROW)[i] contains x coordinate of the i'th feature.
+    -   keypoints.ptr\<float\>(Y\_ROW)[i] contains y coordinate of the i'th feature.
+    -   keypoints.ptr\<float\>(RESPONSE\_ROW)[i] contains the response of the i'th feature.
+    -   keypoints.ptr\<float\>(ANGLE\_ROW)[i] contains orientation of the i'th feature.
+    -   keypoints.ptr\<float\>(OCTAVE\_ROW)[i] contains the octave of the i'th feature.
+    -   keypoints.ptr\<float\>(SIZE\_ROW)[i] contains the size of the i'th feature.
+    @param descriptors Computed descriptors. If blurForDescriptor is true, image will be blurred
+    before descriptors calculation.
+     */
     void operator()(const GpuMat& image, const GpuMat& mask, std::vector<KeyPoint>& keypoints, GpuMat& descriptors);
+    /** @overload */
     void operator()(const GpuMat& image, const GpuMat& mask, GpuMat& keypoints, GpuMat& descriptors);
 
-    //! download keypoints from device to host memory
+    /** @brief Download keypoints from GPU to CPU memory.
+    */
     static void downloadKeyPoints(const GpuMat& d_keypoints, std::vector<KeyPoint>& keypoints);
-    //! convert keypoints to KeyPoint vector
+    /** @brief Converts keypoints from CUDA representation to vector of KeyPoint.
+    */
     static void convertKeyPoints(const Mat& d_keypoints, std::vector<KeyPoint>& keypoints);
 
     //! returns the descriptor size in bytes
@@ -309,7 +392,8 @@ public:
         fastDetector_.nonmaxSuppression = nonmaxSuppression;
     }
 
-    //! release temporary buffer's memory
+    /** @brief Releases inner buffer memory.
+    */
     void release();
 
     //! if true, image will be blurred before descriptors calculation
@@ -335,10 +419,10 @@ private:
     int scoreType_;
     int patchSize_;
 
-    // The number of desired features per scale
+    //! The number of desired features per scale
     std::vector<size_t> n_features_per_level_;
 
-    // Points to compute BRIEF descriptors from
+    //! Points to compute BRIEF descriptors from
     GpuMat pattern_;
 
     std::vector<GpuMat> imagePyr_;
@@ -356,6 +440,8 @@ private:
     GpuMat d_keypoints_;
 };
 
+//! @}
+
 }} // namespace cv { namespace cuda {
 
 #endif /* __OPENCV_CUDAFEATURES2D_HPP__ */
diff --git a/modules/cudafilters/include/opencv2/cudafilters.hpp b/modules/cudafilters/include/opencv2/cudafilters.hpp
index 2c06575b5..a7954916b 100644
--- a/modules/cudafilters/include/opencv2/cudafilters.hpp
+++ b/modules/cudafilters/include/opencv2/cudafilters.hpp
@@ -50,65 +50,189 @@
 #include "opencv2/core/cuda.hpp"
 #include "opencv2/imgproc.hpp"
 
+/**
+  @addtogroup cuda
+  @{
+    @defgroup cudafilters Image Filtering
+
+Functions and classes described in this section are used to perform various linear or non-linear
+filtering operations on 2D images.
+
+@note
+   -   An example containing all basic morphology operators like erode and dilate can be found at
+        opencv\_source\_code/samples/gpu/morphology.cpp
+
+  @}
+ */
+
 namespace cv { namespace cuda {
 
+//! @addtogroup cudafilters
+//! @{
+
+/** @brief Common interface for all CUDA filters.
+ */
 class CV_EXPORTS Filter : public Algorithm
 {
 public:
+    /** @brief Applies the specified filter to the image.
+
+    @param src Input image.
+    @param dst Output image.
+    @param stream Stream for the asynchronous version.
+     */
     virtual void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null()) = 0;
 };
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Box Filter
 
-//! creates a normalized 2D box filter
-//! supports CV_8UC1, CV_8UC4 types
+/** @brief Creates a normalized 2D box filter.
+
+@param srcType Input image type. Only CV\_8UC1 and CV\_8UC4 are supported for now.
+@param dstType Output image type. Only the same type as src is supported for now.
+@param ksize Kernel size.
+@param anchor Anchor point. The default value Point(-1, -1) means that the anchor is at the kernel
+center.
+@param borderMode Pixel extrapolation method. For details, see borderInterpolate .
+@param borderVal Default border value.
+
+@sa boxFilter
+ */
 CV_EXPORTS Ptr<Filter> createBoxFilter(int srcType, int dstType, Size ksize, Point anchor = Point(-1,-1),
                                        int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Linear Filter
 
-//! Creates a non-separable linear 2D filter
-//! supports 1 and 4 channel CV_8U, CV_16U and CV_32F input
+/** @brief Creates a non-separable linear 2D filter.
+
+@param srcType Input image type. Supports CV\_8U , CV\_16U and CV\_32F one and four channel image.
+@param dstType Output image type. Only the same type as src is supported for now.
+@param kernel 2D array of filter coefficients.
+@param anchor Anchor point. The default value Point(-1, -1) means that the anchor is at the kernel
+center.
+@param borderMode Pixel extrapolation method. For details, see borderInterpolate .
+@param borderVal Default border value.
+
+@sa filter2D
+ */
 CV_EXPORTS Ptr<Filter> createLinearFilter(int srcType, int dstType, InputArray kernel, Point anchor = Point(-1,-1),
                                           int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Laplacian Filter
 
-//! creates a Laplacian operator
-//! supports only ksize = 1 and ksize = 3
+/** @brief Creates a Laplacian operator.
+
+@param srcType Input image type. Supports CV\_8U , CV\_16U and CV\_32F one and four channel image.
+@param dstType Output image type. Only the same type as src is supported for now.
+@param ksize Aperture size used to compute the second-derivative filters (see getDerivKernels). It
+must be positive and odd. Only ksize = 1 and ksize = 3 are supported.
+@param scale Optional scale factor for the computed Laplacian values. By default, no scaling is
+applied (see getDerivKernels ).
+@param borderMode Pixel extrapolation method. For details, see borderInterpolate .
+@param borderVal Default border value.
+
+@sa Laplacian
+ */
 CV_EXPORTS Ptr<Filter> createLaplacianFilter(int srcType, int dstType, int ksize = 1, double scale = 1,
                                              int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Separable Linear Filter
 
-//! creates a separable linear filter
+/** @brief Creates a separable linear filter.
+
+@param srcType Source array type.
+@param dstType Destination array type.
+@param rowKernel Horizontal filter coefficients. Support kernels with size \<= 32 .
+@param columnKernel Vertical filter coefficients. Support kernels with size \<= 32 .
+@param anchor Anchor position within the kernel. Negative values mean that anchor is positioned at
+the aperture center.
+@param rowBorderMode Pixel extrapolation method in the vertical direction. For details, see
+borderInterpolate.
+@param columnBorderMode Pixel extrapolation method in the horizontal direction.
+
+@sa sepFilter2D
+ */
 CV_EXPORTS Ptr<Filter> createSeparableLinearFilter(int srcType, int dstType, InputArray rowKernel, InputArray columnKernel,
                                                    Point anchor = Point(-1,-1), int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1);
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Deriv Filter
 
-//! creates a generalized Deriv operator
+/** @brief Creates a generalized Deriv operator.
+
+@param srcType Source image type.
+@param dstType Destination array type.
+@param dx Derivative order with respect to x.
+@param dy Derivative order with respect to y.
+@param ksize Aperture size. See getDerivKernels for details.
+@param normalize Flag indicating whether to normalize (scale down) the filter coefficients or not.
+See getDerivKernels for details.
+@param scale Optional scale factor for the computed derivative values. By default, no scaling is
+applied. For details, see getDerivKernels .
+@param rowBorderMode Pixel extrapolation method in the vertical direction. For details, see
+borderInterpolate.
+@param columnBorderMode Pixel extrapolation method in the horizontal direction.
+ */
 CV_EXPORTS Ptr<Filter> createDerivFilter(int srcType, int dstType, int dx, int dy,
                                          int ksize, bool normalize = false, double scale = 1,
                                          int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1);
 
-//! creates a Sobel operator
+/** @brief Creates a Sobel operator.
+
+@param srcType Source image type.
+@param dstType Destination array type.
+@param dx Derivative order with respect to x.
+@param dy Derivative order with respect to y.
+@param ksize Size of the extended Sobel kernel. Possible values are 1, 3, 5 or 7.
+@param scale Optional scale factor for the computed derivative values. By default, no scaling is
+applied. For details, see getDerivKernels .
+@param rowBorderMode Pixel extrapolation method in the vertical direction. For details, see
+borderInterpolate.
+@param columnBorderMode Pixel extrapolation method in the horizontal direction.
+
+@sa Sobel
+ */
 CV_EXPORTS Ptr<Filter> createSobelFilter(int srcType, int dstType, int dx, int dy, int ksize = 3,
                                          double scale = 1, int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1);
 
-//! creates a vertical or horizontal Scharr operator
+/** @brief Creates a vertical or horizontal Scharr operator.
+
+@param srcType Source image type.
+@param dstType Destination array type.
+@param dx Order of the derivative x.
+@param dy Order of the derivative y.
+@param scale Optional scale factor for the computed derivative values. By default, no scaling is
+applied. See getDerivKernels for details.
+@param rowBorderMode Pixel extrapolation method in the vertical direction. For details, see
+borderInterpolate.
+@param columnBorderMode Pixel extrapolation method in the horizontal direction.
+
+@sa Scharr
+ */
 CV_EXPORTS Ptr<Filter> createScharrFilter(int srcType, int dstType, int dx, int dy,
                                           double scale = 1, int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1);
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Gaussian Filter
 
-//! creates a Gaussian filter
+/** @brief Creates a Gaussian filter.
+
+@param srcType Source image type.
+@param dstType Destination array type.
+@param ksize Aperture size. See getGaussianKernel for details.
+@param sigma1 Gaussian sigma in the horizontal direction. See getGaussianKernel for details.
+@param sigma2 Gaussian sigma in the vertical direction. If 0, then
+\f$\texttt{sigma2}\leftarrow\texttt{sigma1}\f$ .
+@param rowBorderMode Pixel extrapolation method in the vertical direction. For details, see
+borderInterpolate.
+@param columnBorderMode Pixel extrapolation method in the horizontal direction.
+
+@sa GaussianBlur
+ */
 CV_EXPORTS Ptr<Filter> createGaussianFilter(int srcType, int dstType, Size ksize,
                                             double sigma1, double sigma2 = 0,
                                             int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1);
@@ -116,19 +240,49 @@ CV_EXPORTS Ptr<Filter> createGaussianFilter(int srcType, int dstType, Size ksize
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Morphology Filter
 
-//! creates a 2D morphological filter
-//! supports CV_8UC1 and CV_8UC4 types
+/** @brief Creates a 2D morphological filter.
+
+@param op Type of morphological operation. The following types are possible:
+-   **MORPH\_ERODE** erode
+-   **MORPH\_DILATE** dilate
+-   **MORPH\_OPEN** opening
+-   **MORPH\_CLOSE** closing
+-   **MORPH\_GRADIENT** morphological gradient
+-   **MORPH\_TOPHAT** "top hat"
+-   **MORPH\_BLACKHAT** "black hat"
+@param srcType Input/output image type. Only CV\_8UC1 and CV\_8UC4 are supported.
+@param kernel 2D 8-bit structuring element for the morphological operation.
+@param anchor Anchor position within the structuring element. Negative values mean that the anchor
+is at the center.
+@param iterations Number of times erosion and dilation are applied.
+
+@sa morphologyEx
+ */
 CV_EXPORTS Ptr<Filter> createMorphologyFilter(int op, int srcType, InputArray kernel, Point anchor = Point(-1, -1), int iterations = 1);
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Image Rank Filter
 
-//! result pixel value is the maximum of pixel values under the rectangular mask region
+/** @brief Creates the maximum filter.
+
+@param srcType Input/output image type. Only CV\_8UC1 and CV\_8UC4 are supported.
+@param ksize Kernel size.
+@param anchor Anchor point. The default value Point(-1, -1) means that the anchor is at the kernel center.
+@param borderMode Pixel extrapolation method. For details, see borderInterpolate .
+@param borderVal Default border value.
+ */
 CV_EXPORTS Ptr<Filter> createBoxMaxFilter(int srcType, Size ksize,
                                           Point anchor = Point(-1, -1),
                                           int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
 
-//! result pixel value is the maximum of pixel values under the rectangular mask region
+/** @brief Creates the minimum filter.
+
+@param srcType Input/output image type. Only CV\_8UC1 and CV\_8UC4 are supported.
+@param ksize Kernel size.
+@param anchor Anchor point. The default value Point(-1, -1) means that the anchor is at the kernel center.
+@param borderMode Pixel extrapolation method. For details, see borderInterpolate .
+@param borderVal Default border value.
+ */
 CV_EXPORTS Ptr<Filter> createBoxMinFilter(int srcType, Size ksize,
                                           Point anchor = Point(-1, -1),
                                           int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
@@ -136,14 +290,30 @@ CV_EXPORTS Ptr<Filter> createBoxMinFilter(int srcType, Size ksize,
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // 1D Sum Filter
 
-//! creates a horizontal 1D box filter
-//! supports only CV_8UC1 source type and CV_32FC1 sum type
+/** @brief Creates a horizontal 1D box filter.
+
+@param srcType Input image type. Only CV\_8UC1 type is supported for now.
+@param dstType Output image type. Only CV\_32FC1 type is supported for now.
+@param ksize Kernel size.
+@param anchor Anchor point. The default value (-1) means that the anchor is at the kernel center.
+@param borderMode Pixel extrapolation method. For details, see borderInterpolate .
+@param borderVal Default border value.
+ */
 CV_EXPORTS Ptr<Filter> createRowSumFilter(int srcType, int dstType, int ksize, int anchor = -1, int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
 
-//! creates a vertical 1D box filter
-//! supports only CV_8UC1 sum type and CV_32FC1 dst type
+/** @brief Creates a vertical 1D box filter.
+
+@param srcType Input image type. Only CV\_8UC1 type is supported for now.
+@param dstType Output image type. Only CV\_32FC1 type is supported for now.
+@param ksize Kernel size.
+@param anchor Anchor point. The default value (-1) means that the anchor is at the kernel center.
+@param borderMode Pixel extrapolation method. For details, see borderInterpolate .
+@param borderVal Default border value.
+ */
 CV_EXPORTS Ptr<Filter> createColumnSumFilter(int srcType, int dstType, int ksize, int anchor = -1, int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
 
+//! @}
+
 }} // namespace cv { namespace cuda {
 
 #endif /* __OPENCV_CUDAFILTERS_HPP__ */
diff --git a/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp b/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp
index d451b93b1..2b3e14659 100644
--- a/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp
+++ b/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp
@@ -50,16 +50,48 @@
 #include "opencv2/core/cuda.hpp"
 #include "opencv2/imgproc.hpp"
 
+/**
+  @addtogroup cuda
+  @{
+    @defgroup cudaimgproc Image Processing
+    @{
+      @defgroup cudaimgproc_color Color space processing
+      @defgroup cudaimgproc_hist Histogram Calculation
+      @defgroup cudaimgproc_hough Hough Transform
+      @defgroup cudaimgproc_feature Feature Detection
+    @}
+  @}
+*/
+
 namespace cv { namespace cuda {
 
+//! @addtogroup cudaimgproc
+//! @{
+
 /////////////////////////// Color Processing ///////////////////////////
 
-//! converts image from one color space to another
+//! @addtogroup cudaimgproc_color
+//! @{
+
+/** @brief Converts an image from one color space to another.
+
+@param src Source image with CV\_8U , CV\_16U , or CV\_32F depth and 1, 3, or 4 channels.
+@param dst Destination image.
+@param code Color space conversion code. For details, see cvtColor .
+@param dcn Number of channels in the destination image. If the parameter is 0, the number of the
+channels is derived automatically from src and the code .
+@param stream Stream for the asynchronous version.
+
+3-channel color spaces (like HSV, XYZ, and so on) can be stored in a 4-channel image for better
+performance.
+
+@sa cvtColor
+ */
 CV_EXPORTS void cvtColor(InputArray src, OutputArray dst, int code, int dcn = 0, Stream& stream = Stream::Null());
 
 enum
 {
-    // Bayer Demosaicing (Malvar, He, and Cutler)
+    //! Bayer Demosaicing (Malvar, He, and Cutler)
     COLOR_BayerBG2BGR_MHT = 256,
     COLOR_BayerGB2BGR_MHT = 257,
     COLOR_BayerRG2BGR_MHT = 258,
@@ -75,105 +107,228 @@ enum
     COLOR_BayerRG2GRAY_MHT = 262,
     COLOR_BayerGR2GRAY_MHT = 263
 };
+
+/** @brief Converts an image from Bayer pattern to RGB or grayscale.
+
+@param src Source image (8-bit or 16-bit single channel).
+@param dst Destination image.
+@param code Color space conversion code (see the description below).
+@param dcn Number of channels in the destination image. If the parameter is 0, the number of the
+channels is derived automatically from src and the code .
+@param stream Stream for the asynchronous version.
+
+The function can do the following transformations:
+
+-   Demosaicing using bilinear interpolation
+
+    > -   COLOR\_BayerBG2GRAY , COLOR\_BayerGB2GRAY , COLOR\_BayerRG2GRAY , COLOR\_BayerGR2GRAY
+    > -   COLOR\_BayerBG2BGR , COLOR\_BayerGB2BGR , COLOR\_BayerRG2BGR , COLOR\_BayerGR2BGR
+
+-   Demosaicing using Malvar-He-Cutler algorithm (@cite MHT2011)
+
+    > -   COLOR\_BayerBG2GRAY\_MHT , COLOR\_BayerGB2GRAY\_MHT , COLOR\_BayerRG2GRAY\_MHT ,
+    >     COLOR\_BayerGR2GRAY\_MHT
+    > -   COLOR\_BayerBG2BGR\_MHT , COLOR\_BayerGB2BGR\_MHT , COLOR\_BayerRG2BGR\_MHT ,
+    >     COLOR\_BayerGR2BGR\_MHT
+
+@sa cvtColor
+ */
 CV_EXPORTS void demosaicing(InputArray src, OutputArray dst, int code, int dcn = -1, Stream& stream = Stream::Null());
 
-//! swap channels
-//! dstOrder - Integer array describing how channel values are permutated. The n-th entry
-//!            of the array contains the number of the channel that is stored in the n-th channel of
-//!            the output image. E.g. Given an RGBA image, aDstOrder = [3,2,1,0] converts this to ABGR
-//!            channel order.
+/** @brief Exchanges the color channels of an image in-place.
+
+@param image Source image. Supports only CV\_8UC4 type.
+@param dstOrder Integer array describing how channel values are permutated. The n-th entry of the
+array contains the number of the channel that is stored in the n-th channel of the output image.
+E.g. Given an RGBA image, dstOrder = [3,2,1,0] converts this to ABGR channel order.
+@param stream Stream for the asynchronous version.
+
+The method supports arbitrary permutations of the original channels, including replication.
+ */
 CV_EXPORTS void swapChannels(InputOutputArray image, const int dstOrder[4], Stream& stream = Stream::Null());
 
-//! Routines for correcting image color gamma
+/** @brief Routines for correcting image color gamma.
+
+@param src Source image (3- or 4-channel 8 bit).
+@param dst Destination image.
+@param forward true for forward gamma correction or false for inverse gamma correction.
+@param stream Stream for the asynchronous version.
+ */
 CV_EXPORTS void gammaCorrection(InputArray src, OutputArray dst, bool forward = true, Stream& stream = Stream::Null());
 
 enum { ALPHA_OVER, ALPHA_IN, ALPHA_OUT, ALPHA_ATOP, ALPHA_XOR, ALPHA_PLUS, ALPHA_OVER_PREMUL, ALPHA_IN_PREMUL, ALPHA_OUT_PREMUL,
        ALPHA_ATOP_PREMUL, ALPHA_XOR_PREMUL, ALPHA_PLUS_PREMUL, ALPHA_PREMUL};
 
-//! Composite two images using alpha opacity values contained in each image
-//! Supports CV_8UC4, CV_16UC4, CV_32SC4 and CV_32FC4 types
+/** @brief Composites two images using alpha opacity values contained in each image.
+
+@param img1 First image. Supports CV\_8UC4 , CV\_16UC4 , CV\_32SC4 and CV\_32FC4 types.
+@param img2 Second image. Must have the same size and the same type as img1 .
+@param dst Destination image.
+@param alpha_op Flag specifying the alpha-blending operation:
+-   **ALPHA\_OVER**
+-   **ALPHA\_IN**
+-   **ALPHA\_OUT**
+-   **ALPHA\_ATOP**
+-   **ALPHA\_XOR**
+-   **ALPHA\_PLUS**
+-   **ALPHA\_OVER\_PREMUL**
+-   **ALPHA\_IN\_PREMUL**
+-   **ALPHA\_OUT\_PREMUL**
+-   **ALPHA\_ATOP\_PREMUL**
+-   **ALPHA\_XOR\_PREMUL**
+-   **ALPHA\_PLUS\_PREMUL**
+-   **ALPHA\_PREMUL**
+@param stream Stream for the asynchronous version.
+
+@note
+   -   An example demonstrating the use of alphaComp can be found at
+        opencv\_source\_code/samples/gpu/alpha\_comp.cpp
+ */
 CV_EXPORTS void alphaComp(InputArray img1, InputArray img2, OutputArray dst, int alpha_op, Stream& stream = Stream::Null());
 
+//! @} cudaimgproc_color
+
 ////////////////////////////// Histogram ///////////////////////////////
 
-//! Calculates histogram for 8u one channel image
-//! Output hist will have one row, 256 cols and CV32SC1 type.
+//! @addtogroup cudaimgproc_hist
+//! @{
+
+/** @brief Calculates histogram for one channel 8-bit image.
+
+@param src Source image with CV\_8UC1 type.
+@param hist Destination histogram with one row, 256 columns, and the CV\_32SC1 type.
+@param stream Stream for the asynchronous version.
+ */
 CV_EXPORTS void calcHist(InputArray src, OutputArray hist, Stream& stream = Stream::Null());
 
-//! normalizes the grayscale image brightness and contrast by normalizing its histogram
+/** @brief Equalizes the histogram of a grayscale image.
+
+@param src Source image with CV\_8UC1 type.
+@param dst Destination image.
+@param buf Optional buffer to avoid extra memory allocations (for many calls with the same sizes).
+@param stream Stream for the asynchronous version.
+
+@sa equalizeHist
+ */
 CV_EXPORTS void equalizeHist(InputArray src, OutputArray dst, InputOutputArray buf, Stream& stream = Stream::Null());
 
+/** @overload */
 static inline void equalizeHist(InputArray src, OutputArray dst, Stream& stream = Stream::Null())
 {
     GpuMat buf;
     cuda::equalizeHist(src, dst, buf, stream);
 }
 
+/** @brief Base class for Contrast Limited Adaptive Histogram Equalization.
+ */
 class CV_EXPORTS CLAHE : public cv::CLAHE
 {
 public:
     using cv::CLAHE::apply;
+    /** @brief Equalizes the histogram of a grayscale image using Contrast Limited Adaptive Histogram Equalization.
+
+    @param src Source image with CV\_8UC1 type.
+    @param dst Destination image.
+    @param stream Stream for the asynchronous version.
+     */
     virtual void apply(InputArray src, OutputArray dst, Stream& stream) = 0;
 };
+
+/** @brief Creates implementation for cuda::CLAHE .
+
+@param clipLimit Threshold for contrast limiting.
+@param tileGridSize Size of grid for histogram equalization. Input image will be divided into
+equally sized rectangular tiles. tileGridSize defines the number of tiles in row and column.
+ */
 CV_EXPORTS Ptr<cuda::CLAHE> createCLAHE(double clipLimit = 40.0, Size tileGridSize = Size(8, 8));
 
-//! Compute levels with even distribution. levels will have 1 row and nLevels cols and CV_32SC1 type.
+/** @brief Computes levels with even distribution.
+
+@param levels Destination array. levels has 1 row, nLevels columns, and the CV\_32SC1 type.
+@param nLevels Number of computed levels. nLevels must be at least 2.
+@param lowerLevel Lower boundary value of the lowest level.
+@param upperLevel Upper boundary value of the greatest level.
+ */
 CV_EXPORTS void evenLevels(OutputArray levels, int nLevels, int lowerLevel, int upperLevel);
 
-//! Calculates histogram with evenly distributed bins for signle channel source.
-//! Supports CV_8UC1, CV_16UC1 and CV_16SC1 source types.
-//! Output hist will have one row and histSize cols and CV_32SC1 type.
+/** @brief Calculates a histogram with evenly distributed bins.
+
+@param src Source image. CV\_8U, CV\_16U, or CV\_16S depth and 1 or 4 channels are supported. For
+a four-channel image, all channels are processed separately.
+@param hist Destination histogram with one row, histSize columns, and the CV\_32S type.
+@param histSize Size of the histogram.
+@param lowerLevel Lower boundary of lowest-level bin.
+@param upperLevel Upper boundary of highest-level bin.
+@param buf Optional buffer to avoid extra memory allocations (for many calls with the same sizes).
+@param stream Stream for the asynchronous version.
+ */
 CV_EXPORTS void histEven(InputArray src, OutputArray hist, InputOutputArray buf, int histSize, int lowerLevel, int upperLevel, Stream& stream = Stream::Null());
 
+/** @overload */
 static inline void histEven(InputArray src, OutputArray hist, int histSize, int lowerLevel, int upperLevel, Stream& stream = Stream::Null())
 {
     GpuMat buf;
     cuda::histEven(src, hist, buf, histSize, lowerLevel, upperLevel, stream);
 }
 
-//! Calculates histogram with evenly distributed bins for four-channel source.
-//! All channels of source are processed separately.
-//! Supports CV_8UC4, CV_16UC4 and CV_16SC4 source types.
-//! Output hist[i] will have one row and histSize[i] cols and CV_32SC1 type.
+/** @overload */
 CV_EXPORTS void histEven(InputArray src, GpuMat hist[4], InputOutputArray buf, int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream = Stream::Null());
 
+/** @overload */
 static inline void histEven(InputArray src, GpuMat hist[4], int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream = Stream::Null())
 {
     GpuMat buf;
     cuda::histEven(src, hist, buf, histSize, lowerLevel, upperLevel, stream);
 }
 
-//! Calculates histogram with bins determined by levels array.
-//! levels must have one row and CV_32SC1 type if source has integer type or CV_32FC1 otherwise.
-//! Supports CV_8UC1, CV_16UC1, CV_16SC1 and CV_32FC1 source types.
-//! Output hist will have one row and (levels.cols-1) cols and CV_32SC1 type.
+/** @brief Calculates a histogram with bins determined by the levels array.
+
+@param src Source image. CV\_8U , CV\_16U , CV\_16S , or CV\_32F depth and 1 or 4 channels are supported.
+For a four-channel image, all channels are processed separately.
+@param hist Destination histogram with one row, (levels.cols-1) columns, and the CV\_32SC1 type.
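+@param levels Array of level boundaries defining the histogram bins. It must have one row and the CV\_32SC1 type if the source has an integer type, or CV\_32FC1 otherwise.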
+@param buf Optional buffer to avoid extra memory allocations (for many calls with the same sizes).
+@param stream Stream for the asynchronous version.
+ */
 CV_EXPORTS void histRange(InputArray src, OutputArray hist, InputArray levels, InputOutputArray buf, Stream& stream = Stream::Null());
 
+/** @overload */
 static inline void histRange(InputArray src, OutputArray hist, InputArray levels, Stream& stream = Stream::Null())
 {
     GpuMat buf;
     cuda::histRange(src, hist, levels, buf, stream);
 }
 
-//! Calculates histogram with bins determined by levels array.
-//! All levels must have one row and CV_32SC1 type if source has integer type or CV_32FC1 otherwise.
-//! All channels of source are processed separately.
-//! Supports CV_8UC4, CV_16UC4, CV_16SC4 and CV_32FC4 source types.
-//! Output hist[i] will have one row and (levels[i].cols-1) cols and CV_32SC1 type.
+/** @overload */
 CV_EXPORTS void histRange(InputArray src, GpuMat hist[4], const GpuMat levels[4], InputOutputArray buf, Stream& stream = Stream::Null());
 
+/** @overload */
 static inline void histRange(InputArray src, GpuMat hist[4], const GpuMat levels[4], Stream& stream = Stream::Null())
 {
     GpuMat buf;
     cuda::histRange(src, hist, levels, buf, stream);
 }
 
+//! @} cudaimgproc_hist
+
 //////////////////////////////// Canny ////////////////////////////////
 
+/** @brief Base class for Canny Edge Detector.
+ */
 class CV_EXPORTS CannyEdgeDetector : public Algorithm
 {
 public:
+    /** @brief Finds edges in an image using the @cite Canny86 algorithm.
+
+    @param image Single-channel 8-bit input image.
+    @param edges Output edge map. It has the same size and type as image .
+     */
     virtual void detect(InputArray image, OutputArray edges) = 0;
+    /** @overload
+    @param dx First derivative of image in the horizontal direction. Support only CV\_32S type.
+    @param dy First derivative of image in the vertical direction. Support only CV\_32S type.
+    @param edges Output edge map. It has the same size and type as image .
+    */
     virtual void detect(InputArray dx, InputArray dy, OutputArray edges) = 0;
 
     virtual void setLowThreshold(double low_thresh) = 0;
@@ -189,6 +344,16 @@ public:
     virtual bool getL2Gradient() const = 0;
 };
 
+/** @brief Creates implementation for cuda::CannyEdgeDetector .
+
+@param low_thresh First threshold for the hysteresis procedure.
+@param high_thresh Second threshold for the hysteresis procedure.
+@param apperture_size Aperture size for the Sobel operator.
+@param L2gradient Flag indicating whether a more accurate \f$L_2\f$ norm
+\f$=\sqrt{(dI/dx)^2 + (dI/dy)^2}\f$ should be used to compute the image gradient magnitude (
+L2gradient=true ), or a faster default \f$L_1\f$ norm \f$=|dI/dx|+|dI/dy|\f$ is enough ( L2gradient=false
+).
+ */
 CV_EXPORTS Ptr<CannyEdgeDetector> createCannyEdgeDetector(double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);
 
 /////////////////////////// Hough Transform ////////////////////////////
@@ -196,10 +361,32 @@ CV_EXPORTS Ptr<CannyEdgeDetector> createCannyEdgeDetector(double low_thresh, dou
 //////////////////////////////////////
 // HoughLines
 
+//! @addtogroup cudaimgproc_hough
+//! @{
+
+/** @brief Base class for lines detector algorithm.
+ */
 class CV_EXPORTS HoughLinesDetector : public Algorithm
 {
 public:
+    /** @brief Finds lines in a binary image using the classical Hough transform.
+
+    @param src 8-bit, single-channel binary source image.
+    @param lines Output vector of lines. Each line is represented by a two-element vector
+    \f$(\rho, \theta)\f$ . \f$\rho\f$ is the distance from the coordinate origin \f$(0,0)\f$ (top-left corner of
+    the image). \f$\theta\f$ is the line rotation angle in radians (
+    \f$0 \sim \textrm{vertical line}, \pi/2 \sim \textrm{horizontal line}\f$ ).
+
+    @sa HoughLines
+     */
     virtual void detect(InputArray src, OutputArray lines) = 0;
+
+    /** @brief Downloads results from cuda::HoughLinesDetector::detect to host memory.
+
+    @param d_lines Result of cuda::HoughLinesDetector::detect .
+    @param h_lines Output host array.
+    @param h_votes Optional output array for the votes of each line.
+     */
     virtual void downloadResults(InputArray d_lines, OutputArray h_lines, OutputArray h_votes = noArray()) = 0;
 
     virtual void setRho(float rho) = 0;
@@ -218,16 +405,35 @@ public:
     virtual int getMaxLines() const = 0;
 };
 
+/** @brief Creates implementation for cuda::HoughLinesDetector .
+
+@param rho Distance resolution of the accumulator in pixels.
+@param theta Angle resolution of the accumulator in radians.
+@param threshold Accumulator threshold parameter. Only those lines are returned that get enough
+votes ( \f$>\texttt{threshold}\f$ ).
+@param doSort Performs lines sort by votes.
+@param maxLines Maximum number of output lines.
+ */
 CV_EXPORTS Ptr<HoughLinesDetector> createHoughLinesDetector(float rho, float theta, int threshold, bool doSort = false, int maxLines = 4096);
 
 
 //////////////////////////////////////
 // HoughLinesP
 
-//! finds line segments in the black-n-white image using probabilistic Hough transform
+/** @brief Base class for line segments detector algorithm.
+ */
 class CV_EXPORTS HoughSegmentDetector : public Algorithm
 {
 public:
+    /** @brief Finds line segments in a binary image using the probabilistic Hough transform.
+
+    @param src 8-bit, single-channel binary source image.
+    @param lines Output vector of lines. Each line is represented by a 4-element vector
+    \f$(x_1, y_1, x_2, y_2)\f$ , where \f$(x_1,y_1)\f$ and \f$(x_2, y_2)\f$ are the ending points of each detected
+    line segment.
+
+    @sa HoughLinesP
+     */
     virtual void detect(InputArray src, OutputArray lines) = 0;
 
     virtual void setRho(float rho) = 0;
@@ -246,14 +452,32 @@ public:
     virtual int getMaxLines() const = 0;
 };
 
+/** @brief Creates implementation for cuda::HoughSegmentDetector .
+
+@param rho Distance resolution of the accumulator in pixels.
+@param theta Angle resolution of the accumulator in radians.
+@param minLineLength Minimum line length. Line segments shorter than that are rejected.
+@param maxLineGap Maximum allowed gap between points on the same line to link them.
+@param maxLines Maximum number of output lines.
+ */
 CV_EXPORTS Ptr<HoughSegmentDetector> createHoughSegmentDetector(float rho, float theta, int minLineLength, int maxLineGap, int maxLines = 4096);
 
 //////////////////////////////////////
 // HoughCircles
 
+/** @brief Base class for circles detector algorithm.
+ */
 class CV_EXPORTS HoughCirclesDetector : public Algorithm
 {
 public:
+    /** @brief Finds circles in a grayscale image using the Hough transform.
+
+    @param src 8-bit, single-channel grayscale input image.
+    @param circles Output vector of found circles. Each circle is encoded as a 3-element
+    floating-point vector \f$(x, y, radius)\f$ .
+
+    @sa HoughCircles
+     */
     virtual void detect(InputArray src, OutputArray circles) = 0;
 
     virtual void setDp(float dp) = 0;
@@ -278,85 +502,257 @@ public:
     virtual int getMaxCircles() const = 0;
 };
 
+/** @brief Creates implementation for cuda::HoughCirclesDetector .
+
+@param dp Inverse ratio of the accumulator resolution to the image resolution. For example, if
+dp=1 , the accumulator has the same resolution as the input image. If dp=2 , the accumulator has
+half as big width and height.
+@param minDist Minimum distance between the centers of the detected circles. If the parameter is
+too small, multiple neighbor circles may be falsely detected in addition to a true one. If it is
+too large, some circles may be missed.
+@param cannyThreshold The higher threshold of the two passed to Canny edge detector (the lower one
+is half as large).
+@param votesThreshold The accumulator threshold for the circle centers at the detection stage. The
+smaller it is, the more false circles may be detected.
+@param minRadius Minimum circle radius.
+@param maxRadius Maximum circle radius.
+@param maxCircles Maximum number of output circles.
+ */
 CV_EXPORTS Ptr<HoughCirclesDetector> createHoughCirclesDetector(float dp, float minDist, int cannyThreshold, int votesThreshold, int minRadius, int maxRadius, int maxCircles = 4096);
 
 //////////////////////////////////////
 // GeneralizedHough
 
-//! Ballard, D.H. (1981). Generalizing the Hough transform to detect arbitrary shapes. Pattern Recognition 13 (2): 111-122.
-//! Detects position only without traslation and rotation
+/** @brief Creates implementation for generalized hough transform from @cite Ballard1981 .
+ */
 CV_EXPORTS Ptr<GeneralizedHoughBallard> createGeneralizedHoughBallard();
 
-//! Guil, N., González-Linares, J.M. and Zapata, E.L. (1999). Bidimensional shape detection using an invariant approach. Pattern Recognition 32 (6): 1025-1038.
-//! Detects position, traslation and rotation
+/** @brief Creates implementation for generalized hough transform from @cite Guil1999 .
+ */
 CV_EXPORTS Ptr<GeneralizedHoughGuil> createGeneralizedHoughGuil();
 
+//! @} cudaimgproc_hough
+
 ////////////////////////// Corners Detection ///////////////////////////
 
+//! @addtogroup cudaimgproc_feature
+//! @{
+
+/** @brief Base class for Cornerness Criteria computation.
+ */
 class CV_EXPORTS CornernessCriteria : public Algorithm
 {
 public:
+    /** @brief Computes the cornerness criteria at each image pixel.
+
+    @param src Source image.
+    @param dst Destination image containing cornerness values. It will have the same size as src and
+    CV\_32FC1 type.
+    @param stream Stream for the asynchronous version.
+     */
     virtual void compute(InputArray src, OutputArray dst, Stream& stream = Stream::Null()) = 0;
 };
 
-//! computes Harris cornerness criteria at each image pixel
+/** @brief Creates implementation for Harris cornerness criteria.
+
+@param srcType Input source type. Only CV\_8UC1 and CV\_32FC1 are supported for now.
+@param blockSize Neighborhood size.
+@param ksize Aperture parameter for the Sobel operator.
+@param k Harris detector free parameter.
+@param borderType Pixel extrapolation method. Only BORDER\_REFLECT101 and BORDER\_REPLICATE are
+supported for now.
+
+@sa cornerHarris
+ */
 CV_EXPORTS Ptr<CornernessCriteria> createHarrisCorner(int srcType, int blockSize, int ksize, double k, int borderType = BORDER_REFLECT101);
 
-//! computes minimum eigen value of 2x2 derivative covariation matrix at each pixel - the cornerness criteria
+/** @brief Creates implementation for the minimum eigen value of a 2x2 derivative covariation matrix (the
+cornerness criteria).
+
+@param srcType Input source type. Only CV\_8UC1 and CV\_32FC1 are supported for now.
+@param blockSize Neighborhood size.
+@param ksize Aperture parameter for the Sobel operator.
+@param borderType Pixel extrapolation method. Only BORDER\_REFLECT101 and BORDER\_REPLICATE are
+supported for now.
+
+@sa cornerMinEigenVal
+ */
 CV_EXPORTS Ptr<CornernessCriteria> createMinEigenValCorner(int srcType, int blockSize, int ksize, int borderType = BORDER_REFLECT101);
 
 ////////////////////////// Corners Detection ///////////////////////////
 
+/** @brief Base class for Corners Detector.
+ */
 class CV_EXPORTS CornersDetector : public Algorithm
 {
 public:
-    //! return 1 rows matrix with CV_32FC2 type
+    /** @brief Determines strong corners on an image.
+
+    @param image Input 8-bit or floating-point 32-bit, single-channel image.
+    @param corners Output vector of detected corners (1-row matrix with CV\_32FC2 type with corners
+    positions).
+    @param mask Optional region of interest. If the mask is not empty (it needs to have the type
+    CV\_8UC1 and the same size as image ), it specifies the region in which the corners are detected.
+     */
     virtual void detect(InputArray image, OutputArray corners, InputArray mask = noArray()) = 0;
 };
 
+/** @brief Creates implementation for cuda::CornersDetector .
+
+@param srcType Input source type. Only CV\_8UC1 and CV\_32FC1 are supported for now.
+@param maxCorners Maximum number of corners to return. If more corners than maxCorners are found,
+the strongest of them are returned.
+@param qualityLevel Parameter characterizing the minimal accepted quality of image corners. The
+parameter value is multiplied by the best corner quality measure, which is the minimal eigenvalue
+(see cornerMinEigenVal ) or the Harris function response (see cornerHarris ). The corners with the
+quality measure less than the product are rejected. For example, if the best corner has the
+quality measure = 1500, and the qualityLevel=0.01 , then all the corners with the quality measure
+less than 15 are rejected.
+@param minDistance Minimum possible Euclidean distance between the returned corners.
+@param blockSize Size of an average block for computing a derivative covariation matrix over each
+pixel neighborhood. See cornerEigenValsAndVecs .
+@param useHarrisDetector Parameter indicating whether to use a Harris detector (see cornerHarris)
+or cornerMinEigenVal.
+@param harrisK Free parameter of the Harris detector.
+ */
 CV_EXPORTS Ptr<CornersDetector> createGoodFeaturesToTrackDetector(int srcType, int maxCorners = 1000, double qualityLevel = 0.01, double minDistance = 0.0,
                                                                   int blockSize = 3, bool useHarrisDetector = false, double harrisK = 0.04);
 
+//! @} cudaimgproc_feature
+
 ///////////////////////////// Mean Shift //////////////////////////////
 
-//! Does mean shift filtering on GPU.
+/** @brief Performs mean-shift filtering for each point of the source image.
+
+@param src Source image. Only CV\_8UC4 images are supported for now.
+@param dst Destination image containing the color of mapped points. It has the same size and type
+as src .
+@param sp Spatial window radius.
+@param sr Color window radius.
+@param criteria Termination criteria. See TermCriteria.
+@param stream Stream for the asynchronous version.
+
+It maps each point of the source image into another point. As a result, you have a new color and new
+position of each point.
+ */
 CV_EXPORTS void meanShiftFiltering(InputArray src, OutputArray dst, int sp, int sr,
                                    TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1),
                                    Stream& stream = Stream::Null());
 
-//! Does mean shift procedure on GPU.
+/** @brief Performs a mean-shift procedure and stores information about processed points (their colors and
+positions) in two images.
+
+@param src Source image. Only CV\_8UC4 images are supported for now.
+@param dstr Destination image containing the color of mapped points. The size and type is the same
+as src .
+@param dstsp Destination image containing the position of mapped points. The size is the same as
+src size. The type is CV\_16SC2 .
+@param sp Spatial window radius.
+@param sr Color window radius.
+@param criteria Termination criteria. See TermCriteria.
+@param stream Stream for the asynchronous version.
+
+@sa cuda::meanShiftFiltering
+ */
 CV_EXPORTS void meanShiftProc(InputArray src, OutputArray dstr, OutputArray dstsp, int sp, int sr,
                               TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1),
                               Stream& stream = Stream::Null());
 
-//! Does mean shift segmentation with elimination of small regions.
+/** @brief Performs a mean-shift segmentation of the source image and eliminates small segments.
+
+@param src Source image. Only CV\_8UC4 images are supported for now.
+@param dst Segmented image with the same size and type as src (host memory).
+@param sp Spatial window radius.
+@param sr Color window radius.
+@param minsize Minimum segment size. Smaller segments are merged.
+@param criteria Termination criteria. See TermCriteria.
+ */
 CV_EXPORTS void meanShiftSegmentation(InputArray src, OutputArray dst, int sp, int sr, int minsize,
                                       TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1));
 
 /////////////////////////// Match Template ////////////////////////////
 
-//! computes the proximity map for the raster template and the image where the template is searched for
+/** @brief Base class for Template Matching.
+ */
 class CV_EXPORTS TemplateMatching : public Algorithm
 {
 public:
+    /** @brief Computes a proximity map for a raster template and an image where the template is searched for.
+
+    @param image Source image.
+    @param templ Template image with the size and type the same as image .
+    @param result Map containing comparison results ( CV\_32FC1 ). If image is *W x H* and templ is *w
+    x h*, then result must be *W-w+1 x H-h+1*.
+    @param stream Stream for the asynchronous version.
+     */
     virtual void match(InputArray image, InputArray templ, OutputArray result, Stream& stream = Stream::Null()) = 0;
 };
 
+/** @brief Creates implementation for cuda::TemplateMatching .
+
+@param srcType Input source type. CV\_32F and CV\_8U depth images (1..4 channels) are supported
+for now.
+@param method Specifies the way to compare the template with the image.
+@param user\_block\_size You can use field user\_block\_size to set specific block size. If you
+leave its default value Size(0,0) then automatic estimation of block size will be used (which is
+optimized for speed). By varying user\_block\_size you can reduce memory requirements at the cost
+of speed.
+
+The following methods are supported for the CV\_8U depth images for now:
+
+-   CV\_TM\_SQDIFF
+-   CV\_TM\_SQDIFF\_NORMED
+-   CV\_TM\_CCORR
+-   CV\_TM\_CCORR\_NORMED
+-   CV\_TM\_CCOEFF
+-   CV\_TM\_CCOEFF\_NORMED
+
+The following methods are supported for the CV\_32F images for now:
+
+-   CV\_TM\_SQDIFF
+-   CV\_TM\_CCORR
+
+@sa matchTemplate
+ */
 CV_EXPORTS Ptr<TemplateMatching> createTemplateMatching(int srcType, int method, Size user_block_size = Size());
 
 ////////////////////////// Bilateral Filter ///////////////////////////
 
-//! Performa bilateral filtering of passsed image
+/** @brief Performs bilateral filtering of the passed image.
+
+@param src Source image. Supports only (channels != 2 && depth() != CV\_8S && depth() != CV\_32S
+&& depth() != CV\_64F).
+@param dst Destination image.
+@param kernel\_size Kernel window size.
+@param sigma\_color Filter sigma in the color space.
+@param sigma\_spatial Filter sigma in the coordinate space.
+@param borderMode Border type. See borderInterpolate for details. BORDER\_REFLECT101 ,
+BORDER\_REPLICATE , BORDER\_CONSTANT , BORDER\_REFLECT and BORDER\_WRAP are supported for now.
+@param stream Stream for the asynchronous version.
+
+@sa bilateralFilter
+ */
 CV_EXPORTS void bilateralFilter(InputArray src, OutputArray dst, int kernel_size, float sigma_color, float sigma_spatial,
                                 int borderMode = BORDER_DEFAULT, Stream& stream = Stream::Null());
 
 ///////////////////////////// Blending ////////////////////////////////
 
-//! performs linear blending of two images
-//! to avoid accuracy errors sum of weigths shouldn't be very close to zero
+/** @brief Performs linear blending of two images.
+
+@param img1 First image. Supports only CV\_8U and CV\_32F depth.
+@param img2 Second image. Must have the same size and the same type as img1 .
+@param weights1 Weights for first image. Must have the same size as img1 . Supports only CV\_32F
+type.
+@param weights2 Weights for second image. Must have the same size as img2 . Supports only CV\_32F
+type.
+@param result Destination image.
+@param stream Stream for the asynchronous version.
+ */
 CV_EXPORTS void blendLinear(InputArray img1, InputArray img2, InputArray weights1, InputArray weights2,
                             OutputArray result, Stream& stream = Stream::Null());
 
+//! @}
+
 }} // namespace cv { namespace cuda {
 
 #endif /* __OPENCV_CUDAIMGPROC_HPP__ */
diff --git a/modules/cudalegacy/include/opencv2/cudalegacy.hpp b/modules/cudalegacy/include/opencv2/cudalegacy.hpp
index 5ae75cd31..a72ef09c7 100644
--- a/modules/cudalegacy/include/opencv2/cudalegacy.hpp
+++ b/modules/cudalegacy/include/opencv2/cudalegacy.hpp
@@ -49,4 +49,11 @@
 #include "opencv2/cudalegacy/NCVHaarObjectDetection.hpp"
 #include "opencv2/cudalegacy/NCVBroxOpticalFlow.hpp"
 
+/**
+  @addtogroup cuda
+  @{
+    @defgroup cudalegacy Legacy support
+  @}
+*/
+
 #endif /* __OPENCV_CUDALEGACY_HPP__ */
diff --git a/modules/cudalegacy/include/opencv2/cudalegacy/NCV.hpp b/modules/cudalegacy/include/opencv2/cudalegacy/NCV.hpp
index cb84c23ad..6eb65b0d8 100644
--- a/modules/cudalegacy/include/opencv2/cudalegacy/NCV.hpp
+++ b/modules/cudalegacy/include/opencv2/cudalegacy/NCV.hpp
@@ -60,6 +60,8 @@
 //
 //==============================================================================
 
+//! @addtogroup cudalegacy
+//! @{
 
 /**
 * Compile-time assert namespace
@@ -1023,6 +1025,6 @@ CV_EXPORTS NCVStatus ncvDrawRects_32u_device(Ncv32u *d_dst, Ncv32u dstStride, Nc
     NCVMatrixAlloc<type> name(alloc, width, height); \
     ncvAssertReturn(name.isMemAllocated(), err);
 
-
+//! @}
 
 #endif // _ncv_hpp_
diff --git a/modules/cudalegacy/include/opencv2/cudalegacy/NCVBroxOpticalFlow.hpp b/modules/cudalegacy/include/opencv2/cudalegacy/NCVBroxOpticalFlow.hpp
index 777000cf7..c14532b48 100644
--- a/modules/cudalegacy/include/opencv2/cudalegacy/NCVBroxOpticalFlow.hpp
+++ b/modules/cudalegacy/include/opencv2/cudalegacy/NCVBroxOpticalFlow.hpp
@@ -62,6 +62,9 @@
 
 #include "opencv2/cudalegacy/NCV.hpp"
 
+//! @addtogroup cudalegacy
+//! @{
+
 /// \brief Model and solver parameters
 struct NCVBroxOpticalFlowDescriptor
 {
@@ -89,6 +92,7 @@ struct NCVBroxOpticalFlowDescriptor
 /// \param [in]  frame1            frame to track
 /// \param [out] u                 flow horizontal component (along \b x axis)
 /// \param [out] v                 flow vertical component (along \b y axis)
+/// \param       stream
 /// \return                        computation status
 /////////////////////////////////////////////////////////////////////////////////////////
 
@@ -101,4 +105,6 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
                              NCVMatrix<Ncv32f> &v,
                              cudaStream_t stream);
 
+//! @}
+
 #endif
diff --git a/modules/cudalegacy/include/opencv2/cudalegacy/NCVHaarObjectDetection.hpp b/modules/cudalegacy/include/opencv2/cudalegacy/NCVHaarObjectDetection.hpp
index 6c69cbd5a..6b84e8b25 100644
--- a/modules/cudalegacy/include/opencv2/cudalegacy/NCVHaarObjectDetection.hpp
+++ b/modules/cudalegacy/include/opencv2/cudalegacy/NCVHaarObjectDetection.hpp
@@ -61,6 +61,8 @@
 
 #include "opencv2/cudalegacy/NCV.hpp"
 
+//! @addtogroup cudalegacy
+//! @{
 
 //==============================================================================
 //
@@ -456,6 +458,6 @@ CV_EXPORTS NCVStatus ncvHaarStoreNVBIN_host(const cv::String &filename,
                                              NCVVector<HaarClassifierNode128> &h_HaarNodes,
                                              NCVVector<HaarFeature64> &h_HaarFeatures);
 
-
+//! @}
 
 #endif // _ncvhaarobjectdetection_hpp_
diff --git a/modules/cudalegacy/include/opencv2/cudalegacy/NCVPyramid.hpp b/modules/cudalegacy/include/opencv2/cudalegacy/NCVPyramid.hpp
index 7ec22a367..9f4501a5a 100644
--- a/modules/cudalegacy/include/opencv2/cudalegacy/NCVPyramid.hpp
+++ b/modules/cudalegacy/include/opencv2/cudalegacy/NCVPyramid.hpp
@@ -48,6 +48,8 @@
 #include "opencv2/cudalegacy/NCV.hpp"
 #include "opencv2/core/cuda/common.hpp"
 
+//! @cond IGNORED
+
 namespace cv { namespace cuda { namespace device
 {
     namespace pyramid
@@ -106,4 +108,6 @@ private:
 
 #endif //_WIN32
 
+//! @endcond
+
 #endif //_ncvpyramid_hpp_
diff --git a/modules/cudalegacy/include/opencv2/cudalegacy/NPP_staging.hpp b/modules/cudalegacy/include/opencv2/cudalegacy/NPP_staging.hpp
index 979ceef41..6cc50d7a4 100644
--- a/modules/cudalegacy/include/opencv2/cudalegacy/NPP_staging.hpp
+++ b/modules/cudalegacy/include/opencv2/cudalegacy/NPP_staging.hpp
@@ -45,19 +45,14 @@
 
 #include "opencv2/cudalegacy/NCV.hpp"
 
-
-/**
-* \file NPP_staging.hpp
-* NPP Staging Library
-*/
-
+//! @addtogroup cudalegacy
+//! @{
 
 /** \defgroup core_npp NPPST Core
  * Basic functions for CUDA streams management.
  * @{
  */
 
-
 /**
  * Gets an active CUDA stream used by NPPST
  * NOT THREAD SAFE
@@ -168,6 +163,7 @@ NCVStatus nppiStInterpolateFrames(const NppStInterpolationState *pState);
  * \param nSrcStep          [IN]  Source image line step
  * \param pDst              [OUT] Destination image pointer (CUDA device memory)
  * \param dstSize           [OUT] Destination image size
+ * \param nDstStep          [IN]  Destination image line step
  * \param oROI              [IN]  Region of interest in the source image
  * \param borderType        [IN]  Type of border
  * \param pKernel           [IN]  Pointer to row kernel values (CUDA device memory)
@@ -201,6 +197,7 @@ NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc,
  * \param nSrcStep          [IN]  Source image line step
  * \param pDst              [OUT] Destination image pointer (CUDA device memory)
  * \param dstSize           [OUT] Destination image size
+ * \param nDstStep          [IN]  Destination image line step
  * \param oROI              [IN]  Region of interest in the source image
  * \param borderType        [IN]  Type of border
  * \param pKernel           [IN]  Pointer to column kernel values (CUDA device memory)
@@ -228,7 +225,7 @@ NCVStatus nppiStFilterColumnBorder_32f_C1R(const Ncv32f *pSrc,
 /** Size of buffer required for vector image warping.
  *
  * \param srcSize           [IN]  Source image size
- * \param nStep             [IN]  Source image line step
+ * \param nSrcStep          [IN]  Source image line step
  * \param hpSize            [OUT] Where to store computed size (host memory)
  *
  * \return NCV status code
@@ -285,6 +282,7 @@ NCVStatus nppiStVectorWarp_PSF1x1_32f_C1(const Ncv32f *pSrc,
  * \param pU                [IN]  Pointer to horizontal displacement field (CUDA device memory)
  * \param pV                [IN]  Pointer to vertical displacement field (CUDA device memory)
  * \param nVFStep           [IN]  Displacement field line step
+ * \param pBuffer
  * \param timeScale         [IN]  Value by which displacement field will be scaled for warping
  * \param pDst              [OUT] Destination image pointer (CUDA device memory)
  *
@@ -903,5 +901,6 @@ NCVStatus nppsStCompact_32f_host(Ncv32f *h_src, Ncv32u srcLen,
 
 /*@}*/
 
+//! @}
 
 #endif // _npp_staging_hpp_
diff --git a/modules/cudalegacy/include/opencv2/cudalegacy/private.hpp b/modules/cudalegacy/include/opencv2/cudalegacy/private.hpp
index 41c23836d..721748099 100644
--- a/modules/cudalegacy/include/opencv2/cudalegacy/private.hpp
+++ b/modules/cudalegacy/include/opencv2/cudalegacy/private.hpp
@@ -56,6 +56,8 @@
 
 #include "opencv2/cudalegacy.hpp"
 
+//! @cond IGNORED
+
 namespace cv { namespace cuda
 {
     class NppStStreamHandler
@@ -89,4 +91,6 @@ namespace cv { namespace cuda
 
 #define ncvSafeCall(expr)  cv::cuda::checkNcvError(expr, __FILE__, __LINE__, CV_Func)
 
+//! @endcond
+
 #endif // __OPENCV_CORE_CUDALEGACY_PRIVATE_HPP__
diff --git a/modules/cudaoptflow/include/opencv2/cudaoptflow.hpp b/modules/cudaoptflow/include/opencv2/cudaoptflow.hpp
index d07a834ef..694ad8d87 100644
--- a/modules/cudaoptflow/include/opencv2/cudaoptflow.hpp
+++ b/modules/cudaoptflow/include/opencv2/cudaoptflow.hpp
@@ -49,8 +49,21 @@
 
 #include "opencv2/core/cuda.hpp"
 
+/**
+  @addtogroup cuda
+  @{
+    @defgroup cudaoptflow Optical Flow
+  @}
+ */
+
 namespace cv { namespace cuda {
 
+//! @addtogroup cudaoptflow
+//! @{
+
+/** @brief Class computing the optical flow for two images using Brox et al Optical Flow algorithm
+(@cite Brox2004).
+ */
 class CV_EXPORTS BroxOpticalFlow
 {
 public:
@@ -88,16 +101,58 @@ public:
     GpuMat buf;
 };
 
+/** @brief Class used for calculating an optical flow.
+
+The class can calculate an optical flow for a sparse feature set or dense optical flow using the
+iterative Lucas-Kanade method with pyramids.
+
+@sa calcOpticalFlowPyrLK
+
+@note
+   -   An example of the Lucas Kanade optical flow algorithm can be found at
+        opencv\_source\_code/samples/gpu/pyrlk\_optical\_flow.cpp
+ */
 class CV_EXPORTS PyrLKOpticalFlow
 {
 public:
     PyrLKOpticalFlow();
 
+    /** @brief Calculate an optical flow for a sparse feature set.
+
+    @param prevImg First 8-bit input image (supports both grayscale and color images).
+    @param nextImg Second input image of the same size and the same type as prevImg .
+    @param prevPts Vector of 2D points for which the flow needs to be found. It must be one row matrix
+    with CV\_32FC2 type.
+    @param nextPts Output vector of 2D points (with single-precision floating-point coordinates)
+    containing the calculated new positions of input features in the second image. When useInitialFlow
+    is true, the vector must have the same size as in the input.
+    @param status Output status vector (CV\_8UC1 type). Each element of the vector is set to 1 if the
+    flow for the corresponding features has been found. Otherwise, it is set to 0.
+    @param err Output vector (CV\_32FC1 type) that contains the difference between patches around the
+    original and moved points or min eigen value if getMinEigenVals is checked. It can be NULL, if not
+    needed.
+
+    @sa calcOpticalFlowPyrLK
+     */
     void sparse(const GpuMat& prevImg, const GpuMat& nextImg, const GpuMat& prevPts, GpuMat& nextPts,
         GpuMat& status, GpuMat* err = 0);
 
+    /** @brief Calculate dense optical flow.
+
+    @param prevImg First 8-bit grayscale input image.
+    @param nextImg Second input image of the same size and the same type as prevImg .
+    @param u Horizontal component of the optical flow of the same size as input images, 32-bit
+    floating-point, single-channel
+    @param v Vertical component of the optical flow of the same size as input images, 32-bit
+    floating-point, single-channel
+    @param err Output vector (CV\_32FC1 type) that contains the difference between patches around the
+    original and moved points or min eigen value if getMinEigenVals is checked. It can be NULL, if not
+    needed.
+     */
     void dense(const GpuMat& prevImg, const GpuMat& nextImg, GpuMat& u, GpuMat& v, GpuMat* err = 0);
 
+    /** @brief Releases inner buffers memory.
+    */
     void releaseMemory();
 
     Size winSize;
@@ -115,6 +170,8 @@ private:
     GpuMat vPyr_[2];
 };
 
+/** @brief Class computing a dense optical flow using Gunnar Farneback's algorithm.
+ */
 class CV_EXPORTS FarnebackOpticalFlow
 {
 public:
@@ -139,8 +196,20 @@ public:
     double polySigma;
     int flags;
 
+    /** @brief Computes a dense optical flow using Gunnar Farneback's algorithm.
+
+    @param frame0 First 8-bit gray-scale input image
+    @param frame1 Second 8-bit gray-scale input image
+    @param flowx Flow horizontal component
+    @param flowy Flow vertical component
+    @param s Stream
+
+    @sa calcOpticalFlowFarneback
+     */
     void operator ()(const GpuMat &frame0, const GpuMat &frame1, GpuMat &flowx, GpuMat &flowy, Stream &s = Stream::Null());
 
+    /** @brief Releases unused auxiliary memory buffers.
+     */
     void releaseMemory()
     {
         frames_[0].release();
@@ -295,20 +364,22 @@ private:
     GpuMat extended_I1;
 };
 
-//! Interpolate frames (images) using provided optical flow (displacement field).
-//! frame0   - frame 0 (32-bit floating point images, single channel)
-//! frame1   - frame 1 (the same type and size)
-//! fu       - forward horizontal displacement
-//! fv       - forward vertical displacement
-//! bu       - backward horizontal displacement
-//! bv       - backward vertical displacement
-//! pos      - new frame position
-//! newFrame - new frame
-//! buf      - temporary buffer, will have width x 6*height size, CV_32FC1 type and contain 6 GpuMat;
-//!            occlusion masks            0, occlusion masks            1,
-//!            interpolated forward flow  0, interpolated forward flow  1,
-//!            interpolated backward flow 0, interpolated backward flow 1
-//!
+/** @brief Interpolates frames (images) using provided optical flow (displacement field).
+
+@param frame0 First frame (32-bit floating point images, single channel).
+@param frame1 Second frame. Must have the same type and size as frame0 .
+@param fu Forward horizontal displacement.
+@param fv Forward vertical displacement.
+@param bu Backward horizontal displacement.
+@param bv Backward vertical displacement.
+@param pos New frame position.
+@param newFrame Output image.
+@param buf Temporary buffer, will have width x 6\*height size, CV\_32FC1 type and contain 6
+GpuMat: occlusion masks for first frame, occlusion masks for second, interpolated forward
+horizontal flow, interpolated forward vertical flow, interpolated backward horizontal flow,
+interpolated backward vertical flow.
+@param stream Stream for the asynchronous version.
+ */
 CV_EXPORTS void interpolateFrames(const GpuMat& frame0, const GpuMat& frame1,
                                   const GpuMat& fu, const GpuMat& fv,
                                   const GpuMat& bu, const GpuMat& bv,
@@ -317,6 +388,8 @@ CV_EXPORTS void interpolateFrames(const GpuMat& frame0, const GpuMat& frame1,
 
 CV_EXPORTS void createOpticalFlowNeedleMap(const GpuMat& u, const GpuMat& v, GpuMat& vertex, GpuMat& colors);
 
+//! @}
+
 }} // namespace cv { namespace cuda {
 
 #endif /* __OPENCV_CUDAOPTFLOW_HPP__ */
diff --git a/modules/cudastereo/include/opencv2/cudastereo.hpp b/modules/cudastereo/include/opencv2/cudastereo.hpp
index a58156c43..ee39604e0 100644
--- a/modules/cudastereo/include/opencv2/cudastereo.hpp
+++ b/modules/cudastereo/include/opencv2/cudastereo.hpp
@@ -50,11 +50,25 @@
 #include "opencv2/core/cuda.hpp"
 #include "opencv2/calib3d.hpp"
 
+/**
+  @addtogroup cuda
+  @{
+    @defgroup cudastereo Stereo Correspondence
+  @}
+ */
+
 namespace cv { namespace cuda {
 
+//! @addtogroup cudastereo
+//! @{
+
 /////////////////////////////////////////
 // StereoBM
 
+/** @brief Class computing stereo correspondence (disparity map) using the block matching algorithm.
+
+@sa StereoBM
+ */
 class CV_EXPORTS StereoBM : public cv::StereoBM
 {
 public:
@@ -63,20 +77,70 @@ public:
     virtual void compute(InputArray left, InputArray right, OutputArray disparity, Stream& stream) = 0;
 };
 
+/** @brief Creates StereoBM object.
+
+@param numDisparities the disparity search range. For each pixel algorithm will find the best
+disparity from 0 (default minimum disparity) to numDisparities. The search range can then be
+shifted by changing the minimum disparity.
+@param blockSize the linear size of the blocks compared by the algorithm. The size should be odd
+(as the block is centered at the current pixel). Larger block size implies smoother, though less
+accurate disparity map. Smaller block size gives more detailed disparity map, but there is higher
+chance for algorithm to find a wrong correspondence.
+ */
 CV_EXPORTS Ptr<cuda::StereoBM> createStereoBM(int numDisparities = 64, int blockSize = 19);
 
 /////////////////////////////////////////
 // StereoBeliefPropagation
 
-//! "Efficient Belief Propagation for Early Vision" P.Felzenszwalb
+/** @brief Class computing stereo correspondence using the belief propagation algorithm.
+
+The class implements algorithm described in @cite Felzenszwalb2006 . It can compute own data cost
+(using a truncated linear model) or use a user-provided data cost.
+
+@note
+   StereoBeliefPropagation requires a lot of memory for message storage:
+
+    \f[width \_ step  \cdot height  \cdot ndisp  \cdot 4  \cdot (1 + 0.25)\f]
+
+    and for data cost storage:
+
+    \f[width\_step \cdot height \cdot ndisp \cdot (1 + 0.25 + 0.0625 +  \dotsm + \frac{1}{4^{levels}})\f]
+
+    width\_step is the number of bytes in a line including padding.
+
+StereoBeliefPropagation uses a truncated linear model for the data cost and discontinuity terms:
+
+\f[DataCost = data \_ weight  \cdot \min ( \lvert Img_{Left}(x,y)-Img_{Right}(x-d,y)  \rvert , max \_ data \_ term)\f]
+
+\f[DiscTerm =  \min (disc \_ single \_ jump  \cdot \lvert f_1-f_2  \rvert , max \_ disc \_ term)\f]
+
+For more details, see @cite Felzenszwalb2006.
+
+By default, StereoBeliefPropagation uses floating-point arithmetics and the CV\_32FC1 type for
+messages. But it can also use fixed-point arithmetics and the CV\_16SC1 message type for better
+performance. To avoid an overflow in this case, the parameters must satisfy the following
+requirement:
+
+\f[10  \cdot 2^{levels-1}  \cdot max \_ data \_ term < SHRT \_ MAX\f]
+
+@sa StereoMatcher
+ */
 class CV_EXPORTS StereoBeliefPropagation : public cv::StereoMatcher
 {
 public:
     using cv::StereoMatcher::compute;
 
+    /** @overload */
     virtual void compute(InputArray left, InputArray right, OutputArray disparity, Stream& stream) = 0;
 
-    //! version for user specified data term
+    /** @brief Enables the stereo correspondence operator that finds the disparity for the specified data cost.
+
+    @param data User-specified data cost, a matrix of msg\_type type and
+    Size(\<image columns\>\*ndisp, \<image rows\>) size.
+    @param disparity Output disparity map. If disparity is empty, the output type is CV\_16SC1 .
+    Otherwise, the type is retained.
+    @param stream Stream for the asynchronous version.
+     */
     virtual void compute(InputArray data, OutputArray disparity, Stream& stream = Stream::Null()) = 0;
 
     //! number of BP iterations on each level
@@ -107,18 +171,48 @@ public:
     virtual int getMsgType() const = 0;
     virtual void setMsgType(int msg_type) = 0;
 
+    /** @brief Uses a heuristic method to compute the recommended parameters ( ndisp, iters and levels ) for the
+    specified image size ( width and height ).
+     */
     static void estimateRecommendedParams(int width, int height, int& ndisp, int& iters, int& levels);
 };
 
+/** @brief Creates StereoBeliefPropagation object.
+
+@param ndisp Number of disparities.
+@param iters Number of BP iterations on each level.
+@param levels Number of levels.
+@param msg\_type Type for messages. CV\_16SC1 and CV\_32FC1 types are supported.
+ */
 CV_EXPORTS Ptr<cuda::StereoBeliefPropagation>
     createStereoBeliefPropagation(int ndisp = 64, int iters = 5, int levels = 5, int msg_type = CV_32F);
 
 /////////////////////////////////////////
 // StereoConstantSpaceBP
 
-//! "A Constant-Space Belief Propagation Algorithm for Stereo Matching"
-//! Qingxiong Yang, Liang Wang, Narendra Ahuja
-//! http://vision.ai.uiuc.edu/~qyang6/
+/** @brief Class computing stereo correspondence using the constant space belief propagation algorithm.
+
+The class implements algorithm described in @cite Yang2010. StereoConstantSpaceBP supports both local
+minimum and global minimum data cost initialization algorithms. For more details, see the paper
+mentioned above. By default, a local algorithm is used. To enable a global algorithm, set
+use\_local\_init\_data\_cost to false .
+
+StereoConstantSpaceBP uses a truncated linear model for the data cost and discontinuity terms:
+
+\f[DataCost = data \_ weight  \cdot \min ( \lvert I_2-I_1  \rvert , max \_ data \_ term)\f]
+
+\f[DiscTerm =  \min (disc \_ single \_ jump  \cdot \lvert f_1-f_2  \rvert , max \_ disc \_ term)\f]
+
+For more details, see @cite Yang2010.
+
+By default, StereoConstantSpaceBP uses floating-point arithmetics and the CV\_32FC1 type for
+messages. But it can also use fixed-point arithmetics and the CV\_16SC1 message type for better
+performance. To avoid an overflow in this case, the parameters must satisfy the following
+requirement:
+
+\f[10  \cdot 2^{levels-1}  \cdot max \_ data \_ term < SHRT \_ MAX\f]
+
+ */
 class CV_EXPORTS StereoConstantSpaceBP : public cuda::StereoBeliefPropagation
 {
 public:
@@ -129,23 +223,40 @@ public:
     virtual bool getUseLocalInitDataCost() const = 0;
     virtual void setUseLocalInitDataCost(bool use_local_init_data_cost) = 0;
 
+    /** @brief Uses a heuristic method to compute parameters (ndisp, iters, levels and nr\_plane) for the specified
+    image size (width and height).
+     */
     static void estimateRecommendedParams(int width, int height, int& ndisp, int& iters, int& levels, int& nr_plane);
 };
 
+/** @brief Creates StereoConstantSpaceBP object.
+
+@param ndisp Number of disparities.
+@param iters Number of BP iterations on each level.
+@param levels Number of levels.
+@param nr\_plane Number of disparity levels on the first level.
+@param msg\_type Type for messages. CV\_16SC1 and CV\_32FC1 types are supported.
+ */
 CV_EXPORTS Ptr<cuda::StereoConstantSpaceBP>
     createStereoConstantSpaceBP(int ndisp = 128, int iters = 8, int levels = 4, int nr_plane = 4, int msg_type = CV_32F);
 
 /////////////////////////////////////////
 // DisparityBilateralFilter
 
-//! Disparity map refinement using joint bilateral filtering given a single color image.
-//! Qingxiong Yang, Liang Wang, Narendra Ahuja
-//! http://vision.ai.uiuc.edu/~qyang6/
+/** @brief Class refining a disparity map using joint bilateral filtering.
+
+The class implements the algorithm described in @cite Yang2010.
+ */
 class CV_EXPORTS DisparityBilateralFilter : public cv::Algorithm
 {
 public:
-    //! the disparity map refinement operator. Refine disparity map using joint bilateral filtering given a single color image.
-    //! disparity must have CV_8U or CV_16S type, image must have CV_8UC1 or CV_8UC3 type.
+    /** @brief Refines a disparity map using joint bilateral filtering.
+
+    @param disparity Input disparity map. CV\_8UC1 and CV\_16SC1 types are supported.
+    @param image Input image. CV\_8UC1 and CV\_8UC3 types are supported.
+    @param dst Destination disparity map. It has the same size and type as disparity .
+    @param stream Stream for the asynchronous version.
+     */
     virtual void apply(InputArray disparity, InputArray image, OutputArray dst, Stream& stream = Stream::Null()) = 0;
 
     virtual int getNumDisparities() const = 0;
@@ -170,24 +281,48 @@ public:
     virtual void setSigmaRange(double sigma_range) = 0;
 };
 
+/** @brief Creates DisparityBilateralFilter object.
+
+@param ndisp Number of disparities.
+@param radius Filter radius.
+@param iters Number of iterations.
+ */
 CV_EXPORTS Ptr<cuda::DisparityBilateralFilter>
     createDisparityBilateralFilter(int ndisp = 64, int radius = 3, int iters = 1);
 
 /////////////////////////////////////////
 // Utility
 
-//! Reprojects disparity image to 3D space.
-//! Supports CV_8U and CV_16S types of input disparity.
-//! The output is a 3- or 4-channel floating-point matrix.
-//! Each element of this matrix will contain the 3D coordinates of the point (x,y,z,1), computed from the disparity map.
-//! Q is the 4x4 perspective transformation matrix that can be obtained with cvStereoRectify.
+/** @brief Reprojects a disparity image to 3D space.
+
+@param disp Input disparity image. CV\_8U and CV\_16S types are supported.
+@param xyzw Output 3- or 4-channel floating-point image of the same size as disp . Each element of
+xyzw(x,y) contains 3D coordinates (x,y,z) or (x,y,z,1) of the point (x,y) , computed from the
+disparity map.
+@param Q \f$4 \times 4\f$ perspective transformation matrix that can be obtained via stereoRectify .
+@param dst\_cn The number of channels for output image. Can be 3 or 4.
+@param stream Stream for the asynchronous version.
+
+@sa reprojectImageTo3D
+ */
 CV_EXPORTS void reprojectImageTo3D(InputArray disp, OutputArray xyzw, InputArray Q, int dst_cn = 4, Stream& stream = Stream::Null());
 
-//! Does coloring of disparity image: [0..ndisp) -> [0..240, 1, 1] in HSV.
-//! Supported types of input disparity: CV_8U, CV_16S.
-//! Output disparity has CV_8UC4 type in BGRA format (alpha = 255).
+/** @brief Colors a disparity image.
+
+@param src\_disp Source disparity image. CV\_8UC1 and CV\_16SC1 types are supported.
+@param dst\_disp Output disparity image. It has the same size as src\_disp . The type is CV\_8UC4
+in BGRA format (alpha = 255).
+@param ndisp Number of disparities.
+@param stream Stream for the asynchronous version.
+
+This function draws a colored disparity map by converting disparity values from [0..ndisp) interval
+first to HSV color space (where different disparity values correspond to different hues) and then
+converting the pixels to RGB for visualization.
+ */
 CV_EXPORTS void drawColorDisp(InputArray src_disp, OutputArray dst_disp, int ndisp, Stream& stream = Stream::Null());
 
+//! @}
+
 }} // namespace cv { namespace cuda {
 
 #endif /* __OPENCV_CUDASTEREO_HPP__ */
diff --git a/modules/cudawarping/include/opencv2/cudawarping.hpp b/modules/cudawarping/include/opencv2/cudawarping.hpp
index d759f5559..fc07fcc93 100644
--- a/modules/cudawarping/include/opencv2/cudawarping.hpp
+++ b/modules/cudawarping/include/opencv2/cudawarping.hpp
@@ -50,54 +50,178 @@
 #include "opencv2/core/cuda.hpp"
 #include "opencv2/imgproc.hpp"
 
+/**
+  @addtogroup cuda
+  @{
+    @defgroup cudawarping Image Warping
+  @}
+ */
+
 namespace cv { namespace cuda {
 
-//! DST[x,y] = SRC[xmap[x,y],ymap[x,y]]
-//! supports only CV_32FC1 map type
+//! @addtogroup cudawarping
+//! @{
+
+/** @brief Applies a generic geometrical transformation to an image.
+
+@param src Source image.
+@param dst Destination image with the size the same as xmap and the type the same as src .
+@param xmap X values. Only CV\_32FC1 type is supported.
+@param ymap Y values. Only CV\_32FC1 type is supported.
+@param interpolation Interpolation method (see resize ). INTER\_NEAREST , INTER\_LINEAR and
+INTER\_CUBIC are supported for now.
+@param borderMode Pixel extrapolation method (see borderInterpolate ). BORDER\_REFLECT101 ,
+BORDER\_REPLICATE , BORDER\_CONSTANT , BORDER\_REFLECT and BORDER\_WRAP are supported for now.
+@param borderValue Value used in case of a constant border. By default, it is 0.
+@param stream Stream for the asynchronous version.
+
+The function transforms the source image using the specified map:
+
+\f[\texttt{dst} (x,y) =  \texttt{src} (xmap(x,y), ymap(x,y))\f]
+
+Values of pixels with non-integer coordinates are computed using the selected interpolation method.
+
+@sa remap
+ */
 CV_EXPORTS void remap(InputArray src, OutputArray dst, InputArray xmap, InputArray ymap,
                       int interpolation, int borderMode = BORDER_CONSTANT, Scalar borderValue = Scalar(),
                       Stream& stream = Stream::Null());
 
-//! resizes the image
-//! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC, INTER_AREA
+/** @brief Resizes an image.
+
+@param src Source image.
+@param dst Destination image with the same type as src . The size is dsize (when it is non-zero)
+or the size is computed from src.size() , fx , and fy .
+@param dsize Destination image size. If it is zero, it is computed as:
+\f[\texttt{dsize = Size(round(fx*src.cols), round(fy*src.rows))}\f]
+Either dsize or both fx and fy must be non-zero.
+@param fx Scale factor along the horizontal axis. If it is zero, it is computed as:
+\f[\texttt{(double)dsize.width/src.cols}\f]
+@param fy Scale factor along the vertical axis. If it is zero, it is computed as:
+\f[\texttt{(double)dsize.height/src.rows}\f]
+@param interpolation Interpolation method. INTER\_NEAREST , INTER\_LINEAR and INTER\_CUBIC are
+supported for now.
+@param stream Stream for the asynchronous version.
+
+@sa resize
+ */
 CV_EXPORTS void resize(InputArray src, OutputArray dst, Size dsize, double fx=0, double fy=0, int interpolation = INTER_LINEAR, Stream& stream = Stream::Null());
 
-//! warps the image using affine transformation
-//! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC
+/** @brief Applies an affine transformation to an image.
+
+@param src Source image. CV\_8U , CV\_16U , CV\_32S , or CV\_32F depth and 1, 3, or 4 channels are
+supported.
+@param dst Destination image with the same type as src . The size is dsize .
+@param M *2x3* transformation matrix.
+@param dsize Size of the destination image.
+@param flags Combination of interpolation methods (see resize) and the optional flag
+WARP\_INVERSE\_MAP specifying that M is an inverse transformation ( dst=\>src ). Only
+INTER\_NEAREST , INTER\_LINEAR , and INTER\_CUBIC interpolation methods are supported.
+@param borderMode Pixel extrapolation method (see borderInterpolate ).
+@param borderValue Value used in case of a constant border.
+@param stream Stream for the asynchronous version.
+
+@sa warpAffine
+ */
 CV_EXPORTS void warpAffine(InputArray src, OutputArray dst, InputArray M, Size dsize, int flags = INTER_LINEAR,
     int borderMode = BORDER_CONSTANT, Scalar borderValue = Scalar(), Stream& stream = Stream::Null());
 
+/** @brief Builds transformation maps for affine transformation.
+
+@param M *2x3* transformation matrix.
+@param inverse Flag specifying that M is an inverse transformation ( dst=\>src ).
+@param dsize Size of the destination image.
+@param xmap X values with CV\_32FC1 type.
+@param ymap Y values with CV\_32FC1 type.
+@param stream Stream for the asynchronous version.
+
+@sa cuda::warpAffine , cuda::remap
+ */
 CV_EXPORTS void buildWarpAffineMaps(InputArray M, bool inverse, Size dsize, OutputArray xmap, OutputArray ymap, Stream& stream = Stream::Null());
 
-//! warps the image using perspective transformation
-//! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC
+/** @brief Applies a perspective transformation to an image.
+
+@param src Source image. CV\_8U , CV\_16U , CV\_32S , or CV\_32F depth and 1, 3, or 4 channels are
+supported.
+@param dst Destination image with the same type as src . The size is dsize .
+@param M *3x3* transformation matrix.
+@param dsize Size of the destination image.
+@param flags Combination of interpolation methods (see resize ) and the optional flag
+WARP\_INVERSE\_MAP specifying that M is the inverse transformation ( dst =\> src ). Only
+INTER\_NEAREST , INTER\_LINEAR , and INTER\_CUBIC interpolation methods are supported.
+@param borderMode Pixel extrapolation method (see borderInterpolate ).
+@param borderValue Value used in case of a constant border.
+@param stream Stream for the asynchronous version.
+
+@sa warpPerspective
+ */
 CV_EXPORTS void warpPerspective(InputArray src, OutputArray dst, InputArray M, Size dsize, int flags = INTER_LINEAR,
     int borderMode = BORDER_CONSTANT, Scalar borderValue = Scalar(), Stream& stream = Stream::Null());
 
+/** @brief Builds transformation maps for perspective transformation.
+
+@param M *3x3* transformation matrix.
+@param inverse Flag specifying that M is an inverse transformation ( dst=\>src ).
+@param dsize Size of the destination image.
+@param xmap X values with CV\_32FC1 type.
+@param ymap Y values with CV\_32FC1 type.
+@param stream Stream for the asynchronous version.
+
+@sa cuda::warpPerspective , cuda::remap
+ */
 CV_EXPORTS void buildWarpPerspectiveMaps(InputArray M, bool inverse, Size dsize, OutputArray xmap, OutputArray ymap, Stream& stream = Stream::Null());
 
-//! builds plane warping maps
+/** @brief Builds plane warping maps.
+ */
 CV_EXPORTS void buildWarpPlaneMaps(Size src_size, Rect dst_roi, InputArray K, InputArray R, InputArray T, float scale,
                                    OutputArray map_x, OutputArray map_y, Stream& stream = Stream::Null());
 
-//! builds cylindrical warping maps
+/** @brief Builds cylindrical warping maps.
+ */
 CV_EXPORTS void buildWarpCylindricalMaps(Size src_size, Rect dst_roi, InputArray K, InputArray R, float scale,
                                          OutputArray map_x, OutputArray map_y, Stream& stream = Stream::Null());
 
-//! builds spherical warping maps
+/** @brief Builds spherical warping maps.
+ */
 CV_EXPORTS void buildWarpSphericalMaps(Size src_size, Rect dst_roi, InputArray K, InputArray R, float scale,
                                        OutputArray map_x, OutputArray map_y, Stream& stream = Stream::Null());
 
-//! rotates an image around the origin (0,0) and then shifts it
-//! supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC
-//! supports 1, 3 or 4 channels images with CV_8U, CV_16U or CV_32F depth
+/** @brief Rotates an image around the origin (0,0) and then shifts it.
+
+@param src Source image. Supports 1-, 3- or 4-channel images with CV\_8U , CV\_16U or CV\_32F
+depth.
+@param dst Destination image with the same type as src . The size is dsize .
+@param dsize Size of the destination image.
+@param angle Angle of rotation in degrees.
+@param xShift Shift along the horizontal axis.
+@param yShift Shift along the vertical axis.
+@param interpolation Interpolation method. Only INTER\_NEAREST , INTER\_LINEAR , and INTER\_CUBIC
+are supported.
+@param stream Stream for the asynchronous version.
+
+@sa cuda::warpAffine
+ */
 CV_EXPORTS void rotate(InputArray src, OutputArray dst, Size dsize, double angle, double xShift = 0, double yShift = 0,
                        int interpolation = INTER_LINEAR, Stream& stream = Stream::Null());
 
-//! smoothes the source image and downsamples it
+/** @brief Smooths an image and downsamples it.
+
+@param src Source image.
+@param dst Destination image. Will have Size((src.cols+1)/2, (src.rows+1)/2) size and the same
+type as src .
+@param stream Stream for the asynchronous version.
+
+@sa pyrDown
+ */
 CV_EXPORTS void pyrDown(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
 
-//! upsamples the source image and then smoothes it
+/** @brief Upsamples an image and then smooths it.
+
+@param src Source image.
+@param dst Destination image. Will have Size(src.cols\*2, src.rows\*2) size and the same type as
+src .
+@param stream Stream for the asynchronous version.
+ */
 CV_EXPORTS void pyrUp(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
 
 class CV_EXPORTS ImagePyramid : public Algorithm
@@ -108,6 +232,8 @@ public:
 
 CV_EXPORTS Ptr<ImagePyramid> createImagePyramid(InputArray img, int nLayers = -1, Stream& stream = Stream::Null());
 
+//! @}
+
 }} // namespace cv { namespace cuda {
 
 #endif /* __OPENCV_CUDAWARPING_HPP__ */
diff --git a/modules/cudev/include/opencv2/cudev.hpp b/modules/cudev/include/opencv2/cudev.hpp
index a5fb4f696..565efa1c6 100644
--- a/modules/cudev/include/opencv2/cudev.hpp
+++ b/modules/cudev/include/opencv2/cudev.hpp
@@ -109,4 +109,11 @@
 #include "cudev/expr/unary_op.hpp"
 #include "cudev/expr/warping.hpp"
 
+/**
+  @addtogroup cuda
+  @{
+    @defgroup cudev Device layer
+  @}
+*/
+
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/block/block.hpp b/modules/cudev/include/opencv2/cudev/block/block.hpp
index 385e1713e..e8d59bb20 100644
--- a/modules/cudev/include/opencv2/cudev/block/block.hpp
+++ b/modules/cudev/include/opencv2/cudev/block/block.hpp
@@ -50,6 +50,9 @@
 
 namespace cv { namespace cudev {
 
+//! @addtogroup cudev
+//! @{
+
 struct Block
 {
     __device__ __forceinline__ static uint blockId()
@@ -122,6 +125,9 @@ __device__ __forceinline__ static void blockTransfrom(InIt1 beg1, InIt1 end1, In
     for(; t1 < end1; t1 += STRIDE, t2 += STRIDE, o += STRIDE)
         *o = op(*t1, *t2);
 }
+
+//! @}
+
 }}
 
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/block/dynamic_smem.hpp b/modules/cudev/include/opencv2/cudev/block/dynamic_smem.hpp
index 9f9ba6000..e52f829bf 100644
--- a/modules/cudev/include/opencv2/cudev/block/dynamic_smem.hpp
+++ b/modules/cudev/include/opencv2/cudev/block/dynamic_smem.hpp
@@ -50,6 +50,9 @@
 
 namespace cv { namespace cudev {
 
+//! @addtogroup cudev
+//! @{
+
 template <class T> struct DynamicSharedMem
 {
     __device__ __forceinline__ operator T*()
@@ -81,6 +84,8 @@ template <> struct DynamicSharedMem<double>
     }
 };
 
+//! @}
+
 }}
 
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/block/reduce.hpp b/modules/cudev/include/opencv2/cudev/block/reduce.hpp
index 4c9022631..74c8fcac7 100644
--- a/modules/cudev/include/opencv2/cudev/block/reduce.hpp
+++ b/modules/cudev/include/opencv2/cudev/block/reduce.hpp
@@ -54,6 +54,9 @@
 
 namespace cv { namespace cudev {
 
+//! @addtogroup cudev
+//! @{
+
 // blockReduce
 
 template <int N, typename T, class Op>
@@ -123,6 +126,8 @@ __device__ __forceinline__ void blockReduceKeyVal(const tuple<KP0, KP1, KP2, KP3
             >(skeys, key, svals, val, tid, cmp);
 }
 
+//! @}
+
 }}
 
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/block/scan.hpp b/modules/cudev/include/opencv2/cudev/block/scan.hpp
index c54dfef9f..3369cff98 100644
--- a/modules/cudev/include/opencv2/cudev/block/scan.hpp
+++ b/modules/cudev/include/opencv2/cudev/block/scan.hpp
@@ -51,6 +51,9 @@
 
 namespace cv { namespace cudev {
 
+//! @addtogroup cudev
+//! @{
+
 template <int THREADS_NUM, typename T>
 __device__ T blockScanInclusive(T data, volatile T* smem, uint tid)
 {
@@ -96,6 +99,8 @@ __device__ __forceinline__ T blockScanExclusive(T data, volatile T* smem, uint t
     return blockScanInclusive<THREADS_NUM>(data, smem, tid) - data;
 }
 
+//! @}
+
 }}
 
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/block/vec_distance.hpp b/modules/cudev/include/opencv2/cudev/block/vec_distance.hpp
index c48e9146e..767d32a46 100644
--- a/modules/cudev/include/opencv2/cudev/block/vec_distance.hpp
+++ b/modules/cudev/include/opencv2/cudev/block/vec_distance.hpp
@@ -53,6 +53,9 @@
 
 namespace cv { namespace cudev {
 
+//! @addtogroup cudev
+//! @{
+
 // NormL1
 
 template <typename T> struct NormL1
@@ -179,6 +182,8 @@ struct NormHamming
     }
 };
 
+//! @}
+
 }}
 
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/common.hpp b/modules/cudev/include/opencv2/cudev/common.hpp
index c8a7b7da2..f475e20b6 100644
--- a/modules/cudev/include/opencv2/cudev/common.hpp
+++ b/modules/cudev/include/opencv2/cudev/common.hpp
@@ -52,6 +52,9 @@
 
 namespace cv { namespace cudev {
 
+//! @addtogroup cudev
+//! @{
+
 using namespace cv::cuda;
 
 // CV_CUDEV_ARCH
@@ -84,6 +87,8 @@ __host__ __device__ __forceinline__ int divUp(int total, int grain)
 #define CV_PI_F   ((float)CV_PI)
 #define CV_LOG2_F ((float)CV_LOG2)
 
+//! @}
+
 }}
 
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/expr/binary_func.hpp b/modules/cudev/include/opencv2/cudev/expr/binary_func.hpp
index f35ea2dc3..2777a1e18 100644
--- a/modules/cudev/include/opencv2/cudev/expr/binary_func.hpp
+++ b/modules/cudev/include/opencv2/cudev/expr/binary_func.hpp
@@ -55,6 +55,9 @@
 
 namespace cv { namespace cudev {
 
+//! @addtogroup cudev
+//! @{
+
 #define CV_CUDEV_EXPR_BINARY_FUNC(name) \
     template <class SrcPtr1, class SrcPtr2> \
     __host__ Expr<BinaryTransformPtrSz<typename PtrTraits<SrcPtr1>::ptr_type, typename PtrTraits<SrcPtr2>::ptr_type, name ## _func<typename LargerType<typename PtrTraits<SrcPtr1>::value_type, typename PtrTraits<SrcPtr2>::value_type>::type> > > \
@@ -70,6 +73,8 @@ CV_CUDEV_EXPR_BINARY_FUNC(absdiff)
 
 #undef CV_CUDEV_EXPR_BINARY_FUNC
 
+//! @}
+
 }}
 
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/expr/binary_op.hpp b/modules/cudev/include/opencv2/cudev/expr/binary_op.hpp
index f7e965572..7533946fc 100644
--- a/modules/cudev/include/opencv2/cudev/expr/binary_op.hpp
+++ b/modules/cudev/include/opencv2/cudev/expr/binary_op.hpp
@@ -58,6 +58,9 @@
 
 namespace cv { namespace cudev {
 
+//! @addtogroup cudev
+//! @{
+
 // Binary Operations
 
 #define CV_CUDEV_EXPR_BINOP_INST(op, functor) \
@@ -230,6 +233,8 @@ CV_CUDEV_EXPR_BINOP_INST(>>, bit_rshift)
 
 #undef CV_CUDEV_EXPR_BINOP_INST
 
+//! @}
+
 }}
 
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/expr/color.hpp b/modules/cudev/include/opencv2/cudev/expr/color.hpp
index 13f07c15a..f53de78b3 100644
--- a/modules/cudev/include/opencv2/cudev/expr/color.hpp
+++ b/modules/cudev/include/opencv2/cudev/expr/color.hpp
@@ -54,6 +54,9 @@
 
 namespace cv { namespace cudev {
 
+//! @addtogroup cudev
+//! @{
+
 #define CV_CUDEV_EXPR_CVTCOLOR_INST(name) \
     template <class SrcPtr> \
     __host__ Expr<UnaryTransformPtrSz<typename PtrTraits<SrcPtr>::ptr_type, name ## _func<typename VecTraits<typename PtrTraits<SrcPtr>::value_type>::elem_type> > > \
@@ -277,6 +280,8 @@ CV_CUDEV_EXPR_CVTCOLOR_INST(Luv4_to_LBGRA)
 
 #undef CV_CUDEV_EXPR_CVTCOLOR_INST
 
+//! @}
+
 }}
 
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/expr/deriv.hpp b/modules/cudev/include/opencv2/cudev/expr/deriv.hpp
index 822a86b9a..da51cc711 100644
--- a/modules/cudev/include/opencv2/cudev/expr/deriv.hpp
+++ b/modules/cudev/include/opencv2/cudev/expr/deriv.hpp
@@ -53,6 +53,9 @@
 
 namespace cv { namespace cudev {
 
+//! @addtogroup cudev
+//! @{
+
 // derivX
 
 template <class SrcPtr>
@@ -116,6 +119,8 @@ laplacian_(const SrcPtr& src)
     return makeExpr(laplacianPtr<ksize>(src));
 }
 
+//! @}
+
 }}
 
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/expr/expr.hpp b/modules/cudev/include/opencv2/cudev/expr/expr.hpp
index 46c780b4a..cdc861217 100644
--- a/modules/cudev/include/opencv2/cudev/expr/expr.hpp
+++ b/modules/cudev/include/opencv2/cudev/expr/expr.hpp
@@ -51,6 +51,9 @@
 
 namespace cv { namespace cudev {
 
+//! @addtogroup cudev
+//! @{
+
 template <class Body> struct Expr
 {
     Body body;
@@ -87,6 +90,8 @@ template <class Body> struct PtrTraits< Expr<Body> >
     }
 };
 
+//! @}
+
 }}
 
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/expr/per_element_func.hpp b/modules/cudev/include/opencv2/cudev/expr/per_element_func.hpp
index 56a067de9..d7ecd3bb0 100644
--- a/modules/cudev/include/opencv2/cudev/expr/per_element_func.hpp
+++ b/modules/cudev/include/opencv2/cudev/expr/per_element_func.hpp
@@ -56,6 +56,9 @@
 
 namespace cv { namespace cudev {
 
+//! @addtogroup cudev
+//! @{
+
 // min/max
 
 template <class SrcPtr1, class SrcPtr2>
@@ -127,6 +130,8 @@ lut_(const SrcPtr& src, const TablePtr& tbl)
     return makeExpr(lutPtr(src, tbl));
 }
 
+//! @}
+
 }}
 
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/expr/reduction.hpp b/modules/cudev/include/opencv2/cudev/expr/reduction.hpp
index 1f0a3ff0e..598fb4f86 100644
--- a/modules/cudev/include/opencv2/cudev/expr/reduction.hpp
+++ b/modules/cudev/include/opencv2/cudev/expr/reduction.hpp
@@ -56,6 +56,9 @@
 
 namespace cv { namespace cudev {
 
+//! @addtogroup cudev
+//! @{
+
 // sum
 
 template <class SrcPtr> struct SumExprBody
@@ -254,6 +257,8 @@ integral_(const SrcPtr& src)
     return makeExpr(body);
 }
 
+//! @}
+
 }}
 
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/expr/unary_func.hpp b/modules/cudev/include/opencv2/cudev/expr/unary_func.hpp
index a30f6a6f3..b19cec827 100644
--- a/modules/cudev/include/opencv2/cudev/expr/unary_func.hpp
+++ b/modules/cudev/include/opencv2/cudev/expr/unary_func.hpp
@@ -54,6 +54,9 @@
 
 namespace cv { namespace cudev {
 
+//! @addtogroup cudev
+//! @{
+
 #define CV_CUDEV_EXPR_UNARY_FUNC(name) \
     template <class SrcPtr> \
     __host__ Expr<UnaryTransformPtrSz<typename PtrTraits<SrcPtr>::ptr_type, name ## _func<typename PtrTraits<SrcPtr>::value_type> > > \
@@ -93,6 +96,8 @@ pow_(const SrcPtr& src, float power)
     return makeExpr(transformPtr(src, bind2nd(pow_func<typename PtrTraits<SrcPtr>::value_type>(), power)));
 }
 
+//! @}
+
 }}
 
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/expr/unary_op.hpp b/modules/cudev/include/opencv2/cudev/expr/unary_op.hpp
index 905013e42..c5fabe4ac 100644
--- a/modules/cudev/include/opencv2/cudev/expr/unary_op.hpp
+++ b/modules/cudev/include/opencv2/cudev/expr/unary_op.hpp
@@ -57,6 +57,9 @@
 
 namespace cv { namespace cudev {
 
+//! @addtogroup cudev
+//! @{
+
 #define CV_CUDEV_EXPR_UNOP_INST(op, functor) \
     template <typename T> \
     __host__ Expr<UnaryTransformPtrSz<typename PtrTraits<GpuMat_<T> >::ptr_type, functor<T> > > \
@@ -89,6 +92,8 @@ CV_CUDEV_EXPR_UNOP_INST(~, bit_not)
 
 #undef CV_CUDEV_EXPR_UNOP_INST
 
+//! @}
+
 }}
 
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/expr/warping.hpp b/modules/cudev/include/opencv2/cudev/expr/warping.hpp
index f942a3fb6..e1f78b968 100644
--- a/modules/cudev/include/opencv2/cudev/expr/warping.hpp
+++ b/modules/cudev/include/opencv2/cudev/expr/warping.hpp
@@ -57,6 +57,9 @@
 
 namespace cv { namespace cudev {
 
+//! @addtogroup cudev
+//! @{
+
 // resize
 
 template <class SrcPtr>
@@ -166,6 +169,8 @@ transpose_(const SrcPtr& src)
     return makeExpr(body);
 }
 
+//! @}
+
 }}
 
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/functional/color_cvt.hpp b/modules/cudev/include/opencv2/cudev/functional/color_cvt.hpp
index 8be854780..5134d04ed 100644
--- a/modules/cudev/include/opencv2/cudev/functional/color_cvt.hpp
+++ b/modules/cudev/include/opencv2/cudev/functional/color_cvt.hpp
@@ -51,6 +51,9 @@
 
 namespace cv { namespace cudev {
 
+//! @addtogroup cudev
+//! @{
+
 // Various 3/4-channel to 3/4-channel RGB transformations
 
 #define CV_CUDEV_RGB2RGB_INST(name, scn, dcn, bidx) \
@@ -469,6 +472,8 @@ CV_CUDEV_RGB5x52GRAY_INST(BGR565_to_GRAY, 6)
 
 #undef CV_CUDEV_RGB5x52GRAY_INST
 
+//! @}
+
 }}
 
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/functional/functional.hpp b/modules/cudev/include/opencv2/cudev/functional/functional.hpp
index 7934f78b9..125b66f07 100644
--- a/modules/cudev/include/opencv2/cudev/functional/functional.hpp
+++ b/modules/cudev/include/opencv2/cudev/functional/functional.hpp
@@ -54,6 +54,9 @@
 
 namespace cv { namespace cudev {
 
+//! @addtogroup cudev
+//! @{
+
 // Function Objects
 
 template <typename _Arg, typename _Result> struct unary_function
@@ -873,6 +876,8 @@ template <typename F> struct IsBinaryFunction
     enum { value = (sizeof(check(makeF())) == sizeof(Yes)) };
 };
 
+//! @}
+
 }}
 
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/functional/tuple_adapter.hpp b/modules/cudev/include/opencv2/cudev/functional/tuple_adapter.hpp
index d3a40db0e..ff075dc2b 100644
--- a/modules/cudev/include/opencv2/cudev/functional/tuple_adapter.hpp
+++ b/modules/cudev/include/opencv2/cudev/functional/tuple_adapter.hpp
@@ -51,6 +51,9 @@
 
 namespace cv { namespace cudev {
 
+//! @addtogroup cudev
+//! @{
+
 template <class Op, int n> struct UnaryTupleAdapter
 {
     typedef typename Op::result_type result_type;
@@ -93,6 +96,8 @@ __host__ __device__ BinaryTupleAdapter<Op, n0, n1> binaryTupleAdapter(const Op&
     return a;
 }
 
+//! @}
+
 }}
 
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/grid/copy.hpp b/modules/cudev/include/opencv2/cudev/grid/copy.hpp
index d7d3ea834..1d30f9976 100644
--- a/modules/cudev/include/opencv2/cudev/grid/copy.hpp
+++ b/modules/cudev/include/opencv2/cudev/grid/copy.hpp
@@ -57,6 +57,9 @@
 
 namespace cv { namespace cudev {
 
+//! @addtogroup cudev
+//! @{
+
 template <class Policy, class SrcPtr, typename DstType, class MaskPtr>
 __host__ void gridCopy_(const SrcPtr& src, GpuMat_<DstType>& dst, const MaskPtr& mask, Stream& stream = Stream::Null())
 {
@@ -447,6 +450,8 @@ __host__ void gridCopy_(const SrcPtrTuple& src, const tuple< GlobPtrSz<D0>, Glob
     gridCopy_<DefaultCopyPolicy>(src, dst, stream);
 }
 
+//! @}
+
 }}
 
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/grid/histogram.hpp b/modules/cudev/include/opencv2/cudev/grid/histogram.hpp
index ecb1a19c8..154f73771 100644
--- a/modules/cudev/include/opencv2/cudev/grid/histogram.hpp
+++ b/modules/cudev/include/opencv2/cudev/grid/histogram.hpp
@@ -54,6 +54,9 @@
 
 namespace cv { namespace cudev {
 
+//! @addtogroup cudev
+//! @{
+
 template <int BIN_COUNT, class Policy, class SrcPtr, typename ResType, class MaskPtr>
 __host__ void gridHistogram_(const SrcPtr& src, GpuMat_<ResType>& dst, const MaskPtr& mask, Stream& stream = Stream::Null())
 {
@@ -114,6 +117,8 @@ __host__ void gridHistogram(const SrcPtr& src, GpuMat_<ResType>& dst, Stream& st
     gridHistogram_<BIN_COUNT, DefaultHistogramPolicy>(src, dst, stream);
 }
 
+//! @}
+
 }}
 
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/grid/integral.hpp b/modules/cudev/include/opencv2/cudev/grid/integral.hpp
index d948c1267..6312f4477 100644
--- a/modules/cudev/include/opencv2/cudev/grid/integral.hpp
+++ b/modules/cudev/include/opencv2/cudev/grid/integral.hpp
@@ -53,6 +53,9 @@
 
 namespace cv { namespace cudev {
 
+//! @addtogroup cudev
+//! @{
+
 template <class SrcPtr, typename DstType>
 __host__ void gridIntegral(const SrcPtr& src, GpuMat_<DstType>& dst, Stream& stream = Stream::Null())
 {
@@ -64,6 +67,8 @@ __host__ void gridIntegral(const SrcPtr& src, GpuMat_<DstType>& dst, Stream& str
     integral_detail::integral(shrinkPtr(src), shrinkPtr(dst), rows, cols, StreamAccessor::getStream(stream));
 }
 
+//! @}
+
 }}
 
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/grid/pyramids.hpp b/modules/cudev/include/opencv2/cudev/grid/pyramids.hpp
index 99833bd3f..22eafe69f 100644
--- a/modules/cudev/include/opencv2/cudev/grid/pyramids.hpp
+++ b/modules/cudev/include/opencv2/cudev/grid/pyramids.hpp
@@ -55,6 +55,9 @@
 
 namespace cv { namespace cudev {
 
+//! @addtogroup cudev
+//! @{
+
 template <class Brd, class SrcPtr, typename DstType>
 __host__ void gridPyrDown_(const SrcPtr& src, GpuMat_<DstType>& dst, Stream& stream = Stream::Null())
 {
@@ -83,6 +86,8 @@ __host__ void gridPyrUp(const SrcPtr& src, GpuMat_<DstType>& dst, Stream& stream
     pyramids_detail::pyrUp(shrinkPtr(src), shrinkPtr(dst), rows, cols, dst.rows, dst.cols, StreamAccessor::getStream(stream));
 }
 
+//! @}
+
 }}
 
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/grid/reduce.hpp b/modules/cudev/include/opencv2/cudev/grid/reduce.hpp
index 3861ae228..4551bc886 100644
--- a/modules/cudev/include/opencv2/cudev/grid/reduce.hpp
+++ b/modules/cudev/include/opencv2/cudev/grid/reduce.hpp
@@ -57,6 +57,9 @@
 
 namespace cv { namespace cudev {
 
+//! @addtogroup cudev
+//! @{
+
 template <class Policy, class SrcPtr, typename ResType, class MaskPtr>
 __host__ void gridCalcSum_(const SrcPtr& src, GpuMat_<ResType>& dst, const MaskPtr& mask, Stream& stream = Stream::Null())
 {
@@ -370,6 +373,8 @@ __host__ void gridCountNonZero(const SrcPtr& src, GpuMat_<ResType>& dst, Stream&
     gridCountNonZero_<DefaultGlobReducePolicy>(src, dst, stream);
 }
 
+//! @}
+
 }}
 
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/grid/reduce_to_vec.hpp b/modules/cudev/include/opencv2/cudev/grid/reduce_to_vec.hpp
index 361d40d1c..595ee8be6 100644
--- a/modules/cudev/include/opencv2/cudev/grid/reduce_to_vec.hpp
+++ b/modules/cudev/include/opencv2/cudev/grid/reduce_to_vec.hpp
@@ -59,6 +59,9 @@
 
 namespace cv { namespace cudev {
 
+//! @addtogroup cudev
+//! @{
+
 template <typename T> struct Sum : plus<T>
 {
     typedef T work_type;
@@ -225,6 +228,8 @@ __host__ void gridReduceToColumn(const SrcPtr& src, GpuMat_<ResType>& dst, Strea
     gridReduceToColumn_<Reductor, DefaultReduceToVecPolicy>(src, dst, stream);
 }
 
+//! @}
+
 }}
 
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/grid/split_merge.hpp b/modules/cudev/include/opencv2/cudev/grid/split_merge.hpp
index ed7e8ee60..1a7134793 100644
--- a/modules/cudev/include/opencv2/cudev/grid/split_merge.hpp
+++ b/modules/cudev/include/opencv2/cudev/grid/split_merge.hpp
@@ -57,6 +57,9 @@
 
 namespace cv { namespace cudev {
 
+//! @addtogroup cudev
+//! @{
+
 template <class Policy, class SrcPtrTuple, typename DstType, class MaskPtr>
 __host__ void gridMerge_(const SrcPtrTuple& src, GpuMat_<DstType>& dst, const MaskPtr& mask, Stream& stream = Stream::Null())
 {
@@ -579,6 +582,8 @@ __host__ void gridSplit(const SrcPtr& src, GlobPtrSz<DstType> (&dst)[COUNT], Str
     gridSplit_<DefaultSplitMergePolicy>(src, dst, stream);
 }
 
+//! @}
+
 }}
 
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/grid/transform.hpp b/modules/cudev/include/opencv2/cudev/grid/transform.hpp
index 62555ab5a..2f16f7d39 100644
--- a/modules/cudev/include/opencv2/cudev/grid/transform.hpp
+++ b/modules/cudev/include/opencv2/cudev/grid/transform.hpp
@@ -57,6 +57,9 @@
 
 namespace cv { namespace cudev {
 
+//! @addtogroup cudev
+//! @{
+
 template <class Policy, class SrcPtr, typename DstType, class UnOp, class MaskPtr>
 __host__ void gridTransformUnary_(const SrcPtr& src, GpuMat_<DstType>& dst, const UnOp& op, const MaskPtr& mask, Stream& stream = Stream::Null())
 {
@@ -536,6 +539,8 @@ __host__ void gridTransformTuple(const SrcPtr& src, const tuple< GlobPtrSz<D0>,
     gridTransformTuple_<DefaultTransformPolicy>(src, dst, op, stream);
 }
 
+//! @}
+
 }}
 
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/grid/transpose.hpp b/modules/cudev/include/opencv2/cudev/grid/transpose.hpp
index cf1bf8303..0d7a19573 100644
--- a/modules/cudev/include/opencv2/cudev/grid/transpose.hpp
+++ b/modules/cudev/include/opencv2/cudev/grid/transpose.hpp
@@ -54,6 +54,9 @@
 
 namespace cv { namespace cudev {
 
+//! @addtogroup cudev
+//! @{
+
 template <class Policy, class SrcPtr, typename DstType>
 __host__ void gridTranspose_(const SrcPtr& src, GpuMat_<DstType>& dst, Stream& stream = Stream::Null())
 {
@@ -98,6 +101,8 @@ __host__ void gridTranspose(const SrcPtr& src, const GlobPtrSz<DstType>& dst, St
     gridTranspose_<DefaultTransposePolicy>(src, dst, stream);
 }
 
+//! @}
+
 }}
 
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/ptr2d/constant.hpp b/modules/cudev/include/opencv2/cudev/ptr2d/constant.hpp
index d3c56e771..b3c5f5f23 100644
--- a/modules/cudev/include/opencv2/cudev/ptr2d/constant.hpp
+++ b/modules/cudev/include/opencv2/cudev/ptr2d/constant.hpp
@@ -51,6 +51,9 @@
 
 namespace cv { namespace cudev {
 
+//! @addtogroup cudev
+//! @{
+
 template <typename T> struct ConstantPtr
 {
     typedef T   value_type;
@@ -88,6 +91,8 @@ template <typename T> struct PtrTraits< ConstantPtrSz<T> > : PtrTraitsBase< Cons
 {
 };
 
+//! @}
+
 }}
 
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/ptr2d/deriv.hpp b/modules/cudev/include/opencv2/cudev/ptr2d/deriv.hpp
index 097007400..95088177f 100644
--- a/modules/cudev/include/opencv2/cudev/ptr2d/deriv.hpp
+++ b/modules/cudev/include/opencv2/cudev/ptr2d/deriv.hpp
@@ -53,6 +53,9 @@
 
 namespace cv { namespace cudev {
 
+//! @addtogroup cudev
+//! @{
+
 // derivX
 
 template <class SrcPtr> struct DerivXPtr
@@ -388,6 +391,8 @@ template <int ksize, class SrcPtr> struct PtrTraits< LaplacianPtrSz<ksize, SrcPt
 {
 };
 
+//! @}
+
 }}
 
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/ptr2d/extrapolation.hpp b/modules/cudev/include/opencv2/cudev/ptr2d/extrapolation.hpp
index c90e79174..a5f2776f2 100644
--- a/modules/cudev/include/opencv2/cudev/ptr2d/extrapolation.hpp
+++ b/modules/cudev/include/opencv2/cudev/ptr2d/extrapolation.hpp
@@ -52,6 +52,9 @@
 
 namespace cv { namespace cudev {
 
+//! @addtogroup cudev
+//! @{
+
 // BrdConstant
 
 template <class SrcPtr> struct BrdConstant
@@ -214,6 +217,8 @@ __host__ BrdBase<BrdWrap, typename PtrTraits<SrcPtr>::ptr_type> brdWrap(const Sr
     return b;
 }
 
+//! @}
+
 }}
 
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/ptr2d/glob.hpp b/modules/cudev/include/opencv2/cudev/ptr2d/glob.hpp
index 738592663..3563e56fc 100644
--- a/modules/cudev/include/opencv2/cudev/ptr2d/glob.hpp
+++ b/modules/cudev/include/opencv2/cudev/ptr2d/glob.hpp
@@ -51,6 +51,9 @@
 
 namespace cv { namespace cudev {
 
+//! @addtogroup cudev
+//! @{
+
 template <typename T> struct GlobPtr
 {
     typedef T   value_type;
@@ -106,6 +109,8 @@ template <typename T> struct PtrTraits< GlobPtrSz<T> > : PtrTraitsBase<GlobPtrSz
 {
 };
 
+//! @}
+
 }}
 
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/ptr2d/gpumat.hpp b/modules/cudev/include/opencv2/cudev/ptr2d/gpumat.hpp
index 50bb0874f..02d8cb773 100644
--- a/modules/cudev/include/opencv2/cudev/ptr2d/gpumat.hpp
+++ b/modules/cudev/include/opencv2/cudev/ptr2d/gpumat.hpp
@@ -53,6 +53,9 @@
 
 namespace cv { namespace cudev {
 
+//! @addtogroup cudev
+//! @{
+
 template <typename T>
 class GpuMat_ : public GpuMat
 {
@@ -154,6 +157,8 @@ template <typename T> struct PtrTraits< GpuMat_<T> > : PtrTraitsBase<GpuMat_<T>,
 {
 };
 
+//! @}
+
 }}
 
 #include "detail/gpumat.hpp"
diff --git a/modules/cudev/include/opencv2/cudev/ptr2d/interpolation.hpp b/modules/cudev/include/opencv2/cudev/ptr2d/interpolation.hpp
index e86d7191e..256d4fd00 100644
--- a/modules/cudev/include/opencv2/cudev/ptr2d/interpolation.hpp
+++ b/modules/cudev/include/opencv2/cudev/ptr2d/interpolation.hpp
@@ -55,6 +55,9 @@
 
 namespace cv { namespace cudev {
 
+//! @addtogroup cudev
+//! @{
+
 // Nearest
 
 template <class SrcPtr> struct NearestInterPtr
@@ -380,6 +383,8 @@ template <class SrcPtr> struct PtrTraits< CommonAreaInterPtrSz<SrcPtr> > : PtrTr
 {
 };
 
+//! @}
+
 }}
 
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/ptr2d/lut.hpp b/modules/cudev/include/opencv2/cudev/ptr2d/lut.hpp
index accf54561..26a3725c0 100644
--- a/modules/cudev/include/opencv2/cudev/ptr2d/lut.hpp
+++ b/modules/cudev/include/opencv2/cudev/ptr2d/lut.hpp
@@ -54,6 +54,9 @@
 
 namespace cv { namespace cudev {
 
+//! @addtogroup cudev
+//! @{
+
 template <class SrcPtr, class TablePtr> struct LutPtr
 {
     typedef typename PtrTraits<TablePtr>::value_type value_type;
@@ -95,6 +98,8 @@ template <class SrcPtr, class TablePtr> struct PtrTraits< LutPtrSz<SrcPtr, Table
 {
 };
 
+//! @}
+
 }}
 
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/ptr2d/mask.hpp b/modules/cudev/include/opencv2/cudev/ptr2d/mask.hpp
index d425fa318..bbd2f1ae0 100644
--- a/modules/cudev/include/opencv2/cudev/ptr2d/mask.hpp
+++ b/modules/cudev/include/opencv2/cudev/ptr2d/mask.hpp
@@ -51,6 +51,9 @@
 
 namespace cv { namespace cudev {
 
+//! @addtogroup cudev
+//! @{
+
 struct WithOutMask
 {
     typedef bool value_type;
@@ -98,6 +101,8 @@ template <class MaskPtr> struct PtrTraits< SingleMaskChannelsSz<MaskPtr> > : Ptr
 {
 };
 
+//! @}
+
 }}
 
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/ptr2d/remap.hpp b/modules/cudev/include/opencv2/cudev/ptr2d/remap.hpp
index db2669a40..9d8745f94 100644
--- a/modules/cudev/include/opencv2/cudev/ptr2d/remap.hpp
+++ b/modules/cudev/include/opencv2/cudev/ptr2d/remap.hpp
@@ -54,6 +54,9 @@
 
 namespace cv { namespace cudev {
 
+//! @addtogroup cudev
+//! @{
+
 template <class SrcPtr, class MapPtr> struct RemapPtr1
 {
     typedef typename PtrTraits<SrcPtr>::value_type value_type;
@@ -149,6 +152,8 @@ template <class SrcPtr, class MapXPtr, class MapYPtr> struct PtrTraits< RemapPtr
 {
 };
 
+//! @}
+
 }}
 
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/ptr2d/resize.hpp b/modules/cudev/include/opencv2/cudev/ptr2d/resize.hpp
index 10a4bad90..63ae7eb8a 100644
--- a/modules/cudev/include/opencv2/cudev/ptr2d/resize.hpp
+++ b/modules/cudev/include/opencv2/cudev/ptr2d/resize.hpp
@@ -54,6 +54,9 @@
 
 namespace cv { namespace cudev {
 
+//! @addtogroup cudev
+//! @{
+
 template <class SrcPtr> struct ResizePtr
 {
     typedef typename PtrTraits<SrcPtr>::value_type value_type;
@@ -98,6 +101,8 @@ template <class SrcPtr> struct PtrTraits< ResizePtrSz<SrcPtr> > : PtrTraitsBase<
 {
 };
 
+//! @}
+
 }}
 
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/ptr2d/texture.hpp b/modules/cudev/include/opencv2/cudev/ptr2d/texture.hpp
index 6df4a783d..6fa83e631 100644
--- a/modules/cudev/include/opencv2/cudev/ptr2d/texture.hpp
+++ b/modules/cudev/include/opencv2/cudev/ptr2d/texture.hpp
@@ -92,6 +92,9 @@ namespace
 
 namespace cv { namespace cudev {
 
+//! @addtogroup cudev
+//! @{
+
 #if CUDART_VERSION >= 5050
 
 template <typename T> struct TexturePtr
@@ -248,6 +251,8 @@ template <typename T> struct PtrTraits< Texture<T> > : PtrTraitsBase<Texture<T>,
 
 #endif
 
+//! @}
+
 }}
 
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/ptr2d/traits.hpp b/modules/cudev/include/opencv2/cudev/ptr2d/traits.hpp
index 7fb4b32b1..f1552cafe 100644
--- a/modules/cudev/include/opencv2/cudev/ptr2d/traits.hpp
+++ b/modules/cudev/include/opencv2/cudev/ptr2d/traits.hpp
@@ -50,6 +50,9 @@
 
 namespace cv { namespace cudev {
 
+//! @addtogroup cudev
+//! @{
+
 template <class Ptr2DSz, class Ptr2D> struct PtrTraitsBase
 {
     typedef Ptr2DSz ptr_sz_type;
@@ -96,6 +99,8 @@ __host__ int getCols(const Ptr2DSz& ptr)
     return PtrTraits<Ptr2DSz>::getCols(ptr);
 }
 
+//! @}
+
 }}
 
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/ptr2d/transform.hpp b/modules/cudev/include/opencv2/cudev/ptr2d/transform.hpp
index f540e7521..b6edb913d 100644
--- a/modules/cudev/include/opencv2/cudev/ptr2d/transform.hpp
+++ b/modules/cudev/include/opencv2/cudev/ptr2d/transform.hpp
@@ -53,6 +53,9 @@
 
 namespace cv { namespace cudev {
 
+//! @addtogroup cudev
+//! @{
+
 // UnaryTransformPtr
 
 template <class SrcPtr, class Op> struct UnaryTransformPtr
@@ -146,6 +149,8 @@ template <class Src1Ptr, class Src2Ptr, class Op> struct PtrTraits< BinaryTransf
 {
 };
 
+//! @}
+
 }}
 
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/ptr2d/warping.hpp b/modules/cudev/include/opencv2/cudev/ptr2d/warping.hpp
index 80e5fbeef..c9d00833f 100644
--- a/modules/cudev/include/opencv2/cudev/ptr2d/warping.hpp
+++ b/modules/cudev/include/opencv2/cudev/ptr2d/warping.hpp
@@ -53,6 +53,9 @@
 
 namespace cv { namespace cudev {
 
+//! @addtogroup cudev
+//! @{
+
 // affine
 
 struct AffineMapPtr
@@ -147,6 +150,8 @@ warpPerspectivePtr(const SrcPtr& src, Size dstSize, const GpuMat_<float>& warpMa
     return remapPtr(src, perspectiveMap(dstSize, warpMat));
 }
 
+//! @}
+
 }}
 
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/ptr2d/zip.hpp b/modules/cudev/include/opencv2/cudev/ptr2d/zip.hpp
index 934939f62..368848248 100644
--- a/modules/cudev/include/opencv2/cudev/ptr2d/zip.hpp
+++ b/modules/cudev/include/opencv2/cudev/ptr2d/zip.hpp
@@ -52,6 +52,9 @@
 
 namespace cv { namespace cudev {
 
+//! @addtogroup cudev
+//! @{
+
 template <class PtrTuple> struct ZipPtr;
 
 template <class Ptr0, class Ptr1> struct ZipPtr< tuple<Ptr0, Ptr1> > : tuple<Ptr0, Ptr1>
@@ -168,6 +171,8 @@ template <class PtrTuple> struct PtrTraits< ZipPtrSz<PtrTuple> > : PtrTraitsBase
 {
 };
 
+//! @}
+
 }}
 
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/util/atomic.hpp b/modules/cudev/include/opencv2/cudev/util/atomic.hpp
index 2da110231..a88cd99b3 100644
--- a/modules/cudev/include/opencv2/cudev/util/atomic.hpp
+++ b/modules/cudev/include/opencv2/cudev/util/atomic.hpp
@@ -50,6 +50,9 @@
 
 namespace cv { namespace cudev {
 
+//! @addtogroup cudev
+//! @{
+
 // atomicAdd
 
 __device__ __forceinline__ int atomicAdd(int* address, int val)
@@ -192,6 +195,8 @@ __device__ static double atomicMax(double* address, double val)
 #endif
 }
 
+//! @}
+
 }}
 
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/util/limits.hpp b/modules/cudev/include/opencv2/cudev/util/limits.hpp
index 58faca6b5..71e7faa77 100644
--- a/modules/cudev/include/opencv2/cudev/util/limits.hpp
+++ b/modules/cudev/include/opencv2/cudev/util/limits.hpp
@@ -52,6 +52,9 @@
 
 namespace cv { namespace cudev {
 
+//! @addtogroup cudev
+//! @{
+
 template <class T> struct numeric_limits;
 
 template <> struct numeric_limits<bool>
@@ -119,6 +122,8 @@ template <> struct numeric_limits<double>
     static const bool is_signed = true;
 };
 
+//! @}
+
 }}
 
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/util/saturate_cast.hpp b/modules/cudev/include/opencv2/cudev/util/saturate_cast.hpp
index ff7ce8598..3176542d2 100644
--- a/modules/cudev/include/opencv2/cudev/util/saturate_cast.hpp
+++ b/modules/cudev/include/opencv2/cudev/util/saturate_cast.hpp
@@ -50,6 +50,9 @@
 
 namespace cv { namespace cudev {
 
+//! @addtogroup cudev
+//! @{
+
 template <typename T> __device__ __forceinline__ T saturate_cast(uchar v) { return T(v); }
 template <typename T> __device__ __forceinline__ T saturate_cast(schar v) { return T(v); }
 template <typename T> __device__ __forceinline__ T saturate_cast(ushort v) { return T(v); }
@@ -267,6 +270,8 @@ template <> __device__ __forceinline__ uint saturate_cast<uint>(double v)
 #endif
 }
 
+//! @}
+
 }}
 
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/util/simd_functions.hpp b/modules/cudev/include/opencv2/cudev/util/simd_functions.hpp
index db63f5180..2dd6f12ac 100644
--- a/modules/cudev/include/opencv2/cudev/util/simd_functions.hpp
+++ b/modules/cudev/include/opencv2/cudev/util/simd_functions.hpp
@@ -128,6 +128,9 @@
 
 namespace cv { namespace cudev {
 
+//! @addtogroup cudev
+//! @{
+
 // 2
 
 __device__ __forceinline__ uint vadd2(uint a, uint b)
@@ -908,6 +911,8 @@ __device__ __forceinline__ uint vmin4(uint a, uint b)
     return r;
 }
 
+//! @}
+
 }}
 
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/util/tuple.hpp b/modules/cudev/include/opencv2/cudev/util/tuple.hpp
index b015ff344..70d0424bd 100644
--- a/modules/cudev/include/opencv2/cudev/util/tuple.hpp
+++ b/modules/cudev/include/opencv2/cudev/util/tuple.hpp
@@ -51,6 +51,9 @@
 
 namespace cv { namespace cudev {
 
+//! @addtogroup cudev
+//! @{
+
 using tuple_detail::tuple;
 using tuple_detail::tuple_size;
 using tuple_detail::get;
@@ -75,6 +78,8 @@ template <class Tuple, template <typename T> class CvtOp> struct ConvertTuple
     typedef typename tuple_detail::ConvertTuple<Tuple, tuple_size<Tuple>::value, CvtOp>::type type;
 };
 
+//! @}
+
 }}
 
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/util/type_traits.hpp b/modules/cudev/include/opencv2/cudev/util/type_traits.hpp
index ca800c0b7..acd1d3ba2 100644
--- a/modules/cudev/include/opencv2/cudev/util/type_traits.hpp
+++ b/modules/cudev/include/opencv2/cudev/util/type_traits.hpp
@@ -52,6 +52,9 @@
 
 namespace cv { namespace cudev {
 
+//! @addtogroup cudev
+//! @{
+
 // NullType
 
 struct NullType {};
@@ -164,6 +167,8 @@ template <typename A, typename B> struct LargerType
     >::type type;
 };
 
+//! @}
+
 }}
 
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/util/vec_math.hpp b/modules/cudev/include/opencv2/cudev/util/vec_math.hpp
index 361ef7b29..82fa06e9c 100644
--- a/modules/cudev/include/opencv2/cudev/util/vec_math.hpp
+++ b/modules/cudev/include/opencv2/cudev/util/vec_math.hpp
@@ -51,6 +51,9 @@
 
 namespace cv { namespace cudev {
 
+//! @addtogroup cudev
+//! @{
+
 // saturate_cast
 
 namespace vec_math_detail
@@ -931,6 +934,8 @@ CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2, double, double, double)
 
 #undef CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC
 
+//! @}
+
 }}
 
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/util/vec_traits.hpp b/modules/cudev/include/opencv2/cudev/util/vec_traits.hpp
index 585423dd5..9bb5678e6 100644
--- a/modules/cudev/include/opencv2/cudev/util/vec_traits.hpp
+++ b/modules/cudev/include/opencv2/cudev/util/vec_traits.hpp
@@ -50,6 +50,9 @@
 
 namespace cv { namespace cudev {
 
+//! @addtogroup cudev
+//! @{
+
 // MakeVec
 
 template<typename T, int CN> struct MakeVec;
@@ -177,6 +180,8 @@ template<> struct VecTraits<char4>
     __host__ __device__ __forceinline__ static char4 make(const schar* v) {return make_char4(v[0], v[1], v[2], v[3]);}
 };
 
+//! @}
+
 }}
 
 // DataType
diff --git a/modules/cudev/include/opencv2/cudev/warp/reduce.hpp b/modules/cudev/include/opencv2/cudev/warp/reduce.hpp
index 089ef92d0..f3919c2fe 100644
--- a/modules/cudev/include/opencv2/cudev/warp/reduce.hpp
+++ b/modules/cudev/include/opencv2/cudev/warp/reduce.hpp
@@ -53,6 +53,9 @@
 
 namespace cv { namespace cudev {
 
+//! @addtogroup cudev
+//! @{
+
 // warpReduce
 
 template <typename T, class Op>
@@ -201,6 +204,8 @@ smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7, T8* t
     return make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6, (volatile T7*) t7, (volatile T8*) t8, (volatile T9*) t9);
 }
 
+//! @}
+
 }}
 
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/warp/scan.hpp b/modules/cudev/include/opencv2/cudev/warp/scan.hpp
index acd032fb0..a4402986d 100644
--- a/modules/cudev/include/opencv2/cudev/warp/scan.hpp
+++ b/modules/cudev/include/opencv2/cudev/warp/scan.hpp
@@ -52,6 +52,9 @@
 
 namespace cv { namespace cudev {
 
+//! @addtogroup cudev
+//! @{
+
 template <typename T>
 __device__ T warpScanInclusive(T data, volatile T* smem, uint tid)
 {
@@ -94,6 +97,8 @@ __device__ __forceinline__ T warpScanExclusive(T data, volatile T* smem, uint ti
     return warpScanInclusive(data, smem, tid) - data;
 }
 
+//! @}
+
 }}
 
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/warp/shuffle.hpp b/modules/cudev/include/opencv2/cudev/warp/shuffle.hpp
index a6aae5b90..97af06972 100644
--- a/modules/cudev/include/opencv2/cudev/warp/shuffle.hpp
+++ b/modules/cudev/include/opencv2/cudev/warp/shuffle.hpp
@@ -51,6 +51,9 @@
 
 namespace cv { namespace cudev {
 
+//! @addtogroup cudev
+//! @{
+
 #if CV_CUDEV_ARCH >= 300
 
 // shfl
@@ -419,6 +422,8 @@ CV_CUDEV_SHFL_XOR_VEC_INST(double)
 
 #endif // CV_CUDEV_ARCH >= 300
 
+//! @}
+
 }}
 
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/warp/warp.hpp b/modules/cudev/include/opencv2/cudev/warp/warp.hpp
index c7649880f..61caea259 100644
--- a/modules/cudev/include/opencv2/cudev/warp/warp.hpp
+++ b/modules/cudev/include/opencv2/cudev/warp/warp.hpp
@@ -50,6 +50,9 @@
 
 namespace cv { namespace cudev {
 
+//! @addtogroup cudev
+//! @{
+
 enum
 {
     LOG_WARP_SIZE = 5,
@@ -117,6 +120,8 @@ __device__ __forceinline__ void warpYota(OutIt beg, OutIt end, T value)
         *t = value;
 }
 
+//! @}
+
 }}
 
 #endif
diff --git a/modules/viz/include/opencv2/viz/types.hpp b/modules/viz/include/opencv2/viz/types.hpp
index 0e638a929..169a67442 100644
--- a/modules/viz/include/opencv2/viz/types.hpp
+++ b/modules/viz/include/opencv2/viz/types.hpp
@@ -187,6 +187,8 @@ namespace cv
     } /* namespace viz */
 } /* namespace cv */
 
+//! @cond IGNORED
+
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 /// cv::viz::Color
 
@@ -237,4 +239,6 @@ inline cv::viz::Color cv::viz::Color::amethyst()       { return Color(204, 102,
 
 inline cv::viz::Color cv::viz::Color::not_set()        { return Color(-1, -1, -1); }
 
+//! @endcond
+
 #endif