Merge remote-tracking branch 'upstream/master' into rho

This commit is contained in:
Olexa Bilaniuk 2014-12-29 20:55:24 -05:00
commit 46631c4b18
169 changed files with 4747 additions and 3490 deletions

View File

@ -181,6 +181,7 @@ OCV_OPTION(BUILD_PERF_TESTS "Build performance tests"
OCV_OPTION(BUILD_TESTS "Build accuracy & regression tests" ON IF (NOT IOS) )
OCV_OPTION(BUILD_WITH_DEBUG_INFO "Include debug info into debug libs (not MSCV only)" ON )
OCV_OPTION(BUILD_WITH_STATIC_CRT "Enables use of staticaly linked CRT for staticaly linked OpenCV" ON IF MSVC )
OCV_OPTION(BUILD_WITH_DYNAMIC_IPP "Enables dynamic linking of IPP (only for standalone IPP)" OFF )
OCV_OPTION(BUILD_FAT_JAVA_LIB "Create fat java wrapper containing the whole OpenCV library" ON IF NOT BUILD_SHARED_LIBS AND CMAKE_COMPILER_IS_GNUCXX )
OCV_OPTION(BUILD_ANDROID_SERVICE "Build OpenCV Manager for Google Play" OFF IF ANDROID AND ANDROID_SOURCE_TREE )
OCV_OPTION(BUILD_ANDROID_PACKAGE "Build platform-specific package for Google Play" OFF IF ANDROID )
@ -293,6 +294,16 @@ if(NOT OPENCV_TEST_INSTALL_PATH)
set(OPENCV_TEST_INSTALL_PATH "${OPENCV_BIN_INSTALL_PATH}")
endif()
if(OPENCV_TEST_DATA_PATH AND NOT OPENCV_TEST_DATA_INSTALL_PATH)
if(ANDROID)
set(OPENCV_TEST_DATA_INSTALL_PATH "sdk/etc/testdata")
elseif(WIN32)
set(OPENCV_TEST_DATA_INSTALL_PATH "testdata")
else()
set(OPENCV_TEST_DATA_INSTALL_PATH "share/OpenCV/testdata")
endif()
endif()
if(ANDROID)
set(LIBRARY_OUTPUT_PATH "${OpenCV_BINARY_DIR}/lib/${ANDROID_NDK_ABI_NAME}")
set(3P_LIBRARY_OUTPUT_PATH "${OpenCV_BINARY_DIR}/3rdparty/lib/${ANDROID_NDK_ABI_NAME}")
@ -598,10 +609,6 @@ if(INSTALL_TESTS AND OPENCV_TEST_DATA_PATH AND UNIX)
install(PROGRAMS "${CMAKE_BINARY_DIR}/unix-install/opencv_run_all_tests.sh"
DESTINATION ${CMAKE_INSTALL_PREFIX} COMPONENT tests)
else()
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/cmake/templates/opencv_testing.sh.in"
"${CMAKE_BINARY_DIR}/unix-install/opencv_testing.sh" @ONLY)
install(FILES "${CMAKE_BINARY_DIR}/unix-install/opencv_testing.sh"
DESTINATION /etc/profile.d/ COMPONENT tests)
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/cmake/templates/opencv_run_all_tests_unix.sh.in"
"${CMAKE_BINARY_DIR}/unix-install/opencv_run_all_tests.sh" @ONLY)
install(PROGRAMS "${CMAKE_BINARY_DIR}/unix-install/opencv_run_all_tests.sh"
@ -960,6 +967,9 @@ status(" Other third-party libraries:")
if(WITH_IPP AND HAVE_IPP)
status(" Use IPP:" "${IPP_VERSION_STR} [${IPP_VERSION_MAJOR}.${IPP_VERSION_MINOR}.${IPP_VERSION_BUILD}]")
status(" at:" "${IPP_ROOT_DIR}")
if(NOT HAVE_IPP_ICV_ONLY)
status(" linked:" BUILD_WITH_DYNAMIC_IPP THEN "dynamic" ELSE "static")
endif()
else()
status(" Use IPP:" WITH_IPP AND NOT HAVE_IPP THEN "IPP not found" ELSE NO)
endif()

View File

@ -31,10 +31,8 @@
# The following variables affect the behavior of the macros in the
# script (in alphabetical order). Note that any of these flags can be
# changed multiple times in the same directory before calling
# CUDA_ADD_EXECUTABLE, CUDA_ADD_LIBRARY, CUDA_COMPILE, CUDA_COMPILE_PTX
# or CUDA_WRAP_SRCS.
#
# ::
# CUDA_ADD_EXECUTABLE, CUDA_ADD_LIBRARY, CUDA_COMPILE, CUDA_COMPILE_PTX,
# CUDA_COMPILE_FATBIN, CUDA_COMPILE_CUBIN or CUDA_WRAP_SRCS::
#
# CUDA_64_BIT_DEVICE_CODE (Default matches host bit size)
# -- Set to ON to compile for 64 bit device code, OFF for 32 bit device code.
@ -43,19 +41,11 @@
# nvcc in the generated source. If you compile to PTX and then load the
# file yourself, you can mix bit sizes between device and host.
#
#
#
# ::
#
# CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE (Default ON)
# -- Set to ON if you want the custom build rule to be attached to the source
# file in Visual Studio. Turn OFF if you add the same cuda file to multiple
# targets.
#
#
#
# ::
#
# This allows the user to build the target from the CUDA file; however, bad
# things can happen if the CUDA source file is added to multiple targets.
# When performing parallel builds it is possible for the custom build
@ -68,44 +58,24 @@
# this script could detect the reuse of source files across multiple targets
# and turn the option off for the user, but no good solution could be found.
#
#
#
# ::
#
# CUDA_BUILD_CUBIN (Default OFF)
# -- Set to ON to enable and extra compilation pass with the -cubin option in
# Device mode. The output is parsed and register, shared memory usage is
# printed during build.
#
#
#
# ::
#
# CUDA_BUILD_EMULATION (Default OFF for device mode)
# -- Set to ON for Emulation mode. -D_DEVICEEMU is defined for CUDA C files
# when CUDA_BUILD_EMULATION is TRUE.
#
#
#
# ::
#
# CUDA_GENERATED_OUTPUT_DIR (Default CMAKE_CURRENT_BINARY_DIR)
# -- Set to the path you wish to have the generated files placed. If it is
# blank output files will be placed in CMAKE_CURRENT_BINARY_DIR.
# Intermediate files will always be placed in
# CMAKE_CURRENT_BINARY_DIR/CMakeFiles.
#
#
#
# ::
#
# CUDA_HOST_COMPILATION_CPP (Default ON)
# -- Set to OFF for C compilation of host code.
#
#
#
# ::
#
# CUDA_HOST_COMPILER (Default CMAKE_C_COMPILER, $(VCInstallDir)/bin for VS)
# -- Set the host compiler to be used by nvcc. Ignored if -ccbin or
# --compiler-bindir is already present in the CUDA_NVCC_FLAGS or
@ -113,19 +83,11 @@
# $(VCInstallDir)/bin is a special value that expands out to the path when
# the command is run from within VS.
#
#
#
# ::
#
# CUDA_NVCC_FLAGS
# CUDA_NVCC_FLAGS_<CONFIG>
# -- Additional NVCC command line arguments. NOTE: multiple arguments must be
# semi-colon delimited (e.g. --compiler-options;-Wall)
#
#
#
# ::
#
# CUDA_PROPAGATE_HOST_FLAGS (Default ON)
# -- Set to ON to propagate CMAKE_{C,CXX}_FLAGS and their configuration
# dependent counterparts (e.g. CMAKE_C_FLAGS_DEBUG) automatically to the
@ -137,10 +99,6 @@
# CUDA_ADD_LIBRARY, CUDA_ADD_EXECUTABLE, or CUDA_WRAP_SRCS. Flags used for
# shared library compilation are not affected by this flag.
#
#
#
# ::
#
# CUDA_SEPARABLE_COMPILATION (Default OFF)
# -- If set this will enable separable compilation for all CUDA runtime object
# files. If used outside of CUDA_ADD_EXECUTABLE and CUDA_ADD_LIBRARY
@ -148,38 +106,22 @@
# CUDA_COMPUTE_SEPARABLE_COMPILATION_OBJECT_FILE_NAME and
# CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS should be called.
#
#
#
# ::
#
# CUDA_VERBOSE_BUILD (Default OFF)
# -- Set to ON to see all the commands used when building the CUDA file. When
# using a Makefile generator the value defaults to VERBOSE (run make
# VERBOSE=1 to see output), although setting CUDA_VERBOSE_BUILD to ON will
# always print the output.
#
#
#
# The script creates the following macros (in alphabetical order):
#
# ::
# The script creates the following macros (in alphabetical order)::
#
# CUDA_ADD_CUFFT_TO_TARGET( cuda_target )
# -- Adds the cufft library to the target (can be any target). Handles whether
# you are in emulation mode or not.
#
#
#
# ::
#
# CUDA_ADD_CUBLAS_TO_TARGET( cuda_target )
# -- Adds the cublas library to the target (can be any target). Handles
# whether you are in emulation mode or not.
#
#
#
# ::
#
# CUDA_ADD_EXECUTABLE( cuda_target file0 file1 ...
# [WIN32] [MACOSX_BUNDLE] [EXCLUDE_FROM_ALL] [OPTIONS ...] )
# -- Creates an executable "cuda_target" which is made up of the files
@ -193,42 +135,28 @@
# nvcc. Such flags should be modified before calling CUDA_ADD_EXECUTABLE,
# CUDA_ADD_LIBRARY or CUDA_WRAP_SRCS.
#
#
#
# ::
#
# CUDA_ADD_LIBRARY( cuda_target file0 file1 ...
# [STATIC | SHARED | MODULE] [EXCLUDE_FROM_ALL] [OPTIONS ...] )
# -- Same as CUDA_ADD_EXECUTABLE except that a library is created.
#
#
#
# ::
#
# CUDA_BUILD_CLEAN_TARGET()
# -- Creates a convenience target that deletes all the dependency files
# generated. You should make clean after running this target to ensure the
# dependency files get regenerated.
#
#
#
# ::
#
# CUDA_COMPILE( generated_files file0 file1 ... [STATIC | SHARED | MODULE]
# [OPTIONS ...] )
# -- Returns a list of generated files from the input source files to be used
# with ADD_LIBRARY or ADD_EXECUTABLE.
#
#
#
# ::
#
# CUDA_COMPILE_PTX( generated_files file0 file1 ... [OPTIONS ...] )
# -- Returns a list of PTX files generated from the input source files.
#
# CUDA_COMPILE_FATBIN( generated_files file0 file1 ... [OPTIONS ...] )
# -- Returns a list of FATBIN files generated from the input source files.
#
#
# ::
# CUDA_COMPILE_CUBIN( generated_files file0 file1 ... [OPTIONS ...] )
# -- Returns a list of CUBIN files generated from the input source files.
#
# CUDA_COMPUTE_SEPARABLE_COMPILATION_OBJECT_FILE_NAME( output_file_var
# cuda_target
@ -242,10 +170,6 @@
# automatically for CUDA_ADD_LIBRARY and CUDA_ADD_EXECUTABLE. Note that
# this is a function and not a macro.
#
#
#
# ::
#
# CUDA_INCLUDE_DIRECTORIES( path0 path1 ... )
# -- Sets the directories that should be passed to nvcc
# (e.g. nvcc -Ipath0 -Ipath1 ... ). These paths usually contain other .cu
@ -253,17 +177,9 @@
#
#
#
#
#
# ::
#
# CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS( output_file_var cuda_target
# nvcc_flags object_files)
#
#
#
# ::
#
# -- Generates the link object required by separable compilation from the given
# object files. This is called automatically for CUDA_ADD_EXECUTABLE and
# CUDA_ADD_LIBRARY, but can be called manually when using CUDA_WRAP_SRCS
@ -273,91 +189,51 @@
# specified by CUDA_64_BIT_DEVICE_CODE. Note that this is a function
# instead of a macro.
#
#
#
# ::
#
# CUDA_WRAP_SRCS ( cuda_target format generated_files file0 file1 ...
# [STATIC | SHARED | MODULE] [OPTIONS ...] )
# -- This is where all the magic happens. CUDA_ADD_EXECUTABLE,
# CUDA_ADD_LIBRARY, CUDA_COMPILE, and CUDA_COMPILE_PTX all call this
# function under the hood.
#
#
#
# ::
#
# Given the list of files (file0 file1 ... fileN) this macro generates
# custom commands that generate either PTX or linkable objects (use "PTX" or
# "OBJ" for the format argument to switch). Files that don't end with .cu
# or have the HEADER_FILE_ONLY property are ignored.
#
#
#
# ::
#
# The arguments passed in after OPTIONS are extra command line options to
# give to nvcc. You can also specify per configuration options by
# specifying the name of the configuration followed by the options. General
# options must precede configuration specific options. Not all
# configurations need to be specified, only the ones provided will be used.
#
#
#
# ::
#
# OPTIONS -DFLAG=2 "-DFLAG_OTHER=space in flag"
# DEBUG -g
# RELEASE --use_fast_math
# RELWITHDEBINFO --use_fast_math;-g
# MINSIZEREL --use_fast_math
#
#
#
# ::
#
# For certain configurations (namely VS generating object files with
# CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE set to ON), no generated file will
# be produced for the given cuda file. This is because when you add the
# cuda file to Visual Studio it knows that this file produces an object file
# and will link in the resulting object file automatically.
#
#
#
# ::
#
# This script will also generate a separate cmake script that is used at
# build time to invoke nvcc. This is for several reasons.
#
#
#
# ::
#
# 1. nvcc can return negative numbers as return values which confuses
# Visual Studio into thinking that the command succeeded. The script now
# checks the error codes and produces errors when there was a problem.
#
#
#
# ::
#
# 2. nvcc has been known to not delete incomplete results when it
# encounters problems. This confuses build systems into thinking the
# target was generated when in fact an unusable file exists. The script
# now deletes the output files if there was an error.
#
#
#
# ::
#
# 3. By putting all the options that affect the build into a file and then
# make the build rule dependent on the file, the output files will be
# regenerated when the options change.
#
#
#
# ::
#
# This script also looks at optional arguments STATIC, SHARED, or MODULE to
# determine when to target the object compilation for a shared library.
# BUILD_SHARED_LIBS is ignored in CUDA_WRAP_SRCS, but it is respected in
@ -366,27 +242,17 @@
# <target_name>_EXPORTS is defined when a shared library compilation is
# detected.
#
#
#
# ::
#
# Flags passed into add_definitions with -D or /D are passed along to nvcc.
#
#
#
# The script defines the following variables:
#
# ::
# The script defines the following variables::
#
# CUDA_VERSION_MAJOR -- The major version of cuda as reported by nvcc.
# CUDA_VERSION_MINOR -- The minor version.
# CUDA_VERSION
# CUDA_VERSION_STRING -- CUDA_VERSION_MAJOR.CUDA_VERSION_MINOR
#
#
#
# ::
#
# CUDA_TOOLKIT_ROOT_DIR -- Path to the CUDA Toolkit (defined if not set).
# CUDA_SDK_ROOT_DIR -- Path to the CUDA SDK. Use this to find files in the
# SDK. This script will not directly support finding
@ -412,13 +278,13 @@
# Only available for CUDA version 3.2+.
# CUDA_cusparse_LIBRARY -- CUDA Sparse Matrix library.
# Only available for CUDA version 3.2+.
# CUDA_npp_LIBRARY -- NVIDIA Performance Primitives library.
# CUDA_npp_LIBRARY -- NVIDIA Performance Primitives lib.
# Only available for CUDA version 4.0+.
# CUDA_nppc_LIBRARY -- NVIDIA Performance Primitives library (core).
# CUDA_nppc_LIBRARY -- NVIDIA Performance Primitives lib (core).
# Only available for CUDA version 5.5+.
# CUDA_nppi_LIBRARY -- NVIDIA Performance Primitives library (image processing).
# CUDA_nppi_LIBRARY -- NVIDIA Performance Primitives lib (image processing).
# Only available for CUDA version 5.5+.
# CUDA_npps_LIBRARY -- NVIDIA Performance Primitives library (signal processing).
# CUDA_npps_LIBRARY -- NVIDIA Performance Primitives lib (signal processing).
# Only available for CUDA version 5.5+.
# CUDA_nvcuvenc_LIBRARY -- CUDA Video Encoder library.
# Only available for CUDA version 3.2+.
@ -427,32 +293,15 @@
# Only available for CUDA version 3.2+.
# Windows only.
#
#
#
#
#
# ::
#
# James Bigler, NVIDIA Corp (nvidia.com - jbigler)
# Abe Stephens, SCI Institute -- http://www.sci.utah.edu/~abe/FindCuda.html
#
#
#
# ::
#
# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved.
#
#
#
# ::
#
# Copyright (c) 2007-2009
# Scientific Computing and Imaging Institute, University of Utah
#
#
#
# ::
#
# This code is licensed under the MIT License. See the FindCUDA.cmake script
# for the text of the license.
@ -481,11 +330,6 @@
# FindCUDA.cmake
# We need to have at least this version to support the VERSION_LESS argument to 'if' (2.6.2) and unset (2.6.3)
cmake_policy(PUSH)
cmake_minimum_required(VERSION 2.6.3)
cmake_policy(POP)
# This macro helps us find the location of helper files we will need the full path to
macro(CUDA_FIND_HELPER_FILE _name _extension)
set(_full_name "${_name}.${_extension}")
@ -608,7 +452,17 @@ set(CUDA_NVCC_FLAGS "" CACHE STRING "Semi-colon delimit multiple arguments.")
if(CMAKE_GENERATOR MATCHES "Visual Studio")
set(CUDA_HOST_COMPILER "$(VCInstallDir)bin" CACHE FILEPATH "Host side compiler used by NVCC")
else()
set(CUDA_HOST_COMPILER "${CMAKE_C_COMPILER}" CACHE FILEPATH "Host side compiler used by NVCC")
# Using cc which is symlink to clang may let NVCC think it is GCC and issue
# unhandled -dumpspecs option to clang. Also in case neither
# CMAKE_C_COMPILER is defined (project does not use C language) nor
# CUDA_HOST_COMPILER is specified manually we should skip -ccbin and let
# nvcc use its own default C compiler.
if(DEFINED CMAKE_C_COMPILER AND NOT DEFINED CUDA_HOST_COMPILER)
get_filename_component(c_compiler_realpath "${CMAKE_C_COMPILER}" REALPATH)
else()
set(c_compiler_realpath "")
endif()
set(CUDA_HOST_COMPILER "${c_compiler_realpath}" CACHE FILEPATH "Host side compiler used by NVCC")
endif()
# Propagate the host flags to the host compiler via -Xcompiler
@ -676,13 +530,15 @@ endmacro()
# Check to see if the CUDA_TOOLKIT_ROOT_DIR and CUDA_SDK_ROOT_DIR have changed,
# if they have then clear the cache variables, so that will be detected again.
if(NOT "${CUDA_TOOLKIT_ROOT_DIR}" STREQUAL "${CUDA_TOOLKIT_ROOT_DIR_INTERNAL}")
unset(CUDA_TARGET_TRIPLET CACHE)
unset(CUDA_TOOLKIT_TARGET_DIR CACHE)
unset(CUDA_NVCC_EXECUTABLE CACHE)
unset(CUDA_VERSION CACHE)
cuda_unset_include_and_libraries()
endif()
if(NOT "${CUDA_TOOLKIT_TARGET_DIR}" STREQUAL "${CUDA_TOOLKIT_TARGET_DIR_INTERNAL}")
if(NOT "${CUDA_TARGET_TRIPLET}" STREQUAL "${CUDA_TARGET_TRIPLET_INTERNAL}" OR
NOT "${CUDA_TOOLKIT_TARGET_DIR}" STREQUAL "${CUDA_TOOLKIT_TARGET_DIR_INTERNAL}")
cuda_unset_include_and_libraries()
endif()
@ -758,27 +614,46 @@ endif()
# Always set this convenience variable
set(CUDA_VERSION_STRING "${CUDA_VERSION}")
# Support for arm cross compilation with CUDA 5.5
set(__cuda_toolkit_target_dir_initial "${CUDA_TOOLKIT_ROOT_DIR}")
if(CUDA_VERSION VERSION_GREATER "5.0" AND CMAKE_CROSSCOMPILING AND ${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm")
if(ANDROID AND EXISTS "${CUDA_TOOLKIT_ROOT_DIR}/targets/armv7-linux-androideabi")
set(__cuda_toolkit_target_dir_initial "${CUDA_TOOLKIT_ROOT_DIR}/targets/armv7-linux-androideabi")
elseif(EXISTS "${CUDA_TOOLKIT_ROOT_DIR}/targets/armv7-linux-gnueabihf")
set(__cuda_toolkit_target_dir_initial "${CUDA_TOOLKIT_ROOT_DIR}/targets/armv7-linux-gnueabihf")
endif()
endif()
set(CUDA_TOOLKIT_TARGET_DIR "${__cuda_toolkit_target_dir_initial}" CACHE PATH "Toolkit target location.")
mark_as_advanced(CUDA_TOOLKIT_TARGET_DIR)
# Target CPU architecture
if(CUDA_VERSION VERSION_GREATER "5.0" AND CMAKE_CROSSCOMPILING AND ${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm")
if(DEFINED CUDA_TARGET_CPU_ARCH)
set(_cuda_target_cpu_arch_initial "${CUDA_TARGET_CPU_ARCH}")
elseif(CUDA_VERSION VERSION_GREATER "5.0" AND CMAKE_CROSSCOMPILING AND CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm|ARM)")
set(_cuda_target_cpu_arch_initial "ARM")
else()
set(_cuda_target_cpu_arch_initial "")
endif()
set(CUDA_TARGET_CPU_ARCH ${_cuda_target_cpu_arch_initial} CACHE STRING "Specify the name of the class of CPU architecture for which the input files must be compiled.")
set(CUDA_TARGET_CPU_ARCH "${_cuda_target_cpu_arch_initial}" CACHE STRING "Specify the name of the class of CPU architecture for which the input files must be compiled.")
mark_as_advanced(CUDA_TARGET_CPU_ARCH)
# Target OS variant
if(DEFINED CUDA_TARGET_OS_VARIANT)
set(_cuda_target_os_variant_initial "${CUDA_TARGET_OS_VARIANT}")
else()
set(_cuda_target_os_variant_initial "")
endif()
set(CUDA_TARGET_OS_VARIANT "${_cuda_target_os_variant_initial}" CACHE STRING "Specify the name of the class of OS for which the input files must be compiled.")
mark_as_advanced(CUDA_TARGET_OS_VARIANT)
# Target triplet
if(DEFINED CUDA_TARGET_TRIPLET)
set(_cuda_target_triplet_initial "${CUDA_TARGET_TRIPLET}")
elseif(CUDA_VERSION VERSION_GREATER "5.0" AND CMAKE_CROSSCOMPILING AND "${CUDA_TARGET_CPU_ARCH}" STREQUAL "ARM")
if("${CUDA_TARGET_OS_VARIANT}" STREQUAL "Android" AND EXISTS "${CUDA_TOOLKIT_ROOT_DIR}/targets/armv7-linux-androideabi")
set(_cuda_target_triplet_initial "armv7-linux-androideabi")
elseif(EXISTS "${CUDA_TOOLKIT_ROOT_DIR}/targets/armv7-linux-gnueabihf")
set(_cuda_target_triplet_initial "armv7-linux-gnueabihf")
endif()
endif()
set(CUDA_TARGET_TRIPLET "${_cuda_target_triplet_initial}" CACHE STRING "Specify the target triplet for which the input files must be compiled.")
file(GLOB __cuda_available_target_tiplets RELATIVE "${CUDA_TOOLKIT_ROOT_DIR}/targets" "${CUDA_TOOLKIT_ROOT_DIR}/targets/*" )
set_property(CACHE CUDA_TARGET_TRIPLET PROPERTY STRINGS ${__cuda_available_target_tiplets})
mark_as_advanced(CUDA_TARGET_TRIPLET)
# Target directory
if(NOT DEFINED CUDA_TOOLKIT_TARGET_DIR AND CUDA_TARGET_TRIPLET AND EXISTS "${CUDA_TOOLKIT_ROOT_DIR}/targets/${CUDA_TARGET_TRIPLET}")
set(CUDA_TOOLKIT_TARGET_DIR "${CUDA_TOOLKIT_ROOT_DIR}/targets/${CUDA_TARGET_TRIPLET}")
endif()
# CUDA_TOOLKIT_INCLUDE
find_path(CUDA_TOOLKIT_INCLUDE
device_functions.h # Header included in toolkit
@ -802,10 +677,16 @@ macro(cuda_find_library_local_first_with_path_ext _var _names _doc _path_ext )
# and old paths.
set(_cuda_64bit_lib_dir "${_path_ext}lib/x64" "${_path_ext}lib64" "${_path_ext}libx64" )
endif()
if(CUDA_VERSION VERSION_GREATER "6.0")
set(_cuda_static_lib_names "")
foreach(name ${_names})
list(APPEND _cuda_static_lib_names "${name}_static")
endforeach()
endif()
# CUDA 3.2+ on Windows moved the library directories, so we need to new
# (lib/Win32) and the old path (lib).
find_library(${_var}
NAMES ${_names}
NAMES ${_names} ${_cuda_static_lib_names}
PATHS "${CUDA_TOOLKIT_TARGET_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}"
ENV CUDA_PATH
ENV CUDA_LIB_PATH
@ -815,7 +696,7 @@ macro(cuda_find_library_local_first_with_path_ext _var _names _doc _path_ext )
)
# Search default search paths, after we search our own set of paths.
find_library(${_var}
NAMES ${_names}
NAMES ${_names} ${_cuda_static_lib_names}
PATHS "/usr/lib/nvidia-current"
DOC ${_doc}
)
@ -853,18 +734,6 @@ if(CUDA_BUILD_EMULATION AND CUDA_CUDARTEMU_LIBRARY)
else()
set(CUDA_LIBRARIES ${CUDA_CUDART_LIBRARY})
endif()
if(APPLE)
# We need to add the path to cudart to the linker using rpath, since the
# library name for the cuda libraries is prepended with @rpath.
if(CUDA_BUILD_EMULATION AND CUDA_CUDARTEMU_LIBRARY)
get_filename_component(_cuda_path_to_cudart "${CUDA_CUDARTEMU_LIBRARY}" PATH)
else()
get_filename_component(_cuda_path_to_cudart "${CUDA_CUDART_LIBRARY}" PATH)
endif()
if(_cuda_path_to_cudart)
list(APPEND CUDA_LIBRARIES -Wl,-rpath "-Wl,${_cuda_path_to_cudart}")
endif()
endif()
# 1.1 toolkit on linux doesn't appear to have a separate library on
# some platforms.
@ -997,6 +866,8 @@ set(CUDA_FOUND TRUE)
set(CUDA_TOOLKIT_ROOT_DIR_INTERNAL "${CUDA_TOOLKIT_ROOT_DIR}" CACHE INTERNAL
"This is the value of the last time CUDA_TOOLKIT_ROOT_DIR was set successfully." FORCE)
set(CUDA_TARGET_TRIPLET_INTERNAL "${CUDA_TARGET_TRIPLET}" CACHE INTERNAL
"This is the value of the last time CUDA_TARGET_TRIPLET was set successfully." FORCE)
set(CUDA_TOOLKIT_TARGET_DIR_INTERNAL "${CUDA_TOOLKIT_TARGET_DIR}" CACHE INTERNAL
"This is the value of the last time CUDA_TOOLKIT_TARGET_DIR was set successfully." FORCE)
set(CUDA_SDK_ROOT_DIR_INTERNAL "${CUDA_SDK_ROOT_DIR}" CACHE INTERNAL
@ -1044,15 +915,15 @@ macro(CUDA_GET_SOURCES_AND_OPTIONS _sources _cmake_options _options)
set( ${_options} )
set( _found_options FALSE )
foreach(arg ${ARGN})
if(arg STREQUAL "OPTIONS")
if("x${arg}" STREQUAL "xOPTIONS")
set( _found_options TRUE )
elseif(
arg STREQUAL "WIN32" OR
arg STREQUAL "MACOSX_BUNDLE" OR
arg STREQUAL "EXCLUDE_FROM_ALL" OR
arg STREQUAL "STATIC" OR
arg STREQUAL "SHARED" OR
arg STREQUAL "MODULE"
"x${arg}" STREQUAL "xWIN32" OR
"x${arg}" STREQUAL "xMACOSX_BUNDLE" OR
"x${arg}" STREQUAL "xEXCLUDE_FROM_ALL" OR
"x${arg}" STREQUAL "xSTATIC" OR
"x${arg}" STREQUAL "xSHARED" OR
"x${arg}" STREQUAL "xMODULE"
)
list(APPEND ${_cmake_options} ${arg})
else()
@ -1148,7 +1019,7 @@ function(CUDA_COMPUTE_BUILD_PATH path build_path)
endif()
endif()
# This recipie is from cmLocalGenerator::CreateSafeUniqueObjectFileName in the
# This recipe is from cmLocalGenerator::CreateSafeUniqueObjectFileName in the
# CMake source.
# Remove leading /
@ -1177,7 +1048,7 @@ endfunction()
# a .cpp or .ptx file.
# INPUT:
# cuda_target - Target name
# format - PTX or OBJ
# format - PTX, CUBIN, FATBIN or OBJ
# FILE1 .. FILEN - The remaining arguments are the sources to be wrapped.
# OPTIONS - Extra options to NVCC
# OUTPUT:
@ -1227,6 +1098,10 @@ macro(CUDA_WRAP_SRCS cuda_target format generated_files)
set(nvcc_flags ${nvcc_flags} "--target-cpu-architecture=${CUDA_TARGET_CPU_ARCH}")
endif()
if(CUDA_TARGET_OS_VARIANT)
set(nvcc_flags ${nvcc_flags} "-target-os-variant=${CUDA_TARGET_OS_VARIANT}")
endif()
# This needs to be passed in at this stage, because VS needs to fill out the
# value of VCInstallDir from within VS. Note that CCBIN is only used if
# -ccbin or --compiler-bindir isn't used and CUDA_HOST_COMPILER matches
@ -1355,7 +1230,7 @@ macro(CUDA_WRAP_SRCS cuda_target format generated_files)
foreach(file ${ARGN})
# Ignore any file marked as a HEADER_FILE_ONLY
get_source_file_property(_is_header ${file} HEADER_FILE_ONLY)
if(${file} MATCHES ".*\\.cu$" AND NOT _is_header)
if(${file} MATCHES "\\.cu$" AND NOT _is_header)
# Allow per source file overrides of the format.
get_source_file_property(_cuda_source_format ${file} CUDA_SOURCE_PROPERTY_FORMAT)
@ -1363,16 +1238,22 @@ macro(CUDA_WRAP_SRCS cuda_target format generated_files)
set(_cuda_source_format ${format})
endif()
if( ${_cuda_source_format} MATCHES "PTX" )
set( compile_to_ptx ON )
elseif( ${_cuda_source_format} MATCHES "OBJ")
set( compile_to_ptx OFF )
if( ${_cuda_source_format} MATCHES "OBJ")
set( cuda_compile_to_external_module OFF )
else()
message( FATAL_ERROR "Invalid format flag passed to CUDA_WRAP_SRCS for file '${file}': '${_cuda_source_format}'. Use OBJ or PTX.")
set( cuda_compile_to_external_module ON )
if( ${_cuda_source_format} MATCHES "PTX" )
set( cuda_compile_to_external_module_type "ptx" )
elseif( ${_cuda_source_format} MATCHES "CUBIN")
set( cuda_compile_to_external_module_type "cubin" )
elseif( ${_cuda_source_format} MATCHES "FATBIN")
set( cuda_compile_to_external_module_type "fatbin" )
else()
message( FATAL_ERROR "Invalid format flag passed to CUDA_WRAP_SRCS for file '${file}': '${_cuda_source_format}'. Use OBJ, PTX, CUBIN or FATBIN.")
endif()
endif()
if(compile_to_ptx)
if(cuda_compile_to_external_module)
# Don't use any of the host compilation flags for PTX targets.
set(CUDA_HOST_FLAGS)
set(CUDA_NVCC_FLAGS_CONFIG)
@ -1387,7 +1268,7 @@ macro(CUDA_WRAP_SRCS cuda_target format generated_files)
if(CUDA_GENERATED_OUTPUT_DIR)
set(cuda_compile_output_dir "${CUDA_GENERATED_OUTPUT_DIR}")
else()
if ( compile_to_ptx )
if ( cuda_compile_to_external_module )
set(cuda_compile_output_dir "${CMAKE_CURRENT_BINARY_DIR}")
else()
set(cuda_compile_output_dir "${cuda_compile_intermediate_directory}")
@ -1397,10 +1278,10 @@ macro(CUDA_WRAP_SRCS cuda_target format generated_files)
# Add a custom target to generate a c or ptx file. ######################
get_filename_component( basename ${file} NAME )
if( compile_to_ptx )
if( cuda_compile_to_external_module )
set(generated_file_path "${cuda_compile_output_dir}")
set(generated_file_basename "${cuda_target}_generated_${basename}.ptx")
set(format_flag "-ptx")
set(generated_file_basename "${cuda_target}_generated_${basename}.${cuda_compile_to_external_module_type}")
set(format_flag "-${cuda_compile_to_external_module_type}")
file(MAKE_DIRECTORY "${cuda_compile_output_dir}")
else()
set(generated_file_path "${cuda_compile_output_dir}/${CMAKE_CFG_INTDIR}")
@ -1423,7 +1304,7 @@ macro(CUDA_WRAP_SRCS cuda_target format generated_files)
set(custom_target_script "${cuda_compile_intermediate_directory}/${generated_file_basename}.cmake")
# Setup properties for obj files:
if( NOT compile_to_ptx )
if( NOT cuda_compile_to_external_module )
set_source_files_properties("${generated_file}"
PROPERTIES
EXTERNAL_OBJECT true # This is an object file not to be compiled, but only be linked.
@ -1438,7 +1319,7 @@ macro(CUDA_WRAP_SRCS cuda_target format generated_files)
set(source_file "${CMAKE_CURRENT_SOURCE_DIR}/${file}")
endif()
if( NOT compile_to_ptx AND CUDA_SEPARABLE_COMPILATION)
if( NOT cuda_compile_to_external_module AND CUDA_SEPARABLE_COMPILATION)
list(APPEND ${cuda_target}_SEPARABLE_COMPILATION_OBJECTS "${generated_file}")
endif()
@ -1455,7 +1336,7 @@ macro(CUDA_WRAP_SRCS cuda_target format generated_files)
# Build the NVCC made dependency file ###################################
set(build_cubin OFF)
if ( NOT CUDA_BUILD_EMULATION AND CUDA_BUILD_CUBIN )
if ( NOT compile_to_ptx )
if ( NOT cuda_compile_to_external_module )
set ( build_cubin ON )
endif()
endif()
@ -1482,8 +1363,8 @@ macro(CUDA_WRAP_SRCS cuda_target format generated_files)
# Create up the comment string
file(RELATIVE_PATH generated_file_relative_path "${CMAKE_BINARY_DIR}" "${generated_file}")
if(compile_to_ptx)
set(cuda_build_comment_string "Building NVCC ptx file ${generated_file_relative_path}")
if(cuda_compile_to_external_module)
set(cuda_build_comment_string "Building NVCC ${cuda_compile_to_external_module_type} file ${generated_file_relative_path}")
else()
set(cuda_build_comment_string "Building NVCC (${cuda_build_type}) object ${generated_file_relative_path}")
endif()
@ -1576,18 +1457,27 @@ function(CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS output_file cuda_target options
# If -ccbin, --compiler-bindir has been specified, don't do anything. Otherwise add it here.
list( FIND nvcc_flags "-ccbin" ccbin_found0 )
list( FIND nvcc_flags "--compiler-bindir" ccbin_found1 )
if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 AND CUDA_HOST_COMPILER )
list(APPEND nvcc_flags -ccbin "\"${CUDA_HOST_COMPILER}\"")
endif()
# Create a list of flags specified by CUDA_NVCC_FLAGS_${CONFIG}
set(config_specific_flags)
set(flags)
foreach(config ${CUDA_configuration_types})
string(TOUPPER ${config} config_upper)
# Add config specific flags
foreach(f ${CUDA_NVCC_FLAGS_${config_upper}})
list(APPEND config_specific_flags $<$<CONFIG:${config}>:${f}>)
endforeach()
set(important_host_flags)
_cuda_get_important_host_flags(important_host_flags ${CMAKE_${CUDA_C_OR_CXX}_FLAGS_${config_upper}})
foreach(f ${important_host_flags})
list(APPEND flags $<$<CONFIG:${config}>:-Xcompiler> $<$<CONFIG:${config}>:${f}>)
endforeach()
endforeach()
# Add our general CUDA_NVCC_FLAGS with the configuration specifig flags
set(nvcc_flags ${CUDA_NVCC_FLAGS} ${config_specific_flags} ${nvcc_flags})
file(RELATIVE_PATH output_file_relative_path "${CMAKE_BINARY_DIR}" "${output_file}")
# Some generators don't handle the multiple levels of custom command
@ -1713,21 +1603,29 @@ endmacro()
###############################################################################
###############################################################################
# CUDA COMPILE
# (Internal) helper for manually added cuda source files with specific targets
###############################################################################
###############################################################################
macro(CUDA_COMPILE generated_files)
macro(cuda_compile_base cuda_target format generated_files)
# Separate the sources from the options
CUDA_GET_SOURCES_AND_OPTIONS(_sources _cmake_options _options ${ARGN})
# Create custom commands and targets for each file.
CUDA_WRAP_SRCS( cuda_compile OBJ _generated_files ${_sources} ${_cmake_options}
CUDA_WRAP_SRCS( ${cuda_target} ${format} _generated_files ${_sources} ${_cmake_options}
OPTIONS ${_options} )
set( ${generated_files} ${_generated_files})
endmacro()
###############################################################################
###############################################################################
# CUDA COMPILE
###############################################################################
###############################################################################
# CUDA_COMPILE( generated_files file0 file1 ... [STATIC | SHARED | MODULE]
#               [OPTIONS ...] )
# Returns, via ${generated_files}, the list of object files generated from
# the input source files, for use with ADD_LIBRARY or ADD_EXECUTABLE.
# Thin wrapper delegating to cuda_compile_base with the OBJ format and the
# "cuda_compile" target name.
macro(CUDA_COMPILE generated_files)
cuda_compile_base(cuda_compile OBJ ${generated_files} ${ARGN})
endmacro()
###############################################################################
###############################################################################
@ -1735,17 +1633,28 @@ endmacro()
###############################################################################
###############################################################################
macro(CUDA_COMPILE_PTX generated_files)
# Separate the sources from the options
CUDA_GET_SOURCES_AND_OPTIONS(_sources _cmake_options _options ${ARGN})
# Create custom commands and targets for each file.
CUDA_WRAP_SRCS( cuda_compile_ptx PTX _generated_files ${_sources} ${_cmake_options}
OPTIONS ${_options} )
set( ${generated_files} ${_generated_files})
cuda_compile_base(cuda_compile_ptx PTX ${generated_files} ${ARGN})
endmacro()
###############################################################################
###############################################################################
# CUDA COMPILE FATBIN
###############################################################################
###############################################################################
# CUDA_COMPILE_FATBIN( generated_files file0 file1 ... [OPTIONS ...] )
# Returns, via ${generated_files}, the list of FATBIN files generated from
# the input source files. Thin wrapper delegating to cuda_compile_base with
# the FATBIN format and the "cuda_compile_fatbin" target name.
macro(CUDA_COMPILE_FATBIN generated_files)
cuda_compile_base(cuda_compile_fatbin FATBIN ${generated_files} ${ARGN})
endmacro()
###############################################################################
###############################################################################
# CUDA COMPILE CUBIN
###############################################################################
###############################################################################
# CUDA_COMPILE_CUBIN( generated_files file0 file1 ... [OPTIONS ...] )
# Returns, via ${generated_files}, the list of CUBIN files generated from
# the input source files. Thin wrapper delegating to cuda_compile_base with
# the CUBIN format and the "cuda_compile_cubin" target name.
macro(CUDA_COMPILE_CUBIN generated_files)
cuda_compile_base(cuda_compile_cubin CUBIN ${generated_files} ${ARGN})
endmacro()
###############################################################################
###############################################################################
# CUDA ADD CUFFT TO TARGET

View File

@ -37,12 +37,11 @@
file(READ ${input_file} depend_text)
if (${depend_text} MATCHES ".+")
if (NOT "${depend_text}" STREQUAL "")
# message("FOUND DEPENDS")
# Remember, four backslashes is escaped to one backslash in the string.
string(REGEX REPLACE "\\\\ " " " depend_text ${depend_text})
string(REPLACE "\\ " " " depend_text ${depend_text})
# This works for the nvcc -M generated dependency files.
string(REGEX REPLACE "^.* : " "" depend_text ${depend_text})

View File

@ -37,11 +37,10 @@
file(READ ${input_file} file_text)
if (${file_text} MATCHES ".+")
if (NOT "${file_text}" STREQUAL "")
# Remember, four backslashes is escaped to one backslash in the string.
string(REGEX REPLACE ";" "\\\\;" file_text ${file_text})
string(REGEX REPLACE "\ncode" ";code" file_text ${file_text})
string(REPLACE ";" "\\;" file_text ${file_text})
string(REPLACE "\ncode" ";code" file_text ${file_text})
list(LENGTH file_text len)
@ -57,7 +56,7 @@ if (${file_text} MATCHES ".+")
# Extract kernel names.
if (${entry} MATCHES "[^g]name = ([^ ]+)")
string(REGEX REPLACE ".* = ([^ ]+)" "\\1" entry ${entry})
set(entry "${CMAKE_MATCH_1}")
# Check to see if the kernel name starts with "_"
set(skip FALSE)
@ -76,19 +75,19 @@ if (${file_text} MATCHES ".+")
# Registers
if (${entry} MATCHES "reg([ ]+)=([ ]+)([^ ]+)")
string(REGEX REPLACE ".*([ ]+)=([ ]+)([^ ]+)" "\\3" entry ${entry})
set(entry "${CMAKE_MATCH_3}")
message("Registers: ${entry}")
endif()
# Local memory
if (${entry} MATCHES "lmem([ ]+)=([ ]+)([^ ]+)")
string(REGEX REPLACE ".*([ ]+)=([ ]+)([^ ]+)" "\\3" entry ${entry})
set(entry "${CMAKE_MATCH_3}")
message("Local: ${entry}")
endif()
# Shared memory
if (${entry} MATCHES "smem([ ]+)=([ ]+)([^ ]+)")
string(REGEX REPLACE ".*([ ]+)=([ ]+)([^ ]+)" "\\3" entry ${entry})
set(entry "${CMAKE_MATCH_3}")
message("Shared: ${entry}")
endif()

View File

@ -62,7 +62,7 @@ set(cmake_dependency_file "@cmake_dependency_file@") # path
set(CUDA_make2cmake "@CUDA_make2cmake@") # path
set(CUDA_parse_cubin "@CUDA_parse_cubin@") # path
set(build_cubin @build_cubin@) # bool
set(CUDA_HOST_COMPILER "@CUDA_HOST_COMPILER@") # bool
set(CUDA_HOST_COMPILER "@CUDA_HOST_COMPILER@") # path
# We won't actually use these variables for now, but we need to set this, in
# order to force this file to be run again if it changes.
set(generated_file_path "@generated_file_path@") # path
@ -106,7 +106,7 @@ list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 AND CUDA_HOST_COMPILER )
if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
set(CCBIN -ccbin "${CCBIN}")
else()
@ -126,7 +126,7 @@ endif()
# and other return variables are present after executing the process.
macro(cuda_execute_process status command)
set(_command ${command})
if(NOT _command STREQUAL "COMMAND")
if(NOT "x${_command}" STREQUAL "xCOMMAND")
message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})")
endif()
if(verbose)

View File

@ -10,19 +10,10 @@ endif()
set(CMAKE_MODULE_PATH "${OpenCV_SOURCE_DIR}/cmake" ${CMAKE_MODULE_PATH})
foreach(var INCLUDE LIBRARY PROGRAM)
set(__old_frpm_${var} "${CMAKE_FIND_ROOT_PATH_MODE_${var}}")
endforeach()
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY BOTH)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE NEVER)
find_package(CUDA "${MIN_VER_CUDA}" QUIET)
foreach(var INCLUDE LIBRARY PROGRAM)
set(CMAKE_FIND_ROOT_PATH_MODE_${var} "${__old_frpm_${var}}")
endforeach()
if(ANDROID)
set(CUDA_TARGET_OS_VARIANT "Android")
endif()
find_host_package(CUDA "${MIN_VER_CUDA}" QUIET)
list(REMOVE_AT CMAKE_MODULE_PATH 0)
@ -152,7 +143,6 @@ if(CUDA_FOUND)
if(ANDROID)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xptxas;-dlcm=ca")
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-target-os-variant=Android")
endif()
message(STATUS "CUDA NVCC target flags: ${CUDA_NVCC_FLAGS}")

View File

@ -34,9 +34,6 @@ unset(IPP_VERSION_MAJOR)
unset(IPP_VERSION_MINOR)
unset(IPP_VERSION_BUILD)
set(IPP_LIB_PREFIX ${CMAKE_STATIC_LIBRARY_PREFIX})
set(IPP_LIB_SUFFIX ${CMAKE_STATIC_LIBRARY_SUFFIX})
set(IPP_X64 0)
if(CMAKE_CXX_SIZEOF_DATA_PTR EQUAL 8)
set(IPP_X64 1)
@ -125,19 +122,32 @@ macro(ipp_detect_version)
endif()
macro(_ipp_add_library name)
# dynamic linking is only supported for standalone version of IPP
if (BUILD_WITH_DYNAMIC_IPP AND NOT HAVE_IPP_ICV_ONLY)
set(IPP_LIB_PREFIX ${CMAKE_SHARED_LIBRARY_PREFIX})
set(IPP_LIB_SUFFIX ${CMAKE_SHARED_LIBRARY_SUFFIX})
else ()
set(IPP_LIB_PREFIX ${CMAKE_STATIC_LIBRARY_PREFIX})
set(IPP_LIB_SUFFIX ${CMAKE_STATIC_LIBRARY_SUFFIX})
endif ()
if (EXISTS ${IPP_LIBRARY_DIR}/${IPP_LIB_PREFIX}${IPP_PREFIX}${name}${IPP_SUFFIX}${IPP_LIB_SUFFIX})
add_library(ipp${name} STATIC IMPORTED)
set_target_properties(ipp${name} PROPERTIES
IMPORTED_LINK_INTERFACE_LIBRARIES ""
IMPORTED_LOCATION ${IPP_LIBRARY_DIR}/${IPP_LIB_PREFIX}${IPP_PREFIX}${name}${IPP_SUFFIX}${IPP_LIB_SUFFIX}
)
list(APPEND IPP_LIBRARIES ipp${name})
# CMake doesn't support "install(TARGETS ipp${name} " command with imported targets
install(FILES ${IPP_LIBRARY_DIR}/${IPP_LIB_PREFIX}${IPP_PREFIX}${name}${IPP_SUFFIX}${IPP_LIB_SUFFIX}
DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT main)
string(TOUPPER ${name} uname)
set(IPP${uname}_INSTALL_PATH "${CMAKE_INSTALL_PREFIX}/${OPENCV_3P_LIB_INSTALL_PATH}/${IPP_LIB_PREFIX}${IPP_PREFIX}${name}${IPP_SUFFIX}${IPP_LIB_SUFFIX}" CACHE INTERNAL "" FORCE)
set(IPP${uname}_LOCATION_PATH "${IPP_LIBRARY_DIR}/${IPP_LIB_PREFIX}${IPP_PREFIX}${name}${IPP_SUFFIX}${IPP_LIB_SUFFIX}" CACHE INTERNAL "" FORCE)
if (BUILD_WITH_DYNAMIC_IPP AND NOT HAVE_IPP_ICV_ONLY)
# When using dynamic libraries from standalone IPP it is your responsibility to install those on the target system
list(APPEND IPP_LIBRARIES ${IPP_LIBRARY_DIR}/${IPP_LIB_PREFIX}${IPP_PREFIX}${name}${IPP_SUFFIX}${IPP_LIB_SUFFIX})
else ()
add_library(ipp${name} STATIC IMPORTED)
set_target_properties(ipp${name} PROPERTIES
IMPORTED_LINK_INTERFACE_LIBRARIES ""
IMPORTED_LOCATION ${IPP_LIBRARY_DIR}/${IPP_LIB_PREFIX}${IPP_PREFIX}${name}${IPP_SUFFIX}${IPP_LIB_SUFFIX}
)
list(APPEND IPP_LIBRARIES ipp${name})
# CMake doesn't support "install(TARGETS ${IPP_PREFIX}${name} " command with imported targets
install(FILES ${IPP_LIBRARY_DIR}/${IPP_LIB_PREFIX}${IPP_PREFIX}${name}${IPP_SUFFIX}${IPP_LIB_SUFFIX}
DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT main)
string(TOUPPER ${name} uname)
set(IPP${uname}_INSTALL_PATH "${CMAKE_INSTALL_PREFIX}/${OPENCV_3P_LIB_INSTALL_PATH}/${IPP_LIB_PREFIX}${IPP_PREFIX}${name}${IPP_SUFFIX}${IPP_LIB_SUFFIX}" CACHE INTERNAL "" FORCE)
set(IPP${uname}_LOCATION_PATH "${IPP_LIBRARY_DIR}/${IPP_LIB_PREFIX}${IPP_PREFIX}${name}${IPP_SUFFIX}${IPP_LIB_SUFFIX}" CACHE INTERNAL "" FORCE)
endif()
else()
message(STATUS "Can't find IPP library: ${name} at ${IPP_LIBRARY_DIR}/${IPP_LIB_PREFIX}${IPP_PREFIX}${name}${IPP_SUFFIX}${IPP_LIB_SUFFIX}")
endif()
@ -145,10 +155,18 @@ macro(ipp_detect_version)
set(IPP_PREFIX "ipp")
if(${IPP_VERSION_STR} VERSION_LESS "8.0")
set(IPP_SUFFIX "_l") # static not threaded libs suffix IPP 7.x
else()
if (BUILD_WITH_DYNAMIC_IPP AND NOT HAVE_IPP_ICV_ONLY)
set(IPP_SUFFIX "") # dynamic not threaded libs suffix IPP 7.x
else ()
set(IPP_SUFFIX "_l") # static not threaded libs suffix IPP 7.x
endif ()
else ()
if(WIN32)
set(IPP_SUFFIX "mt") # static not threaded libs suffix IPP 8.x for Windows
if (BUILD_WITH_DYNAMIC_IPP AND NOT HAVE_IPP_ICV_ONLY)
set(IPP_SUFFIX "") # dynamic not threaded libs suffix IPP 8.x for Windows
else ()
set(IPP_SUFFIX "mt") # static not threaded libs suffix IPP 8.x for Windows
endif ()
else()
set(IPP_SUFFIX "") # static not threaded libs suffix IPP 8.x for Linux/OS X
endif()
@ -191,7 +209,7 @@ macro(ipp_detect_version)
if (EXISTS ${INTEL_COMPILER_LIBRARY_DIR}/${IPP_LIB_PREFIX}${name}${CMAKE_SHARED_LIBRARY_SUFFIX})
list(APPEND IPP_LIBRARIES ${INTEL_COMPILER_LIBRARY_DIR}/${IPP_LIB_PREFIX}${name}${CMAKE_SHARED_LIBRARY_SUFFIX})
else()
message(STATUS "Can't find compiler library: ${name}")
message(STATUS "Can't find compiler library: ${name} at ${INTEL_COMPILER_LIBRARY_DIR}/${IPP_LIB_PREFIX}${name}${CMAKE_SHARED_LIBRARY_SUFFIX}")
endif()
endmacro()

View File

@ -21,8 +21,8 @@ if(WIN32)
find_library(OPENNI2_LIBRARY "OpenNI2" PATHS $ENV{OPENNI2_LIB64} DOC "OpenNI2 library")
endif()
elseif(UNIX OR APPLE)
find_file(OPENNI2_INCLUDES "OpenNI.h" PATHS "/usr/include/ni2" "/usr/include/openni2" DOC "OpenNI2 c++ interface header")
find_library(OPENNI2_LIBRARY "OpenNI2" PATHS "/usr/lib" DOC "OpenNI2 library")
find_file(OPENNI2_INCLUDES "OpenNI.h" PATHS "/usr/include/ni2" "/usr/include/openni2" $ENV{OPENNI2_INCLUDE} DOC "OpenNI2 c++ interface header")
find_library(OPENNI2_LIBRARY "OpenNI2" PATHS "/usr/lib" $ENV{OPENNI2_REDIST} DOC "OpenNI2 library")
endif()
if(OPENNI2_LIBRARY AND OPENNI2_INCLUDES)

View File

@ -1,3 +1,11 @@
if(OPENCV_EXTRA_WORLD)
set(OPENCV_APPLE_BUNDLE_NAME "OpenCV_contrib")
set(OPENCV_APPLE_BUNDLE_ID "org.opencv_contrib")
else()
set(OPENCV_APPLE_BUNDLE_NAME "OpenCV")
set(OPENCV_APPLE_BUNDLE_ID "org.opencv")
endif()
if(IOS)
configure_file("${OpenCV_SOURCE_DIR}/platforms/ios/Info.plist.in"
"${CMAKE_BINARY_DIR}/ios/Info.plist")

View File

@ -159,8 +159,13 @@ macro(ocv_add_module _name)
endif()
# add self to the world dependencies
# add to world only extra modules (ON) or only main modules (OFF)
set(__expected_extra 0)
if (OPENCV_EXTRA_WORLD)
set(__expected_extra 1)
endif()
if((NOT DEFINED OPENCV_MODULE_IS_PART_OF_WORLD AND NOT OPENCV_MODULE_${the_module}_CLASS STREQUAL "BINDINGS"
AND NOT OPENCV_PROCESSING_EXTRA_MODULES)
AND __expected_extra EQUAL OPENCV_PROCESSING_EXTRA_MODULES)
OR OPENCV_MODULE_IS_PART_OF_WORLD
)
set(OPENCV_MODULE_${the_module}_IS_PART_OF_WORLD ON CACHE INTERNAL "")

View File

@ -28,6 +28,7 @@ ${nested_namespace_start}
set(STR_HPP "// This file is auto-generated. Do not edit!
#include \"opencv2/core/ocl.hpp\"
#include \"opencv2/core/ocl_genbase.hpp\"
#include \"opencv2/core/opencl/ocl_defs.hpp\"

View File

@ -1,2 +0,0 @@
# Environment setup for OpenCV testing
export OPENCV_TEST_DATA_PATH=@CMAKE_INSTALL_PREFIX@/share/OpenCV/testdata

View File

@ -10,13 +10,5 @@ elseif(NOT WIN32)
endif()
if(INSTALL_TESTS AND OPENCV_TEST_DATA_PATH)
if(ANDROID)
install(DIRECTORY ${OPENCV_TEST_DATA_PATH} DESTINATION sdk/etc/testdata COMPONENT tests)
elseif(NOT WIN32)
# CPack does not set correct permissions by default, so we do it explicitly.
install(DIRECTORY ${OPENCV_TEST_DATA_PATH}
DIRECTORY_PERMISSIONS OWNER_WRITE OWNER_READ OWNER_EXECUTE
GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE
DESTINATION share/OpenCV/testdata COMPONENT tests)
endif()
install(DIRECTORY "${OPENCV_TEST_DATA_PATH}/" DESTINATION "${OPENCV_TEST_DATA_INSTALL_PATH}" COMPONENT "tests")
endif()

View File

@ -1,3 +1,3 @@
.. note::
Unfortunetly we have no tutorials into this section. And you can help us with that, since OpenCV is a community effort. If you have a tutorial suggestion or you have written a tutorial yourself (or coded a sample code) that you would like to see here, please contact follow these instructions: :ref:`howToWriteTutorial` and :how_to_contribute:`How to contribute <>`.
Unfortunately we have no tutorials in this section yet. You can help us with that, since OpenCV is a community effort. If you have a tutorial suggestion, or you have written a tutorial yourself (or coded a sample) that you would like to see here, please follow these instructions: :how_to_contribute:`How to contribute <>`.

View File

@ -0,0 +1,610 @@
Writing documentation for OpenCV {#tutorial_documentation}
================================
@tableofcontents
Doxygen overview {#tutorial_documentation_overview}
================
Intro {#tutorial_documentation_intro}
-----
[Doxygen] is a documentation generation system with a lot of great features, such as:
- parse program sources to produce actual and accurate documentation
- check documentation for errors
- insert images and formulas
- use markdown syntax and plain HTML for precise text formatting
- generate documentation in many different formats
OpenCV library existing documentation has been converted to doxygen format.
Installation {#tutorial_documentation_install}
------------
Please, check official [download][Doxygen download] and [installation][Doxygen installation] pages.
Some linux distributions can also provide doxygen packages.
Generate documentation {#tutorial_documentation_generate}
----------------------
- Get the OpenCV sources (version 3.0 and later)
- _Optional:_ get the OpenCV_contrib sources
- Create build directory near the sources folder(s) and go into it
- Run cmake (assuming you put sources to _opencv_ folder):
@code{.sh}
cmake ../opencv
@endcode
Or if you get contrib sources too:
@code{.sh}
cmake -DOPENCV_EXTRA_MODULES_PATH=../opencv_contrib/modules ../opencv
@endcode
- Run make:
@code{.sh}
make doxygen
@endcode
- Open <i>doc/doxygen/html/index.html</i> file in your favorite browser
Quick start {#tutorial_documentation_quick_start}
===========
@note These instructions are specific to OpenCV library documentation, other projects can use
different layout scheme and documenting agreements.
Documentation locations {#tutorial_documentation_quick_start_1}
-----------------------
Whole documentation is gathered from many different places:
- __source code__ entities, like classes, functions or enumerations, should be documented in
corresponding header files, right prior entity definition. See examples in next sections.
- __pages__ are good place to put big pieces of text with images and code examples not directly
connected with any source code entity. Pages should be located in separate files and
contained in several predefined places. This tutorial is example of such page.
- __images__ can be used to illustrate described things. Usually located at the same places as pages,
images can be inserted to any place of the documentation.
- __code examples__ show how to use the library in real applications. Each sample is
self-contained file which represents one simple application. Parts of these files can be
included into documentation and tutorials to demonstrate function calls and objects collaboration.
- __BibTeX references__ are used to create one common bibliography. All science books, articles and
proceedings served as basis for library functionality should be put in this reference list.
Following scheme represents common documentation places for _opencv_ repository:
~~~
<opencv>
├── doc - doxygen config files, root page (root.markdown.in), BibTeX file (opencv.bib)
│   ├── tutorials - tutorials hierarchy (pages and images)
│   ├── py_tutorials - python tutorials hierarchy (pages and images)
│   └── user_guide - old user guide (pages and images)
├── modules
│   └── <modulename>
│      ├── doc - documentation pages and images for module
│      └── include - code documentation in header files
└── samples - place for all code examples
├── cpp
│ └── tutorial_code - place for tutorial code examples
└── ...
~~~
@note Automatic code parser looks for all header files (<i>".h, .hpp"</i> except for <i>".inl.hpp;
.impl.hpp; _detail.hpp"</i>) in _include_ folder and its subfolders. Some module-specific
instructions (group definitions) and documentation should be put into
<i>"include/opencv2/<module-name>.hpp"</i> file.
@note You can put C++ template implementation and specialization to separate files
(<i>".impl.hpp"</i>) ignored by doxygen.
@note Files in _src_ subfolder are not parsed, because documentation is intended mostly for the
library users, not developers. But it still is possible to generate full documentation by
customizing processed files list in cmake script (<i>doc/CMakeLists.txt</i>) and doxygen options in
its configuration file (<i>doc/Doxyfile.in</i>).
Since version 3.0 all new modules are placed into _opencv_contrib_ repository, it has slightly
different layout:
~~~
<opencv_contrib>
└── modules
└── <modulename>
├── doc - documentation pages and images, BibTeX file (<modulename>.bib)
├── include - code documentation in header files
├── samples - place for code examples for documentation and tutorials
└── tutorials - tutorial pages and images
~~~
Example {#tutorial_documentation_quick_start_2}
-------
To add documentation for functions, classes and other entities, just insert special comment prior
its definition. Like this:
@verbatim
/** @brief Calculates the exponent of every array element.
The function exp calculates the exponent of every element of the input array:
\f[ \texttt{dst} [I] = e^{ src(I) } \f]
The maximum relative error is about 7e-6 for single-precision input and less than 1e-10 for
double-precision input. Currently, the function converts denormalized values to zeros on output.
Special values (NaN, Inf) are not handled.
@param src input array.
@param dst output array of the same size and type as src.
@sa log , cartToPolar , polarToCart , phase , pow , sqrt , magnitude
*/
CV_EXPORTS_W void exp(InputArray src, OutputArray dst);
@endverbatim
Here you can see:
- special C-comment syntax denotes it is doxygen comment
@verbatim /** ... */ @endverbatim
- command `brief` denotes following paragraph is a brief description
@verbatim @brief @endverbatim
- empty line denotes paragraph end
- TeX formula between `f[` and `f]` commands
@verbatim \f[ ... \f] @endverbatim
- command `param` denotes following word is name of the parameter and following text is
description of the parameter; all parameters are placed in a list
@verbatim @param @endverbatim
- command `sa` starts "See also" paragraph containing references to some classes, methods, pages or URLs.
@verbatim @sa @endverbatim
Produced reference item looks like this:
![Reference link](doxygen-2.png)
The "More..." link brings you to the function documentation:
![Function documentation](doxygen-1.png)
Another example {#tutorial_documentation_quick_start_3}
---------------
Different comment syntax can be used for one-line short comments:
@verbatim
//! type of line
enum LineTypes {
FILLED = -1,
LINE_4 = 4, //!< 4-connected line
LINE_8 = 8, //!< 8-connected line
LINE_AA = 16 //!< antialiased line
};
@endverbatim
Here:
- special C++-comment syntax denotes it is doxygen comment
@verbatim //! @endverbatim
- additional symbol `<` denotes this comment is located _after_ documented entity
@verbatim //!< @endverbatim
Produced documentation block looks like this:
![Enumeration documentation](doxygen-3.png)
More details {#tutorial_documentation_quick_start_4}
------------
### Command prefix
Doxygen commands starts with `@` or `\` sign:
@verbatim
@brief ...
or
\brief ...
@endverbatim
### Comment syntax
Doxygen comment can have different forms:
@verbatim
C-style:
/** ... */
or
/*! ... */
C++-style
//! ...
or
/// ...
Lines can start with '*':
/**
* ...
* ...
*/
Can be placed after documented entity:
//!< ...
/**< ... */
@endverbatim
### Paragraph end
To end paragraph, insert empty line or any command starting new paragraph:
@verbatim
@brief brief description paragraph
brief continues
new paragraph
@note new note paragraph
note paragraph continues
another paragraph
paragraph continues
@endverbatim
### Naming
Pages, anchors, groups and other named entities should have unique name inside the whole project.
It is a good idea to prefix such identifiers with module name:
@verbatim
@page core_explanation_1 Usage explanation
@defgroup imgproc_transform Image transformations
@anchor mymodule_interesting_note
@endverbatim
Supported Markdown {#tutorial_documentation_quick_start_md}
------------------
Doxygen supports Markdown formatting with some extensions. Short syntax reference is described
below, for details visit [Markdown support].
### lists {#tutorial_documentation_md_list}
@verbatim
Bulleted:
- item1
- item2
Numbered:
1. item1
2. item2
or
-# item1
-# item2
@endverbatim
### emphasis {#tutorial_documentation_md_emph}
@verbatim
_italic_
__bold__
use html in complex cases:
<em>"path/to/file"</em>
@endverbatim
### links {#tutorial_documentation_md_links}
@verbatim
explicit link:
[OpenCV main site](http://opencv.org)
automatic links:
<http://opencv.org>
or even:
http://opencv.org
@endverbatim
### images {#tutorial_documentation_md_image}
@verbatim
![image caption](image path)
@endverbatim
### headers {#tutorial_documentation_md_head}
@verbatim
Level1
======
Level2
------
### Level3
#### Level4
@endverbatim
### header id {#tutorial_documentation_md_headid}
You can assign a unique identifier to any header to reference it from other places.
@verbatim
Header {#some_unique_identifier}
------
...
See @ref some_unique_identifier for details
@endverbatim
### page id {#tutorial_documentation_md_page}
Each page should have additional Level1 header at the beginning with page title and identifier:
@verbatim
Writing documentation for OpenCV {#tutorial_documentation}
================================
@endverbatim
### tables {#tutorial_documentation_md_table}
Example from doxygen documentation:
@verbatim
First Header | Second Header
------------- | -------------
Content Cell | Content Cell
Content Cell | Content Cell
@endverbatim
Commonly used commands {#tutorial_documentation_quick_start_5}
----------------------
Most often used doxygen commands are described here with short examples. For the full list of
available commands and detailed description, please visit [Command reference].
### Basic commands {#tutorial_documentation_commands_basic}
- __brief__ - paragraph with brief entity description
- __param__ - description of function argument.
Multiple adjacent statements are merged into one list. If argument with this name is not found
in the actual function signature, a doxygen warning will be produced. A function can have either _no_
documented parameters, or _all_ of its parameters documented.
- __sa__ - "See also" paragraph, contains references to classes, functions, pages or URLs
- __note__ - visually highlighted "Note" paragraph. Multiple adjacent statements are merged into
one block.
- __return, returns__ - describes returned value of a function
- __overload__ - adds fixed text to the function description: <em>"This is an overloaded member
function, provided for convenience. It differs from the above function only in what argument(s)
it accepts."</em>
- __anchor__ - places invisible named anchor, which can be referenced by `ref` command. It can be
used in pages only.
- __ref__ - explicit reference to a named section, page or anchor.
If such entity can not be found - doxygen warning will be generated. This command has an
optional argument - link text.
Doxygen also generates some links automatically: if text contains word which can be found in
documented entities - reference will be generated. This functionality can be disabled by prefixing
the word with `%` symbol.
@verbatim
Explicit reference: @ref MyClass
Explicit named reference: @ref example_page "Example page"
Implicit reference: cv::abc::MyClass1 or just MyClass1
Disable implicit reference: %MyClass1
@endverbatim
- __f__ - formula
Inline formulas are bounded with `f$` command:
@verbatim
\f$ ... \f$
@endverbatim
Block formulas - with `f[` and `f]` commands:
@verbatim
\f[ ... \f]
@endverbatim
### Code inclusion commands {#tutorial_documentation_commands_include}
To mark some text as a code in documentation, _code_ and _endcode_ commands are used.
@verbatim
@code
float val = img.at<float>(borderInterpolate(100, img.rows, cv::BORDER_REFLECT_101),
borderInterpolate(-5, img.cols, cv::BORDER_WRAP));
@endcode
@endverbatim
Syntax will be highlighted according to the currently parsed file type (C++ for <em>.hpp</em>, C for <em>.h</em>) or
you can manually specify it in curly braces:
@verbatim
@code{.xml}
@endverbatim
To include whole example file into documentation, _include_ and _includelineno_ commands are used.
The file is searched in common samples locations, so you can specify just its name or short part of
the path. The _includelineno_ version also shows line numbers.
@verbatim
@include samples/cpp/test.cpp
@endverbatim
If you want to include some parts of existing example file - use _snippet_ command.
First, mark the needed parts of the file with special doxygen comments:
@verbatim
//! [var_init]
int a = 0;
//! [var_init]
@endverbatim
Then include this snippet into documentation:
@verbatim
@snippet samples/cpp/test.cpp var_init
@endverbatim
@note Currently most of such partial inclusions are made with _dontinclude_ command for
compatibility with the old rST documentation. But newly created samples should be included with the
_snippet_ command, since this method is less affected by the changes in processed file.
### Grouping commands {#tutorial_documentation_commands_group}
All code entities should be put into named groups representing OpenCV modules and their internal
structure, thus each module should be associated with a group with the same name. Good place to
define groups and subgroups is the main header file for this module:
<em>"<module>/include/opencv2/<module>.hpp"</em>.
@note Doxygen groups are called "modules" and are shown on "Modules" page.
@verbatim
/**
@defgroup mymodule My great module
optional description
@{
@defgroup mymodule_basic Basic operations
optional description
@defgroup mymodule_experimental Experimental operations
optional description
@}
*/
@endverbatim
To put classes and functions into specific group, just add `ingroup` command to its documentation,
or wrap the whole code block with `addtogroup` command:
@verbatim
/** @brief Example function
@ingroup mymodule
*/
or
/**
@addtogroup mymodule_experimental
@{
*/
... several functions, classes or enumerations here
/**
@}
*/
@endverbatim
### Publication reference {#tutorial_documentation_commands_cite}
Use _cite_ command to insert reference to related publications listed in @ref citelist page.
First, add publication BibTeX record into <i>"<opencv>/doc/opencv.bib"</i> or
<i>"<opencv_contrib>/modules/<module>/doc/<module>.bib"</i> file:
@verbatim
@ARTICLE{Bradski98,
author = {Bradski, Gary R},
title = {Computer vision face tracking for use in a perceptual user interface},
year = {1998},
publisher = {Citeseer}
}
@endverbatim
@note Try not to add publication duplicates because it can confuse documentation readers and writers later.
Then make reference with _cite_ command:
@verbatim
@cite Bradski98
@endverbatim
@note To get BibTeX record for the publications one can use [Google Scholar]. Once the publication
have been found - follow its "Cite" link and then choose "BibTeX" option:
![](scholarship_cite_dialog.png)
Step-by-step {#tutorial_documentation_steps}
============
Steps described in this section can be used as checklist during documentation writing. It is not
necessary to do things in the same order, but some steps really depend on previous. And of course
these steps are just basic guidelines, there is always a place for creativity.
Document the function {#tutorial_documentation_steps_fun}
---------------------
1. Add empty doxygen comment preceding function definition.
2. Add _brief_ command with short description of function meaning at the beginning.
3. Add detailed description of the function.
4. _Optional_: insert formulas, images and blocks of example code to illustrate complex cases
5. _Optional_: describe each parameter using the _param_ command.
6. _Optional_: describe return value of the function using the _returns_ command.
7. _Optional_: add "See also" section with links to similar functions or classes
8. _Optional_: add bibliographic reference if any.
9. Generate doxygen documentation and verify results.
Write the tutorial {#tutorial_documentation_steps_tutorial}
------------------
1. Formulate the idea to be illustrated in the tutorial.
2. Make the example application, simple enough to be understood by a beginning developer. Be
laconic and write descriptive comments, don't try to avoid every possible runtime error or to make
universal utility. Your goal is to illustrate the idea. And it should fit one source file!
If you want to insert code blocks from this file into your tutorial, mark them with special doxygen comments (see [here](@ref tutorial_documentation_commands_include)).
3. Collect results of the application work. It can be "before/after" images or some numbers
representing performance or even a video.
Save it in appropriate format for later use in the tutorial:
- To save simple graph-like images use lossless ".png" format.
- For photo-like images - lossy ".jpg" format.
- Numbers will be inserted as plain text, possibly formatted as table.
- Video should be uploaded on YouTube.
4. Create new tutorial page (<em>".markdown"</em>-file) in corresponding location (see
[here](@ref tutorial_documentation_quick_start_1)), and place all image files near it (or in "images"
subdirectory). Also put your example application file and make sure it is compiled together with the
OpenCV library when `-DBUILD_EXAMPLES=ON` option is enabled on cmake step.
5. Modify your new page:
- Add page title and identifier, usually prefixed with <em>"tutorial_"</em> (see [here](@ref tutorial_documentation_md_page)).
- Add brief description of your idea and tutorial goals.
- Describe your program and/or its interesting pieces.
- Describe your results, insert previously added images or other results.
To add a video use _htmlonly_, _endhtmlonly_ commands with raw html block inside:
@verbatim
@htmlonly
<div align="center">
<iframe
title="my title" width="560" height="349"
src="http://www.youtube.com/embed/ViPN810E0SU?rel=0&loop=1"
frameborder="0" allowfullscreen align="middle">
</iframe>
</div>
@endhtmlonly
@endverbatim
- Add bibliographic references if any (see [here](@ref tutorial_documentation_commands_cite)).
6. Add newly created tutorial to the corresponding table of contents. Just find
<em>"table_of_content_*.markdown"</em> file with the needed table and place new record in it
similar to existing ones.
@verbatim
- @subpage tutorial_windows_visual_studio_image_watch
_Compatibility:_ \>= OpenCV 2.4
_Author:_ Wolf Kienzle
You will learn how to visualize OpenCV matrices and images within Visual Studio 2012.
@endverbatim
As you can see it is just a list item with special _subpage_ command which marks your page as a
child and places it into the existing pages hierarchy. Add compatibility information,
authors list and short description. Also note the list item indent, empty lines between
paragraphs and special _italic_ markers.
7. Generate doxygen documentation and verify results.
References {#tutorial_documentation_refs}
==========
- [Doxygen] - main Doxygen page
- [Documenting basics] - how to include documentation in code
- [Markdown support] - supported syntax and extensions
- [Formulas support] - how to include formulas
- [Supported formula commands] - HTML formulas use MathJax script for rendering
- [Command reference] - supported commands and their parameters
<!-- invisible references list -->
[Doxygen]: http://www.stack.nl/~dimitri/doxygen/index.html
[Doxygen download]: http://www.stack.nl/~dimitri/doxygen/download.html
[Doxygen installation]: http://www.stack.nl/~dimitri/doxygen/manual/install.html
[Documenting basics]: http://www.stack.nl/~dimitri/doxygen/manual/docblocks.html
[Markdown support]: http://www.stack.nl/~dimitri/doxygen/manual/markdown.html
[Formulas support]: http://www.stack.nl/~dimitri/doxygen/manual/formulas.html
[Supported formula commands]: http://docs.mathjax.org/en/latest/tex.html#supported-latex-commands
[Command reference]: http://www.stack.nl/~dimitri/doxygen/manual/commands.html
[Google Scholar]: http://scholar.google.ru/

Binary file not shown.

After

Width:  |  Height:  |  Size: 41 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 8.8 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 14 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 30 KiB

View File

@ -1,3 +0,0 @@
How to write a tutorial for OpenCV {#tutorial_how_to_write_a_tutorial}
==================================
@todo new tutorial guide needed

View File

@ -1,440 +0,0 @@
.. _howToWriteTutorial:
How to write a tutorial for OpenCV
**********************************
Okay, so assume you have just finished a project of yours implementing something
based on OpenCV and you want to present/share it with the community. Luckily, OpenCV
is an *open source project*. This means that anyone has access to the full source
code and may propose extensions. And a good tutorial is a valuable addition to the
library! Please read instructions on contribution process here:
http://opencv.org/contribute.html. You may also find this page helpful:
:how_to_contribute:`How to contribute <>`.
While making a robust and practical library (like OpenCV) is great, the success of a
library also depends on how user friendly it is. To improve on this aspect, the
OpenCV team has already been listening to user feedback at :opencv_qa:`OpenCV Q&A
forum <>` and by making samples you can find in the source directories
:file:`samples` folder. The addition of the tutorials (in both online and PDF format)
is an extension of these efforts.
Goal
====
.. _reST: http://docutils.sourceforge.net/rst.html
.. |reST| replace:: reStructuredText
.. |Sphinx| replace:: Sphinx
.. _Sphinx: http://sphinx.pocoo.org/
The tutorials are just as an important part of the library as the implementation of
those crafty data structures and algorithms you can find in OpenCV. Therefore, the
source codes for the tutorials are part of the library. And yes, I meant source
codes. The reason for this formulation is that the tutorials are written by using the
|Sphinx|_ documentation generation system. This is based on the popular Python
documentation system called |reST|_ (reST). ReStructuredText is a really neat
language that by using a few simple conventions (indentation, directives) and
emulating old school email writing techniques (text only) tries to offer a simple
way to create and edit documents. Sphinx extends this with some new features and
creates the resulting document in both HTML (for web) and PDF (for offline usage)
format.
Usually, an OpenCV tutorial has the following parts:
1. A source code demonstration of an OpenCV feature:
a. One or more CPP, Python, Java or other type of files depending for what OpenCV offers support and for what language you make the tutorial.
#. Occasionally, input resource files required for running your tutorial's application.
#. A table of content entry (so people may easily find the tutorial):
a. Adding your stuff to the tutorials table of content (**reST** file).
#. Add an image file near the TOC entry.
#. The content of the tutorial itself:
a. The **reST** text of the tutorial
#. Images following the idea that "*A picture is worth a thousand words*".
#. For more complex demonstrations you may create a video.
As you can see you will need at least some basic knowledge of the *reST* system in order to complete the task at hand with success. However, don't worry, *reST* (and *Sphinx*) was made with simplicity in mind. It is easy to grasp its basics. I found that the `OpenAlea documentations introduction on this subject <http://openalea.gforge.inria.fr/doc/openalea/doc/_build/html/source/tutorial/rest_syntax.html>`_ (or the `Thomas Cokelaer one <http://thomas-cokelaer.info/tutorials/sphinx/rest_syntax.html>`_ ) should be enough for this. If for some directive or feature you need a more in-depth description look it up in the official |reST|_ help files or at the |Sphinx|_ documentation.
In our world achieving some tasks is possible in multiple ways. However, some of the roads to take may have obvious or hidden advantages over others. Then again, in some other cases it may come down to just simple user preference. Here, I'll present how I decided to write the tutorials, based on my personal experience. If for some of them you know a better solution and you can back it up feel free to use that. I've nothing against it, as long as it gets the job done in an elegant fashion.
Now the best would be if you could make the integration yourself. For this you need first to have the source code. I recommend following the guides for your operating system on acquiring OpenCV sources. For Linux users look :ref:`here <Linux-Installation>` and for :ref:`Windows here <Windows_Installation>`. You must also install python and sphinx with its dependencies in order to be able to build the documentation.
Once you have downloaded the repository to your hard drive you can take a look in the OpenCV directory to make sure you have both the samples and doc folder present. Anyone may download the latest source files from :file:`git://github.com/Itseez/opencv.git` . Nevertheless, not everyone has upload (commit/submit) rights. This is to protect the integrity of the library. If you plan doing more than one tutorial, and would like to have an account with commit user rights you should first register an account at http://code.opencv.org/ and then contact OpenCV administrator -delete-admin@-delete-opencv.org. Otherwise, you can just send the resulting files to us at -delete-admin@-delete-opencv.org and we'll add it.
Format the Source Code
======================
Before I start this let it be clear: the main goal is to have a working sample code. However, for your tutorial to be of a top notch quality you should follow a few guide lines I am going to present here. In case you have an application by using the older interface (with *IplImage*, *cvMat*, *cvLoadImage* and such) consider migrating it to the new C++ interface. The tutorials are intended to be an up to date help for our users. And as of OpenCV 2 the OpenCV emphasis on using the less error prone and clearer C++ interface. Therefore, if possible please convert your code to the C++ interface. For this it may help to read the :ref:`InteroperabilityWithOpenCV1` tutorial. However, once you have an OpenCV 2 working code, then you should make your source code snippet as easy to read as possible. Here're a couple of advices for this:
.. container:: enumeratevisibleitemswithsquare
+ Add a standard output with the description of what your program does. Keep it short and yet, descriptive. This output is at the start of the program. In my example files this usually takes the form of a *help* function containing the output. This way both the source file viewer and application runner can see what all is about in your sample. Here's an instance of this:
.. code-block:: cpp
void help()
{
cout
<< "--------------------------------------------------------------------------" << endl
<< "This program shows how to write video files. You can extract the R or G or B color channel "
<< " of the input video. You can choose to use the source codec (Y) or select a custom one. (N)"<< endl
<< "Usage:" << endl
<< "./video-write inputvideoName [ R | G | B] [Y | N]" << endl
<< "--------------------------------------------------------------------------" << endl
<< endl;
}
// ...
int main(int argc, char *argv[], char *window_name)
{
help();
// here comes the actual source code
}
Additionally, finalize the description with a short usage guide. This way the user will know how to call your programs, what leads us to the next point.
+ Prefer command line argument controlling instead of hard coded one. If your program has some variables that may be changed use command line arguments for this. The tutorials, can be a simple try-out ground for the user. If you offer command line controlling for the input image (for example), then you offer the possibility for the user to try it out with his/her own images, without the need to mess in the source code. In the upper example you can see that the input image, channel and codec selection may all be changed from the command line. Just compile the program and run it with your own input arguments.
+ Be as verbose as possible. There is no shame in filling the source code with comments. This way the more advanced user may figure out what's happening right from the sample code. This advice goes for the output console too. Specify to the user what's happening. Never leave the user hanging there and thinking on: "Is this program now crashing or just doing some computationally intensive task?." So, if you do a training task that may take some time, make sure you print out a message about this before starting and after finishing it.
+ Throw out unnecessary stuff from your source code. This is a warning to not take the previous point too seriously. Balance is the key. If it's something that can be done in a fewer lines or simpler than that's the way you should do it. Nevertheless, if for some reason you have such sections notify the user why you have chosen to do so. Keep the amount of information as low as possible, while still getting the job done in an elegant way.
+ Put your sample file into the :file:`opencv/samples/cpp/tutorial_code/sectionName` folder. If you write a tutorial for other languages than cpp, then change that part of the path. Before completing this you need to decide that to what section (module) does your tutorial goes. Think about on what module relies most heavily your code and that is the one to use. If the answer to this question is more than one modules then the *general* section is the one to use. For finding the *opencv* directory open up your file system and navigate where you downloaded our repository.
+ If the input resources are hard to acquire for the end user consider adding a few of them to the :file:`opencv/samples/cpp/tutorial_code/images`. Make sure that who reads your code can try it out!
Add the TOC entry
=================
For this you will need to know some |reST|_. There is no going around this. |reST|_ files have **rst** extensions. However, these are simple text files. Use any text editor you like. Finding a text editor that offers syntax highlighting for |reST|_ was quite a challenge at the time of writing this tutorial. In my experience, `Intype <http://intype.info/>`_ is a solid option on Windows, although there is still place for improvement.
Adding your source code to a table of content is important for multiple reasons. First and foremost this will allow for the user base to find your tutorial from our websites tutorial table of content. Secondly, if you omit this *Sphinx* will throw a warning that your tutorial file isn't part of any TOC tree entry. And there is nothing more than the developer team hates than an ever increasing warning/error list for their builds. *Sphinx* also uses this to build up the previous-back-up buttons on the website. Finally, omitting this step will lead to that your tutorial will **not** be added to the PDF version of the tutorials.
Navigate to the :file:`opencv/doc/tutorials/section/table_of_content_section` folder (where the section is the module to which you're adding the tutorial). Open the *table_of_content_section* file. Now this may have two forms. If no prior tutorials are present in this section that there is a template message about this and has the following form:
.. code-block:: rst
.. _Table-Of-Content-Section:
Section title
-----------------------------------------------------------
Description about the section.
.. include:: ../../definitions/noContent.rst
.. raw:: latex
\pagebreak
The first line is a reference to the section title in the reST system. The section title will be a link and you may refer to it via the ``:ref:`` directive. The *include* directive imports the template text from the definitions directories *noContent.rst* file. *Sphinx* does not create the PDF from scratch. It does this by first creating a latex file. Then it creates the PDF from the latex file. With the *raw* directive you can directly add commands to this output. Its unique argument is for what kind of output to add the content of the directive. For the PDFs it may happen that multiple sections will overlap on a single page. To avoid this at the end of the TOC we add a *pagebreak* latex command, that hints to the LATEX system that the next line should be on a new page.
If you have one of this, try to transform it to the following form:
.. include:: ../../definitions/tocDefinitions.rst
.. code-block:: rst
.. _Table-Of-Content-Section:
Section title
-----------------------------------------------------------
.. include:: ../../definitions/tocDefinitions.rst
+
.. tabularcolumns:: m{100pt} m{300pt}
.. cssclass:: toctableopencv
=============== ======================================================
|MatBasicIma| **Title:** :ref:`matTheBasicImageContainer`
*Compatibility:* > OpenCV 2.0
*Author:* |Author_BernatG|
You will learn how to store images in the memory and how to print out their content to the console.
=============== ======================================================
.. |MatBasicIma| image:: images/matTheBasicImageStructure.jpg
:height: 90pt
:width: 90pt
.. raw:: latex
\pagebreak
.. toctree::
:hidden:
../mat - the basic image container/mat - the basic image container
If this is already present just add a new section of the content between the include and the raw directives (excluding those lines). Here you'll see a new include directive. This should be present only once in a TOC tree and the reST file contains the definitions of all the authors contributing to the OpenCV tutorials. We are a multicultural community and some of our name may contain some funky characters. However, reST **only supports** ANSI characters. Luckily we can specify Unicode characters with the *unicode* directive. Doing this for all of your tutorials is a troublesome procedure. Therefore, the tocDefinitions file contains the definition of your author name. Add it here once and afterwards just use the replace construction. For example here's the definition for my name:
.. code-block:: rst
.. |Author_BernatG| unicode:: Bern U+00E1 t U+0020 G U+00E1 bor
The ``|Author_BernatG|`` is the text definitions alias. I can use later this to add the definition, like I've done in the TOCs *Author* part. After the ``::`` and a space you start the definition. If you want to add a UNICODE character (non-ASCII) leave an empty space and specify it in the format U+(UNICODE code). To find the UNICODE code of a character I recommend using the `FileFormat <http://www.fileformat.info>`_ websites service. Spaces are trimmed from the definition, therefore we add a space by its UNICODE character (U+0020).
Until the *raw* directive what you can see is a TOC tree entry. Here's how a TOC entry will look like:
+
.. tabularcolumns:: m{100pt} m{300pt}
.. cssclass:: toctableopencv
=============== ======================================================
|MatBasicIma| **Title:** :ref:`matTheBasicImageContainer`
*Compatibility:* > OpenCV 2.0
*Author:* |Author_BernatG|
You will learn how to store images in the memory and how to print out their content to the console.
=============== ======================================================
.. |MatBasicIma| image:: images/matTheBasicImageStructure.jpg
:height: 90pt
:width: 90pt
As you can see we have an image to the left and a description box to the right. To create two boxes we use a table with two columns and a single row. In the left column is the image and in the right one the description. However, the image directive is way too long to fit in a column. Therefore, we need to use the substitution definition system. We add this definition after the TOC tree. All images for the TOC tree are to be put in the images folder near its |reST|_ file. We use the point measurement system because we are also creating PDFs. PDFs are printable documents, where there is no such thing as pixels (px), just points (pt). And while generally space is no problem for web pages (we have monitors with **huge** resolutions) the size of the paper (A4 or letter) is constant and will be for a long time in the future. Therefore, size constraints come into play more for the PDF, than the generated HTML code.
Now your images should be as small as possible, while still offering the intended information for the user. Remember that the tutorial will become part of the OpenCV source code. If you add large images (that manifest in form of large image size) it will just increase the size of the repository pointlessly. If someone wants to download it later, its download time will be that much longer. Not to mention the larger PDF size for the tutorials and the longer load time for the web pages. In terms of pixels a TOC image should not be larger than 120 X 120 pixels. Resize your images if they are larger!
.. note::
If you add a larger image and specify a smaller image size, *Sphinx* will not resize that. At build time will add the full size image and the resize will be done by your browser after the image is loaded. A 120 X 120 image is somewhere below 10KB. If you add a 110KB image, you have just pointlessly added a 100KB extra data to transfer over the internet for every user!
Generally speaking you shouldn't need to specify your image's size (excluding the TOC entries). If no such is found *Sphinx* will use the size of the image itself (so no resize occurs). Then again if for some reason you decide to specify a size that should be the **width** of the image rather than its height. The reason for this again goes back to the PDFs. On a PDF page the height is larger than the width. In the PDF the images will not be resized. If you specify a size that does not fit in the page, then what does not fit in **will be cut off**. When creating your images for your tutorial you should try to keep the image widths below 500 pixels, and calculate with around 400 point page width when specifying image widths.
The image format depends on the content of the image. If you have some complex scene (many random like colors) then use *jpg*. Otherwise, prefer using *png*. There are even some tools out there that optimize the size of *PNG* images, such as `PNGGauntlet <http://pnggauntlet.com/>`_. Use them to make your images as small as possible in size.
Now on the right side column of the table we add the information about the tutorial:
.. container:: enumeratevisibleitemswithsquare
+ In the first line it is the title of the tutorial. However, there is no need to specify it explicitly. We use the reference system. We'll start up our tutorial with a reference specification, just like in case of this TOC entry with its `` .. _Table-Of-Content-Section:`` . If after this you have a title (pointed out by the following line of -), then Sphinx will replace the ``:ref:`Table-Of-Content-Section``` directive with the title of the section in reference form (creates a link in web page). Here's how the definition looks in my case:
.. code-block:: rst
.. _matTheBasicImageContainer:
Mat - The Basic Image Container
*******************************
Note, that according to the |reST|_ rules the * should be as long as your title.
+ Compatibility. What version of OpenCV is required to run your sample code.
+ Author. Use the substitution markup of |reST|_.
+ A short sentence describing the essence of your tutorial.
Now before each TOC entry you need to add the three lines of:
.. code-block:: cpp
+
.. tabularcolumns:: m{100pt} m{300pt}
.. cssclass:: toctableopencv
The plus sign (+) is to enumerate tutorials by using bullet points. So for every TOC entry we have a corresponding bullet point represented by the +. Sphinx is highly indenting sensitive. Indentation is used to express from which point until to which point does a construction last. Un-indentation means end of that construction. So to keep all the bullet points to the same group the following TOC entries (until the next +) should be indented by two spaces.
Here, I should also mention that **always** prefer using spaces instead of tabs. Working with only spaces makes it possible that if we both use monotype fonts we will see the same thing. Tab size is text editor dependent and as such should be avoided. *Sphinx* translates all tabs into 8 spaces before interpreting it.
It turns out that the automatic formatting of both the HTML and PDF(LATEX) system messes up our tables. Therefore, we need to help them out a little. For the PDF generation we add the ``.. tabularcolumns:: m{100pt} m{300pt}`` directive. This means that the first column should be 100 points wide and middle aligned. For the HTML look we simply name the following table of a *toctableopencv* class type. Then, we can modify the look of the table by modifying the CSS of our web page. The CSS definitions go into the :file:`opencv/doc/_themes/blue/static/default.css_t` file.
.. code-block:: css
.toctableopencv
{
width: 100% ;
table-layout: fixed;
}
.toctableopencv colgroup col:first-child
{
width: 100pt !important;
max-width: 100pt !important;
min-width: 100pt !important;
}
.toctableopencv colgroup col:nth-child(2)
{
width: 100% !important;
}
However, you should not need to modify this. Just add these three lines (plus keep the two space indentation) for all TOC entries you add. At the end of the TOC file you'll find:
.. code-block:: rst
.. raw:: latex
\pagebreak
.. toctree::
:hidden:
../mat - the basic image container/mat - the basic image container
The page break entry comes for separating sections and there should be only one in a TOC tree |reST|_ file. Finally, at the end of the TOC tree we need to add our tutorial to the *Sphinx* TOC tree system. *Sphinx* will generate from this the previous-next-up information for the HTML file and add items to the PDF according to the order here. By default this TOC tree directive generates a simple table of contents. However, we already created a fancy looking one so we no longer need this basic one. Therefore, we add the *hidden* option so that it is not shown.
The path is of a relative type. We step back in the file system and then go into the :file:`mat - the basic image container` directory for the :file:`mat - the basic image container.rst` file. Putting out the *rst* extension for the file is optional.
Write the tutorial
==================
Create a folder with the name of your tutorial. Preferably, use small letters only. Then create a text file in this folder with *rst* extension and the same name. If you have images for the tutorial create an :file:`images` folder and add your images there. When creating your images follow the guidelines described in the previous part!
Now here's our recommendation for the structure of the tutorial (although, remember that this is not carved in the stone; if you have a better idea, use it!):
.. container:: enumeratevisibleitemswithsquare
+ Create the reference point and the title.
.. code-block:: rst
.. _matTheBasicImageContainer:
Mat - The Basic Image Container
*******************************
You start the tutorial by specifying a reference point by the ``.. _matTheBasicImageContainer:`` and then its title. The name of the reference point should be a unique one over the whole documentation. Therefore, do not use general names like *tutorial1*. Use the * character to underline the title for its full width. The subtitles of the tutorial should be underlined with the = character.
+ Goals. You start your tutorial by specifying what you will present. You can also enumerate the sub jobs to be done. For this you can use a bullet point construction. There is a single configuration file for both the reference manual and the tutorial documentation. In the reference manuals at the argument enumeration we do not want any kind of bullet point style enumeration. Therefore, by default all the bullet points at this level are set to not show the dot before the entries in the HTML. You can override this by putting the bullet point in a container. I've defined a square type bullet point view under the name *enumeratevisibleitemswithsquare*. The CSS style definition for this is again in the :file:`opencv\doc\_themes\blue\static\default.css_t` file. Here's a quick example of using it:
.. code-block:: rst
.. container:: enumeratevisibleitemswithsquare
+ Create the reference point and the title.
+ Second entry
+ Third entry
Note that you need to keep the indentation of the container directive. Directive indentations are always three (3) spaces. Here you may even give usage tips for your sample code.
+ Source code. Present your samples code to the user. It's a good idea to offer a quick download link for the HTML page by using the *download* directive and pointing out where the user may find your source code in the file system by using the *file* directive:
.. code-block:: rst
Text :file:`samples/cpp/tutorial_code/highgui/video-write/` folder of the OpenCV source library
or :download:`text to appear in the webpage
<../../../../samples/cpp/tutorial_code/HighGUI/video-write/video-write.cpp>`.
For the download link the path is a relative one, hence the multiple back stepping operations (..). Then you can add the source code either by using the *code block* directive or the *literal include* one. In case of the code block you will need to actually add all the source code text into your |reST|_ text and also apply the required indentation:
.. code-block:: rst
.. code-block:: cpp
int i = 0;
l = ++j;
The only argument of the directive is the language used (here CPP). Then you add the source code into its content (meaning one empty line after the directive) by keeping the indentation of the directive (3 spaces). With the *literal include* directive you do not need to add the source code of the sample. You just specify the sample and *Sphinx* will load it for you, during build time. Here's an example usage:
.. code-block:: rst
.. literalinclude:: ../../../../samples/cpp/tutorial_code/HighGUI/video-write/video-write.cpp
:language: cpp
:linenos:
:tab-width: 4
:lines: 1-8, 21-23, 25-
After the directive you specify a relative path to the file from which to import. It has four options: the language to use, if you add the ``:linenos:`` the line numbers will be shown, you can specify the tab size with the ``:tab-width:`` and you do not need to load the whole file, you can show just the important lines. Use the *lines* option to not show redundant information (such as the *help* function). Here basically you specify ranges, if the second range line number is missing then that means until the end of the file. The ranges specified here do not need to be in an ascending order, you may even reorganize the structure of how you want to show your sample inside the tutorial.
+ The tutorial. Well here goes the explanation for why and what have you used. Try to be short, clear, concise and yet a thorough one. There's no magic formula. Look into a few already made tutorials and start out from there. Try to mix sample OpenCV code with your explanations. If with words is hard to describe something do not hesitate to add in a reasonable size image, to overcome this issue.
When you present OpenCV functionality it's a good idea to give a link to the used OpenCV data structure or function. Because the OpenCV tutorials and reference manual are in separate PDF files it is not possible to make this link work for the PDF format. Therefore, we use here only web page links to the http://docs.opencv.org website. The OpenCV functions and data structures may be used for multiple tasks. Nevertheless, we want to avoid that every user creates their own reference to a commonly used function. So for this we use the global link collection of *Sphinx*. This is defined in the :file:`opencv/doc/conf.py` configuration file. Open it and go all the way down to the last entry:
.. code-block:: py
# ---- External links for tutorials -----------------
extlinks = {
'rwimg' : ('http://docs.opencv.org/modules/imgcodecs/doc/reading_and_writing_images.html#%s', None)
}
In short here we defined a new **rwimg** directive that refers to an external webpage link. Its usage is:
.. code-block:: rst
A sample function of the highgui modules image write and read page is the :rwimg:`imread() function <imread>`.
Which turns to: A sample function of the highgui modules image write and read page is the :rwimg:`imread() function <imread>`. The argument you give between the <> will be put in place of the ``%s`` in the upper definition, and as the link will anchor to the correct function. To find out the anchor of a given function just open up a web page, search for the function and click on it. In the address bar it should appear like: ``http://docs.opencv.org/modules/highgui/doc/reading_and_writing_images.html#imread`` . Look here for the name of the directives for each page of the OpenCV reference manual. If none present for one of them feel free to add one for it.
For formulas you can add LATEX code that will translate in the web pages into images. You do this by using the *math* directive. A usage tip:
.. code-block:: latex
.. math::
MSE = \frac{1}{c*i*j} \sum{(I_1-I_2)^2}
That after build turns into:
.. math::
MSE = \frac{1}{c*i*j} \sum{(I_1-I_2)^2}
You can even use it inline as ``:math:` MSE = \frac{1}{c*i*j} \sum{(I_1-I_2)^2}``` that turns into :math:`MSE = \frac{1}{c*i*j} \sum{(I_1-I_2)^2}`.
If you use some crazy LATEX library extension you need to add those to the ones to use at build time. Look into the :file:`opencv/doc/conf.py` configuration file for more information on this.
+ Results. Well, here depending on your program show one or more of the following:
- Console outputs by using the code block directive.
- Output images.
- Runtime videos, visualization. For this use your favorite screens capture software. `Camtasia Studio <http://www.techsmith.com/camtasia/>`_ certainly is one of the better choices, however their prices are out of this world. `CamStudio <http://camstudio.org/>`_ is a free alternative, but less powerful. If you do a video you can upload it to YouTube and then use the raw directive with HTML option to embed it into the generated web page:
.. code-block:: rst
You may observe a runtime instance of this on the `YouTube here <https://www.youtube.com/watch?v=jpBwHxsl1_0>`_.
.. raw:: html
<div align="center">
<iframe title="Creating a video with OpenCV" width="560" height="349" src="http://www.youtube.com/embed/jpBwHxsl1_0?rel=0&loop=1" frameborder="0" allowfullscreen align="middle"></iframe>
</div>
This results in the text and video: You may observe a runtime instance of this on the `YouTube here <https://www.youtube.com/watch?v=jpBwHxsl1_0>`_.
.. raw:: html
<div align="center">
<iframe title="Creating a video with OpenCV" width="560" height="349" src="http://www.youtube.com/embed/jpBwHxsl1_0?rel=0&loop=1" frameborder="0" allowfullscreen align="middle"></iframe>
</div>
When these aren't self-explanatory make sure to throw in a few guiding lines about what and why we can see.
+ Build the documentation and check for errors or warnings. In the CMake make sure you check or pass the option for building documentation. Then simply build the **docs** project for the PDF file and the **docs_html** project for the web page. Read the output of the build and check for errors/warnings for what you have added. This is also the time to observe and correct any kind of *not so good looking* parts. Remember to keep clean our build logs.
+ Read again your tutorial and check for both programming and spelling errors. If found any, please correct them.
Take home the pride and joy of a job well done!
===============================================
Once you are done please make a GitHub pull request with the tutorial. Now, to see
your work **live** you may need to wait some time. The PDFs are updated usually at
the launch of a new OpenCV version. The web pages are a little more diverse. They are
automatically rebuilt nightly. Currently we use ``2.4`` and ``master`` branches for
daily builds. So, if your pull request was merged to any of these branches, your
material will be published at `docs.opencv.org/2.4 <http://docs.opencv.org/2.4>`_ or
`docs.opencv.org/master <http://docs.opencv.org/master>`_ correspondingly. Everything
that was added to ``2.4`` is merged to ``master`` branch every week. Although, we try
to make a build every night, occasionally we might freeze any of the branches to fix
upcoming issues. During this it may take a little longer to see your work online,
however if you submitted it, be sure that eventually it will show up.
If you have any questions or advice relating to this tutorial you can contact us at
-delete-admin@-delete-opencv.org (delete the -delete- parts of that email address).

Binary file not shown.

Before

Width:  |  Height:  |  Size: 6.1 KiB

View File

@ -6,140 +6,138 @@ Additionally you can find very basic sample source code to introduce you to the
- @subpage tutorial_linux_install
*Compatibility:* \> OpenCV 2.0
_Compatibility:_ \> OpenCV 2.0
*Author:* Ana Huamán
_Author:_ Ana Huamán
We will learn how to setup OpenCV in your computer!
- @subpage tutorial_linux_gcc_cmake
*Compatibility:* \> OpenCV 2.0
_Compatibility:_ \> OpenCV 2.0
*Author:* Ana Huamán
_Author:_ Ana Huamán
We will learn how to compile your first project using gcc and CMake
- @subpage tutorial_linux_eclipse
*Compatibility:* \> OpenCV 2.0
_Compatibility:_ \> OpenCV 2.0
*Author:* Ana Huamán
_Author:_ Ana Huamán
We will learn how to compile your first project using the Eclipse environment
- @subpage tutorial_windows_install
*Compatibility:* \> OpenCV 2.0
_Compatibility:_ \> OpenCV 2.0
*Author:* Bernát Gábor
_Author:_ Bernát Gábor
You will learn how to setup OpenCV in your Windows Operating System!
- @subpage tutorial_windows_visual_studio_Opencv
*Compatibility:* \> OpenCV 2.0
_Compatibility:_ \> OpenCV 2.0
*Author:* Bernát Gábor
_Author:_ Bernát Gábor
You will learn what steps you need to perform in order to use the OpenCV library inside a new
Microsoft Visual Studio project.
- @subpage tutorial_windows_visual_studio_image_watch
*Compatibility:* \>= OpenCV 2.4
_Compatibility:_ \>= OpenCV 2.4
*Author:* Wolf Kienzle
_Author:_ Wolf Kienzle
You will learn how to visualize OpenCV matrices and images within Visual Studio 2012.
- @subpage tutorial_java_dev_intro
*Compatibility:* \> OpenCV 2.4.4
_Compatibility:_ \> OpenCV 2.4.4
*Authors:* Eric Christiansen and Andrey Pavlenko
_Authors:_ Eric Christiansen and Andrey Pavlenko
Explains how to build and run a simple desktop Java application using Eclipse, Ant or the
Simple Build Tool (SBT).
- @subpage tutorial_java_eclipse
*Compatibility:* \> OpenCV 2.4.4
_Compatibility:_ \> OpenCV 2.4.4
*Author:* Barış Evrim Demiröz
_Author:_ Barış Evrim Demiröz
A tutorial on how to use OpenCV Java with Eclipse.
- @subpage tutorial_clojure_dev_intro
*Compatibility:* \> OpenCV 2.4.4
_Compatibility:_ \> OpenCV 2.4.4
*Author:* Mimmo Cosenza
_Author:_ Mimmo Cosenza
A tutorial on how to interactively use OpenCV from the Clojure REPL.
- @subpage tutorial_android_dev_intro
*Compatibility:* \> OpenCV 2.4.2
_Compatibility:_ \> OpenCV 2.4.2
*Author:* Vsevolod Glumov
_Author:_ Vsevolod Glumov
Not a tutorial, but a guide introducing Android development basics and environment setup
- @subpage tutorial_O4A_SDK
*Compatibility:* \> OpenCV 2.4.2
_Compatibility:_ \> OpenCV 2.4.2
*Author:* Vsevolod Glumov
_Author:_ Vsevolod Glumov
OpenCV4Android SDK: general info, installation, running samples
- @subpage tutorial_dev_with_OCV_on_Android
*Compatibility:* \> OpenCV 2.4.3
_Compatibility:_ \> OpenCV 2.4.3
*Author:* Vsevolod Glumov
_Author:_ Vsevolod Glumov
Development with OpenCV4Android SDK
- @subpage tutorial_ios_install
*Compatibility:* \> OpenCV 2.4.2
_Compatibility:_ \> OpenCV 2.4.2
*Author:* Artem Myagkov, Eduard Feicho
_Author:_ Artem Myagkov, Eduard Feicho
We will learn how to setup OpenCV for using it in iOS!
- @subpage tutorial_arm_crosscompile_with_cmake
*Compatibility:* \> OpenCV 2.4.4
_Compatibility:_ \> OpenCV 2.4.4
*Author:* Alexander Smorkalov
_Author:_ Alexander Smorkalov
We will learn how to setup OpenCV cross compilation environment for ARM Linux.
- @subpage tutorial_display_image
*Compatibility:* \> OpenCV 2.0
_Compatibility:_ \> OpenCV 2.0
*Author:* Ana Huamán
_Author:_ Ana Huamán
We will learn how to display an image using OpenCV
- @subpage tutorial_load_save_image
*Compatibility:* \> OpenCV 2.0
_Compatibility:_ \> OpenCV 2.0
*Author:* Ana Huamán
_Author:_ Ana Huamán
We will learn how to save an Image in OpenCV...plus a small conversion to grayscale
- @subpage tutorial_how_to_write_a_tutorial
- @subpage tutorial_documentation
*Compatibility:* \> OpenCV 1.0
_Compatibility:_ \> OpenCV 3.0
*Author:* Bernát Gábor
_Author:_ Maksim Shabunin
If you already have a good grasp on using OpenCV and have made some projects that would be
perfect presenting an OpenCV feature not yet part of these tutorials, here it is what you
need to know.
This tutorial describes the new documentation process and some useful Doxygen features.

View File

@ -293,26 +293,6 @@ world of the OpenCV.
:height: 90pt
:width: 90pt
* **Want to contribute, and see your own work between the OpenCV tutorials?**
.. tabularcolumns:: m{100pt} m{300pt}
.. cssclass:: toctableopencv
=============== ======================================================
|HowToWriteT| **Title:** :ref:`howToWriteTutorial`
*Compatibility:* > OpenCV 1.0
*Author:* |Author_BernatG|
If you already have a good grasp on using OpenCV and have made some projects that would be perfect presenting an OpenCV feature not yet part of these tutorials, here it is what you need to know.
=============== ======================================================
.. |HowToWriteT| image:: images/how_to_write_a_tutorial.png
:height: 90pt
:width: 90pt
.. raw:: latex
\pagebreak
@ -337,4 +317,3 @@ world of the OpenCV.
../crosscompilation/arm_crosscompile_with_cmake
../display_image/display_image
../load_save_image/load_save_image
../how_to_write_a_tutorial/how_to_write_a_tutorial

View File

@ -1855,4 +1855,8 @@ namespace fisheye
} // cv
#ifndef DISABLE_OPENCV_24_COMPATIBILITY
#include "opencv2/calib3d/calib3d_c.h"
#endif
#endif

View File

@ -1902,7 +1902,7 @@ Performs the per-element multiplication of two Fourier spectrums.
:param dst: output array of the same size and type as ``src1`` .
:param flags: operation flags; currently, the only supported flag is ``DFT_ROWS``, which indicates that each row of ``src1`` and ``src2`` is an independent 1D Fourier spectrum.
:param flags: operation flags; currently, the only supported flag is ``DFT_ROWS``, which indicates that each row of ``src1`` and ``src2`` is an independent 1D Fourier spectrum. If you do not want to use this flag, then simply add a `0` as value.
:param conjB: optional flag that conjugates the second input array before the multiplication (true) or not (false).

View File

@ -1952,7 +1952,7 @@ arrays are real, they are assumed to be CCS-packed (see dft for details).
@param b second input array of the same size and type as src1 .
@param c output array of the same size and type as src1 .
@param flags operation flags; currently, the only supported flag is cv::DFT_ROWS, which indicates that
each row of src1 and src2 is an independent 1D Fourier spectrum.
each row of src1 and src2 is an independent 1D Fourier spectrum. If you do not want to use this flag, then simply add a `0` as value.
@param conjB optional flag that conjugates the second input array before the multiplication (true)
or not (false).
*/

View File

@ -705,7 +705,7 @@ namespace ogl
namespace cuda
{
class CV_EXPORTS GpuMat;
class CV_EXPORTS CudaMem;
class CV_EXPORTS HostMem;
class CV_EXPORTS Stream;
class CV_EXPORTS Event;
}

View File

@ -51,12 +51,25 @@
#include "opencv2/core.hpp"
#include "opencv2/core/cuda_types.hpp"
/**
@defgroup cuda CUDA-accelerated Computer Vision
@{
@defgroup cudacore Core part
@{
    @defgroup cudacore_init Initialization and Information
@defgroup cudacore_struct Data Structures
@}
@}
*/
namespace cv { namespace cuda {
//! @addtogroup cuda_struct
//! @addtogroup cudacore_struct
//! @{
//////////////////////////////// GpuMat ///////////////////////////////
//===================================================================================
// GpuMat
//===================================================================================
/** @brief Base storage class for GPU memory with reference counting.
@ -314,13 +327,13 @@ The function does not reallocate memory if the matrix has proper attributes alre
*/
CV_EXPORTS void ensureSizeIsEnough(int rows, int cols, int type, OutputArray arr);
CV_EXPORTS GpuMat allocMatFromBuf(int rows, int cols, int type, GpuMat& mat);
//! BufferPool management (must be called before Stream creation)
CV_EXPORTS void setBufferPoolUsage(bool on);
CV_EXPORTS void setBufferPoolConfig(int deviceId, size_t stackSize, int stackCount);
//////////////////////////////// CudaMem ////////////////////////////////
//===================================================================================
// HostMem
//===================================================================================
/** @brief Class with reference counting wrapping special memory type allocation functions from CUDA.
@ -337,43 +350,45 @@ Its interface is also Mat-like but with additional memory type parameters.
@note Allocation size of such memory types is usually limited. For more details, see *CUDA 2.2
Pinned Memory APIs* document or *CUDA C Programming Guide*.
*/
class CV_EXPORTS CudaMem
class CV_EXPORTS HostMem
{
public:
enum AllocType { PAGE_LOCKED = 1, SHARED = 2, WRITE_COMBINED = 4 };
explicit CudaMem(AllocType alloc_type = PAGE_LOCKED);
static MatAllocator* getAllocator(AllocType alloc_type = PAGE_LOCKED);
CudaMem(const CudaMem& m);
explicit HostMem(AllocType alloc_type = PAGE_LOCKED);
CudaMem(int rows, int cols, int type, AllocType alloc_type = PAGE_LOCKED);
CudaMem(Size size, int type, AllocType alloc_type = PAGE_LOCKED);
HostMem(const HostMem& m);
HostMem(int rows, int cols, int type, AllocType alloc_type = PAGE_LOCKED);
HostMem(Size size, int type, AllocType alloc_type = PAGE_LOCKED);
//! creates from host memory with copying data
explicit CudaMem(InputArray arr, AllocType alloc_type = PAGE_LOCKED);
explicit HostMem(InputArray arr, AllocType alloc_type = PAGE_LOCKED);
~CudaMem();
~HostMem();
CudaMem& operator =(const CudaMem& m);
HostMem& operator =(const HostMem& m);
//! swaps with other smart pointer
void swap(CudaMem& b);
void swap(HostMem& b);
//! returns deep copy of the matrix, i.e. the data is copied
CudaMem clone() const;
HostMem clone() const;
//! allocates new matrix data unless the matrix already has specified size and type.
void create(int rows, int cols, int type);
void create(Size size, int type);
//! creates alternative CudaMem header for the same data, with different
//! creates alternative HostMem header for the same data, with different
//! number of channels and/or different number of rows
CudaMem reshape(int cn, int rows = 0) const;
HostMem reshape(int cn, int rows = 0) const;
//! decrements reference counter and released memory if needed.
void release();
//! returns matrix header with disabled reference counting for CudaMem data.
//! returns matrix header with disabled reference counting for HostMem data.
Mat createMatHeader() const;
/** @brief Maps CPU memory to GPU address space and creates the cuda::GpuMat header without reference counting
@ -422,7 +437,9 @@ CV_EXPORTS void registerPageLocked(Mat& m);
*/
CV_EXPORTS void unregisterPageLocked(Mat& m);
///////////////////////////////// Stream //////////////////////////////////
//===================================================================================
// Stream
//===================================================================================
/** @brief This class encapsulates a queue of asynchronous calls.
@ -479,6 +496,7 @@ private:
friend struct StreamAccessor;
friend class BufferPool;
friend class DefaultDeviceInitializer;
};
class CV_EXPORTS Event
@ -514,11 +532,13 @@ private:
friend struct EventAccessor;
};
//! @} cuda_struct
//! @} cudacore_struct
//////////////////////////////// Initialization & Info ////////////////////////
//===================================================================================
// Initialization & Info
//===================================================================================
//! @addtogroup cuda_init
//! @addtogroup cudacore_init
//! @{
/** @brief Returns the number of installed CUDA-enabled devices.
@ -558,7 +578,9 @@ enum FeatureSet
FEATURE_SET_COMPUTE_20 = 20,
FEATURE_SET_COMPUTE_21 = 21,
FEATURE_SET_COMPUTE_30 = 30,
FEATURE_SET_COMPUTE_32 = 32,
FEATURE_SET_COMPUTE_35 = 35,
FEATURE_SET_COMPUTE_50 = 50,
GLOBAL_ATOMICS = FEATURE_SET_COMPUTE_11,
SHARED_ATOMICS = FEATURE_SET_COMPUTE_12,
@ -813,7 +835,7 @@ private:
CV_EXPORTS void printCudaDeviceInfo(int device);
CV_EXPORTS void printShortCudaDeviceInfo(int device);
//! @} cuda_init
//! @} cudacore_init
}} // namespace cv { namespace cuda {

View File

@ -50,7 +50,9 @@
namespace cv { namespace cuda {
//////////////////////////////// GpuMat ///////////////////////////////
//===================================================================================
// GpuMat
//===================================================================================
inline
GpuMat::GpuMat(Allocator* allocator_)
@ -145,6 +147,7 @@ void GpuMat::swap(GpuMat& b)
std::swap(datastart, b.datastart);
std::swap(dataend, b.dataend);
std::swap(refcount, b.refcount);
std::swap(allocator, b.allocator);
}
inline
@ -374,16 +377,18 @@ void swap(GpuMat& a, GpuMat& b)
a.swap(b);
}
//////////////////////////////// CudaMem ////////////////////////////////
//===================================================================================
// HostMem
//===================================================================================
inline
CudaMem::CudaMem(AllocType alloc_type_)
HostMem::HostMem(AllocType alloc_type_)
: flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(alloc_type_)
{
}
inline
CudaMem::CudaMem(const CudaMem& m)
HostMem::HostMem(const HostMem& m)
: flags(m.flags), rows(m.rows), cols(m.cols), step(m.step), data(m.data), refcount(m.refcount), datastart(m.datastart), dataend(m.dataend), alloc_type(m.alloc_type)
{
if( refcount )
@ -391,7 +396,7 @@ CudaMem::CudaMem(const CudaMem& m)
}
inline
CudaMem::CudaMem(int rows_, int cols_, int type_, AllocType alloc_type_)
HostMem::HostMem(int rows_, int cols_, int type_, AllocType alloc_type_)
: flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(alloc_type_)
{
if (rows_ > 0 && cols_ > 0)
@ -399,7 +404,7 @@ CudaMem::CudaMem(int rows_, int cols_, int type_, AllocType alloc_type_)
}
inline
CudaMem::CudaMem(Size size_, int type_, AllocType alloc_type_)
HostMem::HostMem(Size size_, int type_, AllocType alloc_type_)
: flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(alloc_type_)
{
if (size_.height > 0 && size_.width > 0)
@ -407,24 +412,24 @@ CudaMem::CudaMem(Size size_, int type_, AllocType alloc_type_)
}
inline
CudaMem::CudaMem(InputArray arr, AllocType alloc_type_)
HostMem::HostMem(InputArray arr, AllocType alloc_type_)
: flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(alloc_type_)
{
arr.getMat().copyTo(*this);
}
inline
CudaMem::~CudaMem()
HostMem::~HostMem()
{
release();
}
inline
CudaMem& CudaMem::operator =(const CudaMem& m)
HostMem& HostMem::operator =(const HostMem& m)
{
if (this != &m)
{
CudaMem temp(m);
HostMem temp(m);
swap(temp);
}
@ -432,7 +437,7 @@ CudaMem& CudaMem::operator =(const CudaMem& m)
}
inline
void CudaMem::swap(CudaMem& b)
void HostMem::swap(HostMem& b)
{
std::swap(flags, b.flags);
std::swap(rows, b.rows);
@ -446,86 +451,88 @@ void CudaMem::swap(CudaMem& b)
}
inline
CudaMem CudaMem::clone() const
HostMem HostMem::clone() const
{
CudaMem m(size(), type(), alloc_type);
HostMem m(size(), type(), alloc_type);
createMatHeader().copyTo(m);
return m;
}
inline
void CudaMem::create(Size size_, int type_)
void HostMem::create(Size size_, int type_)
{
create(size_.height, size_.width, type_);
}
inline
Mat CudaMem::createMatHeader() const
Mat HostMem::createMatHeader() const
{
return Mat(size(), type(), data, step);
}
inline
bool CudaMem::isContinuous() const
bool HostMem::isContinuous() const
{
return (flags & Mat::CONTINUOUS_FLAG) != 0;
}
inline
size_t CudaMem::elemSize() const
size_t HostMem::elemSize() const
{
return CV_ELEM_SIZE(flags);
}
inline
size_t CudaMem::elemSize1() const
size_t HostMem::elemSize1() const
{
return CV_ELEM_SIZE1(flags);
}
inline
int CudaMem::type() const
int HostMem::type() const
{
return CV_MAT_TYPE(flags);
}
inline
int CudaMem::depth() const
int HostMem::depth() const
{
return CV_MAT_DEPTH(flags);
}
inline
int CudaMem::channels() const
int HostMem::channels() const
{
return CV_MAT_CN(flags);
}
inline
size_t CudaMem::step1() const
size_t HostMem::step1() const
{
return step / elemSize1();
}
inline
Size CudaMem::size() const
Size HostMem::size() const
{
return Size(cols, rows);
}
inline
bool CudaMem::empty() const
bool HostMem::empty() const
{
return data == 0;
}
static inline
void swap(CudaMem& a, CudaMem& b)
void swap(HostMem& a, HostMem& b)
{
a.swap(b);
}
//////////////////////////////// Stream ///////////////////////////////
//===================================================================================
// Stream
//===================================================================================
inline
Stream::Stream(const Ptr<Impl>& impl)
@ -533,7 +540,9 @@ Stream::Stream(const Ptr<Impl>& impl)
{
}
//////////////////////////////// Initialization & Info ////////////////////////
//===================================================================================
// Initialization & Info
//===================================================================================
inline
bool TargetArchs::has(int major, int minor)
@ -592,7 +601,9 @@ bool DeviceInfo::supports(FeatureSet feature_set) const
}} // namespace cv { namespace cuda {
//////////////////////////////// Mat ////////////////////////////////
//===================================================================================
// Mat
//===================================================================================
namespace cv {

View File

@ -43,11 +43,14 @@
#ifndef __OPENCV_CUDA_DEVICE_BLOCK_HPP__
#define __OPENCV_CUDA_DEVICE_BLOCK_HPP__
/** @file
* @deprecated Use @ref cudev instead.
*/
//! @cond IGNORED
namespace cv { namespace cuda { namespace device
{
//! @addtogroup cuda
//! @{
struct Block
{
static __device__ __forceinline__ unsigned int id()
@ -201,7 +204,8 @@ namespace cv { namespace cuda { namespace device
}
}
};
//!@}
}}}
//! @endcond
#endif /* __OPENCV_CUDA_DEVICE_BLOCK_HPP__ */

View File

@ -47,11 +47,14 @@
#include "vec_traits.hpp"
#include "vec_math.hpp"
/** @file
* @deprecated Use @ref cudev instead.
*/
//! @cond IGNORED
namespace cv { namespace cuda { namespace device
{
//! @addtogroup cuda
//! @{
//////////////////////////////////////////////////////////////
// BrdConstant
@ -712,7 +715,8 @@ namespace cv { namespace cuda { namespace device
int width;
D val;
};
//! @}
}}} // namespace cv { namespace cuda { namespace cudev
//! @endcond
#endif // __OPENCV_CUDA_BORDER_INTERPOLATE_HPP__

View File

@ -45,10 +45,14 @@
#include "detail/color_detail.hpp"
/** @file
* @deprecated Use @ref cudev instead.
*/
//! @cond IGNORED
namespace cv { namespace cuda { namespace device
{
//! @addtogroup cuda
//! @{
// All OPENCV_CUDA_IMPLEMENT_*_TRAITS(ColorSpace1_to_ColorSpace2, ...) macros implements
// template <typename T> class ColorSpace1_to_ColorSpace2_traits
// {
@ -298,7 +302,8 @@ namespace cv { namespace cuda { namespace device
OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_lbgra, 4, 4, false, 0)
#undef OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS
//! @}
}}} // namespace cv { namespace cuda { namespace cudev
//! @endcond
#endif // __OPENCV_CUDA_BORDER_INTERPOLATE_HPP__

View File

@ -48,6 +48,11 @@
#include "opencv2/core/cvdef.h"
#include "opencv2/core/base.hpp"
/** @file
* @deprecated Use @ref cudev instead.
*/
//! @cond IGNORED
#ifndef CV_PI_F
#ifndef CV_PI
@ -58,14 +63,11 @@
#endif
namespace cv { namespace cuda {
//! @addtogroup cuda
//! @{
static inline void checkCudaError(cudaError_t err, const char* file, const int line, const char* func)
{
if (cudaSuccess != err)
cv::error(cv::Error::GpuApiCallError, cudaGetErrorString(err), func, file, line);
}
//! @}
}}
#ifndef cudaSafeCall
@ -74,8 +76,6 @@ namespace cv { namespace cuda {
namespace cv { namespace cuda
{
//! @addtogroup cuda
//! @{
template <typename T> static inline bool isAligned(const T* ptr, size_t size)
{
return reinterpret_cast<size_t>(ptr) % size == 0;
@ -85,15 +85,12 @@ namespace cv { namespace cuda
{
return step % size == 0;
}
//! @}
}}
namespace cv { namespace cuda
{
namespace device
{
//! @addtogroup cuda
//! @{
__host__ __device__ __forceinline__ int divUp(int total, int grain)
{
return (total + grain - 1) / grain;
@ -104,8 +101,9 @@ namespace cv { namespace cuda
cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );
}
//! @}
}
}}
//! @endcond
#endif // __OPENCV_CUDA_COMMON_HPP__

View File

@ -45,11 +45,14 @@
#include "common.hpp"
/** @file
* @deprecated Use @ref cudev instead.
*/
//! @cond IGNORED
namespace cv { namespace cuda { namespace device
{
//! @addtogroup cuda
//! @{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 200
// for Fermi memory space is detected automatically
@ -103,7 +106,8 @@ namespace cv { namespace cuda { namespace device
#undef OPENCV_CUDA_ASM_PTR
#endif // __CUDA_ARCH__ >= 200
//! @}
}}} // namespace cv { namespace cuda { namespace cudev
//! @endcond
#endif // __OPENCV_CUDA_DATAMOV_UTILS_HPP__

View File

@ -43,10 +43,14 @@
#ifndef __OPENCV_CUDA_DYNAMIC_SMEM_HPP__
#define __OPENCV_CUDA_DYNAMIC_SMEM_HPP__
/** @file
* @deprecated Use @ref cudev instead.
*/
//! @cond IGNORED
namespace cv { namespace cuda { namespace device
{
//! @addtogroup cuda
//! @{
template<class T> struct DynamicSharedMem
{
__device__ __forceinline__ operator T*()
@ -77,7 +81,8 @@ namespace cv { namespace cuda { namespace device
return (double*)__smem_d;
}
};
//! @}
}}}
//! @endcond
#endif // __OPENCV_CUDA_DYNAMIC_SMEM_HPP__

View File

@ -46,10 +46,14 @@
#include "common.hpp"
#include "warp_reduce.hpp"
/** @file
* @deprecated Use @ref cudev instead.
*/
//! @cond IGNORED
namespace cv { namespace cuda { namespace device
{
//! @addtogroup cuda
//! @{
struct Emulation
{
@ -258,7 +262,8 @@ namespace cv { namespace cuda { namespace device
}
};
}; //struct Emulation
//!@}
}}} // namespace cv { namespace cuda { namespace cudev
//! @endcond
#endif /* OPENCV_CUDA_EMULATION_HPP_ */

View File

@ -48,10 +48,14 @@
#include "vec_math.hpp"
#include "type_traits.hpp"
/** @file
* @deprecated Use @ref cudev instead.
*/
//! @cond IGNORED
namespace cv { namespace cuda { namespace device
{
//! @addtogroup cuda
//! @{
template <typename Ptr2D> struct PointFilter
{
typedef typename Ptr2D::elem_type elem_type;
@ -275,7 +279,8 @@ namespace cv { namespace cuda { namespace device
float scale_x, scale_y;
int width, haight;
};
//! @}
}}} // namespace cv { namespace cuda { namespace cudev
//! @endcond
#endif // __OPENCV_CUDA_FILTERS_HPP__

View File

@ -45,10 +45,14 @@
#include <cstdio>
/** @file
* @deprecated Use @ref cudev instead.
*/
//! @cond IGNORED
namespace cv { namespace cuda { namespace device
{
//! @addtogroup cuda
//! @{
template<class Func>
void printFuncAttrib(Func& func)
{
@ -68,7 +72,8 @@ namespace cv { namespace cuda { namespace device
printf("\n");
fflush(stdout);
}
//! @}
}}} // namespace cv { namespace cuda { namespace cudev
//! @endcond
#endif /* __OPENCV_CUDA_DEVICE_FUNCATTRIB_HPP_ */

View File

@ -49,10 +49,14 @@
#include "type_traits.hpp"
#include "device_functions.h"
/** @file
* @deprecated Use @ref cudev instead.
*/
//! @cond IGNORED
namespace cv { namespace cuda { namespace device
{
//! @addtogroup cuda
//! @{
// Function Objects
template<typename Argument, typename Result> struct unary_function : public std::unary_function<Argument, Result> {};
template<typename Argument1, typename Argument2, typename Result> struct binary_function : public std::binary_function<Argument1, Argument2, Result> {};
@ -786,7 +790,8 @@ namespace cv { namespace cuda { namespace device
#define OPENCV_CUDA_TRANSFORM_FUNCTOR_TRAITS(type) \
template <> struct TransformFunctorTraits< type > : DefaultTransformFunctorTraits< type >
//! @}
}}} // namespace cv { namespace cuda { namespace cudev
//! @endcond
#endif // __OPENCV_CUDA_FUNCTIONAL_HPP__

View File

@ -47,10 +47,14 @@
#include <float.h>
#include "common.hpp"
/** @file
* @deprecated Use @ref cudev instead.
*/
//! @cond IGNORED
namespace cv { namespace cuda { namespace device
{
//! @addtogroup cuda
//! @{
template <class T> struct numeric_limits;
template <> struct numeric_limits<bool>
@ -117,7 +121,8 @@ template <> struct numeric_limits<double>
__device__ __forceinline__ static double epsilon() { return DBL_EPSILON; }
static const bool is_signed = true;
};
//! @}
}}} // namespace cv { namespace cuda { namespace cudev {
//! @endcond
#endif // __OPENCV_CUDA_LIMITS_HPP__

View File

@ -47,10 +47,14 @@
#include "detail/reduce.hpp"
#include "detail/reduce_key_val.hpp"
/** @file
* @deprecated Use @ref cudev instead.
*/
//! @cond IGNORED
namespace cv { namespace cuda { namespace device
{
//! @addtogroup cuda
//! @{
template <int N, typename T, class Op>
__device__ __forceinline__ void reduce(volatile T* smem, T& val, unsigned int tid, const Op& op)
{
@ -194,7 +198,8 @@ namespace cv { namespace cuda { namespace device
{
return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6, (volatile T7*) t7, (volatile T8*) t8, (volatile T9*) t9);
}
//! @}
}}}
//! @endcond
#endif // __OPENCV_CUDA_UTILITY_HPP__

View File

@ -45,10 +45,14 @@
#include "common.hpp"
/** @file
* @deprecated Use @ref cudev instead.
*/
//! @cond IGNORED
namespace cv { namespace cuda { namespace device
{
//! @addtogroup cuda
//! @{
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(uchar v) { return _Tp(v); }
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(schar v) { return _Tp(v); }
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(ushort v) { return _Tp(v); }
@ -281,7 +285,8 @@ namespace cv { namespace cuda { namespace device
return saturate_cast<uint>((float)v);
#endif
}
//! @}
}}}
//! @endcond
#endif /* __OPENCV_CUDA_SATURATE_CAST_HPP__ */

View File

@ -48,10 +48,14 @@
#include "opencv2/core/cuda/warp.hpp"
#include "opencv2/core/cuda/warp_shuffle.hpp"
/** @file
* @deprecated Use @ref cudev instead.
*/
//! @cond IGNORED
namespace cv { namespace cuda { namespace device
{
//! @addtogroup cuda
//! @{
enum ScanKind { EXCLUSIVE = 0, INCLUSIVE = 1 };
template <ScanKind Kind, typename T, typename F> struct WarpScan
@ -247,7 +251,8 @@ namespace cv { namespace cuda { namespace device
return warpScanInclusive(idata, s_Data, tid);
}
}
//! @}
}}}
//! @endcond
#endif // __OPENCV_CUDA_SCAN_HPP__

View File

@ -76,57 +76,13 @@
#include "common.hpp"
/** @file
This header file contains inline functions that implement intra-word SIMD
operations, that are hardware accelerated on sm_3x (Kepler) GPUs. Efficient
emulation code paths are provided for earlier architectures (sm_1x, sm_2x)
to make the code portable across all GPUs supported by CUDA. The following
functions are currently implemented:
* @deprecated Use @ref cudev instead.
*/
vadd2(a,b) per-halfword unsigned addition, with wrap-around: a + b
vsub2(a,b) per-halfword unsigned subtraction, with wrap-around: a - b
vabsdiff2(a,b) per-halfword unsigned absolute difference: |a - b|
vavg2(a,b) per-halfword unsigned average: (a + b) / 2
vavrg2(a,b) per-halfword unsigned rounded average: (a + b + 1) / 2
vseteq2(a,b) per-halfword unsigned comparison: a == b ? 1 : 0
vcmpeq2(a,b) per-halfword unsigned comparison: a == b ? 0xffff : 0
vsetge2(a,b) per-halfword unsigned comparison: a >= b ? 1 : 0
vcmpge2(a,b) per-halfword unsigned comparison: a >= b ? 0xffff : 0
vsetgt2(a,b) per-halfword unsigned comparison: a > b ? 1 : 0
vcmpgt2(a,b) per-halfword unsigned comparison: a > b ? 0xffff : 0
vsetle2(a,b) per-halfword unsigned comparison: a <= b ? 1 : 0
vcmple2(a,b) per-halfword unsigned comparison: a <= b ? 0xffff : 0
vsetlt2(a,b) per-halfword unsigned comparison: a < b ? 1 : 0
vcmplt2(a,b) per-halfword unsigned comparison: a < b ? 0xffff : 0
vsetne2(a,b) per-halfword unsigned comparison: a != b ? 1 : 0
vcmpne2(a,b) per-halfword unsigned comparison: a != b ? 0xffff : 0
vmax2(a,b) per-halfword unsigned maximum: max(a, b)
vmin2(a,b) per-halfword unsigned minimum: min(a, b)
vadd4(a,b) per-byte unsigned addition, with wrap-around: a + b
vsub4(a,b) per-byte unsigned subtraction, with wrap-around: a - b
vabsdiff4(a,b) per-byte unsigned absolute difference: |a - b|
vavg4(a,b) per-byte unsigned average: (a + b) / 2
vavrg4(a,b) per-byte unsigned rounded average: (a + b + 1) / 2
vseteq4(a,b) per-byte unsigned comparison: a == b ? 1 : 0
vcmpeq4(a,b) per-byte unsigned comparison: a == b ? 0xff : 0
vsetge4(a,b) per-byte unsigned comparison: a >= b ? 1 : 0
vcmpge4(a,b) per-byte unsigned comparison: a >= b ? 0xff : 0
vsetgt4(a,b) per-byte unsigned comparison: a > b ? 1 : 0
vcmpgt4(a,b) per-byte unsigned comparison: a > b ? 0xff : 0
vsetle4(a,b) per-byte unsigned comparison: a <= b ? 1 : 0
vcmple4(a,b) per-byte unsigned comparison: a <= b ? 0xff : 0
vsetlt4(a,b) per-byte unsigned comparison: a < b ? 1 : 0
vcmplt4(a,b) per-byte unsigned comparison: a < b ? 0xff : 0
vsetne4(a,b) per-byte unsigned comparison: a != b ? 1: 0
vcmpne4(a,b) per-byte unsigned comparison: a != b ? 0xff: 0
vmax4(a,b) per-byte unsigned maximum: max(a, b)
vmin4(a,b) per-byte unsigned minimum: min(a, b)
*/
//! @cond IGNORED
namespace cv { namespace cuda { namespace device
{
//! @addtogroup cuda
//! @{
// 2
static __device__ __forceinline__ unsigned int vadd2(unsigned int a, unsigned int b)
@ -906,7 +862,8 @@ namespace cv { namespace cuda { namespace device
return r;
}
//! @}
}}}
//! @endcond
#endif // __OPENCV_CUDA_SIMD_FUNCTIONS_HPP__

View File

@ -47,10 +47,14 @@
#include "utility.hpp"
#include "detail/transform_detail.hpp"
/** @file
* @deprecated Use @ref cudev instead.
*/
//! @cond IGNORED
namespace cv { namespace cuda { namespace device
{
//! @addtogroup cuda
//! @{
template <typename T, typename D, typename UnOp, typename Mask>
static inline void transform(PtrStepSz<T> src, PtrStepSz<D> dst, UnOp op, const Mask& mask, cudaStream_t stream)
{
@ -64,7 +68,8 @@ namespace cv { namespace cuda { namespace device
typedef TransformFunctorTraits<BinOp> ft;
transform_detail::TransformDispatcher<VecTraits<T1>::cn == 1 && VecTraits<T2>::cn == 1 && VecTraits<D>::cn == 1 && ft::smart_shift != 1>::call(src1, src2, dst, op, mask, stream);
}
//! @}
}}}
//! @endcond
#endif // __OPENCV_CUDA_TRANSFORM_HPP__

View File

@ -45,10 +45,14 @@
#include "detail/type_traits_detail.hpp"
/** @file
* @deprecated Use @ref cudev instead.
*/
//! @cond IGNORED
namespace cv { namespace cuda { namespace device
{
//! @addtogroup cuda
//! @{
template <typename T> struct IsSimpleParameter
{
enum {value = type_traits_detail::IsIntegral<T>::value || type_traits_detail::IsFloat<T>::value ||
@ -79,7 +83,8 @@ namespace cv { namespace cuda { namespace device
typedef typename type_traits_detail::Select<IsSimpleParameter<UnqualifiedType>::value,
T, typename type_traits_detail::AddParameterType<T>::type>::type ParameterType;
};
//! @}
}}}
//! @endcond
#endif // __OPENCV_CUDA_TYPE_TRAITS_HPP__

View File

@ -46,10 +46,14 @@
#include "saturate_cast.hpp"
#include "datamov_utils.hpp"
/** @file
* @deprecated Use @ref cudev instead.
*/
//! @cond IGNORED
namespace cv { namespace cuda { namespace device
{
//! @addtogroup cuda
//! @{
#define OPENCV_CUDA_LOG_WARP_SIZE (5)
#define OPENCV_CUDA_WARP_SIZE (1 << OPENCV_CUDA_LOG_WARP_SIZE)
#define OPENCV_CUDA_LOG_MEM_BANKS ((__CUDA_ARCH__ >= 200) ? 5 : 4) // 32 banks on fermi, 16 on tesla
@ -210,7 +214,8 @@ namespace cv { namespace cuda { namespace device
return false;
}
//! @}
}}} // namespace cv { namespace cuda { namespace cudev
//! @endcond
#endif // __OPENCV_CUDA_UTILITY_HPP__

View File

@ -47,10 +47,14 @@
#include "functional.hpp"
#include "detail/vec_distance_detail.hpp"
/** @file
* @deprecated Use @ref cudev instead.
*/
//! @cond IGNORED
namespace cv { namespace cuda { namespace device
{
//! @addtogroup cuda
//! @{
template <typename T> struct L1Dist
{
typedef int value_type;
@ -221,7 +225,8 @@ namespace cv { namespace cuda { namespace device
U vec1Vals[MAX_LEN / THREAD_DIM];
};
//! @}
}}} // namespace cv { namespace cuda { namespace cudev
//! @endcond
#endif // __OPENCV_CUDA_VEC_DISTANCE_HPP__

View File

@ -46,12 +46,15 @@
#include "vec_traits.hpp"
#include "saturate_cast.hpp"
/** @file
* @deprecated Use @ref cudev instead.
*/
//! @cond IGNORED
namespace cv { namespace cuda { namespace device
{
//! @addtogroup cuda
//! @{
// saturate_cast
namespace vec_math_detail
@ -920,8 +923,8 @@ CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2, double, double, double)
#undef CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC
//! @}
}}} // namespace cv { namespace cuda { namespace device
//! @endcond
#endif // __OPENCV_CUDA_VECMATH_HPP__

View File

@ -45,10 +45,14 @@
#include "common.hpp"
/** @file
* @deprecated Use @ref cudev instead.
*/
//! @cond IGNORED
namespace cv { namespace cuda { namespace device
{
//! @addtogroup cuda
//! @{
template<typename T, int N> struct TypeVec;
struct __align__(8) uchar8
@ -277,7 +281,8 @@ namespace cv { namespace cuda { namespace device
static __device__ __host__ __forceinline__ char8 make(schar a0, schar a1, schar a2, schar a3, schar a4, schar a5, schar a6, schar a7) {return make_char8(a0, a1, a2, a3, a4, a5, a6, a7);}
static __device__ __host__ __forceinline__ char8 make(const schar* v) {return make_char8(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);}
};
//! @}
}}} // namespace cv { namespace cuda { namespace cudev
//! @endcond
#endif // __OPENCV_CUDA_VEC_TRAITS_HPP__

View File

@ -43,10 +43,14 @@
#ifndef __OPENCV_CUDA_DEVICE_WARP_HPP__
#define __OPENCV_CUDA_DEVICE_WARP_HPP__
/** @file
* @deprecated Use @ref cudev instead.
*/
//! @cond IGNORED
namespace cv { namespace cuda { namespace device
{
//! @addtogroup cuda
//! @{
struct Warp
{
enum
@ -128,7 +132,8 @@ namespace cv { namespace cuda { namespace device
*t = value;
}
};
//! @}
}}} // namespace cv { namespace cuda { namespace cudev
//! @endcond
#endif /* __OPENCV_CUDA_DEVICE_WARP_HPP__ */

View File

@ -43,10 +43,14 @@
#ifndef OPENCV_CUDA_WARP_REDUCE_HPP__
#define OPENCV_CUDA_WARP_REDUCE_HPP__
/** @file
* @deprecated Use @ref cudev instead.
*/
//! @cond IGNORED
namespace cv { namespace cuda { namespace device
{
//! @addtogroup cuda
//! @{
template <class T>
__device__ __forceinline__ T warp_reduce(volatile T *ptr , const unsigned int tid = threadIdx.x)
{
@ -65,7 +69,8 @@ namespace cv { namespace cuda { namespace device
return ptr[tid - lane];
}
//! @}
}}} // namespace cv { namespace cuda { namespace cudev {
//! @endcond
#endif /* OPENCV_CUDA_WARP_REDUCE_HPP__ */

View File

@ -43,10 +43,14 @@
#ifndef __OPENCV_CUDA_WARP_SHUFFLE_HPP__
#define __OPENCV_CUDA_WARP_SHUFFLE_HPP__
/** @file
* @deprecated Use @ref cudev instead.
*/
//! @cond IGNORED
namespace cv { namespace cuda { namespace device
{
//! @addtogroup cuda
//! @{
template <typename T>
__device__ __forceinline__ T shfl(T val, int srcLane, int width = warpSize)
{
@ -142,7 +146,8 @@ namespace cv { namespace cuda { namespace device
return 0.0;
#endif
}
//! @}
}}}
//! @endcond
#endif // __OPENCV_CUDA_WARP_SHUFFLE_HPP__

View File

@ -47,10 +47,9 @@
# error cuda_stream_accessor.hpp header must be compiled as C++
#endif
// This is only header file that depends on Cuda. All other headers are independent.
// So if you use OpenCV binaries you do noot need to install Cuda Toolkit.
// But of you wanna use CUDA by yourself, may get cuda stream instance using the class below.
// In this case you have to install Cuda Toolkit.
/** @file cuda_stream_accessor.hpp
* This is only header file that depends on CUDA Runtime API. All other headers are independent.
*/
#include <cuda_runtime.h>
#include "opencv2/core/cvdef.h"
@ -60,22 +59,21 @@ namespace cv
namespace cuda
{
//! @addtogroup cuda_struct
//! @addtogroup cudacore_struct
//! @{
class Stream;
class Event;
/** @brief Class that enables getting cudaStream_t from cuda::Stream
because it is the only public header that depends on the CUDA Runtime API. Including it
brings a dependency to your code.
*/
struct StreamAccessor
{
CV_EXPORTS static cudaStream_t getStream(const Stream& stream);
};
/** @brief Class that enables getting cudaEvent_t from cuda::Event
*/
struct EventAccessor
{
CV_EXPORTS static cudaEvent_t getEvent(const Event& event);

View File

@ -47,6 +47,12 @@
# error cuda_types.hpp header must be compiled as C++
#endif
/** @file
* @deprecated Use @ref cudev instead.
*/
//! @cond IGNORED
#ifdef __CUDACC__
#define __CV_CUDA_HOST_DEVICE__ __host__ __device__ __forceinline__
#else
@ -58,9 +64,6 @@ namespace cv
namespace cuda
{
//! @addtogroup cuda_struct
//! @{
// Simple lightweight structures that encapsulates information about an image on device.
// It is intended to pass to nvcc-compiled code. GpuMat depends on headers that nvcc can't compile
@ -89,17 +92,11 @@ namespace cv
size_t size;
};
/** @brief Structure similar to cuda::PtrStepSz but containing only a pointer and row step.
Width and height fields are excluded due to performance reasons. The structure is intended
for internal use or for users who write device code.
*/
template <typename T> struct PtrStep : public DevPtr<T>
{
__CV_CUDA_HOST_DEVICE__ PtrStep() : step(0) {}
__CV_CUDA_HOST_DEVICE__ PtrStep(T* data_, size_t step_) : DevPtr<T>(data_), step(step_) {}
//! stride between two consecutive rows in bytes. Step is stored always and everywhere in bytes!!!
size_t step;
__CV_CUDA_HOST_DEVICE__ T* ptr(int y = 0) { return ( T*)( ( char*)DevPtr<T>::data + y * step); }
@ -109,12 +106,6 @@ namespace cv
__CV_CUDA_HOST_DEVICE__ const T& operator ()(int y, int x) const { return ptr(y)[x]; }
};
/** @brief Lightweight class encapsulating pitched memory on a GPU and passed to nvcc-compiled code (CUDA
kernels).
Typically, it is used internally by OpenCV and by users who write device code. You can call
its members from both host and device code.
*/
template <typename T> struct PtrStepSz : public PtrStep<T>
{
__CV_CUDA_HOST_DEVICE__ PtrStepSz() : cols(0), rows(0) {}
@ -136,9 +127,9 @@ namespace cv
typedef PtrStep<float> PtrStepf;
typedef PtrStep<int> PtrStepi;
//! @}
}
}
//! @endcond
#endif /* __OPENCV_CORE_CUDA_TYPES_HPP__ */

View File

@ -73,6 +73,10 @@
# define CV_ENABLE_UNROLLED 1
#endif
#ifdef __OPENCV_BUILD
# define DISABLE_OPENCV_24_COMPATIBILITY
#endif
#if (defined WIN32 || defined _WIN32 || defined WINCE || defined __CYGWIN__) && defined CVAPI_EXPORTS
# define CV_EXPORTS __declspec(dllexport)
#elif defined __GNUC__ && __GNUC__ >= 4

View File

@ -303,7 +303,10 @@ struct Ptr
@note It is often easier to use makePtr instead.
*/
template<typename Y>
explicit Ptr(Y* p);
#ifdef DISABLE_OPENCV_24_COMPATIBILITY
explicit
#endif
Ptr(Y* p);
/** @overload
@param d Deleter to use for the owned pointer.

View File

@ -160,8 +160,8 @@ public:
STD_VECTOR_MAT = 5 << KIND_SHIFT,
EXPR = 6 << KIND_SHIFT,
OPENGL_BUFFER = 7 << KIND_SHIFT,
CUDA_MEM = 8 << KIND_SHIFT,
GPU_MAT = 9 << KIND_SHIFT,
CUDA_HOST_MEM = 8 << KIND_SHIFT,
CUDA_GPU_MAT = 9 << KIND_SHIFT,
UMAT =10 << KIND_SHIFT,
STD_VECTOR_UMAT =11 << KIND_SHIFT
};
@ -180,7 +180,7 @@ public:
_InputArray(const double& val);
_InputArray(const cuda::GpuMat& d_mat);
_InputArray(const ogl::Buffer& buf);
_InputArray(const cuda::CudaMem& cuda_mem);
_InputArray(const cuda::HostMem& cuda_mem);
template<typename _Tp> _InputArray(const cudev::GpuMat_<_Tp>& m);
_InputArray(const UMat& um);
_InputArray(const std::vector<UMat>& umv);
@ -277,7 +277,7 @@ public:
_OutputArray(std::vector<Mat>& vec);
_OutputArray(cuda::GpuMat& d_mat);
_OutputArray(ogl::Buffer& buf);
_OutputArray(cuda::CudaMem& cuda_mem);
_OutputArray(cuda::HostMem& cuda_mem);
template<typename _Tp> _OutputArray(cudev::GpuMat_<_Tp>& m);
template<typename _Tp> _OutputArray(std::vector<_Tp>& vec);
template<typename _Tp> _OutputArray(std::vector<std::vector<_Tp> >& vec);
@ -292,7 +292,7 @@ public:
_OutputArray(const std::vector<Mat>& vec);
_OutputArray(const cuda::GpuMat& d_mat);
_OutputArray(const ogl::Buffer& buf);
_OutputArray(const cuda::CudaMem& cuda_mem);
_OutputArray(const cuda::HostMem& cuda_mem);
template<typename _Tp> _OutputArray(const cudev::GpuMat_<_Tp>& m);
template<typename _Tp> _OutputArray(const std::vector<_Tp>& vec);
template<typename _Tp> _OutputArray(const std::vector<std::vector<_Tp> >& vec);
@ -310,7 +310,7 @@ public:
virtual UMat& getUMatRef(int i=-1) const;
virtual cuda::GpuMat& getGpuMatRef() const;
virtual ogl::Buffer& getOGlBufferRef() const;
virtual cuda::CudaMem& getCudaMemRef() const;
virtual cuda::HostMem& getHostMemRef() const;
virtual void create(Size sz, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const;
virtual void create(int rows, int cols, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const;
virtual void create(int dims, const int* size, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const;
@ -333,7 +333,7 @@ public:
_InputOutputArray(std::vector<Mat>& vec);
_InputOutputArray(cuda::GpuMat& d_mat);
_InputOutputArray(ogl::Buffer& buf);
_InputOutputArray(cuda::CudaMem& cuda_mem);
_InputOutputArray(cuda::HostMem& cuda_mem);
template<typename _Tp> _InputOutputArray(cudev::GpuMat_<_Tp>& m);
template<typename _Tp> _InputOutputArray(std::vector<_Tp>& vec);
template<typename _Tp> _InputOutputArray(std::vector<std::vector<_Tp> >& vec);
@ -348,7 +348,7 @@ public:
_InputOutputArray(const std::vector<Mat>& vec);
_InputOutputArray(const cuda::GpuMat& d_mat);
_InputOutputArray(const ogl::Buffer& buf);
_InputOutputArray(const cuda::CudaMem& cuda_mem);
_InputOutputArray(const cuda::HostMem& cuda_mem);
template<typename _Tp> _InputOutputArray(const cudev::GpuMat_<_Tp>& m);
template<typename _Tp> _InputOutputArray(const std::vector<_Tp>& vec);
template<typename _Tp> _InputOutputArray(const std::vector<std::vector<_Tp> >& vec);

View File

@ -100,13 +100,13 @@ inline _InputArray::_InputArray(const MatExpr& expr)
{ init(FIXED_TYPE + FIXED_SIZE + EXPR + ACCESS_READ, &expr); }
inline _InputArray::_InputArray(const cuda::GpuMat& d_mat)
{ init(GPU_MAT + ACCESS_READ, &d_mat); }
{ init(CUDA_GPU_MAT + ACCESS_READ, &d_mat); }
inline _InputArray::_InputArray(const ogl::Buffer& buf)
{ init(OPENGL_BUFFER + ACCESS_READ, &buf); }
inline _InputArray::_InputArray(const cuda::CudaMem& cuda_mem)
{ init(CUDA_MEM + ACCESS_READ, &cuda_mem); }
inline _InputArray::_InputArray(const cuda::HostMem& cuda_mem)
{ init(CUDA_HOST_MEM + ACCESS_READ, &cuda_mem); }
inline _InputArray::~_InputArray() {}
@ -174,13 +174,13 @@ _OutputArray::_OutputArray(const _Tp* vec, int n)
{ init(FIXED_TYPE + FIXED_SIZE + MATX + DataType<_Tp>::type + ACCESS_WRITE, vec, Size(n, 1)); }
inline _OutputArray::_OutputArray(cuda::GpuMat& d_mat)
{ init(GPU_MAT + ACCESS_WRITE, &d_mat); }
{ init(CUDA_GPU_MAT + ACCESS_WRITE, &d_mat); }
inline _OutputArray::_OutputArray(ogl::Buffer& buf)
{ init(OPENGL_BUFFER + ACCESS_WRITE, &buf); }
inline _OutputArray::_OutputArray(cuda::CudaMem& cuda_mem)
{ init(CUDA_MEM + ACCESS_WRITE, &cuda_mem); }
inline _OutputArray::_OutputArray(cuda::HostMem& cuda_mem)
{ init(CUDA_HOST_MEM + ACCESS_WRITE, &cuda_mem); }
inline _OutputArray::_OutputArray(const Mat& m)
{ init(FIXED_TYPE + FIXED_SIZE + MAT + ACCESS_WRITE, &m); }
@ -195,13 +195,13 @@ inline _OutputArray::_OutputArray(const std::vector<UMat>& vec)
{ init(FIXED_SIZE + STD_VECTOR_UMAT + ACCESS_WRITE, &vec); }
inline _OutputArray::_OutputArray(const cuda::GpuMat& d_mat)
{ init(FIXED_TYPE + FIXED_SIZE + GPU_MAT + ACCESS_WRITE, &d_mat); }
{ init(FIXED_TYPE + FIXED_SIZE + CUDA_GPU_MAT + ACCESS_WRITE, &d_mat); }
inline _OutputArray::_OutputArray(const ogl::Buffer& buf)
{ init(FIXED_TYPE + FIXED_SIZE + OPENGL_BUFFER + ACCESS_WRITE, &buf); }
inline _OutputArray::_OutputArray(const cuda::CudaMem& cuda_mem)
{ init(FIXED_TYPE + FIXED_SIZE + CUDA_MEM + ACCESS_WRITE, &cuda_mem); }
inline _OutputArray::_OutputArray(const cuda::HostMem& cuda_mem)
{ init(FIXED_TYPE + FIXED_SIZE + CUDA_HOST_MEM + ACCESS_WRITE, &cuda_mem); }
///////////////////////////////////////////////////////////////////////////////////////////
@ -261,13 +261,13 @@ _InputOutputArray::_InputOutputArray(const _Tp* vec, int n)
{ init(FIXED_TYPE + FIXED_SIZE + MATX + DataType<_Tp>::type + ACCESS_RW, vec, Size(n, 1)); }
inline _InputOutputArray::_InputOutputArray(cuda::GpuMat& d_mat)
{ init(GPU_MAT + ACCESS_RW, &d_mat); }
{ init(CUDA_GPU_MAT + ACCESS_RW, &d_mat); }
inline _InputOutputArray::_InputOutputArray(ogl::Buffer& buf)
{ init(OPENGL_BUFFER + ACCESS_RW, &buf); }
inline _InputOutputArray::_InputOutputArray(cuda::CudaMem& cuda_mem)
{ init(CUDA_MEM + ACCESS_RW, &cuda_mem); }
inline _InputOutputArray::_InputOutputArray(cuda::HostMem& cuda_mem)
{ init(CUDA_HOST_MEM + ACCESS_RW, &cuda_mem); }
inline _InputOutputArray::_InputOutputArray(const Mat& m)
{ init(FIXED_TYPE + FIXED_SIZE + MAT + ACCESS_RW, &m); }
@ -282,13 +282,13 @@ inline _InputOutputArray::_InputOutputArray(const std::vector<UMat>& vec)
{ init(FIXED_SIZE + STD_VECTOR_UMAT + ACCESS_RW, &vec); }
inline _InputOutputArray::_InputOutputArray(const cuda::GpuMat& d_mat)
{ init(FIXED_TYPE + FIXED_SIZE + GPU_MAT + ACCESS_RW, &d_mat); }
{ init(FIXED_TYPE + FIXED_SIZE + CUDA_GPU_MAT + ACCESS_RW, &d_mat); }
inline _InputOutputArray::_InputOutputArray(const ogl::Buffer& buf)
{ init(FIXED_TYPE + FIXED_SIZE + OPENGL_BUFFER + ACCESS_RW, &buf); }
inline _InputOutputArray::_InputOutputArray(const cuda::CudaMem& cuda_mem)
{ init(FIXED_TYPE + FIXED_SIZE + CUDA_MEM + ACCESS_RW, &cuda_mem); }
inline _InputOutputArray::_InputOutputArray(const cuda::HostMem& cuda_mem)
{ init(FIXED_TYPE + FIXED_SIZE + CUDA_HOST_MEM + ACCESS_RW, &cuda_mem); }
//////////////////////////////////////////// Mat //////////////////////////////////////////

View File

@ -92,26 +92,6 @@ static inline void throw_no_cuda() { CV_Error(cv::Error::StsNotImplemented, "The
namespace cv { namespace cuda
{
class MemoryStack;
class CV_EXPORTS StackAllocator : public GpuMat::Allocator
{
public:
explicit StackAllocator(cudaStream_t stream);
~StackAllocator();
bool allocate(GpuMat* mat, int rows, int cols, size_t elemSize);
void free(GpuMat* mat);
private:
StackAllocator(const StackAllocator&);
StackAllocator& operator =(const StackAllocator&);
cudaStream_t stream_;
MemoryStack* memStack_;
size_t alignment_;
};
class CV_EXPORTS BufferPool
{
public:
@ -120,6 +100,8 @@ namespace cv { namespace cuda
GpuMat getBuffer(int rows, int cols, int type);
GpuMat getBuffer(Size size, int type) { return getBuffer(size.height, size.width, type); }
GpuMat::Allocator* getAllocator() const { return allocator_; }
private:
GpuMat::Allocator* allocator_;
};

View File

@ -746,4 +746,8 @@ template<> inline std::string CommandLineParser::get<std::string>(const String&
} //namespace cv
#ifndef DISABLE_OPENCV_24_COMPATIBILITY
#include "opencv2/core/core_c.h"
#endif
#endif //__OPENCV_CORE_UTILITY_H__

View File

@ -40,7 +40,12 @@
//
//M*/
#include "perf_precomp.hpp"
#include "../perf_precomp.hpp"
#ifdef HAVE_CUDA
#include "opencv2/core/cuda.hpp"
#include "opencv2/ts/cuda_perf.hpp"
using namespace std;
using namespace testing;
@ -49,7 +54,7 @@ using namespace perf;
//////////////////////////////////////////////////////////////////////
// SetTo
PERF_TEST_P(Sz_Depth_Cn, MatOp_SetTo,
PERF_TEST_P(Sz_Depth_Cn, CUDA_GpuMat_SetTo,
Combine(CUDA_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16U, CV_32F, CV_64F),
CUDA_CHANNELS_1_3_4))
@ -67,23 +72,21 @@ PERF_TEST_P(Sz_Depth_Cn, MatOp_SetTo,
cv::cuda::GpuMat dst(size, type);
TEST_CYCLE() dst.setTo(val);
CUDA_SANITY_CHECK(dst);
}
else
{
cv::Mat dst(size, type);
TEST_CYCLE() dst.setTo(val);
CPU_SANITY_CHECK(dst);
}
SANITY_CHECK_NOTHING();
}
//////////////////////////////////////////////////////////////////////
// SetToMasked
PERF_TEST_P(Sz_Depth_Cn, MatOp_SetToMasked,
PERF_TEST_P(Sz_Depth_Cn, CUDA_GpuMat_SetToMasked,
Combine(CUDA_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16U, CV_32F, CV_64F),
CUDA_CHANNELS_1_3_4))
@ -106,23 +109,21 @@ PERF_TEST_P(Sz_Depth_Cn, MatOp_SetToMasked,
const cv::cuda::GpuMat d_mask(mask);
TEST_CYCLE() dst.setTo(val, d_mask);
CUDA_SANITY_CHECK(dst, 1e-10);
}
else
{
cv::Mat dst = src;
TEST_CYCLE() dst.setTo(val, mask);
CPU_SANITY_CHECK(dst);
}
SANITY_CHECK_NOTHING();
}
//////////////////////////////////////////////////////////////////////
// CopyToMasked
PERF_TEST_P(Sz_Depth_Cn, MatOp_CopyToMasked,
PERF_TEST_P(Sz_Depth_Cn, CUDA_GpuMat_CopyToMasked,
Combine(CUDA_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16U, CV_32F, CV_64F),
CUDA_CHANNELS_1_3_4))
@ -144,17 +145,15 @@ PERF_TEST_P(Sz_Depth_Cn, MatOp_CopyToMasked,
cv::cuda::GpuMat dst(d_src.size(), d_src.type(), cv::Scalar::all(0));
TEST_CYCLE() d_src.copyTo(dst, d_mask);
CUDA_SANITY_CHECK(dst, 1e-10);
}
else
{
cv::Mat dst(src.size(), src.type(), cv::Scalar::all(0));
TEST_CYCLE() src.copyTo(dst, mask);
CPU_SANITY_CHECK(dst);
}
SANITY_CHECK_NOTHING();
}
//////////////////////////////////////////////////////////////////////
@ -162,7 +161,7 @@ PERF_TEST_P(Sz_Depth_Cn, MatOp_CopyToMasked,
DEF_PARAM_TEST(Sz_2Depth, cv::Size, MatDepth, MatDepth);
PERF_TEST_P(Sz_2Depth, MatOp_ConvertTo,
PERF_TEST_P(Sz_2Depth, CUDA_GpuMat_ConvertTo,
Combine(CUDA_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16U, CV_32F, CV_64F),
Values(CV_8U, CV_16U, CV_32F, CV_64F)))
@ -183,15 +182,15 @@ PERF_TEST_P(Sz_2Depth, MatOp_ConvertTo,
cv::cuda::GpuMat dst;
TEST_CYCLE() d_src.convertTo(dst, depth2, a, b);
CUDA_SANITY_CHECK(dst, 1e-10);
}
else
{
cv::Mat dst;
TEST_CYCLE() src.convertTo(dst, depth2, a, b);
CPU_SANITY_CHECK(dst);
}
SANITY_CHECK_NOTHING();
}
#endif

View File

@ -1,435 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::cuda;
#ifdef HAVE_CUDA
#include "opencv2/cudev/common.hpp"
/////////////////////////////////////////////////////////////
/// MemoryStack
namespace
{
class MemoryPool;
}
class cv::cuda::MemoryStack
{
public:
uchar* requestMemory(size_t size);
void returnMemory(uchar* ptr);
uchar* datastart;
uchar* dataend;
uchar* tip;
bool isFree;
MemoryPool* pool;
#if defined(DEBUG) || defined(_DEBUG)
std::vector<size_t> allocations;
#endif
};
uchar* cv::cuda::MemoryStack::requestMemory(size_t size)
{
const size_t freeMem = dataend - tip;
if (size > freeMem)
return 0;
uchar* ptr = tip;
tip += size;
#if defined(DEBUG) || defined(_DEBUG)
allocations.push_back(size);
#endif
return ptr;
}
void cv::cuda::MemoryStack::returnMemory(uchar* ptr)
{
CV_DbgAssert( ptr >= datastart && ptr < dataend );
#if defined(DEBUG) || defined(_DEBUG)
const size_t allocSize = tip - ptr;
CV_Assert( allocSize == allocations.back() );
allocations.pop_back();
#endif
tip = ptr;
}
/////////////////////////////////////////////////////////////
/// MemoryPool
namespace
{
class MemoryPool
{
public:
MemoryPool();
void initialize(size_t stackSize, int stackCount);
void release();
MemoryStack* getFreeMemStack();
void returnMemStack(MemoryStack* memStack);
private:
void initilizeImpl();
Mutex mtx_;
bool initialized_;
size_t stackSize_;
int stackCount_;
uchar* mem_;
std::vector<MemoryStack> stacks_;
};
MemoryPool::MemoryPool() : initialized_(false), mem_(0)
{
// default : 10 Mb, 5 stacks
stackSize_ = 10 * 1024 * 1024;
stackCount_ = 5;
}
void MemoryPool::initialize(size_t stackSize, int stackCount)
{
AutoLock lock(mtx_);
release();
stackSize_ = stackSize;
stackCount_ = stackCount;
initilizeImpl();
}
void MemoryPool::initilizeImpl()
{
const size_t totalSize = stackSize_ * stackCount_;
if (totalSize > 0)
{
cudaError_t err = cudaMalloc(&mem_, totalSize);
if (err != cudaSuccess)
return;
stacks_.resize(stackCount_);
uchar* ptr = mem_;
for (int i = 0; i < stackCount_; ++i)
{
stacks_[i].datastart = ptr;
stacks_[i].dataend = ptr + stackSize_;
stacks_[i].tip = ptr;
stacks_[i].isFree = true;
stacks_[i].pool = this;
ptr += stackSize_;
}
initialized_ = true;
}
}
void MemoryPool::release()
{
if (mem_)
{
#if defined(DEBUG) || defined(_DEBUG)
for (int i = 0; i < stackCount_; ++i)
{
CV_DbgAssert( stacks_[i].isFree );
CV_DbgAssert( stacks_[i].tip == stacks_[i].datastart );
}
#endif
cudaFree( mem_ );
mem_ = 0;
initialized_ = false;
}
}
MemoryStack* MemoryPool::getFreeMemStack()
{
AutoLock lock(mtx_);
if (!initialized_)
initilizeImpl();
if (!mem_)
return 0;
for (int i = 0; i < stackCount_; ++i)
{
if (stacks_[i].isFree)
{
stacks_[i].isFree = false;
return &stacks_[i];
}
}
return 0;
}
void MemoryPool::returnMemStack(MemoryStack* memStack)
{
AutoLock lock(mtx_);
CV_DbgAssert( !memStack->isFree );
#if defined(DEBUG) || defined(_DEBUG)
bool found = false;
for (int i = 0; i < stackCount_; ++i)
{
if (memStack == &stacks_[i])
{
found = true;
break;
}
}
CV_DbgAssert( found );
#endif
CV_DbgAssert( memStack->tip == memStack->datastart );
memStack->isFree = true;
}
}
/////////////////////////////////////////////////////////////
/// MemoryPoolManager
namespace
{
Mutex mtx_;
bool memory_pool_manager_initialized;
class MemoryPoolManager
{
public:
MemoryPoolManager();
~MemoryPoolManager();
void Init();
MemoryPool* getPool(int deviceId);
private:
std::vector<MemoryPool> pools_;
} manager;
//MemoryPoolManager ;
MemoryPoolManager::MemoryPoolManager()
{
}
void MemoryPoolManager::Init()
{
int deviceCount = getCudaEnabledDeviceCount();
if (deviceCount > 0)
pools_.resize(deviceCount);
}
MemoryPoolManager::~MemoryPoolManager()
{
for (size_t i = 0; i < pools_.size(); ++i)
{
cudaSetDevice(static_cast<int>(i));
pools_[i].release();
}
}
MemoryPool* MemoryPoolManager::getPool(int deviceId)
{
CV_DbgAssert( deviceId >= 0 && deviceId < static_cast<int>(pools_.size()) );
return &pools_[deviceId];
}
MemoryPool* memPool(int deviceId)
{
{
AutoLock lock(mtx_);
if (!memory_pool_manager_initialized)
{
memory_pool_manager_initialized = true;
manager.Init();
}
}
return manager.getPool(deviceId);
}
}
/////////////////////////////////////////////////////////////
/// StackAllocator
namespace
{
bool enableMemoryPool = true;
}
cv::cuda::StackAllocator::StackAllocator(cudaStream_t stream) : stream_(stream), memStack_(0)
{
if (enableMemoryPool)
{
const int deviceId = getDevice();
{
AutoLock lock(mtx_);
memStack_ = memPool(deviceId)->getFreeMemStack();
}
DeviceInfo devInfo(deviceId);
alignment_ = devInfo.textureAlignment();
}
}
cv::cuda::StackAllocator::~StackAllocator()
{
cudaStreamSynchronize(stream_);
if (memStack_ != 0)
memStack_->pool->returnMemStack(memStack_);
}
namespace
{
size_t alignUp(size_t what, size_t alignment)
{
size_t alignMask = alignment-1;
size_t inverseAlignMask = ~alignMask;
size_t res = (what + alignMask) & inverseAlignMask;
return res;
}
}
bool cv::cuda::StackAllocator::allocate(GpuMat* mat, int rows, int cols, size_t elemSize)
{
if (memStack_ == 0)
return false;
size_t pitch, memSize;
if (rows > 1 && cols > 1)
{
pitch = alignUp(cols * elemSize, alignment_);
memSize = pitch * rows;
}
else
{
// Single row or single column must be continuous
pitch = elemSize * cols;
memSize = alignUp(elemSize * cols * rows, 64);
}
uchar* ptr = memStack_->requestMemory(memSize);
if (ptr == 0)
return false;
mat->data = ptr;
mat->step = pitch;
mat->refcount = (int*) fastMalloc(sizeof(int));
return true;
}
void cv::cuda::StackAllocator::free(GpuMat* mat)
{
if (memStack_ == 0)
return;
memStack_->returnMemory(mat->datastart);
fastFree(mat->refcount);
}
void cv::cuda::setBufferPoolUsage(bool on)
{
enableMemoryPool = on;
}
void cv::cuda::setBufferPoolConfig(int deviceId, size_t stackSize, int stackCount)
{
const int currentDevice = getDevice();
if (deviceId >= 0)
{
setDevice(deviceId);
memPool(deviceId)->initialize(stackSize, stackCount);
}
else
{
const int deviceCount = getCudaEnabledDeviceCount();
for (deviceId = 0; deviceId < deviceCount; ++deviceId)
{
setDevice(deviceId);
memPool(deviceId)->initialize(stackSize, stackCount);
}
}
setDevice(currentDevice);
}
/////////////////////////////////////////////////////////////
/// BufferPool
GpuMat cv::cuda::BufferPool::getBuffer(int rows, int cols, int type)
{
GpuMat buf(allocator_);
buf.create(rows, cols, type);
return buf;
}
#endif

View File

@ -275,12 +275,12 @@ void cv::cuda::createContinuous(int rows, int cols, int type, OutputArray arr)
::createContinuousImpl(rows, cols, type, arr.getMatRef());
break;
case _InputArray::GPU_MAT:
case _InputArray::CUDA_GPU_MAT:
::createContinuousImpl(rows, cols, type, arr.getGpuMatRef());
break;
case _InputArray::CUDA_MEM:
::createContinuousImpl(rows, cols, type, arr.getCudaMemRef());
case _InputArray::CUDA_HOST_MEM:
::createContinuousImpl(rows, cols, type, arr.getHostMemRef());
break;
default:
@ -329,12 +329,12 @@ void cv::cuda::ensureSizeIsEnough(int rows, int cols, int type, OutputArray arr)
::ensureSizeIsEnoughImpl(rows, cols, type, arr.getMatRef());
break;
case _InputArray::GPU_MAT:
case _InputArray::CUDA_GPU_MAT:
::ensureSizeIsEnoughImpl(rows, cols, type, arr.getGpuMatRef());
break;
case _InputArray::CUDA_MEM:
::ensureSizeIsEnoughImpl(rows, cols, type, arr.getCudaMemRef());
case _InputArray::CUDA_HOST_MEM:
::ensureSizeIsEnoughImpl(rows, cols, type, arr.getHostMemRef());
break;
default:
@ -342,14 +342,6 @@ void cv::cuda::ensureSizeIsEnough(int rows, int cols, int type, OutputArray arr)
}
}
GpuMat cv::cuda::allocMatFromBuf(int rows, int cols, int type, GpuMat& mat)
{
if (!mat.empty() && mat.type() == type && mat.rows >= rows && mat.cols >= cols)
return mat(Rect(0, 0, cols, rows));
return mat = GpuMat(rows, cols, type);
}
#ifndef HAVE_CUDA
GpuMat::Allocator* cv::cuda::GpuMat::defaultAllocator()

View File

@ -42,10 +42,124 @@
//M*/
#include "precomp.hpp"
#include <map>
using namespace cv;
using namespace cv::cuda;
#ifdef HAVE_CUDA
namespace {
class HostMemAllocator : public MatAllocator
{
public:
explicit HostMemAllocator(unsigned int flags) : flags_(flags)
{
}
UMatData* allocate(int dims, const int* sizes, int type,
void* data0, size_t* step,
int /*flags*/, UMatUsageFlags /*usageFlags*/) const
{
size_t total = CV_ELEM_SIZE(type);
for (int i = dims-1; i >= 0; i--)
{
if (step)
{
if (data0 && step[i] != CV_AUTOSTEP)
{
CV_Assert(total <= step[i]);
total = step[i];
}
else
{
step[i] = total;
}
}
total *= sizes[i];
}
UMatData* u = new UMatData(this);
u->size = total;
if (data0)
{
u->data = u->origdata = static_cast<uchar*>(data0);
u->flags |= UMatData::USER_ALLOCATED;
}
else
{
void* ptr = 0;
cudaSafeCall( cudaHostAlloc(&ptr, total, flags_) );
u->data = u->origdata = static_cast<uchar*>(ptr);
}
return u;
}
bool allocate(UMatData* u, int /*accessFlags*/, UMatUsageFlags /*usageFlags*/) const
{
return (u != NULL);
}
void deallocate(UMatData* u) const
{
CV_Assert(u->urefcount >= 0);
CV_Assert(u->refcount >= 0);
if (u && u->refcount == 0)
{
if ( !(u->flags & UMatData::USER_ALLOCATED) )
{
cudaFreeHost(u->origdata);
u->origdata = 0;
}
delete u;
}
}
private:
unsigned int flags_;
};
} // namespace
#endif
MatAllocator* cv::cuda::HostMem::getAllocator(AllocType alloc_type)
{
#ifndef HAVE_CUDA
(void) alloc_type;
throw_no_cuda();
return NULL;
#else
static std::map<unsigned int, Ptr<MatAllocator> > allocators;
unsigned int flag = cudaHostAllocDefault;
switch (alloc_type)
{
case PAGE_LOCKED: flag = cudaHostAllocDefault; break;
case SHARED: flag = cudaHostAllocMapped; break;
case WRITE_COMBINED: flag = cudaHostAllocWriteCombined; break;
default: CV_Error(cv::Error::StsBadFlag, "Invalid alloc type");
}
Ptr<MatAllocator>& a = allocators[flag];
if (a.empty())
{
a = makePtr<HostMemAllocator>(flag);
}
return a.get();
#endif
}
#ifdef HAVE_CUDA
namespace
{
@ -59,7 +173,7 @@ namespace
}
#endif
void cv::cuda::CudaMem::create(int rows_, int cols_, int type_)
void cv::cuda::HostMem::create(int rows_, int cols_, int type_)
{
#ifndef HAVE_CUDA
(void) rows_;
@ -123,9 +237,9 @@ void cv::cuda::CudaMem::create(int rows_, int cols_, int type_)
#endif
}
CudaMem cv::cuda::CudaMem::reshape(int new_cn, int new_rows) const
HostMem cv::cuda::HostMem::reshape(int new_cn, int new_rows) const
{
CudaMem hdr = *this;
HostMem hdr = *this;
int cn = channels();
if (new_cn == 0)
@ -166,7 +280,7 @@ CudaMem cv::cuda::CudaMem::reshape(int new_cn, int new_rows) const
return hdr;
}
void cv::cuda::CudaMem::release()
void cv::cuda::HostMem::release()
{
#ifdef HAVE_CUDA
if (refcount && CV_XADD(refcount, -1) == 1)
@ -181,7 +295,7 @@ void cv::cuda::CudaMem::release()
#endif
}
GpuMat cv::cuda::CudaMem::createGpuMatHeader() const
GpuMat cv::cuda::HostMem::createGpuMatHeader() const
{
#ifndef HAVE_CUDA
throw_no_cuda();

View File

@ -45,8 +45,217 @@
using namespace cv;
using namespace cv::cuda;
/////////////////////////////////////////////////////////////
/// MemoryStack
#ifdef HAVE_CUDA
namespace
{
class MemoryPool;
class MemoryStack
{
public:
uchar* requestMemory(size_t size);
void returnMemory(uchar* ptr);
uchar* datastart;
uchar* dataend;
uchar* tip;
bool isFree;
MemoryPool* pool;
#if !defined(NDEBUG)
std::vector<size_t> allocations;
#endif
};
uchar* MemoryStack::requestMemory(size_t size)
{
const size_t freeMem = dataend - tip;
if (size > freeMem)
return 0;
uchar* ptr = tip;
tip += size;
#if !defined(NDEBUG)
allocations.push_back(size);
#endif
return ptr;
}
void MemoryStack::returnMemory(uchar* ptr)
{
CV_DbgAssert( ptr >= datastart && ptr < dataend );
#if !defined(NDEBUG)
const size_t allocSize = tip - ptr;
CV_Assert( allocSize == allocations.back() );
allocations.pop_back();
#endif
tip = ptr;
}
}
#endif
/////////////////////////////////////////////////////////////
/// MemoryPool
#ifdef HAVE_CUDA
namespace
{
class MemoryPool
{
public:
MemoryPool();
void initialize(size_t stackSize, int stackCount);
void release();
MemoryStack* getFreeMemStack();
void returnMemStack(MemoryStack* memStack);
private:
void initilizeImpl();
Mutex mtx_;
bool initialized_;
size_t stackSize_;
int stackCount_;
uchar* mem_;
std::vector<MemoryStack> stacks_;
};
MemoryPool::MemoryPool() : initialized_(false), mem_(0)
{
// default : 10 Mb, 5 stacks
stackSize_ = 10 * 1024 * 1024;
stackCount_ = 5;
}
void MemoryPool::initialize(size_t stackSize, int stackCount)
{
AutoLock lock(mtx_);
release();
stackSize_ = stackSize;
stackCount_ = stackCount;
initilizeImpl();
}
void MemoryPool::initilizeImpl()
{
const size_t totalSize = stackSize_ * stackCount_;
if (totalSize > 0)
{
cudaError_t err = cudaMalloc(&mem_, totalSize);
if (err != cudaSuccess)
return;
stacks_.resize(stackCount_);
uchar* ptr = mem_;
for (int i = 0; i < stackCount_; ++i)
{
stacks_[i].datastart = ptr;
stacks_[i].dataend = ptr + stackSize_;
stacks_[i].tip = ptr;
stacks_[i].isFree = true;
stacks_[i].pool = this;
ptr += stackSize_;
}
initialized_ = true;
}
}
void MemoryPool::release()
{
if (mem_)
{
#if !defined(NDEBUG)
for (int i = 0; i < stackCount_; ++i)
{
CV_DbgAssert( stacks_[i].isFree );
CV_DbgAssert( stacks_[i].tip == stacks_[i].datastart );
}
#endif
cudaFree(mem_);
mem_ = 0;
initialized_ = false;
}
}
MemoryStack* MemoryPool::getFreeMemStack()
{
AutoLock lock(mtx_);
if (!initialized_)
initilizeImpl();
if (!mem_)
return 0;
for (int i = 0; i < stackCount_; ++i)
{
if (stacks_[i].isFree)
{
stacks_[i].isFree = false;
return &stacks_[i];
}
}
return 0;
}
void MemoryPool::returnMemStack(MemoryStack* memStack)
{
AutoLock lock(mtx_);
CV_DbgAssert( !memStack->isFree );
#if !defined(NDEBUG)
bool found = false;
for (int i = 0; i < stackCount_; ++i)
{
if (memStack == &stacks_[i])
{
found = true;
break;
}
}
CV_DbgAssert( found );
#endif
CV_DbgAssert( memStack->tip == memStack->datastart );
memStack->isFree = true;
}
}
#endif
////////////////////////////////////////////////////////////////
// Stream
/// Stream::Impl
#ifndef HAVE_CUDA
@ -62,6 +271,11 @@ public:
#else
namespace
{
class StackAllocator;
}
class cv::cuda::Stream::Impl
{
public:
@ -74,10 +288,6 @@ public:
~Impl();
};
cv::cuda::BufferPool::BufferPool(Stream& stream) : allocator_(stream.impl_->stackAllocator_.get())
{
}
cv::cuda::Stream::Impl::Impl() : stream(0)
{
cudaSafeCall( cudaStreamCreate(&stream) );
@ -98,13 +308,101 @@ cv::cuda::Stream::Impl::~Impl()
cudaStreamDestroy(stream);
}
cudaStream_t cv::cuda::StreamAccessor::getStream(const Stream& stream)
#endif
/////////////////////////////////////////////////////////////
/// DefaultDeviceInitializer
#ifdef HAVE_CUDA
namespace cv { namespace cuda
{
return stream.impl_->stream;
}
class DefaultDeviceInitializer
{
public:
DefaultDeviceInitializer();
~DefaultDeviceInitializer();
Stream& getNullStream(int deviceId);
MemoryPool* getMemoryPool(int deviceId);
private:
void initStreams();
void initPools();
std::vector<Ptr<Stream> > streams_;
Mutex streams_mtx_;
std::vector<MemoryPool> pools_;
Mutex pools_mtx_;
};
DefaultDeviceInitializer::DefaultDeviceInitializer()
{
}
DefaultDeviceInitializer::~DefaultDeviceInitializer()
{
streams_.clear();
for (size_t i = 0; i < pools_.size(); ++i)
{
cudaSetDevice(static_cast<int>(i));
pools_[i].release();
}
pools_.clear();
}
Stream& DefaultDeviceInitializer::getNullStream(int deviceId)
{
AutoLock lock(streams_mtx_);
if (streams_.empty())
{
int deviceCount = getCudaEnabledDeviceCount();
if (deviceCount > 0)
streams_.resize(deviceCount);
}
CV_DbgAssert( deviceId >= 0 && deviceId < static_cast<int>(streams_.size()) );
if (streams_[deviceId].empty())
{
cudaStream_t stream = NULL;
Ptr<Stream::Impl> impl = makePtr<Stream::Impl>(stream);
streams_[deviceId] = Ptr<Stream>(new Stream(impl));
}
return *streams_[deviceId];
}
MemoryPool* DefaultDeviceInitializer::getMemoryPool(int deviceId)
{
AutoLock lock(pools_mtx_);
if (pools_.empty())
{
int deviceCount = getCudaEnabledDeviceCount();
if (deviceCount > 0)
pools_.resize(deviceCount);
}
CV_DbgAssert( deviceId >= 0 && deviceId < static_cast<int>(pools_.size()) );
return &pools_[deviceId];
}
DefaultDeviceInitializer initializer;
}}
#endif
/////////////////////////////////////////////////////////////
/// Stream
cv::cuda::Stream::Stream()
{
#ifndef HAVE_CUDA
@ -181,7 +479,7 @@ void cv::cuda::Stream::enqueueHostCallback(StreamCallback callback, void* userDa
#if CUDART_VERSION < 5000
(void) callback;
(void) userData;
CV_Error(cv::Error::StsNotImplemented, "This function requires CUDA 5.0");
CV_Error(cv::Error::StsNotImplemented, "This function requires CUDA >= 5.0");
#else
CallbackData* data = new CallbackData(callback, userData);
@ -190,22 +488,16 @@ void cv::cuda::Stream::enqueueHostCallback(StreamCallback callback, void* userDa
#endif
}
namespace
{
bool default_stream_is_initialized;
Mutex mtx;
Ptr<Stream> default_stream;
}
Stream& cv::cuda::Stream::Null()
{
AutoLock lock(mtx);
if (!default_stream_is_initialized)
{
default_stream = Ptr<Stream>(new Stream(Ptr<Impl>(new Impl(0))));
default_stream_is_initialized = true;
}
return *default_stream;
#ifndef HAVE_CUDA
throw_no_cuda();
static Stream stream;
return stream;
#else
const int deviceId = getDevice();
return initializer.getNullStream(deviceId);
#endif
}
cv::cuda::Stream::operator bool_type() const
@ -217,6 +509,169 @@ cv::cuda::Stream::operator bool_type() const
#endif
}
#ifdef HAVE_CUDA
cudaStream_t cv::cuda::StreamAccessor::getStream(const Stream& stream)
{
return stream.impl_->stream;
}
#endif
/////////////////////////////////////////////////////////////
/// StackAllocator
#ifdef HAVE_CUDA
namespace
{
bool enableMemoryPool = true;
class StackAllocator : public GpuMat::Allocator
{
public:
explicit StackAllocator(cudaStream_t stream);
~StackAllocator();
bool allocate(GpuMat* mat, int rows, int cols, size_t elemSize);
void free(GpuMat* mat);
private:
StackAllocator(const StackAllocator&);
StackAllocator& operator =(const StackAllocator&);
cudaStream_t stream_;
MemoryStack* memStack_;
size_t alignment_;
};
StackAllocator::StackAllocator(cudaStream_t stream) : stream_(stream), memStack_(0)
{
if (enableMemoryPool)
{
const int deviceId = getDevice();
memStack_ = initializer.getMemoryPool(deviceId)->getFreeMemStack();
DeviceInfo devInfo(deviceId);
alignment_ = devInfo.textureAlignment();
}
}
StackAllocator::~StackAllocator()
{
cudaStreamSynchronize(stream_);
if (memStack_ != 0)
memStack_->pool->returnMemStack(memStack_);
}
size_t alignUp(size_t what, size_t alignment)
{
size_t alignMask = alignment-1;
size_t inverseAlignMask = ~alignMask;
size_t res = (what + alignMask) & inverseAlignMask;
return res;
}
bool StackAllocator::allocate(GpuMat* mat, int rows, int cols, size_t elemSize)
{
if (memStack_ == 0)
return false;
size_t pitch, memSize;
if (rows > 1 && cols > 1)
{
pitch = alignUp(cols * elemSize, alignment_);
memSize = pitch * rows;
}
else
{
// Single row or single column must be continuous
pitch = elemSize * cols;
memSize = alignUp(elemSize * cols * rows, 64);
}
uchar* ptr = memStack_->requestMemory(memSize);
if (ptr == 0)
return false;
mat->data = ptr;
mat->step = pitch;
mat->refcount = (int*) fastMalloc(sizeof(int));
return true;
}
void StackAllocator::free(GpuMat* mat)
{
if (memStack_ == 0)
return;
memStack_->returnMemory(mat->datastart);
fastFree(mat->refcount);
}
}
#endif
/////////////////////////////////////////////////////////////
/// BufferPool
void cv::cuda::setBufferPoolUsage(bool on)
{
#ifndef HAVE_CUDA
(void)on;
throw_no_cuda();
#else
enableMemoryPool = on;
#endif
}
void cv::cuda::setBufferPoolConfig(int deviceId, size_t stackSize, int stackCount)
{
#ifndef HAVE_CUDA
(void)deviceId;
(void)stackSize;
(void)stackCount;
throw_no_cuda();
#else
const int currentDevice = getDevice();
if (deviceId >= 0)
{
setDevice(deviceId);
initializer.getMemoryPool(deviceId)->initialize(stackSize, stackCount);
}
else
{
const int deviceCount = getCudaEnabledDeviceCount();
for (deviceId = 0; deviceId < deviceCount; ++deviceId)
{
setDevice(deviceId);
initializer.getMemoryPool(deviceId)->initialize(stackSize, stackCount);
}
}
setDevice(currentDevice);
#endif
}
#ifdef HAVE_CUDA
cv::cuda::BufferPool::BufferPool(Stream& stream) : allocator_(stream.impl_->stackAllocator_.get())
{
}
GpuMat cv::cuda::BufferPool::getBuffer(int rows, int cols, int type)
{
GpuMat buf(allocator_);
buf.create(rows, cols, type);
return buf;
}
#endif
////////////////////////////////////////////////////////////////
// Event

View File

@ -1187,18 +1187,18 @@ Mat _InputArray::getMat(int i) const
return Mat();
}
if( k == GPU_MAT )
if( k == CUDA_GPU_MAT )
{
CV_Assert( i < 0 );
CV_Error(cv::Error::StsNotImplemented, "You should explicitly call download method for cuda::GpuMat object");
return Mat();
}
if( k == CUDA_MEM )
if( k == CUDA_HOST_MEM )
{
CV_Assert( i < 0 );
const cuda::CudaMem* cuda_mem = (const cuda::CudaMem*)obj;
const cuda::HostMem* cuda_mem = (const cuda::HostMem*)obj;
return cuda_mem->createMatHeader();
}
@ -1391,15 +1391,15 @@ cuda::GpuMat _InputArray::getGpuMat() const
{
int k = kind();
if (k == GPU_MAT)
if (k == CUDA_GPU_MAT)
{
const cuda::GpuMat* d_mat = (const cuda::GpuMat*)obj;
return *d_mat;
}
if (k == CUDA_MEM)
if (k == CUDA_HOST_MEM)
{
const cuda::CudaMem* cuda_mem = (const cuda::CudaMem*)obj;
const cuda::HostMem* cuda_mem = (const cuda::HostMem*)obj;
return cuda_mem->createGpuMatHeader();
}
@ -1412,7 +1412,7 @@ cuda::GpuMat _InputArray::getGpuMat() const
if (k == NONE)
return cuda::GpuMat();
CV_Error(cv::Error::StsNotImplemented, "getGpuMat is available only for cuda::GpuMat and cuda::CudaMem");
CV_Error(cv::Error::StsNotImplemented, "getGpuMat is available only for cuda::GpuMat and cuda::HostMem");
return cuda::GpuMat();
}
@ -1520,20 +1520,22 @@ Size _InputArray::size(int i) const
return buf->size();
}
if( k == GPU_MAT )
if( k == CUDA_GPU_MAT )
{
CV_Assert( i < 0 );
const cuda::GpuMat* d_mat = (const cuda::GpuMat*)obj;
return d_mat->size();
}
CV_Assert( k == CUDA_MEM );
//if( k == CUDA_MEM )
if( k == CUDA_HOST_MEM )
{
CV_Assert( i < 0 );
const cuda::CudaMem* cuda_mem = (const cuda::CudaMem*)obj;
const cuda::HostMem* cuda_mem = (const cuda::HostMem*)obj;
return cuda_mem->size();
}
CV_Error(Error::StsNotImplemented, "Unknown/unsupported array type");
return Size();
}
int _InputArray::sizend(int* arrsz, int i) const
@ -1700,18 +1702,20 @@ int _InputArray::dims(int i) const
return 2;
}
if( k == GPU_MAT )
if( k == CUDA_GPU_MAT )
{
CV_Assert( i < 0 );
return 2;
}
CV_Assert( k == CUDA_MEM );
//if( k == CUDA_MEM )
if( k == CUDA_HOST_MEM )
{
CV_Assert( i < 0 );
return 2;
}
CV_Error(Error::StsNotImplemented, "Unknown/unsupported array type");
return 0;
}
size_t _InputArray::total(int i) const
@ -1799,12 +1803,14 @@ int _InputArray::type(int i) const
if( k == OPENGL_BUFFER )
return ((const ogl::Buffer*)obj)->type();
if( k == GPU_MAT )
if( k == CUDA_GPU_MAT )
return ((const cuda::GpuMat*)obj)->type();
CV_Assert( k == CUDA_MEM );
//if( k == CUDA_MEM )
return ((const cuda::CudaMem*)obj)->type();
if( k == CUDA_HOST_MEM )
return ((const cuda::HostMem*)obj)->type();
CV_Error(Error::StsNotImplemented, "Unknown/unsupported array type");
return 0;
}
int _InputArray::depth(int i) const
@ -1863,12 +1869,14 @@ bool _InputArray::empty() const
if( k == OPENGL_BUFFER )
return ((const ogl::Buffer*)obj)->empty();
if( k == GPU_MAT )
if( k == CUDA_GPU_MAT )
return ((const cuda::GpuMat*)obj)->empty();
CV_Assert( k == CUDA_MEM );
//if( k == CUDA_MEM )
return ((const cuda::CudaMem*)obj)->empty();
if( k == CUDA_HOST_MEM )
return ((const cuda::HostMem*)obj)->empty();
CV_Error(Error::StsNotImplemented, "Unknown/unsupported array type");
return true;
}
bool _InputArray::isContinuous(int i) const
@ -1970,7 +1978,7 @@ size_t _InputArray::offset(int i) const
return vv[i].offset;
}
if( k == GPU_MAT )
if( k == CUDA_GPU_MAT )
{
CV_Assert( i < 0 );
const cuda::GpuMat * const m = ((const cuda::GpuMat*)obj);
@ -2016,7 +2024,7 @@ size_t _InputArray::step(int i) const
return vv[i].step;
}
if( k == GPU_MAT )
if( k == CUDA_GPU_MAT )
{
CV_Assert( i < 0 );
return ((const cuda::GpuMat*)obj)->step;
@ -2095,7 +2103,7 @@ void _OutputArray::create(Size _sz, int mtype, int i, bool allowTransposed, int
((UMat*)obj)->create(_sz, mtype);
return;
}
if( k == GPU_MAT && i < 0 && !allowTransposed && fixedDepthMask == 0 )
if( k == CUDA_GPU_MAT && i < 0 && !allowTransposed && fixedDepthMask == 0 )
{
CV_Assert(!fixedSize() || ((cuda::GpuMat*)obj)->size() == _sz);
CV_Assert(!fixedType() || ((cuda::GpuMat*)obj)->type() == mtype);
@ -2109,11 +2117,11 @@ void _OutputArray::create(Size _sz, int mtype, int i, bool allowTransposed, int
((ogl::Buffer*)obj)->create(_sz, mtype);
return;
}
if( k == CUDA_MEM && i < 0 && !allowTransposed && fixedDepthMask == 0 )
if( k == CUDA_HOST_MEM && i < 0 && !allowTransposed && fixedDepthMask == 0 )
{
CV_Assert(!fixedSize() || ((cuda::CudaMem*)obj)->size() == _sz);
CV_Assert(!fixedType() || ((cuda::CudaMem*)obj)->type() == mtype);
((cuda::CudaMem*)obj)->create(_sz, mtype);
CV_Assert(!fixedSize() || ((cuda::HostMem*)obj)->size() == _sz);
CV_Assert(!fixedType() || ((cuda::HostMem*)obj)->type() == mtype);
((cuda::HostMem*)obj)->create(_sz, mtype);
return;
}
int sizes[] = {_sz.height, _sz.width};
@ -2137,7 +2145,7 @@ void _OutputArray::create(int _rows, int _cols, int mtype, int i, bool allowTran
((UMat*)obj)->create(_rows, _cols, mtype);
return;
}
if( k == GPU_MAT && i < 0 && !allowTransposed && fixedDepthMask == 0 )
if( k == CUDA_GPU_MAT && i < 0 && !allowTransposed && fixedDepthMask == 0 )
{
CV_Assert(!fixedSize() || ((cuda::GpuMat*)obj)->size() == Size(_cols, _rows));
CV_Assert(!fixedType() || ((cuda::GpuMat*)obj)->type() == mtype);
@ -2151,11 +2159,11 @@ void _OutputArray::create(int _rows, int _cols, int mtype, int i, bool allowTran
((ogl::Buffer*)obj)->create(_rows, _cols, mtype);
return;
}
if( k == CUDA_MEM && i < 0 && !allowTransposed && fixedDepthMask == 0 )
if( k == CUDA_HOST_MEM && i < 0 && !allowTransposed && fixedDepthMask == 0 )
{
CV_Assert(!fixedSize() || ((cuda::CudaMem*)obj)->size() == Size(_cols, _rows));
CV_Assert(!fixedType() || ((cuda::CudaMem*)obj)->type() == mtype);
((cuda::CudaMem*)obj)->create(_rows, _cols, mtype);
CV_Assert(!fixedSize() || ((cuda::HostMem*)obj)->size() == Size(_cols, _rows));
CV_Assert(!fixedType() || ((cuda::HostMem*)obj)->type() == mtype);
((cuda::HostMem*)obj)->create(_rows, _cols, mtype);
return;
}
int sizes[] = {_rows, _cols};
@ -2479,15 +2487,15 @@ void _OutputArray::release() const
return;
}
if( k == GPU_MAT )
if( k == CUDA_GPU_MAT )
{
((cuda::GpuMat*)obj)->release();
return;
}
if( k == CUDA_MEM )
if( k == CUDA_HOST_MEM )
{
((cuda::CudaMem*)obj)->release();
((cuda::HostMem*)obj)->release();
return;
}
@ -2583,7 +2591,7 @@ UMat& _OutputArray::getUMatRef(int i) const
cuda::GpuMat& _OutputArray::getGpuMatRef() const
{
int k = kind();
CV_Assert( k == GPU_MAT );
CV_Assert( k == CUDA_GPU_MAT );
return *(cuda::GpuMat*)obj;
}
@ -2594,11 +2602,11 @@ ogl::Buffer& _OutputArray::getOGlBufferRef() const
return *(ogl::Buffer*)obj;
}
cuda::CudaMem& _OutputArray::getCudaMemRef() const
cuda::HostMem& _OutputArray::getHostMemRef() const
{
int k = kind();
CV_Assert( k == CUDA_MEM );
return *(cuda::CudaMem*)obj;
CV_Assert( k == CUDA_HOST_MEM );
return *(cuda::HostMem*)obj;
}
void _OutputArray::setTo(const _InputArray& arr, const _InputArray & mask) const
@ -2614,10 +2622,10 @@ void _OutputArray::setTo(const _InputArray& arr, const _InputArray & mask) const
}
else if( k == UMAT )
((UMat*)obj)->setTo(arr, mask);
else if( k == GPU_MAT )
else if( k == CUDA_GPU_MAT )
{
Mat value = arr.getMat();
CV_Assert( checkScalar(value, type(), arr.kind(), _InputArray::GPU_MAT) );
CV_Assert( checkScalar(value, type(), arr.kind(), _InputArray::CUDA_GPU_MAT) );
((cuda::GpuMat*)obj)->setTo(Scalar(Vec<double, 4>(value.ptr<double>())), mask);
}
else

View File

@ -509,7 +509,7 @@ cv::ogl::Buffer::Buffer(InputArray arr, Target target, bool autoRelease) : rows_
switch (kind)
{
case _InputArray::OPENGL_BUFFER:
case _InputArray::GPU_MAT:
case _InputArray::CUDA_GPU_MAT:
copyFrom(arr, target, autoRelease);
break;
@ -594,7 +594,7 @@ void cv::ogl::Buffer::copyFrom(InputArray arr, Target target, bool autoRelease)
break;
}
case _InputArray::GPU_MAT:
case _InputArray::CUDA_GPU_MAT:
{
#ifndef HAVE_CUDA
throw_no_cuda();
@ -657,7 +657,7 @@ void cv::ogl::Buffer::copyTo(OutputArray arr) const
break;
}
case _InputArray::GPU_MAT:
case _InputArray::CUDA_GPU_MAT:
{
#ifndef HAVE_CUDA
throw_no_cuda();
@ -1018,7 +1018,7 @@ cv::ogl::Texture2D::Texture2D(InputArray arr, bool autoRelease) : rows_(0), cols
break;
}
case _InputArray::GPU_MAT:
case _InputArray::CUDA_GPU_MAT:
{
#ifndef HAVE_CUDA
throw_no_cuda();
@ -1132,7 +1132,7 @@ void cv::ogl::Texture2D::copyFrom(InputArray arr, bool autoRelease)
break;
}
case _InputArray::GPU_MAT:
case _InputArray::CUDA_GPU_MAT:
{
#ifndef HAVE_CUDA
throw_no_cuda();
@ -1184,7 +1184,7 @@ void cv::ogl::Texture2D::copyTo(OutputArray arr, int ddepth, bool autoRelease) c
break;
}
case _InputArray::GPU_MAT:
case _InputArray::CUDA_GPU_MAT:
{
#ifndef HAVE_CUDA
throw_no_cuda();

View File

@ -40,13 +40,13 @@
//
//M*/
#include "test_precomp.hpp"
#include "../test_precomp.hpp"
#ifdef HAVE_CUDA
#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudawarping.hpp"
#include "opencv2/core/cuda.hpp"
#include "opencv2/core/private.cuda.hpp"
#include "opencv2/ts/cuda_test.hpp"
using namespace testing;
using namespace cv;
@ -54,65 +54,64 @@ using namespace cv::cuda;
struct BufferPoolTest : TestWithParam<DeviceInfo>
{
void RunSimpleTest(Stream& stream, HostMem& dst_1, HostMem& dst_2)
{
BufferPool pool(stream);
{
GpuMat buf0 = pool.getBuffer(Size(640, 480), CV_8UC1);
EXPECT_FALSE( buf0.empty() );
buf0.setTo(Scalar::all(0), stream);
GpuMat buf1 = pool.getBuffer(Size(640, 480), CV_8UC1);
EXPECT_FALSE( buf1.empty() );
buf0.convertTo(buf1, buf1.type(), 1.0, 1.0, stream);
buf1.download(dst_1, stream);
}
{
GpuMat buf2 = pool.getBuffer(Size(1280, 1024), CV_32SC1);
EXPECT_FALSE( buf2.empty() );
buf2.setTo(Scalar::all(2), stream);
buf2.download(dst_2, stream);
}
}
void CheckSimpleTest(HostMem& dst_1, HostMem& dst_2)
{
EXPECT_MAT_NEAR(Mat(Size(640, 480), CV_8UC1, Scalar::all(1)), dst_1, 0.0);
EXPECT_MAT_NEAR(Mat(Size(1280, 1024), CV_32SC1, Scalar::all(2)), dst_2, 0.0);
}
};
namespace
CUDA_TEST_P(BufferPoolTest, FromNullStream)
{
void func1(const GpuMat& src, GpuMat& dst, Stream& stream)
{
BufferPool pool(stream);
HostMem dst_1, dst_2;
GpuMat buf = pool.getBuffer(src.size(), CV_32FC(src.channels()));
RunSimpleTest(Stream::Null(), dst_1, dst_2);
src.convertTo(buf, CV_32F, 1.0 / 255.0, stream);
cuda::exp(buf, dst, stream);
}
void func2(const GpuMat& src, GpuMat& dst, Stream& stream)
{
BufferPool pool(stream);
GpuMat buf1 = pool.getBuffer(saturate_cast<int>(src.rows * 0.5), saturate_cast<int>(src.cols * 0.5), src.type());
cuda::resize(src, buf1, Size(), 0.5, 0.5, cv::INTER_NEAREST, stream);
GpuMat buf2 = pool.getBuffer(buf1.size(), CV_32FC(buf1.channels()));
func1(buf1, buf2, stream);
GpuMat buf3 = pool.getBuffer(src.size(), buf2.type());
cuda::resize(buf2, buf3, src.size(), 0, 0, cv::INTER_NEAREST, stream);
buf3.convertTo(dst, CV_8U, stream);
}
CheckSimpleTest(dst_1, dst_2);
}
CUDA_TEST_P(BufferPoolTest, SimpleUsage)
CUDA_TEST_P(BufferPoolTest, From2Streams)
{
DeviceInfo devInfo = GetParam();
setDevice(devInfo.deviceID());
HostMem dst1_1, dst1_2;
HostMem dst2_1, dst2_2;
GpuMat src(200, 200, CV_8UC1);
GpuMat dst;
Stream stream1, stream2;
RunSimpleTest(stream1, dst1_1, dst1_2);
RunSimpleTest(stream2, dst2_1, dst2_2);
Stream stream;
stream1.waitForCompletion();
stream2.waitForCompletion();
func2(src, dst, stream);
stream.waitForCompletion();
GpuMat buf, buf1, buf2, buf3;
GpuMat dst_gold;
cuda::resize(src, buf1, Size(), 0.5, 0.5, cv::INTER_NEAREST);
buf1.convertTo(buf, CV_32F, 1.0 / 255.0);
cuda::exp(buf, buf2);
cuda::resize(buf2, buf3, src.size(), 0, 0, cv::INTER_NEAREST);
buf3.convertTo(dst_gold, CV_8U);
ASSERT_MAT_NEAR(dst_gold, dst, 0);
CheckSimpleTest(dst1_1, dst1_2);
CheckSimpleTest(dst2_1, dst2_2);
}
INSTANTIATE_TEST_CASE_P(CUDA_Stream, BufferPoolTest, ALL_DEVICES);

View File

@ -40,16 +40,19 @@
//
//M*/
#include "test_precomp.hpp"
#include "../test_precomp.hpp"
#ifdef HAVE_CUDA
#include "opencv2/core/cuda.hpp"
#include "opencv2/ts/cuda_test.hpp"
using namespace cvtest;
////////////////////////////////////////////////////////////////////////////////
// SetTo
PARAM_TEST_CASE(SetTo, cv::cuda::DeviceInfo, cv::Size, MatType, UseRoi)
PARAM_TEST_CASE(GpuMat_SetTo, cv::cuda::DeviceInfo, cv::Size, MatType, UseRoi)
{
cv::cuda::DeviceInfo devInfo;
cv::Size size;
@ -67,7 +70,7 @@ PARAM_TEST_CASE(SetTo, cv::cuda::DeviceInfo, cv::Size, MatType, UseRoi)
}
};
CUDA_TEST_P(SetTo, Zero)
CUDA_TEST_P(GpuMat_SetTo, Zero)
{
cv::Scalar zero = cv::Scalar::all(0);
@ -77,7 +80,7 @@ CUDA_TEST_P(SetTo, Zero)
EXPECT_MAT_NEAR(cv::Mat::zeros(size, type), mat, 0.0);
}
CUDA_TEST_P(SetTo, SameVal)
CUDA_TEST_P(GpuMat_SetTo, SameVal)
{
cv::Scalar val = cv::Scalar::all(randomDouble(0.0, 255.0));
@ -102,7 +105,7 @@ CUDA_TEST_P(SetTo, SameVal)
}
}
CUDA_TEST_P(SetTo, DifferentVal)
CUDA_TEST_P(GpuMat_SetTo, DifferentVal)
{
cv::Scalar val = randomScalar(0.0, 255.0);
@ -127,7 +130,7 @@ CUDA_TEST_P(SetTo, DifferentVal)
}
}
CUDA_TEST_P(SetTo, Masked)
CUDA_TEST_P(GpuMat_SetTo, Masked)
{
cv::Scalar val = randomScalar(0.0, 255.0);
cv::Mat mat_gold = randomMat(size, type);
@ -156,7 +159,7 @@ CUDA_TEST_P(SetTo, Masked)
}
}
INSTANTIATE_TEST_CASE_P(CUDA_GpuMat, SetTo, testing::Combine(
INSTANTIATE_TEST_CASE_P(CUDA, GpuMat_SetTo, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
ALL_TYPES,
@ -165,7 +168,7 @@ INSTANTIATE_TEST_CASE_P(CUDA_GpuMat, SetTo, testing::Combine(
////////////////////////////////////////////////////////////////////////////////
// CopyTo
PARAM_TEST_CASE(CopyTo, cv::cuda::DeviceInfo, cv::Size, MatType, UseRoi)
PARAM_TEST_CASE(GpuMat_CopyTo, cv::cuda::DeviceInfo, cv::Size, MatType, UseRoi)
{
cv::cuda::DeviceInfo devInfo;
cv::Size size;
@ -184,7 +187,7 @@ PARAM_TEST_CASE(CopyTo, cv::cuda::DeviceInfo, cv::Size, MatType, UseRoi)
}
};
CUDA_TEST_P(CopyTo, WithOutMask)
CUDA_TEST_P(GpuMat_CopyTo, WithOutMask)
{
cv::Mat src = randomMat(size, type);
@ -195,7 +198,7 @@ CUDA_TEST_P(CopyTo, WithOutMask)
EXPECT_MAT_NEAR(src, dst, 0.0);
}
CUDA_TEST_P(CopyTo, Masked)
CUDA_TEST_P(GpuMat_CopyTo, Masked)
{
cv::Mat src = randomMat(size, type);
cv::Mat mask = randomMat(size, CV_8UC1, 0.0, 2.0);
@ -226,7 +229,7 @@ CUDA_TEST_P(CopyTo, Masked)
}
}
INSTANTIATE_TEST_CASE_P(CUDA_GpuMat, CopyTo, testing::Combine(
INSTANTIATE_TEST_CASE_P(CUDA, GpuMat_CopyTo, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
ALL_TYPES,
@ -235,7 +238,7 @@ INSTANTIATE_TEST_CASE_P(CUDA_GpuMat, CopyTo, testing::Combine(
////////////////////////////////////////////////////////////////////////////////
// ConvertTo
PARAM_TEST_CASE(ConvertTo, cv::cuda::DeviceInfo, cv::Size, MatDepth, MatDepth, UseRoi)
PARAM_TEST_CASE(GpuMat_ConvertTo, cv::cuda::DeviceInfo, cv::Size, MatDepth, MatDepth, UseRoi)
{
cv::cuda::DeviceInfo devInfo;
cv::Size size;
@ -255,7 +258,7 @@ PARAM_TEST_CASE(ConvertTo, cv::cuda::DeviceInfo, cv::Size, MatDepth, MatDepth, U
}
};
CUDA_TEST_P(ConvertTo, WithOutScaling)
CUDA_TEST_P(GpuMat_ConvertTo, WithOutScaling)
{
cv::Mat src = randomMat(size, depth1);
@ -285,7 +288,7 @@ CUDA_TEST_P(ConvertTo, WithOutScaling)
}
}
CUDA_TEST_P(ConvertTo, WithScaling)
CUDA_TEST_P(GpuMat_ConvertTo, WithScaling)
{
cv::Mat src = randomMat(size, depth1);
double a = randomDouble(0.0, 1.0);
@ -317,7 +320,7 @@ CUDA_TEST_P(ConvertTo, WithScaling)
}
}
INSTANTIATE_TEST_CASE_P(CUDA_GpuMat, ConvertTo, testing::Combine(
INSTANTIATE_TEST_CASE_P(CUDA, GpuMat_ConvertTo, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
ALL_DEPTH,
@ -356,6 +359,6 @@ CUDA_TEST_P(EnsureSizeIsEnough, BufferReuse)
EXPECT_EQ(reinterpret_cast<intptr_t>(old.data), reinterpret_cast<intptr_t>(buffer.data));
}
INSTANTIATE_TEST_CASE_P(CUDA_GpuMat, EnsureSizeIsEnough, ALL_DEVICES);
INSTANTIATE_TEST_CASE_P(CUDA, EnsureSizeIsEnough, ALL_DEVICES);
#endif // HAVE_CUDA

View File

@ -40,10 +40,14 @@
//
//M*/
#include "test_precomp.hpp"
#include "../test_precomp.hpp"
#if defined(HAVE_CUDA) && defined(HAVE_OPENGL)
#include "opencv2/core/cuda.hpp"
#include "opencv2/core/opengl.hpp"
#include "opencv2/ts/cuda_test.hpp"
using namespace cvtest;
/////////////////////////////////////////////

View File

@ -40,22 +40,23 @@
//
//M*/
#include "test_precomp.hpp"
#include "../test_precomp.hpp"
#ifdef HAVE_CUDA
#include <cuda_runtime.h>
#if CUDART_VERSION >= 5000
#include "opencv2/core/cuda.hpp"
#include "opencv2/ts/cuda_test.hpp"
using namespace cvtest;
struct Async : testing::TestWithParam<cv::cuda::DeviceInfo>
{
cv::cuda::CudaMem src;
cv::cuda::HostMem src;
cv::cuda::GpuMat d_src;
cv::cuda::CudaMem dst;
cv::cuda::HostMem dst;
cv::cuda::GpuMat d_dst;
virtual void SetUp()
@ -63,7 +64,7 @@ struct Async : testing::TestWithParam<cv::cuda::DeviceInfo>
cv::cuda::DeviceInfo devInfo = GetParam();
cv::cuda::setDevice(devInfo.deviceID());
src = cv::cuda::CudaMem(cv::cuda::CudaMem::PAGE_LOCKED);
src = cv::cuda::HostMem(cv::cuda::HostMem::PAGE_LOCKED);
cv::Mat m = randomMat(cv::Size(128, 128), CV_8UC1);
m.copyTo(src);
@ -76,8 +77,8 @@ void checkMemSet(int status, void* userData)
Async* test = reinterpret_cast<Async*>(userData);
cv::cuda::CudaMem src = test->src;
cv::cuda::CudaMem dst = test->dst;
cv::cuda::HostMem src = test->src;
cv::cuda::HostMem dst = test->dst;
cv::Mat dst_gold = cv::Mat::zeros(src.size(), src.type());
@ -105,8 +106,8 @@ void checkConvert(int status, void* userData)
Async* test = reinterpret_cast<Async*>(userData);
cv::cuda::CudaMem src = test->src;
cv::cuda::CudaMem dst = test->dst;
cv::cuda::HostMem src = test->src;
cv::cuda::HostMem dst = test->dst;
cv::Mat dst_gold;
src.createMatHeader().convertTo(dst_gold, CV_32S);
@ -128,8 +129,25 @@ CUDA_TEST_P(Async, Convert)
stream.waitForCompletion();
}
CUDA_TEST_P(Async, HostMemAllocator)
{
cv::cuda::Stream stream;
cv::Mat h_dst;
h_dst.allocator = cv::cuda::HostMem::getAllocator();
d_src.upload(src, stream);
d_src.convertTo(d_dst, CV_32S, stream);
d_dst.download(h_dst, stream);
stream.waitForCompletion();
cv::Mat dst_gold;
src.createMatHeader().convertTo(dst_gold, CV_32S);
ASSERT_MAT_NEAR(dst_gold, h_dst, 0);
}
INSTANTIATE_TEST_CASE_P(CUDA_Stream, Async, ALL_DEVICES);
#endif // CUDART_VERSION >= 5000
#endif // HAVE_CUDA

View File

@ -7,4 +7,14 @@
#include "test_precomp.hpp"
#ifndef HAVE_CUDA
CV_TEST_MAIN("cv")
#else
#include "opencv2/ts/cuda_test.hpp"
CV_CUDA_TEST_MAIN("cv")
#endif

View File

@ -50,15 +50,11 @@
#include "opencv2/core/cuda.hpp"
/**
@defgroup cuda CUDA-accelerated Computer Vision
@ref cuda_intro "Introduction page"
@addtogroup cuda
@{
@defgroup cuda_init Initalization and Information
@defgroup cuda_struct Data Structures
@defgroup cuda_calib3d Camera Calibration and 3D Reconstruction
@defgroup cuda_objdetect Object Detection
@}
*/
namespace cv { namespace cuda {

View File

@ -1,114 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "perf_precomp.hpp"
#ifdef HAVE_CUDA
#include "opencv2/cudaarithm.hpp"
#include "opencv2/core/private.cuda.hpp"
using namespace testing;
using namespace perf;
using namespace cv;
using namespace cv::cuda;
namespace
{
void func1(const GpuMat& src, GpuMat& dst, Stream& stream)
{
BufferPool pool(stream);
GpuMat buf = pool.getBuffer(src.size(), CV_32FC(src.channels()));
src.convertTo(buf, CV_32F, 1.0 / 255.0, stream);
cuda::exp(buf, dst, stream);
}
void func2(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
{
BufferPool pool(stream);
GpuMat buf1 = pool.getBuffer(src1.size(), CV_32FC(src1.channels()));
func1(src1, buf1, stream);
GpuMat buf2 = pool.getBuffer(src2.size(), CV_32FC(src2.channels()));
func1(src2, buf2, stream);
cuda::add(buf1, buf2, dst, noArray(), -1, stream);
}
}
PERF_TEST_P(Sz, BufferPool, CUDA_TYPICAL_MAT_SIZES)
{
static bool first = true;
const Size size = GetParam();
const bool useBufferPool = PERF_RUN_CUDA();
Mat host_src(size, CV_8UC1);
declare.in(host_src, WARMUP_RNG);
GpuMat src1(host_src), src2(host_src);
GpuMat dst;
setBufferPoolUsage(useBufferPool);
if (useBufferPool && first)
{
setBufferPoolConfig(-1, 25 * 1024 * 1024, 2);
first = false;
}
TEST_CYCLE()
{
func2(src1, src2, dst, Stream::Null());
}
Mat h_dst(dst);
SANITY_CHECK(h_dst);
}
#endif

View File

@ -4,7 +4,7 @@ endif()
set(the_description "CUDA-accelerated Operations on Matrices")
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wmissing-declarations)
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wmissing-declarations -Wshadow)
ocv_add_module(cudaarithm opencv_core OPTIONAL opencv_cudev)

View File

@ -55,7 +55,7 @@ Performs a per-element multiplication of two Fourier spectrums.
:param dst: Destination spectrum.
:param flags: Mock parameter used for CPU/CUDA interfaces similarity.
:param flags: Mock parameter used for CPU/CUDA interfaces similarity, simply add a `0` value.
:param conjB: Optional flag to specify if the second spectrum needs to be conjugated before the multiplication.

View File

@ -892,7 +892,7 @@ CV_EXPORTS void mulSpectrums(InputArray src1, InputArray src2, OutputArray dst,
@param src1 First spectrum.
@param src2 Second spectrum with the same size and type as a .
@param dst Destination spectrum.
@param flags Mock parameter used for CPU/CUDA interfaces similarity.
@param flags Mock parameter used for CPU/CUDA interfaces similarity, simply add a `0` value.
@param scale Scale constant.
@param conjB Optional flag to specify if the second spectrum needs to be conjugated before the
multiplication.

View File

@ -74,7 +74,7 @@ namespace
LookUpTableImpl::LookUpTableImpl(InputArray _lut)
{
if (_lut.kind() == _InputArray::GPU_MAT)
if (_lut.kind() == _InputArray::CUDA_GPU_MAT)
{
d_lut = _lut.getGpuMat();
}

View File

@ -4,6 +4,6 @@ endif()
set(the_description "CUDA-accelerated Background Segmentation")
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wmissing-declarations)
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wmissing-declarations -Wshadow)
ocv_define_module(cudabgsegm opencv_video OPTIONAL opencv_imgproc opencv_cudaarithm opencv_cudafilters opencv_cudaimgproc)

View File

@ -4,7 +4,7 @@ endif()
set(the_description "CUDA-accelerated Video Encoding/Decoding")
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef)
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wshadow)
ocv_add_module(cudacodec opencv_core opencv_videoio OPTIONAL opencv_cudev)

View File

@ -4,6 +4,6 @@ endif()
set(the_description "CUDA-accelerated Feature Detection and Description")
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4100 /wd4324 /wd4512 /wd4515 -Wundef -Wmissing-declarations -Wshadow -Wunused-parameter)
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4100 /wd4324 /wd4512 /wd4515 -Wundef -Wmissing-declarations -Wshadow -Wunused-parameter -Wshadow)
ocv_define_module(cudafeatures2d opencv_features2d opencv_cudafilters opencv_cudawarping)

View File

@ -4,6 +4,6 @@ endif()
set(the_description "CUDA-accelerated Image Filtering")
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wmissing-declarations)
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wmissing-declarations -Wshadow)
ocv_define_module(cudafilters opencv_imgproc opencv_cudaarithm)

View File

@ -6,4 +6,4 @@ set(the_description "CUDA-accelerated Image Processing")
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4100 /wd4324 /wd4512 /wd4515 -Wundef -Wmissing-declarations -Wshadow -Wunused-parameter)
ocv_define_module(cudaimgproc opencv_imgproc OPTIONAL opencv_cudaarithm opencv_cudafilters)
ocv_define_module(cudaimgproc opencv_imgproc OPTIONAL opencv_cudev opencv_cudaarithm opencv_cudafilters)

View File

@ -275,7 +275,7 @@ PERF_TEST_P(Sz, GeneralizedHoughBallard, CUDA_TYPICAL_MAT_SIZES)
}
}
PERF_TEST_P(Sz, GeneralizedHoughGuil, CUDA_TYPICAL_MAT_SIZES)
PERF_TEST_P(Sz, DISABLED_GeneralizedHoughGuil, CUDA_TYPICAL_MAT_SIZES)
{
declare.time(10);
@ -329,8 +329,6 @@ PERF_TEST_P(Sz, GeneralizedHoughGuil, CUDA_TYPICAL_MAT_SIZES)
alg->setTemplate(cv::cuda::GpuMat(templ));
TEST_CYCLE() alg->detect(d_edges, d_dx, d_dy, positions);
CUDA_SANITY_CHECK(positions);
}
else
{
@ -343,7 +341,8 @@ PERF_TEST_P(Sz, GeneralizedHoughGuil, CUDA_TYPICAL_MAT_SIZES)
alg->setTemplate(templ);
TEST_CYCLE() alg->detect(edges, dx, dy, positions);
CPU_SANITY_CHECK(positions);
}
// The algorithm is not stable yet.
SANITY_CHECK_NOTHING();
}

File diff suppressed because it is too large Load Diff

View File

@ -40,422 +40,258 @@
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/opencv_modules.hpp"
#ifndef HAVE_OPENCV_CUDEV
#error "opencv_cudev is required"
#else
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/color.hpp"
#include "cvt_color_internal.h"
#include "opencv2/cudev.hpp"
using namespace cv;
using namespace cv::cuda;
using namespace cv::cudev;
namespace cv { namespace cuda { namespace device
{
OPENCV_CUDA_TRANSFORM_FUNCTOR_TRAITS(bgra_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_x = 8 };
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_CUDA_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr555_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_CUDA_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr555_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_CUDA_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr565_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_CUDA_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr565_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_CUDA_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_bgra_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_CUDA_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_rgba_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_CUDA_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_bgra_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_CUDA_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_rgba_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_CUDA_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_CUDA_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr555_traits::functor_type)
{
enum { smart_shift = 4 };
};
OPENCV_CUDA_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr565_traits::functor_type)
{
enum { smart_shift = 4 };
};
OPENCV_CUDA_TRANSFORM_FUNCTOR_TRAITS(bgra_to_yuv4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_CUDA_TRANSFORM_FUNCTOR_TRAITS(rgba_to_yuv4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_CUDA_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_CUDA_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_CUDA_TRANSFORM_FUNCTOR_TRAITS(bgra_to_YCrCb4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_CUDA_TRANSFORM_FUNCTOR_TRAITS(rgba_to_YCrCb4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_CUDA_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_CUDA_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_CUDA_TRANSFORM_FUNCTOR_TRAITS(bgra_to_xyz4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_CUDA_TRANSFORM_FUNCTOR_TRAITS(rgba_to_xyz4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_CUDA_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_CUDA_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_CUDA_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hsv4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_CUDA_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hsv4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_CUDA_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_CUDA_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_CUDA_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hls4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_CUDA_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hls4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_CUDA_TRANSFORM_FUNCTOR_TRAITS(hls4_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_CUDA_TRANSFORM_FUNCTOR_TRAITS(hls4_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
#define OPENCV_CUDA_IMPLEMENT_CVTCOLOR(name, traits) \
void name(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream) \
#define OPENCV_CUDA_IMPLEMENT_CVTCOLOR(name, func_t) \
void name(const GpuMat& src, GpuMat& dst, Stream& stream) \
{ \
traits::functor_type functor = traits::create_functor(); \
typedef typename traits::functor_type::argument_type src_t; \
typedef typename traits::functor_type::result_type dst_t; \
cv::cuda::device::transform((PtrStepSz<src_t>)src, (PtrStepSz<dst_t>)dst, functor, WithOutMask(), stream); \
func_t op; \
typedef typename func_t::argument_type src_t; \
typedef typename func_t::result_type dst_t; \
gridTransformUnary(globPtr<src_t>(src), globPtr<dst_t>(dst), op, stream); \
}
#define OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ONE(name) \
OPENCV_CUDA_IMPLEMENT_CVTCOLOR(name, name ## _traits)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR(name, name ## _func)
#define OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(name) \
OPENCV_CUDA_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \
OPENCV_CUDA_IMPLEMENT_CVTCOLOR(name ## _16u, name ## _traits<ushort>) \
OPENCV_CUDA_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _func<uchar>) \
OPENCV_CUDA_IMPLEMENT_CVTCOLOR(name ## _16u, name ## _func<ushort>) \
OPENCV_CUDA_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _func<float>)
#define OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(name) \
OPENCV_CUDA_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \
OPENCV_CUDA_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _func<uchar>) \
OPENCV_CUDA_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _func<float>)
#define OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(name) \
OPENCV_CUDA_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \
OPENCV_CUDA_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>) \
OPENCV_CUDA_IMPLEMENT_CVTCOLOR(name ## _full_8u, name ## _full_traits<uchar>) \
OPENCV_CUDA_IMPLEMENT_CVTCOLOR(name ## _full_32f, name ## _full_traits<float>)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _func<uchar>) \
OPENCV_CUDA_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _func<float>) \
OPENCV_CUDA_IMPLEMENT_CVTCOLOR(name ## _FULL_8u, name ## _FULL_func<uchar>) \
OPENCV_CUDA_IMPLEMENT_CVTCOLOR(name ## _FULL_32f, name ## _FULL_func<float>)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgb)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(bgr_to_bgra)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgba)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(bgra_to_bgr)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgb)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgba)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(BGR_to_RGB)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(BGR_to_BGRA)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(BGR_to_RGBA)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(BGRA_to_BGR)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(BGRA_to_RGB)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(BGRA_to_RGBA)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr555)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr565)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr555)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr565)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr555)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr565)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr555)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr565)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(RGB_to_GRAY)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(BGR_to_GRAY)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(RGBA_to_GRAY)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(BGRA_to_GRAY)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgb)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgb)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgr)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgr)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgba)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgba)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgra)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgra)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(GRAY_to_BGR)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(GRAY_to_BGRA)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgr)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgra)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(RGB_to_YUV)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(RGBA_to_YUV)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(RGB_to_YUV4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(RGBA_to_YUV4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(BGR_to_YUV)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(BGRA_to_YUV)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(BGR_to_YUV4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(BGRA_to_YUV4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr555)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr565)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(YUV_to_RGB)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(YUV_to_RGBA)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(YUV4_to_RGB)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(YUV4_to_RGBA)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(YUV_to_BGR)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(YUV_to_BGRA)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(YUV4_to_BGR)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(YUV4_to_BGRA)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_gray)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_gray)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(RGB_to_YCrCb)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(RGBA_to_YCrCb)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(RGB_to_YCrCb4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(RGBA_to_YCrCb4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(BGR_to_YCrCb)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(BGRA_to_YCrCb)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(BGR_to_YCrCb4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(BGRA_to_YCrCb4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(rgb_to_gray)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(bgr_to_gray)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(rgba_to_gray)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(bgra_to_gray)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_RGB)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_RGBA)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_RGB)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_RGBA)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_BGR)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_BGRA)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_BGR)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_BGRA)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(RGB_to_XYZ)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(RGBA_to_XYZ)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(RGB_to_XYZ4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(RGBA_to_XYZ4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(BGR_to_XYZ)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(BGRA_to_XYZ)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(BGR_to_XYZ4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(BGRA_to_XYZ4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgb)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgba)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgb)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgba)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgr)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgra)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgr)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgra)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(XYZ_to_RGB)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(XYZ4_to_RGB)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(XYZ_to_RGBA)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(XYZ4_to_RGBA)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(XYZ_to_BGR)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(XYZ4_to_BGR)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(XYZ_to_BGRA)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(XYZ4_to_BGRA)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(RGB_to_HSV)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(RGBA_to_HSV)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(RGB_to_HSV4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(RGBA_to_HSV4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(BGR_to_HSV)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(BGRA_to_HSV)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(BGR_to_HSV4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(BGRA_to_HSV4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgb)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgba)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgb)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgba)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgr)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgra)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgr)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgra)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(HSV_to_RGB)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(HSV_to_RGBA)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(HSV4_to_RGB)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(HSV4_to_RGBA)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(HSV_to_BGR)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(HSV_to_BGRA)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(HSV4_to_BGR)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(HSV4_to_BGRA)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(RGB_to_HLS)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(RGBA_to_HLS)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(RGB_to_HLS4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(RGBA_to_HLS4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(BGR_to_HLS)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(BGRA_to_HLS)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(BGR_to_HLS4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(BGRA_to_HLS4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgb)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgb)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgba)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgba)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgr)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgr)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgra)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgra)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(HLS_to_RGB)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(HLS_to_RGBA)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(HLS4_to_RGB)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(HLS4_to_RGBA)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(HLS_to_BGR)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(HLS_to_BGRA)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(HLS4_to_BGR)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(HLS4_to_BGRA)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgb_to_hsv)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgba_to_hsv)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgb_to_hsv4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgba_to_hsv4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgr_to_hsv)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgra_to_hsv)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgr_to_hsv4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgra_to_hsv4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(RGB_to_Lab)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(RGBA_to_Lab)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(RGB_to_Lab4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(RGBA_to_Lab4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(BGR_to_Lab)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(BGRA_to_Lab)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(BGR_to_Lab4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(BGRA_to_Lab4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv_to_rgb)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv_to_rgba)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv4_to_rgb)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv4_to_rgba)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv_to_bgr)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv_to_bgra)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv4_to_bgr)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv4_to_bgra)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(LRGB_to_Lab)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(LRGBA_to_Lab)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(LRGB_to_Lab4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(LRGBA_to_Lab4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(LBGR_to_Lab)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(LBGRA_to_Lab)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(LBGR_to_Lab4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(LBGRA_to_Lab4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgb_to_hls)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgba_to_hls)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgb_to_hls4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgba_to_hls4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgr_to_hls)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgra_to_hls)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgr_to_hls4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgra_to_hls4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(Lab_to_RGB)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(Lab4_to_RGB)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(Lab_to_RGBA)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(Lab4_to_RGBA)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(Lab_to_BGR)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(Lab4_to_BGR)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(Lab_to_BGRA)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(Lab4_to_BGRA)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls_to_rgb)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls_to_rgba)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls4_to_rgb)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls4_to_rgba)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls_to_bgr)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls_to_bgra)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls4_to_bgr)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls4_to_bgra)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(Lab_to_LRGB)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(Lab4_to_LRGB)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(Lab_to_LRGBA)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(Lab4_to_LRGBA)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(Lab_to_LBGR)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(Lab4_to_LBGR)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(Lab_to_LBGRA)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(Lab4_to_LBGRA)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_lab)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_lab)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_lab4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_lab4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_lab)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_lab)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_lab4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_lab4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(RGB_to_Luv)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(RGBA_to_Luv)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(RGB_to_Luv4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(RGBA_to_Luv4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(BGR_to_Luv)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(BGRA_to_Luv)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(BGR_to_Luv4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(BGRA_to_Luv4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(lrgb_to_lab)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(lrgba_to_lab)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(lrgb_to_lab4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(lrgba_to_lab4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(lbgr_to_lab)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(lbgra_to_lab)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(lbgr_to_lab4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(lbgra_to_lab4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(LRGB_to_Luv)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(LRGBA_to_Luv)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(LRGB_to_Luv4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(LRGBA_to_Luv4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(LBGR_to_Luv)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(LBGRA_to_Luv)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(LBGR_to_Luv4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(LBGRA_to_Luv4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(lab_to_rgb)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_rgb)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(lab_to_rgba)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_rgba)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(lab_to_bgr)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_bgr)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(lab_to_bgra)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_bgra)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(Luv_to_RGB)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(Luv4_to_RGB)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(Luv_to_RGBA)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(Luv4_to_RGBA)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(Luv_to_BGR)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(Luv4_to_BGR)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(Luv_to_BGRA)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(Luv4_to_BGRA)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(lab_to_lrgb)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_lrgb)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(lab_to_lrgba)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_lrgba)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(lab_to_lbgr)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_lbgr)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(lab_to_lbgra)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_lbgra)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(Luv_to_LRGB)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(Luv4_to_LRGB)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(Luv_to_LRGBA)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(Luv4_to_LRGBA)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(Luv_to_LBGR)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(Luv4_to_LBGR)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(Luv_to_LBGRA)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(Luv4_to_LBGRA)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_luv)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_luv)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_luv4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_luv4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_luv)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_luv)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_luv4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_luv4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ONE(BGR_to_BGR555)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ONE(BGR_to_BGR565)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ONE(RGB_to_BGR555)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ONE(RGB_to_BGR565)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ONE(BGRA_to_BGR555)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ONE(BGRA_to_BGR565)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ONE(RGBA_to_BGR555)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ONE(RGBA_to_BGR565)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(lrgb_to_luv)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(lrgba_to_luv)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(lrgb_to_luv4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(lrgba_to_luv4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(lbgr_to_luv)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(lbgra_to_luv)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(lbgr_to_luv4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(lbgra_to_luv4)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ONE(BGR555_to_RGB)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ONE(BGR565_to_RGB)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ONE(BGR555_to_BGR)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ONE(BGR565_to_BGR)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ONE(BGR555_to_RGBA)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ONE(BGR565_to_RGBA)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ONE(BGR555_to_BGRA)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ONE(BGR565_to_BGRA)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(luv_to_rgb)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_rgb)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(luv_to_rgba)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_rgba)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(luv_to_bgr)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_bgr)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(luv_to_bgra)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_bgra)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ONE(GRAY_to_BGR555)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ONE(GRAY_to_BGR565)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(luv_to_lrgb)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_lrgb)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(luv_to_lrgba)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_lrgba)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(luv_to_lbgr)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_lbgr)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(luv_to_lbgra)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_lbgra)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ONE(BGR555_to_GRAY)
OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ONE(BGR565_to_GRAY)
#undef OPENCV_CUDA_IMPLEMENT_CVTCOLOR
#undef OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ONE
#undef OPENCV_CUDA_IMPLEMENT_CVTCOLOR_ALL
#undef OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F
#undef OPENCV_CUDA_IMPLEMENT_CVTCOLOR_8U32F_FULL
}}} // namespace cv { namespace cuda { namespace cudev
#endif /* CUDA_DISABLER */
}}}
#endif

View File

@ -218,7 +218,7 @@ namespace cv { namespace cuda { namespace device
// Prepared_SQDIFF
template <int cn>
__global__ void matchTemplatePreparedKernel_SQDIFF_8U(int w, int h, const PtrStep<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result)
__global__ void matchTemplatePreparedKernel_SQDIFF_8U(int w, int h, const PtrStep<double> image_sqsum, double templ_sqsum, PtrStepSzf result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -234,7 +234,7 @@ namespace cv { namespace cuda { namespace device
}
template <int cn>
void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, cudaStream_t stream)
void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<double> image_sqsum, double templ_sqsum, PtrStepSzf result, cudaStream_t stream)
{
const dim3 threads(32, 8);
const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
@ -246,10 +246,10 @@ namespace cv { namespace cuda { namespace device
cudaSafeCall( cudaDeviceSynchronize() );
}
void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, int cn,
void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<double> image_sqsum, double templ_sqsum, PtrStepSzf result, int cn,
cudaStream_t stream)
{
typedef void (*caller_t)(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, cudaStream_t stream);
typedef void (*caller_t)(int w, int h, const PtrStepSz<double> image_sqsum, double templ_sqsum, PtrStepSzf result, cudaStream_t stream);
static const caller_t callers[] =
{
@ -287,8 +287,8 @@ namespace cv { namespace cuda { namespace device
template <int cn>
__global__ void matchTemplatePreparedKernel_SQDIFF_NORMED_8U(
int w, int h, const PtrStep<unsigned long long> image_sqsum,
unsigned long long templ_sqsum, PtrStepSzf result)
int w, int h, const PtrStep<double> image_sqsum,
double templ_sqsum, PtrStepSzf result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -305,7 +305,7 @@ namespace cv { namespace cuda { namespace device
}
template <int cn>
void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum,
void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<double> image_sqsum, double templ_sqsum,
PtrStepSzf result, cudaStream_t stream)
{
const dim3 threads(32, 8);
@ -319,10 +319,10 @@ namespace cv { namespace cuda { namespace device
}
void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum,
void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<double> image_sqsum, double templ_sqsum,
PtrStepSzf result, int cn, cudaStream_t stream)
{
typedef void (*caller_t)(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, cudaStream_t stream);
typedef void (*caller_t)(int w, int h, const PtrStepSz<double> image_sqsum, double templ_sqsum, PtrStepSzf result, cudaStream_t stream);
static const caller_t callers[] =
{
0, matchTemplatePrepared_SQDIFF_NORMED_8U<1>, matchTemplatePrepared_SQDIFF_NORMED_8U<2>, matchTemplatePrepared_SQDIFF_NORMED_8U<3>, matchTemplatePrepared_SQDIFF_NORMED_8U<4>
@ -334,7 +334,7 @@ namespace cv { namespace cuda { namespace device
//////////////////////////////////////////////////////////////////////
// Prepared_CCOFF
__global__ void matchTemplatePreparedKernel_CCOFF_8U(int w, int h, float templ_sum_scale, const PtrStep<unsigned int> image_sum, PtrStepSzf result)
__global__ void matchTemplatePreparedKernel_CCOFF_8U(int w, int h, float templ_sum_scale, const PtrStep<int> image_sum, PtrStepSzf result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -349,7 +349,7 @@ namespace cv { namespace cuda { namespace device
}
}
void matchTemplatePrepared_CCOFF_8U(int w, int h, const PtrStepSz<unsigned int> image_sum, unsigned int templ_sum, PtrStepSzf result, cudaStream_t stream)
void matchTemplatePrepared_CCOFF_8U(int w, int h, const PtrStepSz<int> image_sum, int templ_sum, PtrStepSzf result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
@ -365,8 +365,8 @@ namespace cv { namespace cuda { namespace device
__global__ void matchTemplatePreparedKernel_CCOFF_8UC2(
int w, int h, float templ_sum_scale_r, float templ_sum_scale_g,
const PtrStep<unsigned int> image_sum_r,
const PtrStep<unsigned int> image_sum_g,
const PtrStep<int> image_sum_r,
const PtrStep<int> image_sum_g,
PtrStepSzf result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
@ -388,9 +388,9 @@ namespace cv { namespace cuda { namespace device
void matchTemplatePrepared_CCOFF_8UC2(
int w, int h,
const PtrStepSz<unsigned int> image_sum_r,
const PtrStepSz<unsigned int> image_sum_g,
unsigned int templ_sum_r, unsigned int templ_sum_g,
const PtrStepSz<int> image_sum_r,
const PtrStepSz<int> image_sum_g,
int templ_sum_r, int templ_sum_g,
PtrStepSzf result, cudaStream_t stream)
{
dim3 threads(32, 8);
@ -412,9 +412,9 @@ namespace cv { namespace cuda { namespace device
float templ_sum_scale_r,
float templ_sum_scale_g,
float templ_sum_scale_b,
const PtrStep<unsigned int> image_sum_r,
const PtrStep<unsigned int> image_sum_g,
const PtrStep<unsigned int> image_sum_b,
const PtrStep<int> image_sum_r,
const PtrStep<int> image_sum_g,
const PtrStep<int> image_sum_b,
PtrStepSzf result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
@ -440,12 +440,12 @@ namespace cv { namespace cuda { namespace device
void matchTemplatePrepared_CCOFF_8UC3(
int w, int h,
const PtrStepSz<unsigned int> image_sum_r,
const PtrStepSz<unsigned int> image_sum_g,
const PtrStepSz<unsigned int> image_sum_b,
unsigned int templ_sum_r,
unsigned int templ_sum_g,
unsigned int templ_sum_b,
const PtrStepSz<int> image_sum_r,
const PtrStepSz<int> image_sum_g,
const PtrStepSz<int> image_sum_b,
int templ_sum_r,
int templ_sum_g,
int templ_sum_b,
PtrStepSzf result, cudaStream_t stream)
{
dim3 threads(32, 8);
@ -471,10 +471,10 @@ namespace cv { namespace cuda { namespace device
float templ_sum_scale_g,
float templ_sum_scale_b,
float templ_sum_scale_a,
const PtrStep<unsigned int> image_sum_r,
const PtrStep<unsigned int> image_sum_g,
const PtrStep<unsigned int> image_sum_b,
const PtrStep<unsigned int> image_sum_a,
const PtrStep<int> image_sum_r,
const PtrStep<int> image_sum_g,
const PtrStep<int> image_sum_b,
const PtrStep<int> image_sum_a,
PtrStepSzf result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
@ -504,14 +504,14 @@ namespace cv { namespace cuda { namespace device
void matchTemplatePrepared_CCOFF_8UC4(
int w, int h,
const PtrStepSz<unsigned int> image_sum_r,
const PtrStepSz<unsigned int> image_sum_g,
const PtrStepSz<unsigned int> image_sum_b,
const PtrStepSz<unsigned int> image_sum_a,
unsigned int templ_sum_r,
unsigned int templ_sum_g,
unsigned int templ_sum_b,
unsigned int templ_sum_a,
const PtrStepSz<int> image_sum_r,
const PtrStepSz<int> image_sum_g,
const PtrStepSz<int> image_sum_b,
const PtrStepSz<int> image_sum_a,
int templ_sum_r,
int templ_sum_g,
int templ_sum_b,
int templ_sum_a,
PtrStepSzf result, cudaStream_t stream)
{
dim3 threads(32, 8);
@ -537,8 +537,8 @@ namespace cv { namespace cuda { namespace device
__global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8U(
int w, int h, float weight,
float templ_sum_scale, float templ_sqsum_scale,
const PtrStep<unsigned int> image_sum,
const PtrStep<unsigned long long> image_sqsum,
const PtrStep<int> image_sum,
const PtrStep<double> image_sqsum,
PtrStepSzf result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
@ -559,9 +559,9 @@ namespace cv { namespace cuda { namespace device
}
void matchTemplatePrepared_CCOFF_NORMED_8U(
int w, int h, const PtrStepSz<unsigned int> image_sum,
const PtrStepSz<unsigned long long> image_sqsum,
unsigned int templ_sum, unsigned long long templ_sqsum,
int w, int h, const PtrStepSz<int> image_sum,
const PtrStepSz<double> image_sqsum,
int templ_sum, double templ_sqsum,
PtrStepSzf result, cudaStream_t stream)
{
dim3 threads(32, 8);
@ -586,8 +586,8 @@ namespace cv { namespace cuda { namespace device
int w, int h, float weight,
float templ_sum_scale_r, float templ_sum_scale_g,
float templ_sqsum_scale,
const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,
const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,
const PtrStep<int> image_sum_r, const PtrStep<double> image_sqsum_r,
const PtrStep<int> image_sum_g, const PtrStep<double> image_sqsum_g,
PtrStepSzf result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
@ -618,10 +618,10 @@ namespace cv { namespace cuda { namespace device
void matchTemplatePrepared_CCOFF_NORMED_8UC2(
int w, int h,
const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
const PtrStepSz<int> image_sum_r, const PtrStepSz<double> image_sqsum_r,
const PtrStepSz<int> image_sum_g, const PtrStepSz<double> image_sqsum_g,
int templ_sum_r, double templ_sqsum_r,
int templ_sum_g, double templ_sqsum_g,
PtrStepSzf result, cudaStream_t stream)
{
dim3 threads(32, 8);
@ -652,9 +652,9 @@ namespace cv { namespace cuda { namespace device
int w, int h, float weight,
float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b,
float templ_sqsum_scale,
const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,
const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,
const PtrStep<unsigned int> image_sum_b, const PtrStep<unsigned long long> image_sqsum_b,
const PtrStep<int> image_sum_r, const PtrStep<double> image_sqsum_r,
const PtrStep<int> image_sum_g, const PtrStep<double> image_sqsum_g,
const PtrStep<int> image_sum_b, const PtrStep<double> image_sqsum_b,
PtrStepSzf result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
@ -693,12 +693,12 @@ namespace cv { namespace cuda { namespace device
void matchTemplatePrepared_CCOFF_NORMED_8UC3(
int w, int h,
const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<unsigned long long> image_sqsum_b,
unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
unsigned int templ_sum_b, unsigned long long templ_sqsum_b,
const PtrStepSz<int> image_sum_r, const PtrStepSz<double> image_sqsum_r,
const PtrStepSz<int> image_sum_g, const PtrStepSz<double> image_sqsum_g,
const PtrStepSz<int> image_sum_b, const PtrStepSz<double> image_sqsum_b,
int templ_sum_r, double templ_sqsum_r,
int templ_sum_g, double templ_sqsum_g,
int templ_sum_b, double templ_sqsum_b,
PtrStepSzf result, cudaStream_t stream)
{
dim3 threads(32, 8);
@ -732,10 +732,10 @@ namespace cv { namespace cuda { namespace device
int w, int h, float weight,
float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b,
float templ_sum_scale_a, float templ_sqsum_scale,
const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,
const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,
const PtrStep<unsigned int> image_sum_b, const PtrStep<unsigned long long> image_sqsum_b,
const PtrStep<unsigned int> image_sum_a, const PtrStep<unsigned long long> image_sqsum_a,
const PtrStep<int> image_sum_r, const PtrStep<double> image_sqsum_r,
const PtrStep<int> image_sum_g, const PtrStep<double> image_sqsum_g,
const PtrStep<int> image_sum_b, const PtrStep<double> image_sqsum_b,
const PtrStep<int> image_sum_a, const PtrStep<double> image_sqsum_a,
PtrStepSzf result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
@ -780,14 +780,14 @@ namespace cv { namespace cuda { namespace device
void matchTemplatePrepared_CCOFF_NORMED_8UC4(
int w, int h,
const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<unsigned long long> image_sqsum_b,
const PtrStepSz<unsigned int> image_sum_a, const PtrStepSz<unsigned long long> image_sqsum_a,
unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
unsigned int templ_sum_b, unsigned long long templ_sqsum_b,
unsigned int templ_sum_a, unsigned long long templ_sqsum_a,
const PtrStepSz<int> image_sum_r, const PtrStepSz<double> image_sqsum_r,
const PtrStepSz<int> image_sum_g, const PtrStepSz<double> image_sqsum_g,
const PtrStepSz<int> image_sum_b, const PtrStepSz<double> image_sqsum_b,
const PtrStepSz<int> image_sum_a, const PtrStepSz<double> image_sqsum_a,
int templ_sum_r, double templ_sqsum_r,
int templ_sum_g, double templ_sqsum_g,
int templ_sum_b, double templ_sqsum_b,
int templ_sum_a, double templ_sqsum_a,
PtrStepSzf result, cudaStream_t stream)
{
dim3 threads(32, 8);
@ -823,8 +823,8 @@ namespace cv { namespace cuda { namespace device
template <int cn>
__global__ void normalizeKernel_8U(
int w, int h, const PtrStep<unsigned long long> image_sqsum,
unsigned long long templ_sqsum, PtrStepSzf result)
int w, int h, const PtrStep<double> image_sqsum,
double templ_sqsum, PtrStepSzf result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -838,8 +838,8 @@ namespace cv { namespace cuda { namespace device
}
}
void normalize_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum,
unsigned long long templ_sqsum, PtrStepSzf result, int cn, cudaStream_t stream)
void normalize_8U(int w, int h, const PtrStepSz<double> image_sqsum,
double templ_sqsum, PtrStepSzf result, int cn, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

View File

@ -43,10 +43,12 @@
#ifndef __cvt_color_internal_h__
#define __cvt_color_internal_h__
#include "opencv2/core/cuda.hpp"
namespace cv { namespace cuda { namespace device
{
#define OPENCV_CUDA_DECLARE_CVTCOLOR_ONE(name) \
void name(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
void name(const GpuMat& _src, GpuMat& _dst, Stream& stream);
#define OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(name) \
OPENCV_CUDA_DECLARE_CVTCOLOR_ONE(name ## _8u) \
@ -60,210 +62,209 @@ namespace cv { namespace cuda { namespace device
#define OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(name) \
OPENCV_CUDA_DECLARE_CVTCOLOR_ONE(name ## _8u) \
OPENCV_CUDA_DECLARE_CVTCOLOR_ONE(name ## _32f) \
OPENCV_CUDA_DECLARE_CVTCOLOR_ONE(name ## _full_8u) \
OPENCV_CUDA_DECLARE_CVTCOLOR_ONE(name ## _full_32f)
OPENCV_CUDA_DECLARE_CVTCOLOR_ONE(name ## _FULL_8u) \
OPENCV_CUDA_DECLARE_CVTCOLOR_ONE(name ## _FULL_32f)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(bgr_to_rgb)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(bgr_to_bgra)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(bgr_to_rgba)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(bgra_to_bgr)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(bgra_to_rgb)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(bgra_to_rgba)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(BGR_to_RGB)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(BGR_to_BGRA)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(BGR_to_RGBA)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(BGRA_to_BGR)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(BGRA_to_RGB)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(BGRA_to_RGBA)
OPENCV_CUDA_DECLARE_CVTCOLOR_ONE(bgr_to_bgr555)
OPENCV_CUDA_DECLARE_CVTCOLOR_ONE(bgr_to_bgr565)
OPENCV_CUDA_DECLARE_CVTCOLOR_ONE(rgb_to_bgr555)
OPENCV_CUDA_DECLARE_CVTCOLOR_ONE(rgb_to_bgr565)
OPENCV_CUDA_DECLARE_CVTCOLOR_ONE(bgra_to_bgr555)
OPENCV_CUDA_DECLARE_CVTCOLOR_ONE(bgra_to_bgr565)
OPENCV_CUDA_DECLARE_CVTCOLOR_ONE(rgba_to_bgr555)
OPENCV_CUDA_DECLARE_CVTCOLOR_ONE(rgba_to_bgr565)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(RGB_to_GRAY)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(BGR_to_GRAY)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(RGBA_to_GRAY)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(BGRA_to_GRAY)
OPENCV_CUDA_DECLARE_CVTCOLOR_ONE(bgr555_to_rgb)
OPENCV_CUDA_DECLARE_CVTCOLOR_ONE(bgr565_to_rgb)
OPENCV_CUDA_DECLARE_CVTCOLOR_ONE(bgr555_to_bgr)
OPENCV_CUDA_DECLARE_CVTCOLOR_ONE(bgr565_to_bgr)
OPENCV_CUDA_DECLARE_CVTCOLOR_ONE(bgr555_to_rgba)
OPENCV_CUDA_DECLARE_CVTCOLOR_ONE(bgr565_to_rgba)
OPENCV_CUDA_DECLARE_CVTCOLOR_ONE(bgr555_to_bgra)
OPENCV_CUDA_DECLARE_CVTCOLOR_ONE(bgr565_to_bgra)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(GRAY_to_BGR)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(GRAY_to_BGRA)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(gray_to_bgr)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(gray_to_bgra)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(RGB_to_YUV)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(RGBA_to_YUV)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(RGB_to_YUV4)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(RGBA_to_YUV4)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(BGR_to_YUV)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(BGRA_to_YUV)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(BGR_to_YUV4)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(BGRA_to_YUV4)
OPENCV_CUDA_DECLARE_CVTCOLOR_ONE(gray_to_bgr555)
OPENCV_CUDA_DECLARE_CVTCOLOR_ONE(gray_to_bgr565)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(YUV_to_RGB)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(YUV_to_RGBA)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(YUV4_to_RGB)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(YUV4_to_RGBA)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(YUV_to_BGR)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(YUV_to_BGRA)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(YUV4_to_BGR)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(YUV4_to_BGRA)
OPENCV_CUDA_DECLARE_CVTCOLOR_ONE(bgr555_to_gray)
OPENCV_CUDA_DECLARE_CVTCOLOR_ONE(bgr565_to_gray)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(RGB_to_YCrCb)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(RGBA_to_YCrCb)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(RGB_to_YCrCb4)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(RGBA_to_YCrCb4)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(BGR_to_YCrCb)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(BGRA_to_YCrCb)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(BGR_to_YCrCb4)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(BGRA_to_YCrCb4)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(rgb_to_gray)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(bgr_to_gray)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(rgba_to_gray)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(bgra_to_gray)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(YCrCb_to_RGB)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(YCrCb_to_RGBA)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(YCrCb4_to_RGB)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(YCrCb4_to_RGBA)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(YCrCb_to_BGR)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(YCrCb_to_BGRA)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(YCrCb4_to_BGR)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(YCrCb4_to_BGRA)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(rgb_to_yuv)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(rgba_to_yuv)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(rgb_to_yuv4)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(rgba_to_yuv4)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(bgr_to_yuv)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(bgra_to_yuv)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(bgr_to_yuv4)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(bgra_to_yuv4)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(RGB_to_XYZ)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(RGBA_to_XYZ)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(RGB_to_XYZ4)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(RGBA_to_XYZ4)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(BGR_to_XYZ)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(BGRA_to_XYZ)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(BGR_to_XYZ4)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(BGRA_to_XYZ4)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(yuv_to_rgb)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(yuv_to_rgba)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(yuv4_to_rgb)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(yuv4_to_rgba)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(yuv_to_bgr)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(yuv_to_bgra)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(yuv4_to_bgr)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(yuv4_to_bgra)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(XYZ_to_RGB)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(XYZ4_to_RGB)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(XYZ_to_RGBA)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(XYZ4_to_RGBA)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(XYZ_to_BGR)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(XYZ4_to_BGR)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(XYZ_to_BGRA)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(XYZ4_to_BGRA)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(rgb_to_YCrCb)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(rgba_to_YCrCb)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(rgb_to_YCrCb4)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(rgba_to_YCrCb4)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(bgr_to_YCrCb)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(bgra_to_YCrCb)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(bgr_to_YCrCb4)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(bgra_to_YCrCb4)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(RGB_to_HSV)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(RGBA_to_HSV)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(RGB_to_HSV4)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(RGBA_to_HSV4)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(BGR_to_HSV)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(BGRA_to_HSV)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(BGR_to_HSV4)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(BGRA_to_HSV4)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(YCrCb_to_rgb)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(YCrCb_to_rgba)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(YCrCb4_to_rgb)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(YCrCb4_to_rgba)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(YCrCb_to_bgr)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(YCrCb_to_bgra)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(YCrCb4_to_bgr)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(YCrCb4_to_bgra)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(HSV_to_RGB)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(HSV_to_RGBA)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(HSV4_to_RGB)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(HSV4_to_RGBA)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(HSV_to_BGR)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(HSV_to_BGRA)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(HSV4_to_BGR)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(HSV4_to_BGRA)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(rgb_to_xyz)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(rgba_to_xyz)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(rgb_to_xyz4)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(rgba_to_xyz4)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(bgr_to_xyz)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(bgra_to_xyz)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(bgr_to_xyz4)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(bgra_to_xyz4)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(RGB_to_HLS)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(RGBA_to_HLS)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(RGB_to_HLS4)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(RGBA_to_HLS4)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(BGR_to_HLS)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(BGRA_to_HLS)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(BGR_to_HLS4)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(BGRA_to_HLS4)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(xyz_to_rgb)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(xyz4_to_rgb)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(xyz_to_rgba)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(xyz4_to_rgba)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(xyz_to_bgr)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(xyz4_to_bgr)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(xyz_to_bgra)
OPENCV_CUDA_DECLARE_CVTCOLOR_ALL(xyz4_to_bgra)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(HLS_to_RGB)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(HLS_to_RGBA)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(HLS4_to_RGB)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(HLS4_to_RGBA)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(HLS_to_BGR)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(HLS_to_BGRA)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(HLS4_to_BGR)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(HLS4_to_BGRA)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(rgb_to_hsv)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(rgba_to_hsv)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(rgb_to_hsv4)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(rgba_to_hsv4)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(bgr_to_hsv)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(bgra_to_hsv)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(bgr_to_hsv4)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(bgra_to_hsv4)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(RGB_to_Lab)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(RGBA_to_Lab)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(RGB_to_Lab4)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(RGBA_to_Lab4)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(BGR_to_Lab)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(BGRA_to_Lab)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(BGR_to_Lab4)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(BGRA_to_Lab4)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(hsv_to_rgb)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(hsv_to_rgba)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(hsv4_to_rgb)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(hsv4_to_rgba)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(hsv_to_bgr)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(hsv_to_bgra)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(hsv4_to_bgr)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(hsv4_to_bgra)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(LRGB_to_Lab)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(LRGBA_to_Lab)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(LRGB_to_Lab4)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(LRGBA_to_Lab4)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(LBGR_to_Lab)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(LBGRA_to_Lab)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(LBGR_to_Lab4)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(LBGRA_to_Lab4)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(rgb_to_hls)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(rgba_to_hls)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(rgb_to_hls4)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(Lab_to_RGB)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(Lab4_to_RGB)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(Lab_to_RGBA)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(Lab4_to_RGBA)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(Lab_to_BGR)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(Lab4_to_BGR)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(Lab_to_BGRA)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(Lab4_to_BGRA)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(rgba_to_hls4)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(bgr_to_hls)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(bgra_to_hls)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(bgr_to_hls4)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(bgra_to_hls4)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(Lab_to_LRGB)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(Lab4_to_LRGB)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(Lab_to_LRGBA)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(Lab4_to_LRGBA)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(Lab_to_LBGR)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(Lab4_to_LBGR)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(Lab_to_LBGRA)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(Lab4_to_LBGRA)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(hls_to_rgb)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(hls_to_rgba)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(hls4_to_rgb)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(hls4_to_rgba)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(hls_to_bgr)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(hls_to_bgra)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(hls4_to_bgr)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F_FULL(hls4_to_bgra)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(RGB_to_Luv)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(RGBA_to_Luv)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(RGB_to_Luv4)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(RGBA_to_Luv4)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(BGR_to_Luv)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(BGRA_to_Luv)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(BGR_to_Luv4)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(BGRA_to_Luv4)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(rgb_to_lab)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(rgba_to_lab)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(rgb_to_lab4)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(rgba_to_lab4)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(bgr_to_lab)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(bgra_to_lab)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(bgr_to_lab4)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(bgra_to_lab4)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(LRGB_to_Luv)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(LRGBA_to_Luv)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(LRGB_to_Luv4)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(LRGBA_to_Luv4)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(LBGR_to_Luv)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(LBGRA_to_Luv)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(LBGR_to_Luv4)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(LBGRA_to_Luv4)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(lrgb_to_lab)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(lrgba_to_lab)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(lrgb_to_lab4)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(lrgba_to_lab4)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(lbgr_to_lab)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(lbgra_to_lab)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(lbgr_to_lab4)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(lbgra_to_lab4)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(Luv_to_RGB)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(Luv4_to_RGB)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(Luv_to_RGBA)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(Luv4_to_RGBA)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(Luv_to_BGR)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(Luv4_to_BGR)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(Luv_to_BGRA)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(Luv4_to_BGRA)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(lab_to_rgb)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(lab4_to_rgb)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(lab_to_rgba)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(lab4_to_rgba)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(lab_to_bgr)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(lab4_to_bgr)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(lab_to_bgra)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(lab4_to_bgra)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(Luv_to_LRGB)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(Luv4_to_LRGB)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(Luv_to_LRGBA)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(Luv4_to_LRGBA)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(Luv_to_LBGR)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(Luv4_to_LBGR)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(Luv_to_LBGRA)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(Luv4_to_LBGRA)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(lab_to_lrgb)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(lab4_to_lrgb)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(lab_to_lrgba)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(lab4_to_lrgba)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(lab_to_lbgr)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(lab4_to_lbgr)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(lab_to_lbgra)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(lab4_to_lbgra)
OPENCV_CUDA_DECLARE_CVTCOLOR_ONE(BGR_to_BGR555)
OPENCV_CUDA_DECLARE_CVTCOLOR_ONE(BGR_to_BGR565)
OPENCV_CUDA_DECLARE_CVTCOLOR_ONE(RGB_to_BGR555)
OPENCV_CUDA_DECLARE_CVTCOLOR_ONE(RGB_to_BGR565)
OPENCV_CUDA_DECLARE_CVTCOLOR_ONE(BGRA_to_BGR555)
OPENCV_CUDA_DECLARE_CVTCOLOR_ONE(BGRA_to_BGR565)
OPENCV_CUDA_DECLARE_CVTCOLOR_ONE(RGBA_to_BGR555)
OPENCV_CUDA_DECLARE_CVTCOLOR_ONE(RGBA_to_BGR565)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(rgb_to_luv)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(rgba_to_luv)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(rgb_to_luv4)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(rgba_to_luv4)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(bgr_to_luv)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(bgra_to_luv)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(bgr_to_luv4)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(bgra_to_luv4)
OPENCV_CUDA_DECLARE_CVTCOLOR_ONE(BGR555_to_RGB)
OPENCV_CUDA_DECLARE_CVTCOLOR_ONE(BGR565_to_RGB)
OPENCV_CUDA_DECLARE_CVTCOLOR_ONE(BGR555_to_BGR)
OPENCV_CUDA_DECLARE_CVTCOLOR_ONE(BGR565_to_BGR)
OPENCV_CUDA_DECLARE_CVTCOLOR_ONE(BGR555_to_RGBA)
OPENCV_CUDA_DECLARE_CVTCOLOR_ONE(BGR565_to_RGBA)
OPENCV_CUDA_DECLARE_CVTCOLOR_ONE(BGR555_to_BGRA)
OPENCV_CUDA_DECLARE_CVTCOLOR_ONE(BGR565_to_BGRA)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(lrgb_to_luv)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(lrgba_to_luv)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(lrgb_to_luv4)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(lrgba_to_luv4)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(lbgr_to_luv)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(lbgra_to_luv)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(lbgr_to_luv4)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(lbgra_to_luv4)
OPENCV_CUDA_DECLARE_CVTCOLOR_ONE(GRAY_to_BGR555)
OPENCV_CUDA_DECLARE_CVTCOLOR_ONE(GRAY_to_BGR565)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(luv_to_rgb)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(luv4_to_rgb)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(luv_to_rgba)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(luv4_to_rgba)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(luv_to_bgr)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(luv4_to_bgr)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(luv_to_bgra)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(luv4_to_bgra)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(luv_to_lrgb)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(luv4_to_lrgb)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(luv_to_lrgba)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(luv4_to_lrgba)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(luv_to_lbgr)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(luv4_to_lbgr)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(luv_to_lbgra)
OPENCV_CUDA_DECLARE_CVTCOLOR_8U32F(luv4_to_lbgra)
OPENCV_CUDA_DECLARE_CVTCOLOR_ONE(BGR555_to_GRAY)
OPENCV_CUDA_DECLARE_CVTCOLOR_ONE(BGR565_to_GRAY)
#undef OPENCV_CUDA_DECLARE_CVTCOLOR_ONE
#undef OPENCV_CUDA_DECLARE_CVTCOLOR_ALL

View File

@ -467,14 +467,14 @@ void cv::cuda::evenLevels(OutputArray _levels, int nLevels, int lowerLevel, int
_levels.create(1, nLevels, CV_32SC1);
Mat host_levels;
if (kind == _InputArray::GPU_MAT)
if (kind == _InputArray::CUDA_GPU_MAT)
host_levels.create(1, nLevels, CV_32SC1);
else
host_levels = _levels.getMat();
nppSafeCall( nppiEvenLevelsHost_32s(host_levels.ptr<Npp32s>(), nLevels, lowerLevel, upperLevel) );
if (kind == _InputArray::GPU_MAT)
if (kind == _InputArray::CUDA_GPU_MAT)
_levels.getGpuMatRef().upload(host_levels);
}

View File

@ -61,77 +61,77 @@ namespace cv { namespace cuda { namespace device
void matchTemplateNaive_SQDIFF_8U(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream);
void matchTemplateNaive_SQDIFF_32F(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream);
void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result,
void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<double> image_sqsum, double templ_sqsum, PtrStepSzf result,
int cn, cudaStream_t stream);
void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result,
void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<double> image_sqsum, double templ_sqsum, PtrStepSzf result,
int cn, cudaStream_t stream);
void matchTemplatePrepared_CCOFF_8U(int w, int h, const PtrStepSz<unsigned int> image_sum, unsigned int templ_sum, PtrStepSzf result, cudaStream_t stream);
void matchTemplatePrepared_CCOFF_8U(int w, int h, const PtrStepSz<int> image_sum, int templ_sum, PtrStepSzf result, cudaStream_t stream);
void matchTemplatePrepared_CCOFF_8UC2(
int w, int h,
const PtrStepSz<unsigned int> image_sum_r,
const PtrStepSz<unsigned int> image_sum_g,
unsigned int templ_sum_r,
unsigned int templ_sum_g,
const PtrStepSz<int> image_sum_r,
const PtrStepSz<int> image_sum_g,
int templ_sum_r,
int templ_sum_g,
PtrStepSzf result, cudaStream_t stream);
void matchTemplatePrepared_CCOFF_8UC3(
int w, int h,
const PtrStepSz<unsigned int> image_sum_r,
const PtrStepSz<unsigned int> image_sum_g,
const PtrStepSz<unsigned int> image_sum_b,
unsigned int templ_sum_r,
unsigned int templ_sum_g,
unsigned int templ_sum_b,
const PtrStepSz<int> image_sum_r,
const PtrStepSz<int> image_sum_g,
const PtrStepSz<int> image_sum_b,
int templ_sum_r,
int templ_sum_g,
int templ_sum_b,
PtrStepSzf result, cudaStream_t stream);
void matchTemplatePrepared_CCOFF_8UC4(
int w, int h,
const PtrStepSz<unsigned int> image_sum_r,
const PtrStepSz<unsigned int> image_sum_g,
const PtrStepSz<unsigned int> image_sum_b,
const PtrStepSz<unsigned int> image_sum_a,
unsigned int templ_sum_r,
unsigned int templ_sum_g,
unsigned int templ_sum_b,
unsigned int templ_sum_a,
const PtrStepSz<int> image_sum_r,
const PtrStepSz<int> image_sum_g,
const PtrStepSz<int> image_sum_b,
const PtrStepSz<int> image_sum_a,
int templ_sum_r,
int templ_sum_g,
int templ_sum_b,
int templ_sum_a,
PtrStepSzf result, cudaStream_t stream);
void matchTemplatePrepared_CCOFF_NORMED_8U(
int w, int h, const PtrStepSz<unsigned int> image_sum,
const PtrStepSz<unsigned long long> image_sqsum,
unsigned int templ_sum, unsigned long long templ_sqsum,
int w, int h, const PtrStepSz<int> image_sum,
const PtrStepSz<double> image_sqsum,
int templ_sum, double templ_sqsum,
PtrStepSzf result, cudaStream_t stream);
void matchTemplatePrepared_CCOFF_NORMED_8UC2(
int w, int h,
const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
const PtrStepSz<int> image_sum_r, const PtrStepSz<double> image_sqsum_r,
const PtrStepSz<int> image_sum_g, const PtrStepSz<double> image_sqsum_g,
int templ_sum_r, double templ_sqsum_r,
int templ_sum_g, double templ_sqsum_g,
PtrStepSzf result, cudaStream_t stream);
void matchTemplatePrepared_CCOFF_NORMED_8UC3(
int w, int h,
const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<unsigned long long> image_sqsum_b,
unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
unsigned int templ_sum_b, unsigned long long templ_sqsum_b,
const PtrStepSz<int> image_sum_r, const PtrStepSz<double> image_sqsum_r,
const PtrStepSz<int> image_sum_g, const PtrStepSz<double> image_sqsum_g,
const PtrStepSz<int> image_sum_b, const PtrStepSz<double> image_sqsum_b,
int templ_sum_r, double templ_sqsum_r,
int templ_sum_g, double templ_sqsum_g,
int templ_sum_b, double templ_sqsum_b,
PtrStepSzf result, cudaStream_t stream);
void matchTemplatePrepared_CCOFF_NORMED_8UC4(
int w, int h,
const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<unsigned long long> image_sqsum_b,
const PtrStepSz<unsigned int> image_sum_a, const PtrStepSz<unsigned long long> image_sqsum_a,
unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
unsigned int templ_sum_b, unsigned long long templ_sqsum_b,
unsigned int templ_sum_a, unsigned long long templ_sqsum_a,
const PtrStepSz<int> image_sum_r, const PtrStepSz<double> image_sqsum_r,
const PtrStepSz<int> image_sum_g, const PtrStepSz<double> image_sqsum_g,
const PtrStepSz<int> image_sum_b, const PtrStepSz<double> image_sqsum_b,
const PtrStepSz<int> image_sum_a, const PtrStepSz<double> image_sqsum_a,
int templ_sum_r, double templ_sqsum_r,
int templ_sum_g, double templ_sqsum_g,
int templ_sum_b, double templ_sqsum_b,
int templ_sum_a, double templ_sqsum_a,
PtrStepSzf result, cudaStream_t stream);
void normalize_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum,
unsigned long long templ_sqsum, PtrStepSzf result, int cn, cudaStream_t stream);
void normalize_8U(int w, int h, const PtrStepSz<double> image_sqsum,
double templ_sqsum, PtrStepSzf result, int cn, cudaStream_t stream);
void extractFirstChannel_32F(const PtrStepSzb image, PtrStepSzf result, int cn, cudaStream_t stream);
}
@ -290,7 +290,7 @@ namespace
cuda::sqrIntegral(image.reshape(1), image_sqsums_, intBuffer_, stream);
unsigned long long templ_sqsum = (unsigned long long) cuda::sqrSum(templ.reshape(1))[0];
double templ_sqsum = cuda::sqrSum(templ.reshape(1))[0];
normalize_8U(templ.cols, templ.rows, image_sqsums_, templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));
}
@ -361,7 +361,7 @@ namespace
cuda::sqrIntegral(image.reshape(1), image_sqsums_, intBuffer_, stream);
unsigned long long templ_sqsum = (unsigned long long) cuda::sqrSum(templ.reshape(1))[0];
double templ_sqsum = cuda::sqrSum(templ.reshape(1))[0];
match_CCORR_.match(image, templ, _result, stream);
GpuMat result = _result.getGpuMat();
@ -400,7 +400,7 @@ namespace
cuda::sqrIntegral(image.reshape(1), image_sqsums_, intBuffer_, stream);
unsigned long long templ_sqsum = (unsigned long long) cuda::sqrSum(templ.reshape(1))[0];
double templ_sqsum = cuda::sqrSum(templ.reshape(1))[0];
match_CCORR_.match(image, templ, _result, stream);
GpuMat result = _result.getGpuMat();
@ -446,7 +446,7 @@ namespace
image_sums_.resize(1);
cuda::integral(image, image_sums_[0], intBuffer_, stream);
unsigned int templ_sum = (unsigned int) cuda::sum(templ)[0];
int templ_sum = (int) cuda::sum(templ)[0];
matchTemplatePrepared_CCOFF_8U(templ.cols, templ.rows, image_sums_[0], templ_sum, result, StreamAccessor::getStream(stream));
}
@ -465,19 +465,19 @@ namespace
case 2:
matchTemplatePrepared_CCOFF_8UC2(
templ.cols, templ.rows, image_sums_[0], image_sums_[1],
(unsigned int) templ_sum[0], (unsigned int) templ_sum[1],
(int) templ_sum[0], (int) templ_sum[1],
result, StreamAccessor::getStream(stream));
break;
case 3:
matchTemplatePrepared_CCOFF_8UC3(
templ.cols, templ.rows, image_sums_[0], image_sums_[1], image_sums_[2],
(unsigned int) templ_sum[0], (unsigned int) templ_sum[1], (unsigned int) templ_sum[2],
(int) templ_sum[0], (int) templ_sum[1], (int) templ_sum[2],
result, StreamAccessor::getStream(stream));
break;
case 4:
matchTemplatePrepared_CCOFF_8UC4(
templ.cols, templ.rows, image_sums_[0], image_sums_[1], image_sums_[2], image_sums_[3],
(unsigned int) templ_sum[0], (unsigned int) templ_sum[1], (unsigned int) templ_sum[2], (unsigned int) templ_sum[3],
(int) templ_sum[0], (int) templ_sum[1], (int) templ_sum[2], (int) templ_sum[3],
result, StreamAccessor::getStream(stream));
break;
default:
@ -532,8 +532,8 @@ namespace
image_sqsums_.resize(1);
cuda::sqrIntegral(image, image_sqsums_[0], intBuffer_, stream);
unsigned int templ_sum = (unsigned int) cuda::sum(templ)[0];
unsigned long long templ_sqsum = (unsigned long long) cuda::sqrSum(templ)[0];
int templ_sum = (int) cuda::sum(templ)[0];
double templ_sqsum = cuda::sqrSum(templ)[0];
matchTemplatePrepared_CCOFF_NORMED_8U(
templ.cols, templ.rows, image_sums_[0], image_sqsums_[0],
@ -561,8 +561,8 @@ namespace
templ.cols, templ.rows,
image_sums_[0], image_sqsums_[0],
image_sums_[1], image_sqsums_[1],
(unsigned int)templ_sum[0], (unsigned long long)templ_sqsum[0],
(unsigned int)templ_sum[1], (unsigned long long)templ_sqsum[1],
(int)templ_sum[0], templ_sqsum[0],
(int)templ_sum[1], templ_sqsum[1],
result, StreamAccessor::getStream(stream));
break;
case 3:
@ -571,9 +571,9 @@ namespace
image_sums_[0], image_sqsums_[0],
image_sums_[1], image_sqsums_[1],
image_sums_[2], image_sqsums_[2],
(unsigned int)templ_sum[0], (unsigned long long)templ_sqsum[0],
(unsigned int)templ_sum[1], (unsigned long long)templ_sqsum[1],
(unsigned int)templ_sum[2], (unsigned long long)templ_sqsum[2],
(int)templ_sum[0], templ_sqsum[0],
(int)templ_sum[1], templ_sqsum[1],
(int)templ_sum[2], templ_sqsum[2],
result, StreamAccessor::getStream(stream));
break;
case 4:
@ -583,10 +583,10 @@ namespace
image_sums_[1], image_sqsums_[1],
image_sums_[2], image_sqsums_[2],
image_sums_[3], image_sqsums_[3],
(unsigned int)templ_sum[0], (unsigned long long)templ_sqsum[0],
(unsigned int)templ_sum[1], (unsigned long long)templ_sqsum[1],
(unsigned int)templ_sum[2], (unsigned long long)templ_sqsum[2],
(unsigned int)templ_sum[3], (unsigned long long)templ_sqsum[3],
(int)templ_sum[0], templ_sqsum[0],
(int)templ_sum[1], templ_sqsum[1],
(int)templ_sum[2], templ_sqsum[2],
(int)templ_sum[3], templ_sqsum[3],
result, StreamAccessor::getStream(stream));
break;
default:

View File

@ -90,7 +90,18 @@ CUDA_TEST_P(MatchTemplate8U, Accuracy)
cv::Mat dst_gold;
cv::matchTemplate(image, templ, dst_gold, method);
EXPECT_MAT_NEAR(dst_gold, dst, templ_size.area() * 1e-1);
cv::Mat h_dst(dst);
ASSERT_EQ(dst_gold.size(), h_dst.size());
ASSERT_EQ(dst_gold.type(), h_dst.type());
for (int y = 0; y < h_dst.rows; ++y)
{
for (int x = 0; x < h_dst.cols; ++x)
{
float gold_val = dst_gold.at<float>(y, x);
float actual_val = dst_gold.at<float>(y, x);
ASSERT_FLOAT_EQ(gold_val, actual_val) << y << ", " << x;
}
}
}
INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, MatchTemplate8U, testing::Combine(
@ -138,7 +149,18 @@ CUDA_TEST_P(MatchTemplate32F, Regression)
cv::Mat dst_gold;
cv::matchTemplate(image, templ, dst_gold, method);
EXPECT_MAT_NEAR(dst_gold, dst, templ_size.area() * 1e-1);
cv::Mat h_dst(dst);
ASSERT_EQ(dst_gold.size(), h_dst.size());
ASSERT_EQ(dst_gold.type(), h_dst.type());
for (int y = 0; y < h_dst.rows; ++y)
{
for (int x = 0; x < h_dst.cols; ++x)
{
float gold_val = dst_gold.at<float>(y, x);
float actual_val = dst_gold.at<float>(y, x);
ASSERT_FLOAT_EQ(gold_val, actual_val) << y << ", " << x;
}
}
}
INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, MatchTemplate32F, testing::Combine(

View File

@ -4,6 +4,6 @@ endif()
set(the_description "CUDA-accelerated Computer Vision (legacy)")
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4130 /wd4324 /wd4512 /wd4310 -Wundef -Wmissing-declarations -Wuninitialized)
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4130 /wd4324 /wd4512 /wd4310 -Wundef -Wmissing-declarations -Wuninitialized -Wshadow)
ocv_define_module(cudalegacy opencv_core OPTIONAL opencv_objdetect)

View File

@ -4,6 +4,6 @@ endif()
set(the_description "CUDA-accelerated Optical Flow")
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wmissing-declarations)
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wmissing-declarations -Wshadow)
ocv_define_module(cudaoptflow opencv_video opencv_cudaarithm opencv_cudawarping opencv_cudaimgproc OPTIONAL opencv_cudalegacy)

View File

@ -95,6 +95,16 @@ namespace cv { namespace cuda { namespace device { namespace optflow_farneback
}}}} // namespace cv { namespace cuda { namespace cudev { namespace optflow_farneback
namespace
{
GpuMat allocMatFromBuf(int rows, int cols, int type, GpuMat& mat)
{
if (!mat.empty() && mat.type() == type && mat.rows >= rows && mat.cols >= cols)
return mat(Rect(0, 0, cols, rows));
return mat = GpuMat(rows, cols, type);
}
}
void cv::cuda::FarnebackOpticalFlow::prepareGaussian(
int n, double sigma, float *g, float *xg, float *xxg,

View File

@ -4,6 +4,6 @@ endif()
set(the_description "CUDA-accelerated Stereo Correspondence")
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wmissing-declarations)
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wmissing-declarations -Wshadow)
ocv_define_module(cudastereo opencv_calib3d)

View File

@ -4,6 +4,6 @@ endif()
set(the_description "CUDA-accelerated Image Warping")
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wmissing-declarations)
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wmissing-declarations -Wshadow)
ocv_define_module(cudawarping opencv_imgproc OPTIONAL opencv_cudalegacy)

View File

@ -4,7 +4,7 @@ endif()
set(the_description "CUDA device layer")
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4189 /wd4505 -Wundef -Wmissing-declarations -Wunused-function -Wunused-variable -Wenum-compare)
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4189 /wd4505 -Wundef -Wmissing-declarations -Wunused-function -Wunused-variable -Wenum-compare -Wshadow)
ocv_add_module(cudev)

View File

@ -156,7 +156,7 @@ namespace color_cvt_detail
const int g = src.y;
const int r = bidx == 0 ? src.z : src.x;
const int a = src.w;
return (ushort) ((b >> 3) | ((g & ~7) << 2) | ((r & ~7) << 7) | (a * 0x8000));
return (ushort) ((b >> 3) | ((g & ~7) << 2) | ((r & ~7) << 7) | (a ? 0x8000 : 0));
}
};
@ -263,7 +263,8 @@ namespace color_cvt_detail
{
__device__ ushort operator ()(uchar src) const
{
return (ushort) (src | (src << 5) | (src << 10));
const int t = src >> 3;
return (ushort)(t | (t << 5) | (t << 10));
}
};
@ -272,7 +273,8 @@ namespace color_cvt_detail
{
__device__ ushort operator ()(uchar src) const
{
return (ushort) ((src >> 3) | ((src & ~3) << 3) | ((src & ~7) << 8));
const int t = src;
return (ushort)((t >> 3) | ((t & ~3) << 3) | ((t & ~7) << 8));
}
};
@ -797,8 +799,7 @@ namespace color_cvt_detail
if (diff > numeric_limits<float>::epsilon())
{
s = (l < 0.5f) * diff / (vmax + vmin);
s += (l >= 0.5f) * diff / (2.0f - vmax - vmin);
s = l < 0.5f ? diff / (vmax + vmin) : diff / (2 - vmax - vmin);
diff = 60.f / diff;
@ -1190,7 +1191,7 @@ namespace color_cvt_detail
dst.x = saturate_cast<uchar>(buf.x * 2.55f);
dst.y = saturate_cast<uchar>(buf.y * 0.72033898305084743f + 96.525423728813564f);
dst.z = saturate_cast<uchar>(buf.z * 0.99609375f + 139.453125f);
dst.z = saturate_cast<uchar>(buf.z * 0.9732824427480916f + 136.259541984732824f);
return dst;
}
@ -1228,6 +1229,10 @@ namespace color_cvt_detail
float G = -0.969256f * X + 1.875991f * Y + 0.041556f * Z;
float R = 3.240479f * X - 1.537150f * Y - 0.498535f * Z;
R = ::fminf(::fmaxf(R, 0.f), 1.f);
G = ::fminf(::fmaxf(G, 0.f), 1.f);
B = ::fminf(::fmaxf(B, 0.f), 1.f);
if (srgb)
{
B = splineInterpolate(B * GAMMA_TAB_SIZE, c_sRGBInvGammaTab, GAMMA_TAB_SIZE);
@ -1255,7 +1260,7 @@ namespace color_cvt_detail
buf.x = src.x * (100.f / 255.f);
buf.y = src.y * 1.388235294117647f - 134.f;
buf.z = src.z * 1.003921568627451f - 140.f;
buf.z = src.z * 1.027450980392157f - 140.f;
Luv2RGB<float, 3, 3, srgb, blueIdx> cvtf;
buf = cvtf(buf);

Some files were not shown because too many files have changed in this diff Show More