Merge pull request #3677 from ilya-lavrenov:cuda_backport

2015-02-24 15:06:15 +00:00
parent 2d2ecbf3ee e03136e951
commit 96d7bcc2a9
5 changed files with 165 additions and 271 deletions
--- a/cmake/FindCUDA.cmake
+++ b/cmake/FindCUDA.cmake
@@ -31,10 +31,8 @@
 # The following variables affect the behavior of the macros in the
 # script (in alphebetical order).  Note that any of these flags can be
 # changed multiple times in the same directory before calling
-# CUDA_ADD_EXECUTABLE, CUDA_ADD_LIBRARY, CUDA_COMPILE, CUDA_COMPILE_PTX
-# or CUDA_WRAP_SRCS.
-#
-# ::
+# CUDA_ADD_EXECUTABLE, CUDA_ADD_LIBRARY, CUDA_COMPILE, CUDA_COMPILE_PTX,
+# CUDA_COMPILE_FATBIN, CUDA_COMPILE_CUBIN or CUDA_WRAP_SRCS::
 #
 #   CUDA_64_BIT_DEVICE_CODE (Default matches host bit size)
 #   -- Set to ON to compile for 64 bit device code, OFF for 32 bit device code.
@@ -43,19 +41,11 @@
 #      nvcc in the generated source.  If you compile to PTX and then load the
 #      file yourself, you can mix bit sizes between device and host.
 #
-#
-#
-# ::
-#
 #   CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE (Default ON)
 #   -- Set to ON if you want the custom build rule to be attached to the source
 #      file in Visual Studio.  Turn OFF if you add the same cuda file to multiple
 #      targets.
 #
-#
-#
-# ::
-#
 #      This allows the user to build the target from the CUDA file; however, bad
 #      things can happen if the CUDA source file is added to multiple targets.
 #      When performing parallel builds it is possible for the custom build
@@ -68,44 +58,24 @@
 #      this script could detect the reuse of source files across multiple targets
 #      and turn the option off for the user, but no good solution could be found.
 #
-#
-#
-# ::
-#
 #   CUDA_BUILD_CUBIN (Default OFF)
 #   -- Set to ON to enable and extra compilation pass with the -cubin option in
 #      Device mode. The output is parsed and register, shared memory usage is
 #      printed during build.
 #
-#
-#
-# ::
-#
 #   CUDA_BUILD_EMULATION (Default OFF for device mode)
 #   -- Set to ON for Emulation mode. -D_DEVICEEMU is defined for CUDA C files
 #      when CUDA_BUILD_EMULATION is TRUE.
 #
-#
-#
-# ::
-#
 #   CUDA_GENERATED_OUTPUT_DIR (Default CMAKE_CURRENT_BINARY_DIR)
 #   -- Set to the path you wish to have the generated files placed.  If it is
 #      blank output files will be placed in CMAKE_CURRENT_BINARY_DIR.
 #      Intermediate files will always be placed in
 #      CMAKE_CURRENT_BINARY_DIR/CMakeFiles.
 #
-#
-#
-# ::
-#
 #   CUDA_HOST_COMPILATION_CPP (Default ON)
 #   -- Set to OFF for C compilation of host code.
 #
-#
-#
-# ::
-#
 #   CUDA_HOST_COMPILER (Default CMAKE_C_COMPILER, $(VCInstallDir)/bin for VS)
 #   -- Set the host compiler to be used by nvcc.  Ignored if -ccbin or
 #      --compiler-bindir is already present in the CUDA_NVCC_FLAGS or
@@ -113,19 +83,11 @@
 #      $(VCInstallDir)/bin is a special value that expands out to the path when
 #      the command is run from withing VS.
 #
-#
-#
-# ::
-#
 #   CUDA_NVCC_FLAGS
 #   CUDA_NVCC_FLAGS_<CONFIG>
 #   -- Additional NVCC command line arguments.  NOTE: multiple arguments must be
 #      semi-colon delimited (e.g. --compiler-options;-Wall)
 #
-#
-#
-# ::
-#
 #   CUDA_PROPAGATE_HOST_FLAGS (Default ON)
 #   -- Set to ON to propagate CMAKE_{C,CXX}_FLAGS and their configuration
 #      dependent counterparts (e.g. CMAKE_C_FLAGS_DEBUG) automatically to the
@@ -137,10 +99,6 @@
 #      CUDA_ADD_LIBRARY, CUDA_ADD_EXECUTABLE, or CUDA_WRAP_SRCS.  Flags used for
 #      shared library compilation are not affected by this flag.
 #
-#
-#
-# ::
-#
 #   CUDA_SEPARABLE_COMPILATION (Default OFF)
 #   -- If set this will enable separable compilation for all CUDA runtime object
 #      files.  If used outside of CUDA_ADD_EXECUTABLE and CUDA_ADD_LIBRARY
@@ -148,38 +106,22 @@
 #      CUDA_COMPUTE_SEPARABLE_COMPILATION_OBJECT_FILE_NAME and
 #      CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS should be called.
 #
-#
-#
-# ::
-#
 #   CUDA_VERBOSE_BUILD (Default OFF)
 #   -- Set to ON to see all the commands used when building the CUDA file.  When
 #      using a Makefile generator the value defaults to VERBOSE (run make
 #      VERBOSE=1 to see output), although setting CUDA_VERBOSE_BUILD to ON will
 #      always print the output.
 #
-#
-#
-# The script creates the following macros (in alphebetical order):
-#
-# ::
+# The script creates the following macros (in alphabetical order)::
 #
 #   CUDA_ADD_CUFFT_TO_TARGET( cuda_target )
 #   -- Adds the cufft library to the target (can be any target).  Handles whether
 #      you are in emulation mode or not.
 #
-#
-#
-# ::
-#
 #   CUDA_ADD_CUBLAS_TO_TARGET( cuda_target )
 #   -- Adds the cublas library to the target (can be any target).  Handles
 #      whether you are in emulation mode or not.
 #
-#
-#
-# ::
-#
 #   CUDA_ADD_EXECUTABLE( cuda_target file0 file1 ...
 #                        [WIN32] [MACOSX_BUNDLE] [EXCLUDE_FROM_ALL] [OPTIONS ...] )
 #   -- Creates an executable "cuda_target" which is made up of the files
@@ -193,42 +135,28 @@
 #      nvcc.  Such flags should be modified before calling CUDA_ADD_EXECUTABLE,
 #      CUDA_ADD_LIBRARY or CUDA_WRAP_SRCS.
 #
-#
-#
-# ::
-#
 #   CUDA_ADD_LIBRARY( cuda_target file0 file1 ...
 #                     [STATIC | SHARED | MODULE] [EXCLUDE_FROM_ALL] [OPTIONS ...] )
 #   -- Same as CUDA_ADD_EXECUTABLE except that a library is created.
 #
-#
-#
-# ::
-#
 #   CUDA_BUILD_CLEAN_TARGET()
 #   -- Creates a convience target that deletes all the dependency files
 #      generated.  You should make clean after running this target to ensure the
 #      dependency files get regenerated.
 #
-#
-#
-# ::
-#
 #   CUDA_COMPILE( generated_files file0 file1 ... [STATIC | SHARED | MODULE]
 #                 [OPTIONS ...] )
 #   -- Returns a list of generated files from the input source files to be used
 #      with ADD_LIBRARY or ADD_EXECUTABLE.
 #
-#
-#
-# ::
-#
 #   CUDA_COMPILE_PTX( generated_files file0 file1 ... [OPTIONS ...] )
 #   -- Returns a list of PTX files generated from the input source files.
 #
+#   CUDA_COMPILE_FATBIN( generated_files file0 file1 ... [OPTIONS ...] )
+#   -- Returns a list of FATBIN files generated from the input source files.
 #
-#
-# ::
+#   CUDA_COMPILE_CUBIN( generated_files file0 file1 ... [OPTIONS ...] )
+#   -- Returns a list of CUBIN files generated from the input source files.
 #
 #   CUDA_COMPUTE_SEPARABLE_COMPILATION_OBJECT_FILE_NAME( output_file_var
 #                                                        cuda_target
@@ -242,10 +170,6 @@
 #      automatically for CUDA_ADD_LIBRARY and CUDA_ADD_EXECUTABLE.  Note that
 #      this is a function and not a macro.
 #
-#
-#
-# ::
-#
 #   CUDA_INCLUDE_DIRECTORIES( path0 path1 ... )
 #   -- Sets the directories that should be passed to nvcc
 #      (e.g. nvcc -Ipath0 -Ipath1 ... ). These paths usually contain other .cu
@@ -253,17 +177,9 @@
 #
 #
 #
-#
-#
-# ::
-#
 #   CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS( output_file_var cuda_target
 #                                            nvcc_flags object_files)
 #
-#
-#
-# ::
-#
 #   -- Generates the link object required by separable compilation from the given
 #      object files.  This is called automatically for CUDA_ADD_EXECUTABLE and
 #      CUDA_ADD_LIBRARY, but can be called manually when using CUDA_WRAP_SRCS
@@ -273,91 +189,51 @@
 #      specified by CUDA_64_BIT_DEVICE_CODE.  Note that this is a function
 #      instead of a macro.
 #
-#
-#
-# ::
-#
 #   CUDA_WRAP_SRCS ( cuda_target format generated_files file0 file1 ...
 #                    [STATIC | SHARED | MODULE] [OPTIONS ...] )
 #   -- This is where all the magic happens.  CUDA_ADD_EXECUTABLE,
 #      CUDA_ADD_LIBRARY, CUDA_COMPILE, and CUDA_COMPILE_PTX all call this
 #      function under the hood.
 #
-#
-#
-# ::
-#
 #      Given the list of files (file0 file1 ... fileN) this macro generates
 #      custom commands that generate either PTX or linkable objects (use "PTX" or
 #      "OBJ" for the format argument to switch).  Files that don't end with .cu
 #      or have the HEADER_FILE_ONLY property are ignored.
 #
-#
-#
-# ::
-#
 #      The arguments passed in after OPTIONS are extra command line options to
 #      give to nvcc.  You can also specify per configuration options by
 #      specifying the name of the configuration followed by the options.  General
 #      options must preceed configuration specific options.  Not all
 #      configurations need to be specified, only the ones provided will be used.
 #
-#
-#
-# ::
-#
 #         OPTIONS -DFLAG=2 "-DFLAG_OTHER=space in flag"
 #         DEBUG -g
 #         RELEASE --use_fast_math
 #         RELWITHDEBINFO --use_fast_math;-g
 #         MINSIZEREL --use_fast_math
 #
-#
-#
-# ::
-#
 #      For certain configurations (namely VS generating object files with
 #      CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE set to ON), no generated file will
 #      be produced for the given cuda file.  This is because when you add the
 #      cuda file to Visual Studio it knows that this file produces an object file
 #      and will link in the resulting object file automatically.
 #
-#
-#
-# ::
-#
 #      This script will also generate a separate cmake script that is used at
 #      build time to invoke nvcc.  This is for several reasons.
 #
-#
-#
-# ::
-#
 #        1. nvcc can return negative numbers as return values which confuses
 #        Visual Studio into thinking that the command succeeded.  The script now
 #        checks the error codes and produces errors when there was a problem.
 #
-#
-#
-# ::
-#
 #        2. nvcc has been known to not delete incomplete results when it
 #        encounters problems.  This confuses build systems into thinking the
 #        target was generated when in fact an unusable file exists.  The script
 #        now deletes the output files if there was an error.
 #
-#
-#
-# ::
-#
 #        3. By putting all the options that affect the build into a file and then
 #        make the build rule dependent on the file, the output files will be
 #        regenerated when the options change.
 #
-#
-#
-# ::
-#
 #      This script also looks at optional arguments STATIC, SHARED, or MODULE to
 #      determine when to target the object compilation for a shared library.
 #      BUILD_SHARED_LIBS is ignored in CUDA_WRAP_SRCS, but it is respected in
@@ -366,27 +242,17 @@
 #      <target_name>_EXPORTS is defined when a shared library compilation is
 #      detected.
 #
-#
-#
-# ::
-#
 #      Flags passed into add_definitions with -D or /D are passed along to nvcc.
 #
 #
 #
-# The script defines the following variables:
-#
-# ::
+# The script defines the following variables::
 #
 #   CUDA_VERSION_MAJOR    -- The major version of cuda as reported by nvcc.
 #   CUDA_VERSION_MINOR    -- The minor version.
 #   CUDA_VERSION
 #   CUDA_VERSION_STRING   -- CUDA_VERSION_MAJOR.CUDA_VERSION_MINOR
 #
-#
-#
-# ::
-#
 #   CUDA_TOOLKIT_ROOT_DIR -- Path to the CUDA Toolkit (defined if not set).
 #   CUDA_SDK_ROOT_DIR     -- Path to the CUDA SDK.  Use this to find files in the
 #                            SDK.  This script will not directly support finding
@@ -412,13 +278,13 @@
 #                            Only available for CUDA version 3.2+.
 #   CUDA_cusparse_LIBRARY -- CUDA Sparse Matrix library.
 #                            Only available for CUDA version 3.2+.
-#   CUDA_npp_LIBRARY      -- NVIDIA Performance Primitives library.
+#   CUDA_npp_LIBRARY      -- NVIDIA Performance Primitives lib.
 #                            Only available for CUDA version 4.0+.
-#   CUDA_nppc_LIBRARY      -- NVIDIA Performance Primitives library (core).
+#   CUDA_nppc_LIBRARY     -- NVIDIA Performance Primitives lib (core).
 #                            Only available for CUDA version 5.5+.
-#   CUDA_nppi_LIBRARY      -- NVIDIA Performance Primitives library (image processing).
+#   CUDA_nppi_LIBRARY     -- NVIDIA Performance Primitives lib (image processing).
 #                            Only available for CUDA version 5.5+.
-#   CUDA_npps_LIBRARY      -- NVIDIA Performance Primitives library (signal processing).
+#   CUDA_npps_LIBRARY     -- NVIDIA Performance Primitives lib (signal processing).
 #                            Only available for CUDA version 5.5+.
 #   CUDA_nvcuvenc_LIBRARY -- CUDA Video Encoder library.
 #                            Only available for CUDA version 3.2+.
@@ -427,32 +293,15 @@
 #                            Only available for CUDA version 3.2+.
 #                            Windows only.
 #
-#
-#
-#
-#
-# ::
-#
+
 #   James Bigler, NVIDIA Corp (nvidia.com - jbigler)
 #   Abe Stephens, SCI Institute -- http://www.sci.utah.edu/~abe/FindCuda.html
 #
-#
-#
-# ::
-#
 #   Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
 #
-#
-#
-# ::
-#
 #   Copyright (c) 2007-2009
 #   Scientific Computing and Imaging Institute, University of Utah
 #
-#
-#
-# ::
-#
 #   This code is licensed under the MIT License.  See the FindCUDA.cmake script
 #   for the text of the license.

@@ -481,11 +330,6 @@

 # FindCUDA.cmake

-# We need to have at least this version to support the VERSION_LESS argument to 'if' (2.6.2) and unset (2.6.3)
-cmake_policy(PUSH)
-cmake_minimum_required(VERSION 2.6.3)
-cmake_policy(POP)
-
 # This macro helps us find the location of helper files we will need the full path to
 macro(CUDA_FIND_HELPER_FILE _name _extension)
  set(_full_name "${_name}.${_extension}")
@@ -608,7 +452,17 @@ set(CUDA_NVCC_FLAGS "" CACHE STRING "Semi-colon delimit multiple arguments.")
 if(CMAKE_GENERATOR MATCHES "Visual Studio")
  set(CUDA_HOST_COMPILER "$(VCInstallDir)bin" CACHE FILEPATH "Host side compiler used by NVCC")
 else()
-  set(CUDA_HOST_COMPILER "${CMAKE_C_COMPILER}" CACHE FILEPATH "Host side compiler used by NVCC")
+  # Using cc when it is a symlink to clang can make NVCC think it is GCC and
+  # pass the unhandled -dumpspecs option to clang, so resolve the real path.
+  # Also, if neither CMAKE_C_COMPILER is defined (the project does not use the
+  # C language) nor CUDA_HOST_COMPILER is specified manually, we should skip
+  # -ccbin and let nvcc use its own default C compiler.
+  if(DEFINED CMAKE_C_COMPILER AND NOT DEFINED CUDA_HOST_COMPILER)
+    get_filename_component(c_compiler_realpath "${CMAKE_C_COMPILER}" REALPATH)
+  else()
+    set(c_compiler_realpath "")
+  endif()
+  set(CUDA_HOST_COMPILER "${c_compiler_realpath}" CACHE FILEPATH "Host side compiler used by NVCC")
 endif()

 # Propagate the host flags to the host compiler via -Xcompiler
@@ -675,14 +529,16 @@ endmacro()

 # Check to see if the CUDA_TOOLKIT_ROOT_DIR and CUDA_SDK_ROOT_DIR have changed,
 # if they have then clear the cache variables, so that will be detected again.
-if(NOT "${CUDA_TOOLKIT_ROOT_DIR}" STREQUAL "${CUDA_TOOLKIT_ROOT_DIR_INTERNAL}")
+if(DEFINED CUDA_TOOLKIT_ROOT_DIR_INTERNAL AND (NOT "${CUDA_TOOLKIT_ROOT_DIR}" STREQUAL "${CUDA_TOOLKIT_ROOT_DIR_INTERNAL}"))
+  unset(CUDA_TARGET_TRIPLET CACHE)
  unset(CUDA_TOOLKIT_TARGET_DIR CACHE)
  unset(CUDA_NVCC_EXECUTABLE CACHE)
  unset(CUDA_VERSION CACHE)
  cuda_unset_include_and_libraries()
 endif()

-if(NOT "${CUDA_TOOLKIT_TARGET_DIR}" STREQUAL "${CUDA_TOOLKIT_TARGET_DIR_INTERNAL}")
+if(DEFINED CUDA_TARGET_TRIPLET_INTERNAL AND (NOT "${CUDA_TARGET_TRIPLET}" STREQUAL "${CUDA_TARGET_TRIPLET_INTERNAL}") OR
+   (DEFINED CUDA_TOOLKIT_TARGET_DIR  AND DEFINED CUDA_TOOLKIT_TARGET_DIR_INTERNAL AND NOT "${CUDA_TOOLKIT_TARGET_DIR}" STREQUAL "${CUDA_TOOLKIT_TARGET_DIR_INTERNAL}"))
  cuda_unset_include_and_libraries()
 endif()

@@ -758,27 +614,46 @@ endif()
 # Always set this convenience variable
 set(CUDA_VERSION_STRING "${CUDA_VERSION}")

-# Support for arm cross compilation with CUDA 5.5
-set(__cuda_toolkit_target_dir_initial "${CUDA_TOOLKIT_ROOT_DIR}")
-if(CUDA_VERSION VERSION_GREATER "5.0" AND CMAKE_CROSSCOMPILING AND ${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm")
-  if(ANDROID AND EXISTS "${CUDA_TOOLKIT_ROOT_DIR}/targets/armv7-linux-androideabi")
-    set(__cuda_toolkit_target_dir_initial "${CUDA_TOOLKIT_ROOT_DIR}/targets/armv7-linux-androideabi")
-  elseif(EXISTS "${CUDA_TOOLKIT_ROOT_DIR}/targets/armv7-linux-gnueabihf")
-    set(__cuda_toolkit_target_dir_initial "${CUDA_TOOLKIT_ROOT_DIR}/targets/armv7-linux-gnueabihf")
-  endif()
-endif()
-set(CUDA_TOOLKIT_TARGET_DIR "${__cuda_toolkit_target_dir_initial}" CACHE PATH "Toolkit target location.")
-mark_as_advanced(CUDA_TOOLKIT_TARGET_DIR)
-
 # Target CPU architecture
-if(CUDA_VERSION VERSION_GREATER "5.0" AND CMAKE_CROSSCOMPILING AND ${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm")
+if(DEFINED CUDA_TARGET_CPU_ARCH)
+  set(_cuda_target_cpu_arch_initial "${CUDA_TARGET_CPU_ARCH}")
+elseif(CUDA_VERSION VERSION_GREATER "5.0" AND CMAKE_CROSSCOMPILING AND CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm|ARM)")
  set(_cuda_target_cpu_arch_initial "ARM")
 else()
  set(_cuda_target_cpu_arch_initial "")
 endif()
-set(CUDA_TARGET_CPU_ARCH ${_cuda_target_cpu_arch_initial} CACHE STRING "Specify the name of the class of CPU architecture for which the input files must be compiled.")
+set(CUDA_TARGET_CPU_ARCH "${_cuda_target_cpu_arch_initial}" CACHE STRING "Specify the name of the class of CPU architecture for which the input files must be compiled.")
 mark_as_advanced(CUDA_TARGET_CPU_ARCH)

+# Target OS variant
+if(DEFINED CUDA_TARGET_OS_VARIANT)
+  set(_cuda_target_os_variant_initial "${CUDA_TARGET_OS_VARIANT}")
+else()
+  set(_cuda_target_os_variant_initial "")
+endif()
+set(CUDA_TARGET_OS_VARIANT "${_cuda_target_os_variant_initial}" CACHE STRING "Specify the name of the class of OS for which the input files must be compiled.")
+mark_as_advanced(CUDA_TARGET_OS_VARIANT)
+
+# Target triplet
+if(DEFINED CUDA_TARGET_TRIPLET)
+  set(_cuda_target_triplet_initial "${CUDA_TARGET_TRIPLET}")
+elseif(CUDA_VERSION VERSION_GREATER "5.0" AND CMAKE_CROSSCOMPILING AND "${CUDA_TARGET_CPU_ARCH}" STREQUAL "ARM")
+  if("${CUDA_TARGET_OS_VARIANT}" STREQUAL "Android" AND EXISTS "${CUDA_TOOLKIT_ROOT_DIR}/targets/armv7-linux-androideabi")
+    set(_cuda_target_triplet_initial "armv7-linux-androideabi")
+  elseif(EXISTS "${CUDA_TOOLKIT_ROOT_DIR}/targets/armv7-linux-gnueabihf")
+    set(_cuda_target_triplet_initial "armv7-linux-gnueabihf")
+  endif()
+endif()
+set(CUDA_TARGET_TRIPLET "${_cuda_target_triplet_initial}" CACHE STRING "Specify the target triplet for which the input files must be compiled.")
+file(GLOB __cuda_available_target_tiplets RELATIVE "${CUDA_TOOLKIT_ROOT_DIR}/targets" "${CUDA_TOOLKIT_ROOT_DIR}/targets/*" )
+set_property(CACHE CUDA_TARGET_TRIPLET PROPERTY STRINGS ${__cuda_available_target_tiplets})
+mark_as_advanced(CUDA_TARGET_TRIPLET)
+
+# Target directory
+if(NOT DEFINED CUDA_TOOLKIT_TARGET_DIR AND CUDA_TARGET_TRIPLET AND EXISTS "${CUDA_TOOLKIT_ROOT_DIR}/targets/${CUDA_TARGET_TRIPLET}")
+  set(CUDA_TOOLKIT_TARGET_DIR "${CUDA_TOOLKIT_ROOT_DIR}/targets/${CUDA_TARGET_TRIPLET}")
+endif()
+
 # CUDA_TOOLKIT_INCLUDE
 find_path(CUDA_TOOLKIT_INCLUDE
  device_functions.h # Header included in toolkit
@@ -802,10 +677,16 @@ macro(cuda_find_library_local_first_with_path_ext _var _names _doc _path_ext )
    # and old paths.
    set(_cuda_64bit_lib_dir "${_path_ext}lib/x64" "${_path_ext}lib64" "${_path_ext}libx64" )
  endif()
+  if(CUDA_VERSION VERSION_GREATER "6.0")
+    set(_cuda_static_lib_names "")
+    foreach(name ${_names})
+      list(APPEND _cuda_static_lib_names "${name}_static")
+    endforeach()
+  endif()
  # CUDA 3.2+ on Windows moved the library directories, so we need to new
  # (lib/Win32) and the old path (lib).
  find_library(${_var}
-    NAMES ${_names}
+    NAMES ${_names} ${_cuda_static_lib_names}
    PATHS "${CUDA_TOOLKIT_TARGET_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}"
    ENV CUDA_PATH
    ENV CUDA_LIB_PATH
@@ -815,7 +696,7 @@ macro(cuda_find_library_local_first_with_path_ext _var _names _doc _path_ext )
    )
  # Search default search paths, after we search our own set of paths.
  find_library(${_var}
-    NAMES ${_names}
+    NAMES ${_names} ${_cuda_static_lib_names}
    PATHS "/usr/lib/nvidia-current"
    DOC ${_doc}
    )
@@ -853,18 +734,6 @@ if(CUDA_BUILD_EMULATION AND CUDA_CUDARTEMU_LIBRARY)
 else()
  set(CUDA_LIBRARIES ${CUDA_CUDART_LIBRARY})
 endif()
-if(APPLE)
-  # We need to add the path to cudart to the linker using rpath, since the
-  # library name for the cuda libraries is prepended with @rpath.
-  if(CUDA_BUILD_EMULATION AND CUDA_CUDARTEMU_LIBRARY)
-    get_filename_component(_cuda_path_to_cudart "${CUDA_CUDARTEMU_LIBRARY}" PATH)
-  else()
-    get_filename_component(_cuda_path_to_cudart "${CUDA_CUDART_LIBRARY}" PATH)
-  endif()
-  if(_cuda_path_to_cudart)
-    list(APPEND CUDA_LIBRARIES -Wl,-rpath "-Wl,${_cuda_path_to_cudart}")
-  endif()
-endif()

 # 1.1 toolkit on linux doesn't appear to have a separate library on
 # some platforms.
@@ -997,6 +866,8 @@ set(CUDA_FOUND TRUE)

 set(CUDA_TOOLKIT_ROOT_DIR_INTERNAL "${CUDA_TOOLKIT_ROOT_DIR}" CACHE INTERNAL
  "This is the value of the last time CUDA_TOOLKIT_ROOT_DIR was set successfully." FORCE)
+set(CUDA_TARGET_TRIPLET_INTERNAL "${CUDA_TARGET_TRIPLET}" CACHE INTERNAL
+  "This is the value of the last time CUDA_TARGET_TRIPLET was set successfully." FORCE)
 set(CUDA_TOOLKIT_TARGET_DIR_INTERNAL "${CUDA_TOOLKIT_TARGET_DIR}" CACHE INTERNAL
  "This is the value of the last time CUDA_TOOLKIT_TARGET_DIR was set successfully." FORCE)
 set(CUDA_SDK_ROOT_DIR_INTERNAL "${CUDA_SDK_ROOT_DIR}" CACHE INTERNAL
@@ -1044,15 +915,15 @@ macro(CUDA_GET_SOURCES_AND_OPTIONS _sources _cmake_options _options)
  set( ${_options} )
  set( _found_options FALSE )
  foreach(arg ${ARGN})
-    if(arg STREQUAL "OPTIONS")
+    if("x${arg}" STREQUAL "xOPTIONS")
      set( _found_options TRUE )
    elseif(
-        arg STREQUAL "WIN32" OR
-        arg STREQUAL "MACOSX_BUNDLE" OR
-        arg STREQUAL "EXCLUDE_FROM_ALL" OR
-        arg STREQUAL "STATIC" OR
-        arg STREQUAL "SHARED" OR
-        arg STREQUAL "MODULE"
+        "x${arg}" STREQUAL "xWIN32" OR
+        "x${arg}" STREQUAL "xMACOSX_BUNDLE" OR
+        "x${arg}" STREQUAL "xEXCLUDE_FROM_ALL" OR
+        "x${arg}" STREQUAL "xSTATIC" OR
+        "x${arg}" STREQUAL "xSHARED" OR
+        "x${arg}" STREQUAL "xMODULE"
        )
      list(APPEND ${_cmake_options} ${arg})
    else()
@@ -1148,7 +1019,7 @@ function(CUDA_COMPUTE_BUILD_PATH path build_path)
    endif()
  endif()

-  # This recipie is from cmLocalGenerator::CreateSafeUniqueObjectFileName in the
+  # This recipe is from cmLocalGenerator::CreateSafeUniqueObjectFileName in the
  # CMake source.

  # Remove leading /
@@ -1177,7 +1048,7 @@ endfunction()
 # a .cpp or .ptx file.
 # INPUT:
 #   cuda_target         - Target name
-#   format              - PTX or OBJ
+#   format              - PTX, CUBIN, FATBIN or OBJ
 #   FILE1 .. FILEN      - The remaining arguments are the sources to be wrapped.
 #   OPTIONS             - Extra options to NVCC
 # OUTPUT:
@@ -1227,6 +1098,10 @@ macro(CUDA_WRAP_SRCS cuda_target format generated_files)
    set(nvcc_flags ${nvcc_flags} "--target-cpu-architecture=${CUDA_TARGET_CPU_ARCH}")
  endif()

+  if(CUDA_TARGET_OS_VARIANT AND CUDA_VERSION VERSION_LESS "7.0")
+    set(nvcc_flags ${nvcc_flags} "-target-os-variant=${CUDA_TARGET_OS_VARIANT}")
+  endif()
+
  # This needs to be passed in at this stage, because VS needs to fill out the
  # value of VCInstallDir from within VS.  Note that CCBIN is only used if
  # -ccbin or --compiler-bindir isn't used and CUDA_HOST_COMPILER matches
@@ -1355,7 +1230,7 @@ macro(CUDA_WRAP_SRCS cuda_target format generated_files)
  foreach(file ${ARGN})
    # Ignore any file marked as a HEADER_FILE_ONLY
    get_source_file_property(_is_header ${file} HEADER_FILE_ONLY)
-    if(${file} MATCHES ".*\\.cu$" AND NOT _is_header)
+    if(${file} MATCHES "\\.cu$" AND NOT _is_header)

      # Allow per source file overrides of the format.
      get_source_file_property(_cuda_source_format ${file} CUDA_SOURCE_PROPERTY_FORMAT)
@@ -1363,16 +1238,22 @@ macro(CUDA_WRAP_SRCS cuda_target format generated_files)
        set(_cuda_source_format ${format})
      endif()

-      if( ${_cuda_source_format} MATCHES "PTX" )
-        set( compile_to_ptx ON )
-      elseif( ${_cuda_source_format} MATCHES "OBJ")
-        set( compile_to_ptx OFF )
+      if( ${_cuda_source_format} MATCHES "OBJ")
+        set( cuda_compile_to_external_module OFF )
      else()
-        message( FATAL_ERROR "Invalid format flag passed to CUDA_WRAP_SRCS for file '${file}': '${_cuda_source_format}'.  Use OBJ or PTX.")
+        set( cuda_compile_to_external_module ON )
+        if( ${_cuda_source_format} MATCHES "PTX" )
+          set( cuda_compile_to_external_module_type "ptx" )
+        elseif( ${_cuda_source_format} MATCHES "CUBIN")
+          set( cuda_compile_to_external_module_type "cubin" )
+        elseif( ${_cuda_source_format} MATCHES "FATBIN")
+          set( cuda_compile_to_external_module_type "fatbin" )
+        else()
+          message( FATAL_ERROR "Invalid format flag passed to CUDA_WRAP_SRCS for file '${file}': '${_cuda_source_format}'.  Use OBJ, PTX, CUBIN or FATBIN.")
+        endif()
      endif()

-
-      if(compile_to_ptx)
+      if(cuda_compile_to_external_module)
        # Don't use any of the host compilation flags for PTX targets.
        set(CUDA_HOST_FLAGS)
        set(CUDA_NVCC_FLAGS_CONFIG)
@@ -1387,7 +1268,7 @@ macro(CUDA_WRAP_SRCS cuda_target format generated_files)
      if(CUDA_GENERATED_OUTPUT_DIR)
        set(cuda_compile_output_dir "${CUDA_GENERATED_OUTPUT_DIR}")
      else()
-        if ( compile_to_ptx )
+        if ( cuda_compile_to_external_module )
          set(cuda_compile_output_dir "${CMAKE_CURRENT_BINARY_DIR}")
        else()
          set(cuda_compile_output_dir "${cuda_compile_intermediate_directory}")
@@ -1397,10 +1278,10 @@ macro(CUDA_WRAP_SRCS cuda_target format generated_files)
      # Add a custom target to generate a c or ptx file. ######################

      get_filename_component( basename ${file} NAME )
-      if( compile_to_ptx )
+      if( cuda_compile_to_external_module )
        set(generated_file_path "${cuda_compile_output_dir}")
-        set(generated_file_basename "${cuda_target}_generated_${basename}.ptx")
-        set(format_flag "-ptx")
+        set(generated_file_basename "${cuda_target}_generated_${basename}.${cuda_compile_to_external_module_type}")
+        set(format_flag "-${cuda_compile_to_external_module_type}")
        file(MAKE_DIRECTORY "${cuda_compile_output_dir}")
      else()
        set(generated_file_path "${cuda_compile_output_dir}/${CMAKE_CFG_INTDIR}")
@@ -1423,7 +1304,7 @@ macro(CUDA_WRAP_SRCS cuda_target format generated_files)
      set(custom_target_script "${cuda_compile_intermediate_directory}/${generated_file_basename}.cmake")

      # Setup properties for obj files:
-      if( NOT compile_to_ptx )
+      if( NOT cuda_compile_to_external_module )
        set_source_files_properties("${generated_file}"
          PROPERTIES
          EXTERNAL_OBJECT true # This is an object file not to be compiled, but only be linked.
@@ -1438,7 +1319,7 @@ macro(CUDA_WRAP_SRCS cuda_target format generated_files)
        set(source_file "${CMAKE_CURRENT_SOURCE_DIR}/${file}")
      endif()

-      if( NOT compile_to_ptx AND CUDA_SEPARABLE_COMPILATION)
+      if( NOT cuda_compile_to_external_module AND CUDA_SEPARABLE_COMPILATION)
        list(APPEND ${cuda_target}_SEPARABLE_COMPILATION_OBJECTS "${generated_file}")
      endif()

@@ -1455,7 +1336,7 @@ macro(CUDA_WRAP_SRCS cuda_target format generated_files)
      # Build the NVCC made dependency file ###################################
      set(build_cubin OFF)
      if ( NOT CUDA_BUILD_EMULATION AND CUDA_BUILD_CUBIN )
-         if ( NOT compile_to_ptx )
+         if ( NOT cuda_compile_to_external_module )
           set ( build_cubin ON )
         endif()
      endif()
@@ -1482,8 +1363,8 @@ macro(CUDA_WRAP_SRCS cuda_target format generated_files)

      # Create up the comment string
      file(RELATIVE_PATH generated_file_relative_path "${CMAKE_BINARY_DIR}" "${generated_file}")
-      if(compile_to_ptx)
-        set(cuda_build_comment_string "Building NVCC ptx file ${generated_file_relative_path}")
+      if(cuda_compile_to_external_module)
+        set(cuda_build_comment_string "Building NVCC ${cuda_compile_to_external_module_type} file ${generated_file_relative_path}")
      else()
        set(cuda_build_comment_string "Building NVCC (${cuda_build_type}) object ${generated_file_relative_path}")
      endif()
@@ -1576,18 +1457,27 @@ function(CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS output_file cuda_target options
    # If -ccbin, --compiler-bindir has been specified, don't do anything.  Otherwise add it here.
    list( FIND nvcc_flags "-ccbin" ccbin_found0 )
    list( FIND nvcc_flags "--compiler-bindir" ccbin_found1 )
-    if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
+    if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 AND CUDA_HOST_COMPILER )
      list(APPEND nvcc_flags -ccbin "\"${CUDA_HOST_COMPILER}\"")
    endif()
+    # Create a list of flags specified by CUDA_NVCC_FLAGS_${CONFIG}
+    set(config_specific_flags)
    set(flags)
    foreach(config ${CUDA_configuration_types})
      string(TOUPPER ${config} config_upper)
+      # Add config specific flags
+      foreach(f ${CUDA_NVCC_FLAGS_${config_upper}})
+        list(APPEND config_specific_flags $<$<CONFIG:${config}>:${f}>)
+      endforeach()
      set(important_host_flags)
      _cuda_get_important_host_flags(important_host_flags ${CMAKE_${CUDA_C_OR_CXX}_FLAGS_${config_upper}})
      foreach(f ${important_host_flags})
        list(APPEND flags $<$<CONFIG:${config}>:-Xcompiler> $<$<CONFIG:${config}>:${f}>)
      endforeach()
    endforeach()
+    # Add our general CUDA_NVCC_FLAGS with the configuration specific flags
+    set(nvcc_flags ${CUDA_NVCC_FLAGS} ${config_specific_flags} ${nvcc_flags})
+
    file(RELATIVE_PATH output_file_relative_path "${CMAKE_BINARY_DIR}" "${output_file}")

    # Some generators don't handle the multiple levels of custom command
@@ -1713,21 +1603,29 @@ endmacro()

 ###############################################################################
 ###############################################################################
-# CUDA COMPILE
+# (Internal) helper for manually added cuda source files with specific targets
 ###############################################################################
 ###############################################################################
-macro(CUDA_COMPILE generated_files)
+macro(cuda_compile_base cuda_target format generated_files)

  # Separate the sources from the options
  CUDA_GET_SOURCES_AND_OPTIONS(_sources _cmake_options _options ${ARGN})
  # Create custom commands and targets for each file.
-  CUDA_WRAP_SRCS( cuda_compile OBJ _generated_files ${_sources} ${_cmake_options}
+  CUDA_WRAP_SRCS( ${cuda_target} ${format} _generated_files ${_sources} ${_cmake_options}
    OPTIONS ${_options} )

  set( ${generated_files} ${_generated_files})

 endmacro()

+###############################################################################
+###############################################################################
+# CUDA COMPILE
+###############################################################################
+###############################################################################
+macro(CUDA_COMPILE generated_files)
+  cuda_compile_base(cuda_compile OBJ ${generated_files} ${ARGN})
+endmacro()

 ###############################################################################
 ###############################################################################
@@ -1735,17 +1633,28 @@ endmacro()
 ###############################################################################
 ###############################################################################
 macro(CUDA_COMPILE_PTX generated_files)
-
-  # Separate the sources from the options
-  CUDA_GET_SOURCES_AND_OPTIONS(_sources _cmake_options _options ${ARGN})
-  # Create custom commands and targets for each file.
-  CUDA_WRAP_SRCS( cuda_compile_ptx PTX _generated_files ${_sources} ${_cmake_options}
-    OPTIONS ${_options} )
-
-  set( ${generated_files} ${_generated_files})
-
+  cuda_compile_base(cuda_compile_ptx PTX ${generated_files} ${ARGN}) # delegate to the shared helper with the PTX format
 endmacro()

+###############################################################################
+###############################################################################
+# CUDA COMPILE FATBIN: forwards the sources/options to cuda_compile_base with the FATBIN format; outputs are stored in <generated_files>
+###############################################################################
+###############################################################################
+macro(CUDA_COMPILE_FATBIN generated_files)
+  cuda_compile_base(cuda_compile_fatbin FATBIN ${generated_files} ${ARGN})
+endmacro()
+
+###############################################################################
+###############################################################################
+# CUDA COMPILE CUBIN: forwards the sources/options to cuda_compile_base with the CUBIN format; outputs are stored in <generated_files>
+###############################################################################
+###############################################################################
+macro(CUDA_COMPILE_CUBIN generated_files)
+  cuda_compile_base(cuda_compile_cubin CUBIN ${generated_files} ${ARGN})
+endmacro()
+
+
 ###############################################################################
 ###############################################################################
 # CUDA ADD CUFFT TO TARGET
--- a/cmake/FindCUDA/make2cmake.cmake
+++ b/cmake/FindCUDA/make2cmake.cmake
@@ -37,12 +37,11 @@

 file(READ ${input_file} depend_text)

-if (${depend_text} MATCHES ".+")
+if (NOT "${depend_text}" STREQUAL "")

  # message("FOUND DEPENDS")

-  # Remember, four backslashes is escaped to one backslash in the string.
-  string(REGEX REPLACE "\\\\ " " " depend_text ${depend_text})
+  string(REPLACE "\\ " " " depend_text ${depend_text})

  # This works for the nvcc -M generated dependency files.
  string(REGEX REPLACE "^.* : " "" depend_text ${depend_text})
--- a/cmake/FindCUDA/parse_cubin.cmake
+++ b/cmake/FindCUDA/parse_cubin.cmake
@@ -37,11 +37,10 @@

 file(READ ${input_file} file_text)

-if (${file_text} MATCHES ".+")
+if (NOT "${file_text}" STREQUAL "")

-  # Remember, four backslashes is escaped to one backslash in the string.
-  string(REGEX REPLACE ";" "\\\\;" file_text ${file_text})
-  string(REGEX REPLACE "\ncode" ";code" file_text ${file_text})
+  string(REPLACE ";" "\\;" file_text ${file_text})
+  string(REPLACE "\ncode" ";code" file_text ${file_text})

  list(LENGTH file_text len)

@@ -57,7 +56,7 @@ if (${file_text} MATCHES ".+")

        # Extract kernel names.
        if (${entry} MATCHES "[^g]name = ([^ ]+)")
-          string(REGEX REPLACE ".* = ([^ ]+)" "\\1" entry ${entry})
+          set(entry "${CMAKE_MATCH_1}")

          # Check to see if the kernel name starts with "_"
          set(skip FALSE)
@@ -76,19 +75,19 @@ if (${file_text} MATCHES ".+")

          # Registers
          if (${entry} MATCHES "reg([ ]+)=([ ]+)([^ ]+)")
-            string(REGEX REPLACE ".*([ ]+)=([ ]+)([^ ]+)" "\\3" entry ${entry})
+            set(entry "${CMAKE_MATCH_3}")
            message("Registers: ${entry}")
          endif()

          # Local memory
          if (${entry} MATCHES "lmem([ ]+)=([ ]+)([^ ]+)")
-            string(REGEX REPLACE ".*([ ]+)=([ ]+)([^ ]+)" "\\3" entry ${entry})
+            set(entry "${CMAKE_MATCH_3}")
            message("Local:     ${entry}")
          endif()

          # Shared memory
          if (${entry} MATCHES "smem([ ]+)=([ ]+)([^ ]+)")
-            string(REGEX REPLACE ".*([ ]+)=([ ]+)([^ ]+)" "\\3" entry ${entry})
+            set(entry "${CMAKE_MATCH_3}")
            message("Shared:    ${entry}")
          endif()

--- a/cmake/FindCUDA/run_nvcc.cmake
+++ b/cmake/FindCUDA/run_nvcc.cmake
@@ -62,7 +62,7 @@ set(cmake_dependency_file "@cmake_dependency_file@") # path
 set(CUDA_make2cmake "@CUDA_make2cmake@") # path
 set(CUDA_parse_cubin "@CUDA_parse_cubin@") # path
 set(build_cubin @build_cubin@) # bool
-set(CUDA_HOST_COMPILER "@CUDA_HOST_COMPILER@") # bool
+set(CUDA_HOST_COMPILER "@CUDA_HOST_COMPILER@") # path
 # We won't actually use these variables for now, but we need to set this, in
 # order to force this file to be run again if it changes.
 set(generated_file_path "@generated_file_path@") # path
@@ -106,7 +106,7 @@ list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
 # Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
 list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
 list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
-if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
+if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 AND CUDA_HOST_COMPILER )
  if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
    set(CCBIN -ccbin "${CCBIN}")
  else()
@@ -126,7 +126,7 @@ endif()
 # and other return variables are present after executing the process.
 macro(cuda_execute_process status command)
  set(_command ${command})
-  if(NOT _command STREQUAL "COMMAND")
+  if(NOT "x${_command}" STREQUAL "xCOMMAND")
    message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
  endif()
  if(verbose)
--- a/cmake/OpenCVDetectCUDA.cmake
+++ b/cmake/OpenCVDetectCUDA.cmake
@@ -15,19 +15,10 @@ endif()

 set(CMAKE_MODULE_PATH "${OpenCV_SOURCE_DIR}/cmake" ${CMAKE_MODULE_PATH})

-foreach(var INCLUDE LIBRARY PROGRAM)
-  set(__old_frpm_${var} "${CMAKE_FIND_ROOT_PATH_MODE_${var}}")
-endforeach()
-
-set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
-set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY BOTH)
-set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE NEVER)
-
-find_package(CUDA 4.2 QUIET)
-
-foreach(var INCLUDE LIBRARY PROGRAM)
-  set(CMAKE_FIND_ROOT_PATH_MODE_${var} "${__old_frpm_${var}}")
-endforeach()
+if(ANDROID AND "${CUDA_VERSION}" VERSION_LESS "7.0") # NOTE(review): CUDA_VERSION is read before find_host_package(CUDA) below - confirm it is already set (e.g. cached) on a first configure
+  set(CUDA_TARGET_OS_VARIANT "Android") # presumably consumed by the FindCUDA module - verify
+endif()
+find_host_package(CUDA 4.2 QUIET)

 list(REMOVE_AT CMAKE_MODULE_PATH 0)

@@ -159,10 +150,6 @@ if(CUDA_FOUND)

  if(ANDROID)
    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xptxas;-dlcm=ca")
-    if(${CUDA_VERSION} VERSION_LESS "7.0")
-      # since CUDA 7.0 OS variant is depricated
-      set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-target-os-variant=Android")
-    endif()
  endif()

  message(STATUS "CUDA NVCC target flags: ${CUDA_NVCC_FLAGS}")
@@ -265,4 +252,4 @@ if(HAVE_CUDA)
    set(CUDA_cufft_LIBRARY_ABS ${CUDA_cufft_LIBRARY})
    ocv_convert_to_lib_name(CUDA_cufft_LIBRARY ${CUDA_cufft_LIBRARY})
  endif()
-endif()
+endif()