Dynamic CUDA support library loading implemented for Linux.

Logical mistake in macro fixed; DeviceInfo deligate reimplemented; Build and warning fixes.
2013-12-13 17:28:29 +04:00 · 2013-12-13 17:28:29 +04:00 · 8660e048bc
commit 8660e048bc
parent d4087f19a2
6 changed files with 357 additions and 227 deletions
--- a/modules/core/CMakeLists.txt
+++ b/modules/core/CMakeLists.txt
@ -1,36 +1,76 @@
 set(the_description "The Core Functionality")

+macro(ocv_glob_module_sources_no_cuda)
+  file(GLOB_RECURSE lib_srcs "src/*.cpp")
+  file(GLOB_RECURSE lib_int_hdrs "src/*.hpp" "src/*.h")
+  file(GLOB lib_hdrs "include/opencv2/${name}/*.hpp" "include/opencv2/${name}/*.h")
+  file(GLOB lib_hdrs_detail "include/opencv2/${name}/detail/*.hpp" "include/opencv2/${name}/detail/*.h")
+
+  set(cuda_objs "")
+  set(lib_cuda_hdrs "")
+  if(HAVE_CUDA)
+    ocv_include_directories(${CUDA_INCLUDE_DIRS})
+    file(GLOB lib_cuda_hdrs "src/cuda/*.hpp")
+  endif()
+
+  source_group("Src" FILES ${lib_srcs} ${lib_int_hdrs})
+
+  file(GLOB cl_kernels "src/opencl/*.cl")
+  if(HAVE_opencv_ocl AND cl_kernels)
+    ocv_include_directories(${OPENCL_INCLUDE_DIRS})
+    add_custom_command(
+      OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.hpp"
+      COMMAND ${CMAKE_COMMAND} -DCL_DIR="${CMAKE_CURRENT_SOURCE_DIR}/src/opencl" -DOUTPUT="${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" -P "${OpenCV_SOURCE_DIR}/cmake/cl2cpp.cmake"
+      DEPENDS ${cl_kernels} "${OpenCV_SOURCE_DIR}/cmake/cl2cpp.cmake")
+    source_group("OpenCL" FILES ${cl_kernels} "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.hpp")
+    list(APPEND lib_srcs ${cl_kernels} "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.hpp")
+  endif()
+
+  source_group("Include" FILES ${lib_hdrs})
+  source_group("Include\\detail" FILES ${lib_hdrs_detail})
+
+  ocv_set_module_sources(${ARGN} HEADERS ${lib_hdrs} ${lib_hdrs_detail}
+                                 SOURCES ${lib_srcs} ${lib_int_hdrs} ${cuda_objs} ${lib_cuda_hdrs})
+endmacro()
+
+ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES})
+ocv_module_include_directories(${ZLIB_INCLUDE_DIR})
+
 if(HAVE_WINRT)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /ZW /GS /Gm- /AI\"${WINDOWS_SDK_PATH}/References/CommonConfiguration/Neutral\" /AI\"${VISUAL_STUDIO_PATH}/vcpackages\"")
 endif()

+if(DYNAMIC_CUDA_SUPPORT)
+  add_definitions(-DDYNAMIC_CUDA_SUPPORT)
+else()
+  add_definitions(-DUSE_CUDA)
+endif()
+
+if(HAVE_CUDA)
+  ocv_include_directories("${OpenCV_SOURCE_DIR}/modules/gpu/include")
+  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef)
+endif()
+
 file(GLOB lib_cuda_hdrs        "include/opencv2/${name}/cuda/*.hpp"        "include/opencv2/${name}/cuda/*.h")
 file(GLOB lib_cuda_hdrs_detail "include/opencv2/${name}/cuda/detail/*.hpp" "include/opencv2/${name}/cuda/detail/*.h")

 source_group("Cuda Headers"         FILES ${lib_cuda_hdrs})
 source_group("Cuda Headers\\Detail" FILES ${lib_cuda_hdrs_detail})

-if(DYNAMIC_CUDA_SUPPORT)
-  add_definitions(-DDYNAMIC_CUDA_SUPPORT)
+if (DYNAMIC_CUDA_SUPPORT)
+  ocv_glob_module_sources_no_cuda(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc"
+                                  HEADERS ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail})
+else()
+  ocv_glob_module_sources(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc"
+                          HEADERS ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail})
 endif()

-ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES})
-ocv_module_include_directories(${ZLIB_INCLUDE_DIR})
-
-if(HAVE_CUDA)
-  ocv_include_directories("${OpenCV_SOURCE_DIR}/modules/gpu/include")
-  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef)
-endif()
-
-ocv_glob_module_sources(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc"
-                        HEADERS ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail})
-
 ocv_create_module()
 ocv_add_precompiled_headers(${the_module})

 ocv_add_accuracy_tests()
 ocv_add_perf_tests()

-if(DYNAMIC_CUDA_SUPPORT)
+if (DYNAMIC_CUDA_SUPPORT)
  add_subdirectory(cuda)
 endif()
--- a/modules/core/cuda/CMakeLists.txt
+++ b/modules/core/cuda/CMakeLists.txt
@ -1,6 +1,5 @@
 project(opencv_core_cuda)
-set(HAVE_CUDA FALSE)
-add_definitions("-DHAVE_CUDA")
+add_definitions(-DUSE_CUDA)
 include_directories(${CUDA_INCLUDE_DIRS}
                    "../src/"
                    "../include/opencv2/core/"
--- a/modules/core/cuda/main.cpp
+++ b/modules/core/cuda/main.cpp
@ -1,6 +1,10 @@
+#include "cvconfig.h"
 #include "opencv2/core/core.hpp"
 #include "opencv2/core/gpumat.hpp"

+#include <stdio.h>
+#include <iostream>
+
 #ifdef HAVE_CUDA
 #include <cuda_runtime.h>
 #include <npp.h>
@ -17,7 +21,30 @@
 #endif
 #endif

+using namespace std;
 using namespace cv;
 using namespace cv::gpu;

-#include "gpumat_cuda.hpp"
+#include "gpumat_cuda.hpp"
+
+#ifdef HAVE_CUDA
+static CudaDeviceInfoFuncTable deviceInfoTable;
+static CudaFuncTable gpuTable;
+#else
+static EmptyDeviceInfoFuncTable deviceInfoTable;
+static EmptyFuncTable gpuTable;
+#endif
+
+extern "C" {
+   
+DeviceInfoFuncTable* deviceInfoFactory()
+{
+    return (DeviceInfoFuncTable*)&deviceInfoTable;
+}
+
+GpuFuncTable* gpuFactory()
+{
+    return (GpuFuncTable*)&gpuTable;
+}
+
+}
--- a/modules/core/include/opencv2/core/gpumat.hpp
+++ b/modules/core/include/opencv2/core/gpumat.hpp
@ -137,6 +137,9 @@ namespace cv { namespace gpu
        int deviceID() const { return device_id_; }

    private:
+        // Private section is fictive to preserve bin compatibility.
+        // Changes in the private fields there have no effects.
+        // see deligate code.
        void query();

        int device_id_;
--- a/modules/core/src/gpumat.cpp
+++ b/modules/core/src/gpumat.cpp
@ -43,8 +43,9 @@
 #include "precomp.hpp"
 #include "opencv2/core/gpumat.hpp"
 #include <iostream>
+#include <dlfcn.h>

-#if defined(HAVE_CUDA)
+#if defined(HAVE_CUDA) || defined(DYNAMIC_CUDA_SUPPORT)
    #include <cuda_runtime.h>
    #include <npp.h>

@ -66,15 +67,81 @@ using namespace cv::gpu;

 #include "gpumat_cuda.hpp"

-namespace
+typedef GpuFuncTable* (*GpuFactoryType)();
+typedef DeviceInfoFuncTable* (*DeviceInfoFactoryType)();
+
+static GpuFactoryType gpuFactory = NULL;
+static DeviceInfoFactoryType deviceInfoFactory = NULL;
+
+static const std::string getCudaSupportLibName()
 {
-    const GpuFuncTable* gpuFuncTable()
-    {
-        static EmptyFuncTable funcTable;
-        return &funcTable;
-    }
+    return "libopencv_core_cuda.so";
 }

+static bool loadCudaSupportLib()
+{
+    void* handle;
+    const std::string name = getCudaSupportLibName();
+    handle = dlopen(name.c_str(), RTLD_LAZY);
+    if (!handle)
+        return false;
+
+    deviceInfoFactory = (DeviceInfoFactoryType)dlsym(handle, "deviceInfoFactory");
+    if (!deviceInfoFactory)
+    {
+        dlclose(handle);
+        return false;
+    }
+    
+    gpuFactory = (GpuFactoryType)dlsym(handle, "gpuFactory");
+    if (!gpuFactory)
+    {
+        dlclose(handle);
+        return false;
+    }
+
+    dlclose(handle);
+
+    return true;
+}
+
+static GpuFuncTable* gpuFuncTable()
+{
+#ifdef DYNAMIC_CUDA_SUPPORT
+   static EmptyFuncTable stub;
+   static GpuFuncTable* libFuncTable = loadCudaSupportLib() ? gpuFactory(): (GpuFuncTable*)&stub;
+   static GpuFuncTable *funcTable = libFuncTable ? libFuncTable : (GpuFuncTable*)&stub;
+#else
+# ifdef USE_CUDA
+   static CudaFuncTable impl;
+   static GpuFuncTable* funcTable = &impl;
+#else
+   static EmptyFuncTable stub;
+   static GpuFuncTable* funcTable = &stub;
+#endif
+#endif
+   return funcTable;
+}
+
+static DeviceInfoFuncTable* deviceInfoFuncTable()
+{
+#ifdef DYNAMIC_CUDA_SUPPORT
+   static EmptyDeviceInfoFuncTable stub;
+   static DeviceInfoFuncTable* libFuncTable = loadCudaSupportLib() ? deviceInfoFactory(): (DeviceInfoFuncTable*)&stub;
+   static DeviceInfoFuncTable* funcTable = libFuncTable ? libFuncTable : (DeviceInfoFuncTable*)&stub;
+#else
+# ifdef USE_CUDA
+   static CudaDeviceInfoFuncTable impl;
+   static DeviceInfoFuncTable* funcTable = &impl;
+#else
+   static EmptyFuncTable stub;
+   static DeviceInfoFuncTable* funcTable = &stub;
+#endif
+#endif
+   return funcTable;
+}
+
+
 //////////////////////////////// Initialization & Info ////////////////////////

 int cv::gpu::getCudaEnabledDeviceCount() { return gpuFuncTable()->getCudaEnabledDeviceCount(); }
@ -95,13 +162,13 @@ bool cv::gpu::TargetArchs::hasEqualOrGreater(int major, int minor) { return gpuF
 bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor) { return gpuFuncTable()->hasEqualOrGreaterPtx(major, minor); }
 bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor) { return gpuFuncTable()->hasEqualOrGreaterBin(major, minor); }

-size_t cv::gpu::DeviceInfo::sharedMemPerBlock() const { return gpuFuncTable()->sharedMemPerBlock(); }
-void cv::gpu::DeviceInfo::queryMemory(size_t& total_memory, size_t& free_memory) const { gpuFuncTable()->queryMemory(total_memory, free_memory); }
-size_t cv::gpu::DeviceInfo::freeMemory() const { return gpuFuncTable()->freeMemory(); }
-size_t cv::gpu::DeviceInfo::totalMemory() const { return gpuFuncTable()->totalMemory(); }
-bool cv::gpu::DeviceInfo::supports(FeatureSet feature_set) const { return gpuFuncTable()->supports(feature_set); }
-bool cv::gpu::DeviceInfo::isCompatible() const { return gpuFuncTable()->isCompatible(); }
-void cv::gpu::DeviceInfo::query() { gpuFuncTable()->query(); }
+size_t cv::gpu::DeviceInfo::sharedMemPerBlock() const { return deviceInfoFuncTable()->sharedMemPerBlock(); }
+void cv::gpu::DeviceInfo::queryMemory(size_t& total_memory, size_t& free_memory) const { deviceInfoFuncTable()->queryMemory(total_memory, free_memory); }
+size_t cv::gpu::DeviceInfo::freeMemory() const { return deviceInfoFuncTable()->freeMemory(); }
+size_t cv::gpu::DeviceInfo::totalMemory() const { return deviceInfoFuncTable()->totalMemory(); }
+bool cv::gpu::DeviceInfo::supports(FeatureSet feature_set) const { return deviceInfoFuncTable()->supports(feature_set); }
+bool cv::gpu::DeviceInfo::isCompatible() const { return deviceInfoFuncTable()->isCompatible(); }
+void cv::gpu::DeviceInfo::query() { deviceInfoFuncTable()->query(); }

 void cv::gpu::printCudaDeviceInfo(int device) { gpuFuncTable()->printCudaDeviceInfo(device); }
 void cv::gpu::printShortCudaDeviceInfo(int device) { gpuFuncTable()->printShortCudaDeviceInfo(device); }
@ -556,7 +623,7 @@ namespace cv { namespace gpu
    
    void setTo(GpuMat& src, Scalar s, cudaStream_t stream)
    {
-        gpuFuncTable()->setTo(src, s, stream);
+        gpuFuncTable()->setTo(src, s, cv::gpu::GpuMat(), stream);
    }
    
    void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream)
--- a/modules/core/src/gpumat_cuda.hpp
+++ b/modules/core/src/gpumat_cuda.hpp
@ -1,30 +1,19 @@
-namespace
-{
-#if defined(HAVE_CUDA) && !defined(DYNAMIC_CUDA_SUPPORT)
+#ifndef __GPUMAT_CUDA_HPP__
+#define __GPUMAT_CUDA_HPP__

-    #define cudaSafeCall(expr)  ___cudaSafeCall(expr, __FILE__, __LINE__, CV_Func)
-    #define nppSafeCall(expr)  ___nppSafeCall(expr, __FILE__, __LINE__, CV_Func)
-
-    inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
+    class DeviceInfoFuncTable
    {
-        if (cudaSuccess != err)
-            cv::gpu::error(cudaGetErrorString(err), file, line, func);
-    }
-
-    inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "")
-    {
-        if (err < 0)
-        {
-            std::ostringstream msg;
-            msg << "NPP API Call Error: " << err;
-            cv::gpu::error(msg.str().c_str(), file, line, func);
-        }
-    }
-#endif
-}
-
-namespace
-{
+    public:
+        virtual size_t sharedMemPerBlock() const = 0;
+        virtual void queryMemory(size_t&, size_t&) const = 0;
+        virtual size_t freeMemory() const = 0;
+        virtual size_t totalMemory() const = 0;
+        virtual bool supports(FeatureSet) const = 0;
+        virtual bool isCompatible() const = 0;
+        virtual void query() = 0;
+        virtual ~DeviceInfoFuncTable() {};
+    };
+    
    class GpuFuncTable
    {
    public:
@ -40,6 +29,7 @@ namespace

        virtual bool deviceSupports(FeatureSet) const = 0;

+        // TargetArchs
        virtual bool builtWith(FeatureSet) const = 0;
        virtual bool has(int, int) const = 0;
        virtual bool hasPtx(int, int) const = 0;
@ -49,14 +39,6 @@ namespace
        virtual bool hasEqualOrGreaterPtx(int, int) const = 0;
        virtual bool hasEqualOrGreaterBin(int, int) const = 0;

-        virtual size_t sharedMemPerBlock() const = 0;
-        virtual void queryMemory(size_t&, size_t&) const = 0;
-        virtual size_t freeMemory() const = 0;
-        virtual size_t totalMemory() const = 0;
-        virtual bool supports(FeatureSet) const = 0;
-        virtual bool isCompatible() const = 0;
-        virtual void query() const = 0;
-
        virtual void printCudaDeviceInfo(int) const = 0;
        virtual void printShortCudaDeviceInfo(int) const = 0;
        
@ -72,17 +54,24 @@ namespace
        virtual void convert(const GpuMat& src, GpuMat& dst) const = 0;

        // for gpu::device::setTo funcs
-        virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, CUstream_st*) const = 0;
        virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, CUstream_st*) const = 0;
        
        virtual void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const = 0;
        virtual void free(void* devPtr) const = 0;
    };
-}

-#if !defined(HAVE_CUDA) || defined(DYNAMIC_CUDA_SUPPORT)
-namespace
-{
+    class EmptyDeviceInfoFuncTable: public DeviceInfoFuncTable
+    {
+    public:
+        size_t sharedMemPerBlock() const { throw_nogpu; return 0; }
+        void queryMemory(size_t&, size_t&) const { throw_nogpu; }
+        size_t freeMemory() const { throw_nogpu; return 0; }
+        size_t totalMemory() const { throw_nogpu; return 0; }
+        bool supports(FeatureSet) const { throw_nogpu; return false; }
+        bool isCompatible() const { throw_nogpu; return false; }
+        void query() { throw_nogpu; }
+    };
+    
    class EmptyFuncTable : public GpuFuncTable
    {
    public:
@ -105,15 +94,7 @@ namespace
        bool hasEqualOrGreater(int, int) const { throw_nogpu; return false; }
        bool hasEqualOrGreaterPtx(int, int) const { throw_nogpu; return false; }
        bool hasEqualOrGreaterBin(int, int) const { throw_nogpu; return false; }
-        
-        size_t sharedMemPerBlock() const { throw_nogpu; return 0; }
-        void queryMemory(size_t&, size_t&) const { throw_nogpu; }
-        size_t freeMemory() const { throw_nogpu; return 0; }
-        size_t totalMemory() const { throw_nogpu; return 0; }
-        bool supports(FeatureSet) const { throw_nogpu; return false; }
-        bool isCompatible() const { throw_nogpu; return false; }
-        void query() const { throw_nogpu; }
-        
+                
        void printCudaDeviceInfo(int) const { throw_nogpu; }
        void printShortCudaDeviceInfo(int) const { throw_nogpu; }
        
@ -126,15 +107,32 @@ namespace
        void convert(const GpuMat&, GpuMat&) const { throw_nogpu; }
        void convert(const GpuMat&, GpuMat&, double, double, cudaStream_t stream = 0) const { (void)stream; throw_nogpu; }

-        virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, CUstream_st*) const { throw_nogpu; }
        virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, CUstream_st*) const { throw_nogpu; }

        void mallocPitch(void**, size_t*, size_t, size_t) const { throw_nogpu; }
        void free(void*) const {}
    };
+
+#if defined(USE_CUDA)
+
+#define cudaSafeCall(expr)  ___cudaSafeCall(expr, __FILE__, __LINE__, CV_Func)
+#define nppSafeCall(expr)  ___nppSafeCall(expr, __FILE__, __LINE__, CV_Func)
+
+inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
+{
+    if (cudaSuccess != err)
+        cv::gpu::error(cudaGetErrorString(err), file, line, func);
 }

-#else
+inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "")
+{
+    if (err < 0)
+    {
+        std::ostringstream msg;
+        msg << "NPP API Call Error: " << err;
+        cv::gpu::error(msg.str().c_str(), file, line, func);
+    }
+}

 namespace cv { namespace gpu { namespace device
 {
@ -149,8 +147,6 @@ namespace cv { namespace gpu { namespace device
    void convert_gpu(PtrStepSzb src, int sdepth, PtrStepSzb dst, int ddepth, double alpha, double beta, cudaStream_t stream);
 }}}

-namespace
-{
    template <typename T> void kernelSetCaller(GpuMat& src, Scalar s, cudaStream_t stream)
    {
        Scalar_<T> sf = s;
@ -162,10 +158,7 @@ namespace
        Scalar_<T> sf = s;
        cv::gpu::device::set_to_gpu(src, sf.val, mask, src.channels(), stream);
    }
-}

-namespace
-{
    template<int n> struct NPPTypeTraits;
    template<> struct NPPTypeTraits<CV_8U>  { typedef Npp8u npp_type; };
    template<> struct NPPTypeTraits<CV_8S>  { typedef Npp8s npp_type; };
@ -208,6 +201,7 @@ namespace
            cudaSafeCall( cudaDeviceSynchronize() );
        }
    };
+    
    template<int DDEPTH, typename NppConvertFunc<CV_32F, DDEPTH>::func_ptr func> struct NppCvt<CV_32F, DDEPTH, func>
    {
        typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
@ -361,9 +355,8 @@ namespace
    {
        return reinterpret_cast<size_t>(ptr) % size == 0;
    }
-}
     
-    namespace cv { namespace gpu { namespace devices
+    namespace cv { namespace gpu { namespace device
    {
        void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream = 0)
        {
@ -418,74 +411,52 @@ namespace
        {
            setTo(src, s, mask, 0);
        }
-    }}
+    }}}

-namespace
-{
-    class CudaFuncTable : public GpuFuncTable
+
+    class CudaArch
    {
-    protected:
-        
-        class CudaArch
-        {
-        public:
-            CudaArch();
-            
-            bool builtWith(FeatureSet feature_set) const;
-            bool hasPtx(int major, int minor) const;
-            bool hasBin(int major, int minor) const;
-            bool hasEqualOrLessPtx(int major, int minor) const;
-            bool hasEqualOrGreaterPtx(int major, int minor) const;
-            bool hasEqualOrGreaterBin(int major, int minor) const;
-            
-        private:
-            static void fromStr(const string& set_as_str, vector<int>& arr);
-            
-            vector<int> bin;
-            vector<int> ptx;
-            vector<int> features;
-        };
-        
-        const CudaArch cudaArch;
-        
-        CudaArch::CudaArch()
+    public:
+        CudaArch()
        {
            fromStr(CUDA_ARCH_BIN, bin);
            fromStr(CUDA_ARCH_PTX, ptx);
            fromStr(CUDA_ARCH_FEATURES, features);
        }
        
-        bool CudaArch::builtWith(FeatureSet feature_set) const
+        bool builtWith(FeatureSet feature_set) const
        {
            return !features.empty() && (features.back() >= feature_set);
        }
        
-        bool CudaArch::hasPtx(int major, int minor) const
+        bool hasPtx(int major, int minor) const
        {
            return find(ptx.begin(), ptx.end(), major * 10 + minor) != ptx.end();
        }
        
-        bool CudaArch::hasBin(int major, int minor) const
+        bool hasBin(int major, int minor) const
        {
            return find(bin.begin(), bin.end(), major * 10 + minor) != bin.end();
        }
        
-        bool CudaArch::hasEqualOrLessPtx(int major, int minor) const
+        bool hasEqualOrLessPtx(int major, int minor) const
        {
            return !ptx.empty() && (ptx.front() <= major * 10 + minor);
        }
        
-        bool CudaArch::hasEqualOrGreaterPtx(int major, int minor) const
+        bool hasEqualOrGreaterPtx(int major, int minor) const
        {
            return !ptx.empty() && (ptx.back() >= major * 10 + minor);
        }
        
-        bool CudaArch::hasEqualOrGreaterBin(int major, int minor) const
+        bool hasEqualOrGreaterBin(int major, int minor) const
        {
            return !bin.empty() && (bin.back() >= major * 10 + minor);
        }
        
-        void CudaArch::fromStr(const string& set_as_str, vector<int>& arr)
+        
+    private:
+        void fromStr(const string& set_as_str, vector<int>& arr)
        {
            if (set_as_str.find_first_not_of(" ") == string::npos)
                return;
@ -501,25 +472,21 @@ namespace
            
            sort(arr.begin(), arr.end());
        }
-
-        class DeviceProps
-        {
-        public:
-            DeviceProps();
-            ~DeviceProps();
-            
-            cudaDeviceProp* get(int devID);
-            
-        private:
-            std::vector<cudaDeviceProp*> props_;
-        };
        
-        DeviceProps::DeviceProps()
+        vector<int> bin;
+        vector<int> ptx;
+        vector<int> features;
+    };
+
+    class DeviceProps
+    {
+    public:
+        DeviceProps()
        {
            props_.resize(10, 0);
        }
        
-        DeviceProps::~DeviceProps()
+        ~DeviceProps()
        {
            for (size_t i = 0; i < props_.size(); ++i)
            {
@ -529,7 +496,7 @@ namespace
            props_.clear();
        }
        
-        cudaDeviceProp* DeviceProps::get(int devID)
+        cudaDeviceProp* get(int devID)
        {
            if (devID >= (int) props_.size())
                props_.resize(devID + 5, 0);
@ -542,10 +509,92 @@ namespace
            
            return props_[devID];
        }
-        
-        DeviceProps deviceProps;
+    private:
+        std::vector<cudaDeviceProp*> props_;
+    };

-        int convertSMVer2Cores(int major, int minor)
+    DeviceProps deviceProps;
+    
+    class CudaDeviceInfoFuncTable: DeviceInfoFuncTable
+    {
+    public:
+        size_t sharedMemPerBlock() const
+        {
+            return deviceProps.get(device_id_)->sharedMemPerBlock;
+        }
+        
+        void queryMemory(size_t& _totalMemory, size_t& _freeMemory) const
+        {
+            int prevDeviceID = getDevice();
+            if (prevDeviceID != device_id_)
+                setDevice(device_id_);
+            
+            cudaSafeCall( cudaMemGetInfo(&_freeMemory, &_totalMemory) );
+            
+            if (prevDeviceID != device_id_)
+                setDevice(prevDeviceID);
+        }
+        
+        size_t freeMemory() const
+        {
+            size_t _totalMemory, _freeMemory;
+            queryMemory(_totalMemory, _freeMemory);
+            return _freeMemory;
+        }
+        
+        size_t totalMemory() const
+        {
+            size_t _totalMemory, _freeMemory;
+            queryMemory(_totalMemory, _freeMemory);
+            return _totalMemory;
+        }
+        
+        bool supports(FeatureSet feature_set) const
+        {
+            int version = majorVersion_ * 10 + minorVersion_;
+            return version >= feature_set;
+        }
+        
+        bool isCompatible() const
+        {
+            // Check PTX compatibility
+            if (TargetArchs::hasEqualOrLessPtx(majorVersion_, minorVersion_))
+                return true;
+            
+            // Check BIN compatibility
+                for (int i = minorVersion_; i >= 0; --i)
+                    if (TargetArchs::hasBin(majorVersion_, i))
+                        return true;
+                    
+                    return false;
+        }
+        
+        void query()
+        {
+            const cudaDeviceProp* prop = deviceProps.get(device_id_);
+            
+            name_ = prop->name;
+            multi_processor_count_ = prop->multiProcessorCount;
+            majorVersion_ = prop->major;
+            minorVersion_ = prop->minor;
+        }
+
+    private:
+        int device_id_;
+        
+        std::string name_;
+        int multi_processor_count_;
+        int majorVersion_;
+        int minorVersion_;
+    };
+    
+    class CudaFuncTable : public GpuFuncTable
+    {
+    protected:
+              
+        const CudaArch cudaArch;
+
+        int convertSMVer2Cores(int major, int minor) const
        {
            // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM
            typedef struct {
@ -600,42 +649,42 @@ namespace
            cudaSafeCall( cudaDeviceReset() );
        }
        
-        bool TargetArchs::builtWith(FeatureSet feature_set) const
+        bool builtWith(FeatureSet feature_set) const
        {
            return cudaArch.builtWith(feature_set);
        }
        
-        bool TargetArchs::has(int major, int minor) const
+        bool has(int major, int minor) const
        {
            return hasPtx(major, minor) || hasBin(major, minor);
        }
        
-        bool TargetArchs::hasPtx(int major, int minor) const
+        bool hasPtx(int major, int minor) const
        {
            return cudaArch.hasPtx(major, minor);
        }
        
-        bool TargetArchs::hasBin(int major, int minor) const
+        bool hasBin(int major, int minor) const
        {
            return cudaArch.hasBin(major, minor);
        }
        
-        bool TargetArchs::hasEqualOrLessPtx(int major, int minor) const
+        bool hasEqualOrLessPtx(int major, int minor) const
        {
            return cudaArch.hasEqualOrLessPtx(major, minor);
        }
        
-        bool TargetArchs::hasEqualOrGreater(int major, int minor) const
+        bool hasEqualOrGreater(int major, int minor) const
        {
            return hasEqualOrGreaterPtx(major, minor) || hasEqualOrGreaterBin(major, minor);
        }
        
-        bool TargetArchs::hasEqualOrGreaterPtx(int major, int minor) const
+        bool hasEqualOrGreaterPtx(int major, int minor) const
        {
            return cudaArch.hasEqualOrGreaterPtx(major, minor);
        }
        
-        bool TargetArchs::hasEqualOrGreaterBin(int major, int minor) const
+        bool hasEqualOrGreaterBin(int major, int minor) const
        {
            return cudaArch.hasEqualOrGreaterBin(major, minor);
        }
@ -664,68 +713,7 @@ namespace
            
            return TargetArchs::builtWith(feature_set) && (version >= feature_set);
        }
-        
-        size_t sharedMemPerBlock() const
-        {
-            return deviceProps.get(device_id_)->sharedMemPerBlock;
-        }
-        
-        void queryMemory(size_t& _totalMemory, size_t& _freeMemory) const
-        {
-            int prevDeviceID = getDevice();
-            if (prevDeviceID != device_id_)
-                setDevice(device_id_);
-            
-            cudaSafeCall( cudaMemGetInfo(&_freeMemory, &_totalMemory) );
-            
-            if (prevDeviceID != device_id_)
-                setDevice(prevDeviceID);
-        }
-        
-        size_t freeMemory() const
-        {
-            size_t _totalMemory, _freeMemory;
-            queryMemory(_totalMemory, _freeMemory);
-            return _freeMemory;
-        }
-        
-        size_t totalMemory() const
-        {
-            size_t _totalMemory, _freeMemory;
-            queryMemory(_totalMemory, _freeMemory);
-            return _totalMemory;
-        }
-        
-        bool supports(FeatureSet feature_set) const
-        {
-            int version = majorVersion() * 10 + minorVersion();
-            return version >= feature_set;
-        }
-        
-        bool isCompatible() const
-        {
-            // Check PTX compatibility
-            if (TargetArchs::hasEqualOrLessPtx(majorVersion(), minorVersion()))
-                return true;
-            
-            // Check BIN compatibility
-                for (int i = minorVersion(); i >= 0; --i)
-                    if (TargetArchs::hasBin(majorVersion(), i))
-                        return true;
-                    
-                    return false;
-        }
-        
-        void query() const
-        {
-            const cudaDeviceProp* prop = deviceProps.get(device_id_);
-            
-            name_ = prop->name;
-            multi_processor_count_ = prop->multiProcessorCount;
-            majorVersion_ = prop->major;
-            minorVersion_ = prop->minor;
-        }
-                
+                        
        void printCudaDeviceInfo(int device) const
        {
            int count = getCudaEnabledDeviceCount();
@ -864,16 +852,16 @@ namespace
            typedef void (*func_t)(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream);
            static const func_t funcs[7][4] =
            {
-                /*  8U */ {NppCopyMasked<CV_8U , nppiCopy_8u_C1MR >::call, cv::gpu::details::copyWithMask, NppCopyMasked<CV_8U , nppiCopy_8u_C3MR >::call, NppCopyMasked<CV_8U , nppiCopy_8u_C4MR >::call},
-                /*  8S */ {cv::gpu::details::copyWithMask                , cv::gpu::details::copyWithMask, cv::gpu::details::copyWithMask                         , cv::gpu::details::copyWithMask                         },
-                /* 16U */ {NppCopyMasked<CV_16U, nppiCopy_16u_C1MR>::call, cv::gpu::details::copyWithMask, NppCopyMasked<CV_16U, nppiCopy_16u_C3MR>::call, NppCopyMasked<CV_16U, nppiCopy_16u_C4MR>::call},
-                /* 16S */ {NppCopyMasked<CV_16S, nppiCopy_16s_C1MR>::call, cv::gpu::details::copyWithMask, NppCopyMasked<CV_16S, nppiCopy_16s_C3MR>::call, NppCopyMasked<CV_16S, nppiCopy_16s_C4MR>::call},
-                /* 32S */ {NppCopyMasked<CV_32S, nppiCopy_32s_C1MR>::call, cv::gpu::details::copyWithMask, NppCopyMasked<CV_32S, nppiCopy_32s_C3MR>::call, NppCopyMasked<CV_32S, nppiCopy_32s_C4MR>::call},
-                /* 32F */ {NppCopyMasked<CV_32F, nppiCopy_32f_C1MR>::call, cv::gpu::details::copyWithMask, NppCopyMasked<CV_32F, nppiCopy_32f_C3MR>::call, NppCopyMasked<CV_32F, nppiCopy_32f_C4MR>::call},
-                /* 64F */ {cv::gpu::details::copyWithMask                , cv::gpu::details::copyWithMask, cv::gpu::details::copyWithMask                         , cv::gpu::details::copyWithMask                         }
+                /*  8U */ {NppCopyMasked<CV_8U , nppiCopy_8u_C1MR >::call, cv::gpu::device::copyWithMask, NppCopyMasked<CV_8U , nppiCopy_8u_C3MR >::call, NppCopyMasked<CV_8U , nppiCopy_8u_C4MR >::call},
+                /*  8S */ {cv::gpu::device::copyWithMask                ,  cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask                 , cv::gpu::device::copyWithMask                         },
+                /* 16U */ {NppCopyMasked<CV_16U, nppiCopy_16u_C1MR>::call, cv::gpu::device::copyWithMask, NppCopyMasked<CV_16U, nppiCopy_16u_C3MR>::call, NppCopyMasked<CV_16U, nppiCopy_16u_C4MR>::call},
+                /* 16S */ {NppCopyMasked<CV_16S, nppiCopy_16s_C1MR>::call, cv::gpu::device::copyWithMask, NppCopyMasked<CV_16S, nppiCopy_16s_C3MR>::call, NppCopyMasked<CV_16S, nppiCopy_16s_C4MR>::call},
+                /* 32S */ {NppCopyMasked<CV_32S, nppiCopy_32s_C1MR>::call, cv::gpu::device::copyWithMask, NppCopyMasked<CV_32S, nppiCopy_32s_C3MR>::call, NppCopyMasked<CV_32S, nppiCopy_32s_C4MR>::call},
+                /* 32F */ {NppCopyMasked<CV_32F, nppiCopy_32f_C1MR>::call, cv::gpu::device::copyWithMask, NppCopyMasked<CV_32F, nppiCopy_32f_C3MR>::call, NppCopyMasked<CV_32F, nppiCopy_32f_C4MR>::call},
+                /* 64F */ {cv::gpu::device::copyWithMask                ,  cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask                 , cv::gpu::device::copyWithMask                         }
            };

-            const func_t func =  mask.channels() == src.channels() ? funcs[src.depth()][src.channels() - 1] : cv::gpu::details::copyWithMask;
+            const func_t func =  mask.channels() == src.channels() ? funcs[src.depth()][src.channels() - 1] : cv::gpu::device::copyWithMask;

            func(src, dst, mask, 0);
        }
@ -971,7 +959,7 @@ namespace
            func(src, dst);
        }

-        void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta) const
+        void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream) const
        {
            CV_Assert(src.depth() <= CV_64F && src.channels() <= 4);
            CV_Assert(dst.depth() <= CV_64F);
@ -982,10 +970,10 @@ namespace
                    CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
            }

-            cv::gpu::device::convertTo(src, dst, alpha, beta);
+            cv::gpu::device::convertTo(src, dst, alpha, beta, stream);
        }

-        void setTo(GpuMat& m, Scalar s, const GpuMat& mask) const
+        void setTo(GpuMat& m, Scalar s, const GpuMat& mask, cudaStream_t stream) const
        {
            if (mask.empty())
            {
@ -1016,7 +1004,7 @@ namespace
                    {NppSet<CV_16S, 1, nppiSet_16s_C1R>::call, NppSet<CV_16S, 2, nppiSet_16s_C2R>::call, cv::gpu::device::setTo                        , NppSet<CV_16S, 4, nppiSet_16s_C4R>::call},
                    {NppSet<CV_32S, 1, nppiSet_32s_C1R>::call, cv::gpu::device::setTo                  , cv::gpu::device::setTo                        , NppSet<CV_32S, 4, nppiSet_32s_C4R>::call},
                    {NppSet<CV_32F, 1, nppiSet_32f_C1R>::call, cv::gpu::device::setTo                  , cv::gpu::device::setTo                        , NppSet<CV_32F, 4, nppiSet_32f_C4R>::call},
-                    {cv::gpu::device::setTo                  , cv::gpu::device::setTo                 , cv::gpu::device::setTo                        , cv::gpu::device::setTo                          }
+                    {cv::gpu::device::setTo                  , cv::gpu::device::setTo                  , cv::gpu::device::setTo                        , cv::gpu::device::setTo                          }
                };

                CV_Assert(m.depth() <= CV_64F && m.channels() <= 4);
@ -1027,7 +1015,10 @@ namespace
                        CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
                }

-                funcs[m.depth()][m.channels() - 1](m, s);
+                if (stream)
+                    cv::gpu::device::setTo(m, s, stream);
+                else
+                    funcs[m.depth()][m.channels() - 1](m, s);
            }
            else
            {
@ -1051,7 +1042,10 @@ namespace
                        CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
                }

-                funcs[m.depth()][m.channels() - 1](m, s, mask);
+                if (stream)
+                    cv::gpu::device::setTo(m, s, mask, stream);
+                else
+                    funcs[m.depth()][m.channels() - 1](m, s, mask);
            }
        }

@ -1065,5 +1059,5 @@ namespace
            cudaFree(devPtr);
        }
    };
-}
+#endif
 #endif