From d4087f19a2aa38c00b101b01d06c60dc70edf5d0 Mon Sep 17 00:00:00 2001
From: Alexander Smorkalov
Date: Wed, 11 Dec 2013 16:38:30 +0400
Subject: [PATCH 01/13] All CUDA-related stuff was moved to a separate dynamic library.

---
 modules/core/CMakeLists.txt                  |   23 +-
 modules/core/cuda/CMakeLists.txt             |   11 +
 modules/core/cuda/main.cpp                   |   23 +
 modules/core/include/opencv2/core/gpumat.hpp |    2 +
 modules/core/src/gpumat.cpp                  | 1145 ++----------------
 modules/core/src/gpumat_cuda.hpp             | 1069 ++++++++++++++++
 6 files changed, 1201 insertions(+), 1072 deletions(-)
 create mode 100644 modules/core/cuda/CMakeLists.txt
 create mode 100644 modules/core/cuda/main.cpp
 create mode 100644 modules/core/src/gpumat_cuda.hpp

diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt
index 66b8ae0d2..595198292 100644
--- a/modules/core/CMakeLists.txt
+++ b/modules/core/CMakeLists.txt
@@ -1,22 +1,27 @@
 set(the_description "The Core Functionality")
-ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES})
-ocv_module_include_directories(${ZLIB_INCLUDE_DIR})
 
 if(HAVE_WINRT)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /ZW /GS /Gm- /AI\"${WINDOWS_SDK_PATH}/References/CommonConfiguration/Neutral\" /AI\"${VISUAL_STUDIO_PATH}/vcpackages\"")
 endif()
 
-if(HAVE_CUDA)
-  ocv_include_directories("${OpenCV_SOURCE_DIR}/modules/gpu/include")
-  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef)
-endif()
-
 file(GLOB lib_cuda_hdrs "include/opencv2/${name}/cuda/*.hpp" "include/opencv2/${name}/cuda/*.h")
 file(GLOB lib_cuda_hdrs_detail "include/opencv2/${name}/cuda/detail/*.hpp" "include/opencv2/${name}/cuda/detail/*.h")
 
 source_group("Cuda Headers" FILES ${lib_cuda_hdrs})
 source_group("Cuda Headers\\Detail" FILES ${lib_cuda_hdrs_detail})
 
+if(DYNAMIC_CUDA_SUPPORT)
+  add_definitions(-DDYNAMIC_CUDA_SUPPORT)
+endif()
+
+ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES})
+ocv_module_include_directories(${ZLIB_INCLUDE_DIR})
+
+if(HAVE_CUDA)
+  ocv_include_directories("${OpenCV_SOURCE_DIR}/modules/gpu/include")
+  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef)
+endif()
+
 ocv_glob_module_sources(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc"
                         HEADERS ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail})
 
@@ -25,3 +30,7 @@ ocv_add_precompiled_headers(${the_module})
 
 ocv_add_accuracy_tests()
 ocv_add_perf_tests()
+
+if(DYNAMIC_CUDA_SUPPORT)
+  add_subdirectory(cuda)
+endif()
diff --git a/modules/core/cuda/CMakeLists.txt b/modules/core/cuda/CMakeLists.txt
new file mode 100644
index 000000000..0b1c9428d
--- /dev/null
+++ b/modules/core/cuda/CMakeLists.txt
@@ -0,0 +1,11 @@
+project(opencv_core_cuda)
+set(HAVE_CUDA FALSE)
+add_definitions("-DHAVE_CUDA")
+include_directories(${CUDA_INCLUDE_DIRS}
+    "../src/"
+    "../include/opencv2/core/"
+    "${OpenCV_SOURCE_DIR}/modules/gpu/include"
+    )
+ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef)
+cuda_add_library(opencv_core_cuda SHARED main.cpp ../src/cuda/matrix_operations.cu)
+target_link_libraries(opencv_core_cuda ${CUDA_LIBRARIES})
\ No newline at end of file
diff --git a/modules/core/cuda/main.cpp b/modules/core/cuda/main.cpp
new file mode 100644
index 000000000..c4b8cbe1d
--- /dev/null
+++ b/modules/core/cuda/main.cpp
@@ -0,0 +1,23 @@
+#include "opencv2/core/core.hpp"
+#include "opencv2/core/gpumat.hpp"
+
+#ifdef HAVE_CUDA
+#include <cuda_runtime.h>
+#include <npp.h>
+
+#define CUDART_MINIMUM_REQUIRED_VERSION 4020
+#define NPP_MINIMUM_REQUIRED_VERSION 4200
+
+#if (CUDART_VERSION < CUDART_MINIMUM_REQUIRED_VERSION)
+#error "Insufficient Cuda Runtime library version, please update it."
+#endif
+
+#if (NPP_VERSION_MAJOR * 1000 + NPP_VERSION_MINOR * 100 + NPP_VERSION_BUILD < NPP_MINIMUM_REQUIRED_VERSION)
+#error "Insufficient NPP version, please update it."
+#endif
+#endif
+
+using namespace cv;
+using namespace cv::gpu;
+
+#include "gpumat_cuda.hpp"
\ No newline at end of file
diff --git a/modules/core/include/opencv2/core/gpumat.hpp b/modules/core/include/opencv2/core/gpumat.hpp
index 193c9aa70..b50210213 100644
--- a/modules/core/include/opencv2/core/gpumat.hpp
+++ b/modules/core/include/opencv2/core/gpumat.hpp
@@ -48,6 +48,8 @@
 #include "opencv2/core/core.hpp"
 #include "opencv2/core/cuda_devptrs.hpp"
 
+#define throw_nogpu CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support")
+
 namespace cv { namespace gpu
 {
     //////////////////////////////// Initialization & Info ////////////////////////
diff --git a/modules/core/src/gpumat.cpp b/modules/core/src/gpumat.cpp
index 4c4af61c4..9a2e36cb6 100644
--- a/modules/core/src/gpumat.cpp
+++ b/modules/core/src/gpumat.cpp
@@ -44,7 +44,7 @@
 #include "opencv2/core/gpumat.hpp"
 #include <iostream>
 
-#ifdef HAVE_CUDA
+#if defined(HAVE_CUDA)
     #include <cuda_runtime.h>
    #include <npp.h>
@@ -64,489 +64,62 @@ using namespace std;
 using namespace cv;
 using namespace cv::gpu;
 
-#ifndef HAVE_CUDA
-
-#define throw_nogpu CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support")
-
-#else // HAVE_CUDA
+#include "gpumat_cuda.hpp"
 
 namespace
 {
-#define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, CV_Func)
-#define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__, CV_Func)
-
-    inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
+    const GpuFuncTable* gpuFuncTable()
     {
-        if (cudaSuccess != err)
-            cv::gpu::error(cudaGetErrorString(err), file, line, func);
-    }
-
-    inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "")
-    {
-        if (err < 0)
-        {
-            std::ostringstream msg;
-            msg << "NPP API Call Error: " << err;
-            cv::gpu::error(msg.str().c_str(), file, line, func);
-        }
+        static EmptyFuncTable funcTable;
+        return &funcTable;
     }
 }
 
-#endif // HAVE_CUDA
-
 //////////////////////////////// Initialization & Info ////////////////////////
 
-#ifndef HAVE_CUDA
+int cv::gpu::getCudaEnabledDeviceCount() { return gpuFuncTable()->getCudaEnabledDeviceCount(); }
 
-int cv::gpu::getCudaEnabledDeviceCount() { return 0; }
+void cv::gpu::setDevice(int device) { gpuFuncTable()->setDevice(device); }
+int cv::gpu::getDevice() { return gpuFuncTable()->getDevice(); }
 
-void cv::gpu::setDevice(int) { throw_nogpu; }
-int cv::gpu::getDevice() { throw_nogpu; return 0; }
+void cv::gpu::resetDevice() { gpuFuncTable()->resetDevice(); }
 
-void cv::gpu::resetDevice() { throw_nogpu; }
+bool cv::gpu::deviceSupports(FeatureSet feature_set) { return gpuFuncTable()->deviceSupports(feature_set); }
 
-bool cv::gpu::deviceSupports(FeatureSet) { throw_nogpu; return false; }
+bool cv::gpu::TargetArchs::builtWith(FeatureSet feature_set) { return gpuFuncTable()->builtWith(feature_set); }
+bool cv::gpu::TargetArchs::has(int major, int minor) { return gpuFuncTable()->has(major, minor); }
+bool cv::gpu::TargetArchs::hasPtx(int major, int minor) { return gpuFuncTable()->hasPtx(major, minor); }
+bool cv::gpu::TargetArchs::hasBin(int major, int minor) { return gpuFuncTable()->hasBin(major, minor); }
+bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor) { return gpuFuncTable()->hasEqualOrLessPtx(major, minor); }
+bool cv::gpu::TargetArchs::hasEqualOrGreater(int major, int minor) { return
gpuFuncTable()->hasEqualOrGreater(major, minor); } +bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor) { return gpuFuncTable()->hasEqualOrGreaterPtx(major, minor); } +bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor) { return gpuFuncTable()->hasEqualOrGreaterBin(major, minor); } -bool cv::gpu::TargetArchs::builtWith(FeatureSet) { throw_nogpu; return false; } -bool cv::gpu::TargetArchs::has(int, int) { throw_nogpu; return false; } -bool cv::gpu::TargetArchs::hasPtx(int, int) { throw_nogpu; return false; } -bool cv::gpu::TargetArchs::hasBin(int, int) { throw_nogpu; return false; } -bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int, int) { throw_nogpu; return false; } -bool cv::gpu::TargetArchs::hasEqualOrGreater(int, int) { throw_nogpu; return false; } -bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int, int) { throw_nogpu; return false; } -bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int, int) { throw_nogpu; return false; } +size_t cv::gpu::DeviceInfo::sharedMemPerBlock() const { return gpuFuncTable()->sharedMemPerBlock(); } +void cv::gpu::DeviceInfo::queryMemory(size_t& total_memory, size_t& free_memory) const { gpuFuncTable()->queryMemory(total_memory, free_memory); } +size_t cv::gpu::DeviceInfo::freeMemory() const { return gpuFuncTable()->freeMemory(); } +size_t cv::gpu::DeviceInfo::totalMemory() const { return gpuFuncTable()->totalMemory(); } +bool cv::gpu::DeviceInfo::supports(FeatureSet feature_set) const { return gpuFuncTable()->supports(feature_set); } +bool cv::gpu::DeviceInfo::isCompatible() const { return gpuFuncTable()->isCompatible(); } +void cv::gpu::DeviceInfo::query() { gpuFuncTable()->query(); } -size_t cv::gpu::DeviceInfo::sharedMemPerBlock() const { throw_nogpu; return 0; } -void cv::gpu::DeviceInfo::queryMemory(size_t&, size_t&) const { throw_nogpu; } -size_t cv::gpu::DeviceInfo::freeMemory() const { throw_nogpu; return 0; } -size_t cv::gpu::DeviceInfo::totalMemory() const { throw_nogpu; return 0; } -bool cv::gpu::DeviceInfo::supports(FeatureSet) const { throw_nogpu; return false; } -bool cv::gpu::DeviceInfo::isCompatible() const { throw_nogpu; return false; } -void cv::gpu::DeviceInfo::query() { throw_nogpu; } +void cv::gpu::printCudaDeviceInfo(int device) { gpuFuncTable()->printCudaDeviceInfo(device); } +void cv::gpu::printShortCudaDeviceInfo(int device) { gpuFuncTable()->printShortCudaDeviceInfo(device); } -void cv::gpu::printCudaDeviceInfo(int) { throw_nogpu; } -void cv::gpu::printShortCudaDeviceInfo(int) { throw_nogpu; } +#ifdef HAVE_CUDA -#else // HAVE_CUDA - -int cv::gpu::getCudaEnabledDeviceCount() +namespace cv { namespace gpu { - int count; - cudaError_t error = cudaGetDeviceCount( &count ); - - if (error == cudaErrorInsufficientDriver) - return -1; - - if (error == cudaErrorNoDevice) - return 0; - - cudaSafeCall( error ); - return count; -} - -void cv::gpu::setDevice(int device) -{ - cudaSafeCall( cudaSetDevice( device ) ); -} - -int cv::gpu::getDevice() -{ - int device; - cudaSafeCall( cudaGetDevice( &device ) ); - return device; -} - -void cv::gpu::resetDevice() -{ - cudaSafeCall( cudaDeviceReset() ); -} - -namespace -{ - class CudaArch - { - public: - CudaArch(); - - bool builtWith(FeatureSet feature_set) const; - bool hasPtx(int major, int minor) const; - bool hasBin(int major, int minor) const; - bool hasEqualOrLessPtx(int major, int minor) const; - bool hasEqualOrGreaterPtx(int major, int minor) const; - bool hasEqualOrGreaterBin(int major, int minor) const; - - private: - static void fromStr(const string& 
set_as_str, vector& arr); - - vector bin; - vector ptx; - vector features; - }; - - const CudaArch cudaArch; - - CudaArch::CudaArch() - { - fromStr(CUDA_ARCH_BIN, bin); - fromStr(CUDA_ARCH_PTX, ptx); - fromStr(CUDA_ARCH_FEATURES, features); - } - - bool CudaArch::builtWith(FeatureSet feature_set) const - { - return !features.empty() && (features.back() >= feature_set); - } - - bool CudaArch::hasPtx(int major, int minor) const - { - return find(ptx.begin(), ptx.end(), major * 10 + minor) != ptx.end(); - } - - bool CudaArch::hasBin(int major, int minor) const - { - return find(bin.begin(), bin.end(), major * 10 + minor) != bin.end(); - } - - bool CudaArch::hasEqualOrLessPtx(int major, int minor) const - { - return !ptx.empty() && (ptx.front() <= major * 10 + minor); - } - - bool CudaArch::hasEqualOrGreaterPtx(int major, int minor) const - { - return !ptx.empty() && (ptx.back() >= major * 10 + minor); - } - - bool CudaArch::hasEqualOrGreaterBin(int major, int minor) const - { - return !bin.empty() && (bin.back() >= major * 10 + minor); - } - - void CudaArch::fromStr(const string& set_as_str, vector& arr) - { - if (set_as_str.find_first_not_of(" ") == string::npos) - return; - - istringstream stream(set_as_str); - int cur_value; - - while (!stream.eof()) - { - stream >> cur_value; - arr.push_back(cur_value); - } - - sort(arr.begin(), arr.end()); - } -} - -bool cv::gpu::TargetArchs::builtWith(cv::gpu::FeatureSet feature_set) -{ - return cudaArch.builtWith(feature_set); -} - -bool cv::gpu::TargetArchs::has(int major, int minor) -{ - return hasPtx(major, minor) || hasBin(major, minor); -} - -bool cv::gpu::TargetArchs::hasPtx(int major, int minor) -{ - return cudaArch.hasPtx(major, minor); -} - -bool cv::gpu::TargetArchs::hasBin(int major, int minor) -{ - return cudaArch.hasBin(major, minor); -} - -bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor) -{ - return cudaArch.hasEqualOrLessPtx(major, minor); -} - -bool cv::gpu::TargetArchs::hasEqualOrGreater(int major, int minor) -{ - return hasEqualOrGreaterPtx(major, minor) || hasEqualOrGreaterBin(major, minor); -} - -bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor) -{ - return cudaArch.hasEqualOrGreaterPtx(major, minor); -} - -bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor) -{ - return cudaArch.hasEqualOrGreaterBin(major, minor); -} - -bool cv::gpu::deviceSupports(FeatureSet feature_set) -{ - static int versions[] = - { - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 - }; - static const int cache_size = static_cast(sizeof(versions) / sizeof(versions[0])); - - const int devId = getDevice(); - - int version; - - if (devId < cache_size && versions[devId] >= 0) - version = versions[devId]; - else - { - DeviceInfo dev(devId); - version = dev.majorVersion() * 10 + dev.minorVersion(); - if (devId < cache_size) - versions[devId] = version; - } - - return TargetArchs::builtWith(feature_set) && (version >= feature_set); -} - -namespace -{ - class DeviceProps - { - public: - DeviceProps(); - ~DeviceProps(); - - cudaDeviceProp* get(int devID); - - private: - std::vector props_; - }; - - DeviceProps::DeviceProps() - { - props_.resize(10, 0); - } - - DeviceProps::~DeviceProps() - { - for (size_t i = 0; i < props_.size(); ++i) - { - if (props_[i]) - delete props_[i]; - } - props_.clear(); - } - - cudaDeviceProp* DeviceProps::get(int devID) - { - if (devID >= (int) props_.size()) - props_.resize(devID + 5, 0); - - if (!props_[devID]) - { - props_[devID] = new cudaDeviceProp; - cudaSafeCall( 
cudaGetDeviceProperties(props_[devID], devID) ); - } - - return props_[devID]; - } - - DeviceProps deviceProps; -} - -size_t cv::gpu::DeviceInfo::sharedMemPerBlock() const -{ - return deviceProps.get(device_id_)->sharedMemPerBlock; -} - -void cv::gpu::DeviceInfo::queryMemory(size_t& _totalMemory, size_t& _freeMemory) const -{ - int prevDeviceID = getDevice(); - if (prevDeviceID != device_id_) - setDevice(device_id_); - - cudaSafeCall( cudaMemGetInfo(&_freeMemory, &_totalMemory) ); - - if (prevDeviceID != device_id_) - setDevice(prevDeviceID); -} - -size_t cv::gpu::DeviceInfo::freeMemory() const -{ - size_t _totalMemory, _freeMemory; - queryMemory(_totalMemory, _freeMemory); - return _freeMemory; -} - -size_t cv::gpu::DeviceInfo::totalMemory() const -{ - size_t _totalMemory, _freeMemory; - queryMemory(_totalMemory, _freeMemory); - return _totalMemory; -} - -bool cv::gpu::DeviceInfo::supports(FeatureSet feature_set) const -{ - int version = majorVersion() * 10 + minorVersion(); - return version >= feature_set; -} - -bool cv::gpu::DeviceInfo::isCompatible() const -{ - // Check PTX compatibility - if (TargetArchs::hasEqualOrLessPtx(majorVersion(), minorVersion())) - return true; - - // Check BIN compatibility - for (int i = minorVersion(); i >= 0; --i) - if (TargetArchs::hasBin(majorVersion(), i)) - return true; - - return false; -} - -void cv::gpu::DeviceInfo::query() -{ - const cudaDeviceProp* prop = deviceProps.get(device_id_); - - name_ = prop->name; - multi_processor_count_ = prop->multiProcessorCount; - majorVersion_ = prop->major; - minorVersion_ = prop->minor; -} - -namespace -{ - int convertSMVer2Cores(int major, int minor) - { - // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM - typedef struct { - int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version - int Cores; - } SMtoCores; - - SMtoCores gpuArchCoresPerSM[] = { { 0x10, 8 }, { 0x11, 8 }, { 0x12, 8 }, { 0x13, 8 }, { 0x20, 32 }, { 0x21, 48 }, {0x30, 192}, {0x35, 192}, { -1, -1 } }; - - int index = 0; - while (gpuArchCoresPerSM[index].SM != -1) - { - if (gpuArchCoresPerSM[index].SM == ((major << 4) + minor) ) - return gpuArchCoresPerSM[index].Cores; - index++; - } - - return -1; - } -} - -void cv::gpu::printCudaDeviceInfo(int device) -{ - int count = getCudaEnabledDeviceCount(); - bool valid = (device >= 0) && (device < count); - - int beg = valid ? device : 0; - int end = valid ? 
device+1 : count; - - printf("*** CUDA Device Query (Runtime API) version (CUDART static linking) *** \n\n"); - printf("Device count: %d\n", count); - - int driverVersion = 0, runtimeVersion = 0; - cudaSafeCall( cudaDriverGetVersion(&driverVersion) ); - cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) ); - - const char *computeMode[] = { - "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)", - "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)", - "Prohibited (no host thread can use ::cudaSetDevice() with this device)", - "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)", - "Unknown", - NULL - }; - - for(int dev = beg; dev < end; ++dev) - { - cudaDeviceProp prop; - cudaSafeCall( cudaGetDeviceProperties(&prop, dev) ); - - printf("\nDevice %d: \"%s\"\n", dev, prop.name); - printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100); - printf(" CUDA Capability Major/Minor version number: %d.%d\n", prop.major, prop.minor); - printf(" Total amount of global memory: %.0f MBytes (%llu bytes)\n", (float)prop.totalGlobalMem/1048576.0f, (unsigned long long) prop.totalGlobalMem); - - int cores = convertSMVer2Cores(prop.major, prop.minor); - if (cores > 0) - printf(" (%2d) Multiprocessors x (%2d) CUDA Cores/MP: %d CUDA Cores\n", prop.multiProcessorCount, cores, cores * prop.multiProcessorCount); - - printf(" GPU Clock Speed: %.2f GHz\n", prop.clockRate * 1e-6f); - - printf(" Max Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n", - prop.maxTexture1D, prop.maxTexture2D[0], prop.maxTexture2D[1], - prop.maxTexture3D[0], prop.maxTexture3D[1], prop.maxTexture3D[2]); - printf(" Max Layered Texture Size (dim) x layers 1D=(%d) x %d, 2D=(%d,%d) x %d\n", - prop.maxTexture1DLayered[0], prop.maxTexture1DLayered[1], - prop.maxTexture2DLayered[0], prop.maxTexture2DLayered[1], prop.maxTexture2DLayered[2]); - - printf(" Total amount of constant memory: %u bytes\n", (int)prop.totalConstMem); - printf(" Total amount of shared memory per block: %u bytes\n", (int)prop.sharedMemPerBlock); - printf(" Total number of registers available per block: %d\n", prop.regsPerBlock); - printf(" Warp size: %d\n", prop.warpSize); - printf(" Maximum number of threads per block: %d\n", prop.maxThreadsPerBlock); - printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]); - printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]); - printf(" Maximum memory pitch: %u bytes\n", (int)prop.memPitch); - printf(" Texture alignment: %u bytes\n", (int)prop.textureAlignment); - - printf(" Concurrent copy and execution: %s with %d copy engine(s)\n", (prop.deviceOverlap ? "Yes" : "No"), prop.asyncEngineCount); - printf(" Run time limit on kernels: %s\n", prop.kernelExecTimeoutEnabled ? "Yes" : "No"); - printf(" Integrated GPU sharing Host Memory: %s\n", prop.integrated ? "Yes" : "No"); - printf(" Support host page-locked memory mapping: %s\n", prop.canMapHostMemory ? "Yes" : "No"); - - printf(" Concurrent kernel execution: %s\n", prop.concurrentKernels ? "Yes" : "No"); - printf(" Alignment requirement for Surfaces: %s\n", prop.surfaceAlignment ? "Yes" : "No"); - printf(" Device has ECC support enabled: %s\n", prop.ECCEnabled ? 
"Yes" : "No"); - printf(" Device is using TCC driver mode: %s\n", prop.tccDriver ? "Yes" : "No"); - printf(" Device supports Unified Addressing (UVA): %s\n", prop.unifiedAddressing ? "Yes" : "No"); - printf(" Device PCI Bus ID / PCI location ID: %d / %d\n", prop.pciBusID, prop.pciDeviceID ); - printf(" Compute Mode:\n"); - printf(" %s \n", computeMode[prop.computeMode]); - } - - printf("\n"); - printf("deviceQuery, CUDA Driver = CUDART"); - printf(", CUDA Driver Version = %d.%d", driverVersion / 1000, driverVersion % 100); - printf(", CUDA Runtime Version = %d.%d", runtimeVersion/1000, runtimeVersion%100); - printf(", NumDevs = %d\n\n", count); - fflush(stdout); -} - -void cv::gpu::printShortCudaDeviceInfo(int device) -{ - int count = getCudaEnabledDeviceCount(); - bool valid = (device >= 0) && (device < count); - - int beg = valid ? device : 0; - int end = valid ? device+1 : count; - - int driverVersion = 0, runtimeVersion = 0; - cudaSafeCall( cudaDriverGetVersion(&driverVersion) ); - cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) ); - - for(int dev = beg; dev < end; ++dev) - { - cudaDeviceProp prop; - cudaSafeCall( cudaGetDeviceProperties(&prop, dev) ); - - const char *arch_str = prop.major < 2 ? " (not Fermi)" : ""; - printf("Device %d: \"%s\" %.0fMb", dev, prop.name, (float)prop.totalGlobalMem/1048576.0f); - printf(", sm_%d%d%s", prop.major, prop.minor, arch_str); - - int cores = convertSMVer2Cores(prop.major, prop.minor); - if (cores > 0) - printf(", %d cores", cores * prop.multiProcessorCount); - - printf(", Driver/Runtime ver.%d.%d/%d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100); - } - fflush(stdout); -} - -#endif // HAVE_CUDA + CV_EXPORTS void copyWithMask(const cv::gpu::GpuMat&, cv::gpu::GpuMat&, const cv::gpu::GpuMat&, cudaStream_t); + CV_EXPORTS void convertTo(const cv::gpu::GpuMat&, cv::gpu::GpuMat&); + CV_EXPORTS void convertTo(const cv::gpu::GpuMat&, cv::gpu::GpuMat&, double, double, cudaStream_t = 0); + CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar, cudaStream_t); + CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, cudaStream_t); + CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar); + CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&); +}} + +#endif //////////////////////////////// GpuMat /////////////////////////////// @@ -830,601 +403,6 @@ GpuMat cv::gpu::allocMatFromBuf(int rows, int cols, int type, GpuMat &mat) return mat = GpuMat(rows, cols, type); } -namespace -{ - class GpuFuncTable - { - public: - virtual ~GpuFuncTable() {} - - virtual void copy(const Mat& src, GpuMat& dst) const = 0; - virtual void copy(const GpuMat& src, Mat& dst) const = 0; - virtual void copy(const GpuMat& src, GpuMat& dst) const = 0; - - virtual void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const = 0; - - virtual void convert(const GpuMat& src, GpuMat& dst) const = 0; - virtual void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta) const = 0; - - virtual void setTo(GpuMat& m, Scalar s, const GpuMat& mask) const = 0; - - virtual void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const = 0; - virtual void free(void* devPtr) const = 0; - }; -} - -#ifndef HAVE_CUDA - -namespace -{ - class EmptyFuncTable : public GpuFuncTable - { - public: - void copy(const Mat&, GpuMat&) const { throw_nogpu; } - void copy(const GpuMat&, Mat&) const { throw_nogpu; } - void copy(const GpuMat&, GpuMat&) const { throw_nogpu; } - - void 
copyWithMask(const GpuMat&, GpuMat&, const GpuMat&) const { throw_nogpu; } - - void convert(const GpuMat&, GpuMat&) const { throw_nogpu; } - void convert(const GpuMat&, GpuMat&, double, double) const { throw_nogpu; } - - void setTo(GpuMat&, Scalar, const GpuMat&) const { throw_nogpu; } - - void mallocPitch(void**, size_t*, size_t, size_t) const { throw_nogpu; } - void free(void*) const {} - }; - - const GpuFuncTable* gpuFuncTable() - { - static EmptyFuncTable empty; - return ∅ - } -} - -#else // HAVE_CUDA - -namespace cv { namespace gpu { namespace device -{ - void copyToWithMask_gpu(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream); - - template - void set_to_gpu(PtrStepSzb mat, const T* scalar, int channels, cudaStream_t stream); - - template - void set_to_gpu(PtrStepSzb mat, const T* scalar, PtrStepSzb mask, int channels, cudaStream_t stream); - - void convert_gpu(PtrStepSzb src, int sdepth, PtrStepSzb dst, int ddepth, double alpha, double beta, cudaStream_t stream); -}}} - -namespace -{ - template void kernelSetCaller(GpuMat& src, Scalar s, cudaStream_t stream) - { - Scalar_ sf = s; - cv::gpu::device::set_to_gpu(src, sf.val, src.channels(), stream); - } - - template void kernelSetCaller(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream) - { - Scalar_ sf = s; - cv::gpu::device::set_to_gpu(src, sf.val, mask, src.channels(), stream); - } -} - - -namespace cv { namespace gpu -{ - CV_EXPORTS void copyWithMask(const cv::gpu::GpuMat&, cv::gpu::GpuMat&, const cv::gpu::GpuMat&, CUstream_st*); - CV_EXPORTS void convertTo(const cv::gpu::GpuMat&, cv::gpu::GpuMat&); - CV_EXPORTS void convertTo(const cv::gpu::GpuMat&, cv::gpu::GpuMat&, double, double, CUstream_st*); - CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar, CUstream_st*); - CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, CUstream_st*); - CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar); - CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&); -}} - - -namespace cv { namespace gpu -{ - void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream = 0) - { - CV_Assert(src.size() == dst.size() && src.type() == dst.type()); - CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels())); - - cv::gpu::device::copyToWithMask_gpu(src.reshape(1), dst.reshape(1), src.elemSize1(), src.channels(), mask.reshape(1), mask.channels() != 1, stream); - } - - void convertTo(const GpuMat& src, GpuMat& dst) - { - cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0, 0); - } - - void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0) - { - cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), alpha, beta, stream); - } - - void setTo(GpuMat& src, Scalar s, cudaStream_t stream) - { - typedef void (*caller_t)(GpuMat& src, Scalar s, cudaStream_t stream); - - static const caller_t callers[] = - { - kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, - kernelSetCaller, kernelSetCaller - }; - - callers[src.depth()](src, s, stream); - } - - void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream) - { - typedef void (*caller_t)(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream); - - static const caller_t callers[] = - { - kernelSetCaller, kernelSetCaller, kernelSetCaller, 
kernelSetCaller, kernelSetCaller, - kernelSetCaller, kernelSetCaller - }; - - callers[src.depth()](src, s, mask, stream); - } - - void setTo(GpuMat& src, Scalar s) - { - setTo(src, s, 0); - } - - void setTo(GpuMat& src, Scalar s, const GpuMat& mask) - { - setTo(src, s, mask, 0); - } -}} - -namespace -{ - template struct NPPTypeTraits; - template<> struct NPPTypeTraits { typedef Npp8u npp_type; }; - template<> struct NPPTypeTraits { typedef Npp8s npp_type; }; - template<> struct NPPTypeTraits { typedef Npp16u npp_type; }; - template<> struct NPPTypeTraits { typedef Npp16s npp_type; }; - template<> struct NPPTypeTraits { typedef Npp32s npp_type; }; - template<> struct NPPTypeTraits { typedef Npp32f npp_type; }; - template<> struct NPPTypeTraits { typedef Npp64f npp_type; }; - - ////////////////////////////////////////////////////////////////////////// - // Convert - - template struct NppConvertFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - typedef typename NPPTypeTraits::npp_type dst_t; - - typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI); - }; - template struct NppConvertFunc - { - typedef typename NPPTypeTraits::npp_type dst_t; - - typedef NppStatus (*func_ptr)(const Npp32f* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI, NppRoundMode eRoundMode); - }; - - template::func_ptr func> struct NppCvt - { - typedef typename NPPTypeTraits::npp_type src_t; - typedef typename NPPTypeTraits::npp_type dst_t; - - static void call(const GpuMat& src, GpuMat& dst) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - template::func_ptr func> struct NppCvt - { - typedef typename NPPTypeTraits::npp_type dst_t; - - static void call(const GpuMat& src, GpuMat& dst) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz, NPP_RND_NEAR) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - - ////////////////////////////////////////////////////////////////////////// - // Set - - template struct NppSetFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - - typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI); - }; - template struct NppSetFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - - typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI); - }; - template struct NppSetFunc - { - typedef NppStatus (*func_ptr)(Npp8s values[], Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI); - }; - template<> struct NppSetFunc - { - typedef NppStatus (*func_ptr)(Npp8s val, Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI); - }; - - template::func_ptr func> struct NppSet - { - typedef typename NPPTypeTraits::npp_type src_t; - - static void call(GpuMat& src, Scalar s) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - Scalar_ nppS = s; - - nppSafeCall( func(nppS.val, src.ptr(), static_cast(src.step), sz) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - template::func_ptr func> struct NppSet - { - typedef typename NPPTypeTraits::npp_type src_t; - - static void call(GpuMat& src, Scalar s) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - Scalar_ nppS = s; - - nppSafeCall( func(nppS[0], src.ptr(), static_cast(src.step), sz) ); - - 
cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - - template struct NppSetMaskFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - - typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); - }; - template struct NppSetMaskFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - - typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); - }; - - template::func_ptr func> struct NppSetMask - { - typedef typename NPPTypeTraits::npp_type src_t; - - static void call(GpuMat& src, Scalar s, const GpuMat& mask) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - Scalar_ nppS = s; - - nppSafeCall( func(nppS.val, src.ptr(), static_cast(src.step), sz, mask.ptr(), static_cast(mask.step)) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - template::func_ptr func> struct NppSetMask - { - typedef typename NPPTypeTraits::npp_type src_t; - - static void call(GpuMat& src, Scalar s, const GpuMat& mask) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - Scalar_ nppS = s; - - nppSafeCall( func(nppS[0], src.ptr(), static_cast(src.step), sz, mask.ptr(), static_cast(mask.step)) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - - ////////////////////////////////////////////////////////////////////////// - // CopyMasked - - template struct NppCopyMaskedFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - - typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, src_t* pDst, int nDstStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); - }; - - template::func_ptr func> struct NppCopyMasked - { - typedef typename NPPTypeTraits::npp_type src_t; - - static void call(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t /*stream*/) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz, mask.ptr(), static_cast(mask.step)) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - - template static inline bool isAligned(const T* ptr, size_t size) - { - return reinterpret_cast(ptr) % size == 0; - } - - ////////////////////////////////////////////////////////////////////////// - // CudaFuncTable - - class CudaFuncTable : public GpuFuncTable - { - public: - void copy(const Mat& src, GpuMat& dst) const - { - cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyHostToDevice) ); - } - void copy(const GpuMat& src, Mat& dst) const - { - cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToHost) ); - } - void copy(const GpuMat& src, GpuMat& dst) const - { - cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToDevice) ); - } - - void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const - { - CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); - CV_Assert(src.size() == dst.size() && src.type() == dst.type()); - CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels())); - - if (src.depth() == CV_64F) - { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) - CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); - } - - typedef void (*func_t)(const GpuMat& 
src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream); - static const func_t funcs[7][4] = - { - /* 8U */ {NppCopyMasked::call, cv::gpu::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 8S */ {cv::gpu::copyWithMask , cv::gpu::copyWithMask, cv::gpu::copyWithMask , cv::gpu::copyWithMask }, - /* 16U */ {NppCopyMasked::call, cv::gpu::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 16S */ {NppCopyMasked::call, cv::gpu::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 32S */ {NppCopyMasked::call, cv::gpu::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 32F */ {NppCopyMasked::call, cv::gpu::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 64F */ {cv::gpu::copyWithMask , cv::gpu::copyWithMask, cv::gpu::copyWithMask , cv::gpu::copyWithMask } - }; - - const func_t func = mask.channels() == src.channels() ? funcs[src.depth()][src.channels() - 1] : cv::gpu::copyWithMask; - - func(src, dst, mask, 0); - } - - void convert(const GpuMat& src, GpuMat& dst) const - { - typedef void (*func_t)(const GpuMat& src, GpuMat& dst); - static const func_t funcs[7][7][4] = - { - { - /* 8U -> 8U */ {0, 0, 0, 0}, - /* 8U -> 8S */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 8U -> 16U */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, NppCvt::call}, - /* 8U -> 16S */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, NppCvt::call}, - /* 8U -> 32S */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 8U -> 32F */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 8U -> 64F */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo } - }, - { - /* 8S -> 8U */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 8S -> 8S */ {0,0,0,0}, - /* 8S -> 16U */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 8S -> 16S */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 8S -> 32S */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 8S -> 32F */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 8S -> 64F */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo} - }, - { - /* 16U -> 8U */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, NppCvt::call}, - /* 16U -> 8S */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 16U -> 16U */ {0,0,0,0}, - /* 16U -> 16S */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 16U -> 32S */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 16U -> 32F */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 16U -> 64F */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo } - }, - { - /* 16S -> 8U */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, NppCvt::call}, - /* 16S -> 8S */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 16S -> 16U */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 16S -> 16S */ {0,0,0,0}, - /* 16S -> 32S */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 16S -> 32F */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 16S -> 64F */ 
{cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo } - }, - { - /* 32S -> 8U */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32S -> 8S */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32S -> 16U */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32S -> 16S */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32S -> 32S */ {0,0,0,0}, - /* 32S -> 32F */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32S -> 64F */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo} - }, - { - /* 32F -> 8U */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32F -> 8S */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32F -> 16U */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32F -> 16S */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32F -> 32S */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32F -> 32F */ {0,0,0,0}, - /* 32F -> 64F */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo} - }, - { - /* 64F -> 8U */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 64F -> 8S */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 64F -> 16U */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 64F -> 16S */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 64F -> 32S */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 64F -> 32F */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 64F -> 64F */ {0,0,0,0} - } - }; - - CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); - CV_Assert(dst.depth() <= CV_64F); - CV_Assert(src.size() == dst.size() && src.channels() == dst.channels()); - - if (src.depth() == CV_64F || dst.depth() == CV_64F) - { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) - CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); - } - - bool aligned = isAligned(src.data, 16) && isAligned(dst.data, 16); - if (!aligned) - { - cv::gpu::convertTo(src, dst); - return; - } - - const func_t func = funcs[src.depth()][dst.depth()][src.channels() - 1]; - CV_DbgAssert(func != 0); - - func(src, dst); - } - - void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta) const - { - CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); - CV_Assert(dst.depth() <= CV_64F); - - if (src.depth() == CV_64F || dst.depth() == CV_64F) - { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) - CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); - } - - cv::gpu::convertTo(src, dst, alpha, beta); - } - - void setTo(GpuMat& m, Scalar s, const GpuMat& mask) const - { - if (mask.empty()) - { - if (s[0] == 0.0 && s[1] == 0.0 && s[2] == 0.0 && s[3] == 0.0) - { - cudaSafeCall( cudaMemset2D(m.data, m.step, 0, m.cols * m.elemSize(), m.rows) ); - return; - } - - if (m.depth() == CV_8U) - { - int cn = m.channels(); - - if (cn == 1 || (cn == 2 && s[0] == s[1]) || (cn == 3 && s[0] == s[1] && s[0] == s[2]) || (cn == 4 && s[0] == s[1] && 
s[0] == s[2] && s[0] == s[3])) - { - int val = saturate_cast(s[0]); - cudaSafeCall( cudaMemset2D(m.data, m.step, val, m.cols * m.elemSize(), m.rows) ); - return; - } - } - - typedef void (*func_t)(GpuMat& src, Scalar s); - static const func_t funcs[7][4] = - { - {NppSet::call, cv::gpu::setTo , cv::gpu::setTo , NppSet::call}, - {cv::gpu::setTo , cv::gpu::setTo , cv::gpu::setTo , cv::gpu::setTo }, - {NppSet::call, NppSet::call, cv::gpu::setTo , NppSet::call}, - {NppSet::call, NppSet::call, cv::gpu::setTo , NppSet::call}, - {NppSet::call, cv::gpu::setTo , cv::gpu::setTo , NppSet::call}, - {NppSet::call, cv::gpu::setTo , cv::gpu::setTo , NppSet::call}, - {cv::gpu::setTo , cv::gpu::setTo , cv::gpu::setTo , cv::gpu::setTo } - }; - - CV_Assert(m.depth() <= CV_64F && m.channels() <= 4); - - if (m.depth() == CV_64F) - { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) - CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); - } - - funcs[m.depth()][m.channels() - 1](m, s); - } - else - { - typedef void (*func_t)(GpuMat& src, Scalar s, const GpuMat& mask); - static const func_t funcs[7][4] = - { - {NppSetMask::call, cv::gpu::setTo, cv::gpu::setTo, NppSetMask::call}, - {cv::gpu::setTo , cv::gpu::setTo, cv::gpu::setTo, cv::gpu::setTo }, - {NppSetMask::call, cv::gpu::setTo, cv::gpu::setTo, NppSetMask::call}, - {NppSetMask::call, cv::gpu::setTo, cv::gpu::setTo, NppSetMask::call}, - {NppSetMask::call, cv::gpu::setTo, cv::gpu::setTo, NppSetMask::call}, - {NppSetMask::call, cv::gpu::setTo, cv::gpu::setTo, NppSetMask::call}, - {cv::gpu::setTo , cv::gpu::setTo, cv::gpu::setTo, cv::gpu::setTo } - }; - - CV_Assert(m.depth() <= CV_64F && m.channels() <= 4); - - if (m.depth() == CV_64F) - { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) - CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); - } - - funcs[m.depth()][m.channels() - 1](m, s, mask); - } - } - - void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const - { - cudaSafeCall( cudaMallocPitch(devPtr, step, width, height) ); - } - - void free(void* devPtr) const - { - cudaFree(devPtr); - } - }; - - const GpuFuncTable* gpuFuncTable() - { - static CudaFuncTable funcTable; - return &funcTable; - } -} - -#endif // HAVE_CUDA - void cv::gpu::GpuMat::upload(const Mat& m) { CV_DbgAssert(!m.empty()); @@ -1492,9 +470,9 @@ void cv::gpu::GpuMat::convertTo(GpuMat& dst, int rtype, double alpha, double bet dst.create(size(), rtype); if (noScale) - gpuFuncTable()->convert(*psrc, dst); + cv::gpu::convertTo(*psrc, dst); else - gpuFuncTable()->convert(*psrc, dst, alpha, beta); + cv::gpu::convertTo(*psrc, dst, alpha, beta); } GpuMat& cv::gpu::GpuMat::setTo(Scalar s, const GpuMat& mask) @@ -1502,7 +480,7 @@ GpuMat& cv::gpu::GpuMat::setTo(Scalar s, const GpuMat& mask) CV_Assert(mask.empty() || mask.type() == CV_8UC1); CV_DbgAssert(!empty()); - gpuFuncTable()->setTo(*this, s, mask); + gpu::setTo(*this, s, mask); return *this; } @@ -1562,6 +540,43 @@ void cv::gpu::GpuMat::release() refcount = 0; } +#ifdef HAVE_CUDA + +namespace cv { namespace gpu +{ + void convertTo(const GpuMat& src, GpuMat& dst) + { + gpuFuncTable()->convert(src, dst); + } + + void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream) + { + gpuFuncTable()->convert(src, dst, alpha, beta, stream); + } + + void setTo(GpuMat& src, Scalar s, cudaStream_t stream) + { + gpuFuncTable()->setTo(src, s, stream); + } + + void setTo(GpuMat& src, Scalar 
s, const GpuMat& mask, cudaStream_t stream) + { + gpuFuncTable()->setTo(src, s, mask, stream); + } + + void setTo(GpuMat& src, Scalar s) + { + setTo(src, s, 0); + } + + void setTo(GpuMat& src, Scalar s, const GpuMat& mask) + { + setTo(src, s, mask, 0); + } +}} + +#endif + //////////////////////////////////////////////////////////////////////// // Error handling @@ -1578,5 +593,5 @@ void cv::gpu::error(const char *error_string, const char *file, const int line, cerr.flush(); } else - cv::error( cv::Exception(code, error_string, func, file, line) ); + ::cv::error( ::cv::Exception(code, error_string, func, file, line) ); } diff --git a/modules/core/src/gpumat_cuda.hpp b/modules/core/src/gpumat_cuda.hpp new file mode 100644 index 000000000..631d6ea8c --- /dev/null +++ b/modules/core/src/gpumat_cuda.hpp @@ -0,0 +1,1069 @@ +namespace +{ +#if defined(HAVE_CUDA) && !defined(DYNAMIC_CUDA_SUPPORT) + + #define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, CV_Func) + #define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__, CV_Func) + + inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "") + { + if (cudaSuccess != err) + cv::gpu::error(cudaGetErrorString(err), file, line, func); + } + + inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "") + { + if (err < 0) + { + std::ostringstream msg; + msg << "NPP API Call Error: " << err; + cv::gpu::error(msg.str().c_str(), file, line, func); + } + } +#endif +} + +namespace +{ + class GpuFuncTable + { + public: + virtual ~GpuFuncTable() {} + + // DeviceInfo routines + virtual int getCudaEnabledDeviceCount() const = 0; + + virtual void setDevice(int) const = 0; + virtual int getDevice() const = 0; + + virtual void resetDevice() const = 0; + + virtual bool deviceSupports(FeatureSet) const = 0; + + virtual bool builtWith(FeatureSet) const = 0; + virtual bool has(int, int) const = 0; + virtual bool hasPtx(int, int) const = 0; + virtual bool hasBin(int, int) const = 0; + virtual bool hasEqualOrLessPtx(int, int) const = 0; + virtual bool hasEqualOrGreater(int, int) const = 0; + virtual bool hasEqualOrGreaterPtx(int, int) const = 0; + virtual bool hasEqualOrGreaterBin(int, int) const = 0; + + virtual size_t sharedMemPerBlock() const = 0; + virtual void queryMemory(size_t&, size_t&) const = 0; + virtual size_t freeMemory() const = 0; + virtual size_t totalMemory() const = 0; + virtual bool supports(FeatureSet) const = 0; + virtual bool isCompatible() const = 0; + virtual void query() const = 0; + + virtual void printCudaDeviceInfo(int) const = 0; + virtual void printShortCudaDeviceInfo(int) const = 0; + + // GpuMat routines + virtual void copy(const Mat& src, GpuMat& dst) const = 0; + virtual void copy(const GpuMat& src, Mat& dst) const = 0; + virtual void copy(const GpuMat& src, GpuMat& dst) const = 0; + + virtual void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const = 0; + + // gpu::device::convertTo funcs + virtual void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0) const = 0; + virtual void convert(const GpuMat& src, GpuMat& dst) const = 0; + + // for gpu::device::setTo funcs + virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, CUstream_st*) const = 0; + virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, CUstream_st*) const = 0; + + virtual void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const = 0; + virtual void free(void* devPtr) const = 0; + 
}; +} + +#if !defined(HAVE_CUDA) || defined(DYNAMIC_CUDA_SUPPORT) +namespace +{ + class EmptyFuncTable : public GpuFuncTable + { + public: + + // DeviceInfo routines + int getCudaEnabledDeviceCount() const { return 0; } + + void setDevice(int) const { throw_nogpu; } + int getDevice() const { throw_nogpu; return 0; } + + void resetDevice() const { throw_nogpu; } + + bool deviceSupports(FeatureSet) const { throw_nogpu; return false; } + + bool builtWith(FeatureSet) const { throw_nogpu; return false; } + bool has(int, int) const { throw_nogpu; return false; } + bool hasPtx(int, int) const { throw_nogpu; return false; } + bool hasBin(int, int) const { throw_nogpu; return false; } + bool hasEqualOrLessPtx(int, int) const { throw_nogpu; return false; } + bool hasEqualOrGreater(int, int) const { throw_nogpu; return false; } + bool hasEqualOrGreaterPtx(int, int) const { throw_nogpu; return false; } + bool hasEqualOrGreaterBin(int, int) const { throw_nogpu; return false; } + + size_t sharedMemPerBlock() const { throw_nogpu; return 0; } + void queryMemory(size_t&, size_t&) const { throw_nogpu; } + size_t freeMemory() const { throw_nogpu; return 0; } + size_t totalMemory() const { throw_nogpu; return 0; } + bool supports(FeatureSet) const { throw_nogpu; return false; } + bool isCompatible() const { throw_nogpu; return false; } + void query() const { throw_nogpu; } + + void printCudaDeviceInfo(int) const { throw_nogpu; } + void printShortCudaDeviceInfo(int) const { throw_nogpu; } + + void copy(const Mat&, GpuMat&) const { throw_nogpu; } + void copy(const GpuMat&, Mat&) const { throw_nogpu; } + void copy(const GpuMat&, GpuMat&) const { throw_nogpu; } + + void copyWithMask(const GpuMat&, GpuMat&, const GpuMat&) const { throw_nogpu; } + + void convert(const GpuMat&, GpuMat&) const { throw_nogpu; } + void convert(const GpuMat&, GpuMat&, double, double, cudaStream_t stream = 0) const { (void)stream; throw_nogpu; } + + virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, CUstream_st*) const { throw_nogpu; } + virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, CUstream_st*) const { throw_nogpu; } + + void mallocPitch(void**, size_t*, size_t, size_t) const { throw_nogpu; } + void free(void*) const {} + }; +} + +#else + +namespace cv { namespace gpu { namespace device +{ + void copyToWithMask_gpu(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream); + + template + void set_to_gpu(PtrStepSzb mat, const T* scalar, int channels, cudaStream_t stream); + + template + void set_to_gpu(PtrStepSzb mat, const T* scalar, PtrStepSzb mask, int channels, cudaStream_t stream); + + void convert_gpu(PtrStepSzb src, int sdepth, PtrStepSzb dst, int ddepth, double alpha, double beta, cudaStream_t stream); +}}} + +namespace +{ + template void kernelSetCaller(GpuMat& src, Scalar s, cudaStream_t stream) + { + Scalar_ sf = s; + cv::gpu::device::set_to_gpu(src, sf.val, src.channels(), stream); + } + + template void kernelSetCaller(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream) + { + Scalar_ sf = s; + cv::gpu::device::set_to_gpu(src, sf.val, mask, src.channels(), stream); + } +} + +namespace +{ + template struct NPPTypeTraits; + template<> struct NPPTypeTraits { typedef Npp8u npp_type; }; + template<> struct NPPTypeTraits { typedef Npp8s npp_type; }; + template<> struct NPPTypeTraits { typedef Npp16u npp_type; }; + template<> struct NPPTypeTraits { typedef Npp16s npp_type; }; + template<> struct NPPTypeTraits { typedef Npp32s npp_type; }; + 
template<> struct NPPTypeTraits { typedef Npp32f npp_type; }; + template<> struct NPPTypeTraits { typedef Npp64f npp_type; }; + + ////////////////////////////////////////////////////////////////////////// + // Convert + + template struct NppConvertFunc + { + typedef typename NPPTypeTraits::npp_type src_t; + typedef typename NPPTypeTraits::npp_type dst_t; + + typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI); + }; + template struct NppConvertFunc + { + typedef typename NPPTypeTraits::npp_type dst_t; + + typedef NppStatus (*func_ptr)(const Npp32f* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI, NppRoundMode eRoundMode); + }; + + template::func_ptr func> struct NppCvt + { + typedef typename NPPTypeTraits::npp_type src_t; + typedef typename NPPTypeTraits::npp_type dst_t; + + static void call(const GpuMat& src, GpuMat& dst) + { + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz) ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; + template::func_ptr func> struct NppCvt + { + typedef typename NPPTypeTraits::npp_type dst_t; + + static void call(const GpuMat& src, GpuMat& dst) + { + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz, NPP_RND_NEAR) ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; + + ////////////////////////////////////////////////////////////////////////// + // Set + + template struct NppSetFunc + { + typedef typename NPPTypeTraits::npp_type src_t; + + typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI); + }; + template struct NppSetFunc + { + typedef typename NPPTypeTraits::npp_type src_t; + + typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI); + }; + template struct NppSetFunc + { + typedef NppStatus (*func_ptr)(Npp8s values[], Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI); + }; + template<> struct NppSetFunc + { + typedef NppStatus (*func_ptr)(Npp8s val, Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI); + }; + + template::func_ptr func> struct NppSet + { + typedef typename NPPTypeTraits::npp_type src_t; + + static void call(GpuMat& src, Scalar s) + { + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + Scalar_ nppS = s; + + nppSafeCall( func(nppS.val, src.ptr(), static_cast(src.step), sz) ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; + template::func_ptr func> struct NppSet + { + typedef typename NPPTypeTraits::npp_type src_t; + + static void call(GpuMat& src, Scalar s) + { + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + Scalar_ nppS = s; + + nppSafeCall( func(nppS[0], src.ptr(), static_cast(src.step), sz) ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; + + template struct NppSetMaskFunc + { + typedef typename NPPTypeTraits::npp_type src_t; + + typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); + }; + template struct NppSetMaskFunc + { + typedef typename NPPTypeTraits::npp_type src_t; + + typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); + }; + + template::func_ptr func> struct NppSetMask + { + typedef typename NPPTypeTraits::npp_type src_t; + + static void call(GpuMat& src, Scalar s, const GpuMat& mask) + 
{ + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + Scalar_ nppS = s; + + nppSafeCall( func(nppS.val, src.ptr(), static_cast(src.step), sz, mask.ptr(), static_cast(mask.step)) ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; + template::func_ptr func> struct NppSetMask + { + typedef typename NPPTypeTraits::npp_type src_t; + + static void call(GpuMat& src, Scalar s, const GpuMat& mask) + { + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + Scalar_ nppS = s; + + nppSafeCall( func(nppS[0], src.ptr(), static_cast(src.step), sz, mask.ptr(), static_cast(mask.step)) ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; + + ////////////////////////////////////////////////////////////////////////// + // CopyMasked + + template struct NppCopyMaskedFunc + { + typedef typename NPPTypeTraits::npp_type src_t; + + typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, src_t* pDst, int nDstStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); + }; + + template::func_ptr func> struct NppCopyMasked + { + typedef typename NPPTypeTraits::npp_type src_t; + + static void call(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t /*stream*/) + { + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz, mask.ptr(), static_cast(mask.step)) ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; + + template static inline bool isAligned(const T* ptr, size_t size) + { + return reinterpret_cast(ptr) % size == 0; + } +} + + namespace cv { namespace gpu { namespace devices + { + void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream = 0) + { + CV_Assert(src.size() == dst.size() && src.type() == dst.type()); + CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels())); + + cv::gpu::device::copyToWithMask_gpu(src.reshape(1), dst.reshape(1), src.elemSize1(), src.channels(), mask.reshape(1), mask.channels() != 1, stream); + } + + void convertTo(const GpuMat& src, GpuMat& dst) + { + cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0, 0); + } + + void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0) + { + cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), alpha, beta, stream); + } + + void setTo(GpuMat& src, Scalar s, cudaStream_t stream) + { + typedef void (*caller_t)(GpuMat& src, Scalar s, cudaStream_t stream); + + static const caller_t callers[] = + { + kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, + kernelSetCaller, kernelSetCaller + }; + + callers[src.depth()](src, s, stream); + } + + void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream) + { + typedef void (*caller_t)(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream); + + static const caller_t callers[] = + { + kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, + kernelSetCaller, kernelSetCaller + }; + + callers[src.depth()](src, s, mask, stream); + } + + void setTo(GpuMat& src, Scalar s) + { + setTo(src, s, 0); + } + + void setTo(GpuMat& src, Scalar s, const GpuMat& mask) + { + setTo(src, s, mask, 0); + } + }} + +namespace +{ + class CudaFuncTable : public GpuFuncTable + { + protected: + + class CudaArch + { + public: + CudaArch(); + + bool builtWith(FeatureSet feature_set) 
const; + bool hasPtx(int major, int minor) const; + bool hasBin(int major, int minor) const; + bool hasEqualOrLessPtx(int major, int minor) const; + bool hasEqualOrGreaterPtx(int major, int minor) const; + bool hasEqualOrGreaterBin(int major, int minor) const; + + private: + static void fromStr(const string& set_as_str, vector& arr); + + vector bin; + vector ptx; + vector features; + }; + + const CudaArch cudaArch; + + CudaArch::CudaArch() + { + fromStr(CUDA_ARCH_BIN, bin); + fromStr(CUDA_ARCH_PTX, ptx); + fromStr(CUDA_ARCH_FEATURES, features); + } + + bool CudaArch::builtWith(FeatureSet feature_set) const + { + return !features.empty() && (features.back() >= feature_set); + } + + bool CudaArch::hasPtx(int major, int minor) const + { + return find(ptx.begin(), ptx.end(), major * 10 + minor) != ptx.end(); + } + + bool CudaArch::hasBin(int major, int minor) const + { + return find(bin.begin(), bin.end(), major * 10 + minor) != bin.end(); + } + + bool CudaArch::hasEqualOrLessPtx(int major, int minor) const + { + return !ptx.empty() && (ptx.front() <= major * 10 + minor); + } + + bool CudaArch::hasEqualOrGreaterPtx(int major, int minor) const + { + return !ptx.empty() && (ptx.back() >= major * 10 + minor); + } + + bool CudaArch::hasEqualOrGreaterBin(int major, int minor) const + { + return !bin.empty() && (bin.back() >= major * 10 + minor); + } + + void CudaArch::fromStr(const string& set_as_str, vector& arr) + { + if (set_as_str.find_first_not_of(" ") == string::npos) + return; + + istringstream stream(set_as_str); + int cur_value; + + while (!stream.eof()) + { + stream >> cur_value; + arr.push_back(cur_value); + } + + sort(arr.begin(), arr.end()); + } + + class DeviceProps + { + public: + DeviceProps(); + ~DeviceProps(); + + cudaDeviceProp* get(int devID); + + private: + std::vector props_; + }; + + DeviceProps::DeviceProps() + { + props_.resize(10, 0); + } + + DeviceProps::~DeviceProps() + { + for (size_t i = 0; i < props_.size(); ++i) + { + if (props_[i]) + delete props_[i]; + } + props_.clear(); + } + + cudaDeviceProp* DeviceProps::get(int devID) + { + if (devID >= (int) props_.size()) + props_.resize(devID + 5, 0); + + if (!props_[devID]) + { + props_[devID] = new cudaDeviceProp; + cudaSafeCall( cudaGetDeviceProperties(props_[devID], devID) ); + } + + return props_[devID]; + } + + DeviceProps deviceProps; + + int convertSMVer2Cores(int major, int minor) + { + // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM + typedef struct { + int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version + int Cores; + } SMtoCores; + + SMtoCores gpuArchCoresPerSM[] = { { 0x10, 8 }, { 0x11, 8 }, { 0x12, 8 }, { 0x13, 8 }, { 0x20, 32 }, { 0x21, 48 }, {0x30, 192}, {0x35, 192}, { -1, -1 } }; + + int index = 0; + while (gpuArchCoresPerSM[index].SM != -1) + { + if (gpuArchCoresPerSM[index].SM == ((major << 4) + minor) ) + return gpuArchCoresPerSM[index].Cores; + index++; + } + + return -1; + } + + public: + + int getCudaEnabledDeviceCount() const + { + int count; + cudaError_t error = cudaGetDeviceCount( &count ); + + if (error == cudaErrorInsufficientDriver) + return -1; + + if (error == cudaErrorNoDevice) + return 0; + + cudaSafeCall( error ); + return count; + } + + void setDevice(int device) const + { + cudaSafeCall( cudaSetDevice( device ) ); + } + + int getDevice() const + { + int device; + cudaSafeCall( cudaGetDevice( &device ) ); + return device; + } + + void resetDevice() const + { + cudaSafeCall( cudaDeviceReset() ); + } + + bool 
TargetArchs::builtWith(FeatureSet feature_set) const + { + return cudaArch.builtWith(feature_set); + } + + bool TargetArchs::has(int major, int minor) const + { + return hasPtx(major, minor) || hasBin(major, minor); + } + + bool TargetArchs::hasPtx(int major, int minor) const + { + return cudaArch.hasPtx(major, minor); + } + + bool TargetArchs::hasBin(int major, int minor) const + { + return cudaArch.hasBin(major, minor); + } + + bool TargetArchs::hasEqualOrLessPtx(int major, int minor) const + { + return cudaArch.hasEqualOrLessPtx(major, minor); + } + + bool TargetArchs::hasEqualOrGreater(int major, int minor) const + { + return hasEqualOrGreaterPtx(major, minor) || hasEqualOrGreaterBin(major, minor); + } + + bool TargetArchs::hasEqualOrGreaterPtx(int major, int minor) const + { + return cudaArch.hasEqualOrGreaterPtx(major, minor); + } + + bool TargetArchs::hasEqualOrGreaterBin(int major, int minor) const + { + return cudaArch.hasEqualOrGreaterBin(major, minor); + } + + bool deviceSupports(FeatureSet feature_set) const + { + static int versions[] = + { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 + }; + static const int cache_size = static_cast(sizeof(versions) / sizeof(versions[0])); + + const int devId = getDevice(); + + int version; + + if (devId < cache_size && versions[devId] >= 0) + version = versions[devId]; + else + { + DeviceInfo dev(devId); + version = dev.majorVersion() * 10 + dev.minorVersion(); + if (devId < cache_size) + versions[devId] = version; + } + + return TargetArchs::builtWith(feature_set) && (version >= feature_set); + } + + size_t sharedMemPerBlock() const + { + return deviceProps.get(device_id_)->sharedMemPerBlock; + } + + void queryMemory(size_t& _totalMemory, size_t& _freeMemory) const + { + int prevDeviceID = getDevice(); + if (prevDeviceID != device_id_) + setDevice(device_id_); + + cudaSafeCall( cudaMemGetInfo(&_freeMemory, &_totalMemory) ); + + if (prevDeviceID != device_id_) + setDevice(prevDeviceID); + } + + size_t freeMemory() const + { + size_t _totalMemory, _freeMemory; + queryMemory(_totalMemory, _freeMemory); + return _freeMemory; + } + + size_t totalMemory() const + { + size_t _totalMemory, _freeMemory; + queryMemory(_totalMemory, _freeMemory); + return _totalMemory; + } + + bool supports(FeatureSet feature_set) const + { + int version = majorVersion() * 10 + minorVersion(); + return version >= feature_set; + } + + bool isCompatible() const + { + // Check PTX compatibility + if (TargetArchs::hasEqualOrLessPtx(majorVersion(), minorVersion())) + return true; + + // Check BIN compatibility + for (int i = minorVersion(); i >= 0; --i) + if (TargetArchs::hasBin(majorVersion(), i)) + return true; + + return false; + } + + void query() const + { + const cudaDeviceProp* prop = deviceProps.get(device_id_); + + name_ = prop->name; + multi_processor_count_ = prop->multiProcessorCount; + majorVersion_ = prop->major; + minorVersion_ = prop->minor; + } + + void printCudaDeviceInfo(int device) const + { + int count = getCudaEnabledDeviceCount(); + bool valid = (device >= 0) && (device < count); + + int beg = valid ? device : 0; + int end = valid ? 
device+1 : count; + + printf("*** CUDA Device Query (Runtime API) version (CUDART static linking) *** \n\n"); + printf("Device count: %d\n", count); + + int driverVersion = 0, runtimeVersion = 0; + cudaSafeCall( cudaDriverGetVersion(&driverVersion) ); + cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) ); + + const char *computeMode[] = { + "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)", + "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)", + "Prohibited (no host thread can use ::cudaSetDevice() with this device)", + "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)", + "Unknown", + NULL + }; + + for(int dev = beg; dev < end; ++dev) + { + cudaDeviceProp prop; + cudaSafeCall( cudaGetDeviceProperties(&prop, dev) ); + + printf("\nDevice %d: \"%s\"\n", dev, prop.name); + printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100); + printf(" CUDA Capability Major/Minor version number: %d.%d\n", prop.major, prop.minor); + printf(" Total amount of global memory: %.0f MBytes (%llu bytes)\n", (float)prop.totalGlobalMem/1048576.0f, (unsigned long long) prop.totalGlobalMem); + + int cores = convertSMVer2Cores(prop.major, prop.minor); + if (cores > 0) + printf(" (%2d) Multiprocessors x (%2d) CUDA Cores/MP: %d CUDA Cores\n", prop.multiProcessorCount, cores, cores * prop.multiProcessorCount); + + printf(" GPU Clock Speed: %.2f GHz\n", prop.clockRate * 1e-6f); + + printf(" Max Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n", + prop.maxTexture1D, prop.maxTexture2D[0], prop.maxTexture2D[1], + prop.maxTexture3D[0], prop.maxTexture3D[1], prop.maxTexture3D[2]); + printf(" Max Layered Texture Size (dim) x layers 1D=(%d) x %d, 2D=(%d,%d) x %d\n", + prop.maxTexture1DLayered[0], prop.maxTexture1DLayered[1], + prop.maxTexture2DLayered[0], prop.maxTexture2DLayered[1], prop.maxTexture2DLayered[2]); + + printf(" Total amount of constant memory: %u bytes\n", (int)prop.totalConstMem); + printf(" Total amount of shared memory per block: %u bytes\n", (int)prop.sharedMemPerBlock); + printf(" Total number of registers available per block: %d\n", prop.regsPerBlock); + printf(" Warp size: %d\n", prop.warpSize); + printf(" Maximum number of threads per block: %d\n", prop.maxThreadsPerBlock); + printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]); + printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]); + printf(" Maximum memory pitch: %u bytes\n", (int)prop.memPitch); + printf(" Texture alignment: %u bytes\n", (int)prop.textureAlignment); + + printf(" Concurrent copy and execution: %s with %d copy engine(s)\n", (prop.deviceOverlap ? "Yes" : "No"), prop.asyncEngineCount); + printf(" Run time limit on kernels: %s\n", prop.kernelExecTimeoutEnabled ? "Yes" : "No"); + printf(" Integrated GPU sharing Host Memory: %s\n", prop.integrated ? "Yes" : "No"); + printf(" Support host page-locked memory mapping: %s\n", prop.canMapHostMemory ? "Yes" : "No"); + + printf(" Concurrent kernel execution: %s\n", prop.concurrentKernels ? "Yes" : "No"); + printf(" Alignment requirement for Surfaces: %s\n", prop.surfaceAlignment ? "Yes" : "No"); + printf(" Device has ECC support enabled: %s\n", prop.ECCEnabled ? 
"Yes" : "No"); + printf(" Device is using TCC driver mode: %s\n", prop.tccDriver ? "Yes" : "No"); + printf(" Device supports Unified Addressing (UVA): %s\n", prop.unifiedAddressing ? "Yes" : "No"); + printf(" Device PCI Bus ID / PCI location ID: %d / %d\n", prop.pciBusID, prop.pciDeviceID ); + printf(" Compute Mode:\n"); + printf(" %s \n", computeMode[prop.computeMode]); + } + + printf("\n"); + printf("deviceQuery, CUDA Driver = CUDART"); + printf(", CUDA Driver Version = %d.%d", driverVersion / 1000, driverVersion % 100); + printf(", CUDA Runtime Version = %d.%d", runtimeVersion/1000, runtimeVersion%100); + printf(", NumDevs = %d\n\n", count); + fflush(stdout); + } + + void printShortCudaDeviceInfo(int device) const + { + int count = getCudaEnabledDeviceCount(); + bool valid = (device >= 0) && (device < count); + + int beg = valid ? device : 0; + int end = valid ? device+1 : count; + + int driverVersion = 0, runtimeVersion = 0; + cudaSafeCall( cudaDriverGetVersion(&driverVersion) ); + cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) ); + + for(int dev = beg; dev < end; ++dev) + { + cudaDeviceProp prop; + cudaSafeCall( cudaGetDeviceProperties(&prop, dev) ); + + const char *arch_str = prop.major < 2 ? " (not Fermi)" : ""; + printf("Device %d: \"%s\" %.0fMb", dev, prop.name, (float)prop.totalGlobalMem/1048576.0f); + printf(", sm_%d%d%s", prop.major, prop.minor, arch_str); + + int cores = convertSMVer2Cores(prop.major, prop.minor); + if (cores > 0) + printf(", %d cores", cores * prop.multiProcessorCount); + + printf(", Driver/Runtime ver.%d.%d/%d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100); + } + fflush(stdout); + } + + void copy(const Mat& src, GpuMat& dst) const + { + cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyHostToDevice) ); + } + void copy(const GpuMat& src, Mat& dst) const + { + cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToHost) ); + } + void copy(const GpuMat& src, GpuMat& dst) const + { + cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToDevice) ); + } + + void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const + { + CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); + CV_Assert(src.size() == dst.size() && src.type() == dst.type()); + CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels())); + + if (src.depth() == CV_64F) + { + if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) + CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); + } + + typedef void (*func_t)(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream); + static const func_t funcs[7][4] = + { + /* 8U */ {NppCopyMasked::call, cv::gpu::details::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, + /* 8S */ {cv::gpu::details::copyWithMask , cv::gpu::details::copyWithMask, cv::gpu::details::copyWithMask , cv::gpu::details::copyWithMask }, + /* 16U */ {NppCopyMasked::call, cv::gpu::details::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, + /* 16S */ {NppCopyMasked::call, cv::gpu::details::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, + /* 32S */ {NppCopyMasked::call, cv::gpu::details::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, + /* 32F */ {NppCopyMasked::call, 
cv::gpu::details::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, + /* 64F */ {cv::gpu::details::copyWithMask , cv::gpu::details::copyWithMask, cv::gpu::details::copyWithMask , cv::gpu::details::copyWithMask } + }; + + const func_t func = mask.channels() == src.channels() ? funcs[src.depth()][src.channels() - 1] : cv::gpu::details::copyWithMask; + + func(src, dst, mask, 0); + } + + void convert(const GpuMat& src, GpuMat& dst) const + { + typedef void (*func_t)(const GpuMat& src, GpuMat& dst); + static const func_t funcs[7][7][4] = + { + { + /* 8U -> 8U */ {0, 0, 0, 0}, + /* 8U -> 8S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 8U -> 16U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt::call}, + /* 8U -> 16S */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt::call}, + /* 8U -> 32S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 8U -> 32F */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 8U -> 64F */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo } + }, + { + /* 8S -> 8U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 8S -> 8S */ {0,0,0,0}, + /* 8S -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 8S -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 8S -> 32S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 8S -> 32F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 8S -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo} + }, + { + /* 16U -> 8U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt::call}, + /* 16U -> 8S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 16U -> 16U */ {0,0,0,0}, + /* 16U -> 16S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 16U -> 32S */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 16U -> 32F */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 16U -> 64F */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo } + }, + { + /* 16S -> 8U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt::call}, + /* 16S -> 8S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 16S -> 16U */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 16S -> 16S */ {0,0,0,0}, + /* 16S -> 32S */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 16S -> 32F */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, 
cv::gpu::device::convertTo }, + /* 16S -> 64F */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo } + }, + { + /* 32S -> 8U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32S -> 8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32S -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32S -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32S -> 32S */ {0,0,0,0}, + /* 32S -> 32F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32S -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo} + }, + { + /* 32F -> 8U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32F -> 8S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32F -> 16U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32F -> 16S */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32F -> 32S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32F -> 32F */ {0,0,0,0}, + /* 32F -> 64F */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo} + }, + { + /* 64F -> 8U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 64F -> 8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 64F -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 64F -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 64F -> 32S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 64F -> 32F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 64F -> 64F */ {0,0,0,0} + } + }; + + CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); + CV_Assert(dst.depth() <= CV_64F); + CV_Assert(src.size() == dst.size() && src.channels() == dst.channels()); + + if (src.depth() == CV_64F || dst.depth() == CV_64F) + { + if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) + CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); + } + + bool aligned = isAligned(src.data, 16) && isAligned(dst.data, 16); + if (!aligned) + { + cv::gpu::device::convertTo(src, dst); + return; + } + + const func_t func = funcs[src.depth()][dst.depth()][src.channels() - 1]; + CV_DbgAssert(func != 0); + + func(src, dst); + } + + void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta) const + { + CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); + CV_Assert(dst.depth() <= CV_64F); + + if (src.depth() == CV_64F || dst.depth() == CV_64F) + { + if 
(!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) + CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); + } + + cv::gpu::device::convertTo(src, dst, alpha, beta); + } + + void setTo(GpuMat& m, Scalar s, const GpuMat& mask) const + { + if (mask.empty()) + { + if (s[0] == 0.0 && s[1] == 0.0 && s[2] == 0.0 && s[3] == 0.0) + { + cudaSafeCall( cudaMemset2D(m.data, m.step, 0, m.cols * m.elemSize(), m.rows) ); + return; + } + + if (m.depth() == CV_8U) + { + int cn = m.channels(); + + if (cn == 1 || (cn == 2 && s[0] == s[1]) || (cn == 3 && s[0] == s[1] && s[0] == s[2]) || (cn == 4 && s[0] == s[1] && s[0] == s[2] && s[0] == s[3])) + { + int val = saturate_cast(s[0]); + cudaSafeCall( cudaMemset2D(m.data, m.step, val, m.cols * m.elemSize(), m.rows) ); + return; + } + } + + typedef void (*func_t)(GpuMat& src, Scalar s); + static const func_t funcs[7][4] = + { + {NppSet::call, cv::gpu::device::setTo , cv::gpu::device::setTo , NppSet::call}, + {cv::gpu::device::setTo , cv::gpu::device::setTo , cv::gpu::device::setTo , cv::gpu::device::setTo }, + {NppSet::call, NppSet::call, cv::gpu::device::setTo , NppSet::call}, + {NppSet::call, NppSet::call, cv::gpu::device::setTo , NppSet::call}, + {NppSet::call, cv::gpu::device::setTo , cv::gpu::device::setTo , NppSet::call}, + {NppSet::call, cv::gpu::device::setTo , cv::gpu::device::setTo , NppSet::call}, + {cv::gpu::device::setTo , cv::gpu::device::setTo , cv::gpu::device::setTo , cv::gpu::device::setTo } + }; + + CV_Assert(m.depth() <= CV_64F && m.channels() <= 4); + + if (m.depth() == CV_64F) + { + if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) + CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); + } + + funcs[m.depth()][m.channels() - 1](m, s); + } + else + { + typedef void (*func_t)(GpuMat& src, Scalar s, const GpuMat& mask); + static const func_t funcs[7][4] = + { + {NppSetMask::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask::call}, + {cv::gpu::device::setTo , cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo }, + {NppSetMask::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask::call}, + {NppSetMask::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask::call}, + {NppSetMask::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask::call}, + {NppSetMask::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask::call}, + {cv::gpu::device::setTo , cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo } + }; + + CV_Assert(m.depth() <= CV_64F && m.channels() <= 4); + + if (m.depth() == CV_64F) + { + if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) + CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); + } + + funcs[m.depth()][m.channels() - 1](m, s, mask); + } + } + + void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const + { + cudaSafeCall( cudaMallocPitch(devPtr, step, width, height) ); + } + + void free(void* devPtr) const + { + cudaFree(devPtr); + } + }; +} +#endif \ No newline at end of file From 8660e048bc12c348ccfc17d42e97ea7af3aa34b0 Mon Sep 17 00:00:00 2001 From: Alexander Smorkalov Date: Fri, 13 Dec 2013 17:28:29 +0400 Subject: [PATCH 02/13] Dynamic CUDA support library loading implemented for Linux. Logical mistake in macro fixed; DeviceInfo deligate reimplemented; Build and warning fixes. 
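The loading scheme this patch introduces reduces to the following sketch (simplified from the diff below; POSIX dlopen/dlsym only, with the error handling and the second exported symbol, deviceInfoFactory, elided). The CUDA support library exports plain C factory functions; the core library resolves them at run time and falls back to the stub tables when the library is absent. The helper name tryLoadCudaTable is illustrative, not from the patch:

    // Minimal sketch of the factory lookup, assuming POSIX <dlfcn.h>.
    #include <dlfcn.h>

    typedef GpuFuncTable* (*GpuFactoryType)();

    static GpuFuncTable* tryLoadCudaTable()
    {
        void* handle = dlopen("libopencv_core_cuda.so", RTLD_LAZY);
        if (!handle)
            return NULL; // caller falls back to EmptyFuncTable

        GpuFactoryType factory = (GpuFactoryType)dlsym(handle, "gpuFactory");
        return factory ? factory() : NULL; // the table is a static singleton inside the support library
    }

The real loadCudaSupportLib() below also resolves deviceInfoFactory and reports failures; the returned tables are static objects, so the resolved pointers stay valid for the life of the process.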
--- modules/core/CMakeLists.txt | 68 +++- modules/core/cuda/CMakeLists.txt | 3 +- modules/core/cuda/main.cpp | 29 +- modules/core/include/opencv2/core/gpumat.hpp | 3 + modules/core/src/gpumat.cpp | 97 ++++- modules/core/src/gpumat_cuda.hpp | 384 +++++++++---------- 6 files changed, 357 insertions(+), 227 deletions(-) diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt index 595198292..a7a997f67 100644 --- a/modules/core/CMakeLists.txt +++ b/modules/core/CMakeLists.txt @@ -1,36 +1,76 @@ set(the_description "The Core Functionality") +macro(ocv_glob_module_sources_no_cuda) + file(GLOB_RECURSE lib_srcs "src/*.cpp") + file(GLOB_RECURSE lib_int_hdrs "src/*.hpp" "src/*.h") + file(GLOB lib_hdrs "include/opencv2/${name}/*.hpp" "include/opencv2/${name}/*.h") + file(GLOB lib_hdrs_detail "include/opencv2/${name}/detail/*.hpp" "include/opencv2/${name}/detail/*.h") + + set(cuda_objs "") + set(lib_cuda_hdrs "") + if(HAVE_CUDA) + ocv_include_directories(${CUDA_INCLUDE_DIRS}) + file(GLOB lib_cuda_hdrs "src/cuda/*.hpp") + endif() + + source_group("Src" FILES ${lib_srcs} ${lib_int_hdrs}) + + file(GLOB cl_kernels "src/opencl/*.cl") + if(HAVE_opencv_ocl AND cl_kernels) + ocv_include_directories(${OPENCL_INCLUDE_DIRS}) + add_custom_command( + OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.hpp" + COMMAND ${CMAKE_COMMAND} -DCL_DIR="${CMAKE_CURRENT_SOURCE_DIR}/src/opencl" -DOUTPUT="${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" -P "${OpenCV_SOURCE_DIR}/cmake/cl2cpp.cmake" + DEPENDS ${cl_kernels} "${OpenCV_SOURCE_DIR}/cmake/cl2cpp.cmake") + source_group("OpenCL" FILES ${cl_kernels} "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.hpp") + list(APPEND lib_srcs ${cl_kernels} "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.hpp") + endif() + + source_group("Include" FILES ${lib_hdrs}) + source_group("Include\\detail" FILES ${lib_hdrs_detail}) + + ocv_set_module_sources(${ARGN} HEADERS ${lib_hdrs} ${lib_hdrs_detail} + SOURCES ${lib_srcs} ${lib_int_hdrs} ${cuda_objs} ${lib_cuda_hdrs}) +endmacro() + +ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES}) +ocv_module_include_directories(${ZLIB_INCLUDE_DIR}) + if(HAVE_WINRT) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /ZW /GS /Gm- /AI\"${WINDOWS_SDK_PATH}/References/CommonConfiguration/Neutral\" /AI\"${VISUAL_STUDIO_PATH}/vcpackages\"") endif() +if(DYNAMIC_CUDA_SUPPORT) + add_definitions(-DDYNAMIC_CUDA_SUPPORT) +else() + add_definitions(-DUSE_CUDA) +endif() + +if(HAVE_CUDA) + ocv_include_directories("${OpenCV_SOURCE_DIR}/modules/gpu/include") + ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef) +endif() + file(GLOB lib_cuda_hdrs "include/opencv2/${name}/cuda/*.hpp" "include/opencv2/${name}/cuda/*.h") file(GLOB lib_cuda_hdrs_detail "include/opencv2/${name}/cuda/detail/*.hpp" "include/opencv2/${name}/cuda/detail/*.h") source_group("Cuda Headers" FILES ${lib_cuda_hdrs}) source_group("Cuda Headers\\Detail" FILES ${lib_cuda_hdrs_detail}) -if(DYNAMIC_CUDA_SUPPORT) - add_definitions(-DDYNAMIC_CUDA_SUPPORT) +if (DYNAMIC_CUDA_SUPPORT) + ocv_glob_module_sources_no_cuda(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc" + HEADERS ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail}) +else() + ocv_glob_module_sources(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc" + HEADERS ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail}) endif() -ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES}) -ocv_module_include_directories(${ZLIB_INCLUDE_DIR}) - 
-if(HAVE_CUDA) - ocv_include_directories("${OpenCV_SOURCE_DIR}/modules/gpu/include") - ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef) -endif() - -ocv_glob_module_sources(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc" - HEADERS ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail}) - ocv_create_module() ocv_add_precompiled_headers(${the_module}) ocv_add_accuracy_tests() ocv_add_perf_tests() -if(DYNAMIC_CUDA_SUPPORT) +if (DYNAMIC_CUDA_SUPPORT) add_subdirectory(cuda) endif() diff --git a/modules/core/cuda/CMakeLists.txt b/modules/core/cuda/CMakeLists.txt index 0b1c9428d..72ecea7a4 100644 --- a/modules/core/cuda/CMakeLists.txt +++ b/modules/core/cuda/CMakeLists.txt @@ -1,6 +1,5 @@ project(opencv_core_cuda) -set(HAVE_CUDA FALSE) -add_definitions("-DHAVE_CUDA") +add_definitions(-DUSE_CUDA) include_directories(${CUDA_INCLUDE_DIRS} "../src/" "../include/opencv2/core/" diff --git a/modules/core/cuda/main.cpp b/modules/core/cuda/main.cpp index c4b8cbe1d..26d483420 100644 --- a/modules/core/cuda/main.cpp +++ b/modules/core/cuda/main.cpp @@ -1,6 +1,10 @@ +#include "cvconfig.h" #include "opencv2/core/core.hpp" #include "opencv2/core/gpumat.hpp" +#include +#include + #ifdef HAVE_CUDA #include #include @@ -17,7 +21,30 @@ #endif #endif +using namespace std; using namespace cv; using namespace cv::gpu; -#include "gpumat_cuda.hpp" \ No newline at end of file +#include "gpumat_cuda.hpp" + +#ifdef HAVE_CUDA +static CudaDeviceInfoFuncTable deviceInfoTable; +static CudaFuncTable gpuTable; +#else +static EmptyDeviceInfoFuncTable deviceInfoTable; +static EmptyFuncTable gpuTable; +#endif + +extern "C" { + +DeviceInfoFuncTable* deviceInfoFactory() +{ + return (DeviceInfoFuncTable*)&deviceInfoTable; +} + +GpuFuncTable* gpuFactory() +{ + return (GpuFuncTable*)&gpuTable; +} + +} diff --git a/modules/core/include/opencv2/core/gpumat.hpp b/modules/core/include/opencv2/core/gpumat.hpp index b50210213..d62c8749b 100644 --- a/modules/core/include/opencv2/core/gpumat.hpp +++ b/modules/core/include/opencv2/core/gpumat.hpp @@ -137,6 +137,9 @@ namespace cv { namespace gpu int deviceID() const { return device_id_; } private: + // Private section is fictive to preserve bin compatibility. + // Changes in the private fields there have no effects. + // see deligate code. 
void query(); int device_id_; diff --git a/modules/core/src/gpumat.cpp b/modules/core/src/gpumat.cpp index 9a2e36cb6..f438dfd8b 100644 --- a/modules/core/src/gpumat.cpp +++ b/modules/core/src/gpumat.cpp @@ -43,8 +43,9 @@ #include "precomp.hpp" #include "opencv2/core/gpumat.hpp" #include +#include -#if defined(HAVE_CUDA) +#if defined(HAVE_CUDA) || defined(DYNAMIC_CUDA_SUPPORT) #include #include @@ -66,15 +67,81 @@ using namespace cv::gpu; #include "gpumat_cuda.hpp" -namespace +typedef GpuFuncTable* (*GpuFactoryType)(); +typedef DeviceInfoFuncTable* (*DeviceInfoFactoryType)(); + +static GpuFactoryType gpuFactory = NULL; +static DeviceInfoFactoryType deviceInfoFactory = NULL; + +static const std::string getCudaSupportLibName() { - const GpuFuncTable* gpuFuncTable() - { - static EmptyFuncTable funcTable; - return &funcTable; - } + return "libopencv_core_cuda.so"; } +static bool loadCudaSupportLib() +{ + void* handle; + const std::string name = getCudaSupportLibName(); + handle = dlopen(name.c_str(), RTLD_LAZY); + if (!handle) + return false; + + deviceInfoFactory = (DeviceInfoFactoryType)dlsym(handle, "deviceInfoFactory"); + if (!deviceInfoFactory) + { + dlclose(handle); + return false; + } + + gpuFactory = (GpuFactoryType)dlsym(handle, "gpuFactory"); + if (!gpuFactory) + { + dlclose(handle); + return false; + } + + dlclose(handle); + + return true; +} + +static GpuFuncTable* gpuFuncTable() +{ +#ifdef DYNAMIC_CUDA_SUPPORT + static EmptyFuncTable stub; + static GpuFuncTable* libFuncTable = loadCudaSupportLib() ? gpuFactory(): (GpuFuncTable*)&stub; + static GpuFuncTable *funcTable = libFuncTable ? libFuncTable : (GpuFuncTable*)&stub; +#else +# ifdef USE_CUDA + static CudaFuncTable impl; + static GpuFuncTable* funcTable = &impl; +#else + static EmptyFuncTable stub; + static GpuFuncTable* funcTable = &stub; +#endif +#endif + return funcTable; +} + +static DeviceInfoFuncTable* deviceInfoFuncTable() +{ +#ifdef DYNAMIC_CUDA_SUPPORT + static EmptyDeviceInfoFuncTable stub; + static DeviceInfoFuncTable* libFuncTable = loadCudaSupportLib() ? deviceInfoFactory(): (DeviceInfoFuncTable*)&stub; + static DeviceInfoFuncTable* funcTable = libFuncTable ? 
libFuncTable : (DeviceInfoFuncTable*)&stub; +#else +# ifdef USE_CUDA + static CudaDeviceInfoFuncTable impl; + static DeviceInfoFuncTable* funcTable = &impl; +#else + static EmptyFuncTable stub; + static DeviceInfoFuncTable* funcTable = &stub; +#endif +#endif + return funcTable; +} + + //////////////////////////////// Initialization & Info //////////////////////// int cv::gpu::getCudaEnabledDeviceCount() { return gpuFuncTable()->getCudaEnabledDeviceCount(); } @@ -95,13 +162,13 @@ bool cv::gpu::TargetArchs::hasEqualOrGreater(int major, int minor) { return gpuF bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor) { return gpuFuncTable()->hasEqualOrGreaterPtx(major, minor); } bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor) { return gpuFuncTable()->hasEqualOrGreaterBin(major, minor); } -size_t cv::gpu::DeviceInfo::sharedMemPerBlock() const { return gpuFuncTable()->sharedMemPerBlock(); } -void cv::gpu::DeviceInfo::queryMemory(size_t& total_memory, size_t& free_memory) const { gpuFuncTable()->queryMemory(total_memory, free_memory); } -size_t cv::gpu::DeviceInfo::freeMemory() const { return gpuFuncTable()->freeMemory(); } -size_t cv::gpu::DeviceInfo::totalMemory() const { return gpuFuncTable()->totalMemory(); } -bool cv::gpu::DeviceInfo::supports(FeatureSet feature_set) const { return gpuFuncTable()->supports(feature_set); } -bool cv::gpu::DeviceInfo::isCompatible() const { return gpuFuncTable()->isCompatible(); } -void cv::gpu::DeviceInfo::query() { gpuFuncTable()->query(); } +size_t cv::gpu::DeviceInfo::sharedMemPerBlock() const { return deviceInfoFuncTable()->sharedMemPerBlock(); } +void cv::gpu::DeviceInfo::queryMemory(size_t& total_memory, size_t& free_memory) const { deviceInfoFuncTable()->queryMemory(total_memory, free_memory); } +size_t cv::gpu::DeviceInfo::freeMemory() const { return deviceInfoFuncTable()->freeMemory(); } +size_t cv::gpu::DeviceInfo::totalMemory() const { return deviceInfoFuncTable()->totalMemory(); } +bool cv::gpu::DeviceInfo::supports(FeatureSet feature_set) const { return deviceInfoFuncTable()->supports(feature_set); } +bool cv::gpu::DeviceInfo::isCompatible() const { return deviceInfoFuncTable()->isCompatible(); } +void cv::gpu::DeviceInfo::query() { deviceInfoFuncTable()->query(); } void cv::gpu::printCudaDeviceInfo(int device) { gpuFuncTable()->printCudaDeviceInfo(device); } void cv::gpu::printShortCudaDeviceInfo(int device) { gpuFuncTable()->printShortCudaDeviceInfo(device); } @@ -556,7 +623,7 @@ namespace cv { namespace gpu void setTo(GpuMat& src, Scalar s, cudaStream_t stream) { - gpuFuncTable()->setTo(src, s, stream); + gpuFuncTable()->setTo(src, s, cv::gpu::GpuMat(), stream); } void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream) diff --git a/modules/core/src/gpumat_cuda.hpp b/modules/core/src/gpumat_cuda.hpp index 631d6ea8c..56d626a5c 100644 --- a/modules/core/src/gpumat_cuda.hpp +++ b/modules/core/src/gpumat_cuda.hpp @@ -1,30 +1,19 @@ -namespace -{ -#if defined(HAVE_CUDA) && !defined(DYNAMIC_CUDA_SUPPORT) +#ifndef __GPUMAT_CUDA_HPP__ +#define __GPUMAT_CUDA_HPP__ - #define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, CV_Func) - #define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__, CV_Func) - - inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "") + class DeviceInfoFuncTable { - if (cudaSuccess != err) - cv::gpu::error(cudaGetErrorString(err), file, line, func); - } - - inline void ___nppSafeCall(int err, const char *file, 
const int line, const char *func = "") - { - if (err < 0) - { - std::ostringstream msg; - msg << "NPP API Call Error: " << err; - cv::gpu::error(msg.str().c_str(), file, line, func); - } - } -#endif -} - -namespace -{ + public: + virtual size_t sharedMemPerBlock() const = 0; + virtual void queryMemory(size_t&, size_t&) const = 0; + virtual size_t freeMemory() const = 0; + virtual size_t totalMemory() const = 0; + virtual bool supports(FeatureSet) const = 0; + virtual bool isCompatible() const = 0; + virtual void query() = 0; + virtual ~DeviceInfoFuncTable() {}; + }; + class GpuFuncTable { public: @@ -40,6 +29,7 @@ namespace virtual bool deviceSupports(FeatureSet) const = 0; + // TargetArchs virtual bool builtWith(FeatureSet) const = 0; virtual bool has(int, int) const = 0; virtual bool hasPtx(int, int) const = 0; @@ -49,14 +39,6 @@ namespace virtual bool hasEqualOrGreaterPtx(int, int) const = 0; virtual bool hasEqualOrGreaterBin(int, int) const = 0; - virtual size_t sharedMemPerBlock() const = 0; - virtual void queryMemory(size_t&, size_t&) const = 0; - virtual size_t freeMemory() const = 0; - virtual size_t totalMemory() const = 0; - virtual bool supports(FeatureSet) const = 0; - virtual bool isCompatible() const = 0; - virtual void query() const = 0; - virtual void printCudaDeviceInfo(int) const = 0; virtual void printShortCudaDeviceInfo(int) const = 0; @@ -72,17 +54,24 @@ namespace virtual void convert(const GpuMat& src, GpuMat& dst) const = 0; // for gpu::device::setTo funcs - virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, CUstream_st*) const = 0; virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, CUstream_st*) const = 0; virtual void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const = 0; virtual void free(void* devPtr) const = 0; }; -} -#if !defined(HAVE_CUDA) || defined(DYNAMIC_CUDA_SUPPORT) -namespace -{ + class EmptyDeviceInfoFuncTable: public DeviceInfoFuncTable + { + public: + size_t sharedMemPerBlock() const { throw_nogpu; return 0; } + void queryMemory(size_t&, size_t&) const { throw_nogpu; } + size_t freeMemory() const { throw_nogpu; return 0; } + size_t totalMemory() const { throw_nogpu; return 0; } + bool supports(FeatureSet) const { throw_nogpu; return false; } + bool isCompatible() const { throw_nogpu; return false; } + void query() { throw_nogpu; } + }; + class EmptyFuncTable : public GpuFuncTable { public: @@ -105,15 +94,7 @@ namespace bool hasEqualOrGreater(int, int) const { throw_nogpu; return false; } bool hasEqualOrGreaterPtx(int, int) const { throw_nogpu; return false; } bool hasEqualOrGreaterBin(int, int) const { throw_nogpu; return false; } - - size_t sharedMemPerBlock() const { throw_nogpu; return 0; } - void queryMemory(size_t&, size_t&) const { throw_nogpu; } - size_t freeMemory() const { throw_nogpu; return 0; } - size_t totalMemory() const { throw_nogpu; return 0; } - bool supports(FeatureSet) const { throw_nogpu; return false; } - bool isCompatible() const { throw_nogpu; return false; } - void query() const { throw_nogpu; } - + void printCudaDeviceInfo(int) const { throw_nogpu; } void printShortCudaDeviceInfo(int) const { throw_nogpu; } @@ -126,15 +107,32 @@ namespace void convert(const GpuMat&, GpuMat&) const { throw_nogpu; } void convert(const GpuMat&, GpuMat&, double, double, cudaStream_t stream = 0) const { (void)stream; throw_nogpu; } - virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, CUstream_st*) const { throw_nogpu; } virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, 
CUstream_st*) const { throw_nogpu; } void mallocPitch(void**, size_t*, size_t, size_t) const { throw_nogpu; } void free(void*) const {} }; + +#if defined(USE_CUDA) + +#define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, CV_Func) +#define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__, CV_Func) + +inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "") +{ + if (cudaSuccess != err) + cv::gpu::error(cudaGetErrorString(err), file, line, func); } -#else +inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "") +{ + if (err < 0) + { + std::ostringstream msg; + msg << "NPP API Call Error: " << err; + cv::gpu::error(msg.str().c_str(), file, line, func); + } +} namespace cv { namespace gpu { namespace device { @@ -149,8 +147,6 @@ namespace cv { namespace gpu { namespace device void convert_gpu(PtrStepSzb src, int sdepth, PtrStepSzb dst, int ddepth, double alpha, double beta, cudaStream_t stream); }}} -namespace -{ template void kernelSetCaller(GpuMat& src, Scalar s, cudaStream_t stream) { Scalar_ sf = s; @@ -162,10 +158,7 @@ namespace Scalar_ sf = s; cv::gpu::device::set_to_gpu(src, sf.val, mask, src.channels(), stream); } -} -namespace -{ template struct NPPTypeTraits; template<> struct NPPTypeTraits { typedef Npp8u npp_type; }; template<> struct NPPTypeTraits { typedef Npp8s npp_type; }; @@ -208,6 +201,7 @@ namespace cudaSafeCall( cudaDeviceSynchronize() ); } }; + template::func_ptr func> struct NppCvt { typedef typename NPPTypeTraits::npp_type dst_t; @@ -361,9 +355,8 @@ namespace { return reinterpret_cast(ptr) % size == 0; } -} - namespace cv { namespace gpu { namespace devices + namespace cv { namespace gpu { namespace device { void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream = 0) { @@ -418,74 +411,52 @@ namespace { setTo(src, s, mask, 0); } - }} + }}} -namespace -{ - class CudaFuncTable : public GpuFuncTable + + class CudaArch { - protected: - - class CudaArch - { - public: - CudaArch(); - - bool builtWith(FeatureSet feature_set) const; - bool hasPtx(int major, int minor) const; - bool hasBin(int major, int minor) const; - bool hasEqualOrLessPtx(int major, int minor) const; - bool hasEqualOrGreaterPtx(int major, int minor) const; - bool hasEqualOrGreaterBin(int major, int minor) const; - - private: - static void fromStr(const string& set_as_str, vector& arr); - - vector bin; - vector ptx; - vector features; - }; - - const CudaArch cudaArch; - - CudaArch::CudaArch() + public: + CudaArch() { fromStr(CUDA_ARCH_BIN, bin); fromStr(CUDA_ARCH_PTX, ptx); fromStr(CUDA_ARCH_FEATURES, features); } - bool CudaArch::builtWith(FeatureSet feature_set) const + bool builtWith(FeatureSet feature_set) const { return !features.empty() && (features.back() >= feature_set); } - bool CudaArch::hasPtx(int major, int minor) const + bool hasPtx(int major, int minor) const { return find(ptx.begin(), ptx.end(), major * 10 + minor) != ptx.end(); } - bool CudaArch::hasBin(int major, int minor) const + bool hasBin(int major, int minor) const { return find(bin.begin(), bin.end(), major * 10 + minor) != bin.end(); } - bool CudaArch::hasEqualOrLessPtx(int major, int minor) const + bool hasEqualOrLessPtx(int major, int minor) const { return !ptx.empty() && (ptx.front() <= major * 10 + minor); } - bool CudaArch::hasEqualOrGreaterPtx(int major, int minor) const + bool hasEqualOrGreaterPtx(int major, int minor) const { return !ptx.empty() && (ptx.back() >= major * 10 + minor); } - 
bool CudaArch::hasEqualOrGreaterBin(int major, int minor) const + bool hasEqualOrGreaterBin(int major, int minor) const { return !bin.empty() && (bin.back() >= major * 10 + minor); } - void CudaArch::fromStr(const string& set_as_str, vector& arr) + + private: + void fromStr(const string& set_as_str, vector& arr) { if (set_as_str.find_first_not_of(" ") == string::npos) return; @@ -501,25 +472,21 @@ namespace sort(arr.begin(), arr.end()); } - - class DeviceProps - { - public: - DeviceProps(); - ~DeviceProps(); - - cudaDeviceProp* get(int devID); - - private: - std::vector props_; - }; - DeviceProps::DeviceProps() + vector bin; + vector ptx; + vector features; + }; + + class DeviceProps + { + public: + DeviceProps() { props_.resize(10, 0); } - DeviceProps::~DeviceProps() + ~DeviceProps() { for (size_t i = 0; i < props_.size(); ++i) { @@ -529,7 +496,7 @@ namespace props_.clear(); } - cudaDeviceProp* DeviceProps::get(int devID) + cudaDeviceProp* get(int devID) { if (devID >= (int) props_.size()) props_.resize(devID + 5, 0); @@ -542,10 +509,92 @@ namespace return props_[devID]; } - - DeviceProps deviceProps; + private: + std::vector props_; + }; - int convertSMVer2Cores(int major, int minor) + DeviceProps deviceProps; + + class CudaDeviceInfoFuncTable: DeviceInfoFuncTable + { + public: + size_t sharedMemPerBlock() const + { + return deviceProps.get(device_id_)->sharedMemPerBlock; + } + + void queryMemory(size_t& _totalMemory, size_t& _freeMemory) const + { + int prevDeviceID = getDevice(); + if (prevDeviceID != device_id_) + setDevice(device_id_); + + cudaSafeCall( cudaMemGetInfo(&_freeMemory, &_totalMemory) ); + + if (prevDeviceID != device_id_) + setDevice(prevDeviceID); + } + + size_t freeMemory() const + { + size_t _totalMemory, _freeMemory; + queryMemory(_totalMemory, _freeMemory); + return _freeMemory; + } + + size_t totalMemory() const + { + size_t _totalMemory, _freeMemory; + queryMemory(_totalMemory, _freeMemory); + return _totalMemory; + } + + bool supports(FeatureSet feature_set) const + { + int version = majorVersion_ * 10 + minorVersion_; + return version >= feature_set; + } + + bool isCompatible() const + { + // Check PTX compatibility + if (TargetArchs::hasEqualOrLessPtx(majorVersion_, minorVersion_)) + return true; + + // Check BIN compatibility + for (int i = minorVersion_; i >= 0; --i) + if (TargetArchs::hasBin(majorVersion_, i)) + return true; + + return false; + } + + void query() + { + const cudaDeviceProp* prop = deviceProps.get(device_id_); + + name_ = prop->name; + multi_processor_count_ = prop->multiProcessorCount; + majorVersion_ = prop->major; + minorVersion_ = prop->minor; + } + + private: + int device_id_; + + std::string name_; + int multi_processor_count_; + int majorVersion_; + int minorVersion_; + }; + + class CudaFuncTable : public GpuFuncTable + { + protected: + + const CudaArch cudaArch; + + int convertSMVer2Cores(int major, int minor) const { // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM typedef struct { @@ -600,42 +649,42 @@ namespace cudaSafeCall( cudaDeviceReset() ); } - bool TargetArchs::builtWith(FeatureSet feature_set) const + bool builtWith(FeatureSet feature_set) const { return cudaArch.builtWith(feature_set); } - bool TargetArchs::has(int major, int minor) const + bool has(int major, int minor) const { return hasPtx(major, minor) || hasBin(major, minor); } - bool TargetArchs::hasPtx(int major, int minor) const + bool hasPtx(int major, int minor) const { return cudaArch.hasPtx(major, minor); } - bool 
TargetArchs::hasBin(int major, int minor) const + bool hasBin(int major, int minor) const { return cudaArch.hasBin(major, minor); } - bool TargetArchs::hasEqualOrLessPtx(int major, int minor) const + bool hasEqualOrLessPtx(int major, int minor) const { return cudaArch.hasEqualOrLessPtx(major, minor); } - bool TargetArchs::hasEqualOrGreater(int major, int minor) const + bool hasEqualOrGreater(int major, int minor) const { return hasEqualOrGreaterPtx(major, minor) || hasEqualOrGreaterBin(major, minor); } - bool TargetArchs::hasEqualOrGreaterPtx(int major, int minor) const + bool hasEqualOrGreaterPtx(int major, int minor) const { return cudaArch.hasEqualOrGreaterPtx(major, minor); } - bool TargetArchs::hasEqualOrGreaterBin(int major, int minor) const + bool hasEqualOrGreaterBin(int major, int minor) const { return cudaArch.hasEqualOrGreaterBin(major, minor); } @@ -664,68 +713,7 @@ namespace return TargetArchs::builtWith(feature_set) && (version >= feature_set); } - - size_t sharedMemPerBlock() const - { - return deviceProps.get(device_id_)->sharedMemPerBlock; - } - - void queryMemory(size_t& _totalMemory, size_t& _freeMemory) const - { - int prevDeviceID = getDevice(); - if (prevDeviceID != device_id_) - setDevice(device_id_); - - cudaSafeCall( cudaMemGetInfo(&_freeMemory, &_totalMemory) ); - - if (prevDeviceID != device_id_) - setDevice(prevDeviceID); - } - - size_t freeMemory() const - { - size_t _totalMemory, _freeMemory; - queryMemory(_totalMemory, _freeMemory); - return _freeMemory; - } - - size_t totalMemory() const - { - size_t _totalMemory, _freeMemory; - queryMemory(_totalMemory, _freeMemory); - return _totalMemory; - } - - bool supports(FeatureSet feature_set) const - { - int version = majorVersion() * 10 + minorVersion(); - return version >= feature_set; - } - - bool isCompatible() const - { - // Check PTX compatibility - if (TargetArchs::hasEqualOrLessPtx(majorVersion(), minorVersion())) - return true; - - // Check BIN compatibility - for (int i = minorVersion(); i >= 0; --i) - if (TargetArchs::hasBin(majorVersion(), i)) - return true; - - return false; - } - - void query() const - { - const cudaDeviceProp* prop = deviceProps.get(device_id_); - - name_ = prop->name; - multi_processor_count_ = prop->multiProcessorCount; - majorVersion_ = prop->major; - minorVersion_ = prop->minor; - } - + void printCudaDeviceInfo(int device) const { int count = getCudaEnabledDeviceCount(); @@ -864,16 +852,16 @@ namespace typedef void (*func_t)(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream); static const func_t funcs[7][4] = { - /* 8U */ {NppCopyMasked::call, cv::gpu::details::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 8S */ {cv::gpu::details::copyWithMask , cv::gpu::details::copyWithMask, cv::gpu::details::copyWithMask , cv::gpu::details::copyWithMask }, - /* 16U */ {NppCopyMasked::call, cv::gpu::details::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 16S */ {NppCopyMasked::call, cv::gpu::details::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 32S */ {NppCopyMasked::call, cv::gpu::details::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 32F */ {NppCopyMasked::call, cv::gpu::details::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 64F */ {cv::gpu::details::copyWithMask , cv::gpu::details::copyWithMask, cv::gpu::details::copyWithMask , cv::gpu::details::copyWithMask } + /* 8U */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, + /* 8S */ 
{cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask }, + /* 16U */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, + /* 16S */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, + /* 32S */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, + /* 32F */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, + /* 64F */ {cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask } }; - const func_t func = mask.channels() == src.channels() ? funcs[src.depth()][src.channels() - 1] : cv::gpu::details::copyWithMask; + const func_t func = mask.channels() == src.channels() ? funcs[src.depth()][src.channels() - 1] : cv::gpu::device::copyWithMask; func(src, dst, mask, 0); } @@ -971,7 +959,7 @@ namespace func(src, dst); } - void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta) const + void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream) const { CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); CV_Assert(dst.depth() <= CV_64F); @@ -982,10 +970,10 @@ namespace CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); } - cv::gpu::device::convertTo(src, dst, alpha, beta); + cv::gpu::device::convertTo(src, dst, alpha, beta, stream); } - void setTo(GpuMat& m, Scalar s, const GpuMat& mask) const + void setTo(GpuMat& m, Scalar s, const GpuMat& mask, cudaStream_t stream) const { if (mask.empty()) { @@ -1016,7 +1004,7 @@ namespace {NppSet::call, NppSet::call, cv::gpu::device::setTo , NppSet::call}, {NppSet::call, cv::gpu::device::setTo , cv::gpu::device::setTo , NppSet::call}, {NppSet::call, cv::gpu::device::setTo , cv::gpu::device::setTo , NppSet::call}, - {cv::gpu::device::setTo , cv::gpu::device::setTo , cv::gpu::device::setTo , cv::gpu::device::setTo } + {cv::gpu::device::setTo , cv::gpu::device::setTo , cv::gpu::device::setTo , cv::gpu::device::setTo } }; CV_Assert(m.depth() <= CV_64F && m.channels() <= 4); @@ -1027,7 +1015,10 @@ namespace CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); } - funcs[m.depth()][m.channels() - 1](m, s); + if (stream) + cv::gpu::device::setTo(m, s, stream); + else + funcs[m.depth()][m.channels() - 1](m, s); } else { @@ -1051,7 +1042,10 @@ namespace CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); } - funcs[m.depth()][m.channels() - 1](m, s, mask); + if (stream) + cv::gpu::device::setTo(m, s, mask, stream); + else + funcs[m.depth()][m.channels() - 1](m, s, mask); } } @@ -1065,5 +1059,5 @@ namespace cudaFree(devPtr); } }; -} +#endif #endif \ No newline at end of file From 88a883e68ee9ab379118a1c68aa14ebaa24d8afd Mon Sep 17 00:00:00 2001 From: Alexander Smorkalov Date: Tue, 17 Dec 2013 10:24:00 +0400 Subject: [PATCH 03/13] Build fix. 
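The fix itself is small: throw_nogpu is expanded inside the Empty* stub tables in gpumat_cuda.hpp, so every translation unit must define the macro before that include; keeping it in the public gpumat.hpp leaked it to every client of the header. The resulting include pattern, sketched here, is repeated in both gpumat.cpp and cuda/main.cpp:

    // Each TU that includes gpumat_cuda.hpp defines the macro first, so stub
    // methods such as  void setDevice(int) const { throw_nogpu; }  expand correctly.
    #define throw_nogpu CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support")
    #include "gpumat_cuda.hpp"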
--- modules/core/cuda/main.cpp | 2 ++ modules/core/include/opencv2/core/gpumat.hpp | 2 -- modules/core/src/gpumat.cpp | 2 ++ 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/modules/core/cuda/main.cpp b/modules/core/cuda/main.cpp index 26d483420..4f47dc7e9 100644 --- a/modules/core/cuda/main.cpp +++ b/modules/core/cuda/main.cpp @@ -25,6 +25,8 @@ using namespace std; using namespace cv; using namespace cv::gpu; +#define throw_nogpu CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support") + #include "gpumat_cuda.hpp" #ifdef HAVE_CUDA diff --git a/modules/core/include/opencv2/core/gpumat.hpp b/modules/core/include/opencv2/core/gpumat.hpp index d62c8749b..755660461 100644 --- a/modules/core/include/opencv2/core/gpumat.hpp +++ b/modules/core/include/opencv2/core/gpumat.hpp @@ -48,8 +48,6 @@ #include "opencv2/core/core.hpp" #include "opencv2/core/cuda_devptrs.hpp" -#define throw_nogpu CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support") - namespace cv { namespace gpu { //////////////////////////////// Initialization & Info //////////////////////// diff --git a/modules/core/src/gpumat.cpp b/modules/core/src/gpumat.cpp index f438dfd8b..7e4eab4a1 100644 --- a/modules/core/src/gpumat.cpp +++ b/modules/core/src/gpumat.cpp @@ -65,6 +65,8 @@ using namespace std; using namespace cv; using namespace cv::gpu; +#define throw_nogpu CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support") + #include "gpumat_cuda.hpp" typedef GpuFuncTable* (*GpuFactoryType)(); From be530bd0856c623688e2f2d5842ea171b2afacc1 Mon Sep 17 00:00:00 2001 From: Alexander Smorkalov Date: Wed, 18 Dec 2013 12:02:15 +0400 Subject: [PATCH 04/13] DeviceInfo class method that were implemented in header moved to cpp file. --- modules/core/include/opencv2/core/gpumat.hpp | 10 +++--- modules/core/src/gpumat.cpp | 5 +++ modules/core/src/gpumat_cuda.hpp | 35 ++++++++++++++++++++ 3 files changed, 45 insertions(+), 5 deletions(-) diff --git a/modules/core/include/opencv2/core/gpumat.hpp b/modules/core/include/opencv2/core/gpumat.hpp index 755660461..d0f415ec3 100644 --- a/modules/core/include/opencv2/core/gpumat.hpp +++ b/modules/core/include/opencv2/core/gpumat.hpp @@ -112,13 +112,13 @@ namespace cv { namespace gpu // Creates DeviceInfo object for the given GPU DeviceInfo(int device_id) : device_id_(device_id) { query(); } - std::string name() const { return name_; } + std::string name() const; // Return compute capability versions - int majorVersion() const { return majorVersion_; } - int minorVersion() const { return minorVersion_; } + int majorVersion() const; + int minorVersion() const; - int multiProcessorCount() const { return multi_processor_count_; } + int multiProcessorCount() const; size_t sharedMemPerBlock() const; @@ -132,7 +132,7 @@ namespace cv { namespace gpu // Checks whether the GPU module can be run on the given device bool isCompatible() const; - int deviceID() const { return device_id_; } + int deviceID() const; private: // Private section is fictive to preserve bin compatibility. 
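The point of de-inlining these accessors is that an inline body such as { return majorVersion_; } is compiled into user binaries and can never reach the dynamically loaded table; an out-of-line definition can. A sketch of the before/after shape (illustrative commentary, not a further hunk of the patch):

    // before: resolved at the caller's compile time, reads the member directly
    int majorVersion() const { return majorVersion_; }

    // after: declared in the header, defined once in gpumat.cpp, and forwarding
    // to whichever DeviceInfoFuncTable was selected at run time
    int cv::gpu::DeviceInfo::majorVersion() const { return deviceInfoFuncTable()->majorVersion(); }

The private fields remain in the header only to preserve the object layout, hence "fictive", as the comment above puts it; the CUDA-side table keeps its own copies of the queried values.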
diff --git a/modules/core/src/gpumat.cpp b/modules/core/src/gpumat.cpp index 7e4eab4a1..dc24b6e82 100644 --- a/modules/core/src/gpumat.cpp +++ b/modules/core/src/gpumat.cpp @@ -170,6 +170,11 @@ size_t cv::gpu::DeviceInfo::freeMemory() const { return deviceInfoFuncTable()->f size_t cv::gpu::DeviceInfo::totalMemory() const { return deviceInfoFuncTable()->totalMemory(); } bool cv::gpu::DeviceInfo::supports(FeatureSet feature_set) const { return deviceInfoFuncTable()->supports(feature_set); } bool cv::gpu::DeviceInfo::isCompatible() const { return deviceInfoFuncTable()->isCompatible(); } +int cv::gpu::DeviceInfo::deviceID() const { return deviceInfoFuncTable()->deviceID(); }; +int cv::gpu::DeviceInfo::majorVersion() const { return deviceInfoFuncTable()->majorVersion(); } +int cv::gpu::DeviceInfo::minorVersion() const { return deviceInfoFuncTable()->minorVersion(); } +std::string cv::gpu::DeviceInfo::name() const { return deviceInfoFuncTable()->name(); } +int cv::gpu::DeviceInfo::multiProcessorCount() const { return deviceInfoFuncTable()->multiProcessorCount(); } void cv::gpu::DeviceInfo::query() { deviceInfoFuncTable()->query(); } void cv::gpu::printCudaDeviceInfo(int device) { gpuFuncTable()->printCudaDeviceInfo(device); } diff --git a/modules/core/src/gpumat_cuda.hpp b/modules/core/src/gpumat_cuda.hpp index 56d626a5c..83172d5ca 100644 --- a/modules/core/src/gpumat_cuda.hpp +++ b/modules/core/src/gpumat_cuda.hpp @@ -11,6 +11,11 @@ virtual bool supports(FeatureSet) const = 0; virtual bool isCompatible() const = 0; virtual void query() = 0; + virtual int deviceID() const = 0; + virtual std::string name() const = 0; + virtual int majorVersion() const = 0; + virtual int minorVersion() const = 0; + virtual int multiProcessorCount() const = 0; virtual ~DeviceInfoFuncTable() {}; }; @@ -70,6 +75,11 @@ bool supports(FeatureSet) const { throw_nogpu; return false; } bool isCompatible() const { throw_nogpu; return false; } void query() { throw_nogpu; } + int deviceID() const { throw_nogpu; return -1; }; + std::string name() const { throw_nogpu; return std::string(); } + int majorVersion() const { throw_nogpu; return -1; } + int minorVersion() const { throw_nogpu; return -1; } + int multiProcessorCount() const { throw_nogpu; return -1; } }; class EmptyFuncTable : public GpuFuncTable @@ -579,6 +589,31 @@ namespace cv { namespace gpu { namespace device minorVersion_ = prop->minor; } + int deviceID() const + { + return device_id_; + } + + std::string name() const + { + return name_; + } + + int majorVersion() const + { + return majorVersion_; + } + + int minorVersion() const + { + return minorVersion_; + } + + int multiProcessorCount() const + { + return multi_processor_count_; + } + private: int device_id_; From 442082eb0ff51353953c605899d61f1f7fb089eb Mon Sep 17 00:00:00 2001 From: Alexander Smorkalov Date: Thu, 19 Dec 2013 09:38:46 +0400 Subject: [PATCH 05/13] Fixes for Android support. 
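On Android the support library cannot be found by its bare soname, because application libraries live in an app-specific directory that is not on the default loader search path. The core of the fix below is to recover the directory from which the already-loaded OpenCV library came and to dlopen by absolute path. A condensed sketch follows; the actual diff additionally scans /proc/self/smaps, since dladdr's dli_fname is not guaranteed to be an absolute path on older Android versions, and the helper name here is hypothetical:

    // Condensed illustration only; see the real getCudaSupportLibName() below.
    #include <dlfcn.h>
    #include <string>

    static std::string cudaLibPathNearSelf()
    {
        Dl_info info;
        if (dladdr((void*)&cudaLibPathNearSelf, &info) && info.dli_fname)
        {
            std::string path(info.dli_fname);        // library containing this function
            path.erase(path.find_last_of('/') + 1);  // keep the directory part
            return path + "libopencv_core_cuda.so";
        }
        return std::string();
    }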
--- CMakeLists.txt | 2 + modules/core/cuda/CMakeLists.txt | 6 +- modules/core/src/gpumat.cpp | 99 +++++++++++++++++++++++++++++++- 3 files changed, 103 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2a7c730bc..01d49ab84 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -128,6 +128,7 @@ OCV_OPTION(WITH_1394 "Include IEEE1394 support" ON OCV_OPTION(WITH_AVFOUNDATION "Use AVFoundation for Video I/O" ON IF IOS) OCV_OPTION(WITH_CARBON "Use Carbon for UI instead of Cocoa" OFF IF APPLE ) OCV_OPTION(WITH_CUDA "Include NVidia Cuda Runtime support" ON IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT IOS) ) +OCV_OPTION(DYNAMIC_CUDA_SUPPORT "Make CUDA support dynamic" OFF IF (WITH_CUDA) AND NOT IOS AND NOT WINDOWS) OCV_OPTION(WITH_CUFFT "Include NVidia Cuda Fast Fourier Transform (FFT) library support" ON IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT IOS) ) OCV_OPTION(WITH_CUBLAS "Include NVidia Cuda Basic Linear Algebra Subprograms (BLAS) library support" OFF IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT IOS) ) OCV_OPTION(WITH_NVCUVID "Include NVidia Video Decoding library support" OFF IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT ANDROID AND NOT IOS AND NOT APPLE) ) @@ -853,6 +854,7 @@ if(HAVE_CUDA) status("") status(" NVIDIA CUDA") + status(" Dynamic CUDA support:" DYNAMIC_CUDA_SUPPORT THEN YES ELSE NO) status(" Use CUFFT:" HAVE_CUFFT THEN YES ELSE NO) status(" Use CUBLAS:" HAVE_CUBLAS THEN YES ELSE NO) status(" USE NVCUVID:" HAVE_NVCUVID THEN YES ELSE NO) diff --git a/modules/core/cuda/CMakeLists.txt b/modules/core/cuda/CMakeLists.txt index 72ecea7a4..828e13b80 100644 --- a/modules/core/cuda/CMakeLists.txt +++ b/modules/core/cuda/CMakeLists.txt @@ -7,4 +7,8 @@ include_directories(${CUDA_INCLUDE_DIRS} ) ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef) cuda_add_library(opencv_core_cuda SHARED main.cpp ../src/cuda/matrix_operations.cu) -target_link_libraries(opencv_core_cuda ${CUDA_LIBRARIES}) \ No newline at end of file +if(BUILD_FAT_JAVA_LIB) + target_link_libraries(opencv_core_cuda ${OPENCV_BUILD_DIR}/${LIBRARY_OUTPUT_PATH}/libopencv_java.so ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY}) +else() + target_link_libraries(opencv_core_cuda ${OPENCV_BUILD_DIR}/${LIBRARY_OUTPUT_PATH}/libopencv_core.so ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY}) +endif() \ No newline at end of file diff --git a/modules/core/src/gpumat.cpp b/modules/core/src/gpumat.cpp index dc24b6e82..c8d1d058b 100644 --- a/modules/core/src/gpumat.cpp +++ b/modules/core/src/gpumat.cpp @@ -43,7 +43,6 @@ #include "precomp.hpp" #include "opencv2/core/gpumat.hpp" #include -#include #if defined(HAVE_CUDA) || defined(DYNAMIC_CUDA_SUPPORT) #include @@ -61,6 +60,22 @@ #endif #endif +#ifdef DYNAMIC_CUDA_SUPPORT +#include +#include +#include +#include +#endif + +#ifdef ANDROID +# include + +# define LOG_TAG "OpenCV::CUDA" +# define LOGE(...) ((void)__android_log_print(ANDROID_LOG_ERROR, LOG_TAG, __VA_ARGS__)) +# define LOGD(...) ((void)__android_log_print(ANDROID_LOG_DEBUG, LOG_TAG, __VA_ARGS__)) +# define LOGI(...) 
((void)__android_log_print(ANDROID_LOG_INFO, LOG_TAG, __VA_ARGS__))
+#endif
+
 using namespace std;
 using namespace cv;
 using namespace cv::gpu;
@@ -69,16 +84,90 @@ using namespace cv::gpu;
 
 #include "gpumat_cuda.hpp"
 
+#ifdef DYNAMIC_CUDA_SUPPORT
+
 typedef GpuFuncTable* (*GpuFactoryType)();
 typedef DeviceInfoFuncTable* (*DeviceInfoFactoryType)();
 
 static GpuFactoryType gpuFactory = NULL;
 static DeviceInfoFactoryType deviceInfoFactory = NULL;
 
+# if defined(__linux__) || defined(__APPLE__) || defined (ANDROID)
+# ifdef ANDROID
+static const std::string getCudaSupportLibName()
+{
+    Dl_info dl_info;
+    if(0 != dladdr((void *)getCudaSupportLibName, &dl_info))
+    {
+        LOGD("Library name: %s", dl_info.dli_fname);
+        LOGD("Library base address: %p", dl_info.dli_fbase);
+
+        const char* libName=dl_info.dli_fname;
+        while( ((*libName)=='/') || ((*libName)=='.') )
+            libName++;
+
+        char lineBuf[2048];
+        FILE* file = fopen("/proc/self/smaps", "rt");
+
+        if(file)
+        {
+            while (fgets(lineBuf, sizeof lineBuf, file) != NULL)
+            {
+                //verify that line ends with library name
+                int lineLength = strlen(lineBuf);
+                int libNameLength = strlen(libName);
+
+                //trim end
+                for(int i = lineLength - 1; i >= 0 && isspace(lineBuf[i]); --i)
+                {
+                    lineBuf[i] = 0;
+                    --lineLength;
+                }
+
+                if (0 != strncmp(lineBuf + lineLength - libNameLength, libName, libNameLength))
+                {
+                    //the line does not contain the library name
+                    continue;
+                }
+
+                //extract path from smaps line
+                char* pathBegin = strchr(lineBuf, '/');
+                if (0 == pathBegin)
+                {
+                    LOGE("Strange error: could not find path beginning in line \"%s\"", lineBuf);
+                    continue;
+                }
+
+                char* pathEnd = strrchr(pathBegin, '/');
+                pathEnd[1] = 0;
+
+                LOGD("Libraries folder found: %s", pathBegin);
+
+                fclose(file);
+                return std::string(pathBegin) + "/libopencv_core_cuda.so";
+            }
+            fclose(file);
+            LOGE("Could not find library path");
+        }
+        else
+        {
+            LOGE("Could not read /proc/self/smaps");
+        }
+    }
+    else
+    {
+        LOGE("Could not get library name and base address");
+    }
+
+    return string();
+}
+
+# else
 static const std::string getCudaSupportLibName()
 {
     return "libopencv_core_cuda.so";
 }
+# endif
 
 static bool loadCudaSupportLib()
 {
@@ -102,11 +191,15 @@ static bool loadCudaSupportLib()
         return false;
     }
 
-    dlclose(handle);
-
     return true;
 }
 
+# else
+# error "Dynamic CUDA support is not implemented for this platform!"
+# endif
+
+#endif
+
 static GpuFuncTable* gpuFuncTable()
 {
 #ifdef DYNAMIC_CUDA_SUPPORT

From 6da7c50fb53edd291d709a06aad0b46c1311aac2 Mon Sep 17 00:00:00 2001
From: Alexander Smorkalov
Date: Thu, 19 Dec 2013 10:27:38 +0400
Subject: [PATCH 06/13] Make the dependency on CUDA explicit to prevent fake
 dependencies on the CUDA runtime.
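With the CUDA runtime removed from the global OPENCV_LINKER_LIBS below, applications no longer inherit a link-time dependency on libcudart just by linking OpenCV; they probe for GPU support at run time instead. A minimal usage sketch against the public API (the return-value convention follows getCudaEnabledDeviceCount() in this series: -1 means the CUDA driver is missing or older than the runtime, 0 means no CUDA device):

    #include <cstdio>
    #include "opencv2/core/gpumat.hpp"

    int main()
    {
        // -1: driver missing or too old; 0: no CUDA-capable device present.
        int count = cv::gpu::getCudaEnabledDeviceCount();
        if (count < 1)
        {
            std::printf("No usable CUDA device (count = %d), staying on the CPU path\n", count);
            return 0;
        }

        cv::gpu::setDevice(0);
        cv::gpu::printShortCudaDeviceInfo(0);
        return 0;
    }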
--- CMakeLists.txt | 12 ------------ cmake/OpenCVModule.cmake | 3 --- modules/core/CMakeLists.txt | 6 +++++- modules/gpu/CMakeLists.txt | 3 ++- modules/superres/CMakeLists.txt | 2 +- 5 files changed, 8 insertions(+), 18 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 01d49ab84..56c176453 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -459,18 +459,6 @@ if(WITH_OPENCL) include(cmake/OpenCVDetectOpenCL.cmake) endif() -# ---------------------------------------------------------------------------- -# Add CUDA libraries (needed for apps/tools, samples) -# ---------------------------------------------------------------------------- -if(HAVE_CUDA) - set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY}) - if(HAVE_CUBLAS) - set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} ${CUDA_cublas_LIBRARY}) - endif() - if(HAVE_CUFFT) - set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} ${CUDA_cufft_LIBRARY}) - endif() -endif() # ---------------------------------------------------------------------------- # Solution folders: # ---------------------------------------------------------------------------- diff --git a/cmake/OpenCVModule.cmake b/cmake/OpenCVModule.cmake index c923aba41..d7e7c4a1c 100644 --- a/cmake/OpenCVModule.cmake +++ b/cmake/OpenCVModule.cmake @@ -537,9 +537,6 @@ macro(ocv_create_module) target_link_libraries(${the_module} ${OPENCV_MODULE_${the_module}_DEPS}) target_link_libraries(${the_module} LINK_INTERFACE_LIBRARIES ${OPENCV_MODULE_${the_module}_DEPS}) target_link_libraries(${the_module} ${OPENCV_MODULE_${the_module}_DEPS_EXT} ${OPENCV_LINKER_LIBS} ${IPP_LIBS} ${ARGN}) - if (HAVE_CUDA) - target_link_libraries(${the_module} ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY}) - endif() endif() add_dependencies(opencv_modules ${the_module}) diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt index a7a997f67..07fa08925 100644 --- a/modules/core/CMakeLists.txt +++ b/modules/core/CMakeLists.txt @@ -33,7 +33,11 @@ macro(ocv_glob_module_sources_no_cuda) SOURCES ${lib_srcs} ${lib_int_hdrs} ${cuda_objs} ${lib_cuda_hdrs}) endmacro() -ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES}) +if (DYNAMIC_CUDA_SUPPORT) + ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES}) +else() + ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES} ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY}) +endif() ocv_module_include_directories(${ZLIB_INCLUDE_DIR}) if(HAVE_WINRT) diff --git a/modules/gpu/CMakeLists.txt b/modules/gpu/CMakeLists.txt index a61659789..9171febc7 100644 --- a/modules/gpu/CMakeLists.txt +++ b/modules/gpu/CMakeLists.txt @@ -3,7 +3,8 @@ if(IOS) endif() set(the_description "GPU-accelerated Computer Vision") -ocv_add_module(gpu opencv_imgproc opencv_calib3d opencv_objdetect opencv_video opencv_photo opencv_legacy) +ocv_add_module(gpu opencv_imgproc opencv_calib3d opencv_objdetect opencv_video opencv_photo opencv_legacy + OPTIONAL ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY} ${CUDA_cublas_LIBRARY} ${CUDA_cufft_LIBRARY}) ocv_module_include_directories("${CMAKE_CURRENT_SOURCE_DIR}/src/cuda") diff --git a/modules/superres/CMakeLists.txt b/modules/superres/CMakeLists.txt index 44e9dc0f3..3da8dc2c6 100644 --- a/modules/superres/CMakeLists.txt +++ b/modules/superres/CMakeLists.txt @@ -4,4 +4,4 @@ endif() set(the_description "Super Resolution") ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 -Wundef) -ocv_define_module(superres opencv_imgproc opencv_video OPTIONAL opencv_gpu opencv_highgui opencv_ocl) +ocv_define_module(superres opencv_imgproc opencv_video OPTIONAL opencv_gpu 
opencv_highgui opencv_ocl ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY}) From 64c94cb22c382aa3b9377d6d94648b91159a8744 Mon Sep 17 00:00:00 2001 From: Alexander Smorkalov Date: Thu, 19 Dec 2013 11:18:04 +0400 Subject: [PATCH 07/13] CUDA related func tables refactored to remove unneeded dependencies. --- modules/core/src/gpumat.cpp | 30 +-- modules/core/src/gpumat_cuda.hpp | 384 +++++++++++++++---------------- 2 files changed, 204 insertions(+), 210 deletions(-) diff --git a/modules/core/src/gpumat.cpp b/modules/core/src/gpumat.cpp index c8d1d058b..03dcad2af 100644 --- a/modules/core/src/gpumat.cpp +++ b/modules/core/src/gpumat.cpp @@ -239,23 +239,23 @@ static DeviceInfoFuncTable* deviceInfoFuncTable() //////////////////////////////// Initialization & Info //////////////////////// -int cv::gpu::getCudaEnabledDeviceCount() { return gpuFuncTable()->getCudaEnabledDeviceCount(); } +int cv::gpu::getCudaEnabledDeviceCount() { return deviceInfoFuncTable()->getCudaEnabledDeviceCount(); } -void cv::gpu::setDevice(int device) { gpuFuncTable()->setDevice(device); } -int cv::gpu::getDevice() { return gpuFuncTable()->getDevice(); } +void cv::gpu::setDevice(int device) { deviceInfoFuncTable()->setDevice(device); } +int cv::gpu::getDevice() { return deviceInfoFuncTable()->getDevice(); } -void cv::gpu::resetDevice() { gpuFuncTable()->resetDevice(); } +void cv::gpu::resetDevice() { deviceInfoFuncTable()->resetDevice(); } -bool cv::gpu::deviceSupports(FeatureSet feature_set) { return gpuFuncTable()->deviceSupports(feature_set); } +bool cv::gpu::deviceSupports(FeatureSet feature_set) { return deviceInfoFuncTable()->deviceSupports(feature_set); } -bool cv::gpu::TargetArchs::builtWith(FeatureSet feature_set) { return gpuFuncTable()->builtWith(feature_set); } -bool cv::gpu::TargetArchs::has(int major, int minor) { return gpuFuncTable()->has(major, minor); } -bool cv::gpu::TargetArchs::hasPtx(int major, int minor) { return gpuFuncTable()->hasPtx(major, minor); } -bool cv::gpu::TargetArchs::hasBin(int major, int minor) { return gpuFuncTable()->hasBin(major, minor); } -bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor) { return gpuFuncTable()->hasEqualOrLessPtx(major, minor); } -bool cv::gpu::TargetArchs::hasEqualOrGreater(int major, int minor) { return gpuFuncTable()->hasEqualOrGreater(major, minor); } -bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor) { return gpuFuncTable()->hasEqualOrGreaterPtx(major, minor); } -bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor) { return gpuFuncTable()->hasEqualOrGreaterBin(major, minor); } +bool cv::gpu::TargetArchs::builtWith(FeatureSet feature_set) { return deviceInfoFuncTable()->builtWith(feature_set); } +bool cv::gpu::TargetArchs::has(int major, int minor) { return deviceInfoFuncTable()->has(major, minor); } +bool cv::gpu::TargetArchs::hasPtx(int major, int minor) { return deviceInfoFuncTable()->hasPtx(major, minor); } +bool cv::gpu::TargetArchs::hasBin(int major, int minor) { return deviceInfoFuncTable()->hasBin(major, minor); } +bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor) { return deviceInfoFuncTable()->hasEqualOrLessPtx(major, minor); } +bool cv::gpu::TargetArchs::hasEqualOrGreater(int major, int minor) { return deviceInfoFuncTable()->hasEqualOrGreater(major, minor); } +bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor) { return deviceInfoFuncTable()->hasEqualOrGreaterPtx(major, minor); } +bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor) { return 
deviceInfoFuncTable()->hasEqualOrGreaterBin(major, minor); } size_t cv::gpu::DeviceInfo::sharedMemPerBlock() const { return deviceInfoFuncTable()->sharedMemPerBlock(); } void cv::gpu::DeviceInfo::queryMemory(size_t& total_memory, size_t& free_memory) const { deviceInfoFuncTable()->queryMemory(total_memory, free_memory); } @@ -270,8 +270,8 @@ std::string cv::gpu::DeviceInfo::name() const { return deviceInfoFuncTable()->na int cv::gpu::DeviceInfo::multiProcessorCount() const { return deviceInfoFuncTable()->multiProcessorCount(); } void cv::gpu::DeviceInfo::query() { deviceInfoFuncTable()->query(); } -void cv::gpu::printCudaDeviceInfo(int device) { gpuFuncTable()->printCudaDeviceInfo(device); } -void cv::gpu::printShortCudaDeviceInfo(int device) { gpuFuncTable()->printShortCudaDeviceInfo(device); } +void cv::gpu::printCudaDeviceInfo(int device) { deviceInfoFuncTable()->printCudaDeviceInfo(device); } +void cv::gpu::printShortCudaDeviceInfo(int device) { deviceInfoFuncTable()->printShortCudaDeviceInfo(device); } #ifdef HAVE_CUDA diff --git a/modules/core/src/gpumat_cuda.hpp b/modules/core/src/gpumat_cuda.hpp index 83172d5ca..9281655d7 100644 --- a/modules/core/src/gpumat_cuda.hpp +++ b/modules/core/src/gpumat_cuda.hpp @@ -4,6 +4,7 @@ class DeviceInfoFuncTable { public: + // cv::DeviceInfo virtual size_t sharedMemPerBlock() const = 0; virtual void queryMemory(size_t&, size_t&) const = 0; virtual size_t freeMemory() const = 0; @@ -16,25 +17,13 @@ virtual int majorVersion() const = 0; virtual int minorVersion() const = 0; virtual int multiProcessorCount() const = 0; - virtual ~DeviceInfoFuncTable() {}; - }; - - class GpuFuncTable - { - public: - virtual ~GpuFuncTable() {} - - // DeviceInfo routines virtual int getCudaEnabledDeviceCount() const = 0; - virtual void setDevice(int) const = 0; virtual int getDevice() const = 0; - virtual void resetDevice() const = 0; - virtual bool deviceSupports(FeatureSet) const = 0; - // TargetArchs + // cv::TargetArchs virtual bool builtWith(FeatureSet) const = 0; virtual bool has(int, int) const = 0; virtual bool hasPtx(int, int) const = 0; @@ -46,7 +35,15 @@ virtual void printCudaDeviceInfo(int) const = 0; virtual void printShortCudaDeviceInfo(int) const = 0; - + + virtual ~DeviceInfoFuncTable() {}; + }; + + class GpuFuncTable + { + public: + virtual ~GpuFuncTable() {} + // GpuMat routines virtual void copy(const Mat& src, GpuMat& dst) const = 0; virtual void copy(const GpuMat& src, Mat& dst) const = 0; @@ -60,7 +57,7 @@ // for gpu::device::setTo funcs virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, CUstream_st*) const = 0; - + virtual void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const = 0; virtual void free(void* devPtr) const = 0; }; @@ -80,20 +77,14 @@ int majorVersion() const { throw_nogpu; return -1; } int minorVersion() const { throw_nogpu; return -1; } int multiProcessorCount() const { throw_nogpu; return -1; } - }; - - class EmptyFuncTable : public GpuFuncTable - { - public: - - // DeviceInfo routines + int getCudaEnabledDeviceCount() const { return 0; } - + void setDevice(int) const { throw_nogpu; } int getDevice() const { throw_nogpu; return 0; } - + void resetDevice() const { throw_nogpu; } - + bool deviceSupports(FeatureSet) const { throw_nogpu; return false; } bool builtWith(FeatureSet) const { throw_nogpu; return false; } @@ -104,10 +95,15 @@ bool hasEqualOrGreater(int, int) const { throw_nogpu; return false; } bool hasEqualOrGreaterPtx(int, int) const { throw_nogpu; return false; } bool 
hasEqualOrGreaterBin(int, int) const { throw_nogpu; return false; } - + void printCudaDeviceInfo(int) const { throw_nogpu; } void printShortCudaDeviceInfo(int) const { throw_nogpu; } - + }; + + class EmptyFuncTable : public GpuFuncTable + { + public: + void copy(const Mat&, GpuMat&) const { throw_nogpu; } void copy(const GpuMat&, Mat&) const { throw_nogpu; } void copy(const GpuMat&, GpuMat&) const { throw_nogpu; } @@ -185,62 +181,62 @@ namespace cv { namespace gpu { namespace device { typedef typename NPPTypeTraits::npp_type src_t; typedef typename NPPTypeTraits::npp_type dst_t; - + typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI); }; template struct NppConvertFunc { typedef typename NPPTypeTraits::npp_type dst_t; - + typedef NppStatus (*func_ptr)(const Npp32f* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI, NppRoundMode eRoundMode); }; - + template::func_ptr func> struct NppCvt { typedef typename NPPTypeTraits::npp_type src_t; typedef typename NPPTypeTraits::npp_type dst_t; - + static void call(const GpuMat& src, GpuMat& dst) { NppiSize sz; sz.width = src.cols; sz.height = src.rows; - + nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz) ); - + cudaSafeCall( cudaDeviceSynchronize() ); } }; - + template::func_ptr func> struct NppCvt { typedef typename NPPTypeTraits::npp_type dst_t; - + static void call(const GpuMat& src, GpuMat& dst) { NppiSize sz; sz.width = src.cols; sz.height = src.rows; - + nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz, NPP_RND_NEAR) ); - + cudaSafeCall( cudaDeviceSynchronize() ); } }; - + ////////////////////////////////////////////////////////////////////////// // Set - + template struct NppSetFunc { typedef typename NPPTypeTraits::npp_type src_t; - + typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI); }; template struct NppSetFunc { typedef typename NPPTypeTraits::npp_type src_t; - + typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI); }; template struct NppSetFunc @@ -251,172 +247,172 @@ namespace cv { namespace gpu { namespace device { typedef NppStatus (*func_ptr)(Npp8s val, Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI); }; - + template::func_ptr func> struct NppSet { typedef typename NPPTypeTraits::npp_type src_t; - + static void call(GpuMat& src, Scalar s) { NppiSize sz; sz.width = src.cols; sz.height = src.rows; - + Scalar_ nppS = s; - + nppSafeCall( func(nppS.val, src.ptr(), static_cast(src.step), sz) ); - + cudaSafeCall( cudaDeviceSynchronize() ); } }; template::func_ptr func> struct NppSet { typedef typename NPPTypeTraits::npp_type src_t; - + static void call(GpuMat& src, Scalar s) { NppiSize sz; sz.width = src.cols; sz.height = src.rows; - + Scalar_ nppS = s; - + nppSafeCall( func(nppS[0], src.ptr(), static_cast(src.step), sz) ); - + cudaSafeCall( cudaDeviceSynchronize() ); } }; - + template struct NppSetMaskFunc { typedef typename NPPTypeTraits::npp_type src_t; - + typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); }; template struct NppSetMaskFunc { typedef typename NPPTypeTraits::npp_type src_t; - + typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); }; - + template::func_ptr func> struct NppSetMask { typedef typename NPPTypeTraits::npp_type src_t; - + static void 
call(GpuMat& src, Scalar s, const GpuMat& mask) { NppiSize sz; sz.width = src.cols; sz.height = src.rows; - + Scalar_ nppS = s; - + nppSafeCall( func(nppS.val, src.ptr(), static_cast(src.step), sz, mask.ptr(), static_cast(mask.step)) ); - + cudaSafeCall( cudaDeviceSynchronize() ); } }; template::func_ptr func> struct NppSetMask { typedef typename NPPTypeTraits::npp_type src_t; - + static void call(GpuMat& src, Scalar s, const GpuMat& mask) { NppiSize sz; sz.width = src.cols; sz.height = src.rows; - + Scalar_ nppS = s; - + nppSafeCall( func(nppS[0], src.ptr(), static_cast(src.step), sz, mask.ptr(), static_cast(mask.step)) ); - + cudaSafeCall( cudaDeviceSynchronize() ); } }; - + ////////////////////////////////////////////////////////////////////////// // CopyMasked - + template struct NppCopyMaskedFunc { typedef typename NPPTypeTraits::npp_type src_t; - + typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, src_t* pDst, int nDstStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); }; - + template::func_ptr func> struct NppCopyMasked { typedef typename NPPTypeTraits::npp_type src_t; - + static void call(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t /*stream*/) { NppiSize sz; sz.width = src.cols; sz.height = src.rows; - + nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz, mask.ptr(), static_cast(mask.step)) ); - + cudaSafeCall( cudaDeviceSynchronize() ); } }; - + template static inline bool isAligned(const T* ptr, size_t size) { return reinterpret_cast(ptr) % size == 0; } - + namespace cv { namespace gpu { namespace device { void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream = 0) { CV_Assert(src.size() == dst.size() && src.type() == dst.type()); CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels())); - + cv::gpu::device::copyToWithMask_gpu(src.reshape(1), dst.reshape(1), src.elemSize1(), src.channels(), mask.reshape(1), mask.channels() != 1, stream); } - + void convertTo(const GpuMat& src, GpuMat& dst) { cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0, 0); } - + void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0) { cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), alpha, beta, stream); } - + void setTo(GpuMat& src, Scalar s, cudaStream_t stream) { typedef void (*caller_t)(GpuMat& src, Scalar s, cudaStream_t stream); - + static const caller_t callers[] = { kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller }; - + callers[src.depth()](src, s, stream); } - + void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream) { typedef void (*caller_t)(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream); - + static const caller_t callers[] = { kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller }; - + callers[src.depth()](src, s, mask, stream); } - + void setTo(GpuMat& src, Scalar s) { setTo(src, s, 0); } - + void setTo(GpuMat& src, Scalar s, const GpuMat& mask) { setTo(src, s, mask, 0); @@ -433,56 +429,56 @@ namespace cv { namespace gpu { namespace device fromStr(CUDA_ARCH_PTX, ptx); fromStr(CUDA_ARCH_FEATURES, features); } - + bool builtWith(FeatureSet feature_set) const { return !features.empty() && (features.back() >= feature_set); } 
- + bool hasPtx(int major, int minor) const { return find(ptx.begin(), ptx.end(), major * 10 + minor) != ptx.end(); } - + bool hasBin(int major, int minor) const { return find(bin.begin(), bin.end(), major * 10 + minor) != bin.end(); } - + bool hasEqualOrLessPtx(int major, int minor) const { return !ptx.empty() && (ptx.front() <= major * 10 + minor); } - + bool hasEqualOrGreaterPtx(int major, int minor) const { return !ptx.empty() && (ptx.back() >= major * 10 + minor); } - + bool hasEqualOrGreaterBin(int major, int minor) const { return !bin.empty() && (bin.back() >= major * 10 + minor); } - - + + private: void fromStr(const string& set_as_str, vector& arr) { if (set_as_str.find_first_not_of(" ") == string::npos) return; - + istringstream stream(set_as_str); int cur_value; - + while (!stream.eof()) { stream >> cur_value; arr.push_back(cur_value); } - + sort(arr.begin(), arr.end()); } - + vector bin; vector ptx; vector features; @@ -495,7 +491,7 @@ namespace cv { namespace gpu { namespace device { props_.resize(10, 0); } - + ~DeviceProps() { for (size_t i = 0; i < props_.size(); ++i) @@ -505,18 +501,18 @@ namespace cv { namespace gpu { namespace device } props_.clear(); } - + cudaDeviceProp* get(int devID) { if (devID >= (int) props_.size()) props_.resize(devID + 5, 0); - + if (!props_[devID]) { props_[devID] = new cudaDeviceProp; cudaSafeCall( cudaGetDeviceProperties(props_[devID], devID) ); } - + return props_[devID]; } private: @@ -524,7 +520,7 @@ namespace cv { namespace gpu { namespace device }; DeviceProps deviceProps; - + class CudaDeviceInfoFuncTable: DeviceInfoFuncTable { public: @@ -532,57 +528,57 @@ namespace cv { namespace gpu { namespace device { return deviceProps.get(device_id_)->sharedMemPerBlock; } - + void queryMemory(size_t& _totalMemory, size_t& _freeMemory) const { int prevDeviceID = getDevice(); if (prevDeviceID != device_id_) setDevice(device_id_); - + cudaSafeCall( cudaMemGetInfo(&_freeMemory, &_totalMemory) ); - + if (prevDeviceID != device_id_) setDevice(prevDeviceID); } - + size_t freeMemory() const { size_t _totalMemory, _freeMemory; queryMemory(_totalMemory, _freeMemory); return _freeMemory; } - + size_t totalMemory() const { size_t _totalMemory, _freeMemory; queryMemory(_totalMemory, _freeMemory); return _totalMemory; } - + bool supports(FeatureSet feature_set) const { int version = majorVersion_ * 10 + minorVersion_; return version >= feature_set; } - + bool isCompatible() const { // Check PTX compatibility - if (TargetArchs::hasEqualOrLessPtx(majorVersion_, minorVersion_)) + if (hasEqualOrLessPtx(majorVersion_, minorVersion_)) return true; - + // Check BIN compatibility for (int i = minorVersion_; i >= 0; --i) - if (TargetArchs::hasBin(majorVersion_, i)) + if (hasBin(majorVersion_, i)) return true; - + return false; } - + void query() { const cudaDeviceProp* prop = deviceProps.get(device_id_); - + name_ = prop->name; multi_processor_count_ = prop->multiProcessorCount; majorVersion_ = prop->major; @@ -614,116 +610,78 @@ namespace cv { namespace gpu { namespace device return multi_processor_count_; } - private: - int device_id_; - - std::string name_; - int multi_processor_count_; - int majorVersion_; - int minorVersion_; - }; - - class CudaFuncTable : public GpuFuncTable - { - protected: - - const CudaArch cudaArch; - - int convertSMVer2Cores(int major, int minor) const - { - // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM - typedef struct { - int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = 
SM minor version - int Cores; - } SMtoCores; - - SMtoCores gpuArchCoresPerSM[] = { { 0x10, 8 }, { 0x11, 8 }, { 0x12, 8 }, { 0x13, 8 }, { 0x20, 32 }, { 0x21, 48 }, {0x30, 192}, {0x35, 192}, { -1, -1 } }; - - int index = 0; - while (gpuArchCoresPerSM[index].SM != -1) - { - if (gpuArchCoresPerSM[index].SM == ((major << 4) + minor) ) - return gpuArchCoresPerSM[index].Cores; - index++; - } - - return -1; - } - - public: - int getCudaEnabledDeviceCount() const { int count; cudaError_t error = cudaGetDeviceCount( &count ); - + if (error == cudaErrorInsufficientDriver) return -1; - + if (error == cudaErrorNoDevice) return 0; - + cudaSafeCall( error ); return count; } - + void setDevice(int device) const { cudaSafeCall( cudaSetDevice( device ) ); } - + int getDevice() const { int device; cudaSafeCall( cudaGetDevice( &device ) ); return device; } - + void resetDevice() const { cudaSafeCall( cudaDeviceReset() ); } - + bool builtWith(FeatureSet feature_set) const { return cudaArch.builtWith(feature_set); } - + bool has(int major, int minor) const { return hasPtx(major, minor) || hasBin(major, minor); } - + bool hasPtx(int major, int minor) const { return cudaArch.hasPtx(major, minor); } - + bool hasBin(int major, int minor) const { return cudaArch.hasBin(major, minor); } - + bool hasEqualOrLessPtx(int major, int minor) const { return cudaArch.hasEqualOrLessPtx(major, minor); } - + bool hasEqualOrGreater(int major, int minor) const { return hasEqualOrGreaterPtx(major, minor) || hasEqualOrGreaterBin(major, minor); } - + bool hasEqualOrGreaterPtx(int major, int minor) const { return cudaArch.hasEqualOrGreaterPtx(major, minor); } - + bool hasEqualOrGreaterBin(int major, int minor) const { return cudaArch.hasEqualOrGreaterBin(major, minor); } - + bool deviceSupports(FeatureSet feature_set) const { static int versions[] = @@ -731,11 +689,11 @@ namespace cv { namespace gpu { namespace device -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }; static const int cache_size = static_cast(sizeof(versions) / sizeof(versions[0])); - + const int devId = getDevice(); - + int version; - + if (devId < cache_size && versions[devId] >= 0) version = versions[devId]; else @@ -745,25 +703,25 @@ namespace cv { namespace gpu { namespace device if (devId < cache_size) versions[devId] = version; } - + return TargetArchs::builtWith(feature_set) && (version >= feature_set); } - + void printCudaDeviceInfo(int device) const { int count = getCudaEnabledDeviceCount(); bool valid = (device >= 0) && (device < count); - + int beg = valid ? device : 0; int end = valid ? 
device+1 : count; - + printf("*** CUDA Device Query (Runtime API) version (CUDART static linking) *** \n\n"); printf("Device count: %d\n", count); - + int driverVersion = 0, runtimeVersion = 0; cudaSafeCall( cudaDriverGetVersion(&driverVersion) ); cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) ); - + const char *computeMode[] = { "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)", "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)", @@ -772,30 +730,30 @@ namespace cv { namespace gpu { namespace device "Unknown", NULL }; - + for(int dev = beg; dev < end; ++dev) { cudaDeviceProp prop; cudaSafeCall( cudaGetDeviceProperties(&prop, dev) ); - + printf("\nDevice %d: \"%s\"\n", dev, prop.name); printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100); printf(" CUDA Capability Major/Minor version number: %d.%d\n", prop.major, prop.minor); printf(" Total amount of global memory: %.0f MBytes (%llu bytes)\n", (float)prop.totalGlobalMem/1048576.0f, (unsigned long long) prop.totalGlobalMem); - + int cores = convertSMVer2Cores(prop.major, prop.minor); if (cores > 0) printf(" (%2d) Multiprocessors x (%2d) CUDA Cores/MP: %d CUDA Cores\n", prop.multiProcessorCount, cores, cores * prop.multiProcessorCount); - + printf(" GPU Clock Speed: %.2f GHz\n", prop.clockRate * 1e-6f); - + printf(" Max Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n", - prop.maxTexture1D, prop.maxTexture2D[0], prop.maxTexture2D[1], - prop.maxTexture3D[0], prop.maxTexture3D[1], prop.maxTexture3D[2]); + prop.maxTexture1D, prop.maxTexture2D[0], prop.maxTexture2D[1], + prop.maxTexture3D[0], prop.maxTexture3D[1], prop.maxTexture3D[2]); printf(" Max Layered Texture Size (dim) x layers 1D=(%d) x %d, 2D=(%d,%d) x %d\n", - prop.maxTexture1DLayered[0], prop.maxTexture1DLayered[1], - prop.maxTexture2DLayered[0], prop.maxTexture2DLayered[1], prop.maxTexture2DLayered[2]); - + prop.maxTexture1DLayered[0], prop.maxTexture1DLayered[1], + prop.maxTexture2DLayered[0], prop.maxTexture2DLayered[1], prop.maxTexture2DLayered[2]); + printf(" Total amount of constant memory: %u bytes\n", (int)prop.totalConstMem); printf(" Total amount of shared memory per block: %u bytes\n", (int)prop.sharedMemPerBlock); printf(" Total number of registers available per block: %d\n", prop.regsPerBlock); @@ -805,12 +763,12 @@ namespace cv { namespace gpu { namespace device printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]); printf(" Maximum memory pitch: %u bytes\n", (int)prop.memPitch); printf(" Texture alignment: %u bytes\n", (int)prop.textureAlignment); - + printf(" Concurrent copy and execution: %s with %d copy engine(s)\n", (prop.deviceOverlap ? "Yes" : "No"), prop.asyncEngineCount); printf(" Run time limit on kernels: %s\n", prop.kernelExecTimeoutEnabled ? "Yes" : "No"); printf(" Integrated GPU sharing Host Memory: %s\n", prop.integrated ? "Yes" : "No"); printf(" Support host page-locked memory mapping: %s\n", prop.canMapHostMemory ? "Yes" : "No"); - + printf(" Concurrent kernel execution: %s\n", prop.concurrentKernels ? "Yes" : "No"); printf(" Alignment requirement for Surfaces: %s\n", prop.surfaceAlignment ? "Yes" : "No"); printf(" Device has ECC support enabled: %s\n", prop.ECCEnabled ? 
"Yes" : "No"); @@ -820,7 +778,7 @@ namespace cv { namespace gpu { namespace device printf(" Compute Mode:\n"); printf(" %s \n", computeMode[prop.computeMode]); } - + printf("\n"); printf("deviceQuery, CUDA Driver = CUDART"); printf(", CUDA Driver Version = %d.%d", driverVersion / 1000, driverVersion % 100); @@ -828,37 +786,73 @@ namespace cv { namespace gpu { namespace device printf(", NumDevs = %d\n\n", count); fflush(stdout); } - + void printShortCudaDeviceInfo(int device) const { int count = getCudaEnabledDeviceCount(); bool valid = (device >= 0) && (device < count); - + int beg = valid ? device : 0; int end = valid ? device+1 : count; - + int driverVersion = 0, runtimeVersion = 0; cudaSafeCall( cudaDriverGetVersion(&driverVersion) ); cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) ); - + for(int dev = beg; dev < end; ++dev) { cudaDeviceProp prop; cudaSafeCall( cudaGetDeviceProperties(&prop, dev) ); - + const char *arch_str = prop.major < 2 ? " (not Fermi)" : ""; printf("Device %d: \"%s\" %.0fMb", dev, prop.name, (float)prop.totalGlobalMem/1048576.0f); printf(", sm_%d%d%s", prop.major, prop.minor, arch_str); - + int cores = convertSMVer2Cores(prop.major, prop.minor); if (cores > 0) printf(", %d cores", cores * prop.multiProcessorCount); - + printf(", Driver/Runtime ver.%d.%d/%d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100); } fflush(stdout); } - + + private: + int device_id_; + + std::string name_; + int multi_processor_count_; + int majorVersion_; + int minorVersion_; + + const CudaArch cudaArch; + + int convertSMVer2Cores(int major, int minor) const + { + // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM + typedef struct { + int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version + int Cores; + } SMtoCores; + + SMtoCores gpuArchCoresPerSM[] = { { 0x10, 8 }, { 0x11, 8 }, { 0x12, 8 }, { 0x13, 8 }, { 0x20, 32 }, { 0x21, 48 }, {0x30, 192}, {0x35, 192}, { -1, -1 } }; + + int index = 0; + while (gpuArchCoresPerSM[index].SM != -1) + { + if (gpuArchCoresPerSM[index].SM == ((major << 4) + minor) ) + return gpuArchCoresPerSM[index].Cores; + index++; + } + + return -1; + } + }; + + class CudaFuncTable : public GpuFuncTable + { + public: + void copy(const Mat& src, GpuMat& dst) const { cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyHostToDevice) ); From 037ffcdf99a821a5a8a3ea7a60b801244fbb93d9 Mon Sep 17 00:00:00 2001 From: Alexander Smorkalov Date: Thu, 19 Dec 2013 16:42:11 +0400 Subject: [PATCH 08/13] Dynamic CUDA support library reimplemented as OpenCV module. 
--- CMakeLists.txt | 2 - cmake/OpenCVModule.cmake | 2 +- modules/core/CMakeLists.txt | 60 +++++-------------- modules/core/cuda/CMakeLists.txt | 14 ----- modules/core/src/gpumat.cpp | 4 +- modules/dynamicuda/CMakeLists.txt | 14 +++++ .../opencv2/dynamicuda/dynamicuda.hpp} | 0 .../src/cuda/matrix_operations.cu | 0 .../{core/cuda => dynamicuda/src}/main.cpp | 4 +- modules/java/CMakeLists.txt | 6 ++ 10 files changed, 41 insertions(+), 65 deletions(-) delete mode 100644 modules/core/cuda/CMakeLists.txt create mode 100644 modules/dynamicuda/CMakeLists.txt rename modules/{core/src/gpumat_cuda.hpp => dynamicuda/include/opencv2/dynamicuda/dynamicuda.hpp} (100%) rename modules/{core => dynamicuda}/src/cuda/matrix_operations.cu (100%) rename modules/{core/cuda => dynamicuda/src}/main.cpp (96%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 56c176453..cf25084bc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -128,7 +128,6 @@ OCV_OPTION(WITH_1394 "Include IEEE1394 support" ON OCV_OPTION(WITH_AVFOUNDATION "Use AVFoundation for Video I/O" ON IF IOS) OCV_OPTION(WITH_CARBON "Use Carbon for UI instead of Cocoa" OFF IF APPLE ) OCV_OPTION(WITH_CUDA "Include NVidia Cuda Runtime support" ON IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT IOS) ) -OCV_OPTION(DYNAMIC_CUDA_SUPPORT "Make CUDA support dynamic" OFF IF (WITH_CUDA) AND NOT IOS AND NOT WINDOWS) OCV_OPTION(WITH_CUFFT "Include NVidia Cuda Fast Fourier Transform (FFT) library support" ON IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT IOS) ) OCV_OPTION(WITH_CUBLAS "Include NVidia Cuda Basic Linear Algebra Subprograms (BLAS) library support" OFF IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT IOS) ) OCV_OPTION(WITH_NVCUVID "Include NVidia Video Decoding library support" OFF IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT ANDROID AND NOT IOS AND NOT APPLE) ) @@ -842,7 +841,6 @@ if(HAVE_CUDA) status("") status(" NVIDIA CUDA") - status(" Dynamic CUDA support:" DYNAMIC_CUDA_SUPPORT THEN YES ELSE NO) status(" Use CUFFT:" HAVE_CUFFT THEN YES ELSE NO) status(" Use CUBLAS:" HAVE_CUBLAS THEN YES ELSE NO) status(" USE NVCUVID:" HAVE_NVCUVID THEN YES ELSE NO) diff --git a/cmake/OpenCVModule.cmake b/cmake/OpenCVModule.cmake index d7e7c4a1c..3dd749b05 100644 --- a/cmake/OpenCVModule.cmake +++ b/cmake/OpenCVModule.cmake @@ -488,7 +488,7 @@ macro(ocv_glob_module_sources) file(GLOB lib_cuda_srcs "src/cuda/*.cu") set(cuda_objs "") set(lib_cuda_hdrs "") - if(HAVE_CUDA AND lib_cuda_srcs) + if(HAVE_CUDA) ocv_include_directories(${CUDA_INCLUDE_DIRS}) file(GLOB lib_cuda_hdrs "src/cuda/*.hpp") diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt index 07fa08925..e89d6f276 100644 --- a/modules/core/CMakeLists.txt +++ b/modules/core/CMakeLists.txt @@ -1,50 +1,18 @@ set(the_description "The Core Functionality") -macro(ocv_glob_module_sources_no_cuda) - file(GLOB_RECURSE lib_srcs "src/*.cpp") - file(GLOB_RECURSE lib_int_hdrs "src/*.hpp" "src/*.h") - file(GLOB lib_hdrs "include/opencv2/${name}/*.hpp" "include/opencv2/${name}/*.h") - file(GLOB lib_hdrs_detail "include/opencv2/${name}/detail/*.hpp" "include/opencv2/${name}/detail/*.h") - - set(cuda_objs "") - set(lib_cuda_hdrs "") - if(HAVE_CUDA) - ocv_include_directories(${CUDA_INCLUDE_DIRS}) - file(GLOB lib_cuda_hdrs "src/cuda/*.hpp") - endif() - - source_group("Src" FILES ${lib_srcs} ${lib_int_hdrs}) - - file(GLOB cl_kernels "src/opencl/*.cl") - if(HAVE_opencv_ocl AND cl_kernels) - ocv_include_directories(${OPENCL_INCLUDE_DIRS}) - add_custom_command( - OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" 
"${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.hpp" - COMMAND ${CMAKE_COMMAND} -DCL_DIR="${CMAKE_CURRENT_SOURCE_DIR}/src/opencl" -DOUTPUT="${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" -P "${OpenCV_SOURCE_DIR}/cmake/cl2cpp.cmake" - DEPENDS ${cl_kernels} "${OpenCV_SOURCE_DIR}/cmake/cl2cpp.cmake") - source_group("OpenCL" FILES ${cl_kernels} "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.hpp") - list(APPEND lib_srcs ${cl_kernels} "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.hpp") - endif() - - source_group("Include" FILES ${lib_hdrs}) - source_group("Include\\detail" FILES ${lib_hdrs_detail}) - - ocv_set_module_sources(${ARGN} HEADERS ${lib_hdrs} ${lib_hdrs_detail} - SOURCES ${lib_srcs} ${lib_int_hdrs} ${cuda_objs} ${lib_cuda_hdrs}) -endmacro() - -if (DYNAMIC_CUDA_SUPPORT) +if (HAVE_opencv_dynamicuda) ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES}) else() ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES} ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY}) endif() -ocv_module_include_directories(${ZLIB_INCLUDE_DIR}) + +ocv_module_include_directories("${OpenCV_SOURCE_DIR}/modules/dynamicuda/include/" ${ZLIB_INCLUDE_DIR}) if(HAVE_WINRT) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /ZW /GS /Gm- /AI\"${WINDOWS_SDK_PATH}/References/CommonConfiguration/Neutral\" /AI\"${VISUAL_STUDIO_PATH}/vcpackages\"") endif() -if(DYNAMIC_CUDA_SUPPORT) +if(HAVE_opencv_dynamicuda) add_definitions(-DDYNAMIC_CUDA_SUPPORT) else() add_definitions(-DUSE_CUDA) @@ -58,15 +26,23 @@ endif() file(GLOB lib_cuda_hdrs "include/opencv2/${name}/cuda/*.hpp" "include/opencv2/${name}/cuda/*.h") file(GLOB lib_cuda_hdrs_detail "include/opencv2/${name}/cuda/detail/*.hpp" "include/opencv2/${name}/cuda/detail/*.h") +if (NOT HAVE_opencv_dynamicuda) + file(GLOB lib_cuda "../dynamicuda/src/cuda/*.cu*") +endif() + source_group("Cuda Headers" FILES ${lib_cuda_hdrs}) source_group("Cuda Headers\\Detail" FILES ${lib_cuda_hdrs_detail}) -if (DYNAMIC_CUDA_SUPPORT) - ocv_glob_module_sources_no_cuda(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc" - HEADERS ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail}) -else() +if (NOT HAVE_opencv_dynamicuda) + source_group("Src\\Cuda" FILES ${lib_cuda} ${lib_cuda_hdrs}) +endif() + +if (HAVE_opencv_dynamicuda) ocv_glob_module_sources(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc" HEADERS ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail}) +else() + ocv_glob_module_sources(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc" ${lib_cuda} + HEADERS ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail}) endif() ocv_create_module() @@ -74,7 +50,3 @@ ocv_add_precompiled_headers(${the_module}) ocv_add_accuracy_tests() ocv_add_perf_tests() - -if (DYNAMIC_CUDA_SUPPORT) - add_subdirectory(cuda) -endif() diff --git a/modules/core/cuda/CMakeLists.txt b/modules/core/cuda/CMakeLists.txt deleted file mode 100644 index 828e13b80..000000000 --- a/modules/core/cuda/CMakeLists.txt +++ /dev/null @@ -1,14 +0,0 @@ -project(opencv_core_cuda) -add_definitions(-DUSE_CUDA) -include_directories(${CUDA_INCLUDE_DIRS} - "../src/" - "../include/opencv2/core/" - "${OpenCV_SOURCE_DIR}/modules/gpu/include" - ) -ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef) -cuda_add_library(opencv_core_cuda SHARED main.cpp ../src/cuda/matrix_operations.cu) -if(BUILD_FAT_JAVA_LIB) - target_link_libraries(opencv_core_cuda ${OPENCV_BUILD_DIR}/${LIBRARY_OUTPUT_PATH}/libopencv_java.so ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY}) -else() - target_link_libraries(opencv_core_cuda 
${OPENCV_BUILD_DIR}/${LIBRARY_OUTPUT_PATH}/libopencv_core.so ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})
-endif()
\ No newline at end of file
diff --git a/modules/core/src/gpumat.cpp b/modules/core/src/gpumat.cpp
index 03dcad2af..590685b74 100644
--- a/modules/core/src/gpumat.cpp
+++ b/modules/core/src/gpumat.cpp
@@ -82,7 +82,7 @@ using namespace cv::gpu;
 
 #define throw_nogpu CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support")
 
-#include "gpumat_cuda.hpp"
+#include "opencv2/dynamicuda/dynamicuda.hpp"
 
 #ifdef DYNAMIC_CUDA_SUPPORT
 
@@ -183,7 +183,7 @@ static bool loadCudaSupportLib()
         dlclose(handle);
         return false;
     }
-
+
     gpuFactory = (GpuFactoryType)dlsym(handle, "gpuFactory");
     if (!gpuFactory)
     {
diff --git a/modules/dynamicuda/CMakeLists.txt b/modules/dynamicuda/CMakeLists.txt
new file mode 100644
index 000000000..2ae5cf84a
--- /dev/null
+++ b/modules/dynamicuda/CMakeLists.txt
@@ -0,0 +1,14 @@
+if(NOT ANDROID)
+  ocv_module_disable(dynamicuda)
+endif()
+
+set(the_description "Dynamic CUDA linkage")
+
+add_definitions(-DUSE_CUDA)
+ocv_module_include_directories("${OpenCV_SOURCE_DIR}/modules/gpu/include")
+set(OPENCV_MODULE_TYPE SHARED)
+if (BUILD_FAT_JAVA_LIB)
+  ocv_define_module(dynamicuda opencv_java PRIVATE_REQUIRED ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})
+else()
+  ocv_define_module(dynamicuda opencv_core PRIVATE_REQUIRED ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})
+endif()
diff --git a/modules/core/src/gpumat_cuda.hpp b/modules/dynamicuda/include/opencv2/dynamicuda/dynamicuda.hpp
similarity index 100%
rename from modules/core/src/gpumat_cuda.hpp
rename to modules/dynamicuda/include/opencv2/dynamicuda/dynamicuda.hpp
diff --git a/modules/core/src/cuda/matrix_operations.cu b/modules/dynamicuda/src/cuda/matrix_operations.cu
similarity index 100%
rename from modules/core/src/cuda/matrix_operations.cu
rename to modules/dynamicuda/src/cuda/matrix_operations.cu
diff --git a/modules/core/cuda/main.cpp b/modules/dynamicuda/src/main.cpp
similarity index 96%
rename from modules/core/cuda/main.cpp
rename to modules/dynamicuda/src/main.cpp
index 4f47dc7e9..4a05d8696 100644
--- a/modules/core/cuda/main.cpp
+++ b/modules/dynamicuda/src/main.cpp
@@ -27,7 +27,7 @@ using namespace cv::gpu;
 
 #define throw_nogpu CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support")
 
-#include "gpumat_cuda.hpp"
+#include "opencv2/dynamicuda/dynamicuda.hpp"
 
 #ifdef HAVE_CUDA
 static CudaDeviceInfoFuncTable deviceInfoTable;
@@ -38,7 +38,7 @@ static EmptyFuncTable gpuTable;
 #endif
 
 extern "C" {
-
+
 DeviceInfoFuncTable* deviceInfoFactory()
 {
     return (DeviceInfoFuncTable*)&deviceInfoTable;
diff --git a/modules/java/CMakeLists.txt b/modules/java/CMakeLists.txt
index 5012f914c..291295fb5 100644
--- a/modules/java/CMakeLists.txt
+++ b/modules/java/CMakeLists.txt
@@ -297,6 +297,12 @@ if(BUILD_FAT_JAVA_LIB)
       list(REMOVE_ITEM __deps ${m})
     endif()
   endforeach()
+  if (HAVE_opencv_dynamicuda)
+    list(REMOVE_ITEM __deps "opencv_dynamicuda")
+  endif()
+  if (ANDROID AND HAVE_opencv_gpu)
+    list(REMOVE_ITEM __deps "opencv_gpu")
+  endif()
   ocv_list_unique(__deps)
   set(__extradeps ${__deps})
   ocv_list_filterout(__extradeps "^opencv_")

From 5a5c82bb1d395aeb76bd76f14a1db22742c02599 Mon Sep 17 00:00:00 2001
From: Alexander Smorkalov
Date: Thu, 19 Dec 2013 17:41:04 +0400
Subject: [PATCH 09/13] Additional ENABLE_DYNAMIC_CUDA option implemented in
 cmake. Warning fixes and refactoring.
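All build flavours now meet in core's table selector: dispatch through the dlopen()'d opencv_dynamicuda module, the statically linked CUDA tables, or the empty stubs. A self-contained sketch of that selection (tryLoadDynamicTable() is a hypothetical stand-in for the loadCudaSupportLib()/gpuFactory() pair from patch 05; when built with USE_CUDA instead, the non-dynamic branch returns a CudaFuncTable rather than the stubs):

    struct GpuFuncTable { virtual ~GpuFuncTable() {} };
    struct EmptyFuncTable : GpuFuncTable {};

    // Stand-in for the dlopen()-based loader: returns the dynamically loaded
    // table, or 0 when the opencv_dynamicuda module is absent.
    static GpuFuncTable* tryLoadDynamicTable() { return 0; }

    static GpuFuncTable* gpuFuncTable()
    {
    #ifdef DYNAMIC_CUDA_SUPPORT
        static GpuFuncTable* dynTable = tryLoadDynamicTable();
        static EmptyFuncTable stubTable;
        return dynTable ? dynTable : &stubTable;  // stubs if the module is missing
    #else
        static EmptyFuncTable funcTable;          // or CudaFuncTable under USE_CUDA
        return &funcTable;
    #endif
    }

Keeping the fallback path inside the selector means a device without the CUDA module degrades to the throw_nogpu stubs instead of failing to load opencv_core at all.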
--- CMakeLists.txt | 1 + modules/core/CMakeLists.txt | 14 +- modules/dynamicuda/CMakeLists.txt | 1 + .../include/opencv2/dynamicuda/dynamicuda.hpp | 1899 +++++++++-------- modules/dynamicuda/src/main.cpp | 3 + modules/java/CMakeLists.txt | 2 +- 6 files changed, 969 insertions(+), 951 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index cf25084bc..2c5165c1e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -201,6 +201,7 @@ OCV_OPTION(INSTALL_TO_MANGLED_PATHS "Enables mangled install paths, that help wi # OpenCV build options # =================================================== +OCV_OPTION(ENABLE_DYNAMIC_CUDA "Enabled dynamic CUDA linkage" ON IF ANDROID OR LINUX) OCV_OPTION(ENABLE_PRECOMPILED_HEADERS "Use precompiled headers" ON IF (NOT IOS) ) OCV_OPTION(ENABLE_SOLUTION_FOLDERS "Solution folder in Visual Studio or in other IDEs" (MSVC_IDE OR CMAKE_GENERATOR MATCHES Xcode) IF (CMAKE_VERSION VERSION_GREATER "2.8.0") ) OCV_OPTION(ENABLE_PROFILING "Enable profiling in the GCC compiler (Add flags: -g -pg)" OFF IF CMAKE_COMPILER_IS_GNUCXX ) diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt index e89d6f276..f20e32d3a 100644 --- a/modules/core/CMakeLists.txt +++ b/modules/core/CMakeLists.txt @@ -1,8 +1,12 @@ set(the_description "The Core Functionality") -if (HAVE_opencv_dynamicuda) +message(STATUS "ENABLE_DYNAMIC_CUDA ${ENABLE_DYNAMIC_CUDA}") + +if (ENABLE_DYNAMIC_CUDA) + message(STATUS "Using dynamic cuda approach") ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES}) else() + message(STATUS "Link CUDA statically") ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES} ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY}) endif() @@ -12,7 +16,7 @@ if(HAVE_WINRT) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /ZW /GS /Gm- /AI\"${WINDOWS_SDK_PATH}/References/CommonConfiguration/Neutral\" /AI\"${VISUAL_STUDIO_PATH}/vcpackages\"") endif() -if(HAVE_opencv_dynamicuda) +if(ENABLE_DYNAMIC_CUDA) add_definitions(-DDYNAMIC_CUDA_SUPPORT) else() add_definitions(-DUSE_CUDA) @@ -26,18 +30,18 @@ endif() file(GLOB lib_cuda_hdrs "include/opencv2/${name}/cuda/*.hpp" "include/opencv2/${name}/cuda/*.h") file(GLOB lib_cuda_hdrs_detail "include/opencv2/${name}/cuda/detail/*.hpp" "include/opencv2/${name}/cuda/detail/*.h") -if (NOT HAVE_opencv_dynamicuda) +if (NOT ENABLE_DYNAMIC_CUDA) file(GLOB lib_cuda "../dynamicuda/src/cuda/*.cu*") endif() source_group("Cuda Headers" FILES ${lib_cuda_hdrs}) source_group("Cuda Headers\\Detail" FILES ${lib_cuda_hdrs_detail}) -if (NOT HAVE_opencv_dynamicuda) +if (NOT ENABLE_DYNAMIC_CUDA) source_group("Src\\Cuda" FILES ${lib_cuda} ${lib_cuda_hdrs}) endif() -if (HAVE_opencv_dynamicuda) +if (ENABLE_DYNAMIC_CUDA) ocv_glob_module_sources(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc" HEADERS ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail}) else() diff --git a/modules/dynamicuda/CMakeLists.txt b/modules/dynamicuda/CMakeLists.txt index 2ae5cf84a..def05d19b 100644 --- a/modules/dynamicuda/CMakeLists.txt +++ b/modules/dynamicuda/CMakeLists.txt @@ -5,6 +5,7 @@ endif() set(the_description "Dynamic CUDA linkage") add_definitions(-DUSE_CUDA) +ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef) ocv_module_include_directories("${OpenCV_SOURCE_DIR}/modules/gpu/include") set(OPENCV_MODULE_TYPE SHARED) if (BUILD_FAT_JAVA_LIB) diff --git a/modules/dynamicuda/include/opencv2/dynamicuda/dynamicuda.hpp b/modules/dynamicuda/include/opencv2/dynamicuda/dynamicuda.hpp index 9281655d7..4f5175513 100644 --- a/modules/dynamicuda/include/opencv2/dynamicuda/dynamicuda.hpp +++ 
b/modules/dynamicuda/include/opencv2/dynamicuda/dynamicuda.hpp @@ -1,123 +1,123 @@ #ifndef __GPUMAT_CUDA_HPP__ #define __GPUMAT_CUDA_HPP__ - class DeviceInfoFuncTable - { - public: - // cv::DeviceInfo - virtual size_t sharedMemPerBlock() const = 0; - virtual void queryMemory(size_t&, size_t&) const = 0; - virtual size_t freeMemory() const = 0; - virtual size_t totalMemory() const = 0; - virtual bool supports(FeatureSet) const = 0; - virtual bool isCompatible() const = 0; - virtual void query() = 0; - virtual int deviceID() const = 0; - virtual std::string name() const = 0; - virtual int majorVersion() const = 0; - virtual int minorVersion() const = 0; - virtual int multiProcessorCount() const = 0; - virtual int getCudaEnabledDeviceCount() const = 0; - virtual void setDevice(int) const = 0; - virtual int getDevice() const = 0; - virtual void resetDevice() const = 0; - virtual bool deviceSupports(FeatureSet) const = 0; +class DeviceInfoFuncTable +{ +public: + // cv::DeviceInfo + virtual size_t sharedMemPerBlock() const = 0; + virtual void queryMemory(size_t&, size_t&) const = 0; + virtual size_t freeMemory() const = 0; + virtual size_t totalMemory() const = 0; + virtual bool supports(FeatureSet) const = 0; + virtual bool isCompatible() const = 0; + virtual void query() = 0; + virtual int deviceID() const = 0; + virtual std::string name() const = 0; + virtual int majorVersion() const = 0; + virtual int minorVersion() const = 0; + virtual int multiProcessorCount() const = 0; + virtual int getCudaEnabledDeviceCount() const = 0; + virtual void setDevice(int) const = 0; + virtual int getDevice() const = 0; + virtual void resetDevice() const = 0; + virtual bool deviceSupports(FeatureSet) const = 0; - // cv::TargetArchs - virtual bool builtWith(FeatureSet) const = 0; - virtual bool has(int, int) const = 0; - virtual bool hasPtx(int, int) const = 0; - virtual bool hasBin(int, int) const = 0; - virtual bool hasEqualOrLessPtx(int, int) const = 0; - virtual bool hasEqualOrGreater(int, int) const = 0; - virtual bool hasEqualOrGreaterPtx(int, int) const = 0; - virtual bool hasEqualOrGreaterBin(int, int) const = 0; + // cv::TargetArchs + virtual bool builtWith(FeatureSet) const = 0; + virtual bool has(int, int) const = 0; + virtual bool hasPtx(int, int) const = 0; + virtual bool hasBin(int, int) const = 0; + virtual bool hasEqualOrLessPtx(int, int) const = 0; + virtual bool hasEqualOrGreater(int, int) const = 0; + virtual bool hasEqualOrGreaterPtx(int, int) const = 0; + virtual bool hasEqualOrGreaterBin(int, int) const = 0; - virtual void printCudaDeviceInfo(int) const = 0; - virtual void printShortCudaDeviceInfo(int) const = 0; + virtual void printCudaDeviceInfo(int) const = 0; + virtual void printShortCudaDeviceInfo(int) const = 0; - virtual ~DeviceInfoFuncTable() {}; - }; + virtual ~DeviceInfoFuncTable() {}; +}; - class GpuFuncTable - { - public: - virtual ~GpuFuncTable() {} +class GpuFuncTable +{ +public: + virtual ~GpuFuncTable() {} - // GpuMat routines - virtual void copy(const Mat& src, GpuMat& dst) const = 0; - virtual void copy(const GpuMat& src, Mat& dst) const = 0; - virtual void copy(const GpuMat& src, GpuMat& dst) const = 0; + // GpuMat routines + virtual void copy(const Mat& src, GpuMat& dst) const = 0; + virtual void copy(const GpuMat& src, Mat& dst) const = 0; + virtual void copy(const GpuMat& src, GpuMat& dst) const = 0; - virtual void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const = 0; + virtual void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) 
const = 0; - // gpu::device::convertTo funcs - virtual void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0) const = 0; - virtual void convert(const GpuMat& src, GpuMat& dst) const = 0; + // gpu::device::convertTo funcs + virtual void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0) const = 0; + virtual void convert(const GpuMat& src, GpuMat& dst) const = 0; - // for gpu::device::setTo funcs - virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, CUstream_st*) const = 0; + // for gpu::device::setTo funcs + virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, CUstream_st*) const = 0; - virtual void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const = 0; - virtual void free(void* devPtr) const = 0; - }; + virtual void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const = 0; + virtual void free(void* devPtr) const = 0; +}; - class EmptyDeviceInfoFuncTable: public DeviceInfoFuncTable - { - public: - size_t sharedMemPerBlock() const { throw_nogpu; return 0; } - void queryMemory(size_t&, size_t&) const { throw_nogpu; } - size_t freeMemory() const { throw_nogpu; return 0; } - size_t totalMemory() const { throw_nogpu; return 0; } - bool supports(FeatureSet) const { throw_nogpu; return false; } - bool isCompatible() const { throw_nogpu; return false; } - void query() { throw_nogpu; } - int deviceID() const { throw_nogpu; return -1; }; - std::string name() const { throw_nogpu; return std::string(); } - int majorVersion() const { throw_nogpu; return -1; } - int minorVersion() const { throw_nogpu; return -1; } - int multiProcessorCount() const { throw_nogpu; return -1; } +class EmptyDeviceInfoFuncTable: public DeviceInfoFuncTable +{ +public: + size_t sharedMemPerBlock() const { throw_nogpu; return 0; } + void queryMemory(size_t&, size_t&) const { throw_nogpu; } + size_t freeMemory() const { throw_nogpu; return 0; } + size_t totalMemory() const { throw_nogpu; return 0; } + bool supports(FeatureSet) const { throw_nogpu; return false; } + bool isCompatible() const { throw_nogpu; return false; } + void query() { throw_nogpu; } + int deviceID() const { throw_nogpu; return -1; }; + std::string name() const { throw_nogpu; return std::string(); } + int majorVersion() const { throw_nogpu; return -1; } + int minorVersion() const { throw_nogpu; return -1; } + int multiProcessorCount() const { throw_nogpu; return -1; } - int getCudaEnabledDeviceCount() const { return 0; } + int getCudaEnabledDeviceCount() const { return 0; } - void setDevice(int) const { throw_nogpu; } - int getDevice() const { throw_nogpu; return 0; } + void setDevice(int) const { throw_nogpu; } + int getDevice() const { throw_nogpu; return 0; } - void resetDevice() const { throw_nogpu; } + void resetDevice() const { throw_nogpu; } - bool deviceSupports(FeatureSet) const { throw_nogpu; return false; } + bool deviceSupports(FeatureSet) const { throw_nogpu; return false; } - bool builtWith(FeatureSet) const { throw_nogpu; return false; } - bool has(int, int) const { throw_nogpu; return false; } - bool hasPtx(int, int) const { throw_nogpu; return false; } - bool hasBin(int, int) const { throw_nogpu; return false; } - bool hasEqualOrLessPtx(int, int) const { throw_nogpu; return false; } - bool hasEqualOrGreater(int, int) const { throw_nogpu; return false; } - bool hasEqualOrGreaterPtx(int, int) const { throw_nogpu; return false; } - bool hasEqualOrGreaterBin(int, int) const { 
throw_nogpu; return false; } + bool builtWith(FeatureSet) const { throw_nogpu; return false; } + bool has(int, int) const { throw_nogpu; return false; } + bool hasPtx(int, int) const { throw_nogpu; return false; } + bool hasBin(int, int) const { throw_nogpu; return false; } + bool hasEqualOrLessPtx(int, int) const { throw_nogpu; return false; } + bool hasEqualOrGreater(int, int) const { throw_nogpu; return false; } + bool hasEqualOrGreaterPtx(int, int) const { throw_nogpu; return false; } + bool hasEqualOrGreaterBin(int, int) const { throw_nogpu; return false; } - void printCudaDeviceInfo(int) const { throw_nogpu; } - void printShortCudaDeviceInfo(int) const { throw_nogpu; } - }; + void printCudaDeviceInfo(int) const { throw_nogpu; } + void printShortCudaDeviceInfo(int) const { throw_nogpu; } +}; - class EmptyFuncTable : public GpuFuncTable - { - public: +class EmptyFuncTable : public GpuFuncTable +{ +public: - void copy(const Mat&, GpuMat&) const { throw_nogpu; } - void copy(const GpuMat&, Mat&) const { throw_nogpu; } - void copy(const GpuMat&, GpuMat&) const { throw_nogpu; } + void copy(const Mat&, GpuMat&) const { throw_nogpu; } + void copy(const GpuMat&, Mat&) const { throw_nogpu; } + void copy(const GpuMat&, GpuMat&) const { throw_nogpu; } - void copyWithMask(const GpuMat&, GpuMat&, const GpuMat&) const { throw_nogpu; } + void copyWithMask(const GpuMat&, GpuMat&, const GpuMat&) const { throw_nogpu; } - void convert(const GpuMat&, GpuMat&) const { throw_nogpu; } - void convert(const GpuMat&, GpuMat&, double, double, cudaStream_t stream = 0) const { (void)stream; throw_nogpu; } + void convert(const GpuMat&, GpuMat&) const { throw_nogpu; } + void convert(const GpuMat&, GpuMat&, double, double, cudaStream_t stream = 0) const { (void)stream; throw_nogpu; } - virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, CUstream_st*) const { throw_nogpu; } + virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, CUstream_st*) const { throw_nogpu; } - void mallocPitch(void**, size_t*, size_t, size_t) const { throw_nogpu; } - void free(void*) const {} - }; + void mallocPitch(void**, size_t*, size_t, size_t) const { throw_nogpu; } + void free(void*) const {} +}; #if defined(USE_CUDA) @@ -153,940 +153,949 @@ namespace cv { namespace gpu { namespace device void convert_gpu(PtrStepSzb src, int sdepth, PtrStepSzb dst, int ddepth, double alpha, double beta, cudaStream_t stream); }}} - template void kernelSetCaller(GpuMat& src, Scalar s, cudaStream_t stream) +template void kernelSetCaller(GpuMat& src, Scalar s, cudaStream_t stream) +{ + Scalar_ sf = s; + cv::gpu::device::set_to_gpu(src, sf.val, src.channels(), stream); +} + +template void kernelSetCaller(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream) +{ + Scalar_ sf = s; + cv::gpu::device::set_to_gpu(src, sf.val, mask, src.channels(), stream); +} + +template struct NPPTypeTraits; +template<> struct NPPTypeTraits { typedef Npp8u npp_type; }; +template<> struct NPPTypeTraits { typedef Npp8s npp_type; }; +template<> struct NPPTypeTraits { typedef Npp16u npp_type; }; +template<> struct NPPTypeTraits { typedef Npp16s npp_type; }; +template<> struct NPPTypeTraits { typedef Npp32s npp_type; }; +template<> struct NPPTypeTraits { typedef Npp32f npp_type; }; +template<> struct NPPTypeTraits { typedef Npp64f npp_type; }; + +////////////////////////////////////////////////////////////////////////// +// Convert + +template struct NppConvertFunc +{ + typedef typename NPPTypeTraits::npp_type src_t; + typedef typename 
NPPTypeTraits::npp_type dst_t; + + typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI); +}; +template struct NppConvertFunc +{ + typedef typename NPPTypeTraits::npp_type dst_t; + + typedef NppStatus (*func_ptr)(const Npp32f* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI, NppRoundMode eRoundMode); +}; + +template::func_ptr func> struct NppCvt +{ + typedef typename NPPTypeTraits::npp_type src_t; + typedef typename NPPTypeTraits::npp_type dst_t; + + static void call(const GpuMat& src, GpuMat& dst) { - Scalar_ sf = s; - cv::gpu::device::set_to_gpu(src, sf.val, src.channels(), stream); + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz) ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } +}; + +template::func_ptr func> struct NppCvt +{ + typedef typename NPPTypeTraits::npp_type dst_t; + + static void call(const GpuMat& src, GpuMat& dst) + { + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz, NPP_RND_NEAR) ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } +}; + +////////////////////////////////////////////////////////////////////////// +// Set + +template struct NppSetFunc +{ + typedef typename NPPTypeTraits::npp_type src_t; + + typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI); +}; +template struct NppSetFunc +{ + typedef typename NPPTypeTraits::npp_type src_t; + + typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI); +}; +template struct NppSetFunc +{ + typedef NppStatus (*func_ptr)(Npp8s values[], Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI); +}; +template<> struct NppSetFunc +{ + typedef NppStatus (*func_ptr)(Npp8s val, Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI); +}; + +template::func_ptr func> struct NppSet +{ + typedef typename NPPTypeTraits::npp_type src_t; + + static void call(GpuMat& src, Scalar s) + { + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + Scalar_ nppS = s; + + nppSafeCall( func(nppS.val, src.ptr(), static_cast(src.step), sz) ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } +}; +template::func_ptr func> struct NppSet +{ + typedef typename NPPTypeTraits::npp_type src_t; + + static void call(GpuMat& src, Scalar s) + { + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + Scalar_ nppS = s; + + nppSafeCall( func(nppS[0], src.ptr(), static_cast(src.step), sz) ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } +}; + +template struct NppSetMaskFunc +{ + typedef typename NPPTypeTraits::npp_type src_t; + + typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); +}; +template struct NppSetMaskFunc +{ + typedef typename NPPTypeTraits::npp_type src_t; + + typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); +}; + +template::func_ptr func> struct NppSetMask +{ + typedef typename NPPTypeTraits::npp_type src_t; + + static void call(GpuMat& src, Scalar s, const GpuMat& mask) + { + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + Scalar_ nppS = s; + + nppSafeCall( func(nppS.val, src.ptr(), static_cast(src.step), sz, mask.ptr(), static_cast(mask.step)) ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } +}; +template::func_ptr func> 
struct NppSetMask +{ + typedef typename NPPTypeTraits::npp_type src_t; + + static void call(GpuMat& src, Scalar s, const GpuMat& mask) + { + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + Scalar_ nppS = s; + + nppSafeCall( func(nppS[0], src.ptr(), static_cast(src.step), sz, mask.ptr(), static_cast(mask.step)) ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } +}; + +////////////////////////////////////////////////////////////////////////// +// CopyMasked + +template struct NppCopyMaskedFunc +{ + typedef typename NPPTypeTraits::npp_type src_t; + + typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, src_t* pDst, int nDstStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); +}; + +template::func_ptr func> struct NppCopyMasked +{ + typedef typename NPPTypeTraits::npp_type src_t; + + static void call(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t /*stream*/) + { + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz, mask.ptr(), static_cast(mask.step)) ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } +}; + +template static inline bool isAligned(const T* ptr, size_t size) +{ + return reinterpret_cast(ptr) % size == 0; +} + +namespace cv { namespace gpu { namespace device +{ + void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream = 0); + void convertTo(const GpuMat& src, GpuMat& dst); + void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0); + void setTo(GpuMat& src, Scalar s, cudaStream_t stream); + void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream); + void setTo(GpuMat& src, Scalar s); + void setTo(GpuMat& src, Scalar s, const GpuMat& mask); + + void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream) + { + CV_Assert(src.size() == dst.size() && src.type() == dst.type()); + CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels())); + + cv::gpu::device::copyToWithMask_gpu(src.reshape(1), dst.reshape(1), src.elemSize1(), src.channels(), mask.reshape(1), mask.channels() != 1, stream); } - template void kernelSetCaller(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream) + void convertTo(const GpuMat& src, GpuMat& dst) { - Scalar_ sf = s; - cv::gpu::device::set_to_gpu(src, sf.val, mask, src.channels(), stream); + cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0, 0); } - template struct NPPTypeTraits; - template<> struct NPPTypeTraits { typedef Npp8u npp_type; }; - template<> struct NPPTypeTraits { typedef Npp8s npp_type; }; - template<> struct NPPTypeTraits { typedef Npp16u npp_type; }; - template<> struct NPPTypeTraits { typedef Npp16s npp_type; }; - template<> struct NPPTypeTraits { typedef Npp32s npp_type; }; - template<> struct NPPTypeTraits { typedef Npp32f npp_type; }; - template<> struct NPPTypeTraits { typedef Npp64f npp_type; }; - - ////////////////////////////////////////////////////////////////////////// - // Convert - - template struct NppConvertFunc + void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream) { - typedef typename NPPTypeTraits::npp_type src_t; - typedef typename NPPTypeTraits::npp_type dst_t; - - typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI); - }; - template 
struct NppConvertFunc - { - typedef typename NPPTypeTraits::npp_type dst_t; - - typedef NppStatus (*func_ptr)(const Npp32f* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI, NppRoundMode eRoundMode); - }; - - template::func_ptr func> struct NppCvt - { - typedef typename NPPTypeTraits::npp_type src_t; - typedef typename NPPTypeTraits::npp_type dst_t; - - static void call(const GpuMat& src, GpuMat& dst) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - - template::func_ptr func> struct NppCvt - { - typedef typename NPPTypeTraits::npp_type dst_t; - - static void call(const GpuMat& src, GpuMat& dst) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz, NPP_RND_NEAR) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - - ////////////////////////////////////////////////////////////////////////// - // Set - - template struct NppSetFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - - typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI); - }; - template struct NppSetFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - - typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI); - }; - template struct NppSetFunc - { - typedef NppStatus (*func_ptr)(Npp8s values[], Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI); - }; - template<> struct NppSetFunc - { - typedef NppStatus (*func_ptr)(Npp8s val, Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI); - }; - - template::func_ptr func> struct NppSet - { - typedef typename NPPTypeTraits::npp_type src_t; - - static void call(GpuMat& src, Scalar s) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - Scalar_ nppS = s; - - nppSafeCall( func(nppS.val, src.ptr(), static_cast(src.step), sz) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - template::func_ptr func> struct NppSet - { - typedef typename NPPTypeTraits::npp_type src_t; - - static void call(GpuMat& src, Scalar s) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - Scalar_ nppS = s; - - nppSafeCall( func(nppS[0], src.ptr(), static_cast(src.step), sz) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - - template struct NppSetMaskFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - - typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); - }; - template struct NppSetMaskFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - - typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); - }; - - template::func_ptr func> struct NppSetMask - { - typedef typename NPPTypeTraits::npp_type src_t; - - static void call(GpuMat& src, Scalar s, const GpuMat& mask) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - Scalar_ nppS = s; - - nppSafeCall( func(nppS.val, src.ptr(), static_cast(src.step), sz, mask.ptr(), static_cast(mask.step)) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - template::func_ptr func> struct NppSetMask - { - typedef typename NPPTypeTraits::npp_type src_t; - - static void call(GpuMat& src, Scalar s, const GpuMat& mask) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - 
Scalar_ nppS = s; - - nppSafeCall( func(nppS[0], src.ptr(), static_cast(src.step), sz, mask.ptr(), static_cast(mask.step)) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - - ////////////////////////////////////////////////////////////////////////// - // CopyMasked - - template struct NppCopyMaskedFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - - typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, src_t* pDst, int nDstStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); - }; - - template::func_ptr func> struct NppCopyMasked - { - typedef typename NPPTypeTraits::npp_type src_t; - - static void call(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t /*stream*/) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz, mask.ptr(), static_cast(mask.step)) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - - template static inline bool isAligned(const T* ptr, size_t size) - { - return reinterpret_cast(ptr) % size == 0; + cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), alpha, beta, stream); } - namespace cv { namespace gpu { namespace device + void setTo(GpuMat& src, Scalar s, cudaStream_t stream) { - void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream = 0) + typedef void (*caller_t)(GpuMat& src, Scalar s, cudaStream_t stream); + + static const caller_t callers[] = { - CV_Assert(src.size() == dst.size() && src.type() == dst.type()); - CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels())); + kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, + kernelSetCaller, kernelSetCaller + }; - cv::gpu::device::copyToWithMask_gpu(src.reshape(1), dst.reshape(1), src.elemSize1(), src.channels(), mask.reshape(1), mask.channels() != 1, stream); - } + callers[src.depth()](src, s, stream); + } - void convertTo(const GpuMat& src, GpuMat& dst) - { - cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0, 0); - } - - void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0) - { - cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), alpha, beta, stream); - } - - void setTo(GpuMat& src, Scalar s, cudaStream_t stream) - { - typedef void (*caller_t)(GpuMat& src, Scalar s, cudaStream_t stream); - - static const caller_t callers[] = - { - kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, - kernelSetCaller, kernelSetCaller - }; - - callers[src.depth()](src, s, stream); - } - - void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream) - { - typedef void (*caller_t)(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream); - - static const caller_t callers[] = - { - kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, - kernelSetCaller, kernelSetCaller - }; - - callers[src.depth()](src, s, mask, stream); - } - - void setTo(GpuMat& src, Scalar s) - { - setTo(src, s, 0); - } - - void setTo(GpuMat& src, Scalar s, const GpuMat& mask) - { - setTo(src, s, mask, 0); - } - }}} - - - class CudaArch + void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream) { - public: - CudaArch() + typedef void (*caller_t)(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream); + 
+ static const caller_t callers[] = { - fromStr(CUDA_ARCH_BIN, bin); - fromStr(CUDA_ARCH_PTX, ptx); - fromStr(CUDA_ARCH_FEATURES, features); - } + kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, + kernelSetCaller, kernelSetCaller + }; - bool builtWith(FeatureSet feature_set) const - { - return !features.empty() && (features.back() >= feature_set); - } + callers[src.depth()](src, s, mask, stream); + } - bool hasPtx(int major, int minor) const - { - return find(ptx.begin(), ptx.end(), major * 10 + minor) != ptx.end(); - } - - bool hasBin(int major, int minor) const - { - return find(bin.begin(), bin.end(), major * 10 + minor) != bin.end(); - } - - bool hasEqualOrLessPtx(int major, int minor) const - { - return !ptx.empty() && (ptx.front() <= major * 10 + minor); - } - - bool hasEqualOrGreaterPtx(int major, int minor) const - { - return !ptx.empty() && (ptx.back() >= major * 10 + minor); - } - - bool hasEqualOrGreaterBin(int major, int minor) const - { - return !bin.empty() && (bin.back() >= major * 10 + minor); - } - - - private: - void fromStr(const string& set_as_str, vector& arr) - { - if (set_as_str.find_first_not_of(" ") == string::npos) - return; - - istringstream stream(set_as_str); - int cur_value; - - while (!stream.eof()) - { - stream >> cur_value; - arr.push_back(cur_value); - } - - sort(arr.begin(), arr.end()); - } - - vector bin; - vector ptx; - vector features; - }; - - class DeviceProps + void setTo(GpuMat& src, Scalar s) { - public: - DeviceProps() - { - props_.resize(10, 0); - } + setTo(src, s, 0); + } - ~DeviceProps() - { - for (size_t i = 0; i < props_.size(); ++i) - { - if (props_[i]) - delete props_[i]; - } - props_.clear(); - } - - cudaDeviceProp* get(int devID) - { - if (devID >= (int) props_.size()) - props_.resize(devID + 5, 0); - - if (!props_[devID]) - { - props_[devID] = new cudaDeviceProp; - cudaSafeCall( cudaGetDeviceProperties(props_[devID], devID) ); - } - - return props_[devID]; - } - private: - std::vector props_; - }; - - DeviceProps deviceProps; - - class CudaDeviceInfoFuncTable: DeviceInfoFuncTable + void setTo(GpuMat& src, Scalar s, const GpuMat& mask) { - public: - size_t sharedMemPerBlock() const + setTo(src, s, mask, 0); + } +}}} + +class CudaArch +{ +public: + CudaArch() + { + fromStr(CUDA_ARCH_BIN, bin); + fromStr(CUDA_ARCH_PTX, ptx); + fromStr(CUDA_ARCH_FEATURES, features); + } + + bool builtWith(FeatureSet feature_set) const + { + return !features.empty() && (features.back() >= feature_set); + } + + bool hasPtx(int major, int minor) const + { + return find(ptx.begin(), ptx.end(), major * 10 + minor) != ptx.end(); + } + + bool hasBin(int major, int minor) const + { + return find(bin.begin(), bin.end(), major * 10 + minor) != bin.end(); + } + + bool hasEqualOrLessPtx(int major, int minor) const + { + return !ptx.empty() && (ptx.front() <= major * 10 + minor); + } + + bool hasEqualOrGreaterPtx(int major, int minor) const + { + return !ptx.empty() && (ptx.back() >= major * 10 + minor); + } + + bool hasEqualOrGreaterBin(int major, int minor) const + { + return !bin.empty() && (bin.back() >= major * 10 + minor); + } + + +private: + void fromStr(const string& set_as_str, vector& arr) + { + if (set_as_str.find_first_not_of(" ") == string::npos) + return; + + istringstream stream(set_as_str); + int cur_value; + + while (!stream.eof()) { - return deviceProps.get(device_id_)->sharedMemPerBlock; + stream >> cur_value; + arr.push_back(cur_value); } - void queryMemory(size_t& _totalMemory, size_t& _freeMemory) const + 
sort(arr.begin(), arr.end()); + } + + vector bin; + vector ptx; + vector features; +}; + +class DeviceProps +{ +public: + DeviceProps() + { + props_.resize(10, 0); + } + + ~DeviceProps() + { + for (size_t i = 0; i < props_.size(); ++i) { - int prevDeviceID = getDevice(); - if (prevDeviceID != device_id_) - setDevice(device_id_); + if (props_[i]) + delete props_[i]; + } + props_.clear(); + } - cudaSafeCall( cudaMemGetInfo(&_freeMemory, &_totalMemory) ); + cudaDeviceProp* get(int devID) + { + if (devID >= (int) props_.size()) + props_.resize(devID + 5, 0); - if (prevDeviceID != device_id_) - setDevice(prevDeviceID); + if (!props_[devID]) + { + props_[devID] = new cudaDeviceProp; + cudaSafeCall( cudaGetDeviceProperties(props_[devID], devID) ); } - size_t freeMemory() const - { - size_t _totalMemory, _freeMemory; - queryMemory(_totalMemory, _freeMemory); - return _freeMemory; - } + return props_[devID]; + } +private: + std::vector props_; +}; - size_t totalMemory() const - { - size_t _totalMemory, _freeMemory; - queryMemory(_totalMemory, _freeMemory); - return _totalMemory; - } +DeviceProps deviceProps; - bool supports(FeatureSet feature_set) const - { - int version = majorVersion_ * 10 + minorVersion_; - return version >= feature_set; - } +class CudaDeviceInfoFuncTable: DeviceInfoFuncTable +{ +public: + size_t sharedMemPerBlock() const + { + return deviceProps.get(device_id_)->sharedMemPerBlock; + } - bool isCompatible() const - { - // Check PTX compatibility - if (hasEqualOrLessPtx(majorVersion_, minorVersion_)) - return true; + void queryMemory(size_t& _totalMemory, size_t& _freeMemory) const + { + int prevDeviceID = getDevice(); + if (prevDeviceID != device_id_) + setDevice(device_id_); - // Check BIN compatibility - for (int i = minorVersion_; i >= 0; --i) - if (hasBin(majorVersion_, i)) - return true; + cudaSafeCall( cudaMemGetInfo(&_freeMemory, &_totalMemory) ); - return false; - } + if (prevDeviceID != device_id_) + setDevice(prevDeviceID); + } - void query() - { - const cudaDeviceProp* prop = deviceProps.get(device_id_); + size_t freeMemory() const + { + size_t _totalMemory, _freeMemory; + queryMemory(_totalMemory, _freeMemory); + return _freeMemory; + } - name_ = prop->name; - multi_processor_count_ = prop->multiProcessorCount; - majorVersion_ = prop->major; - minorVersion_ = prop->minor; - } + size_t totalMemory() const + { + size_t _totalMemory, _freeMemory; + queryMemory(_totalMemory, _freeMemory); + return _totalMemory; + } - int deviceID() const - { - return device_id_; - } + bool supports(FeatureSet feature_set) const + { + int version = majorVersion_ * 10 + minorVersion_; + return version >= feature_set; + } - std::string name() const - { - return name_; - } + bool isCompatible() const + { + // Check PTX compatibility + if (hasEqualOrLessPtx(majorVersion_, minorVersion_)) + return true; - int majorVersion() const - { - return majorVersion_; - } + // Check BIN compatibility + for (int i = minorVersion_; i >= 0; --i) + if (hasBin(majorVersion_, i)) + return true; - int minorVersion() const - { - return minorVersion_; - } + return false; + } - int multiProcessorCount() const - { - return multi_processor_count_; - } + void query() + { + const cudaDeviceProp* prop = deviceProps.get(device_id_); - int getCudaEnabledDeviceCount() const - { - int count; - cudaError_t error = cudaGetDeviceCount( &count ); + name_ = prop->name; + multi_processor_count_ = prop->multiProcessorCount; + majorVersion_ = prop->major; + minorVersion_ = prop->minor; + } - if (error == 
cudaErrorInsufficientDriver) - return -1; + int deviceID() const + { + return device_id_; + } - if (error == cudaErrorNoDevice) - return 0; + std::string name() const + { + return name_; + } - cudaSafeCall( error ); - return count; - } + int majorVersion() const + { + return majorVersion_; + } - void setDevice(int device) const - { - cudaSafeCall( cudaSetDevice( device ) ); - } + int minorVersion() const + { + return minorVersion_; + } - int getDevice() const - { - int device; - cudaSafeCall( cudaGetDevice( &device ) ); - return device; - } + int multiProcessorCount() const + { + return multi_processor_count_; + } - void resetDevice() const - { - cudaSafeCall( cudaDeviceReset() ); - } - - bool builtWith(FeatureSet feature_set) const - { - return cudaArch.builtWith(feature_set); - } - - bool has(int major, int minor) const - { - return hasPtx(major, minor) || hasBin(major, minor); - } - - bool hasPtx(int major, int minor) const - { - return cudaArch.hasPtx(major, minor); - } - - bool hasBin(int major, int minor) const - { - return cudaArch.hasBin(major, minor); - } - - bool hasEqualOrLessPtx(int major, int minor) const - { - return cudaArch.hasEqualOrLessPtx(major, minor); - } - - bool hasEqualOrGreater(int major, int minor) const - { - return hasEqualOrGreaterPtx(major, minor) || hasEqualOrGreaterBin(major, minor); - } - - bool hasEqualOrGreaterPtx(int major, int minor) const - { - return cudaArch.hasEqualOrGreaterPtx(major, minor); - } - - bool hasEqualOrGreaterBin(int major, int minor) const - { - return cudaArch.hasEqualOrGreaterBin(major, minor); - } - - bool deviceSupports(FeatureSet feature_set) const - { - static int versions[] = - { - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 - }; - static const int cache_size = static_cast(sizeof(versions) / sizeof(versions[0])); - - const int devId = getDevice(); - - int version; - - if (devId < cache_size && versions[devId] >= 0) - version = versions[devId]; - else - { - DeviceInfo dev(devId); - version = dev.majorVersion() * 10 + dev.minorVersion(); - if (devId < cache_size) - versions[devId] = version; - } - - return TargetArchs::builtWith(feature_set) && (version >= feature_set); - } - - void printCudaDeviceInfo(int device) const - { - int count = getCudaEnabledDeviceCount(); - bool valid = (device >= 0) && (device < count); - - int beg = valid ? device : 0; - int end = valid ? 
device+1 : count; - - printf("*** CUDA Device Query (Runtime API) version (CUDART static linking) *** \n\n"); - printf("Device count: %d\n", count); - - int driverVersion = 0, runtimeVersion = 0; - cudaSafeCall( cudaDriverGetVersion(&driverVersion) ); - cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) ); - - const char *computeMode[] = { - "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)", - "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)", - "Prohibited (no host thread can use ::cudaSetDevice() with this device)", - "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)", - "Unknown", - NULL - }; - - for(int dev = beg; dev < end; ++dev) - { - cudaDeviceProp prop; - cudaSafeCall( cudaGetDeviceProperties(&prop, dev) ); - - printf("\nDevice %d: \"%s\"\n", dev, prop.name); - printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100); - printf(" CUDA Capability Major/Minor version number: %d.%d\n", prop.major, prop.minor); - printf(" Total amount of global memory: %.0f MBytes (%llu bytes)\n", (float)prop.totalGlobalMem/1048576.0f, (unsigned long long) prop.totalGlobalMem); - - int cores = convertSMVer2Cores(prop.major, prop.minor); - if (cores > 0) - printf(" (%2d) Multiprocessors x (%2d) CUDA Cores/MP: %d CUDA Cores\n", prop.multiProcessorCount, cores, cores * prop.multiProcessorCount); - - printf(" GPU Clock Speed: %.2f GHz\n", prop.clockRate * 1e-6f); - - printf(" Max Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n", - prop.maxTexture1D, prop.maxTexture2D[0], prop.maxTexture2D[1], - prop.maxTexture3D[0], prop.maxTexture3D[1], prop.maxTexture3D[2]); - printf(" Max Layered Texture Size (dim) x layers 1D=(%d) x %d, 2D=(%d,%d) x %d\n", - prop.maxTexture1DLayered[0], prop.maxTexture1DLayered[1], - prop.maxTexture2DLayered[0], prop.maxTexture2DLayered[1], prop.maxTexture2DLayered[2]); - - printf(" Total amount of constant memory: %u bytes\n", (int)prop.totalConstMem); - printf(" Total amount of shared memory per block: %u bytes\n", (int)prop.sharedMemPerBlock); - printf(" Total number of registers available per block: %d\n", prop.regsPerBlock); - printf(" Warp size: %d\n", prop.warpSize); - printf(" Maximum number of threads per block: %d\n", prop.maxThreadsPerBlock); - printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]); - printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]); - printf(" Maximum memory pitch: %u bytes\n", (int)prop.memPitch); - printf(" Texture alignment: %u bytes\n", (int)prop.textureAlignment); - - printf(" Concurrent copy and execution: %s with %d copy engine(s)\n", (prop.deviceOverlap ? "Yes" : "No"), prop.asyncEngineCount); - printf(" Run time limit on kernels: %s\n", prop.kernelExecTimeoutEnabled ? "Yes" : "No"); - printf(" Integrated GPU sharing Host Memory: %s\n", prop.integrated ? "Yes" : "No"); - printf(" Support host page-locked memory mapping: %s\n", prop.canMapHostMemory ? "Yes" : "No"); - - printf(" Concurrent kernel execution: %s\n", prop.concurrentKernels ? "Yes" : "No"); - printf(" Alignment requirement for Surfaces: %s\n", prop.surfaceAlignment ? "Yes" : "No"); - printf(" Device has ECC support enabled: %s\n", prop.ECCEnabled ? 
"Yes" : "No"); - printf(" Device is using TCC driver mode: %s\n", prop.tccDriver ? "Yes" : "No"); - printf(" Device supports Unified Addressing (UVA): %s\n", prop.unifiedAddressing ? "Yes" : "No"); - printf(" Device PCI Bus ID / PCI location ID: %d / %d\n", prop.pciBusID, prop.pciDeviceID ); - printf(" Compute Mode:\n"); - printf(" %s \n", computeMode[prop.computeMode]); - } - - printf("\n"); - printf("deviceQuery, CUDA Driver = CUDART"); - printf(", CUDA Driver Version = %d.%d", driverVersion / 1000, driverVersion % 100); - printf(", CUDA Runtime Version = %d.%d", runtimeVersion/1000, runtimeVersion%100); - printf(", NumDevs = %d\n\n", count); - fflush(stdout); - } - - void printShortCudaDeviceInfo(int device) const - { - int count = getCudaEnabledDeviceCount(); - bool valid = (device >= 0) && (device < count); - - int beg = valid ? device : 0; - int end = valid ? device+1 : count; - - int driverVersion = 0, runtimeVersion = 0; - cudaSafeCall( cudaDriverGetVersion(&driverVersion) ); - cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) ); - - for(int dev = beg; dev < end; ++dev) - { - cudaDeviceProp prop; - cudaSafeCall( cudaGetDeviceProperties(&prop, dev) ); - - const char *arch_str = prop.major < 2 ? " (not Fermi)" : ""; - printf("Device %d: \"%s\" %.0fMb", dev, prop.name, (float)prop.totalGlobalMem/1048576.0f); - printf(", sm_%d%d%s", prop.major, prop.minor, arch_str); - - int cores = convertSMVer2Cores(prop.major, prop.minor); - if (cores > 0) - printf(", %d cores", cores * prop.multiProcessorCount); - - printf(", Driver/Runtime ver.%d.%d/%d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100); - } - fflush(stdout); - } - - private: - int device_id_; - - std::string name_; - int multi_processor_count_; - int majorVersion_; - int minorVersion_; - - const CudaArch cudaArch; - - int convertSMVer2Cores(int major, int minor) const - { - // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM - typedef struct { - int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version - int Cores; - } SMtoCores; - - SMtoCores gpuArchCoresPerSM[] = { { 0x10, 8 }, { 0x11, 8 }, { 0x12, 8 }, { 0x13, 8 }, { 0x20, 32 }, { 0x21, 48 }, {0x30, 192}, {0x35, 192}, { -1, -1 } }; - - int index = 0; - while (gpuArchCoresPerSM[index].SM != -1) - { - if (gpuArchCoresPerSM[index].SM == ((major << 4) + minor) ) - return gpuArchCoresPerSM[index].Cores; - index++; - } + int getCudaEnabledDeviceCount() const + { + int count; + cudaError_t error = cudaGetDeviceCount( &count ); + if (error == cudaErrorInsufficientDriver) return -1; - } - }; - class CudaFuncTable : public GpuFuncTable + if (error == cudaErrorNoDevice) + return 0; + + cudaSafeCall( error ); + return count; + } + + void setDevice(int device) const { - public: + cudaSafeCall( cudaSetDevice( device ) ); + } - void copy(const Mat& src, GpuMat& dst) const + int getDevice() const + { + int device; + cudaSafeCall( cudaGetDevice( &device ) ); + return device; + } + + void resetDevice() const + { + cudaSafeCall( cudaDeviceReset() ); + } + + bool builtWith(FeatureSet feature_set) const + { + return cudaArch.builtWith(feature_set); + } + + bool has(int major, int minor) const + { + return hasPtx(major, minor) || hasBin(major, minor); + } + + bool hasPtx(int major, int minor) const + { + return cudaArch.hasPtx(major, minor); + } + + bool hasBin(int major, int minor) const + { + return cudaArch.hasBin(major, minor); + } + + bool hasEqualOrLessPtx(int major, int minor) const 
+ { + return cudaArch.hasEqualOrLessPtx(major, minor); + } + + bool hasEqualOrGreater(int major, int minor) const + { + return hasEqualOrGreaterPtx(major, minor) || hasEqualOrGreaterBin(major, minor); + } + + bool hasEqualOrGreaterPtx(int major, int minor) const + { + return cudaArch.hasEqualOrGreaterPtx(major, minor); + } + + bool hasEqualOrGreaterBin(int major, int minor) const + { + return cudaArch.hasEqualOrGreaterBin(major, minor); + } + + bool deviceSupports(FeatureSet feature_set) const + { + static int versions[] = { - cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyHostToDevice) ); - } - void copy(const GpuMat& src, Mat& dst) const + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 + }; + static const int cache_size = static_cast(sizeof(versions) / sizeof(versions[0])); + + const int devId = getDevice(); + + int version; + + if (devId < cache_size && versions[devId] >= 0) + version = versions[devId]; + else { - cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToHost) ); - } - void copy(const GpuMat& src, GpuMat& dst) const - { - cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToDevice) ); + DeviceInfo dev(devId); + version = dev.majorVersion() * 10 + dev.minorVersion(); + if (devId < cache_size) + versions[devId] = version; } - void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const - { - CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); - CV_Assert(src.size() == dst.size() && src.type() == dst.type()); - CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels())); + return TargetArchs::builtWith(feature_set) && (version >= feature_set); + } - if (src.depth() == CV_64F) + void printCudaDeviceInfo(int device) const + { + int count = getCudaEnabledDeviceCount(); + bool valid = (device >= 0) && (device < count); + + int beg = valid ? device : 0; + int end = valid ? 
device+1 : count; + + printf("*** CUDA Device Query (Runtime API) version (CUDART static linking) *** \n\n"); + printf("Device count: %d\n", count); + + int driverVersion = 0, runtimeVersion = 0; + cudaSafeCall( cudaDriverGetVersion(&driverVersion) ); + cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) ); + + const char *computeMode[] = { + "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)", + "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)", + "Prohibited (no host thread can use ::cudaSetDevice() with this device)", + "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)", + "Unknown", + NULL + }; + + for(int dev = beg; dev < end; ++dev) + { + cudaDeviceProp prop; + cudaSafeCall( cudaGetDeviceProperties(&prop, dev) ); + + printf("\nDevice %d: \"%s\"\n", dev, prop.name); + printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100); + printf(" CUDA Capability Major/Minor version number: %d.%d\n", prop.major, prop.minor); + printf(" Total amount of global memory: %.0f MBytes (%llu bytes)\n", (float)prop.totalGlobalMem/1048576.0f, (unsigned long long) prop.totalGlobalMem); + + int cores = convertSMVer2Cores(prop.major, prop.minor); + if (cores > 0) + printf(" (%2d) Multiprocessors x (%2d) CUDA Cores/MP: %d CUDA Cores\n", prop.multiProcessorCount, cores, cores * prop.multiProcessorCount); + + printf(" GPU Clock Speed: %.2f GHz\n", prop.clockRate * 1e-6f); + + printf(" Max Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n", + prop.maxTexture1D, prop.maxTexture2D[0], prop.maxTexture2D[1], + prop.maxTexture3D[0], prop.maxTexture3D[1], prop.maxTexture3D[2]); + printf(" Max Layered Texture Size (dim) x layers 1D=(%d) x %d, 2D=(%d,%d) x %d\n", + prop.maxTexture1DLayered[0], prop.maxTexture1DLayered[1], + prop.maxTexture2DLayered[0], prop.maxTexture2DLayered[1], prop.maxTexture2DLayered[2]); + + printf(" Total amount of constant memory: %u bytes\n", (int)prop.totalConstMem); + printf(" Total amount of shared memory per block: %u bytes\n", (int)prop.sharedMemPerBlock); + printf(" Total number of registers available per block: %d\n", prop.regsPerBlock); + printf(" Warp size: %d\n", prop.warpSize); + printf(" Maximum number of threads per block: %d\n", prop.maxThreadsPerBlock); + printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]); + printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]); + printf(" Maximum memory pitch: %u bytes\n", (int)prop.memPitch); + printf(" Texture alignment: %u bytes\n", (int)prop.textureAlignment); + + printf(" Concurrent copy and execution: %s with %d copy engine(s)\n", (prop.deviceOverlap ? "Yes" : "No"), prop.asyncEngineCount); + printf(" Run time limit on kernels: %s\n", prop.kernelExecTimeoutEnabled ? "Yes" : "No"); + printf(" Integrated GPU sharing Host Memory: %s\n", prop.integrated ? "Yes" : "No"); + printf(" Support host page-locked memory mapping: %s\n", prop.canMapHostMemory ? "Yes" : "No"); + + printf(" Concurrent kernel execution: %s\n", prop.concurrentKernels ? "Yes" : "No"); + printf(" Alignment requirement for Surfaces: %s\n", prop.surfaceAlignment ? "Yes" : "No"); + printf(" Device has ECC support enabled: %s\n", prop.ECCEnabled ? 
"Yes" : "No"); + printf(" Device is using TCC driver mode: %s\n", prop.tccDriver ? "Yes" : "No"); + printf(" Device supports Unified Addressing (UVA): %s\n", prop.unifiedAddressing ? "Yes" : "No"); + printf(" Device PCI Bus ID / PCI location ID: %d / %d\n", prop.pciBusID, prop.pciDeviceID ); + printf(" Compute Mode:\n"); + printf(" %s \n", computeMode[prop.computeMode]); + } + + printf("\n"); + printf("deviceQuery, CUDA Driver = CUDART"); + printf(", CUDA Driver Version = %d.%d", driverVersion / 1000, driverVersion % 100); + printf(", CUDA Runtime Version = %d.%d", runtimeVersion/1000, runtimeVersion%100); + printf(", NumDevs = %d\n\n", count); + fflush(stdout); + } + + void printShortCudaDeviceInfo(int device) const + { + int count = getCudaEnabledDeviceCount(); + bool valid = (device >= 0) && (device < count); + + int beg = valid ? device : 0; + int end = valid ? device+1 : count; + + int driverVersion = 0, runtimeVersion = 0; + cudaSafeCall( cudaDriverGetVersion(&driverVersion) ); + cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) ); + + for(int dev = beg; dev < end; ++dev) + { + cudaDeviceProp prop; + cudaSafeCall( cudaGetDeviceProperties(&prop, dev) ); + + const char *arch_str = prop.major < 2 ? " (not Fermi)" : ""; + printf("Device %d: \"%s\" %.0fMb", dev, prop.name, (float)prop.totalGlobalMem/1048576.0f); + printf(", sm_%d%d%s", prop.major, prop.minor, arch_str); + + int cores = convertSMVer2Cores(prop.major, prop.minor); + if (cores > 0) + printf(", %d cores", cores * prop.multiProcessorCount); + + printf(", Driver/Runtime ver.%d.%d/%d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100); + } + fflush(stdout); + } + +private: + int device_id_; + + std::string name_; + int multi_processor_count_; + int majorVersion_; + int minorVersion_; + + const CudaArch cudaArch; + + int convertSMVer2Cores(int major, int minor) const + { + // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM + typedef struct { + int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version + int Cores; + } SMtoCores; + + SMtoCores gpuArchCoresPerSM[] = { { 0x10, 8 }, { 0x11, 8 }, { 0x12, 8 }, { 0x13, 8 }, { 0x20, 32 }, { 0x21, 48 }, {0x30, 192}, {0x35, 192}, { -1, -1 } }; + + int index = 0; + while (gpuArchCoresPerSM[index].SM != -1) + { + if (gpuArchCoresPerSM[index].SM == ((major << 4) + minor) ) + return gpuArchCoresPerSM[index].Cores; + index++; + } + + return -1; + } +}; + +class CudaFuncTable : public GpuFuncTable +{ +public: + + void copy(const Mat& src, GpuMat& dst) const + { + cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyHostToDevice) ); + } + + void copy(const GpuMat& src, Mat& dst) const + { + cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToHost) ); + } + + void copy(const GpuMat& src, GpuMat& dst) const + { + cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToDevice) ); + } + + void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const + { + CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); + CV_Assert(src.size() == dst.size() && src.type() == dst.type()); + CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels())); + + if (src.depth() == CV_64F) + { + if (!TargetArchs::builtWith(NATIVE_DOUBLE) || 
!DeviceInfo().supports(NATIVE_DOUBLE)) + CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); + } + + typedef void (*func_t)(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream); + static const func_t funcs[7][4] = + { + /* 8U */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, + /* 8S */ {cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask }, + /* 16U */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, + /* 16S */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, + /* 32S */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, + /* 32F */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, + /* 64F */ {cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask } + }; + + const func_t func = mask.channels() == src.channels() ? funcs[src.depth()][src.channels() - 1] : cv::gpu::device::copyWithMask; + + func(src, dst, mask, 0); + } + + void convert(const GpuMat& src, GpuMat& dst) const + { + typedef void (*func_t)(const GpuMat& src, GpuMat& dst); + static const func_t funcs[7][7][4] = + { { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) - CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); + /* 8U -> 8U */ {0, 0, 0, 0}, + /* 8U -> 8S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 8U -> 16U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt::call}, + /* 8U -> 16S */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt::call}, + /* 8U -> 32S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 8U -> 32F */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 8U -> 64F */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo } + }, + { + /* 8S -> 8U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 8S -> 8S */ {0,0,0,0}, + /* 8S -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 8S -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 8S -> 32S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 8S -> 32F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 8S -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo} + }, + { + /* 16U -> 8U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt::call}, + /* 16U -> 8S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 16U -> 16U */ {0,0,0,0}, + /* 16U -> 16S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, 
cv::gpu::device::convertTo }, + /* 16U -> 32S */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 16U -> 32F */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 16U -> 64F */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo } + }, + { + /* 16S -> 8U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt::call}, + /* 16S -> 8S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 16S -> 16U */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 16S -> 16S */ {0,0,0,0}, + /* 16S -> 32S */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 16S -> 32F */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 16S -> 64F */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo } + }, + { + /* 32S -> 8U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32S -> 8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32S -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32S -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32S -> 32S */ {0,0,0,0}, + /* 32S -> 32F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32S -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo} + }, + { + /* 32F -> 8U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32F -> 8S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32F -> 16U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32F -> 16S */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32F -> 32S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32F -> 32F */ {0,0,0,0}, + /* 32F -> 64F */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo} + }, + { + /* 64F -> 8U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 64F -> 8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 64F -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 64F -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 64F -> 32S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 64F -> 32F */ {cv::gpu::device::convertTo, 
cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 64F -> 64F */ {0,0,0,0} } + }; - typedef void (*func_t)(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream); - static const func_t funcs[7][4] = - { - /* 8U */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 8S */ {cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask }, - /* 16U */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 16S */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 32S */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 32F */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 64F */ {cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask } - }; + CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); + CV_Assert(dst.depth() <= CV_64F); + CV_Assert(src.size() == dst.size() && src.channels() == dst.channels()); - const func_t func = mask.channels() == src.channels() ? funcs[src.depth()][src.channels() - 1] : cv::gpu::device::copyWithMask; - - func(src, dst, mask, 0); + if (src.depth() == CV_64F || dst.depth() == CV_64F) + { + if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) + CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); } - void convert(const GpuMat& src, GpuMat& dst) const + bool aligned = isAligned(src.data, 16) && isAligned(dst.data, 16); + if (!aligned) { - typedef void (*func_t)(const GpuMat& src, GpuMat& dst); - static const func_t funcs[7][7][4] = - { - { - /* 8U -> 8U */ {0, 0, 0, 0}, - /* 8U -> 8S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, - /* 8U -> 16U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt::call}, - /* 8U -> 16S */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt::call}, - /* 8U -> 32S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, - /* 8U -> 32F */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, - /* 8U -> 64F */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo } - }, - { - /* 8S -> 8U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, - /* 8S -> 8S */ {0,0,0,0}, - /* 8S -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, - /* 8S -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, - /* 8S -> 32S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, - /* 8S -> 32F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, - /* 8S -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo} - }, - { - /* 16U -> 8U */ {NppCvt::call, cv::gpu::device::convertTo, 
cv::gpu::device::convertTo, NppCvt::call}, - /* 16U -> 8S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, - /* 16U -> 16U */ {0,0,0,0}, - /* 16U -> 16S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, - /* 16U -> 32S */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, - /* 16U -> 32F */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, - /* 16U -> 64F */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo } - }, - { - /* 16S -> 8U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt::call}, - /* 16S -> 8S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, - /* 16S -> 16U */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, - /* 16S -> 16S */ {0,0,0,0}, - /* 16S -> 32S */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, - /* 16S -> 32F */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, - /* 16S -> 64F */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo } - }, - { - /* 32S -> 8U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, - /* 32S -> 8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, - /* 32S -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, - /* 32S -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, - /* 32S -> 32S */ {0,0,0,0}, - /* 32S -> 32F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, - /* 32S -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo} - }, - { - /* 32F -> 8U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, - /* 32F -> 8S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, - /* 32F -> 16U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, - /* 32F -> 16S */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, - /* 32F -> 32S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, - /* 32F -> 32F */ {0,0,0,0}, - /* 32F -> 64F */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo} - }, - { - /* 64F -> 8U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, - /* 64F -> 8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, - /* 64F -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, - /* 64F -> 16S 
*/ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, - /* 64F -> 32S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, - /* 64F -> 32F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, - /* 64F -> 64F */ {0,0,0,0} - } - }; + cv::gpu::device::convertTo(src, dst); + return; + } - CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); - CV_Assert(dst.depth() <= CV_64F); - CV_Assert(src.size() == dst.size() && src.channels() == dst.channels()); + const func_t func = funcs[src.depth()][dst.depth()][src.channels() - 1]; + CV_DbgAssert(func != 0); - if (src.depth() == CV_64F || dst.depth() == CV_64F) - { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) - CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); - } + func(src, dst); + } - bool aligned = isAligned(src.data, 16) && isAligned(dst.data, 16); - if (!aligned) + void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream) const + { + CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); + CV_Assert(dst.depth() <= CV_64F); + + if (src.depth() == CV_64F || dst.depth() == CV_64F) + { + if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) + CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); + } + + cv::gpu::device::convertTo(src, dst, alpha, beta, stream); + } + + void setTo(GpuMat& m, Scalar s, const GpuMat& mask, cudaStream_t stream) const + { + if (mask.empty()) + { + if (s[0] == 0.0 && s[1] == 0.0 && s[2] == 0.0 && s[3] == 0.0) { - cv::gpu::device::convertTo(src, dst); + cudaSafeCall( cudaMemset2D(m.data, m.step, 0, m.cols * m.elemSize(), m.rows) ); return; } - const func_t func = funcs[src.depth()][dst.depth()][src.channels() - 1]; - CV_DbgAssert(func != 0); - - func(src, dst); - } - - void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream) const - { - CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); - CV_Assert(dst.depth() <= CV_64F); - - if (src.depth() == CV_64F || dst.depth() == CV_64F) + if (m.depth() == CV_8U) { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) - CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); - } + int cn = m.channels(); - cv::gpu::device::convertTo(src, dst, alpha, beta, stream); - } - - void setTo(GpuMat& m, Scalar s, const GpuMat& mask, cudaStream_t stream) const - { - if (mask.empty()) - { - if (s[0] == 0.0 && s[1] == 0.0 && s[2] == 0.0 && s[3] == 0.0) + if (cn == 1 || (cn == 2 && s[0] == s[1]) || (cn == 3 && s[0] == s[1] && s[0] == s[2]) || (cn == 4 && s[0] == s[1] && s[0] == s[2] && s[0] == s[3])) { - cudaSafeCall( cudaMemset2D(m.data, m.step, 0, m.cols * m.elemSize(), m.rows) ); + int val = saturate_cast(s[0]); + cudaSafeCall( cudaMemset2D(m.data, m.step, val, m.cols * m.elemSize(), m.rows) ); return; } - - if (m.depth() == CV_8U) - { - int cn = m.channels(); - - if (cn == 1 || (cn == 2 && s[0] == s[1]) || (cn == 3 && s[0] == s[1] && s[0] == s[2]) || (cn == 4 && s[0] == s[1] && s[0] == s[2] && s[0] == s[3])) - { - int val = saturate_cast(s[0]); - cudaSafeCall( cudaMemset2D(m.data, m.step, val, m.cols * m.elemSize(), m.rows) ); - return; - } - } - - typedef void (*func_t)(GpuMat& src, Scalar s); - static const func_t funcs[7][4] = - { - {NppSet::call, 
            typedef void (*func_t)(GpuMat& src, Scalar s);
            static const func_t funcs[7][4] =
            {
                {NppSet<CV_8U, 1, nppiSet_8u_C1R>::call, cv::gpu::device::setTo<uchar>, cv::gpu::device::setTo<uchar>, NppSet<CV_8U, 4, nppiSet_8u_C4R>::call},
                {cv::gpu::device::setTo<schar>, cv::gpu::device::setTo<schar>, cv::gpu::device::setTo<schar>, cv::gpu::device::setTo<schar>},
                {NppSet<CV_16U, 1, nppiSet_16u_C1R>::call, NppSet<CV_16U, 2, nppiSet_16u_C2R>::call, cv::gpu::device::setTo<ushort>, NppSet<CV_16U, 4, nppiSet_16u_C4R>::call},
                {NppSet<CV_16S, 1, nppiSet_16s_C1R>::call, NppSet<CV_16S, 2, nppiSet_16s_C2R>::call, cv::gpu::device::setTo<short>, NppSet<CV_16S, 4, nppiSet_16s_C4R>::call},
                {NppSet<CV_32S, 1, nppiSet_32s_C1R>::call, cv::gpu::device::setTo<int>, cv::gpu::device::setTo<int>, NppSet<CV_32S, 4, nppiSet_32s_C4R>::call},
                {NppSet<CV_32F, 1, nppiSet_32f_C1R>::call, cv::gpu::device::setTo<float>, cv::gpu::device::setTo<float>, NppSet<CV_32F, 4, nppiSet_32f_C4R>::call},
                {cv::gpu::device::setTo<double>, cv::gpu::device::setTo<double>, cv::gpu::device::setTo<double>, cv::gpu::device::setTo<double>}
            };

            CV_Assert(m.depth() <= CV_64F && m.channels() <= 4);

            if (m.depth() == CV_64F)
            {
                if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
                    CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
            }

            if (stream)
                cv::gpu::device::setTo(m, s, stream);
            else
                funcs[m.depth()][m.channels() - 1](m, s);
        }
        else
        {
            typedef void (*func_t)(GpuMat& src, Scalar s, const GpuMat& mask);
            static const func_t funcs[7][4] =
            {
                {NppSetMask<CV_8U, 1, nppiSet_8u_C1MR>::call, cv::gpu::device::setTo<uchar>, cv::gpu::device::setTo<uchar>, NppSetMask<CV_8U, 4, nppiSet_8u_C4MR>::call},
                {cv::gpu::device::setTo<schar>, cv::gpu::device::setTo<schar>, cv::gpu::device::setTo<schar>, cv::gpu::device::setTo<schar>},
                {NppSetMask<CV_16U, 1, nppiSet_16u_C1MR>::call, cv::gpu::device::setTo<ushort>, cv::gpu::device::setTo<ushort>, NppSetMask<CV_16U, 4, nppiSet_16u_C4MR>::call},
                {NppSetMask<CV_16S, 1, nppiSet_16s_C1MR>::call, cv::gpu::device::setTo<short>, cv::gpu::device::setTo<short>, NppSetMask<CV_16S, 4, nppiSet_16s_C4MR>::call},
                {NppSetMask<CV_32S, 1, nppiSet_32s_C1MR>::call, cv::gpu::device::setTo<int>, cv::gpu::device::setTo<int>, NppSetMask<CV_32S, 4, nppiSet_32s_C4MR>::call},
                {NppSetMask<CV_32F, 1, nppiSet_32f_C1MR>::call, cv::gpu::device::setTo<float>, cv::gpu::device::setTo<float>, NppSetMask<CV_32F, 4, nppiSet_32f_C4MR>::call},
                {cv::gpu::device::setTo<double>, cv::gpu::device::setTo<double>, cv::gpu::device::setTo<double>, cv::gpu::device::setTo<double>}
            };

            CV_Assert(m.depth() <= CV_64F && m.channels() <= 4);

            if (m.depth() == CV_64F)
            {
                if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
                    CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
            }

            if (stream)
                cv::gpu::device::setTo(m, s, mask, stream);
            else
                funcs[m.depth()][m.channels() - 1](m, s, mask);
        }
    }

    void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const
    {
        cudaSafeCall( cudaMallocPitch(devPtr, step, width, height) );
    }

    void free(void* devPtr) const
    {
        cudaFree(devPtr);
    }
};
#endif
#endif
\ No newline at end of file
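
The function tables reconstructed above are the heart of the dispatch scheme: one entry per (depth, channel count) combination, 0 for identity or unsupported cases, and NPP fast paths mixed with generic device kernels. A minimal self-contained sketch of the same pattern; setViaNpp, setGeneric and dispatchSet are illustrative names, not OpenCV API:

    #include <cstdio>

    // One handler per (depth, channels) pair; a null entry would mark an unsupported combination.
    typedef void (*set_func_t)(void* data, double value);

    static void setGeneric(void* data, double value) { (void)data; std::printf("generic kernel, value=%f\n", value); }
    static void setViaNpp(void* data, double value)  { (void)data; std::printf("NPP fast path, value=%f\n", value); }

    // Mirrors funcs[7][4]: 7 depths (CV_8U..CV_64F) by 4 channel counts.
    static const set_func_t table[7][4] =
    {
        {setViaNpp,  setGeneric, setGeneric, setViaNpp },  // CV_8U
        {setGeneric, setGeneric, setGeneric, setGeneric},  // CV_8S
        {setViaNpp,  setViaNpp,  setGeneric, setViaNpp },  // CV_16U
        {setViaNpp,  setViaNpp,  setGeneric, setViaNpp },  // CV_16S
        {setViaNpp,  setGeneric, setGeneric, setViaNpp },  // CV_32S
        {setViaNpp,  setGeneric, setGeneric, setViaNpp },  // CV_32F
        {setGeneric, setGeneric, setGeneric, setGeneric}   // CV_64F
    };

    static void dispatchSet(int depth, int cn, void* data, double value)
    {
        table[depth][cn - 1](data, value); // channel counts are 1-based, the array is 0-based
    }

    int main()
    {
        dispatchSet(0 /* CV_8U */, 3, 0, 255.0);
        return 0;
    }
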
diff --git a/modules/dynamicuda/src/main.cpp b/modules/dynamicuda/src/main.cpp
index 4a05d8696..8eb66fd98 100644
--- a/modules/dynamicuda/src/main.cpp
+++ b/modules/dynamicuda/src/main.cpp
@@ -39,6 +39,9 @@ static EmptyFuncTable gpuTable;
 
 extern "C" {
 
+DeviceInfoFuncTable* deviceInfoFactory();
+GpuFuncTable* gpuFactory();
+
 DeviceInfoFuncTable* deviceInfoFactory()
 {
     return (DeviceInfoFuncTable*)&deviceInfoTable;
diff --git a/modules/java/CMakeLists.txt b/modules/java/CMakeLists.txt
index 291295fb5..3a6ebe836 100644
--- a/modules/java/CMakeLists.txt
+++ b/modules/java/CMakeLists.txt
@@ -297,7 +297,7 @@ if(BUILD_FAT_JAVA_LIB)
         list(REMOVE_ITEM __deps ${m})
       endif()
     endforeach()
-    if (HAVE_opencv_dynamicuda)
+    if (ENABLE_DYNAMIC_CUDA)
       list(REMOVE_ITEM __deps "opencv_dynamicuda")
     endif()
     if (ANDROID AND HAVE_opencv_gpu)
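
The extern "C" factory declarations added above are the symbols opencv_core resolves at runtime when CUDA support lives in the separate dynamicuda library. A simplified sketch of the loading side, assuming POSIX dlopen and the library name used on Android; the real loader also caches the handle and reports lookup failures, and loadGpuTable itself is illustrative:

    #include <dlfcn.h>
    #include <cstdio>

    class DeviceInfoFuncTable;
    class GpuFuncTable;

    typedef DeviceInfoFuncTable* (*DeviceInfoFactoryType)();
    typedef GpuFuncTable* (*GpuFactoryType)();

    static GpuFuncTable* loadGpuTable()
    {
        // The CUDA backend is an optional shared library; absence is not an error.
        void* handle = dlopen("libopencv_dynamicuda.so", RTLD_LAZY);
        if (!handle)
            return 0; // caller falls back to the built-in EmptyFuncTable

        GpuFactoryType factory = (GpuFactoryType)dlsym(handle, "gpuFactory");
        if (!factory)
        {
            std::printf("gpuFactory symbol not found\n");
            dlclose(handle);
            return 0;
        }
        return factory();
    }
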
From 2509fa8080962256e31b178e67d1b404341eb537 Mon Sep 17 00:00:00 2001
From: Alexander Smorkalov
Date: Thu, 19 Dec 2013 18:02:59 +0400
Subject: [PATCH 10/13] Various fixes for the case where HAVE_CUDA==OFF.

---
 modules/core/CMakeLists.txt                   |  4 ----
 modules/core/src/gpumat.cpp                   | 22 ++++++-------------
 modules/dynamicuda/CMakeLists.txt             |  2 +-
 .../include/opencv2/dynamicuda/dynamicuda.hpp | 19 ++++++++++++----
 4 files changed, 23 insertions(+), 24 deletions(-)

diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt
index f20e32d3a..2409ee9e9 100644
--- a/modules/core/CMakeLists.txt
+++ b/modules/core/CMakeLists.txt
@@ -1,12 +1,8 @@
 set(the_description "The Core Functionality")
 
-message(STATUS "ENABLE_DYNAMIC_CUDA ${ENABLE_DYNAMIC_CUDA}")
-
 if (ENABLE_DYNAMIC_CUDA)
-  message(STATUS "Using dynamic cuda approach")
   ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES})
 else()
-  message(STATUS "Link CUDA statically")
   ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES} ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})
 endif()
 
diff --git a/modules/core/src/gpumat.cpp b/modules/core/src/gpumat.cpp
index 590685b74..17d46abcc 100644
--- a/modules/core/src/gpumat.cpp
+++ b/modules/core/src/gpumat.cpp
@@ -44,7 +44,7 @@
 #include "opencv2/core/gpumat.hpp"
 #include <iostream>
 
-#if defined(HAVE_CUDA) || defined(DYNAMIC_CUDA_SUPPORT)
+#if defined(HAVE_CUDA)
 
 #include <cuda_runtime.h>
 #include <npp.h>
@@ -273,8 +273,6 @@ void cv::gpu::DeviceInfo::query() { deviceInfoFuncTable()->query(); }
 void cv::gpu::printCudaDeviceInfo(int device) { deviceInfoFuncTable()->printCudaDeviceInfo(device); }
 void cv::gpu::printShortCudaDeviceInfo(int device) { deviceInfoFuncTable()->printShortCudaDeviceInfo(device); }
 
-#ifdef HAVE_CUDA
-
 namespace cv { namespace gpu
 {
     CV_EXPORTS void copyWithMask(const cv::gpu::GpuMat&, cv::gpu::GpuMat&, const cv::gpu::GpuMat&, cudaStream_t);
@@ -286,8 +284,6 @@ namespace cv { namespace gpu
     CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&);
 }}
 
-#endif
-
 //////////////////////////////// GpuMat ///////////////////////////////
 
 cv::gpu::GpuMat::GpuMat(const GpuMat& m)
@@ -707,43 +703,39 @@ void cv::gpu::GpuMat::release()
     refcount = 0;
 }
 
-#ifdef HAVE_CUDA
-
 namespace cv { namespace gpu
 {
     void convertTo(const GpuMat& src, GpuMat& dst)
     {
         gpuFuncTable()->convert(src, dst);
     }
-    
+
     void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream)
     {
         gpuFuncTable()->convert(src, dst, alpha, beta, stream);
     }
-    
+
     void setTo(GpuMat& src, Scalar s, cudaStream_t stream)
     {
         gpuFuncTable()->setTo(src, s, cv::gpu::GpuMat(), stream);
     }
-    
+
     void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream)
     {
-        gpuFuncTable()->setTo(src, s, mask, stream); 
+        gpuFuncTable()->setTo(src, s, mask, stream);
     }
-    
+
     void setTo(GpuMat& src, Scalar s)
    {
         setTo(src, s, 0);
     }
-    
+
     void setTo(GpuMat& src, Scalar s, const GpuMat& mask)
     {
         setTo(src, s, mask, 0);
     }
 }}
 
-#endif
-
 ////////////////////////////////////////////////////////////////////////
 // Error handling
 
diff --git a/modules/dynamicuda/CMakeLists.txt b/modules/dynamicuda/CMakeLists.txt
index def05d19b..031b5e48d 100644
--- a/modules/dynamicuda/CMakeLists.txt
+++ b/modules/dynamicuda/CMakeLists.txt
@@ -1,4 +1,4 @@
-if(NOT ANDROID)
+if(NOT ANDROID OR NOT HAVE_CUDA)
   ocv_module_disable(dynamicuda)
 endif()
 
diff --git a/modules/dynamicuda/include/opencv2/dynamicuda/dynamicuda.hpp b/modules/dynamicuda/include/opencv2/dynamicuda/dynamicuda.hpp
index 4f5175513..c5057ab99 100644
--- a/modules/dynamicuda/include/opencv2/dynamicuda/dynamicuda.hpp
+++ b/modules/dynamicuda/include/opencv2/dynamicuda/dynamicuda.hpp
@@ -1,6 +1,10 @@
 #ifndef __GPUMAT_CUDA_HPP__
 #define __GPUMAT_CUDA_HPP__
 
+#ifndef HAVE_CUDA
+typedef void* cudaStream_t;
+#endif
+
 class
DeviceInfoFuncTable { public: @@ -56,7 +60,7 @@ public: virtual void convert(const GpuMat& src, GpuMat& dst) const = 0; // for gpu::device::setTo funcs - virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, CUstream_st*) const = 0; + virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, cudaStream_t) const = 0; virtual void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const = 0; virtual void free(void* devPtr) const = 0; @@ -96,8 +100,15 @@ public: bool hasEqualOrGreaterPtx(int, int) const { throw_nogpu; return false; } bool hasEqualOrGreaterBin(int, int) const { throw_nogpu; return false; } - void printCudaDeviceInfo(int) const { throw_nogpu; } - void printShortCudaDeviceInfo(int) const { throw_nogpu; } + void printCudaDeviceInfo(int) const + { + printf("The library is compiled without CUDA support\n"); + } + + void printShortCudaDeviceInfo(int) const + { + printf("The library is compiled without CUDA support\n"); + } }; class EmptyFuncTable : public GpuFuncTable @@ -113,7 +124,7 @@ public: void convert(const GpuMat&, GpuMat&) const { throw_nogpu; } void convert(const GpuMat&, GpuMat&, double, double, cudaStream_t stream = 0) const { (void)stream; throw_nogpu; } - virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, CUstream_st*) const { throw_nogpu; } + virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, cudaStream_t) const { throw_nogpu; } void mallocPitch(void**, size_t*, size_t, size_t) const { throw_nogpu; } void free(void*) const {} From 069f3d8d9a1b5c500e56d4547cf42105542efb62 Mon Sep 17 00:00:00 2001 From: Alexander Smorkalov Date: Thu, 19 Dec 2013 18:36:02 +0400 Subject: [PATCH 11/13] Build fixes for GPU module. --- modules/core/src/gpumat.cpp | 2 +- modules/gpu/perf4au/CMakeLists.txt | 30 ++++++++++--------- modules/stitching/src/blenders.cpp | 6 ++-- modules/stitching/src/matchers.cpp | 10 +++---- modules/stitching/src/precomp.hpp | 2 +- modules/stitching/src/seam_finders.cpp | 2 +- modules/stitching/src/stitcher.cpp | 2 +- modules/stitching/src/warpers.cpp | 2 +- .../opencv2/videostab/optical_flow.hpp | 4 +-- modules/videostab/src/inpainting.cpp | 2 +- modules/videostab/src/optical_flow.cpp | 2 +- 11 files changed, 33 insertions(+), 31 deletions(-) diff --git a/modules/core/src/gpumat.cpp b/modules/core/src/gpumat.cpp index 17d46abcc..7a7b91d1d 100644 --- a/modules/core/src/gpumat.cpp +++ b/modules/core/src/gpumat.cpp @@ -752,5 +752,5 @@ void cv::gpu::error(const char *error_string, const char *file, const int line, cerr.flush(); } else - ::cv::error( ::cv::Exception(code, error_string, func, file, line) ); + cv::error( cv::Exception(code, error_string, func, file, line) ); } diff --git a/modules/gpu/perf4au/CMakeLists.txt b/modules/gpu/perf4au/CMakeLists.txt index 376e7b270..13efe7ffa 100644 --- a/modules/gpu/perf4au/CMakeLists.txt +++ b/modules/gpu/perf4au/CMakeLists.txt @@ -2,26 +2,28 @@ set(PERF4AU_REQUIRED_DEPS opencv_core opencv_imgproc opencv_highgui opencv_video ocv_check_dependencies(${PERF4AU_REQUIRED_DEPS}) -set(the_target gpu_perf4au) -project(${the_target}) +if (OCV_DEPENDENCIES_FOUND) + set(the_target gpu_perf4au) + project(${the_target}) -ocv_include_modules(${PERF4AU_REQUIRED_DEPS}) + ocv_include_modules(${PERF4AU_REQUIRED_DEPS}) -if(CMAKE_COMPILER_IS_GNUCXX AND NOT ENABLE_NOISY_WARNINGS) + if(CMAKE_COMPILER_IS_GNUCXX AND NOT ENABLE_NOISY_WARNINGS) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-unused-function") -endif() + endif() -file(GLOB srcs RELATIVE 
${CMAKE_CURRENT_SOURCE_DIR} *.cpp *.h *.hpp)
-add_executable(${the_target} ${srcs})
+  file(GLOB srcs RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.cpp *.h *.hpp)
+  add_executable(${the_target} ${srcs})
 
-target_link_libraries(${the_target} ${OPENCV_LINKER_LIBS} ${PERF4AU_REQUIRED_DEPS})
+  target_link_libraries(${the_target} ${OPENCV_LINKER_LIBS} ${PERF4AU_REQUIRED_DEPS})
 
-if(ENABLE_SOLUTION_FOLDERS)
-  set_target_properties(${the_target} PROPERTIES FOLDER "tests performance")
-endif()
+  if(ENABLE_SOLUTION_FOLDERS)
+    set_target_properties(${the_target} PROPERTIES FOLDER "tests performance")
+  endif()
 
-if(WIN32)
+  if(WIN32)
     if(MSVC AND NOT BUILD_SHARED_LIBS)
-      set_target_properties(${the_target} PROPERTIES LINK_FLAGS "/NODEFAULTLIB:atlthunk.lib /NODEFAULTLIB:atlsd.lib /DEBUG")
+      set_target_properties(${the_target} PROPERTIES LINK_FLAGS "/NODEFAULTLIB:atlthunk.lib /NODEFAULTLIB:atlsd.lib /DEBUG")
     endif()
-endif()
+  endif()
+endif()
\ No newline at end of file
diff --git a/modules/stitching/src/blenders.cpp b/modules/stitching/src/blenders.cpp
index e65023a55..fb3c0d666 100644
--- a/modules/stitching/src/blenders.cpp
+++ b/modules/stitching/src/blenders.cpp
@@ -189,7 +189,7 @@ Rect FeatherBlender::createWeightMaps(const vector<Mat> &masks, const vector<Point> &corners,
 
 void createLaplacePyrGpu(const Mat &img, int num_levels, vector<Mat> &pyr)
 {
-#ifdef HAVE_OPENCV_GPU
+#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
     pyr.resize(num_levels + 1);
 
     vector<gpu::GpuMat> gpu_pyr(num_levels + 1);
@@ -531,7 +531,7 @@ void restoreImageFromLaplacePyr(vector<Mat> &pyr)
 
 void restoreImageFromLaplacePyrGpu(vector<Mat> &pyr)
 {
-#ifdef HAVE_OPENCV_GPU
+#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
     if (pyr.empty())
         return;
 
diff --git a/modules/stitching/src/matchers.cpp b/modules/stitching/src/matchers.cpp
index d918cfff2..d86206233 100644
--- a/modules/stitching/src/matchers.cpp
+++ b/modules/stitching/src/matchers.cpp
@@ -46,7 +46,7 @@ using namespace std;
 using namespace cv;
 using namespace cv::detail;
 
-#ifdef HAVE_OPENCV_GPU
+#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
 using namespace cv::gpu;
 #endif
 
@@ -129,7 +129,7 @@ private:
     float match_conf_;
 };
 
-#ifdef HAVE_OPENCV_GPU
+#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
 class GpuMatcher : public FeaturesMatcher
 {
 public:
@@ -204,7 +204,7 @@ void CpuMatcher::match(const ImageFeatures &features1, const ImageFeatures &feat
     LOG("1->2 & 2->1 matches: " << matches_info.matches.size() << endl);
 }
 
-#ifdef HAVE_OPENCV_GPU
+#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
 void GpuMatcher::match(const ImageFeatures &features1, const ImageFeatures &features2, MatchesInfo& matches_info)
 {
     matches_info.matches.clear();
@@ -432,7 +432,7 @@ void OrbFeaturesFinder::find(const Mat &image, ImageFeatures &features)
     }
 }
 
-#if defined(HAVE_OPENCV_NONFREE) && defined(HAVE_OPENCV_GPU)
+#if defined(HAVE_OPENCV_NONFREE) && defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
 SurfFeaturesFinderGpu::SurfFeaturesFinderGpu(double hess_thresh, int num_octaves, int num_layers,
                                              int num_octaves_descr, int num_layers_descr)
 {
@@ -533,7 +533,7 @@ void FeaturesMatcher::operator ()(const vector<ImageFeatures> &features, vector<
 BestOf2NearestMatcher::BestOf2NearestMatcher(bool try_use_gpu, float match_conf, int num_matches_thresh1, int num_matches_thresh2)
 {
-#ifdef HAVE_OPENCV_GPU
+#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
     if (try_use_gpu && getCudaEnabledDeviceCount() > 0)
         impl_ = new GpuMatcher(match_conf);
     else
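
The BestOf2NearestMatcher constructor above shows the pattern this commit applies across the stitching module: a compile-time guard keeping GPU code out of Android builds, plus a runtime device-count check. Condensed into a sketch; Worker, CpuWorker and GpuWorker are placeholders, not OpenCV classes:

    #include "opencv2/opencv_modules.hpp"
    #include "opencv2/core/gpumat.hpp"

    struct Worker { virtual ~Worker() {} virtual void run() = 0; };
    struct CpuWorker : Worker { void run() {} };
    #if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
    struct GpuWorker : Worker { void run() {} };
    #endif

    Worker* createWorker(bool try_gpu)
    {
    #if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
        // The binary may carry GPU code yet still run on a machine without CUDA.
        if (try_gpu && cv::gpu::getCudaEnabledDeviceCount() > 0)
            return new GpuWorker();
    #endif
        (void)try_gpu;
        return new CpuWorker();
    }
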
diff --git a/modules/stitching/src/precomp.hpp b/modules/stitching/src/precomp.hpp
index 1050856d3..54b672143 100644
--- a/modules/stitching/src/precomp.hpp
+++ b/modules/stitching/src/precomp.hpp
@@ -68,7 +68,7 @@
 #include "opencv2/imgproc/imgproc.hpp"
 #include "opencv2/features2d/features2d.hpp"
 #include "opencv2/calib3d/calib3d.hpp"
-#ifdef HAVE_OPENCV_GPU
+#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
 #include "opencv2/gpu/gpu.hpp"
 
 #ifdef HAVE_OPENCV_NONFREE
diff --git a/modules/stitching/src/seam_finders.cpp b/modules/stitching/src/seam_finders.cpp
index 784209c93..a198c1ebb 100644
--- a/modules/stitching/src/seam_finders.cpp
+++ b/modules/stitching/src/seam_finders.cpp
@@ -1318,7 +1318,7 @@ void GraphCutSeamFinder::find(const vector<Mat> &src, const vector<Point> &corne
 }
 
-#ifdef HAVE_OPENCV_GPU
+#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
 void GraphCutSeamFinderGpu::find(const vector<Mat> &src, const vector<Point> &corners,
                                  vector<Mat> &masks)
 {
diff --git a/modules/stitching/src/stitcher.cpp b/modules/stitching/src/stitcher.cpp
index 5da26f6db..4a36ab0a4 100644
--- a/modules/stitching/src/stitcher.cpp
+++ b/modules/stitching/src/stitcher.cpp
@@ -58,7 +58,7 @@ Stitcher Stitcher::createDefault(bool try_use_gpu)
     stitcher.setFeaturesMatcher(new detail::BestOf2NearestMatcher(try_use_gpu));
     stitcher.setBundleAdjuster(new detail::BundleAdjusterRay());
 
-#ifdef HAVE_OPENCV_GPU
+#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
     if (try_use_gpu && gpu::getCudaEnabledDeviceCount() > 0)
     {
 #if defined(HAVE_OPENCV_NONFREE)
diff --git a/modules/stitching/src/warpers.cpp b/modules/stitching/src/warpers.cpp
index 932958c6f..935831950 100644
--- a/modules/stitching/src/warpers.cpp
+++ b/modules/stitching/src/warpers.cpp
@@ -212,7 +212,7 @@ void SphericalWarper::detectResultRoi(Size src_size, Point &dst_tl, Point &dst_b
 }
 
-#ifdef HAVE_OPENCV_GPU
+#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
 Rect PlaneWarperGpu::buildMaps(Size src_size, const Mat &K, const Mat &R, gpu::GpuMat &xmap, gpu::GpuMat &ymap)
 {
     return buildMaps(src_size, K, R, Mat::zeros(3, 1, CV_32F), xmap, ymap);
diff --git a/modules/videostab/include/opencv2/videostab/optical_flow.hpp b/modules/videostab/include/opencv2/videostab/optical_flow.hpp
index 18b7d3f28..2c1742fc7 100644
--- a/modules/videostab/include/opencv2/videostab/optical_flow.hpp
+++ b/modules/videostab/include/opencv2/videostab/optical_flow.hpp
@@ -46,7 +46,7 @@
 #include "opencv2/core/core.hpp"
 #include "opencv2/opencv_modules.hpp"
 
-#ifdef HAVE_OPENCV_GPU
+#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
 #  include "opencv2/gpu/gpu.hpp"
 #endif
 
@@ -98,7 +98,7 @@ public:
     OutputArray status, OutputArray errors);
 };
 
-#ifdef HAVE_OPENCV_GPU
+#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
 class CV_EXPORTS DensePyrLkOptFlowEstimatorGpu
     : public PyrLkOptFlowEstimatorBase, public IDenseOptFlowEstimator
 {
diff --git a/modules/videostab/src/inpainting.cpp b/modules/videostab/src/inpainting.cpp
index 4377c007c..c6568e071 100644
--- a/modules/videostab/src/inpainting.cpp
+++ b/modules/videostab/src/inpainting.cpp
@@ -323,7 +323,7 @@ public:
 
 MotionInpainter::MotionInpainter()
 {
-#ifdef HAVE_OPENCV_GPU
+#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
     setOptFlowEstimator(new DensePyrLkOptFlowEstimatorGpu());
 #else
     CV_Error(CV_StsNotImplemented, "Current implementation of MotionInpainter requires GPU");
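
MotionInpainter above still fails hard when built without a GPU implementation, while most other call sites in this commit degrade to a CPU path. One way to keep the repeated double guard readable is a small predicate; gpuUsable is a sketch, not part of the patch:

    #include "opencv2/opencv_modules.hpp"
    #if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
    #  include "opencv2/gpu/gpu.hpp"
    #endif

    static bool gpuUsable()
    {
    #if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
        // Compile-time guard passed; still verify a CUDA device exists at runtime.
        return cv::gpu::getCudaEnabledDeviceCount() > 0;
    #else
        return false;
    #endif
    }
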
diff --git a/modules/videostab/src/optical_flow.cpp b/modules/videostab/src/optical_flow.cpp
index 46100fdb5..3441df168 100644
--- a/modules/videostab/src/optical_flow.cpp
+++ b/modules/videostab/src/optical_flow.cpp
@@ -59,7 +59,7 @@ void SparsePyrLkOptFlowEstimator::run(
 }
 
-#ifdef HAVE_OPENCV_GPU
+#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
 DensePyrLkOptFlowEstimatorGpu::DensePyrLkOptFlowEstimatorGpu()
 {
     CV_Assert(gpu::getCudaEnabledDeviceCount() > 0);
 
From 529bd41751e526604726ccc9bff68a448693a3be Mon Sep 17 00:00:00 2001
From: Alexander Smorkalov
Date: Fri, 20 Dec 2013 09:46:03 +0400
Subject: [PATCH 12/13] Build fixes for the case where HAVE_CUDA==OFF.

---
 modules/core/CMakeLists.txt        | 14 ++++++++------
 modules/core/src/gpumat.cpp        |  2 +-
 samples/cpp/stitching_detailed.cpp |  8 ++++----
 3 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt
index 2409ee9e9..0d985f288 100644
--- a/modules/core/CMakeLists.txt
+++ b/modules/core/CMakeLists.txt
@@ -1,6 +1,6 @@
 set(the_description "The Core Functionality")
 
-if (ENABLE_DYNAMIC_CUDA)
+if (NOT HAVE_CUDA OR ENABLE_DYNAMIC_CUDA)
   ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES})
 else()
   ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES} ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})
@@ -15,7 +15,9 @@ endif()
 if(ENABLE_DYNAMIC_CUDA)
   add_definitions(-DDYNAMIC_CUDA_SUPPORT)
 else()
-  add_definitions(-DUSE_CUDA)
+  if (HAVE_CUDA)
+    add_definitions(-DUSE_CUDA)
+  endif()
 endif()
 
 if(HAVE_CUDA)
@@ -26,18 +28,18 @@ endif()
 file(GLOB lib_cuda_hdrs "include/opencv2/${name}/cuda/*.hpp" "include/opencv2/${name}/cuda/*.h")
 file(GLOB lib_cuda_hdrs_detail "include/opencv2/${name}/cuda/detail/*.hpp" "include/opencv2/${name}/cuda/detail/*.h")
 
-if (NOT ENABLE_DYNAMIC_CUDA)
-  file(GLOB lib_cuda "../dynamicuda/src/cuda/*.cu*")
+if (HAVE_CUDA AND NOT ENABLE_DYNAMIC_CUDA)
+  file(GLOB lib_cuda "../dynamicuda/src/cuda/*.cu*")
 endif()
 
 source_group("Cuda Headers" FILES ${lib_cuda_hdrs})
 source_group("Cuda Headers\\Detail" FILES ${lib_cuda_hdrs_detail})
 
-if (NOT ENABLE_DYNAMIC_CUDA)
+if (HAVE_CUDA AND NOT ENABLE_DYNAMIC_CUDA)
   source_group("Src\\Cuda" FILES ${lib_cuda} ${lib_cuda_hdrs})
 endif()
 
-if (ENABLE_DYNAMIC_CUDA)
+if (NOT HAVE_CUDA OR ENABLE_DYNAMIC_CUDA)
   ocv_glob_module_sources(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc"
                           HEADERS ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail})
 else()
diff --git a/modules/core/src/gpumat.cpp b/modules/core/src/gpumat.cpp
index 7a7b91d1d..310aabd58 100644
--- a/modules/core/src/gpumat.cpp
+++ b/modules/core/src/gpumat.cpp
@@ -229,7 +229,7 @@ static DeviceInfoFuncTable* deviceInfoFuncTable()
     static CudaDeviceInfoFuncTable impl;
     static DeviceInfoFuncTable* funcTable = &impl;
 #else
-    static EmptyFuncTable stub;
+    static EmptyDeviceInfoFuncTable stub;
     static DeviceInfoFuncTable* funcTable = &stub;
 #endif
 #endif
diff --git a/samples/cpp/stitching_detailed.cpp b/samples/cpp/stitching_detailed.cpp
index 49d86086d..7394a7282 100644
--- a/samples/cpp/stitching_detailed.cpp
+++ b/samples/cpp/stitching_detailed.cpp
@@ -355,7 +355,7 @@ int main(int argc, char* argv[])
     Ptr<FeaturesFinder> finder;
     if (features_type == "surf")
     {
-#if defined(HAVE_OPENCV_NONFREE) && defined(HAVE_OPENCV_GPU)
+#if defined(HAVE_OPENCV_NONFREE) && defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
         if (try_gpu && gpu::getCudaEnabledDeviceCount() > 0)
             finder = new SurfFeaturesFinderGpu();
         else
@@ -543,7 +543,7 @@ int main(int argc, char* argv[])
 
     // Warp images and their masks
     Ptr<WarperCreator> warper_creator;
-#ifdef HAVE_OPENCV_GPU
+#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
     if (try_gpu && gpu::getCudaEnabledDeviceCount() > 0)
     {
         if (warp_type == "plane") warper_creator = new cv::PlaneWarperGpu();
@@ -608,7 +608,7 @@ int main(int argc, char* argv[])
         seam_finder = new detail::VoronoiSeamFinder();
     else if (seam_find_type ==
"gc_color") { -#ifdef HAVE_OPENCV_GPU +#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID) if (try_gpu && gpu::getCudaEnabledDeviceCount() > 0) seam_finder = new detail::GraphCutSeamFinderGpu(GraphCutSeamFinderBase::COST_COLOR); else @@ -617,7 +617,7 @@ int main(int argc, char* argv[]) } else if (seam_find_type == "gc_colorgrad") { -#ifdef HAVE_OPENCV_GPU +#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID) if (try_gpu && gpu::getCudaEnabledDeviceCount() > 0) seam_finder = new detail::GraphCutSeamFinderGpu(GraphCutSeamFinderBase::COST_COLOR_GRAD); else From bc72f4d2a2bb75af19edeb6bf5ed0128b891a2cd Mon Sep 17 00:00:00 2001 From: Alexander Smorkalov Date: Fri, 20 Dec 2013 16:32:34 +0400 Subject: [PATCH 13/13] Code review fixes. --- CMakeLists.txt | 19 ++++++++++++++++++- modules/core/CMakeLists.txt | 6 ++++-- modules/core/include/opencv2/core/gpumat.hpp | 13 +++++-------- modules/core/src/gpumat.cpp | 15 +++++++++------ modules/dynamicuda/CMakeLists.txt | 4 ++-- .../include/opencv2/dynamicuda/dynamicuda.hpp | 4 ++-- modules/stitching/CMakeLists.txt | 6 +++++- .../opencv2/stitching/detail/seam_finders.hpp | 2 +- .../opencv2/stitching/detail/warpers.hpp | 4 ++-- .../include/opencv2/stitching/warpers.hpp | 2 +- modules/videostab/CMakeLists.txt | 6 +++++- 11 files changed, 54 insertions(+), 27 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2c5165c1e..06863804d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -201,7 +201,7 @@ OCV_OPTION(INSTALL_TO_MANGLED_PATHS "Enables mangled install paths, that help wi # OpenCV build options # =================================================== -OCV_OPTION(ENABLE_DYNAMIC_CUDA "Enabled dynamic CUDA linkage" ON IF ANDROID OR LINUX) +OCV_OPTION(ENABLE_DYNAMIC_CUDA "Enabled dynamic CUDA linkage" ON IF ANDROID ) OCV_OPTION(ENABLE_PRECOMPILED_HEADERS "Use precompiled headers" ON IF (NOT IOS) ) OCV_OPTION(ENABLE_SOLUTION_FOLDERS "Solution folder in Visual Studio or in other IDEs" (MSVC_IDE OR CMAKE_GENERATOR MATCHES Xcode) IF (CMAKE_VERSION VERSION_GREATER "2.8.0") ) OCV_OPTION(ENABLE_PROFILING "Enable profiling in the GCC compiler (Add flags: -g -pg)" OFF IF CMAKE_COMPILER_IS_GNUCXX ) @@ -459,6 +459,23 @@ if(WITH_OPENCL) include(cmake/OpenCVDetectOpenCL.cmake) endif() +# ---------------------------------------------------------------------------- +# Add CUDA libraries (needed for apps/tools, samples) +# ---------------------------------------------------------------------------- +if(NOT HAVE_CUDA) + set(ENABLE_DYNAMIC_CUDA OFF) +endif() + +if(HAVE_CUDA AND NOT ENABLE_DYNAMIC_CUDA) + set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY}) + if(HAVE_CUBLAS) + set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} ${CUDA_cublas_LIBRARY}) + endif() + if(HAVE_CUFFT) + set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} ${CUDA_cufft_LIBRARY}) + endif() +endif() + # ---------------------------------------------------------------------------- # Solution folders: # ---------------------------------------------------------------------------- diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt index 0d985f288..a1e71bf4f 100644 --- a/modules/core/CMakeLists.txt +++ b/modules/core/CMakeLists.txt @@ -28,8 +28,10 @@ endif() file(GLOB lib_cuda_hdrs "include/opencv2/${name}/cuda/*.hpp" "include/opencv2/${name}/cuda/*.h") file(GLOB lib_cuda_hdrs_detail "include/opencv2/${name}/cuda/detail/*.hpp" "include/opencv2/${name}/cuda/detail/*.h") -if (HAVE_CUDA AND NOT ENABLE_DYNAMIC_CUDA) +if(HAVE_CUDA AND NOT ENABLE_DYNAMIC_CUDA) file(GLOB 
lib_cuda "../dynamicuda/src/cuda/*.cu*") + ocv_include_directories(${CUDA_INCLUDE_DIRS}) + ocv_cuda_compile(cuda_objs ${lib_cuda}) endif() source_group("Cuda Headers" FILES ${lib_cuda_hdrs}) @@ -43,7 +45,7 @@ if (NOT HAVE_CUDA OR ENABLE_DYNAMIC_CUDA) ocv_glob_module_sources(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc" HEADERS ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail}) else() - ocv_glob_module_sources(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc" ${lib_cuda} + ocv_glob_module_sources(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc" ${lib_cuda} ${cuda_objs} HEADERS ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail}) endif() diff --git a/modules/core/include/opencv2/core/gpumat.hpp b/modules/core/include/opencv2/core/gpumat.hpp index d0f415ec3..193c9aa70 100644 --- a/modules/core/include/opencv2/core/gpumat.hpp +++ b/modules/core/include/opencv2/core/gpumat.hpp @@ -112,13 +112,13 @@ namespace cv { namespace gpu // Creates DeviceInfo object for the given GPU DeviceInfo(int device_id) : device_id_(device_id) { query(); } - std::string name() const; + std::string name() const { return name_; } // Return compute capability versions - int majorVersion() const; - int minorVersion() const; + int majorVersion() const { return majorVersion_; } + int minorVersion() const { return minorVersion_; } - int multiProcessorCount() const; + int multiProcessorCount() const { return multi_processor_count_; } size_t sharedMemPerBlock() const; @@ -132,12 +132,9 @@ namespace cv { namespace gpu // Checks whether the GPU module can be run on the given device bool isCompatible() const; - int deviceID() const; + int deviceID() const { return device_id_; } private: - // Private section is fictive to preserve bin compatibility. - // Changes in the private fields there have no effects. - // see deligate code. 
void query(); int device_id_; diff --git a/modules/core/src/gpumat.cpp b/modules/core/src/gpumat.cpp index 310aabd58..94bb54823 100644 --- a/modules/core/src/gpumat.cpp +++ b/modules/core/src/gpumat.cpp @@ -263,12 +263,15 @@ size_t cv::gpu::DeviceInfo::freeMemory() const { return deviceInfoFuncTable()->f size_t cv::gpu::DeviceInfo::totalMemory() const { return deviceInfoFuncTable()->totalMemory(); } bool cv::gpu::DeviceInfo::supports(FeatureSet feature_set) const { return deviceInfoFuncTable()->supports(feature_set); } bool cv::gpu::DeviceInfo::isCompatible() const { return deviceInfoFuncTable()->isCompatible(); } -int cv::gpu::DeviceInfo::deviceID() const { return deviceInfoFuncTable()->deviceID(); }; -int cv::gpu::DeviceInfo::majorVersion() const { return deviceInfoFuncTable()->majorVersion(); } -int cv::gpu::DeviceInfo::minorVersion() const { return deviceInfoFuncTable()->minorVersion(); } -std::string cv::gpu::DeviceInfo::name() const { return deviceInfoFuncTable()->name(); } -int cv::gpu::DeviceInfo::multiProcessorCount() const { return deviceInfoFuncTable()->multiProcessorCount(); } -void cv::gpu::DeviceInfo::query() { deviceInfoFuncTable()->query(); } + +void cv::gpu::DeviceInfo::query() +{ + deviceInfoFuncTable()->query(); + name_ = deviceInfoFuncTable()->name(); + multi_processor_count_ = deviceInfoFuncTable()->multiProcessorCount(); + majorVersion_ = deviceInfoFuncTable()->majorVersion(); + minorVersion_ = deviceInfoFuncTable()->minorVersion(); +} void cv::gpu::printCudaDeviceInfo(int device) { deviceInfoFuncTable()->printCudaDeviceInfo(device); } void cv::gpu::printShortCudaDeviceInfo(int device) { deviceInfoFuncTable()->printShortCudaDeviceInfo(device); } diff --git a/modules/dynamicuda/CMakeLists.txt b/modules/dynamicuda/CMakeLists.txt index 031b5e48d..f67879ef9 100644 --- a/modules/dynamicuda/CMakeLists.txt +++ b/modules/dynamicuda/CMakeLists.txt @@ -1,4 +1,4 @@ -if(NOT ANDROID OR NOT HAVE_CUDA) +if(NOT DYNAMIC_CUDA_SUPPORT) ocv_module_disable(dynamicuda) endif() @@ -11,5 +11,5 @@ set(OPENCV_MODULE_TYPE SHARED) if (BUILD_FAT_JAVA_LIB) ocv_define_module(dynamicuda opencv_java PRIVATE_REQUIRED ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY}) else() - ocv_define_module(dynamicuda opencv_core PRIVATE_REQUIRED q${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY}) + ocv_define_module(dynamicuda opencv_core PRIVATE_REQUIRED ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY}) endif() diff --git a/modules/dynamicuda/include/opencv2/dynamicuda/dynamicuda.hpp b/modules/dynamicuda/include/opencv2/dynamicuda/dynamicuda.hpp index c5057ab99..8973c5304 100644 --- a/modules/dynamicuda/include/opencv2/dynamicuda/dynamicuda.hpp +++ b/modules/dynamicuda/include/opencv2/dynamicuda/dynamicuda.hpp @@ -539,7 +539,7 @@ private: DeviceProps deviceProps; -class CudaDeviceInfoFuncTable: DeviceInfoFuncTable +class CudaDeviceInfoFuncTable : public DeviceInfoFuncTable { public: size_t sharedMemPerBlock() const @@ -1109,4 +1109,4 @@ public: } }; #endif -#endif \ No newline at end of file +#endif diff --git a/modules/stitching/CMakeLists.txt b/modules/stitching/CMakeLists.txt index fda44591f..6e9a35ba7 100644 --- a/modules/stitching/CMakeLists.txt +++ b/modules/stitching/CMakeLists.txt @@ -1,2 +1,6 @@ set(the_description "Images stitching") -ocv_define_module(stitching opencv_imgproc opencv_features2d opencv_calib3d opencv_objdetect OPTIONAL opencv_gpu opencv_nonfree) +if (ENABLE_DYNAMIC_CUDA) + ocv_define_module(stitching opencv_imgproc opencv_features2d opencv_calib3d opencv_objdetect OPTIONAL opencv_nonfree) +else() + 
ocv_define_module(stitching opencv_imgproc opencv_features2d opencv_calib3d opencv_objdetect OPTIONAL opencv_gpu opencv_nonfree) +endif() \ No newline at end of file diff --git a/modules/stitching/include/opencv2/stitching/detail/seam_finders.hpp b/modules/stitching/include/opencv2/stitching/detail/seam_finders.hpp index 09a1a106f..9301dc5eb 100644 --- a/modules/stitching/include/opencv2/stitching/detail/seam_finders.hpp +++ b/modules/stitching/include/opencv2/stitching/detail/seam_finders.hpp @@ -227,7 +227,7 @@ private: }; -#ifdef HAVE_OPENCV_GPU +#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID) class CV_EXPORTS GraphCutSeamFinderGpu : public GraphCutSeamFinderBase, public PairwiseSeamFinder { public: diff --git a/modules/stitching/include/opencv2/stitching/detail/warpers.hpp b/modules/stitching/include/opencv2/stitching/detail/warpers.hpp index 2bd46f75a..d44bfe69e 100644 --- a/modules/stitching/include/opencv2/stitching/detail/warpers.hpp +++ b/modules/stitching/include/opencv2/stitching/detail/warpers.hpp @@ -46,7 +46,7 @@ #include "opencv2/core/core.hpp" #include "opencv2/imgproc/imgproc.hpp" #include "opencv2/opencv_modules.hpp" -#ifdef HAVE_OPENCV_GPU +#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID) # include "opencv2/gpu/gpu.hpp" #endif @@ -331,7 +331,7 @@ public: }; -#ifdef HAVE_OPENCV_GPU +#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID) class CV_EXPORTS PlaneWarperGpu : public PlaneWarper { public: diff --git a/modules/stitching/include/opencv2/stitching/warpers.hpp b/modules/stitching/include/opencv2/stitching/warpers.hpp index 7475d1304..87efa7e80 100644 --- a/modules/stitching/include/opencv2/stitching/warpers.hpp +++ b/modules/stitching/include/opencv2/stitching/warpers.hpp @@ -145,7 +145,7 @@ public: -#ifdef HAVE_OPENCV_GPU +#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID) class PlaneWarperGpu: public WarperCreator { public: diff --git a/modules/videostab/CMakeLists.txt b/modules/videostab/CMakeLists.txt index ac5cb0d69..84ec1d2e8 100644 --- a/modules/videostab/CMakeLists.txt +++ b/modules/videostab/CMakeLists.txt @@ -1,2 +1,6 @@ set(the_description "Video stabilization") -ocv_define_module(videostab opencv_imgproc opencv_features2d opencv_video opencv_photo opencv_calib3d opencv_highgui OPTIONAL opencv_gpu) +if(ENABLE_DYNAMIC_CUDA) + ocv_define_module(videostab opencv_imgproc opencv_features2d opencv_video opencv_photo opencv_calib3d opencv_highgui) +else() + ocv_define_module(videostab opencv_imgproc opencv_features2d opencv_video opencv_photo opencv_calib3d opencv_highgui OPTIONAL opencv_gpu) +endif()
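
Taken together, the series makes a CUDA-less runtime degrade gracefully: device queries report zero devices and GPU entry points raise CV_GpuNotSupported instead of failing at link or load time. A small smoke test of that expected behavior; the exact outcome rests on the EmptyFuncTable stubs wired in earlier:

    #include "opencv2/core/core.hpp"
    #include "opencv2/core/gpumat.hpp"
    #include <iostream>

    int main()
    {
        // 0 whenever the dynamicuda backend is absent or no CUDA device is present.
        std::cout << "CUDA devices: " << cv::gpu::getCudaEnabledDeviceCount() << std::endl;

        try
        {
            cv::gpu::setDevice(0); // stubbed without CUDA: throws CV_GpuNotSupported
        }
        catch (const cv::Exception& e)
        {
            std::cout << "Expected failure: " << e.what() << std::endl;
        }
        return 0;
    }
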