From 5a5c82bb1d395aeb76bd76f14a1db22742c02599 Mon Sep 17 00:00:00 2001
From: Alexander Smorkalov
Date: Thu, 19 Dec 2013 17:41:04 +0400
Subject: [PATCH] Additional ENABLE_DYNAMIC_CUDA option implemented in cmake.
 Warning fixes and refactoring.

---
 CMakeLists.txt                                 |    1 +
 modules/core/CMakeLists.txt                    |   14 +-
 modules/dynamicuda/CMakeLists.txt              |    1 +
 .../include/opencv2/dynamicuda/dynamicuda.hpp  | 1899 +++++++++--
 modules/dynamicuda/src/main.cpp                |    3 +
 modules/java/CMakeLists.txt                    |    2 +-
 6 files changed, 969 insertions(+), 951 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index cf25084bc..2c5165c1e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -201,6 +201,7 @@ OCV_OPTION(INSTALL_TO_MANGLED_PATHS "Enables mangled install paths, that help wi
 
 # OpenCV build options
 # ===================================================
+OCV_OPTION(ENABLE_DYNAMIC_CUDA "Enable dynamic CUDA linkage" ON IF ANDROID OR LINUX)
 OCV_OPTION(ENABLE_PRECOMPILED_HEADERS "Use precompiled headers" ON IF (NOT IOS) )
 OCV_OPTION(ENABLE_SOLUTION_FOLDERS "Solution folder in Visual Studio or in other IDEs" (MSVC_IDE OR CMAKE_GENERATOR MATCHES Xcode) IF (CMAKE_VERSION VERSION_GREATER "2.8.0") )
 OCV_OPTION(ENABLE_PROFILING "Enable profiling in the GCC compiler (Add flags: -g -pg)" OFF IF CMAKE_COMPILER_IS_GNUCXX )
diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt
index e89d6f276..f20e32d3a 100644
--- a/modules/core/CMakeLists.txt
+++ b/modules/core/CMakeLists.txt
@@ -1,8 +1,12 @@
 set(the_description "The Core Functionality")
 
-if (HAVE_opencv_dynamicuda)
+message(STATUS "ENABLE_DYNAMIC_CUDA ${ENABLE_DYNAMIC_CUDA}")
+
+if (ENABLE_DYNAMIC_CUDA)
+  message(STATUS "Using dynamic CUDA approach")
   ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES})
 else()
+  message(STATUS "Linking CUDA statically")
   ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES} ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})
 endif()
 
@@ -12,7 +16,7 @@ if(HAVE_WINRT)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /ZW /GS /Gm- /AI\"${WINDOWS_SDK_PATH}/References/CommonConfiguration/Neutral\" /AI\"${VISUAL_STUDIO_PATH}/vcpackages\"")
 endif()
 
-if(HAVE_opencv_dynamicuda)
+if(ENABLE_DYNAMIC_CUDA)
   add_definitions(-DDYNAMIC_CUDA_SUPPORT)
 else()
   add_definitions(-DUSE_CUDA)
@@ -26,18 +30,18 @@ endif()
 file(GLOB lib_cuda_hdrs        "include/opencv2/${name}/cuda/*.hpp"        "include/opencv2/${name}/cuda/*.h")
 file(GLOB lib_cuda_hdrs_detail "include/opencv2/${name}/cuda/detail/*.hpp" "include/opencv2/${name}/cuda/detail/*.h")
 
-if (NOT HAVE_opencv_dynamicuda)
+if (NOT ENABLE_DYNAMIC_CUDA)
   file(GLOB lib_cuda "../dynamicuda/src/cuda/*.cu*")
 endif()
 
 source_group("Cuda Headers" FILES ${lib_cuda_hdrs})
 source_group("Cuda Headers\\Detail" FILES ${lib_cuda_hdrs_detail})
 
-if (NOT HAVE_opencv_dynamicuda)
+if (NOT ENABLE_DYNAMIC_CUDA)
   source_group("Src\\Cuda" FILES ${lib_cuda} ${lib_cuda_hdrs})
 endif()
 
-if (HAVE_opencv_dynamicuda)
+if (ENABLE_DYNAMIC_CUDA)
   ocv_glob_module_sources(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc"
     HEADERS ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail})
 else()
diff --git a/modules/dynamicuda/CMakeLists.txt b/modules/dynamicuda/CMakeLists.txt
index 2ae5cf84a..def05d19b 100644
--- a/modules/dynamicuda/CMakeLists.txt
+++ b/modules/dynamicuda/CMakeLists.txt
@@ -5,6 +5,7 @@ endif()
 
 set(the_description "Dynamic CUDA linkage")
 add_definitions(-DUSE_CUDA)
+ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef)
 ocv_module_include_directories("${OpenCV_SOURCE_DIR}/modules/gpu/include")
 set(OPENCV_MODULE_TYPE SHARED)
 if (BUILD_FAT_JAVA_LIB)
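The CMake changes above set up two mutually exclusive build modes. With ENABLE_DYNAMIC_CUDA=ON, opencv_core is linked without the CUDA and NPP libraries and is compiled with -DDYNAMIC_CUDA_SUPPORT, deferring all GPU work to the separately built opencv_dynamicuda shared module; with the option OFF, core links CUDA statically and is compiled with -DUSE_CUDA. A minimal sketch of the runtime selection this split makes possible follows; the selector function, library name, and factory symbol are illustrative assumptions, not code from this patch:

#include <dlfcn.h>

// Hypothetical selector: pick a GpuFuncTable implementation once, at first use.
static GpuFuncTable* gpuFuncTable()
{
#if defined(USE_CUDA)
    static CudaFuncTable table;   // CUDA backend compiled straight into core
    return &table;
#elif defined(DYNAMIC_CUDA_SUPPORT)
    static GpuFuncTable* table = 0;
    if (!table)
    {
        // Look for the optional CUDA backend; fall back to the no-GPU stubs.
        void* handle = dlopen("libopencv_dynamicuda.so", RTLD_LAZY);
        typedef GpuFuncTable* (*factory_t)();
        factory_t factory = handle ? (factory_t)dlsym(handle, "gpuFactory") : 0;
        static EmptyFuncTable stub;
        table = factory ? factory() : &stub;
    }
    return table;
#else
    static EmptyFuncTable stub;   // built without CUDA support entirely
    return &stub;
#endif
}

Either way, the rest of opencv_core only ever calls through the table types declared in dynamicuda.hpp below, so the same core binary can run on devices with or without a CUDA runtime.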
diff --git a/modules/dynamicuda/include/opencv2/dynamicuda/dynamicuda.hpp b/modules/dynamicuda/include/opencv2/dynamicuda/dynamicuda.hpp
index 9281655d7..4f5175513 100644
--- a/modules/dynamicuda/include/opencv2/dynamicuda/dynamicuda.hpp
+++ b/modules/dynamicuda/include/opencv2/dynamicuda/dynamicuda.hpp
@@ -1,123 +1,123 @@
 #ifndef __GPUMAT_CUDA_HPP__
 #define __GPUMAT_CUDA_HPP__
 
-    class DeviceInfoFuncTable
-    {
-    public:
-        // cv::DeviceInfo
-        virtual size_t sharedMemPerBlock() const = 0;
-        virtual void queryMemory(size_t&, size_t&) const = 0;
-        virtual size_t freeMemory() const = 0;
-        virtual size_t totalMemory() const = 0;
-        virtual bool supports(FeatureSet) const = 0;
-        virtual bool isCompatible() const = 0;
-        virtual void query() = 0;
-        virtual int deviceID() const = 0;
-        virtual std::string name() const = 0;
-        virtual int majorVersion() const = 0;
-        virtual int minorVersion() const = 0;
-        virtual int multiProcessorCount() const = 0;
-        virtual int getCudaEnabledDeviceCount() const = 0;
-        virtual void setDevice(int) const = 0;
-        virtual int getDevice() const = 0;
-        virtual void resetDevice() const = 0;
-        virtual bool deviceSupports(FeatureSet) const = 0;
+class DeviceInfoFuncTable
+{
+public:
+    // cv::DeviceInfo
+    virtual size_t sharedMemPerBlock() const = 0;
+    virtual void queryMemory(size_t&, size_t&) const = 0;
+    virtual size_t freeMemory() const = 0;
+    virtual size_t totalMemory() const = 0;
+    virtual bool supports(FeatureSet) const = 0;
+    virtual bool isCompatible() const = 0;
+    virtual void query() = 0;
+    virtual int deviceID() const = 0;
+    virtual std::string name() const = 0;
+    virtual int majorVersion() const = 0;
+    virtual int minorVersion() const = 0;
+    virtual int multiProcessorCount() const = 0;
+    virtual int getCudaEnabledDeviceCount() const = 0;
+    virtual void setDevice(int) const = 0;
+    virtual int getDevice() const = 0;
+    virtual void resetDevice() const = 0;
+    virtual bool deviceSupports(FeatureSet) const = 0;
 
-        // cv::TargetArchs
-        virtual bool builtWith(FeatureSet) const = 0;
-        virtual bool has(int, int) const = 0;
-        virtual bool hasPtx(int, int) const = 0;
-        virtual bool hasBin(int, int) const = 0;
-        virtual bool hasEqualOrLessPtx(int, int) const = 0;
-        virtual bool hasEqualOrGreater(int, int) const = 0;
-        virtual bool hasEqualOrGreaterPtx(int, int) const = 0;
-        virtual bool hasEqualOrGreaterBin(int, int) const = 0;
+    // cv::TargetArchs
+    virtual bool builtWith(FeatureSet) const = 0;
+    virtual bool has(int, int) const = 0;
+    virtual bool hasPtx(int, int) const = 0;
+    virtual bool hasBin(int, int) const = 0;
+    virtual bool hasEqualOrLessPtx(int, int) const = 0;
+    virtual bool hasEqualOrGreater(int, int) const = 0;
+    virtual bool hasEqualOrGreaterPtx(int, int) const = 0;
+    virtual bool hasEqualOrGreaterBin(int, int) const = 0;
 
-        virtual void printCudaDeviceInfo(int) const = 0;
-        virtual void printShortCudaDeviceInfo(int) const = 0;
+    virtual void printCudaDeviceInfo(int) const = 0;
+    virtual void printShortCudaDeviceInfo(int) const = 0;
 
-        virtual ~DeviceInfoFuncTable() {};
-    };
+    virtual ~DeviceInfoFuncTable() {};
+};
 
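DeviceInfoFuncTable mirrors, one virtual per call, everything cv::gpu::DeviceInfo and cv::gpu::TargetArchs need, so device queries work identically whether the backend is real or a stub. A sketch of the forwarding layer under the same assumptions as above, with a deviceInfoFuncTable() selector analogous to the hypothetical gpuFuncTable():

// Illustrative forwarding only: the public entry points contain no CUDA
// code and compile the same way in every build mode.
int getCudaEnabledDeviceCountSketch()
{
    return deviceInfoFuncTable()->getCudaEnabledDeviceCount();
}

bool deviceSupportsSketch(FeatureSet feature)
{
    return deviceInfoFuncTable()->deviceSupports(feature);
}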
-    class GpuFuncTable
-    {
-    public:
-        virtual ~GpuFuncTable() {}
+class GpuFuncTable
+{
+public:
+    virtual ~GpuFuncTable() {}
 
-        // GpuMat routines
-        virtual void copy(const Mat& src, GpuMat& dst) const = 0;
-        virtual void copy(const GpuMat& src, Mat& dst) const = 0;
-        virtual void copy(const GpuMat& src, GpuMat& dst) const = 0;
+    // GpuMat routines
+    virtual void copy(const Mat& src, GpuMat& dst) const = 0;
+    virtual void copy(const GpuMat& src, Mat& dst) const = 0;
+    virtual void copy(const GpuMat& src, GpuMat& dst) const = 0;
 
-        virtual void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const = 0;
+    virtual void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const = 0;
 
-        // gpu::device::convertTo funcs
-        virtual void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0) const = 0;
-        virtual void convert(const GpuMat& src, GpuMat& dst) const = 0;
+    // gpu::device::convertTo funcs
+    virtual void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0) const = 0;
+    virtual void convert(const GpuMat& src, GpuMat& dst) const = 0;
 
-        // for gpu::device::setTo funcs
-        virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, CUstream_st*) const = 0;
+    // for gpu::device::setTo funcs
+    virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, CUstream_st*) const = 0;
 
-        virtual void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const = 0;
-        virtual void free(void* devPtr) const = 0;
-    };
+    virtual void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const = 0;
+    virtual void free(void* devPtr) const = 0;
+};
 
-    class EmptyDeviceInfoFuncTable: public DeviceInfoFuncTable
-    {
-    public:
-        size_t sharedMemPerBlock() const { throw_nogpu; return 0; }
-        void queryMemory(size_t&, size_t&) const { throw_nogpu; }
-        size_t freeMemory() const { throw_nogpu; return 0; }
-        size_t totalMemory() const { throw_nogpu; return 0; }
-        bool supports(FeatureSet) const { throw_nogpu; return false; }
-        bool isCompatible() const { throw_nogpu; return false; }
-        void query() { throw_nogpu; }
-        int deviceID() const { throw_nogpu; return -1; };
-        std::string name() const { throw_nogpu; return std::string(); }
-        int majorVersion() const { throw_nogpu; return -1; }
-        int minorVersion() const { throw_nogpu; return -1; }
-        int multiProcessorCount() const { throw_nogpu; return -1; }
+class EmptyDeviceInfoFuncTable: public DeviceInfoFuncTable
+{
+public:
+    size_t sharedMemPerBlock() const { throw_nogpu; return 0; }
+    void queryMemory(size_t&, size_t&) const { throw_nogpu; }
+    size_t freeMemory() const { throw_nogpu; return 0; }
+    size_t totalMemory() const { throw_nogpu; return 0; }
+    bool supports(FeatureSet) const { throw_nogpu; return false; }
+    bool isCompatible() const { throw_nogpu; return false; }
+    void query() { throw_nogpu; }
+    int deviceID() const { throw_nogpu; return -1; };
+    std::string name() const { throw_nogpu; return std::string(); }
+    int majorVersion() const { throw_nogpu; return -1; }
+    int minorVersion() const { throw_nogpu; return -1; }
+    int multiProcessorCount() const { throw_nogpu; return -1; }
 
-        int getCudaEnabledDeviceCount() const { return 0; }
+    int getCudaEnabledDeviceCount() const { return 0; }
 
-        void setDevice(int) const { throw_nogpu; }
-        int getDevice() const { throw_nogpu; return 0; }
+    void setDevice(int) const { throw_nogpu; }
+    int getDevice() const { throw_nogpu; return 0; }
 
-        void resetDevice() const { throw_nogpu; }
+    void resetDevice() const { throw_nogpu; }
 
-        bool deviceSupports(FeatureSet) const { throw_nogpu; return false; }
+    bool deviceSupports(FeatureSet) const { throw_nogpu; return false; }
 
-        bool builtWith(FeatureSet) const { throw_nogpu; return false; }
-        bool has(int, int) const { throw_nogpu; return false; }
-        bool hasPtx(int, int) const { throw_nogpu; return false; }
-        bool hasBin(int, int) const { throw_nogpu; return false; }
-        bool hasEqualOrLessPtx(int, int) const { throw_nogpu; return false; }
-        bool hasEqualOrGreater(int, int) const { throw_nogpu; return false; }
-        bool hasEqualOrGreaterPtx(int, int) const { throw_nogpu; return false; }
-        bool hasEqualOrGreaterBin(int, int) const { throw_nogpu; return false; }
+    bool builtWith(FeatureSet) const { throw_nogpu; return false; }
+    bool has(int, int) const { throw_nogpu; return false; }
+    bool hasPtx(int, int) const { throw_nogpu; return false; }
+    bool hasBin(int, int) const { throw_nogpu; return false; }
+    bool hasEqualOrLessPtx(int, int) const { throw_nogpu; return false; }
+    bool hasEqualOrGreater(int, int) const { throw_nogpu; return false; }
+    bool hasEqualOrGreaterPtx(int, int) const { throw_nogpu; return false; }
+    bool hasEqualOrGreaterBin(int, int) const { throw_nogpu; return false; }
 
-        void printCudaDeviceInfo(int) const { throw_nogpu; }
-        void printShortCudaDeviceInfo(int) const { throw_nogpu; }
-    };
+    void printCudaDeviceInfo(int) const { throw_nogpu; }
+    void printShortCudaDeviceInfo(int) const { throw_nogpu; }
+};
 
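Note the deliberate exceptions in the stub tables: getCudaEnabledDeviceCount() returns 0 instead of throwing, so "is a GPU present?" probes stay safe in a CUDA-less process, and free() in EmptyFuncTable below is a silent no-op, so releasing a GpuMat never raises. Everything else fails through throw_nogpu; a plausible expansion, assumed here rather than shown in this patch, is:

// Assumed definition: raise a catchable cv::Exception instead of aborting.
#define throw_nogpu CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support")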
-    class EmptyFuncTable : public GpuFuncTable
-    {
-    public:
+class EmptyFuncTable : public GpuFuncTable
+{
+public:
 
-        void copy(const Mat&, GpuMat&) const { throw_nogpu; }
-        void copy(const GpuMat&, Mat&) const { throw_nogpu; }
-        void copy(const GpuMat&, GpuMat&) const { throw_nogpu; }
+    void copy(const Mat&, GpuMat&) const { throw_nogpu; }
+    void copy(const GpuMat&, Mat&) const { throw_nogpu; }
+    void copy(const GpuMat&, GpuMat&) const { throw_nogpu; }
 
-        void copyWithMask(const GpuMat&, GpuMat&, const GpuMat&) const { throw_nogpu; }
+    void copyWithMask(const GpuMat&, GpuMat&, const GpuMat&) const { throw_nogpu; }
 
-        void convert(const GpuMat&, GpuMat&) const { throw_nogpu; }
-        void convert(const GpuMat&, GpuMat&, double, double, cudaStream_t stream = 0) const { (void)stream; throw_nogpu; }
+    void convert(const GpuMat&, GpuMat&) const { throw_nogpu; }
+    void convert(const GpuMat&, GpuMat&, double, double, cudaStream_t stream = 0) const { (void)stream; throw_nogpu; }
 
-        virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, CUstream_st*) const { throw_nogpu; }
+    virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, CUstream_st*) const { throw_nogpu; }
 
-        void mallocPitch(void**, size_t*, size_t, size_t) const { throw_nogpu; }
-        void free(void*) const {}
-    };
+    void mallocPitch(void**, size_t*, size_t, size_t) const { throw_nogpu; }
+    void free(void*) const {}
+};
 
 #if defined(USE_CUDA)
 
@@ -153,940 +153,949 @@ namespace cv { namespace gpu { namespace device
     void convert_gpu(PtrStepSzb src, int sdepth, PtrStepSzb dst, int ddepth, double alpha, double beta, cudaStream_t stream);
 }}}
 
-    template void kernelSetCaller(GpuMat& src, Scalar s, cudaStream_t stream)
+template void kernelSetCaller(GpuMat& src, Scalar s, cudaStream_t stream)
+{
+    Scalar_ sf = s;
+    cv::gpu::device::set_to_gpu(src, sf.val, src.channels(), stream);
+}
+
+template void kernelSetCaller(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream)
+{
+    Scalar_ sf = s;
+    cv::gpu::device::set_to_gpu(src, sf.val, mask, src.channels(), stream);
+}
+
+template struct NPPTypeTraits;
+template<> struct NPPTypeTraits { typedef Npp8u npp_type; };
+template<> struct NPPTypeTraits { typedef Npp8s npp_type; };
+template<> struct NPPTypeTraits { typedef Npp16u npp_type; };
+template<> struct NPPTypeTraits { typedef Npp16s npp_type; };
+template<> struct NPPTypeTraits { typedef Npp32s npp_type; };
+template<> struct NPPTypeTraits { typedef Npp32f npp_type; };
+template<> struct
NPPTypeTraits { typedef Npp64f npp_type; }; + +////////////////////////////////////////////////////////////////////////// +// Convert + +template struct NppConvertFunc +{ + typedef typename NPPTypeTraits::npp_type src_t; + typedef typename NPPTypeTraits::npp_type dst_t; + + typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI); +}; +template struct NppConvertFunc +{ + typedef typename NPPTypeTraits::npp_type dst_t; + + typedef NppStatus (*func_ptr)(const Npp32f* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI, NppRoundMode eRoundMode); +}; + +template::func_ptr func> struct NppCvt +{ + typedef typename NPPTypeTraits::npp_type src_t; + typedef typename NPPTypeTraits::npp_type dst_t; + + static void call(const GpuMat& src, GpuMat& dst) { - Scalar_ sf = s; - cv::gpu::device::set_to_gpu(src, sf.val, src.channels(), stream); + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz) ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } +}; + +template::func_ptr func> struct NppCvt +{ + typedef typename NPPTypeTraits::npp_type dst_t; + + static void call(const GpuMat& src, GpuMat& dst) + { + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz, NPP_RND_NEAR) ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } +}; + +////////////////////////////////////////////////////////////////////////// +// Set + +template struct NppSetFunc +{ + typedef typename NPPTypeTraits::npp_type src_t; + + typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI); +}; +template struct NppSetFunc +{ + typedef typename NPPTypeTraits::npp_type src_t; + + typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI); +}; +template struct NppSetFunc +{ + typedef NppStatus (*func_ptr)(Npp8s values[], Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI); +}; +template<> struct NppSetFunc +{ + typedef NppStatus (*func_ptr)(Npp8s val, Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI); +}; + +template::func_ptr func> struct NppSet +{ + typedef typename NPPTypeTraits::npp_type src_t; + + static void call(GpuMat& src, Scalar s) + { + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + Scalar_ nppS = s; + + nppSafeCall( func(nppS.val, src.ptr(), static_cast(src.step), sz) ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } +}; +template::func_ptr func> struct NppSet +{ + typedef typename NPPTypeTraits::npp_type src_t; + + static void call(GpuMat& src, Scalar s) + { + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + Scalar_ nppS = s; + + nppSafeCall( func(nppS[0], src.ptr(), static_cast(src.step), sz) ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } +}; + +template struct NppSetMaskFunc +{ + typedef typename NPPTypeTraits::npp_type src_t; + + typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); +}; +template struct NppSetMaskFunc +{ + typedef typename NPPTypeTraits::npp_type src_t; + + typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); +}; + +template::func_ptr func> struct NppSetMask +{ + typedef typename NPPTypeTraits::npp_type src_t; + + static void call(GpuMat& src, Scalar s, const GpuMat& mask) + { + NppiSize sz; + sz.width = 
src.cols; + sz.height = src.rows; + + Scalar_ nppS = s; + + nppSafeCall( func(nppS.val, src.ptr(), static_cast(src.step), sz, mask.ptr(), static_cast(mask.step)) ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } +}; +template::func_ptr func> struct NppSetMask +{ + typedef typename NPPTypeTraits::npp_type src_t; + + static void call(GpuMat& src, Scalar s, const GpuMat& mask) + { + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + Scalar_ nppS = s; + + nppSafeCall( func(nppS[0], src.ptr(), static_cast(src.step), sz, mask.ptr(), static_cast(mask.step)) ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } +}; + +////////////////////////////////////////////////////////////////////////// +// CopyMasked + +template struct NppCopyMaskedFunc +{ + typedef typename NPPTypeTraits::npp_type src_t; + + typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, src_t* pDst, int nDstStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); +}; + +template::func_ptr func> struct NppCopyMasked +{ + typedef typename NPPTypeTraits::npp_type src_t; + + static void call(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t /*stream*/) + { + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz, mask.ptr(), static_cast(mask.step)) ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } +}; + +template static inline bool isAligned(const T* ptr, size_t size) +{ + return reinterpret_cast(ptr) % size == 0; +} + +namespace cv { namespace gpu { namespace device +{ + void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream = 0); + void convertTo(const GpuMat& src, GpuMat& dst); + void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0); + void setTo(GpuMat& src, Scalar s, cudaStream_t stream); + void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream); + void setTo(GpuMat& src, Scalar s); + void setTo(GpuMat& src, Scalar s, const GpuMat& mask); + + void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream) + { + CV_Assert(src.size() == dst.size() && src.type() == dst.type()); + CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels())); + + cv::gpu::device::copyToWithMask_gpu(src.reshape(1), dst.reshape(1), src.elemSize1(), src.channels(), mask.reshape(1), mask.channels() != 1, stream); } - template void kernelSetCaller(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream) + void convertTo(const GpuMat& src, GpuMat& dst) { - Scalar_ sf = s; - cv::gpu::device::set_to_gpu(src, sf.val, mask, src.channels(), stream); + cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0, 0); } - template struct NPPTypeTraits; - template<> struct NPPTypeTraits { typedef Npp8u npp_type; }; - template<> struct NPPTypeTraits { typedef Npp8s npp_type; }; - template<> struct NPPTypeTraits { typedef Npp16u npp_type; }; - template<> struct NPPTypeTraits { typedef Npp16s npp_type; }; - template<> struct NPPTypeTraits { typedef Npp32s npp_type; }; - template<> struct NPPTypeTraits { typedef Npp32f npp_type; }; - template<> struct NPPTypeTraits { typedef Npp64f npp_type; }; - - ////////////////////////////////////////////////////////////////////////// - // Convert - - template struct NppConvertFunc + void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, 
cudaStream_t stream) { - typedef typename NPPTypeTraits::npp_type src_t; - typedef typename NPPTypeTraits::npp_type dst_t; - - typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI); - }; - template struct NppConvertFunc - { - typedef typename NPPTypeTraits::npp_type dst_t; - - typedef NppStatus (*func_ptr)(const Npp32f* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI, NppRoundMode eRoundMode); - }; - - template::func_ptr func> struct NppCvt - { - typedef typename NPPTypeTraits::npp_type src_t; - typedef typename NPPTypeTraits::npp_type dst_t; - - static void call(const GpuMat& src, GpuMat& dst) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - - template::func_ptr func> struct NppCvt - { - typedef typename NPPTypeTraits::npp_type dst_t; - - static void call(const GpuMat& src, GpuMat& dst) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz, NPP_RND_NEAR) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - - ////////////////////////////////////////////////////////////////////////// - // Set - - template struct NppSetFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - - typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI); - }; - template struct NppSetFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - - typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI); - }; - template struct NppSetFunc - { - typedef NppStatus (*func_ptr)(Npp8s values[], Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI); - }; - template<> struct NppSetFunc - { - typedef NppStatus (*func_ptr)(Npp8s val, Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI); - }; - - template::func_ptr func> struct NppSet - { - typedef typename NPPTypeTraits::npp_type src_t; - - static void call(GpuMat& src, Scalar s) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - Scalar_ nppS = s; - - nppSafeCall( func(nppS.val, src.ptr(), static_cast(src.step), sz) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - template::func_ptr func> struct NppSet - { - typedef typename NPPTypeTraits::npp_type src_t; - - static void call(GpuMat& src, Scalar s) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - Scalar_ nppS = s; - - nppSafeCall( func(nppS[0], src.ptr(), static_cast(src.step), sz) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - - template struct NppSetMaskFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - - typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); - }; - template struct NppSetMaskFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - - typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); - }; - - template::func_ptr func> struct NppSetMask - { - typedef typename NPPTypeTraits::npp_type src_t; - - static void call(GpuMat& src, Scalar s, const GpuMat& mask) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - Scalar_ nppS = s; - - nppSafeCall( func(nppS.val, src.ptr(), static_cast(src.step), sz, mask.ptr(), static_cast(mask.step)) ); - - cudaSafeCall( 
cudaDeviceSynchronize() ); - } - }; - template::func_ptr func> struct NppSetMask - { - typedef typename NPPTypeTraits::npp_type src_t; - - static void call(GpuMat& src, Scalar s, const GpuMat& mask) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - Scalar_ nppS = s; - - nppSafeCall( func(nppS[0], src.ptr(), static_cast(src.step), sz, mask.ptr(), static_cast(mask.step)) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - - ////////////////////////////////////////////////////////////////////////// - // CopyMasked - - template struct NppCopyMaskedFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - - typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, src_t* pDst, int nDstStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); - }; - - template::func_ptr func> struct NppCopyMasked - { - typedef typename NPPTypeTraits::npp_type src_t; - - static void call(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t /*stream*/) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz, mask.ptr(), static_cast(mask.step)) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - - template static inline bool isAligned(const T* ptr, size_t size) - { - return reinterpret_cast(ptr) % size == 0; + cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), alpha, beta, stream); } - namespace cv { namespace gpu { namespace device + void setTo(GpuMat& src, Scalar s, cudaStream_t stream) { - void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream = 0) + typedef void (*caller_t)(GpuMat& src, Scalar s, cudaStream_t stream); + + static const caller_t callers[] = { - CV_Assert(src.size() == dst.size() && src.type() == dst.type()); - CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels())); + kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, + kernelSetCaller, kernelSetCaller + }; - cv::gpu::device::copyToWithMask_gpu(src.reshape(1), dst.reshape(1), src.elemSize1(), src.channels(), mask.reshape(1), mask.channels() != 1, stream); - } + callers[src.depth()](src, s, stream); + } - void convertTo(const GpuMat& src, GpuMat& dst) - { - cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0, 0); - } - - void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0) - { - cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), alpha, beta, stream); - } - - void setTo(GpuMat& src, Scalar s, cudaStream_t stream) - { - typedef void (*caller_t)(GpuMat& src, Scalar s, cudaStream_t stream); - - static const caller_t callers[] = - { - kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, - kernelSetCaller, kernelSetCaller - }; - - callers[src.depth()](src, s, stream); - } - - void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream) - { - typedef void (*caller_t)(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream); - - static const caller_t callers[] = - { - kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, - kernelSetCaller, kernelSetCaller - }; - - callers[src.depth()](src, s, mask, stream); - } - - void setTo(GpuMat& src, Scalar s) - { - setTo(src, s, 0); - } - - void setTo(GpuMat& src, Scalar s, const 
GpuMat& mask) - { - setTo(src, s, mask, 0); - } - }}} - - - class CudaArch + void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream) { - public: - CudaArch() + typedef void (*caller_t)(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream); + + static const caller_t callers[] = { - fromStr(CUDA_ARCH_BIN, bin); - fromStr(CUDA_ARCH_PTX, ptx); - fromStr(CUDA_ARCH_FEATURES, features); - } + kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, + kernelSetCaller, kernelSetCaller + }; - bool builtWith(FeatureSet feature_set) const - { - return !features.empty() && (features.back() >= feature_set); - } + callers[src.depth()](src, s, mask, stream); + } - bool hasPtx(int major, int minor) const - { - return find(ptx.begin(), ptx.end(), major * 10 + minor) != ptx.end(); - } - - bool hasBin(int major, int minor) const - { - return find(bin.begin(), bin.end(), major * 10 + minor) != bin.end(); - } - - bool hasEqualOrLessPtx(int major, int minor) const - { - return !ptx.empty() && (ptx.front() <= major * 10 + minor); - } - - bool hasEqualOrGreaterPtx(int major, int minor) const - { - return !ptx.empty() && (ptx.back() >= major * 10 + minor); - } - - bool hasEqualOrGreaterBin(int major, int minor) const - { - return !bin.empty() && (bin.back() >= major * 10 + minor); - } - - - private: - void fromStr(const string& set_as_str, vector& arr) - { - if (set_as_str.find_first_not_of(" ") == string::npos) - return; - - istringstream stream(set_as_str); - int cur_value; - - while (!stream.eof()) - { - stream >> cur_value; - arr.push_back(cur_value); - } - - sort(arr.begin(), arr.end()); - } - - vector bin; - vector ptx; - vector features; - }; - - class DeviceProps + void setTo(GpuMat& src, Scalar s) { - public: - DeviceProps() - { - props_.resize(10, 0); - } + setTo(src, s, 0); + } - ~DeviceProps() - { - for (size_t i = 0; i < props_.size(); ++i) - { - if (props_[i]) - delete props_[i]; - } - props_.clear(); - } - - cudaDeviceProp* get(int devID) - { - if (devID >= (int) props_.size()) - props_.resize(devID + 5, 0); - - if (!props_[devID]) - { - props_[devID] = new cudaDeviceProp; - cudaSafeCall( cudaGetDeviceProperties(props_[devID], devID) ); - } - - return props_[devID]; - } - private: - std::vector props_; - }; - - DeviceProps deviceProps; - - class CudaDeviceInfoFuncTable: DeviceInfoFuncTable + void setTo(GpuMat& src, Scalar s, const GpuMat& mask) { - public: - size_t sharedMemPerBlock() const + setTo(src, s, mask, 0); + } +}}} + +class CudaArch +{ +public: + CudaArch() + { + fromStr(CUDA_ARCH_BIN, bin); + fromStr(CUDA_ARCH_PTX, ptx); + fromStr(CUDA_ARCH_FEATURES, features); + } + + bool builtWith(FeatureSet feature_set) const + { + return !features.empty() && (features.back() >= feature_set); + } + + bool hasPtx(int major, int minor) const + { + return find(ptx.begin(), ptx.end(), major * 10 + minor) != ptx.end(); + } + + bool hasBin(int major, int minor) const + { + return find(bin.begin(), bin.end(), major * 10 + minor) != bin.end(); + } + + bool hasEqualOrLessPtx(int major, int minor) const + { + return !ptx.empty() && (ptx.front() <= major * 10 + minor); + } + + bool hasEqualOrGreaterPtx(int major, int minor) const + { + return !ptx.empty() && (ptx.back() >= major * 10 + minor); + } + + bool hasEqualOrGreaterBin(int major, int minor) const + { + return !bin.empty() && (bin.back() >= major * 10 + minor); + } + + +private: + void fromStr(const string& set_as_str, vector& arr) + { + if (set_as_str.find_first_not_of(" ") == string::npos) + 
return; + + istringstream stream(set_as_str); + int cur_value; + + while (!stream.eof()) { - return deviceProps.get(device_id_)->sharedMemPerBlock; + stream >> cur_value; + arr.push_back(cur_value); } - void queryMemory(size_t& _totalMemory, size_t& _freeMemory) const + sort(arr.begin(), arr.end()); + } + + vector bin; + vector ptx; + vector features; +}; + +class DeviceProps +{ +public: + DeviceProps() + { + props_.resize(10, 0); + } + + ~DeviceProps() + { + for (size_t i = 0; i < props_.size(); ++i) { - int prevDeviceID = getDevice(); - if (prevDeviceID != device_id_) - setDevice(device_id_); + if (props_[i]) + delete props_[i]; + } + props_.clear(); + } - cudaSafeCall( cudaMemGetInfo(&_freeMemory, &_totalMemory) ); + cudaDeviceProp* get(int devID) + { + if (devID >= (int) props_.size()) + props_.resize(devID + 5, 0); - if (prevDeviceID != device_id_) - setDevice(prevDeviceID); + if (!props_[devID]) + { + props_[devID] = new cudaDeviceProp; + cudaSafeCall( cudaGetDeviceProperties(props_[devID], devID) ); } - size_t freeMemory() const - { - size_t _totalMemory, _freeMemory; - queryMemory(_totalMemory, _freeMemory); - return _freeMemory; - } + return props_[devID]; + } +private: + std::vector props_; +}; - size_t totalMemory() const - { - size_t _totalMemory, _freeMemory; - queryMemory(_totalMemory, _freeMemory); - return _totalMemory; - } +DeviceProps deviceProps; - bool supports(FeatureSet feature_set) const - { - int version = majorVersion_ * 10 + minorVersion_; - return version >= feature_set; - } +class CudaDeviceInfoFuncTable: DeviceInfoFuncTable +{ +public: + size_t sharedMemPerBlock() const + { + return deviceProps.get(device_id_)->sharedMemPerBlock; + } - bool isCompatible() const - { - // Check PTX compatibility - if (hasEqualOrLessPtx(majorVersion_, minorVersion_)) - return true; + void queryMemory(size_t& _totalMemory, size_t& _freeMemory) const + { + int prevDeviceID = getDevice(); + if (prevDeviceID != device_id_) + setDevice(device_id_); - // Check BIN compatibility - for (int i = minorVersion_; i >= 0; --i) - if (hasBin(majorVersion_, i)) - return true; + cudaSafeCall( cudaMemGetInfo(&_freeMemory, &_totalMemory) ); - return false; - } + if (prevDeviceID != device_id_) + setDevice(prevDeviceID); + } - void query() - { - const cudaDeviceProp* prop = deviceProps.get(device_id_); + size_t freeMemory() const + { + size_t _totalMemory, _freeMemory; + queryMemory(_totalMemory, _freeMemory); + return _freeMemory; + } - name_ = prop->name; - multi_processor_count_ = prop->multiProcessorCount; - majorVersion_ = prop->major; - minorVersion_ = prop->minor; - } + size_t totalMemory() const + { + size_t _totalMemory, _freeMemory; + queryMemory(_totalMemory, _freeMemory); + return _totalMemory; + } - int deviceID() const - { - return device_id_; - } + bool supports(FeatureSet feature_set) const + { + int version = majorVersion_ * 10 + minorVersion_; + return version >= feature_set; + } - std::string name() const - { - return name_; - } + bool isCompatible() const + { + // Check PTX compatibility + if (hasEqualOrLessPtx(majorVersion_, minorVersion_)) + return true; - int majorVersion() const - { - return majorVersion_; - } + // Check BIN compatibility + for (int i = minorVersion_; i >= 0; --i) + if (hasBin(majorVersion_, i)) + return true; - int minorVersion() const - { - return minorVersion_; - } + return false; + } - int multiProcessorCount() const - { - return multi_processor_count_; - } + void query() + { + const cudaDeviceProp* prop = deviceProps.get(device_id_); - int 
getCudaEnabledDeviceCount() const - { - int count; - cudaError_t error = cudaGetDeviceCount( &count ); + name_ = prop->name; + multi_processor_count_ = prop->multiProcessorCount; + majorVersion_ = prop->major; + minorVersion_ = prop->minor; + } - if (error == cudaErrorInsufficientDriver) - return -1; + int deviceID() const + { + return device_id_; + } - if (error == cudaErrorNoDevice) - return 0; + std::string name() const + { + return name_; + } - cudaSafeCall( error ); - return count; - } + int majorVersion() const + { + return majorVersion_; + } - void setDevice(int device) const - { - cudaSafeCall( cudaSetDevice( device ) ); - } + int minorVersion() const + { + return minorVersion_; + } - int getDevice() const - { - int device; - cudaSafeCall( cudaGetDevice( &device ) ); - return device; - } + int multiProcessorCount() const + { + return multi_processor_count_; + } - void resetDevice() const - { - cudaSafeCall( cudaDeviceReset() ); - } - - bool builtWith(FeatureSet feature_set) const - { - return cudaArch.builtWith(feature_set); - } - - bool has(int major, int minor) const - { - return hasPtx(major, minor) || hasBin(major, minor); - } - - bool hasPtx(int major, int minor) const - { - return cudaArch.hasPtx(major, minor); - } - - bool hasBin(int major, int minor) const - { - return cudaArch.hasBin(major, minor); - } - - bool hasEqualOrLessPtx(int major, int minor) const - { - return cudaArch.hasEqualOrLessPtx(major, minor); - } - - bool hasEqualOrGreater(int major, int minor) const - { - return hasEqualOrGreaterPtx(major, minor) || hasEqualOrGreaterBin(major, minor); - } - - bool hasEqualOrGreaterPtx(int major, int minor) const - { - return cudaArch.hasEqualOrGreaterPtx(major, minor); - } - - bool hasEqualOrGreaterBin(int major, int minor) const - { - return cudaArch.hasEqualOrGreaterBin(major, minor); - } - - bool deviceSupports(FeatureSet feature_set) const - { - static int versions[] = - { - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 - }; - static const int cache_size = static_cast(sizeof(versions) / sizeof(versions[0])); - - const int devId = getDevice(); - - int version; - - if (devId < cache_size && versions[devId] >= 0) - version = versions[devId]; - else - { - DeviceInfo dev(devId); - version = dev.majorVersion() * 10 + dev.minorVersion(); - if (devId < cache_size) - versions[devId] = version; - } - - return TargetArchs::builtWith(feature_set) && (version >= feature_set); - } - - void printCudaDeviceInfo(int device) const - { - int count = getCudaEnabledDeviceCount(); - bool valid = (device >= 0) && (device < count); - - int beg = valid ? device : 0; - int end = valid ? 
device+1 : count; - - printf("*** CUDA Device Query (Runtime API) version (CUDART static linking) *** \n\n"); - printf("Device count: %d\n", count); - - int driverVersion = 0, runtimeVersion = 0; - cudaSafeCall( cudaDriverGetVersion(&driverVersion) ); - cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) ); - - const char *computeMode[] = { - "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)", - "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)", - "Prohibited (no host thread can use ::cudaSetDevice() with this device)", - "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)", - "Unknown", - NULL - }; - - for(int dev = beg; dev < end; ++dev) - { - cudaDeviceProp prop; - cudaSafeCall( cudaGetDeviceProperties(&prop, dev) ); - - printf("\nDevice %d: \"%s\"\n", dev, prop.name); - printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100); - printf(" CUDA Capability Major/Minor version number: %d.%d\n", prop.major, prop.minor); - printf(" Total amount of global memory: %.0f MBytes (%llu bytes)\n", (float)prop.totalGlobalMem/1048576.0f, (unsigned long long) prop.totalGlobalMem); - - int cores = convertSMVer2Cores(prop.major, prop.minor); - if (cores > 0) - printf(" (%2d) Multiprocessors x (%2d) CUDA Cores/MP: %d CUDA Cores\n", prop.multiProcessorCount, cores, cores * prop.multiProcessorCount); - - printf(" GPU Clock Speed: %.2f GHz\n", prop.clockRate * 1e-6f); - - printf(" Max Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n", - prop.maxTexture1D, prop.maxTexture2D[0], prop.maxTexture2D[1], - prop.maxTexture3D[0], prop.maxTexture3D[1], prop.maxTexture3D[2]); - printf(" Max Layered Texture Size (dim) x layers 1D=(%d) x %d, 2D=(%d,%d) x %d\n", - prop.maxTexture1DLayered[0], prop.maxTexture1DLayered[1], - prop.maxTexture2DLayered[0], prop.maxTexture2DLayered[1], prop.maxTexture2DLayered[2]); - - printf(" Total amount of constant memory: %u bytes\n", (int)prop.totalConstMem); - printf(" Total amount of shared memory per block: %u bytes\n", (int)prop.sharedMemPerBlock); - printf(" Total number of registers available per block: %d\n", prop.regsPerBlock); - printf(" Warp size: %d\n", prop.warpSize); - printf(" Maximum number of threads per block: %d\n", prop.maxThreadsPerBlock); - printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]); - printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]); - printf(" Maximum memory pitch: %u bytes\n", (int)prop.memPitch); - printf(" Texture alignment: %u bytes\n", (int)prop.textureAlignment); - - printf(" Concurrent copy and execution: %s with %d copy engine(s)\n", (prop.deviceOverlap ? "Yes" : "No"), prop.asyncEngineCount); - printf(" Run time limit on kernels: %s\n", prop.kernelExecTimeoutEnabled ? "Yes" : "No"); - printf(" Integrated GPU sharing Host Memory: %s\n", prop.integrated ? "Yes" : "No"); - printf(" Support host page-locked memory mapping: %s\n", prop.canMapHostMemory ? "Yes" : "No"); - - printf(" Concurrent kernel execution: %s\n", prop.concurrentKernels ? "Yes" : "No"); - printf(" Alignment requirement for Surfaces: %s\n", prop.surfaceAlignment ? "Yes" : "No"); - printf(" Device has ECC support enabled: %s\n", prop.ECCEnabled ? 
"Yes" : "No"); - printf(" Device is using TCC driver mode: %s\n", prop.tccDriver ? "Yes" : "No"); - printf(" Device supports Unified Addressing (UVA): %s\n", prop.unifiedAddressing ? "Yes" : "No"); - printf(" Device PCI Bus ID / PCI location ID: %d / %d\n", prop.pciBusID, prop.pciDeviceID ); - printf(" Compute Mode:\n"); - printf(" %s \n", computeMode[prop.computeMode]); - } - - printf("\n"); - printf("deviceQuery, CUDA Driver = CUDART"); - printf(", CUDA Driver Version = %d.%d", driverVersion / 1000, driverVersion % 100); - printf(", CUDA Runtime Version = %d.%d", runtimeVersion/1000, runtimeVersion%100); - printf(", NumDevs = %d\n\n", count); - fflush(stdout); - } - - void printShortCudaDeviceInfo(int device) const - { - int count = getCudaEnabledDeviceCount(); - bool valid = (device >= 0) && (device < count); - - int beg = valid ? device : 0; - int end = valid ? device+1 : count; - - int driverVersion = 0, runtimeVersion = 0; - cudaSafeCall( cudaDriverGetVersion(&driverVersion) ); - cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) ); - - for(int dev = beg; dev < end; ++dev) - { - cudaDeviceProp prop; - cudaSafeCall( cudaGetDeviceProperties(&prop, dev) ); - - const char *arch_str = prop.major < 2 ? " (not Fermi)" : ""; - printf("Device %d: \"%s\" %.0fMb", dev, prop.name, (float)prop.totalGlobalMem/1048576.0f); - printf(", sm_%d%d%s", prop.major, prop.minor, arch_str); - - int cores = convertSMVer2Cores(prop.major, prop.minor); - if (cores > 0) - printf(", %d cores", cores * prop.multiProcessorCount); - - printf(", Driver/Runtime ver.%d.%d/%d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100); - } - fflush(stdout); - } - - private: - int device_id_; - - std::string name_; - int multi_processor_count_; - int majorVersion_; - int minorVersion_; - - const CudaArch cudaArch; - - int convertSMVer2Cores(int major, int minor) const - { - // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM - typedef struct { - int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version - int Cores; - } SMtoCores; - - SMtoCores gpuArchCoresPerSM[] = { { 0x10, 8 }, { 0x11, 8 }, { 0x12, 8 }, { 0x13, 8 }, { 0x20, 32 }, { 0x21, 48 }, {0x30, 192}, {0x35, 192}, { -1, -1 } }; - - int index = 0; - while (gpuArchCoresPerSM[index].SM != -1) - { - if (gpuArchCoresPerSM[index].SM == ((major << 4) + minor) ) - return gpuArchCoresPerSM[index].Cores; - index++; - } + int getCudaEnabledDeviceCount() const + { + int count; + cudaError_t error = cudaGetDeviceCount( &count ); + if (error == cudaErrorInsufficientDriver) return -1; - } - }; - class CudaFuncTable : public GpuFuncTable + if (error == cudaErrorNoDevice) + return 0; + + cudaSafeCall( error ); + return count; + } + + void setDevice(int device) const { - public: + cudaSafeCall( cudaSetDevice( device ) ); + } - void copy(const Mat& src, GpuMat& dst) const + int getDevice() const + { + int device; + cudaSafeCall( cudaGetDevice( &device ) ); + return device; + } + + void resetDevice() const + { + cudaSafeCall( cudaDeviceReset() ); + } + + bool builtWith(FeatureSet feature_set) const + { + return cudaArch.builtWith(feature_set); + } + + bool has(int major, int minor) const + { + return hasPtx(major, minor) || hasBin(major, minor); + } + + bool hasPtx(int major, int minor) const + { + return cudaArch.hasPtx(major, minor); + } + + bool hasBin(int major, int minor) const + { + return cudaArch.hasBin(major, minor); + } + + bool hasEqualOrLessPtx(int major, int minor) const 
+ { + return cudaArch.hasEqualOrLessPtx(major, minor); + } + + bool hasEqualOrGreater(int major, int minor) const + { + return hasEqualOrGreaterPtx(major, minor) || hasEqualOrGreaterBin(major, minor); + } + + bool hasEqualOrGreaterPtx(int major, int minor) const + { + return cudaArch.hasEqualOrGreaterPtx(major, minor); + } + + bool hasEqualOrGreaterBin(int major, int minor) const + { + return cudaArch.hasEqualOrGreaterBin(major, minor); + } + + bool deviceSupports(FeatureSet feature_set) const + { + static int versions[] = { - cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyHostToDevice) ); - } - void copy(const GpuMat& src, Mat& dst) const + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 + }; + static const int cache_size = static_cast(sizeof(versions) / sizeof(versions[0])); + + const int devId = getDevice(); + + int version; + + if (devId < cache_size && versions[devId] >= 0) + version = versions[devId]; + else { - cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToHost) ); - } - void copy(const GpuMat& src, GpuMat& dst) const - { - cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToDevice) ); + DeviceInfo dev(devId); + version = dev.majorVersion() * 10 + dev.minorVersion(); + if (devId < cache_size) + versions[devId] = version; } - void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const - { - CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); - CV_Assert(src.size() == dst.size() && src.type() == dst.type()); - CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels())); + return TargetArchs::builtWith(feature_set) && (version >= feature_set); + } - if (src.depth() == CV_64F) + void printCudaDeviceInfo(int device) const + { + int count = getCudaEnabledDeviceCount(); + bool valid = (device >= 0) && (device < count); + + int beg = valid ? device : 0; + int end = valid ? 
device+1 : count; + + printf("*** CUDA Device Query (Runtime API) version (CUDART static linking) *** \n\n"); + printf("Device count: %d\n", count); + + int driverVersion = 0, runtimeVersion = 0; + cudaSafeCall( cudaDriverGetVersion(&driverVersion) ); + cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) ); + + const char *computeMode[] = { + "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)", + "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)", + "Prohibited (no host thread can use ::cudaSetDevice() with this device)", + "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)", + "Unknown", + NULL + }; + + for(int dev = beg; dev < end; ++dev) + { + cudaDeviceProp prop; + cudaSafeCall( cudaGetDeviceProperties(&prop, dev) ); + + printf("\nDevice %d: \"%s\"\n", dev, prop.name); + printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100); + printf(" CUDA Capability Major/Minor version number: %d.%d\n", prop.major, prop.minor); + printf(" Total amount of global memory: %.0f MBytes (%llu bytes)\n", (float)prop.totalGlobalMem/1048576.0f, (unsigned long long) prop.totalGlobalMem); + + int cores = convertSMVer2Cores(prop.major, prop.minor); + if (cores > 0) + printf(" (%2d) Multiprocessors x (%2d) CUDA Cores/MP: %d CUDA Cores\n", prop.multiProcessorCount, cores, cores * prop.multiProcessorCount); + + printf(" GPU Clock Speed: %.2f GHz\n", prop.clockRate * 1e-6f); + + printf(" Max Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n", + prop.maxTexture1D, prop.maxTexture2D[0], prop.maxTexture2D[1], + prop.maxTexture3D[0], prop.maxTexture3D[1], prop.maxTexture3D[2]); + printf(" Max Layered Texture Size (dim) x layers 1D=(%d) x %d, 2D=(%d,%d) x %d\n", + prop.maxTexture1DLayered[0], prop.maxTexture1DLayered[1], + prop.maxTexture2DLayered[0], prop.maxTexture2DLayered[1], prop.maxTexture2DLayered[2]); + + printf(" Total amount of constant memory: %u bytes\n", (int)prop.totalConstMem); + printf(" Total amount of shared memory per block: %u bytes\n", (int)prop.sharedMemPerBlock); + printf(" Total number of registers available per block: %d\n", prop.regsPerBlock); + printf(" Warp size: %d\n", prop.warpSize); + printf(" Maximum number of threads per block: %d\n", prop.maxThreadsPerBlock); + printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]); + printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]); + printf(" Maximum memory pitch: %u bytes\n", (int)prop.memPitch); + printf(" Texture alignment: %u bytes\n", (int)prop.textureAlignment); + + printf(" Concurrent copy and execution: %s with %d copy engine(s)\n", (prop.deviceOverlap ? "Yes" : "No"), prop.asyncEngineCount); + printf(" Run time limit on kernels: %s\n", prop.kernelExecTimeoutEnabled ? "Yes" : "No"); + printf(" Integrated GPU sharing Host Memory: %s\n", prop.integrated ? "Yes" : "No"); + printf(" Support host page-locked memory mapping: %s\n", prop.canMapHostMemory ? "Yes" : "No"); + + printf(" Concurrent kernel execution: %s\n", prop.concurrentKernels ? "Yes" : "No"); + printf(" Alignment requirement for Surfaces: %s\n", prop.surfaceAlignment ? "Yes" : "No"); + printf(" Device has ECC support enabled: %s\n", prop.ECCEnabled ? 
"Yes" : "No"); + printf(" Device is using TCC driver mode: %s\n", prop.tccDriver ? "Yes" : "No"); + printf(" Device supports Unified Addressing (UVA): %s\n", prop.unifiedAddressing ? "Yes" : "No"); + printf(" Device PCI Bus ID / PCI location ID: %d / %d\n", prop.pciBusID, prop.pciDeviceID ); + printf(" Compute Mode:\n"); + printf(" %s \n", computeMode[prop.computeMode]); + } + + printf("\n"); + printf("deviceQuery, CUDA Driver = CUDART"); + printf(", CUDA Driver Version = %d.%d", driverVersion / 1000, driverVersion % 100); + printf(", CUDA Runtime Version = %d.%d", runtimeVersion/1000, runtimeVersion%100); + printf(", NumDevs = %d\n\n", count); + fflush(stdout); + } + + void printShortCudaDeviceInfo(int device) const + { + int count = getCudaEnabledDeviceCount(); + bool valid = (device >= 0) && (device < count); + + int beg = valid ? device : 0; + int end = valid ? device+1 : count; + + int driverVersion = 0, runtimeVersion = 0; + cudaSafeCall( cudaDriverGetVersion(&driverVersion) ); + cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) ); + + for(int dev = beg; dev < end; ++dev) + { + cudaDeviceProp prop; + cudaSafeCall( cudaGetDeviceProperties(&prop, dev) ); + + const char *arch_str = prop.major < 2 ? " (not Fermi)" : ""; + printf("Device %d: \"%s\" %.0fMb", dev, prop.name, (float)prop.totalGlobalMem/1048576.0f); + printf(", sm_%d%d%s", prop.major, prop.minor, arch_str); + + int cores = convertSMVer2Cores(prop.major, prop.minor); + if (cores > 0) + printf(", %d cores", cores * prop.multiProcessorCount); + + printf(", Driver/Runtime ver.%d.%d/%d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100); + } + fflush(stdout); + } + +private: + int device_id_; + + std::string name_; + int multi_processor_count_; + int majorVersion_; + int minorVersion_; + + const CudaArch cudaArch; + + int convertSMVer2Cores(int major, int minor) const + { + // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM + typedef struct { + int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version + int Cores; + } SMtoCores; + + SMtoCores gpuArchCoresPerSM[] = { { 0x10, 8 }, { 0x11, 8 }, { 0x12, 8 }, { 0x13, 8 }, { 0x20, 32 }, { 0x21, 48 }, {0x30, 192}, {0x35, 192}, { -1, -1 } }; + + int index = 0; + while (gpuArchCoresPerSM[index].SM != -1) + { + if (gpuArchCoresPerSM[index].SM == ((major << 4) + minor) ) + return gpuArchCoresPerSM[index].Cores; + index++; + } + + return -1; + } +}; + +class CudaFuncTable : public GpuFuncTable +{ +public: + + void copy(const Mat& src, GpuMat& dst) const + { + cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyHostToDevice) ); + } + + void copy(const GpuMat& src, Mat& dst) const + { + cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToHost) ); + } + + void copy(const GpuMat& src, GpuMat& dst) const + { + cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToDevice) ); + } + + void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const + { + CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); + CV_Assert(src.size() == dst.size() && src.type() == dst.type()); + CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels())); + + if (src.depth() == CV_64F) + { + if (!TargetArchs::builtWith(NATIVE_DOUBLE) || 
+                CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+        }
+
+        typedef void (*func_t)(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream);
+        static const func_t funcs[7][4] =
+        {
+            /* 8U */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call},
+            /* 8S */ {cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask},
+            /* 16U */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call},
+            /* 16S */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call},
+            /* 32S */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call},
+            /* 32F */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call},
+            /* 64F */ {cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask}
+        };
+
+        const func_t func = mask.channels() == src.channels() ? funcs[src.depth()][src.channels() - 1] : cv::gpu::device::copyWithMask;
+
+        func(src, dst, mask, 0);
+    }
+
+    void convert(const GpuMat& src, GpuMat& dst) const
+    {
+        typedef void (*func_t)(const GpuMat& src, GpuMat& dst);
+        // conversion dispatch table, indexed as [src depth][dst depth][channels - 1]
+        static const func_t funcs[7][7][4] =
+        {
+            {
-            if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
-                CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+                /* 8U -> 8U */ {0, 0, 0, 0},
+                /* 8U -> 8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 8U -> 16U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt::call},
+                /* 8U -> 16S */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt::call},
+                /* 8U -> 32S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 8U -> 32F */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 8U -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}
+            },
+            {
+                /* 8S -> 8U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 8S -> 8S */ {0,0,0,0},
+                /* 8S -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 8S -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 8S -> 32S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 8S -> 32F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 8S -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}
+            },
+            {
+                /* 16U -> 8U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt::call},
+                /* 16U -> 8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 16U -> 16U */ {0,0,0,0},
+                /* 16U -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 16U -> 32S */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 16U -> 32F */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 16U -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}
+            },
+            {
+                /* 16S -> 8U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt::call},
+                /* 16S -> 8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 16S -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 16S -> 16S */ {0,0,0,0},
+                /* 16S -> 32S */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 16S -> 32F */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 16S -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}
+            },
+            {
+                /* 32S -> 8U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 32S -> 8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 32S -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 32S -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 32S -> 32S */ {0,0,0,0},
+                /* 32S -> 32F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 32S -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}
+            },
+            {
+                /* 32F -> 8U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 32F -> 8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 32F -> 16U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 32F -> 16S */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 32F -> 32S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 32F -> 32F */ {0,0,0,0},
+                /* 32F -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}
+            },
+            {
+                /* 64F -> 8U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 64F -> 8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 64F -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 64F -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 64F -> 32S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 64F -> 32F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 64F -> 64F */ {0,0,0,0}
+            }
+        };
-            typedef void (*func_t)(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream);
-            static const func_t funcs[7][4] =
-            {
-                /* 8U */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call},
-                /* 8S */ {cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask},
-                /* 16U */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call},
-                /* 16S */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call},
-                /* 32S */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call},
-                /* 32F */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call},
-                /* 64F */ {cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask}
-            };
+        CV_Assert(src.depth() <= CV_64F && src.channels() <= 4);
+        CV_Assert(dst.depth() <= CV_64F);
+        CV_Assert(src.size() == dst.size() && src.channels() == dst.channels());
-            const func_t func = mask.channels() == src.channels() ? funcs[src.depth()][src.channels() - 1] : cv::gpu::device::copyWithMask;
-
-            func(src, dst, mask, 0);
+        if (src.depth() == CV_64F || dst.depth() == CV_64F)
+        {
+            if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+                CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
         }
-        void convert(const GpuMat& src, GpuMat& dst) const
+        bool aligned = isAligned(src.data, 16) && isAligned(dst.data, 16);
+        if (!aligned)
         {
-            typedef void (*func_t)(const GpuMat& src, GpuMat& dst);
-            static const func_t funcs[7][7][4] =
-            {
-                {
-                    /* 8U -> 8U */ {0, 0, 0, 0},
-                    /* 8U -> 8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 8U -> 16U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt::call},
-                    /* 8U -> 16S */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt::call},
-                    /* 8U -> 32S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 8U -> 32F */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 8U -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}
-                },
-                {
-                    /* 8S -> 8U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 8S -> 8S */ {0,0,0,0},
-                    /* 8S -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 8S -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 8S -> 32S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 8S -> 32F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 8S -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}
-                },
-                {
-                    /* 16U -> 8U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt::call},
-                    /* 16U -> 8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 16U -> 16U */ {0,0,0,0},
-                    /* 16U -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 16U -> 32S */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 16U -> 32F */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 16U -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}
-                },
-                {
-                    /* 16S -> 8U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt::call},
-                    /* 16S -> 8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 16S -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 16S -> 16S */ {0,0,0,0},
-                    /* 16S -> 32S */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 16S -> 32F */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 16S -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}
-                },
-                {
-                    /* 32S -> 8U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 32S -> 8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 32S -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 32S -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 32S -> 32S */ {0,0,0,0},
-                    /* 32S -> 32F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 32S -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}
-                },
-                {
-                    /* 32F -> 8U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 32F -> 8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 32F -> 16U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 32F -> 16S */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 32F -> 32S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 32F -> 32F */ {0,0,0,0},
-                    /* 32F -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}
-                },
-                {
-                    /* 64F -> 8U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 64F -> 8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 64F -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 64F -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 64F -> 32S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 64F -> 32F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 64F -> 64F */ {0,0,0,0}
-                }
-            };
-
-            CV_Assert(src.depth() <= CV_64F && src.channels() <= 4);
-            CV_Assert(dst.depth() <= CV_64F);
-            CV_Assert(src.size() == dst.size() && src.channels() == dst.channels());
-
-            if (src.depth() == CV_64F || dst.depth() == CV_64F)
-            {
-                if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
-                    CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
-            }
-
-            bool aligned = isAligned(src.data, 16) && isAligned(dst.data, 16);
-            if (!aligned)
-            {
-                cv::gpu::device::convertTo(src, dst);
-                return;
-            }
-
-            const func_t func = funcs[src.depth()][dst.depth()][src.channels() - 1];
-            CV_DbgAssert(func != 0);
-
-            func(src, dst);
-        }
-
-        void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream) const
-        {
-            CV_Assert(src.depth() <= CV_64F && src.channels() <= 4);
-            CV_Assert(dst.depth() <= CV_64F);
-
-            if (src.depth() == CV_64F || dst.depth() == CV_64F)
-            {
-                if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
-                    CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
-            }
-
-            cv::gpu::device::convertTo(src, dst, alpha, beta, stream);
-        }
-
-        void setTo(GpuMat& m, Scalar s, const GpuMat& mask, cudaStream_t stream) const
-        {
-            if (mask.empty())
-            {
-                if (s[0] == 0.0 && s[1] == 0.0 && s[2] == 0.0 && s[3] == 0.0)
-                {
-                    cudaSafeCall( cudaMemset2D(m.data, m.step, 0, m.cols * m.elemSize(), m.rows) );
-                    return;
-                }
-
-                if (m.depth() == CV_8U)
-                {
-                    int cn = m.channels();
-
-                    if (cn == 1 || (cn == 2 && s[0] == s[1]) || (cn == 3 && s[0] == s[1] && s[0] == s[2]) || (cn == 4 && s[0] == s[1] && s[0] == s[2] && s[0] == s[3]))
-                    {
-                        int val = saturate_cast<uchar>(s[0]);
-                        cudaSafeCall( cudaMemset2D(m.data, m.step, val, m.cols * m.elemSize(), m.rows) );
-                        return;
-                    }
-                }
-
-                typedef void (*func_t)(GpuMat& src, Scalar s);
-                static const func_t funcs[7][4] =
-                {
-                    {NppSet::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSet::call},
-                    {cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo},
-                    {NppSet::call, NppSet::call, cv::gpu::device::setTo, NppSet::call},
-                    {NppSet::call, NppSet::call, cv::gpu::device::setTo, NppSet::call},
-                    {NppSet::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSet::call},
-                    {NppSet::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSet::call},
-                    {cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo}
-                };
-
-                CV_Assert(m.depth() <= CV_64F && m.channels() <= 4);
-
-                if (m.depth() == CV_64F)
-                {
-                    if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
-                        CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
-                }
-
-                if (stream)
-                    cv::gpu::device::setTo(m, s, stream);
-                else
-                    funcs[m.depth()][m.channels() - 1](m, s);
-            }
-            else
-            {
-                typedef void (*func_t)(GpuMat& src, Scalar s, const GpuMat& mask);
-                static const func_t funcs[7][4] =
-                {
-                    {NppSetMask::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask::call},
-                    {cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo},
-                    {NppSetMask::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask::call},
-                    {NppSetMask::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask::call},
-                    {NppSetMask::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask::call},
-                    {NppSetMask::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask::call},
-                    {cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo}
-                };
-
-                CV_Assert(m.depth() <= CV_64F && m.channels() <= 4);
-
-                if (m.depth() == CV_64F)
-                {
-                    if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
-                        CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
-                }
-
-                if (stream)
-                    cv::gpu::device::setTo(m, s, mask, stream);
-                else
-                    funcs[m.depth()][m.channels() - 1](m, s, mask);
-            }
-        }
-
-        void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const
-        {
-            cudaSafeCall( cudaMallocPitch(devPtr, step, width, height) );
-        }
-
-        void free(void* devPtr) const
-        {
-            cudaFree(devPtr);
-        }
-    };
+            cv::gpu::device::convertTo(src, dst);
+            return;
+        }
+
+        const func_t func = funcs[src.depth()][dst.depth()][src.channels() - 1];
+        CV_DbgAssert(func != 0);
+
+        func(src, dst);
+    }
+
+    void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream) const
+    {
+        CV_Assert(src.depth() <= CV_64F && src.channels() <= 4);
+        CV_Assert(dst.depth() <= CV_64F);
+
+        if (src.depth() == CV_64F || dst.depth() == CV_64F)
+        {
+            if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+                CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+        }
+
+        cv::gpu::device::convertTo(src, dst, alpha, beta, stream);
+    }
+
+    void setTo(GpuMat& m, Scalar s, const GpuMat& mask, cudaStream_t stream) const
+    {
+        if (mask.empty())
+        {
+            if (s[0] == 0.0 && s[1] == 0.0 && s[2] == 0.0 && s[3] == 0.0)
+            {
+                cudaSafeCall( cudaMemset2D(m.data, m.step, 0, m.cols * m.elemSize(), m.rows) );
+                return;
+            }
+
+            if (m.depth() == CV_8U)
+            {
+                int cn = m.channels();
+
+                if (cn == 1 || (cn == 2 && s[0] == s[1]) || (cn == 3 && s[0] == s[1] && s[0] == s[2]) || (cn == 4 && s[0] == s[1] && s[0] == s[2] && s[0] == s[3]))
+                {
+                    int val = saturate_cast<uchar>(s[0]);
+                    cudaSafeCall( cudaMemset2D(m.data, m.step, val, m.cols * m.elemSize(), m.rows) );
+                    return;
+                }
+            }
+
+            typedef void (*func_t)(GpuMat& src, Scalar s);
+            static const func_t funcs[7][4] =
+            {
+                {NppSet::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSet::call},
+                {cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo},
+                {NppSet::call, NppSet::call, cv::gpu::device::setTo, NppSet::call},
+                {NppSet::call, NppSet::call, cv::gpu::device::setTo, NppSet::call},
+                {NppSet::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSet::call},
+                {NppSet::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSet::call},
+                {cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo}
+            };
+
+            CV_Assert(m.depth() <= CV_64F && m.channels() <= 4);
+
+            if (m.depth() == CV_64F)
+            {
+                if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+                    CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+            }
+
+            if (stream)
+                cv::gpu::device::setTo(m, s, stream);
+            else
+                funcs[m.depth()][m.channels() - 1](m, s);
+        }
+        else
+        {
+            typedef void (*func_t)(GpuMat& src, Scalar s, const GpuMat& mask);
+            static const func_t funcs[7][4] =
+            {
+                {NppSetMask::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask::call},
+                {cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo},
+                {NppSetMask::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask::call},
+                {NppSetMask::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask::call},
+                {NppSetMask::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask::call},
+                {NppSetMask::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask::call},
+                {cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo}
+            };
+
+            CV_Assert(m.depth() <= CV_64F && m.channels() <= 4);
+
+            if (m.depth() == CV_64F)
+            {
+                if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+                    CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+            }
+
+            if (stream)
+                cv::gpu::device::setTo(m, s, mask, stream);
+            else
+                funcs[m.depth()][m.channels() - 1](m, s, mask);
+        }
+    }
+
+    void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const
+    {
+        cudaSafeCall( cudaMallocPitch(devPtr, step, width, height) );
+    }
+
+    void free(void* devPtr) const
+    {
+        cudaFree(devPtr);
+    }
+};
 #endif
 #endif
\ No newline at end of file
diff --git a/modules/dynamicuda/src/main.cpp b/modules/dynamicuda/src/main.cpp
index 4a05d8696..8eb66fd98 100644
--- a/modules/dynamicuda/src/main.cpp
+++ b/modules/dynamicuda/src/main.cpp
@@ -39,6 +39,9 @@ static EmptyFuncTable gpuTable;
 
 extern "C" {
 
+DeviceInfoFuncTable* deviceInfoFactory();
+GpuFuncTable* gpuFactory();
+
 DeviceInfoFuncTable* deviceInfoFactory()
 {
     return (DeviceInfoFuncTable*)&deviceInfoTable;
diff --git a/modules/java/CMakeLists.txt b/modules/java/CMakeLists.txt
index 291295fb5..3a6ebe836 100644
--- a/modules/java/CMakeLists.txt
+++ b/modules/java/CMakeLists.txt
@@ -297,7 +297,7 @@ if(BUILD_FAT_JAVA_LIB)
       list(REMOVE_ITEM __deps ${m})
     endif()
   endforeach()
-  if (HAVE_opencv_dynamicuda)
+  if (ENABLE_DYNAMIC_CUDA)
     list(REMOVE_ITEM __deps "opencv_dynamicuda")
   endif()
   if (ANDROID AND HAVE_opencv_gpu)
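
Note on how the two extern "C" factories in modules/dynamicuda/src/main.cpp are meant
to be consumed: with ENABLE_DYNAMIC_CUDA=ON, opencv_core is compiled with
DYNAMIC_CUDA_SUPPORT and without a static CUDA dependency, so it has to locate the
SHARED opencv_dynamicuda module at runtime. The loader below is only an illustrative
sketch, not code from this patch; the library name "libopencv_dynamicuda.so" and the
dlopen()/dlsym()-based lookup are assumptions about the Linux/Android case.

    // Hypothetical loader sketch (not part of this patch): resolves the
    // gpuFactory symbol exported by modules/dynamicuda/src/main.cpp.
    #include <dlfcn.h>

    static GpuFuncTable* loadGpuFuncTable()
    {
        // Assumed library name: the module is declared SHARED, so a Linux
        // build should produce libopencv_dynamicuda.so.
        void* handle = dlopen("libopencv_dynamicuda.so", RTLD_LAZY);
        if (!handle)
            return 0; // caller falls back to a CPU-only function table

        typedef GpuFuncTable* (*GpuFactory)();
        GpuFactory factory = (GpuFactory)dlsym(handle, "gpuFactory");
        return factory ? factory() : 0;
    }

The deviceInfoFactory symbol would be resolved the same way to obtain the
DeviceInfoFuncTable implementation.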