From 5a5c82bb1d395aeb76bd76f14a1db22742c02599 Mon Sep 17 00:00:00 2001
From: Alexander Smorkalov
Date: Thu, 19 Dec 2013 17:41:04 +0400
Subject: [PATCH] Additional ENABLE_DYNAMIC_CUDA option implemented in cmake.
 Warning fixes and refactoring.

---
 CMakeLists.txt                                 |    1 +
 modules/core/CMakeLists.txt                    |   14 +-
 modules/dynamicuda/CMakeLists.txt              |    1 +
 .../include/opencv2/dynamicuda/dynamicuda.hpp  | 1899 +++++++++--
 modules/dynamicuda/src/main.cpp                |    3 +
 modules/java/CMakeLists.txt                    |    2 +-
 6 files changed, 969 insertions(+), 951 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index cf25084bc..2c5165c1e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -201,6 +201,7 @@ OCV_OPTION(INSTALL_TO_MANGLED_PATHS "Enables mangled install paths, that help wi
 
 # OpenCV build options
 # ===================================================
+OCV_OPTION(ENABLE_DYNAMIC_CUDA "Enable dynamic CUDA linkage" ON IF ANDROID OR LINUX)
 OCV_OPTION(ENABLE_PRECOMPILED_HEADERS "Use precompiled headers" ON IF (NOT IOS) )
 OCV_OPTION(ENABLE_SOLUTION_FOLDERS "Solution folder in Visual Studio or in other IDEs" (MSVC_IDE OR CMAKE_GENERATOR MATCHES Xcode) IF (CMAKE_VERSION VERSION_GREATER "2.8.0") )
 OCV_OPTION(ENABLE_PROFILING "Enable profiling in the GCC compiler (Add flags: -g -pg)" OFF IF CMAKE_COMPILER_IS_GNUCXX )
diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt
index e89d6f276..f20e32d3a 100644
--- a/modules/core/CMakeLists.txt
+++ b/modules/core/CMakeLists.txt
@@ -1,8 +1,12 @@
 set(the_description "The Core Functionality")
 
-if (HAVE_opencv_dynamicuda)
+message(STATUS "ENABLE_DYNAMIC_CUDA ${ENABLE_DYNAMIC_CUDA}")
+
+if (ENABLE_DYNAMIC_CUDA)
+  message(STATUS "Using dynamic CUDA approach")
   ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES})
 else()
+  message(STATUS "Linking CUDA statically")
   ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES} ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})
 endif()
 
@@ -12,7 +16,7 @@ if(HAVE_WINRT)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /ZW /GS /Gm- /AI\"${WINDOWS_SDK_PATH}/References/CommonConfiguration/Neutral\" /AI\"${VISUAL_STUDIO_PATH}/vcpackages\"")
 endif()
 
-if(HAVE_opencv_dynamicuda)
+if(ENABLE_DYNAMIC_CUDA)
   add_definitions(-DDYNAMIC_CUDA_SUPPORT)
 else()
   add_definitions(-DUSE_CUDA)
@@ -26,18 +30,18 @@ endif()
 file(GLOB lib_cuda_hdrs        "include/opencv2/${name}/cuda/*.hpp"        "include/opencv2/${name}/cuda/*.h")
 file(GLOB lib_cuda_hdrs_detail "include/opencv2/${name}/cuda/detail/*.hpp" "include/opencv2/${name}/cuda/detail/*.h")
 
-if (NOT HAVE_opencv_dynamicuda)
+if (NOT ENABLE_DYNAMIC_CUDA)
   file(GLOB lib_cuda "../dynamicuda/src/cuda/*.cu*")
 endif()
 
 source_group("Cuda Headers" FILES ${lib_cuda_hdrs})
 source_group("Cuda Headers\\Detail" FILES ${lib_cuda_hdrs_detail})
 
-if (NOT HAVE_opencv_dynamicuda)
+if (NOT ENABLE_DYNAMIC_CUDA)
   source_group("Src\\Cuda" FILES ${lib_cuda} ${lib_cuda_hdrs})
 endif()
 
-if (HAVE_opencv_dynamicuda)
+if (ENABLE_DYNAMIC_CUDA)
   ocv_glob_module_sources(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc"
     HEADERS ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail})
 else()
diff --git a/modules/dynamicuda/CMakeLists.txt b/modules/dynamicuda/CMakeLists.txt
index 2ae5cf84a..def05d19b 100644
--- a/modules/dynamicuda/CMakeLists.txt
+++ b/modules/dynamicuda/CMakeLists.txt
@@ -5,6 +5,7 @@ endif()
 
 set(the_description "Dynamic CUDA linkage")
 add_definitions(-DUSE_CUDA)
+ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef)
 ocv_module_include_directories("${OpenCV_SOURCE_DIR}/modules/gpu/include")
 set(OPENCV_MODULE_TYPE SHARED)
 if (BUILD_FAT_JAVA_LIB)
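The CMake changes above set up two mutually exclusive build modes. With ENABLE_DYNAMIC_CUDA=ON, opencv_core is linked without the CUDA and NPP libraries and is compiled with -DDYNAMIC_CUDA_SUPPORT, deferring all GPU work to the separately built opencv_dynamicuda shared module; with the option OFF, core links CUDA statically and is compiled with -DUSE_CUDA. A minimal sketch of the runtime selection this split makes possible follows; the selector function, library name, and factory symbol are illustrative assumptions, not code from this patch:

#include <dlfcn.h>

// Hypothetical selector: pick a GpuFuncTable implementation once, at first use.
static GpuFuncTable* gpuFuncTable()
{
#if defined(USE_CUDA)
    static CudaFuncTable table;   // CUDA backend compiled straight into core
    return &table;
#elif defined(DYNAMIC_CUDA_SUPPORT)
    static GpuFuncTable* table = 0;
    if (!table)
    {
        // Look for the optional CUDA backend; fall back to the no-GPU stubs.
        void* handle = dlopen("libopencv_dynamicuda.so", RTLD_LAZY);
        typedef GpuFuncTable* (*factory_t)();
        factory_t factory = handle ? (factory_t)dlsym(handle, "gpuFactory") : 0;
        static EmptyFuncTable stub;
        table = factory ? factory() : &stub;
    }
    return table;
#else
    static EmptyFuncTable stub;   // built without CUDA support entirely
    return &stub;
#endif
}

Either way, the rest of opencv_core only ever calls through the table types declared in dynamicuda.hpp below, so the same core binary can run on devices with or without a CUDA runtime.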
diff --git a/modules/dynamicuda/include/opencv2/dynamicuda/dynamicuda.hpp b/modules/dynamicuda/include/opencv2/dynamicuda/dynamicuda.hpp
index 9281655d7..4f5175513 100644
--- a/modules/dynamicuda/include/opencv2/dynamicuda/dynamicuda.hpp
+++ b/modules/dynamicuda/include/opencv2/dynamicuda/dynamicuda.hpp
@@ -1,123 +1,123 @@
 #ifndef __GPUMAT_CUDA_HPP__
 #define __GPUMAT_CUDA_HPP__
 
-    class DeviceInfoFuncTable
-    {
-    public:
-        // cv::DeviceInfo
-        virtual size_t sharedMemPerBlock() const = 0;
-        virtual void queryMemory(size_t&, size_t&) const = 0;
-        virtual size_t freeMemory() const = 0;
-        virtual size_t totalMemory() const = 0;
-        virtual bool supports(FeatureSet) const = 0;
-        virtual bool isCompatible() const = 0;
-        virtual void query() = 0;
-        virtual int deviceID() const = 0;
-        virtual std::string name() const = 0;
-        virtual int majorVersion() const = 0;
-        virtual int minorVersion() const = 0;
-        virtual int multiProcessorCount() const = 0;
-        virtual int getCudaEnabledDeviceCount() const = 0;
-        virtual void setDevice(int) const = 0;
-        virtual int getDevice() const = 0;
-        virtual void resetDevice() const = 0;
-        virtual bool deviceSupports(FeatureSet) const = 0;
+class DeviceInfoFuncTable
+{
+public:
+    // cv::DeviceInfo
+    virtual size_t sharedMemPerBlock() const = 0;
+    virtual void queryMemory(size_t&, size_t&) const = 0;
+    virtual size_t freeMemory() const = 0;
+    virtual size_t totalMemory() const = 0;
+    virtual bool supports(FeatureSet) const = 0;
+    virtual bool isCompatible() const = 0;
+    virtual void query() = 0;
+    virtual int deviceID() const = 0;
+    virtual std::string name() const = 0;
+    virtual int majorVersion() const = 0;
+    virtual int minorVersion() const = 0;
+    virtual int multiProcessorCount() const = 0;
+    virtual int getCudaEnabledDeviceCount() const = 0;
+    virtual void setDevice(int) const = 0;
+    virtual int getDevice() const = 0;
+    virtual void resetDevice() const = 0;
+    virtual bool deviceSupports(FeatureSet) const = 0;
 
-        // cv::TargetArchs
-        virtual bool builtWith(FeatureSet) const = 0;
-        virtual bool has(int, int) const = 0;
-        virtual bool hasPtx(int, int) const = 0;
-        virtual bool hasBin(int, int) const = 0;
-        virtual bool hasEqualOrLessPtx(int, int) const = 0;
-        virtual bool hasEqualOrGreater(int, int) const = 0;
-        virtual bool hasEqualOrGreaterPtx(int, int) const = 0;
-        virtual bool hasEqualOrGreaterBin(int, int) const = 0;
+    // cv::TargetArchs
+    virtual bool builtWith(FeatureSet) const = 0;
+    virtual bool has(int, int) const = 0;
+    virtual bool hasPtx(int, int) const = 0;
+    virtual bool hasBin(int, int) const = 0;
+    virtual bool hasEqualOrLessPtx(int, int) const = 0;
+    virtual bool hasEqualOrGreater(int, int) const = 0;
+    virtual bool hasEqualOrGreaterPtx(int, int) const = 0;
+    virtual bool hasEqualOrGreaterBin(int, int) const = 0;
 
-        virtual void printCudaDeviceInfo(int) const = 0;
-        virtual void printShortCudaDeviceInfo(int) const = 0;
+    virtual void printCudaDeviceInfo(int) const = 0;
+    virtual void printShortCudaDeviceInfo(int) const = 0;
 
-        virtual ~DeviceInfoFuncTable() {};
-    };
+    virtual ~DeviceInfoFuncTable() {};
+};
 
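DeviceInfoFuncTable mirrors, one virtual per call, everything cv::gpu::DeviceInfo and cv::gpu::TargetArchs need, so device queries work identically whether the backend is real or a stub. A sketch of the forwarding layer under the same assumptions as above, with a deviceInfoFuncTable() selector analogous to the hypothetical gpuFuncTable():

// Illustrative forwarding only: the public entry points contain no CUDA
// code and compile the same way in every build mode.
int getCudaEnabledDeviceCountSketch()
{
    return deviceInfoFuncTable()->getCudaEnabledDeviceCount();
}

bool deviceSupportsSketch(FeatureSet feature)
{
    return deviceInfoFuncTable()->deviceSupports(feature);
}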
-    class GpuFuncTable
-    {
-    public:
-        virtual ~GpuFuncTable() {}
+class GpuFuncTable
+{
+public:
+    virtual ~GpuFuncTable() {}
 
-        // GpuMat routines
-        virtual void copy(const Mat& src, GpuMat& dst) const = 0;
-        virtual void copy(const GpuMat& src, Mat& dst) const = 0;
-        virtual void copy(const GpuMat& src, GpuMat& dst) const = 0;
+    // GpuMat routines
+    virtual void copy(const Mat& src, GpuMat& dst) const = 0;
+    virtual void copy(const GpuMat& src, Mat& dst) const = 0;
+    virtual void copy(const GpuMat& src, GpuMat& dst) const = 0;
 
-        virtual void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const = 0;
+    virtual void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const = 0;
 
-        // gpu::device::convertTo funcs
-        virtual void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0) const = 0;
-        virtual void convert(const GpuMat& src, GpuMat& dst) const = 0;
+    // gpu::device::convertTo funcs
+    virtual void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0) const = 0;
+    virtual void convert(const GpuMat& src, GpuMat& dst) const = 0;
 
-        // for gpu::device::setTo funcs
-        virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, CUstream_st*) const = 0;
+    // for gpu::device::setTo funcs
+    virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, CUstream_st*) const = 0;
 
-        virtual void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const = 0;
-        virtual void free(void* devPtr) const = 0;
-    };
+    virtual void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const = 0;
+    virtual void free(void* devPtr) const = 0;
+};
 
-    class EmptyDeviceInfoFuncTable: public DeviceInfoFuncTable
-    {
-    public:
-        size_t sharedMemPerBlock() const { throw_nogpu; return 0; }
-        void queryMemory(size_t&, size_t&) const { throw_nogpu; }
-        size_t freeMemory() const { throw_nogpu; return 0; }
-        size_t totalMemory() const { throw_nogpu; return 0; }
-        bool supports(FeatureSet) const { throw_nogpu; return false; }
-        bool isCompatible() const { throw_nogpu; return false; }
-        void query() { throw_nogpu; }
-        int deviceID() const { throw_nogpu; return -1; };
-        std::string name() const { throw_nogpu; return std::string(); }
-        int majorVersion() const { throw_nogpu; return -1; }
-        int minorVersion() const { throw_nogpu; return -1; }
-        int multiProcessorCount() const { throw_nogpu; return -1; }
+class EmptyDeviceInfoFuncTable: public DeviceInfoFuncTable
+{
+public:
+    size_t sharedMemPerBlock() const { throw_nogpu; return 0; }
+    void queryMemory(size_t&, size_t&) const { throw_nogpu; }
+    size_t freeMemory() const { throw_nogpu; return 0; }
+    size_t totalMemory() const { throw_nogpu; return 0; }
+    bool supports(FeatureSet) const { throw_nogpu; return false; }
+    bool isCompatible() const { throw_nogpu; return false; }
+    void query() { throw_nogpu; }
+    int deviceID() const { throw_nogpu; return -1; };
+    std::string name() const { throw_nogpu; return std::string(); }
+    int majorVersion() const { throw_nogpu; return -1; }
+    int minorVersion() const { throw_nogpu; return -1; }
+    int multiProcessorCount() const { throw_nogpu; return -1; }
 
-        int getCudaEnabledDeviceCount() const { return 0; }
+    int getCudaEnabledDeviceCount() const { return 0; }
 
-        void setDevice(int) const { throw_nogpu; }
-        int getDevice() const { throw_nogpu; return 0; }
+    void setDevice(int) const { throw_nogpu; }
+    int getDevice() const { throw_nogpu; return 0; }
 
-        void resetDevice() const { throw_nogpu; }
+    void resetDevice() const { throw_nogpu; }
 
-        bool deviceSupports(FeatureSet) const { throw_nogpu; return false; }
+    bool deviceSupports(FeatureSet) const { throw_nogpu; return false; }
 
-        bool builtWith(FeatureSet) const { throw_nogpu; return false; }
-        bool has(int, int) const { throw_nogpu; return false; }
-        bool hasPtx(int, int) const { throw_nogpu; return false; }
-        bool hasBin(int, int) const { throw_nogpu; return false; }
-        bool hasEqualOrLessPtx(int, int) const { throw_nogpu; return false; }
-        bool hasEqualOrGreater(int, int) const { throw_nogpu; return false; }
-        bool hasEqualOrGreaterPtx(int, int) const { throw_nogpu; return false; }
-        bool hasEqualOrGreaterBin(int, int) const { throw_nogpu; return false; }
+    bool builtWith(FeatureSet) const { throw_nogpu; return false; }
+    bool has(int, int) const { throw_nogpu; return false; }
+    bool hasPtx(int, int) const { throw_nogpu; return false; }
+    bool hasBin(int, int) const { throw_nogpu; return false; }
+    bool hasEqualOrLessPtx(int, int) const { throw_nogpu; return false; }
+    bool hasEqualOrGreater(int, int) const { throw_nogpu; return false; }
+    bool hasEqualOrGreaterPtx(int, int) const { throw_nogpu; return false; }
+    bool hasEqualOrGreaterBin(int, int) const { throw_nogpu; return false; }
 
-        void printCudaDeviceInfo(int) const { throw_nogpu; }
-        void printShortCudaDeviceInfo(int) const { throw_nogpu; }
-    };
+    void printCudaDeviceInfo(int) const { throw_nogpu; }
+    void printShortCudaDeviceInfo(int) const { throw_nogpu; }
+};
 
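Note the deliberate exceptions in the stub tables: getCudaEnabledDeviceCount() returns 0 instead of throwing, so "is a GPU present?" probes stay safe in a CUDA-less process, and free() in EmptyFuncTable below is a silent no-op, so releasing a GpuMat never raises. Everything else fails through throw_nogpu; a plausible expansion, assumed here rather than shown in this patch, is:

// Assumed definition: raise a catchable cv::Exception instead of aborting.
#define throw_nogpu CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support")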
-    class EmptyFuncTable : public GpuFuncTable
-    {
-    public:
+class EmptyFuncTable : public GpuFuncTable
+{
+public:
 
-        void copy(const Mat&, GpuMat&) const { throw_nogpu; }
-        void copy(const GpuMat&, Mat&) const { throw_nogpu; }
-        void copy(const GpuMat&, GpuMat&) const { throw_nogpu; }
+    void copy(const Mat&, GpuMat&) const { throw_nogpu; }
+    void copy(const GpuMat&, Mat&) const { throw_nogpu; }
+    void copy(const GpuMat&, GpuMat&) const { throw_nogpu; }
 
-        void copyWithMask(const GpuMat&, GpuMat&, const GpuMat&) const { throw_nogpu; }
+    void copyWithMask(const GpuMat&, GpuMat&, const GpuMat&) const { throw_nogpu; }
 
-        void convert(const GpuMat&, GpuMat&) const { throw_nogpu; }
-        void convert(const GpuMat&, GpuMat&, double, double, cudaStream_t stream = 0) const { (void)stream; throw_nogpu; }
+    void convert(const GpuMat&, GpuMat&) const { throw_nogpu; }
+    void convert(const GpuMat&, GpuMat&, double, double, cudaStream_t stream = 0) const { (void)stream; throw_nogpu; }
 
-        virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, CUstream_st*) const { throw_nogpu; }
+    virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, CUstream_st*) const { throw_nogpu; }
 
-        void mallocPitch(void**, size_t*, size_t, size_t) const { throw_nogpu; }
-        void free(void*) const {}
-    };
+    void mallocPitch(void**, size_t*, size_t, size_t) const { throw_nogpu; }
+    void free(void*) const {}
+};
 
 #if defined(USE_CUDA)
 
@@ -153,940 +153,949 @@ namespace cv { namespace gpu { namespace device
     void convert_gpu(PtrStepSzb src, int sdepth, PtrStepSzb dst, int ddepth, double alpha, double beta, cudaStream_t stream);
 }}}
 
-    template void kernelSetCaller(GpuMat& src, Scalar s, cudaStream_t stream)
+template void kernelSetCaller(GpuMat& src, Scalar s, cudaStream_t stream)
+{
+    Scalar_ sf = s;
+    cv::gpu::device::set_to_gpu(src, sf.val, src.channels(), stream);
+}
+
+template void kernelSetCaller(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream)
+{
+    Scalar_ sf = s;
+    cv::gpu::device::set_to_gpu(src, sf.val, mask, src.channels(), stream);
+}
+
+template struct NPPTypeTraits;
+template<> struct NPPTypeTraits { typedef Npp8u npp_type; };
+template<> struct NPPTypeTraits { typedef Npp8s npp_type; };
+template<> struct NPPTypeTraits { typedef Npp16u npp_type; };
+template<> struct NPPTypeTraits { typedef Npp16s npp_type; };
+template<> struct NPPTypeTraits { typedef Npp32s npp_type; };
+template<> struct NPPTypeTraits { typedef Npp32f npp_type; };
+template<> struct
NPPTypeTraits { typedef Npp64f npp_type; }; + +////////////////////////////////////////////////////////////////////////// +// Convert + +template struct NppConvertFunc +{ + typedef typename NPPTypeTraits::npp_type src_t; + typedef typename NPPTypeTraits::npp_type dst_t; + + typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI); +}; +template struct NppConvertFunc +{ + typedef typename NPPTypeTraits::npp_type dst_t; + + typedef NppStatus (*func_ptr)(const Npp32f* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI, NppRoundMode eRoundMode); +}; + +template::func_ptr func> struct NppCvt +{ + typedef typename NPPTypeTraits::npp_type src_t; + typedef typename NPPTypeTraits::npp_type dst_t; + + static void call(const GpuMat& src, GpuMat& dst) { - Scalar_ sf = s; - cv::gpu::device::set_to_gpu(src, sf.val, src.channels(), stream); + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz) ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } +}; + +template::func_ptr func> struct NppCvt +{ + typedef typename NPPTypeTraits::npp_type dst_t; + + static void call(const GpuMat& src, GpuMat& dst) + { + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz, NPP_RND_NEAR) ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } +}; + +////////////////////////////////////////////////////////////////////////// +// Set + +template struct NppSetFunc +{ + typedef typename NPPTypeTraits::npp_type src_t; + + typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI); +}; +template struct NppSetFunc +{ + typedef typename NPPTypeTraits::npp_type src_t; + + typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI); +}; +template struct NppSetFunc +{ + typedef NppStatus (*func_ptr)(Npp8s values[], Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI); +}; +template<> struct NppSetFunc +{ + typedef NppStatus (*func_ptr)(Npp8s val, Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI); +}; + +template::func_ptr func> struct NppSet +{ + typedef typename NPPTypeTraits::npp_type src_t; + + static void call(GpuMat& src, Scalar s) + { + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + Scalar_ nppS = s; + + nppSafeCall( func(nppS.val, src.ptr(), static_cast(src.step), sz) ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } +}; +template::func_ptr func> struct NppSet +{ + typedef typename NPPTypeTraits::npp_type src_t; + + static void call(GpuMat& src, Scalar s) + { + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + Scalar_ nppS = s; + + nppSafeCall( func(nppS[0], src.ptr(), static_cast(src.step), sz) ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } +}; + +template struct NppSetMaskFunc +{ + typedef typename NPPTypeTraits::npp_type src_t; + + typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); +}; +template struct NppSetMaskFunc +{ + typedef typename NPPTypeTraits::npp_type src_t; + + typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); +}; + +template::func_ptr func> struct NppSetMask +{ + typedef typename NPPTypeTraits::npp_type src_t; + + static void call(GpuMat& src, Scalar s, const GpuMat& mask) + { + NppiSize sz; + sz.width = 
src.cols; + sz.height = src.rows; + + Scalar_ nppS = s; + + nppSafeCall( func(nppS.val, src.ptr(), static_cast(src.step), sz, mask.ptr(), static_cast(mask.step)) ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } +}; +template::func_ptr func> struct NppSetMask +{ + typedef typename NPPTypeTraits::npp_type src_t; + + static void call(GpuMat& src, Scalar s, const GpuMat& mask) + { + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + Scalar_ nppS = s; + + nppSafeCall( func(nppS[0], src.ptr(), static_cast(src.step), sz, mask.ptr(), static_cast(mask.step)) ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } +}; + +////////////////////////////////////////////////////////////////////////// +// CopyMasked + +template struct NppCopyMaskedFunc +{ + typedef typename NPPTypeTraits::npp_type src_t; + + typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, src_t* pDst, int nDstStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); +}; + +template::func_ptr func> struct NppCopyMasked +{ + typedef typename NPPTypeTraits::npp_type src_t; + + static void call(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t /*stream*/) + { + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz, mask.ptr(), static_cast(mask.step)) ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } +}; + +template static inline bool isAligned(const T* ptr, size_t size) +{ + return reinterpret_cast(ptr) % size == 0; +} + +namespace cv { namespace gpu { namespace device +{ + void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream = 0); + void convertTo(const GpuMat& src, GpuMat& dst); + void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0); + void setTo(GpuMat& src, Scalar s, cudaStream_t stream); + void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream); + void setTo(GpuMat& src, Scalar s); + void setTo(GpuMat& src, Scalar s, const GpuMat& mask); + + void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream) + { + CV_Assert(src.size() == dst.size() && src.type() == dst.type()); + CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels())); + + cv::gpu::device::copyToWithMask_gpu(src.reshape(1), dst.reshape(1), src.elemSize1(), src.channels(), mask.reshape(1), mask.channels() != 1, stream); } - template void kernelSetCaller(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream) + void convertTo(const GpuMat& src, GpuMat& dst) { - Scalar_ sf = s; - cv::gpu::device::set_to_gpu(src, sf.val, mask, src.channels(), stream); + cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0, 0); } - template struct NPPTypeTraits; - template<> struct NPPTypeTraits { typedef Npp8u npp_type; }; - template<> struct NPPTypeTraits { typedef Npp8s npp_type; }; - template<> struct NPPTypeTraits { typedef Npp16u npp_type; }; - template<> struct NPPTypeTraits { typedef Npp16s npp_type; }; - template<> struct NPPTypeTraits { typedef Npp32s npp_type; }; - template<> struct NPPTypeTraits { typedef Npp32f npp_type; }; - template<> struct NPPTypeTraits { typedef Npp64f npp_type; }; - - ////////////////////////////////////////////////////////////////////////// - // Convert - - template struct NppConvertFunc + void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, 
cudaStream_t stream) { - typedef typename NPPTypeTraits::npp_type src_t; - typedef typename NPPTypeTraits::npp_type dst_t; - - typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI); - }; - template struct NppConvertFunc - { - typedef typename NPPTypeTraits::npp_type dst_t; - - typedef NppStatus (*func_ptr)(const Npp32f* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI, NppRoundMode eRoundMode); - }; - - template::func_ptr func> struct NppCvt - { - typedef typename NPPTypeTraits::npp_type src_t; - typedef typename NPPTypeTraits::npp_type dst_t; - - static void call(const GpuMat& src, GpuMat& dst) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - - template::func_ptr func> struct NppCvt - { - typedef typename NPPTypeTraits::npp_type dst_t; - - static void call(const GpuMat& src, GpuMat& dst) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz, NPP_RND_NEAR) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - - ////////////////////////////////////////////////////////////////////////// - // Set - - template struct NppSetFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - - typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI); - }; - template struct NppSetFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - - typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI); - }; - template struct NppSetFunc - { - typedef NppStatus (*func_ptr)(Npp8s values[], Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI); - }; - template<> struct NppSetFunc - { - typedef NppStatus (*func_ptr)(Npp8s val, Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI); - }; - - template::func_ptr func> struct NppSet - { - typedef typename NPPTypeTraits::npp_type src_t; - - static void call(GpuMat& src, Scalar s) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - Scalar_ nppS = s; - - nppSafeCall( func(nppS.val, src.ptr(), static_cast(src.step), sz) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - template::func_ptr func> struct NppSet - { - typedef typename NPPTypeTraits::npp_type src_t; - - static void call(GpuMat& src, Scalar s) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - Scalar_ nppS = s; - - nppSafeCall( func(nppS[0], src.ptr(), static_cast(src.step), sz) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - - template struct NppSetMaskFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - - typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); - }; - template struct NppSetMaskFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - - typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); - }; - - template::func_ptr func> struct NppSetMask - { - typedef typename NPPTypeTraits::npp_type src_t; - - static void call(GpuMat& src, Scalar s, const GpuMat& mask) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - Scalar_ nppS = s; - - nppSafeCall( func(nppS.val, src.ptr(), static_cast(src.step), sz, mask.ptr(), static_cast(mask.step)) ); - - cudaSafeCall( 
cudaDeviceSynchronize() ); - } - }; - template::func_ptr func> struct NppSetMask - { - typedef typename NPPTypeTraits::npp_type src_t; - - static void call(GpuMat& src, Scalar s, const GpuMat& mask) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - Scalar_ nppS = s; - - nppSafeCall( func(nppS[0], src.ptr(), static_cast(src.step), sz, mask.ptr(), static_cast(mask.step)) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - - ////////////////////////////////////////////////////////////////////////// - // CopyMasked - - template struct NppCopyMaskedFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - - typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, src_t* pDst, int nDstStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); - }; - - template::func_ptr func> struct NppCopyMasked - { - typedef typename NPPTypeTraits::npp_type src_t; - - static void call(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t /*stream*/) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz, mask.ptr(), static_cast(mask.step)) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - - template static inline bool isAligned(const T* ptr, size_t size) - { - return reinterpret_cast(ptr) % size == 0; + cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), alpha, beta, stream); } - namespace cv { namespace gpu { namespace device + void setTo(GpuMat& src, Scalar s, cudaStream_t stream) { - void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream = 0) + typedef void (*caller_t)(GpuMat& src, Scalar s, cudaStream_t stream); + + static const caller_t callers[] = { - CV_Assert(src.size() == dst.size() && src.type() == dst.type()); - CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels())); + kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, + kernelSetCaller, kernelSetCaller + }; - cv::gpu::device::copyToWithMask_gpu(src.reshape(1), dst.reshape(1), src.elemSize1(), src.channels(), mask.reshape(1), mask.channels() != 1, stream); - } + callers[src.depth()](src, s, stream); + } - void convertTo(const GpuMat& src, GpuMat& dst) - { - cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0, 0); - } - - void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0) - { - cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), alpha, beta, stream); - } - - void setTo(GpuMat& src, Scalar s, cudaStream_t stream) - { - typedef void (*caller_t)(GpuMat& src, Scalar s, cudaStream_t stream); - - static const caller_t callers[] = - { - kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, - kernelSetCaller, kernelSetCaller - }; - - callers[src.depth()](src, s, stream); - } - - void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream) - { - typedef void (*caller_t)(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream); - - static const caller_t callers[] = - { - kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, - kernelSetCaller, kernelSetCaller - }; - - callers[src.depth()](src, s, mask, stream); - } - - void setTo(GpuMat& src, Scalar s) - { - setTo(src, s, 0); - } - - void setTo(GpuMat& src, Scalar s, const 
GpuMat& mask) - { - setTo(src, s, mask, 0); - } - }}} - - - class CudaArch + void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream) { - public: - CudaArch() + typedef void (*caller_t)(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream); + + static const caller_t callers[] = { - fromStr(CUDA_ARCH_BIN, bin); - fromStr(CUDA_ARCH_PTX, ptx); - fromStr(CUDA_ARCH_FEATURES, features); - } + kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, + kernelSetCaller, kernelSetCaller + }; - bool builtWith(FeatureSet feature_set) const - { - return !features.empty() && (features.back() >= feature_set); - } + callers[src.depth()](src, s, mask, stream); + } - bool hasPtx(int major, int minor) const - { - return find(ptx.begin(), ptx.end(), major * 10 + minor) != ptx.end(); - } - - bool hasBin(int major, int minor) const - { - return find(bin.begin(), bin.end(), major * 10 + minor) != bin.end(); - } - - bool hasEqualOrLessPtx(int major, int minor) const - { - return !ptx.empty() && (ptx.front() <= major * 10 + minor); - } - - bool hasEqualOrGreaterPtx(int major, int minor) const - { - return !ptx.empty() && (ptx.back() >= major * 10 + minor); - } - - bool hasEqualOrGreaterBin(int major, int minor) const - { - return !bin.empty() && (bin.back() >= major * 10 + minor); - } - - - private: - void fromStr(const string& set_as_str, vector& arr) - { - if (set_as_str.find_first_not_of(" ") == string::npos) - return; - - istringstream stream(set_as_str); - int cur_value; - - while (!stream.eof()) - { - stream >> cur_value; - arr.push_back(cur_value); - } - - sort(arr.begin(), arr.end()); - } - - vector bin; - vector ptx; - vector features; - }; - - class DeviceProps + void setTo(GpuMat& src, Scalar s) { - public: - DeviceProps() - { - props_.resize(10, 0); - } + setTo(src, s, 0); + } - ~DeviceProps() - { - for (size_t i = 0; i < props_.size(); ++i) - { - if (props_[i]) - delete props_[i]; - } - props_.clear(); - } - - cudaDeviceProp* get(int devID) - { - if (devID >= (int) props_.size()) - props_.resize(devID + 5, 0); - - if (!props_[devID]) - { - props_[devID] = new cudaDeviceProp; - cudaSafeCall( cudaGetDeviceProperties(props_[devID], devID) ); - } - - return props_[devID]; - } - private: - std::vector props_; - }; - - DeviceProps deviceProps; - - class CudaDeviceInfoFuncTable: DeviceInfoFuncTable + void setTo(GpuMat& src, Scalar s, const GpuMat& mask) { - public: - size_t sharedMemPerBlock() const + setTo(src, s, mask, 0); + } +}}} + +class CudaArch +{ +public: + CudaArch() + { + fromStr(CUDA_ARCH_BIN, bin); + fromStr(CUDA_ARCH_PTX, ptx); + fromStr(CUDA_ARCH_FEATURES, features); + } + + bool builtWith(FeatureSet feature_set) const + { + return !features.empty() && (features.back() >= feature_set); + } + + bool hasPtx(int major, int minor) const + { + return find(ptx.begin(), ptx.end(), major * 10 + minor) != ptx.end(); + } + + bool hasBin(int major, int minor) const + { + return find(bin.begin(), bin.end(), major * 10 + minor) != bin.end(); + } + + bool hasEqualOrLessPtx(int major, int minor) const + { + return !ptx.empty() && (ptx.front() <= major * 10 + minor); + } + + bool hasEqualOrGreaterPtx(int major, int minor) const + { + return !ptx.empty() && (ptx.back() >= major * 10 + minor); + } + + bool hasEqualOrGreaterBin(int major, int minor) const + { + return !bin.empty() && (bin.back() >= major * 10 + minor); + } + + +private: + void fromStr(const string& set_as_str, vector& arr) + { + if (set_as_str.find_first_not_of(" ") == string::npos) + 
return; + + istringstream stream(set_as_str); + int cur_value; + + while (!stream.eof()) { - return deviceProps.get(device_id_)->sharedMemPerBlock; + stream >> cur_value; + arr.push_back(cur_value); } - void queryMemory(size_t& _totalMemory, size_t& _freeMemory) const + sort(arr.begin(), arr.end()); + } + + vector bin; + vector ptx; + vector features; +}; + +class DeviceProps +{ +public: + DeviceProps() + { + props_.resize(10, 0); + } + + ~DeviceProps() + { + for (size_t i = 0; i < props_.size(); ++i) { - int prevDeviceID = getDevice(); - if (prevDeviceID != device_id_) - setDevice(device_id_); + if (props_[i]) + delete props_[i]; + } + props_.clear(); + } - cudaSafeCall( cudaMemGetInfo(&_freeMemory, &_totalMemory) ); + cudaDeviceProp* get(int devID) + { + if (devID >= (int) props_.size()) + props_.resize(devID + 5, 0); - if (prevDeviceID != device_id_) - setDevice(prevDeviceID); + if (!props_[devID]) + { + props_[devID] = new cudaDeviceProp; + cudaSafeCall( cudaGetDeviceProperties(props_[devID], devID) ); } - size_t freeMemory() const - { - size_t _totalMemory, _freeMemory; - queryMemory(_totalMemory, _freeMemory); - return _freeMemory; - } + return props_[devID]; + } +private: + std::vector props_; +}; - size_t totalMemory() const - { - size_t _totalMemory, _freeMemory; - queryMemory(_totalMemory, _freeMemory); - return _totalMemory; - } +DeviceProps deviceProps; - bool supports(FeatureSet feature_set) const - { - int version = majorVersion_ * 10 + minorVersion_; - return version >= feature_set; - } +class CudaDeviceInfoFuncTable: DeviceInfoFuncTable +{ +public: + size_t sharedMemPerBlock() const + { + return deviceProps.get(device_id_)->sharedMemPerBlock; + } - bool isCompatible() const - { - // Check PTX compatibility - if (hasEqualOrLessPtx(majorVersion_, minorVersion_)) - return true; + void queryMemory(size_t& _totalMemory, size_t& _freeMemory) const + { + int prevDeviceID = getDevice(); + if (prevDeviceID != device_id_) + setDevice(device_id_); - // Check BIN compatibility - for (int i = minorVersion_; i >= 0; --i) - if (hasBin(majorVersion_, i)) - return true; + cudaSafeCall( cudaMemGetInfo(&_freeMemory, &_totalMemory) ); - return false; - } + if (prevDeviceID != device_id_) + setDevice(prevDeviceID); + } - void query() - { - const cudaDeviceProp* prop = deviceProps.get(device_id_); + size_t freeMemory() const + { + size_t _totalMemory, _freeMemory; + queryMemory(_totalMemory, _freeMemory); + return _freeMemory; + } - name_ = prop->name; - multi_processor_count_ = prop->multiProcessorCount; - majorVersion_ = prop->major; - minorVersion_ = prop->minor; - } + size_t totalMemory() const + { + size_t _totalMemory, _freeMemory; + queryMemory(_totalMemory, _freeMemory); + return _totalMemory; + } - int deviceID() const - { - return device_id_; - } + bool supports(FeatureSet feature_set) const + { + int version = majorVersion_ * 10 + minorVersion_; + return version >= feature_set; + } - std::string name() const - { - return name_; - } + bool isCompatible() const + { + // Check PTX compatibility + if (hasEqualOrLessPtx(majorVersion_, minorVersion_)) + return true; - int majorVersion() const - { - return majorVersion_; - } + // Check BIN compatibility + for (int i = minorVersion_; i >= 0; --i) + if (hasBin(majorVersion_, i)) + return true; - int minorVersion() const - { - return minorVersion_; - } + return false; + } - int multiProcessorCount() const - { - return multi_processor_count_; - } + void query() + { + const cudaDeviceProp* prop = deviceProps.get(device_id_); - int 
getCudaEnabledDeviceCount() const - { - int count; - cudaError_t error = cudaGetDeviceCount( &count ); + name_ = prop->name; + multi_processor_count_ = prop->multiProcessorCount; + majorVersion_ = prop->major; + minorVersion_ = prop->minor; + } - if (error == cudaErrorInsufficientDriver) - return -1; + int deviceID() const + { + return device_id_; + } - if (error == cudaErrorNoDevice) - return 0; + std::string name() const + { + return name_; + } - cudaSafeCall( error ); - return count; - } + int majorVersion() const + { + return majorVersion_; + } - void setDevice(int device) const - { - cudaSafeCall( cudaSetDevice( device ) ); - } + int minorVersion() const + { + return minorVersion_; + } - int getDevice() const - { - int device; - cudaSafeCall( cudaGetDevice( &device ) ); - return device; - } + int multiProcessorCount() const + { + return multi_processor_count_; + } - void resetDevice() const - { - cudaSafeCall( cudaDeviceReset() ); - } - - bool builtWith(FeatureSet feature_set) const - { - return cudaArch.builtWith(feature_set); - } - - bool has(int major, int minor) const - { - return hasPtx(major, minor) || hasBin(major, minor); - } - - bool hasPtx(int major, int minor) const - { - return cudaArch.hasPtx(major, minor); - } - - bool hasBin(int major, int minor) const - { - return cudaArch.hasBin(major, minor); - } - - bool hasEqualOrLessPtx(int major, int minor) const - { - return cudaArch.hasEqualOrLessPtx(major, minor); - } - - bool hasEqualOrGreater(int major, int minor) const - { - return hasEqualOrGreaterPtx(major, minor) || hasEqualOrGreaterBin(major, minor); - } - - bool hasEqualOrGreaterPtx(int major, int minor) const - { - return cudaArch.hasEqualOrGreaterPtx(major, minor); - } - - bool hasEqualOrGreaterBin(int major, int minor) const - { - return cudaArch.hasEqualOrGreaterBin(major, minor); - } - - bool deviceSupports(FeatureSet feature_set) const - { - static int versions[] = - { - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 - }; - static const int cache_size = static_cast(sizeof(versions) / sizeof(versions[0])); - - const int devId = getDevice(); - - int version; - - if (devId < cache_size && versions[devId] >= 0) - version = versions[devId]; - else - { - DeviceInfo dev(devId); - version = dev.majorVersion() * 10 + dev.minorVersion(); - if (devId < cache_size) - versions[devId] = version; - } - - return TargetArchs::builtWith(feature_set) && (version >= feature_set); - } - - void printCudaDeviceInfo(int device) const - { - int count = getCudaEnabledDeviceCount(); - bool valid = (device >= 0) && (device < count); - - int beg = valid ? device : 0; - int end = valid ? 
device+1 : count; - - printf("*** CUDA Device Query (Runtime API) version (CUDART static linking) *** \n\n"); - printf("Device count: %d\n", count); - - int driverVersion = 0, runtimeVersion = 0; - cudaSafeCall( cudaDriverGetVersion(&driverVersion) ); - cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) ); - - const char *computeMode[] = { - "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)", - "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)", - "Prohibited (no host thread can use ::cudaSetDevice() with this device)", - "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)", - "Unknown", - NULL - }; - - for(int dev = beg; dev < end; ++dev) - { - cudaDeviceProp prop; - cudaSafeCall( cudaGetDeviceProperties(&prop, dev) ); - - printf("\nDevice %d: \"%s\"\n", dev, prop.name); - printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100); - printf(" CUDA Capability Major/Minor version number: %d.%d\n", prop.major, prop.minor); - printf(" Total amount of global memory: %.0f MBytes (%llu bytes)\n", (float)prop.totalGlobalMem/1048576.0f, (unsigned long long) prop.totalGlobalMem); - - int cores = convertSMVer2Cores(prop.major, prop.minor); - if (cores > 0) - printf(" (%2d) Multiprocessors x (%2d) CUDA Cores/MP: %d CUDA Cores\n", prop.multiProcessorCount, cores, cores * prop.multiProcessorCount); - - printf(" GPU Clock Speed: %.2f GHz\n", prop.clockRate * 1e-6f); - - printf(" Max Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n", - prop.maxTexture1D, prop.maxTexture2D[0], prop.maxTexture2D[1], - prop.maxTexture3D[0], prop.maxTexture3D[1], prop.maxTexture3D[2]); - printf(" Max Layered Texture Size (dim) x layers 1D=(%d) x %d, 2D=(%d,%d) x %d\n", - prop.maxTexture1DLayered[0], prop.maxTexture1DLayered[1], - prop.maxTexture2DLayered[0], prop.maxTexture2DLayered[1], prop.maxTexture2DLayered[2]); - - printf(" Total amount of constant memory: %u bytes\n", (int)prop.totalConstMem); - printf(" Total amount of shared memory per block: %u bytes\n", (int)prop.sharedMemPerBlock); - printf(" Total number of registers available per block: %d\n", prop.regsPerBlock); - printf(" Warp size: %d\n", prop.warpSize); - printf(" Maximum number of threads per block: %d\n", prop.maxThreadsPerBlock); - printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]); - printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]); - printf(" Maximum memory pitch: %u bytes\n", (int)prop.memPitch); - printf(" Texture alignment: %u bytes\n", (int)prop.textureAlignment); - - printf(" Concurrent copy and execution: %s with %d copy engine(s)\n", (prop.deviceOverlap ? "Yes" : "No"), prop.asyncEngineCount); - printf(" Run time limit on kernels: %s\n", prop.kernelExecTimeoutEnabled ? "Yes" : "No"); - printf(" Integrated GPU sharing Host Memory: %s\n", prop.integrated ? "Yes" : "No"); - printf(" Support host page-locked memory mapping: %s\n", prop.canMapHostMemory ? "Yes" : "No"); - - printf(" Concurrent kernel execution: %s\n", prop.concurrentKernels ? "Yes" : "No"); - printf(" Alignment requirement for Surfaces: %s\n", prop.surfaceAlignment ? "Yes" : "No"); - printf(" Device has ECC support enabled: %s\n", prop.ECCEnabled ? 
"Yes" : "No"); - printf(" Device is using TCC driver mode: %s\n", prop.tccDriver ? "Yes" : "No"); - printf(" Device supports Unified Addressing (UVA): %s\n", prop.unifiedAddressing ? "Yes" : "No"); - printf(" Device PCI Bus ID / PCI location ID: %d / %d\n", prop.pciBusID, prop.pciDeviceID ); - printf(" Compute Mode:\n"); - printf(" %s \n", computeMode[prop.computeMode]); - } - - printf("\n"); - printf("deviceQuery, CUDA Driver = CUDART"); - printf(", CUDA Driver Version = %d.%d", driverVersion / 1000, driverVersion % 100); - printf(", CUDA Runtime Version = %d.%d", runtimeVersion/1000, runtimeVersion%100); - printf(", NumDevs = %d\n\n", count); - fflush(stdout); - } - - void printShortCudaDeviceInfo(int device) const - { - int count = getCudaEnabledDeviceCount(); - bool valid = (device >= 0) && (device < count); - - int beg = valid ? device : 0; - int end = valid ? device+1 : count; - - int driverVersion = 0, runtimeVersion = 0; - cudaSafeCall( cudaDriverGetVersion(&driverVersion) ); - cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) ); - - for(int dev = beg; dev < end; ++dev) - { - cudaDeviceProp prop; - cudaSafeCall( cudaGetDeviceProperties(&prop, dev) ); - - const char *arch_str = prop.major < 2 ? " (not Fermi)" : ""; - printf("Device %d: \"%s\" %.0fMb", dev, prop.name, (float)prop.totalGlobalMem/1048576.0f); - printf(", sm_%d%d%s", prop.major, prop.minor, arch_str); - - int cores = convertSMVer2Cores(prop.major, prop.minor); - if (cores > 0) - printf(", %d cores", cores * prop.multiProcessorCount); - - printf(", Driver/Runtime ver.%d.%d/%d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100); - } - fflush(stdout); - } - - private: - int device_id_; - - std::string name_; - int multi_processor_count_; - int majorVersion_; - int minorVersion_; - - const CudaArch cudaArch; - - int convertSMVer2Cores(int major, int minor) const - { - // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM - typedef struct { - int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version - int Cores; - } SMtoCores; - - SMtoCores gpuArchCoresPerSM[] = { { 0x10, 8 }, { 0x11, 8 }, { 0x12, 8 }, { 0x13, 8 }, { 0x20, 32 }, { 0x21, 48 }, {0x30, 192}, {0x35, 192}, { -1, -1 } }; - - int index = 0; - while (gpuArchCoresPerSM[index].SM != -1) - { - if (gpuArchCoresPerSM[index].SM == ((major << 4) + minor) ) - return gpuArchCoresPerSM[index].Cores; - index++; - } + int getCudaEnabledDeviceCount() const + { + int count; + cudaError_t error = cudaGetDeviceCount( &count ); + if (error == cudaErrorInsufficientDriver) return -1; - } - }; - class CudaFuncTable : public GpuFuncTable + if (error == cudaErrorNoDevice) + return 0; + + cudaSafeCall( error ); + return count; + } + + void setDevice(int device) const { - public: + cudaSafeCall( cudaSetDevice( device ) ); + } - void copy(const Mat& src, GpuMat& dst) const + int getDevice() const + { + int device; + cudaSafeCall( cudaGetDevice( &device ) ); + return device; + } + + void resetDevice() const + { + cudaSafeCall( cudaDeviceReset() ); + } + + bool builtWith(FeatureSet feature_set) const + { + return cudaArch.builtWith(feature_set); + } + + bool has(int major, int minor) const + { + return hasPtx(major, minor) || hasBin(major, minor); + } + + bool hasPtx(int major, int minor) const + { + return cudaArch.hasPtx(major, minor); + } + + bool hasBin(int major, int minor) const + { + return cudaArch.hasBin(major, minor); + } + + bool hasEqualOrLessPtx(int major, int minor) const 
+ { + return cudaArch.hasEqualOrLessPtx(major, minor); + } + + bool hasEqualOrGreater(int major, int minor) const + { + return hasEqualOrGreaterPtx(major, minor) || hasEqualOrGreaterBin(major, minor); + } + + bool hasEqualOrGreaterPtx(int major, int minor) const + { + return cudaArch.hasEqualOrGreaterPtx(major, minor); + } + + bool hasEqualOrGreaterBin(int major, int minor) const + { + return cudaArch.hasEqualOrGreaterBin(major, minor); + } + + bool deviceSupports(FeatureSet feature_set) const + { + static int versions[] = { - cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyHostToDevice) ); - } - void copy(const GpuMat& src, Mat& dst) const + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 + }; + static const int cache_size = static_cast(sizeof(versions) / sizeof(versions[0])); + + const int devId = getDevice(); + + int version; + + if (devId < cache_size && versions[devId] >= 0) + version = versions[devId]; + else { - cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToHost) ); - } - void copy(const GpuMat& src, GpuMat& dst) const - { - cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToDevice) ); + DeviceInfo dev(devId); + version = dev.majorVersion() * 10 + dev.minorVersion(); + if (devId < cache_size) + versions[devId] = version; } - void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const - { - CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); - CV_Assert(src.size() == dst.size() && src.type() == dst.type()); - CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels())); + return TargetArchs::builtWith(feature_set) && (version >= feature_set); + } - if (src.depth() == CV_64F) + void printCudaDeviceInfo(int device) const + { + int count = getCudaEnabledDeviceCount(); + bool valid = (device >= 0) && (device < count); + + int beg = valid ? device : 0; + int end = valid ? 
device+1 : count; + + printf("*** CUDA Device Query (Runtime API) version (CUDART static linking) *** \n\n"); + printf("Device count: %d\n", count); + + int driverVersion = 0, runtimeVersion = 0; + cudaSafeCall( cudaDriverGetVersion(&driverVersion) ); + cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) ); + + const char *computeMode[] = { + "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)", + "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)", + "Prohibited (no host thread can use ::cudaSetDevice() with this device)", + "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)", + "Unknown", + NULL + }; + + for(int dev = beg; dev < end; ++dev) + { + cudaDeviceProp prop; + cudaSafeCall( cudaGetDeviceProperties(&prop, dev) ); + + printf("\nDevice %d: \"%s\"\n", dev, prop.name); + printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100); + printf(" CUDA Capability Major/Minor version number: %d.%d\n", prop.major, prop.minor); + printf(" Total amount of global memory: %.0f MBytes (%llu bytes)\n", (float)prop.totalGlobalMem/1048576.0f, (unsigned long long) prop.totalGlobalMem); + + int cores = convertSMVer2Cores(prop.major, prop.minor); + if (cores > 0) + printf(" (%2d) Multiprocessors x (%2d) CUDA Cores/MP: %d CUDA Cores\n", prop.multiProcessorCount, cores, cores * prop.multiProcessorCount); + + printf(" GPU Clock Speed: %.2f GHz\n", prop.clockRate * 1e-6f); + + printf(" Max Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n", + prop.maxTexture1D, prop.maxTexture2D[0], prop.maxTexture2D[1], + prop.maxTexture3D[0], prop.maxTexture3D[1], prop.maxTexture3D[2]); + printf(" Max Layered Texture Size (dim) x layers 1D=(%d) x %d, 2D=(%d,%d) x %d\n", + prop.maxTexture1DLayered[0], prop.maxTexture1DLayered[1], + prop.maxTexture2DLayered[0], prop.maxTexture2DLayered[1], prop.maxTexture2DLayered[2]); + + printf(" Total amount of constant memory: %u bytes\n", (int)prop.totalConstMem); + printf(" Total amount of shared memory per block: %u bytes\n", (int)prop.sharedMemPerBlock); + printf(" Total number of registers available per block: %d\n", prop.regsPerBlock); + printf(" Warp size: %d\n", prop.warpSize); + printf(" Maximum number of threads per block: %d\n", prop.maxThreadsPerBlock); + printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]); + printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]); + printf(" Maximum memory pitch: %u bytes\n", (int)prop.memPitch); + printf(" Texture alignment: %u bytes\n", (int)prop.textureAlignment); + + printf(" Concurrent copy and execution: %s with %d copy engine(s)\n", (prop.deviceOverlap ? "Yes" : "No"), prop.asyncEngineCount); + printf(" Run time limit on kernels: %s\n", prop.kernelExecTimeoutEnabled ? "Yes" : "No"); + printf(" Integrated GPU sharing Host Memory: %s\n", prop.integrated ? "Yes" : "No"); + printf(" Support host page-locked memory mapping: %s\n", prop.canMapHostMemory ? "Yes" : "No"); + + printf(" Concurrent kernel execution: %s\n", prop.concurrentKernels ? "Yes" : "No"); + printf(" Alignment requirement for Surfaces: %s\n", prop.surfaceAlignment ? "Yes" : "No"); + printf(" Device has ECC support enabled: %s\n", prop.ECCEnabled ? 
"Yes" : "No"); + printf(" Device is using TCC driver mode: %s\n", prop.tccDriver ? "Yes" : "No"); + printf(" Device supports Unified Addressing (UVA): %s\n", prop.unifiedAddressing ? "Yes" : "No"); + printf(" Device PCI Bus ID / PCI location ID: %d / %d\n", prop.pciBusID, prop.pciDeviceID ); + printf(" Compute Mode:\n"); + printf(" %s \n", computeMode[prop.computeMode]); + } + + printf("\n"); + printf("deviceQuery, CUDA Driver = CUDART"); + printf(", CUDA Driver Version = %d.%d", driverVersion / 1000, driverVersion % 100); + printf(", CUDA Runtime Version = %d.%d", runtimeVersion/1000, runtimeVersion%100); + printf(", NumDevs = %d\n\n", count); + fflush(stdout); + } + + void printShortCudaDeviceInfo(int device) const + { + int count = getCudaEnabledDeviceCount(); + bool valid = (device >= 0) && (device < count); + + int beg = valid ? device : 0; + int end = valid ? device+1 : count; + + int driverVersion = 0, runtimeVersion = 0; + cudaSafeCall( cudaDriverGetVersion(&driverVersion) ); + cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) ); + + for(int dev = beg; dev < end; ++dev) + { + cudaDeviceProp prop; + cudaSafeCall( cudaGetDeviceProperties(&prop, dev) ); + + const char *arch_str = prop.major < 2 ? " (not Fermi)" : ""; + printf("Device %d: \"%s\" %.0fMb", dev, prop.name, (float)prop.totalGlobalMem/1048576.0f); + printf(", sm_%d%d%s", prop.major, prop.minor, arch_str); + + int cores = convertSMVer2Cores(prop.major, prop.minor); + if (cores > 0) + printf(", %d cores", cores * prop.multiProcessorCount); + + printf(", Driver/Runtime ver.%d.%d/%d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100); + } + fflush(stdout); + } + +private: + int device_id_; + + std::string name_; + int multi_processor_count_; + int majorVersion_; + int minorVersion_; + + const CudaArch cudaArch; + + int convertSMVer2Cores(int major, int minor) const + { + // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM + typedef struct { + int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version + int Cores; + } SMtoCores; + + SMtoCores gpuArchCoresPerSM[] = { { 0x10, 8 }, { 0x11, 8 }, { 0x12, 8 }, { 0x13, 8 }, { 0x20, 32 }, { 0x21, 48 }, {0x30, 192}, {0x35, 192}, { -1, -1 } }; + + int index = 0; + while (gpuArchCoresPerSM[index].SM != -1) + { + if (gpuArchCoresPerSM[index].SM == ((major << 4) + minor) ) + return gpuArchCoresPerSM[index].Cores; + index++; + } + + return -1; + } +}; + +class CudaFuncTable : public GpuFuncTable +{ +public: + + void copy(const Mat& src, GpuMat& dst) const + { + cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyHostToDevice) ); + } + + void copy(const GpuMat& src, Mat& dst) const + { + cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToHost) ); + } + + void copy(const GpuMat& src, GpuMat& dst) const + { + cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToDevice) ); + } + + void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const + { + CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); + CV_Assert(src.size() == dst.size() && src.type() == dst.type()); + CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels())); + + if (src.depth() == CV_64F) + { + if (!TargetArchs::builtWith(NATIVE_DOUBLE) || 
+                CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+        }
+
+        typedef void (*func_t)(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream);
+        static const func_t funcs[7][4] =
+        {
+            /* 8U */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call},
+            /* 8S */ {cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask},
+            /* 16U */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call},
+            /* 16S */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call},
+            /* 32S */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call},
+            /* 32F */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call},
+            /* 64F */ {cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask}
+        };
+
+        const func_t func = mask.channels() == src.channels() ? funcs[src.depth()][src.channels() - 1] : cv::gpu::device::copyWithMask;
+
+        func(src, dst, mask, 0);
+    }
+
+    void convert(const GpuMat& src, GpuMat& dst) const
+    {
+        typedef void (*func_t)(const GpuMat& src, GpuMat& dst);
+        // conversion dispatch table, indexed as [src depth][dst depth][channels - 1]
+        static const func_t funcs[7][7][4] =
+        {
+            {
-            if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
-                CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+                /* 8U -> 8U */ {0, 0, 0, 0},
+                /* 8U -> 8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 8U -> 16U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt::call},
+                /* 8U -> 16S */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt::call},
+                /* 8U -> 32S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 8U -> 32F */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 8U -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}
+            },
+            {
+                /* 8S -> 8U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 8S -> 8S */ {0,0,0,0},
+                /* 8S -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 8S -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 8S -> 32S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 8S -> 32F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 8S -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}
+            },
+            {
+                /* 16U -> 8U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt::call},
+                /* 16U -> 8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 16U -> 16U */ {0,0,0,0},
+                /* 16U -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 16U -> 32S */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 16U -> 32F */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 16U -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}
+            },
+            {
+                /* 16S -> 8U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt::call},
+                /* 16S -> 8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 16S -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 16S -> 16S */ {0,0,0,0},
+                /* 16S -> 32S */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 16S -> 32F */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 16S -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}
+            },
+            {
+                /* 32S -> 8U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 32S -> 8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 32S -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 32S -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 32S -> 32S */ {0,0,0,0},
+                /* 32S -> 32F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 32S -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}
+            },
+            {
+                /* 32F -> 8U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 32F -> 8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 32F -> 16U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 32F -> 16S */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 32F -> 32S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 32F -> 32F */ {0,0,0,0},
+                /* 32F -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}
+            },
+            {
+                /* 64F -> 8U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 64F -> 8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 64F -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 64F -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 64F -> 32S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 64F -> 32F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
+                /* 64F -> 64F */ {0,0,0,0}
+            }
+        };
-            typedef void (*func_t)(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream);
-            static const func_t funcs[7][4] =
-            {
-                /* 8U */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call},
-                /* 8S */ {cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask},
-                /* 16U */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call},
-                /* 16S */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call},
-                /* 32S */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call},
-                /* 32F */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call},
-                /* 64F */ {cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask}
-            };
+        CV_Assert(src.depth() <= CV_64F && src.channels() <= 4);
+        CV_Assert(dst.depth() <= CV_64F);
+        CV_Assert(src.size() == dst.size() && src.channels() == dst.channels());
-            const func_t func = mask.channels() == src.channels() ? funcs[src.depth()][src.channels() - 1] : cv::gpu::device::copyWithMask;
-
-            func(src, dst, mask, 0);
+        if (src.depth() == CV_64F || dst.depth() == CV_64F)
+        {
+            if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+                CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
         }
-        void convert(const GpuMat& src, GpuMat& dst) const
+        bool aligned = isAligned(src.data, 16) && isAligned(dst.data, 16);
+        if (!aligned)
         {
-            typedef void (*func_t)(const GpuMat& src, GpuMat& dst);
-            static const func_t funcs[7][7][4] =
-            {
-                {
-                    /* 8U -> 8U */ {0, 0, 0, 0},
-                    /* 8U -> 8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 8U -> 16U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt::call},
-                    /* 8U -> 16S */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt::call},
-                    /* 8U -> 32S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 8U -> 32F */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 8U -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}
-                },
-                {
-                    /* 8S -> 8U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 8S -> 8S */ {0,0,0,0},
-                    /* 8S -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 8S -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 8S -> 32S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 8S -> 32F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 8S -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}
-                },
-                {
-                    /* 16U -> 8U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt::call},
-                    /* 16U -> 8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 16U -> 16U */ {0,0,0,0},
-                    /* 16U -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 16U -> 32S */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 16U -> 32F */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 16U -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}
-                },
-                {
-                    /* 16S -> 8U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt::call},
-                    /* 16S -> 8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 16S -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 16S -> 16S */ {0,0,0,0},
-                    /* 16S -> 32S */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 16S -> 32F */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 16S -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}
-                },
-                {
-                    /* 32S -> 8U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 32S -> 8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 32S -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 32S -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 32S -> 32S */ {0,0,0,0},
-                    /* 32S -> 32F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 32S -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}
-                },
-                {
-                    /* 32F -> 8U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 32F -> 8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 32F -> 16U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 32F -> 16S */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 32F -> 32S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 32F -> 32F */ {0,0,0,0},
-                    /* 32F -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}
-                },
-                {
-                    /* 64F -> 8U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 64F -> 8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 64F -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 64F -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 64F -> 32S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 64F -> 32F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo},
-                    /* 64F -> 64F */ {0,0,0,0}
-                }
-            };
-
-            CV_Assert(src.depth() <= CV_64F && src.channels() <= 4);
-            CV_Assert(dst.depth() <= CV_64F);
-            CV_Assert(src.size() == dst.size() && src.channels() == dst.channels());
-
-            if (src.depth() == CV_64F || dst.depth() == CV_64F)
-            {
-                if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
-                    CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
-            }
-
-            bool aligned = isAligned(src.data, 16) && isAligned(dst.data, 16);
-            if (!aligned)
-            {
-                cv::gpu::device::convertTo(src, dst);
-                return;
-            }
-
-            const func_t func = funcs[src.depth()][dst.depth()][src.channels() - 1];
-            CV_DbgAssert(func != 0);
-
-            func(src, dst);
-        }
-
-        void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream) const
-        {
-            CV_Assert(src.depth() <= CV_64F && src.channels() <= 4);
-            CV_Assert(dst.depth() <= CV_64F);
-
-            if (src.depth() == CV_64F || dst.depth() == CV_64F)
-            {
-                if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
-                    CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
-            }
-
-            cv::gpu::device::convertTo(src, dst, alpha, beta, stream);
-        }
-
-        void setTo(GpuMat& m, Scalar s, const GpuMat& mask, cudaStream_t stream) const
-        {
-            if (mask.empty())
-            {
-                if (s[0] == 0.0 && s[1] == 0.0 && s[2] == 0.0 && s[3] == 0.0)
-                {
-                    cudaSafeCall( cudaMemset2D(m.data, m.step, 0, m.cols * m.elemSize(), m.rows) );
-                    return;
-                }
-
-                if (m.depth() == CV_8U)
-                {
-                    int cn = m.channels();
-
-                    if (cn == 1 || (cn == 2 && s[0] == s[1]) || (cn == 3 && s[0] == s[1] && s[0] == s[2]) || (cn == 4 && s[0] == s[1] && s[0] == s[2] && s[0] == s[3]))
-                    {
-                        int val = saturate_cast<uchar>(s[0]);
-                        cudaSafeCall( cudaMemset2D(m.data, m.step, val, m.cols * m.elemSize(), m.rows) );
-                        return;
-                    }
-                }
-
-                typedef void (*func_t)(GpuMat& src, Scalar s);
-                static const func_t funcs[7][4] =
-                {
-                    {NppSet::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSet::call},
-                    {cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo},
-                    {NppSet::call, NppSet::call, cv::gpu::device::setTo, NppSet::call},
-                    {NppSet::call, NppSet::call, cv::gpu::device::setTo, NppSet::call},
-                    {NppSet::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSet::call},
-                    {NppSet::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSet::call},
-                    {cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo}
-                };
-
-                CV_Assert(m.depth() <= CV_64F && m.channels() <= 4);
-
-                if (m.depth() == CV_64F)
-                {
-                    if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
-                        CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
-                }
-
-                if (stream)
-                    cv::gpu::device::setTo(m, s, stream);
-                else
-                    funcs[m.depth()][m.channels() - 1](m, s);
-            }
-            else
-            {
-                typedef void (*func_t)(GpuMat& src, Scalar s, const GpuMat& mask);
-                static const func_t funcs[7][4] =
-                {
-                    {NppSetMask::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask::call},
-                    {cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo},
-                    {NppSetMask::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask::call},
-                    {NppSetMask::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask::call},
-                    {NppSetMask::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask::call},
-                    {NppSetMask::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask::call},
-                    {cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo}
-                };
-
-                CV_Assert(m.depth() <= CV_64F && m.channels() <= 4);
-
-                if (m.depth() == CV_64F)
-                {
-                    if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
-                        CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
-                }
-
-                if (stream)
-                    cv::gpu::device::setTo(m, s, mask, stream);
-                else
-                    funcs[m.depth()][m.channels() - 1](m, s, mask);
-            }
-        }
-
-        void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const
-        {
-            cudaSafeCall( cudaMallocPitch(devPtr, step, width, height) );
-        }
-
-        void free(void* devPtr) const
-        {
-            cudaFree(devPtr);
-        }
-    };
+            cv::gpu::device::convertTo(src, dst);
+            return;
+        }
+
+        const func_t func = funcs[src.depth()][dst.depth()][src.channels() - 1];
+        CV_DbgAssert(func != 0);
+
+        func(src, dst);
+    }
+
+    void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream) const
+    {
+        CV_Assert(src.depth() <= CV_64F && src.channels() <= 4);
+        CV_Assert(dst.depth() <= CV_64F);
+
+        if (src.depth() == CV_64F || dst.depth() == CV_64F)
+        {
+            if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+                CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+        }
+
+        cv::gpu::device::convertTo(src, dst, alpha, beta, stream);
+    }
+
+    void setTo(GpuMat& m, Scalar s, const GpuMat& mask, cudaStream_t stream) const
+    {
+        if (mask.empty())
+        {
+            if (s[0] == 0.0 && s[1] == 0.0 && s[2] == 0.0 && s[3] == 0.0)
+            {
+                cudaSafeCall( cudaMemset2D(m.data, m.step, 0, m.cols * m.elemSize(), m.rows) );
+                return;
+            }
+
+            if (m.depth() == CV_8U)
+            {
+                int cn = m.channels();
+
+                if (cn == 1 || (cn == 2 && s[0] == s[1]) || (cn == 3 && s[0] == s[1] && s[0] == s[2]) || (cn == 4 && s[0] == s[1] && s[0] == s[2] && s[0] == s[3]))
+                {
+                    int val = saturate_cast<uchar>(s[0]);
+                    cudaSafeCall( cudaMemset2D(m.data, m.step, val, m.cols * m.elemSize(), m.rows) );
+                    return;
+                }
+            }
+
+            typedef void (*func_t)(GpuMat& src, Scalar s);
+            static const func_t funcs[7][4] =
+            {
+                {NppSet::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSet::call},
+                {cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo},
+                {NppSet::call, NppSet::call, cv::gpu::device::setTo, NppSet::call},
+                {NppSet::call, NppSet::call, cv::gpu::device::setTo, NppSet::call},
+                {NppSet::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSet::call},
+                {NppSet::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSet::call},
+                {cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo}
+            };
+
+            CV_Assert(m.depth() <= CV_64F && m.channels() <= 4);
+
+            if (m.depth() == CV_64F)
+            {
+                if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+                    CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+            }
+
+            if (stream)
+                cv::gpu::device::setTo(m, s, stream);
+            else
+                funcs[m.depth()][m.channels() - 1](m, s);
+        }
+        else
+        {
+            typedef void (*func_t)(GpuMat& src, Scalar s, const GpuMat& mask);
+            static const func_t funcs[7][4] =
+            {
+                {NppSetMask::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask::call},
+                {cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo},
+                {NppSetMask::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask::call},
+                {NppSetMask::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask::call},
+                {NppSetMask::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask::call},
+                {NppSetMask::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask::call},
+                {cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo}
+            };
+
+            CV_Assert(m.depth() <= CV_64F && m.channels() <= 4);
+
+            if (m.depth() == CV_64F)
+            {
+                if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+                    CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+            }
+
+            if (stream)
+                cv::gpu::device::setTo(m, s, mask, stream);
+            else
+                funcs[m.depth()][m.channels() - 1](m, s, mask);
+        }
+    }
+
+    void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const
+    {
+        cudaSafeCall( cudaMallocPitch(devPtr, step, width, height) );
+    }
+
+    void free(void* devPtr) const
+    {
+        cudaFree(devPtr);
+    }
+};
 #endif
 #endif
\ No newline at end of file
diff --git a/modules/dynamicuda/src/main.cpp b/modules/dynamicuda/src/main.cpp
index 4a05d8696..8eb66fd98 100644
--- a/modules/dynamicuda/src/main.cpp
+++ b/modules/dynamicuda/src/main.cpp
@@ -39,6 +39,9 @@ static EmptyFuncTable gpuTable;
 
 extern "C" {
 
+DeviceInfoFuncTable* deviceInfoFactory();
+GpuFuncTable* gpuFactory();
+
 DeviceInfoFuncTable* deviceInfoFactory()
 {
     return (DeviceInfoFuncTable*)&deviceInfoTable;
diff --git a/modules/java/CMakeLists.txt b/modules/java/CMakeLists.txt
index 291295fb5..3a6ebe836 100644
--- a/modules/java/CMakeLists.txt
+++ b/modules/java/CMakeLists.txt
@@ -297,7 +297,7 @@ if(BUILD_FAT_JAVA_LIB)
       list(REMOVE_ITEM __deps ${m})
     endif()
   endforeach()
-  if (HAVE_opencv_dynamicuda)
+  if (ENABLE_DYNAMIC_CUDA)
     list(REMOVE_ITEM __deps "opencv_dynamicuda")
   endif()
   if (ANDROID AND HAVE_opencv_gpu)
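
Note on how the two extern "C" factories in modules/dynamicuda/src/main.cpp are meant
to be consumed: with ENABLE_DYNAMIC_CUDA=ON, opencv_core is compiled with
DYNAMIC_CUDA_SUPPORT and without a static CUDA dependency, so it has to locate the
SHARED opencv_dynamicuda module at runtime. The loader below is only an illustrative
sketch, not code from this patch; the library name "libopencv_dynamicuda.so" and the
dlopen()/dlsym()-based lookup are assumptions about the Linux/Android case.

    // Hypothetical loader sketch (not part of this patch): resolves the
    // gpuFactory symbol exported by modules/dynamicuda/src/main.cpp.
    #include <dlfcn.h>

    static GpuFuncTable* loadGpuFuncTable()
    {
        // Assumed library name: the module is declared SHARED, so a Linux
        // build should produce libopencv_dynamicuda.so.
        void* handle = dlopen("libopencv_dynamicuda.so", RTLD_LAZY);
        if (!handle)
            return 0; // caller falls back to a CPU-only function table

        typedef GpuFuncTable* (*GpuFactory)();
        GpuFactory factory = (GpuFactory)dlsym(handle, "gpuFactory");
        return factory ? factory() : 0;
    }

The deviceInfoFactory symbol would be resolved the same way to obtain the
DeviceInfoFuncTable implementation.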