From d4087f19a2aa38c00b101b01d06c60dc70edf5d0 Mon Sep 17 00:00:00 2001
From: Alexander Smorkalov
Date: Wed, 11 Dec 2013 16:38:30 +0400
Subject: [PATCH 01/13] All CUDA-related stuff was moved to a separate dynamic library.

---
 modules/core/CMakeLists.txt                  |   23 +-
 modules/core/cuda/CMakeLists.txt             |   11 +
 modules/core/cuda/main.cpp                   |   23 +
 modules/core/include/opencv2/core/gpumat.hpp |    2 +
 modules/core/src/gpumat.cpp                  | 1145 ++----------------
 modules/core/src/gpumat_cuda.hpp             | 1069 ++++++++++++++++
 6 files changed, 1201 insertions(+), 1072 deletions(-)
 create mode 100644 modules/core/cuda/CMakeLists.txt
 create mode 100644 modules/core/cuda/main.cpp
 create mode 100644 modules/core/src/gpumat_cuda.hpp

diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt
index 66b8ae0d2..595198292 100644
--- a/modules/core/CMakeLists.txt
+++ b/modules/core/CMakeLists.txt
@@ -1,22 +1,27 @@
 set(the_description "The Core Functionality")
-ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES})
-ocv_module_include_directories(${ZLIB_INCLUDE_DIR})
 
 if(HAVE_WINRT)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /ZW /GS /Gm- /AI\"${WINDOWS_SDK_PATH}/References/CommonConfiguration/Neutral\" /AI\"${VISUAL_STUDIO_PATH}/vcpackages\"")
 endif()
 
-if(HAVE_CUDA)
-  ocv_include_directories("${OpenCV_SOURCE_DIR}/modules/gpu/include")
-  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef)
-endif()
-
 file(GLOB lib_cuda_hdrs "include/opencv2/${name}/cuda/*.hpp" "include/opencv2/${name}/cuda/*.h")
 file(GLOB lib_cuda_hdrs_detail "include/opencv2/${name}/cuda/detail/*.hpp" "include/opencv2/${name}/cuda/detail/*.h")
 
 source_group("Cuda Headers" FILES ${lib_cuda_hdrs})
 source_group("Cuda Headers\\Detail" FILES ${lib_cuda_hdrs_detail})
 
+if(DYNAMIC_CUDA_SUPPORT)
+  add_definitions(-DDYNAMIC_CUDA_SUPPORT)
+endif()
+
+ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES})
+ocv_module_include_directories(${ZLIB_INCLUDE_DIR})
+
+if(HAVE_CUDA)
+  ocv_include_directories("${OpenCV_SOURCE_DIR}/modules/gpu/include")
+  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef)
+endif()
+
 ocv_glob_module_sources(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc"
                         HEADERS ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail})
 
@@ -25,3 +30,7 @@ ocv_add_precompiled_headers(${the_module})
 
 ocv_add_accuracy_tests()
 ocv_add_perf_tests()
+
+if(DYNAMIC_CUDA_SUPPORT)
+  add_subdirectory(cuda)
+endif()
diff --git a/modules/core/cuda/CMakeLists.txt b/modules/core/cuda/CMakeLists.txt
new file mode 100644
index 000000000..0b1c9428d
--- /dev/null
+++ b/modules/core/cuda/CMakeLists.txt
@@ -0,0 +1,11 @@
+project(opencv_core_cuda)
+set(HAVE_CUDA FALSE)
+add_definitions("-DHAVE_CUDA")
+include_directories(${CUDA_INCLUDE_DIRS}
+    "../src/"
+    "../include/opencv2/core/"
+    "${OpenCV_SOURCE_DIR}/modules/gpu/include"
+    )
+ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef)
+cuda_add_library(opencv_core_cuda SHARED main.cpp ../src/cuda/matrix_operations.cu)
+target_link_libraries(opencv_core_cuda ${CUDA_LIBRARIES})
\ No newline at end of file
diff --git a/modules/core/cuda/main.cpp b/modules/core/cuda/main.cpp
new file mode 100644
index 000000000..c4b8cbe1d
--- /dev/null
+++ b/modules/core/cuda/main.cpp
@@ -0,0 +1,23 @@
+#include "opencv2/core/core.hpp"
+#include "opencv2/core/gpumat.hpp"
+
+#ifdef HAVE_CUDA
+#include <cuda_runtime.h>
+#include <npp.h>
+
+#define CUDART_MINIMUM_REQUIRED_VERSION 4020
+#define NPP_MINIMUM_REQUIRED_VERSION 4200
+
+#if (CUDART_VERSION < CUDART_MINIMUM_REQUIRED_VERSION)
+#error "Insufficient Cuda Runtime library version, please update it."
+#endif
+
+#if (NPP_VERSION_MAJOR * 1000 + NPP_VERSION_MINOR * 100 + NPP_VERSION_BUILD < NPP_MINIMUM_REQUIRED_VERSION)
+#error "Insufficient NPP version, please update it."
+#endif
+#endif
+
+using namespace cv;
+using namespace cv::gpu;
+
+#include "gpumat_cuda.hpp"
\ No newline at end of file
diff --git a/modules/core/include/opencv2/core/gpumat.hpp b/modules/core/include/opencv2/core/gpumat.hpp
index 193c9aa70..b50210213 100644
--- a/modules/core/include/opencv2/core/gpumat.hpp
+++ b/modules/core/include/opencv2/core/gpumat.hpp
@@ -48,6 +48,8 @@
 #include "opencv2/core/core.hpp"
 #include "opencv2/core/cuda_devptrs.hpp"
 
+#define throw_nogpu CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support")
+
 namespace cv { namespace gpu
 {
     //////////////////////////////// Initialization & Info ////////////////////////
diff --git a/modules/core/src/gpumat.cpp b/modules/core/src/gpumat.cpp
index 4c4af61c4..9a2e36cb6 100644
--- a/modules/core/src/gpumat.cpp
+++ b/modules/core/src/gpumat.cpp
@@ -44,7 +44,7 @@
 #include "opencv2/core/gpumat.hpp"
 #include <iostream>
 
-#ifdef HAVE_CUDA
+#if defined(HAVE_CUDA)
     #include <cuda_runtime.h>
    #include <npp.h>
@@ -64,489 +64,62 @@ using namespace std;
 using namespace cv;
 using namespace cv::gpu;
 
-#ifndef HAVE_CUDA
-
-#define throw_nogpu CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support")
-
-#else // HAVE_CUDA
+#include "gpumat_cuda.hpp"
 
 namespace
 {
-#define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, CV_Func)
-#define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__, CV_Func)
-
-    inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
+    const GpuFuncTable* gpuFuncTable()
     {
-        if (cudaSuccess != err)
-            cv::gpu::error(cudaGetErrorString(err), file, line, func);
-    }
-
-    inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "")
-    {
-        if (err < 0)
-        {
-            std::ostringstream msg;
-            msg << "NPP API Call Error: " << err;
-            cv::gpu::error(msg.str().c_str(), file, line, func);
-        }
+        static EmptyFuncTable funcTable;
+        return &funcTable;
     }
 }
 
-#endif // HAVE_CUDA
-
 //////////////////////////////// Initialization & Info ////////////////////////
 
-#ifndef HAVE_CUDA
+int cv::gpu::getCudaEnabledDeviceCount() { return gpuFuncTable()->getCudaEnabledDeviceCount(); }
 
-int cv::gpu::getCudaEnabledDeviceCount() { return 0; }
+void cv::gpu::setDevice(int device) { gpuFuncTable()->setDevice(device); }
+int cv::gpu::getDevice() { return gpuFuncTable()->getDevice(); }
 
-void cv::gpu::setDevice(int) { throw_nogpu; }
-int cv::gpu::getDevice() { throw_nogpu; return 0; }
+void cv::gpu::resetDevice() { gpuFuncTable()->resetDevice(); }
 
-void cv::gpu::resetDevice() { throw_nogpu; }
+bool cv::gpu::deviceSupports(FeatureSet feature_set) { return gpuFuncTable()->deviceSupports(feature_set); }
 
-bool cv::gpu::deviceSupports(FeatureSet) { throw_nogpu; return false; }
+bool cv::gpu::TargetArchs::builtWith(FeatureSet feature_set) { return gpuFuncTable()->builtWith(feature_set); }
+bool cv::gpu::TargetArchs::has(int major, int minor) { return gpuFuncTable()->has(major, minor); }
+bool cv::gpu::TargetArchs::hasPtx(int major, int minor) { return gpuFuncTable()->hasPtx(major, minor); }
+bool cv::gpu::TargetArchs::hasBin(int major, int minor) { return gpuFuncTable()->hasBin(major, minor); }
+bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor) { return gpuFuncTable()->hasEqualOrLessPtx(major, minor); }
+bool cv::gpu::TargetArchs::hasEqualOrGreater(int major, int minor) { return
gpuFuncTable()->hasEqualOrGreater(major, minor); } +bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor) { return gpuFuncTable()->hasEqualOrGreaterPtx(major, minor); } +bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor) { return gpuFuncTable()->hasEqualOrGreaterBin(major, minor); } -bool cv::gpu::TargetArchs::builtWith(FeatureSet) { throw_nogpu; return false; } -bool cv::gpu::TargetArchs::has(int, int) { throw_nogpu; return false; } -bool cv::gpu::TargetArchs::hasPtx(int, int) { throw_nogpu; return false; } -bool cv::gpu::TargetArchs::hasBin(int, int) { throw_nogpu; return false; } -bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int, int) { throw_nogpu; return false; } -bool cv::gpu::TargetArchs::hasEqualOrGreater(int, int) { throw_nogpu; return false; } -bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int, int) { throw_nogpu; return false; } -bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int, int) { throw_nogpu; return false; } +size_t cv::gpu::DeviceInfo::sharedMemPerBlock() const { return gpuFuncTable()->sharedMemPerBlock(); } +void cv::gpu::DeviceInfo::queryMemory(size_t& total_memory, size_t& free_memory) const { gpuFuncTable()->queryMemory(total_memory, free_memory); } +size_t cv::gpu::DeviceInfo::freeMemory() const { return gpuFuncTable()->freeMemory(); } +size_t cv::gpu::DeviceInfo::totalMemory() const { return gpuFuncTable()->totalMemory(); } +bool cv::gpu::DeviceInfo::supports(FeatureSet feature_set) const { return gpuFuncTable()->supports(feature_set); } +bool cv::gpu::DeviceInfo::isCompatible() const { return gpuFuncTable()->isCompatible(); } +void cv::gpu::DeviceInfo::query() { gpuFuncTable()->query(); } -size_t cv::gpu::DeviceInfo::sharedMemPerBlock() const { throw_nogpu; return 0; } -void cv::gpu::DeviceInfo::queryMemory(size_t&, size_t&) const { throw_nogpu; } -size_t cv::gpu::DeviceInfo::freeMemory() const { throw_nogpu; return 0; } -size_t cv::gpu::DeviceInfo::totalMemory() const { throw_nogpu; return 0; } -bool cv::gpu::DeviceInfo::supports(FeatureSet) const { throw_nogpu; return false; } -bool cv::gpu::DeviceInfo::isCompatible() const { throw_nogpu; return false; } -void cv::gpu::DeviceInfo::query() { throw_nogpu; } +void cv::gpu::printCudaDeviceInfo(int device) { gpuFuncTable()->printCudaDeviceInfo(device); } +void cv::gpu::printShortCudaDeviceInfo(int device) { gpuFuncTable()->printShortCudaDeviceInfo(device); } -void cv::gpu::printCudaDeviceInfo(int) { throw_nogpu; } -void cv::gpu::printShortCudaDeviceInfo(int) { throw_nogpu; } +#ifdef HAVE_CUDA -#else // HAVE_CUDA - -int cv::gpu::getCudaEnabledDeviceCount() +namespace cv { namespace gpu { - int count; - cudaError_t error = cudaGetDeviceCount( &count ); - - if (error == cudaErrorInsufficientDriver) - return -1; - - if (error == cudaErrorNoDevice) - return 0; - - cudaSafeCall( error ); - return count; -} - -void cv::gpu::setDevice(int device) -{ - cudaSafeCall( cudaSetDevice( device ) ); -} - -int cv::gpu::getDevice() -{ - int device; - cudaSafeCall( cudaGetDevice( &device ) ); - return device; -} - -void cv::gpu::resetDevice() -{ - cudaSafeCall( cudaDeviceReset() ); -} - -namespace -{ - class CudaArch - { - public: - CudaArch(); - - bool builtWith(FeatureSet feature_set) const; - bool hasPtx(int major, int minor) const; - bool hasBin(int major, int minor) const; - bool hasEqualOrLessPtx(int major, int minor) const; - bool hasEqualOrGreaterPtx(int major, int minor) const; - bool hasEqualOrGreaterBin(int major, int minor) const; - - private: - static void fromStr(const string& 
set_as_str, vector& arr); - - vector bin; - vector ptx; - vector features; - }; - - const CudaArch cudaArch; - - CudaArch::CudaArch() - { - fromStr(CUDA_ARCH_BIN, bin); - fromStr(CUDA_ARCH_PTX, ptx); - fromStr(CUDA_ARCH_FEATURES, features); - } - - bool CudaArch::builtWith(FeatureSet feature_set) const - { - return !features.empty() && (features.back() >= feature_set); - } - - bool CudaArch::hasPtx(int major, int minor) const - { - return find(ptx.begin(), ptx.end(), major * 10 + minor) != ptx.end(); - } - - bool CudaArch::hasBin(int major, int minor) const - { - return find(bin.begin(), bin.end(), major * 10 + minor) != bin.end(); - } - - bool CudaArch::hasEqualOrLessPtx(int major, int minor) const - { - return !ptx.empty() && (ptx.front() <= major * 10 + minor); - } - - bool CudaArch::hasEqualOrGreaterPtx(int major, int minor) const - { - return !ptx.empty() && (ptx.back() >= major * 10 + minor); - } - - bool CudaArch::hasEqualOrGreaterBin(int major, int minor) const - { - return !bin.empty() && (bin.back() >= major * 10 + minor); - } - - void CudaArch::fromStr(const string& set_as_str, vector& arr) - { - if (set_as_str.find_first_not_of(" ") == string::npos) - return; - - istringstream stream(set_as_str); - int cur_value; - - while (!stream.eof()) - { - stream >> cur_value; - arr.push_back(cur_value); - } - - sort(arr.begin(), arr.end()); - } -} - -bool cv::gpu::TargetArchs::builtWith(cv::gpu::FeatureSet feature_set) -{ - return cudaArch.builtWith(feature_set); -} - -bool cv::gpu::TargetArchs::has(int major, int minor) -{ - return hasPtx(major, minor) || hasBin(major, minor); -} - -bool cv::gpu::TargetArchs::hasPtx(int major, int minor) -{ - return cudaArch.hasPtx(major, minor); -} - -bool cv::gpu::TargetArchs::hasBin(int major, int minor) -{ - return cudaArch.hasBin(major, minor); -} - -bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor) -{ - return cudaArch.hasEqualOrLessPtx(major, minor); -} - -bool cv::gpu::TargetArchs::hasEqualOrGreater(int major, int minor) -{ - return hasEqualOrGreaterPtx(major, minor) || hasEqualOrGreaterBin(major, minor); -} - -bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor) -{ - return cudaArch.hasEqualOrGreaterPtx(major, minor); -} - -bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor) -{ - return cudaArch.hasEqualOrGreaterBin(major, minor); -} - -bool cv::gpu::deviceSupports(FeatureSet feature_set) -{ - static int versions[] = - { - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 - }; - static const int cache_size = static_cast(sizeof(versions) / sizeof(versions[0])); - - const int devId = getDevice(); - - int version; - - if (devId < cache_size && versions[devId] >= 0) - version = versions[devId]; - else - { - DeviceInfo dev(devId); - version = dev.majorVersion() * 10 + dev.minorVersion(); - if (devId < cache_size) - versions[devId] = version; - } - - return TargetArchs::builtWith(feature_set) && (version >= feature_set); -} - -namespace -{ - class DeviceProps - { - public: - DeviceProps(); - ~DeviceProps(); - - cudaDeviceProp* get(int devID); - - private: - std::vector props_; - }; - - DeviceProps::DeviceProps() - { - props_.resize(10, 0); - } - - DeviceProps::~DeviceProps() - { - for (size_t i = 0; i < props_.size(); ++i) - { - if (props_[i]) - delete props_[i]; - } - props_.clear(); - } - - cudaDeviceProp* DeviceProps::get(int devID) - { - if (devID >= (int) props_.size()) - props_.resize(devID + 5, 0); - - if (!props_[devID]) - { - props_[devID] = new cudaDeviceProp; - cudaSafeCall( 
cudaGetDeviceProperties(props_[devID], devID) ); - } - - return props_[devID]; - } - - DeviceProps deviceProps; -} - -size_t cv::gpu::DeviceInfo::sharedMemPerBlock() const -{ - return deviceProps.get(device_id_)->sharedMemPerBlock; -} - -void cv::gpu::DeviceInfo::queryMemory(size_t& _totalMemory, size_t& _freeMemory) const -{ - int prevDeviceID = getDevice(); - if (prevDeviceID != device_id_) - setDevice(device_id_); - - cudaSafeCall( cudaMemGetInfo(&_freeMemory, &_totalMemory) ); - - if (prevDeviceID != device_id_) - setDevice(prevDeviceID); -} - -size_t cv::gpu::DeviceInfo::freeMemory() const -{ - size_t _totalMemory, _freeMemory; - queryMemory(_totalMemory, _freeMemory); - return _freeMemory; -} - -size_t cv::gpu::DeviceInfo::totalMemory() const -{ - size_t _totalMemory, _freeMemory; - queryMemory(_totalMemory, _freeMemory); - return _totalMemory; -} - -bool cv::gpu::DeviceInfo::supports(FeatureSet feature_set) const -{ - int version = majorVersion() * 10 + minorVersion(); - return version >= feature_set; -} - -bool cv::gpu::DeviceInfo::isCompatible() const -{ - // Check PTX compatibility - if (TargetArchs::hasEqualOrLessPtx(majorVersion(), minorVersion())) - return true; - - // Check BIN compatibility - for (int i = minorVersion(); i >= 0; --i) - if (TargetArchs::hasBin(majorVersion(), i)) - return true; - - return false; -} - -void cv::gpu::DeviceInfo::query() -{ - const cudaDeviceProp* prop = deviceProps.get(device_id_); - - name_ = prop->name; - multi_processor_count_ = prop->multiProcessorCount; - majorVersion_ = prop->major; - minorVersion_ = prop->minor; -} - -namespace -{ - int convertSMVer2Cores(int major, int minor) - { - // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM - typedef struct { - int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version - int Cores; - } SMtoCores; - - SMtoCores gpuArchCoresPerSM[] = { { 0x10, 8 }, { 0x11, 8 }, { 0x12, 8 }, { 0x13, 8 }, { 0x20, 32 }, { 0x21, 48 }, {0x30, 192}, {0x35, 192}, { -1, -1 } }; - - int index = 0; - while (gpuArchCoresPerSM[index].SM != -1) - { - if (gpuArchCoresPerSM[index].SM == ((major << 4) + minor) ) - return gpuArchCoresPerSM[index].Cores; - index++; - } - - return -1; - } -} - -void cv::gpu::printCudaDeviceInfo(int device) -{ - int count = getCudaEnabledDeviceCount(); - bool valid = (device >= 0) && (device < count); - - int beg = valid ? device : 0; - int end = valid ? 
device+1 : count; - - printf("*** CUDA Device Query (Runtime API) version (CUDART static linking) *** \n\n"); - printf("Device count: %d\n", count); - - int driverVersion = 0, runtimeVersion = 0; - cudaSafeCall( cudaDriverGetVersion(&driverVersion) ); - cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) ); - - const char *computeMode[] = { - "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)", - "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)", - "Prohibited (no host thread can use ::cudaSetDevice() with this device)", - "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)", - "Unknown", - NULL - }; - - for(int dev = beg; dev < end; ++dev) - { - cudaDeviceProp prop; - cudaSafeCall( cudaGetDeviceProperties(&prop, dev) ); - - printf("\nDevice %d: \"%s\"\n", dev, prop.name); - printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100); - printf(" CUDA Capability Major/Minor version number: %d.%d\n", prop.major, prop.minor); - printf(" Total amount of global memory: %.0f MBytes (%llu bytes)\n", (float)prop.totalGlobalMem/1048576.0f, (unsigned long long) prop.totalGlobalMem); - - int cores = convertSMVer2Cores(prop.major, prop.minor); - if (cores > 0) - printf(" (%2d) Multiprocessors x (%2d) CUDA Cores/MP: %d CUDA Cores\n", prop.multiProcessorCount, cores, cores * prop.multiProcessorCount); - - printf(" GPU Clock Speed: %.2f GHz\n", prop.clockRate * 1e-6f); - - printf(" Max Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n", - prop.maxTexture1D, prop.maxTexture2D[0], prop.maxTexture2D[1], - prop.maxTexture3D[0], prop.maxTexture3D[1], prop.maxTexture3D[2]); - printf(" Max Layered Texture Size (dim) x layers 1D=(%d) x %d, 2D=(%d,%d) x %d\n", - prop.maxTexture1DLayered[0], prop.maxTexture1DLayered[1], - prop.maxTexture2DLayered[0], prop.maxTexture2DLayered[1], prop.maxTexture2DLayered[2]); - - printf(" Total amount of constant memory: %u bytes\n", (int)prop.totalConstMem); - printf(" Total amount of shared memory per block: %u bytes\n", (int)prop.sharedMemPerBlock); - printf(" Total number of registers available per block: %d\n", prop.regsPerBlock); - printf(" Warp size: %d\n", prop.warpSize); - printf(" Maximum number of threads per block: %d\n", prop.maxThreadsPerBlock); - printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]); - printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]); - printf(" Maximum memory pitch: %u bytes\n", (int)prop.memPitch); - printf(" Texture alignment: %u bytes\n", (int)prop.textureAlignment); - - printf(" Concurrent copy and execution: %s with %d copy engine(s)\n", (prop.deviceOverlap ? "Yes" : "No"), prop.asyncEngineCount); - printf(" Run time limit on kernels: %s\n", prop.kernelExecTimeoutEnabled ? "Yes" : "No"); - printf(" Integrated GPU sharing Host Memory: %s\n", prop.integrated ? "Yes" : "No"); - printf(" Support host page-locked memory mapping: %s\n", prop.canMapHostMemory ? "Yes" : "No"); - - printf(" Concurrent kernel execution: %s\n", prop.concurrentKernels ? "Yes" : "No"); - printf(" Alignment requirement for Surfaces: %s\n", prop.surfaceAlignment ? "Yes" : "No"); - printf(" Device has ECC support enabled: %s\n", prop.ECCEnabled ? 
"Yes" : "No"); - printf(" Device is using TCC driver mode: %s\n", prop.tccDriver ? "Yes" : "No"); - printf(" Device supports Unified Addressing (UVA): %s\n", prop.unifiedAddressing ? "Yes" : "No"); - printf(" Device PCI Bus ID / PCI location ID: %d / %d\n", prop.pciBusID, prop.pciDeviceID ); - printf(" Compute Mode:\n"); - printf(" %s \n", computeMode[prop.computeMode]); - } - - printf("\n"); - printf("deviceQuery, CUDA Driver = CUDART"); - printf(", CUDA Driver Version = %d.%d", driverVersion / 1000, driverVersion % 100); - printf(", CUDA Runtime Version = %d.%d", runtimeVersion/1000, runtimeVersion%100); - printf(", NumDevs = %d\n\n", count); - fflush(stdout); -} - -void cv::gpu::printShortCudaDeviceInfo(int device) -{ - int count = getCudaEnabledDeviceCount(); - bool valid = (device >= 0) && (device < count); - - int beg = valid ? device : 0; - int end = valid ? device+1 : count; - - int driverVersion = 0, runtimeVersion = 0; - cudaSafeCall( cudaDriverGetVersion(&driverVersion) ); - cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) ); - - for(int dev = beg; dev < end; ++dev) - { - cudaDeviceProp prop; - cudaSafeCall( cudaGetDeviceProperties(&prop, dev) ); - - const char *arch_str = prop.major < 2 ? " (not Fermi)" : ""; - printf("Device %d: \"%s\" %.0fMb", dev, prop.name, (float)prop.totalGlobalMem/1048576.0f); - printf(", sm_%d%d%s", prop.major, prop.minor, arch_str); - - int cores = convertSMVer2Cores(prop.major, prop.minor); - if (cores > 0) - printf(", %d cores", cores * prop.multiProcessorCount); - - printf(", Driver/Runtime ver.%d.%d/%d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100); - } - fflush(stdout); -} - -#endif // HAVE_CUDA + CV_EXPORTS void copyWithMask(const cv::gpu::GpuMat&, cv::gpu::GpuMat&, const cv::gpu::GpuMat&, cudaStream_t); + CV_EXPORTS void convertTo(const cv::gpu::GpuMat&, cv::gpu::GpuMat&); + CV_EXPORTS void convertTo(const cv::gpu::GpuMat&, cv::gpu::GpuMat&, double, double, cudaStream_t = 0); + CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar, cudaStream_t); + CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, cudaStream_t); + CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar); + CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&); +}} + +#endif //////////////////////////////// GpuMat /////////////////////////////// @@ -830,601 +403,6 @@ GpuMat cv::gpu::allocMatFromBuf(int rows, int cols, int type, GpuMat &mat) return mat = GpuMat(rows, cols, type); } -namespace -{ - class GpuFuncTable - { - public: - virtual ~GpuFuncTable() {} - - virtual void copy(const Mat& src, GpuMat& dst) const = 0; - virtual void copy(const GpuMat& src, Mat& dst) const = 0; - virtual void copy(const GpuMat& src, GpuMat& dst) const = 0; - - virtual void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const = 0; - - virtual void convert(const GpuMat& src, GpuMat& dst) const = 0; - virtual void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta) const = 0; - - virtual void setTo(GpuMat& m, Scalar s, const GpuMat& mask) const = 0; - - virtual void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const = 0; - virtual void free(void* devPtr) const = 0; - }; -} - -#ifndef HAVE_CUDA - -namespace -{ - class EmptyFuncTable : public GpuFuncTable - { - public: - void copy(const Mat&, GpuMat&) const { throw_nogpu; } - void copy(const GpuMat&, Mat&) const { throw_nogpu; } - void copy(const GpuMat&, GpuMat&) const { throw_nogpu; } - - void 
copyWithMask(const GpuMat&, GpuMat&, const GpuMat&) const { throw_nogpu; } - - void convert(const GpuMat&, GpuMat&) const { throw_nogpu; } - void convert(const GpuMat&, GpuMat&, double, double) const { throw_nogpu; } - - void setTo(GpuMat&, Scalar, const GpuMat&) const { throw_nogpu; } - - void mallocPitch(void**, size_t*, size_t, size_t) const { throw_nogpu; } - void free(void*) const {} - }; - - const GpuFuncTable* gpuFuncTable() - { - static EmptyFuncTable empty; - return ∅ - } -} - -#else // HAVE_CUDA - -namespace cv { namespace gpu { namespace device -{ - void copyToWithMask_gpu(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream); - - template - void set_to_gpu(PtrStepSzb mat, const T* scalar, int channels, cudaStream_t stream); - - template - void set_to_gpu(PtrStepSzb mat, const T* scalar, PtrStepSzb mask, int channels, cudaStream_t stream); - - void convert_gpu(PtrStepSzb src, int sdepth, PtrStepSzb dst, int ddepth, double alpha, double beta, cudaStream_t stream); -}}} - -namespace -{ - template void kernelSetCaller(GpuMat& src, Scalar s, cudaStream_t stream) - { - Scalar_ sf = s; - cv::gpu::device::set_to_gpu(src, sf.val, src.channels(), stream); - } - - template void kernelSetCaller(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream) - { - Scalar_ sf = s; - cv::gpu::device::set_to_gpu(src, sf.val, mask, src.channels(), stream); - } -} - - -namespace cv { namespace gpu -{ - CV_EXPORTS void copyWithMask(const cv::gpu::GpuMat&, cv::gpu::GpuMat&, const cv::gpu::GpuMat&, CUstream_st*); - CV_EXPORTS void convertTo(const cv::gpu::GpuMat&, cv::gpu::GpuMat&); - CV_EXPORTS void convertTo(const cv::gpu::GpuMat&, cv::gpu::GpuMat&, double, double, CUstream_st*); - CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar, CUstream_st*); - CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, CUstream_st*); - CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar); - CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&); -}} - - -namespace cv { namespace gpu -{ - void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream = 0) - { - CV_Assert(src.size() == dst.size() && src.type() == dst.type()); - CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels())); - - cv::gpu::device::copyToWithMask_gpu(src.reshape(1), dst.reshape(1), src.elemSize1(), src.channels(), mask.reshape(1), mask.channels() != 1, stream); - } - - void convertTo(const GpuMat& src, GpuMat& dst) - { - cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0, 0); - } - - void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0) - { - cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), alpha, beta, stream); - } - - void setTo(GpuMat& src, Scalar s, cudaStream_t stream) - { - typedef void (*caller_t)(GpuMat& src, Scalar s, cudaStream_t stream); - - static const caller_t callers[] = - { - kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, - kernelSetCaller, kernelSetCaller - }; - - callers[src.depth()](src, s, stream); - } - - void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream) - { - typedef void (*caller_t)(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream); - - static const caller_t callers[] = - { - kernelSetCaller, kernelSetCaller, kernelSetCaller, 
kernelSetCaller, kernelSetCaller, - kernelSetCaller, kernelSetCaller - }; - - callers[src.depth()](src, s, mask, stream); - } - - void setTo(GpuMat& src, Scalar s) - { - setTo(src, s, 0); - } - - void setTo(GpuMat& src, Scalar s, const GpuMat& mask) - { - setTo(src, s, mask, 0); - } -}} - -namespace -{ - template struct NPPTypeTraits; - template<> struct NPPTypeTraits { typedef Npp8u npp_type; }; - template<> struct NPPTypeTraits { typedef Npp8s npp_type; }; - template<> struct NPPTypeTraits { typedef Npp16u npp_type; }; - template<> struct NPPTypeTraits { typedef Npp16s npp_type; }; - template<> struct NPPTypeTraits { typedef Npp32s npp_type; }; - template<> struct NPPTypeTraits { typedef Npp32f npp_type; }; - template<> struct NPPTypeTraits { typedef Npp64f npp_type; }; - - ////////////////////////////////////////////////////////////////////////// - // Convert - - template struct NppConvertFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - typedef typename NPPTypeTraits::npp_type dst_t; - - typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI); - }; - template struct NppConvertFunc - { - typedef typename NPPTypeTraits::npp_type dst_t; - - typedef NppStatus (*func_ptr)(const Npp32f* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI, NppRoundMode eRoundMode); - }; - - template::func_ptr func> struct NppCvt - { - typedef typename NPPTypeTraits::npp_type src_t; - typedef typename NPPTypeTraits::npp_type dst_t; - - static void call(const GpuMat& src, GpuMat& dst) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - template::func_ptr func> struct NppCvt - { - typedef typename NPPTypeTraits::npp_type dst_t; - - static void call(const GpuMat& src, GpuMat& dst) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz, NPP_RND_NEAR) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - - ////////////////////////////////////////////////////////////////////////// - // Set - - template struct NppSetFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - - typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI); - }; - template struct NppSetFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - - typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI); - }; - template struct NppSetFunc - { - typedef NppStatus (*func_ptr)(Npp8s values[], Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI); - }; - template<> struct NppSetFunc - { - typedef NppStatus (*func_ptr)(Npp8s val, Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI); - }; - - template::func_ptr func> struct NppSet - { - typedef typename NPPTypeTraits::npp_type src_t; - - static void call(GpuMat& src, Scalar s) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - Scalar_ nppS = s; - - nppSafeCall( func(nppS.val, src.ptr(), static_cast(src.step), sz) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - template::func_ptr func> struct NppSet - { - typedef typename NPPTypeTraits::npp_type src_t; - - static void call(GpuMat& src, Scalar s) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - Scalar_ nppS = s; - - nppSafeCall( func(nppS[0], src.ptr(), static_cast(src.step), sz) ); - - 
cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - - template struct NppSetMaskFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - - typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); - }; - template struct NppSetMaskFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - - typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); - }; - - template::func_ptr func> struct NppSetMask - { - typedef typename NPPTypeTraits::npp_type src_t; - - static void call(GpuMat& src, Scalar s, const GpuMat& mask) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - Scalar_ nppS = s; - - nppSafeCall( func(nppS.val, src.ptr(), static_cast(src.step), sz, mask.ptr(), static_cast(mask.step)) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - template::func_ptr func> struct NppSetMask - { - typedef typename NPPTypeTraits::npp_type src_t; - - static void call(GpuMat& src, Scalar s, const GpuMat& mask) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - Scalar_ nppS = s; - - nppSafeCall( func(nppS[0], src.ptr(), static_cast(src.step), sz, mask.ptr(), static_cast(mask.step)) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - - ////////////////////////////////////////////////////////////////////////// - // CopyMasked - - template struct NppCopyMaskedFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - - typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, src_t* pDst, int nDstStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); - }; - - template::func_ptr func> struct NppCopyMasked - { - typedef typename NPPTypeTraits::npp_type src_t; - - static void call(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t /*stream*/) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz, mask.ptr(), static_cast(mask.step)) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - - template static inline bool isAligned(const T* ptr, size_t size) - { - return reinterpret_cast(ptr) % size == 0; - } - - ////////////////////////////////////////////////////////////////////////// - // CudaFuncTable - - class CudaFuncTable : public GpuFuncTable - { - public: - void copy(const Mat& src, GpuMat& dst) const - { - cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyHostToDevice) ); - } - void copy(const GpuMat& src, Mat& dst) const - { - cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToHost) ); - } - void copy(const GpuMat& src, GpuMat& dst) const - { - cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToDevice) ); - } - - void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const - { - CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); - CV_Assert(src.size() == dst.size() && src.type() == dst.type()); - CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels())); - - if (src.depth() == CV_64F) - { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) - CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); - } - - typedef void (*func_t)(const GpuMat& 
src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream); - static const func_t funcs[7][4] = - { - /* 8U */ {NppCopyMasked::call, cv::gpu::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 8S */ {cv::gpu::copyWithMask , cv::gpu::copyWithMask, cv::gpu::copyWithMask , cv::gpu::copyWithMask }, - /* 16U */ {NppCopyMasked::call, cv::gpu::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 16S */ {NppCopyMasked::call, cv::gpu::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 32S */ {NppCopyMasked::call, cv::gpu::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 32F */ {NppCopyMasked::call, cv::gpu::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 64F */ {cv::gpu::copyWithMask , cv::gpu::copyWithMask, cv::gpu::copyWithMask , cv::gpu::copyWithMask } - }; - - const func_t func = mask.channels() == src.channels() ? funcs[src.depth()][src.channels() - 1] : cv::gpu::copyWithMask; - - func(src, dst, mask, 0); - } - - void convert(const GpuMat& src, GpuMat& dst) const - { - typedef void (*func_t)(const GpuMat& src, GpuMat& dst); - static const func_t funcs[7][7][4] = - { - { - /* 8U -> 8U */ {0, 0, 0, 0}, - /* 8U -> 8S */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 8U -> 16U */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, NppCvt::call}, - /* 8U -> 16S */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, NppCvt::call}, - /* 8U -> 32S */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 8U -> 32F */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 8U -> 64F */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo } - }, - { - /* 8S -> 8U */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 8S -> 8S */ {0,0,0,0}, - /* 8S -> 16U */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 8S -> 16S */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 8S -> 32S */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 8S -> 32F */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 8S -> 64F */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo} - }, - { - /* 16U -> 8U */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, NppCvt::call}, - /* 16U -> 8S */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 16U -> 16U */ {0,0,0,0}, - /* 16U -> 16S */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 16U -> 32S */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 16U -> 32F */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 16U -> 64F */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo } - }, - { - /* 16S -> 8U */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, NppCvt::call}, - /* 16S -> 8S */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 16S -> 16U */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 16S -> 16S */ {0,0,0,0}, - /* 16S -> 32S */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 16S -> 32F */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo }, - /* 16S -> 64F */ 
{cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo } - }, - { - /* 32S -> 8U */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32S -> 8S */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32S -> 16U */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32S -> 16S */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32S -> 32S */ {0,0,0,0}, - /* 32S -> 32F */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32S -> 64F */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo} - }, - { - /* 32F -> 8U */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32F -> 8S */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32F -> 16U */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32F -> 16S */ {NppCvt::call, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32F -> 32S */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 32F -> 32F */ {0,0,0,0}, - /* 32F -> 64F */ {cv::gpu::convertTo , cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo} - }, - { - /* 64F -> 8U */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 64F -> 8S */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 64F -> 16U */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 64F -> 16S */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 64F -> 32S */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 64F -> 32F */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo}, - /* 64F -> 64F */ {0,0,0,0} - } - }; - - CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); - CV_Assert(dst.depth() <= CV_64F); - CV_Assert(src.size() == dst.size() && src.channels() == dst.channels()); - - if (src.depth() == CV_64F || dst.depth() == CV_64F) - { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) - CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); - } - - bool aligned = isAligned(src.data, 16) && isAligned(dst.data, 16); - if (!aligned) - { - cv::gpu::convertTo(src, dst); - return; - } - - const func_t func = funcs[src.depth()][dst.depth()][src.channels() - 1]; - CV_DbgAssert(func != 0); - - func(src, dst); - } - - void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta) const - { - CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); - CV_Assert(dst.depth() <= CV_64F); - - if (src.depth() == CV_64F || dst.depth() == CV_64F) - { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) - CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); - } - - cv::gpu::convertTo(src, dst, alpha, beta); - } - - void setTo(GpuMat& m, Scalar s, const GpuMat& mask) const - { - if (mask.empty()) - { - if (s[0] == 0.0 && s[1] == 0.0 && s[2] == 0.0 && s[3] == 0.0) - { - cudaSafeCall( cudaMemset2D(m.data, m.step, 0, m.cols * m.elemSize(), m.rows) ); - return; - } - - if (m.depth() == CV_8U) - { - int cn = m.channels(); - - if (cn == 1 || (cn == 2 && s[0] == s[1]) || (cn == 3 && s[0] == s[1] && s[0] == s[2]) || (cn == 4 && s[0] == s[1] && 
s[0] == s[2] && s[0] == s[3])) - { - int val = saturate_cast(s[0]); - cudaSafeCall( cudaMemset2D(m.data, m.step, val, m.cols * m.elemSize(), m.rows) ); - return; - } - } - - typedef void (*func_t)(GpuMat& src, Scalar s); - static const func_t funcs[7][4] = - { - {NppSet::call, cv::gpu::setTo , cv::gpu::setTo , NppSet::call}, - {cv::gpu::setTo , cv::gpu::setTo , cv::gpu::setTo , cv::gpu::setTo }, - {NppSet::call, NppSet::call, cv::gpu::setTo , NppSet::call}, - {NppSet::call, NppSet::call, cv::gpu::setTo , NppSet::call}, - {NppSet::call, cv::gpu::setTo , cv::gpu::setTo , NppSet::call}, - {NppSet::call, cv::gpu::setTo , cv::gpu::setTo , NppSet::call}, - {cv::gpu::setTo , cv::gpu::setTo , cv::gpu::setTo , cv::gpu::setTo } - }; - - CV_Assert(m.depth() <= CV_64F && m.channels() <= 4); - - if (m.depth() == CV_64F) - { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) - CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); - } - - funcs[m.depth()][m.channels() - 1](m, s); - } - else - { - typedef void (*func_t)(GpuMat& src, Scalar s, const GpuMat& mask); - static const func_t funcs[7][4] = - { - {NppSetMask::call, cv::gpu::setTo, cv::gpu::setTo, NppSetMask::call}, - {cv::gpu::setTo , cv::gpu::setTo, cv::gpu::setTo, cv::gpu::setTo }, - {NppSetMask::call, cv::gpu::setTo, cv::gpu::setTo, NppSetMask::call}, - {NppSetMask::call, cv::gpu::setTo, cv::gpu::setTo, NppSetMask::call}, - {NppSetMask::call, cv::gpu::setTo, cv::gpu::setTo, NppSetMask::call}, - {NppSetMask::call, cv::gpu::setTo, cv::gpu::setTo, NppSetMask::call}, - {cv::gpu::setTo , cv::gpu::setTo, cv::gpu::setTo, cv::gpu::setTo } - }; - - CV_Assert(m.depth() <= CV_64F && m.channels() <= 4); - - if (m.depth() == CV_64F) - { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) - CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); - } - - funcs[m.depth()][m.channels() - 1](m, s, mask); - } - } - - void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const - { - cudaSafeCall( cudaMallocPitch(devPtr, step, width, height) ); - } - - void free(void* devPtr) const - { - cudaFree(devPtr); - } - }; - - const GpuFuncTable* gpuFuncTable() - { - static CudaFuncTable funcTable; - return &funcTable; - } -} - -#endif // HAVE_CUDA - void cv::gpu::GpuMat::upload(const Mat& m) { CV_DbgAssert(!m.empty()); @@ -1492,9 +470,9 @@ void cv::gpu::GpuMat::convertTo(GpuMat& dst, int rtype, double alpha, double bet dst.create(size(), rtype); if (noScale) - gpuFuncTable()->convert(*psrc, dst); + cv::gpu::convertTo(*psrc, dst); else - gpuFuncTable()->convert(*psrc, dst, alpha, beta); + cv::gpu::convertTo(*psrc, dst, alpha, beta); } GpuMat& cv::gpu::GpuMat::setTo(Scalar s, const GpuMat& mask) @@ -1502,7 +480,7 @@ GpuMat& cv::gpu::GpuMat::setTo(Scalar s, const GpuMat& mask) CV_Assert(mask.empty() || mask.type() == CV_8UC1); CV_DbgAssert(!empty()); - gpuFuncTable()->setTo(*this, s, mask); + gpu::setTo(*this, s, mask); return *this; } @@ -1562,6 +540,43 @@ void cv::gpu::GpuMat::release() refcount = 0; } +#ifdef HAVE_CUDA + +namespace cv { namespace gpu +{ + void convertTo(const GpuMat& src, GpuMat& dst) + { + gpuFuncTable()->convert(src, dst); + } + + void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream) + { + gpuFuncTable()->convert(src, dst, alpha, beta, stream); + } + + void setTo(GpuMat& src, Scalar s, cudaStream_t stream) + { + gpuFuncTable()->setTo(src, s, stream); + } + + void setTo(GpuMat& src, Scalar 
s, const GpuMat& mask, cudaStream_t stream) + { + gpuFuncTable()->setTo(src, s, mask, stream); + } + + void setTo(GpuMat& src, Scalar s) + { + setTo(src, s, 0); + } + + void setTo(GpuMat& src, Scalar s, const GpuMat& mask) + { + setTo(src, s, mask, 0); + } +}} + +#endif + //////////////////////////////////////////////////////////////////////// // Error handling @@ -1578,5 +593,5 @@ void cv::gpu::error(const char *error_string, const char *file, const int line, cerr.flush(); } else - cv::error( cv::Exception(code, error_string, func, file, line) ); + ::cv::error( ::cv::Exception(code, error_string, func, file, line) ); } diff --git a/modules/core/src/gpumat_cuda.hpp b/modules/core/src/gpumat_cuda.hpp new file mode 100644 index 000000000..631d6ea8c --- /dev/null +++ b/modules/core/src/gpumat_cuda.hpp @@ -0,0 +1,1069 @@ +namespace +{ +#if defined(HAVE_CUDA) && !defined(DYNAMIC_CUDA_SUPPORT) + + #define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, CV_Func) + #define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__, CV_Func) + + inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "") + { + if (cudaSuccess != err) + cv::gpu::error(cudaGetErrorString(err), file, line, func); + } + + inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "") + { + if (err < 0) + { + std::ostringstream msg; + msg << "NPP API Call Error: " << err; + cv::gpu::error(msg.str().c_str(), file, line, func); + } + } +#endif +} + +namespace +{ + class GpuFuncTable + { + public: + virtual ~GpuFuncTable() {} + + // DeviceInfo routines + virtual int getCudaEnabledDeviceCount() const = 0; + + virtual void setDevice(int) const = 0; + virtual int getDevice() const = 0; + + virtual void resetDevice() const = 0; + + virtual bool deviceSupports(FeatureSet) const = 0; + + virtual bool builtWith(FeatureSet) const = 0; + virtual bool has(int, int) const = 0; + virtual bool hasPtx(int, int) const = 0; + virtual bool hasBin(int, int) const = 0; + virtual bool hasEqualOrLessPtx(int, int) const = 0; + virtual bool hasEqualOrGreater(int, int) const = 0; + virtual bool hasEqualOrGreaterPtx(int, int) const = 0; + virtual bool hasEqualOrGreaterBin(int, int) const = 0; + + virtual size_t sharedMemPerBlock() const = 0; + virtual void queryMemory(size_t&, size_t&) const = 0; + virtual size_t freeMemory() const = 0; + virtual size_t totalMemory() const = 0; + virtual bool supports(FeatureSet) const = 0; + virtual bool isCompatible() const = 0; + virtual void query() const = 0; + + virtual void printCudaDeviceInfo(int) const = 0; + virtual void printShortCudaDeviceInfo(int) const = 0; + + // GpuMat routines + virtual void copy(const Mat& src, GpuMat& dst) const = 0; + virtual void copy(const GpuMat& src, Mat& dst) const = 0; + virtual void copy(const GpuMat& src, GpuMat& dst) const = 0; + + virtual void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const = 0; + + // gpu::device::convertTo funcs + virtual void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0) const = 0; + virtual void convert(const GpuMat& src, GpuMat& dst) const = 0; + + // for gpu::device::setTo funcs + virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, CUstream_st*) const = 0; + virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, CUstream_st*) const = 0; + + virtual void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const = 0; + virtual void free(void* devPtr) const = 0; + 
}; +} + +#if !defined(HAVE_CUDA) || defined(DYNAMIC_CUDA_SUPPORT) +namespace +{ + class EmptyFuncTable : public GpuFuncTable + { + public: + + // DeviceInfo routines + int getCudaEnabledDeviceCount() const { return 0; } + + void setDevice(int) const { throw_nogpu; } + int getDevice() const { throw_nogpu; return 0; } + + void resetDevice() const { throw_nogpu; } + + bool deviceSupports(FeatureSet) const { throw_nogpu; return false; } + + bool builtWith(FeatureSet) const { throw_nogpu; return false; } + bool has(int, int) const { throw_nogpu; return false; } + bool hasPtx(int, int) const { throw_nogpu; return false; } + bool hasBin(int, int) const { throw_nogpu; return false; } + bool hasEqualOrLessPtx(int, int) const { throw_nogpu; return false; } + bool hasEqualOrGreater(int, int) const { throw_nogpu; return false; } + bool hasEqualOrGreaterPtx(int, int) const { throw_nogpu; return false; } + bool hasEqualOrGreaterBin(int, int) const { throw_nogpu; return false; } + + size_t sharedMemPerBlock() const { throw_nogpu; return 0; } + void queryMemory(size_t&, size_t&) const { throw_nogpu; } + size_t freeMemory() const { throw_nogpu; return 0; } + size_t totalMemory() const { throw_nogpu; return 0; } + bool supports(FeatureSet) const { throw_nogpu; return false; } + bool isCompatible() const { throw_nogpu; return false; } + void query() const { throw_nogpu; } + + void printCudaDeviceInfo(int) const { throw_nogpu; } + void printShortCudaDeviceInfo(int) const { throw_nogpu; } + + void copy(const Mat&, GpuMat&) const { throw_nogpu; } + void copy(const GpuMat&, Mat&) const { throw_nogpu; } + void copy(const GpuMat&, GpuMat&) const { throw_nogpu; } + + void copyWithMask(const GpuMat&, GpuMat&, const GpuMat&) const { throw_nogpu; } + + void convert(const GpuMat&, GpuMat&) const { throw_nogpu; } + void convert(const GpuMat&, GpuMat&, double, double, cudaStream_t stream = 0) const { (void)stream; throw_nogpu; } + + virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, CUstream_st*) const { throw_nogpu; } + virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, CUstream_st*) const { throw_nogpu; } + + void mallocPitch(void**, size_t*, size_t, size_t) const { throw_nogpu; } + void free(void*) const {} + }; +} + +#else + +namespace cv { namespace gpu { namespace device +{ + void copyToWithMask_gpu(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream); + + template + void set_to_gpu(PtrStepSzb mat, const T* scalar, int channels, cudaStream_t stream); + + template + void set_to_gpu(PtrStepSzb mat, const T* scalar, PtrStepSzb mask, int channels, cudaStream_t stream); + + void convert_gpu(PtrStepSzb src, int sdepth, PtrStepSzb dst, int ddepth, double alpha, double beta, cudaStream_t stream); +}}} + +namespace +{ + template void kernelSetCaller(GpuMat& src, Scalar s, cudaStream_t stream) + { + Scalar_ sf = s; + cv::gpu::device::set_to_gpu(src, sf.val, src.channels(), stream); + } + + template void kernelSetCaller(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream) + { + Scalar_ sf = s; + cv::gpu::device::set_to_gpu(src, sf.val, mask, src.channels(), stream); + } +} + +namespace +{ + template struct NPPTypeTraits; + template<> struct NPPTypeTraits { typedef Npp8u npp_type; }; + template<> struct NPPTypeTraits { typedef Npp8s npp_type; }; + template<> struct NPPTypeTraits { typedef Npp16u npp_type; }; + template<> struct NPPTypeTraits { typedef Npp16s npp_type; }; + template<> struct NPPTypeTraits { typedef Npp32s npp_type; }; + 
template<> struct NPPTypeTraits { typedef Npp32f npp_type; }; + template<> struct NPPTypeTraits { typedef Npp64f npp_type; }; + + ////////////////////////////////////////////////////////////////////////// + // Convert + + template struct NppConvertFunc + { + typedef typename NPPTypeTraits::npp_type src_t; + typedef typename NPPTypeTraits::npp_type dst_t; + + typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI); + }; + template struct NppConvertFunc + { + typedef typename NPPTypeTraits::npp_type dst_t; + + typedef NppStatus (*func_ptr)(const Npp32f* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI, NppRoundMode eRoundMode); + }; + + template::func_ptr func> struct NppCvt + { + typedef typename NPPTypeTraits::npp_type src_t; + typedef typename NPPTypeTraits::npp_type dst_t; + + static void call(const GpuMat& src, GpuMat& dst) + { + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz) ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; + template::func_ptr func> struct NppCvt + { + typedef typename NPPTypeTraits::npp_type dst_t; + + static void call(const GpuMat& src, GpuMat& dst) + { + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz, NPP_RND_NEAR) ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; + + ////////////////////////////////////////////////////////////////////////// + // Set + + template struct NppSetFunc + { + typedef typename NPPTypeTraits::npp_type src_t; + + typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI); + }; + template struct NppSetFunc + { + typedef typename NPPTypeTraits::npp_type src_t; + + typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI); + }; + template struct NppSetFunc + { + typedef NppStatus (*func_ptr)(Npp8s values[], Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI); + }; + template<> struct NppSetFunc + { + typedef NppStatus (*func_ptr)(Npp8s val, Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI); + }; + + template::func_ptr func> struct NppSet + { + typedef typename NPPTypeTraits::npp_type src_t; + + static void call(GpuMat& src, Scalar s) + { + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + Scalar_ nppS = s; + + nppSafeCall( func(nppS.val, src.ptr(), static_cast(src.step), sz) ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; + template::func_ptr func> struct NppSet + { + typedef typename NPPTypeTraits::npp_type src_t; + + static void call(GpuMat& src, Scalar s) + { + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + Scalar_ nppS = s; + + nppSafeCall( func(nppS[0], src.ptr(), static_cast(src.step), sz) ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; + + template struct NppSetMaskFunc + { + typedef typename NPPTypeTraits::npp_type src_t; + + typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); + }; + template struct NppSetMaskFunc + { + typedef typename NPPTypeTraits::npp_type src_t; + + typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); + }; + + template::func_ptr func> struct NppSetMask + { + typedef typename NPPTypeTraits::npp_type src_t; + + static void call(GpuMat& src, Scalar s, const GpuMat& mask) + 
{ + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + Scalar_ nppS = s; + + nppSafeCall( func(nppS.val, src.ptr(), static_cast(src.step), sz, mask.ptr(), static_cast(mask.step)) ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; + template::func_ptr func> struct NppSetMask + { + typedef typename NPPTypeTraits::npp_type src_t; + + static void call(GpuMat& src, Scalar s, const GpuMat& mask) + { + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + Scalar_ nppS = s; + + nppSafeCall( func(nppS[0], src.ptr(), static_cast(src.step), sz, mask.ptr(), static_cast(mask.step)) ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; + + ////////////////////////////////////////////////////////////////////////// + // CopyMasked + + template struct NppCopyMaskedFunc + { + typedef typename NPPTypeTraits::npp_type src_t; + + typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, src_t* pDst, int nDstStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); + }; + + template::func_ptr func> struct NppCopyMasked + { + typedef typename NPPTypeTraits::npp_type src_t; + + static void call(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t /*stream*/) + { + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz, mask.ptr(), static_cast(mask.step)) ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; + + template static inline bool isAligned(const T* ptr, size_t size) + { + return reinterpret_cast(ptr) % size == 0; + } +} + + namespace cv { namespace gpu { namespace devices + { + void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream = 0) + { + CV_Assert(src.size() == dst.size() && src.type() == dst.type()); + CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels())); + + cv::gpu::device::copyToWithMask_gpu(src.reshape(1), dst.reshape(1), src.elemSize1(), src.channels(), mask.reshape(1), mask.channels() != 1, stream); + } + + void convertTo(const GpuMat& src, GpuMat& dst) + { + cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0, 0); + } + + void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0) + { + cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), alpha, beta, stream); + } + + void setTo(GpuMat& src, Scalar s, cudaStream_t stream) + { + typedef void (*caller_t)(GpuMat& src, Scalar s, cudaStream_t stream); + + static const caller_t callers[] = + { + kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, + kernelSetCaller, kernelSetCaller + }; + + callers[src.depth()](src, s, stream); + } + + void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream) + { + typedef void (*caller_t)(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream); + + static const caller_t callers[] = + { + kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, + kernelSetCaller, kernelSetCaller + }; + + callers[src.depth()](src, s, mask, stream); + } + + void setTo(GpuMat& src, Scalar s) + { + setTo(src, s, 0); + } + + void setTo(GpuMat& src, Scalar s, const GpuMat& mask) + { + setTo(src, s, mask, 0); + } + }} + +namespace +{ + class CudaFuncTable : public GpuFuncTable + { + protected: + + class CudaArch + { + public: + CudaArch(); + + bool builtWith(FeatureSet feature_set) 
const; + bool hasPtx(int major, int minor) const; + bool hasBin(int major, int minor) const; + bool hasEqualOrLessPtx(int major, int minor) const; + bool hasEqualOrGreaterPtx(int major, int minor) const; + bool hasEqualOrGreaterBin(int major, int minor) const; + + private: + static void fromStr(const string& set_as_str, vector& arr); + + vector bin; + vector ptx; + vector features; + }; + + const CudaArch cudaArch; + + CudaArch::CudaArch() + { + fromStr(CUDA_ARCH_BIN, bin); + fromStr(CUDA_ARCH_PTX, ptx); + fromStr(CUDA_ARCH_FEATURES, features); + } + + bool CudaArch::builtWith(FeatureSet feature_set) const + { + return !features.empty() && (features.back() >= feature_set); + } + + bool CudaArch::hasPtx(int major, int minor) const + { + return find(ptx.begin(), ptx.end(), major * 10 + minor) != ptx.end(); + } + + bool CudaArch::hasBin(int major, int minor) const + { + return find(bin.begin(), bin.end(), major * 10 + minor) != bin.end(); + } + + bool CudaArch::hasEqualOrLessPtx(int major, int minor) const + { + return !ptx.empty() && (ptx.front() <= major * 10 + minor); + } + + bool CudaArch::hasEqualOrGreaterPtx(int major, int minor) const + { + return !ptx.empty() && (ptx.back() >= major * 10 + minor); + } + + bool CudaArch::hasEqualOrGreaterBin(int major, int minor) const + { + return !bin.empty() && (bin.back() >= major * 10 + minor); + } + + void CudaArch::fromStr(const string& set_as_str, vector& arr) + { + if (set_as_str.find_first_not_of(" ") == string::npos) + return; + + istringstream stream(set_as_str); + int cur_value; + + while (!stream.eof()) + { + stream >> cur_value; + arr.push_back(cur_value); + } + + sort(arr.begin(), arr.end()); + } + + class DeviceProps + { + public: + DeviceProps(); + ~DeviceProps(); + + cudaDeviceProp* get(int devID); + + private: + std::vector props_; + }; + + DeviceProps::DeviceProps() + { + props_.resize(10, 0); + } + + DeviceProps::~DeviceProps() + { + for (size_t i = 0; i < props_.size(); ++i) + { + if (props_[i]) + delete props_[i]; + } + props_.clear(); + } + + cudaDeviceProp* DeviceProps::get(int devID) + { + if (devID >= (int) props_.size()) + props_.resize(devID + 5, 0); + + if (!props_[devID]) + { + props_[devID] = new cudaDeviceProp; + cudaSafeCall( cudaGetDeviceProperties(props_[devID], devID) ); + } + + return props_[devID]; + } + + DeviceProps deviceProps; + + int convertSMVer2Cores(int major, int minor) + { + // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM + typedef struct { + int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version + int Cores; + } SMtoCores; + + SMtoCores gpuArchCoresPerSM[] = { { 0x10, 8 }, { 0x11, 8 }, { 0x12, 8 }, { 0x13, 8 }, { 0x20, 32 }, { 0x21, 48 }, {0x30, 192}, {0x35, 192}, { -1, -1 } }; + + int index = 0; + while (gpuArchCoresPerSM[index].SM != -1) + { + if (gpuArchCoresPerSM[index].SM == ((major << 4) + minor) ) + return gpuArchCoresPerSM[index].Cores; + index++; + } + + return -1; + } + + public: + + int getCudaEnabledDeviceCount() const + { + int count; + cudaError_t error = cudaGetDeviceCount( &count ); + + if (error == cudaErrorInsufficientDriver) + return -1; + + if (error == cudaErrorNoDevice) + return 0; + + cudaSafeCall( error ); + return count; + } + + void setDevice(int device) const + { + cudaSafeCall( cudaSetDevice( device ) ); + } + + int getDevice() const + { + int device; + cudaSafeCall( cudaGetDevice( &device ) ); + return device; + } + + void resetDevice() const + { + cudaSafeCall( cudaDeviceReset() ); + } + + bool 
TargetArchs::builtWith(FeatureSet feature_set) const + { + return cudaArch.builtWith(feature_set); + } + + bool TargetArchs::has(int major, int minor) const + { + return hasPtx(major, minor) || hasBin(major, minor); + } + + bool TargetArchs::hasPtx(int major, int minor) const + { + return cudaArch.hasPtx(major, minor); + } + + bool TargetArchs::hasBin(int major, int minor) const + { + return cudaArch.hasBin(major, minor); + } + + bool TargetArchs::hasEqualOrLessPtx(int major, int minor) const + { + return cudaArch.hasEqualOrLessPtx(major, minor); + } + + bool TargetArchs::hasEqualOrGreater(int major, int minor) const + { + return hasEqualOrGreaterPtx(major, minor) || hasEqualOrGreaterBin(major, minor); + } + + bool TargetArchs::hasEqualOrGreaterPtx(int major, int minor) const + { + return cudaArch.hasEqualOrGreaterPtx(major, minor); + } + + bool TargetArchs::hasEqualOrGreaterBin(int major, int minor) const + { + return cudaArch.hasEqualOrGreaterBin(major, minor); + } + + bool deviceSupports(FeatureSet feature_set) const + { + static int versions[] = + { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 + }; + static const int cache_size = static_cast(sizeof(versions) / sizeof(versions[0])); + + const int devId = getDevice(); + + int version; + + if (devId < cache_size && versions[devId] >= 0) + version = versions[devId]; + else + { + DeviceInfo dev(devId); + version = dev.majorVersion() * 10 + dev.minorVersion(); + if (devId < cache_size) + versions[devId] = version; + } + + return TargetArchs::builtWith(feature_set) && (version >= feature_set); + } + + size_t sharedMemPerBlock() const + { + return deviceProps.get(device_id_)->sharedMemPerBlock; + } + + void queryMemory(size_t& _totalMemory, size_t& _freeMemory) const + { + int prevDeviceID = getDevice(); + if (prevDeviceID != device_id_) + setDevice(device_id_); + + cudaSafeCall( cudaMemGetInfo(&_freeMemory, &_totalMemory) ); + + if (prevDeviceID != device_id_) + setDevice(prevDeviceID); + } + + size_t freeMemory() const + { + size_t _totalMemory, _freeMemory; + queryMemory(_totalMemory, _freeMemory); + return _freeMemory; + } + + size_t totalMemory() const + { + size_t _totalMemory, _freeMemory; + queryMemory(_totalMemory, _freeMemory); + return _totalMemory; + } + + bool supports(FeatureSet feature_set) const + { + int version = majorVersion() * 10 + minorVersion(); + return version >= feature_set; + } + + bool isCompatible() const + { + // Check PTX compatibility + if (TargetArchs::hasEqualOrLessPtx(majorVersion(), minorVersion())) + return true; + + // Check BIN compatibility + for (int i = minorVersion(); i >= 0; --i) + if (TargetArchs::hasBin(majorVersion(), i)) + return true; + + return false; + } + + void query() const + { + const cudaDeviceProp* prop = deviceProps.get(device_id_); + + name_ = prop->name; + multi_processor_count_ = prop->multiProcessorCount; + majorVersion_ = prop->major; + minorVersion_ = prop->minor; + } + + void printCudaDeviceInfo(int device) const + { + int count = getCudaEnabledDeviceCount(); + bool valid = (device >= 0) && (device < count); + + int beg = valid ? device : 0; + int end = valid ? 
device+1 : count; + + printf("*** CUDA Device Query (Runtime API) version (CUDART static linking) *** \n\n"); + printf("Device count: %d\n", count); + + int driverVersion = 0, runtimeVersion = 0; + cudaSafeCall( cudaDriverGetVersion(&driverVersion) ); + cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) ); + + const char *computeMode[] = { + "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)", + "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)", + "Prohibited (no host thread can use ::cudaSetDevice() with this device)", + "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)", + "Unknown", + NULL + }; + + for(int dev = beg; dev < end; ++dev) + { + cudaDeviceProp prop; + cudaSafeCall( cudaGetDeviceProperties(&prop, dev) ); + + printf("\nDevice %d: \"%s\"\n", dev, prop.name); + printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100); + printf(" CUDA Capability Major/Minor version number: %d.%d\n", prop.major, prop.minor); + printf(" Total amount of global memory: %.0f MBytes (%llu bytes)\n", (float)prop.totalGlobalMem/1048576.0f, (unsigned long long) prop.totalGlobalMem); + + int cores = convertSMVer2Cores(prop.major, prop.minor); + if (cores > 0) + printf(" (%2d) Multiprocessors x (%2d) CUDA Cores/MP: %d CUDA Cores\n", prop.multiProcessorCount, cores, cores * prop.multiProcessorCount); + + printf(" GPU Clock Speed: %.2f GHz\n", prop.clockRate * 1e-6f); + + printf(" Max Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n", + prop.maxTexture1D, prop.maxTexture2D[0], prop.maxTexture2D[1], + prop.maxTexture3D[0], prop.maxTexture3D[1], prop.maxTexture3D[2]); + printf(" Max Layered Texture Size (dim) x layers 1D=(%d) x %d, 2D=(%d,%d) x %d\n", + prop.maxTexture1DLayered[0], prop.maxTexture1DLayered[1], + prop.maxTexture2DLayered[0], prop.maxTexture2DLayered[1], prop.maxTexture2DLayered[2]); + + printf(" Total amount of constant memory: %u bytes\n", (int)prop.totalConstMem); + printf(" Total amount of shared memory per block: %u bytes\n", (int)prop.sharedMemPerBlock); + printf(" Total number of registers available per block: %d\n", prop.regsPerBlock); + printf(" Warp size: %d\n", prop.warpSize); + printf(" Maximum number of threads per block: %d\n", prop.maxThreadsPerBlock); + printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]); + printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]); + printf(" Maximum memory pitch: %u bytes\n", (int)prop.memPitch); + printf(" Texture alignment: %u bytes\n", (int)prop.textureAlignment); + + printf(" Concurrent copy and execution: %s with %d copy engine(s)\n", (prop.deviceOverlap ? "Yes" : "No"), prop.asyncEngineCount); + printf(" Run time limit on kernels: %s\n", prop.kernelExecTimeoutEnabled ? "Yes" : "No"); + printf(" Integrated GPU sharing Host Memory: %s\n", prop.integrated ? "Yes" : "No"); + printf(" Support host page-locked memory mapping: %s\n", prop.canMapHostMemory ? "Yes" : "No"); + + printf(" Concurrent kernel execution: %s\n", prop.concurrentKernels ? "Yes" : "No"); + printf(" Alignment requirement for Surfaces: %s\n", prop.surfaceAlignment ? "Yes" : "No"); + printf(" Device has ECC support enabled: %s\n", prop.ECCEnabled ? 
"Yes" : "No"); + printf(" Device is using TCC driver mode: %s\n", prop.tccDriver ? "Yes" : "No"); + printf(" Device supports Unified Addressing (UVA): %s\n", prop.unifiedAddressing ? "Yes" : "No"); + printf(" Device PCI Bus ID / PCI location ID: %d / %d\n", prop.pciBusID, prop.pciDeviceID ); + printf(" Compute Mode:\n"); + printf(" %s \n", computeMode[prop.computeMode]); + } + + printf("\n"); + printf("deviceQuery, CUDA Driver = CUDART"); + printf(", CUDA Driver Version = %d.%d", driverVersion / 1000, driverVersion % 100); + printf(", CUDA Runtime Version = %d.%d", runtimeVersion/1000, runtimeVersion%100); + printf(", NumDevs = %d\n\n", count); + fflush(stdout); + } + + void printShortCudaDeviceInfo(int device) const + { + int count = getCudaEnabledDeviceCount(); + bool valid = (device >= 0) && (device < count); + + int beg = valid ? device : 0; + int end = valid ? device+1 : count; + + int driverVersion = 0, runtimeVersion = 0; + cudaSafeCall( cudaDriverGetVersion(&driverVersion) ); + cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) ); + + for(int dev = beg; dev < end; ++dev) + { + cudaDeviceProp prop; + cudaSafeCall( cudaGetDeviceProperties(&prop, dev) ); + + const char *arch_str = prop.major < 2 ? " (not Fermi)" : ""; + printf("Device %d: \"%s\" %.0fMb", dev, prop.name, (float)prop.totalGlobalMem/1048576.0f); + printf(", sm_%d%d%s", prop.major, prop.minor, arch_str); + + int cores = convertSMVer2Cores(prop.major, prop.minor); + if (cores > 0) + printf(", %d cores", cores * prop.multiProcessorCount); + + printf(", Driver/Runtime ver.%d.%d/%d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100); + } + fflush(stdout); + } + + void copy(const Mat& src, GpuMat& dst) const + { + cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyHostToDevice) ); + } + void copy(const GpuMat& src, Mat& dst) const + { + cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToHost) ); + } + void copy(const GpuMat& src, GpuMat& dst) const + { + cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToDevice) ); + } + + void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const + { + CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); + CV_Assert(src.size() == dst.size() && src.type() == dst.type()); + CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels())); + + if (src.depth() == CV_64F) + { + if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) + CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); + } + + typedef void (*func_t)(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream); + static const func_t funcs[7][4] = + { + /* 8U */ {NppCopyMasked::call, cv::gpu::details::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, + /* 8S */ {cv::gpu::details::copyWithMask , cv::gpu::details::copyWithMask, cv::gpu::details::copyWithMask , cv::gpu::details::copyWithMask }, + /* 16U */ {NppCopyMasked::call, cv::gpu::details::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, + /* 16S */ {NppCopyMasked::call, cv::gpu::details::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, + /* 32S */ {NppCopyMasked::call, cv::gpu::details::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, + /* 32F */ {NppCopyMasked::call, 
cv::gpu::details::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, + /* 64F */ {cv::gpu::details::copyWithMask , cv::gpu::details::copyWithMask, cv::gpu::details::copyWithMask , cv::gpu::details::copyWithMask } + }; + + const func_t func = mask.channels() == src.channels() ? funcs[src.depth()][src.channels() - 1] : cv::gpu::details::copyWithMask; + + func(src, dst, mask, 0); + } + + void convert(const GpuMat& src, GpuMat& dst) const + { + typedef void (*func_t)(const GpuMat& src, GpuMat& dst); + static const func_t funcs[7][7][4] = + { + { + /* 8U -> 8U */ {0, 0, 0, 0}, + /* 8U -> 8S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 8U -> 16U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt::call}, + /* 8U -> 16S */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt::call}, + /* 8U -> 32S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 8U -> 32F */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 8U -> 64F */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo } + }, + { + /* 8S -> 8U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 8S -> 8S */ {0,0,0,0}, + /* 8S -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 8S -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 8S -> 32S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 8S -> 32F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 8S -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo} + }, + { + /* 16U -> 8U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt::call}, + /* 16U -> 8S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 16U -> 16U */ {0,0,0,0}, + /* 16U -> 16S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 16U -> 32S */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 16U -> 32F */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 16U -> 64F */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo } + }, + { + /* 16S -> 8U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt::call}, + /* 16S -> 8S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 16S -> 16U */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 16S -> 16S */ {0,0,0,0}, + /* 16S -> 32S */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 16S -> 32F */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, 
cv::gpu::device::convertTo }, + /* 16S -> 64F */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo } + }, + { + /* 32S -> 8U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32S -> 8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32S -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32S -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32S -> 32S */ {0,0,0,0}, + /* 32S -> 32F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32S -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo} + }, + { + /* 32F -> 8U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32F -> 8S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32F -> 16U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32F -> 16S */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32F -> 32S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32F -> 32F */ {0,0,0,0}, + /* 32F -> 64F */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo} + }, + { + /* 64F -> 8U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 64F -> 8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 64F -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 64F -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 64F -> 32S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 64F -> 32F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 64F -> 64F */ {0,0,0,0} + } + }; + + CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); + CV_Assert(dst.depth() <= CV_64F); + CV_Assert(src.size() == dst.size() && src.channels() == dst.channels()); + + if (src.depth() == CV_64F || dst.depth() == CV_64F) + { + if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) + CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); + } + + bool aligned = isAligned(src.data, 16) && isAligned(dst.data, 16); + if (!aligned) + { + cv::gpu::device::convertTo(src, dst); + return; + } + + const func_t func = funcs[src.depth()][dst.depth()][src.channels() - 1]; + CV_DbgAssert(func != 0); + + func(src, dst); + } + + void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta) const + { + CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); + CV_Assert(dst.depth() <= CV_64F); + + if (src.depth() == CV_64F || dst.depth() == CV_64F) + { + if 
(!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) + CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); + } + + cv::gpu::device::convertTo(src, dst, alpha, beta); + } + + void setTo(GpuMat& m, Scalar s, const GpuMat& mask) const + { + if (mask.empty()) + { + if (s[0] == 0.0 && s[1] == 0.0 && s[2] == 0.0 && s[3] == 0.0) + { + cudaSafeCall( cudaMemset2D(m.data, m.step, 0, m.cols * m.elemSize(), m.rows) ); + return; + } + + if (m.depth() == CV_8U) + { + int cn = m.channels(); + + if (cn == 1 || (cn == 2 && s[0] == s[1]) || (cn == 3 && s[0] == s[1] && s[0] == s[2]) || (cn == 4 && s[0] == s[1] && s[0] == s[2] && s[0] == s[3])) + { + int val = saturate_cast(s[0]); + cudaSafeCall( cudaMemset2D(m.data, m.step, val, m.cols * m.elemSize(), m.rows) ); + return; + } + } + + typedef void (*func_t)(GpuMat& src, Scalar s); + static const func_t funcs[7][4] = + { + {NppSet::call, cv::gpu::device::setTo , cv::gpu::device::setTo , NppSet::call}, + {cv::gpu::device::setTo , cv::gpu::device::setTo , cv::gpu::device::setTo , cv::gpu::device::setTo }, + {NppSet::call, NppSet::call, cv::gpu::device::setTo , NppSet::call}, + {NppSet::call, NppSet::call, cv::gpu::device::setTo , NppSet::call}, + {NppSet::call, cv::gpu::device::setTo , cv::gpu::device::setTo , NppSet::call}, + {NppSet::call, cv::gpu::device::setTo , cv::gpu::device::setTo , NppSet::call}, + {cv::gpu::device::setTo , cv::gpu::device::setTo , cv::gpu::device::setTo , cv::gpu::device::setTo } + }; + + CV_Assert(m.depth() <= CV_64F && m.channels() <= 4); + + if (m.depth() == CV_64F) + { + if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) + CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); + } + + funcs[m.depth()][m.channels() - 1](m, s); + } + else + { + typedef void (*func_t)(GpuMat& src, Scalar s, const GpuMat& mask); + static const func_t funcs[7][4] = + { + {NppSetMask::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask::call}, + {cv::gpu::device::setTo , cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo }, + {NppSetMask::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask::call}, + {NppSetMask::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask::call}, + {NppSetMask::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask::call}, + {NppSetMask::call, cv::gpu::device::setTo, cv::gpu::device::setTo, NppSetMask::call}, + {cv::gpu::device::setTo , cv::gpu::device::setTo, cv::gpu::device::setTo, cv::gpu::device::setTo } + }; + + CV_Assert(m.depth() <= CV_64F && m.channels() <= 4); + + if (m.depth() == CV_64F) + { + if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) + CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); + } + + funcs[m.depth()][m.channels() - 1](m, s, mask); + } + } + + void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const + { + cudaSafeCall( cudaMallocPitch(devPtr, step, width, height) ); + } + + void free(void* devPtr) const + { + cudaFree(devPtr); + } + }; +} +#endif \ No newline at end of file From 8660e048bc12c348ccfc17d42e97ea7af3aa34b0 Mon Sep 17 00:00:00 2001 From: Alexander Smorkalov Date: Fri, 13 Dec 2013 17:28:29 +0400 Subject: [PATCH 02/13] Dynamic CUDA support library loading implemented for Linux. Logical mistake in macro fixed; DeviceInfo deligate reimplemented; Build and warning fixes. 
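The loading scheme this patch introduces reduces to the following sketch (simplified from the diff below; POSIX dlopen/dlsym only, with the error handling and the second exported symbol, deviceInfoFactory, elided). The CUDA support library exports plain C factory functions; the core library resolves them at run time and falls back to the stub tables when the library is absent. The helper name tryLoadCudaTable is illustrative, not from the patch:

    // Minimal sketch of the factory lookup, assuming POSIX <dlfcn.h>.
    #include <dlfcn.h>

    typedef GpuFuncTable* (*GpuFactoryType)();

    static GpuFuncTable* tryLoadCudaTable()
    {
        void* handle = dlopen("libopencv_core_cuda.so", RTLD_LAZY);
        if (!handle)
            return NULL; // caller falls back to EmptyFuncTable

        GpuFactoryType factory = (GpuFactoryType)dlsym(handle, "gpuFactory");
        return factory ? factory() : NULL; // the table is a static singleton inside the support library
    }

The real loadCudaSupportLib() below also resolves deviceInfoFactory and reports failures; the returned tables are static objects, so the resolved pointers stay valid for the life of the process.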
--- modules/core/CMakeLists.txt | 68 +++- modules/core/cuda/CMakeLists.txt | 3 +- modules/core/cuda/main.cpp | 29 +- modules/core/include/opencv2/core/gpumat.hpp | 3 + modules/core/src/gpumat.cpp | 97 ++++- modules/core/src/gpumat_cuda.hpp | 384 +++++++++---------- 6 files changed, 357 insertions(+), 227 deletions(-) diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt index 595198292..a7a997f67 100644 --- a/modules/core/CMakeLists.txt +++ b/modules/core/CMakeLists.txt @@ -1,36 +1,76 @@ set(the_description "The Core Functionality") +macro(ocv_glob_module_sources_no_cuda) + file(GLOB_RECURSE lib_srcs "src/*.cpp") + file(GLOB_RECURSE lib_int_hdrs "src/*.hpp" "src/*.h") + file(GLOB lib_hdrs "include/opencv2/${name}/*.hpp" "include/opencv2/${name}/*.h") + file(GLOB lib_hdrs_detail "include/opencv2/${name}/detail/*.hpp" "include/opencv2/${name}/detail/*.h") + + set(cuda_objs "") + set(lib_cuda_hdrs "") + if(HAVE_CUDA) + ocv_include_directories(${CUDA_INCLUDE_DIRS}) + file(GLOB lib_cuda_hdrs "src/cuda/*.hpp") + endif() + + source_group("Src" FILES ${lib_srcs} ${lib_int_hdrs}) + + file(GLOB cl_kernels "src/opencl/*.cl") + if(HAVE_opencv_ocl AND cl_kernels) + ocv_include_directories(${OPENCL_INCLUDE_DIRS}) + add_custom_command( + OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.hpp" + COMMAND ${CMAKE_COMMAND} -DCL_DIR="${CMAKE_CURRENT_SOURCE_DIR}/src/opencl" -DOUTPUT="${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" -P "${OpenCV_SOURCE_DIR}/cmake/cl2cpp.cmake" + DEPENDS ${cl_kernels} "${OpenCV_SOURCE_DIR}/cmake/cl2cpp.cmake") + source_group("OpenCL" FILES ${cl_kernels} "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.hpp") + list(APPEND lib_srcs ${cl_kernels} "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.hpp") + endif() + + source_group("Include" FILES ${lib_hdrs}) + source_group("Include\\detail" FILES ${lib_hdrs_detail}) + + ocv_set_module_sources(${ARGN} HEADERS ${lib_hdrs} ${lib_hdrs_detail} + SOURCES ${lib_srcs} ${lib_int_hdrs} ${cuda_objs} ${lib_cuda_hdrs}) +endmacro() + +ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES}) +ocv_module_include_directories(${ZLIB_INCLUDE_DIR}) + if(HAVE_WINRT) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /ZW /GS /Gm- /AI\"${WINDOWS_SDK_PATH}/References/CommonConfiguration/Neutral\" /AI\"${VISUAL_STUDIO_PATH}/vcpackages\"") endif() +if(DYNAMIC_CUDA_SUPPORT) + add_definitions(-DDYNAMIC_CUDA_SUPPORT) +else() + add_definitions(-DUSE_CUDA) +endif() + +if(HAVE_CUDA) + ocv_include_directories("${OpenCV_SOURCE_DIR}/modules/gpu/include") + ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef) +endif() + file(GLOB lib_cuda_hdrs "include/opencv2/${name}/cuda/*.hpp" "include/opencv2/${name}/cuda/*.h") file(GLOB lib_cuda_hdrs_detail "include/opencv2/${name}/cuda/detail/*.hpp" "include/opencv2/${name}/cuda/detail/*.h") source_group("Cuda Headers" FILES ${lib_cuda_hdrs}) source_group("Cuda Headers\\Detail" FILES ${lib_cuda_hdrs_detail}) -if(DYNAMIC_CUDA_SUPPORT) - add_definitions(-DDYNAMIC_CUDA_SUPPORT) +if (DYNAMIC_CUDA_SUPPORT) + ocv_glob_module_sources_no_cuda(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc" + HEADERS ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail}) +else() + ocv_glob_module_sources(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc" + HEADERS ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail}) endif() -ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES}) -ocv_module_include_directories(${ZLIB_INCLUDE_DIR}) - 
-if(HAVE_CUDA) - ocv_include_directories("${OpenCV_SOURCE_DIR}/modules/gpu/include") - ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef) -endif() - -ocv_glob_module_sources(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc" - HEADERS ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail}) - ocv_create_module() ocv_add_precompiled_headers(${the_module}) ocv_add_accuracy_tests() ocv_add_perf_tests() -if(DYNAMIC_CUDA_SUPPORT) +if (DYNAMIC_CUDA_SUPPORT) add_subdirectory(cuda) endif() diff --git a/modules/core/cuda/CMakeLists.txt b/modules/core/cuda/CMakeLists.txt index 0b1c9428d..72ecea7a4 100644 --- a/modules/core/cuda/CMakeLists.txt +++ b/modules/core/cuda/CMakeLists.txt @@ -1,6 +1,5 @@ project(opencv_core_cuda) -set(HAVE_CUDA FALSE) -add_definitions("-DHAVE_CUDA") +add_definitions(-DUSE_CUDA) include_directories(${CUDA_INCLUDE_DIRS} "../src/" "../include/opencv2/core/" diff --git a/modules/core/cuda/main.cpp b/modules/core/cuda/main.cpp index c4b8cbe1d..26d483420 100644 --- a/modules/core/cuda/main.cpp +++ b/modules/core/cuda/main.cpp @@ -1,6 +1,10 @@ +#include "cvconfig.h" #include "opencv2/core/core.hpp" #include "opencv2/core/gpumat.hpp" +#include +#include + #ifdef HAVE_CUDA #include #include @@ -17,7 +21,30 @@ #endif #endif +using namespace std; using namespace cv; using namespace cv::gpu; -#include "gpumat_cuda.hpp" \ No newline at end of file +#include "gpumat_cuda.hpp" + +#ifdef HAVE_CUDA +static CudaDeviceInfoFuncTable deviceInfoTable; +static CudaFuncTable gpuTable; +#else +static EmptyDeviceInfoFuncTable deviceInfoTable; +static EmptyFuncTable gpuTable; +#endif + +extern "C" { + +DeviceInfoFuncTable* deviceInfoFactory() +{ + return (DeviceInfoFuncTable*)&deviceInfoTable; +} + +GpuFuncTable* gpuFactory() +{ + return (GpuFuncTable*)&gpuTable; +} + +} diff --git a/modules/core/include/opencv2/core/gpumat.hpp b/modules/core/include/opencv2/core/gpumat.hpp index b50210213..d62c8749b 100644 --- a/modules/core/include/opencv2/core/gpumat.hpp +++ b/modules/core/include/opencv2/core/gpumat.hpp @@ -137,6 +137,9 @@ namespace cv { namespace gpu int deviceID() const { return device_id_; } private: + // Private section is fictive to preserve bin compatibility. + // Changes in the private fields there have no effects. + // see deligate code. 
void query(); int device_id_; diff --git a/modules/core/src/gpumat.cpp b/modules/core/src/gpumat.cpp index 9a2e36cb6..f438dfd8b 100644 --- a/modules/core/src/gpumat.cpp +++ b/modules/core/src/gpumat.cpp @@ -43,8 +43,9 @@ #include "precomp.hpp" #include "opencv2/core/gpumat.hpp" #include +#include -#if defined(HAVE_CUDA) +#if defined(HAVE_CUDA) || defined(DYNAMIC_CUDA_SUPPORT) #include #include @@ -66,15 +67,81 @@ using namespace cv::gpu; #include "gpumat_cuda.hpp" -namespace +typedef GpuFuncTable* (*GpuFactoryType)(); +typedef DeviceInfoFuncTable* (*DeviceInfoFactoryType)(); + +static GpuFactoryType gpuFactory = NULL; +static DeviceInfoFactoryType deviceInfoFactory = NULL; + +static const std::string getCudaSupportLibName() { - const GpuFuncTable* gpuFuncTable() - { - static EmptyFuncTable funcTable; - return &funcTable; - } + return "libopencv_core_cuda.so"; } +static bool loadCudaSupportLib() +{ + void* handle; + const std::string name = getCudaSupportLibName(); + handle = dlopen(name.c_str(), RTLD_LAZY); + if (!handle) + return false; + + deviceInfoFactory = (DeviceInfoFactoryType)dlsym(handle, "deviceInfoFactory"); + if (!deviceInfoFactory) + { + dlclose(handle); + return false; + } + + gpuFactory = (GpuFactoryType)dlsym(handle, "gpuFactory"); + if (!gpuFactory) + { + dlclose(handle); + return false; + } + + dlclose(handle); + + return true; +} + +static GpuFuncTable* gpuFuncTable() +{ +#ifdef DYNAMIC_CUDA_SUPPORT + static EmptyFuncTable stub; + static GpuFuncTable* libFuncTable = loadCudaSupportLib() ? gpuFactory(): (GpuFuncTable*)&stub; + static GpuFuncTable *funcTable = libFuncTable ? libFuncTable : (GpuFuncTable*)&stub; +#else +# ifdef USE_CUDA + static CudaFuncTable impl; + static GpuFuncTable* funcTable = &impl; +#else + static EmptyFuncTable stub; + static GpuFuncTable* funcTable = &stub; +#endif +#endif + return funcTable; +} + +static DeviceInfoFuncTable* deviceInfoFuncTable() +{ +#ifdef DYNAMIC_CUDA_SUPPORT + static EmptyDeviceInfoFuncTable stub; + static DeviceInfoFuncTable* libFuncTable = loadCudaSupportLib() ? deviceInfoFactory(): (DeviceInfoFuncTable*)&stub; + static DeviceInfoFuncTable* funcTable = libFuncTable ? 
libFuncTable : (DeviceInfoFuncTable*)&stub; +#else +# ifdef USE_CUDA + static CudaDeviceInfoFuncTable impl; + static DeviceInfoFuncTable* funcTable = &impl; +#else + static EmptyFuncTable stub; + static DeviceInfoFuncTable* funcTable = &stub; +#endif +#endif + return funcTable; +} + + //////////////////////////////// Initialization & Info //////////////////////// int cv::gpu::getCudaEnabledDeviceCount() { return gpuFuncTable()->getCudaEnabledDeviceCount(); } @@ -95,13 +162,13 @@ bool cv::gpu::TargetArchs::hasEqualOrGreater(int major, int minor) { return gpuF bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor) { return gpuFuncTable()->hasEqualOrGreaterPtx(major, minor); } bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor) { return gpuFuncTable()->hasEqualOrGreaterBin(major, minor); } -size_t cv::gpu::DeviceInfo::sharedMemPerBlock() const { return gpuFuncTable()->sharedMemPerBlock(); } -void cv::gpu::DeviceInfo::queryMemory(size_t& total_memory, size_t& free_memory) const { gpuFuncTable()->queryMemory(total_memory, free_memory); } -size_t cv::gpu::DeviceInfo::freeMemory() const { return gpuFuncTable()->freeMemory(); } -size_t cv::gpu::DeviceInfo::totalMemory() const { return gpuFuncTable()->totalMemory(); } -bool cv::gpu::DeviceInfo::supports(FeatureSet feature_set) const { return gpuFuncTable()->supports(feature_set); } -bool cv::gpu::DeviceInfo::isCompatible() const { return gpuFuncTable()->isCompatible(); } -void cv::gpu::DeviceInfo::query() { gpuFuncTable()->query(); } +size_t cv::gpu::DeviceInfo::sharedMemPerBlock() const { return deviceInfoFuncTable()->sharedMemPerBlock(); } +void cv::gpu::DeviceInfo::queryMemory(size_t& total_memory, size_t& free_memory) const { deviceInfoFuncTable()->queryMemory(total_memory, free_memory); } +size_t cv::gpu::DeviceInfo::freeMemory() const { return deviceInfoFuncTable()->freeMemory(); } +size_t cv::gpu::DeviceInfo::totalMemory() const { return deviceInfoFuncTable()->totalMemory(); } +bool cv::gpu::DeviceInfo::supports(FeatureSet feature_set) const { return deviceInfoFuncTable()->supports(feature_set); } +bool cv::gpu::DeviceInfo::isCompatible() const { return deviceInfoFuncTable()->isCompatible(); } +void cv::gpu::DeviceInfo::query() { deviceInfoFuncTable()->query(); } void cv::gpu::printCudaDeviceInfo(int device) { gpuFuncTable()->printCudaDeviceInfo(device); } void cv::gpu::printShortCudaDeviceInfo(int device) { gpuFuncTable()->printShortCudaDeviceInfo(device); } @@ -556,7 +623,7 @@ namespace cv { namespace gpu void setTo(GpuMat& src, Scalar s, cudaStream_t stream) { - gpuFuncTable()->setTo(src, s, stream); + gpuFuncTable()->setTo(src, s, cv::gpu::GpuMat(), stream); } void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream) diff --git a/modules/core/src/gpumat_cuda.hpp b/modules/core/src/gpumat_cuda.hpp index 631d6ea8c..56d626a5c 100644 --- a/modules/core/src/gpumat_cuda.hpp +++ b/modules/core/src/gpumat_cuda.hpp @@ -1,30 +1,19 @@ -namespace -{ -#if defined(HAVE_CUDA) && !defined(DYNAMIC_CUDA_SUPPORT) +#ifndef __GPUMAT_CUDA_HPP__ +#define __GPUMAT_CUDA_HPP__ - #define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, CV_Func) - #define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__, CV_Func) - - inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "") + class DeviceInfoFuncTable { - if (cudaSuccess != err) - cv::gpu::error(cudaGetErrorString(err), file, line, func); - } - - inline void ___nppSafeCall(int err, const char *file, 
const int line, const char *func = "") - { - if (err < 0) - { - std::ostringstream msg; - msg << "NPP API Call Error: " << err; - cv::gpu::error(msg.str().c_str(), file, line, func); - } - } -#endif -} - -namespace -{ + public: + virtual size_t sharedMemPerBlock() const = 0; + virtual void queryMemory(size_t&, size_t&) const = 0; + virtual size_t freeMemory() const = 0; + virtual size_t totalMemory() const = 0; + virtual bool supports(FeatureSet) const = 0; + virtual bool isCompatible() const = 0; + virtual void query() = 0; + virtual ~DeviceInfoFuncTable() {}; + }; + class GpuFuncTable { public: @@ -40,6 +29,7 @@ namespace virtual bool deviceSupports(FeatureSet) const = 0; + // TargetArchs virtual bool builtWith(FeatureSet) const = 0; virtual bool has(int, int) const = 0; virtual bool hasPtx(int, int) const = 0; @@ -49,14 +39,6 @@ namespace virtual bool hasEqualOrGreaterPtx(int, int) const = 0; virtual bool hasEqualOrGreaterBin(int, int) const = 0; - virtual size_t sharedMemPerBlock() const = 0; - virtual void queryMemory(size_t&, size_t&) const = 0; - virtual size_t freeMemory() const = 0; - virtual size_t totalMemory() const = 0; - virtual bool supports(FeatureSet) const = 0; - virtual bool isCompatible() const = 0; - virtual void query() const = 0; - virtual void printCudaDeviceInfo(int) const = 0; virtual void printShortCudaDeviceInfo(int) const = 0; @@ -72,17 +54,24 @@ namespace virtual void convert(const GpuMat& src, GpuMat& dst) const = 0; // for gpu::device::setTo funcs - virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, CUstream_st*) const = 0; virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, CUstream_st*) const = 0; virtual void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const = 0; virtual void free(void* devPtr) const = 0; }; -} -#if !defined(HAVE_CUDA) || defined(DYNAMIC_CUDA_SUPPORT) -namespace -{ + class EmptyDeviceInfoFuncTable: public DeviceInfoFuncTable + { + public: + size_t sharedMemPerBlock() const { throw_nogpu; return 0; } + void queryMemory(size_t&, size_t&) const { throw_nogpu; } + size_t freeMemory() const { throw_nogpu; return 0; } + size_t totalMemory() const { throw_nogpu; return 0; } + bool supports(FeatureSet) const { throw_nogpu; return false; } + bool isCompatible() const { throw_nogpu; return false; } + void query() { throw_nogpu; } + }; + class EmptyFuncTable : public GpuFuncTable { public: @@ -105,15 +94,7 @@ namespace bool hasEqualOrGreater(int, int) const { throw_nogpu; return false; } bool hasEqualOrGreaterPtx(int, int) const { throw_nogpu; return false; } bool hasEqualOrGreaterBin(int, int) const { throw_nogpu; return false; } - - size_t sharedMemPerBlock() const { throw_nogpu; return 0; } - void queryMemory(size_t&, size_t&) const { throw_nogpu; } - size_t freeMemory() const { throw_nogpu; return 0; } - size_t totalMemory() const { throw_nogpu; return 0; } - bool supports(FeatureSet) const { throw_nogpu; return false; } - bool isCompatible() const { throw_nogpu; return false; } - void query() const { throw_nogpu; } - + void printCudaDeviceInfo(int) const { throw_nogpu; } void printShortCudaDeviceInfo(int) const { throw_nogpu; } @@ -126,15 +107,32 @@ namespace void convert(const GpuMat&, GpuMat&) const { throw_nogpu; } void convert(const GpuMat&, GpuMat&, double, double, cudaStream_t stream = 0) const { (void)stream; throw_nogpu; } - virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, CUstream_st*) const { throw_nogpu; } virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, 
CUstream_st*) const { throw_nogpu; } void mallocPitch(void**, size_t*, size_t, size_t) const { throw_nogpu; } void free(void*) const {} }; + +#if defined(USE_CUDA) + +#define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, CV_Func) +#define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__, CV_Func) + +inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "") +{ + if (cudaSuccess != err) + cv::gpu::error(cudaGetErrorString(err), file, line, func); } -#else +inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "") +{ + if (err < 0) + { + std::ostringstream msg; + msg << "NPP API Call Error: " << err; + cv::gpu::error(msg.str().c_str(), file, line, func); + } +} namespace cv { namespace gpu { namespace device { @@ -149,8 +147,6 @@ namespace cv { namespace gpu { namespace device void convert_gpu(PtrStepSzb src, int sdepth, PtrStepSzb dst, int ddepth, double alpha, double beta, cudaStream_t stream); }}} -namespace -{ template void kernelSetCaller(GpuMat& src, Scalar s, cudaStream_t stream) { Scalar_ sf = s; @@ -162,10 +158,7 @@ namespace Scalar_ sf = s; cv::gpu::device::set_to_gpu(src, sf.val, mask, src.channels(), stream); } -} -namespace -{ template struct NPPTypeTraits; template<> struct NPPTypeTraits { typedef Npp8u npp_type; }; template<> struct NPPTypeTraits { typedef Npp8s npp_type; }; @@ -208,6 +201,7 @@ namespace cudaSafeCall( cudaDeviceSynchronize() ); } }; + template::func_ptr func> struct NppCvt { typedef typename NPPTypeTraits::npp_type dst_t; @@ -361,9 +355,8 @@ namespace { return reinterpret_cast(ptr) % size == 0; } -} - namespace cv { namespace gpu { namespace devices + namespace cv { namespace gpu { namespace device { void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream = 0) { @@ -418,74 +411,52 @@ namespace { setTo(src, s, mask, 0); } - }} + }}} -namespace -{ - class CudaFuncTable : public GpuFuncTable + + class CudaArch { - protected: - - class CudaArch - { - public: - CudaArch(); - - bool builtWith(FeatureSet feature_set) const; - bool hasPtx(int major, int minor) const; - bool hasBin(int major, int minor) const; - bool hasEqualOrLessPtx(int major, int minor) const; - bool hasEqualOrGreaterPtx(int major, int minor) const; - bool hasEqualOrGreaterBin(int major, int minor) const; - - private: - static void fromStr(const string& set_as_str, vector& arr); - - vector bin; - vector ptx; - vector features; - }; - - const CudaArch cudaArch; - - CudaArch::CudaArch() + public: + CudaArch() { fromStr(CUDA_ARCH_BIN, bin); fromStr(CUDA_ARCH_PTX, ptx); fromStr(CUDA_ARCH_FEATURES, features); } - bool CudaArch::builtWith(FeatureSet feature_set) const + bool builtWith(FeatureSet feature_set) const { return !features.empty() && (features.back() >= feature_set); } - bool CudaArch::hasPtx(int major, int minor) const + bool hasPtx(int major, int minor) const { return find(ptx.begin(), ptx.end(), major * 10 + minor) != ptx.end(); } - bool CudaArch::hasBin(int major, int minor) const + bool hasBin(int major, int minor) const { return find(bin.begin(), bin.end(), major * 10 + minor) != bin.end(); } - bool CudaArch::hasEqualOrLessPtx(int major, int minor) const + bool hasEqualOrLessPtx(int major, int minor) const { return !ptx.empty() && (ptx.front() <= major * 10 + minor); } - bool CudaArch::hasEqualOrGreaterPtx(int major, int minor) const + bool hasEqualOrGreaterPtx(int major, int minor) const { return !ptx.empty() && (ptx.back() >= major * 10 + minor); } - 
bool CudaArch::hasEqualOrGreaterBin(int major, int minor) const + bool hasEqualOrGreaterBin(int major, int minor) const { return !bin.empty() && (bin.back() >= major * 10 + minor); } - void CudaArch::fromStr(const string& set_as_str, vector& arr) + + private: + void fromStr(const string& set_as_str, vector& arr) { if (set_as_str.find_first_not_of(" ") == string::npos) return; @@ -501,25 +472,21 @@ namespace sort(arr.begin(), arr.end()); } - - class DeviceProps - { - public: - DeviceProps(); - ~DeviceProps(); - - cudaDeviceProp* get(int devID); - - private: - std::vector props_; - }; - DeviceProps::DeviceProps() + vector bin; + vector ptx; + vector features; + }; + + class DeviceProps + { + public: + DeviceProps() { props_.resize(10, 0); } - DeviceProps::~DeviceProps() + ~DeviceProps() { for (size_t i = 0; i < props_.size(); ++i) { @@ -529,7 +496,7 @@ namespace props_.clear(); } - cudaDeviceProp* DeviceProps::get(int devID) + cudaDeviceProp* get(int devID) { if (devID >= (int) props_.size()) props_.resize(devID + 5, 0); @@ -542,10 +509,92 @@ namespace return props_[devID]; } - - DeviceProps deviceProps; + private: + std::vector props_; + }; - int convertSMVer2Cores(int major, int minor) + DeviceProps deviceProps; + + class CudaDeviceInfoFuncTable: DeviceInfoFuncTable + { + public: + size_t sharedMemPerBlock() const + { + return deviceProps.get(device_id_)->sharedMemPerBlock; + } + + void queryMemory(size_t& _totalMemory, size_t& _freeMemory) const + { + int prevDeviceID = getDevice(); + if (prevDeviceID != device_id_) + setDevice(device_id_); + + cudaSafeCall( cudaMemGetInfo(&_freeMemory, &_totalMemory) ); + + if (prevDeviceID != device_id_) + setDevice(prevDeviceID); + } + + size_t freeMemory() const + { + size_t _totalMemory, _freeMemory; + queryMemory(_totalMemory, _freeMemory); + return _freeMemory; + } + + size_t totalMemory() const + { + size_t _totalMemory, _freeMemory; + queryMemory(_totalMemory, _freeMemory); + return _totalMemory; + } + + bool supports(FeatureSet feature_set) const + { + int version = majorVersion_ * 10 + minorVersion_; + return version >= feature_set; + } + + bool isCompatible() const + { + // Check PTX compatibility + if (TargetArchs::hasEqualOrLessPtx(majorVersion_, minorVersion_)) + return true; + + // Check BIN compatibility + for (int i = minorVersion_; i >= 0; --i) + if (TargetArchs::hasBin(majorVersion_, i)) + return true; + + return false; + } + + void query() + { + const cudaDeviceProp* prop = deviceProps.get(device_id_); + + name_ = prop->name; + multi_processor_count_ = prop->multiProcessorCount; + majorVersion_ = prop->major; + minorVersion_ = prop->minor; + } + + private: + int device_id_; + + std::string name_; + int multi_processor_count_; + int majorVersion_; + int minorVersion_; + }; + + class CudaFuncTable : public GpuFuncTable + { + protected: + + const CudaArch cudaArch; + + int convertSMVer2Cores(int major, int minor) const { // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM typedef struct { @@ -600,42 +649,42 @@ namespace cudaSafeCall( cudaDeviceReset() ); } - bool TargetArchs::builtWith(FeatureSet feature_set) const + bool builtWith(FeatureSet feature_set) const { return cudaArch.builtWith(feature_set); } - bool TargetArchs::has(int major, int minor) const + bool has(int major, int minor) const { return hasPtx(major, minor) || hasBin(major, minor); } - bool TargetArchs::hasPtx(int major, int minor) const + bool hasPtx(int major, int minor) const { return cudaArch.hasPtx(major, minor); } - bool 
TargetArchs::hasBin(int major, int minor) const + bool hasBin(int major, int minor) const { return cudaArch.hasBin(major, minor); } - bool TargetArchs::hasEqualOrLessPtx(int major, int minor) const + bool hasEqualOrLessPtx(int major, int minor) const { return cudaArch.hasEqualOrLessPtx(major, minor); } - bool TargetArchs::hasEqualOrGreater(int major, int minor) const + bool hasEqualOrGreater(int major, int minor) const { return hasEqualOrGreaterPtx(major, minor) || hasEqualOrGreaterBin(major, minor); } - bool TargetArchs::hasEqualOrGreaterPtx(int major, int minor) const + bool hasEqualOrGreaterPtx(int major, int minor) const { return cudaArch.hasEqualOrGreaterPtx(major, minor); } - bool TargetArchs::hasEqualOrGreaterBin(int major, int minor) const + bool hasEqualOrGreaterBin(int major, int minor) const { return cudaArch.hasEqualOrGreaterBin(major, minor); } @@ -664,68 +713,7 @@ namespace return TargetArchs::builtWith(feature_set) && (version >= feature_set); } - - size_t sharedMemPerBlock() const - { - return deviceProps.get(device_id_)->sharedMemPerBlock; - } - - void queryMemory(size_t& _totalMemory, size_t& _freeMemory) const - { - int prevDeviceID = getDevice(); - if (prevDeviceID != device_id_) - setDevice(device_id_); - - cudaSafeCall( cudaMemGetInfo(&_freeMemory, &_totalMemory) ); - - if (prevDeviceID != device_id_) - setDevice(prevDeviceID); - } - - size_t freeMemory() const - { - size_t _totalMemory, _freeMemory; - queryMemory(_totalMemory, _freeMemory); - return _freeMemory; - } - - size_t totalMemory() const - { - size_t _totalMemory, _freeMemory; - queryMemory(_totalMemory, _freeMemory); - return _totalMemory; - } - - bool supports(FeatureSet feature_set) const - { - int version = majorVersion() * 10 + minorVersion(); - return version >= feature_set; - } - - bool isCompatible() const - { - // Check PTX compatibility - if (TargetArchs::hasEqualOrLessPtx(majorVersion(), minorVersion())) - return true; - - // Check BIN compatibility - for (int i = minorVersion(); i >= 0; --i) - if (TargetArchs::hasBin(majorVersion(), i)) - return true; - - return false; - } - - void query() const - { - const cudaDeviceProp* prop = deviceProps.get(device_id_); - - name_ = prop->name; - multi_processor_count_ = prop->multiProcessorCount; - majorVersion_ = prop->major; - minorVersion_ = prop->minor; - } - + void printCudaDeviceInfo(int device) const { int count = getCudaEnabledDeviceCount(); @@ -864,16 +852,16 @@ namespace typedef void (*func_t)(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream); static const func_t funcs[7][4] = { - /* 8U */ {NppCopyMasked::call, cv::gpu::details::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 8S */ {cv::gpu::details::copyWithMask , cv::gpu::details::copyWithMask, cv::gpu::details::copyWithMask , cv::gpu::details::copyWithMask }, - /* 16U */ {NppCopyMasked::call, cv::gpu::details::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 16S */ {NppCopyMasked::call, cv::gpu::details::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 32S */ {NppCopyMasked::call, cv::gpu::details::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 32F */ {NppCopyMasked::call, cv::gpu::details::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 64F */ {cv::gpu::details::copyWithMask , cv::gpu::details::copyWithMask, cv::gpu::details::copyWithMask , cv::gpu::details::copyWithMask } + /* 8U */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, + /* 8S */ 
{cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask }, + /* 16U */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, + /* 16S */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, + /* 32S */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, + /* 32F */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, + /* 64F */ {cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask } }; - const func_t func = mask.channels() == src.channels() ? funcs[src.depth()][src.channels() - 1] : cv::gpu::details::copyWithMask; + const func_t func = mask.channels() == src.channels() ? funcs[src.depth()][src.channels() - 1] : cv::gpu::device::copyWithMask; func(src, dst, mask, 0); } @@ -971,7 +959,7 @@ namespace func(src, dst); } - void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta) const + void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream) const { CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); CV_Assert(dst.depth() <= CV_64F); @@ -982,10 +970,10 @@ namespace CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); } - cv::gpu::device::convertTo(src, dst, alpha, beta); + cv::gpu::device::convertTo(src, dst, alpha, beta, stream); } - void setTo(GpuMat& m, Scalar s, const GpuMat& mask) const + void setTo(GpuMat& m, Scalar s, const GpuMat& mask, cudaStream_t stream) const { if (mask.empty()) { @@ -1016,7 +1004,7 @@ namespace {NppSet::call, NppSet::call, cv::gpu::device::setTo , NppSet::call}, {NppSet::call, cv::gpu::device::setTo , cv::gpu::device::setTo , NppSet::call}, {NppSet::call, cv::gpu::device::setTo , cv::gpu::device::setTo , NppSet::call}, - {cv::gpu::device::setTo , cv::gpu::device::setTo , cv::gpu::device::setTo , cv::gpu::device::setTo } + {cv::gpu::device::setTo , cv::gpu::device::setTo , cv::gpu::device::setTo , cv::gpu::device::setTo } }; CV_Assert(m.depth() <= CV_64F && m.channels() <= 4); @@ -1027,7 +1015,10 @@ namespace CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); } - funcs[m.depth()][m.channels() - 1](m, s); + if (stream) + cv::gpu::device::setTo(m, s, stream); + else + funcs[m.depth()][m.channels() - 1](m, s); } else { @@ -1051,7 +1042,10 @@ namespace CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); } - funcs[m.depth()][m.channels() - 1](m, s, mask); + if (stream) + cv::gpu::device::setTo(m, s, mask, stream); + else + funcs[m.depth()][m.channels() - 1](m, s, mask); } } @@ -1065,5 +1059,5 @@ namespace cudaFree(devPtr); } }; -} +#endif #endif \ No newline at end of file From 88a883e68ee9ab379118a1c68aa14ebaa24d8afd Mon Sep 17 00:00:00 2001 From: Alexander Smorkalov Date: Tue, 17 Dec 2013 10:24:00 +0400 Subject: [PATCH 03/13] Build fix. 
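The fix itself is small: throw_nogpu is expanded inside the Empty* stub tables in gpumat_cuda.hpp, so every translation unit must define the macro before that include; keeping it in the public gpumat.hpp leaked it to every client of the header. The resulting include pattern, sketched here, is repeated in both gpumat.cpp and cuda/main.cpp:

    // Each TU that includes gpumat_cuda.hpp defines the macro first, so stub
    // methods such as  void setDevice(int) const { throw_nogpu; }  expand correctly.
    #define throw_nogpu CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support")
    #include "gpumat_cuda.hpp"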
--- modules/core/cuda/main.cpp | 2 ++ modules/core/include/opencv2/core/gpumat.hpp | 2 -- modules/core/src/gpumat.cpp | 2 ++ 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/modules/core/cuda/main.cpp b/modules/core/cuda/main.cpp index 26d483420..4f47dc7e9 100644 --- a/modules/core/cuda/main.cpp +++ b/modules/core/cuda/main.cpp @@ -25,6 +25,8 @@ using namespace std; using namespace cv; using namespace cv::gpu; +#define throw_nogpu CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support") + #include "gpumat_cuda.hpp" #ifdef HAVE_CUDA diff --git a/modules/core/include/opencv2/core/gpumat.hpp b/modules/core/include/opencv2/core/gpumat.hpp index d62c8749b..755660461 100644 --- a/modules/core/include/opencv2/core/gpumat.hpp +++ b/modules/core/include/opencv2/core/gpumat.hpp @@ -48,8 +48,6 @@ #include "opencv2/core/core.hpp" #include "opencv2/core/cuda_devptrs.hpp" -#define throw_nogpu CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support") - namespace cv { namespace gpu { //////////////////////////////// Initialization & Info //////////////////////// diff --git a/modules/core/src/gpumat.cpp b/modules/core/src/gpumat.cpp index f438dfd8b..7e4eab4a1 100644 --- a/modules/core/src/gpumat.cpp +++ b/modules/core/src/gpumat.cpp @@ -65,6 +65,8 @@ using namespace std; using namespace cv; using namespace cv::gpu; +#define throw_nogpu CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support") + #include "gpumat_cuda.hpp" typedef GpuFuncTable* (*GpuFactoryType)(); From be530bd0856c623688e2f2d5842ea171b2afacc1 Mon Sep 17 00:00:00 2001 From: Alexander Smorkalov Date: Wed, 18 Dec 2013 12:02:15 +0400 Subject: [PATCH 04/13] DeviceInfo class method that were implemented in header moved to cpp file. --- modules/core/include/opencv2/core/gpumat.hpp | 10 +++--- modules/core/src/gpumat.cpp | 5 +++ modules/core/src/gpumat_cuda.hpp | 35 ++++++++++++++++++++ 3 files changed, 45 insertions(+), 5 deletions(-) diff --git a/modules/core/include/opencv2/core/gpumat.hpp b/modules/core/include/opencv2/core/gpumat.hpp index 755660461..d0f415ec3 100644 --- a/modules/core/include/opencv2/core/gpumat.hpp +++ b/modules/core/include/opencv2/core/gpumat.hpp @@ -112,13 +112,13 @@ namespace cv { namespace gpu // Creates DeviceInfo object for the given GPU DeviceInfo(int device_id) : device_id_(device_id) { query(); } - std::string name() const { return name_; } + std::string name() const; // Return compute capability versions - int majorVersion() const { return majorVersion_; } - int minorVersion() const { return minorVersion_; } + int majorVersion() const; + int minorVersion() const; - int multiProcessorCount() const { return multi_processor_count_; } + int multiProcessorCount() const; size_t sharedMemPerBlock() const; @@ -132,7 +132,7 @@ namespace cv { namespace gpu // Checks whether the GPU module can be run on the given device bool isCompatible() const; - int deviceID() const { return device_id_; } + int deviceID() const; private: // Private section is fictive to preserve bin compatibility. 
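The point of de-inlining these accessors is that an inline body such as { return majorVersion_; } is compiled into user binaries and can never reach the dynamically loaded table; an out-of-line definition can. A sketch of the before/after shape (illustrative commentary, not a further hunk of the patch):

    // before: resolved at the caller's compile time, reads the member directly
    int majorVersion() const { return majorVersion_; }

    // after: declared in the header, defined once in gpumat.cpp, and forwarding
    // to whichever DeviceInfoFuncTable was selected at run time
    int cv::gpu::DeviceInfo::majorVersion() const { return deviceInfoFuncTable()->majorVersion(); }

The private fields remain in the header only to preserve the object layout, hence "fictive", as the comment above puts it; the CUDA-side table keeps its own copies of the queried values.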
diff --git a/modules/core/src/gpumat.cpp b/modules/core/src/gpumat.cpp index 7e4eab4a1..dc24b6e82 100644 --- a/modules/core/src/gpumat.cpp +++ b/modules/core/src/gpumat.cpp @@ -170,6 +170,11 @@ size_t cv::gpu::DeviceInfo::freeMemory() const { return deviceInfoFuncTable()->f size_t cv::gpu::DeviceInfo::totalMemory() const { return deviceInfoFuncTable()->totalMemory(); } bool cv::gpu::DeviceInfo::supports(FeatureSet feature_set) const { return deviceInfoFuncTable()->supports(feature_set); } bool cv::gpu::DeviceInfo::isCompatible() const { return deviceInfoFuncTable()->isCompatible(); } +int cv::gpu::DeviceInfo::deviceID() const { return deviceInfoFuncTable()->deviceID(); }; +int cv::gpu::DeviceInfo::majorVersion() const { return deviceInfoFuncTable()->majorVersion(); } +int cv::gpu::DeviceInfo::minorVersion() const { return deviceInfoFuncTable()->minorVersion(); } +std::string cv::gpu::DeviceInfo::name() const { return deviceInfoFuncTable()->name(); } +int cv::gpu::DeviceInfo::multiProcessorCount() const { return deviceInfoFuncTable()->multiProcessorCount(); } void cv::gpu::DeviceInfo::query() { deviceInfoFuncTable()->query(); } void cv::gpu::printCudaDeviceInfo(int device) { gpuFuncTable()->printCudaDeviceInfo(device); } diff --git a/modules/core/src/gpumat_cuda.hpp b/modules/core/src/gpumat_cuda.hpp index 56d626a5c..83172d5ca 100644 --- a/modules/core/src/gpumat_cuda.hpp +++ b/modules/core/src/gpumat_cuda.hpp @@ -11,6 +11,11 @@ virtual bool supports(FeatureSet) const = 0; virtual bool isCompatible() const = 0; virtual void query() = 0; + virtual int deviceID() const = 0; + virtual std::string name() const = 0; + virtual int majorVersion() const = 0; + virtual int minorVersion() const = 0; + virtual int multiProcessorCount() const = 0; virtual ~DeviceInfoFuncTable() {}; }; @@ -70,6 +75,11 @@ bool supports(FeatureSet) const { throw_nogpu; return false; } bool isCompatible() const { throw_nogpu; return false; } void query() { throw_nogpu; } + int deviceID() const { throw_nogpu; return -1; }; + std::string name() const { throw_nogpu; return std::string(); } + int majorVersion() const { throw_nogpu; return -1; } + int minorVersion() const { throw_nogpu; return -1; } + int multiProcessorCount() const { throw_nogpu; return -1; } }; class EmptyFuncTable : public GpuFuncTable @@ -579,6 +589,31 @@ namespace cv { namespace gpu { namespace device minorVersion_ = prop->minor; } + int deviceID() const + { + return device_id_; + } + + std::string name() const + { + return name_; + } + + int majorVersion() const + { + return majorVersion_; + } + + int minorVersion() const + { + return minorVersion_; + } + + int multiProcessorCount() const + { + return multi_processor_count_; + } + private: int device_id_; From 442082eb0ff51353953c605899d61f1f7fb089eb Mon Sep 17 00:00:00 2001 From: Alexander Smorkalov Date: Thu, 19 Dec 2013 09:38:46 +0400 Subject: [PATCH 05/13] Fixes for Android support. 
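On Android the support library cannot be found by its bare soname, because application libraries live in an app-specific directory that is not on the default loader search path. The core of the fix below is to recover the directory from which the already-loaded OpenCV library came and to dlopen by absolute path. A condensed sketch follows; the actual diff additionally scans /proc/self/smaps, since dladdr's dli_fname is not guaranteed to be an absolute path on older Android versions, and the helper name here is hypothetical:

    // Condensed illustration only; see the real getCudaSupportLibName() below.
    #include <dlfcn.h>
    #include <string>

    static std::string cudaLibPathNearSelf()
    {
        Dl_info info;
        if (dladdr((void*)&cudaLibPathNearSelf, &info) && info.dli_fname)
        {
            std::string path(info.dli_fname);        // library containing this function
            path.erase(path.find_last_of('/') + 1);  // keep the directory part
            return path + "libopencv_core_cuda.so";
        }
        return std::string();
    }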
--- CMakeLists.txt | 2 + modules/core/cuda/CMakeLists.txt | 6 +- modules/core/src/gpumat.cpp | 99 +++++++++++++++++++++++++++++++- 3 files changed, 103 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2a7c730bc..01d49ab84 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -128,6 +128,7 @@ OCV_OPTION(WITH_1394 "Include IEEE1394 support" ON OCV_OPTION(WITH_AVFOUNDATION "Use AVFoundation for Video I/O" ON IF IOS) OCV_OPTION(WITH_CARBON "Use Carbon for UI instead of Cocoa" OFF IF APPLE ) OCV_OPTION(WITH_CUDA "Include NVidia Cuda Runtime support" ON IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT IOS) ) +OCV_OPTION(DYNAMIC_CUDA_SUPPORT "Make CUDA support dynamic" OFF IF (WITH_CUDA) AND NOT IOS AND NOT WINDOWS) OCV_OPTION(WITH_CUFFT "Include NVidia Cuda Fast Fourier Transform (FFT) library support" ON IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT IOS) ) OCV_OPTION(WITH_CUBLAS "Include NVidia Cuda Basic Linear Algebra Subprograms (BLAS) library support" OFF IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT IOS) ) OCV_OPTION(WITH_NVCUVID "Include NVidia Video Decoding library support" OFF IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT ANDROID AND NOT IOS AND NOT APPLE) ) @@ -853,6 +854,7 @@ if(HAVE_CUDA) status("") status(" NVIDIA CUDA") + status(" Dynamic CUDA support:" DYNAMIC_CUDA_SUPPORT THEN YES ELSE NO) status(" Use CUFFT:" HAVE_CUFFT THEN YES ELSE NO) status(" Use CUBLAS:" HAVE_CUBLAS THEN YES ELSE NO) status(" USE NVCUVID:" HAVE_NVCUVID THEN YES ELSE NO) diff --git a/modules/core/cuda/CMakeLists.txt b/modules/core/cuda/CMakeLists.txt index 72ecea7a4..828e13b80 100644 --- a/modules/core/cuda/CMakeLists.txt +++ b/modules/core/cuda/CMakeLists.txt @@ -7,4 +7,8 @@ include_directories(${CUDA_INCLUDE_DIRS} ) ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef) cuda_add_library(opencv_core_cuda SHARED main.cpp ../src/cuda/matrix_operations.cu) -target_link_libraries(opencv_core_cuda ${CUDA_LIBRARIES}) \ No newline at end of file +if(BUILD_FAT_JAVA_LIB) + target_link_libraries(opencv_core_cuda ${OPENCV_BUILD_DIR}/${LIBRARY_OUTPUT_PATH}/libopencv_java.so ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY}) +else() + target_link_libraries(opencv_core_cuda ${OPENCV_BUILD_DIR}/${LIBRARY_OUTPUT_PATH}/libopencv_core.so ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY}) +endif() \ No newline at end of file diff --git a/modules/core/src/gpumat.cpp b/modules/core/src/gpumat.cpp index dc24b6e82..c8d1d058b 100644 --- a/modules/core/src/gpumat.cpp +++ b/modules/core/src/gpumat.cpp @@ -43,7 +43,6 @@ #include "precomp.hpp" #include "opencv2/core/gpumat.hpp" #include -#include #if defined(HAVE_CUDA) || defined(DYNAMIC_CUDA_SUPPORT) #include @@ -61,6 +60,22 @@ #endif #endif +#ifdef DYNAMIC_CUDA_SUPPORT +#include +#include +#include +#include +#endif + +#ifdef ANDROID +# include + +# define LOG_TAG "OpenCV::CUDA" +# define LOGE(...) ((void)__android_log_print(ANDROID_LOG_ERROR, LOG_TAG, __VA_ARGS__)) +# define LOGD(...) ((void)__android_log_print(ANDROID_LOG_DEBUG, LOG_TAG, __VA_ARGS__)) +# define LOGI(...) 
((void)__android_log_print(ANDROID_LOG_INFO, LOG_TAG, __VA_ARGS__))
+#endif
+
 using namespace std;
 using namespace cv;
 using namespace cv::gpu;
@@ -69,16 +84,90 @@ using namespace cv::gpu;
 
 #include "gpumat_cuda.hpp"
 
+#ifdef DYNAMIC_CUDA_SUPPORT
+
 typedef GpuFuncTable* (*GpuFactoryType)();
 typedef DeviceInfoFuncTable* (*DeviceInfoFactoryType)();
 
 static GpuFactoryType gpuFactory = NULL;
 static DeviceInfoFactoryType deviceInfoFactory = NULL;
 
+# if defined(__linux__) || defined(__APPLE__) || defined (ANDROID)
+# ifdef ANDROID
+static const std::string getCudaSupportLibName()
+{
+    Dl_info dl_info;
+    if(0 != dladdr((void *)getCudaSupportLibName, &dl_info))
+    {
+        LOGD("Library name: %s", dl_info.dli_fname);
+        LOGD("Library base address: %p", dl_info.dli_fbase);
+
+        const char* libName=dl_info.dli_fname;
+        while( ((*libName)=='/') || ((*libName)=='.') )
+            libName++;
+
+        char lineBuf[2048];
+        FILE* file = fopen("/proc/self/smaps", "rt");
+
+        if(file)
+        {
+            while (fgets(lineBuf, sizeof lineBuf, file) != NULL)
+            {
+                //verify that line ends with library name
+                int lineLength = strlen(lineBuf);
+                int libNameLength = strlen(libName);
+
+                //trim end
+                for(int i = lineLength - 1; i >= 0 && isspace(lineBuf[i]); --i)
+                {
+                    lineBuf[i] = 0;
+                    --lineLength;
+                }
+
+                if (0 != strncmp(lineBuf + lineLength - libNameLength, libName, libNameLength))
+                {
+                    //the line does not contain the library name
+                    continue;
+                }
+
+                //extract path from smaps line
+                char* pathBegin = strchr(lineBuf, '/');
+                if (0 == pathBegin)
+                {
+                    LOGE("Strange error: could not find path beginning in line \"%s\"", lineBuf);
+                    continue;
+                }
+
+                char* pathEnd = strrchr(pathBegin, '/');
+                pathEnd[1] = 0;
+
+                LOGD("Libraries folder found: %s", pathBegin);
+
+                fclose(file);
+                return std::string(pathBegin) + "/libopencv_core_cuda.so";
+            }
+            fclose(file);
+            LOGE("Could not find library path");
+        }
+        else
+        {
+            LOGE("Could not read /proc/self/smaps");
+        }
+    }
+    else
+    {
+        LOGE("Could not get library name and base address");
+    }
+
+    return string();
+}
+
+# else
 static const std::string getCudaSupportLibName()
 {
     return "libopencv_core_cuda.so";
 }
+# endif
 
 static bool loadCudaSupportLib()
 {
@@ -102,11 +191,15 @@ static bool loadCudaSupportLib()
         return false;
     }
 
-    dlclose(handle);
-
     return true;
 }
 
+# else
+# error "Dynamic CUDA support is not implemented for this platform!"
+# endif
+
+#endif
+
 static GpuFuncTable* gpuFuncTable()
 {
 #ifdef DYNAMIC_CUDA_SUPPORT

From 6da7c50fb53edd291d709a06aad0b46c1311aac2 Mon Sep 17 00:00:00 2001
From: Alexander Smorkalov
Date: Thu, 19 Dec 2013 10:27:38 +0400
Subject: [PATCH 06/13] Make the dependency on CUDA explicit to prevent fake
 dependencies on the CUDA runtime.
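With the CUDA runtime removed from the global OPENCV_LINKER_LIBS below, applications no longer inherit a link-time dependency on libcudart just by linking OpenCV; they probe for GPU support at run time instead. A minimal usage sketch against the public API (the return-value convention follows getCudaEnabledDeviceCount() in this series: -1 means the CUDA driver is missing or older than the runtime, 0 means no CUDA device):

    #include <cstdio>
    #include "opencv2/core/gpumat.hpp"

    int main()
    {
        // -1: driver missing or too old; 0: no CUDA-capable device present.
        int count = cv::gpu::getCudaEnabledDeviceCount();
        if (count < 1)
        {
            std::printf("No usable CUDA device (count = %d), staying on the CPU path\n", count);
            return 0;
        }

        cv::gpu::setDevice(0);
        cv::gpu::printShortCudaDeviceInfo(0);
        return 0;
    }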
--- CMakeLists.txt | 12 ------------ cmake/OpenCVModule.cmake | 3 --- modules/core/CMakeLists.txt | 6 +++++- modules/gpu/CMakeLists.txt | 3 ++- modules/superres/CMakeLists.txt | 2 +- 5 files changed, 8 insertions(+), 18 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 01d49ab84..56c176453 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -459,18 +459,6 @@ if(WITH_OPENCL) include(cmake/OpenCVDetectOpenCL.cmake) endif() -# ---------------------------------------------------------------------------- -# Add CUDA libraries (needed for apps/tools, samples) -# ---------------------------------------------------------------------------- -if(HAVE_CUDA) - set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY}) - if(HAVE_CUBLAS) - set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} ${CUDA_cublas_LIBRARY}) - endif() - if(HAVE_CUFFT) - set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} ${CUDA_cufft_LIBRARY}) - endif() -endif() # ---------------------------------------------------------------------------- # Solution folders: # ---------------------------------------------------------------------------- diff --git a/cmake/OpenCVModule.cmake b/cmake/OpenCVModule.cmake index c923aba41..d7e7c4a1c 100644 --- a/cmake/OpenCVModule.cmake +++ b/cmake/OpenCVModule.cmake @@ -537,9 +537,6 @@ macro(ocv_create_module) target_link_libraries(${the_module} ${OPENCV_MODULE_${the_module}_DEPS}) target_link_libraries(${the_module} LINK_INTERFACE_LIBRARIES ${OPENCV_MODULE_${the_module}_DEPS}) target_link_libraries(${the_module} ${OPENCV_MODULE_${the_module}_DEPS_EXT} ${OPENCV_LINKER_LIBS} ${IPP_LIBS} ${ARGN}) - if (HAVE_CUDA) - target_link_libraries(${the_module} ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY}) - endif() endif() add_dependencies(opencv_modules ${the_module}) diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt index a7a997f67..07fa08925 100644 --- a/modules/core/CMakeLists.txt +++ b/modules/core/CMakeLists.txt @@ -33,7 +33,11 @@ macro(ocv_glob_module_sources_no_cuda) SOURCES ${lib_srcs} ${lib_int_hdrs} ${cuda_objs} ${lib_cuda_hdrs}) endmacro() -ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES}) +if (DYNAMIC_CUDA_SUPPORT) + ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES}) +else() + ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES} ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY}) +endif() ocv_module_include_directories(${ZLIB_INCLUDE_DIR}) if(HAVE_WINRT) diff --git a/modules/gpu/CMakeLists.txt b/modules/gpu/CMakeLists.txt index a61659789..9171febc7 100644 --- a/modules/gpu/CMakeLists.txt +++ b/modules/gpu/CMakeLists.txt @@ -3,7 +3,8 @@ if(IOS) endif() set(the_description "GPU-accelerated Computer Vision") -ocv_add_module(gpu opencv_imgproc opencv_calib3d opencv_objdetect opencv_video opencv_photo opencv_legacy) +ocv_add_module(gpu opencv_imgproc opencv_calib3d opencv_objdetect opencv_video opencv_photo opencv_legacy + OPTIONAL ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY} ${CUDA_cublas_LIBRARY} ${CUDA_cufft_LIBRARY}) ocv_module_include_directories("${CMAKE_CURRENT_SOURCE_DIR}/src/cuda") diff --git a/modules/superres/CMakeLists.txt b/modules/superres/CMakeLists.txt index 44e9dc0f3..3da8dc2c6 100644 --- a/modules/superres/CMakeLists.txt +++ b/modules/superres/CMakeLists.txt @@ -4,4 +4,4 @@ endif() set(the_description "Super Resolution") ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 -Wundef) -ocv_define_module(superres opencv_imgproc opencv_video OPTIONAL opencv_gpu opencv_highgui opencv_ocl) +ocv_define_module(superres opencv_imgproc opencv_video OPTIONAL opencv_gpu 
opencv_highgui opencv_ocl ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY}) From 64c94cb22c382aa3b9377d6d94648b91159a8744 Mon Sep 17 00:00:00 2001 From: Alexander Smorkalov Date: Thu, 19 Dec 2013 11:18:04 +0400 Subject: [PATCH 07/13] CUDA related func tables refactored to remove unneeded dependencies. --- modules/core/src/gpumat.cpp | 30 +-- modules/core/src/gpumat_cuda.hpp | 384 +++++++++++++++---------------- 2 files changed, 204 insertions(+), 210 deletions(-) diff --git a/modules/core/src/gpumat.cpp b/modules/core/src/gpumat.cpp index c8d1d058b..03dcad2af 100644 --- a/modules/core/src/gpumat.cpp +++ b/modules/core/src/gpumat.cpp @@ -239,23 +239,23 @@ static DeviceInfoFuncTable* deviceInfoFuncTable() //////////////////////////////// Initialization & Info //////////////////////// -int cv::gpu::getCudaEnabledDeviceCount() { return gpuFuncTable()->getCudaEnabledDeviceCount(); } +int cv::gpu::getCudaEnabledDeviceCount() { return deviceInfoFuncTable()->getCudaEnabledDeviceCount(); } -void cv::gpu::setDevice(int device) { gpuFuncTable()->setDevice(device); } -int cv::gpu::getDevice() { return gpuFuncTable()->getDevice(); } +void cv::gpu::setDevice(int device) { deviceInfoFuncTable()->setDevice(device); } +int cv::gpu::getDevice() { return deviceInfoFuncTable()->getDevice(); } -void cv::gpu::resetDevice() { gpuFuncTable()->resetDevice(); } +void cv::gpu::resetDevice() { deviceInfoFuncTable()->resetDevice(); } -bool cv::gpu::deviceSupports(FeatureSet feature_set) { return gpuFuncTable()->deviceSupports(feature_set); } +bool cv::gpu::deviceSupports(FeatureSet feature_set) { return deviceInfoFuncTable()->deviceSupports(feature_set); } -bool cv::gpu::TargetArchs::builtWith(FeatureSet feature_set) { return gpuFuncTable()->builtWith(feature_set); } -bool cv::gpu::TargetArchs::has(int major, int minor) { return gpuFuncTable()->has(major, minor); } -bool cv::gpu::TargetArchs::hasPtx(int major, int minor) { return gpuFuncTable()->hasPtx(major, minor); } -bool cv::gpu::TargetArchs::hasBin(int major, int minor) { return gpuFuncTable()->hasBin(major, minor); } -bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor) { return gpuFuncTable()->hasEqualOrLessPtx(major, minor); } -bool cv::gpu::TargetArchs::hasEqualOrGreater(int major, int minor) { return gpuFuncTable()->hasEqualOrGreater(major, minor); } -bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor) { return gpuFuncTable()->hasEqualOrGreaterPtx(major, minor); } -bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor) { return gpuFuncTable()->hasEqualOrGreaterBin(major, minor); } +bool cv::gpu::TargetArchs::builtWith(FeatureSet feature_set) { return deviceInfoFuncTable()->builtWith(feature_set); } +bool cv::gpu::TargetArchs::has(int major, int minor) { return deviceInfoFuncTable()->has(major, minor); } +bool cv::gpu::TargetArchs::hasPtx(int major, int minor) { return deviceInfoFuncTable()->hasPtx(major, minor); } +bool cv::gpu::TargetArchs::hasBin(int major, int minor) { return deviceInfoFuncTable()->hasBin(major, minor); } +bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor) { return deviceInfoFuncTable()->hasEqualOrLessPtx(major, minor); } +bool cv::gpu::TargetArchs::hasEqualOrGreater(int major, int minor) { return deviceInfoFuncTable()->hasEqualOrGreater(major, minor); } +bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor) { return deviceInfoFuncTable()->hasEqualOrGreaterPtx(major, minor); } +bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor) { return 
deviceInfoFuncTable()->hasEqualOrGreaterBin(major, minor); } size_t cv::gpu::DeviceInfo::sharedMemPerBlock() const { return deviceInfoFuncTable()->sharedMemPerBlock(); } void cv::gpu::DeviceInfo::queryMemory(size_t& total_memory, size_t& free_memory) const { deviceInfoFuncTable()->queryMemory(total_memory, free_memory); } @@ -270,8 +270,8 @@ std::string cv::gpu::DeviceInfo::name() const { return deviceInfoFuncTable()->na int cv::gpu::DeviceInfo::multiProcessorCount() const { return deviceInfoFuncTable()->multiProcessorCount(); } void cv::gpu::DeviceInfo::query() { deviceInfoFuncTable()->query(); } -void cv::gpu::printCudaDeviceInfo(int device) { gpuFuncTable()->printCudaDeviceInfo(device); } -void cv::gpu::printShortCudaDeviceInfo(int device) { gpuFuncTable()->printShortCudaDeviceInfo(device); } +void cv::gpu::printCudaDeviceInfo(int device) { deviceInfoFuncTable()->printCudaDeviceInfo(device); } +void cv::gpu::printShortCudaDeviceInfo(int device) { deviceInfoFuncTable()->printShortCudaDeviceInfo(device); } #ifdef HAVE_CUDA diff --git a/modules/core/src/gpumat_cuda.hpp b/modules/core/src/gpumat_cuda.hpp index 83172d5ca..9281655d7 100644 --- a/modules/core/src/gpumat_cuda.hpp +++ b/modules/core/src/gpumat_cuda.hpp @@ -4,6 +4,7 @@ class DeviceInfoFuncTable { public: + // cv::DeviceInfo virtual size_t sharedMemPerBlock() const = 0; virtual void queryMemory(size_t&, size_t&) const = 0; virtual size_t freeMemory() const = 0; @@ -16,25 +17,13 @@ virtual int majorVersion() const = 0; virtual int minorVersion() const = 0; virtual int multiProcessorCount() const = 0; - virtual ~DeviceInfoFuncTable() {}; - }; - - class GpuFuncTable - { - public: - virtual ~GpuFuncTable() {} - - // DeviceInfo routines virtual int getCudaEnabledDeviceCount() const = 0; - virtual void setDevice(int) const = 0; virtual int getDevice() const = 0; - virtual void resetDevice() const = 0; - virtual bool deviceSupports(FeatureSet) const = 0; - // TargetArchs + // cv::TargetArchs virtual bool builtWith(FeatureSet) const = 0; virtual bool has(int, int) const = 0; virtual bool hasPtx(int, int) const = 0; @@ -46,7 +35,15 @@ virtual void printCudaDeviceInfo(int) const = 0; virtual void printShortCudaDeviceInfo(int) const = 0; - + + virtual ~DeviceInfoFuncTable() {}; + }; + + class GpuFuncTable + { + public: + virtual ~GpuFuncTable() {} + // GpuMat routines virtual void copy(const Mat& src, GpuMat& dst) const = 0; virtual void copy(const GpuMat& src, Mat& dst) const = 0; @@ -60,7 +57,7 @@ // for gpu::device::setTo funcs virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, CUstream_st*) const = 0; - + virtual void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const = 0; virtual void free(void* devPtr) const = 0; }; @@ -80,20 +77,14 @@ int majorVersion() const { throw_nogpu; return -1; } int minorVersion() const { throw_nogpu; return -1; } int multiProcessorCount() const { throw_nogpu; return -1; } - }; - - class EmptyFuncTable : public GpuFuncTable - { - public: - - // DeviceInfo routines + int getCudaEnabledDeviceCount() const { return 0; } - + void setDevice(int) const { throw_nogpu; } int getDevice() const { throw_nogpu; return 0; } - + void resetDevice() const { throw_nogpu; } - + bool deviceSupports(FeatureSet) const { throw_nogpu; return false; } bool builtWith(FeatureSet) const { throw_nogpu; return false; } @@ -104,10 +95,15 @@ bool hasEqualOrGreater(int, int) const { throw_nogpu; return false; } bool hasEqualOrGreaterPtx(int, int) const { throw_nogpu; return false; } bool 
hasEqualOrGreaterBin(int, int) const { throw_nogpu; return false; } - + void printCudaDeviceInfo(int) const { throw_nogpu; } void printShortCudaDeviceInfo(int) const { throw_nogpu; } - + }; + + class EmptyFuncTable : public GpuFuncTable + { + public: + void copy(const Mat&, GpuMat&) const { throw_nogpu; } void copy(const GpuMat&, Mat&) const { throw_nogpu; } void copy(const GpuMat&, GpuMat&) const { throw_nogpu; } @@ -185,62 +181,62 @@ namespace cv { namespace gpu { namespace device { typedef typename NPPTypeTraits::npp_type src_t; typedef typename NPPTypeTraits::npp_type dst_t; - + typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI); }; template struct NppConvertFunc { typedef typename NPPTypeTraits::npp_type dst_t; - + typedef NppStatus (*func_ptr)(const Npp32f* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI, NppRoundMode eRoundMode); }; - + template::func_ptr func> struct NppCvt { typedef typename NPPTypeTraits::npp_type src_t; typedef typename NPPTypeTraits::npp_type dst_t; - + static void call(const GpuMat& src, GpuMat& dst) { NppiSize sz; sz.width = src.cols; sz.height = src.rows; - + nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz) ); - + cudaSafeCall( cudaDeviceSynchronize() ); } }; - + template::func_ptr func> struct NppCvt { typedef typename NPPTypeTraits::npp_type dst_t; - + static void call(const GpuMat& src, GpuMat& dst) { NppiSize sz; sz.width = src.cols; sz.height = src.rows; - + nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz, NPP_RND_NEAR) ); - + cudaSafeCall( cudaDeviceSynchronize() ); } }; - + ////////////////////////////////////////////////////////////////////////// // Set - + template struct NppSetFunc { typedef typename NPPTypeTraits::npp_type src_t; - + typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI); }; template struct NppSetFunc { typedef typename NPPTypeTraits::npp_type src_t; - + typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI); }; template struct NppSetFunc @@ -251,172 +247,172 @@ namespace cv { namespace gpu { namespace device { typedef NppStatus (*func_ptr)(Npp8s val, Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI); }; - + template::func_ptr func> struct NppSet { typedef typename NPPTypeTraits::npp_type src_t; - + static void call(GpuMat& src, Scalar s) { NppiSize sz; sz.width = src.cols; sz.height = src.rows; - + Scalar_ nppS = s; - + nppSafeCall( func(nppS.val, src.ptr(), static_cast(src.step), sz) ); - + cudaSafeCall( cudaDeviceSynchronize() ); } }; template::func_ptr func> struct NppSet { typedef typename NPPTypeTraits::npp_type src_t; - + static void call(GpuMat& src, Scalar s) { NppiSize sz; sz.width = src.cols; sz.height = src.rows; - + Scalar_ nppS = s; - + nppSafeCall( func(nppS[0], src.ptr(), static_cast(src.step), sz) ); - + cudaSafeCall( cudaDeviceSynchronize() ); } }; - + template struct NppSetMaskFunc { typedef typename NPPTypeTraits::npp_type src_t; - + typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); }; template struct NppSetMaskFunc { typedef typename NPPTypeTraits::npp_type src_t; - + typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); }; - + template::func_ptr func> struct NppSetMask { typedef typename NPPTypeTraits::npp_type src_t; - + static void 
call(GpuMat& src, Scalar s, const GpuMat& mask) { NppiSize sz; sz.width = src.cols; sz.height = src.rows; - + Scalar_ nppS = s; - + nppSafeCall( func(nppS.val, src.ptr(), static_cast(src.step), sz, mask.ptr(), static_cast(mask.step)) ); - + cudaSafeCall( cudaDeviceSynchronize() ); } }; template::func_ptr func> struct NppSetMask { typedef typename NPPTypeTraits::npp_type src_t; - + static void call(GpuMat& src, Scalar s, const GpuMat& mask) { NppiSize sz; sz.width = src.cols; sz.height = src.rows; - + Scalar_ nppS = s; - + nppSafeCall( func(nppS[0], src.ptr(), static_cast(src.step), sz, mask.ptr(), static_cast(mask.step)) ); - + cudaSafeCall( cudaDeviceSynchronize() ); } }; - + ////////////////////////////////////////////////////////////////////////// // CopyMasked - + template struct NppCopyMaskedFunc { typedef typename NPPTypeTraits::npp_type src_t; - + typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, src_t* pDst, int nDstStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); }; - + template::func_ptr func> struct NppCopyMasked { typedef typename NPPTypeTraits::npp_type src_t; - + static void call(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t /*stream*/) { NppiSize sz; sz.width = src.cols; sz.height = src.rows; - + nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz, mask.ptr(), static_cast(mask.step)) ); - + cudaSafeCall( cudaDeviceSynchronize() ); } }; - + template static inline bool isAligned(const T* ptr, size_t size) { return reinterpret_cast(ptr) % size == 0; } - + namespace cv { namespace gpu { namespace device { void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream = 0) { CV_Assert(src.size() == dst.size() && src.type() == dst.type()); CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels())); - + cv::gpu::device::copyToWithMask_gpu(src.reshape(1), dst.reshape(1), src.elemSize1(), src.channels(), mask.reshape(1), mask.channels() != 1, stream); } - + void convertTo(const GpuMat& src, GpuMat& dst) { cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0, 0); } - + void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0) { cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), alpha, beta, stream); } - + void setTo(GpuMat& src, Scalar s, cudaStream_t stream) { typedef void (*caller_t)(GpuMat& src, Scalar s, cudaStream_t stream); - + static const caller_t callers[] = { kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller }; - + callers[src.depth()](src, s, stream); } - + void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream) { typedef void (*caller_t)(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream); - + static const caller_t callers[] = { kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller }; - + callers[src.depth()](src, s, mask, stream); } - + void setTo(GpuMat& src, Scalar s) { setTo(src, s, 0); } - + void setTo(GpuMat& src, Scalar s, const GpuMat& mask) { setTo(src, s, mask, 0); @@ -433,56 +429,56 @@ namespace cv { namespace gpu { namespace device fromStr(CUDA_ARCH_PTX, ptx); fromStr(CUDA_ARCH_FEATURES, features); } - + bool builtWith(FeatureSet feature_set) const { return !features.empty() && (features.back() >= feature_set); } 
- + bool hasPtx(int major, int minor) const { return find(ptx.begin(), ptx.end(), major * 10 + minor) != ptx.end(); } - + bool hasBin(int major, int minor) const { return find(bin.begin(), bin.end(), major * 10 + minor) != bin.end(); } - + bool hasEqualOrLessPtx(int major, int minor) const { return !ptx.empty() && (ptx.front() <= major * 10 + minor); } - + bool hasEqualOrGreaterPtx(int major, int minor) const { return !ptx.empty() && (ptx.back() >= major * 10 + minor); } - + bool hasEqualOrGreaterBin(int major, int minor) const { return !bin.empty() && (bin.back() >= major * 10 + minor); } - - + + private: void fromStr(const string& set_as_str, vector& arr) { if (set_as_str.find_first_not_of(" ") == string::npos) return; - + istringstream stream(set_as_str); int cur_value; - + while (!stream.eof()) { stream >> cur_value; arr.push_back(cur_value); } - + sort(arr.begin(), arr.end()); } - + vector bin; vector ptx; vector features; @@ -495,7 +491,7 @@ namespace cv { namespace gpu { namespace device { props_.resize(10, 0); } - + ~DeviceProps() { for (size_t i = 0; i < props_.size(); ++i) @@ -505,18 +501,18 @@ namespace cv { namespace gpu { namespace device } props_.clear(); } - + cudaDeviceProp* get(int devID) { if (devID >= (int) props_.size()) props_.resize(devID + 5, 0); - + if (!props_[devID]) { props_[devID] = new cudaDeviceProp; cudaSafeCall( cudaGetDeviceProperties(props_[devID], devID) ); } - + return props_[devID]; } private: @@ -524,7 +520,7 @@ namespace cv { namespace gpu { namespace device }; DeviceProps deviceProps; - + class CudaDeviceInfoFuncTable: DeviceInfoFuncTable { public: @@ -532,57 +528,57 @@ namespace cv { namespace gpu { namespace device { return deviceProps.get(device_id_)->sharedMemPerBlock; } - + void queryMemory(size_t& _totalMemory, size_t& _freeMemory) const { int prevDeviceID = getDevice(); if (prevDeviceID != device_id_) setDevice(device_id_); - + cudaSafeCall( cudaMemGetInfo(&_freeMemory, &_totalMemory) ); - + if (prevDeviceID != device_id_) setDevice(prevDeviceID); } - + size_t freeMemory() const { size_t _totalMemory, _freeMemory; queryMemory(_totalMemory, _freeMemory); return _freeMemory; } - + size_t totalMemory() const { size_t _totalMemory, _freeMemory; queryMemory(_totalMemory, _freeMemory); return _totalMemory; } - + bool supports(FeatureSet feature_set) const { int version = majorVersion_ * 10 + minorVersion_; return version >= feature_set; } - + bool isCompatible() const { // Check PTX compatibility - if (TargetArchs::hasEqualOrLessPtx(majorVersion_, minorVersion_)) + if (hasEqualOrLessPtx(majorVersion_, minorVersion_)) return true; - + // Check BIN compatibility for (int i = minorVersion_; i >= 0; --i) - if (TargetArchs::hasBin(majorVersion_, i)) + if (hasBin(majorVersion_, i)) return true; - + return false; } - + void query() { const cudaDeviceProp* prop = deviceProps.get(device_id_); - + name_ = prop->name; multi_processor_count_ = prop->multiProcessorCount; majorVersion_ = prop->major; @@ -614,116 +610,78 @@ namespace cv { namespace gpu { namespace device return multi_processor_count_; } - private: - int device_id_; - - std::string name_; - int multi_processor_count_; - int majorVersion_; - int minorVersion_; - }; - - class CudaFuncTable : public GpuFuncTable - { - protected: - - const CudaArch cudaArch; - - int convertSMVer2Cores(int major, int minor) const - { - // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM - typedef struct { - int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = 
SM minor version - int Cores; - } SMtoCores; - - SMtoCores gpuArchCoresPerSM[] = { { 0x10, 8 }, { 0x11, 8 }, { 0x12, 8 }, { 0x13, 8 }, { 0x20, 32 }, { 0x21, 48 }, {0x30, 192}, {0x35, 192}, { -1, -1 } }; - - int index = 0; - while (gpuArchCoresPerSM[index].SM != -1) - { - if (gpuArchCoresPerSM[index].SM == ((major << 4) + minor) ) - return gpuArchCoresPerSM[index].Cores; - index++; - } - - return -1; - } - - public: - int getCudaEnabledDeviceCount() const { int count; cudaError_t error = cudaGetDeviceCount( &count ); - + if (error == cudaErrorInsufficientDriver) return -1; - + if (error == cudaErrorNoDevice) return 0; - + cudaSafeCall( error ); return count; } - + void setDevice(int device) const { cudaSafeCall( cudaSetDevice( device ) ); } - + int getDevice() const { int device; cudaSafeCall( cudaGetDevice( &device ) ); return device; } - + void resetDevice() const { cudaSafeCall( cudaDeviceReset() ); } - + bool builtWith(FeatureSet feature_set) const { return cudaArch.builtWith(feature_set); } - + bool has(int major, int minor) const { return hasPtx(major, minor) || hasBin(major, minor); } - + bool hasPtx(int major, int minor) const { return cudaArch.hasPtx(major, minor); } - + bool hasBin(int major, int minor) const { return cudaArch.hasBin(major, minor); } - + bool hasEqualOrLessPtx(int major, int minor) const { return cudaArch.hasEqualOrLessPtx(major, minor); } - + bool hasEqualOrGreater(int major, int minor) const { return hasEqualOrGreaterPtx(major, minor) || hasEqualOrGreaterBin(major, minor); } - + bool hasEqualOrGreaterPtx(int major, int minor) const { return cudaArch.hasEqualOrGreaterPtx(major, minor); } - + bool hasEqualOrGreaterBin(int major, int minor) const { return cudaArch.hasEqualOrGreaterBin(major, minor); } - + bool deviceSupports(FeatureSet feature_set) const { static int versions[] = @@ -731,11 +689,11 @@ namespace cv { namespace gpu { namespace device -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }; static const int cache_size = static_cast(sizeof(versions) / sizeof(versions[0])); - + const int devId = getDevice(); - + int version; - + if (devId < cache_size && versions[devId] >= 0) version = versions[devId]; else @@ -745,25 +703,25 @@ namespace cv { namespace gpu { namespace device if (devId < cache_size) versions[devId] = version; } - + return TargetArchs::builtWith(feature_set) && (version >= feature_set); } - + void printCudaDeviceInfo(int device) const { int count = getCudaEnabledDeviceCount(); bool valid = (device >= 0) && (device < count); - + int beg = valid ? device : 0; int end = valid ? 
device+1 : count; - + printf("*** CUDA Device Query (Runtime API) version (CUDART static linking) *** \n\n"); printf("Device count: %d\n", count); - + int driverVersion = 0, runtimeVersion = 0; cudaSafeCall( cudaDriverGetVersion(&driverVersion) ); cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) ); - + const char *computeMode[] = { "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)", "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)", @@ -772,30 +730,30 @@ namespace cv { namespace gpu { namespace device "Unknown", NULL }; - + for(int dev = beg; dev < end; ++dev) { cudaDeviceProp prop; cudaSafeCall( cudaGetDeviceProperties(&prop, dev) ); - + printf("\nDevice %d: \"%s\"\n", dev, prop.name); printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100); printf(" CUDA Capability Major/Minor version number: %d.%d\n", prop.major, prop.minor); printf(" Total amount of global memory: %.0f MBytes (%llu bytes)\n", (float)prop.totalGlobalMem/1048576.0f, (unsigned long long) prop.totalGlobalMem); - + int cores = convertSMVer2Cores(prop.major, prop.minor); if (cores > 0) printf(" (%2d) Multiprocessors x (%2d) CUDA Cores/MP: %d CUDA Cores\n", prop.multiProcessorCount, cores, cores * prop.multiProcessorCount); - + printf(" GPU Clock Speed: %.2f GHz\n", prop.clockRate * 1e-6f); - + printf(" Max Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n", - prop.maxTexture1D, prop.maxTexture2D[0], prop.maxTexture2D[1], - prop.maxTexture3D[0], prop.maxTexture3D[1], prop.maxTexture3D[2]); + prop.maxTexture1D, prop.maxTexture2D[0], prop.maxTexture2D[1], + prop.maxTexture3D[0], prop.maxTexture3D[1], prop.maxTexture3D[2]); printf(" Max Layered Texture Size (dim) x layers 1D=(%d) x %d, 2D=(%d,%d) x %d\n", - prop.maxTexture1DLayered[0], prop.maxTexture1DLayered[1], - prop.maxTexture2DLayered[0], prop.maxTexture2DLayered[1], prop.maxTexture2DLayered[2]); - + prop.maxTexture1DLayered[0], prop.maxTexture1DLayered[1], + prop.maxTexture2DLayered[0], prop.maxTexture2DLayered[1], prop.maxTexture2DLayered[2]); + printf(" Total amount of constant memory: %u bytes\n", (int)prop.totalConstMem); printf(" Total amount of shared memory per block: %u bytes\n", (int)prop.sharedMemPerBlock); printf(" Total number of registers available per block: %d\n", prop.regsPerBlock); @@ -805,12 +763,12 @@ namespace cv { namespace gpu { namespace device printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]); printf(" Maximum memory pitch: %u bytes\n", (int)prop.memPitch); printf(" Texture alignment: %u bytes\n", (int)prop.textureAlignment); - + printf(" Concurrent copy and execution: %s with %d copy engine(s)\n", (prop.deviceOverlap ? "Yes" : "No"), prop.asyncEngineCount); printf(" Run time limit on kernels: %s\n", prop.kernelExecTimeoutEnabled ? "Yes" : "No"); printf(" Integrated GPU sharing Host Memory: %s\n", prop.integrated ? "Yes" : "No"); printf(" Support host page-locked memory mapping: %s\n", prop.canMapHostMemory ? "Yes" : "No"); - + printf(" Concurrent kernel execution: %s\n", prop.concurrentKernels ? "Yes" : "No"); printf(" Alignment requirement for Surfaces: %s\n", prop.surfaceAlignment ? "Yes" : "No"); printf(" Device has ECC support enabled: %s\n", prop.ECCEnabled ? 
"Yes" : "No"); @@ -820,7 +778,7 @@ namespace cv { namespace gpu { namespace device printf(" Compute Mode:\n"); printf(" %s \n", computeMode[prop.computeMode]); } - + printf("\n"); printf("deviceQuery, CUDA Driver = CUDART"); printf(", CUDA Driver Version = %d.%d", driverVersion / 1000, driverVersion % 100); @@ -828,37 +786,73 @@ namespace cv { namespace gpu { namespace device printf(", NumDevs = %d\n\n", count); fflush(stdout); } - + void printShortCudaDeviceInfo(int device) const { int count = getCudaEnabledDeviceCount(); bool valid = (device >= 0) && (device < count); - + int beg = valid ? device : 0; int end = valid ? device+1 : count; - + int driverVersion = 0, runtimeVersion = 0; cudaSafeCall( cudaDriverGetVersion(&driverVersion) ); cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) ); - + for(int dev = beg; dev < end; ++dev) { cudaDeviceProp prop; cudaSafeCall( cudaGetDeviceProperties(&prop, dev) ); - + const char *arch_str = prop.major < 2 ? " (not Fermi)" : ""; printf("Device %d: \"%s\" %.0fMb", dev, prop.name, (float)prop.totalGlobalMem/1048576.0f); printf(", sm_%d%d%s", prop.major, prop.minor, arch_str); - + int cores = convertSMVer2Cores(prop.major, prop.minor); if (cores > 0) printf(", %d cores", cores * prop.multiProcessorCount); - + printf(", Driver/Runtime ver.%d.%d/%d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100); } fflush(stdout); } - + + private: + int device_id_; + + std::string name_; + int multi_processor_count_; + int majorVersion_; + int minorVersion_; + + const CudaArch cudaArch; + + int convertSMVer2Cores(int major, int minor) const + { + // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM + typedef struct { + int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version + int Cores; + } SMtoCores; + + SMtoCores gpuArchCoresPerSM[] = { { 0x10, 8 }, { 0x11, 8 }, { 0x12, 8 }, { 0x13, 8 }, { 0x20, 32 }, { 0x21, 48 }, {0x30, 192}, {0x35, 192}, { -1, -1 } }; + + int index = 0; + while (gpuArchCoresPerSM[index].SM != -1) + { + if (gpuArchCoresPerSM[index].SM == ((major << 4) + minor) ) + return gpuArchCoresPerSM[index].Cores; + index++; + } + + return -1; + } + }; + + class CudaFuncTable : public GpuFuncTable + { + public: + void copy(const Mat& src, GpuMat& dst) const { cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyHostToDevice) ); From 037ffcdf99a821a5a8a3ea7a60b801244fbb93d9 Mon Sep 17 00:00:00 2001 From: Alexander Smorkalov Date: Thu, 19 Dec 2013 16:42:11 +0400 Subject: [PATCH 08/13] Dynamic CUDA support library reimplemented as OpenCV module. 
--- CMakeLists.txt | 2 - cmake/OpenCVModule.cmake | 2 +- modules/core/CMakeLists.txt | 60 +++++-------------- modules/core/cuda/CMakeLists.txt | 14 ----- modules/core/src/gpumat.cpp | 4 +- modules/dynamicuda/CMakeLists.txt | 14 +++++ .../opencv2/dynamicuda/dynamicuda.hpp} | 0 .../src/cuda/matrix_operations.cu | 0 .../{core/cuda => dynamicuda/src}/main.cpp | 4 +- modules/java/CMakeLists.txt | 6 ++ 10 files changed, 41 insertions(+), 65 deletions(-) delete mode 100644 modules/core/cuda/CMakeLists.txt create mode 100644 modules/dynamicuda/CMakeLists.txt rename modules/{core/src/gpumat_cuda.hpp => dynamicuda/include/opencv2/dynamicuda/dynamicuda.hpp} (100%) rename modules/{core => dynamicuda}/src/cuda/matrix_operations.cu (100%) rename modules/{core/cuda => dynamicuda/src}/main.cpp (96%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 56c176453..cf25084bc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -128,7 +128,6 @@ OCV_OPTION(WITH_1394 "Include IEEE1394 support" ON OCV_OPTION(WITH_AVFOUNDATION "Use AVFoundation for Video I/O" ON IF IOS) OCV_OPTION(WITH_CARBON "Use Carbon for UI instead of Cocoa" OFF IF APPLE ) OCV_OPTION(WITH_CUDA "Include NVidia Cuda Runtime support" ON IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT IOS) ) -OCV_OPTION(DYNAMIC_CUDA_SUPPORT "Make CUDA support dynamic" OFF IF (WITH_CUDA) AND NOT IOS AND NOT WINDOWS) OCV_OPTION(WITH_CUFFT "Include NVidia Cuda Fast Fourier Transform (FFT) library support" ON IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT IOS) ) OCV_OPTION(WITH_CUBLAS "Include NVidia Cuda Basic Linear Algebra Subprograms (BLAS) library support" OFF IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT IOS) ) OCV_OPTION(WITH_NVCUVID "Include NVidia Video Decoding library support" OFF IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT ANDROID AND NOT IOS AND NOT APPLE) ) @@ -842,7 +841,6 @@ if(HAVE_CUDA) status("") status(" NVIDIA CUDA") - status(" Dynamic CUDA support:" DYNAMIC_CUDA_SUPPORT THEN YES ELSE NO) status(" Use CUFFT:" HAVE_CUFFT THEN YES ELSE NO) status(" Use CUBLAS:" HAVE_CUBLAS THEN YES ELSE NO) status(" USE NVCUVID:" HAVE_NVCUVID THEN YES ELSE NO) diff --git a/cmake/OpenCVModule.cmake b/cmake/OpenCVModule.cmake index d7e7c4a1c..3dd749b05 100644 --- a/cmake/OpenCVModule.cmake +++ b/cmake/OpenCVModule.cmake @@ -488,7 +488,7 @@ macro(ocv_glob_module_sources) file(GLOB lib_cuda_srcs "src/cuda/*.cu") set(cuda_objs "") set(lib_cuda_hdrs "") - if(HAVE_CUDA AND lib_cuda_srcs) + if(HAVE_CUDA) ocv_include_directories(${CUDA_INCLUDE_DIRS}) file(GLOB lib_cuda_hdrs "src/cuda/*.hpp") diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt index 07fa08925..e89d6f276 100644 --- a/modules/core/CMakeLists.txt +++ b/modules/core/CMakeLists.txt @@ -1,50 +1,18 @@ set(the_description "The Core Functionality") -macro(ocv_glob_module_sources_no_cuda) - file(GLOB_RECURSE lib_srcs "src/*.cpp") - file(GLOB_RECURSE lib_int_hdrs "src/*.hpp" "src/*.h") - file(GLOB lib_hdrs "include/opencv2/${name}/*.hpp" "include/opencv2/${name}/*.h") - file(GLOB lib_hdrs_detail "include/opencv2/${name}/detail/*.hpp" "include/opencv2/${name}/detail/*.h") - - set(cuda_objs "") - set(lib_cuda_hdrs "") - if(HAVE_CUDA) - ocv_include_directories(${CUDA_INCLUDE_DIRS}) - file(GLOB lib_cuda_hdrs "src/cuda/*.hpp") - endif() - - source_group("Src" FILES ${lib_srcs} ${lib_int_hdrs}) - - file(GLOB cl_kernels "src/opencl/*.cl") - if(HAVE_opencv_ocl AND cl_kernels) - ocv_include_directories(${OPENCL_INCLUDE_DIRS}) - add_custom_command( - OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" 
"${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.hpp" - COMMAND ${CMAKE_COMMAND} -DCL_DIR="${CMAKE_CURRENT_SOURCE_DIR}/src/opencl" -DOUTPUT="${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" -P "${OpenCV_SOURCE_DIR}/cmake/cl2cpp.cmake" - DEPENDS ${cl_kernels} "${OpenCV_SOURCE_DIR}/cmake/cl2cpp.cmake") - source_group("OpenCL" FILES ${cl_kernels} "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.hpp") - list(APPEND lib_srcs ${cl_kernels} "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.hpp") - endif() - - source_group("Include" FILES ${lib_hdrs}) - source_group("Include\\detail" FILES ${lib_hdrs_detail}) - - ocv_set_module_sources(${ARGN} HEADERS ${lib_hdrs} ${lib_hdrs_detail} - SOURCES ${lib_srcs} ${lib_int_hdrs} ${cuda_objs} ${lib_cuda_hdrs}) -endmacro() - -if (DYNAMIC_CUDA_SUPPORT) +if (HAVE_opencv_dynamicuda) ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES}) else() ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES} ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY}) endif() -ocv_module_include_directories(${ZLIB_INCLUDE_DIR}) + +ocv_module_include_directories("${OpenCV_SOURCE_DIR}/modules/dynamicuda/include/" ${ZLIB_INCLUDE_DIR}) if(HAVE_WINRT) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /ZW /GS /Gm- /AI\"${WINDOWS_SDK_PATH}/References/CommonConfiguration/Neutral\" /AI\"${VISUAL_STUDIO_PATH}/vcpackages\"") endif() -if(DYNAMIC_CUDA_SUPPORT) +if(HAVE_opencv_dynamicuda) add_definitions(-DDYNAMIC_CUDA_SUPPORT) else() add_definitions(-DUSE_CUDA) @@ -58,15 +26,23 @@ endif() file(GLOB lib_cuda_hdrs "include/opencv2/${name}/cuda/*.hpp" "include/opencv2/${name}/cuda/*.h") file(GLOB lib_cuda_hdrs_detail "include/opencv2/${name}/cuda/detail/*.hpp" "include/opencv2/${name}/cuda/detail/*.h") +if (NOT HAVE_opencv_dynamicuda) + file(GLOB lib_cuda "../dynamicuda/src/cuda/*.cu*") +endif() + source_group("Cuda Headers" FILES ${lib_cuda_hdrs}) source_group("Cuda Headers\\Detail" FILES ${lib_cuda_hdrs_detail}) -if (DYNAMIC_CUDA_SUPPORT) - ocv_glob_module_sources_no_cuda(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc" - HEADERS ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail}) -else() +if (NOT HAVE_opencv_dynamicuda) + source_group("Src\\Cuda" FILES ${lib_cuda} ${lib_cuda_hdrs}) +endif() + +if (HAVE_opencv_dynamicuda) ocv_glob_module_sources(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc" HEADERS ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail}) +else() + ocv_glob_module_sources(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc" ${lib_cuda} + HEADERS ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail}) endif() ocv_create_module() @@ -74,7 +50,3 @@ ocv_add_precompiled_headers(${the_module}) ocv_add_accuracy_tests() ocv_add_perf_tests() - -if (DYNAMIC_CUDA_SUPPORT) - add_subdirectory(cuda) -endif() diff --git a/modules/core/cuda/CMakeLists.txt b/modules/core/cuda/CMakeLists.txt deleted file mode 100644 index 828e13b80..000000000 --- a/modules/core/cuda/CMakeLists.txt +++ /dev/null @@ -1,14 +0,0 @@ -project(opencv_core_cuda) -add_definitions(-DUSE_CUDA) -include_directories(${CUDA_INCLUDE_DIRS} - "../src/" - "../include/opencv2/core/" - "${OpenCV_SOURCE_DIR}/modules/gpu/include" - ) -ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef) -cuda_add_library(opencv_core_cuda SHARED main.cpp ../src/cuda/matrix_operations.cu) -if(BUILD_FAT_JAVA_LIB) - target_link_libraries(opencv_core_cuda ${OPENCV_BUILD_DIR}/${LIBRARY_OUTPUT_PATH}/libopencv_java.so ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY}) -else() - target_link_libraries(opencv_core_cuda 
${OPENCV_BUILD_DIR}/${LIBRARY_OUTPUT_PATH}/libopencv_core.so ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})
-endif()
\ No newline at end of file
diff --git a/modules/core/src/gpumat.cpp b/modules/core/src/gpumat.cpp
index 03dcad2af..590685b74 100644
--- a/modules/core/src/gpumat.cpp
+++ b/modules/core/src/gpumat.cpp
@@ -82,7 +82,7 @@ using namespace cv::gpu;
 
 #define throw_nogpu CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support")
 
-#include "gpumat_cuda.hpp"
+#include "opencv2/dynamicuda/dynamicuda.hpp"
 
 #ifdef DYNAMIC_CUDA_SUPPORT
 
@@ -183,7 +183,7 @@ static bool loadCudaSupportLib()
         dlclose(handle);
         return false;
     }
-
+
     gpuFactory = (GpuFactoryType)dlsym(handle, "gpuFactory");
     if (!gpuFactory)
     {
diff --git a/modules/dynamicuda/CMakeLists.txt b/modules/dynamicuda/CMakeLists.txt
new file mode 100644
index 000000000..2ae5cf84a
--- /dev/null
+++ b/modules/dynamicuda/CMakeLists.txt
@@ -0,0 +1,14 @@
+if(NOT ANDROID)
+  ocv_module_disable(dynamicuda)
+endif()
+
+set(the_description "Dynamic CUDA linkage")
+
+add_definitions(-DUSE_CUDA)
+ocv_module_include_directories("${OpenCV_SOURCE_DIR}/modules/gpu/include")
+set(OPENCV_MODULE_TYPE SHARED)
+if (BUILD_FAT_JAVA_LIB)
+  ocv_define_module(dynamicuda opencv_java PRIVATE_REQUIRED ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})
+else()
+  ocv_define_module(dynamicuda opencv_core PRIVATE_REQUIRED ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})
+endif()
diff --git a/modules/core/src/gpumat_cuda.hpp b/modules/dynamicuda/include/opencv2/dynamicuda/dynamicuda.hpp
similarity index 100%
rename from modules/core/src/gpumat_cuda.hpp
rename to modules/dynamicuda/include/opencv2/dynamicuda/dynamicuda.hpp
diff --git a/modules/core/src/cuda/matrix_operations.cu b/modules/dynamicuda/src/cuda/matrix_operations.cu
similarity index 100%
rename from modules/core/src/cuda/matrix_operations.cu
rename to modules/dynamicuda/src/cuda/matrix_operations.cu
diff --git a/modules/core/cuda/main.cpp b/modules/dynamicuda/src/main.cpp
similarity index 96%
rename from modules/core/cuda/main.cpp
rename to modules/dynamicuda/src/main.cpp
index 4f47dc7e9..4a05d8696 100644
--- a/modules/core/cuda/main.cpp
+++ b/modules/dynamicuda/src/main.cpp
@@ -27,7 +27,7 @@ using namespace cv::gpu;
 
 #define throw_nogpu CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support")
 
-#include "gpumat_cuda.hpp"
+#include "opencv2/dynamicuda/dynamicuda.hpp"
 
 #ifdef HAVE_CUDA
 static CudaDeviceInfoFuncTable deviceInfoTable;
@@ -38,7 +38,7 @@ static EmptyFuncTable gpuTable;
 #endif
 
 extern "C" {
-
+
 DeviceInfoFuncTable* deviceInfoFactory()
 {
     return (DeviceInfoFuncTable*)&deviceInfoTable;
diff --git a/modules/java/CMakeLists.txt b/modules/java/CMakeLists.txt
index 5012f914c..291295fb5 100644
--- a/modules/java/CMakeLists.txt
+++ b/modules/java/CMakeLists.txt
@@ -297,6 +297,12 @@ if(BUILD_FAT_JAVA_LIB)
       list(REMOVE_ITEM __deps ${m})
     endif()
   endforeach()
+  if (HAVE_opencv_dynamicuda)
+    list(REMOVE_ITEM __deps "opencv_dynamicuda")
+  endif()
+  if (ANDROID AND HAVE_opencv_gpu)
+    list(REMOVE_ITEM __deps "opencv_gpu")
+  endif()
   ocv_list_unique(__deps)
   set(__extradeps ${__deps})
   ocv_list_filterout(__extradeps "^opencv_")

From 5a5c82bb1d395aeb76bd76f14a1db22742c02599 Mon Sep 17 00:00:00 2001
From: Alexander Smorkalov
Date: Thu, 19 Dec 2013 17:41:04 +0400
Subject: [PATCH 09/13] Additional ENABLE_DYNAMIC_CUDA option implemented in
 cmake. Warning fixes and refactoring.
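All build flavours now meet in core's table selector: dispatch through the dlopen()'d opencv_dynamicuda module, the statically linked CUDA tables, or the empty stubs. A self-contained sketch of that selection (tryLoadDynamicTable() is a hypothetical stand-in for the loadCudaSupportLib()/gpuFactory() pair from patch 05; when built with USE_CUDA instead, the non-dynamic branch returns a CudaFuncTable rather than the stubs):

    struct GpuFuncTable { virtual ~GpuFuncTable() {} };
    struct EmptyFuncTable : GpuFuncTable {};

    // Stand-in for the dlopen()-based loader: returns the dynamically loaded
    // table, or 0 when the opencv_dynamicuda module is absent.
    static GpuFuncTable* tryLoadDynamicTable() { return 0; }

    static GpuFuncTable* gpuFuncTable()
    {
    #ifdef DYNAMIC_CUDA_SUPPORT
        static GpuFuncTable* dynTable = tryLoadDynamicTable();
        static EmptyFuncTable stubTable;
        return dynTable ? dynTable : &stubTable;  // stubs if the module is missing
    #else
        static EmptyFuncTable funcTable;          // or CudaFuncTable under USE_CUDA
        return &funcTable;
    #endif
    }

Keeping the fallback path inside the selector means a device without the CUDA module degrades to the throw_nogpu stubs instead of failing to load opencv_core at all.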
--- CMakeLists.txt | 1 + modules/core/CMakeLists.txt | 14 +- modules/dynamicuda/CMakeLists.txt | 1 + .../include/opencv2/dynamicuda/dynamicuda.hpp | 1899 +++++++++-------- modules/dynamicuda/src/main.cpp | 3 + modules/java/CMakeLists.txt | 2 +- 6 files changed, 969 insertions(+), 951 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index cf25084bc..2c5165c1e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -201,6 +201,7 @@ OCV_OPTION(INSTALL_TO_MANGLED_PATHS "Enables mangled install paths, that help wi # OpenCV build options # =================================================== +OCV_OPTION(ENABLE_DYNAMIC_CUDA "Enabled dynamic CUDA linkage" ON IF ANDROID OR LINUX) OCV_OPTION(ENABLE_PRECOMPILED_HEADERS "Use precompiled headers" ON IF (NOT IOS) ) OCV_OPTION(ENABLE_SOLUTION_FOLDERS "Solution folder in Visual Studio or in other IDEs" (MSVC_IDE OR CMAKE_GENERATOR MATCHES Xcode) IF (CMAKE_VERSION VERSION_GREATER "2.8.0") ) OCV_OPTION(ENABLE_PROFILING "Enable profiling in the GCC compiler (Add flags: -g -pg)" OFF IF CMAKE_COMPILER_IS_GNUCXX ) diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt index e89d6f276..f20e32d3a 100644 --- a/modules/core/CMakeLists.txt +++ b/modules/core/CMakeLists.txt @@ -1,8 +1,12 @@ set(the_description "The Core Functionality") -if (HAVE_opencv_dynamicuda) +message(STATUS "ENABLE_DYNAMIC_CUDA ${ENABLE_DYNAMIC_CUDA}") + +if (ENABLE_DYNAMIC_CUDA) + message(STATUS "Using dynamic cuda approach") ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES}) else() + message(STATUS "Link CUDA statically") ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES} ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY}) endif() @@ -12,7 +16,7 @@ if(HAVE_WINRT) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /ZW /GS /Gm- /AI\"${WINDOWS_SDK_PATH}/References/CommonConfiguration/Neutral\" /AI\"${VISUAL_STUDIO_PATH}/vcpackages\"") endif() -if(HAVE_opencv_dynamicuda) +if(ENABLE_DYNAMIC_CUDA) add_definitions(-DDYNAMIC_CUDA_SUPPORT) else() add_definitions(-DUSE_CUDA) @@ -26,18 +30,18 @@ endif() file(GLOB lib_cuda_hdrs "include/opencv2/${name}/cuda/*.hpp" "include/opencv2/${name}/cuda/*.h") file(GLOB lib_cuda_hdrs_detail "include/opencv2/${name}/cuda/detail/*.hpp" "include/opencv2/${name}/cuda/detail/*.h") -if (NOT HAVE_opencv_dynamicuda) +if (NOT ENABLE_DYNAMIC_CUDA) file(GLOB lib_cuda "../dynamicuda/src/cuda/*.cu*") endif() source_group("Cuda Headers" FILES ${lib_cuda_hdrs}) source_group("Cuda Headers\\Detail" FILES ${lib_cuda_hdrs_detail}) -if (NOT HAVE_opencv_dynamicuda) +if (NOT ENABLE_DYNAMIC_CUDA) source_group("Src\\Cuda" FILES ${lib_cuda} ${lib_cuda_hdrs}) endif() -if (HAVE_opencv_dynamicuda) +if (ENABLE_DYNAMIC_CUDA) ocv_glob_module_sources(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc" HEADERS ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail}) else() diff --git a/modules/dynamicuda/CMakeLists.txt b/modules/dynamicuda/CMakeLists.txt index 2ae5cf84a..def05d19b 100644 --- a/modules/dynamicuda/CMakeLists.txt +++ b/modules/dynamicuda/CMakeLists.txt @@ -5,6 +5,7 @@ endif() set(the_description "Dynamic CUDA linkage") add_definitions(-DUSE_CUDA) +ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef) ocv_module_include_directories("${OpenCV_SOURCE_DIR}/modules/gpu/include") set(OPENCV_MODULE_TYPE SHARED) if (BUILD_FAT_JAVA_LIB) diff --git a/modules/dynamicuda/include/opencv2/dynamicuda/dynamicuda.hpp b/modules/dynamicuda/include/opencv2/dynamicuda/dynamicuda.hpp index 9281655d7..4f5175513 100644 --- a/modules/dynamicuda/include/opencv2/dynamicuda/dynamicuda.hpp +++ 
b/modules/dynamicuda/include/opencv2/dynamicuda/dynamicuda.hpp @@ -1,123 +1,123 @@ #ifndef __GPUMAT_CUDA_HPP__ #define __GPUMAT_CUDA_HPP__ - class DeviceInfoFuncTable - { - public: - // cv::DeviceInfo - virtual size_t sharedMemPerBlock() const = 0; - virtual void queryMemory(size_t&, size_t&) const = 0; - virtual size_t freeMemory() const = 0; - virtual size_t totalMemory() const = 0; - virtual bool supports(FeatureSet) const = 0; - virtual bool isCompatible() const = 0; - virtual void query() = 0; - virtual int deviceID() const = 0; - virtual std::string name() const = 0; - virtual int majorVersion() const = 0; - virtual int minorVersion() const = 0; - virtual int multiProcessorCount() const = 0; - virtual int getCudaEnabledDeviceCount() const = 0; - virtual void setDevice(int) const = 0; - virtual int getDevice() const = 0; - virtual void resetDevice() const = 0; - virtual bool deviceSupports(FeatureSet) const = 0; +class DeviceInfoFuncTable +{ +public: + // cv::DeviceInfo + virtual size_t sharedMemPerBlock() const = 0; + virtual void queryMemory(size_t&, size_t&) const = 0; + virtual size_t freeMemory() const = 0; + virtual size_t totalMemory() const = 0; + virtual bool supports(FeatureSet) const = 0; + virtual bool isCompatible() const = 0; + virtual void query() = 0; + virtual int deviceID() const = 0; + virtual std::string name() const = 0; + virtual int majorVersion() const = 0; + virtual int minorVersion() const = 0; + virtual int multiProcessorCount() const = 0; + virtual int getCudaEnabledDeviceCount() const = 0; + virtual void setDevice(int) const = 0; + virtual int getDevice() const = 0; + virtual void resetDevice() const = 0; + virtual bool deviceSupports(FeatureSet) const = 0; - // cv::TargetArchs - virtual bool builtWith(FeatureSet) const = 0; - virtual bool has(int, int) const = 0; - virtual bool hasPtx(int, int) const = 0; - virtual bool hasBin(int, int) const = 0; - virtual bool hasEqualOrLessPtx(int, int) const = 0; - virtual bool hasEqualOrGreater(int, int) const = 0; - virtual bool hasEqualOrGreaterPtx(int, int) const = 0; - virtual bool hasEqualOrGreaterBin(int, int) const = 0; + // cv::TargetArchs + virtual bool builtWith(FeatureSet) const = 0; + virtual bool has(int, int) const = 0; + virtual bool hasPtx(int, int) const = 0; + virtual bool hasBin(int, int) const = 0; + virtual bool hasEqualOrLessPtx(int, int) const = 0; + virtual bool hasEqualOrGreater(int, int) const = 0; + virtual bool hasEqualOrGreaterPtx(int, int) const = 0; + virtual bool hasEqualOrGreaterBin(int, int) const = 0; - virtual void printCudaDeviceInfo(int) const = 0; - virtual void printShortCudaDeviceInfo(int) const = 0; + virtual void printCudaDeviceInfo(int) const = 0; + virtual void printShortCudaDeviceInfo(int) const = 0; - virtual ~DeviceInfoFuncTable() {}; - }; + virtual ~DeviceInfoFuncTable() {}; +}; - class GpuFuncTable - { - public: - virtual ~GpuFuncTable() {} +class GpuFuncTable +{ +public: + virtual ~GpuFuncTable() {} - // GpuMat routines - virtual void copy(const Mat& src, GpuMat& dst) const = 0; - virtual void copy(const GpuMat& src, Mat& dst) const = 0; - virtual void copy(const GpuMat& src, GpuMat& dst) const = 0; + // GpuMat routines + virtual void copy(const Mat& src, GpuMat& dst) const = 0; + virtual void copy(const GpuMat& src, Mat& dst) const = 0; + virtual void copy(const GpuMat& src, GpuMat& dst) const = 0; - virtual void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const = 0; + virtual void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) 
const = 0; - // gpu::device::convertTo funcs - virtual void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0) const = 0; - virtual void convert(const GpuMat& src, GpuMat& dst) const = 0; + // gpu::device::convertTo funcs + virtual void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0) const = 0; + virtual void convert(const GpuMat& src, GpuMat& dst) const = 0; - // for gpu::device::setTo funcs - virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, CUstream_st*) const = 0; + // for gpu::device::setTo funcs + virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, CUstream_st*) const = 0; - virtual void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const = 0; - virtual void free(void* devPtr) const = 0; - }; + virtual void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const = 0; + virtual void free(void* devPtr) const = 0; +}; - class EmptyDeviceInfoFuncTable: public DeviceInfoFuncTable - { - public: - size_t sharedMemPerBlock() const { throw_nogpu; return 0; } - void queryMemory(size_t&, size_t&) const { throw_nogpu; } - size_t freeMemory() const { throw_nogpu; return 0; } - size_t totalMemory() const { throw_nogpu; return 0; } - bool supports(FeatureSet) const { throw_nogpu; return false; } - bool isCompatible() const { throw_nogpu; return false; } - void query() { throw_nogpu; } - int deviceID() const { throw_nogpu; return -1; }; - std::string name() const { throw_nogpu; return std::string(); } - int majorVersion() const { throw_nogpu; return -1; } - int minorVersion() const { throw_nogpu; return -1; } - int multiProcessorCount() const { throw_nogpu; return -1; } +class EmptyDeviceInfoFuncTable: public DeviceInfoFuncTable +{ +public: + size_t sharedMemPerBlock() const { throw_nogpu; return 0; } + void queryMemory(size_t&, size_t&) const { throw_nogpu; } + size_t freeMemory() const { throw_nogpu; return 0; } + size_t totalMemory() const { throw_nogpu; return 0; } + bool supports(FeatureSet) const { throw_nogpu; return false; } + bool isCompatible() const { throw_nogpu; return false; } + void query() { throw_nogpu; } + int deviceID() const { throw_nogpu; return -1; }; + std::string name() const { throw_nogpu; return std::string(); } + int majorVersion() const { throw_nogpu; return -1; } + int minorVersion() const { throw_nogpu; return -1; } + int multiProcessorCount() const { throw_nogpu; return -1; } - int getCudaEnabledDeviceCount() const { return 0; } + int getCudaEnabledDeviceCount() const { return 0; } - void setDevice(int) const { throw_nogpu; } - int getDevice() const { throw_nogpu; return 0; } + void setDevice(int) const { throw_nogpu; } + int getDevice() const { throw_nogpu; return 0; } - void resetDevice() const { throw_nogpu; } + void resetDevice() const { throw_nogpu; } - bool deviceSupports(FeatureSet) const { throw_nogpu; return false; } + bool deviceSupports(FeatureSet) const { throw_nogpu; return false; } - bool builtWith(FeatureSet) const { throw_nogpu; return false; } - bool has(int, int) const { throw_nogpu; return false; } - bool hasPtx(int, int) const { throw_nogpu; return false; } - bool hasBin(int, int) const { throw_nogpu; return false; } - bool hasEqualOrLessPtx(int, int) const { throw_nogpu; return false; } - bool hasEqualOrGreater(int, int) const { throw_nogpu; return false; } - bool hasEqualOrGreaterPtx(int, int) const { throw_nogpu; return false; } - bool hasEqualOrGreaterBin(int, int) const { 
throw_nogpu; return false; } + bool builtWith(FeatureSet) const { throw_nogpu; return false; } + bool has(int, int) const { throw_nogpu; return false; } + bool hasPtx(int, int) const { throw_nogpu; return false; } + bool hasBin(int, int) const { throw_nogpu; return false; } + bool hasEqualOrLessPtx(int, int) const { throw_nogpu; return false; } + bool hasEqualOrGreater(int, int) const { throw_nogpu; return false; } + bool hasEqualOrGreaterPtx(int, int) const { throw_nogpu; return false; } + bool hasEqualOrGreaterBin(int, int) const { throw_nogpu; return false; } - void printCudaDeviceInfo(int) const { throw_nogpu; } - void printShortCudaDeviceInfo(int) const { throw_nogpu; } - }; + void printCudaDeviceInfo(int) const { throw_nogpu; } + void printShortCudaDeviceInfo(int) const { throw_nogpu; } +}; - class EmptyFuncTable : public GpuFuncTable - { - public: +class EmptyFuncTable : public GpuFuncTable +{ +public: - void copy(const Mat&, GpuMat&) const { throw_nogpu; } - void copy(const GpuMat&, Mat&) const { throw_nogpu; } - void copy(const GpuMat&, GpuMat&) const { throw_nogpu; } + void copy(const Mat&, GpuMat&) const { throw_nogpu; } + void copy(const GpuMat&, Mat&) const { throw_nogpu; } + void copy(const GpuMat&, GpuMat&) const { throw_nogpu; } - void copyWithMask(const GpuMat&, GpuMat&, const GpuMat&) const { throw_nogpu; } + void copyWithMask(const GpuMat&, GpuMat&, const GpuMat&) const { throw_nogpu; } - void convert(const GpuMat&, GpuMat&) const { throw_nogpu; } - void convert(const GpuMat&, GpuMat&, double, double, cudaStream_t stream = 0) const { (void)stream; throw_nogpu; } + void convert(const GpuMat&, GpuMat&) const { throw_nogpu; } + void convert(const GpuMat&, GpuMat&, double, double, cudaStream_t stream = 0) const { (void)stream; throw_nogpu; } - virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, CUstream_st*) const { throw_nogpu; } + virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, CUstream_st*) const { throw_nogpu; } - void mallocPitch(void**, size_t*, size_t, size_t) const { throw_nogpu; } - void free(void*) const {} - }; + void mallocPitch(void**, size_t*, size_t, size_t) const { throw_nogpu; } + void free(void*) const {} +}; #if defined(USE_CUDA) @@ -153,940 +153,949 @@ namespace cv { namespace gpu { namespace device void convert_gpu(PtrStepSzb src, int sdepth, PtrStepSzb dst, int ddepth, double alpha, double beta, cudaStream_t stream); }}} - template void kernelSetCaller(GpuMat& src, Scalar s, cudaStream_t stream) +template void kernelSetCaller(GpuMat& src, Scalar s, cudaStream_t stream) +{ + Scalar_ sf = s; + cv::gpu::device::set_to_gpu(src, sf.val, src.channels(), stream); +} + +template void kernelSetCaller(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream) +{ + Scalar_ sf = s; + cv::gpu::device::set_to_gpu(src, sf.val, mask, src.channels(), stream); +} + +template struct NPPTypeTraits; +template<> struct NPPTypeTraits { typedef Npp8u npp_type; }; +template<> struct NPPTypeTraits { typedef Npp8s npp_type; }; +template<> struct NPPTypeTraits { typedef Npp16u npp_type; }; +template<> struct NPPTypeTraits { typedef Npp16s npp_type; }; +template<> struct NPPTypeTraits { typedef Npp32s npp_type; }; +template<> struct NPPTypeTraits { typedef Npp32f npp_type; }; +template<> struct NPPTypeTraits { typedef Npp64f npp_type; }; + +////////////////////////////////////////////////////////////////////////// +// Convert + +template struct NppConvertFunc +{ + typedef typename NPPTypeTraits::npp_type src_t; + typedef typename 
NPPTypeTraits::npp_type dst_t; + + typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI); +}; +template struct NppConvertFunc +{ + typedef typename NPPTypeTraits::npp_type dst_t; + + typedef NppStatus (*func_ptr)(const Npp32f* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI, NppRoundMode eRoundMode); +}; + +template::func_ptr func> struct NppCvt +{ + typedef typename NPPTypeTraits::npp_type src_t; + typedef typename NPPTypeTraits::npp_type dst_t; + + static void call(const GpuMat& src, GpuMat& dst) { - Scalar_ sf = s; - cv::gpu::device::set_to_gpu(src, sf.val, src.channels(), stream); + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz) ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } +}; + +template::func_ptr func> struct NppCvt +{ + typedef typename NPPTypeTraits::npp_type dst_t; + + static void call(const GpuMat& src, GpuMat& dst) + { + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz, NPP_RND_NEAR) ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } +}; + +////////////////////////////////////////////////////////////////////////// +// Set + +template struct NppSetFunc +{ + typedef typename NPPTypeTraits::npp_type src_t; + + typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI); +}; +template struct NppSetFunc +{ + typedef typename NPPTypeTraits::npp_type src_t; + + typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI); +}; +template struct NppSetFunc +{ + typedef NppStatus (*func_ptr)(Npp8s values[], Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI); +}; +template<> struct NppSetFunc +{ + typedef NppStatus (*func_ptr)(Npp8s val, Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI); +}; + +template::func_ptr func> struct NppSet +{ + typedef typename NPPTypeTraits::npp_type src_t; + + static void call(GpuMat& src, Scalar s) + { + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + Scalar_ nppS = s; + + nppSafeCall( func(nppS.val, src.ptr(), static_cast(src.step), sz) ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } +}; +template::func_ptr func> struct NppSet +{ + typedef typename NPPTypeTraits::npp_type src_t; + + static void call(GpuMat& src, Scalar s) + { + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + Scalar_ nppS = s; + + nppSafeCall( func(nppS[0], src.ptr(), static_cast(src.step), sz) ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } +}; + +template struct NppSetMaskFunc +{ + typedef typename NPPTypeTraits::npp_type src_t; + + typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); +}; +template struct NppSetMaskFunc +{ + typedef typename NPPTypeTraits::npp_type src_t; + + typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); +}; + +template::func_ptr func> struct NppSetMask +{ + typedef typename NPPTypeTraits::npp_type src_t; + + static void call(GpuMat& src, Scalar s, const GpuMat& mask) + { + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + Scalar_ nppS = s; + + nppSafeCall( func(nppS.val, src.ptr(), static_cast(src.step), sz, mask.ptr(), static_cast(mask.step)) ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } +}; +template::func_ptr func> 
struct NppSetMask +{ + typedef typename NPPTypeTraits::npp_type src_t; + + static void call(GpuMat& src, Scalar s, const GpuMat& mask) + { + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + Scalar_ nppS = s; + + nppSafeCall( func(nppS[0], src.ptr(), static_cast(src.step), sz, mask.ptr(), static_cast(mask.step)) ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } +}; + +////////////////////////////////////////////////////////////////////////// +// CopyMasked + +template struct NppCopyMaskedFunc +{ + typedef typename NPPTypeTraits::npp_type src_t; + + typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, src_t* pDst, int nDstStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); +}; + +template::func_ptr func> struct NppCopyMasked +{ + typedef typename NPPTypeTraits::npp_type src_t; + + static void call(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t /*stream*/) + { + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz, mask.ptr(), static_cast(mask.step)) ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } +}; + +template static inline bool isAligned(const T* ptr, size_t size) +{ + return reinterpret_cast(ptr) % size == 0; +} + +namespace cv { namespace gpu { namespace device +{ + void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream = 0); + void convertTo(const GpuMat& src, GpuMat& dst); + void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0); + void setTo(GpuMat& src, Scalar s, cudaStream_t stream); + void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream); + void setTo(GpuMat& src, Scalar s); + void setTo(GpuMat& src, Scalar s, const GpuMat& mask); + + void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream) + { + CV_Assert(src.size() == dst.size() && src.type() == dst.type()); + CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels())); + + cv::gpu::device::copyToWithMask_gpu(src.reshape(1), dst.reshape(1), src.elemSize1(), src.channels(), mask.reshape(1), mask.channels() != 1, stream); } - template void kernelSetCaller(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream) + void convertTo(const GpuMat& src, GpuMat& dst) { - Scalar_ sf = s; - cv::gpu::device::set_to_gpu(src, sf.val, mask, src.channels(), stream); + cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0, 0); } - template struct NPPTypeTraits; - template<> struct NPPTypeTraits { typedef Npp8u npp_type; }; - template<> struct NPPTypeTraits { typedef Npp8s npp_type; }; - template<> struct NPPTypeTraits { typedef Npp16u npp_type; }; - template<> struct NPPTypeTraits { typedef Npp16s npp_type; }; - template<> struct NPPTypeTraits { typedef Npp32s npp_type; }; - template<> struct NPPTypeTraits { typedef Npp32f npp_type; }; - template<> struct NPPTypeTraits { typedef Npp64f npp_type; }; - - ////////////////////////////////////////////////////////////////////////// - // Convert - - template struct NppConvertFunc + void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream) { - typedef typename NPPTypeTraits::npp_type src_t; - typedef typename NPPTypeTraits::npp_type dst_t; - - typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI); - }; - template 
struct NppConvertFunc - { - typedef typename NPPTypeTraits::npp_type dst_t; - - typedef NppStatus (*func_ptr)(const Npp32f* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI, NppRoundMode eRoundMode); - }; - - template::func_ptr func> struct NppCvt - { - typedef typename NPPTypeTraits::npp_type src_t; - typedef typename NPPTypeTraits::npp_type dst_t; - - static void call(const GpuMat& src, GpuMat& dst) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - - template::func_ptr func> struct NppCvt - { - typedef typename NPPTypeTraits::npp_type dst_t; - - static void call(const GpuMat& src, GpuMat& dst) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz, NPP_RND_NEAR) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - - ////////////////////////////////////////////////////////////////////////// - // Set - - template struct NppSetFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - - typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI); - }; - template struct NppSetFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - - typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI); - }; - template struct NppSetFunc - { - typedef NppStatus (*func_ptr)(Npp8s values[], Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI); - }; - template<> struct NppSetFunc - { - typedef NppStatus (*func_ptr)(Npp8s val, Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI); - }; - - template::func_ptr func> struct NppSet - { - typedef typename NPPTypeTraits::npp_type src_t; - - static void call(GpuMat& src, Scalar s) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - Scalar_ nppS = s; - - nppSafeCall( func(nppS.val, src.ptr(), static_cast(src.step), sz) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - template::func_ptr func> struct NppSet - { - typedef typename NPPTypeTraits::npp_type src_t; - - static void call(GpuMat& src, Scalar s) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - Scalar_ nppS = s; - - nppSafeCall( func(nppS[0], src.ptr(), static_cast(src.step), sz) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - - template struct NppSetMaskFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - - typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); - }; - template struct NppSetMaskFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - - typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); - }; - - template::func_ptr func> struct NppSetMask - { - typedef typename NPPTypeTraits::npp_type src_t; - - static void call(GpuMat& src, Scalar s, const GpuMat& mask) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - Scalar_ nppS = s; - - nppSafeCall( func(nppS.val, src.ptr(), static_cast(src.step), sz, mask.ptr(), static_cast(mask.step)) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - template::func_ptr func> struct NppSetMask - { - typedef typename NPPTypeTraits::npp_type src_t; - - static void call(GpuMat& src, Scalar s, const GpuMat& mask) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - 
Scalar_ nppS = s; - - nppSafeCall( func(nppS[0], src.ptr(), static_cast(src.step), sz, mask.ptr(), static_cast(mask.step)) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - - ////////////////////////////////////////////////////////////////////////// - // CopyMasked - - template struct NppCopyMaskedFunc - { - typedef typename NPPTypeTraits::npp_type src_t; - - typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, src_t* pDst, int nDstStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep); - }; - - template::func_ptr func> struct NppCopyMasked - { - typedef typename NPPTypeTraits::npp_type src_t; - - static void call(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t /*stream*/) - { - NppiSize sz; - sz.width = src.cols; - sz.height = src.rows; - - nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz, mask.ptr(), static_cast(mask.step)) ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - }; - - template static inline bool isAligned(const T* ptr, size_t size) - { - return reinterpret_cast(ptr) % size == 0; + cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), alpha, beta, stream); } - namespace cv { namespace gpu { namespace device + void setTo(GpuMat& src, Scalar s, cudaStream_t stream) { - void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream = 0) + typedef void (*caller_t)(GpuMat& src, Scalar s, cudaStream_t stream); + + static const caller_t callers[] = { - CV_Assert(src.size() == dst.size() && src.type() == dst.type()); - CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels())); + kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, + kernelSetCaller, kernelSetCaller + }; - cv::gpu::device::copyToWithMask_gpu(src.reshape(1), dst.reshape(1), src.elemSize1(), src.channels(), mask.reshape(1), mask.channels() != 1, stream); - } + callers[src.depth()](src, s, stream); + } - void convertTo(const GpuMat& src, GpuMat& dst) - { - cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0, 0); - } - - void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0) - { - cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), alpha, beta, stream); - } - - void setTo(GpuMat& src, Scalar s, cudaStream_t stream) - { - typedef void (*caller_t)(GpuMat& src, Scalar s, cudaStream_t stream); - - static const caller_t callers[] = - { - kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, - kernelSetCaller, kernelSetCaller - }; - - callers[src.depth()](src, s, stream); - } - - void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream) - { - typedef void (*caller_t)(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream); - - static const caller_t callers[] = - { - kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, - kernelSetCaller, kernelSetCaller - }; - - callers[src.depth()](src, s, mask, stream); - } - - void setTo(GpuMat& src, Scalar s) - { - setTo(src, s, 0); - } - - void setTo(GpuMat& src, Scalar s, const GpuMat& mask) - { - setTo(src, s, mask, 0); - } - }}} - - - class CudaArch + void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream) { - public: - CudaArch() + typedef void (*caller_t)(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream); + 
+ static const caller_t callers[] = { - fromStr(CUDA_ARCH_BIN, bin); - fromStr(CUDA_ARCH_PTX, ptx); - fromStr(CUDA_ARCH_FEATURES, features); - } + kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, kernelSetCaller, + kernelSetCaller, kernelSetCaller + }; - bool builtWith(FeatureSet feature_set) const - { - return !features.empty() && (features.back() >= feature_set); - } + callers[src.depth()](src, s, mask, stream); + } - bool hasPtx(int major, int minor) const - { - return find(ptx.begin(), ptx.end(), major * 10 + minor) != ptx.end(); - } - - bool hasBin(int major, int minor) const - { - return find(bin.begin(), bin.end(), major * 10 + minor) != bin.end(); - } - - bool hasEqualOrLessPtx(int major, int minor) const - { - return !ptx.empty() && (ptx.front() <= major * 10 + minor); - } - - bool hasEqualOrGreaterPtx(int major, int minor) const - { - return !ptx.empty() && (ptx.back() >= major * 10 + minor); - } - - bool hasEqualOrGreaterBin(int major, int minor) const - { - return !bin.empty() && (bin.back() >= major * 10 + minor); - } - - - private: - void fromStr(const string& set_as_str, vector& arr) - { - if (set_as_str.find_first_not_of(" ") == string::npos) - return; - - istringstream stream(set_as_str); - int cur_value; - - while (!stream.eof()) - { - stream >> cur_value; - arr.push_back(cur_value); - } - - sort(arr.begin(), arr.end()); - } - - vector bin; - vector ptx; - vector features; - }; - - class DeviceProps + void setTo(GpuMat& src, Scalar s) { - public: - DeviceProps() - { - props_.resize(10, 0); - } + setTo(src, s, 0); + } - ~DeviceProps() - { - for (size_t i = 0; i < props_.size(); ++i) - { - if (props_[i]) - delete props_[i]; - } - props_.clear(); - } - - cudaDeviceProp* get(int devID) - { - if (devID >= (int) props_.size()) - props_.resize(devID + 5, 0); - - if (!props_[devID]) - { - props_[devID] = new cudaDeviceProp; - cudaSafeCall( cudaGetDeviceProperties(props_[devID], devID) ); - } - - return props_[devID]; - } - private: - std::vector props_; - }; - - DeviceProps deviceProps; - - class CudaDeviceInfoFuncTable: DeviceInfoFuncTable + void setTo(GpuMat& src, Scalar s, const GpuMat& mask) { - public: - size_t sharedMemPerBlock() const + setTo(src, s, mask, 0); + } +}}} + +class CudaArch +{ +public: + CudaArch() + { + fromStr(CUDA_ARCH_BIN, bin); + fromStr(CUDA_ARCH_PTX, ptx); + fromStr(CUDA_ARCH_FEATURES, features); + } + + bool builtWith(FeatureSet feature_set) const + { + return !features.empty() && (features.back() >= feature_set); + } + + bool hasPtx(int major, int minor) const + { + return find(ptx.begin(), ptx.end(), major * 10 + minor) != ptx.end(); + } + + bool hasBin(int major, int minor) const + { + return find(bin.begin(), bin.end(), major * 10 + minor) != bin.end(); + } + + bool hasEqualOrLessPtx(int major, int minor) const + { + return !ptx.empty() && (ptx.front() <= major * 10 + minor); + } + + bool hasEqualOrGreaterPtx(int major, int minor) const + { + return !ptx.empty() && (ptx.back() >= major * 10 + minor); + } + + bool hasEqualOrGreaterBin(int major, int minor) const + { + return !bin.empty() && (bin.back() >= major * 10 + minor); + } + + +private: + void fromStr(const string& set_as_str, vector& arr) + { + if (set_as_str.find_first_not_of(" ") == string::npos) + return; + + istringstream stream(set_as_str); + int cur_value; + + while (!stream.eof()) { - return deviceProps.get(device_id_)->sharedMemPerBlock; + stream >> cur_value; + arr.push_back(cur_value); } - void queryMemory(size_t& _totalMemory, size_t& _freeMemory) const + 
sort(arr.begin(), arr.end()); + } + + vector bin; + vector ptx; + vector features; +}; + +class DeviceProps +{ +public: + DeviceProps() + { + props_.resize(10, 0); + } + + ~DeviceProps() + { + for (size_t i = 0; i < props_.size(); ++i) { - int prevDeviceID = getDevice(); - if (prevDeviceID != device_id_) - setDevice(device_id_); + if (props_[i]) + delete props_[i]; + } + props_.clear(); + } - cudaSafeCall( cudaMemGetInfo(&_freeMemory, &_totalMemory) ); + cudaDeviceProp* get(int devID) + { + if (devID >= (int) props_.size()) + props_.resize(devID + 5, 0); - if (prevDeviceID != device_id_) - setDevice(prevDeviceID); + if (!props_[devID]) + { + props_[devID] = new cudaDeviceProp; + cudaSafeCall( cudaGetDeviceProperties(props_[devID], devID) ); } - size_t freeMemory() const - { - size_t _totalMemory, _freeMemory; - queryMemory(_totalMemory, _freeMemory); - return _freeMemory; - } + return props_[devID]; + } +private: + std::vector props_; +}; - size_t totalMemory() const - { - size_t _totalMemory, _freeMemory; - queryMemory(_totalMemory, _freeMemory); - return _totalMemory; - } +DeviceProps deviceProps; - bool supports(FeatureSet feature_set) const - { - int version = majorVersion_ * 10 + minorVersion_; - return version >= feature_set; - } +class CudaDeviceInfoFuncTable: DeviceInfoFuncTable +{ +public: + size_t sharedMemPerBlock() const + { + return deviceProps.get(device_id_)->sharedMemPerBlock; + } - bool isCompatible() const - { - // Check PTX compatibility - if (hasEqualOrLessPtx(majorVersion_, minorVersion_)) - return true; + void queryMemory(size_t& _totalMemory, size_t& _freeMemory) const + { + int prevDeviceID = getDevice(); + if (prevDeviceID != device_id_) + setDevice(device_id_); - // Check BIN compatibility - for (int i = minorVersion_; i >= 0; --i) - if (hasBin(majorVersion_, i)) - return true; + cudaSafeCall( cudaMemGetInfo(&_freeMemory, &_totalMemory) ); - return false; - } + if (prevDeviceID != device_id_) + setDevice(prevDeviceID); + } - void query() - { - const cudaDeviceProp* prop = deviceProps.get(device_id_); + size_t freeMemory() const + { + size_t _totalMemory, _freeMemory; + queryMemory(_totalMemory, _freeMemory); + return _freeMemory; + } - name_ = prop->name; - multi_processor_count_ = prop->multiProcessorCount; - majorVersion_ = prop->major; - minorVersion_ = prop->minor; - } + size_t totalMemory() const + { + size_t _totalMemory, _freeMemory; + queryMemory(_totalMemory, _freeMemory); + return _totalMemory; + } - int deviceID() const - { - return device_id_; - } + bool supports(FeatureSet feature_set) const + { + int version = majorVersion_ * 10 + minorVersion_; + return version >= feature_set; + } - std::string name() const - { - return name_; - } + bool isCompatible() const + { + // Check PTX compatibility + if (hasEqualOrLessPtx(majorVersion_, minorVersion_)) + return true; - int majorVersion() const - { - return majorVersion_; - } + // Check BIN compatibility + for (int i = minorVersion_; i >= 0; --i) + if (hasBin(majorVersion_, i)) + return true; - int minorVersion() const - { - return minorVersion_; - } + return false; + } - int multiProcessorCount() const - { - return multi_processor_count_; - } + void query() + { + const cudaDeviceProp* prop = deviceProps.get(device_id_); - int getCudaEnabledDeviceCount() const - { - int count; - cudaError_t error = cudaGetDeviceCount( &count ); + name_ = prop->name; + multi_processor_count_ = prop->multiProcessorCount; + majorVersion_ = prop->major; + minorVersion_ = prop->minor; + } - if (error == 
cudaErrorInsufficientDriver) - return -1; + int deviceID() const + { + return device_id_; + } - if (error == cudaErrorNoDevice) - return 0; + std::string name() const + { + return name_; + } - cudaSafeCall( error ); - return count; - } + int majorVersion() const + { + return majorVersion_; + } - void setDevice(int device) const - { - cudaSafeCall( cudaSetDevice( device ) ); - } + int minorVersion() const + { + return minorVersion_; + } - int getDevice() const - { - int device; - cudaSafeCall( cudaGetDevice( &device ) ); - return device; - } + int multiProcessorCount() const + { + return multi_processor_count_; + } - void resetDevice() const - { - cudaSafeCall( cudaDeviceReset() ); - } - - bool builtWith(FeatureSet feature_set) const - { - return cudaArch.builtWith(feature_set); - } - - bool has(int major, int minor) const - { - return hasPtx(major, minor) || hasBin(major, minor); - } - - bool hasPtx(int major, int minor) const - { - return cudaArch.hasPtx(major, minor); - } - - bool hasBin(int major, int minor) const - { - return cudaArch.hasBin(major, minor); - } - - bool hasEqualOrLessPtx(int major, int minor) const - { - return cudaArch.hasEqualOrLessPtx(major, minor); - } - - bool hasEqualOrGreater(int major, int minor) const - { - return hasEqualOrGreaterPtx(major, minor) || hasEqualOrGreaterBin(major, minor); - } - - bool hasEqualOrGreaterPtx(int major, int minor) const - { - return cudaArch.hasEqualOrGreaterPtx(major, minor); - } - - bool hasEqualOrGreaterBin(int major, int minor) const - { - return cudaArch.hasEqualOrGreaterBin(major, minor); - } - - bool deviceSupports(FeatureSet feature_set) const - { - static int versions[] = - { - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 - }; - static const int cache_size = static_cast(sizeof(versions) / sizeof(versions[0])); - - const int devId = getDevice(); - - int version; - - if (devId < cache_size && versions[devId] >= 0) - version = versions[devId]; - else - { - DeviceInfo dev(devId); - version = dev.majorVersion() * 10 + dev.minorVersion(); - if (devId < cache_size) - versions[devId] = version; - } - - return TargetArchs::builtWith(feature_set) && (version >= feature_set); - } - - void printCudaDeviceInfo(int device) const - { - int count = getCudaEnabledDeviceCount(); - bool valid = (device >= 0) && (device < count); - - int beg = valid ? device : 0; - int end = valid ? 
device+1 : count; - - printf("*** CUDA Device Query (Runtime API) version (CUDART static linking) *** \n\n"); - printf("Device count: %d\n", count); - - int driverVersion = 0, runtimeVersion = 0; - cudaSafeCall( cudaDriverGetVersion(&driverVersion) ); - cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) ); - - const char *computeMode[] = { - "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)", - "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)", - "Prohibited (no host thread can use ::cudaSetDevice() with this device)", - "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)", - "Unknown", - NULL - }; - - for(int dev = beg; dev < end; ++dev) - { - cudaDeviceProp prop; - cudaSafeCall( cudaGetDeviceProperties(&prop, dev) ); - - printf("\nDevice %d: \"%s\"\n", dev, prop.name); - printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100); - printf(" CUDA Capability Major/Minor version number: %d.%d\n", prop.major, prop.minor); - printf(" Total amount of global memory: %.0f MBytes (%llu bytes)\n", (float)prop.totalGlobalMem/1048576.0f, (unsigned long long) prop.totalGlobalMem); - - int cores = convertSMVer2Cores(prop.major, prop.minor); - if (cores > 0) - printf(" (%2d) Multiprocessors x (%2d) CUDA Cores/MP: %d CUDA Cores\n", prop.multiProcessorCount, cores, cores * prop.multiProcessorCount); - - printf(" GPU Clock Speed: %.2f GHz\n", prop.clockRate * 1e-6f); - - printf(" Max Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n", - prop.maxTexture1D, prop.maxTexture2D[0], prop.maxTexture2D[1], - prop.maxTexture3D[0], prop.maxTexture3D[1], prop.maxTexture3D[2]); - printf(" Max Layered Texture Size (dim) x layers 1D=(%d) x %d, 2D=(%d,%d) x %d\n", - prop.maxTexture1DLayered[0], prop.maxTexture1DLayered[1], - prop.maxTexture2DLayered[0], prop.maxTexture2DLayered[1], prop.maxTexture2DLayered[2]); - - printf(" Total amount of constant memory: %u bytes\n", (int)prop.totalConstMem); - printf(" Total amount of shared memory per block: %u bytes\n", (int)prop.sharedMemPerBlock); - printf(" Total number of registers available per block: %d\n", prop.regsPerBlock); - printf(" Warp size: %d\n", prop.warpSize); - printf(" Maximum number of threads per block: %d\n", prop.maxThreadsPerBlock); - printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]); - printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]); - printf(" Maximum memory pitch: %u bytes\n", (int)prop.memPitch); - printf(" Texture alignment: %u bytes\n", (int)prop.textureAlignment); - - printf(" Concurrent copy and execution: %s with %d copy engine(s)\n", (prop.deviceOverlap ? "Yes" : "No"), prop.asyncEngineCount); - printf(" Run time limit on kernels: %s\n", prop.kernelExecTimeoutEnabled ? "Yes" : "No"); - printf(" Integrated GPU sharing Host Memory: %s\n", prop.integrated ? "Yes" : "No"); - printf(" Support host page-locked memory mapping: %s\n", prop.canMapHostMemory ? "Yes" : "No"); - - printf(" Concurrent kernel execution: %s\n", prop.concurrentKernels ? "Yes" : "No"); - printf(" Alignment requirement for Surfaces: %s\n", prop.surfaceAlignment ? "Yes" : "No"); - printf(" Device has ECC support enabled: %s\n", prop.ECCEnabled ? 
"Yes" : "No"); - printf(" Device is using TCC driver mode: %s\n", prop.tccDriver ? "Yes" : "No"); - printf(" Device supports Unified Addressing (UVA): %s\n", prop.unifiedAddressing ? "Yes" : "No"); - printf(" Device PCI Bus ID / PCI location ID: %d / %d\n", prop.pciBusID, prop.pciDeviceID ); - printf(" Compute Mode:\n"); - printf(" %s \n", computeMode[prop.computeMode]); - } - - printf("\n"); - printf("deviceQuery, CUDA Driver = CUDART"); - printf(", CUDA Driver Version = %d.%d", driverVersion / 1000, driverVersion % 100); - printf(", CUDA Runtime Version = %d.%d", runtimeVersion/1000, runtimeVersion%100); - printf(", NumDevs = %d\n\n", count); - fflush(stdout); - } - - void printShortCudaDeviceInfo(int device) const - { - int count = getCudaEnabledDeviceCount(); - bool valid = (device >= 0) && (device < count); - - int beg = valid ? device : 0; - int end = valid ? device+1 : count; - - int driverVersion = 0, runtimeVersion = 0; - cudaSafeCall( cudaDriverGetVersion(&driverVersion) ); - cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) ); - - for(int dev = beg; dev < end; ++dev) - { - cudaDeviceProp prop; - cudaSafeCall( cudaGetDeviceProperties(&prop, dev) ); - - const char *arch_str = prop.major < 2 ? " (not Fermi)" : ""; - printf("Device %d: \"%s\" %.0fMb", dev, prop.name, (float)prop.totalGlobalMem/1048576.0f); - printf(", sm_%d%d%s", prop.major, prop.minor, arch_str); - - int cores = convertSMVer2Cores(prop.major, prop.minor); - if (cores > 0) - printf(", %d cores", cores * prop.multiProcessorCount); - - printf(", Driver/Runtime ver.%d.%d/%d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100); - } - fflush(stdout); - } - - private: - int device_id_; - - std::string name_; - int multi_processor_count_; - int majorVersion_; - int minorVersion_; - - const CudaArch cudaArch; - - int convertSMVer2Cores(int major, int minor) const - { - // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM - typedef struct { - int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version - int Cores; - } SMtoCores; - - SMtoCores gpuArchCoresPerSM[] = { { 0x10, 8 }, { 0x11, 8 }, { 0x12, 8 }, { 0x13, 8 }, { 0x20, 32 }, { 0x21, 48 }, {0x30, 192}, {0x35, 192}, { -1, -1 } }; - - int index = 0; - while (gpuArchCoresPerSM[index].SM != -1) - { - if (gpuArchCoresPerSM[index].SM == ((major << 4) + minor) ) - return gpuArchCoresPerSM[index].Cores; - index++; - } + int getCudaEnabledDeviceCount() const + { + int count; + cudaError_t error = cudaGetDeviceCount( &count ); + if (error == cudaErrorInsufficientDriver) return -1; - } - }; - class CudaFuncTable : public GpuFuncTable + if (error == cudaErrorNoDevice) + return 0; + + cudaSafeCall( error ); + return count; + } + + void setDevice(int device) const { - public: + cudaSafeCall( cudaSetDevice( device ) ); + } - void copy(const Mat& src, GpuMat& dst) const + int getDevice() const + { + int device; + cudaSafeCall( cudaGetDevice( &device ) ); + return device; + } + + void resetDevice() const + { + cudaSafeCall( cudaDeviceReset() ); + } + + bool builtWith(FeatureSet feature_set) const + { + return cudaArch.builtWith(feature_set); + } + + bool has(int major, int minor) const + { + return hasPtx(major, minor) || hasBin(major, minor); + } + + bool hasPtx(int major, int minor) const + { + return cudaArch.hasPtx(major, minor); + } + + bool hasBin(int major, int minor) const + { + return cudaArch.hasBin(major, minor); + } + + bool hasEqualOrLessPtx(int major, int minor) const 
+ { + return cudaArch.hasEqualOrLessPtx(major, minor); + } + + bool hasEqualOrGreater(int major, int minor) const + { + return hasEqualOrGreaterPtx(major, minor) || hasEqualOrGreaterBin(major, minor); + } + + bool hasEqualOrGreaterPtx(int major, int minor) const + { + return cudaArch.hasEqualOrGreaterPtx(major, minor); + } + + bool hasEqualOrGreaterBin(int major, int minor) const + { + return cudaArch.hasEqualOrGreaterBin(major, minor); + } + + bool deviceSupports(FeatureSet feature_set) const + { + static int versions[] = { - cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyHostToDevice) ); - } - void copy(const GpuMat& src, Mat& dst) const + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 + }; + static const int cache_size = static_cast(sizeof(versions) / sizeof(versions[0])); + + const int devId = getDevice(); + + int version; + + if (devId < cache_size && versions[devId] >= 0) + version = versions[devId]; + else { - cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToHost) ); - } - void copy(const GpuMat& src, GpuMat& dst) const - { - cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToDevice) ); + DeviceInfo dev(devId); + version = dev.majorVersion() * 10 + dev.minorVersion(); + if (devId < cache_size) + versions[devId] = version; } - void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const - { - CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); - CV_Assert(src.size() == dst.size() && src.type() == dst.type()); - CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels())); + return TargetArchs::builtWith(feature_set) && (version >= feature_set); + } - if (src.depth() == CV_64F) + void printCudaDeviceInfo(int device) const + { + int count = getCudaEnabledDeviceCount(); + bool valid = (device >= 0) && (device < count); + + int beg = valid ? device : 0; + int end = valid ? 
device+1 : count; + + printf("*** CUDA Device Query (Runtime API) version (CUDART static linking) *** \n\n"); + printf("Device count: %d\n", count); + + int driverVersion = 0, runtimeVersion = 0; + cudaSafeCall( cudaDriverGetVersion(&driverVersion) ); + cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) ); + + const char *computeMode[] = { + "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)", + "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)", + "Prohibited (no host thread can use ::cudaSetDevice() with this device)", + "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)", + "Unknown", + NULL + }; + + for(int dev = beg; dev < end; ++dev) + { + cudaDeviceProp prop; + cudaSafeCall( cudaGetDeviceProperties(&prop, dev) ); + + printf("\nDevice %d: \"%s\"\n", dev, prop.name); + printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100); + printf(" CUDA Capability Major/Minor version number: %d.%d\n", prop.major, prop.minor); + printf(" Total amount of global memory: %.0f MBytes (%llu bytes)\n", (float)prop.totalGlobalMem/1048576.0f, (unsigned long long) prop.totalGlobalMem); + + int cores = convertSMVer2Cores(prop.major, prop.minor); + if (cores > 0) + printf(" (%2d) Multiprocessors x (%2d) CUDA Cores/MP: %d CUDA Cores\n", prop.multiProcessorCount, cores, cores * prop.multiProcessorCount); + + printf(" GPU Clock Speed: %.2f GHz\n", prop.clockRate * 1e-6f); + + printf(" Max Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n", + prop.maxTexture1D, prop.maxTexture2D[0], prop.maxTexture2D[1], + prop.maxTexture3D[0], prop.maxTexture3D[1], prop.maxTexture3D[2]); + printf(" Max Layered Texture Size (dim) x layers 1D=(%d) x %d, 2D=(%d,%d) x %d\n", + prop.maxTexture1DLayered[0], prop.maxTexture1DLayered[1], + prop.maxTexture2DLayered[0], prop.maxTexture2DLayered[1], prop.maxTexture2DLayered[2]); + + printf(" Total amount of constant memory: %u bytes\n", (int)prop.totalConstMem); + printf(" Total amount of shared memory per block: %u bytes\n", (int)prop.sharedMemPerBlock); + printf(" Total number of registers available per block: %d\n", prop.regsPerBlock); + printf(" Warp size: %d\n", prop.warpSize); + printf(" Maximum number of threads per block: %d\n", prop.maxThreadsPerBlock); + printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]); + printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]); + printf(" Maximum memory pitch: %u bytes\n", (int)prop.memPitch); + printf(" Texture alignment: %u bytes\n", (int)prop.textureAlignment); + + printf(" Concurrent copy and execution: %s with %d copy engine(s)\n", (prop.deviceOverlap ? "Yes" : "No"), prop.asyncEngineCount); + printf(" Run time limit on kernels: %s\n", prop.kernelExecTimeoutEnabled ? "Yes" : "No"); + printf(" Integrated GPU sharing Host Memory: %s\n", prop.integrated ? "Yes" : "No"); + printf(" Support host page-locked memory mapping: %s\n", prop.canMapHostMemory ? "Yes" : "No"); + + printf(" Concurrent kernel execution: %s\n", prop.concurrentKernels ? "Yes" : "No"); + printf(" Alignment requirement for Surfaces: %s\n", prop.surfaceAlignment ? "Yes" : "No"); + printf(" Device has ECC support enabled: %s\n", prop.ECCEnabled ? 
"Yes" : "No"); + printf(" Device is using TCC driver mode: %s\n", prop.tccDriver ? "Yes" : "No"); + printf(" Device supports Unified Addressing (UVA): %s\n", prop.unifiedAddressing ? "Yes" : "No"); + printf(" Device PCI Bus ID / PCI location ID: %d / %d\n", prop.pciBusID, prop.pciDeviceID ); + printf(" Compute Mode:\n"); + printf(" %s \n", computeMode[prop.computeMode]); + } + + printf("\n"); + printf("deviceQuery, CUDA Driver = CUDART"); + printf(", CUDA Driver Version = %d.%d", driverVersion / 1000, driverVersion % 100); + printf(", CUDA Runtime Version = %d.%d", runtimeVersion/1000, runtimeVersion%100); + printf(", NumDevs = %d\n\n", count); + fflush(stdout); + } + + void printShortCudaDeviceInfo(int device) const + { + int count = getCudaEnabledDeviceCount(); + bool valid = (device >= 0) && (device < count); + + int beg = valid ? device : 0; + int end = valid ? device+1 : count; + + int driverVersion = 0, runtimeVersion = 0; + cudaSafeCall( cudaDriverGetVersion(&driverVersion) ); + cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) ); + + for(int dev = beg; dev < end; ++dev) + { + cudaDeviceProp prop; + cudaSafeCall( cudaGetDeviceProperties(&prop, dev) ); + + const char *arch_str = prop.major < 2 ? " (not Fermi)" : ""; + printf("Device %d: \"%s\" %.0fMb", dev, prop.name, (float)prop.totalGlobalMem/1048576.0f); + printf(", sm_%d%d%s", prop.major, prop.minor, arch_str); + + int cores = convertSMVer2Cores(prop.major, prop.minor); + if (cores > 0) + printf(", %d cores", cores * prop.multiProcessorCount); + + printf(", Driver/Runtime ver.%d.%d/%d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100); + } + fflush(stdout); + } + +private: + int device_id_; + + std::string name_; + int multi_processor_count_; + int majorVersion_; + int minorVersion_; + + const CudaArch cudaArch; + + int convertSMVer2Cores(int major, int minor) const + { + // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM + typedef struct { + int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version + int Cores; + } SMtoCores; + + SMtoCores gpuArchCoresPerSM[] = { { 0x10, 8 }, { 0x11, 8 }, { 0x12, 8 }, { 0x13, 8 }, { 0x20, 32 }, { 0x21, 48 }, {0x30, 192}, {0x35, 192}, { -1, -1 } }; + + int index = 0; + while (gpuArchCoresPerSM[index].SM != -1) + { + if (gpuArchCoresPerSM[index].SM == ((major << 4) + minor) ) + return gpuArchCoresPerSM[index].Cores; + index++; + } + + return -1; + } +}; + +class CudaFuncTable : public GpuFuncTable +{ +public: + + void copy(const Mat& src, GpuMat& dst) const + { + cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyHostToDevice) ); + } + + void copy(const GpuMat& src, Mat& dst) const + { + cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToHost) ); + } + + void copy(const GpuMat& src, GpuMat& dst) const + { + cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToDevice) ); + } + + void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const + { + CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); + CV_Assert(src.size() == dst.size() && src.type() == dst.type()); + CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels())); + + if (src.depth() == CV_64F) + { + if (!TargetArchs::builtWith(NATIVE_DOUBLE) || 
!DeviceInfo().supports(NATIVE_DOUBLE)) + CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); + } + + typedef void (*func_t)(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream); + static const func_t funcs[7][4] = + { + /* 8U */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, + /* 8S */ {cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask }, + /* 16U */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, + /* 16S */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, + /* 32S */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, + /* 32F */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, + /* 64F */ {cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask } + }; + + const func_t func = mask.channels() == src.channels() ? funcs[src.depth()][src.channels() - 1] : cv::gpu::device::copyWithMask; + + func(src, dst, mask, 0); + } + + void convert(const GpuMat& src, GpuMat& dst) const + { + typedef void (*func_t)(const GpuMat& src, GpuMat& dst); + static const func_t funcs[7][7][4] = + { { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) - CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); + /* 8U -> 8U */ {0, 0, 0, 0}, + /* 8U -> 8S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 8U -> 16U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt::call}, + /* 8U -> 16S */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt::call}, + /* 8U -> 32S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 8U -> 32F */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 8U -> 64F */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo } + }, + { + /* 8S -> 8U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 8S -> 8S */ {0,0,0,0}, + /* 8S -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 8S -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 8S -> 32S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 8S -> 32F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 8S -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo} + }, + { + /* 16U -> 8U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt::call}, + /* 16U -> 8S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 16U -> 16U */ {0,0,0,0}, + /* 16U -> 16S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, 
cv::gpu::device::convertTo }, + /* 16U -> 32S */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 16U -> 32F */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 16U -> 64F */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo } + }, + { + /* 16S -> 8U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt::call}, + /* 16S -> 8S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 16S -> 16U */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 16S -> 16S */ {0,0,0,0}, + /* 16S -> 32S */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 16S -> 32F */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, + /* 16S -> 64F */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo } + }, + { + /* 32S -> 8U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32S -> 8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32S -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32S -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32S -> 32S */ {0,0,0,0}, + /* 32S -> 32F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32S -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo} + }, + { + /* 32F -> 8U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32F -> 8S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32F -> 16U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32F -> 16S */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32F -> 32S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 32F -> 32F */ {0,0,0,0}, + /* 32F -> 64F */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo} + }, + { + /* 64F -> 8U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 64F -> 8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 64F -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 64F -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 64F -> 32S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 64F -> 32F */ {cv::gpu::device::convertTo, 
cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, + /* 64F -> 64F */ {0,0,0,0} } + }; - typedef void (*func_t)(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream); - static const func_t funcs[7][4] = - { - /* 8U */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 8S */ {cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask }, - /* 16U */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 16S */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 32S */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 32F */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 64F */ {cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask } - }; + CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); + CV_Assert(dst.depth() <= CV_64F); + CV_Assert(src.size() == dst.size() && src.channels() == dst.channels()); - const func_t func = mask.channels() == src.channels() ? funcs[src.depth()][src.channels() - 1] : cv::gpu::device::copyWithMask; - - func(src, dst, mask, 0); + if (src.depth() == CV_64F || dst.depth() == CV_64F) + { + if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) + CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); } - void convert(const GpuMat& src, GpuMat& dst) const + bool aligned = isAligned(src.data, 16) && isAligned(dst.data, 16); + if (!aligned) { - typedef void (*func_t)(const GpuMat& src, GpuMat& dst); - static const func_t funcs[7][7][4] = - { - { - /* 8U -> 8U */ {0, 0, 0, 0}, - /* 8U -> 8S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, - /* 8U -> 16U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt::call}, - /* 8U -> 16S */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt::call}, - /* 8U -> 32S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, - /* 8U -> 32F */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, - /* 8U -> 64F */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo } - }, - { - /* 8S -> 8U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, - /* 8S -> 8S */ {0,0,0,0}, - /* 8S -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, - /* 8S -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, - /* 8S -> 32S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, - /* 8S -> 32F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, - /* 8S -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo} - }, - { - /* 16U -> 8U */ {NppCvt::call, cv::gpu::device::convertTo, 
cv::gpu::device::convertTo, NppCvt::call}, - /* 16U -> 8S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, - /* 16U -> 16U */ {0,0,0,0}, - /* 16U -> 16S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, - /* 16U -> 32S */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, - /* 16U -> 32F */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, - /* 16U -> 64F */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo } - }, - { - /* 16S -> 8U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, NppCvt::call}, - /* 16S -> 8S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, - /* 16S -> 16U */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, - /* 16S -> 16S */ {0,0,0,0}, - /* 16S -> 32S */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, - /* 16S -> 32F */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo }, - /* 16S -> 64F */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo } - }, - { - /* 32S -> 8U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, - /* 32S -> 8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, - /* 32S -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, - /* 32S -> 16S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, - /* 32S -> 32S */ {0,0,0,0}, - /* 32S -> 32F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, - /* 32S -> 64F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo} - }, - { - /* 32F -> 8U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, - /* 32F -> 8S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, - /* 32F -> 16U */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, - /* 32F -> 16S */ {NppCvt::call, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, - /* 32F -> 32S */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, - /* 32F -> 32F */ {0,0,0,0}, - /* 32F -> 64F */ {cv::gpu::device::convertTo , cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo} - }, - { - /* 64F -> 8U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, - /* 64F -> 8S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, - /* 64F -> 16U */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, - /* 64F -> 16S 
*/ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, - /* 64F -> 32S */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, - /* 64F -> 32F */ {cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo, cv::gpu::device::convertTo}, - /* 64F -> 64F */ {0,0,0,0} - } - }; + cv::gpu::device::convertTo(src, dst); + return; + } - CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); - CV_Assert(dst.depth() <= CV_64F); - CV_Assert(src.size() == dst.size() && src.channels() == dst.channels()); + const func_t func = funcs[src.depth()][dst.depth()][src.channels() - 1]; + CV_DbgAssert(func != 0); - if (src.depth() == CV_64F || dst.depth() == CV_64F) - { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) - CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); - } + func(src, dst); + } - bool aligned = isAligned(src.data, 16) && isAligned(dst.data, 16); - if (!aligned) + void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream) const + { + CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); + CV_Assert(dst.depth() <= CV_64F); + + if (src.depth() == CV_64F || dst.depth() == CV_64F) + { + if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) + CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); + } + + cv::gpu::device::convertTo(src, dst, alpha, beta, stream); + } + + void setTo(GpuMat& m, Scalar s, const GpuMat& mask, cudaStream_t stream) const + { + if (mask.empty()) + { + if (s[0] == 0.0 && s[1] == 0.0 && s[2] == 0.0 && s[3] == 0.0) { - cv::gpu::device::convertTo(src, dst); + cudaSafeCall( cudaMemset2D(m.data, m.step, 0, m.cols * m.elemSize(), m.rows) ); return; } - const func_t func = funcs[src.depth()][dst.depth()][src.channels() - 1]; - CV_DbgAssert(func != 0); - - func(src, dst); - } - - void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream) const - { - CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); - CV_Assert(dst.depth() <= CV_64F); - - if (src.depth() == CV_64F || dst.depth() == CV_64F) + if (m.depth() == CV_8U) { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) - CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); - } + int cn = m.channels(); - cv::gpu::device::convertTo(src, dst, alpha, beta, stream); - } - - void setTo(GpuMat& m, Scalar s, const GpuMat& mask, cudaStream_t stream) const - { - if (mask.empty()) - { - if (s[0] == 0.0 && s[1] == 0.0 && s[2] == 0.0 && s[3] == 0.0) + if (cn == 1 || (cn == 2 && s[0] == s[1]) || (cn == 3 && s[0] == s[1] && s[0] == s[2]) || (cn == 4 && s[0] == s[1] && s[0] == s[2] && s[0] == s[3])) { - cudaSafeCall( cudaMemset2D(m.data, m.step, 0, m.cols * m.elemSize(), m.rows) ); + int val = saturate_cast(s[0]); + cudaSafeCall( cudaMemset2D(m.data, m.step, val, m.cols * m.elemSize(), m.rows) ); return; } - - if (m.depth() == CV_8U) - { - int cn = m.channels(); - - if (cn == 1 || (cn == 2 && s[0] == s[1]) || (cn == 3 && s[0] == s[1] && s[0] == s[2]) || (cn == 4 && s[0] == s[1] && s[0] == s[2] && s[0] == s[3])) - { - int val = saturate_cast(s[0]); - cudaSafeCall( cudaMemset2D(m.data, m.step, val, m.cols * m.elemSize(), m.rows) ); - return; - } - } - - typedef void (*func_t)(GpuMat& src, Scalar s); - static const func_t funcs[7][4] = - { - {NppSet::call, 
            typedef void (*func_t)(GpuMat& src, Scalar s);
            static const func_t funcs[7][4] =
            {
                {NppSet<CV_8U, 1, nppiSet_8u_C1R>::call, cv::gpu::device::setTo<uchar>, cv::gpu::device::setTo<uchar>, NppSet<CV_8U, 4, nppiSet_8u_C4R>::call},
                {cv::gpu::device::setTo<schar>, cv::gpu::device::setTo<schar>, cv::gpu::device::setTo<schar>, cv::gpu::device::setTo<schar>},
                {NppSet<CV_16U, 1, nppiSet_16u_C1R>::call, NppSet<CV_16U, 2, nppiSet_16u_C2R>::call, cv::gpu::device::setTo<ushort>, NppSet<CV_16U, 4, nppiSet_16u_C4R>::call},
                {NppSet<CV_16S, 1, nppiSet_16s_C1R>::call, NppSet<CV_16S, 2, nppiSet_16s_C2R>::call, cv::gpu::device::setTo<short>, NppSet<CV_16S, 4, nppiSet_16s_C4R>::call},
                {NppSet<CV_32S, 1, nppiSet_32s_C1R>::call, cv::gpu::device::setTo<int>, cv::gpu::device::setTo<int>, NppSet<CV_32S, 4, nppiSet_32s_C4R>::call},
                {NppSet<CV_32F, 1, nppiSet_32f_C1R>::call, cv::gpu::device::setTo<float>, cv::gpu::device::setTo<float>, NppSet<CV_32F, 4, nppiSet_32f_C4R>::call},
                {cv::gpu::device::setTo<double>, cv::gpu::device::setTo<double>, cv::gpu::device::setTo<double>, cv::gpu::device::setTo<double>}
            };

            CV_Assert(m.depth() <= CV_64F && m.channels() <= 4);

            if (m.depth() == CV_64F)
            {
                if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
                    CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
            }

            if (stream)
                cv::gpu::device::setTo(m, s, stream);
            else
                funcs[m.depth()][m.channels() - 1](m, s);
        }
        else
        {
            typedef void (*func_t)(GpuMat& src, Scalar s, const GpuMat& mask);
            static const func_t funcs[7][4] =
            {
                {NppSetMask<CV_8U, 1, nppiSet_8u_C1MR>::call, cv::gpu::device::setTo<uchar>, cv::gpu::device::setTo<uchar>, NppSetMask<CV_8U, 4, nppiSet_8u_C4MR>::call},
                {cv::gpu::device::setTo<schar>, cv::gpu::device::setTo<schar>, cv::gpu::device::setTo<schar>, cv::gpu::device::setTo<schar>},
                {NppSetMask<CV_16U, 1, nppiSet_16u_C1MR>::call, cv::gpu::device::setTo<ushort>, cv::gpu::device::setTo<ushort>, NppSetMask<CV_16U, 4, nppiSet_16u_C4MR>::call},
                {NppSetMask<CV_16S, 1, nppiSet_16s_C1MR>::call, cv::gpu::device::setTo<short>, cv::gpu::device::setTo<short>, NppSetMask<CV_16S, 4, nppiSet_16s_C4MR>::call},
                {NppSetMask<CV_32S, 1, nppiSet_32s_C1MR>::call, cv::gpu::device::setTo<int>, cv::gpu::device::setTo<int>, NppSetMask<CV_32S, 4, nppiSet_32s_C4MR>::call},
                {NppSetMask<CV_32F, 1, nppiSet_32f_C1MR>::call, cv::gpu::device::setTo<float>, cv::gpu::device::setTo<float>, NppSetMask<CV_32F, 4, nppiSet_32f_C4MR>::call},
                {cv::gpu::device::setTo<double>, cv::gpu::device::setTo<double>, cv::gpu::device::setTo<double>, cv::gpu::device::setTo<double>}
            };

            CV_Assert(m.depth() <= CV_64F && m.channels() <= 4);

            if (m.depth() == CV_64F)
            {
                if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
                    CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
            }

            if (stream)
                cv::gpu::device::setTo(m, s, mask, stream);
            else
                funcs[m.depth()][m.channels() - 1](m, s, mask);
        }
    }

    void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const
    {
        cudaSafeCall( cudaMallocPitch(devPtr, step, width, height) );
    }

    void free(void* devPtr) const
    {
        cudaFree(devPtr);
    }
};
#endif
#endif
\ No newline at end of file
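
The function tables reconstructed above are the heart of the dispatch scheme: one entry per (depth, channel count) combination, 0 for identity or unsupported cases, and NPP fast paths mixed with generic device kernels. A minimal self-contained sketch of the same pattern; setViaNpp, setGeneric and dispatchSet are illustrative names, not OpenCV API:

    #include <cstdio>

    // One handler per (depth, channels) pair; a null entry would mark an unsupported combination.
    typedef void (*set_func_t)(void* data, double value);

    static void setGeneric(void* data, double value) { (void)data; std::printf("generic kernel, value=%f\n", value); }
    static void setViaNpp(void* data, double value)  { (void)data; std::printf("NPP fast path, value=%f\n", value); }

    // Mirrors funcs[7][4]: 7 depths (CV_8U..CV_64F) by 4 channel counts.
    static const set_func_t table[7][4] =
    {
        {setViaNpp,  setGeneric, setGeneric, setViaNpp },  // CV_8U
        {setGeneric, setGeneric, setGeneric, setGeneric},  // CV_8S
        {setViaNpp,  setViaNpp,  setGeneric, setViaNpp },  // CV_16U
        {setViaNpp,  setViaNpp,  setGeneric, setViaNpp },  // CV_16S
        {setViaNpp,  setGeneric, setGeneric, setViaNpp },  // CV_32S
        {setViaNpp,  setGeneric, setGeneric, setViaNpp },  // CV_32F
        {setGeneric, setGeneric, setGeneric, setGeneric}   // CV_64F
    };

    static void dispatchSet(int depth, int cn, void* data, double value)
    {
        table[depth][cn - 1](data, value); // channel counts are 1-based, the array is 0-based
    }

    int main()
    {
        dispatchSet(0 /* CV_8U */, 3, 0, 255.0);
        return 0;
    }
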
diff --git a/modules/dynamicuda/src/main.cpp b/modules/dynamicuda/src/main.cpp
index 4a05d8696..8eb66fd98 100644
--- a/modules/dynamicuda/src/main.cpp
+++ b/modules/dynamicuda/src/main.cpp
@@ -39,6 +39,9 @@ static EmptyFuncTable gpuTable;
 
 extern "C" {
 
+DeviceInfoFuncTable* deviceInfoFactory();
+GpuFuncTable* gpuFactory();
+
 DeviceInfoFuncTable* deviceInfoFactory()
 {
     return (DeviceInfoFuncTable*)&deviceInfoTable;
diff --git a/modules/java/CMakeLists.txt b/modules/java/CMakeLists.txt
index 291295fb5..3a6ebe836 100644
--- a/modules/java/CMakeLists.txt
+++ b/modules/java/CMakeLists.txt
@@ -297,7 +297,7 @@ if(BUILD_FAT_JAVA_LIB)
         list(REMOVE_ITEM __deps ${m})
       endif()
     endforeach()
-    if (HAVE_opencv_dynamicuda)
+    if (ENABLE_DYNAMIC_CUDA)
       list(REMOVE_ITEM __deps "opencv_dynamicuda")
     endif()
     if (ANDROID AND HAVE_opencv_gpu)
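
The extern "C" factory declarations added above are the symbols opencv_core resolves at runtime when CUDA support lives in the separate dynamicuda library. A simplified sketch of the loading side, assuming POSIX dlopen and the library name used on Android; the real loader also caches the handle and reports lookup failures, and loadGpuTable itself is illustrative:

    #include <dlfcn.h>
    #include <cstdio>

    class DeviceInfoFuncTable;
    class GpuFuncTable;

    typedef DeviceInfoFuncTable* (*DeviceInfoFactoryType)();
    typedef GpuFuncTable* (*GpuFactoryType)();

    static GpuFuncTable* loadGpuTable()
    {
        // The CUDA backend is an optional shared library; absence is not an error.
        void* handle = dlopen("libopencv_dynamicuda.so", RTLD_LAZY);
        if (!handle)
            return 0; // caller falls back to the built-in EmptyFuncTable

        GpuFactoryType factory = (GpuFactoryType)dlsym(handle, "gpuFactory");
        if (!factory)
        {
            std::printf("gpuFactory symbol not found\n");
            dlclose(handle);
            return 0;
        }
        return factory();
    }
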
From 2509fa8080962256e31b178e67d1b404341eb537 Mon Sep 17 00:00:00 2001
From: Alexander Smorkalov
Date: Thu, 19 Dec 2013 18:02:59 +0400
Subject: [PATCH 10/13] Various fixes for the case where HAVE_CUDA==OFF.

---
 modules/core/CMakeLists.txt                   |  4 ----
 modules/core/src/gpumat.cpp                   | 22 ++++++-------------
 modules/dynamicuda/CMakeLists.txt             |  2 +-
 .../include/opencv2/dynamicuda/dynamicuda.hpp | 19 ++++++++++++----
 4 files changed, 23 insertions(+), 24 deletions(-)

diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt
index f20e32d3a..2409ee9e9 100644
--- a/modules/core/CMakeLists.txt
+++ b/modules/core/CMakeLists.txt
@@ -1,12 +1,8 @@
 set(the_description "The Core Functionality")
 
-message(STATUS "ENABLE_DYNAMIC_CUDA ${ENABLE_DYNAMIC_CUDA}")
-
 if (ENABLE_DYNAMIC_CUDA)
-  message(STATUS "Using dynamic cuda approach")
   ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES})
 else()
-  message(STATUS "Link CUDA statically")
   ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES} ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})
 endif()
 
diff --git a/modules/core/src/gpumat.cpp b/modules/core/src/gpumat.cpp
index 590685b74..17d46abcc 100644
--- a/modules/core/src/gpumat.cpp
+++ b/modules/core/src/gpumat.cpp
@@ -44,7 +44,7 @@
 #include "opencv2/core/gpumat.hpp"
 #include <iostream>
 
-#if defined(HAVE_CUDA) || defined(DYNAMIC_CUDA_SUPPORT)
+#if defined(HAVE_CUDA)
 
 #include <cuda_runtime.h>
 #include <npp.h>
@@ -273,8 +273,6 @@ void cv::gpu::DeviceInfo::query() { deviceInfoFuncTable()->query(); }
 void cv::gpu::printCudaDeviceInfo(int device) { deviceInfoFuncTable()->printCudaDeviceInfo(device); }
 void cv::gpu::printShortCudaDeviceInfo(int device) { deviceInfoFuncTable()->printShortCudaDeviceInfo(device); }
 
-#ifdef HAVE_CUDA
-
 namespace cv { namespace gpu
 {
     CV_EXPORTS void copyWithMask(const cv::gpu::GpuMat&, cv::gpu::GpuMat&, const cv::gpu::GpuMat&, cudaStream_t);
@@ -286,8 +284,6 @@ namespace cv { namespace gpu
     CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&);
 }}
 
-#endif
-
 //////////////////////////////// GpuMat ///////////////////////////////
 
 cv::gpu::GpuMat::GpuMat(const GpuMat& m)
@@ -707,43 +703,39 @@ void cv::gpu::GpuMat::release()
     refcount = 0;
 }
 
-#ifdef HAVE_CUDA
-
 namespace cv { namespace gpu
 {
     void convertTo(const GpuMat& src, GpuMat& dst)
     {
         gpuFuncTable()->convert(src, dst);
     }
-    
+
     void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream)
     {
         gpuFuncTable()->convert(src, dst, alpha, beta, stream);
     }
-    
+
     void setTo(GpuMat& src, Scalar s, cudaStream_t stream)
     {
         gpuFuncTable()->setTo(src, s, cv::gpu::GpuMat(), stream);
     }
-    
+
     void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream)
     {
-        gpuFuncTable()->setTo(src, s, mask, stream); 
+        gpuFuncTable()->setTo(src, s, mask, stream);
     }
-    
+
     void setTo(GpuMat& src, Scalar s)
    {
         setTo(src, s, 0);
     }
-    
+
     void setTo(GpuMat& src, Scalar s, const GpuMat& mask)
     {
         setTo(src, s, mask, 0);
     }
 }}
 
-#endif
-
 ////////////////////////////////////////////////////////////////////////
 // Error handling
 
diff --git a/modules/dynamicuda/CMakeLists.txt b/modules/dynamicuda/CMakeLists.txt
index def05d19b..031b5e48d 100644
--- a/modules/dynamicuda/CMakeLists.txt
+++ b/modules/dynamicuda/CMakeLists.txt
@@ -1,4 +1,4 @@
-if(NOT ANDROID)
+if(NOT ANDROID OR NOT HAVE_CUDA)
   ocv_module_disable(dynamicuda)
 endif()
 
diff --git a/modules/dynamicuda/include/opencv2/dynamicuda/dynamicuda.hpp b/modules/dynamicuda/include/opencv2/dynamicuda/dynamicuda.hpp
index 4f5175513..c5057ab99 100644
--- a/modules/dynamicuda/include/opencv2/dynamicuda/dynamicuda.hpp
+++ b/modules/dynamicuda/include/opencv2/dynamicuda/dynamicuda.hpp
@@ -1,6 +1,10 @@
 #ifndef __GPUMAT_CUDA_HPP__
 #define __GPUMAT_CUDA_HPP__
 
+#ifndef HAVE_CUDA
+typedef void* cudaStream_t;
+#endif
+
 class
DeviceInfoFuncTable { public: @@ -56,7 +60,7 @@ public: virtual void convert(const GpuMat& src, GpuMat& dst) const = 0; // for gpu::device::setTo funcs - virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, CUstream_st*) const = 0; + virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, cudaStream_t) const = 0; virtual void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const = 0; virtual void free(void* devPtr) const = 0; @@ -96,8 +100,15 @@ public: bool hasEqualOrGreaterPtx(int, int) const { throw_nogpu; return false; } bool hasEqualOrGreaterBin(int, int) const { throw_nogpu; return false; } - void printCudaDeviceInfo(int) const { throw_nogpu; } - void printShortCudaDeviceInfo(int) const { throw_nogpu; } + void printCudaDeviceInfo(int) const + { + printf("The library is compiled without CUDA support\n"); + } + + void printShortCudaDeviceInfo(int) const + { + printf("The library is compiled without CUDA support\n"); + } }; class EmptyFuncTable : public GpuFuncTable @@ -113,7 +124,7 @@ public: void convert(const GpuMat&, GpuMat&) const { throw_nogpu; } void convert(const GpuMat&, GpuMat&, double, double, cudaStream_t stream = 0) const { (void)stream; throw_nogpu; } - virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, CUstream_st*) const { throw_nogpu; } + virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, cudaStream_t) const { throw_nogpu; } void mallocPitch(void**, size_t*, size_t, size_t) const { throw_nogpu; } void free(void*) const {} From 069f3d8d9a1b5c500e56d4547cf42105542efb62 Mon Sep 17 00:00:00 2001 From: Alexander Smorkalov Date: Thu, 19 Dec 2013 18:36:02 +0400 Subject: [PATCH 11/13] Build fixes for GPU module. --- modules/core/src/gpumat.cpp | 2 +- modules/gpu/perf4au/CMakeLists.txt | 30 ++++++++++--------- modules/stitching/src/blenders.cpp | 6 ++-- modules/stitching/src/matchers.cpp | 10 +++---- modules/stitching/src/precomp.hpp | 2 +- modules/stitching/src/seam_finders.cpp | 2 +- modules/stitching/src/stitcher.cpp | 2 +- modules/stitching/src/warpers.cpp | 2 +- .../opencv2/videostab/optical_flow.hpp | 4 +-- modules/videostab/src/inpainting.cpp | 2 +- modules/videostab/src/optical_flow.cpp | 2 +- 11 files changed, 33 insertions(+), 31 deletions(-) diff --git a/modules/core/src/gpumat.cpp b/modules/core/src/gpumat.cpp index 17d46abcc..7a7b91d1d 100644 --- a/modules/core/src/gpumat.cpp +++ b/modules/core/src/gpumat.cpp @@ -752,5 +752,5 @@ void cv::gpu::error(const char *error_string, const char *file, const int line, cerr.flush(); } else - ::cv::error( ::cv::Exception(code, error_string, func, file, line) ); + cv::error( cv::Exception(code, error_string, func, file, line) ); } diff --git a/modules/gpu/perf4au/CMakeLists.txt b/modules/gpu/perf4au/CMakeLists.txt index 376e7b270..13efe7ffa 100644 --- a/modules/gpu/perf4au/CMakeLists.txt +++ b/modules/gpu/perf4au/CMakeLists.txt @@ -2,26 +2,28 @@ set(PERF4AU_REQUIRED_DEPS opencv_core opencv_imgproc opencv_highgui opencv_video ocv_check_dependencies(${PERF4AU_REQUIRED_DEPS}) -set(the_target gpu_perf4au) -project(${the_target}) +if (OCV_DEPENDENCIES_FOUND) + set(the_target gpu_perf4au) + project(${the_target}) -ocv_include_modules(${PERF4AU_REQUIRED_DEPS}) + ocv_include_modules(${PERF4AU_REQUIRED_DEPS}) -if(CMAKE_COMPILER_IS_GNUCXX AND NOT ENABLE_NOISY_WARNINGS) + if(CMAKE_COMPILER_IS_GNUCXX AND NOT ENABLE_NOISY_WARNINGS) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-unused-function") -endif() + endif() -file(GLOB srcs RELATIVE 
${CMAKE_CURRENT_SOURCE_DIR} *.cpp *.h *.hpp)
-add_executable(${the_target} ${srcs})
+  file(GLOB srcs RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.cpp *.h *.hpp)
+  add_executable(${the_target} ${srcs})
 
-target_link_libraries(${the_target} ${OPENCV_LINKER_LIBS} ${PERF4AU_REQUIRED_DEPS})
+  target_link_libraries(${the_target} ${OPENCV_LINKER_LIBS} ${PERF4AU_REQUIRED_DEPS})
 
-if(ENABLE_SOLUTION_FOLDERS)
-  set_target_properties(${the_target} PROPERTIES FOLDER "tests performance")
-endif()
+  if(ENABLE_SOLUTION_FOLDERS)
+    set_target_properties(${the_target} PROPERTIES FOLDER "tests performance")
+  endif()
 
-if(WIN32)
+  if(WIN32)
     if(MSVC AND NOT BUILD_SHARED_LIBS)
-      set_target_properties(${the_target} PROPERTIES LINK_FLAGS "/NODEFAULTLIB:atlthunk.lib /NODEFAULTLIB:atlsd.lib /DEBUG")
+      set_target_properties(${the_target} PROPERTIES LINK_FLAGS "/NODEFAULTLIB:atlthunk.lib /NODEFAULTLIB:atlsd.lib /DEBUG")
     endif()
-endif()
+  endif()
+endif()
\ No newline at end of file
diff --git a/modules/stitching/src/blenders.cpp b/modules/stitching/src/blenders.cpp
index e65023a55..fb3c0d666 100644
--- a/modules/stitching/src/blenders.cpp
+++ b/modules/stitching/src/blenders.cpp
@@ -189,7 +189,7 @@ Rect FeatherBlender::createWeightMaps(const vector<Mat> &masks, const vector<Point> &corners,
 
 void createLaplacePyrGpu(const Mat &img, int num_levels, vector<Mat> &pyr)
 {
-#ifdef HAVE_OPENCV_GPU
+#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
     pyr.resize(num_levels + 1);
 
     vector<gpu::GpuMat> gpu_pyr(num_levels + 1);
@@ -531,7 +531,7 @@ void restoreImageFromLaplacePyr(vector<Mat> &pyr)
 
 void restoreImageFromLaplacePyrGpu(vector<Mat> &pyr)
 {
-#ifdef HAVE_OPENCV_GPU
+#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
     if (pyr.empty())
         return;
 
diff --git a/modules/stitching/src/matchers.cpp b/modules/stitching/src/matchers.cpp
index d918cfff2..d86206233 100644
--- a/modules/stitching/src/matchers.cpp
+++ b/modules/stitching/src/matchers.cpp
@@ -46,7 +46,7 @@ using namespace std;
 using namespace cv;
 using namespace cv::detail;
 
-#ifdef HAVE_OPENCV_GPU
+#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
 using namespace cv::gpu;
 #endif
 
@@ -129,7 +129,7 @@ private:
     float match_conf_;
 };
 
-#ifdef HAVE_OPENCV_GPU
+#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
 class GpuMatcher : public FeaturesMatcher
 {
 public:
@@ -204,7 +204,7 @@ void CpuMatcher::match(const ImageFeatures &features1, const ImageFeatures &feat
     LOG("1->2 & 2->1 matches: " << matches_info.matches.size() << endl);
 }
 
-#ifdef HAVE_OPENCV_GPU
+#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
 void GpuMatcher::match(const ImageFeatures &features1, const ImageFeatures &features2, MatchesInfo& matches_info)
 {
     matches_info.matches.clear();
@@ -432,7 +432,7 @@ void OrbFeaturesFinder::find(const Mat &image, ImageFeatures &features)
     }
 }
 
-#if defined(HAVE_OPENCV_NONFREE) && defined(HAVE_OPENCV_GPU)
+#if defined(HAVE_OPENCV_NONFREE) && defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
 SurfFeaturesFinderGpu::SurfFeaturesFinderGpu(double hess_thresh, int num_octaves, int num_layers,
                                              int num_octaves_descr, int num_layers_descr)
 {
@@ -533,7 +533,7 @@ void FeaturesMatcher::operator ()(const vector<ImageFeatures> &features, vector<
 BestOf2NearestMatcher::BestOf2NearestMatcher(bool try_use_gpu, float match_conf, int num_matches_thresh1, int num_matches_thresh2)
 {
-#ifdef HAVE_OPENCV_GPU
+#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
     if (try_use_gpu && getCudaEnabledDeviceCount() > 0)
         impl_ = new GpuMatcher(match_conf);
     else
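
The BestOf2NearestMatcher constructor above shows the pattern this commit applies across the stitching module: a compile-time guard keeping GPU code out of Android builds, plus a runtime device-count check. Condensed into a sketch; Worker, CpuWorker and GpuWorker are placeholders, not OpenCV classes:

    #include "opencv2/opencv_modules.hpp"
    #include "opencv2/core/gpumat.hpp"

    struct Worker { virtual ~Worker() {} virtual void run() = 0; };
    struct CpuWorker : Worker { void run() {} };
    #if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
    struct GpuWorker : Worker { void run() {} };
    #endif

    Worker* createWorker(bool try_gpu)
    {
    #if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
        // The binary may carry GPU code yet still run on a machine without CUDA.
        if (try_gpu && cv::gpu::getCudaEnabledDeviceCount() > 0)
            return new GpuWorker();
    #endif
        (void)try_gpu;
        return new CpuWorker();
    }
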
diff --git a/modules/stitching/src/precomp.hpp b/modules/stitching/src/precomp.hpp
index 1050856d3..54b672143 100644
--- a/modules/stitching/src/precomp.hpp
+++ b/modules/stitching/src/precomp.hpp
@@ -68,7 +68,7 @@
 #include "opencv2/imgproc/imgproc.hpp"
 #include "opencv2/features2d/features2d.hpp"
 #include "opencv2/calib3d/calib3d.hpp"
-#ifdef HAVE_OPENCV_GPU
+#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
 #include "opencv2/gpu/gpu.hpp"
 
 #ifdef HAVE_OPENCV_NONFREE
diff --git a/modules/stitching/src/seam_finders.cpp b/modules/stitching/src/seam_finders.cpp
index 784209c93..a198c1ebb 100644
--- a/modules/stitching/src/seam_finders.cpp
+++ b/modules/stitching/src/seam_finders.cpp
@@ -1318,7 +1318,7 @@ void GraphCutSeamFinder::find(const vector<Mat> &src, const vector<Point> &corne
 }
 
-#ifdef HAVE_OPENCV_GPU
+#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
 void GraphCutSeamFinderGpu::find(const vector<Mat> &src, const vector<Point> &corners,
                                  vector<Mat> &masks)
 {
diff --git a/modules/stitching/src/stitcher.cpp b/modules/stitching/src/stitcher.cpp
index 5da26f6db..4a36ab0a4 100644
--- a/modules/stitching/src/stitcher.cpp
+++ b/modules/stitching/src/stitcher.cpp
@@ -58,7 +58,7 @@ Stitcher Stitcher::createDefault(bool try_use_gpu)
     stitcher.setFeaturesMatcher(new detail::BestOf2NearestMatcher(try_use_gpu));
     stitcher.setBundleAdjuster(new detail::BundleAdjusterRay());
 
-#ifdef HAVE_OPENCV_GPU
+#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
     if (try_use_gpu && gpu::getCudaEnabledDeviceCount() > 0)
     {
 #if defined(HAVE_OPENCV_NONFREE)
diff --git a/modules/stitching/src/warpers.cpp b/modules/stitching/src/warpers.cpp
index 932958c6f..935831950 100644
--- a/modules/stitching/src/warpers.cpp
+++ b/modules/stitching/src/warpers.cpp
@@ -212,7 +212,7 @@ void SphericalWarper::detectResultRoi(Size src_size, Point &dst_tl, Point &dst_b
 }
 
-#ifdef HAVE_OPENCV_GPU
+#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
 Rect PlaneWarperGpu::buildMaps(Size src_size, const Mat &K, const Mat &R, gpu::GpuMat &xmap, gpu::GpuMat &ymap)
 {
     return buildMaps(src_size, K, R, Mat::zeros(3, 1, CV_32F), xmap, ymap);
diff --git a/modules/videostab/include/opencv2/videostab/optical_flow.hpp b/modules/videostab/include/opencv2/videostab/optical_flow.hpp
index 18b7d3f28..2c1742fc7 100644
--- a/modules/videostab/include/opencv2/videostab/optical_flow.hpp
+++ b/modules/videostab/include/opencv2/videostab/optical_flow.hpp
@@ -46,7 +46,7 @@
 #include "opencv2/core/core.hpp"
 #include "opencv2/opencv_modules.hpp"
 
-#ifdef HAVE_OPENCV_GPU
+#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
 #  include "opencv2/gpu/gpu.hpp"
 #endif
 
@@ -98,7 +98,7 @@ public:
     OutputArray status, OutputArray errors);
 };
 
-#ifdef HAVE_OPENCV_GPU
+#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
 class CV_EXPORTS DensePyrLkOptFlowEstimatorGpu
     : public PyrLkOptFlowEstimatorBase, public IDenseOptFlowEstimator
 {
diff --git a/modules/videostab/src/inpainting.cpp b/modules/videostab/src/inpainting.cpp
index 4377c007c..c6568e071 100644
--- a/modules/videostab/src/inpainting.cpp
+++ b/modules/videostab/src/inpainting.cpp
@@ -323,7 +323,7 @@ public:
 
 MotionInpainter::MotionInpainter()
 {
-#ifdef HAVE_OPENCV_GPU
+#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
     setOptFlowEstimator(new DensePyrLkOptFlowEstimatorGpu());
 #else
     CV_Error(CV_StsNotImplemented, "Current implementation of MotionInpainter requires GPU");
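
MotionInpainter above still fails hard when built without a GPU implementation, while most other call sites in this commit degrade to a CPU path. One way to keep the repeated double guard readable is a small predicate; gpuUsable is a sketch, not part of the patch:

    #include "opencv2/opencv_modules.hpp"
    #if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
    #  include "opencv2/gpu/gpu.hpp"
    #endif

    static bool gpuUsable()
    {
    #if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
        // Compile-time guard passed; still verify a CUDA device exists at runtime.
        return cv::gpu::getCudaEnabledDeviceCount() > 0;
    #else
        return false;
    #endif
    }
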
diff --git a/modules/videostab/src/optical_flow.cpp b/modules/videostab/src/optical_flow.cpp
index 46100fdb5..3441df168 100644
--- a/modules/videostab/src/optical_flow.cpp
+++ b/modules/videostab/src/optical_flow.cpp
@@ -59,7 +59,7 @@ void SparsePyrLkOptFlowEstimator::run(
 }
 
-#ifdef HAVE_OPENCV_GPU
+#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
 DensePyrLkOptFlowEstimatorGpu::DensePyrLkOptFlowEstimatorGpu()
 {
     CV_Assert(gpu::getCudaEnabledDeviceCount() > 0);
 
From 529bd41751e526604726ccc9bff68a448693a3be Mon Sep 17 00:00:00 2001
From: Alexander Smorkalov
Date: Fri, 20 Dec 2013 09:46:03 +0400
Subject: [PATCH 12/13] Build fixes for the case where HAVE_CUDA==OFF.

---
 modules/core/CMakeLists.txt        | 14 ++++++++------
 modules/core/src/gpumat.cpp        |  2 +-
 samples/cpp/stitching_detailed.cpp |  8 ++++----
 3 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt
index 2409ee9e9..0d985f288 100644
--- a/modules/core/CMakeLists.txt
+++ b/modules/core/CMakeLists.txt
@@ -1,6 +1,6 @@
 set(the_description "The Core Functionality")
 
-if (ENABLE_DYNAMIC_CUDA)
+if (NOT HAVE_CUDA OR ENABLE_DYNAMIC_CUDA)
   ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES})
 else()
   ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES} ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})
@@ -15,7 +15,9 @@ endif()
 if(ENABLE_DYNAMIC_CUDA)
   add_definitions(-DDYNAMIC_CUDA_SUPPORT)
 else()
-  add_definitions(-DUSE_CUDA)
+  if (HAVE_CUDA)
+    add_definitions(-DUSE_CUDA)
+  endif()
 endif()
 
 if(HAVE_CUDA)
@@ -26,18 +28,18 @@ endif()
 file(GLOB lib_cuda_hdrs "include/opencv2/${name}/cuda/*.hpp" "include/opencv2/${name}/cuda/*.h")
 file(GLOB lib_cuda_hdrs_detail "include/opencv2/${name}/cuda/detail/*.hpp" "include/opencv2/${name}/cuda/detail/*.h")
 
-if (NOT ENABLE_DYNAMIC_CUDA)
-  file(GLOB lib_cuda "../dynamicuda/src/cuda/*.cu*")
+if (HAVE_CUDA AND NOT ENABLE_DYNAMIC_CUDA)
+  file(GLOB lib_cuda "../dynamicuda/src/cuda/*.cu*")
 endif()
 
 source_group("Cuda Headers" FILES ${lib_cuda_hdrs})
 source_group("Cuda Headers\\Detail" FILES ${lib_cuda_hdrs_detail})
 
-if (NOT ENABLE_DYNAMIC_CUDA)
+if (HAVE_CUDA AND NOT ENABLE_DYNAMIC_CUDA)
   source_group("Src\\Cuda" FILES ${lib_cuda} ${lib_cuda_hdrs})
 endif()
 
-if (ENABLE_DYNAMIC_CUDA)
+if (NOT HAVE_CUDA OR ENABLE_DYNAMIC_CUDA)
   ocv_glob_module_sources(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc"
                           HEADERS ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail})
 else()
diff --git a/modules/core/src/gpumat.cpp b/modules/core/src/gpumat.cpp
index 7a7b91d1d..310aabd58 100644
--- a/modules/core/src/gpumat.cpp
+++ b/modules/core/src/gpumat.cpp
@@ -229,7 +229,7 @@ static DeviceInfoFuncTable* deviceInfoFuncTable()
     static CudaDeviceInfoFuncTable impl;
     static DeviceInfoFuncTable* funcTable = &impl;
 #else
-    static EmptyFuncTable stub;
+    static EmptyDeviceInfoFuncTable stub;
     static DeviceInfoFuncTable* funcTable = &stub;
 #endif
 #endif
diff --git a/samples/cpp/stitching_detailed.cpp b/samples/cpp/stitching_detailed.cpp
index 49d86086d..7394a7282 100644
--- a/samples/cpp/stitching_detailed.cpp
+++ b/samples/cpp/stitching_detailed.cpp
@@ -355,7 +355,7 @@ int main(int argc, char* argv[])
     Ptr<FeaturesFinder> finder;
     if (features_type == "surf")
     {
-#if defined(HAVE_OPENCV_NONFREE) && defined(HAVE_OPENCV_GPU)
+#if defined(HAVE_OPENCV_NONFREE) && defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
         if (try_gpu && gpu::getCudaEnabledDeviceCount() > 0)
             finder = new SurfFeaturesFinderGpu();
         else
@@ -543,7 +543,7 @@ int main(int argc, char* argv[])
 
     // Warp images and their masks
     Ptr<WarperCreator> warper_creator;
-#ifdef HAVE_OPENCV_GPU
+#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID)
     if (try_gpu && gpu::getCudaEnabledDeviceCount() > 0)
     {
         if (warp_type == "plane") warper_creator = new cv::PlaneWarperGpu();
@@ -608,7 +608,7 @@ int main(int argc, char* argv[])
         seam_finder = new detail::VoronoiSeamFinder();
     else if (seam_find_type ==
"gc_color") { -#ifdef HAVE_OPENCV_GPU +#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID) if (try_gpu && gpu::getCudaEnabledDeviceCount() > 0) seam_finder = new detail::GraphCutSeamFinderGpu(GraphCutSeamFinderBase::COST_COLOR); else @@ -617,7 +617,7 @@ int main(int argc, char* argv[]) } else if (seam_find_type == "gc_colorgrad") { -#ifdef HAVE_OPENCV_GPU +#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID) if (try_gpu && gpu::getCudaEnabledDeviceCount() > 0) seam_finder = new detail::GraphCutSeamFinderGpu(GraphCutSeamFinderBase::COST_COLOR_GRAD); else From bc72f4d2a2bb75af19edeb6bf5ed0128b891a2cd Mon Sep 17 00:00:00 2001 From: Alexander Smorkalov Date: Fri, 20 Dec 2013 16:32:34 +0400 Subject: [PATCH 13/13] Code review fixes. --- CMakeLists.txt | 19 ++++++++++++++++++- modules/core/CMakeLists.txt | 6 ++++-- modules/core/include/opencv2/core/gpumat.hpp | 13 +++++-------- modules/core/src/gpumat.cpp | 15 +++++++++------ modules/dynamicuda/CMakeLists.txt | 4 ++-- .../include/opencv2/dynamicuda/dynamicuda.hpp | 4 ++-- modules/stitching/CMakeLists.txt | 6 +++++- .../opencv2/stitching/detail/seam_finders.hpp | 2 +- .../opencv2/stitching/detail/warpers.hpp | 4 ++-- .../include/opencv2/stitching/warpers.hpp | 2 +- modules/videostab/CMakeLists.txt | 6 +++++- 11 files changed, 54 insertions(+), 27 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2c5165c1e..06863804d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -201,7 +201,7 @@ OCV_OPTION(INSTALL_TO_MANGLED_PATHS "Enables mangled install paths, that help wi # OpenCV build options # =================================================== -OCV_OPTION(ENABLE_DYNAMIC_CUDA "Enabled dynamic CUDA linkage" ON IF ANDROID OR LINUX) +OCV_OPTION(ENABLE_DYNAMIC_CUDA "Enabled dynamic CUDA linkage" ON IF ANDROID ) OCV_OPTION(ENABLE_PRECOMPILED_HEADERS "Use precompiled headers" ON IF (NOT IOS) ) OCV_OPTION(ENABLE_SOLUTION_FOLDERS "Solution folder in Visual Studio or in other IDEs" (MSVC_IDE OR CMAKE_GENERATOR MATCHES Xcode) IF (CMAKE_VERSION VERSION_GREATER "2.8.0") ) OCV_OPTION(ENABLE_PROFILING "Enable profiling in the GCC compiler (Add flags: -g -pg)" OFF IF CMAKE_COMPILER_IS_GNUCXX ) @@ -459,6 +459,23 @@ if(WITH_OPENCL) include(cmake/OpenCVDetectOpenCL.cmake) endif() +# ---------------------------------------------------------------------------- +# Add CUDA libraries (needed for apps/tools, samples) +# ---------------------------------------------------------------------------- +if(NOT HAVE_CUDA) + set(ENABLE_DYNAMIC_CUDA OFF) +endif() + +if(HAVE_CUDA AND NOT ENABLE_DYNAMIC_CUDA) + set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY}) + if(HAVE_CUBLAS) + set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} ${CUDA_cublas_LIBRARY}) + endif() + if(HAVE_CUFFT) + set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} ${CUDA_cufft_LIBRARY}) + endif() +endif() + # ---------------------------------------------------------------------------- # Solution folders: # ---------------------------------------------------------------------------- diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt index 0d985f288..a1e71bf4f 100644 --- a/modules/core/CMakeLists.txt +++ b/modules/core/CMakeLists.txt @@ -28,8 +28,10 @@ endif() file(GLOB lib_cuda_hdrs "include/opencv2/${name}/cuda/*.hpp" "include/opencv2/${name}/cuda/*.h") file(GLOB lib_cuda_hdrs_detail "include/opencv2/${name}/cuda/detail/*.hpp" "include/opencv2/${name}/cuda/detail/*.h") -if (HAVE_CUDA AND NOT ENABLE_DYNAMIC_CUDA) +if(HAVE_CUDA AND NOT ENABLE_DYNAMIC_CUDA) file(GLOB 
lib_cuda "../dynamicuda/src/cuda/*.cu*") + ocv_include_directories(${CUDA_INCLUDE_DIRS}) + ocv_cuda_compile(cuda_objs ${lib_cuda}) endif() source_group("Cuda Headers" FILES ${lib_cuda_hdrs}) @@ -43,7 +45,7 @@ if (NOT HAVE_CUDA OR ENABLE_DYNAMIC_CUDA) ocv_glob_module_sources(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc" HEADERS ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail}) else() - ocv_glob_module_sources(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc" ${lib_cuda} + ocv_glob_module_sources(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc" ${lib_cuda} ${cuda_objs} HEADERS ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail}) endif() diff --git a/modules/core/include/opencv2/core/gpumat.hpp b/modules/core/include/opencv2/core/gpumat.hpp index d0f415ec3..193c9aa70 100644 --- a/modules/core/include/opencv2/core/gpumat.hpp +++ b/modules/core/include/opencv2/core/gpumat.hpp @@ -112,13 +112,13 @@ namespace cv { namespace gpu // Creates DeviceInfo object for the given GPU DeviceInfo(int device_id) : device_id_(device_id) { query(); } - std::string name() const; + std::string name() const { return name_; } // Return compute capability versions - int majorVersion() const; - int minorVersion() const; + int majorVersion() const { return majorVersion_; } + int minorVersion() const { return minorVersion_; } - int multiProcessorCount() const; + int multiProcessorCount() const { return multi_processor_count_; } size_t sharedMemPerBlock() const; @@ -132,12 +132,9 @@ namespace cv { namespace gpu // Checks whether the GPU module can be run on the given device bool isCompatible() const; - int deviceID() const; + int deviceID() const { return device_id_; } private: - // Private section is fictive to preserve bin compatibility. - // Changes in the private fields there have no effects. - // see deligate code. 
void query(); int device_id_; diff --git a/modules/core/src/gpumat.cpp b/modules/core/src/gpumat.cpp index 310aabd58..94bb54823 100644 --- a/modules/core/src/gpumat.cpp +++ b/modules/core/src/gpumat.cpp @@ -263,12 +263,15 @@ size_t cv::gpu::DeviceInfo::freeMemory() const { return deviceInfoFuncTable()->f size_t cv::gpu::DeviceInfo::totalMemory() const { return deviceInfoFuncTable()->totalMemory(); } bool cv::gpu::DeviceInfo::supports(FeatureSet feature_set) const { return deviceInfoFuncTable()->supports(feature_set); } bool cv::gpu::DeviceInfo::isCompatible() const { return deviceInfoFuncTable()->isCompatible(); } -int cv::gpu::DeviceInfo::deviceID() const { return deviceInfoFuncTable()->deviceID(); }; -int cv::gpu::DeviceInfo::majorVersion() const { return deviceInfoFuncTable()->majorVersion(); } -int cv::gpu::DeviceInfo::minorVersion() const { return deviceInfoFuncTable()->minorVersion(); } -std::string cv::gpu::DeviceInfo::name() const { return deviceInfoFuncTable()->name(); } -int cv::gpu::DeviceInfo::multiProcessorCount() const { return deviceInfoFuncTable()->multiProcessorCount(); } -void cv::gpu::DeviceInfo::query() { deviceInfoFuncTable()->query(); } + +void cv::gpu::DeviceInfo::query() +{ + deviceInfoFuncTable()->query(); + name_ = deviceInfoFuncTable()->name(); + multi_processor_count_ = deviceInfoFuncTable()->multiProcessorCount(); + majorVersion_ = deviceInfoFuncTable()->majorVersion(); + minorVersion_ = deviceInfoFuncTable()->minorVersion(); +} void cv::gpu::printCudaDeviceInfo(int device) { deviceInfoFuncTable()->printCudaDeviceInfo(device); } void cv::gpu::printShortCudaDeviceInfo(int device) { deviceInfoFuncTable()->printShortCudaDeviceInfo(device); } diff --git a/modules/dynamicuda/CMakeLists.txt b/modules/dynamicuda/CMakeLists.txt index 031b5e48d..f67879ef9 100644 --- a/modules/dynamicuda/CMakeLists.txt +++ b/modules/dynamicuda/CMakeLists.txt @@ -1,4 +1,4 @@ -if(NOT ANDROID OR NOT HAVE_CUDA) +if(NOT DYNAMIC_CUDA_SUPPORT) ocv_module_disable(dynamicuda) endif() @@ -11,5 +11,5 @@ set(OPENCV_MODULE_TYPE SHARED) if (BUILD_FAT_JAVA_LIB) ocv_define_module(dynamicuda opencv_java PRIVATE_REQUIRED ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY}) else() - ocv_define_module(dynamicuda opencv_core PRIVATE_REQUIRED q${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY}) + ocv_define_module(dynamicuda opencv_core PRIVATE_REQUIRED ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY}) endif() diff --git a/modules/dynamicuda/include/opencv2/dynamicuda/dynamicuda.hpp b/modules/dynamicuda/include/opencv2/dynamicuda/dynamicuda.hpp index c5057ab99..8973c5304 100644 --- a/modules/dynamicuda/include/opencv2/dynamicuda/dynamicuda.hpp +++ b/modules/dynamicuda/include/opencv2/dynamicuda/dynamicuda.hpp @@ -539,7 +539,7 @@ private: DeviceProps deviceProps; -class CudaDeviceInfoFuncTable: DeviceInfoFuncTable +class CudaDeviceInfoFuncTable : public DeviceInfoFuncTable { public: size_t sharedMemPerBlock() const @@ -1109,4 +1109,4 @@ public: } }; #endif -#endif \ No newline at end of file +#endif diff --git a/modules/stitching/CMakeLists.txt b/modules/stitching/CMakeLists.txt index fda44591f..6e9a35ba7 100644 --- a/modules/stitching/CMakeLists.txt +++ b/modules/stitching/CMakeLists.txt @@ -1,2 +1,6 @@ set(the_description "Images stitching") -ocv_define_module(stitching opencv_imgproc opencv_features2d opencv_calib3d opencv_objdetect OPTIONAL opencv_gpu opencv_nonfree) +if (ENABLE_DYNAMIC_CUDA) + ocv_define_module(stitching opencv_imgproc opencv_features2d opencv_calib3d opencv_objdetect OPTIONAL opencv_nonfree) +else() + 
ocv_define_module(stitching opencv_imgproc opencv_features2d opencv_calib3d opencv_objdetect OPTIONAL opencv_gpu opencv_nonfree) +endif() \ No newline at end of file diff --git a/modules/stitching/include/opencv2/stitching/detail/seam_finders.hpp b/modules/stitching/include/opencv2/stitching/detail/seam_finders.hpp index 09a1a106f..9301dc5eb 100644 --- a/modules/stitching/include/opencv2/stitching/detail/seam_finders.hpp +++ b/modules/stitching/include/opencv2/stitching/detail/seam_finders.hpp @@ -227,7 +227,7 @@ private: }; -#ifdef HAVE_OPENCV_GPU +#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID) class CV_EXPORTS GraphCutSeamFinderGpu : public GraphCutSeamFinderBase, public PairwiseSeamFinder { public: diff --git a/modules/stitching/include/opencv2/stitching/detail/warpers.hpp b/modules/stitching/include/opencv2/stitching/detail/warpers.hpp index 2bd46f75a..d44bfe69e 100644 --- a/modules/stitching/include/opencv2/stitching/detail/warpers.hpp +++ b/modules/stitching/include/opencv2/stitching/detail/warpers.hpp @@ -46,7 +46,7 @@ #include "opencv2/core/core.hpp" #include "opencv2/imgproc/imgproc.hpp" #include "opencv2/opencv_modules.hpp" -#ifdef HAVE_OPENCV_GPU +#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID) # include "opencv2/gpu/gpu.hpp" #endif @@ -331,7 +331,7 @@ public: }; -#ifdef HAVE_OPENCV_GPU +#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID) class CV_EXPORTS PlaneWarperGpu : public PlaneWarper { public: diff --git a/modules/stitching/include/opencv2/stitching/warpers.hpp b/modules/stitching/include/opencv2/stitching/warpers.hpp index 7475d1304..87efa7e80 100644 --- a/modules/stitching/include/opencv2/stitching/warpers.hpp +++ b/modules/stitching/include/opencv2/stitching/warpers.hpp @@ -145,7 +145,7 @@ public: -#ifdef HAVE_OPENCV_GPU +#if defined(HAVE_OPENCV_GPU) && !defined(ANDROID) class PlaneWarperGpu: public WarperCreator { public: diff --git a/modules/videostab/CMakeLists.txt b/modules/videostab/CMakeLists.txt index ac5cb0d69..84ec1d2e8 100644 --- a/modules/videostab/CMakeLists.txt +++ b/modules/videostab/CMakeLists.txt @@ -1,2 +1,6 @@ set(the_description "Video stabilization") -ocv_define_module(videostab opencv_imgproc opencv_features2d opencv_video opencv_photo opencv_calib3d opencv_highgui OPTIONAL opencv_gpu) +if(ENABLE_DYNAMIC_CUDA) + ocv_define_module(videostab opencv_imgproc opencv_features2d opencv_video opencv_photo opencv_calib3d opencv_highgui) +else() + ocv_define_module(videostab opencv_imgproc opencv_features2d opencv_video opencv_photo opencv_calib3d opencv_highgui OPTIONAL opencv_gpu) +endif()
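
Taken together, the series makes a CUDA-less runtime degrade gracefully: device queries report zero devices and GPU entry points raise CV_GpuNotSupported instead of failing at link or load time. A small smoke test of that expected behavior; the exact outcome rests on the EmptyFuncTable stubs wired in earlier:

    #include "opencv2/core/core.hpp"
    #include "opencv2/core/gpumat.hpp"
    #include <iostream>

    int main()
    {
        // 0 whenever the dynamicuda backend is absent or no CUDA device is present.
        std::cout << "CUDA devices: " << cv::gpu::getCudaEnabledDeviceCount() << std::endl;

        try
        {
            cv::gpu::setDevice(0); // stubbed without CUDA: throws CV_GpuNotSupported
        }
        catch (const cv::Exception& e)
        {
            std::cout << "Expected failure: " << e.what() << std::endl;
        }
        return 0;
    }
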