Dynamic CUDA support library loading implemented for Linux.
Logical mistake in macro fixed; DeviceInfo delegate reimplemented; Build and warning fixes.
This commit is contained in:
parent
d4087f19a2
commit
8660e048bc
@ -1,36 +1,76 @@
|
||||
set(the_description "The Core Functionality")
|
||||
|
||||
macro(ocv_glob_module_sources_no_cuda)
|
||||
file(GLOB_RECURSE lib_srcs "src/*.cpp")
|
||||
file(GLOB_RECURSE lib_int_hdrs "src/*.hpp" "src/*.h")
|
||||
file(GLOB lib_hdrs "include/opencv2/${name}/*.hpp" "include/opencv2/${name}/*.h")
|
||||
file(GLOB lib_hdrs_detail "include/opencv2/${name}/detail/*.hpp" "include/opencv2/${name}/detail/*.h")
|
||||
|
||||
set(cuda_objs "")
|
||||
set(lib_cuda_hdrs "")
|
||||
if(HAVE_CUDA)
|
||||
ocv_include_directories(${CUDA_INCLUDE_DIRS})
|
||||
file(GLOB lib_cuda_hdrs "src/cuda/*.hpp")
|
||||
endif()
|
||||
|
||||
source_group("Src" FILES ${lib_srcs} ${lib_int_hdrs})
|
||||
|
||||
file(GLOB cl_kernels "src/opencl/*.cl")
|
||||
if(HAVE_opencv_ocl AND cl_kernels)
|
||||
ocv_include_directories(${OPENCL_INCLUDE_DIRS})
|
||||
add_custom_command(
|
||||
OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.hpp"
|
||||
COMMAND ${CMAKE_COMMAND} -DCL_DIR="${CMAKE_CURRENT_SOURCE_DIR}/src/opencl" -DOUTPUT="${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" -P "${OpenCV_SOURCE_DIR}/cmake/cl2cpp.cmake"
|
||||
DEPENDS ${cl_kernels} "${OpenCV_SOURCE_DIR}/cmake/cl2cpp.cmake")
|
||||
source_group("OpenCL" FILES ${cl_kernels} "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.hpp")
|
||||
list(APPEND lib_srcs ${cl_kernels} "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.hpp")
|
||||
endif()
|
||||
|
||||
source_group("Include" FILES ${lib_hdrs})
|
||||
source_group("Include\\detail" FILES ${lib_hdrs_detail})
|
||||
|
||||
ocv_set_module_sources(${ARGN} HEADERS ${lib_hdrs} ${lib_hdrs_detail}
|
||||
SOURCES ${lib_srcs} ${lib_int_hdrs} ${cuda_objs} ${lib_cuda_hdrs})
|
||||
endmacro()
|
||||
|
||||
ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES})
|
||||
ocv_module_include_directories(${ZLIB_INCLUDE_DIR})
|
||||
|
||||
if(HAVE_WINRT)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /ZW /GS /Gm- /AI\"${WINDOWS_SDK_PATH}/References/CommonConfiguration/Neutral\" /AI\"${VISUAL_STUDIO_PATH}/vcpackages\"")
|
||||
endif()
|
||||
|
||||
if(DYNAMIC_CUDA_SUPPORT)
|
||||
add_definitions(-DDYNAMIC_CUDA_SUPPORT)
|
||||
else()
|
||||
add_definitions(-DUSE_CUDA)
|
||||
endif()
|
||||
|
||||
if(HAVE_CUDA)
|
||||
ocv_include_directories("${OpenCV_SOURCE_DIR}/modules/gpu/include")
|
||||
ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef)
|
||||
endif()
|
||||
|
||||
file(GLOB lib_cuda_hdrs "include/opencv2/${name}/cuda/*.hpp" "include/opencv2/${name}/cuda/*.h")
|
||||
file(GLOB lib_cuda_hdrs_detail "include/opencv2/${name}/cuda/detail/*.hpp" "include/opencv2/${name}/cuda/detail/*.h")
|
||||
|
||||
source_group("Cuda Headers" FILES ${lib_cuda_hdrs})
|
||||
source_group("Cuda Headers\\Detail" FILES ${lib_cuda_hdrs_detail})
|
||||
|
||||
if(DYNAMIC_CUDA_SUPPORT)
|
||||
add_definitions(-DDYNAMIC_CUDA_SUPPORT)
|
||||
if (DYNAMIC_CUDA_SUPPORT)
|
||||
ocv_glob_module_sources_no_cuda(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc"
|
||||
HEADERS ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail})
|
||||
else()
|
||||
ocv_glob_module_sources(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc"
|
||||
HEADERS ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail})
|
||||
endif()
|
||||
|
||||
ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES})
|
||||
ocv_module_include_directories(${ZLIB_INCLUDE_DIR})
|
||||
|
||||
if(HAVE_CUDA)
|
||||
ocv_include_directories("${OpenCV_SOURCE_DIR}/modules/gpu/include")
|
||||
ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef)
|
||||
endif()
|
||||
|
||||
ocv_glob_module_sources(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc"
|
||||
HEADERS ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail})
|
||||
|
||||
ocv_create_module()
|
||||
ocv_add_precompiled_headers(${the_module})
|
||||
|
||||
ocv_add_accuracy_tests()
|
||||
ocv_add_perf_tests()
|
||||
|
||||
if(DYNAMIC_CUDA_SUPPORT)
|
||||
if (DYNAMIC_CUDA_SUPPORT)
|
||||
add_subdirectory(cuda)
|
||||
endif()
|
||||
|
@ -1,6 +1,5 @@
|
||||
project(opencv_core_cuda)
|
||||
set(HAVE_CUDA FALSE)
|
||||
add_definitions("-DHAVE_CUDA")
|
||||
add_definitions(-DUSE_CUDA)
|
||||
include_directories(${CUDA_INCLUDE_DIRS}
|
||||
"../src/"
|
||||
"../include/opencv2/core/"
|
||||
|
@ -1,6 +1,10 @@
|
||||
#include "cvconfig.h"
|
||||
#include "opencv2/core/core.hpp"
|
||||
#include "opencv2/core/gpumat.hpp"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <iostream>
|
||||
|
||||
#ifdef HAVE_CUDA
|
||||
#include <cuda_runtime.h>
|
||||
#include <npp.h>
|
||||
@ -17,7 +21,30 @@
|
||||
#endif
|
||||
#endif
|
||||
|
||||
using namespace std;
|
||||
using namespace cv;
|
||||
using namespace cv::gpu;
|
||||
|
||||
#include "gpumat_cuda.hpp"
|
||||
#include "gpumat_cuda.hpp"
|
||||
|
||||
#ifdef HAVE_CUDA
|
||||
static CudaDeviceInfoFuncTable deviceInfoTable;
|
||||
static CudaFuncTable gpuTable;
|
||||
#else
|
||||
static EmptyDeviceInfoFuncTable deviceInfoTable;
|
||||
static EmptyFuncTable gpuTable;
|
||||
#endif
|
||||
|
||||
extern "C" {
|
||||
|
||||
DeviceInfoFuncTable* deviceInfoFactory()
|
||||
{
|
||||
return (DeviceInfoFuncTable*)&deviceInfoTable;
|
||||
}
|
||||
|
||||
GpuFuncTable* gpuFactory()
|
||||
{
|
||||
return (GpuFuncTable*)&gpuTable;
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -137,6 +137,9 @@ namespace cv { namespace gpu
|
||||
int deviceID() const { return device_id_; }
|
||||
|
||||
private:
|
||||
// The private section is a placeholder kept only to preserve binary compatibility.
|
||||
// Changes to the private fields here have no effect.
|
||||
// See the delegate code.
|
||||
void query();
|
||||
|
||||
int device_id_;
|
||||
|
@ -43,8 +43,9 @@
|
||||
#include "precomp.hpp"
|
||||
#include "opencv2/core/gpumat.hpp"
|
||||
#include <iostream>
|
||||
#include <dlfcn.h>
|
||||
|
||||
#if defined(HAVE_CUDA)
|
||||
#if defined(HAVE_CUDA) || defined(DYNAMIC_CUDA_SUPPORT)
|
||||
#include <cuda_runtime.h>
|
||||
#include <npp.h>
|
||||
|
||||
@ -66,15 +67,81 @@ using namespace cv::gpu;
|
||||
|
||||
#include "gpumat_cuda.hpp"
|
||||
|
||||
namespace
|
||||
typedef GpuFuncTable* (*GpuFactoryType)();
|
||||
typedef DeviceInfoFuncTable* (*DeviceInfoFactoryType)();
|
||||
|
||||
static GpuFactoryType gpuFactory = NULL;
|
||||
static DeviceInfoFactoryType deviceInfoFactory = NULL;
|
||||
|
||||
static const std::string getCudaSupportLibName()
|
||||
{
|
||||
const GpuFuncTable* gpuFuncTable()
|
||||
{
|
||||
static EmptyFuncTable funcTable;
|
||||
return &funcTable;
|
||||
}
|
||||
return "libopencv_core_cuda.so";
|
||||
}
|
||||
|
||||
static bool loadCudaSupportLib()
|
||||
{
|
||||
void* handle;
|
||||
const std::string name = getCudaSupportLibName();
|
||||
handle = dlopen(name.c_str(), RTLD_LAZY);
|
||||
if (!handle)
|
||||
return false;
|
||||
|
||||
deviceInfoFactory = (DeviceInfoFactoryType)dlsym(handle, "deviceInfoFactory");
|
||||
if (!deviceInfoFactory)
|
||||
{
|
||||
dlclose(handle);
|
||||
return false;
|
||||
}
|
||||
|
||||
gpuFactory = (GpuFactoryType)dlsym(handle, "gpuFactory");
|
||||
if (!gpuFactory)
|
||||
{
|
||||
dlclose(handle);
|
||||
return false;
|
||||
}
|
||||
|
||||
dlclose(handle);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Returns the GPU function table used by the cv::gpu wrappers in this file.
//
// Which table is returned is decided at compile time:
//  * DYNAMIC_CUDA_SUPPORT: the table produced by the support library's
//    gpuFactory() when loadCudaSupportLib() succeeds and the factory returns
//    a non-null pointer; otherwise an EmptyFuncTable.
//  * USE_CUDA (without DYNAMIC_CUDA_SUPPORT): a CudaFuncTable.
//  * neither: an EmptyFuncTable.
//
// All objects are function-local statics, so the choice is made on first call.
static GpuFuncTable* gpuFuncTable()
{
#if defined(DYNAMIC_CUDA_SUPPORT)
    static EmptyFuncTable fallback;
    static GpuFuncTable* fromLibrary = loadCudaSupportLib() ? gpuFactory() : (GpuFuncTable*)&fallback;
    // The factory itself may hand back a null pointer; fall back in that case too.
    static GpuFuncTable* selected = fromLibrary ? fromLibrary : (GpuFuncTable*)&fallback;
    return selected;
#elif defined(USE_CUDA)
    static CudaFuncTable cudaTable;
    static GpuFuncTable* selected = &cudaTable;
    return selected;
#else
    static EmptyFuncTable fallback;
    static GpuFuncTable* selected = &fallback;
    return selected;
#endif
}
|
||||
|
||||
// Returns the device-info function table used by the cv::gpu::DeviceInfo
// wrappers in this file.
//
// Which table is returned is decided at compile time:
//  * DYNAMIC_CUDA_SUPPORT: the table produced by the support library's
//    deviceInfoFactory() when loadCudaSupportLib() succeeds and the factory
//    returns a non-null pointer; otherwise an EmptyDeviceInfoFuncTable.
//  * USE_CUDA (without DYNAMIC_CUDA_SUPPORT): a CudaDeviceInfoFuncTable.
//  * neither: an EmptyDeviceInfoFuncTable.
static DeviceInfoFuncTable* deviceInfoFuncTable()
{
#ifdef DYNAMIC_CUDA_SUPPORT
    static EmptyDeviceInfoFuncTable stub;
    static DeviceInfoFuncTable* libFuncTable = loadCudaSupportLib() ? deviceInfoFactory() : (DeviceInfoFuncTable*)&stub;
    // The factory itself may hand back a null pointer; fall back in that case too.
    static DeviceInfoFuncTable* funcTable = libFuncTable ? libFuncTable : (DeviceInfoFuncTable*)&stub;
#else
# ifdef USE_CUDA
    static CudaDeviceInfoFuncTable impl;
    // Explicit C-style cast, as used by the other table accessors in this
    // file; unlike an implicit conversion it does not depend on the base
    // class being publicly accessible.
    static DeviceInfoFuncTable* funcTable = (DeviceInfoFuncTable*)&impl;
# else
    // Must be the DeviceInfo stub: EmptyFuncTable derives from GpuFuncTable
    // and is not convertible to DeviceInfoFuncTable*.
    static EmptyDeviceInfoFuncTable stub;
    static DeviceInfoFuncTable* funcTable = (DeviceInfoFuncTable*)&stub;
# endif
#endif
    return funcTable;
}
|
||||
|
||||
|
||||
//////////////////////////////// Initialization & Info ////////////////////////
|
||||
|
||||
int cv::gpu::getCudaEnabledDeviceCount() { return gpuFuncTable()->getCudaEnabledDeviceCount(); }
|
||||
@ -95,13 +162,13 @@ bool cv::gpu::TargetArchs::hasEqualOrGreater(int major, int minor) { return gpuF
|
||||
bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor) { return gpuFuncTable()->hasEqualOrGreaterPtx(major, minor); }
|
||||
bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor) { return gpuFuncTable()->hasEqualOrGreaterBin(major, minor); }
|
||||
|
||||
size_t cv::gpu::DeviceInfo::sharedMemPerBlock() const { return gpuFuncTable()->sharedMemPerBlock(); }
|
||||
void cv::gpu::DeviceInfo::queryMemory(size_t& total_memory, size_t& free_memory) const { gpuFuncTable()->queryMemory(total_memory, free_memory); }
|
||||
size_t cv::gpu::DeviceInfo::freeMemory() const { return gpuFuncTable()->freeMemory(); }
|
||||
size_t cv::gpu::DeviceInfo::totalMemory() const { return gpuFuncTable()->totalMemory(); }
|
||||
bool cv::gpu::DeviceInfo::supports(FeatureSet feature_set) const { return gpuFuncTable()->supports(feature_set); }
|
||||
bool cv::gpu::DeviceInfo::isCompatible() const { return gpuFuncTable()->isCompatible(); }
|
||||
void cv::gpu::DeviceInfo::query() { gpuFuncTable()->query(); }
|
||||
size_t cv::gpu::DeviceInfo::sharedMemPerBlock() const { return deviceInfoFuncTable()->sharedMemPerBlock(); }
|
||||
void cv::gpu::DeviceInfo::queryMemory(size_t& total_memory, size_t& free_memory) const { deviceInfoFuncTable()->queryMemory(total_memory, free_memory); }
|
||||
size_t cv::gpu::DeviceInfo::freeMemory() const { return deviceInfoFuncTable()->freeMemory(); }
|
||||
size_t cv::gpu::DeviceInfo::totalMemory() const { return deviceInfoFuncTable()->totalMemory(); }
|
||||
bool cv::gpu::DeviceInfo::supports(FeatureSet feature_set) const { return deviceInfoFuncTable()->supports(feature_set); }
|
||||
bool cv::gpu::DeviceInfo::isCompatible() const { return deviceInfoFuncTable()->isCompatible(); }
|
||||
void cv::gpu::DeviceInfo::query() { deviceInfoFuncTable()->query(); }
|
||||
|
||||
void cv::gpu::printCudaDeviceInfo(int device) { gpuFuncTable()->printCudaDeviceInfo(device); }
|
||||
void cv::gpu::printShortCudaDeviceInfo(int device) { gpuFuncTable()->printShortCudaDeviceInfo(device); }
|
||||
@ -556,7 +623,7 @@ namespace cv { namespace gpu
|
||||
|
||||
void setTo(GpuMat& src, Scalar s, cudaStream_t stream)
|
||||
{
|
||||
gpuFuncTable()->setTo(src, s, stream);
|
||||
gpuFuncTable()->setTo(src, s, cv::gpu::GpuMat(), stream);
|
||||
}
|
||||
|
||||
void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream)
|
||||
|
@ -1,30 +1,19 @@
|
||||
namespace
|
||||
{
|
||||
#if defined(HAVE_CUDA) && !defined(DYNAMIC_CUDA_SUPPORT)
|
||||
#ifndef __GPUMAT_CUDA_HPP__
|
||||
#define __GPUMAT_CUDA_HPP__
|
||||
|
||||
#define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, CV_Func)
|
||||
#define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__, CV_Func)
|
||||
|
||||
inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
|
||||
class DeviceInfoFuncTable
|
||||
{
|
||||
if (cudaSuccess != err)
|
||||
cv::gpu::error(cudaGetErrorString(err), file, line, func);
|
||||
}
|
||||
|
||||
inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "")
|
||||
{
|
||||
if (err < 0)
|
||||
{
|
||||
std::ostringstream msg;
|
||||
msg << "NPP API Call Error: " << err;
|
||||
cv::gpu::error(msg.str().c_str(), file, line, func);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
public:
|
||||
virtual size_t sharedMemPerBlock() const = 0;
|
||||
virtual void queryMemory(size_t&, size_t&) const = 0;
|
||||
virtual size_t freeMemory() const = 0;
|
||||
virtual size_t totalMemory() const = 0;
|
||||
virtual bool supports(FeatureSet) const = 0;
|
||||
virtual bool isCompatible() const = 0;
|
||||
virtual void query() = 0;
|
||||
virtual ~DeviceInfoFuncTable() {};
|
||||
};
|
||||
|
||||
class GpuFuncTable
|
||||
{
|
||||
public:
|
||||
@ -40,6 +29,7 @@ namespace
|
||||
|
||||
virtual bool deviceSupports(FeatureSet) const = 0;
|
||||
|
||||
// TargetArchs
|
||||
virtual bool builtWith(FeatureSet) const = 0;
|
||||
virtual bool has(int, int) const = 0;
|
||||
virtual bool hasPtx(int, int) const = 0;
|
||||
@ -49,14 +39,6 @@ namespace
|
||||
virtual bool hasEqualOrGreaterPtx(int, int) const = 0;
|
||||
virtual bool hasEqualOrGreaterBin(int, int) const = 0;
|
||||
|
||||
virtual size_t sharedMemPerBlock() const = 0;
|
||||
virtual void queryMemory(size_t&, size_t&) const = 0;
|
||||
virtual size_t freeMemory() const = 0;
|
||||
virtual size_t totalMemory() const = 0;
|
||||
virtual bool supports(FeatureSet) const = 0;
|
||||
virtual bool isCompatible() const = 0;
|
||||
virtual void query() const = 0;
|
||||
|
||||
virtual void printCudaDeviceInfo(int) const = 0;
|
||||
virtual void printShortCudaDeviceInfo(int) const = 0;
|
||||
|
||||
@ -72,17 +54,24 @@ namespace
|
||||
virtual void convert(const GpuMat& src, GpuMat& dst) const = 0;
|
||||
|
||||
// for gpu::device::setTo funcs
|
||||
virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, CUstream_st*) const = 0;
|
||||
virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, CUstream_st*) const = 0;
|
||||
|
||||
virtual void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const = 0;
|
||||
virtual void free(void* devPtr) const = 0;
|
||||
};
|
||||
}
|
||||
|
||||
#if !defined(HAVE_CUDA) || defined(DYNAMIC_CUDA_SUPPORT)
|
||||
namespace
|
||||
{
|
||||
class EmptyDeviceInfoFuncTable: public DeviceInfoFuncTable
|
||||
{
|
||||
public:
|
||||
size_t sharedMemPerBlock() const { throw_nogpu; return 0; }
|
||||
void queryMemory(size_t&, size_t&) const { throw_nogpu; }
|
||||
size_t freeMemory() const { throw_nogpu; return 0; }
|
||||
size_t totalMemory() const { throw_nogpu; return 0; }
|
||||
bool supports(FeatureSet) const { throw_nogpu; return false; }
|
||||
bool isCompatible() const { throw_nogpu; return false; }
|
||||
void query() { throw_nogpu; }
|
||||
};
|
||||
|
||||
class EmptyFuncTable : public GpuFuncTable
|
||||
{
|
||||
public:
|
||||
@ -105,15 +94,7 @@ namespace
|
||||
bool hasEqualOrGreater(int, int) const { throw_nogpu; return false; }
|
||||
bool hasEqualOrGreaterPtx(int, int) const { throw_nogpu; return false; }
|
||||
bool hasEqualOrGreaterBin(int, int) const { throw_nogpu; return false; }
|
||||
|
||||
size_t sharedMemPerBlock() const { throw_nogpu; return 0; }
|
||||
void queryMemory(size_t&, size_t&) const { throw_nogpu; }
|
||||
size_t freeMemory() const { throw_nogpu; return 0; }
|
||||
size_t totalMemory() const { throw_nogpu; return 0; }
|
||||
bool supports(FeatureSet) const { throw_nogpu; return false; }
|
||||
bool isCompatible() const { throw_nogpu; return false; }
|
||||
void query() const { throw_nogpu; }
|
||||
|
||||
|
||||
void printCudaDeviceInfo(int) const { throw_nogpu; }
|
||||
void printShortCudaDeviceInfo(int) const { throw_nogpu; }
|
||||
|
||||
@ -126,15 +107,32 @@ namespace
|
||||
void convert(const GpuMat&, GpuMat&) const { throw_nogpu; }
|
||||
void convert(const GpuMat&, GpuMat&, double, double, cudaStream_t stream = 0) const { (void)stream; throw_nogpu; }
|
||||
|
||||
virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, CUstream_st*) const { throw_nogpu; }
|
||||
virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, CUstream_st*) const { throw_nogpu; }
|
||||
|
||||
void mallocPitch(void**, size_t*, size_t, size_t) const { throw_nogpu; }
|
||||
void free(void*) const {}
|
||||
};
|
||||
|
||||
#if defined(USE_CUDA)
|
||||
|
||||
#define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, CV_Func)
|
||||
#define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__, CV_Func)
|
||||
|
||||
inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
|
||||
{
|
||||
if (cudaSuccess != err)
|
||||
cv::gpu::error(cudaGetErrorString(err), file, line, func);
|
||||
}
|
||||
|
||||
#else
|
||||
inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "")
|
||||
{
|
||||
if (err < 0)
|
||||
{
|
||||
std::ostringstream msg;
|
||||
msg << "NPP API Call Error: " << err;
|
||||
cv::gpu::error(msg.str().c_str(), file, line, func);
|
||||
}
|
||||
}
|
||||
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
@ -149,8 +147,6 @@ namespace cv { namespace gpu { namespace device
|
||||
void convert_gpu(PtrStepSzb src, int sdepth, PtrStepSzb dst, int ddepth, double alpha, double beta, cudaStream_t stream);
|
||||
}}}
|
||||
|
||||
namespace
|
||||
{
|
||||
template <typename T> void kernelSetCaller(GpuMat& src, Scalar s, cudaStream_t stream)
|
||||
{
|
||||
Scalar_<T> sf = s;
|
||||
@ -162,10 +158,7 @@ namespace
|
||||
Scalar_<T> sf = s;
|
||||
cv::gpu::device::set_to_gpu(src, sf.val, mask, src.channels(), stream);
|
||||
}
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
template<int n> struct NPPTypeTraits;
|
||||
template<> struct NPPTypeTraits<CV_8U> { typedef Npp8u npp_type; };
|
||||
template<> struct NPPTypeTraits<CV_8S> { typedef Npp8s npp_type; };
|
||||
@ -208,6 +201,7 @@ namespace
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
};
|
||||
|
||||
template<int DDEPTH, typename NppConvertFunc<CV_32F, DDEPTH>::func_ptr func> struct NppCvt<CV_32F, DDEPTH, func>
|
||||
{
|
||||
typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
|
||||
@ -361,9 +355,8 @@ namespace
|
||||
{
|
||||
return reinterpret_cast<size_t>(ptr) % size == 0;
|
||||
}
|
||||
}
|
||||
|
||||
namespace cv { namespace gpu { namespace devices
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream = 0)
|
||||
{
|
||||
@ -418,74 +411,52 @@ namespace
|
||||
{
|
||||
setTo(src, s, mask, 0);
|
||||
}
|
||||
}}
|
||||
}}}
|
||||
|
||||
namespace
|
||||
{
|
||||
class CudaFuncTable : public GpuFuncTable
|
||||
|
||||
class CudaArch
|
||||
{
|
||||
protected:
|
||||
|
||||
class CudaArch
|
||||
{
|
||||
public:
|
||||
CudaArch();
|
||||
|
||||
bool builtWith(FeatureSet feature_set) const;
|
||||
bool hasPtx(int major, int minor) const;
|
||||
bool hasBin(int major, int minor) const;
|
||||
bool hasEqualOrLessPtx(int major, int minor) const;
|
||||
bool hasEqualOrGreaterPtx(int major, int minor) const;
|
||||
bool hasEqualOrGreaterBin(int major, int minor) const;
|
||||
|
||||
private:
|
||||
static void fromStr(const string& set_as_str, vector<int>& arr);
|
||||
|
||||
vector<int> bin;
|
||||
vector<int> ptx;
|
||||
vector<int> features;
|
||||
};
|
||||
|
||||
const CudaArch cudaArch;
|
||||
|
||||
CudaArch::CudaArch()
|
||||
public:
|
||||
CudaArch()
|
||||
{
|
||||
fromStr(CUDA_ARCH_BIN, bin);
|
||||
fromStr(CUDA_ARCH_PTX, ptx);
|
||||
fromStr(CUDA_ARCH_FEATURES, features);
|
||||
}
|
||||
|
||||
bool CudaArch::builtWith(FeatureSet feature_set) const
|
||||
bool builtWith(FeatureSet feature_set) const
|
||||
{
|
||||
return !features.empty() && (features.back() >= feature_set);
|
||||
}
|
||||
|
||||
bool CudaArch::hasPtx(int major, int minor) const
|
||||
bool hasPtx(int major, int minor) const
|
||||
{
|
||||
return find(ptx.begin(), ptx.end(), major * 10 + minor) != ptx.end();
|
||||
}
|
||||
|
||||
bool CudaArch::hasBin(int major, int minor) const
|
||||
bool hasBin(int major, int minor) const
|
||||
{
|
||||
return find(bin.begin(), bin.end(), major * 10 + minor) != bin.end();
|
||||
}
|
||||
|
||||
bool CudaArch::hasEqualOrLessPtx(int major, int minor) const
|
||||
bool hasEqualOrLessPtx(int major, int minor) const
|
||||
{
|
||||
return !ptx.empty() && (ptx.front() <= major * 10 + minor);
|
||||
}
|
||||
|
||||
bool CudaArch::hasEqualOrGreaterPtx(int major, int minor) const
|
||||
bool hasEqualOrGreaterPtx(int major, int minor) const
|
||||
{
|
||||
return !ptx.empty() && (ptx.back() >= major * 10 + minor);
|
||||
}
|
||||
|
||||
bool CudaArch::hasEqualOrGreaterBin(int major, int minor) const
|
||||
bool hasEqualOrGreaterBin(int major, int minor) const
|
||||
{
|
||||
return !bin.empty() && (bin.back() >= major * 10 + minor);
|
||||
}
|
||||
|
||||
void CudaArch::fromStr(const string& set_as_str, vector<int>& arr)
|
||||
|
||||
private:
|
||||
void fromStr(const string& set_as_str, vector<int>& arr)
|
||||
{
|
||||
if (set_as_str.find_first_not_of(" ") == string::npos)
|
||||
return;
|
||||
@ -501,25 +472,21 @@ namespace
|
||||
|
||||
sort(arr.begin(), arr.end());
|
||||
}
|
||||
|
||||
class DeviceProps
|
||||
{
|
||||
public:
|
||||
DeviceProps();
|
||||
~DeviceProps();
|
||||
|
||||
cudaDeviceProp* get(int devID);
|
||||
|
||||
private:
|
||||
std::vector<cudaDeviceProp*> props_;
|
||||
};
|
||||
|
||||
DeviceProps::DeviceProps()
|
||||
vector<int> bin;
|
||||
vector<int> ptx;
|
||||
vector<int> features;
|
||||
};
|
||||
|
||||
class DeviceProps
|
||||
{
|
||||
public:
|
||||
DeviceProps()
|
||||
{
|
||||
props_.resize(10, 0);
|
||||
}
|
||||
|
||||
DeviceProps::~DeviceProps()
|
||||
~DeviceProps()
|
||||
{
|
||||
for (size_t i = 0; i < props_.size(); ++i)
|
||||
{
|
||||
@ -529,7 +496,7 @@ namespace
|
||||
props_.clear();
|
||||
}
|
||||
|
||||
cudaDeviceProp* DeviceProps::get(int devID)
|
||||
cudaDeviceProp* get(int devID)
|
||||
{
|
||||
if (devID >= (int) props_.size())
|
||||
props_.resize(devID + 5, 0);
|
||||
@ -542,10 +509,92 @@ namespace
|
||||
|
||||
return props_[devID];
|
||||
}
|
||||
|
||||
DeviceProps deviceProps;
|
||||
private:
|
||||
std::vector<cudaDeviceProp*> props_;
|
||||
};
|
||||
|
||||
int convertSMVer2Cores(int major, int minor)
|
||||
DeviceProps deviceProps;
|
||||
|
||||
class CudaDeviceInfoFuncTable: DeviceInfoFuncTable
|
||||
{
|
||||
public:
|
||||
size_t sharedMemPerBlock() const
|
||||
{
|
||||
return deviceProps.get(device_id_)->sharedMemPerBlock;
|
||||
}
|
||||
|
||||
void queryMemory(size_t& _totalMemory, size_t& _freeMemory) const
|
||||
{
|
||||
int prevDeviceID = getDevice();
|
||||
if (prevDeviceID != device_id_)
|
||||
setDevice(device_id_);
|
||||
|
||||
cudaSafeCall( cudaMemGetInfo(&_freeMemory, &_totalMemory) );
|
||||
|
||||
if (prevDeviceID != device_id_)
|
||||
setDevice(prevDeviceID);
|
||||
}
|
||||
|
||||
size_t freeMemory() const
|
||||
{
|
||||
size_t _totalMemory, _freeMemory;
|
||||
queryMemory(_totalMemory, _freeMemory);
|
||||
return _freeMemory;
|
||||
}
|
||||
|
||||
size_t totalMemory() const
|
||||
{
|
||||
size_t _totalMemory, _freeMemory;
|
||||
queryMemory(_totalMemory, _freeMemory);
|
||||
return _totalMemory;
|
||||
}
|
||||
|
||||
bool supports(FeatureSet feature_set) const
|
||||
{
|
||||
int version = majorVersion_ * 10 + minorVersion_;
|
||||
return version >= feature_set;
|
||||
}
|
||||
|
||||
bool isCompatible() const
|
||||
{
|
||||
// Check PTX compatibility
|
||||
if (TargetArchs::hasEqualOrLessPtx(majorVersion_, minorVersion_))
|
||||
return true;
|
||||
|
||||
// Check BIN compatibility
|
||||
for (int i = minorVersion_; i >= 0; --i)
|
||||
if (TargetArchs::hasBin(majorVersion_, i))
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
void query()
|
||||
{
|
||||
const cudaDeviceProp* prop = deviceProps.get(device_id_);
|
||||
|
||||
name_ = prop->name;
|
||||
multi_processor_count_ = prop->multiProcessorCount;
|
||||
majorVersion_ = prop->major;
|
||||
minorVersion_ = prop->minor;
|
||||
}
|
||||
|
||||
private:
|
||||
int device_id_;
|
||||
|
||||
std::string name_;
|
||||
int multi_processor_count_;
|
||||
int majorVersion_;
|
||||
int minorVersion_;
|
||||
};
|
||||
|
||||
class CudaFuncTable : public GpuFuncTable
|
||||
{
|
||||
protected:
|
||||
|
||||
const CudaArch cudaArch;
|
||||
|
||||
int convertSMVer2Cores(int major, int minor) const
|
||||
{
|
||||
// Defines for GPU Architecture types (using the SM version to determine the # of cores per SM
|
||||
typedef struct {
|
||||
@ -600,42 +649,42 @@ namespace
|
||||
cudaSafeCall( cudaDeviceReset() );
|
||||
}
|
||||
|
||||
bool TargetArchs::builtWith(FeatureSet feature_set) const
|
||||
bool builtWith(FeatureSet feature_set) const
|
||||
{
|
||||
return cudaArch.builtWith(feature_set);
|
||||
}
|
||||
|
||||
bool TargetArchs::has(int major, int minor) const
|
||||
bool has(int major, int minor) const
|
||||
{
|
||||
return hasPtx(major, minor) || hasBin(major, minor);
|
||||
}
|
||||
|
||||
bool TargetArchs::hasPtx(int major, int minor) const
|
||||
bool hasPtx(int major, int minor) const
|
||||
{
|
||||
return cudaArch.hasPtx(major, minor);
|
||||
}
|
||||
|
||||
bool TargetArchs::hasBin(int major, int minor) const
|
||||
bool hasBin(int major, int minor) const
|
||||
{
|
||||
return cudaArch.hasBin(major, minor);
|
||||
}
|
||||
|
||||
bool TargetArchs::hasEqualOrLessPtx(int major, int minor) const
|
||||
bool hasEqualOrLessPtx(int major, int minor) const
|
||||
{
|
||||
return cudaArch.hasEqualOrLessPtx(major, minor);
|
||||
}
|
||||
|
||||
bool TargetArchs::hasEqualOrGreater(int major, int minor) const
|
||||
bool hasEqualOrGreater(int major, int minor) const
|
||||
{
|
||||
return hasEqualOrGreaterPtx(major, minor) || hasEqualOrGreaterBin(major, minor);
|
||||
}
|
||||
|
||||
bool TargetArchs::hasEqualOrGreaterPtx(int major, int minor) const
|
||||
bool hasEqualOrGreaterPtx(int major, int minor) const
|
||||
{
|
||||
return cudaArch.hasEqualOrGreaterPtx(major, minor);
|
||||
}
|
||||
|
||||
bool TargetArchs::hasEqualOrGreaterBin(int major, int minor) const
|
||||
bool hasEqualOrGreaterBin(int major, int minor) const
|
||||
{
|
||||
return cudaArch.hasEqualOrGreaterBin(major, minor);
|
||||
}
|
||||
@ -664,68 +713,7 @@ namespace
|
||||
|
||||
return TargetArchs::builtWith(feature_set) && (version >= feature_set);
|
||||
}
|
||||
|
||||
size_t sharedMemPerBlock() const
|
||||
{
|
||||
return deviceProps.get(device_id_)->sharedMemPerBlock;
|
||||
}
|
||||
|
||||
void queryMemory(size_t& _totalMemory, size_t& _freeMemory) const
|
||||
{
|
||||
int prevDeviceID = getDevice();
|
||||
if (prevDeviceID != device_id_)
|
||||
setDevice(device_id_);
|
||||
|
||||
cudaSafeCall( cudaMemGetInfo(&_freeMemory, &_totalMemory) );
|
||||
|
||||
if (prevDeviceID != device_id_)
|
||||
setDevice(prevDeviceID);
|
||||
}
|
||||
|
||||
size_t freeMemory() const
|
||||
{
|
||||
size_t _totalMemory, _freeMemory;
|
||||
queryMemory(_totalMemory, _freeMemory);
|
||||
return _freeMemory;
|
||||
}
|
||||
|
||||
size_t totalMemory() const
|
||||
{
|
||||
size_t _totalMemory, _freeMemory;
|
||||
queryMemory(_totalMemory, _freeMemory);
|
||||
return _totalMemory;
|
||||
}
|
||||
|
||||
bool supports(FeatureSet feature_set) const
|
||||
{
|
||||
int version = majorVersion() * 10 + minorVersion();
|
||||
return version >= feature_set;
|
||||
}
|
||||
|
||||
bool isCompatible() const
|
||||
{
|
||||
// Check PTX compatibility
|
||||
if (TargetArchs::hasEqualOrLessPtx(majorVersion(), minorVersion()))
|
||||
return true;
|
||||
|
||||
// Check BIN compatibility
|
||||
for (int i = minorVersion(); i >= 0; --i)
|
||||
if (TargetArchs::hasBin(majorVersion(), i))
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
void query() const
|
||||
{
|
||||
const cudaDeviceProp* prop = deviceProps.get(device_id_);
|
||||
|
||||
name_ = prop->name;
|
||||
multi_processor_count_ = prop->multiProcessorCount;
|
||||
majorVersion_ = prop->major;
|
||||
minorVersion_ = prop->minor;
|
||||
}
|
||||
|
||||
|
||||
void printCudaDeviceInfo(int device) const
|
||||
{
|
||||
int count = getCudaEnabledDeviceCount();
|
||||
@ -864,16 +852,16 @@ namespace
|
||||
typedef void (*func_t)(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream);
|
||||
static const func_t funcs[7][4] =
|
||||
{
|
||||
/* 8U */ {NppCopyMasked<CV_8U , nppiCopy_8u_C1MR >::call, cv::gpu::details::copyWithMask, NppCopyMasked<CV_8U , nppiCopy_8u_C3MR >::call, NppCopyMasked<CV_8U , nppiCopy_8u_C4MR >::call},
|
||||
/* 8S */ {cv::gpu::details::copyWithMask , cv::gpu::details::copyWithMask, cv::gpu::details::copyWithMask , cv::gpu::details::copyWithMask },
|
||||
/* 16U */ {NppCopyMasked<CV_16U, nppiCopy_16u_C1MR>::call, cv::gpu::details::copyWithMask, NppCopyMasked<CV_16U, nppiCopy_16u_C3MR>::call, NppCopyMasked<CV_16U, nppiCopy_16u_C4MR>::call},
|
||||
/* 16S */ {NppCopyMasked<CV_16S, nppiCopy_16s_C1MR>::call, cv::gpu::details::copyWithMask, NppCopyMasked<CV_16S, nppiCopy_16s_C3MR>::call, NppCopyMasked<CV_16S, nppiCopy_16s_C4MR>::call},
|
||||
/* 32S */ {NppCopyMasked<CV_32S, nppiCopy_32s_C1MR>::call, cv::gpu::details::copyWithMask, NppCopyMasked<CV_32S, nppiCopy_32s_C3MR>::call, NppCopyMasked<CV_32S, nppiCopy_32s_C4MR>::call},
|
||||
/* 32F */ {NppCopyMasked<CV_32F, nppiCopy_32f_C1MR>::call, cv::gpu::details::copyWithMask, NppCopyMasked<CV_32F, nppiCopy_32f_C3MR>::call, NppCopyMasked<CV_32F, nppiCopy_32f_C4MR>::call},
|
||||
/* 64F */ {cv::gpu::details::copyWithMask , cv::gpu::details::copyWithMask, cv::gpu::details::copyWithMask , cv::gpu::details::copyWithMask }
|
||||
/* 8U */ {NppCopyMasked<CV_8U , nppiCopy_8u_C1MR >::call, cv::gpu::device::copyWithMask, NppCopyMasked<CV_8U , nppiCopy_8u_C3MR >::call, NppCopyMasked<CV_8U , nppiCopy_8u_C4MR >::call},
|
||||
/* 8S */ {cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask },
|
||||
/* 16U */ {NppCopyMasked<CV_16U, nppiCopy_16u_C1MR>::call, cv::gpu::device::copyWithMask, NppCopyMasked<CV_16U, nppiCopy_16u_C3MR>::call, NppCopyMasked<CV_16U, nppiCopy_16u_C4MR>::call},
|
||||
/* 16S */ {NppCopyMasked<CV_16S, nppiCopy_16s_C1MR>::call, cv::gpu::device::copyWithMask, NppCopyMasked<CV_16S, nppiCopy_16s_C3MR>::call, NppCopyMasked<CV_16S, nppiCopy_16s_C4MR>::call},
|
||||
/* 32S */ {NppCopyMasked<CV_32S, nppiCopy_32s_C1MR>::call, cv::gpu::device::copyWithMask, NppCopyMasked<CV_32S, nppiCopy_32s_C3MR>::call, NppCopyMasked<CV_32S, nppiCopy_32s_C4MR>::call},
|
||||
/* 32F */ {NppCopyMasked<CV_32F, nppiCopy_32f_C1MR>::call, cv::gpu::device::copyWithMask, NppCopyMasked<CV_32F, nppiCopy_32f_C3MR>::call, NppCopyMasked<CV_32F, nppiCopy_32f_C4MR>::call},
|
||||
/* 64F */ {cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask }
|
||||
};
|
||||
|
||||
const func_t func = mask.channels() == src.channels() ? funcs[src.depth()][src.channels() - 1] : cv::gpu::details::copyWithMask;
|
||||
const func_t func = mask.channels() == src.channels() ? funcs[src.depth()][src.channels() - 1] : cv::gpu::device::copyWithMask;
|
||||
|
||||
func(src, dst, mask, 0);
|
||||
}
|
||||
@ -971,7 +959,7 @@ namespace
|
||||
func(src, dst);
|
||||
}
|
||||
|
||||
void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta) const
|
||||
void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream) const
|
||||
{
|
||||
CV_Assert(src.depth() <= CV_64F && src.channels() <= 4);
|
||||
CV_Assert(dst.depth() <= CV_64F);
|
||||
@ -982,10 +970,10 @@ namespace
|
||||
CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
|
||||
}
|
||||
|
||||
cv::gpu::device::convertTo(src, dst, alpha, beta);
|
||||
cv::gpu::device::convertTo(src, dst, alpha, beta, stream);
|
||||
}
|
||||
|
||||
void setTo(GpuMat& m, Scalar s, const GpuMat& mask) const
|
||||
void setTo(GpuMat& m, Scalar s, const GpuMat& mask, cudaStream_t stream) const
|
||||
{
|
||||
if (mask.empty())
|
||||
{
|
||||
@ -1016,7 +1004,7 @@ namespace
|
||||
{NppSet<CV_16S, 1, nppiSet_16s_C1R>::call, NppSet<CV_16S, 2, nppiSet_16s_C2R>::call, cv::gpu::device::setTo , NppSet<CV_16S, 4, nppiSet_16s_C4R>::call},
|
||||
{NppSet<CV_32S, 1, nppiSet_32s_C1R>::call, cv::gpu::device::setTo , cv::gpu::device::setTo , NppSet<CV_32S, 4, nppiSet_32s_C4R>::call},
|
||||
{NppSet<CV_32F, 1, nppiSet_32f_C1R>::call, cv::gpu::device::setTo , cv::gpu::device::setTo , NppSet<CV_32F, 4, nppiSet_32f_C4R>::call},
|
||||
{cv::gpu::device::setTo , cv::gpu::device::setTo , cv::gpu::device::setTo , cv::gpu::device::setTo }
|
||||
{cv::gpu::device::setTo , cv::gpu::device::setTo , cv::gpu::device::setTo , cv::gpu::device::setTo }
|
||||
};
|
||||
|
||||
CV_Assert(m.depth() <= CV_64F && m.channels() <= 4);
|
||||
@ -1027,7 +1015,10 @@ namespace
|
||||
CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
|
||||
}
|
||||
|
||||
funcs[m.depth()][m.channels() - 1](m, s);
|
||||
if (stream)
|
||||
cv::gpu::device::setTo(m, s, stream);
|
||||
else
|
||||
funcs[m.depth()][m.channels() - 1](m, s);
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -1051,7 +1042,10 @@ namespace
|
||||
CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
|
||||
}
|
||||
|
||||
funcs[m.depth()][m.channels() - 1](m, s, mask);
|
||||
if (stream)
|
||||
cv::gpu::device::setTo(m, s, mask, stream);
|
||||
else
|
||||
funcs[m.depth()][m.channels() - 1](m, s, mask);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1065,5 +1059,5 @@ namespace
|
||||
cudaFree(devPtr);
|
||||
}
|
||||
};
|
||||
}
|
||||
#endif
|
||||
#endif
|
Loading…
x
Reference in New Issue
Block a user