Merge pull request #512 from jet47:core-gpu-updates

2013-02-21 12:22:03 +04:00
parent c1a5d1c44a ce2559b332
commit decf04dfec
2 changed files with 219 additions and 165 deletions
--- a/modules/core/include/opencv2/core/gpumat.hpp
+++ b/modules/core/include/opencv2/core/gpumat.hpp
@@ -73,12 +73,16 @@ namespace cv { namespace gpu
        FEATURE_SET_COMPUTE_20 = 20,
        FEATURE_SET_COMPUTE_21 = 21,
        FEATURE_SET_COMPUTE_30 = 30,
        FEATURE_SET_COMPUTE_35 = 35,
        GLOBAL_ATOMICS = FEATURE_SET_COMPUTE_11,
        SHARED_ATOMICS = FEATURE_SET_COMPUTE_12,
        NATIVE_DOUBLE = FEATURE_SET_COMPUTE_13,
-        WARP_SHUFFLE_FUNCTIONS = FEATURE_SET_COMPUTE_30
+        WARP_SHUFFLE_FUNCTIONS = FEATURE_SET_COMPUTE_30,
        DYNAMIC_PARALLELISM = FEATURE_SET_COMPUTE_35
    };
    // Checks whether current device supports the given feature
    CV_EXPORTS bool deviceSupports(FeatureSet feature_set);
    // Gives information about what GPU archs this OpenCV GPU module was
@@ -116,6 +120,9 @@ namespace cv { namespace gpu
        int multiProcessorCount() const { return multi_processor_count_; }
        size_t sharedMemPerBlock() const;
        void queryMemory(size_t& totalMemory, size_t& freeMemory) const;
        size_t freeMemory() const;
        size_t totalMemory() const;
@@ -129,7 +136,6 @@ namespace cv { namespace gpu
    private:
        void query();
        void queryMemory(size_t& free_memory, size_t& total_memory) const;
        int device_id_;
@@ -549,29 +555,6 @@ namespace cv { namespace gpu
    {
        ensureSizeIsEnough(size.height, size.width, type, m);
    }
    // Prepares m as a rows x cols matrix of the given type backed by
    // continuous storage. Unless m is already continuous with a matching type
    // and exactly rows * cols elements, it is first passed to
    // ensureSizeIsEnough() as a single row; the result is then reshaped to
    // the requested number of rows (channel count unchanged).
    inline void createContinuous(int rows, int cols, int type, GpuMat& m)
    {
        const int total = rows * cols;
        const bool reusable = m.isContinuous() && m.type() == type && m.size().area() == total;
        if (!reusable)
        {
            ensureSizeIsEnough(1, total, type, m);
        }
        m = m.reshape(0, rows);
    }
    // Makes m usable as a rows x cols matrix of the given type. When m
    // already has that type and is at least as large in both dimensions, it
    // is replaced by its top-left rows x cols region; otherwise it is
    // (re)created with the requested geometry.
    inline void ensureSizeIsEnough(int rows, int cols, int type, GpuMat& m)
    {
        const bool fits = m.type() == type && m.rows >= rows && m.cols >= cols;
        if (!fits)
        {
            m.create(rows, cols, type);
            return;
        }
        m = m(Rect(0, 0, cols, rows));
    }
    // Returns a rows x cols matrix of the given type taken from mat. If mat
    // is non-empty, has the requested type and is large enough, the top-left
    // region of mat is returned; otherwise mat is replaced by a freshly
    // constructed GpuMat of the requested geometry and that is returned.
    inline GpuMat allocMatFromBuf(int rows, int cols, int type, GpuMat &mat)
    {
        const bool fits = !mat.empty() && mat.type() == type && mat.rows >= rows && mat.cols >= cols;
        if (!fits)
        {
            mat = GpuMat(rows, cols, type);
            return mat;
        }
        return mat(Rect(0, 0, cols, rows));
    }
 }}
 #endif // __cplusplus
--- a/modules/core/src/gpumat.cpp
+++ b/modules/core/src/gpumat.cpp
@@ -48,8 +48,8 @@
    #include <cuda_runtime.h>
    #include <npp.h>
-    #define CUDART_MINIMUM_REQUIRED_VERSION 4010
+    #define CUDART_MINIMUM_REQUIRED_VERSION 4020
-    #define NPP_MINIMUM_REQUIRED_VERSION 4100
+    #define NPP_MINIMUM_REQUIRED_VERSION 4200
    #if (CUDART_VERSION < CUDART_MINIMUM_REQUIRED_VERSION)
        #error "Insufficient Cuda Runtime library version, please update it."
@@ -64,8 +64,108 @@ using namespace std;
 using namespace cv;
 using namespace cv::gpu;
 #ifndef HAVE_CUDA
 #define throw_nogpu CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support")
 #else // HAVE_CUDA
 namespace
 {
 #if defined(__GNUC__)
    #define cudaSafeCall(expr)  ___cudaSafeCall(expr, __FILE__, __LINE__, __func__)
    #define nppSafeCall(expr)  ___nppSafeCall(expr, __FILE__, __LINE__, __func__)
 #else /* defined(__CUDACC__) || defined(__MSVC__) */
    #define cudaSafeCall(expr)  ___cudaSafeCall(expr, __FILE__, __LINE__)
    #define nppSafeCall(expr)  ___nppSafeCall(expr, __FILE__, __LINE__)
 #endif
    inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
    {
        if (cudaSuccess != err)
            cv::gpu::error(cudaGetErrorString(err), file, line, func);
    }
    inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "")
    {
        if (err < 0)
        {
            std::ostringstream msg;
            msg << "NPP API Call Error: " << err;
            cv::gpu::error(msg.str().c_str(), file, line, func);
        }
    }
 }
 #endif // HAVE_CUDA
 //////////////////////////////// Initialization & Info ////////////////////////
 #ifndef HAVE_CUDA
 // Stub implementations compiled when HAVE_CUDA is not defined.
 // getCudaEnabledDeviceCount() simply reports 0 devices; every other entry
 // point expands throw_nogpu, i.e. CV_Error(CV_GpuNotSupported, ...). The
 // return statements that follow throw_nogpu only give the non-void
 // signatures a value to return.
 int cv::gpu::getCudaEnabledDeviceCount() { return 0; }
 void cv::gpu::setDevice(int) { throw_nogpu; }
 int cv::gpu::getDevice() { throw_nogpu; return 0; }
 void cv::gpu::resetDevice() { throw_nogpu; }
 bool cv::gpu::deviceSupports(FeatureSet) { throw_nogpu; return false; }
 bool cv::gpu::TargetArchs::builtWith(FeatureSet) { throw_nogpu; return false; }
 bool cv::gpu::TargetArchs::has(int, int) { throw_nogpu; return false; }
 bool cv::gpu::TargetArchs::hasPtx(int, int) { throw_nogpu; return false; }
 bool cv::gpu::TargetArchs::hasBin(int, int) { throw_nogpu; return false; }
 bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int, int) { throw_nogpu; return false; }
 bool cv::gpu::TargetArchs::hasEqualOrGreater(int, int) { throw_nogpu; return false; }
 bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int, int) { throw_nogpu; return false; }
 bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int, int) { throw_nogpu; return false; }
 size_t cv::gpu::DeviceInfo::sharedMemPerBlock() const { throw_nogpu; return 0; }
 void cv::gpu::DeviceInfo::queryMemory(size_t&, size_t&) const { throw_nogpu; }
 size_t cv::gpu::DeviceInfo::freeMemory() const { throw_nogpu; return 0; }
 size_t cv::gpu::DeviceInfo::totalMemory() const { throw_nogpu; return 0; }
 bool cv::gpu::DeviceInfo::supports(FeatureSet) const { throw_nogpu; return false; }
 bool cv::gpu::DeviceInfo::isCompatible() const { throw_nogpu; return false; }
 void cv::gpu::DeviceInfo::query() { throw_nogpu; }
 void cv::gpu::printCudaDeviceInfo(int) { throw_nogpu; }
 void cv::gpu::printShortCudaDeviceInfo(int) { throw_nogpu; }
 #else // HAVE_CUDA
 int cv::gpu::getCudaEnabledDeviceCount()
 {
    int count;
    cudaError_t error = cudaGetDeviceCount( &count );
    if (error == cudaErrorInsufficientDriver)
        return -1;
    if (error == cudaErrorNoDevice)
        return 0;
    cudaSafeCall( error );
    return count;
 }
 // Passes `device` to cudaSetDevice(); the resulting status is checked by
 // cudaSafeCall.
 void cv::gpu::setDevice(int device)
 {
    cudaSafeCall( cudaSetDevice( device ) );
 }
 // Returns the device ID written by cudaGetDevice(); the call's status is
 // checked by cudaSafeCall before the value is returned.
 int cv::gpu::getDevice()
 {
    int device;
    cudaSafeCall( cudaGetDevice( &device ) );
    return device;
 }
 // Calls cudaDeviceReset(); the resulting status is checked by cudaSafeCall.
 void cv::gpu::resetDevice()
 {
    cudaSafeCall( cudaDeviceReset() );
 }
 namespace
 {
    class CudaArch
@@ -92,11 +192,9 @@ namespace
    // Fills bin, ptx and features by passing CUDA_ARCH_BIN, CUDA_ARCH_PTX and
    // CUDA_ARCH_FEATURES to fromStr() (presumably build-configuration
    // strings -- TODO confirm where they are defined). When HAVE_CUDA is not
    // defined the constructor body is empty.
    CudaArch::CudaArch()
    {
    #ifdef HAVE_CUDA
        fromStr(CUDA_ARCH_BIN, bin);
        fromStr(CUDA_ARCH_PTX, ptx);
        fromStr(CUDA_ARCH_FEATURES, features);
    #endif
    }
    bool CudaArch::builtWith(FeatureSet feature_set) const
@@ -149,12 +247,7 @@ namespace
 // Forwards the query to cudaArch.builtWith(). When HAVE_CUDA is not defined
 // the argument is ignored and false is returned.
 bool cv::gpu::TargetArchs::builtWith(cv::gpu::FeatureSet feature_set)
 {
 #if defined (HAVE_CUDA)
    return cudaArch.builtWith(feature_set);
 #else
    (void)feature_set;
    return false;
 #endif
 }
 bool cv::gpu::TargetArchs::has(int major, int minor)
@@ -164,35 +257,17 @@ bool cv::gpu::TargetArchs::has(int major, int minor)
 // Forwards the (major, minor) query to cudaArch.hasPtx(). When HAVE_CUDA is
 // not defined both arguments are ignored and false is returned.
 bool cv::gpu::TargetArchs::hasPtx(int major, int minor)
 {
 #if defined (HAVE_CUDA)
    return cudaArch.hasPtx(major, minor);
 #else
    (void)major;
    (void)minor;
    return false;
 #endif
 }
 // Forwards the (major, minor) query to cudaArch.hasBin(). When HAVE_CUDA is
 // not defined both arguments are ignored and false is returned.
 bool cv::gpu::TargetArchs::hasBin(int major, int minor)
 {
 #if defined (HAVE_CUDA)
    return cudaArch.hasBin(major, minor);
 #else
    (void)major;
    (void)minor;
    return false;
 #endif
 }
 // Forwards the (major, minor) query to cudaArch.hasEqualOrLessPtx(). When
 // HAVE_CUDA is not defined both arguments are ignored and false is returned.
 bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor)
 {
 #if defined (HAVE_CUDA)
    return cudaArch.hasEqualOrLessPtx(major, minor);
 #else
    (void)major;
    (void)minor;
    return false;
 #endif
 }
 bool cv::gpu::TargetArchs::hasEqualOrGreater(int major, int minor)
@@ -202,24 +277,12 @@ bool cv::gpu::TargetArchs::hasEqualOrGreater(int major, int minor)
 // Forwards the (major, minor) query to cudaArch.hasEqualOrGreaterPtx(). When
 // HAVE_CUDA is not defined both arguments are ignored and false is returned.
 bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor)
 {
 #if defined (HAVE_CUDA)
    return cudaArch.hasEqualOrGreaterPtx(major, minor);
 #else
    (void)major;
    (void)minor;
    return false;
 #endif
 }
 // Forwards the (major, minor) query to cudaArch.hasEqualOrGreaterBin(). When
 // HAVE_CUDA is not defined both arguments are ignored and false is returned.
 bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor)
 {
 #if defined (HAVE_CUDA)
    return cudaArch.hasEqualOrGreaterBin(major, minor);
 #else
    (void)major;
    (void)minor;
    return false;
 #endif
 }
 bool cv::gpu::deviceSupports(FeatureSet feature_set)
@@ -247,108 +310,84 @@ bool cv::gpu::deviceSupports(FeatureSet feature_set)
    return TargetArchs::builtWith(feature_set) && (version >= feature_set);
 }
 #if !defined (HAVE_CUDA)
 #define throw_nogpu CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support")
 // Stub implementations for builds where HAVE_CUDA is not defined.
 // getCudaEnabledDeviceCount() reports 0 devices; every other function
 // expands throw_nogpu, i.e. CV_Error(CV_GpuNotSupported, ...), and the
 // trailing return statements only satisfy the non-void signatures.
 int cv::gpu::getCudaEnabledDeviceCount() { return 0; }
 void cv::gpu::setDevice(int) { throw_nogpu; }
 int cv::gpu::getDevice() { throw_nogpu; return 0; }
 void cv::gpu::resetDevice() { throw_nogpu; }
 size_t cv::gpu::DeviceInfo::freeMemory() const { throw_nogpu; return 0; }
 size_t cv::gpu::DeviceInfo::totalMemory() const { throw_nogpu; return 0; }
 bool cv::gpu::DeviceInfo::supports(cv::gpu::FeatureSet) const { throw_nogpu; return false; }
 bool cv::gpu::DeviceInfo::isCompatible() const { throw_nogpu; return false; }
 void cv::gpu::DeviceInfo::query() { throw_nogpu; }
 void cv::gpu::DeviceInfo::queryMemory(size_t&, size_t&) const { throw_nogpu; }
 void cv::gpu::printCudaDeviceInfo(int) { throw_nogpu; }
 void cv::gpu::printShortCudaDeviceInfo(int) { throw_nogpu; }
 #undef throw_nogpu
 #else // HAVE_CUDA
 namespace
 {
-#if defined(__GNUC__)
+    class DeviceProps
    #define cudaSafeCall(expr)  ___cudaSafeCall(expr, __FILE__, __LINE__, __func__)
    #define nppSafeCall(expr)  ___nppSafeCall(expr, __FILE__, __LINE__, __func__)
 #else /* defined(__CUDACC__) || defined(__MSVC__) */
    #define cudaSafeCall(expr)  ___cudaSafeCall(expr, __FILE__, __LINE__)
    #define nppSafeCall(expr)  ___nppSafeCall(expr, __FILE__, __LINE__)
 #endif
    inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
    {
-        if (cudaSuccess != err)
+    public:
-            cv::gpu::error(cudaGetErrorString(err), file, line, func);
+        DeviceProps();
        ~DeviceProps();
        cudaDeviceProp* get(int devID);
    private:
        std::vector<cudaDeviceProp*> props_;
    };
    // Starts the cache with 10 null entries; get() fills an entry the first
    // time its device ID is requested and grows the vector for larger IDs.
    DeviceProps::DeviceProps()
    {
        props_.resize(10, 0);
    }
-    inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "")
+    DeviceProps::~DeviceProps()
    {
-        if (err < 0)
+        for (size_t i = 0; i < props_.size(); ++i)
        {
-            std::ostringstream msg;
+            if (props_[i])
-            msg << "NPP API Call Error: " << err;
+                delete props_[i];
            cv::gpu::error(msg.str().c_str(), file, line, func);
        }
        props_.clear();
    }
    // Returns the cached cudaDeviceProp record for device devID, querying the
    // CUDA runtime with cudaGetDeviceProperties() the first time an ID is
    // requested. The returned pointer stays stored in props_, so callers must
    // not delete it.
    // NOTE(review): a negative devID is not rejected here and would index
    // props_ out of range -- callers are assumed to pass a valid device ID.
    cudaDeviceProp* DeviceProps::get(int devID)
    {
        // Grow the cache on demand; the few spare slots avoid a resize for
        // every newly seen ID.
        if (devID >= (int) props_.size())
            props_.resize(devID + 5, 0);
        if (!props_[devID])
        {
            // Publish the entry only after the query has been checked. Caching
            // the pointer first would leave an uninitialized record in props_
            // whenever cudaSafeCall reports the failure by throwing, and every
            // later call would then return that garbage as if it were valid.
            cudaDeviceProp* queried = new cudaDeviceProp;
            try
            {
                cudaSafeCall( cudaGetDeviceProperties(queried, devID) );
            }
            catch (...)
            {
                delete queried;
                throw;
            }
            props_[devID] = queried;
        }
        return props_[devID];
    }
    DeviceProps deviceProps;
 }
-int cv::gpu::getCudaEnabledDeviceCount()
+size_t cv::gpu::DeviceInfo::sharedMemPerBlock() const
 {
-    int count;
+    return deviceProps.get(device_id_)->sharedMemPerBlock;
    cudaError_t error = cudaGetDeviceCount( &count );
    if (error == cudaErrorInsufficientDriver)
        return -1;
    if (error == cudaErrorNoDevice)
        return 0;
    cudaSafeCall(error);
    return count;
 }
-void cv::gpu::setDevice(int device)
+void cv::gpu::DeviceInfo::queryMemory(size_t& totalMemory, size_t& freeMemory) const
 {
-    cudaSafeCall( cudaSetDevice( device ) );
+    int prevDeviceID = getDevice();
-}
+    if (prevDeviceID != device_id_)
        setDevice(device_id_);
-int cv::gpu::getDevice()
+    cudaSafeCall( cudaMemGetInfo(&freeMemory, &totalMemory) );
 {
    int device;
    cudaSafeCall( cudaGetDevice( &device ) );
    return device;
 }
-void cv::gpu::resetDevice()
+    if (prevDeviceID != device_id_)
-{
+        setDevice(prevDeviceID);
    cudaSafeCall( cudaDeviceReset() );
 }
 size_t cv::gpu::DeviceInfo::freeMemory() const
 {
-    size_t free_memory, total_memory;
+    size_t totalMemory, freeMemory;
-    queryMemory(free_memory, total_memory);
+    queryMemory(totalMemory, freeMemory);
-    return free_memory;
+    return freeMemory;
 }
 size_t cv::gpu::DeviceInfo::totalMemory() const
 {
-    size_t free_memory, total_memory;
+    size_t totalMemory, freeMemory;
-    queryMemory(free_memory, total_memory);
+    queryMemory(totalMemory, freeMemory);
-    return total_memory;
+    return totalMemory;
 }
-bool cv::gpu::DeviceInfo::supports(cv::gpu::FeatureSet feature_set) const
+bool cv::gpu::DeviceInfo::supports(FeatureSet feature_set) const
 {
    int version = majorVersion() * 10 + minorVersion();
    return version >= feature_set;
@@ -370,24 +409,12 @@ bool cv::gpu::DeviceInfo::isCompatible() const
 void cv::gpu::DeviceInfo::query()
 {
-    cudaDeviceProp prop;
+    const cudaDeviceProp* prop = deviceProps.get(device_id_);
    cudaSafeCall(cudaGetDeviceProperties(&prop, device_id_));
    name_ = prop.name;
    multi_processor_count_ = prop.multiProcessorCount;
    majorVersion_ = prop.major;
    minorVersion_ = prop.minor;
 }
-void cv::gpu::DeviceInfo::queryMemory(size_t& free_memory, size_t& total_memory) const
+    name_ = prop->name;
-{
+    multi_processor_count_ = prop->multiProcessorCount;
-    int prev_device_id = getDevice();
+    majorVersion_ = prop->major;
-    if (prev_device_id != device_id_)
+    minorVersion_ = prop->minor;
        setDevice(device_id_);
    cudaSafeCall(cudaMemGetInfo(&free_memory, &total_memory));
    if (prev_device_id != device_id_)
        setDevice(prev_device_id);
 }
 namespace
@@ -764,6 +791,50 @@ cv::Mat::Mat(const GpuMat& m) : flags(0), dims(0), rows(0), cols(0), data(0), re
    m.download(*this);
 }
 // Prepares m as a rows x cols matrix of the given type with no padding
 // between rows. The current buffer is kept when m is non-empty, has the
 // requested type, is continuous and holds at least rows * cols elements;
 // otherwise m is re-created as a single row of rows * cols elements. In
 // both cases the header is then rewritten for the requested geometry.
 void cv::gpu::createContinuous(int rows, int cols, int type, GpuMat& m)
 {
    const int total = rows * cols;
    const bool reusable = !m.empty() && m.type() == type && m.isContinuous() && m.size().area() >= total;
    if (!reusable)
    {
        m.create(1, total, type);
    }
    // Describe the buffer as rows x cols with a step of exactly one row of
    // elements, and mark the header as continuous.
    m.rows = rows;
    m.cols = cols;
    m.step = m.elemSize() * cols;
    m.flags |= Mat::CONTINUOUS_FLAG;
 }
 // Makes m usable as a rows x cols matrix of the given type, keeping the
 // current allocation when it is large enough and re-creating m otherwise.
 void cv::gpu::ensureSizeIsEnough(int rows, int cols, int type, GpuMat& m)
 {
    // No reuse is attempted when m is empty, has a different type, or its data
    // pointer does not coincide with datastart.
    if (m.empty() || m.type() != type || m.data != m.datastart)
        m.create(rows, cols, type);
    else
    {
        // Estimate the extent of the whole allocation from the byte span
        // dataend - datastart and the row step; the current rows/cols act as
        // lower bounds for the estimate.
        const size_t esz = m.elemSize();
        const ptrdiff_t delta2 = m.dataend - m.datastart;
        const size_t minstep = m.cols * esz;
        Size wholeSize;
        wholeSize.height = std::max(static_cast<int>((delta2 - minstep) / m.step + 1), m.rows);
        wholeSize.width = std::max(static_cast<int>((delta2 - m.step * (wholeSize.height - 1)) / esz), m.cols);
        if (wholeSize.height < rows || wholeSize.width < cols)
            m.create(rows, cols, type);
        else
        {
            // Large enough: only the header dimensions change; data and step
            // are left as they are.
            m.cols = cols;
            m.rows = rows;
        }
    }
 }
 // Returns a rows x cols matrix of the given type taken from mat. If mat is
 // non-empty, has the requested type and is at least rows x cols, its
 // top-left region is returned; otherwise mat is replaced by a newly
 // constructed GpuMat of the requested geometry and that is returned.
 GpuMat cv::gpu::allocMatFromBuf(int rows, int cols, int type, GpuMat &mat)
 {
    const bool fits = !mat.empty() && mat.type() == type && mat.rows >= rows && mat.cols >= cols;
    if (!fits)
    {
        mat = GpuMat(rows, cols, type);
        return mat;
    }
    return mat(Rect(0, 0, cols, rows));
 }
 namespace
 {
    class GpuFuncTable
@@ -787,25 +858,25 @@ namespace
    };
 }
-#if !defined HAVE_CUDA || defined(CUDA_DISABLER_)
+#ifndef HAVE_CUDA
 namespace
 {
    class EmptyFuncTable : public GpuFuncTable
    {
    public:
-        void copy(const Mat&, GpuMat&) const { CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support"); }
+        void copy(const Mat&, GpuMat&) const { throw_nogpu; }
-        void copy(const GpuMat&, Mat&) const { CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support"); }
+        void copy(const GpuMat&, Mat&) const { throw_nogpu; }
-        void copy(const GpuMat&, GpuMat&) const { CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support"); }
+        void copy(const GpuMat&, GpuMat&) const { throw_nogpu; }
-        void copyWithMask(const GpuMat&, GpuMat&, const GpuMat&) const { CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support"); }
+        void copyWithMask(const GpuMat&, GpuMat&, const GpuMat&) const { throw_nogpu; }
-        void convert(const GpuMat&, GpuMat&) const { CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support"); }
+        void convert(const GpuMat&, GpuMat&) const { throw_nogpu; }
-        void convert(const GpuMat&, GpuMat&, double, double) const { CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support"); }
+        void convert(const GpuMat&, GpuMat&, double, double) const { throw_nogpu; }
-        void setTo(GpuMat&, Scalar, const GpuMat&) const { CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support"); }
+        void setTo(GpuMat&, Scalar, const GpuMat&) const { throw_nogpu; }
-        void mallocPitch(void**, size_t*, size_t, size_t) const { CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support"); }
+        void mallocPitch(void**, size_t*, size_t, size_t) const { throw_nogpu; }
        void free(void*) const {}
    };