optimizations:
- new reduce implementation (with Kepler optimizations) - saturate_cast via asm command - video SIMD instructions in element operations - float arithmetic instead of double - new deviceSupports function
This commit is contained in:
@@ -79,6 +79,8 @@ namespace cv { namespace gpu
|
||||
WARP_SHUFFLE_FUNCTIONS = FEATURE_SET_COMPUTE_30
|
||||
};
|
||||
|
||||
CV_EXPORTS bool deviceSupports(FeatureSet feature_set);
|
||||
|
||||
// Gives information about what GPU archs this OpenCV GPU module was
|
||||
// compiled for
|
||||
class CV_EXPORTS TargetArchs
|
||||
|
@@ -44,6 +44,7 @@
|
||||
#include "opencv2/gpu/device/saturate_cast.hpp"
|
||||
#include "opencv2/gpu/device/transform.hpp"
|
||||
#include "opencv2/gpu/device/functional.hpp"
|
||||
#include "opencv2/gpu/device/type_traits.hpp"
|
||||
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
@@ -54,6 +55,7 @@ namespace cv { namespace gpu { namespace device
|
||||
void writeScalar(const int*);
|
||||
void writeScalar(const float*);
|
||||
void writeScalar(const double*);
|
||||
void copyToWithMask_gpu(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream);
|
||||
void convert_gpu(PtrStepSzb, int, PtrStepSzb, int, double, double, cudaStream_t);
|
||||
}}}
|
||||
|
||||
@@ -226,16 +228,16 @@ namespace cv { namespace gpu { namespace device
|
||||
//////////////////////////////// ConvertTo ////////////////////////////////
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Unary functor computing saturate_cast<D>(alpha * src + beta) for one element.
//
// T - source element type
// D - destination element type
// S - type in which the scale (alpha) and shift (beta) are stored; the
//     dispatch table in convert_gpu instantiates it as float or double.
//
// The constructor carries no __device__ qualifier, so the functor is built on
// the host; operator() is the device-side entry point.
template <typename T, typename D, typename S> struct Convertor : unary_function<T, D>
{
    Convertor(S alpha_, S beta_) : alpha(alpha_), beta(beta_) {}

    // NOTE(review): TypeTraits<T>::ParameterType comes from type_traits.hpp;
    // presumably it selects by-value or by-const-reference passing - confirm.
    __device__ __forceinline__ D operator()(typename TypeTraits<T>::ParameterType src) const
    {
        return saturate_cast<D>(alpha * src + beta);
    }

    S alpha, beta;
};
|
||||
|
||||
namespace detail
|
||||
@@ -282,16 +284,16 @@ namespace cv { namespace gpu { namespace device
|
||||
};
|
||||
}
|
||||
|
||||
// Transform traits for the three-parameter Convertor functor: everything is
// inherited from detail::ConvertTraits instantiated on the same functor type.
template <typename T, typename D, typename S> struct TransformFunctorTraits< Convertor<T, D, S> > : detail::ConvertTraits< Convertor<T, D, S> >
{
};
|
||||
|
||||
template<typename T, typename D>
|
||||
template<typename T, typename D, typename S>
|
||||
void cvt_(PtrStepSzb src, PtrStepSzb dst, double alpha, double beta, cudaStream_t stream)
|
||||
{
|
||||
cudaSafeCall( cudaSetDoubleForDevice(&alpha) );
|
||||
cudaSafeCall( cudaSetDoubleForDevice(&beta) );
|
||||
Convertor<T, D> op(alpha, beta);
|
||||
Convertor<T, D, S> op(static_cast<S>(alpha), static_cast<S>(beta));
|
||||
cv::gpu::device::transform((PtrStepSz<T>)src, (PtrStepSz<D>)dst, op, WithOutMask(), stream);
|
||||
}
|
||||
|
||||
@@ -304,36 +306,74 @@ namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
typedef void (*caller_t)(PtrStepSzb src, PtrStepSzb dst, double alpha, double beta, cudaStream_t stream);
|
||||
|
||||
static const caller_t tab[8][8] =
|
||||
static const caller_t tab[7][7] =
|
||||
{
|
||||
{cvt_<uchar, uchar>, cvt_<uchar, schar>, cvt_<uchar, ushort>, cvt_<uchar, short>,
|
||||
cvt_<uchar, int>, cvt_<uchar, float>, cvt_<uchar, double>, 0},
|
||||
|
||||
{cvt_<schar, uchar>, cvt_<schar, schar>, cvt_<schar, ushort>, cvt_<schar, short>,
|
||||
cvt_<schar, int>, cvt_<schar, float>, cvt_<schar, double>, 0},
|
||||
|
||||
{cvt_<ushort, uchar>, cvt_<ushort, schar>, cvt_<ushort, ushort>, cvt_<ushort, short>,
|
||||
cvt_<ushort, int>, cvt_<ushort, float>, cvt_<ushort, double>, 0},
|
||||
|
||||
{cvt_<short, uchar>, cvt_<short, schar>, cvt_<short, ushort>, cvt_<short, short>,
|
||||
cvt_<short, int>, cvt_<short, float>, cvt_<short, double>, 0},
|
||||
|
||||
{cvt_<int, uchar>, cvt_<int, schar>, cvt_<int, ushort>,
|
||||
cvt_<int, short>, cvt_<int, int>, cvt_<int, float>, cvt_<int, double>, 0},
|
||||
|
||||
{cvt_<float, uchar>, cvt_<float, schar>, cvt_<float, ushort>,
|
||||
cvt_<float, short>, cvt_<float, int>, cvt_<float, float>, cvt_<float, double>, 0},
|
||||
|
||||
{cvt_<double, uchar>, cvt_<double, schar>, cvt_<double, ushort>,
|
||||
cvt_<double, short>, cvt_<double, int>, cvt_<double, float>, cvt_<double, double>, 0},
|
||||
|
||||
{0,0,0,0,0,0,0,0}
|
||||
{
|
||||
cvt_<uchar, uchar, float>,
|
||||
cvt_<uchar, schar, float>,
|
||||
cvt_<uchar, ushort, float>,
|
||||
cvt_<uchar, short, float>,
|
||||
cvt_<uchar, int, float>,
|
||||
cvt_<uchar, float, float>,
|
||||
cvt_<uchar, double, double>
|
||||
},
|
||||
{
|
||||
cvt_<schar, uchar, float>,
|
||||
cvt_<schar, schar, float>,
|
||||
cvt_<schar, ushort, float>,
|
||||
cvt_<schar, short, float>,
|
||||
cvt_<schar, int, float>,
|
||||
cvt_<schar, float, float>,
|
||||
cvt_<schar, double, double>
|
||||
},
|
||||
{
|
||||
cvt_<ushort, uchar, float>,
|
||||
cvt_<ushort, schar, float>,
|
||||
cvt_<ushort, ushort, float>,
|
||||
cvt_<ushort, short, float>,
|
||||
cvt_<ushort, int, float>,
|
||||
cvt_<ushort, float, float>,
|
||||
cvt_<ushort, double, double>
|
||||
},
|
||||
{
|
||||
cvt_<short, uchar, float>,
|
||||
cvt_<short, schar, float>,
|
||||
cvt_<short, ushort, float>,
|
||||
cvt_<short, short, float>,
|
||||
cvt_<short, int, float>,
|
||||
cvt_<short, float, float>,
|
||||
cvt_<short, double, double>
|
||||
},
|
||||
{
|
||||
cvt_<int, uchar, float>,
|
||||
cvt_<int, schar, float>,
|
||||
cvt_<int, ushort, float>,
|
||||
cvt_<int, short, float>,
|
||||
cvt_<int, int, double>,
|
||||
cvt_<int, float, double>,
|
||||
cvt_<int, double, double>
|
||||
},
|
||||
{
|
||||
cvt_<float, uchar, float>,
|
||||
cvt_<float, schar, float>,
|
||||
cvt_<float, ushort, float>,
|
||||
cvt_<float, short, float>,
|
||||
cvt_<float, int, float>,
|
||||
cvt_<float, float, float>,
|
||||
cvt_<float, double, double>
|
||||
},
|
||||
{
|
||||
cvt_<double, uchar, double>,
|
||||
cvt_<double, schar, double>,
|
||||
cvt_<double, ushort, double>,
|
||||
cvt_<double, short, double>,
|
||||
cvt_<double, int, double>,
|
||||
cvt_<double, float, double>,
|
||||
cvt_<double, double, double>
|
||||
}
|
||||
};
|
||||
|
||||
caller_t func = tab[sdepth][ddepth];
|
||||
if (!func)
|
||||
cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__, "convert_gpu");
|
||||
|
||||
func(src, dst, alpha, beta, stream);
|
||||
}
|
||||
|
||||
|
@@ -45,8 +45,7 @@
|
||||
#include <iostream>
|
||||
|
||||
#ifdef HAVE_CUDA
|
||||
#include <cuda.h>
|
||||
#include <cuda_runtime_api.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <npp.h>
|
||||
|
||||
#define CUDART_MINIMUM_REQUIRED_VERSION 4010
|
||||
@@ -69,33 +68,89 @@ using namespace cv::gpu;
|
||||
|
||||
namespace
|
||||
{
|
||||
// Compares value to set using the given comparator. Returns true if
|
||||
// there is at least one element x in the set satisfying to: x cmp value
|
||||
// predicate.
|
||||
template <typename Comparer>
|
||||
bool compareToSet(const std::string& set_as_str, int value, Comparer cmp)
|
||||
class CudaArch
|
||||
{
|
||||
public:
|
||||
CudaArch();
|
||||
|
||||
bool builtWith(FeatureSet feature_set) const;
|
||||
bool hasPtx(int major, int minor) const;
|
||||
bool hasBin(int major, int minor) const;
|
||||
bool hasEqualOrLessPtx(int major, int minor) const;
|
||||
bool hasEqualOrGreaterPtx(int major, int minor) const;
|
||||
bool hasEqualOrGreaterBin(int major, int minor) const;
|
||||
|
||||
private:
|
||||
static void fromStr(const string& set_as_str, vector<int>& arr);
|
||||
|
||||
vector<int> bin;
|
||||
vector<int> ptx;
|
||||
vector<int> features;
|
||||
};
|
||||
|
||||
// Single file-local instance queried by the TargetArchs functions.
const CudaArch cudaArch;

// Fills the three lists from the build-time strings. Without HAVE_CUDA the
// lists stay empty, so every query on the instance returns false.
CudaArch::CudaArch()
{
#ifdef HAVE_CUDA
    // The three parses are independent; each one fills its own vector.
    fromStr(CUDA_ARCH_PTX, ptx);
    fromStr(CUDA_ARCH_BIN, bin);
    fromStr(CUDA_ARCH_FEATURES, features);
#endif
}
|
||||
|
||||
// Returns true when the largest recorded feature value is at least
// feature_set; an empty feature list always yields false.
bool CudaArch::builtWith(FeatureSet feature_set) const
{
    if (features.empty())
        return false;

    // fromStr sorts the list, so back() is the maximum.
    const int highest = features.back();
    return highest >= feature_set;
}
|
||||
|
||||
bool CudaArch::hasPtx(int major, int minor) const
|
||||
{
|
||||
return find(ptx.begin(), ptx.end(), major * 10 + minor) != ptx.end();
|
||||
}
|
||||
|
||||
bool CudaArch::hasBin(int major, int minor) const
|
||||
{
|
||||
return find(bin.begin(), bin.end(), major * 10 + minor) != bin.end();
|
||||
}
|
||||
|
||||
bool CudaArch::hasEqualOrLessPtx(int major, int minor) const
|
||||
{
|
||||
return !ptx.empty() && (ptx.front() <= major * 10 + minor);
|
||||
}
|
||||
|
||||
bool CudaArch::hasEqualOrGreaterPtx(int major, int minor) const
|
||||
{
|
||||
return !ptx.empty() && (ptx.back() >= major * 10 + minor);
|
||||
}
|
||||
|
||||
bool CudaArch::hasEqualOrGreaterBin(int major, int minor) const
|
||||
{
|
||||
return !bin.empty() && (bin.back() >= major * 10 + minor);
|
||||
}
|
||||
|
||||
void CudaArch::fromStr(const string& set_as_str, vector<int>& arr)
|
||||
{
|
||||
if (set_as_str.find_first_not_of(" ") == string::npos)
|
||||
return false;
|
||||
return;
|
||||
|
||||
std::stringstream stream(set_as_str);
|
||||
istringstream stream(set_as_str);
|
||||
int cur_value;
|
||||
|
||||
while (!stream.eof())
|
||||
{
|
||||
stream >> cur_value;
|
||||
if (cmp(cur_value, value))
|
||||
return true;
|
||||
arr.push_back(cur_value);
|
||||
}
|
||||
|
||||
return false;
|
||||
sort(arr.begin(), arr.end());
|
||||
}
|
||||
}
|
||||
|
||||
bool cv::gpu::TargetArchs::builtWith(cv::gpu::FeatureSet feature_set)
|
||||
{
|
||||
#if defined (HAVE_CUDA)
|
||||
return ::compareToSet(CUDA_ARCH_FEATURES, feature_set, std::greater_equal<int>());
|
||||
return cudaArch.builtWith(feature_set);
|
||||
#else
|
||||
(void)feature_set;
|
||||
return false;
|
||||
@@ -110,7 +165,7 @@ bool cv::gpu::TargetArchs::has(int major, int minor)
|
||||
bool cv::gpu::TargetArchs::hasPtx(int major, int minor)
|
||||
{
|
||||
#if defined (HAVE_CUDA)
|
||||
return ::compareToSet(CUDA_ARCH_PTX, major * 10 + minor, std::equal_to<int>());
|
||||
return cudaArch.hasPtx(major, minor);
|
||||
#else
|
||||
(void)major;
|
||||
(void)minor;
|
||||
@@ -121,7 +176,7 @@ bool cv::gpu::TargetArchs::hasPtx(int major, int minor)
|
||||
bool cv::gpu::TargetArchs::hasBin(int major, int minor)
|
||||
{
|
||||
#if defined (HAVE_CUDA)
|
||||
return ::compareToSet(CUDA_ARCH_BIN, major * 10 + minor, std::equal_to<int>());
|
||||
return cudaArch.hasBin(major, minor);
|
||||
#else
|
||||
(void)major;
|
||||
(void)minor;
|
||||
@@ -132,8 +187,7 @@ bool cv::gpu::TargetArchs::hasBin(int major, int minor)
|
||||
bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor)
|
||||
{
|
||||
#if defined (HAVE_CUDA)
|
||||
return ::compareToSet(CUDA_ARCH_PTX, major * 10 + minor,
|
||||
std::less_equal<int>());
|
||||
return cudaArch.hasEqualOrLessPtx(major, minor);
|
||||
#else
|
||||
(void)major;
|
||||
(void)minor;
|
||||
@@ -143,14 +197,13 @@ bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor)
|
||||
|
||||
bool cv::gpu::TargetArchs::hasEqualOrGreater(int major, int minor)
|
||||
{
|
||||
return hasEqualOrGreaterPtx(major, minor) ||
|
||||
hasEqualOrGreaterBin(major, minor);
|
||||
return hasEqualOrGreaterPtx(major, minor) || hasEqualOrGreaterBin(major, minor);
|
||||
}
|
||||
|
||||
bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor)
|
||||
{
|
||||
#if defined (HAVE_CUDA)
|
||||
return ::compareToSet(CUDA_ARCH_PTX, major * 10 + minor, std::greater_equal<int>());
|
||||
return cudaArch.hasEqualOrGreaterPtx(major, minor);
|
||||
#else
|
||||
(void)major;
|
||||
(void)minor;
|
||||
@@ -161,8 +214,7 @@ bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor)
|
||||
bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor)
|
||||
{
|
||||
#if defined (HAVE_CUDA)
|
||||
return ::compareToSet(CUDA_ARCH_BIN, major * 10 + minor,
|
||||
std::greater_equal<int>());
|
||||
return cudaArch.hasEqualOrGreaterBin(major, minor);
|
||||
#else
|
||||
(void)major;
|
||||
(void)minor;
|
||||
@@ -170,6 +222,31 @@ bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor)
|
||||
#endif
|
||||
}
|
||||
|
||||
// Returns true when the module was built with feature_set AND the current
// device's version (major * 10 + minor) is at least feature_set.
//
// The device version is cached per device id so DeviceInfo is constructed
// only on the first query for each of the first cache_size devices.
// NOTE(review): the cache is a plain static array with no synchronization
// visible here - confirm whether concurrent callers are expected.
bool cv::gpu::deviceSupports(FeatureSet feature_set)
{
    // -1 marks a slot whose device has not been queried yet.
    static int versions[] =
    {
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
    };
    static const int cache_size = static_cast<int>(sizeof(versions) / sizeof(versions[0]));

    const int devId = getDevice();

    // Guard both ends of the range so an unexpected negative id can never
    // index outside the cache.
    const bool cacheable = (devId >= 0) && (devId < cache_size);

    int version;

    if (cacheable && versions[devId] >= 0)
        version = versions[devId];
    else
    {
        DeviceInfo dev(devId);
        version = dev.majorVersion() * 10 + dev.minorVersion();

        if (cacheable)
            versions[devId] = version;
    }

    return TargetArchs::builtWith(feature_set) && (version >= feature_set);
}
|
||||
|
||||
#if !defined (HAVE_CUDA)
|
||||
|
||||
#define throw_nogpu CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support")
|
||||
|
Reference in New Issue
Block a user