refactored gpu module

This commit is contained in:
Alexey Spizhevoy 2011-01-19 10:54:58 +00:00
parent 8503f75212
commit 90ae1e3aed
6 changed files with 493 additions and 433 deletions

View File

@ -69,22 +69,33 @@ Returns true, if the specified GPU has atomics support, otherwise false.
\end{description} \end{description}
\cvCppFunc{gpu::checkPtxVersion} \cvCppFunc{gpu::ptxVersionIs}
Returns true if the GPU module was built with PTX support for the given compute capability; otherwise false. Returns true if the GPU module was built with PTX support for the given compute capability; otherwise false.
\cvdefCpp{template $<$unsigned int cmp\_op$>$\newline \cvdefCpp{bool ptxVersionIs(int major, int minor);}
bool checkPtxVersion(int major, int minor);}
\begin{description} \begin{description}
\cvarg{cmp\_op}{Comparison operation: \cvarg{major}{Major compute capability version.}
\cvarg{minor}{Minor compute capability version.}
\end{description}
\cvCppFunc{gpu::ptxVersionIsLessOrEqual}
Returns true if the GPU module was built with PTX support for at least one compute capability less than or equal to the given one; otherwise false.
\cvdefCpp{bool ptxVersionIsLessOrEqual(int major, int minor);}
\begin{description} \begin{description}
\cvarg{CMP\_EQ}{Return true, if at least one of GPU module PTX versions matches the given one, otherwise false} \cvarg{major}{Major compute capability version.}
\cvarg{CMP\_LT}{Return true, if at least one of GPU module PTX versions is less than the given one, otherwise false} \cvarg{minor}{Minor compute capability version.}
\cvarg{CMP\_LE}{Return true, if at least one of GPU module PTX versions is less or equal to the given one, otherwise false} \end{description}
\cvarg{CMP\_GT}{Return true, if at least one of GPU module PTX versions is greater than the given one, otherwise false}
\cvarg{CMP\_GE}{Return true, if at least one of GPU module PTX versions is greater or equal to the given one, otherwise false}
\end{description}} \cvCppFunc{gpu::ptxVersionIsGreaterOrEqual}
\cvarg{major}{Major CC version.} Returns true if the GPU module was built with PTX support for at least one compute capability greater than or equal to the given one; otherwise false.
\cvarg{minor}{Minor CC version.}
\cvdefCpp{bool ptxVersionIsGreaterOrEqual(int major, int minor);}
\begin{description}
\cvarg{major}{Major compute capability version.}
\cvarg{minor}{Minor compute capability version.}
\end{description} \end{description}

View File

@ -72,8 +72,9 @@ namespace cv
CV_EXPORTS bool hasNativeDoubleSupport(int device); CV_EXPORTS bool hasNativeDoubleSupport(int device);
CV_EXPORTS bool hasAtomicsSupport(int device); CV_EXPORTS bool hasAtomicsSupport(int device);
template <unsigned int cmp_op> CV_EXPORTS bool ptxVersionIs(int major, int minor);
CV_EXPORTS bool checkPtxVersion(int major, int minor); CV_EXPORTS bool ptxVersionIsLessOrEqual(int major, int minor);
CV_EXPORTS bool ptxVersionIsGreaterOrEqual(int major, int minor);
//! Checks if the GPU module is PTX compatible with the given NVIDIA device //! Checks if the GPU module is PTX compatible with the given NVIDIA device
CV_EXPORTS bool isCompatibleWith(int device); CV_EXPORTS bool isCompatibleWith(int device);

View File

@ -719,7 +719,7 @@ namespace cv { namespace gpu { namespace imgproc
////////////////////////////// Column Sum ////////////////////////////////////// ////////////////////////////// Column Sum //////////////////////////////////////
__global__ void column_sum_kernel_32F(int cols, int rows, const PtrStep src, const PtrStep dst) __global__ void column_sumKernel_32F(int cols, int rows, const PtrStep src, const PtrStep dst)
{ {
int x = blockIdx.x * blockDim.x + threadIdx.x; int x = blockIdx.x * blockDim.x + threadIdx.x;
@ -745,7 +745,7 @@ namespace cv { namespace gpu { namespace imgproc
dim3 threads(256); dim3 threads(256);
dim3 grid(divUp(src.cols, threads.x)); dim3 grid(divUp(src.cols, threads.x));
column_sum_kernel_32F<<<grid, threads>>>(src.cols, src.rows, src, dst); column_sumKernel_32F<<<grid, threads>>>(src.cols, src.rows, src, dst);
cudaSafeCall(cudaThreadSynchronize()); cudaSafeCall(cudaThreadSynchronize());
} }

File diff suppressed because it is too large Load Diff

View File

@ -133,85 +133,81 @@ CV_EXPORTS bool cv::gpu::hasAtomicsSupport(int device)
namespace namespace
{ {
template <unsigned int cmp_op> struct ComparerEqual
bool comparePairs(int lhs1, int lhs2, int rhs1, int rhs2);
template <>
bool comparePairs<CMP_EQ>(int lhs1, int lhs2, int rhs1, int rhs2)
{ {
return lhs1 == rhs1 && lhs2 == rhs2; bool operator()(int lhs1, int lhs2, int rhs1, int rhs2) const
} {
return lhs1 == rhs1 && lhs2 == rhs2;
}
};
template <>
bool comparePairs<CMP_GT>(int lhs1, int lhs2, int rhs1, int rhs2) struct ComparerLessOrEqual
{ {
return lhs1 > rhs1 || (lhs1 == rhs1 && lhs2 > rhs2); bool operator()(int lhs1, int lhs2, int rhs1, int rhs2) const
} {
return lhs1 < rhs1 || (lhs1 == rhs1 && lhs2 <= rhs2);
}
};
template <>
bool comparePairs<CMP_GE>(int lhs1, int lhs2, int rhs1, int rhs2) struct ComparerGreaterOrEqual
{ {
return lhs1 > rhs1 || (lhs1 == rhs1 && lhs2 >= rhs2); bool operator()(int lhs1, int lhs2, int rhs1, int rhs2) const
} {
return lhs1 > rhs1 || (lhs1 == rhs1 && lhs2 >= rhs2);
}
};
template <>
bool comparePairs<CMP_LT>(int lhs1, int lhs2, int rhs1, int rhs2) template <typename Comparer>
bool checkPtxVersion(int major, int minor, Comparer cmp)
{ {
return lhs1 < rhs1 || (lhs1 == rhs1 && lhs2 < rhs2);
}
template <>
bool comparePairs<CMP_LE>(int lhs1, int lhs2, int rhs1, int rhs2)
{
return lhs1 < rhs1 || (lhs1 == rhs1 && lhs2 <= rhs2);
}
template <>
bool comparePairs<CMP_NE>(int lhs1, int lhs2, int rhs1, int rhs2)
{
return lhs1 < rhs1 || (lhs1 == rhs1 && lhs2 <= rhs2);
}
}
template <unsigned int cmp_op>
CV_EXPORTS bool cv::gpu::checkPtxVersion(int major, int minor)
{
#ifdef OPENCV_GPU_CUDA_ARCH_10 #ifdef OPENCV_GPU_CUDA_ARCH_10
if (comparePairs<cmp_op>(1, 0, major, minor)) return true; if (cmp(1, 0, major, minor)) return true;
#endif #endif
#ifdef OPENCV_GPU_CUDA_ARCH_11 #ifdef OPENCV_GPU_CUDA_ARCH_11
if (comparePairs<cmp_op>(1, 1, major, minor)) return true; if (cmp(1, 1, major, minor)) return true;
#endif #endif
#ifdef OPENCV_GPU_CUDA_ARCH_12 #ifdef OPENCV_GPU_CUDA_ARCH_12
if (comparePairs<cmp_op>(1, 2, major, minor)) return true; if (cmp(1, 2, major, minor)) return true;
#endif #endif
#ifdef OPENCV_GPU_CUDA_ARCH_13 #ifdef OPENCV_GPU_CUDA_ARCH_13
if (comparePairs<cmp_op>(1, 3, major, minor)) return true; if (cmp(1, 3, major, minor)) return true;
#endif #endif
#ifdef OPENCV_GPU_CUDA_ARCH_20 #ifdef OPENCV_GPU_CUDA_ARCH_20
if (comparePairs<cmp_op>(2, 0, major, minor)) return true; if (cmp(2, 0, major, minor)) return true;
#endif #endif
#ifdef OPENCV_GPU_CUDA_ARCH_21 #ifdef OPENCV_GPU_CUDA_ARCH_21
if (comparePairs<cmp_op>(2, 1, major, minor)) return true; if (cmp(2, 1, major, minor)) return true;
#endif #endif
return false; return false;
}
} }
template CV_EXPORTS bool cv::gpu::checkPtxVersion<CMP_EQ>(int major, int minor); CV_EXPORTS bool cv::gpu::ptxVersionIs(int major, int minor)
template CV_EXPORTS bool cv::gpu::checkPtxVersion<CMP_GT>(int major, int minor); {
template CV_EXPORTS bool cv::gpu::checkPtxVersion<CMP_GE>(int major, int minor); return checkPtxVersion(major, minor, ComparerEqual());
template CV_EXPORTS bool cv::gpu::checkPtxVersion<CMP_LT>(int major, int minor); }
template CV_EXPORTS bool cv::gpu::checkPtxVersion<CMP_LE>(int major, int minor);
template CV_EXPORTS bool cv::gpu::checkPtxVersion<CMP_NE>(int major, int minor);
CV_EXPORTS bool cv::gpu::ptxVersionIsLessOrEqual(int major, int minor)
{
return checkPtxVersion(major, minor, ComparerLessOrEqual());
}
CV_EXPORTS bool cv::gpu::ptxVersionIsGreaterOrEqual(int major, int minor)
{
return checkPtxVersion(major, minor, ComparerGreaterOrEqual());
}
CV_EXPORTS bool isCompatibleWith(int device) CV_EXPORTS bool isCompatibleWith(int device)
@ -223,7 +219,7 @@ CV_EXPORTS bool isCompatibleWith(int device)
int major, minor; int major, minor;
getComputeCapability(device, major, minor); getComputeCapability(device, major, minor);
return checkPtxVersion<CMP_LE>(major, minor); return ptxVersionIsLessOrEqual(major, minor);
} }
#endif #endif

View File

@ -119,20 +119,20 @@ double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType)
namespace cv { namespace gpu { namespace mathfunc namespace cv { namespace gpu { namespace mathfunc
{ {
template <typename T> template <typename T>
void sum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn); void sumCaller(const DevMem2D src, PtrStep buf, double* sum, int cn);
template <typename T> template <typename T>
void sum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn); void sumMultipassCaller(const DevMem2D src, PtrStep buf, double* sum, int cn);
template <typename T> template <typename T>
void sqsum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn); void sqrSumCaller(const DevMem2D src, PtrStep buf, double* sum, int cn);
template <typename T> template <typename T>
void sqsum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn); void sqrSumMultipassCaller(const DevMem2D src, PtrStep buf, double* sum, int cn);
namespace sum namespace sum
{ {
void get_buf_size_required(int cols, int rows, int cn, int& bufcols, int& bufrows); void getBufSizeRequired(int cols, int rows, int cn, int& bufcols, int& bufrows);
} }
}}} }}}
@ -149,19 +149,27 @@ Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf)
using namespace mathfunc; using namespace mathfunc;
typedef void (*Caller)(const DevMem2D, PtrStep, double*, int); typedef void (*Caller)(const DevMem2D, PtrStep, double*, int);
static const Caller callers[2][7] =
{ { sum_multipass_caller<unsigned char>, sum_multipass_caller<char>,
sum_multipass_caller<unsigned short>, sum_multipass_caller<short>,
sum_multipass_caller<int>, sum_multipass_caller<float>, 0 },
{ sum_caller<unsigned char>, sum_caller<char>,
sum_caller<unsigned short>, sum_caller<short>,
sum_caller<int>, sum_caller<float>, 0 } };
Size bufSize; static Caller multipass_callers[7] = {
sum::get_buf_size_required(src.cols, src.rows, src.channels(), bufSize.width, bufSize.height); sumMultipassCaller<unsigned char>, sumMultipassCaller<char>,
ensureSizeIsEnough(bufSize, CV_8U, buf); sumMultipassCaller<unsigned short>, sumMultipassCaller<short>,
sumMultipassCaller<int>, sumMultipassCaller<float>, 0 };
Caller caller = callers[hasAtomicsSupport(getDevice())][src.depth()]; static Caller singlepass_callers[7] = {
sumCaller<unsigned char>, sumCaller<char>,
sumCaller<unsigned short>, sumCaller<short>,
sumCaller<int>, sumCaller<float>, 0 };
Size buf_size;
sum::getBufSizeRequired(src.cols, src.rows, src.channels(),
buf_size.width, buf_size.height);
ensureSizeIsEnough(buf_size, CV_8U, buf);
Caller* callers = multipass_callers;
if (ptxVersionIsGreaterOrEqual(1, 1) && hasAtomicsSupport(getDevice()))
callers = singlepass_callers;
Caller caller = callers[src.depth()];
if (!caller) CV_Error(CV_StsBadArg, "sum: unsupported type"); if (!caller) CV_Error(CV_StsBadArg, "sum: unsupported type");
double result[4]; double result[4];
@ -182,19 +190,27 @@ Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf)
using namespace mathfunc; using namespace mathfunc;
typedef void (*Caller)(const DevMem2D, PtrStep, double*, int); typedef void (*Caller)(const DevMem2D, PtrStep, double*, int);
static const Caller callers[2][7] =
{ { sqsum_multipass_caller<unsigned char>, sqsum_multipass_caller<char>,
sqsum_multipass_caller<unsigned short>, sqsum_multipass_caller<short>,
sqsum_multipass_caller<int>, sqsum_multipass_caller<float>, 0 },
{ sqsum_caller<unsigned char>, sqsum_caller<char>,
sqsum_caller<unsigned short>, sqsum_caller<short>,
sqsum_caller<int>, sqsum_caller<float>, 0 } };
Size bufSize; static Caller multipass_callers[7] = {
sum::get_buf_size_required(src.cols, src.rows, src.channels(), bufSize.width, bufSize.height); sqrSumMultipassCaller<unsigned char>, sqrSumMultipassCaller<char>,
ensureSizeIsEnough(bufSize, CV_8U, buf); sqrSumMultipassCaller<unsigned short>, sqrSumMultipassCaller<short>,
sqrSumMultipassCaller<int>, sqrSumMultipassCaller<float>, 0 };
Caller caller = callers[hasAtomicsSupport(getDevice())][src.depth()]; static Caller singlepass_callers[7] = {
sqrSumCaller<unsigned char>, sqrSumCaller<char>,
sqrSumCaller<unsigned short>, sqrSumCaller<short>,
sqrSumCaller<int>, sqrSumCaller<float>, 0 };
Caller* callers = multipass_callers;
if (ptxVersionIsGreaterOrEqual(1, 1) && hasAtomicsSupport(getDevice()))
callers = singlepass_callers;
Size buf_size;
sum::getBufSizeRequired(src.cols, src.rows, src.channels(),
buf_size.width, buf_size.height);
ensureSizeIsEnough(buf_size, CV_8U, buf);
Caller caller = callers[src.depth()];
if (!caller) CV_Error(CV_StsBadArg, "sqrSum: unsupported type"); if (!caller) CV_Error(CV_StsBadArg, "sqrSum: unsupported type");
double result[4]; double result[4];
@ -207,19 +223,19 @@ Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf)
namespace cv { namespace gpu { namespace mathfunc { namespace minmax { namespace cv { namespace gpu { namespace mathfunc { namespace minmax {
void get_buf_size_required(int cols, int rows, int elem_size, int& bufcols, int& bufrows); void getBufSizeRequired(int cols, int rows, int elem_size, int& bufcols, int& bufrows);
template <typename T> template <typename T>
void min_max_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf); void minMaxCaller(const DevMem2D src, double* minval, double* maxval, PtrStep buf);
template <typename T> template <typename T>
void min_max_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf); void minMaxMaskCaller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf);
template <typename T> template <typename T>
void min_max_multipass_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf); void minMaxMultipassCaller(const DevMem2D src, double* minval, double* maxval, PtrStep buf);
template <typename T> template <typename T>
void min_max_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf); void minMaxMaskMultipassCaller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf);
}}}} }}}}
@ -238,23 +254,26 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
typedef void (*Caller)(const DevMem2D, double*, double*, PtrStep); typedef void (*Caller)(const DevMem2D, double*, double*, PtrStep);
typedef void (*MaskedCaller)(const DevMem2D, const PtrStep, double*, double*, PtrStep); typedef void (*MaskedCaller)(const DevMem2D, const PtrStep, double*, double*, PtrStep);
static const Caller callers[2][7] = static Caller multipass_callers[7] = {
{ { min_max_multipass_caller<unsigned char>, min_max_multipass_caller<char>, minMaxMultipassCaller<unsigned char>, minMaxMultipassCaller<char>,
min_max_multipass_caller<unsigned short>, min_max_multipass_caller<short>, minMaxMultipassCaller<unsigned short>, minMaxMultipassCaller<short>,
min_max_multipass_caller<int>, min_max_multipass_caller<float>, 0 }, minMaxMultipassCaller<int>, minMaxMultipassCaller<float>, 0 };
{ min_max_caller<unsigned char>, min_max_caller<char>,
min_max_caller<unsigned short>, min_max_caller<short>,
min_max_caller<int>, min_max_caller<float>, min_max_caller<double> } };
static const MaskedCaller masked_callers[2][7] = static Caller singlepass_callers[7] = {
{ { min_max_mask_multipass_caller<unsigned char>, min_max_mask_multipass_caller<char>, minMaxCaller<unsigned char>, minMaxCaller<char>,
min_max_mask_multipass_caller<unsigned short>, min_max_mask_multipass_caller<short>, minMaxCaller<unsigned short>, minMaxCaller<short>,
min_max_mask_multipass_caller<int>, min_max_mask_multipass_caller<float>, 0 }, minMaxCaller<int>, minMaxCaller<float>, minMaxCaller<double> };
{ min_max_mask_caller<unsigned char>, min_max_mask_caller<char>,
min_max_mask_caller<unsigned short>, min_max_mask_caller<short>,
min_max_mask_caller<int>, min_max_mask_caller<float>,
min_max_mask_caller<double> } };
static MaskedCaller masked_multipass_callers[7] = {
minMaxMaskMultipassCaller<unsigned char>, minMaxMaskMultipassCaller<char>,
minMaxMaskMultipassCaller<unsigned short>, minMaxMaskMultipassCaller<short>,
minMaxMaskMultipassCaller<int>, minMaxMaskMultipassCaller<float>, 0 };
static MaskedCaller masked_singlepass_callers[7] = {
minMaxMaskCaller<unsigned char>, minMaxMaskCaller<char>,
minMaxMaskCaller<unsigned short>, minMaxMaskCaller<short>,
minMaxMaskCaller<int>, minMaxMaskCaller<float>,
minMaxMaskCaller<double> };
CV_Assert(src.channels() == 1); CV_Assert(src.channels() == 1);
CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size())); CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size()));
@ -263,19 +282,27 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
double minVal_; if (!minVal) minVal = &minVal_; double minVal_; if (!minVal) minVal = &minVal_;
double maxVal_; if (!maxVal) maxVal = &maxVal_; double maxVal_; if (!maxVal) maxVal = &maxVal_;
Size bufSize; Size buf_size;
get_buf_size_required(src.cols, src.rows, src.elemSize(), bufSize.width, bufSize.height); getBufSizeRequired(src.cols, src.rows, src.elemSize(), buf_size.width, buf_size.height);
ensureSizeIsEnough(bufSize, CV_8U, buf); ensureSizeIsEnough(buf_size, CV_8U, buf);
if (mask.empty()) if (mask.empty())
{ {
Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()]; Caller* callers = multipass_callers;
if (ptxVersionIsGreaterOrEqual(1, 1) && hasAtomicsSupport(getDevice()))
callers = singlepass_callers;
Caller caller = callers[src.type()];
if (!caller) CV_Error(CV_StsBadArg, "minMax: unsupported type"); if (!caller) CV_Error(CV_StsBadArg, "minMax: unsupported type");
caller(src, minVal, maxVal, buf); caller(src, minVal, maxVal, buf);
} }
else else
{ {
MaskedCaller caller = masked_callers[hasAtomicsSupport(getDevice())][src.type()]; MaskedCaller* callers = masked_multipass_callers;
if (ptxVersionIsGreaterOrEqual(1, 1) && hasAtomicsSupport(getDevice()))
callers = masked_singlepass_callers;
MaskedCaller caller = callers[src.type()];
if (!caller) CV_Error(CV_StsBadArg, "minMax: unsupported type"); if (!caller) CV_Error(CV_StsBadArg, "minMax: unsupported type");
caller(src, mask, minVal, maxVal, buf); caller(src, mask, minVal, maxVal, buf);
} }
@ -287,23 +314,23 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
namespace cv { namespace gpu { namespace mathfunc { namespace minmaxloc { namespace cv { namespace gpu { namespace mathfunc { namespace minmaxloc {
void get_buf_size_required(int cols, int rows, int elem_size, int& b1cols, void getBufSizeRequired(int cols, int rows, int elem_size, int& b1cols,
int& b1rows, int& b2cols, int& b2rows); int& b1rows, int& b2cols, int& b2rows);
template <typename T> template <typename T>
void min_max_loc_caller(const DevMem2D src, double* minval, double* maxval, void minMaxLocCaller(const DevMem2D src, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStep valBuf, PtrStep locBuf); int minloc[2], int maxloc[2], PtrStep valBuf, PtrStep locBuf);
template <typename T> template <typename T>
void min_max_loc_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, void minMaxLocMaskCaller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStep valBuf, PtrStep locBuf); int minloc[2], int maxloc[2], PtrStep valBuf, PtrStep locBuf);
template <typename T> template <typename T>
void min_max_loc_multipass_caller(const DevMem2D src, double* minval, double* maxval, void minMaxLocMultipassCaller(const DevMem2D src, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStep valBuf, PtrStep locBuf); int minloc[2], int maxloc[2], PtrStep valBuf, PtrStep locBuf);
template <typename T> template <typename T>
void min_max_loc_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, void minMaxLocMaskMultipassCaller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStep valBuf, PtrStep locBuf); int minloc[2], int maxloc[2], PtrStep valBuf, PtrStep locBuf);
}}}} }}}}
@ -323,21 +350,26 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
typedef void (*Caller)(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); typedef void (*Caller)(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
typedef void (*MaskedCaller)(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); typedef void (*MaskedCaller)(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
static const Caller callers[2][7] = static Caller multipass_callers[7] = {
{ { min_max_loc_multipass_caller<unsigned char>, min_max_loc_multipass_caller<char>, minMaxLocMultipassCaller<unsigned char>, minMaxLocMultipassCaller<char>,
min_max_loc_multipass_caller<unsigned short>, min_max_loc_multipass_caller<short>, minMaxLocMultipassCaller<unsigned short>, minMaxLocMultipassCaller<short>,
min_max_loc_multipass_caller<int>, min_max_loc_multipass_caller<float>, 0 }, minMaxLocMultipassCaller<int>, minMaxLocMultipassCaller<float>, 0 };
{ min_max_loc_caller<unsigned char>, min_max_loc_caller<char>,
min_max_loc_caller<unsigned short>, min_max_loc_caller<short>,
min_max_loc_caller<int>, min_max_loc_caller<float>, min_max_loc_caller<double> } };
static const MaskedCaller masked_callers[2][7] = static Caller singlepass_callers[7] = {
{ { min_max_loc_mask_multipass_caller<unsigned char>, min_max_loc_mask_multipass_caller<char>, minMaxLocCaller<unsigned char>, minMaxLocCaller<char>,
min_max_loc_mask_multipass_caller<unsigned short>, min_max_loc_mask_multipass_caller<short>, minMaxLocCaller<unsigned short>, minMaxLocCaller<short>,
min_max_loc_mask_multipass_caller<int>, min_max_loc_mask_multipass_caller<float>, 0 }, minMaxLocCaller<int>, minMaxLocCaller<float>, minMaxLocCaller<double> };
{ min_max_loc_mask_caller<unsigned char>, min_max_loc_mask_caller<char>,
min_max_loc_mask_caller<unsigned short>, min_max_loc_mask_caller<short>, static MaskedCaller masked_multipass_callers[7] = {
min_max_loc_mask_caller<int>, min_max_loc_mask_caller<float>, min_max_loc_mask_caller<double> } }; minMaxLocMaskMultipassCaller<unsigned char>, minMaxLocMaskMultipassCaller<char>,
minMaxLocMaskMultipassCaller<unsigned short>, minMaxLocMaskMultipassCaller<short>,
minMaxLocMaskMultipassCaller<int>, minMaxLocMaskMultipassCaller<float>, 0 };
static MaskedCaller masked_singlepass_callers[7] = {
minMaxLocMaskCaller<unsigned char>, minMaxLocMaskCaller<char>,
minMaxLocMaskCaller<unsigned short>, minMaxLocMaskCaller<short>,
minMaxLocMaskCaller<int>, minMaxLocMaskCaller<float>,
minMaxLocMaskCaller<double> };
CV_Assert(src.channels() == 1); CV_Assert(src.channels() == 1);
CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size())); CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size()));
@ -348,21 +380,29 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
int minLoc_[2]; int minLoc_[2];
int maxLoc_[2]; int maxLoc_[2];
Size valBufSize, locBufSize; Size valbuf_size, locbuf_size;
get_buf_size_required(src.cols, src.rows, src.elemSize(), valBufSize.width, getBufSizeRequired(src.cols, src.rows, src.elemSize(), valbuf_size.width,
valBufSize.height, locBufSize.width, locBufSize.height); valbuf_size.height, locbuf_size.width, locbuf_size.height);
ensureSizeIsEnough(valBufSize, CV_8U, valBuf); ensureSizeIsEnough(valbuf_size, CV_8U, valBuf);
ensureSizeIsEnough(locBufSize, CV_8U, locBuf); ensureSizeIsEnough(locbuf_size, CV_8U, locBuf);
if (mask.empty()) if (mask.empty())
{ {
Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()]; Caller* callers = multipass_callers;
if (ptxVersionIsGreaterOrEqual(1, 1) && hasAtomicsSupport(getDevice()))
callers = singlepass_callers;
Caller caller = callers[src.type()];
if (!caller) CV_Error(CV_StsBadArg, "minMaxLoc: unsupported type"); if (!caller) CV_Error(CV_StsBadArg, "minMaxLoc: unsupported type");
caller(src, minVal, maxVal, minLoc_, maxLoc_, valBuf, locBuf); caller(src, minVal, maxVal, minLoc_, maxLoc_, valBuf, locBuf);
} }
else else
{ {
MaskedCaller caller = masked_callers[hasAtomicsSupport(getDevice())][src.type()]; MaskedCaller* callers = masked_multipass_callers;
if (ptxVersionIsGreaterOrEqual(1, 1) && hasAtomicsSupport(getDevice()))
callers = masked_singlepass_callers;
MaskedCaller caller = callers[src.type()];
if (!caller) CV_Error(CV_StsBadArg, "minMaxLoc: unsupported type"); if (!caller) CV_Error(CV_StsBadArg, "minMaxLoc: unsupported type");
caller(src, mask, minVal, maxVal, minLoc_, maxLoc_, valBuf, locBuf); caller(src, mask, minVal, maxVal, minLoc_, maxLoc_, valBuf, locBuf);
} }
@ -376,13 +416,13 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
namespace cv { namespace gpu { namespace mathfunc { namespace countnonzero { namespace cv { namespace gpu { namespace mathfunc { namespace countnonzero {
void get_buf_size_required(int cols, int rows, int& bufcols, int& bufrows); void getBufSizeRequired(int cols, int rows, int& bufcols, int& bufrows);
template <typename T> template <typename T>
int count_non_zero_caller(const DevMem2D src, PtrStep buf); int countNonZeroCaller(const DevMem2D src, PtrStep buf);
template <typename T> template <typename T>
int count_non_zero_multipass_caller(const DevMem2D src, PtrStep buf); int countNonZeroMultipassCaller(const DevMem2D src, PtrStep buf);
}}}} }}}}
@ -400,22 +440,29 @@ int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)
typedef int (*Caller)(const DevMem2D src, PtrStep buf); typedef int (*Caller)(const DevMem2D src, PtrStep buf);
static const Caller callers[2][7] = static Caller multipass_callers[7] = {
{ { count_non_zero_multipass_caller<unsigned char>, count_non_zero_multipass_caller<char>, countNonZeroMultipassCaller<unsigned char>, countNonZeroMultipassCaller<char>,
count_non_zero_multipass_caller<unsigned short>, count_non_zero_multipass_caller<short>, countNonZeroMultipassCaller<unsigned short>, countNonZeroMultipassCaller<short>,
count_non_zero_multipass_caller<int>, count_non_zero_multipass_caller<float>, 0}, countNonZeroMultipassCaller<int>, countNonZeroMultipassCaller<float>, 0 };
{ count_non_zero_caller<unsigned char>, count_non_zero_caller<char>,
count_non_zero_caller<unsigned short>, count_non_zero_caller<short>, static Caller singlepass_callers[7] = {
count_non_zero_caller<int>, count_non_zero_caller<float>, count_non_zero_caller<double> } }; countNonZeroCaller<unsigned char>, countNonZeroCaller<char>,
countNonZeroCaller<unsigned short>, countNonZeroCaller<short>,
countNonZeroCaller<int>, countNonZeroCaller<float>,
countNonZeroCaller<double> };
CV_Assert(src.channels() == 1); CV_Assert(src.channels() == 1);
CV_Assert(src.type() != CV_64F || hasNativeDoubleSupport(getDevice())); CV_Assert(src.type() != CV_64F || hasNativeDoubleSupport(getDevice()));
Size bufSize; Size buf_size;
get_buf_size_required(src.cols, src.rows, bufSize.width, bufSize.height); getBufSizeRequired(src.cols, src.rows, buf_size.width, buf_size.height);
ensureSizeIsEnough(bufSize, CV_8U, buf); ensureSizeIsEnough(buf_size, CV_8U, buf);
Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()]; Caller* callers = multipass_callers;
if (ptxVersionIsGreaterOrEqual(1, 1) && hasAtomicsSupport(getDevice()))
callers = singlepass_callers;
Caller caller = callers[src.type()];
if (!caller) CV_Error(CV_StsBadArg, "countNonZero: unsupported type"); if (!caller) CV_Error(CV_StsBadArg, "countNonZero: unsupported type");
return caller(src, buf); return caller(src, buf);
} }