Fixed GPU core tests (added an additional check for the device's feature support)

Added assertions on double types for old devices (those without native double support)
2012-03-26 14:33:43 +00:00
parent 98d7b10c16
commit 26691e00d4
6 changed files with 1039 additions and 525 deletions
--- a/modules/gpu/src/matrix_reductions.cpp
+++ b/modules/gpu/src/matrix_reductions.cpp
@@ -132,7 +132,7 @@ void cv::gpu::meanStdDev(const GpuMat& src, Scalar& mean, Scalar& stddev, GpuMat
    nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), sz, buf.ptr<Npp8u>(), dbuf, (double*)dbuf + 1) );

    cudaSafeCall( cudaDeviceSynchronize() );
-    
+
    double* ptrs[2] = {mean.val, stddev.val};
    dbuf.download(ptrs);
 }
@@ -148,6 +148,8 @@ double cv::gpu::norm(const GpuMat& src, int normType)

 double cv::gpu::norm(const GpuMat& src, int normType, GpuMat& buf)
 {
+    CV_Assert(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2);
+
    GpuMat src_single_channel = src.reshape(1);

    if (normType == NORM_L1)
@@ -156,22 +158,16 @@ double cv::gpu::norm(const GpuMat& src, int normType, GpuMat& buf)
    if (normType == NORM_L2)
        return std::sqrt(sqrSum(src_single_channel, buf)[0]);

-    if (normType == NORM_INF)
-    {
-        double min_val, max_val;
-        minMax(src_single_channel, &min_val, &max_val, GpuMat(), buf);
-        return std::max(std::abs(min_val), std::abs(max_val));
-    }
-
-    CV_Error(CV_StsBadArg, "norm: unsupported norm type");
-    return 0;
+    // NORM_INF
+    double min_val, max_val;
+    minMax(src_single_channel, &min_val, &max_val, GpuMat(), buf);
+    return std::max(std::abs(min_val), std::abs(max_val));
 }

 double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType)
 {
-    CV_DbgAssert(src1.size() == src2.size() && src1.type() == src2.type());
-
    CV_Assert(src1.type() == CV_8UC1);
+    CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
    CV_Assert(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2);

    typedef NppStatus (*npp_norm_diff_func_t)(const Npp8u* pSrc1, int nSrcStep1, const Npp8u* pSrc2, int nSrcStep2,
@@ -184,7 +180,7 @@ double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType)
    sz.height = src1.rows;

    int funcIdx = normType >> 1;
-    
+
    double retVal;

    DeviceBuffer dbuf;
@@ -192,7 +188,7 @@ double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType)
    nppSafeCall( npp_norm_diff_func[funcIdx](src1.ptr<Npp8u>(), static_cast<int>(src1.step), src2.ptr<Npp8u>(), static_cast<int>(src2.step), sz, dbuf) );

    cudaSafeCall( cudaDeviceSynchronize() );
-    
+
    dbuf.download(&retVal);

    return retVal;
@@ -201,9 +197,9 @@ double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType)
 ////////////////////////////////////////////////////////////////////////
 // Sum

-namespace cv { namespace gpu { namespace device 
+namespace cv { namespace gpu { namespace device
 {
-    namespace matrix_reductions 
+    namespace matrix_reductions
    {
        namespace sum
        {
@@ -230,34 +226,36 @@ namespace cv { namespace gpu { namespace device
    }
 }}}

-Scalar cv::gpu::sum(const GpuMat& src) 
+Scalar cv::gpu::sum(const GpuMat& src)
 {
    GpuMat buf;
    return sum(src, buf);
 }


-Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf) 
+Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf)
 {
-    using namespace ::cv::gpu::device::matrix_reductions::sum;
+    using namespace cv::gpu::device::matrix_reductions::sum;

    typedef void (*Caller)(const DevMem2Db, PtrStepb, double*, int);

-    static Caller multipass_callers[7] = 
-    { 
-        sumMultipassCaller<unsigned char>, sumMultipassCaller<char>, 
-        sumMultipassCaller<unsigned short>, sumMultipassCaller<short>, 
-        sumMultipassCaller<int>, sumMultipassCaller<float>, 0 
+    static Caller multipass_callers[] =
+    {
+        sumMultipassCaller<unsigned char>, sumMultipassCaller<char>,
+        sumMultipassCaller<unsigned short>, sumMultipassCaller<short>,
+        sumMultipassCaller<int>, sumMultipassCaller<float>
    };

-    static Caller singlepass_callers[7] = { 
-        sumCaller<unsigned char>, sumCaller<char>, 
-        sumCaller<unsigned short>, sumCaller<short>, 
-        sumCaller<int>, sumCaller<float>, 0 
+    static Caller singlepass_callers[] = {
+        sumCaller<unsigned char>, sumCaller<char>,
+        sumCaller<unsigned short>, sumCaller<short>,
+        sumCaller<int>, sumCaller<float>
    };

+    CV_Assert(src.depth() <= CV_32F);
+
    Size buf_size;
-    getBufSizeRequired(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height); 
+    getBufSizeRequired(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
    ensureSizeIsEnough(buf_size, CV_8U, buf);

    Caller* callers = multipass_callers;
@@ -265,7 +263,6 @@ Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf)
        callers = singlepass_callers;

    Caller caller = callers[src.depth()];
-    if (!caller) CV_Error(CV_StsBadArg, "sum: unsupported type");

    double result[4];
    caller(src, buf, result, src.channels());
@@ -273,35 +270,37 @@ Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf)
 }


-Scalar cv::gpu::absSum(const GpuMat& src) 
+Scalar cv::gpu::absSum(const GpuMat& src)
 {
    GpuMat buf;
    return absSum(src, buf);
 }


-Scalar cv::gpu::absSum(const GpuMat& src, GpuMat& buf) 
+Scalar cv::gpu::absSum(const GpuMat& src, GpuMat& buf)
 {
-    using namespace ::cv::gpu::device::matrix_reductions::sum;
+    using namespace cv::gpu::device::matrix_reductions::sum;

    typedef void (*Caller)(const DevMem2Db, PtrStepb, double*, int);

-    static Caller multipass_callers[7] = 
-    { 
-        absSumMultipassCaller<unsigned char>, absSumMultipassCaller<char>, 
-        absSumMultipassCaller<unsigned short>, absSumMultipassCaller<short>, 
-        absSumMultipassCaller<int>, absSumMultipassCaller<float>, 0 
+    static Caller multipass_callers[] =
+    {
+        absSumMultipassCaller<unsigned char>, absSumMultipassCaller<char>,
+        absSumMultipassCaller<unsigned short>, absSumMultipassCaller<short>,
+        absSumMultipassCaller<int>, absSumMultipassCaller<float>
    };

-    static Caller singlepass_callers[7] = 
-    {        
-        absSumCaller<unsigned char>, absSumCaller<char>, 
-        absSumCaller<unsigned short>, absSumCaller<short>, 
-        absSumCaller<int>, absSumCaller<float>, 0 
+    static Caller singlepass_callers[] =
+    {
+        absSumCaller<unsigned char>, absSumCaller<char>,
+        absSumCaller<unsigned short>, absSumCaller<short>,
+        absSumCaller<int>, absSumCaller<float>
    };

+    CV_Assert(src.depth() <= CV_32F);
+
    Size buf_size;
-    getBufSizeRequired(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height); 
+    getBufSizeRequired(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
    ensureSizeIsEnough(buf_size, CV_8U, buf);

    Caller* callers = multipass_callers;
@@ -309,7 +308,6 @@ Scalar cv::gpu::absSum(const GpuMat& src, GpuMat& buf)
        callers = singlepass_callers;

    Caller caller = callers[src.depth()];
-    if (!caller) CV_Error(CV_StsBadArg, "absSum: unsupported type");

    double result[4];
    caller(src, buf, result, src.channels());
@@ -317,43 +315,44 @@ Scalar cv::gpu::absSum(const GpuMat& src, GpuMat& buf)
 }


-Scalar cv::gpu::sqrSum(const GpuMat& src) 
+Scalar cv::gpu::sqrSum(const GpuMat& src)
 {
    GpuMat buf;
    return sqrSum(src, buf);
 }


-Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf) 
+Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf)
 {
-    using namespace ::cv::gpu::device::matrix_reductions::sum;
+    using namespace cv::gpu::device::matrix_reductions::sum;

    typedef void (*Caller)(const DevMem2Db, PtrStepb, double*, int);

-    static Caller multipass_callers[7] = 
-    { 
-        sqrSumMultipassCaller<unsigned char>, sqrSumMultipassCaller<char>, 
-        sqrSumMultipassCaller<unsigned short>, sqrSumMultipassCaller<short>, 
-        sqrSumMultipassCaller<int>, sqrSumMultipassCaller<float>, 0 
+    static Caller multipass_callers[] =
+    {
+        sqrSumMultipassCaller<unsigned char>, sqrSumMultipassCaller<char>,
+        sqrSumMultipassCaller<unsigned short>, sqrSumMultipassCaller<short>,
+        sqrSumMultipassCaller<int>, sqrSumMultipassCaller<float>
    };

-    static Caller singlepass_callers[7] = 
-    { 
-        sqrSumCaller<unsigned char>, sqrSumCaller<char>, 
-        sqrSumCaller<unsigned short>, sqrSumCaller<short>, 
-        sqrSumCaller<int>, sqrSumCaller<float>, 0 
+    static Caller singlepass_callers[7] =
+    {
+        sqrSumCaller<unsigned char>, sqrSumCaller<char>,
+        sqrSumCaller<unsigned short>, sqrSumCaller<short>,
+        sqrSumCaller<int>, sqrSumCaller<float>
    };

+    CV_Assert(src.depth() <= CV_32F);
+
    Caller* callers = multipass_callers;
    if (TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS))
        callers = singlepass_callers;

    Size buf_size;
-    getBufSizeRequired(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height); 
+    getBufSizeRequired(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
    ensureSizeIsEnough(buf_size, CV_8U, buf);

    Caller caller = callers[src.depth()];
-    if (!caller) CV_Error(CV_StsBadArg, "sqrSum: unsupported type");

    double result[4];
    caller(src, buf, result, src.channels());
@@ -363,24 +362,24 @@ Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf)
 ////////////////////////////////////////////////////////////////////////
 // Find min or max

-namespace cv { namespace gpu { namespace device 
+namespace cv { namespace gpu { namespace device
 {
-    namespace matrix_reductions 
+    namespace matrix_reductions
    {
-        namespace minmax 
+        namespace minmax
        {
            void getBufSizeRequired(int cols, int rows, int elem_size, int& bufcols, int& bufrows);
-            
-            template <typename T> 
+
+            template <typename T>
            void minMaxCaller(const DevMem2Db src, double* minval, double* maxval, PtrStepb buf);

-            template <typename T> 
+            template <typename T>
            void minMaxMaskCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);

-            template <typename T> 
+            template <typename T>
            void minMaxMultipassCaller(const DevMem2Db src, double* minval, double* maxval, PtrStepb buf);

-            template <typename T> 
+            template <typename T>
            void minMaxMaskMultipassCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
        }
    }
@@ -401,41 +400,47 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
    typedef void (*Caller)(const DevMem2Db, double*, double*, PtrStepb);
    typedef void (*MaskedCaller)(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);

-    static Caller multipass_callers[7] = 
-    { 
-        minMaxMultipassCaller<unsigned char>, minMaxMultipassCaller<char>, 
-        minMaxMultipassCaller<unsigned short>, minMaxMultipassCaller<short>, 
-        minMaxMultipassCaller<int>, minMaxMultipassCaller<float>, 0 
+    static Caller multipass_callers[] =
+    {
+        minMaxMultipassCaller<unsigned char>, minMaxMultipassCaller<char>,
+        minMaxMultipassCaller<unsigned short>, minMaxMultipassCaller<short>,
+        minMaxMultipassCaller<int>, minMaxMultipassCaller<float>, 0
    };

-    static Caller singlepass_callers[7] = 
-    { 
-        minMaxCaller<unsigned char>, minMaxCaller<char>, 
-        minMaxCaller<unsigned short>, minMaxCaller<short>, 
-        minMaxCaller<int>, minMaxCaller<float>, minMaxCaller<double> 
+    static Caller singlepass_callers[] =
+    {
+        minMaxCaller<unsigned char>, minMaxCaller<char>,
+        minMaxCaller<unsigned short>, minMaxCaller<short>,
+        minMaxCaller<int>, minMaxCaller<float>, minMaxCaller<double>
    };

-    static MaskedCaller masked_multipass_callers[7] = 
-    { 
-        minMaxMaskMultipassCaller<unsigned char>, minMaxMaskMultipassCaller<char>, 
+    static MaskedCaller masked_multipass_callers[] =
+    {
+        minMaxMaskMultipassCaller<unsigned char>, minMaxMaskMultipassCaller<char>,
        minMaxMaskMultipassCaller<unsigned short>, minMaxMaskMultipassCaller<short>,
        minMaxMaskMultipassCaller<int>, minMaxMaskMultipassCaller<float>, 0
    };

-    static MaskedCaller masked_singlepass_callers[7] =
-    { 
-        minMaxMaskCaller<unsigned char>, minMaxMaskCaller<char>, 
-        minMaxMaskCaller<unsigned short>, minMaxMaskCaller<short>, 
-        minMaxMaskCaller<int>, minMaxMaskCaller<float>, minMaxMaskCaller<double> 
+    static MaskedCaller masked_singlepass_callers[] =
+    {
+        minMaxMaskCaller<unsigned char>, minMaxMaskCaller<char>,
+        minMaxMaskCaller<unsigned short>, minMaxMaskCaller<short>,
+        minMaxMaskCaller<int>, minMaxMaskCaller<float>, minMaxMaskCaller<double>
    };

+    CV_Assert(src.depth() <= CV_64F);
    CV_Assert(src.channels() == 1);
-
    CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size()));

+    if (src.depth() == CV_64F)
+    {
+        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+    }
+
    double minVal_; if (!minVal) minVal = &minVal_;
    double maxVal_; if (!maxVal) maxVal = &maxVal_;
-    
+
    Size buf_size;
    getBufSizeRequired(src.cols, src.rows, static_cast<int>(src.elemSize()), buf_size.width, buf_size.height);
    ensureSizeIsEnough(buf_size, CV_8U, buf);
@@ -447,7 +452,7 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
            callers = singlepass_callers;

        Caller caller = callers[src.type()];
-        if (!caller) CV_Error(CV_StsBadArg, "minMax: unsupported type");
+        CV_Assert(caller != 0);
        caller(src, minVal, maxVal, buf);
    }
    else
@@ -457,7 +462,7 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
            callers = masked_singlepass_callers;

        MaskedCaller caller = callers[src.type()];
-        if (!caller) CV_Error(CV_StsBadArg, "minMax: unsupported type");
+        CV_Assert(caller != 0);
        caller(src, mask, minVal, maxVal, buf);
    }
 }
@@ -466,36 +471,36 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
 ////////////////////////////////////////////////////////////////////////
 // Locate min and max

-namespace cv { namespace gpu { namespace device 
+namespace cv { namespace gpu { namespace device
 {
-    namespace matrix_reductions 
+    namespace matrix_reductions
    {
-        namespace minmaxloc 
+        namespace minmaxloc
        {
-            void getBufSizeRequired(int cols, int rows, int elem_size, int& b1cols, 
+            void getBufSizeRequired(int cols, int rows, int elem_size, int& b1cols,
                                    int& b1rows, int& b2cols, int& b2rows);

-            template <typename T> 
-            void minMaxLocCaller(const DevMem2Db src, double* minval, double* maxval, 
+            template <typename T>
+            void minMaxLocCaller(const DevMem2Db src, double* minval, double* maxval,
                                 int minloc[2], int maxloc[2], PtrStepb valBuf, PtrStepb locBuf);

-            template <typename T> 
-            void minMaxLocMaskCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval, 
+            template <typename T>
+            void minMaxLocMaskCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval,
                                     int minloc[2], int maxloc[2], PtrStepb valBuf, PtrStepb locBuf);

-            template <typename T> 
-            void minMaxLocMultipassCaller(const DevMem2Db src, double* minval, double* maxval, 
+            template <typename T>
+            void minMaxLocMultipassCaller(const DevMem2Db src, double* minval, double* maxval,
                                          int minloc[2], int maxloc[2], PtrStepb valBuf, PtrStepb locBuf);

-            template <typename T> 
-            void minMaxLocMaskMultipassCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval, 
+            template <typename T>
+            void minMaxLocMaskMultipassCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval,
                                              int minloc[2], int maxloc[2], PtrStepb valBuf, PtrStepb locBuf);
        }
    }
 }}}

 void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, const GpuMat& mask)
-{    
+{
    GpuMat valBuf, locBuf;
    minMaxLoc(src, minVal, maxVal, minLoc, maxLoc, mask, valBuf, locBuf);
 }
@@ -508,45 +513,51 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
    typedef void (*Caller)(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
    typedef void (*MaskedCaller)(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);

-    static Caller multipass_callers[7] = 
+    static Caller multipass_callers[] =
    {
-        minMaxLocMultipassCaller<unsigned char>, minMaxLocMultipassCaller<char>, 
-        minMaxLocMultipassCaller<unsigned short>, minMaxLocMultipassCaller<short>, 
-        minMaxLocMultipassCaller<int>, minMaxLocMultipassCaller<float>, 0 
+        minMaxLocMultipassCaller<unsigned char>, minMaxLocMultipassCaller<char>,
+        minMaxLocMultipassCaller<unsigned short>, minMaxLocMultipassCaller<short>,
+        minMaxLocMultipassCaller<int>, minMaxLocMultipassCaller<float>, 0
    };

-    static Caller singlepass_callers[7] = 
+    static Caller singlepass_callers[] =
    {
-        minMaxLocCaller<unsigned char>, minMaxLocCaller<char>, 
-        minMaxLocCaller<unsigned short>, minMaxLocCaller<short>, 
-        minMaxLocCaller<int>, minMaxLocCaller<float>, minMaxLocCaller<double> 
+        minMaxLocCaller<unsigned char>, minMaxLocCaller<char>,
+        minMaxLocCaller<unsigned short>, minMaxLocCaller<short>,
+        minMaxLocCaller<int>, minMaxLocCaller<float>, minMaxLocCaller<double>
    };

-    static MaskedCaller masked_multipass_callers[7] = 
+    static MaskedCaller masked_multipass_callers[] =
    {
        minMaxLocMaskMultipassCaller<unsigned char>, minMaxLocMaskMultipassCaller<char>,
-        minMaxLocMaskMultipassCaller<unsigned short>, minMaxLocMaskMultipassCaller<short>, 
-        minMaxLocMaskMultipassCaller<int>, minMaxLocMaskMultipassCaller<float>, 0 
+        minMaxLocMaskMultipassCaller<unsigned short>, minMaxLocMaskMultipassCaller<short>,
+        minMaxLocMaskMultipassCaller<int>, minMaxLocMaskMultipassCaller<float>, 0
    };

-    static MaskedCaller masked_singlepass_callers[7] = 
-    { 
-        minMaxLocMaskCaller<unsigned char>, minMaxLocMaskCaller<char>, 
-        minMaxLocMaskCaller<unsigned short>, minMaxLocMaskCaller<short>, 
-        minMaxLocMaskCaller<int>, minMaxLocMaskCaller<float>, minMaxLocMaskCaller<double> 
+    static MaskedCaller masked_singlepass_callers[] =
+    {
+        minMaxLocMaskCaller<unsigned char>, minMaxLocMaskCaller<char>,
+        minMaxLocMaskCaller<unsigned short>, minMaxLocMaskCaller<short>,
+        minMaxLocMaskCaller<int>, minMaxLocMaskCaller<float>, minMaxLocMaskCaller<double>
    };

+    CV_Assert(src.depth() <= CV_64F);
    CV_Assert(src.channels() == 1);
-
    CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size()));

+    if (src.depth() == CV_64F)
+    {
+        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+    }
+
    double minVal_; if (!minVal) minVal = &minVal_;
    double maxVal_; if (!maxVal) maxVal = &maxVal_;
    int minLoc_[2];
    int maxLoc_[2];

    Size valbuf_size, locbuf_size;
-    getBufSizeRequired(src.cols, src.rows, static_cast<int>(src.elemSize()), valbuf_size.width, 
+    getBufSizeRequired(src.cols, src.rows, static_cast<int>(src.elemSize()), valbuf_size.width,
                       valbuf_size.height, locbuf_size.width, locbuf_size.height);
    ensureSizeIsEnough(valbuf_size, CV_8U, valBuf);
    ensureSizeIsEnough(locbuf_size, CV_8U, locBuf);
@@ -558,7 +569,7 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
            callers = singlepass_callers;

        Caller caller = callers[src.type()];
-        if (!caller) CV_Error(CV_StsBadArg, "minMaxLoc: unsupported type");
+        CV_Assert(caller != 0);
        caller(src, minVal, maxVal, minLoc_, maxLoc_, valBuf, locBuf);
    }
    else
@@ -568,7 +579,7 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
            callers = masked_singlepass_callers;

        MaskedCaller caller = callers[src.type()];
-        if (!caller) CV_Error(CV_StsBadArg, "minMaxLoc: unsupported type");
+        CV_Assert(caller != 0);
        caller(src, mask, minVal, maxVal, minLoc_, maxLoc_, valBuf, locBuf);
    }

@@ -579,18 +590,18 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
 //////////////////////////////////////////////////////////////////////////////
 // Count non-zero elements

-namespace cv { namespace gpu { namespace device 
+namespace cv { namespace gpu { namespace device
 {
-    namespace matrix_reductions 
+    namespace matrix_reductions
    {
-        namespace countnonzero 
+        namespace countnonzero
        {
            void getBufSizeRequired(int cols, int rows, int& bufcols, int& bufrows);

-            template <typename T> 
+            template <typename T>
            int countNonZeroCaller(const DevMem2Db src, PtrStepb buf);

-            template <typename T> 
+            template <typename T>
            int countNonZeroMultipassCaller(const DevMem2Db src, PtrStepb buf);
        }
    }
@@ -609,21 +620,28 @@ int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)

    typedef int (*Caller)(const DevMem2Db src, PtrStepb buf);

-    static Caller multipass_callers[7] = 
+    static Caller multipass_callers[7] =
    {
        countNonZeroMultipassCaller<unsigned char>, countNonZeroMultipassCaller<char>,
        countNonZeroMultipassCaller<unsigned short>, countNonZeroMultipassCaller<short>,
-        countNonZeroMultipassCaller<int>, countNonZeroMultipassCaller<float>, 0 
+        countNonZeroMultipassCaller<int>, countNonZeroMultipassCaller<float>, 0
    };

-    static Caller singlepass_callers[7] = 
+    static Caller singlepass_callers[7] =
    {
        countNonZeroCaller<unsigned char>, countNonZeroCaller<char>,
        countNonZeroCaller<unsigned short>, countNonZeroCaller<short>,
        countNonZeroCaller<int>, countNonZeroCaller<float>, countNonZeroCaller<double> };

+    CV_Assert(src.depth() <= CV_64F);
    CV_Assert(src.channels() == 1);

+    if (src.depth() == CV_64F)
+    {
+        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+    }
+
    Size buf_size;
    getBufSizeRequired(src.cols, src.rows, buf_size.width, buf_size.height);
    ensureSizeIsEnough(buf_size, CV_8U, buf);
@@ -633,16 +651,16 @@ int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)
        callers = singlepass_callers;

    Caller caller = callers[src.type()];
-    if (!caller) CV_Error(CV_StsBadArg, "countNonZero: unsupported type");
+    CV_Assert(caller != 0);
    return caller(src, buf);
 }

 //////////////////////////////////////////////////////////////////////////////
 // reduce

-namespace cv { namespace gpu { namespace device 
+namespace cv { namespace gpu { namespace device
 {
-    namespace matrix_reductions 
+    namespace matrix_reductions
    {
        template <typename T, typename S, typename D> void reduceRows_gpu(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
        template <typename T, typename S, typename D> void reduceCols_gpu(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
@@ -666,7 +684,7 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
    {
        typedef void (*caller_t)(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);

-        static const caller_t callers[6][6] = 
+        static const caller_t callers[6][6] =
        {
            {
                reduceRows_gpu<unsigned char, int, unsigned char>,
@@ -719,6 +737,7 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
        };

        const caller_t func = callers[src.depth()][dst.depth()];
+
        if (!func)
            CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of input and output array formats");

@@ -728,7 +747,7 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
    {
        typedef void (*caller_t)(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);

-        static const caller_t callers[6][6] = 
+        static const caller_t callers[6][6] =
        {
            {
                reduceCols_gpu<unsigned char, int, unsigned char>,
@@ -781,10 +800,11 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
        };

        const caller_t func = callers[src.depth()][dst.depth()];
+
        if (!func)
            CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of input and output array formats");

-        func(src, src.channels(), dst, reduceOp, StreamAccessor::getStream(stream));        
+        func(src, src.channels(), dst, reduceOp, StreamAccessor::getStream(stream));
    }
 }