refactored gpu module

2011-01-19 10:54:58 +00:00 · 2011-01-19 10:54:58 +00:00 · 90ae1e3aed
commit 90ae1e3aed
parent 8503f75212
6 changed files with 493 additions and 433 deletions
--- a/doc/gpu_initialization.tex
+++ b/doc/gpu_initialization.tex
@ -69,22 +69,33 @@ Returns true, if the specified GPU has atomics support, otherwise false.
 \end{description} 


-\cvCppFunc{gpu::checkPtxVersion}
+\cvCppFunc{gpu::ptxVersionIs}
 Returns true, if the GPU module was built with PTX support of the given compute capability, otherwise false.

-\cvdefCpp{template $<$unsigned int cmp\_op$>$\newline
-bool checkPtxVersion(int major, int minor);}
+\cvdefCpp{bool ptxVersionIs(int major, int minor);}
 \begin{description}
-\cvarg{cmp\_op}{Comparison operation:
+\cvarg{major}{Major compute capability version.}
+\cvarg{minor}{Minor compute capability version.}
+\end{description}
+
+
+\cvCppFunc{gpu::ptxVersionIsLessOrEqual}
+Returns true if at least one of the PTX versions the GPU module was built with is less than or equal to the given compute capability, otherwise false.
+
+\cvdefCpp{bool ptxVersionIsLessOrEqual(int major, int minor);}
 \begin{description}
-\cvarg{CMP\_EQ}{Return true, if at least one of GPU module PTX versions matches the given one, otherwise false}
-\cvarg{CMP\_LT}{Return true, if at least one of GPU module PTX versions is less than the given one, otherwise false}
-\cvarg{CMP\_LE}{Return true, if at least one of GPU module PTX versions is less or equal to the given one, otherwise false}
-\cvarg{CMP\_GT}{Return true, if at least one of GPU module PTX versions is greater than the given one, otherwise false}
-\cvarg{CMP\_GE}{Return true, if at least one of GPU module PTX versions is greater or equal to the given one, otherwise false}
-\end{description}}
-\cvarg{major}{Major CC version.}
-\cvarg{minor}{Minor CC version.}
+\cvarg{major}{Major compute capability version.}
+\cvarg{minor}{Minor compute capability version.}
+\end{description}
+
+
+\cvCppFunc{gpu::ptxVersionIsGreaterOrEqual}
+Returns true if at least one of the PTX versions the GPU module was built with is greater than or equal to the given compute capability, otherwise false.
+
+\cvdefCpp{bool ptxVersionIsGreaterOrEqual(int major, int minor);}
+\begin{description}
+\cvarg{major}{Major compute capability version.}
+\cvarg{minor}{Minor compute capability version.}
 \end{description}


--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@ -72,8 +72,9 @@ namespace cv
        CV_EXPORTS bool hasNativeDoubleSupport(int device);
        CV_EXPORTS bool hasAtomicsSupport(int device);

-        template <unsigned int cmp_op>
-        CV_EXPORTS bool checkPtxVersion(int major, int minor);
+        CV_EXPORTS bool ptxVersionIs(int major, int minor);
+        CV_EXPORTS bool ptxVersionIsLessOrEqual(int major, int minor);
+        CV_EXPORTS bool ptxVersionIsGreaterOrEqual(int major, int minor);

        //! Checks if the GPU module is PTX compatible with the given NVIDIA device
        CV_EXPORTS bool isCompatibleWith(int device);
--- a/modules/gpu/src/cuda/imgproc.cu
+++ b/modules/gpu/src/cuda/imgproc.cu
@ -719,7 +719,7 @@ namespace cv { namespace gpu { namespace imgproc

 ////////////////////////////// Column Sum //////////////////////////////////////

-    __global__ void column_sum_kernel_32F(int cols, int rows, const PtrStep src, const PtrStep dst)
+    __global__ void column_sumKernel_32F(int cols, int rows, const PtrStep src, const PtrStep dst)
    {
        int x = blockIdx.x * blockDim.x + threadIdx.x;

@ -745,7 +745,7 @@ namespace cv { namespace gpu { namespace imgproc
        dim3 threads(256);
        dim3 grid(divUp(src.cols, threads.x));

-        column_sum_kernel_32F<<<grid, threads>>>(src.cols, src.rows, src, dst);
+        column_sumKernel_32F<<<grid, threads>>>(src.cols, src.rows, src, dst);
        cudaSafeCall(cudaThreadSynchronize());
    }

--- a/modules/gpu/src/cuda/matrix_reductions.cu
+++ b/modules/gpu/src/cuda/matrix_reductions.cu
--- a/modules/gpu/src/initialization.cpp
+++ b/modules/gpu/src/initialization.cpp
@ -133,85 +133,81 @@ CV_EXPORTS bool cv::gpu::hasAtomicsSupport(int device)

 namespace 
 {
-    template <unsigned int cmp_op>
-    bool comparePairs(int lhs1, int lhs2, int rhs1, int rhs2);
-
-    template <>
-    bool comparePairs<CMP_EQ>(int lhs1, int lhs2, int rhs1, int rhs2)
+    struct ComparerEqual 
    {
-        return lhs1 == rhs1 && lhs2 == rhs2;
-    }
+        bool operator()(int lhs1, int lhs2, int rhs1, int rhs2) const
+        {
+            return lhs1 == rhs1 && lhs2 == rhs2;
+        }
+    };

-    template <>
-    bool comparePairs<CMP_GT>(int lhs1, int lhs2, int rhs1, int rhs2)
+
+    struct ComparerLessOrEqual
    {
-        return lhs1 > rhs1 || (lhs1 == rhs1 && lhs2 > rhs2);
-    }
+        bool operator()(int lhs1, int lhs2, int rhs1, int rhs2) const
+        {
+            return lhs1 < rhs1 || (lhs1 == rhs1 && lhs2 <= rhs2);
+        }
+    };

-    template <>
-    bool comparePairs<CMP_GE>(int lhs1, int lhs2, int rhs1, int rhs2)
+
+    struct ComparerGreaterOrEqual
    {
-        return lhs1 > rhs1 || (lhs1 == rhs1 && lhs2 >= rhs2);
-    }
+        bool operator()(int lhs1, int lhs2, int rhs1, int rhs2) const
+        {
+            return lhs1 > rhs1 || (lhs1 == rhs1 && lhs2 >= rhs2);
+        }
+    };

-    template <>
-    bool comparePairs<CMP_LT>(int lhs1, int lhs2, int rhs1, int rhs2)
+
+    template <typename Comparer>
+    bool checkPtxVersion(int major, int minor, Comparer cmp) 
    {
-        return lhs1 < rhs1 || (lhs1 == rhs1 && lhs2 < rhs2);
-    }
-
-
-    template <>
-    bool comparePairs<CMP_LE>(int lhs1, int lhs2, int rhs1, int rhs2)
-    {
-        return lhs1 < rhs1 || (lhs1 == rhs1 && lhs2 <= rhs2);
-    }
-
-    template <>
-    bool comparePairs<CMP_NE>(int lhs1, int lhs2, int rhs1, int rhs2)
-    {
-        return lhs1 < rhs1 || (lhs1 == rhs1 && lhs2 <= rhs2);
-    }
-}
-
-
-template <unsigned int cmp_op>
-CV_EXPORTS bool cv::gpu::checkPtxVersion(int major, int minor) 
-{
 #ifdef OPENCV_GPU_CUDA_ARCH_10
-    if (comparePairs<cmp_op>(1, 0, major, minor)) return true;
+        if (cmp(1, 0, major, minor)) return true;
 #endif

 #ifdef OPENCV_GPU_CUDA_ARCH_11
-    if (comparePairs<cmp_op>(1, 1, major, minor)) return true;
+        if (cmp(1, 1, major, minor)) return true;
 #endif

 #ifdef OPENCV_GPU_CUDA_ARCH_12
-    if (comparePairs<cmp_op>(1, 2, major, minor)) return true;
+        if (cmp(1, 2, major, minor)) return true;
 #endif

 #ifdef OPENCV_GPU_CUDA_ARCH_13
-    if (comparePairs<cmp_op>(1, 3, major, minor)) return true;
+        if (cmp(1, 3, major, minor)) return true;
 #endif

 #ifdef OPENCV_GPU_CUDA_ARCH_20
-    if (comparePairs<cmp_op>(2, 0, major, minor)) return true;
+        if (cmp(2, 0, major, minor)) return true;
 #endif

 #ifdef OPENCV_GPU_CUDA_ARCH_21
-    if (comparePairs<cmp_op>(2, 1, major, minor)) return true;
+        if (cmp(2, 1, major, minor)) return true;
 #endif

-    return false;
+        return false;
+    }
 }


-template CV_EXPORTS bool cv::gpu::checkPtxVersion<CMP_EQ>(int major, int minor);
-template CV_EXPORTS bool cv::gpu::checkPtxVersion<CMP_GT>(int major, int minor);
-template CV_EXPORTS bool cv::gpu::checkPtxVersion<CMP_GE>(int major, int minor);
-template CV_EXPORTS bool cv::gpu::checkPtxVersion<CMP_LT>(int major, int minor);
-template CV_EXPORTS bool cv::gpu::checkPtxVersion<CMP_LE>(int major, int minor);
-template CV_EXPORTS bool cv::gpu::checkPtxVersion<CMP_NE>(int major, int minor);
+CV_EXPORTS bool cv::gpu::ptxVersionIs(int major, int minor)
+{
+    return checkPtxVersion(major, minor, ComparerEqual());
+}
+
+
+CV_EXPORTS bool cv::gpu::ptxVersionIsLessOrEqual(int major, int minor)
+{
+    return checkPtxVersion(major, minor, ComparerLessOrEqual());
+}
+
+
+CV_EXPORTS bool cv::gpu::ptxVersionIsGreaterOrEqual(int major, int minor)
+{
+    return checkPtxVersion(major, minor, ComparerGreaterOrEqual());
+}


 CV_EXPORTS bool isCompatibleWith(int device)
@ -223,7 +219,7 @@ CV_EXPORTS bool isCompatibleWith(int device)
    int major, minor;
    getComputeCapability(device, major, minor);

-    return checkPtxVersion<CMP_LE>(major, minor);
+    return ptxVersionIsLessOrEqual(major, minor);
 }

 #endif
--- a/modules/gpu/src/matrix_reductions.cpp
+++ b/modules/gpu/src/matrix_reductions.cpp
@ -119,20 +119,20 @@ double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType)
 namespace cv { namespace gpu { namespace mathfunc
 {
    template <typename T>
-    void sum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn);
+    void sumCaller(const DevMem2D src, PtrStep buf, double* sum, int cn);

    template <typename T>
-    void sum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn);
+    void sumMultipassCaller(const DevMem2D src, PtrStep buf, double* sum, int cn);

    template <typename T>
-    void sqsum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn);
+    void sqrSumCaller(const DevMem2D src, PtrStep buf, double* sum, int cn);

    template <typename T>
-    void sqsum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn);
+    void sqrSumMultipassCaller(const DevMem2D src, PtrStep buf, double* sum, int cn);

    namespace sum
    {
-        void get_buf_size_required(int cols, int rows, int cn, int& bufcols, int& bufrows);
+        void getBufSizeRequired(int cols, int rows, int cn, int& bufcols, int& bufrows);
    }
 }}}

@ -149,19 +149,27 @@ Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf)
    using namespace mathfunc;

    typedef void (*Caller)(const DevMem2D, PtrStep, double*, int);
-    static const Caller callers[2][7] = 
-        { { sum_multipass_caller<unsigned char>, sum_multipass_caller<char>, 
-            sum_multipass_caller<unsigned short>, sum_multipass_caller<short>, 
-            sum_multipass_caller<int>, sum_multipass_caller<float>, 0 },
-          { sum_caller<unsigned char>, sum_caller<char>, 
-            sum_caller<unsigned short>, sum_caller<short>, 
-            sum_caller<int>, sum_caller<float>, 0 } };

-    Size bufSize;
-    sum::get_buf_size_required(src.cols, src.rows, src.channels(), bufSize.width, bufSize.height); 
-    ensureSizeIsEnough(bufSize, CV_8U, buf);
+    static Caller multipass_callers[7] = { 
+            sumMultipassCaller<unsigned char>, sumMultipassCaller<char>, 
+            sumMultipassCaller<unsigned short>, sumMultipassCaller<short>, 
+            sumMultipassCaller<int>, sumMultipassCaller<float>, 0 };

-    Caller caller = callers[hasAtomicsSupport(getDevice())][src.depth()];
+    static Caller singlepass_callers[7] = { 
+            sumCaller<unsigned char>, sumCaller<char>, 
+            sumCaller<unsigned short>, sumCaller<short>, 
+            sumCaller<int>, sumCaller<float>, 0 };
+
+    Size buf_size;
+    sum::getBufSizeRequired(src.cols, src.rows, src.channels(), 
+                               buf_size.width, buf_size.height); 
+    ensureSizeIsEnough(buf_size, CV_8U, buf);
+
+    Caller* callers = multipass_callers;
+    if (ptxVersionIsGreaterOrEqual(1, 1) && hasAtomicsSupport(getDevice()))
+        callers = singlepass_callers;
+
+    Caller caller = callers[src.depth()];
    if (!caller) CV_Error(CV_StsBadArg, "sum: unsupported type");

    double result[4];
@ -182,19 +190,27 @@ Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf)
    using namespace mathfunc;

    typedef void (*Caller)(const DevMem2D, PtrStep, double*, int);
-    static const Caller callers[2][7] = 
-        { { sqsum_multipass_caller<unsigned char>, sqsum_multipass_caller<char>, 
-            sqsum_multipass_caller<unsigned short>, sqsum_multipass_caller<short>, 
-            sqsum_multipass_caller<int>, sqsum_multipass_caller<float>, 0 },
-          { sqsum_caller<unsigned char>, sqsum_caller<char>, 
-            sqsum_caller<unsigned short>, sqsum_caller<short>, 
-            sqsum_caller<int>, sqsum_caller<float>, 0 } };

-    Size bufSize;
-    sum::get_buf_size_required(src.cols, src.rows, src.channels(), bufSize.width, bufSize.height); 
-    ensureSizeIsEnough(bufSize, CV_8U, buf);
+    static Caller multipass_callers[7] = { 
+            sqrSumMultipassCaller<unsigned char>, sqrSumMultipassCaller<char>, 
+            sqrSumMultipassCaller<unsigned short>, sqrSumMultipassCaller<short>, 
+            sqrSumMultipassCaller<int>, sqrSumMultipassCaller<float>, 0 };

-    Caller caller = callers[hasAtomicsSupport(getDevice())][src.depth()];
+    static Caller singlepass_callers[7] = { 
+            sqrSumCaller<unsigned char>, sqrSumCaller<char>, 
+            sqrSumCaller<unsigned short>, sqrSumCaller<short>, 
+            sqrSumCaller<int>, sqrSumCaller<float>, 0 };
+
+    Caller* callers = multipass_callers;
+    if (ptxVersionIsGreaterOrEqual(1, 1) && hasAtomicsSupport(getDevice()))
+        callers = singlepass_callers;
+
+    Size buf_size;
+    sum::getBufSizeRequired(src.cols, src.rows, src.channels(), 
+                               buf_size.width, buf_size.height); 
+    ensureSizeIsEnough(buf_size, CV_8U, buf);
+
+    Caller caller = callers[src.depth()];
    if (!caller) CV_Error(CV_StsBadArg, "sqrSum: unsupported type");

    double result[4];
@ -207,19 +223,19 @@ Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf)

 namespace cv { namespace gpu { namespace mathfunc { namespace minmax {

-    void get_buf_size_required(int cols, int rows, int elem_size, int& bufcols, int& bufrows);
+    void getBufSizeRequired(int cols, int rows, int elem_size, int& bufcols, int& bufrows);
    
    template <typename T> 
-    void min_max_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf);
+    void minMaxCaller(const DevMem2D src, double* minval, double* maxval, PtrStep buf);

    template <typename T> 
-    void min_max_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf);
+    void minMaxMaskCaller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf);

    template <typename T> 
-    void min_max_multipass_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf);
+    void minMaxMultipassCaller(const DevMem2D src, double* minval, double* maxval, PtrStep buf);

    template <typename T> 
-    void min_max_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf);
+    void minMaxMaskMultipassCaller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf);

 }}}}

@ -238,23 +254,26 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
    typedef void (*Caller)(const DevMem2D, double*, double*, PtrStep);
    typedef void (*MaskedCaller)(const DevMem2D, const PtrStep, double*, double*, PtrStep);

-    static const Caller callers[2][7] = 
-    { { min_max_multipass_caller<unsigned char>, min_max_multipass_caller<char>, 
-        min_max_multipass_caller<unsigned short>, min_max_multipass_caller<short>, 
-        min_max_multipass_caller<int>, min_max_multipass_caller<float>, 0 },
-      { min_max_caller<unsigned char>, min_max_caller<char>, 
-        min_max_caller<unsigned short>, min_max_caller<short>, 
-        min_max_caller<int>, min_max_caller<float>, min_max_caller<double> } };
+    static Caller multipass_callers[7] = { 
+            minMaxMultipassCaller<unsigned char>, minMaxMultipassCaller<char>, 
+            minMaxMultipassCaller<unsigned short>, minMaxMultipassCaller<short>, 
+            minMaxMultipassCaller<int>, minMaxMultipassCaller<float>, 0 };

-    static const MaskedCaller masked_callers[2][7] = 
-    { { min_max_mask_multipass_caller<unsigned char>, min_max_mask_multipass_caller<char>, 
-        min_max_mask_multipass_caller<unsigned short>, min_max_mask_multipass_caller<short>, 
-        min_max_mask_multipass_caller<int>, min_max_mask_multipass_caller<float>, 0 },
-      { min_max_mask_caller<unsigned char>, min_max_mask_caller<char>, 
-        min_max_mask_caller<unsigned short>, min_max_mask_caller<short>, 
-        min_max_mask_caller<int>, min_max_mask_caller<float>, 
-        min_max_mask_caller<double> } };
+    static Caller singlepass_callers[7] = { 
+            minMaxCaller<unsigned char>, minMaxCaller<char>, 
+            minMaxCaller<unsigned short>, minMaxCaller<short>, 
+            minMaxCaller<int>, minMaxCaller<float>, minMaxCaller<double> };

+    static MaskedCaller masked_multipass_callers[7] = { 
+            minMaxMaskMultipassCaller<unsigned char>, minMaxMaskMultipassCaller<char>, 
+            minMaxMaskMultipassCaller<unsigned short>, minMaxMaskMultipassCaller<short>,
+            minMaxMaskMultipassCaller<int>, minMaxMaskMultipassCaller<float>, 0 };
+
+    static MaskedCaller masked_singlepass_callers[7] = { 
+            minMaxMaskCaller<unsigned char>, minMaxMaskCaller<char>, 
+            minMaxMaskCaller<unsigned short>, minMaxMaskCaller<short>, 
+            minMaxMaskCaller<int>, minMaxMaskCaller<float>, 
+            minMaxMaskCaller<double> };

    CV_Assert(src.channels() == 1);
    CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size()));
@ -263,19 +282,27 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
    double minVal_; if (!minVal) minVal = &minVal_;
    double maxVal_; if (!maxVal) maxVal = &maxVal_;
    
-    Size bufSize;
-    get_buf_size_required(src.cols, src.rows, src.elemSize(), bufSize.width, bufSize.height);
-    ensureSizeIsEnough(bufSize, CV_8U, buf);
+    Size buf_size;
+    getBufSizeRequired(src.cols, src.rows, src.elemSize(), buf_size.width, buf_size.height);
+    ensureSizeIsEnough(buf_size, CV_8U, buf);

    if (mask.empty())
    {
-        Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()];
+        Caller* callers = multipass_callers;
+        if (ptxVersionIsGreaterOrEqual(1, 1) && hasAtomicsSupport(getDevice()))
+            callers = singlepass_callers;
+
+        Caller caller = callers[src.type()];
        if (!caller) CV_Error(CV_StsBadArg, "minMax: unsupported type");
        caller(src, minVal, maxVal, buf);
    }
    else
    {
-        MaskedCaller caller = masked_callers[hasAtomicsSupport(getDevice())][src.type()];
+        MaskedCaller* callers = masked_multipass_callers;
+        if (ptxVersionIsGreaterOrEqual(1, 1) && hasAtomicsSupport(getDevice()))
+            callers = masked_singlepass_callers;
+
+        MaskedCaller caller = callers[src.type()];
        if (!caller) CV_Error(CV_StsBadArg, "minMax: unsupported type");
        caller(src, mask, minVal, maxVal, buf);
    }
@ -287,23 +314,23 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp

 namespace cv { namespace gpu { namespace mathfunc { namespace minmaxloc {

-    void get_buf_size_required(int cols, int rows, int elem_size, int& b1cols, 
+    void getBufSizeRequired(int cols, int rows, int elem_size, int& b1cols, 
                               int& b1rows, int& b2cols, int& b2rows);

    template <typename T> 
-    void min_max_loc_caller(const DevMem2D src, double* minval, double* maxval, 
+    void minMaxLocCaller(const DevMem2D src, double* minval, double* maxval, 
                            int minloc[2], int maxloc[2], PtrStep valBuf, PtrStep locBuf);

    template <typename T> 
-    void min_max_loc_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, 
+    void minMaxLocMaskCaller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, 
                                 int minloc[2], int maxloc[2], PtrStep valBuf, PtrStep locBuf);

    template <typename T> 
-    void min_max_loc_multipass_caller(const DevMem2D src, double* minval, double* maxval, 
+    void minMaxLocMultipassCaller(const DevMem2D src, double* minval, double* maxval, 
                                     int minloc[2], int maxloc[2], PtrStep valBuf, PtrStep locBuf);

    template <typename T> 
-    void min_max_loc_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, 
+    void minMaxLocMaskMultipassCaller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, 
                                           int minloc[2], int maxloc[2], PtrStep valBuf, PtrStep locBuf);
 }}}}

@ -323,21 +350,26 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
    typedef void (*Caller)(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
    typedef void (*MaskedCaller)(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);

-    static const Caller callers[2][7] = 
-    { { min_max_loc_multipass_caller<unsigned char>, min_max_loc_multipass_caller<char>, 
-        min_max_loc_multipass_caller<unsigned short>, min_max_loc_multipass_caller<short>, 
-        min_max_loc_multipass_caller<int>, min_max_loc_multipass_caller<float>, 0 },
-      { min_max_loc_caller<unsigned char>, min_max_loc_caller<char>, 
-        min_max_loc_caller<unsigned short>, min_max_loc_caller<short>, 
-        min_max_loc_caller<int>, min_max_loc_caller<float>, min_max_loc_caller<double> } };
+    static Caller multipass_callers[7] = { 
+            minMaxLocMultipassCaller<unsigned char>, minMaxLocMultipassCaller<char>, 
+            minMaxLocMultipassCaller<unsigned short>, minMaxLocMultipassCaller<short>, 
+            minMaxLocMultipassCaller<int>, minMaxLocMultipassCaller<float>, 0 };

-    static const MaskedCaller masked_callers[2][7] = 
-    { { min_max_loc_mask_multipass_caller<unsigned char>, min_max_loc_mask_multipass_caller<char>, 
-        min_max_loc_mask_multipass_caller<unsigned short>, min_max_loc_mask_multipass_caller<short>, 
-        min_max_loc_mask_multipass_caller<int>, min_max_loc_mask_multipass_caller<float>, 0 },
-      { min_max_loc_mask_caller<unsigned char>, min_max_loc_mask_caller<char>, 
-        min_max_loc_mask_caller<unsigned short>, min_max_loc_mask_caller<short>, 
-        min_max_loc_mask_caller<int>, min_max_loc_mask_caller<float>, min_max_loc_mask_caller<double> } };
+    static Caller singlepass_callers[7] = { 
+            minMaxLocCaller<unsigned char>, minMaxLocCaller<char>, 
+            minMaxLocCaller<unsigned short>, minMaxLocCaller<short>, 
+            minMaxLocCaller<int>, minMaxLocCaller<float>, minMaxLocCaller<double> };
+
+    static MaskedCaller masked_multipass_callers[7] = { 
+            minMaxLocMaskMultipassCaller<unsigned char>, minMaxLocMaskMultipassCaller<char>, 
+            minMaxLocMaskMultipassCaller<unsigned short>, minMaxLocMaskMultipassCaller<short>, 
+            minMaxLocMaskMultipassCaller<int>, minMaxLocMaskMultipassCaller<float>, 0 };
+
+    static MaskedCaller masked_singlepass_callers[7] = { 
+            minMaxLocMaskCaller<unsigned char>, minMaxLocMaskCaller<char>, 
+            minMaxLocMaskCaller<unsigned short>, minMaxLocMaskCaller<short>, 
+            minMaxLocMaskCaller<int>, minMaxLocMaskCaller<float>, 
+            minMaxLocMaskCaller<double> };

    CV_Assert(src.channels() == 1);
    CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size()));
@ -348,21 +380,29 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
    int minLoc_[2];
    int maxLoc_[2];

-    Size valBufSize, locBufSize;
-    get_buf_size_required(src.cols, src.rows, src.elemSize(), valBufSize.width, 
-                          valBufSize.height, locBufSize.width, locBufSize.height);
-    ensureSizeIsEnough(valBufSize, CV_8U, valBuf);
-    ensureSizeIsEnough(locBufSize, CV_8U, locBuf);
+    Size valbuf_size, locbuf_size;
+    getBufSizeRequired(src.cols, src.rows, src.elemSize(), valbuf_size.width, 
+                          valbuf_size.height, locbuf_size.width, locbuf_size.height);
+    ensureSizeIsEnough(valbuf_size, CV_8U, valBuf);
+    ensureSizeIsEnough(locbuf_size, CV_8U, locBuf);

    if (mask.empty())
    {
-        Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()];
+        Caller* callers = multipass_callers;
+        if (ptxVersionIsGreaterOrEqual(1, 1) && hasAtomicsSupport(getDevice()))
+            callers = singlepass_callers;
+
+        Caller caller = callers[src.type()];
        if (!caller) CV_Error(CV_StsBadArg, "minMaxLoc: unsupported type");
        caller(src, minVal, maxVal, minLoc_, maxLoc_, valBuf, locBuf);
    }
    else
    {
-        MaskedCaller caller = masked_callers[hasAtomicsSupport(getDevice())][src.type()];
+        MaskedCaller* callers = masked_multipass_callers;
+        if (ptxVersionIsGreaterOrEqual(1, 1) && hasAtomicsSupport(getDevice()))
+            callers = masked_singlepass_callers;
+
+        MaskedCaller caller = callers[src.type()];
        if (!caller) CV_Error(CV_StsBadArg, "minMaxLoc: unsupported type");
        caller(src, mask, minVal, maxVal, minLoc_, maxLoc_, valBuf, locBuf);
    }
@ -376,13 +416,13 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point

 namespace cv { namespace gpu { namespace mathfunc { namespace countnonzero {

-    void get_buf_size_required(int cols, int rows, int& bufcols, int& bufrows);
+    void getBufSizeRequired(int cols, int rows, int& bufcols, int& bufrows);

    template <typename T> 
-    int count_non_zero_caller(const DevMem2D src, PtrStep buf);
+    int countNonZeroCaller(const DevMem2D src, PtrStep buf);

    template <typename T> 
-    int count_non_zero_multipass_caller(const DevMem2D src, PtrStep buf);
+    int countNonZeroMultipassCaller(const DevMem2D src, PtrStep buf);

 }}}}

@ -400,22 +440,29 @@ int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)

    typedef int (*Caller)(const DevMem2D src, PtrStep buf);

-    static const Caller callers[2][7] = 
-    { { count_non_zero_multipass_caller<unsigned char>, count_non_zero_multipass_caller<char>,
-        count_non_zero_multipass_caller<unsigned short>, count_non_zero_multipass_caller<short>,
-        count_non_zero_multipass_caller<int>, count_non_zero_multipass_caller<float>, 0},
-      { count_non_zero_caller<unsigned char>, count_non_zero_caller<char>,
-        count_non_zero_caller<unsigned short>, count_non_zero_caller<short>,
-        count_non_zero_caller<int>, count_non_zero_caller<float>, count_non_zero_caller<double> } };
+    static Caller multipass_callers[7] = { 
+            countNonZeroMultipassCaller<unsigned char>, countNonZeroMultipassCaller<char>,
+            countNonZeroMultipassCaller<unsigned short>, countNonZeroMultipassCaller<short>,
+            countNonZeroMultipassCaller<int>, countNonZeroMultipassCaller<float>, 0 };
+
+    static Caller singlepass_callers[7] = { 
+            countNonZeroCaller<unsigned char>, countNonZeroCaller<char>,
+            countNonZeroCaller<unsigned short>, countNonZeroCaller<short>,
+            countNonZeroCaller<int>, countNonZeroCaller<float>, 
+            countNonZeroCaller<double> };

    CV_Assert(src.channels() == 1);
    CV_Assert(src.type() != CV_64F || hasNativeDoubleSupport(getDevice()));

-    Size bufSize;
-    get_buf_size_required(src.cols, src.rows, bufSize.width, bufSize.height);
-    ensureSizeIsEnough(bufSize, CV_8U, buf);
+    Size buf_size;
+    getBufSizeRequired(src.cols, src.rows, buf_size.width, buf_size.height);
+    ensureSizeIsEnough(buf_size, CV_8U, buf);

-    Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()];
+    Caller* callers = multipass_callers;
+    if (ptxVersionIsGreaterOrEqual(1, 1) && hasAtomicsSupport(getDevice()))
+        callers = singlepass_callers;
+
+    Caller caller = callers[src.type()];
    if (!caller) CV_Error(CV_StsBadArg, "countNonZero: unsupported type");
    return caller(src, buf);
 }