Added assertion on device features (global atomics) to GPU tests

2012-03-27 07:33:39 +00:00 · 2012-03-27 07:33:39 +00:00 · bd13e9479b
commit bd13e9479b
parent 4a996111ea
6 changed files with 498 additions and 358 deletions
--- a/modules/gpu/src/brute_force_matcher.cpp
+++ b/modules/gpu/src/brute_force_matcher.cpp
@@ -82,76 +82,76 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, vector< vec

 #else /* !defined (HAVE_CUDA) */

-namespace cv { namespace gpu { namespace device 
+namespace cv { namespace gpu { namespace device
 {
    namespace bf_match
    {
-        template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask, 
-            const DevMem2Di& trainIdx, const DevMem2Df& distance, 
+        template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask,
+            const DevMem2Di& trainIdx, const DevMem2Df& distance,
            int cc, cudaStream_t stream);
-        template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask, 
-            const DevMem2Di& trainIdx, const DevMem2Df& distance, 
+        template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask,
+            const DevMem2Di& trainIdx, const DevMem2Df& distance,
            int cc, cudaStream_t stream);
-        template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask, 
-            const DevMem2Di& trainIdx, const DevMem2Df& distance, 
+        template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask,
+            const DevMem2Di& trainIdx, const DevMem2Df& distance,
            int cc, cudaStream_t stream);

-        template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, 
-            const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, 
+        template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
+            const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
            int cc, cudaStream_t stream);
-        template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, 
-            const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, 
+        template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
+            const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
            int cc, cudaStream_t stream);
-        template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, 
+        template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
            const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
            int cc, cudaStream_t stream);
    }

    namespace bf_knnmatch
    {
-        template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask, 
-            const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, 
+        template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask,
+            const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,
            int cc, cudaStream_t stream);
-        template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask, 
-            const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, 
+        template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask,
+            const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,
            int cc, cudaStream_t stream);
-        template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask, 
-            const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, 
+        template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask,
+            const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,
            int cc, cudaStream_t stream);

-        template <typename T> void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, 
-            const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, 
+        template <typename T> void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
+            const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,
            int cc, cudaStream_t stream);
-        template <typename T> void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, 
-            const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, 
+        template <typename T> void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
+            const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,
            int cc, cudaStream_t stream);
-        template <typename T> void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, 
-            const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, 
+        template <typename T> void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
+            const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,
            int cc, cudaStream_t stream);
    }

-    namespace bf_radius_match 
+    namespace bf_radius_match
    {
-        template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, 
-            const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, 
+        template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
+            const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
            int cc, cudaStream_t stream);
-        template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, 
-            const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, 
+        template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
+            const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
            int cc, cudaStream_t stream);
-        template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, 
-            const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, 
+        template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
+            const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
            int cc, cudaStream_t stream);

-        template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, 
-            const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, 
+        template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
+            const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
            int cc, cudaStream_t stream);

-        template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, 
-            const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, 
+        template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
+            const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
            int cc, cudaStream_t stream);

-        template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, 
-            const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, 
+        template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
+            const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
            int cc, cudaStream_t stream);
    }
 }}}
@@ -192,7 +192,7 @@ bool cv::gpu::BruteForceMatcher_GPU_base::isMaskSupported() const
 // Match

 void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& query, const GpuMat& train,
-    GpuMat& trainIdx, GpuMat& distance, 
+    GpuMat& trainIdx, GpuMat& distance,
    const GpuMat& mask, Stream& stream)
 {
    if (query.empty() || train.empty())
@@ -200,25 +200,25 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& query, const

    using namespace ::cv::gpu::device::bf_match;

-    typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask, 
+    typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask,
                             const DevMem2Di& trainIdx, const DevMem2Df& distance,
                             int cc, cudaStream_t stream);

    static const caller_t callers[3][6] =
    {
        {
-            matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/, 
-            matchL1_gpu<unsigned short>, matchL1_gpu<short>, 
+            matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/,
+            matchL1_gpu<unsigned short>, matchL1_gpu<short>,
            matchL1_gpu<int>, matchL1_gpu<float>
        },
        {
-            0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/, 
-            0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/, 
+            0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/,
+            0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/,
            0/*matchL2_gpu<int>*/, matchL2_gpu<float>
        },
        {
-            matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/, 
-            matchHamming_gpu<unsigned short>, 0/*matchHamming_gpu<short>*/, 
+            matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/,
+            matchHamming_gpu<unsigned short>, 0/*matchHamming_gpu<short>*/,
            matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
        }
    };
@@ -334,7 +334,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::makeGpuCollection(GpuMat& trainCollect
 }

 void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& query, const GpuMat& trainCollection,
-    GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, 
+    GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance,
    const GpuMat& masks, Stream& stream)
 {
    if (query.empty() || trainCollection.empty())
@@ -342,8 +342,8 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& query, c

    using namespace ::cv::gpu::device::bf_match;

-    typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, 
-                             const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, 
+    typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
+                             const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
                             int cc, cudaStream_t stream);

    static const caller_t callers[3][6] =
@@ -453,25 +453,25 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatchSingle(const GpuMat& query, co

    using namespace ::cv::gpu::device::bf_knnmatch;

-    typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask, 
-                             const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, 
+    typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask,
+                             const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,
                             int cc, cudaStream_t stream);

    static const caller_t callers[3][6] =
    {
        {
-            matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/, 
-            matchL1_gpu<unsigned short>, matchL1_gpu<short>, 
+            matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/,
+            matchL1_gpu<unsigned short>, matchL1_gpu<short>,
            matchL1_gpu<int>, matchL1_gpu<float>
        },
        {
-            0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/, 
-            0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/, 
+            0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/,
+            0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/,
            0/*matchL2_gpu<int>*/, matchL2_gpu<float>
        },
        {
-            matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/, 
-            matchHamming_gpu<unsigned short>, 0/*matchHamming_gpu<short>*/, 
+            matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/,
+            matchHamming_gpu<unsigned short>, 0/*matchHamming_gpu<short>*/,
            matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
        }
    };
@@ -501,7 +501,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatchSingle(const GpuMat& query, co

    caller_t func = callers[distType][query.depth()];
    CV_Assert(func != 0);
-    
+
    DeviceInfo info;
    int cc = info.majorVersion() * 10 + info.minorVersion();

@@ -520,7 +520,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatchDownload(const GpuMat& trainId
    knnMatchConvert(trainIdxCPU, distanceCPU, matches, compactResult);
 }

-void cv::gpu::BruteForceMatcher_GPU_base::knnMatchConvert(const Mat& trainIdx, const Mat& distance, 
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatchConvert(const Mat& trainIdx, const Mat& distance,
    vector< vector<DMatch> >& matches, bool compactResult)
 {
    if (trainIdx.empty() || distance.empty())
@@ -536,7 +536,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatchConvert(const Mat& trainIdx, c

    matches.clear();
    matches.reserve(nQuery);
-    
+
    const int* trainIdx_ptr = trainIdx.ptr<int>();
    const float* distance_ptr = distance.ptr<float>();

@@ -582,25 +582,25 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Collection(const GpuMat& quer

    using namespace ::cv::gpu::device::bf_knnmatch;

-    typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, 
-                             const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, 
+    typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
+                             const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,
                             int cc, cudaStream_t stream);

    static const caller_t callers[3][6] =
    {
        {
-            match2L1_gpu<unsigned char>, 0/*match2L1_gpu<signed char>*/, 
-            match2L1_gpu<unsigned short>, match2L1_gpu<short>, 
+            match2L1_gpu<unsigned char>, 0/*match2L1_gpu<signed char>*/,
+            match2L1_gpu<unsigned short>, match2L1_gpu<short>,
            match2L1_gpu<int>, match2L1_gpu<float>
        },
        {
-            0/*match2L2_gpu<unsigned char>*/, 0/*match2L2_gpu<signed char>*/, 
-            0/*match2L2_gpu<unsigned short>*/, 0/*match2L2_gpu<short>*/, 
+            0/*match2L2_gpu<unsigned char>*/, 0/*match2L2_gpu<signed char>*/,
+            0/*match2L2_gpu<unsigned short>*/, 0/*match2L2_gpu<short>*/,
            0/*match2L2_gpu<int>*/, match2L2_gpu<float>
        },
        {
-            match2Hamming_gpu<unsigned char>, 0/*match2Hamming_gpu<signed char>*/, 
-            match2Hamming_gpu<unsigned short>, 0/*match2Hamming_gpu<short>*/, 
+            match2Hamming_gpu<unsigned char>, 0/*match2Hamming_gpu<signed char>*/,
+            match2Hamming_gpu<unsigned short>, 0/*match2Hamming_gpu<short>*/,
            match2Hamming_gpu<int>, 0/*match2Hamming_gpu<float>*/
        }
    };
@@ -620,7 +620,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Collection(const GpuMat& quer

    caller_t func = callers[distType][query.depth()];
    CV_Assert(func != 0);
-    
+
    DeviceInfo info;
    int cc = info.majorVersion() * 10 + info.minorVersion();

@@ -654,7 +654,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Convert(const Mat& trainIdx,

    matches.clear();
    matches.reserve(nQuery);
-    
+
    const int* trainIdx_ptr = trainIdx.ptr<int>();
    const int* imgIdx_ptr = imgIdx.ptr<int>();
    const float* distance_ptr = distance.ptr<float>();
@@ -755,33 +755,33 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& query, vector<
 // RadiusMatch

 void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchSingle(const GpuMat& query, const GpuMat& train,
-    GpuMat& trainIdx, GpuMat& distance, GpuMat& nMatches, float maxDistance, 
+    GpuMat& trainIdx, GpuMat& distance, GpuMat& nMatches, float maxDistance,
    const GpuMat& mask, Stream& stream)
 {
    if (query.empty() || train.empty())
        return;

-    using namespace ::cv::gpu::device::bf_radius_match;
+    using namespace cv::gpu::device::bf_radius_match;

-    typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, 
-                             const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, 
+    typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
+                             const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
                             int cc, cudaStream_t stream);

    static const caller_t callers[3][6] =
    {
        {
-            matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/, 
-            matchL1_gpu<unsigned short>, matchL1_gpu<short>, 
+            matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/,
+            matchL1_gpu<unsigned short>, matchL1_gpu<short>,
            matchL1_gpu<int>, matchL1_gpu<float>
        },
        {
-            0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/, 
-            0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/, 
+            0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/,
+            0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/,
            0/*matchL2_gpu<int>*/, matchL2_gpu<float>
        },
        {
-            matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/, 
-            matchHamming_gpu<unsigned short>, 0/*matchHamming_gpu<short>*/, 
+            matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/,
+            matchHamming_gpu<unsigned short>, 0/*matchHamming_gpu<short>*/,
            matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
        }
    };
@@ -789,7 +789,8 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchSingle(const GpuMat& query,
    DeviceInfo info;
    int cc = info.majorVersion() * 10 + info.minorVersion();

-    CV_Assert(TargetArchs::builtWith(GLOBAL_ATOMICS) && info.supports(GLOBAL_ATOMICS));
+    if (!TargetArchs::builtWith(GLOBAL_ATOMICS) || !DeviceInfo().supports(GLOBAL_ATOMICS))
+        CV_Error(CV_StsNotImplemented, "The device doesn't support global atomics");

    const int nQuery = query.rows;
    const int nTrain = train.rows;
@@ -804,19 +805,19 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchSingle(const GpuMat& query,
        ensureSizeIsEnough(nQuery, std::max((nTrain / 100), 10), CV_32SC1, trainIdx);
        ensureSizeIsEnough(nQuery, std::max((nTrain / 100), 10), CV_32FC1, distance);
    }
-    
+
    if (stream)
        stream.enqueueMemSet(nMatches, Scalar::all(0));
    else
        nMatches.setTo(Scalar::all(0));

    caller_t func = callers[distType][query.depth()];
-    CV_Assert(func != 0);    
+    CV_Assert(func != 0);

    func(query, train, maxDistance, mask, trainIdx, distance, nMatches, cc, StreamAccessor::getStream(stream));
 }

-void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& distance, const GpuMat& nMatches, 
+void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& distance, const GpuMat& nMatches,
    vector< vector<DMatch> >& matches, bool compactResult)
 {
    if (trainIdx.empty() || distance.empty() || nMatches.empty())
@@ -886,33 +887,33 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& query, const
    radiusMatchDownload(trainIdx, distance, nMatches, matches, compactResult);
 }

-void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat& query, GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, GpuMat& nMatches, 
+void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat& query, GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, GpuMat& nMatches,
    float maxDistance, const vector<GpuMat>& masks, Stream& stream)
 {
    if (query.empty() || empty())
        return;

-    using namespace ::cv::gpu::device::bf_radius_match;
+    using namespace cv::gpu::device::bf_radius_match;

-    typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, 
-                             const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, 
+    typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
+                             const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
                             int cc, cudaStream_t stream);

    static const caller_t callers[3][6] =
    {
        {
-            matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/, 
-            matchL1_gpu<unsigned short>, matchL1_gpu<short>, 
+            matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/,
+            matchL1_gpu<unsigned short>, matchL1_gpu<short>,
            matchL1_gpu<int>, matchL1_gpu<float>
        },
        {
-            0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/, 
-            0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/, 
+            0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/,
+            0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/,
            0/*matchL2_gpu<int>*/, matchL2_gpu<float>
        },
        {
-            matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/, 
-            matchHamming_gpu<unsigned short>, 0/*matchHamming_gpu<short>*/, 
+            matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/,
+            matchHamming_gpu<unsigned short>, 0/*matchHamming_gpu<short>*/,
            matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
        }
    };
@@ -920,7 +921,8 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat& qu
    DeviceInfo info;
    int cc = info.majorVersion() * 10 + info.minorVersion();

-    CV_Assert(TargetArchs::builtWith(GLOBAL_ATOMICS) && info.supports(GLOBAL_ATOMICS));
+    if (!TargetArchs::builtWith(GLOBAL_ATOMICS) || !DeviceInfo().supports(GLOBAL_ATOMICS))
+        CV_Error(CV_StsNotImplemented, "The device doesn't support global atomics");

    const int nQuery = query.rows;

@@ -934,7 +936,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat& qu
        ensureSizeIsEnough(nQuery, std::max((nQuery / 100), 10), CV_32SC1, imgIdx);
        ensureSizeIsEnough(nQuery, std::max((nQuery / 100), 10), CV_32FC1, distance);
    }
-    
+
    if (stream)
        stream.enqueueMemSet(nMatches, Scalar::all(0));
    else
@@ -946,7 +948,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat& qu
    vector<DevMem2Db> trains_(trainDescCollection.begin(), trainDescCollection.end());
    vector<DevMem2Db> masks_(masks.begin(), masks.end());

-    func(query, &trains_[0], static_cast<int>(trains_.size()), maxDistance, masks_.size() == 0 ? 0 : &masks_[0], 
+    func(query, &trains_[0], static_cast<int>(trains_.size()), maxDistance, masks_.size() == 0 ? 0 : &masks_[0],
        trainIdx, imgIdx, distance, nMatches, cc, StreamAccessor::getStream(stream));
 }

--- a/modules/gpu/src/cuda/surf.cu
+++ b/modules/gpu/src/cuda/surf.cu
@@ -40,7 +40,7 @@
 //
 // Copyright (c) 2010, Paul Furgale, Chi Hay Tong
 //
-// The original code was written by Paul Furgale and Chi Hay Tong 
+// The original code was written by Paul Furgale and Chi Hay Tong
 // and later optimized and prepared for integration into OpenCV by Itseez.
 //
 //M*/
@@ -52,9 +52,9 @@
 #include "opencv2/gpu/device/functional.hpp"
 #include "opencv2/gpu/device/filters.hpp"

-namespace cv { namespace gpu { namespace device 
+namespace cv { namespace gpu { namespace device
 {
-    namespace surf 
+    namespace surf
    {
        ////////////////////////////////////////////////////////////////////////
        // Global parameters
@@ -123,7 +123,7 @@ namespace cv { namespace gpu { namespace device
        #endif

            float ratio = (float)newSize / oldSize;
-            
+
            real_t d = 0;

            #pragma unroll
@@ -225,7 +225,7 @@ namespace cv { namespace gpu { namespace device
            static __device__ bool check(int sum_i, int sum_j, int size)
            {
                float ratio = (float)size / 9.0f;
-                
+
                float d = 0;

                int dx1 = __float2int_rn(ratio * c_DM[0]);
@@ -423,12 +423,12 @@ namespace cv { namespace gpu { namespace device
                    if (::fabs(x[0]) <= 1.f && ::fabs(x[1]) <= 1.f && ::fabs(x[2]) <= 1.f)
                    {
                        // if the step is within the interpolation region, perform it
-                        
+
                        const int size = calcSize(c_octave, maxPos.z);

                        const int sum_i = (maxPos.y - ((size >> 1) >> c_octave)) << c_octave;
                        const int sum_j = (maxPos.x - ((size >> 1) >> c_octave)) << c_octave;
-                        
+
                        const float center_i = sum_i + (float)(size - 1) / 2;
                        const float center_j = sum_j + (float)(size - 1) / 2;

@@ -471,8 +471,8 @@ namespace cv { namespace gpu { namespace device
            #endif
        }

-        void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter, 
-            float* featureX, float* featureY, int* featureLaplacian, int* featureOctave, float* featureSize, float* featureHessian, 
+        void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter,
+            float* featureX, float* featureY, int* featureLaplacian, int* featureOctave, float* featureSize, float* featureHessian,
            unsigned int* featureCounter)
        {
            dim3 threads;
@@ -509,7 +509,8 @@ namespace cv { namespace gpu { namespace device
            __shared__ float s_Y[128];
            __shared__ float s_angle[128];

-            __shared__ float s_sum[32 * 4];
+            __shared__ float s_sumx[32 * 4];
+            __shared__ float s_sumy[32 * 4];

            /* The sampling intervals and wavelet sized for selecting an orientation
             and building the keypoint descriptor are defined relative to 's' */
@ -522,126 +523,109 @@ namespace cv { namespace gpu { namespace device
            const int grad_wav_size = 2 * __float2int_rn(2.0f * s);

            // check when grad_wav_size is too big
-            if ((c_img_rows + 1) >= grad_wav_size && (c_img_cols + 1) >= grad_wav_size)
+            if ((c_img_rows + 1) < grad_wav_size || (c_img_cols + 1) < grad_wav_size)
+                return;
+
+            // Calc X, Y, angle and store it to shared memory
+            const int tid = threadIdx.y * blockDim.x + threadIdx.x;
+
+            float X = 0.0f, Y = 0.0f, angle = 0.0f;
+
+            if (tid < ORI_SAMPLES)
            {
-                // Calc X, Y, angle and store it to shared memory
-                const int tid = threadIdx.y * blockDim.x + threadIdx.x;
+                const float margin = (float)(grad_wav_size - 1) / 2.0f;
+                const int x = __float2int_rn(featureX[blockIdx.x] + c_aptX[tid] * s - margin);
+                const int y = __float2int_rn(featureY[blockIdx.x] + c_aptY[tid] * s - margin);

-                float X = 0.0f, Y = 0.0f, angle = 0.0f;
-
-                if (tid < ORI_SAMPLES)
+                if (y >= 0 && y < (c_img_rows + 1) - grad_wav_size &&
+                    x >= 0 && x < (c_img_cols + 1) - grad_wav_size)
                {
-                    const float margin = (float)(grad_wav_size - 1) / 2.0f;
-                    const int x = __float2int_rn(featureX[blockIdx.x] + c_aptX[tid] * s - margin);
-                    const int y = __float2int_rn(featureY[blockIdx.x] + c_aptY[tid] * s - margin);
+                    X = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NX, 4, grad_wav_size, y, x);
+                    Y = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NY, 4, grad_wav_size, y, x);

-                    if ((unsigned)y < (unsigned)((c_img_rows + 1) - grad_wav_size) && (unsigned)x < (unsigned)((c_img_cols + 1) - grad_wav_size))
-                    {
-                        X = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NX, 4, grad_wav_size, y, x);
-                        Y = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NY, 4, grad_wav_size, y, x);
-                    
-                        angle = atan2f(Y, X);
-                        if (angle < 0)
-                            angle += 2.0f * CV_PI_F;
-                        angle *= 180.0f / CV_PI_F;
-                    }
+                    angle = atan2f(Y, X);
+                    if (angle < 0)
+                        angle += 2.0f * CV_PI_F;
+                    angle *= 180.0f / CV_PI_F;
                }
-                s_X[tid] = X;
-                s_Y[tid] = Y;
-                s_angle[tid] = angle;
+            }
+            s_X[tid] = X;
+            s_Y[tid] = Y;
+            s_angle[tid] = angle;
+            __syncthreads();
+
+            float bestx = 0, besty = 0, best_mod = 0;
+
+            #pragma unroll
+            for (int i = 0; i < 18; ++i)
+            {
+                const int dir = (i * 4 + threadIdx.y) * ORI_SEARCH_INC;
+
+                float sumx = 0.0f, sumy = 0.0f;
+                int d = ::abs(__float2int_rn(s_angle[threadIdx.x]) - dir);
+                if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
+                {
+                    sumx = s_X[threadIdx.x];
+                    sumy = s_Y[threadIdx.x];
+                }
+                d = ::abs(__float2int_rn(s_angle[threadIdx.x + 32]) - dir);
+                if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
+                {
+                    sumx += s_X[threadIdx.x + 32];
+                    sumy += s_Y[threadIdx.x + 32];
+                }
+                d = ::abs(__float2int_rn(s_angle[threadIdx.x + 64]) - dir);
+                if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
+                {
+                    sumx += s_X[threadIdx.x + 64];
+                    sumy += s_Y[threadIdx.x + 64];
+                }
+                d = ::abs(__float2int_rn(s_angle[threadIdx.x + 96]) - dir);
+                if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
+                {
+                    sumx += s_X[threadIdx.x + 96];
+                    sumy += s_Y[threadIdx.x + 96];
+                }
+
+                device::reduce<32>(s_sumx + threadIdx.y * 32, sumx, threadIdx.x, plus<volatile float>());
+                device::reduce<32>(s_sumy + threadIdx.y * 32, sumy, threadIdx.x, plus<volatile float>());
+
+                const float temp_mod = sumx * sumx + sumy * sumy;
+                if (temp_mod > best_mod)
+                {
+                    best_mod = temp_mod;
+                    bestx = sumx;
+                    besty = sumy;
+                }
+
                __syncthreads();
+            }

-                float bestx = 0, besty = 0, best_mod = 0;
+            if (threadIdx.x == 0)
+            {
+                s_X[threadIdx.y] = bestx;
+                s_Y[threadIdx.y] = besty;
+                s_angle[threadIdx.y] = best_mod;
+            }
+            __syncthreads();

-                #pragma unroll
-                for (int i = 0; i < 18; ++i)
-                {
-                    const int dir = (i * 4 + threadIdx.y) * ORI_SEARCH_INC;
+            if (threadIdx.x == 0 && threadIdx.y == 0)
+            {
+                int bestIdx = 0;

-                    float sumx = 0.0f, sumy = 0.0f;
-                    int d = ::abs(__float2int_rn(s_angle[threadIdx.x]) - dir);
-                    if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
-                    {
-                        sumx = s_X[threadIdx.x];
-                        sumy = s_Y[threadIdx.x];
-                    }
-                    d = ::abs(__float2int_rn(s_angle[threadIdx.x + 32]) - dir);
-                    if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
-                    {
-                        sumx += s_X[threadIdx.x + 32];
-                        sumy += s_Y[threadIdx.x + 32];
-                    }
-                    d = ::abs(__float2int_rn(s_angle[threadIdx.x + 64]) - dir);
-                    if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
-                    {
-                        sumx += s_X[threadIdx.x + 64];
-                        sumy += s_Y[threadIdx.x + 64];
-                    }
-                    d = ::abs(__float2int_rn(s_angle[threadIdx.x + 96]) - dir);
-                    if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
-                    {
-                        sumx += s_X[threadIdx.x + 96];
-                        sumy += s_Y[threadIdx.x + 96];
-                    }
+                if (s_angle[1] > s_angle[bestIdx])
+                    bestIdx = 1;
+                if (s_angle[2] > s_angle[bestIdx])
+                    bestIdx = 2;
+                if (s_angle[3] > s_angle[bestIdx])
+                    bestIdx = 3;

-                    float* s_sum_row = s_sum + threadIdx.y * 32;
+                float kp_dir = atan2f(s_Y[bestIdx], s_X[bestIdx]);
+                if (kp_dir < 0)
+                    kp_dir += 2.0f * CV_PI_F;
+                kp_dir *= 180.0f / CV_PI_F;

-                    device::reduce<32>(s_sum_row, sumx, threadIdx.x, plus<volatile float>());
-                    device::reduce<32>(s_sum_row, sumy, threadIdx.x, plus<volatile float>());
-
-                    const float temp_mod = sumx * sumx + sumy * sumy;
-                    if (temp_mod > best_mod)
-                    {
-                        best_mod = temp_mod;
-                        bestx = sumx;
-                        besty = sumy;
-                    }
-
-                    __syncthreads();
-                }
-
-                if (threadIdx.x == 0)
-                {
-                    s_X[threadIdx.y] = bestx;
-                    s_Y[threadIdx.y] = besty;
-                    s_angle[threadIdx.y] = best_mod;
-                }
-                __syncthreads();
-
-                if (threadIdx.x < 2 && threadIdx.y == 0)
-                {
-                    volatile float* v_x = s_X;
-                    volatile float* v_y = s_Y;
-                    volatile float* v_mod = s_angle;
-
-                    bestx = v_x[threadIdx.x];
-                    besty = v_y[threadIdx.x];
-                    best_mod = v_mod[threadIdx.x];
-
-                    float temp_mod = v_mod[threadIdx.x + 2];
-                    if (temp_mod > best_mod)
-                    {
-                        v_x[threadIdx.x] = bestx = v_x[threadIdx.x + 2];
-                        v_y[threadIdx.x] = besty = v_y[threadIdx.x + 2];
-                        v_mod[threadIdx.x] = best_mod = temp_mod;
-                    }
-                    temp_mod = v_mod[threadIdx.x + 1];
-                    if (temp_mod > best_mod)
-                    {
-                        v_x[threadIdx.x] = bestx = v_x[threadIdx.x + 1];
-                        v_y[threadIdx.x] = besty = v_y[threadIdx.x + 1];
-                    }
-                }
-
-                if (threadIdx.x == 0 && threadIdx.y == 0 && best_mod != 0)
-                {
-                    float kp_dir = atan2f(besty, bestx);
-                    if (kp_dir < 0)
-                        kp_dir += 2.0f * CV_PI_F;
-                    kp_dir *= 180.0f / CV_PI_F;
-
-                    featureDir[blockIdx.x] = kp_dir;
-                }
+                featureDir[blockIdx.x] = kp_dir;
            }
        }

@ -649,7 +633,7 @@ namespace cv { namespace gpu { namespace device
        #undef ORI_WIN
        #undef ORI_SAMPLES

-        void icvCalcOrientation_gpu(const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures) 
+        void icvCalcOrientation_gpu(const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures)
        {
            dim3 threads;
            threads.x = 32;
@ -669,27 +653,27 @@ namespace cv { namespace gpu { namespace device

        #define PATCH_SZ 20

-        __constant__ float c_DW[PATCH_SZ * PATCH_SZ] = 
+        __constant__ float c_DW[PATCH_SZ * PATCH_SZ] =
        {
-            3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f, 
-            8.444558261544444e-006f, 1.929736572492402e-005f, 4.022897701361217e-005f, 7.650675252079964e-005f, 0.0001327334903180599f, 0.0002100782585330308f, 0.0003033203829545528f, 0.0003995231236331165f, 0.0004800673632416874f, 0.0005262381164357066f, 0.0005262381164357066f, 0.0004800673632416874f, 0.0003995231236331165f, 0.0003033203829545528f, 0.0002100782585330308f, 0.0001327334903180599f, 7.650675252079964e-005f, 4.022897701361217e-005f, 1.929736572492402e-005f, 8.444558261544444e-006f, 
-            1.760426494001877e-005f, 4.022897701361217e-005f, 8.386484114453197e-005f, 0.0001594926579855382f, 0.0002767078403849155f, 0.0004379475140012801f, 0.0006323281559161842f, 0.0008328808471560478f, 0.001000790391117334f, 0.001097041997127235f, 0.001097041997127235f, 0.001000790391117334f, 0.0008328808471560478f, 0.0006323281559161842f, 0.0004379475140012801f, 0.0002767078403849155f, 0.0001594926579855382f, 8.386484114453197e-005f, 4.022897701361217e-005f, 1.760426494001877e-005f, 
-            3.34794785885606e-005f, 7.650675252079964e-005f, 0.0001594926579855382f, 0.0003033203247468919f, 0.0005262380582280457f, 0.0008328807889483869f, 0.001202550483867526f, 0.001583957928232849f, 0.001903285388834775f, 0.002086334861814976f, 0.002086334861814976f, 0.001903285388834775f, 0.001583957928232849f, 0.001202550483867526f, 0.0008328807889483869f, 0.0005262380582280457f, 0.0003033203247468919f, 0.0001594926579855382f, 7.650675252079964e-005f, 3.34794785885606e-005f, 
-            5.808438800158911e-005f, 0.0001327334903180599f, 0.0002767078403849155f, 0.0005262380582280457f, 0.0009129836107604206f, 0.001444985857233405f, 0.002086335094645619f, 0.002748048631474376f, 0.00330205773934722f, 0.003619635012000799f, 0.003619635012000799f, 0.00330205773934722f, 0.002748048631474376f, 0.002086335094645619f, 0.001444985857233405f, 0.0009129836107604206f, 0.0005262380582280457f, 0.0002767078403849155f, 0.0001327334903180599f, 5.808438800158911e-005f, 
-            9.193058212986216e-005f, 0.0002100782585330308f, 0.0004379475140012801f, 0.0008328807889483869f, 0.001444985857233405f, 0.002286989474669099f, 0.00330205773934722f, 0.004349356517195702f, 0.00522619066759944f, 0.005728822201490402f, 0.005728822201490402f, 0.00522619066759944f, 0.004349356517195702f, 0.00330205773934722f, 0.002286989474669099f, 0.001444985857233405f, 0.0008328807889483869f, 0.0004379475140012801f, 0.0002100782585330308f, 9.193058212986216e-005f, 
-            0.0001327334757661447f, 0.0003033203829545528f, 0.0006323281559161842f, 0.001202550483867526f, 0.002086335094645619f, 0.00330205773934722f, 0.004767658654600382f, 0.006279794964939356f, 0.007545807864516974f, 0.008271530270576477f, 0.008271530270576477f, 0.007545807864516974f, 0.006279794964939356f, 0.004767658654600382f, 0.00330205773934722f, 0.002086335094645619f, 0.001202550483867526f, 0.0006323281559161842f, 0.0003033203829545528f, 0.0001327334757661447f, 
-            0.0001748319627949968f, 0.0003995231236331165f, 0.0008328808471560478f, 0.001583957928232849f, 0.002748048631474376f, 0.004349356517195702f, 0.006279794964939356f, 0.008271529339253902f, 0.009939077310264111f, 0.01089497376233339f, 0.01089497376233339f, 0.009939077310264111f, 0.008271529339253902f, 0.006279794964939356f, 0.004349356517195702f, 0.002748048631474376f, 0.001583957928232849f, 0.0008328808471560478f, 0.0003995231236331165f, 0.0001748319627949968f, 
-            0.0002100782439811155f, 0.0004800673632416874f, 0.001000790391117334f, 0.001903285388834775f, 0.00330205773934722f, 0.00522619066759944f, 0.007545807864516974f, 0.009939077310264111f, 0.01194280479103327f, 0.01309141051024199f, 0.01309141051024199f, 0.01194280479103327f, 0.009939077310264111f, 0.007545807864516974f, 0.00522619066759944f, 0.00330205773934722f, 0.001903285388834775f, 0.001000790391117334f, 0.0004800673632416874f, 0.0002100782439811155f, 
-            0.0002302826324012131f, 0.0005262381164357066f, 0.001097041997127235f, 0.002086334861814976f, 0.003619635012000799f, 0.005728822201490402f, 0.008271530270576477f, 0.01089497376233339f, 0.01309141051024199f, 0.01435048412531614f, 0.01435048412531614f, 0.01309141051024199f, 0.01089497376233339f, 0.008271530270576477f, 0.005728822201490402f, 0.003619635012000799f, 0.002086334861814976f, 0.001097041997127235f, 0.0005262381164357066f, 0.0002302826324012131f, 
-            0.0002302826324012131f, 0.0005262381164357066f, 0.001097041997127235f, 0.002086334861814976f, 0.003619635012000799f, 0.005728822201490402f, 0.008271530270576477f, 0.01089497376233339f, 0.01309141051024199f, 0.01435048412531614f, 0.01435048412531614f, 0.01309141051024199f, 0.01089497376233339f, 0.008271530270576477f, 0.005728822201490402f, 0.003619635012000799f, 0.002086334861814976f, 0.001097041997127235f, 0.0005262381164357066f, 0.0002302826324012131f, 
-            0.0002100782439811155f, 0.0004800673632416874f, 0.001000790391117334f, 0.001903285388834775f, 0.00330205773934722f, 0.00522619066759944f, 0.007545807864516974f, 0.009939077310264111f, 0.01194280479103327f, 0.01309141051024199f, 0.01309141051024199f, 0.01194280479103327f, 0.009939077310264111f, 0.007545807864516974f, 0.00522619066759944f, 0.00330205773934722f, 0.001903285388834775f, 0.001000790391117334f, 0.0004800673632416874f, 0.0002100782439811155f, 
-            0.0001748319627949968f, 0.0003995231236331165f, 0.0008328808471560478f, 0.001583957928232849f, 0.002748048631474376f, 0.004349356517195702f, 0.006279794964939356f, 0.008271529339253902f, 0.009939077310264111f, 0.01089497376233339f, 0.01089497376233339f, 0.009939077310264111f, 0.008271529339253902f, 0.006279794964939356f, 0.004349356517195702f, 0.002748048631474376f, 0.001583957928232849f, 0.0008328808471560478f, 0.0003995231236331165f, 0.0001748319627949968f, 
-            0.0001327334757661447f, 0.0003033203829545528f, 0.0006323281559161842f, 0.001202550483867526f, 0.002086335094645619f, 0.00330205773934722f, 0.004767658654600382f, 0.006279794964939356f, 0.007545807864516974f, 0.008271530270576477f, 0.008271530270576477f, 0.007545807864516974f, 0.006279794964939356f, 0.004767658654600382f, 0.00330205773934722f, 0.002086335094645619f, 0.001202550483867526f, 0.0006323281559161842f, 0.0003033203829545528f, 0.0001327334757661447f, 
-            9.193058212986216e-005f, 0.0002100782585330308f, 0.0004379475140012801f, 0.0008328807889483869f, 0.001444985857233405f, 0.002286989474669099f, 0.00330205773934722f, 0.004349356517195702f, 0.00522619066759944f, 0.005728822201490402f, 0.005728822201490402f, 0.00522619066759944f, 0.004349356517195702f, 0.00330205773934722f, 0.002286989474669099f, 0.001444985857233405f, 0.0008328807889483869f, 0.0004379475140012801f, 0.0002100782585330308f, 9.193058212986216e-005f, 
-            5.808438800158911e-005f, 0.0001327334903180599f, 0.0002767078403849155f, 0.0005262380582280457f, 0.0009129836107604206f, 0.001444985857233405f, 0.002086335094645619f, 0.002748048631474376f, 0.00330205773934722f, 0.003619635012000799f, 0.003619635012000799f, 0.00330205773934722f, 0.002748048631474376f, 0.002086335094645619f, 0.001444985857233405f, 0.0009129836107604206f, 0.0005262380582280457f, 0.0002767078403849155f, 0.0001327334903180599f, 5.808438800158911e-005f, 
-            3.34794785885606e-005f, 7.650675252079964e-005f, 0.0001594926579855382f, 0.0003033203247468919f, 0.0005262380582280457f, 0.0008328807889483869f, 0.001202550483867526f, 0.001583957928232849f, 0.001903285388834775f, 0.002086334861814976f, 0.002086334861814976f, 0.001903285388834775f, 0.001583957928232849f, 0.001202550483867526f, 0.0008328807889483869f, 0.0005262380582280457f, 0.0003033203247468919f, 0.0001594926579855382f, 7.650675252079964e-005f, 3.34794785885606e-005f, 
-            1.760426494001877e-005f, 4.022897701361217e-005f, 8.386484114453197e-005f, 0.0001594926579855382f, 0.0002767078403849155f, 0.0004379475140012801f, 0.0006323281559161842f, 0.0008328808471560478f, 0.001000790391117334f, 0.001097041997127235f, 0.001097041997127235f, 0.001000790391117334f, 0.0008328808471560478f, 0.0006323281559161842f, 0.0004379475140012801f, 0.0002767078403849155f, 0.0001594926579855382f, 8.386484114453197e-005f, 4.022897701361217e-005f, 1.760426494001877e-005f, 
-            8.444558261544444e-006f, 1.929736572492402e-005f, 4.022897701361217e-005f, 7.650675252079964e-005f, 0.0001327334903180599f, 0.0002100782585330308f, 0.0003033203829545528f, 0.0003995231236331165f, 0.0004800673632416874f, 0.0005262381164357066f, 0.0005262381164357066f, 0.0004800673632416874f, 0.0003995231236331165f, 0.0003033203829545528f, 0.0002100782585330308f, 0.0001327334903180599f, 7.650675252079964e-005f, 4.022897701361217e-005f, 1.929736572492402e-005f, 8.444558261544444e-006f, 
+            3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f,
+            8.444558261544444e-006f, 1.929736572492402e-005f, 4.022897701361217e-005f, 7.650675252079964e-005f, 0.0001327334903180599f, 0.0002100782585330308f, 0.0003033203829545528f, 0.0003995231236331165f, 0.0004800673632416874f, 0.0005262381164357066f, 0.0005262381164357066f, 0.0004800673632416874f, 0.0003995231236331165f, 0.0003033203829545528f, 0.0002100782585330308f, 0.0001327334903180599f, 7.650675252079964e-005f, 4.022897701361217e-005f, 1.929736572492402e-005f, 8.444558261544444e-006f,
+            1.760426494001877e-005f, 4.022897701361217e-005f, 8.386484114453197e-005f, 0.0001594926579855382f, 0.0002767078403849155f, 0.0004379475140012801f, 0.0006323281559161842f, 0.0008328808471560478f, 0.001000790391117334f, 0.001097041997127235f, 0.001097041997127235f, 0.001000790391117334f, 0.0008328808471560478f, 0.0006323281559161842f, 0.0004379475140012801f, 0.0002767078403849155f, 0.0001594926579855382f, 8.386484114453197e-005f, 4.022897701361217e-005f, 1.760426494001877e-005f,
+            3.34794785885606e-005f, 7.650675252079964e-005f, 0.0001594926579855382f, 0.0003033203247468919f, 0.0005262380582280457f, 0.0008328807889483869f, 0.001202550483867526f, 0.001583957928232849f, 0.001903285388834775f, 0.002086334861814976f, 0.002086334861814976f, 0.001903285388834775f, 0.001583957928232849f, 0.001202550483867526f, 0.0008328807889483869f, 0.0005262380582280457f, 0.0003033203247468919f, 0.0001594926579855382f, 7.650675252079964e-005f, 3.34794785885606e-005f,
+            5.808438800158911e-005f, 0.0001327334903180599f, 0.0002767078403849155f, 0.0005262380582280457f, 0.0009129836107604206f, 0.001444985857233405f, 0.002086335094645619f, 0.002748048631474376f, 0.00330205773934722f, 0.003619635012000799f, 0.003619635012000799f, 0.00330205773934722f, 0.002748048631474376f, 0.002086335094645619f, 0.001444985857233405f, 0.0009129836107604206f, 0.0005262380582280457f, 0.0002767078403849155f, 0.0001327334903180599f, 5.808438800158911e-005f,
+            9.193058212986216e-005f, 0.0002100782585330308f, 0.0004379475140012801f, 0.0008328807889483869f, 0.001444985857233405f, 0.002286989474669099f, 0.00330205773934722f, 0.004349356517195702f, 0.00522619066759944f, 0.005728822201490402f, 0.005728822201490402f, 0.00522619066759944f, 0.004349356517195702f, 0.00330205773934722f, 0.002286989474669099f, 0.001444985857233405f, 0.0008328807889483869f, 0.0004379475140012801f, 0.0002100782585330308f, 9.193058212986216e-005f,
+            0.0001327334757661447f, 0.0003033203829545528f, 0.0006323281559161842f, 0.001202550483867526f, 0.002086335094645619f, 0.00330205773934722f, 0.004767658654600382f, 0.006279794964939356f, 0.007545807864516974f, 0.008271530270576477f, 0.008271530270576477f, 0.007545807864516974f, 0.006279794964939356f, 0.004767658654600382f, 0.00330205773934722f, 0.002086335094645619f, 0.001202550483867526f, 0.0006323281559161842f, 0.0003033203829545528f, 0.0001327334757661447f,
+            0.0001748319627949968f, 0.0003995231236331165f, 0.0008328808471560478f, 0.001583957928232849f, 0.002748048631474376f, 0.004349356517195702f, 0.006279794964939356f, 0.008271529339253902f, 0.009939077310264111f, 0.01089497376233339f, 0.01089497376233339f, 0.009939077310264111f, 0.008271529339253902f, 0.006279794964939356f, 0.004349356517195702f, 0.002748048631474376f, 0.001583957928232849f, 0.0008328808471560478f, 0.0003995231236331165f, 0.0001748319627949968f,
+            0.0002100782439811155f, 0.0004800673632416874f, 0.001000790391117334f, 0.001903285388834775f, 0.00330205773934722f, 0.00522619066759944f, 0.007545807864516974f, 0.009939077310264111f, 0.01194280479103327f, 0.01309141051024199f, 0.01309141051024199f, 0.01194280479103327f, 0.009939077310264111f, 0.007545807864516974f, 0.00522619066759944f, 0.00330205773934722f, 0.001903285388834775f, 0.001000790391117334f, 0.0004800673632416874f, 0.0002100782439811155f,
+            0.0002302826324012131f, 0.0005262381164357066f, 0.001097041997127235f, 0.002086334861814976f, 0.003619635012000799f, 0.005728822201490402f, 0.008271530270576477f, 0.01089497376233339f, 0.01309141051024199f, 0.01435048412531614f, 0.01435048412531614f, 0.01309141051024199f, 0.01089497376233339f, 0.008271530270576477f, 0.005728822201490402f, 0.003619635012000799f, 0.002086334861814976f, 0.001097041997127235f, 0.0005262381164357066f, 0.0002302826324012131f,
+            0.0002302826324012131f, 0.0005262381164357066f, 0.001097041997127235f, 0.002086334861814976f, 0.003619635012000799f, 0.005728822201490402f, 0.008271530270576477f, 0.01089497376233339f, 0.01309141051024199f, 0.01435048412531614f, 0.01435048412531614f, 0.01309141051024199f, 0.01089497376233339f, 0.008271530270576477f, 0.005728822201490402f, 0.003619635012000799f, 0.002086334861814976f, 0.001097041997127235f, 0.0005262381164357066f, 0.0002302826324012131f,
+            0.0002100782439811155f, 0.0004800673632416874f, 0.001000790391117334f, 0.001903285388834775f, 0.00330205773934722f, 0.00522619066759944f, 0.007545807864516974f, 0.009939077310264111f, 0.01194280479103327f, 0.01309141051024199f, 0.01309141051024199f, 0.01194280479103327f, 0.009939077310264111f, 0.007545807864516974f, 0.00522619066759944f, 0.00330205773934722f, 0.001903285388834775f, 0.001000790391117334f, 0.0004800673632416874f, 0.0002100782439811155f,
+            0.0001748319627949968f, 0.0003995231236331165f, 0.0008328808471560478f, 0.001583957928232849f, 0.002748048631474376f, 0.004349356517195702f, 0.006279794964939356f, 0.008271529339253902f, 0.009939077310264111f, 0.01089497376233339f, 0.01089497376233339f, 0.009939077310264111f, 0.008271529339253902f, 0.006279794964939356f, 0.004349356517195702f, 0.002748048631474376f, 0.001583957928232849f, 0.0008328808471560478f, 0.0003995231236331165f, 0.0001748319627949968f,
+            0.0001327334757661447f, 0.0003033203829545528f, 0.0006323281559161842f, 0.001202550483867526f, 0.002086335094645619f, 0.00330205773934722f, 0.004767658654600382f, 0.006279794964939356f, 0.007545807864516974f, 0.008271530270576477f, 0.008271530270576477f, 0.007545807864516974f, 0.006279794964939356f, 0.004767658654600382f, 0.00330205773934722f, 0.002086335094645619f, 0.001202550483867526f, 0.0006323281559161842f, 0.0003033203829545528f, 0.0001327334757661447f,
+            9.193058212986216e-005f, 0.0002100782585330308f, 0.0004379475140012801f, 0.0008328807889483869f, 0.001444985857233405f, 0.002286989474669099f, 0.00330205773934722f, 0.004349356517195702f, 0.00522619066759944f, 0.005728822201490402f, 0.005728822201490402f, 0.00522619066759944f, 0.004349356517195702f, 0.00330205773934722f, 0.002286989474669099f, 0.001444985857233405f, 0.0008328807889483869f, 0.0004379475140012801f, 0.0002100782585330308f, 9.193058212986216e-005f,
+            5.808438800158911e-005f, 0.0001327334903180599f, 0.0002767078403849155f, 0.0005262380582280457f, 0.0009129836107604206f, 0.001444985857233405f, 0.002086335094645619f, 0.002748048631474376f, 0.00330205773934722f, 0.003619635012000799f, 0.003619635012000799f, 0.00330205773934722f, 0.002748048631474376f, 0.002086335094645619f, 0.001444985857233405f, 0.0009129836107604206f, 0.0005262380582280457f, 0.0002767078403849155f, 0.0001327334903180599f, 5.808438800158911e-005f,
+            3.34794785885606e-005f, 7.650675252079964e-005f, 0.0001594926579855382f, 0.0003033203247468919f, 0.0005262380582280457f, 0.0008328807889483869f, 0.001202550483867526f, 0.001583957928232849f, 0.001903285388834775f, 0.002086334861814976f, 0.002086334861814976f, 0.001903285388834775f, 0.001583957928232849f, 0.001202550483867526f, 0.0008328807889483869f, 0.0005262380582280457f, 0.0003033203247468919f, 0.0001594926579855382f, 7.650675252079964e-005f, 3.34794785885606e-005f,
+            1.760426494001877e-005f, 4.022897701361217e-005f, 8.386484114453197e-005f, 0.0001594926579855382f, 0.0002767078403849155f, 0.0004379475140012801f, 0.0006323281559161842f, 0.0008328808471560478f, 0.001000790391117334f, 0.001097041997127235f, 0.001097041997127235f, 0.001000790391117334f, 0.0008328808471560478f, 0.0006323281559161842f, 0.0004379475140012801f, 0.0002767078403849155f, 0.0001594926579855382f, 8.386484114453197e-005f, 4.022897701361217e-005f, 1.760426494001877e-005f,
+            8.444558261544444e-006f, 1.929736572492402e-005f, 4.022897701361217e-005f, 7.650675252079964e-005f, 0.0001327334903180599f, 0.0002100782585330308f, 0.0003033203829545528f, 0.0003995231236331165f, 0.0004800673632416874f, 0.0005262381164357066f, 0.0005262381164357066f, 0.0004800673632416874f, 0.0003995231236331165f, 0.0003033203829545528f, 0.0002100782585330308f, 0.0001327334903180599f, 7.650675252079964e-005f, 4.022897701361217e-005f, 1.929736572492402e-005f, 8.444558261544444e-006f,
            3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f
        };

@ -697,7 +681,7 @@ namespace cv { namespace gpu { namespace device
        {
            typedef uchar elem_type;

-            __device__ __forceinline__ WinReader(float centerX_, float centerY_, float win_offset_, float cos_dir_, float sin_dir_) : 
+            __device__ __forceinline__ WinReader(float centerX_, float centerY_, float win_offset_, float cos_dir_, float sin_dir_) :
                centerX(centerX_), centerY(centerY_), win_offset(win_offset_), cos_dir(cos_dir_), sin_dir(sin_dir_)
            {
            }
@ -710,14 +694,14 @@ namespace cv { namespace gpu { namespace device
                return tex2D(imgTex, pixel_x, pixel_y);
            }

-            float centerX; 
+            float centerX;
            float centerY;
-            float win_offset; 
-            float cos_dir; 
+            float win_offset;
+            float cos_dir;
            float sin_dir;
        };

-        __device__ void calc_dx_dy(float s_dx_bin[25], float s_dy_bin[25], 
+        __device__ void calc_dx_dy(float s_dx_bin[25], float s_dy_bin[25],
            const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
        {
            __shared__ float s_PATCH[6][6];
@ -739,7 +723,7 @@ namespace cv { namespace gpu { namespace device
            sincosf(descriptor_dir, &sin_dir, &cos_dir);

            /* Nearest neighbour version (faster) */
-            const float win_offset = -(float)(win_size - 1) / 2; 
+            const float win_offset = -(float)(win_size - 1) / 2;

            // Compute sampling points
            // since grids are 2D, need to compute xBlock and yBlock indices
@ -966,11 +950,11 @@ namespace cv { namespace gpu { namespace device
            descriptor_base[threadIdx.x] = lookup / len;
        }

-        void compute_descriptors_gpu(const DevMem2Df& descriptors, 
+        void compute_descriptors_gpu(const DevMem2Df& descriptors,
            const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures)
        {
            // compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D
-            
+
            if (descriptors.cols == 64)
            {
                compute_descriptors64<<<dim3(nFeatures, 16, 1), dim3(6, 6, 1)>>>(descriptors, featureX, featureY, featureSize, featureDir);
@ -985,12 +969,12 @@ namespace cv { namespace gpu { namespace device
            }
            else
            {
-                compute_descriptors128<<<dim3(nFeatures, 16, 1), dim3(6, 6, 1)>>>(descriptors, featureX, featureY, featureSize, featureDir);            
+                compute_descriptors128<<<dim3(nFeatures, 16, 1), dim3(6, 6, 1)>>>(descriptors, featureX, featureY, featureSize, featureDir);
                cudaSafeCall( cudaGetLastError() );

                cudaSafeCall( cudaDeviceSynchronize() );

-                normalize_descriptors<128><<<dim3(nFeatures, 1, 1), dim3(128, 1, 1)>>>(descriptors);            
+                normalize_descriptors<128><<<dim3(nFeatures, 1, 1), dim3(128, 1, 1)>>>(descriptors);
                cudaSafeCall( cudaGetLastError() );

                cudaSafeCall( cudaDeviceSynchronize() );
--- a/modules/gpu/src/fast.cpp
+++ b/modules/gpu/src/fast.cpp
@ -59,7 +59,7 @@ int cv::gpu::FAST_GPU::getKeyPoints(GpuMat&) { throw_nogpu(); return 0; }

 #else /* !defined (HAVE_CUDA) */

-cv::gpu::FAST_GPU::FAST_GPU(int _threshold, bool _nonmaxSupression, double _keypointsRatio) : 
+cv::gpu::FAST_GPU::FAST_GPU(int _threshold, bool _nonmaxSupression, double _keypointsRatio) :
    nonmaxSupression(_nonmaxSupression), threshold(_threshold), keypointsRatio(_keypointsRatio), count_(0)
 {
 }
@ -109,9 +109,9 @@ void cv::gpu::FAST_GPU::operator ()(const GpuMat& img, const GpuMat& mask, GpuMa
    keypoints.cols = getKeyPoints(keypoints);
 }

-namespace cv { namespace gpu { namespace device 
+namespace cv { namespace gpu { namespace device
 {
-    namespace fast 
+    namespace fast
    {
        int calcKeypoints_gpu(DevMem2Db img, DevMem2Db mask, short2* kpLoc, int maxKeypoints, DevMem2Di score, int threshold);
        int nonmaxSupression_gpu(const short2* kpLoc, int count, DevMem2Di score, short2* loc, float* response);
@ -124,7 +124,9 @@ int cv::gpu::FAST_GPU::calcKeyPointsLocation(const GpuMat& img, const GpuMat& ma

    CV_Assert(img.type() == CV_8UC1);
    CV_Assert(mask.empty() || (mask.type() == CV_8UC1 && mask.size() == img.size()));
-    CV_Assert(TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS));
+
+    if (!TargetArchs::builtWith(GLOBAL_ATOMICS) || !DeviceInfo().supports(GLOBAL_ATOMICS))
+        CV_Error(CV_StsNotImplemented, "The device doesn't support global atomics");

    int maxKeypoints = static_cast<int>(keypointsRatio * img.size().area());

@ -146,7 +148,8 @@ int cv::gpu::FAST_GPU::getKeyPoints(GpuMat& keypoints)
 {
    using namespace cv::gpu::device::fast;

-    CV_Assert(TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS));
+    if (!TargetArchs::builtWith(GLOBAL_ATOMICS) || !DeviceInfo().supports(GLOBAL_ATOMICS))
+        CV_Error(CV_StsNotImplemented, "The device doesn't support global atomics");

    if (count_ == 0)
        return 0;
@ -160,7 +163,7 @@ int cv::gpu::FAST_GPU::getKeyPoints(GpuMat& keypoints)
    kpLoc_.colRange(0, count_).copyTo(locRow);
    keypoints.row(1).setTo(Scalar::all(0));

-    return count_;    
+    return count_;
 }

 void cv::gpu::FAST_GPU::release()
--- a/modules/gpu/src/surf.cpp
+++ b/modules/gpu/src/surf.cpp
@ -120,7 +120,9 @@ namespace
            CV_Assert(!img.empty() && img.type() == CV_8UC1);
            CV_Assert(mask.empty() || (mask.size() == img.size() && mask.type() == CV_8UC1));
            CV_Assert(surf_.nOctaves > 0 && surf_.nOctaveLayers > 0);
-            CV_Assert(TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS));
+
+            if (!TargetArchs::builtWith(GLOBAL_ATOMICS) || !DeviceInfo().supports(GLOBAL_ATOMICS))
+                CV_Error(CV_StsNotImplemented, "The device doesn't support global atomics");

            const int min_size = calcSize(surf_.nOctaves - 1, 0);
            CV_Assert(img_rows - min_size >= 0);
@ -184,8 +186,8 @@ namespace
                {
                    icvInterpolateKeypoint_gpu(surf_.det, surf_.maxPosBuffer.ptr<int4>(), maxCounter,
                        keypoints.ptr<float>(SURF_GPU::X_ROW), keypoints.ptr<float>(SURF_GPU::Y_ROW),
-                        keypoints.ptr<int>(SURF_GPU::LAPLACIAN_ROW), keypoints.ptr<int>(SURF_GPU::OCTAVE_ROW), 
-                        keypoints.ptr<float>(SURF_GPU::SIZE_ROW), keypoints.ptr<float>(SURF_GPU::HESSIAN_ROW), 
+                        keypoints.ptr<int>(SURF_GPU::LAPLACIAN_ROW), keypoints.ptr<int>(SURF_GPU::OCTAVE_ROW),
+                        keypoints.ptr<float>(SURF_GPU::SIZE_ROW), keypoints.ptr<float>(SURF_GPU::HESSIAN_ROW),
                        counters.ptr<unsigned int>());
                }
            }
@ -306,7 +308,7 @@ void cv::gpu::SURF_GPU::downloadKeypoints(const GpuMat& keypointsGPU, vector<Key
        Mat keypointsCPU(keypointsGPU);

        keypoints.resize(nFeatures);
-        
+
        float* kp_x = keypointsCPU.ptr<float>(SURF_GPU::X_ROW);
        float* kp_y = keypointsCPU.ptr<float>(SURF_GPU::Y_ROW);
        int* kp_laplacian = keypointsCPU.ptr<int>(SURF_GPU::LAPLACIAN_ROW);
--- a/modules/gpu/test/test_features2d.cpp
+++ b/modules/gpu/test/test_features2d.cpp
@ -108,6 +108,25 @@ testing::AssertionResult assertKeyPointsEquals(const char* gold_expr, const char

 #define ASSERT_KEYPOINTS_EQ(gold, actual) EXPECT_PRED_FORMAT2(assertKeyPointsEquals, gold, actual);

+// Counts keypoints that coincide position-by-position once both lists are sorted with KeyPointLess;
+// equality is decided by keyPointsEquals. Both vectors are sorted in place (caller's order is lost).
+int getMatchedPointsCount(std::vector<cv::KeyPoint>& gold, std::vector<cv::KeyPoint>& actual)
+{
+    std::sort(actual.begin(), actual.end(), KeyPointLess());
+    std::sort(gold.begin(), gold.end(), KeyPointLess());
+
+    int validCount = 0;
+
+    // Stop at the shorter list: actual[i] must never be read past its end when the sizes differ.
+    for (size_t i = 0; i < gold.size() && i < actual.size(); ++i)
+    {
+        if (keyPointsEquals(gold[i], actual[i]))
+            ++validCount;
+    }
+
+    return validCount;
+}
+
 int getMatchedPointsCount(const std::vector<cv::KeyPoint>& keypoints1, const std::vector<cv::KeyPoint>& keypoints2, const std::vector<cv::DMatch>& matches)
 {
    int validCount = 0;
@ -170,20 +189,39 @@ TEST_P(SURF, Detector)
    surf.upright = upright;
    surf.keypointsRatio = 0.05f;

-    std::vector<cv::KeyPoint> keypoints;
-    surf(loadMat(image), cv::gpu::GpuMat(), keypoints);
+    if (!supportFeature(devInfo, cv::gpu::GLOBAL_ATOMICS))
+    {
+        try
+        {
+            std::vector<cv::KeyPoint> keypoints;
+            surf(loadMat(image), cv::gpu::GpuMat(), keypoints);
+        }
+        catch (const cv::Exception& e)
+        {
+            ASSERT_EQ(CV_StsNotImplemented, e.code);
+        }
+    }
+    else
+    {
+        std::vector<cv::KeyPoint> keypoints;
+        surf(loadMat(image), cv::gpu::GpuMat(), keypoints);

-    cv::SURF surf_gold;
-    surf_gold.hessianThreshold = hessianThreshold;
-    surf_gold.nOctaves = nOctaves;
-    surf_gold.nOctaveLayers = nOctaveLayers;
-    surf_gold.extended = extended;
-    surf_gold.upright = upright;
+        cv::SURF surf_gold;
+        surf_gold.hessianThreshold = hessianThreshold;
+        surf_gold.nOctaves = nOctaves;
+        surf_gold.nOctaveLayers = nOctaveLayers;
+        surf_gold.extended = extended;
+        surf_gold.upright = upright;

-    std::vector<cv::KeyPoint> keypoints_gold;
-    surf_gold(image, cv::noArray(), keypoints_gold);
+        std::vector<cv::KeyPoint> keypoints_gold;
+        surf_gold(image, cv::noArray(), keypoints_gold);

-    ASSERT_KEYPOINTS_EQ(keypoints_gold, keypoints);
+        ASSERT_EQ(keypoints_gold.size(), keypoints.size());
+        int matchedCount = getMatchedPointsCount(keypoints_gold, keypoints);
+        double matchedRatio = static_cast<double>(matchedCount) / keypoints_gold.size();
+
+        EXPECT_GT(matchedRatio, 0.95);
+    }
 }

 TEST_P(SURF, Detector_Masked)
@ -202,20 +240,39 @@ TEST_P(SURF, Detector_Masked)
    surf.upright = upright;
    surf.keypointsRatio = 0.05f;

-    std::vector<cv::KeyPoint> keypoints;
-    surf(loadMat(image), loadMat(mask), keypoints);
+    if (!supportFeature(devInfo, cv::gpu::GLOBAL_ATOMICS))
+    {
+        try
+        {
+            std::vector<cv::KeyPoint> keypoints;
+            surf(loadMat(image), loadMat(mask), keypoints);
+        }
+        catch (const cv::Exception& e)
+        {
+            ASSERT_EQ(CV_StsNotImplemented, e.code);
+        }
+    }
+    else
+    {
+        std::vector<cv::KeyPoint> keypoints;
+        surf(loadMat(image), loadMat(mask), keypoints);

-    cv::SURF surf_gold;
-    surf_gold.hessianThreshold = hessianThreshold;
-    surf_gold.nOctaves = nOctaves;
-    surf_gold.nOctaveLayers = nOctaveLayers;
-    surf_gold.extended = extended;
-    surf_gold.upright = upright;
+        cv::SURF surf_gold;
+        surf_gold.hessianThreshold = hessianThreshold;
+        surf_gold.nOctaves = nOctaves;
+        surf_gold.nOctaveLayers = nOctaveLayers;
+        surf_gold.extended = extended;
+        surf_gold.upright = upright;

-    std::vector<cv::KeyPoint> keypoints_gold;
-    surf_gold(image, mask, keypoints_gold);
+        std::vector<cv::KeyPoint> keypoints_gold;
+        surf_gold(image, mask, keypoints_gold);

-    ASSERT_KEYPOINTS_EQ(keypoints_gold, keypoints);
+        ASSERT_EQ(keypoints_gold.size(), keypoints.size());
+        int matchedCount = getMatchedPointsCount(keypoints_gold, keypoints);
+        double matchedRatio = static_cast<double>(matchedCount) / keypoints_gold.size();
+
+        EXPECT_GT(matchedRatio, 0.95);
+    }
 }

 TEST_P(SURF, Descriptor)
@ -238,23 +295,39 @@ TEST_P(SURF, Descriptor)
    surf_gold.extended = extended;
    surf_gold.upright = upright;

-    std::vector<cv::KeyPoint> keypoints;
-    surf_gold(image, cv::noArray(), keypoints);
+    if (!supportFeature(devInfo, cv::gpu::GLOBAL_ATOMICS))
+    {
+        try
+        {
+            std::vector<cv::KeyPoint> keypoints;
+            cv::gpu::GpuMat descriptors;
+            surf(loadMat(image), cv::gpu::GpuMat(), keypoints, descriptors);
+        }
+        catch (const cv::Exception& e)
+        {
+            ASSERT_EQ(CV_StsNotImplemented, e.code);
+        }
+    }
+    else
+    {
+        std::vector<cv::KeyPoint> keypoints;
+        surf_gold(image, cv::noArray(), keypoints);

-    cv::gpu::GpuMat descriptors;
-    surf(loadMat(image), cv::gpu::GpuMat(), keypoints, descriptors, true);
+        cv::gpu::GpuMat descriptors;
+        surf(loadMat(image), cv::gpu::GpuMat(), keypoints, descriptors, true);

-    cv::Mat descriptors_gold;
-    surf_gold(image, cv::noArray(), keypoints, descriptors_gold, true);
+        cv::Mat descriptors_gold;
+        surf_gold(image, cv::noArray(), keypoints, descriptors_gold, true);

-    cv::BFMatcher matcher(cv::NORM_L2);
-    std::vector<cv::DMatch> matches;
-    matcher.match(descriptors_gold, cv::Mat(descriptors), matches);
+        cv::BFMatcher matcher(cv::NORM_L2);
+        std::vector<cv::DMatch> matches;
+        matcher.match(descriptors_gold, cv::Mat(descriptors), matches);

-    int matchedCount = getMatchedPointsCount(keypoints, keypoints, matches);
-    double matchedRatio = static_cast<double>(matchedCount) / keypoints.size();
+        int matchedCount = getMatchedPointsCount(keypoints, keypoints, matches);
+        double matchedRatio = static_cast<double>(matchedCount) / keypoints.size();

-    EXPECT_GT(matchedRatio, 0.35);
+        EXPECT_GT(matchedRatio, 0.35);
+    }
 }

 INSTANTIATE_TEST_CASE_P(GPU_Features2D, SURF, testing::Combine(
@ -295,13 +368,28 @@ TEST_P(FAST, Accuracy)
    cv::gpu::FAST_GPU fast(threshold);
    fast.nonmaxSupression = nonmaxSupression;

-    std::vector<cv::KeyPoint> keypoints;
-    fast(loadMat(image), cv::gpu::GpuMat(), keypoints);
+    if (!supportFeature(devInfo, cv::gpu::GLOBAL_ATOMICS))
+    {
+        try
+        {
+            std::vector<cv::KeyPoint> keypoints;
+            fast(loadMat(image), cv::gpu::GpuMat(), keypoints);
+        }
+        catch (const cv::Exception& e)
+        {
+            ASSERT_EQ(CV_StsNotImplemented, e.code);
+        }
+    }
+    else
+    {
+        std::vector<cv::KeyPoint> keypoints;
+        fast(loadMat(image), cv::gpu::GpuMat(), keypoints);

-    std::vector<cv::KeyPoint> keypoints_gold;
-    cv::FAST(image, keypoints_gold, threshold, nonmaxSupression);
+        std::vector<cv::KeyPoint> keypoints_gold;
+        cv::FAST(image, keypoints_gold, threshold, nonmaxSupression);

-    ASSERT_KEYPOINTS_EQ(keypoints_gold, keypoints);
+        ASSERT_KEYPOINTS_EQ(keypoints_gold, keypoints);
+    }
 }

 INSTANTIATE_TEST_CASE_P(GPU_Features2D, FAST, testing::Combine(
@ -364,24 +452,40 @@ TEST_P(ORB, Accuracy)
    cv::gpu::ORB_GPU orb(nFeatures, scaleFactor, nLevels, edgeThreshold, firstLevel, WTA_K, scoreType, patchSize);
    orb.blurForDescriptor = blurForDescriptor;

-    std::vector<cv::KeyPoint> keypoints;
-    cv::gpu::GpuMat descriptors;
-    orb(loadMat(image), loadMat(mask), keypoints, descriptors);
+    if (!supportFeature(devInfo, cv::gpu::GLOBAL_ATOMICS))
+    {
+        try
+        {
+            std::vector<cv::KeyPoint> keypoints;
+            cv::gpu::GpuMat descriptors;
+            orb(loadMat(image), loadMat(mask), keypoints, descriptors);
+        }
+        catch (const cv::Exception& e)
+        {
+            ASSERT_EQ(CV_StsNotImplemented, e.code);
+        }
+    }
+    else
+    {
+        std::vector<cv::KeyPoint> keypoints;
+        cv::gpu::GpuMat descriptors;
+        orb(loadMat(image), loadMat(mask), keypoints, descriptors);

-    cv::ORB orb_gold(nFeatures, scaleFactor, nLevels, edgeThreshold, firstLevel, WTA_K, scoreType, patchSize);
+        cv::ORB orb_gold(nFeatures, scaleFactor, nLevels, edgeThreshold, firstLevel, WTA_K, scoreType, patchSize);

-    std::vector<cv::KeyPoint> keypoints_gold;
-    cv::Mat descriptors_gold;
-    orb_gold(image, mask, keypoints_gold, descriptors_gold);
+        std::vector<cv::KeyPoint> keypoints_gold;
+        cv::Mat descriptors_gold;
+        orb_gold(image, mask, keypoints_gold, descriptors_gold);

-    cv::BFMatcher matcher(cv::NORM_HAMMING);
-    std::vector<cv::DMatch> matches;
-    matcher.match(descriptors_gold, cv::Mat(descriptors), matches);
+        cv::BFMatcher matcher(cv::NORM_HAMMING);
+        std::vector<cv::DMatch> matches;
+        matcher.match(descriptors_gold, cv::Mat(descriptors), matches);

-    int matchedCount = getMatchedPointsCount(keypoints_gold, keypoints, matches);
-    double matchedRatio = static_cast<double>(matchedCount) / keypoints.size();
+        int matchedCount = getMatchedPointsCount(keypoints_gold, keypoints, matches);
+        double matchedRatio = static_cast<double>(matchedCount) / keypoints.size();

-    EXPECT_GT(matchedRatio, 0.35);
+        EXPECT_GT(matchedRatio, 0.35);
+    }
 }

 INSTANTIATE_TEST_CASE_P(GPU_Features2D, ORB,  testing::Combine(
@ -713,25 +817,40 @@ TEST_P(BruteForceMatcher, RadiusMatch)

    cv::gpu::BruteForceMatcher_GPU_base matcher(distType);

-    std::vector< std::vector<cv::DMatch> > matches;
-    matcher.radiusMatch(loadMat(query), loadMat(train), matches, radius);
-
-    ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());
-
-    int badCount = 0;
-    for (size_t i = 0; i < matches.size(); i++)
+    if (!supportFeature(devInfo, cv::gpu::GLOBAL_ATOMICS))
    {
-        if ((int)matches[i].size() != 1)
-            badCount++;
-        else
+        try
        {
-            cv::DMatch match = matches[i][0];
-            if ((match.queryIdx != (int)i) || (match.trainIdx != (int)i*countFactor) || (match.imgIdx != 0))
-                badCount++;
+            std::vector< std::vector<cv::DMatch> > matches;
+            matcher.radiusMatch(loadMat(query), loadMat(train), matches, radius);
+        }
+        catch (const cv::Exception& e)
+        {
+            ASSERT_EQ(CV_StsNotImplemented, e.code);
        }
    }
+    else
+    {
+        std::vector< std::vector<cv::DMatch> > matches;
+        matcher.radiusMatch(loadMat(query), loadMat(train), matches, radius);

-    ASSERT_EQ(0, badCount);
+        ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());
+
+        int badCount = 0;
+        for (size_t i = 0; i < matches.size(); i++)
+        {
+            if ((int)matches[i].size() != 1)
+                badCount++;
+            else
+            {
+                cv::DMatch match = matches[i][0];
+                if ((match.queryIdx != (int)i) || (match.trainIdx != (int)i*countFactor) || (match.imgIdx != 0))
+                    badCount++;
+            }
+        }
+
+        ASSERT_EQ(0, badCount);
+    }
 }

 TEST_P(BruteForceMatcher, RadiusMatchAdd)
@ -756,42 +875,57 @@ TEST_P(BruteForceMatcher, RadiusMatchAdd)
            masks[mi].col(di * countFactor).setTo(cv::Scalar::all(0));
    }

-    std::vector< std::vector<cv::DMatch> > matches;
-    matcher.radiusMatch(cv::gpu::GpuMat(query), matches, radius, masks);
-
-    ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());
-
-    int badCount = 0;
-    int shift = matcher.isMaskSupported() ? 1 : 0;
-    int needMatchCount = matcher.isMaskSupported() ? n-1 : n;
-    for (size_t i = 0; i < matches.size(); i++)
+    if (!supportFeature(devInfo, cv::gpu::GLOBAL_ATOMICS))
    {
-        if ((int)matches[i].size() != needMatchCount)
-            badCount++;
-        else
+        try
        {
-            int localBadCount = 0;
-            for (int k = 0; k < needMatchCount; k++)
-            {
-                cv::DMatch match = matches[i][k];
-                {
-                    if ((int)i < queryDescCount / 2)
-                    {
-                        if ((match.queryIdx != (int)i) || (match.trainIdx != (int)i * countFactor + k + shift) || (match.imgIdx != 0) )
-                            localBadCount++;
-                    }
-                    else
-                    {
-                        if ((match.queryIdx != (int)i) || (match.trainIdx != ((int)i - queryDescCount / 2) * countFactor + k + shift) || (match.imgIdx != 1) )
-                            localBadCount++;
-                    }
-                }
-            }
-            badCount += localBadCount > 0 ? 1 : 0;
+            std::vector< std::vector<cv::DMatch> > matches;
+            matcher.radiusMatch(cv::gpu::GpuMat(query), matches, radius, masks);
+        }
+        catch (const cv::Exception& e)
+        {
+            ASSERT_EQ(CV_StsNotImplemented, e.code);
        }
    }
+    else
+    {
+        std::vector< std::vector<cv::DMatch> > matches;
+        matcher.radiusMatch(cv::gpu::GpuMat(query), matches, radius, masks);

-    ASSERT_EQ(0, badCount);
+        ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());
+
+        int badCount = 0;
+        int shift = matcher.isMaskSupported() ? 1 : 0;
+        int needMatchCount = matcher.isMaskSupported() ? n-1 : n;
+        for (size_t i = 0; i < matches.size(); i++)
+        {
+            if ((int)matches[i].size() != needMatchCount)
+                badCount++;
+            else
+            {
+                int localBadCount = 0;
+                for (int k = 0; k < needMatchCount; k++)
+                {
+                    cv::DMatch match = matches[i][k];
+                    {
+                        if ((int)i < queryDescCount / 2)
+                        {
+                            if ((match.queryIdx != (int)i) || (match.trainIdx != (int)i * countFactor + k + shift) || (match.imgIdx != 0) )
+                                localBadCount++;
+                        }
+                        else
+                        {
+                            if ((match.queryIdx != (int)i) || (match.trainIdx != ((int)i - queryDescCount / 2) * countFactor + k + shift) || (match.imgIdx != 1) )
+                                localBadCount++;
+                        }
+                    }
+                }
+                badCount += localBadCount > 0 ? 1 : 0;
+            }
+        }
+
+        ASSERT_EQ(0, badCount);
+    }
 }

 INSTANTIATE_TEST_CASE_P(GPU_Features2D, BruteForceMatcher, testing::Combine(
--- a/modules/gpu/test/test_filters.cpp
+++ b/modules/gpu/test/test_filters.cpp
@ -258,13 +258,28 @@ TEST_P(GaussianBlur, Accuracy)
    double sigma1 = randomDouble(0.1, 1.0);
    double sigma2 = randomDouble(0.1, 1.0);

-    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-    cv::gpu::GaussianBlur(loadMat(src, useRoi), dst, ksize, sigma1, sigma2, borderType);
+    if (ksize.height > 16 && !supportFeature(devInfo, cv::gpu::FEATURE_SET_COMPUTE_20))
+    {
+        try
+        {
+            cv::gpu::GpuMat dst;
+            cv::gpu::GaussianBlur(loadMat(src), dst, ksize, sigma1, sigma2, borderType);
+        }
+        catch (const cv::Exception& e)
+        {
+            ASSERT_EQ(CV_StsNotImplemented, e.code);
+        }
+    }
+    else
+    {
+        cv::gpu::GpuMat dst = createMat(size, type, useRoi);
+        cv::gpu::GaussianBlur(loadMat(src, useRoi), dst, ksize, sigma1, sigma2, borderType);

-    cv::Mat dst_gold;
-    cv::GaussianBlur(src, dst_gold, ksize, sigma1, sigma2, borderType);
+        cv::Mat dst_gold;
+        cv::GaussianBlur(src, dst_gold, ksize, sigma1, sigma2, borderType);

-    EXPECT_MAT_NEAR(dst_gold, dst, 4.0);
+        EXPECT_MAT_NEAR(dst_gold, dst, 4.0);
+    }
 }

 INSTANTIATE_TEST_CASE_P(GPU_Filter, GaussianBlur, testing::Combine(