added assertion on device features (global atomics) into gpu tests

2012-03-27 07:33:39 +00:00 · 2012-03-27 07:33:39 +00:00 · bd13e9479b
commit bd13e9479b
parent 4a996111ea
6 changed files with 498 additions and 358 deletions
--- a/modules/gpu/src/brute_force_matcher.cpp
+++ b/modules/gpu/src/brute_force_matcher.cpp
@ -761,7 +761,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchSingle(const GpuMat& query,
    if (query.empty() || train.empty())
        return;

-    using namespace ::cv::gpu::device::bf_radius_match;
+    using namespace cv::gpu::device::bf_radius_match;

    typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
                             const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
@ -789,7 +789,8 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchSingle(const GpuMat& query,
    DeviceInfo info;
    int cc = info.majorVersion() * 10 + info.minorVersion();

-    CV_Assert(TargetArchs::builtWith(GLOBAL_ATOMICS) && info.supports(GLOBAL_ATOMICS));
+    if (!TargetArchs::builtWith(GLOBAL_ATOMICS) || !DeviceInfo().supports(GLOBAL_ATOMICS))
+        CV_Error(CV_StsNotImplemented, "The device doesn't support global atomics");

    const int nQuery = query.rows;
    const int nTrain = train.rows;
@ -892,7 +893,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat& qu
    if (query.empty() || empty())
        return;

-    using namespace ::cv::gpu::device::bf_radius_match;
+    using namespace cv::gpu::device::bf_radius_match;

    typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
                             const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
@ -920,7 +921,8 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat& qu
    DeviceInfo info;
    int cc = info.majorVersion() * 10 + info.minorVersion();

-    CV_Assert(TargetArchs::builtWith(GLOBAL_ATOMICS) && info.supports(GLOBAL_ATOMICS));
+    if (!TargetArchs::builtWith(GLOBAL_ATOMICS) || !DeviceInfo().supports(GLOBAL_ATOMICS))
+        CV_Error(CV_StsNotImplemented, "The device doesn't support global atomics");

    const int nQuery = query.rows;

--- a/modules/gpu/src/cuda/surf.cu
+++ b/modules/gpu/src/cuda/surf.cu
@ -509,7 +509,8 @@ namespace cv { namespace gpu { namespace device
            __shared__ float s_Y[128];
            __shared__ float s_angle[128];

-            __shared__ float s_sum[32 * 4];
+            __shared__ float s_sumx[32 * 4];
+            __shared__ float s_sumy[32 * 4];

            /* The sampling intervals and wavelet sized for selecting an orientation
             and building the keypoint descriptor are defined relative to 's' */
@ -522,126 +523,109 @@ namespace cv { namespace gpu { namespace device
            const int grad_wav_size = 2 * __float2int_rn(2.0f * s);

            // check when grad_wav_size is too big
-            if ((c_img_rows + 1) >= grad_wav_size && (c_img_cols + 1) >= grad_wav_size)
+            if ((c_img_rows + 1) < grad_wav_size || (c_img_cols + 1) < grad_wav_size)
+                return;
+
+            // Calc X, Y, angle and store it to shared memory
+            const int tid = threadIdx.y * blockDim.x + threadIdx.x;
+
+            float X = 0.0f, Y = 0.0f, angle = 0.0f;
+
+            if (tid < ORI_SAMPLES)
            {
-                // Calc X, Y, angle and store it to shared memory
-                const int tid = threadIdx.y * blockDim.x + threadIdx.x;
+                const float margin = (float)(grad_wav_size - 1) / 2.0f;
+                const int x = __float2int_rn(featureX[blockIdx.x] + c_aptX[tid] * s - margin);
+                const int y = __float2int_rn(featureY[blockIdx.x] + c_aptY[tid] * s - margin);

-                float X = 0.0f, Y = 0.0f, angle = 0.0f;
-
-                if (tid < ORI_SAMPLES)
+                if (y >= 0 && y < (c_img_rows + 1) - grad_wav_size &&
+                    x >= 0 && x < (c_img_cols + 1) - grad_wav_size)
                {
-                    const float margin = (float)(grad_wav_size - 1) / 2.0f;
-                    const int x = __float2int_rn(featureX[blockIdx.x] + c_aptX[tid] * s - margin);
-                    const int y = __float2int_rn(featureY[blockIdx.x] + c_aptY[tid] * s - margin);
+                    X = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NX, 4, grad_wav_size, y, x);
+                    Y = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NY, 4, grad_wav_size, y, x);

-                    if ((unsigned)y < (unsigned)((c_img_rows + 1) - grad_wav_size) && (unsigned)x < (unsigned)((c_img_cols + 1) - grad_wav_size))
-                    {
-                        X = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NX, 4, grad_wav_size, y, x);
-                        Y = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NY, 4, grad_wav_size, y, x);
-                    
-                        angle = atan2f(Y, X);
-                        if (angle < 0)
-                            angle += 2.0f * CV_PI_F;
-                        angle *= 180.0f / CV_PI_F;
-                    }
+                    angle = atan2f(Y, X);
+                    if (angle < 0)
+                        angle += 2.0f * CV_PI_F;
+                    angle *= 180.0f / CV_PI_F;
                }
-                s_X[tid] = X;
-                s_Y[tid] = Y;
-                s_angle[tid] = angle;
+            }
+            s_X[tid] = X;
+            s_Y[tid] = Y;
+            s_angle[tid] = angle;
+            __syncthreads();
+
+            float bestx = 0, besty = 0, best_mod = 0;
+
+            #pragma unroll
+            for (int i = 0; i < 18; ++i)
+            {
+                const int dir = (i * 4 + threadIdx.y) * ORI_SEARCH_INC;
+
+                float sumx = 0.0f, sumy = 0.0f;
+                int d = ::abs(__float2int_rn(s_angle[threadIdx.x]) - dir);
+                if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
+                {
+                    sumx = s_X[threadIdx.x];
+                    sumy = s_Y[threadIdx.x];
+                }
+                d = ::abs(__float2int_rn(s_angle[threadIdx.x + 32]) - dir);
+                if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
+                {
+                    sumx += s_X[threadIdx.x + 32];
+                    sumy += s_Y[threadIdx.x + 32];
+                }
+                d = ::abs(__float2int_rn(s_angle[threadIdx.x + 64]) - dir);
+                if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
+                {
+                    sumx += s_X[threadIdx.x + 64];
+                    sumy += s_Y[threadIdx.x + 64];
+                }
+                d = ::abs(__float2int_rn(s_angle[threadIdx.x + 96]) - dir);
+                if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
+                {
+                    sumx += s_X[threadIdx.x + 96];
+                    sumy += s_Y[threadIdx.x + 96];
+                }
+
+                device::reduce<32>(s_sumx + threadIdx.y * 32, sumx, threadIdx.x, plus<volatile float>());
+                device::reduce<32>(s_sumy + threadIdx.y * 32, sumy, threadIdx.x, plus<volatile float>());
+
+                const float temp_mod = sumx * sumx + sumy * sumy;
+                if (temp_mod > best_mod)
+                {
+                    best_mod = temp_mod;
+                    bestx = sumx;
+                    besty = sumy;
+                }
+
                __syncthreads();
+            }

-                float bestx = 0, besty = 0, best_mod = 0;
+            if (threadIdx.x == 0)
+            {
+                s_X[threadIdx.y] = bestx;
+                s_Y[threadIdx.y] = besty;
+                s_angle[threadIdx.y] = best_mod;
+            }
+            __syncthreads();

-                #pragma unroll
-                for (int i = 0; i < 18; ++i)
-                {
-                    const int dir = (i * 4 + threadIdx.y) * ORI_SEARCH_INC;
+            if (threadIdx.x == 0 && threadIdx.y == 0)
+            {
+                int bestIdx = 0;

-                    float sumx = 0.0f, sumy = 0.0f;
-                    int d = ::abs(__float2int_rn(s_angle[threadIdx.x]) - dir);
-                    if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
-                    {
-                        sumx = s_X[threadIdx.x];
-                        sumy = s_Y[threadIdx.x];
-                    }
-                    d = ::abs(__float2int_rn(s_angle[threadIdx.x + 32]) - dir);
-                    if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
-                    {
-                        sumx += s_X[threadIdx.x + 32];
-                        sumy += s_Y[threadIdx.x + 32];
-                    }
-                    d = ::abs(__float2int_rn(s_angle[threadIdx.x + 64]) - dir);
-                    if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
-                    {
-                        sumx += s_X[threadIdx.x + 64];
-                        sumy += s_Y[threadIdx.x + 64];
-                    }
-                    d = ::abs(__float2int_rn(s_angle[threadIdx.x + 96]) - dir);
-                    if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
-                    {
-                        sumx += s_X[threadIdx.x + 96];
-                        sumy += s_Y[threadIdx.x + 96];
-                    }
+                if (s_angle[1] > s_angle[bestIdx])
+                    bestIdx = 1;
+                if (s_angle[2] > s_angle[bestIdx])
+                    bestIdx = 2;
+                if (s_angle[3] > s_angle[bestIdx])
+                    bestIdx = 3;

-                    float* s_sum_row = s_sum + threadIdx.y * 32;
+                float kp_dir = atan2f(s_Y[bestIdx], s_X[bestIdx]);
+                if (kp_dir < 0)
+                    kp_dir += 2.0f * CV_PI_F;
+                kp_dir *= 180.0f / CV_PI_F;

-                    device::reduce<32>(s_sum_row, sumx, threadIdx.x, plus<volatile float>());
-                    device::reduce<32>(s_sum_row, sumy, threadIdx.x, plus<volatile float>());
-
-                    const float temp_mod = sumx * sumx + sumy * sumy;
-                    if (temp_mod > best_mod)
-                    {
-                        best_mod = temp_mod;
-                        bestx = sumx;
-                        besty = sumy;
-                    }
-
-                    __syncthreads();
-                }
-
-                if (threadIdx.x == 0)
-                {
-                    s_X[threadIdx.y] = bestx;
-                    s_Y[threadIdx.y] = besty;
-                    s_angle[threadIdx.y] = best_mod;
-                }
-                __syncthreads();
-
-                if (threadIdx.x < 2 && threadIdx.y == 0)
-                {
-                    volatile float* v_x = s_X;
-                    volatile float* v_y = s_Y;
-                    volatile float* v_mod = s_angle;
-
-                    bestx = v_x[threadIdx.x];
-                    besty = v_y[threadIdx.x];
-                    best_mod = v_mod[threadIdx.x];
-
-                    float temp_mod = v_mod[threadIdx.x + 2];
-                    if (temp_mod > best_mod)
-                    {
-                        v_x[threadIdx.x] = bestx = v_x[threadIdx.x + 2];
-                        v_y[threadIdx.x] = besty = v_y[threadIdx.x + 2];
-                        v_mod[threadIdx.x] = best_mod = temp_mod;
-                    }
-                    temp_mod = v_mod[threadIdx.x + 1];
-                    if (temp_mod > best_mod)
-                    {
-                        v_x[threadIdx.x] = bestx = v_x[threadIdx.x + 1];
-                        v_y[threadIdx.x] = besty = v_y[threadIdx.x + 1];
-                    }
-                }
-
-                if (threadIdx.x == 0 && threadIdx.y == 0 && best_mod != 0)
-                {
-                    float kp_dir = atan2f(besty, bestx);
-                    if (kp_dir < 0)
-                        kp_dir += 2.0f * CV_PI_F;
-                    kp_dir *= 180.0f / CV_PI_F;
-
-                    featureDir[blockIdx.x] = kp_dir;
-                }
+                featureDir[blockIdx.x] = kp_dir;
            }
        }

--- a/modules/gpu/src/fast.cpp
+++ b/modules/gpu/src/fast.cpp
@ -124,7 +124,9 @@ int cv::gpu::FAST_GPU::calcKeyPointsLocation(const GpuMat& img, const GpuMat& ma

    CV_Assert(img.type() == CV_8UC1);
    CV_Assert(mask.empty() || (mask.type() == CV_8UC1 && mask.size() == img.size()));
-    CV_Assert(TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS));
+
+    if (!TargetArchs::builtWith(GLOBAL_ATOMICS) || !DeviceInfo().supports(GLOBAL_ATOMICS))
+        CV_Error(CV_StsNotImplemented, "The device doesn't support global atomics");

    int maxKeypoints = static_cast<int>(keypointsRatio * img.size().area());

@ -146,7 +148,8 @@ int cv::gpu::FAST_GPU::getKeyPoints(GpuMat& keypoints)
 {
    using namespace cv::gpu::device::fast;

-    CV_Assert(TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS));
+    if (!TargetArchs::builtWith(GLOBAL_ATOMICS) || !DeviceInfo().supports(GLOBAL_ATOMICS))
+        CV_Error(CV_StsNotImplemented, "The device doesn't support global atomics");

    if (count_ == 0)
        return 0;
--- a/modules/gpu/src/surf.cpp
+++ b/modules/gpu/src/surf.cpp
@ -120,7 +120,9 @@ namespace
            CV_Assert(!img.empty() && img.type() == CV_8UC1);
            CV_Assert(mask.empty() || (mask.size() == img.size() && mask.type() == CV_8UC1));
            CV_Assert(surf_.nOctaves > 0 && surf_.nOctaveLayers > 0);
-            CV_Assert(TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS));
+
+            if (!TargetArchs::builtWith(GLOBAL_ATOMICS) || !DeviceInfo().supports(GLOBAL_ATOMICS))
+                CV_Error(CV_StsNotImplemented, "The device doesn't support global atomics");

            const int min_size = calcSize(surf_.nOctaves - 1, 0);
            CV_Assert(img_rows - min_size >= 0);
--- a/modules/gpu/test/test_features2d.cpp
+++ b/modules/gpu/test/test_features2d.cpp
@ -108,6 +108,25 @@ testing::AssertionResult assertKeyPointsEquals(const char* gold_expr, const char

 #define ASSERT_KEYPOINTS_EQ(gold, actual) EXPECT_PRED_FORMAT2(assertKeyPointsEquals, gold, actual);

+// Sorts both lists in place, then counts the indices where keyPointsEquals(gold[i], actual[i]) holds.
+int getMatchedPointsCount(std::vector<cv::KeyPoint>& gold, std::vector<cv::KeyPoint>& actual)
+{
+    std::sort(actual.begin(), actual.end(), KeyPointLess());
+    std::sort(gold.begin(), gold.end(), KeyPointLess());
+
+    // Compare only the indices present in both lists, so a shorter `actual` is never read out of bounds.
+    const size_t count = std::min(gold.size(), actual.size());
+
+    int validCount = 0;
+    for (size_t i = 0; i < count; ++i)
+    {
+        if (keyPointsEquals(gold[i], actual[i]))
+            ++validCount;
+    }
+
+    return validCount;
+}
+
 int getMatchedPointsCount(const std::vector<cv::KeyPoint>& keypoints1, const std::vector<cv::KeyPoint>& keypoints2, const std::vector<cv::DMatch>& matches)
 {
    int validCount = 0;
@ -170,20 +189,39 @@ TEST_P(SURF, Detector)
    surf.upright = upright;
    surf.keypointsRatio = 0.05f;

-    std::vector<cv::KeyPoint> keypoints;
-    surf(loadMat(image), cv::gpu::GpuMat(), keypoints);
+    if (!supportFeature(devInfo, cv::gpu::GLOBAL_ATOMICS))
+    {
+        try
+        {
+            std::vector<cv::KeyPoint> keypoints;
+            surf(loadMat(image), cv::gpu::GpuMat(), keypoints);
+        }
+        catch (const cv::Exception& e)
+        {
+            ASSERT_EQ(CV_StsNotImplemented, e.code);
+        }
+    }
+    else
+    {
+        std::vector<cv::KeyPoint> keypoints;
+        surf(loadMat(image), cv::gpu::GpuMat(), keypoints);

-    cv::SURF surf_gold;
-    surf_gold.hessianThreshold = hessianThreshold;
-    surf_gold.nOctaves = nOctaves;
-    surf_gold.nOctaveLayers = nOctaveLayers;
-    surf_gold.extended = extended;
-    surf_gold.upright = upright;
+        cv::SURF surf_gold;
+        surf_gold.hessianThreshold = hessianThreshold;
+        surf_gold.nOctaves = nOctaves;
+        surf_gold.nOctaveLayers = nOctaveLayers;
+        surf_gold.extended = extended;
+        surf_gold.upright = upright;

-    std::vector<cv::KeyPoint> keypoints_gold;
-    surf_gold(image, cv::noArray(), keypoints_gold);
+        std::vector<cv::KeyPoint> keypoints_gold;
+        surf_gold(image, cv::noArray(), keypoints_gold);

-    ASSERT_KEYPOINTS_EQ(keypoints_gold, keypoints);
+        ASSERT_EQ(keypoints_gold.size(), keypoints.size());
+        int matchedCount = getMatchedPointsCount(keypoints_gold, keypoints);
+        double matchedRatio = static_cast<double>(matchedCount) / keypoints_gold.size();
+
+        EXPECT_GT(matchedRatio, 0.95);
+    }
 }

 TEST_P(SURF, Detector_Masked)
@ -202,20 +240,39 @@ TEST_P(SURF, Detector_Masked)
    surf.upright = upright;
    surf.keypointsRatio = 0.05f;

-    std::vector<cv::KeyPoint> keypoints;
-    surf(loadMat(image), loadMat(mask), keypoints);
+    if (!supportFeature(devInfo, cv::gpu::GLOBAL_ATOMICS))
+    {
+        try
+        {
+            std::vector<cv::KeyPoint> keypoints;
+            surf(loadMat(image), loadMat(mask), keypoints);
+        }
+        catch (const cv::Exception& e)
+        {
+            ASSERT_EQ(CV_StsNotImplemented, e.code);
+        }
+    }
+    else
+    {
+        std::vector<cv::KeyPoint> keypoints;
+        surf(loadMat(image), loadMat(mask), keypoints);

-    cv::SURF surf_gold;
-    surf_gold.hessianThreshold = hessianThreshold;
-    surf_gold.nOctaves = nOctaves;
-    surf_gold.nOctaveLayers = nOctaveLayers;
-    surf_gold.extended = extended;
-    surf_gold.upright = upright;
+        cv::SURF surf_gold;
+        surf_gold.hessianThreshold = hessianThreshold;
+        surf_gold.nOctaves = nOctaves;
+        surf_gold.nOctaveLayers = nOctaveLayers;
+        surf_gold.extended = extended;
+        surf_gold.upright = upright;

-    std::vector<cv::KeyPoint> keypoints_gold;
-    surf_gold(image, mask, keypoints_gold);
+        std::vector<cv::KeyPoint> keypoints_gold;
+        surf_gold(image, mask, keypoints_gold);

-    ASSERT_KEYPOINTS_EQ(keypoints_gold, keypoints);
+        ASSERT_EQ(keypoints_gold.size(), keypoints.size());
+        int matchedCount = getMatchedPointsCount(keypoints_gold, keypoints);
+        double matchedRatio = static_cast<double>(matchedCount) / keypoints_gold.size();
+
+        EXPECT_GT(matchedRatio, 0.95);
+    }
 }

 TEST_P(SURF, Descriptor)
@ -238,23 +295,39 @@ TEST_P(SURF, Descriptor)
    surf_gold.extended = extended;
    surf_gold.upright = upright;

-    std::vector<cv::KeyPoint> keypoints;
-    surf_gold(image, cv::noArray(), keypoints);
+    if (!supportFeature(devInfo, cv::gpu::GLOBAL_ATOMICS))
+    {
+        try
+        {
+            std::vector<cv::KeyPoint> keypoints;
+            cv::gpu::GpuMat descriptors;
+            surf(loadMat(image), cv::gpu::GpuMat(), keypoints, descriptors);
+        }
+        catch (const cv::Exception& e)
+        {
+            ASSERT_EQ(CV_StsNotImplemented, e.code);
+        }
+    }
+    else
+    {
+        std::vector<cv::KeyPoint> keypoints;
+        surf_gold(image, cv::noArray(), keypoints);

-    cv::gpu::GpuMat descriptors;
-    surf(loadMat(image), cv::gpu::GpuMat(), keypoints, descriptors, true);
+        cv::gpu::GpuMat descriptors;
+        surf(loadMat(image), cv::gpu::GpuMat(), keypoints, descriptors, true);

-    cv::Mat descriptors_gold;
-    surf_gold(image, cv::noArray(), keypoints, descriptors_gold, true);
+        cv::Mat descriptors_gold;
+        surf_gold(image, cv::noArray(), keypoints, descriptors_gold, true);

-    cv::BFMatcher matcher(cv::NORM_L2);
-    std::vector<cv::DMatch> matches;
-    matcher.match(descriptors_gold, cv::Mat(descriptors), matches);
+        cv::BFMatcher matcher(cv::NORM_L2);
+        std::vector<cv::DMatch> matches;
+        matcher.match(descriptors_gold, cv::Mat(descriptors), matches);

-    int matchedCount = getMatchedPointsCount(keypoints, keypoints, matches);
-    double matchedRatio = static_cast<double>(matchedCount) / keypoints.size();
+        int matchedCount = getMatchedPointsCount(keypoints, keypoints, matches);
+        double matchedRatio = static_cast<double>(matchedCount) / keypoints.size();

-    EXPECT_GT(matchedRatio, 0.35);
+        EXPECT_GT(matchedRatio, 0.35);
+    }
 }

 INSTANTIATE_TEST_CASE_P(GPU_Features2D, SURF, testing::Combine(
@ -295,13 +368,28 @@ TEST_P(FAST, Accuracy)
    cv::gpu::FAST_GPU fast(threshold);
    fast.nonmaxSupression = nonmaxSupression;

-    std::vector<cv::KeyPoint> keypoints;
-    fast(loadMat(image), cv::gpu::GpuMat(), keypoints);
+    if (!supportFeature(devInfo, cv::gpu::GLOBAL_ATOMICS))
+    {
+        try
+        {
+            std::vector<cv::KeyPoint> keypoints;
+            fast(loadMat(image), cv::gpu::GpuMat(), keypoints);
+        }
+        catch (const cv::Exception& e)
+        {
+            ASSERT_EQ(CV_StsNotImplemented, e.code);
+        }
+    }
+    else
+    {
+        std::vector<cv::KeyPoint> keypoints;
+        fast(loadMat(image), cv::gpu::GpuMat(), keypoints);

-    std::vector<cv::KeyPoint> keypoints_gold;
-    cv::FAST(image, keypoints_gold, threshold, nonmaxSupression);
+        std::vector<cv::KeyPoint> keypoints_gold;
+        cv::FAST(image, keypoints_gold, threshold, nonmaxSupression);

-    ASSERT_KEYPOINTS_EQ(keypoints_gold, keypoints);
+        ASSERT_KEYPOINTS_EQ(keypoints_gold, keypoints);
+    }
 }

 INSTANTIATE_TEST_CASE_P(GPU_Features2D, FAST, testing::Combine(
@ -364,24 +452,40 @@ TEST_P(ORB, Accuracy)
    cv::gpu::ORB_GPU orb(nFeatures, scaleFactor, nLevels, edgeThreshold, firstLevel, WTA_K, scoreType, patchSize);
    orb.blurForDescriptor = blurForDescriptor;

-    std::vector<cv::KeyPoint> keypoints;
-    cv::gpu::GpuMat descriptors;
-    orb(loadMat(image), loadMat(mask), keypoints, descriptors);
+    if (!supportFeature(devInfo, cv::gpu::GLOBAL_ATOMICS))
+    {
+        try
+        {
+            std::vector<cv::KeyPoint> keypoints;
+            cv::gpu::GpuMat descriptors;
+            orb(loadMat(image), loadMat(mask), keypoints, descriptors);
+        }
+        catch (const cv::Exception& e)
+        {
+            ASSERT_EQ(CV_StsNotImplemented, e.code);
+        }
+    }
+    else
+    {
+        std::vector<cv::KeyPoint> keypoints;
+        cv::gpu::GpuMat descriptors;
+        orb(loadMat(image), loadMat(mask), keypoints, descriptors);

-    cv::ORB orb_gold(nFeatures, scaleFactor, nLevels, edgeThreshold, firstLevel, WTA_K, scoreType, patchSize);
+        cv::ORB orb_gold(nFeatures, scaleFactor, nLevels, edgeThreshold, firstLevel, WTA_K, scoreType, patchSize);

-    std::vector<cv::KeyPoint> keypoints_gold;
-    cv::Mat descriptors_gold;
-    orb_gold(image, mask, keypoints_gold, descriptors_gold);
+        std::vector<cv::KeyPoint> keypoints_gold;
+        cv::Mat descriptors_gold;
+        orb_gold(image, mask, keypoints_gold, descriptors_gold);

-    cv::BFMatcher matcher(cv::NORM_HAMMING);
-    std::vector<cv::DMatch> matches;
-    matcher.match(descriptors_gold, cv::Mat(descriptors), matches);
+        cv::BFMatcher matcher(cv::NORM_HAMMING);
+        std::vector<cv::DMatch> matches;
+        matcher.match(descriptors_gold, cv::Mat(descriptors), matches);

-    int matchedCount = getMatchedPointsCount(keypoints_gold, keypoints, matches);
-    double matchedRatio = static_cast<double>(matchedCount) / keypoints.size();
+        int matchedCount = getMatchedPointsCount(keypoints_gold, keypoints, matches);
+        double matchedRatio = static_cast<double>(matchedCount) / keypoints.size();

-    EXPECT_GT(matchedRatio, 0.35);
+        EXPECT_GT(matchedRatio, 0.35);
+    }
 }

 INSTANTIATE_TEST_CASE_P(GPU_Features2D, ORB,  testing::Combine(
@ -713,25 +817,40 @@ TEST_P(BruteForceMatcher, RadiusMatch)

    cv::gpu::BruteForceMatcher_GPU_base matcher(distType);

-    std::vector< std::vector<cv::DMatch> > matches;
-    matcher.radiusMatch(loadMat(query), loadMat(train), matches, radius);
-
-    ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());
-
-    int badCount = 0;
-    for (size_t i = 0; i < matches.size(); i++)
+    if (!supportFeature(devInfo, cv::gpu::GLOBAL_ATOMICS))
    {
-        if ((int)matches[i].size() != 1)
-            badCount++;
-        else
+        try
        {
-            cv::DMatch match = matches[i][0];
-            if ((match.queryIdx != (int)i) || (match.trainIdx != (int)i*countFactor) || (match.imgIdx != 0))
-                badCount++;
+            std::vector< std::vector<cv::DMatch> > matches;
+            matcher.radiusMatch(loadMat(query), loadMat(train), matches, radius);
+        }
+        catch (const cv::Exception& e)
+        {
+            ASSERT_EQ(CV_StsNotImplemented, e.code);
        }
    }
+    else
+    {
+        std::vector< std::vector<cv::DMatch> > matches;
+        matcher.radiusMatch(loadMat(query), loadMat(train), matches, radius);

-    ASSERT_EQ(0, badCount);
+        ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());
+
+        int badCount = 0;
+        for (size_t i = 0; i < matches.size(); i++)
+        {
+            if ((int)matches[i].size() != 1)
+                badCount++;
+            else
+            {
+                cv::DMatch match = matches[i][0];
+                if ((match.queryIdx != (int)i) || (match.trainIdx != (int)i*countFactor) || (match.imgIdx != 0))
+                    badCount++;
+            }
+        }
+
+        ASSERT_EQ(0, badCount);
+    }
 }

 TEST_P(BruteForceMatcher, RadiusMatchAdd)
@ -756,42 +875,57 @@ TEST_P(BruteForceMatcher, RadiusMatchAdd)
            masks[mi].col(di * countFactor).setTo(cv::Scalar::all(0));
    }

-    std::vector< std::vector<cv::DMatch> > matches;
-    matcher.radiusMatch(cv::gpu::GpuMat(query), matches, radius, masks);
-
-    ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());
-
-    int badCount = 0;
-    int shift = matcher.isMaskSupported() ? 1 : 0;
-    int needMatchCount = matcher.isMaskSupported() ? n-1 : n;
-    for (size_t i = 0; i < matches.size(); i++)
+    if (!supportFeature(devInfo, cv::gpu::GLOBAL_ATOMICS))
    {
-        if ((int)matches[i].size() != needMatchCount)
-            badCount++;
-        else
+        try
        {
-            int localBadCount = 0;
-            for (int k = 0; k < needMatchCount; k++)
-            {
-                cv::DMatch match = matches[i][k];
-                {
-                    if ((int)i < queryDescCount / 2)
-                    {
-                        if ((match.queryIdx != (int)i) || (match.trainIdx != (int)i * countFactor + k + shift) || (match.imgIdx != 0) )
-                            localBadCount++;
-                    }
-                    else
-                    {
-                        if ((match.queryIdx != (int)i) || (match.trainIdx != ((int)i - queryDescCount / 2) * countFactor + k + shift) || (match.imgIdx != 1) )
-                            localBadCount++;
-                    }
-                }
-            }
-            badCount += localBadCount > 0 ? 1 : 0;
+            std::vector< std::vector<cv::DMatch> > matches;
+            matcher.radiusMatch(cv::gpu::GpuMat(query), matches, radius, masks);
+        }
+        catch (const cv::Exception& e)
+        {
+            ASSERT_EQ(CV_StsNotImplemented, e.code);
        }
    }
+    else
+    {
+        std::vector< std::vector<cv::DMatch> > matches;
+        matcher.radiusMatch(cv::gpu::GpuMat(query), matches, radius, masks);

-    ASSERT_EQ(0, badCount);
+        ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());
+
+        int badCount = 0;
+        int shift = matcher.isMaskSupported() ? 1 : 0;
+        int needMatchCount = matcher.isMaskSupported() ? n-1 : n;
+        for (size_t i = 0; i < matches.size(); i++)
+        {
+            if ((int)matches[i].size() != needMatchCount)
+                badCount++;
+            else
+            {
+                int localBadCount = 0;
+                for (int k = 0; k < needMatchCount; k++)
+                {
+                    cv::DMatch match = matches[i][k];
+                    {
+                        if ((int)i < queryDescCount / 2)
+                        {
+                            if ((match.queryIdx != (int)i) || (match.trainIdx != (int)i * countFactor + k + shift) || (match.imgIdx != 0) )
+                                localBadCount++;
+                        }
+                        else
+                        {
+                            if ((match.queryIdx != (int)i) || (match.trainIdx != ((int)i - queryDescCount / 2) * countFactor + k + shift) || (match.imgIdx != 1) )
+                                localBadCount++;
+                        }
+                    }
+                }
+                badCount += localBadCount > 0 ? 1 : 0;
+            }
+        }
+
+        ASSERT_EQ(0, badCount);
+    }
 }

 INSTANTIATE_TEST_CASE_P(GPU_Features2D, BruteForceMatcher, testing::Combine(
--- a/modules/gpu/test/test_filters.cpp
+++ b/modules/gpu/test/test_filters.cpp
@ -258,13 +258,28 @@ TEST_P(GaussianBlur, Accuracy)
    double sigma1 = randomDouble(0.1, 1.0);
    double sigma2 = randomDouble(0.1, 1.0);

-    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-    cv::gpu::GaussianBlur(loadMat(src, useRoi), dst, ksize, sigma1, sigma2, borderType);
+    if (ksize.height > 16 && !supportFeature(devInfo, cv::gpu::FEATURE_SET_COMPUTE_20))
+    {
+        try
+        {
+            cv::gpu::GpuMat dst;
+            cv::gpu::GaussianBlur(loadMat(src), dst, ksize, sigma1, sigma2, borderType);
+        }
+        catch (const cv::Exception& e)
+        {
+            ASSERT_EQ(CV_StsNotImplemented, e.code);
+        }
+    }
+    else
+    {
+        cv::gpu::GpuMat dst = createMat(size, type, useRoi);
+        cv::gpu::GaussianBlur(loadMat(src, useRoi), dst, ksize, sigma1, sigma2, borderType);

-    cv::Mat dst_gold;
-    cv::GaussianBlur(src, dst_gold, ksize, sigma1, sigma2, borderType);
+        cv::Mat dst_gold;
+        cv::GaussianBlur(src, dst_gold, ksize, sigma1, sigma2, borderType);

-    EXPECT_MAT_NEAR(dst_gold, dst, 4.0);
+        EXPECT_MAT_NEAR(dst_gold, dst, 4.0);
+    }
 }

 INSTANTIATE_TEST_CASE_P(GPU_Filter, GaussianBlur, testing::Combine(