added FAST_GPU and ORB_GPU classes

2011-12-27 09:33:20 +00:00
parent e46e13a775
commit 5e38cf8042
8 changed files with 2159 additions and 37 deletions
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -1522,6 +1522,159 @@ public:
    GpuMat maxPosBuffer;
 };

+////////////////////////////////// FAST //////////////////////////////////////////
+
+class CV_EXPORTS FAST_GPU
+{
+public:
+    enum
+    {
+        LOCATION_ROW = 0, // row of packed short2 (x, y) keypoint locations
+        RESPONSE_ROW,     // row of float keypoint responses
+        ROWS_COUNT        // number of rows in a keypoints matrix
+    };
+
+    //! size given to every KeyPoint built by convertKeypoints (all features have the same size)
+    static const int FEATURE_SIZE = 7;
+    //! the three arguments initialize the public members of the same names declared below
+    explicit FAST_GPU(int threshold, bool nonmaxSupression = true, double keypointsRatio = 0.05);
+
+    //! finds the keypoints using the FAST detector; image must be CV_8UC1, mask must be empty or CV_8UC1 of the image size
+    //! the GpuMat overload fills a ROWS_COUNT-row matrix with one column per keypoint; the vector overload also downloads and converts it
+    void operator ()(const GpuMat& image, const GpuMat& mask, GpuMat& keypoints);    
+    void operator ()(const GpuMat& image, const GpuMat& mask, std::vector<KeyPoint>& keypoints);
+
+    //! download keypoints from device to host memory and convert them; returns without touching keypoints if d_keypoints is empty
+    void downloadKeypoints(const GpuMat& d_keypoints, std::vector<KeyPoint>& keypoints);
+
+    //! convert a host keypoints matrix (ROWS_COUNT rows, 4-byte elements) to a KeyPoint vector
+    void convertKeypoints(const Mat& h_keypoints, std::vector<KeyPoint>& keypoints);
+
+    //! release the memory of the temporary buffers
+    void release();
+    //! if true, a score image is computed and non-maximum suppression is applied in getKeyPoints
+    bool nonmaxSupression;
+    //! detection threshold handed to the device code by calcKeyPointsLocation
+    int threshold;
+
+    //! max keypoints = keypointsRatio * img.size().area()
+    double keypointsRatio;
+
+    //! find keypoints and compute their response if nonmaxSupression is true
+    //! return count of detected keypoints (never more than the maximum derived from keypointsRatio)
+    int calcKeyPointsLocation(const GpuMat& image, const GpuMat& mask);
+
+    //! get final array of keypoints found by the last calcKeyPointsLocation call
+    //! performs non-maximum suppression if nonmaxSupression is true
+    //! return final count of keypoints
+    int getKeyPoints(GpuMat& keypoints);
+
+private:
+    GpuMat kpLoc_; // candidate locations written by calcKeyPointsLocation (1 row, CV_16SC2)
+    int count_; // number of candidates stored in kpLoc_ by the last calcKeyPointsLocation call
+
+    GpuMat score_; // CV_32SC1 image of the input size, allocated only when nonmaxSupression is true
+
+    GpuMat d_keypoints_; // device keypoints buffer used by the std::vector overload of operator()
+};
+
+////////////////////////////////// ORB //////////////////////////////////////////
+
+class CV_EXPORTS ORB_GPU
+{
+public:
+    enum // NOTE(review): presumably the row indices of the keypoints matrix, one row per KeyPoint field - confirm in orb.cpp
+    {
+        X_ROW = 0,
+        Y_ROW,
+        RESPONSE_ROW,
+        ANGLE_ROW,
+        OCTAVE_ROW,
+        SIZE_ROW,
+        ROWS_COUNT
+    };
+
+    enum
+    {
+        DEFAULT_FAST_THRESHOLD = 20 // NOTE(review): presumably the initial threshold of fastDetector_ - confirm in orb.cpp
+    };
+
+    //! Constructor
+    //! n_features - the number of desired features
+    //! detector_params - parameters to use
+    explicit ORB_GPU(size_t n_features = 500, const ORB::CommonParams& detector_params = ORB::CommonParams());
+
+    //! Compute the ORB features on an image
+    //! image - the image to compute the features (supports only CV_8UC1 images)
+    //! mask - the mask to apply
+    //! keypoints - the resulting keypoints
+    void operator()(const GpuMat& image, const GpuMat& mask, std::vector<KeyPoint>& keypoints);
+    void operator()(const GpuMat& image, const GpuMat& mask, GpuMat& keypoints);
+
+    //! Compute the ORB features and descriptors on an image
+    //! image - the image to compute the features (supports only CV_8UC1 images)
+    //! mask - the mask to apply
+    //! keypoints - the resulting keypoints
+    //! descriptors - descriptors array
+    void operator()(const GpuMat& image, const GpuMat& mask, std::vector<KeyPoint>& keypoints, GpuMat& descriptors);
+    void operator()(const GpuMat& image, const GpuMat& mask, GpuMat& keypoints, GpuMat& descriptors);
+
+    //! download keypoints from device to host memory
+    void downloadKeyPoints(GpuMat& d_keypoints, std::vector<KeyPoint>& keypoints);
+    //! NOTE(review): the Mat argument below is named d_keypoints although it is a host matrix (FAST_GPU names it h_keypoints)
+    //! convert keypoints to KeyPoint vector
+    void convertKeyPoints(Mat& d_keypoints, std::vector<KeyPoint>& keypoints);
+
+    //! returns the descriptor size in bytes (kBytes, i.e. 32)
+    inline int descriptorSize() const { return kBytes; }
+    //! takes the same arguments as the constructor
+    void setParams(size_t n_features, const ORB::CommonParams& detector_params);
+    inline void setFastParams(int threshold, bool nonmaxSupression = true)
+    { // forwards both settings to the internal FAST detector
+        fastDetector_.threshold = threshold;
+        fastDetector_.nonmaxSupression = nonmaxSupression;
+    }
+
+    //! release the memory of the temporary buffers
+    void release();
+
+    //! if true, image will be blurred before descriptors calculation
+    bool blurForDescriptor;
+
+private:
+    enum { kBytes = 32 }; // descriptor size in bytes, returned by descriptorSize()
+
+    void buildScalePyramids(const GpuMat& image, const GpuMat& mask);
+
+    void computeKeyPointsPyramid();
+
+    void computeDescriptors(GpuMat& descriptors);
+
+    void mergeKeyPoints(GpuMat& keypoints);
+
+    ORB::CommonParams params_;
+
+    // The number of desired features per scale
+    std::vector<size_t> n_features_per_level_;
+
+    // Points to compute BRIEF descriptors from
+    GpuMat pattern_;
+
+    std::vector<GpuMat> imagePyr_;
+    std::vector<GpuMat> maskPyr_;
+
+    GpuMat buf_;
+
+    std::vector<GpuMat> keyPointsPyr_;
+    std::vector<int> keyPointsCount_;
+
+    FAST_GPU fastDetector_; // FAST detector configured through setFastParams()
+
+    Ptr<FilterEngine_GPU> blurFilter; // NOTE(review): presumably the blur used when blurForDescriptor is true - confirm in orb.cpp
+
+    GpuMat d_keypoints_;
+};
+
 ////////////////////////////////// Optical Flow //////////////////////////////////////////

 class CV_EXPORTS BroxOpticalFlow
--- a/modules/gpu/perf/perf_features2d.cpp
+++ b/modules/gpu/perf/perf_features2d.cpp
@@ -122,11 +122,50 @@ PERF_TEST_P(DevInfo, SURF, testing::ValuesIn(devices()))
    {
        surf(img, GpuMat(), keypoints, descriptors);
    }
-
-    Mat keypoints_host(keypoints);
-    Mat descriptors_host(descriptors);
-    
-    SANITY_CHECK(keypoints_host);
-    SANITY_CHECK(descriptors_host);
 }

+PERF_TEST_P(DevInfo, FAST, testing::ValuesIn(devices()))
+{
+    // run on the device selected by the test parameter
+    DeviceInfo info = GetParam();
+    setDevice(info.deviceID());
+
+    // grayscale input: loaded on the host and uploaded once, before the timed loop
+    Mat hostImage = readImage("gpu/perf/aloe.jpg", CV_LOAD_IMAGE_GRAYSCALE);
+    ASSERT_FALSE(hostImage.empty());
+
+    GpuMat d_image(hostImage);
+    GpuMat d_keypoints;
+
+    // FAST detector with threshold 20 and default remaining settings
+    FAST_GPU detector(20);
+
+    declare.time(2.0);
+
+    // timed section: detection without a mask, keypoints stay on the device
+    TEST_CYCLE(100)
+    {
+        detector(d_image, GpuMat(), d_keypoints);
+    }
+}
+
+PERF_TEST_P(DevInfo, ORB, testing::ValuesIn(devices()))
+{
+    // run on the device selected by the test parameter
+    DeviceInfo info = GetParam();
+    setDevice(info.deviceID());
+
+    // grayscale input: loaded on the host and uploaded once, before the timed loop
+    Mat hostImage = readImage("gpu/perf/aloe.jpg", CV_LOAD_IMAGE_GRAYSCALE);
+    ASSERT_FALSE(hostImage.empty());
+
+    GpuMat d_image(hostImage);
+    GpuMat d_keypoints;
+    GpuMat d_descriptors;
+
+    // ORB configured for 4000 desired features, default detector parameters
+    ORB_GPU extractor(4000);
+
+    declare.time(2.0);
+
+    // timed section: keypoints and descriptors, no mask, results stay on the device
+    TEST_CYCLE(100)
+    {
+        extractor(d_image, GpuMat(), d_keypoints, d_descriptors);
+    }
+}
--- a/modules/gpu/src/cuda/fast.cu
+++ b/modules/gpu/src/cuda/fast.cu
--- a/modules/gpu/src/cuda/orb.cu
+++ b/modules/gpu/src/cuda/orb.cu
@@ -0,0 +1,418 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+// Copyright (c) 2010, Paul Furgale, Chi Hay Tong
+//
+// The original code was written by Paul Furgale and Chi Hay Tong 
+// and later optimized and prepared for integration into OpenCV by Itseez.
+//
+//M*/
+
+#include <thrust/sort.h>
+
+#include "opencv2/gpu/device/common.hpp"
+#include "opencv2/gpu/device/utility.hpp"
+#include "opencv2/gpu/device/functional.hpp"
+
+namespace cv { namespace gpu { namespace device 
+{
+    namespace orb
+    {
+        ////////////////////////////////////////////////////////////////////////////////////////////////////////
+        // cull
+        //
+        // Sorts the first `size` entries of `response` in descending order and applies
+        // the same permutation to `loc`, so the strongest entries come first.
+        // Both pointers are wrapped as thrust device pointers, i.e. they must refer to device memory.
+        // NOTE(review): each int in `loc` presumably holds one packed short2 location - confirm in the caller.
+        //
+        // Returns the number of leading entries to keep: n_points, but never more
+        // than the `size` entries that actually exist.
+
+        int cull_gpu(int* loc, float* response, int size, int n_points)
+        {
+            thrust::device_ptr<int> loc_ptr(loc);
+            thrust::device_ptr<float> response_ptr(response);
+
+            thrust::sort_by_key(response_ptr, response_ptr + size, loc_ptr, thrust::greater<float>());
+
+            // Returning n_points unconditionally would over-report the number of
+            // valid entries whenever fewer than n_points were passed in.
+            return n_points < size ? n_points : size;
+        }
+
+        ////////////////////////////////////////////////////////////////////////////////////////////////////////
+        // HarrisResponses: writes a Harris corner response to response[] for each of the npoints locations in loc_
+        // launched by HarrisResponses_gpu with 32x8 blocks: each threadIdx.y row of threads handles one keypoint
+        __global__ void HarrisResponses(const PtrStepb img, const short2* loc_, float* response, const int npoints, const int blockSize, const float harris_k)
+        {
+            __shared__ int smem[8 * 32]; // scratch for the reductions, sized for the 32x8 launch
+
+            volatile int* srow = smem + threadIdx.y * blockDim.x;
+
+            const int ptidx = blockIdx.x * blockDim.y + threadIdx.y;
+
+            if (ptidx < npoints)
+            {
+                const short2 loc = loc_[ptidx];
+                // (x0, y0) is the top-left corner of the blockSize x blockSize window around the keypoint
+                const int r = blockSize / 2;                
+                const int x0 = loc.x - r;
+                const int y0 = loc.y - r;
+                // per-thread partial sums of Ix*Ix, Iy*Iy and Ix*Iy
+                int a = 0, b = 0, c = 0;
+                // the threads of one row stride over the window pixels
+                for (int ind = threadIdx.x; ind < blockSize * blockSize; ind += blockDim.x)
+                {
+                    const int i = ind / blockSize;
+                    const int j = ind % blockSize;
+                    // horizontal 3x3 difference with row weights 1, 2, 1; no bounds check - assumes the window plus a 1-pixel border lies inside img, TODO confirm in caller
+                    int Ix = (img(y0 + i, x0 + j + 1) - img(y0 + i, x0 + j - 1)) * 2 + 
+                        (img(y0 + i - 1, x0 + j + 1) - img(y0 + i - 1, x0 + j - 1)) + 
+                        (img(y0 + i + 1, x0 + j + 1) - img(y0 + i + 1, x0 + j - 1));
+                    // vertical counterpart of Ix
+                    int Iy = (img(y0 + i + 1, x0 + j) - img(y0 + i - 1, x0 + j)) * 2 + 
+                        (img(y0 + i + 1, x0 + j - 1) - img(y0 + i - 1, x0 + j - 1)) + 
+                        (img(y0 + i + 1, x0 + j + 1) - img(y0 + i - 1, x0 + j + 1));
+
+                    a += Ix * Ix;
+                    b += Iy * Iy;
+                    c += Ix * Iy;
+                }
+                // combine the partial sums of the row; reduce<> is declared outside this file - presumably leaves the total in thread 0, TODO confirm
+                reduce<32>(srow, a, threadIdx.x, plus<volatile int>());
+                reduce<32>(srow, b, threadIdx.x, plus<volatile int>());
+                reduce<32>(srow, c, threadIdx.x, plus<volatile int>());
+
+                if (threadIdx.x == 0)
+                {
+                    float scale = (1 << 2) * blockSize * 255.0f;
+                    scale = 1.0f / scale;
+                    const float scale_sq_sq = scale * scale * scale * scale;
+                    // a*b - c*c - harris_k * (a + b)^2, i.e. det - k * trace^2 of [a c; c b], scaled by scale^4
+                    response[ptidx] = ((float)a * b - (float)c * c - harris_k * ((float)a + b) * ((float)a + b)) * scale_sq_sq;
+                }
+            }
+        }
+
+        // Host-side launcher for the HarrisResponses kernel.
+        // Blocks are 32x8 threads; the grid has one block per eight keypoints (rounded up).
+        // Launch errors are reported through cudaSafeCall; the call blocks until the
+        // kernel has finished only when the default stream (0) is used.
+        void HarrisResponses_gpu(DevMem2Db img, const short2* loc, float* response, const int npoints, int blockSize, float harris_k, cudaStream_t stream)
+        {
+            const dim3 threads(32, 8);
+            const dim3 blocks(divUp(npoints, threads.y));
+
+            HarrisResponses<<<blocks, threads, 0, stream>>>(img, loc, response, npoints, blockSize, harris_k);
+            cudaSafeCall( cudaGetLastError() );
+
+            // a user-supplied stream is left asynchronous
+            if (stream != 0)
+                return;
+
+            cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        ////////////////////////////////////////////////////////////////////////////////////////////////////////
+        // IC_Angle
+
+        // Constant-memory table read by IC_Angle as c_u_max[v] for each row offset v.
+        __constant__ int c_u_max[32];
+
+        // Copies `count` ints from u_max into c_u_max; copy errors are reported through cudaSafeCall.
+        void loadUMax(const int* u_max, int count)
+        {
+            const size_t bytes = count * sizeof(int);
+
+            cudaSafeCall( cudaMemcpyToSymbol(c_u_max, u_max, bytes) );
+        }
+        // IC_Angle: writes to angle[] the orientation, in degrees within [0, 360), of each of the npoints locations in loc_, from the moments m_10 and m_01 of the surrounding patch
+        __global__ void IC_Angle(const PtrStepb image, const short2* loc_, float* angle, const int npoints, const int half_k)
+        {
+            __shared__ int smem[8 * 32];
+            // each threadIdx.y row of threads handles one keypoint and uses its own part of smem (launched with 32x8 blocks by IC_Angle_gpu)
+            volatile int* srow = smem + threadIdx.y * blockDim.x;
+
+            const int ptidx = blockIdx.x * blockDim.y + threadIdx.y;
+
+            if (ptidx < npoints)
+            {
+                int m_01 = 0, m_10 = 0;
+
+                const short2 loc = loc_[ptidx];
+                // no bounds checks below - assumes the patch lies inside image, TODO confirm in caller
+                // Treat the center line differently, v=0
+                for (int u = threadIdx.x - half_k; u <= half_k; u += blockDim.x)
+                    m_10 += u * image(loc.y, loc.x + u);
+
+                reduce<32>(srow, m_10, threadIdx.x, plus<volatile int>());
+
+                for (int v = 1; v <= half_k; ++v)
+                {
+                    // Proceed over the two lines at +v and -v; c_u_max[v] is their half-width (the table has 32 entries, so half_k must stay below 32)
+                    int v_sum = 0;
+                    int m_sum = 0;
+                    const int d = c_u_max[v];
+                    
+                    for (int u = threadIdx.x - d; u <= d; u += blockDim.x)
+                    {
+                        int val_plus = image(loc.y + v, loc.x + u);
+                        int val_minus = image(loc.y - v, loc.x + u);
+
+                        v_sum += (val_plus - val_minus);
+                        m_sum += u * (val_plus + val_minus);
+                    }
+
+                    reduce<32>(srow, v_sum, threadIdx.x, plus<volatile int>());
+                    reduce<32>(srow, m_sum, threadIdx.x, plus<volatile int>());
+                    // m_10 accumulates u * intensity, m_01 accumulates v * intensity over both lines
+                    m_10 += m_sum;
+                    m_01 += v * v_sum;
+                }
+
+                if (threadIdx.x == 0)
+                { // only thread 0 of the row writes the result
+                    float kp_dir = ::atan2f((float)m_01, (float)m_10);
+                    kp_dir += (kp_dir < 0) * (2.0f * CV_PI);
+                    kp_dir *= 180.0f / CV_PI;
+
+                    angle[ptidx] = kp_dir;
+                }
+            }
+        }
+
+        // Host-side launcher for the IC_Angle kernel.
+        // Blocks are 32x8 threads; the grid has one block per eight keypoints (rounded up).
+        // Launch errors are reported through cudaSafeCall; the call blocks until the
+        // kernel has finished only when the default stream (0) is used.
+        void IC_Angle_gpu(DevMem2Db image, const short2* loc, float* angle, int npoints, int half_k, cudaStream_t stream)
+        {
+            const dim3 threads(32, 8);
+            const dim3 blocks(divUp(npoints, threads.y));
+
+            IC_Angle<<<blocks, threads, 0, stream>>>(image, loc, angle, npoints, half_k);
+            cudaSafeCall( cudaGetLastError() );
+
+            // a user-supplied stream is left asynchronous
+            if (stream != 0)
+                return;
+
+            cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        ////////////////////////////////////////////////////////////////////////////////////////////////////////
+        // computeOrbDescriptor
+        // OrbDescriptor<WTA_K>::calc computes one descriptor value per call; it is specialized below for WTA_K = 2, 3 and 4
+        template <int WTA_K> struct OrbDescriptor;
+        // GET_VALUE(idx): pixel of img at loc offset by pattern point idx rotated by the angle given as (sina, cosa), rounded to the nearest pixel; needs img, loc, pattern_x, pattern_y, sina and cosa in scope
+        #define GET_VALUE(idx) \
+            img(loc.y + __float2int_rn(pattern_x[idx] * sina + pattern_y[idx] * cosa), \
+                loc.x + __float2int_rn(pattern_x[idx] * cosa - pattern_y[idx] * sina))
+
+        template <> struct OrbDescriptor<2>
+        {
+            __device__ static int calc(const PtrStepb& img, short2 loc, const int* pattern_x, const int* pattern_y, float sina, float cosa, int i)
+            {
+                pattern_x += 16 * i; 
+                pattern_y += 16 * i;
+
+                int t0, t1, val;
+
+                t0 = GET_VALUE(0); t1 = GET_VALUE(1);
+                val = t0 < t1;
+
+                t0 = GET_VALUE(2); t1 = GET_VALUE(3);
+                val |= (t0 < t1) << 1;
+
+                t0 = GET_VALUE(4); t1 = GET_VALUE(5);
+                val |= (t0 < t1) << 2;
+
+                t0 = GET_VALUE(6); t1 = GET_VALUE(7);
+                val |= (t0 < t1) << 3;
+
+                t0 = GET_VALUE(8); t1 = GET_VALUE(9);
+                val |= (t0 < t1) << 4;
+
+                t0 = GET_VALUE(10); t1 = GET_VALUE(11);
+                val |= (t0 < t1) << 5;
+
+                t0 = GET_VALUE(12); t1 = GET_VALUE(13);
+                val |= (t0 < t1) << 6;
+
+                t0 = GET_VALUE(14); t1 = GET_VALUE(15);
+                val |= (t0 < t1) << 7;
+                
+                return val;
+            }
+        };
+
+        template <> struct OrbDescriptor<3>
+        {
+            __device__ static int calc(const PtrStepb& img, short2 loc, const int* pattern_x, const int* pattern_y, float sina, float cosa, int i)
+            {
+                pattern_x += 12 * i; 
+                pattern_y += 12 * i;
+             
+                int t0, t1, t2, val;
+
+                t0 = GET_VALUE(0); t1 = GET_VALUE(1); t2 = GET_VALUE(2);
+                val = t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0);
+                
+                t0 = GET_VALUE(3); t1 = GET_VALUE(4); t2 = GET_VALUE(5);
+                val |= (t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0)) << 2;
+                
+                t0 = GET_VALUE(6); t1 = GET_VALUE(7); t2 = GET_VALUE(8);
+                val |= (t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0)) << 4;
+                
+                t0 = GET_VALUE(9); t1 = GET_VALUE(10); t2 = GET_VALUE(11);
+                val |= (t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0)) << 6;
+                
+                return val;
+            }
+        };
+
+        template <> struct OrbDescriptor<4>
+        {
+            __device__ static int calc(const PtrStepb& img, short2 loc, const int* pattern_x, const int* pattern_y, float sina, float cosa, int i)
+            {
+                pattern_x += 16 * i; 
+                pattern_y += 16 * i;
+             
+                int t0, t1, t2, t3, k, val;
+                int a, b;
+
+                t0 = GET_VALUE(0); t1 = GET_VALUE(1);
+                t2 = GET_VALUE(2); t3 = GET_VALUE(3);
+                a = 0, b = 2;
+                if( t1 > t0 ) t0 = t1, a = 1;
+                if( t3 > t2 ) t2 = t3, b = 3;
+                k = t0 > t2 ? a : b;
+                val = k;
+                
+                t0 = GET_VALUE(4); t1 = GET_VALUE(5);
+                t2 = GET_VALUE(6); t3 = GET_VALUE(7);
+                a = 0, b = 2;
+                if( t1 > t0 ) t0 = t1, a = 1;
+                if( t3 > t2 ) t2 = t3, b = 3;
+                k = t0 > t2 ? a : b;
+                val |= k << 2;
+                
+                t0 = GET_VALUE(8); t1 = GET_VALUE(9);
+                t2 = GET_VALUE(10); t3 = GET_VALUE(11);
+                a = 0, b = 2;
+                if( t1 > t0 ) t0 = t1, a = 1;
+                if( t3 > t2 ) t2 = t3, b = 3;
+                k = t0 > t2 ? a : b;
+                val |= k << 4;
+                
+                t0 = GET_VALUE(12); t1 = GET_VALUE(13);
+                t2 = GET_VALUE(14); t3 = GET_VALUE(15);
+                a = 0, b = 2;
+                if( t1 > t0 ) t0 = t1, a = 1;
+                if( t3 > t2 ) t2 = t3, b = 3;
+                k = t0 > t2 ? a : b;
+                val |= k << 6;
+                
+                return val;
+            }
+        };
+
+        #undef GET_VALUE
+
+        // One thread per (keypoint, descriptor element) pair: the x index selects the
+        // element within the descriptor row, the y index selects the keypoint.
+        template <int WTA_K>
+        __global__ void computeOrbDescriptor(const PtrStepb img, const short2* loc, const float* angle_, const int npoints,
+            const int* pattern_x, const int* pattern_y, PtrStepb desc, int dsize)
+        {
+            const int elemIdx = blockIdx.x * blockDim.x + threadIdx.x;
+            const int kpIdx = blockIdx.y * blockDim.y + threadIdx.y;
+
+            // threads past the end of either dimension have nothing to do
+            if (kpIdx >= npoints || elemIdx >= dsize)
+                return;
+
+            // the stored angle is in degrees; the sampling pattern is rotated using radians
+            const float radians = angle_[kpIdx] * (float)(CV_PI / 180.f);
+
+            float sina, cosa;
+            ::sincosf(radians, &sina, &cosa);
+
+            desc.ptr(kpIdx)[elemIdx] = OrbDescriptor<WTA_K>::calc(img, loc[kpIdx], pattern_x, pattern_y, sina, cosa, elemIdx);
+        }
+
+        // Host-side launcher for computeOrbDescriptor<WTA_K>.
+        // Blocks are 32x8 threads; the grid covers dsize elements in x and npoints keypoints in y.
+        // Only WTA_K values 2, 3 and 4 launch a kernel; any other value launches nothing.
+        // The call blocks until the kernel has finished only when the default stream (0) is used.
+        void computeOrbDescriptor_gpu(PtrStepb img, const short2* loc, const float* angle, const int npoints,
+            const int* pattern_x, const int* pattern_y, PtrStepb desc, int dsize, int WTA_K, cudaStream_t stream)
+        {
+            const dim3 threads(32, 8);
+            const dim3 blocks(divUp(dsize, threads.x), divUp(npoints, threads.y));
+
+            if (WTA_K == 2)
+                computeOrbDescriptor<2><<<blocks, threads, 0, stream>>>(img, loc, angle, npoints, pattern_x, pattern_y, desc, dsize);
+            else if (WTA_K == 3)
+                computeOrbDescriptor<3><<<blocks, threads, 0, stream>>>(img, loc, angle, npoints, pattern_x, pattern_y, desc, dsize);
+            else if (WTA_K == 4)
+                computeOrbDescriptor<4><<<blocks, threads, 0, stream>>>(img, loc, angle, npoints, pattern_x, pattern_y, desc, dsize);
+
+            cudaSafeCall( cudaGetLastError() );
+
+            // a user-supplied stream is left asynchronous
+            if (stream != 0)
+                return;
+
+            cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        ////////////////////////////////////////////////////////////////////////////////////////////////////////
+        // mergeLocation
+
+        // One thread per keypoint: splits the packed short2 location into the separate
+        // float arrays x and y, multiplying both coordinates by scale.
+        __global__ void mergeLocation(const short2* loc_, float* x, float* y, const int npoints, float scale)
+        {
+            const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+            // threads past the last keypoint have nothing to do
+            if (idx >= npoints)
+                return;
+
+            const short2 pt = loc_[idx];
+
+            x[idx] = pt.x * scale;
+            y[idx] = pt.y * scale;
+        }
+        
+        // Host-side launcher for the mergeLocation kernel.
+        // Uses 256-thread blocks, one thread per keypoint (grid size rounded up).
+        // Launch errors are reported through cudaSafeCall; the call blocks until the
+        // kernel has finished only when the default stream (0) is used.
+        void mergeLocation_gpu(const short2* loc, float* x, float* y, int npoints, float scale, cudaStream_t stream)
+        {
+            const dim3 threads(256);
+            const dim3 blocks(divUp(npoints, threads.x));
+
+            mergeLocation<<<blocks, threads, 0, stream>>>(loc, x, y, npoints, scale);
+            cudaSafeCall( cudaGetLastError() );
+
+            // a user-supplied stream is left asynchronous
+            if (stream != 0)
+                return;
+
+            cudaSafeCall( cudaDeviceSynchronize() );
+        }
+    }
+}}}
--- a/modules/gpu/src/fast.cpp
+++ b/modules/gpu/src/fast.cpp
@@ -0,0 +1,171 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+using namespace cv;
+using namespace cv::gpu;
+using namespace std;
+
+#if !defined (HAVE_CUDA)
+
+// Stubs compiled when HAVE_CUDA is not defined: every member just calls throw_nogpu().
+
+cv::gpu::FAST_GPU::FAST_GPU(int, bool, double)
+{
+    throw_nogpu();
+}
+
+void cv::gpu::FAST_GPU::operator ()(const GpuMat&, const GpuMat&, GpuMat&)
+{
+    throw_nogpu();
+}
+
+void cv::gpu::FAST_GPU::operator ()(const GpuMat&, const GpuMat&, std::vector<KeyPoint>&)
+{
+    throw_nogpu();
+}
+
+void cv::gpu::FAST_GPU::downloadKeypoints(const GpuMat&, std::vector<KeyPoint>&)
+{
+    throw_nogpu();
+}
+
+void cv::gpu::FAST_GPU::convertKeypoints(const Mat&, std::vector<KeyPoint>&)
+{
+    throw_nogpu();
+}
+
+void cv::gpu::FAST_GPU::release()
+{
+    throw_nogpu();
+}
+
+// the trailing returns only satisfy the int return type
+int cv::gpu::FAST_GPU::calcKeyPointsLocation(const GpuMat&, const GpuMat&)
+{
+    throw_nogpu();
+    return 0;
+}
+
+int cv::gpu::FAST_GPU::getKeyPoints(GpuMat&)
+{
+    throw_nogpu();
+    return 0;
+}
+
+#else /* !defined (HAVE_CUDA) */
+
+// Stores the detector settings in the public members and starts with a keypoint count of zero.
+cv::gpu::FAST_GPU::FAST_GPU(int _threshold, bool _nonmaxSupression, double _keypointsRatio)
+    : nonmaxSupression(_nonmaxSupression)
+    , threshold(_threshold)
+    , keypointsRatio(_keypointsRatio)
+    , count_(0)
+{
+}
+
+// Detects FAST keypoints in image (restricted by mask) and returns them as a KeyPoint vector.
+// The detection runs into the internal device buffer d_keypoints_, which is then downloaded.
+// The output vector is always reset first, so an empty image or a detection that
+// finds nothing yields an empty vector instead of entries left over from an earlier call.
+void cv::gpu::FAST_GPU::operator ()(const GpuMat& image, const GpuMat& mask, std::vector<KeyPoint>& keypoints)
+{
+    keypoints.clear();
+
+    if (image.empty())
+        return;
+
+    (*this)(image, mask, d_keypoints_);
+    downloadKeypoints(d_keypoints_, keypoints);
+}
+
+// Downloads a device keypoints matrix to the host and converts it to a KeyPoint vector.
+// The output vector is reset first; when d_keypoints is empty or has no columns the
+// result is an empty vector rather than whatever the vector held before the call.
+void cv::gpu::FAST_GPU::downloadKeypoints(const GpuMat& d_keypoints, std::vector<KeyPoint>& keypoints)
+{
+    keypoints.clear();
+
+    // operator() marks "no keypoints" by setting cols to 0 on a possibly allocated matrix
+    if (d_keypoints.empty() || d_keypoints.cols == 0)
+        return;
+
+    Mat h_keypoints(d_keypoints);
+    convertKeypoints(h_keypoints, keypoints);
+}
+
+// Converts a host keypoints matrix to a KeyPoint vector.
+// h_keypoints must have ROWS_COUNT rows of 4-byte elements: row LOCATION_ROW holds
+// short2 (x, y) locations and row RESPONSE_ROW holds float responses, one column per keypoint.
+// Every KeyPoint gets size FEATURE_SIZE and angle -1.
+// An empty matrix yields an empty vector, matching the resize() done for non-empty input.
+void cv::gpu::FAST_GPU::convertKeypoints(const Mat& h_keypoints, std::vector<KeyPoint>& keypoints)
+{
+    if (h_keypoints.empty())
+    {
+        // do not leave entries from an earlier call in the output
+        keypoints.clear();
+        return;
+    }
+
+    CV_Assert(h_keypoints.rows == ROWS_COUNT && h_keypoints.elemSize() == 4);
+
+    int npoints = h_keypoints.cols;
+
+    keypoints.resize(npoints);
+
+    const short2* loc_row = h_keypoints.ptr<short2>(LOCATION_ROW);
+    const float* response_row = h_keypoints.ptr<float>(RESPONSE_ROW);
+
+    for (int i = 0; i < npoints; ++i)
+    {
+        KeyPoint kp(loc_row[i].x, loc_row[i].y, static_cast<float>(FEATURE_SIZE), -1, response_row[i]);
+        keypoints[i] = kp;
+    }
+}
+
+// Device-side detection: finds the candidate locations, builds the final keypoints
+// matrix and sets its column count to the number of keypoints that were kept.
+void cv::gpu::FAST_GPU::operator ()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints)
+{
+    calcKeyPointsLocation(img, mask);
+
+    const int finalCount = getKeyPoints(keypoints);
+    keypoints.cols = finalCount;
+}
+
+namespace cv { namespace gpu { namespace device // device-side entry points called by FAST_GPU; NOTE(review): presumably defined in cuda/fast.cu - not visible here
+{
+    namespace fast 
+    {
+        int calcKeypoints_gpu(DevMem2Db img, DevMem2Db mask, short2* kpLoc, int maxKeypoints, DevMem2Di score, int threshold); // result is used as the candidate count (clamped to maxKeypoints by the caller)
+        int nonmaxSupression_gpu(const short2* kpLoc, int count, DevMem2Di score, short2* loc, float* response); // result is used as the final keypoint count
+    }
+}}}
+
+// First detection stage: fills kpLoc_ with candidate keypoint locations for img.
+// img must be CV_8UC1; mask must be empty or CV_8UC1 with the size of img.
+// Returns the number of candidates, which is also remembered in count_ for getKeyPoints().
+int cv::gpu::FAST_GPU::calcKeyPointsLocation(const GpuMat& img, const GpuMat& mask)
+{
+    using namespace cv::gpu::device::fast;
+
+    CV_Assert(img.type() == CV_8UC1);
+    CV_Assert(mask.empty() || (mask.type() == CV_8UC1 && mask.size() == img.size()));
+
+    // capacity of the location buffer, proportional to the image area
+    const int capacity = static_cast<int>(keypointsRatio * img.size().area());
+    ensureSizeIsEnough(1, capacity, CV_16SC2, kpLoc_);
+
+    // the score image is only allocated and zeroed when non-maximum suppression is on
+    if (nonmaxSupression)
+    {
+        ensureSizeIsEnough(img.size(), CV_32SC1, score_);
+        score_.setTo(Scalar::all(0));
+    }
+
+    const int found = calcKeypoints_gpu(img, mask, kpLoc_.ptr<short2>(), capacity, nonmaxSupression ? score_ : DevMem2Di(), threshold);
+
+    // never report more candidates than kpLoc_ can hold
+    count_ = std::min(found, capacity);
+    return count_;
+}
+
+// Second detection stage: builds the ROWS_COUNT x count_ keypoints matrix from the
+// candidates found by calcKeyPointsLocation() and returns the final keypoint count.
+// With no candidates, keypoints is left untouched and 0 is returned.
+int cv::gpu::FAST_GPU::getKeyPoints(GpuMat& keypoints)
+{
+    using namespace cv::gpu::device::fast;
+
+    if (count_ == 0)
+        return 0;
+
+    ensureSizeIsEnough(ROWS_COUNT, count_, CV_32FC1, keypoints);
+
+    if (!nonmaxSupression)
+    {
+        // no suppression: copy every candidate location and give it a zero response
+        GpuMat locationRow(1, count_, kpLoc_.type(), keypoints.ptr(LOCATION_ROW));
+        kpLoc_.colRange(0, count_).copyTo(locationRow);
+        keypoints.row(RESPONSE_ROW).setTo(Scalar::all(0));
+
+        return count_;
+    }
+
+    // suppression writes the surviving locations and responses directly into keypoints
+    return nonmaxSupression_gpu(kpLoc_.ptr<short2>(), count_, score_, keypoints.ptr<short2>(LOCATION_ROW), keypoints.ptr<float>(RESPONSE_ROW));
+}
+
+// Releases the three internal device buffers; the detector settings are not changed.
+void cv::gpu::FAST_GPU::release()
+{
+    d_keypoints_.release();
+
+    score_.release();
+    kpLoc_.release();
+}
+
+#endif /* !defined (HAVE_CUDA) */
--- a/modules/gpu/src/orb.cpp
+++ b/modules/gpu/src/orb.cpp
@@ -0,0 +1,764 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other GpuMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or bpied warranties, including, but not limited to, the bpied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+using namespace std;
+using namespace cv;
+using namespace cv::gpu;
+
+#if !defined (HAVE_CUDA)
+
+// Variant compiled when HAVE_CUDA is not defined: every ORB_GPU member is reduced to a
+// call to throw_nogpu() and ignores its arguments.
+cv::gpu::ORB_GPU::ORB_GPU(size_t, const ORB::CommonParams&) : fastDetector_(0) { throw_nogpu(); }
+void cv::gpu::ORB_GPU::operator()(const GpuMat&, const GpuMat&, std::vector<KeyPoint>&) { throw_nogpu(); }
+void cv::gpu::ORB_GPU::operator()(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); }
+void cv::gpu::ORB_GPU::operator()(const GpuMat&, const GpuMat&, std::vector<KeyPoint>&, GpuMat&) { throw_nogpu(); }
+void cv::gpu::ORB_GPU::operator()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&) { throw_nogpu(); }
+void cv::gpu::ORB_GPU::downloadKeyPoints(GpuMat&, std::vector<KeyPoint>&) { throw_nogpu(); }
+void cv::gpu::ORB_GPU::convertKeyPoints(Mat&, std::vector<KeyPoint>&) { throw_nogpu(); }
+void cv::gpu::ORB_GPU::setParams(size_t, const ORB::CommonParams&) { throw_nogpu(); }
+void cv::gpu::ORB_GPU::release() { throw_nogpu(); }
+void cv::gpu::ORB_GPU::buildScalePyramids(const GpuMat&, const GpuMat&) { throw_nogpu(); }
+void cv::gpu::ORB_GPU::computeKeyPointsPyramid() { throw_nogpu(); }
+void cv::gpu::ORB_GPU::computeDescriptors(GpuMat&) { throw_nogpu(); }
+void cv::gpu::ORB_GPU::mergeKeyPoints(GpuMat&) { throw_nogpu(); }
+
+#else /* !defined (HAVE_CUDA) */
+
+// Declarations of the ORB routines called by ORB_GPU below. Their definitions are not
+// part of this file (presumably the CUDA sources of the module -- TODO confirm).
+namespace cv { namespace gpu { namespace device 
+{
+    namespace orb
+    {
+        // Used by cull(): receives the location and response rows, the current keypoint
+        // count (size) and the wanted count (n_points); the result is used as the new count.
+        int cull_gpu(int* loc, float* response, int size, int n_points);
+
+        // Used by computeKeyPointsPyramid() to overwrite the response row with Harris
+        // responses (called there with blockSize 7 and HARRIS_K).
+        void HarrisResponses_gpu(DevMem2Db img, const short2* loc, float* response, const int npoints, int blockSize, float harris_k, cudaStream_t stream);
+
+        // Used by setParams() to hand over the u_max table and its element count.
+        void loadUMax(const int* u_max, int count);
+
+        // Used by computeKeyPointsPyramid() to fill the angle row; half_k is half of the patch size.
+        void IC_Angle_gpu(DevMem2Db image, const short2* loc, float* angle, int npoints, int half_k, cudaStream_t stream);
+
+        // Used by computeDescriptors(); pattern_x / pattern_y are the two rows of the
+        // sampling pattern uploaded by setParams().
+        void computeOrbDescriptor_gpu(PtrStepb img, const short2* loc, const float* angle, const int npoints,
+            const int* pattern_x, const int* pattern_y, PtrStepb desc, int dsize, int WTA_K, cudaStream_t stream);
+
+        // Used by mergeKeyPoints() to produce the x / y rows of the merged keypoint matrix
+        // from the per-level locations and the level scale.
+        void mergeLocation_gpu(const short2* loc, float* x, float* y, int npoints, float scale, cudaStream_t stream);
+    }
+}}}
+
+// Creates the detector/extractor for n_features keypoints with the given common ORB
+// parameters. The internal FAST detector starts with DEFAULT_FAST_THRESHOLD, the
+// descriptor blur filter is a 7x7 Gaussian (sigma 2 in both directions,
+// BORDER_REFLECT_101) and blurring before description is switched off.
+cv::gpu::ORB_GPU::ORB_GPU(size_t n_features, const ORB::CommonParams& detector_params) :
+    fastDetector_(DEFAULT_FAST_THRESHOLD)
+{
+    blurForDescriptor = false;
+
+    setParams(n_features, detector_params);
+
+    const Size blurKernel(7, 7);
+    blurFilter = createGaussianFilter_GPU(CV_8UC1, blurKernel, 2, 2, BORDER_REFLECT_101);
+}
+
+namespace
+{
+    // Passed as the harris_k argument of HarrisResponses_gpu in computeKeyPointsPyramid().
+    const float HARRIS_K = 0.04f;
+    // Not referenced in the visible part of this file (the code uses descriptorSize());
+    // presumably the descriptor length in bytes -- TODO confirm.
+    const int DESCRIPTOR_SIZE = 32;
+
+    const int bit_pattern_31_[256 * 4] =
+    {
+        8,-3, 9,5/*mean (0), correlation (0)*/,
+        4,2, 7,-12/*mean (1.12461e-05), correlation (0.0437584)*/,
+        -11,9, -8,2/*mean (3.37382e-05), correlation (0.0617409)*/,
+        7,-12, 12,-13/*mean (5.62303e-05), correlation (0.0636977)*/,
+        2,-13, 2,12/*mean (0.000134953), correlation (0.085099)*/,
+        1,-7, 1,6/*mean (0.000528565), correlation (0.0857175)*/,
+        -2,-10, -2,-4/*mean (0.0188821), correlation (0.0985774)*/,
+        -13,-13, -11,-8/*mean (0.0363135), correlation (0.0899616)*/,
+        -13,-3, -12,-9/*mean (0.121806), correlation (0.099849)*/,
+        10,4, 11,9/*mean (0.122065), correlation (0.093285)*/,
+        -13,-8, -8,-9/*mean (0.162787), correlation (0.0942748)*/,
+        -11,7, -9,12/*mean (0.21561), correlation (0.0974438)*/,
+        7,7, 12,6/*mean (0.160583), correlation (0.130064)*/,
+        -4,-5, -3,0/*mean (0.228171), correlation (0.132998)*/,
+        -13,2, -12,-3/*mean (0.00997526), correlation (0.145926)*/,
+        -9,0, -7,5/*mean (0.198234), correlation (0.143636)*/,
+        12,-6, 12,-1/*mean (0.0676226), correlation (0.16689)*/,
+        -3,6, -2,12/*mean (0.166847), correlation (0.171682)*/,
+        -6,-13, -4,-8/*mean (0.101215), correlation (0.179716)*/,
+        11,-13, 12,-8/*mean (0.200641), correlation (0.192279)*/,
+        4,7, 5,1/*mean (0.205106), correlation (0.186848)*/,
+        5,-3, 10,-3/*mean (0.234908), correlation (0.192319)*/,
+        3,-7, 6,12/*mean (0.0709964), correlation (0.210872)*/,
+        -8,-7, -6,-2/*mean (0.0939834), correlation (0.212589)*/,
+        -2,11, -1,-10/*mean (0.127778), correlation (0.20866)*/,
+        -13,12, -8,10/*mean (0.14783), correlation (0.206356)*/,
+        -7,3, -5,-3/*mean (0.182141), correlation (0.198942)*/,
+        -4,2, -3,7/*mean (0.188237), correlation (0.21384)*/,
+        -10,-12, -6,11/*mean (0.14865), correlation (0.23571)*/,
+        5,-12, 6,-7/*mean (0.222312), correlation (0.23324)*/,
+        5,-6, 7,-1/*mean (0.229082), correlation (0.23389)*/,
+        1,0, 4,-5/*mean (0.241577), correlation (0.215286)*/,
+        9,11, 11,-13/*mean (0.00338507), correlation (0.251373)*/,
+        4,7, 4,12/*mean (0.131005), correlation (0.257622)*/,
+        2,-1, 4,4/*mean (0.152755), correlation (0.255205)*/,
+        -4,-12, -2,7/*mean (0.182771), correlation (0.244867)*/,
+        -8,-5, -7,-10/*mean (0.186898), correlation (0.23901)*/,
+        4,11, 9,12/*mean (0.226226), correlation (0.258255)*/,
+        0,-8, 1,-13/*mean (0.0897886), correlation (0.274827)*/,
+        -13,-2, -8,2/*mean (0.148774), correlation (0.28065)*/,
+        -3,-2, -2,3/*mean (0.153048), correlation (0.283063)*/,
+        -6,9, -4,-9/*mean (0.169523), correlation (0.278248)*/,
+        8,12, 10,7/*mean (0.225337), correlation (0.282851)*/,
+        0,9, 1,3/*mean (0.226687), correlation (0.278734)*/,
+        7,-5, 11,-10/*mean (0.00693882), correlation (0.305161)*/,
+        -13,-6, -11,0/*mean (0.0227283), correlation (0.300181)*/,
+        10,7, 12,1/*mean (0.125517), correlation (0.31089)*/,
+        -6,-3, -6,12/*mean (0.131748), correlation (0.312779)*/,
+        10,-9, 12,-4/*mean (0.144827), correlation (0.292797)*/,
+        -13,8, -8,-12/*mean (0.149202), correlation (0.308918)*/,
+        -13,0, -8,-4/*mean (0.160909), correlation (0.310013)*/,
+        3,3, 7,8/*mean (0.177755), correlation (0.309394)*/,
+        5,7, 10,-7/*mean (0.212337), correlation (0.310315)*/,
+        -1,7, 1,-12/*mean (0.214429), correlation (0.311933)*/,
+        3,-10, 5,6/*mean (0.235807), correlation (0.313104)*/,
+        2,-4, 3,-10/*mean (0.00494827), correlation (0.344948)*/,
+        -13,0, -13,5/*mean (0.0549145), correlation (0.344675)*/,
+        -13,-7, -12,12/*mean (0.103385), correlation (0.342715)*/,
+        -13,3, -11,8/*mean (0.134222), correlation (0.322922)*/,
+        -7,12, -4,7/*mean (0.153284), correlation (0.337061)*/,
+        6,-10, 12,8/*mean (0.154881), correlation (0.329257)*/,
+        -9,-1, -7,-6/*mean (0.200967), correlation (0.33312)*/,
+        -2,-5, 0,12/*mean (0.201518), correlation (0.340635)*/,
+        -12,5, -7,5/*mean (0.207805), correlation (0.335631)*/,
+        3,-10, 8,-13/*mean (0.224438), correlation (0.34504)*/,
+        -7,-7, -4,5/*mean (0.239361), correlation (0.338053)*/,
+        -3,-2, -1,-7/*mean (0.240744), correlation (0.344322)*/,
+        2,9, 5,-11/*mean (0.242949), correlation (0.34145)*/,
+        -11,-13, -5,-13/*mean (0.244028), correlation (0.336861)*/,
+        -1,6, 0,-1/*mean (0.247571), correlation (0.343684)*/,
+        5,-3, 5,2/*mean (0.000697256), correlation (0.357265)*/,
+        -4,-13, -4,12/*mean (0.00213675), correlation (0.373827)*/,
+        -9,-6, -9,6/*mean (0.0126856), correlation (0.373938)*/,
+        -12,-10, -8,-4/*mean (0.0152497), correlation (0.364237)*/,
+        10,2, 12,-3/*mean (0.0299933), correlation (0.345292)*/,
+        7,12, 12,12/*mean (0.0307242), correlation (0.366299)*/,
+        -7,-13, -6,5/*mean (0.0534975), correlation (0.368357)*/,
+        -4,9, -3,4/*mean (0.099865), correlation (0.372276)*/,
+        7,-1, 12,2/*mean (0.117083), correlation (0.364529)*/,
+        -7,6, -5,1/*mean (0.126125), correlation (0.369606)*/,
+        -13,11, -12,5/*mean (0.130364), correlation (0.358502)*/,
+        -3,7, -2,-6/*mean (0.131691), correlation (0.375531)*/,
+        7,-8, 12,-7/*mean (0.160166), correlation (0.379508)*/,
+        -13,-7, -11,-12/*mean (0.167848), correlation (0.353343)*/,
+        1,-3, 12,12/*mean (0.183378), correlation (0.371916)*/,
+        2,-6, 3,0/*mean (0.228711), correlation (0.371761)*/,
+        -4,3, -2,-13/*mean (0.247211), correlation (0.364063)*/,
+        -1,-13, 1,9/*mean (0.249325), correlation (0.378139)*/,
+        7,1, 8,-6/*mean (0.000652272), correlation (0.411682)*/,
+        1,-1, 3,12/*mean (0.00248538), correlation (0.392988)*/,
+        9,1, 12,6/*mean (0.0206815), correlation (0.386106)*/,
+        -1,-9, -1,3/*mean (0.0364485), correlation (0.410752)*/,
+        -13,-13, -10,5/*mean (0.0376068), correlation (0.398374)*/,
+        7,7, 10,12/*mean (0.0424202), correlation (0.405663)*/,
+        12,-5, 12,9/*mean (0.0942645), correlation (0.410422)*/,
+        6,3, 7,11/*mean (0.1074), correlation (0.413224)*/,
+        5,-13, 6,10/*mean (0.109256), correlation (0.408646)*/,
+        2,-12, 2,3/*mean (0.131691), correlation (0.416076)*/,
+        3,8, 4,-6/*mean (0.165081), correlation (0.417569)*/,
+        2,6, 12,-13/*mean (0.171874), correlation (0.408471)*/,
+        9,-12, 10,3/*mean (0.175146), correlation (0.41296)*/,
+        -8,4, -7,9/*mean (0.183682), correlation (0.402956)*/,
+        -11,12, -4,-6/*mean (0.184672), correlation (0.416125)*/,
+        1,12, 2,-8/*mean (0.191487), correlation (0.386696)*/,
+        6,-9, 7,-4/*mean (0.192668), correlation (0.394771)*/,
+        2,3, 3,-2/*mean (0.200157), correlation (0.408303)*/,
+        6,3, 11,0/*mean (0.204588), correlation (0.411762)*/,
+        3,-3, 8,-8/*mean (0.205904), correlation (0.416294)*/,
+        7,8, 9,3/*mean (0.213237), correlation (0.409306)*/,
+        -11,-5, -6,-4/*mean (0.243444), correlation (0.395069)*/,
+        -10,11, -5,10/*mean (0.247672), correlation (0.413392)*/,
+        -5,-8, -3,12/*mean (0.24774), correlation (0.411416)*/,
+        -10,5, -9,0/*mean (0.00213675), correlation (0.454003)*/,
+        8,-1, 12,-6/*mean (0.0293635), correlation (0.455368)*/,
+        4,-6, 6,-11/*mean (0.0404971), correlation (0.457393)*/,
+        -10,12, -8,7/*mean (0.0481107), correlation (0.448364)*/,
+        4,-2, 6,7/*mean (0.050641), correlation (0.455019)*/,
+        -2,0, -2,12/*mean (0.0525978), correlation (0.44338)*/,
+        -5,-8, -5,2/*mean (0.0629667), correlation (0.457096)*/,
+        7,-6, 10,12/*mean (0.0653846), correlation (0.445623)*/,
+        -9,-13, -8,-8/*mean (0.0858749), correlation (0.449789)*/,
+        -5,-13, -5,-2/*mean (0.122402), correlation (0.450201)*/,
+        8,-8, 9,-13/*mean (0.125416), correlation (0.453224)*/,
+        -9,-11, -9,0/*mean (0.130128), correlation (0.458724)*/,
+        1,-8, 1,-2/*mean (0.132467), correlation (0.440133)*/,
+        7,-4, 9,1/*mean (0.132692), correlation (0.454)*/,
+        -2,1, -1,-4/*mean (0.135695), correlation (0.455739)*/,
+        11,-6, 12,-11/*mean (0.142904), correlation (0.446114)*/,
+        -12,-9, -6,4/*mean (0.146165), correlation (0.451473)*/,
+        3,7, 7,12/*mean (0.147627), correlation (0.456643)*/,
+        5,5, 10,8/*mean (0.152901), correlation (0.455036)*/,
+        0,-4, 2,8/*mean (0.167083), correlation (0.459315)*/,
+        -9,12, -5,-13/*mean (0.173234), correlation (0.454706)*/,
+        0,7, 2,12/*mean (0.18312), correlation (0.433855)*/,
+        -1,2, 1,7/*mean (0.185504), correlation (0.443838)*/,
+        5,11, 7,-9/*mean (0.185706), correlation (0.451123)*/,
+        3,5, 6,-8/*mean (0.188968), correlation (0.455808)*/,
+        -13,-4, -8,9/*mean (0.191667), correlation (0.459128)*/,
+        -5,9, -3,-3/*mean (0.193196), correlation (0.458364)*/,
+        -4,-7, -3,-12/*mean (0.196536), correlation (0.455782)*/,
+        6,5, 8,0/*mean (0.1972), correlation (0.450481)*/,
+        -7,6, -6,12/*mean (0.199438), correlation (0.458156)*/,
+        -13,6, -5,-2/*mean (0.211224), correlation (0.449548)*/,
+        1,-10, 3,10/*mean (0.211718), correlation (0.440606)*/,
+        4,1, 8,-4/*mean (0.213034), correlation (0.443177)*/,
+        -2,-2, 2,-13/*mean (0.234334), correlation (0.455304)*/,
+        2,-12, 12,12/*mean (0.235684), correlation (0.443436)*/,
+        -2,-13, 0,-6/*mean (0.237674), correlation (0.452525)*/,
+        4,1, 9,3/*mean (0.23962), correlation (0.444824)*/,
+        -6,-10, -3,-5/*mean (0.248459), correlation (0.439621)*/,
+        -3,-13, -1,1/*mean (0.249505), correlation (0.456666)*/,
+        7,5, 12,-11/*mean (0.00119208), correlation (0.495466)*/,
+        4,-2, 5,-7/*mean (0.00372245), correlation (0.484214)*/,
+        -13,9, -9,-5/*mean (0.00741116), correlation (0.499854)*/,
+        7,1, 8,6/*mean (0.0208952), correlation (0.499773)*/,
+        7,-8, 7,6/*mean (0.0220085), correlation (0.501609)*/,
+        -7,-4, -7,1/*mean (0.0233806), correlation (0.496568)*/,
+        -8,11, -7,-8/*mean (0.0236505), correlation (0.489719)*/,
+        -13,6, -12,-8/*mean (0.0268781), correlation (0.503487)*/,
+        2,4, 3,9/*mean (0.0323324), correlation (0.501938)*/,
+        10,-5, 12,3/*mean (0.0399235), correlation (0.494029)*/,
+        -6,-5, -6,7/*mean (0.0420153), correlation (0.486579)*/,
+        8,-3, 9,-8/*mean (0.0548021), correlation (0.484237)*/,
+        2,-12, 2,8/*mean (0.0616622), correlation (0.496642)*/,
+        -11,-2, -10,3/*mean (0.0627755), correlation (0.498563)*/,
+        -12,-13, -7,-9/*mean (0.0829622), correlation (0.495491)*/,
+        -11,0, -10,-5/*mean (0.0843342), correlation (0.487146)*/,
+        5,-3, 11,8/*mean (0.0929937), correlation (0.502315)*/,
+        -2,-13, -1,12/*mean (0.113327), correlation (0.48941)*/,
+        -1,-8, 0,9/*mean (0.132119), correlation (0.467268)*/,
+        -13,-11, -12,-5/*mean (0.136269), correlation (0.498771)*/,
+        -10,-2, -10,11/*mean (0.142173), correlation (0.498714)*/,
+        -3,9, -2,-13/*mean (0.144141), correlation (0.491973)*/,
+        2,-3, 3,2/*mean (0.14892), correlation (0.500782)*/,
+        -9,-13, -4,0/*mean (0.150371), correlation (0.498211)*/,
+        -4,6, -3,-10/*mean (0.152159), correlation (0.495547)*/,
+        -4,12, -2,-7/*mean (0.156152), correlation (0.496925)*/,
+        -6,-11, -4,9/*mean (0.15749), correlation (0.499222)*/,
+        6,-3, 6,11/*mean (0.159211), correlation (0.503821)*/,
+        -13,11, -5,5/*mean (0.162427), correlation (0.501907)*/,
+        11,11, 12,6/*mean (0.16652), correlation (0.497632)*/,
+        7,-5, 12,-2/*mean (0.169141), correlation (0.484474)*/,
+        -1,12, 0,7/*mean (0.169456), correlation (0.495339)*/,
+        -4,-8, -3,-2/*mean (0.171457), correlation (0.487251)*/,
+        -7,1, -6,7/*mean (0.175), correlation (0.500024)*/,
+        -13,-12, -8,-13/*mean (0.175866), correlation (0.497523)*/,
+        -7,-2, -6,-8/*mean (0.178273), correlation (0.501854)*/,
+        -8,5, -6,-9/*mean (0.181107), correlation (0.494888)*/,
+        -5,-1, -4,5/*mean (0.190227), correlation (0.482557)*/,
+        -13,7, -8,10/*mean (0.196739), correlation (0.496503)*/,
+        1,5, 5,-13/*mean (0.19973), correlation (0.499759)*/,
+        1,0, 10,-13/*mean (0.204465), correlation (0.49873)*/,
+        9,12, 10,-1/*mean (0.209334), correlation (0.49063)*/,
+        5,-8, 10,-9/*mean (0.211134), correlation (0.503011)*/,
+        -1,11, 1,-13/*mean (0.212), correlation (0.499414)*/,
+        -9,-3, -6,2/*mean (0.212168), correlation (0.480739)*/,
+        -1,-10, 1,12/*mean (0.212731), correlation (0.502523)*/,
+        -13,1, -8,-10/*mean (0.21327), correlation (0.489786)*/,
+        8,-11, 10,-6/*mean (0.214159), correlation (0.488246)*/,
+        2,-13, 3,-6/*mean (0.216993), correlation (0.50287)*/,
+        7,-13, 12,-9/*mean (0.223639), correlation (0.470502)*/,
+        -10,-10, -5,-7/*mean (0.224089), correlation (0.500852)*/,
+        -10,-8, -8,-13/*mean (0.228666), correlation (0.502629)*/,
+        4,-6, 8,5/*mean (0.22906), correlation (0.498305)*/,
+        3,12, 8,-13/*mean (0.233378), correlation (0.503825)*/,
+        -4,2, -3,-3/*mean (0.234323), correlation (0.476692)*/,
+        5,-13, 10,-12/*mean (0.236392), correlation (0.475462)*/,
+        4,-13, 5,-1/*mean (0.236842), correlation (0.504132)*/,
+        -9,9, -4,3/*mean (0.236977), correlation (0.497739)*/,
+        0,3, 3,-9/*mean (0.24314), correlation (0.499398)*/,
+        -12,1, -6,1/*mean (0.243297), correlation (0.489447)*/,
+        3,2, 4,-8/*mean (0.00155196), correlation (0.553496)*/,
+        -10,-10, -10,9/*mean (0.00239541), correlation (0.54297)*/,
+        8,-13, 12,12/*mean (0.0034413), correlation (0.544361)*/,
+        -8,-12, -6,-5/*mean (0.003565), correlation (0.551225)*/,
+        2,2, 3,7/*mean (0.00835583), correlation (0.55285)*/,
+        10,6, 11,-8/*mean (0.00885065), correlation (0.540913)*/,
+        6,8, 8,-12/*mean (0.0101552), correlation (0.551085)*/,
+        -7,10, -6,5/*mean (0.0102227), correlation (0.533635)*/,
+        -3,-9, -3,9/*mean (0.0110211), correlation (0.543121)*/,
+        -1,-13, -1,5/*mean (0.0113473), correlation (0.550173)*/,
+        -3,-7, -3,4/*mean (0.0140913), correlation (0.554774)*/,
+        -8,-2, -8,3/*mean (0.017049), correlation (0.55461)*/,
+        4,2, 12,12/*mean (0.01778), correlation (0.546921)*/,
+        2,-5, 3,11/*mean (0.0224022), correlation (0.549667)*/,
+        6,-9, 11,-13/*mean (0.029161), correlation (0.546295)*/,
+        3,-1, 7,12/*mean (0.0303081), correlation (0.548599)*/,
+        11,-1, 12,4/*mean (0.0355151), correlation (0.523943)*/,
+        -3,0, -3,6/*mean (0.0417904), correlation (0.543395)*/,
+        4,-11, 4,12/*mean (0.0487292), correlation (0.542818)*/,
+        2,-4, 2,1/*mean (0.0575124), correlation (0.554888)*/,
+        -10,-6, -8,1/*mean (0.0594242), correlation (0.544026)*/,
+        -13,7, -11,1/*mean (0.0597391), correlation (0.550524)*/,
+        -13,12, -11,-13/*mean (0.0608974), correlation (0.55383)*/,
+        6,0, 11,-13/*mean (0.065126), correlation (0.552006)*/,
+        0,-1, 1,4/*mean (0.074224), correlation (0.546372)*/,
+        -13,3, -9,-2/*mean (0.0808592), correlation (0.554875)*/,
+        -9,8, -6,-3/*mean (0.0883378), correlation (0.551178)*/,
+        -13,-6, -8,-2/*mean (0.0901035), correlation (0.548446)*/,
+        5,-9, 8,10/*mean (0.0949843), correlation (0.554694)*/,
+        2,7, 3,-9/*mean (0.0994152), correlation (0.550979)*/,
+        -1,-6, -1,-1/*mean (0.10045), correlation (0.552714)*/,
+        9,5, 11,-2/*mean (0.100686), correlation (0.552594)*/,
+        11,-3, 12,-8/*mean (0.101091), correlation (0.532394)*/,
+        3,0, 3,5/*mean (0.101147), correlation (0.525576)*/,
+        -1,4, 0,10/*mean (0.105263), correlation (0.531498)*/,
+        3,-6, 4,5/*mean (0.110785), correlation (0.540491)*/,
+        -13,0, -10,5/*mean (0.112798), correlation (0.536582)*/,
+        5,8, 12,11/*mean (0.114181), correlation (0.555793)*/,
+        8,9, 9,-6/*mean (0.117431), correlation (0.553763)*/,
+        7,-4, 8,-12/*mean (0.118522), correlation (0.553452)*/,
+        -10,4, -10,9/*mean (0.12094), correlation (0.554785)*/,
+        7,3, 12,4/*mean (0.122582), correlation (0.555825)*/,
+        9,-7, 10,-2/*mean (0.124978), correlation (0.549846)*/,
+        7,0, 12,-2/*mean (0.127002), correlation (0.537452)*/,
+        -1,-6, 0,-11/*mean (0.127148), correlation (0.547401)*/
+    };    
+ 
+    // Builds the sampling pattern for tuples of tupleSize points.
+    //   pattern0  - pool of candidate points, poolSize entries
+    //   pattern   - output, 2 x (ntuples * tupleSize) CV_32SC1: row 0 holds x, row 1 holds y
+    // Every tuple is filled with points drawn at random from the pool (fixed seed, so the
+    // result is the same on each run); a draw that repeats a point already present in the
+    // same tuple is rejected and repeated.
+    void initializeOrbPattern(const Point* pattern0, Mat& pattern, int ntuples, int tupleSize, int poolSize)
+    {
+        RNG rng(0x12345678);
+
+        pattern.create(2, ntuples * tupleSize, CV_32SC1);
+        pattern.setTo(Scalar::all(0));
+
+        int* const xs = pattern.ptr<int>(0);
+        int* const ys = pattern.ptr<int>(1);
+
+        for (int tuple = 0; tuple < ntuples; ++tuple)
+        {
+            // index of the first slot of this tuple in both rows
+            const int base = tupleSize * tuple;
+
+            int filled = 0;
+            while (filled < tupleSize)
+            {
+                const Point candidate = pattern0[rng.uniform(0, poolSize)];
+
+                // is the candidate already part of the current tuple?
+                bool duplicate = false;
+                for (int j = 0; j < filled && !duplicate; ++j)
+                    duplicate = xs[base + j] == candidate.x && ys[base + j] == candidate.y;
+
+                if (duplicate)
+                    continue;
+
+                xs[base + filled] = candidate.x;
+                ys[base + filled] = candidate.y;
+                ++filled;
+            }
+        }
+    }
+
+    // Fills pattern[0 .. npoints) with random points whose coordinates are drawn with
+    // rng.uniform(-patchSize / 2, patchSize / 2 + 1), x first and then y for each point.
+    void makeRandomPattern(int patchSize, Point* pattern, int npoints)
+    {
+        // the seed is fixed so that every run produces the same pattern
+        RNG rng(0x34985739);
+
+        const int lowerBound = -patchSize / 2;
+        const int upperBound = patchSize / 2 + 1;
+
+        for (int i = 0; i < npoints; ++i)
+        {
+            Point& pt = pattern[i];
+
+            pt.x = rng.uniform(lowerBound, upperBound);
+            pt.y = rng.uniform(lowerBound, upperBound);
+        }
+    }
+}
+
+// Stores the parameters and recomputes everything derived from them:
+//   - n_features_per_level_: how many keypoints each pyramid level should keep,
+//   - the u_max table, handed to the device code through loadUMax(),
+//   - the descriptor sampling pattern, uploaded into pattern_.
+// Asserts that the u_max table has fewer than 32 entries and that WTA_K_ is 2, 3 or 4.
+void cv::gpu::ORB_GPU::setParams(size_t n_features, const ORB::CommonParams& detector_params)
+{
+    params_ = detector_params;
+    
+    // Distribute n_features over the levels as a geometric progression with ratio
+    // factor = 1 / scale_factor_: the first level gets
+    // n_features * (1 - factor) / (1 - factor^n_levels), each following level factor times less.
+    // NOTE(review): scale_factor_ == 1 makes the denominator zero, and n_levels == 0 makes
+    // the [n_levels - 1] access below invalid -- neither is checked here.
+    int n_levels = static_cast<int>(params_.n_levels_);
+    float factor = 1.0f / params_.scale_factor_;
+    float n_desired_features_per_scale = n_features * (1.0f - factor) / (1.0f - std::pow(factor, n_levels));
+    
+    n_features_per_level_.resize(n_levels);
+    int sum_n_features = 0;
+    for (int level = 0; level < n_levels - 1; ++level)
+    {
+        n_features_per_level_[level] = cvRound(n_desired_features_per_scale);
+        sum_n_features += n_features_per_level_[level];
+        n_desired_features_per_scale *= factor;
+    }
+    // The last level receives the remainder, so the per-level counts add up to n_features.
+    n_features_per_level_[n_levels - 1] = n_features - sum_n_features;
+
+    // pre-compute the end of a row in a circular patch:
+    // u_max[v] = round(sqrt(half_patch_size^2 - v^2)) for the rows up to half_patch_size * sqrt(2) / 2 + 1
+    // NOTE(review): u_max has half_patch_size + 1 entries; for very small patch sizes the
+    // upper bound of this loop may exceed that -- confirm the supported patch_size_ range.
+    int half_patch_size = params_.patch_size_ / 2;
+    vector<int> u_max(half_patch_size + 1);
+    for (int v = 0; v <= half_patch_size * sqrt(2.f) / 2 + 1; ++v)
+        u_max[v] = cvRound(sqrt(static_cast<float>(half_patch_size * half_patch_size - v * v)));
+    
+    // Make sure we are symmetric: the remaining rows (from half_patch_size downwards) are
+    // derived from the entries computed above; v_0 walks up through them, skipping runs of
+    // equal values.
+    for (int v = half_patch_size, v_0 = 0; v >= half_patch_size * sqrt(2.f) / 2; --v)
+    {
+        while (u_max[v_0] == u_max[v_0 + 1])
+            ++v_0;
+        u_max[v] = v_0;
+        ++v_0;
+    }
+    // NOTE(review): the limit of 32 presumably matches the size of the table on the device
+    // side -- confirm against loadUMax.
+    CV_Assert(u_max.size() < 32);
+    cv::gpu::device::orb::loadUMax(&u_max[0], u_max.size());
+    
+    // Calc pattern: for patch size 31 the built-in table bit_pattern_31_ is read as
+    // npoints Points (this relies on Point being laid out as two consecutive ints);
+    // for any other patch size a random pattern is generated.
+    const int npoints = 512;
+    Point pattern_buf[npoints];
+    const Point* pattern0 = (const Point*)bit_pattern_31_;
+    if (params_.patch_size_ != 31)
+    {
+        pattern0 = pattern_buf;
+        makeRandomPattern(params_.patch_size_, pattern_buf, npoints);
+    }
+    
+    CV_Assert(params_.WTA_K_ == 2 || params_.WTA_K_ == 3 || params_.WTA_K_ == 4);    
+
+    Mat h_pattern;
+
+    if (params_.WTA_K_ == 2)
+    {
+        // WTA_K_ == 2: use the points as they are, x in row 0 and y in row 1.
+        h_pattern.create(2, npoints, CV_32SC1);
+        
+        int* pattern_x_ptr = h_pattern.ptr<int>(0);
+        int* pattern_y_ptr = h_pattern.ptr<int>(1);
+
+        for (int i = 0; i < npoints; ++i)
+        {
+            pattern_x_ptr[i] = pattern0[i].x;
+            pattern_y_ptr[i] = pattern0[i].y;
+        }
+    }
+    else
+    {
+        // WTA_K_ == 3 or 4: build descriptorSize() * 4 tuples of WTA_K_ points each,
+        // drawn from the pool of npoints points.
+        int ntuples = descriptorSize() * 4;
+        initializeOrbPattern(pattern0, h_pattern, ntuples, params_.WTA_K_, npoints);
+    }
+
+    pattern_.upload(h_pattern);
+}
+
+namespace
+{
+    // Scale of pyramid level `level`: scale_factor_ raised to the power
+    // (level - first_level_), so the first level has scale 1.
+    inline float getScale(const ORB::CommonParams& params, int level)
+    {
+        const int levelOffset = level - static_cast<int>(params.first_level_);
+
+        return pow(params.scale_factor_, levelOffset);
+    }
+}
+
+// Builds the image pyramid (imagePyr_) and the matching mask pyramid (maskPyr_).
+//   image - input image, must be CV_8UC1
+//   mask  - either empty or a CV_8UC1 matrix of the same size as image
+// Level sizes are the image size divided by getScale(params_, level). The level equal to
+// first_level_ is a copy of the input, lower levels are resized from the input and
+// higher levels from the previous pyramid level. In every level mask a border of
+// edge_threshold_ pixels is cleared, so no keypoints are detected there.
+void cv::gpu::ORB_GPU::buildScalePyramids(const GpuMat& image, const GpuMat& mask)
+{
+    CV_Assert(image.type() == CV_8UC1);
+    CV_Assert(mask.empty() || (mask.type() == CV_8UC1 && mask.size() == image.size()));
+
+    const int levelCount = static_cast<int>(params_.n_levels_);
+    const int firstLevel = static_cast<int>(params_.first_level_);
+    const bool hasMask = !mask.empty();
+
+    imagePyr_.resize(params_.n_levels_);
+    maskPyr_.resize(params_.n_levels_);
+
+    for (int level = 0; level < levelCount; ++level)
+    {
+        const float invScale = 1.0f / getScale(params_, level);
+        const Size levelSize(cvRound(image.cols * invScale), cvRound(image.rows * invScale));
+
+        GpuMat& levelImage = imagePyr_[level];
+        GpuMat& levelMask = maskPyr_[level];
+
+        ensureSizeIsEnough(levelSize, image.type(), levelImage);
+        ensureSizeIsEnough(levelSize, CV_8UC1, levelMask);
+
+        // without a user mask every pixel of the level is allowed
+        levelMask.setTo(Scalar::all(255));
+
+        if (level == firstLevel)
+        {
+            // this level has the size of the input: plain copy
+            image.copyTo(levelImage);
+
+            if (hasMask)
+                mask.copyTo(levelMask);
+        }
+        else if (level < firstLevel)
+        {
+            // levels below the first one are resized from the input itself
+            resize(image, levelImage, levelSize, 0, 0, INTER_LINEAR);
+
+            if (hasMask)
+                resize(mask, levelMask, levelSize, 0, 0, INTER_LINEAR);
+        }
+        else
+        {
+            // levels above the first one are resized from the previous level
+            resize(imagePyr_[level - 1], levelImage, levelSize, 0, 0, INTER_LINEAR);
+
+            if (hasMask)
+                resize(maskPyr_[level - 1], levelMask, levelSize, 0, 0, INTER_LINEAR);
+        }
+
+        // Border filter: buf_ is 255 inside the rectangle that lies edge_threshold_ pixels
+        // away from every side and 0 elsewhere; AND-ing it into the mask removes the border.
+        ensureSizeIsEnough(levelSize, CV_8UC1, buf_);
+        buf_.setTo(Scalar::all(0));
+
+        Rect inner(params_.edge_threshold_, params_.edge_threshold_,
+                   levelSize.width - 2 * params_.edge_threshold_, levelSize.height - 2 * params_.edge_threshold_);
+        buf_(inner).setTo(Scalar::all(255));
+
+        bitwise_and(levelMask, buf_, levelMask);
+    }
+}
+
+namespace
+{
+    // Takes keypoints and culls them by the response.
+    //   keypoints - keypoint matrix; its location and response rows are passed to the device routine
+    //   count     - in: current number of keypoints, out: number of keypoints kept
+    //   n_points  - number of keypoints wanted
+    // Nothing happens when count does not exceed n_points. When n_points is 0 the matrix
+    // is released and count is set to 0, so that count always describes what is left in
+    // keypoints (callers use count with keypoints.ptr() afterwards).
+    void cull(GpuMat& keypoints, int& count, int n_points)
+    {
+        using namespace cv::gpu::device::orb;
+
+        // this is only necessary if the keypoints size is greater than the number of desired points.
+        if (count <= n_points)
+            return;
+
+        if (n_points == 0)
+        {
+            keypoints.release();
+            count = 0; // keep the count in sync with the released matrix
+            return;
+        }
+
+        count = cull_gpu(keypoints.ptr<int>(FAST_GPU::LOCATION_ROW), keypoints.ptr<float>(FAST_GPU::RESPONSE_ROW), count, n_points);
+    }
+}
+
+// Detects and scores the keypoints of every pyramid level built by buildScalePyramids().
+// For each level keyPointsPyr_[level] is a 3-row CV_32FC1 matrix: row 0 holds the
+// locations (as short2), row 1 the response and row 2 the angle; keyPointsCount_[level]
+// is the number of valid columns. Levels without keypoints end up with a count of 0 and
+// are skipped.
+void cv::gpu::ORB_GPU::computeKeyPointsPyramid()
+{
+    using namespace cv::gpu::device::orb;
+
+    int half_patch_size = params_.patch_size_ / 2;
+
+    keyPointsPyr_.resize(params_.n_levels_);
+    keyPointsCount_.resize(params_.n_levels_);
+
+    for (int level = 0; level < static_cast<int>(params_.n_levels_); ++level)
+    {
+        keyPointsCount_[level] = fastDetector_.calcKeyPointsLocation(imagePyr_[level], maskPyr_[level]);
+
+        // Nothing detected: skip the level instead of allocating a zero-width matrix and
+        // calling the device routines with zero points (whether they tolerate that cannot
+        // be seen from this file).
+        if (keyPointsCount_[level] == 0)
+            continue;
+
+        ensureSizeIsEnough(3, keyPointsCount_[level], CV_32FC1, keyPointsPyr_[level]);
+
+        // getKeyPoints() takes a non-const reference, which must not be bound to the
+        // temporary returned by rowRange(); use a named header for the first two rows.
+        GpuMat fastKpRange = keyPointsPyr_[level].rowRange(0, 2);
+        keyPointsCount_[level] = fastDetector_.getKeyPoints(fastKpRange);
+
+        if (keyPointsCount_[level] == 0)
+            continue;
+
+        int n_features = n_features_per_level_[level];
+
+        // No keypoints are wanted on this level: drop the detections and make the count
+        // agree with the released matrix, so that nobody dereferences it later.
+        if (n_features == 0)
+        {
+            keyPointsPyr_[level].release();
+            keyPointsCount_[level] = 0;
+            continue;
+        }
+
+        if (params_.score_type_ == ORB::CommonParams::HARRIS_SCORE)
+        {
+            // Keep more points than necessary as FAST does not give amazing corners
+            cull(keyPointsPyr_[level], keyPointsCount_[level], 2 * n_features);
+
+            // Compute the Harris cornerness (better scoring than FAST)
+            HarrisResponses_gpu(imagePyr_[level], keyPointsPyr_[level].ptr<short2>(0), keyPointsPyr_[level].ptr<float>(1), keyPointsCount_[level], 7, HARRIS_K, 0);
+        }
+
+        // cull to the final desired level, using the new Harris scores or the original FAST scores.
+        cull(keyPointsPyr_[level], keyPointsCount_[level], n_features);
+
+        // Compute orientation (written to row 2)
+        IC_Angle_gpu(imagePyr_[level], keyPointsPyr_[level].ptr<short2>(0), keyPointsPyr_[level].ptr<float>(2), keyPointsCount_[level], half_patch_size, 0);
+    }
+}
+
+// Computes the descriptors of all keypoints found by computeKeyPointsPyramid().
+//   descriptors - output, one row of descriptorSize() bytes (CV_8UC1) per keypoint; the rows
+//                 of the levels follow each other in level order. Released when there are
+//                 no keypoints at all.
+// When blurForDescriptor is set, each level image is blurred into buf_ first and the
+// descriptors are computed on the blurred image.
+void cv::gpu::ORB_GPU::computeDescriptors(GpuMat& descriptors)
+{
+    using namespace cv::gpu::device::orb;
+
+    int nAllkeypoints = 0;
+
+    for (size_t level = 0; level < params_.n_levels_; ++level)
+        nAllkeypoints += keyPointsCount_[level];
+
+    if (nAllkeypoints == 0)
+    {
+        descriptors.release();
+        return;
+    }
+
+    ensureSizeIsEnough(nAllkeypoints, descriptorSize(), CV_8UC1, descriptors);
+
+    int offset = 0;
+
+    for (size_t level = 0; level < params_.n_levels_; ++level)
+    {
+        // A level without keypoints contributes no rows; skip it rather than blurring the
+        // image for nothing and calling the device routine with an empty row range and
+        // zero points.
+        if (keyPointsCount_[level] == 0)
+            continue;
+
+        GpuMat descRange = descriptors.rowRange(offset, offset + keyPointsCount_[level]);
+
+        if (blurForDescriptor)
+        {
+            // preprocess the resized image
+            ensureSizeIsEnough(imagePyr_[level].size(), imagePyr_[level].type(), buf_);
+            blurFilter->apply(imagePyr_[level], buf_, Rect(0, 0, imagePyr_[level].cols, imagePyr_[level].rows));
+        }
+
+        // row 0 of the level matrix holds the locations, row 2 the angles
+        computeOrbDescriptor_gpu(blurForDescriptor ? buf_ : imagePyr_[level], keyPointsPyr_[level].ptr<short2>(0), keyPointsPyr_[level].ptr<float>(2),
+            keyPointsCount_[level], pattern_.ptr<int>(0), pattern_.ptr<int>(1), descRange, descriptorSize(), params_.WTA_K_, 0);
+
+        offset += keyPointsCount_[level];
+    }
+}
+
+// Merges the per-level keypoints into one matrix.
+//   keypoints - output, ROWS_COUNT x <total keypoint count>, CV_32FC1; released when there
+//               are no keypoints at all.
+// The columns of the levels follow each other in level order. Per column:
+//   rows 0-1: x / y produced by mergeLocation_gpu from the level location and scale,
+//   rows 2-3: rows 1-2 of the level matrix (response and angle),
+//   row 4:    the level index,
+//   row 5:    patch_size_ multiplied by the level scale.
+void cv::gpu::ORB_GPU::mergeKeyPoints(GpuMat& keypoints)
+{
+    using namespace cv::gpu::device::orb;
+
+    int nAllkeypoints = 0;
+
+    for (size_t level = 0; level < params_.n_levels_; ++level)
+        nAllkeypoints += keyPointsCount_[level];
+
+    if (nAllkeypoints == 0)
+    {
+        keypoints.release();
+        return;
+    }
+
+    ensureSizeIsEnough(ROWS_COUNT, nAllkeypoints, CV_32FC1, keypoints);
+
+    int offset = 0;
+
+    for (int level = 0; level < static_cast<int>(params_.n_levels_); ++level)
+    {
+        const int count = keyPointsCount_[level];
+
+        // A level without keypoints contributes no columns.
+        if (count == 0)
+            continue;
+
+        float sf = getScale(params_, level);
+
+        GpuMat keyPointsRange = keypoints.colRange(offset, offset + count);
+
+        float locScale = level != static_cast<int>(params_.first_level_) ? sf : 1.0f;
+
+        mergeLocation_gpu(keyPointsPyr_[level].ptr<short2>(0), keyPointsRange.ptr<float>(0), keyPointsRange.ptr<float>(1), count, locScale, 0);
+
+        // Copy response and angle. Two things matter here:
+        //  - the source is limited to the first `count` columns: the level matrix can be
+        //    wider than the number of keypoints that survived culling, and copyTo()
+        //    reallocates a destination whose size differs from the source, which would
+        //    leave `keypoints` untouched;
+        //  - the destination is a named header, because copyTo() takes a non-const
+        //    reference that must not be bound to a temporary.
+        GpuMat responseAndAngle = keyPointsRange.rowRange(2, 4);
+        keyPointsPyr_[level].rowRange(1, 3).colRange(0, count).copyTo(responseAndAngle);
+
+        keyPointsRange.row(4).setTo(Scalar::all(level));
+        keyPointsRange.row(5).setTo(Scalar::all(params_.patch_size_ * sf));
+
+        offset += count;
+    }
+}
+
+// Converts a device keypoint matrix into a KeyPoint vector: the matrix is turned into a
+// host Mat and handed to convertKeyPoints(). An empty matrix yields an empty vector.
+void cv::gpu::ORB_GPU::downloadKeyPoints(GpuMat& d_keypoints, std::vector<KeyPoint>& keypoints)
+{
+    if (!d_keypoints.empty())
+    {
+        Mat hostKeypoints(d_keypoints);
+
+        convertKeyPoints(hostKeypoints, keypoints);
+    }
+    else
+    {
+        keypoints.clear();
+    }
+}
+
+// Converts a host keypoint matrix into a KeyPoint vector, one KeyPoint per column.
+//   d_keypoints - host matrix (despite the d_ prefix), CV_32FC1 with ROWS_COUNT rows; the
+//                 rows X_ROW, Y_ROW, RESPONSE_ROW, ANGLE_ROW, OCTAVE_ROW and SIZE_ROW are read
+//   keypoints   - output, resized to the column count; cleared for an empty matrix
+// Fields that have no row in the matrix keep the KeyPoint defaults.
+void cv::gpu::ORB_GPU::convertKeyPoints(Mat& d_keypoints, std::vector<KeyPoint>& keypoints)
+{
+    if (d_keypoints.empty())
+    {
+        keypoints.clear();
+        return;
+    }
+
+    CV_Assert(d_keypoints.type() == CV_32FC1 && d_keypoints.rows == ROWS_COUNT);
+
+    const int total = d_keypoints.cols;
+
+    const float* xs = d_keypoints.ptr<float>(X_ROW);
+    const float* ys = d_keypoints.ptr<float>(Y_ROW);
+    const float* responses = d_keypoints.ptr<float>(RESPONSE_ROW);
+    const float* angles = d_keypoints.ptr<float>(ANGLE_ROW);
+    const float* octaves = d_keypoints.ptr<float>(OCTAVE_ROW);
+    const float* sizes = d_keypoints.ptr<float>(SIZE_ROW);
+
+    keypoints.resize(total);
+
+    for (int col = 0; col < total; ++col)
+    {
+        // KeyPoint(x, y, size, angle, response, octave); the octave is stored as a float
+        keypoints[col] = KeyPoint(xs[col], ys[col], sizes[col], angles[col], responses[col],
+                                  static_cast<int>(octaves[col]));
+    }
+}
+
+// Detection only: builds the pyramids for image / mask, finds the keypoints of every
+// level and merges them into the device matrix keypoints.
+void cv::gpu::ORB_GPU::operator()(const GpuMat& image, const GpuMat& mask, GpuMat& keypoints)
+{
+    buildScalePyramids(image, mask);
+    computeKeyPointsPyramid();
+    mergeKeyPoints(keypoints);
+}
+
+// Detection and description: same as the keypoint-only overload, but the descriptors of
+// the per-level keypoints are computed into descriptors before the keypoints are merged.
+void cv::gpu::ORB_GPU::operator()(const GpuMat& image, const GpuMat& mask, GpuMat& keypoints, GpuMat& descriptors)
+{
+    buildScalePyramids(image, mask);
+    computeKeyPointsPyramid();
+    computeDescriptors(descriptors);
+    mergeKeyPoints(keypoints);
+}
+
+// Detection with host output: runs the device overload into the internal d_keypoints_
+// matrix and converts the result into the KeyPoint vector.
+void cv::gpu::ORB_GPU::operator()(const GpuMat& image, const GpuMat& mask, std::vector<KeyPoint>& keypoints)
+{
+    operator()(image, mask, d_keypoints_);
+
+    downloadKeyPoints(d_keypoints_, keypoints);
+}
+
+// Detection and description with host keypoints: runs the device overload into the
+// internal d_keypoints_ matrix (descriptors stay on the device) and converts the
+// keypoints into the KeyPoint vector.
+void cv::gpu::ORB_GPU::operator()(const GpuMat& image, const GpuMat& mask, std::vector<KeyPoint>& keypoints, GpuMat& descriptors)
+{
+    operator()(image, mask, d_keypoints_, descriptors);
+
+    downloadKeyPoints(d_keypoints_, keypoints);
+}
+
+// Releases the temporary buffers: the image and mask pyramids, buf_, the per-level
+// keypoint matrices, the buffers of the internal FAST detector and d_keypoints_.
+// NOTE(review): keyPointsCount_, n_features_per_level_ and pattern_ are not touched here.
+void cv::gpu::ORB_GPU::release()
+{
+    imagePyr_.clear();
+    maskPyr_.clear();
+
+    buf_.release();
+
+    keyPointsPyr_.clear();
+
+    fastDetector_.release();
+
+    d_keypoints_.release();
+}
+
+#endif /* !defined (HAVE_CUDA) */
--- a/modules/gpu/test/test_features2d.cpp
+++ b/modules/gpu/test/test_features2d.cpp
@@ -70,22 +70,6 @@ struct SURF : testing::TestWithParam<cv::gpu::DeviceInfo>
        cv::SURF fdetector_gold; fdetector_gold.extended = false;
        fdetector_gold(image, mask, keypoints_gold, descriptors_gold);        
    }
-
-    bool isSimilarKeypoints(const cv::KeyPoint& p1, const cv::KeyPoint& p2)
-    {
-        const float maxPtDif = 1.f;
-        const float maxSizeDif = 1.f;
-        const float maxAngleDif = 2.f;
-        const float maxResponseDif = 0.1f;
-
-        float dist = (float)cv::norm(p1.pt - p2.pt);
-        return (dist < maxPtDif &&
-                fabs(p1.size - p2.size) < maxSizeDif &&
-                abs(p1.angle - p2.angle) < maxAngleDif &&
-                abs(p1.response - p2.response) < maxResponseDif &&
-                p1.octave == p2.octave &&
-                p1.class_id == p2.class_id );
-    }
 };

 TEST_P(SURF, EmptyDataTest)
@@ -652,4 +636,169 @@ INSTANTIATE_TEST_CASE_P(Features2D, BruteForceMatcher, testing::Combine(
                        testing::Values(cv::gpu::BruteForceMatcher_GPU_base::L1Dist, cv::gpu::BruteForceMatcher_GPU_base::L2Dist),
                        testing::Values(57, 64, 83, 128, 179, 256, 304)));

+/////////////////////////////////////////////////////////////////////////////////////////////////
+// FAST
+
+// Fixture for the FAST_GPU tests, parameterised over cv::gpu::DeviceInfo.
+// SetUp() selects the device, loads the grayscale test image, draws a detector
+// threshold from the test system's RNG and computes the reference keypoints
+// with the CPU cv::FAST call.
+struct FAST : testing::TestWithParam<cv::gpu::DeviceInfo>
+{
+    cv::gpu::DeviceInfo devInfo;
+    cv::Mat image;
+    int threshold;
+    std::vector<cv::KeyPoint> keypoints_gold;
+
+    virtual void SetUp()
+    {
+        devInfo = GetParam();
+        cv::gpu::setDevice(devInfo.deviceID());
+
+        image = readImage("features2d/aloe.png", CV_LOAD_IMAGE_GRAYSCALE);
+        ASSERT_FALSE(image.empty());
+
+        // Threshold comes from the shared test RNG via uniform(15, 80).
+        threshold = cvtest::TS::ptr()->get_rng().uniform(15, 80);
+
+        // Reference result produced on the CPU with the same threshold.
+        cv::FAST(image, keypoints_gold, threshold);
+    }
+};
+
+// Unary predicate over cv::KeyPoint: yields true when the keypoint's hash()
+// equals the value stored at construction.
+struct HashEq
+{
+    size_t hash;
+
+    HashEq(size_t expected) : hash(expected) {}
+
+    bool operator ()(const cv::KeyPoint& candidate) const
+    {
+        return hash == candidate.hash();
+    }
+};
+
+// Strict ordering of keypoints by position: primarily by pt.y, and by pt.x
+// when both y coordinates compare equal.
+struct KeyPointCompare
+{
+    bool operator ()(const cv::KeyPoint& lhs, const cv::KeyPoint& rhs) const
+    {
+        // Different rows: the row alone decides.
+        if (lhs.pt.y != rhs.pt.y)
+            return lhs.pt.y < rhs.pt.y;
+
+        // Same row: fall back to the column.
+        return lhs.pt.x < rhs.pt.x;
+    }
+};
+
+// Runs FAST_GPU on the fixture image (empty mask) and requires the result to
+// match the CPU reference keypoint-for-keypoint, compared through
+// cv::KeyPoint::hash().
+TEST_P(FAST, Accuracy)
+{
+    std::vector<cv::KeyPoint> keypoints;
+
+    ASSERT_NO_THROW(
+        cv::gpu::FAST_GPU fastGPU(threshold);
+
+        fastGPU(cv::gpu::GpuMat(image), cv::gpu::GpuMat(), keypoints);
+    );
+
+    ASSERT_EQ(keypoints.size(), keypoints_gold.size());
+
+    // Put BOTH sets into the same (y, x) order before the element-wise
+    // comparison, so the test does not depend on the order in which either
+    // detector happens to emit its keypoints.
+    std::sort(keypoints.begin(), keypoints.end(), KeyPointCompare());
+    std::sort(keypoints_gold.begin(), keypoints_gold.end(), KeyPointCompare());
+
+    for (size_t i = 0; i < keypoints_gold.size(); ++i)
+    {
+        const cv::KeyPoint& kp1 = keypoints[i];
+        const cv::KeyPoint& kp2 = keypoints_gold[i];
+
+        size_t h1 = kp1.hash();
+        size_t h2 = kp2.hash();
+
+        ASSERT_EQ(h1, h2);
+    }
+}
+
+INSTANTIATE_TEST_CASE_P(Features2D, FAST, testing::ValuesIn(devices()));
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// ORB
+
+// Fixture for the ORB_GPU tests, parameterised over cv::gpu::DeviceInfo.
+// SetUp() selects the device, loads the grayscale test image, builds a mask
+// and computes reference keypoints and descriptors with the CPU cv::ORB.
+struct ORB : testing::TestWithParam<cv::gpu::DeviceInfo>
+{
+    cv::gpu::DeviceInfo devInfo;
+
+    cv::Mat image;
+    cv::Mat mask;
+
+    int npoints;
+
+    std::vector<cv::KeyPoint> keypoints_gold;
+    cv::Mat descriptors_gold;
+
+    virtual void SetUp()
+    {
+        devInfo = GetParam();
+        cv::gpu::setDevice(devInfo.deviceID());
+
+        image = readImage("features2d/aloe.png", CV_LOAD_IMAGE_GRAYSCALE);
+        ASSERT_FALSE(image.empty());
+
+        // Mask is 1 everywhere, then the top-left quadrant
+        // (rows [0, rows/2), cols [0, cols/2)) is set to 0.
+        mask = cv::Mat(image.size(), CV_8UC1, cv::Scalar::all(1));
+        const cv::Range topRows(0, image.rows / 2);
+        const cv::Range leftCols(0, image.cols / 2);
+        mask(topRows, leftCols).setTo(cv::Scalar::all(0));
+
+        npoints = 4000;
+
+        // CPU reference computed with the same image, mask and feature count.
+        cv::ORB orbCPU(npoints);
+        orbCPU(image, mask, keypoints_gold, descriptors_gold);
+    }
+};
+
+// Counts the matches whose two keypoints lie within one pixel of each other
+// on both axes.
+//   keypoints1 - indexed by DMatch::queryIdx
+//   keypoints2 - indexed by DMatch::trainIdx
+//   matches    - matches to examine
+// Only the location is compared; size, angle, response, octave and class_id
+// are not part of the check.
+int getValidMatchesCount(const std::vector<cv::KeyPoint>& keypoints1, const std::vector<cv::KeyPoint>& keypoints2, const std::vector<cv::DMatch>& matches)
+{
+    int validCount = 0;
+
+    for (std::vector<cv::DMatch>::const_iterator it = matches.begin(); it != matches.end(); ++it)
+    {
+        const cv::Point2f& queryPt = keypoints1[it->queryIdx].pt;
+        const cv::Point2f& trainPt = keypoints2[it->trainIdx].pt;
+
+        const bool closeInX = fabs(queryPt.x - trainPt.x) <= 1;
+        const bool closeInY = fabs(queryPt.y - trainPt.y) <= 1;
+
+        if (closeInX && closeInY)
+            ++validCount;
+    }
+
+    return validCount;
+}
+
+// Runs ORB_GPU on the fixture image and mask, matches the CPU reference
+// descriptors against the GPU descriptors with a brute-force Hamming matcher,
+// and requires that at least 70% of the matches pair keypoints that
+// getValidMatchesCount() accepts.
+TEST_P(ORB, Accuracy)
+{
+    std::vector<cv::KeyPoint> keypoints;
+    cv::Mat descriptors;
+
+    ASSERT_NO_THROW(
+        cv::gpu::ORB_GPU orbGPU(npoints);
+        cv::gpu::GpuMat d_descriptors;
+
+        orbGPU(cv::gpu::GpuMat(image), cv::gpu::GpuMat(mask), keypoints, d_descriptors);
+
+        d_descriptors.download(descriptors);
+    );
+
+    cv::BruteForceMatcher<cv::Hamming> matcher;
+    std::vector<cv::DMatch> matches;
+
+    matcher.match(descriptors_gold, descriptors, matches);
+
+    // Without this guard an empty match list turns the ratio below into 0/0
+    // (NaN) and the failure message hides the real cause.
+    ASSERT_FALSE(matches.empty());
+
+    int count = getValidMatchesCount(keypoints_gold, keypoints, matches);
+    double ratio = 100.0 * count / matches.size();
+
+    ASSERT_GE(ratio, 70.0);
+}
+
+INSTANTIATE_TEST_CASE_P(Features2D, ORB, testing::ValuesIn(devices()));
+
 #endif // HAVE_CUDA
--- a/samples/gpu/performance/tests.cpp
+++ b/samples/gpu/performance/tests.cpp
@@ -362,30 +362,81 @@ TEST(meanShift)

 TEST(SURF)
 {
-    Mat src1 = imread(abspath("aloeL.jpg"), CV_LOAD_IMAGE_GRAYSCALE);
-    Mat src2 = imread(abspath("aloeR.jpg"), CV_LOAD_IMAGE_GRAYSCALE);
-    if (src1.empty()) throw runtime_error("can't open aloeL.jpg");
-    if (src2.empty()) throw runtime_error("can't open aloeR.jpg");
-
-    gpu::GpuMat d_src1(src1);
-    gpu::GpuMat d_src2(src2);
+    Mat src = imread(abspath("aloeL.jpg"), CV_LOAD_IMAGE_GRAYSCALE); // single grayscale input; the removed lines used the aloeL/aloeR pair
+    if (src.empty()) throw runtime_error("can't open aloeL.jpg");

    SURF surf;
-    vector<KeyPoint> keypoints1, keypoints2;
-    vector<float> descriptors1, descriptors2;
+    vector<KeyPoint> keypoints;
+    vector<float> descriptors;
+
+    surf(src, Mat(), keypoints, descriptors); // same call as inside CPU_ON/CPU_OFF below, issued once beforehand -- presumably an untimed warm-up (confirm against the CPU_ON macro)

    CPU_ON;
-    surf(src1, Mat(), keypoints1, descriptors1);
-    surf(src2, Mat(), keypoints2, descriptors2);
+    surf(src, Mat(), keypoints, descriptors); // the CPU call bracketed by CPU_ON/CPU_OFF
    CPU_OFF;

    gpu::SURF_GPU d_surf;
-    gpu::GpuMat d_keypoints1, d_keypoints2;
-    gpu::GpuMat d_descriptors1, d_descriptors2;
+    gpu::GpuMat d_src(src); // device matrix built from src here, outside the GPU_ON/GPU_OFF section
+    gpu::GpuMat d_keypoints;
+    gpu::GpuMat d_descriptors;
+
+    d_surf(d_src, gpu::GpuMat(), d_keypoints, d_descriptors); // same call as inside GPU_ON/GPU_OFF below, issued once beforehand -- presumably an untimed warm-up (confirm against the GPU_ON macro)

    GPU_ON;
-    d_surf(d_src1, gpu::GpuMat(), d_keypoints1, d_descriptors1);
-    d_surf(d_src2, gpu::GpuMat(), d_keypoints2, d_descriptors2);
+    d_surf(d_src, gpu::GpuMat(), d_keypoints, d_descriptors); // the GPU call bracketed by GPU_ON/GPU_OFF
+    GPU_OFF;
+}
+
+
+// Performance case for FAST: the CPU FAST() call and gpu::FAST_GPU are run on
+// aloeL.jpg with the same threshold. Each detector is invoked once before its
+// CPU_ON/CPU_OFF or GPU_ON/GPU_OFF section and once inside it.
+TEST(FAST)
+{
+    const int threshold = 20;
+
+    Mat src = imread(abspath("aloeL.jpg"), CV_LOAD_IMAGE_GRAYSCALE);
+    if (src.empty())
+        throw runtime_error("can't open aloeL.jpg");
+
+    // CPU: one call ahead of the bracketed section, one inside it.
+    vector<KeyPoint> cpuKeypoints;
+    FAST(src, cpuKeypoints, threshold);
+
+    CPU_ON;
+    FAST(src, cpuKeypoints, threshold);
+    CPU_OFF;
+
+    // GPU: detector and device-side data are set up outside the bracketed section.
+    gpu::FAST_GPU gpuDetector(threshold);
+    gpu::GpuMat gpuSrc(src);
+    gpu::GpuMat gpuKeypoints;
+
+    gpuDetector(gpuSrc, gpu::GpuMat(), gpuKeypoints);
+
+    GPU_ON;
+    gpuDetector(gpuSrc, gpu::GpuMat(), gpuKeypoints);
+    GPU_OFF;
+}
+
+
+// Performance case for ORB: CPU ORB and gpu::ORB_GPU are run on aloeL.jpg.
+// Each detector is invoked once before its CPU_ON/CPU_OFF or GPU_ON/GPU_OFF
+// section and once inside it.
+TEST(ORB)
+{
+    // Both detectors must be asked for the same number of features,
+    // otherwise the two timed sections measure different workloads.
+    const int nFeatures = 4000;
+
+    Mat src = imread(abspath("aloeL.jpg"), CV_LOAD_IMAGE_GRAYSCALE);
+    if (src.empty()) throw runtime_error("can't open aloeL.jpg");
+
+    ORB orb(nFeatures);
+    vector<KeyPoint> keypoints;
+    Mat descriptors;
+
+    orb(src, Mat(), keypoints, descriptors);
+
+    CPU_ON;
+    orb(src, Mat(), keypoints, descriptors);
+    CPU_OFF;
+
+    gpu::ORB_GPU d_orb(nFeatures);
+    gpu::GpuMat d_src(src);
+    gpu::GpuMat d_keypoints;
+    gpu::GpuMat d_descriptors;
+
+    d_orb(d_src, gpu::GpuMat(), d_keypoints, d_descriptors);
+
+    GPU_ON;
+    d_orb(d_src, gpu::GpuMat(), d_keypoints, d_descriptors);
    GPU_OFF;
 }