Merge pull request #3596 from jet47:cuda-features2d-refactoring

Vadim Pisarevsky 2015-01-19 10:48:18 +00:00
commit 3a84444488
10 changed files with 2026 additions and 1694 deletions


@@ -48,6 +48,7 @@
#endif
#include "opencv2/core/cuda.hpp"
#include "opencv2/features2d.hpp"
#include "opencv2/cudafilters.hpp"
/**
@@ -62,262 +63,396 @@ namespace cv { namespace cuda {
//! @addtogroup cudafeatures2d
//! @{

/** @brief Brute-force descriptor matcher.

For each descriptor in the first set, this matcher finds the closest descriptor in the second set
by trying each one. This descriptor matcher supports masking permissible matches between descriptor
sets.

The class BFMatcher_CUDA has an interface similar to the class DescriptorMatcher. It has two groups
of match methods: for matching descriptors of one image with another image or with an image set.
Also, all functions have an alternative to save results either to the GPU memory or to the CPU
memory.

@sa DescriptorMatcher, BFMatcher
*/
class CV_EXPORTS BFMatcher_CUDA
{
public:
explicit BFMatcher_CUDA(int norm = cv::NORM_L2);

//! Add descriptors to train descriptor collection
void add(const std::vector<GpuMat>& descCollection);

//! Get train descriptors collection
const std::vector<GpuMat>& getTrainDescriptors() const;

//! Clear train descriptors collection
void clear();

//! Return true if there are not train descriptors in collection
bool empty() const;

//! Return true if the matcher supports mask in match methods
bool isMaskSupported() const;

//! Find one best match for each query descriptor
void matchSingle(const GpuMat& query, const GpuMat& train,
GpuMat& trainIdx, GpuMat& distance,
const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null());

//! Download trainIdx and distance and convert it to CPU vector with DMatch
static void matchDownload(const GpuMat& trainIdx, const GpuMat& distance, std::vector<DMatch>& matches);
//! Convert trainIdx and distance to vector with DMatch
static void matchConvert(const Mat& trainIdx, const Mat& distance, std::vector<DMatch>& matches);

//! Find one best match for each query descriptor
void match(const GpuMat& query, const GpuMat& train, std::vector<DMatch>& matches, const GpuMat& mask = GpuMat());

//! Make gpu collection of trains and masks in suitable format for matchCollection function
void makeGpuCollection(GpuMat& trainCollection, GpuMat& maskCollection, const std::vector<GpuMat>& masks = std::vector<GpuMat>());

//! Find one best match from train collection for each query descriptor
void matchCollection(const GpuMat& query, const GpuMat& trainCollection,
GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance,
const GpuMat& masks = GpuMat(), Stream& stream = Stream::Null());

//! Download trainIdx, imgIdx and distance and convert it to vector with DMatch
static void matchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, std::vector<DMatch>& matches);
//! Convert trainIdx, imgIdx and distance to vector with DMatch
static void matchConvert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance, std::vector<DMatch>& matches);

//! Find one best match from train collection for each query descriptor.
void match(const GpuMat& query, std::vector<DMatch>& matches, const std::vector<GpuMat>& masks = std::vector<GpuMat>());

//! Find k best matches for each query descriptor (in increasing order of distances)
void knnMatchSingle(const GpuMat& query, const GpuMat& train,
GpuMat& trainIdx, GpuMat& distance, GpuMat& allDist, int k,
const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null());

//! Download trainIdx and distance and convert it to vector with DMatch
//! compactResult is used when mask is not empty. If compactResult is false matches
//! vector will have the same size as queryDescriptors rows. If compactResult is true
//! matches vector will not contain matches for fully masked out query descriptors.
static void knnMatchDownload(const GpuMat& trainIdx, const GpuMat& distance,
std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
//! Convert trainIdx and distance to vector with DMatch
static void knnMatchConvert(const Mat& trainIdx, const Mat& distance,
std::vector< std::vector<DMatch> >& matches, bool compactResult = false);

//! Find k best matches for each query descriptor (in increasing order of distances).
//! compactResult is used when mask is not empty. If compactResult is false matches
//! vector will have the same size as queryDescriptors rows. If compactResult is true
//! matches vector will not contain matches for fully masked out query descriptors.
void knnMatch(const GpuMat& query, const GpuMat& train,
std::vector< std::vector<DMatch> >& matches, int k, const GpuMat& mask = GpuMat(),
bool compactResult = false);

//! Find k best matches from train collection for each query descriptor (in increasing order of distances)
void knnMatch2Collection(const GpuMat& query, const GpuMat& trainCollection,
GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance,
const GpuMat& maskCollection = GpuMat(), Stream& stream = Stream::Null());

//! Download trainIdx and distance and convert it to vector with DMatch
//! compactResult is used when mask is not empty. If compactResult is false matches
//! vector will have the same size as queryDescriptors rows. If compactResult is true
//! matches vector will not contain matches for fully masked out query descriptors.
//! @see BFMatcher_CUDA::knnMatchDownload
static void knnMatch2Download(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance,
std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
//! Convert trainIdx and distance to vector with DMatch
//! @see BFMatcher_CUDA::knnMatchConvert
static void knnMatch2Convert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance,
std::vector< std::vector<DMatch> >& matches, bool compactResult = false);

//! Find k best matches for each query descriptor (in increasing order of distances).
//! compactResult is used when mask is not empty. If compactResult is false matches
//! vector will have the same size as queryDescriptors rows. If compactResult is true
//! matches vector will not contain matches for fully masked out query descriptors.
void knnMatch(const GpuMat& query, std::vector< std::vector<DMatch> >& matches, int k,
const std::vector<GpuMat>& masks = std::vector<GpuMat>(), bool compactResult = false);

//! Find best matches for each query descriptor which have distance less than maxDistance.
//! nMatches.at<int>(0, queryIdx) will contain matches count for queryIdx.
//! carefully nMatches can be greater than trainIdx.cols - it means that matcher didn't find all matches,
//! because it didn't have enough memory.
//! If trainIdx is empty, then trainIdx and distance will be created with size nQuery x max((nTrain / 100), 10),
//! otherwize user can pass own allocated trainIdx and distance with size nQuery x nMaxMatches
//! Matches doesn't sorted.
void radiusMatchSingle(const GpuMat& query, const GpuMat& train,
GpuMat& trainIdx, GpuMat& distance, GpuMat& nMatches, float maxDistance,
const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null());

//! Download trainIdx, nMatches and distance and convert it to vector with DMatch.
//! matches will be sorted in increasing order of distances.
//! compactResult is used when mask is not empty. If compactResult is false matches
//! vector will have the same size as queryDescriptors rows. If compactResult is true
//! matches vector will not contain matches for fully masked out query descriptors.
static void radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& distance, const GpuMat& nMatches,
std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
//! Convert trainIdx, nMatches and distance to vector with DMatch.
static void radiusMatchConvert(const Mat& trainIdx, const Mat& distance, const Mat& nMatches,
std::vector< std::vector<DMatch> >& matches, bool compactResult = false);

//! Find best matches for each query descriptor which have distance less than maxDistance
//! in increasing order of distances).
void radiusMatch(const GpuMat& query, const GpuMat& train,
std::vector< std::vector<DMatch> >& matches, float maxDistance,
const GpuMat& mask = GpuMat(), bool compactResult = false);

//! Find best matches for each query descriptor which have distance less than maxDistance.
//! If trainIdx is empty, then trainIdx and distance will be created with size nQuery x max((nQuery / 100), 10),
//! otherwize user can pass own allocated trainIdx and distance with size nQuery x nMaxMatches
//! Matches doesn't sorted.
void radiusMatchCollection(const GpuMat& query, GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, GpuMat& nMatches, float maxDistance,
const std::vector<GpuMat>& masks = std::vector<GpuMat>(), Stream& stream = Stream::Null());

//! Download trainIdx, imgIdx, nMatches and distance and convert it to vector with DMatch.
//! matches will be sorted in increasing order of distances.
//! compactResult is used when mask is not empty. If compactResult is false matches
//! vector will have the same size as queryDescriptors rows. If compactResult is true
//! matches vector will not contain matches for fully masked out query descriptors.
static void radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, const GpuMat& nMatches,
std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
//! Convert trainIdx, nMatches and distance to vector with DMatch.
static void radiusMatchConvert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance, const Mat& nMatches,
std::vector< std::vector<DMatch> >& matches, bool compactResult = false);

//! Find best matches from train collection for each query descriptor which have distance less than
//! maxDistance (in increasing order of distances).
void radiusMatch(const GpuMat& query, std::vector< std::vector<DMatch> >& matches, float maxDistance,
const std::vector<GpuMat>& masks = std::vector<GpuMat>(), bool compactResult = false);

int norm;

private:
std::vector<GpuMat> trainDescCollection;
};

//
// DescriptorMatcher
//

/** @brief Abstract base class for matching keypoint descriptors.

It has two groups of match methods: for matching descriptors of an image with another image or with
an image set.
*/
class CV_EXPORTS DescriptorMatcher : public cv::Algorithm
{
public:
//
// Factories
//

/** @brief Brute-force descriptor matcher.

For each descriptor in the first set, this matcher finds the closest descriptor in the second set
by trying each one. This descriptor matcher supports masking permissible matches of descriptor
sets.

@param normType One of NORM_L1, NORM_L2, NORM_HAMMING. L1 and L2 norms are
preferable choices for SIFT and SURF descriptors, NORM_HAMMING should be used with ORB, BRISK and
BRIEF).
*/
static Ptr<DescriptorMatcher> createBFMatcher(int normType = cv::NORM_L2);

//
// Utility
//

/** @brief Returns true if the descriptor matcher supports masking permissible matches.
*/
virtual bool isMaskSupported() const = 0;

//
// Descriptor collection
//

/** @brief Adds descriptors to train a descriptor collection.

If the collection is not empty, the new descriptors are added to existing train descriptors.

@param descriptors Descriptors to add. Each descriptors[i] is a set of descriptors from the same
train image.
*/
virtual void add(const std::vector<GpuMat>& descriptors) = 0;

/** @brief Returns a constant link to the train descriptor collection.
*/
virtual const std::vector<GpuMat>& getTrainDescriptors() const = 0;

/** @brief Clears the train descriptor collection.
*/
virtual void clear() = 0;

/** @brief Returns true if there are no train descriptors in the collection.
*/
virtual bool empty() const = 0;

/** @brief Trains a descriptor matcher.

Trains a descriptor matcher (for example, the flann index). In all methods to match, the method
train() is run every time before matching.
*/
virtual void train() = 0;

//
// 1 to 1 match
//

/** @brief Finds the best match for each descriptor from a query set (blocking version).

@param queryDescriptors Query set of descriptors.
@param trainDescriptors Train set of descriptors. This set is not added to the train descriptors
collection stored in the class object.
@param matches Matches. If a query descriptor is masked out in mask , no match is added for this
descriptor. So, matches size may be smaller than the query descriptors count.
@param mask Mask specifying permissible matches between an input query and train matrices of
descriptors.

In the first variant of this method, the train descriptors are passed as an input argument. In the
second variant of the method, train descriptors collection that was set by DescriptorMatcher::add is
used. Optional mask (or masks) can be passed to specify which query and training descriptors can be
matched. Namely, queryDescriptors[i] can be matched with trainDescriptors[j] only if
mask.at\<uchar\>(i,j) is non-zero.
*/
virtual void match(InputArray queryDescriptors, InputArray trainDescriptors,
std::vector<DMatch>& matches,
InputArray mask = noArray()) = 0;

/** @overload
*/
virtual void match(InputArray queryDescriptors,
std::vector<DMatch>& matches,
const std::vector<GpuMat>& masks = std::vector<GpuMat>()) = 0;

/** @brief Finds the best match for each descriptor from a query set (asynchronous version).

@param queryDescriptors Query set of descriptors.
@param trainDescriptors Train set of descriptors. This set is not added to the train descriptors
collection stored in the class object.
@param matches Matches array stored in GPU memory. Internal representation is not defined.
Use DescriptorMatcher::matchConvert method to retrieve results in standard representation.
@param mask Mask specifying permissible matches between an input query and train matrices of
descriptors.
@param stream CUDA stream.

In the first variant of this method, the train descriptors are passed as an input argument. In the
second variant of the method, train descriptors collection that was set by DescriptorMatcher::add is
used. Optional mask (or masks) can be passed to specify which query and training descriptors can be
matched. Namely, queryDescriptors[i] can be matched with trainDescriptors[j] only if
mask.at\<uchar\>(i,j) is non-zero.
*/
virtual void matchAsync(InputArray queryDescriptors, InputArray trainDescriptors,
OutputArray matches,
InputArray mask = noArray(),
Stream& stream = Stream::Null()) = 0;

/** @overload
*/
virtual void matchAsync(InputArray queryDescriptors,
OutputArray matches,
const std::vector<GpuMat>& masks = std::vector<GpuMat>(),
Stream& stream = Stream::Null()) = 0;

/** @brief Converts matches array from internal representation to standard matches vector.

The method is supposed to be used with DescriptorMatcher::matchAsync to get final result.
Call this method only after DescriptorMatcher::matchAsync is completed (ie. after synchronization).

@param gpu_matches Matches, returned from DescriptorMatcher::matchAsync.
@param matches Vector of DMatch objects.
*/
virtual void matchConvert(InputArray gpu_matches,
std::vector<DMatch>& matches) = 0;
//
// knn match
//
/** @brief Finds the k best matches for each descriptor from a query set (blocking version).
@param queryDescriptors Query set of descriptors.
@param trainDescriptors Train set of descriptors. This set is not added to the train descriptors
collection stored in the class object.
@param matches Matches. Each matches[i] is k or less matches for the same query descriptor.
@param k Count of best matches found per each query descriptor or less if a query descriptor has
less than k possible matches in total.
@param mask Mask specifying permissible matches between an input query and train matrices of
descriptors.
@param compactResult Parameter used when the mask (or masks) is not empty. If compactResult is
false, the matches vector has the same size as queryDescriptors rows. If compactResult is true,
the matches vector does not contain matches for fully masked-out query descriptors.
These extended variants of DescriptorMatcher::match methods find several best matches for each query
descriptor. The matches are returned in the distance increasing order. See DescriptorMatcher::match
for the details about query and train descriptors.
*/
virtual void knnMatch(InputArray queryDescriptors, InputArray trainDescriptors,
std::vector<std::vector<DMatch> >& matches,
int k,
InputArray mask = noArray(),
bool compactResult = false) = 0;
/** @overload
*/
virtual void knnMatch(InputArray queryDescriptors,
std::vector<std::vector<DMatch> >& matches,
int k,
const std::vector<GpuMat>& masks = std::vector<GpuMat>(),
bool compactResult = false) = 0;
/** @brief Finds the k best matches for each descriptor from a query set (asynchronous version).
@param queryDescriptors Query set of descriptors.
@param trainDescriptors Train set of descriptors. This set is not added to the train descriptors
collection stored in the class object.
@param matches Matches array stored in GPU memory. Internal representation is not defined.
Use DescriptorMatcher::knnMatchConvert method to retrieve results in standard representation.
@param k Count of best matches found per each query descriptor or less if a query descriptor has
less than k possible matches in total.
@param mask Mask specifying permissible matches between an input query and train matrices of
descriptors.
@param stream CUDA stream.
These extended variants of DescriptorMatcher::matchAsync methods find several best matches for each query
descriptor. The matches are returned in the distance increasing order. See DescriptorMatcher::matchAsync
for the details about query and train descriptors.
*/
virtual void knnMatchAsync(InputArray queryDescriptors, InputArray trainDescriptors,
OutputArray matches,
int k,
InputArray mask = noArray(),
Stream& stream = Stream::Null()) = 0;
/** @overload
*/
virtual void knnMatchAsync(InputArray queryDescriptors,
OutputArray matches,
int k,
const std::vector<GpuMat>& masks = std::vector<GpuMat>(),
Stream& stream = Stream::Null()) = 0;
/** @brief Converts matches array from internal representation to standard matches vector.
The method is supposed to be used with DescriptorMatcher::knnMatchAsync to get final result.
Call this method only after DescriptorMatcher::knnMatchAsync is completed (ie. after synchronization).
@param gpu_matches Matches, returned from DescriptorMatcher::knnMatchAsync.
@param matches Vector of DMatch objects.
@param compactResult Parameter used when the mask (or masks) is not empty. If compactResult is
false, the matches vector has the same size as queryDescriptors rows. If compactResult is true,
the matches vector does not contain matches for fully masked-out query descriptors.
*/
virtual void knnMatchConvert(InputArray gpu_matches,
std::vector< std::vector<DMatch> >& matches,
bool compactResult = false) = 0;
//
// radius match
//
/** @brief For each query descriptor, finds the training descriptors not farther than the specified distance (blocking version).
@param queryDescriptors Query set of descriptors.
@param trainDescriptors Train set of descriptors. This set is not added to the train descriptors
collection stored in the class object.
@param matches Found matches.
@param maxDistance Threshold for the distance between matched descriptors. Distance means here
metric distance (e.g. Hamming distance), not the distance between coordinates (which is measured
in Pixels)!
@param mask Mask specifying permissible matches between an input query and train matrices of
descriptors.
@param compactResult Parameter used when the mask (or masks) is not empty. If compactResult is
false, the matches vector has the same size as queryDescriptors rows. If compactResult is true,
the matches vector does not contain matches for fully masked-out query descriptors.
For each query descriptor, the methods find such training descriptors that the distance between the
query descriptor and the training descriptor is equal or smaller than maxDistance. Found matches are
returned in the distance increasing order.
*/
virtual void radiusMatch(InputArray queryDescriptors, InputArray trainDescriptors,
std::vector<std::vector<DMatch> >& matches,
float maxDistance,
InputArray mask = noArray(),
bool compactResult = false) = 0;
/** @overload
*/
virtual void radiusMatch(InputArray queryDescriptors,
std::vector<std::vector<DMatch> >& matches,
float maxDistance,
const std::vector<GpuMat>& masks = std::vector<GpuMat>(),
bool compactResult = false) = 0;
/** @brief For each query descriptor, finds the training descriptors not farther than the specified distance (asynchronous version).
@param queryDescriptors Query set of descriptors.
@param trainDescriptors Train set of descriptors. This set is not added to the train descriptors
collection stored in the class object.
@param matches Matches array stored in GPU memory. Internal representation is not defined.
Use DescriptorMatcher::radiusMatchConvert method to retrieve results in standard representation.
@param maxDistance Threshold for the distance between matched descriptors. Distance means here
metric distance (e.g. Hamming distance), not the distance between coordinates (which is measured
in Pixels)!
@param mask Mask specifying permissible matches between an input query and train matrices of
descriptors.
@param stream CUDA stream.
For each query descriptor, the methods find such training descriptors that the distance between the
query descriptor and the training descriptor is equal or smaller than maxDistance. Found matches are
returned in the distance increasing order.
*/
virtual void radiusMatchAsync(InputArray queryDescriptors, InputArray trainDescriptors,
OutputArray matches,
float maxDistance,
InputArray mask = noArray(),
Stream& stream = Stream::Null()) = 0;
/** @overload
*/
virtual void radiusMatchAsync(InputArray queryDescriptors,
OutputArray matches,
float maxDistance,
const std::vector<GpuMat>& masks = std::vector<GpuMat>(),
Stream& stream = Stream::Null()) = 0;
/** @brief Converts matches array from internal representation to standard matches vector.
The method is supposed to be used with DescriptorMatcher::radiusMatchAsync to get final result.
Call this method only after DescriptorMatcher::radiusMatchAsync is completed (ie. after synchronization).
@param gpu_matches Matches, returned from DescriptorMatcher::radiusMatchAsync.
@param matches Vector of DMatch objects.
@param compactResult Parameter used when the mask (or masks) is not empty. If compactResult is
false, the matches vector has the same size as queryDescriptors rows. If compactResult is true,
the matches vector does not contain matches for fully masked-out query descriptors.
*/
virtual void radiusMatchConvert(InputArray gpu_matches,
std::vector< std::vector<DMatch> >& matches,
bool compactResult = false) = 0;
};
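As a usage illustration of the interface above, here is a minimal sketch (not part of the diff); the descriptor matrices d_query/d_train and the 0.8 ratio-test threshold are assumed for the example:

// Minimal usage sketch for the new cuda::DescriptorMatcher interface (illustrative only).
#include <opencv2/cudafeatures2d.hpp>

void exampleMatch(const cv::cuda::GpuMat& d_query, const cv::cuda::GpuMat& d_train)
{
    // Brute-force matcher with Hamming norm (suitable for binary descriptors such as ORB).
    cv::Ptr<cv::cuda::DescriptorMatcher> matcher =
        cv::cuda::DescriptorMatcher::createBFMatcher(cv::NORM_HAMMING);

    // Blocking 1-to-1 matching.
    std::vector<cv::DMatch> matches;
    matcher->match(d_query, d_train, matches);

    // k-NN matching with a Lowe-style ratio test (k = 2).
    std::vector<std::vector<cv::DMatch> > knnMatches;
    matcher->knnMatch(d_query, d_train, knnMatches, 2);

    std::vector<cv::DMatch> good;
    for (size_t i = 0; i < knnMatches.size(); ++i)
    {
        if (knnMatches[i].size() == 2 &&
            knnMatches[i][0].distance < 0.8f * knnMatches[i][1].distance)
            good.push_back(knnMatches[i][0]);
    }
}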
//
// Feature2DAsync
//

/** @brief Abstract base class for CUDA asynchronous 2D image feature detectors and descriptor extractors.
*/
class CV_EXPORTS Feature2DAsync
{
public:
virtual ~Feature2DAsync();
/** @brief Detects keypoints in an image.
@param image Image.
@param keypoints The detected keypoints.
@param mask Mask specifying where to look for keypoints (optional). It must be a 8-bit integer
matrix with non-zero values in the region of interest.
@param stream CUDA stream.
*/
virtual void detectAsync(InputArray image,
OutputArray keypoints,
InputArray mask = noArray(),
Stream& stream = Stream::Null());
/** @brief Computes the descriptors for a set of keypoints detected in an image.
@param image Image.
@param keypoints Input collection of keypoints.
@param descriptors Computed descriptors. Row j is the descriptor for j-th keypoint.
@param stream CUDA stream.
*/
virtual void computeAsync(InputArray image,
OutputArray keypoints,
OutputArray descriptors,
Stream& stream = Stream::Null());
/** Detects keypoints and computes the descriptors. */
virtual void detectAndComputeAsync(InputArray image,
InputArray mask,
OutputArray keypoints,
OutputArray descriptors,
bool useProvidedKeypoints = false,
Stream& stream = Stream::Null());
/** Converts keypoints array from internal representation to standard vector. */
virtual void convert(InputArray gpu_keypoints,
std::vector<KeyPoint>& keypoints) = 0;
};
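A hedged sketch of how these asynchronous methods combine with a CUDA stream (not part of the diff; the concrete detector and variable names are illustrative):

// Illustrative: asynchronous detection on a user stream, conversion after synchronization.
// cv::cuda::FastFeatureDetector is used only as one concrete Feature2DAsync implementation.
#include <opencv2/cudafeatures2d.hpp>

void exampleDetectAsync(const cv::cuda::GpuMat& d_image)
{
    cv::Ptr<cv::cuda::FastFeatureDetector> detector =
        cv::cuda::FastFeatureDetector::create(20 /* threshold, illustrative */);

    cv::cuda::Stream stream;
    cv::cuda::GpuMat d_keypoints;   // GPU-side, internal representation

    // Work is enqueued on the stream; the call returns without waiting for the GPU.
    detector->detectAsync(d_image, d_keypoints, cv::noArray(), stream);

    // ... other host work or additional GPU work can overlap here ...

    stream.waitForCompletion();     // synchronize before reading the results

    std::vector<cv::KeyPoint> keypoints;
    detector->convert(d_keypoints, keypoints);   // decode to the standard representation
}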
/** @brief Class used for corner detection using the FAST algorithm. :
*/
class CV_EXPORTS FAST_CUDA
{
public:
enum
{
LOCATION_ROW = 0,
RESPONSE_ROW,
ROWS_COUNT
};

//! all features have same size
static const int FEATURE_SIZE = 7;

/** @brief Constructor.

@param threshold Threshold on difference between intensity of the central pixel and pixels on a
circle around this pixel.
@param nonmaxSuppression If it is true, non-maximum suppression is applied to detected corners
(keypoints).
@param keypointsRatio Inner buffer size for keypoints store is determined as (keypointsRatio \*
image_width \* image_height).
*/
explicit FAST_CUDA(int threshold, bool nonmaxSuppression = true, double keypointsRatio = 0.05);

/** @brief Finds the keypoints using FAST detector.

@param image Image where keypoints (corners) are detected. Only 8-bit grayscale images are
supported.
@param mask Optional input mask that marks the regions where we should detect features.
@param keypoints The output vector of keypoints. Can be stored both in CPU and GPU memory. For GPU
memory:
- keypoints.ptr\<Vec2s\>(LOCATION_ROW)[i] will contain location of i'th point
- keypoints.ptr\<float\>(RESPONSE_ROW)[i] will contain response of i'th point (if non-maximum
suppression is applied)
*/
void operator ()(const GpuMat& image, const GpuMat& mask, GpuMat& keypoints);
/** @overload */
void operator ()(const GpuMat& image, const GpuMat& mask, std::vector<KeyPoint>& keypoints);

/** @brief Download keypoints from GPU to CPU memory.
*/
static void downloadKeypoints(const GpuMat& d_keypoints, std::vector<KeyPoint>& keypoints);

/** @brief Converts keypoints from CUDA representation to vector of KeyPoint.
*/
static void convertKeypoints(const Mat& h_keypoints, std::vector<KeyPoint>& keypoints);

/** @brief Releases inner buffer memory.
*/
void release();

bool nonmaxSuppression;

int threshold;

//! max keypoints = keypointsRatio * img.size().area()
double keypointsRatio;

/** @brief Find keypoints and compute it's response if nonmaxSuppression is true.

@param image Image where keypoints (corners) are detected. Only 8-bit grayscale images are
supported.
@param mask Optional input mask that marks the regions where we should detect features.

The function returns count of detected keypoints.
*/
int calcKeyPointsLocation(const GpuMat& image, const GpuMat& mask);

/** @brief Gets final array of keypoints.

@param keypoints The output vector of keypoints.

The function performs non-max suppression if needed and returns final count of keypoints.
*/
int getKeyPoints(GpuMat& keypoints);

private:
GpuMat kpLoc_;
int count_;

GpuMat score_;

GpuMat d_keypoints_;
};

//
// FastFeatureDetector
//

/** @brief Wrapping class for feature detection using the FAST method.
*/
class CV_EXPORTS FastFeatureDetector : public cv::FastFeatureDetector, public Feature2DAsync
{
public:
enum
{
LOCATION_ROW = 0,
RESPONSE_ROW,
ROWS_COUNT,

FEATURE_SIZE = 7
};

static Ptr<FastFeatureDetector> create(int threshold=10,
bool nonmaxSuppression=true,
int type=FastFeatureDetector::TYPE_9_16,
int max_npoints = 5000);

virtual void setMaxNumPoints(int max_npoints) = 0;
virtual int getMaxNumPoints() const = 0;
};
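A short blocking-mode sketch of the new factory (illustrative parameter values, not part of the diff):

// Illustrative blocking usage of cuda::FastFeatureDetector::create.
#include <opencv2/cudafeatures2d.hpp>

void exampleFast(const cv::Mat& img)   // img is expected to be 8-bit grayscale
{
    cv::cuda::GpuMat d_img(img);

    cv::Ptr<cv::cuda::FastFeatureDetector> fast =
        cv::cuda::FastFeatureDetector::create(30,                                  // threshold
                                              true,                                // non-max suppression
                                              cv::FastFeatureDetector::TYPE_9_16,  // only 9_16 is supported by the CUDA impl
                                              10000);                              // max_npoints buffer size

    std::vector<cv::KeyPoint> keypoints;
    fast->detect(d_img, keypoints);    // blocking detect; the CUDA implementation overrides it
}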
/** @brief Class for extracting ORB features and descriptors from an image. :
*/
class CV_EXPORTS ORB_CUDA
{
public:
enum

//
// ORB
//

/** @brief Class implementing the ORB (*oriented BRIEF*) keypoint detector and descriptor extractor
*
* @sa cv::ORB
*/
class CV_EXPORTS ORB : public cv::ORB, public Feature2DAsync
{
public:
enum

@@ -331,113 +466,20 @@ public:
ROWS_COUNT
};
enum
{
DEFAULT_FAST_THRESHOLD = 20
};

/** @brief Constructor.

@param nFeatures The number of desired features.
@param scaleFactor Coefficient by which we divide the dimensions from one scale pyramid level to
the next.
@param nLevels The number of levels in the scale pyramid.
@param edgeThreshold How far from the boundary the points should be.
@param firstLevel The level at which the image is given. If 1, that means we will also look at the
image scaleFactor times bigger.
@param WTA_K
@param scoreType
@param patchSize
*/
explicit ORB_CUDA(int nFeatures = 500, float scaleFactor = 1.2f, int nLevels = 8, int edgeThreshold = 31,
int firstLevel = 0, int WTA_K = 2, int scoreType = 0, int patchSize = 31);

static Ptr<ORB> create(int nfeatures=500,
float scaleFactor=1.2f,
int nlevels=8,
int edgeThreshold=31,
int firstLevel=0,
int WTA_K=2,
int scoreType=ORB::HARRIS_SCORE,
int patchSize=31,
int fastThreshold=20,
bool blurForDescriptor=false);
/** @overload */
void operator()(const GpuMat& image, const GpuMat& mask, std::vector<KeyPoint>& keypoints);
/** @overload */
void operator()(const GpuMat& image, const GpuMat& mask, GpuMat& keypoints);
/** @brief Detects keypoints and computes descriptors for them.
@param image Input 8-bit grayscale image.
@param mask Optional input mask that marks the regions where we should detect features.
@param keypoints The input/output vector of keypoints. Can be stored both in CPU and GPU memory.
For GPU memory:
- keypoints.ptr\<float\>(X_ROW)[i] contains x coordinate of the i'th feature.
- keypoints.ptr\<float\>(Y_ROW)[i] contains y coordinate of the i'th feature.
- keypoints.ptr\<float\>(RESPONSE_ROW)[i] contains the response of the i'th feature.
- keypoints.ptr\<float\>(ANGLE_ROW)[i] contains orientation of the i'th feature.
- keypoints.ptr\<float\>(OCTAVE_ROW)[i] contains the octave of the i'th feature.
- keypoints.ptr\<float\>(SIZE_ROW)[i] contains the size of the i'th feature.
@param descriptors Computed descriptors. if blurForDescriptor is true, image will be blurred
before descriptors calculation.
*/
void operator()(const GpuMat& image, const GpuMat& mask, std::vector<KeyPoint>& keypoints, GpuMat& descriptors);
/** @overload */
void operator()(const GpuMat& image, const GpuMat& mask, GpuMat& keypoints, GpuMat& descriptors);
/** @brief Download keypoints from GPU to CPU memory.
*/
static void downloadKeyPoints(const GpuMat& d_keypoints, std::vector<KeyPoint>& keypoints);
/** @brief Converts keypoints from CUDA representation to vector of KeyPoint.
*/
static void convertKeyPoints(const Mat& d_keypoints, std::vector<KeyPoint>& keypoints);
//! returns the descriptor size in bytes
inline int descriptorSize() const { return kBytes; }
inline void setFastParams(int threshold, bool nonmaxSuppression = true)
{
fastDetector_.threshold = threshold;
fastDetector_.nonmaxSuppression = nonmaxSuppression;
}
/** @brief Releases inner buffer memory.
*/
void release();
//! if true, image will be blurred before descriptors calculation
bool blurForDescriptor;
virtual void setBlurForDescriptor(bool blurForDescriptor) = 0;
virtual bool getBlurForDescriptor() const = 0;
private:
enum { kBytes = 32 };
void buildScalePyramids(const GpuMat& image, const GpuMat& mask);
void computeKeyPointsPyramid();
void computeDescriptors(GpuMat& descriptors);
void mergeKeyPoints(GpuMat& keypoints);
int nFeatures_;
float scaleFactor_;
int nLevels_;
int edgeThreshold_;
int firstLevel_;
int WTA_K_;
int scoreType_;
int patchSize_;
//! The number of desired features per scale
std::vector<size_t> n_features_per_level_;
//! Points to compute BRIEF descriptors from
GpuMat pattern_;
std::vector<GpuMat> imagePyr_;
std::vector<GpuMat> maskPyr_;
GpuMat buf_;
std::vector<GpuMat> keyPointsPyr_;
std::vector<int> keyPointsCount_;
FAST_CUDA fastDetector_;
Ptr<cuda::Filter> blurFilter;
GpuMat d_keypoints_;
};

//! @}
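A sketch tying the new cuda::ORB class to the matcher interface above (illustrative, not part of the diff; parameter choices are examples):

// Illustrative: ORB detection/extraction on the GPU plus brute-force Hamming matching.
#include <opencv2/cudafeatures2d.hpp>

void exampleOrbMatch(const cv::cuda::GpuMat& d_img1, const cv::cuda::GpuMat& d_img2)
{
    // Detector/extractor with mostly default parameters; blurForDescriptor is the extra CUDA knob.
    cv::Ptr<cv::cuda::ORB> orb = cv::cuda::ORB::create(500);
    orb->setBlurForDescriptor(true);

    std::vector<cv::KeyPoint> kpts1, kpts2;
    cv::cuda::GpuMat d_desc1, d_desc2;

    // Blocking detectAndCompute (overridden by the CUDA implementation); descriptors stay on the GPU.
    orb->detectAndCompute(d_img1, cv::noArray(), kpts1, d_desc1);
    orb->detectAndCompute(d_img2, cv::noArray(), kpts2, d_desc2);

    // ORB descriptors are binary, so Hamming distance is the appropriate norm.
    cv::Ptr<cv::cuda::DescriptorMatcher> matcher =
        cv::cuda::DescriptorMatcher::createBFMatcher(cv::NORM_HAMMING);

    std::vector<cv::DMatch> matches;
    matcher->match(d_desc1, d_desc2, matches);
}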


@@ -64,15 +64,18 @@ PERF_TEST_P(Image_Threshold_NonMaxSuppression, FAST,
if (PERF_RUN_CUDA())
{
cv::cuda::FAST_CUDA d_fast(threshold, nonMaxSuppersion, 0.5);
cv::Ptr<cv::cuda::FastFeatureDetector> d_fast =
cv::cuda::FastFeatureDetector::create(threshold, nonMaxSuppersion,
cv::FastFeatureDetector::TYPE_9_16,
0.5 * img.size().area());

const cv::cuda::GpuMat d_img(img);
cv::cuda::GpuMat d_keypoints;

TEST_CYCLE() d_fast(d_img, cv::cuda::GpuMat(), d_keypoints);
TEST_CYCLE() d_fast->detectAsync(d_img, d_keypoints);

std::vector<cv::KeyPoint> gpu_keypoints;
d_fast.downloadKeypoints(d_keypoints, gpu_keypoints);
d_fast->convert(d_keypoints, gpu_keypoints);

sortKeyPoints(gpu_keypoints);
@@ -106,15 +109,15 @@ PERF_TEST_P(Image_NFeatures, ORB,
if (PERF_RUN_CUDA())
{
cv::cuda::ORB_CUDA d_orb(nFeatures);
cv::Ptr<cv::cuda::ORB> d_orb = cv::cuda::ORB::create(nFeatures);

const cv::cuda::GpuMat d_img(img);
cv::cuda::GpuMat d_keypoints, d_descriptors;

TEST_CYCLE() d_orb(d_img, cv::cuda::GpuMat(), d_keypoints, d_descriptors);
TEST_CYCLE() d_orb->detectAndComputeAsync(d_img, cv::noArray(), d_keypoints, d_descriptors);

std::vector<cv::KeyPoint> gpu_keypoints;
d_orb.downloadKeyPoints(d_keypoints, gpu_keypoints);
d_orb->convert(d_keypoints, gpu_keypoints);

cv::Mat gpu_descriptors(d_descriptors);
@@ -164,16 +167,16 @@ PERF_TEST_P(DescSize_Norm, BFMatch,
if (PERF_RUN_CUDA())
{
cv::cuda::BFMatcher_CUDA d_matcher(normType);
cv::Ptr<cv::cuda::DescriptorMatcher> d_matcher = cv::cuda::DescriptorMatcher::createBFMatcher(normType);

const cv::cuda::GpuMat d_query(query);
const cv::cuda::GpuMat d_train(train);
cv::cuda::GpuMat d_trainIdx, d_distance;
cv::cuda::GpuMat d_matches;

TEST_CYCLE() d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance);
TEST_CYCLE() d_matcher->matchAsync(d_query, d_train, d_matches);

std::vector<cv::DMatch> gpu_matches;
d_matcher.matchDownload(d_trainIdx, d_distance, gpu_matches);
d_matcher->matchConvert(d_matches, gpu_matches);

SANITY_CHECK_MATCHES(gpu_matches);
}
@@ -223,16 +226,16 @@ PERF_TEST_P(DescSize_K_Norm, BFKnnMatch,
if (PERF_RUN_CUDA())
{
cv::cuda::BFMatcher_CUDA d_matcher(normType);
cv::Ptr<cv::cuda::DescriptorMatcher> d_matcher = cv::cuda::DescriptorMatcher::createBFMatcher(normType);

const cv::cuda::GpuMat d_query(query);
const cv::cuda::GpuMat d_train(train);
cv::cuda::GpuMat d_trainIdx, d_distance, d_allDist;
cv::cuda::GpuMat d_matches;

TEST_CYCLE() d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, k);
TEST_CYCLE() d_matcher->knnMatchAsync(d_query, d_train, d_matches, k);

std::vector< std::vector<cv::DMatch> > matchesTbl;
d_matcher.knnMatchDownload(d_trainIdx, d_distance, matchesTbl);
d_matcher->knnMatchConvert(d_matches, matchesTbl);

std::vector<cv::DMatch> gpu_matches;
toOneRowMatches(matchesTbl, gpu_matches);
@@ -277,16 +280,16 @@ PERF_TEST_P(DescSize_Norm, BFRadiusMatch,
if (PERF_RUN_CUDA())
{
cv::cuda::BFMatcher_CUDA d_matcher(normType);
cv::Ptr<cv::cuda::DescriptorMatcher> d_matcher = cv::cuda::DescriptorMatcher::createBFMatcher(normType);

const cv::cuda::GpuMat d_query(query);
const cv::cuda::GpuMat d_train(train);
cv::cuda::GpuMat d_trainIdx, d_nMatches, d_distance;
cv::cuda::GpuMat d_matches;

TEST_CYCLE() d_matcher.radiusMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_nMatches, maxDistance);
TEST_CYCLE() d_matcher->radiusMatchAsync(d_query, d_train, d_matches, maxDistance);

std::vector< std::vector<cv::DMatch> > matchesTbl;
d_matcher.radiusMatchDownload(d_trainIdx, d_distance, d_nMatches, matchesTbl);
d_matcher->radiusMatchConvert(d_matches, matchesTbl);

std::vector<cv::DMatch> gpu_matches;
toOneRowMatches(matchesTbl, gpu_matches);

File diff suppressed because it is too large


@@ -279,7 +279,7 @@ namespace cv { namespace cuda { namespace device
#endif
}

int calcKeypoints_gpu(PtrStepSzb img, PtrStepSzb mask, short2* kpLoc, int maxKeypoints, PtrStepSzi score, int threshold)
int calcKeypoints_gpu(PtrStepSzb img, PtrStepSzb mask, short2* kpLoc, int maxKeypoints, PtrStepSzi score, int threshold, cudaStream_t stream)
{
void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
@@ -290,29 +290,29 @@ namespace cv { namespace cuda { namespace device
grid.x = divUp(img.cols - 6, block.x);
grid.y = divUp(img.rows - 6, block.y);

cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) );
cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(unsigned int), stream) );

if (score.data)
{
if (mask.data)
calcKeypoints<true><<<grid, block>>>(img, SingleMask(mask), kpLoc, maxKeypoints, score, threshold);
calcKeypoints<true><<<grid, block, 0, stream>>>(img, SingleMask(mask), kpLoc, maxKeypoints, score, threshold);
else
calcKeypoints<true><<<grid, block>>>(img, WithOutMask(), kpLoc, maxKeypoints, score, threshold);
calcKeypoints<true><<<grid, block, 0, stream>>>(img, WithOutMask(), kpLoc, maxKeypoints, score, threshold);
}
else
{
if (mask.data)
calcKeypoints<false><<<grid, block>>>(img, SingleMask(mask), kpLoc, maxKeypoints, score, threshold);
calcKeypoints<false><<<grid, block, 0, stream>>>(img, SingleMask(mask), kpLoc, maxKeypoints, score, threshold);
else
calcKeypoints<false><<<grid, block>>>(img, WithOutMask(), kpLoc, maxKeypoints, score, threshold);
calcKeypoints<false><<<grid, block, 0, stream>>>(img, WithOutMask(), kpLoc, maxKeypoints, score, threshold);
}

cudaSafeCall( cudaGetLastError() );

cudaSafeCall( cudaDeviceSynchronize() );

unsigned int count;
cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
cudaSafeCall( cudaMemcpyAsync(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost, stream) );
cudaSafeCall( cudaStreamSynchronize(stream) );

return count;
}
@@ -356,7 +356,7 @@ namespace cv { namespace cuda { namespace device
#endif
}

int nonmaxSuppression_gpu(const short2* kpLoc, int count, PtrStepSzi score, short2* loc, float* response)
int nonmaxSuppression_gpu(const short2* kpLoc, int count, PtrStepSzi score, short2* loc, float* response, cudaStream_t stream)
{
void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
@@ -366,15 +366,15 @@ namespace cv { namespace cuda { namespace device
dim3 grid;
grid.x = divUp(count, block.x);

cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) );
cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(unsigned int), stream) );

nonmaxSuppression<<<grid, block>>>(kpLoc, count, score, loc, response);
nonmaxSuppression<<<grid, block, 0, stream>>>(kpLoc, count, score, loc, response);
cudaSafeCall( cudaGetLastError() );

cudaSafeCall( cudaDeviceSynchronize() );

unsigned int new_count;
cudaSafeCall( cudaMemcpy(&new_count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
cudaSafeCall( cudaMemcpyAsync(&new_count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost, stream) );
cudaSafeCall( cudaStreamSynchronize(stream) );

return new_count;
}
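The change above replaces the blocking memset/memcpy/deviceSynchronize calls with stream-ordered variants. A standalone sketch of the same idiom, independent of the OpenCV sources and with illustrative names (g_counter, myKernel):

// Illustrative sketch of the stream-ordered counter read-back idiom used above.
// Names (g_counter, myKernel) are placeholders, not part of the OpenCV sources.
#include <cuda_runtime.h>

__device__ unsigned int g_counter;

__global__ void myKernel()
{
    // each thread that produces a result would do: atomicAdd(&g_counter, 1);
}

unsigned int runOnStream(cudaStream_t stream)
{
    void* counter_ptr;
    cudaGetSymbolAddress(&counter_ptr, g_counter);

    // Reset the device counter without blocking the host.
    cudaMemsetAsync(counter_ptr, 0, sizeof(unsigned int), stream);

    // Launch the kernel on the same stream so it runs after the memset.
    myKernel<<<1, 256, 0, stream>>>();

    // Queue the copy-back; it executes only after the kernel finishes on this stream.
    unsigned int count = 0;
    cudaMemcpyAsync(&count, counter_ptr, sizeof(unsigned int),
                    cudaMemcpyDeviceToHost, stream);

    // The host must wait before it can safely read 'count'.
    cudaStreamSynchronize(stream);
    return count;
}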


@@ -47,124 +47,162 @@ using namespace cv::cuda;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)

cv::cuda::FAST_CUDA::FAST_CUDA(int, bool, double) { throw_no_cuda(); }
Ptr<cv::cuda::FastFeatureDetector> cv::cuda::FastFeatureDetector::create(int, bool, int, int) { throw_no_cuda(); return Ptr<cv::cuda::FastFeatureDetector>(); }
void cv::cuda::FAST_CUDA::operator ()(const GpuMat&, const GpuMat&, GpuMat&) { throw_no_cuda(); }
void cv::cuda::FAST_CUDA::operator ()(const GpuMat&, const GpuMat&, std::vector<KeyPoint>&) { throw_no_cuda(); }
void cv::cuda::FAST_CUDA::downloadKeypoints(const GpuMat&, std::vector<KeyPoint>&) { throw_no_cuda(); }
void cv::cuda::FAST_CUDA::convertKeypoints(const Mat&, std::vector<KeyPoint>&) { throw_no_cuda(); }
void cv::cuda::FAST_CUDA::release() { throw_no_cuda(); }
int cv::cuda::FAST_CUDA::calcKeyPointsLocation(const GpuMat&, const GpuMat&) { throw_no_cuda(); return 0; }
int cv::cuda::FAST_CUDA::getKeyPoints(GpuMat&) { throw_no_cuda(); return 0; }
#else /* !defined (HAVE_CUDA) */
cv::cuda::FAST_CUDA::FAST_CUDA(int _threshold, bool _nonmaxSuppression, double _keypointsRatio) :
nonmaxSuppression(_nonmaxSuppression), threshold(_threshold), keypointsRatio(_keypointsRatio), count_(0)
{
}
void cv::cuda::FAST_CUDA::operator ()(const GpuMat& image, const GpuMat& mask, std::vector<KeyPoint>& keypoints)
{
if (image.empty())
return;
(*this)(image, mask, d_keypoints_);
downloadKeypoints(d_keypoints_, keypoints);
}
void cv::cuda::FAST_CUDA::downloadKeypoints(const GpuMat& d_keypoints, std::vector<KeyPoint>& keypoints)
{
if (d_keypoints.empty())
return;
Mat h_keypoints(d_keypoints);
convertKeypoints(h_keypoints, keypoints);
}
void cv::cuda::FAST_CUDA::convertKeypoints(const Mat& h_keypoints, std::vector<KeyPoint>& keypoints)
{
if (h_keypoints.empty())
return;
CV_Assert(h_keypoints.rows == ROWS_COUNT && h_keypoints.elemSize() == 4);
int npoints = h_keypoints.cols;
keypoints.resize(npoints);
const short2* loc_row = h_keypoints.ptr<short2>(LOCATION_ROW);
const float* response_row = h_keypoints.ptr<float>(RESPONSE_ROW);
for (int i = 0; i < npoints; ++i)
{
KeyPoint kp(loc_row[i].x, loc_row[i].y, static_cast<float>(FEATURE_SIZE), -1, response_row[i]);
keypoints[i] = kp;
}
}
void cv::cuda::FAST_CUDA::operator ()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints)
{
calcKeyPointsLocation(img, mask);
keypoints.cols = getKeyPoints(keypoints);
}
namespace cv { namespace cuda { namespace device
{
namespace fast
{
int calcKeypoints_gpu(PtrStepSzb img, PtrStepSzb mask, short2* kpLoc, int maxKeypoints, PtrStepSzi score, int threshold);
int nonmaxSuppression_gpu(const short2* kpLoc, int count, PtrStepSzi score, short2* loc, float* response);
int calcKeypoints_gpu(PtrStepSzb img, PtrStepSzb mask, short2* kpLoc, int maxKeypoints, PtrStepSzi score, int threshold, cudaStream_t stream);
int nonmaxSuppression_gpu(const short2* kpLoc, int count, PtrStepSzi score, short2* loc, float* response, cudaStream_t stream);
}
}}}

int cv::cuda::FAST_CUDA::calcKeyPointsLocation(const GpuMat& img, const GpuMat& mask)
{
using namespace cv::cuda::device::fast;

CV_Assert(img.type() == CV_8UC1);
CV_Assert(mask.empty() || (mask.type() == CV_8UC1 && mask.size() == img.size()));

int maxKeypoints = static_cast<int>(keypointsRatio * img.size().area());

ensureSizeIsEnough(1, maxKeypoints, CV_16SC2, kpLoc_);

if (nonmaxSuppression)
{
ensureSizeIsEnough(img.size(), CV_32SC1, score_);
score_.setTo(Scalar::all(0));
}

count_ = calcKeypoints_gpu(img, mask, kpLoc_.ptr<short2>(), maxKeypoints, nonmaxSuppression ? score_ : PtrStepSzi(), threshold);
count_ = std::min(count_, maxKeypoints);

return count_;
}

namespace
{
class FAST_Impl : public cv::cuda::FastFeatureDetector
{
public:
FAST_Impl(int threshold, bool nonmaxSuppression, int max_npoints);

virtual void detect(InputArray _image, std::vector<KeyPoint>& keypoints, InputArray _mask);
virtual void detectAsync(InputArray _image, OutputArray _keypoints, InputArray _mask, Stream& stream);

virtual void convert(InputArray _gpu_keypoints, std::vector<KeyPoint>& keypoints);

virtual void setThreshold(int threshold) { threshold_ = threshold; }
virtual int getThreshold() const { return threshold_; }

virtual void setNonmaxSuppression(bool f) { nonmaxSuppression_ = f; }
virtual bool getNonmaxSuppression() const { return nonmaxSuppression_; }

virtual void setMaxNumPoints(int max_npoints) { max_npoints_ = max_npoints; }
virtual int getMaxNumPoints() const { return max_npoints_; }

virtual void setType(int type) { CV_Assert( type == TYPE_9_16 ); }
virtual int getType() const { return TYPE_9_16; }

private:
int threshold_;
bool nonmaxSuppression_;
int max_npoints_;
};

FAST_Impl::FAST_Impl(int threshold, bool nonmaxSuppression, int max_npoints) :
threshold_(threshold), nonmaxSuppression_(nonmaxSuppression), max_npoints_(max_npoints)
{
}

void FAST_Impl::detect(InputArray _image, std::vector<KeyPoint>& keypoints, InputArray _mask)
{
if (_image.empty())
{
keypoints.clear();
return;
}

BufferPool pool(Stream::Null());
GpuMat d_keypoints = pool.getBuffer(ROWS_COUNT, max_npoints_, CV_16SC2);

detectAsync(_image, d_keypoints, _mask, Stream::Null());
convert(d_keypoints, keypoints);
}
void FAST_Impl::detectAsync(InputArray _image, OutputArray _keypoints, InputArray _mask, Stream& stream)
{
using namespace cv::cuda::device::fast;
const GpuMat img = _image.getGpuMat();
const GpuMat mask = _mask.getGpuMat();
CV_Assert( img.type() == CV_8UC1 );
CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == img.size()) );
BufferPool pool(stream);
GpuMat kpLoc = pool.getBuffer(1, max_npoints_, CV_16SC2);
GpuMat score;
if (nonmaxSuppression_)
{
score = pool.getBuffer(img.size(), CV_32SC1);
score.setTo(Scalar::all(0), stream);
}
int count = calcKeypoints_gpu(img, mask, kpLoc.ptr<short2>(), max_npoints_, score, threshold_, StreamAccessor::getStream(stream));
count = std::min(count, max_npoints_);
if (count == 0)
{
_keypoints.release();
return;
}
ensureSizeIsEnough(ROWS_COUNT, count, CV_32FC1, _keypoints);
GpuMat& keypoints = _keypoints.getGpuMatRef();
if (nonmaxSuppression_)
{
count = nonmaxSuppression_gpu(kpLoc.ptr<short2>(), count, score, keypoints.ptr<short2>(LOCATION_ROW), keypoints.ptr<float>(RESPONSE_ROW), StreamAccessor::getStream(stream));
if (count == 0)
{
keypoints.release();
}
else
{
keypoints.cols = count;
}
}
else
{
GpuMat locRow(1, count, kpLoc.type(), keypoints.ptr(0));
kpLoc.colRange(0, count).copyTo(locRow, stream);
keypoints.row(1).setTo(Scalar::all(0), stream);
}
}
void FAST_Impl::convert(InputArray _gpu_keypoints, std::vector<KeyPoint>& keypoints)
{
if (_gpu_keypoints.empty())
{
keypoints.clear();
return;
}
Mat h_keypoints;
if (_gpu_keypoints.kind() == _InputArray::CUDA_GPU_MAT)
{
_gpu_keypoints.getGpuMat().download(h_keypoints);
}
else
{
h_keypoints = _gpu_keypoints.getMat();
}
CV_Assert( h_keypoints.rows == ROWS_COUNT );
CV_Assert( h_keypoints.elemSize() == 4 );
const int npoints = h_keypoints.cols;
keypoints.resize(npoints);
const short2* loc_row = h_keypoints.ptr<short2>(LOCATION_ROW);
const float* response_row = h_keypoints.ptr<float>(RESPONSE_ROW);
for (int i = 0; i < npoints; ++i)
{
KeyPoint kp(loc_row[i].x, loc_row[i].y, static_cast<float>(FEATURE_SIZE), -1, response_row[i]);
keypoints[i] = kp;
}
}
} }
int cv::cuda::FAST_CUDA::getKeyPoints(GpuMat& keypoints)
{
using namespace cv::cuda::device::fast;

if (count_ == 0)
return 0;

ensureSizeIsEnough(ROWS_COUNT, count_, CV_32FC1, keypoints);

if (nonmaxSuppression)
return nonmaxSuppression_gpu(kpLoc_.ptr<short2>(), count_, score_, keypoints.ptr<short2>(LOCATION_ROW), keypoints.ptr<float>(RESPONSE_ROW));

GpuMat locRow(1, count_, kpLoc_.type(), keypoints.ptr(0));
kpLoc_.colRange(0, count_).copyTo(locRow);
keypoints.row(1).setTo(Scalar::all(0));

return count_;
}

void cv::cuda::FAST_CUDA::release()
{
kpLoc_.release();
score_.release();

d_keypoints_.release();
}

Ptr<cv::cuda::FastFeatureDetector> cv::cuda::FastFeatureDetector::create(int threshold, bool nonmaxSuppression, int type, int max_npoints)
{
CV_Assert( type == TYPE_9_16 );
return makePtr<FAST_Impl>(threshold, nonmaxSuppression, max_npoints);
}

#endif /* !defined (HAVE_CUDA) */
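One detail of the implementation above worth illustrating: convert() accepts the keypoint array either still in GPU memory or already downloaded to a host Mat. A hedged sketch (detector and variable names are illustrative):

// Illustrative: convert() works on a GpuMat or on a previously downloaded Mat.
void exampleConvert(const cv::Ptr<cv::cuda::FastFeatureDetector>& fast,
                    const cv::cuda::GpuMat& d_keypoints)
{
    std::vector<cv::KeyPoint> keypoints;

    // Directly from GPU memory (convert downloads internally).
    fast->convert(d_keypoints, keypoints);

    // Or from a host copy that was downloaded earlier.
    cv::Mat h_keypoints;
    d_keypoints.download(h_keypoints);
    fast->convert(h_keypoints, keypoints);
}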


@@ -0,0 +1,85 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
cv::cuda::Feature2DAsync::~Feature2DAsync()
{
}
void cv::cuda::Feature2DAsync::detectAsync(InputArray image,
OutputArray keypoints,
InputArray mask,
Stream& stream)
{
if (image.empty())
{
keypoints.clear();
return;
}
detectAndComputeAsync(image, mask, keypoints, noArray(), false, stream);
}
void cv::cuda::Feature2DAsync::computeAsync(InputArray image,
OutputArray keypoints,
OutputArray descriptors,
Stream& stream)
{
if (image.empty())
{
descriptors.release();
return;
}
detectAndComputeAsync(image, noArray(), keypoints, descriptors, true, stream);
}
void cv::cuda::Feature2DAsync::detectAndComputeAsync(InputArray /*image*/,
InputArray /*mask*/,
OutputArray /*keypoints*/,
OutputArray /*descriptors*/,
bool /*useProvidedKeypoints*/,
Stream& /*stream*/)
{
CV_Error(Error::StsNotImplemented, "");
}


@@ -47,18 +47,7 @@ using namespace cv::cuda;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)

cv::cuda::ORB_CUDA::ORB_CUDA(int, float, int, int, int, int, int, int) : fastDetector_(20) { throw_no_cuda(); }
Ptr<cv::cuda::ORB> cv::cuda::ORB::create(int, float, int, int, int, int, int, int, int, bool) { throw_no_cuda(); return Ptr<cv::cuda::ORB>(); }
void cv::cuda::ORB_CUDA::operator()(const GpuMat&, const GpuMat&, std::vector<KeyPoint>&) { throw_no_cuda(); }
void cv::cuda::ORB_CUDA::operator()(const GpuMat&, const GpuMat&, GpuMat&) { throw_no_cuda(); }
void cv::cuda::ORB_CUDA::operator()(const GpuMat&, const GpuMat&, std::vector<KeyPoint>&, GpuMat&) { throw_no_cuda(); }
void cv::cuda::ORB_CUDA::operator()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&) { throw_no_cuda(); }
void cv::cuda::ORB_CUDA::downloadKeyPoints(const GpuMat&, std::vector<KeyPoint>&) { throw_no_cuda(); }
void cv::cuda::ORB_CUDA::convertKeyPoints(const Mat&, std::vector<KeyPoint>&) { throw_no_cuda(); }
void cv::cuda::ORB_CUDA::release() { throw_no_cuda(); }
void cv::cuda::ORB_CUDA::buildScalePyramids(const GpuMat&, const GpuMat&) { throw_no_cuda(); }
void cv::cuda::ORB_CUDA::computeKeyPointsPyramid() { throw_no_cuda(); }
void cv::cuda::ORB_CUDA::computeDescriptors(GpuMat&) { throw_no_cuda(); }
void cv::cuda::ORB_CUDA::mergeKeyPoints(GpuMat&) { throw_no_cuda(); }
#else /* !defined (HAVE_CUDA) */
@@ -346,7 +335,100 @@ namespace
-1,-6, 0,-11/*mean (0.127148), correlation (0.547401)*/
};

class ORB_Impl : public cv::cuda::ORB
{
public:
ORB_Impl(int nfeatures,
float scaleFactor,
int nlevels,
int edgeThreshold,
int firstLevel,
int WTA_K,
int scoreType,
int patchSize,
int fastThreshold,
bool blurForDescriptor);
virtual void detectAndCompute(InputArray _image, InputArray _mask, std::vector<KeyPoint>& keypoints, OutputArray _descriptors, bool useProvidedKeypoints);
virtual void detectAndComputeAsync(InputArray _image, InputArray _mask, OutputArray _keypoints, OutputArray _descriptors, bool useProvidedKeypoints, Stream& stream);
virtual void convert(InputArray _gpu_keypoints, std::vector<KeyPoint>& keypoints);
virtual int descriptorSize() const { return kBytes; }
virtual int descriptorType() const { return CV_8U; }
virtual int defaultNorm() const { return NORM_HAMMING; }
virtual void setMaxFeatures(int maxFeatures) { nFeatures_ = maxFeatures; }
virtual int getMaxFeatures() const { return nFeatures_; }
virtual void setScaleFactor(double scaleFactor) { scaleFactor_ = scaleFactor; }
virtual double getScaleFactor() const { return scaleFactor_; }
virtual void setNLevels(int nlevels) { nLevels_ = nlevels; }
virtual int getNLevels() const { return nLevels_; }
virtual void setEdgeThreshold(int edgeThreshold) { edgeThreshold_ = edgeThreshold; }
virtual int getEdgeThreshold() const { return edgeThreshold_; }
virtual void setFirstLevel(int firstLevel) { firstLevel_ = firstLevel; }
virtual int getFirstLevel() const { return firstLevel_; }
virtual void setWTA_K(int wta_k) { WTA_K_ = wta_k; }
virtual int getWTA_K() const { return WTA_K_; }
virtual void setScoreType(int scoreType) { scoreType_ = scoreType; }
virtual int getScoreType() const { return scoreType_; }
virtual void setPatchSize(int patchSize) { patchSize_ = patchSize; }
virtual int getPatchSize() const { return patchSize_; }
virtual void setFastThreshold(int fastThreshold) { fastThreshold_ = fastThreshold; }
virtual int getFastThreshold() const { return fastThreshold_; }
virtual void setBlurForDescriptor(bool blurForDescriptor) { blurForDescriptor_ = blurForDescriptor; }
virtual bool getBlurForDescriptor() const { return blurForDescriptor_; }
private:
int nFeatures_;
float scaleFactor_;
int nLevels_;
int edgeThreshold_;
int firstLevel_;
int WTA_K_;
int scoreType_;
int patchSize_;
int fastThreshold_;
bool blurForDescriptor_;
private:
void buildScalePyramids(InputArray _image, InputArray _mask);
void computeKeyPointsPyramid();
void computeDescriptors(OutputArray _descriptors);
void mergeKeyPoints(OutputArray _keypoints);
private:
Ptr<cv::cuda::FastFeatureDetector> fastDetector_;
//! The number of desired features per scale
std::vector<size_t> n_features_per_level_;
//! Points to compute BRIEF descriptors from
GpuMat pattern_;
std::vector<GpuMat> imagePyr_;
std::vector<GpuMat> maskPyr_;
GpuMat buf_;
std::vector<GpuMat> keyPointsPyr_;
std::vector<int> keyPointsCount_;
Ptr<cuda::Filter> blurFilter_;
GpuMat d_keypoints_;
};
void initializeOrbPattern(const Point* pattern0, Mat& pattern, int ntuples, int tupleSize, int poolSize)
static void initializeOrbPattern(const Point* pattern0, Mat& pattern, int ntuples, int tupleSize, int poolSize)
{
RNG rng(0x12345678);
@@ -381,7 +463,7 @@
}
}

void makeRandomPattern(int patchSize, Point* pattern, int npoints)
static void makeRandomPattern(int patchSize, Point* pattern, int npoints)
{
// we always start with a fixed seed,
// to make patterns the same on each run
@@ -393,155 +475,189 @@
pattern[i].y = rng.uniform(-patchSize / 2, patchSize / 2 + 1);
}
}
}
    ORB_Impl::ORB_Impl(int nFeatures,
                       float scaleFactor,
                       int nLevels,
                       int edgeThreshold,
                       int firstLevel,
                       int WTA_K,
                       int scoreType,
                       int patchSize,
                       int fastThreshold,
                       bool blurForDescriptor) :
        nFeatures_(nFeatures),
        scaleFactor_(scaleFactor),
        nLevels_(nLevels),
        edgeThreshold_(edgeThreshold),
        firstLevel_(firstLevel),
        WTA_K_(WTA_K),
        scoreType_(scoreType),
        patchSize_(patchSize),
        fastThreshold_(fastThreshold),
        blurForDescriptor_(blurForDescriptor)
    {
        CV_Assert( patchSize_ >= 2 );
        CV_Assert( WTA_K_ == 2 || WTA_K_ == 3 || WTA_K_ == 4 );

        fastDetector_ = cuda::FastFeatureDetector::create(fastThreshold_);

        // fill the extractors and descriptors for the corresponding scales
        float factor = 1.0f / scaleFactor_;
        float n_desired_features_per_scale = nFeatures_ * (1.0f - factor) / (1.0f - std::pow(factor, nLevels_));

        n_features_per_level_.resize(nLevels_);
        size_t sum_n_features = 0;
        for (int level = 0; level < nLevels_ - 1; ++level)
        {
            n_features_per_level_[level] = cvRound(n_desired_features_per_scale);
            sum_n_features += n_features_per_level_[level];
            n_desired_features_per_scale *= factor;
        }
        n_features_per_level_[nLevels_ - 1] = nFeatures - sum_n_features;

        // pre-compute the end of a row in a circular patch
        int half_patch_size = patchSize_ / 2;
        std::vector<int> u_max(half_patch_size + 2);
        for (int v = 0; v <= half_patch_size * std::sqrt(2.f) / 2 + 1; ++v)
        {
            u_max[v] = cvRound(std::sqrt(static_cast<float>(half_patch_size * half_patch_size - v * v)));
        }

        // Make sure we are symmetric
        for (int v = half_patch_size, v_0 = 0; v >= half_patch_size * std::sqrt(2.f) / 2; --v)
        {
            while (u_max[v_0] == u_max[v_0 + 1])
                ++v_0;
            u_max[v] = v_0;
            ++v_0;
        }
        CV_Assert( u_max.size() < 32 );
        cv::cuda::device::orb::loadUMax(&u_max[0], static_cast<int>(u_max.size()));

        // Calc pattern
        const int npoints = 512;
        Point pattern_buf[npoints];
        const Point* pattern0 = (const Point*)bit_pattern_31_;
        if (patchSize_ != 31)
        {
            pattern0 = pattern_buf;
            makeRandomPattern(patchSize_, pattern_buf, npoints);
        }

        Mat h_pattern;
        if (WTA_K_ == 2)
        {
            h_pattern.create(2, npoints, CV_32SC1);

            int* pattern_x_ptr = h_pattern.ptr<int>(0);
            int* pattern_y_ptr = h_pattern.ptr<int>(1);

            for (int i = 0; i < npoints; ++i)
            {
                pattern_x_ptr[i] = pattern0[i].x;
                pattern_y_ptr[i] = pattern0[i].y;
            }
        }
        else
        {
            int ntuples = descriptorSize() * 4;
            initializeOrbPattern(pattern0, h_pattern, ntuples, WTA_K_, npoints);
        }

        pattern_.upload(h_pattern);

        blurFilter_ = cuda::createGaussianFilter(CV_8UC1, -1, Size(7, 7), 2, 2, BORDER_REFLECT_101);
    }

    void ORB_Impl::detectAndCompute(InputArray _image, InputArray _mask, std::vector<KeyPoint>& keypoints, OutputArray _descriptors, bool useProvidedKeypoints)
    {
        CV_Assert( useProvidedKeypoints == false );

        detectAndComputeAsync(_image, _mask, d_keypoints_, _descriptors, false, Stream::Null());
        convert(d_keypoints_, keypoints);
    }

    void ORB_Impl::detectAndComputeAsync(InputArray _image, InputArray _mask, OutputArray _keypoints, OutputArray _descriptors, bool useProvidedKeypoints, Stream& stream)
    {
        CV_Assert( useProvidedKeypoints == false );

        buildScalePyramids(_image, _mask);
        computeKeyPointsPyramid();
        if (_descriptors.needed())
        {
            computeDescriptors(_descriptors);
        }
        mergeKeyPoints(_keypoints);
    }

    static float getScale(float scaleFactor, int firstLevel, int level)
    {
        return pow(scaleFactor, level - firstLevel);
    }

    void ORB_Impl::buildScalePyramids(InputArray _image, InputArray _mask)
    {
        const GpuMat image = _image.getGpuMat();
        const GpuMat mask = _mask.getGpuMat();

        CV_Assert( image.type() == CV_8UC1 );
        CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == image.size()) );

        imagePyr_.resize(nLevels_);
        maskPyr_.resize(nLevels_);

        for (int level = 0; level < nLevels_; ++level)
        {
            float scale = 1.0f / getScale(scaleFactor_, firstLevel_, level);

            Size sz(cvRound(image.cols * scale), cvRound(image.rows * scale));

            ensureSizeIsEnough(sz, image.type(), imagePyr_[level]);
            ensureSizeIsEnough(sz, CV_8UC1, maskPyr_[level]);
            maskPyr_[level].setTo(Scalar::all(255));

            // Compute the resized image
            if (level != firstLevel_)
            {
                if (level < firstLevel_)
                {
                    cuda::resize(image, imagePyr_[level], sz, 0, 0, INTER_LINEAR);

                    if (!mask.empty())
                        cuda::resize(mask, maskPyr_[level], sz, 0, 0, INTER_LINEAR);
                }
                else
                {
                    cuda::resize(imagePyr_[level - 1], imagePyr_[level], sz, 0, 0, INTER_LINEAR);

                    if (!mask.empty())
                    {
                        cuda::resize(maskPyr_[level - 1], maskPyr_[level], sz, 0, 0, INTER_LINEAR);
                        cuda::threshold(maskPyr_[level], maskPyr_[level], 254, 0, THRESH_TOZERO);
                    }
                }
            }
            else
            {
                image.copyTo(imagePyr_[level]);

                if (!mask.empty())
                    mask.copyTo(maskPyr_[level]);
            }

            // Filter keypoints by image border
            ensureSizeIsEnough(sz, CV_8UC1, buf_);
            buf_.setTo(Scalar::all(0));
            Rect inner(edgeThreshold_, edgeThreshold_, sz.width - 2 * edgeThreshold_, sz.height - 2 * edgeThreshold_);
            buf_(inner).setTo(Scalar::all(255));

            cuda::bitwise_and(maskPyr_[level], buf_, maskPyr_[level]);
        }
    }

    // takes keypoints and culls them by the response
    static void cull(GpuMat& keypoints, int& count, int n_points)
    {
        using namespace cv::cuda::device::orb;

@@ -554,222 +670,199 @@ namespace
                return;
            }

            count = cull_gpu(keypoints.ptr<int>(cuda::FastFeatureDetector::LOCATION_ROW), keypoints.ptr<float>(cuda::FastFeatureDetector::RESPONSE_ROW), count, n_points);
        }
    }

    void ORB_Impl::computeKeyPointsPyramid()
    {
        using namespace cv::cuda::device::orb;

        int half_patch_size = patchSize_ / 2;

        keyPointsPyr_.resize(nLevels_);
        keyPointsCount_.resize(nLevels_);

        fastDetector_->setThreshold(fastThreshold_);

        for (int level = 0; level < nLevels_; ++level)
        {
            fastDetector_->setMaxNumPoints(0.05 * imagePyr_[level].size().area());

            GpuMat fastKpRange;
            fastDetector_->detectAsync(imagePyr_[level], fastKpRange, maskPyr_[level], Stream::Null());

            keyPointsCount_[level] = fastKpRange.cols;

            if (keyPointsCount_[level] == 0)
                continue;

            ensureSizeIsEnough(3, keyPointsCount_[level], fastKpRange.type(), keyPointsPyr_[level]);
            fastKpRange.copyTo(keyPointsPyr_[level].rowRange(0, 2));

            const int n_features = static_cast<int>(n_features_per_level_[level]);

            if (scoreType_ == ORB::HARRIS_SCORE)
            {
                // Keep more points than necessary as FAST does not give amazing corners
                cull(keyPointsPyr_[level], keyPointsCount_[level], 2 * n_features);

                // Compute the Harris cornerness (better scoring than FAST)
                HarrisResponses_gpu(imagePyr_[level], keyPointsPyr_[level].ptr<short2>(0), keyPointsPyr_[level].ptr<float>(1), keyPointsCount_[level], 7, HARRIS_K, 0);
            }

            //cull to the final desired level, using the new Harris scores or the original FAST scores.
            cull(keyPointsPyr_[level], keyPointsCount_[level], n_features);

            // Compute orientation
            IC_Angle_gpu(imagePyr_[level], keyPointsPyr_[level].ptr<short2>(0), keyPointsPyr_[level].ptr<float>(2), keyPointsCount_[level], half_patch_size, 0);
        }
    }

    void ORB_Impl::computeDescriptors(OutputArray _descriptors)
    {
        using namespace cv::cuda::device::orb;

        int nAllkeypoints = 0;

        for (int level = 0; level < nLevels_; ++level)
            nAllkeypoints += keyPointsCount_[level];

        if (nAllkeypoints == 0)
        {
            _descriptors.release();
            return;
        }

        ensureSizeIsEnough(nAllkeypoints, descriptorSize(), CV_8UC1, _descriptors);
        GpuMat descriptors = _descriptors.getGpuMat();

        int offset = 0;

        for (int level = 0; level < nLevels_; ++level)
        {
            if (keyPointsCount_[level] == 0)
                continue;

            GpuMat descRange = descriptors.rowRange(offset, offset + keyPointsCount_[level]);

            if (blurForDescriptor_)
            {
                // preprocess the resized image
                ensureSizeIsEnough(imagePyr_[level].size(), imagePyr_[level].type(), buf_);
                blurFilter_->apply(imagePyr_[level], buf_);
            }

            computeOrbDescriptor_gpu(blurForDescriptor_ ? buf_ : imagePyr_[level], keyPointsPyr_[level].ptr<short2>(0), keyPointsPyr_[level].ptr<float>(2),
                keyPointsCount_[level], pattern_.ptr<int>(0), pattern_.ptr<int>(1), descRange, descriptorSize(), WTA_K_, 0);

            offset += keyPointsCount_[level];
        }
    }

    void ORB_Impl::mergeKeyPoints(OutputArray _keypoints)
    {
        using namespace cv::cuda::device::orb;

        int nAllkeypoints = 0;

        for (int level = 0; level < nLevels_; ++level)
            nAllkeypoints += keyPointsCount_[level];

        if (nAllkeypoints == 0)
        {
            _keypoints.release();
            return;
        }

        ensureSizeIsEnough(ROWS_COUNT, nAllkeypoints, CV_32FC1, _keypoints);
        GpuMat& keypoints = _keypoints.getGpuMatRef();

        int offset = 0;

        for (int level = 0; level < nLevels_; ++level)
        {
            if (keyPointsCount_[level] == 0)
                continue;

            float sf = getScale(scaleFactor_, firstLevel_, level);

            GpuMat keyPointsRange = keypoints.colRange(offset, offset + keyPointsCount_[level]);

            float locScale = level != firstLevel_ ? sf : 1.0f;

            mergeLocation_gpu(keyPointsPyr_[level].ptr<short2>(0), keyPointsRange.ptr<float>(0), keyPointsRange.ptr<float>(1), keyPointsCount_[level], locScale, 0);

            GpuMat range = keyPointsRange.rowRange(2, 4);
            keyPointsPyr_[level](Range(1, 3), Range(0, keyPointsCount_[level])).copyTo(range);

            keyPointsRange.row(4).setTo(Scalar::all(level));
            keyPointsRange.row(5).setTo(Scalar::all(patchSize_ * sf));

            offset += keyPointsCount_[level];
        }
    }

    void ORB_Impl::convert(InputArray _gpu_keypoints, std::vector<KeyPoint>& keypoints)
    {
        if (_gpu_keypoints.empty())
        {
            keypoints.clear();
            return;
        }

        Mat h_keypoints;
        if (_gpu_keypoints.kind() == _InputArray::CUDA_GPU_MAT)
        {
            _gpu_keypoints.getGpuMat().download(h_keypoints);
        }
        else
        {
            h_keypoints = _gpu_keypoints.getMat();
        }

        CV_Assert( h_keypoints.rows == ROWS_COUNT );
        CV_Assert( h_keypoints.type() == CV_32FC1 );

        const int npoints = h_keypoints.cols;

        keypoints.resize(npoints);

        const float* x_ptr = h_keypoints.ptr<float>(X_ROW);
        const float* y_ptr = h_keypoints.ptr<float>(Y_ROW);
        const float* response_ptr = h_keypoints.ptr<float>(RESPONSE_ROW);
        const float* angle_ptr = h_keypoints.ptr<float>(ANGLE_ROW);
        const float* octave_ptr = h_keypoints.ptr<float>(OCTAVE_ROW);
        const float* size_ptr = h_keypoints.ptr<float>(SIZE_ROW);

        for (int i = 0; i < npoints; ++i)
        {
            KeyPoint kp;

            kp.pt.x = x_ptr[i];
            kp.pt.y = y_ptr[i];
            kp.response = response_ptr[i];
            kp.angle = angle_ptr[i];
            kp.octave = static_cast<int>(octave_ptr[i]);
            kp.size = size_ptr[i];

            keypoints[i] = kp;
        }
    }
}

Ptr<cv::cuda::ORB> cv::cuda::ORB::create(int nfeatures,
                                         float scaleFactor,
                                         int nlevels,
                                         int edgeThreshold,
                                         int firstLevel,
                                         int WTA_K,
                                         int scoreType,
                                         int patchSize,
                                         int fastThreshold,
                                         bool blurForDescriptor)
{
    return makePtr<ORB_Impl>(nfeatures, scaleFactor, nlevels, edgeThreshold, firstLevel, WTA_K, scoreType, patchSize, fastThreshold, blurForDescriptor);
}

#endif /* !defined (HAVE_CUDA) */
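For orientation, a minimal sketch of how the refactored factory-based interface above is typically driven from user code; the image path and variable names are illustrative assumptions, not code from this commit:

#include <vector>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/cudafeatures2d.hpp>

int main()
{
    // Upload a grayscale image to the GPU (path is a placeholder)
    cv::Mat img = cv::imread("scene.png", cv::IMREAD_GRAYSCALE);
    cv::cuda::GpuMat d_img(img);

    // Create the detector through the new factory and run detection + description
    cv::Ptr<cv::cuda::ORB> orb = cv::cuda::ORB::create(500);
    std::vector<cv::KeyPoint> keypoints;
    cv::cuda::GpuMat d_descriptors;
    orb->detectAndCompute(d_img, cv::cuda::GpuMat(), keypoints, d_descriptors);

    return 0;
}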

View File

@@ -76,15 +76,14 @@ CUDA_TEST_P(FAST, Accuracy)
    cv::Mat image = readImage("features2d/aloe.png", cv::IMREAD_GRAYSCALE);
    ASSERT_FALSE(image.empty());

    cv::Ptr<cv::cuda::FastFeatureDetector> fast = cv::cuda::FastFeatureDetector::create(threshold, nonmaxSuppression);

    if (!supportFeature(devInfo, cv::cuda::GLOBAL_ATOMICS))
    {
        try
        {
            std::vector<cv::KeyPoint> keypoints;
            fast->detect(loadMat(image), keypoints);
        }
        catch (const cv::Exception& e)
        {
@@ -94,7 +93,7 @@ CUDA_TEST_P(FAST, Accuracy)
    else
    {
        std::vector<cv::KeyPoint> keypoints;
        fast->detect(loadMat(image), keypoints);

        std::vector<cv::KeyPoint> keypoints_gold;
        cv::FAST(image, keypoints_gold, threshold, nonmaxSuppression);
@@ -123,7 +122,7 @@ namespace
    IMPLEMENT_PARAM_CLASS(ORB_BlurForDescriptor, bool)
}

CV_ENUM(ORB_ScoreType, cv::ORB::HARRIS_SCORE, cv::ORB::FAST_SCORE)

PARAM_TEST_CASE(ORB, cv::cuda::DeviceInfo, ORB_FeaturesCount, ORB_ScaleFactor, ORB_LevelsCount, ORB_EdgeThreshold, ORB_firstLevel, ORB_WTA_K, ORB_ScoreType, ORB_PatchSize, ORB_BlurForDescriptor)
{
@@ -163,8 +162,9 @@ CUDA_TEST_P(ORB, Accuracy)
    cv::Mat mask(image.size(), CV_8UC1, cv::Scalar::all(1));
    mask(cv::Range(0, image.rows / 2), cv::Range(0, image.cols / 2)).setTo(cv::Scalar::all(0));

    cv::Ptr<cv::cuda::ORB> orb =
            cv::cuda::ORB::create(nFeatures, scaleFactor, nLevels, edgeThreshold, firstLevel,
                                  WTA_K, scoreType, patchSize, 20, blurForDescriptor);

    if (!supportFeature(devInfo, cv::cuda::GLOBAL_ATOMICS))
    {
@@ -172,7 +172,7 @@ CUDA_TEST_P(ORB, Accuracy)
        {
            std::vector<cv::KeyPoint> keypoints;
            cv::cuda::GpuMat descriptors;
            orb->detectAndComputeAsync(loadMat(image), loadMat(mask), keypoints, descriptors);
        }
        catch (const cv::Exception& e)
        {
@@ -183,7 +183,7 @@ CUDA_TEST_P(ORB, Accuracy)
    {
        std::vector<cv::KeyPoint> keypoints;
        cv::cuda::GpuMat descriptors;
        orb->detectAndCompute(loadMat(image), loadMat(mask), keypoints, descriptors);

        cv::Ptr<cv::ORB> orb_gold = cv::ORB::create(nFeatures, scaleFactor, nLevels, edgeThreshold, firstLevel, WTA_K, scoreType, patchSize);
@@ -208,7 +208,7 @@ INSTANTIATE_TEST_CASE_P(CUDA_Features2D, ORB, testing::Combine(
    testing::Values(ORB_ScaleFactor(1.2f)),
    testing::Values(ORB_LevelsCount(4), ORB_LevelsCount(8)),
    testing::Values(ORB_EdgeThreshold(31)),
    testing::Values(ORB_firstLevel(0)),
    testing::Values(ORB_WTA_K(2), ORB_WTA_K(3), ORB_WTA_K(4)),
    testing::Values(ORB_ScoreType(cv::ORB::HARRIS_SCORE)),
    testing::Values(ORB_PatchSize(31), ORB_PatchSize(29)),
@@ -285,7 +285,8 @@ PARAM_TEST_CASE(BruteForceMatcher, cv::cuda::DeviceInfo, NormCode, DescriptorSiz
CUDA_TEST_P(BruteForceMatcher, Match_Single)
{
    cv::Ptr<cv::cuda::DescriptorMatcher> matcher =
            cv::cuda::DescriptorMatcher::createBFMatcher(normCode);

    cv::cuda::GpuMat mask;
    if (useMask)
@@ -295,7 +296,7 @@ CUDA_TEST_P(BruteForceMatcher, Match_Single)
    }

    std::vector<cv::DMatch> matches;
    matcher->match(loadMat(query), loadMat(train), matches, mask);

    ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());
@@ -312,13 +313,14 @@ CUDA_TEST_P(BruteForceMatcher, Match_Single)
CUDA_TEST_P(BruteForceMatcher, Match_Collection)
{
    cv::Ptr<cv::cuda::DescriptorMatcher> matcher =
            cv::cuda::DescriptorMatcher::createBFMatcher(normCode);

    cv::cuda::GpuMat d_train(train);

    // make add() twice to test such case
    matcher->add(std::vector<cv::cuda::GpuMat>(1, d_train.rowRange(0, train.rows / 2)));
    matcher->add(std::vector<cv::cuda::GpuMat>(1, d_train.rowRange(train.rows / 2, train.rows)));

    // prepare masks (make first nearest match illegal)
    std::vector<cv::cuda::GpuMat> masks(2);
@@ -331,9 +333,9 @@ CUDA_TEST_P(BruteForceMatcher, Match_Collection)
    std::vector<cv::DMatch> matches;

    if (useMask)
        matcher->match(cv::cuda::GpuMat(query), matches, masks);
    else
        matcher->match(cv::cuda::GpuMat(query), matches);

    ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());
@@ -366,7 +368,8 @@ CUDA_TEST_P(BruteForceMatcher, Match_Collection)
CUDA_TEST_P(BruteForceMatcher, KnnMatch_2_Single)
{
    cv::Ptr<cv::cuda::DescriptorMatcher> matcher =
            cv::cuda::DescriptorMatcher::createBFMatcher(normCode);

    const int knn = 2;
@@ -378,7 +381,7 @@ CUDA_TEST_P(BruteForceMatcher, KnnMatch_2_Single)
    }

    std::vector< std::vector<cv::DMatch> > matches;
    matcher->knnMatch(loadMat(query), loadMat(train), matches, knn, mask);

    ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());
@@ -405,7 +408,8 @@ CUDA_TEST_P(BruteForceMatcher, KnnMatch_2_Single)
CUDA_TEST_P(BruteForceMatcher, KnnMatch_3_Single)
{
    cv::Ptr<cv::cuda::DescriptorMatcher> matcher =
            cv::cuda::DescriptorMatcher::createBFMatcher(normCode);

    const int knn = 3;
@@ -417,7 +421,7 @@ CUDA_TEST_P(BruteForceMatcher, KnnMatch_3_Single)
    }

    std::vector< std::vector<cv::DMatch> > matches;
    matcher->knnMatch(loadMat(query), loadMat(train), matches, knn, mask);

    ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());
@@ -444,15 +448,16 @@ CUDA_TEST_P(BruteForceMatcher, KnnMatch_3_Single)
CUDA_TEST_P(BruteForceMatcher, KnnMatch_2_Collection)
{
    cv::Ptr<cv::cuda::DescriptorMatcher> matcher =
            cv::cuda::DescriptorMatcher::createBFMatcher(normCode);

    const int knn = 2;

    cv::cuda::GpuMat d_train(train);

    // make add() twice to test such case
    matcher->add(std::vector<cv::cuda::GpuMat>(1, d_train.rowRange(0, train.rows / 2)));
    matcher->add(std::vector<cv::cuda::GpuMat>(1, d_train.rowRange(train.rows / 2, train.rows)));

    // prepare masks (make first nearest match illegal)
    std::vector<cv::cuda::GpuMat> masks(2);
@@ -466,9 +471,9 @@ CUDA_TEST_P(BruteForceMatcher, KnnMatch_2_Collection)
    std::vector< std::vector<cv::DMatch> > matches;

    if (useMask)
        matcher->knnMatch(cv::cuda::GpuMat(query), matches, knn, masks);
    else
        matcher->knnMatch(cv::cuda::GpuMat(query), matches, knn);

    ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());
@@ -506,15 +511,16 @@ CUDA_TEST_P(BruteForceMatcher, KnnMatch_2_Collection)
CUDA_TEST_P(BruteForceMatcher, KnnMatch_3_Collection)
{
    cv::Ptr<cv::cuda::DescriptorMatcher> matcher =
            cv::cuda::DescriptorMatcher::createBFMatcher(normCode);

    const int knn = 3;

    cv::cuda::GpuMat d_train(train);

    // make add() twice to test such case
    matcher->add(std::vector<cv::cuda::GpuMat>(1, d_train.rowRange(0, train.rows / 2)));
    matcher->add(std::vector<cv::cuda::GpuMat>(1, d_train.rowRange(train.rows / 2, train.rows)));

    // prepare masks (make first nearest match illegal)
    std::vector<cv::cuda::GpuMat> masks(2);
@@ -528,9 +534,9 @@ CUDA_TEST_P(BruteForceMatcher, KnnMatch_3_Collection)
    std::vector< std::vector<cv::DMatch> > matches;

    if (useMask)
        matcher->knnMatch(cv::cuda::GpuMat(query), matches, knn, masks);
    else
        matcher->knnMatch(cv::cuda::GpuMat(query), matches, knn);

    ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());
@@ -568,7 +574,8 @@ CUDA_TEST_P(BruteForceMatcher, KnnMatch_3_Collection)
CUDA_TEST_P(BruteForceMatcher, RadiusMatch_Single)
{
    cv::Ptr<cv::cuda::DescriptorMatcher> matcher =
            cv::cuda::DescriptorMatcher::createBFMatcher(normCode);

    const float radius = 1.f / countFactor;
@@ -577,7 +584,7 @@ CUDA_TEST_P(BruteForceMatcher, RadiusMatch_Single)
        try
        {
            std::vector< std::vector<cv::DMatch> > matches;
            matcher->radiusMatch(loadMat(query), loadMat(train), matches, radius);
        }
        catch (const cv::Exception& e)
        {
@@ -594,7 +601,7 @@ CUDA_TEST_P(BruteForceMatcher, RadiusMatch_Single)
        }

        std::vector< std::vector<cv::DMatch> > matches;
        matcher->radiusMatch(loadMat(query), loadMat(train), matches, radius, mask);

        ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());
@@ -617,7 +624,8 @@ CUDA_TEST_P(BruteForceMatcher, RadiusMatch_Single)
CUDA_TEST_P(BruteForceMatcher, RadiusMatch_Collection)
{
    cv::Ptr<cv::cuda::DescriptorMatcher> matcher =
            cv::cuda::DescriptorMatcher::createBFMatcher(normCode);

    const int n = 3;

    const float radius = 1.f / countFactor * n;
@@ -625,8 +633,8 @@ CUDA_TEST_P(BruteForceMatcher, RadiusMatch_Collection)
    cv::cuda::GpuMat d_train(train);

    // make add() twice to test such case
    matcher->add(std::vector<cv::cuda::GpuMat>(1, d_train.rowRange(0, train.rows / 2)));
    matcher->add(std::vector<cv::cuda::GpuMat>(1, d_train.rowRange(train.rows / 2, train.rows)));

    // prepare masks (make first nearest match illegal)
    std::vector<cv::cuda::GpuMat> masks(2);
@@ -642,7 +650,7 @@ CUDA_TEST_P(BruteForceMatcher, RadiusMatch_Collection)
        try
        {
            std::vector< std::vector<cv::DMatch> > matches;
            matcher->radiusMatch(cv::cuda::GpuMat(query), matches, radius, masks);
        }
        catch (const cv::Exception& e)
        {
@@ -654,9 +662,9 @@ CUDA_TEST_P(BruteForceMatcher, RadiusMatch_Collection)
        std::vector< std::vector<cv::DMatch> > matches;

        if (useMask)
            matcher->radiusMatch(cv::cuda::GpuMat(query), matches, radius, masks);
        else
            matcher->radiusMatch(cv::cuda::GpuMat(query), matches, radius);

        ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());
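The tests above drive the new matcher exclusively through its factory. As a rough usage sketch outside the test harness (the descriptor matrices and names below are assumed for illustration, not taken from this commit):

#include <vector>
#include <opencv2/cudafeatures2d.hpp>

static std::vector<cv::DMatch> ratioMatch(const cv::cuda::GpuMat& d_desc1,
                                          const cv::cuda::GpuMat& d_desc2)
{
    // Binary descriptors such as ORB are compared with the Hamming norm
    cv::Ptr<cv::cuda::DescriptorMatcher> matcher =
            cv::cuda::DescriptorMatcher::createBFMatcher(cv::NORM_HAMMING);

    // 2-NN matching followed by the usual ratio test
    std::vector< std::vector<cv::DMatch> > knn;
    matcher->knnMatch(d_desc1, d_desc2, knn, 2);

    std::vector<cv::DMatch> good;
    for (size_t i = 0; i < knn.size(); ++i)
        if (knn[i].size() == 2 && knn[i][0].distance < 0.8f * knn[i][1].distance)
            good.push_back(knn[i][0]);

    return good;
}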

View File

@@ -154,7 +154,7 @@ void CpuMatcher::match(const ImageFeatures &features1, const ImageFeatures &feat
    matches_info.matches.clear();

    Ptr<cv::DescriptorMatcher> matcher;
#if 0 // TODO check this
    if (ocl::useOpenCL())
    {
@@ -220,13 +220,13 @@ void GpuMatcher::match(const ImageFeatures &features1, const ImageFeatures &feat
    descriptors1_.upload(features1.descriptors);
    descriptors2_.upload(features2.descriptors);

    Ptr<cuda::DescriptorMatcher> matcher = cuda::DescriptorMatcher::createBFMatcher(NORM_L2);

    MatchesSet matches;

    // Find 1->2 matches
    pair_matches.clear();
    matcher->knnMatch(descriptors1_, descriptors2_, pair_matches, 2);
    for (size_t i = 0; i < pair_matches.size(); ++i)
    {
        if (pair_matches[i].size() < 2)
@@ -242,8 +242,7 @@ void GpuMatcher::match(const ImageFeatures &features1, const ImageFeatures &feat
    // Find 2->1 matches
    pair_matches.clear();
    matcher->knnMatch(descriptors2_, descriptors1_, pair_matches, 2);
    for (size_t i = 0; i < pair_matches.size(); ++i)
    {
        if (pair_matches[i].size() < 2)

View File

@@ -322,14 +322,14 @@ TEST(FAST)
    FAST(src, keypoints, 20);
    CPU_OFF;

    cv::Ptr<cv::cuda::FastFeatureDetector> d_FAST = cv::cuda::FastFeatureDetector::create(20);
    cuda::GpuMat d_src(src);
    cuda::GpuMat d_keypoints;

    d_FAST->detectAsync(d_src, d_keypoints);

    CUDA_ON;
    d_FAST->detectAsync(d_src, d_keypoints);
    CUDA_OFF;
}
@@ -350,15 +350,15 @@ TEST(ORB)
    orb->detectAndCompute(src, Mat(), keypoints, descriptors);
    CPU_OFF;

    Ptr<cuda::ORB> d_orb = cuda::ORB::create();
    cuda::GpuMat d_src(src);
    cuda::GpuMat d_keypoints;
    cuda::GpuMat d_descriptors;

    d_orb->detectAndComputeAsync(d_src, cuda::GpuMat(), d_keypoints, d_descriptors);

    CUDA_ON;
    d_orb->detectAndComputeAsync(d_src, cuda::GpuMat(), d_keypoints, d_descriptors);
    CUDA_OFF;
}
@@ -379,14 +379,14 @@ TEST(BruteForceMatcher)
    // Init CUDA matcher
    Ptr<cuda::DescriptorMatcher> d_matcher = cuda::DescriptorMatcher::createBFMatcher(NORM_L2);

    cuda::GpuMat d_query(query);
    cuda::GpuMat d_train(train);

    // Output
    vector< vector<DMatch> > matches(2);
    cuda::GpuMat d_matches;

    SUBTEST << "match";
@@ -396,10 +396,10 @@ TEST(BruteForceMatcher)
    matcher.match(query, train, matches[0]);
    CPU_OFF;

    d_matcher->matchAsync(d_query, d_train, d_matches);

    CUDA_ON;
    d_matcher->matchAsync(d_query, d_train, d_matches);
    CUDA_OFF;

    SUBTEST << "knnMatch";
@@ -410,10 +410,10 @@ TEST(BruteForceMatcher)
    matcher.knnMatch(query, train, matches, 2);
    CPU_OFF;

    d_matcher->knnMatchAsync(d_query, d_train, d_matches, 2);

    CUDA_ON;
    d_matcher->knnMatchAsync(d_query, d_train, d_matches, 2);
    CUDA_OFF;

    SUBTEST << "radiusMatch";
@@ -426,12 +426,10 @@ TEST(BruteForceMatcher)
    matcher.radiusMatch(query, train, matches, max_distance);
    CPU_OFF;

    d_matcher->radiusMatchAsync(d_query, d_train, d_matches, max_distance);

    CUDA_ON;
    d_matcher->radiusMatchAsync(d_query, d_train, d_matches, max_distance);
    CUDA_OFF;
}
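The *Async variants benchmarked above leave their results in a GpuMat rather than a std::vector. A minimal sketch of the full round trip, assuming hypothetical d_query and d_train descriptor matrices and the converter methods exposed on the same matcher object:

#include <vector>
#include <opencv2/cudafeatures2d.hpp>

static void asyncMatch(const cv::cuda::GpuMat& d_query, const cv::cuda::GpuMat& d_train)
{
    cv::Ptr<cv::cuda::DescriptorMatcher> matcher =
            cv::cuda::DescriptorMatcher::createBFMatcher(cv::NORM_L2);

    cv::cuda::Stream stream;
    cv::cuda::GpuMat d_matches;

    // Enqueue the match on a stream; nothing is downloaded at this point
    matcher->matchAsync(d_query, d_train, d_matches, cv::noArray(), stream);
    stream.waitForCompletion();

    // Convert the raw GPU result into the usual DMatch vector on the host
    std::vector<cv::DMatch> matches;
    matcher->matchConvert(d_matches, matches);
}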