From 6f91b7f6a4605591f16a121058940481e5b5a2c7 Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Mon, 12 Jan 2015 12:46:44 +0300 Subject: [PATCH 1/8] remove firstLevel=2 test case, because CPU implementation fails --- modules/cudafeatures2d/test/test_features2d.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/cudafeatures2d/test/test_features2d.cpp b/modules/cudafeatures2d/test/test_features2d.cpp index 6e4479b7d..468024a5d 100644 --- a/modules/cudafeatures2d/test/test_features2d.cpp +++ b/modules/cudafeatures2d/test/test_features2d.cpp @@ -208,7 +208,7 @@ INSTANTIATE_TEST_CASE_P(CUDA_Features2D, ORB, testing::Combine( testing::Values(ORB_ScaleFactor(1.2f)), testing::Values(ORB_LevelsCount(4), ORB_LevelsCount(8)), testing::Values(ORB_EdgeThreshold(31)), - testing::Values(ORB_firstLevel(0), ORB_firstLevel(2)), + testing::Values(ORB_firstLevel(0)), testing::Values(ORB_WTA_K(2), ORB_WTA_K(3), ORB_WTA_K(4)), testing::Values(ORB_ScoreType(cv::ORB::HARRIS_SCORE)), testing::Values(ORB_PatchSize(31), ORB_PatchSize(29)), From 14ef62ed661893d44546a3c0b08a518bd39ee99d Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Mon, 12 Jan 2015 18:11:09 +0300 Subject: [PATCH 2/8] refactor CUDA FAST feature detector algorithm: use new FastFeatureDetector abstract interface and hidden implementation --- .../include/opencv2/cudafeatures2d.hpp | 115 +++------ .../cudafeatures2d/perf/perf_features2d.cpp | 9 +- modules/cudafeatures2d/src/cuda/fast.cu | 30 +-- modules/cudafeatures2d/src/fast.cpp | 238 ++++++++++-------- modules/cudafeatures2d/src/orb.cpp | 24 +- .../cudafeatures2d/test/test_features2d.cpp | 7 +- samples/gpu/performance/tests.cpp | 6 +- 7 files changed, 213 insertions(+), 216 deletions(-) diff --git a/modules/cudafeatures2d/include/opencv2/cudafeatures2d.hpp b/modules/cudafeatures2d/include/opencv2/cudafeatures2d.hpp index f61d2dfd0..f6f674d2a 100644 --- a/modules/cudafeatures2d/include/opencv2/cudafeatures2d.hpp +++ b/modules/cudafeatures2d/include/opencv2/cudafeatures2d.hpp @@ -48,6 +48,7 @@ #endif #include "opencv2/core/cuda.hpp" +#include "opencv2/features2d.hpp" #include "opencv2/cudafilters.hpp" /** @@ -228,91 +229,49 @@ private: std::vector trainDescCollection; }; -/** @brief Class used for corner detection using the FAST algorithm. : +// +// Feature2DAsync +// + +/** @brief Abstract base class for 2D image feature detectors and descriptor extractors. */ -class CV_EXPORTS FAST_CUDA +class CV_EXPORTS Feature2DAsync +{ +public: + virtual ~Feature2DAsync() {} + + virtual void detectAsync(InputArray image, OutputArray keypoints, + InputArray mask = noArray(), + Stream& stream = Stream::Null()) = 0; + + virtual void convert(InputArray gpu_keypoints, std::vector& keypoints) = 0; +}; + +// +// FastFeatureDetector +// + +/** @brief Wrapping class for feature detection using the FAST method. + */ +class CV_EXPORTS FastFeatureDetector : public cv::FastFeatureDetector, public Feature2DAsync { public: enum { LOCATION_ROW = 0, RESPONSE_ROW, - ROWS_COUNT + ROWS_COUNT, + + FEATURE_SIZE = 7 }; - //! all features have same size - static const int FEATURE_SIZE = 7; + static Ptr create(int threshold=10, + bool nonmaxSuppression=true, + int type=FastFeatureDetector::TYPE_9_16, + int max_npoints = 5000); - /** @brief Constructor. - - @param threshold Threshold on difference between intensity of the central pixel and pixels on a - circle around this pixel. - @param nonmaxSuppression If it is true, non-maximum suppression is applied to detected corners - (keypoints). 
- @param keypointsRatio Inner buffer size for keypoints store is determined as (keypointsRatio \* - image_width \* image_height). - */ - explicit FAST_CUDA(int threshold, bool nonmaxSuppression = true, double keypointsRatio = 0.05); - - /** @brief Finds the keypoints using FAST detector. - - @param image Image where keypoints (corners) are detected. Only 8-bit grayscale images are - supported. - @param mask Optional input mask that marks the regions where we should detect features. - @param keypoints The output vector of keypoints. Can be stored both in CPU and GPU memory. For GPU - memory: - - keypoints.ptr\(LOCATION_ROW)[i] will contain location of i'th point - - keypoints.ptr\(RESPONSE_ROW)[i] will contain response of i'th point (if non-maximum - suppression is applied) - */ - void operator ()(const GpuMat& image, const GpuMat& mask, GpuMat& keypoints); - /** @overload */ - void operator ()(const GpuMat& image, const GpuMat& mask, std::vector& keypoints); - - /** @brief Download keypoints from GPU to CPU memory. - */ - static void downloadKeypoints(const GpuMat& d_keypoints, std::vector& keypoints); - - /** @brief Converts keypoints from CUDA representation to vector of KeyPoint. - */ - static void convertKeypoints(const Mat& h_keypoints, std::vector& keypoints); - - /** @brief Releases inner buffer memory. - */ - void release(); - - bool nonmaxSuppression; - - int threshold; - - //! max keypoints = keypointsRatio * img.size().area() - double keypointsRatio; - - /** @brief Find keypoints and compute it's response if nonmaxSuppression is true. - - @param image Image where keypoints (corners) are detected. Only 8-bit grayscale images are - supported. - @param mask Optional input mask that marks the regions where we should detect features. - - The function returns count of detected keypoints. - */ - int calcKeyPointsLocation(const GpuMat& image, const GpuMat& mask); - - /** @brief Gets final array of keypoints. - - @param keypoints The output vector of keypoints. - - The function performs non-max suppression if needed and returns final count of keypoints. - */ - int getKeyPoints(GpuMat& keypoints); - -private: - GpuMat kpLoc_; - int count_; - - GpuMat score_; - - GpuMat d_keypoints_; + virtual void setMaxNumPoints(int max_npoints) = 0; + virtual int getMaxNumPoints() const = 0; }; /** @brief Class for extracting ORB features and descriptors from an image. : @@ -388,8 +347,8 @@ public: inline void setFastParams(int threshold, bool nonmaxSuppression = true) { - fastDetector_.threshold = threshold; - fastDetector_.nonmaxSuppression = nonmaxSuppression; + fastDetector_->setThreshold(threshold); + fastDetector_->setNonmaxSuppression(nonmaxSuppression); } /** @brief Releases inner buffer memory. 
@@ -433,7 +392,7 @@ private: std::vector keyPointsPyr_; std::vector keyPointsCount_; - FAST_CUDA fastDetector_; + Ptr fastDetector_; Ptr blurFilter; diff --git a/modules/cudafeatures2d/perf/perf_features2d.cpp b/modules/cudafeatures2d/perf/perf_features2d.cpp index 26eb434f4..da3cd77db 100644 --- a/modules/cudafeatures2d/perf/perf_features2d.cpp +++ b/modules/cudafeatures2d/perf/perf_features2d.cpp @@ -64,15 +64,18 @@ PERF_TEST_P(Image_Threshold_NonMaxSuppression, FAST, if (PERF_RUN_CUDA()) { - cv::cuda::FAST_CUDA d_fast(threshold, nonMaxSuppersion, 0.5); + cv::Ptr d_fast = + cv::cuda::FastFeatureDetector::create(threshold, nonMaxSuppersion, + cv::FastFeatureDetector::TYPE_9_16, + 0.5 * img.size().area()); const cv::cuda::GpuMat d_img(img); cv::cuda::GpuMat d_keypoints; - TEST_CYCLE() d_fast(d_img, cv::cuda::GpuMat(), d_keypoints); + TEST_CYCLE() d_fast->detectAsync(d_img, d_keypoints); std::vector gpu_keypoints; - d_fast.downloadKeypoints(d_keypoints, gpu_keypoints); + d_fast->convert(d_keypoints, gpu_keypoints); sortKeyPoints(gpu_keypoints); diff --git a/modules/cudafeatures2d/src/cuda/fast.cu b/modules/cudafeatures2d/src/cuda/fast.cu index 7aa888ac3..72235d4e5 100644 --- a/modules/cudafeatures2d/src/cuda/fast.cu +++ b/modules/cudafeatures2d/src/cuda/fast.cu @@ -279,7 +279,7 @@ namespace cv { namespace cuda { namespace device #endif } - int calcKeypoints_gpu(PtrStepSzb img, PtrStepSzb mask, short2* kpLoc, int maxKeypoints, PtrStepSzi score, int threshold) + int calcKeypoints_gpu(PtrStepSzb img, PtrStepSzb mask, short2* kpLoc, int maxKeypoints, PtrStepSzi score, int threshold, cudaStream_t stream) { void* counter_ptr; cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) ); @@ -290,29 +290,29 @@ namespace cv { namespace cuda { namespace device grid.x = divUp(img.cols - 6, block.x); grid.y = divUp(img.rows - 6, block.y); - cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) ); + cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(unsigned int), stream) ); if (score.data) { if (mask.data) - calcKeypoints<<>>(img, SingleMask(mask), kpLoc, maxKeypoints, score, threshold); + calcKeypoints<<>>(img, SingleMask(mask), kpLoc, maxKeypoints, score, threshold); else - calcKeypoints<<>>(img, WithOutMask(), kpLoc, maxKeypoints, score, threshold); + calcKeypoints<<>>(img, WithOutMask(), kpLoc, maxKeypoints, score, threshold); } else { if (mask.data) - calcKeypoints<<>>(img, SingleMask(mask), kpLoc, maxKeypoints, score, threshold); + calcKeypoints<<>>(img, SingleMask(mask), kpLoc, maxKeypoints, score, threshold); else - calcKeypoints<<>>(img, WithOutMask(), kpLoc, maxKeypoints, score, threshold); + calcKeypoints<<>>(img, WithOutMask(), kpLoc, maxKeypoints, score, threshold); } cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); - unsigned int count; - cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) ); + cudaSafeCall( cudaMemcpyAsync(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost, stream) ); + + cudaSafeCall( cudaStreamSynchronize(stream) ); return count; } @@ -356,7 +356,7 @@ namespace cv { namespace cuda { namespace device #endif } - int nonmaxSuppression_gpu(const short2* kpLoc, int count, PtrStepSzi score, short2* loc, float* response) + int nonmaxSuppression_gpu(const short2* kpLoc, int count, PtrStepSzi score, short2* loc, float* response, cudaStream_t stream) { void* counter_ptr; cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) ); @@ -366,15 +366,15 @@ namespace cv { namespace cuda 
{ namespace device dim3 grid; grid.x = divUp(count, block.x); - cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) ); + cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(unsigned int), stream) ); - nonmaxSuppression<<>>(kpLoc, count, score, loc, response); + nonmaxSuppression<<>>(kpLoc, count, score, loc, response); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); - unsigned int new_count; - cudaSafeCall( cudaMemcpy(&new_count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) ); + cudaSafeCall( cudaMemcpyAsync(&new_count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost, stream) ); + + cudaSafeCall( cudaStreamSynchronize(stream) ); return new_count; } diff --git a/modules/cudafeatures2d/src/fast.cpp b/modules/cudafeatures2d/src/fast.cpp index aa77aa87b..cb22ea54d 100644 --- a/modules/cudafeatures2d/src/fast.cpp +++ b/modules/cudafeatures2d/src/fast.cpp @@ -47,124 +47,162 @@ using namespace cv::cuda; #if !defined (HAVE_CUDA) || defined (CUDA_DISABLER) -cv::cuda::FAST_CUDA::FAST_CUDA(int, bool, double) { throw_no_cuda(); } -void cv::cuda::FAST_CUDA::operator ()(const GpuMat&, const GpuMat&, GpuMat&) { throw_no_cuda(); } -void cv::cuda::FAST_CUDA::operator ()(const GpuMat&, const GpuMat&, std::vector&) { throw_no_cuda(); } -void cv::cuda::FAST_CUDA::downloadKeypoints(const GpuMat&, std::vector&) { throw_no_cuda(); } -void cv::cuda::FAST_CUDA::convertKeypoints(const Mat&, std::vector&) { throw_no_cuda(); } -void cv::cuda::FAST_CUDA::release() { throw_no_cuda(); } -int cv::cuda::FAST_CUDA::calcKeyPointsLocation(const GpuMat&, const GpuMat&) { throw_no_cuda(); return 0; } -int cv::cuda::FAST_CUDA::getKeyPoints(GpuMat&) { throw_no_cuda(); return 0; } +Ptr cv::cuda::FastFeatureDetector::create(int, bool, int, int) { throw_no_cuda(); return Ptr(); } #else /* !defined (HAVE_CUDA) */ -cv::cuda::FAST_CUDA::FAST_CUDA(int _threshold, bool _nonmaxSuppression, double _keypointsRatio) : - nonmaxSuppression(_nonmaxSuppression), threshold(_threshold), keypointsRatio(_keypointsRatio), count_(0) -{ -} - -void cv::cuda::FAST_CUDA::operator ()(const GpuMat& image, const GpuMat& mask, std::vector& keypoints) -{ - if (image.empty()) - return; - - (*this)(image, mask, d_keypoints_); - downloadKeypoints(d_keypoints_, keypoints); -} - -void cv::cuda::FAST_CUDA::downloadKeypoints(const GpuMat& d_keypoints, std::vector& keypoints) -{ - if (d_keypoints.empty()) - return; - - Mat h_keypoints(d_keypoints); - convertKeypoints(h_keypoints, keypoints); -} - -void cv::cuda::FAST_CUDA::convertKeypoints(const Mat& h_keypoints, std::vector& keypoints) -{ - if (h_keypoints.empty()) - return; - - CV_Assert(h_keypoints.rows == ROWS_COUNT && h_keypoints.elemSize() == 4); - - int npoints = h_keypoints.cols; - - keypoints.resize(npoints); - - const short2* loc_row = h_keypoints.ptr(LOCATION_ROW); - const float* response_row = h_keypoints.ptr(RESPONSE_ROW); - - for (int i = 0; i < npoints; ++i) - { - KeyPoint kp(loc_row[i].x, loc_row[i].y, static_cast(FEATURE_SIZE), -1, response_row[i]); - keypoints[i] = kp; - } -} - -void cv::cuda::FAST_CUDA::operator ()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints) -{ - calcKeyPointsLocation(img, mask); - keypoints.cols = getKeyPoints(keypoints); -} - namespace cv { namespace cuda { namespace device { namespace fast { - int calcKeypoints_gpu(PtrStepSzb img, PtrStepSzb mask, short2* kpLoc, int maxKeypoints, PtrStepSzi score, int threshold); - int nonmaxSuppression_gpu(const short2* kpLoc, int count, PtrStepSzi score, short2* 
loc, float* response); + int calcKeypoints_gpu(PtrStepSzb img, PtrStepSzb mask, short2* kpLoc, int maxKeypoints, PtrStepSzi score, int threshold, cudaStream_t stream); + int nonmaxSuppression_gpu(const short2* kpLoc, int count, PtrStepSzi score, short2* loc, float* response, cudaStream_t stream); } }}} -int cv::cuda::FAST_CUDA::calcKeyPointsLocation(const GpuMat& img, const GpuMat& mask) +namespace { - using namespace cv::cuda::device::fast; - - CV_Assert(img.type() == CV_8UC1); - CV_Assert(mask.empty() || (mask.type() == CV_8UC1 && mask.size() == img.size())); - - int maxKeypoints = static_cast(keypointsRatio * img.size().area()); - - ensureSizeIsEnough(1, maxKeypoints, CV_16SC2, kpLoc_); - - if (nonmaxSuppression) + class FAST_Impl : public cv::cuda::FastFeatureDetector + { + public: + FAST_Impl(int threshold, bool nonmaxSuppression, int max_npoints); + + virtual void detect(InputArray _image, std::vector& keypoints, InputArray _mask); + virtual void detectAsync(InputArray _image, OutputArray _keypoints, InputArray _mask, Stream& stream); + + virtual void convert(InputArray _gpu_keypoints, std::vector& keypoints); + + virtual void setThreshold(int threshold) { threshold_ = threshold; } + virtual int getThreshold() const { return threshold_; } + + virtual void setNonmaxSuppression(bool f) { nonmaxSuppression_ = f; } + virtual bool getNonmaxSuppression() const { return nonmaxSuppression_; } + + virtual void setMaxNumPoints(int max_npoints) { max_npoints_ = max_npoints; } + virtual int getMaxNumPoints() const { return max_npoints_; } + + virtual void setType(int type) { CV_Assert( type == TYPE_9_16 ); } + virtual int getType() const { return TYPE_9_16; } + + private: + int threshold_; + bool nonmaxSuppression_; + int max_npoints_; + }; + + FAST_Impl::FAST_Impl(int threshold, bool nonmaxSuppression, int max_npoints) : + threshold_(threshold), nonmaxSuppression_(nonmaxSuppression), max_npoints_(max_npoints) { - ensureSizeIsEnough(img.size(), CV_32SC1, score_); - score_.setTo(Scalar::all(0)); } - count_ = calcKeypoints_gpu(img, mask, kpLoc_.ptr(), maxKeypoints, nonmaxSuppression ? 
score_ : PtrStepSzi(), threshold); - count_ = std::min(count_, maxKeypoints); + void FAST_Impl::detect(InputArray _image, std::vector& keypoints, InputArray _mask) + { + if (_image.empty()) + { + keypoints.clear(); + return; + } - return count_; + BufferPool pool(Stream::Null()); + GpuMat d_keypoints = pool.getBuffer(ROWS_COUNT, max_npoints_, CV_16SC2); + + detectAsync(_image, d_keypoints, _mask, Stream::Null()); + convert(d_keypoints, keypoints); + } + + void FAST_Impl::detectAsync(InputArray _image, OutputArray _keypoints, InputArray _mask, Stream& stream) + { + using namespace cv::cuda::device::fast; + + const GpuMat img = _image.getGpuMat(); + const GpuMat mask = _mask.getGpuMat(); + + CV_Assert( img.type() == CV_8UC1 ); + CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == img.size()) ); + + BufferPool pool(stream); + + GpuMat kpLoc = pool.getBuffer(1, max_npoints_, CV_16SC2); + + GpuMat score; + if (nonmaxSuppression_) + { + score = pool.getBuffer(img.size(), CV_32SC1); + score.setTo(Scalar::all(0), stream); + } + + int count = calcKeypoints_gpu(img, mask, kpLoc.ptr(), max_npoints_, score, threshold_, StreamAccessor::getStream(stream)); + count = std::min(count, max_npoints_); + + if (count == 0) + { + _keypoints.release(); + return; + } + + ensureSizeIsEnough(ROWS_COUNT, count, CV_32FC1, _keypoints); + GpuMat& keypoints = _keypoints.getGpuMatRef(); + + if (nonmaxSuppression_) + { + count = nonmaxSuppression_gpu(kpLoc.ptr(), count, score, keypoints.ptr(LOCATION_ROW), keypoints.ptr(RESPONSE_ROW), StreamAccessor::getStream(stream)); + if (count == 0) + { + keypoints.release(); + } + else + { + keypoints.cols = count; + } + } + else + { + GpuMat locRow(1, count, kpLoc.type(), keypoints.ptr(0)); + kpLoc.colRange(0, count).copyTo(locRow, stream); + keypoints.row(1).setTo(Scalar::all(0), stream); + } + } + + void FAST_Impl::convert(InputArray _gpu_keypoints, std::vector& keypoints) + { + if (_gpu_keypoints.empty()) + { + keypoints.clear(); + return; + } + + Mat h_keypoints; + if (_gpu_keypoints.kind() == _InputArray::CUDA_GPU_MAT) + { + _gpu_keypoints.getGpuMat().download(h_keypoints); + } + else + { + h_keypoints = _gpu_keypoints.getMat(); + } + + CV_Assert( h_keypoints.rows == ROWS_COUNT ); + CV_Assert( h_keypoints.elemSize() == 4 ); + + const int npoints = h_keypoints.cols; + + keypoints.resize(npoints); + + const short2* loc_row = h_keypoints.ptr(LOCATION_ROW); + const float* response_row = h_keypoints.ptr(RESPONSE_ROW); + + for (int i = 0; i < npoints; ++i) + { + KeyPoint kp(loc_row[i].x, loc_row[i].y, static_cast(FEATURE_SIZE), -1, response_row[i]); + keypoints[i] = kp; + } + } } -int cv::cuda::FAST_CUDA::getKeyPoints(GpuMat& keypoints) +Ptr cv::cuda::FastFeatureDetector::create(int threshold, bool nonmaxSuppression, int type, int max_npoints) { - using namespace cv::cuda::device::fast; - - if (count_ == 0) - return 0; - - ensureSizeIsEnough(ROWS_COUNT, count_, CV_32FC1, keypoints); - - if (nonmaxSuppression) - return nonmaxSuppression_gpu(kpLoc_.ptr(), count_, score_, keypoints.ptr(LOCATION_ROW), keypoints.ptr(RESPONSE_ROW)); - - GpuMat locRow(1, count_, kpLoc_.type(), keypoints.ptr(0)); - kpLoc_.colRange(0, count_).copyTo(locRow); - keypoints.row(1).setTo(Scalar::all(0)); - - return count_; -} - -void cv::cuda::FAST_CUDA::release() -{ - kpLoc_.release(); - score_.release(); - - d_keypoints_.release(); + CV_Assert( type == TYPE_9_16 ); + return makePtr(threshold, nonmaxSuppression, max_npoints); } #endif /* !defined (HAVE_CUDA) */ diff --git 
a/modules/cudafeatures2d/src/orb.cpp b/modules/cudafeatures2d/src/orb.cpp index 8d8afe8f0..c04649b1f 100644 --- a/modules/cudafeatures2d/src/orb.cpp +++ b/modules/cudafeatures2d/src/orb.cpp @@ -398,7 +398,7 @@ namespace cv::cuda::ORB_CUDA::ORB_CUDA(int nFeatures, float scaleFactor, int nLevels, int edgeThreshold, int firstLevel, int WTA_K, int scoreType, int patchSize) : nFeatures_(nFeatures), scaleFactor_(scaleFactor), nLevels_(nLevels), edgeThreshold_(edgeThreshold), firstLevel_(firstLevel), WTA_K_(WTA_K), scoreType_(scoreType), patchSize_(patchSize), - fastDetector_(DEFAULT_FAST_THRESHOLD) + fastDetector_(cuda::FastFeatureDetector::create(DEFAULT_FAST_THRESHOLD)) { CV_Assert(patchSize_ >= 2); @@ -554,7 +554,7 @@ namespace return; } - count = cull_gpu(keypoints.ptr(FAST_CUDA::LOCATION_ROW), keypoints.ptr(FAST_CUDA::RESPONSE_ROW), count, n_points); + count = cull_gpu(keypoints.ptr(cuda::FastFeatureDetector::LOCATION_ROW), keypoints.ptr(cuda::FastFeatureDetector::RESPONSE_ROW), count, n_points); } } } @@ -570,20 +570,20 @@ void cv::cuda::ORB_CUDA::computeKeyPointsPyramid() for (int level = 0; level < nLevels_; ++level) { - keyPointsCount_[level] = fastDetector_.calcKeyPointsLocation(imagePyr_[level], maskPyr_[level]); + fastDetector_->setMaxNumPoints(0.05 * imagePyr_[level].size().area()); + + GpuMat fastKpRange; + fastDetector_->detectAsync(imagePyr_[level], fastKpRange, maskPyr_[level], Stream::Null()); + + keyPointsCount_[level] = fastKpRange.cols; if (keyPointsCount_[level] == 0) continue; - ensureSizeIsEnough(3, keyPointsCount_[level], CV_32FC1, keyPointsPyr_[level]); + ensureSizeIsEnough(3, keyPointsCount_[level], fastKpRange.type(), keyPointsPyr_[level]); + fastKpRange.copyTo(keyPointsPyr_[level].rowRange(0, 2)); - GpuMat fastKpRange = keyPointsPyr_[level].rowRange(0, 2); - keyPointsCount_[level] = fastDetector_.getKeyPoints(fastKpRange); - - if (keyPointsCount_[level] == 0) - continue; - - int n_features = static_cast(n_features_per_level_[level]); + const int n_features = static_cast(n_features_per_level_[level]); if (scoreType_ == ORB::HARRIS_SCORE) { @@ -767,8 +767,6 @@ void cv::cuda::ORB_CUDA::release() keyPointsPyr_.clear(); - fastDetector_.release(); - d_keypoints_.release(); } diff --git a/modules/cudafeatures2d/test/test_features2d.cpp b/modules/cudafeatures2d/test/test_features2d.cpp index 468024a5d..9a8d76ce3 100644 --- a/modules/cudafeatures2d/test/test_features2d.cpp +++ b/modules/cudafeatures2d/test/test_features2d.cpp @@ -76,15 +76,14 @@ CUDA_TEST_P(FAST, Accuracy) cv::Mat image = readImage("features2d/aloe.png", cv::IMREAD_GRAYSCALE); ASSERT_FALSE(image.empty()); - cv::cuda::FAST_CUDA fast(threshold); - fast.nonmaxSuppression = nonmaxSuppression; + cv::Ptr fast = cv::cuda::FastFeatureDetector::create(threshold, nonmaxSuppression); if (!supportFeature(devInfo, cv::cuda::GLOBAL_ATOMICS)) { try { std::vector keypoints; - fast(loadMat(image), cv::cuda::GpuMat(), keypoints); + fast->detect(loadMat(image), keypoints); } catch (const cv::Exception& e) { @@ -94,7 +93,7 @@ CUDA_TEST_P(FAST, Accuracy) else { std::vector keypoints; - fast(loadMat(image), cv::cuda::GpuMat(), keypoints); + fast->detect(loadMat(image), keypoints); std::vector keypoints_gold; cv::FAST(image, keypoints_gold, threshold, nonmaxSuppression); diff --git a/samples/gpu/performance/tests.cpp b/samples/gpu/performance/tests.cpp index 2e7faa334..8869a1b66 100644 --- a/samples/gpu/performance/tests.cpp +++ b/samples/gpu/performance/tests.cpp @@ -322,14 +322,14 @@ TEST(FAST) FAST(src, keypoints, 20); CPU_OFF; 
- cuda::FAST_CUDA d_FAST(20); + cv::Ptr d_FAST = cv::cuda::FastFeatureDetector::create(20); cuda::GpuMat d_src(src); cuda::GpuMat d_keypoints; - d_FAST(d_src, cuda::GpuMat(), d_keypoints); + d_FAST->detectAsync(d_src, d_keypoints); CUDA_ON; - d_FAST(d_src, cuda::GpuMat(), d_keypoints); + d_FAST->detectAsync(d_src, d_keypoints); CUDA_OFF; } From 9f77ffb03f443331080bfb7d59c4f10d0dd08815 Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Mon, 12 Jan 2015 18:26:41 +0300 Subject: [PATCH 3/8] extend Feature2DAsync interface --- .../include/opencv2/cudafeatures2d.hpp | 26 ++++-- .../cudafeatures2d/src/feature2d_async.cpp | 85 +++++++++++++++++++ 2 files changed, 103 insertions(+), 8 deletions(-) create mode 100644 modules/cudafeatures2d/src/feature2d_async.cpp diff --git a/modules/cudafeatures2d/include/opencv2/cudafeatures2d.hpp b/modules/cudafeatures2d/include/opencv2/cudafeatures2d.hpp index f6f674d2a..a193eb6f8 100644 --- a/modules/cudafeatures2d/include/opencv2/cudafeatures2d.hpp +++ b/modules/cudafeatures2d/include/opencv2/cudafeatures2d.hpp @@ -233,26 +233,36 @@ private: // Feature2DAsync // -/** @brief Abstract base class for 2D image feature detectors and descriptor extractors. - */ class CV_EXPORTS Feature2DAsync { public: - virtual ~Feature2DAsync() {} + virtual ~Feature2DAsync(); - virtual void detectAsync(InputArray image, OutputArray keypoints, + virtual void detectAsync(InputArray image, + OutputArray keypoints, InputArray mask = noArray(), - Stream& stream = Stream::Null()) = 0; + Stream& stream = Stream::Null()); - virtual void convert(InputArray gpu_keypoints, std::vector& keypoints) = 0; + virtual void computeAsync(InputArray image, + OutputArray keypoints, + OutputArray descriptors, + Stream& stream = Stream::Null()); + + virtual void detectAndComputeAsync(InputArray image, + InputArray mask, + OutputArray keypoints, + OutputArray descriptors, + bool useProvidedKeypoints=false, + Stream& stream = Stream::Null()); + + virtual void convert(InputArray gpu_keypoints, + std::vector& keypoints) = 0; }; // // FastFeatureDetector // -/** @brief Wrapping class for feature detection using the FAST method. - */ class CV_EXPORTS FastFeatureDetector : public cv::FastFeatureDetector, public Feature2DAsync { public: diff --git a/modules/cudafeatures2d/src/feature2d_async.cpp b/modules/cudafeatures2d/src/feature2d_async.cpp new file mode 100644 index 000000000..202a72537 --- /dev/null +++ b/modules/cudafeatures2d/src/feature2d_async.cpp @@ -0,0 +1,85 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "precomp.hpp" + +cv::cuda::Feature2DAsync::~Feature2DAsync() +{ +} + +void cv::cuda::Feature2DAsync::detectAsync(InputArray image, + OutputArray keypoints, + InputArray mask, + Stream& stream) +{ + if (image.empty()) + { + keypoints.clear(); + return; + } + + detectAndComputeAsync(image, mask, keypoints, noArray(), false, stream); +} + +void cv::cuda::Feature2DAsync::computeAsync(InputArray image, + OutputArray keypoints, + OutputArray descriptors, + Stream& stream) +{ + if (image.empty()) + { + descriptors.release(); + return; + } + + detectAndComputeAsync(image, noArray(), keypoints, descriptors, true, stream); +} + +void cv::cuda::Feature2DAsync::detectAndComputeAsync(InputArray /*image*/, + InputArray /*mask*/, + OutputArray /*keypoints*/, + OutputArray /*descriptors*/, + bool /*useProvidedKeypoints*/, + Stream& /*stream*/) +{ + CV_Error(Error::StsNotImplemented, ""); +} From 554ddd2ec49f4df46c7b23ac257c21fda0a449d0 Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Tue, 13 Jan 2015 10:25:34 +0300 Subject: [PATCH 4/8] fix compilation without CUDA --- modules/cudafeatures2d/src/fast.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/cudafeatures2d/src/fast.cpp b/modules/cudafeatures2d/src/fast.cpp index cb22ea54d..2095ef7cf 100644 --- a/modules/cudafeatures2d/src/fast.cpp +++ b/modules/cudafeatures2d/src/fast.cpp @@ -47,7 +47,7 @@ using namespace cv::cuda; #if !defined (HAVE_CUDA) || defined (CUDA_DISABLER) -Ptr cv::cuda::FastFeatureDetector::create(int, bool, int, int) { throw_no_cuda(); return Ptr(); } +Ptr cv::cuda::FastFeatureDetector::create(int, bool, int, int) { throw_no_cuda(); return Ptr(); } #else /* !defined (HAVE_CUDA) */ From f960a5707d99d9a55da8f2b12e96bcad65fd9b90 Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Tue, 13 Jan 2015 10:40:58 +0300 Subject: [PATCH 5/8] refactor CUDA ORB feature detector/extractor algorithm: use new abstract interface and hidden implementation --- .../include/opencv2/cudafeatures2d.hpp | 125 +-- .../cudafeatures2d/perf/perf_features2d.cpp | 6 +- modules/cudafeatures2d/src/orb.cpp | 741 ++++++++++-------- .../cudafeatures2d/test/test_features2d.cpp | 11 +- samples/gpu/performance/tests.cpp | 6 +- 5 files changed, 447 insertions(+), 442 deletions(-) diff --git 
a/modules/cudafeatures2d/include/opencv2/cudafeatures2d.hpp b/modules/cudafeatures2d/include/opencv2/cudafeatures2d.hpp index a193eb6f8..4a78d50e6 100644 --- a/modules/cudafeatures2d/include/opencv2/cudafeatures2d.hpp +++ b/modules/cudafeatures2d/include/opencv2/cudafeatures2d.hpp @@ -284,9 +284,11 @@ public: virtual int getMaxNumPoints() const = 0; }; -/** @brief Class for extracting ORB features and descriptors from an image. : - */ -class CV_EXPORTS ORB_CUDA +// +// ORB +// + +class CV_EXPORTS ORB : public cv::ORB, public Feature2DAsync { public: enum @@ -300,113 +302,20 @@ public: ROWS_COUNT }; - enum - { - DEFAULT_FAST_THRESHOLD = 20 - }; - - /** @brief Constructor. - - @param nFeatures The number of desired features. - @param scaleFactor Coefficient by which we divide the dimensions from one scale pyramid level to - the next. - @param nLevels The number of levels in the scale pyramid. - @param edgeThreshold How far from the boundary the points should be. - @param firstLevel The level at which the image is given. If 1, that means we will also look at the - image scaleFactor times bigger. - @param WTA_K - @param scoreType - @param patchSize - */ - explicit ORB_CUDA(int nFeatures = 500, float scaleFactor = 1.2f, int nLevels = 8, int edgeThreshold = 31, - int firstLevel = 0, int WTA_K = 2, int scoreType = 0, int patchSize = 31); - - /** @overload */ - void operator()(const GpuMat& image, const GpuMat& mask, std::vector& keypoints); - /** @overload */ - void operator()(const GpuMat& image, const GpuMat& mask, GpuMat& keypoints); - - /** @brief Detects keypoints and computes descriptors for them. - - @param image Input 8-bit grayscale image. - @param mask Optional input mask that marks the regions where we should detect features. - @param keypoints The input/output vector of keypoints. Can be stored both in CPU and GPU memory. - For GPU memory: - - keypoints.ptr\(X_ROW)[i] contains x coordinate of the i'th feature. - - keypoints.ptr\(Y_ROW)[i] contains y coordinate of the i'th feature. - - keypoints.ptr\(RESPONSE_ROW)[i] contains the response of the i'th feature. - - keypoints.ptr\(ANGLE_ROW)[i] contains orientation of the i'th feature. - - keypoints.ptr\(OCTAVE_ROW)[i] contains the octave of the i'th feature. - - keypoints.ptr\(SIZE_ROW)[i] contains the size of the i'th feature. - @param descriptors Computed descriptors. if blurForDescriptor is true, image will be blurred - before descriptors calculation. - */ - void operator()(const GpuMat& image, const GpuMat& mask, std::vector& keypoints, GpuMat& descriptors); - /** @overload */ - void operator()(const GpuMat& image, const GpuMat& mask, GpuMat& keypoints, GpuMat& descriptors); - - /** @brief Download keypoints from GPU to CPU memory. - */ - static void downloadKeyPoints(const GpuMat& d_keypoints, std::vector& keypoints); - /** @brief Converts keypoints from CUDA representation to vector of KeyPoint. - */ - static void convertKeyPoints(const Mat& d_keypoints, std::vector& keypoints); - - //! returns the descriptor size in bytes - inline int descriptorSize() const { return kBytes; } - - inline void setFastParams(int threshold, bool nonmaxSuppression = true) - { - fastDetector_->setThreshold(threshold); - fastDetector_->setNonmaxSuppression(nonmaxSuppression); - } - - /** @brief Releases inner buffer memory. 
- */ - void release(); + static Ptr create(int nfeatures=500, + float scaleFactor=1.2f, + int nlevels=8, + int edgeThreshold=31, + int firstLevel=0, + int WTA_K=2, + int scoreType=ORB::HARRIS_SCORE, + int patchSize=31, + int fastThreshold=20, + bool blurForDescriptor=false); //! if true, image will be blurred before descriptors calculation - bool blurForDescriptor; - -private: - enum { kBytes = 32 }; - - void buildScalePyramids(const GpuMat& image, const GpuMat& mask); - - void computeKeyPointsPyramid(); - - void computeDescriptors(GpuMat& descriptors); - - void mergeKeyPoints(GpuMat& keypoints); - - int nFeatures_; - float scaleFactor_; - int nLevels_; - int edgeThreshold_; - int firstLevel_; - int WTA_K_; - int scoreType_; - int patchSize_; - - //! The number of desired features per scale - std::vector n_features_per_level_; - - //! Points to compute BRIEF descriptors from - GpuMat pattern_; - - std::vector imagePyr_; - std::vector maskPyr_; - - GpuMat buf_; - - std::vector keyPointsPyr_; - std::vector keyPointsCount_; - - Ptr fastDetector_; - - Ptr blurFilter; - - GpuMat d_keypoints_; + virtual void setBlurForDescriptor(bool blurForDescriptor) = 0; + virtual bool getBlurForDescriptor() const = 0; }; //! @} diff --git a/modules/cudafeatures2d/perf/perf_features2d.cpp b/modules/cudafeatures2d/perf/perf_features2d.cpp index da3cd77db..0dcb0434f 100644 --- a/modules/cudafeatures2d/perf/perf_features2d.cpp +++ b/modules/cudafeatures2d/perf/perf_features2d.cpp @@ -109,15 +109,15 @@ PERF_TEST_P(Image_NFeatures, ORB, if (PERF_RUN_CUDA()) { - cv::cuda::ORB_CUDA d_orb(nFeatures); + cv::Ptr d_orb = cv::cuda::ORB::create(nFeatures); const cv::cuda::GpuMat d_img(img); cv::cuda::GpuMat d_keypoints, d_descriptors; - TEST_CYCLE() d_orb(d_img, cv::cuda::GpuMat(), d_keypoints, d_descriptors); + TEST_CYCLE() d_orb->detectAndComputeAsync(d_img, cv::noArray(), d_keypoints, d_descriptors); std::vector gpu_keypoints; - d_orb.downloadKeyPoints(d_keypoints, gpu_keypoints); + d_orb->convert(d_keypoints, gpu_keypoints); cv::Mat gpu_descriptors(d_descriptors); diff --git a/modules/cudafeatures2d/src/orb.cpp b/modules/cudafeatures2d/src/orb.cpp index c04649b1f..6bfdd5ac4 100644 --- a/modules/cudafeatures2d/src/orb.cpp +++ b/modules/cudafeatures2d/src/orb.cpp @@ -47,18 +47,7 @@ using namespace cv::cuda; #if !defined (HAVE_CUDA) || defined (CUDA_DISABLER) -cv::cuda::ORB_CUDA::ORB_CUDA(int, float, int, int, int, int, int, int) : fastDetector_(20) { throw_no_cuda(); } -void cv::cuda::ORB_CUDA::operator()(const GpuMat&, const GpuMat&, std::vector&) { throw_no_cuda(); } -void cv::cuda::ORB_CUDA::operator()(const GpuMat&, const GpuMat&, GpuMat&) { throw_no_cuda(); } -void cv::cuda::ORB_CUDA::operator()(const GpuMat&, const GpuMat&, std::vector&, GpuMat&) { throw_no_cuda(); } -void cv::cuda::ORB_CUDA::operator()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&) { throw_no_cuda(); } -void cv::cuda::ORB_CUDA::downloadKeyPoints(const GpuMat&, std::vector&) { throw_no_cuda(); } -void cv::cuda::ORB_CUDA::convertKeyPoints(const Mat&, std::vector&) { throw_no_cuda(); } -void cv::cuda::ORB_CUDA::release() { throw_no_cuda(); } -void cv::cuda::ORB_CUDA::buildScalePyramids(const GpuMat&, const GpuMat&) { throw_no_cuda(); } -void cv::cuda::ORB_CUDA::computeKeyPointsPyramid() { throw_no_cuda(); } -void cv::cuda::ORB_CUDA::computeDescriptors(GpuMat&) { throw_no_cuda(); } -void cv::cuda::ORB_CUDA::mergeKeyPoints(GpuMat&) { throw_no_cuda(); } +Ptr cv::cuda::ORB::create(int, float, int, int, int, int, int, int, int, bool) { throw_no_cuda(); 
return Ptr(); } #else /* !defined (HAVE_CUDA) */ @@ -346,7 +335,100 @@ namespace -1,-6, 0,-11/*mean (0.127148), correlation (0.547401)*/ }; - void initializeOrbPattern(const Point* pattern0, Mat& pattern, int ntuples, int tupleSize, int poolSize) + class ORB_Impl : public cv::cuda::ORB + { + public: + ORB_Impl(int nfeatures, + float scaleFactor, + int nlevels, + int edgeThreshold, + int firstLevel, + int WTA_K, + int scoreType, + int patchSize, + int fastThreshold, + bool blurForDescriptor); + + virtual void detectAndCompute(InputArray _image, InputArray _mask, std::vector& keypoints, OutputArray _descriptors, bool useProvidedKeypoints); + virtual void detectAndComputeAsync(InputArray _image, InputArray _mask, OutputArray _keypoints, OutputArray _descriptors, bool useProvidedKeypoints, Stream& stream); + + virtual void convert(InputArray _gpu_keypoints, std::vector& keypoints); + + virtual int descriptorSize() const { return kBytes; } + virtual int descriptorType() const { return CV_8U; } + virtual int defaultNorm() const { return NORM_HAMMING; } + + virtual void setMaxFeatures(int maxFeatures) { nFeatures_ = maxFeatures; } + virtual int getMaxFeatures() const { return nFeatures_; } + + virtual void setScaleFactor(double scaleFactor) { scaleFactor_ = scaleFactor; } + virtual double getScaleFactor() const { return scaleFactor_; } + + virtual void setNLevels(int nlevels) { nLevels_ = nlevels; } + virtual int getNLevels() const { return nLevels_; } + + virtual void setEdgeThreshold(int edgeThreshold) { edgeThreshold_ = edgeThreshold; } + virtual int getEdgeThreshold() const { return edgeThreshold_; } + + virtual void setFirstLevel(int firstLevel) { firstLevel_ = firstLevel; } + virtual int getFirstLevel() const { return firstLevel_; } + + virtual void setWTA_K(int wta_k) { WTA_K_ = wta_k; } + virtual int getWTA_K() const { return WTA_K_; } + + virtual void setScoreType(int scoreType) { scoreType_ = scoreType; } + virtual int getScoreType() const { return scoreType_; } + + virtual void setPatchSize(int patchSize) { patchSize_ = patchSize; } + virtual int getPatchSize() const { return patchSize_; } + + virtual void setFastThreshold(int fastThreshold) { fastThreshold_ = fastThreshold; } + virtual int getFastThreshold() const { return fastThreshold_; } + + virtual void setBlurForDescriptor(bool blurForDescriptor) { blurForDescriptor_ = blurForDescriptor; } + virtual bool getBlurForDescriptor() const { return blurForDescriptor_; } + + private: + int nFeatures_; + float scaleFactor_; + int nLevels_; + int edgeThreshold_; + int firstLevel_; + int WTA_K_; + int scoreType_; + int patchSize_; + int fastThreshold_; + bool blurForDescriptor_; + + private: + void buildScalePyramids(InputArray _image, InputArray _mask); + void computeKeyPointsPyramid(); + void computeDescriptors(OutputArray _descriptors); + void mergeKeyPoints(OutputArray _keypoints); + + private: + Ptr fastDetector_; + + //! The number of desired features per scale + std::vector n_features_per_level_; + + //! 
Points to compute BRIEF descriptors from + GpuMat pattern_; + + std::vector imagePyr_; + std::vector maskPyr_; + + GpuMat buf_; + + std::vector keyPointsPyr_; + std::vector keyPointsCount_; + + Ptr blurFilter_; + + GpuMat d_keypoints_; + }; + + static void initializeOrbPattern(const Point* pattern0, Mat& pattern, int ntuples, int tupleSize, int poolSize) { RNG rng(0x12345678); @@ -381,7 +463,7 @@ namespace } } - void makeRandomPattern(int patchSize, Point* pattern, int npoints) + static void makeRandomPattern(int patchSize, Point* pattern, int npoints) { // we always start with a fixed seed, // to make patterns the same on each run @@ -393,155 +475,189 @@ namespace pattern[i].y = rng.uniform(-patchSize / 2, patchSize / 2 + 1); } } -} -cv::cuda::ORB_CUDA::ORB_CUDA(int nFeatures, float scaleFactor, int nLevels, int edgeThreshold, int firstLevel, int WTA_K, int scoreType, int patchSize) : - nFeatures_(nFeatures), scaleFactor_(scaleFactor), nLevels_(nLevels), edgeThreshold_(edgeThreshold), firstLevel_(firstLevel), WTA_K_(WTA_K), - scoreType_(scoreType), patchSize_(patchSize), - fastDetector_(cuda::FastFeatureDetector::create(DEFAULT_FAST_THRESHOLD)) -{ - CV_Assert(patchSize_ >= 2); - - // fill the extractors and descriptors for the corresponding scales - float factor = 1.0f / scaleFactor_; - float n_desired_features_per_scale = nFeatures_ * (1.0f - factor) / (1.0f - std::pow(factor, nLevels_)); - - n_features_per_level_.resize(nLevels_); - size_t sum_n_features = 0; - for (int level = 0; level < nLevels_ - 1; ++level) + ORB_Impl::ORB_Impl(int nFeatures, + float scaleFactor, + int nLevels, + int edgeThreshold, + int firstLevel, + int WTA_K, + int scoreType, + int patchSize, + int fastThreshold, + bool blurForDescriptor) : + nFeatures_(nFeatures), + scaleFactor_(scaleFactor), + nLevels_(nLevels), + edgeThreshold_(edgeThreshold), + firstLevel_(firstLevel), + WTA_K_(WTA_K), + scoreType_(scoreType), + patchSize_(patchSize), + fastThreshold_(fastThreshold), + blurForDescriptor_(blurForDescriptor) { - n_features_per_level_[level] = cvRound(n_desired_features_per_scale); - sum_n_features += n_features_per_level_[level]; - n_desired_features_per_scale *= factor; - } - n_features_per_level_[nLevels_ - 1] = nFeatures - sum_n_features; + CV_Assert( patchSize_ >= 2 ); + CV_Assert( WTA_K_ == 2 || WTA_K_ == 3 || WTA_K_ == 4 ); - // pre-compute the end of a row in a circular patch - int half_patch_size = patchSize_ / 2; - std::vector u_max(half_patch_size + 2); - for (int v = 0; v <= half_patch_size * std::sqrt(2.f) / 2 + 1; ++v) - u_max[v] = cvRound(std::sqrt(static_cast(half_patch_size * half_patch_size - v * v))); + fastDetector_ = cuda::FastFeatureDetector::create(fastThreshold_); - // Make sure we are symmetric - for (int v = half_patch_size, v_0 = 0; v >= half_patch_size * std::sqrt(2.f) / 2; --v) - { - while (u_max[v_0] == u_max[v_0 + 1]) - ++v_0; - u_max[v] = v_0; - ++v_0; - } - CV_Assert(u_max.size() < 32); - cv::cuda::device::orb::loadUMax(&u_max[0], static_cast(u_max.size())); + // fill the extractors and descriptors for the corresponding scales + float factor = 1.0f / scaleFactor_; + float n_desired_features_per_scale = nFeatures_ * (1.0f - factor) / (1.0f - std::pow(factor, nLevels_)); - // Calc pattern - const int npoints = 512; - Point pattern_buf[npoints]; - const Point* pattern0 = (const Point*)bit_pattern_31_; - if (patchSize_ != 31) - { - pattern0 = pattern_buf; - makeRandomPattern(patchSize_, pattern_buf, npoints); - } - - CV_Assert(WTA_K_ == 2 || WTA_K_ == 3 || WTA_K_ == 4); - - Mat 
h_pattern; - - if (WTA_K_ == 2) - { - h_pattern.create(2, npoints, CV_32SC1); - - int* pattern_x_ptr = h_pattern.ptr(0); - int* pattern_y_ptr = h_pattern.ptr(1); - - for (int i = 0; i < npoints; ++i) + n_features_per_level_.resize(nLevels_); + size_t sum_n_features = 0; + for (int level = 0; level < nLevels_ - 1; ++level) { - pattern_x_ptr[i] = pattern0[i].x; - pattern_y_ptr[i] = pattern0[i].y; + n_features_per_level_[level] = cvRound(n_desired_features_per_scale); + sum_n_features += n_features_per_level_[level]; + n_desired_features_per_scale *= factor; } - } - else - { - int ntuples = descriptorSize() * 4; - initializeOrbPattern(pattern0, h_pattern, ntuples, WTA_K_, npoints); - } + n_features_per_level_[nLevels_ - 1] = nFeatures - sum_n_features; - pattern_.upload(h_pattern); - - blurFilter = cuda::createGaussianFilter(CV_8UC1, -1, Size(7, 7), 2, 2, BORDER_REFLECT_101); - - blurForDescriptor = false; -} - -namespace -{ - inline float getScale(float scaleFactor, int firstLevel, int level) - { - return pow(scaleFactor, level - firstLevel); - } -} - -void cv::cuda::ORB_CUDA::buildScalePyramids(const GpuMat& image, const GpuMat& mask) -{ - CV_Assert(image.type() == CV_8UC1); - CV_Assert(mask.empty() || (mask.type() == CV_8UC1 && mask.size() == image.size())); - - imagePyr_.resize(nLevels_); - maskPyr_.resize(nLevels_); - - for (int level = 0; level < nLevels_; ++level) - { - float scale = 1.0f / getScale(scaleFactor_, firstLevel_, level); - - Size sz(cvRound(image.cols * scale), cvRound(image.rows * scale)); - - ensureSizeIsEnough(sz, image.type(), imagePyr_[level]); - ensureSizeIsEnough(sz, CV_8UC1, maskPyr_[level]); - maskPyr_[level].setTo(Scalar::all(255)); - - // Compute the resized image - if (level != firstLevel_) + // pre-compute the end of a row in a circular patch + int half_patch_size = patchSize_ / 2; + std::vector u_max(half_patch_size + 2); + for (int v = 0; v <= half_patch_size * std::sqrt(2.f) / 2 + 1; ++v) { - if (level < firstLevel_) - { - cuda::resize(image, imagePyr_[level], sz, 0, 0, INTER_LINEAR); + u_max[v] = cvRound(std::sqrt(static_cast(half_patch_size * half_patch_size - v * v))); + } - if (!mask.empty()) - cuda::resize(mask, maskPyr_[level], sz, 0, 0, INTER_LINEAR); - } - else - { - cuda::resize(imagePyr_[level - 1], imagePyr_[level], sz, 0, 0, INTER_LINEAR); + // Make sure we are symmetric + for (int v = half_patch_size, v_0 = 0; v >= half_patch_size * std::sqrt(2.f) / 2; --v) + { + while (u_max[v_0] == u_max[v_0 + 1]) + ++v_0; + u_max[v] = v_0; + ++v_0; + } + CV_Assert( u_max.size() < 32 ); + cv::cuda::device::orb::loadUMax(&u_max[0], static_cast(u_max.size())); - if (!mask.empty()) - { - cuda::resize(maskPyr_[level - 1], maskPyr_[level], sz, 0, 0, INTER_LINEAR); - cuda::threshold(maskPyr_[level], maskPyr_[level], 254, 0, THRESH_TOZERO); - } + // Calc pattern + const int npoints = 512; + Point pattern_buf[npoints]; + const Point* pattern0 = (const Point*)bit_pattern_31_; + if (patchSize_ != 31) + { + pattern0 = pattern_buf; + makeRandomPattern(patchSize_, pattern_buf, npoints); + } + + Mat h_pattern; + if (WTA_K_ == 2) + { + h_pattern.create(2, npoints, CV_32SC1); + + int* pattern_x_ptr = h_pattern.ptr(0); + int* pattern_y_ptr = h_pattern.ptr(1); + + for (int i = 0; i < npoints; ++i) + { + pattern_x_ptr[i] = pattern0[i].x; + pattern_y_ptr[i] = pattern0[i].y; } } else { - image.copyTo(imagePyr_[level]); - - if (!mask.empty()) - mask.copyTo(maskPyr_[level]); + int ntuples = descriptorSize() * 4; + initializeOrbPattern(pattern0, h_pattern, ntuples, WTA_K_, npoints); 
} - // Filter keypoints by image border - ensureSizeIsEnough(sz, CV_8UC1, buf_); - buf_.setTo(Scalar::all(0)); - Rect inner(edgeThreshold_, edgeThreshold_, sz.width - 2 * edgeThreshold_, sz.height - 2 * edgeThreshold_); - buf_(inner).setTo(Scalar::all(255)); + pattern_.upload(h_pattern); - cuda::bitwise_and(maskPyr_[level], buf_, maskPyr_[level]); + blurFilter_ = cuda::createGaussianFilter(CV_8UC1, -1, Size(7, 7), 2, 2, BORDER_REFLECT_101); } -} -namespace -{ - //takes keypoints and culls them by the response - void cull(GpuMat& keypoints, int& count, int n_points) + void ORB_Impl::detectAndCompute(InputArray _image, InputArray _mask, std::vector& keypoints, OutputArray _descriptors, bool useProvidedKeypoints) + { + CV_Assert( useProvidedKeypoints == false ); + + detectAndComputeAsync(_image, _mask, d_keypoints_, _descriptors, false, Stream::Null()); + convert(d_keypoints_, keypoints); + } + + void ORB_Impl::detectAndComputeAsync(InputArray _image, InputArray _mask, OutputArray _keypoints, OutputArray _descriptors, bool useProvidedKeypoints, Stream& stream) + { + CV_Assert( useProvidedKeypoints == false ); + + buildScalePyramids(_image, _mask); + computeKeyPointsPyramid(); + if (_descriptors.needed()) + { + computeDescriptors(_descriptors); + } + mergeKeyPoints(_keypoints); + } + + static float getScale(float scaleFactor, int firstLevel, int level) + { + return pow(scaleFactor, level - firstLevel); + } + + void ORB_Impl::buildScalePyramids(InputArray _image, InputArray _mask) + { + const GpuMat image = _image.getGpuMat(); + const GpuMat mask = _mask.getGpuMat(); + + CV_Assert( image.type() == CV_8UC1 ); + CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == image.size()) ); + + imagePyr_.resize(nLevels_); + maskPyr_.resize(nLevels_); + + for (int level = 0; level < nLevels_; ++level) + { + float scale = 1.0f / getScale(scaleFactor_, firstLevel_, level); + + Size sz(cvRound(image.cols * scale), cvRound(image.rows * scale)); + + ensureSizeIsEnough(sz, image.type(), imagePyr_[level]); + ensureSizeIsEnough(sz, CV_8UC1, maskPyr_[level]); + maskPyr_[level].setTo(Scalar::all(255)); + + // Compute the resized image + if (level != firstLevel_) + { + if (level < firstLevel_) + { + cuda::resize(image, imagePyr_[level], sz, 0, 0, INTER_LINEAR); + + if (!mask.empty()) + cuda::resize(mask, maskPyr_[level], sz, 0, 0, INTER_LINEAR); + } + else + { + cuda::resize(imagePyr_[level - 1], imagePyr_[level], sz, 0, 0, INTER_LINEAR); + + if (!mask.empty()) + { + cuda::resize(maskPyr_[level - 1], maskPyr_[level], sz, 0, 0, INTER_LINEAR); + cuda::threshold(maskPyr_[level], maskPyr_[level], 254, 0, THRESH_TOZERO); + } + } + } + else + { + image.copyTo(imagePyr_[level]); + + if (!mask.empty()) + mask.copyTo(maskPyr_[level]); + } + + // Filter keypoints by image border + ensureSizeIsEnough(sz, CV_8UC1, buf_); + buf_.setTo(Scalar::all(0)); + Rect inner(edgeThreshold_, edgeThreshold_, sz.width - 2 * edgeThreshold_, sz.height - 2 * edgeThreshold_); + buf_(inner).setTo(Scalar::all(255)); + + cuda::bitwise_and(maskPyr_[level], buf_, maskPyr_[level]); + } + } + + // takes keypoints and culls them by the response + static void cull(GpuMat& keypoints, int& count, int n_points) { using namespace cv::cuda::device::orb; @@ -557,217 +673,196 @@ namespace count = cull_gpu(keypoints.ptr(cuda::FastFeatureDetector::LOCATION_ROW), keypoints.ptr(cuda::FastFeatureDetector::RESPONSE_ROW), count, n_points); } } -} -void cv::cuda::ORB_CUDA::computeKeyPointsPyramid() -{ - using namespace cv::cuda::device::orb; - - int 
half_patch_size = patchSize_ / 2; - - keyPointsPyr_.resize(nLevels_); - keyPointsCount_.resize(nLevels_); - - for (int level = 0; level < nLevels_; ++level) + void ORB_Impl::computeKeyPointsPyramid() { - fastDetector_->setMaxNumPoints(0.05 * imagePyr_[level].size().area()); + using namespace cv::cuda::device::orb; - GpuMat fastKpRange; - fastDetector_->detectAsync(imagePyr_[level], fastKpRange, maskPyr_[level], Stream::Null()); + int half_patch_size = patchSize_ / 2; - keyPointsCount_[level] = fastKpRange.cols; + keyPointsPyr_.resize(nLevels_); + keyPointsCount_.resize(nLevels_); - if (keyPointsCount_[level] == 0) - continue; + fastDetector_->setThreshold(fastThreshold_); - ensureSizeIsEnough(3, keyPointsCount_[level], fastKpRange.type(), keyPointsPyr_[level]); - fastKpRange.copyTo(keyPointsPyr_[level].rowRange(0, 2)); - - const int n_features = static_cast(n_features_per_level_[level]); - - if (scoreType_ == ORB::HARRIS_SCORE) + for (int level = 0; level < nLevels_; ++level) { - // Keep more points than necessary as FAST does not give amazing corners - cull(keyPointsPyr_[level], keyPointsCount_[level], 2 * n_features); + fastDetector_->setMaxNumPoints(0.05 * imagePyr_[level].size().area()); - // Compute the Harris cornerness (better scoring than FAST) - HarrisResponses_gpu(imagePyr_[level], keyPointsPyr_[level].ptr(0), keyPointsPyr_[level].ptr(1), keyPointsCount_[level], 7, HARRIS_K, 0); + GpuMat fastKpRange; + fastDetector_->detectAsync(imagePyr_[level], fastKpRange, maskPyr_[level], Stream::Null()); + + keyPointsCount_[level] = fastKpRange.cols; + + if (keyPointsCount_[level] == 0) + continue; + + ensureSizeIsEnough(3, keyPointsCount_[level], fastKpRange.type(), keyPointsPyr_[level]); + fastKpRange.copyTo(keyPointsPyr_[level].rowRange(0, 2)); + + const int n_features = static_cast(n_features_per_level_[level]); + + if (scoreType_ == ORB::HARRIS_SCORE) + { + // Keep more points than necessary as FAST does not give amazing corners + cull(keyPointsPyr_[level], keyPointsCount_[level], 2 * n_features); + + // Compute the Harris cornerness (better scoring than FAST) + HarrisResponses_gpu(imagePyr_[level], keyPointsPyr_[level].ptr(0), keyPointsPyr_[level].ptr(1), keyPointsCount_[level], 7, HARRIS_K, 0); + } + + //cull to the final desired level, using the new Harris scores or the original FAST scores. + cull(keyPointsPyr_[level], keyPointsCount_[level], n_features); + + // Compute orientation + IC_Angle_gpu(imagePyr_[level], keyPointsPyr_[level].ptr(0), keyPointsPyr_[level].ptr(2), keyPointsCount_[level], half_patch_size, 0); + } + } + + void ORB_Impl::computeDescriptors(OutputArray _descriptors) + { + using namespace cv::cuda::device::orb; + + int nAllkeypoints = 0; + + for (int level = 0; level < nLevels_; ++level) + nAllkeypoints += keyPointsCount_[level]; + + if (nAllkeypoints == 0) + { + _descriptors.release(); + return; } - //cull to the final desired level, using the new Harris scores or the original FAST scores. 
- cull(keyPointsPyr_[level], keyPointsCount_[level], n_features); + ensureSizeIsEnough(nAllkeypoints, descriptorSize(), CV_8UC1, _descriptors); + GpuMat descriptors = _descriptors.getGpuMat(); - // Compute orientation - IC_Angle_gpu(imagePyr_[level], keyPointsPyr_[level].ptr(0), keyPointsPyr_[level].ptr(2), keyPointsCount_[level], half_patch_size, 0); - } -} + int offset = 0; -void cv::cuda::ORB_CUDA::computeDescriptors(GpuMat& descriptors) -{ - using namespace cv::cuda::device::orb; - - int nAllkeypoints = 0; - - for (int level = 0; level < nLevels_; ++level) - nAllkeypoints += keyPointsCount_[level]; - - if (nAllkeypoints == 0) - { - descriptors.release(); - return; - } - - ensureSizeIsEnough(nAllkeypoints, descriptorSize(), CV_8UC1, descriptors); - - int offset = 0; - - for (int level = 0; level < nLevels_; ++level) - { - if (keyPointsCount_[level] == 0) - continue; - - GpuMat descRange = descriptors.rowRange(offset, offset + keyPointsCount_[level]); - - if (blurForDescriptor) + for (int level = 0; level < nLevels_; ++level) { - // preprocess the resized image - ensureSizeIsEnough(imagePyr_[level].size(), imagePyr_[level].type(), buf_); - blurFilter->apply(imagePyr_[level], buf_); + if (keyPointsCount_[level] == 0) + continue; + + GpuMat descRange = descriptors.rowRange(offset, offset + keyPointsCount_[level]); + + if (blurForDescriptor_) + { + // preprocess the resized image + ensureSizeIsEnough(imagePyr_[level].size(), imagePyr_[level].type(), buf_); + blurFilter_->apply(imagePyr_[level], buf_); + } + + computeOrbDescriptor_gpu(blurForDescriptor_ ? buf_ : imagePyr_[level], keyPointsPyr_[level].ptr(0), keyPointsPyr_[level].ptr(2), + keyPointsCount_[level], pattern_.ptr(0), pattern_.ptr(1), descRange, descriptorSize(), WTA_K_, 0); + + offset += keyPointsCount_[level]; + } + } + + void ORB_Impl::mergeKeyPoints(OutputArray _keypoints) + { + using namespace cv::cuda::device::orb; + + int nAllkeypoints = 0; + + for (int level = 0; level < nLevels_; ++level) + nAllkeypoints += keyPointsCount_[level]; + + if (nAllkeypoints == 0) + { + _keypoints.release(); + return; } - computeOrbDescriptor_gpu(blurForDescriptor ? buf_ : imagePyr_[level], keyPointsPyr_[level].ptr(0), keyPointsPyr_[level].ptr(2), - keyPointsCount_[level], pattern_.ptr(0), pattern_.ptr(1), descRange, descriptorSize(), WTA_K_, 0); + ensureSizeIsEnough(ROWS_COUNT, nAllkeypoints, CV_32FC1, _keypoints); + GpuMat& keypoints = _keypoints.getGpuMatRef(); - offset += keyPointsCount_[level]; + int offset = 0; + + for (int level = 0; level < nLevels_; ++level) + { + if (keyPointsCount_[level] == 0) + continue; + + float sf = getScale(scaleFactor_, firstLevel_, level); + + GpuMat keyPointsRange = keypoints.colRange(offset, offset + keyPointsCount_[level]); + + float locScale = level != firstLevel_ ? 
sf : 1.0f; + + mergeLocation_gpu(keyPointsPyr_[level].ptr(0), keyPointsRange.ptr(0), keyPointsRange.ptr(1), keyPointsCount_[level], locScale, 0); + + GpuMat range = keyPointsRange.rowRange(2, 4); + keyPointsPyr_[level](Range(1, 3), Range(0, keyPointsCount_[level])).copyTo(range); + + keyPointsRange.row(4).setTo(Scalar::all(level)); + keyPointsRange.row(5).setTo(Scalar::all(patchSize_ * sf)); + + offset += keyPointsCount_[level]; + } } -} -void cv::cuda::ORB_CUDA::mergeKeyPoints(GpuMat& keypoints) -{ - using namespace cv::cuda::device::orb; - - int nAllkeypoints = 0; - - for (int level = 0; level < nLevels_; ++level) - nAllkeypoints += keyPointsCount_[level]; - - if (nAllkeypoints == 0) + void ORB_Impl::convert(InputArray _gpu_keypoints, std::vector& keypoints) { - keypoints.release(); - return; - } + if (_gpu_keypoints.empty()) + { + keypoints.clear(); + return; + } - ensureSizeIsEnough(ROWS_COUNT, nAllkeypoints, CV_32FC1, keypoints); + Mat h_keypoints; + if (_gpu_keypoints.kind() == _InputArray::CUDA_GPU_MAT) + { + _gpu_keypoints.getGpuMat().download(h_keypoints); + } + else + { + h_keypoints = _gpu_keypoints.getMat(); + } - int offset = 0; + CV_Assert( h_keypoints.rows == ROWS_COUNT ); + CV_Assert( h_keypoints.type() == CV_32FC1 ); - for (int level = 0; level < nLevels_; ++level) - { - if (keyPointsCount_[level] == 0) - continue; + const int npoints = h_keypoints.cols; - float sf = getScale(scaleFactor_, firstLevel_, level); + keypoints.resize(npoints); - GpuMat keyPointsRange = keypoints.colRange(offset, offset + keyPointsCount_[level]); + const float* x_ptr = h_keypoints.ptr(X_ROW); + const float* y_ptr = h_keypoints.ptr(Y_ROW); + const float* response_ptr = h_keypoints.ptr(RESPONSE_ROW); + const float* angle_ptr = h_keypoints.ptr(ANGLE_ROW); + const float* octave_ptr = h_keypoints.ptr(OCTAVE_ROW); + const float* size_ptr = h_keypoints.ptr(SIZE_ROW); - float locScale = level != firstLevel_ ? 
sf : 1.0f; + for (int i = 0; i < npoints; ++i) + { + KeyPoint kp; - mergeLocation_gpu(keyPointsPyr_[level].ptr(0), keyPointsRange.ptr(0), keyPointsRange.ptr(1), keyPointsCount_[level], locScale, 0); + kp.pt.x = x_ptr[i]; + kp.pt.y = y_ptr[i]; + kp.response = response_ptr[i]; + kp.angle = angle_ptr[i]; + kp.octave = static_cast(octave_ptr[i]); + kp.size = size_ptr[i]; - GpuMat range = keyPointsRange.rowRange(2, 4); - keyPointsPyr_[level](Range(1, 3), Range(0, keyPointsCount_[level])).copyTo(range); - - keyPointsRange.row(4).setTo(Scalar::all(level)); - keyPointsRange.row(5).setTo(Scalar::all(patchSize_ * sf)); - - offset += keyPointsCount_[level]; + keypoints[i] = kp; + } } } -void cv::cuda::ORB_CUDA::downloadKeyPoints(const GpuMat &d_keypoints, std::vector& keypoints) +Ptr cv::cuda::ORB::create(int nfeatures, + float scaleFactor, + int nlevels, + int edgeThreshold, + int firstLevel, + int WTA_K, + int scoreType, + int patchSize, + int fastThreshold, + bool blurForDescriptor) { - if (d_keypoints.empty()) - { - keypoints.clear(); - return; - } - - Mat h_keypoints(d_keypoints); - - convertKeyPoints(h_keypoints, keypoints); -} - -void cv::cuda::ORB_CUDA::convertKeyPoints(const Mat &d_keypoints, std::vector& keypoints) -{ - if (d_keypoints.empty()) - { - keypoints.clear(); - return; - } - - CV_Assert(d_keypoints.type() == CV_32FC1 && d_keypoints.rows == ROWS_COUNT); - - const float* x_ptr = d_keypoints.ptr(X_ROW); - const float* y_ptr = d_keypoints.ptr(Y_ROW); - const float* response_ptr = d_keypoints.ptr(RESPONSE_ROW); - const float* angle_ptr = d_keypoints.ptr(ANGLE_ROW); - const float* octave_ptr = d_keypoints.ptr(OCTAVE_ROW); - const float* size_ptr = d_keypoints.ptr(SIZE_ROW); - - keypoints.resize(d_keypoints.cols); - - for (int i = 0; i < d_keypoints.cols; ++i) - { - KeyPoint kp; - - kp.pt.x = x_ptr[i]; - kp.pt.y = y_ptr[i]; - kp.response = response_ptr[i]; - kp.angle = angle_ptr[i]; - kp.octave = static_cast(octave_ptr[i]); - kp.size = size_ptr[i]; - - keypoints[i] = kp; - } -} - -void cv::cuda::ORB_CUDA::operator()(const GpuMat& image, const GpuMat& mask, GpuMat& keypoints) -{ - buildScalePyramids(image, mask); - computeKeyPointsPyramid(); - mergeKeyPoints(keypoints); -} - -void cv::cuda::ORB_CUDA::operator()(const GpuMat& image, const GpuMat& mask, GpuMat& keypoints, GpuMat& descriptors) -{ - buildScalePyramids(image, mask); - computeKeyPointsPyramid(); - computeDescriptors(descriptors); - mergeKeyPoints(keypoints); -} - -void cv::cuda::ORB_CUDA::operator()(const GpuMat& image, const GpuMat& mask, std::vector& keypoints) -{ - (*this)(image, mask, d_keypoints_); - downloadKeyPoints(d_keypoints_, keypoints); -} - -void cv::cuda::ORB_CUDA::operator()(const GpuMat& image, const GpuMat& mask, std::vector& keypoints, GpuMat& descriptors) -{ - (*this)(image, mask, d_keypoints_, descriptors); - downloadKeyPoints(d_keypoints_, keypoints); -} - -void cv::cuda::ORB_CUDA::release() -{ - imagePyr_.clear(); - maskPyr_.clear(); - - buf_.release(); - - keyPointsPyr_.clear(); - - d_keypoints_.release(); + return makePtr(nfeatures, scaleFactor, nlevels, edgeThreshold, firstLevel, WTA_K, scoreType, patchSize, fastThreshold, blurForDescriptor); } #endif /* !defined (HAVE_CUDA) */ diff --git a/modules/cudafeatures2d/test/test_features2d.cpp b/modules/cudafeatures2d/test/test_features2d.cpp index 9a8d76ce3..25ba48faf 100644 --- a/modules/cudafeatures2d/test/test_features2d.cpp +++ b/modules/cudafeatures2d/test/test_features2d.cpp @@ -122,7 +122,7 @@ namespace IMPLEMENT_PARAM_CLASS(ORB_BlurForDescriptor, 
bool) } -CV_ENUM(ORB_ScoreType, ORB::HARRIS_SCORE, ORB::FAST_SCORE) +CV_ENUM(ORB_ScoreType, cv::ORB::HARRIS_SCORE, cv::ORB::FAST_SCORE) PARAM_TEST_CASE(ORB, cv::cuda::DeviceInfo, ORB_FeaturesCount, ORB_ScaleFactor, ORB_LevelsCount, ORB_EdgeThreshold, ORB_firstLevel, ORB_WTA_K, ORB_ScoreType, ORB_PatchSize, ORB_BlurForDescriptor) { @@ -162,8 +162,9 @@ CUDA_TEST_P(ORB, Accuracy) cv::Mat mask(image.size(), CV_8UC1, cv::Scalar::all(1)); mask(cv::Range(0, image.rows / 2), cv::Range(0, image.cols / 2)).setTo(cv::Scalar::all(0)); - cv::cuda::ORB_CUDA orb(nFeatures, scaleFactor, nLevels, edgeThreshold, firstLevel, WTA_K, scoreType, patchSize); - orb.blurForDescriptor = blurForDescriptor; + cv::Ptr orb = + cv::cuda::ORB::create(nFeatures, scaleFactor, nLevels, edgeThreshold, firstLevel, + WTA_K, scoreType, patchSize, 20, blurForDescriptor); if (!supportFeature(devInfo, cv::cuda::GLOBAL_ATOMICS)) { @@ -171,7 +172,7 @@ CUDA_TEST_P(ORB, Accuracy) { std::vector keypoints; cv::cuda::GpuMat descriptors; - orb(loadMat(image), loadMat(mask), keypoints, descriptors); + orb->detectAndComputeAsync(loadMat(image), loadMat(mask), keypoints, descriptors); } catch (const cv::Exception& e) { @@ -182,7 +183,7 @@ CUDA_TEST_P(ORB, Accuracy) { std::vector keypoints; cv::cuda::GpuMat descriptors; - orb(loadMat(image), loadMat(mask), keypoints, descriptors); + orb->detectAndCompute(loadMat(image), loadMat(mask), keypoints, descriptors); cv::Ptr orb_gold = cv::ORB::create(nFeatures, scaleFactor, nLevels, edgeThreshold, firstLevel, WTA_K, scoreType, patchSize); diff --git a/samples/gpu/performance/tests.cpp b/samples/gpu/performance/tests.cpp index 8869a1b66..0d083e5bd 100644 --- a/samples/gpu/performance/tests.cpp +++ b/samples/gpu/performance/tests.cpp @@ -350,15 +350,15 @@ TEST(ORB) orb->detectAndCompute(src, Mat(), keypoints, descriptors); CPU_OFF; - cuda::ORB_CUDA d_orb; + Ptr d_orb = cuda::ORB::create(); cuda::GpuMat d_src(src); cuda::GpuMat d_keypoints; cuda::GpuMat d_descriptors; - d_orb(d_src, cuda::GpuMat(), d_keypoints, d_descriptors); + d_orb->detectAndComputeAsync(d_src, cuda::GpuMat(), d_keypoints, d_descriptors); CUDA_ON; - d_orb(d_src, cuda::GpuMat(), d_keypoints, d_descriptors); + d_orb->detectAndComputeAsync(d_src, cuda::GpuMat(), d_keypoints, d_descriptors); CUDA_OFF; } From 764d55b81df438ff218c861d4e47459f89a9f467 Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Tue, 13 Jan 2015 17:57:09 +0300 Subject: [PATCH 6/8] add extended documentation for Features2DAsync --- .../include/opencv2/cudafeatures2d.hpp | 27 ++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/modules/cudafeatures2d/include/opencv2/cudafeatures2d.hpp b/modules/cudafeatures2d/include/opencv2/cudafeatures2d.hpp index 4a78d50e6..c7ab6e392 100644 --- a/modules/cudafeatures2d/include/opencv2/cudafeatures2d.hpp +++ b/modules/cudafeatures2d/include/opencv2/cudafeatures2d.hpp @@ -233,28 +233,47 @@ private: // Feature2DAsync // +/** @brief Abstract base class for CUDA asynchronous 2D image feature detectors and descriptor extractors. + */ class CV_EXPORTS Feature2DAsync { public: virtual ~Feature2DAsync(); + /** @brief Detects keypoints in an image. + + @param image Image. + @param keypoints The detected keypoints. + @param mask Mask specifying where to look for keypoints (optional). It must be a 8-bit integer + matrix with non-zero values in the region of interest. + @param stream CUDA stream. 
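    A minimal sketch of the intended call sequence, assuming the cuda::FastFeatureDetector::create
    factory declared later in this header and a CUDA-capable device (the image file name, threshold
    and stream below are placeholder values):
    @code
    cv::Ptr<cv::cuda::FastFeatureDetector> fast =
        cv::cuda::FastFeatureDetector::create(20, true);

    cv::cuda::GpuMat d_image(cv::imread("scene.png", cv::IMREAD_GRAYSCALE)); // placeholder file name

    cv::cuda::Stream stream;
    cv::cuda::GpuMat d_keypoints;
    fast->detectAsync(d_image, d_keypoints, cv::noArray(), stream);

    stream.waitForCompletion();                  // results are valid only after synchronization
    std::vector<cv::KeyPoint> keypoints;
    fast->convert(d_keypoints, keypoints);       // internal GPU layout -> std::vector<KeyPoint>
    @endcode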
+ */ virtual void detectAsync(InputArray image, OutputArray keypoints, InputArray mask = noArray(), Stream& stream = Stream::Null()); + /** @brief Computes the descriptors for a set of keypoints detected in an image. + + @param image Image. + @param keypoints Input collection of keypoints. + @param descriptors Computed descriptors. Row j is the descriptor for j-th keypoint. + @param stream CUDA stream. + */ virtual void computeAsync(InputArray image, OutputArray keypoints, OutputArray descriptors, Stream& stream = Stream::Null()); + /** Detects keypoints and computes the descriptors. */ virtual void detectAndComputeAsync(InputArray image, InputArray mask, OutputArray keypoints, OutputArray descriptors, - bool useProvidedKeypoints=false, + bool useProvidedKeypoints = false, Stream& stream = Stream::Null()); + /** Converts keypoints array from internal representation to standard vector. */ virtual void convert(InputArray gpu_keypoints, std::vector& keypoints) = 0; }; @@ -263,6 +282,8 @@ public: // FastFeatureDetector // +/** @brief Wrapping class for feature detection using the FAST method. + */ class CV_EXPORTS FastFeatureDetector : public cv::FastFeatureDetector, public Feature2DAsync { public: @@ -288,6 +309,10 @@ public: // ORB // +/** @brief Class implementing the ORB (*oriented BRIEF*) keypoint detector and descriptor extractor + * + * @sa cv::ORB + */ class CV_EXPORTS ORB : public cv::ORB, public Feature2DAsync { public: From 8a178da1a42d6ec2a26eed1c998889377d1576ae Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Tue, 13 Jan 2015 17:57:30 +0300 Subject: [PATCH 7/8] refactor CUDA BFMatcher algorithm: use new abstract interface and hidden implementation --- .../include/opencv2/cudafeatures2d.hpp | 411 ++-- .../cudafeatures2d/perf/perf_features2d.cpp | 24 +- .../src/brute_force_matcher.cpp | 1790 +++++++++-------- .../cudafeatures2d/test/test_features2d.cpp | 68 +- modules/stitching/src/matchers.cpp | 11 +- samples/gpu/performance/tests.cpp | 18 +- 6 files changed, 1269 insertions(+), 1053 deletions(-) diff --git a/modules/cudafeatures2d/include/opencv2/cudafeatures2d.hpp b/modules/cudafeatures2d/include/opencv2/cudafeatures2d.hpp index c7ab6e392..975726973 100644 --- a/modules/cudafeatures2d/include/opencv2/cudafeatures2d.hpp +++ b/modules/cudafeatures2d/include/opencv2/cudafeatures2d.hpp @@ -63,170 +63,315 @@ namespace cv { namespace cuda { //! @addtogroup cudafeatures2d //! @{ -/** @brief Brute-force descriptor matcher. +// +// DescriptorMatcher +// -For each descriptor in the first set, this matcher finds the closest descriptor in the second set -by trying each one. This descriptor matcher supports masking permissible matches between descriptor -sets. +/** @brief Abstract base class for matching keypoint descriptors. -The class BFMatcher_CUDA has an interface similar to the class DescriptorMatcher. It has two groups -of match methods: for matching descriptors of one image with another image or with an image set. -Also, all functions have an alternative to save results either to the GPU memory or to the CPU -memory. - -@sa DescriptorMatcher, BFMatcher +It has two groups of match methods: for matching descriptors of an image with another image or with +an image set. */ -class CV_EXPORTS BFMatcher_CUDA +class CV_EXPORTS DescriptorMatcher : public cv::Algorithm { public: - explicit BFMatcher_CUDA(int norm = cv::NORM_L2); + // + // Factories + // - //! 
Add descriptors to train descriptor collection - void add(const std::vector& descCollection); + /** @brief Brute-force descriptor matcher. - //! Get train descriptors collection - const std::vector& getTrainDescriptors() const; + For each descriptor in the first set, this matcher finds the closest descriptor in the second set + by trying each one. This descriptor matcher supports masking permissible matches of descriptor + sets. - //! Clear train descriptors collection - void clear(); + @param normType One of NORM_L1, NORM_L2, NORM_HAMMING. L1 and L2 norms are + preferable choices for SIFT and SURF descriptors, NORM_HAMMING should be used with ORB, BRISK and + BRIEF). + */ + static Ptr createBFMatcher(int norm = cv::NORM_L2); - //! Return true if there are not train descriptors in collection - bool empty() const; + // + // Utility + // - //! Return true if the matcher supports mask in match methods - bool isMaskSupported() const; + /** @brief Returns true if the descriptor matcher supports masking permissible matches. + */ + virtual bool isMaskSupported() const = 0; - //! Find one best match for each query descriptor - void matchSingle(const GpuMat& query, const GpuMat& train, - GpuMat& trainIdx, GpuMat& distance, - const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null()); + // + // Descriptor collection + // - //! Download trainIdx and distance and convert it to CPU vector with DMatch - static void matchDownload(const GpuMat& trainIdx, const GpuMat& distance, std::vector& matches); - //! Convert trainIdx and distance to vector with DMatch - static void matchConvert(const Mat& trainIdx, const Mat& distance, std::vector& matches); + /** @brief Adds descriptors to train a descriptor collection. - //! Find one best match for each query descriptor - void match(const GpuMat& query, const GpuMat& train, std::vector& matches, const GpuMat& mask = GpuMat()); + If the collection is not empty, the new descriptors are added to existing train descriptors. - //! Make gpu collection of trains and masks in suitable format for matchCollection function - void makeGpuCollection(GpuMat& trainCollection, GpuMat& maskCollection, const std::vector& masks = std::vector()); + @param descriptors Descriptors to add. Each descriptors[i] is a set of descriptors from the same + train image. + */ + virtual void add(const std::vector& descriptors) = 0; - //! Find one best match from train collection for each query descriptor - void matchCollection(const GpuMat& query, const GpuMat& trainCollection, - GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, - const GpuMat& masks = GpuMat(), Stream& stream = Stream::Null()); + /** @brief Returns a constant link to the train descriptor collection. + */ + virtual const std::vector& getTrainDescriptors() const = 0; - //! Download trainIdx, imgIdx and distance and convert it to vector with DMatch - static void matchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, std::vector& matches); - //! Convert trainIdx, imgIdx and distance to vector with DMatch - static void matchConvert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance, std::vector& matches); + /** @brief Clears the train descriptor collection. + */ + virtual void clear() = 0; - //! Find one best match from train collection for each query descriptor. - void match(const GpuMat& query, std::vector& matches, const std::vector& masks = std::vector()); + /** @brief Returns true if there are no train descriptors in the collection. + */ + virtual bool empty() const = 0; - //! 
Find k best matches for each query descriptor (in increasing order of distances) - void knnMatchSingle(const GpuMat& query, const GpuMat& train, - GpuMat& trainIdx, GpuMat& distance, GpuMat& allDist, int k, - const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null()); + /** @brief Trains a descriptor matcher. - //! Download trainIdx and distance and convert it to vector with DMatch - //! compactResult is used when mask is not empty. If compactResult is false matches - //! vector will have the same size as queryDescriptors rows. If compactResult is true - //! matches vector will not contain matches for fully masked out query descriptors. - static void knnMatchDownload(const GpuMat& trainIdx, const GpuMat& distance, - std::vector< std::vector >& matches, bool compactResult = false); - //! Convert trainIdx and distance to vector with DMatch - static void knnMatchConvert(const Mat& trainIdx, const Mat& distance, - std::vector< std::vector >& matches, bool compactResult = false); + Trains a descriptor matcher (for example, the flann index). In all methods to match, the method + train() is run every time before matching. + */ + virtual void train() = 0; - //! Find k best matches for each query descriptor (in increasing order of distances). - //! compactResult is used when mask is not empty. If compactResult is false matches - //! vector will have the same size as queryDescriptors rows. If compactResult is true - //! matches vector will not contain matches for fully masked out query descriptors. - void knnMatch(const GpuMat& query, const GpuMat& train, - std::vector< std::vector >& matches, int k, const GpuMat& mask = GpuMat(), - bool compactResult = false); + // + // 1 to 1 match + // - //! Find k best matches from train collection for each query descriptor (in increasing order of distances) - void knnMatch2Collection(const GpuMat& query, const GpuMat& trainCollection, - GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, - const GpuMat& maskCollection = GpuMat(), Stream& stream = Stream::Null()); + /** @brief Finds the best match for each descriptor from a query set (blocking version). - //! Download trainIdx and distance and convert it to vector with DMatch - //! compactResult is used when mask is not empty. If compactResult is false matches - //! vector will have the same size as queryDescriptors rows. If compactResult is true - //! matches vector will not contain matches for fully masked out query descriptors. - //! @see BFMatcher_CUDA::knnMatchDownload - static void knnMatch2Download(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, - std::vector< std::vector >& matches, bool compactResult = false); - //! Convert trainIdx and distance to vector with DMatch - //! @see BFMatcher_CUDA::knnMatchConvert - static void knnMatch2Convert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance, - std::vector< std::vector >& matches, bool compactResult = false); + @param queryDescriptors Query set of descriptors. + @param trainDescriptors Train set of descriptors. This set is not added to the train descriptors + collection stored in the class object. + @param matches Matches. If a query descriptor is masked out in mask , no match is added for this + descriptor. So, matches size may be smaller than the query descriptors count. + @param mask Mask specifying permissible matches between an input query and train matrices of + descriptors. - //! Find k best matches for each query descriptor (in increasing order of distances). - //! compactResult is used when mask is not empty. 
If compactResult is false matches - //! vector will have the same size as queryDescriptors rows. If compactResult is true - //! matches vector will not contain matches for fully masked out query descriptors. - void knnMatch(const GpuMat& query, std::vector< std::vector >& matches, int k, - const std::vector& masks = std::vector(), bool compactResult = false); + In the first variant of this method, the train descriptors are passed as an input argument. In the + second variant of the method, train descriptors collection that was set by DescriptorMatcher::add is + used. Optional mask (or masks) can be passed to specify which query and training descriptors can be + matched. Namely, queryDescriptors[i] can be matched with trainDescriptors[j] only if + mask.at\(i,j) is non-zero. + */ + virtual void match(InputArray queryDescriptors, InputArray trainDescriptors, + std::vector& matches, + InputArray mask = noArray()) = 0; - //! Find best matches for each query descriptor which have distance less than maxDistance. - //! nMatches.at(0, queryIdx) will contain matches count for queryIdx. - //! carefully nMatches can be greater than trainIdx.cols - it means that matcher didn't find all matches, - //! because it didn't have enough memory. - //! If trainIdx is empty, then trainIdx and distance will be created with size nQuery x max((nTrain / 100), 10), - //! otherwize user can pass own allocated trainIdx and distance with size nQuery x nMaxMatches - //! Matches doesn't sorted. - void radiusMatchSingle(const GpuMat& query, const GpuMat& train, - GpuMat& trainIdx, GpuMat& distance, GpuMat& nMatches, float maxDistance, - const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null()); + /** @overload + */ + virtual void match(InputArray queryDescriptors, + std::vector& matches, + const std::vector& masks = std::vector()) = 0; - //! Download trainIdx, nMatches and distance and convert it to vector with DMatch. - //! matches will be sorted in increasing order of distances. - //! compactResult is used when mask is not empty. If compactResult is false matches - //! vector will have the same size as queryDescriptors rows. If compactResult is true - //! matches vector will not contain matches for fully masked out query descriptors. - static void radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& distance, const GpuMat& nMatches, - std::vector< std::vector >& matches, bool compactResult = false); - //! Convert trainIdx, nMatches and distance to vector with DMatch. - static void radiusMatchConvert(const Mat& trainIdx, const Mat& distance, const Mat& nMatches, - std::vector< std::vector >& matches, bool compactResult = false); + /** @brief Finds the best match for each descriptor from a query set (asynchronous version). - //! Find best matches for each query descriptor which have distance less than maxDistance - //! in increasing order of distances). - void radiusMatch(const GpuMat& query, const GpuMat& train, - std::vector< std::vector >& matches, float maxDistance, - const GpuMat& mask = GpuMat(), bool compactResult = false); + @param queryDescriptors Query set of descriptors. + @param trainDescriptors Train set of descriptors. This set is not added to the train descriptors + collection stored in the class object. + @param matches Matches array stored in GPU memory. Internal representation is not defined. + Use DescriptorMatcher::matchConvert method to retrieve results in standard representation. + @param mask Mask specifying permissible matches between an input query and train matrices of + descriptors. 
+ @param stream CUDA stream. - //! Find best matches for each query descriptor which have distance less than maxDistance. - //! If trainIdx is empty, then trainIdx and distance will be created with size nQuery x max((nQuery / 100), 10), - //! otherwize user can pass own allocated trainIdx and distance with size nQuery x nMaxMatches - //! Matches doesn't sorted. - void radiusMatchCollection(const GpuMat& query, GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, GpuMat& nMatches, float maxDistance, - const std::vector& masks = std::vector(), Stream& stream = Stream::Null()); + In the first variant of this method, the train descriptors are passed as an input argument. In the + second variant of the method, train descriptors collection that was set by DescriptorMatcher::add is + used. Optional mask (or masks) can be passed to specify which query and training descriptors can be + matched. Namely, queryDescriptors[i] can be matched with trainDescriptors[j] only if + mask.at\(i,j) is non-zero. + */ + virtual void matchAsync(InputArray queryDescriptors, InputArray trainDescriptors, + OutputArray matches, + InputArray mask = noArray(), + Stream& stream = Stream::Null()) = 0; - //! Download trainIdx, imgIdx, nMatches and distance and convert it to vector with DMatch. - //! matches will be sorted in increasing order of distances. - //! compactResult is used when mask is not empty. If compactResult is false matches - //! vector will have the same size as queryDescriptors rows. If compactResult is true - //! matches vector will not contain matches for fully masked out query descriptors. - static void radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, const GpuMat& nMatches, - std::vector< std::vector >& matches, bool compactResult = false); - //! Convert trainIdx, nMatches and distance to vector with DMatch. - static void radiusMatchConvert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance, const Mat& nMatches, - std::vector< std::vector >& matches, bool compactResult = false); + /** @overload + */ + virtual void matchAsync(InputArray queryDescriptors, + OutputArray matches, + const std::vector& masks = std::vector(), + Stream& stream = Stream::Null()) = 0; - //! Find best matches from train collection for each query descriptor which have distance less than - //! maxDistance (in increasing order of distances). - void radiusMatch(const GpuMat& query, std::vector< std::vector >& matches, float maxDistance, - const std::vector& masks = std::vector(), bool compactResult = false); + /** @brief Converts matches array from internal representation to standard matches vector. - int norm; + The method is supposed to be used with DescriptorMatcher::matchAsync to get final result. + Call this method only after DescriptorMatcher::matchAsync is completed (ie. after synchronization). -private: - std::vector trainDescCollection; + @param gpu_matches Matches, returned from DescriptorMatcher::matchAsync. + @param matches Vector of DMatch objects. + */ + virtual void matchConvert(InputArray gpu_matches, + std::vector& matches) = 0; + + // + // knn match + // + + /** @brief Finds the k best matches for each descriptor from a query set (blocking version). + + @param queryDescriptors Query set of descriptors. + @param trainDescriptors Train set of descriptors. This set is not added to the train descriptors + collection stored in the class object. + @param matches Matches. Each matches[i] is k or less matches for the same query descriptor. 
+ @param k Count of best matches found per each query descriptor or less if a query descriptor has + less than k possible matches in total. + @param mask Mask specifying permissible matches between an input query and train matrices of + descriptors. + @param compactResult Parameter used when the mask (or masks) is not empty. If compactResult is + false, the matches vector has the same size as queryDescriptors rows. If compactResult is true, + the matches vector does not contain matches for fully masked-out query descriptors. + + These extended variants of DescriptorMatcher::match methods find several best matches for each query + descriptor. The matches are returned in the distance increasing order. See DescriptorMatcher::match + for the details about query and train descriptors. + */ + virtual void knnMatch(InputArray queryDescriptors, InputArray trainDescriptors, + std::vector >& matches, + int k, + InputArray mask = noArray(), + bool compactResult = false) = 0; + + /** @overload + */ + virtual void knnMatch(InputArray queryDescriptors, + std::vector >& matches, + int k, + const std::vector& masks = std::vector(), + bool compactResult = false) = 0; + + /** @brief Finds the k best matches for each descriptor from a query set (asynchronous version). + + @param queryDescriptors Query set of descriptors. + @param trainDescriptors Train set of descriptors. This set is not added to the train descriptors + collection stored in the class object. + @param matches Matches array stored in GPU memory. Internal representation is not defined. + Use DescriptorMatcher::knnMatchConvert method to retrieve results in standard representation. + @param k Count of best matches found per each query descriptor or less if a query descriptor has + less than k possible matches in total. + @param mask Mask specifying permissible matches between an input query and train matrices of + descriptors. + @param compactResult Parameter used when the mask (or masks) is not empty. If compactResult is + false, the matches vector has the same size as queryDescriptors rows. If compactResult is true, + the matches vector does not contain matches for fully masked-out query descriptors. + @param stream CUDA stream. + + These extended variants of DescriptorMatcher::matchAsync methods find several best matches for each query + descriptor. The matches are returned in the distance increasing order. See DescriptorMatcher::matchAsync + for the details about query and train descriptors. + */ + virtual void knnMatchAsync(InputArray queryDescriptors, InputArray trainDescriptors, + OutputArray matches, + int k, + InputArray mask = noArray(), + Stream& stream = Stream::Null()) = 0; + + /** @overload + */ + virtual void knnMatchAsync(InputArray queryDescriptors, + OutputArray matches, + int k, + const std::vector& masks = std::vector(), + Stream& stream = Stream::Null()) = 0; + + /** @brief Converts matches array from internal representation to standard matches vector. + + The method is supposed to be used with DescriptorMatcher::knnMatchAsync to get final result. + Call this method only after DescriptorMatcher::knnMatchAsync is completed (ie. after synchronization). + + @param gpu_matches Matches, returned from DescriptorMatcher::knnMatchAsync. + @param matches Vector of DMatch objects. + @param compactResult Parameter used when the mask (or masks) is not empty. If compactResult is + false, the matches vector has the same size as queryDescriptors rows. 
If compactResult is true, + the matches vector does not contain matches for fully masked-out query descriptors. + */ + virtual void knnMatchConvert(InputArray gpu_matches, + std::vector< std::vector >& matches, + bool compactResult = false) = 0; + + // + // radius match + // + + /** @brief For each query descriptor, finds the training descriptors not farther than the specified distance (blocking version). + + @param queryDescriptors Query set of descriptors. + @param trainDescriptors Train set of descriptors. This set is not added to the train descriptors + collection stored in the class object. + @param matches Found matches. + @param maxDistance Threshold for the distance between matched descriptors. Distance means here + metric distance (e.g. Hamming distance), not the distance between coordinates (which is measured + in Pixels)! + @param mask Mask specifying permissible matches between an input query and train matrices of + descriptors. + @param compactResult Parameter used when the mask (or masks) is not empty. If compactResult is + false, the matches vector has the same size as queryDescriptors rows. If compactResult is true, + the matches vector does not contain matches for fully masked-out query descriptors. + + For each query descriptor, the methods find such training descriptors that the distance between the + query descriptor and the training descriptor is equal or smaller than maxDistance. Found matches are + returned in the distance increasing order. + */ + virtual void radiusMatch(InputArray queryDescriptors, InputArray trainDescriptors, + std::vector >& matches, + float maxDistance, + InputArray mask = noArray(), + bool compactResult = false) = 0; + + /** @overload + */ + virtual void radiusMatch(InputArray queryDescriptors, + std::vector >& matches, + float maxDistance, + const std::vector& masks = std::vector(), + bool compactResult = false) = 0; + + /** @brief For each query descriptor, finds the training descriptors not farther than the specified distance (asynchronous version). + + @param queryDescriptors Query set of descriptors. + @param trainDescriptors Train set of descriptors. This set is not added to the train descriptors + collection stored in the class object. + @param matches Matches array stored in GPU memory. Internal representation is not defined. + Use DescriptorMatcher::radiusMatchConvert method to retrieve results in standard representation. + @param maxDistance Threshold for the distance between matched descriptors. Distance means here + metric distance (e.g. Hamming distance), not the distance between coordinates (which is measured + in Pixels)! + @param mask Mask specifying permissible matches between an input query and train matrices of + descriptors. + @param compactResult Parameter used when the mask (or masks) is not empty. If compactResult is + false, the matches vector has the same size as queryDescriptors rows. If compactResult is true, + the matches vector does not contain matches for fully masked-out query descriptors. + @param stream CUDA stream. + + For each query descriptor, the methods find such training descriptors that the distance between the + query descriptor and the training descriptor is equal or smaller than maxDistance. Found matches are + returned in the distance increasing order. 
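    A minimal sketch of the asynchronous workflow described above (the descriptor matrices and the
    distance threshold are placeholders; the matcher is assumed to come from
    DescriptorMatcher::createBFMatcher):
    @code
    cv::Ptr<cv::cuda::DescriptorMatcher> matcher =
        cv::cuda::DescriptorMatcher::createBFMatcher(cv::NORM_HAMMING);

    cv::cuda::GpuMat d_query, d_train;   // assumed to already hold binary descriptors (e.g. ORB)
    cv::cuda::Stream stream;
    cv::cuda::GpuMat d_matches;

    matcher->radiusMatchAsync(d_query, d_train, d_matches, /*maxDistance*/ 50.0f,
                              cv::noArray(), stream);

    stream.waitForCompletion();          // GPU results are valid only after synchronization
    std::vector< std::vector<cv::DMatch> > matches;
    matcher->radiusMatchConvert(d_matches, matches);
    @endcode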
+ */ + virtual void radiusMatchAsync(InputArray queryDescriptors, InputArray trainDescriptors, + OutputArray matches, + float maxDistance, + InputArray mask = noArray(), + Stream& stream = Stream::Null()) = 0; + + /** @overload + */ + virtual void radiusMatchAsync(InputArray queryDescriptors, + OutputArray matches, + float maxDistance, + const std::vector& masks = std::vector(), + Stream& stream = Stream::Null()) = 0; + + /** @brief Converts matches array from internal representation to standard matches vector. + + The method is supposed to be used with DescriptorMatcher::radiusMatchAsync to get final result. + Call this method only after DescriptorMatcher::radiusMatchAsync is completed (ie. after synchronization). + + @param gpu_matches Matches, returned from DescriptorMatcher::radiusMatchAsync. + @param matches Vector of DMatch objects. + @param compactResult Parameter used when the mask (or masks) is not empty. If compactResult is + false, the matches vector has the same size as queryDescriptors rows. If compactResult is true, + the matches vector does not contain matches for fully masked-out query descriptors. + */ + virtual void radiusMatchConvert(InputArray gpu_matches, + std::vector< std::vector >& matches, + bool compactResult = false) = 0; }; // diff --git a/modules/cudafeatures2d/perf/perf_features2d.cpp b/modules/cudafeatures2d/perf/perf_features2d.cpp index 0dcb0434f..9d8134816 100644 --- a/modules/cudafeatures2d/perf/perf_features2d.cpp +++ b/modules/cudafeatures2d/perf/perf_features2d.cpp @@ -167,16 +167,16 @@ PERF_TEST_P(DescSize_Norm, BFMatch, if (PERF_RUN_CUDA()) { - cv::cuda::BFMatcher_CUDA d_matcher(normType); + cv::Ptr d_matcher = cv::cuda::DescriptorMatcher::createBFMatcher(normType); const cv::cuda::GpuMat d_query(query); const cv::cuda::GpuMat d_train(train); - cv::cuda::GpuMat d_trainIdx, d_distance; + cv::cuda::GpuMat d_matches; - TEST_CYCLE() d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance); + TEST_CYCLE() d_matcher->matchAsync(d_query, d_train, d_matches); std::vector gpu_matches; - d_matcher.matchDownload(d_trainIdx, d_distance, gpu_matches); + d_matcher->matchConvert(d_matches, gpu_matches); SANITY_CHECK_MATCHES(gpu_matches); } @@ -226,16 +226,16 @@ PERF_TEST_P(DescSize_K_Norm, BFKnnMatch, if (PERF_RUN_CUDA()) { - cv::cuda::BFMatcher_CUDA d_matcher(normType); + cv::Ptr d_matcher = cv::cuda::DescriptorMatcher::createBFMatcher(normType); const cv::cuda::GpuMat d_query(query); const cv::cuda::GpuMat d_train(train); - cv::cuda::GpuMat d_trainIdx, d_distance, d_allDist; + cv::cuda::GpuMat d_matches; - TEST_CYCLE() d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, k); + TEST_CYCLE() d_matcher->knnMatchAsync(d_query, d_train, d_matches, k); std::vector< std::vector > matchesTbl; - d_matcher.knnMatchDownload(d_trainIdx, d_distance, matchesTbl); + d_matcher->knnMatchConvert(d_matches, matchesTbl); std::vector gpu_matches; toOneRowMatches(matchesTbl, gpu_matches); @@ -280,16 +280,16 @@ PERF_TEST_P(DescSize_Norm, BFRadiusMatch, if (PERF_RUN_CUDA()) { - cv::cuda::BFMatcher_CUDA d_matcher(normType); + cv::Ptr d_matcher = cv::cuda::DescriptorMatcher::createBFMatcher(normType); const cv::cuda::GpuMat d_query(query); const cv::cuda::GpuMat d_train(train); - cv::cuda::GpuMat d_trainIdx, d_nMatches, d_distance; + cv::cuda::GpuMat d_matches; - TEST_CYCLE() d_matcher.radiusMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_nMatches, maxDistance); + TEST_CYCLE() d_matcher->radiusMatchAsync(d_query, d_train, d_matches, maxDistance); 
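        // convert the GPU-side result of radiusMatchAsync into the standard
        // std::vector< std::vector<DMatch> > layout before the sanity check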
std::vector< std::vector > matchesTbl; - d_matcher.radiusMatchDownload(d_trainIdx, d_distance, d_nMatches, matchesTbl); + d_matcher->radiusMatchConvert(d_matches, matchesTbl); std::vector gpu_matches; toOneRowMatches(matchesTbl, gpu_matches); diff --git a/modules/cudafeatures2d/src/brute_force_matcher.cpp b/modules/cudafeatures2d/src/brute_force_matcher.cpp index 5de0b06e3..a00537c8e 100644 --- a/modules/cudafeatures2d/src/brute_force_matcher.cpp +++ b/modules/cudafeatures2d/src/brute_force_matcher.cpp @@ -47,37 +47,7 @@ using namespace cv::cuda; #if !defined (HAVE_CUDA) || defined (CUDA_DISABLER) -cv::cuda::BFMatcher_CUDA::BFMatcher_CUDA(int) { throw_no_cuda(); } -void cv::cuda::BFMatcher_CUDA::add(const std::vector&) { throw_no_cuda(); } -const std::vector& cv::cuda::BFMatcher_CUDA::getTrainDescriptors() const { throw_no_cuda(); return trainDescCollection; } -void cv::cuda::BFMatcher_CUDA::clear() { throw_no_cuda(); } -bool cv::cuda::BFMatcher_CUDA::empty() const { throw_no_cuda(); return true; } -bool cv::cuda::BFMatcher_CUDA::isMaskSupported() const { throw_no_cuda(); return true; } -void cv::cuda::BFMatcher_CUDA::matchSingle(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_no_cuda(); } -void cv::cuda::BFMatcher_CUDA::matchDownload(const GpuMat&, const GpuMat&, std::vector&) { throw_no_cuda(); } -void cv::cuda::BFMatcher_CUDA::matchConvert(const Mat&, const Mat&, std::vector&) { throw_no_cuda(); } -void cv::cuda::BFMatcher_CUDA::match(const GpuMat&, const GpuMat&, std::vector&, const GpuMat&) { throw_no_cuda(); } -void cv::cuda::BFMatcher_CUDA::makeGpuCollection(GpuMat&, GpuMat&, const std::vector&) { throw_no_cuda(); } -void cv::cuda::BFMatcher_CUDA::matchCollection(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_no_cuda(); } -void cv::cuda::BFMatcher_CUDA::matchDownload(const GpuMat&, const GpuMat&, const GpuMat&, std::vector&) { throw_no_cuda(); } -void cv::cuda::BFMatcher_CUDA::matchConvert(const Mat&, const Mat&, const Mat&, std::vector&) { throw_no_cuda(); } -void cv::cuda::BFMatcher_CUDA::match(const GpuMat&, std::vector&, const std::vector&) { throw_no_cuda(); } -void cv::cuda::BFMatcher_CUDA::knnMatchSingle(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, const GpuMat&, Stream&) { throw_no_cuda(); } -void cv::cuda::BFMatcher_CUDA::knnMatchDownload(const GpuMat&, const GpuMat&, std::vector< std::vector >&, bool) { throw_no_cuda(); } -void cv::cuda::BFMatcher_CUDA::knnMatchConvert(const Mat&, const Mat&, std::vector< std::vector >&, bool) { throw_no_cuda(); } -void cv::cuda::BFMatcher_CUDA::knnMatch(const GpuMat&, const GpuMat&, std::vector< std::vector >&, int, const GpuMat&, bool) { throw_no_cuda(); } -void cv::cuda::BFMatcher_CUDA::knnMatch2Collection(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_no_cuda(); } -void cv::cuda::BFMatcher_CUDA::knnMatch2Download(const GpuMat&, const GpuMat&, const GpuMat&, std::vector< std::vector >&, bool) { throw_no_cuda(); } -void cv::cuda::BFMatcher_CUDA::knnMatch2Convert(const Mat&, const Mat&, const Mat&, std::vector< std::vector >&, bool) { throw_no_cuda(); } -void cv::cuda::BFMatcher_CUDA::knnMatch(const GpuMat&, std::vector< std::vector >&, int, const std::vector&, bool) { throw_no_cuda(); } -void cv::cuda::BFMatcher_CUDA::radiusMatchSingle(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, float, const GpuMat&, Stream&) { throw_no_cuda(); } -void cv::cuda::BFMatcher_CUDA::radiusMatchDownload(const 
GpuMat&, const GpuMat&, const GpuMat&, std::vector< std::vector >&, bool) { throw_no_cuda(); } -void cv::cuda::BFMatcher_CUDA::radiusMatchConvert(const Mat&, const Mat&, const Mat&, std::vector< std::vector >&, bool) { throw_no_cuda(); } -void cv::cuda::BFMatcher_CUDA::radiusMatch(const GpuMat&, const GpuMat&, std::vector< std::vector >&, float, const GpuMat&, bool) { throw_no_cuda(); } -void cv::cuda::BFMatcher_CUDA::radiusMatchCollection(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, float, const std::vector&, Stream&) { throw_no_cuda(); } -void cv::cuda::BFMatcher_CUDA::radiusMatchDownload(const GpuMat&, const GpuMat&, const GpuMat&, const GpuMat&, std::vector< std::vector >&, bool) { throw_no_cuda(); } -void cv::cuda::BFMatcher_CUDA::radiusMatchConvert(const Mat&, const Mat&, const Mat&, const Mat&, std::vector< std::vector >&, bool) { throw_no_cuda(); } -void cv::cuda::BFMatcher_CUDA::radiusMatch(const GpuMat&, std::vector< std::vector >&, float, const std::vector&, bool) { throw_no_cuda(); } +Ptr cv::cuda::DescriptorMatcher::createBFMatcher(int) { throw_no_cuda(); return Ptr(); } #else /* !defined (HAVE_CUDA) */ @@ -155,857 +125,953 @@ namespace cv { namespace cuda { namespace device } }}} -//////////////////////////////////////////////////////////////////// -// Train collection - -cv::cuda::BFMatcher_CUDA::BFMatcher_CUDA(int norm_) : norm(norm_) -{ -} - -void cv::cuda::BFMatcher_CUDA::add(const std::vector& descCollection) -{ - trainDescCollection.insert(trainDescCollection.end(), descCollection.begin(), descCollection.end()); -} - -const std::vector& cv::cuda::BFMatcher_CUDA::getTrainDescriptors() const -{ - return trainDescCollection; -} - -void cv::cuda::BFMatcher_CUDA::clear() -{ - trainDescCollection.clear(); -} - -bool cv::cuda::BFMatcher_CUDA::empty() const -{ - return trainDescCollection.empty(); -} - -bool cv::cuda::BFMatcher_CUDA::isMaskSupported() const -{ - return true; -} - -//////////////////////////////////////////////////////////////////// -// Match - -void cv::cuda::BFMatcher_CUDA::matchSingle(const GpuMat& query, const GpuMat& train, - GpuMat& trainIdx, GpuMat& distance, - const GpuMat& mask, Stream& stream) -{ - if (query.empty() || train.empty()) - return; - - using namespace cv::cuda::device::bf_match; - - typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask, - const PtrStepSzi& trainIdx, const PtrStepSzf& distance, - cudaStream_t stream); - - static const caller_t callersL1[] = - { - matchL1_gpu, 0/*matchL1_gpu*/, - matchL1_gpu, matchL1_gpu, - matchL1_gpu, matchL1_gpu - }; - static const caller_t callersL2[] = - { - 0/*matchL2_gpu*/, 0/*matchL2_gpu*/, - 0/*matchL2_gpu*/, 0/*matchL2_gpu*/, - 0/*matchL2_gpu*/, matchL2_gpu - }; - - static const caller_t callersHamming[] = - { - matchHamming_gpu, 0/*matchHamming_gpu*/, - matchHamming_gpu, 0/*matchHamming_gpu*/, - matchHamming_gpu, 0/*matchHamming_gpu*/ - }; - - CV_Assert(query.channels() == 1 && query.depth() < CV_64F); - CV_Assert(train.cols == query.cols && train.type() == query.type()); - CV_Assert(norm == NORM_L1 || norm == NORM_L2 || norm == NORM_HAMMING); - - const caller_t* callers = norm == NORM_L1 ? callersL1 : norm == NORM_L2 ? 
callersL2 : callersHamming; - - const int nQuery = query.rows; - - ensureSizeIsEnough(1, nQuery, CV_32S, trainIdx); - ensureSizeIsEnough(1, nQuery, CV_32F, distance); - - caller_t func = callers[query.depth()]; - CV_Assert(func != 0); - - func(query, train, mask, trainIdx, distance, StreamAccessor::getStream(stream)); -} - -void cv::cuda::BFMatcher_CUDA::matchDownload(const GpuMat& trainIdx, const GpuMat& distance, std::vector& matches) -{ - if (trainIdx.empty() || distance.empty()) - return; - - Mat trainIdxCPU(trainIdx); - Mat distanceCPU(distance); - - matchConvert(trainIdxCPU, distanceCPU, matches); -} - -void cv::cuda::BFMatcher_CUDA::matchConvert(const Mat& trainIdx, const Mat& distance, std::vector& matches) -{ - if (trainIdx.empty() || distance.empty()) - return; - - CV_Assert(trainIdx.type() == CV_32SC1); - CV_Assert(distance.type() == CV_32FC1 && distance.cols == trainIdx.cols); - - const int nQuery = trainIdx.cols; - - matches.clear(); - matches.reserve(nQuery); - - const int* trainIdx_ptr = trainIdx.ptr(); - const float* distance_ptr = distance.ptr(); - for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx, ++trainIdx_ptr, ++distance_ptr) - { - int train_idx = *trainIdx_ptr; - - if (train_idx == -1) - continue; - - float distance_local = *distance_ptr; - - DMatch m(queryIdx, train_idx, 0, distance_local); - - matches.push_back(m); - } -} - -void cv::cuda::BFMatcher_CUDA::match(const GpuMat& query, const GpuMat& train, - std::vector& matches, const GpuMat& mask) -{ - GpuMat trainIdx, distance; - matchSingle(query, train, trainIdx, distance, mask); - matchDownload(trainIdx, distance, matches); -} - -void cv::cuda::BFMatcher_CUDA::makeGpuCollection(GpuMat& trainCollection, GpuMat& maskCollection, - const std::vector& masks) -{ - if (empty()) - return; - - if (masks.empty()) - { - Mat trainCollectionCPU(1, static_cast(trainDescCollection.size()), CV_8UC(sizeof(PtrStepSzb))); - - PtrStepSzb* trainCollectionCPU_ptr = trainCollectionCPU.ptr(); - - for (size_t i = 0, size = trainDescCollection.size(); i < size; ++i, ++trainCollectionCPU_ptr) - *trainCollectionCPU_ptr = trainDescCollection[i]; - - trainCollection.upload(trainCollectionCPU); - maskCollection.release(); - } - else - { - CV_Assert(masks.size() == trainDescCollection.size()); - - Mat trainCollectionCPU(1, static_cast(trainDescCollection.size()), CV_8UC(sizeof(PtrStepSzb))); - Mat maskCollectionCPU(1, static_cast(trainDescCollection.size()), CV_8UC(sizeof(PtrStepb))); - - PtrStepSzb* trainCollectionCPU_ptr = trainCollectionCPU.ptr(); - PtrStepb* maskCollectionCPU_ptr = maskCollectionCPU.ptr(); - - for (size_t i = 0, size = trainDescCollection.size(); i < size; ++i, ++trainCollectionCPU_ptr, ++maskCollectionCPU_ptr) - { - const GpuMat& train = trainDescCollection[i]; - const GpuMat& mask = masks[i]; - - CV_Assert(mask.empty() || (mask.type() == CV_8UC1 && mask.cols == train.rows)); - - *trainCollectionCPU_ptr = train; - *maskCollectionCPU_ptr = mask; - } - - trainCollection.upload(trainCollectionCPU); - maskCollection.upload(maskCollectionCPU); - } -} - -void cv::cuda::BFMatcher_CUDA::matchCollection(const GpuMat& query, const GpuMat& trainCollection, - GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, - const GpuMat& masks, Stream& stream) -{ - if (query.empty() || trainCollection.empty()) - return; - - using namespace cv::cuda::device::bf_match; - - typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, - const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& 
distance, - cudaStream_t stream); - - static const caller_t callersL1[] = - { - matchL1_gpu, 0/*matchL1_gpu*/, - matchL1_gpu, matchL1_gpu, - matchL1_gpu, matchL1_gpu - }; - static const caller_t callersL2[] = - { - 0/*matchL2_gpu*/, 0/*matchL2_gpu*/, - 0/*matchL2_gpu*/, 0/*matchL2_gpu*/, - 0/*matchL2_gpu*/, matchL2_gpu - }; - static const caller_t callersHamming[] = - { - matchHamming_gpu, 0/*matchHamming_gpu*/, - matchHamming_gpu, 0/*matchHamming_gpu*/, - matchHamming_gpu, 0/*matchHamming_gpu*/ - }; - - CV_Assert(query.channels() == 1 && query.depth() < CV_64F); - CV_Assert(norm == NORM_L1 || norm == NORM_L2 || norm == NORM_HAMMING); - - const caller_t* callers = norm == NORM_L1 ? callersL1 : norm == NORM_L2 ? callersL2 : callersHamming; - - const int nQuery = query.rows; - - ensureSizeIsEnough(1, nQuery, CV_32S, trainIdx); - ensureSizeIsEnough(1, nQuery, CV_32S, imgIdx); - ensureSizeIsEnough(1, nQuery, CV_32F, distance); - - caller_t func = callers[query.depth()]; - CV_Assert(func != 0); - - func(query, trainCollection, masks, trainIdx, imgIdx, distance, StreamAccessor::getStream(stream)); -} - -void cv::cuda::BFMatcher_CUDA::matchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, std::vector& matches) -{ - if (trainIdx.empty() || imgIdx.empty() || distance.empty()) - return; - - Mat trainIdxCPU(trainIdx); - Mat imgIdxCPU(imgIdx); - Mat distanceCPU(distance); - - matchConvert(trainIdxCPU, imgIdxCPU, distanceCPU, matches); -} - -void cv::cuda::BFMatcher_CUDA::matchConvert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance, std::vector& matches) -{ - if (trainIdx.empty() || imgIdx.empty() || distance.empty()) - return; - - CV_Assert(trainIdx.type() == CV_32SC1); - CV_Assert(imgIdx.type() == CV_32SC1 && imgIdx.cols == trainIdx.cols); - CV_Assert(distance.type() == CV_32FC1 && distance.cols == trainIdx.cols); - - const int nQuery = trainIdx.cols; - - matches.clear(); - matches.reserve(nQuery); - - const int* trainIdx_ptr = trainIdx.ptr(); - const int* imgIdx_ptr = imgIdx.ptr(); - const float* distance_ptr = distance.ptr(); - for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr) - { - int _trainIdx = *trainIdx_ptr; - - if (_trainIdx == -1) - continue; - - int _imgIdx = *imgIdx_ptr; - - float _distance = *distance_ptr; - - DMatch m(queryIdx, _trainIdx, _imgIdx, _distance); - - matches.push_back(m); - } -} - -void cv::cuda::BFMatcher_CUDA::match(const GpuMat& query, std::vector& matches, const std::vector& masks) -{ - GpuMat trainCollection; - GpuMat maskCollection; - - makeGpuCollection(trainCollection, maskCollection, masks); - - GpuMat trainIdx, imgIdx, distance; - - matchCollection(query, trainCollection, trainIdx, imgIdx, distance, maskCollection); - matchDownload(trainIdx, imgIdx, distance, matches); -} - -//////////////////////////////////////////////////////////////////// -// KnnMatch - -void cv::cuda::BFMatcher_CUDA::knnMatchSingle(const GpuMat& query, const GpuMat& train, - GpuMat& trainIdx, GpuMat& distance, GpuMat& allDist, int k, - const GpuMat& mask, Stream& stream) -{ - if (query.empty() || train.empty()) - return; - - using namespace cv::cuda::device::bf_knnmatch; - - typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask, - const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, - cudaStream_t stream); - - static const caller_t callersL1[] = - { - matchL1_gpu, 0/*matchL1_gpu*/, - matchL1_gpu, matchL1_gpu, - matchL1_gpu, 
matchL1_gpu - }; - static const caller_t callersL2[] = - { - 0/*matchL2_gpu*/, 0/*matchL2_gpu*/, - 0/*matchL2_gpu*/, 0/*matchL2_gpu*/, - 0/*matchL2_gpu*/, matchL2_gpu - }; - static const caller_t callersHamming[] = - { - matchHamming_gpu, 0/*matchHamming_gpu*/, - matchHamming_gpu, 0/*matchHamming_gpu*/, - matchHamming_gpu, 0/*matchHamming_gpu*/ - }; - - CV_Assert(query.channels() == 1 && query.depth() < CV_64F); - CV_Assert(train.type() == query.type() && train.cols == query.cols); - CV_Assert(norm == NORM_L1 || norm == NORM_L2 || norm == NORM_HAMMING); - - const caller_t* callers = norm == NORM_L1 ? callersL1 : norm == NORM_L2 ? callersL2 : callersHamming; - - const int nQuery = query.rows; - const int nTrain = train.rows; - - if (k == 2) - { - ensureSizeIsEnough(1, nQuery, CV_32SC2, trainIdx); - ensureSizeIsEnough(1, nQuery, CV_32FC2, distance); - } - else - { - ensureSizeIsEnough(nQuery, k, CV_32S, trainIdx); - ensureSizeIsEnough(nQuery, k, CV_32F, distance); - ensureSizeIsEnough(nQuery, nTrain, CV_32FC1, allDist); - } - - trainIdx.setTo(Scalar::all(-1), stream); - - caller_t func = callers[query.depth()]; - CV_Assert(func != 0); - - func(query, train, k, mask, trainIdx, distance, allDist, StreamAccessor::getStream(stream)); -} - -void cv::cuda::BFMatcher_CUDA::knnMatchDownload(const GpuMat& trainIdx, const GpuMat& distance, - std::vector< std::vector >& matches, bool compactResult) -{ - if (trainIdx.empty() || distance.empty()) - return; - - Mat trainIdxCPU(trainIdx); - Mat distanceCPU(distance); - - knnMatchConvert(trainIdxCPU, distanceCPU, matches, compactResult); -} - -void cv::cuda::BFMatcher_CUDA::knnMatchConvert(const Mat& trainIdx, const Mat& distance, - std::vector< std::vector >& matches, bool compactResult) -{ - if (trainIdx.empty() || distance.empty()) - return; - - CV_Assert(trainIdx.type() == CV_32SC2 || trainIdx.type() == CV_32SC1); - CV_Assert(distance.type() == CV_32FC2 || distance.type() == CV_32FC1); - CV_Assert(distance.size() == trainIdx.size()); - CV_Assert(trainIdx.isContinuous() && distance.isContinuous()); - - const int nQuery = trainIdx.type() == CV_32SC2 ? trainIdx.cols : trainIdx.rows; - const int k = trainIdx.type() == CV_32SC2 ? 
2 :trainIdx.cols; - - matches.clear(); - matches.reserve(nQuery); - - const int* trainIdx_ptr = trainIdx.ptr(); - const float* distance_ptr = distance.ptr(); - - for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx) - { - matches.push_back(std::vector()); - std::vector& curMatches = matches.back(); - curMatches.reserve(k); - - for (int i = 0; i < k; ++i, ++trainIdx_ptr, ++distance_ptr) - { - int _trainIdx = *trainIdx_ptr; - - if (_trainIdx != -1) - { - float _distance = *distance_ptr; - - DMatch m(queryIdx, _trainIdx, 0, _distance); - - curMatches.push_back(m); - } - } - - if (compactResult && curMatches.empty()) - matches.pop_back(); - } -} - -void cv::cuda::BFMatcher_CUDA::knnMatch(const GpuMat& query, const GpuMat& train, - std::vector< std::vector >& matches, int k, const GpuMat& mask, bool compactResult) -{ - GpuMat trainIdx, distance, allDist; - knnMatchSingle(query, train, trainIdx, distance, allDist, k, mask); - knnMatchDownload(trainIdx, distance, matches, compactResult); -} - -void cv::cuda::BFMatcher_CUDA::knnMatch2Collection(const GpuMat& query, const GpuMat& trainCollection, - GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, - const GpuMat& maskCollection, Stream& stream) -{ - if (query.empty() || trainCollection.empty()) - return; - - using namespace cv::cuda::device::bf_knnmatch; - - typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, - const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, - cudaStream_t stream); - - static const caller_t callersL1[] = - { - match2L1_gpu, 0/*match2L1_gpu*/, - match2L1_gpu, match2L1_gpu, - match2L1_gpu, match2L1_gpu - }; - static const caller_t callersL2[] = - { - 0/*match2L2_gpu*/, 0/*match2L2_gpu*/, - 0/*match2L2_gpu*/, 0/*match2L2_gpu*/, - 0/*match2L2_gpu*/, match2L2_gpu - }; - static const caller_t callersHamming[] = - { - match2Hamming_gpu, 0/*match2Hamming_gpu*/, - match2Hamming_gpu, 0/*match2Hamming_gpu*/, - match2Hamming_gpu, 0/*match2Hamming_gpu*/ - }; - - CV_Assert(query.channels() == 1 && query.depth() < CV_64F); - CV_Assert(norm == NORM_L1 || norm == NORM_L2 || norm == NORM_HAMMING); - - const caller_t* callers = norm == NORM_L1 ? callersL1 : norm == NORM_L2 ? 
callersL2 : callersHamming; - - const int nQuery = query.rows; - - ensureSizeIsEnough(1, nQuery, CV_32SC2, trainIdx); - ensureSizeIsEnough(1, nQuery, CV_32SC2, imgIdx); - ensureSizeIsEnough(1, nQuery, CV_32FC2, distance); - - trainIdx.setTo(Scalar::all(-1), stream); - - caller_t func = callers[query.depth()]; - CV_Assert(func != 0); - - func(query, trainCollection, maskCollection, trainIdx, imgIdx, distance, StreamAccessor::getStream(stream)); -} - -void cv::cuda::BFMatcher_CUDA::knnMatch2Download(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, - std::vector< std::vector >& matches, bool compactResult) -{ - if (trainIdx.empty() || imgIdx.empty() || distance.empty()) - return; - - Mat trainIdxCPU(trainIdx); - Mat imgIdxCPU(imgIdx); - Mat distanceCPU(distance); - - knnMatch2Convert(trainIdxCPU, imgIdxCPU, distanceCPU, matches, compactResult); -} - -void cv::cuda::BFMatcher_CUDA::knnMatch2Convert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance, - std::vector< std::vector >& matches, bool compactResult) -{ - if (trainIdx.empty() || imgIdx.empty() || distance.empty()) - return; - - CV_Assert(trainIdx.type() == CV_32SC2); - CV_Assert(imgIdx.type() == CV_32SC2 && imgIdx.cols == trainIdx.cols); - CV_Assert(distance.type() == CV_32FC2 && distance.cols == trainIdx.cols); - - const int nQuery = trainIdx.cols; - - matches.clear(); - matches.reserve(nQuery); - - const int* trainIdx_ptr = trainIdx.ptr(); - const int* imgIdx_ptr = imgIdx.ptr(); - const float* distance_ptr = distance.ptr(); - - for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx) - { - matches.push_back(std::vector()); - std::vector& curMatches = matches.back(); - curMatches.reserve(2); - - for (int i = 0; i < 2; ++i, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr) - { - int _trainIdx = *trainIdx_ptr; - - if (_trainIdx != -1) - { - int _imgIdx = *imgIdx_ptr; - - float _distance = *distance_ptr; - - DMatch m(queryIdx, _trainIdx, _imgIdx, _distance); - - curMatches.push_back(m); - } - } - - if (compactResult && curMatches.empty()) - matches.pop_back(); - } -} - namespace { - struct ImgIdxSetter + static void makeGpuCollection(const std::vector& trainDescCollection, + const std::vector& masks, + GpuMat& trainCollection, + GpuMat& maskCollection) { - explicit inline ImgIdxSetter(int imgIdx_) : imgIdx(imgIdx_) {} - inline void operator()(DMatch& m) const {m.imgIdx = imgIdx;} - int imgIdx; - }; -} - -void cv::cuda::BFMatcher_CUDA::knnMatch(const GpuMat& query, std::vector< std::vector >& matches, int k, - const std::vector& masks, bool compactResult) -{ - if (k == 2) - { - GpuMat trainCollection; - GpuMat maskCollection; - - makeGpuCollection(trainCollection, maskCollection, masks); - - GpuMat trainIdx, imgIdx, distance; - - knnMatch2Collection(query, trainCollection, trainIdx, imgIdx, distance, maskCollection); - knnMatch2Download(trainIdx, imgIdx, distance, matches); - } - else - { - if (query.empty() || empty()) + if (trainDescCollection.empty()) return; - std::vector< std::vector > curMatches; - std::vector temp; - temp.reserve(2 * k); - - matches.resize(query.rows); - for_each(matches.begin(), matches.end(), bind2nd(mem_fun_ref(&std::vector::reserve), k)); - - for (size_t imgIdx = 0, size = trainDescCollection.size(); imgIdx < size; ++imgIdx) + if (masks.empty()) { - knnMatch(query, trainDescCollection[imgIdx], curMatches, k, masks.empty() ? 
GpuMat() : masks[imgIdx]); + Mat trainCollectionCPU(1, static_cast(trainDescCollection.size()), CV_8UC(sizeof(PtrStepSzb))); - for (int queryIdx = 0; queryIdx < query.rows; ++queryIdx) + PtrStepSzb* trainCollectionCPU_ptr = trainCollectionCPU.ptr(); + + for (size_t i = 0, size = trainDescCollection.size(); i < size; ++i, ++trainCollectionCPU_ptr) + *trainCollectionCPU_ptr = trainDescCollection[i]; + + trainCollection.upload(trainCollectionCPU); + maskCollection.release(); + } + else + { + CV_Assert( masks.size() == trainDescCollection.size() ); + + Mat trainCollectionCPU(1, static_cast(trainDescCollection.size()), CV_8UC(sizeof(PtrStepSzb))); + Mat maskCollectionCPU(1, static_cast(trainDescCollection.size()), CV_8UC(sizeof(PtrStepb))); + + PtrStepSzb* trainCollectionCPU_ptr = trainCollectionCPU.ptr(); + PtrStepb* maskCollectionCPU_ptr = maskCollectionCPU.ptr(); + + for (size_t i = 0, size = trainDescCollection.size(); i < size; ++i, ++trainCollectionCPU_ptr, ++maskCollectionCPU_ptr) { - std::vector& localMatch = curMatches[queryIdx]; - std::vector& globalMatch = matches[queryIdx]; + const GpuMat& train = trainDescCollection[i]; + const GpuMat& mask = masks[i]; - for_each(localMatch.begin(), localMatch.end(), ImgIdxSetter(static_cast(imgIdx))); + CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.cols == train.rows) ); - temp.clear(); - merge(globalMatch.begin(), globalMatch.end(), localMatch.begin(), localMatch.end(), back_inserter(temp)); + *trainCollectionCPU_ptr = train; + *maskCollectionCPU_ptr = mask; + } - globalMatch.clear(); - const size_t count = std::min((size_t)k, temp.size()); - copy(temp.begin(), temp.begin() + count, back_inserter(globalMatch)); + trainCollection.upload(trainCollectionCPU); + maskCollection.upload(maskCollectionCPU); + } + } + + class BFMatcher_Impl : public cv::cuda::DescriptorMatcher + { + public: + explicit BFMatcher_Impl(int norm) : norm_(norm) + { + CV_Assert( norm == NORM_L1 || norm == NORM_L2 || norm == NORM_HAMMING ); + } + + virtual bool isMaskSupported() const { return true; } + + virtual void add(const std::vector& descriptors) + { + trainDescCollection_.insert(trainDescCollection_.end(), descriptors.begin(), descriptors.end()); + } + + virtual const std::vector& getTrainDescriptors() const + { + return trainDescCollection_; + } + + virtual void clear() + { + trainDescCollection_.clear(); + } + + virtual bool empty() const + { + return trainDescCollection_.empty(); + } + + virtual void train() + { + } + + virtual void match(InputArray queryDescriptors, InputArray trainDescriptors, + std::vector& matches, + InputArray mask = noArray()); + + virtual void match(InputArray queryDescriptors, + std::vector& matches, + const std::vector& masks = std::vector()); + + virtual void matchAsync(InputArray queryDescriptors, InputArray trainDescriptors, + OutputArray matches, + InputArray mask = noArray(), + Stream& stream = Stream::Null()); + + virtual void matchAsync(InputArray queryDescriptors, + OutputArray matches, + const std::vector& masks = std::vector(), + Stream& stream = Stream::Null()); + + virtual void matchConvert(InputArray gpu_matches, + std::vector& matches); + + virtual void knnMatch(InputArray queryDescriptors, InputArray trainDescriptors, + std::vector >& matches, + int k, + InputArray mask = noArray(), + bool compactResult = false); + + virtual void knnMatch(InputArray queryDescriptors, + std::vector >& matches, + int k, + const std::vector& masks = std::vector(), + bool compactResult = false); + + virtual void 
knnMatchAsync(InputArray queryDescriptors, InputArray trainDescriptors, + OutputArray matches, + int k, + InputArray mask = noArray(), + Stream& stream = Stream::Null()); + + virtual void knnMatchAsync(InputArray queryDescriptors, + OutputArray matches, + int k, + const std::vector& masks = std::vector(), + Stream& stream = Stream::Null()); + + virtual void knnMatchConvert(InputArray gpu_matches, + std::vector< std::vector >& matches, + bool compactResult = false); + + virtual void radiusMatch(InputArray queryDescriptors, InputArray trainDescriptors, + std::vector >& matches, + float maxDistance, + InputArray mask = noArray(), + bool compactResult = false); + + virtual void radiusMatch(InputArray queryDescriptors, + std::vector >& matches, + float maxDistance, + const std::vector& masks = std::vector(), + bool compactResult = false); + + virtual void radiusMatchAsync(InputArray queryDescriptors, InputArray trainDescriptors, + OutputArray matches, + float maxDistance, + InputArray mask = noArray(), + Stream& stream = Stream::Null()); + + virtual void radiusMatchAsync(InputArray queryDescriptors, + OutputArray matches, + float maxDistance, + const std::vector& masks = std::vector(), + Stream& stream = Stream::Null()); + + virtual void radiusMatchConvert(InputArray gpu_matches, + std::vector< std::vector >& matches, + bool compactResult = false); + + private: + int norm_; + std::vector trainDescCollection_; + }; + + // + // 1 to 1 match + // + + void BFMatcher_Impl::match(InputArray _queryDescriptors, InputArray _trainDescriptors, + std::vector& matches, + InputArray _mask) + { + GpuMat d_matches; + matchAsync(_queryDescriptors, _trainDescriptors, d_matches, _mask); + matchConvert(d_matches, matches); + } + + void BFMatcher_Impl::match(InputArray _queryDescriptors, + std::vector& matches, + const std::vector& masks) + { + GpuMat d_matches; + matchAsync(_queryDescriptors, d_matches, masks); + matchConvert(d_matches, matches); + } + + void BFMatcher_Impl::matchAsync(InputArray _queryDescriptors, InputArray _trainDescriptors, + OutputArray _matches, + InputArray _mask, + Stream& stream) + { + using namespace cv::cuda::device::bf_match; + + const GpuMat query = _queryDescriptors.getGpuMat(); + const GpuMat train = _trainDescriptors.getGpuMat(); + const GpuMat mask = _mask.getGpuMat(); + + if (query.empty() || train.empty()) + { + _matches.release(); + return; + } + + CV_Assert( query.channels() == 1 && query.depth() < CV_64F ); + CV_Assert( train.cols == query.cols && train.type() == query.type() ); + CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.rows == query.rows && mask.cols == train.rows) ); + + typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask, + const PtrStepSzi& trainIdx, const PtrStepSzf& distance, + cudaStream_t stream); + + static const caller_t callersL1[] = + { + matchL1_gpu, 0/*matchL1_gpu*/, + matchL1_gpu, matchL1_gpu, + matchL1_gpu, matchL1_gpu + }; + static const caller_t callersL2[] = + { + 0/*matchL2_gpu*/, 0/*matchL2_gpu*/, + 0/*matchL2_gpu*/, 0/*matchL2_gpu*/, + 0/*matchL2_gpu*/, matchL2_gpu + }; + static const caller_t callersHamming[] = + { + matchHamming_gpu, 0/*matchHamming_gpu*/, + matchHamming_gpu, 0/*matchHamming_gpu*/, + matchHamming_gpu, 0/*matchHamming_gpu*/ + }; + + const caller_t* callers = norm_ == NORM_L1 ? callersL1 : norm_ == NORM_L2 ? 
callersL2 : callersHamming; + + const caller_t func = callers[query.depth()]; + if (func == 0) + { + CV_Error(Error::StsUnsupportedFormat, "unsupported combination of query.depth() and norm"); + } + + const int nQuery = query.rows; + + _matches.create(2, nQuery, CV_32SC1); + GpuMat matches = _matches.getGpuMat(); + + GpuMat trainIdx(1, nQuery, CV_32SC1, matches.ptr(0)); + GpuMat distance(1, nQuery, CV_32FC1, matches.ptr(1)); + + func(query, train, mask, trainIdx, distance, StreamAccessor::getStream(stream)); + } + + void BFMatcher_Impl::matchAsync(InputArray _queryDescriptors, + OutputArray _matches, + const std::vector& masks, + Stream& stream) + { + using namespace cv::cuda::device::bf_match; + + const GpuMat query = _queryDescriptors.getGpuMat(); + + if (query.empty() || trainDescCollection_.empty()) + { + _matches.release(); + return; + } + + CV_Assert( query.channels() == 1 && query.depth() < CV_64F ); + + GpuMat trainCollection, maskCollection; + makeGpuCollection(trainDescCollection_, masks, trainCollection, maskCollection); + + typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, + const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, + cudaStream_t stream); + + static const caller_t callersL1[] = + { + matchL1_gpu, 0/*matchL1_gpu*/, + matchL1_gpu, matchL1_gpu, + matchL1_gpu, matchL1_gpu + }; + static const caller_t callersL2[] = + { + 0/*matchL2_gpu*/, 0/*matchL2_gpu*/, + 0/*matchL2_gpu*/, 0/*matchL2_gpu*/, + 0/*matchL2_gpu*/, matchL2_gpu + }; + static const caller_t callersHamming[] = + { + matchHamming_gpu, 0/*matchHamming_gpu*/, + matchHamming_gpu, 0/*matchHamming_gpu*/, + matchHamming_gpu, 0/*matchHamming_gpu*/ + }; + + const caller_t* callers = norm_ == NORM_L1 ? callersL1 : norm_ == NORM_L2 ? callersL2 : callersHamming; + + const caller_t func = callers[query.depth()]; + if (func == 0) + { + CV_Error(Error::StsUnsupportedFormat, "unsupported combination of query.depth() and norm"); + } + + const int nQuery = query.rows; + + _matches.create(3, nQuery, CV_32SC1); + GpuMat matches = _matches.getGpuMat(); + + GpuMat trainIdx(1, nQuery, CV_32SC1, matches.ptr(0)); + GpuMat imgIdx(1, nQuery, CV_32SC1, matches.ptr(1)); + GpuMat distance(1, nQuery, CV_32FC1, matches.ptr(2)); + + func(query, trainCollection, maskCollection, trainIdx, imgIdx, distance, StreamAccessor::getStream(stream)); + } + + void BFMatcher_Impl::matchConvert(InputArray _gpu_matches, + std::vector& matches) + { + Mat gpu_matches; + if (_gpu_matches.kind() == _InputArray::CUDA_GPU_MAT) + { + _gpu_matches.getGpuMat().download(gpu_matches); + } + else + { + gpu_matches = _gpu_matches.getMat(); + } + + if (gpu_matches.empty()) + { + matches.clear(); + return; + } + + CV_Assert( (gpu_matches.type() == CV_32SC1) && (gpu_matches.rows == 2 || gpu_matches.rows == 3) ); + + const int nQuery = gpu_matches.cols; + + matches.clear(); + matches.reserve(nQuery); + + const int* trainIdxPtr = NULL; + const int* imgIdxPtr = NULL; + const float* distancePtr = NULL; + + if (gpu_matches.rows == 2) + { + trainIdxPtr = gpu_matches.ptr(0); + distancePtr = gpu_matches.ptr(1); + } + else + { + trainIdxPtr = gpu_matches.ptr(0); + imgIdxPtr = gpu_matches.ptr(1); + distancePtr = gpu_matches.ptr(2); + } + + for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx) + { + const int trainIdx = trainIdxPtr[queryIdx]; + if (trainIdx == -1) + continue; + + const int imgIdx = imgIdxPtr ? 
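    // -----------------------------------------------------------------------
    // Illustrative sketch (not part of this patch): matching one query
    // descriptor set against a collection of train sets through the new
    // interface above. The descriptor matrices d_query, d_train1 and
    // d_train2 are assumed to already reside on the GPU.
    //
    //     cv::Ptr<cv::cuda::DescriptorMatcher> matcher =
    //         cv::cuda::DescriptorMatcher::createBFMatcher(cv::NORM_HAMMING);
    //
    //     matcher->add(std::vector<cv::cuda::GpuMat>(1, d_train1));
    //     matcher->add(std::vector<cv::cuda::GpuMat>(1, d_train2));
    //
    //     std::vector<cv::DMatch> matches;
    //     matcher->match(d_query, matches);   // collection overload
    //     // matches[i].imgIdx tells which of the added train sets was hit.
    // -----------------------------------------------------------------------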
imgIdxPtr[queryIdx] : 0; + const float distance = distancePtr[queryIdx]; + + DMatch m(queryIdx, trainIdx, imgIdx, distance); + + matches.push_back(m); + } + } + + // + // knn match + // + + void BFMatcher_Impl::knnMatch(InputArray _queryDescriptors, InputArray _trainDescriptors, + std::vector >& matches, + int k, + InputArray _mask, + bool compactResult) + { + GpuMat d_matches; + knnMatchAsync(_queryDescriptors, _trainDescriptors, d_matches, k, _mask); + knnMatchConvert(d_matches, matches, compactResult); + } + + void BFMatcher_Impl::knnMatch(InputArray _queryDescriptors, + std::vector >& matches, + int k, + const std::vector& masks, + bool compactResult) + { + if (k == 2) + { + GpuMat d_matches; + knnMatchAsync(_queryDescriptors, d_matches, k, masks); + knnMatchConvert(d_matches, matches, compactResult); + } + else + { + const GpuMat query = _queryDescriptors.getGpuMat(); + + if (query.empty() || trainDescCollection_.empty()) + { + matches.clear(); + return; + } + + CV_Assert( query.channels() == 1 && query.depth() < CV_64F ); + + std::vector< std::vector > curMatches; + std::vector temp; + temp.reserve(2 * k); + + matches.resize(query.rows); + for (size_t i = 0; i < matches.size(); ++i) + matches[i].reserve(k); + + for (size_t imgIdx = 0; imgIdx < trainDescCollection_.size(); ++imgIdx) + { + knnMatch(query, trainDescCollection_[imgIdx], curMatches, k, masks.empty() ? GpuMat() : masks[imgIdx]); + + for (int queryIdx = 0; queryIdx < query.rows; ++queryIdx) + { + std::vector& localMatch = curMatches[queryIdx]; + std::vector& globalMatch = matches[queryIdx]; + + for (size_t i = 0; i < localMatch.size(); ++i) + localMatch[i].imgIdx = imgIdx; + + temp.clear(); + std::merge(globalMatch.begin(), globalMatch.end(), localMatch.begin(), localMatch.end(), std::back_inserter(temp)); + + globalMatch.clear(); + const size_t count = std::min(static_cast(k), temp.size()); + std::copy(temp.begin(), temp.begin() + count, std::back_inserter(globalMatch)); + } + } + + if (compactResult) + { + std::vector< std::vector >::iterator new_end = std::remove_if(matches.begin(), matches.end(), std::mem_fun_ref(&std::vector::empty)); + matches.erase(new_end, matches.end()); } } + } - if (compactResult) + void BFMatcher_Impl::knnMatchAsync(InputArray _queryDescriptors, InputArray _trainDescriptors, + OutputArray _matches, + int k, + InputArray _mask, + Stream& stream) + { + using namespace cv::cuda::device::bf_knnmatch; + + const GpuMat query = _queryDescriptors.getGpuMat(); + const GpuMat train = _trainDescriptors.getGpuMat(); + const GpuMat mask = _mask.getGpuMat(); + + if (query.empty() || train.empty()) { - std::vector< std::vector >::iterator new_end = remove_if(matches.begin(), matches.end(), mem_fun_ref(&std::vector::empty)); - matches.erase(new_end, matches.end()); + _matches.release(); + return; + } + + CV_Assert( query.channels() == 1 && query.depth() < CV_64F ); + CV_Assert( train.cols == query.cols && train.type() == query.type() ); + CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.rows == query.rows && mask.cols == train.rows) ); + + typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask, + const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, + cudaStream_t stream); + + static const caller_t callersL1[] = + { + matchL1_gpu, 0/*matchL1_gpu*/, + matchL1_gpu, matchL1_gpu, + matchL1_gpu, matchL1_gpu + }; + static const caller_t callersL2[] = + { + 0/*matchL2_gpu*/, 0/*matchL2_gpu*/, + 0/*matchL2_gpu*/, 0/*matchL2_gpu*/, 
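    // -----------------------------------------------------------------------
    // Illustrative sketch (not part of this patch): the asynchronous 1-to-1
    // path. matchAsync writes a packed 2 x nQuery (3 x nQuery for the
    // collection overload) CV_32SC1 GpuMat and matchConvert downloads and
    // unpacks it; the stream and variable names below are assumptions.
    //
    //     cv::cuda::GpuMat d_matches;
    //     cv::cuda::Stream stream;
    //
    //     matcher->matchAsync(d_query, d_train, d_matches, cv::noArray(), stream);
    //     // ... enqueue more work on the same stream ...
    //     stream.waitForCompletion();
    //
    //     std::vector<cv::DMatch> matches;
    //     matcher->matchConvert(d_matches, matches);
    // -----------------------------------------------------------------------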
+ 0/*matchL2_gpu*/, matchL2_gpu + }; + static const caller_t callersHamming[] = + { + matchHamming_gpu, 0/*matchHamming_gpu*/, + matchHamming_gpu, 0/*matchHamming_gpu*/, + matchHamming_gpu, 0/*matchHamming_gpu*/ + }; + + const caller_t* callers = norm_ == NORM_L1 ? callersL1 : norm_ == NORM_L2 ? callersL2 : callersHamming; + + const caller_t func = callers[query.depth()]; + if (func == 0) + { + CV_Error(Error::StsUnsupportedFormat, "unsupported combination of query.depth() and norm"); + } + + const int nQuery = query.rows; + const int nTrain = train.rows; + + GpuMat trainIdx, distance, allDist; + if (k == 2) + { + _matches.create(2, nQuery, CV_32SC2); + GpuMat matches = _matches.getGpuMat(); + + trainIdx = GpuMat(1, nQuery, CV_32SC2, matches.ptr(0)); + distance = GpuMat(1, nQuery, CV_32FC2, matches.ptr(1)); + } + else + { + _matches.create(2 * nQuery, k, CV_32SC1); + GpuMat matches = _matches.getGpuMat(); + + trainIdx = GpuMat(nQuery, k, CV_32SC1, matches.ptr(0), matches.step); + distance = GpuMat(nQuery, k, CV_32FC1, matches.ptr(nQuery), matches.step); + + BufferPool pool(stream); + allDist = pool.getBuffer(nQuery, nTrain, CV_32FC1); + } + + trainIdx.setTo(Scalar::all(-1), stream); + + func(query, train, k, mask, trainIdx, distance, allDist, StreamAccessor::getStream(stream)); + } + + void BFMatcher_Impl::knnMatchAsync(InputArray _queryDescriptors, + OutputArray _matches, + int k, + const std::vector& masks, + Stream& stream) + { + using namespace cv::cuda::device::bf_knnmatch; + + if (k != 2) + { + CV_Error(Error::StsNotImplemented, "only k=2 mode is supported for now"); + } + + const GpuMat query = _queryDescriptors.getGpuMat(); + + if (query.empty() || trainDescCollection_.empty()) + { + _matches.release(); + return; + } + + CV_Assert( query.channels() == 1 && query.depth() < CV_64F ); + + GpuMat trainCollection, maskCollection; + makeGpuCollection(trainDescCollection_, masks, trainCollection, maskCollection); + + typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, + const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, + cudaStream_t stream); + + static const caller_t callersL1[] = + { + match2L1_gpu, 0/*match2L1_gpu*/, + match2L1_gpu, match2L1_gpu, + match2L1_gpu, match2L1_gpu + }; + static const caller_t callersL2[] = + { + 0/*match2L2_gpu*/, 0/*match2L2_gpu*/, + 0/*match2L2_gpu*/, 0/*match2L2_gpu*/, + 0/*match2L2_gpu*/, match2L2_gpu + }; + static const caller_t callersHamming[] = + { + match2Hamming_gpu, 0/*match2Hamming_gpu*/, + match2Hamming_gpu, 0/*match2Hamming_gpu*/, + match2Hamming_gpu, 0/*match2Hamming_gpu*/ + }; + + const caller_t* callers = norm_ == NORM_L1 ? callersL1 : norm_ == NORM_L2 ? 
callersL2 : callersHamming; + + const caller_t func = callers[query.depth()]; + if (func == 0) + { + CV_Error(Error::StsUnsupportedFormat, "unsupported combination of query.depth() and norm"); + } + + const int nQuery = query.rows; + + _matches.create(3, nQuery, CV_32SC2); + GpuMat matches = _matches.getGpuMat(); + + GpuMat trainIdx(1, nQuery, CV_32SC2, matches.ptr(0)); + GpuMat imgIdx(1, nQuery, CV_32SC2, matches.ptr(1)); + GpuMat distance(1, nQuery, CV_32FC2, matches.ptr(2)); + + trainIdx.setTo(Scalar::all(-1), stream); + + func(query, trainCollection, maskCollection, trainIdx, imgIdx, distance, StreamAccessor::getStream(stream)); + } + + void BFMatcher_Impl::knnMatchConvert(InputArray _gpu_matches, + std::vector< std::vector >& matches, + bool compactResult) + { + Mat gpu_matches; + if (_gpu_matches.kind() == _InputArray::CUDA_GPU_MAT) + { + _gpu_matches.getGpuMat().download(gpu_matches); + } + else + { + gpu_matches = _gpu_matches.getMat(); + } + + if (gpu_matches.empty()) + { + matches.clear(); + return; + } + + CV_Assert( ((gpu_matches.type() == CV_32SC2) && (gpu_matches.rows == 2 || gpu_matches.rows == 3)) || + (gpu_matches.type() == CV_32SC1) ); + + int nQuery = -1, k = -1; + + const int* trainIdxPtr = NULL; + const int* imgIdxPtr = NULL; + const float* distancePtr = NULL; + + if (gpu_matches.type() == CV_32SC2) + { + nQuery = gpu_matches.cols; + k = 2; + + if (gpu_matches.rows == 2) + { + trainIdxPtr = gpu_matches.ptr(0); + distancePtr = gpu_matches.ptr(1); + } + else + { + trainIdxPtr = gpu_matches.ptr(0); + imgIdxPtr = gpu_matches.ptr(1); + distancePtr = gpu_matches.ptr(2); + } + } + else + { + nQuery = gpu_matches.rows / 2; + k = gpu_matches.cols; + + trainIdxPtr = gpu_matches.ptr(0); + distancePtr = gpu_matches.ptr(nQuery); + } + + matches.clear(); + matches.reserve(nQuery); + + for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx) + { + matches.push_back(std::vector()); + std::vector& curMatches = matches.back(); + curMatches.reserve(k); + + for (int i = 0; i < k; ++i) + { + const int trainIdx = *trainIdxPtr; + if (trainIdx == -1) + continue; + + const int imgIdx = imgIdxPtr ? 
*imgIdxPtr : 0; + const float distance = *distancePtr; + + DMatch m(queryIdx, trainIdx, imgIdx, distance); + + curMatches.push_back(m); + + ++trainIdxPtr; + ++distancePtr; + if (imgIdxPtr) + ++imgIdxPtr; + } + + if (compactResult && curMatches.empty()) + { + matches.pop_back(); + } + } + } + + // + // radius match + // + + void BFMatcher_Impl::radiusMatch(InputArray _queryDescriptors, InputArray _trainDescriptors, + std::vector >& matches, + float maxDistance, + InputArray _mask, + bool compactResult) + { + GpuMat d_matches; + radiusMatchAsync(_queryDescriptors, _trainDescriptors, d_matches, maxDistance, _mask); + radiusMatchConvert(d_matches, matches, compactResult); + } + + void BFMatcher_Impl::radiusMatch(InputArray _queryDescriptors, + std::vector >& matches, + float maxDistance, + const std::vector& masks, + bool compactResult) + { + GpuMat d_matches; + radiusMatchAsync(_queryDescriptors, d_matches, maxDistance, masks); + radiusMatchConvert(d_matches, matches, compactResult); + } + + void BFMatcher_Impl::radiusMatchAsync(InputArray _queryDescriptors, InputArray _trainDescriptors, + OutputArray _matches, + float maxDistance, + InputArray _mask, + Stream& stream) + { + using namespace cv::cuda::device::bf_radius_match; + + const GpuMat query = _queryDescriptors.getGpuMat(); + const GpuMat train = _trainDescriptors.getGpuMat(); + const GpuMat mask = _mask.getGpuMat(); + + if (query.empty() || train.empty()) + { + _matches.release(); + return; + } + + CV_Assert( query.channels() == 1 && query.depth() < CV_64F ); + CV_Assert( train.cols == query.cols && train.type() == query.type() ); + CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.rows == query.rows && mask.cols == train.rows) ); + + typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask, + const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, + cudaStream_t stream); + + static const caller_t callersL1[] = + { + matchL1_gpu, 0/*matchL1_gpu*/, + matchL1_gpu, matchL1_gpu, + matchL1_gpu, matchL1_gpu + }; + static const caller_t callersL2[] = + { + 0/*matchL2_gpu*/, 0/*matchL2_gpu*/, + 0/*matchL2_gpu*/, 0/*matchL2_gpu*/, + 0/*matchL2_gpu*/, matchL2_gpu + }; + static const caller_t callersHamming[] = + { + matchHamming_gpu, 0/*matchHamming_gpu*/, + matchHamming_gpu, 0/*matchHamming_gpu*/, + matchHamming_gpu, 0/*matchHamming_gpu*/ + }; + + const caller_t* callers = norm_ == NORM_L1 ? callersL1 : norm_ == NORM_L2 ? 
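    // -----------------------------------------------------------------------
    // Illustrative sketch (not part of this patch): the k = 2 k-NN path that
    // the stitching matcher further down in this series combines with a
    // ratio test. The 0.75f threshold is an assumption, not something the
    // patch prescribes.
    //
    //     std::vector< std::vector<cv::DMatch> > knn;
    //     matcher->knnMatch(d_query, d_train, knn, 2);
    //
    //     std::vector<cv::DMatch> good;
    //     for (size_t i = 0; i < knn.size(); ++i)
    //         if (knn[i].size() == 2 &&
    //             knn[i][0].distance < 0.75f * knn[i][1].distance)
    //             good.push_back(knn[i][0]);
    // -----------------------------------------------------------------------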
callersL2 : callersHamming; + + const caller_t func = callers[query.depth()]; + if (func == 0) + { + CV_Error(Error::StsUnsupportedFormat, "unsupported combination of query.depth() and norm"); + } + + const int nQuery = query.rows; + const int nTrain = train.rows; + + const int cols = std::max((nTrain / 100), nQuery); + + _matches.create(2 * nQuery + 1, cols, CV_32SC1); + GpuMat matches = _matches.getGpuMat(); + + GpuMat trainIdx(nQuery, cols, CV_32SC1, matches.ptr(0), matches.step); + GpuMat distance(nQuery, cols, CV_32FC1, matches.ptr(nQuery), matches.step); + GpuMat nMatches(1, nQuery, CV_32SC1, matches.ptr(2 * nQuery)); + + nMatches.setTo(Scalar::all(0), stream); + + func(query, train, maxDistance, mask, trainIdx, distance, nMatches, StreamAccessor::getStream(stream)); + } + + void BFMatcher_Impl::radiusMatchAsync(InputArray _queryDescriptors, + OutputArray _matches, + float maxDistance, + const std::vector& masks, + Stream& stream) + { + using namespace cv::cuda::device::bf_radius_match; + + const GpuMat query = _queryDescriptors.getGpuMat(); + + if (query.empty() || trainDescCollection_.empty()) + { + _matches.release(); + return; + } + + CV_Assert( query.channels() == 1 && query.depth() < CV_64F ); + + GpuMat trainCollection, maskCollection; + makeGpuCollection(trainDescCollection_, masks, trainCollection, maskCollection); + + typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, + const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, + cudaStream_t stream); + + static const caller_t callersL1[] = + { + matchL1_gpu, 0/*matchL1_gpu*/, + matchL1_gpu, matchL1_gpu, + matchL1_gpu, matchL1_gpu + }; + static const caller_t callersL2[] = + { + 0/*matchL2_gpu*/, 0/*matchL2_gpu*/, + 0/*matchL2_gpu*/, 0/*matchL2_gpu*/, + 0/*matchL2_gpu*/, matchL2_gpu + }; + static const caller_t callersHamming[] = + { + matchHamming_gpu, 0/*matchHamming_gpu*/, + matchHamming_gpu, 0/*matchHamming_gpu*/, + matchHamming_gpu, 0/*matchHamming_gpu*/ + }; + + const caller_t* callers = norm_ == NORM_L1 ? callersL1 : norm_ == NORM_L2 ? callersL2 : callersHamming; + + const caller_t func = callers[query.depth()]; + if (func == 0) + { + CV_Error(Error::StsUnsupportedFormat, "unsupported combination of query.depth() and norm"); + } + + const int nQuery = query.rows; + + _matches.create(3 * nQuery + 1, nQuery, CV_32FC1); + GpuMat matches = _matches.getGpuMat(); + + GpuMat trainIdx(nQuery, nQuery, CV_32SC1, matches.ptr(0), matches.step); + GpuMat imgIdx(nQuery, nQuery, CV_32SC1, matches.ptr(nQuery), matches.step); + GpuMat distance(nQuery, nQuery, CV_32FC1, matches.ptr(2 * nQuery), matches.step); + GpuMat nMatches(1, nQuery, CV_32SC1, matches.ptr(3 * nQuery)); + + nMatches.setTo(Scalar::all(0), stream); + + std::vector trains_(trainDescCollection_.begin(), trainDescCollection_.end()); + std::vector masks_(masks.begin(), masks.end()); + + func(query, &trains_[0], static_cast(trains_.size()), maxDistance, masks_.size() == 0 ? 
0 : &masks_[0], + trainIdx, imgIdx, distance, nMatches, StreamAccessor::getStream(stream)); + } + + void BFMatcher_Impl::radiusMatchConvert(InputArray _gpu_matches, + std::vector< std::vector >& matches, + bool compactResult) + { + Mat gpu_matches; + if (_gpu_matches.kind() == _InputArray::CUDA_GPU_MAT) + { + _gpu_matches.getGpuMat().download(gpu_matches); + } + else + { + gpu_matches = _gpu_matches.getMat(); + } + + if (gpu_matches.empty()) + { + matches.clear(); + return; + } + + CV_Assert( gpu_matches.type() == CV_32SC1 || gpu_matches.type() == CV_32FC1 ); + + int nQuery = -1; + + const int* trainIdxPtr = NULL; + const int* imgIdxPtr = NULL; + const float* distancePtr = NULL; + const int* nMatchesPtr = NULL; + + if (gpu_matches.type() == CV_32SC1) + { + nQuery = (gpu_matches.rows - 1) / 2; + + trainIdxPtr = gpu_matches.ptr(0); + distancePtr = gpu_matches.ptr(nQuery); + nMatchesPtr = gpu_matches.ptr(2 * nQuery); + } + else + { + nQuery = (gpu_matches.rows - 1) / 3; + + trainIdxPtr = gpu_matches.ptr(0); + imgIdxPtr = gpu_matches.ptr(nQuery); + distancePtr = gpu_matches.ptr(2 * nQuery); + nMatchesPtr = gpu_matches.ptr(3 * nQuery); + } + + matches.clear(); + matches.reserve(nQuery); + + for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx) + { + const int nMatched = std::min(nMatchesPtr[queryIdx], gpu_matches.cols); + + if (nMatched == 0) + { + if (!compactResult) + { + matches.push_back(std::vector()); + } + } + else + { + matches.push_back(std::vector(nMatched)); + std::vector& curMatches = matches.back(); + + for (int i = 0; i < nMatched; ++i) + { + const int trainIdx = trainIdxPtr[i]; + + const int imgIdx = imgIdxPtr ? imgIdxPtr[i] : 0; + const float distance = distancePtr[i]; + + DMatch m(queryIdx, trainIdx, imgIdx, distance); + + curMatches[i] = m; + } + + std::sort(curMatches.begin(), curMatches.end()); + } + + trainIdxPtr += gpu_matches.cols; + distancePtr += gpu_matches.cols; + if (imgIdxPtr) + imgIdxPtr += gpu_matches.cols; } } } -//////////////////////////////////////////////////////////////////// -// RadiusMatch - -void cv::cuda::BFMatcher_CUDA::radiusMatchSingle(const GpuMat& query, const GpuMat& train, - GpuMat& trainIdx, GpuMat& distance, GpuMat& nMatches, float maxDistance, - const GpuMat& mask, Stream& stream) +Ptr cv::cuda::DescriptorMatcher::createBFMatcher(int norm) { - if (query.empty() || train.empty()) - return; - - using namespace cv::cuda::device::bf_radius_match; - - typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask, - const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, - cudaStream_t stream); - - static const caller_t callersL1[] = - { - matchL1_gpu, 0/*matchL1_gpu*/, - matchL1_gpu, matchL1_gpu, - matchL1_gpu, matchL1_gpu - }; - static const caller_t callersL2[] = - { - 0/*matchL2_gpu*/, 0/*matchL2_gpu*/, - 0/*matchL2_gpu*/, 0/*matchL2_gpu*/, - 0/*matchL2_gpu*/, matchL2_gpu - }; - static const caller_t callersHamming[] = - { - matchHamming_gpu, 0/*matchHamming_gpu*/, - matchHamming_gpu, 0/*matchHamming_gpu*/, - matchHamming_gpu, 0/*matchHamming_gpu*/ - }; - - const int nQuery = query.rows; - const int nTrain = train.rows; - - CV_Assert(query.channels() == 1 && query.depth() < CV_64F); - CV_Assert(train.type() == query.type() && train.cols == query.cols); - CV_Assert(trainIdx.empty() || (trainIdx.rows == nQuery && trainIdx.size() == distance.size())); - CV_Assert(norm == NORM_L1 || norm == NORM_L2 || norm == NORM_HAMMING); - - const caller_t* callers = norm == NORM_L1 
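    // -----------------------------------------------------------------------
    // Illustrative sketch (not part of this patch): the radius-match path
    // implemented above. maxDistance is a descriptor-space threshold (e.g. a
    // Hamming distance), not a pixel distance; the value 0.4f is an
    // arbitrary assumption.
    //
    //     std::vector< std::vector<cv::DMatch> > matches;
    //     matcher->radiusMatch(d_query, d_train, matches, 0.4f);
    //     // matches[q] holds every train descriptor within maxDistance of
    //     // query row q, sorted by increasing distance
    //     // (see radiusMatchConvert above).
    // -----------------------------------------------------------------------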
? callersL1 : norm == NORM_L2 ? callersL2 : callersHamming; - - ensureSizeIsEnough(1, nQuery, CV_32SC1, nMatches); - if (trainIdx.empty()) - { - ensureSizeIsEnough(nQuery, std::max((nTrain / 100), 10), CV_32SC1, trainIdx); - ensureSizeIsEnough(nQuery, std::max((nTrain / 100), 10), CV_32FC1, distance); - } - - nMatches.setTo(Scalar::all(0), stream); - - caller_t func = callers[query.depth()]; - CV_Assert(func != 0); - - func(query, train, maxDistance, mask, trainIdx, distance, nMatches, StreamAccessor::getStream(stream)); -} - -void cv::cuda::BFMatcher_CUDA::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& distance, const GpuMat& nMatches, - std::vector< std::vector >& matches, bool compactResult) -{ - if (trainIdx.empty() || distance.empty() || nMatches.empty()) - return; - - Mat trainIdxCPU(trainIdx); - Mat distanceCPU(distance); - Mat nMatchesCPU(nMatches); - - radiusMatchConvert(trainIdxCPU, distanceCPU, nMatchesCPU, matches, compactResult); -} - -void cv::cuda::BFMatcher_CUDA::radiusMatchConvert(const Mat& trainIdx, const Mat& distance, const Mat& nMatches, - std::vector< std::vector >& matches, bool compactResult) -{ - if (trainIdx.empty() || distance.empty() || nMatches.empty()) - return; - - CV_Assert(trainIdx.type() == CV_32SC1); - CV_Assert(distance.type() == CV_32FC1 && distance.size() == trainIdx.size()); - CV_Assert(nMatches.type() == CV_32SC1 && nMatches.cols == trainIdx.rows); - - const int nQuery = trainIdx.rows; - - matches.clear(); - matches.reserve(nQuery); - - const int* nMatches_ptr = nMatches.ptr(); - - for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx) - { - const int* trainIdx_ptr = trainIdx.ptr(queryIdx); - const float* distance_ptr = distance.ptr(queryIdx); - - const int nMatched = std::min(nMatches_ptr[queryIdx], trainIdx.cols); - - if (nMatched == 0) - { - if (!compactResult) - matches.push_back(std::vector()); - continue; - } - - matches.push_back(std::vector(nMatched)); - std::vector& curMatches = matches.back(); - - for (int i = 0; i < nMatched; ++i, ++trainIdx_ptr, ++distance_ptr) - { - int _trainIdx = *trainIdx_ptr; - - float _distance = *distance_ptr; - - DMatch m(queryIdx, _trainIdx, 0, _distance); - - curMatches[i] = m; - } - - sort(curMatches.begin(), curMatches.end()); - } -} - -void cv::cuda::BFMatcher_CUDA::radiusMatch(const GpuMat& query, const GpuMat& train, - std::vector< std::vector >& matches, float maxDistance, const GpuMat& mask, bool compactResult) -{ - GpuMat trainIdx, distance, nMatches; - radiusMatchSingle(query, train, trainIdx, distance, nMatches, maxDistance, mask); - radiusMatchDownload(trainIdx, distance, nMatches, matches, compactResult); -} - -void cv::cuda::BFMatcher_CUDA::radiusMatchCollection(const GpuMat& query, GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, GpuMat& nMatches, - float maxDistance, const std::vector& masks, Stream& stream) -{ - if (query.empty() || empty()) - return; - - using namespace cv::cuda::device::bf_radius_match; - - typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, - const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, - cudaStream_t stream); - - static const caller_t callersL1[] = - { - matchL1_gpu, 0/*matchL1_gpu*/, - matchL1_gpu, matchL1_gpu, - matchL1_gpu, matchL1_gpu - }; - static const caller_t callersL2[] = - { - 0/*matchL2_gpu*/, 0/*matchL2_gpu*/, - 0/*matchL2_gpu*/, 0/*matchL2_gpu*/, - 0/*matchL2_gpu*/, matchL2_gpu - }; - static const caller_t 
callersHamming[] = - { - matchHamming_gpu, 0/*matchHamming_gpu*/, - matchHamming_gpu, 0/*matchHamming_gpu*/, - matchHamming_gpu, 0/*matchHamming_gpu*/ - }; - - const int nQuery = query.rows; - - CV_Assert(query.channels() == 1 && query.depth() < CV_64F); - CV_Assert(trainIdx.empty() || (trainIdx.rows == nQuery && trainIdx.size() == distance.size() && trainIdx.size() == imgIdx.size())); - CV_Assert(norm == NORM_L1 || norm == NORM_L2 || norm == NORM_HAMMING); - - const caller_t* callers = norm == NORM_L1 ? callersL1 : norm == NORM_L2 ? callersL2 : callersHamming; - - ensureSizeIsEnough(1, nQuery, CV_32SC1, nMatches); - if (trainIdx.empty()) - { - ensureSizeIsEnough(nQuery, std::max((nQuery / 100), 10), CV_32SC1, trainIdx); - ensureSizeIsEnough(nQuery, std::max((nQuery / 100), 10), CV_32SC1, imgIdx); - ensureSizeIsEnough(nQuery, std::max((nQuery / 100), 10), CV_32FC1, distance); - } - - nMatches.setTo(Scalar::all(0), stream); - - caller_t func = callers[query.depth()]; - CV_Assert(func != 0); - - std::vector trains_(trainDescCollection.begin(), trainDescCollection.end()); - std::vector masks_(masks.begin(), masks.end()); - - func(query, &trains_[0], static_cast(trains_.size()), maxDistance, masks_.size() == 0 ? 0 : &masks_[0], - trainIdx, imgIdx, distance, nMatches, StreamAccessor::getStream(stream)); -} - -void cv::cuda::BFMatcher_CUDA::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, const GpuMat& nMatches, - std::vector< std::vector >& matches, bool compactResult) -{ - if (trainIdx.empty() || imgIdx.empty() || distance.empty() || nMatches.empty()) - return; - - Mat trainIdxCPU(trainIdx); - Mat imgIdxCPU(imgIdx); - Mat distanceCPU(distance); - Mat nMatchesCPU(nMatches); - - radiusMatchConvert(trainIdxCPU, imgIdxCPU, distanceCPU, nMatchesCPU, matches, compactResult); -} - -void cv::cuda::BFMatcher_CUDA::radiusMatchConvert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance, const Mat& nMatches, - std::vector< std::vector >& matches, bool compactResult) -{ - if (trainIdx.empty() || imgIdx.empty() || distance.empty() || nMatches.empty()) - return; - - CV_Assert(trainIdx.type() == CV_32SC1); - CV_Assert(imgIdx.type() == CV_32SC1 && imgIdx.size() == trainIdx.size()); - CV_Assert(distance.type() == CV_32FC1 && distance.size() == trainIdx.size()); - CV_Assert(nMatches.type() == CV_32SC1 && nMatches.cols == trainIdx.rows); - - const int nQuery = trainIdx.rows; - - matches.clear(); - matches.reserve(nQuery); - - const int* nMatches_ptr = nMatches.ptr(); - - for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx) - { - const int* trainIdx_ptr = trainIdx.ptr(queryIdx); - const int* imgIdx_ptr = imgIdx.ptr(queryIdx); - const float* distance_ptr = distance.ptr(queryIdx); - - const int nMatched = std::min(nMatches_ptr[queryIdx], trainIdx.cols); - - if (nMatched == 0) - { - if (!compactResult) - matches.push_back(std::vector()); - continue; - } - - matches.push_back(std::vector()); - std::vector& curMatches = matches.back(); - curMatches.reserve(nMatched); - - for (int i = 0; i < nMatched; ++i, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr) - { - int _trainIdx = *trainIdx_ptr; - int _imgIdx = *imgIdx_ptr; - float _distance = *distance_ptr; - - DMatch m(queryIdx, _trainIdx, _imgIdx, _distance); - - curMatches.push_back(m); - } - - sort(curMatches.begin(), curMatches.end()); - } -} - -void cv::cuda::BFMatcher_CUDA::radiusMatch(const GpuMat& query, std::vector< std::vector >& matches, - float maxDistance, const std::vector& masks, bool compactResult) -{ - GpuMat 
trainIdx, imgIdx, distance, nMatches; - radiusMatchCollection(query, trainIdx, imgIdx, distance, nMatches, maxDistance, masks); - radiusMatchDownload(trainIdx, imgIdx, distance, nMatches, matches, compactResult); + return makePtr(norm); } #endif /* !defined (HAVE_CUDA) */ diff --git a/modules/cudafeatures2d/test/test_features2d.cpp b/modules/cudafeatures2d/test/test_features2d.cpp index 25ba48faf..3046a604b 100644 --- a/modules/cudafeatures2d/test/test_features2d.cpp +++ b/modules/cudafeatures2d/test/test_features2d.cpp @@ -285,7 +285,8 @@ PARAM_TEST_CASE(BruteForceMatcher, cv::cuda::DeviceInfo, NormCode, DescriptorSiz CUDA_TEST_P(BruteForceMatcher, Match_Single) { - cv::cuda::BFMatcher_CUDA matcher(normCode); + cv::Ptr matcher = + cv::cuda::DescriptorMatcher::createBFMatcher(normCode); cv::cuda::GpuMat mask; if (useMask) @@ -295,7 +296,7 @@ CUDA_TEST_P(BruteForceMatcher, Match_Single) } std::vector matches; - matcher.match(loadMat(query), loadMat(train), matches, mask); + matcher->match(loadMat(query), loadMat(train), matches, mask); ASSERT_EQ(static_cast(queryDescCount), matches.size()); @@ -312,13 +313,14 @@ CUDA_TEST_P(BruteForceMatcher, Match_Single) CUDA_TEST_P(BruteForceMatcher, Match_Collection) { - cv::cuda::BFMatcher_CUDA matcher(normCode); + cv::Ptr matcher = + cv::cuda::DescriptorMatcher::createBFMatcher(normCode); cv::cuda::GpuMat d_train(train); // make add() twice to test such case - matcher.add(std::vector(1, d_train.rowRange(0, train.rows / 2))); - matcher.add(std::vector(1, d_train.rowRange(train.rows / 2, train.rows))); + matcher->add(std::vector(1, d_train.rowRange(0, train.rows / 2))); + matcher->add(std::vector(1, d_train.rowRange(train.rows / 2, train.rows))); // prepare masks (make first nearest match illegal) std::vector masks(2); @@ -331,9 +333,9 @@ CUDA_TEST_P(BruteForceMatcher, Match_Collection) std::vector matches; if (useMask) - matcher.match(cv::cuda::GpuMat(query), matches, masks); + matcher->match(cv::cuda::GpuMat(query), matches, masks); else - matcher.match(cv::cuda::GpuMat(query), matches); + matcher->match(cv::cuda::GpuMat(query), matches); ASSERT_EQ(static_cast(queryDescCount), matches.size()); @@ -366,7 +368,8 @@ CUDA_TEST_P(BruteForceMatcher, Match_Collection) CUDA_TEST_P(BruteForceMatcher, KnnMatch_2_Single) { - cv::cuda::BFMatcher_CUDA matcher(normCode); + cv::Ptr matcher = + cv::cuda::DescriptorMatcher::createBFMatcher(normCode); const int knn = 2; @@ -378,7 +381,7 @@ CUDA_TEST_P(BruteForceMatcher, KnnMatch_2_Single) } std::vector< std::vector > matches; - matcher.knnMatch(loadMat(query), loadMat(train), matches, knn, mask); + matcher->knnMatch(loadMat(query), loadMat(train), matches, knn, mask); ASSERT_EQ(static_cast(queryDescCount), matches.size()); @@ -405,7 +408,8 @@ CUDA_TEST_P(BruteForceMatcher, KnnMatch_2_Single) CUDA_TEST_P(BruteForceMatcher, KnnMatch_3_Single) { - cv::cuda::BFMatcher_CUDA matcher(normCode); + cv::Ptr matcher = + cv::cuda::DescriptorMatcher::createBFMatcher(normCode); const int knn = 3; @@ -417,7 +421,7 @@ CUDA_TEST_P(BruteForceMatcher, KnnMatch_3_Single) } std::vector< std::vector > matches; - matcher.knnMatch(loadMat(query), loadMat(train), matches, knn, mask); + matcher->knnMatch(loadMat(query), loadMat(train), matches, knn, mask); ASSERT_EQ(static_cast(queryDescCount), matches.size()); @@ -444,15 +448,16 @@ CUDA_TEST_P(BruteForceMatcher, KnnMatch_3_Single) CUDA_TEST_P(BruteForceMatcher, KnnMatch_2_Collection) { - cv::cuda::BFMatcher_CUDA matcher(normCode); + cv::Ptr matcher = + 
cv::cuda::DescriptorMatcher::createBFMatcher(normCode); const int knn = 2; cv::cuda::GpuMat d_train(train); // make add() twice to test such case - matcher.add(std::vector(1, d_train.rowRange(0, train.rows / 2))); - matcher.add(std::vector(1, d_train.rowRange(train.rows / 2, train.rows))); + matcher->add(std::vector(1, d_train.rowRange(0, train.rows / 2))); + matcher->add(std::vector(1, d_train.rowRange(train.rows / 2, train.rows))); // prepare masks (make first nearest match illegal) std::vector masks(2); @@ -466,9 +471,9 @@ CUDA_TEST_P(BruteForceMatcher, KnnMatch_2_Collection) std::vector< std::vector > matches; if (useMask) - matcher.knnMatch(cv::cuda::GpuMat(query), matches, knn, masks); + matcher->knnMatch(cv::cuda::GpuMat(query), matches, knn, masks); else - matcher.knnMatch(cv::cuda::GpuMat(query), matches, knn); + matcher->knnMatch(cv::cuda::GpuMat(query), matches, knn); ASSERT_EQ(static_cast(queryDescCount), matches.size()); @@ -506,15 +511,16 @@ CUDA_TEST_P(BruteForceMatcher, KnnMatch_2_Collection) CUDA_TEST_P(BruteForceMatcher, KnnMatch_3_Collection) { - cv::cuda::BFMatcher_CUDA matcher(normCode); + cv::Ptr matcher = + cv::cuda::DescriptorMatcher::createBFMatcher(normCode); const int knn = 3; cv::cuda::GpuMat d_train(train); // make add() twice to test such case - matcher.add(std::vector(1, d_train.rowRange(0, train.rows / 2))); - matcher.add(std::vector(1, d_train.rowRange(train.rows / 2, train.rows))); + matcher->add(std::vector(1, d_train.rowRange(0, train.rows / 2))); + matcher->add(std::vector(1, d_train.rowRange(train.rows / 2, train.rows))); // prepare masks (make first nearest match illegal) std::vector masks(2); @@ -528,9 +534,9 @@ CUDA_TEST_P(BruteForceMatcher, KnnMatch_3_Collection) std::vector< std::vector > matches; if (useMask) - matcher.knnMatch(cv::cuda::GpuMat(query), matches, knn, masks); + matcher->knnMatch(cv::cuda::GpuMat(query), matches, knn, masks); else - matcher.knnMatch(cv::cuda::GpuMat(query), matches, knn); + matcher->knnMatch(cv::cuda::GpuMat(query), matches, knn); ASSERT_EQ(static_cast(queryDescCount), matches.size()); @@ -568,7 +574,8 @@ CUDA_TEST_P(BruteForceMatcher, KnnMatch_3_Collection) CUDA_TEST_P(BruteForceMatcher, RadiusMatch_Single) { - cv::cuda::BFMatcher_CUDA matcher(normCode); + cv::Ptr matcher = + cv::cuda::DescriptorMatcher::createBFMatcher(normCode); const float radius = 1.f / countFactor; @@ -577,7 +584,7 @@ CUDA_TEST_P(BruteForceMatcher, RadiusMatch_Single) try { std::vector< std::vector > matches; - matcher.radiusMatch(loadMat(query), loadMat(train), matches, radius); + matcher->radiusMatch(loadMat(query), loadMat(train), matches, radius); } catch (const cv::Exception& e) { @@ -594,7 +601,7 @@ CUDA_TEST_P(BruteForceMatcher, RadiusMatch_Single) } std::vector< std::vector > matches; - matcher.radiusMatch(loadMat(query), loadMat(train), matches, radius, mask); + matcher->radiusMatch(loadMat(query), loadMat(train), matches, radius, mask); ASSERT_EQ(static_cast(queryDescCount), matches.size()); @@ -617,7 +624,8 @@ CUDA_TEST_P(BruteForceMatcher, RadiusMatch_Single) CUDA_TEST_P(BruteForceMatcher, RadiusMatch_Collection) { - cv::cuda::BFMatcher_CUDA matcher(normCode); + cv::Ptr matcher = + cv::cuda::DescriptorMatcher::createBFMatcher(normCode); const int n = 3; const float radius = 1.f / countFactor * n; @@ -625,8 +633,8 @@ CUDA_TEST_P(BruteForceMatcher, RadiusMatch_Collection) cv::cuda::GpuMat d_train(train); // make add() twice to test such case - matcher.add(std::vector(1, d_train.rowRange(0, train.rows / 2))); - 
matcher.add(std::vector(1, d_train.rowRange(train.rows / 2, train.rows))); + matcher->add(std::vector(1, d_train.rowRange(0, train.rows / 2))); + matcher->add(std::vector(1, d_train.rowRange(train.rows / 2, train.rows))); // prepare masks (make first nearest match illegal) std::vector masks(2); @@ -642,7 +650,7 @@ CUDA_TEST_P(BruteForceMatcher, RadiusMatch_Collection) try { std::vector< std::vector > matches; - matcher.radiusMatch(cv::cuda::GpuMat(query), matches, radius, masks); + matcher->radiusMatch(cv::cuda::GpuMat(query), matches, radius, masks); } catch (const cv::Exception& e) { @@ -654,9 +662,9 @@ CUDA_TEST_P(BruteForceMatcher, RadiusMatch_Collection) std::vector< std::vector > matches; if (useMask) - matcher.radiusMatch(cv::cuda::GpuMat(query), matches, radius, masks); + matcher->radiusMatch(cv::cuda::GpuMat(query), matches, radius, masks); else - matcher.radiusMatch(cv::cuda::GpuMat(query), matches, radius); + matcher->radiusMatch(cv::cuda::GpuMat(query), matches, radius); ASSERT_EQ(static_cast(queryDescCount), matches.size()); diff --git a/modules/stitching/src/matchers.cpp b/modules/stitching/src/matchers.cpp index 49ee0f474..ee05268d7 100644 --- a/modules/stitching/src/matchers.cpp +++ b/modules/stitching/src/matchers.cpp @@ -154,7 +154,7 @@ void CpuMatcher::match(const ImageFeatures &features1, const ImageFeatures &feat matches_info.matches.clear(); - Ptr matcher; + Ptr matcher; #if 0 // TODO check this if (ocl::useOpenCL()) { @@ -220,13 +220,13 @@ void GpuMatcher::match(const ImageFeatures &features1, const ImageFeatures &feat descriptors1_.upload(features1.descriptors); descriptors2_.upload(features2.descriptors); - BFMatcher_CUDA matcher(NORM_L2); + Ptr matcher = cuda::DescriptorMatcher::createBFMatcher(NORM_L2); + MatchesSet matches; // Find 1->2 matches pair_matches.clear(); - matcher.knnMatchSingle(descriptors1_, descriptors2_, train_idx_, distance_, all_dist_, 2); - matcher.knnMatchDownload(train_idx_, distance_, pair_matches); + matcher->knnMatch(descriptors1_, descriptors2_, pair_matches, 2); for (size_t i = 0; i < pair_matches.size(); ++i) { if (pair_matches[i].size() < 2) @@ -242,8 +242,7 @@ void GpuMatcher::match(const ImageFeatures &features1, const ImageFeatures &feat // Find 2->1 matches pair_matches.clear(); - matcher.knnMatchSingle(descriptors2_, descriptors1_, train_idx_, distance_, all_dist_, 2); - matcher.knnMatchDownload(train_idx_, distance_, pair_matches); + matcher->knnMatch(descriptors2_, descriptors1_, pair_matches, 2); for (size_t i = 0; i < pair_matches.size(); ++i) { if (pair_matches[i].size() < 2) diff --git a/samples/gpu/performance/tests.cpp b/samples/gpu/performance/tests.cpp index 0d083e5bd..09094282f 100644 --- a/samples/gpu/performance/tests.cpp +++ b/samples/gpu/performance/tests.cpp @@ -379,14 +379,14 @@ TEST(BruteForceMatcher) // Init CUDA matcher - cuda::BFMatcher_CUDA d_matcher(NORM_L2); + Ptr d_matcher = cuda::DescriptorMatcher::createBFMatcher(NORM_L2); cuda::GpuMat d_query(query); cuda::GpuMat d_train(train); // Output vector< vector > matches(2); - cuda::GpuMat d_trainIdx, d_distance, d_allDist, d_nMatches; + cuda::GpuMat d_matches; SUBTEST << "match"; @@ -396,10 +396,10 @@ TEST(BruteForceMatcher) matcher.match(query, train, matches[0]); CPU_OFF; - d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance); + d_matcher->matchAsync(d_query, d_train, d_matches); CUDA_ON; - d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance); + d_matcher->matchAsync(d_query, d_train, d_matches); CUDA_OFF; SUBTEST << "knnMatch"; @@ 
-410,10 +410,10 @@ TEST(BruteForceMatcher) matcher.knnMatch(query, train, matches, 2); CPU_OFF; - d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, 2); + d_matcher->knnMatchAsync(d_query, d_train, d_matches, 2); CUDA_ON; - d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, 2); + d_matcher->knnMatchAsync(d_query, d_train, d_matches, 2); CUDA_OFF; SUBTEST << "radiusMatch"; @@ -426,12 +426,10 @@ TEST(BruteForceMatcher) matcher.radiusMatch(query, train, matches, max_distance); CPU_OFF; - d_trainIdx.release(); - - d_matcher.radiusMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_nMatches, max_distance); + d_matcher->radiusMatchAsync(d_query, d_train, d_matches, max_distance); CUDA_ON; - d_matcher.radiusMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_nMatches, max_distance); + d_matcher->radiusMatchAsync(d_query, d_train, d_matches, max_distance); CUDA_OFF; } From 5f1282afdb0b3d137b0cf2161d5691bc877fc3f3 Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Tue, 13 Jan 2015 18:23:59 +0300 Subject: [PATCH 8/8] fix documentation warnings --- modules/cudafeatures2d/include/opencv2/cudafeatures2d.hpp | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/modules/cudafeatures2d/include/opencv2/cudafeatures2d.hpp b/modules/cudafeatures2d/include/opencv2/cudafeatures2d.hpp index 975726973..1d7f4e4e4 100644 --- a/modules/cudafeatures2d/include/opencv2/cudafeatures2d.hpp +++ b/modules/cudafeatures2d/include/opencv2/cudafeatures2d.hpp @@ -89,7 +89,7 @@ public: preferable choices for SIFT and SURF descriptors, NORM_HAMMING should be used with ORB, BRISK and BRIEF). */ - static Ptr createBFMatcher(int norm = cv::NORM_L2); + static Ptr createBFMatcher(int normType = cv::NORM_L2); // // Utility @@ -248,9 +248,6 @@ public: less than k possible matches in total. @param mask Mask specifying permissible matches between an input query and train matrices of descriptors. - @param compactResult Parameter used when the mask (or masks) is not empty. If compactResult is - false, the matches vector has the same size as queryDescriptors rows. If compactResult is true, - the matches vector does not contain matches for fully masked-out query descriptors. @param stream CUDA stream. These extended variants of DescriptorMatcher::matchAsync methods find several best matches for each query @@ -335,9 +332,6 @@ public: in Pixels)! @param mask Mask specifying permissible matches between an input query and train matrices of descriptors. - @param compactResult Parameter used when the mask (or masks) is not empty. If compactResult is - false, the matches vector has the same size as queryDescriptors rows. If compactResult is true, - the matches vector does not contain matches for fully masked-out query descriptors. @param stream CUDA stream. For each query descriptor, the methods find such training descriptors that the distance between the