Merge remote-tracking branch 'refs/remotes/upstream/master' into rho

Olexa Bilaniuk 2015-01-20 13:11:09 -05:00
commit 045f8294bb
136 changed files with 7496 additions and 5503 deletions

View File

@ -113,7 +113,6 @@ endmacro()
macro(ocv_add_module _name)
ocv_debug_message("ocv_add_module(" ${_name} ${ARGN} ")")
string(TOLOWER "${_name}" name)
string(REGEX REPLACE "^opencv_" "" ${name} "${name}")
set(the_module opencv_${name})
# the first pass - collect modules info, the second pass - create targets
@ -787,7 +786,7 @@ macro(__ocv_parse_test_sources tests_type)
set(__file_group_sources "")
elseif(arg STREQUAL "DEPENDS_ON")
set(__currentvar "OPENCV_${tests_type}_${the_module}_DEPS")
elseif("${__currentvar}" STREQUAL "__file_group_sources" AND NOT __file_group_name)
elseif(" ${__currentvar}" STREQUAL " __file_group_sources" AND NOT __file_group_name) # spaces to avoid CMP0054
set(__file_group_name "${arg}")
else()
list(APPEND ${__currentvar} "${arg}")
@ -808,7 +807,7 @@ function(ocv_add_perf_tests)
__ocv_parse_test_sources(PERF ${ARGN})
# opencv_imgcodecs is required for imread/imwrite
set(perf_deps ${the_module} opencv_ts opencv_imgcodecs ${OPENCV_MODULE_${the_module}_DEPS} ${OPENCV_MODULE_opencv_ts_DEPS})
set(perf_deps opencv_ts ${the_module} opencv_imgcodecs ${OPENCV_MODULE_${the_module}_DEPS} ${OPENCV_MODULE_opencv_ts_DEPS})
ocv_check_dependencies(${perf_deps})
if(OCV_DEPENDENCIES_FOUND)
@ -829,7 +828,7 @@ function(ocv_add_perf_tests)
ocv_add_executable(${the_target} ${OPENCV_PERF_${the_module}_SOURCES} ${${the_target}_pch})
ocv_target_include_modules(${the_target} ${perf_deps} "${perf_path}")
ocv_target_link_libraries(${the_target} ${OPENCV_MODULE_${the_module}_DEPS} ${perf_deps} ${OPENCV_LINKER_LIBS})
ocv_target_link_libraries(${the_target} ${perf_deps} ${OPENCV_MODULE_${the_module}_DEPS} ${OPENCV_LINKER_LIBS})
add_dependencies(opencv_perf_tests ${the_target})
# Additional target properties
@ -864,7 +863,7 @@ function(ocv_add_accuracy_tests)
__ocv_parse_test_sources(TEST ${ARGN})
# opencv_imgcodecs is required for imread/imwrite
set(test_deps ${the_module} opencv_ts opencv_imgcodecs opencv_videoio ${OPENCV_MODULE_${the_module}_DEPS} ${OPENCV_MODULE_opencv_ts_DEPS})
set(test_deps opencv_ts ${the_module} opencv_imgcodecs opencv_videoio ${OPENCV_MODULE_${the_module}_DEPS} ${OPENCV_MODULE_opencv_ts_DEPS})
ocv_check_dependencies(${test_deps})
if(OCV_DEPENDENCIES_FOUND)
set(the_target "opencv_test_${name}")
@ -884,7 +883,7 @@ function(ocv_add_accuracy_tests)
ocv_add_executable(${the_target} ${OPENCV_TEST_${the_module}_SOURCES} ${${the_target}_pch})
ocv_target_include_modules(${the_target} ${test_deps} "${test_path}")
ocv_target_link_libraries(${the_target} ${OPENCV_MODULE_${the_module}_DEPS} ${test_deps} ${OPENCV_LINKER_LIBS})
ocv_target_link_libraries(${the_target} ${test_deps} ${OPENCV_MODULE_${the_module}_DEPS} ${OPENCV_LINKER_LIBS})
add_dependencies(opencv_tests ${the_target})
# Additional target properties

View File

@ -276,12 +276,12 @@ macro(OCV_OPTION variable description value)
endif()
endforeach()
unset(__varname)
if("${__condition}" STREQUAL "")
if(__condition STREQUAL "")
set(__condition 2 GREATER 1)
endif()
if(${__condition})
if("${__value}" MATCHES ";")
if(__value MATCHES ";")
if(${__value})
option(${variable} "${description}" ON)
else()

View File

@ -3,7 +3,7 @@ Reading Geospatial Raster files with GDAL {#tutorial_raster_io_gdal}
Geospatial raster data is a heavily used product in Geographic Information Systems and
Photogrammetry. Raster data typically can represent imagery and Digital Elevation Models (DEM). The
standard library for loading GIS imagery is the Geographic Data Abstraction Library (GDAL). In this
standard library for loading GIS imagery is the Geographic Data Abstraction Library [(GDAL)](http://www.gdal.org). In this
example, we will show techniques for loading GIS raster formats using native OpenCV functions. In
addition, we will show an example of how OpenCV can use this data for novel and interesting
purposes.
@ -13,8 +13,8 @@ Goals
The primary objectives for this tutorial:
- How to use OpenCV imread to load satellite imagery.
- How to use OpenCV imread to load SRTM Digital Elevation Models
- How to use OpenCV [imread](@ref imread) to load satellite imagery.
- How to use OpenCV [imread](@ref imread) to load SRTM Digital Elevation Models
- Given the corner coordinates of both the image and DEM, correlate the elevation data to the
image to find elevations for each pixel.
- Show a basic, easy-to-implement example of a terrain heat map.
@ -54,9 +54,9 @@ signed shorts.
Notes
-----
### Lat/Lon (Geodetic) Coordinates should normally be avoided
### Lat/Lon (Geographic) Coordinates should normally be avoided
The Geodetic Coordinate System is a spherical coordinate system, meaning that using them with
The Geographic Coordinate System is a spherical coordinate system, meaning that using them with
Cartesian mathematics is technically incorrect. This demo uses them to increase the readability and
is accurate enough to make the point. A better coordinate system would be Universal Transverse
Mercator.
@ -94,8 +94,8 @@ Below is the output of the program. Use the first image as the input. For the DE
the SRTM file located at the USGS here.
[<http://dds.cr.usgs.gov/srtm/version2_1/SRTM1/Region_04/N37W123.hgt.zip>](http://dds.cr.usgs.gov/srtm/version2_1/SRTM1/Region_04/N37W123.hgt.zip)
![](images/gdal_output.jpg)
![Input Image](images/gdal_output.jpg)
![](images/gdal_heat-map.jpg)
![Heat Map](images/gdal_heat-map.jpg)
![](images/gdal_flood-zone.jpg)
![Heat Map Overlay](images/gdal_flood-zone.jpg)
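The loading step this tutorial describes hinges on imread's GDAL codec flag. Below is a minimal sketch (not part of this commit's diff), assuming an OpenCV build configured with GDAL support and the N37W123.hgt SRTM tile linked above:

#include <opencv2/core.hpp>
#include <opencv2/imgcodecs.hpp>
#include <iostream>

int main()
{
    // IMREAD_LOAD_GDAL routes decoding through the GDAL driver;
    // IMREAD_ANYDEPTH preserves the signed 16-bit samples of the SRTM tile.
    cv::Mat dem = cv::imread("N37W123.hgt",
                             cv::IMREAD_LOAD_GDAL | cv::IMREAD_ANYDEPTH);
    if (dem.empty())
    {
        std::cerr << "GDAL could not open the DEM" << std::endl;
        return 1;
    }
    std::cout << "DEM size: " << dem.size() << ", type: " << dem.type() << std::endl;
    return 0;
}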

View File

@ -2972,7 +2972,13 @@ static void collectCalibrationData( InputArrayOfArrays objectPoints,
for( i = 0; i < nimages; i++ )
{
ni = objectPoints.getMat(i).checkVector(3, CV_32F);
CV_Assert( ni >= 0 );
if( ni <= 0 )
CV_Error(CV_StsUnsupportedFormat, "objectPoints should contain vector of vectors of points of type Point3f");
int ni1 = imagePoints1.getMat(i).checkVector(2, CV_32F);
if( ni1 <= 0 )
CV_Error(CV_StsUnsupportedFormat, "imagePoints1 should contain vector of vectors of points of type Point2f");
CV_Assert( ni == ni1 );
total += ni;
}
@ -2995,8 +3001,6 @@ static void collectCalibrationData( InputArrayOfArrays objectPoints,
Mat objpt = objectPoints.getMat(i);
Mat imgpt1 = imagePoints1.getMat(i);
ni = objpt.checkVector(3, CV_32F);
int ni1 = imgpt1.checkVector(2, CV_32F);
CV_Assert( ni > 0 && ni == ni1 );
npoints.at<int>(i) = ni;
memcpy( objPtData + j, objpt.ptr(), ni*sizeof(objPtData[0]) );
memcpy( imgPtData1 + j, imgpt1.ptr(), ni*sizeof(imgPtData1[0]) );
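The replacement error messages spell out the layout the checkVector() calls expect. A minimal sketch of calibration input that passes both checks, using a hypothetical 3x3 planar pattern with made-up pixel coordinates:

#include <opencv2/core.hpp>
#include <vector>

int main()
{
    // Per view: object points must be Point3f (checkVector(3, CV_32F)),
    // image points must be Point2f (checkVector(2, CV_32F)),
    // and both vectors must have the same length.
    std::vector<std::vector<cv::Point3f> > objectPoints(1);
    std::vector<std::vector<cv::Point2f> > imagePoints(1);
    for (int y = 0; y < 3; ++y)
        for (int x = 0; x < 3; ++x)
        {
            objectPoints[0].push_back(cv::Point3f((float)x, (float)y, 0.f));
            imagePoints[0].push_back(cv::Point2f(100.f + 20.f * x, 100.f + 20.f * y));
        }
    // These containers can then be handed to e.g. cv::calibrateCamera as InputArrayOfArrays.
    return 0;
}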

View File

@ -3284,7 +3284,8 @@ inline void UMat::release()
{
if( u && CV_XADD(&(u->urefcount), -1) == 1 )
deallocate();
size.p[0] = 0;
for(int i = 0; i < dims; i++)
size.p[i] = 0;
u = 0;
}
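The loop replaces the single size.p[0] reset so that every extent of a released multi-dimensional UMat reads zero. A small sketch of the behaviour the fix targets, assuming only the core headers:

#include <opencv2/core.hpp>
#include <cassert>

int main()
{
    int dims[] = { 4, 5, 6 };
    cv::UMat m(3, dims, CV_32F);   // 3-dimensional UMat
    m.release();
    // After release() all extents should be cleared, not only the first one.
    for (int i = 0; i < 3; ++i)
        assert(m.size[i] == 0);
    return 0;
}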

View File

@ -80,6 +80,16 @@
namespace cv { namespace cuda {
CV_EXPORTS cv::String getNppErrorMessage(int code);
CV_EXPORTS cv::String getCudaDriverApiErrorMessage(int code);
CV_EXPORTS GpuMat getInputMat(InputArray _src, Stream& stream);
CV_EXPORTS GpuMat getOutputMat(OutputArray _dst, int rows, int cols, int type, Stream& stream);
static inline GpuMat getOutputMat(OutputArray _dst, Size size, int type, Stream& stream)
{
return getOutputMat(_dst, size.height, size.width, type, stream);
}
CV_EXPORTS void syncOutput(const GpuMat& dst, OutputArray _dst, Stream& stream);
}}
#ifndef HAVE_CUDA
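These helpers let CUDA wrappers accept host or device arrays transparently. A hedged sketch of the intended call pattern inside a hypothetical operation (myCudaOp is illustrative, not part of the commit), in the style of the wrapper functions updated later in this diff:

#include "opencv2/core/cuda.hpp"
#include "opencv2/core/private.cuda.hpp"   // getInputMat / getOutputMat / syncOutput

// Pull the input into a GpuMat (uploading via the stream's BufferPool if the
// caller passed host data), run the kernel on a pooled or caller-provided
// output, then copy back only when the destination is not already a GpuMat.
static void myCudaOp(cv::InputArray _src, cv::OutputArray _dst, cv::cuda::Stream& stream)
{
    cv::cuda::GpuMat src = cv::cuda::getInputMat(_src, stream);
    cv::cuda::GpuMat dst = cv::cuda::getOutputMat(_dst, src.size(), src.type(), stream);

    src.copyTo(dst, stream);               // stand-in for a real CUDA kernel launch

    cv::cuda::syncOutput(dst, _dst, stream);
}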

View File

@ -2355,6 +2355,165 @@ struct Mul_SIMD<float, float>
}
};
#elif CV_SSE2
#if CV_SSE4_1
template <>
struct Mul_SIMD<ushort, float>
{
Mul_SIMD()
{
haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
}
int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float scale) const
{
int x = 0;
if (!haveSSE)
return x;
__m128i v_zero = _mm_setzero_si128();
if( scale != 1.0f )
{
__m128 v_scale = _mm_set1_ps(scale);
for ( ; x <= width - 8; x += 8)
{
__m128i v_src1 = _mm_loadu_si128((__m128i const *)(src1 + x));
__m128i v_src2 = _mm_loadu_si128((__m128i const *)(src2 + x));
__m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_zero)),
_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src2, v_zero)));
v_dst1 = _mm_mul_ps(v_dst1, v_scale);
__m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_zero)),
_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src2, v_zero)));
v_dst2 = _mm_mul_ps(v_dst2, v_scale);
__m128i v_dsti = _mm_packus_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2));
_mm_storeu_si128((__m128i *)(dst + x), v_dsti);
}
}
return x;
}
bool haveSSE;
};
#endif
template <>
struct Mul_SIMD<schar, float>
{
Mul_SIMD()
{
haveSSE = checkHardwareSupport(CV_CPU_SSE2);
}
int operator() (const schar * src1, const schar * src2, schar * dst, int width, float scale) const
{
int x = 0;
if (!haveSSE)
return x;
__m128i v_zero = _mm_setzero_si128();
if( scale == 1.0f )
for ( ; x <= width - 8; x += 8)
{
__m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src1 + x));
__m128i v_src2 = _mm_loadl_epi64((__m128i const *)(src2 + x));
v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8);
v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8);
__m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)),
_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)));
__m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)),
_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)));
__m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2));
_mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dsti, v_zero));
}
else
{
__m128 v_scale = _mm_set1_ps(scale);
for ( ; x <= width - 8; x += 8)
{
__m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src1 + x));
__m128i v_src2 = _mm_loadl_epi64((__m128i const *)(src2 + x));
v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8);
v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8);
__m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)),
_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)));
v_dst1 = _mm_mul_ps(v_dst1, v_scale);
__m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)),
_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)));
v_dst2 = _mm_mul_ps(v_dst2, v_scale);
__m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2));
_mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dsti, v_zero));
}
}
return x;
}
bool haveSSE;
};
template <>
struct Mul_SIMD<short, float>
{
Mul_SIMD()
{
haveSSE = checkHardwareSupport(CV_CPU_SSE2);
}
int operator() (const short * src1, const short * src2, short * dst, int width, float scale) const
{
int x = 0;
if (!haveSSE)
return x;
__m128i v_zero = _mm_setzero_si128();
if( scale != 1.0f )
{
__m128 v_scale = _mm_set1_ps(scale);
for ( ; x <= width - 8; x += 8)
{
__m128i v_src1 = _mm_loadu_si128((__m128i const *)(src1 + x));
__m128i v_src2 = _mm_loadu_si128((__m128i const *)(src2 + x));
__m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)),
_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)));
v_dst1 = _mm_mul_ps(v_dst1, v_scale);
__m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)),
_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)));
v_dst2 = _mm_mul_ps(v_dst2, v_scale);
__m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2));
_mm_storeu_si128((__m128i *)(dst + x), v_dsti);
}
}
return x;
}
bool haveSSE;
};
#endif
template<typename T, typename WT> static void
@ -2772,7 +2931,144 @@ struct AddWeighted_SIMD
}
};
#if CV_NEON
#if CV_SSE2
template <>
struct AddWeighted_SIMD<schar, float>
{
AddWeighted_SIMD()
{
haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
}
int operator() (const schar * src1, const schar * src2, schar * dst, int width, float alpha, float beta, float gamma) const
{
int x = 0;
if (!haveSSE2)
return x;
__m128i v_zero = _mm_setzero_si128();
__m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta),
v_gamma = _mm_set1_ps(gamma);
for( ; x <= width - 8; x += 8 )
{
__m128i v_src1 = _mm_loadl_epi64((const __m128i *)(src1 + x));
__m128i v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x));
__m128i v_src1_p = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8);
__m128i v_src2_p = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8);
__m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1_p), 16)), v_alpha);
v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma),
_mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2_p), 16)), v_beta));
__m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1_p), 16)), v_alpha);
v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma),
_mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2_p), 16)), v_beta));
__m128i v_dst16 = _mm_packs_epi32(_mm_cvtps_epi32(v_dstf0),
_mm_cvtps_epi32(v_dstf1));
_mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst16, v_zero));
}
return x;
}
bool haveSSE2;
};
template <>
struct AddWeighted_SIMD<short, float>
{
AddWeighted_SIMD()
{
haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
}
int operator() (const short * src1, const short * src2, short * dst, int width, float alpha, float beta, float gamma) const
{
int x = 0;
if (!haveSSE2)
return x;
__m128i v_zero = _mm_setzero_si128();
__m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta),
v_gamma = _mm_set1_ps(gamma);
for( ; x <= width - 8; x += 8 )
{
__m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x));
__m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x));
__m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), v_alpha);
v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma),
_mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)), v_beta));
__m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), v_alpha);
v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma),
_mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)), v_beta));
_mm_storeu_si128((__m128i *)(dst + x), _mm_packs_epi32(_mm_cvtps_epi32(v_dstf0),
_mm_cvtps_epi32(v_dstf1)));
}
return x;
}
bool haveSSE2;
};
#if CV_SSE4_1
template <>
struct AddWeighted_SIMD<ushort, float>
{
AddWeighted_SIMD()
{
haveSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1);
}
int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float alpha, float beta, float gamma) const
{
int x = 0;
if (!haveSSE4_1)
return x;
__m128i v_zero = _mm_setzero_si128();
__m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta),
v_gamma = _mm_set1_ps(gamma);
for( ; x <= width - 8; x += 8 )
{
__m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x));
__m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x));
__m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_zero)), v_alpha);
v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma),
_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src2, v_zero)), v_beta));
__m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_zero)), v_alpha);
v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma),
_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src2, v_zero)), v_beta));
_mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi32(_mm_cvtps_epi32(v_dstf0),
_mm_cvtps_epi32(v_dstf1)));
}
return x;
}
bool haveSSE4_1;
};
#endif
#elif CV_NEON
template <>
struct AddWeighted_SIMD<schar, float>
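For reference, a scalar sketch of the arithmetic these SSE specializations vectorize: a per-element multiply with scale for Mul_SIMD, and the alpha/beta/gamma blend with saturation for AddWeighted_SIMD. This is an illustrative reimplementation, not code from the commit:

#include <opencv2/core.hpp>   // cv::saturate_cast

// Scalar reference for the scaled path of Mul_SIMD<short, float>:
// dst[i] = saturate(scale * src1[i] * src2[i]).
static void mulScalarRef(const short* src1, const short* src2, short* dst,
                         int width, float scale)
{
    for (int i = 0; i < width; ++i)
        dst[i] = cv::saturate_cast<short>(scale * src1[i] * src2[i]);
}

// Scalar reference for AddWeighted_SIMD<short, float>:
// dst[i] = saturate(src1[i] * alpha + src2[i] * beta + gamma).
static void addWeightedScalarRef(const short* src1, const short* src2, short* dst,
                                 int width, float alpha, float beta, float gamma)
{
    for (int i = 0; i < width; ++i)
        dst[i] = cv::saturate_cast<short>(src1[i] * alpha + src2[i] * beta + gamma);
}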

View File

@ -390,6 +390,11 @@ GpuMat& cv::cuda::GpuMat::setTo(Scalar value, InputArray _mask, Stream& stream)
GpuMat mask = _mask.getGpuMat();
if (mask.empty())
{
return setTo(value, stream);
}
CV_DbgAssert( size() == mask.size() && mask.type() == CV_8UC1 );
typedef void (*func_t)(const GpuMat& mat, const GpuMat& mask, Scalar scalar, Stream& stream);
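With the early return above, a masked setTo call can safely receive an empty mask and fall through to the unmasked path. A hedged caller-side sketch, assuming a CUDA-capable build:

#include <opencv2/core/cuda.hpp>

int main()
{
    cv::cuda::GpuMat img(480, 640, CV_8UC1);

    // An empty mask now takes the unmasked path instead of reaching the
    // masked kernel table.
    img.setTo(cv::Scalar::all(0), cv::noArray());

    // Equivalent direct call without a mask.
    img.setTo(cv::Scalar::all(0));
    return 0;
}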

View File

@ -342,6 +342,75 @@ void cv::cuda::ensureSizeIsEnough(int rows, int cols, int type, OutputArray arr)
}
}
GpuMat cv::cuda::getInputMat(InputArray _src, Stream& stream)
{
GpuMat src;
#ifndef HAVE_CUDA
(void) _src;
(void) stream;
throw_no_cuda();
#else
if (_src.kind() == _InputArray::CUDA_GPU_MAT)
{
src = _src.getGpuMat();
}
else if (!_src.empty())
{
BufferPool pool(stream);
src = pool.getBuffer(_src.size(), _src.type());
src.upload(_src, stream);
}
#endif
return src;
}
GpuMat cv::cuda::getOutputMat(OutputArray _dst, int rows, int cols, int type, Stream& stream)
{
GpuMat dst;
#ifndef HAVE_CUDA
(void) _dst;
(void) rows;
(void) cols;
(void) type;
(void) stream;
throw_no_cuda();
#else
if (_dst.kind() == _InputArray::CUDA_GPU_MAT)
{
_dst.create(rows, cols, type);
dst = _dst.getGpuMat();
}
else
{
BufferPool pool(stream);
dst = pool.getBuffer(rows, cols, type);
}
#endif
return dst;
}
void cv::cuda::syncOutput(const GpuMat& dst, OutputArray _dst, Stream& stream)
{
#ifndef HAVE_CUDA
(void) dst;
(void) _dst;
(void) stream;
throw_no_cuda();
#else
if (_dst.kind() != _InputArray::CUDA_GPU_MAT)
{
if (stream)
dst.download(_dst, stream);
else
dst.download(_dst);
}
#endif
}
#ifndef HAVE_CUDA
GpuMat::Allocator* cv::cuda::GpuMat::defaultAllocator()

View File

@ -48,6 +48,13 @@
# endif
#endif
#if defined ANDROID || defined __linux__
# include <unistd.h>
# include <fcntl.h>
# include <elf.h>
# include <linux/auxvec.h>
#endif
#if defined WIN32 || defined _WIN32 || defined WINCE
#ifndef _WIN32_WINNT // This is needed for the declaration of TryEnterCriticalSection in winbase.h with Visual Studio 2005 (and older?)
#define _WIN32_WINNT 0x0400 // http://msdn.microsoft.com/en-us/library/ms686857(VS.85).aspx
@ -251,6 +258,29 @@ struct HWFeatures
f.have[CV_CPU_AVX] = (((cpuid_data[2] & (1<<28)) != 0)&&((cpuid_data[2] & (1<<27)) != 0));//OS uses XSAVE_XRSTORE and CPU support AVX
}
#if defined ANDROID || defined __linux__
int cpufile = open("/proc/self/auxv", O_RDONLY);
if (cpufile >= 0)
{
Elf32_auxv_t auxv;
const size_t size_auxv_t = sizeof(Elf32_auxv_t);
while (read(cpufile, &auxv, sizeof(Elf32_auxv_t)) == size_auxv_t)
{
if (auxv.a_type == AT_HWCAP)
{
f.have[CV_CPU_NEON] = (auxv.a_un.a_val & 4096) != 0;
break;
}
}
close(cpufile);
}
#elif (defined __clang__ || defined __APPLE__) && defined __ARM_NEON__
f.have[CV_CPU_NEON] = true;
#endif
return f;
}
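The added code parses /proc/self/auxv by hand. For comparison only (this is not what the commit uses), the same AT_HWCAP word can be read through glibc's or bionic's getauxval; a sketch under the assumption that <sys/auxv.h> is available and the target is 32-bit ARM:

#include <sys/auxv.h>   // getauxval, glibc >= 2.16 and modern bionic
#include <stdio.h>

int main()
{
    unsigned long hwcap = getauxval(AT_HWCAP);
    // 4096 is the ARM HWCAP_NEON bit, the same constant the commit tests.
    int haveNeon = (hwcap & 4096UL) != 0;
    printf("NEON: %d\n", haveNeon);
    return 0;
}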

View File

@ -6,4 +6,4 @@ set(the_description "CUDA-accelerated Computer Vision")
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4100 /wd4324 /wd4512 /wd4515 -Wundef -Wmissing-declarations -Wshadow -Wunused-parameter)
ocv_define_module(cuda opencv_calib3d opencv_objdetect opencv_cudaarithm opencv_cudawarping OPTIONAL opencv_cudalegacy)
ocv_define_module(cuda opencv_calib3d opencv_cudaarithm opencv_cudawarping OPTIONAL opencv_cudalegacy)

View File

@ -53,274 +53,11 @@
@addtogroup cuda
@{
@defgroup cuda_calib3d Camera Calibration and 3D Reconstruction
@defgroup cuda_objdetect Object Detection
@}
*/
namespace cv { namespace cuda {
//////////////// HOG (Histogram-of-Oriented-Gradients) Descriptor and Object Detector //////////////
//! @addtogroup cuda_objdetect
//! @{
struct CV_EXPORTS HOGConfidence
{
double scale;
std::vector<Point> locations;
std::vector<double> confidences;
std::vector<double> part_scores[4];
};
/** @brief The class implements Histogram of Oriented Gradients (@cite Dalal2005) object detector.
Interfaces of all methods are kept similar to the CPU HOG descriptor and detector analogues as much
as possible.
@note
- An example applying the HOG descriptor for people detection can be found at
opencv_source_code/samples/cpp/peopledetect.cpp
- A CUDA example applying the HOG descriptor for people detection can be found at
opencv_source_code/samples/gpu/hog.cpp
- (Python) An example applying the HOG descriptor for people detection can be found at
opencv_source_code/samples/python2/peopledetect.py
*/
struct CV_EXPORTS HOGDescriptor
{
enum { DEFAULT_WIN_SIGMA = -1 };
enum { DEFAULT_NLEVELS = 64 };
enum { DESCR_FORMAT_ROW_BY_ROW, DESCR_FORMAT_COL_BY_COL };
/** @brief Creates the HOG descriptor and detector.
@param win_size Detection window size. Align to block size and block stride.
@param block_size Block size in pixels. Align to cell size. Only (16,16) is supported for now.
@param block_stride Block stride. It must be a multiple of cell size.
@param cell_size Cell size. Only (8, 8) is supported for now.
@param nbins Number of bins. Only 9 bins per cell are supported for now.
@param win_sigma Gaussian smoothing window parameter.
@param threshold_L2hys L2-Hys normalization method shrinkage.
@param gamma_correction Flag to specify whether the gamma correction preprocessing is required or
not.
@param nlevels Maximum number of detection window increases.
*/
HOGDescriptor(Size win_size=Size(64, 128), Size block_size=Size(16, 16),
Size block_stride=Size(8, 8), Size cell_size=Size(8, 8),
int nbins=9, double win_sigma=DEFAULT_WIN_SIGMA,
double threshold_L2hys=0.2, bool gamma_correction=true,
int nlevels=DEFAULT_NLEVELS);
/** @brief Returns the number of coefficients required for the classification.
*/
size_t getDescriptorSize() const;
/** @brief Returns the block histogram size.
*/
size_t getBlockHistogramSize() const;
/** @brief Sets coefficients for the linear SVM classifier.
*/
void setSVMDetector(const std::vector<float>& detector);
/** @brief Returns coefficients of the classifier trained for people detection (for default window size).
*/
static std::vector<float> getDefaultPeopleDetector();
/** @brief Returns coefficients of the classifier trained for people detection (for 48x96 windows).
*/
static std::vector<float> getPeopleDetector48x96();
/** @brief Returns coefficients of the classifier trained for people detection (for 64x128 windows).
*/
static std::vector<float> getPeopleDetector64x128();
/** @brief Performs object detection without a multi-scale window.
@param img Source image. CV_8UC1 and CV_8UC4 types are supported for now.
@param found_locations Left-top corner points of detected objects boundaries.
@param hit_threshold Threshold for the distance between features and SVM classifying plane.
Usually it is 0 and should be specified in the detector coefficients (as the last free
coefficient). But if the free coefficient is omitted (which is allowed), you can specify it
manually here.
@param win_stride Window stride. It must be a multiple of block stride.
@param padding Mock parameter to keep the CPU interface compatibility. It must be (0,0).
*/
void detect(const GpuMat& img, std::vector<Point>& found_locations,
double hit_threshold=0, Size win_stride=Size(),
Size padding=Size());
/** @brief Performs object detection with a multi-scale window.
@param img Source image. See cuda::HOGDescriptor::detect for type limitations.
@param found_locations Detected objects boundaries.
@param hit_threshold Threshold for the distance between features and SVM classifying plane. See
cuda::HOGDescriptor::detect for details.
@param win_stride Window stride. It must be a multiple of block stride.
@param padding Mock parameter to keep the CPU interface compatibility. It must be (0,0).
@param scale0 Coefficient of the detection window increase.
@param group_threshold Coefficient to regulate the similarity threshold. When detected, some
objects can be covered by many rectangles. 0 means not to perform grouping. See groupRectangles .
*/
void detectMultiScale(const GpuMat& img, std::vector<Rect>& found_locations,
double hit_threshold=0, Size win_stride=Size(),
Size padding=Size(), double scale0=1.05,
int group_threshold=2);
void computeConfidence(const GpuMat& img, std::vector<Point>& hits, double hit_threshold,
Size win_stride, Size padding, std::vector<Point>& locations, std::vector<double>& confidences);
void computeConfidenceMultiScale(const GpuMat& img, std::vector<Rect>& found_locations,
double hit_threshold, Size win_stride, Size padding,
std::vector<HOGConfidence> &conf_out, int group_threshold);
/** @brief Returns block descriptors computed for the whole image.
@param img Source image. See cuda::HOGDescriptor::detect for type limitations.
@param win_stride Window stride. It must be a multiple of block stride.
@param descriptors 2D array of descriptors.
@param descr_format Descriptor storage format:
- **DESCR_FORMAT_ROW_BY_ROW** - Row-major order.
- **DESCR_FORMAT_COL_BY_COL** - Column-major order.
The function is mainly used to learn the classifier.
*/
void getDescriptors(const GpuMat& img, Size win_stride,
GpuMat& descriptors,
int descr_format=DESCR_FORMAT_COL_BY_COL);
Size win_size;
Size block_size;
Size block_stride;
Size cell_size;
int nbins;
double win_sigma;
double threshold_L2hys;
bool gamma_correction;
int nlevels;
protected:
void computeBlockHistograms(const GpuMat& img);
void computeGradient(const GpuMat& img, GpuMat& grad, GpuMat& qangle);
double getWinSigma() const;
bool checkDetectorSize() const;
static int numPartsWithin(int size, int part_size, int stride);
static Size numPartsWithin(Size size, Size part_size, Size stride);
// Coefficients of the separating plane
float free_coef;
GpuMat detector;
// Results of the last classification step
GpuMat labels, labels_buf;
Mat labels_host;
// Results of the last histogram evaluation step
GpuMat block_hists, block_hists_buf;
// Gradient computation results
GpuMat grad, qangle, grad_buf, qangle_buf;
// returns subbuffer with required size, reallocates buffer if necessary.
static GpuMat getBuffer(const Size& sz, int type, GpuMat& buf);
static GpuMat getBuffer(int rows, int cols, int type, GpuMat& buf);
std::vector<GpuMat> image_scales;
};
//////////////////////////// CascadeClassifier ////////////////////////////
/** @brief Cascade classifier class used for object detection. Supports HAAR and LBP cascades.
@note
- A cascade classifier example can be found at
opencv_source_code/samples/gpu/cascadeclassifier.cpp
- An NVIDIA API-specific cascade classifier example can be found at
opencv_source_code/samples/gpu/cascadeclassifier_nvidia_api.cpp
*/
class CV_EXPORTS CascadeClassifier_CUDA
{
public:
CascadeClassifier_CUDA();
/** @brief Loads the classifier from a file. Cascade type is detected automatically by constructor parameter.
@param filename Name of the file from which the classifier is loaded. Only the old haar classifier
(trained by the haar training application) and NVIDIA's nvbin are supported for HAAR and only new
type of OpenCV XML cascade supported for LBP.
*/
CascadeClassifier_CUDA(const String& filename);
~CascadeClassifier_CUDA();
/** @brief Checks whether the classifier is loaded or not.
*/
bool empty() const;
/** @brief Loads the classifier from a file. The previous content is destroyed.
@param filename Name of the file from which the classifier is loaded. Only the old haar classifier
(trained by the haar training application) and NVIDIA's nvbin are supported for HAAR and only new
type of OpenCV XML cascade supported for LBP.
*/
bool load(const String& filename);
/** @brief Destroys the loaded classifier.
*/
void release();
/** @overload */
int detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, double scaleFactor = 1.2, int minNeighbors = 4, Size minSize = Size());
/** @brief Detects objects of different sizes in the input image.
@param image Matrix of type CV_8U containing an image where objects should be detected.
@param objectsBuf Buffer to store detected objects (rectangles). If it is empty, it is allocated
with the default size. If not empty, the function searches not more than N objects, where
N = sizeof(objectsBuf's data)/sizeof(cv::Rect).
@param maxObjectSize Maximum possible object size. Objects larger than that are ignored. Used for
second signature and supported only for LBP cascades.
@param scaleFactor Parameter specifying how much the image size is reduced at each image scale.
@param minNeighbors Parameter specifying how many neighbors each candidate rectangle should have
to retain it.
@param minSize Minimum possible object size. Objects smaller than that are ignored.
The detected objects are returned as a list of rectangles.
The function returns the number of detected objects, so you can retrieve them as in the following
example:
@code
cuda::CascadeClassifier_CUDA cascade_gpu(...);
Mat image_cpu = imread(...);
GpuMat image_gpu(image_cpu);
GpuMat objbuf;
int detections_number = cascade_gpu.detectMultiScale( image_gpu,
objbuf, 1.2, minNeighbors);
Mat obj_host;
// download only detected number of rectangles
objbuf.colRange(0, detections_number).download(obj_host);
Rect* faces = obj_host.ptr<Rect>();
for(int i = 0; i < detections_number; ++i)
cv::rectangle(image_cpu, faces[i], Scalar(255));
imshow("Faces", image_cpu);
@endcode
@sa CascadeClassifier::detectMultiScale
*/
int detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, Size maxObjectSize, Size minSize = Size(), double scaleFactor = 1.1, int minNeighbors = 4);
bool findLargestObject;
bool visualizeInPlace;
Size getClassifierSize() const;
private:
struct CascadeClassifierImpl;
CascadeClassifierImpl* impl;
struct HaarCascade;
struct LbpCascade;
friend class CascadeClassifier_CUDA_LBP;
};
//! @} cuda_objdetect
//////////////////////////// Labeling ////////////////////////////
//! @addtogroup cuda

View File

@ -56,7 +56,6 @@
#include "opencv2/cuda.hpp"
#include "opencv2/calib3d.hpp"
#include "opencv2/objdetect.hpp"
#ifdef GTEST_CREATE_SHARED_LIBRARY
#error no modules except ts should have GTEST_CREATE_SHARED_LIBRARY defined

File diff suppressed because it is too large.

View File

@ -47,7 +47,6 @@
#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudawarping.hpp"
#include "opencv2/calib3d.hpp"
#include "opencv2/objdetect.hpp"
#include "opencv2/core/private.cuda.hpp"
#include "opencv2/core/utility.hpp"

View File

@ -60,7 +60,6 @@
#include "opencv2/core.hpp"
#include "opencv2/core/opengl.hpp"
#include "opencv2/calib3d.hpp"
#include "opencv2/objdetect.hpp"
#include "cvconfig.h"

View File

@ -130,12 +130,6 @@ This function, in contrast to divide, uses a round-down rounding mode.
*/
CV_EXPORTS void divide(InputArray src1, InputArray src2, OutputArray dst, double scale = 1, int dtype = -1, Stream& stream = Stream::Null());
//! computes element-wise weighted reciprocal of an array (dst = scale/src2)
static inline void divide(double src1, InputArray src2, OutputArray dst, int dtype = -1, Stream& stream = Stream::Null())
{
divide(src1, src2, dst, 1.0, dtype, stream);
}
/** @brief Computes per-element absolute difference of two matrices (or of a matrix and scalar).
@param src1 First source matrix or scalar.
@ -530,116 +524,53 @@ CV_EXPORTS void copyMakeBorder(InputArray src, OutputArray dst, int top, int bot
@param src1 Source matrix. Any matrices except 64F are supported.
@param normType Norm type. NORM_L1 , NORM_L2 , and NORM_INF are supported for now.
@param mask optional operation mask; it must have the same size as src1 and CV_8UC1 type.
@param buf Optional buffer to avoid extra memory allocations. It is resized automatically.
@sa norm
*/
CV_EXPORTS double norm(InputArray src1, int normType, InputArray mask, GpuMat& buf);
/** @overload
uses new buffer, no mask
*/
static inline double norm(InputArray src, int normType)
{
GpuMat buf;
return norm(src, normType, GpuMat(), buf);
}
/** @overload
no mask
*/
static inline double norm(InputArray src, int normType, GpuMat& buf)
{
return norm(src, normType, GpuMat(), buf);
}
CV_EXPORTS double norm(InputArray src1, int normType, InputArray mask = noArray());
/** @overload */
CV_EXPORTS void calcNorm(InputArray src, OutputArray dst, int normType, InputArray mask = noArray(), Stream& stream = Stream::Null());
/** @brief Returns the difference of two matrices.
@param src1 Source matrix. Any matrices except 64F are supported.
@param src2 Second source matrix (if any) with the same size and type as src1.
@param normType Norm type. NORM_L1 , NORM_L2 , and NORM_INF are supported for now.
@param buf Optional buffer to avoid extra memory allocations. It is resized automatically.
@sa norm
*/
CV_EXPORTS double norm(InputArray src1, InputArray src2, GpuMat& buf, int normType=NORM_L2);
/** @overload
uses new buffer
*/
static inline double norm(InputArray src1, InputArray src2, int normType=NORM_L2)
{
GpuMat buf;
return norm(src1, src2, buf, normType);
}
CV_EXPORTS double norm(InputArray src1, InputArray src2, int normType=NORM_L2);
/** @overload */
CV_EXPORTS void calcNormDiff(InputArray src1, InputArray src2, OutputArray dst, int normType=NORM_L2, Stream& stream = Stream::Null());
/** @brief Returns the sum of matrix elements.
@param src Source image of any depth except for CV_64F .
@param mask optional operation mask; it must have the same size as src1 and CV_8UC1 type.
@param buf Optional buffer to avoid extra memory allocations. It is resized automatically.
@sa sum
*/
CV_EXPORTS Scalar sum(InputArray src, InputArray mask, GpuMat& buf);
/** @overload
uses new buffer, no mask
*/
static inline Scalar sum(InputArray src)
{
GpuMat buf;
return sum(src, GpuMat(), buf);
}
/** @overload
no mask
*/
static inline Scalar sum(InputArray src, GpuMat& buf)
{
return sum(src, GpuMat(), buf);
}
CV_EXPORTS Scalar sum(InputArray src, InputArray mask = noArray());
/** @overload */
CV_EXPORTS void calcSum(InputArray src, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null());
/** @brief Returns the sum of absolute values for matrix elements.
@param src Source image of any depth except for CV_64F .
@param mask optional operation mask; it must have the same size as src1 and CV_8UC1 type.
@param buf Optional buffer to avoid extra memory allocations. It is resized automatically.
*/
CV_EXPORTS Scalar absSum(InputArray src, InputArray mask, GpuMat& buf);
/** @overload
uses new buffer, no mask
*/
static inline Scalar absSum(InputArray src)
{
GpuMat buf;
return absSum(src, GpuMat(), buf);
}
/** @overload
no mask
*/
static inline Scalar absSum(InputArray src, GpuMat& buf)
{
return absSum(src, GpuMat(), buf);
}
CV_EXPORTS Scalar absSum(InputArray src, InputArray mask = noArray());
/** @overload */
CV_EXPORTS void calcAbsSum(InputArray src, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null());
/** @brief Returns the squared sum of matrix elements.
@param src Source image of any depth except for CV_64F .
@param mask optional operation mask; it must have the same size as src1 and CV_8UC1 type.
@param buf Optional buffer to avoid extra memory allocations. It is resized automatically.
*/
CV_EXPORTS Scalar sqrSum(InputArray src, InputArray mask, GpuMat& buf);
/** @overload
uses new buffer, no mask
*/
static inline Scalar sqrSum(InputArray src)
{
GpuMat buf;
return sqrSum(src, GpuMat(), buf);
}
/** @overload
no mask
*/
static inline Scalar sqrSum(InputArray src, GpuMat& buf)
{
return sqrSum(src, GpuMat(), buf);
}
CV_EXPORTS Scalar sqrSum(InputArray src, InputArray mask = noArray());
/** @overload */
CV_EXPORTS void calcSqrSum(InputArray src, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null());
/** @brief Finds global minimum and maximum matrix elements and returns their values.
@ -647,21 +578,14 @@ static inline Scalar sqrSum(InputArray src, GpuMat& buf)
@param minVal Pointer to the returned minimum value. Use NULL if not required.
@param maxVal Pointer to the returned maximum value. Use NULL if not required.
@param mask Optional mask to select a sub-matrix.
@param buf Optional buffer to avoid extra memory allocations. It is resized automatically.
The function does not work with CV_64F images on GPUs with the compute capability \< 1.3.
@sa minMaxLoc
*/
CV_EXPORTS void minMax(InputArray src, double* minVal, double* maxVal, InputArray mask, GpuMat& buf);
/** @overload
uses new buffer
*/
static inline void minMax(InputArray src, double* minVal, double* maxVal=0, InputArray mask=noArray())
{
GpuMat buf;
minMax(src, minVal, maxVal, mask, buf);
}
CV_EXPORTS void minMax(InputArray src, double* minVal, double* maxVal, InputArray mask = noArray());
/** @overload */
CV_EXPORTS void findMinMax(InputArray src, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null());
/** @brief Finds global minimum and maximum matrix elements and returns their values with locations.
@ -671,44 +595,28 @@ static inline void minMax(InputArray src, double* minVal, double* maxVal=0, Inpu
@param minLoc Pointer to the returned minimum location. Use NULL if not required.
@param maxLoc Pointer to the returned maximum location. Use NULL if not required.
@param mask Optional mask to select a sub-matrix.
@param valbuf Optional values buffer to avoid extra memory allocations. It is resized
automatically.
@param locbuf Optional locations buffer to avoid extra memory allocations. It is resized
automatically.
The function does not work with CV_64F images on GPU with the compute capability \< 1.3.
@sa minMaxLoc
*/
CV_EXPORTS void minMaxLoc(InputArray src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc,
InputArray mask, GpuMat& valbuf, GpuMat& locbuf);
/** @overload
uses new buffer
*/
static inline void minMaxLoc(InputArray src, double* minVal, double* maxVal=0, Point* minLoc=0, Point* maxLoc=0,
InputArray mask=noArray())
{
GpuMat valBuf, locBuf;
minMaxLoc(src, minVal, maxVal, minLoc, maxLoc, mask, valBuf, locBuf);
}
InputArray mask = noArray());
/** @overload */
CV_EXPORTS void findMinMaxLoc(InputArray src, OutputArray minMaxVals, OutputArray loc,
InputArray mask = noArray(), Stream& stream = Stream::Null());
/** @brief Counts non-zero matrix elements.
@param src Single-channel source image.
@param buf Optional buffer to avoid extra memory allocations. It is resized automatically.
The function does not work with CV_64F images on GPUs with the compute capability \< 1.3.
@sa countNonZero
*/
CV_EXPORTS int countNonZero(InputArray src, GpuMat& buf);
/** @overload
uses new buffer
*/
static inline int countNonZero(const GpuMat& src)
{
GpuMat buf;
return countNonZero(src, buf);
}
CV_EXPORTS int countNonZero(InputArray src);
/** @overload */
CV_EXPORTS void countNonZero(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
/** @brief Reduces a matrix to a vector.
@ -743,19 +651,12 @@ CV_EXPORTS void reduce(InputArray mtx, OutputArray vec, int dim, int reduceOp, i
@param mtx Source matrix. CV_8UC1 matrices are supported for now.
@param mean Mean value.
@param stddev Standard deviation value.
@param buf Optional buffer to avoid extra memory allocations. It is resized automatically.
@sa meanStdDev
*/
CV_EXPORTS void meanStdDev(InputArray mtx, Scalar& mean, Scalar& stddev, GpuMat& buf);
/** @overload
uses new buffer
*/
static inline void meanStdDev(InputArray src, Scalar& mean, Scalar& stddev)
{
GpuMat buf;
meanStdDev(src, mean, stddev, buf);
}
CV_EXPORTS void meanStdDev(InputArray mtx, Scalar& mean, Scalar& stddev);
/** @overload */
CV_EXPORTS void meanStdDev(InputArray mtx, OutputArray dst, Stream& stream = Stream::Null());
/** @brief Computes a standard deviation of integral images.
@ -779,64 +680,32 @@ normalization.
@param dtype When negative, the output array has the same type as src; otherwise, it has the same
number of channels as src and the depth =CV_MAT_DEPTH(dtype).
@param mask Optional operation mask.
@param norm_buf Optional buffer to avoid extra memory allocations. It is resized automatically.
@param cvt_buf Optional buffer to avoid extra memory allocations. It is resized automatically.
@param stream Stream for the asynchronous version.
@sa normalize
*/
CV_EXPORTS void normalize(InputArray src, OutputArray dst, double alpha, double beta,
int norm_type, int dtype, InputArray mask, GpuMat& norm_buf, GpuMat& cvt_buf);
/** @overload
uses new buffers
*/
static inline void normalize(InputArray src, OutputArray dst, double alpha = 1, double beta = 0,
int norm_type = NORM_L2, int dtype = -1, InputArray mask = noArray())
{
GpuMat norm_buf;
GpuMat cvt_buf;
normalize(src, dst, alpha, beta, norm_type, dtype, mask, norm_buf, cvt_buf);
}
int norm_type, int dtype, InputArray mask = noArray(),
Stream& stream = Stream::Null());
/** @brief Computes an integral image.
@param src Source image. Only CV_8UC1 images are supported for now.
@param sum Integral image containing 32-bit unsigned integer values packed into CV_32SC1 .
@param buffer Optional buffer to avoid extra memory allocations. It is resized automatically.
@param stream Stream for the asynchronous version.
@sa integral
*/
CV_EXPORTS void integral(InputArray src, OutputArray sum, GpuMat& buffer, Stream& stream = Stream::Null());
static inline void integralBuffered(InputArray src, OutputArray sum, GpuMat& buffer, Stream& stream = Stream::Null())
{
integral(src, sum, buffer, stream);
}
/** @overload
uses new buffer
*/
static inline void integral(InputArray src, OutputArray sum, Stream& stream = Stream::Null())
{
GpuMat buffer;
integral(src, sum, buffer, stream);
}
CV_EXPORTS void integral(InputArray src, OutputArray sum, Stream& stream = Stream::Null());
/** @brief Computes a squared integral image.
@param src Source image. Only CV_8UC1 images are supported for now.
@param sqsum Squared integral image containing 64-bit unsigned integer values packed into
CV_64FC1 .
@param buf Optional buffer to avoid extra memory allocations. It is resized automatically.
@param stream Stream for the asynchronous version.
*/
CV_EXPORTS void sqrIntegral(InputArray src, OutputArray sqsum, GpuMat& buf, Stream& stream = Stream::Null());
/** @overload
uses new buffer
*/
static inline void sqrIntegral(InputArray src, OutputArray sqsum, Stream& stream = Stream::Null())
{
GpuMat buffer;
sqrIntegral(src, sqsum, buffer, stream);
}
CV_EXPORTS void sqrIntegral(InputArray src, OutputArray sqsum, Stream& stream = Stream::Null());
//! @} cudaarithm_reduce
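The signatures above drop the explicit GpuMat buffer arguments in favour of the stream BufferPool. A hedged usage sketch of the reworked synchronous reductions, assuming a CUDA device is present:

#include <opencv2/core/cuda.hpp>
#include <opencv2/cudaarithm.hpp>
#include <iostream>

int main()
{
    cv::cuda::GpuMat src(256, 256, CV_8UC1, cv::Scalar::all(7));

    // Buffer arguments are gone; temporaries come from the stream's BufferPool.
    double l1    = cv::cuda::norm(src, cv::NORM_L1);
    cv::Scalar s = cv::cuda::sum(src);
    int nz       = cv::cuda::countNonZero(src);

    double minVal, maxVal;
    cv::Point minLoc, maxLoc;
    cv::cuda::minMaxLoc(src, &minVal, &maxVal, &minLoc, &maxLoc);

    std::cout << l1 << " " << s[0] << " " << nz << " "
              << minVal << " " << maxVal << std::endl;
    return 0;
}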

View File

@ -108,10 +108,9 @@ PERF_TEST_P(Sz_Norm, NormDiff,
{
const cv::cuda::GpuMat d_src1(src1);
const cv::cuda::GpuMat d_src2(src2);
cv::cuda::GpuMat d_buf;
double gpu_dst;
TEST_CYCLE() gpu_dst = cv::cuda::norm(d_src1, d_src2, d_buf, normType);
TEST_CYCLE() gpu_dst = cv::cuda::norm(d_src1, d_src2, normType);
SANITY_CHECK(gpu_dst);
@ -146,10 +145,9 @@ PERF_TEST_P(Sz_Depth_Cn, Sum,
if (PERF_RUN_CUDA())
{
const cv::cuda::GpuMat d_src(src);
cv::cuda::GpuMat d_buf;
cv::Scalar gpu_dst;
TEST_CYCLE() gpu_dst = cv::cuda::sum(d_src, d_buf);
TEST_CYCLE() gpu_dst = cv::cuda::sum(d_src);
SANITY_CHECK(gpu_dst, 1e-5, ERROR_RELATIVE);
}
@ -183,10 +181,9 @@ PERF_TEST_P(Sz_Depth_Cn, SumAbs,
if (PERF_RUN_CUDA())
{
const cv::cuda::GpuMat d_src(src);
cv::cuda::GpuMat d_buf;
cv::Scalar gpu_dst;
TEST_CYCLE() gpu_dst = cv::cuda::absSum(d_src, d_buf);
TEST_CYCLE() gpu_dst = cv::cuda::absSum(d_src);
SANITY_CHECK(gpu_dst, 1e-6, ERROR_RELATIVE);
}
@ -216,10 +213,9 @@ PERF_TEST_P(Sz_Depth_Cn, SumSqr,
if (PERF_RUN_CUDA())
{
const cv::cuda::GpuMat d_src(src);
cv::cuda::GpuMat d_buf;
cv::Scalar gpu_dst;
TEST_CYCLE() gpu_dst = cv::cuda::sqrSum(d_src, d_buf);
TEST_CYCLE() gpu_dst = cv::cuda::sqrSum(d_src);
SANITY_CHECK(gpu_dst, 1e-6, ERROR_RELATIVE);
}
@ -248,10 +244,9 @@ PERF_TEST_P(Sz_Depth, MinMax,
if (PERF_RUN_CUDA())
{
const cv::cuda::GpuMat d_src(src);
cv::cuda::GpuMat d_buf;
double gpu_minVal, gpu_maxVal;
TEST_CYCLE() cv::cuda::minMax(d_src, &gpu_minVal, &gpu_maxVal, cv::cuda::GpuMat(), d_buf);
TEST_CYCLE() cv::cuda::minMax(d_src, &gpu_minVal, &gpu_maxVal, cv::cuda::GpuMat());
SANITY_CHECK(gpu_minVal, 1e-10);
SANITY_CHECK(gpu_maxVal, 1e-10);
@ -286,11 +281,10 @@ PERF_TEST_P(Sz_Depth, MinMaxLoc,
if (PERF_RUN_CUDA())
{
const cv::cuda::GpuMat d_src(src);
cv::cuda::GpuMat d_valbuf, d_locbuf;
double gpu_minVal, gpu_maxVal;
cv::Point gpu_minLoc, gpu_maxLoc;
TEST_CYCLE() cv::cuda::minMaxLoc(d_src, &gpu_minVal, &gpu_maxVal, &gpu_minLoc, &gpu_maxLoc, cv::cuda::GpuMat(), d_valbuf, d_locbuf);
TEST_CYCLE() cv::cuda::minMaxLoc(d_src, &gpu_minVal, &gpu_maxVal, &gpu_minLoc, &gpu_maxLoc);
SANITY_CHECK(gpu_minVal, 1e-10);
SANITY_CHECK(gpu_maxVal, 1e-10);
@ -323,10 +317,9 @@ PERF_TEST_P(Sz_Depth, CountNonZero,
if (PERF_RUN_CUDA())
{
const cv::cuda::GpuMat d_src(src);
cv::cuda::GpuMat d_buf;
int gpu_dst = 0;
TEST_CYCLE() gpu_dst = cv::cuda::countNonZero(d_src, d_buf);
TEST_CYCLE() gpu_dst = cv::cuda::countNonZero(d_src);
SANITY_CHECK(gpu_dst);
}
@ -414,9 +407,8 @@ PERF_TEST_P(Sz_Depth_NormType, Normalize,
{
const cv::cuda::GpuMat d_src(src);
cv::cuda::GpuMat dst;
cv::cuda::GpuMat d_norm_buf, d_cvt_buf;
TEST_CYCLE() cv::cuda::normalize(d_src, dst, alpha, beta, norm_type, type, cv::cuda::GpuMat(), d_norm_buf, d_cvt_buf);
TEST_CYCLE() cv::cuda::normalize(d_src, dst, alpha, beta, norm_type, type, cv::cuda::GpuMat());
CUDA_SANITY_CHECK(dst, 1e-6);
}
@ -445,11 +437,10 @@ PERF_TEST_P(Sz, MeanStdDev,
if (PERF_RUN_CUDA())
{
const cv::cuda::GpuMat d_src(src);
cv::cuda::GpuMat d_buf;
cv::Scalar gpu_mean;
cv::Scalar gpu_stddev;
TEST_CYCLE() cv::cuda::meanStdDev(d_src, gpu_mean, gpu_stddev, d_buf);
TEST_CYCLE() cv::cuda::meanStdDev(d_src, gpu_mean, gpu_stddev);
SANITY_CHECK(gpu_mean);
SANITY_CHECK(gpu_stddev);
@ -481,9 +472,8 @@ PERF_TEST_P(Sz, Integral,
{
const cv::cuda::GpuMat d_src(src);
cv::cuda::GpuMat dst;
cv::cuda::GpuMat d_buf;
TEST_CYCLE() cv::cuda::integral(d_src, dst, d_buf);
TEST_CYCLE() cv::cuda::integral(d_src, dst);
CUDA_SANITY_CHECK(dst);
}
@ -511,9 +501,9 @@ PERF_TEST_P(Sz, IntegralSqr,
if (PERF_RUN_CUDA())
{
const cv::cuda::GpuMat d_src(src);
cv::cuda::GpuMat dst, buf;
cv::cuda::GpuMat dst;
TEST_CYCLE() cv::cuda::sqrIntegral(d_src, dst, buf);
TEST_CYCLE() cv::cuda::sqrIntegral(d_src, dst);
CUDA_SANITY_CHECK(dst);
}

View File

@ -169,9 +169,9 @@ void cv::cuda::gemm(InputArray _src1, InputArray _src2, double alpha, InputArray
#else
// CUBLAS works with column-major matrices
GpuMat src1 = _src1.getGpuMat();
GpuMat src2 = _src2.getGpuMat();
GpuMat src3 = _src3.getGpuMat();
GpuMat src1 = getInputMat(_src1, stream);
GpuMat src2 = getInputMat(_src2, stream);
GpuMat src3 = getInputMat(_src3, stream);
CV_Assert( src1.type() == CV_32FC1 || src1.type() == CV_32FC2 || src1.type() == CV_64FC1 || src1.type() == CV_64FC2 );
CV_Assert( src2.type() == src1.type() && (src3.empty() || src3.type() == src1.type()) );
@ -200,8 +200,7 @@ void cv::cuda::gemm(InputArray _src1, InputArray _src2, double alpha, InputArray
CV_Assert( src1Size.width == src2Size.height );
CV_Assert( src3.empty() || src3Size == dstSize );
_dst.create(dstSize, src1.type());
GpuMat dst = _dst.getGpuMat();
GpuMat dst = getOutputMat(_dst, dstSize, src1.type(), stream);
if (beta != 0)
{
@ -281,6 +280,8 @@ void cv::cuda::gemm(InputArray _src1, InputArray _src2, double alpha, InputArray
}
cublasSafeCall( cublasDestroy_v2(handle) );
syncOutput(dst, _dst, stream);
#endif
}
@ -297,7 +298,7 @@ void cv::cuda::dft(InputArray _src, OutputArray _dst, Size dft_size, int flags,
(void) stream;
throw_no_cuda();
#else
GpuMat src = _src.getGpuMat();
GpuMat src = getInputMat(_src, stream);
CV_Assert( src.type() == CV_32FC1 || src.type() == CV_32FC2 );
@ -314,13 +315,20 @@ void cv::cuda::dft(InputArray _src, OutputArray _dst, Size dft_size, int flags,
// We don't support real-to-real transform
CV_Assert( is_complex_input || is_complex_output );
GpuMat src_cont = src;
// Make sure here we work with the continuous input,
// as CUFFT can't handle gaps
createContinuous(src.rows, src.cols, src.type(), src_cont);
if (src_cont.data != src.data)
GpuMat src_cont;
if (src.isContinuous())
{
src_cont = src;
}
else
{
BufferPool pool(stream);
src_cont.allocator = pool.getAllocator();
createContinuous(src.rows, src.cols, src.type(), src_cont);
src.copyTo(src_cont, stream);
}
Size dft_size_opt = dft_size;
if (is_1d_input && !is_row_dft)
@ -462,16 +470,15 @@ namespace
void ConvolutionImpl::convolve(InputArray _image, InputArray _templ, OutputArray _result, bool ccorr, Stream& _stream)
{
GpuMat image = _image.getGpuMat();
GpuMat templ = _templ.getGpuMat();
GpuMat image = getInputMat(_image, _stream);
GpuMat templ = getInputMat(_templ, _stream);
CV_Assert( image.type() == CV_32FC1 );
CV_Assert( templ.type() == CV_32FC1 );
create(image.size(), templ.size());
_result.create(result_size, CV_32FC1);
GpuMat result = _result.getGpuMat();
GpuMat result = getOutputMat(_result, result_size, CV_32FC1, _stream);
cudaStream_t stream = StreamAccessor::getStream(_stream);
@ -520,6 +527,8 @@ namespace
cufftSafeCall( cufftDestroy(planR2C) );
cufftSafeCall( cufftDestroy(planC2R) );
syncOutput(result, _result, _stream);
}
}

View File

@ -119,15 +119,17 @@ void cv::cuda::flip(InputArray _src, OutputArray _dst, int flipCode, Stream& str
{NppMirror<CV_32F, nppiMirror_32f_C1R>::call, 0, NppMirror<CV_32F, nppiMirror_32f_C3R>::call, NppMirror<CV_32F, nppiMirror_32f_C4R>::call}
};
GpuMat src = _src.getGpuMat();
GpuMat src = getInputMat(_src, stream);
CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32S || src.depth() == CV_32F);
CV_Assert(src.channels() == 1 || src.channels() == 3 || src.channels() == 4);
_dst.create(src.size(), src.type());
GpuMat dst = _dst.getGpuMat();
GpuMat dst = getOutputMat(_dst, src.size(), src.type(), stream);
funcs[src.depth()][src.channels() - 1](src, dst, flipCode, StreamAccessor::getStream(stream));
syncOutput(dst, _dst, stream);
}
#endif /* !defined (HAVE_CUDA) */
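Because flip now routes through getInputMat, getOutputMat, and syncOutput, the wrapper can take host Mat arguments directly. A hedged caller-side sketch, assuming a CUDA-capable build:

#include <opencv2/core.hpp>
#include <opencv2/core/cuda.hpp>
#include <opencv2/cudaarithm.hpp>

int main()
{
    cv::Mat src(64, 64, CV_8UC3, cv::Scalar(1, 2, 3));
    cv::Mat dst;

    // Host input is uploaded via the stream's BufferPool, the NPP mirror runs
    // on the device, and syncOutput downloads the result back into dst.
    cv::cuda::flip(src, dst, 1);       // flip around the y-axis

    cv::cuda::GpuMat d_src(src), d_dst;
    cv::cuda::flip(d_src, d_dst, 0);   // pure device path, no copies back to host
    return 0;
}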

View File

@ -50,7 +50,10 @@
#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"
#include "opencv2/core/private.cuda.hpp"
using namespace cv;
using namespace cv::cuda;
using namespace cv::cudev;
namespace
@ -63,7 +66,7 @@ namespace
__device__ __forceinline__ D operator ()(T1 a, T2 b) const
{
return saturate_cast<D>(a * alpha + b * beta + gamma);
return cudev::saturate_cast<D>(a * alpha + b * beta + gamma);
}
};
@ -555,8 +558,8 @@ void cv::cuda::addWeighted(InputArray _src1, double alpha, InputArray _src2, dou
}
};
GpuMat src1 = _src1.getGpuMat();
GpuMat src2 = _src2.getGpuMat();
GpuMat src1 = getInputMat(_src1, stream);
GpuMat src2 = getInputMat(_src2, stream);
int sdepth1 = src1.depth();
int sdepth2 = src2.depth();
@ -564,19 +567,18 @@ void cv::cuda::addWeighted(InputArray _src1, double alpha, InputArray _src2, dou
ddepth = ddepth >= 0 ? CV_MAT_DEPTH(ddepth) : std::max(sdepth1, sdepth2);
const int cn = src1.channels();
CV_DbgAssert( src2.size() == src1.size() && src2.channels() == cn );
CV_DbgAssert( sdepth1 <= CV_64F && sdepth2 <= CV_64F && ddepth <= CV_64F );
CV_Assert( src2.size() == src1.size() && src2.channels() == cn );
CV_Assert( sdepth1 <= CV_64F && sdepth2 <= CV_64F && ddepth <= CV_64F );
_dst.create(src1.size(), CV_MAKE_TYPE(ddepth, cn));
GpuMat dst = _dst.getGpuMat();
GpuMat dst = getOutputMat(_dst, src1.size(), CV_MAKE_TYPE(ddepth, cn), stream);
GpuMat src1_ = src1.reshape(1);
GpuMat src2_ = src2.reshape(1);
GpuMat dst_ = dst.reshape(1);
GpuMat src1_single = src1.reshape(1);
GpuMat src2_single = src2.reshape(1);
GpuMat dst_single = dst.reshape(1);
if (sdepth1 > sdepth2)
{
src1_.swap(src2_);
src1_single.swap(src2_single);
std::swap(alpha, beta);
std::swap(sdepth1, sdepth2);
}
@ -586,7 +588,9 @@ void cv::cuda::addWeighted(InputArray _src1, double alpha, InputArray _src2, dou
if (!func)
CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");
func(src1_, alpha, src2_, beta, gamma, dst_, stream);
func(src1_single, alpha, src2_single, beta, gamma, dst_single, stream);
syncOutput(dst, _dst, stream);
}
#endif

View File

@ -50,7 +50,10 @@
#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"
#include "opencv2/core/private.cuda.hpp"
using namespace cv;
using namespace cv::cuda;
using namespace cv::cudev;
void bitMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int op);
@ -60,16 +63,15 @@ void bitMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& m
void cv::cuda::bitwise_not(InputArray _src, OutputArray _dst, InputArray _mask, Stream& stream)
{
GpuMat src = _src.getGpuMat();
GpuMat mask = _mask.getGpuMat();
GpuMat src = getInputMat(_src, stream);
GpuMat mask = getInputMat(_mask, stream);
const int depth = src.depth();
CV_DbgAssert( depth <= CV_32F );
CV_DbgAssert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
_dst.create(src.size(), src.type());
GpuMat dst = _dst.getGpuMat();
GpuMat dst = getOutputMat(_dst, src.size(), src.type(), stream);
if (mask.empty())
{
@ -125,6 +127,8 @@ void cv::cuda::bitwise_not(InputArray _src, OutputArray _dst, InputArray _mask,
gridTransformUnary(vsrc, vdst, bit_not<uchar>(), singleMaskChannels(globPtr<uchar>(mask), src.channels()), stream);
}
}
syncOutput(dst, _dst, stream);
}
//////////////////////////////////////////////////////////////////////////////

View File

@ -50,7 +50,10 @@
#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"
#include "opencv2/core/private.cuda.hpp"
using namespace cv;
using namespace cv::cuda;
using namespace cv::cudev;
namespace
@ -133,7 +136,7 @@ void cv::cuda::copyMakeBorder(InputArray _src, OutputArray _dst, int top, int bo
{ copyMakeBorderImpl<float , 1> , 0 /*copyMakeBorderImpl<float , 2>*/, copyMakeBorderImpl<float , 3> , copyMakeBorderImpl<float ,4> }
};
GpuMat src = _src.getGpuMat();
GpuMat src = getInputMat(_src, stream);
const int depth = src.depth();
const int cn = src.channels();
@ -141,8 +144,7 @@ void cv::cuda::copyMakeBorder(InputArray _src, OutputArray _dst, int top, int bo
CV_Assert( depth <= CV_32F && cn <= 4 );
CV_Assert( borderType == BORDER_REFLECT_101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT || borderType == BORDER_REFLECT || borderType == BORDER_WRAP );
_dst.create(src.rows + top + bottom, src.cols + left + right, src.type());
GpuMat dst = _dst.getGpuMat();
GpuMat dst = getOutputMat(_dst, src.rows + top + bottom, src.cols + left + right, src.type(), stream);
const func_t func = funcs[depth][cn - 1];
@ -150,6 +152,8 @@ void cv::cuda::copyMakeBorder(InputArray _src, OutputArray _dst, int top, int bo
CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");
func(src, dst, top, left, borderType, value, stream);
syncOutput(dst, _dst, stream);
}
#endif

View File

@ -50,47 +50,64 @@
#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"
#include "opencv2/core/private.cuda.hpp"
using namespace cv;
using namespace cv::cuda;
using namespace cv::cudev;
namespace
{
template <typename T>
int countNonZeroImpl(const GpuMat& _src, GpuMat& _buf)
template <typename T, typename D>
void countNonZeroImpl(const GpuMat& _src, GpuMat& _dst, Stream& stream)
{
const GpuMat_<T>& src = (const GpuMat_<T>&) _src;
GpuMat_<int>& buf = (GpuMat_<int>&) _buf;
GpuMat_<D>& dst = (GpuMat_<D>&) _dst;
gridCountNonZero(src, buf);
int data;
buf.download(cv::Mat(1, 1, buf.type(), &data));
return data;
gridCountNonZero(src, dst, stream);
}
}
int cv::cuda::countNonZero(InputArray _src, GpuMat& buf)
void cv::cuda::countNonZero(InputArray _src, OutputArray _dst, Stream& stream)
{
typedef int (*func_t)(const GpuMat& _src, GpuMat& _buf);
typedef void (*func_t)(const GpuMat& src, GpuMat& dst, Stream& stream);
static const func_t funcs[] =
{
countNonZeroImpl<uchar>,
countNonZeroImpl<schar>,
countNonZeroImpl<ushort>,
countNonZeroImpl<short>,
countNonZeroImpl<int>,
countNonZeroImpl<float>,
countNonZeroImpl<double>
countNonZeroImpl<uchar, int>,
countNonZeroImpl<schar, int>,
countNonZeroImpl<ushort, int>,
countNonZeroImpl<short, int>,
countNonZeroImpl<int, int>,
countNonZeroImpl<float, int>,
countNonZeroImpl<double, int>,
};
GpuMat src = _src.getGpuMat();
GpuMat src = getInputMat(_src, stream);
CV_Assert( src.depth() <= CV_64F );
CV_Assert( src.channels() == 1 );
const func_t func = funcs[src.depth()];
GpuMat dst = getOutputMat(_dst, 1, 1, CV_32SC1, stream);
return func(src, buf);
const func_t func = funcs[src.depth()];
func(src, dst, stream);
syncOutput(dst, _dst, stream);
}
int cv::cuda::countNonZero(InputArray _src)
{
Stream& stream = Stream::Null();
BufferPool pool(stream);
GpuMat buf = pool.getBuffer(1, 1, CV_32SC1);
countNonZero(_src, buf, stream);
int data;
buf.download(Mat(1, 1, CV_32SC1, &data));
return data;
}
#endif
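The rework gives countNonZero two forms: an asynchronous overload writing a 1x1 CV_32SC1 result on the device, and a synchronous convenience overload that downloads it. A hedged usage sketch:

#include <opencv2/core/cuda.hpp>
#include <opencv2/cudaarithm.hpp>
#include <iostream>

int main()
{
    cv::cuda::GpuMat src(128, 128, CV_8UC1, cv::Scalar::all(1));

    // Synchronous form: downloads the 1x1 device result and returns it.
    int n = cv::cuda::countNonZero(src);

    // Asynchronous form: the count stays on the device until we download it.
    cv::cuda::Stream stream;
    cv::cuda::GpuMat d_count;
    cv::cuda::countNonZero(src, d_count, stream);

    cv::Mat h_count;
    d_count.download(h_count, stream);
    stream.waitForCompletion();

    std::cout << n << " == " << h_count.at<int>(0, 0) << std::endl;
    return 0;
}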

View File

@ -50,51 +50,58 @@
#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"
#include "opencv2/core/private.cuda.hpp"
using namespace cv;
using namespace cv::cuda;
using namespace cv::cudev;
////////////////////////////////////////////////////////////////////////
// integral
void cv::cuda::integral(InputArray _src, OutputArray _dst, GpuMat& buffer, Stream& stream)
void cv::cuda::integral(InputArray _src, OutputArray _dst, Stream& stream)
{
GpuMat src = _src.getGpuMat();
GpuMat src = getInputMat(_src, stream);
CV_Assert( src.type() == CV_8UC1 );
GpuMat_<int>& res = (GpuMat_<int>&) buffer;
BufferPool pool(stream);
GpuMat_<int> res(src.size(), pool.getAllocator());
gridIntegral(globPtr<uchar>(src), res, stream);
_dst.create(src.rows + 1, src.cols + 1, CV_32SC1);
GpuMat dst = _dst.getGpuMat();
GpuMat dst = getOutputMat(_dst, src.rows + 1, src.cols + 1, CV_32SC1, stream);
dst.setTo(Scalar::all(0), stream);
GpuMat inner = dst(Rect(1, 1, src.cols, src.rows));
res.copyTo(inner, stream);
syncOutput(dst, _dst, stream);
}
//////////////////////////////////////////////////////////////////////////////
// sqrIntegral
void cv::cuda::sqrIntegral(InputArray _src, OutputArray _dst, GpuMat& buf, Stream& stream)
void cv::cuda::sqrIntegral(InputArray _src, OutputArray _dst, Stream& stream)
{
GpuMat src = _src.getGpuMat();
GpuMat src = getInputMat(_src, stream);
CV_Assert( src.type() == CV_8UC1 );
GpuMat_<double>& res = (GpuMat_<double>&) buf;
BufferPool pool(Stream::Null());
GpuMat_<double> res(pool.getBuffer(src.size(), CV_64FC1));
gridIntegral(sqr_(cvt_<int>(globPtr<uchar>(src))), res, stream);
_dst.create(src.rows + 1, src.cols + 1, CV_64FC1);
GpuMat dst = _dst.getGpuMat();
GpuMat dst = getOutputMat(_dst, src.rows + 1, src.cols + 1, CV_64FC1, stream);
dst.setTo(Scalar::all(0), stream);
GpuMat inner = dst(Rect(1, 1, src.cols, src.rows));
res.copyTo(inner, stream);
syncOutput(dst, _dst, stream);
}
#endif
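Usage note (illustrative, not part of this patch): a sketch of the buffer-less integral/sqrIntegral entry points above; the helper name and the CV_8UC1 input are assumptions.

    #include <opencv2/core/cuda.hpp>
    #include <opencv2/cudaarithm.hpp>

    // Sketch: both integral images are computed on one stream; the temporary work
    // buffers are now allocated internally via BufferPool instead of a caller-supplied GpuMat.
    void integralsOnStream(const cv::cuda::GpuMat& d_img /* CV_8UC1 */)
    {
        cv::cuda::Stream stream;
        cv::cuda::GpuMat d_sum, d_sqsum;
        cv::cuda::integral(d_img, d_sum, stream);       // (rows+1) x (cols+1), CV_32SC1
        cv::cuda::sqrIntegral(d_img, d_sqsum, stream);  // (rows+1) x (cols+1), CV_64FC1
        stream.waitForCompletion();
    }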

View File

@ -50,8 +50,10 @@
#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"
#include "opencv2/core/private.cuda.hpp"
using namespace cv;
using namespace cv::cuda;
using namespace cv::cudev;
namespace
@ -165,7 +167,7 @@ namespace
void LookUpTableImpl::transform(InputArray _src, OutputArray _dst, Stream& stream)
{
GpuMat src = _src.getGpuMat();
GpuMat src = getInputMat(_src, stream);
const int cn = src.channels();
const int lut_cn = d_lut.channels();
@ -173,8 +175,7 @@ namespace
CV_Assert( src.type() == CV_8UC1 || src.type() == CV_8UC3 );
CV_Assert( lut_cn == 1 || lut_cn == cn );
_dst.create(src.size(), src.type());
GpuMat dst = _dst.getGpuMat();
GpuMat dst = getOutputMat(_dst, src.size(), src.type(), stream);
if (lut_cn == 1)
{
@ -196,6 +197,8 @@ namespace
dst3.assign(lut_(src3, tbl), stream);
}
syncOutput(dst, _dst, stream);
}
}

View File

@ -50,7 +50,10 @@
#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"
#include "opencv2/core/private.cuda.hpp"
using namespace cv;
using namespace cv::cuda;
using namespace cv::cudev;
namespace
@ -92,16 +95,15 @@ void cv::cuda::abs(InputArray _src, OutputArray _dst, Stream& stream)
absMat<double>
};
GpuMat src = _src.getGpuMat();
GpuMat src = getInputMat(_src, stream);
const int depth = src.depth();
CV_Assert( src.depth() <= CV_64F );
CV_DbgAssert( depth <= CV_64F );
GpuMat dst = getOutputMat(_dst, src.size(), src.type(), stream);
_dst.create(src.size(), src.type());
GpuMat dst = _dst.getGpuMat();
funcs[src.depth()](src.reshape(1), dst.reshape(1), stream);
funcs[depth](src.reshape(1), dst.reshape(1), stream);
syncOutput(dst, _dst, stream);
}
//////////////////////////////////////////////////////////////////////////////
@ -113,7 +115,7 @@ namespace
{
__device__ __forceinline__ T operator ()(T x) const
{
return saturate_cast<T>(x * x);
return cudev::saturate_cast<T>(x * x);
}
};
@ -138,16 +140,15 @@ void cv::cuda::sqr(InputArray _src, OutputArray _dst, Stream& stream)
sqrMat<double>
};
GpuMat src = _src.getGpuMat();
GpuMat src = getInputMat(_src, stream);
const int depth = src.depth();
CV_Assert( src.depth() <= CV_64F );
CV_DbgAssert( depth <= CV_64F );
GpuMat dst = getOutputMat(_dst, src.size(), src.type(), stream);
_dst.create(src.size(), src.type());
GpuMat dst = _dst.getGpuMat();
funcs[src.depth()](src.reshape(1), dst.reshape(1), stream);
funcs[depth](src.reshape(1), dst.reshape(1), stream);
syncOutput(dst, _dst, stream);
}
//////////////////////////////////////////////////////////////////////////////
@ -176,16 +177,15 @@ void cv::cuda::sqrt(InputArray _src, OutputArray _dst, Stream& stream)
sqrtMat<double>
};
GpuMat src = _src.getGpuMat();
GpuMat src = getInputMat(_src, stream);
const int depth = src.depth();
CV_Assert( src.depth() <= CV_64F );
CV_DbgAssert( depth <= CV_64F );
GpuMat dst = getOutputMat(_dst, src.size(), src.type(), stream);
_dst.create(src.size(), src.type());
GpuMat dst = _dst.getGpuMat();
funcs[src.depth()](src.reshape(1), dst.reshape(1), stream);
funcs[depth](src.reshape(1), dst.reshape(1), stream);
syncOutput(dst, _dst, stream);
}
////////////////////////////////////////////////////////////////////////
@ -198,7 +198,7 @@ namespace
__device__ __forceinline__ T operator ()(T x) const
{
exp_func<T> f;
return saturate_cast<T>(f(x));
return cudev::saturate_cast<T>(f(x));
}
};
@ -223,16 +223,15 @@ void cv::cuda::exp(InputArray _src, OutputArray _dst, Stream& stream)
expMat<double>
};
GpuMat src = _src.getGpuMat();
GpuMat src = getInputMat(_src, stream);
const int depth = src.depth();
CV_Assert( src.depth() <= CV_64F );
CV_DbgAssert( depth <= CV_64F );
GpuMat dst = getOutputMat(_dst, src.size(), src.type(), stream);
_dst.create(src.size(), src.type());
GpuMat dst = _dst.getGpuMat();
funcs[src.depth()](src.reshape(1), dst.reshape(1), stream);
funcs[depth](src.reshape(1), dst.reshape(1), stream);
syncOutput(dst, _dst, stream);
}
////////////////////////////////////////////////////////////////////////
@ -261,16 +260,15 @@ void cv::cuda::log(InputArray _src, OutputArray _dst, Stream& stream)
logMat<double>
};
GpuMat src = _src.getGpuMat();
GpuMat src = getInputMat(_src, stream);
const int depth = src.depth();
CV_Assert( src.depth() <= CV_64F );
CV_DbgAssert( depth <= CV_64F );
GpuMat dst = getOutputMat(_dst, src.size(), src.type(), stream);
_dst.create(src.size(), src.type());
GpuMat dst = _dst.getGpuMat();
funcs[src.depth()](src.reshape(1), dst.reshape(1), stream);
funcs[depth](src.reshape(1), dst.reshape(1), stream);
syncOutput(dst, _dst, stream);
}
////////////////////////////////////////////////////////////////////////
@ -284,7 +282,7 @@ namespace
__device__ __forceinline__ T operator()(T e) const
{
return saturate_cast<T>(__powf((float)e, power));
return cudev::saturate_cast<T>(__powf((float)e, power));
}
};
template<typename T> struct PowOp<T, true> : unary_function<T, T>
@ -293,7 +291,7 @@ namespace
__device__ __forceinline__ T operator()(T e) const
{
T res = saturate_cast<T>(__powf((float)e, power));
T res = cudev::saturate_cast<T>(__powf((float)e, power));
if ((e < 0) && (1 & static_cast<int>(power)))
res *= -1;
@ -344,16 +342,15 @@ void cv::cuda::pow(InputArray _src, double power, OutputArray _dst, Stream& stre
powMat<double>
};
GpuMat src = _src.getGpuMat();
GpuMat src = getInputMat(_src, stream);
const int depth = src.depth();
CV_Assert( src.depth() <= CV_64F );
CV_DbgAssert(depth <= CV_64F);
GpuMat dst = getOutputMat(_dst, src.size(), src.type(), stream);
_dst.create(src.size(), src.type());
GpuMat dst = _dst.getGpuMat();
funcs[src.depth()](src.reshape(1), power, dst.reshape(1), stream);
funcs[depth](src.reshape(1), power, dst.reshape(1), stream);
syncOutput(dst, _dst, stream);
}
#endif
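Usage note (illustrative, not part of this patch): a sketch of chaining the per-element math functions touched above on a single stream; the helper name, the CV_32FC1 input and the exponent 2.5 are assumptions.

    #include <opencv2/core/cuda.hpp>
    #include <opencv2/cudaarithm.hpp>

    // Sketch: each call allocates or reuses its destination and only the final
    // waitForCompletion blocks the host (log assumes strictly positive input).
    void perElementChain(const cv::cuda::GpuMat& d_src /* CV_32FC1, positive values */)
    {
        cv::cuda::Stream stream;
        cv::cuda::GpuMat d_sqrt, d_log, d_pow;
        cv::cuda::sqrt(d_src, d_sqrt, stream);
        cv::cuda::log(d_src, d_log, stream);
        cv::cuda::pow(d_src, 2.5, d_pow, stream);
        stream.waitForCompletion();
    }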

View File

@ -50,62 +50,140 @@
#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"
#include "opencv2/core/private.cuda.hpp"
using namespace cv;
using namespace cv::cuda;
using namespace cv::cudev;
namespace
{
template <typename T>
void minMaxImpl(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf, double* minVal, double* maxVal)
template <typename T, typename R>
void minMaxImpl(const GpuMat& _src, const GpuMat& mask, GpuMat& _dst, Stream& stream)
{
typedef typename SelectIf<
TypesEquals<T, double>::value,
double,
typename SelectIf<TypesEquals<T, float>::value, float, int>::type
>::type work_type;
const GpuMat_<T>& src = (const GpuMat_<T>&) _src;
GpuMat_<work_type>& buf = (GpuMat_<work_type>&) _buf;
GpuMat_<R>& dst = (GpuMat_<R>&) _dst;
if (mask.empty())
gridFindMinMaxVal(src, buf);
gridFindMinMaxVal(src, dst, stream);
else
gridFindMinMaxVal(src, buf, globPtr<uchar>(mask));
gridFindMinMaxVal(src, dst, globPtr<uchar>(mask), stream);
}
work_type data[2];
buf.download(cv::Mat(1, 2, buf.type(), data));
template <typename T, typename R>
void minMaxImpl(const GpuMat& src, const GpuMat& mask, double* minVal, double* maxVal)
{
BufferPool pool(Stream::Null());
GpuMat buf(pool.getBuffer(1, 2, DataType<R>::type));
if (minVal)
*minVal = data[0];
minMaxImpl<T, R>(src, mask, buf, Stream::Null());
R data[2];
buf.download(Mat(1, 2, buf.type(), data));
if (maxVal)
*maxVal = data[1];
}
}
void cv::cuda::minMax(InputArray _src, double* minVal, double* maxVal, InputArray _mask, GpuMat& buf)
void cv::cuda::findMinMax(InputArray _src, OutputArray _dst, InputArray _mask, Stream& stream)
{
typedef void (*func_t)(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf, double* minVal, double* maxVal);
typedef void (*func_t)(const GpuMat& _src, const GpuMat& mask, GpuMat& _dst, Stream& stream);
static const func_t funcs[] =
{
minMaxImpl<uchar>,
minMaxImpl<schar>,
minMaxImpl<ushort>,
minMaxImpl<short>,
minMaxImpl<int>,
minMaxImpl<float>,
minMaxImpl<double>
minMaxImpl<uchar, int>,
minMaxImpl<schar, int>,
minMaxImpl<ushort, int>,
minMaxImpl<short, int>,
minMaxImpl<int, int>,
minMaxImpl<float, float>,
minMaxImpl<double, double>
};
GpuMat src = _src.getGpuMat();
GpuMat mask = _mask.getGpuMat();
const GpuMat src = getInputMat(_src, stream);
const GpuMat mask = getInputMat(_mask, stream);
CV_Assert( src.channels() == 1 );
CV_DbgAssert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) );
CV_Assert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) );
const int src_depth = src.depth();
const int dst_depth = src_depth < CV_32F ? CV_32S : src_depth;
GpuMat dst = getOutputMat(_dst, 1, 2, dst_depth, stream);
const func_t func = funcs[src.depth()];
func(src, mask, dst, stream);
func(src, mask, buf, minVal, maxVal);
syncOutput(dst, _dst, stream);
}
void cv::cuda::minMax(InputArray _src, double* minVal, double* maxVal, InputArray _mask)
{
Stream& stream = Stream::Null();
HostMem dst;
findMinMax(_src, dst, _mask, stream);
stream.waitForCompletion();
double vals[2];
dst.createMatHeader().convertTo(Mat(1, 2, CV_64FC1, &vals[0]), CV_64F);
if (minVal)
*minVal = vals[0];
if (maxVal)
*maxVal = vals[1];
}
namespace cv { namespace cuda { namespace internal {
void findMaxAbs(InputArray _src, OutputArray _dst, InputArray _mask, Stream& stream);
}}}
namespace
{
template <typename T, typename R>
void findMaxAbsImpl(const GpuMat& _src, const GpuMat& mask, GpuMat& _dst, Stream& stream)
{
const GpuMat_<T>& src = (const GpuMat_<T>&) _src;
GpuMat_<R>& dst = (GpuMat_<R>&) _dst;
if (mask.empty())
gridFindMaxVal(abs_(src), dst, stream);
else
gridFindMaxVal(abs_(src), dst, globPtr<uchar>(mask), stream);
}
}
void cv::cuda::internal::findMaxAbs(InputArray _src, OutputArray _dst, InputArray _mask, Stream& stream)
{
typedef void (*func_t)(const GpuMat& _src, const GpuMat& mask, GpuMat& _dst, Stream& stream);
static const func_t funcs[] =
{
findMaxAbsImpl<uchar, int>,
findMaxAbsImpl<schar, int>,
findMaxAbsImpl<ushort, int>,
findMaxAbsImpl<short, int>,
findMaxAbsImpl<int, int>,
findMaxAbsImpl<float, float>,
findMaxAbsImpl<double, double>
};
const GpuMat src = getInputMat(_src, stream);
const GpuMat mask = getInputMat(_mask, stream);
CV_Assert( src.channels() == 1 );
CV_Assert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) );
const int src_depth = src.depth();
const int dst_depth = src_depth < CV_32F ? CV_32S : src_depth;
GpuMat dst = getOutputMat(_dst, 1, 1, dst_depth, stream);
const func_t func = funcs[src.depth()];
func(src, mask, dst, stream);
syncOutput(dst, _dst, stream);
}
#endif
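Usage note (illustrative, not part of this patch): a sketch of the split between the device-side findMinMax and the blocking minMax wrapper above. The helper name and variables are assumptions; the element type of the 1x2 result follows the source depth (CV_32S for integer inputs), as in the code above.

    #include <opencv2/core/cuda.hpp>
    #include <opencv2/cudaarithm.hpp>

    // Sketch: device-side min/max into a 1x2 result row, plus the blocking wrapper.
    void minMaxDemo(const cv::cuda::GpuMat& d_src /* single-channel */)
    {
        // Asynchronous path: the packed {min, max} pair lands in pinned memory.
        cv::cuda::Stream stream;
        cv::cuda::HostMem h_minmax;
        cv::cuda::findMinMax(d_src, h_minmax, cv::noArray(), stream);
        stream.waitForCompletion();

        // Blocking convenience wrapper, same as before but without an explicit buffer.
        double minVal = 0.0, maxVal = 0.0;
        cv::cuda::minMax(d_src, &minVal, &maxVal, cv::noArray());
    }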

View File

@ -50,78 +50,110 @@
#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"
#include "opencv2/core/private.cuda.hpp"
using namespace cv;
using namespace cv::cuda;
using namespace cv::cudev;
namespace
{
template <typename T>
void minMaxLocImpl(const GpuMat& _src, const GpuMat& mask, GpuMat& _valBuf, GpuMat& _locBuf, double* minVal, double* maxVal, cv::Point* minLoc, cv::Point* maxLoc)
template <typename T, typename R>
void minMaxLocImpl(const GpuMat& _src, const GpuMat& mask, GpuMat& _valBuf, GpuMat& _locBuf, Stream& stream)
{
typedef typename SelectIf<
TypesEquals<T, double>::value,
double,
typename SelectIf<TypesEquals<T, float>::value, float, int>::type
>::type work_type;
const GpuMat_<T>& src = (const GpuMat_<T>&) _src;
GpuMat_<work_type>& valBuf = (GpuMat_<work_type>&) _valBuf;
GpuMat_<R>& valBuf = (GpuMat_<R>&) _valBuf;
GpuMat_<int>& locBuf = (GpuMat_<int>&) _locBuf;
if (mask.empty())
gridMinMaxLoc(src, valBuf, locBuf);
gridMinMaxLoc(src, valBuf, locBuf, stream);
else
gridMinMaxLoc(src, valBuf, locBuf, globPtr<uchar>(mask));
cv::Mat_<work_type> h_valBuf;
cv::Mat_<int> h_locBuf;
valBuf.download(h_valBuf);
locBuf.download(h_locBuf);
if (minVal)
*minVal = h_valBuf(0, 0);
if (maxVal)
*maxVal = h_valBuf(1, 0);
if (minLoc)
{
const int idx = h_locBuf(0, 0);
*minLoc = cv::Point(idx % src.cols, idx / src.cols);
}
if (maxLoc)
{
const int idx = h_locBuf(1, 0);
*maxLoc = cv::Point(idx % src.cols, idx / src.cols);
}
gridMinMaxLoc(src, valBuf, locBuf, globPtr<uchar>(mask), stream);
}
}
void cv::cuda::minMaxLoc(InputArray _src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, InputArray _mask, GpuMat& valBuf, GpuMat& locBuf)
void cv::cuda::findMinMaxLoc(InputArray _src, OutputArray _minMaxVals, OutputArray _loc, InputArray _mask, Stream& stream)
{
typedef void (*func_t)(const GpuMat& _src, const GpuMat& mask, GpuMat& _valBuf, GpuMat& _locBuf, double* minVal, double* maxVal, cv::Point* minLoc, cv::Point* maxLoc);
typedef void (*func_t)(const GpuMat& _src, const GpuMat& mask, GpuMat& _valBuf, GpuMat& _locBuf, Stream& stream);
static const func_t funcs[] =
{
minMaxLocImpl<uchar>,
minMaxLocImpl<schar>,
minMaxLocImpl<ushort>,
minMaxLocImpl<short>,
minMaxLocImpl<int>,
minMaxLocImpl<float>,
minMaxLocImpl<double>
minMaxLocImpl<uchar, int>,
minMaxLocImpl<schar, int>,
minMaxLocImpl<ushort, int>,
minMaxLocImpl<short, int>,
minMaxLocImpl<int, int>,
minMaxLocImpl<float, float>,
minMaxLocImpl<double, double>
};
GpuMat src = _src.getGpuMat();
GpuMat mask = _mask.getGpuMat();
const GpuMat src = getInputMat(_src, stream);
const GpuMat mask = getInputMat(_mask, stream);
CV_Assert( src.channels() == 1 );
CV_DbgAssert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) );
CV_Assert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) );
const func_t func = funcs[src.depth()];
const int src_depth = src.depth();
func(src, mask, valBuf, locBuf, minVal, maxVal, minLoc, maxLoc);
BufferPool pool(stream);
GpuMat valBuf(pool.getAllocator());
GpuMat locBuf(pool.getAllocator());
const func_t func = funcs[src_depth];
func(src, mask, valBuf, locBuf, stream);
GpuMat minMaxVals = valBuf.colRange(0, 1);
GpuMat loc = locBuf.colRange(0, 1);
if (_minMaxVals.kind() == _InputArray::CUDA_GPU_MAT)
{
minMaxVals.copyTo(_minMaxVals, stream);
}
else
{
minMaxVals.download(_minMaxVals, stream);
}
if (_loc.kind() == _InputArray::CUDA_GPU_MAT)
{
loc.copyTo(_loc, stream);
}
else
{
loc.download(_loc, stream);
}
}
void cv::cuda::minMaxLoc(InputArray _src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, InputArray _mask)
{
Stream& stream = Stream::Null();
HostMem minMaxVals, locVals;
findMinMaxLoc(_src, minMaxVals, locVals, _mask, stream);
stream.waitForCompletion();
double vals[2];
minMaxVals.createMatHeader().convertTo(Mat(minMaxVals.size(), CV_64FC1, &vals[0]), CV_64F);
int locs[2];
locVals.createMatHeader().copyTo(Mat(locVals.size(), CV_32SC1, &locs[0]));
Size size = _src.size();
cv::Point locs2D[] = {
cv::Point(locs[0] % size.width, locs[0] / size.width),
cv::Point(locs[1] % size.width, locs[1] / size.width),
};
if (minVal)
*minVal = vals[0];
if (maxVal)
*maxVal = vals[1];
if (minLoc)
*minLoc = locs2D[0];
if (maxLoc)
*maxLoc = locs2D[1];
}
#endif
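Usage note (illustrative, not part of this patch): a sketch of the reworked minMaxLoc pair above; names are assumed placeholders.

    #include <opencv2/core/cuda.hpp>
    #include <opencv2/cudaarithm.hpp>

    // Sketch: the blocking wrapper already converts the packed device results
    // back to values and 2D points, so typical call sites stay simple.
    void locateExtrema(const cv::cuda::GpuMat& d_src /* single-channel */)
    {
        double minVal = 0.0, maxVal = 0.0;
        cv::Point minLoc, maxLoc;
        cv::cuda::minMaxLoc(d_src, &minVal, &maxVal, &minLoc, &maxLoc, cv::noArray());

        // Asynchronous variant: values and flat indices stay in pinned memory until
        // the stream completes; index-to-Point conversion is up to the caller.
        cv::cuda::Stream stream;
        cv::cuda::HostMem h_vals, h_locs;
        cv::cuda::findMinMaxLoc(d_src, h_vals, h_locs, cv::noArray(), stream);
        stream.waitForCompletion();
    }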

View File

@ -50,7 +50,10 @@
#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"
#include "opencv2/core/private.cuda.hpp"
using namespace cv;
using namespace cv::cuda;
using namespace cv::cudev;
//////////////////////////////////////////////////////////////////////////////
@ -120,33 +123,33 @@ void cv::cuda::mulSpectrums(InputArray _src1, InputArray _src2, OutputArray _dst
{
(void) flags;
GpuMat src1 = _src1.getGpuMat();
GpuMat src2 = _src2.getGpuMat();
GpuMat src1 = getInputMat(_src1, stream);
GpuMat src2 = getInputMat(_src2, stream);
CV_Assert( src1.type() == src2.type() && src1.type() == CV_32FC2 );
CV_Assert( src1.size() == src2.size() );
_dst.create(src1.size(), CV_32FC2);
GpuMat dst = _dst.getGpuMat();
GpuMat dst = getOutputMat(_dst, src1.size(), CV_32FC2, stream);
if (conjB)
gridTransformBinary(globPtr<float2>(src1), globPtr<float2>(src2), globPtr<float2>(dst), comlex_mul_conj(), stream);
else
gridTransformBinary(globPtr<float2>(src1), globPtr<float2>(src2), globPtr<float2>(dst), comlex_mul(), stream);
syncOutput(dst, _dst, stream);
}
void cv::cuda::mulAndScaleSpectrums(InputArray _src1, InputArray _src2, OutputArray _dst, int flags, float scale, bool conjB, Stream& stream)
{
(void) flags;
GpuMat src1 = _src1.getGpuMat();
GpuMat src2 = _src2.getGpuMat();
GpuMat src1 = getInputMat(_src1, stream);
GpuMat src2 = getInputMat(_src2, stream);
CV_Assert( src1.type() == src2.type() && src1.type() == CV_32FC2);
CV_Assert( src1.size() == src2.size() );
_dst.create(src1.size(), CV_32FC2);
GpuMat dst = _dst.getGpuMat();
GpuMat dst = getOutputMat(_dst, src1.size(), CV_32FC2, stream);
if (conjB)
{
@ -160,6 +163,8 @@ void cv::cuda::mulAndScaleSpectrums(InputArray _src1, InputArray _src2, OutputAr
op.scale = scale;
gridTransformBinary(globPtr<float2>(src1), globPtr<float2>(src2), globPtr<float2>(dst), op, stream);
}
syncOutput(dst, _dst, stream);
}
#endif

View File

@ -50,70 +50,140 @@
#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"
#include "opencv2/core/private.cuda.hpp"
using namespace cv;
using namespace cv::cuda;
using namespace cv::cudev;
namespace
{
double normDiffInf(const GpuMat& _src1, const GpuMat& _src2, GpuMat& _buf)
void normDiffInf(const GpuMat& _src1, const GpuMat& _src2, GpuMat& _dst, Stream& stream)
{
const GpuMat_<uchar>& src1 = (const GpuMat_<uchar>&) _src1;
const GpuMat_<uchar>& src2 = (const GpuMat_<uchar>&) _src2;
GpuMat_<int>& buf = (GpuMat_<int>&) _buf;
GpuMat_<int>& dst = (GpuMat_<int>&) _dst;
gridFindMinMaxVal(abs_(cvt_<int>(src1) - cvt_<int>(src2)), buf);
int data[2];
buf.download(cv::Mat(1, 2, buf.type(), data));
return data[1];
gridFindMaxVal(abs_(cvt_<int>(src1) - cvt_<int>(src2)), dst, stream);
}
double normDiffL1(const GpuMat& _src1, const GpuMat& _src2, GpuMat& _buf)
void normDiffL1(const GpuMat& _src1, const GpuMat& _src2, GpuMat& _dst, Stream& stream)
{
const GpuMat_<uchar>& src1 = (const GpuMat_<uchar>&) _src1;
const GpuMat_<uchar>& src2 = (const GpuMat_<uchar>&) _src2;
GpuMat_<int>& buf = (GpuMat_<int>&) _buf;
GpuMat_<int>& dst = (GpuMat_<int>&) _dst;
gridCalcSum(abs_(cvt_<int>(src1) - cvt_<int>(src2)), buf);
int data;
buf.download(cv::Mat(1, 1, buf.type(), &data));
return data;
gridCalcSum(abs_(cvt_<int>(src1) - cvt_<int>(src2)), dst, stream);
}
double normDiffL2(const GpuMat& _src1, const GpuMat& _src2, GpuMat& _buf)
void normDiffL2(const GpuMat& _src1, const GpuMat& _src2, GpuMat& _dst, Stream& stream)
{
const GpuMat_<uchar>& src1 = (const GpuMat_<uchar>&) _src1;
const GpuMat_<uchar>& src2 = (const GpuMat_<uchar>&) _src2;
GpuMat_<double>& buf = (GpuMat_<double>&) _buf;
GpuMat_<double>& dst = (GpuMat_<double>&) _dst;
gridCalcSum(sqr_(cvt_<double>(src1) - cvt_<double>(src2)), buf);
BufferPool pool(stream);
GpuMat_<double> buf(1, 1, pool.getAllocator());
double data;
buf.download(cv::Mat(1, 1, buf.type(), &data));
return std::sqrt(data);
gridCalcSum(sqr_(cvt_<double>(src1) - cvt_<double>(src2)), buf, stream);
gridTransformUnary(buf, dst, sqrt_func<double>(), stream);
}
}
double cv::cuda::norm(InputArray _src1, InputArray _src2, GpuMat& buf, int normType)
void cv::cuda::calcNormDiff(InputArray _src1, InputArray _src2, OutputArray _dst, int normType, Stream& stream)
{
typedef double (*func_t)(const GpuMat& _src1, const GpuMat& _src2, GpuMat& _buf);
typedef void (*func_t)(const GpuMat& _src1, const GpuMat& _src2, GpuMat& _dst, Stream& stream);
static const func_t funcs[] =
{
0, normDiffInf, normDiffL1, 0, normDiffL2
};
GpuMat src1 = _src1.getGpuMat();
GpuMat src2 = _src2.getGpuMat();
GpuMat src1 = getInputMat(_src1, stream);
GpuMat src2 = getInputMat(_src2, stream);
CV_Assert( src1.type() == CV_8UC1 );
CV_Assert( src1.size() == src2.size() && src1.type() == src2.type() );
CV_Assert( normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 );
return funcs[normType](src1, src2, buf);
GpuMat dst = getOutputMat(_dst, 1, 1, normType == NORM_L2 ? CV_64FC1 : CV_32SC1, stream);
const func_t func = funcs[normType];
func(src1, src2, dst, stream);
syncOutput(dst, _dst, stream);
}
double cv::cuda::norm(InputArray _src1, InputArray _src2, int normType)
{
Stream& stream = Stream::Null();
HostMem dst;
calcNormDiff(_src1, _src2, dst, normType, stream);
stream.waitForCompletion();
double val;
dst.createMatHeader().convertTo(Mat(1, 1, CV_64FC1, &val), CV_64F);
return val;
}
namespace cv { namespace cuda { namespace internal {
void normL2(cv::InputArray _src, cv::OutputArray _dst, cv::InputArray _mask, Stream& stream);
}}}
namespace
{
template <typename T, typename R>
void normL2Impl(const GpuMat& _src, const GpuMat& mask, GpuMat& _dst, Stream& stream)
{
const GpuMat_<T>& src = (const GpuMat_<T>&) _src;
GpuMat_<R>& dst = (GpuMat_<R>&) _dst;
BufferPool pool(stream);
GpuMat_<double> buf(1, 1, pool.getAllocator());
if (mask.empty())
{
gridCalcSum(sqr_(cvt_<double>(src)), buf, stream);
}
else
{
gridCalcSum(sqr_(cvt_<double>(src)), buf, globPtr<uchar>(mask), stream);
}
gridTransformUnary(buf, dst, sqrt_func<double>(), stream);
}
}
void cv::cuda::internal::normL2(InputArray _src, OutputArray _dst, InputArray _mask, Stream& stream)
{
typedef void (*func_t)(const GpuMat& _src, const GpuMat& mask, GpuMat& _dst, Stream& stream);
static const func_t funcs[] =
{
normL2Impl<uchar, double>,
normL2Impl<schar, double>,
normL2Impl<ushort, double>,
normL2Impl<short, double>,
normL2Impl<int, double>,
normL2Impl<float, double>,
normL2Impl<double, double>
};
const GpuMat src = getInputMat(_src, stream);
const GpuMat mask = getInputMat(_mask, stream);
CV_Assert( src.channels() == 1 );
CV_Assert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) );
GpuMat dst = getOutputMat(_dst, 1, 1, CV_64FC1, stream);
const func_t func = funcs[src.depth()];
func(src, mask, dst, stream);
syncOutput(dst, _dst, stream);
}
#endif
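Usage note (illustrative, not part of this patch): a sketch of computing the distance between two CV_8UC1 images with the blocking norm wrapper and the asynchronous calcNormDiff shown above; the helper name and inputs are assumptions.

    #include <opencv2/core/cuda.hpp>
    #include <opencv2/cudaarithm.hpp>

    // Sketch: L2 distance between two CV_8UC1 images, blocking and asynchronous.
    double imageDistance(const cv::cuda::GpuMat& d_a, const cv::cuda::GpuMat& d_b)
    {
        // Blocking wrapper, no explicit buffer any more.
        double l2 = cv::cuda::norm(d_a, d_b, cv::NORM_L2);

        // Asynchronous form: the scalar result lands in pinned memory.
        // NORM_L1/NORM_INF results are CV_32S, NORM_L2 is CV_64F (see above).
        cv::cuda::Stream stream;
        cv::cuda::HostMem h_res;
        cv::cuda::calcNormDiff(d_a, d_b, h_res, cv::NORM_L1, stream);
        stream.waitForCompletion();

        return l2;
    }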

View File

@ -0,0 +1,290 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "opencv2/opencv_modules.hpp"
#ifndef HAVE_OPENCV_CUDEV
#error "opencv_cudev is required"
#else
#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"
#include "opencv2/core/private.cuda.hpp"
using namespace cv;
using namespace cv::cuda;
using namespace cv::cudev;
namespace {
template <typename T, typename R, typename I>
struct ConvertorMinMax : unary_function<T, R>
{
typedef typename LargerType<T, R>::type larger_type1;
typedef typename LargerType<larger_type1, I>::type larger_type2;
typedef typename LargerType<larger_type2, float>::type scalar_type;
scalar_type dmin, dmax;
const I* minMaxVals;
__device__ R operator ()(typename TypeTraits<T>::parameter_type src) const
{
const scalar_type smin = minMaxVals[0];
const scalar_type smax = minMaxVals[1];
const scalar_type scale = (dmax - dmin) * (smax - smin > numeric_limits<scalar_type>::epsilon() ? 1.0 / (smax - smin) : 0.0);
const scalar_type shift = dmin - smin * scale;
return cudev::saturate_cast<R>(scale * src + shift);
}
};
template <typename T, typename R, typename I>
void normalizeMinMax(const GpuMat& _src, GpuMat& _dst, double a, double b, const GpuMat& mask, Stream& stream)
{
const GpuMat_<T>& src = (const GpuMat_<T>&)_src;
GpuMat_<R>& dst = (GpuMat_<R>&)_dst;
BufferPool pool(stream);
GpuMat_<I> minMaxVals(1, 2, pool.getAllocator());
if (mask.empty())
{
gridFindMinMaxVal(src, minMaxVals, stream);
}
else
{
gridFindMinMaxVal(src, minMaxVals, globPtr<uchar>(mask), stream);
}
ConvertorMinMax<T, R, I> cvt;
cvt.dmin = std::min(a, b);
cvt.dmax = std::max(a, b);
cvt.minMaxVals = minMaxVals[0];
if (mask.empty())
{
gridTransformUnary(src, dst, cvt, stream);
}
else
{
dst.setTo(Scalar::all(0), stream);
gridTransformUnary(src, dst, cvt, globPtr<uchar>(mask), stream);
}
}
template <typename T, typename R, typename I, bool normL2>
struct ConvertorNorm : unary_function<T, R>
{
typedef typename LargerType<T, R>::type larger_type1;
typedef typename LargerType<larger_type1, I>::type larger_type2;
typedef typename LargerType<larger_type2, float>::type scalar_type;
scalar_type a;
const I* normVal;
__device__ R operator ()(typename TypeTraits<T>::parameter_type src) const
{
sqrt_func<scalar_type> sqrt;
scalar_type scale = normL2 ? sqrt(*normVal) : *normVal;
scale = scale > numeric_limits<scalar_type>::epsilon() ? a / scale : 0.0;
return cudev::saturate_cast<R>(scale * src);
}
};
template <typename T, typename R, typename I>
void normalizeNorm(const GpuMat& _src, GpuMat& _dst, double a, int normType, const GpuMat& mask, Stream& stream)
{
const GpuMat_<T>& src = (const GpuMat_<T>&)_src;
GpuMat_<R>& dst = (GpuMat_<R>&)_dst;
BufferPool pool(stream);
GpuMat_<I> normVal(1, 1, pool.getAllocator());
if (normType == NORM_L1)
{
if (mask.empty())
{
gridCalcSum(abs_(cvt_<I>(src)), normVal, stream);
}
else
{
gridCalcSum(abs_(cvt_<I>(src)), normVal, globPtr<uchar>(mask), stream);
}
}
else if (normType == NORM_L2)
{
if (mask.empty())
{
gridCalcSum(sqr_(cvt_<I>(src)), normVal, stream);
}
else
{
gridCalcSum(sqr_(cvt_<I>(src)), normVal, globPtr<uchar>(mask), stream);
}
}
else // NORM_INF
{
if (mask.empty())
{
gridFindMaxVal(abs_(cvt_<I>(src)), normVal, stream);
}
else
{
gridFindMaxVal(abs_(cvt_<I>(src)), normVal, globPtr<uchar>(mask), stream);
}
}
if (normType == NORM_L2)
{
ConvertorNorm<T, R, I, true> cvt;
cvt.a = a;
cvt.normVal = normVal[0];
if (mask.empty())
{
gridTransformUnary(src, dst, cvt, stream);
}
else
{
dst.setTo(Scalar::all(0), stream);
gridTransformUnary(src, dst, cvt, globPtr<uchar>(mask), stream);
}
}
else
{
ConvertorNorm<T, R, I, false> cvt;
cvt.a = a;
cvt.normVal = normVal[0];
if (mask.empty())
{
gridTransformUnary(src, dst, cvt, stream);
}
else
{
dst.setTo(Scalar::all(0), stream);
gridTransformUnary(src, dst, cvt, globPtr<uchar>(mask), stream);
}
}
}
} // namespace
void cv::cuda::normalize(InputArray _src, OutputArray _dst, double a, double b, int normType, int dtype, InputArray _mask, Stream& stream)
{
typedef void (*func_minmax_t)(const GpuMat& _src, GpuMat& _dst, double a, double b, const GpuMat& mask, Stream& stream);
typedef void (*func_norm_t)(const GpuMat& _src, GpuMat& _dst, double a, int normType, const GpuMat& mask, Stream& stream);
static const func_minmax_t funcs_minmax[] =
{
normalizeMinMax<uchar, float, float>,
normalizeMinMax<schar, float, float>,
normalizeMinMax<ushort, float, float>,
normalizeMinMax<short, float, float>,
normalizeMinMax<int, float, float>,
normalizeMinMax<float, float, float>,
normalizeMinMax<double, double, double>
};
static const func_norm_t funcs_norm[] =
{
normalizeNorm<uchar, float, float>,
normalizeNorm<schar, float, float>,
normalizeNorm<ushort, float, float>,
normalizeNorm<short, float, float>,
normalizeNorm<int, float, float>,
normalizeNorm<float, float, float>,
normalizeNorm<double, double, double>
};
CV_Assert( normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 || normType == NORM_MINMAX );
const GpuMat src = getInputMat(_src, stream);
const GpuMat mask = getInputMat(_mask, stream);
CV_Assert( src.channels() == 1 );
CV_Assert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) );
dtype = CV_MAT_DEPTH(dtype);
const int src_depth = src.depth();
const int tmp_depth = src_depth <= CV_32F ? CV_32F : src_depth;
GpuMat dst;
if (dtype == tmp_depth)
{
_dst.create(src.size(), tmp_depth);
dst = getOutputMat(_dst, src.size(), tmp_depth, stream);
}
else
{
BufferPool pool(stream);
dst = pool.getBuffer(src.size(), tmp_depth);
}
if (normType == NORM_MINMAX)
{
const func_minmax_t func = funcs_minmax[src_depth];
func(src, dst, a, b, mask, stream);
}
else
{
const func_norm_t func = funcs_norm[src_depth];
func(src, dst, a, normType, mask, stream);
}
if (dtype == tmp_depth)
{
syncOutput(dst, _dst, stream);
}
else
{
dst.convertTo(_dst, dtype, stream);
}
}
#endif
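Usage note (illustrative, not part of this patch): a sketch of calling the new stream-aware normalize shown above; the helper name and the choice of CV_32F output are assumptions.

    #include <opencv2/core/cuda.hpp>
    #include <opencv2/cudaarithm.hpp>

    // Sketch: min-max normalization of a single-channel image into CV_32FC1 [0, 1],
    // with an optional mask, entirely on a user-provided stream.
    void normalizeToUnit(const cv::cuda::GpuMat& d_src, const cv::cuda::GpuMat& d_mask,
                         cv::cuda::GpuMat& d_dst, cv::cuda::Stream& stream)
    {
        cv::cuda::normalize(d_src, d_dst, 0.0, 1.0, cv::NORM_MINMAX, CV_32F, d_mask, stream);
    }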

View File

@ -50,55 +50,59 @@
#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"
#include "opencv2/core/private.cuda.hpp"
using namespace cv;
using namespace cv::cuda;
using namespace cv::cudev;
void cv::cuda::magnitude(InputArray _x, InputArray _y, OutputArray _dst, Stream& stream)
{
GpuMat x = _x.getGpuMat();
GpuMat y = _y.getGpuMat();
GpuMat x = getInputMat(_x, stream);
GpuMat y = getInputMat(_y, stream);
CV_DbgAssert( x.depth() == CV_32F );
CV_DbgAssert( y.type() == x.type() && y.size() == x.size() );
CV_Assert( x.depth() == CV_32F );
CV_Assert( y.type() == x.type() && y.size() == x.size() );
_dst.create(x.size(), CV_32FC1);
GpuMat dst = _dst.getGpuMat();
GpuMat dst = getOutputMat(_dst, x.size(), CV_32FC1, stream);
GpuMat_<float> xc(x.reshape(1));
GpuMat_<float> yc(y.reshape(1));
GpuMat_<float> magc(dst.reshape(1));
gridTransformBinary(xc, yc, magc, magnitude_func<float>(), stream);
syncOutput(dst, _dst, stream);
}
void cv::cuda::magnitudeSqr(InputArray _x, InputArray _y, OutputArray _dst, Stream& stream)
{
GpuMat x = _x.getGpuMat();
GpuMat y = _y.getGpuMat();
GpuMat x = getInputMat(_x, stream);
GpuMat y = getInputMat(_y, stream);
CV_DbgAssert( x.depth() == CV_32F );
CV_DbgAssert( y.type() == x.type() && y.size() == x.size() );
CV_Assert( x.depth() == CV_32F );
CV_Assert( y.type() == x.type() && y.size() == x.size() );
_dst.create(x.size(), CV_32FC1);
GpuMat dst = _dst.getGpuMat();
GpuMat dst = getOutputMat(_dst, x.size(), CV_32FC1, stream);
GpuMat_<float> xc(x.reshape(1));
GpuMat_<float> yc(y.reshape(1));
GpuMat_<float> magc(dst.reshape(1));
gridTransformBinary(xc, yc, magc, magnitude_sqr_func<float>(), stream);
syncOutput(dst, _dst, stream);
}
void cv::cuda::phase(InputArray _x, InputArray _y, OutputArray _dst, bool angleInDegrees, Stream& stream)
{
GpuMat x = _x.getGpuMat();
GpuMat y = _y.getGpuMat();
GpuMat x = getInputMat(_x, stream);
GpuMat y = getInputMat(_y, stream);
CV_DbgAssert( x.depth() == CV_32F );
CV_DbgAssert( y.type() == x.type() && y.size() == x.size() );
CV_Assert( x.depth() == CV_32F );
CV_Assert( y.type() == x.type() && y.size() == x.size() );
_dst.create(x.size(), CV_32FC1);
GpuMat dst = _dst.getGpuMat();
GpuMat dst = getOutputMat(_dst, x.size(), CV_32FC1, stream);
GpuMat_<float> xc(x.reshape(1));
GpuMat_<float> yc(y.reshape(1));
@ -108,21 +112,20 @@ void cv::cuda::phase(InputArray _x, InputArray _y, OutputArray _dst, bool angleI
gridTransformBinary(xc, yc, anglec, direction_func<float, true>(), stream);
else
gridTransformBinary(xc, yc, anglec, direction_func<float, false>(), stream);
syncOutput(dst, _dst, stream);
}
void cv::cuda::cartToPolar(InputArray _x, InputArray _y, OutputArray _mag, OutputArray _angle, bool angleInDegrees, Stream& stream)
{
GpuMat x = _x.getGpuMat();
GpuMat y = _y.getGpuMat();
GpuMat x = getInputMat(_x, stream);
GpuMat y = getInputMat(_y, stream);
CV_DbgAssert( x.depth() == CV_32F );
CV_DbgAssert( y.type() == x.type() && y.size() == x.size() );
CV_Assert( x.depth() == CV_32F );
CV_Assert( y.type() == x.type() && y.size() == x.size() );
_mag.create(x.size(), CV_32FC1);
GpuMat mag = _mag.getGpuMat();
_angle.create(x.size(), CV_32FC1);
GpuMat angle = _angle.getGpuMat();
GpuMat mag = getOutputMat(_mag, x.size(), CV_32FC1, stream);
GpuMat angle = getOutputMat(_angle, x.size(), CV_32FC1, stream);
GpuMat_<float> xc(x.reshape(1));
GpuMat_<float> yc(y.reshape(1));
@ -147,6 +150,9 @@ void cv::cuda::cartToPolar(InputArray _x, InputArray _y, OutputArray _mag, Outpu
binaryTupleAdapter<0, 1>(direction_func<float, false>())),
stream);
}
syncOutput(mag, _mag, stream);
syncOutput(angle, _angle, stream);
}
namespace
@ -173,17 +179,14 @@ namespace
void cv::cuda::polarToCart(InputArray _mag, InputArray _angle, OutputArray _x, OutputArray _y, bool angleInDegrees, Stream& _stream)
{
GpuMat mag = _mag.getGpuMat();
GpuMat angle = _angle.getGpuMat();
GpuMat mag = getInputMat(_mag, _stream);
GpuMat angle = getInputMat(_angle, _stream);
CV_DbgAssert( angle.depth() == CV_32F );
CV_DbgAssert( mag.empty() || (mag.type() == angle.type() && mag.size() == angle.size()) );
CV_Assert( angle.depth() == CV_32F );
CV_Assert( mag.empty() || (mag.type() == angle.type() && mag.size() == angle.size()) );
_x.create(angle.size(), CV_32FC1);
GpuMat x = _x.getGpuMat();
_y.create(angle.size(), CV_32FC1);
GpuMat y = _y.getGpuMat();
GpuMat x = getOutputMat(_x, angle.size(), CV_32FC1, _stream);
GpuMat y = getOutputMat(_y, angle.size(), CV_32FC1, _stream);
GpuMat_<float> xc(x.reshape(1));
GpuMat_<float> yc(y.reshape(1));
@ -204,6 +207,9 @@ void cv::cuda::polarToCart(InputArray _mag, InputArray _angle, OutputArray _x, O
CV_CUDEV_SAFE_CALL( cudaGetLastError() );
syncOutput(x, _x, _stream);
syncOutput(y, _y, _stream);
if (stream == 0)
CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
}
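Usage note (illustrative, not part of this patch): a sketch of the cartToPolar/polarToCart round trip on a single stream, as enabled by the syncOutput changes above; the helper name and CV_32FC1 inputs are assumptions.

    #include <opencv2/core/cuda.hpp>
    #include <opencv2/cudaarithm.hpp>

    // Sketch: gradient components to magnitude/angle and back, all queued on one stream.
    void polarRoundTrip(const cv::cuda::GpuMat& d_x, const cv::cuda::GpuMat& d_y /* CV_32FC1 */)
    {
        cv::cuda::Stream stream;
        cv::cuda::GpuMat d_mag, d_angle, d_x2, d_y2;
        cv::cuda::cartToPolar(d_x, d_y, d_mag, d_angle, /*angleInDegrees=*/true, stream);
        cv::cuda::polarToCart(d_mag, d_angle, d_x2, d_y2, /*angleInDegrees=*/true, stream);
        stream.waitForCompletion();
    }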

View File

@ -50,7 +50,10 @@
#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"
#include "opencv2/core/private.cuda.hpp"
using namespace cv;
using namespace cv::cuda;
using namespace cv::cudev;
namespace
@ -125,7 +128,7 @@ namespace
void cv::cuda::reduce(InputArray _src, OutputArray _dst, int dim, int reduceOp, int dtype, Stream& stream)
{
GpuMat src = _src.getGpuMat();
GpuMat src = getInputMat(_src, stream);
CV_Assert( src.channels() <= 4 );
CV_Assert( dim == 0 || dim == 1 );
@ -134,8 +137,7 @@ void cv::cuda::reduce(InputArray _src, OutputArray _dst, int dim, int reduceOp,
if (dtype < 0)
dtype = src.depth();
_dst.create(1, dim == 0 ? src.cols : src.rows, CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src.channels()));
GpuMat dst = _dst.getGpuMat();
GpuMat dst = getOutputMat(_dst, 1, dim == 0 ? src.cols : src.rows, CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src.channels()), stream);
if (dim == 0)
{
@ -292,6 +294,8 @@ void cv::cuda::reduce(InputArray _src, OutputArray _dst, int dim, int reduceOp,
func(src, dst, reduceOp, stream);
}
syncOutput(dst, _dst, stream);
}
#endif
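Usage note (illustrative, not part of this patch): a sketch of the reduce signature shown above; the per-column sum, CV_32F output and helper name are assumptions.

    #include <opencv2/core/cuda.hpp>
    #include <opencv2/cudaarithm.hpp>

    // Sketch: per-column sums of a CV_32FC1 matrix into a 1 x cols CV_32FC1 row.
    void columnSums(const cv::cuda::GpuMat& d_src, cv::cuda::GpuMat& d_row,
                    cv::cuda::Stream& stream)
    {
        cv::cuda::reduce(d_src, d_row, /*dim=*/0, cv::REDUCE_SUM, CV_32F, stream);
    }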

View File

@ -50,7 +50,10 @@
#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"
#include "opencv2/core/private.cuda.hpp"
using namespace cv;
using namespace cv::cuda;
using namespace cv::cudev;
////////////////////////////////////////////////////////////////////////
@ -92,20 +95,18 @@ namespace
void mergeImpl(const GpuMat* src, size_t n, cv::OutputArray _dst, Stream& stream)
{
CV_DbgAssert( src != 0 );
CV_DbgAssert( n > 0 && n <= 4 );
CV_Assert( src != 0 );
CV_Assert( n > 0 && n <= 4 );
const int depth = src[0].depth();
const cv::Size size = src[0].size();
#ifdef _DEBUG
for (size_t i = 0; i < n; ++i)
{
CV_Assert( src[i].size() == size );
CV_Assert( src[i].depth() == depth );
CV_Assert( src[i].channels() == 1 );
}
#endif
if (n == 1)
{
@ -123,8 +124,7 @@ namespace
const int channels = static_cast<int>(n);
_dst.create(size, CV_MAKE_TYPE(depth, channels));
GpuMat dst = _dst.getGpuMat();
GpuMat dst = getOutputMat(_dst, size, CV_MAKE_TYPE(depth, channels), stream);
const func_t func = funcs[channels - 2][CV_ELEM_SIZE(depth) / 2];
@ -132,6 +132,8 @@ namespace
CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported channel count or data type");
func(src, dst, stream);
syncOutput(dst, _dst, stream);
}
}
}
@ -203,12 +205,12 @@ namespace
{SplitFunc<4, uchar>::call, SplitFunc<4, ushort>::call, SplitFunc<4, int>::call, 0, SplitFunc<4, double>::call}
};
CV_DbgAssert( dst != 0 );
CV_Assert( dst != 0 );
const int depth = src.depth();
const int channels = src.channels();
CV_DbgAssert( channels <= 4 );
CV_Assert( channels <= 4 );
if (channels == 0)
return;
@ -233,13 +235,13 @@ namespace
void cv::cuda::split(InputArray _src, GpuMat* dst, Stream& stream)
{
GpuMat src = _src.getGpuMat();
GpuMat src = getInputMat(_src, stream);
splitImpl(src, dst, stream);
}
void cv::cuda::split(InputArray _src, std::vector<GpuMat>& dst, Stream& stream)
{
GpuMat src = _src.getGpuMat();
GpuMat src = getInputMat(_src, stream);
dst.resize(src.channels());
if (src.channels() > 0)
splitImpl(src, &dst[0], stream);

View File

@ -50,126 +50,153 @@
#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"
#include "opencv2/core/private.cuda.hpp"
using namespace cv;
using namespace cv::cuda;
using namespace cv::cudev;
namespace
{
template <typename T, typename R, int cn>
cv::Scalar sumImpl(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf)
void sumImpl(const GpuMat& _src, GpuMat& _dst, const GpuMat& mask, Stream& stream)
{
typedef typename MakeVec<T, cn>::type src_type;
typedef typename MakeVec<R, cn>::type res_type;
const GpuMat_<src_type>& src = (const GpuMat_<src_type>&) _src;
GpuMat_<res_type>& buf = (GpuMat_<res_type>&) _buf;
GpuMat_<res_type>& dst = (GpuMat_<res_type>&) _dst;
if (mask.empty())
gridCalcSum(src, buf);
gridCalcSum(src, dst, stream);
else
gridCalcSum(src, buf, globPtr<uchar>(mask));
cv::Scalar_<R> res;
cv::Mat res_mat(buf.size(), buf.type(), res.val);
buf.download(res_mat);
return res;
gridCalcSum(src, dst, globPtr<uchar>(mask), stream);
}
template <typename T, typename R, int cn>
cv::Scalar sumAbsImpl(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf)
void sumAbsImpl(const GpuMat& _src, GpuMat& _dst, const GpuMat& mask, Stream& stream)
{
typedef typename MakeVec<T, cn>::type src_type;
typedef typename MakeVec<R, cn>::type res_type;
const GpuMat_<src_type>& src = (const GpuMat_<src_type>&) _src;
GpuMat_<res_type>& buf = (GpuMat_<res_type>&) _buf;
GpuMat_<res_type>& dst = (GpuMat_<res_type>&) _dst;
if (mask.empty())
gridCalcSum(abs_(cvt_<res_type>(src)), buf);
gridCalcSum(abs_(cvt_<res_type>(src)), dst, stream);
else
gridCalcSum(abs_(cvt_<res_type>(src)), buf, globPtr<uchar>(mask));
cv::Scalar_<R> res;
cv::Mat res_mat(buf.size(), buf.type(), res.val);
buf.download(res_mat);
return res;
gridCalcSum(abs_(cvt_<res_type>(src)), dst, globPtr<uchar>(mask), stream);
}
template <typename T, typename R, int cn>
cv::Scalar sumSqrImpl(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf)
void sumSqrImpl(const GpuMat& _src, GpuMat& _dst, const GpuMat& mask, Stream& stream)
{
typedef typename MakeVec<T, cn>::type src_type;
typedef typename MakeVec<R, cn>::type res_type;
const GpuMat_<src_type>& src = (const GpuMat_<src_type>&) _src;
GpuMat_<res_type>& buf = (GpuMat_<res_type>&) _buf;
GpuMat_<res_type>& dst = (GpuMat_<res_type>&) _dst;
if (mask.empty())
gridCalcSum(sqr_(cvt_<res_type>(src)), buf);
gridCalcSum(sqr_(cvt_<res_type>(src)), dst, stream);
else
gridCalcSum(sqr_(cvt_<res_type>(src)), buf, globPtr<uchar>(mask));
cv::Scalar_<R> res;
cv::Mat res_mat(buf.size(), buf.type(), res.val);
buf.download(res_mat);
return res;
gridCalcSum(sqr_(cvt_<res_type>(src)), dst, globPtr<uchar>(mask), stream);
}
}
cv::Scalar cv::cuda::sum(InputArray _src, InputArray _mask, GpuMat& buf)
void cv::cuda::calcSum(InputArray _src, OutputArray _dst, InputArray _mask, Stream& stream)
{
typedef cv::Scalar (*func_t)(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf);
typedef void (*func_t)(const GpuMat& _src, GpuMat& _dst, const GpuMat& mask, Stream& stream);
static const func_t funcs[7][4] =
{
{sumImpl<uchar , uint , 1>, sumImpl<uchar , uint , 2>, sumImpl<uchar , uint , 3>, sumImpl<uchar , uint , 4>},
{sumImpl<schar , int , 1>, sumImpl<schar , int , 2>, sumImpl<schar , int , 3>, sumImpl<schar , int , 4>},
{sumImpl<ushort, uint , 1>, sumImpl<ushort, uint , 2>, sumImpl<ushort, uint , 3>, sumImpl<ushort, uint , 4>},
{sumImpl<short , int , 1>, sumImpl<short , int , 2>, sumImpl<short , int , 3>, sumImpl<short , int , 4>},
{sumImpl<int , int , 1>, sumImpl<int , int , 2>, sumImpl<int , int , 3>, sumImpl<int , int , 4>},
{sumImpl<float , float , 1>, sumImpl<float , float , 2>, sumImpl<float , float , 3>, sumImpl<float , float , 4>},
{sumImpl<uchar , double, 1>, sumImpl<uchar , double, 2>, sumImpl<uchar , double, 3>, sumImpl<uchar , double, 4>},
{sumImpl<schar , double, 1>, sumImpl<schar , double, 2>, sumImpl<schar , double, 3>, sumImpl<schar , double, 4>},
{sumImpl<ushort, double, 1>, sumImpl<ushort, double, 2>, sumImpl<ushort, double, 3>, sumImpl<ushort, double, 4>},
{sumImpl<short , double, 1>, sumImpl<short , double, 2>, sumImpl<short , double, 3>, sumImpl<short , double, 4>},
{sumImpl<int , double, 1>, sumImpl<int , double, 2>, sumImpl<int , double, 3>, sumImpl<int , double, 4>},
{sumImpl<float , double, 1>, sumImpl<float , double, 2>, sumImpl<float , double, 3>, sumImpl<float , double, 4>},
{sumImpl<double, double, 1>, sumImpl<double, double, 2>, sumImpl<double, double, 3>, sumImpl<double, double, 4>}
};
GpuMat src = _src.getGpuMat();
GpuMat mask = _mask.getGpuMat();
const GpuMat src = getInputMat(_src, stream);
const GpuMat mask = getInputMat(_mask, stream);
CV_DbgAssert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
const func_t func = funcs[src.depth()][src.channels() - 1];
const int src_depth = src.depth();
const int channels = src.channels();
return func(src, mask, buf);
GpuMat dst = getOutputMat(_dst, 1, 1, CV_64FC(channels), stream);
const func_t func = funcs[src_depth][channels - 1];
func(src, dst, mask, stream);
syncOutput(dst, _dst, stream);
}
cv::Scalar cv::cuda::absSum(InputArray _src, InputArray _mask, GpuMat& buf)
cv::Scalar cv::cuda::sum(InputArray _src, InputArray _mask)
{
typedef cv::Scalar (*func_t)(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf);
Stream& stream = Stream::Null();
HostMem dst;
calcSum(_src, dst, _mask, stream);
stream.waitForCompletion();
cv::Scalar val;
dst.createMatHeader().convertTo(cv::Mat(dst.size(), CV_64FC(dst.channels()), val.val), CV_64F);
return val;
}
void cv::cuda::calcAbsSum(InputArray _src, OutputArray _dst, InputArray _mask, Stream& stream)
{
typedef void (*func_t)(const GpuMat& _src, GpuMat& _dst, const GpuMat& mask, Stream& stream);
static const func_t funcs[7][4] =
{
{sumAbsImpl<uchar , uint , 1>, sumAbsImpl<uchar , uint , 2>, sumAbsImpl<uchar , uint , 3>, sumAbsImpl<uchar , uint , 4>},
{sumAbsImpl<schar , int , 1>, sumAbsImpl<schar , int , 2>, sumAbsImpl<schar , int , 3>, sumAbsImpl<schar , int , 4>},
{sumAbsImpl<ushort, uint , 1>, sumAbsImpl<ushort, uint , 2>, sumAbsImpl<ushort, uint , 3>, sumAbsImpl<ushort, uint , 4>},
{sumAbsImpl<short , int , 1>, sumAbsImpl<short , int , 2>, sumAbsImpl<short , int , 3>, sumAbsImpl<short , int , 4>},
{sumAbsImpl<int , int , 1>, sumAbsImpl<int , int , 2>, sumAbsImpl<int , int , 3>, sumAbsImpl<int , int , 4>},
{sumAbsImpl<float , float , 1>, sumAbsImpl<float , float , 2>, sumAbsImpl<float , float , 3>, sumAbsImpl<float , float , 4>},
{sumAbsImpl<uchar , double, 1>, sumAbsImpl<uchar , double, 2>, sumAbsImpl<uchar , double, 3>, sumAbsImpl<uchar , double, 4>},
{sumAbsImpl<schar , double, 1>, sumAbsImpl<schar , double, 2>, sumAbsImpl<schar , double, 3>, sumAbsImpl<schar , double, 4>},
{sumAbsImpl<ushort, double, 1>, sumAbsImpl<ushort, double, 2>, sumAbsImpl<ushort, double, 3>, sumAbsImpl<ushort, double, 4>},
{sumAbsImpl<short , double, 1>, sumAbsImpl<short , double, 2>, sumAbsImpl<short , double, 3>, sumAbsImpl<short , double, 4>},
{sumAbsImpl<int , double, 1>, sumAbsImpl<int , double, 2>, sumAbsImpl<int , double, 3>, sumAbsImpl<int , double, 4>},
{sumAbsImpl<float , double, 1>, sumAbsImpl<float , double, 2>, sumAbsImpl<float , double, 3>, sumAbsImpl<float , double, 4>},
{sumAbsImpl<double, double, 1>, sumAbsImpl<double, double, 2>, sumAbsImpl<double, double, 3>, sumAbsImpl<double, double, 4>}
};
GpuMat src = _src.getGpuMat();
GpuMat mask = _mask.getGpuMat();
const GpuMat src = getInputMat(_src, stream);
const GpuMat mask = getInputMat(_mask, stream);
CV_DbgAssert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
const func_t func = funcs[src.depth()][src.channels() - 1];
const int src_depth = src.depth();
const int channels = src.channels();
return func(src, mask, buf);
GpuMat dst = getOutputMat(_dst, 1, 1, CV_64FC(channels), stream);
const func_t func = funcs[src_depth][channels - 1];
func(src, dst, mask, stream);
syncOutput(dst, _dst, stream);
}
cv::Scalar cv::cuda::sqrSum(InputArray _src, InputArray _mask, GpuMat& buf)
cv::Scalar cv::cuda::absSum(InputArray _src, InputArray _mask)
{
typedef cv::Scalar (*func_t)(const GpuMat& _src, const GpuMat& mask, GpuMat& _buf);
Stream& stream = Stream::Null();
HostMem dst;
calcAbsSum(_src, dst, _mask, stream);
stream.waitForCompletion();
cv::Scalar val;
dst.createMatHeader().convertTo(cv::Mat(dst.size(), CV_64FC(dst.channels()), val.val), CV_64F);
return val;
}
void cv::cuda::calcSqrSum(InputArray _src, OutputArray _dst, InputArray _mask, Stream& stream)
{
typedef void (*func_t)(const GpuMat& _src, GpuMat& _dst, const GpuMat& mask, Stream& stream);
static const func_t funcs[7][4] =
{
{sumSqrImpl<uchar , double, 1>, sumSqrImpl<uchar , double, 2>, sumSqrImpl<uchar , double, 3>, sumSqrImpl<uchar , double, 4>},
@ -181,14 +208,35 @@ cv::Scalar cv::cuda::sqrSum(InputArray _src, InputArray _mask, GpuMat& buf)
{sumSqrImpl<double, double, 1>, sumSqrImpl<double, double, 2>, sumSqrImpl<double, double, 3>, sumSqrImpl<double, double, 4>}
};
GpuMat src = _src.getGpuMat();
GpuMat mask = _mask.getGpuMat();
const GpuMat src = getInputMat(_src, stream);
const GpuMat mask = getInputMat(_mask, stream);
CV_DbgAssert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
const func_t func = funcs[src.depth()][src.channels() - 1];
const int src_depth = src.depth();
const int channels = src.channels();
return func(src, mask, buf);
GpuMat dst = getOutputMat(_dst, 1, 1, CV_64FC(channels), stream);
const func_t func = funcs[src_depth][channels - 1];
func(src, dst, mask, stream);
syncOutput(dst, _dst, stream);
}
cv::Scalar cv::cuda::sqrSum(InputArray _src, InputArray _mask)
{
Stream& stream = Stream::Null();
HostMem dst;
calcSqrSum(_src, dst, _mask, stream);
stream.waitForCompletion();
cv::Scalar val;
dst.createMatHeader().convertTo(cv::Mat(dst.size(), CV_64FC(dst.channels()), val.val), CV_64F);
return val;
}
#endif
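Usage note (illustrative, not part of this patch): a sketch of the reworked sum family above; the helper name and variables are assumptions.

    #include <opencv2/core/cuda.hpp>
    #include <opencv2/cudaarithm.hpp>

    // Sketch: the blocking sums keep their Scalar interface, while the calc* variants
    // leave a 1x1 CV_64FC(cn) result on the device (or in pinned memory).
    void sumDemo(const cv::cuda::GpuMat& d_src)
    {
        cv::Scalar s  = cv::cuda::sum(d_src, cv::noArray());
        cv::Scalar sa = cv::cuda::absSum(d_src, cv::noArray());
        cv::Scalar sq = cv::cuda::sqrSum(d_src, cv::noArray());
        (void)s; (void)sa; (void)sq;

        cv::cuda::Stream stream;
        cv::cuda::GpuMat d_sum;                       // result stays on the device
        cv::cuda::calcSum(d_src, d_sum, cv::noArray(), stream);
        stream.waitForCompletion();
    }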

View File

@ -52,6 +52,8 @@
#include "opencv2/cudev.hpp"
#include "opencv2/core/private.cuda.hpp"
using namespace cv;
using namespace cv::cuda;
using namespace cv::cudev;
namespace
@ -95,15 +97,14 @@ namespace
double cv::cuda::threshold(InputArray _src, OutputArray _dst, double thresh, double maxVal, int type, Stream& stream)
{
GpuMat src = _src.getGpuMat();
GpuMat src = getInputMat(_src, stream);
const int depth = src.depth();
CV_DbgAssert( src.channels() == 1 && depth <= CV_64F );
CV_DbgAssert( type <= 4 /*THRESH_TOZERO_INV*/ );
CV_Assert( src.channels() == 1 && depth <= CV_64F );
CV_Assert( type <= 4 /*THRESH_TOZERO_INV*/ );
_dst.create(src.size(), src.type());
GpuMat dst = _dst.getGpuMat();
GpuMat dst = getOutputMat(_dst, src.size(), src.type(), stream);
if (depth == CV_32F && type == 2 /*THRESH_TRUNC*/)
{
@ -142,6 +143,8 @@ double cv::cuda::threshold(InputArray _src, OutputArray _dst, double thresh, dou
funcs[depth](src, dst, thresh, maxVal, type, stream);
}
syncOutput(dst, _dst, stream);
return thresh;
}
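Usage note (illustrative, not part of this patch): a sketch of the stream-aware threshold above; the 128/255 values and helper name are assumptions.

    #include <opencv2/core/cuda.hpp>
    #include <opencv2/cudaarithm.hpp>

    // Sketch: binary threshold on a user stream; the return value echoes 'thresh'.
    void binarize(const cv::cuda::GpuMat& d_gray, cv::cuda::GpuMat& d_bin,
                  cv::cuda::Stream& stream)
    {
        cv::cuda::threshold(d_gray, d_bin, 128.0, 255.0, cv::THRESH_BINARY, stream);
    }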

View File

@ -52,18 +52,19 @@
#include "opencv2/cudev.hpp"
#include "opencv2/core/private.cuda.hpp"
using namespace cv;
using namespace cv::cuda;
using namespace cv::cudev;
void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream)
{
GpuMat src = _src.getGpuMat();
GpuMat src = getInputMat(_src, stream);
const size_t elemSize = src.elemSize();
CV_Assert( elemSize == 1 || elemSize == 4 || elemSize == 8 );
_dst.create( src.cols, src.rows, src.type() );
GpuMat dst = _dst.getGpuMat();
GpuMat dst = getOutputMat(_dst, src.cols, src.rows, src.type(), stream);
if (elemSize == 1)
{
@ -87,6 +88,8 @@ void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream)
{
gridTranspose(globPtr<double>(src), globPtr<double>(dst), stream);
}
syncOutput(dst, _dst, stream);
}
#endif
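Usage note (illustrative, not part of this patch): a sketch of the transpose call above; the helper name is an assumption.

    #include <opencv2/core/cuda.hpp>
    #include <opencv2/cudaarithm.hpp>

    // Sketch: transpose is limited to 1-, 4- or 8-byte elements (see the assert above).
    void transposeOnStream(const cv::cuda::GpuMat& d_src /* e.g. CV_32FC1 */,
                           cv::cuda::GpuMat& d_dst, cv::cuda::Stream& stream)
    {
        cv::cuda::transpose(d_src, d_dst, stream);
    }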

View File

@ -107,11 +107,11 @@ namespace
GpuMat src1;
if (!isScalar1)
src1 = _src1.getGpuMat();
src1 = getInputMat(_src1, stream);
GpuMat src2;
if (!isScalar2)
src2 = _src2.getGpuMat();
src2 = getInputMat(_src2, stream);
Mat scalar;
if (isScalar1)
@ -126,7 +126,7 @@ namespace
scalar.convertTo(Mat_<double>(scalar.rows, scalar.cols, &val[0]), CV_64F);
}
GpuMat mask = _mask.getGpuMat();
GpuMat mask = getInputMat(_mask, stream);
const int sdepth = src1.empty() ? src2.depth() : src1.depth();
const int cn = src1.empty() ? src2.channels() : src1.channels();
@ -147,8 +147,7 @@ namespace
CV_Error(Error::StsUnsupportedFormat, "The device doesn't support double");
}
_dst.create(size, CV_MAKE_TYPE(ddepth, cn));
GpuMat dst = _dst.getGpuMat();
GpuMat dst = getOutputMat(_dst, size, CV_MAKE_TYPE(ddepth, cn), stream);
if (isScalar1)
mat_scalar_func(src2, val, true, dst, mask, scale, stream, op);
@ -156,6 +155,8 @@ namespace
mat_scalar_func(src1, val, false, dst, mask, scale, stream, op);
else
mat_mat_func(src1, src2, dst, mask, scale, stream, op);
syncOutput(dst, _dst, stream);
}
}
@ -196,27 +197,29 @@ void cv::cuda::multiply(InputArray _src1, InputArray _src2, OutputArray _dst, do
{
if (_src1.type() == CV_8UC4 && _src2.type() == CV_32FC1)
{
GpuMat src1 = _src1.getGpuMat();
GpuMat src2 = _src2.getGpuMat();
GpuMat src1 = getInputMat(_src1, stream);
GpuMat src2 = getInputMat(_src2, stream);
CV_Assert( src1.size() == src2.size() );
_dst.create(src1.size(), src1.type());
GpuMat dst = _dst.getGpuMat();
GpuMat dst = getOutputMat(_dst, src1.size(), src1.type(), stream);
mulMat_8uc4_32f(src1, src2, dst, stream);
syncOutput(dst, _dst, stream);
}
else if (_src1.type() == CV_16SC4 && _src2.type() == CV_32FC1)
{
GpuMat src1 = _src1.getGpuMat();
GpuMat src2 = _src2.getGpuMat();
GpuMat src1 = getInputMat(_src1, stream);
GpuMat src2 = getInputMat(_src2, stream);
CV_Assert( src1.size() == src2.size() );
_dst.create(src1.size(), src1.type());
GpuMat dst = _dst.getGpuMat();
GpuMat dst = getOutputMat(_dst, src1.size(), src1.type(), stream);
mulMat_16sc4_32f(src1, src2, dst, stream);
syncOutput(dst, _dst, stream);
}
else
{
@ -237,27 +240,29 @@ void cv::cuda::divide(InputArray _src1, InputArray _src2, OutputArray _dst, doub
{
if (_src1.type() == CV_8UC4 && _src2.type() == CV_32FC1)
{
GpuMat src1 = _src1.getGpuMat();
GpuMat src2 = _src2.getGpuMat();
GpuMat src1 = getInputMat(_src1, stream);
GpuMat src2 = getInputMat(_src2, stream);
CV_Assert( src1.size() == src2.size() );
_dst.create(src1.size(), src1.type());
GpuMat dst = _dst.getGpuMat();
GpuMat dst = getOutputMat(_dst, src1.size(), src1.type(), stream);
divMat_8uc4_32f(src1, src2, dst, stream);
syncOutput(dst, _dst, stream);
}
else if (_src1.type() == CV_16SC4 && _src2.type() == CV_32FC1)
{
GpuMat src1 = _src1.getGpuMat();
GpuMat src2 = _src2.getGpuMat();
GpuMat src1 = getInputMat(_src1, stream);
GpuMat src2 = getInputMat(_src2, stream);
CV_Assert( src1.size() == src2.size() );
_dst.create(src1.size(), src1.type());
GpuMat dst = _dst.getGpuMat();
GpuMat dst = getOutputMat(_dst, src1.size(), src1.type(), stream);
divMat_16sc4_32f(src1, src2, dst, stream);
syncOutput(dst, _dst, stream);
}
else
{
@ -389,15 +394,16 @@ void cv::cuda::rshift(InputArray _src, Scalar_<int> val, OutputArray _dst, Strea
{NppShift<CV_32S, 1, nppiRShiftC_32s_C1R>::call, 0, NppShift<CV_32S, 3, nppiRShiftC_32s_C3R>::call, NppShift<CV_32S, 4, nppiRShiftC_32s_C4R>::call},
};
GpuMat src = _src.getGpuMat();
GpuMat src = getInputMat(_src, stream);
CV_Assert( src.depth() < CV_32F );
CV_Assert( src.channels() == 1 || src.channels() == 3 || src.channels() == 4 );
_dst.create(src.size(), src.type());
GpuMat dst = _dst.getGpuMat();
GpuMat dst = getOutputMat(_dst, src.size(), src.type(), stream);
funcs[src.depth()][src.channels() - 1](src, val, dst, StreamAccessor::getStream(stream));
syncOutput(dst, _dst, stream);
}
void cv::cuda::lshift(InputArray _src, Scalar_<int> val, OutputArray _dst, Stream& stream)
@ -412,15 +418,16 @@ void cv::cuda::lshift(InputArray _src, Scalar_<int> val, OutputArray _dst, Strea
{NppShift<CV_32S, 1, nppiLShiftC_32s_C1R>::call, 0, NppShift<CV_32S, 3, nppiLShiftC_32s_C3R>::call, NppShift<CV_32S, 4, nppiLShiftC_32s_C4R>::call},
};
GpuMat src = _src.getGpuMat();
GpuMat src = getInputMat(_src, stream);
CV_Assert( src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32S );
CV_Assert( src.channels() == 1 || src.channels() == 3 || src.channels() == 4 );
_dst.create(src.size(), src.type());
GpuMat dst = _dst.getGpuMat();
GpuMat dst = getOutputMat(_dst, src.size(), src.type(), stream);
funcs[src.depth()][src.channels() - 1](src, val, dst, StreamAccessor::getStream(stream));
syncOutput(dst, _dst, stream);
}
//////////////////////////////////////////////////////////////////////////////
@ -475,22 +482,24 @@ namespace
void cv::cuda::magnitude(InputArray _src, OutputArray _dst, Stream& stream)
{
GpuMat src = _src.getGpuMat();
GpuMat src = getInputMat(_src, stream);
_dst.create(src.size(), CV_32FC1);
GpuMat dst = _dst.getGpuMat();
GpuMat dst = getOutputMat(_dst, src.size(), CV_32FC1, stream);
npp_magnitude(src, dst, nppiMagnitude_32fc32f_C1R, StreamAccessor::getStream(stream));
syncOutput(dst, _dst, stream);
}
void cv::cuda::magnitudeSqr(InputArray _src, OutputArray _dst, Stream& stream)
{
GpuMat src = _src.getGpuMat();
GpuMat src = getInputMat(_src, stream);
_dst.create(src.size(), CV_32FC1);
GpuMat dst = _dst.getGpuMat();
GpuMat dst = getOutputMat(_dst, src.size(), CV_32FC1, stream);
npp_magnitude(src, dst, nppiMagnitudeSqr_32fc32f_C1R, StreamAccessor::getStream(stream));
syncOutput(dst, _dst, stream);
}
#endif

View File

@ -47,110 +47,106 @@ using namespace cv::cuda;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
double cv::cuda::norm(InputArray, int, InputArray, GpuMat&) { throw_no_cuda(); return 0.0; }
double cv::cuda::norm(InputArray, InputArray, GpuMat&, int) { throw_no_cuda(); return 0.0; }
double cv::cuda::norm(InputArray, int, InputArray) { throw_no_cuda(); return 0.0; }
void cv::cuda::calcNorm(InputArray, OutputArray, int, InputArray, Stream&) { throw_no_cuda(); }
double cv::cuda::norm(InputArray, InputArray, int) { throw_no_cuda(); return 0.0; }
void cv::cuda::calcNormDiff(InputArray, InputArray, OutputArray, int, Stream&) { throw_no_cuda(); }
Scalar cv::cuda::sum(InputArray, InputArray, GpuMat&) { throw_no_cuda(); return Scalar(); }
Scalar cv::cuda::absSum(InputArray, InputArray, GpuMat&) { throw_no_cuda(); return Scalar(); }
Scalar cv::cuda::sqrSum(InputArray, InputArray, GpuMat&) { throw_no_cuda(); return Scalar(); }
Scalar cv::cuda::sum(InputArray, InputArray) { throw_no_cuda(); return Scalar(); }
void cv::cuda::calcSum(InputArray, OutputArray, InputArray, Stream&) { throw_no_cuda(); }
Scalar cv::cuda::absSum(InputArray, InputArray) { throw_no_cuda(); return Scalar(); }
void cv::cuda::calcAbsSum(InputArray, OutputArray, InputArray, Stream&) { throw_no_cuda(); }
Scalar cv::cuda::sqrSum(InputArray, InputArray) { throw_no_cuda(); return Scalar(); }
void cv::cuda::calcSqrSum(InputArray, OutputArray, InputArray, Stream&) { throw_no_cuda(); }
void cv::cuda::minMax(InputArray, double*, double*, InputArray, GpuMat&) { throw_no_cuda(); }
void cv::cuda::minMaxLoc(InputArray, double*, double*, Point*, Point*, InputArray, GpuMat&, GpuMat&) { throw_no_cuda(); }
void cv::cuda::minMax(InputArray, double*, double*, InputArray) { throw_no_cuda(); }
void cv::cuda::findMinMax(InputArray, OutputArray, InputArray, Stream&) { throw_no_cuda(); }
void cv::cuda::minMaxLoc(InputArray, double*, double*, Point*, Point*, InputArray) { throw_no_cuda(); }
void cv::cuda::findMinMaxLoc(InputArray, OutputArray, OutputArray, InputArray, Stream&) { throw_no_cuda(); }
int cv::cuda::countNonZero(InputArray, GpuMat&) { throw_no_cuda(); return 0; }
int cv::cuda::countNonZero(InputArray) { throw_no_cuda(); return 0; }
void cv::cuda::countNonZero(InputArray, OutputArray, Stream&) { throw_no_cuda(); }
void cv::cuda::reduce(InputArray, OutputArray, int, int, int, Stream&) { throw_no_cuda(); }
void cv::cuda::meanStdDev(InputArray, Scalar&, Scalar&, GpuMat&) { throw_no_cuda(); }
void cv::cuda::meanStdDev(InputArray, Scalar&, Scalar&) { throw_no_cuda(); }
void cv::cuda::meanStdDev(InputArray, OutputArray, Stream&) { throw_no_cuda(); }
void cv::cuda::rectStdDev(InputArray, InputArray, OutputArray, Rect, Stream&) { throw_no_cuda(); }
void cv::cuda::normalize(InputArray, OutputArray, double, double, int, int, InputArray, GpuMat&, GpuMat&) { throw_no_cuda(); }
void cv::cuda::normalize(InputArray, OutputArray, double, double, int, int, InputArray, Stream&) { throw_no_cuda(); }
void cv::cuda::integral(InputArray, OutputArray, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::cuda::sqrIntegral(InputArray, OutputArray, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::cuda::integral(InputArray, OutputArray, Stream&) { throw_no_cuda(); }
void cv::cuda::sqrIntegral(InputArray, OutputArray, Stream&) { throw_no_cuda(); }
#else
namespace
{
class DeviceBuffer
{
public:
explicit DeviceBuffer(int count_ = 1) : count(count_)
{
cudaSafeCall( cudaMalloc(&pdev, count * sizeof(double)) );
}
~DeviceBuffer()
{
cudaSafeCall( cudaFree(pdev) );
}
operator double*() {return pdev;}
void download(double* hptr)
{
double hbuf;
cudaSafeCall( cudaMemcpy(&hbuf, pdev, sizeof(double), cudaMemcpyDeviceToHost) );
*hptr = hbuf;
}
void download(double** hptrs)
{
AutoBuffer<double, 2 * sizeof(double)> hbuf(count);
cudaSafeCall( cudaMemcpy((void*)hbuf, pdev, count * sizeof(double), cudaMemcpyDeviceToHost) );
for (int i = 0; i < count; ++i)
*hptrs[i] = hbuf[i];
}
private:
double* pdev;
int count;
};
}
////////////////////////////////////////////////////////////////////////
// norm
double cv::cuda::norm(InputArray _src, int normType, InputArray _mask, GpuMat& buf)
{
GpuMat src = _src.getGpuMat();
GpuMat mask = _mask.getGpuMat();
namespace cv { namespace cuda { namespace internal {
void normL2(cv::InputArray _src, cv::OutputArray _dst, cv::InputArray _mask, Stream& stream);
void findMaxAbs(cv::InputArray _src, cv::OutputArray _dst, cv::InputArray _mask, Stream& stream);
}}}
void cv::cuda::calcNorm(InputArray _src, OutputArray dst, int normType, InputArray mask, Stream& stream)
{
CV_Assert( normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 );
CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size() && src.channels() == 1) );
GpuMat src = getInputMat(_src, stream);
GpuMat src_single_channel = src.reshape(1);
if (normType == NORM_L1)
return cuda::absSum(src_single_channel, mask, buf)[0];
{
calcAbsSum(src_single_channel, dst, mask, stream);
}
else if (normType == NORM_L2)
{
internal::normL2(src_single_channel, dst, mask, stream);
}
else // NORM_INF
{
internal::findMaxAbs(src_single_channel, dst, mask, stream);
}
}
if (normType == NORM_L2)
return std::sqrt(cuda::sqrSum(src_single_channel, mask, buf)[0]);
double cv::cuda::norm(InputArray _src, int normType, InputArray _mask)
{
Stream& stream = Stream::Null();
// NORM_INF
double min_val, max_val;
cuda::minMax(src_single_channel, &min_val, &max_val, mask, buf);
return std::max(std::abs(min_val), std::abs(max_val));
HostMem dst;
calcNorm(_src, dst, normType, _mask, stream);
stream.waitForCompletion();
double val;
dst.createMatHeader().convertTo(Mat(1, 1, CV_64FC1, &val), CV_64F);
return val;
}
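As the wrapper above shows, cv::cuda::norm is now just calcNorm followed by a wait and a conversion of the 1x1 result. A hedged sketch of what the asynchronous entry point buys a caller, batching several reductions on one stream before a single synchronization (batchedL2Norms is an illustrative name, not part of the API):

#include <vector>
#include <opencv2/core/cuda.hpp>
#include <opencv2/cudaarithm.hpp>

std::vector<double> batchedL2Norms(const std::vector<cv::cuda::GpuMat>& images)
{
    cv::cuda::Stream stream;
    std::vector<cv::cuda::HostMem> results(images.size());
    for (size_t i = 0; i < images.size(); ++i)
        cv::cuda::calcNorm(images[i], results[i], cv::NORM_L2, cv::noArray(), stream);

    stream.waitForCompletion();                    // one wait for the whole batch

    std::vector<double> norms(images.size());
    for (size_t i = 0; i < images.size(); ++i)
    {
        cv::Mat header(1, 1, CV_64FC1, &norms[i]); // each result is a 1x1 matrix
        results[i].createMatHeader().convertTo(header, CV_64F);
    }
    return norms;
}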
////////////////////////////////////////////////////////////////////////
// meanStdDev
void cv::cuda::meanStdDev(InputArray _src, Scalar& mean, Scalar& stddev, GpuMat& buf)
void cv::cuda::meanStdDev(InputArray _src, OutputArray _dst, Stream& stream)
{
GpuMat src = _src.getGpuMat();
if (!deviceSupports(FEATURE_SET_COMPUTE_13))
CV_Error(cv::Error::StsNotImplemented, "Not sufficient compute capability");
const GpuMat src = getInputMat(_src, stream);
CV_Assert( src.type() == CV_8UC1 );
if (!deviceSupports(FEATURE_SET_COMPUTE_13))
CV_Error(cv::Error::StsNotImplemented, "Not sufficient compute capability");
GpuMat dst = getOutputMat(_dst, 1, 2, CV_64FC1, stream);
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
DeviceBuffer dbuf(2);
int bufSize;
#if (CUDA_VERSION <= 4020)
nppSafeCall( nppiMeanStdDev8uC1RGetBufferHostSize(sz, &bufSize) );
@ -158,14 +154,30 @@ void cv::cuda::meanStdDev(InputArray _src, Scalar& mean, Scalar& stddev, GpuMat&
nppSafeCall( nppiMeanStdDevGetBufferHostSize_8u_C1R(sz, &bufSize) );
#endif
ensureSizeIsEnough(1, bufSize, CV_8UC1, buf);
BufferPool pool(stream);
GpuMat buf = pool.getBuffer(1, bufSize, CV_8UC1);
nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), sz, buf.ptr<Npp8u>(), dbuf, (double*)dbuf + 1) );
NppStreamHandler h(StreamAccessor::getStream(stream));
cudaSafeCall( cudaDeviceSynchronize() );
nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), sz, buf.ptr<Npp8u>(), dst.ptr<Npp64f>(), dst.ptr<Npp64f>() + 1) );
double* ptrs[2] = {mean.val, stddev.val};
dbuf.download(ptrs);
syncOutput(dst, _dst, stream);
}
void cv::cuda::meanStdDev(InputArray _src, Scalar& mean, Scalar& stddev)
{
Stream& stream = Stream::Null();
HostMem dst;
meanStdDev(_src, dst, stream);
stream.waitForCompletion();
double vals[2];
dst.createMatHeader().copyTo(Mat(1, 2, CV_64FC1, &vals[0]));
mean = Scalar(vals[0]);
stddev = Scalar(vals[1]);
}
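The NPP scratch buffer for meanStdDev is now taken from the stream's BufferPool (the pool.getBuffer call above) rather than from a caller-provided GpuMat. A small sketch of the same allocation pattern in user code; whether the buffer comes from a pre-allocated stack or falls back to the default allocator depends on how the pool is configured, and the work enqueued here is only indicated by a comment:

#include <opencv2/core/cuda.hpp>

void scratchFromPool()
{
    cv::cuda::Stream stream;
    cv::cuda::BufferPool pool(stream);                            // pool bound to this stream
    cv::cuda::GpuMat scratch = pool.getBuffer(1, 4096, CV_8UC1);  // temporary workspace
    // ... enqueue kernels on `stream` that use `scratch` ...
    stream.waitForCompletion();   // keep `scratch` alive until the stream is done
}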
//////////////////////////////////////////////////////////////////////////////
@ -173,13 +185,12 @@ void cv::cuda::meanStdDev(InputArray _src, Scalar& mean, Scalar& stddev, GpuMat&
void cv::cuda::rectStdDev(InputArray _src, InputArray _sqr, OutputArray _dst, Rect rect, Stream& _stream)
{
GpuMat src = _src.getGpuMat();
GpuMat sqr = _sqr.getGpuMat();
GpuMat src = getInputMat(_src, _stream);
GpuMat sqr = getInputMat(_sqr, _stream);
CV_Assert( src.type() == CV_32SC1 && sqr.type() == CV_64FC1 );
_dst.create(src.size(), CV_32FC1);
GpuMat dst = _dst.getGpuMat();
GpuMat dst = getOutputMat(_dst, src.size(), CV_32FC1, _stream);
NppiSize sz;
sz.width = src.cols;
@ -200,45 +211,8 @@ void cv::cuda::rectStdDev(InputArray _src, InputArray _sqr, OutputArray _dst, Re
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
////////////////////////////////////////////////////////////////////////
// normalize
void cv::cuda::normalize(InputArray _src, OutputArray dst, double a, double b, int norm_type, int dtype, InputArray mask, GpuMat& norm_buf, GpuMat& cvt_buf)
{
GpuMat src = _src.getGpuMat();
double scale = 1, shift = 0;
if (norm_type == NORM_MINMAX)
{
double smin = 0, smax = 0;
double dmin = std::min(a, b), dmax = std::max(a, b);
cuda::minMax(src, &smin, &smax, mask, norm_buf);
scale = (dmax - dmin) * (smax - smin > std::numeric_limits<double>::epsilon() ? 1.0 / (smax - smin) : 0.0);
shift = dmin - smin * scale;
}
else if (norm_type == NORM_L2 || norm_type == NORM_L1 || norm_type == NORM_INF)
{
scale = cuda::norm(src, norm_type, mask, norm_buf);
scale = scale > std::numeric_limits<double>::epsilon() ? a / scale : 0.0;
shift = 0;
}
else
{
CV_Error(cv::Error::StsBadArg, "Unknown/unsupported norm type");
}
if (mask.empty())
{
src.convertTo(dst, dtype, scale, shift);
}
else
{
src.convertTo(cvt_buf, dtype, scale, shift);
cvt_buf.copyTo(dst, mask);
}
syncOutput(dst, _dst, _stream);
}
#endif
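With the buffer-taking overload removed, normalize is now driven by the mask and stream arguments alone (see the stub signature near the top of this file). A minimal sketch of the new call, assuming the cudaarithm module; minMaxStretch is an illustrative helper name:

#include <opencv2/core/cuda.hpp>
#include <opencv2/cudaarithm.hpp>

void minMaxStretch(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst,
                   cv::cuda::Stream& stream)
{
    // Stretch src to [0, 255] and convert to 8-bit, all enqueued on `stream`.
    cv::cuda::normalize(src, dst, 0.0, 255.0, cv::NORM_MINMAX, CV_8U,
                        cv::noArray(), stream);
}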


@ -1329,7 +1329,7 @@ CUDA_TEST_P(Divide_Scalar_First, Accuracy)
try
{
cv::cuda::GpuMat dst;
cv::cuda::divide(scale, loadMat(mat), dst, depth.second);
cv::cuda::divide(scale, loadMat(mat), dst, 1.0, depth.second);
}
catch (const cv::Exception& e)
{
@ -1339,7 +1339,7 @@ CUDA_TEST_P(Divide_Scalar_First, Accuracy)
else
{
cv::cuda::GpuMat dst = createMat(size, depth.second, useRoi);
cv::cuda::divide(scale, loadMat(mat, useRoi), dst, depth.second);
cv::cuda::divide(scale, loadMat(mat, useRoi), dst, 1.0, depth.second);
cv::Mat dst_gold;
cv::divide(scale, mat, dst_gold, depth.second);


@ -74,8 +74,27 @@ CUDA_TEST_P(Norm, Accuracy)
cv::Mat src = randomMat(size, depth);
cv::Mat mask = randomMat(size, CV_8UC1, 0, 2);
cv::cuda::GpuMat d_buf;
double val = cv::cuda::norm(loadMat(src, useRoi), normCode, loadMat(mask, useRoi), d_buf);
double val = cv::cuda::norm(loadMat(src, useRoi), normCode, loadMat(mask, useRoi));
double val_gold = cv::norm(src, normCode, mask);
EXPECT_NEAR(val_gold, val, depth < CV_32F ? 0.0 : 1.0);
}
CUDA_TEST_P(Norm, Async)
{
cv::Mat src = randomMat(size, depth);
cv::Mat mask = randomMat(size, CV_8UC1, 0, 2);
cv::cuda::Stream stream;
cv::cuda::HostMem dst;
cv::cuda::calcNorm(loadMat(src, useRoi), dst, normCode, loadMat(mask, useRoi), stream);
stream.waitForCompletion();
double val;
dst.createMatHeader().convertTo(cv::Mat(1, 1, CV_64FC1, &val), CV_64F);
double val_gold = cv::norm(src, normCode, mask);
@ -127,6 +146,27 @@ CUDA_TEST_P(NormDiff, Accuracy)
EXPECT_NEAR(val_gold, val, 0.0);
}
CUDA_TEST_P(NormDiff, Async)
{
cv::Mat src1 = randomMat(size, CV_8UC1);
cv::Mat src2 = randomMat(size, CV_8UC1);
cv::cuda::Stream stream;
cv::cuda::HostMem dst;
cv::cuda::calcNormDiff(loadMat(src1, useRoi), loadMat(src2, useRoi), dst, normCode, stream);
stream.waitForCompletion();
double val;
const cv::Mat val_mat(1, 1, CV_64FC1, &val);
dst.createMatHeader().convertTo(val_mat, CV_64F);
double val_gold = cv::norm(src1, src2, normCode);
EXPECT_NEAR(val_gold, val, 0.0);
}
INSTANTIATE_TEST_CASE_P(CUDA_Arithm, NormDiff, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
@ -247,6 +287,24 @@ CUDA_TEST_P(Sum, Simple)
EXPECT_SCALAR_NEAR(val_gold, val, CV_MAT_DEPTH(type) < CV_32F ? 0.0 : 0.5);
}
CUDA_TEST_P(Sum, Simple_Async)
{
cv::cuda::Stream stream;
cv::cuda::HostMem dst;
cv::cuda::calcSum(loadMat(src, useRoi), dst, cv::noArray(), stream);
stream.waitForCompletion();
cv::Scalar val;
cv::Mat val_mat(dst.size(), CV_64FC(dst.channels()), val.val);
dst.createMatHeader().convertTo(val_mat, CV_64F);
cv::Scalar val_gold = cv::sum(src);
EXPECT_SCALAR_NEAR(val_gold, val, CV_MAT_DEPTH(type) < CV_32F ? 0.0 : 0.5);
}
CUDA_TEST_P(Sum, Abs)
{
cv::Scalar val = cv::cuda::absSum(loadMat(src, useRoi));
@ -256,6 +314,24 @@ CUDA_TEST_P(Sum, Abs)
EXPECT_SCALAR_NEAR(val_gold, val, CV_MAT_DEPTH(type) < CV_32F ? 0.0 : 0.5);
}
CUDA_TEST_P(Sum, Abs_Async)
{
cv::cuda::Stream stream;
cv::cuda::HostMem dst;
cv::cuda::calcAbsSum(loadMat(src, useRoi), dst, cv::noArray(), stream);
stream.waitForCompletion();
cv::Scalar val;
cv::Mat val_mat(dst.size(), CV_64FC(dst.channels()), val.val);
dst.createMatHeader().convertTo(val_mat, CV_64F);
cv::Scalar val_gold = absSumGold(src);
EXPECT_SCALAR_NEAR(val_gold, val, CV_MAT_DEPTH(type) < CV_32F ? 0.0 : 0.5);
}
CUDA_TEST_P(Sum, Sqr)
{
cv::Scalar val = cv::cuda::sqrSum(loadMat(src, useRoi));
@ -265,6 +341,24 @@ CUDA_TEST_P(Sum, Sqr)
EXPECT_SCALAR_NEAR(val_gold, val, CV_MAT_DEPTH(type) < CV_32F ? 0.0 : 0.5);
}
CUDA_TEST_P(Sum, Sqr_Async)
{
cv::cuda::Stream stream;
cv::cuda::HostMem dst;
cv::cuda::calcSqrSum(loadMat(src, useRoi), dst, cv::noArray(), stream);
stream.waitForCompletion();
cv::Scalar val;
cv::Mat val_mat(dst.size(), CV_64FC(dst.channels()), val.val);
dst.createMatHeader().convertTo(val_mat, CV_64F);
cv::Scalar val_gold = sqrSumGold(src);
EXPECT_SCALAR_NEAR(val_gold, val, CV_MAT_DEPTH(type) < CV_32F ? 0.0 : 0.5);
}
INSTANTIATE_TEST_CASE_P(CUDA_Arithm, Sum, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
@ -321,6 +415,28 @@ CUDA_TEST_P(MinMax, WithoutMask)
}
}
CUDA_TEST_P(MinMax, Async)
{
cv::Mat src = randomMat(size, depth);
cv::cuda::Stream stream;
cv::cuda::HostMem dst;
cv::cuda::findMinMax(loadMat(src, useRoi), dst, cv::noArray(), stream);
stream.waitForCompletion();
double vals[2];
const cv::Mat vals_mat(1, 2, CV_64FC1, &vals[0]);
dst.createMatHeader().convertTo(vals_mat, CV_64F);
double minVal_gold, maxVal_gold;
minMaxLocGold(src, &minVal_gold, &maxVal_gold);
EXPECT_DOUBLE_EQ(minVal_gold, vals[0]);
EXPECT_DOUBLE_EQ(maxVal_gold, vals[1]);
}
CUDA_TEST_P(MinMax, WithMask)
{
cv::Mat src = randomMat(size, depth);
@ -471,6 +587,41 @@ CUDA_TEST_P(MinMaxLoc, WithoutMask)
}
}
CUDA_TEST_P(MinMaxLoc, Async)
{
cv::Mat src = randomMat(size, depth);
cv::cuda::Stream stream;
cv::cuda::HostMem minMaxVals, locVals;
cv::cuda::findMinMaxLoc(loadMat(src, useRoi), minMaxVals, locVals, cv::noArray(), stream);
stream.waitForCompletion();
double vals[2];
const cv::Mat vals_mat(2, 1, CV_64FC1, &vals[0]);
minMaxVals.createMatHeader().convertTo(vals_mat, CV_64F);
int locs[2];
const cv::Mat locs_mat(2, 1, CV_32SC1, &locs[0]);
locVals.createMatHeader().copyTo(locs_mat);
cv::Point locs2D[] = {
cv::Point(locs[0] % src.cols, locs[0] / src.cols),
cv::Point(locs[1] % src.cols, locs[1] / src.cols),
};
double minVal_gold, maxVal_gold;
cv::Point minLoc_gold, maxLoc_gold;
minMaxLocGold(src, &minVal_gold, &maxVal_gold, &minLoc_gold, &maxLoc_gold);
EXPECT_DOUBLE_EQ(minVal_gold, vals[0]);
EXPECT_DOUBLE_EQ(maxVal_gold, vals[1]);
expectEqual(src, minLoc_gold, locs2D[0]);
expectEqual(src, maxLoc_gold, locs2D[1]);
}
CUDA_TEST_P(MinMaxLoc, WithMask)
{
cv::Mat src = randomMat(size, depth);
@ -564,6 +715,7 @@ PARAM_TEST_CASE(CountNonZero, cv::cuda::DeviceInfo, cv::Size, MatDepth, UseRoi)
int depth;
bool useRoi;
cv::Mat src;
virtual void SetUp()
{
@ -573,15 +725,14 @@ PARAM_TEST_CASE(CountNonZero, cv::cuda::DeviceInfo, cv::Size, MatDepth, UseRoi)
useRoi = GET_PARAM(3);
cv::cuda::setDevice(devInfo.deviceID());
cv::Mat srcBase = randomMat(size, CV_8U, 0.0, 1.5);
srcBase.convertTo(src, depth);
}
};
CUDA_TEST_P(CountNonZero, Accuracy)
{
cv::Mat srcBase = randomMat(size, CV_8U, 0.0, 1.5);
cv::Mat src;
srcBase.convertTo(src, depth);
if (depth == CV_64F && !supportFeature(devInfo, cv::cuda::NATIVE_DOUBLE))
{
try
@ -603,6 +754,24 @@ CUDA_TEST_P(CountNonZero, Accuracy)
}
}
CUDA_TEST_P(CountNonZero, Async)
{
cv::cuda::Stream stream;
cv::cuda::HostMem dst;
cv::cuda::countNonZero(loadMat(src, useRoi), dst, stream);
stream.waitForCompletion();
int val;
const cv::Mat val_mat(1, 1, CV_32SC1, &val);
dst.createMatHeader().copyTo(val_mat);
int val_gold = cv::countNonZero(src);
ASSERT_EQ(val_gold, val);
}
INSTANTIATE_TEST_CASE_P(CUDA_Arithm, CountNonZero, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
@ -750,7 +919,7 @@ CUDA_TEST_P(Normalize, WithMask)
dst_gold.setTo(cv::Scalar::all(0));
cv::normalize(src, dst_gold, alpha, beta, norm_type, type, mask);
EXPECT_MAT_NEAR(dst_gold, dst, 1e-6);
EXPECT_MAT_NEAR(dst_gold, dst, type < CV_32F ? 1.0 : 1e-4);
}
INSTANTIATE_TEST_CASE_P(CUDA_Arithm, Normalize, testing::Combine(
@ -811,6 +980,28 @@ CUDA_TEST_P(MeanStdDev, Accuracy)
}
}
CUDA_TEST_P(MeanStdDev, Async)
{
cv::Mat src = randomMat(size, CV_8UC1);
cv::cuda::Stream stream;
cv::cuda::HostMem dst;
cv::cuda::meanStdDev(loadMat(src, useRoi), dst, stream);
stream.waitForCompletion();
double vals[2];
dst.createMatHeader().copyTo(cv::Mat(1, 2, CV_64FC1, &vals[0]));
cv::Scalar mean_gold;
cv::Scalar stddev_gold;
cv::meanStdDev(src, mean_gold, stddev_gold);
EXPECT_SCALAR_NEAR(mean_gold, cv::Scalar(vals[0]), 1e-5);
EXPECT_SCALAR_NEAR(stddev_gold, cv::Scalar(vals[1]), 1e-5);
}
INSTANTIATE_TEST_CASE_P(CUDA_Arithm, MeanStdDev, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,


@ -266,7 +266,7 @@ namespace
{
int bgfgClassification(const GpuMat& prevFrame, const GpuMat& curFrame,
const GpuMat& Ftd, const GpuMat& Fbd,
GpuMat& foreground, GpuMat& countBuf,
GpuMat& foreground,
const FGDParams& params, int out_cn)
{
typedef void (*func_t)(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground,
@ -298,7 +298,7 @@ namespace
deltaC, deltaCC, params.alpha2,
params.N1c, params.N1cc, 0);
int count = cuda::countNonZero(foreground, countBuf);
int count = cuda::countNonZero(foreground);
cuda::multiply(foreground, Scalar::all(255), foreground);
@ -605,8 +605,6 @@ namespace
GpuMat hist_;
GpuMat histBuf_;
GpuMat countBuf_;
GpuMat buf_;
GpuMat filterBrd_;
@ -649,7 +647,7 @@ namespace
changeDetection(prevFrame_, curFrame, Ftd_, hist_, histBuf_);
changeDetection(background_, curFrame, Fbd_, hist_, histBuf_);
int FG_pixels_count = bgfgClassification(prevFrame_, curFrame, Ftd_, Fbd_, foreground_, countBuf_, params_, 4);
int FG_pixels_count = bgfgClassification(prevFrame_, curFrame, Ftd_, Fbd_, foreground_, params_, 4);
#ifdef HAVE_OPENCV_CUDAFILTERS
if (params_.perform_morphing > 0)


@ -48,6 +48,7 @@
#endif
#include "opencv2/core/cuda.hpp"
#include "opencv2/features2d.hpp"
#include "opencv2/cudafilters.hpp"
/**
@ -62,262 +63,396 @@ namespace cv { namespace cuda {
//! @addtogroup cudafeatures2d
//! @{
/** @brief Brute-force descriptor matcher.
//
// DescriptorMatcher
//
For each descriptor in the first set, this matcher finds the closest descriptor in the second set
by trying each one. This descriptor matcher supports masking permissible matches between descriptor
sets.
/** @brief Abstract base class for matching keypoint descriptors.
The class BFMatcher_CUDA has an interface similar to the class DescriptorMatcher. It has two groups
of match methods: for matching descriptors of one image with another image or with an image set.
Also, all functions have an alternative to save results either to the GPU memory or to the CPU
memory.
@sa DescriptorMatcher, BFMatcher
It has two groups of match methods: for matching descriptors of an image with another image or with
an image set.
*/
class CV_EXPORTS BFMatcher_CUDA
class CV_EXPORTS DescriptorMatcher : public cv::Algorithm
{
public:
explicit BFMatcher_CUDA(int norm = cv::NORM_L2);
//
// Factories
//
//! Add descriptors to train descriptor collection
void add(const std::vector<GpuMat>& descCollection);
/** @brief Brute-force descriptor matcher.
//! Get train descriptors collection
const std::vector<GpuMat>& getTrainDescriptors() const;
For each descriptor in the first set, this matcher finds the closest descriptor in the second set
by trying each one. This descriptor matcher supports masking permissible matches of descriptor
sets.
//! Clear train descriptors collection
void clear();
@param normType One of NORM_L1, NORM_L2, NORM_HAMMING (L1 and L2 norms are
preferable choices for SIFT and SURF descriptors; NORM_HAMMING should be used with ORB, BRISK and
BRIEF).
*/
static Ptr<DescriptorMatcher> createBFMatcher(int normType = cv::NORM_L2);
//! Return true if there are not train descriptors in collection
bool empty() const;
//
// Utility
//
//! Return true if the matcher supports mask in match methods
bool isMaskSupported() const;
/** @brief Returns true if the descriptor matcher supports masking permissible matches.
*/
virtual bool isMaskSupported() const = 0;
//! Find one best match for each query descriptor
void matchSingle(const GpuMat& query, const GpuMat& train,
GpuMat& trainIdx, GpuMat& distance,
const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null());
//
// Descriptor collection
//
//! Download trainIdx and distance and convert it to CPU vector with DMatch
static void matchDownload(const GpuMat& trainIdx, const GpuMat& distance, std::vector<DMatch>& matches);
//! Convert trainIdx and distance to vector with DMatch
static void matchConvert(const Mat& trainIdx, const Mat& distance, std::vector<DMatch>& matches);
/** @brief Adds descriptors to train a descriptor collection.
//! Find one best match for each query descriptor
void match(const GpuMat& query, const GpuMat& train, std::vector<DMatch>& matches, const GpuMat& mask = GpuMat());
If the collection is not empty, the new descriptors are added to existing train descriptors.
//! Make gpu collection of trains and masks in suitable format for matchCollection function
void makeGpuCollection(GpuMat& trainCollection, GpuMat& maskCollection, const std::vector<GpuMat>& masks = std::vector<GpuMat>());
@param descriptors Descriptors to add. Each descriptors[i] is a set of descriptors from the same
train image.
*/
virtual void add(const std::vector<GpuMat>& descriptors) = 0;
//! Find one best match from train collection for each query descriptor
void matchCollection(const GpuMat& query, const GpuMat& trainCollection,
GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance,
const GpuMat& masks = GpuMat(), Stream& stream = Stream::Null());
/** @brief Returns a constant link to the train descriptor collection.
*/
virtual const std::vector<GpuMat>& getTrainDescriptors() const = 0;
//! Download trainIdx, imgIdx and distance and convert it to vector with DMatch
static void matchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, std::vector<DMatch>& matches);
//! Convert trainIdx, imgIdx and distance to vector with DMatch
static void matchConvert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance, std::vector<DMatch>& matches);
/** @brief Clears the train descriptor collection.
*/
virtual void clear() = 0;
//! Find one best match from train collection for each query descriptor.
void match(const GpuMat& query, std::vector<DMatch>& matches, const std::vector<GpuMat>& masks = std::vector<GpuMat>());
/** @brief Returns true if there are no train descriptors in the collection.
*/
virtual bool empty() const = 0;
//! Find k best matches for each query descriptor (in increasing order of distances)
void knnMatchSingle(const GpuMat& query, const GpuMat& train,
GpuMat& trainIdx, GpuMat& distance, GpuMat& allDist, int k,
const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null());
/** @brief Trains a descriptor matcher.
//! Download trainIdx and distance and convert it to vector with DMatch
//! compactResult is used when mask is not empty. If compactResult is false matches
//! vector will have the same size as queryDescriptors rows. If compactResult is true
//! matches vector will not contain matches for fully masked out query descriptors.
static void knnMatchDownload(const GpuMat& trainIdx, const GpuMat& distance,
std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
//! Convert trainIdx and distance to vector with DMatch
static void knnMatchConvert(const Mat& trainIdx, const Mat& distance,
std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
Trains a descriptor matcher (for example, the flann index). In all methods to match, the method
train() is run every time before matching.
*/
virtual void train() = 0;
//! Find k best matches for each query descriptor (in increasing order of distances).
//! compactResult is used when mask is not empty. If compactResult is false matches
//! vector will have the same size as queryDescriptors rows. If compactResult is true
//! matches vector will not contain matches for fully masked out query descriptors.
void knnMatch(const GpuMat& query, const GpuMat& train,
std::vector< std::vector<DMatch> >& matches, int k, const GpuMat& mask = GpuMat(),
bool compactResult = false);
//
// 1 to 1 match
//
//! Find k best matches from train collection for each query descriptor (in increasing order of distances)
void knnMatch2Collection(const GpuMat& query, const GpuMat& trainCollection,
GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance,
const GpuMat& maskCollection = GpuMat(), Stream& stream = Stream::Null());
/** @brief Finds the best match for each descriptor from a query set (blocking version).
//! Download trainIdx and distance and convert it to vector with DMatch
//! compactResult is used when mask is not empty. If compactResult is false matches
//! vector will have the same size as queryDescriptors rows. If compactResult is true
//! matches vector will not contain matches for fully masked out query descriptors.
//! @see BFMatcher_CUDA::knnMatchDownload
static void knnMatch2Download(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance,
std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
//! Convert trainIdx and distance to vector with DMatch
//! @see BFMatcher_CUDA::knnMatchConvert
static void knnMatch2Convert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance,
std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
@param queryDescriptors Query set of descriptors.
@param trainDescriptors Train set of descriptors. This set is not added to the train descriptors
collection stored in the class object.
@param matches Matches. If a query descriptor is masked out in mask , no match is added for this
descriptor. So, matches size may be smaller than the query descriptors count.
@param mask Mask specifying permissible matches between an input query and train matrices of
descriptors.
//! Find k best matches for each query descriptor (in increasing order of distances).
//! compactResult is used when mask is not empty. If compactResult is false matches
//! vector will have the same size as queryDescriptors rows. If compactResult is true
//! matches vector will not contain matches for fully masked out query descriptors.
void knnMatch(const GpuMat& query, std::vector< std::vector<DMatch> >& matches, int k,
const std::vector<GpuMat>& masks = std::vector<GpuMat>(), bool compactResult = false);
In the first variant of this method, the train descriptors are passed as an input argument. In the
second variant of the method, train descriptors collection that was set by DescriptorMatcher::add is
used. Optional mask (or masks) can be passed to specify which query and training descriptors can be
matched. Namely, queryDescriptors[i] can be matched with trainDescriptors[j] only if
mask.at\<uchar\>(i,j) is non-zero.
*/
virtual void match(InputArray queryDescriptors, InputArray trainDescriptors,
std::vector<DMatch>& matches,
InputArray mask = noArray()) = 0;
//! Find best matches for each query descriptor which have distance less than maxDistance.
//! nMatches.at<int>(0, queryIdx) will contain matches count for queryIdx.
//! note that nMatches can be greater than trainIdx.cols - it means that the matcher didn't find all matches,
//! because it didn't have enough memory.
//! If trainIdx is empty, then trainIdx and distance will be created with size nQuery x max((nTrain / 100), 10),
//! otherwise the user can pass their own allocated trainIdx and distance with size nQuery x nMaxMatches.
//! Matches are not sorted.
void radiusMatchSingle(const GpuMat& query, const GpuMat& train,
GpuMat& trainIdx, GpuMat& distance, GpuMat& nMatches, float maxDistance,
const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null());
/** @overload
*/
virtual void match(InputArray queryDescriptors,
std::vector<DMatch>& matches,
const std::vector<GpuMat>& masks = std::vector<GpuMat>()) = 0;
//! Download trainIdx, nMatches and distance and convert it to vector with DMatch.
//! matches will be sorted in increasing order of distances.
//! compactResult is used when mask is not empty. If compactResult is false matches
//! vector will have the same size as queryDescriptors rows. If compactResult is true
//! matches vector will not contain matches for fully masked out query descriptors.
static void radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& distance, const GpuMat& nMatches,
std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
//! Convert trainIdx, nMatches and distance to vector with DMatch.
static void radiusMatchConvert(const Mat& trainIdx, const Mat& distance, const Mat& nMatches,
std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
/** @brief Finds the best match for each descriptor from a query set (asynchronous version).
//! Find best matches for each query descriptor which have distance less than maxDistance
//! in increasing order of distances).
void radiusMatch(const GpuMat& query, const GpuMat& train,
std::vector< std::vector<DMatch> >& matches, float maxDistance,
const GpuMat& mask = GpuMat(), bool compactResult = false);
@param queryDescriptors Query set of descriptors.
@param trainDescriptors Train set of descriptors. This set is not added to the train descriptors
collection stored in the class object.
@param matches Matches array stored in GPU memory. Internal representation is not defined.
Use DescriptorMatcher::matchConvert method to retrieve results in standard representation.
@param mask Mask specifying permissible matches between an input query and train matrices of
descriptors.
@param stream CUDA stream.
//! Find best matches for each query descriptor which have distance less than maxDistance.
//! If trainIdx is empty, then trainIdx and distance will be created with size nQuery x max((nQuery / 100), 10),
//! otherwise the user can pass their own allocated trainIdx and distance with size nQuery x nMaxMatches.
//! Matches are not sorted.
void radiusMatchCollection(const GpuMat& query, GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, GpuMat& nMatches, float maxDistance,
const std::vector<GpuMat>& masks = std::vector<GpuMat>(), Stream& stream = Stream::Null());
In the first variant of this method, the train descriptors are passed as an input argument. In the
second variant of the method, train descriptors collection that was set by DescriptorMatcher::add is
used. Optional mask (or masks) can be passed to specify which query and training descriptors can be
matched. Namely, queryDescriptors[i] can be matched with trainDescriptors[j] only if
mask.at\<uchar\>(i,j) is non-zero.
*/
virtual void matchAsync(InputArray queryDescriptors, InputArray trainDescriptors,
OutputArray matches,
InputArray mask = noArray(),
Stream& stream = Stream::Null()) = 0;
//! Download trainIdx, imgIdx, nMatches and distance and convert it to vector with DMatch.
//! matches will be sorted in increasing order of distances.
//! compactResult is used when mask is not empty. If compactResult is false matches
//! vector will have the same size as queryDescriptors rows. If compactResult is true
//! matches vector will not contain matches for fully masked out query descriptors.
static void radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, const GpuMat& nMatches,
std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
//! Convert trainIdx, nMatches and distance to vector with DMatch.
static void radiusMatchConvert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance, const Mat& nMatches,
std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
/** @overload
*/
virtual void matchAsync(InputArray queryDescriptors,
OutputArray matches,
const std::vector<GpuMat>& masks = std::vector<GpuMat>(),
Stream& stream = Stream::Null()) = 0;
//! Find best matches from train collection for each query descriptor which have distance less than
//! maxDistance (in increasing order of distances).
void radiusMatch(const GpuMat& query, std::vector< std::vector<DMatch> >& matches, float maxDistance,
const std::vector<GpuMat>& masks = std::vector<GpuMat>(), bool compactResult = false);
/** @brief Converts matches array from internal representation to standard matches vector.
int norm;
The method is supposed to be used with DescriptorMatcher::matchAsync to get final result.
Call this method only after DescriptorMatcher::matchAsync is completed (i.e. after synchronization).
private:
std::vector<GpuMat> trainDescCollection;
@param gpu_matches Matches, returned from DescriptorMatcher::matchAsync.
@param matches Vector of DMatch objects.
*/
virtual void matchConvert(InputArray gpu_matches,
std::vector<DMatch>& matches) = 0;
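Taken together, createBFMatcher, matchAsync and matchConvert replace the old matchSingle/matchDownload pair. A minimal sketch of the new one-to-one matching flow, assuming the query and train descriptors are already on the GPU (bfMatchAsync is an illustrative name):

#include <vector>
#include <opencv2/core/cuda.hpp>
#include <opencv2/cudafeatures2d.hpp>

std::vector<cv::DMatch> bfMatchAsync(const cv::cuda::GpuMat& d_query,
                                     const cv::cuda::GpuMat& d_train)
{
    cv::Ptr<cv::cuda::DescriptorMatcher> matcher =
        cv::cuda::DescriptorMatcher::createBFMatcher(cv::NORM_HAMMING);

    cv::cuda::Stream stream;
    cv::cuda::GpuMat d_matches;                 // internal layout, opaque to the caller
    matcher->matchAsync(d_query, d_train, d_matches, cv::noArray(), stream);
    stream.waitForCompletion();                 // convert only after synchronization

    std::vector<cv::DMatch> matches;
    matcher->matchConvert(d_matches, matches);
    return matches;
}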
//
// knn match
//
/** @brief Finds the k best matches for each descriptor from a query set (blocking version).
@param queryDescriptors Query set of descriptors.
@param trainDescriptors Train set of descriptors. This set is not added to the train descriptors
collection stored in the class object.
@param matches Matches. Each matches[i] is k or less matches for the same query descriptor.
@param k Count of best matches found per each query descriptor or less if a query descriptor has
less than k possible matches in total.
@param mask Mask specifying permissible matches between an input query and train matrices of
descriptors.
@param compactResult Parameter used when the mask (or masks) is not empty. If compactResult is
false, the matches vector has the same size as queryDescriptors rows. If compactResult is true,
the matches vector does not contain matches for fully masked-out query descriptors.
These extended variants of DescriptorMatcher::match methods find several best matches for each query
descriptor. The matches are returned in the distance increasing order. See DescriptorMatcher::match
for the details about query and train descriptors.
*/
virtual void knnMatch(InputArray queryDescriptors, InputArray trainDescriptors,
std::vector<std::vector<DMatch> >& matches,
int k,
InputArray mask = noArray(),
bool compactResult = false) = 0;
/** @overload
*/
virtual void knnMatch(InputArray queryDescriptors,
std::vector<std::vector<DMatch> >& matches,
int k,
const std::vector<GpuMat>& masks = std::vector<GpuMat>(),
bool compactResult = false) = 0;
/** @brief Finds the k best matches for each descriptor from a query set (asynchronous version).
@param queryDescriptors Query set of descriptors.
@param trainDescriptors Train set of descriptors. This set is not added to the train descriptors
collection stored in the class object.
@param matches Matches array stored in GPU memory. Internal representation is not defined.
Use DescriptorMatcher::knnMatchConvert method to retrieve results in standard representation.
@param k Count of best matches found per each query descriptor or less if a query descriptor has
less than k possible matches in total.
@param mask Mask specifying permissible matches between an input query and train matrices of
descriptors.
@param stream CUDA stream.
These extended variants of DescriptorMatcher::matchAsync methods find several best matches for each query
descriptor. The matches are returned in the distance increasing order. See DescriptorMatcher::matchAsync
for the details about query and train descriptors.
*/
virtual void knnMatchAsync(InputArray queryDescriptors, InputArray trainDescriptors,
OutputArray matches,
int k,
InputArray mask = noArray(),
Stream& stream = Stream::Null()) = 0;
/** @overload
*/
virtual void knnMatchAsync(InputArray queryDescriptors,
OutputArray matches,
int k,
const std::vector<GpuMat>& masks = std::vector<GpuMat>(),
Stream& stream = Stream::Null()) = 0;
/** @brief Converts matches array from internal representation to standard matches vector.
The method is supposed to be used with DescriptorMatcher::knnMatchAsync to get final result.
Call this method only after DescriptorMatcher::knnMatchAsync is completed (i.e. after synchronization).
@param gpu_matches Matches, returned from DescriptorMatcher::knnMatchAsync.
@param matches Vector of DMatch objects.
@param compactResult Parameter used when the mask (or masks) is not empty. If compactResult is
false, the matches vector has the same size as queryDescriptors rows. If compactResult is true,
the matches vector does not contain matches for fully masked-out query descriptors.
*/
virtual void knnMatchConvert(InputArray gpu_matches,
std::vector< std::vector<DMatch> >& matches,
bool compactResult = false) = 0;
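The same async/convert split applies to k-nearest matching; a common use is a ratio test on the two best matches, sketched below under the same assumptions as the previous example (ratioTestMatch is an illustrative name):

#include <vector>
#include <opencv2/core/cuda.hpp>
#include <opencv2/cudafeatures2d.hpp>

std::vector<cv::DMatch> ratioTestMatch(cv::cuda::DescriptorMatcher& matcher,
                                       const cv::cuda::GpuMat& d_query,
                                       const cv::cuda::GpuMat& d_train,
                                       float ratio = 0.8f)
{
    cv::cuda::Stream stream;
    cv::cuda::GpuMat d_knn;
    matcher.knnMatchAsync(d_query, d_train, d_knn, 2, cv::noArray(), stream);
    stream.waitForCompletion();

    std::vector< std::vector<cv::DMatch> > knn;
    matcher.knnMatchConvert(d_knn, knn);

    std::vector<cv::DMatch> good;
    for (size_t i = 0; i < knn.size(); ++i)
        if (knn[i].size() == 2 && knn[i][0].distance < ratio * knn[i][1].distance)
            good.push_back(knn[i][0]);           // keep only clearly unambiguous matches
    return good;
}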
//
// radius match
//
/** @brief For each query descriptor, finds the training descriptors not farther than the specified distance (blocking version).
@param queryDescriptors Query set of descriptors.
@param trainDescriptors Train set of descriptors. This set is not added to the train descriptors
collection stored in the class object.
@param matches Found matches.
@param maxDistance Threshold for the distance between matched descriptors. Distance means here
metric distance (e.g. Hamming distance), not the distance between coordinates (which is measured
in Pixels)!
@param mask Mask specifying permissible matches between an input query and train matrices of
descriptors.
@param compactResult Parameter used when the mask (or masks) is not empty. If compactResult is
false, the matches vector has the same size as queryDescriptors rows. If compactResult is true,
the matches vector does not contain matches for fully masked-out query descriptors.
For each query descriptor, the methods find such training descriptors that the distance between the
query descriptor and the training descriptor is equal or smaller than maxDistance. Found matches are
returned in the distance increasing order.
*/
virtual void radiusMatch(InputArray queryDescriptors, InputArray trainDescriptors,
std::vector<std::vector<DMatch> >& matches,
float maxDistance,
InputArray mask = noArray(),
bool compactResult = false) = 0;
/** @overload
*/
virtual void radiusMatch(InputArray queryDescriptors,
std::vector<std::vector<DMatch> >& matches,
float maxDistance,
const std::vector<GpuMat>& masks = std::vector<GpuMat>(),
bool compactResult = false) = 0;
/** @brief For each query descriptor, finds the training descriptors not farther than the specified distance (asynchronous version).
@param queryDescriptors Query set of descriptors.
@param trainDescriptors Train set of descriptors. This set is not added to the train descriptors
collection stored in the class object.
@param matches Matches array stored in GPU memory. Internal representation is not defined.
Use DescriptorMatcher::radiusMatchConvert method to retrieve results in standard representation.
@param maxDistance Threshold for the distance between matched descriptors. Distance means here
metric distance (e.g. Hamming distance), not the distance between coordinates (which is measured
in Pixels)!
@param mask Mask specifying permissible matches between an input query and train matrices of
descriptors.
@param stream CUDA stream.
For each query descriptor, the methods find such training descriptors that the distance between the
query descriptor and the training descriptor is equal or smaller than maxDistance. Found matches are
returned in the distance increasing order.
*/
virtual void radiusMatchAsync(InputArray queryDescriptors, InputArray trainDescriptors,
OutputArray matches,
float maxDistance,
InputArray mask = noArray(),
Stream& stream = Stream::Null()) = 0;
/** @overload
*/
virtual void radiusMatchAsync(InputArray queryDescriptors,
OutputArray matches,
float maxDistance,
const std::vector<GpuMat>& masks = std::vector<GpuMat>(),
Stream& stream = Stream::Null()) = 0;
/** @brief Converts matches array from internal representation to standard matches vector.
The method is supposed to be used with DescriptorMatcher::radiusMatchAsync to get final result.
Call this method only after DescriptorMatcher::radiusMatchAsync is completed (i.e. after synchronization).
@param gpu_matches Matches, returned from DescriptorMatcher::radiusMatchAsync.
@param matches Vector of DMatch objects.
@param compactResult Parameter used when the mask (or masks) is not empty. If compactResult is
false, the matches vector has the same size as queryDescriptors rows. If compactResult is true,
the matches vector does not contain matches for fully masked-out query descriptors.
*/
virtual void radiusMatchConvert(InputArray gpu_matches,
std::vector< std::vector<DMatch> >& matches,
bool compactResult = false) = 0;
};
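Radius matching follows the same three-step pattern. A short sketch, again assuming GPU-resident descriptors and an illustrative helper name:

#include <vector>
#include <opencv2/core/cuda.hpp>
#include <opencv2/cudafeatures2d.hpp>

std::vector< std::vector<cv::DMatch> > matchWithinRadius(cv::cuda::DescriptorMatcher& matcher,
                                                         const cv::cuda::GpuMat& d_query,
                                                         const cv::cuda::GpuMat& d_train,
                                                         float maxDistance)
{
    cv::cuda::Stream stream;
    cv::cuda::GpuMat d_matches;
    matcher.radiusMatchAsync(d_query, d_train, d_matches, maxDistance,
                             cv::noArray(), stream);
    stream.waitForCompletion();

    std::vector< std::vector<cv::DMatch> > matches;   // matches[i]: all train descriptors
    matcher.radiusMatchConvert(d_matches, matches);   // within maxDistance of query i
    return matches;
}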
/** @brief Class used for corner detection using the FAST algorithm.
//
// Feature2DAsync
//
/** @brief Abstract base class for CUDA asynchronous 2D image feature detectors and descriptor extractors.
*/
class CV_EXPORTS FAST_CUDA
class CV_EXPORTS Feature2DAsync
{
public:
virtual ~Feature2DAsync();
/** @brief Detects keypoints in an image.
@param image Image.
@param keypoints The detected keypoints.
@param mask Mask specifying where to look for keypoints (optional). It must be a 8-bit integer
matrix with non-zero values in the region of interest.
@param stream CUDA stream.
*/
virtual void detectAsync(InputArray image,
OutputArray keypoints,
InputArray mask = noArray(),
Stream& stream = Stream::Null());
/** @brief Computes the descriptors for a set of keypoints detected in an image.
@param image Image.
@param keypoints Input collection of keypoints.
@param descriptors Computed descriptors. Row j is the descriptor for j-th keypoint.
@param stream CUDA stream.
*/
virtual void computeAsync(InputArray image,
OutputArray keypoints,
OutputArray descriptors,
Stream& stream = Stream::Null());
/** Detects keypoints and computes the descriptors. */
virtual void detectAndComputeAsync(InputArray image,
InputArray mask,
OutputArray keypoints,
OutputArray descriptors,
bool useProvidedKeypoints = false,
Stream& stream = Stream::Null());
/** Converts keypoints array from internal representation to standard vector. */
virtual void convert(InputArray gpu_keypoints,
std::vector<KeyPoint>& keypoints) = 0;
};
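Any detector derived from Feature2DAsync exposes the same detectAsync/convert pair, so the download-and-unpack step can be deferred to a single synchronization point. A generic sketch of that pattern (detectOnStream is an illustrative name):

#include <vector>
#include <opencv2/core/cuda.hpp>
#include <opencv2/cudafeatures2d.hpp>

std::vector<cv::KeyPoint> detectOnStream(cv::cuda::Feature2DAsync& detector,
                                         const cv::cuda::GpuMat& d_image)
{
    cv::cuda::Stream stream;
    cv::cuda::GpuMat d_keypoints;                          // internal GPU representation
    detector.detectAsync(d_image, d_keypoints, cv::noArray(), stream);
    stream.waitForCompletion();

    std::vector<cv::KeyPoint> keypoints;
    detector.convert(d_keypoints, keypoints);              // unpack to std::vector<KeyPoint>
    return keypoints;
}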
//
// FastFeatureDetector
//
/** @brief Wrapping class for feature detection using the FAST method.
*/
class CV_EXPORTS FastFeatureDetector : public cv::FastFeatureDetector, public Feature2DAsync
{
public:
enum
{
LOCATION_ROW = 0,
RESPONSE_ROW,
ROWS_COUNT
ROWS_COUNT,
FEATURE_SIZE = 7
};
//! all features have same size
static const int FEATURE_SIZE = 7;
static Ptr<FastFeatureDetector> create(int threshold=10,
bool nonmaxSuppression=true,
int type=FastFeatureDetector::TYPE_9_16,
int max_npoints = 5000);
/** @brief Constructor.
@param threshold Threshold on difference between intensity of the central pixel and pixels on a
circle around this pixel.
@param nonmaxSuppression If it is true, non-maximum suppression is applied to detected corners
(keypoints).
@param keypointsRatio Inner buffer size for keypoints store is determined as (keypointsRatio \*
image_width \* image_height).
*/
explicit FAST_CUDA(int threshold, bool nonmaxSuppression = true, double keypointsRatio = 0.05);
/** @brief Finds the keypoints using FAST detector.
@param image Image where keypoints (corners) are detected. Only 8-bit grayscale images are
supported.
@param mask Optional input mask that marks the regions where we should detect features.
@param keypoints The output vector of keypoints. Can be stored both in CPU and GPU memory. For GPU
memory:
- keypoints.ptr\<Vec2s\>(LOCATION_ROW)[i] will contain location of i'th point
- keypoints.ptr\<float\>(RESPONSE_ROW)[i] will contain response of i'th point (if non-maximum
suppression is applied)
*/
void operator ()(const GpuMat& image, const GpuMat& mask, GpuMat& keypoints);
/** @overload */
void operator ()(const GpuMat& image, const GpuMat& mask, std::vector<KeyPoint>& keypoints);
/** @brief Download keypoints from GPU to CPU memory.
*/
static void downloadKeypoints(const GpuMat& d_keypoints, std::vector<KeyPoint>& keypoints);
/** @brief Converts keypoints from CUDA representation to vector of KeyPoint.
*/
static void convertKeypoints(const Mat& h_keypoints, std::vector<KeyPoint>& keypoints);
/** @brief Releases inner buffer memory.
*/
void release();
bool nonmaxSuppression;
int threshold;
//! max keypoints = keypointsRatio * img.size().area()
double keypointsRatio;
/** @brief Find keypoints and compute their response if nonmaxSuppression is true.
@param image Image where keypoints (corners) are detected. Only 8-bit grayscale images are
supported.
@param mask Optional input mask that marks the regions where we should detect features.
The function returns count of detected keypoints.
*/
int calcKeyPointsLocation(const GpuMat& image, const GpuMat& mask);
/** @brief Gets final array of keypoints.
@param keypoints The output vector of keypoints.
The function performs non-max suppression if needed and returns final count of keypoints.
*/
int getKeyPoints(GpuMat& keypoints);
private:
GpuMat kpLoc_;
int count_;
GpuMat score_;
GpuMat d_keypoints_;
virtual void setMaxNumPoints(int max_npoints) = 0;
virtual int getMaxNumPoints() const = 0;
};
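The FAST_CUDA class and its keypointsRatio constructor give way to a create() factory with an explicit keypoint cap. A minimal blocking usage sketch; the threshold and max_npoints values here are arbitrary:

#include <vector>
#include <opencv2/core/cuda.hpp>
#include <opencv2/cudafeatures2d.hpp>

std::vector<cv::KeyPoint> detectFast(const cv::cuda::GpuMat& d_image)
{
    cv::Ptr<cv::cuda::FastFeatureDetector> fast =
        cv::cuda::FastFeatureDetector::create(/*threshold=*/30,
                                              /*nonmaxSuppression=*/true,
                                              cv::FastFeatureDetector::TYPE_9_16,
                                              /*max_npoints=*/5000);
    std::vector<cv::KeyPoint> keypoints;
    fast->detect(d_image, keypoints);   // blocking; internally detectAsync + convert
    return keypoints;
}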
/** @brief Class for extracting ORB features and descriptors from an image.
//
// ORB
//
/** @brief Class implementing the ORB (*oriented BRIEF*) keypoint detector and descriptor extractor
*
* @sa cv::ORB
*/
class CV_EXPORTS ORB_CUDA
class CV_EXPORTS ORB : public cv::ORB, public Feature2DAsync
{
public:
enum
@ -331,113 +466,20 @@ public:
ROWS_COUNT
};
enum
{
DEFAULT_FAST_THRESHOLD = 20
};
/** @brief Constructor.
@param nFeatures The number of desired features.
@param scaleFactor Coefficient by which we divide the dimensions from one scale pyramid level to
the next.
@param nLevels The number of levels in the scale pyramid.
@param edgeThreshold How far from the boundary the points should be.
@param firstLevel The level at which the image is given. If 1, that means we will also look at the
image scaleFactor times bigger.
@param WTA_K
@param scoreType
@param patchSize
*/
explicit ORB_CUDA(int nFeatures = 500, float scaleFactor = 1.2f, int nLevels = 8, int edgeThreshold = 31,
int firstLevel = 0, int WTA_K = 2, int scoreType = 0, int patchSize = 31);
/** @overload */
void operator()(const GpuMat& image, const GpuMat& mask, std::vector<KeyPoint>& keypoints);
/** @overload */
void operator()(const GpuMat& image, const GpuMat& mask, GpuMat& keypoints);
/** @brief Detects keypoints and computes descriptors for them.
@param image Input 8-bit grayscale image.
@param mask Optional input mask that marks the regions where we should detect features.
@param keypoints The input/output vector of keypoints. Can be stored both in CPU and GPU memory.
For GPU memory:
- keypoints.ptr\<float\>(X_ROW)[i] contains x coordinate of the i'th feature.
- keypoints.ptr\<float\>(Y_ROW)[i] contains y coordinate of the i'th feature.
- keypoints.ptr\<float\>(RESPONSE_ROW)[i] contains the response of the i'th feature.
- keypoints.ptr\<float\>(ANGLE_ROW)[i] contains orientation of the i'th feature.
- keypoints.ptr\<float\>(OCTAVE_ROW)[i] contains the octave of the i'th feature.
- keypoints.ptr\<float\>(SIZE_ROW)[i] contains the size of the i'th feature.
@param descriptors Computed descriptors. if blurForDescriptor is true, image will be blurred
before descriptors calculation.
*/
void operator()(const GpuMat& image, const GpuMat& mask, std::vector<KeyPoint>& keypoints, GpuMat& descriptors);
/** @overload */
void operator()(const GpuMat& image, const GpuMat& mask, GpuMat& keypoints, GpuMat& descriptors);
/** @brief Download keypoints from GPU to CPU memory.
*/
static void downloadKeyPoints(const GpuMat& d_keypoints, std::vector<KeyPoint>& keypoints);
/** @brief Converts keypoints from CUDA representation to vector of KeyPoint.
*/
static void convertKeyPoints(const Mat& d_keypoints, std::vector<KeyPoint>& keypoints);
//! returns the descriptor size in bytes
inline int descriptorSize() const { return kBytes; }
inline void setFastParams(int threshold, bool nonmaxSuppression = true)
{
fastDetector_.threshold = threshold;
fastDetector_.nonmaxSuppression = nonmaxSuppression;
}
/** @brief Releases inner buffer memory.
*/
void release();
static Ptr<ORB> create(int nfeatures=500,
float scaleFactor=1.2f,
int nlevels=8,
int edgeThreshold=31,
int firstLevel=0,
int WTA_K=2,
int scoreType=ORB::HARRIS_SCORE,
int patchSize=31,
int fastThreshold=20,
bool blurForDescriptor=false);
//! if true, image will be blurred before descriptors calculation
bool blurForDescriptor;
private:
enum { kBytes = 32 };
void buildScalePyramids(const GpuMat& image, const GpuMat& mask);
void computeKeyPointsPyramid();
void computeDescriptors(GpuMat& descriptors);
void mergeKeyPoints(GpuMat& keypoints);
int nFeatures_;
float scaleFactor_;
int nLevels_;
int edgeThreshold_;
int firstLevel_;
int WTA_K_;
int scoreType_;
int patchSize_;
//! The number of desired features per scale
std::vector<size_t> n_features_per_level_;
//! Points to compute BRIEF descriptors from
GpuMat pattern_;
std::vector<GpuMat> imagePyr_;
std::vector<GpuMat> maskPyr_;
GpuMat buf_;
std::vector<GpuMat> keyPointsPyr_;
std::vector<int> keyPointsCount_;
FAST_CUDA fastDetector_;
Ptr<cuda::Filter> blurFilter;
GpuMat d_keypoints_;
virtual void setBlurForDescriptor(bool blurForDescriptor) = 0;
virtual bool getBlurForDescriptor() const = 0;
};
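ORB_CUDA is likewise replaced by a create() factory plus the Feature2DAsync interface; the perf test changes further down use the same calls. A short sketch of detecting and describing on a stream (extractORB is an illustrative name):

#include <vector>
#include <opencv2/core/cuda.hpp>
#include <opencv2/cudafeatures2d.hpp>

void extractORB(const cv::cuda::GpuMat& d_image,
                std::vector<cv::KeyPoint>& keypoints,
                cv::Mat& descriptors)
{
    cv::Ptr<cv::cuda::ORB> orb = cv::cuda::ORB::create(500);   // up to 500 features

    cv::cuda::Stream stream;
    cv::cuda::GpuMat d_keypoints, d_descriptors;
    orb->detectAndComputeAsync(d_image, cv::noArray(), d_keypoints, d_descriptors,
                               /*useProvidedKeypoints=*/false, stream);
    stream.waitForCompletion();

    orb->convert(d_keypoints, keypoints);    // unpack keypoints on the host
    d_descriptors.download(descriptors);     // one row per keypoint
}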
//! @}


@ -64,15 +64,18 @@ PERF_TEST_P(Image_Threshold_NonMaxSuppression, FAST,
if (PERF_RUN_CUDA())
{
cv::cuda::FAST_CUDA d_fast(threshold, nonMaxSuppersion, 0.5);
cv::Ptr<cv::cuda::FastFeatureDetector> d_fast =
cv::cuda::FastFeatureDetector::create(threshold, nonMaxSuppersion,
cv::FastFeatureDetector::TYPE_9_16,
0.5 * img.size().area());
const cv::cuda::GpuMat d_img(img);
cv::cuda::GpuMat d_keypoints;
TEST_CYCLE() d_fast(d_img, cv::cuda::GpuMat(), d_keypoints);
TEST_CYCLE() d_fast->detectAsync(d_img, d_keypoints);
std::vector<cv::KeyPoint> gpu_keypoints;
d_fast.downloadKeypoints(d_keypoints, gpu_keypoints);
d_fast->convert(d_keypoints, gpu_keypoints);
sortKeyPoints(gpu_keypoints);
@ -106,15 +109,15 @@ PERF_TEST_P(Image_NFeatures, ORB,
if (PERF_RUN_CUDA())
{
cv::cuda::ORB_CUDA d_orb(nFeatures);
cv::Ptr<cv::cuda::ORB> d_orb = cv::cuda::ORB::create(nFeatures);
const cv::cuda::GpuMat d_img(img);
cv::cuda::GpuMat d_keypoints, d_descriptors;
TEST_CYCLE() d_orb(d_img, cv::cuda::GpuMat(), d_keypoints, d_descriptors);
TEST_CYCLE() d_orb->detectAndComputeAsync(d_img, cv::noArray(), d_keypoints, d_descriptors);
std::vector<cv::KeyPoint> gpu_keypoints;
d_orb.downloadKeyPoints(d_keypoints, gpu_keypoints);
d_orb->convert(d_keypoints, gpu_keypoints);
cv::Mat gpu_descriptors(d_descriptors);
@ -164,16 +167,16 @@ PERF_TEST_P(DescSize_Norm, BFMatch,
if (PERF_RUN_CUDA())
{
cv::cuda::BFMatcher_CUDA d_matcher(normType);
cv::Ptr<cv::cuda::DescriptorMatcher> d_matcher = cv::cuda::DescriptorMatcher::createBFMatcher(normType);
const cv::cuda::GpuMat d_query(query);
const cv::cuda::GpuMat d_train(train);
cv::cuda::GpuMat d_trainIdx, d_distance;
cv::cuda::GpuMat d_matches;
TEST_CYCLE() d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance);
TEST_CYCLE() d_matcher->matchAsync(d_query, d_train, d_matches);
std::vector<cv::DMatch> gpu_matches;
d_matcher.matchDownload(d_trainIdx, d_distance, gpu_matches);
d_matcher->matchConvert(d_matches, gpu_matches);
SANITY_CHECK_MATCHES(gpu_matches);
}
@ -223,16 +226,16 @@ PERF_TEST_P(DescSize_K_Norm, BFKnnMatch,
if (PERF_RUN_CUDA())
{
cv::cuda::BFMatcher_CUDA d_matcher(normType);
cv::Ptr<cv::cuda::DescriptorMatcher> d_matcher = cv::cuda::DescriptorMatcher::createBFMatcher(normType);
const cv::cuda::GpuMat d_query(query);
const cv::cuda::GpuMat d_train(train);
cv::cuda::GpuMat d_trainIdx, d_distance, d_allDist;
cv::cuda::GpuMat d_matches;
TEST_CYCLE() d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, k);
TEST_CYCLE() d_matcher->knnMatchAsync(d_query, d_train, d_matches, k);
std::vector< std::vector<cv::DMatch> > matchesTbl;
d_matcher.knnMatchDownload(d_trainIdx, d_distance, matchesTbl);
d_matcher->knnMatchConvert(d_matches, matchesTbl);
std::vector<cv::DMatch> gpu_matches;
toOneRowMatches(matchesTbl, gpu_matches);
@ -277,16 +280,16 @@ PERF_TEST_P(DescSize_Norm, BFRadiusMatch,
if (PERF_RUN_CUDA())
{
cv::cuda::BFMatcher_CUDA d_matcher(normType);
cv::Ptr<cv::cuda::DescriptorMatcher> d_matcher = cv::cuda::DescriptorMatcher::createBFMatcher(normType);
const cv::cuda::GpuMat d_query(query);
const cv::cuda::GpuMat d_train(train);
cv::cuda::GpuMat d_trainIdx, d_nMatches, d_distance;
cv::cuda::GpuMat d_matches;
TEST_CYCLE() d_matcher.radiusMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_nMatches, maxDistance);
TEST_CYCLE() d_matcher->radiusMatchAsync(d_query, d_train, d_matches, maxDistance);
std::vector< std::vector<cv::DMatch> > matchesTbl;
d_matcher.radiusMatchDownload(d_trainIdx, d_distance, d_nMatches, matchesTbl);
d_matcher->radiusMatchConvert(d_matches, matchesTbl);
std::vector<cv::DMatch> gpu_matches;
toOneRowMatches(matchesTbl, gpu_matches);

File diff suppressed because it is too large.


@ -279,7 +279,7 @@ namespace cv { namespace cuda { namespace device
#endif
}
int calcKeypoints_gpu(PtrStepSzb img, PtrStepSzb mask, short2* kpLoc, int maxKeypoints, PtrStepSzi score, int threshold)
int calcKeypoints_gpu(PtrStepSzb img, PtrStepSzb mask, short2* kpLoc, int maxKeypoints, PtrStepSzi score, int threshold, cudaStream_t stream)
{
void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
@ -290,29 +290,29 @@ namespace cv { namespace cuda { namespace device
grid.x = divUp(img.cols - 6, block.x);
grid.y = divUp(img.rows - 6, block.y);
cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) );
cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(unsigned int), stream) );
if (score.data)
{
if (mask.data)
calcKeypoints<true><<<grid, block>>>(img, SingleMask(mask), kpLoc, maxKeypoints, score, threshold);
calcKeypoints<true><<<grid, block, 0, stream>>>(img, SingleMask(mask), kpLoc, maxKeypoints, score, threshold);
else
calcKeypoints<true><<<grid, block>>>(img, WithOutMask(), kpLoc, maxKeypoints, score, threshold);
calcKeypoints<true><<<grid, block, 0, stream>>>(img, WithOutMask(), kpLoc, maxKeypoints, score, threshold);
}
else
{
if (mask.data)
calcKeypoints<false><<<grid, block>>>(img, SingleMask(mask), kpLoc, maxKeypoints, score, threshold);
calcKeypoints<false><<<grid, block, 0, stream>>>(img, SingleMask(mask), kpLoc, maxKeypoints, score, threshold);
else
calcKeypoints<false><<<grid, block>>>(img, WithOutMask(), kpLoc, maxKeypoints, score, threshold);
calcKeypoints<false><<<grid, block, 0, stream>>>(img, WithOutMask(), kpLoc, maxKeypoints, score, threshold);
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
unsigned int count;
cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
cudaSafeCall( cudaMemcpyAsync(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost, stream) );
cudaSafeCall( cudaStreamSynchronize(stream) );
return count;
}
@ -356,7 +356,7 @@ namespace cv { namespace cuda { namespace device
#endif
}
int nonmaxSuppression_gpu(const short2* kpLoc, int count, PtrStepSzi score, short2* loc, float* response)
int nonmaxSuppression_gpu(const short2* kpLoc, int count, PtrStepSzi score, short2* loc, float* response, cudaStream_t stream)
{
void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
@ -366,15 +366,15 @@ namespace cv { namespace cuda { namespace device
dim3 grid;
grid.x = divUp(count, block.x);
cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) );
cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(unsigned int), stream) );
nonmaxSuppression<<<grid, block>>>(kpLoc, count, score, loc, response);
nonmaxSuppression<<<grid, block, 0, stream>>>(kpLoc, count, score, loc, response);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
unsigned int new_count;
cudaSafeCall( cudaMemcpy(&new_count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
cudaSafeCall( cudaMemcpyAsync(&new_count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost, stream) );
cudaSafeCall( cudaStreamSynchronize(stream) );
return new_count;
}
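Both kernels above now use the same recipe once a stream is involved: reset a __device__ counter with cudaMemsetAsync, launch on the stream, copy the counter back with cudaMemcpyAsync, and block only on cudaStreamSynchronize. A generic, stand-alone sketch of that pattern, not taken from OpenCV (error checking omitted; countPositives is an illustrative kernel):

#include <cuda_runtime.h>

__device__ unsigned int g_counter;

__global__ void countPositives(const float* data, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n && data[i] > 0.f)
        atomicAdd(&g_counter, 1u);                      // bump the global counter
}

unsigned int countPositivesOnStream(const float* d_data, int n, cudaStream_t stream)
{
    void* counter_ptr;
    cudaGetSymbolAddress(&counter_ptr, g_counter);
    cudaMemsetAsync(counter_ptr, 0, sizeof(unsigned int), stream);   // reset on the stream

    const int block = 256;
    const int grid = (n + block - 1) / block;
    countPositives<<<grid, block, 0, stream>>>(d_data, n);

    unsigned int count = 0;
    cudaMemcpyAsync(&count, counter_ptr, sizeof(unsigned int),
                    cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream);                      // the only blocking call
    return count;
}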


@ -47,124 +47,162 @@ using namespace cv::cuda;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
cv::cuda::FAST_CUDA::FAST_CUDA(int, bool, double) { throw_no_cuda(); }
void cv::cuda::FAST_CUDA::operator ()(const GpuMat&, const GpuMat&, GpuMat&) { throw_no_cuda(); }
void cv::cuda::FAST_CUDA::operator ()(const GpuMat&, const GpuMat&, std::vector<KeyPoint>&) { throw_no_cuda(); }
void cv::cuda::FAST_CUDA::downloadKeypoints(const GpuMat&, std::vector<KeyPoint>&) { throw_no_cuda(); }
void cv::cuda::FAST_CUDA::convertKeypoints(const Mat&, std::vector<KeyPoint>&) { throw_no_cuda(); }
void cv::cuda::FAST_CUDA::release() { throw_no_cuda(); }
int cv::cuda::FAST_CUDA::calcKeyPointsLocation(const GpuMat&, const GpuMat&) { throw_no_cuda(); return 0; }
int cv::cuda::FAST_CUDA::getKeyPoints(GpuMat&) { throw_no_cuda(); return 0; }
Ptr<cv::cuda::FastFeatureDetector> cv::cuda::FastFeatureDetector::create(int, bool, int, int) { throw_no_cuda(); return Ptr<cv::cuda::FastFeatureDetector>(); }
#else /* !defined (HAVE_CUDA) */
cv::cuda::FAST_CUDA::FAST_CUDA(int _threshold, bool _nonmaxSuppression, double _keypointsRatio) :
nonmaxSuppression(_nonmaxSuppression), threshold(_threshold), keypointsRatio(_keypointsRatio), count_(0)
{
}
void cv::cuda::FAST_CUDA::operator ()(const GpuMat& image, const GpuMat& mask, std::vector<KeyPoint>& keypoints)
{
if (image.empty())
return;
(*this)(image, mask, d_keypoints_);
downloadKeypoints(d_keypoints_, keypoints);
}
void cv::cuda::FAST_CUDA::downloadKeypoints(const GpuMat& d_keypoints, std::vector<KeyPoint>& keypoints)
{
if (d_keypoints.empty())
return;
Mat h_keypoints(d_keypoints);
convertKeypoints(h_keypoints, keypoints);
}
void cv::cuda::FAST_CUDA::convertKeypoints(const Mat& h_keypoints, std::vector<KeyPoint>& keypoints)
{
if (h_keypoints.empty())
return;
CV_Assert(h_keypoints.rows == ROWS_COUNT && h_keypoints.elemSize() == 4);
int npoints = h_keypoints.cols;
keypoints.resize(npoints);
const short2* loc_row = h_keypoints.ptr<short2>(LOCATION_ROW);
const float* response_row = h_keypoints.ptr<float>(RESPONSE_ROW);
for (int i = 0; i < npoints; ++i)
{
KeyPoint kp(loc_row[i].x, loc_row[i].y, static_cast<float>(FEATURE_SIZE), -1, response_row[i]);
keypoints[i] = kp;
}
}
void cv::cuda::FAST_CUDA::operator ()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints)
{
calcKeyPointsLocation(img, mask);
keypoints.cols = getKeyPoints(keypoints);
}
namespace cv { namespace cuda { namespace device
{
namespace fast
{
int calcKeypoints_gpu(PtrStepSzb img, PtrStepSzb mask, short2* kpLoc, int maxKeypoints, PtrStepSzi score, int threshold);
int nonmaxSuppression_gpu(const short2* kpLoc, int count, PtrStepSzi score, short2* loc, float* response);
int calcKeypoints_gpu(PtrStepSzb img, PtrStepSzb mask, short2* kpLoc, int maxKeypoints, PtrStepSzi score, int threshold, cudaStream_t stream);
int nonmaxSuppression_gpu(const short2* kpLoc, int count, PtrStepSzi score, short2* loc, float* response, cudaStream_t stream);
}
}}}
int cv::cuda::FAST_CUDA::calcKeyPointsLocation(const GpuMat& img, const GpuMat& mask)
namespace
{
using namespace cv::cuda::device::fast;
CV_Assert(img.type() == CV_8UC1);
CV_Assert(mask.empty() || (mask.type() == CV_8UC1 && mask.size() == img.size()));
int maxKeypoints = static_cast<int>(keypointsRatio * img.size().area());
ensureSizeIsEnough(1, maxKeypoints, CV_16SC2, kpLoc_);
if (nonmaxSuppression)
class FAST_Impl : public cv::cuda::FastFeatureDetector
{
public:
FAST_Impl(int threshold, bool nonmaxSuppression, int max_npoints);
virtual void detect(InputArray _image, std::vector<KeyPoint>& keypoints, InputArray _mask);
virtual void detectAsync(InputArray _image, OutputArray _keypoints, InputArray _mask, Stream& stream);
virtual void convert(InputArray _gpu_keypoints, std::vector<KeyPoint>& keypoints);
virtual void setThreshold(int threshold) { threshold_ = threshold; }
virtual int getThreshold() const { return threshold_; }
virtual void setNonmaxSuppression(bool f) { nonmaxSuppression_ = f; }
virtual bool getNonmaxSuppression() const { return nonmaxSuppression_; }
virtual void setMaxNumPoints(int max_npoints) { max_npoints_ = max_npoints; }
virtual int getMaxNumPoints() const { return max_npoints_; }
virtual void setType(int type) { CV_Assert( type == TYPE_9_16 ); }
virtual int getType() const { return TYPE_9_16; }
private:
int threshold_;
bool nonmaxSuppression_;
int max_npoints_;
};
FAST_Impl::FAST_Impl(int threshold, bool nonmaxSuppression, int max_npoints) :
threshold_(threshold), nonmaxSuppression_(nonmaxSuppression), max_npoints_(max_npoints)
{
ensureSizeIsEnough(img.size(), CV_32SC1, score_);
score_.setTo(Scalar::all(0));
}
count_ = calcKeypoints_gpu(img, mask, kpLoc_.ptr<short2>(), maxKeypoints, nonmaxSuppression ? score_ : PtrStepSzi(), threshold);
count_ = std::min(count_, maxKeypoints);
void FAST_Impl::detect(InputArray _image, std::vector<KeyPoint>& keypoints, InputArray _mask)
{
if (_image.empty())
{
keypoints.clear();
return;
}
return count_;
BufferPool pool(Stream::Null());
GpuMat d_keypoints = pool.getBuffer(ROWS_COUNT, max_npoints_, CV_16SC2);
detectAsync(_image, d_keypoints, _mask, Stream::Null());
convert(d_keypoints, keypoints);
}
void FAST_Impl::detectAsync(InputArray _image, OutputArray _keypoints, InputArray _mask, Stream& stream)
{
using namespace cv::cuda::device::fast;
const GpuMat img = _image.getGpuMat();
const GpuMat mask = _mask.getGpuMat();
CV_Assert( img.type() == CV_8UC1 );
CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == img.size()) );
BufferPool pool(stream);
GpuMat kpLoc = pool.getBuffer(1, max_npoints_, CV_16SC2);
GpuMat score;
if (nonmaxSuppression_)
{
score = pool.getBuffer(img.size(), CV_32SC1);
score.setTo(Scalar::all(0), stream);
}
int count = calcKeypoints_gpu(img, mask, kpLoc.ptr<short2>(), max_npoints_, score, threshold_, StreamAccessor::getStream(stream));
count = std::min(count, max_npoints_);
if (count == 0)
{
_keypoints.release();
return;
}
ensureSizeIsEnough(ROWS_COUNT, count, CV_32FC1, _keypoints);
GpuMat& keypoints = _keypoints.getGpuMatRef();
if (nonmaxSuppression_)
{
count = nonmaxSuppression_gpu(kpLoc.ptr<short2>(), count, score, keypoints.ptr<short2>(LOCATION_ROW), keypoints.ptr<float>(RESPONSE_ROW), StreamAccessor::getStream(stream));
if (count == 0)
{
keypoints.release();
}
else
{
keypoints.cols = count;
}
}
else
{
GpuMat locRow(1, count, kpLoc.type(), keypoints.ptr(0));
kpLoc.colRange(0, count).copyTo(locRow, stream);
keypoints.row(1).setTo(Scalar::all(0), stream);
}
}
void FAST_Impl::convert(InputArray _gpu_keypoints, std::vector<KeyPoint>& keypoints)
{
if (_gpu_keypoints.empty())
{
keypoints.clear();
return;
}
Mat h_keypoints;
if (_gpu_keypoints.kind() == _InputArray::CUDA_GPU_MAT)
{
_gpu_keypoints.getGpuMat().download(h_keypoints);
}
else
{
h_keypoints = _gpu_keypoints.getMat();
}
CV_Assert( h_keypoints.rows == ROWS_COUNT );
CV_Assert( h_keypoints.elemSize() == 4 );
const int npoints = h_keypoints.cols;
keypoints.resize(npoints);
const short2* loc_row = h_keypoints.ptr<short2>(LOCATION_ROW);
const float* response_row = h_keypoints.ptr<float>(RESPONSE_ROW);
for (int i = 0; i < npoints; ++i)
{
KeyPoint kp(loc_row[i].x, loc_row[i].y, static_cast<float>(FEATURE_SIZE), -1, response_row[i]);
keypoints[i] = kp;
}
}
}
int cv::cuda::FAST_CUDA::getKeyPoints(GpuMat& keypoints)
Ptr<cv::cuda::FastFeatureDetector> cv::cuda::FastFeatureDetector::create(int threshold, bool nonmaxSuppression, int type, int max_npoints)
{
using namespace cv::cuda::device::fast;
if (count_ == 0)
return 0;
ensureSizeIsEnough(ROWS_COUNT, count_, CV_32FC1, keypoints);
if (nonmaxSuppression)
return nonmaxSuppression_gpu(kpLoc_.ptr<short2>(), count_, score_, keypoints.ptr<short2>(LOCATION_ROW), keypoints.ptr<float>(RESPONSE_ROW));
GpuMat locRow(1, count_, kpLoc_.type(), keypoints.ptr(0));
kpLoc_.colRange(0, count_).copyTo(locRow);
keypoints.row(1).setTo(Scalar::all(0));
return count_;
}
void cv::cuda::FAST_CUDA::release()
{
kpLoc_.release();
score_.release();
d_keypoints_.release();
CV_Assert( type == TYPE_9_16 );
return makePtr<FAST_Impl>(threshold, nonmaxSuppression, max_npoints);
}
#endif /* !defined (HAVE_CUDA) */
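
For reference, a minimal usage sketch of the new FastFeatureDetector interface created above (not part of the upstream diff; the header names, image path and threshold value are illustrative assumptions, and error handling is omitted):

#include <opencv2/core/cuda.hpp>
#include <opencv2/cudafeatures2d.hpp>
#include <opencv2/imgcodecs.hpp>

#include <vector>

int main()
{
    cv::Mat h_img = cv::imread("image.png", cv::IMREAD_GRAYSCALE); // assumed to exist
    cv::cuda::GpuMat d_img;
    d_img.upload(h_img);

    cv::Ptr<cv::cuda::FastFeatureDetector> fast =
        cv::cuda::FastFeatureDetector::create(20 /*threshold*/, true /*nonmaxSuppression*/);

    // Synchronous path: keypoints are downloaded and converted internally.
    std::vector<cv::KeyPoint> keypoints;
    fast->detect(d_img, keypoints);

    // Asynchronous path: keypoints stay on the GPU until convert() is called.
    cv::cuda::Stream stream;
    cv::cuda::GpuMat d_keypoints;
    fast->detectAsync(d_img, d_keypoints, cv::noArray(), stream);
    stream.waitForCompletion();
    fast->convert(d_keypoints, keypoints);

    return 0;
}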

View File

@@ -0,0 +1,85 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
cv::cuda::Feature2DAsync::~Feature2DAsync()
{
}
void cv::cuda::Feature2DAsync::detectAsync(InputArray image,
OutputArray keypoints,
InputArray mask,
Stream& stream)
{
if (image.empty())
{
keypoints.clear();
return;
}
detectAndComputeAsync(image, mask, keypoints, noArray(), false, stream);
}
void cv::cuda::Feature2DAsync::computeAsync(InputArray image,
OutputArray keypoints,
OutputArray descriptors,
Stream& stream)
{
if (image.empty())
{
descriptors.release();
return;
}
detectAndComputeAsync(image, noArray(), keypoints, descriptors, true, stream);
}
void cv::cuda::Feature2DAsync::detectAndComputeAsync(InputArray /*image*/,
InputArray /*mask*/,
OutputArray /*keypoints*/,
OutputArray /*descriptors*/,
bool /*useProvidedKeypoints*/,
Stream& /*stream*/)
{
CV_Error(Error::StsNotImplemented, "");
}

View File

@@ -47,18 +47,7 @@ using namespace cv::cuda;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
cv::cuda::ORB_CUDA::ORB_CUDA(int, float, int, int, int, int, int, int) : fastDetector_(20) { throw_no_cuda(); }
void cv::cuda::ORB_CUDA::operator()(const GpuMat&, const GpuMat&, std::vector<KeyPoint>&) { throw_no_cuda(); }
void cv::cuda::ORB_CUDA::operator()(const GpuMat&, const GpuMat&, GpuMat&) { throw_no_cuda(); }
void cv::cuda::ORB_CUDA::operator()(const GpuMat&, const GpuMat&, std::vector<KeyPoint>&, GpuMat&) { throw_no_cuda(); }
void cv::cuda::ORB_CUDA::operator()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&) { throw_no_cuda(); }
void cv::cuda::ORB_CUDA::downloadKeyPoints(const GpuMat&, std::vector<KeyPoint>&) { throw_no_cuda(); }
void cv::cuda::ORB_CUDA::convertKeyPoints(const Mat&, std::vector<KeyPoint>&) { throw_no_cuda(); }
void cv::cuda::ORB_CUDA::release() { throw_no_cuda(); }
void cv::cuda::ORB_CUDA::buildScalePyramids(const GpuMat&, const GpuMat&) { throw_no_cuda(); }
void cv::cuda::ORB_CUDA::computeKeyPointsPyramid() { throw_no_cuda(); }
void cv::cuda::ORB_CUDA::computeDescriptors(GpuMat&) { throw_no_cuda(); }
void cv::cuda::ORB_CUDA::mergeKeyPoints(GpuMat&) { throw_no_cuda(); }
Ptr<cv::cuda::ORB> cv::cuda::ORB::create(int, float, int, int, int, int, int, int, int, bool) { throw_no_cuda(); return Ptr<cv::cuda::ORB>(); }
#else /* !defined (HAVE_CUDA) */
@@ -346,7 +335,100 @@ namespace
-1,-6, 0,-11/*mean (0.127148), correlation (0.547401)*/
};
void initializeOrbPattern(const Point* pattern0, Mat& pattern, int ntuples, int tupleSize, int poolSize)
class ORB_Impl : public cv::cuda::ORB
{
public:
ORB_Impl(int nfeatures,
float scaleFactor,
int nlevels,
int edgeThreshold,
int firstLevel,
int WTA_K,
int scoreType,
int patchSize,
int fastThreshold,
bool blurForDescriptor);
virtual void detectAndCompute(InputArray _image, InputArray _mask, std::vector<KeyPoint>& keypoints, OutputArray _descriptors, bool useProvidedKeypoints);
virtual void detectAndComputeAsync(InputArray _image, InputArray _mask, OutputArray _keypoints, OutputArray _descriptors, bool useProvidedKeypoints, Stream& stream);
virtual void convert(InputArray _gpu_keypoints, std::vector<KeyPoint>& keypoints);
virtual int descriptorSize() const { return kBytes; }
virtual int descriptorType() const { return CV_8U; }
virtual int defaultNorm() const { return NORM_HAMMING; }
virtual void setMaxFeatures(int maxFeatures) { nFeatures_ = maxFeatures; }
virtual int getMaxFeatures() const { return nFeatures_; }
virtual void setScaleFactor(double scaleFactor) { scaleFactor_ = scaleFactor; }
virtual double getScaleFactor() const { return scaleFactor_; }
virtual void setNLevels(int nlevels) { nLevels_ = nlevels; }
virtual int getNLevels() const { return nLevels_; }
virtual void setEdgeThreshold(int edgeThreshold) { edgeThreshold_ = edgeThreshold; }
virtual int getEdgeThreshold() const { return edgeThreshold_; }
virtual void setFirstLevel(int firstLevel) { firstLevel_ = firstLevel; }
virtual int getFirstLevel() const { return firstLevel_; }
virtual void setWTA_K(int wta_k) { WTA_K_ = wta_k; }
virtual int getWTA_K() const { return WTA_K_; }
virtual void setScoreType(int scoreType) { scoreType_ = scoreType; }
virtual int getScoreType() const { return scoreType_; }
virtual void setPatchSize(int patchSize) { patchSize_ = patchSize; }
virtual int getPatchSize() const { return patchSize_; }
virtual void setFastThreshold(int fastThreshold) { fastThreshold_ = fastThreshold; }
virtual int getFastThreshold() const { return fastThreshold_; }
virtual void setBlurForDescriptor(bool blurForDescriptor) { blurForDescriptor_ = blurForDescriptor; }
virtual bool getBlurForDescriptor() const { return blurForDescriptor_; }
private:
int nFeatures_;
float scaleFactor_;
int nLevels_;
int edgeThreshold_;
int firstLevel_;
int WTA_K_;
int scoreType_;
int patchSize_;
int fastThreshold_;
bool blurForDescriptor_;
private:
void buildScalePyramids(InputArray _image, InputArray _mask);
void computeKeyPointsPyramid();
void computeDescriptors(OutputArray _descriptors);
void mergeKeyPoints(OutputArray _keypoints);
private:
Ptr<cv::cuda::FastFeatureDetector> fastDetector_;
//! The number of desired features per scale
std::vector<size_t> n_features_per_level_;
//! Points to compute BRIEF descriptors from
GpuMat pattern_;
std::vector<GpuMat> imagePyr_;
std::vector<GpuMat> maskPyr_;
GpuMat buf_;
std::vector<GpuMat> keyPointsPyr_;
std::vector<int> keyPointsCount_;
Ptr<cuda::Filter> blurFilter_;
GpuMat d_keypoints_;
};
static void initializeOrbPattern(const Point* pattern0, Mat& pattern, int ntuples, int tupleSize, int poolSize)
{
RNG rng(0x12345678);
@@ -381,7 +463,7 @@ namespace
}
}
void makeRandomPattern(int patchSize, Point* pattern, int npoints)
static void makeRandomPattern(int patchSize, Point* pattern, int npoints)
{
// we always start with a fixed seed,
// to make patterns the same on each run
@@ -393,155 +475,189 @@ namespace
pattern[i].y = rng.uniform(-patchSize / 2, patchSize / 2 + 1);
}
}
}
cv::cuda::ORB_CUDA::ORB_CUDA(int nFeatures, float scaleFactor, int nLevels, int edgeThreshold, int firstLevel, int WTA_K, int scoreType, int patchSize) :
nFeatures_(nFeatures), scaleFactor_(scaleFactor), nLevels_(nLevels), edgeThreshold_(edgeThreshold), firstLevel_(firstLevel), WTA_K_(WTA_K),
scoreType_(scoreType), patchSize_(patchSize),
fastDetector_(DEFAULT_FAST_THRESHOLD)
{
CV_Assert(patchSize_ >= 2);
// fill the extractors and descriptors for the corresponding scales
float factor = 1.0f / scaleFactor_;
float n_desired_features_per_scale = nFeatures_ * (1.0f - factor) / (1.0f - std::pow(factor, nLevels_));
n_features_per_level_.resize(nLevels_);
size_t sum_n_features = 0;
for (int level = 0; level < nLevels_ - 1; ++level)
ORB_Impl::ORB_Impl(int nFeatures,
float scaleFactor,
int nLevels,
int edgeThreshold,
int firstLevel,
int WTA_K,
int scoreType,
int patchSize,
int fastThreshold,
bool blurForDescriptor) :
nFeatures_(nFeatures),
scaleFactor_(scaleFactor),
nLevels_(nLevels),
edgeThreshold_(edgeThreshold),
firstLevel_(firstLevel),
WTA_K_(WTA_K),
scoreType_(scoreType),
patchSize_(patchSize),
fastThreshold_(fastThreshold),
blurForDescriptor_(blurForDescriptor)
{
n_features_per_level_[level] = cvRound(n_desired_features_per_scale);
sum_n_features += n_features_per_level_[level];
n_desired_features_per_scale *= factor;
}
n_features_per_level_[nLevels_ - 1] = nFeatures - sum_n_features;
CV_Assert( patchSize_ >= 2 );
CV_Assert( WTA_K_ == 2 || WTA_K_ == 3 || WTA_K_ == 4 );
// pre-compute the end of a row in a circular patch
int half_patch_size = patchSize_ / 2;
std::vector<int> u_max(half_patch_size + 2);
for (int v = 0; v <= half_patch_size * std::sqrt(2.f) / 2 + 1; ++v)
u_max[v] = cvRound(std::sqrt(static_cast<float>(half_patch_size * half_patch_size - v * v)));
fastDetector_ = cuda::FastFeatureDetector::create(fastThreshold_);
// Make sure we are symmetric
for (int v = half_patch_size, v_0 = 0; v >= half_patch_size * std::sqrt(2.f) / 2; --v)
{
while (u_max[v_0] == u_max[v_0 + 1])
++v_0;
u_max[v] = v_0;
++v_0;
}
CV_Assert(u_max.size() < 32);
cv::cuda::device::orb::loadUMax(&u_max[0], static_cast<int>(u_max.size()));
// fill the extractors and descriptors for the corresponding scales
float factor = 1.0f / scaleFactor_;
float n_desired_features_per_scale = nFeatures_ * (1.0f - factor) / (1.0f - std::pow(factor, nLevels_));
// Calc pattern
const int npoints = 512;
Point pattern_buf[npoints];
const Point* pattern0 = (const Point*)bit_pattern_31_;
if (patchSize_ != 31)
{
pattern0 = pattern_buf;
makeRandomPattern(patchSize_, pattern_buf, npoints);
}
CV_Assert(WTA_K_ == 2 || WTA_K_ == 3 || WTA_K_ == 4);
Mat h_pattern;
if (WTA_K_ == 2)
{
h_pattern.create(2, npoints, CV_32SC1);
int* pattern_x_ptr = h_pattern.ptr<int>(0);
int* pattern_y_ptr = h_pattern.ptr<int>(1);
for (int i = 0; i < npoints; ++i)
n_features_per_level_.resize(nLevels_);
size_t sum_n_features = 0;
for (int level = 0; level < nLevels_ - 1; ++level)
{
pattern_x_ptr[i] = pattern0[i].x;
pattern_y_ptr[i] = pattern0[i].y;
n_features_per_level_[level] = cvRound(n_desired_features_per_scale);
sum_n_features += n_features_per_level_[level];
n_desired_features_per_scale *= factor;
}
}
else
{
int ntuples = descriptorSize() * 4;
initializeOrbPattern(pattern0, h_pattern, ntuples, WTA_K_, npoints);
}
n_features_per_level_[nLevels_ - 1] = nFeatures - sum_n_features;
pattern_.upload(h_pattern);
blurFilter = cuda::createGaussianFilter(CV_8UC1, -1, Size(7, 7), 2, 2, BORDER_REFLECT_101);
blurForDescriptor = false;
}
namespace
{
inline float getScale(float scaleFactor, int firstLevel, int level)
{
return pow(scaleFactor, level - firstLevel);
}
}
void cv::cuda::ORB_CUDA::buildScalePyramids(const GpuMat& image, const GpuMat& mask)
{
CV_Assert(image.type() == CV_8UC1);
CV_Assert(mask.empty() || (mask.type() == CV_8UC1 && mask.size() == image.size()));
imagePyr_.resize(nLevels_);
maskPyr_.resize(nLevels_);
for (int level = 0; level < nLevels_; ++level)
{
float scale = 1.0f / getScale(scaleFactor_, firstLevel_, level);
Size sz(cvRound(image.cols * scale), cvRound(image.rows * scale));
ensureSizeIsEnough(sz, image.type(), imagePyr_[level]);
ensureSizeIsEnough(sz, CV_8UC1, maskPyr_[level]);
maskPyr_[level].setTo(Scalar::all(255));
// Compute the resized image
if (level != firstLevel_)
// pre-compute the end of a row in a circular patch
int half_patch_size = patchSize_ / 2;
std::vector<int> u_max(half_patch_size + 2);
for (int v = 0; v <= half_patch_size * std::sqrt(2.f) / 2 + 1; ++v)
{
if (level < firstLevel_)
{
cuda::resize(image, imagePyr_[level], sz, 0, 0, INTER_LINEAR);
u_max[v] = cvRound(std::sqrt(static_cast<float>(half_patch_size * half_patch_size - v * v)));
}
if (!mask.empty())
cuda::resize(mask, maskPyr_[level], sz, 0, 0, INTER_LINEAR);
}
else
{
cuda::resize(imagePyr_[level - 1], imagePyr_[level], sz, 0, 0, INTER_LINEAR);
// Make sure we are symmetric
for (int v = half_patch_size, v_0 = 0; v >= half_patch_size * std::sqrt(2.f) / 2; --v)
{
while (u_max[v_0] == u_max[v_0 + 1])
++v_0;
u_max[v] = v_0;
++v_0;
}
CV_Assert( u_max.size() < 32 );
cv::cuda::device::orb::loadUMax(&u_max[0], static_cast<int>(u_max.size()));
if (!mask.empty())
{
cuda::resize(maskPyr_[level - 1], maskPyr_[level], sz, 0, 0, INTER_LINEAR);
cuda::threshold(maskPyr_[level], maskPyr_[level], 254, 0, THRESH_TOZERO);
}
// Calc pattern
const int npoints = 512;
Point pattern_buf[npoints];
const Point* pattern0 = (const Point*)bit_pattern_31_;
if (patchSize_ != 31)
{
pattern0 = pattern_buf;
makeRandomPattern(patchSize_, pattern_buf, npoints);
}
Mat h_pattern;
if (WTA_K_ == 2)
{
h_pattern.create(2, npoints, CV_32SC1);
int* pattern_x_ptr = h_pattern.ptr<int>(0);
int* pattern_y_ptr = h_pattern.ptr<int>(1);
for (int i = 0; i < npoints; ++i)
{
pattern_x_ptr[i] = pattern0[i].x;
pattern_y_ptr[i] = pattern0[i].y;
}
}
else
{
image.copyTo(imagePyr_[level]);
if (!mask.empty())
mask.copyTo(maskPyr_[level]);
int ntuples = descriptorSize() * 4;
initializeOrbPattern(pattern0, h_pattern, ntuples, WTA_K_, npoints);
}
// Filter keypoints by image border
ensureSizeIsEnough(sz, CV_8UC1, buf_);
buf_.setTo(Scalar::all(0));
Rect inner(edgeThreshold_, edgeThreshold_, sz.width - 2 * edgeThreshold_, sz.height - 2 * edgeThreshold_);
buf_(inner).setTo(Scalar::all(255));
pattern_.upload(h_pattern);
cuda::bitwise_and(maskPyr_[level], buf_, maskPyr_[level]);
blurFilter_ = cuda::createGaussianFilter(CV_8UC1, -1, Size(7, 7), 2, 2, BORDER_REFLECT_101);
}
}
namespace
{
//takes keypoints and culls them by the response
void cull(GpuMat& keypoints, int& count, int n_points)
void ORB_Impl::detectAndCompute(InputArray _image, InputArray _mask, std::vector<KeyPoint>& keypoints, OutputArray _descriptors, bool useProvidedKeypoints)
{
CV_Assert( useProvidedKeypoints == false );
detectAndComputeAsync(_image, _mask, d_keypoints_, _descriptors, false, Stream::Null());
convert(d_keypoints_, keypoints);
}
void ORB_Impl::detectAndComputeAsync(InputArray _image, InputArray _mask, OutputArray _keypoints, OutputArray _descriptors, bool useProvidedKeypoints, Stream& stream)
{
CV_Assert( useProvidedKeypoints == false );
buildScalePyramids(_image, _mask);
computeKeyPointsPyramid();
if (_descriptors.needed())
{
computeDescriptors(_descriptors);
}
mergeKeyPoints(_keypoints);
}
static float getScale(float scaleFactor, int firstLevel, int level)
{
return pow(scaleFactor, level - firstLevel);
}
void ORB_Impl::buildScalePyramids(InputArray _image, InputArray _mask)
{
const GpuMat image = _image.getGpuMat();
const GpuMat mask = _mask.getGpuMat();
CV_Assert( image.type() == CV_8UC1 );
CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == image.size()) );
imagePyr_.resize(nLevels_);
maskPyr_.resize(nLevels_);
for (int level = 0; level < nLevels_; ++level)
{
float scale = 1.0f / getScale(scaleFactor_, firstLevel_, level);
Size sz(cvRound(image.cols * scale), cvRound(image.rows * scale));
ensureSizeIsEnough(sz, image.type(), imagePyr_[level]);
ensureSizeIsEnough(sz, CV_8UC1, maskPyr_[level]);
maskPyr_[level].setTo(Scalar::all(255));
// Compute the resized image
if (level != firstLevel_)
{
if (level < firstLevel_)
{
cuda::resize(image, imagePyr_[level], sz, 0, 0, INTER_LINEAR);
if (!mask.empty())
cuda::resize(mask, maskPyr_[level], sz, 0, 0, INTER_LINEAR);
}
else
{
cuda::resize(imagePyr_[level - 1], imagePyr_[level], sz, 0, 0, INTER_LINEAR);
if (!mask.empty())
{
cuda::resize(maskPyr_[level - 1], maskPyr_[level], sz, 0, 0, INTER_LINEAR);
cuda::threshold(maskPyr_[level], maskPyr_[level], 254, 0, THRESH_TOZERO);
}
}
}
else
{
image.copyTo(imagePyr_[level]);
if (!mask.empty())
mask.copyTo(maskPyr_[level]);
}
// Filter keypoints by image border
ensureSizeIsEnough(sz, CV_8UC1, buf_);
buf_.setTo(Scalar::all(0));
Rect inner(edgeThreshold_, edgeThreshold_, sz.width - 2 * edgeThreshold_, sz.height - 2 * edgeThreshold_);
buf_(inner).setTo(Scalar::all(255));
cuda::bitwise_and(maskPyr_[level], buf_, maskPyr_[level]);
}
}
// takes keypoints and culls them by the response
static void cull(GpuMat& keypoints, int& count, int n_points)
{
using namespace cv::cuda::device::orb;
@@ -554,222 +670,199 @@ namespace
return;
}
count = cull_gpu(keypoints.ptr<int>(FAST_CUDA::LOCATION_ROW), keypoints.ptr<float>(FAST_CUDA::RESPONSE_ROW), count, n_points);
count = cull_gpu(keypoints.ptr<int>(cuda::FastFeatureDetector::LOCATION_ROW), keypoints.ptr<float>(cuda::FastFeatureDetector::RESPONSE_ROW), count, n_points);
}
}
}
void cv::cuda::ORB_CUDA::computeKeyPointsPyramid()
{
using namespace cv::cuda::device::orb;
int half_patch_size = patchSize_ / 2;
keyPointsPyr_.resize(nLevels_);
keyPointsCount_.resize(nLevels_);
for (int level = 0; level < nLevels_; ++level)
void ORB_Impl::computeKeyPointsPyramid()
{
keyPointsCount_[level] = fastDetector_.calcKeyPointsLocation(imagePyr_[level], maskPyr_[level]);
using namespace cv::cuda::device::orb;
if (keyPointsCount_[level] == 0)
continue;
int half_patch_size = patchSize_ / 2;
ensureSizeIsEnough(3, keyPointsCount_[level], CV_32FC1, keyPointsPyr_[level]);
keyPointsPyr_.resize(nLevels_);
keyPointsCount_.resize(nLevels_);
GpuMat fastKpRange = keyPointsPyr_[level].rowRange(0, 2);
keyPointsCount_[level] = fastDetector_.getKeyPoints(fastKpRange);
fastDetector_->setThreshold(fastThreshold_);
if (keyPointsCount_[level] == 0)
continue;
int n_features = static_cast<int>(n_features_per_level_[level]);
if (scoreType_ == ORB::HARRIS_SCORE)
for (int level = 0; level < nLevels_; ++level)
{
// Keep more points than necessary as FAST does not give amazing corners
cull(keyPointsPyr_[level], keyPointsCount_[level], 2 * n_features);
fastDetector_->setMaxNumPoints(0.05 * imagePyr_[level].size().area());
// Compute the Harris cornerness (better scoring than FAST)
HarrisResponses_gpu(imagePyr_[level], keyPointsPyr_[level].ptr<short2>(0), keyPointsPyr_[level].ptr<float>(1), keyPointsCount_[level], 7, HARRIS_K, 0);
GpuMat fastKpRange;
fastDetector_->detectAsync(imagePyr_[level], fastKpRange, maskPyr_[level], Stream::Null());
keyPointsCount_[level] = fastKpRange.cols;
if (keyPointsCount_[level] == 0)
continue;
ensureSizeIsEnough(3, keyPointsCount_[level], fastKpRange.type(), keyPointsPyr_[level]);
fastKpRange.copyTo(keyPointsPyr_[level].rowRange(0, 2));
const int n_features = static_cast<int>(n_features_per_level_[level]);
if (scoreType_ == ORB::HARRIS_SCORE)
{
// Keep more points than necessary as FAST does not give amazing corners
cull(keyPointsPyr_[level], keyPointsCount_[level], 2 * n_features);
// Compute the Harris cornerness (better scoring than FAST)
HarrisResponses_gpu(imagePyr_[level], keyPointsPyr_[level].ptr<short2>(0), keyPointsPyr_[level].ptr<float>(1), keyPointsCount_[level], 7, HARRIS_K, 0);
}
//cull to the final desired level, using the new Harris scores or the original FAST scores.
cull(keyPointsPyr_[level], keyPointsCount_[level], n_features);
// Compute orientation
IC_Angle_gpu(imagePyr_[level], keyPointsPyr_[level].ptr<short2>(0), keyPointsPyr_[level].ptr<float>(2), keyPointsCount_[level], half_patch_size, 0);
}
//cull to the final desired level, using the new Harris scores or the original FAST scores.
cull(keyPointsPyr_[level], keyPointsCount_[level], n_features);
// Compute orientation
IC_Angle_gpu(imagePyr_[level], keyPointsPyr_[level].ptr<short2>(0), keyPointsPyr_[level].ptr<float>(2), keyPointsCount_[level], half_patch_size, 0);
}
}
void cv::cuda::ORB_CUDA::computeDescriptors(GpuMat& descriptors)
{
using namespace cv::cuda::device::orb;
int nAllkeypoints = 0;
for (int level = 0; level < nLevels_; ++level)
nAllkeypoints += keyPointsCount_[level];
if (nAllkeypoints == 0)
{
descriptors.release();
return;
}
ensureSizeIsEnough(nAllkeypoints, descriptorSize(), CV_8UC1, descriptors);
int offset = 0;
for (int level = 0; level < nLevels_; ++level)
void ORB_Impl::computeDescriptors(OutputArray _descriptors)
{
if (keyPointsCount_[level] == 0)
continue;
using namespace cv::cuda::device::orb;
GpuMat descRange = descriptors.rowRange(offset, offset + keyPointsCount_[level]);
int nAllkeypoints = 0;
if (blurForDescriptor)
for (int level = 0; level < nLevels_; ++level)
nAllkeypoints += keyPointsCount_[level];
if (nAllkeypoints == 0)
{
// preprocess the resized image
ensureSizeIsEnough(imagePyr_[level].size(), imagePyr_[level].type(), buf_);
blurFilter->apply(imagePyr_[level], buf_);
_descriptors.release();
return;
}
computeOrbDescriptor_gpu(blurForDescriptor ? buf_ : imagePyr_[level], keyPointsPyr_[level].ptr<short2>(0), keyPointsPyr_[level].ptr<float>(2),
keyPointsCount_[level], pattern_.ptr<int>(0), pattern_.ptr<int>(1), descRange, descriptorSize(), WTA_K_, 0);
ensureSizeIsEnough(nAllkeypoints, descriptorSize(), CV_8UC1, _descriptors);
GpuMat descriptors = _descriptors.getGpuMat();
offset += keyPointsCount_[level];
int offset = 0;
for (int level = 0; level < nLevels_; ++level)
{
if (keyPointsCount_[level] == 0)
continue;
GpuMat descRange = descriptors.rowRange(offset, offset + keyPointsCount_[level]);
if (blurForDescriptor_)
{
// preprocess the resized image
ensureSizeIsEnough(imagePyr_[level].size(), imagePyr_[level].type(), buf_);
blurFilter_->apply(imagePyr_[level], buf_);
}
computeOrbDescriptor_gpu(blurForDescriptor_ ? buf_ : imagePyr_[level], keyPointsPyr_[level].ptr<short2>(0), keyPointsPyr_[level].ptr<float>(2),
keyPointsCount_[level], pattern_.ptr<int>(0), pattern_.ptr<int>(1), descRange, descriptorSize(), WTA_K_, 0);
offset += keyPointsCount_[level];
}
}
}
void cv::cuda::ORB_CUDA::mergeKeyPoints(GpuMat& keypoints)
{
using namespace cv::cuda::device::orb;
int nAllkeypoints = 0;
for (int level = 0; level < nLevels_; ++level)
nAllkeypoints += keyPointsCount_[level];
if (nAllkeypoints == 0)
void ORB_Impl::mergeKeyPoints(OutputArray _keypoints)
{
keypoints.release();
return;
using namespace cv::cuda::device::orb;
int nAllkeypoints = 0;
for (int level = 0; level < nLevels_; ++level)
nAllkeypoints += keyPointsCount_[level];
if (nAllkeypoints == 0)
{
_keypoints.release();
return;
}
ensureSizeIsEnough(ROWS_COUNT, nAllkeypoints, CV_32FC1, _keypoints);
GpuMat& keypoints = _keypoints.getGpuMatRef();
int offset = 0;
for (int level = 0; level < nLevels_; ++level)
{
if (keyPointsCount_[level] == 0)
continue;
float sf = getScale(scaleFactor_, firstLevel_, level);
GpuMat keyPointsRange = keypoints.colRange(offset, offset + keyPointsCount_[level]);
float locScale = level != firstLevel_ ? sf : 1.0f;
mergeLocation_gpu(keyPointsPyr_[level].ptr<short2>(0), keyPointsRange.ptr<float>(0), keyPointsRange.ptr<float>(1), keyPointsCount_[level], locScale, 0);
GpuMat range = keyPointsRange.rowRange(2, 4);
keyPointsPyr_[level](Range(1, 3), Range(0, keyPointsCount_[level])).copyTo(range);
keyPointsRange.row(4).setTo(Scalar::all(level));
keyPointsRange.row(5).setTo(Scalar::all(patchSize_ * sf));
offset += keyPointsCount_[level];
}
}
ensureSizeIsEnough(ROWS_COUNT, nAllkeypoints, CV_32FC1, keypoints);
int offset = 0;
for (int level = 0; level < nLevels_; ++level)
void ORB_Impl::convert(InputArray _gpu_keypoints, std::vector<KeyPoint>& keypoints)
{
if (keyPointsCount_[level] == 0)
continue;
if (_gpu_keypoints.empty())
{
keypoints.clear();
return;
}
float sf = getScale(scaleFactor_, firstLevel_, level);
Mat h_keypoints;
if (_gpu_keypoints.kind() == _InputArray::CUDA_GPU_MAT)
{
_gpu_keypoints.getGpuMat().download(h_keypoints);
}
else
{
h_keypoints = _gpu_keypoints.getMat();
}
GpuMat keyPointsRange = keypoints.colRange(offset, offset + keyPointsCount_[level]);
CV_Assert( h_keypoints.rows == ROWS_COUNT );
CV_Assert( h_keypoints.type() == CV_32FC1 );
float locScale = level != firstLevel_ ? sf : 1.0f;
const int npoints = h_keypoints.cols;
mergeLocation_gpu(keyPointsPyr_[level].ptr<short2>(0), keyPointsRange.ptr<float>(0), keyPointsRange.ptr<float>(1), keyPointsCount_[level], locScale, 0);
keypoints.resize(npoints);
GpuMat range = keyPointsRange.rowRange(2, 4);
keyPointsPyr_[level](Range(1, 3), Range(0, keyPointsCount_[level])).copyTo(range);
const float* x_ptr = h_keypoints.ptr<float>(X_ROW);
const float* y_ptr = h_keypoints.ptr<float>(Y_ROW);
const float* response_ptr = h_keypoints.ptr<float>(RESPONSE_ROW);
const float* angle_ptr = h_keypoints.ptr<float>(ANGLE_ROW);
const float* octave_ptr = h_keypoints.ptr<float>(OCTAVE_ROW);
const float* size_ptr = h_keypoints.ptr<float>(SIZE_ROW);
keyPointsRange.row(4).setTo(Scalar::all(level));
keyPointsRange.row(5).setTo(Scalar::all(patchSize_ * sf));
for (int i = 0; i < npoints; ++i)
{
KeyPoint kp;
offset += keyPointsCount_[level];
kp.pt.x = x_ptr[i];
kp.pt.y = y_ptr[i];
kp.response = response_ptr[i];
kp.angle = angle_ptr[i];
kp.octave = static_cast<int>(octave_ptr[i]);
kp.size = size_ptr[i];
keypoints[i] = kp;
}
}
}
void cv::cuda::ORB_CUDA::downloadKeyPoints(const GpuMat &d_keypoints, std::vector<KeyPoint>& keypoints)
Ptr<cv::cuda::ORB> cv::cuda::ORB::create(int nfeatures,
float scaleFactor,
int nlevels,
int edgeThreshold,
int firstLevel,
int WTA_K,
int scoreType,
int patchSize,
int fastThreshold,
bool blurForDescriptor)
{
if (d_keypoints.empty())
{
keypoints.clear();
return;
}
Mat h_keypoints(d_keypoints);
convertKeyPoints(h_keypoints, keypoints);
}
void cv::cuda::ORB_CUDA::convertKeyPoints(const Mat &d_keypoints, std::vector<KeyPoint>& keypoints)
{
if (d_keypoints.empty())
{
keypoints.clear();
return;
}
CV_Assert(d_keypoints.type() == CV_32FC1 && d_keypoints.rows == ROWS_COUNT);
const float* x_ptr = d_keypoints.ptr<float>(X_ROW);
const float* y_ptr = d_keypoints.ptr<float>(Y_ROW);
const float* response_ptr = d_keypoints.ptr<float>(RESPONSE_ROW);
const float* angle_ptr = d_keypoints.ptr<float>(ANGLE_ROW);
const float* octave_ptr = d_keypoints.ptr<float>(OCTAVE_ROW);
const float* size_ptr = d_keypoints.ptr<float>(SIZE_ROW);
keypoints.resize(d_keypoints.cols);
for (int i = 0; i < d_keypoints.cols; ++i)
{
KeyPoint kp;
kp.pt.x = x_ptr[i];
kp.pt.y = y_ptr[i];
kp.response = response_ptr[i];
kp.angle = angle_ptr[i];
kp.octave = static_cast<int>(octave_ptr[i]);
kp.size = size_ptr[i];
keypoints[i] = kp;
}
}
void cv::cuda::ORB_CUDA::operator()(const GpuMat& image, const GpuMat& mask, GpuMat& keypoints)
{
buildScalePyramids(image, mask);
computeKeyPointsPyramid();
mergeKeyPoints(keypoints);
}
void cv::cuda::ORB_CUDA::operator()(const GpuMat& image, const GpuMat& mask, GpuMat& keypoints, GpuMat& descriptors)
{
buildScalePyramids(image, mask);
computeKeyPointsPyramid();
computeDescriptors(descriptors);
mergeKeyPoints(keypoints);
}
void cv::cuda::ORB_CUDA::operator()(const GpuMat& image, const GpuMat& mask, std::vector<KeyPoint>& keypoints)
{
(*this)(image, mask, d_keypoints_);
downloadKeyPoints(d_keypoints_, keypoints);
}
void cv::cuda::ORB_CUDA::operator()(const GpuMat& image, const GpuMat& mask, std::vector<KeyPoint>& keypoints, GpuMat& descriptors)
{
(*this)(image, mask, d_keypoints_, descriptors);
downloadKeyPoints(d_keypoints_, keypoints);
}
void cv::cuda::ORB_CUDA::release()
{
imagePyr_.clear();
maskPyr_.clear();
buf_.release();
keyPointsPyr_.clear();
fastDetector_.release();
d_keypoints_.release();
return makePtr<ORB_Impl>(nfeatures, scaleFactor, nlevels, edgeThreshold, firstLevel, WTA_K, scoreType, patchSize, fastThreshold, blurForDescriptor);
}
#endif /* !defined (HAVE_CUDA) */
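
For reference, a minimal usage sketch of the new cv::cuda::ORB interface created above (not part of the upstream diff; the image path and parameter values are illustrative and simply mirror the create() signature introduced here):

#include <opencv2/core/cuda.hpp>
#include <opencv2/cudafeatures2d.hpp>
#include <opencv2/imgcodecs.hpp>

#include <vector>

int main()
{
    cv::cuda::GpuMat d_img;
    d_img.upload(cv::imread("scene.png", cv::IMREAD_GRAYSCALE)); // assumed to exist

    cv::Ptr<cv::cuda::ORB> orb =
        cv::cuda::ORB::create(500, 1.2f, 8, 31, 0, 2, cv::ORB::HARRIS_SCORE, 31,
                              20 /*fastThreshold*/, false /*blurForDescriptor*/);

    // Synchronous path.
    std::vector<cv::KeyPoint> keypoints;
    cv::cuda::GpuMat d_descriptors;
    orb->detectAndCompute(d_img, cv::noArray(), keypoints, d_descriptors);

    // Asynchronous path: keypoints stay on the GPU until convert() is called.
    cv::cuda::Stream stream;
    cv::cuda::GpuMat d_keypoints;
    orb->detectAndComputeAsync(d_img, cv::noArray(), d_keypoints, d_descriptors,
                               false, stream);
    stream.waitForCompletion();
    orb->convert(d_keypoints, keypoints);

    return 0;
}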

View File

@@ -76,15 +76,14 @@ CUDA_TEST_P(FAST, Accuracy)
cv::Mat image = readImage("features2d/aloe.png", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(image.empty());
cv::cuda::FAST_CUDA fast(threshold);
fast.nonmaxSuppression = nonmaxSuppression;
cv::Ptr<cv::cuda::FastFeatureDetector> fast = cv::cuda::FastFeatureDetector::create(threshold, nonmaxSuppression);
if (!supportFeature(devInfo, cv::cuda::GLOBAL_ATOMICS))
{
try
{
std::vector<cv::KeyPoint> keypoints;
fast(loadMat(image), cv::cuda::GpuMat(), keypoints);
fast->detect(loadMat(image), keypoints);
}
catch (const cv::Exception& e)
{
@@ -94,7 +93,7 @@ CUDA_TEST_P(FAST, Accuracy)
else
{
std::vector<cv::KeyPoint> keypoints;
fast(loadMat(image), cv::cuda::GpuMat(), keypoints);
fast->detect(loadMat(image), keypoints);
std::vector<cv::KeyPoint> keypoints_gold;
cv::FAST(image, keypoints_gold, threshold, nonmaxSuppression);
@@ -123,7 +122,7 @@ namespace
IMPLEMENT_PARAM_CLASS(ORB_BlurForDescriptor, bool)
}
CV_ENUM(ORB_ScoreType, ORB::HARRIS_SCORE, ORB::FAST_SCORE)
CV_ENUM(ORB_ScoreType, cv::ORB::HARRIS_SCORE, cv::ORB::FAST_SCORE)
PARAM_TEST_CASE(ORB, cv::cuda::DeviceInfo, ORB_FeaturesCount, ORB_ScaleFactor, ORB_LevelsCount, ORB_EdgeThreshold, ORB_firstLevel, ORB_WTA_K, ORB_ScoreType, ORB_PatchSize, ORB_BlurForDescriptor)
{
@@ -163,8 +162,9 @@ CUDA_TEST_P(ORB, Accuracy)
cv::Mat mask(image.size(), CV_8UC1, cv::Scalar::all(1));
mask(cv::Range(0, image.rows / 2), cv::Range(0, image.cols / 2)).setTo(cv::Scalar::all(0));
cv::cuda::ORB_CUDA orb(nFeatures, scaleFactor, nLevels, edgeThreshold, firstLevel, WTA_K, scoreType, patchSize);
orb.blurForDescriptor = blurForDescriptor;
cv::Ptr<cv::cuda::ORB> orb =
cv::cuda::ORB::create(nFeatures, scaleFactor, nLevels, edgeThreshold, firstLevel,
WTA_K, scoreType, patchSize, 20, blurForDescriptor);
if (!supportFeature(devInfo, cv::cuda::GLOBAL_ATOMICS))
{
@@ -172,7 +172,7 @@ CUDA_TEST_P(ORB, Accuracy)
{
std::vector<cv::KeyPoint> keypoints;
cv::cuda::GpuMat descriptors;
orb(loadMat(image), loadMat(mask), keypoints, descriptors);
orb->detectAndComputeAsync(loadMat(image), loadMat(mask), keypoints, descriptors);
}
catch (const cv::Exception& e)
{
@@ -183,7 +183,7 @@ CUDA_TEST_P(ORB, Accuracy)
{
std::vector<cv::KeyPoint> keypoints;
cv::cuda::GpuMat descriptors;
orb(loadMat(image), loadMat(mask), keypoints, descriptors);
orb->detectAndCompute(loadMat(image), loadMat(mask), keypoints, descriptors);
cv::Ptr<cv::ORB> orb_gold = cv::ORB::create(nFeatures, scaleFactor, nLevels, edgeThreshold, firstLevel, WTA_K, scoreType, patchSize);
@@ -208,7 +208,7 @@ INSTANTIATE_TEST_CASE_P(CUDA_Features2D, ORB, testing::Combine(
testing::Values(ORB_ScaleFactor(1.2f)),
testing::Values(ORB_LevelsCount(4), ORB_LevelsCount(8)),
testing::Values(ORB_EdgeThreshold(31)),
testing::Values(ORB_firstLevel(0), ORB_firstLevel(2)),
testing::Values(ORB_firstLevel(0)),
testing::Values(ORB_WTA_K(2), ORB_WTA_K(3), ORB_WTA_K(4)),
testing::Values(ORB_ScoreType(cv::ORB::HARRIS_SCORE)),
testing::Values(ORB_PatchSize(31), ORB_PatchSize(29)),
@@ -285,7 +285,8 @@ PARAM_TEST_CASE(BruteForceMatcher, cv::cuda::DeviceInfo, NormCode, DescriptorSiz
CUDA_TEST_P(BruteForceMatcher, Match_Single)
{
cv::cuda::BFMatcher_CUDA matcher(normCode);
cv::Ptr<cv::cuda::DescriptorMatcher> matcher =
cv::cuda::DescriptorMatcher::createBFMatcher(normCode);
cv::cuda::GpuMat mask;
if (useMask)
@@ -295,7 +296,7 @@ CUDA_TEST_P(BruteForceMatcher, Match_Single)
}
std::vector<cv::DMatch> matches;
matcher.match(loadMat(query), loadMat(train), matches, mask);
matcher->match(loadMat(query), loadMat(train), matches, mask);
ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());
@@ -312,13 +313,14 @@ CUDA_TEST_P(BruteForceMatcher, Match_Single)
CUDA_TEST_P(BruteForceMatcher, Match_Collection)
{
cv::cuda::BFMatcher_CUDA matcher(normCode);
cv::Ptr<cv::cuda::DescriptorMatcher> matcher =
cv::cuda::DescriptorMatcher::createBFMatcher(normCode);
cv::cuda::GpuMat d_train(train);
// make add() twice to test such case
matcher.add(std::vector<cv::cuda::GpuMat>(1, d_train.rowRange(0, train.rows / 2)));
matcher.add(std::vector<cv::cuda::GpuMat>(1, d_train.rowRange(train.rows / 2, train.rows)));
matcher->add(std::vector<cv::cuda::GpuMat>(1, d_train.rowRange(0, train.rows / 2)));
matcher->add(std::vector<cv::cuda::GpuMat>(1, d_train.rowRange(train.rows / 2, train.rows)));
// prepare masks (make first nearest match illegal)
std::vector<cv::cuda::GpuMat> masks(2);
@@ -331,9 +333,9 @@ CUDA_TEST_P(BruteForceMatcher, Match_Collection)
std::vector<cv::DMatch> matches;
if (useMask)
matcher.match(cv::cuda::GpuMat(query), matches, masks);
matcher->match(cv::cuda::GpuMat(query), matches, masks);
else
matcher.match(cv::cuda::GpuMat(query), matches);
matcher->match(cv::cuda::GpuMat(query), matches);
ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());
@@ -366,7 +368,8 @@ CUDA_TEST_P(BruteForceMatcher, Match_Collection)
CUDA_TEST_P(BruteForceMatcher, KnnMatch_2_Single)
{
cv::cuda::BFMatcher_CUDA matcher(normCode);
cv::Ptr<cv::cuda::DescriptorMatcher> matcher =
cv::cuda::DescriptorMatcher::createBFMatcher(normCode);
const int knn = 2;
@@ -378,7 +381,7 @@ CUDA_TEST_P(BruteForceMatcher, KnnMatch_2_Single)
}
std::vector< std::vector<cv::DMatch> > matches;
matcher.knnMatch(loadMat(query), loadMat(train), matches, knn, mask);
matcher->knnMatch(loadMat(query), loadMat(train), matches, knn, mask);
ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());
@@ -405,7 +408,8 @@ CUDA_TEST_P(BruteForceMatcher, KnnMatch_2_Single)
CUDA_TEST_P(BruteForceMatcher, KnnMatch_3_Single)
{
cv::cuda::BFMatcher_CUDA matcher(normCode);
cv::Ptr<cv::cuda::DescriptorMatcher> matcher =
cv::cuda::DescriptorMatcher::createBFMatcher(normCode);
const int knn = 3;
@@ -417,7 +421,7 @@ CUDA_TEST_P(BruteForceMatcher, KnnMatch_3_Single)
}
std::vector< std::vector<cv::DMatch> > matches;
matcher.knnMatch(loadMat(query), loadMat(train), matches, knn, mask);
matcher->knnMatch(loadMat(query), loadMat(train), matches, knn, mask);
ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());
@@ -444,15 +448,16 @@ CUDA_TEST_P(BruteForceMatcher, KnnMatch_3_Single)
CUDA_TEST_P(BruteForceMatcher, KnnMatch_2_Collection)
{
cv::cuda::BFMatcher_CUDA matcher(normCode);
cv::Ptr<cv::cuda::DescriptorMatcher> matcher =
cv::cuda::DescriptorMatcher::createBFMatcher(normCode);
const int knn = 2;
cv::cuda::GpuMat d_train(train);
// make add() twice to test such case
matcher.add(std::vector<cv::cuda::GpuMat>(1, d_train.rowRange(0, train.rows / 2)));
matcher.add(std::vector<cv::cuda::GpuMat>(1, d_train.rowRange(train.rows / 2, train.rows)));
matcher->add(std::vector<cv::cuda::GpuMat>(1, d_train.rowRange(0, train.rows / 2)));
matcher->add(std::vector<cv::cuda::GpuMat>(1, d_train.rowRange(train.rows / 2, train.rows)));
// prepare masks (make first nearest match illegal)
std::vector<cv::cuda::GpuMat> masks(2);
@@ -466,9 +471,9 @@ CUDA_TEST_P(BruteForceMatcher, KnnMatch_2_Collection)
std::vector< std::vector<cv::DMatch> > matches;
if (useMask)
matcher.knnMatch(cv::cuda::GpuMat(query), matches, knn, masks);
matcher->knnMatch(cv::cuda::GpuMat(query), matches, knn, masks);
else
matcher.knnMatch(cv::cuda::GpuMat(query), matches, knn);
matcher->knnMatch(cv::cuda::GpuMat(query), matches, knn);
ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());
@@ -506,15 +511,16 @@ CUDA_TEST_P(BruteForceMatcher, KnnMatch_2_Collection)
CUDA_TEST_P(BruteForceMatcher, KnnMatch_3_Collection)
{
cv::cuda::BFMatcher_CUDA matcher(normCode);
cv::Ptr<cv::cuda::DescriptorMatcher> matcher =
cv::cuda::DescriptorMatcher::createBFMatcher(normCode);
const int knn = 3;
cv::cuda::GpuMat d_train(train);
// make add() twice to test such case
matcher.add(std::vector<cv::cuda::GpuMat>(1, d_train.rowRange(0, train.rows / 2)));
matcher.add(std::vector<cv::cuda::GpuMat>(1, d_train.rowRange(train.rows / 2, train.rows)));
matcher->add(std::vector<cv::cuda::GpuMat>(1, d_train.rowRange(0, train.rows / 2)));
matcher->add(std::vector<cv::cuda::GpuMat>(1, d_train.rowRange(train.rows / 2, train.rows)));
// prepare masks (make first nearest match illegal)
std::vector<cv::cuda::GpuMat> masks(2);
@@ -528,9 +534,9 @@ CUDA_TEST_P(BruteForceMatcher, KnnMatch_3_Collection)
std::vector< std::vector<cv::DMatch> > matches;
if (useMask)
matcher.knnMatch(cv::cuda::GpuMat(query), matches, knn, masks);
matcher->knnMatch(cv::cuda::GpuMat(query), matches, knn, masks);
else
matcher.knnMatch(cv::cuda::GpuMat(query), matches, knn);
matcher->knnMatch(cv::cuda::GpuMat(query), matches, knn);
ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());
@@ -568,7 +574,8 @@ CUDA_TEST_P(BruteForceMatcher, KnnMatch_3_Collection)
CUDA_TEST_P(BruteForceMatcher, RadiusMatch_Single)
{
cv::cuda::BFMatcher_CUDA matcher(normCode);
cv::Ptr<cv::cuda::DescriptorMatcher> matcher =
cv::cuda::DescriptorMatcher::createBFMatcher(normCode);
const float radius = 1.f / countFactor;
@@ -577,7 +584,7 @@ CUDA_TEST_P(BruteForceMatcher, RadiusMatch_Single)
try
{
std::vector< std::vector<cv::DMatch> > matches;
matcher.radiusMatch(loadMat(query), loadMat(train), matches, radius);
matcher->radiusMatch(loadMat(query), loadMat(train), matches, radius);
}
catch (const cv::Exception& e)
{
@@ -594,7 +601,7 @@ CUDA_TEST_P(BruteForceMatcher, RadiusMatch_Single)
}
std::vector< std::vector<cv::DMatch> > matches;
matcher.radiusMatch(loadMat(query), loadMat(train), matches, radius, mask);
matcher->radiusMatch(loadMat(query), loadMat(train), matches, radius, mask);
ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());
@@ -617,7 +624,8 @@ CUDA_TEST_P(BruteForceMatcher, RadiusMatch_Single)
CUDA_TEST_P(BruteForceMatcher, RadiusMatch_Collection)
{
cv::cuda::BFMatcher_CUDA matcher(normCode);
cv::Ptr<cv::cuda::DescriptorMatcher> matcher =
cv::cuda::DescriptorMatcher::createBFMatcher(normCode);
const int n = 3;
const float radius = 1.f / countFactor * n;
@@ -625,8 +633,8 @@ CUDA_TEST_P(BruteForceMatcher, RadiusMatch_Collection)
cv::cuda::GpuMat d_train(train);
// make add() twice to test such case
matcher.add(std::vector<cv::cuda::GpuMat>(1, d_train.rowRange(0, train.rows / 2)));
matcher.add(std::vector<cv::cuda::GpuMat>(1, d_train.rowRange(train.rows / 2, train.rows)));
matcher->add(std::vector<cv::cuda::GpuMat>(1, d_train.rowRange(0, train.rows / 2)));
matcher->add(std::vector<cv::cuda::GpuMat>(1, d_train.rowRange(train.rows / 2, train.rows)));
// prepare masks (make first nearest match illegal)
std::vector<cv::cuda::GpuMat> masks(2);
@@ -642,7 +650,7 @@ CUDA_TEST_P(BruteForceMatcher, RadiusMatch_Collection)
try
{
std::vector< std::vector<cv::DMatch> > matches;
matcher.radiusMatch(cv::cuda::GpuMat(query), matches, radius, masks);
matcher->radiusMatch(cv::cuda::GpuMat(query), matches, radius, masks);
}
catch (const cv::Exception& e)
{
@@ -654,9 +662,9 @@ CUDA_TEST_P(BruteForceMatcher, RadiusMatch_Collection)
std::vector< std::vector<cv::DMatch> > matches;
if (useMask)
matcher.radiusMatch(cv::cuda::GpuMat(query), matches, radius, masks);
matcher->radiusMatch(cv::cuda::GpuMat(query), matches, radius, masks);
else
matcher.radiusMatch(cv::cuda::GpuMat(query), matches, radius);
matcher->radiusMatch(cv::cuda::GpuMat(query), matches, radius);
ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());
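
For reference, a minimal sketch of the Ptr-based matcher interface these tests migrate to (not part of the upstream diff; the function name is illustrative and the descriptors are assumed to be CV_32F GpuMats already on the device):

#include <opencv2/core/cuda.hpp>
#include <opencv2/cudafeatures2d.hpp>

#include <vector>

void matchDescriptors(const cv::cuda::GpuMat& d_query, const cv::cuda::GpuMat& d_train)
{
    cv::Ptr<cv::cuda::DescriptorMatcher> matcher =
        cv::cuda::DescriptorMatcher::createBFMatcher(cv::NORM_L2);

    // One best match per query descriptor.
    std::vector<cv::DMatch> matches;
    matcher->match(d_query, d_train, matches);

    // k-nearest-neighbour variant; radiusMatch follows the same pattern.
    std::vector< std::vector<cv::DMatch> > knnMatches;
    matcher->knnMatch(d_query, d_train, knnMatches, 2);
}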

View File

@@ -542,7 +542,7 @@ namespace
anchor_ = Point(iters_, iters_);
iters_ = 1;
}
else if (iters_ > 1 && countNonZero(kernel) == (int) kernel.total())
else if (iters_ > 1 && cv::countNonZero(kernel) == (int) kernel.total())
{
anchor_ = Point(anchor_.x * iters_, anchor_.y * iters_);
kernel = getStructuringElement(MORPH_RECT,

View File

@@ -81,7 +81,6 @@ namespace
GpuMat Dy_;
GpuMat buf_;
GpuMat eig_;
GpuMat minMaxbuf_;
GpuMat tmpCorners_;
};
@@ -112,7 +111,7 @@ namespace
cornerCriteria_->compute(image, eig_);
double maxVal = 0;
cuda::minMax(eig_, 0, &maxVal, noArray(), minMaxbuf_);
cuda::minMax(eig_, 0, &maxVal);
ensureSizeIsEnough(1, std::max(1000, static_cast<int>(image.size().area() * 0.05)), CV_32FC2, tmpCorners_);

View File

@@ -271,7 +271,6 @@ namespace
private:
Match_CCORR_8U match_CCORR_;
GpuMat image_sqsums_;
GpuMat intBuffer_;
};
void Match_CCORR_NORMED_8U::match(InputArray _image, InputArray _templ, OutputArray _result, Stream& stream)
@@ -288,7 +287,7 @@ namespace
match_CCORR_.match(image, templ, _result, stream);
GpuMat result = _result.getGpuMat();
cuda::sqrIntegral(image.reshape(1), image_sqsums_, intBuffer_, stream);
cuda::sqrIntegral(image.reshape(1), image_sqsums_, stream);
double templ_sqsum = cuda::sqrSum(templ.reshape(1))[0];
@@ -335,7 +334,6 @@ namespace
private:
GpuMat image_sqsums_;
GpuMat intBuffer_;
Match_CCORR_8U match_CCORR_;
};
@@ -359,7 +357,7 @@ namespace
return;
}
cuda::sqrIntegral(image.reshape(1), image_sqsums_, intBuffer_, stream);
cuda::sqrIntegral(image.reshape(1), image_sqsums_, stream);
double templ_sqsum = cuda::sqrSum(templ.reshape(1))[0];
@@ -383,7 +381,6 @@ namespace
private:
GpuMat image_sqsums_;
GpuMat intBuffer_;
Match_CCORR_8U match_CCORR_;
};
@@ -398,7 +395,7 @@ namespace
CV_Assert( image.type() == templ.type() );
CV_Assert( image.cols >= templ.cols && image.rows >= templ.rows );
cuda::sqrIntegral(image.reshape(1), image_sqsums_, intBuffer_, stream);
cuda::sqrIntegral(image.reshape(1), image_sqsums_, stream);
double templ_sqsum = cuda::sqrSum(templ.reshape(1))[0];
@@ -421,7 +418,6 @@ namespace
void match(InputArray image, InputArray templ, OutputArray result, Stream& stream = Stream::Null());
private:
GpuMat intBuffer_;
std::vector<GpuMat> images_;
std::vector<GpuMat> image_sums_;
Match_CCORR_8U match_CCORR_;
@@ -444,7 +440,7 @@ namespace
if (image.channels() == 1)
{
image_sums_.resize(1);
cuda::integral(image, image_sums_[0], intBuffer_, stream);
cuda::integral(image, image_sums_[0], stream);
int templ_sum = (int) cuda::sum(templ)[0];
@@ -456,7 +452,7 @@ namespace
image_sums_.resize(images_.size());
for (int i = 0; i < image.channels(); ++i)
cuda::integral(images_[i], image_sums_[i], intBuffer_, stream);
cuda::integral(images_[i], image_sums_[i], stream);
Scalar templ_sum = cuda::sum(templ);
@@ -501,7 +497,6 @@ namespace
private:
GpuMat imagef_, templf_;
Match_CCORR_32F match_CCORR_32F_;
GpuMat intBuffer_;
std::vector<GpuMat> images_;
std::vector<GpuMat> image_sums_;
std::vector<GpuMat> image_sqsums_;
@@ -527,10 +522,10 @@ namespace
if (image.channels() == 1)
{
image_sums_.resize(1);
cuda::integral(image, image_sums_[0], intBuffer_, stream);
cuda::integral(image, image_sums_[0], stream);
image_sqsums_.resize(1);
cuda::sqrIntegral(image, image_sqsums_[0], intBuffer_, stream);
cuda::sqrIntegral(image, image_sqsums_[0], stream);
int templ_sum = (int) cuda::sum(templ)[0];
double templ_sqsum = cuda::sqrSum(templ)[0];
@@ -547,8 +542,8 @@ namespace
image_sqsums_.resize(images_.size());
for (int i = 0; i < image.channels(); ++i)
{
cuda::integral(images_[i], image_sums_[i], intBuffer_, stream);
cuda::sqrIntegral(images_[i], image_sqsums_[i], intBuffer_, stream);
cuda::integral(images_[i], image_sums_[i], stream);
cuda::sqrIntegral(images_[i], image_sqsums_[i], stream);
}
Scalar templ_sum = cuda::sum(templ);

View File

@@ -43,6 +43,7 @@
#ifndef __OPENCV_CUDALEGACY_HPP__
#define __OPENCV_CUDALEGACY_HPP__
#include "opencv2/core/cuda.hpp"
#include "opencv2/cudalegacy/NCV.hpp"
#include "opencv2/cudalegacy/NPP_staging.hpp"
#include "opencv2/cudalegacy/NCVPyramid.hpp"
@@ -56,4 +57,16 @@
@}
*/
namespace cv { namespace cuda {
class CV_EXPORTS ImagePyramid : public Algorithm
{
public:
virtual void getLayer(OutputArray outImg, Size outRoi, Stream& stream = Stream::Null()) const = 0;
};
CV_EXPORTS Ptr<ImagePyramid> createImagePyramid(InputArray img, int nLayers = -1, Stream& stream = Stream::Null());
}}
#endif /* __OPENCV_CUDALEGACY_HPP__ */
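
For reference, a minimal usage sketch of the ImagePyramid interface declared above (not part of the upstream diff; the function name is illustrative, and d_img is assumed to be a GpuMat of depth <= CV_32F that is at least a few pixels in each dimension):

#include <opencv2/core/cuda.hpp>
#include <opencv2/cudalegacy.hpp>

void sampleQuarterResolution(const cv::cuda::GpuMat& d_img)
{
    // nLayers = -1 builds as many half-resolution layers as fit.
    cv::Ptr<cv::cuda::ImagePyramid> pyr = cv::cuda::createImagePyramid(d_img, -1);

    // Fetch an interpolated layer at roughly quarter resolution.
    cv::cuda::GpuMat d_layer;
    pyr->getLayer(d_layer, cv::Size(d_img.cols / 4, d_img.rows / 4));
}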

View File

@@ -0,0 +1,147 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::cuda;
#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
Ptr<ImagePyramid> cv::cuda::createImagePyramid(InputArray, int, Stream&) { throw_no_cuda(); return Ptr<ImagePyramid>(); }
#else // HAVE_CUDA
namespace
{
class ImagePyramidImpl : public ImagePyramid
{
public:
ImagePyramidImpl(InputArray img, int nLayers, Stream& stream);
void getLayer(OutputArray outImg, Size outRoi, Stream& stream = Stream::Null()) const;
private:
GpuMat layer0_;
std::vector<GpuMat> pyramid_;
int nLayers_;
};
ImagePyramidImpl::ImagePyramidImpl(InputArray _img, int numLayers, Stream& stream)
{
GpuMat img = _img.getGpuMat();
CV_Assert( img.depth() <= CV_32F && img.channels() <= 4 );
img.copyTo(layer0_, stream);
Size szLastLayer = img.size();
nLayers_ = 1;
if (numLayers <= 0)
numLayers = 255; // it will cut off once either dimension reaches 1
pyramid_.resize(numLayers);
for (int i = 0; i < numLayers - 1; ++i)
{
Size szCurLayer(szLastLayer.width / 2, szLastLayer.height / 2);
if (szCurLayer.width == 0 || szCurLayer.height == 0)
break;
ensureSizeIsEnough(szCurLayer, img.type(), pyramid_[i]);
nLayers_++;
const GpuMat& prevLayer = i == 0 ? layer0_ : pyramid_[i - 1];
cv::cuda::device::pyramid::downsampleX2(prevLayer, pyramid_[i], img.depth(), img.channels(), StreamAccessor::getStream(stream));
szLastLayer = szCurLayer;
}
}
void ImagePyramidImpl::getLayer(OutputArray _outImg, Size outRoi, Stream& stream) const
{
CV_Assert( outRoi.width <= layer0_.cols && outRoi.height <= layer0_.rows && outRoi.width > 0 && outRoi.height > 0 );
ensureSizeIsEnough(outRoi, layer0_.type(), _outImg);
GpuMat outImg = _outImg.getGpuMat();
if (outRoi.width == layer0_.cols && outRoi.height == layer0_.rows)
{
layer0_.copyTo(outImg, stream);
return;
}
float lastScale = 1.0f;
float curScale;
GpuMat lastLayer = layer0_;
GpuMat curLayer;
for (int i = 0; i < nLayers_ - 1; ++i)
{
curScale = lastScale * 0.5f;
curLayer = pyramid_[i];
if (outRoi.width == curLayer.cols && outRoi.height == curLayer.rows)
{
curLayer.copyTo(outImg, stream);
}
if (outRoi.width >= curLayer.cols && outRoi.height >= curLayer.rows)
break;
lastScale = curScale;
lastLayer = curLayer;
}
cv::cuda::device::pyramid::interpolateFrom1(lastLayer, outImg, outImg.depth(), outImg.channels(), StreamAccessor::getStream(stream));
}
}
Ptr<ImagePyramid> cv::cuda::createImagePyramid(InputArray img, int nLayers, Stream& stream)
{
return Ptr<ImagePyramid>(new ImagePyramidImpl(img, nLayers, stream));
}
#endif

View File

@@ -0,0 +1,9 @@
if(IOS OR (NOT HAVE_CUDA AND NOT BUILD_CUDA_STUBS))
ocv_module_disable(cudaobjdetect)
endif()
set(the_description "CUDA-accelerated Object Detection")
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wmissing-declarations -Wshadow)
ocv_define_module(cudaobjdetect opencv_objdetect opencv_cudaarithm opencv_cudawarping OPTIONAL opencv_cudalegacy)

View File

@@ -0,0 +1,288 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_CUDAOBJDETECT_HPP__
#define __OPENCV_CUDAOBJDETECT_HPP__
#ifndef __cplusplus
# error cudaobjdetect.hpp header must be compiled as C++
#endif
#include "opencv2/core/cuda.hpp"
/**
@addtogroup cuda
@{
@defgroup cudaobjdetect Object Detection
@}
*/
namespace cv { namespace cuda {
//! @addtogroup cudaobjdetect
//! @{
//
// HOG (Histogram-of-Oriented-Gradients) Descriptor and Object Detector
//
/** @brief The class implements the Histogram of Oriented Gradients (@cite Dalal2005) object detector.
@note
- An example applying the HOG descriptor for people detection can be found at
opencv_source_code/samples/cpp/peopledetect.cpp
- A CUDA example applying the HOG descriptor for people detection can be found at
opencv_source_code/samples/gpu/hog.cpp
- (Python) An example applying the HOG descriptor for people detection can be found at
opencv_source_code/samples/python2/peopledetect.py
*/
class CV_EXPORTS HOG : public Algorithm
{
public:
enum
{
DESCR_FORMAT_ROW_BY_ROW,
DESCR_FORMAT_COL_BY_COL
};
/** @brief Creates the HOG descriptor and detector.
@param win_size Detection window size. Align to block size and block stride.
@param block_size Block size in pixels. Align to cell size. Only (16,16) is supported for now.
@param block_stride Block stride. It must be a multiple of cell size.
@param cell_size Cell size. Only (8, 8) is supported for now.
@param nbins Number of bins. Only 9 bins per cell are supported for now.
*/
static Ptr<HOG> create(Size win_size = Size(64, 128),
Size block_size = Size(16, 16),
Size block_stride = Size(8, 8),
Size cell_size = Size(8, 8),
int nbins = 9);
//! Gaussian smoothing window parameter.
virtual void setWinSigma(double win_sigma) = 0;
virtual double getWinSigma() const = 0;
//! L2-Hys normalization method shrinkage.
virtual void setL2HysThreshold(double threshold_L2hys) = 0;
virtual double getL2HysThreshold() const = 0;
//! Flag to specify whether the gamma correction preprocessing is required or not.
virtual void setGammaCorrection(bool gamma_correction) = 0;
virtual bool getGammaCorrection() const = 0;
//! Maximum number of detection window increases.
virtual void setNumLevels(int nlevels) = 0;
virtual int getNumLevels() const = 0;
//! Threshold for the distance between features and SVM classifying plane.
//! Usually it is 0 and should be specified in the detector coefficients (as the last free
//! coefficient). But if the free coefficient is omitted (which is allowed), you can specify it
//! manually here.
virtual void setHitThreshold(double hit_threshold) = 0;
virtual double getHitThreshold() const = 0;
//! Window stride. It must be a multiple of block stride.
virtual void setWinStride(Size win_stride) = 0;
virtual Size getWinStride() const = 0;
//! Coefficient of the detection window increase.
virtual void setScaleFactor(double scale0) = 0;
virtual double getScaleFactor() const = 0;
//! Coefficient to regulate the similarity threshold. When detected, some
//! objects can be covered by many rectangles. 0 means not to perform grouping.
//! See groupRectangles.
virtual void setGroupThreshold(int group_threshold) = 0;
virtual int getGroupThreshold() const = 0;
//! Descriptor storage format:
//! - **DESCR_FORMAT_ROW_BY_ROW** - Row-major order.
//! - **DESCR_FORMAT_COL_BY_COL** - Column-major order.
virtual void setDescriptorFormat(int descr_format) = 0;
virtual int getDescriptorFormat() const = 0;
/** @brief Returns the number of coefficients required for the classification.
*/
virtual size_t getDescriptorSize() const = 0;
/** @brief Returns the block histogram size.
*/
virtual size_t getBlockHistogramSize() const = 0;
/** @brief Sets coefficients for the linear SVM classifier.
*/
virtual void setSVMDetector(InputArray detector) = 0;
/** @brief Returns coefficients of the classifier trained for people detection.
*/
virtual Mat getDefaultPeopleDetector() const = 0;
/** @brief Performs object detection without a multi-scale window.
@param img Source image. CV_8UC1 and CV_8UC4 types are supported for now.
@param found_locations Top-left corner points of the detected object boundaries.
@param confidences Optional output array for confidences.
*/
virtual void detect(InputArray img,
std::vector<Point>& found_locations,
std::vector<double>* confidences = NULL) = 0;
/** @brief Performs object detection with a multi-scale window.
@param img Source image. See cuda::HOG::detect for type limitations.
@param found_locations Detected objects boundaries.
@param confidences Optional output array for confidences.
*/
virtual void detectMultiScale(InputArray img,
std::vector<Rect>& found_locations,
std::vector<double>* confidences = NULL) = 0;
/** @brief Returns block descriptors computed for the whole image.
@param img Source image. See cuda::HOG::detect for type limitations.
@param descriptors 2D array of descriptors.
@param stream CUDA stream.
*/
virtual void compute(InputArray img,
OutputArray descriptors,
Stream& stream = Stream::Null()) = 0;
};
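As a rough usage sketch of the HOG interface declared above (not part of the patch; the command-line image path and the grayscale input are assumptions):

#include <opencv2/cudaobjdetect.hpp>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/imgproc.hpp>
int main(int argc, char** argv)
{
    if (argc < 2) return 1;
    // Load a CV_8UC1 image and upload it to the device.
    cv::Mat frame = cv::imread(argv[1], cv::IMREAD_GRAYSCALE);
    cv::cuda::GpuMat d_frame(frame);
    // Default 64x128 detection window with the built-in people detector.
    cv::Ptr<cv::cuda::HOG> hog = cv::cuda::HOG::create();
    hog->setSVMDetector(hog->getDefaultPeopleDetector());
    // Multi-scale detection; results come back as host-side rectangles.
    std::vector<cv::Rect> people;
    hog->detectMultiScale(d_frame, people);
    for (size_t i = 0; i < people.size(); ++i)
        cv::rectangle(frame, people[i], cv::Scalar(255));
    return 0;
}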
//
// CascadeClassifier
//
/** @brief Cascade classifier class used for object detection. Supports HAAR and LBP cascades.
@note
- A cascade classifier example can be found at
opencv_source_code/samples/gpu/cascadeclassifier.cpp
- An NVIDIA API-specific cascade classifier example can be found at
opencv_source_code/samples/gpu/cascadeclassifier_nvidia_api.cpp
*/
class CV_EXPORTS CascadeClassifier : public Algorithm
{
public:
/** @brief Loads the classifier from a file. The cascade type is detected automatically from the file.
@param filename Name of the file from which the classifier is loaded. Only the old HAAR classifier
(trained by the haar training application) and NVIDIA's nvbin format are supported for HAAR, and only
the new type of OpenCV XML cascade is supported for LBP.
*/
static Ptr<CascadeClassifier> create(const String& filename);
/** @overload
*/
static Ptr<CascadeClassifier> create(const FileStorage& file);
//! Maximum possible object size. Objects larger than that are ignored.
//! Supported only for LBP cascades.
virtual void setMaxObjectSize(Size maxObjectSize) = 0;
virtual Size getMaxObjectSize() const = 0;
//! Minimum possible object size. Objects smaller than that are ignored.
virtual void setMinObjectSize(Size minSize) = 0;
virtual Size getMinObjectSize() const = 0;
//! Parameter specifying how much the image size is reduced at each image scale.
virtual void setScaleFactor(double scaleFactor) = 0;
virtual double getScaleFactor() const = 0;
//! Parameter specifying how many neighbors each candidate rectangle should have
//! to retain it.
virtual void setMinNeighbors(int minNeighbors) = 0;
virtual int getMinNeighbors() const = 0;
virtual void setFindLargestObject(bool findLargestObject) = 0;
virtual bool getFindLargestObject() = 0;
virtual void setMaxNumObjects(int maxNumObjects) = 0;
virtual int getMaxNumObjects() const = 0;
virtual Size getClassifierSize() const = 0;
/** @brief Detects objects of different sizes in the input image.
@param image Matrix of type CV_8U containing an image where objects should be detected.
@param objects Buffer to store detected objects (rectangles).
@param stream CUDA stream.
To get the final array of detected objects, use the CascadeClassifier::convert method.
@code
Ptr<cuda::CascadeClassifier> cascade_gpu = cuda::CascadeClassifier::create(...);
Mat image_cpu = imread(...);
GpuMat image_gpu(image_cpu);
GpuMat objbuf;
cascade_gpu->detectMultiScale(image_gpu, objbuf);
std::vector<Rect> faces;
cascade_gpu->convert(objbuf, faces);
for(size_t i = 0; i < faces.size(); ++i)
cv::rectangle(image_cpu, faces[i], Scalar(255));
imshow("Faces", image_cpu);
@endcode
@sa CascadeClassifier::detectMultiScale
*/
virtual void detectMultiScale(InputArray image,
OutputArray objects,
Stream& stream = Stream::Null()) = 0;
/** @brief Converts objects array from internal representation to standard vector.
@param gpu_objects Objects array in internal representation.
@param objects Resulting array.
*/
virtual void convert(OutputArray gpu_objects,
std::vector<Rect>& objects) = 0;
};
//! @}
}} // namespace cv { namespace cuda {
#endif /* __OPENCV_CUDAOBJDETECT_HPP__ */

View File

@ -0,0 +1,47 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "perf_precomp.hpp"
using namespace perf;
CV_PERF_TEST_CUDA_MAIN(cudaobjdetect)

View File

@ -71,10 +71,10 @@ PERF_TEST_P(Image, ObjDetect_HOG,
const cv::cuda::GpuMat d_img(img);
std::vector<cv::Rect> gpu_found_locations;
cv::cuda::HOGDescriptor d_hog;
d_hog.setSVMDetector(cv::cuda::HOGDescriptor::getDefaultPeopleDetector());
cv::Ptr<cv::cuda::HOG> d_hog = cv::cuda::HOG::create();
d_hog->setSVMDetector(d_hog->getDefaultPeopleDetector());
TEST_CYCLE() d_hog.detectMultiScale(d_img, gpu_found_locations);
TEST_CYCLE() d_hog->detectMultiScale(d_img, gpu_found_locations);
SANITY_CHECK(gpu_found_locations);
}
@ -82,8 +82,10 @@ PERF_TEST_P(Image, ObjDetect_HOG,
{
std::vector<cv::Rect> cpu_found_locations;
cv::Ptr<cv::cuda::HOG> d_hog = cv::cuda::HOG::create();
cv::HOGDescriptor hog;
hog.setSVMDetector(cv::cuda::HOGDescriptor::getDefaultPeopleDetector());
hog.setSVMDetector(d_hog->getDefaultPeopleDetector());
TEST_CYCLE() hog.detectMultiScale(img, cpu_found_locations);
@ -105,18 +107,17 @@ PERF_TEST_P(ImageAndCascade, ObjDetect_HaarClassifier,
if (PERF_RUN_CUDA())
{
cv::cuda::CascadeClassifier_CUDA d_cascade;
ASSERT_TRUE(d_cascade.load(perf::TestBase::getDataPath(GetParam().second)));
cv::Ptr<cv::cuda::CascadeClassifier> d_cascade =
cv::cuda::CascadeClassifier::create(perf::TestBase::getDataPath(GetParam().second));
const cv::cuda::GpuMat d_img(img);
cv::cuda::GpuMat objects_buffer;
int detections_num = 0;
TEST_CYCLE() detections_num = d_cascade.detectMultiScale(d_img, objects_buffer);
TEST_CYCLE() d_cascade->detectMultiScale(d_img, objects_buffer);
std::vector<cv::Rect> gpu_rects;
d_cascade->convert(objects_buffer, gpu_rects);
std::vector<cv::Rect> gpu_rects(detections_num);
cv::Mat gpu_rects_mat(1, detections_num, cv::DataType<cv::Rect>::type, &gpu_rects[0]);
objects_buffer.colRange(0, detections_num).download(gpu_rects_mat);
cv::groupRectangles(gpu_rects, 3, 0.2);
SANITY_CHECK(gpu_rects);
}
@ -144,18 +145,17 @@ PERF_TEST_P(ImageAndCascade, ObjDetect_LBPClassifier,
if (PERF_RUN_CUDA())
{
cv::cuda::CascadeClassifier_CUDA d_cascade;
ASSERT_TRUE(d_cascade.load(perf::TestBase::getDataPath(GetParam().second)));
cv::Ptr<cv::cuda::CascadeClassifier> d_cascade =
cv::cuda::CascadeClassifier::create(perf::TestBase::getDataPath(GetParam().second));
const cv::cuda::GpuMat d_img(img);
cv::cuda::GpuMat objects_buffer;
int detections_num = 0;
TEST_CYCLE() detections_num = d_cascade.detectMultiScale(d_img, objects_buffer);
TEST_CYCLE() d_cascade->detectMultiScale(d_img, objects_buffer);
std::vector<cv::Rect> gpu_rects;
d_cascade->convert(objects_buffer, gpu_rects);
std::vector<cv::Rect> gpu_rects(detections_num);
cv::Mat gpu_rects_mat(1, detections_num, cv::DataType<cv::Rect>::type, &gpu_rects[0]);
objects_buffer.colRange(0, detections_num).download(gpu_rects_mat);
cv::groupRectangles(gpu_rects, 3, 0.2);
SANITY_CHECK(gpu_rects);
}
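The two classifier hunks above drop the manual result download (colRange plus a download into a pre-sized Rect matrix) in favour of the new convert() helper. A hedged sketch of the resulting pattern, assuming cascadePath and d_img already exist:

cv::Ptr<cv::cuda::CascadeClassifier> cascade =
    cv::cuda::CascadeClassifier::create(cascadePath);  // cascadePath: hypothetical cascade file
cv::cuda::GpuMat objbuf;                               // detections stay on the device...
cascade->detectMultiScale(d_img, objbuf);
std::vector<cv::Rect> rects;
cascade->convert(objbuf, rects);                       // ...until converted to a host vector
cv::groupRectangles(rects, 3, 0.2);                    // same post-processing as before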

View File

@ -0,0 +1,64 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifdef __GNUC__
# pragma GCC diagnostic ignored "-Wmissing-declarations"
# if defined __clang__ || defined __APPLE__
# pragma GCC diagnostic ignored "-Wmissing-prototypes"
# pragma GCC diagnostic ignored "-Wextra"
# endif
#endif
#ifndef __OPENCV_PERF_PRECOMP_HPP__
#define __OPENCV_PERF_PRECOMP_HPP__
#include "opencv2/ts.hpp"
#include "opencv2/ts/cuda_perf.hpp"
#include "opencv2/cudaobjdetect.hpp"
#include "opencv2/objdetect.hpp"
#ifdef GTEST_CREATE_SHARED_LIBRARY
#error no modules except ts should have GTEST_CREATE_SHARED_LIBRARY defined
#endif
#endif

View File

@ -48,160 +48,185 @@ using namespace cv::cuda;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
cv::cuda::CascadeClassifier_CUDA::CascadeClassifier_CUDA() { throw_no_cuda(); }
cv::cuda::CascadeClassifier_CUDA::CascadeClassifier_CUDA(const String&) { throw_no_cuda(); }
cv::cuda::CascadeClassifier_CUDA::~CascadeClassifier_CUDA() { throw_no_cuda(); }
bool cv::cuda::CascadeClassifier_CUDA::empty() const { throw_no_cuda(); return true; }
bool cv::cuda::CascadeClassifier_CUDA::load(const String&) { throw_no_cuda(); return true; }
Size cv::cuda::CascadeClassifier_CUDA::getClassifierSize() const { throw_no_cuda(); return Size();}
void cv::cuda::CascadeClassifier_CUDA::release() { throw_no_cuda(); }
int cv::cuda::CascadeClassifier_CUDA::detectMultiScale( const GpuMat&, GpuMat&, double, int, Size) {throw_no_cuda(); return -1;}
int cv::cuda::CascadeClassifier_CUDA::detectMultiScale( const GpuMat&, GpuMat&, Size, Size, double, int) {throw_no_cuda(); return -1;}
Ptr<cuda::CascadeClassifier> cv::cuda::CascadeClassifier::create(const String&) { throw_no_cuda(); return Ptr<cuda::CascadeClassifier>(); }
Ptr<cuda::CascadeClassifier> cv::cuda::CascadeClassifier::create(const FileStorage&) { throw_no_cuda(); return Ptr<cuda::CascadeClassifier>(); }
#else
struct cv::cuda::CascadeClassifier_CUDA::CascadeClassifierImpl
//
// CascadeClassifierBase
//
namespace
{
public:
CascadeClassifierImpl(){}
virtual ~CascadeClassifierImpl(){}
class CascadeClassifierBase : public cuda::CascadeClassifier
{
public:
CascadeClassifierBase();
virtual unsigned int process(const GpuMat& src, GpuMat& objects, float scaleStep, int minNeighbors,
bool findLargestObject, bool visualizeInPlace, cv::Size ncvMinSize, cv::Size maxObjectSize) = 0;
virtual void setMaxObjectSize(Size maxObjectSize) { maxObjectSize_ = maxObjectSize; }
virtual Size getMaxObjectSize() const { return maxObjectSize_; }
virtual cv::Size getClassifierCvSize() const = 0;
virtual bool read(const String& classifierAsXml) = 0;
};
virtual void setMinObjectSize(Size minSize) { minObjectSize_ = minSize; }
virtual Size getMinObjectSize() const { return minObjectSize_; }
#ifndef HAVE_OPENCV_CUDALEGACY
virtual void setScaleFactor(double scaleFactor) { scaleFactor_ = scaleFactor; }
virtual double getScaleFactor() const { return scaleFactor_; }
struct cv::cuda::CascadeClassifier_CUDA::HaarCascade : cv::cuda::CascadeClassifier_CUDA::CascadeClassifierImpl
virtual void setMinNeighbors(int minNeighbors) { minNeighbors_ = minNeighbors; }
virtual int getMinNeighbors() const { return minNeighbors_; }
virtual void setFindLargestObject(bool findLargestObject) { findLargestObject_ = findLargestObject; }
virtual bool getFindLargestObject() { return findLargestObject_; }
virtual void setMaxNumObjects(int maxNumObjects) { maxNumObjects_ = maxNumObjects; }
virtual int getMaxNumObjects() const { return maxNumObjects_; }
protected:
Size maxObjectSize_;
Size minObjectSize_;
double scaleFactor_;
int minNeighbors_;
bool findLargestObject_;
int maxNumObjects_;
};
CascadeClassifierBase::CascadeClassifierBase() :
maxObjectSize_(),
minObjectSize_(),
scaleFactor_(1.2),
minNeighbors_(4),
findLargestObject_(false),
maxNumObjects_(100)
{
}
}
//
// HaarCascade
//
#ifdef HAVE_OPENCV_CUDALEGACY
namespace
{
public:
HaarCascade()
class HaarCascade_Impl : public CascadeClassifierBase
{
throw_no_cuda();
public:
explicit HaarCascade_Impl(const String& filename);
virtual Size getClassifierSize() const;
virtual void detectMultiScale(InputArray image,
OutputArray objects,
Stream& stream);
virtual void convert(OutputArray gpu_objects,
std::vector<Rect>& objects);
private:
NCVStatus load(const String& classifierFile);
NCVStatus calculateMemReqsAndAllocate(const Size& frameSize);
NCVStatus process(const GpuMat& src, GpuMat& objects, cv::Size ncvMinSize, /*out*/ unsigned int& numDetections);
Size lastAllocatedFrameSize;
Ptr<NCVMemStackAllocator> gpuAllocator;
Ptr<NCVMemStackAllocator> cpuAllocator;
cudaDeviceProp devProp;
NCVStatus ncvStat;
Ptr<NCVMemNativeAllocator> gpuCascadeAllocator;
Ptr<NCVMemNativeAllocator> cpuCascadeAllocator;
Ptr<NCVVectorAlloc<HaarStage64> > h_haarStages;
Ptr<NCVVectorAlloc<HaarClassifierNode128> > h_haarNodes;
Ptr<NCVVectorAlloc<HaarFeature64> > h_haarFeatures;
HaarClassifierCascadeDescriptor haar;
Ptr<NCVVectorAlloc<HaarStage64> > d_haarStages;
Ptr<NCVVectorAlloc<HaarClassifierNode128> > d_haarNodes;
Ptr<NCVVectorAlloc<HaarFeature64> > d_haarFeatures;
};
static void NCVDebugOutputHandler(const String &msg)
{
CV_Error(Error::GpuApiCallError, msg.c_str());
}
unsigned int process(const GpuMat&, GpuMat&, float, int, bool, bool, cv::Size, cv::Size)
{
throw_no_cuda();
return 0;
}
cv::Size getClassifierCvSize() const
{
throw_no_cuda();
return cv::Size();
}
bool read(const String&)
{
throw_no_cuda();
return false;
}
};
#else
struct cv::cuda::CascadeClassifier_CUDA::HaarCascade : cv::cuda::CascadeClassifier_CUDA::CascadeClassifierImpl
{
public:
HaarCascade() : lastAllocatedFrameSize(-1, -1)
HaarCascade_Impl::HaarCascade_Impl(const String& filename) :
lastAllocatedFrameSize(-1, -1)
{
ncvSetDebugOutputHandler(NCVDebugOutputHandler);
}
bool read(const String& filename)
{
ncvSafeCall( load(filename) );
return true;
}
NCVStatus process(const GpuMat& src, GpuMat& objects, float scaleStep, int minNeighbors,
bool findLargestObject, bool visualizeInPlace, cv::Size ncvMinSize,
/*out*/unsigned int& numDetections)
Size HaarCascade_Impl::getClassifierSize() const
{
calculateMemReqsAndAllocate(src.size());
NCVMemPtr src_beg;
src_beg.ptr = (void*)src.ptr<Ncv8u>();
src_beg.memtype = NCVMemoryTypeDevice;
NCVMemSegment src_seg;
src_seg.begin = src_beg;
src_seg.size = src.step * src.rows;
NCVMatrixReuse<Ncv8u> d_src(src_seg, static_cast<int>(devProp.textureAlignment), src.cols, src.rows, static_cast<int>(src.step), true);
ncvAssertReturn(d_src.isMemReused(), NCV_ALLOCATOR_BAD_REUSE);
CV_Assert(objects.rows == 1);
NCVMemPtr objects_beg;
objects_beg.ptr = (void*)objects.ptr<NcvRect32u>();
objects_beg.memtype = NCVMemoryTypeDevice;
NCVMemSegment objects_seg;
objects_seg.begin = objects_beg;
objects_seg.size = objects.step * objects.rows;
NCVVectorReuse<NcvRect32u> d_rects(objects_seg, objects.cols);
ncvAssertReturn(d_rects.isMemReused(), NCV_ALLOCATOR_BAD_REUSE);
NcvSize32u roi;
roi.width = d_src.width();
roi.height = d_src.height();
NcvSize32u winMinSize(ncvMinSize.width, ncvMinSize.height);
Ncv32u flags = 0;
flags |= findLargestObject? NCVPipeObjDet_FindLargestObject : 0;
flags |= visualizeInPlace ? NCVPipeObjDet_VisualizeInPlace : 0;
ncvStat = ncvDetectObjectsMultiScale_device(
d_src, roi, d_rects, numDetections, haar, *h_haarStages,
*d_haarStages, *d_haarNodes, *d_haarFeatures,
winMinSize,
minNeighbors,
scaleStep, 1,
flags,
*gpuAllocator, *cpuAllocator, devProp, 0);
ncvAssertReturnNcvStat(ncvStat);
ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);
return NCV_SUCCESS;
return Size(haar.ClassifierSize.width, haar.ClassifierSize.height);
}
unsigned int process(const GpuMat& image, GpuMat& objectsBuf, float scaleFactor, int minNeighbors,
bool findLargestObject, bool visualizeInPlace, cv::Size minSize, cv::Size /*maxObjectSize*/)
void HaarCascade_Impl::detectMultiScale(InputArray _image,
OutputArray _objects,
Stream& stream)
{
CV_Assert( scaleFactor > 1 && image.depth() == CV_8U);
const GpuMat image = _image.getGpuMat();
const int defaultObjSearchNum = 100;
if (objectsBuf.empty())
CV_Assert( image.depth() == CV_8U);
CV_Assert( scaleFactor_ > 1 );
CV_Assert( !stream );
Size ncvMinSize = getClassifierSize();
if (ncvMinSize.width < minObjectSize_.width && ncvMinSize.height < minObjectSize_.height)
{
objectsBuf.create(1, defaultObjSearchNum, DataType<Rect>::type);
ncvMinSize.width = minObjectSize_.width;
ncvMinSize.height = minObjectSize_.height;
}
cv::Size ncvMinSize = this->getClassifierCvSize();
if (ncvMinSize.width < minSize.width && ncvMinSize.height < minSize.height)
{
ncvMinSize.width = minSize.width;
ncvMinSize.height = minSize.height;
}
BufferPool pool(stream);
GpuMat objectsBuf = pool.getBuffer(1, maxNumObjects_, DataType<Rect>::type);
unsigned int numDetections;
ncvSafeCall(this->process(image, objectsBuf, (float)scaleFactor, minNeighbors, findLargestObject, visualizeInPlace, ncvMinSize, numDetections));
ncvSafeCall( process(image, objectsBuf, ncvMinSize, numDetections) );
return numDetections;
if (numDetections > 0)
{
objectsBuf.colRange(0, numDetections).copyTo(_objects);
}
else
{
_objects.release();
}
}
cv::Size getClassifierCvSize() const { return cv::Size(haar.ClassifierSize.width, haar.ClassifierSize.height); }
void HaarCascade_Impl::convert(OutputArray _gpu_objects, std::vector<Rect>& objects)
{
if (_gpu_objects.empty())
{
objects.clear();
return;
}
private:
static void NCVDebugOutputHandler(const String &msg) { CV_Error(cv::Error::GpuApiCallError, msg.c_str()); }
Mat gpu_objects;
if (_gpu_objects.kind() == _InputArray::CUDA_GPU_MAT)
{
_gpu_objects.getGpuMat().download(gpu_objects);
}
else
{
gpu_objects = _gpu_objects.getMat();
}
NCVStatus load(const String& classifierFile)
CV_Assert( gpu_objects.rows == 1 );
CV_Assert( gpu_objects.type() == DataType<Rect>::type );
Rect* ptr = gpu_objects.ptr<Rect>();
objects.assign(ptr, ptr + gpu_objects.cols);
}
NCVStatus HaarCascade_Impl::load(const String& classifierFile)
{
int devId = cv::cuda::getDevice();
ncvAssertCUDAReturn(cudaGetDeviceProperties(&devProp, devId), NCV_CUDA_ERROR);
@ -246,7 +271,7 @@ private:
return NCV_SUCCESS;
}
NCVStatus calculateMemReqsAndAllocate(const Size& frameSize)
NCVStatus HaarCascade_Impl::calculateMemReqsAndAllocate(const Size& frameSize)
{
if (lastAllocatedFrameSize == frameSize)
{
@ -289,88 +314,62 @@ private:
return NCV_SUCCESS;
}
cudaDeviceProp devProp;
NCVStatus ncvStat;
NCVStatus HaarCascade_Impl::process(const GpuMat& src, GpuMat& objects, cv::Size ncvMinSize, /*out*/ unsigned int& numDetections)
{
calculateMemReqsAndAllocate(src.size());
Ptr<NCVMemNativeAllocator> gpuCascadeAllocator;
Ptr<NCVMemNativeAllocator> cpuCascadeAllocator;
NCVMemPtr src_beg;
src_beg.ptr = (void*)src.ptr<Ncv8u>();
src_beg.memtype = NCVMemoryTypeDevice;
Ptr<NCVVectorAlloc<HaarStage64> > h_haarStages;
Ptr<NCVVectorAlloc<HaarClassifierNode128> > h_haarNodes;
Ptr<NCVVectorAlloc<HaarFeature64> > h_haarFeatures;
NCVMemSegment src_seg;
src_seg.begin = src_beg;
src_seg.size = src.step * src.rows;
HaarClassifierCascadeDescriptor haar;
NCVMatrixReuse<Ncv8u> d_src(src_seg, static_cast<int>(devProp.textureAlignment), src.cols, src.rows, static_cast<int>(src.step), true);
ncvAssertReturn(d_src.isMemReused(), NCV_ALLOCATOR_BAD_REUSE);
Ptr<NCVVectorAlloc<HaarStage64> > d_haarStages;
Ptr<NCVVectorAlloc<HaarClassifierNode128> > d_haarNodes;
Ptr<NCVVectorAlloc<HaarFeature64> > d_haarFeatures;
CV_Assert(objects.rows == 1);
Size lastAllocatedFrameSize;
NCVMemPtr objects_beg;
objects_beg.ptr = (void*)objects.ptr<NcvRect32u>();
objects_beg.memtype = NCVMemoryTypeDevice;
Ptr<NCVMemStackAllocator> gpuAllocator;
Ptr<NCVMemStackAllocator> cpuAllocator;
NCVMemSegment objects_seg;
objects_seg.begin = objects_beg;
objects_seg.size = objects.step * objects.rows;
NCVVectorReuse<NcvRect32u> d_rects(objects_seg, objects.cols);
ncvAssertReturn(d_rects.isMemReused(), NCV_ALLOCATOR_BAD_REUSE);
virtual ~HaarCascade(){}
};
NcvSize32u roi;
roi.width = d_src.width();
roi.height = d_src.height();
NcvSize32u winMinSize(ncvMinSize.width, ncvMinSize.height);
Ncv32u flags = 0;
flags |= findLargestObject_ ? NCVPipeObjDet_FindLargestObject : 0;
ncvStat = ncvDetectObjectsMultiScale_device(
d_src, roi, d_rects, numDetections, haar, *h_haarStages,
*d_haarStages, *d_haarNodes, *d_haarFeatures,
winMinSize,
minNeighbors_,
scaleFactor_, 1,
flags,
*gpuAllocator, *cpuAllocator, devProp, 0);
ncvAssertReturnNcvStat(ncvStat);
ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);
return NCV_SUCCESS;
}
}
#endif
cv::Size operator -(const cv::Size& a, const cv::Size& b)
{
return cv::Size(a.width - b.width, a.height - b.height);
}
cv::Size operator +(const cv::Size& a, const int& i)
{
return cv::Size(a.width + i, a.height + i);
}
cv::Size operator *(const cv::Size& a, const float& f)
{
return cv::Size(cvRound(a.width * f), cvRound(a.height * f));
}
cv::Size operator /(const cv::Size& a, const float& f)
{
return cv::Size(cvRound(a.width / f), cvRound(a.height / f));
}
bool operator <=(const cv::Size& a, const cv::Size& b)
{
return a.width <= b.width && a.height <= b.height;
}
struct PyrLavel
{
PyrLavel(int _order, float _scale, cv::Size frame, cv::Size window, cv::Size minObjectSize)
{
do
{
order = _order;
scale = pow(_scale, order);
sFrame = frame / scale;
workArea = sFrame - window + 1;
sWindow = window * scale;
_order++;
} while (sWindow <= minObjectSize);
}
bool isFeasible(cv::Size maxObj)
{
return workArea.width > 0 && workArea.height > 0 && sWindow <= maxObj;
}
PyrLavel next(float factor, cv::Size frame, cv::Size window, cv::Size minObjectSize)
{
return PyrLavel(order + 1, factor, frame, window, minObjectSize);
}
int order;
float scale;
cv::Size sFrame;
cv::Size workArea;
cv::Size sWindow;
};
//
// LbpCascade
//
namespace cv { namespace cuda { namespace device
{
@ -394,42 +393,154 @@ namespace cv { namespace cuda { namespace device
unsigned int* classified,
PtrStepSzi integral);
void connectedConmonents(PtrStepSz<int4> candidates, int ncandidates, PtrStepSz<int4> objects,int groupThreshold, float grouping_eps, unsigned int* nclasses);
void connectedConmonents(PtrStepSz<int4> candidates,
int ncandidates,
PtrStepSz<int4> objects,
int groupThreshold,
float grouping_eps,
unsigned int* nclasses);
}
}}}
struct cv::cuda::CascadeClassifier_CUDA::LbpCascade : cv::cuda::CascadeClassifier_CUDA::CascadeClassifierImpl
namespace
{
public:
struct Stage
cv::Size operator -(const cv::Size& a, const cv::Size& b)
{
int first;
int ntrees;
float threshold;
return cv::Size(a.width - b.width, a.height - b.height);
}
cv::Size operator +(const cv::Size& a, const int& i)
{
return cv::Size(a.width + i, a.height + i);
}
cv::Size operator *(const cv::Size& a, const float& f)
{
return cv::Size(cvRound(a.width * f), cvRound(a.height * f));
}
cv::Size operator /(const cv::Size& a, const float& f)
{
return cv::Size(cvRound(a.width / f), cvRound(a.height / f));
}
bool operator <=(const cv::Size& a, const cv::Size& b)
{
return a.width <= b.width && a.height <= b.height;
}
struct PyrLavel
{
PyrLavel(int _order, float _scale, cv::Size frame, cv::Size window, cv::Size minObjectSize)
{
do
{
order = _order;
scale = pow(_scale, order);
sFrame = frame / scale;
workArea = sFrame - window + 1;
sWindow = window * scale;
_order++;
} while (sWindow <= minObjectSize);
}
bool isFeasible(cv::Size maxObj)
{
return workArea.width > 0 && workArea.height > 0 && sWindow <= maxObj;
}
PyrLavel next(float factor, cv::Size frame, cv::Size window, cv::Size minObjectSize)
{
return PyrLavel(order + 1, factor, frame, window, minObjectSize);
}
int order;
float scale;
cv::Size sFrame;
cv::Size workArea;
cv::Size sWindow;
};
LbpCascade(){}
virtual ~LbpCascade(){}
virtual unsigned int process(const GpuMat& image, GpuMat& objects, float scaleFactor, int groupThreshold, bool /*findLargestObject*/,
bool /*visualizeInPlace*/, cv::Size minObjectSize, cv::Size maxObjectSize)
class LbpCascade_Impl : public CascadeClassifierBase
{
CV_Assert(scaleFactor > 1 && image.depth() == CV_8U);
public:
explicit LbpCascade_Impl(const FileStorage& file);
virtual Size getClassifierSize() const { return NxM; }
virtual void detectMultiScale(InputArray image,
OutputArray objects,
Stream& stream);
virtual void convert(OutputArray gpu_objects,
std::vector<Rect>& objects);
private:
bool load(const FileNode &root);
void allocateBuffers(cv::Size frame);
private:
struct Stage
{
int first;
int ntrees;
float threshold;
};
enum stage { BOOST = 0 };
enum feature { LBP = 1, HAAR = 2 };
static const stage stageType = BOOST;
static const feature featureType = LBP;
cv::Size NxM;
bool isStumps;
int ncategories;
int subsetSize;
int nodeStep;
// gpu representation of classifier
GpuMat stage_mat;
GpuMat trees_mat;
GpuMat nodes_mat;
GpuMat leaves_mat;
GpuMat subsets_mat;
GpuMat features_mat;
GpuMat integral;
GpuMat integralBuffer;
GpuMat resuzeBuffer;
GpuMat candidates;
static const int integralFactor = 4;
};
LbpCascade_Impl::LbpCascade_Impl(const FileStorage& file)
{
load(file.getFirstTopLevelNode());
}
void LbpCascade_Impl::detectMultiScale(InputArray _image,
OutputArray _objects,
Stream& stream)
{
const GpuMat image = _image.getGpuMat();
CV_Assert( image.depth() == CV_8U);
CV_Assert( scaleFactor_ > 1 );
CV_Assert( !stream );
// const int defaultObjSearchNum = 100;
const float grouping_eps = 0.2f;
if( !objects.empty() && objects.depth() == CV_32S)
objects.reshape(4, 1);
else
objects.create(1 , image.cols >> 4, CV_32SC4);
BufferPool pool(stream);
GpuMat objects = pool.getBuffer(1, maxNumObjects_, DataType<Rect>::type);
// used for debug
// candidates.setTo(cv::Scalar::all(0));
// objects.setTo(cv::Scalar::all(0));
if (maxObjectSize == cv::Size())
maxObjectSize = image.size();
if (maxObjectSize_ == cv::Size())
maxObjectSize_ = image.size();
allocateBuffers(image.size());
@ -437,9 +548,9 @@ public:
GpuMat dclassified(1, 1, CV_32S);
cudaSafeCall( cudaMemcpy(dclassified.ptr(), &classified, sizeof(int), cudaMemcpyHostToDevice) );
PyrLavel level(0, scaleFactor, image.size(), NxM, minObjectSize);
PyrLavel level(0, scaleFactor_, image.size(), NxM, minObjectSize_);
while (level.isFeasible(maxObjectSize))
while (level.isFeasible(maxObjectSize_))
{
int acc = level.sFrame.width + 1;
float iniScale = level.scale;
@ -449,23 +560,22 @@ public:
int total = 0, prev = 0;
while (acc <= integralFactor * (image.cols + 1) && level.isFeasible(maxObjectSize))
while (acc <= integralFactor * (image.cols + 1) && level.isFeasible(maxObjectSize_))
{
// create suitable matrix headers
GpuMat src = resuzeBuffer(cv::Rect(0, 0, level.sFrame.width, level.sFrame.height));
GpuMat sint = integral(cv::Rect(prev, 0, level.sFrame.width + 1, level.sFrame.height + 1));
GpuMat buff = integralBuffer;
// generate integral for scale
cuda::resize(image, src, level.sFrame, 0, 0, cv::INTER_LINEAR);
cuda::integral(src, sint, buff);
cuda::integral(src, sint);
// calculate job
int totalWidth = level.workArea.width / step;
total += totalWidth * (level.workArea.height / step);
// go to the next pyramid level
level = level.next(scaleFactor, image.size(), NxM, minObjectSize);
level = level.next(scaleFactor_, image.size(), NxM, minObjectSize_);
area = level.workArea;
step = (1 + (level.scale <= 2.f));
@ -473,60 +583,55 @@ public:
acc += level.sFrame.width + 1;
}
device::lbp::classifyPyramid(image.cols, image.rows, NxM.width - 1, NxM.height - 1, iniScale, scaleFactor, total, stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat,
device::lbp::classifyPyramid(image.cols, image.rows, NxM.width - 1, NxM.height - 1, iniScale, scaleFactor_, total, stage_mat, stage_mat.cols / sizeof(Stage), nodes_mat,
leaves_mat, subsets_mat, features_mat, subsetSize, candidates, dclassified.ptr<unsigned int>(), integral);
}
if (groupThreshold <= 0 || objects.empty())
return 0;
if (minNeighbors_ <= 0 || objects.empty())
return;
cudaSafeCall( cudaMemcpy(&classified, dclassified.ptr(), sizeof(int), cudaMemcpyDeviceToHost) );
device::lbp::connectedConmonents(candidates, classified, objects, groupThreshold, grouping_eps, dclassified.ptr<unsigned int>());
device::lbp::connectedConmonents(candidates, classified, objects, minNeighbors_, grouping_eps, dclassified.ptr<unsigned int>());
cudaSafeCall( cudaMemcpy(&classified, dclassified.ptr(), sizeof(int), cudaMemcpyDeviceToHost) );
cudaSafeCall( cudaDeviceSynchronize() );
return classified;
}
virtual cv::Size getClassifierCvSize() const { return NxM; }
bool read(const String& classifierAsXml)
{
FileStorage fs(classifierAsXml, FileStorage::READ);
return fs.isOpened() ? read(fs.getFirstTopLevelNode()) : false;
}
private:
void allocateBuffers(cv::Size frame)
{
if (frame == cv::Size())
return;
if (resuzeBuffer.empty() || frame.width > resuzeBuffer.cols || frame.height > resuzeBuffer.rows)
if (classified > 0)
{
resuzeBuffer.create(frame, CV_8UC1);
integral.create(frame.height + 1, integralFactor * (frame.width + 1), CV_32SC1);
#ifdef HAVE_OPENCV_CUDALEGACY
NcvSize32u roiSize;
roiSize.width = frame.width;
roiSize.height = frame.height;
cudaDeviceProp prop;
cudaSafeCall( cudaGetDeviceProperties(&prop, cv::cuda::getDevice()) );
Ncv32u bufSize;
ncvSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize, prop) );
integralBuffer.create(1, bufSize, CV_8UC1);
#endif
candidates.create(1 , frame.width >> 1, CV_32SC4);
objects.colRange(0, classified).copyTo(_objects);
}
else
{
_objects.release();
}
}
bool read(const FileNode &root)
void LbpCascade_Impl::convert(OutputArray _gpu_objects, std::vector<Rect>& objects)
{
if (_gpu_objects.empty())
{
objects.clear();
return;
}
Mat gpu_objects;
if (_gpu_objects.kind() == _InputArray::CUDA_GPU_MAT)
{
_gpu_objects.getGpuMat().download(gpu_objects);
}
else
{
gpu_objects = _gpu_objects.getMat();
}
CV_Assert( gpu_objects.rows == 1 );
CV_Assert( gpu_objects.type() == DataType<Rect>::type );
Rect* ptr = gpu_objects.ptr<Rect>();
objects.assign(ptr, ptr + gpu_objects.cols);
}
bool LbpCascade_Impl::load(const FileNode &root)
{
const char *CUDA_CC_STAGE_TYPE = "stageType";
const char *CUDA_CC_FEATURE_TYPE = "featureType";
@ -667,92 +772,90 @@ private:
return true;
}
enum stage { BOOST = 0 };
enum feature { LBP = 1, HAAR = 2 };
static const stage stageType = BOOST;
static const feature featureType = LBP;
void LbpCascade_Impl::allocateBuffers(cv::Size frame)
{
if (frame == cv::Size())
return;
cv::Size NxM;
bool isStumps;
int ncategories;
int subsetSize;
int nodeStep;
if (resuzeBuffer.empty() || frame.width > resuzeBuffer.cols || frame.height > resuzeBuffer.rows)
{
resuzeBuffer.create(frame, CV_8UC1);
// gpu representation of classifier
GpuMat stage_mat;
GpuMat trees_mat;
GpuMat nodes_mat;
GpuMat leaves_mat;
GpuMat subsets_mat;
GpuMat features_mat;
integral.create(frame.height + 1, integralFactor * (frame.width + 1), CV_32SC1);
GpuMat integral;
GpuMat integralBuffer;
GpuMat resuzeBuffer;
#ifdef HAVE_OPENCV_CUDALEGACY
NcvSize32u roiSize;
roiSize.width = frame.width;
roiSize.height = frame.height;
GpuMat candidates;
static const int integralFactor = 4;
};
cudaDeviceProp prop;
cudaSafeCall( cudaGetDeviceProperties(&prop, cv::cuda::getDevice()) );
cv::cuda::CascadeClassifier_CUDA::CascadeClassifier_CUDA()
: findLargestObject(false), visualizeInPlace(false), impl(0) {}
Ncv32u bufSize;
ncvSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize, prop) );
integralBuffer.create(1, bufSize, CV_8UC1);
#endif
cv::cuda::CascadeClassifier_CUDA::CascadeClassifier_CUDA(const String& filename)
: findLargestObject(false), visualizeInPlace(false), impl(0) { load(filename); }
candidates.create(1 , frame.width >> 1, CV_32SC4);
}
}
cv::cuda::CascadeClassifier_CUDA::~CascadeClassifier_CUDA() { release(); }
void cv::cuda::CascadeClassifier_CUDA::release() { if (impl) { delete impl; impl = 0; } }
bool cv::cuda::CascadeClassifier_CUDA::empty() const { return impl == 0; }
Size cv::cuda::CascadeClassifier_CUDA::getClassifierSize() const
{
return this->empty() ? Size() : impl->getClassifierCvSize();
}
int cv::cuda::CascadeClassifier_CUDA::detectMultiScale( const GpuMat& image, GpuMat& objectsBuf, double scaleFactor, int minNeighbors, Size minSize)
{
CV_Assert( !this->empty());
return impl->process(image, objectsBuf, (float)scaleFactor, minNeighbors, findLargestObject, visualizeInPlace, minSize, cv::Size());
}
//
// create
//
int cv::cuda::CascadeClassifier_CUDA::detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, Size maxObjectSize, Size minSize, double scaleFactor, int minNeighbors)
Ptr<cuda::CascadeClassifier> cv::cuda::CascadeClassifier::create(const String& filename)
{
CV_Assert( !this->empty());
return impl->process(image, objectsBuf, (float)scaleFactor, minNeighbors, findLargestObject, visualizeInPlace, minSize, maxObjectSize);
}
bool cv::cuda::CascadeClassifier_CUDA::load(const String& filename)
{
release();
String fext = filename.substr(filename.find_last_of(".") + 1);
fext = fext.toLowerCase();
if (fext == "nvbin")
{
impl = new HaarCascade();
return impl->read(filename);
#ifndef HAVE_OPENCV_CUDALEGACY
CV_Error(Error::StsUnsupportedFormat, "OpenCV CUDA objdetect was built without HaarCascade");
return Ptr<cuda::CascadeClassifier>();
#else
return makePtr<HaarCascade_Impl>(filename);
#endif
}
FileStorage fs(filename, FileStorage::READ);
if (!fs.isOpened())
{
impl = new HaarCascade();
return impl->read(filename);
#ifndef HAVE_OPENCV_CUDALEGACY
CV_Error(Error::StsUnsupportedFormat, "OpenCV CUDA objdetect was built without HaarCascade");
return Ptr<cuda::CascadeClassifier>();
#else
return makePtr<HaarCascade_Impl>(filename);
#endif
}
const char *CUDA_CC_LBP = "LBP";
String featureTypeStr = (String)fs.getFirstTopLevelNode()["featureType"];
if (featureTypeStr == CUDA_CC_LBP)
impl = new LbpCascade();
{
return makePtr<LbpCascade_Impl>(fs);
}
else
impl = new HaarCascade();
{
#ifndef HAVE_OPENCV_CUDALEGACY
CV_Error(Error::StsUnsupportedFormat, "OpenCV CUDA objdetect was built without HaarCascade");
return Ptr<cuda::CascadeClassifier>();
#else
return makePtr<HaarCascade_Impl>(filename);
#endif
}
impl->read(filename);
return !this->empty();
CV_Error(Error::StsUnsupportedFormat, "Unsupported format for CUDA CascadeClassifier");
return Ptr<cuda::CascadeClassifier>();
}
Ptr<cuda::CascadeClassifier> cv::cuda::CascadeClassifier::create(const FileStorage& file)
{
return makePtr<LbpCascade_Impl>(file);
}
#endif

File diff suppressed because it is too large.

View File

@ -0,0 +1,62 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_PRECOMP_H__
#define __OPENCV_PRECOMP_H__
#include <limits>
#include "opencv2/cudaobjdetect.hpp"
#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudawarping.hpp"
#include "opencv2/objdetect.hpp"
#include "opencv2/core/private.cuda.hpp"
#include "opencv2/core/utility.hpp"
#include "opencv2/opencv_modules.hpp"
#ifdef HAVE_OPENCV_CUDALEGACY
# include "opencv2/cudalegacy/private.hpp"
#endif
#endif /* __OPENCV_PRECOMP_H__ */

View File

@ -0,0 +1,45 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "test_precomp.hpp"
CV_CUDA_TEST_MAIN("gpu")

View File

@ -48,9 +48,10 @@ using namespace cvtest;
//#define DUMP
struct HOG : testing::TestWithParam<cv::cuda::DeviceInfo>, cv::cuda::HOGDescriptor
struct HOG : testing::TestWithParam<cv::cuda::DeviceInfo>
{
cv::cuda::DeviceInfo devInfo;
cv::Ptr<cv::cuda::HOG> hog;
#ifdef DUMP
std::ofstream f;
@ -69,23 +70,13 @@ struct HOG : testing::TestWithParam<cv::cuda::DeviceInfo>, cv::cuda::HOGDescript
devInfo = GetParam();
cv::cuda::setDevice(devInfo.deviceID());
hog = cv::cuda::HOG::create();
}
#ifdef DUMP
void dump(const cv::Mat& blockHists, const std::vector<cv::Point>& locations)
void dump(const std::vector<cv::Point>& locations)
{
f.write((char*)&blockHists.rows, sizeof(blockHists.rows));
f.write((char*)&blockHists.cols, sizeof(blockHists.cols));
for (int i = 0; i < blockHists.rows; ++i)
{
for (int j = 0; j < blockHists.cols; ++j)
{
float val = blockHists.at<float>(i, j);
f.write((char*)&val, sizeof(val));
}
}
int nlocations = locations.size();
f.write((char*)&nlocations, sizeof(nlocations));
@ -93,21 +84,18 @@ struct HOG : testing::TestWithParam<cv::cuda::DeviceInfo>, cv::cuda::HOGDescript
f.write((char*)&locations[i], sizeof(locations[i]));
}
#else
void compare(const cv::Mat& blockHists, const std::vector<cv::Point>& locations)
void compare(const std::vector<cv::Point>& locations)
{
// skip block_hists check
int rows, cols;
f.read((char*)&rows, sizeof(rows));
f.read((char*)&cols, sizeof(cols));
ASSERT_EQ(rows, blockHists.rows);
ASSERT_EQ(cols, blockHists.cols);
for (int i = 0; i < blockHists.rows; ++i)
for (int i = 0; i < rows; ++i)
{
for (int j = 0; j < blockHists.cols; ++j)
for (int j = 0; j < cols; ++j)
{
float val;
f.read((char*)&val, sizeof(val));
ASSERT_NEAR(val, blockHists.at<float>(i, j), 1e-3);
}
}
@ -126,54 +114,41 @@ struct HOG : testing::TestWithParam<cv::cuda::DeviceInfo>, cv::cuda::HOGDescript
void testDetect(const cv::Mat& img)
{
gamma_correction = false;
setSVMDetector(cv::cuda::HOGDescriptor::getDefaultPeopleDetector());
hog->setGammaCorrection(false);
hog->setSVMDetector(hog->getDefaultPeopleDetector());
std::vector<cv::Point> locations;
// Test detect
detect(loadMat(img), locations, 0);
hog->detect(loadMat(img), locations);
#ifdef DUMP
dump(cv::Mat(block_hists), locations);
dump(locations);
#else
compare(cv::Mat(block_hists), locations);
compare(locations);
#endif
// Test detect on smaller image
cv::Mat img2;
cv::resize(img, img2, cv::Size(img.cols / 2, img.rows / 2));
detect(loadMat(img2), locations, 0);
hog->detect(loadMat(img2), locations);
#ifdef DUMP
dump(cv::Mat(block_hists), locations);
dump(locations);
#else
compare(cv::Mat(block_hists), locations);
compare(locations);
#endif
// Test detect on greater image
cv::resize(img, img2, cv::Size(img.cols * 2, img.rows * 2));
detect(loadMat(img2), locations, 0);
hog->detect(loadMat(img2), locations);
#ifdef DUMP
dump(cv::Mat(block_hists), locations);
dump(locations);
#else
compare(cv::Mat(block_hists), locations);
compare(locations);
#endif
}
// Does not compare border value, as interpolation leads to delta
void compare_inner_parts(cv::Mat d1, cv::Mat d2)
{
for (int i = 1; i < blocks_per_win_y - 1; ++i)
for (int j = 1; j < blocks_per_win_x - 1; ++j)
for (int k = 0; k < block_hist_size; ++k)
{
float a = d1.at<float>(0, (i * blocks_per_win_x + j) * block_hist_size);
float b = d2.at<float>(0, (i * blocks_per_win_x + j) * block_hist_size);
ASSERT_FLOAT_EQ(a, b);
}
}
};
// disabled while resize is not fixed
@ -182,13 +157,8 @@ CUDA_TEST_P(HOG, DISABLED_Detect)
cv::Mat img_rgb = readImage("hog/road.png");
ASSERT_FALSE(img_rgb.empty());
#ifdef DUMP
f.open((std::string(cvtest::TS::ptr()->get_data_path()) + "hog/expected_output.bin").c_str(), std::ios_base::binary);
ASSERT_TRUE(f.is_open());
#else
f.open((std::string(cvtest::TS::ptr()->get_data_path()) + "hog/expected_output.bin").c_str(), std::ios_base::binary);
ASSERT_TRUE(f.is_open());
#endif
// Test on color image
cv::Mat img;
@ -198,8 +168,6 @@ CUDA_TEST_P(HOG, DISABLED_Detect)
// Test on gray image
cv::cvtColor(img_rgb, img, cv::COLOR_BGR2GRAY);
testDetect(img);
f.close();
}
CUDA_TEST_P(HOG, GetDescriptors)
@ -216,8 +184,14 @@ CUDA_TEST_P(HOG, GetDescriptors)
// Convert train images into feature vectors (train table)
cv::cuda::GpuMat descriptors, descriptors_by_cols;
getDescriptors(d_img, win_size, descriptors, DESCR_FORMAT_ROW_BY_ROW);
getDescriptors(d_img, win_size, descriptors_by_cols, DESCR_FORMAT_COL_BY_COL);
hog->setWinStride(Size(64, 128));
hog->setDescriptorFormat(cv::cuda::HOG::DESCR_FORMAT_ROW_BY_ROW);
hog->compute(d_img, descriptors);
hog->setDescriptorFormat(cv::cuda::HOG::DESCR_FORMAT_COL_BY_COL);
hog->compute(d_img, descriptors_by_cols);
// Check size of the result train table
wins_per_img_x = 3;
@ -242,48 +216,6 @@ CUDA_TEST_P(HOG, GetDescriptors)
ASSERT_EQ(l[(y * blocks_per_win_x + x) * block_hist_size + k],
r[(x * blocks_per_win_y + y) * block_hist_size + k]);
}
/* Now we want to extract the same feature vectors, but from single images. NOTE: results will
be different, due to border value interpolation. Using many small images is slower, however we
won't call getDescriptors and will use computeBlockHistograms instead. computeBlockHistograms
works well, as can be checked in the gpu_hog sample */
img_rgb = readImage("hog/positive1.png");
ASSERT_TRUE(!img_rgb.empty());
cv::cvtColor(img_rgb, img, cv::COLOR_BGR2BGRA);
computeBlockHistograms(cv::cuda::GpuMat(img));
// Everything is fine with interpolation for left top subimage
ASSERT_EQ(0.0, cv::norm((cv::Mat)block_hists, (cv::Mat)descriptors.rowRange(0, 1)));
img_rgb = readImage("hog/positive2.png");
ASSERT_TRUE(!img_rgb.empty());
cv::cvtColor(img_rgb, img, cv::COLOR_BGR2BGRA);
computeBlockHistograms(cv::cuda::GpuMat(img));
compare_inner_parts(cv::Mat(block_hists), cv::Mat(descriptors.rowRange(1, 2)));
img_rgb = readImage("hog/negative1.png");
ASSERT_TRUE(!img_rgb.empty());
cv::cvtColor(img_rgb, img, cv::COLOR_BGR2BGRA);
computeBlockHistograms(cv::cuda::GpuMat(img));
compare_inner_parts(cv::Mat(block_hists), cv::Mat(descriptors.rowRange(2, 3)));
img_rgb = readImage("hog/negative2.png");
ASSERT_TRUE(!img_rgb.empty());
cv::cvtColor(img_rgb, img, cv::COLOR_BGR2BGRA);
computeBlockHistograms(cv::cuda::GpuMat(img));
compare_inner_parts(cv::Mat(block_hists), cv::Mat(descriptors.rowRange(3, 4)));
img_rgb = readImage("hog/positive3.png");
ASSERT_TRUE(!img_rgb.empty());
cv::cvtColor(img_rgb, img, cv::COLOR_BGR2BGRA);
computeBlockHistograms(cv::cuda::GpuMat(img));
compare_inner_parts(cv::Mat(block_hists), cv::Mat(descriptors.rowRange(4, 5)));
img_rgb = readImage("hog/negative3.png");
ASSERT_TRUE(!img_rgb.empty());
cv::cvtColor(img_rgb, img, cv::COLOR_BGR2BGRA);
computeBlockHistograms(cv::cuda::GpuMat(img));
compare_inner_parts(cv::Mat(block_hists), cv::Mat(descriptors.rowRange(5, 6)));
}
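For reference, a minimal sketch of the descriptor-computation path exercised above (d_img is assumed to be a device image of the detection-window size):

cv::Ptr<cv::cuda::HOG> hog = cv::cuda::HOG::create();
hog->setWinStride(cv::Size(64, 128));                              // one window per image, as in the test
hog->setDescriptorFormat(cv::cuda::HOG::DESCR_FORMAT_ROW_BY_ROW);
cv::cuda::GpuMat descriptors;
hog->compute(d_img, descriptors);                                  // one row per window position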
INSTANTIATE_TEST_CASE_P(CUDA_ObjDetect, HOG, ALL_DEVICES);
@ -310,12 +242,12 @@ CUDA_TEST_P(CalTech, HOG)
cv::cuda::GpuMat d_img(img);
cv::Mat markedImage(img.clone());
cv::cuda::HOGDescriptor d_hog;
d_hog.setSVMDetector(cv::cuda::HOGDescriptor::getDefaultPeopleDetector());
d_hog.nlevels = d_hog.nlevels + 32;
cv::Ptr<cv::cuda::HOG> d_hog = cv::cuda::HOG::create();
d_hog->setSVMDetector(d_hog->getDefaultPeopleDetector());
d_hog->setNumLevels(d_hog->getNumLevels() + 32);
std::vector<cv::Rect> found_locations;
d_hog.detectMultiScale(d_img, found_locations);
d_hog->detectMultiScale(d_img, found_locations);
#if defined (LOG_CASCADE_STATISTIC)
for (int i = 0; i < (int)found_locations.size(); i++)
@ -326,7 +258,8 @@ CUDA_TEST_P(CalTech, HOG)
cv::rectangle(markedImage, r , CV_RGB(255, 0, 0));
}
cv::imshow("Res", markedImage); cv::waitKey();
cv::imshow("Res", markedImage);
cv::waitKey();
#endif
}
@ -354,9 +287,15 @@ PARAM_TEST_CASE(LBP_Read_classifier, cv::cuda::DeviceInfo, int)
CUDA_TEST_P(LBP_Read_classifier, Accuracy)
{
cv::cuda::CascadeClassifier_CUDA classifier;
std::string classifierXmlPath = std::string(cvtest::TS::ptr()->get_data_path()) + "lbpcascade/lbpcascade_frontalface.xml";
ASSERT_TRUE(classifier.load(classifierXmlPath));
cv::Ptr<cv::cuda::CascadeClassifier> d_cascade;
ASSERT_NO_THROW(
d_cascade = cv::cuda::CascadeClassifier::create(classifierXmlPath);
);
ASSERT_FALSE(d_cascade.empty());
}
INSTANTIATE_TEST_CASE_P(CUDA_ObjDetect, LBP_Read_classifier,
@ -396,29 +335,28 @@ CUDA_TEST_P(LBP_classify, Accuracy)
for (; it != rects.end(); ++it)
cv::rectangle(markedImage, *it, cv::Scalar(255, 0, 0));
cv::cuda::CascadeClassifier_CUDA gpuClassifier;
ASSERT_TRUE(gpuClassifier.load(classifierXmlPath));
cv::Ptr<cv::cuda::CascadeClassifier> gpuClassifier =
cv::cuda::CascadeClassifier::create(classifierXmlPath);
cv::cuda::GpuMat gpu_rects;
cv::cuda::GpuMat tested(grey);
int count = gpuClassifier.detectMultiScale(tested, gpu_rects);
cv::cuda::GpuMat gpu_rects_buf;
gpuClassifier->detectMultiScale(tested, gpu_rects_buf);
std::vector<cv::Rect> gpu_rects;
gpuClassifier->convert(gpu_rects_buf, gpu_rects);
#if defined (LOG_CASCADE_STATISTIC)
cv::Mat downloaded(gpu_rects);
const cv::Rect* faces = downloaded.ptr<cv::Rect>();
for (int i = 0; i < count; i++)
for (size_t i = 0; i < gpu_rects.size(); i++)
{
cv::Rect r = faces[i];
cv::Rect r = gpu_rects[i];
std::cout << r.x << " " << r.y << " " << r.width << " " << r.height << std::endl;
cv::rectangle(markedImage, r , CV_RGB(255, 0, 0));
}
#endif
#if defined (LOG_CASCADE_STATISTIC)
cv::imshow("Res", markedImage); cv::waitKey();
cv::imshow("Res", markedImage);
cv::waitKey();
#endif
(void)count;
}
INSTANTIATE_TEST_CASE_P(CUDA_ObjDetect, LBP_classify,

View File

@ -0,0 +1,64 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifdef __GNUC__
# pragma GCC diagnostic ignored "-Wmissing-declarations"
# if defined __clang__ || defined __APPLE__
# pragma GCC diagnostic ignored "-Wmissing-prototypes"
# pragma GCC diagnostic ignored "-Wextra"
# endif
#endif
#ifndef __OPENCV_TEST_PRECOMP_HPP__
#define __OPENCV_TEST_PRECOMP_HPP__
#include <fstream>
#include "opencv2/ts.hpp"
#include "opencv2/ts/cuda_test.hpp"
#include "opencv2/cudaobjdetect.hpp"
#include "opencv2/objdetect.hpp"
#include "cvconfig.h"
#endif


@ -6,4 +6,4 @@ set(the_description "CUDA-accelerated Image Warping")
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wmissing-declarations -Wshadow)
ocv_define_module(cudawarping opencv_imgproc OPTIONAL opencv_cudalegacy)
ocv_define_module(cudawarping opencv_core opencv_imgproc OPTIONAL opencv_cudev)


@ -171,21 +171,6 @@ CV_EXPORTS void warpPerspective(InputArray src, OutputArray dst, InputArray M, S
*/
CV_EXPORTS void buildWarpPerspectiveMaps(InputArray M, bool inverse, Size dsize, OutputArray xmap, OutputArray ymap, Stream& stream = Stream::Null());
/** @brief Builds plane warping maps.
*/
CV_EXPORTS void buildWarpPlaneMaps(Size src_size, Rect dst_roi, InputArray K, InputArray R, InputArray T, float scale,
OutputArray map_x, OutputArray map_y, Stream& stream = Stream::Null());
/** @brief Builds cylindrical warping maps.
*/
CV_EXPORTS void buildWarpCylindricalMaps(Size src_size, Rect dst_roi, InputArray K, InputArray R, float scale,
OutputArray map_x, OutputArray map_y, Stream& stream = Stream::Null());
/** @brief Builds spherical warping maps.
*/
CV_EXPORTS void buildWarpSphericalMaps(Size src_size, Rect dst_roi, InputArray K, InputArray R, float scale,
OutputArray map_x, OutputArray map_y, Stream& stream = Stream::Null());
/** @brief Rotates an image around the origin (0,0) and then shifts it.
@param src Source image. Supports 1, 3 or 4 channels images with CV_8U , CV_16U or CV_32F
@ -224,14 +209,6 @@ src .
*/
CV_EXPORTS void pyrUp(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
class CV_EXPORTS ImagePyramid : public Algorithm
{
public:
virtual void getLayer(OutputArray outImg, Size outRoi, Stream& stream = Stream::Null()) const = 0;
};
CV_EXPORTS Ptr<ImagePyramid> createImagePyramid(InputArray img, int nLayers = -1, Stream& stream = Stream::Null());
//! @}
}} // namespace cv { namespace cuda {


@ -325,88 +325,6 @@ PERF_TEST_P(Sz_Depth_Cn_Inter_Border, WarpPerspective,
}
}
//////////////////////////////////////////////////////////////////////
// BuildWarpPlaneMaps
PERF_TEST_P(Sz, BuildWarpPlaneMaps,
CUDA_TYPICAL_MAT_SIZES)
{
const cv::Size size = GetParam();
const cv::Mat K = cv::Mat::eye(3, 3, CV_32FC1);
const cv::Mat R = cv::Mat::ones(3, 3, CV_32FC1);
const cv::Mat T = cv::Mat::zeros(1, 3, CV_32F);
if (PERF_RUN_CUDA())
{
cv::cuda::GpuMat map_x;
cv::cuda::GpuMat map_y;
TEST_CYCLE() cv::cuda::buildWarpPlaneMaps(size, cv::Rect(0, 0, size.width, size.height), K, R, T, 1.0, map_x, map_y);
CUDA_SANITY_CHECK(map_x);
CUDA_SANITY_CHECK(map_y);
}
else
{
FAIL_NO_CPU();
}
}
//////////////////////////////////////////////////////////////////////
// BuildWarpCylindricalMaps
PERF_TEST_P(Sz, BuildWarpCylindricalMaps,
CUDA_TYPICAL_MAT_SIZES)
{
const cv::Size size = GetParam();
const cv::Mat K = cv::Mat::eye(3, 3, CV_32FC1);
const cv::Mat R = cv::Mat::ones(3, 3, CV_32FC1);
if (PERF_RUN_CUDA())
{
cv::cuda::GpuMat map_x;
cv::cuda::GpuMat map_y;
TEST_CYCLE() cv::cuda::buildWarpCylindricalMaps(size, cv::Rect(0, 0, size.width, size.height), K, R, 1.0, map_x, map_y);
CUDA_SANITY_CHECK(map_x);
CUDA_SANITY_CHECK(map_y);
}
else
{
FAIL_NO_CPU();
}
}
//////////////////////////////////////////////////////////////////////
// BuildWarpSphericalMaps
PERF_TEST_P(Sz, BuildWarpSphericalMaps,
CUDA_TYPICAL_MAT_SIZES)
{
const cv::Size size = GetParam();
const cv::Mat K = cv::Mat::eye(3, 3, CV_32FC1);
const cv::Mat R = cv::Mat::ones(3, 3, CV_32FC1);
if (PERF_RUN_CUDA())
{
cv::cuda::GpuMat map_x;
cv::cuda::GpuMat map_y;
TEST_CYCLE() cv::cuda::buildWarpSphericalMaps(size, cv::Rect(0, 0, size.width, size.height), K, R, 1.0, map_x, map_y);
CUDA_SANITY_CHECK(map_x);
CUDA_SANITY_CHECK(map_y);
}
else
{
FAIL_NO_CPU();
}
}
//////////////////////////////////////////////////////////////////////
// Rotate
@ -514,40 +432,3 @@ PERF_TEST_P(Sz_Depth_Cn, PyrUp,
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// ImagePyramidGetLayer
PERF_TEST_P(Sz_Depth_Cn, ImagePyramidGetLayer,
Combine(CUDA_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16U, CV_32F),
CUDA_CHANNELS_1_3_4))
{
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);
const int channels = GET_PARAM(2);
const int type = CV_MAKE_TYPE(depth, channels);
cv::Mat src(size, type);
declare.in(src, WARMUP_RNG);
const int nLayers = 3;
const cv::Size dstSize(size.width / 2 + 10, size.height / 2 + 10);
if (PERF_RUN_CUDA())
{
const cv::cuda::GpuMat d_src(src);
cv::cuda::GpuMat dst;
cv::Ptr<cv::cuda::ImagePyramid> d_pyr = cv::cuda::createImagePyramid(d_src, nLayers);
TEST_CYCLE() d_pyr->getLayer(dst, dstSize);
CUDA_SANITY_CHECK(dst);
}
else
{
FAIL_NO_CPU();
}
}


@ -47,11 +47,4 @@
#include "opencv2/core/private.cuda.hpp"
#include "opencv2/opencv_modules.hpp"
#ifdef HAVE_OPENCV_CUDALEGACY
# include "opencv2/cudalegacy.hpp"
# include "opencv2/cudalegacy/private.hpp"
#endif
#endif /* __OPENCV_PRECOMP_H__ */


@ -50,8 +50,6 @@ using namespace cv::cuda;
void cv::cuda::pyrDown(InputArray, OutputArray, Stream&) { throw_no_cuda(); }
void cv::cuda::pyrUp(InputArray, OutputArray, Stream&) { throw_no_cuda(); }
Ptr<ImagePyramid> cv::cuda::createImagePyramid(InputArray, int, Stream&) { throw_no_cuda(); return Ptr<ImagePyramid>(); }
#else // HAVE_CUDA
//////////////////////////////////////////////////////////////////////////////
@ -133,112 +131,4 @@ void cv::cuda::pyrUp(InputArray _src, OutputArray _dst, Stream& stream)
func(src, dst, StreamAccessor::getStream(stream));
}
//////////////////////////////////////////////////////////////////////////////
// ImagePyramid
#ifdef HAVE_OPENCV_CUDALEGACY
namespace
{
class ImagePyramidImpl : public ImagePyramid
{
public:
ImagePyramidImpl(InputArray img, int nLayers, Stream& stream);
void getLayer(OutputArray outImg, Size outRoi, Stream& stream = Stream::Null()) const;
private:
GpuMat layer0_;
std::vector<GpuMat> pyramid_;
int nLayers_;
};
ImagePyramidImpl::ImagePyramidImpl(InputArray _img, int numLayers, Stream& stream)
{
GpuMat img = _img.getGpuMat();
CV_Assert( img.depth() <= CV_32F && img.channels() <= 4 );
img.copyTo(layer0_, stream);
Size szLastLayer = img.size();
nLayers_ = 1;
if (numLayers <= 0)
numLayers = 255; // it will cut-off when any of the dimensions goes 1
pyramid_.resize(numLayers);
for (int i = 0; i < numLayers - 1; ++i)
{
Size szCurLayer(szLastLayer.width / 2, szLastLayer.height / 2);
if (szCurLayer.width == 0 || szCurLayer.height == 0)
break;
ensureSizeIsEnough(szCurLayer, img.type(), pyramid_[i]);
nLayers_++;
const GpuMat& prevLayer = i == 0 ? layer0_ : pyramid_[i - 1];
cv::cuda::device::pyramid::downsampleX2(prevLayer, pyramid_[i], img.depth(), img.channels(), StreamAccessor::getStream(stream));
szLastLayer = szCurLayer;
}
}
void ImagePyramidImpl::getLayer(OutputArray _outImg, Size outRoi, Stream& stream) const
{
CV_Assert( outRoi.width <= layer0_.cols && outRoi.height <= layer0_.rows && outRoi.width > 0 && outRoi.height > 0 );
ensureSizeIsEnough(outRoi, layer0_.type(), _outImg);
GpuMat outImg = _outImg.getGpuMat();
if (outRoi.width == layer0_.cols && outRoi.height == layer0_.rows)
{
layer0_.copyTo(outImg, stream);
return;
}
float lastScale = 1.0f;
float curScale;
GpuMat lastLayer = layer0_;
GpuMat curLayer;
for (int i = 0; i < nLayers_ - 1; ++i)
{
curScale = lastScale * 0.5f;
curLayer = pyramid_[i];
if (outRoi.width == curLayer.cols && outRoi.height == curLayer.rows)
{
curLayer.copyTo(outImg, stream);
}
if (outRoi.width >= curLayer.cols && outRoi.height >= curLayer.rows)
break;
lastScale = curScale;
lastLayer = curLayer;
}
cv::cuda::device::pyramid::interpolateFrom1(lastLayer, outImg, outImg.depth(), outImg.channels(), StreamAccessor::getStream(stream));
}
}
#endif
Ptr<ImagePyramid> cv::cuda::createImagePyramid(InputArray img, int nLayers, Stream& stream)
{
#ifndef HAVE_OPENCV_CUDALEGACY
(void) img;
(void) nLayers;
(void) stream;
throw_no_cuda();
return Ptr<ImagePyramid>();
#else
return Ptr<ImagePyramid>(new ImagePyramidImpl(img, nLayers, stream));
#endif
}
#endif // HAVE_CUDA


@ -53,10 +53,6 @@ void cv::cuda::buildWarpAffineMaps(InputArray, bool, Size, OutputArray, OutputAr
void cv::cuda::warpPerspective(InputArray, OutputArray, InputArray, Size, int, int, Scalar, Stream&) { throw_no_cuda(); }
void cv::cuda::buildWarpPerspectiveMaps(InputArray, bool, Size, OutputArray, OutputArray, Stream&) { throw_no_cuda(); }
void cv::cuda::buildWarpPlaneMaps(Size, Rect, InputArray, InputArray, InputArray, float, OutputArray, OutputArray, Stream&) { throw_no_cuda(); }
void cv::cuda::buildWarpCylindricalMaps(Size, Rect, InputArray, InputArray, float, OutputArray, OutputArray, Stream&) { throw_no_cuda(); }
void cv::cuda::buildWarpSphericalMaps(Size, Rect, InputArray, InputArray, float, OutputArray, OutputArray, Stream&) { throw_no_cuda(); }
void cv::cuda::rotate(InputArray, OutputArray, Size, double, double, double, int, Stream&) { throw_no_cuda(); }
#else // HAVE_CUDA
@ -462,124 +458,6 @@ void cv::cuda::warpPerspective(InputArray _src, OutputArray _dst, InputArray _M,
}
}
//////////////////////////////////////////////////////////////////////////////
// buildWarpPlaneMaps
namespace cv { namespace cuda { namespace device
{
namespace imgproc
{
void buildWarpPlaneMaps(int tl_u, int tl_v, PtrStepSzf map_x, PtrStepSzf map_y,
const float k_rinv[9], const float r_kinv[9], const float t[3], float scale,
cudaStream_t stream);
}
}}}
void cv::cuda::buildWarpPlaneMaps(Size src_size, Rect dst_roi, InputArray _K, InputArray _R, InputArray _T,
float scale, OutputArray _map_x, OutputArray _map_y, Stream& stream)
{
(void) src_size;
Mat K = _K.getMat();
Mat R = _R.getMat();
Mat T = _T.getMat();
CV_Assert( K.size() == Size(3,3) && K.type() == CV_32FC1 );
CV_Assert( R.size() == Size(3,3) && R.type() == CV_32FC1 );
CV_Assert( (T.size() == Size(3,1) || T.size() == Size(1,3)) && T.type() == CV_32FC1 && T.isContinuous() );
Mat K_Rinv = K * R.t();
Mat R_Kinv = R * K.inv();
CV_Assert( K_Rinv.isContinuous() );
CV_Assert( R_Kinv.isContinuous() );
_map_x.create(dst_roi.size(), CV_32FC1);
_map_y.create(dst_roi.size(), CV_32FC1);
GpuMat map_x = _map_x.getGpuMat();
GpuMat map_y = _map_y.getGpuMat();
device::imgproc::buildWarpPlaneMaps(dst_roi.tl().x, dst_roi.tl().y, map_x, map_y, K_Rinv.ptr<float>(), R_Kinv.ptr<float>(),
T.ptr<float>(), scale, StreamAccessor::getStream(stream));
}
//////////////////////////////////////////////////////////////////////////////
// buildWarpCylyndricalMaps
namespace cv { namespace cuda { namespace device
{
namespace imgproc
{
void buildWarpCylindricalMaps(int tl_u, int tl_v, PtrStepSzf map_x, PtrStepSzf map_y,
const float k_rinv[9], const float r_kinv[9], float scale,
cudaStream_t stream);
}
}}}
void cv::cuda::buildWarpCylindricalMaps(Size src_size, Rect dst_roi, InputArray _K, InputArray _R, float scale,
OutputArray _map_x, OutputArray _map_y, Stream& stream)
{
(void) src_size;
Mat K = _K.getMat();
Mat R = _R.getMat();
CV_Assert( K.size() == Size(3,3) && K.type() == CV_32FC1 );
CV_Assert( R.size() == Size(3,3) && R.type() == CV_32FC1 );
Mat K_Rinv = K * R.t();
Mat R_Kinv = R * K.inv();
CV_Assert( K_Rinv.isContinuous() );
CV_Assert( R_Kinv.isContinuous() );
_map_x.create(dst_roi.size(), CV_32FC1);
_map_y.create(dst_roi.size(), CV_32FC1);
GpuMat map_x = _map_x.getGpuMat();
GpuMat map_y = _map_y.getGpuMat();
device::imgproc::buildWarpCylindricalMaps(dst_roi.tl().x, dst_roi.tl().y, map_x, map_y, K_Rinv.ptr<float>(), R_Kinv.ptr<float>(), scale, StreamAccessor::getStream(stream));
}
//////////////////////////////////////////////////////////////////////////////
// buildWarpSphericalMaps
namespace cv { namespace cuda { namespace device
{
namespace imgproc
{
void buildWarpSphericalMaps(int tl_u, int tl_v, PtrStepSzf map_x, PtrStepSzf map_y,
const float k_rinv[9], const float r_kinv[9], float scale,
cudaStream_t stream);
}
}}}
void cv::cuda::buildWarpSphericalMaps(Size src_size, Rect dst_roi, InputArray _K, InputArray _R, float scale,
OutputArray _map_x, OutputArray _map_y, Stream& stream)
{
(void) src_size;
Mat K = _K.getMat();
Mat R = _R.getMat();
CV_Assert( K.size() == Size(3,3) && K.type() == CV_32FC1 );
CV_Assert( R.size() == Size(3,3) && R.type() == CV_32FC1 );
Mat K_Rinv = K * R.t();
Mat R_Kinv = R * K.inv();
CV_Assert( K_Rinv.isContinuous() );
CV_Assert( R_Kinv.isContinuous() );
_map_x.create(dst_roi.size(), CV_32FC1);
_map_y.create(dst_roi.size(), CV_32FC1);
GpuMat map_x = _map_x.getGpuMat();
GpuMat map_y = _map_y.getGpuMat();
device::imgproc::buildWarpSphericalMaps(dst_roi.tl().x, dst_roi.tl().y, map_x, map_y, K_Rinv.ptr<float>(), R_Kinv.ptr<float>(), scale, StreamAccessor::getStream(stream));
}
////////////////////////////////////////////////////////////////////////
// rotate


@ -337,7 +337,7 @@ public:
double _min_margin=0.003, int _edge_blur_size=5 );
CV_WRAP virtual void detectRegions( InputArray image,
std::vector<std::vector<Point> >& msers,
CV_OUT std::vector<std::vector<Point> >& msers,
std::vector<Rect>& bboxes ) = 0;
CV_WRAP virtual void setDelta(int delta) = 0;
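
The CV_OUT annotation above only changes how wrappers are generated; from C++ the call stays as before. A minimal sketch with an illustrative input image:

    #include <opencv2/features2d.hpp>
    #include <opencv2/imgcodecs.hpp>
    #include <vector>

    int main()
    {
        cv::Mat gray = cv::imread("scene.png", cv::IMREAD_GRAYSCALE); // illustrative input
        cv::Ptr<cv::MSER> mser = cv::MSER::create();

        std::vector<std::vector<cv::Point> > regions; // point lists of the detected regions
        std::vector<cv::Rect> bboxes;                 // corresponding bounding boxes
        mser->detectRegions(gray, regions, bboxes);
        return 0;
    }
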


@ -818,7 +818,7 @@ void AKAZEFeatures::Compute_Main_Orientation(KeyPoint& kpt, const std::vector<TE
ang2 = (ang1 + (float)(CV_PI / 3.0) >(float)(2.0*CV_PI) ? ang1 - (float)(5.0*CV_PI / 3.0) : ang1 + (float)(CV_PI / 3.0));
sumX = sumY = 0.f;
for (size_t k = 0; k < ang_size; ++k) {
for (int k = 0; k < ang_size; ++k) {
// Get angle from the x-axis of the sample point
const float & ang = Ang[k];


@ -48,6 +48,11 @@
# pragma GCC diagnostic ignored "-Wmissing-declarations"
#endif
#if (_WIN32_IE < 0x0500)
#pragma message("WARNING: Win32 UI needs to be compiled with _WIN32_IE >= 0x0500 (_WIN32_IE_IE50)")
#define _WIN32_IE 0x0500
#endif
#include <commctrl.h>
#include <stdlib.h>
#include <string.h>


@ -90,6 +90,8 @@ enum { IMWRITE_PNG_STRATEGY_DEFAULT = 0,
/** @brief Loads an image from a file.
@anchor imread
@param filename Name of file to be loaded.
@param flags Flags specifying the color type of a loaded image:
- CV_LOAD_IMAGE_ANYDEPTH - If set, return 16-bit/32-bit image when the input has the


@ -38,10 +38,17 @@
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "grfmt_gdal.hpp"
#include "precomp.hpp"
// GDAL Macros
#include "cvconfig.h"
#ifdef HAVE_GDAL
// Our Header
#include "grfmt_gdal.hpp"
/// C++ Standard Libraries
#include <iostream>
#include <stdexcept>
@ -195,7 +202,10 @@ GdalDecoder::~GdalDecoder(){
/**
* Convert data range
*/
double range_cast( const GDALDataType& gdalType, const int& cvDepth, const double& value ){
double range_cast( const GDALDataType& gdalType,
const int& cvDepth,
const double& value )
{
// uint8 -> uint8
if( gdalType == GDT_Byte && cvDepth == CV_8U ){


@ -42,16 +42,15 @@
#ifndef __GRFMT_GDAL_HPP__
#define __GRFMT_GDAL_HPP__
/// OpenCV FMT Base Type
#include "grfmt_base.hpp"
/// Macro to make sure we specified GDAL in CMake
#ifdef HAVE_GDAL
/// C++ Libraries
#include <iostream>
/// OpenCV Libraries
#include "grfmt_base.hpp"
#include "precomp.hpp"
/// Geospatial Data Abstraction Library
#include <gdal/cpl_conv.h>
#include <gdal/gdal_priv.h>
@ -61,6 +60,13 @@
/// Start of CV Namespace
namespace cv {
/**
* Convert GDAL Pixel Range to OpenCV Pixel Range
*/
double range_cast( const GDALDataType& gdalType,
const int& cvDepth,
const double& value );
/**
* Convert GDAL Palette Interpretation to OpenCV Pixel Type
*/


@ -664,7 +664,7 @@ private:
vector<Mat> pages;
bool res = imreadmulti(folder + "multipage.tif", pages, flags);
ASSERT_TRUE(res == true);
ASSERT_TRUE(pages.size() == page_count);
ASSERT_EQ(static_cast<size_t>(page_count), pages.size());
for (int i = 0; i < page_count; i++)
{


@ -3332,9 +3332,11 @@ data type.
@param result Map of comparison results. It must be single-channel 32-bit floating-point. If image
is \f$W \times H\f$ and templ is \f$w \times h\f$ , then result is \f$(W-w+1) \times (H-h+1)\f$ .
@param method Parameter specifying the comparison method, see cv::TemplateMatchModes
@param mask Mask of the searched template. It must have the same data type and size as templ. It is
not set by default.
*/
CV_EXPORTS_W void matchTemplate( InputArray image, InputArray templ,
OutputArray result, int method );
OutputArray result, int method, InputArray mask = noArray() );
//! @}
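
A minimal usage sketch of the extended signature; the file names are illustrative, and as the implementation further below shows, only the SQDIFF and CCORR_NORMED methods currently honour the mask:

    #include <opencv2/imgproc.hpp>
    #include <opencv2/imgcodecs.hpp>

    int main()
    {
        cv::Mat image = cv::imread("scene.png");       // illustrative inputs
        cv::Mat templ = cv::imread("patch.png");
        cv::Mat mask  = cv::imread("patch_mask.png");  // same size and type as templ

        cv::Mat result;
        cv::matchTemplate(image, templ, result, cv::TM_CCORR_NORMED, mask);

        double maxVal;
        cv::Point maxLoc;
        cv::minMaxLoc(result, 0, &maxVal, 0, &maxLoc); // location of the best match
        return 0;
    }
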


@ -193,7 +193,9 @@ cvStartFindContours( void* _img, CvMemStorage* storage,
if( !((CV_IS_MASK_ARR( mat ) && mode < CV_RETR_FLOODFILL) ||
(CV_MAT_TYPE(mat->type) == CV_32SC1 && mode == CV_RETR_FLOODFILL)) )
CV_Error( CV_StsUnsupportedFormat, "[Start]FindContours support only 8uC1 and 32sC1 images" );
CV_Error( CV_StsUnsupportedFormat,
"[Start]FindContours supports only CV_8UC1 images when mode != CV_RETR_FLOODFILL "
"otherwise supports CV_32SC1 images only" );
CvSize size = cvSize( mat->width, mat->height );
int step = mat->step;
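
What the reworded message requires, seen from the C++ API; a sketch with an arbitrary threshold value:

    #include <opencv2/imgproc.hpp>
    #include <vector>

    void extractContours(const cv::Mat& gray)
    {
        // Ordinary retrieval modes need a binary CV_8UC1 input;
        // CV_32SC1 label images are accepted only with RETR_FLOODFILL.
        cv::Mat bin;
        cv::threshold(gray, bin, 128, 255, cv::THRESH_BINARY);

        std::vector<std::vector<cv::Point> > contours;
        cv::findContours(bin, contours, cv::RETR_EXTERNAL, cv::CHAIN_APPROX_SIMPLE);
    }
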


@ -2231,9 +2231,8 @@ struct SymmRowSmallVec_8u32s
int operator()(const uchar* src, uchar* _dst, int width, int cn) const
{
//Uncomment the two following lines when runtime support for neon is implemented.
// if( !checkHardwareSupport(CV_CPU_NEON) )
// return 0;
if( !checkHardwareSupport(CV_CPU_NEON) )
return 0;
int i = 0, _ksize = kernel.rows + kernel.cols - 1;
int* dst = (int*)_dst;
@ -2459,9 +2458,8 @@ struct SymmColumnVec_32s8u
int operator()(const uchar** _src, uchar* dst, int width) const
{
//Uncomment the two following lines when runtime support for neon is implemented.
// if( !checkHardwareSupport(CV_CPU_NEON) )
// return 0;
if( !checkHardwareSupport(CV_CPU_NEON) )
return 0;
int _ksize = kernel.rows + kernel.cols - 1;
int ksize2 = _ksize / 2;
@ -2612,9 +2610,8 @@ struct SymmColumnSmallVec_32s16s
int operator()(const uchar** _src, uchar* _dst, int width) const
{
//Uncomment the two following lines when runtime support for neon is implemented.
// if( !checkHardwareSupport(CV_CPU_NEON) )
// return 0;
if( !checkHardwareSupport(CV_CPU_NEON) )
return 0;
int ksize2 = (kernel.rows + kernel.cols - 1)/2;
const float* ky = kernel.ptr<float>() + ksize2;
@ -2788,15 +2785,13 @@ struct SymmColumnVec_32f16s
kernel = _kernel;
delta = (float)_delta;
CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
//Uncomment the following line when runtime support for neon is implemented.
// neon_supported = checkHardwareSupport(CV_CPU_NEON);
neon_supported = checkHardwareSupport(CV_CPU_NEON);
}
int operator()(const uchar** _src, uchar* _dst, int width) const
{
//Uncomment the two following lines when runtime support for neon is implemented.
// if( !neon_supported )
// return 0;
if( !neon_supported )
return 0;
int _ksize = kernel.rows + kernel.cols - 1;
int ksize2 = _ksize / 2;
@ -2943,9 +2938,8 @@ struct SymmRowSmallVec_32f
int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
{
//Uncomment the two following lines when runtime support for neon is implemented.
// if( !checkHardwareSupport(CV_CPU_NEON) )
// return 0;
if( !checkHardwareSupport(CV_CPU_NEON) )
return 0;
int i = 0, _ksize = kernel.rows + kernel.cols - 1;
float* dst = (float*)_dst;


@ -1497,7 +1497,9 @@ void cv::GaussianBlur( InputArray _src, OutputArray _dst, Size ksize,
}
#ifdef HAVE_TEGRA_OPTIMIZATION
if(sigma1 == 0 && sigma2 == 0 && tegra::gaussian(_src.getMat(), _dst.getMat(), ksize, borderType))
Mat src = _src.getMat();
Mat dst = _dst.getMat();
if(sigma1 == 0 && sigma2 == 0 && tegra::gaussian(src, dst, ksize, borderType))
return;
#endif


@ -814,12 +814,97 @@ void crossCorr( const Mat& img, const Mat& _templ, Mat& corr,
}
}
}
static void matchTemplateMask( InputArray _img, InputArray _templ, OutputArray _result, int method, InputArray _mask )
{
int type = _img.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
CV_Assert( CV_TM_SQDIFF <= method && method <= CV_TM_CCOEFF_NORMED );
CV_Assert( (depth == CV_8U || depth == CV_32F) && type == _templ.type() && _img.dims() <= 2 );
Mat img = _img.getMat(), templ = _templ.getMat(), mask = _mask.getMat();
int ttype = templ.type(), tdepth = CV_MAT_DEPTH(ttype), tcn = CV_MAT_CN(ttype);
int mtype = img.type(), mdepth = CV_MAT_DEPTH(type), mcn = CV_MAT_CN(mtype);
if (depth == CV_8U)
{
depth = CV_32F;
type = CV_MAKETYPE(CV_32F, cn);
img.convertTo(img, type, 1.0 / 255);
}
if (tdepth == CV_8U)
{
tdepth = CV_32F;
ttype = CV_MAKETYPE(CV_32F, tcn);
templ.convertTo(templ, ttype, 1.0 / 255);
}
if (mdepth == CV_8U)
{
mdepth = CV_32F;
mtype = CV_MAKETYPE(CV_32F, mcn);
compare(mask, Scalar::all(0), mask, CMP_NE);
mask.convertTo(mask, mtype, 1.0 / 255);
}
Size corrSize(img.cols - templ.cols + 1, img.rows - templ.rows + 1);
_result.create(corrSize, CV_32F);
Mat result = _result.getMat();
Mat img2 = img.mul(img);
Mat mask2 = mask.mul(mask);
Mat mask_templ = templ.mul(mask);
Scalar templMean, templSdv;
double templSum2 = 0;
meanStdDev( mask_templ, templMean, templSdv );
templSum2 = templSdv[0]*templSdv[0] + templSdv[1]*templSdv[1] + templSdv[2]*templSdv[2] + templSdv[3]*templSdv[3];
templSum2 += templMean[0]*templMean[0] + templMean[1]*templMean[1] + templMean[2]*templMean[2] + templMean[3]*templMean[3];
templSum2 *= ((double)templ.rows * templ.cols);
if (method == CV_TM_SQDIFF)
{
Mat mask2_templ = templ.mul(mask2);
Mat corr(corrSize, CV_32F);
crossCorr( img, mask2_templ, corr, corr.size(), corr.type(), Point(0,0), 0, 0 );
crossCorr( img2, mask, result, result.size(), result.type(), Point(0,0), 0, 0 );
result -= corr * 2;
result += templSum2;
}
else if (method == CV_TM_CCORR_NORMED)
{
if (templSum2 < DBL_EPSILON)
{
result = Scalar::all(1);
return;
}
Mat corr(corrSize, CV_32F);
crossCorr( img2, mask2, corr, corr.size(), corr.type(), Point(0,0), 0, 0 );
crossCorr( img, mask_templ, result, result.size(), result.type(), Point(0,0), 0, 0 );
sqrt(corr, corr);
result = result.mul(1/corr);
result /= std::sqrt(templSum2);
}
else
CV_Error(Error::StsNotImplemented, "");
}
}
////////////////////////////////////////////////////////////////////////////////////////////////////////
void cv::matchTemplate( InputArray _img, InputArray _templ, OutputArray _result, int method )
void cv::matchTemplate( InputArray _img, InputArray _templ, OutputArray _result, int method, InputArray _mask )
{
if (!_mask.empty())
{
cv::matchTemplateMask(_img, _templ, _result, method, _mask);
return;
}
int type = _img.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
CV_Assert( CV_TM_SQDIFF <= method && method <= CV_TM_CCOEFF_NORMED );
CV_Assert( (depth == CV_8U || depth == CV_32F) && type == _templ.type() && _img.dims() <= 2 );
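
For reference, the algebra implemented by matchTemplateMask above, assuming a mask M binarized to {0,1} (which is what the CV_8U path produces), so that \f$M = M^2\f$. With T the template and I the image patch at (x,y):

\f[ R_{\mathrm{SQDIFF}}(x,y) = \sum_{x',y'} \left[ M(x',y') \left( T(x',y') - I(x+x',y+y') \right) \right]^2 = \sum M^2 T^2 - 2 \sum M^2 T\, I + \sum M\, I^2 \f]

\f[ R_{\mathrm{CCORR\_NORMED}}(x,y) = \frac{\sum (M\,T)\, I}{\sqrt{\sum M^2 I^2}\, \sqrt{\sum M^2 T^2}} \f]

The three SQDIFF terms correspond to templSum2, crossCorr(img, mask2_templ) and crossCorr(img2, mask); the CCORR_NORMED factors correspond to crossCorr(img, mask_templ), crossCorr(img2, mask2) and templSum2.
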


@ -931,7 +931,7 @@ Ptr<CascadeClassifierImpl::MaskGenerator> CascadeClassifierImpl::getMaskGenerato
Ptr<BaseCascadeClassifier::MaskGenerator> createFaceDetectionMaskGenerator()
{
#ifdef HAVE_TEGRA_OPTIMIZATION
return tegra::getCascadeClassifierMaskGenerator(*this);
return tegra::getCascadeClassifierMaskGenerator();
#else
return Ptr<BaseCascadeClassifier::MaskGenerator>();
#endif
@ -1072,10 +1072,10 @@ bool CascadeClassifierImpl::ocl_detectMultiScaleNoGrouping( const std::vector<fl
{
String opts;
if (lbufSize.area())
opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D SUM_BUF_SIZE=%d -D SUM_BUF_STEP=%d -D NODE_COUNT=%d -D SPLIT_STAGE=%d -D N_STAGES=%d -D MAX_FACES=%d",
opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D SUM_BUF_SIZE=%d -D SUM_BUF_STEP=%d -D NODE_COUNT=%d -D SPLIT_STAGE=%d -D N_STAGES=%d -D MAX_FACES=%d -D HAAR",
localsz.width, localsz.height, lbufSize.area(), lbufSize.width, data.maxNodesPerTree, splitstage_ocl, nstages, MAX_FACES);
else
opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D NODE_COUNT=%d -D SPLIT_STAGE=%d -D N_STAGES=%d -D MAX_FACES=%d",
opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D NODE_COUNT=%d -D SPLIT_STAGE=%d -D N_STAGES=%d -D MAX_FACES=%d -D HAAR",
localsz.width, localsz.height, data.maxNodesPerTree, splitstage_ocl, nstages, MAX_FACES);
haarKernel.create("runHaarClassifier", ocl::objdetect::cascadedetect_oclsrc, opts);
if( haarKernel.empty() )
@ -1112,10 +1112,10 @@ bool CascadeClassifierImpl::ocl_detectMultiScaleNoGrouping( const std::vector<fl
{
String opts;
if (lbufSize.area())
opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D SUM_BUF_SIZE=%d -D SUM_BUF_STEP=%d -D SPLIT_STAGE=%d -D N_STAGES=%d -D MAX_FACES=%d",
opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D SUM_BUF_SIZE=%d -D SUM_BUF_STEP=%d -D SPLIT_STAGE=%d -D N_STAGES=%d -D MAX_FACES=%d -D LBP",
localsz.width, localsz.height, lbufSize.area(), lbufSize.width, splitstage_ocl, nstages, MAX_FACES);
else
opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D SPLIT_STAGE=%d -D N_STAGES=%d -D MAX_FACES=%d",
opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D SPLIT_STAGE=%d -D N_STAGES=%d -D MAX_FACES=%d -D LBP",
localsz.width, localsz.height, splitstage_ocl, nstages, MAX_FACES);
lbpKernel.create("runLBPClassifierStumpSimple", ocl::objdetect::cascadedetect_oclsrc, opts);
if( lbpKernel.empty() )


@ -1,5 +1,7 @@
#pragma once
#include "opencv2/core/ocl.hpp"
namespace cv
{


@ -12,19 +12,22 @@
// Erping Pang, erping@multicorewareinc.com
//
#ifdef HAAR
typedef struct __attribute__((aligned(4))) OptHaarFeature
{
int4 ofs[3] __attribute__((aligned (4)));
float4 weight __attribute__((aligned (4)));
}
OptHaarFeature;
#endif
#ifdef LBP
typedef struct __attribute__((aligned(4))) OptLBPFeature
{
int16 ofs __attribute__((aligned (4)));
}
OptLBPFeature;
#endif
typedef struct __attribute__((aligned(4))) Stump
{
@ -64,6 +67,7 @@ ScaleData;
#define NODE_COUNT 1
#endif
#ifdef HAAR
__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X,LOCAL_SIZE_Y,1)))
void runHaarClassifier(
int nscales, __global const ScaleData* scaleData,
@ -352,7 +356,9 @@ void runHaarClassifier(
}
}
}
#endif
#ifdef LBP
#undef CALC_SUM_OFS_
#define CALC_SUM_OFS_(p0, p1, p2, p3, ptr) \
((ptr)[p0] - (ptr)[p1] - (ptr)[p2] + (ptr)[p3])
@ -651,3 +657,4 @@ void runLBPClassifierStump(
}
}
}
#endif


@ -59,69 +59,71 @@ namespace cv { namespace cuda {
@param block_size Size of block used for computing weights.
@param borderMode Border type. See borderInterpolate for details. BORDER_REFLECT101 ,
BORDER_REPLICATE , BORDER_CONSTANT , BORDER_REFLECT and BORDER_WRAP are supported for now.
@param s Stream for the asynchronous version.
@param stream Stream for the asynchronous version.
@sa
fastNlMeansDenoising
*/
CV_EXPORTS void nonLocalMeans(const GpuMat& src, GpuMat& dst, float h, int search_window = 21, int block_size = 7, int borderMode = BORDER_DEFAULT, Stream& s = Stream::Null());
CV_EXPORTS void nonLocalMeans(InputArray src, OutputArray dst,
float h,
int search_window = 21,
int block_size = 7,
int borderMode = BORDER_DEFAULT,
Stream& stream = Stream::Null());
/** @brief The class implements fast approximate Non Local Means Denoising algorithm.
/** @brief Perform image denoising using Non-local Means Denoising algorithm
<http://www.ipol.im/pub/algo/bcm_non_local_means_denoising> with several computational
optimizations. Noise is expected to be Gaussian white noise.
@param src Input 8-bit 1-channel, 2-channel or 3-channel image.
@param dst Output image with the same size and type as src .
@param h Parameter regulating filter strength. Big h value perfectly removes noise but also
removes image details, smaller h value preserves details but also preserves some noise
@param search_window Size in pixels of the window that is used to compute weighted average for
given pixel. Should be odd. Affect performance linearly: greater search_window - greater
denoising time. Recommended value 21 pixels
@param block_size Size in pixels of the template patch that is used to compute weights. Should be
odd. Recommended value 7 pixels
@param stream Stream for the asynchronous invocations.
This function is expected to be applied to grayscale images. For colored images look at
fastNlMeansDenoisingColored.
@sa
fastNlMeansDenoising
*/
class CV_EXPORTS FastNonLocalMeansDenoising
{
public:
/** @brief Perform image denoising using Non-local Means Denoising algorithm
<http://www.ipol.im/pub/algo/bcm_non_local_means_denoising> with several computational
optimizations. Noise expected to be a gaussian white noise
CV_EXPORTS void fastNlMeansDenoising(InputArray src, OutputArray dst,
float h,
int search_window = 21,
int block_size = 7,
Stream& stream = Stream::Null());
@param src Input 8-bit 1-channel, 2-channel or 3-channel image.
@param dst Output image with the same size and type as src .
@param h Parameter regulating filter strength. Big h value perfectly removes noise but also
removes image details, smaller h value preserves details but also preserves some noise
@param search_window Size in pixels of the window that is used to compute weighted average for
given pixel. Should be odd. Affect performance linearly: greater search_window - greater
denoising time. Recommended value 21 pixels
@param block_size Size in pixels of the template patch that is used to compute weights. Should be
odd. Recommended value 7 pixels
@param s Stream for the asynchronous invocations.
/** @brief Modification of fastNlMeansDenoising function for colored images
This function expected to be applied to grayscale images. For colored images look at
FastNonLocalMeansDenoising::labMethod.
@param src Input 8-bit 3-channel image.
@param dst Output image with the same size and type as src .
@param h_luminance Parameter regulating filter strength. Big h value perfectly removes noise but
also removes image details, smaller h value preserves details but also preserves some noise
@param photo_render The same as h but for color components. For most images a value of 10 will be
enough to remove colored noise and not distort colors
@param search_window Size in pixels of the window that is used to compute weighted average for
given pixel. Should be odd. Affect performance linearly: greater search_window - greater
denoising time. Recommended value 21 pixels
@param block_size Size in pixels of the template patch that is used to compute weights. Should be
odd. Recommended value 7 pixels
@param stream Stream for the asynchronous invocations.
@sa
fastNlMeansDenoising
*/
void simpleMethod(const GpuMat& src, GpuMat& dst, float h, int search_window = 21, int block_size = 7, Stream& s = Stream::Null());
The function converts image to CIELAB colorspace and then separately denoise L and AB components
with given h parameters using FastNonLocalMeansDenoising::simpleMethod function.
/** @brief Modification of FastNonLocalMeansDenoising::simpleMethod for color images
@param src Input 8-bit 3-channel image.
@param dst Output image with the same size and type as src .
@param h_luminance Parameter regulating filter strength. Big h value perfectly removes noise but
also removes image details, smaller h value preserves details but also preserves some noise
@param photo_render float The same as h but for color components. For most images value equals 10 will be
enought to remove colored noise and do not distort colors
@param search_window Size in pixels of the window that is used to compute weighted average for
given pixel. Should be odd. Affect performance linearly: greater search_window - greater
denoising time. Recommended value 21 pixels
@param block_size Size in pixels of the template patch that is used to compute weights. Should be
odd. Recommended value 7 pixels
@param s Stream for the asynchronous invocations.
The function converts image to CIELAB colorspace and then separately denoise L and AB components
with given h parameters using FastNonLocalMeansDenoising::simpleMethod function.
@sa
fastNlMeansDenoisingColored
*/
void labMethod(const GpuMat& src, GpuMat& dst, float h_luminance, float photo_render, int search_window = 21, int block_size = 7, Stream& s = Stream::Null());
private:
GpuMat buffer, extended_src_buffer;
GpuMat lab, l, ab;
};
@sa
fastNlMeansDenoisingColored
*/
CV_EXPORTS void fastNlMeansDenoisingColored(InputArray src, OutputArray dst,
float h_luminance, float photo_render,
int search_window = 21,
int block_size = 7,
Stream& stream = Stream::Null());
//! @} photo
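
A minimal usage sketch of the new free functions declared above; the include path and image name are assumptions, and the h values follow the recommendations in the documentation:

    #include <opencv2/photo/cuda.hpp>   // assumed location of the declarations above
    #include <opencv2/imgcodecs.hpp>
    #include <opencv2/imgproc.hpp>

    int main()
    {
        cv::Mat bgr = cv::imread("lena.png");   // illustrative input
        cv::Mat gray;
        cv::cvtColor(bgr, gray, cv::COLOR_BGR2GRAY);

        cv::cuda::GpuMat d_gray(gray), d_bgr(bgr), d_gray_dn, d_bgr_dn;
        cv::cuda::fastNlMeansDenoising(d_gray, d_gray_dn, 20.0f);             // grayscale path
        cv::cuda::fastNlMeansDenoisingColored(d_bgr, d_bgr_dn, 20.0f, 10.0f); // CIELAB path
        return 0;
    }
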


@ -126,12 +126,10 @@ PERF_TEST_P(Sz_Depth_Cn_WinSz_BlockSz, CUDA_FastNonLocalMeans,
if (PERF_RUN_CUDA())
{
cv::cuda::FastNonLocalMeansDenoising fnlmd;
const cv::cuda::GpuMat d_src(src);
cv::cuda::GpuMat dst;
TEST_CYCLE() fnlmd.simpleMethod(d_src, dst, h, search_widow_size, block_size);
TEST_CYCLE() cv::cuda::fastNlMeansDenoising(d_src, dst, h, search_widow_size, block_size);
CUDA_SANITY_CHECK(dst);
}
@ -171,12 +169,10 @@ PERF_TEST_P(Sz_Depth_WinSz_BlockSz, CUDA_FastNonLocalMeansColored,
if (PERF_RUN_CUDA())
{
cv::cuda::FastNonLocalMeansDenoising fnlmd;
const cv::cuda::GpuMat d_src(src);
cv::cuda::GpuMat dst;
TEST_CYCLE() fnlmd.labMethod(d_src, dst, h, h, search_widow_size, block_size);
TEST_CYCLE() cv::cuda::fastNlMeansDenoisingColored(d_src, dst, h, h, search_widow_size, block_size);
CUDA_SANITY_CHECK(dst);
}


@ -60,9 +60,9 @@ using namespace cv::cuda;
#if !defined (HAVE_CUDA) || !defined(HAVE_OPENCV_CUDAARITHM) || !defined(HAVE_OPENCV_CUDAIMGPROC)
void cv::cuda::nonLocalMeans(const GpuMat&, GpuMat&, float, int, int, int, Stream&) { throw_no_cuda(); }
void cv::cuda::FastNonLocalMeansDenoising::simpleMethod(const GpuMat&, GpuMat&, float, int, int, Stream&) { throw_no_cuda(); }
void cv::cuda::FastNonLocalMeansDenoising::labMethod( const GpuMat&, GpuMat&, float, float, int, int, Stream&) { throw_no_cuda(); }
void cv::cuda::nonLocalMeans(InputArray, OutputArray, float, int, int, int, Stream&) { throw_no_cuda(); }
void cv::cuda::fastNlMeansDenoising(InputArray, OutputArray, float, int, int, Stream&) { throw_no_cuda(); }
void cv::cuda::fastNlMeansDenoisingColored(InputArray, OutputArray, float, float, int, int, Stream&) { throw_no_cuda(); }
#else
@ -78,13 +78,15 @@ namespace cv { namespace cuda { namespace device
}
}}}
void cv::cuda::nonLocalMeans(const GpuMat& src, GpuMat& dst, float h, int search_window, int block_window, int borderMode, Stream& s)
void cv::cuda::nonLocalMeans(InputArray _src, OutputArray _dst, float h, int search_window, int block_window, int borderMode, Stream& stream)
{
using cv::cuda::device::imgproc::nlm_bruteforce_gpu;
typedef void (*func_t)(const PtrStepSzb& src, PtrStepSzb dst, int search_radius, int block_radius, float h, int borderMode, cudaStream_t stream);
static const func_t funcs[4] = { nlm_bruteforce_gpu<uchar>, nlm_bruteforce_gpu<uchar2>, nlm_bruteforce_gpu<uchar3>, 0/*nlm_bruteforce_gpu<uchar4>,*/ };
const GpuMat src = _src.getGpuMat();
CV_Assert(src.type() == CV_8U || src.type() == CV_8UC2 || src.type() == CV_8UC3);
const func_t func = funcs[src.channels() - 1];
@ -93,8 +95,10 @@ void cv::cuda::nonLocalMeans(const GpuMat& src, GpuMat& dst, float h, int search
int b = borderMode;
CV_Assert(b == BORDER_REFLECT101 || b == BORDER_REPLICATE || b == BORDER_CONSTANT || b == BORDER_REFLECT || b == BORDER_WRAP);
dst.create(src.size(), src.type());
func(src, dst, search_window/2, block_window/2, h, borderMode, StreamAccessor::getStream(s));
_dst.create(src.size(), src.type());
GpuMat dst = _dst.getGpuMat();
func(src, dst, search_window/2, block_window/2, h, borderMode, StreamAccessor::getStream(stream));
}
namespace cv { namespace cuda { namespace device
@ -112,47 +116,55 @@ namespace cv { namespace cuda { namespace device
}
}}}
void cv::cuda::FastNonLocalMeansDenoising::simpleMethod(const GpuMat& src, GpuMat& dst, float h, int search_window, int block_window, Stream& s)
void cv::cuda::fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h, int search_window, int block_window, Stream& stream)
{
const GpuMat src = _src.getGpuMat();
CV_Assert(src.depth() == CV_8U && src.channels() < 4);
int border_size = search_window/2 + block_window/2;
Size esize = src.size() + Size(border_size, border_size) * 2;
cv::cuda::ensureSizeIsEnough(esize, CV_8UC3, extended_src_buffer);
GpuMat extended_src(esize, src.type(), extended_src_buffer.ptr(), extended_src_buffer.step);
BufferPool pool(stream);
cv::cuda::copyMakeBorder(src, extended_src, border_size, border_size, border_size, border_size, cv::BORDER_DEFAULT, Scalar(), s);
GpuMat extended_src = pool.getBuffer(esize, src.type());
cv::cuda::copyMakeBorder(src, extended_src, border_size, border_size, border_size, border_size, cv::BORDER_DEFAULT, Scalar(), stream);
GpuMat src_hdr = extended_src(Rect(Point2i(border_size, border_size), src.size()));
int bcols, brows;
device::imgproc::nln_fast_get_buffer_size(src_hdr, search_window, block_window, bcols, brows);
buffer.create(brows, bcols, CV_32S);
GpuMat buffer = pool.getBuffer(brows, bcols, CV_32S);
using namespace cv::cuda::device::imgproc;
typedef void (*nlm_fast_t)(const PtrStepSzb&, PtrStepSzb, PtrStepi, int, int, float, cudaStream_t);
static const nlm_fast_t funcs[] = { nlm_fast_gpu<uchar>, nlm_fast_gpu<uchar2>, nlm_fast_gpu<uchar3>, 0};
dst.create(src.size(), src.type());
funcs[src.channels()-1](src_hdr, dst, buffer, search_window, block_window, h, StreamAccessor::getStream(s));
_dst.create(src.size(), src.type());
GpuMat dst = _dst.getGpuMat();
funcs[src.channels()-1](src_hdr, dst, buffer, search_window, block_window, h, StreamAccessor::getStream(stream));
}
void cv::cuda::FastNonLocalMeansDenoising::labMethod( const GpuMat& src, GpuMat& dst, float h_luminance, float h_color, int search_window, int block_window, Stream& s)
void cv::cuda::fastNlMeansDenoisingColored(InputArray _src, OutputArray _dst, float h_luminance, float h_color, int search_window, int block_window, Stream& stream)
{
const GpuMat src = _src.getGpuMat();
CV_Assert(src.type() == CV_8UC3);
lab.create(src.size(), src.type());
cv::cuda::cvtColor(src, lab, cv::COLOR_BGR2Lab, 0, s);
BufferPool pool(stream);
l.create(src.size(), CV_8U);
ab.create(src.size(), CV_8UC2);
device::imgproc::fnlm_split_channels(lab, l, ab, StreamAccessor::getStream(s));
GpuMat lab = pool.getBuffer(src.size(), src.type());
cv::cuda::cvtColor(src, lab, cv::COLOR_BGR2Lab, 0, stream);
simpleMethod(l, l, h_luminance, search_window, block_window, s);
simpleMethod(ab, ab, h_color, search_window, block_window, s);
GpuMat l = pool.getBuffer(src.size(), CV_8U);
GpuMat ab = pool.getBuffer(src.size(), CV_8UC2);
device::imgproc::fnlm_split_channels(lab, l, ab, StreamAccessor::getStream(stream));
device::imgproc::fnlm_merge_channels(l, ab, lab, StreamAccessor::getStream(s));
cv::cuda::cvtColor(lab, dst, cv::COLOR_Lab2BGR, 0, s);
fastNlMeansDenoising(l, l, h_luminance, search_window, block_window, stream);
fastNlMeansDenoising(ab, ab, h_color, search_window, block_window, stream);
device::imgproc::fnlm_merge_channels(l, ab, lab, StreamAccessor::getStream(stream));
cv::cuda::cvtColor(lab, _dst, cv::COLOR_Lab2BGR, 0, stream);
}
#endif
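
The rewrite above replaces the class-level scratch GpuMats with per-stream BufferPool allocations. A stand-alone sketch of that pattern; the sizes are illustrative, and enabling the pool before the first stream is created is what makes the buffers actually reusable:

    #include <opencv2/core/cuda.hpp>

    void bufferPoolExample()
    {
        cv::cuda::setBufferPoolUsage(true);          // must be called before any Stream is created
        cv::cuda::Stream stream;

        cv::cuda::BufferPool pool(stream);           // pool tied to this stream
        cv::cuda::GpuMat tmp1 = pool.getBuffer(480, 640, CV_8UC3);          // rows, cols, type
        cv::cuda::GpuMat tmp2 = pool.getBuffer(cv::Size(640, 480), CV_32S); // or Size, type

        // ... launch asynchronous work on 'stream' using tmp1/tmp2 ...
        stream.waitForCompletion();                  // buffers go back to the pool when released
    }
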


@ -99,10 +99,9 @@ TEST(CUDA_FastNonLocalMeans, Regression)
cv::cvtColor(bgr, gray, cv::COLOR_BGR2GRAY);
GpuMat dbgr, dgray;
cv::cuda::FastNonLocalMeansDenoising fnlmd;
fnlmd.simpleMethod(GpuMat(gray), dgray, 20);
fnlmd.labMethod(GpuMat(bgr), dbgr, 20, 10);
cv::cuda::fastNlMeansDenoising(GpuMat(gray), dgray, 20);
cv::cuda::fastNlMeansDenoisingColored(GpuMat(bgr), dbgr, 20, 10);
#if 0
dumpImage("../gpu/denoising/fnlm_denoised_lena_bgr.png", cv::Mat(dbgr));


@ -861,7 +861,7 @@ class PythonWrapperGenerator(object):
decls = self.parser.parse(hdr)
if len(decls) == 0:
continue
self.code_include.write( '#include "{}"\n'.format(hdr[hdr.rindex('opencv2/'):]) )
self.code_include.write( '#include "{0}"\n'.format(hdr[hdr.rindex('opencv2/'):]) )
for decl in decls:
name = decl[0]
if name.startswith("struct") or name.startswith("class"):


@ -1,3 +1,8 @@
set(the_description "Images stitching")
if(HAVE_CUDA)
ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef -Wmissing-declarations -Wshadow)
endif()
ocv_define_module(stitching opencv_imgproc opencv_features2d opencv_calib3d opencv_objdetect
OPTIONAL opencv_cuda opencv_cudaarithm opencv_cudafilters opencv_cudafeatures2d opencv_xfeatures2d)


@ -398,7 +398,6 @@ public:
};
#ifdef HAVE_OPENCV_CUDAWARPING
class CV_EXPORTS PlaneWarperGpu : public PlaneWarper
{
public:
@ -515,7 +514,6 @@ public:
private:
cuda::GpuMat d_xmap_, d_ymap_, d_src_, d_dst_;
};
#endif
struct SphericalPortraitProjector : ProjectorBase


@ -476,7 +476,11 @@ static bool ocl_normalizeUsingWeightMap(InputArray _weight, InputOutputArray _ma
void normalizeUsingWeightMap(InputArray _weight, InputOutputArray _src)
{
Mat src;
Mat weight;
#ifdef HAVE_TEGRA_OPTIMIZATION
src = _src.getMat();
weight = _weight.getMat();
if(tegra::normalizeUsingWeightMap(weight, src))
return;
#endif
@ -486,12 +490,12 @@ void normalizeUsingWeightMap(InputArray _weight, InputOutputArray _src)
!ocl_normalizeUsingWeightMap(_weight, _src) )
#endif
{
Mat weight = _weight.getMat();
Mat src = _src.getMat();
src = _src.getMat();
weight = _weight.getMat();
CV_Assert(src.type() == CV_16SC3);
if(weight.type() == CV_32FC1)
if (weight.type() == CV_32FC1)
{
for (int y = 0; y < src.rows; ++y)
{
@ -547,7 +551,8 @@ void createWeightMap(InputArray mask, float sharpness, InputOutputArray weight)
void createLaplacePyr(InputArray img, int num_levels, std::vector<UMat> &pyr)
{
#ifdef HAVE_TEGRA_OPTIMIZATION
if(tegra::createLaplacePyr(img, num_levels, pyr))
cv::Mat imgMat = img.getMat();
if(tegra::createLaplacePyr(imgMat, num_levels, pyr))
return;
#endif
