Merge pull request #3785 from jet47:tiny-gpu-module

2015-05-18 08:00:12 +00:00 · 2015-05-18 08:00:12 +00:00 · 0410fe6978
commit 0410fe6978
parent cc94b393c5 f10537cdd6
67 changed files with 3892 additions and 359 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -194,6 +194,7 @@ OCV_OPTION(BUILD_WITH_STATIC_CRT    "Enables use of staticaly linked CRT for sta
 OCV_OPTION(BUILD_FAT_JAVA_LIB       "Create fat java wrapper containing the whole OpenCV library" ON IF NOT BUILD_SHARED_LIBS AND CMAKE_COMPILER_IS_GNUCXX )
 OCV_OPTION(BUILD_ANDROID_SERVICE    "Build OpenCV Manager for Google Play" OFF IF ANDROID AND ANDROID_SOURCE_TREE )
 OCV_OPTION(BUILD_ANDROID_PACKAGE    "Build platform-specific package for Google Play" OFF IF ANDROID )
 OCV_OPTION(BUILD_TINY_GPU_MODULE    "Build tiny gpu module with limited image format support" OFF )
 # 3rd party libs
 OCV_OPTION(BUILD_ZLIB               "Build zlib from source"             WIN32 OR APPLE )
@ -996,6 +997,7 @@ if(HAVE_CUDA)
  status("    NVIDIA GPU arch:"      ${OPENCV_CUDA_ARCH_BIN})
  status("    NVIDIA PTX archs:"     ${OPENCV_CUDA_ARCH_PTX})
  status("    Use fast math:"        CUDA_FAST_MATH THEN YES ELSE NO)
  status("    Tiny gpu module:"      BUILD_TINY_GPU_MODULE THEN YES ELSE NO)
 endif()
 if(HAVE_OPENCL)
--- a/cmake/OpenCVCompilerOptions.cmake
+++ b/cmake/OpenCVCompilerOptions.cmake
@ -63,6 +63,10 @@ if(OPENCV_CAN_BREAK_BINARY_COMPATIBILITY)
  add_definitions(-DOPENCV_CAN_BREAK_BINARY_COMPATIBILITY)
 endif()
 if(BUILD_TINY_GPU_MODULE)
  add_definitions(-DOPENCV_TINY_GPU_MODULE)
 endif()
 if(CMAKE_COMPILER_IS_GNUCXX)
  # High level of warnings.
  add_extra_compiler_option(-W)
--- a/modules/gpu/perf/perf_core.cpp
+++ b/modules/gpu/perf/perf_core.cpp
@ -46,7 +46,11 @@ using namespace std;
 using namespace testing;
 using namespace perf;
 #ifdef OPENCV_TINY_GPU_MODULE
 #define ARITHM_MAT_DEPTH Values(CV_8U, CV_32F)
 #else
 #define ARITHM_MAT_DEPTH Values(CV_8U, CV_16U, CV_32F, CV_64F)
 #endif
 //////////////////////////////////////////////////////////////////////
 // Merge
@ -524,9 +528,17 @@ PERF_TEST_P(Sz_Depth, Core_AbsDiffScalar,
 //////////////////////////////////////////////////////////////////////
 // Abs
-PERF_TEST_P(Sz_Depth, Core_Abs,
+#ifdef OPENCV_TINY_GPU_MODULE
-            Combine(GPU_TYPICAL_MAT_SIZES,
+PERF_TEST_P(Sz_Depth, Core_Abs, Combine(
-                    Values(CV_16S, CV_32F)))
+    GPU_TYPICAL_MAT_SIZES,
    Values(MatDepth(CV_32F))
 ))
 #else
 PERF_TEST_P(Sz_Depth, Core_Abs, Combine(
    GPU_TYPICAL_MAT_SIZES,
    Values(CV_16S, CV_32F)
 ))
 #endif
 {
    const cv::Size size = GET_PARAM(0);
    const int depth = GET_PARAM(1);
@ -552,9 +564,17 @@ PERF_TEST_P(Sz_Depth, Core_Abs,
 //////////////////////////////////////////////////////////////////////
 // Sqr
-PERF_TEST_P(Sz_Depth, Core_Sqr,
+#ifdef OPENCV_TINY_GPU_MODULE
-            Combine(GPU_TYPICAL_MAT_SIZES,
+PERF_TEST_P(Sz_Depth, Core_Sqr, Combine(
-                    Values(CV_8U, CV_16S, CV_32F)))
+    GPU_TYPICAL_MAT_SIZES,
    Values(MatDepth(CV_32F))
 ))
 #else
 PERF_TEST_P(Sz_Depth, Core_Sqr, Combine(
    GPU_TYPICAL_MAT_SIZES,
    Values(CV_8U, CV_16S, CV_32F)
 ))
 #endif
 {
    const cv::Size size = GET_PARAM(0);
    const int depth = GET_PARAM(1);
@ -580,9 +600,17 @@ PERF_TEST_P(Sz_Depth, Core_Sqr,
 //////////////////////////////////////////////////////////////////////
 // Sqrt
-PERF_TEST_P(Sz_Depth, Core_Sqrt,
+#ifdef OPENCV_TINY_GPU_MODULE
-            Combine(GPU_TYPICAL_MAT_SIZES,
+PERF_TEST_P(Sz_Depth, Core_Sqrt, Combine(
-                    Values(CV_8U, CV_16S, CV_32F)))
+    GPU_TYPICAL_MAT_SIZES,
    Values(MatDepth(CV_32F))
 ))
 #else
 PERF_TEST_P(Sz_Depth, Core_Sqrt, Combine(
    GPU_TYPICAL_MAT_SIZES,
    Values(CV_8U, CV_16S, CV_32F)
 ))
 #endif
 {
    const cv::Size size = GET_PARAM(0);
    const int depth = GET_PARAM(1);
@ -612,9 +640,17 @@ PERF_TEST_P(Sz_Depth, Core_Sqrt,
 //////////////////////////////////////////////////////////////////////
 // Log
-PERF_TEST_P(Sz_Depth, Core_Log,
+#ifdef OPENCV_TINY_GPU_MODULE
-            Combine(GPU_TYPICAL_MAT_SIZES,
+PERF_TEST_P(Sz_Depth, Core_Log, Combine(
-                    Values(CV_8U, CV_16S, CV_32F)))
+    GPU_TYPICAL_MAT_SIZES,
    Values(MatDepth(CV_32F))
 ))
 #else
 PERF_TEST_P(Sz_Depth, Core_Log, Combine(
    GPU_TYPICAL_MAT_SIZES,
    Values(CV_8U, CV_16S, CV_32F)
 ))
 #endif
 {
    const cv::Size size = GET_PARAM(0);
    const int depth = GET_PARAM(1);
@ -644,9 +680,17 @@ PERF_TEST_P(Sz_Depth, Core_Log,
 //////////////////////////////////////////////////////////////////////
 // Exp
-PERF_TEST_P(Sz_Depth, Core_Exp,
+#ifdef OPENCV_TINY_GPU_MODULE
-            Combine(GPU_TYPICAL_MAT_SIZES,
+PERF_TEST_P(Sz_Depth, Core_Exp, Combine(
-                    Values(CV_8U, CV_16S, CV_32F)))
+    GPU_TYPICAL_MAT_SIZES,
    Values(MatDepth(CV_32F))
 ))
 #else
 PERF_TEST_P(Sz_Depth, Core_Exp, Combine(
    GPU_TYPICAL_MAT_SIZES,
    Values(CV_8U, CV_16S, CV_32F)
 ))
 #endif
 {
    const cv::Size size = GET_PARAM(0);
    const int depth = GET_PARAM(1);
@ -678,10 +722,19 @@ PERF_TEST_P(Sz_Depth, Core_Exp,
 DEF_PARAM_TEST(Sz_Depth_Power, cv::Size, MatDepth, double);
-PERF_TEST_P(Sz_Depth_Power, Core_Pow,
+#ifdef OPENCV_TINY_GPU_MODULE
-            Combine(GPU_TYPICAL_MAT_SIZES,
+PERF_TEST_P(Sz_Depth_Power, Core_Pow, Combine(
-                    Values(CV_8U, CV_16S, CV_32F),
+    GPU_TYPICAL_MAT_SIZES,
-                    Values(0.3, 2.0, 2.4)))
+    Values(MatDepth(CV_32F)),
    Values(0.3, 2.0, 2.4)
 ))
 #else
 PERF_TEST_P(Sz_Depth_Power, Core_Pow, Combine(
    GPU_TYPICAL_MAT_SIZES,
    Values(CV_8U, CV_16S, CV_32F),
    Values(0.3, 2.0, 2.4)
 ))
 #endif
 {
    const cv::Size size = GET_PARAM(0);
    const int depth = GET_PARAM(1);
@ -859,10 +912,19 @@ PERF_TEST_P(Sz_Depth, Core_BitwiseAndMat,
 //////////////////////////////////////////////////////////////////////
 // BitwiseAndScalar
-PERF_TEST_P(Sz_Depth_Cn, Core_BitwiseAndScalar,
+#ifdef OPENCV_TINY_GPU_MODULE
-            Combine(GPU_TYPICAL_MAT_SIZES,
+PERF_TEST_P(Sz_Depth_Cn, Core_BitwiseAndScalar, Combine(
-                    Values(CV_8U, CV_16U, CV_32S),
+    GPU_TYPICAL_MAT_SIZES,
-                    GPU_CHANNELS_1_3_4))
+    Values(MatDepth(CV_8U)),
    testing::Values(MatCn(Gray))
 ))
 #else
 PERF_TEST_P(Sz_Depth_Cn, Core_BitwiseAndScalar, Combine(
    GPU_TYPICAL_MAT_SIZES,
    Values(CV_8U, CV_16U, CV_32S),
    GPU_CHANNELS_1_3_4
 ))
 #endif
 {
    const cv::Size size = GET_PARAM(0);
    const int depth = GET_PARAM(1);
@ -935,10 +997,19 @@ PERF_TEST_P(Sz_Depth, Core_BitwiseOrMat,
 //////////////////////////////////////////////////////////////////////
 // BitwiseOrScalar
-PERF_TEST_P(Sz_Depth_Cn, Core_BitwiseOrScalar,
+#ifdef OPENCV_TINY_GPU_MODULE
-            Combine(GPU_TYPICAL_MAT_SIZES,
+PERF_TEST_P(Sz_Depth_Cn, Core_BitwiseOrScalar, Combine(
-                    Values(CV_8U, CV_16U, CV_32S),
+    GPU_TYPICAL_MAT_SIZES,
-                    GPU_CHANNELS_1_3_4))
+    Values(MatDepth(CV_8U)),
    testing::Values(MatCn(Gray))
 ))
 #else
 PERF_TEST_P(Sz_Depth_Cn, Core_BitwiseOrScalar, Combine(
    GPU_TYPICAL_MAT_SIZES,
    Values(CV_8U, CV_16U, CV_32S),
    GPU_CHANNELS_1_3_4
 ))
 #endif
 {
    const cv::Size size = GET_PARAM(0);
    const int depth = GET_PARAM(1);
@ -1011,10 +1082,19 @@ PERF_TEST_P(Sz_Depth, Core_BitwiseXorMat,
 //////////////////////////////////////////////////////////////////////
 // BitwiseXorScalar
-PERF_TEST_P(Sz_Depth_Cn, Core_BitwiseXorScalar,
+#ifdef OPENCV_TINY_GPU_MODULE
-            Combine(GPU_TYPICAL_MAT_SIZES,
+PERF_TEST_P(Sz_Depth_Cn, Core_BitwiseXorScalar, Combine(
-                    Values(CV_8U, CV_16U, CV_32S),
+    GPU_TYPICAL_MAT_SIZES,
-                    GPU_CHANNELS_1_3_4))
+    Values(MatDepth(CV_8U)),
    testing::Values(MatCn(Gray))
 ))
 #else
 PERF_TEST_P(Sz_Depth_Cn, Core_BitwiseXorScalar, Combine(
    GPU_TYPICAL_MAT_SIZES,
    Values(CV_8U, CV_16U, CV_32S),
    GPU_CHANNELS_1_3_4
 ))
 #endif
 {
    const cv::Size size = GET_PARAM(0);
    const int depth = GET_PARAM(1);
@ -1155,9 +1235,17 @@ PERF_TEST_P(Sz_Depth, Core_MinMat,
 //////////////////////////////////////////////////////////////////////
 // MinScalar
-PERF_TEST_P(Sz_Depth, Core_MinScalar,
+#ifdef OPENCV_TINY_GPU_MODULE
-            Combine(GPU_TYPICAL_MAT_SIZES,
+PERF_TEST_P(Sz_Depth, Core_MinScalar, Combine(
-                    Values(CV_8U, CV_16U, CV_32F)))
+    GPU_TYPICAL_MAT_SIZES,
    Values(CV_8U, CV_32F)
 ))
 #else
 PERF_TEST_P(Sz_Depth, Core_MinScalar, Combine(
    GPU_TYPICAL_MAT_SIZES,
    Values(CV_8U, CV_16U, CV_32F)
 ))
 #endif
 {
    const cv::Size size = GET_PARAM(0);
    const int depth = GET_PARAM(1);
@ -1226,9 +1314,17 @@ PERF_TEST_P(Sz_Depth, Core_MaxMat,
 //////////////////////////////////////////////////////////////////////
 // MaxScalar
-PERF_TEST_P(Sz_Depth, Core_MaxScalar,
+#ifdef OPENCV_TINY_GPU_MODULE
-            Combine(GPU_TYPICAL_MAT_SIZES,
+PERF_TEST_P(Sz_Depth, Core_MaxScalar, Combine(
-                    Values(CV_8U, CV_16U, CV_32F)))
+    GPU_TYPICAL_MAT_SIZES,
    Values(CV_8U, CV_32F)
 ))
 #else
 PERF_TEST_P(Sz_Depth, Core_MaxScalar, Combine(
    GPU_TYPICAL_MAT_SIZES,
    Values(CV_8U, CV_16U, CV_32F)
 ))
 #endif
 {
    const cv::Size size = GET_PARAM(0);
    const int depth = GET_PARAM(1);
@ -1263,11 +1359,21 @@ PERF_TEST_P(Sz_Depth, Core_MaxScalar,
 DEF_PARAM_TEST(Sz_3Depth, cv::Size, MatDepth, MatDepth, MatDepth);
-PERF_TEST_P(Sz_3Depth, Core_AddWeighted,
+#ifdef OPENCV_TINY_GPU_MODULE
-            Combine(GPU_TYPICAL_MAT_SIZES,
+PERF_TEST_P(Sz_3Depth, Core_AddWeighted, Combine(
-                    Values(CV_8U, CV_16U, CV_32F, CV_64F),
+    GPU_TYPICAL_MAT_SIZES,
-                    Values(CV_8U, CV_16U, CV_32F, CV_64F),
+    Values(MatDepth(CV_32F)),
-                    Values(CV_8U, CV_16U, CV_32F, CV_64F)))
+    Values(MatDepth(CV_32F)),
    Values(MatDepth(CV_32F))
 ))
 #else
 PERF_TEST_P(Sz_3Depth, Core_AddWeighted, Combine(
    GPU_TYPICAL_MAT_SIZES,
    Values(CV_8U, CV_16U, CV_32F, CV_64F),
    Values(CV_8U, CV_16U, CV_32F, CV_64F),
    Values(CV_8U, CV_16U, CV_32F, CV_64F)
 ))
 #endif
 {
    const cv::Size size = GET_PARAM(0);
    const int depth1 = GET_PARAM(1);
@ -1782,10 +1888,19 @@ PERF_TEST_P(Sz, Core_MeanStdDev,
 DEF_PARAM_TEST(Sz_Depth_Norm, cv::Size, MatDepth, NormType);
-PERF_TEST_P(Sz_Depth_Norm, Core_Norm,
+#ifdef OPENCV_TINY_GPU_MODULE
-            Combine(GPU_TYPICAL_MAT_SIZES,
+PERF_TEST_P(Sz_Depth_Norm, Core_Norm, Combine(
-                    Values(CV_8U, CV_16U, CV_32S, CV_32F),
+    GPU_TYPICAL_MAT_SIZES,
-                    Values(NormType(cv::NORM_INF), NormType(cv::NORM_L1), NormType(cv::NORM_L2))))
+    Values(CV_8U, CV_32F),
    Values(NormType(cv::NORM_INF), NormType(cv::NORM_L1), NormType(cv::NORM_L2))
 ))
 #else
 PERF_TEST_P(Sz_Depth_Norm, Core_Norm, Combine(
    GPU_TYPICAL_MAT_SIZES,
    Values(CV_8U, CV_16U, CV_32S, CV_32F),
    Values(NormType(cv::NORM_INF), NormType(cv::NORM_L1), NormType(cv::NORM_L2))
 ))
 #endif
 {
    const cv::Size size = GET_PARAM(0);
    const int depth = GET_PARAM(1);
@ -1859,10 +1974,19 @@ PERF_TEST_P(Sz_Norm, Core_NormDiff,
 //////////////////////////////////////////////////////////////////////
 // Sum
-PERF_TEST_P(Sz_Depth_Cn, Core_Sum,
+#ifdef OPENCV_TINY_GPU_MODULE
-            Combine(GPU_TYPICAL_MAT_SIZES,
+PERF_TEST_P(Sz_Depth_Cn, Core_Sum, Combine(
-                    Values(CV_8U, CV_16U, CV_32F),
+    GPU_TYPICAL_MAT_SIZES,
-                    GPU_CHANNELS_1_3_4))
+    Values(CV_8U, CV_32F),
    testing::Values(MatCn(Gray))
 ))
 #else
 PERF_TEST_P(Sz_Depth_Cn, Core_Sum, Combine(
    GPU_TYPICAL_MAT_SIZES,
    Values(CV_8U, CV_16U, CV_32F),
    GPU_CHANNELS_1_3_4
 ))
 #endif
 {
    const cv::Size size = GET_PARAM(0);
    const int depth = GET_PARAM(1);
@ -1896,10 +2020,19 @@ PERF_TEST_P(Sz_Depth_Cn, Core_Sum,
 //////////////////////////////////////////////////////////////////////
 // SumAbs
-PERF_TEST_P(Sz_Depth_Cn, Core_SumAbs,
+#ifdef OPENCV_TINY_GPU_MODULE
-            Combine(GPU_TYPICAL_MAT_SIZES,
+PERF_TEST_P(Sz_Depth_Cn, Core_SumAbs, Combine(
-                    Values(CV_8U, CV_16U, CV_32F),
+    GPU_TYPICAL_MAT_SIZES,
-                    GPU_CHANNELS_1_3_4))
+    Values(CV_8U, CV_32F),
    testing::Values(MatCn(Gray))
 ))
 #else
 PERF_TEST_P(Sz_Depth_Cn, Core_SumAbs, Combine(
    GPU_TYPICAL_MAT_SIZES,
    Values(CV_8U, CV_16U, CV_32F),
    GPU_CHANNELS_1_3_4
 ))
 #endif
 {
    const cv::Size size = GET_PARAM(0);
    const int depth = GET_PARAM(1);
@ -1929,10 +2062,19 @@ PERF_TEST_P(Sz_Depth_Cn, Core_SumAbs,
 //////////////////////////////////////////////////////////////////////
 // SumSqr
-PERF_TEST_P(Sz_Depth_Cn, Core_SumSqr,
+#ifdef OPENCV_TINY_GPU_MODULE
-            Combine(GPU_TYPICAL_MAT_SIZES,
+PERF_TEST_P(Sz_Depth_Cn, Core_SumSqr, Combine(
-                    Values<MatDepth>(CV_8U, CV_16U, CV_32F),
+    GPU_TYPICAL_MAT_SIZES,
-                    GPU_CHANNELS_1_3_4))
+    Values(CV_8U, CV_32F),
    testing::Values(MatCn(Gray))
 ))
 #else
 PERF_TEST_P(Sz_Depth_Cn, Core_SumSqr, Combine(
    GPU_TYPICAL_MAT_SIZES,
    Values(CV_8U, CV_16U, CV_32F),
    GPU_CHANNELS_1_3_4
 ))
 #endif
 {
    const cv::Size size = GET_PARAM(0);
    const int depth = GET_PARAM(1);
@ -1962,9 +2104,17 @@ PERF_TEST_P(Sz_Depth_Cn, Core_SumSqr,
 //////////////////////////////////////////////////////////////////////
 // MinMax
-PERF_TEST_P(Sz_Depth, Core_MinMax,
+#ifdef OPENCV_TINY_GPU_MODULE
-            Combine(GPU_TYPICAL_MAT_SIZES,
+PERF_TEST_P(Sz_Depth, Core_MinMax, Combine(
-                    Values(CV_8U, CV_16U, CV_32F, CV_64F)))
+    GPU_TYPICAL_MAT_SIZES,
    Values(CV_8U, CV_32F)
 ))
 #else
 PERF_TEST_P(Sz_Depth, Core_MinMax, Combine(
    GPU_TYPICAL_MAT_SIZES,
    Values(CV_8U, CV_16U, CV_32F, CV_64F)
 ))
 #endif
 {
    const cv::Size size = GET_PARAM(0);
    const int depth = GET_PARAM(1);
@ -2000,9 +2150,17 @@ PERF_TEST_P(Sz_Depth, Core_MinMax,
 //////////////////////////////////////////////////////////////////////
 // MinMaxLoc
-PERF_TEST_P(Sz_Depth, Core_MinMaxLoc,
+#ifdef OPENCV_TINY_GPU_MODULE
-            Combine(GPU_TYPICAL_MAT_SIZES,
+PERF_TEST_P(Sz_Depth, Core_MinMaxLoc, Combine(
-                    Values(CV_8U, CV_16U, CV_32F, CV_64F)))
+    GPU_TYPICAL_MAT_SIZES,
    Values(CV_8U, CV_32F)
 ))
 #else
 PERF_TEST_P(Sz_Depth, Core_MinMaxLoc, Combine(
    GPU_TYPICAL_MAT_SIZES,
    Values(CV_8U, CV_16U, CV_32F, CV_64F)
 ))
 #endif
 {
    const cv::Size size = GET_PARAM(0);
    const int depth = GET_PARAM(1);
@ -2040,9 +2198,17 @@ PERF_TEST_P(Sz_Depth, Core_MinMaxLoc,
 //////////////////////////////////////////////////////////////////////
 // CountNonZero
-PERF_TEST_P(Sz_Depth, Core_CountNonZero,
+#ifdef OPENCV_TINY_GPU_MODULE
-            Combine(GPU_TYPICAL_MAT_SIZES,
+PERF_TEST_P(Sz_Depth, Core_CountNonZero, Combine(
-                    Values(CV_8U, CV_16U, CV_32F, CV_64F)))
+    GPU_TYPICAL_MAT_SIZES,
    Values(CV_8U, CV_32F)
 ))
 #else
 PERF_TEST_P(Sz_Depth, Core_CountNonZero, Combine(
    GPU_TYPICAL_MAT_SIZES,
    Values(CV_8U, CV_16U, CV_32F, CV_64F)
 ))
 #endif
 {
    const cv::Size size = GET_PARAM(0);
    const int depth = GET_PARAM(1);
@ -2079,12 +2245,23 @@ CV_ENUM(ReduceDim, Rows, Cols)
 DEF_PARAM_TEST(Sz_Depth_Cn_Code_Dim, cv::Size, MatDepth, MatCn, ReduceCode, ReduceDim);
-PERF_TEST_P(Sz_Depth_Cn_Code_Dim, Core_Reduce,
+#ifdef OPENCV_TINY_GPU_MODULE
-            Combine(GPU_TYPICAL_MAT_SIZES,
+PERF_TEST_P(Sz_Depth_Cn_Code_Dim, Core_Reduce, Combine(
-                    Values(CV_8U, CV_16U, CV_16S, CV_32F),
+    GPU_TYPICAL_MAT_SIZES,
-                    Values(1, 2, 3, 4),
+    Values(CV_8U, CV_32F),
-                    ReduceCode::all(),
+    Values(1, 2, 3, 4),
-                    ReduceDim::all()))
+    ReduceCode::all(),
    ReduceDim::all()
 ))
 #else
 PERF_TEST_P(Sz_Depth_Cn_Code_Dim, Core_Reduce, Combine(
    GPU_TYPICAL_MAT_SIZES,
    Values(CV_8U, CV_16U, CV_16S, CV_32F),
    Values(1, 2, 3, 4),
    ReduceCode::all(),
    ReduceDim::all()
 ))
 #endif
 {
    const cv::Size size = GET_PARAM(0);
    const int depth = GET_PARAM(1);
@ -2120,13 +2297,25 @@ PERF_TEST_P(Sz_Depth_Cn_Code_Dim, Core_Reduce,
 DEF_PARAM_TEST(Sz_Depth_NormType, cv::Size, MatDepth, NormType);
-PERF_TEST_P(Sz_Depth_NormType, Core_Normalize,
+#ifdef OPENCV_TINY_GPU_MODULE
-            Combine(GPU_TYPICAL_MAT_SIZES,
+PERF_TEST_P(Sz_Depth_NormType, Core_Normalize, Combine(
-                    Values(CV_8U, CV_16U, CV_32F, CV_64F),
+    GPU_TYPICAL_MAT_SIZES,
-                    Values(NormType(cv::NORM_INF),
+    Values(CV_8U, CV_32F),
-                           NormType(cv::NORM_L1),
+    Values(NormType(cv::NORM_INF),
-                           NormType(cv::NORM_L2),
+           NormType(cv::NORM_L1),
-                           NormType(cv::NORM_MINMAX))))
+           NormType(cv::NORM_L2),
           NormType(cv::NORM_MINMAX))
 ))
 #else
 PERF_TEST_P(Sz_Depth_NormType, Core_Normalize, Combine(
    GPU_TYPICAL_MAT_SIZES,
    Values(CV_8U, CV_16U, CV_32F, CV_64F),
    Values(NormType(cv::NORM_INF),
           NormType(cv::NORM_L1),
           NormType(cv::NORM_L2),
           NormType(cv::NORM_MINMAX))
 ))
 #endif
 {
    const cv::Size size = GET_PARAM(0);
    const int type = GET_PARAM(1);
--- a/modules/gpu/perf/perf_features2d.cpp
+++ b/modules/gpu/perf/perf_features2d.cpp
@ -145,9 +145,17 @@ PERF_TEST_P(Image_NFeatures, Features2D_ORB,
 DEF_PARAM_TEST(DescSize_Norm, int, NormType);
-PERF_TEST_P(DescSize_Norm, Features2D_BFMatch,
+#ifdef OPENCV_TINY_GPU_MODULE
-            Combine(Values(64, 128, 256),
+PERF_TEST_P(DescSize_Norm, Features2D_BFMatch, Combine(
-                    Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2), NormType(cv::NORM_HAMMING))))
+    Values(64, 128, 256),
    Values(NormType(cv::NORM_L2), NormType(cv::NORM_HAMMING))
 ))
 #else
 PERF_TEST_P(DescSize_Norm, Features2D_BFMatch, Combine(
    Values(64, 128, 256),
    Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2), NormType(cv::NORM_HAMMING))
 ))
 #endif
 {
    declare.time(20.0);
@ -202,10 +210,19 @@ static void toOneRowMatches(const std::vector< std::vector<cv::DMatch> >& src, s
 DEF_PARAM_TEST(DescSize_K_Norm, int, int, NormType);
-PERF_TEST_P(DescSize_K_Norm, Features2D_BFKnnMatch,
+#ifdef OPENCV_TINY_GPU_MODULE
-            Combine(Values(64, 128, 256),
+PERF_TEST_P(DescSize_K_Norm, Features2D_BFKnnMatch, Combine(
-                    Values(2, 3),
+    Values(64, 128, 256),
-                    Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2))))
+    Values(2, 3),
    Values(NormType(cv::NORM_L2))
 ))
 #else
 PERF_TEST_P(DescSize_K_Norm, Features2D_BFKnnMatch, Combine(
    Values(64, 128, 256),
    Values(2, 3),
    Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2))
 ))
 #endif
 {
    declare.time(30.0);
@ -257,9 +274,17 @@ PERF_TEST_P(DescSize_K_Norm, Features2D_BFKnnMatch,
 //////////////////////////////////////////////////////////////////////
 // BFRadiusMatch
-PERF_TEST_P(DescSize_Norm, Features2D_BFRadiusMatch,
+#ifdef OPENCV_TINY_GPU_MODULE
-            Combine(Values(64, 128, 256),
+PERF_TEST_P(DescSize_Norm, Features2D_BFRadiusMatch, Combine(
-                    Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2))))
+    Values(64, 128, 256),
    Values(NormType(cv::NORM_L2))
 ))
 #else
 PERF_TEST_P(DescSize_Norm, Features2D_BFRadiusMatch, Combine(
    Values(64, 128, 256),
    Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2))
 ))
 #endif
 {
    declare.time(30.0);
--- a/modules/gpu/perf/perf_filters.cpp
+++ b/modules/gpu/perf/perf_filters.cpp
@ -87,7 +87,19 @@ PERF_TEST_P(Sz_Type_KernelSz, Filters_Blur,
 //////////////////////////////////////////////////////////////////////
 // Sobel
-PERF_TEST_P(Sz_Type_KernelSz, Filters_Sobel, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1), Values(3, 5, 7, 9, 11, 13, 15)))
+#ifdef OPENCV_TINY_GPU_MODULE
 PERF_TEST_P(Sz_Type_KernelSz, Filters_Sobel, Combine(
    GPU_TYPICAL_MAT_SIZES,
    Values(CV_8UC1, CV_8UC4, CV_32FC1),
    Values(3, 5, 7)
 ))
 #else
 PERF_TEST_P(Sz_Type_KernelSz, Filters_Sobel, Combine(
    GPU_TYPICAL_MAT_SIZES,
    Values(CV_8UC1, CV_8UC4, CV_32FC1),
    Values(3, 5, 7, 9, 11, 13, 15)
 ))
 #endif
 {
    declare.time(20.0);
@ -154,7 +166,19 @@ PERF_TEST_P(Sz_Type, Filters_Scharr, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U
 //////////////////////////////////////////////////////////////////////
 // GaussianBlur
-PERF_TEST_P(Sz_Type_KernelSz, Filters_GaussianBlur, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1), Values(3, 5, 7, 9, 11, 13, 15)))
+#ifdef OPENCV_TINY_GPU_MODULE
 PERF_TEST_P(Sz_Type_KernelSz, Filters_GaussianBlur, Combine(
    GPU_TYPICAL_MAT_SIZES,
    Values(CV_8UC1, CV_8UC4, CV_32FC1),
    Values(3, 5, 7)
 ))
 #else
 PERF_TEST_P(Sz_Type_KernelSz, Filters_GaussianBlur, Combine(
    GPU_TYPICAL_MAT_SIZES,
    Values(CV_8UC1, CV_8UC4, CV_32FC1),
    Values(3, 5, 7, 9, 11, 13, 15)
 ))
 #endif
 {
    declare.time(20.0);
--- a/modules/gpu/perf/perf_imgproc.cpp
+++ b/modules/gpu/perf/perf_imgproc.cpp
@ -91,13 +91,25 @@ void generateMap(cv::Mat& map_x, cv::Mat& map_y, int remapMode)
 DEF_PARAM_TEST(Sz_Depth_Cn_Inter_Border_Mode, cv::Size, MatDepth, MatCn, Interpolation, BorderMode, RemapMode);
-PERF_TEST_P(Sz_Depth_Cn_Inter_Border_Mode, ImgProc_Remap,
+#ifdef OPENCV_TINY_GPU_MODULE
-            Combine(GPU_TYPICAL_MAT_SIZES,
+PERF_TEST_P(Sz_Depth_Cn_Inter_Border_Mode, ImgProc_Remap, Combine(
-                    Values(CV_8U, CV_16U, CV_32F),
+    GPU_TYPICAL_MAT_SIZES,
-                    GPU_CHANNELS_1_3_4,
+    Values(CV_8U, CV_32F),
-                    Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
+    GPU_CHANNELS_1_3_4,
-                    ALL_BORDER_MODES,
+    Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR)),
-                    RemapMode::all()))
+    ALL_BORDER_MODES,
    RemapMode::all()
 ))
 #else
 PERF_TEST_P(Sz_Depth_Cn_Inter_Border_Mode, ImgProc_Remap, Combine(
    GPU_TYPICAL_MAT_SIZES,
    Values(CV_8U, CV_16U, CV_32F),
    GPU_CHANNELS_1_3_4,
    Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
    ALL_BORDER_MODES,
    RemapMode::all()
 ))
 #endif
 {
    declare.time(20.0);
@ -143,12 +155,23 @@ PERF_TEST_P(Sz_Depth_Cn_Inter_Border_Mode, ImgProc_Remap,
 DEF_PARAM_TEST(Sz_Depth_Cn_Inter_Scale, cv::Size, MatDepth, MatCn, Interpolation, double);
-PERF_TEST_P(Sz_Depth_Cn_Inter_Scale, ImgProc_Resize,
+#ifdef OPENCV_TINY_GPU_MODULE
-            Combine(GPU_TYPICAL_MAT_SIZES,
+PERF_TEST_P(Sz_Depth_Cn_Inter_Scale, ImgProc_Resize, Combine(
-                    Values(CV_8U, CV_16U, CV_32F),
+    GPU_TYPICAL_MAT_SIZES,
-                    GPU_CHANNELS_1_3_4,
+    Values(CV_8U, CV_32F),
-                    Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
+    GPU_CHANNELS_1_3_4,
-                    Values(0.5, 0.3, 2.0)))
+    Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR)),
    Values(0.5, 0.3, 2.0)
 ))
 #else
 PERF_TEST_P(Sz_Depth_Cn_Inter_Scale, ImgProc_Resize, Combine(
    GPU_TYPICAL_MAT_SIZES,
    Values(CV_8U, CV_16U, CV_32F),
    GPU_CHANNELS_1_3_4,
    Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
    Values(0.5, 0.3, 2.0)
 ))
 #endif
 {
    declare.time(20.0);
@ -187,11 +210,21 @@ PERF_TEST_P(Sz_Depth_Cn_Inter_Scale, ImgProc_Resize,
 DEF_PARAM_TEST(Sz_Depth_Cn_Scale, cv::Size, MatDepth, MatCn, double);
-PERF_TEST_P(Sz_Depth_Cn_Scale, ImgProc_ResizeArea,
+#ifdef OPENCV_TINY_GPU_MODULE
-            Combine(GPU_TYPICAL_MAT_SIZES,
+PERF_TEST_P(Sz_Depth_Cn_Scale, ImgProc_ResizeArea, Combine(
-                    Values(CV_8U, CV_16U, CV_32F),
+    GPU_TYPICAL_MAT_SIZES,
-                    GPU_CHANNELS_1_3_4,
+    Values(CV_8U, CV_32F),
-                    Values(0.2, 0.1, 0.05)))
+    GPU_CHANNELS_1_3_4,
    Values(0.2, 0.1, 0.05)
 ))
 #else
 PERF_TEST_P(Sz_Depth_Cn_Scale, ImgProc_ResizeArea, Combine(
    GPU_TYPICAL_MAT_SIZES,
    Values(CV_8U, CV_16U, CV_32F),
    GPU_CHANNELS_1_3_4,
    Values(0.2, 0.1, 0.05)
 ))
 #endif
 {
    declare.time(1.0);
@ -230,12 +263,23 @@ PERF_TEST_P(Sz_Depth_Cn_Scale, ImgProc_ResizeArea,
 DEF_PARAM_TEST(Sz_Depth_Cn_Inter_Border, cv::Size, MatDepth, MatCn, Interpolation, BorderMode);
-PERF_TEST_P(Sz_Depth_Cn_Inter_Border, ImgProc_WarpAffine,
+#ifdef OPENCV_TINY_GPU_MODULE
-            Combine(GPU_TYPICAL_MAT_SIZES,
+PERF_TEST_P(Sz_Depth_Cn_Inter_Border, ImgProc_WarpAffine, Combine(
-                    Values(CV_8U, CV_16U, CV_32F),
+    GPU_TYPICAL_MAT_SIZES,
-                    GPU_CHANNELS_1_3_4,
+    Values(CV_8U, CV_32F),
-                    Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
+    GPU_CHANNELS_1_3_4,
-                    ALL_BORDER_MODES))
+    Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR)),
    ALL_BORDER_MODES)
 )
 #else
 PERF_TEST_P(Sz_Depth_Cn_Inter_Border, ImgProc_WarpAffine, Combine(
    GPU_TYPICAL_MAT_SIZES,
    Values(CV_8U, CV_16U, CV_32F),
    GPU_CHANNELS_1_3_4,
    Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
    ALL_BORDER_MODES)
 )
 #endif
 {
    declare.time(20.0);
@ -280,12 +324,23 @@ PERF_TEST_P(Sz_Depth_Cn_Inter_Border, ImgProc_WarpAffine,
 //////////////////////////////////////////////////////////////////////
 // WarpPerspective
-PERF_TEST_P(Sz_Depth_Cn_Inter_Border, ImgProc_WarpPerspective,
+#ifdef OPENCV_TINY_GPU_MODULE
-            Combine(GPU_TYPICAL_MAT_SIZES,
+PERF_TEST_P(Sz_Depth_Cn_Inter_Border, ImgProc_WarpPerspective, Combine(
-                    Values(CV_8U, CV_16U, CV_32F),
+    GPU_TYPICAL_MAT_SIZES,
-                    GPU_CHANNELS_1_3_4,
+    Values(CV_8U, CV_32F),
-                    Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
+    GPU_CHANNELS_1_3_4,
-                    ALL_BORDER_MODES))
+    Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR)),
    ALL_BORDER_MODES)
 )
 #else
 PERF_TEST_P(Sz_Depth_Cn_Inter_Border, ImgProc_WarpPerspective, Combine(
    GPU_TYPICAL_MAT_SIZES,
    Values(CV_8U, CV_16U, CV_32F),
    GPU_CHANNELS_1_3_4,
    Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
    ALL_BORDER_MODES)
 )
 #endif
 {
    declare.time(20.0);
@ -330,11 +385,21 @@ PERF_TEST_P(Sz_Depth_Cn_Inter_Border, ImgProc_WarpPerspective,
 DEF_PARAM_TEST(Sz_Depth_Cn_Border, cv::Size, MatDepth, MatCn, BorderMode);
-PERF_TEST_P(Sz_Depth_Cn_Border, ImgProc_CopyMakeBorder,
+#ifdef OPENCV_TINY_GPU_MODULE
-            Combine(GPU_TYPICAL_MAT_SIZES,
+PERF_TEST_P(Sz_Depth_Cn_Border, ImgProc_CopyMakeBorder, Combine(
-                    Values(CV_8U, CV_16U, CV_32F),
+    GPU_TYPICAL_MAT_SIZES,
-                    GPU_CHANNELS_1_3_4,
+    Values(CV_8U, CV_32F),
-                    ALL_BORDER_MODES))
+    GPU_CHANNELS_1_3_4,
    ALL_BORDER_MODES)
 )
 #else
 PERF_TEST_P(Sz_Depth_Cn_Border, ImgProc_CopyMakeBorder, Combine(
    GPU_TYPICAL_MAT_SIZES,
    Values(CV_8U, CV_16U, CV_32F),
    GPU_CHANNELS_1_3_4,
    ALL_BORDER_MODES)
 )
 #endif
 {
    const cv::Size size = GET_PARAM(0);
    const int depth = GET_PARAM(1);
@ -372,10 +437,19 @@ CV_ENUM(ThreshOp, THRESH_BINARY, THRESH_BINARY_INV, THRESH_TRUNC, THRESH_TOZERO,
 DEF_PARAM_TEST(Sz_Depth_Op, cv::Size, MatDepth, ThreshOp);
-PERF_TEST_P(Sz_Depth_Op, ImgProc_Threshold,
+#ifdef OPENCV_TINY_GPU_MODULE
-            Combine(GPU_TYPICAL_MAT_SIZES,
+PERF_TEST_P(Sz_Depth_Op, ImgProc_Threshold, Combine(
-            Values(CV_8U, CV_16U, CV_32F, CV_64F),
+    GPU_TYPICAL_MAT_SIZES,
-            ThreshOp::all()))
+    Values(CV_8U, CV_32F),
    ThreshOp::all()
 ))
 #else
 PERF_TEST_P(Sz_Depth_Op, ImgProc_Threshold, Combine(
    GPU_TYPICAL_MAT_SIZES,
    Values(CV_8U, CV_16U, CV_32F, CV_64F),
    ThreshOp::all()
 ))
 #endif
 {
    const cv::Size size = GET_PARAM(0);
    const int depth = GET_PARAM(1);
@ -672,10 +746,19 @@ PERF_TEST_P(Sz, ImgProc_ColumnSum,
 DEF_PARAM_TEST(Image_AppertureSz_L2gradient, string, int, bool);
-PERF_TEST_P(Image_AppertureSz_L2gradient, ImgProc_Canny,
+#ifdef OPENCV_TINY_GPU_MODULE
-            Combine(Values("perf/800x600.png", "perf/1280x1024.png", "perf/1680x1050.png"),
+PERF_TEST_P(Image_AppertureSz_L2gradient, ImgProc_Canny, Combine(
-                    Values(3, 5),
+    Values("perf/800x600.png", "perf/1280x1024.png", "perf/1680x1050.png"),
-                    Bool()))
+    Values(3),
    Bool()
 ))
 #else
 PERF_TEST_P(Image_AppertureSz_L2gradient, ImgProc_Canny, Combine(
    Values("perf/800x600.png", "perf/1280x1024.png", "perf/1680x1050.png"),
    Values(3, 5),
    Bool()
 ))
 #endif
 {
    const string fileName = GET_PARAM(0);
    const int apperture_size = GET_PARAM(1);
@ -1300,10 +1383,19 @@ PERF_TEST_P(Sz_Depth_Cn_Inter, ImgProc_Rotate,
 //////////////////////////////////////////////////////////////////////
 // PyrDown
-PERF_TEST_P(Sz_Depth_Cn, ImgProc_PyrDown,
+#ifdef OPENCV_TINY_GPU_MODULE
-            Combine(GPU_TYPICAL_MAT_SIZES,
+PERF_TEST_P(Sz_Depth_Cn, ImgProc_PyrDown, Combine(
-                    Values(CV_8U, CV_16U, CV_32F),
+    GPU_TYPICAL_MAT_SIZES,
-                    GPU_CHANNELS_1_3_4))
+    Values(CV_8U, CV_32F),
    GPU_CHANNELS_1_3_4)
 )
 #else
 PERF_TEST_P(Sz_Depth_Cn, ImgProc_PyrDown, Combine(
    GPU_TYPICAL_MAT_SIZES,
    Values(CV_8U, CV_16U, CV_32F),
    GPU_CHANNELS_1_3_4)
 )
 #endif
 {
    const cv::Size size = GET_PARAM(0);
    const int depth = GET_PARAM(1);
@ -1336,10 +1428,19 @@ PERF_TEST_P(Sz_Depth_Cn, ImgProc_PyrDown,
 //////////////////////////////////////////////////////////////////////
 // PyrUp
-PERF_TEST_P(Sz_Depth_Cn, ImgProc_PyrUp,
+#ifdef OPENCV_TINY_GPU_MODULE
-            Combine(GPU_TYPICAL_MAT_SIZES,
+PERF_TEST_P(Sz_Depth_Cn, ImgProc_PyrUp, Combine(
-                    Values(CV_8U, CV_16U, CV_32F),
+    GPU_TYPICAL_MAT_SIZES,
-                    GPU_CHANNELS_1_3_4))
+    Values(CV_8U, CV_32F),
    GPU_CHANNELS_1_3_4)
 )
 #else
 PERF_TEST_P(Sz_Depth_Cn, ImgProc_PyrUp, Combine(
    GPU_TYPICAL_MAT_SIZES,
    Values(CV_8U, CV_16U, CV_32F),
    GPU_CHANNELS_1_3_4)
 )
 #endif
 {
    const cv::Size size = GET_PARAM(0);
    const int depth = GET_PARAM(1);
--- a/modules/gpu/src/brute_force_matcher.cpp
+++ b/modules/gpu/src/brute_force_matcher.cpp
@ -204,6 +204,26 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& query, const
                             const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
                             cudaStream_t stream);
 #ifdef OPENCV_TINY_GPU_MODULE
    static const caller_t callers[3][6] =
    {
        {
            0/*matchL1_gpu<unsigned char>*/, 0/*matchL1_gpu<signed char>*/,
            0/*matchL1_gpu<unsigned short>*/, 0/*matchL1_gpu<short>*/,
            0/*matchL1_gpu<int>*/, 0/*matchL1_gpu<float>*/
        },
        {
            0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/,
            0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/,
            0/*matchL2_gpu<int>*/, matchL2_gpu<float>
        },
        {
            matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/,
            0/*matchHamming_gpu<unsigned short>*/, 0/*matchHamming_gpu<short>*/,
            0/*matchHamming_gpu<int>*/, 0/*matchHamming_gpu<float>*/
        }
    };
 #else
    static const caller_t callers[3][6] =
    {
        {
@ -222,6 +242,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& query, const
            matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
        }
    };
 #endif
    CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
    CV_Assert(train.cols == query.cols && train.type() == query.type());
@ -334,6 +355,16 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& query, c
    GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance,
    const GpuMat& masks, Stream& stream)
 {
 #ifdef OPENCV_TINY_GPU_MODULE
    (void)query;
    (void)trainCollection;
    (void)trainIdx;
    (void)imgIdx;
    (void)distance;
    (void)masks;
    (void)stream;
    CV_Error(CV_StsNotImplemented, "not available in tiny build");
 #else
    if (query.empty() || trainCollection.empty())
        return;
@ -374,6 +405,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& query, c
    CV_Assert(func != 0);
    func(query, trainCollection, masks, trainIdx, imgIdx, distance, StreamAccessor::getStream(stream));
 #endif
 }
 void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, vector<DMatch>& matches)
@ -451,6 +483,26 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatchSingle(const GpuMat& query, co
                             const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
                             cudaStream_t stream);
 #ifdef OPENCV_TINY_GPU_MODULE
    static const caller_t callers[3][6] =
    {
        {
            0/*matchL1_gpu<unsigned char>*/, 0/*matchL1_gpu<signed char>*/,
            0/*matchL1_gpu<unsigned short>*/, 0/*matchL1_gpu<short>*/,
            0/*matchL1_gpu<int>*/, 0/*matchL1_gpu<float>*/
        },
        {
            0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/,
            0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/,
            0/*matchL2_gpu<int>*/, matchL2_gpu<float>
        },
        {
            matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/,
            0/*matchHamming_gpu<unsigned short>*/, 0/*matchHamming_gpu<short>*/,
            0/*matchHamming_gpu<int>*/, 0/*matchHamming_gpu<float>*/
        }
    };
 #else
    static const caller_t callers[3][6] =
    {
        {
@ -469,6 +521,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatchSingle(const GpuMat& query, co
            matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
        }
    };
 #endif
    CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
    CV_Assert(train.type() == query.type() && train.cols == query.cols);
@ -568,6 +621,16 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Collection(const GpuMat& quer
    GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance,
    const GpuMat& maskCollection, Stream& stream)
 {
 #ifdef OPENCV_TINY_GPU_MODULE
    (void)query;
    (void)trainCollection;
    (void)trainIdx;
    (void)imgIdx;
    (void)distance;
    (void)maskCollection;
    (void)stream;
    CV_Error(CV_StsNotImplemented, "not available in tiny build");
 #else
    if (query.empty() || trainCollection.empty())
        return;
@ -613,6 +676,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Collection(const GpuMat& quer
    CV_Assert(func != 0);
    func(query, trainCollection, maskCollection, trainIdx, imgIdx, distance, StreamAccessor::getStream(stream));
 #endif
 }
 void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Download(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance,
@ -755,6 +819,26 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchSingle(const GpuMat& query,
                             const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
                             cudaStream_t stream);
 #ifdef OPENCV_TINY_GPU_MODULE
    static const caller_t callers[3][6] =
    {
        {
            0/*matchL1_gpu<unsigned char>*/, 0/*matchL1_gpu<signed char>*/,
            0/*matchL1_gpu<unsigned short>*/, 0/*matchL1_gpu<short>*/,
            0/*matchL1_gpu<int>*/, 0/*matchL1_gpu<float>*/
        },
        {
            0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/,
            0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/,
            0/*matchL2_gpu<int>*/, matchL2_gpu<float>
        },
        {
            matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/,
            0/*matchHamming_gpu<unsigned short>*/, 0/*matchHamming_gpu<short>*/,
            0/*matchHamming_gpu<int>*/, 0/*matchHamming_gpu<float>*/
        }
    };
 #else
    static const caller_t callers[3][6] =
    {
        {
@ -773,6 +857,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchSingle(const GpuMat& query,
            matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
        }
    };
 #endif
    const int nQuery = query.rows;
    const int nTrain = train.rows;
@ -872,6 +957,17 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& query, const
 void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat& query, GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, GpuMat& nMatches,
    float maxDistance, const vector<GpuMat>& masks, Stream& stream)
 {
 #ifdef OPENCV_TINY_GPU_MODULE
    (void)query;
    (void)trainIdx;
    (void)imgIdx;
    (void)distance;
    (void)nMatches;
    (void)maxDistance;
    (void)masks;
    (void)stream;
    CV_Error(CV_StsNotImplemented, "not available in tiny build");
 #else
    if (query.empty() || empty())
        return;
@ -926,6 +1022,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat& qu
    func(query, &trains_[0], static_cast<int>(trains_.size()), maxDistance, masks_.size() == 0 ? 0 : &masks_[0],
        trainIdx, imgIdx, distance, nMatches, StreamAccessor::getStream(stream));
 #endif
 }
 void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, const GpuMat& nMatches,
--- a/modules/gpu/src/color.cpp
+++ b/modules/gpu/src/color.cpp
@ -71,6 +71,12 @@ namespace cv { namespace gpu {
 using namespace ::cv::gpu::device;
 #ifdef OPENCV_TINY_GPU_MODULE
    #define APPEND_16U(func) 0
 #else
    #define APPEND_16U(func) func ## _16u
 #endif
 namespace
 {
    typedef void (*gpu_func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
@ -78,10 +84,11 @@ namespace
    void bgr_to_rgb(const GpuMat& src, GpuMat& dst, int, Stream& stream)
    {
        using namespace cv::gpu::device;
-        static const gpu_func_t funcs[] = {bgr_to_rgb_8u, 0, bgr_to_rgb_16u, 0, 0, bgr_to_rgb_32f};
+        static const gpu_func_t funcs[] = {bgr_to_rgb_8u, 0, APPEND_16U(bgr_to_rgb), 0, 0, bgr_to_rgb_32f};
        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
        CV_Assert(src.channels() == 3);
        CV_Assert(funcs[src.depth()] != 0);
        dst.create(src.size(), CV_MAKETYPE(src.depth(), 3));
@ -91,10 +98,11 @@ namespace
    void bgr_to_bgra(const GpuMat& src, GpuMat& dst, int, Stream& stream)
    {
        using namespace cv::gpu::device;
-        static const gpu_func_t funcs[] = {bgr_to_bgra_8u, 0, bgr_to_bgra_16u, 0, 0, bgr_to_bgra_32f};
+        static const gpu_func_t funcs[] = {bgr_to_bgra_8u, 0, APPEND_16U(bgr_to_bgra), 0, 0, bgr_to_bgra_32f};
        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
        CV_Assert(src.channels() == 3);
        CV_Assert(funcs[src.depth()] != 0);
        dst.create(src.size(), CV_MAKETYPE(src.depth(), 4));
@ -104,10 +112,11 @@ namespace
    void bgr_to_rgba(const GpuMat& src, GpuMat& dst, int, Stream& stream)
    {
        using namespace cv::gpu::device;
-        static const gpu_func_t funcs[] = {bgr_to_rgba_8u, 0, bgr_to_rgba_16u, 0, 0, bgr_to_rgba_32f};
+        static const gpu_func_t funcs[] = {bgr_to_rgba_8u, 0, APPEND_16U(bgr_to_rgba), 0, 0, bgr_to_rgba_32f};
        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
        CV_Assert(src.channels() == 3);
        CV_Assert(funcs[src.depth()] != 0);
        dst.create(src.size(), CV_MAKETYPE(src.depth(), 4));
@ -117,10 +126,11 @@ namespace
    void bgra_to_bgr(const GpuMat& src, GpuMat& dst, int, Stream& stream)
    {
        using namespace cv::gpu::device;
-        static const gpu_func_t funcs[] = {bgra_to_bgr_8u, 0, bgra_to_bgr_16u, 0, 0, bgra_to_bgr_32f};
+        static const gpu_func_t funcs[] = {bgra_to_bgr_8u, 0, APPEND_16U(bgra_to_bgr), 0, 0, bgra_to_bgr_32f};
        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
        CV_Assert(src.channels() == 4);
        CV_Assert(funcs[src.depth()] != 0);
        dst.create(src.size(), CV_MAKETYPE(src.depth(), 3));
@ -130,10 +140,11 @@ namespace
    void bgra_to_rgb(const GpuMat& src, GpuMat& dst, int, Stream& stream)
    {
        using namespace cv::gpu::device;
-        static const gpu_func_t funcs[] = {bgra_to_rgb_8u, 0, bgra_to_rgb_16u, 0, 0, bgra_to_rgb_32f};
+        static const gpu_func_t funcs[] = {bgra_to_rgb_8u, 0, APPEND_16U(bgra_to_rgb), 0, 0, bgra_to_rgb_32f};
        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
        CV_Assert(src.channels() == 4);
        CV_Assert(funcs[src.depth()] != 0);
        dst.create(src.size(), CV_MAKETYPE(src.depth(), 3));
@ -143,10 +154,11 @@ namespace
    void bgra_to_rgba(const GpuMat& src, GpuMat& dst, int, Stream& stream)
    {
        using namespace cv::gpu::device;
-        static const gpu_func_t funcs[] = {bgra_to_rgba_8u, 0, bgra_to_rgba_16u, 0, 0, bgra_to_rgba_32f};
+        static const gpu_func_t funcs[] = {bgra_to_rgba_8u, 0, APPEND_16U(bgra_to_rgba), 0, 0, bgra_to_rgba_32f};
        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
        CV_Assert(src.channels() == 4);
        CV_Assert(funcs[src.depth()] != 0);
        dst.create(src.size(), CV_MAKETYPE(src.depth(), 4));
@ -316,10 +328,11 @@ namespace
    void gray_to_bgr(const GpuMat& src, GpuMat& dst, int, Stream& stream)
    {
        using namespace cv::gpu::device;
-        static const gpu_func_t funcs[] = {gray_to_bgr_8u, 0, gray_to_bgr_16u, 0, 0, gray_to_bgr_32f};
+        static const gpu_func_t funcs[] = {gray_to_bgr_8u, 0, APPEND_16U(gray_to_bgr), 0, 0, gray_to_bgr_32f};
        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
        CV_Assert(src.channels() == 1);
        CV_Assert(funcs[src.depth()] != 0);
        dst.create(src.size(), CV_MAKETYPE(src.depth(), 3));
@ -329,10 +342,11 @@ namespace
    void gray_to_bgra(const GpuMat& src, GpuMat& dst, int, Stream& stream)
    {
        using namespace cv::gpu::device;
-        static const gpu_func_t funcs[] = {gray_to_bgra_8u, 0, gray_to_bgra_16u, 0, 0, gray_to_bgra_32f};
+        static const gpu_func_t funcs[] = {gray_to_bgra_8u, 0, APPEND_16U(gray_to_bgra), 0, 0, gray_to_bgra_32f};
        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
        CV_Assert(src.channels() == 1);
        CV_Assert(funcs[src.depth()] != 0);
        dst.create(src.size(), CV_MAKETYPE(src.depth(), 4));
@ -382,10 +396,11 @@ namespace
    void rgb_to_gray(const GpuMat& src, GpuMat& dst, int, Stream& stream)
    {
        using namespace cv::gpu::device;
-        static const gpu_func_t funcs[] = {rgb_to_gray_8u, 0, rgb_to_gray_16u, 0, 0, rgb_to_gray_32f};
+        static const gpu_func_t funcs[] = {rgb_to_gray_8u, 0, APPEND_16U(rgb_to_gray), 0, 0, rgb_to_gray_32f};
        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
        CV_Assert(src.channels() == 3);
        CV_Assert(funcs[src.depth()] != 0);
        dst.create(src.size(), CV_MAKETYPE(src.depth(), 1));
@ -395,10 +410,11 @@ namespace
    void bgr_to_gray(const GpuMat& src, GpuMat& dst, int, Stream& stream)
    {
        using namespace cv::gpu::device;
-        static const gpu_func_t funcs[] = {bgr_to_gray_8u, 0, bgr_to_gray_16u, 0, 0, bgr_to_gray_32f};
+        static const gpu_func_t funcs[] = {bgr_to_gray_8u, 0, APPEND_16U(bgr_to_gray), 0, 0, bgr_to_gray_32f};
        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
        CV_Assert(src.channels() == 3);
        CV_Assert(funcs[src.depth()] != 0);
        dst.create(src.size(), CV_MAKETYPE(src.depth(), 1));
@ -408,10 +424,11 @@ namespace
    void rgba_to_gray(const GpuMat& src, GpuMat& dst, int, Stream& stream)
    {
        using namespace cv::gpu::device;
-        static const gpu_func_t funcs[] = {rgba_to_gray_8u, 0, rgba_to_gray_16u, 0, 0, rgba_to_gray_32f};
+        static const gpu_func_t funcs[] = {rgba_to_gray_8u, 0, APPEND_16U(rgba_to_gray), 0, 0, rgba_to_gray_32f};
        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
        CV_Assert(src.channels() == 4);
        CV_Assert(funcs[src.depth()] != 0);
        dst.create(src.size(), CV_MAKETYPE(src.depth(), 1));
@ -421,10 +438,11 @@ namespace
    void bgra_to_gray(const GpuMat& src, GpuMat& dst, int, Stream& stream)
    {
        using namespace cv::gpu::device;
-        static const gpu_func_t funcs[] = {bgra_to_gray_8u, 0, bgra_to_gray_16u, 0, 0, bgra_to_gray_32f};
+        static const gpu_func_t funcs[] = {bgra_to_gray_8u, 0, APPEND_16U(bgra_to_gray), 0, 0, bgra_to_gray_32f};
        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
        CV_Assert(src.channels() == 4);
        CV_Assert(funcs[src.depth()] != 0);
        dst.create(src.size(), CV_MAKETYPE(src.depth(), 1));
@ -437,12 +455,12 @@ namespace
        static const gpu_func_t funcs[2][2][6] =
        {
            {
-                {rgb_to_yuv_8u, 0, rgb_to_yuv_16u, 0, 0, rgb_to_yuv_32f},
+                {rgb_to_yuv_8u, 0, APPEND_16U(rgb_to_yuv), 0, 0, rgb_to_yuv_32f},
-                {rgba_to_yuv_8u, 0, rgba_to_yuv_16u, 0, 0, rgba_to_yuv_32f}
+                {rgba_to_yuv_8u, 0, APPEND_16U(rgba_to_yuv), 0, 0, rgba_to_yuv_32f}
            },
            {
-                {rgb_to_yuv4_8u, 0, rgb_to_yuv4_16u, 0, 0, rgb_to_yuv4_32f},
+                {rgb_to_yuv4_8u, 0, APPEND_16U(rgb_to_yuv4), 0, 0, rgb_to_yuv4_32f},
-                {rgba_to_yuv4_8u, 0, rgba_to_yuv4_16u, 0, 0, rgba_to_yuv4_32f}
+                {rgba_to_yuv4_8u, 0, APPEND_16U(rgba_to_yuv4), 0, 0, rgba_to_yuv4_32f}
            }
        };
@ -451,6 +469,7 @@ namespace
        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
        CV_Assert(src.channels() == 3 || src.channels() == 4);
        CV_Assert(dcn == 3 || dcn == 4);
        CV_Assert(funcs[dcn == 4][src.channels() == 4][src.depth()] != 0);
        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
@ -463,12 +482,12 @@ namespace
        static const gpu_func_t funcs[2][2][6] =
        {
            {
-                {bgr_to_yuv_8u, 0, bgr_to_yuv_16u, 0, 0, bgr_to_yuv_32f},
+                {bgr_to_yuv_8u, 0, APPEND_16U(bgr_to_yuv), 0, 0, bgr_to_yuv_32f},
-                {bgra_to_yuv_8u, 0, bgra_to_yuv_16u, 0, 0, bgra_to_yuv_32f}
+                {bgra_to_yuv_8u, 0, APPEND_16U(bgra_to_yuv), 0, 0, bgra_to_yuv_32f}
            },
            {
-                {bgr_to_yuv4_8u, 0, bgr_to_yuv4_16u, 0, 0, bgr_to_yuv4_32f},
+                {bgr_to_yuv4_8u, 0, APPEND_16U(bgr_to_yuv4), 0, 0, bgr_to_yuv4_32f},
-                {bgra_to_yuv4_8u, 0, bgra_to_yuv4_16u, 0, 0, bgra_to_yuv4_32f}
+                {bgra_to_yuv4_8u, 0, APPEND_16U(bgra_to_yuv4), 0, 0, bgra_to_yuv4_32f}
            }
        };
@ -477,6 +496,7 @@ namespace
        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
        CV_Assert(src.channels() == 3 || src.channels() == 4);
        CV_Assert(dcn == 3 || dcn == 4);
        CV_Assert(funcs[dcn == 4][src.channels() == 4][src.depth()] != 0);
        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
@ -489,12 +509,12 @@ namespace
        static const gpu_func_t funcs[2][2][6] =
        {
            {
-                {yuv_to_rgb_8u, 0, yuv_to_rgb_16u, 0, 0, yuv_to_rgb_32f},
+                {yuv_to_rgb_8u, 0, APPEND_16U(yuv_to_rgb), 0, 0, yuv_to_rgb_32f},
-                {yuv4_to_rgb_8u, 0, yuv4_to_rgb_16u, 0, 0, yuv4_to_rgb_32f}
+                {yuv4_to_rgb_8u, 0, APPEND_16U(yuv4_to_rgb), 0, 0, yuv4_to_rgb_32f}
            },
            {
-                {yuv_to_rgba_8u, 0, yuv_to_rgba_16u, 0, 0, yuv_to_rgba_32f},
+                {yuv_to_rgba_8u, 0, APPEND_16U(yuv_to_rgba), 0, 0, yuv_to_rgba_32f},
-                {yuv4_to_rgba_8u, 0, yuv4_to_rgba_16u, 0, 0, yuv4_to_rgba_32f}
+                {yuv4_to_rgba_8u, 0, APPEND_16U(yuv4_to_rgba), 0, 0, yuv4_to_rgba_32f}
            }
        };
@ -503,6 +523,7 @@ namespace
        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
        CV_Assert(src.channels() == 3 || src.channels() == 4);
        CV_Assert(dcn == 3 || dcn == 4);
        CV_Assert(funcs[dcn == 4][src.channels() == 4][src.depth()] != 0);
        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
@ -515,12 +536,12 @@ namespace
        static const gpu_func_t funcs[2][2][6] =
        {
            {
-                {yuv_to_bgr_8u, 0, yuv_to_bgr_16u, 0, 0, yuv_to_bgr_32f},
+                {yuv_to_bgr_8u, 0, APPEND_16U(yuv_to_bgr), 0, 0, yuv_to_bgr_32f},
-                {yuv4_to_bgr_8u, 0, yuv4_to_bgr_16u, 0, 0, yuv4_to_bgr_32f}
+                {yuv4_to_bgr_8u, 0, APPEND_16U(yuv4_to_bgr), 0, 0, yuv4_to_bgr_32f}
            },
            {
-                {yuv_to_bgra_8u, 0, yuv_to_bgra_16u, 0, 0, yuv_to_bgra_32f},
+                {yuv_to_bgra_8u, 0, APPEND_16U(yuv_to_bgra), 0, 0, yuv_to_bgra_32f},
-                {yuv4_to_bgra_8u, 0, yuv4_to_bgra_16u, 0, 0, yuv4_to_bgra_32f}
+                {yuv4_to_bgra_8u, 0, APPEND_16U(yuv4_to_bgra), 0, 0, yuv4_to_bgra_32f}
            }
        };
@ -529,6 +550,7 @@ namespace
        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
        CV_Assert(src.channels() == 3 || src.channels() == 4);
        CV_Assert(dcn == 3 || dcn == 4);
        CV_Assert(funcs[dcn == 4][src.channels() == 4][src.depth()] != 0);
        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
@ -541,12 +563,12 @@ namespace
        static const gpu_func_t funcs[2][2][6] =
        {
            {
-                {rgb_to_YCrCb_8u, 0, rgb_to_YCrCb_16u, 0, 0, rgb_to_YCrCb_32f},
+                {rgb_to_YCrCb_8u, 0, APPEND_16U(rgb_to_YCrCb), 0, 0, rgb_to_YCrCb_32f},
-                {rgba_to_YCrCb_8u, 0, rgba_to_YCrCb_16u, 0, 0, rgba_to_YCrCb_32f}
+                {rgba_to_YCrCb_8u, 0, APPEND_16U(rgba_to_YCrCb), 0, 0, rgba_to_YCrCb_32f}
            },
            {
-                {rgb_to_YCrCb4_8u, 0, rgb_to_YCrCb4_16u, 0, 0, rgb_to_YCrCb4_32f},
+                {rgb_to_YCrCb4_8u, 0, APPEND_16U(rgb_to_YCrCb4), 0, 0, rgb_to_YCrCb4_32f},
-                {rgba_to_YCrCb4_8u, 0, rgba_to_YCrCb4_16u, 0, 0, rgba_to_YCrCb4_32f}
+                {rgba_to_YCrCb4_8u, 0, APPEND_16U(rgba_to_YCrCb4), 0, 0, rgba_to_YCrCb4_32f}
            }
        };
@ -555,6 +577,7 @@ namespace
        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
        CV_Assert(src.channels() == 3 || src.channels() == 4);
        CV_Assert(dcn == 3 || dcn == 4);
        CV_Assert(funcs[dcn == 4][src.channels() == 4][src.depth()] != 0);
        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
@ -567,12 +590,12 @@ namespace
        static const gpu_func_t funcs[2][2][6] =
        {
            {
-                {bgr_to_YCrCb_8u, 0, bgr_to_YCrCb_16u, 0, 0, bgr_to_YCrCb_32f},
+                {bgr_to_YCrCb_8u, 0, APPEND_16U(bgr_to_YCrCb), 0, 0, bgr_to_YCrCb_32f},
-                {bgra_to_YCrCb_8u, 0, bgra_to_YCrCb_16u, 0, 0, bgra_to_YCrCb_32f}
+                {bgra_to_YCrCb_8u, 0, APPEND_16U(bgra_to_YCrCb), 0, 0, bgra_to_YCrCb_32f}
            },
            {
-                {bgr_to_YCrCb4_8u, 0, bgr_to_YCrCb4_16u, 0, 0, bgr_to_YCrCb4_32f},
+                {bgr_to_YCrCb4_8u, 0, APPEND_16U(bgr_to_YCrCb4), 0, 0, bgr_to_YCrCb4_32f},
-                {bgra_to_YCrCb4_8u, 0, bgra_to_YCrCb4_16u, 0, 0, bgra_to_YCrCb4_32f}
+                {bgra_to_YCrCb4_8u, 0, APPEND_16U(bgra_to_YCrCb4), 0, 0, bgra_to_YCrCb4_32f}
            }
        };
@ -581,6 +604,7 @@ namespace
        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
        CV_Assert(src.channels() == 3 || src.channels() == 4);
        CV_Assert(dcn == 3 || dcn == 4);
        CV_Assert(funcs[dcn == 4][src.channels() == 4][src.depth()] != 0);
        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
@ -593,12 +617,12 @@ namespace
        static const gpu_func_t funcs[2][2][6] =
        {
            {
-                {YCrCb_to_rgb_8u, 0, YCrCb_to_rgb_16u, 0, 0, YCrCb_to_rgb_32f},
+                {YCrCb_to_rgb_8u, 0, APPEND_16U(YCrCb_to_rgb), 0, 0, YCrCb_to_rgb_32f},
-                {YCrCb4_to_rgb_8u, 0, YCrCb4_to_rgb_16u, 0, 0, YCrCb4_to_rgb_32f}
+                {YCrCb4_to_rgb_8u, 0, APPEND_16U(YCrCb4_to_rgb), 0, 0, YCrCb4_to_rgb_32f}
            },
            {
-                {YCrCb_to_rgba_8u, 0, YCrCb_to_rgba_16u, 0, 0, YCrCb_to_rgba_32f},
+                {YCrCb_to_rgba_8u, 0, APPEND_16U(YCrCb_to_rgba), 0, 0, YCrCb_to_rgba_32f},
-                {YCrCb4_to_rgba_8u, 0, YCrCb4_to_rgba_16u, 0, 0, YCrCb4_to_rgba_32f}
+                {YCrCb4_to_rgba_8u, 0, APPEND_16U(YCrCb4_to_rgba), 0, 0, YCrCb4_to_rgba_32f}
            }
        };
@ -607,6 +631,7 @@ namespace
        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
        CV_Assert(src.channels() == 3 || src.channels() == 4);
        CV_Assert(dcn == 3 || dcn == 4);
        CV_Assert(funcs[dcn == 4][src.channels() == 4][src.depth()] != 0);
        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
@ -619,12 +644,12 @@ namespace
        static const gpu_func_t funcs[2][2][6] =
        {
            {
-                {YCrCb_to_bgr_8u, 0, YCrCb_to_bgr_16u, 0, 0, YCrCb_to_bgr_32f},
+                {YCrCb_to_bgr_8u, 0, APPEND_16U(YCrCb_to_bgr), 0, 0, YCrCb_to_bgr_32f},
-                {YCrCb4_to_bgr_8u, 0, YCrCb4_to_bgr_16u, 0, 0, YCrCb4_to_bgr_32f}
+                {YCrCb4_to_bgr_8u, 0, APPEND_16U(YCrCb4_to_bgr), 0, 0, YCrCb4_to_bgr_32f}
            },
            {
-                {YCrCb_to_bgra_8u, 0, YCrCb_to_bgra_16u, 0, 0, YCrCb_to_bgra_32f},
+                {YCrCb_to_bgra_8u, 0, APPEND_16U(YCrCb_to_bgra), 0, 0, YCrCb_to_bgra_32f},
-                {YCrCb4_to_bgra_8u, 0, YCrCb4_to_bgra_16u, 0, 0, YCrCb4_to_bgra_32f}
+                {YCrCb4_to_bgra_8u, 0, APPEND_16U(YCrCb4_to_bgra), 0, 0, YCrCb4_to_bgra_32f}
            }
        };
@ -633,6 +658,7 @@ namespace
        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
        CV_Assert(src.channels() == 3 || src.channels() == 4);
        CV_Assert(dcn == 3 || dcn == 4);
        CV_Assert(funcs[dcn == 4][src.channels() == 4][src.depth()] != 0);
        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
@ -645,12 +671,12 @@ namespace
        static const gpu_func_t funcs[2][2][6] =
        {
            {
-                {rgb_to_xyz_8u, 0, rgb_to_xyz_16u, 0, 0, rgb_to_xyz_32f},
+                {rgb_to_xyz_8u, 0, APPEND_16U(rgb_to_xyz), 0, 0, rgb_to_xyz_32f},
-                {rgba_to_xyz_8u, 0, rgba_to_xyz_16u, 0, 0, rgba_to_xyz_32f}
+                {rgba_to_xyz_8u, 0, APPEND_16U(rgba_to_xyz), 0, 0, rgba_to_xyz_32f}
            },
            {
-                {rgb_to_xyz4_8u, 0, rgb_to_xyz4_16u, 0, 0, rgb_to_xyz4_32f},
+                {rgb_to_xyz4_8u, 0, APPEND_16U(rgb_to_xyz4), 0, 0, rgb_to_xyz4_32f},
-                {rgba_to_xyz4_8u, 0, rgba_to_xyz4_16u, 0, 0, rgba_to_xyz4_32f}
+                {rgba_to_xyz4_8u, 0, APPEND_16U(rgba_to_xyz4), 0, 0, rgba_to_xyz4_32f}
            }
        };
@ -659,6 +685,7 @@ namespace
        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
        CV_Assert(src.channels() == 3 || src.channels() == 4);
        CV_Assert(dcn == 3 || dcn == 4);
        CV_Assert(funcs[dcn == 4][src.channels() == 4][src.depth()] != 0);
        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
@ -671,12 +698,12 @@ namespace
        static const gpu_func_t funcs[2][2][6] =
        {
            {
-                {bgr_to_xyz_8u, 0, bgr_to_xyz_16u, 0, 0, bgr_to_xyz_32f},
+                {bgr_to_xyz_8u, 0, APPEND_16U(bgr_to_xyz), 0, 0, bgr_to_xyz_32f},
-                {bgra_to_xyz_8u, 0, bgra_to_xyz_16u, 0, 0, bgra_to_xyz_32f}
+                {bgra_to_xyz_8u, 0, APPEND_16U(bgra_to_xyz), 0, 0, bgra_to_xyz_32f}
            },
            {
-                {bgr_to_xyz4_8u, 0, bgr_to_xyz4_16u, 0, 0, bgr_to_xyz4_32f},
+                {bgr_to_xyz4_8u, 0, APPEND_16U(bgr_to_xyz4), 0, 0, bgr_to_xyz4_32f},
-                {bgra_to_xyz4_8u, 0, bgra_to_xyz4_16u, 0, 0, bgra_to_xyz4_32f}
+                {bgra_to_xyz4_8u, 0, APPEND_16U(bgra_to_xyz4), 0, 0, bgra_to_xyz4_32f}
            }
        };
@ -685,6 +712,7 @@ namespace
        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
        CV_Assert(src.channels() == 3 || src.channels() == 4);
        CV_Assert(dcn == 3 || dcn == 4);
        CV_Assert(funcs[dcn == 4][src.channels() == 4][src.depth()] != 0);
        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
@ -697,12 +725,12 @@ namespace
        static const gpu_func_t funcs[2][2][6] =
        {
            {
-                {xyz_to_rgb_8u, 0, xyz_to_rgb_16u, 0, 0, xyz_to_rgb_32f},
+                {xyz_to_rgb_8u, 0, APPEND_16U(xyz_to_rgb), 0, 0, xyz_to_rgb_32f},
-                {xyz4_to_rgb_8u, 0, xyz4_to_rgb_16u, 0, 0, xyz4_to_rgb_32f}
+                {xyz4_to_rgb_8u, 0, APPEND_16U(xyz4_to_rgb), 0, 0, xyz4_to_rgb_32f}
            },
            {
-                {xyz_to_rgba_8u, 0, xyz_to_rgba_16u, 0, 0, xyz_to_rgba_32f},
+                {xyz_to_rgba_8u, 0, APPEND_16U(xyz_to_rgba), 0, 0, xyz_to_rgba_32f},
-                {xyz4_to_rgba_8u, 0, xyz4_to_rgba_16u, 0, 0, xyz4_to_rgba_32f}
+                {xyz4_to_rgba_8u, 0, APPEND_16U(xyz4_to_rgba), 0, 0, xyz4_to_rgba_32f}
            }
        };
@ -711,6 +739,7 @@ namespace
        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
        CV_Assert(src.channels() == 3 || src.channels() == 4);
        CV_Assert(dcn == 3 || dcn == 4);
        CV_Assert(funcs[dcn == 4][src.channels() == 4][src.depth()] != 0);
        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
@ -723,12 +752,12 @@ namespace
        static const gpu_func_t funcs[2][2][6] =
        {
            {
-                {xyz_to_bgr_8u, 0, xyz_to_bgr_16u, 0, 0, xyz_to_bgr_32f},
+                {xyz_to_bgr_8u, 0, APPEND_16U(xyz_to_bgr), 0, 0, xyz_to_bgr_32f},
-                {xyz4_to_bgr_8u, 0, xyz4_to_bgr_16u, 0, 0, xyz4_to_bgr_32f}
+                {xyz4_to_bgr_8u, 0, APPEND_16U(xyz4_to_bgr), 0, 0, xyz4_to_bgr_32f}
            },
            {
-                {xyz_to_bgra_8u, 0, xyz_to_bgra_16u, 0, 0, xyz_to_bgra_32f},
+                {xyz_to_bgra_8u, 0, APPEND_16U(xyz_to_bgra), 0, 0, xyz_to_bgra_32f},
-                {xyz4_to_bgra_8u, 0, xyz4_to_bgra_16u, 0, 0, xyz4_to_bgra_32f}
+                {xyz4_to_bgra_8u, 0, APPEND_16U(xyz4_to_bgra), 0, 0, xyz4_to_bgra_32f}
            }
        };
@ -737,6 +766,7 @@ namespace
        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
        CV_Assert(src.channels() == 3 || src.channels() == 4);
        CV_Assert(dcn == 3 || dcn == 4);
        CV_Assert(funcs[dcn == 4][src.channels() == 4][src.depth()] != 0);
        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
--- a/modules/gpu/src/cuda/bf_knnmatch.cu
+++ b/modules/gpu/src/cuda/bf_knnmatch.cu
@ -1168,12 +1168,14 @@ namespace cv { namespace gpu { namespace device
                matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, stream);
        }
 #ifndef OPENCV_TINY_GPU_MODULE
        template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
        //template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
        template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
        template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
        template void matchL1_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
        template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
 #endif
        template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
            const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
@ -1185,11 +1187,13 @@ namespace cv { namespace gpu { namespace device
                matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, stream);
        }
 #ifndef OPENCV_TINY_GPU_MODULE
        //template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
        //template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
        //template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
        //template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
        //template void matchL2_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
 #endif
        template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
        template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
@ -1203,10 +1207,12 @@ namespace cv { namespace gpu { namespace device
        }
        template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
        //template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
        template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
        //template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
        template void matchHamming_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
 #endif
        template <typename T> void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
            const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
@ -1218,12 +1224,14 @@ namespace cv { namespace gpu { namespace device
                match2Dispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance,  stream);
        }
 #ifndef OPENCV_TINY_GPU_MODULE
        template void match2L1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
        //template void match2L1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
        template void match2L1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
        template void match2L1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
        template void match2L1_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
        template void match2L1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
 #endif
        template <typename T> void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
            const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
@ -1235,12 +1243,14 @@ namespace cv { namespace gpu { namespace device
                match2Dispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, stream);
        }
 #ifndef OPENCV_TINY_GPU_MODULE
        //template void match2L2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
        //template void match2L2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
        //template void match2L2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
        //template void match2L2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
        //template void match2L2_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
        template void match2L2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
 #endif
        template <typename T> void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
            const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
@ -1252,11 +1262,13 @@ namespace cv { namespace gpu { namespace device
                match2Dispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, stream);
        }
 #ifndef OPENCV_TINY_GPU_MODULE
        template void match2Hamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
        //template void match2Hamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
        template void match2Hamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
        //template void match2Hamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
        template void match2Hamming_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
 #endif
    } // namespace bf_knnmatch
 }}} // namespace cv { namespace gpu { namespace device {
--- a/modules/gpu/src/cuda/bf_match.cu
+++ b/modules/gpu/src/cuda/bf_match.cu
@ -644,12 +644,14 @@ namespace cv { namespace gpu { namespace device
            }
        }
 #ifndef OPENCV_TINY_GPU_MODULE
        template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchL1_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
 #endif
        template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
                                               const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
@ -669,11 +671,13 @@ namespace cv { namespace gpu { namespace device
            }
        }
 #ifndef OPENCV_TINY_GPU_MODULE
        //template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchL2_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
 #endif
        template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
@ -695,10 +699,12 @@ namespace cv { namespace gpu { namespace device
        }
        template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
        //template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchHamming_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
 #endif
        template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
                                               const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
@ -718,12 +724,14 @@ namespace cv { namespace gpu { namespace device
            }
        }
 #ifndef OPENCV_TINY_GPU_MODULE
        template void matchL1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchL1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchL1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchL1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchL1_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchL1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
 #endif
        template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
                                               const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
@ -743,12 +751,14 @@ namespace cv { namespace gpu { namespace device
            }
        }
 #ifndef OPENCV_TINY_GPU_MODULE
        //template void matchL2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchL2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchL2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchL2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchL2_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchL2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& maskCollection, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
 #endif
        template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
                                                    const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
@ -768,11 +778,13 @@ namespace cv { namespace gpu { namespace device
            }
        }
 #ifndef OPENCV_TINY_GPU_MODULE
        template void matchHamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchHamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchHamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchHamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchHamming_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
 #endif
    } // namespace bf_match
 }}} // namespace cv { namespace gpu { namespace device {
--- a/modules/gpu/src/cuda/bf_radius_match.cu
+++ b/modules/gpu/src/cuda/bf_radius_match.cu
@ -356,12 +356,14 @@ namespace cv { namespace gpu { namespace device
            }
        }
 #ifndef OPENCV_TINY_GPU_MODULE
        template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
        //template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
        template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
        template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
        template void matchL1_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
        template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
 #endif
        template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
            const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
@ -381,11 +383,13 @@ namespace cv { namespace gpu { namespace device
            }
        }
 #ifndef OPENCV_TINY_GPU_MODULE
        //template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
        //template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
        //template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
        //template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
        //template void matchL2_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
 #endif
        template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
        template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
@ -407,10 +411,12 @@ namespace cv { namespace gpu { namespace device
        }
        template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
        //template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
        template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
        //template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
        template void matchHamming_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
 #endif
        template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
            const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
@ -421,12 +427,14 @@ namespace cv { namespace gpu { namespace device
                stream);
        }
 #ifndef OPENCV_TINY_GPU_MODULE
        template void matchL1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
        //template void matchL1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
        template void matchL1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
        template void matchL1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
        template void matchL1_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
        template void matchL1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
 #endif
        template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
            const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
@ -437,12 +445,14 @@ namespace cv { namespace gpu { namespace device
                stream);
        }
 #ifndef OPENCV_TINY_GPU_MODULE
        //template void matchL2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
        //template void matchL2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
        //template void matchL2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
        //template void matchL2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
        //template void matchL2_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
        template void matchL2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
 #endif
        template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
            const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
@ -453,11 +463,13 @@ namespace cv { namespace gpu { namespace device
                stream);
        }
 #ifndef OPENCV_TINY_GPU_MODULE
        template void matchHamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
        //template void matchHamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
        template void matchHamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
        //template void matchHamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
        template void matchHamming_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
 #endif
    } // namespace bf_radius_match
 }}} // namespace cv { namespace gpu { namespace device
--- a/modules/gpu/src/cuda/bilateral_filter.cu
+++ b/modules/gpu/src/cuda/bilateral_filter.cu
@ -149,6 +149,16 @@ namespace cv { namespace gpu { namespace device
        {
            typedef void (*caller_t)(const PtrStepSzb& src, PtrStepSzb dst, int kernel_size, float sigma_spatial, float sigma_color, cudaStream_t stream);
 #ifdef OPENCV_TINY_GPU_MODULE
            static caller_t funcs[] =
            {
                bilateral_caller<T, BrdReflect101>,
                bilateral_caller<T, BrdReplicate>,
                0,
                0,
                0,
            };
 #else
            static caller_t funcs[] =
            {
                bilateral_caller<T, BrdReflect101>,
@ -157,7 +167,13 @@ namespace cv { namespace gpu { namespace device
                bilateral_caller<T, BrdReflect>,
                bilateral_caller<T, BrdWrap>,
            };
-            funcs[borderMode](src, dst, kernel_size, gauss_spatial_coeff, gauss_color_coeff, stream);
+#endif
            const caller_t caller = funcs[borderMode];
            if (!caller)
                cv::gpu::error("Unsupported input parameters for bilateral_filter", __FILE__, __LINE__, "");
            caller(src, dst, kernel_size, gauss_spatial_coeff, gauss_color_coeff, stream);
        }
    }
 }}}
@ -171,6 +187,7 @@ OCV_INSTANTIATE_BILATERAL_FILTER(uchar)
 OCV_INSTANTIATE_BILATERAL_FILTER(uchar3)
 OCV_INSTANTIATE_BILATERAL_FILTER(uchar4)
 #ifndef OPENCV_TINY_GPU_MODULE
 //OCV_INSTANTIATE_BILATERAL_FILTER(schar)
 //OCV_INSTANTIATE_BILATERAL_FILTER(schar2)
 //OCV_INSTANTIATE_BILATERAL_FILTER(schar3)
@ -190,6 +207,7 @@ OCV_INSTANTIATE_BILATERAL_FILTER(ushort4)
 //OCV_INSTANTIATE_BILATERAL_FILTER(int2)
 //OCV_INSTANTIATE_BILATERAL_FILTER(int3)
 //OCV_INSTANTIATE_BILATERAL_FILTER(int4)
 #endif
 OCV_INSTANTIATE_BILATERAL_FILTER(float)
 //OCV_INSTANTIATE_BILATERAL_FILTER(float2)
--- a/modules/gpu/src/cuda/color.cu
+++ b/modules/gpu/src/cuda/color.cu
@ -235,10 +235,16 @@ namespace cv { namespace gpu { namespace device
 #define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(name) \
    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name, name ## _traits)
-#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(name) \
+#ifdef OPENCV_TINY_GPU_MODULE
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \
+    #define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(name) \
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _16u, name ## _traits<ushort>) \
+        OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>)
+        OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>)
 #else
    #define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(name) \
        OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \
        OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _16u, name ## _traits<ushort>) \
        OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>)
 #endif
 #define OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(name) \
    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \
--- a/modules/gpu/src/cuda/column_filter.10.cu
+++ b/modules/gpu/src/cuda/column_filter.10.cu
@ -44,9 +44,13 @@
 #include "column_filter.h"
 #ifndef OPENCV_TINY_GPU_MODULE
 namespace filter
 {
    template void linearColumn<float, unsigned short>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
 }
 #endif
 #endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/column_filter.11.cu
+++ b/modules/gpu/src/cuda/column_filter.11.cu
@ -44,9 +44,13 @@
 #include "column_filter.h"
 #ifndef OPENCV_TINY_GPU_MODULE
 namespace filter
 {
    template void linearColumn<float3, ushort3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
 }
 #endif
 #endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/column_filter.12.cu
+++ b/modules/gpu/src/cuda/column_filter.12.cu
@ -44,9 +44,13 @@
 #include "column_filter.h"
 #ifndef OPENCV_TINY_GPU_MODULE
 namespace filter
 {
    template void linearColumn<float4, ushort4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
 }
 #endif
 #endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/column_filter.13.cu
+++ b/modules/gpu/src/cuda/column_filter.13.cu
@ -44,9 +44,13 @@
 #include "column_filter.h"
 #ifndef OPENCV_TINY_GPU_MODULE
 namespace filter
 {
    template void linearColumn<float3, int3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
 }
 #endif
 #endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/column_filter.14.cu
+++ b/modules/gpu/src/cuda/column_filter.14.cu
@ -44,9 +44,13 @@
 #include "column_filter.h"
 #ifndef OPENCV_TINY_GPU_MODULE
 namespace filter
 {
    template void linearColumn<float4, int4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
 }
 #endif
 #endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/column_filter.3.cu
+++ b/modules/gpu/src/cuda/column_filter.3.cu
@ -44,9 +44,13 @@
 #include "column_filter.h"
 #ifndef OPENCV_TINY_GPU_MODULE
 namespace filter
 {
    template void linearColumn<float3, short3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
 }
 #endif
 #endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/column_filter.4.cu
+++ b/modules/gpu/src/cuda/column_filter.4.cu
@ -44,9 +44,13 @@
 #include "column_filter.h"
 #ifndef OPENCV_TINY_GPU_MODULE
 namespace filter
 {
    template void linearColumn<float, int>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
 }
 #endif
 #endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/column_filter.8.cu
+++ b/modules/gpu/src/cuda/column_filter.8.cu
@ -44,9 +44,13 @@
 #include "column_filter.h"
 #ifndef OPENCV_TINY_GPU_MODULE
 namespace filter
 {
    template void linearColumn<float, short>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
 }
 #endif
 #endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/column_filter.9.cu
+++ b/modules/gpu/src/cuda/column_filter.9.cu
@ -44,9 +44,13 @@
 #include "column_filter.h"
 #ifndef OPENCV_TINY_GPU_MODULE
 namespace filter
 {
    template void linearColumn<float4, short4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
 }
 #endif
 #endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/column_filter.h
+++ b/modules/gpu/src/cuda/column_filter.h
@ -183,6 +183,186 @@ namespace filter
    {
        typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream);
 #ifdef OPENCV_TINY_GPU_MODULE
        static const caller_t callers[5][33] =
        {
            {
                0,
                0,
                0,
                column_filter::caller< 3, T, D, BrdColReflect101>,
                0,
                column_filter::caller< 5, T, D, BrdColReflect101>,
                0,
                column_filter::caller< 7, T, D, BrdColReflect101>,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
            },
            {
                0,
                0,
                0,
                column_filter::caller< 3, T, D, BrdColReplicate>,
                0,
                column_filter::caller< 5, T, D, BrdColReplicate>,
                0,
                column_filter::caller< 7, T, D, BrdColReplicate>,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
            },
            {
                0,
                0,
                0,
                column_filter::caller< 3, T, D, BrdColConstant>,
                0,
                column_filter::caller< 5, T, D, BrdColConstant>,
                0,
                column_filter::caller< 7, T, D, BrdColConstant>,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
            },
            {
                0,
                0,
                0,
                column_filter::caller< 3, T, D, BrdColReflect>,
                0,
                column_filter::caller< 5, T, D, BrdColReflect>,
                0,
                column_filter::caller< 7, T, D, BrdColReflect>,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
            },
            {
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0
            }
        };
 #else
        static const caller_t callers[5][33] =
        {
            {
@ -361,12 +541,17 @@ namespace filter
                column_filter::caller<32, T, D, BrdColWrap>
            }
        };
 #endif
        const caller_t caller = callers[brd_type][ksize];
        if (!caller)
            cv::gpu::error("Unsupported input parameters for column_filter", __FILE__, __LINE__, "");
        if (stream == 0)
            cudaSafeCall( cudaMemcpyToSymbol(column_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) );
        else
            cudaSafeCall( cudaMemcpyToSymbolAsync(column_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) );
-        callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream);
+        caller((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream);
    }
 }
--- a/modules/gpu/src/cuda/copy_make_border.cu
+++ b/modules/gpu/src/cuda/copy_make_border.cu
@ -90,10 +90,18 @@ namespace cv { namespace gpu { namespace device
                CopyMakeBorderDispatcher<BrdReplicate, vec_type>::call,
                CopyMakeBorderDispatcher<BrdConstant, vec_type>::call,
                CopyMakeBorderDispatcher<BrdReflect, vec_type>::call,
    #ifdef OPENCV_TINY_GPU_MODULE
                0,
    #else
                CopyMakeBorderDispatcher<BrdWrap, vec_type>::call
    #endif
            };
-            callers[borderMode](PtrStepSz<vec_type>(src), PtrStepSz<vec_type>(dst), top, left, borderValue, stream);
+            const caller_t caller = callers[borderMode];
            if (!caller)
                cv::gpu::error("Unsupported input parameters for copyMakeBorder", __FILE__, __LINE__, "");
            caller(PtrStepSz<vec_type>(src), PtrStepSz<vec_type>(dst), top, left, borderValue, stream);
        }
        template void copyMakeBorder_gpu<uchar, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
@ -101,6 +109,7 @@ namespace cv { namespace gpu { namespace device
        template void copyMakeBorder_gpu<uchar, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
        template void copyMakeBorder_gpu<uchar, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
        //template void copyMakeBorder_gpu<schar, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
        //template void copyMakeBorder_gpu<schar, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
        //template void copyMakeBorder_gpu<schar, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
@ -120,6 +129,7 @@ namespace cv { namespace gpu { namespace device
        //template void copyMakeBorder_gpu<int, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
        //template void copyMakeBorder_gpu<int, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
        //template void copyMakeBorder_gpu<int, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
 #endif
        template void copyMakeBorder_gpu<float, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
        //template void copyMakeBorder_gpu<float, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
--- a/modules/gpu/src/cuda/element_operations.cu
+++ b/modules/gpu/src/cuda/element_operations.cu
@ -234,6 +234,7 @@ namespace arithm
    }
    template void addMat<uchar, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void addMat<uchar, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    template void addMat<uchar, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    template void addMat<uchar, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
@ -278,7 +279,9 @@ namespace arithm
    //template void addMat<float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    //template void addMat<float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    //template void addMat<float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
 #endif
    template void addMat<float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void addMat<float, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    //template void addMat<double, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
@ -288,6 +291,7 @@ namespace arithm
    //template void addMat<double, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    //template void addMat<double, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    template void addMat<double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
 #endif
 }
 //////////////////////////////////////////////////////////////////////////
@ -329,6 +333,7 @@ namespace arithm
    }
    template void addScalar<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void addScalar<uchar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    template void addScalar<uchar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    template void addScalar<uchar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
@ -373,7 +378,9 @@ namespace arithm
    //template void addScalar<float, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    //template void addScalar<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    //template void addScalar<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
 #endif
    template void addScalar<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void addScalar<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    //template void addScalar<double, double, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
@ -383,6 +390,7 @@ namespace arithm
    //template void addScalar<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    //template void addScalar<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    template void addScalar<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
 #endif
 }
 //////////////////////////////////////////////////////////////////////////
@ -469,6 +477,7 @@ namespace arithm
    }
    template void subMat<uchar, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void subMat<uchar, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    template void subMat<uchar, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    template void subMat<uchar, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
@ -513,7 +522,9 @@ namespace arithm
    //template void subMat<float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    //template void subMat<float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    //template void subMat<float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
 #endif
    template void subMat<float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void subMat<float, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    //template void subMat<double, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
@ -523,6 +534,7 @@ namespace arithm
    //template void subMat<double, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    //template void subMat<double, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    template void subMat<double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
 #endif
 }
 //////////////////////////////////////////////////////////////////////////
@ -542,6 +554,7 @@ namespace arithm
    }
    template void subScalar<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void subScalar<uchar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    template void subScalar<uchar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    template void subScalar<uchar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
@ -586,7 +599,9 @@ namespace arithm
    //template void subScalar<float, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    //template void subScalar<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    //template void subScalar<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
 #endif
    template void subScalar<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void subScalar<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    //template void subScalar<double, double, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
@ -596,6 +611,7 @@ namespace arithm
    //template void subScalar<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    //template void subScalar<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
    template void subScalar<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
 #endif
 }
 //////////////////////////////////////////////////////////////////////////
@ -700,6 +716,7 @@ namespace arithm
    }
    template void mulMat<uchar, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void mulMat<uchar, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    template void mulMat<uchar, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    template void mulMat<uchar, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
@ -744,7 +761,9 @@ namespace arithm
    //template void mulMat<float, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    //template void mulMat<float, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    //template void mulMat<float, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
 #endif
    template void mulMat<float, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void mulMat<float, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    //template void mulMat<double, double, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
@ -754,6 +773,7 @@ namespace arithm
    //template void mulMat<double, double, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    //template void mulMat<double, double, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    template void mulMat<double, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
 #endif
 }
 //////////////////////////////////////////////////////////////////////////
@ -791,6 +811,7 @@ namespace arithm
    }
    template void mulScalar<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void mulScalar<uchar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    template void mulScalar<uchar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    template void mulScalar<uchar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
@ -835,7 +856,9 @@ namespace arithm
    //template void mulScalar<float, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    //template void mulScalar<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    //template void mulScalar<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
 #endif
    template void mulScalar<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void mulScalar<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    //template void mulScalar<double, double, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
@ -845,6 +868,7 @@ namespace arithm
    //template void mulScalar<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    //template void mulScalar<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    template void mulScalar<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
 #endif
 }
 //////////////////////////////////////////////////////////////////////////
@ -968,6 +992,7 @@ namespace arithm
    }
    template void divMat<uchar, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void divMat<uchar, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    template void divMat<uchar, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    template void divMat<uchar, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
@ -1012,7 +1037,9 @@ namespace arithm
    //template void divMat<float, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    //template void divMat<float, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    //template void divMat<float, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
 #endif
    template void divMat<float, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void divMat<float, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    //template void divMat<double, double, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
@ -1022,6 +1049,7 @@ namespace arithm
    //template void divMat<double, double, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    //template void divMat<double, double, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
    template void divMat<double, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
 #endif
 }
 //////////////////////////////////////////////////////////////////////////
@ -1037,6 +1065,7 @@ namespace arithm
    }
    template void divScalar<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void divScalar<uchar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    template void divScalar<uchar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    template void divScalar<uchar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
@ -1081,7 +1110,9 @@ namespace arithm
    //template void divScalar<float, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    //template void divScalar<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    //template void divScalar<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
 #endif
    template void divScalar<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void divScalar<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    //template void divScalar<double, double, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
@ -1091,6 +1122,7 @@ namespace arithm
    //template void divScalar<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    //template void divScalar<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    template void divScalar<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
 #endif
 }
 //////////////////////////////////////////////////////////////////////////
@ -1128,6 +1160,7 @@ namespace arithm
    }
    template void divInv<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void divInv<uchar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    template void divInv<uchar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    template void divInv<uchar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
@ -1172,7 +1205,9 @@ namespace arithm
    //template void divInv<float, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    //template void divInv<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    //template void divInv<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
 #endif
    template void divInv<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void divInv<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    //template void divInv<double, double, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
@ -1182,6 +1217,7 @@ namespace arithm
    //template void divInv<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    //template void divInv<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
    template void divInv<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
 #endif
 }
 //////////////////////////////////////////////////////////////////////////
@ -1278,12 +1314,16 @@ namespace arithm
    }
    template void absDiffMat<uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void absDiffMat<schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    template void absDiffMat<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    template void absDiffMat<short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    template void absDiffMat<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
 #endif
    template void absDiffMat<float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void absDiffMat<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
 #endif
 }
 //////////////////////////////////////////////////////////////////////////
@ -1323,12 +1363,16 @@ namespace arithm
    }
    template void absDiffScalar<uchar, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void absDiffScalar<schar, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
    template void absDiffScalar<ushort, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
    template void absDiffScalar<short, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
    template void absDiffScalar<int, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
 #endif
    template void absDiffScalar<float, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void absDiffScalar<double, double>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
 #endif
 }
 //////////////////////////////////////////////////////////////////////////
@ -1349,13 +1393,17 @@ namespace arithm
        transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, abs_func<T>(), WithOutMask(), stream);
    }
 #ifndef OPENCV_TINY_GPU_MODULE
    template void absMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    template void absMat<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    template void absMat<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    template void absMat<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    template void absMat<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
 #endif
    template void absMat<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void absMat<double>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
 #endif
 }
 //////////////////////////////////////////////////////////////////////////
@ -1390,13 +1438,17 @@ namespace arithm
        transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, Sqr<T>(), WithOutMask(), stream);
    }
 #ifndef OPENCV_TINY_GPU_MODULE
    template void sqrMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    template void sqrMat<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    template void sqrMat<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    template void sqrMat<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    template void sqrMat<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
 #endif
    template void sqrMat<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void sqrMat<double>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
 #endif
 }
 //////////////////////////////////////////////////////////////////////////
@ -1417,13 +1469,17 @@ namespace arithm
        transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, sqrt_func<T>(), WithOutMask(), stream);
    }
 #ifndef OPENCV_TINY_GPU_MODULE
    template void sqrtMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    template void sqrtMat<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    template void sqrtMat<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    template void sqrtMat<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    template void sqrtMat<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
 #endif
    template void sqrtMat<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void sqrtMat<double>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
 #endif
 }
 //////////////////////////////////////////////////////////////////////////
@ -1444,13 +1500,17 @@ namespace arithm
        transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, log_func<T>(), WithOutMask(), stream);
    }
 #ifndef OPENCV_TINY_GPU_MODULE
    template void logMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    template void logMat<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    template void logMat<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    template void logMat<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    template void logMat<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
 #endif
    template void logMat<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void logMat<double>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
 #endif
 }
 //////////////////////////////////////////////////////////////////////////
@ -1486,13 +1546,17 @@ namespace arithm
        transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, Exp<T>(), WithOutMask(), stream);
    }
 #ifndef OPENCV_TINY_GPU_MODULE
    template void expMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    template void expMat<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    template void expMat<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    template void expMat<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    template void expMat<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
 #endif
    template void expMat<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void expMat<double>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
 #endif
 }
 //////////////////////////////////////////////////////////////////////////////////////
@ -1620,36 +1684,52 @@ namespace arithm
    }
    template void cmpMatEq<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void cmpMatEq<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    template void cmpMatEq<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    template void cmpMatEq<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    template void cmpMatEq<int   >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
 #endif
    template void cmpMatEq<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void cmpMatEq<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
 #endif
    template void cmpMatNe<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void cmpMatNe<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    template void cmpMatNe<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    template void cmpMatNe<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    template void cmpMatNe<int   >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
 #endif
    template void cmpMatNe<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void cmpMatNe<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
 #endif
    template void cmpMatLt<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void cmpMatLt<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    template void cmpMatLt<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    template void cmpMatLt<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    template void cmpMatLt<int   >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
 #endif
    template void cmpMatLt<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void cmpMatLt<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
 #endif
    template void cmpMatLe<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void cmpMatLe<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    template void cmpMatLe<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    template void cmpMatLe<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    template void cmpMatLe<int   >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
 #endif
    template void cmpMatLe<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void cmpMatLe<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
 #endif
 }
 //////////////////////////////////////////////////////////////////////////////////////
@ -1824,52 +1904,76 @@ namespace arithm
    }
    template void cmpScalarEq<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void cmpScalarEq<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
    template void cmpScalarEq<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
    template void cmpScalarEq<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
    template void cmpScalarEq<int   >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
 #endif
    template void cmpScalarEq<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void cmpScalarEq<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
 #endif
    template void cmpScalarNe<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void cmpScalarNe<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
    template void cmpScalarNe<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
    template void cmpScalarNe<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
    template void cmpScalarNe<int   >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
 #endif
    template void cmpScalarNe<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void cmpScalarNe<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
 #endif
    template void cmpScalarLt<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void cmpScalarLt<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
    template void cmpScalarLt<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
    template void cmpScalarLt<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
    template void cmpScalarLt<int   >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
 #endif
    template void cmpScalarLt<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void cmpScalarLt<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
 #endif
    template void cmpScalarLe<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void cmpScalarLe<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
    template void cmpScalarLe<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
    template void cmpScalarLe<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
    template void cmpScalarLe<int   >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
 #endif
    template void cmpScalarLe<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void cmpScalarLe<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
 #endif
    template void cmpScalarGt<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void cmpScalarGt<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
    template void cmpScalarGt<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
    template void cmpScalarGt<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
    template void cmpScalarGt<int   >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
 #endif
    template void cmpScalarGt<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void cmpScalarGt<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
 #endif
    template void cmpScalarGe<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void cmpScalarGe<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
    template void cmpScalarGe<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
    template void cmpScalarGe<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
    template void cmpScalarGe<int   >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
 #endif
    template void cmpScalarGe<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void cmpScalarGe<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
 #endif
 }
 //////////////////////////////////////////////////////////////////////////////////////
@ -1981,19 +2085,25 @@ namespace arithm
    }
    template void bitScalarAnd<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void bitScalarAnd<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
    template void bitScalarAnd<int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
    template void bitScalarAnd<unsigned int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
 #endif
    template void bitScalarOr<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void bitScalarOr<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
    template void bitScalarOr<int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
    template void bitScalarOr<unsigned int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
 #endif
    template void bitScalarXor<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void bitScalarXor<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
    template void bitScalarXor<int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
    template void bitScalarXor<unsigned int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
 #endif
 }
 //////////////////////////////////////////////////////////////////////////
@ -2067,18 +2177,27 @@ namespace arithm
    }
    template void minMat<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void minMat<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    template void minMat<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    template void minMat<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    template void minMat<int   >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
 #endif
    template void minMat<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void minMat<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
 #endif
    template <typename T> void minScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream)
    {
        transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::gpu::device::bind2nd(minimum<T>(), src2), WithOutMask(), stream);
    }
 #ifdef OPENCV_TINY_GPU_MODULE
    template void minScalar<uchar >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
    template void minScalar<int   >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
    template void minScalar<float >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
 #else
    template void minScalar<uchar >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
    template void minScalar<schar >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
    template void minScalar<ushort>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
@ -2086,6 +2205,7 @@ namespace arithm
    template void minScalar<int   >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
    template void minScalar<float >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
    template void minScalar<double>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
 #endif
 }
 //////////////////////////////////////////////////////////////////////////
@ -2159,12 +2279,16 @@ namespace arithm
    }
    template void maxMat<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void maxMat<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    template void maxMat<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    template void maxMat<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    template void maxMat<int   >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
 #endif
    template void maxMat<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void maxMat<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
 #endif
    template <typename T> void maxScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream)
    {
@ -2172,12 +2296,16 @@ namespace arithm
    }
    template void maxScalar<uchar >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void maxScalar<schar >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
    template void maxScalar<ushort>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
    template void maxScalar<short >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
    template void maxScalar<int   >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
 #endif
    template void maxScalar<float >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void maxScalar<double>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
 #endif
 }
 //////////////////////////////////////////////////////////////////////////
@ -2233,12 +2361,16 @@ namespace arithm
    }
    template void threshold<uchar>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void threshold<schar>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
    template void threshold<ushort>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
    template void threshold<short>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
    template void threshold<int>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
 #endif
    template void threshold<float>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void threshold<double>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
 #endif
 }
 //////////////////////////////////////////////////////////////////////////
@ -2312,13 +2444,17 @@ namespace arithm
        transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, PowOp<T>(power), WithOutMask(), stream);
    }
 #ifndef OPENCV_TINY_GPU_MODULE
    template void pow<uchar>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
    template void pow<schar>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
    template void pow<short>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
    template void pow<ushort>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
    template void pow<int>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
 #endif
    template void pow<float>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void pow<double>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
 #endif
 }
 //////////////////////////////////////////////////////////////////////////
@ -2397,6 +2533,7 @@ namespace arithm
    }
    template void addWeighted<uchar, uchar, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void addWeighted<uchar, uchar, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<uchar, uchar, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<uchar, uchar, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
@ -2451,9 +2588,10 @@ namespace arithm
    template void addWeighted<uchar, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<uchar, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<uchar, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
 #endif
-
+#ifndef OPENCV_TINY_GPU_MODULE
    template void addWeighted<schar, schar, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<schar, schar, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<schar, schar, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
@ -2501,9 +2639,10 @@ namespace arithm
    template void addWeighted<schar, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<schar, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<schar, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
 #endif
-
+#ifndef OPENCV_TINY_GPU_MODULE
    template void addWeighted<ushort, ushort, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<ushort, ushort, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<ushort, ushort, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
@ -2543,9 +2682,10 @@ namespace arithm
    template void addWeighted<ushort, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<ushort, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<ushort, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
 #endif
-
+#ifndef OPENCV_TINY_GPU_MODULE
    template void addWeighted<short, short, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<short, short, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<short, short, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
@ -2577,9 +2717,10 @@ namespace arithm
    template void addWeighted<short, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<short, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<short, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
 #endif
-
+#ifndef OPENCV_TINY_GPU_MODULE
    template void addWeighted<int, int, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<int, int, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<int, int, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
@ -2603,15 +2744,18 @@ namespace arithm
    template void addWeighted<int, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<int, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<int, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
 #endif
-
+#ifndef OPENCV_TINY_GPU_MODULE
    template void addWeighted<float, float, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<float, float, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<float, float, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<float, float, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<float, float, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
 #endif
    template void addWeighted<float, float, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void addWeighted<float, float, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<float, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
@ -2621,9 +2765,11 @@ namespace arithm
    template void addWeighted<float, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<float, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<float, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
 #endif
 #ifndef OPENCV_TINY_GPU_MODULE
    template void addWeighted<double, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<double, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<double, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
@ -2631,6 +2777,7 @@ namespace arithm
    template void addWeighted<double, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<double, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
    template void addWeighted<double, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
 #endif
 }
 #endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/imgproc.cu
+++ b/modules/gpu/src/cuda/imgproc.cu
@ -985,6 +985,16 @@ namespace cv { namespace gpu { namespace device
                          int borderMode, const float* borderValue, cudaStream_t stream)
        {
            typedef void (*func_t)(const PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<D> dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* borderValue, cudaStream_t stream);
 #ifdef OPENCV_TINY_GPU_MODULE
            static const func_t funcs[] =
            {
                Filter2DCaller<T, D, BrdReflect101>::call,
                Filter2DCaller<T, D, BrdReplicate>::call,
                Filter2DCaller<T, D, BrdConstant>::call,
                Filter2DCaller<T, D, BrdReflect>::call,
                0
            };
 #else
            static const func_t funcs[] =
            {
                Filter2DCaller<T, D, BrdReflect101>::call,
@ -993,19 +1003,26 @@ namespace cv { namespace gpu { namespace device
                Filter2DCaller<T, D, BrdReflect>::call,
                Filter2DCaller<T, D, BrdWrap>::call
            };
 #endif
            const func_t func = funcs[borderMode];
            if (!func)
                cv::gpu::error("Unsupported input parameters for filter2D", __FILE__, __LINE__, "");
            if (stream == 0)
                cudaSafeCall( cudaMemcpyToSymbol(c_filter2DKernel, kernel, kWidth * kHeight * sizeof(float), 0, cudaMemcpyDeviceToDevice) );
            else
                cudaSafeCall( cudaMemcpyToSymbolAsync(c_filter2DKernel, kernel, kWidth * kHeight * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) );
-            funcs[borderMode](static_cast< PtrStepSz<T> >(srcWhole), ofsX, ofsY, static_cast< PtrStepSz<D> >(dst), kWidth, kHeight, anchorX, anchorY, borderValue, stream);
+            func(static_cast< PtrStepSz<T> >(srcWhole), ofsX, ofsY, static_cast< PtrStepSz<D> >(dst), kWidth, kHeight, anchorX, anchorY, borderValue, stream);
        }
        template void filter2D_gpu<uchar, uchar>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
        template void filter2D_gpu<uchar4, uchar4>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
        template void filter2D_gpu<ushort, ushort>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
        template void filter2D_gpu<ushort4, ushort4>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
 #endif
        template void filter2D_gpu<float, float>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
        template void filter2D_gpu<float4, float4>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
    } // namespace imgproc
--- a/modules/gpu/src/cuda/matrix_reductions.cu
+++ b/modules/gpu/src/cuda/matrix_reductions.cu
@ -462,6 +462,7 @@ namespace sum
    }
    template void run<uchar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void run<uchar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void run<uchar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void run<uchar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
@ -485,8 +486,10 @@ namespace sum
    template void run<int, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void run<int, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void run<int, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
 #endif
    template void run<float, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void run<float, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void run<float, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void run<float, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
@ -495,6 +498,7 @@ namespace sum
    template void run<double, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void run<double, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void run<double, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
 #endif
    template <typename T, int cn>
    void runAbs(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask)
@ -504,6 +508,7 @@ namespace sum
    }
    template void runAbs<uchar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void runAbs<uchar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runAbs<uchar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runAbs<uchar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
@ -527,8 +532,10 @@ namespace sum
    template void runAbs<int, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runAbs<int, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runAbs<int, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
 #endif
    template void runAbs<float, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void runAbs<float, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runAbs<float, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runAbs<float, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
@ -537,6 +544,7 @@ namespace sum
    template void runAbs<double, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runAbs<double, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runAbs<double, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
 #endif
    template <typename T> struct Sqr : unary_function<T, T>
    {
@ -553,6 +561,7 @@ namespace sum
    }
    template void runSqr<uchar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void runSqr<uchar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runSqr<uchar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runSqr<uchar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
@ -576,8 +585,10 @@ namespace sum
    template void runSqr<int, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runSqr<int, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runSqr<int, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
 #endif
    template void runSqr<float, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void runSqr<float, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runSqr<float, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runSqr<float, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
@ -586,6 +597,7 @@ namespace sum
    template void runSqr<double, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runSqr<double, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runSqr<double, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
 #endif
 }
 /////////////////////////////////////////////////////////////
@ -773,12 +785,16 @@ namespace minMax
    }
    template void run<uchar >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void run<schar >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
    template void run<ushort>(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
    template void run<short >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
    template void run<int   >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
 #endif
    template void run<float >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void run<double>(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
 #endif
 }
 /////////////////////////////////////////////////////////////
@ -955,12 +971,16 @@ namespace minMaxLoc
    }
    template void run<unsigned char >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void run<signed char >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
    template void run<unsigned short>(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
    template void run<short >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
 #endif
    template void run<int   >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
    template void run<float >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void run<double>(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
 #endif
 }
 /////////////////////////////////////////////////////////////
@ -1079,12 +1099,16 @@ namespace countNonZero
    }
    template int run<uchar >(const PtrStepSzb src, PtrStep<unsigned int> buf);
 #ifndef OPENCV_TINY_GPU_MODULE
    template int run<schar >(const PtrStepSzb src, PtrStep<unsigned int> buf);
    template int run<ushort>(const PtrStepSzb src, PtrStep<unsigned int> buf);
    template int run<short >(const PtrStepSzb src, PtrStep<unsigned int> buf);
    template int run<int   >(const PtrStepSzb src, PtrStep<unsigned int> buf);
 #endif
    template int run<float >(const PtrStepSzb src, PtrStep<unsigned int> buf);
 #ifndef OPENCV_TINY_GPU_MODULE
    template int run<double>(const PtrStepSzb src, PtrStep<unsigned int> buf);
 #endif
 }
 //////////////////////////////////////////////////////////////////////////////
@ -1257,6 +1281,11 @@ namespace reduce
        funcs[op]((PtrStepSz<T>) src, (D*) dst, stream);
    }
 #ifdef OPENCV_TINY_GPU_MODULE
    template void rows<unsigned char, int, unsigned char>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
    template void rows<unsigned char, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
    template void rows<float, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
 #else
    template void rows<unsigned char, int, unsigned char>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
    template void rows<unsigned char, int, int>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
    template void rows<unsigned char, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
@ -1280,6 +1309,7 @@ namespace reduce
    template void rows<float, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
    template void rows<double, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
 #endif
    ///////////////////////////////////////////////////////////
@ -1338,6 +1368,11 @@ namespace reduce
        funcs[cn][op](src, dst, stream);
    }
 #ifdef OPENCV_TINY_GPU_MODULE
    template void cols<unsigned char, int, unsigned char>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    template void cols<unsigned char, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    template void cols<float, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
 #else
    template void cols<unsigned char, int, unsigned char>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    template void cols<unsigned char, int, int>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    template void cols<unsigned char, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
@ -1361,6 +1396,7 @@ namespace reduce
    template void cols<float, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    template void cols<double, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
 #endif
 }
 #endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/pyr_down.cu
+++ b/modules/gpu/src/cuda/pyr_down.cu
@ -197,6 +197,7 @@ namespace cv { namespace gpu { namespace device
        template void pyrDown_gpu<uchar3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
        template void pyrDown_gpu<uchar4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
        //template void pyrDown_gpu<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
        //template void pyrDown_gpu<char2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
        //template void pyrDown_gpu<char3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
@ -216,6 +217,7 @@ namespace cv { namespace gpu { namespace device
        //template void pyrDown_gpu<int2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
        //template void pyrDown_gpu<int3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
        //template void pyrDown_gpu<int4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
 #endif
        template void pyrDown_gpu<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
        //template void pyrDown_gpu<float2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
--- a/modules/gpu/src/cuda/pyr_up.cu
+++ b/modules/gpu/src/cuda/pyr_up.cu
@ -166,6 +166,7 @@ namespace cv { namespace gpu { namespace device
        template void pyrUp_gpu<uchar3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
        template void pyrUp_gpu<uchar4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
        //template void pyrUp_gpu<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
        //template void pyrUp_gpu<char2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
        //template void pyrUp_gpu<char3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
@ -185,6 +186,7 @@ namespace cv { namespace gpu { namespace device
        //template void pyrUp_gpu<int2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
        //template void pyrUp_gpu<int3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
        //template void pyrUp_gpu<int4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
 #endif
        template void pyrUp_gpu<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
        //template void pyrUp_gpu<float2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
--- a/modules/gpu/src/cuda/remap.cu
+++ b/modules/gpu/src/cuda/remap.cu
@ -209,6 +209,7 @@ namespace cv { namespace gpu { namespace device
            typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap,
                PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool cc20);
 #ifdef OPENCV_TINY_GPU_MODULE
            static const caller_t callers[3][5] =
            {
                {
@ -216,25 +217,55 @@ namespace cv { namespace gpu { namespace device
                    RemapDispatcher<PointFilter, BrdReplicate, T>::call,
                    RemapDispatcher<PointFilter, BrdConstant, T>::call,
                    RemapDispatcher<PointFilter, BrdReflect, T>::call,
-                    RemapDispatcher<PointFilter, BrdWrap, T>::call
+                    0/*RemapDispatcher<PointFilter, BrdWrap, T>::call*/,
                },
                {
                    RemapDispatcher<LinearFilter, BrdReflect101, T>::call,
                    RemapDispatcher<LinearFilter, BrdReplicate, T>::call,
                    RemapDispatcher<LinearFilter, BrdConstant, T>::call,
                    RemapDispatcher<LinearFilter, BrdReflect, T>::call,
-                    RemapDispatcher<LinearFilter, BrdWrap, T>::call
+                    0/*RemapDispatcher<LinearFilter, BrdWrap, T>::call*/,
                },
                {
                    0/*RemapDispatcher<CubicFilter, BrdReflect101, T>::call*/,
                    0/*RemapDispatcher<CubicFilter, BrdReplicate, T>::call*/,
                    0/*RemapDispatcher<CubicFilter, BrdConstant, T>::call*/,
                    0/*RemapDispatcher<CubicFilter, BrdReflect, T>::call*/,
                    0/*RemapDispatcher<CubicFilter, BrdWrap, T>::call*/,
                }
            };
 #else
            static const caller_t callers[3][5] =
            {
                {
                    RemapDispatcher<PointFilter, BrdReflect101, T>::call,
                    RemapDispatcher<PointFilter, BrdReplicate, T>::call,
                    RemapDispatcher<PointFilter, BrdConstant, T>::call,
                    RemapDispatcher<PointFilter, BrdReflect, T>::call,
                    RemapDispatcher<PointFilter, BrdWrap, T>::call,
                },
                {
                    RemapDispatcher<LinearFilter, BrdReflect101, T>::call,
                    RemapDispatcher<LinearFilter, BrdReplicate, T>::call,
                    RemapDispatcher<LinearFilter, BrdConstant, T>::call,
                    RemapDispatcher<LinearFilter, BrdReflect, T>::call,
                    RemapDispatcher<LinearFilter, BrdWrap, T>::call,
                },
                {
                    RemapDispatcher<CubicFilter, BrdReflect101, T>::call,
                    RemapDispatcher<CubicFilter, BrdReplicate, T>::call,
                    RemapDispatcher<CubicFilter, BrdConstant, T>::call,
                    RemapDispatcher<CubicFilter, BrdReflect, T>::call,
-                    RemapDispatcher<CubicFilter, BrdWrap, T>::call
+                    RemapDispatcher<CubicFilter, BrdWrap, T>::call,
                }
            };
 #endif
-            callers[interpolation][borderMode](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff, xmap, ymap,
+            const caller_t caller = callers[interpolation][borderMode];
            if (!caller)
                cv::gpu::error("Unsupported input parameters for remap", __FILE__, __LINE__, "");
            caller(static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff, xmap, ymap,
                static_cast< PtrStepSz<T> >(dst), borderValue, stream, cc20);
        }
@ -243,6 +274,7 @@ namespace cv { namespace gpu { namespace device
        template void remap_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
        template void remap_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 #ifndef OPENCV_TINY_GPU_MODULE
        //template void remap_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
        //template void remap_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
        //template void remap_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
@ -262,6 +294,7 @@ namespace cv { namespace gpu { namespace device
        //template void remap_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
        //template void remap_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
        //template void remap_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 #endif
        template void remap_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
        //template void remap_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
--- a/modules/gpu/src/cuda/resize.cu
+++ b/modules/gpu/src/cuda/resize.cu
@ -342,11 +342,13 @@ namespace cv { namespace gpu { namespace device
    template <> struct ResizeNearestDispatcher<uchar> : SelectImplForNearest<uchar> {};
    template <> struct ResizeNearestDispatcher<uchar4> : SelectImplForNearest<uchar4> {};
 #ifndef OPENCV_TINY_GPU_MODULE
    template <> struct ResizeNearestDispatcher<ushort> : SelectImplForNearest<ushort> {};
    template <> struct ResizeNearestDispatcher<ushort4> : SelectImplForNearest<ushort4> {};
    template <> struct ResizeNearestDispatcher<short> : SelectImplForNearest<short> {};
    template <> struct ResizeNearestDispatcher<short4> : SelectImplForNearest<short4> {};
 #endif
    template <> struct ResizeNearestDispatcher<float> : SelectImplForNearest<float> {};
    template <> struct ResizeNearestDispatcher<float4> : SelectImplForNearest<float4> {};
@ -380,11 +382,13 @@ namespace cv { namespace gpu { namespace device
    template <> struct ResizeLinearDispatcher<uchar> : SelectImplForLinear<uchar> {};
    template <> struct ResizeLinearDispatcher<uchar4> : SelectImplForLinear<uchar4> {};
 #ifndef OPENCV_TINY_GPU_MODULE
    template <> struct ResizeLinearDispatcher<ushort> : SelectImplForLinear<ushort> {};
    template <> struct ResizeLinearDispatcher<ushort4> : SelectImplForLinear<ushort4> {};
    template <> struct ResizeLinearDispatcher<short> : SelectImplForLinear<short> {};
    template <> struct ResizeLinearDispatcher<short4> : SelectImplForLinear<short4> {};
 #endif
    template <> struct ResizeLinearDispatcher<float> : SelectImplForLinear<float> {};
    template <> struct ResizeLinearDispatcher<float4> : SelectImplForLinear<float4> {};
@ -410,6 +414,7 @@ namespace cv { namespace gpu { namespace device
        }
    };
 #ifndef OPENCV_TINY_GPU_MODULE
    template <> struct ResizeCubicDispatcher<uchar> : SelectImplForCubic<uchar> {};
    template <> struct ResizeCubicDispatcher<uchar4> : SelectImplForCubic<uchar4> {};
@ -421,6 +426,7 @@ namespace cv { namespace gpu { namespace device
    template <> struct ResizeCubicDispatcher<float> : SelectImplForCubic<float> {};
    template <> struct ResizeCubicDispatcher<float4> : SelectImplForCubic<float4> {};
 #endif
    // ResizeAreaDispatcher
@ -467,7 +473,11 @@ namespace cv { namespace gpu { namespace device
        {
            ResizeNearestDispatcher<T>::call,
            ResizeLinearDispatcher<T>::call,
 #ifdef OPENCV_TINY_GPU_MODULE
            0,
 #else
            ResizeCubicDispatcher<T>::call,
 #endif
            ResizeAreaDispatcher<T>::call
        };
@ -475,13 +485,18 @@ namespace cv { namespace gpu { namespace device
        if (interpolation == 3 && (fx <= 1.f || fy <= 1.f))
            interpolation = 1;
-        funcs[interpolation](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), yoff, xoff, static_cast< PtrStepSz<T> >(dst), fy, fx, stream);
+        const func_t func = funcs[interpolation];
        if (!func)
            cv::gpu::error("Unsupported input parameters for resize", __FILE__, __LINE__, "");
        func(static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), yoff, xoff, static_cast< PtrStepSz<T> >(dst), fy, fx, stream);
    }
    template void resize<uchar >(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
    template void resize<uchar3>(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
    template void resize<uchar4>(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
 #ifndef OPENCV_TINY_GPU_MODULE
    template void resize<ushort >(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
    template void resize<ushort3>(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
    template void resize<ushort4>(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
@ -489,6 +504,7 @@ namespace cv { namespace gpu { namespace device
    template void resize<short >(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
    template void resize<short3>(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
    template void resize<short4>(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
 #endif
    template void resize<float >(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
    template void resize<float3>(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
--- a/modules/gpu/src/cuda/row_filter.10.cu
+++ b/modules/gpu/src/cuda/row_filter.10.cu
@ -44,9 +44,13 @@
 #include "row_filter.h"
 #ifndef OPENCV_TINY_GPU_MODULE
 namespace filter
 {
    template void linearRow<unsigned short, float>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
 }
 #endif
 #endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/row_filter.11.cu
+++ b/modules/gpu/src/cuda/row_filter.11.cu
@ -44,9 +44,13 @@
 #include "row_filter.h"
 #ifndef OPENCV_TINY_GPU_MODULE
 namespace filter
 {
    template void linearRow<ushort3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
 }
 #endif
 #endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/row_filter.12.cu
+++ b/modules/gpu/src/cuda/row_filter.12.cu
@ -44,9 +44,13 @@
 #include "row_filter.h"
 #ifndef OPENCV_TINY_GPU_MODULE
 namespace filter
 {
    template void linearRow<ushort4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
 }
 #endif
 #endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/row_filter.13.cu
+++ b/modules/gpu/src/cuda/row_filter.13.cu
@ -44,9 +44,13 @@
 #include "row_filter.h"
 #ifndef OPENCV_TINY_GPU_MODULE
 namespace filter
 {
    template void linearRow<int3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
 }
 #endif
 #endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/row_filter.14.cu
+++ b/modules/gpu/src/cuda/row_filter.14.cu
@ -44,9 +44,13 @@
 #include "row_filter.h"
 #ifndef OPENCV_TINY_GPU_MODULE
 namespace filter
 {
    template void linearRow<int4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
 }
 #endif
 #endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/row_filter.3.cu
+++ b/modules/gpu/src/cuda/row_filter.3.cu
@ -44,9 +44,13 @@
 #include "row_filter.h"
 #ifndef OPENCV_TINY_GPU_MODULE
 namespace filter
 {
    template void linearRow<short3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
 }
 #endif
 #endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/row_filter.4.cu
+++ b/modules/gpu/src/cuda/row_filter.4.cu
@ -44,9 +44,13 @@
 #include "row_filter.h"
 #ifndef OPENCV_TINY_GPU_MODULE
 namespace filter
 {
    template void linearRow<int, float>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
 }
 #endif
 #endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/row_filter.8.cu
+++ b/modules/gpu/src/cuda/row_filter.8.cu
@ -44,9 +44,13 @@
 #include "row_filter.h"
 #ifndef OPENCV_TINY_GPU_MODULE
 namespace filter
 {
    template void linearRow<short, float>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
 }
 #endif
 #endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/row_filter.9.cu
+++ b/modules/gpu/src/cuda/row_filter.9.cu
@ -44,9 +44,13 @@
 #include "row_filter.h"
 #ifndef OPENCV_TINY_GPU_MODULE
 namespace filter
 {
    template void linearRow<short4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
 }
 #endif
 #endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/row_filter.h
+++ b/modules/gpu/src/cuda/row_filter.h
@ -182,6 +182,186 @@ namespace filter
    {
        typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream);
 #ifdef OPENCV_TINY_GPU_MODULE
        static const caller_t callers[5][33] =
        {
            {
                0,
                0,
                0,
                row_filter::caller< 3, T, D, BrdRowReflect101>,
                0,
                row_filter::caller< 5, T, D, BrdRowReflect101>,
                0,
                row_filter::caller< 7, T, D, BrdRowReflect101>,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
            },
            {
                0,
                0,
                0,
                row_filter::caller< 3, T, D, BrdRowReplicate>,
                0,
                row_filter::caller< 5, T, D, BrdRowReplicate>,
                0,
                row_filter::caller< 7, T, D, BrdRowReplicate>,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
            },
            {
                0,
                0,
                0,
                row_filter::caller< 3, T, D, BrdRowConstant>,
                0,
                row_filter::caller< 5, T, D, BrdRowConstant>,
                0,
                row_filter::caller< 7, T, D, BrdRowConstant>,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
            },
            {
                0,
                0,
                0,
                row_filter::caller< 3, T, D, BrdRowReflect>,
                0,
                row_filter::caller< 5, T, D, BrdRowReflect>,
                0,
                row_filter::caller< 7, T, D, BrdRowReflect>,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
            },
            {
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
            }
        };
 #else
        static const caller_t callers[5][33] =
        {
            {
@ -360,12 +540,17 @@ namespace filter
                row_filter::caller<32, T, D, BrdRowWrap>
            }
        };
 #endif
        const caller_t caller = callers[brd_type][ksize];
        if (!caller)
            cv::gpu::error("Unsupported input parameters for row_filter", __FILE__, __LINE__, "");
        if (stream == 0)
            cudaSafeCall( cudaMemcpyToSymbol(row_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) );
        else
            cudaSafeCall( cudaMemcpyToSymbolAsync(row_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) );
-        callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream);
+        caller((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream);
    }
 }
--- a/modules/gpu/src/cuda/stereobm.cu
+++ b/modules/gpu/src/cuda/stereobm.cu
@ -330,24 +330,55 @@ namespace cv { namespace gpu { namespace device
        typedef void (*kernel_caller_t)(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int maxdisp, cudaStream_t & stream);
 #ifdef OPENCV_TINY_GPU_MODULE
        const static kernel_caller_t callers[] =
        {
            0,
            kernel_caller< 1>,
            kernel_caller< 2>,
            kernel_caller< 3>,
            kernel_caller< 4>,
            kernel_caller< 5>,
            0/*kernel_caller< 6>*/,
            0/*kernel_caller< 7>*/,
            0/*kernel_caller< 8>*/,
            kernel_caller< 9>,
            0/*kernel_caller<10>*/,
            0/*kernel_caller<11>*/,
            0/*kernel_caller<12>*/,
            0/*kernel_caller<13>*/,
            0/*kernel_caller<14>*/,
            kernel_caller<15>,
            0/*kernel_caller<16>*/,
            0/*kernel_caller<17>*/,
            0/*kernel_caller<18>*/,
            0/*kernel_caller<19>*/,
            0/*kernel_caller<20>*/,
            0/*kernel_caller<21>*/,
            0/*kernel_caller<22>*/,
            0/*kernel_caller<23>*/,
            0/*kernel_caller<24>*/,
            0/*kernel_caller<25>*/,
        };
 #else
        const static kernel_caller_t callers[] =
        {
            0,
            kernel_caller< 1>, kernel_caller< 2>, kernel_caller< 3>, kernel_caller< 4>, kernel_caller< 5>,
            kernel_caller< 6>, kernel_caller< 7>, kernel_caller< 8>, kernel_caller< 9>, kernel_caller<10>,
-            kernel_caller<11>, kernel_caller<12>, kernel_caller<13>, kernel_caller<15>, kernel_caller<15>,
+            kernel_caller<11>, kernel_caller<12>, kernel_caller<13>, kernel_caller<14>, kernel_caller<15>,
            kernel_caller<16>, kernel_caller<17>, kernel_caller<18>, kernel_caller<19>, kernel_caller<20>,
            kernel_caller<21>, kernel_caller<22>, kernel_caller<23>, kernel_caller<24>, kernel_caller<25>
            //0,0,0, 0,0,0, 0,0,kernel_caller<9>
        };
 #endif
        const int calles_num = sizeof(callers)/sizeof(callers[0]);
        void stereoBM_GPU(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int maxdisp, int winsz, const PtrStepSz<unsigned int>& minSSD_buf, cudaStream_t& stream)
        {
            int winsz2 = winsz >> 1;
-            if (winsz2 == 0 || winsz2 >= calles_num)
+            if (winsz2 == 0 || winsz2 >= calles_num || callers[winsz2] == 0)
                cv::gpu::error("Unsupported window size", __FILE__, __LINE__, "stereoBM_GPU");
            //cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferL1) );
--- a/modules/gpu/src/cuda/warp.cu
+++ b/modules/gpu/src/cuda/warp.cu
@ -278,6 +278,7 @@ namespace cv { namespace gpu { namespace device
        {
            typedef void (*func_t)(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool cc20);
 #ifdef OPENCV_TINY_GPU_MODULE
            static const func_t funcs[3][5] =
            {
                {
@ -285,25 +286,55 @@ namespace cv { namespace gpu { namespace device
                    WarpDispatcher<Transform, PointFilter, BrdReplicate, T>::call,
                    WarpDispatcher<Transform, PointFilter, BrdConstant, T>::call,
                    WarpDispatcher<Transform, PointFilter, BrdReflect, T>::call,
-                    WarpDispatcher<Transform, PointFilter, BrdWrap, T>::call
+                    0/*WarpDispatcher<Transform, PointFilter, BrdWrap, T>::call*/,
                },
                {
                    WarpDispatcher<Transform, LinearFilter, BrdReflect101, T>::call,
                    WarpDispatcher<Transform, LinearFilter, BrdReplicate, T>::call,
                    WarpDispatcher<Transform, LinearFilter, BrdConstant, T>::call,
                    WarpDispatcher<Transform, LinearFilter, BrdReflect, T>::call,
-                    WarpDispatcher<Transform, LinearFilter, BrdWrap, T>::call
+                    0/*WarpDispatcher<Transform, LinearFilter, BrdWrap, T>::call*/,
                },
                {
                    0/*WarpDispatcher<Transform, CubicFilter, BrdReflect101, T>::call*/,
                    0/*WarpDispatcher<Transform, CubicFilter, BrdReplicate, T>::call*/,
                    0/*WarpDispatcher<Transform, CubicFilter, BrdConstant, T>::call*/,
                    0/*WarpDispatcher<Transform, CubicFilter, BrdReflect, T>::call*/,
                    0/*WarpDispatcher<Transform, CubicFilter, BrdWrap, T>::call*/,
                }
            };
 #else
            static const func_t funcs[3][5] =
            {
                {
                    WarpDispatcher<Transform, PointFilter, BrdReflect101, T>::call,
                    WarpDispatcher<Transform, PointFilter, BrdReplicate, T>::call,
                    WarpDispatcher<Transform, PointFilter, BrdConstant, T>::call,
                    WarpDispatcher<Transform, PointFilter, BrdReflect, T>::call,
                    WarpDispatcher<Transform, PointFilter, BrdWrap, T>::call,
                },
                {
                    WarpDispatcher<Transform, LinearFilter, BrdReflect101, T>::call,
                    WarpDispatcher<Transform, LinearFilter, BrdReplicate, T>::call,
                    WarpDispatcher<Transform, LinearFilter, BrdConstant, T>::call,
                    WarpDispatcher<Transform, LinearFilter, BrdReflect, T>::call,
                    WarpDispatcher<Transform, LinearFilter, BrdWrap, T>::call,
                },
                {
                    WarpDispatcher<Transform, CubicFilter, BrdReflect101, T>::call,
                    WarpDispatcher<Transform, CubicFilter, BrdReplicate, T>::call,
                    WarpDispatcher<Transform, CubicFilter, BrdConstant, T>::call,
                    WarpDispatcher<Transform, CubicFilter, BrdReflect, T>::call,
-                    WarpDispatcher<Transform, CubicFilter, BrdWrap, T>::call
+                    WarpDispatcher<Transform, CubicFilter, BrdWrap, T>::call,
                }
            };
 #endif
-            funcs[interpolation][borderMode](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff,
+            const func_t func = funcs[interpolation][borderMode];
            if (!func)
                cv::gpu::error("Unsupported input parameters for warp_caller", __FILE__, __LINE__, "");
            func(static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff,
                static_cast< PtrStepSz<T> >(dst), borderValue, stream, cc20);
        }
@ -320,6 +351,7 @@ namespace cv { namespace gpu { namespace device
        template void warpAffine_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
        template void warpAffine_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 #ifndef OPENCV_TINY_GPU_MODULE
        //template void warpAffine_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
        //template void warpAffine_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
        //template void warpAffine_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
@ -339,6 +371,7 @@ namespace cv { namespace gpu { namespace device
        //template void warpAffine_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
        //template void warpAffine_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
        //template void warpAffine_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 #endif
        template void warpAffine_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
        //template void warpAffine_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
@ -358,6 +391,7 @@ namespace cv { namespace gpu { namespace device
        template void warpPerspective_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
        template void warpPerspective_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 #ifndef OPENCV_TINY_GPU_MODULE
        //template void warpPerspective_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
        //template void warpPerspective_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
        //template void warpPerspective_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
@ -377,6 +411,7 @@ namespace cv { namespace gpu { namespace device
        //template void warpPerspective_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
        //template void warpPerspective_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
        //template void warpPerspective_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 #endif
        template void warpPerspective_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
        //template void warpPerspective_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
--- a/modules/gpu/src/cvt_color_internal.h
+++ b/modules/gpu/src/cvt_color_internal.h
@ -48,10 +48,16 @@ namespace cv { namespace gpu { namespace device
 #define OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name) \
    void name(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-#define OPENCV_GPU_DECLARE_CVTCOLOR_ALL(name)       \
+#ifdef OPENCV_TINY_GPU_MODULE
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u)    \
+    #define OPENCV_GPU_DECLARE_CVTCOLOR_ALL(name)       \
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _16u)   \
+        OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u)    \
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _32f)
+        OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _32f)
 #else
    #define OPENCV_GPU_DECLARE_CVTCOLOR_ALL(name)       \
        OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u)    \
        OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _16u)   \
        OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _32f)
 #endif
 #define OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(name)    \
    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u)   \
--- a/modules/gpu/src/denoising.cpp
+++ b/modules/gpu/src/denoising.cpp
@ -77,6 +77,17 @@ void cv::gpu::bilateralFilter(const GpuMat& src, GpuMat& dst, int kernel_size, f
    typedef void (*func_t)(const PtrStepSzb& src, PtrStepSzb dst, int kernel_size, float sigma_spatial, float sigma_color, int borderMode, cudaStream_t s);
 #ifdef OPENCV_TINY_GPU_MODULE
    static const func_t funcs[6][4] =
    {
        {bilateral_filter_gpu<uchar>       , 0 /*bilateral_filter_gpu<uchar2>*/ , bilateral_filter_gpu<uchar3>       , bilateral_filter_gpu<uchar4>       },
        {0 /*bilateral_filter_gpu<schar>*/ , 0 /*bilateral_filter_gpu<schar2>*/ , 0 /*bilateral_filter_gpu<schar3>*/ , 0 /*bilateral_filter_gpu<schar4>*/ },
        {0 /*bilateral_filter_gpu<ushort>*/, 0 /*bilateral_filter_gpu<ushort2>*/, 0 /*bilateral_filter_gpu<ushort3>*/, 0 /*bilateral_filter_gpu<ushort4>*/},
        {0 /*bilateral_filter_gpu<short>*/ , 0 /*bilateral_filter_gpu<short2>*/ , 0 /*bilateral_filter_gpu<short3>*/ , 0 /*bilateral_filter_gpu<short4>*/ },
        {0 /*bilateral_filter_gpu<int>*/   , 0 /*bilateral_filter_gpu<int2>*/   , 0 /*bilateral_filter_gpu<int3>*/   , 0 /*bilateral_filter_gpu<int4>*/   },
        {bilateral_filter_gpu<float>       , 0 /*bilateral_filter_gpu<float2>*/ , bilateral_filter_gpu<float3>       , bilateral_filter_gpu<float4>       }
    };
 #else
    static const func_t funcs[6][4] =
    {
        {bilateral_filter_gpu<uchar>      , 0 /*bilateral_filter_gpu<uchar2>*/ , bilateral_filter_gpu<uchar3>      , bilateral_filter_gpu<uchar4>      },
@ -86,6 +97,7 @@ void cv::gpu::bilateralFilter(const GpuMat& src, GpuMat& dst, int kernel_size, f
        {0 /*bilateral_filter_gpu<int>*/  , 0 /*bilateral_filter_gpu<int2>*/   , 0 /*bilateral_filter_gpu<int3>*/  , 0 /*bilateral_filter_gpu<int4>*/  },
        {bilateral_filter_gpu<float>      , 0 /*bilateral_filter_gpu<float2>*/ , bilateral_filter_gpu<float3>      , bilateral_filter_gpu<float4>      }
    };
 #endif
    sigma_color = (sigma_color <= 0 ) ? 1 : sigma_color;
    sigma_spatial = (sigma_spatial <= 0 ) ? 1 : sigma_spatial;
--- a/modules/gpu/src/element_operations.cpp
+++ b/modules/gpu/src/element_operations.cpp
--- a/modules/gpu/src/filtering.cpp
+++ b/modules/gpu/src/filtering.cpp
@ -789,12 +789,14 @@ Ptr<BaseFilter_GPU> cv::gpu::getLinearFilter_GPU(int srcType, int dstType, const
    case CV_8UC4:
        func = filter2D_gpu<uchar4, uchar4>;
        break;
 #ifndef OPENCV_TINY_GPU_MODULE
    case CV_16UC1:
        func = filter2D_gpu<ushort, ushort>;
        break;
    case CV_16UC4:
        func = filter2D_gpu<ushort4, ushort4>;
        break;
 #endif
    case CV_32FC1:
        func = filter2D_gpu<float, float>;
        break;
@ -893,6 +895,18 @@ namespace
 Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType, const Mat& rowKernel, int anchor, int borderType)
 {
 #ifdef OPENCV_TINY_GPU_MODULE
    static const gpuFilter1D_t funcs[7][4] =
    {
        {filter::linearRow<uchar, float>, 0, filter::linearRow<uchar3, float3>, filter::linearRow<uchar4, float4>},
        {0, 0, 0, 0},
        {0, 0, 0, 0},
        {0, 0, 0, 0},
        {0, 0, 0, 0},
        {filter::linearRow<float, float>, 0, filter::linearRow<float3, float3>, filter::linearRow<float4, float4>},
        {0, 0, 0, 0}
    };
 #else
    static const gpuFilter1D_t funcs[7][4] =
    {
        {filter::linearRow<uchar, float>, 0, filter::linearRow<uchar3, float3>, filter::linearRow<uchar4, float4>},
@ -903,6 +917,7 @@ Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType,
        {filter::linearRow<float, float>, 0, filter::linearRow<float3, float3>, filter::linearRow<float4, float4>},
        {0, 0, 0, 0}
    };
 #endif
    static const nppFilter1D_t npp_funcs[] =
    {
        0, nppiFilterRow_8u_C1R, 0, 0, nppiFilterRow_8u_C4R
@ -998,6 +1013,18 @@ namespace
 Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int dstType, const Mat& columnKernel, int anchor, int borderType)
 {
 #ifdef OPENCV_TINY_GPU_MODULE
    static const gpuFilter1D_t funcs[7][4] =
    {
        {filter::linearColumn<float, uchar>, 0, filter::linearColumn<float3, uchar3>, filter::linearColumn<float4, uchar4>},
        {0, 0, 0, 0},
        {0, 0, 0, 0},
        {0, 0, 0, 0},
        {0, 0, 0, 0},
        {filter::linearColumn<float, float>, 0, filter::linearColumn<float3, float3>, filter::linearColumn<float4, float4>},
        {0, 0, 0, 0}
    };
 #else
    static const gpuFilter1D_t funcs[7][4] =
    {
        {filter::linearColumn<float, uchar>, 0, filter::linearColumn<float3, uchar3>, filter::linearColumn<float4, uchar4>},
@ -1008,6 +1035,7 @@ Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int ds
        {filter::linearColumn<float, float>, 0, filter::linearColumn<float3, float3>, filter::linearColumn<float4, float4>},
        {0, 0, 0, 0}
    };
 #endif
    static const nppFilter1D_t npp_funcs[] =
    {
        0, nppiFilterColumn_8u_C1R, 0, 0, nppiFilterColumn_8u_C4R
--- a/modules/gpu/src/imgproc.cpp
+++ b/modules/gpu/src/imgproc.cpp
@ -336,6 +336,17 @@ void cv::gpu::copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom
 #endif
    {
        typedef void (*caller_t)(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderType, const Scalar& value, cudaStream_t stream);
 #ifdef OPENCV_TINY_GPU_MODULE
        static const caller_t callers[6][4] =
        {
            {   copyMakeBorder_caller<uchar, 1>  ,  copyMakeBorder_caller<uchar, 2>     ,    copyMakeBorder_caller<uchar, 3>  ,    copyMakeBorder_caller<uchar, 4>},
            {0, 0, 0, 0},
            {0, 0, 0, 0},
            {0, 0, 0, 0},
            {0, 0, 0, 0},
            {   copyMakeBorder_caller<float, 1>  ,  0/*copyMakeBorder_caller<float, 2>*/,    copyMakeBorder_caller<float, 3>  ,    copyMakeBorder_caller<float ,4>}
        };
 #else
        static const caller_t callers[6][4] =
        {
            {   copyMakeBorder_caller<uchar, 1>  ,    copyMakeBorder_caller<uchar, 2>   ,    copyMakeBorder_caller<uchar, 3>  ,    copyMakeBorder_caller<uchar, 4>},
@ -345,6 +356,7 @@ void cv::gpu::copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom
            {0/*copyMakeBorder_caller<int,   1>*/, 0/*copyMakeBorder_caller<int,   2>*/ , 0/*copyMakeBorder_caller<int,   3>*/, 0/*copyMakeBorder_caller<int  , 4>*/},
            {   copyMakeBorder_caller<float, 1>  , 0/*copyMakeBorder_caller<float, 2>*/ ,    copyMakeBorder_caller<float, 3>  ,    copyMakeBorder_caller<float ,4>}
        };
 #endif
        caller_t func = callers[src.depth()][src.channels() - 1];
        CV_Assert(func != 0);
--- a/modules/gpu/src/matrix_reductions.cpp
+++ b/modules/gpu/src/matrix_reductions.cpp
@ -261,6 +261,18 @@ Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf)
 Scalar cv::gpu::sum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
 {
    typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
 #ifdef OPENCV_TINY_GPU_MODULE
    static const func_t funcs[7][5] =
    {
        {0, ::sum::run<uchar , 1>, 0, 0, 0},
        {0, 0, 0, 0, 0},
        {0, 0, 0, 0, 0},
        {0, 0, 0, 0, 0},
        {0, 0, 0, 0, 0},
        {0, ::sum::run<float , 1>, 0, 0, 0},
        {0, 0, 0, 0, 0},
    };
 #else
    static const func_t funcs[7][5] =
    {
        {0, ::sum::run<uchar , 1>, ::sum::run<uchar , 2>, ::sum::run<uchar , 3>, ::sum::run<uchar , 4>},
@ -271,6 +283,7 @@ Scalar cv::gpu::sum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
        {0, ::sum::run<float , 1>, ::sum::run<float , 2>, ::sum::run<float , 3>, ::sum::run<float , 4>},
        {0, ::sum::run<double, 1>, ::sum::run<double, 2>, ::sum::run<double, 3>, ::sum::run<double, 4>}
    };
 #endif
    CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
@ -286,6 +299,8 @@ Scalar cv::gpu::sum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
    buf.setTo(Scalar::all(0));
    const func_t func = funcs[src.depth()][src.channels()];
    if (!func)
        CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types");
    double result[4];
    func(src, buf.data, result, mask);
@ -307,6 +322,18 @@ Scalar cv::gpu::absSum(const GpuMat& src, GpuMat& buf)
 Scalar cv::gpu::absSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
 {
    typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
 #ifdef OPENCV_TINY_GPU_MODULE
    static const func_t funcs[7][5] =
    {
        {0, ::sum::runAbs<uchar , 1>, 0, 0, 0},
        {0, 0, 0, 0, 0},
        {0, 0, 0, 0, 0},
        {0, 0, 0, 0, 0},
        {0, 0, 0, 0, 0},
        {0, ::sum::runAbs<float , 1>, 0, 0, 0},
        {0, 0, 0, 0, 0},
    };
 #else
    static const func_t funcs[7][5] =
    {
        {0, ::sum::runAbs<uchar , 1>, ::sum::runAbs<uchar , 2>, ::sum::runAbs<uchar , 3>, ::sum::runAbs<uchar , 4>},
@ -317,6 +344,7 @@ Scalar cv::gpu::absSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
        {0, ::sum::runAbs<float , 1>, ::sum::runAbs<float , 2>, ::sum::runAbs<float , 3>, ::sum::runAbs<float , 4>},
        {0, ::sum::runAbs<double, 1>, ::sum::runAbs<double, 2>, ::sum::runAbs<double, 3>, ::sum::runAbs<double, 4>}
    };
 #endif
    CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
@ -332,6 +360,8 @@ Scalar cv::gpu::absSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
    buf.setTo(Scalar::all(0));
    const func_t func = funcs[src.depth()][src.channels()];
    if (!func)
        CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types");
    double result[4];
    func(src, buf.data, result, mask);
@ -353,6 +383,18 @@ Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf)
 Scalar cv::gpu::sqrSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
 {
    typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
 #ifdef OPENCV_TINY_GPU_MODULE
    static const func_t funcs[7][5] =
    {
        {0, ::sum::runSqr<uchar , 1>, 0, 0, 0},
        {0, 0, 0, 0, 0},
        {0, 0, 0, 0, 0},
        {0, 0, 0, 0, 0},
        {0, 0, 0, 0, 0},
        {0, ::sum::runSqr<float , 1>, 0, 0, 0},
        {0, 0, 0, 0, 0},
    };
 #else
    static const func_t funcs[7][5] =
    {
        {0, ::sum::runSqr<uchar , 1>, ::sum::runSqr<uchar , 2>, ::sum::runSqr<uchar , 3>, ::sum::runSqr<uchar , 4>},
@ -363,6 +405,7 @@ Scalar cv::gpu::sqrSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
        {0, ::sum::runSqr<float , 1>, ::sum::runSqr<float , 2>, ::sum::runSqr<float , 3>, ::sum::runSqr<float , 4>},
        {0, ::sum::runSqr<double, 1>, ::sum::runSqr<double, 2>, ::sum::runSqr<double, 3>, ::sum::runSqr<double, 4>}
    };
 #endif
    CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
@ -378,6 +421,8 @@ Scalar cv::gpu::sqrSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
    buf.setTo(Scalar::all(0));
    const func_t func = funcs[src.depth()][src.channels()];
    if (!func)
        CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types");
    double result[4];
    func(src, buf.data, result, mask);
@ -405,6 +450,18 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
 void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask, GpuMat& buf)
 {
    typedef void (*func_t)(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
 #ifdef OPENCV_TINY_GPU_MODULE
    static const func_t funcs[] =
    {
        ::minMax::run<uchar>,
        0/*::minMax::run<schar>*/,
        0/*::minMax::run<ushort>*/,
        0/*::minMax::run<short>*/,
        0/*::minMax::run<int>*/,
        ::minMax::run<float>,
        0/*::minMax::run<double>*/,
    };
 #else
    static const func_t funcs[] =
    {
        ::minMax::run<uchar>,
@ -413,8 +470,9 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
        ::minMax::run<short>,
        ::minMax::run<int>,
        ::minMax::run<float>,
-        ::minMax::run<double>
+        ::minMax::run<double>,
    };
 #endif
    CV_Assert( src.channels() == 1 );
    CV_Assert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) );
@ -430,6 +488,8 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
    ensureSizeIsEnough(buf_size, CV_8U, buf);
    const func_t func = funcs[src.depth()];
    if (!func)
        CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types");
    double temp1, temp2;
    func(src, mask, minVal ? minVal : &temp1, maxVal ? maxVal : &temp2, buf);
@ -456,6 +516,18 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
                        const GpuMat& mask, GpuMat& valBuf, GpuMat& locBuf)
 {
    typedef void (*func_t)(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
 #ifdef OPENCV_TINY_GPU_MODULE
    static const func_t funcs[] =
    {
        ::minMaxLoc::run<uchar>,
        0/*::minMaxLoc::run<schar>*/,
        0/*::minMaxLoc::run<ushort>*/,
        0/*::minMaxLoc::run<short>*/,
        ::minMaxLoc::run<int>,
        ::minMaxLoc::run<float>,
        0/*::minMaxLoc::run<double>*/,
    };
 #else
    static const func_t funcs[] =
    {
        ::minMaxLoc::run<uchar>,
@ -464,8 +536,9 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
        ::minMaxLoc::run<short>,
        ::minMaxLoc::run<int>,
        ::minMaxLoc::run<float>,
-        ::minMaxLoc::run<double>
+        ::minMaxLoc::run<double>,
    };
 #endif
    CV_Assert( src.channels() == 1 );
    CV_Assert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) );
@ -482,6 +555,8 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
    ensureSizeIsEnough(locbuf_size, CV_8U, locBuf);
    const func_t func = funcs[src.depth()];
    if (!func)
        CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types");
    double temp1, temp2;
    Point temp3, temp4;
@ -508,6 +583,18 @@ int cv::gpu::countNonZero(const GpuMat& src)
 int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)
 {
    typedef int (*func_t)(const PtrStepSzb src, PtrStep<unsigned int> buf);
 #ifdef OPENCV_TINY_GPU_MODULE
    static const func_t funcs[] =
    {
        ::countNonZero::run<uchar>,
        0/*::countNonZero::run<schar>*/,
        0/*::countNonZero::run<ushort>*/,
        0/*::countNonZero::run<short>*/,
        0/*::countNonZero::run<int>*/,
        ::countNonZero::run<float>,
        0/*::countNonZero::run<double>*/,
    };
 #else
    static const func_t funcs[] =
    {
        ::countNonZero::run<uchar>,
@ -516,8 +603,9 @@ int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)
        ::countNonZero::run<short>,
        ::countNonZero::run<int>,
        ::countNonZero::run<float>,
-        ::countNonZero::run<double>
+        ::countNonZero::run<double>,
    };
 #endif
    CV_Assert(src.channels() == 1);
@ -532,6 +620,8 @@ int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)
    ensureSizeIsEnough(buf_size, CV_8U, buf);
    const func_t func = funcs[src.depth()];
    if (!func)
        CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types");
    return func(src, buf);
 }
@ -562,6 +652,74 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
    if (dim == 0)
    {
        typedef void (*func_t)(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
 #ifdef OPENCV_TINY_GPU_MODULE
        static const func_t funcs[7][7] =
        {
            {
                ::reduce::rows<unsigned char, int, unsigned char>,
                0/*::reduce::rows<unsigned char, int, signed char>*/,
                0/*::reduce::rows<unsigned char, int, unsigned short>*/,
                0/*::reduce::rows<unsigned char, int, short>*/,
                0/*::reduce::rows<unsigned char, int, int>*/,
                ::reduce::rows<unsigned char, float, float>,
                0/*::reduce::rows<unsigned char, double, double>*/,
            },
            {
                0/*::reduce::rows<signed char, int, unsigned char>*/,
                0/*::reduce::rows<signed char, int, signed char>*/,
                0/*::reduce::rows<signed char, int, unsigned short>*/,
                0/*::reduce::rows<signed char, int, short>*/,
                0/*::reduce::rows<signed char, int, int>*/,
                0/*::reduce::rows<signed char, float, float>*/,
                0/*::reduce::rows<signed char, double, double>*/,
            },
            {
                0/*::reduce::rows<unsigned short, int, unsigned char>*/,
                0/*::reduce::rows<unsigned short, int, signed char>*/,
                0/*::reduce::rows<unsigned short, int, unsigned short>*/,
                0/*::reduce::rows<unsigned short, int, short>*/,
                0/*::reduce::rows<unsigned short, int, int>*/,
                0/*::reduce::rows<unsigned short, float, float>*/,
                0/*::reduce::rows<unsigned short, double, double>*/,
            },
            {
                0/*::reduce::rows<short, int, unsigned char>*/,
                0/*::reduce::rows<short, int, signed char>*/,
                0/*::reduce::rows<short, int, unsigned short>*/,
                0/*::reduce::rows<short, int, short>*/,
                0/*::reduce::rows<short, int, int>*/,
                0/*::reduce::rows<short, float, float>*/,
                0/*::reduce::rows<short, double, double>*/,
            },
            {
                0/*::reduce::rows<int, int, unsigned char>*/,
                0/*::reduce::rows<int, int, signed char>*/,
                0/*::reduce::rows<int, int, unsigned short>*/,
                0/*::reduce::rows<int, int, short>*/,
                0/*::reduce::rows<int, int, int>*/,
                0/*::reduce::rows<int, float, float>*/,
                0/*::reduce::rows<int, double, double>*/,
            },
            {
                0/*::reduce::rows<float, float, unsigned char>*/,
                0/*::reduce::rows<float, float, signed char>*/,
                0/*::reduce::rows<float, float, unsigned short>*/,
                0/*::reduce::rows<float, float, short>*/,
                0/*::reduce::rows<float, float, int>*/,
                ::reduce::rows<float, float, float>,
                0/*::reduce::rows<float, double, double>*/,
            },
            {
                0/*::reduce::rows<double, double, unsigned char>*/,
                0/*::reduce::rows<double, double, signed char>*/,
                0/*::reduce::rows<double, double, unsigned short>*/,
                0/*::reduce::rows<double, double, short>*/,
                0/*::reduce::rows<double, double, int>*/,
                0/*::reduce::rows<double, double, float>*/,
                0/*::reduce::rows<double, double, double>*/,
            }
        };
 #else
        static const func_t funcs[7][7] =
        {
            {
@ -571,7 +729,7 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
                0/*::reduce::rows<unsigned char, int, short>*/,
                ::reduce::rows<unsigned char, int, int>,
                ::reduce::rows<unsigned char, float, float>,
-                ::reduce::rows<unsigned char, double, double>
+                ::reduce::rows<unsigned char, double, double>,
            },
            {
                0/*::reduce::rows<signed char, int, unsigned char>*/,
@ -580,7 +738,7 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
                0/*::reduce::rows<signed char, int, short>*/,
                0/*::reduce::rows<signed char, int, int>*/,
                0/*::reduce::rows<signed char, float, float>*/,
-                0/*::reduce::rows<signed char, double, double>*/
+                0/*::reduce::rows<signed char, double, double>*/,
            },
            {
                0/*::reduce::rows<unsigned short, int, unsigned char>*/,
@ -589,7 +747,7 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
                0/*::reduce::rows<unsigned short, int, short>*/,
                ::reduce::rows<unsigned short, int, int>,
                ::reduce::rows<unsigned short, float, float>,
-                ::reduce::rows<unsigned short, double, double>
+                ::reduce::rows<unsigned short, double, double>,
            },
            {
                0/*::reduce::rows<short, int, unsigned char>*/,
@ -598,7 +756,7 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
                ::reduce::rows<short, int, short>,
                ::reduce::rows<short, int, int>,
                ::reduce::rows<short, float, float>,
-                ::reduce::rows<short, double, double>
+                ::reduce::rows<short, double, double>,
            },
            {
                0/*::reduce::rows<int, int, unsigned char>*/,
@ -607,7 +765,7 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
                0/*::reduce::rows<int, int, short>*/,
                ::reduce::rows<int, int, int>,
                ::reduce::rows<int, float, float>,
-                ::reduce::rows<int, double, double>
+                ::reduce::rows<int, double, double>,
            },
            {
                0/*::reduce::rows<float, float, unsigned char>*/,
@ -616,7 +774,7 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
                0/*::reduce::rows<float, float, short>*/,
                0/*::reduce::rows<float, float, int>*/,
                ::reduce::rows<float, float, float>,
-                ::reduce::rows<float, double, double>
+                ::reduce::rows<float, double, double>,
            },
            {
                0/*::reduce::rows<double, double, unsigned char>*/,
@ -625,9 +783,10 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
                0/*::reduce::rows<double, double, short>*/,
                0/*::reduce::rows<double, double, int>*/,
                0/*::reduce::rows<double, double, float>*/,
-                ::reduce::rows<double, double, double>
+                ::reduce::rows<double, double, double>,
            }
        };
 #endif
        const func_t func = funcs[src.depth()][dst.depth()];
@ -639,6 +798,74 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
    else
    {
        typedef void (*func_t)(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
 #ifdef OPENCV_TINY_GPU_MODULE
        static const func_t funcs[7][7] =
        {
            {
                ::reduce::cols<unsigned char, int, unsigned char>,
                0/*::reduce::cols<unsigned char, int, signed char>*/,
                0/*::reduce::cols<unsigned char, int, unsigned short>*/,
                0/*::reduce::cols<unsigned char, int, short>*/,
                0/*::reduce::cols<unsigned char, int, int>*/,
                ::reduce::cols<unsigned char, float, float>,
                0/*::reduce::cols<unsigned char, double, double>*/,
            },
            {
                0/*::reduce::cols<signed char, int, unsigned char>*/,
                0/*::reduce::cols<signed char, int, signed char>*/,
                0/*::reduce::cols<signed char, int, unsigned short>*/,
                0/*::reduce::cols<signed char, int, short>*/,
                0/*::reduce::cols<signed char, int, int>*/,
                0/*::reduce::cols<signed char, float, float>*/,
                0/*::reduce::cols<signed char, double, double>*/,
            },
            {
                0/*::reduce::cols<unsigned short, int, unsigned char>*/,
                0/*::reduce::cols<unsigned short, int, signed char>*/,
                0/*::reduce::cols<unsigned short, int, unsigned short>*/,
                0/*::reduce::cols<unsigned short, int, short>*/,
                0/*::reduce::cols<unsigned short, int, int>*/,
                0/*::reduce::cols<unsigned short, float, float>*/,
                0/*::reduce::cols<unsigned short, double, double>*/,
            },
            {
                0/*::reduce::cols<short, int, unsigned char>*/,
                0/*::reduce::cols<short, int, signed char>*/,
                0/*::reduce::cols<short, int, unsigned short>*/,
                0/*::reduce::cols<short, int, short>*/,
                0/*::reduce::cols<short, int, int>*/,
                0/*::reduce::cols<short, float, float>*/,
                0/*::reduce::cols<short, double, double>*/,
            },
            {
                0/*::reduce::cols<int, int, unsigned char>*/,
                0/*::reduce::cols<int, int, signed char>*/,
                0/*::reduce::cols<int, int, unsigned short>*/,
                0/*::reduce::cols<int, int, short>*/,
                0/*::reduce::cols<int, int, int>*/,
                0/*::reduce::cols<int, float, float>*/,
                0/*::reduce::cols<int, double, double>*/,
            },
            {
                0/*::reduce::cols<float, float, unsigned char>*/,
                0/*::reduce::cols<float, float, signed char>*/,
                0/*::reduce::cols<float, float, unsigned short>*/,
                0/*::reduce::cols<float, float, short>*/,
                0/*::reduce::cols<float, float, int>*/,
                ::reduce::cols<float, float, float>,
                0/*::reduce::cols<float, double, double>*/,
            },
            {
                0/*::reduce::cols<double, double, unsigned char>*/,
                0/*::reduce::cols<double, double, signed char>*/,
                0/*::reduce::cols<double, double, unsigned short>*/,
                0/*::reduce::cols<double, double, short>*/,
                0/*::reduce::cols<double, double, int>*/,
                0/*::reduce::cols<double, double, float>*/,
                0/*::reduce::cols<double, double, double>*/,
            }
        };
 #else
        static const func_t funcs[7][7] =
        {
            {
@ -648,7 +875,7 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
                0/*::reduce::cols<unsigned char, int, short>*/,
                ::reduce::cols<unsigned char, int, int>,
                ::reduce::cols<unsigned char, float, float>,
-                ::reduce::cols<unsigned char, double, double>
+                ::reduce::cols<unsigned char, double, double>,
            },
            {
                0/*::reduce::cols<signed char, int, unsigned char>*/,
@ -657,7 +884,7 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
                0/*::reduce::cols<signed char, int, short>*/,
                0/*::reduce::cols<signed char, int, int>*/,
                0/*::reduce::cols<signed char, float, float>*/,
-                0/*::reduce::cols<signed char, double, double>*/
+                0/*::reduce::cols<signed char, double, double>*/,
            },
            {
                0/*::reduce::cols<unsigned short, int, unsigned char>*/,
@ -666,7 +893,7 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
                0/*::reduce::cols<unsigned short, int, short>*/,
                ::reduce::cols<unsigned short, int, int>,
                ::reduce::cols<unsigned short, float, float>,
-                ::reduce::cols<unsigned short, double, double>
+                ::reduce::cols<unsigned short, double, double>,
            },
            {
                0/*::reduce::cols<short, int, unsigned char>*/,
@ -675,7 +902,7 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
                ::reduce::cols<short, int, short>,
                ::reduce::cols<short, int, int>,
                ::reduce::cols<short, float, float>,
-                ::reduce::cols<short, double, double>
+                ::reduce::cols<short, double, double>,
            },
            {
                0/*::reduce::cols<int, int, unsigned char>*/,
@ -684,7 +911,7 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
                0/*::reduce::cols<int, int, short>*/,
                ::reduce::cols<int, int, int>,
                ::reduce::cols<int, float, float>,
-                ::reduce::cols<int, double, double>
+                ::reduce::cols<int, double, double>,
            },
            {
                0/*::reduce::cols<float, float, unsigned char>*/,
@ -693,7 +920,7 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
                0/*::reduce::cols<float, float, short>*/,
                0/*::reduce::cols<float, float, int>*/,
                ::reduce::cols<float, float, float>,
-                ::reduce::cols<float, double, double>
+                ::reduce::cols<float, double, double>,
            },
            {
                0/*::reduce::cols<double, double, unsigned char>*/,
@ -702,9 +929,10 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
                0/*::reduce::cols<double, double, short>*/,
                0/*::reduce::cols<double, double, int>*/,
                0/*::reduce::cols<double, double, float>*/,
-                ::reduce::cols<double, double, double>
+                ::reduce::cols<double, double, double>,
            }
        };
 #endif
        const func_t func = funcs[src.depth()][dst.depth()];
--- a/modules/gpu/src/pyramids.cpp
+++ b/modules/gpu/src/pyramids.cpp
@ -68,6 +68,17 @@ void cv::gpu::pyrDown(const GpuMat& src, GpuMat& dst, Stream& stream)
    typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
 #ifdef OPENCV_TINY_GPU_MODULE
    static const func_t funcs[6][4] =
    {
        {pyrDown_gpu<uchar>      , 0 /*pyrDown_gpu<uchar2>*/ , pyrDown_gpu<uchar3>      , pyrDown_gpu<uchar4>      },
        {0, 0, 0, 0},
        {0, 0, 0, 0},
        {0, 0, 0, 0},
        {0, 0, 0, 0},
        {pyrDown_gpu<float>      , 0 /*pyrDown_gpu<float2>*/ , pyrDown_gpu<float3>      , pyrDown_gpu<float4>      }
    };
 #else
    static const func_t funcs[6][4] =
    {
        {pyrDown_gpu<uchar>      , 0 /*pyrDown_gpu<uchar2>*/ , pyrDown_gpu<uchar3>      , pyrDown_gpu<uchar4>      },
@ -77,6 +88,7 @@ void cv::gpu::pyrDown(const GpuMat& src, GpuMat& dst, Stream& stream)
        {0 /*pyrDown_gpu<int>*/  , 0 /*pyrDown_gpu<int2>*/   , 0 /*pyrDown_gpu<int3>*/  , 0 /*pyrDown_gpu<int4>*/  },
        {pyrDown_gpu<float>      , 0 /*pyrDown_gpu<float2>*/ , pyrDown_gpu<float3>      , pyrDown_gpu<float4>      }
    };
 #endif
    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
@ -106,6 +118,17 @@ void cv::gpu::pyrUp(const GpuMat& src, GpuMat& dst, Stream& stream)
    typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
 #ifdef OPENCV_TINY_GPU_MODULE
    static const func_t funcs[6][4] =
    {
        {pyrUp_gpu<uchar>      , 0 /*pyrUp_gpu<uchar2>*/ , pyrUp_gpu<uchar3>      , pyrUp_gpu<uchar4>      },
        {0, 0, 0, 0},
        {0, 0, 0, 0},
        {0, 0, 0, 0},
        {0, 0, 0, 0},
        {pyrUp_gpu<float>      , 0 /*pyrUp_gpu<float2>*/ , pyrUp_gpu<float3>      , pyrUp_gpu<float4>      }
    };
 #else
    static const func_t funcs[6][4] =
    {
        {pyrUp_gpu<uchar>      , 0 /*pyrUp_gpu<uchar2>*/ , pyrUp_gpu<uchar3>      , pyrUp_gpu<uchar4>      },
@ -115,6 +138,7 @@ void cv::gpu::pyrUp(const GpuMat& src, GpuMat& dst, Stream& stream)
        {0 /*pyrUp_gpu<int>*/  , 0 /*pyrUp_gpu<int2>*/   , 0 /*pyrUp_gpu<int3>*/  , 0 /*pyrUp_gpu<int4>*/  },
        {pyrUp_gpu<float>      , 0 /*pyrUp_gpu<float2>*/ , pyrUp_gpu<float3>      , pyrUp_gpu<float4>      }
    };
 #endif
    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
--- a/modules/gpu/src/remap.cpp
+++ b/modules/gpu/src/remap.cpp
@ -65,6 +65,17 @@ void cv::gpu::remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const Gp
    typedef void (*func_t)(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation,
        int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 #ifdef OPENCV_TINY_GPU_MODULE
    static const func_t funcs[6][4] =
    {
        {remap_gpu<uchar>      , 0 /*remap_gpu<uchar2>*/ , remap_gpu<uchar3>     , remap_gpu<uchar4>     },
        {0, 0, 0, 0},
        {0, 0, 0, 0},
        {0, 0, 0, 0},
        {0, 0, 0, 0},
        {remap_gpu<float>      , 0 /*remap_gpu<float2>*/ , remap_gpu<float3>     , remap_gpu<float4>     }
    };
 #else
    static const func_t funcs[6][4] =
    {
        {remap_gpu<uchar>      , 0 /*remap_gpu<uchar2>*/ , remap_gpu<uchar3>     , remap_gpu<uchar4>     },
@ -74,6 +85,7 @@ void cv::gpu::remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const Gp
        {0 /*remap_gpu<int>*/  , 0 /*remap_gpu<int2>*/   , 0 /*remap_gpu<int3>*/ , 0 /*remap_gpu<int4>*/ },
        {remap_gpu<float>      , 0 /*remap_gpu<float2>*/ , remap_gpu<float3>     , remap_gpu<float4>     }
    };
 #endif
    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
    CV_Assert(xmap.type() == CV_32F && ymap.type() == CV_32F && xmap.size() == ymap.size());
--- a/modules/gpu/src/resize.cpp
+++ b/modules/gpu/src/resize.cpp
@ -57,6 +57,18 @@ namespace cv { namespace gpu { namespace device
 void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, double fy, int interpolation, Stream& stream)
 {
    typedef void (*func_t)(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
 #ifdef OPENCV_TINY_GPU_MODULE
    static const func_t funcs[6][4] =
    {
        {device::resize<uchar>      , 0 /*device::resize<uchar2>*/ , device::resize<uchar3>     , device::resize<uchar4>     },
        {0, 0, 0, 0},
        {0, 0, 0, 0},
        {0, 0, 0, 0},
        {0, 0, 0, 0},
        {device::resize<float>      , 0 /*device::resize<float2>*/ , device::resize<float3>     , device::resize<float4>     }
    };
 #else
    static const func_t funcs[6][4] =
    {
        {device::resize<uchar>      , 0 /*device::resize<uchar2>*/ , device::resize<uchar3>     , device::resize<uchar4>     },
@ -66,6 +78,7 @@ void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, doub
        {0 /*device::resize<int>*/  , 0 /*device::resize<int2>*/   , 0 /*device::resize<int3>*/ , 0 /*device::resize<int4>*/ },
        {device::resize<float>      , 0 /*device::resize<float2>*/ , device::resize<float3>     , device::resize<float4>     }
    };
 #endif
    CV_Assert( src.depth() <= CV_32F && src.channels() <= 4 );
    CV_Assert( interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC || interpolation == INTER_AREA );
--- a/modules/gpu/src/warp.cpp
+++ b/modules/gpu/src/warp.cpp
@ -277,6 +277,17 @@ void cv::gpu::warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsiz
        typedef void (*func_t)(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation,
            int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 #ifdef OPENCV_TINY_GPU_MODULE
        static const func_t funcs[6][4] =
        {
            {warpAffine_gpu<uchar>      , 0 /*warpAffine_gpu<uchar2>*/ , warpAffine_gpu<uchar3>     , warpAffine_gpu<uchar4>     },
            {0, 0, 0, 0},
            {0, 0, 0, 0},
            {0, 0, 0, 0},
            {0, 0, 0, 0},
            {warpAffine_gpu<float>      , 0 /*warpAffine_gpu<float2>*/ , warpAffine_gpu<float3>     , warpAffine_gpu<float4>     }
        };
 #else
        static const func_t funcs[6][4] =
        {
            {warpAffine_gpu<uchar>      , 0 /*warpAffine_gpu<uchar2>*/ , warpAffine_gpu<uchar3>     , warpAffine_gpu<uchar4>     },
@ -286,6 +297,7 @@ void cv::gpu::warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsiz
            {0 /*warpAffine_gpu<int>*/  , 0 /*warpAffine_gpu<int2>*/   , 0 /*warpAffine_gpu<int3>*/ , 0 /*warpAffine_gpu<int4>*/ },
            {warpAffine_gpu<float>      , 0 /*warpAffine_gpu<float2>*/ , warpAffine_gpu<float3>     , warpAffine_gpu<float4>     }
        };
 #endif
        const func_t func = funcs[src.depth()][src.channels() - 1];
        CV_Assert(func != 0);
@ -415,6 +427,17 @@ void cv::gpu::warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size
        typedef void (*func_t)(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation,
            int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 #ifdef OPENCV_TINY_GPU_MODULE
        static const func_t funcs[6][4] =
        {
            {warpPerspective_gpu<uchar>      , 0 /*warpPerspective_gpu<uchar2>*/ , warpPerspective_gpu<uchar3>     , warpPerspective_gpu<uchar4>     },
            {0, 0, 0, 0},
            {0, 0, 0, 0},
            {0, 0, 0, 0},
            {0, 0, 0, 0},
            {warpPerspective_gpu<float>      , 0 /*warpPerspective_gpu<float2>*/ , warpPerspective_gpu<float3>     , warpPerspective_gpu<float4>     }
        };
 #else
        static const func_t funcs[6][4] =
        {
            {warpPerspective_gpu<uchar>      , 0 /*warpPerspective_gpu<uchar2>*/ , warpPerspective_gpu<uchar3>     , warpPerspective_gpu<uchar4>     },
@ -424,6 +447,7 @@ void cv::gpu::warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size
            {0 /*warpPerspective_gpu<int>*/  , 0 /*warpPerspective_gpu<int2>*/   , 0 /*warpPerspective_gpu<int3>*/ , 0 /*warpPerspective_gpu<int4>*/ },
            {warpPerspective_gpu<float>      , 0 /*warpPerspective_gpu<float2>*/ , warpPerspective_gpu<float3>     , warpPerspective_gpu<float4>     }
        };
 #endif
        const func_t func = funcs[src.depth()][src.channels() - 1];
        CV_Assert(func != 0);
--- a/modules/gpu/test/test_color.cpp
+++ b/modules/gpu/test/test_color.cpp
@ -2285,11 +2285,19 @@ GPU_TEST_P(CvtColor, BayerGR2Gray)
    EXPECT_MAT_NEAR(dst_gold(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), dst(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), 2);
 }
 #ifdef OPENCV_TINY_GPU_MODULE
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, CvtColor, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
    testing::Values(MatDepth(CV_8U), MatDepth(CV_32F)),
    WHOLE_SUBMAT));
 #else
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, CvtColor, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
    testing::Values(MatDepth(CV_8U), MatDepth(CV_16U), MatDepth(CV_32F)),
    WHOLE_SUBMAT));
 #endif
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // Demosaicing
--- a/modules/gpu/test/test_copy_make_border.cpp
+++ b/modules/gpu/test/test_copy_make_border.cpp
@ -87,6 +87,20 @@ GPU_TEST_P(CopyMakeBorder, Accuracy)
    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
 }
 #ifdef OPENCV_TINY_GPU_MODULE
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, CopyMakeBorder, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
    testing::Values(MatType(CV_8UC1),
                    MatType(CV_8UC3),
                    MatType(CV_8UC4),
                    MatType(CV_32FC1),
                    MatType(CV_32FC3),
                    MatType(CV_32FC4)),
    testing::Values(Border(1), Border(10), Border(50)),
    ALL_BORDER_TYPES,
    WHOLE_SUBMAT));
 #else
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, CopyMakeBorder, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
@ -102,5 +116,6 @@ INSTANTIATE_TEST_CASE_P(GPU_ImgProc, CopyMakeBorder, testing::Combine(
    testing::Values(Border(1), Border(10), Border(50)),
    ALL_BORDER_TYPES,
    WHOLE_SUBMAT));
 #endif
 #endif // HAVE_CUDA
--- a/modules/gpu/test/test_core.cpp
+++ b/modules/gpu/test/test_core.cpp
@ -1341,11 +1341,19 @@ GPU_TEST_P(Abs, Accuracy)
    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
 }
 #ifdef OPENCV_TINY_GPU_MODULE
 INSTANTIATE_TEST_CASE_P(GPU_Core, Abs, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
    testing::Values(MatDepth(CV_32F)),
    WHOLE_SUBMAT));
 #else
 INSTANTIATE_TEST_CASE_P(GPU_Core, Abs, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
    testing::Values(MatDepth(CV_16S), MatDepth(CV_32F)),
    WHOLE_SUBMAT));
 #endif
 ////////////////////////////////////////////////////////////////////////////////
 // Sqr
@ -1381,6 +1389,13 @@ GPU_TEST_P(Sqr, Accuracy)
    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
 }
 #ifdef OPENCV_TINY_GPU_MODULE
 INSTANTIATE_TEST_CASE_P(GPU_Core, Sqr, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
    testing::Values(MatDepth(CV_32F)),
    WHOLE_SUBMAT));
 #else
 INSTANTIATE_TEST_CASE_P(GPU_Core, Sqr, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
@ -1389,6 +1404,7 @@ INSTANTIATE_TEST_CASE_P(GPU_Core, Sqr, testing::Combine(
                    MatDepth(CV_16S),
                    MatDepth(CV_32F)),
    WHOLE_SUBMAT));
 #endif
 ////////////////////////////////////////////////////////////////////////////////
 // Sqrt
@ -1451,6 +1467,13 @@ GPU_TEST_P(Sqrt, Accuracy)
    EXPECT_MAT_NEAR(dst_gold, dst, depth < CV_32F ? 1.0 : 1e-5);
 }
 #ifdef OPENCV_TINY_GPU_MODULE
 INSTANTIATE_TEST_CASE_P(GPU_Core, Sqrt, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
    testing::Values(MatDepth(CV_32F)),
    WHOLE_SUBMAT));
 #else
 INSTANTIATE_TEST_CASE_P(GPU_Core, Sqrt, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
@ -1459,6 +1482,7 @@ INSTANTIATE_TEST_CASE_P(GPU_Core, Sqrt, testing::Combine(
                    MatDepth(CV_16S),
                    MatDepth(CV_32F)),
    WHOLE_SUBMAT));
 #endif
 ////////////////////////////////////////////////////////////////////////////////
 // Log
@ -1521,6 +1545,13 @@ GPU_TEST_P(Log, Accuracy)
    EXPECT_MAT_NEAR(dst_gold, dst, depth < CV_32F ? 1.0 : 1e-6);
 }
 #ifdef OPENCV_TINY_GPU_MODULE
 INSTANTIATE_TEST_CASE_P(GPU_Core, Log, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
    testing::Values(MatDepth(CV_32F)),
    WHOLE_SUBMAT));
 #else
 INSTANTIATE_TEST_CASE_P(GPU_Core, Log, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
@ -1529,6 +1560,7 @@ INSTANTIATE_TEST_CASE_P(GPU_Core, Log, testing::Combine(
                    MatDepth(CV_16S),
                    MatDepth(CV_32F)),
    WHOLE_SUBMAT));
 #endif
 ////////////////////////////////////////////////////////////////////////////////
 // Exp
@ -1601,6 +1633,13 @@ GPU_TEST_P(Exp, Accuracy)
    EXPECT_MAT_NEAR(dst_gold, dst, depth < CV_32F ? 1.0 : 1e-2);
 }
 #ifdef OPENCV_TINY_GPU_MODULE
 INSTANTIATE_TEST_CASE_P(GPU_Core, Exp, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
    testing::Values(MatDepth(CV_32F)),
    WHOLE_SUBMAT));
 #else
 INSTANTIATE_TEST_CASE_P(GPU_Core, Exp, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
@ -1609,6 +1648,7 @@ INSTANTIATE_TEST_CASE_P(GPU_Core, Exp, testing::Combine(
                    MatDepth(CV_16S),
                    MatDepth(CV_32F)),
    WHOLE_SUBMAT));
 #endif
 ////////////////////////////////////////////////////////////////////////////////
 // Compare_Array
@ -1775,12 +1815,21 @@ GPU_TEST_P(Compare_Scalar, Accuracy)
    }
 }
 #ifdef OPENCV_TINY_GPU_MODULE
 INSTANTIATE_TEST_CASE_P(GPU_Core, Compare_Scalar, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
    CmpCode::all(),
    WHOLE_SUBMAT));
 #else
 INSTANTIATE_TEST_CASE_P(GPU_Core, Compare_Scalar, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
    TYPES(CV_8U, CV_64F, 1, 4),
    CmpCode::all(),
    WHOLE_SUBMAT));
 #endif
 //////////////////////////////////////////////////////////////////////////////
 // Bitwise_Array
@ -1936,11 +1985,19 @@ GPU_TEST_P(Bitwise_Scalar, Xor)
    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
 }
 #ifdef OPENCV_TINY_GPU_MODULE
 INSTANTIATE_TEST_CASE_P(GPU_Core, Bitwise_Scalar, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
    testing::Values(MatDepth(CV_8U)),
    testing::Values(Channels(1))));
 #else
 INSTANTIATE_TEST_CASE_P(GPU_Core, Bitwise_Scalar, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
    testing::Values(MatDepth(CV_8U), MatDepth(CV_16U), MatDepth(CV_32S)),
    IMAGE_CHANNELS));
 #endif
 //////////////////////////////////////////////////////////////////////////////
 // RShift
@ -2317,11 +2374,19 @@ GPU_TEST_P(Pow, Accuracy)
    }
 }
 #ifdef OPENCV_TINY_GPU_MODULE
 INSTANTIATE_TEST_CASE_P(GPU_Core, Pow, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
    testing::Values(MatDepth(CV_32F)),
    WHOLE_SUBMAT));
 #else
 INSTANTIATE_TEST_CASE_P(GPU_Core, Pow, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
    ALL_DEPTH,
    WHOLE_SUBMAT));
 #endif
 //////////////////////////////////////////////////////////////////////////////
 // AddWeighted
@ -2380,6 +2445,23 @@ GPU_TEST_P(AddWeighted, Accuracy)
    }
 }
 #ifdef OPENCV_TINY_GPU_MODULE
 INSTANTIATE_TEST_CASE_P(GPU_Core_1, AddWeighted, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
    testing::Values(MatDepth(CV_8U)),
    testing::Values(MatDepth(CV_8U)),
    testing::Values(MatDepth(CV_8U)),
    WHOLE_SUBMAT));
 INSTANTIATE_TEST_CASE_P(GPU_Core_2, AddWeighted, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
    testing::Values(MatDepth(CV_32F)),
    testing::Values(MatDepth(CV_32F)),
    testing::Values(MatDepth(CV_32F)),
    WHOLE_SUBMAT));
 #else
 INSTANTIATE_TEST_CASE_P(GPU_Core, AddWeighted, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
@ -2387,6 +2469,7 @@ INSTANTIATE_TEST_CASE_P(GPU_Core, AddWeighted, testing::Combine(
    ALL_DEPTH,
    ALL_DEPTH,
    WHOLE_SUBMAT));
 #endif
 //////////////////////////////////////////////////////////////////////////////
 // GEMM
@ -2953,6 +3036,15 @@ GPU_TEST_P(Norm, Accuracy)
    EXPECT_NEAR(val_gold, val, depth < CV_32F ? 0.0 : 1.0);
 }
 #ifdef OPENCV_TINY_GPU_MODULE
 INSTANTIATE_TEST_CASE_P(GPU_Core, Norm, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
    testing::Values(MatDepth(CV_8U),
                    MatDepth(CV_32F)),
    testing::Values(NormCode(cv::NORM_L1), NormCode(cv::NORM_L2), NormCode(cv::NORM_INF)),
    WHOLE_SUBMAT));
 #else
 INSTANTIATE_TEST_CASE_P(GPU_Core, Norm, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
@ -2964,6 +3056,7 @@ INSTANTIATE_TEST_CASE_P(GPU_Core, Norm, testing::Combine(
                    MatDepth(CV_32F)),
    testing::Values(NormCode(cv::NORM_L1), NormCode(cv::NORM_L2), NormCode(cv::NORM_INF)),
    WHOLE_SUBMAT));
 #endif
 ////////////////////////////////////////////////////////////////////////////////
 // normDiff
@ -3136,11 +3229,19 @@ GPU_TEST_P(Sum, Sqr)
    EXPECT_SCALAR_NEAR(val_gold, val, CV_MAT_DEPTH(type) < CV_32F ? 0.0 : 0.5);
 }
 #ifdef OPENCV_TINY_GPU_MODULE
 INSTANTIATE_TEST_CASE_P(GPU_Core, Sum, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
    testing::Values(MatType(CV_8UC1), MatType(CV_32FC1)),
    WHOLE_SUBMAT));
 #else
 INSTANTIATE_TEST_CASE_P(GPU_Core, Sum, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
    TYPES(CV_8U, CV_64F, 1, 4),
    WHOLE_SUBMAT));
 #endif
 ////////////////////////////////////////////////////////////////////////////////
 // MinMax
@ -3513,11 +3614,19 @@ PARAM_TEST_CASE(Reduce, cv::gpu::DeviceInfo, cv::Size, MatDepth, Channels, Reduc
        type = CV_MAKE_TYPE(depth, channels);
        if (reduceOp == CV_REDUCE_MAX || reduceOp == CV_REDUCE_MIN)
        {
            dst_depth = depth;
        }
 #ifndef OPENCV_TINY_GPU_MODULE
        else if (reduceOp == CV_REDUCE_SUM)
        {
            dst_depth = depth == CV_8U ? CV_32S : depth < CV_64F ? CV_32F : depth;
        }
 #endif
        else
        {
            dst_depth = depth < CV_32F ? CV_32F : depth;
        }
        dst_type = CV_MAKE_TYPE(dst_depth, channels);
    }
@ -3553,6 +3662,16 @@ GPU_TEST_P(Reduce, Cols)
    EXPECT_MAT_NEAR(dst_gold, dst, dst_depth < CV_32F ? 0.0 : 0.02);
 }
 #ifdef OPENCV_TINY_GPU_MODULE
 INSTANTIATE_TEST_CASE_P(GPU_Core, Reduce, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
    testing::Values(MatDepth(CV_8U),
                    MatDepth(CV_32F)),
    ALL_CHANNELS,
    ALL_REDUCE_CODES,
    WHOLE_SUBMAT));
 #else
 INSTANTIATE_TEST_CASE_P(GPU_Core, Reduce, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
@ -3564,6 +3683,7 @@ INSTANTIATE_TEST_CASE_P(GPU_Core, Reduce, testing::Combine(
    ALL_CHANNELS,
    ALL_REDUCE_CODES,
    WHOLE_SUBMAT));
 #endif
 //////////////////////////////////////////////////////////////////////////////
 // Normalize
--- a/modules/gpu/test/test_features2d.cpp
+++ b/modules/gpu/test/test_features2d.cpp
@ -310,6 +310,7 @@ GPU_TEST_P(BruteForceMatcher, Match_Single)
    ASSERT_EQ(0, badCount);
 }
 #ifndef OPENCV_TINY_GPU_MODULE
 GPU_TEST_P(BruteForceMatcher, Match_Collection)
 {
    cv::gpu::BFMatcher_GPU matcher(normCode);
@ -363,6 +364,7 @@ GPU_TEST_P(BruteForceMatcher, Match_Collection)
    ASSERT_EQ(0, badCount);
 }
 #endif
 GPU_TEST_P(BruteForceMatcher, KnnMatch_2_Single)
 {
@ -442,6 +444,7 @@ GPU_TEST_P(BruteForceMatcher, KnnMatch_3_Single)
    ASSERT_EQ(0, badCount);
 }
 #ifndef OPENCV_TINY_GPU_MODULE
 GPU_TEST_P(BruteForceMatcher, KnnMatch_2_Collection)
 {
    cv::gpu::BFMatcher_GPU matcher(normCode);
@ -565,6 +568,7 @@ GPU_TEST_P(BruteForceMatcher, KnnMatch_3_Collection)
    ASSERT_EQ(0, badCount);
 }
 #endif
 GPU_TEST_P(BruteForceMatcher, RadiusMatch_Single)
 {
@ -615,6 +619,7 @@ GPU_TEST_P(BruteForceMatcher, RadiusMatch_Single)
    }
 }
 #ifndef OPENCV_TINY_GPU_MODULE
 GPU_TEST_P(BruteForceMatcher, RadiusMatch_Collection)
 {
    cv::gpu::BFMatcher_GPU matcher(normCode);
@ -693,11 +698,20 @@ GPU_TEST_P(BruteForceMatcher, RadiusMatch_Collection)
        ASSERT_EQ(0, badCount);
    }
 }
 #endif
 #ifdef OPENCV_TINY_GPU_MODULE
 INSTANTIATE_TEST_CASE_P(GPU_Features2D, BruteForceMatcher, testing::Combine(
    ALL_DEVICES,
    testing::Values(NormCode(cv::NORM_L2)),
    testing::Values(DescriptorSize(57), DescriptorSize(64), DescriptorSize(83), DescriptorSize(128), DescriptorSize(179), DescriptorSize(256), DescriptorSize(304)),
    testing::Values(UseMask(false), UseMask(true))));
 #else
 INSTANTIATE_TEST_CASE_P(GPU_Features2D, BruteForceMatcher, testing::Combine(
    ALL_DEVICES,
    testing::Values(NormCode(cv::NORM_L1), NormCode(cv::NORM_L2)),
    testing::Values(DescriptorSize(57), DescriptorSize(64), DescriptorSize(83), DescriptorSize(128), DescriptorSize(179), DescriptorSize(256), DescriptorSize(304)),
    testing::Values(UseMask(false), UseMask(true))));
 #endif
 #endif // HAVE_CUDA
--- a/modules/gpu/test/test_filters.cpp
+++ b/modules/gpu/test/test_filters.cpp
@ -164,6 +164,21 @@ GPU_TEST_P(Sobel, Accuracy)
    EXPECT_MAT_NEAR(getInnerROI(dst_gold, ksize), getInnerROI(dst, ksize), CV_MAT_DEPTH(type) < CV_32F ? 0.0 : 0.1);
 }
 #ifdef OPENCV_TINY_GPU_MODULE
 INSTANTIATE_TEST_CASE_P(GPU_Filter, Sobel, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
    testing::Values(MatDepth(CV_8U), MatDepth(CV_32F)),
    IMAGE_CHANNELS,
    testing::Values(KSize(cv::Size(3, 3)), KSize(cv::Size(5, 5)), KSize(cv::Size(7, 7))),
    testing::Values(Deriv_X(0), Deriv_X(1), Deriv_X(2)),
    testing::Values(Deriv_Y(0), Deriv_Y(1), Deriv_Y(2)),
    testing::Values(BorderType(cv::BORDER_REFLECT101),
                    BorderType(cv::BORDER_REPLICATE),
                    BorderType(cv::BORDER_CONSTANT),
                    BorderType(cv::BORDER_REFLECT)),
    WHOLE_SUBMAT));
 #else
 INSTANTIATE_TEST_CASE_P(GPU_Filter, Sobel, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
@ -177,6 +192,7 @@ INSTANTIATE_TEST_CASE_P(GPU_Filter, Sobel, testing::Combine(
                    BorderType(cv::BORDER_CONSTANT),
                    BorderType(cv::BORDER_REFLECT)),
    WHOLE_SUBMAT));
 #endif
 /////////////////////////////////////////////////////////////////////////////////////////////////
 // Scharr
@ -227,6 +243,20 @@ GPU_TEST_P(Scharr, Accuracy)
    EXPECT_MAT_NEAR(getInnerROI(dst_gold, cv::Size(3, 3)), getInnerROI(dst, cv::Size(3, 3)), CV_MAT_DEPTH(type) < CV_32F ? 0.0 : 0.1);
 }
 #ifdef OPENCV_TINY_GPU_MODULE
 INSTANTIATE_TEST_CASE_P(GPU_Filter, Scharr, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
    testing::Values(MatDepth(CV_8U), MatDepth(CV_32F)),
    IMAGE_CHANNELS,
    testing::Values(Deriv_X(0), Deriv_X(1)),
    testing::Values(Deriv_Y(0), Deriv_Y(1)),
    testing::Values(BorderType(cv::BORDER_REFLECT101),
                    BorderType(cv::BORDER_REPLICATE),
                    BorderType(cv::BORDER_CONSTANT),
                    BorderType(cv::BORDER_REFLECT)),
    WHOLE_SUBMAT));
 #else
 INSTANTIATE_TEST_CASE_P(GPU_Filter, Scharr, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
@ -239,6 +269,7 @@ INSTANTIATE_TEST_CASE_P(GPU_Filter, Scharr, testing::Combine(
                    BorderType(cv::BORDER_CONSTANT),
                    BorderType(cv::BORDER_REFLECT)),
    WHOLE_SUBMAT));
 #endif
 /////////////////////////////////////////////////////////////////////////////////////////////////
 // GaussianBlur
@ -301,6 +332,21 @@ GPU_TEST_P(GaussianBlur, Accuracy)
    }
 }
 #ifdef OPENCV_TINY_GPU_MODULE
 INSTANTIATE_TEST_CASE_P(GPU_Filter, GaussianBlur, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
    testing::Values(MatDepth(CV_8U), MatDepth(CV_32F)),
    IMAGE_CHANNELS,
    testing::Values(KSize(cv::Size(3, 3)),
                    KSize(cv::Size(5, 5)),
                    KSize(cv::Size(7, 7))),
    testing::Values(BorderType(cv::BORDER_REFLECT101),
                    BorderType(cv::BORDER_REPLICATE),
                    BorderType(cv::BORDER_CONSTANT),
                    BorderType(cv::BORDER_REFLECT)),
    WHOLE_SUBMAT));
 #else
 INSTANTIATE_TEST_CASE_P(GPU_Filter, GaussianBlur, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
@ -326,6 +372,7 @@ INSTANTIATE_TEST_CASE_P(GPU_Filter, GaussianBlur, testing::Combine(
                    BorderType(cv::BORDER_CONSTANT),
                    BorderType(cv::BORDER_REFLECT)),
    WHOLE_SUBMAT));
 #endif
 /////////////////////////////////////////////////////////////////////////////////////////////////
 // Laplacian
@ -565,6 +612,16 @@ GPU_TEST_P(Filter2D, Accuracy)
    EXPECT_MAT_NEAR(dst_gold, dst, CV_MAT_DEPTH(type) == CV_32F ? 1e-1 : 1.0);
 }
 #ifdef OPENCV_TINY_GPU_MODULE
 INSTANTIATE_TEST_CASE_P(GPU_Filter, Filter2D, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC4)),
    testing::Values(KSize(cv::Size(3, 3)), KSize(cv::Size(5, 5)), KSize(cv::Size(7, 7)), KSize(cv::Size(11, 11)), KSize(cv::Size(13, 13)), KSize(cv::Size(15, 15))),
    testing::Values(Anchor(cv::Point(-1, -1)), Anchor(cv::Point(0, 0)), Anchor(cv::Point(2, 2))),
    testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_CONSTANT), BorderType(cv::BORDER_REFLECT)),
    WHOLE_SUBMAT));
 #else
 INSTANTIATE_TEST_CASE_P(GPU_Filter, Filter2D, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
@ -573,5 +630,6 @@ INSTANTIATE_TEST_CASE_P(GPU_Filter, Filter2D, testing::Combine(
    testing::Values(Anchor(cv::Point(-1, -1)), Anchor(cv::Point(0, 0)), Anchor(cv::Point(2, 2))),
    testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_CONSTANT), BorderType(cv::BORDER_REFLECT)),
    WHOLE_SUBMAT));
 #endif
 #endif // HAVE_CUDA
--- a/modules/gpu/test/test_imgproc.cpp
+++ b/modules/gpu/test/test_imgproc.cpp
@ -357,11 +357,19 @@ GPU_TEST_P(Canny, Accuracy)
    }
 }
 #ifdef OPENCV_TINY_GPU_MODULE
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Canny, testing::Combine(
    ALL_DEVICES,
    testing::Values(AppertureSize(3)),
    testing::Values(L2gradient(false), L2gradient(true)),
    WHOLE_SUBMAT));
 #else
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Canny, testing::Combine(
    ALL_DEVICES,
    testing::Values(AppertureSize(3), AppertureSize(5)),
    testing::Values(L2gradient(false), L2gradient(true)),
    WHOLE_SUBMAT));
 #endif
 ////////////////////////////////////////////////////////////////////////////////
 // MeanShift
--- a/modules/gpu/test/test_pyramids.cpp
+++ b/modules/gpu/test/test_pyramids.cpp
@ -80,11 +80,19 @@ GPU_TEST_P(PyrDown, Accuracy)
    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() == CV_32F ? 1e-4 : 1.0);
 }
 #ifdef OPENCV_TINY_GPU_MODULE
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, PyrDown, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
    WHOLE_SUBMAT));
 #else
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, PyrDown, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
    WHOLE_SUBMAT));
 #endif
 ////////////////////////////////////////////////////////
 // pyrUp
@ -120,10 +128,18 @@ GPU_TEST_P(PyrUp, Accuracy)
    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() == CV_32F ? 1e-4 : 1.0);
 }
 #ifdef OPENCV_TINY_GPU_MODULE
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, PyrUp, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
    WHOLE_SUBMAT));
 #else
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, PyrUp, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
    WHOLE_SUBMAT));
 #endif
 #endif // HAVE_CUDA
--- a/modules/gpu/test/test_remap.cpp
+++ b/modules/gpu/test/test_remap.cpp
@ -169,6 +169,15 @@ GPU_TEST_P(Remap, Accuracy)
    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() == CV_32F ? 1e-3 : 1.0);
 }
 #ifdef OPENCV_TINY_GPU_MODULE
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Remap, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR)),
    testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_CONSTANT), BorderType(cv::BORDER_REFLECT)),
    WHOLE_SUBMAT));
 #else
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Remap, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
@ -176,5 +185,6 @@ INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Remap, testing::Combine(
    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
    testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_CONSTANT), BorderType(cv::BORDER_REFLECT), BorderType(cv::BORDER_WRAP)),
    WHOLE_SUBMAT));
 #endif
 #endif // HAVE_CUDA
--- a/modules/gpu/test/test_resize.cpp
+++ b/modules/gpu/test/test_resize.cpp
@ -174,6 +174,15 @@ GPU_TEST_P(Resize, Accuracy)
    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() == CV_32F ? 1e-2 : 1.0);
 }
 #ifdef OPENCV_TINY_GPU_MODULE
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Resize, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
    testing::Values(0.3, 0.5, 1.5, 2.0),
    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR)),
    WHOLE_SUBMAT));
 #else
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Resize, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
@ -181,6 +190,7 @@ INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Resize, testing::Combine(
    testing::Values(0.3, 0.5, 1.5, 2.0),
    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
    WHOLE_SUBMAT));
 #endif
 /////////////////
@ -221,6 +231,15 @@ GPU_TEST_P(ResizeSameAsHost, Accuracy)
    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() == CV_32F ? 1e-2 : src.depth() == CV_8U ? 4.0 : 1.0);
 }
 #ifdef OPENCV_TINY_GPU_MODULE
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, ResizeSameAsHost, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
    testing::Values(0.3, 0.5),
    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_AREA)),
    WHOLE_SUBMAT));
 #else
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, ResizeSameAsHost, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
@ -228,7 +247,17 @@ INSTANTIATE_TEST_CASE_P(GPU_ImgProc, ResizeSameAsHost, testing::Combine(
    testing::Values(0.3, 0.5),
    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_AREA)),
    WHOLE_SUBMAT));
 #endif
 #ifdef OPENCV_TINY_GPU_MODULE
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc2, ResizeSameAsHost, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
    testing::Values(0.3, 0.5, 1.5, 2.0),
    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR)),
    WHOLE_SUBMAT));
 #else
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc2, ResizeSameAsHost, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
@ -236,5 +265,6 @@ INSTANTIATE_TEST_CASE_P(GPU_ImgProc2, ResizeSameAsHost, testing::Combine(
    testing::Values(0.3, 0.5, 1.5, 2.0),
    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR)),
    WHOLE_SUBMAT));
 #endif
 #endif // HAVE_CUDA
--- a/modules/gpu/test/test_threshold.cpp
+++ b/modules/gpu/test/test_threshold.cpp
@ -83,11 +83,20 @@ GPU_TEST_P(Threshold, Accuracy)
    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
 }
 #ifdef OPENCV_TINY_GPU_MODULE
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Threshold, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
    testing::Values(MatType(CV_8UC1), MatType(CV_32FC1)),
    ThreshOp::all(),
    WHOLE_SUBMAT));
 #else
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Threshold, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
    testing::Values(MatType(CV_8UC1), MatType(CV_16SC1), MatType(CV_32FC1)),
    ThreshOp::all(),
    WHOLE_SUBMAT));
 #endif
 #endif // HAVE_CUDA
--- a/modules/gpu/test/test_warp_affine.cpp
+++ b/modules/gpu/test/test_warp_affine.cpp
@ -222,6 +222,16 @@ GPU_TEST_P(WarpAffine, Accuracy)
    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() == CV_32F ? 1e-1 : 1.0);
 }
 #ifdef OPENCV_TINY_GPU_MODULE
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, WarpAffine, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
    DIRECT_INVERSE,
    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR)),
    testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_REFLECT)),
    WHOLE_SUBMAT));
 #else
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, WarpAffine, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
@ -230,6 +240,7 @@ INSTANTIATE_TEST_CASE_P(GPU_ImgProc, WarpAffine, testing::Combine(
    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
    testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_REFLECT), BorderType(cv::BORDER_WRAP)),
    WHOLE_SUBMAT));
 #endif
 ///////////////////////////////////////////////////////////////////
 // Test NPP
@ -271,10 +282,18 @@ GPU_TEST_P(WarpAffineNPP, Accuracy)
    EXPECT_MAT_SIMILAR(dst_gold, dst, 2e-2);
 }
 #ifdef OPENCV_TINY_GPU_MODULE
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, WarpAffineNPP, testing::Combine(
    ALL_DEVICES,
    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
    DIRECT_INVERSE,
    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR))));
 #else
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, WarpAffineNPP, testing::Combine(
    ALL_DEVICES,
    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
    DIRECT_INVERSE,
    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC))));
 #endif
 #endif // HAVE_CUDA
--- a/modules/gpu/test/test_warp_perspective.cpp
+++ b/modules/gpu/test/test_warp_perspective.cpp
@ -225,6 +225,16 @@ GPU_TEST_P(WarpPerspective, Accuracy)
    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() == CV_32F ? 1e-1 : 1.0);
 }
 #ifdef OPENCV_TINY_GPU_MODULE
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, WarpPerspective, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
    DIRECT_INVERSE,
    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR)),
    testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_REFLECT)),
    WHOLE_SUBMAT));
 #else
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, WarpPerspective, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
@ -233,6 +243,7 @@ INSTANTIATE_TEST_CASE_P(GPU_ImgProc, WarpPerspective, testing::Combine(
    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
    testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_REFLECT), BorderType(cv::BORDER_WRAP)),
    WHOLE_SUBMAT));
 #endif
 ///////////////////////////////////////////////////////////////////
 // Test NPP
@ -274,10 +285,18 @@ GPU_TEST_P(WarpPerspectiveNPP, Accuracy)
    EXPECT_MAT_SIMILAR(dst_gold, dst, 2e-2);
 }
 #ifdef OPENCV_TINY_GPU_MODULE
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, WarpPerspectiveNPP, testing::Combine(
    ALL_DEVICES,
    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
    DIRECT_INVERSE,
    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR))));
 #else
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, WarpPerspectiveNPP, testing::Combine(
    ALL_DEVICES,
    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
    DIRECT_INVERSE,
    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC))));
 #endif
 #endif // HAVE_CUDA
--- a/modules/ts/include/opencv2/ts/gpu_perf.hpp
+++ b/modules/ts/include/opencv2/ts/gpu_perf.hpp
@ -50,8 +50,13 @@
 namespace perf
 {
 #ifdef OPENCV_TINY_GPU_MODULE
    #define ALL_BORDER_MODES testing::Values(BorderMode(cv::BORDER_REFLECT101), BorderMode(cv::BORDER_REPLICATE), BorderMode(cv::BORDER_CONSTANT), BorderMode(cv::BORDER_REFLECT))
    #define ALL_INTERPOLATIONS testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_AREA))
 #else
    #define ALL_BORDER_MODES BorderMode::all()
    #define ALL_INTERPOLATIONS Interpolation::all()
 #endif
    CV_ENUM(BorderMode, BORDER_REFLECT101, BORDER_REPLICATE, BORDER_CONSTANT, BORDER_REFLECT, BORDER_WRAP)
    CV_ENUM(Interpolation, INTER_NEAREST, INTER_LINEAR, INTER_CUBIC, INTER_AREA)
--- a/modules/ts/include/opencv2/ts/gpu_test.hpp
+++ b/modules/ts/include/opencv2/ts/gpu_test.hpp
@ -215,6 +215,12 @@ namespace cvtest
    using perf::MatDepth;
 #ifdef OPENCV_TINY_GPU_MODULE
    #define ALL_DEPTH testing::Values(MatDepth(CV_8U), MatDepth(CV_32F))
    #define DEPTH_PAIRS testing::Values(std::make_pair(MatDepth(CV_8U), MatDepth(CV_8U)),   \
                                        std::make_pair(MatDepth(CV_32F), MatDepth(CV_32F)))
 #else
    #define ALL_DEPTH testing::Values(MatDepth(CV_8U), MatDepth(CV_8S), MatDepth(CV_16U), MatDepth(CV_16S), MatDepth(CV_32S), MatDepth(CV_32F), MatDepth(CV_64F))
    #define DEPTH_PAIRS testing::Values(std::make_pair(MatDepth(CV_8U), MatDepth(CV_8U)),   \
@ -242,6 +248,7 @@ namespace cvtest
                                        std::make_pair(MatDepth(CV_32F), MatDepth(CV_64F)), \
                                                                                            \
                                        std::make_pair(MatDepth(CV_64F), MatDepth(CV_64F)))
 #endif
    // Type
@ -318,7 +325,11 @@ namespace cvtest
    CV_ENUM(Interpolation, INTER_NEAREST, INTER_LINEAR, INTER_CUBIC, INTER_AREA)
    CV_ENUM(BorderType, BORDER_REFLECT101, BORDER_REPLICATE, BORDER_CONSTANT, BORDER_REFLECT, BORDER_WRAP)
 #ifdef OPENCV_TINY_GPU_MODULE
    #define ALL_BORDER_TYPES testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_CONSTANT), BorderType(cv::BORDER_REFLECT))
 #else
    #define ALL_BORDER_TYPES testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_CONSTANT), BorderType(cv::BORDER_REFLECT), BorderType(cv::BORDER_WRAP))
 #endif
    CV_FLAGS(WarpFlags, INTER_NEAREST, INTER_LINEAR, INTER_CUBIC, WARP_INVERSE_MAP)